From 0496a2aeaf0710d64ded4376204971f1629a6097 Mon Sep 17 00:00:00 2001 From: lsmertina Date: Sat, 7 Oct 2023 10:41:44 +1100 Subject: [PATCH 01/24] Add fastq filter module --- modules/fastq_filter_functions.py | 106 ++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 modules/fastq_filter_functions.py diff --git a/modules/fastq_filter_functions.py b/modules/fastq_filter_functions.py new file mode 100644 index 0000000..758db74 --- /dev/null +++ b/modules/fastq_filter_functions.py @@ -0,0 +1,106 @@ +def calc_gc_content(seq: str) -> float: + """ + Calculates gc content + Argument is string + Returns float in % + """ + seq_lower = seq.lower() + length_seq = len(seq_lower) + gc_count = 0 + for nt in seq_lower: + if nt=='g' or nt=='c': + gc_count+=1 + gc_content = (gc_count/length_seq)*100 + return gc_content +def seq_length(seq: str) -> str: + """ + Calculates sequence length + Argument is string + Returns string + """ + return len(seq) +def quality_score(seq: str) -> int: + """ + Calculates numeric quality score + Argument is string + Returns int value + """ + score_count = 0 + length_q_seq = len(seq) + for symbol in seq: + score_num = ord(symbol) - 33 + score_count+=score_num + mean_qs = (score_count/length_q_seq) + return mean_qs +def length_filter(seqs: dict, length_bounds=(0,1000)) -> dict: + """ + Filters fastq reads by length + Arguments: + -dictionary + -sequence length parameters (>= and <=) + Returns filtered dictionary + """ + #seqs = {'name': ('sequence', 'quality')} + output = [] + result = dict() + for name, (sequence, quality) in seqs.items(): + + if seq_length(sequence) <= length_bounds[1] and seq_length(sequence) >= length_bounds[0]: + output.append(name) + if name in output: + result[name] = (sequence, quality) + + return result +def quality_filter(seqs: dict, quality_threshold=25) -> dict: + """ + Filters fastq reads by quality score + Arguments: + -dictionary + -quality score threshold (>=) + Returns filtered dict + """ 
+ #seqs = {'name': ('sequence', 'quality')} + output = [] + result = dict() + for name, (sequence, quality) in seqs.items(): + + if quality_score(quality) >= quality_threshold: + output.append(name) + if name in output: + result[name] = (sequence, quality) + + return result +def gc_filter(seqs: dict, gc_bounds=(0,100)) -> dict: + """ + Filters fastq reads by gc content + Arguments: + -dict + -gc content parameters (>= and <=) + Returns filtered dict + """ + #seqs = {'name': ('sequence', 'quality')} + output = [] + result = dict() + for name, (sequence, quality) in seqs.items(): + + if calc_gc_content(sequence) >= gc_bounds[0] and calc_gc_content(sequence) <= gc_bounds[1]: + output.append(name) + if name in output: + result[name] = (sequence, quality) + + return result +def fastq_filter(seqs: dict, gc_bounds: int, length_bounds: int, quality_threshold: int) -> dict: + """ + Filters fastq sequence by gc content, length and quality score + Arguments: dict with fastq sequences, filtering parameters + Returns filtered dictionary + """ + resulting_sequences = dict() + gc_filtered = gc_filter(seqs, gc_bounds = (0,100)) + length_filtered = length_filter(seqs, length_bounds = (0,1000)) + quality_filtered = quality_filter(seqs, quality_threshold = 15) + intersection = gc_filtered.keys() & length_filtered.keys() & quality_filtered.keys() + #intersection = {keys: gc_filtered[keys] for keys in gc_filtered.keys() & length_filtered.keys()} + #for keys, (sequence, quality) in intersection: + # resulting_sequences[keys] = (sequence, quality) + return intersection From 5f3ef9fe7bfcd24f7466c8091595af9b82754616 Mon Sep 17 00:00:00 2001 From: lsmertina Date: Sat, 7 Oct 2023 11:16:23 +1100 Subject: [PATCH 02/24] Add functions for protein sequences --- modules/protein_module.py | 130 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 130 insertions(+) create mode 100644 modules/protein_module.py diff --git a/modules/protein_module.py b/modules/protein_module.py new file mode 100644 
index 0000000..d71da27 --- /dev/null +++ b/modules/protein_module.py @@ -0,0 +1,130 @@ +from typing import Optional + +aa_code_dict = {'C':'Cys', 'c':'Cys', 'D':'Asp', 'd':'Asp', 'S':'Ser', 's':'Ser', 'Q':'Gln', 'q':'Gln', + 'K':'Lys', 'k':'Lys', 'I':'Ile', 'i':'Ile', 'P':'Pro', 'p':'Pro', 'T':'Thr', 't':'Thr', + 'F':'Phe', 'f':'Phe', 'N':'Asn', 'n':'Asn', 'G':'Gly', 'g':'Gly', 'H':'His', 'h':'His', + 'L':'Leu', 'l':'Leu', 'R':'Arg', 'r':'Arg', 'W':'Trp', 'w':'Trp', 'A':'Ala', 'a':'Ala', + 'V':'Val', 'v':'Val', 'E':'Glu', 'e':'Glu', 'Y':'Tyr', 'y':'Tyr', 'M':'Met', 'm':'Met'} + +aa_weight_dict = {'G':75, 'g':75, 'A':89, 'a':89, 'R':174, 'r':174, 'N':132, 'n':132, + 'D':133, 'd':133, 'C':121, 'c':133, 'E':147, 'e':147, 'Q':146, 'q':146, + 'H':155, 'h':155, 'I':131, 'i':131, 'L':131, 'l':131, 'K':146, 'k':146, + 'M':149, 'm':149, 'F':165, 'f':165, 'P':115, 'p':115, 'S':105, 's':105, + 'T':119, 't':119, 'W':204, 'w':204, 'Y':181, 'y':181, 'V':117, 'v':117} +def amino_acid_frequency(seq: str) -> dict: + """ + Calculates amino acid frequencies + Arguments: + -seq (str) input protein sequence + Return: + -dictionary with amino acid and its frequency + """ + freq_dict = {} + for letter in seq: + if letter in freq_dict: + freq_dict[letter] += 1 + else: + freq_dict[letter] = 1 + for letter in freq_dict: + freq_dict[letter] = round(freq_dict[letter] / len(seq) * 100, 2) + return freq_dict + + +def find_motifs(seq: str, motif: str): + """ + Finds a motif of interest in a protein sequence + Arguments: + -seq (str) input protein sequence + -motif (str) motif to be found in sequence + Return: + -position(s) of the motif in seq + """ + positions = [] + for i in range(len(seq) - len(motif) + 1): + window = seq[i:i+len(motif)] + if window == motif: + positions.append(i) + return positions + + +def check_protein_seq(seq: str) -> str: + """ + Checks whether a sequence is written using 1-letter amino acid code + Arguments: + -seq (str) input protein sequence + Return: + - str, 
'single_letter_prot_seq' otherwise 'Invalid Input' error is raised + """ + unique_chars = set(seq) + single_letter = set('GALMFWKQESPVICYHRNDTgalmfwkqespvicyhrndt') + + if unique_chars <= single_letter: + seq = 'single_letter_prot_seq' + + else: + raise ValueError("Invalid Input") + return seq + + +def molecular_weight(seq: str) -> int: + """ + Calculates molecular weight of a protein + Arguments: + - seq (str) 1-letter coded protein sequence + Return: + - int, molecular weight (g/mol) rounded to integer + """ + list_input_seq = list(seq) + water_mw = 18 + for aa in list_input_seq: + total_mw = sum(aa_weight_dict[a] for a in list_input_seq) + mw_water_removed = (total_mw - (water_mw * (len(list_input_seq)-1))) + return mw_water_removed + + +def one_to_three_letter(seq: str) -> str: + """ + Converts a 1-letter amino acid code sequence into a 3-letter sequence + Arguments: + - seq (str) sequence to convert, must be 1-letter coded protein sequence + Return: + - str, a 3-letter coded protein sequence without spaces + """ + three_letter_aa = '' + for aa in seq: + three_letter_aa_seq += aa_code_dict[aa] + return three_letter_aa_seq + + +def run_protein_tool(*args: str, function: str, motif: Optional[str]=None): + """ + This is the main function + Arguments: + -seq(str) protein sequence(s) + -function(str) specify the function + -motif(str), optional argument for find_motifs function + Return: + -result of the specified function + """ + results = [] + for seq in args: + if check_protein_seq(seq) == 'single_letter_prot_seq': + if function == 'check_protein_seq': + for seq in args: + results.append(check_protein_seq(seq)) + elif function == 'molecular_weight': + for seq in args: + results.append(molecular_weight(seq)) + elif function == 'one_to_three_letter': + for seq in args: + results.append(one_to_three_letter(seq)) + elif function == 'amino_acid_frequency': + for seq in args: + results.append(amino_acid_frequency(seq)) + elif function == 'find_motifs': + for seq in 
args: + results.append(find_motifs(seq, motif)) + if len(results) == 1: + results = results[0] + return results + From e02237542fb084f09964ed1695304531d60c8ac3 Mon Sep 17 00:00:00 2001 From: lsmertina Date: Sat, 7 Oct 2023 17:16:08 +1100 Subject: [PATCH 03/24] Add nucleic acid functions from hw3 --- modules/nucleic_acid_module.py | 125 +++++++++++++++++++++++++++++++++ 1 file changed, 125 insertions(+) create mode 100644 modules/nucleic_acid_module.py diff --git a/modules/nucleic_acid_module.py b/modules/nucleic_acid_module.py new file mode 100644 index 0000000..c24d2bd --- /dev/null +++ b/modules/nucleic_acid_module.py @@ -0,0 +1,125 @@ +def transcribe(seq: str) -> str: + """ + Transcribes DNA->RNA + Argument is string + Return is string + """ + list_input = list(seq) + for i in range(len(seq)): + if (list_input[i] == 'T'): + list_input[i] = 'U' + elif (list_input[i] == 't'): + list_input[i]='u' + return "".join(list_input) + + +def reverse(seq: str) -> str: + """ + Returns reversed sequence + Argument is string + Return is string + """ + output = seq[::-1] + return output + + +def complement(seq: str) -> str: + """ + Returns a complementary sequence + Argument is a string + Return is a string + """ + list_input = list(seq) + for i in range(len(seq)): + if (list_input[i]=='G'): + list_input[i]='C' + elif (list_input[i]== 'g'): + list_input[i]='c' + elif (list_input[i]=='C'): + list_input[i]='G' + elif (list_input[i]=='c'): + list_input[i]='g' + elif (list_input[i] == 'T'): + list_input[i] = 'A' + elif (list_input[i] == 't'): + list_input[i]='a' + elif (list_input[i] == 'A'): + list_input[i] = 'T' + elif (list_input[i]=='a'): + list_input[i]='t' + + else: + + list_input = list(seq) + for i in range(len(seq)): + if (list_input[i]=='G'): + list_input[i]='C' + elif (list_input[i]== 'g'): + list_input[i]='c' + elif (list_input[i]=='C'): + list_input[i]='G' + elif (list_input[i]=='c'): + list_input[i]='g' + elif (list_input[i] == 'U'): + list_input[i] = 'A' + elif 
(list_input[i] == 'u'): + list_input[i]='a' + elif (list_input[i] == 'A'): + list_input[i] = 'U' + elif (list_input[i]=='a'): + list_input[i]='u' + return "".join(list_input) + + +def check_nucleic_acid(seq: str) -> str: + """ + This function checks whether input sequence(s) is a nucleic acid + Argument is str + Return is str + """ + unique_chars = set(seq) + nucleotides_dna = set('ATGCatgc') + nucleotides_rna = set('AUGCaugc') + if unique_chars <= nucleotides_dna: + seq = 'dna' + elif unique_chars <= nucleotides_rna: + seq = 'rna' + else: + raise ValueError("Invalid Input") + return seq + + +def reverse_complement(seq: str) -> str: + """ + This function returns a reversed complementary sequence + Argument is str + Return is str + """ + complement_seq = complement(seq) + reverse_compl_seq = reverse(complement_seq) + return reverse_compl_seq + + +def run_dna_rna_tools(*args: str, function: str) -> str: + """ + This function combines the functions above + Arguments: *args are input sequences, function is a function of choice + Returns: str, processed seqeunces depending on the function chosen + """ + results = [] + for seq in args: + check_nucleic_acid(seq) + if function == 'transcribe': + results.append(transcribe(seq)) + if function == 'complement': + results.append(complement(seq)) + if function == 'reverse': + results.append(reverse(seq)) + if function == 'reverse_complement': + results.append(reverse_complement(seq)) + if len(results) == 1: + results = results[0] + + return results + + From fc02f8280ed6c5950e528e4aa932764bd23a4698 Mon Sep 17 00:00:00 2001 From: lsmertina Date: Sat, 7 Oct 2023 17:46:39 +1100 Subject: [PATCH 04/24] Add the main script with 3 functions --- miscellaneous.py | 71 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 miscellaneous.py diff --git a/miscellaneous.py b/miscellaneous.py new file mode 100644 index 0000000..540510b --- /dev/null +++ b/miscellaneous.py @@ -0,0 +1,71 @@ +def 
fastq_filter(seqs: dict, gc_bounds: int, length_bounds: int, quality_threshold: int) -> dict: + """ + Filters fastq sequence by gc content, length and quality score + Arguments: dict with fastq sequences, filtering parameters + Returns filtered dictionary + """ + result = dict() + gc_filtered = gc_filter(seqs, gc_bounds) + length_filtered = length_filter(seqs, length_bounds) + quality_filtered = quality_filter(seqs, quality_threshold) + intersection = gc_filtered.keys() & length_filtered.keys() & quality_filtered.keys() + + for keys, (sequence, quality) in seqs.items(): + if keys in intersection: + result[keys] = (sequence, quality) + return result + + +def run_dna_rna_tools(*args: str, function: str) -> str: + """ + This function combines the functions above + Arguments: *args are input sequences, function is a function of choice + Returns: str, processed seqeunces depending on the function chosen + """ + results = [] + for seq in args: + check_nucleic_acid(seq) + if function == 'transcribe': + results.append(transcribe(seq)) + if function == 'complement': + results.append(complement(seq)) + if function == 'reverse': + results.append(reverse(seq)) + if function == 'reverse_complement': + results.append(reverse_complement(seq)) + if len(results) == 1: + results = results[0] + return results + + +def run_protein_tool(*args: str, function: str, motif: Optional[str]=None): + """ + This is the main function + Arguments: + -seq(str) protein sequence(s) + -function(str) specify the function + -motif(str), optional argument for find_motifs function + Return: + -result of the specified function + """ + results = [] + for seq in args: + if check_protein_seq(seq) == 'single_letter_prot_seq': + if function == 'check_protein_seq': + for seq in args: + results.append(check_protein_seq(seq)) + elif function == 'molecular_weight': + for seq in args: + results.append(molecular_weight(seq)) + elif function == 'one_to_three_letter': + for seq in args: + 
results.append(one_to_three_letter(seq)) + elif function == 'amino_acid_frequency': + for seq in args: + results.append(amino_acid_frequency(seq)) + elif function == 'find_motifs': + for seq in args: + results.append(find_motifs(seq, motif)) + if len(results) == 1: + results = results[0] + return results From e44c87b7791641256bab32049202c59aa9bdd9e4 Mon Sep 17 00:00:00 2001 From: lsmertina Date: Sat, 14 Oct 2023 12:27:13 +1100 Subject: [PATCH 05/24] Create a script --- updated_HW5.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 updated_HW5.py diff --git a/updated_HW5.py b/updated_HW5.py new file mode 100644 index 0000000..e69de29 From 92e6910a23f779b2b834d31b5e86bfe483bd8e82 Mon Sep 17 00:00:00 2001 From: lsmertina Date: Wed, 18 Oct 2023 13:04:22 +1100 Subject: [PATCH 06/24] Create bio_files_processor.py file for HW6 --- bio_files_processor.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 bio_files_processor.py diff --git a/bio_files_processor.py b/bio_files_processor.py new file mode 100644 index 0000000..e69de29 From b8cd252b6390b3c2cf45d7502afbe1a01b4b3c04 Mon Sep 17 00:00:00 2001 From: lsmertina Date: Fri, 23 Feb 2024 17:50:44 +1100 Subject: [PATCH 07/24] add script and sample data --- biopython_gc_filter.ipynb | 77 ++++++ example_fastq.fastq | 356 ++++++++++++++++++++++++++++ example_fastq.fastq:Zone.Identifier | 3 + misc_module | 1 + 4 files changed, 437 insertions(+) create mode 100644 biopython_gc_filter.ipynb create mode 100644 example_fastq.fastq create mode 100644 example_fastq.fastq:Zone.Identifier create mode 160000 misc_module diff --git a/biopython_gc_filter.ipynb b/biopython_gc_filter.ipynb new file mode 100644 index 0000000..a3da0d8 --- /dev/null +++ b/biopython_gc_filter.ipynb @@ -0,0 +1,77 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 7, + "id": "21c5a784", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "9\n" + ] + } + ], + 
"source": [ + "from Bio import SeqIO\n", + "from Bio.SeqUtils import GC\n", + "\n", + "def filter_fastq(input_path: str, quality_threshold: int, output_filename=\"final_filtered.fastq\", gc_bounds=(40, 60), length_bounds=(50, 350)):\n", + " filename = input_path\n", + " records = SeqIO.parse(filename, \"fastq\")\n", + " ###quality filter\n", + " good_reads = (rec for rec in records if min(rec.letter_annotations[\"phred_quality\"]) >= quality_threshold)\n", + " result_quality = SeqIO.write(good_reads, \"good_quality.fastq\", \"fastq\")\n", + " result_quality_GC = SeqIO.parse(\"good_quality.fastq\", \"fastq\")\n", + " \n", + " ###GC content filter\n", + " min_gc_content = gc_bounds[0]\n", + " max_gc_content = gc_bounds[1]\n", + " GC_quality_filt = []\n", + " \n", + " for sequence in result_quality_GC:\n", + " if min_gc_content <= GC(sequence.seq) <= max_gc_content:\n", + " GC_quality_filt.append(sequence)\n", + " \n", + " result_quality = SeqIO.write(GC_quality_filt, \"good_quality_GC.fastq\", \"fastq\")\n", + " result_quality_GC_length = SeqIO.parse(\"good_quality_GC.fastq\", \"fastq\")\n", + " \n", + " ##length filter\n", + " filtered_GC_quality_length = []\n", + " \n", + " for sequence in result_quality_GC_length:\n", + " if len(sequence.seq) >= length_bounds[0] and len(sequence.seq) <= length_bounds[1]:\n", + " filtered_GC_quality_length.append(sequence)\n", + " \n", + " result_quality = SeqIO.write(filtered_GC_quality_length, output_filename, \"fastq\")\n", + " \n", + " print(result_quality)\n", + "\n", + "#filter_fastq(\"example_fastq.fastq\", 15)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + } + }, + "nbformat": 4, + 
"nbformat_minor": 5 +} diff --git a/example_fastq.fastq b/example_fastq.fastq new file mode 100644 index 0000000..883b51f --- /dev/null +++ b/example_fastq.fastq @@ -0,0 +1,356 @@ +@SRX079804:1:SRR292678:1:1101:21885:21885 1:N:0:1 BH:ok +ACAGCAACATAAACATGATGGGATGGCGTAAGCCCCCGAGATATCAGTTTACCCAGGATAAGAGATTAAATTATGAGCAACATTATTAA ++SRX079804:1:SRR292678:1:1101:21885:21885 1:N:0:1 BH:ok +FGGGFGGGFGGGFGDFGCEBB@CCDFDDFFFFBFFGFGEFDFFFF;D@DD>C@DDGGGDFGDGG?GFGFEGFGGEF@FDGGGFGFBGGD +@SRX079804:1:SRR292678:1:1101:24563:24563 1:N:0:1 BH:failed +ATTAGCGAGGAGGAGTGCTGAGAAGATGTCGCCTACGCCGTTGAAATTCCCTTCAATCAGGGGGTACTGGAGGATACGAGTTTGTGTG ++SRX079804:1:SRR292678:1:1101:24563:24563 1:N:0:1 BH:failed +BFFFFFFFB@B@A<@D>BDDACDDDEBEDEFFFBFFFEFFDFFF=CC@DDFD8FFFFFFF8/+.2,@7<<:?B/:<><-><@.A*C>D +@SRX079804:1:SRR292678:1:1101:30161:30161 1:N:0:1 BH:failed +GAACGACAGCAGCTCCTGCATAACCGCGTCCTTCTTCTTTAGCGTTGTGCAAAGCATGTTTTGTATTACGGGCATCTCGAGCGAATC ++SRX079804:1:SRR292678:1:1101:30161:30161 1:N:0:1 BH:failed +DFFFEGDGGGGFGGEDCCDCEFFFFCCCCCB>CEBFGFBGGG?DE=:6@=>AD?D8DCEE:>EEABE5D@5:DDCA;EEE-DCD +@SRX079804:1:SRR292678:1:1101:47176:47176 1:N:0:1 BH:failed +TGAAGCGTCGATAGAAGTTAGCAAACCCGCGGAACTTCCGTACATCAGACACATTCCGGGGGGTGGGCCAATCCATGATGCCTTTG ++SRX079804:1:SRR292678:1:1101:47176:47176 1:N:0:1 BH:failed +FF@FFBEEEEFFEFFD@EDEFFB=DFEEFFFE8FFE8EEDBFDFEEBE+E46.'8.5::EE:?E>A6@?)>;>9D<C9DEBAAB=5C?<@0=A?D@BDB;:BA?BDDFH?B@DCB6BEBDA??AA9. 
+@SRX079804:1:SRR292678:1:1101:52180:52180 1:N:0:1 BH:failed +TTTTTGGTTTTAGGTCTAACATGTAAGTCTTTAATCTATTTTGAATTAATTTTTGGATGAGGTGTAAGGAAGGGATCCAGTTTC ++SRX079804:1:SRR292678:1:1101:52180:52180 1:N:0:1 BH:failed +FEFFFFFF=FEEFDFD>EE:?<5@BFAFCFEE7>C>:/6:2<344DA:6DDCDC>)34773DD?@DBA9B +@SRX079804:1:SRR292678:1:1101:105156:105156 1:N:0:1 BH:ok +ACTGCTGAGCTTAAATGGCGGCAGTCTGACGGTTACCAACGGGGGCACTTCAACCGGTTCGTTAACGGGGAGCGGAGAGCTGA ++SRX079804:1:SRR292678:1:1101:105156:105156 1:N:0:1 BH:ok +GFFEGGFGGGGEGGGGGGGGGFDD=DDE7EDD6CD?FEDEE@EBEFEE.DD5DDD@B<7>/0543C?BEE?@@BE@; +@SRX079804:1:SRR292678:1:1101:135168:135168 1:N:0:1 BH:ok +CCTTCCTAAAAATTAAGAATCTTAACAATTAGCAGCACAACCAAAATTATTACCGAAAGGACTTACTCCTCCGCCAAATCCA ++SRX079804:1:SRR292678:1:1101:135168:135168 1:N:0:1 BH:ok +GGGEGGGEGGBFFFFFGCFFEGG6AEEEDEEGGFGGCG4EGFFGDFFBGEGFGGFECBDEGGEFGBFF?CGFBFGGGGFGFF +@SRX079804:1:SRR292678:1:1101:149302:149302 1:N:0:1 BH:changed:1 +TAGGGTTGTATTTGCAGATCCATGGCATGCCAAAAAGAACATCGTCCCGTCCAATATCTGCAACATACCAGTTGGTTGGTA ++SRX079804:1:SRR292678:1:1101:149302:149302 1:N:0:1 BH:changed:1 +@;CBA=:@;@DBDCDEEE/EEEEEEF@>FBEEB=EFA>EEBD=DAEEEEB9)99>B99BC)@,@<9CDD=C,5;B::?@;A +@SRX079804:1:SRR292678:1:1101:170868:170868 2:N:0:1 BH:failed +CTGCCGAGACTGTTCTCAGACATGGAAAGCTCGATTCGCATACACTCGCTGAGTAAGAGAGTCACACCAAATCACAGATT ++SRX079804:1:SRR292678:1:1101:170868:170868 2:N:0:1 BH:failed +E;FFFEGFGIGGFBG;C6D<@C7CDGFEFGFHDFEHHHBBHHFDFEFBAEEEEDE@A2=DA:??C3:@>EEBEEHEFEHHFFHH?FGBGFBBD77B;;C?FFFFGGFED.BBABBG@DBBE +@SRX079804:1:SRR292678:1:1101:190845:190845 1:N:0:1 BH:changed:1 +CCTCAGCGTGGATTGCCGCTCATGCAGGAGCAGATAATCCCTTCGCCATCCCATTAAGCGCCGTTGTCGGTATTCC ++SRX079804:1:SRR292678:1:1101:190845:190845 1:N:0:1 BH:changed:1 +FF@FFCFEECEBEC@@BBBBDFBBFFDFFEFFEB8FFFFFFFFEFCEB/>BBA@AFFFEEEEECE;ACD@DBBEEE +@SRX079804:1:SRR292678:1:1101:198993:198993 2:N:0:1 BH:failed +AGTTATTTATGCATCATTCTCATGTATGAGCCAACAAGATAGTACAAGTTTTATTGCTATGAGTTCAGTACAACA ++SRX079804:1:SRR292678:1:1101:198993:198993 2:N:0:1 BH:failed 
+<<<=;@B??@<>@><48876EADEG6B.BB@.?+98204<:<>@?A=@EFEFFFEEFB +@SRX079804:1:SRR292678:1:1101:204480:204480 1:N:0:1 BH:failed +AGTGAGACACCCCTGAACATTCCTAGTAAGACATCTTTGAATATTACTAGTTAGCCACACTTTAAAATGACCCG ++SRX079804:1:SRR292678:1:1101:204480:204480 1:N:0:1 BH:failed +<98;<@@@:@CD@BCCDD=DBBCEBBAAA@9???@BCDBCGF=GEGDFGDBEEEEEFFFF=EDEE=DCD@@BBC +@SRX079804:1:SRR292678:1:1101:212327:212327 2:N:0:1 BH:failed +TTACCTCTGCTTTTTCGCCTGTTACTTCTACTAATCCTTCATCTATTGCGAATGGCCCTACTACTGACGAAAT ++SRX079804:1:SRR292678:1:1101:212327:212327 2:N:0:1 BH:failed +DBCCC@@;A@BDCCCE>BBED>GDCDBFBFFEBEECFGGD@@BCB<<8@;09746:@@>@EEECEEDE/FEED +@SRX079804:1:SRR292678:1:1101:230386:230386 1:N:0:1 BH:failed +TTCTGATTGGAGTGAGAGTGCCATTTGTTTCGCTGATTGGACGTTGGAAAGCGCCTTGACCTTTGACAGCAG ++SRX079804:1:SRR292678:1:1101:230386:230386 1:N:0:1 BH:failed +=BDD=DCDBBCDADD@@B;B@CC7C@B@>=BACDD,=??8DDCDD7CCCCDBDEDBDDDDDEEBDDDBD?DB +@SRX079804:1:SRR292678:1:1101:236093:236093 1:N:0:1 BH:failed +CATCTAACTCTCATTCTAGATTCTTAAGTTGGCTACACTTTGCCGTCATTCTCGGTGGATTAGCTATTGGG ++SRX079804:1:SRR292678:1:1101:236093:236093 1:N:0:1 BH:failed +A:@A@;@BB@GGFGG@A@@817729B +@SRX079804:1:SRR292678:1:1101:251912:251912 2:N:0:1 BH:ok +TGTAAATGGAAATGAACCTAATATGTATGCACAAACTATTAAAGCATATCTTGCAAAAGGAGCAATGGCG ++SRX079804:1:SRR292678:1:1101:251912:251912 2:N:0:1 BH:ok +GHFDHHHGHHHFFHHGHHHGGGGGGDEFFEGGGGEFHHFHGFGHHHHFHHDDD6@=DCACFFGGEFBBG +@SRX079804:1:SRR292678:1:1101:278698:278698 1:N:0:1 BH:ok +CTAATAATGGTAATTGAACCATAGAAGATAAGTTCATAATGTAATAAATACATCCATAGAGTTATTAA ++SRX079804:1:SRR292678:1:1101:278698:278698 1:N:0:1 BH:ok +DDBDBCCCDD@FFFB9<<<@DA=DA@B:@=@@AC@GGFCGECFFDGGCGFFGGFFCEBF9>?@>BDFF +@SRX079804:1:SRR292678:1:1101:295878:295878 1:N:0:1 BH:changed:1 +AATGCAAACAGGATGATATTTGAATCCGTAATACTGTTCTTTCATCATAAATAATTTATGCAGATAC ++SRX079804:1:SRR292678:1:1101:295878:295878 1:N:0:1 BH:changed:1 +HHHHHEGFHDGFEGBCBEEEGGGG@EDGCGBBBEGF4?EFDBDDBFE8DEE-E?EE;B@EFC=;FDE +@SRX079804:1:SRR292678:1:1101:306575:306575 1:N:0:1 BH:failed 
+ATCTTTGAACTTTGTCTCACTTTCCCCCATCTCACCATTCCTCCTGTTCTGTGAACCCCAGTTTCA ++SRX079804:1:SRR292678:1:1101:306575:306575 1:N:0:1 BH:failed +E::EA@E<6B8>97:<6084649?@:?EDED=BEBEGGFEDGECECBDEFDDEEGGEDDFFF +@SRX079804:1:SRR292678:1:1101:403661:403661 1:N:0:1 BH:failed +GATGGCTTTGCTTTCTCATTCTCCTCTCCATCGTTCCCATCTTCGCCCTCAGACGCTGATTGAT ++SRX079804:1:SRR292678:1:1101:403661:403661 1:N:0:1 BH:failed +EG=DFFDFFDDGDGGGGGFFGGGGBADADCEEE5EC>CCCE6BEEEGGGGCBEGGE@9BCEF;>>D@D +@SRX079804:1:SRR292678:1:1101:425870:425870 1:N:0:1 BH:failed +GTACAGCTCTCCTCGTTACCAGCACATCTTGGACACCCGACGAAGACTTTGACATGCTCCTC ++SRX079804:1:SRR292678:1:1101:425870:425870 1:N:0:1 BH:failed +E@EED@EF=D>=EED@D@7DBF +@SRX079804:1:SRR292678:1:1101:429745:429745 1:N:0:1 BH:failed +GGCGATTGTGAAGGCATAAGAGTGGGACATAGTTCAAGTCCAGAACGAATTAAACGCACAA ++SRX079804:1:SRR292678:1:1101:429745:429745 1:N:0:1 BH:failed +BB2B=A@A:BFBFFFFF +@SRX079804:1:SRR292678:1:1101:475293:475293 2:N:0:1 BH:ok +TACATTTGATTTCTTTATAAGATTTCTTACTGTAAAATCATCGCTATTTAACAGCTTATT ++SRX079804:1:SRR292678:1:1101:475293:475293 2:N:0:1 BH:ok +FHFFFFDC@FGFEDGE?EEDC6EEEDEF?EEEE8EHHGGHFGFFEGGGGBFBDBDEBCBCA +@SRX079804:1:SRR292678:1:1101:511594:511594 1:N:0:1 BH:ok +AGCACAATCAGATTCGCTTATGACGGCGATGAAGAAATTGCGATGAAATGTGAGGTGA ++SRX079804:1:SRR292678:1:1101:511594:511594 1:N:0:1 BH:ok +HCHFFHHHGFHBFFFEGFFEFHFEHGBGECHEHB?CDDEFCDBFF9DFCD.BC +@SRX079804:1:SRR292678:1:1101:527839:527839 2:N:0:1 BH:changed:4 +GACATTTCTTTCTGGGAAGGCTTATTAAACGATAAAGATAATGATATACGTTTTGAT ++SRX079804:1:SRR292678:1:1101:527839:527839 2:N:0:1 BH:changed:4 +>35/-;,><04%'A.?4?:>BE.DC-@???CE:@EDFDBDG7B;=<)?1.@?2A5<5 +@SRX079804:1:SRR292678:1:1101:547309:547309 1:N:0:1 BH:failed +AGGTCTGCACTGGCTCTTCAGAGCGCAAGCGAGGGAAGAGAAATATAGTGACGCAC ++SRX079804:1:SRR292678:1:1101:547309:547309 1:N:0:1 BH:failed +GEGFGFD=FDB8B7DDFFFF@/DC@+2:66>-@>9);<2: +@SRX079804:1:SRR292678:1:1101:570367:570367 1:N:0:1 BH:failed +CTATGGCCACTTTTGTATTCTCGATTGAGGTTATTCGCTCACCCATCTTTTCCAA ++SRX079804:1:SRR292678:1:1101:570367:570367 
1:N:0:1 BH:failed +FFEE;FFFFBFFFFF<9BEDEBEACDD3DD0B5>>0?:@>FFBEEBEFFFFGCCBCDGIGDGHEFGG=GGGGHFHEHF@FEFE?CGEEEFADEAEC +@SRX079804:1:SRR292678:1:1101:590521:590521 1:N:0:1 BH:failed +CCTCTTCTCTTTCGCGGTCATCTTGGGTTTCGCGCCTTTCTTCTTGACGACAC ++SRX079804:1:SRR292678:1:1101:590521:590521 1:N:0:1 BH:failed +EGFDD@FDF=FDFFEEGGBEFDGF=FGAGEEEEEDFBGGEE8EEE@>5;>90< +@SRX079804:1:SRR292678:1:1101:601307:601307 1:N:0:1 BH:changed:1 +TTGGCGTGCTGATGATTATCGGTATCTTCAAAGGCGCGCAGCCTGCGGGCTG ++SRX079804:1:SRR292678:1:1101:601307:601307 1:N:0:1 BH:changed:1 +GGGEGFGGEGE:EE>GFFGGGGDCGEBFFF>G=EBFFEC?DFGAD?DDECBE +@SRX079804:1:SRR292678:1:1101:631057:631057 1:N:0:1 BH:failed +CACCATTATCCTATTTCTGAACACATTTGACAGTCACGGCACTAGCATTGG ++SRX079804:1:SRR292678:1:1101:631057:631057 1:N:0:1 BH:failed +GGGGGGGGBFGGGEGFGGGGGFGEFFDFFFF?EBFEEDBFGEE@BE;E?E7 +@SRX079804:1:SRR292678:1:1101:654270:654270 1:N:0:1 BH:failed +ATCTTTCTCTCTCTGCTGCATTCTCCGCTTCAGCTCCTCAATTTCAATCA ++SRX079804:1:SRR292678:1:1101:654270:654270 1:N:0:1 BH:failed +GGGGGGEDGGFEGDGCGGBCEFGGGGFBFGGEGGGGGEGFCFEEGGFEDB +@SRX079804:1:SRR292678:1:1101:667761:667761 1:N:0:1 BH:failed +CAGCCTTTTGAGGTCGTCTATCGCAGCGTGTCCGCGACGTTTTGTTGCG ++SRX079804:1:SRR292678:1:1101:667761:667761 1:N:0:1 BH:failed +GGFGGG=GGGG@GFGGG@GGEGFGGGEGGGFGGEG@EEEDE8EE=E=DE +@SRX079804:1:SRR292678:1:1101:671526:671526 1:N:0:1 BH:ok +CAAGTAGAAGGTTAGCGCCTCTCTGTAAAAGGAGTCAAGCGCTATGTC ++SRX079804:1:SRR292678:1:1101:671526:671526 1:N:0:1 BH:ok +HHHHHGHEHHDGGGCGGGGHFGGGGEDGGGGGBEBEBEEEFGHFHFHH +@SRX079804:1:SRR292678:1:1101:685633:685633 2:N:0:1 BH:failed +GCTGTAATGGATCCACTAATTGGGACAGTGGTGGATAAAACGAATAC ++SRX079804:1:SRR292678:1:1101:685633:685633 2:N:0:1 BH:failed +DADADCGFEFEEEGEGGEFDEEEEBDAC;C10<9?AGBGGGFF7DFB +@SRX079804:1:SRR292678:1:1101:769626:769626 1:N:0:1 BH:failed +GTATTCCTTGACAGTCGAAAGAATCACTGCTAACCCAGGC ++SRX079804:1:SRR292678:1:1101:769626:769626 1:N:0:1 BH:failed +C>5BBCCCD=ACDD@A7@@B@A?B?=8B??EDEEDEBDEE +@SRX079804:1:SRR292678:1:1101:776222:776222 1:N:0:1 BH:failed 
+AGAACGTACACCCTACGCTAAGCAGTGGCTCCATGCCAA ++SRX079804:1:SRR292678:1:1101:776222:776222 1:N:0:1 BH:failed +BGDGGGGFEGGGGEGGGGFGFF6=FCFAFFEE,=C?EDD +@SRX079804:1:SRR292678:1:1101:782183:782183 2:N:0:1 BH:failed +TACGCGTAACGACGTCATAGCCATGACGCTTCAATAAA ++SRX079804:1:SRR292678:1:1101:782183:782183 2:N:0:1 BH:failed +D +@SRX079804:1:SRR292678:1:1101:828383:828383 1:N:0:1 BH:failed +AAACCACATATGACATGAGTGACGGGACTAAAGTTC ++SRX079804:1:SRR292678:1:1101:828383:828383 1:N:0:1 BH:failed +FFEEFEEB=E,C>CDEEEECEBEEEC?F;BDDDDAE +@SRX079804:1:SRR292678:1:1101:829239:829239 1:N:0:1 BH:ok +TCGATCCTTCTGCCTCAAAGTATACTAGGACGCAT ++SRX079804:1:SRR292678:1:1101:829239:829239 1:N:0:1 BH:ok +GGGDFGGBGFFEBFEDCBCDCGGGGBEEE=GE?EE +@SRX079804:1:SRR292678:1:1101:868419:868419 1:N:0:1 BH:ok +ATTCGTCAGGCCCAATAACATCATGAATTTCCAG ++SRX079804:1:SRR292678:1:1101:868419:868419 1:N:0:1 BH:ok +DEEEEEEEBDFFFFFFFF8FEED8@FFFBFFEFF +@SRX079804:1:SRR292678:1:1101:892716:892716 2:N:0:1 BH:ok +AATTTACCTAATGGAATCAATGAGGCTACTCCA ++SRX079804:1:SRR292678:1:1101:892716:892716 2:N:0:1 BH:ok +@BCBBBCCBCCCCCCCDDAAFFGEEEDBF@EE@ +@SRX079804:1:SRR292678:1:1101:893159:893159 2:N:0:1 BH:ok +TTCTCTGCTTTTCATATCTTGTCATAAAAATT ++SRX079804:1:SRR292678:1:1101:893159:893159 2:N:0:1 BH:ok +CBDCDEEEEEBEEEEGDDFDEEEEDGFFEGFE +@SRX079804:1:SRR292678:1:1101:918742:918742 1:N:0:1 BH:failed +CTCTCCATGCACAAAGAATATCACAGCCAAA ++SRX079804:1:SRR292678:1:1101:918742:918742 1:N:0:1 BH:failed +EEEBA?@;B@EEE@BEE=?EDDDDADCDA?E +@SRX079804:1:SRR292678:1:1101:923787:923787 2:N:0:1 BH:ok +TTGTGAAGGATGGGATATTAGTGTAGATGA ++SRX079804:1:SRR292678:1:1101:923787:923787 2:N:0:1 BH:ok +EEBBEGEEE=BBB<@DCDCGD@D>=DEGEE +@SRX079804:1:SRR292678:1:1101:933189:933189 1:N:0:1 BH:failed +GTCTGCACTATCGAGGGCTGTGCCTTTGC ++SRX079804:1:SRR292678:1:1101:933189:933189 1:N:0:1 BH:failed +FEFFDBFF8FE>?DFFFCEBCEEBBEDE6 +@SRX079804:1:SRR292678:1:1101:937136:937136 1:N:0:1 BH:failed +TTTCTTTGGCTTAAAGATAGTTTTAGTC ++SRX079804:1:SRR292678:1:1101:937136:937136 1:N:0:1 BH:failed 
+EFFFEEEEFCBCDDDDE@/E?@@7@@3< +@SRX079804:1:SRR292678:1:1101:940351:940351 1:N:0:1 BH:changed:1 +TGCCGTGGGAATGACAAACAAGCATCC ++SRX079804:1:SRR292678:1:1101:940351:940351 1:N:0:1 BH:changed:1 +DECC@GFFBF=EBEAFDFGD?FFF8FF +@SRX079804:1:SRR292678:1:1101:940693:940693 1:N:0:1 BH:failed +CACATTATGAACTATGGGCACTGCAT ++SRX079804:1:SRR292678:1:1101:940693:940693 1:N:0:1 BH:failed +EEEGFDEDFEGGGGGFEGBGGGFGGG +@SRX079804:1:SRR292678:1:1101:955819:955819 1:N:0:1 BH:failed +CACCTAGCAGCAACGGACGAGTCAG ++SRX079804:1:SRR292678:1:1101:955819:955819 1:N:0:1 BH:failed +GGGGGEEEGGEGGGFGEGG;F@EFF +@SRX079804:1:SRR292678:1:1101:958051:958051 2:N:0:1 BH:ok +TTAATATTTCCATCTGAACTTCGC ++SRX079804:1:SRR292678:1:1101:958051:958051 2:N:0:1 BH:ok +EDDBGFEGFGHHFHGGEDEGBGDB +@SRX079804:1:SRR292678:1:1101:996098:996098 1:N:0:1 BH:failed +CTAAGAGAGTTTGTAATGCGGAC ++SRX079804:1:SRR292678:1:1101:996098:996098 1:N:0:1 BH:failed +DD=DBDBDC4EFFFD@?CD@ACD +@SRX079804:1:SRR292678:1:1101:1020278:1020278 2:N:0:1 BH:ok +AAAGTGCAGAACATGCAGATAT ++SRX079804:1:SRR292678:1:1101:1020278:1020278 2:N:0:1 BH:ok +D>AC?GDDCD?DDADE@GABDG +@SRX079804:1:SRR292678:1:1101:1022234:1022234 1:N:0:1 BH:ok +GACCTTTCCGCAAGCTGTCGC ++SRX079804:1:SRR292678:1:1101:1022234:1022234 1:N:0:1 BH:ok +HHHHFEHHHHGGHHHGDHEEG +@SRX079804:1:SRR292678:1:1101:1024144:1024144 1:N:0:1 BH:failed +TCCATTATGAAAGAAGAAAA ++SRX079804:1:SRR292678:1:1101:1024144:1024144 1:N:0:1 BH:failed +@A><96:6: +@SRX079804:1:SRR292678:1:1101:1175112:1175112 1:N:0:1 BH:failed +AGGCC ++SRX079804:1:SRR292678:1:1101:1175112:1175112 1:N:0:1 BH:failed +EC8EE +@SRX079804:1:SRR292678:1:1101:1182927:1182927 1:N:0:1 BH:changed:1 +TGAA ++SRX079804:1:SRR292678:1:1101:1182927:1182927 1:N:0:1 BH:changed:1 +GGBH +@SRX079804:1:SRR292678:1:1101:1243474:1243474 1:N:0:1 BH:ok +ACG ++SRX079804:1:SRR292678:1:1101:1243474:1243474 1:N:0:1 BH:ok +EEE +@SRX079804:1:SRR292678:1:1101:1266246:1266246 1:N:0:1 BH:failed +AA ++SRX079804:1:SRR292678:1:1101:1266246:1266246 1:N:0:1 BH:failed +C@ 
+@SRX079804:1:SRR292678:1:1101:1269735:1269735 1:N:0:1 BH:failed +C ++SRX079804:1:SRR292678:1:1101:1269735:1269735 1:N:0:1 BH:failed +G diff --git a/example_fastq.fastq:Zone.Identifier b/example_fastq.fastq:Zone.Identifier new file mode 100644 index 0000000..1bf0b28 --- /dev/null +++ b/example_fastq.fastq:Zone.Identifier @@ -0,0 +1,3 @@ +[ZoneTransfer] +ZoneId=3 +HostUrl=https://github.com/ diff --git a/misc_module b/misc_module new file mode 160000 index 0000000..d29d55c --- /dev/null +++ b/misc_module @@ -0,0 +1 @@ +Subproject commit d29d55c0582e79c291546bcf9594f22155295f5e From abc3bd63ec15e3c900fbfba04e86fd099a4ef2ca Mon Sep 17 00:00:00 2001 From: Elena Smertina <104040609+sme229@users.noreply.github.com> Date: Fri, 23 Feb 2024 17:58:52 +1100 Subject: [PATCH 08/24] Delete modules directory --- modules/fastq_filter_functions.py | 106 ------------------------ modules/nucleic_acid_module.py | 125 ---------------------------- modules/protein_module.py | 130 ------------------------------ 3 files changed, 361 deletions(-) delete mode 100644 modules/fastq_filter_functions.py delete mode 100644 modules/nucleic_acid_module.py delete mode 100644 modules/protein_module.py diff --git a/modules/fastq_filter_functions.py b/modules/fastq_filter_functions.py deleted file mode 100644 index 758db74..0000000 --- a/modules/fastq_filter_functions.py +++ /dev/null @@ -1,106 +0,0 @@ -def calc_gc_content(seq: str) -> float: - """ - Calculates gc content - Argument is string - Returns float in % - """ - seq_lower = seq.lower() - length_seq = len(seq_lower) - gc_count = 0 - for nt in seq_lower: - if nt=='g' or nt=='c': - gc_count+=1 - gc_content = (gc_count/length_seq)*100 - return gc_content -def seq_length(seq: str) -> str: - """ - Calculates sequence length - Argument is string - Returns string - """ - return len(seq) -def quality_score(seq: str) -> int: - """ - Calculates numeric quality score - Argument is string - Returns int value - """ - score_count = 0 - length_q_seq = 
len(seq) - for symbol in seq: - score_num = ord(symbol) - 33 - score_count+=score_num - mean_qs = (score_count/length_q_seq) - return mean_qs -def length_filter(seqs: dict, length_bounds=(0,1000)) -> dict: - """ - Filters fastq reads by length - Arguments: - -dictionary - -sequence length parameters (>= and <=) - Returns filtered dictionary - """ - #seqs = {'name': ('sequence', 'quality')} - output = [] - result = dict() - for name, (sequence, quality) in seqs.items(): - - if seq_length(sequence) <= length_bounds[1] and seq_length(sequence) >= length_bounds[0]: - output.append(name) - if name in output: - result[name] = (sequence, quality) - - return result -def quality_filter(seqs: dict, quality_threshold=25) -> dict: - """ - Filters fastq reads by quality score - Arguments: - -dictionary - -quality score threshold (>=) - Returns filtered dict - """ - #seqs = {'name': ('sequence', 'quality')} - output = [] - result = dict() - for name, (sequence, quality) in seqs.items(): - - if quality_score(quality) >= quality_threshold: - output.append(name) - if name in output: - result[name] = (sequence, quality) - - return result -def gc_filter(seqs: dict, gc_bounds=(0,100)) -> dict: - """ - Filters fastq reads by gc content - Arguments: - -dict - -gc content parameters (>= and <=) - Returns filtered dict - """ - #seqs = {'name': ('sequence', 'quality')} - output = [] - result = dict() - for name, (sequence, quality) in seqs.items(): - - if calc_gc_content(sequence) >= gc_bounds[0] and calc_gc_content(sequence) <= gc_bounds[1]: - output.append(name) - if name in output: - result[name] = (sequence, quality) - - return result -def fastq_filter(seqs: dict, gc_bounds: int, length_bounds: int, quality_threshold: int) -> dict: - """ - Filters fastq sequence by gc content, length and quality score - Arguments: dict with fastq sequences, filtering parameters - Returns filtered dictionary - """ - resulting_sequences = dict() - gc_filtered = gc_filter(seqs, gc_bounds = (0,100)) - 
length_filtered = length_filter(seqs, length_bounds = (0,1000)) - quality_filtered = quality_filter(seqs, quality_threshold = 15) - intersection = gc_filtered.keys() & length_filtered.keys() & quality_filtered.keys() - #intersection = {keys: gc_filtered[keys] for keys in gc_filtered.keys() & length_filtered.keys()} - #for keys, (sequence, quality) in intersection: - # resulting_sequences[keys] = (sequence, quality) - return intersection diff --git a/modules/nucleic_acid_module.py b/modules/nucleic_acid_module.py deleted file mode 100644 index c24d2bd..0000000 --- a/modules/nucleic_acid_module.py +++ /dev/null @@ -1,125 +0,0 @@ -def transcribe(seq: str) -> str: - """ - Transcribes DNA->RNA - Argument is string - Return is string - """ - list_input = list(seq) - for i in range(len(seq)): - if (list_input[i] == 'T'): - list_input[i] = 'U' - elif (list_input[i] == 't'): - list_input[i]='u' - return "".join(list_input) - - -def reverse(seq: str) -> str: - """ - Returns reversed sequence - Argument is string - Return is string - """ - output = seq[::-1] - return output - - -def complement(seq: str) -> str: - """ - Returns a complementary sequence - Argument is a string - Return is a string - """ - list_input = list(seq) - for i in range(len(seq)): - if (list_input[i]=='G'): - list_input[i]='C' - elif (list_input[i]== 'g'): - list_input[i]='c' - elif (list_input[i]=='C'): - list_input[i]='G' - elif (list_input[i]=='c'): - list_input[i]='g' - elif (list_input[i] == 'T'): - list_input[i] = 'A' - elif (list_input[i] == 't'): - list_input[i]='a' - elif (list_input[i] == 'A'): - list_input[i] = 'T' - elif (list_input[i]=='a'): - list_input[i]='t' - - else: - - list_input = list(seq) - for i in range(len(seq)): - if (list_input[i]=='G'): - list_input[i]='C' - elif (list_input[i]== 'g'): - list_input[i]='c' - elif (list_input[i]=='C'): - list_input[i]='G' - elif (list_input[i]=='c'): - list_input[i]='g' - elif (list_input[i] == 'U'): - list_input[i] = 'A' - elif (list_input[i] 
== 'u'): - list_input[i]='a' - elif (list_input[i] == 'A'): - list_input[i] = 'U' - elif (list_input[i]=='a'): - list_input[i]='u' - return "".join(list_input) - - -def check_nucleic_acid(seq: str) -> str: - """ - This function checks whether input sequence(s) is a nucleic acid - Argument is str - Return is str - """ - unique_chars = set(seq) - nucleotides_dna = set('ATGCatgc') - nucleotides_rna = set('AUGCaugc') - if unique_chars <= nucleotides_dna: - seq = 'dna' - elif unique_chars <= nucleotides_rna: - seq = 'rna' - else: - raise ValueError("Invalid Input") - return seq - - -def reverse_complement(seq: str) -> str: - """ - This function returns a reversed complementary sequence - Argument is str - Return is str - """ - complement_seq = complement(seq) - reverse_compl_seq = reverse(complement_seq) - return reverse_compl_seq - - -def run_dna_rna_tools(*args: str, function: str) -> str: - """ - This function combines the functions above - Arguments: *args are input sequences, function is a function of choice - Returns: str, processed seqeunces depending on the function chosen - """ - results = [] - for seq in args: - check_nucleic_acid(seq) - if function == 'transcribe': - results.append(transcribe(seq)) - if function == 'complement': - results.append(complement(seq)) - if function == 'reverse': - results.append(reverse(seq)) - if function == 'reverse_complement': - results.append(reverse_complement(seq)) - if len(results) == 1: - results = results[0] - - return results - - diff --git a/modules/protein_module.py b/modules/protein_module.py deleted file mode 100644 index d71da27..0000000 --- a/modules/protein_module.py +++ /dev/null @@ -1,130 +0,0 @@ -from typing import Optional - -aa_code_dict = {'C':'Cys', 'c':'Cys', 'D':'Asp', 'd':'Asp', 'S':'Ser', 's':'Ser', 'Q':'Gln', 'q':'Gln', - 'K':'Lys', 'k':'Lys', 'I':'Ile', 'i':'Ile', 'P':'Pro', 'p':'Pro', 'T':'Thr', 't':'Thr', - 'F':'Phe', 'f':'Phe', 'N':'Asn', 'n':'Asn', 'G':'Gly', 'g':'Gly', 'H':'His', 'h':'His', - 
'L':'Leu', 'l':'Leu', 'R':'Arg', 'r':'Arg', 'W':'Trp', 'w':'Trp', 'A':'Ala', 'a':'Ala', - 'V':'Val', 'v':'Val', 'E':'Glu', 'e':'Glu', 'Y':'Tyr', 'y':'Tyr', 'M':'Met', 'm':'Met'} - -aa_weight_dict = {'G':75, 'g':75, 'A':89, 'a':89, 'R':174, 'r':174, 'N':132, 'n':132, - 'D':133, 'd':133, 'C':121, 'c':133, 'E':147, 'e':147, 'Q':146, 'q':146, - 'H':155, 'h':155, 'I':131, 'i':131, 'L':131, 'l':131, 'K':146, 'k':146, - 'M':149, 'm':149, 'F':165, 'f':165, 'P':115, 'p':115, 'S':105, 's':105, - 'T':119, 't':119, 'W':204, 'w':204, 'Y':181, 'y':181, 'V':117, 'v':117} -def amino_acid_frequency(seq: str) -> dict: - """ - Calculates amino acid frequencies - Arguments: - -seq (str) input protein sequence - Return: - -dictionary with amino acid and its frequency - """ - freq_dict = {} - for letter in seq: - if letter in freq_dict: - freq_dict[letter] += 1 - else: - freq_dict[letter] = 1 - for letter in freq_dict: - freq_dict[letter] = round(freq_dict[letter] / len(seq) * 100, 2) - return freq_dict - - -def find_motifs(seq: str, motif: str): - """ - Finds a motif of interest in a protein sequence - Arguments: - -seq (str) input protein sequence - -motif (str) motif to be found in sequence - Return: - -position(s) of the motif in seq - """ - positions = [] - for i in range(len(seq) - len(motif) + 1): - window = seq[i:i+len(motif)] - if window == motif: - positions.append(i) - return positions - - -def check_protein_seq(seq: str) -> str: - """ - Checks whether a sequence is written using 1-letter amino acid code - Arguments: - -seq (str) input protein sequence - Return: - - str, 'single_letter_prot_seq' otherwise 'Invalid Input' error is raised - """ - unique_chars = set(seq) - single_letter = set('GALMFWKQESPVICYHRNDTgalmfwkqespvicyhrndt') - - if unique_chars <= single_letter: - seq = 'single_letter_prot_seq' - - else: - raise ValueError("Invalid Input") - return seq - - -def molecular_weight(seq: str) -> int: - """ - Calculates molecular weight of a protein - Arguments: - - seq 
(str) 1-letter coded protein sequence - Return: - - int, molecular weight (g/mol) rounded to integer - """ - list_input_seq = list(seq) - water_mw = 18 - for aa in list_input_seq: - total_mw = sum(aa_weight_dict[a] for a in list_input_seq) - mw_water_removed = (total_mw - (water_mw * (len(list_input_seq)-1))) - return mw_water_removed - - -def one_to_three_letter(seq: str) -> str: - """ - Converts a 1-letter amino acid code sequence into a 3-letter sequence - Arguments: - - seq (str) sequence to convert, must be 1-letter coded protein sequence - Return: - - str, a 3-letter coded protein sequence without spaces - """ - three_letter_aa = '' - for aa in seq: - three_letter_aa_seq += aa_code_dict[aa] - return three_letter_aa_seq - - -def run_protein_tool(*args: str, function: str, motif: Optional[str]=None): - """ - This is the main function - Arguments: - -seq(str) protein sequence(s) - -function(str) specify the function - -motif(str), optional argument for find_motifs function - Return: - -result of the specified function - """ - results = [] - for seq in args: - if check_protein_seq(seq) == 'single_letter_prot_seq': - if function == 'check_protein_seq': - for seq in args: - results.append(check_protein_seq(seq)) - elif function == 'molecular_weight': - for seq in args: - results.append(molecular_weight(seq)) - elif function == 'one_to_three_letter': - for seq in args: - results.append(one_to_three_letter(seq)) - elif function == 'amino_acid_frequency': - for seq in args: - results.append(amino_acid_frequency(seq)) - elif function == 'find_motifs': - for seq in args: - results.append(find_motifs(seq, motif)) - if len(results) == 1: - results = results[0] - return results - From 3b1af5cd09f72db92f6c4fc6ad1302a5c3373bf9 Mon Sep 17 00:00:00 2001 From: Elena Smertina <104040609+sme229@users.noreply.github.com> Date: Fri, 23 Feb 2024 18:01:04 +1100 Subject: [PATCH 09/24] Delete miscellaneous.py --- miscellaneous.py | 71 ------------------------------------------------ 1 
file changed, 71 deletions(-) delete mode 100644 miscellaneous.py diff --git a/miscellaneous.py b/miscellaneous.py deleted file mode 100644 index 540510b..0000000 --- a/miscellaneous.py +++ /dev/null @@ -1,71 +0,0 @@ -def fastq_filter(seqs: dict, gc_bounds: int, length_bounds: int, quality_threshold: int) -> dict: - """ - Filters fastq sequence by gc content, length and quality score - Arguments: dict with fastq sequences, filtering parameters - Returns filtered dictionary - """ - result = dict() - gc_filtered = gc_filter(seqs, gc_bounds) - length_filtered = length_filter(seqs, length_bounds) - quality_filtered = quality_filter(seqs, quality_threshold) - intersection = gc_filtered.keys() & length_filtered.keys() & quality_filtered.keys() - - for keys, (sequence, quality) in seqs.items(): - if keys in intersection: - result[keys] = (sequence, quality) - return result - - -def run_dna_rna_tools(*args: str, function: str) -> str: - """ - This function combines the functions above - Arguments: *args are input sequences, function is a function of choice - Returns: str, processed seqeunces depending on the function chosen - """ - results = [] - for seq in args: - check_nucleic_acid(seq) - if function == 'transcribe': - results.append(transcribe(seq)) - if function == 'complement': - results.append(complement(seq)) - if function == 'reverse': - results.append(reverse(seq)) - if function == 'reverse_complement': - results.append(reverse_complement(seq)) - if len(results) == 1: - results = results[0] - return results - - -def run_protein_tool(*args: str, function: str, motif: Optional[str]=None): - """ - This is the main function - Arguments: - -seq(str) protein sequence(s) - -function(str) specify the function - -motif(str), optional argument for find_motifs function - Return: - -result of the specified function - """ - results = [] - for seq in args: - if check_protein_seq(seq) == 'single_letter_prot_seq': - if function == 'check_protein_seq': - for seq in args: - 
results.append(check_protein_seq(seq)) - elif function == 'molecular_weight': - for seq in args: - results.append(molecular_weight(seq)) - elif function == 'one_to_three_letter': - for seq in args: - results.append(one_to_three_letter(seq)) - elif function == 'amino_acid_frequency': - for seq in args: - results.append(amino_acid_frequency(seq)) - elif function == 'find_motifs': - for seq in args: - results.append(find_motifs(seq, motif)) - if len(results) == 1: - results = results[0] - return results From 92774b21a6c8391db186956873325dff12bda676 Mon Sep 17 00:00:00 2001 From: Elena Smertina <104040609+sme229@users.noreply.github.com> Date: Fri, 23 Feb 2024 18:03:40 +1100 Subject: [PATCH 10/24] Delete updated_HW5.py --- updated_HW5.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 updated_HW5.py diff --git a/updated_HW5.py b/updated_HW5.py deleted file mode 100644 index e69de29..0000000 From ca3388da69d2efba7cb4af90d4faf80ea94ab6b7 Mon Sep 17 00:00:00 2001 From: lsmertina Date: Fri, 23 Feb 2024 18:17:06 +1100 Subject: [PATCH 11/24] add requirements txt --- requirements.txt | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..b96f261 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +bioframe==0.5.1 +biopython==1.81 \ No newline at end of file From 0fca5dcd8ae1628ca1eded469d4e513a181f92c4 Mon Sep 17 00:00:00 2001 From: Elena Smertina <104040609+sme229@users.noreply.github.com> Date: Fri, 23 Feb 2024 18:20:03 +1100 Subject: [PATCH 12/24] Create requirements.txt --- requirements.txt | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..10211e5 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +bioframe==0.5.1 +biopython==1.81 From 8a853c6c6457d5584928592d5e9c08f02ee4bb54 Mon Sep 17 00:00:00 2001 From: lsmertina Date: Fri, 23 Feb 
2024 20:48:04 +1100 Subject: [PATCH 13/24] delete files --- biopython_gc_filter.ipynb | 77 ------ example_fastq.fastq | 356 ---------------------------- example_fastq.fastq:Zone.Identifier | 3 - requirements.txt | 2 - 4 files changed, 438 deletions(-) delete mode 100644 biopython_gc_filter.ipynb delete mode 100644 example_fastq.fastq delete mode 100644 example_fastq.fastq:Zone.Identifier delete mode 100644 requirements.txt diff --git a/biopython_gc_filter.ipynb b/biopython_gc_filter.ipynb deleted file mode 100644 index a3da0d8..0000000 --- a/biopython_gc_filter.ipynb +++ /dev/null @@ -1,77 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 7, - "id": "21c5a784", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "9\n" - ] - } - ], - "source": [ - "from Bio import SeqIO\n", - "from Bio.SeqUtils import GC\n", - "\n", - "def filter_fastq(input_path: str, quality_threshold: int, output_filename=\"final_filtered.fastq\", gc_bounds=(40, 60), length_bounds=(50, 350)):\n", - " filename = input_path\n", - " records = SeqIO.parse(filename, \"fastq\")\n", - " ###quality filter\n", - " good_reads = (rec for rec in records if min(rec.letter_annotations[\"phred_quality\"]) >= quality_threshold)\n", - " result_quality = SeqIO.write(good_reads, \"good_quality.fastq\", \"fastq\")\n", - " result_quality_GC = SeqIO.parse(\"good_quality.fastq\", \"fastq\")\n", - " \n", - " ###GC content filter\n", - " min_gc_content = gc_bounds[0]\n", - " max_gc_content = gc_bounds[1]\n", - " GC_quality_filt = []\n", - " \n", - " for sequence in result_quality_GC:\n", - " if min_gc_content <= GC(sequence.seq) <= max_gc_content:\n", - " GC_quality_filt.append(sequence)\n", - " \n", - " result_quality = SeqIO.write(GC_quality_filt, \"good_quality_GC.fastq\", \"fastq\")\n", - " result_quality_GC_length = SeqIO.parse(\"good_quality_GC.fastq\", \"fastq\")\n", - " \n", - " ##length filter\n", - " filtered_GC_quality_length = []\n", - " \n", 
- " for sequence in result_quality_GC_length:\n", - " if len(sequence.seq) >= length_bounds[0] and len(sequence.seq) <= length_bounds[1]:\n", - " filtered_GC_quality_length.append(sequence)\n", - " \n", - " result_quality = SeqIO.write(filtered_GC_quality_length, output_filename, \"fastq\")\n", - " \n", - " print(result_quality)\n", - "\n", - "#filter_fastq(\"example_fastq.fastq\", 15)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.12" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/example_fastq.fastq b/example_fastq.fastq deleted file mode 100644 index 883b51f..0000000 --- a/example_fastq.fastq +++ /dev/null @@ -1,356 +0,0 @@ -@SRX079804:1:SRR292678:1:1101:21885:21885 1:N:0:1 BH:ok -ACAGCAACATAAACATGATGGGATGGCGTAAGCCCCCGAGATATCAGTTTACCCAGGATAAGAGATTAAATTATGAGCAACATTATTAA -+SRX079804:1:SRR292678:1:1101:21885:21885 1:N:0:1 BH:ok -FGGGFGGGFGGGFGDFGCEBB@CCDFDDFFFFBFFGFGEFDFFFF;D@DD>C@DDGGGDFGDGG?GFGFEGFGGEF@FDGGGFGFBGGD -@SRX079804:1:SRR292678:1:1101:24563:24563 1:N:0:1 BH:failed -ATTAGCGAGGAGGAGTGCTGAGAAGATGTCGCCTACGCCGTTGAAATTCCCTTCAATCAGGGGGTACTGGAGGATACGAGTTTGTGTG -+SRX079804:1:SRR292678:1:1101:24563:24563 1:N:0:1 BH:failed -BFFFFFFFB@B@A<@D>BDDACDDDEBEDEFFFBFFFEFFDFFF=CC@DDFD8FFFFFFF8/+.2,@7<<:?B/:<><-><@.A*C>D -@SRX079804:1:SRR292678:1:1101:30161:30161 1:N:0:1 BH:failed -GAACGACAGCAGCTCCTGCATAACCGCGTCCTTCTTCTTTAGCGTTGTGCAAAGCATGTTTTGTATTACGGGCATCTCGAGCGAATC -+SRX079804:1:SRR292678:1:1101:30161:30161 1:N:0:1 BH:failed -DFFFEGDGGGGFGGEDCCDCEFFFFCCCCCB>CEBFGFBGGG?DE=:6@=>AD?D8DCEE:>EEABE5D@5:DDCA;EEE-DCD -@SRX079804:1:SRR292678:1:1101:47176:47176 1:N:0:1 BH:failed 
-TGAAGCGTCGATAGAAGTTAGCAAACCCGCGGAACTTCCGTACATCAGACACATTCCGGGGGGTGGGCCAATCCATGATGCCTTTG -+SRX079804:1:SRR292678:1:1101:47176:47176 1:N:0:1 BH:failed -FF@FFBEEEEFFEFFD@EDEFFB=DFEEFFFE8FFE8EEDBFDFEEBE+E46.'8.5::EE:?E>A6@?)>;>9D<C9DEBAAB=5C?<@0=A?D@BDB;:BA?BDDFH?B@DCB6BEBDA??AA9. -@SRX079804:1:SRR292678:1:1101:52180:52180 1:N:0:1 BH:failed -TTTTTGGTTTTAGGTCTAACATGTAAGTCTTTAATCTATTTTGAATTAATTTTTGGATGAGGTGTAAGGAAGGGATCCAGTTTC -+SRX079804:1:SRR292678:1:1101:52180:52180 1:N:0:1 BH:failed -FEFFFFFF=FEEFDFD>EE:?<5@BFAFCFEE7>C>:/6:2<344DA:6DDCDC>)34773DD?@DBA9B -@SRX079804:1:SRR292678:1:1101:105156:105156 1:N:0:1 BH:ok -ACTGCTGAGCTTAAATGGCGGCAGTCTGACGGTTACCAACGGGGGCACTTCAACCGGTTCGTTAACGGGGAGCGGAGAGCTGA -+SRX079804:1:SRR292678:1:1101:105156:105156 1:N:0:1 BH:ok -GFFEGGFGGGGEGGGGGGGGGFDD=DDE7EDD6CD?FEDEE@EBEFEE.DD5DDD@B<7>/0543C?BEE?@@BE@; -@SRX079804:1:SRR292678:1:1101:135168:135168 1:N:0:1 BH:ok -CCTTCCTAAAAATTAAGAATCTTAACAATTAGCAGCACAACCAAAATTATTACCGAAAGGACTTACTCCTCCGCCAAATCCA -+SRX079804:1:SRR292678:1:1101:135168:135168 1:N:0:1 BH:ok -GGGEGGGEGGBFFFFFGCFFEGG6AEEEDEEGGFGGCG4EGFFGDFFBGEGFGGFECBDEGGEFGBFF?CGFBFGGGGFGFF -@SRX079804:1:SRR292678:1:1101:149302:149302 1:N:0:1 BH:changed:1 -TAGGGTTGTATTTGCAGATCCATGGCATGCCAAAAAGAACATCGTCCCGTCCAATATCTGCAACATACCAGTTGGTTGGTA -+SRX079804:1:SRR292678:1:1101:149302:149302 1:N:0:1 BH:changed:1 -@;CBA=:@;@DBDCDEEE/EEEEEEF@>FBEEB=EFA>EEBD=DAEEEEB9)99>B99BC)@,@<9CDD=C,5;B::?@;A -@SRX079804:1:SRR292678:1:1101:170868:170868 2:N:0:1 BH:failed -CTGCCGAGACTGTTCTCAGACATGGAAAGCTCGATTCGCATACACTCGCTGAGTAAGAGAGTCACACCAAATCACAGATT -+SRX079804:1:SRR292678:1:1101:170868:170868 2:N:0:1 BH:failed -E;FFFEGFGIGGFBG;C6D<@C7CDGFEFGFHDFEHHHBBHHFDFEFBAEEEEDE@A2=DA:??C3:@>EEBEEHEFEHHFFHH?FGBGFBBD77B;;C?FFFFGGFED.BBABBG@DBBE -@SRX079804:1:SRR292678:1:1101:190845:190845 1:N:0:1 BH:changed:1 -CCTCAGCGTGGATTGCCGCTCATGCAGGAGCAGATAATCCCTTCGCCATCCCATTAAGCGCCGTTGTCGGTATTCC -+SRX079804:1:SRR292678:1:1101:190845:190845 1:N:0:1 BH:changed:1 
-FF@FFCFEECEBEC@@BBBBDFBBFFDFFEFFEB8FFFFFFFFEFCEB/>BBA@AFFFEEEEECE;ACD@DBBEEE -@SRX079804:1:SRR292678:1:1101:198993:198993 2:N:0:1 BH:failed -AGTTATTTATGCATCATTCTCATGTATGAGCCAACAAGATAGTACAAGTTTTATTGCTATGAGTTCAGTACAACA -+SRX079804:1:SRR292678:1:1101:198993:198993 2:N:0:1 BH:failed -<<<=;@B??@<>@><48876EADEG6B.BB@.?+98204<:<>@?A=@EFEFFFEEFB -@SRX079804:1:SRR292678:1:1101:204480:204480 1:N:0:1 BH:failed -AGTGAGACACCCCTGAACATTCCTAGTAAGACATCTTTGAATATTACTAGTTAGCCACACTTTAAAATGACCCG -+SRX079804:1:SRR292678:1:1101:204480:204480 1:N:0:1 BH:failed -<98;<@@@:@CD@BCCDD=DBBCEBBAAA@9???@BCDBCGF=GEGDFGDBEEEEEFFFF=EDEE=DCD@@BBC -@SRX079804:1:SRR292678:1:1101:212327:212327 2:N:0:1 BH:failed -TTACCTCTGCTTTTTCGCCTGTTACTTCTACTAATCCTTCATCTATTGCGAATGGCCCTACTACTGACGAAAT -+SRX079804:1:SRR292678:1:1101:212327:212327 2:N:0:1 BH:failed -DBCCC@@;A@BDCCCE>BBED>GDCDBFBFFEBEECFGGD@@BCB<<8@;09746:@@>@EEECEEDE/FEED -@SRX079804:1:SRR292678:1:1101:230386:230386 1:N:0:1 BH:failed -TTCTGATTGGAGTGAGAGTGCCATTTGTTTCGCTGATTGGACGTTGGAAAGCGCCTTGACCTTTGACAGCAG -+SRX079804:1:SRR292678:1:1101:230386:230386 1:N:0:1 BH:failed -=BDD=DCDBBCDADD@@B;B@CC7C@B@>=BACDD,=??8DDCDD7CCCCDBDEDBDDDDDEEBDDDBD?DB -@SRX079804:1:SRR292678:1:1101:236093:236093 1:N:0:1 BH:failed -CATCTAACTCTCATTCTAGATTCTTAAGTTGGCTACACTTTGCCGTCATTCTCGGTGGATTAGCTATTGGG -+SRX079804:1:SRR292678:1:1101:236093:236093 1:N:0:1 BH:failed -A:@A@;@BB@GGFGG@A@@817729B -@SRX079804:1:SRR292678:1:1101:251912:251912 2:N:0:1 BH:ok -TGTAAATGGAAATGAACCTAATATGTATGCACAAACTATTAAAGCATATCTTGCAAAAGGAGCAATGGCG -+SRX079804:1:SRR292678:1:1101:251912:251912 2:N:0:1 BH:ok -GHFDHHHGHHHFFHHGHHHGGGGGGDEFFEGGGGEFHHFHGFGHHHHFHHDDD6@=DCACFFGGEFBBG -@SRX079804:1:SRR292678:1:1101:278698:278698 1:N:0:1 BH:ok -CTAATAATGGTAATTGAACCATAGAAGATAAGTTCATAATGTAATAAATACATCCATAGAGTTATTAA -+SRX079804:1:SRR292678:1:1101:278698:278698 1:N:0:1 BH:ok -DDBDBCCCDD@FFFB9<<<@DA=DA@B:@=@@AC@GGFCGECFFDGGCGFFGGFFCEBF9>?@>BDFF -@SRX079804:1:SRR292678:1:1101:295878:295878 1:N:0:1 BH:changed:1 
-AATGCAAACAGGATGATATTTGAATCCGTAATACTGTTCTTTCATCATAAATAATTTATGCAGATAC -+SRX079804:1:SRR292678:1:1101:295878:295878 1:N:0:1 BH:changed:1 -HHHHHEGFHDGFEGBCBEEEGGGG@EDGCGBBBEGF4?EFDBDDBFE8DEE-E?EE;B@EFC=;FDE -@SRX079804:1:SRR292678:1:1101:306575:306575 1:N:0:1 BH:failed -ATCTTTGAACTTTGTCTCACTTTCCCCCATCTCACCATTCCTCCTGTTCTGTGAACCCCAGTTTCA -+SRX079804:1:SRR292678:1:1101:306575:306575 1:N:0:1 BH:failed -E::EA@E<6B8>97:<6084649?@:?EDED=BEBEGGFEDGECECBDEFDDEEGGEDDFFF -@SRX079804:1:SRR292678:1:1101:403661:403661 1:N:0:1 BH:failed -GATGGCTTTGCTTTCTCATTCTCCTCTCCATCGTTCCCATCTTCGCCCTCAGACGCTGATTGAT -+SRX079804:1:SRR292678:1:1101:403661:403661 1:N:0:1 BH:failed -EG=DFFDFFDDGDGGGGGFFGGGGBADADCEEE5EC>CCCE6BEEEGGGGCBEGGE@9BCEF;>>D@D -@SRX079804:1:SRR292678:1:1101:425870:425870 1:N:0:1 BH:failed -GTACAGCTCTCCTCGTTACCAGCACATCTTGGACACCCGACGAAGACTTTGACATGCTCCTC -+SRX079804:1:SRR292678:1:1101:425870:425870 1:N:0:1 BH:failed -E@EED@EF=D>=EED@D@7DBF -@SRX079804:1:SRR292678:1:1101:429745:429745 1:N:0:1 BH:failed -GGCGATTGTGAAGGCATAAGAGTGGGACATAGTTCAAGTCCAGAACGAATTAAACGCACAA -+SRX079804:1:SRR292678:1:1101:429745:429745 1:N:0:1 BH:failed -BB2B=A@A:BFBFFFFF -@SRX079804:1:SRR292678:1:1101:475293:475293 2:N:0:1 BH:ok -TACATTTGATTTCTTTATAAGATTTCTTACTGTAAAATCATCGCTATTTAACAGCTTATT -+SRX079804:1:SRR292678:1:1101:475293:475293 2:N:0:1 BH:ok -FHFFFFDC@FGFEDGE?EEDC6EEEDEF?EEEE8EHHGGHFGFFEGGGGBFBDBDEBCBCA -@SRX079804:1:SRR292678:1:1101:511594:511594 1:N:0:1 BH:ok -AGCACAATCAGATTCGCTTATGACGGCGATGAAGAAATTGCGATGAAATGTGAGGTGA -+SRX079804:1:SRR292678:1:1101:511594:511594 1:N:0:1 BH:ok -HCHFFHHHGFHBFFFEGFFEFHFEHGBGECHEHB?CDDEFCDBFF9DFCD.BC -@SRX079804:1:SRR292678:1:1101:527839:527839 2:N:0:1 BH:changed:4 -GACATTTCTTTCTGGGAAGGCTTATTAAACGATAAAGATAATGATATACGTTTTGAT -+SRX079804:1:SRR292678:1:1101:527839:527839 2:N:0:1 BH:changed:4 ->35/-;,><04%'A.?4?:>BE.DC-@???CE:@EDFDBDG7B;=<)?1.@?2A5<5 -@SRX079804:1:SRR292678:1:1101:547309:547309 1:N:0:1 BH:failed -AGGTCTGCACTGGCTCTTCAGAGCGCAAGCGAGGGAAGAGAAATATAGTGACGCAC 
-+SRX079804:1:SRR292678:1:1101:547309:547309 1:N:0:1 BH:failed -GEGFGFD=FDB8B7DDFFFF@/DC@+2:66>-@>9);<2: -@SRX079804:1:SRR292678:1:1101:570367:570367 1:N:0:1 BH:failed -CTATGGCCACTTTTGTATTCTCGATTGAGGTTATTCGCTCACCCATCTTTTCCAA -+SRX079804:1:SRR292678:1:1101:570367:570367 1:N:0:1 BH:failed -FFEE;FFFFBFFFFF<9BEDEBEACDD3DD0B5>>0?:@>FFBEEBEFFFFGCCBCDGIGDGHEFGG=GGGGHFHEHF@FEFE?CGEEEFADEAEC -@SRX079804:1:SRR292678:1:1101:590521:590521 1:N:0:1 BH:failed -CCTCTTCTCTTTCGCGGTCATCTTGGGTTTCGCGCCTTTCTTCTTGACGACAC -+SRX079804:1:SRR292678:1:1101:590521:590521 1:N:0:1 BH:failed -EGFDD@FDF=FDFFEEGGBEFDGF=FGAGEEEEEDFBGGEE8EEE@>5;>90< -@SRX079804:1:SRR292678:1:1101:601307:601307 1:N:0:1 BH:changed:1 -TTGGCGTGCTGATGATTATCGGTATCTTCAAAGGCGCGCAGCCTGCGGGCTG -+SRX079804:1:SRR292678:1:1101:601307:601307 1:N:0:1 BH:changed:1 -GGGEGFGGEGE:EE>GFFGGGGDCGEBFFF>G=EBFFEC?DFGAD?DDECBE -@SRX079804:1:SRR292678:1:1101:631057:631057 1:N:0:1 BH:failed -CACCATTATCCTATTTCTGAACACATTTGACAGTCACGGCACTAGCATTGG -+SRX079804:1:SRR292678:1:1101:631057:631057 1:N:0:1 BH:failed -GGGGGGGGBFGGGEGFGGGGGFGEFFDFFFF?EBFEEDBFGEE@BE;E?E7 -@SRX079804:1:SRR292678:1:1101:654270:654270 1:N:0:1 BH:failed -ATCTTTCTCTCTCTGCTGCATTCTCCGCTTCAGCTCCTCAATTTCAATCA -+SRX079804:1:SRR292678:1:1101:654270:654270 1:N:0:1 BH:failed -GGGGGGEDGGFEGDGCGGBCEFGGGGFBFGGEGGGGGEGFCFEEGGFEDB -@SRX079804:1:SRR292678:1:1101:667761:667761 1:N:0:1 BH:failed -CAGCCTTTTGAGGTCGTCTATCGCAGCGTGTCCGCGACGTTTTGTTGCG -+SRX079804:1:SRR292678:1:1101:667761:667761 1:N:0:1 BH:failed -GGFGGG=GGGG@GFGGG@GGEGFGGGEGGGFGGEG@EEEDE8EE=E=DE -@SRX079804:1:SRR292678:1:1101:671526:671526 1:N:0:1 BH:ok -CAAGTAGAAGGTTAGCGCCTCTCTGTAAAAGGAGTCAAGCGCTATGTC -+SRX079804:1:SRR292678:1:1101:671526:671526 1:N:0:1 BH:ok -HHHHHGHEHHDGGGCGGGGHFGGGGEDGGGGGBEBEBEEEFGHFHFHH -@SRX079804:1:SRR292678:1:1101:685633:685633 2:N:0:1 BH:failed -GCTGTAATGGATCCACTAATTGGGACAGTGGTGGATAAAACGAATAC -+SRX079804:1:SRR292678:1:1101:685633:685633 2:N:0:1 BH:failed -DADADCGFEFEEEGEGGEFDEEEEBDAC;C10<9?AGBGGGFF7DFB 
-@SRX079804:1:SRR292678:1:1101:769626:769626 1:N:0:1 BH:failed -GTATTCCTTGACAGTCGAAAGAATCACTGCTAACCCAGGC -+SRX079804:1:SRR292678:1:1101:769626:769626 1:N:0:1 BH:failed -C>5BBCCCD=ACDD@A7@@B@A?B?=8B??EDEEDEBDEE -@SRX079804:1:SRR292678:1:1101:776222:776222 1:N:0:1 BH:failed -AGAACGTACACCCTACGCTAAGCAGTGGCTCCATGCCAA -+SRX079804:1:SRR292678:1:1101:776222:776222 1:N:0:1 BH:failed -BGDGGGGFEGGGGEGGGGFGFF6=FCFAFFEE,=C?EDD -@SRX079804:1:SRR292678:1:1101:782183:782183 2:N:0:1 BH:failed -TACGCGTAACGACGTCATAGCCATGACGCTTCAATAAA -+SRX079804:1:SRR292678:1:1101:782183:782183 2:N:0:1 BH:failed -D -@SRX079804:1:SRR292678:1:1101:828383:828383 1:N:0:1 BH:failed -AAACCACATATGACATGAGTGACGGGACTAAAGTTC -+SRX079804:1:SRR292678:1:1101:828383:828383 1:N:0:1 BH:failed -FFEEFEEB=E,C>CDEEEECEBEEEC?F;BDDDDAE -@SRX079804:1:SRR292678:1:1101:829239:829239 1:N:0:1 BH:ok -TCGATCCTTCTGCCTCAAAGTATACTAGGACGCAT -+SRX079804:1:SRR292678:1:1101:829239:829239 1:N:0:1 BH:ok -GGGDFGGBGFFEBFEDCBCDCGGGGBEEE=GE?EE -@SRX079804:1:SRR292678:1:1101:868419:868419 1:N:0:1 BH:ok -ATTCGTCAGGCCCAATAACATCATGAATTTCCAG -+SRX079804:1:SRR292678:1:1101:868419:868419 1:N:0:1 BH:ok -DEEEEEEEBDFFFFFFFF8FEED8@FFFBFFEFF -@SRX079804:1:SRR292678:1:1101:892716:892716 2:N:0:1 BH:ok -AATTTACCTAATGGAATCAATGAGGCTACTCCA -+SRX079804:1:SRR292678:1:1101:892716:892716 2:N:0:1 BH:ok -@BCBBBCCBCCCCCCCDDAAFFGEEEDBF@EE@ -@SRX079804:1:SRR292678:1:1101:893159:893159 2:N:0:1 BH:ok -TTCTCTGCTTTTCATATCTTGTCATAAAAATT -+SRX079804:1:SRR292678:1:1101:893159:893159 2:N:0:1 BH:ok -CBDCDEEEEEBEEEEGDDFDEEEEDGFFEGFE -@SRX079804:1:SRR292678:1:1101:918742:918742 1:N:0:1 BH:failed -CTCTCCATGCACAAAGAATATCACAGCCAAA -+SRX079804:1:SRR292678:1:1101:918742:918742 1:N:0:1 BH:failed -EEEBA?@;B@EEE@BEE=?EDDDDADCDA?E -@SRX079804:1:SRR292678:1:1101:923787:923787 2:N:0:1 BH:ok -TTGTGAAGGATGGGATATTAGTGTAGATGA -+SRX079804:1:SRR292678:1:1101:923787:923787 2:N:0:1 BH:ok -EEBBEGEEE=BBB<@DCDCGD@D>=DEGEE -@SRX079804:1:SRR292678:1:1101:933189:933189 1:N:0:1 BH:failed 
-GTCTGCACTATCGAGGGCTGTGCCTTTGC -+SRX079804:1:SRR292678:1:1101:933189:933189 1:N:0:1 BH:failed -FEFFDBFF8FE>?DFFFCEBCEEBBEDE6 -@SRX079804:1:SRR292678:1:1101:937136:937136 1:N:0:1 BH:failed -TTTCTTTGGCTTAAAGATAGTTTTAGTC -+SRX079804:1:SRR292678:1:1101:937136:937136 1:N:0:1 BH:failed -EFFFEEEEFCBCDDDDE@/E?@@7@@3< -@SRX079804:1:SRR292678:1:1101:940351:940351 1:N:0:1 BH:changed:1 -TGCCGTGGGAATGACAAACAAGCATCC -+SRX079804:1:SRR292678:1:1101:940351:940351 1:N:0:1 BH:changed:1 -DECC@GFFBF=EBEAFDFGD?FFF8FF -@SRX079804:1:SRR292678:1:1101:940693:940693 1:N:0:1 BH:failed -CACATTATGAACTATGGGCACTGCAT -+SRX079804:1:SRR292678:1:1101:940693:940693 1:N:0:1 BH:failed -EEEGFDEDFEGGGGGFEGBGGGFGGG -@SRX079804:1:SRR292678:1:1101:955819:955819 1:N:0:1 BH:failed -CACCTAGCAGCAACGGACGAGTCAG -+SRX079804:1:SRR292678:1:1101:955819:955819 1:N:0:1 BH:failed -GGGGGEEEGGEGGGFGEGG;F@EFF -@SRX079804:1:SRR292678:1:1101:958051:958051 2:N:0:1 BH:ok -TTAATATTTCCATCTGAACTTCGC -+SRX079804:1:SRR292678:1:1101:958051:958051 2:N:0:1 BH:ok -EDDBGFEGFGHHFHGGEDEGBGDB -@SRX079804:1:SRR292678:1:1101:996098:996098 1:N:0:1 BH:failed -CTAAGAGAGTTTGTAATGCGGAC -+SRX079804:1:SRR292678:1:1101:996098:996098 1:N:0:1 BH:failed -DD=DBDBDC4EFFFD@?CD@ACD -@SRX079804:1:SRR292678:1:1101:1020278:1020278 2:N:0:1 BH:ok -AAAGTGCAGAACATGCAGATAT -+SRX079804:1:SRR292678:1:1101:1020278:1020278 2:N:0:1 BH:ok -D>AC?GDDCD?DDADE@GABDG -@SRX079804:1:SRR292678:1:1101:1022234:1022234 1:N:0:1 BH:ok -GACCTTTCCGCAAGCTGTCGC -+SRX079804:1:SRR292678:1:1101:1022234:1022234 1:N:0:1 BH:ok -HHHHFEHHHHGGHHHGDHEEG -@SRX079804:1:SRR292678:1:1101:1024144:1024144 1:N:0:1 BH:failed -TCCATTATGAAAGAAGAAAA -+SRX079804:1:SRR292678:1:1101:1024144:1024144 1:N:0:1 BH:failed -@A><96:6: -@SRX079804:1:SRR292678:1:1101:1175112:1175112 1:N:0:1 BH:failed -AGGCC -+SRX079804:1:SRR292678:1:1101:1175112:1175112 1:N:0:1 BH:failed -EC8EE -@SRX079804:1:SRR292678:1:1101:1182927:1182927 1:N:0:1 BH:changed:1 -TGAA -+SRX079804:1:SRR292678:1:1101:1182927:1182927 1:N:0:1 BH:changed:1 
-GGBH -@SRX079804:1:SRR292678:1:1101:1243474:1243474 1:N:0:1 BH:ok -ACG -+SRX079804:1:SRR292678:1:1101:1243474:1243474 1:N:0:1 BH:ok -EEE -@SRX079804:1:SRR292678:1:1101:1266246:1266246 1:N:0:1 BH:failed -AA -+SRX079804:1:SRR292678:1:1101:1266246:1266246 1:N:0:1 BH:failed -C@ -@SRX079804:1:SRR292678:1:1101:1269735:1269735 1:N:0:1 BH:failed -C -+SRX079804:1:SRR292678:1:1101:1269735:1269735 1:N:0:1 BH:failed -G diff --git a/example_fastq.fastq:Zone.Identifier b/example_fastq.fastq:Zone.Identifier deleted file mode 100644 index 1bf0b28..0000000 --- a/example_fastq.fastq:Zone.Identifier +++ /dev/null @@ -1,3 +0,0 @@ -[ZoneTransfer] -ZoneId=3 -HostUrl=https://github.com/ diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index b96f261..0000000 --- a/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -bioframe==0.5.1 -biopython==1.81 \ No newline at end of file From 8886a582375cffe0a7b0d01837f778266e44f04b Mon Sep 17 00:00:00 2001 From: lsmertina Date: Fri, 23 Feb 2024 20:54:23 +1100 Subject: [PATCH 14/24] add biopython fastq filter script --- biopython_fastq_filter.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 biopython_fastq_filter.py diff --git a/biopython_fastq_filter.py b/biopython_fastq_filter.py new file mode 100644 index 0000000..b3d4927 --- /dev/null +++ b/biopython_fastq_filter.py @@ -0,0 +1,34 @@ +from Bio import SeqIO +from Bio.SeqUtils import GC + +def filter_fastq(input_path: str, quality_threshold: int, output_filename="final_filtered.fastq", gc_bounds=(40, 60), length_bounds=(50, 350)): + filename = input_path + records = SeqIO.parse(filename, "fastq") + ###quality filter + good_reads = (rec for rec in records if min(rec.letter_annotations["phred_quality"]) >= quality_threshold) + result_quality = SeqIO.write(good_reads, "good_quality.fastq", "fastq") + result_quality_GC = SeqIO.parse("good_quality.fastq", "fastq") + ###GC content filter + min_gc_content = gc_bounds[0] + max_gc_content = 
gc_bounds[1] + GC_quality_filt = [] + + for sequence in result_quality_GC: + if min_gc_content <= GC(sequence.seq) <= max_gc_content: + GC_quality_filt.append(sequence) + + result_quality = SeqIO.write(GC_quality_filt, "good_quality_GC.fastq", "fastq") + result_quality_GC_length = SeqIO.parse("good_quality_GC.fastq", "fastq") + + ##length filter + filtered_GC_quality_length = [] + + for sequence in result_quality_GC_length: + if len(sequence.seq) >= length_bounds[0] and len(sequence.seq) <= length_bounds[1]: + filtered_GC_quality_length.append(sequence) + + result_quality = SeqIO.write(filtered_GC_quality_length, output_filename, "fastq") + + print(result_quality) + +#filter_fastq("example_fastq.fastq", 15) From c08ab84fdf520f700245e61d9659446152f6b166 Mon Sep 17 00:00:00 2001 From: lsmertina Date: Fri, 23 Feb 2024 20:55:38 +1100 Subject: [PATCH 15/24] add an example fastq file --- example_fastq.fastq | 356 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 356 insertions(+) create mode 100644 example_fastq.fastq diff --git a/example_fastq.fastq b/example_fastq.fastq new file mode 100644 index 0000000..883b51f --- /dev/null +++ b/example_fastq.fastq @@ -0,0 +1,356 @@ +@SRX079804:1:SRR292678:1:1101:21885:21885 1:N:0:1 BH:ok +ACAGCAACATAAACATGATGGGATGGCGTAAGCCCCCGAGATATCAGTTTACCCAGGATAAGAGATTAAATTATGAGCAACATTATTAA ++SRX079804:1:SRR292678:1:1101:21885:21885 1:N:0:1 BH:ok +FGGGFGGGFGGGFGDFGCEBB@CCDFDDFFFFBFFGFGEFDFFFF;D@DD>C@DDGGGDFGDGG?GFGFEGFGGEF@FDGGGFGFBGGD +@SRX079804:1:SRR292678:1:1101:24563:24563 1:N:0:1 BH:failed +ATTAGCGAGGAGGAGTGCTGAGAAGATGTCGCCTACGCCGTTGAAATTCCCTTCAATCAGGGGGTACTGGAGGATACGAGTTTGTGTG ++SRX079804:1:SRR292678:1:1101:24563:24563 1:N:0:1 BH:failed +BFFFFFFFB@B@A<@D>BDDACDDDEBEDEFFFBFFFEFFDFFF=CC@DDFD8FFFFFFF8/+.2,@7<<:?B/:<><-><@.A*C>D +@SRX079804:1:SRR292678:1:1101:30161:30161 1:N:0:1 BH:failed +GAACGACAGCAGCTCCTGCATAACCGCGTCCTTCTTCTTTAGCGTTGTGCAAAGCATGTTTTGTATTACGGGCATCTCGAGCGAATC ++SRX079804:1:SRR292678:1:1101:30161:30161 1:N:0:1 
BH:failed +DFFFEGDGGGGFGGEDCCDCEFFFFCCCCCB>CEBFGFBGGG?DE=:6@=>AD?D8DCEE:>EEABE5D@5:DDCA;EEE-DCD +@SRX079804:1:SRR292678:1:1101:47176:47176 1:N:0:1 BH:failed +TGAAGCGTCGATAGAAGTTAGCAAACCCGCGGAACTTCCGTACATCAGACACATTCCGGGGGGTGGGCCAATCCATGATGCCTTTG ++SRX079804:1:SRR292678:1:1101:47176:47176 1:N:0:1 BH:failed +FF@FFBEEEEFFEFFD@EDEFFB=DFEEFFFE8FFE8EEDBFDFEEBE+E46.'8.5::EE:?E>A6@?)>;>9D<C9DEBAAB=5C?<@0=A?D@BDB;:BA?BDDFH?B@DCB6BEBDA??AA9. +@SRX079804:1:SRR292678:1:1101:52180:52180 1:N:0:1 BH:failed +TTTTTGGTTTTAGGTCTAACATGTAAGTCTTTAATCTATTTTGAATTAATTTTTGGATGAGGTGTAAGGAAGGGATCCAGTTTC ++SRX079804:1:SRR292678:1:1101:52180:52180 1:N:0:1 BH:failed +FEFFFFFF=FEEFDFD>EE:?<5@BFAFCFEE7>C>:/6:2<344DA:6DDCDC>)34773DD?@DBA9B +@SRX079804:1:SRR292678:1:1101:105156:105156 1:N:0:1 BH:ok +ACTGCTGAGCTTAAATGGCGGCAGTCTGACGGTTACCAACGGGGGCACTTCAACCGGTTCGTTAACGGGGAGCGGAGAGCTGA ++SRX079804:1:SRR292678:1:1101:105156:105156 1:N:0:1 BH:ok +GFFEGGFGGGGEGGGGGGGGGFDD=DDE7EDD6CD?FEDEE@EBEFEE.DD5DDD@B<7>/0543C?BEE?@@BE@; +@SRX079804:1:SRR292678:1:1101:135168:135168 1:N:0:1 BH:ok +CCTTCCTAAAAATTAAGAATCTTAACAATTAGCAGCACAACCAAAATTATTACCGAAAGGACTTACTCCTCCGCCAAATCCA ++SRX079804:1:SRR292678:1:1101:135168:135168 1:N:0:1 BH:ok +GGGEGGGEGGBFFFFFGCFFEGG6AEEEDEEGGFGGCG4EGFFGDFFBGEGFGGFECBDEGGEFGBFF?CGFBFGGGGFGFF +@SRX079804:1:SRR292678:1:1101:149302:149302 1:N:0:1 BH:changed:1 +TAGGGTTGTATTTGCAGATCCATGGCATGCCAAAAAGAACATCGTCCCGTCCAATATCTGCAACATACCAGTTGGTTGGTA ++SRX079804:1:SRR292678:1:1101:149302:149302 1:N:0:1 BH:changed:1 +@;CBA=:@;@DBDCDEEE/EEEEEEF@>FBEEB=EFA>EEBD=DAEEEEB9)99>B99BC)@,@<9CDD=C,5;B::?@;A +@SRX079804:1:SRR292678:1:1101:170868:170868 2:N:0:1 BH:failed +CTGCCGAGACTGTTCTCAGACATGGAAAGCTCGATTCGCATACACTCGCTGAGTAAGAGAGTCACACCAAATCACAGATT ++SRX079804:1:SRR292678:1:1101:170868:170868 2:N:0:1 BH:failed +E;FFFEGFGIGGFBG;C6D<@C7CDGFEFGFHDFEHHHBBHHFDFEFBAEEEEDE@A2=DA:??C3:@>EEBEEHEFEHHFFHH?FGBGFBBD77B;;C?FFFFGGFED.BBABBG@DBBE +@SRX079804:1:SRR292678:1:1101:190845:190845 1:N:0:1 BH:changed:1 
+CCTCAGCGTGGATTGCCGCTCATGCAGGAGCAGATAATCCCTTCGCCATCCCATTAAGCGCCGTTGTCGGTATTCC ++SRX079804:1:SRR292678:1:1101:190845:190845 1:N:0:1 BH:changed:1 +FF@FFCFEECEBEC@@BBBBDFBBFFDFFEFFEB8FFFFFFFFEFCEB/>BBA@AFFFEEEEECE;ACD@DBBEEE +@SRX079804:1:SRR292678:1:1101:198993:198993 2:N:0:1 BH:failed +AGTTATTTATGCATCATTCTCATGTATGAGCCAACAAGATAGTACAAGTTTTATTGCTATGAGTTCAGTACAACA ++SRX079804:1:SRR292678:1:1101:198993:198993 2:N:0:1 BH:failed +<<<=;@B??@<>@><48876EADEG6B.BB@.?+98204<:<>@?A=@EFEFFFEEFB +@SRX079804:1:SRR292678:1:1101:204480:204480 1:N:0:1 BH:failed +AGTGAGACACCCCTGAACATTCCTAGTAAGACATCTTTGAATATTACTAGTTAGCCACACTTTAAAATGACCCG ++SRX079804:1:SRR292678:1:1101:204480:204480 1:N:0:1 BH:failed +<98;<@@@:@CD@BCCDD=DBBCEBBAAA@9???@BCDBCGF=GEGDFGDBEEEEEFFFF=EDEE=DCD@@BBC +@SRX079804:1:SRR292678:1:1101:212327:212327 2:N:0:1 BH:failed +TTACCTCTGCTTTTTCGCCTGTTACTTCTACTAATCCTTCATCTATTGCGAATGGCCCTACTACTGACGAAAT ++SRX079804:1:SRR292678:1:1101:212327:212327 2:N:0:1 BH:failed +DBCCC@@;A@BDCCCE>BBED>GDCDBFBFFEBEECFGGD@@BCB<<8@;09746:@@>@EEECEEDE/FEED +@SRX079804:1:SRR292678:1:1101:230386:230386 1:N:0:1 BH:failed +TTCTGATTGGAGTGAGAGTGCCATTTGTTTCGCTGATTGGACGTTGGAAAGCGCCTTGACCTTTGACAGCAG ++SRX079804:1:SRR292678:1:1101:230386:230386 1:N:0:1 BH:failed +=BDD=DCDBBCDADD@@B;B@CC7C@B@>=BACDD,=??8DDCDD7CCCCDBDEDBDDDDDEEBDDDBD?DB +@SRX079804:1:SRR292678:1:1101:236093:236093 1:N:0:1 BH:failed +CATCTAACTCTCATTCTAGATTCTTAAGTTGGCTACACTTTGCCGTCATTCTCGGTGGATTAGCTATTGGG ++SRX079804:1:SRR292678:1:1101:236093:236093 1:N:0:1 BH:failed +A:@A@;@BB@GGFGG@A@@817729B +@SRX079804:1:SRR292678:1:1101:251912:251912 2:N:0:1 BH:ok +TGTAAATGGAAATGAACCTAATATGTATGCACAAACTATTAAAGCATATCTTGCAAAAGGAGCAATGGCG ++SRX079804:1:SRR292678:1:1101:251912:251912 2:N:0:1 BH:ok +GHFDHHHGHHHFFHHGHHHGGGGGGDEFFEGGGGEFHHFHGFGHHHHFHHDDD6@=DCACFFGGEFBBG +@SRX079804:1:SRR292678:1:1101:278698:278698 1:N:0:1 BH:ok +CTAATAATGGTAATTGAACCATAGAAGATAAGTTCATAATGTAATAAATACATCCATAGAGTTATTAA ++SRX079804:1:SRR292678:1:1101:278698:278698 1:N:0:1 BH:ok 
+DDBDBCCCDD@FFFB9<<<@DA=DA@B:@=@@AC@GGFCGECFFDGGCGFFGGFFCEBF9>?@>BDFF +@SRX079804:1:SRR292678:1:1101:295878:295878 1:N:0:1 BH:changed:1 +AATGCAAACAGGATGATATTTGAATCCGTAATACTGTTCTTTCATCATAAATAATTTATGCAGATAC ++SRX079804:1:SRR292678:1:1101:295878:295878 1:N:0:1 BH:changed:1 +HHHHHEGFHDGFEGBCBEEEGGGG@EDGCGBBBEGF4?EFDBDDBFE8DEE-E?EE;B@EFC=;FDE +@SRX079804:1:SRR292678:1:1101:306575:306575 1:N:0:1 BH:failed +ATCTTTGAACTTTGTCTCACTTTCCCCCATCTCACCATTCCTCCTGTTCTGTGAACCCCAGTTTCA ++SRX079804:1:SRR292678:1:1101:306575:306575 1:N:0:1 BH:failed +E::EA@E<6B8>97:<6084649?@:?EDED=BEBEGGFEDGECECBDEFDDEEGGEDDFFF +@SRX079804:1:SRR292678:1:1101:403661:403661 1:N:0:1 BH:failed +GATGGCTTTGCTTTCTCATTCTCCTCTCCATCGTTCCCATCTTCGCCCTCAGACGCTGATTGAT ++SRX079804:1:SRR292678:1:1101:403661:403661 1:N:0:1 BH:failed +EG=DFFDFFDDGDGGGGGFFGGGGBADADCEEE5EC>CCCE6BEEEGGGGCBEGGE@9BCEF;>>D@D +@SRX079804:1:SRR292678:1:1101:425870:425870 1:N:0:1 BH:failed +GTACAGCTCTCCTCGTTACCAGCACATCTTGGACACCCGACGAAGACTTTGACATGCTCCTC ++SRX079804:1:SRR292678:1:1101:425870:425870 1:N:0:1 BH:failed +E@EED@EF=D>=EED@D@7DBF +@SRX079804:1:SRR292678:1:1101:429745:429745 1:N:0:1 BH:failed +GGCGATTGTGAAGGCATAAGAGTGGGACATAGTTCAAGTCCAGAACGAATTAAACGCACAA ++SRX079804:1:SRR292678:1:1101:429745:429745 1:N:0:1 BH:failed +BB2B=A@A:BFBFFFFF +@SRX079804:1:SRR292678:1:1101:475293:475293 2:N:0:1 BH:ok +TACATTTGATTTCTTTATAAGATTTCTTACTGTAAAATCATCGCTATTTAACAGCTTATT ++SRX079804:1:SRR292678:1:1101:475293:475293 2:N:0:1 BH:ok +FHFFFFDC@FGFEDGE?EEDC6EEEDEF?EEEE8EHHGGHFGFFEGGGGBFBDBDEBCBCA +@SRX079804:1:SRR292678:1:1101:511594:511594 1:N:0:1 BH:ok +AGCACAATCAGATTCGCTTATGACGGCGATGAAGAAATTGCGATGAAATGTGAGGTGA ++SRX079804:1:SRR292678:1:1101:511594:511594 1:N:0:1 BH:ok +HCHFFHHHGFHBFFFEGFFEFHFEHGBGECHEHB?CDDEFCDBFF9DFCD.BC +@SRX079804:1:SRR292678:1:1101:527839:527839 2:N:0:1 BH:changed:4 +GACATTTCTTTCTGGGAAGGCTTATTAAACGATAAAGATAATGATATACGTTTTGAT ++SRX079804:1:SRR292678:1:1101:527839:527839 2:N:0:1 BH:changed:4 
+>35/-;,><04%'A.?4?:>BE.DC-@???CE:@EDFDBDG7B;=<)?1.@?2A5<5 +@SRX079804:1:SRR292678:1:1101:547309:547309 1:N:0:1 BH:failed +AGGTCTGCACTGGCTCTTCAGAGCGCAAGCGAGGGAAGAGAAATATAGTGACGCAC ++SRX079804:1:SRR292678:1:1101:547309:547309 1:N:0:1 BH:failed +GEGFGFD=FDB8B7DDFFFF@/DC@+2:66>-@>9);<2: +@SRX079804:1:SRR292678:1:1101:570367:570367 1:N:0:1 BH:failed +CTATGGCCACTTTTGTATTCTCGATTGAGGTTATTCGCTCACCCATCTTTTCCAA ++SRX079804:1:SRR292678:1:1101:570367:570367 1:N:0:1 BH:failed +FFEE;FFFFBFFFFF<9BEDEBEACDD3DD0B5>>0?:@>FFBEEBEFFFFGCCBCDGIGDGHEFGG=GGGGHFHEHF@FEFE?CGEEEFADEAEC +@SRX079804:1:SRR292678:1:1101:590521:590521 1:N:0:1 BH:failed +CCTCTTCTCTTTCGCGGTCATCTTGGGTTTCGCGCCTTTCTTCTTGACGACAC ++SRX079804:1:SRR292678:1:1101:590521:590521 1:N:0:1 BH:failed +EGFDD@FDF=FDFFEEGGBEFDGF=FGAGEEEEEDFBGGEE8EEE@>5;>90< +@SRX079804:1:SRR292678:1:1101:601307:601307 1:N:0:1 BH:changed:1 +TTGGCGTGCTGATGATTATCGGTATCTTCAAAGGCGCGCAGCCTGCGGGCTG ++SRX079804:1:SRR292678:1:1101:601307:601307 1:N:0:1 BH:changed:1 +GGGEGFGGEGE:EE>GFFGGGGDCGEBFFF>G=EBFFEC?DFGAD?DDECBE +@SRX079804:1:SRR292678:1:1101:631057:631057 1:N:0:1 BH:failed +CACCATTATCCTATTTCTGAACACATTTGACAGTCACGGCACTAGCATTGG ++SRX079804:1:SRR292678:1:1101:631057:631057 1:N:0:1 BH:failed +GGGGGGGGBFGGGEGFGGGGGFGEFFDFFFF?EBFEEDBFGEE@BE;E?E7 +@SRX079804:1:SRR292678:1:1101:654270:654270 1:N:0:1 BH:failed +ATCTTTCTCTCTCTGCTGCATTCTCCGCTTCAGCTCCTCAATTTCAATCA ++SRX079804:1:SRR292678:1:1101:654270:654270 1:N:0:1 BH:failed +GGGGGGEDGGFEGDGCGGBCEFGGGGFBFGGEGGGGGEGFCFEEGGFEDB +@SRX079804:1:SRR292678:1:1101:667761:667761 1:N:0:1 BH:failed +CAGCCTTTTGAGGTCGTCTATCGCAGCGTGTCCGCGACGTTTTGTTGCG ++SRX079804:1:SRR292678:1:1101:667761:667761 1:N:0:1 BH:failed +GGFGGG=GGGG@GFGGG@GGEGFGGGEGGGFGGEG@EEEDE8EE=E=DE +@SRX079804:1:SRR292678:1:1101:671526:671526 1:N:0:1 BH:ok +CAAGTAGAAGGTTAGCGCCTCTCTGTAAAAGGAGTCAAGCGCTATGTC ++SRX079804:1:SRR292678:1:1101:671526:671526 1:N:0:1 BH:ok +HHHHHGHEHHDGGGCGGGGHFGGGGEDGGGGGBEBEBEEEFGHFHFHH +@SRX079804:1:SRR292678:1:1101:685633:685633 
2:N:0:1 BH:failed +GCTGTAATGGATCCACTAATTGGGACAGTGGTGGATAAAACGAATAC ++SRX079804:1:SRR292678:1:1101:685633:685633 2:N:0:1 BH:failed +DADADCGFEFEEEGEGGEFDEEEEBDAC;C10<9?AGBGGGFF7DFB +@SRX079804:1:SRR292678:1:1101:769626:769626 1:N:0:1 BH:failed +GTATTCCTTGACAGTCGAAAGAATCACTGCTAACCCAGGC ++SRX079804:1:SRR292678:1:1101:769626:769626 1:N:0:1 BH:failed +C>5BBCCCD=ACDD@A7@@B@A?B?=8B??EDEEDEBDEE +@SRX079804:1:SRR292678:1:1101:776222:776222 1:N:0:1 BH:failed +AGAACGTACACCCTACGCTAAGCAGTGGCTCCATGCCAA ++SRX079804:1:SRR292678:1:1101:776222:776222 1:N:0:1 BH:failed +BGDGGGGFEGGGGEGGGGFGFF6=FCFAFFEE,=C?EDD +@SRX079804:1:SRR292678:1:1101:782183:782183 2:N:0:1 BH:failed +TACGCGTAACGACGTCATAGCCATGACGCTTCAATAAA ++SRX079804:1:SRR292678:1:1101:782183:782183 2:N:0:1 BH:failed +D +@SRX079804:1:SRR292678:1:1101:828383:828383 1:N:0:1 BH:failed +AAACCACATATGACATGAGTGACGGGACTAAAGTTC ++SRX079804:1:SRR292678:1:1101:828383:828383 1:N:0:1 BH:failed +FFEEFEEB=E,C>CDEEEECEBEEEC?F;BDDDDAE +@SRX079804:1:SRR292678:1:1101:829239:829239 1:N:0:1 BH:ok +TCGATCCTTCTGCCTCAAAGTATACTAGGACGCAT ++SRX079804:1:SRR292678:1:1101:829239:829239 1:N:0:1 BH:ok +GGGDFGGBGFFEBFEDCBCDCGGGGBEEE=GE?EE +@SRX079804:1:SRR292678:1:1101:868419:868419 1:N:0:1 BH:ok +ATTCGTCAGGCCCAATAACATCATGAATTTCCAG ++SRX079804:1:SRR292678:1:1101:868419:868419 1:N:0:1 BH:ok +DEEEEEEEBDFFFFFFFF8FEED8@FFFBFFEFF +@SRX079804:1:SRR292678:1:1101:892716:892716 2:N:0:1 BH:ok +AATTTACCTAATGGAATCAATGAGGCTACTCCA ++SRX079804:1:SRR292678:1:1101:892716:892716 2:N:0:1 BH:ok +@BCBBBCCBCCCCCCCDDAAFFGEEEDBF@EE@ +@SRX079804:1:SRR292678:1:1101:893159:893159 2:N:0:1 BH:ok +TTCTCTGCTTTTCATATCTTGTCATAAAAATT ++SRX079804:1:SRR292678:1:1101:893159:893159 2:N:0:1 BH:ok +CBDCDEEEEEBEEEEGDDFDEEEEDGFFEGFE +@SRX079804:1:SRR292678:1:1101:918742:918742 1:N:0:1 BH:failed +CTCTCCATGCACAAAGAATATCACAGCCAAA ++SRX079804:1:SRR292678:1:1101:918742:918742 1:N:0:1 BH:failed +EEEBA?@;B@EEE@BEE=?EDDDDADCDA?E +@SRX079804:1:SRR292678:1:1101:923787:923787 2:N:0:1 BH:ok 
+TTGTGAAGGATGGGATATTAGTGTAGATGA ++SRX079804:1:SRR292678:1:1101:923787:923787 2:N:0:1 BH:ok +EEBBEGEEE=BBB<@DCDCGD@D>=DEGEE +@SRX079804:1:SRR292678:1:1101:933189:933189 1:N:0:1 BH:failed +GTCTGCACTATCGAGGGCTGTGCCTTTGC ++SRX079804:1:SRR292678:1:1101:933189:933189 1:N:0:1 BH:failed +FEFFDBFF8FE>?DFFFCEBCEEBBEDE6 +@SRX079804:1:SRR292678:1:1101:937136:937136 1:N:0:1 BH:failed +TTTCTTTGGCTTAAAGATAGTTTTAGTC ++SRX079804:1:SRR292678:1:1101:937136:937136 1:N:0:1 BH:failed +EFFFEEEEFCBCDDDDE@/E?@@7@@3< +@SRX079804:1:SRR292678:1:1101:940351:940351 1:N:0:1 BH:changed:1 +TGCCGTGGGAATGACAAACAAGCATCC ++SRX079804:1:SRR292678:1:1101:940351:940351 1:N:0:1 BH:changed:1 +DECC@GFFBF=EBEAFDFGD?FFF8FF +@SRX079804:1:SRR292678:1:1101:940693:940693 1:N:0:1 BH:failed +CACATTATGAACTATGGGCACTGCAT ++SRX079804:1:SRR292678:1:1101:940693:940693 1:N:0:1 BH:failed +EEEGFDEDFEGGGGGFEGBGGGFGGG +@SRX079804:1:SRR292678:1:1101:955819:955819 1:N:0:1 BH:failed +CACCTAGCAGCAACGGACGAGTCAG ++SRX079804:1:SRR292678:1:1101:955819:955819 1:N:0:1 BH:failed +GGGGGEEEGGEGGGFGEGG;F@EFF +@SRX079804:1:SRR292678:1:1101:958051:958051 2:N:0:1 BH:ok +TTAATATTTCCATCTGAACTTCGC ++SRX079804:1:SRR292678:1:1101:958051:958051 2:N:0:1 BH:ok +EDDBGFEGFGHHFHGGEDEGBGDB +@SRX079804:1:SRR292678:1:1101:996098:996098 1:N:0:1 BH:failed +CTAAGAGAGTTTGTAATGCGGAC ++SRX079804:1:SRR292678:1:1101:996098:996098 1:N:0:1 BH:failed +DD=DBDBDC4EFFFD@?CD@ACD +@SRX079804:1:SRR292678:1:1101:1020278:1020278 2:N:0:1 BH:ok +AAAGTGCAGAACATGCAGATAT ++SRX079804:1:SRR292678:1:1101:1020278:1020278 2:N:0:1 BH:ok +D>AC?GDDCD?DDADE@GABDG +@SRX079804:1:SRR292678:1:1101:1022234:1022234 1:N:0:1 BH:ok +GACCTTTCCGCAAGCTGTCGC ++SRX079804:1:SRR292678:1:1101:1022234:1022234 1:N:0:1 BH:ok +HHHHFEHHHHGGHHHGDHEEG +@SRX079804:1:SRR292678:1:1101:1024144:1024144 1:N:0:1 BH:failed +TCCATTATGAAAGAAGAAAA ++SRX079804:1:SRR292678:1:1101:1024144:1024144 1:N:0:1 BH:failed +@A><96:6: +@SRX079804:1:SRR292678:1:1101:1175112:1175112 1:N:0:1 BH:failed +AGGCC 
++SRX079804:1:SRR292678:1:1101:1175112:1175112 1:N:0:1 BH:failed +EC8EE +@SRX079804:1:SRR292678:1:1101:1182927:1182927 1:N:0:1 BH:changed:1 +TGAA ++SRX079804:1:SRR292678:1:1101:1182927:1182927 1:N:0:1 BH:changed:1 +GGBH +@SRX079804:1:SRR292678:1:1101:1243474:1243474 1:N:0:1 BH:ok +ACG ++SRX079804:1:SRR292678:1:1101:1243474:1243474 1:N:0:1 BH:ok +EEE +@SRX079804:1:SRR292678:1:1101:1266246:1266246 1:N:0:1 BH:failed +AA ++SRX079804:1:SRR292678:1:1101:1266246:1266246 1:N:0:1 BH:failed +C@ +@SRX079804:1:SRR292678:1:1101:1269735:1269735 1:N:0:1 BH:failed +C ++SRX079804:1:SRR292678:1:1101:1269735:1269735 1:N:0:1 BH:failed +G From d962c092abdabf627e25b869de6803e963e19f64 Mon Sep 17 00:00:00 2001 From: lsmertina Date: Sun, 25 Feb 2024 08:25:10 +1100 Subject: [PATCH 16/24] add script for task 5 hw14 --- biopython_fastq_filter.py | 87 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) diff --git a/biopython_fastq_filter.py b/biopython_fastq_filter.py index b3d4927..b2fb7e3 100644 --- a/biopython_fastq_filter.py +++ b/biopython_fastq_filter.py @@ -32,3 +32,90 @@ def filter_fastq(input_path: str, quality_threshold: int, output_filename="final print(result_quality) #filter_fastq("example_fastq.fastq", 15) + + +from abc import ABC, abstractmethod + +class InvalidInputError(ValueError): + pass + +class BiologicalSequence(ABC, str): + @abstractmethod + def __init__(self, seq): + self.seq = seq + + def __len__(self): + return len(self.seq) + + def __getitem__(self, index): + return self.seq[int(index)] + + def __repr__(self): + return __str__(self.seq) + + def check_nucleic_acid(self): + unique_chars = set(self.seq) + nucleotides_dna = set('ATGCatgc') + nucleotides_rna = set('AUGCaugc') + if unique_chars <= nucleotides_dna: + seq = 'dna' + elif unique_chars <= nucleotides_rna: + seq = 'rna' + else: + raise InvalidInputError() + return seq_type + +class NucleicAcidSequence(BiologicalSequence): + #complement_dict = None + def __init__(self, seq): + 
super().__init__(seq) + self.check_nucleic_acid() + self.length = len(self.seq) + + def complement(self): + list_input = list(self.seq) + for i in range(len(self.seq)): + if list_input[i] in self.complement_dict: + list_input[i] = self.complement_dict[list_input[i]] + return "".join(list_input) + +class DNASequence(NucleicAcidSequence): + complement_dict = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', 'a': 't', 't': 'a', 'g': 'c', 'c': 'g'} + def __init__(self, seq): + super().__init__(seq) + self.complement() + + def transcribe(self): + list_input = list(self.seq) + for i in range(len(self.seq)): + if (list_input[i] == 'T'): + list_input[i] = 'U' + elif (list_input[i] == 't'): + list_input[i]='u' + return "".join(list_input) + +class RNASequence(NucleicAcidSequence): + complement_dict = {'A': 'U', 'U': 'A', 'G': 'C', 'C': 'G', 'a': 'u', 'u': 'a', 'g': 'c', 'c': 'g'} + def __init__(self, seq): + super().__init__(seq) + self.complement() + +class AminoAcidSequence(BiologicalSequence): + def __init__(self, seq): + self.seq = seq + + def amino_acid_frequency(self): + """Calculates molecular weight of a protein + Arguments: + - seq (str) 1-letter coded protein sequence + Return: + - int, molecular weight (g/mol) rounded to integer""" + freq_dict = {} + for letter in self.seq: + if letter in freq_dict: + freq_dict[letter] += 1 + else: + freq_dict[letter] = 1 + for letter in freq_dict: + freq_dict[letter] = round(freq_dict[letter] / len(self.seq) * 100, 2) + return freq_dict From b3ad95070e9ed3a7c851c858305b96bc5ac203a1 Mon Sep 17 00:00:00 2001 From: lsmertina Date: Wed, 1 May 2024 19:05:24 +1000 Subject: [PATCH 17/24] add files and data --- Showcases.ipynb | 449 ++++++++++++++++++ bio_files_processor.py | 73 +++ biopython_fastq_filter.py | 84 +++- custom_random_forest.py | 78 +++ data/example_fasta.fasta | 18 + data/example_fasta.fasta:Zone.Identifier | 3 + .../example_fastq.fastq | 0 data/example_fastq.fastq:Zone.Identifier | 3 + data/sequence.fasta | 268 +++++++++++ 
test_my_tools.py | 112 +++++ 10 files changed, 1083 insertions(+), 5 deletions(-) create mode 100644 Showcases.ipynb create mode 100644 custom_random_forest.py create mode 100644 data/example_fasta.fasta create mode 100644 data/example_fasta.fasta:Zone.Identifier rename example_fastq.fastq => data/example_fastq.fastq (100%) create mode 100644 data/example_fastq.fastq:Zone.Identifier create mode 100644 data/sequence.fasta create mode 100644 test_my_tools.py diff --git a/Showcases.ipynb b/Showcases.ipynb new file mode 100644 index 0000000..676ef72 --- /dev/null +++ b/Showcases.ipynb @@ -0,0 +1,449 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "39e3aac3-ed97-4c58-871e-a334ee17c2a4", + "metadata": {}, + "source": [ + "# OpenFasta" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "fec18e02-01e2-41ef-97c0-26aeffd2f057", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id = 'GTD323452'\n", + " description = '5S_rRNA'\n", + " sequence = 'ACGGCCATAGGACTTTGAAAGCACCGCATCCCGTCCGATCTGCGAAGTTAACCAAGATGCCGCCTGGTTAGTACCATGGTGGGGGACCACATGGGAATCCCTGGTGCTGTG'\n" + ] + } + ], + "source": [ + "import time\n", + "import os\n", + "from typing import Optional\n", + "\n", + "from bio_files_processor import OpenFasta\n", + "\n", + "fasta_file = \"data/example_fasta.fasta\"\n", + "\n", + "with OpenFasta(fasta_file) as fasta:\n", + " for record in fasta.read_records():\n", + " print(record) " + ] + }, + { + "cell_type": "markdown", + "id": "f2cff184-adc1-4396-aabe-5634dd654efc", + "metadata": {}, + "source": [ + "# Run_genscan" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "2bf845cb-ff8a-4dac-b121-c40f5351dc99", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Cannot provide both a sequence containing file and a sequence in the form. Please choose one.\n", + "\n", + "GENSCAN Output\n", + "\n", + "\n", + "\n", + "

GENSCAN Output

\n", + "
\n", + "View gene model output: PS | PDF\n", + "
\n",
+       "GENSCAN 1.0\tDate run: 29-Apr-124\tTime: 18:45:12\n",
+       "\n",
+       "\n",
+       "\n",
+       "Sequence /tmp/04_29_24-18:45:12.fasta : 18651 bp : 48.67% C+G : Isochore 2 (43 - 51 C+G%)\n",
+       "\n",
+       "\n",
+       "\n",
+       "Parameter matrix: HumanIso.smat\n",
+       "\n",
+       "\n",
+       "\n",
+       "Predicted genes/exons:\n",
+       "\n",
+       "\n",
+       "\n",
+       "Gn.Ex Type S .Begin ...End .Len Fr Ph I/Ac Do/T CodRg P.... Tscr..\n",
+       "\n",
+       "----- ---- - ------ ------ ---- -- -- ---- ---- ----- ----- ------\n",
+       "\n",
+       "\n",
+       "\n",
+       " 1.01 Intr +     64    144   81  2  0   40   99   113 0.455   7.21\n",
+       "\n",
+       " 1.02 Intr +   8152   8490  339  2  0   86  110   352 0.992  32.65\n",
+       "\n",
+       " 1.03 Intr +   9870   9933   64  1  1   88  115    68 0.994   7.28\n",
+       "\n",
+       " 1.04 Intr +  14487  14599  113  0  2   71  100   118 0.742  11.32\n",
+       "\n",
+       " 1.05 Intr +  16751  16773   23  0  2   84  110     7 0.471  -0.24\n",
+       "\n",
+       " 1.06 Intr +  17109  17243  135  2  0  101   74   173 0.993  17.96\n",
+       "\n",
+       " 1.07 Term +  17717  17872  156  1  0  109   38    97 0.989   4.73\n",
+       "\n",
+       " 1.08 PlyA +  18627  18632    6                               1.05\n",
+       "\n",
+       "\n",
+       "\n",
+       "\n",
+       "\n",
+       "Suboptimal exons with probability > 1.000\n",
+       "\n",
+       "\n",
+       "\n",
+       "Exnum Type S .Begin ...End .Len Fr Ph B/Ac Do/T CodRg P.... Tscr..\n",
+       "\n",
+       "----- ---- - ------ ------ ---- -- -- ---- ---- ----- ----- ------\n",
+       "\n",
+       "\n",
+       "\n",
+       "\n",
+       "\n",
+       "NO EXONS FOUND AT GIVEN PROBABILITY CUTOFF\n",
+       "\n",
+       "\n",
+       "\n",
+       "\n",
+       "\n",
+       "Predicted peptide sequence(s):\n",
+       "\n",
+       "\n",
+       "\n",
+       "\n",
+       "\n",
+       ">/tmp/04_29_24-18:45:12.fasta|GENSCAN_predicted_peptide_1|303_aa\n",
+       "\n",
+       "XSTEGNGDLSEEKMPLLTLYLLLFWLSGYSIVTQITGPTTVNGLERGSLTVQCVYRSGWE\n",
+       "\n",
+       "TYLKWWCRGAIWRDCKILVKTSGSEQEVKRDRVSIKDNQKNRTFTVTMEDLMKTDADTYW\n",
+       "\n",
+       "CGIEKTGNDLGVTVQVTIDPAPVTQEETSSSPTLTGHHLDNRHKLLKLSVLLPLIFTILL\n",
+       "\n",
+       "LLLVAASLLAWRMMKYQQKAAGMSPEQVLQPLEGDLCYADLTLQLAGTSPQKATTKLSSA\n",
+       "\n",
+       "QVDQVEVEYVTMASLPKEDISYASLTLGAEDQEPTYCNMGHLSSHLPGRGPEEPTEYSTI\n",
+       "\n",
+       "SRP\n",
+       "\n",
+       "
\n", + "
\n", + "
\n", + "

Back to GENSCAN\n", + "

\n" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from bs4 import BeautifulSoup\n", + "from biopython_fastq_filter import run_genscan\n", + "\n", + "output = run_genscan(sequence=None, sequence_file=\"data/sequence.fasta\", organism=\"Vertebrate\", exon_cutoff=1.00, sequence_name=\"\")\n", + "soup = BeautifulSoup(output, 'html.parser')\n", + "lines = soup.prettify().split(\"\\n\")\n", + "cds_list = []\n", + "soup" + ] + }, + { + "cell_type": "markdown", + "id": "aea07324-6768-4c42-8371-c7ba75436049", + "metadata": {}, + "source": [ + "# RNASequence, DNASequence, AminoAcidSequence " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "2aafe0ba-fecb-467c-af55-977c5850c59d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "AAUU\n", + "{'K': 33.33, 'P': 33.33, 'L': 33.33}\n", + "CCTT\n" + ] + }, + { + "data": { + "text/plain": [ + "str" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from biopython_fastq_filter import NucleicAcidSequence, BiologicalSequence, RNASequence, DNASequence, AminoAcidSequence \n", + "\n", + "new_rna = RNASequence('UUAA')\n", + "print(new_rna.complement())\n", + "\n", + "new_protein = AminoAcidSequence('KKPPLL')\n", + "print(new_protein.amino_acid_frequency())\n", + "\n", + "new_dna = DNASequence('GGAA')\n", + "print(new_dna.complement())\n", + "type(new_dna.complement())" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "134df12f-398c-49f7-9b32-c95ebfe3f89e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1m============================= test session starts ==============================\u001b[0m\n", + "platform linux -- Python 3.12.0, pytest-8.2.0, pluggy-1.5.0\n", + "rootdir: /home/lsmertina/misc_module\n", + "plugins: anyio-4.3.0, requests-mock-1.12.1\n", + "collected 8 
items \u001b[0m\n", + "\n", + "test_my_tools.py \u001b[32m.\u001b[0m\u001b[32m.\u001b[0m\u001b[32m.\u001b[0m\u001b[32m.\u001b[0m\u001b[32m.\u001b[0m\u001b[32m.\u001b[0m\u001b[32m.\u001b[0m\u001b[32m.\u001b[0m\u001b[33m [100%]\u001b[0m\n", + "\n", + "\u001b[33m=============================== warnings summary ===============================\u001b[0m\n", + "test_my_tools.py::test_filter_fastq\n", + " /home/lsmertina/miniforge3/envs/testing/lib/python3.12/site-packages/Bio/SeqUtils/__init__.py:144: BiopythonDeprecationWarning: GC is deprecated; please use gc_fraction instead.\n", + " warnings.warn(\n", + "\n", + "-- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html\n", + "\u001b[33m========================= \u001b[32m8 passed\u001b[0m, \u001b[33m\u001b[1m1 warning\u001b[0m\u001b[33m in 0.29s\u001b[0m\u001b[33m =========================\u001b[0m\n" + ] + } + ], + "source": [ + "! python -m pytest" + ] + }, + { + "cell_type": "markdown", + "id": "5ef9fe6c-3fb6-41e4-97fe-eecde2c97028", + "metadata": {}, + "source": [ + "# RandomForestClassifierCustom" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "d9581b19-6f73-474d-9159-b74c97c3ab40", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "import warnings\n", + "import random\n", + "import math\n", + "import pandas as pd\n", + "import xgboost\n", + "import lightgbm\n", + "import catboost\n", + "\n", + "from matplotlib.colors import ListedColormap\n", + "from scipy.stats import pearsonr\n", + "from itertools import combinations\n", + "from sklearn.base import BaseEstimator\n", + "from sklearn import datasets\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.ensemble import (RandomForestClassifier,\n", + " ExtraTreesClassifier,\n", + " VotingClassifier)\n", + "from sklearn.tree import (DecisionTreeRegressor,\n", + " DecisionTreeClassifier)\n", + "from 
custom_random_forest import RandomForestClassifierCustom\n", + "import time\n", + "from sklearn.datasets import make_classification" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "f03467db-99fa-4b07-9e18-9dc015b2b28d", + "metadata": {}, + "outputs": [], + "source": [ + "X, y = make_classification(n_samples=100000)\n", + "random_forest = RandomForestClassifierCustom(max_depth=30, n_estimators=10, \n", + " max_features=2, random_state=42)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "7b735b11-70e9-4b19-b82f-fa7cca0aa901", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2.9622790813446045\n" + ] + } + ], + "source": [ + "import time\n", + "start_time = time.time()\n", + "random_forest.fit(X, y, n_processes=1)\n", + "fit_time_1_process = time.time() - start_time\n", + "print(fit_time_1_process)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "3c7c0196-8209-4518-95f0-753972c2a22e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2.2971296310424805\n" + ] + } + ], + "source": [ + "#Fit with 2 processes\n", + "start_time = time.time()\n", + "random_forest.fit(X, y, n_processes=2)\n", + "fit_time_2_processes = time.time() - start_time\n", + "print(fit_time_2_processes)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "e66deca9-1f5a-44a6-98bf-bff0ea6cade6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.21371793746948242\n" + ] + } + ], + "source": [ + "#Predict with 1 process\n", + "start_time = time.time()\n", + "predictions_1_process = random_forest.predict(X, n_processes=1)\n", + "predict_time_1_process = time.time() - start_time\n", + "print(predict_time_1_process)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "d6bfb90d-51bd-4a76-a405-67172981d061", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "0.2030501365661621\n" + ] + } + ], + "source": [ + "#Predict with 2 processes\n", + "start_time = time.time()\n", + "predictions_2_processes = random_forest.predict(X, n_processes=2)\n", + "predict_time_2_processes = time.time() - start_time\n", + "print(predict_time_2_processes)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "c9ea23c3-3e68-4208-82c3-64be50b47992", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fit time with 1 process: 2.9622790813446045\n", + "Fit time with 2 processes: 2.2971296310424805\n", + "Predict time with 1 process: 0.21371793746948242\n", + "Predict time with 2 processes: 0.2030501365661621\n", + "Predictions match: True\n" + ] + } + ], + "source": [ + "#Check if predictions are the same\n", + "predictions_match = np.array_equal(predictions_1_process, predictions_2_processes)\n", + "\n", + "print(\"Fit time with 1 process:\", fit_time_1_process)\n", + "print(\"Fit time with 2 processes:\", fit_time_2_processes)\n", + "print(\"Predict time with 1 process:\", predict_time_1_process)\n", + "print(\"Predict time with 2 processes:\", predict_time_2_processes)\n", + "print(\"Predictions match:\", predictions_match)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/bio_files_processor.py b/bio_files_processor.py index e69de29..8a9f12d 100644 --- a/bio_files_processor.py +++ b/bio_files_processor.py @@ -0,0 +1,73 @@ +import os +from typing import Optional +from dataclasses import dataclass + +def 
convert_multiline_fasta_to_oneline(input_fasta: str, output_fasta: Optional[str]=None) -> str: + """ + This function converts a multiline fasta file into one line fasta + Arguments: input file and output file name (optional) + Returns a file in current working directory + """ + if output_fasta == None: + output_fasta = 'output_fasta.fasta' + current_dir = str(os.getcwd()) + output_file = os.path.join(current_dir, output_fasta) + with open(input_fasta) as input_file, open(output_fasta, mode='w') as output_file: + help_list = [] + for line in input_file: + if line.startswith('>'): + if len(help_list)!=0: + output_file.write(''.join(help_list) + '\n') + help_list = [] + output_file.write(line) + else: + help_list.append(line.strip()) + if len(help_list)!=0: + output_file.write(''.join(help_list) + '\n') + return output_file + +@dataclass +class FastaRecord: + id: str + description: str + sequence: str + + def __repr__(self): + return f"id = '{self.id}'\n description = '{self.description}'\n sequence = '{self.sequence}'" + +class OpenFasta: + def __init__(self, filename): + self.filename = filename + self.file = None + + def __enter__(self): + self.file = open(self.filename, 'r') + return self + + def __exit__(self, exc_type, exc_value, traceback): + if self.file: + self.file.close() + + def read_record(self): + name_line = self.file.readline().strip() + if not name_line: + return None + parts = name_line.split(' ', 2) + if len(parts) < 2: + return None + id = parts[0][1:] + desc = parts[1] + seq = '' + line = self.file.readline().strip() + while line and not line.startswith(">"): + seq += line + line = self.file.readline().strip() + return FastaRecord(id=id, description=desc, sequence=seq) + + def read_records(self): + records = [] + record = self.read_record() + while record: + records.append(record) + record = self.read_record() + return records \ No newline at end of file diff --git a/biopython_fastq_filter.py b/biopython_fastq_filter.py index b2fb7e3..1f525b8 
100644 --- a/biopython_fastq_filter.py +++ b/biopython_fastq_filter.py @@ -1,7 +1,17 @@ from Bio import SeqIO from Bio.SeqUtils import GC +import requests +from bs4 import BeautifulSoup +import time +import os +from dotenv import load_dotenv +from dataclasses import dataclass +from typing import List -def filter_fastq(input_path: str, quality_threshold: int, output_filename="final_filtered.fastq", gc_bounds=(40, 60), length_bounds=(50, 350)): +load_dotenv() + + +def filter_fastq(input_path: str, quality_threshold: int, output_filename="final_filtered.fastq", gc_bounds=(40, 60), length_bounds=(30, 350)): filename = input_path records = SeqIO.parse(filename, "fastq") ###quality filter @@ -29,9 +39,7 @@ def filter_fastq(input_path: str, quality_threshold: int, output_filename="final result_quality = SeqIO.write(filtered_GC_quality_length, output_filename, "fastq") - print(result_quality) - -#filter_fastq("example_fastq.fastq", 15) + return result_quality from abc import ABC, abstractmethod @@ -66,7 +74,7 @@ def check_nucleic_acid(self): return seq_type class NucleicAcidSequence(BiologicalSequence): - #complement_dict = None + complement_dict = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', 'a': 't', 't': 'a', 'g': 'c', 'c': 'g'} def __init__(self, seq): super().__init__(seq) self.check_nucleic_acid() @@ -110,7 +118,13 @@ def amino_acid_frequency(self): - seq (str) 1-letter coded protein sequence Return: - int, molecular weight (g/mol) rounded to integer""" + unique_aa = set(self.seq) freq_dict = {} + amino_acids = set('GAVLITSMCPFYWHKRDENQgavlitsmcpfywhkrdenq') + if unique_aa <= amino_acids: + seq = 'peptide' + else: + raise InvalidInputError() for letter in self.seq: if letter in freq_dict: freq_dict[letter] += 1 @@ -119,3 +133,63 @@ def amino_acid_frequency(self): for letter in freq_dict: freq_dict[letter] = round(freq_dict[letter] / len(self.seq) * 100, 2) return freq_dict + +token = os.environ.get('TG_API_TOKEN') + +def send_telegram_message(chat_id: str, message:str): 
+ """this function uses the bot token and a message generated in telegram_logger function + and sends this message through the telegram bot""" + url = f"https://api.telegram.org/bot{token}/sendMessage" + data = {"chat_id": chat_id, "text": message, "parse_mode": "Markdown"} + response = requests.post(url, data=data) + return response.json() + +def telegram_logger(chat_id: str): + """this function is a decorator that times a function and generates a message + regarding its execution result""" + def decorator(func): + def inner_func(*args, **kwargs): + start = time.time() + try: + result = func(*args, **kwargs) + end = time.time() + execution_time = end - start + if execution_time < 86400: + time_str = time.strftime('%H:%M:%S.%f', time.gmtime(execution_time))[:-3] + message = f"🎉Function `{func.__name__}` has finished in `{time_str}` " + else: + days = int(execution_time // 86400) + time_str = str(timedelta(seconds=execution_time)) + message = f"Function `{func.__name__}` has finished in `{days}` days, `{str(timedelta(seconds=execution_time))}` " + send_telegram_message(chat_id, message) + return result + except Exception as error: + message = f"😞Function `{func.__name__}` failed with an exception:\nType: `{type(error).__name__}`\nError: `{str(error)}` " + send_telegram_message(chat_id, message) + return inner_func + return decorator + +@dataclass +class GenscanOutput: + status: str + cds_list: List[str] + intron_list: List[dict] + exon_list: List[dict] + +def run_genscan(sequence=None, sequence_file=None, organism="Vertebrate", exon_cutoff=1.00, sequence_name=""): + url = "http://argonaute.mit.edu/cgi-bin/genscanw_py.cgi" + + if sequence_file: + with open(sequence_file, 'rb') as file: + sequence = file.read().strip() + data = { + "-o": organism, + "-e": exon_cutoff, + "-n": sequence_name, + "-p": "Predicted peptides only", + "-u": sequence_file, + "-s": sequence + } + response = requests.post(url, data=data) + status = response.status_code + return response.content 
\ No newline at end of file diff --git a/custom_random_forest.py b/custom_random_forest.py new file mode 100644 index 0000000..3d27abd --- /dev/null +++ b/custom_random_forest.py @@ -0,0 +1,78 @@ +import multiprocessing +import random +import numpy as np +from sklearn.base import BaseEstimator +import time +from sklearn.datasets import make_classification +from sklearn.ensemble import (RandomForestClassifier, + ExtraTreesClassifier, + VotingClassifier) +from sklearn.tree import (DecisionTreeRegressor, + DecisionTreeClassifier) + +SEED = 111 +random.seed(SEED) +np.random.seed(SEED) + +import multiprocessing +from sklearn.base import BaseEstimator + +class RandomForestClassifierCustom(BaseEstimator): + def __init__(self, n_estimators=10, max_depth=None, max_features=None, random_state=SEED): + self.n_estimators = n_estimators + self.max_depth = max_depth + self.max_features = max_features + self.random_state = random_state + self.trees = [] + self.feat_ids_by_tree = [] + + def fit(self, X, y, n_processes=1): + self.classes_ = sorted(np.unique(y)) + def fit_tree_process(i, queue): + np.random.seed(self.random_state + i) + feat_ids = np.random.choice(range(X.shape[1]), size=self.max_features, replace=False) + pseudo_ids = np.random.choice(range(X.shape[0]), size=X.shape[0], replace=True) + pseudo_X = X[pseudo_ids, :][:, feat_ids] + pseudo_y = y[pseudo_ids] + dt_clf = DecisionTreeClassifier(max_depth=self.max_depth, + max_features=self.max_features, + random_state=self.random_state + i) + dt_clf.fit(pseudo_X, pseudo_y) + queue.put((dt_clf, feat_ids)) + queue = multiprocessing.Queue() + processes = [] + for i in range(self.n_estimators): + p = multiprocessing.Process(target=fit_tree_process, args=(i, queue)) + processes.append(p) + p.start() + results = [] + for _ in range(self.n_estimators): + results.append(queue.get()) + for p in processes: + p.join() + self.trees, self.feat_ids_by_tree = zip(*results) + return self + + def predict_proba(self, X, n_processes=1): + def 
predict_proba_process(tree, feat_ids, queue): + proba = tree.predict_proba(X[:, feat_ids]) + queue.put(proba) + queue = multiprocessing.Queue() + processes = [] + for tree, feat_ids in zip(self.trees, self.feat_ids_by_tree): + p = multiprocessing.Process(target=predict_proba_process, args=(tree, feat_ids, queue)) + processes.append(p) + p.start() + probas = [] + for _ in range(self.n_estimators): + probas.append(queue.get()) + + for p in processes: + p.join() + + return sum(probas) / self.n_estimators + + def predict(self, X, n_processes=1): + probas = self.predict_proba(X, n_processes=n_processes) + return np.argmax(probas, axis=1) + \ No newline at end of file diff --git a/data/example_fasta.fasta b/data/example_fasta.fasta new file mode 100644 index 0000000..ae5498d --- /dev/null +++ b/data/example_fasta.fasta @@ -0,0 +1,18 @@ +>GTD323452 5S_rRNA NODE_272_length_223_cov_0.720238:18-129(+) +ACGGCCATAGGACTTTGAAAGCACCGCATCCCGTCCGATCTGCGAAGTTAACCAAGATGCCGCCTGGTTAGTACCATGGTGGGGGACCACATGGGAATCCCT +GGTGCTGTG +>GTD678345 16S_rRNA NODE_80_length_720_cov_1.094737:313-719(+) +TTGGCTTCTTAGAGGGACTTTTGATGTTTAATCAAAGGAAGTTTGAGGCAATAACAGGTCTGTGATGCCCTTAGATGTTCTGGGCCGCACGCGCGCTACACT +GAGCCCTTGGGAGTGGTCCATTTGAGCCGGCAACGGCACGTTTGGACTGCAAACTTGGGCAAACTTGGTCATTTAGAGGAAGTAAAAGTCGTAACAAGGT +>GTD174893 16S_rRNA NODE_1_length_2558431_cov_75.185164:2153860-2155398(+) +TTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAACGGTAACAGGAAACAGCTTGCTGTTTCGCTGACGAGTGG +GAAGTAGGTAGCTTAACCTTCGGGAGGGCGCTTACCACTTTGTGATTCATGACTGGGGTGAAGTCGTAACAAGGTAACCGTAGGGGAACCTGCGGTTGGATC +ACCTCCTT +>GTD906783 16S_rRNA NODE_1_length_2558431_cov_75.185164:793941-795479(-) +TTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAACGGTAACAGGAAACAGCTTGCTGTTTCGCTGACGAGTGG +GAAGTAGGTAGCTTAACCTTCGGGAGGGCGCTTACCACTTTGTGATTCATGACTGGGGTGAAGTCGTAACAAGGTAACCGTAGGGGAACCTGCGGTTGGATC +ACCTCCTT +>GTD129563 16S_rRNA NODE_4_length_428221_cov_75.638017:281055-282593(-) 
+CGGACGGGTGAGTAATGTCTGGGAAACTGCCTGATGGAGGGGGATAACTACTGGAAACGGTAGCTAATACCGCATAACGTCGCAAGACCAAAGAGGGGGACC +GAAGTAGGTAGCTTAACCTTCGGGAGGGCGCTTACCACTTTGTGATTCATGACTGGGGTGAAGTCGTAACAAGGTAACCGTAGGGGAACCTGCGGTTGGATC +ACCTCCTT \ No newline at end of file diff --git a/data/example_fasta.fasta:Zone.Identifier b/data/example_fasta.fasta:Zone.Identifier new file mode 100644 index 0000000..1bf0b28 --- /dev/null +++ b/data/example_fasta.fasta:Zone.Identifier @@ -0,0 +1,3 @@ +[ZoneTransfer] +ZoneId=3 +HostUrl=https://github.com/ diff --git a/example_fastq.fastq b/data/example_fastq.fastq similarity index 100% rename from example_fastq.fastq rename to data/example_fastq.fastq diff --git a/data/example_fastq.fastq:Zone.Identifier b/data/example_fastq.fastq:Zone.Identifier new file mode 100644 index 0000000..1bf0b28 --- /dev/null +++ b/data/example_fastq.fastq:Zone.Identifier @@ -0,0 +1,3 @@ +[ZoneTransfer] +ZoneId=3 +HostUrl=https://github.com/ diff --git a/data/sequence.fasta b/data/sequence.fasta new file mode 100644 index 0000000..434d31e --- /dev/null +++ b/data/sequence.fasta @@ -0,0 +1,268 @@ +>NC_000017.11:c74712923-74694317 Homo sapiens chromosome 17, GRCh38.p14 Primary Assembly +AGTTTGTTCCTGCTGCCAGGCTCCACTGAGGGGAACGGGGACCTGTCTGAAGAGAAGATGCCCCTGCTGA +CACTCTACCTGCTCCTCTTCTGGCTCTCAGGTGAGCGGGCCTGGGTCTGTCTTCTTGGGGAAGCTTAGCA +AGCAGGAGGAGGTGGCTGAGAGAGGGAAGAAGGGACCCGGCCAGAAGGTGTCCTTAATGGCATGAGCCTT +GTGCGTTTCATCCACATACCCACCATTCACCTAGCATTTCCGTTGCCAAGGCAGGAATACAGGGCATGAC +CTTCATAATGAAGCCAGTGCAAGGCAGAATTCCTGACCCCCAGGGGCTGATCATGTGAAATGGACAGACT +GTAGGCCCCGCCGGCCAGGGAAAAGCAGAGGAGGAACATGGGCTGCTCAGGAGAGCAGGCGACTGCCTGA +TCCTGGACTGGGTGATCTGTCACCTGCGTGGAGGTGTTCTGAGCAAGGGCAGGCGTCAGGGAAGAGGTGA +GGCTGGAGGACTTTGAAAGGAGGGAAGAGGGAGTTTGCTGGGCAGAGAGGGAGTGGTGGTTCTAGGCATT +TGGGGCAGAAGGAAATAGTAGAAGTTGCTGAGGGACGGAAAGGTCAAGAGGAGATAAGACTGGCAGGGAC +GTAGAGGAAAGGAGGGACTGAGATGTGGGCAGAGCCAGAGGGCAGGAGACACCTTAAGTTGGTGCATACA +ATTTTTATTAAAAATGAAGGGAGGCCAGGGAAAGAGGCTCATTCCTGTAATCCCAGCACTTTGGGAGGCT 
+GATTGCTTGAGCTCAGGGATTCGAGACCAGCTTGGGTAACATGTGGAAACCCTGTCTCTACAAAAAGTAC +AAAAATTAGCCAGGTATGGTGGCATGTGCCTATGGTCCCAGCTACTTGGGAGGCTGGAGGCTGAGGCATG +AGGATCACTTGAGCCCAGGAGTTGGAGGCTGCAGTGACCTGTAATCACACTACTGCCTTCCAGCATGGGT +GACAGAGCGAGACTCTGTTTCAAAAAAAAAAAAAAAAGAAAAAAGAAAAAAGAAAAAAAAAGAAAGAAAA +CAAAAAAATAATTTCAAGAAGTGAGAAGTGATAAGTACTGTGAGGAAAACAGAACAAGATGATATGTTAC +AGTGGGACCCAGGTAGGGGGATGCTGTAAGTCAGGTGGGCTGAAACAACCCCCCTGAAGAGGGTCCCAGA +TGCCTGTCCGAGCCGAGACCTAGGAAAAGGATCAGACACAGGATCGGGACCAGCTTTCCAGGCACAGGGA +ACAGTAGATACGAGGGCTCTTGTGTAGGAATGAGCTTGGACTGGGCACGTCTGAGGAAGAGGGAGGTCAG +CACGGCTTTGCCTCATGCTTCAGAAGATCCTGGTGGGACAGATTGGAGAAAAGGTCTTGGGGAAGAAAGT +GACTTGTCCAGGACTGCTAGGCTGGTACAGGAAGAGCAGGGAGTGGCTCATGGACACTTGGCCCCGAGTT +TCAGCATGTAAGTGGAGAAGGCAGACTTTGGGTGAGATGTTATGAAATGCATTGGCACAGTGAGACCAGG +ACAGGTATGGCTGCTTTTTTTCTGACACCTGGAAATGAGAGAATTTGGTAGTGAGCAGATGTGAGAGTCG +TGTTGGGAGTGGGAGTGGTTTGGGACCTCTGTCTGAAGTGGGCAGAAAAGCCAGCAGGCACAGCTATGGG +CGCCAACATGGAGCCCGGATACCACTGTGAACCTGCATGCATCAGCTCTTGCCAGCCTCTCCTGCAGAAC +TTCCACCCACAAGCAGCAAGTGTGCTTACCCTGTTTCCCTCTTCTAAATAATTTATCGTTTTAAAACACC +CAAGAAAAAAAATACCCATCCGGAAGTTTGTTTTCTGTGAGAAAGGAAGGTTTTTTATTTTTTTTGAGTT +GGGGTCTCAGTCTGTTGCCTAGGCTGGAGTGTGGTGGCATGATCATAGCTCACTGCAGCCTCAACTCCTG +GGCTCAAGTGATCCTCCTGCCTTGGCCGTCTGAGTTGCTGGGATTAAAAGCATGAGCCACCGTGCCTGGC +AAGAACAGATTTTTTGTTTTTTTTTTTGAGACAGAGTCTCGCTCTGTCACTCAGGCTGGAGTGCAGTGGC +GCGATCTTGGCTCACTGCAACCTCTGCCTCCCAGATTCAAGTGATTCTTCTGCCTCAGCCTCCCAAGTAG +CTGGGATTACAGGTGTGCCAGCATGCCTGGCTATTTTTTTTTTTTTTTCGTATTTTTAGTAGAGATGGGG +TTTTACCATGTTGGTTAGGCTGGTCTCGAACTCCTGACCTCAAGTGATCCACCCGCCTTGGCCTCCCAAA +GTGTTGGGATTACAGGCGTGAGCCACCGCGCCCAGCCTCACTGTGCTTTTTTTCCATAAGTCTTTATTGT +TTCTGTTAAAGTTATTCCTAGGAATTTAATACACTTTTATTGCTAATGTAAATGAGCACTTTTCTACTTT +ACTTTCCAACTATTTATTAGCAGTTTATAGGAAAGCCATTGATTTTTACATGTTTATTATTTAGCCACTC +TATTAAATTCAATTATGAATTAAAATACTTTTTCATTTAAAATTAAAAAATTTTCTCTAACTTTTTTCCC +TTTTGATAATTATACATTTTATGTTTTTCCTTATCTTCCTATATTGATTAGAATTTCCAGGCTGGGCACA 
+GTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCAAGGCAGGCGGATCACAAGGTCAGGAGATCGAG +ACCATCCTGGCTACGGTGAAACCCCATCTCTACCAAAAGTACAAAAAATTAGCTGGGCGTGGTGGTGGGC +GCCTGTAGTCCCAGCTACTCAGGAGGCTGAGGCAGGAGAATGGCGTGAACCCGGGAGGCGGAACTTGCAG +TGAGCCGAGATTGAGCCACTGCACTCCAGCCTGGGCGACAGAGCCTGACTCCATCTCAAAAAAACAAACA +AACAAAAAAGAACTTCCAGAATCATGTTATTATATATATTGGGAATTCATGTTTTGTTTATGATGTCAAT +AGGAATGCCTATAGTATTTCACTGTTAAGAATAATTTGGCTGGTGATGCATGATGGCTCAAGCCTGTAAT +CCCAGCACTTTGTGAGGCTAAGGCAGGCAGATCGGTTGAGTCCAGGAGTTTGAGACCAGCTTGGGCAACA +AAGTGAGACCTTGTCTCTCCAAAAAAAAAAATACTAGCCTGGTATGGTGGTGCATGCCTGTGGTCCTGTC +GTCCCAGCTACATAAGAGGTTACAGCAAAAGGATCACTTGAGCCCAGGACTTGAGGCTGCAGTGACCCAT +GTTCGTACCAGTGCACTCCGGCCTGGGTGACAGGCGACCAAGCAAGACTCCATCTCAAAAAAAAAAGAAT +AATTTGGCTCTTGGTTTATGGTGTCCTAAAAGTCTTTTTAATTGTGGAAAATAAAATGTACAAAAAAGTG +CATAAAACTTCAACATCCATGTTAACAAATTATTATAAAGTACATATGAATGTCTCGCACTAAGTGTGGT +GCTTGCTGAGGTTTTTGATAGACAATCTTTATCAGATAAAGGCAGTTCTCTTAGTTTGCTAAGAGTTTTT +AAAAAATCATGAATGAGCGTTGATTCCTATCACTACTTCTCTCTGCACCTAAGATGATCACAAGATTTTT +CTCCTTGACTCTGTTAATATGGCATATTACCCTTATTGATTTTCTTTTTTCCTTTTTTTTTTTGAGTCAG +TCTTGCTCTGTCTCCAGGCTGGAGTGCAGTGGTGTGATCTTGGCTCACTGCAACCTCCGCCTCCTGGGTT +TAAGCGATTCTCCTGCCTCAGCCTCCCGAGCAGCTGGGACTACAGGCGCCCGCCACCACGCCCAGCTAAT +TTTTTGAATTTTTAGTAGAGACGGGGTTTCACCATGTTGGCCAGGATTGTCTCGATCTCTTGGCCTCGTG +ATCTGCCTGACTCGGACTCCCAAAGTGCTGGGATTGCAGGTGTAAACCACCGTGCCTGGCCTCCTTATTG +ATTTTCCTTTTTTTTTTTGAGACGGAGTCTTGCTCTGTCGCCCAGGCTGGAGTGCAGTGGCGCGATCTCG +GCTCACTGCAAGCTCCGCCCCCCCGGGTTCACGTCATTCTCCTGCCTCAGCCTCCGAGTAGCTGGGACTA +CAGGCGCCCGCCACCACGCCTGGCTAATTTTTTGTATTTTTAGTAGAGACAGGGTTTCACCGTGTTAGCC +AGGATGGTCTCGATCTCCTCACCTCCTGATCCGCCCGCCTTGGCCTCCCAAAATGCTGGGATTACCGGTG +TAAGCCACCGCGTCCAGCCTCCTTATTGATTTTTTAATGTTAAAAACCATTTTGGGTGTTTGGCATAAAC +CCAACTTGGTGATAATGTTTTATCCTGTTTATGTATCACCAGATTCCGTATGCTGATCTTTTGTTTCAGG +TTTTTGCATTTATGTTAATGAGTGATATTTGCCTACAATACTTTCTTGCTTTGCCCTTGTCAGGTTTTGG +CATCAAGATTATGTTAATCTCGTGAAATGGATTTGGGATTTCATCTTCTTTTTCTATGCTCTGGAAAATT 
+ATCTATAAAATAGACATTATTTCTTCTTTACGTGTTTGGTAAATATTTCCATTGAAGCTGTATGGGTCTG +GAGTTTTATTCTGGCGGAAGTTTTAAACAAGGGATTTGATTTATTTAAATATAATAGCTGTGGAATGATT +CAGGTTTTCTATTTATTTTGTGCTGATTTTAGCATATTGTATTTTTTTTCTTTTCTTTCTTTTTCTTTTT +TCTTTTTTTTTTTTTTGAGATGGAGTCTTGTTCTGTTGTTCAGCCTGGAGTGCAGTGATGTGATCTCGGC +TCACTGAAACTTCCACCTCCTGGGTTCAAGCGATTCTCCTGTCTCAGTCTCCCAAGTGGCTGGGGCTACA +GGTGTGTGCCACCACGCCAGGCTAATTTTTGTATTTTTAGTAGAGATGGGGTTTCACCATGTTGGCCAGG +CTGGTCTTGAACTCCTGACTTCAGGTGATCCACCTGCCTCGGCCTCTCACAGTGCCTGGCCTGTATAATT +TTTTTCTAGGAATTTGTTTATTCCATCTCAAATTGTAAATTCATTGGCATAACATTGTTTAAATATCCTT +TTATTATTATATCCTTTTATCTTATTCATGGTTTATTAACTTTTTAATATGTGCAGGATATACAGTCATA +TACCACTTTTAATTTTTTTTTTTTTTTCTGAGATGGAGTCTCACTCTGTTGCTCGGGCTGGAATGCAGTG +GCACGATCTTGGCTCACTGCAACCTCCGCCTCCCAGGTTCAGACGATTCTCCCACCTCAGCCTCCCAAGT +AGCTGGGATTACAGGTGCACTCCACCACACCTGGTTAATTTTTGTATTTTTAGTAGAGACAAGATTTCAC +CATGTTGCCTAGGCTGGTCTCAAACTCCTGACTTCAGATGATCCACCCAATGTGGCCTCCCAAAGTGCTG +GAATTACAAGTGTGAATACTGTGCCTGACCCTCTTTTGGTTTTAATTTGCATTTTTCTGGTGATCAGAGA +TGTTGAGCATTTAAAAATAGATCTGTTGCCCATTTATAGCTCCTCTTTTGAGAAATATCTGGTCAGATTC +CTTGCCCAGTTTTTAATTAGATTATTTGTTTTCTTGTTACTGATTTGTTTGAGTTCCTTATGTATTTTGG +ATATCAGCTCTTTGTCAGATTTATGGTTTGCAAATATTTTCTCTCACTCTGTAGGTTGTCGCTTCATTCT +CTTGGTTGTTTCCTTTGCTGTGCAGAAGCTCTTTGGTTTTGTGAGTCCCATTTGTCTAGTTTTGCTTTTG +TTGCCTGTGCTTTTGGCGTCCTCTCCAAGACATCATCGCCAAGACCAATGTTGTGGAGCTTTTCCATTAG +GTTTTCTTCTGAGAGATTTAAGTTTCAGGTCTTACATTTACATCTTCTCTATTGTGAGTTGATTTTTGTA +TATGTTGTGAGATAAAGGTCCGGCTTCATTCTTCTGGACGTGGATGTCGAGTTTTCCCAGCACTGTTTAT +TGAAGAGACTGTCCTTTCCCCATTGTGTGTTCTTGTCACAGGGCCCGCTTCTTTCCCCAGGACCCGGGCT +GGAGTGGAAGGAGGGCTGTGCCTTCGCATGGATACCCTGCTCTCCAATCCAGGCTGCCGTGGTCCTCACC +CCTTCCCAGGGTCTCTCTGACACTGAGGCCCTCTGTCCGCGCCGTTGGTTGTTGTGCTTGAACTGCTGGT +TTTTGTTTTGTTTTGTTTTGTTTTTGAGATGGAGTTTTGCTCTTGTTGCCCAGGCTGGAGTGCTATGGCG +CCATCTTGGCTCATTGCAACCTCTGCCTCCCGGGTTCAAGCGATTCTCTTGCCTCAGCCTCCCGAGTAGC +TGGGATTACAGGCATGTGCCATCATGCCAGGCTAATTTTTCTATGTTTAGTAGAGATGGCGTTTCTCCAT 
+GTTGGTCAGGCTGATCTCGAACTCCCAACCTCATGTTGGGAATTTATATATATATGTTCTTTTTATTCTT +TCTCTCATATTCTCACCTTTGAAAGATTCCGGATACAGGGAAATATTTCTTTACTTAAAAAAAAAAAAAA +GGCCAGGCACAGTTGCTCACACCTATAATCGCAGCACTTTGGGATATATATATATATATATTTTTTTTTT +CCTTTAAAATGTTTTTTATTTTTAGAGTCAGTGGGTACCTGTGCAGGTTTGTTACATGGATATATTGCAT +GATGCTGAGGTTTGAGCCTCAATCAAACCTGTCACCCAGATAGTGAACATGGTACCCAATTGGTAGTTTT +TTCAGTGCTTGCTCCCTTCCCTCCTCTGTTTTGTTCTTGCCCCTTTCCCTCCTCTGCCTGTTGTTCCCAT +CTTTATGTCCATGCATAACCAATGTTTAGCTCCCACTTATAAGTGAGAACATGCGGTATTTGGTTTTCTG +TTTCTGCGTTAATTCACTTAGGATAATGGCCTCCAGCCACATCCATGTTGCTGCAAAGGACATGATTTTG +AAGGAAAACATGTATTTTAAGAAAATACAGGCCAGGCGTTGTGGGGCCCGCCTGTAATCCCAACACTTTG +TGAGGCTGAGGTGGGTGGATCGCTTGAGCCCGGGAGTTTGAGACCAGCTTGGACAACATGGCAAAATCCT +GTTTCTACTAAAACTACAAAAATTAGCCAGGCGTGGAGGCATGTGCCTGTAGTCCCAGCTACTTGGGAGG +CTGAAGTGGGAGGATCAATTAAGCCTGGGAGGTGAAGGCTTCAGTGAGCCATGATTGAGCCATGATTGTG +CCACTGTGTTCCAGCCTGGGCGACAGAGTGAGAGCCTGTTTCAAAAGAAAAAAAGAGAAAAAGAAAAAAA +AAAGTAAATACTGCCTTACTGTCTATAATACCTGCTTTTCTACAAGGTGATAGTAATCAGGAGAATGTAG +GATTGGCAGAAAGGTGGACAAATAGATCAATGGAACAGAATGGGAAATCCACAAATTTGCAATTTGACAA +AGGGGTCAAAGCAATTCATTAAAGAAAGTGGGATCTGTGTTGTATGGATCCTATAGCACATGGAGACCAG +TTGCCGTTGCTGTTTTGATCAACTTCAAAGGCATTCATCCAGGGTGGTTTATTCCAGAACCACTAGAAAT +CATTAGGACCACCAGGACAGCCTCTCCACAGCCATCTGATGTGGCTGCCTCAGCTCGACCTCATGAGGGT +CATCAGTGCTAAGAGTCAAGGTAAGACCCTGTGGAAAGGGGTCTCCTGGTATTTCCTCCCCACCACACTG +GACTAACACAAAGATGGCTGCAAAAGGACATTCCTTGAAATATACAGATTACCTTGGCCGTAGGGTGGAG +AGAGGGGTGAACTGCAGGAAGTTATTATTGCTTATGCCAATATTTTTATTTTTGGGGGGATCCGTTGTTG +TGCTCTTCAGTGATTTTCCTGGTGGCTTTGACCCAGTGGAAAGCTGAAGGACAAAAGTTTTGTCTGCGGA +AGGTGTATTTCTTCCTTGACTGTTATTTAACCTGTGAAACTTGGATTTGGGAGCTATTCCCTAAACAGAA +AGCTGTGGCCCTGCCCTTGGGGTGGGAGGTGACAGCACATGAATTTGTGTTTTCCAGGCTACTCCATTGT +CACTCAAATCACCGGTCCAACAACAGTGAATGGCTTGGAGCGGGGCTCCTTGACCGTGCAGTGTGTTTAC +AGATCAGGCTGGGAGACCTACTTGAAGTGGTGGTGTCGAGGAGCTATTTGGCGTGACTGCAAGATCCTTG +TTAAAACCAGTGGGTCAGAGCAGGAGGTGAAGAGGGACCGGGTGTCCATCAAGGACAATCAGAAAAACCG 
+CACGTTCACTGTGACCATGGAGGATCTCATGAAAACTGATGCTGACACTTACTGGTGTGGAATTGAGAAA +ACTGGAAATGACCTTGGGGTCACAGTTCAAGTGACCATTGACCCAGGTAAGAGGGAGTGTATATATGTGT +GTGTCTCTCAGGGCCTGCTCTGTCCTGGTCTCTGAGGTCCTACTCAAGTGATTTAATTGTCACTGAGTGA +TCTATCACTTGAGTCCCGAGTCTCATAGAACCCTGACTGACCACCTGGGACTGGGGGAGCAGGGCCTCTC +TTCAATGCCCCCATGGCTCCCAGGGCTCCCTCCACGACGGGATTAAGCCTTTCTAGGCACATTTTTTACC +TCTGCACAGCTCAGTGCCTGAGGACAAGGTGATGGTCCCAGTTTCAGCCCATGGGCCAAAGGGATGCCTT +CCCATGGTACCCAGAGACCCCAGTCCTTGCCATTGCTTTTTGGAGACCTCTGGTGCCCAGGTGTGTCCTG +CTTTTGGTGGAGTTCTGGGCTGGGGCTTCATAGCTGCTGTTCCCATCTCCCATTCCATATCTAACCTGGG +GGAAAAAGAGGCTCAGCATAGTCGGGGTGCTGGGGTCTAATTCCCAAGTCCAGAAAGGTCTTCACAGTCA +AACGGCTGCATTCCTGGTCCCTGACATCCCACCATGAGCCATTTGTGTGTGTGGACTTAATGATATTTCT +TCCTATGTCCTTCTAGAATGCAGACACTCCTGGGAGCGTTAGCTCAGGGGTCCCAGTGCCTCGGTTTGCA +TGGGTTTATTTGTTGGGCTTGTTCTGCTGTCCCTCAGAAGATAAGCAGGATGGCTCGAGGGCCCTTGTGG +CATGGAAGGAGCACCACAGTCCTTGGGGCCCCAGGTTGCACCCGGAGCCTCTGGGGCATATTCTAGTTTG +CAAAGTGCTGAACGTGCATTGTCTCAATGGGTCCTCACAATACCCCTGAGAAGGGGGCATCCCCATCATT +GAAGTTTTGTAACTTGCCCAAAGTCAAAGTTCTTCCTGAATATGATGATTTCATTCAGGGCTGGTCCTGA +CCTTTGATGTCACAGATCTAGGGTCACTGTGCAAGTTCCCACTCTGCTCCTGTTTTTGCTGGATAAATAG +GGCTGGGGGTGGAGATTGACCAAGGCACTGTCATAAGCAGCAGGTTGCTCAAGTTCAGCTGGTCTCCAGC +TGCCCTTAGGCTCTTCCCAGCCCCCTCTAACCCTCCCAGGGCCAGCTCAGCGTCCTCCCTGGACCCTGCC +CAGCCCCACCTGCCTATGAGGCTCTGGAGTCTCCTAGAGACCAGGGAGACATGAGTGGTCCAGGCTGCAG +CCCCTTCCCATAGTAGCAGTCCAGACCCCCGCCCAGGCTCAGAGCTGTGTGCAGGGCTGCGGGTGGGCTC +ATGGGCAGGGGCATCTGTGATCTGGTTTATACACCAAGCATCAACTACAGCGTCGACTCCTGCCCCCACC +ACGCCTACCTCCACTACGTTTACAGCACCAGTCACCCAAGAAGAAACTAGCAGCTCCCCAACTCTGACCG +GCCACCACTTGGACAACAGGTAAGCCAGCTCTGGTTCCACTGTGGCCTACTTGGCCCAAACTCCTCCAAT +GGAGAACTCTTTGAAGTCCCATTTTCCAGCCTGAGCTTTGTCTGGGACCTGCATGCTCCTTGGTGAGGAT +GAGGAAGCTGGGAGCCCTTCCCACTTGCTGTCAGGCCACATGGTGTCCTTTCAGTCTCCACAGATCCTTC +ATTCTGCAGCTCAGTGCTGTTTGTAGGTTCATTCATTCGTTCGCCCCACTTTTACTGTGTTATAATAACT +TCTACATGGCTGCACCGTGCTGAGCTCTGGGGACACAGACATGGACTGTCCCCGTGGAGCTCAGTGTGGG 
+AGACCGATGAGTTGGCAGTCAGTGAGGACAGTGTGTCTCATGTGAGTGTGGGCGTGGGGTCATGGAGCCG +CAGGGATGACTCTTGGTTCAACTTCTGGGAGGACAAGGAAAGGAGGGCTTGGGGAAGTCCTGTTTCCCTT +AATTTTCCTCTGATTACTATTATGGCAAGAACAGTGATGCCTATAGTTTCTCACATCTCTAGAGCAACTG +ACAGCTCACAAAAGGCTTTCCTGTGCATTATTTTGTTTTCTTTGACCTTTCCTACAACCACCCCATGTGG +AAGAATGGGCAGATACAAATGTTCCCATTTTACAGAAGAAGAAACTGAGCCTCAGTCCCTTCCTCAGTGA +AGGGTGGTGCTGGAGCTTGGATGGAAGTCTCTGGGCTCTACACCATAGCTCAGGTCATGCCTCTTAGTGC +CCTGCCCAACCATCTGCAGTCAGCTGGGCTTTCTAGAAGAGGAACGTCCCCTTTAATTTCTTCCTTTCAC +TCTCCTTGGCCCTCTTGACTTTGTTTTTAGATCTGCCAAGTGCCAAGTTCTCCTTTTGCTGGGACCTTGC +CACAAAATCCCTATGGGAGGTCCCCGGTATTCTAACCACCTGAGACCCTTCTACTCCCCCTCTCCCATTT +GCCTTCTGTTTTTTTTTTAAATTAAACTTTTTAAAGTTTTCTTTTTTTTTATTTTTAGACAGTCTTGCTC +TGTAGCTCAGACTTGAGTGCATGGTGCCATCTTGGCTCACTGCAACCTCTGCCTCTCATGTTCAAGCAAT +TCTCATGCCTCAGTCTCCTGAGTAGCTGGGGGAACTATAGTCGCCCACCACCACACCCAGCTAATTGTTT +TTTTTTTTTTTTTTTTGACGGATTCTCACTCTGTTGCCCAGGTTGGAGTGCAGTAGTATAATCTCAGCTC +ACTGCAACTTCCACCTCCCAGGTTCATGTGATTCTCATACCTCAGCTTCTCGAGTAGCTGCAACTACAGG +CGTAAGCCACCATGCCCGGCCAATTTTTAGTAGCGATGGGGTTTTGTTATGTTGGCCAAGCTGGTCTCCA +ACTCCTGTCCTCAAGTGATTTGCCCACCTCAGCCTCCCAAAGTGCTGAGATTAAAAGCATAAGCCACCTT +GCTTTGTCCCTATTTTTCTTTTTCCTAAGCAAATCAATACAGGATACCCTCTGATTTGAAAGTTTATTTT +CTACCAAACTGCTTTGAATAAGTAATAGTTAAGGTTATTTATTTATTTATTATTTTATGAATCGAAGCCT +TATGTAAGTGCCCGATTAGGGCCTTTGATTAGCATGAGATGGTTTGTTCACTCATTTATTTAACACATAA +CTCTTGAGCACTCACTCTAGCCTGTCACTGTTTAAGGGAATTAAAAAGGGTACAAGAATGGACCAAATAA +TCAAAATGCCTGCCCTCGAGGTACTTACAATCTAGAGCTAATTAGTGTAACCAAATAGGGTTGACACAAT +TAATTGACTGATGAATTTCTTTCTCAAATTTCTGCACATCCAGTGATGAAGGAGATAATCCTTATATTAG +TTTGCTTGGGTTGCCCCTAACAAAATACCTCAGAATGGGGGCTTAAATCACAAAAATTTATTTTCTCACA +ATTCTGGAGGCTAAAAGTCTGAGATCAAGTGGTCCACAAGGTTGGTTTCTTCTGAGTCTTCTCAGCTTGT +AGATGGCTATCTTTTTCCTATGTGTTCACGTGGTCGTCCCTCTGCGTGTGTCTGTGTCCTGATCTCTTCT +AATGACACCAGTCATAATGGATTAGGGCCTGCCCCAGTGACCTCACTTAACCTCAATCACCTCTTTAAAG +ATCCTATCTTCAAATACAATCACATTCTGAGATACTAAAGGGTGGGACTTCAACATATGGATTGGGGTGG 
+GGGGGTGGTGGTGTACAATCCAGCCCAGCCCACAACAGTTTTTTTTGTTGTTTGTTTGTTTTGAGACGGA +GTGTTGCTCTGTCACCCATGCTGGAGTGCAGTGGCAGGATCTCGGCTCACTGCAACCTACGCCTCCCAGG +TTCAAGCGATTCTCCTGCCTCAGCCTCCCAAGCAGCTGGGAGTAGCCAGGATTACAGGAATGCACCACCA +CACCCGGCTGTTATTTTGTATTTTTAGTAGAGATGGGGTTTTGCCATGTTGGCCAGGCTGGTCTCGAACT +CCCGACCTCATGTGATCAGCCTGCCTCGGCCTCCCAAAGTGCTGGGATTACAGGTGTAAGCCACCGCGCT +CAGTCGAGTGATTCTTGAGCAAAGACTGGAAGAGGTGAGAAAGTGAGCCACACAGAGCTCTGGAGGGAGC +ATGTTCCTGGAGAAAAATGCAACTGGAATAAGGTCCGTAATGTGGGGATGTGCCTGGAATATTTGAGGAG +CAGCAGGAAGTGAGGCTGGAGTGAAGCAGGCAGGGGTGAGTCGTAGGCAGGAGTGGAGGCCATTGTCAGC +CCTGGCCATTTTCATGGGCTGAAATGGGGCTGTTACGGGATTTATTTATTTATTTATTTTATTTTGATTG +TTTGTTTGTTTGTTTATTTATTTATTTATTTATTTATTTATTGAGACAGGGTCTTGCTCTGTCGACCAGG +CTGGAGTGCAGTGGCGCGATCTCAGCTCACTGCAACCTCCGCCTCCTGGGTTCAAGCAATTCTCCCACCT +CAGCTTCCCAAGTAGCTGGGATTACAGACACACGCCACCCCACCCAGCTAATTTTTGTATTTTTTGGTAG +AGACGGAGTTTCACCATGTTGGTCAGGCTGGTCTTGAACTCCTGACCTCAAGTGACCTGCCTACTTCAGC +CTCCCAAGTTGCTGGGATTACAGGTATGAGCCACTGTGCCTGGCCTGTTGCAGAATTTTGAGAAAGAGAG +GGGTGTGCTTTGACGTTTGGTTCACAGGCCCCCCGGCTGCTGTGGCAATGATCCACTACAGTAGGGCCTG +TCAGTGTCCTGTGGCCGCCGTTACGAGTGGCTCAAAGCAAAAGAAATACGTTCTCTCACAGCTAAGGAGG +CCAGGAATTGGAAATCAAGGTGTCACCAGGGCTGTTGCCACCCTGGGGGCTCAAAGGGAGAGTCTGTCTC +ATGCCTCTGTTCCAGCTGCTGGTGGGCGCCTGCGATCCTTTGTGTTCCCGGTCTTGTGGCGGCATCACTC +TGATCCCTGTCTCTGTCGTCTCATGGTCTTCTTCCTGTGTGTGTTTATGTGTCCAAATTTCCCTCTTGTA +AGGACACCAGTCGCTGGATCAGGGCCCATGCTAATCAACTATGACCTCATCTTAACTTGATTACATCCAC +AAATAGGTCACATTCACGGGTCTGAGTGGATGTGAATTCGAGGGGAAGAGGACGCTATTCAACCCAGTAG +AGAGGGCAGGTGTAGAGGCAGGAGGTGTTTTAGGAGTCTGCTGCCCTACAATTTCTGTTAGGCTTCTAGA +CACATTAAAACCACCTCATGGAACCCTTTTCTCCTTTTAAAAACTATCTTTTGTTAAGGCCGGGCGCTGT +GGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCTGAGGTGGGCGGATCATGAGGTTAGGAGTTCGAGAC +CAGCCTGGCTTACATGGCAAAATCCTGTCTCTACTAAAAATACAAAAATTAGCTAGGTGTGGTGGTGGGT +GCCTGTAATCCCCGCTACTTGGGAGGCTGAGGCAGAAGAATCTCTTGAACCTGGGAGGTGGAGGTTGCAG +TGAGCCGAGATCATGCCACTGTACTCCAGCCTGGGTGACAGAGCAAGACTCTGTGTCAAAAAAATGAAAA 
+ACAAAGAAAACCAAAAACAACTATCTTTTGTTAAAATTCTTAAATTTTAAAAACTTTTATTTTAGCTTCA +GGGGTACACGTACAGATTTGTTATATCGGTAAATTGCACGTCACCGGGGTTTTGTGTACAGATTATTTTG +TCACCCAGGTAATAGGTATAGTACCCAATAGGTAGTTTGTAGTTTTTTGATTCTTTGCTTCCTCCCACCC +TCCATCCTCAAGTAGGCCCTGGTGTCTCTTGTTCCATTCCCTTCTTTGTGTCTATGTGTACTCATTGTAA +ACCCATCTTTGAGGATCCCCAGGAGTGAGGGTTCCCAGGCTGCAAACTACAGACCTGCAGCTCATCAGGT +GTGTGGCCCTCCCCTGCTGCTGGGTGGGTGGGATTGGGCTGAGAGGCAGGTGGTGAGCCTGGGATGCAGG +GGACGCTTGCCATTGTTTCTAGGCACAAGCTCCTGAAGCTCAGTGTCCTCCTGCCCCTCATCTTCACCAT +ATTGCTGCTGCTTTTGGTGGCCGCCTCACTCTTGGCTTGGAGGATGATGAAGTACCAGCAGAAAGGTGAG +AGGACCTGGGTGAGGCTGGGCTGAGACTGGGCCGGGTGGCTGGGGATGTGGCATCTGGGAGCTATCTGGT +CCAAGGGATTACCCACACAATCAGGAAATGGCACAGTCCCAAAGGCCCTCAGCAGCTGGCAGTGCCAGGG +GACCGGGGGGCGCCCTGGAGCTTGCAGGGGAGCCCCAGGAGGTTTTGGCATTCGCCGCCTGCCCTGCACC +TCCGTCCTTCCCCTTCTCTTCTCTCTGATCCACCTCCCTGTCTCTTTCCTGGATTGACTCTTGATGGCAT +AGGTGGCAATGGTCTTAGGTGGGTGGTCCAGTCTGGGGTGTCTAGACTAGACGCTAGAGTTGGGTCTATG +TGGTTGTCAAGGAGGTACTGAGCTGACATGGAATGGGGAGGGAGAAACAGACACAAACACACCAGTGGAT +GCACACACACATGCGCGATCATGGGCACACACACTGCATGGATCCCACAGTGCATGTGCACTCATCACAT +TACACAAGCATGCACACACACACACAGCCCTCCAAGTCTGATCAGGAAATTGGAATGGCCTTTCCTTCTG +TCCCTCCCTTGCCCACTGGACTTTCCCAGGCTGACCTACTTGCCCTTTGTGAACTAGCTCTTCAGGGAGG +AGCCTCTTGGCGCAGGAGTCTACACTCACAGATCATGAGGCCTGGCCTCAACAAAACCCCAGTTGTGTGA +GAATGTGGCTTCACTTCCACATTCTCACACAATTGTACCTCTTTCTCTTCTAGAACAGCTGGGTCCTAAC +TCTCACAGGGTTCCCTCCAACACTTCTCCCTCCCCACAGCCTGGGAAATCTTTCGCCTGGGAAGAGGGGA +GCCTGACTCTCTGTTAGTTTGTTTTCTGTGATGCTTTTTCAAAATTAGAGCCTGACTTTTATTCATCGCT +TTTTCTCCCAAATGTATTGCCCACGGCCTAAGGTCTAAGTGCTTTCTGTGTTCTCCAGCATCTAGACTTT +TGAAACCAGGGGAAAACCCCCCATTTCTTTGTTGTCCCTAGGGCAGGGATGTGGGGCTGGTGGCTGGGGA +GGGGAGAAAGAAGACAGGGACCAGCAACCACCTGAAAATCGGTCCCAGTGGAAAAACAATCCAATCCAAT +CTCTGGGTGTCGTGGCTCATGCCTATAATCTCCGCACTTAGGGAGGTTGAGGCGGGTGGATCACTTGAGG +TCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCTTCTCTACTAAAAATGCAAAAAAATTAGC +TGGATATGGTGGCACACATCTGTAGTTCCAGCTACTCAGGAGGCTAAGGCAGGAGAATCGCTTGAACCCA 
+GGAGGTGGAGGTTGCAGTGAGCCAAGATGGTGCCATTGCCCTCCGGCATGGGTGACAAGAGAGAAACTCC +ATCTCAAAACCAAACCAAACCAAACCAAACCAAACCAAACCAAACCAAACCAAACCAAACAAAATCGGTC +CCTACTGTGCCGACCCAGCTTGCACATTCTACTCGGGGTTGGGGGACAGCTTGGTGCATTCCTGTCTCTG +CAGACAACTTTTACTGATGGAGCACCACCTGCTCAGCACAGCCAAGCACCCAGGTTGTAAGGAAGGGACA +GGCCCAGCCACCCTGCAGCTTACAGCCCACCTGAGGACTAACTAGAATTCAGTAGGACGAAGATGATGAG +ACAGACATAGAGCAAGTGCCCCCCGGGGGCCCGGGAGGGGGATTAGACAGCCTGGGTAGCTGCAGAAGGC +AGGCATGACTTTGAATTGACCCCTGAAGGATGAATCAGGTAGAGAGACATGGTGGAATTCCAGGCCGAGT +TGGGATTCTGCAGAAGCAAACTTACTCCCAGGGGAGCCGATGGCTGGCAGGGCCACCCGATCAGCTGAGT +GCACAGTTCTATGATGTGGGGAGCAGGGGGCTGGGGTGTCCCTCTTTTCCTGGTGGAGAAGTGAAAGGTC +AGAGAGGTGACATGACGAATGTCACAGACTGTGTCATCTGAGGAATCAGAATTAGAACCAGGTCCCTGTC +TCTGAGCCTGGACCCAATTCACCTCTTTTCTTCCATATTTCTCTTCCCTAGACCTGGGGGTTCTTGTGAG +TCTGGGGAATGGGCTTTCTAATTCTGAATTGTTGTTTGTCTTTTAGCAGCCGGGATGTCCCCAGAGCAGG +TAAGATAGCCCCCAAGGTCGGACCAGAGAGACCAGGCAGCTTCTTTCCCCTTCCTCATTTTCCATCTCTC +AGAAGGTCCAAAATATCAGAGAAGTCCTGTGCCAAAGTATCCTGTCCTCATGGCACCCTGCAGGGCCTGG +GGGCTGAGGGCCTGGAAACAGCCAGGGGGACAAGGGCTGAGGGGTACAGGAGCCTGAGAAGTGGAGGACA +CGGAGGGAAATGGGAGAGGGGCACCTGGGGAAGTGGAGAGGCTCTCGTCCCGTCCCACCGTGGAAAAGGA +ACCTGTGTCCACAGACAGATCTTGTGACTCCCCAGCCTGTGTTCCCTGTCCCAGGTACTGCAGCCCCTGG +AGGGCGACCTCTGCTATGCAGACCTGACCCTGCAGCTGGCCGGAACCTCCCCGCAAAAGGCTACCACGAA +GCTTTCCTCTGCCCAGGTTGACCAGGTGGAAGTGGAATATGTCACCATGGTGCGTCCTCCATGGGGGCTG +CTGTGAGGCTGGGGCAGGGGGACACAGGCCTGCCGTTGCACCCCGTTGGAGGGTGGGCTTTCTCCTGCTC +ATGTGGGCATAGAGGAGGCAGGTGTGGGGCAAAGCCATAGTCACTGCAGGACTCGCCTAGGAGCTCCCAC +AGGCAAGTCTCAGTCCTGGGGACACAGGGACTGGCAAACACAGTCCCCACGTGGGCAGCAGAGTCCCAGA +CAGGCTGAGTGACTTGGGATTCCACAGGAAATGTGCTGCACTGGTGCAGGGTTGGGGGAGAATCTGTTCT +GTTGTGGTTGAAGCTCAGGGTGCTTGGCAGAGTGAAGGGGGAAGCTGGAGGGCTTGAGTGGGATTAGAAG +CCCTGAATGGTCCATCCCCACTATCCTCCTCCGAGGGCTCCTCTGCCCCGTGACCGTCACTTCCTGCCGT +CCTCTCTGACCTGGAGCCCGCCTTTGCTGCAGGCTTCCTTGCCGAAGGAGGACATTTCCTATGCATCTCT +GACCTTGGGTGCTGAGGATCAGGAACCGACCTACTGCAACATGGGCCACCTCAGTAGCCACCTCCCCGGC 
+AGGGGCCCTGAGGAGCCCACGGAATACAGCACCATCAGCAGGCCTTAGCCTGCACTCCAGGCTCCTTCTT +GGACCCCAGGCTGTGAGCACACTCCTGCCTCATCGACCGTCTGCCCCCTGCTCCCCTCATCAGGACCAAC +CCGGGGACTGGTGCCTCTGCCTGATCAGCCAGCATTGCCCCTAGCTCTGGGTTGGGCTTGGGGCCAAGTC +TCAGGGGGCTTCTAGGAGTTGGGGTTTTCTAAACGTCCCCTCCTCTCCTACATAGTTGAGGAGGGGGCTA +GGGATATGCTCTGGGGCTTTCATGGGAATGATGAAGATGATAATGAGAAAAATGTTATCATTATTATCAT +GAAGTACCATTATCGTAATACAATGAACCTTTATTTATTGCCTACCACATGTTATGGGCTGAATAATGGC +CCCCAAAGATATCTGTGTCCTAATCCTCAGAACCTGTGACTGTTACCTTCTGTGGCAGAAAGGGACAGTG +CAGATGTATGTAAGTTAAGGACTTTGAGATAGAGAGGTTATTCTTGCTGATTCAGGTGGGCCCAAAATAT +CACCACAAGGGTCCTCATAAGAAAGAGGCCAGAAGGTCAAAGAGGTAGAGACAAAGTGATGATGGAAGTG +GACGTGGGTGTGACGTGAGCAGGGGCCATGAATGCCGCAGCCTTCAGATGCCAGAAAGGGAAAGGAATGG +ATTCCCCTGCCTGGAGCCTCCAAAAGAAACCAGCCCTGCCCACGCCTTGACTTGAGCCCATTGAAACTGA +TCTTGAGCTCCTGGCCTCCAGAATTGCAGGAGAATAAATTTGTGTTGTTTTTAATGA + diff --git a/test_my_tools.py b/test_my_tools.py new file mode 100644 index 0000000..18081df --- /dev/null +++ b/test_my_tools.py @@ -0,0 +1,112 @@ +import unittest +import pytest +import os + +from Bio import SeqIO +from biopython_fastq_filter import BiologicalSequence, NucleicAcidSequence, InvalidInputError, DNASequence, RNASequence, AminoAcidSequence, filter_fastq, send_telegram_message +from bio_files_processor import convert_multiline_fasta_to_oneline +import requests_mock +import requests + +def test_complement(): + inp = NucleicAcidSequence('ATGC') + target = 'TACG' + result = NucleicAcidSequence(inp).complement() + assert target == result + +def test_amino_acid_frequency(): + peptide = AminoAcidSequence('AAGG') + result = peptide.amino_acid_frequency() + target = {'A': 50.0, 'G': 50.0} + assert target == result + +def test_transcribe(): + dna = 'ATGC' + target = 'AUGC' + result = DNASequence(dna).transcribe() + assert target == result + +def test_check_nucleic_acid(): + input = 'ATGCU' + with pytest.raises(InvalidInputError): + DNASequence(input) + +def test_amino_acid_type(): + input = 'AAGGZZ' + with 
pytest.raises(InvalidInputError): + AminoAcidSequence(input).amino_acid_frequency() + +test_fasta_data = """>seq1 +ATGC +ATGC +ATGC +""" +expected_fasta_data = """>seq1 +ATGCATGCATGC +""" + +@pytest.fixture +def tmp_fasta_file(tmp_path): + fasta_file = tmp_path / "test.fasta" + fasta_file.write_text(test_fasta_data) + return fasta_file + +def test_convert_multiline_fasta_to_oneline(tmp_fasta_file): + output_file = tmp_fasta_file.parent / "output.fasta" + result_file = convert_multiline_fasta_to_oneline(tmp_fasta_file, output_file) + + with open(output_file) as f: + result_data = f.read() + assert result_data == expected_fasta_data + output_file.unlink() + + +test_fastq_data = """@SRX079804:1:SRR292678:1:1101:21885:21885 +ACAGCAACATAAACATGATGGGATGGCGTAAGCCCCCGAGATATCAGTTTACCCAGGATAAGAGATTAAATTATGAGCAACATTATTAA ++SRX079804:1:SRR292678:1:1101:21885:21885 +FGGGFGGGFGGGFGDFGCEBB@CCDFDDFFFFBFFGFGEFDFFFF;D@DD>C@DDGGGDFGDGG?GFGFEGFGGEF@FDGGGFGFBGGD +@SRX079804:1:SRR292678:1:1101:212327:212327 +TTACCTCTGCTTTTTCGCCTGTTACTTCTACTAATCCTTCATCTATTGCGAATGGCCCTACTACTGACGAAAT ++SRX079804:1:SRR292678:1:1101:212327:212327 +DBCCC@@;A@BDCCCE>BBED>GDCDBFBFFEBEECFGGD@@BCB<<8@;09746:@@>@EEECEEDE/FEED +@SRX079804:1:SRR292678:1:1101:230386:230386 +TTCTGATTGGAGTGAGAGTGCCATTTGTTTCGCTGATTGGACGTTGGAAAGCGCCTTGACCTTTGACAGCAG ++SRX079804:1:SRR292678:1:1101:230386:230386 +=BDD=DCDBBCDADD@@B;B@CC7C@B@>=BACDD,=??8DDCDD7CCCCDBDEDBDDDDDEEBDDDBD?DB +""" + +@pytest.fixture +def tmp_fastq_file(tmp_path): + fastq_file = tmp_path / "test.fastq" + fastq_file.write_text(test_fastq_data) + return fastq_file + +def test_filter_fastq(tmp_fastq_file, tmp_path): + quality_threshold = 10 + gc_bounds = (30, 70) + length_bounds = (10, 100) + + output_file = tmp_path / "filtered.fastq" + result = filter_fastq(tmp_fastq_file, quality_threshold, output_file, gc_bounds, length_bounds) + with open(output_file) as f: + filtered_records = list(SeqIO.parse(f, "fastq")) + assert len(filtered_records) == int(result) + +token = 
os.environ.get('TG_API_TOKEN') +def test_send_telegram_message(requests_mock): + chat_id = "508988457" + message = "Test message" + expected_json_response = { + "ok": True, + "result": { + "message_id": 1234, + } + } + requests_mock.post( + f"https://api.telegram.org/bot{os.environ['TG_API_TOKEN']}/sendMessage", + json=expected_json_response, + status_code=200, + ) + response = send_telegram_message(chat_id, message) + assert response == expected_json_response + From df1a3f9ae7bd36bb3ab225482cfa4c8647021203 Mon Sep 17 00:00:00 2001 From: Elena Smertina <104040609+sme229@users.noreply.github.com> Date: Wed, 1 May 2024 19:16:23 +1000 Subject: [PATCH 18/24] Update README.md --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index ba78686..ed4d751 100644 --- a/README.md +++ b/README.md @@ -1 +1,3 @@ -# misc_module \ No newline at end of file +# misc_module + +This module contains assignments for the BI course on python. The scripts include tasks on OOP, API and various functions useful for basic bioinformatics. 
From 2f3798055d4f889649f1e25b7d4ad4a3d1b466ae Mon Sep 17 00:00:00 2001 From: Elena Smertina <104040609+sme229@users.noreply.github.com> Date: Wed, 1 May 2024 19:27:56 +1000 Subject: [PATCH 19/24] Update requirements.txt --- requirements.txt | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 10211e5..29abb64 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,9 @@ -bioframe==0.5.1 +beautifulsoup4==4.12.3 +Bio==1.7.0 biopython==1.81 +numpy==1.26.4 +pytest==8.2.0 +python-dotenv==1.0.1 +Requests==2.31.0 +requests_mock==1.12.1 +scikit_learn==1.4.1.post1 From a910d5c230c217c760ec21decbb10d3e9522e36a Mon Sep 17 00:00:00 2001 From: Elena Smertina <104040609+sme229@users.noreply.github.com> Date: Wed, 15 May 2024 07:53:15 +1000 Subject: [PATCH 20/24] Update README.md --- README.md | 92 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 91 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index ed4d751..2028e64 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,93 @@ # misc_module -This module contains assignments for the BI course on python. The scripts include tasks on OOP, API and various functions useful for basic bioinformatics. +This module contains assignments for the BI course on python. The scripts include tasks on OOP, API and various functions useful for basic bioinformatics. +The following functions can be found in `bio_files_processor.py`: + +``` +convert_multiline_fasta_to_oneline +``` + +Example: + +` +convert_multiline_fasta_to_oneline(input_fasta='example_multiline_fasta.fasta') +` + +Input file: + +![before](https://github.com/sme229/misc_module/assets/104040609/65e68a7a-a47c-4335-8d10-a88387fa3bdd) + +After conversion to a single line: + +![after](https://github.com/sme229/misc_module/assets/104040609/c85e4283-295e-4689-a156-5c464cec2164) + +` +OpenFasta +` +This is a context manager that works with fasta files. 
+ +- It returns records as `FastaRecord` class objects +- Includes `read_record` and `read_records` methods + +Input and output example: + +```fasta +>GTD326487.1 Species anonymous 24 chromosome +ATCGACTACGACTAGCATCACGATCACGATACG +ATGCATCAGTAGCACTAGATCA +``` + +```python +id = 'GTD326487.1' +description = 'Species anonymous 24 chromosome' +sequence = 'ATCGACTACGACTAGCATCACGATCACGATACGATGCATCAGTAGCACTAGATCA' +``` + +In biopython_fastq_filter.py the following functions are located: + +` +fastq_filter +` + +This function takes fastq sequences organised as a dictinary for input: {'seq_name': ('sequence', 'quality')}, for example: + +` +{'@SRX079804:1:SRR292678:1:1101:21885:21885': ('ACAGCAACATAAACATGATGGGATGGCGTAAGCCCCCGAGATATCAGTTTACCCAGGATAAGAGATTAAATTATGAGCAACATTATTAA', 'FGGGFGGGFGGGFGDFGCEBB@CCDFDDFFFFBFFGFGEFDFFFF;D@DD>C@DDGGGDFGDGG?GFGFEGFGGEF@FDGGGFGFBGGD'), '@SRX079804:1:SRR292678:1:1101:24563:24563': ('ATTAGCGAGGAGGAGTGCTGAGAAGATGTCGCCTACGCCGTTGAAATTCCCTTCAATCAGGGGGTACTGGAGGATACGAGTTTGTGTG', 'BFFFFFFFB@B@A<@D>BDDACDDDEBEDEFFFBFFFEFFDFFF=CC@DDFD8FFFFFFF8/+.2,@7<<:?B/:<><-><@.A*C>D')} +` + +It filters fastq sequences by GC content, sequence length and quality score. These parameters can be specified in the function call: + +` +fastq_filter({'@SRX079804:1:SRR292678:1:1101:21885:21885': ('ACAGCAACATAAACATGATGGGATGGCGTAAGCCCCCGAGATATCAGTTTACCCAGGATAAGAGATTAAATTATGAGCAACATTATTAA', 'FGGGFGGGFGGGFGDFGCEBB@CCDFDDFFFFBFFGFGEFDFFFF;D@DD>C@DDGGGDFGDGG?GFGFEGFGGEF@FDGGGFGFBGGD')}, gc_bounds=(0,80), length_bounds=(100,200), quality_threshold=20) +` + +In this example, the GC content filter is set up as >= 0 and <= 80, the length filter filters out sequences that are outside of >= 100 and <= 200 length and the quality score cut off is set to >= 20. +A filtered dictionary with sequences that passed all 3 filters is returned. + + +` +BiologicalSequence +` + +This is an abstract class that includes: + +Class NucleicAcidSequence which has `complement` and `gc_content` methods. 
It's a parent class to DNASequence and RNASequence classes. + +Class AminoAcidSequence has `amino_acid_frequency` method. + + +` +telegram_logger +` + +This function send a message from a telegram bot about the status of some process: + +![Untitled.png](attachment:6fbf5b6f-1a8d-4ca8-9536-eb059edc0b2e.png) + + +` +run_genscan +` +This is a python API for this web tool http://hollywood.mit.edu/GENSCAN.html + +In `custom_random_forest.py` there is a `RandomForestClassifierCustom` class that works with a custom number of threads which makes it fast. From c6f090f11138d147e8a5e90f0fb044225ce930a2 Mon Sep 17 00:00:00 2001 From: Elena Smertina <104040609+sme229@users.noreply.github.com> Date: Wed, 15 May 2024 07:53:42 +1000 Subject: [PATCH 21/24] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 2028e64..7e47bf4 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,7 @@ After conversion to a single line: ` OpenFasta ` + This is a context manager that works with fasta files. - It returns records as `FastaRecord` class objects From 28a1b947e495210ffa6b3a10d53f27a5887334c5 Mon Sep 17 00:00:00 2001 From: Elena Smertina <104040609+sme229@users.noreply.github.com> Date: Wed, 15 May 2024 07:54:40 +1000 Subject: [PATCH 22/24] Update README.md --- README.md | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 7e47bf4..23a60c2 100644 --- a/README.md +++ b/README.md @@ -21,9 +21,9 @@ After conversion to a single line: ![after](https://github.com/sme229/misc_module/assets/104040609/c85e4283-295e-4689-a156-5c464cec2164) -` +``` OpenFasta -` +``` This is a context manager that works with fasta files. 
@@ -46,9 +46,9 @@ sequence = 'ATCGACTACGACTAGCATCACGATCACGATACGATGCATCAGTAGCACTAGATCA' In biopython_fastq_filter.py the following functions are located: -` +``` fastq_filter -` +``` This function takes fastq sequences organised as a dictinary for input: {'seq_name': ('sequence', 'quality')}, for example: @@ -66,9 +66,9 @@ In this example, the GC content filter is set up as >= 0 and <= 80, the length f A filtered dictionary with sequences that passed all 3 filters is returned. -` +``` BiologicalSequence -` +``` This is an abstract class that includes: @@ -77,18 +77,19 @@ Class NucleicAcidSequence which has `complement` and `gc_content` methods. It's Class AminoAcidSequence has `amino_acid_frequency` method. -` +``` telegram_logger -` +``` This function send a message from a telegram bot about the status of some process: ![Untitled.png](attachment:6fbf5b6f-1a8d-4ca8-9536-eb059edc0b2e.png) -` +``` run_genscan -` +``` + This is a python API for this web tool http://hollywood.mit.edu/GENSCAN.html In `custom_random_forest.py` there is a `RandomForestClassifierCustom` class that works with a custom number of threads which makes it fast. 
From 56d12d355b482ac6a5fe554682646d549357dea3 Mon Sep 17 00:00:00 2001 From: Elena Smertina <104040609+sme229@users.noreply.github.com> Date: Wed, 15 May 2024 07:56:15 +1000 Subject: [PATCH 23/24] Update README.md --- README.md | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 23a60c2..b5c7c30 100644 --- a/README.md +++ b/README.md @@ -50,20 +50,7 @@ In biopython_fastq_filter.py the following functions are located: fastq_filter ``` -This function takes fastq sequences organised as a dictinary for input: {'seq_name': ('sequence', 'quality')}, for example: - -` -{'@SRX079804:1:SRR292678:1:1101:21885:21885': ('ACAGCAACATAAACATGATGGGATGGCGTAAGCCCCCGAGATATCAGTTTACCCAGGATAAGAGATTAAATTATGAGCAACATTATTAA', 'FGGGFGGGFGGGFGDFGCEBB@CCDFDDFFFFBFFGFGEFDFFFF;D@DD>C@DDGGGDFGDGG?GFGFEGFGGEF@FDGGGFGFBGGD'), '@SRX079804:1:SRR292678:1:1101:24563:24563': ('ATTAGCGAGGAGGAGTGCTGAGAAGATGTCGCCTACGCCGTTGAAATTCCCTTCAATCAGGGGGTACTGGAGGATACGAGTTTGTGTG', 'BFFFFFFFB@B@A<@D>BDDACDDDEBEDEFFFBFFFEFFDFFF=CC@DDFD8FFFFFFF8/+.2,@7<<:?B/:<><-><@.A*C>D')} -` - -It filters fastq sequences by GC content, sequence length and quality score. These parameters can be specified in the function call: - -` -fastq_filter({'@SRX079804:1:SRR292678:1:1101:21885:21885': ('ACAGCAACATAAACATGATGGGATGGCGTAAGCCCCCGAGATATCAGTTTACCCAGGATAAGAGATTAAATTATGAGCAACATTATTAA', 'FGGGFGGGFGGGFGDFGCEBB@CCDFDDFFFFBFFGFGEFDFFFF;D@DD>C@DDGGGDFGDGG?GFGFEGFGGEF@FDGGGFGFBGGD')}, gc_bounds=(0,80), length_bounds=(100,200), quality_threshold=20) -` - -In this example, the GC content filter is set up as >= 0 and <= 80, the length filter filters out sequences that are outside of >= 100 and <= 200 length and the quality score cut off is set to >= 20. -A filtered dictionary with sequences that passed all 3 filters is returned. +This function uses Biopython to filter fastq sequences by GC content, sequence length and quality score. 
``` @@ -92,4 +79,5 @@ run_genscan This is a python API for this web tool http://hollywood.mit.edu/GENSCAN.html + In `custom_random_forest.py` there is a `RandomForestClassifierCustom` class that works with a custom number of threads which makes it fast. From 7d74c1d882a4e26e9d3342161b13b7f03366dc83 Mon Sep 17 00:00:00 2001 From: Elena Smertina <104040609+sme229@users.noreply.github.com> Date: Wed, 15 May 2024 07:58:17 +1000 Subject: [PATCH 24/24] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b5c7c30..06139ab 100644 --- a/README.md +++ b/README.md @@ -70,7 +70,7 @@ telegram_logger This function send a message from a telegram bot about the status of some process: -![Untitled.png](attachment:6fbf5b6f-1a8d-4ca8-9536-eb059edc0b2e.png) +![Untitled](https://github.com/sme229/misc_module/assets/104040609/141f1cd1-1430-48c7-b8ab-dda41db214ea) ```