From 0496a2aeaf0710d64ded4376204971f1629a6097 Mon Sep 17 00:00:00 2001 From: lsmertina Date: Sat, 7 Oct 2023 10:41:44 +1100 Subject: [PATCH 01/22] Add fastq filter module --- modules/fastq_filter_functions.py | 106 ++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 modules/fastq_filter_functions.py diff --git a/modules/fastq_filter_functions.py b/modules/fastq_filter_functions.py new file mode 100644 index 0000000..758db74 --- /dev/null +++ b/modules/fastq_filter_functions.py @@ -0,0 +1,106 @@ +def calc_gc_content(seq: str) -> float: + """ + Calculates gc content + Argument is string + Returns float in % + """ + seq_lower = seq.lower() + length_seq = len(seq_lower) + gc_count = 0 + for nt in seq_lower: + if nt=='g' or nt=='c': + gc_count+=1 + gc_content = (gc_count/length_seq)*100 + return gc_content +def seq_length(seq: str) -> str: + """ + Calculates sequence length + Argument is string + Returns string + """ + return len(seq) +def quality_score(seq: str) -> int: + """ + Calculates numeric quality score + Argument is string + Returns int value + """ + score_count = 0 + length_q_seq = len(seq) + for symbol in seq: + score_num = ord(symbol) - 33 + score_count+=score_num + mean_qs = (score_count/length_q_seq) + return mean_qs +def length_filter(seqs: dict, length_bounds=(0,1000)) -> dict: + """ + Filters fastq reads by length + Arguments: + -dictionary + -sequence length parameters (>= and <=) + Returns filtered dictionary + """ + #seqs = {'name': ('sequence', 'quality')} + output = [] + result = dict() + for name, (sequence, quality) in seqs.items(): + + if seq_length(sequence) <= length_bounds[1] and seq_length(sequence) >= length_bounds[0]: + output.append(name) + if name in output: + result[name] = (sequence, quality) + + return result +def quality_filter(seqs: dict, quality_threshold=25) -> dict: + """ + Filters fastq reads by quality score + Arguments: + -dictionary + -quality score threshold (>=) + Returns filtered dict + """ + #seqs = {'name': ('sequence', 'quality')} + output = [] + result = dict() + for name, (sequence, quality) in seqs.items(): + + if quality_score(quality) >= quality_threshold: + output.append(name) + if name in output: + result[name] = (sequence, quality) + + return result +def gc_filter(seqs: dict, gc_bounds=(0,100)) -> dict: + """ + Filters fastq reads by gc content + Arguments: + -dict + -gc content parameters (>= and <=) + Returns filtered dict + """ + #seqs = {'name': ('sequence', 'quality')} + output = [] + result = dict() + for name, (sequence, quality) in seqs.items(): + + if calc_gc_content(sequence) >= gc_bounds[0] and calc_gc_content(sequence) <= gc_bounds[1]: + output.append(name) + if name in output: + result[name] = (sequence, quality) + + return result +def fastq_filter(seqs: dict, gc_bounds: int, length_bounds: int, quality_threshold: int) -> dict: + """ + Filters fastq sequence by gc content, length and quality score + Arguments: dict with fastq sequences, filtering parameters + Returns filtered dictionary + """ + resulting_sequences = dict() + gc_filtered = gc_filter(seqs, gc_bounds = (0,100)) + length_filtered = length_filter(seqs, length_bounds = (0,1000)) + quality_filtered = quality_filter(seqs, quality_threshold = 15) + intersection = gc_filtered.keys() & length_filtered.keys() & quality_filtered.keys() + #intersection = {keys: gc_filtered[keys] for keys in gc_filtered.keys() & length_filtered.keys()} + #for keys, (sequence, quality) in intersection: + # resulting_sequences[keys] = (sequence, quality) + return intersection From 5f3ef9fe7bfcd24f7466c8091595af9b82754616 Mon Sep 17 00:00:00 2001 From: lsmertina Date: Sat, 7 Oct 2023 11:16:23 +1100 Subject: [PATCH 02/22] Add functions for protein sequences --- modules/protein_module.py | 130 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 130 insertions(+) create mode 100644 modules/protein_module.py diff --git a/modules/protein_module.py b/modules/protein_module.py new file mode 100644 index 0000000..d71da27 --- /dev/null +++ b/modules/protein_module.py @@ -0,0 +1,130 @@ +from typing import Optional + +aa_code_dict = {'C':'Cys', 'c':'Cys', 'D':'Asp', 'd':'Asp', 'S':'Ser', 's':'Ser', 'Q':'Gln', 'q':'Gln', + 'K':'Lys', 'k':'Lys', 'I':'Ile', 'i':'Ile', 'P':'Pro', 'p':'Pro', 'T':'Thr', 't':'Thr', + 'F':'Phe', 'f':'Phe', 'N':'Asn', 'n':'Asn', 'G':'Gly', 'g':'Gly', 'H':'His', 'h':'His', + 'L':'Leu', 'l':'Leu', 'R':'Arg', 'r':'Arg', 'W':'Trp', 'w':'Trp', 'A':'Ala', 'a':'Ala', + 'V':'Val', 'v':'Val', 'E':'Glu', 'e':'Glu', 'Y':'Tyr', 'y':'Tyr', 'M':'Met', 'm':'Met'} + +aa_weight_dict = {'G':75, 'g':75, 'A':89, 'a':89, 'R':174, 'r':174, 'N':132, 'n':132, + 'D':133, 'd':133, 'C':121, 'c':133, 'E':147, 'e':147, 'Q':146, 'q':146, + 'H':155, 'h':155, 'I':131, 'i':131, 'L':131, 'l':131, 'K':146, 'k':146, + 'M':149, 'm':149, 'F':165, 'f':165, 'P':115, 'p':115, 'S':105, 's':105, + 'T':119, 't':119, 'W':204, 'w':204, 'Y':181, 'y':181, 'V':117, 'v':117} +def amino_acid_frequency(seq: str) -> dict: + """ + Calculates amino acid frequencies + Arguments: + -seq (str) input protein sequence + Return: + -dictionary with amino acid and its frequency + """ + freq_dict = {} + for letter in seq: + if letter in freq_dict: + freq_dict[letter] += 1 + else: + freq_dict[letter] = 1 + for letter in freq_dict: + freq_dict[letter] = round(freq_dict[letter] / len(seq) * 100, 2) + return freq_dict + + +def find_motifs(seq: str, motif: str): + """ + Finds a motif of interest in a protein sequence + Arguments: + -seq (str) input protein sequence + -motif (str) motif to be found in sequence + Return: + -position(s) of the motif in seq + """ + positions = [] + for i in range(len(seq) - len(motif) + 1): + window = seq[i:i+len(motif)] + if window == motif: + positions.append(i) + return positions + + +def check_protein_seq(seq: str) -> str: + """ + Checks whether a sequence is written using 1-letter amino acid code + Arguments: + -seq (str) input protein sequence + Return: + - str, 'single_letter_prot_seq' otherwise 'Invalid Input' error is raised + """ + unique_chars = set(seq) + single_letter = set('GALMFWKQESPVICYHRNDTgalmfwkqespvicyhrndt') + + if unique_chars <= single_letter: + seq = 'single_letter_prot_seq' + + else: + raise ValueError("Invalid Input") + return seq + + +def molecular_weight(seq: str) -> int: + """ + Calculates molecular weight of a protein + Arguments: + - seq (str) 1-letter coded protein sequence + Return: + - int, molecular weight (g/mol) rounded to integer + """ + list_input_seq = list(seq) + water_mw = 18 + for aa in list_input_seq: + total_mw = sum(aa_weight_dict[a] for a in list_input_seq) + mw_water_removed = (total_mw - (water_mw * (len(list_input_seq)-1))) + return mw_water_removed + + +def one_to_three_letter(seq: str) -> str: + """ + Converts a 1-letter amino acid code sequence into a 3-letter sequence + Arguments: + - seq (str) sequence to convert, must be 1-letter coded protein sequence + Return: + - str, a 3-letter coded protein sequence without spaces + """ + three_letter_aa = '' + for aa in seq: + three_letter_aa_seq += aa_code_dict[aa] + return three_letter_aa_seq + + +def run_protein_tool(*args: str, function: str, motif: Optional[str]=None): + """ + This is the main function + Arguments: + -seq(str) protein sequence(s) + -function(str) specify the function + -motif(str), optional argument for find_motifs function + Return: + -result of the specified function + """ + results = [] + for seq in args: + if check_protein_seq(seq) == 'single_letter_prot_seq': + if function == 'check_protein_seq': + for seq in args: + results.append(check_protein_seq(seq)) + elif function == 'molecular_weight': + for seq in args: + results.append(molecular_weight(seq)) + elif function == 'one_to_three_letter': + for seq in args: + results.append(one_to_three_letter(seq)) + elif function == 'amino_acid_frequency': + for seq in args: + results.append(amino_acid_frequency(seq)) + elif function == 'find_motifs': + for seq in args: + results.append(find_motifs(seq, motif)) + if len(results) == 1: + results = results[0] + return results + From e02237542fb084f09964ed1695304531d60c8ac3 Mon Sep 17 00:00:00 2001 From: lsmertina Date: Sat, 7 Oct 2023 17:16:08 +1100 Subject: [PATCH 03/22] Add nucleic acid functions from hw3 --- modules/nucleic_acid_module.py | 125 +++++++++++++++++++++++++++++++++ 1 file changed, 125 insertions(+) create mode 100644 modules/nucleic_acid_module.py diff --git a/modules/nucleic_acid_module.py b/modules/nucleic_acid_module.py new file mode 100644 index 0000000..c24d2bd --- /dev/null +++ b/modules/nucleic_acid_module.py @@ -0,0 +1,125 @@ +def transcribe(seq: str) -> str: + """ + Transcribes DNA->RNA + Argument is string + Return is string + """ + list_input = list(seq) + for i in range(len(seq)): + if (list_input[i] == 'T'): + list_input[i] = 'U' + elif (list_input[i] == 't'): + list_input[i]='u' + return "".join(list_input) + + +def reverse(seq: str) -> str: + """ + Returns reversed sequence + Argument is string + Return is string + """ + output = seq[::-1] + return output + + +def complement(seq: str) -> str: + """ + Returns a complementary sequence + Argument is a string + Return is a string + """ + list_input = list(seq) + for i in range(len(seq)): + if (list_input[i]=='G'): + list_input[i]='C' + elif (list_input[i]== 'g'): + list_input[i]='c' + elif (list_input[i]=='C'): + list_input[i]='G' + elif (list_input[i]=='c'): + list_input[i]='g' + elif (list_input[i] == 'T'): + list_input[i] = 'A' + elif (list_input[i] == 't'): + list_input[i]='a' + elif (list_input[i] == 'A'): + list_input[i] = 'T' + elif (list_input[i]=='a'): + list_input[i]='t' + + else: + + list_input = list(seq) + for i in range(len(seq)): + if (list_input[i]=='G'): + list_input[i]='C' + elif (list_input[i]== 'g'): + list_input[i]='c' + elif (list_input[i]=='C'): + list_input[i]='G' + elif (list_input[i]=='c'): + list_input[i]='g' + elif (list_input[i] == 'U'): + list_input[i] = 'A' + elif (list_input[i] == 'u'): + list_input[i]='a' + elif (list_input[i] == 'A'): + list_input[i] = 'U' + elif (list_input[i]=='a'): + list_input[i]='u' + return "".join(list_input) + + +def check_nucleic_acid(seq: str) -> str: + """ + This function checks whether input sequence(s) is a nucleic acid + Argument is str + Return is str + """ + unique_chars = set(seq) + nucleotides_dna = set('ATGCatgc') + nucleotides_rna = set('AUGCaugc') + if unique_chars <= nucleotides_dna: + seq = 'dna' + elif unique_chars <= nucleotides_rna: + seq = 'rna' + else: + raise ValueError("Invalid Input") + return seq + + +def reverse_complement(seq: str) -> str: + """ + This function returns a reversed complementary sequence + Argument is str + Return is str + """ + complement_seq = complement(seq) + reverse_compl_seq = reverse(complement_seq) + return reverse_compl_seq + + +def run_dna_rna_tools(*args: str, function: str) -> str: + """ + This function combines the functions above + Arguments: *args are input sequences, function is a function of choice + Returns: str, processed seqeunces depending on the function chosen + """ + results = [] + for seq in args: + check_nucleic_acid(seq) + if function == 'transcribe': + results.append(transcribe(seq)) + if function == 'complement': + results.append(complement(seq)) + if function == 'reverse': + results.append(reverse(seq)) + if function == 'reverse_complement': + results.append(reverse_complement(seq)) + if len(results) == 1: + results = results[0] + + return results + + From fc02f8280ed6c5950e528e4aa932764bd23a4698 Mon Sep 17 00:00:00 2001 From: lsmertina Date: Sat, 7 Oct 2023 17:46:39 +1100 Subject: [PATCH 04/22] Add the main script with 3 functions --- miscellaneous.py | 71 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 miscellaneous.py diff --git a/miscellaneous.py b/miscellaneous.py new file mode 100644 index 0000000..540510b --- /dev/null +++ b/miscellaneous.py @@ -0,0 +1,71 @@ +def fastq_filter(seqs: dict, gc_bounds: int, length_bounds: int, quality_threshold: int) -> dict: + """ + Filters fastq sequence by gc content, length and quality score + Arguments: dict with fastq sequences, filtering parameters + Returns filtered dictionary + """ + result = dict() + gc_filtered = gc_filter(seqs, gc_bounds) + length_filtered = length_filter(seqs, length_bounds) + quality_filtered = quality_filter(seqs, quality_threshold) + intersection = gc_filtered.keys() & length_filtered.keys() & quality_filtered.keys() + + for keys, (sequence, quality) in seqs.items(): + if keys in intersection: + result[keys] = (sequence, quality) + return result + + +def run_dna_rna_tools(*args: str, function: str) -> str: + """ + This function combines the functions above + Arguments: *args are input sequences, function is a function of choice + Returns: str, processed seqeunces depending on the function chosen + """ + results = [] + for seq in args: + check_nucleic_acid(seq) + if function == 'transcribe': + results.append(transcribe(seq)) + if function == 'complement': + results.append(complement(seq)) + if function == 'reverse': + results.append(reverse(seq)) + if function == 'reverse_complement': + results.append(reverse_complement(seq)) + if len(results) == 1: + results = results[0] + return results + + +def run_protein_tool(*args: str, function: str, motif: Optional[str]=None): + """ + This is the main function + Arguments: + -seq(str) protein sequence(s) + -function(str) specify the function + -motif(str), optional argument for find_motifs function + Return: + -result of the specified function + """ + results = [] + for seq in args: + if check_protein_seq(seq) == 'single_letter_prot_seq': + if function == 'check_protein_seq': + for seq in args: + results.append(check_protein_seq(seq)) + elif function == 'molecular_weight': + for seq in args: + results.append(molecular_weight(seq)) + elif function == 'one_to_three_letter': + for seq in args: + results.append(one_to_three_letter(seq)) + elif function == 'amino_acid_frequency': + for seq in args: + results.append(amino_acid_frequency(seq)) + elif function == 'find_motifs': + for seq in args: + results.append(find_motifs(seq, motif)) + if len(results) == 1: + results = results[0] + return results From e44c87b7791641256bab32049202c59aa9bdd9e4 Mon Sep 17 00:00:00 2001 From: lsmertina Date: Sat, 14 Oct 2023 12:27:13 +1100 Subject: [PATCH 05/22] Create a script --- updated_HW5.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 updated_HW5.py diff --git a/updated_HW5.py b/updated_HW5.py new file mode 100644 index 0000000..e69de29 From 92e6910a23f779b2b834d31b5e86bfe483bd8e82 Mon Sep 17 00:00:00 2001 From: lsmertina Date: Wed, 18 Oct 2023 13:04:22 +1100 Subject: [PATCH 06/22] Create bio_files_processor.py file for HW6 --- bio_files_processor.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 bio_files_processor.py diff --git a/bio_files_processor.py b/bio_files_processor.py new file mode 100644 index 0000000..e69de29 From b8cd252b6390b3c2cf45d7502afbe1a01b4b3c04 Mon Sep 17 00:00:00 2001 From: lsmertina Date: Fri, 23 Feb 2024 17:50:44 +1100 Subject: [PATCH 07/22] add script and sample data --- biopython_gc_filter.ipynb | 77 ++++++ example_fastq.fastq | 356 ++++++++++++++++++++++++++++ example_fastq.fastq:Zone.Identifier | 3 + misc_module | 1 + 4 files changed, 437 insertions(+) create mode 100644 biopython_gc_filter.ipynb create mode 100644 example_fastq.fastq create mode 100644 example_fastq.fastq:Zone.Identifier create mode 160000 misc_module diff --git a/biopython_gc_filter.ipynb b/biopython_gc_filter.ipynb new file mode 100644 index 0000000..a3da0d8 --- /dev/null +++ b/biopython_gc_filter.ipynb @@ -0,0 +1,77 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 7, + "id": "21c5a784", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "9\n" + ] + } + ], + "source": [ + "from Bio import SeqIO\n", + "from Bio.SeqUtils import GC\n", + "\n", + "def filter_fastq(input_path: str, quality_threshold: int, output_filename=\"final_filtered.fastq\", gc_bounds=(40, 60), length_bounds=(50, 350)):\n", + " filename = input_path\n", + " records = SeqIO.parse(filename, \"fastq\")\n", + " ###quality filter\n", + " good_reads = (rec for rec in records if min(rec.letter_annotations[\"phred_quality\"]) >= quality_threshold)\n", + " result_quality = SeqIO.write(good_reads, \"good_quality.fastq\", \"fastq\")\n", + " result_quality_GC = SeqIO.parse(\"good_quality.fastq\", \"fastq\")\n", + " \n", + " ###GC content filter\n", + " min_gc_content = gc_bounds[0]\n", + " max_gc_content = gc_bounds[1]\n", + " GC_quality_filt = []\n", + " \n", + " for sequence in result_quality_GC:\n", + " if min_gc_content <= GC(sequence.seq) <= max_gc_content:\n", + " GC_quality_filt.append(sequence)\n", + " \n", + " result_quality = SeqIO.write(GC_quality_filt, \"good_quality_GC.fastq\", \"fastq\")\n", + " result_quality_GC_length = SeqIO.parse(\"good_quality_GC.fastq\", \"fastq\")\n", + " \n", + " ##length filter\n", + " filtered_GC_quality_length = []\n", + " \n", + " for sequence in result_quality_GC_length:\n", + " if len(sequence.seq) >= length_bounds[0] and len(sequence.seq) <= length_bounds[1]:\n", + " filtered_GC_quality_length.append(sequence)\n", + " \n", + " result_quality = SeqIO.write(filtered_GC_quality_length, output_filename, \"fastq\")\n", + " \n", + " print(result_quality)\n", + "\n", + "#filter_fastq(\"example_fastq.fastq\", 15)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/example_fastq.fastq b/example_fastq.fastq new file mode 100644 index 0000000..883b51f --- /dev/null +++ b/example_fastq.fastq @@ -0,0 +1,356 @@ +@SRX079804:1:SRR292678:1:1101:21885:21885 1:N:0:1 BH:ok +ACAGCAACATAAACATGATGGGATGGCGTAAGCCCCCGAGATATCAGTTTACCCAGGATAAGAGATTAAATTATGAGCAACATTATTAA ++SRX079804:1:SRR292678:1:1101:21885:21885 1:N:0:1 BH:ok +FGGGFGGGFGGGFGDFGCEBB@CCDFDDFFFFBFFGFGEFDFFFF;D@DD>C@DDGGGDFGDGG?GFGFEGFGGEF@FDGGGFGFBGGD +@SRX079804:1:SRR292678:1:1101:24563:24563 1:N:0:1 BH:failed +ATTAGCGAGGAGGAGTGCTGAGAAGATGTCGCCTACGCCGTTGAAATTCCCTTCAATCAGGGGGTACTGGAGGATACGAGTTTGTGTG ++SRX079804:1:SRR292678:1:1101:24563:24563 1:N:0:1 BH:failed +BFFFFFFFB@B@A<@D>BDDACDDDEBEDEFFFBFFFEFFDFFF=CC@DDFD8FFFFFFF8/+.2,@7<<:?B/:<><-><@.A*C>D +@SRX079804:1:SRR292678:1:1101:30161:30161 1:N:0:1 BH:failed +GAACGACAGCAGCTCCTGCATAACCGCGTCCTTCTTCTTTAGCGTTGTGCAAAGCATGTTTTGTATTACGGGCATCTCGAGCGAATC ++SRX079804:1:SRR292678:1:1101:30161:30161 1:N:0:1 BH:failed +DFFFEGDGGGGFGGEDCCDCEFFFFCCCCCB>CEBFGFBGGG?DE=:6@=>AD?D8DCEE:>EEABE5D@5:DDCA;EEE-DCD +@SRX079804:1:SRR292678:1:1101:47176:47176 1:N:0:1 BH:failed +TGAAGCGTCGATAGAAGTTAGCAAACCCGCGGAACTTCCGTACATCAGACACATTCCGGGGGGTGGGCCAATCCATGATGCCTTTG ++SRX079804:1:SRR292678:1:1101:47176:47176 1:N:0:1 BH:failed +FF@FFBEEEEFFEFFD@EDEFFB=DFEEFFFE8FFE8EEDBFDFEEBE+E46.'8.5::EE:?E>A6@?)>;>9D<C9DEBAAB=5C?<@0=A?D@BDB;:BA?BDDFH?B@DCB6BEBDA??AA9. +@SRX079804:1:SRR292678:1:1101:52180:52180 1:N:0:1 BH:failed +TTTTTGGTTTTAGGTCTAACATGTAAGTCTTTAATCTATTTTGAATTAATTTTTGGATGAGGTGTAAGGAAGGGATCCAGTTTC ++SRX079804:1:SRR292678:1:1101:52180:52180 1:N:0:1 BH:failed +FEFFFFFF=FEEFDFD>EE:?<5@BFAFCFEE7>C>:/6:2<344DA:6DDCDC>)34773DD?@DBA9B +@SRX079804:1:SRR292678:1:1101:105156:105156 1:N:0:1 BH:ok +ACTGCTGAGCTTAAATGGCGGCAGTCTGACGGTTACCAACGGGGGCACTTCAACCGGTTCGTTAACGGGGAGCGGAGAGCTGA ++SRX079804:1:SRR292678:1:1101:105156:105156 1:N:0:1 BH:ok +GFFEGGFGGGGEGGGGGGGGGFDD=DDE7EDD6CD?FEDEE@EBEFEE.DD5DDD@B<7>/0543C?BEE?@@BE@; +@SRX079804:1:SRR292678:1:1101:135168:135168 1:N:0:1 BH:ok +CCTTCCTAAAAATTAAGAATCTTAACAATTAGCAGCACAACCAAAATTATTACCGAAAGGACTTACTCCTCCGCCAAATCCA ++SRX079804:1:SRR292678:1:1101:135168:135168 1:N:0:1 BH:ok +GGGEGGGEGGBFFFFFGCFFEGG6AEEEDEEGGFGGCG4EGFFGDFFBGEGFGGFECBDEGGEFGBFF?CGFBFGGGGFGFF +@SRX079804:1:SRR292678:1:1101:149302:149302 1:N:0:1 BH:changed:1 +TAGGGTTGTATTTGCAGATCCATGGCATGCCAAAAAGAACATCGTCCCGTCCAATATCTGCAACATACCAGTTGGTTGGTA ++SRX079804:1:SRR292678:1:1101:149302:149302 1:N:0:1 BH:changed:1 +@;CBA=:@;@DBDCDEEE/EEEEEEF@>FBEEB=EFA>EEBD=DAEEEEB9)99>B99BC)@,@<9CDD=C,5;B::?@;A +@SRX079804:1:SRR292678:1:1101:170868:170868 2:N:0:1 BH:failed +CTGCCGAGACTGTTCTCAGACATGGAAAGCTCGATTCGCATACACTCGCTGAGTAAGAGAGTCACACCAAATCACAGATT ++SRX079804:1:SRR292678:1:1101:170868:170868 2:N:0:1 BH:failed +E;FFFEGFGIGGFBG;C6D<@C7CDGFEFGFHDFEHHHBBHHFDFEFBAEEEEDE@A2=DA:??C3:@>EEBEEHEFEHHFFHH?FGBGFBBD77B;;C?FFFFGGFED.BBABBG@DBBE +@SRX079804:1:SRR292678:1:1101:190845:190845 1:N:0:1 BH:changed:1 +CCTCAGCGTGGATTGCCGCTCATGCAGGAGCAGATAATCCCTTCGCCATCCCATTAAGCGCCGTTGTCGGTATTCC ++SRX079804:1:SRR292678:1:1101:190845:190845 1:N:0:1 BH:changed:1 +FF@FFCFEECEBEC@@BBBBDFBBFFDFFEFFEB8FFFFFFFFEFCEB/>BBA@AFFFEEEEECE;ACD@DBBEEE +@SRX079804:1:SRR292678:1:1101:198993:198993 2:N:0:1 BH:failed +AGTTATTTATGCATCATTCTCATGTATGAGCCAACAAGATAGTACAAGTTTTATTGCTATGAGTTCAGTACAACA ++SRX079804:1:SRR292678:1:1101:198993:198993 2:N:0:1 BH:failed +<<<=;@B??@<>@><48876EADEG6B.BB@.?+98204<:<>@?A=@EFEFFFEEFB +@SRX079804:1:SRR292678:1:1101:204480:204480 1:N:0:1 BH:failed +AGTGAGACACCCCTGAACATTCCTAGTAAGACATCTTTGAATATTACTAGTTAGCCACACTTTAAAATGACCCG ++SRX079804:1:SRR292678:1:1101:204480:204480 1:N:0:1 BH:failed +<98;<@@@:@CD@BCCDD=DBBCEBBAAA@9???@BCDBCGF=GEGDFGDBEEEEEFFFF=EDEE=DCD@@BBC +@SRX079804:1:SRR292678:1:1101:212327:212327 2:N:0:1 BH:failed +TTACCTCTGCTTTTTCGCCTGTTACTTCTACTAATCCTTCATCTATTGCGAATGGCCCTACTACTGACGAAAT ++SRX079804:1:SRR292678:1:1101:212327:212327 2:N:0:1 BH:failed +DBCCC@@;A@BDCCCE>BBED>GDCDBFBFFEBEECFGGD@@BCB<<8@;09746:@@>@EEECEEDE/FEED +@SRX079804:1:SRR292678:1:1101:230386:230386 1:N:0:1 BH:failed +TTCTGATTGGAGTGAGAGTGCCATTTGTTTCGCTGATTGGACGTTGGAAAGCGCCTTGACCTTTGACAGCAG ++SRX079804:1:SRR292678:1:1101:230386:230386 1:N:0:1 BH:failed +=BDD=DCDBBCDADD@@B;B@CC7C@B@>=BACDD,=??8DDCDD7CCCCDBDEDBDDDDDEEBDDDBD?DB +@SRX079804:1:SRR292678:1:1101:236093:236093 1:N:0:1 BH:failed +CATCTAACTCTCATTCTAGATTCTTAAGTTGGCTACACTTTGCCGTCATTCTCGGTGGATTAGCTATTGGG ++SRX079804:1:SRR292678:1:1101:236093:236093 1:N:0:1 BH:failed +A:@A@;@BB@GGFGG@A@@817729B +@SRX079804:1:SRR292678:1:1101:251912:251912 2:N:0:1 BH:ok +TGTAAATGGAAATGAACCTAATATGTATGCACAAACTATTAAAGCATATCTTGCAAAAGGAGCAATGGCG ++SRX079804:1:SRR292678:1:1101:251912:251912 2:N:0:1 BH:ok +GHFDHHHGHHHFFHHGHHHGGGGGGDEFFEGGGGEFHHFHGFGHHHHFHHDDD6@=DCACFFGGEFBBG +@SRX079804:1:SRR292678:1:1101:278698:278698 1:N:0:1 BH:ok +CTAATAATGGTAATTGAACCATAGAAGATAAGTTCATAATGTAATAAATACATCCATAGAGTTATTAA ++SRX079804:1:SRR292678:1:1101:278698:278698 1:N:0:1 BH:ok +DDBDBCCCDD@FFFB9<<<@DA=DA@B:@=@@AC@GGFCGECFFDGGCGFFGGFFCEBF9>?@>BDFF +@SRX079804:1:SRR292678:1:1101:295878:295878 1:N:0:1 BH:changed:1 +AATGCAAACAGGATGATATTTGAATCCGTAATACTGTTCTTTCATCATAAATAATTTATGCAGATAC ++SRX079804:1:SRR292678:1:1101:295878:295878 1:N:0:1 BH:changed:1 +HHHHHEGFHDGFEGBCBEEEGGGG@EDGCGBBBEGF4?EFDBDDBFE8DEE-E?EE;B@EFC=;FDE +@SRX079804:1:SRR292678:1:1101:306575:306575 1:N:0:1 BH:failed +ATCTTTGAACTTTGTCTCACTTTCCCCCATCTCACCATTCCTCCTGTTCTGTGAACCCCAGTTTCA ++SRX079804:1:SRR292678:1:1101:306575:306575 1:N:0:1 BH:failed +E::EA@E<6B8>97:<6084649?@:?EDED=BEBEGGFEDGECECBDEFDDEEGGEDDFFF +@SRX079804:1:SRR292678:1:1101:403661:403661 1:N:0:1 BH:failed +GATGGCTTTGCTTTCTCATTCTCCTCTCCATCGTTCCCATCTTCGCCCTCAGACGCTGATTGAT ++SRX079804:1:SRR292678:1:1101:403661:403661 1:N:0:1 BH:failed +EG=DFFDFFDDGDGGGGGFFGGGGBADADCEEE5EC>CCCE6BEEEGGGGCBEGGE@9BCEF;>>D@D +@SRX079804:1:SRR292678:1:1101:425870:425870 1:N:0:1 BH:failed +GTACAGCTCTCCTCGTTACCAGCACATCTTGGACACCCGACGAAGACTTTGACATGCTCCTC ++SRX079804:1:SRR292678:1:1101:425870:425870 1:N:0:1 BH:failed +E@EED@EF=D>=EED@D@7DBF +@SRX079804:1:SRR292678:1:1101:429745:429745 1:N:0:1 BH:failed +GGCGATTGTGAAGGCATAAGAGTGGGACATAGTTCAAGTCCAGAACGAATTAAACGCACAA ++SRX079804:1:SRR292678:1:1101:429745:429745 1:N:0:1 BH:failed +BB2B=A@A:BFBFFFFF +@SRX079804:1:SRR292678:1:1101:475293:475293 2:N:0:1 BH:ok +TACATTTGATTTCTTTATAAGATTTCTTACTGTAAAATCATCGCTATTTAACAGCTTATT ++SRX079804:1:SRR292678:1:1101:475293:475293 2:N:0:1 BH:ok +FHFFFFDC@FGFEDGE?EEDC6EEEDEF?EEEE8EHHGGHFGFFEGGGGBFBDBDEBCBCA +@SRX079804:1:SRR292678:1:1101:511594:511594 1:N:0:1 BH:ok +AGCACAATCAGATTCGCTTATGACGGCGATGAAGAAATTGCGATGAAATGTGAGGTGA ++SRX079804:1:SRR292678:1:1101:511594:511594 1:N:0:1 BH:ok +HCHFFHHHGFHBFFFEGFFEFHFEHGBGECHEHB?CDDEFCDBFF9DFCD.BC +@SRX079804:1:SRR292678:1:1101:527839:527839 2:N:0:1 BH:changed:4 +GACATTTCTTTCTGGGAAGGCTTATTAAACGATAAAGATAATGATATACGTTTTGAT ++SRX079804:1:SRR292678:1:1101:527839:527839 2:N:0:1 BH:changed:4 +>35/-;,><04%'A.?4?:>BE.DC-@???CE:@EDFDBDG7B;=<)?1.@?2A5<5 +@SRX079804:1:SRR292678:1:1101:547309:547309 1:N:0:1 BH:failed +AGGTCTGCACTGGCTCTTCAGAGCGCAAGCGAGGGAAGAGAAATATAGTGACGCAC ++SRX079804:1:SRR292678:1:1101:547309:547309 1:N:0:1 BH:failed +GEGFGFD=FDB8B7DDFFFF@/DC@+2:66>-@>9);<2: +@SRX079804:1:SRR292678:1:1101:570367:570367 1:N:0:1 BH:failed +CTATGGCCACTTTTGTATTCTCGATTGAGGTTATTCGCTCACCCATCTTTTCCAA ++SRX079804:1:SRR292678:1:1101:570367:570367 1:N:0:1 BH:failed +FFEE;FFFFBFFFFF<9BEDEBEACDD3DD0B5>>0?:@>FFBEEBEFFFFGCCBCDGIGDGHEFGG=GGGGHFHEHF@FEFE?CGEEEFADEAEC +@SRX079804:1:SRR292678:1:1101:590521:590521 1:N:0:1 BH:failed +CCTCTTCTCTTTCGCGGTCATCTTGGGTTTCGCGCCTTTCTTCTTGACGACAC ++SRX079804:1:SRR292678:1:1101:590521:590521 1:N:0:1 BH:failed +EGFDD@FDF=FDFFEEGGBEFDGF=FGAGEEEEEDFBGGEE8EEE@>5;>90< +@SRX079804:1:SRR292678:1:1101:601307:601307 1:N:0:1 BH:changed:1 +TTGGCGTGCTGATGATTATCGGTATCTTCAAAGGCGCGCAGCCTGCGGGCTG ++SRX079804:1:SRR292678:1:1101:601307:601307 1:N:0:1 BH:changed:1 +GGGEGFGGEGE:EE>GFFGGGGDCGEBFFF>G=EBFFEC?DFGAD?DDECBE +@SRX079804:1:SRR292678:1:1101:631057:631057 1:N:0:1 BH:failed +CACCATTATCCTATTTCTGAACACATTTGACAGTCACGGCACTAGCATTGG ++SRX079804:1:SRR292678:1:1101:631057:631057 1:N:0:1 BH:failed +GGGGGGGGBFGGGEGFGGGGGFGEFFDFFFF?EBFEEDBFGEE@BE;E?E7 +@SRX079804:1:SRR292678:1:1101:654270:654270 1:N:0:1 BH:failed +ATCTTTCTCTCTCTGCTGCATTCTCCGCTTCAGCTCCTCAATTTCAATCA ++SRX079804:1:SRR292678:1:1101:654270:654270 1:N:0:1 BH:failed +GGGGGGEDGGFEGDGCGGBCEFGGGGFBFGGEGGGGGEGFCFEEGGFEDB +@SRX079804:1:SRR292678:1:1101:667761:667761 1:N:0:1 BH:failed +CAGCCTTTTGAGGTCGTCTATCGCAGCGTGTCCGCGACGTTTTGTTGCG ++SRX079804:1:SRR292678:1:1101:667761:667761 1:N:0:1 BH:failed +GGFGGG=GGGG@GFGGG@GGEGFGGGEGGGFGGEG@EEEDE8EE=E=DE +@SRX079804:1:SRR292678:1:1101:671526:671526 1:N:0:1 BH:ok +CAAGTAGAAGGTTAGCGCCTCTCTGTAAAAGGAGTCAAGCGCTATGTC ++SRX079804:1:SRR292678:1:1101:671526:671526 1:N:0:1 BH:ok +HHHHHGHEHHDGGGCGGGGHFGGGGEDGGGGGBEBEBEEEFGHFHFHH +@SRX079804:1:SRR292678:1:1101:685633:685633 2:N:0:1 BH:failed +GCTGTAATGGATCCACTAATTGGGACAGTGGTGGATAAAACGAATAC ++SRX079804:1:SRR292678:1:1101:685633:685633 2:N:0:1 BH:failed +DADADCGFEFEEEGEGGEFDEEEEBDAC;C10<9?AGBGGGFF7DFB +@SRX079804:1:SRR292678:1:1101:769626:769626 1:N:0:1 BH:failed +GTATTCCTTGACAGTCGAAAGAATCACTGCTAACCCAGGC ++SRX079804:1:SRR292678:1:1101:769626:769626 1:N:0:1 BH:failed +C>5BBCCCD=ACDD@A7@@B@A?B?=8B??EDEEDEBDEE +@SRX079804:1:SRR292678:1:1101:776222:776222 1:N:0:1 BH:failed +AGAACGTACACCCTACGCTAAGCAGTGGCTCCATGCCAA ++SRX079804:1:SRR292678:1:1101:776222:776222 1:N:0:1 BH:failed +BGDGGGGFEGGGGEGGGGFGFF6=FCFAFFEE,=C?EDD +@SRX079804:1:SRR292678:1:1101:782183:782183 2:N:0:1 BH:failed +TACGCGTAACGACGTCATAGCCATGACGCTTCAATAAA ++SRX079804:1:SRR292678:1:1101:782183:782183 2:N:0:1 BH:failed +D +@SRX079804:1:SRR292678:1:1101:828383:828383 1:N:0:1 BH:failed +AAACCACATATGACATGAGTGACGGGACTAAAGTTC ++SRX079804:1:SRR292678:1:1101:828383:828383 1:N:0:1 BH:failed +FFEEFEEB=E,C>CDEEEECEBEEEC?F;BDDDDAE +@SRX079804:1:SRR292678:1:1101:829239:829239 1:N:0:1 BH:ok +TCGATCCTTCTGCCTCAAAGTATACTAGGACGCAT ++SRX079804:1:SRR292678:1:1101:829239:829239 1:N:0:1 BH:ok +GGGDFGGBGFFEBFEDCBCDCGGGGBEEE=GE?EE +@SRX079804:1:SRR292678:1:1101:868419:868419 1:N:0:1 BH:ok +ATTCGTCAGGCCCAATAACATCATGAATTTCCAG ++SRX079804:1:SRR292678:1:1101:868419:868419 1:N:0:1 BH:ok +DEEEEEEEBDFFFFFFFF8FEED8@FFFBFFEFF +@SRX079804:1:SRR292678:1:1101:892716:892716 2:N:0:1 BH:ok +AATTTACCTAATGGAATCAATGAGGCTACTCCA ++SRX079804:1:SRR292678:1:1101:892716:892716 2:N:0:1 BH:ok +@BCBBBCCBCCCCCCCDDAAFFGEEEDBF@EE@ +@SRX079804:1:SRR292678:1:1101:893159:893159 2:N:0:1 BH:ok +TTCTCTGCTTTTCATATCTTGTCATAAAAATT ++SRX079804:1:SRR292678:1:1101:893159:893159 2:N:0:1 BH:ok +CBDCDEEEEEBEEEEGDDFDEEEEDGFFEGFE +@SRX079804:1:SRR292678:1:1101:918742:918742 1:N:0:1 BH:failed +CTCTCCATGCACAAAGAATATCACAGCCAAA ++SRX079804:1:SRR292678:1:1101:918742:918742 1:N:0:1 BH:failed +EEEBA?@;B@EEE@BEE=?EDDDDADCDA?E +@SRX079804:1:SRR292678:1:1101:923787:923787 2:N:0:1 BH:ok +TTGTGAAGGATGGGATATTAGTGTAGATGA ++SRX079804:1:SRR292678:1:1101:923787:923787 2:N:0:1 BH:ok +EEBBEGEEE=BBB<@DCDCGD@D>=DEGEE +@SRX079804:1:SRR292678:1:1101:933189:933189 1:N:0:1 BH:failed +GTCTGCACTATCGAGGGCTGTGCCTTTGC ++SRX079804:1:SRR292678:1:1101:933189:933189 1:N:0:1 BH:failed +FEFFDBFF8FE>?DFFFCEBCEEBBEDE6 +@SRX079804:1:SRR292678:1:1101:937136:937136 1:N:0:1 BH:failed +TTTCTTTGGCTTAAAGATAGTTTTAGTC ++SRX079804:1:SRR292678:1:1101:937136:937136 1:N:0:1 BH:failed +EFFFEEEEFCBCDDDDE@/E?@@7@@3< +@SRX079804:1:SRR292678:1:1101:940351:940351 1:N:0:1 BH:changed:1 +TGCCGTGGGAATGACAAACAAGCATCC ++SRX079804:1:SRR292678:1:1101:940351:940351 1:N:0:1 BH:changed:1 +DECC@GFFBF=EBEAFDFGD?FFF8FF +@SRX079804:1:SRR292678:1:1101:940693:940693 1:N:0:1 BH:failed +CACATTATGAACTATGGGCACTGCAT ++SRX079804:1:SRR292678:1:1101:940693:940693 1:N:0:1 BH:failed +EEEGFDEDFEGGGGGFEGBGGGFGGG +@SRX079804:1:SRR292678:1:1101:955819:955819 1:N:0:1 BH:failed +CACCTAGCAGCAACGGACGAGTCAG ++SRX079804:1:SRR292678:1:1101:955819:955819 1:N:0:1 BH:failed +GGGGGEEEGGEGGGFGEGG;F@EFF +@SRX079804:1:SRR292678:1:1101:958051:958051 2:N:0:1 BH:ok +TTAATATTTCCATCTGAACTTCGC ++SRX079804:1:SRR292678:1:1101:958051:958051 2:N:0:1 BH:ok +EDDBGFEGFGHHFHGGEDEGBGDB +@SRX079804:1:SRR292678:1:1101:996098:996098 1:N:0:1 BH:failed +CTAAGAGAGTTTGTAATGCGGAC ++SRX079804:1:SRR292678:1:1101:996098:996098 1:N:0:1 BH:failed +DD=DBDBDC4EFFFD@?CD@ACD +@SRX079804:1:SRR292678:1:1101:1020278:1020278 2:N:0:1 BH:ok +AAAGTGCAGAACATGCAGATAT ++SRX079804:1:SRR292678:1:1101:1020278:1020278 2:N:0:1 BH:ok +D>AC?GDDCD?DDADE@GABDG +@SRX079804:1:SRR292678:1:1101:1022234:1022234 1:N:0:1 BH:ok +GACCTTTCCGCAAGCTGTCGC ++SRX079804:1:SRR292678:1:1101:1022234:1022234 1:N:0:1 BH:ok +HHHHFEHHHHGGHHHGDHEEG +@SRX079804:1:SRR292678:1:1101:1024144:1024144 1:N:0:1 BH:failed +TCCATTATGAAAGAAGAAAA ++SRX079804:1:SRR292678:1:1101:1024144:1024144 1:N:0:1 BH:failed +@A><96:6: +@SRX079804:1:SRR292678:1:1101:1175112:1175112 1:N:0:1 BH:failed +AGGCC ++SRX079804:1:SRR292678:1:1101:1175112:1175112 1:N:0:1 BH:failed +EC8EE +@SRX079804:1:SRR292678:1:1101:1182927:1182927 1:N:0:1 BH:changed:1 +TGAA ++SRX079804:1:SRR292678:1:1101:1182927:1182927 1:N:0:1 BH:changed:1 +GGBH +@SRX079804:1:SRR292678:1:1101:1243474:1243474 1:N:0:1 BH:ok +ACG ++SRX079804:1:SRR292678:1:1101:1243474:1243474 1:N:0:1 BH:ok +EEE +@SRX079804:1:SRR292678:1:1101:1266246:1266246 1:N:0:1 BH:failed +AA ++SRX079804:1:SRR292678:1:1101:1266246:1266246 1:N:0:1 BH:failed +C@ +@SRX079804:1:SRR292678:1:1101:1269735:1269735 1:N:0:1 BH:failed +C ++SRX079804:1:SRR292678:1:1101:1269735:1269735 1:N:0:1 BH:failed +G diff --git a/example_fastq.fastq:Zone.Identifier b/example_fastq.fastq:Zone.Identifier new file mode 100644 index 0000000..1bf0b28 --- /dev/null +++ b/example_fastq.fastq:Zone.Identifier @@ -0,0 +1,3 @@ +[ZoneTransfer] +ZoneId=3 +HostUrl=https://github.com/ diff --git a/misc_module b/misc_module new file mode 160000 index 0000000..d29d55c --- /dev/null +++ b/misc_module @@ -0,0 +1 @@ +Subproject commit d29d55c0582e79c291546bcf9594f22155295f5e From abc3bd63ec15e3c900fbfba04e86fd099a4ef2ca Mon Sep 17 00:00:00 2001 From: Elena Smertina <104040609+sme229@users.noreply.github.com> Date: Fri, 23 Feb 2024 17:58:52 +1100 Subject: [PATCH 08/22] Delete modules directory --- modules/fastq_filter_functions.py | 106 ------------------------ modules/nucleic_acid_module.py | 125 ---------------------------- modules/protein_module.py | 130 ------------------------------ 3 files changed, 361 deletions(-) delete mode 100644 modules/fastq_filter_functions.py delete mode 100644 modules/nucleic_acid_module.py delete mode 100644 modules/protein_module.py diff --git a/modules/fastq_filter_functions.py b/modules/fastq_filter_functions.py deleted file mode 100644 index 758db74..0000000 --- a/modules/fastq_filter_functions.py +++ /dev/null @@ -1,106 +0,0 @@ -def calc_gc_content(seq: str) -> float: - """ - Calculates gc content - Argument is string - Returns float in % - """ - seq_lower = seq.lower() - length_seq = len(seq_lower) - gc_count = 0 - for nt in seq_lower: - if nt=='g' or nt=='c': - gc_count+=1 - gc_content = (gc_count/length_seq)*100 - return gc_content -def seq_length(seq: str) -> str: - """ - Calculates sequence length - Argument is string - Returns string - """ - return len(seq) -def quality_score(seq: str) -> int: - """ - Calculates numeric quality score - Argument is string - Returns int value - """ - score_count = 0 - length_q_seq = len(seq) - for symbol in seq: - score_num = ord(symbol) - 33 - score_count+=score_num - mean_qs = (score_count/length_q_seq) - return mean_qs -def length_filter(seqs: dict, length_bounds=(0,1000)) -> dict: - """ - Filters fastq reads by length - Arguments: - -dictionary - -sequence length parameters (>= and <=) - Returns filtered dictionary - """ - #seqs = {'name': ('sequence', 'quality')} - output = [] - result = dict() - for name, (sequence, quality) in seqs.items(): - - if seq_length(sequence) <= length_bounds[1] and seq_length(sequence) >= length_bounds[0]: - output.append(name) - if name in output: - result[name] = (sequence, quality) - - return result -def quality_filter(seqs: dict, quality_threshold=25) -> dict: - """ - Filters fastq reads by quality score - Arguments: - -dictionary - -quality score threshold (>=) - Returns filtered dict - """ - #seqs = {'name': ('sequence', 'quality')} - output = [] - result = dict() - for name, (sequence, quality) in seqs.items(): - - if quality_score(quality) >= quality_threshold: - output.append(name) - if name in output: - result[name] = (sequence, quality) - - return result -def gc_filter(seqs: dict, gc_bounds=(0,100)) -> dict: - """ - Filters fastq reads by gc content - Arguments: - -dict - -gc content parameters (>= and <=) - Returns filtered dict - """ - #seqs = {'name': ('sequence', 'quality')} - output = [] - result = dict() - for name, (sequence, quality) in seqs.items(): - - if calc_gc_content(sequence) >= gc_bounds[0] and calc_gc_content(sequence) <= gc_bounds[1]: - output.append(name) - if name in output: - result[name] = (sequence, quality) - - return result -def fastq_filter(seqs: dict, gc_bounds: int, length_bounds: int, quality_threshold: int) -> dict: - """ - Filters fastq sequence by gc content, length and quality score - Arguments: dict with fastq sequences, filtering parameters - Returns filtered dictionary - """ - resulting_sequences = dict() - gc_filtered = gc_filter(seqs, gc_bounds = (0,100)) - length_filtered = length_filter(seqs, length_bounds = (0,1000)) - quality_filtered = quality_filter(seqs, quality_threshold = 15) - intersection = gc_filtered.keys() & length_filtered.keys() & quality_filtered.keys() - #intersection = {keys: gc_filtered[keys] for keys in gc_filtered.keys() & length_filtered.keys()} - #for keys, (sequence, quality) in intersection: - # resulting_sequences[keys] = (sequence, quality) - return intersection diff --git a/modules/nucleic_acid_module.py b/modules/nucleic_acid_module.py deleted file mode 100644 index c24d2bd..0000000 --- a/modules/nucleic_acid_module.py +++ /dev/null @@ -1,125 +0,0 @@ -def transcribe(seq: str) -> str: - """ - Transcribes DNA->RNA - Argument is string - Return is string - """ - list_input = list(seq) - for i in range(len(seq)): - if (list_input[i] == 'T'): - list_input[i] = 'U' - elif (list_input[i] == 't'): - list_input[i]='u' - return "".join(list_input) - - -def reverse(seq: str) -> str: - """ - Returns reversed sequence - Argument is string - Return is string - """ - output = seq[::-1] - return output - - -def complement(seq: str) -> str: - """ - Returns a complementary sequence - Argument is a string - Return is a string - """ - list_input = list(seq) - for i in range(len(seq)): - if (list_input[i]=='G'): - list_input[i]='C' - elif (list_input[i]== 'g'): - list_input[i]='c' - elif (list_input[i]=='C'): - list_input[i]='G' - elif (list_input[i]=='c'): - list_input[i]='g' - elif (list_input[i] == 'T'): - list_input[i] = 'A' - elif (list_input[i] == 't'): - list_input[i]='a' - elif (list_input[i] == 'A'): - list_input[i] = 'T' - elif (list_input[i]=='a'): - list_input[i]='t' - - else: - - list_input = list(seq) - for i in range(len(seq)): - if (list_input[i]=='G'): - list_input[i]='C' - elif (list_input[i]== 'g'): - list_input[i]='c' - elif (list_input[i]=='C'): - list_input[i]='G' - elif (list_input[i]=='c'): - list_input[i]='g' - elif (list_input[i] == 'U'): - list_input[i] = 'A' - elif (list_input[i] == 'u'): - list_input[i]='a' - elif (list_input[i] == 'A'): - list_input[i] = 'U' - elif (list_input[i]=='a'): - list_input[i]='u' - return "".join(list_input) - - -def check_nucleic_acid(seq: str) -> str: - """ - This function checks whether input sequence(s) is a nucleic acid - Argument is str - Return is str - """ - unique_chars = set(seq) - nucleotides_dna = set('ATGCatgc') - nucleotides_rna = set('AUGCaugc') - if unique_chars <= nucleotides_dna: - seq = 'dna' - elif unique_chars <= nucleotides_rna: - seq = 'rna' - else: - raise ValueError("Invalid Input") - return seq - - -def reverse_complement(seq: str) -> str: - """ - This function returns a reversed complementary sequence - Argument is str - Return is str - """ - complement_seq = complement(seq) - reverse_compl_seq = reverse(complement_seq) - return reverse_compl_seq - - -def run_dna_rna_tools(*args: str, function: str) -> str: - """ - This function combines the functions above - Arguments: *args are input sequences, function is a function of choice - Returns: str, processed seqeunces depending on the function chosen - """ - results = [] - for seq in args: - check_nucleic_acid(seq) - if function == 'transcribe': - results.append(transcribe(seq)) - if function == 'complement': - results.append(complement(seq)) - if function == 'reverse': - results.append(reverse(seq)) - if function == 'reverse_complement': - results.append(reverse_complement(seq)) - if len(results) == 1: - results = results[0] - - return results - - diff --git a/modules/protein_module.py b/modules/protein_module.py deleted file mode 100644 index d71da27..0000000 --- a/modules/protein_module.py +++ /dev/null @@ -1,130 +0,0 @@ -from typing import Optional - -aa_code_dict = {'C':'Cys', 'c':'Cys', 'D':'Asp', 'd':'Asp', 'S':'Ser', 's':'Ser', 'Q':'Gln', 'q':'Gln', - 'K':'Lys', 'k':'Lys', 'I':'Ile', 'i':'Ile', 'P':'Pro', 'p':'Pro', 'T':'Thr', 't':'Thr', - 'F':'Phe', 'f':'Phe', 'N':'Asn', 'n':'Asn', 'G':'Gly', 'g':'Gly', 'H':'His', 'h':'His', - 'L':'Leu', 'l':'Leu', 'R':'Arg', 'r':'Arg', 'W':'Trp', 'w':'Trp', 'A':'Ala', 'a':'Ala', - 'V':'Val', 'v':'Val', 'E':'Glu', 'e':'Glu', 'Y':'Tyr', 'y':'Tyr', 'M':'Met', 'm':'Met'} - -aa_weight_dict = {'G':75, 'g':75, 'A':89, 'a':89, 'R':174, 'r':174, 'N':132, 'n':132, - 'D':133, 'd':133, 'C':121, 'c':133, 'E':147, 'e':147, 'Q':146, 'q':146, - 'H':155, 'h':155, 'I':131, 'i':131, 'L':131, 'l':131, 'K':146, 'k':146, - 'M':149, 'm':149, 'F':165, 'f':165, 'P':115, 'p':115, 'S':105, 's':105, - 'T':119, 't':119, 'W':204, 'w':204, 'Y':181, 'y':181, 'V':117, 'v':117} -def amino_acid_frequency(seq: str) -> dict: - """ - Calculates amino acid frequencies - Arguments: - -seq (str) input protein sequence - Return: - -dictionary with amino acid and its frequency - """ - freq_dict = {} - for letter in seq: - if letter in freq_dict: - freq_dict[letter] += 1 - else: - freq_dict[letter] = 1 - for letter in freq_dict: - freq_dict[letter] = round(freq_dict[letter] / len(seq) * 100, 2) - return freq_dict - - -def find_motifs(seq: str, motif: str): - """ - Finds a motif of interest in a protein sequence - Arguments: - -seq (str) input protein sequence - -motif (str) motif to be found in sequence - Return: - -position(s) of the motif in seq - """ - positions = [] - for i in range(len(seq) - len(motif) + 1): - window = seq[i:i+len(motif)] - if window == motif: - positions.append(i) - return positions - - -def check_protein_seq(seq: str) -> str: - """ - Checks whether a sequence is written using 1-letter amino acid code - Arguments: - -seq (str) input protein sequence - Return: - - str, 'single_letter_prot_seq' otherwise 'Invalid Input' error is raised - """ - unique_chars = set(seq) - single_letter = set('GALMFWKQESPVICYHRNDTgalmfwkqespvicyhrndt') - - if unique_chars <= single_letter: - seq = 'single_letter_prot_seq' - - else: - raise ValueError("Invalid Input") - return seq - - -def molecular_weight(seq: str) -> int: - """ - Calculates molecular weight of a protein - Arguments: - - seq (str) 1-letter coded protein sequence - Return: - - int, molecular weight (g/mol) rounded to integer - """ - list_input_seq = list(seq) - water_mw = 18 - for aa in list_input_seq: - total_mw = sum(aa_weight_dict[a] for a in list_input_seq) - mw_water_removed = (total_mw - (water_mw * (len(list_input_seq)-1))) - return mw_water_removed - - -def one_to_three_letter(seq: str) -> str: - """ - Converts a 1-letter amino acid code sequence into a 3-letter sequence - Arguments: - - seq (str) sequence to convert, must be 1-letter coded protein sequence - Return: - - str, a 3-letter coded protein sequence without spaces - """ - three_letter_aa = '' - for aa in seq: - three_letter_aa_seq += aa_code_dict[aa] - return three_letter_aa_seq - - -def run_protein_tool(*args: str, function: str, motif: Optional[str]=None): - """ - This is the main function - Arguments: - -seq(str) protein sequence(s) - -function(str) specify the function - -motif(str), optional argument for find_motifs function - Return: - -result of the specified function - """ - results = [] - for seq in args: - if check_protein_seq(seq) == 'single_letter_prot_seq': - if function == 'check_protein_seq': - for seq in args: - results.append(check_protein_seq(seq)) - elif function == 'molecular_weight': - for seq in args: - results.append(molecular_weight(seq)) - elif function == 'one_to_three_letter': - for seq in args: - results.append(one_to_three_letter(seq)) - elif function == 'amino_acid_frequency': - for seq in args: - results.append(amino_acid_frequency(seq)) - elif function == 'find_motifs': - for seq in args: - results.append(find_motifs(seq, motif)) - if len(results) == 1: - results = results[0] - return results - From 3b1af5cd09f72db92f6c4fc6ad1302a5c3373bf9 Mon Sep 17 00:00:00 2001 From: Elena Smertina <104040609+sme229@users.noreply.github.com> Date: Fri, 23 Feb 2024 18:01:04 +1100 Subject: [PATCH 09/22] Delete miscellaneous.py --- miscellaneous.py | 71 ------------------------------------------------ 1 file changed, 71 deletions(-) delete mode 100644 miscellaneous.py diff --git a/miscellaneous.py b/miscellaneous.py deleted file mode 100644 index 540510b..0000000 --- a/miscellaneous.py +++ /dev/null @@ -1,71 +0,0 @@ -def fastq_filter(seqs: dict, gc_bounds: int, length_bounds: int, quality_threshold: int) -> dict: - """ - Filters fastq sequence by gc content, length and quality score - Arguments: dict with fastq sequences, filtering parameters - Returns filtered dictionary - """ - result = dict() - gc_filtered = gc_filter(seqs, gc_bounds) - length_filtered = length_filter(seqs, length_bounds) - quality_filtered = quality_filter(seqs, quality_threshold) - intersection = gc_filtered.keys() & length_filtered.keys() & quality_filtered.keys() - - for keys, (sequence, quality) in seqs.items(): - if keys in intersection: - result[keys] = (sequence, quality) - return result - - -def run_dna_rna_tools(*args: str, function: str) -> str: - """ - This function combines the functions above - Arguments: *args are input sequences, function is a function of choice - Returns: str, processed seqeunces depending on the function chosen - """ - results = [] - for seq in args: - check_nucleic_acid(seq) - if function == 'transcribe': - results.append(transcribe(seq)) - if function == 'complement': - results.append(complement(seq)) - if function == 'reverse': - results.append(reverse(seq)) - if function == 'reverse_complement': - results.append(reverse_complement(seq)) - if len(results) == 1: - results = results[0] - return results - - -def run_protein_tool(*args: str, function: str, motif: Optional[str]=None): - """ - This is the main function - Arguments: - -seq(str) protein sequence(s) - -function(str) specify the function - -motif(str), optional argument for find_motifs function - Return: - -result of the specified function - """ - results = [] - for seq in args: - if check_protein_seq(seq) == 'single_letter_prot_seq': - if function == 'check_protein_seq': - for seq in args: - results.append(check_protein_seq(seq)) - elif function == 'molecular_weight': - for seq in args: - results.append(molecular_weight(seq)) - elif function == 'one_to_three_letter': - for seq in args: - results.append(one_to_three_letter(seq)) - elif function == 'amino_acid_frequency': - for seq in args: - results.append(amino_acid_frequency(seq)) - elif function == 'find_motifs': - for seq in args: - results.append(find_motifs(seq, motif)) - if len(results) == 1: - results = results[0] - return results From 92774b21a6c8391db186956873325dff12bda676 Mon Sep 17 00:00:00 2001 From: Elena Smertina <104040609+sme229@users.noreply.github.com> Date: Fri, 23 Feb 2024 18:03:40 +1100 Subject: [PATCH 10/22] Delete updated_HW5.py --- updated_HW5.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 updated_HW5.py diff --git a/updated_HW5.py b/updated_HW5.py deleted file mode 100644 index e69de29..0000000 From ca3388da69d2efba7cb4af90d4faf80ea94ab6b7 Mon Sep 17 00:00:00 2001 From: lsmertina Date: Fri, 23 Feb 2024 18:17:06 +1100 Subject: [PATCH 11/22] add requirements txt --- requirements.txt | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..b96f261 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +bioframe==0.5.1 +biopython==1.81 \ No newline at end of file From 0fca5dcd8ae1628ca1eded469d4e513a181f92c4 Mon Sep 17 00:00:00 2001 From: Elena Smertina <104040609+sme229@users.noreply.github.com> Date: Fri, 23 Feb 2024 18:20:03 +1100 Subject: [PATCH 12/22] Create requirements.txt --- requirements.txt | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..10211e5 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +bioframe==0.5.1 +biopython==1.81 From 8a853c6c6457d5584928592d5e9c08f02ee4bb54 Mon Sep 17 00:00:00 2001 From: lsmertina Date: Fri, 23 Feb 2024 20:48:04 +1100 Subject: [PATCH 13/22] delete files --- biopython_gc_filter.ipynb | 77 ------ example_fastq.fastq | 356 ---------------------------- example_fastq.fastq:Zone.Identifier | 3 - requirements.txt | 2 - 4 files changed, 438 deletions(-) delete mode 100644 biopython_gc_filter.ipynb delete mode 100644 example_fastq.fastq delete mode 100644 example_fastq.fastq:Zone.Identifier delete mode 100644 requirements.txt diff --git a/biopython_gc_filter.ipynb b/biopython_gc_filter.ipynb deleted file mode 100644 index a3da0d8..0000000 --- a/biopython_gc_filter.ipynb +++ /dev/null @@ -1,77 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 7, - "id": "21c5a784", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "9\n" - ] - } - ], - "source": [ - "from Bio import SeqIO\n", - "from Bio.SeqUtils import GC\n", - "\n", - "def filter_fastq(input_path: str, quality_threshold: int, output_filename=\"final_filtered.fastq\", gc_bounds=(40, 60), length_bounds=(50, 350)):\n", - " filename = input_path\n", - " records = SeqIO.parse(filename, \"fastq\")\n", - " ###quality filter\n", - " good_reads = (rec for rec in records if min(rec.letter_annotations[\"phred_quality\"]) >= quality_threshold)\n", - " result_quality = SeqIO.write(good_reads, \"good_quality.fastq\", \"fastq\")\n", - " result_quality_GC = SeqIO.parse(\"good_quality.fastq\", \"fastq\")\n", - " \n", - " ###GC content filter\n", - " min_gc_content = gc_bounds[0]\n", - " max_gc_content = gc_bounds[1]\n", - " GC_quality_filt = []\n", - " \n", - " for sequence in result_quality_GC:\n", - " if min_gc_content <= GC(sequence.seq) <= max_gc_content:\n", - " GC_quality_filt.append(sequence)\n", - " \n", - " result_quality = SeqIO.write(GC_quality_filt, \"good_quality_GC.fastq\", \"fastq\")\n", - " result_quality_GC_length = SeqIO.parse(\"good_quality_GC.fastq\", \"fastq\")\n", - " \n", - " ##length filter\n", - " filtered_GC_quality_length = []\n", - " \n", - " for sequence in result_quality_GC_length:\n", - " if len(sequence.seq) >= length_bounds[0] and len(sequence.seq) <= length_bounds[1]:\n", - " filtered_GC_quality_length.append(sequence)\n", - " \n", - " result_quality = SeqIO.write(filtered_GC_quality_length, output_filename, \"fastq\")\n", - " \n", - " print(result_quality)\n", - "\n", - "#filter_fastq(\"example_fastq.fastq\", 15)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.12" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/example_fastq.fastq b/example_fastq.fastq deleted file mode 100644 index 883b51f..0000000 --- a/example_fastq.fastq +++ /dev/null @@ -1,356 +0,0 @@ -@SRX079804:1:SRR292678:1:1101:21885:21885 1:N:0:1 BH:ok -ACAGCAACATAAACATGATGGGATGGCGTAAGCCCCCGAGATATCAGTTTACCCAGGATAAGAGATTAAATTATGAGCAACATTATTAA -+SRX079804:1:SRR292678:1:1101:21885:21885 1:N:0:1 BH:ok -FGGGFGGGFGGGFGDFGCEBB@CCDFDDFFFFBFFGFGEFDFFFF;D@DD>C@DDGGGDFGDGG?GFGFEGFGGEF@FDGGGFGFBGGD -@SRX079804:1:SRR292678:1:1101:24563:24563 1:N:0:1 BH:failed -ATTAGCGAGGAGGAGTGCTGAGAAGATGTCGCCTACGCCGTTGAAATTCCCTTCAATCAGGGGGTACTGGAGGATACGAGTTTGTGTG -+SRX079804:1:SRR292678:1:1101:24563:24563 1:N:0:1 BH:failed -BFFFFFFFB@B@A<@D>BDDACDDDEBEDEFFFBFFFEFFDFFF=CC@DDFD8FFFFFFF8/+.2,@7<<:?B/:<><-><@.A*C>D -@SRX079804:1:SRR292678:1:1101:30161:30161 1:N:0:1 BH:failed -GAACGACAGCAGCTCCTGCATAACCGCGTCCTTCTTCTTTAGCGTTGTGCAAAGCATGTTTTGTATTACGGGCATCTCGAGCGAATC -+SRX079804:1:SRR292678:1:1101:30161:30161 1:N:0:1 BH:failed -DFFFEGDGGGGFGGEDCCDCEFFFFCCCCCB>CEBFGFBGGG?DE=:6@=>AD?D8DCEE:>EEABE5D@5:DDCA;EEE-DCD -@SRX079804:1:SRR292678:1:1101:47176:47176 1:N:0:1 BH:failed -TGAAGCGTCGATAGAAGTTAGCAAACCCGCGGAACTTCCGTACATCAGACACATTCCGGGGGGTGGGCCAATCCATGATGCCTTTG -+SRX079804:1:SRR292678:1:1101:47176:47176 1:N:0:1 BH:failed -FF@FFBEEEEFFEFFD@EDEFFB=DFEEFFFE8FFE8EEDBFDFEEBE+E46.'8.5::EE:?E>A6@?)>;>9D<C9DEBAAB=5C?<@0=A?D@BDB;:BA?BDDFH?B@DCB6BEBDA??AA9. -@SRX079804:1:SRR292678:1:1101:52180:52180 1:N:0:1 BH:failed -TTTTTGGTTTTAGGTCTAACATGTAAGTCTTTAATCTATTTTGAATTAATTTTTGGATGAGGTGTAAGGAAGGGATCCAGTTTC -+SRX079804:1:SRR292678:1:1101:52180:52180 1:N:0:1 BH:failed -FEFFFFFF=FEEFDFD>EE:?<5@BFAFCFEE7>C>:/6:2<344DA:6DDCDC>)34773DD?@DBA9B -@SRX079804:1:SRR292678:1:1101:105156:105156 1:N:0:1 BH:ok -ACTGCTGAGCTTAAATGGCGGCAGTCTGACGGTTACCAACGGGGGCACTTCAACCGGTTCGTTAACGGGGAGCGGAGAGCTGA -+SRX079804:1:SRR292678:1:1101:105156:105156 1:N:0:1 BH:ok -GFFEGGFGGGGEGGGGGGGGGFDD=DDE7EDD6CD?FEDEE@EBEFEE.DD5DDD@B<7>/0543C?BEE?@@BE@; -@SRX079804:1:SRR292678:1:1101:135168:135168 1:N:0:1 BH:ok -CCTTCCTAAAAATTAAGAATCTTAACAATTAGCAGCACAACCAAAATTATTACCGAAAGGACTTACTCCTCCGCCAAATCCA -+SRX079804:1:SRR292678:1:1101:135168:135168 1:N:0:1 BH:ok -GGGEGGGEGGBFFFFFGCFFEGG6AEEEDEEGGFGGCG4EGFFGDFFBGEGFGGFECBDEGGEFGBFF?CGFBFGGGGFGFF -@SRX079804:1:SRR292678:1:1101:149302:149302 1:N:0:1 BH:changed:1 -TAGGGTTGTATTTGCAGATCCATGGCATGCCAAAAAGAACATCGTCCCGTCCAATATCTGCAACATACCAGTTGGTTGGTA -+SRX079804:1:SRR292678:1:1101:149302:149302 1:N:0:1 BH:changed:1 -@;CBA=:@;@DBDCDEEE/EEEEEEF@>FBEEB=EFA>EEBD=DAEEEEB9)99>B99BC)@,@<9CDD=C,5;B::?@;A -@SRX079804:1:SRR292678:1:1101:170868:170868 2:N:0:1 BH:failed -CTGCCGAGACTGTTCTCAGACATGGAAAGCTCGATTCGCATACACTCGCTGAGTAAGAGAGTCACACCAAATCACAGATT -+SRX079804:1:SRR292678:1:1101:170868:170868 2:N:0:1 BH:failed -E;FFFEGFGIGGFBG;C6D<@C7CDGFEFGFHDFEHHHBBHHFDFEFBAEEEEDE@A2=DA:??C3:@>EEBEEHEFEHHFFHH?FGBGFBBD77B;;C?FFFFGGFED.BBABBG@DBBE -@SRX079804:1:SRR292678:1:1101:190845:190845 1:N:0:1 BH:changed:1 -CCTCAGCGTGGATTGCCGCTCATGCAGGAGCAGATAATCCCTTCGCCATCCCATTAAGCGCCGTTGTCGGTATTCC -+SRX079804:1:SRR292678:1:1101:190845:190845 1:N:0:1 BH:changed:1 -FF@FFCFEECEBEC@@BBBBDFBBFFDFFEFFEB8FFFFFFFFEFCEB/>BBA@AFFFEEEEECE;ACD@DBBEEE -@SRX079804:1:SRR292678:1:1101:198993:198993 2:N:0:1 BH:failed -AGTTATTTATGCATCATTCTCATGTATGAGCCAACAAGATAGTACAAGTTTTATTGCTATGAGTTCAGTACAACA -+SRX079804:1:SRR292678:1:1101:198993:198993 2:N:0:1 BH:failed -<<<=;@B??@<>@><48876EADEG6B.BB@.?+98204<:<>@?A=@EFEFFFEEFB -@SRX079804:1:SRR292678:1:1101:204480:204480 1:N:0:1 BH:failed -AGTGAGACACCCCTGAACATTCCTAGTAAGACATCTTTGAATATTACTAGTTAGCCACACTTTAAAATGACCCG -+SRX079804:1:SRR292678:1:1101:204480:204480 1:N:0:1 BH:failed -<98;<@@@:@CD@BCCDD=DBBCEBBAAA@9???@BCDBCGF=GEGDFGDBEEEEEFFFF=EDEE=DCD@@BBC -@SRX079804:1:SRR292678:1:1101:212327:212327 2:N:0:1 BH:failed -TTACCTCTGCTTTTTCGCCTGTTACTTCTACTAATCCTTCATCTATTGCGAATGGCCCTACTACTGACGAAAT -+SRX079804:1:SRR292678:1:1101:212327:212327 2:N:0:1 BH:failed -DBCCC@@;A@BDCCCE>BBED>GDCDBFBFFEBEECFGGD@@BCB<<8@;09746:@@>@EEECEEDE/FEED -@SRX079804:1:SRR292678:1:1101:230386:230386 1:N:0:1 BH:failed -TTCTGATTGGAGTGAGAGTGCCATTTGTTTCGCTGATTGGACGTTGGAAAGCGCCTTGACCTTTGACAGCAG -+SRX079804:1:SRR292678:1:1101:230386:230386 1:N:0:1 BH:failed -=BDD=DCDBBCDADD@@B;B@CC7C@B@>=BACDD,=??8DDCDD7CCCCDBDEDBDDDDDEEBDDDBD?DB -@SRX079804:1:SRR292678:1:1101:236093:236093 1:N:0:1 BH:failed -CATCTAACTCTCATTCTAGATTCTTAAGTTGGCTACACTTTGCCGTCATTCTCGGTGGATTAGCTATTGGG -+SRX079804:1:SRR292678:1:1101:236093:236093 1:N:0:1 BH:failed -A:@A@;@BB@GGFGG@A@@817729B -@SRX079804:1:SRR292678:1:1101:251912:251912 2:N:0:1 BH:ok -TGTAAATGGAAATGAACCTAATATGTATGCACAAACTATTAAAGCATATCTTGCAAAAGGAGCAATGGCG -+SRX079804:1:SRR292678:1:1101:251912:251912 2:N:0:1 BH:ok -GHFDHHHGHHHFFHHGHHHGGGGGGDEFFEGGGGEFHHFHGFGHHHHFHHDDD6@=DCACFFGGEFBBG -@SRX079804:1:SRR292678:1:1101:278698:278698 1:N:0:1 BH:ok -CTAATAATGGTAATTGAACCATAGAAGATAAGTTCATAATGTAATAAATACATCCATAGAGTTATTAA -+SRX079804:1:SRR292678:1:1101:278698:278698 1:N:0:1 BH:ok -DDBDBCCCDD@FFFB9<<<@DA=DA@B:@=@@AC@GGFCGECFFDGGCGFFGGFFCEBF9>?@>BDFF -@SRX079804:1:SRR292678:1:1101:295878:295878 1:N:0:1 BH:changed:1 -AATGCAAACAGGATGATATTTGAATCCGTAATACTGTTCTTTCATCATAAATAATTTATGCAGATAC -+SRX079804:1:SRR292678:1:1101:295878:295878 1:N:0:1 BH:changed:1 -HHHHHEGFHDGFEGBCBEEEGGGG@EDGCGBBBEGF4?EFDBDDBFE8DEE-E?EE;B@EFC=;FDE -@SRX079804:1:SRR292678:1:1101:306575:306575 1:N:0:1 BH:failed -ATCTTTGAACTTTGTCTCACTTTCCCCCATCTCACCATTCCTCCTGTTCTGTGAACCCCAGTTTCA -+SRX079804:1:SRR292678:1:1101:306575:306575 1:N:0:1 BH:failed -E::EA@E<6B8>97:<6084649?@:?EDED=BEBEGGFEDGECECBDEFDDEEGGEDDFFF -@SRX079804:1:SRR292678:1:1101:403661:403661 1:N:0:1 BH:failed -GATGGCTTTGCTTTCTCATTCTCCTCTCCATCGTTCCCATCTTCGCCCTCAGACGCTGATTGAT -+SRX079804:1:SRR292678:1:1101:403661:403661 1:N:0:1 BH:failed -EG=DFFDFFDDGDGGGGGFFGGGGBADADCEEE5EC>CCCE6BEEEGGGGCBEGGE@9BCEF;>>D@D -@SRX079804:1:SRR292678:1:1101:425870:425870 1:N:0:1 BH:failed -GTACAGCTCTCCTCGTTACCAGCACATCTTGGACACCCGACGAAGACTTTGACATGCTCCTC -+SRX079804:1:SRR292678:1:1101:425870:425870 1:N:0:1 BH:failed -E@EED@EF=D>=EED@D@7DBF -@SRX079804:1:SRR292678:1:1101:429745:429745 1:N:0:1 BH:failed -GGCGATTGTGAAGGCATAAGAGTGGGACATAGTTCAAGTCCAGAACGAATTAAACGCACAA -+SRX079804:1:SRR292678:1:1101:429745:429745 1:N:0:1 BH:failed -BB2B=A@A:BFBFFFFF -@SRX079804:1:SRR292678:1:1101:475293:475293 2:N:0:1 BH:ok -TACATTTGATTTCTTTATAAGATTTCTTACTGTAAAATCATCGCTATTTAACAGCTTATT -+SRX079804:1:SRR292678:1:1101:475293:475293 2:N:0:1 BH:ok -FHFFFFDC@FGFEDGE?EEDC6EEEDEF?EEEE8EHHGGHFGFFEGGGGBFBDBDEBCBCA -@SRX079804:1:SRR292678:1:1101:511594:511594 1:N:0:1 BH:ok -AGCACAATCAGATTCGCTTATGACGGCGATGAAGAAATTGCGATGAAATGTGAGGTGA -+SRX079804:1:SRR292678:1:1101:511594:511594 1:N:0:1 BH:ok -HCHFFHHHGFHBFFFEGFFEFHFEHGBGECHEHB?CDDEFCDBFF9DFCD.BC -@SRX079804:1:SRR292678:1:1101:527839:527839 2:N:0:1 BH:changed:4 -GACATTTCTTTCTGGGAAGGCTTATTAAACGATAAAGATAATGATATACGTTTTGAT -+SRX079804:1:SRR292678:1:1101:527839:527839 2:N:0:1 BH:changed:4 ->35/-;,><04%'A.?4?:>BE.DC-@???CE:@EDFDBDG7B;=<)?1.@?2A5<5 -@SRX079804:1:SRR292678:1:1101:547309:547309 1:N:0:1 BH:failed -AGGTCTGCACTGGCTCTTCAGAGCGCAAGCGAGGGAAGAGAAATATAGTGACGCAC -+SRX079804:1:SRR292678:1:1101:547309:547309 1:N:0:1 BH:failed -GEGFGFD=FDB8B7DDFFFF@/DC@+2:66>-@>9);<2: -@SRX079804:1:SRR292678:1:1101:570367:570367 1:N:0:1 BH:failed -CTATGGCCACTTTTGTATTCTCGATTGAGGTTATTCGCTCACCCATCTTTTCCAA -+SRX079804:1:SRR292678:1:1101:570367:570367 1:N:0:1 BH:failed -FFEE;FFFFBFFFFF<9BEDEBEACDD3DD0B5>>0?:@>FFBEEBEFFFFGCCBCDGIGDGHEFGG=GGGGHFHEHF@FEFE?CGEEEFADEAEC -@SRX079804:1:SRR292678:1:1101:590521:590521 1:N:0:1 BH:failed -CCTCTTCTCTTTCGCGGTCATCTTGGGTTTCGCGCCTTTCTTCTTGACGACAC -+SRX079804:1:SRR292678:1:1101:590521:590521 1:N:0:1 BH:failed -EGFDD@FDF=FDFFEEGGBEFDGF=FGAGEEEEEDFBGGEE8EEE@>5;>90< -@SRX079804:1:SRR292678:1:1101:601307:601307 1:N:0:1 BH:changed:1 -TTGGCGTGCTGATGATTATCGGTATCTTCAAAGGCGCGCAGCCTGCGGGCTG -+SRX079804:1:SRR292678:1:1101:601307:601307 1:N:0:1 BH:changed:1 -GGGEGFGGEGE:EE>GFFGGGGDCGEBFFF>G=EBFFEC?DFGAD?DDECBE -@SRX079804:1:SRR292678:1:1101:631057:631057 1:N:0:1 BH:failed -CACCATTATCCTATTTCTGAACACATTTGACAGTCACGGCACTAGCATTGG -+SRX079804:1:SRR292678:1:1101:631057:631057 1:N:0:1 BH:failed -GGGGGGGGBFGGGEGFGGGGGFGEFFDFFFF?EBFEEDBFGEE@BE;E?E7 -@SRX079804:1:SRR292678:1:1101:654270:654270 1:N:0:1 BH:failed -ATCTTTCTCTCTCTGCTGCATTCTCCGCTTCAGCTCCTCAATTTCAATCA -+SRX079804:1:SRR292678:1:1101:654270:654270 1:N:0:1 BH:failed -GGGGGGEDGGFEGDGCGGBCEFGGGGFBFGGEGGGGGEGFCFEEGGFEDB -@SRX079804:1:SRR292678:1:1101:667761:667761 1:N:0:1 BH:failed -CAGCCTTTTGAGGTCGTCTATCGCAGCGTGTCCGCGACGTTTTGTTGCG -+SRX079804:1:SRR292678:1:1101:667761:667761 1:N:0:1 BH:failed -GGFGGG=GGGG@GFGGG@GGEGFGGGEGGGFGGEG@EEEDE8EE=E=DE -@SRX079804:1:SRR292678:1:1101:671526:671526 1:N:0:1 BH:ok -CAAGTAGAAGGTTAGCGCCTCTCTGTAAAAGGAGTCAAGCGCTATGTC -+SRX079804:1:SRR292678:1:1101:671526:671526 1:N:0:1 BH:ok -HHHHHGHEHHDGGGCGGGGHFGGGGEDGGGGGBEBEBEEEFGHFHFHH -@SRX079804:1:SRR292678:1:1101:685633:685633 2:N:0:1 BH:failed -GCTGTAATGGATCCACTAATTGGGACAGTGGTGGATAAAACGAATAC -+SRX079804:1:SRR292678:1:1101:685633:685633 2:N:0:1 BH:failed -DADADCGFEFEEEGEGGEFDEEEEBDAC;C10<9?AGBGGGFF7DFB -@SRX079804:1:SRR292678:1:1101:769626:769626 1:N:0:1 BH:failed -GTATTCCTTGACAGTCGAAAGAATCACTGCTAACCCAGGC -+SRX079804:1:SRR292678:1:1101:769626:769626 1:N:0:1 BH:failed -C>5BBCCCD=ACDD@A7@@B@A?B?=8B??EDEEDEBDEE -@SRX079804:1:SRR292678:1:1101:776222:776222 1:N:0:1 BH:failed -AGAACGTACACCCTACGCTAAGCAGTGGCTCCATGCCAA -+SRX079804:1:SRR292678:1:1101:776222:776222 1:N:0:1 BH:failed -BGDGGGGFEGGGGEGGGGFGFF6=FCFAFFEE,=C?EDD -@SRX079804:1:SRR292678:1:1101:782183:782183 2:N:0:1 BH:failed -TACGCGTAACGACGTCATAGCCATGACGCTTCAATAAA -+SRX079804:1:SRR292678:1:1101:782183:782183 2:N:0:1 BH:failed -D -@SRX079804:1:SRR292678:1:1101:828383:828383 1:N:0:1 BH:failed -AAACCACATATGACATGAGTGACGGGACTAAAGTTC -+SRX079804:1:SRR292678:1:1101:828383:828383 1:N:0:1 BH:failed -FFEEFEEB=E,C>CDEEEECEBEEEC?F;BDDDDAE -@SRX079804:1:SRR292678:1:1101:829239:829239 1:N:0:1 BH:ok -TCGATCCTTCTGCCTCAAAGTATACTAGGACGCAT -+SRX079804:1:SRR292678:1:1101:829239:829239 1:N:0:1 BH:ok -GGGDFGGBGFFEBFEDCBCDCGGGGBEEE=GE?EE -@SRX079804:1:SRR292678:1:1101:868419:868419 1:N:0:1 BH:ok -ATTCGTCAGGCCCAATAACATCATGAATTTCCAG -+SRX079804:1:SRR292678:1:1101:868419:868419 1:N:0:1 BH:ok -DEEEEEEEBDFFFFFFFF8FEED8@FFFBFFEFF -@SRX079804:1:SRR292678:1:1101:892716:892716 2:N:0:1 BH:ok -AATTTACCTAATGGAATCAATGAGGCTACTCCA -+SRX079804:1:SRR292678:1:1101:892716:892716 2:N:0:1 BH:ok -@BCBBBCCBCCCCCCCDDAAFFGEEEDBF@EE@ -@SRX079804:1:SRR292678:1:1101:893159:893159 2:N:0:1 BH:ok -TTCTCTGCTTTTCATATCTTGTCATAAAAATT -+SRX079804:1:SRR292678:1:1101:893159:893159 2:N:0:1 BH:ok -CBDCDEEEEEBEEEEGDDFDEEEEDGFFEGFE -@SRX079804:1:SRR292678:1:1101:918742:918742 1:N:0:1 BH:failed -CTCTCCATGCACAAAGAATATCACAGCCAAA -+SRX079804:1:SRR292678:1:1101:918742:918742 1:N:0:1 BH:failed -EEEBA?@;B@EEE@BEE=?EDDDDADCDA?E -@SRX079804:1:SRR292678:1:1101:923787:923787 2:N:0:1 BH:ok -TTGTGAAGGATGGGATATTAGTGTAGATGA -+SRX079804:1:SRR292678:1:1101:923787:923787 2:N:0:1 BH:ok -EEBBEGEEE=BBB<@DCDCGD@D>=DEGEE -@SRX079804:1:SRR292678:1:1101:933189:933189 1:N:0:1 BH:failed -GTCTGCACTATCGAGGGCTGTGCCTTTGC -+SRX079804:1:SRR292678:1:1101:933189:933189 1:N:0:1 BH:failed -FEFFDBFF8FE>?DFFFCEBCEEBBEDE6 -@SRX079804:1:SRR292678:1:1101:937136:937136 1:N:0:1 BH:failed -TTTCTTTGGCTTAAAGATAGTTTTAGTC -+SRX079804:1:SRR292678:1:1101:937136:937136 1:N:0:1 BH:failed -EFFFEEEEFCBCDDDDE@/E?@@7@@3< -@SRX079804:1:SRR292678:1:1101:940351:940351 1:N:0:1 BH:changed:1 -TGCCGTGGGAATGACAAACAAGCATCC -+SRX079804:1:SRR292678:1:1101:940351:940351 1:N:0:1 BH:changed:1 -DECC@GFFBF=EBEAFDFGD?FFF8FF -@SRX079804:1:SRR292678:1:1101:940693:940693 1:N:0:1 BH:failed -CACATTATGAACTATGGGCACTGCAT -+SRX079804:1:SRR292678:1:1101:940693:940693 1:N:0:1 BH:failed -EEEGFDEDFEGGGGGFEGBGGGFGGG -@SRX079804:1:SRR292678:1:1101:955819:955819 1:N:0:1 BH:failed -CACCTAGCAGCAACGGACGAGTCAG -+SRX079804:1:SRR292678:1:1101:955819:955819 1:N:0:1 BH:failed -GGGGGEEEGGEGGGFGEGG;F@EFF -@SRX079804:1:SRR292678:1:1101:958051:958051 2:N:0:1 BH:ok -TTAATATTTCCATCTGAACTTCGC -+SRX079804:1:SRR292678:1:1101:958051:958051 2:N:0:1 BH:ok -EDDBGFEGFGHHFHGGEDEGBGDB -@SRX079804:1:SRR292678:1:1101:996098:996098 1:N:0:1 BH:failed -CTAAGAGAGTTTGTAATGCGGAC -+SRX079804:1:SRR292678:1:1101:996098:996098 1:N:0:1 BH:failed -DD=DBDBDC4EFFFD@?CD@ACD -@SRX079804:1:SRR292678:1:1101:1020278:1020278 2:N:0:1 BH:ok -AAAGTGCAGAACATGCAGATAT -+SRX079804:1:SRR292678:1:1101:1020278:1020278 2:N:0:1 BH:ok -D>AC?GDDCD?DDADE@GABDG -@SRX079804:1:SRR292678:1:1101:1022234:1022234 1:N:0:1 BH:ok -GACCTTTCCGCAAGCTGTCGC -+SRX079804:1:SRR292678:1:1101:1022234:1022234 1:N:0:1 BH:ok -HHHHFEHHHHGGHHHGDHEEG -@SRX079804:1:SRR292678:1:1101:1024144:1024144 1:N:0:1 BH:failed -TCCATTATGAAAGAAGAAAA -+SRX079804:1:SRR292678:1:1101:1024144:1024144 1:N:0:1 BH:failed -@A><96:6: -@SRX079804:1:SRR292678:1:1101:1175112:1175112 1:N:0:1 BH:failed -AGGCC -+SRX079804:1:SRR292678:1:1101:1175112:1175112 1:N:0:1 BH:failed -EC8EE -@SRX079804:1:SRR292678:1:1101:1182927:1182927 1:N:0:1 BH:changed:1 -TGAA -+SRX079804:1:SRR292678:1:1101:1182927:1182927 1:N:0:1 BH:changed:1 -GGBH -@SRX079804:1:SRR292678:1:1101:1243474:1243474 1:N:0:1 BH:ok -ACG -+SRX079804:1:SRR292678:1:1101:1243474:1243474 1:N:0:1 BH:ok -EEE -@SRX079804:1:SRR292678:1:1101:1266246:1266246 1:N:0:1 BH:failed -AA -+SRX079804:1:SRR292678:1:1101:1266246:1266246 1:N:0:1 BH:failed -C@ -@SRX079804:1:SRR292678:1:1101:1269735:1269735 1:N:0:1 BH:failed -C -+SRX079804:1:SRR292678:1:1101:1269735:1269735 1:N:0:1 BH:failed -G diff --git a/example_fastq.fastq:Zone.Identifier b/example_fastq.fastq:Zone.Identifier deleted file mode 100644 index 1bf0b28..0000000 --- a/example_fastq.fastq:Zone.Identifier +++ /dev/null @@ -1,3 +0,0 @@ -[ZoneTransfer] -ZoneId=3 -HostUrl=https://github.com/ diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index b96f261..0000000 --- a/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -bioframe==0.5.1 -biopython==1.81 \ No newline at end of file From 8886a582375cffe0a7b0d01837f778266e44f04b Mon Sep 17 00:00:00 2001 From: lsmertina Date: Fri, 23 Feb 2024 20:54:23 +1100 Subject: [PATCH 14/22] add biopython fastq filter script --- biopython_fastq_filter.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 biopython_fastq_filter.py diff --git a/biopython_fastq_filter.py b/biopython_fastq_filter.py new file mode 100644 index 0000000..b3d4927 --- /dev/null +++ b/biopython_fastq_filter.py @@ -0,0 +1,34 @@ +from Bio import SeqIO +from Bio.SeqUtils import GC + +def filter_fastq(input_path: str, quality_threshold: int, output_filename="final_filtered.fastq", gc_bounds=(40, 60), length_bounds=(50, 350)): + filename = input_path + records = SeqIO.parse(filename, "fastq") + ###quality filter + good_reads = (rec for rec in records if min(rec.letter_annotations["phred_quality"]) >= quality_threshold) + result_quality = SeqIO.write(good_reads, "good_quality.fastq", "fastq") + result_quality_GC = SeqIO.parse("good_quality.fastq", "fastq") + ###GC content filter + min_gc_content = gc_bounds[0] + max_gc_content = gc_bounds[1] + GC_quality_filt = [] + + for sequence in result_quality_GC: + if min_gc_content <= GC(sequence.seq) <= max_gc_content: + GC_quality_filt.append(sequence) + + result_quality = SeqIO.write(GC_quality_filt, "good_quality_GC.fastq", "fastq") + result_quality_GC_length = SeqIO.parse("good_quality_GC.fastq", "fastq") + + ##length filter + filtered_GC_quality_length = [] + + for sequence in result_quality_GC_length: + if len(sequence.seq) >= length_bounds[0] and len(sequence.seq) <= length_bounds[1]: + filtered_GC_quality_length.append(sequence) + + result_quality = SeqIO.write(filtered_GC_quality_length, output_filename, "fastq") + + print(result_quality) + +#filter_fastq("example_fastq.fastq", 15) From c08ab84fdf520f700245e61d9659446152f6b166 Mon Sep 17 00:00:00 2001 From: lsmertina Date: Fri, 23 Feb 2024 20:55:38 +1100 Subject: [PATCH 15/22] add an example fastq file --- example_fastq.fastq | 356 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 356 insertions(+) create mode 100644 example_fastq.fastq diff --git a/example_fastq.fastq b/example_fastq.fastq new file mode 100644 index 0000000..883b51f --- /dev/null +++ b/example_fastq.fastq @@ -0,0 +1,356 @@ +@SRX079804:1:SRR292678:1:1101:21885:21885 1:N:0:1 BH:ok +ACAGCAACATAAACATGATGGGATGGCGTAAGCCCCCGAGATATCAGTTTACCCAGGATAAGAGATTAAATTATGAGCAACATTATTAA ++SRX079804:1:SRR292678:1:1101:21885:21885 1:N:0:1 BH:ok +FGGGFGGGFGGGFGDFGCEBB@CCDFDDFFFFBFFGFGEFDFFFF;D@DD>C@DDGGGDFGDGG?GFGFEGFGGEF@FDGGGFGFBGGD +@SRX079804:1:SRR292678:1:1101:24563:24563 1:N:0:1 BH:failed +ATTAGCGAGGAGGAGTGCTGAGAAGATGTCGCCTACGCCGTTGAAATTCCCTTCAATCAGGGGGTACTGGAGGATACGAGTTTGTGTG ++SRX079804:1:SRR292678:1:1101:24563:24563 1:N:0:1 BH:failed +BFFFFFFFB@B@A<@D>BDDACDDDEBEDEFFFBFFFEFFDFFF=CC@DDFD8FFFFFFF8/+.2,@7<<:?B/:<><-><@.A*C>D +@SRX079804:1:SRR292678:1:1101:30161:30161 1:N:0:1 BH:failed +GAACGACAGCAGCTCCTGCATAACCGCGTCCTTCTTCTTTAGCGTTGTGCAAAGCATGTTTTGTATTACGGGCATCTCGAGCGAATC ++SRX079804:1:SRR292678:1:1101:30161:30161 1:N:0:1 BH:failed +DFFFEGDGGGGFGGEDCCDCEFFFFCCCCCB>CEBFGFBGGG?DE=:6@=>AD?D8DCEE:>EEABE5D@5:DDCA;EEE-DCD +@SRX079804:1:SRR292678:1:1101:47176:47176 1:N:0:1 BH:failed +TGAAGCGTCGATAGAAGTTAGCAAACCCGCGGAACTTCCGTACATCAGACACATTCCGGGGGGTGGGCCAATCCATGATGCCTTTG ++SRX079804:1:SRR292678:1:1101:47176:47176 1:N:0:1 BH:failed +FF@FFBEEEEFFEFFD@EDEFFB=DFEEFFFE8FFE8EEDBFDFEEBE+E46.'8.5::EE:?E>A6@?)>;>9D<C9DEBAAB=5C?<@0=A?D@BDB;:BA?BDDFH?B@DCB6BEBDA??AA9. +@SRX079804:1:SRR292678:1:1101:52180:52180 1:N:0:1 BH:failed +TTTTTGGTTTTAGGTCTAACATGTAAGTCTTTAATCTATTTTGAATTAATTTTTGGATGAGGTGTAAGGAAGGGATCCAGTTTC ++SRX079804:1:SRR292678:1:1101:52180:52180 1:N:0:1 BH:failed +FEFFFFFF=FEEFDFD>EE:?<5@BFAFCFEE7>C>:/6:2<344DA:6DDCDC>)34773DD?@DBA9B +@SRX079804:1:SRR292678:1:1101:105156:105156 1:N:0:1 BH:ok +ACTGCTGAGCTTAAATGGCGGCAGTCTGACGGTTACCAACGGGGGCACTTCAACCGGTTCGTTAACGGGGAGCGGAGAGCTGA ++SRX079804:1:SRR292678:1:1101:105156:105156 1:N:0:1 BH:ok +GFFEGGFGGGGEGGGGGGGGGFDD=DDE7EDD6CD?FEDEE@EBEFEE.DD5DDD@B<7>/0543C?BEE?@@BE@; +@SRX079804:1:SRR292678:1:1101:135168:135168 1:N:0:1 BH:ok +CCTTCCTAAAAATTAAGAATCTTAACAATTAGCAGCACAACCAAAATTATTACCGAAAGGACTTACTCCTCCGCCAAATCCA ++SRX079804:1:SRR292678:1:1101:135168:135168 1:N:0:1 BH:ok +GGGEGGGEGGBFFFFFGCFFEGG6AEEEDEEGGFGGCG4EGFFGDFFBGEGFGGFECBDEGGEFGBFF?CGFBFGGGGFGFF +@SRX079804:1:SRR292678:1:1101:149302:149302 1:N:0:1 BH:changed:1 +TAGGGTTGTATTTGCAGATCCATGGCATGCCAAAAAGAACATCGTCCCGTCCAATATCTGCAACATACCAGTTGGTTGGTA ++SRX079804:1:SRR292678:1:1101:149302:149302 1:N:0:1 BH:changed:1 +@;CBA=:@;@DBDCDEEE/EEEEEEF@>FBEEB=EFA>EEBD=DAEEEEB9)99>B99BC)@,@<9CDD=C,5;B::?@;A +@SRX079804:1:SRR292678:1:1101:170868:170868 2:N:0:1 BH:failed +CTGCCGAGACTGTTCTCAGACATGGAAAGCTCGATTCGCATACACTCGCTGAGTAAGAGAGTCACACCAAATCACAGATT ++SRX079804:1:SRR292678:1:1101:170868:170868 2:N:0:1 BH:failed +E;FFFEGFGIGGFBG;C6D<@C7CDGFEFGFHDFEHHHBBHHFDFEFBAEEEEDE@A2=DA:??C3:@>EEBEEHEFEHHFFHH?FGBGFBBD77B;;C?FFFFGGFED.BBABBG@DBBE +@SRX079804:1:SRR292678:1:1101:190845:190845 1:N:0:1 BH:changed:1 +CCTCAGCGTGGATTGCCGCTCATGCAGGAGCAGATAATCCCTTCGCCATCCCATTAAGCGCCGTTGTCGGTATTCC ++SRX079804:1:SRR292678:1:1101:190845:190845 1:N:0:1 BH:changed:1 +FF@FFCFEECEBEC@@BBBBDFBBFFDFFEFFEB8FFFFFFFFEFCEB/>BBA@AFFFEEEEECE;ACD@DBBEEE +@SRX079804:1:SRR292678:1:1101:198993:198993 2:N:0:1 BH:failed +AGTTATTTATGCATCATTCTCATGTATGAGCCAACAAGATAGTACAAGTTTTATTGCTATGAGTTCAGTACAACA ++SRX079804:1:SRR292678:1:1101:198993:198993 2:N:0:1 BH:failed +<<<=;@B??@<>@><48876EADEG6B.BB@.?+98204<:<>@?A=@EFEFFFEEFB +@SRX079804:1:SRR292678:1:1101:204480:204480 1:N:0:1 BH:failed +AGTGAGACACCCCTGAACATTCCTAGTAAGACATCTTTGAATATTACTAGTTAGCCACACTTTAAAATGACCCG ++SRX079804:1:SRR292678:1:1101:204480:204480 1:N:0:1 BH:failed +<98;<@@@:@CD@BCCDD=DBBCEBBAAA@9???@BCDBCGF=GEGDFGDBEEEEEFFFF=EDEE=DCD@@BBC +@SRX079804:1:SRR292678:1:1101:212327:212327 2:N:0:1 BH:failed +TTACCTCTGCTTTTTCGCCTGTTACTTCTACTAATCCTTCATCTATTGCGAATGGCCCTACTACTGACGAAAT ++SRX079804:1:SRR292678:1:1101:212327:212327 2:N:0:1 BH:failed +DBCCC@@;A@BDCCCE>BBED>GDCDBFBFFEBEECFGGD@@BCB<<8@;09746:@@>@EEECEEDE/FEED +@SRX079804:1:SRR292678:1:1101:230386:230386 1:N:0:1 BH:failed +TTCTGATTGGAGTGAGAGTGCCATTTGTTTCGCTGATTGGACGTTGGAAAGCGCCTTGACCTTTGACAGCAG ++SRX079804:1:SRR292678:1:1101:230386:230386 1:N:0:1 BH:failed +=BDD=DCDBBCDADD@@B;B@CC7C@B@>=BACDD,=??8DDCDD7CCCCDBDEDBDDDDDEEBDDDBD?DB +@SRX079804:1:SRR292678:1:1101:236093:236093 1:N:0:1 BH:failed +CATCTAACTCTCATTCTAGATTCTTAAGTTGGCTACACTTTGCCGTCATTCTCGGTGGATTAGCTATTGGG ++SRX079804:1:SRR292678:1:1101:236093:236093 1:N:0:1 BH:failed +A:@A@;@BB@GGFGG@A@@817729B +@SRX079804:1:SRR292678:1:1101:251912:251912 2:N:0:1 BH:ok +TGTAAATGGAAATGAACCTAATATGTATGCACAAACTATTAAAGCATATCTTGCAAAAGGAGCAATGGCG ++SRX079804:1:SRR292678:1:1101:251912:251912 2:N:0:1 BH:ok +GHFDHHHGHHHFFHHGHHHGGGGGGDEFFEGGGGEFHHFHGFGHHHHFHHDDD6@=DCACFFGGEFBBG +@SRX079804:1:SRR292678:1:1101:278698:278698 1:N:0:1 BH:ok +CTAATAATGGTAATTGAACCATAGAAGATAAGTTCATAATGTAATAAATACATCCATAGAGTTATTAA ++SRX079804:1:SRR292678:1:1101:278698:278698 1:N:0:1 BH:ok +DDBDBCCCDD@FFFB9<<<@DA=DA@B:@=@@AC@GGFCGECFFDGGCGFFGGFFCEBF9>?@>BDFF +@SRX079804:1:SRR292678:1:1101:295878:295878 1:N:0:1 BH:changed:1 +AATGCAAACAGGATGATATTTGAATCCGTAATACTGTTCTTTCATCATAAATAATTTATGCAGATAC ++SRX079804:1:SRR292678:1:1101:295878:295878 1:N:0:1 BH:changed:1 +HHHHHEGFHDGFEGBCBEEEGGGG@EDGCGBBBEGF4?EFDBDDBFE8DEE-E?EE;B@EFC=;FDE +@SRX079804:1:SRR292678:1:1101:306575:306575 1:N:0:1 BH:failed +ATCTTTGAACTTTGTCTCACTTTCCCCCATCTCACCATTCCTCCTGTTCTGTGAACCCCAGTTTCA ++SRX079804:1:SRR292678:1:1101:306575:306575 1:N:0:1 BH:failed +E::EA@E<6B8>97:<6084649?@:?EDED=BEBEGGFEDGECECBDEFDDEEGGEDDFFF +@SRX079804:1:SRR292678:1:1101:403661:403661 1:N:0:1 BH:failed +GATGGCTTTGCTTTCTCATTCTCCTCTCCATCGTTCCCATCTTCGCCCTCAGACGCTGATTGAT ++SRX079804:1:SRR292678:1:1101:403661:403661 1:N:0:1 BH:failed +EG=DFFDFFDDGDGGGGGFFGGGGBADADCEEE5EC>CCCE6BEEEGGGGCBEGGE@9BCEF;>>D@D +@SRX079804:1:SRR292678:1:1101:425870:425870 1:N:0:1 BH:failed +GTACAGCTCTCCTCGTTACCAGCACATCTTGGACACCCGACGAAGACTTTGACATGCTCCTC ++SRX079804:1:SRR292678:1:1101:425870:425870 1:N:0:1 BH:failed +E@EED@EF=D>=EED@D@7DBF +@SRX079804:1:SRR292678:1:1101:429745:429745 1:N:0:1 BH:failed +GGCGATTGTGAAGGCATAAGAGTGGGACATAGTTCAAGTCCAGAACGAATTAAACGCACAA ++SRX079804:1:SRR292678:1:1101:429745:429745 1:N:0:1 BH:failed +BB2B=A@A:BFBFFFFF +@SRX079804:1:SRR292678:1:1101:475293:475293 2:N:0:1 BH:ok +TACATTTGATTTCTTTATAAGATTTCTTACTGTAAAATCATCGCTATTTAACAGCTTATT ++SRX079804:1:SRR292678:1:1101:475293:475293 2:N:0:1 BH:ok +FHFFFFDC@FGFEDGE?EEDC6EEEDEF?EEEE8EHHGGHFGFFEGGGGBFBDBDEBCBCA +@SRX079804:1:SRR292678:1:1101:511594:511594 1:N:0:1 BH:ok +AGCACAATCAGATTCGCTTATGACGGCGATGAAGAAATTGCGATGAAATGTGAGGTGA ++SRX079804:1:SRR292678:1:1101:511594:511594 1:N:0:1 BH:ok +HCHFFHHHGFHBFFFEGFFEFHFEHGBGECHEHB?CDDEFCDBFF9DFCD.BC +@SRX079804:1:SRR292678:1:1101:527839:527839 2:N:0:1 BH:changed:4 +GACATTTCTTTCTGGGAAGGCTTATTAAACGATAAAGATAATGATATACGTTTTGAT ++SRX079804:1:SRR292678:1:1101:527839:527839 2:N:0:1 BH:changed:4 +>35/-;,><04%'A.?4?:>BE.DC-@???CE:@EDFDBDG7B;=<)?1.@?2A5<5 +@SRX079804:1:SRR292678:1:1101:547309:547309 1:N:0:1 BH:failed +AGGTCTGCACTGGCTCTTCAGAGCGCAAGCGAGGGAAGAGAAATATAGTGACGCAC ++SRX079804:1:SRR292678:1:1101:547309:547309 1:N:0:1 BH:failed +GEGFGFD=FDB8B7DDFFFF@/DC@+2:66>-@>9);<2: +@SRX079804:1:SRR292678:1:1101:570367:570367 1:N:0:1 BH:failed +CTATGGCCACTTTTGTATTCTCGATTGAGGTTATTCGCTCACCCATCTTTTCCAA ++SRX079804:1:SRR292678:1:1101:570367:570367 1:N:0:1 BH:failed +FFEE;FFFFBFFFFF<9BEDEBEACDD3DD0B5>>0?:@>FFBEEBEFFFFGCCBCDGIGDGHEFGG=GGGGHFHEHF@FEFE?CGEEEFADEAEC +@SRX079804:1:SRR292678:1:1101:590521:590521 1:N:0:1 BH:failed +CCTCTTCTCTTTCGCGGTCATCTTGGGTTTCGCGCCTTTCTTCTTGACGACAC ++SRX079804:1:SRR292678:1:1101:590521:590521 1:N:0:1 BH:failed +EGFDD@FDF=FDFFEEGGBEFDGF=FGAGEEEEEDFBGGEE8EEE@>5;>90< +@SRX079804:1:SRR292678:1:1101:601307:601307 1:N:0:1 BH:changed:1 +TTGGCGTGCTGATGATTATCGGTATCTTCAAAGGCGCGCAGCCTGCGGGCTG ++SRX079804:1:SRR292678:1:1101:601307:601307 1:N:0:1 BH:changed:1 +GGGEGFGGEGE:EE>GFFGGGGDCGEBFFF>G=EBFFEC?DFGAD?DDECBE +@SRX079804:1:SRR292678:1:1101:631057:631057 1:N:0:1 BH:failed +CACCATTATCCTATTTCTGAACACATTTGACAGTCACGGCACTAGCATTGG ++SRX079804:1:SRR292678:1:1101:631057:631057 1:N:0:1 BH:failed +GGGGGGGGBFGGGEGFGGGGGFGEFFDFFFF?EBFEEDBFGEE@BE;E?E7 +@SRX079804:1:SRR292678:1:1101:654270:654270 1:N:0:1 BH:failed +ATCTTTCTCTCTCTGCTGCATTCTCCGCTTCAGCTCCTCAATTTCAATCA ++SRX079804:1:SRR292678:1:1101:654270:654270 1:N:0:1 BH:failed +GGGGGGEDGGFEGDGCGGBCEFGGGGFBFGGEGGGGGEGFCFEEGGFEDB +@SRX079804:1:SRR292678:1:1101:667761:667761 1:N:0:1 BH:failed +CAGCCTTTTGAGGTCGTCTATCGCAGCGTGTCCGCGACGTTTTGTTGCG ++SRX079804:1:SRR292678:1:1101:667761:667761 1:N:0:1 BH:failed +GGFGGG=GGGG@GFGGG@GGEGFGGGEGGGFGGEG@EEEDE8EE=E=DE +@SRX079804:1:SRR292678:1:1101:671526:671526 1:N:0:1 BH:ok +CAAGTAGAAGGTTAGCGCCTCTCTGTAAAAGGAGTCAAGCGCTATGTC ++SRX079804:1:SRR292678:1:1101:671526:671526 1:N:0:1 BH:ok +HHHHHGHEHHDGGGCGGGGHFGGGGEDGGGGGBEBEBEEEFGHFHFHH +@SRX079804:1:SRR292678:1:1101:685633:685633 2:N:0:1 BH:failed +GCTGTAATGGATCCACTAATTGGGACAGTGGTGGATAAAACGAATAC ++SRX079804:1:SRR292678:1:1101:685633:685633 2:N:0:1 BH:failed +DADADCGFEFEEEGEGGEFDEEEEBDAC;C10<9?AGBGGGFF7DFB +@SRX079804:1:SRR292678:1:1101:769626:769626 1:N:0:1 BH:failed +GTATTCCTTGACAGTCGAAAGAATCACTGCTAACCCAGGC ++SRX079804:1:SRR292678:1:1101:769626:769626 1:N:0:1 BH:failed +C>5BBCCCD=ACDD@A7@@B@A?B?=8B??EDEEDEBDEE +@SRX079804:1:SRR292678:1:1101:776222:776222 1:N:0:1 BH:failed +AGAACGTACACCCTACGCTAAGCAGTGGCTCCATGCCAA ++SRX079804:1:SRR292678:1:1101:776222:776222 1:N:0:1 BH:failed +BGDGGGGFEGGGGEGGGGFGFF6=FCFAFFEE,=C?EDD +@SRX079804:1:SRR292678:1:1101:782183:782183 2:N:0:1 BH:failed +TACGCGTAACGACGTCATAGCCATGACGCTTCAATAAA ++SRX079804:1:SRR292678:1:1101:782183:782183 2:N:0:1 BH:failed +D +@SRX079804:1:SRR292678:1:1101:828383:828383 1:N:0:1 BH:failed +AAACCACATATGACATGAGTGACGGGACTAAAGTTC ++SRX079804:1:SRR292678:1:1101:828383:828383 1:N:0:1 BH:failed +FFEEFEEB=E,C>CDEEEECEBEEEC?F;BDDDDAE +@SRX079804:1:SRR292678:1:1101:829239:829239 1:N:0:1 BH:ok +TCGATCCTTCTGCCTCAAAGTATACTAGGACGCAT ++SRX079804:1:SRR292678:1:1101:829239:829239 1:N:0:1 BH:ok +GGGDFGGBGFFEBFEDCBCDCGGGGBEEE=GE?EE +@SRX079804:1:SRR292678:1:1101:868419:868419 1:N:0:1 BH:ok +ATTCGTCAGGCCCAATAACATCATGAATTTCCAG ++SRX079804:1:SRR292678:1:1101:868419:868419 1:N:0:1 BH:ok +DEEEEEEEBDFFFFFFFF8FEED8@FFFBFFEFF +@SRX079804:1:SRR292678:1:1101:892716:892716 2:N:0:1 BH:ok +AATTTACCTAATGGAATCAATGAGGCTACTCCA ++SRX079804:1:SRR292678:1:1101:892716:892716 2:N:0:1 BH:ok +@BCBBBCCBCCCCCCCDDAAFFGEEEDBF@EE@ +@SRX079804:1:SRR292678:1:1101:893159:893159 2:N:0:1 BH:ok +TTCTCTGCTTTTCATATCTTGTCATAAAAATT ++SRX079804:1:SRR292678:1:1101:893159:893159 2:N:0:1 BH:ok +CBDCDEEEEEBEEEEGDDFDEEEEDGFFEGFE +@SRX079804:1:SRR292678:1:1101:918742:918742 1:N:0:1 BH:failed +CTCTCCATGCACAAAGAATATCACAGCCAAA ++SRX079804:1:SRR292678:1:1101:918742:918742 1:N:0:1 BH:failed +EEEBA?@;B@EEE@BEE=?EDDDDADCDA?E +@SRX079804:1:SRR292678:1:1101:923787:923787 2:N:0:1 BH:ok +TTGTGAAGGATGGGATATTAGTGTAGATGA ++SRX079804:1:SRR292678:1:1101:923787:923787 2:N:0:1 BH:ok +EEBBEGEEE=BBB<@DCDCGD@D>=DEGEE +@SRX079804:1:SRR292678:1:1101:933189:933189 1:N:0:1 BH:failed +GTCTGCACTATCGAGGGCTGTGCCTTTGC ++SRX079804:1:SRR292678:1:1101:933189:933189 1:N:0:1 BH:failed +FEFFDBFF8FE>?DFFFCEBCEEBBEDE6 +@SRX079804:1:SRR292678:1:1101:937136:937136 1:N:0:1 BH:failed +TTTCTTTGGCTTAAAGATAGTTTTAGTC ++SRX079804:1:SRR292678:1:1101:937136:937136 1:N:0:1 BH:failed +EFFFEEEEFCBCDDDDE@/E?@@7@@3< +@SRX079804:1:SRR292678:1:1101:940351:940351 1:N:0:1 BH:changed:1 +TGCCGTGGGAATGACAAACAAGCATCC ++SRX079804:1:SRR292678:1:1101:940351:940351 1:N:0:1 BH:changed:1 +DECC@GFFBF=EBEAFDFGD?FFF8FF +@SRX079804:1:SRR292678:1:1101:940693:940693 1:N:0:1 BH:failed +CACATTATGAACTATGGGCACTGCAT ++SRX079804:1:SRR292678:1:1101:940693:940693 1:N:0:1 BH:failed +EEEGFDEDFEGGGGGFEGBGGGFGGG +@SRX079804:1:SRR292678:1:1101:955819:955819 1:N:0:1 BH:failed +CACCTAGCAGCAACGGACGAGTCAG ++SRX079804:1:SRR292678:1:1101:955819:955819 1:N:0:1 BH:failed +GGGGGEEEGGEGGGFGEGG;F@EFF +@SRX079804:1:SRR292678:1:1101:958051:958051 2:N:0:1 BH:ok +TTAATATTTCCATCTGAACTTCGC ++SRX079804:1:SRR292678:1:1101:958051:958051 2:N:0:1 BH:ok +EDDBGFEGFGHHFHGGEDEGBGDB +@SRX079804:1:SRR292678:1:1101:996098:996098 1:N:0:1 BH:failed +CTAAGAGAGTTTGTAATGCGGAC ++SRX079804:1:SRR292678:1:1101:996098:996098 1:N:0:1 BH:failed +DD=DBDBDC4EFFFD@?CD@ACD +@SRX079804:1:SRR292678:1:1101:1020278:1020278 2:N:0:1 BH:ok +AAAGTGCAGAACATGCAGATAT ++SRX079804:1:SRR292678:1:1101:1020278:1020278 2:N:0:1 BH:ok +D>AC?GDDCD?DDADE@GABDG +@SRX079804:1:SRR292678:1:1101:1022234:1022234 1:N:0:1 BH:ok +GACCTTTCCGCAAGCTGTCGC ++SRX079804:1:SRR292678:1:1101:1022234:1022234 1:N:0:1 BH:ok +HHHHFEHHHHGGHHHGDHEEG +@SRX079804:1:SRR292678:1:1101:1024144:1024144 1:N:0:1 BH:failed +TCCATTATGAAAGAAGAAAA ++SRX079804:1:SRR292678:1:1101:1024144:1024144 1:N:0:1 BH:failed +@A><96:6: +@SRX079804:1:SRR292678:1:1101:1175112:1175112 1:N:0:1 BH:failed +AGGCC ++SRX079804:1:SRR292678:1:1101:1175112:1175112 1:N:0:1 BH:failed +EC8EE +@SRX079804:1:SRR292678:1:1101:1182927:1182927 1:N:0:1 BH:changed:1 +TGAA ++SRX079804:1:SRR292678:1:1101:1182927:1182927 1:N:0:1 BH:changed:1 +GGBH +@SRX079804:1:SRR292678:1:1101:1243474:1243474 1:N:0:1 BH:ok +ACG ++SRX079804:1:SRR292678:1:1101:1243474:1243474 1:N:0:1 BH:ok +EEE +@SRX079804:1:SRR292678:1:1101:1266246:1266246 1:N:0:1 BH:failed +AA ++SRX079804:1:SRR292678:1:1101:1266246:1266246 1:N:0:1 BH:failed +C@ +@SRX079804:1:SRR292678:1:1101:1269735:1269735 1:N:0:1 BH:failed +C ++SRX079804:1:SRR292678:1:1101:1269735:1269735 1:N:0:1 BH:failed +G From d962c092abdabf627e25b869de6803e963e19f64 Mon Sep 17 00:00:00 2001 From: lsmertina Date: Sun, 25 Feb 2024 08:25:10 +1100 Subject: [PATCH 16/22] add script for task 5 hw14 --- biopython_fastq_filter.py | 87 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) diff --git a/biopython_fastq_filter.py b/biopython_fastq_filter.py index b3d4927..b2fb7e3 100644 --- a/biopython_fastq_filter.py +++ b/biopython_fastq_filter.py @@ -32,3 +32,90 @@ def filter_fastq(input_path: str, quality_threshold: int, output_filename="final print(result_quality) #filter_fastq("example_fastq.fastq", 15) + + +from abc import ABC, abstractmethod + +class InvalidInputError(ValueError): + pass + +class BiologicalSequence(ABC, str): + @abstractmethod + def __init__(self, seq): + self.seq = seq + + def __len__(self): + return len(self.seq) + + def __getitem__(self, index): + return self.seq[int(index)] + + def __repr__(self): + return __str__(self.seq) + + def check_nucleic_acid(self): + unique_chars = set(self.seq) + nucleotides_dna = set('ATGCatgc') + nucleotides_rna = set('AUGCaugc') + if unique_chars <= nucleotides_dna: + seq = 'dna' + elif unique_chars <= nucleotides_rna: + seq = 'rna' + else: + raise InvalidInputError() + return seq_type + +class NucleicAcidSequence(BiologicalSequence): + #complement_dict = None + def __init__(self, seq): + super().__init__(seq) + self.check_nucleic_acid() + self.length = len(self.seq) + + def complement(self): + list_input = list(self.seq) + for i in range(len(self.seq)): + if list_input[i] in self.complement_dict: + list_input[i] = self.complement_dict[list_input[i]] + return "".join(list_input) + +class DNASequence(NucleicAcidSequence): + complement_dict = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', 'a': 't', 't': 'a', 'g': 'c', 'c': 'g'} + def __init__(self, seq): + super().__init__(seq) + self.complement() + + def transcribe(self): + list_input = list(self.seq) + for i in range(len(self.seq)): + if (list_input[i] == 'T'): + list_input[i] = 'U' + elif (list_input[i] == 't'): + list_input[i]='u' + return "".join(list_input) + +class RNASequence(NucleicAcidSequence): + complement_dict = {'A': 'U', 'U': 'A', 'G': 'C', 'C': 'G', 'a': 'u', 'u': 'a', 'g': 'c', 'c': 'g'} + def __init__(self, seq): + super().__init__(seq) + self.complement() + +class AminoAcidSequence(BiologicalSequence): + def __init__(self, seq): + self.seq = seq + + def amino_acid_frequency(self): + """Calculates molecular weight of a protein + Arguments: + - seq (str) 1-letter coded protein sequence + Return: + - int, molecular weight (g/mol) rounded to integer""" + freq_dict = {} + for letter in self.seq: + if letter in freq_dict: + freq_dict[letter] += 1 + else: + freq_dict[letter] = 1 + for letter in freq_dict: + freq_dict[letter] = round(freq_dict[letter] / len(self.seq) * 100, 2) + return freq_dict From 597f3c0414bbf4a42005fddde3f4b62ad97775e7 Mon Sep 17 00:00:00 2001 From: Elena Smertina <104040609+sme229@users.noreply.github.com> Date: Sun, 25 Feb 2024 08:44:32 +1100 Subject: [PATCH 17/22] Update biopython_fastq_filter.py --- biopython_fastq_filter.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/biopython_fastq_filter.py b/biopython_fastq_filter.py index b2fb7e3..ed17fc2 100644 --- a/biopython_fastq_filter.py +++ b/biopython_fastq_filter.py @@ -66,7 +66,6 @@ def check_nucleic_acid(self): return seq_type class NucleicAcidSequence(BiologicalSequence): - #complement_dict = None def __init__(self, seq): super().__init__(seq) self.check_nucleic_acid() @@ -83,7 +82,7 @@ class DNASequence(NucleicAcidSequence): complement_dict = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', 'a': 't', 't': 'a', 'g': 'c', 'c': 'g'} def __init__(self, seq): super().__init__(seq) - self.complement() + #self.complement() def transcribe(self): list_input = list(self.seq) @@ -98,7 +97,7 @@ class RNASequence(NucleicAcidSequence): complement_dict = {'A': 'U', 'U': 'A', 'G': 'C', 'C': 'G', 'a': 'u', 'u': 'a', 'g': 'c', 'c': 'g'} def __init__(self, seq): super().__init__(seq) - self.complement() + #self.complement() class AminoAcidSequence(BiologicalSequence): def __init__(self, seq): From 3b0861ef92857d678b36aa4a5bc3db024ddde5c0 Mon Sep 17 00:00:00 2001 From: Elena Smertina <104040609+sme229@users.noreply.github.com> Date: Sun, 25 Feb 2024 08:44:54 +1100 Subject: [PATCH 18/22] Update biopython_fastq_filter.py --- biopython_fastq_filter.py | 1 + 1 file changed, 1 insertion(+) diff --git a/biopython_fastq_filter.py b/biopython_fastq_filter.py index ed17fc2..eea8ace 100644 --- a/biopython_fastq_filter.py +++ b/biopython_fastq_filter.py @@ -118,3 +118,4 @@ def amino_acid_frequency(self): for letter in freq_dict: freq_dict[letter] = round(freq_dict[letter] / len(self.seq) * 100, 2) return freq_dict + From ca88ffc473adb01dfa7c9a496b212682ec2b9bc1 Mon Sep 17 00:00:00 2001 From: Elena Smertina <104040609+sme229@users.noreply.github.com> Date: Sun, 25 Feb 2024 09:01:59 +1100 Subject: [PATCH 19/22] Update biopython_fastq_filter.py --- biopython_fastq_filter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/biopython_fastq_filter.py b/biopython_fastq_filter.py index eea8ace..0e09502 100644 --- a/biopython_fastq_filter.py +++ b/biopython_fastq_filter.py @@ -76,7 +76,7 @@ def complement(self): for i in range(len(self.seq)): if list_input[i] in self.complement_dict: list_input[i] = self.complement_dict[list_input[i]] - return "".join(list_input) + return NucleicAcidSequence("".join(list_input)) class DNASequence(NucleicAcidSequence): complement_dict = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', 'a': 't', 't': 'a', 'g': 'c', 'c': 'g'} From 840cbb7ec635af0921ff7742c8deeee8064b5a71 Mon Sep 17 00:00:00 2001 From: Elena Smertina <104040609+sme229@users.noreply.github.com> Date: Sun, 25 Feb 2024 09:02:51 +1100 Subject: [PATCH 20/22] Update biopython_fastq_filter.py --- biopython_fastq_filter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/biopython_fastq_filter.py b/biopython_fastq_filter.py index 0e09502..eea8ace 100644 --- a/biopython_fastq_filter.py +++ b/biopython_fastq_filter.py @@ -76,7 +76,7 @@ def complement(self): for i in range(len(self.seq)): if list_input[i] in self.complement_dict: list_input[i] = self.complement_dict[list_input[i]] - return NucleicAcidSequence("".join(list_input)) + return "".join(list_input) class DNASequence(NucleicAcidSequence): complement_dict = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', 'a': 't', 't': 'a', 'g': 'c', 'c': 'g'} From 75d762938b40bd820c9cfcbf11e2b282d05f6fec Mon Sep 17 00:00:00 2001 From: Elena Smertina <104040609+sme229@users.noreply.github.com> Date: Sun, 25 Feb 2024 09:05:13 +1100 Subject: [PATCH 21/22] Update biopython_fastq_filter.py --- biopython_fastq_filter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/biopython_fastq_filter.py b/biopython_fastq_filter.py index eea8ace..ed65868 100644 --- a/biopython_fastq_filter.py +++ b/biopython_fastq_filter.py @@ -76,7 +76,7 @@ def complement(self): for i in range(len(self.seq)): if list_input[i] in self.complement_dict: list_input[i] = self.complement_dict[list_input[i]] - return "".join(list_input) + return self.__class__("".join(list_input)) class DNASequence(NucleicAcidSequence): complement_dict = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', 'a': 't', 't': 'a', 'g': 'c', 'c': 'g'} From 29ce70b51c9b040d65faa773781ea94346533c94 Mon Sep 17 00:00:00 2001 From: Elena Smertina <104040609+sme229@users.noreply.github.com> Date: Sun, 25 Feb 2024 09:07:53 +1100 Subject: [PATCH 22/22] Update biopython_fastq_filter.py --- biopython_fastq_filter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/biopython_fastq_filter.py b/biopython_fastq_filter.py index ed65868..eea8ace 100644 --- a/biopython_fastq_filter.py +++ b/biopython_fastq_filter.py @@ -76,7 +76,7 @@ def complement(self): for i in range(len(self.seq)): if list_input[i] in self.complement_dict: list_input[i] = self.complement_dict[list_input[i]] - return self.__class__("".join(list_input)) + return "".join(list_input) class DNASequence(NucleicAcidSequence): complement_dict = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', 'a': 't', 't': 'a', 'g': 'c', 'c': 'g'}