From 4af5968e4adf389c7168892c3367becabe0d821b Mon Sep 17 00:00:00 2001 From: NataKich Date: Sat, 3 May 2025 10:16:21 +0300 Subject: [PATCH 1/4] a new script --- fastq_filter.py | 206 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 206 insertions(+) create mode 100644 fastq_filter.py diff --git a/fastq_filter.py b/fastq_filter.py new file mode 100644 index 0000000..17f25d5 --- /dev/null +++ b/fastq_filter.py @@ -0,0 +1,206 @@ +import Bio.SeqIO +import Bio.SeqUtils +import os +import click +import logging +import sys + + +logger = logging.getLogger() + + +def _in_bounds(value, bounds): + if type(bounds) is int: + bounds = [0, bounds] + return bounds[0] <= value <= bounds[1] + + +def filter_fastq(input_fastq: str, output_fastq: str, + gc_bounds: int | tuple = (0, 100), + length_bounds: int | tuple = (0, 2**32), + quality_threshold: int = 0): + """ + Function filter_fastq(input_fastq, output_fastq, gc_bounds,length_bounds, + quality_threshold) returns new file with fastq sequnces according to + filtering parameteres. + Function create new file in the 'filtered' directory in same directory as + input_fastq. + Function get 5 arguments: + input_fastq -absolut path (row line r'path') to input fastq file; + output_fastq - name of output file with filtered fastq seqs. + Write new file in 'filtered' directory in the same directory as + input_file. + gc_bounds - interval of quantity of GC in percenst, default + gc_bound = (0, 100)from 0% to 100%, if gc_bound = (20, 80)- + from 20% to 80% including. In case of one number, it counts + that it is high limit, gc_bound = 45 - quantity of GC lower or + equal 45%; + length_bounds - interval of read length, default + length_bounds = (0, 2**32),from 0 to 2 in power 32. + In case of one number, it counts that it is high limit, + length_bounds = 32 - from 0 to 32; + quality_threshold - treshhold for quality of mean read, + default 0(scale Phred33). Reads with quality lower then + treshold are decline. + """ + if not os.path.isfile(input_fastq): + logger.error("ERROR: Can not find file %s", input_fastq) + raise Exception(f'Can not find file {input_fastq}') + + parent_dir = os.path.dirname(input_fastq) + new_file_path = os.path.join(parent_dir, 'filtered', output_fastq) + + c = 0 + + with open(new_file_path, "w") as output: + for rec in Bio.SeqIO.parse(input_fastq, "fastq"): + seq = rec.seq + quality = rec.letter_annotations["phred_quality"] + + cond1 = _in_bounds(len(seq), length_bounds) + cond2 = _in_bounds(Bio.SeqUtils.gc_fraction(seq) * 100, gc_bounds) + cond3 = sum(quality) / len(quality) >= quality_threshold + if cond1 and cond2 and cond3: + Bio.SeqIO.write(rec, output, "fastq") + c += 1 + + logger.info("INFO from %s to %s filteren %s records", input_fastq, output_fastq, c) + + +class BiologicalSequence: + def __len__(self): + pass + def __getitem__(self, index): + pass + def __str__(self): + pass + def check_alphabet(self): + pass + +class NucleicAcidSequence(BiologicalSequence): + def __init__(self, seq): + self.seq = seq + def complement(self): + res = [] + for c in self.seq: + res.append(self.complement_dict[c]) + return self.__class__("".join(res)) + def reverse(self): + return self.__class__(self.seq[::-1]) + def reverse_complement(self): + return self.complement().reverse() + def __len__(self): + return len(self.seq) + def __getitem__(self, index): + return self.seq[index] + def __str__(self): + return self.seq + def check_alphabet(self): + valid = set(self.complement_dict.keys()) + for c in self.seq: + if c not in valid: + return False + return True + + +class DNASequence(NucleicAcidSequence): + complement_dict = {'T': 'A', 'A': 'T', + 'a': 't', 't': 'a', + 'G': 'C', 'C': 'G', + 'g': 'c', 'c': 'g'} + + def transcribe(self): + s = self.seq.replace("T", "U") + s = s.replace("t", "u") + return RNASequence(s) + + +class RNASequence(NucleicAcidSequence): + complement_dict = {'G': 'C', 'C': 'G', + 'g': 'c', 'c': 'g', + 'A': 'U', 'U': 'A', + 'a': 'u', 'u': 'a'} + + +class AminoAcidSequence(BiologicalSequence): + def __init__(self, seq): + self.seq = seq + def __len__(self): + return len(self.seq) + def __getitem__(self, index): + return self.seq[index] + def __str__(self): + return "".join(self.seq) + def check_alphabet(self): + valid = set("ABCDEFGHIJKLMNOPQRSTUVWYZ") + for c in self.seq: + if c not in valid: + return False + return True + + + +@click.group() +def main(): + pass + + +@click.command() +@click.option("-q", "--quality", default=0, type=click.IntRange(min=0, max_open=True)) +@click.option("-gc-low", default=0, type=click.IntRange(min=0, max=100)) +@click.option("-gc-high", default=100, type=click.IntRange(min=0, max=100)) +@click.option("-len-low", default=0, type=click.IntRange(min=0, max=2**32)) +@click.option("-len-high", default=2**32, type=click.IntRange(min=0, max=2**32)) +@click.argument("infile") +@click.argument("outfile") +def filter(quality, gc_low, gc_high, len_low, len_high, infile, outfile): + try: + filter_fastq(infile, outfile, + gc_bounds=(gc_low, gc_high), + length_bounds=(len_low, len_high), + quality_threshold=quality) + except Exception as e: + print(e, file=sys.stderr) + + +@click.command() +@click.argument("cmd", type=click.Choice(["compl", "reverse", "complreverse", "transcribe"])) +@click.argument("seq") +def dna(cmd, seq): + x = DNASequence(seq) + if not x.check_alphabet(): + print("invalid DNA sequence", file=sys.stderr) + return 1 + if cmd == "compl": + print(x.complement()) + elif cmd == "reverse": + print(x.reverse()) + elif cmd == "complreverse": + print(x.reverse_complement()) + elif cmd == "transcribe": + print(x.transcribe()) + + +@click.command() +@click.argument("cmd", type=click.Choice(["compl", "reverse", "complreverse"])) +@click.argument("seq") +def rna(cmd, seq): + x = RNASequence(seq) + if not x.check_alphabet(): + print("invalid RNA sequence", file=sys.stderr) + return 1 + if cmd == "compl": + print(x.complement()) + elif cmd == "reverse": + print(x.reverse()) + elif cmd == "complreverse": + print(x.reverse_complement()) + + +main.add_command(filter) +main.add_command(dna) +main.add_command(rna) + + +if __name__ == "__main__": + main() \ No newline at end of file From 6812f609a561f23b71a1a7a33dd274fcf331a32a Mon Sep 17 00:00:00 2001 From: NataKich Date: Sat, 3 May 2025 10:16:54 +0300 Subject: [PATCH 2/4] a new script --- fastq_filter_test.py | 93 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) create mode 100644 fastq_filter_test.py diff --git a/fastq_filter_test.py b/fastq_filter_test.py new file mode 100644 index 0000000..3a22385 --- /dev/null +++ b/fastq_filter_test.py @@ -0,0 +1,93 @@ +import unittest +import fastq_filter as genom +import tempfile +import os + + +class DnaTests(unittest.TestCase): + def test_check_alphabet(self): + dna = genom.DNASequence("ATGC") + self.assertTrue(dna.check_alphabet()) + dna = genom.DNASequence("ATGCU") + self.assertFalse(dna.check_alphabet()) + + def test_complement(self): + dna = genom.DNASequence("ATGC") + compl = dna.complement() + self.assertEqual(compl.seq, "TACG") + + def test_reverse(self): + dna = genom.DNASequence("ATGC") + rev = dna.reverse() + self.assertEqual(rev.seq, "CGTA") + + +class RnaTests(unittest.TestCase): + def test_check_alphabet(self): + rna = genom.RNASequence("AUGC") + self.assertTrue(rna.check_alphabet()) + rna = genom.RNASequence("ATGCU") + self.assertFalse(rna.check_alphabet()) + + def test_complement(self): + rna = genom.RNASequence("AUGC") + compl = rna.complement() + self.assertEqual(compl.seq, "UACG") + + def test_reverse(self): + rna = genom.RNASequence("AUGC") + rev = rna.reverse() + self.assertEqual(rev.seq, "CGUA") + + +class FilterTests(unittest.TestCase): + seq = """@SRR1363257.37 GWZHISEQ01:153:C1W31ACXX:5:1101:14027:2198 length=101 +GGTTGCAGATTCGCAGTGTCGCTGTTCCAGCGCATCACATCTTTGATGTTCACGCCGTGGCGTTTAGCAATGCTTGAAAGCGAATCGCCTTTGCCCACACG ++ +@?:=:;DBFADH;CAECEE@@E:FFHGAE4?C?DE?>FHE4BFFIIFHIBABEECA83;>>@>@CCCDC9@@CC08<@?@BB@9:CC####### +""" + + def test_filter_no_file(self): + with self.assertRaises(Exception): + # where is no such file + genom.filter_fastq("---------------") + + def test_filter_id(self): + name_in = tempfile.mktemp(suffix=".fastq") + name_out = tempfile.mktemp(suffix=".fastq") + try: + with open(name_in, "w") as f: + f.write(self.seq) + genom.filter_fastq(name_in, name_out) + self.assertTrue(os.path.isfile(name_out)) + + with open(name_out, "r") as f: + res = f.read() + self.assertTrue(res.strip() == self.seq.strip()) + finally: + if os.path.isfile(name_in): + os.remove(name_in) + if os.path.isfile(name_out): + os.remove(name_out) + + def test_filter_all(self): + name_in = tempfile.mktemp(suffix=".fastq") + name_out = tempfile.mktemp(suffix=".fastq") + try: + with open(name_in, "w") as f: + f.write(self.seq) + genom.filter_fastq(name_in, name_out, length_bounds=(102, 1000)) + self.assertTrue(os.path.isfile(name_out)) + + with open(name_out, "r") as f: + res = f.read() + self.assertTrue(res.strip() == "") + finally: + if os.path.isfile(name_in): + os.remove(name_in) + if os.path.isfile(name_out): + os.remove(name_out) + + +if __name__ == "__main__": + unittest.main() \ No newline at end of file From 5c1fda3d644fc3a55a520b1b89128fb119b518f4 Mon Sep 17 00:00:00 2001 From: NataKich Date: Sat, 3 May 2025 10:18:35 +0300 Subject: [PATCH 3/4] a new repository structure --- genom.py | 132 ------------------------------------------------------- 1 file changed, 132 deletions(-) delete mode 100644 genom.py diff --git a/genom.py b/genom.py deleted file mode 100644 index 2a6d89f..0000000 --- a/genom.py +++ /dev/null @@ -1,132 +0,0 @@ -import Bio.SeqIO -import Bio.SeqUtils -import os - - -def _in_bounds(value, bounds): - if type(bounds) is int: - bounds = [0, bounds] - return bounds[0] <= value <= bounds[1] - - -def filter_fastq(input_fastq: str, output_fastq: str, - gc_bounds: int | tuple = (0, 100), - length_bounds: int | tuple = (0, 2**32), - quality_threshold: int = 0) -> dict: - """ - Function filter_fastq(input_fastq, output_fastq, gc_bounds,length_bounds, - quality_threshold) returns new file with fastq sequnces according to - filtering parameteres. - Function create new file in the 'filtered' directory in same directory as - input_fastq. - Function get 5 arguments: - input_fastq -absolut path (row line r'path') to input fastq file; - output_fastq - name of output file with filtered fastq seqs. - Write new file in 'filtered' directory in the same directory as - input_file. - gc_bounds - interval of quantity of GC in percenst, default - gc_bound = (0, 100)from 0% to 100%, if gc_bound = (20, 80)- - from 20% to 80% including. In case of one number, it counts - that it is high limit, gc_bound = 45 - quantity of GC lower or - equal 45%; - length_bounds - interval of read length, default - length_bounds = (0, 2**32),from 0 to 2 in power 32. - In case of one number, it counts that it is high limit, - length_bounds = 32 - from 0 to 32; - quality_threshold - treshhold for quality of mean read, - default 0(scale Phred33). Reads with quality lower then - treshold are decline. - """ - parent_dir = os.path.dirname(input_fastq) - new_file_path = os.path.join(parent_dir, 'filtered', output_fastq) - '''Checks the name of new file is no the same as name of other files - in directory. If it is the same, returns 'Please insert other name - for output file, file is exsist' - ''' - if os.path.isfile(new_file_path): - return 'Please insert other name for output file, file is exsist' - - with open(new_file_path, "w") as output: - for rec in Bio.SeqIO.parse(input, "fastq"): - seq = rec.seq - quality = rec.letter_annotations["phred_quality"] - - cond1 = _in_bounds(len(seq), length_bounds) - cond2 = _in_bounds(Bio.SeqUtils.gc_fraction(seq) * 100, gc_bounds) - cond3 = sum(quality) / len(quality) >= quality_threshold - if cond1 and cond2 and cond3: - Bio.SeqIO.write(rec, output, "fastq") - - return 'Filtering is complete' - - -class BiologicalSequence: - def __len__(self): - pass - def __getitem__(self, index): - pass - def __str__(self): - pass - def check_alphabet(self): - pass - -class NucleicAcidSequence(BiologicalSequence): - def __init__(self, seq): - self.seq = seq - def complement(self): - res = [] - for c in self.seq: - res.append(self.complement_dict[c]) - return self.__class__("".join(res)) - def reverse(self): - return self.__class__(self.seq[::-1]) - def reverse_complement(self): - return self.complement().reverse() - def __len__(self): - return len(self.seq) - def __getitem__(self, index): - return self.seq[index] - def __str__(self): - return self.seq - def check_alphabet(self): - valid = set(self.complement_dict.keys()) - for c in self.seq: - if c not in valid: - return False - return True - - -class DNASequence(NucleicAcidSequence): - complement_dict = {'T': 'A', 'A': 'T', - 'a': 't', 't': 'a', - 'G': 'C', 'C': 'G', - 'g': 'c', 'c': 'g'} - - def transcribe(self): - s = self.seq.replace("T", "U") - s = s.replace("t", "u") - return RNASequence(s) - - -class RNASequence(NucleicAcidSequence): - complement_dict = {'G': 'C', 'C': 'G', - 'g': 'c', 'c': 'g', - 'A': 'U', 'U': 'A', - 'a': 'u', 'u': 'a'} - - -class AminoAcidSequence(BiologicalSequence): - def __init__(self, seq): - self.seq = seq - def __len__(self): - return len(self.seq) - def __getitem__(self, index): - return self.seq[index] - def __str__(self): - return "".join(self.seq) - def check_alphabet(self): - valid = set("ABCDEFGHIJKLMNOPQRSTUVWYZ") - for c in self.seq: - if c not in valid: - return False - return True From e644cc52416c49f37ed583ca4e014e69001658cb Mon Sep 17 00:00:00 2001 From: NataKich Date: Sat, 3 May 2025 10:25:07 +0300 Subject: [PATCH 4/4] a new file --- requirements.txt | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..73ad1d9 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +biopython==1.85 +click==8.1.8