From 4af5968e4adf389c7168892c3367becabe0d821b Mon Sep 17 00:00:00 2001
From: NataKich <natastarosto4ka@gmail.com>
Date: Sat, 3 May 2025 10:16:21 +0300
Subject: [PATCH 1/4] a new script

---
 fastq_filter.py | 206 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 206 insertions(+)
 create mode 100644 fastq_filter.py

diff --git a/fastq_filter.py b/fastq_filter.py
new file mode 100644
index 0000000..17f25d5
--- /dev/null
+++ b/fastq_filter.py
@@ -0,0 +1,206 @@
+import Bio.SeqIO
+import Bio.SeqUtils
+import os
+import click
+import logging
+import sys
+
+
+logger = logging.getLogger()
+
+
+def _in_bounds(value, bounds):
+    if type(bounds) is int:
+        bounds = [0, bounds]
+    return bounds[0] <= value <= bounds[1]
+
+
+def filter_fastq(input_fastq: str, output_fastq: str,
+                 gc_bounds: int | tuple = (0, 100),
+                 length_bounds: int | tuple = (0, 2**32),
+                 quality_threshold: int = 0):
+    """
+    Function filter_fastq(input_fastq, output_fastq, gc_bounds,length_bounds,
+    quality_threshold) returns new file with fastq sequnces according to
+    filtering parameteres.
+    Function create new file in the 'filtered' directory in same directory as
+    input_fastq.
+    Function get 5 arguments:
+        input_fastq -absolut path (row line r'path') to input fastq file;
+        output_fastq - name of output file with filtered fastq seqs.
+        Write new file in 'filtered' directory in the same directory as
+        input_file.
+        gc_bounds - interval of quantity of GC in percenst, default
+        gc_bound = (0, 100)from 0% to 100%, if gc_bound = (20, 80)-
+        from 20% to 80% including. In case of one number, it counts
+        that it is high limit, gc_bound = 45 - quantity of GC lower or
+        equal 45%;
+        length_bounds - interval of read length, default
+        length_bounds = (0, 2**32),from 0 to 2 in power 32.
+        In case of one number, it counts that it is high limit,
+        length_bounds = 32 - from 0 to 32;
+        quality_threshold - treshhold for quality of mean read,
+        default 0(scale Phred33). Reads with quality lower then
+        treshold are decline.
+    """
+    if not os.path.isfile(input_fastq):
+        logger.error("ERROR: Can not find file %s", input_fastq)
+        raise Exception(f'Can not find file {input_fastq}')     
+
+    parent_dir = os.path.dirname(input_fastq)
+    new_file_path = os.path.join(parent_dir, 'filtered', output_fastq)
+
+    c = 0
+    
+    with open(new_file_path, "w") as output:
+        for rec in Bio.SeqIO.parse(input_fastq, "fastq"):
+            seq = rec.seq
+            quality = rec.letter_annotations["phred_quality"]
+
+            cond1 = _in_bounds(len(seq), length_bounds)
+            cond2 = _in_bounds(Bio.SeqUtils.gc_fraction(seq) * 100, gc_bounds)
+            cond3 = sum(quality) / len(quality) >= quality_threshold
+            if cond1 and cond2 and cond3:
+                Bio.SeqIO.write(rec, output, "fastq")
+                c += 1
+
+    logger.info("INFO from %s to %s filteren %s records", input_fastq, output_fastq, c)
+
+
+class BiologicalSequence:
+    def __len__(self):
+        pass
+    def __getitem__(self, index):
+        pass
+    def __str__(self):
+        pass
+    def check_alphabet(self):
+        pass
+
+class NucleicAcidSequence(BiologicalSequence):
+    def __init__(self, seq):
+        self.seq = seq
+    def complement(self):
+        res = []
+        for c in self.seq:
+            res.append(self.complement_dict[c])
+        return self.__class__("".join(res))
+    def reverse(self):
+        return self.__class__(self.seq[::-1])
+    def reverse_complement(self):
+        return self.complement().reverse()    
+    def __len__(self):
+        return len(self.seq)
+    def __getitem__(self, index):
+        return self.seq[index]
+    def __str__(self):
+        return self.seq
+    def check_alphabet(self):
+        valid = set(self.complement_dict.keys())
+        for c in self.seq:
+            if c not in valid:
+                return False
+        return True
+
+
+class DNASequence(NucleicAcidSequence):
+    complement_dict = {'T': 'A', 'A': 'T',
+                'a': 't', 't': 'a',
+                'G': 'C', 'C': 'G',
+                'g': 'c', 'c': 'g'}
+    
+    def transcribe(self):
+        s = self.seq.replace("T", "U")
+        s = s.replace("t", "u")
+        return RNASequence(s)
+
+
+class RNASequence(NucleicAcidSequence):
+    complement_dict = {'G': 'C', 'C': 'G',
+            'g': 'c', 'c': 'g',
+            'A': 'U', 'U': 'A',
+            'a': 'u', 'u': 'a'}
+
+
+class AminoAcidSequence(BiologicalSequence):
+    def __init__(self, seq):
+        self.seq = seq
+    def __len__(self):
+        return len(self.seq)
+    def __getitem__(self, index):
+        return self.seq[index]
+    def __str__(self):
+        return "".join(self.seq)
+    def check_alphabet(self):
+        valid = set("ABCDEFGHIJKLMNOPQRSTUVWYZ")
+        for c in self.seq:
+            if c not in valid:
+                return False
+        return True
+
+
+
+@click.group()
+def main():
+    pass
+
+
+@click.command()
+@click.option("-q", "--quality", default=0, type=click.IntRange(min=0, max_open=True))
+@click.option("-gc-low", default=0, type=click.IntRange(min=0, max=100))
+@click.option("-gc-high", default=100, type=click.IntRange(min=0, max=100))
+@click.option("-len-low", default=0, type=click.IntRange(min=0, max=2**32))
+@click.option("-len-high", default=2**32, type=click.IntRange(min=0, max=2**32))
+@click.argument("infile")
+@click.argument("outfile")
+def filter(quality, gc_low, gc_high, len_low, len_high, infile, outfile):
+    try:
+        filter_fastq(infile, outfile, 
+                 gc_bounds=(gc_low, gc_high), 
+                 length_bounds=(len_low, len_high), 
+                 quality_threshold=quality)
+    except Exception as e:
+        print(e, file=sys.stderr)
+
+
+@click.command()
+@click.argument("cmd", type=click.Choice(["compl", "reverse", "complreverse", "transcribe"]))
+@click.argument("seq")
+def dna(cmd, seq):
+    x = DNASequence(seq)
+    if not x.check_alphabet():
+        print("invalid DNA sequence", file=sys.stderr)
+        return 1
+    if cmd == "compl":
+        print(x.complement())
+    elif cmd == "reverse":
+        print(x.reverse())
+    elif cmd == "complreverse":
+        print(x.reverse_complement())
+    elif cmd == "transcribe":
+        print(x.transcribe())
+
+
+@click.command()
+@click.argument("cmd", type=click.Choice(["compl", "reverse", "complreverse"]))
+@click.argument("seq")
+def rna(cmd, seq):
+    x = RNASequence(seq)
+    if not x.check_alphabet():
+        print("invalid RNA sequence", file=sys.stderr)
+        return 1
+    if cmd == "compl":
+        print(x.complement())
+    elif cmd == "reverse":
+        print(x.reverse())
+    elif cmd == "complreverse":
+        print(x.reverse_complement())
+
+
+main.add_command(filter)
+main.add_command(dna)
+main.add_command(rna)
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file

From 6812f609a561f23b71a1a7a33dd274fcf331a32a Mon Sep 17 00:00:00 2001
From: NataKich <natastarosto4ka@gmail.com>
Date: Sat, 3 May 2025 10:16:54 +0300
Subject: [PATCH 2/4] a new script

---
 fastq_filter_test.py | 93 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 93 insertions(+)
 create mode 100644 fastq_filter_test.py

diff --git a/fastq_filter_test.py b/fastq_filter_test.py
new file mode 100644
index 0000000..3a22385
--- /dev/null
+++ b/fastq_filter_test.py
@@ -0,0 +1,93 @@
+import unittest
+import fastq_filter as genom
+import tempfile
+import os
+
+
+class DnaTests(unittest.TestCase):
+    def test_check_alphabet(self):
+        dna = genom.DNASequence("ATGC")
+        self.assertTrue(dna.check_alphabet())
+        dna = genom.DNASequence("ATGCU")
+        self.assertFalse(dna.check_alphabet())
+
+    def test_complement(self):
+        dna = genom.DNASequence("ATGC")
+        compl = dna.complement()
+        self.assertEqual(compl.seq, "TACG")
+
+    def test_reverse(self):
+        dna = genom.DNASequence("ATGC")
+        rev = dna.reverse()
+        self.assertEqual(rev.seq, "CGTA")
+
+
+class RnaTests(unittest.TestCase):
+    def test_check_alphabet(self):
+        rna = genom.RNASequence("AUGC")
+        self.assertTrue(rna.check_alphabet())
+        rna = genom.RNASequence("ATGCU")
+        self.assertFalse(rna.check_alphabet())
+
+    def test_complement(self):
+        rna = genom.RNASequence("AUGC")
+        compl = rna.complement()
+        self.assertEqual(compl.seq, "UACG")
+
+    def test_reverse(self):
+        rna = genom.RNASequence("AUGC")
+        rev = rna.reverse()
+        self.assertEqual(rev.seq, "CGUA")
+
+
+class FilterTests(unittest.TestCase):
+    seq = """@SRR1363257.37 GWZHISEQ01:153:C1W31ACXX:5:1101:14027:2198 length=101
+GGTTGCAGATTCGCAGTGTCGCTGTTCCAGCGCATCACATCTTTGATGTTCACGCCGTGGCGTTTAGCAATGCTTGAAAGCGAATCGCCTTTGCCCACACG
++
+@?:=:;DBFADH;CAECEE@@E:FFHGAE4?C?DE<BFGEC>?>FHE4BFFIIFHIBABEECA83;>>@>@CCCDC9@@CC08<@?@BB@9:CC#######
+"""
+
+    def test_filter_no_file(self):
+        with self.assertRaises(Exception):
+            # where is no such file
+            genom.filter_fastq("---------------")
+
+    def test_filter_id(self):
+        name_in = tempfile.mktemp(suffix=".fastq")
+        name_out = tempfile.mktemp(suffix=".fastq")
+        try:
+            with open(name_in, "w") as f:
+                f.write(self.seq)
+            genom.filter_fastq(name_in, name_out)
+            self.assertTrue(os.path.isfile(name_out))
+
+            with open(name_out, "r") as f:
+                res = f.read()
+            self.assertTrue(res.strip() == self.seq.strip())
+        finally:
+            if os.path.isfile(name_in):
+                os.remove(name_in)
+            if os.path.isfile(name_out):
+                os.remove(name_out)
+        
+    def test_filter_all(self):
+        name_in = tempfile.mktemp(suffix=".fastq")
+        name_out = tempfile.mktemp(suffix=".fastq")
+        try:
+            with open(name_in, "w") as f:
+                f.write(self.seq)
+            genom.filter_fastq(name_in, name_out, length_bounds=(102, 1000))
+            self.assertTrue(os.path.isfile(name_out))
+
+            with open(name_out, "r") as f:
+                res = f.read()
+            self.assertTrue(res.strip() == "")
+        finally:
+            if os.path.isfile(name_in):
+                os.remove(name_in)
+            if os.path.isfile(name_out):
+                os.remove(name_out)
+
+
+if __name__ == "__main__":
+    unittest.main()
\ No newline at end of file

From 5c1fda3d644fc3a55a520b1b89128fb119b518f4 Mon Sep 17 00:00:00 2001
From: NataKich <natastarosto4ka@gmail.com>
Date: Sat, 3 May 2025 10:18:35 +0300
Subject: [PATCH 3/4] a new repository structure

---
 genom.py | 132 -------------------------------------------------------
 1 file changed, 132 deletions(-)
 delete mode 100644 genom.py

diff --git a/genom.py b/genom.py
deleted file mode 100644
index 2a6d89f..0000000
--- a/genom.py
+++ /dev/null
@@ -1,132 +0,0 @@
-import Bio.SeqIO
-import Bio.SeqUtils
-import os
-
-
-def _in_bounds(value, bounds):
-    if type(bounds) is int:
-        bounds = [0, bounds]
-    return bounds[0] <= value <= bounds[1]
-
-
-def filter_fastq(input_fastq: str, output_fastq: str,
-                 gc_bounds: int | tuple = (0, 100),
-                 length_bounds: int | tuple = (0, 2**32),
-                 quality_threshold: int = 0) -> dict:
-    """
-    Function filter_fastq(input_fastq, output_fastq, gc_bounds,length_bounds,
-    quality_threshold) returns new file with fastq sequnces according to
-    filtering parameteres.
-    Function create new file in the 'filtered' directory in same directory as
-    input_fastq.
-    Function get 5 arguments:
-        input_fastq -absolut path (row line r'path') to input fastq file;
-        output_fastq - name of output file with filtered fastq seqs.
-        Write new file in 'filtered' directory in the same directory as
-        input_file.
-        gc_bounds - interval of quantity of GC in percenst, default
-        gc_bound = (0, 100)from 0% to 100%, if gc_bound = (20, 80)-
-        from 20% to 80% including. In case of one number, it counts
-        that it is high limit, gc_bound = 45 - quantity of GC lower or
-        equal 45%;
-        length_bounds - interval of read length, default
-        length_bounds = (0, 2**32),from 0 to 2 in power 32.
-        In case of one number, it counts that it is high limit,
-        length_bounds = 32 - from 0 to 32;
-        quality_threshold - treshhold for quality of mean read,
-        default 0(scale Phred33). Reads with quality lower then
-        treshold are decline.
-    """
-    parent_dir = os.path.dirname(input_fastq)
-    new_file_path = os.path.join(parent_dir, 'filtered', output_fastq)
-    '''Checks the name of new file is no the same as name of other files
-    in directory. If it is the same, returns 'Please insert other name
-    for output file, file is exsist'
-    '''
-    if os.path.isfile(new_file_path):
-        return 'Please insert other name for output file, file is exsist'
-    
-    with open(new_file_path, "w") as output:
-        for rec in Bio.SeqIO.parse(input, "fastq"):
-            seq = rec.seq
-            quality = rec.letter_annotations["phred_quality"]
-
-            cond1 = _in_bounds(len(seq), length_bounds)
-            cond2 = _in_bounds(Bio.SeqUtils.gc_fraction(seq) * 100, gc_bounds)
-            cond3 = sum(quality) / len(quality) >= quality_threshold
-            if cond1 and cond2 and cond3:
-                Bio.SeqIO.write(rec, output, "fastq")
-
-    return 'Filtering is complete'
-
-
-class BiologicalSequence:
-    def __len__(self):
-        pass
-    def __getitem__(self, index):
-        pass
-    def __str__(self):
-        pass
-    def check_alphabet(self):
-        pass
-
-class NucleicAcidSequence(BiologicalSequence):
-    def __init__(self, seq):
-        self.seq = seq
-    def complement(self):
-        res = []
-        for c in self.seq:
-            res.append(self.complement_dict[c])
-        return self.__class__("".join(res))
-    def reverse(self):
-        return self.__class__(self.seq[::-1])
-    def reverse_complement(self):
-        return self.complement().reverse()    
-    def __len__(self):
-        return len(self.seq)
-    def __getitem__(self, index):
-        return self.seq[index]
-    def __str__(self):
-        return self.seq
-    def check_alphabet(self):
-        valid = set(self.complement_dict.keys())
-        for c in self.seq:
-            if c not in valid:
-                return False
-        return True
-
-
-class DNASequence(NucleicAcidSequence):
-    complement_dict = {'T': 'A', 'A': 'T',
-                'a': 't', 't': 'a',
-                'G': 'C', 'C': 'G',
-                'g': 'c', 'c': 'g'}
-    
-    def transcribe(self):
-        s = self.seq.replace("T", "U")
-        s = s.replace("t", "u")
-        return RNASequence(s)
-
-
-class RNASequence(NucleicAcidSequence):
-    complement_dict = {'G': 'C', 'C': 'G',
-            'g': 'c', 'c': 'g',
-            'A': 'U', 'U': 'A',
-            'a': 'u', 'u': 'a'}
-
-
-class AminoAcidSequence(BiologicalSequence):
-    def __init__(self, seq):
-        self.seq = seq
-    def __len__(self):
-        return len(self.seq)
-    def __getitem__(self, index):
-        return self.seq[index]
-    def __str__(self):
-        return "".join(self.seq)
-    def check_alphabet(self):
-        valid = set("ABCDEFGHIJKLMNOPQRSTUVWYZ")
-        for c in self.seq:
-            if c not in valid:
-                return False
-        return True

From e644cc52416c49f37ed583ca4e014e69001658cb Mon Sep 17 00:00:00 2001
From: NataKich <natastarosto4ka@gmail.com>
Date: Sat, 3 May 2025 10:25:07 +0300
Subject: [PATCH 4/4] a new file

---
 requirements.txt | 2 ++
 1 file changed, 2 insertions(+)
 create mode 100644 requirements.txt

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..73ad1d9
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+biopython==1.85
+click==8.1.8