From 6c194e88eabc3142fb723dfad9584abe5321342a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=94=D0=B0=D1=88=D0=B0=20=D0=91=D0=BE=D0=B1=D1=80=D0=BE?=
 =?UTF-8?q?=D0=B2=D0=B0?= <bobrova2116@gmail.com>
Date: Sat, 11 Oct 2025 23:46:51 +0300
Subject: [PATCH 1/5] HW4: simplify functions, create a common utility for DNA
 and RNA

---
 BioToolkit.py                            | 51 ++++-----------
 tools/{dna_tools.py => dna_rna_tools.py} | 56 ++++++++++++-----
 tools/other_tools.py                     |  2 +
 tools/rna_tools.py                       | 79 ------------------------
 4 files changed, 54 insertions(+), 134 deletions(-)
 rename tools/{dna_tools.py => dna_rna_tools.py} (54%)
 delete mode 100644 tools/rna_tools.py

diff --git a/BioToolkit.py b/BioToolkit.py
index 4752717..3476498 100644
--- a/BioToolkit.py
+++ b/BioToolkit.py
@@ -1,19 +1,8 @@
 from tools.utils import gc_content, mean_quality
 
-from tools.dna_tools import (
-    is_dna, 
-    transcribe, 
-    reverse, 
-    complement, 
-    reverse_complement
-)
-
-from tools.rna_tools import (
-    is_rna,
-    reverse_transcribe,
-    reverse as rna_reverse,
-    complement as rna_complement,
-    reverse_complement as rna_reverse_complement
+from tools.rna_dna_tools import (
+    is_dna, is_rna, transcribe, reverse_transcribe,
+    reverse, complement, reverse_complement
 )
 
 def run_dna_rna_tools(*args):
@@ -30,7 +19,6 @@ def run_dna_rna_tools(*args):
     *sequences, procedure = args
     results = []
 
-
     for seq in sequences:
         if procedure == "is_nucleic_acid":
             results.append(is_nucleic_acid(seq))
@@ -39,30 +27,13 @@ def run_dna_rna_tools(*args):
         elif procedure == "reverse_transcribe":
             results.append(reverse_transcribe(seq))
         elif procedure == "reverse":
-            if is_dna(seq):
-                results.append(reverse(seq))
-            elif is_rna(seq):
-                results.append(rna_reverse(seq))
-            else:
-                raise ValueError("Sequence should be DNA or RNA.")
+            results.append(reverse(seq))
         elif procedure == "complement":
-            if is_dna(seq):
-                results.append(complement(seq))
-            elif is_rna(seq):
-                results.append(rna_complement(seq))
-            else:
-                raise ValueError("Sequence should be DNA or RNA.")
+            results.append(complement(seq))
         elif procedure == "reverse_complement":
-            if is_dna(seq):
-                results.append(reverse_complement(seq))
-            elif is_rna(seq):
-                results.append(rna_reverse_complement(seq))
-            else:
-                raise ValueError("Sequence should be DNA or RNA.")
+            results.append(reverse_complement(seq))
         else:
-            raise ValueError(
-                f"There is no such procedure in the function: {procedure}"
-            )
+            raise ValueError(f"There is no such procedure in the function: {procedure}")
 
     return results[0] if len(results) == 1 else results
 
@@ -82,13 +53,13 @@ def filter_fastq(seqs: dict,
     """
 
     filtered = {}
-    
-    if type(gc_bounds) in (int, float):
+
+    if isinstance(gc_bounds, (int, float)):
         gc_min, gc_max = 0, float(gc_bounds)
     else:
         gc_min, gc_max = gc_bounds
-
-    if type(length_bounds) in (int, float):
+        
+    if isinstance(length_bounds, (int, float)):
         len_min, len_max = 0, int(length_bounds)
     else:
         len_min, len_max = length_bounds
diff --git a/tools/dna_tools.py b/tools/dna_rna_tools.py
similarity index 54%
rename from tools/dna_tools.py
rename to tools/dna_rna_tools.py
index 566f06d..e6e06a3 100644
--- a/tools/dna_tools.py
+++ b/tools/dna_rna_tools.py
@@ -10,6 +10,18 @@ def is_dna(seq: str) -> bool:
     return all(base in "aAtTgGcC" for base in seq)
 
 
+def is_rna(seq: str) -> bool:
+    """
+    Check whether a sequence consists solely of RNA bases (any case).
+
+    Args: seq (str): Input string to check.
+    Returns: bool: True if every character is one of A, U, G, C
+        (upper or lower case); an empty string yields True.
+    """
+    rna_alphabet = "aAuUgGcC"
+    return not any(base not in rna_alphabet for base in seq)
+
+
 def transcribe(seq: str) -> str:
     """
     A function that transcribes DNA into RNA.
@@ -21,12 +33,29 @@ def transcribe(seq: str) -> str:
     """
     if not is_dna(seq):
         raise ValueError(
-            "This sequence cannot be transcribed because it is not DNA. "
+            "This sequence cannot be transcribed because it is not DNA."
             "For retroviruses, use reverse_transcribe."
         )
     return seq.replace("T", "U").replace("t", "u")
 
 
+def reverse_transcribe(seq: str) -> str:
+    """
+    Reverse-transcribe an RNA sequence into DNA, keeping the letter case.
+
+    Args: seq (str): RNA sequence to be reverse transcribed.
+
+    Returns: str: DNA sequence where U/u are replaced by T/t.
+    Raises: ValueError: if seq has characters other than A, U, G, C.
+    """
+    if not is_rna(seq):
+        raise ValueError(
+            "This sequence cannot be reverse transcribed: it is not RNA. "
+            "For DNA use transcribe."
+        )
+    return seq.replace("U", "T").replace("u", "t")
+
+
 def reverse(seq: str) -> str:
     """
     A function that reverse sequence.
@@ -48,21 +77,21 @@ def complement(seq: str) -> str:
     Returns: str: Complementary sequence.
 
     """
-    dna = {
-        "A": "T",
-        "T": "A",
-        "G": "C",
-        "C": "G",
-        "a": "t",
-        "t": "a",
-        "g": "c",
-        "c": "g",
+    dna_table = {
+        "A": "T", "T": "A", "G": "C", "C": "G",
+        "a": "t", "t": "a", "g": "c", "c": "g",
+    }
+    rna_table = {
+        "A": "U", "U": "A", "G": "C", "C": "G",
+        "a": "u", "u": "a", "g": "c", "c": "g",
     }
 
     if is_dna(seq):
-        return "".join(dna[base] for base in seq)
+        return "".join(dna_table[base] for base in seq)
+    elif is_rna(seq):
+        return "".join(rna_table[base] for base in seq)
     else:
-        raise ValueError("Sequence should be DNA.")
+        raise ValueError("Sequence should be DNA or RNA.")
 
 
 def reverse_complement(seq: str) -> str:
@@ -74,7 +103,4 @@ def reverse_complement(seq: str) -> str:
     Returns: str: Complement sequence written in reverse order.
 
     """
-    if not is_dna(seq):
-        raise ValueError("Sequence should be DNA.")
-
     return reverse(complement(seq))
diff --git a/tools/other_tools.py b/tools/other_tools.py
index 07166c3..d41342c 100644
--- a/tools/other_tools.py
+++ b/tools/other_tools.py
@@ -1,3 +1,5 @@
+from tools.rna_dna_tools import is_dna, is_rna
+
 def is_nucleic_acid(seq: str) -> bool:
     """
     A function that checks whether DNA and RNA have mixed.
diff --git a/tools/rna_tools.py b/tools/rna_tools.py
deleted file mode 100644
index 926bf5c..0000000
--- a/tools/rna_tools.py
+++ /dev/null
@@ -1,79 +0,0 @@
-def is_rna(seq: str) -> bool:
-    """
-    A function for checking whether a sequence is a RNA sequence
-
-    Args: seq (str): Input string to check.
-
-    Returns: bool: True if the sequence contains only A, U, G, C.
-
-    """
-    return all(base in "aAuUgGcC" for base in seq)
-
-def reverse_transcribe(seq: str) -> str:
-    """
-    A function that reverse-transcribes RNA into DNA.
-
-    Args: seq (str): Input string to be reverse transcribed.
-
-    Returns: str: DNA sequence where U are replaced by T.
-
-    """
-    if not is_rna(seq):
-        raise ValueError(
-            "This sequence cannot be transcribed because it is not RNA."
-            "For DNA use transcribe"
-        )
-    return seq.replace("U", "T").replace("u", "t")
-
-
-def reverse(seq: str) -> str:
-    """
-    A function that reverse sequence.
-
-    Args: seq (str): Input string to be reversed.
-
-    Returns: str: Sequence written in reverse order.
-
-    """
-    return seq[::-1]
-
-
-def complement(seq: str) -> str:
-    """
-    Return the complementary RNA sequence.
-
-    Args: seq (str): RNA sequence.
-
-    Returns: str: Complementary sequence.
-
-    """
-    rna = {
-        "A": "U",
-        "U": "A",
-        "G": "C",
-        "C": "G",
-        "a": "u",
-        "u": "a",
-        "g": "c",
-        "c": "g",
-    }
-
-    if is_rna(seq):
-        return "".join(rna[base] for base in seq)
-    else:
-        raise ValueError("Sequence should be RNA.")
-
-
-def reverse_complement(seq: str) -> str:
-    """
-    A function that reverse complement.
-
-    Args: seq (str): Input string to be reversed.
-
-    Returns: str: Complement sequence written in reverse order.
-
-    """
-    if not is_rna(seq):
-        raise ValueError("Sequence should b DNA or RNA.")
-
-    return reverse(complement(seq))

From 2b7aa283c056dac69826d6f17003125134341789 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=94=D0=B0=D1=88=D0=B0=20=D0=91=D0=BE=D0=B1=D1=80=D0=BE?=
 =?UTF-8?q?=D0=B2=D0=B0?= <bobrova2116@gmail.com>
Date: Sun, 12 Oct 2025 22:40:11 +0300
Subject: [PATCH 2/5] HW5: add bio_files_processor

---
 bio_files_processor.py | 123 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 123 insertions(+)
 create mode 100644 bio_files_processor.py

diff --git a/bio_files_processor.py b/bio_files_processor.py
new file mode 100644
index 0000000..d6d5965
--- /dev/null
+++ b/bio_files_processor.py
@@ -0,0 +1,123 @@
+def convert_multiline_fasta_to_oneline(input_fasta, output_fasta=None):
+    """
+    Convert a FASTA file whose sequences are split across multiple
+    lines into a file where each sequence is on a single line.
+
+    Args:
+        input_fasta (str): path to the input file.
+        output_fasta (str, optional): path for the converted file;
+            defaults to "converted_<input name>" in the working directory.
+
+    Returns:
+        str: path to the converted FASTA file.
+    """
+    import os  # local import: this module has no top-level imports
+
+    if output_fasta is None:
+        output_fasta = "converted_" + os.path.basename(input_fasta)
+
+    with open(input_fasta, "r") as infile, open(output_fasta, "w") as outfile:
+        header = None
+        chunks = []  # pieces of the current record's sequence
+
+        for line in infile:
+            line = line.strip()
+            if not line:
+                continue  # blank lines carry no data
+            if line.startswith(">"):
+                # A new header closes the previous record, if there is one.
+                if header is not None:
+                    outfile.write(f"{header}\n{''.join(chunks)}\n")
+                header = line
+                chunks = []
+            else:
+                chunks.append(line)
+
+        if header is not None:  # flush the last record
+            outfile.write(f"{header}\n{''.join(chunks)}\n")
+
+    return output_fasta
+
+
+def parse_blast_output(input_file, output_file):
+    """
+    Collect, for each "Sequences producing significant alignments"
+    marker in a BLAST text report, the line directly following it,
+    and save the collected lines in sorted order.
+
+    Args:
+        input_file (str): path to the BLAST file.
+        output_file (str): path of the file to write, one entry per line.
+
+    Returns:
+        None
+    """
+
+    marker = "Sequences producing significant alignments"
+
+    with open(input_file, "r") as report:
+        stripped = [raw.strip() for raw in report.readlines()]
+
+    # Pair every line with its successor; a marker on the last line
+    # has no successor, and an empty successor is not recorded.
+    hits = [
+        following
+        for current, following in zip(stripped, stripped[1:])
+        if marker in current and following
+    ]
+    hits.sort()
+
+    with open(output_file, "w") as out:
+        for hit in hits:
+            out.write(hit + "\n")
+
+
+def select_genes_from_gbk_to_fasta(
+    input_gbk: str,
+    genes: "str | list[str]",
+    n_before: int = 1,
+    n_after: int = 1,
+    output_fasta: str = "selected_genes.fasta",
+) -> None:
+    """
+    Save proteins of the genes flanking the genes of interest as FASTA.
+
+    Args:
+        input_gbk: path to the GenBank file.
+        genes: name, or list of names, of the genes of interest.
+        n_before: number of translated genes to take before each match.
+        n_after: number of translated genes to take after each match.
+        output_fasta: path to the FASTA file to write.
+    """
+    with open(input_gbk, "r") as f:
+        all_lines = f.readlines()
+
+    all_genes = []
+    gene, protein = "", None  # protein is a list only inside /translation
+    for line in all_lines:
+        line = line.strip()  # qualifier lines are indented in GenBank files
+        if protein is None:
+            if line.startswith("/gene="):
+                gene = line.split('"')[1]
+            if not line.startswith("/translation="):
+                continue
+            protein = []  # the value may wrap over several lines
+            line = line.partition('"')[2]
+        protein.append(line.rstrip('"'))
+        if line.endswith('"'):  # the closing quote ends the qualifier
+            all_genes.append({"gene_name": gene, "protein": "".join(protein)})
+            gene, protein = "", None
+
+    genes_to_find = [genes] if isinstance(genes, str) else genes
+    genes_to_write = []
+    for index, g in enumerate(all_genes):
+        if g["gene_name"] in genes_to_find:
+            start_index = max(0, index - n_before)
+            end_index = min(len(all_genes), index + n_after + 1)
+            for k in range(start_index, end_index):
+                if k != index:  # neighbours only, not the match itself
+                    genes_to_write.append(all_genes[k])
+
+    with open(output_fasta, "w") as out_file:
+        for item in genes_to_write:
+            out_file.write(f">{item['gene_name']}\n{item['protein']}\n")

From 0f8cabf7b07ec563fb610fbd99d55ad9f5d42315 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=94=D0=B0=D1=88=D0=B0=20=D0=91=D0=BE=D0=B1=D1=80=D0=BE?=
 =?UTF-8?q?=D0=B2=D0=B0?= <bobrova2116@gmail.com>
Date: Sun, 12 Oct 2025 22:48:53 +0300
Subject: [PATCH 3/5] HW5: add read_tools for FASTA

---
 tools/read_tools.py | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100644 tools/read_tools.py

diff --git a/tools/read_tools.py b/tools/read_tools.py
new file mode 100644
index 0000000..6e74539
--- /dev/null
+++ b/tools/read_tools.py
@@ -0,0 +1,29 @@
+def read_fastq(input_fastq: str):
+    """
+    Read a FASTQ file lazily, one four-line record at a time.
+
+    Yields: tuple(str, str, str): (sequence_name, sequence, quality)
+    """
+    with open(input_fastq, "r") as f:
+        for raw in iter(f.readline, ""):  # readline() returns "" only at EOF
+            name = raw.rstrip()
+            if not name:
+                continue  # blank line between records, not the end of file
+            seq = f.readline().rstrip()
+            f.readline()  # plus line
+            qual = f.readline().rstrip()
+            yield name[1:], seq, qual  # remove '@' from name
+
+def write_fastq(sequence_data: tuple, output_fastq: str, overwrite=False):
+    """
+    Append one FASTQ record to a file, or start the file over with it.
+
+    Args:
+    sequence_data: tuple(str, str, str) — (name, sequence, quality)
+    output_fastq: str — path to the output file
+    overwrite: bool — if True, truncate the file first; else append
+    """
+    name, seq, qual = sequence_data
+    record = f"@{name}\n{seq}\n+\n{qual}\n"
+    with open(output_fastq, "w" if overwrite else "a") as f:
+        f.write(record)

From ea45dab08effbeff6dad4bc9202c641985fe9a45 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=94=D0=B0=D1=88=D0=B0=20=D0=91=D0=BE=D0=B1=D1=80=D0=BE?=
 =?UTF-8?q?=D0=B2=D0=B0?= <bobrova2116@gmail.com>
Date: Sun, 12 Oct 2025 23:09:20 +0300
Subject: [PATCH 4/5] HW5: update filter_fastq

---
 BioToolkit.py | 77 ++++++++++++++++++++++++++++++---------------------
 1 file changed, 46 insertions(+), 31 deletions(-)

diff --git a/BioToolkit.py b/BioToolkit.py
index 3476498..a3cb5b3 100644
--- a/BioToolkit.py
+++ b/BioToolkit.py
@@ -1,15 +1,22 @@
 from tools.utils import gc_content, mean_quality
 
 from tools.rna_dna_tools import (
-    is_dna, is_rna, transcribe, reverse_transcribe,
-    reverse, complement, reverse_complement
+    is_dna,
+    is_rna,
+    transcribe,
+    reverse_transcribe,
+    reverse,
+    complement,
+    reverse_complement,
 )
+from tools.read_tools import read_fastq, write_fastq
+
 
 def run_dna_rna_tools(*args):
     """
     The main function.
 
-    Args: The last argument is the procedure name (for example, 'transcribe').
+    Args: The last argument is the procedure name.
 
      Returns: All previous arguments are sequences.
     """
@@ -20,8 +27,10 @@ def run_dna_rna_tools(*args):
     results = []
 
     for seq in sequences:
-        if procedure == "is_nucleic_acid":
-            results.append(is_nucleic_acid(seq))
+        if procedure == "is_rna":
+            results.append(is_rna(seq))
+        elif procedure == "is_dna":
+            results.append(is_dna(seq))
         elif procedure == "transcribe":
             results.append(transcribe(seq))
         elif procedure == "reverse_transcribe":
@@ -33,45 +42,51 @@ def run_dna_rna_tools(*args):
         elif procedure == "reverse_complement":
             results.append(reverse_complement(seq))
         else:
-            raise ValueError(f"There is no such procedure in the function: {procedure}")
+            raise ValueError(f"There is no such {procedure} in the function")
 
     return results[0] if len(results) == 1 else results
 
-def filter_fastq(seqs: dict, 
-                 gc_bounds=(0, 100), 
-                 length_bounds=(0, 2**32), 
-                 quality_threshold=0) -> dict:
-    """
-    A function for filtering FASTQ sequences.
-    
-    Args: seqs (dict): Dictionary with sequences and their quality.
-    gc_bounds: GC content range (in percentage).
-    length_bounds: Range of read lengths.
-    quality_threshold: Minimum average quality value.
-
-    Returns: dict: Only sequences that passed two filters.
+
+def filter_fastq(
+    input_fastq: str,
+    output_fastq: str,
+    gc_bounds=(0, 100),
+    length_bounds=(0, 2**32),
+    quality_threshold=0,
+    overwrite: bool = False,
+) -> None:
     """
+    Filter sequences from a FASTQ file and write those that pass the
+    GC content, length and quality thresholds to an output FASTQ file.
 
-    filtered = {}
+    Args:
+        input_fastq (str): path to the input file.
+        output_fastq (str): path to save the filtered sequences.
+        gc_bounds (tuple or float): GC range; a number is the upper bound.
+        length_bounds (tuple or int): length range; a number is the upper bound.
+        quality_threshold (int): min average quality.
+        overwrite (bool): if True, truncate output_fastq; otherwise append.
+
+    Returns: None
+    """
 
     if isinstance(gc_bounds, (int, float)):
         gc_min, gc_max = 0, float(gc_bounds)
     else:
         gc_min, gc_max = gc_bounds
-        
+
     if isinstance(length_bounds, (int, float)):
         len_min, len_max = 0, int(length_bounds)
     else:
         len_min, len_max = length_bounds
 
-    for name, (seq, qual) in seqs.items():
-        seq_gc = gc_content(seq)
-        seq_len = len(seq)
-        seq_qual = mean_quality(qual)
-
-        if (gc_min <= seq_gc <= gc_max and
-            len_min <= seq_len <= len_max and
-            seq_qual >= quality_threshold):
-            filtered[name] = (seq, qual)
+    mode = "w" if overwrite else "a"
 
-    return filtered
+    with open(output_fastq, mode) as out:
+        for name, seq, qual in read_fastq(input_fastq):
+            if (
+                gc_min <= gc_content(seq) <= gc_max
+                and len_min <= len(seq) <= len_max
+                and mean_quality(qual) >= quality_threshold
+            ):
+                out.write(f"@{name}\n{seq}\n+\n{qual}\n")

From 243ca3843b680c0da61a50d4d84ef6cb28267ecf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=94=D0=B0=D1=88=D0=B0=20=D0=91=D0=BE=D0=B1=D1=80=D0=BE?=
 =?UTF-8?q?=D0=B2=D0=B0?= <bobrova2116@gmail.com>
Date: Sun, 12 Oct 2025 23:20:19 +0300
Subject: [PATCH 5/5] HW5: update information

---
 README.md | 53 ++++++++++++++++++++++++++++++-----------------------
 1 file changed, 30 insertions(+), 23 deletions(-)

diff --git a/README.md b/README.md
index ff13af4..e6c28c8 100644
--- a/README.md
+++ b/README.md
@@ -1,23 +1,30 @@
-# BioToolkit
-
-This is small munbers of bioinformatics toolkit for work with DNA and RNA sequences.
-
-## What it can do
-
-- DNA tools: check sequence is DNA, transcribe DNA to RNA, reverse, complement, reverse complement.
-- RNA tools: check sequence is RNA, reverse transcribe RNA to DNA, reverse, complement, reverse complement.
-- Filter FASTQ sequences by GC content, length, or quality.
-
-## How to use it
-
-Clone repo:
-
-```bash
-git clone https://github.com/YOUR_USERNAME/BioToolkit.git
-cd BioToolkit
-
-## or
-
-from BioToolkit import run_dna_rna_tools
-from BioToolkit import filter_fastq
-
+# BioToolkit
+
+This is a small bioinformatics toolkit for working with DNA and RNA sequences by dadaist2001.
+
+## What it can do
+
+- DNA tools: check if a sequence is DNA, transcribe DNA to RNA, reverse, complement, reverse complement.
+- RNA tools: check if a sequence is RNA, reverse transcribe RNA to DNA, reverse, complement, reverse complement.
+- Filter FASTQ sequences by GC content, length, or quality.
+- Bioinformatics file utilities (HW5):
+  - `convert_multiline_fasta_to_oneline(input_fasta, output_fasta=None)` – converts multi-line FASTA sequences to single-line format.
+  - `parse_blast_output(input_file, output_file)` – extracts top hits from BLAST reports and saves them sorted.
+  - `select_genes_from_gbk_to_fasta` – extracts neighboring genes from GenBank files into FASTA format (monster-function).
+
+## How to use it
+
+Clone the repository:
+
+```bash
+git clone https://github.com/YOUR_USERNAME/BioToolkit.git
+cd BioToolkit
+```
+
+## For Python
+
+    from BioToolkit import run_dna_rna_tools
+    from BioToolkit import filter_fastq
+    from bio_files_processor import convert_multiline_fasta_to_oneline
+    from bio_files_processor import parse_blast_output
+    from bio_files_processor import select_genes_from_gbk_to_fasta
\ No newline at end of file