dadaist2001 · dadaist2001 · Oct 11, 2025 · Oct 12, 2025 · Oct 12, 2025 · Oct 12, 2025
diff --git a/BioToolkit.py b/BioToolkit.py
@@ -1,26 +1,22 @@
 from tools.utils import gc_content, mean_quality
 
-from tools.dna_tools import (
-    is_dna, 
-    transcribe, 
-    reverse, 
-    complement, 
-    reverse_complement
-)
-
-from tools.rna_tools import (
+from tools.rna_dna_tools import (
+    is_dna,
     is_rna,
+    transcribe,
     reverse_transcribe,
-    reverse as rna_reverse,
-    complement as rna_complement,
-    reverse_complement as rna_reverse_complement
+    reverse,
+    complement,
+    reverse_complement,
 )
+from tools.read_tools import read_fastq, write_fastq
+
 
 def run_dna_rna_tools(*args):
     """
     The main function.
 
-    Args: The last argument is the procedure name (for example, 'transcribe').
+    Args: The last argument is the procedure name.
 
      Returns: All previous arguments are sequences.
     """
@@ -30,77 +26,67 @@ def run_dna_rna_tools(*args):
     *sequences, procedure = args
     results = []
 
-
     for seq in sequences:
-        if procedure == "is_nucleic_acid":
-            results.append(is_nucleic_acid(seq))
+        if procedure == "is_rna":
+            results.append(is_rna(seq))
+        elif procedure == "is_dna":
+            results.append(is_dna(seq))
         elif procedure == "transcribe":
             results.append(transcribe(seq))
         elif procedure == "reverse_transcribe":
             results.append(reverse_transcribe(seq))
         elif procedure == "reverse":
-            if is_dna(seq):
-                results.append(reverse(seq))
-            elif is_rna(seq):
-                results.append(rna_reverse(seq))
-            else:
-                raise ValueError("Sequence should be DNA or RNA.")
+            results.append(reverse(seq))
         elif procedure == "complement":
-            if is_dna(seq):
-                results.append(complement(seq))
-            elif is_rna(seq):
-                results.append(rna_complement(seq))
-            else:
-                raise ValueError("Sequence should be DNA or RNA.")
+            results.append(complement(seq))
         elif procedure == "reverse_complement":
-            if is_dna(seq):
-                results.append(reverse_complement(seq))
-            elif is_rna(seq):
-                results.append(rna_reverse_complement(seq))
-            else:
-                raise ValueError("Sequence should be DNA or RNA.")
+            results.append(reverse_complement(seq))
         else:
-            raise ValueError(
-                f"There is no such procedure in the function: {procedure}"
-            )
+            raise ValueError(f"There is no such {procedure} in the function")
 
     return results[0] if len(results) == 1 else results
 
-def filter_fastq(seqs: dict, 
-                 gc_bounds=(0, 100), 
-                 length_bounds=(0, 2**32), 
-                 quality_threshold=0) -> dict:
+
+def filter_fastq(
+    input_fastq: str,
+    output_fastq: str,
+    gc_bounds=(0, 100),
+    length_bounds=(0, 2**32),
+    quality_threshold=0,
+) -> None:
     """
-    A function for filtering FASTQ sequences.
-
-    Args: seqs (dict): Dictionary with sequences and their quality.
-    gc_bounds: GC content range (in percentage).
-    length_bounds: Range of read lengths.
-    quality_threshold: Minimum average quality value.
-
-    Returns: dict: Only sequences that passed two filters.
+    Function for Filtering sequences from a FASTQ file
+    and writes only those passing
+    GC content, length, and quality thresholds to an output FASTQ file.
+
+    Args:
+        input_fastq (str): path to the input file.
+        output_fastq (str): path to save the filtered sequences.
+        gc_bounds (tuple or float): GC content range.
+        length_bounds (tuple or int): length range.
+        quality_threshold (int): min average quality.
+
+    Returns:
+        None
     """
 
-    filtered = {}
-
-    if type(gc_bounds) in (int, float):
+    if isinstance(gc_bounds, (int, float)):
         gc_min, gc_max = 0, float(gc_bounds)
     else:
         gc_min, gc_max = gc_bounds
 
-    if type(length_bounds) in (int, float):
+    if isinstance(length_bounds, (int, float)):
         len_min, len_max = 0, int(length_bounds)
     else:
         len_min, len_max = length_bounds
 
-    for name, (seq, qual) in seqs.items():
-        seq_gc = gc_content(seq)
-        seq_len = len(seq)
-        seq_qual = mean_quality(qual)
-
-        if (gc_min <= seq_gc <= gc_max and
-            len_min <= seq_len <= len_max and
-            seq_qual >= quality_threshold):
-            filtered[name] = (seq, qual)
+    mode = "w" if overwrite else "a"
 
-    return filtered
+    with open(output_fastq, mode) as out:
+        for name, seq, qual in read_fastq(input_fastq):
+            if (
+                gc_min <= gc_content(seq) <= gc_max
+                and len_min <= len(seq) <= len_max
+                and mean_quality(qual) >= quality_threshold
+            ):
+                out.write(f"@{name}\n{seq}\n+\n{qual}\n")
diff --git a/README.md b/README.md
@@ -1,23 +1,30 @@
-# BioToolkit
-
-This is small munbers of bioinformatics toolkit for work with DNA and RNA sequences.
-
-## What it can do
-
-- DNA tools: check sequence is DNA, transcribe DNA to RNA, reverse, complement, reverse complement.
-- RNA tools: check sequence is RNA, reverse transcribe RNA to DNA, reverse, complement, reverse complement.
-- Filter FASTQ sequences by GC content, length, or quality.
-
-## How to use it
-
-Clone repo:
-
-```bash
-git clone https://github.com/YOUR_USERNAME/BioToolkit.git
-cd BioToolkit
-
-## or
-
-from BioToolkit import run_dna_rna_tools
-from BioToolkit import filter_fastq
-
+# BioToolkit
+
+This is a small bioinformatics toolkit for working with DNA and RNA sequences be dadaist2001.
-This is a small bioinformatics toolkit for working with DNA and RNA sequences be dadaist2001.
+This is a small bioinformatics toolkit for working with DNA and RNA sequences by dadaist2001.
+
-This is a small bioinformatics toolkit for working with DNA and RNA sequences be dadaist2001.
+This is a small bioinformatics toolkit for working with DNA and RNA sequences by dadaist2001.
+
+
+## What it can do
+
+- DNA tools: check if a sequence is DNA, transcribe DNA to RNA, reverse, complement, reverse complement.
+- RNA tools: check if a sequence is RNA, reverse transcribe RNA to DNA, reverse, complement, reverse complement.
+- Filter FASTQ sequences by GC content, length, or quality..
+- Bioinformatics file utilities (HW5):
+  - `convert_multiline_fasta_to_oneline(input_fasta, output_fasta=None)` – converts multi-line FASTA sequences to single-line format.
+  - `parse_blast_output(input_file, output_file)` – extracts top hits from BLAST reports and saves them sorted.
+  - `select_genes_from_gbk_to_fasta` – extracts neighboring genes from GenBank files into FASTA format (monster-function).
+
+## How to use it
+
+Clone the repository:
+
+```bash
+git clone https://github.com/YOUR_USERNAME/BioToolkit.git
+cd BioToolkit
+```
+
+## For Python
+
+from BioToolkit import run_dna_rna_tools
+from BioToolkit import filter_fastq
+from BioToolkit import convert_multiline_fasta_to_oneline
+from BioToolkit import parse_blast_output
+from BioToolkit import select_genes_from_gbk_to_fasta
diff --git a/bio_files_processor.py b/bio_files_processor.py
@@ -0,0 +1,123 @@
+def convert_multiline_fasta_to_oneline(input_fasta, output_fasta=None):
+    """
+    A function that converts a FASTA file where sequences
+    are split across multiple lines
+    into a format where each sequence is on a single line.
+
+    Args:
+        input_fasta (str): path to the input file.
+        output_fasta (str, optional): path for the converted file.
+
+    Returns:
+        str: path to the converted FASTA file.
+    """
+
+    if output_fasta is None:
+        output_fasta = "converted_" + input_fasta.split("/")[-1]
+
+    with open(input_fasta, "r") as infile, open(output_fasta, "w") as outfile:
+
+        header = None
+        seq = ""
+
+        for line in infile:
+            line = line.strip()
+
+            if not line:
+                continue
+
+            if line.startswith(">"):
+                if header is not None:
+                    outfile.write(f"{header}\n{seq}\n")
+
+                header = line
+                seq = ""
+            else:
+                seq += line
+
+        if header is not None:
+            outfile.write(f"{header}\n{seq}\n")
+
+
+def parse_blast_output(input_file, output_file):
+    """
+    A function for extracting the first name for QUERY
+    from a BLAST report and saves all found names.
+
+    Args:
+        input_file (str): path to the BLAST file.
+        output_file (str): path to the file will be saved.
+
+    Returns:
+        None
+    """
+
+    results = []
+
+    with open(input_file, "r") as f:
+        lines = f.readlines()
+
+    for i in range(len(lines)):
+        line = lines[i].strip()
+
+        if "Sequences producing significant alignments" in line:
+            if i + 1 < len(lines):
+                next_line = lines[i + 1].strip()
+                if next_line:
+                    results.append(next_line)
+    results.sort()
+
+    with open(output_file, "w") as out:
+        for res in results:
+            out.write(res + "\n")
+
+
+def select_genes_from_gbk_to_fasta(
+    input_gbk: str,
+    genes: "str | list[str]",
+    n_before: int = 1,
+    n_after: int = 1,
+    output_fasta: str = "selected_genes.fasta",
+) -> None:
+    """
+    The function for extracting protein sequences
+    for genes near specified genes of interest.
+    """
+
+    with open(input_gbk, "r") as f:
+        all_lines = f.readlines()
+
+    all_genes = []
+    gene = ""
+    translation = ""
+
+    for line in all_lines:
+        line = line.rstrip()
+        if line.startswith("/gene="):
+            gene = line.split('"')[1]
+        elif line.startswith("/translation="):
+            translation = line.split('"')[1]
+            all_genes.append({"gene_name": gene, "protein": translation})
+            gene = ""
+            translation = ""
+
+    if isinstance(genes, str):
+        genes_to_find = [genes]
+    else:
+        genes_to_find = genes
+
+    genes_to_write = []
+    index = 0
+    while index < len(all_genes):
+        g = all_genes[index]
+        if g["gene_name"] in genes_to_find:
+            start_index = max(0, index - n_before)
+            end_index = min(len(all_genes), index + n_after + 1)
+            for k in range(start_index, end_index):
+                if k != index:
+                    genes_to_write.append(all_genes[k])
+        index += 1
+
+    with open(output_fasta, "w") as out_file:
+        for item in genes_to_write:
+            out_file.write(f">{item['gene_name']}\n{item['protein']}\n")