From 6c194e88eabc3142fb723dfad9584abe5321342a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=94=D0=B0=D1=88=D0=B0=20=D0=91=D0=BE=D0=B1=D1=80=D0=BE?= =?UTF-8?q?=D0=B2=D0=B0?= Date: Sat, 11 Oct 2025 23:46:51 +0300 Subject: [PATCH 1/5] HW4: functions have simplified, a common utility for DNA and RNA has created --- BioToolkit.py | 51 ++++----------- tools/{dna_tools.py => dna_rna_tools.py} | 56 ++++++++++++----- tools/other_tools.py | 2 + tools/rna_tools.py | 79 ------------------------ 4 files changed, 54 insertions(+), 134 deletions(-) rename tools/{dna_tools.py => dna_rna_tools.py} (54%) delete mode 100644 tools/rna_tools.py diff --git a/BioToolkit.py b/BioToolkit.py index 4752717..3476498 100644 --- a/BioToolkit.py +++ b/BioToolkit.py @@ -1,19 +1,8 @@ from tools.utils import gc_content, mean_quality -from tools.dna_tools import ( - is_dna, - transcribe, - reverse, - complement, - reverse_complement -) - -from tools.rna_tools import ( - is_rna, - reverse_transcribe, - reverse as rna_reverse, - complement as rna_complement, - reverse_complement as rna_reverse_complement +from tools.rna_dna_tools import ( + is_dna, is_rna, transcribe, reverse_transcribe, + reverse, complement, reverse_complement ) def run_dna_rna_tools(*args): @@ -30,7 +19,6 @@ def run_dna_rna_tools(*args): *sequences, procedure = args results = [] - for seq in sequences: if procedure == "is_nucleic_acid": results.append(is_nucleic_acid(seq)) @@ -39,30 +27,13 @@ def run_dna_rna_tools(*args): elif procedure == "reverse_transcribe": results.append(reverse_transcribe(seq)) elif procedure == "reverse": - if is_dna(seq): - results.append(reverse(seq)) - elif is_rna(seq): - results.append(rna_reverse(seq)) - else: - raise ValueError("Sequence should be DNA or RNA.") + results.append(reverse(seq)) elif procedure == "complement": - if is_dna(seq): - results.append(complement(seq)) - elif is_rna(seq): - results.append(rna_complement(seq)) - else: - raise ValueError("Sequence should be DNA or RNA.") + 
results.append(complement(seq)) elif procedure == "reverse_complement": - if is_dna(seq): - results.append(reverse_complement(seq)) - elif is_rna(seq): - results.append(rna_reverse_complement(seq)) - else: - raise ValueError("Sequence should be DNA or RNA.") + results.append(reverse_complement(seq)) else: - raise ValueError( - f"There is no such procedure in the function: {procedure}" - ) + raise ValueError(f"There is no such procedure in the function: {procedure}") return results[0] if len(results) == 1 else results @@ -82,13 +53,13 @@ def filter_fastq(seqs: dict, """ filtered = {} - - if type(gc_bounds) in (int, float): + + if isinstance(gc_bounds, (int, float)): gc_min, gc_max = 0, float(gc_bounds) else: gc_min, gc_max = gc_bounds - - if type(length_bounds) in (int, float): + + if isinstance(length_bounds, (int, float)): len_min, len_max = 0, int(length_bounds) else: len_min, len_max = length_bounds diff --git a/tools/dna_tools.py b/tools/dna_rna_tools.py similarity index 54% rename from tools/dna_tools.py rename to tools/dna_rna_tools.py index 566f06d..e6e06a3 100644 --- a/tools/dna_tools.py +++ b/tools/dna_rna_tools.py @@ -10,6 +10,18 @@ def is_dna(seq: str) -> bool: return all(base in "aAtTgGcC" for base in seq) +def is_rna(seq: str) -> bool: + """ + A function for checking whether a sequence is a RNA sequence + + Args: seq (str): Input string to check. + + Returns: bool: True if the sequence contains only A, U, G, C. + + """ + return all(base in "aAuUgGcC" for base in seq) + + def transcribe(seq: str) -> str: """ A function that transcribes DNA into RNA. @@ -21,12 +33,29 @@ def transcribe(seq: str) -> str: """ if not is_dna(seq): raise ValueError( - "This sequence cannot be transcribed because it is not DNA. " + "This sequence cannot be transcribed because it is not DNA." "For retroviruses, use reverse_transcribe." 
) return seq.replace("T", "U").replace("t", "u") +def reverse_transcribe(seq: str) -> str: + """ + A function that reverse-transcribes RNA into DNA. + + Args: seq (str): Input string to be reverse transcribed. + + Returns: str: DNA sequence where U are replaced by T. + + """ + if not is_rna(seq): + raise ValueError( + "This sequence cannot be transcribed because it is not RNA." + "For DNA use transcribe" + ) + return seq.replace("U", "T").replace("u", "t") + + def reverse(seq: str) -> str: """ A function that reverse sequence. @@ -48,21 +77,21 @@ def complement(seq: str) -> str: Returns: str: Complementary sequence. """ - dna = { - "A": "T", - "T": "A", - "G": "C", - "C": "G", - "a": "t", - "t": "a", - "g": "c", - "c": "g", + dna_table = { + "A": "T", "T": "A", "G": "C", "C": "G", + "a": "t", "t": "a", "g": "c", "c": "g", + } + rna_table = { + "A": "U", "U": "A", "G": "C", "C": "G", + "a": "u", "u": "a", "g": "c", "c": "g", } if is_dna(seq): - return "".join(dna[base] for base in seq) + return "".join(dna_table[base] for base in seq) + elif is_rna(seq): + return "".join(rna_table[base] for base in seq) else: - raise ValueError("Sequence should be DNA.") + raise ValueError("Sequence should be DNA or RNA.") def reverse_complement(seq: str) -> str: @@ -74,7 +103,4 @@ def reverse_complement(seq: str) -> str: Returns: str: Complement sequence written in reverse order. """ - if not is_dna(seq): - raise ValueError("Sequence should be DNA.") - return reverse(complement(seq)) diff --git a/tools/other_tools.py b/tools/other_tools.py index 07166c3..d41342c 100644 --- a/tools/other_tools.py +++ b/tools/other_tools.py @@ -1,3 +1,5 @@ +from tools.rna_dna_tools import is_dna, is_rna + def is_nucleic_acid(seq: str) -> bool: """ A function that checks whether DNA and RNA have mixed. 
diff --git a/tools/rna_tools.py b/tools/rna_tools.py deleted file mode 100644 index 926bf5c..0000000 --- a/tools/rna_tools.py +++ /dev/null @@ -1,79 +0,0 @@ -def is_rna(seq: str) -> bool: - """ - A function for checking whether a sequence is a RNA sequence - - Args: seq (str): Input string to check. - - Returns: bool: True if the sequence contains only A, U, G, C. - - """ - return all(base in "aAuUgGcC" for base in seq) - -def reverse_transcribe(seq: str) -> str: - """ - A function that reverse-transcribes RNA into DNA. - - Args: seq (str): Input string to be reverse transcribed. - - Returns: str: DNA sequence where U are replaced by T. - - """ - if not is_rna(seq): - raise ValueError( - "This sequence cannot be transcribed because it is not RNA." - "For DNA use transcribe" - ) - return seq.replace("U", "T").replace("u", "t") - - -def reverse(seq: str) -> str: - """ - A function that reverse sequence. - - Args: seq (str): Input string to be reversed. - - Returns: str: Sequence written in reverse order. - - """ - return seq[::-1] - - -def complement(seq: str) -> str: - """ - Return the complementary RNA sequence. - - Args: seq (str): RNA sequence. - - Returns: str: Complementary sequence. - - """ - rna = { - "A": "U", - "U": "A", - "G": "C", - "C": "G", - "a": "u", - "u": "a", - "g": "c", - "c": "g", - } - - if is_rna(seq): - return "".join(rna[base] for base in seq) - else: - raise ValueError("Sequence should be RNA.") - - -def reverse_complement(seq: str) -> str: - """ - A function that reverse complement. - - Args: seq (str): Input string to be reversed. - - Returns: str: Complement sequence written in reverse order. 
- - """ - if not is_rna(seq): - raise ValueError("Sequence should b DNA or RNA.") - - return reverse(complement(seq)) From 2b7aa283c056dac69826d6f17003125134341789 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=94=D0=B0=D1=88=D0=B0=20=D0=91=D0=BE=D0=B1=D1=80=D0=BE?= =?UTF-8?q?=D0=B2=D0=B0?= Date: Sun, 12 Oct 2025 22:40:11 +0300 Subject: [PATCH 2/5] HW5: add bio_files_processor --- bio_files_processor.py | 123 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 123 insertions(+) create mode 100644 bio_files_processor.py diff --git a/bio_files_processor.py b/bio_files_processor.py new file mode 100644 index 0000000..d6d5965 --- /dev/null +++ b/bio_files_processor.py @@ -0,0 +1,123 @@ +def convert_multiline_fasta_to_oneline(input_fasta, output_fasta=None): + """ + A function that converts a FASTA file where sequences + are split across multiple lines + into a format where each sequence is on a single line. + + Args: + input_fasta (str): path to the input file. + output_fasta (str, optional): path for the converted file. + + Returns: + str: path to the converted FASTA file. + """ + + if output_fasta is None: + output_fasta = "converted_" + input_fasta.split("/")[-1] + + with open(input_fasta, "r") as infile, open(output_fasta, "w") as outfile: + + header = None + seq = "" + + for line in infile: + line = line.strip() + + if not line: + continue + + if line.startswith(">"): + if header is not None: + outfile.write(f"{header}\n{seq}\n") + + header = line + seq = "" + else: + seq += line + + if header is not None: + outfile.write(f"{header}\n{seq}\n") + + +def parse_blast_output(input_file, output_file): + """ + A function for extracting the first name for QUERY + from a BLAST report and saves all found names. + + Args: + input_file (str): path to the BLAST file. + output_file (str): path to the file will be saved. 
+ + Returns: + None + """ + + results = [] + + with open(input_file, "r") as f: + lines = f.readlines() + + for i in range(len(lines)): + line = lines[i].strip() + + if "Sequences producing significant alignments" in line: + if i + 1 < len(lines): + next_line = lines[i + 1].strip() + if next_line: + results.append(next_line) + results.sort() + + with open(output_file, "w") as out: + for res in results: + out.write(res + "\n") + + +def select_genes_from_gbk_to_fasta( + input_gbk: str, + genes: "str | list[str]", + n_before: int = 1, + n_after: int = 1, + output_fasta: str = "selected_genes.fasta", +) -> None: + """ + The function for extracting protein sequences + for genes near specified genes of interest. + """ + + with open(input_gbk, "r") as f: + all_lines = f.readlines() + + all_genes = [] + gene = "" + translation = "" + + for line in all_lines: + line = line.rstrip() + if line.startswith("/gene="): + gene = line.split('"')[1] + elif line.startswith("/translation="): + translation = line.split('"')[1] + all_genes.append({"gene_name": gene, "protein": translation}) + gene = "" + translation = "" + + if isinstance(genes, str): + genes_to_find = [genes] + else: + genes_to_find = genes + + genes_to_write = [] + index = 0 + while index < len(all_genes): + g = all_genes[index] + if g["gene_name"] in genes_to_find: + start_index = max(0, index - n_before) + end_index = min(len(all_genes), index + n_after + 1) + for k in range(start_index, end_index): + if k != index: + genes_to_write.append(all_genes[k]) + index += 1 + + with open(output_fasta, "w") as out_file: + for item in genes_to_write: + out_file.write(f">{item['gene_name']}\n{item['protein']}\n") From 0f8cabf7b07ec563fb610fbd99d55ad9f5d42315 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=94=D0=B0=D1=88=D0=B0=20=D0=91=D0=BE=D0=B1=D1=80=D0=BE?= =?UTF-8?q?=D0=B2=D0=B0?= Date: Sun, 12 Oct 2025 22:48:53 +0300 Subject: [PATCH 3/5] HW5: add read_tools for FASTA --- tools/read_tools.py | 29 +++++++++++++++++++++++++++++ 
1 file changed, 29 insertions(+) create mode 100644 tools/read_tools.py diff --git a/tools/read_tools.py b/tools/read_tools.py new file mode 100644 index 0000000..6e74539 --- /dev/null +++ b/tools/read_tools.py @@ -0,0 +1,29 @@ +def read_fastq(input_fastq: str): + """ + A function that reads a FASTQ file sequence by sequence. + + Yields: tuple(str, str, str): (sequence_name, sequence, quality) + """ + with open(input_fastq, "r") as f: + while True: + name = f.readline().rstrip() + if not name: + break + seq = f.readline().rstrip() + f.readline() # plus line + qual = f.readline().rstrip() + yield name[1:], seq, qual # remove '@' from name + +def write_fastq(sequence_data: tuple, output_fastq: str, overwrite=False): + """ + A function that writes a single sequence to the output FASTQ file. + + Args: + sequence_data: tuple(str, str, str) — (name, sequence, quality) + output_fastq: str — path to the output file + overwrite: bool — if True, clears the file before writing + """ + mode = "w" if overwrite else "a" + name, seq, qual = sequence_data + with open(output_fastq, mode) as f: + f.write(f"@{name}\n{seq}\n+\n{qual}\n") From ea45dab08effbeff6dad4bc9202c641985fe9a45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=94=D0=B0=D1=88=D0=B0=20=D0=91=D0=BE=D0=B1=D1=80=D0=BE?= =?UTF-8?q?=D0=B2=D0=B0?= Date: Sun, 12 Oct 2025 23:09:20 +0300 Subject: [PATCH 4/5] HW5: update filter_fastq --- BioToolkit.py | 77 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 46 insertions(+), 31 deletions(-) diff --git a/BioToolkit.py b/BioToolkit.py index 3476498..a3cb5b3 100644 --- a/BioToolkit.py +++ b/BioToolkit.py @@ -1,15 +1,22 @@ from tools.utils import gc_content, mean_quality from tools.rna_dna_tools import ( - is_dna, is_rna, transcribe, reverse_transcribe, - reverse, complement, reverse_complement + is_dna, + is_rna, + transcribe, + reverse_transcribe, + reverse, + complement, + reverse_complement, ) +from tools.read_tools import read_fastq, write_fastq + def 
run_dna_rna_tools(*args): """ The main function. - Args: The last argument is the procedure name (for example, 'transcribe'). + Args: The last argument is the procedure name. Returns: All previous arguments are sequences. """ @@ -20,8 +27,10 @@ def run_dna_rna_tools(*args): results = [] for seq in sequences: - if procedure == "is_nucleic_acid": - results.append(is_nucleic_acid(seq)) + if procedure == "is_rna": + results.append(is_rna(seq)) + elif procedure == "is_dna": + results.append(is_dna(seq)) elif procedure == "transcribe": results.append(transcribe(seq)) elif procedure == "reverse_transcribe": @@ -33,45 +42,51 @@ def run_dna_rna_tools(*args): elif procedure == "reverse_complement": results.append(reverse_complement(seq)) else: - raise ValueError(f"There is no such procedure in the function: {procedure}") + raise ValueError(f"There is no such {procedure} in the function") return results[0] if len(results) == 1 else results -def filter_fastq(seqs: dict, - gc_bounds=(0, 100), - length_bounds=(0, 2**32), - quality_threshold=0) -> dict: - """ - A function for filtering FASTQ sequences. - - Args: seqs (dict): Dictionary with sequences and their quality. - gc_bounds: GC content range (in percentage). - length_bounds: Range of read lengths. - quality_threshold: Minimum average quality value. - - Returns: dict: Only sequences that passed two filters. + +def filter_fastq( + input_fastq: str, + output_fastq: str, + gc_bounds=(0, 100), + length_bounds=(0, 2**32), + quality_threshold=0, +) -> None: """ + Function for Filtering sequences from a FASTQ file + and writes only those passing + GC content, length, and quality thresholds to an output FASTQ file. - filtered = {} + Args: + input_fastq (str): path to the input file. + output_fastq (str): path to save the filtered sequences. + gc_bounds (tuple or float): GC content range. + length_bounds (tuple or int): length range. + quality_threshold (int): min average quality. 
+ + Returns: + None + """ if isinstance(gc_bounds, (int, float)): gc_min, gc_max = 0, float(gc_bounds) else: gc_min, gc_max = gc_bounds - + if isinstance(length_bounds, (int, float)): len_min, len_max = 0, int(length_bounds) else: len_min, len_max = length_bounds - for name, (seq, qual) in seqs.items(): - seq_gc = gc_content(seq) - seq_len = len(seq) - seq_qual = mean_quality(qual) - - if (gc_min <= seq_gc <= gc_max and - len_min <= seq_len <= len_max and - seq_qual >= quality_threshold): - filtered[name] = (seq, qual) + mode = "w" if overwrite else "a" - return filtered + with open(output_fastq, mode) as out: + for name, seq, qual in read_fastq(input_fastq): + if ( + gc_min <= gc_content(seq) <= gc_max + and len_min <= len(seq) <= len_max + and mean_quality(qual) >= quality_threshold + ): + out.write(f"@{name}\n{seq}\n+\n{qual}\n") From 243ca3843b680c0da61a50d4d84ef6cb28267ecf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=94=D0=B0=D1=88=D0=B0=20=D0=91=D0=BE=D0=B1=D1=80=D0=BE?= =?UTF-8?q?=D0=B2=D0=B0?= Date: Sun, 12 Oct 2025 23:20:19 +0300 Subject: [PATCH 5/5] HW5: update information --- README.md | 53 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index ff13af4..e6c28c8 100644 --- a/README.md +++ b/README.md @@ -1,23 +1,30 @@ -# BioToolkit - -This is small munbers of bioinformatics toolkit for work with DNA and RNA sequences. - -## What it can do - -- DNA tools: check sequence is DNA, transcribe DNA to RNA, reverse, complement, reverse complement. -- RNA tools: check sequence is RNA, reverse transcribe RNA to DNA, reverse, complement, reverse complement. -- Filter FASTQ sequences by GC content, length, or quality. 
- - ## How to use it - - Clone repo: - - ```bash -git clone https://github.com/YOUR_USERNAME/BioToolkit.git -cd BioToolkit - - ## or - - from BioToolkit import run_dna_rna_tools -from BioToolkit import filter_fastq - +# BioToolkit + +This is a small bioinformatics toolkit for working with DNA and RNA sequences by dadaist2001. + +## What it can do + +- DNA tools: check if a sequence is DNA, transcribe DNA to RNA, reverse, complement, reverse complement. +- RNA tools: check if a sequence is RNA, reverse transcribe RNA to DNA, reverse, complement, reverse complement. +- Filter FASTQ sequences by GC content, length, or quality. +- Bioinformatics file utilities (HW5): + - `convert_multiline_fasta_to_oneline(input_fasta, output_fasta=None)` – converts multi-line FASTA sequences to single-line format. + - `parse_blast_output(input_file, output_file)` – extracts top hits from BLAST reports and saves them sorted. + - `select_genes_from_gbk_to_fasta` – extracts neighboring genes from GenBank files into FASTA format (monster-function). + +## How to use it + +Clone the repository: + +```bash +git clone https://github.com/YOUR_USERNAME/BioToolkit.git +cd BioToolkit +``` + +## For Python + +from BioToolkit import run_dna_rna_tools +from BioToolkit import filter_fastq +from bio_files_processor import convert_multiline_fasta_to_oneline +from bio_files_processor import parse_blast_output +from bio_files_processor import select_genes_from_gbk_to_fasta \ No newline at end of file