Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
112 changes: 49 additions & 63 deletions BioToolkit.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,22 @@
from tools.utils import gc_content, mean_quality

from tools.dna_tools import (
is_dna,
transcribe,
reverse,
complement,
reverse_complement
)

from tools.rna_tools import (
from tools.rna_dna_tools import (
is_dna,
is_rna,
transcribe,
reverse_transcribe,
reverse as rna_reverse,
complement as rna_complement,
reverse_complement as rna_reverse_complement
reverse,
complement,
reverse_complement,
)
from tools.read_tools import read_fastq, write_fastq


def run_dna_rna_tools(*args):
"""
The main function.

Args: The last argument is the procedure name (for example, 'transcribe').
Args: The last argument is the procedure name.

Returns: All previous arguments are sequences.
"""
Expand All @@ -30,77 +26,67 @@ def run_dna_rna_tools(*args):
*sequences, procedure = args
results = []


for seq in sequences:
if procedure == "is_nucleic_acid":
results.append(is_nucleic_acid(seq))
if procedure == "is_rna":
results.append(is_rna(seq))
elif procedure == "is_dna":
results.append(is_dna(seq))
elif procedure == "transcribe":
results.append(transcribe(seq))
elif procedure == "reverse_transcribe":
results.append(reverse_transcribe(seq))
elif procedure == "reverse":
if is_dna(seq):
results.append(reverse(seq))
elif is_rna(seq):
results.append(rna_reverse(seq))
else:
raise ValueError("Sequence should be DNA or RNA.")
results.append(reverse(seq))
elif procedure == "complement":
if is_dna(seq):
results.append(complement(seq))
elif is_rna(seq):
results.append(rna_complement(seq))
else:
raise ValueError("Sequence should be DNA or RNA.")
results.append(complement(seq))
elif procedure == "reverse_complement":
if is_dna(seq):
results.append(reverse_complement(seq))
elif is_rna(seq):
results.append(rna_reverse_complement(seq))
else:
raise ValueError("Sequence should be DNA or RNA.")
results.append(reverse_complement(seq))
else:
raise ValueError(
f"There is no such procedure in the function: {procedure}"
)
raise ValueError(f"There is no such {procedure} in the function")

return results[0] if len(results) == 1 else results

def filter_fastq(seqs: dict,
gc_bounds=(0, 100),
length_bounds=(0, 2**32),
quality_threshold=0) -> dict:

def filter_fastq(
input_fastq: str,
output_fastq: str,
gc_bounds=(0, 100),
length_bounds=(0, 2**32),
quality_threshold=0,
) -> None:
"""
A function for filtering FASTQ sequences.

Args: seqs (dict): Dictionary with sequences and their quality.
gc_bounds: GC content range (in percentage).
length_bounds: Range of read lengths.
quality_threshold: Minimum average quality value.

Returns: dict: Only sequences that passed two filters.
Function for Filtering sequences from a FASTQ file
and writes only those passing
GC content, length, and quality thresholds to an output FASTQ file.

Args:
input_fastq (str): path to the input file.
output_fastq (str): path to save the filtered sequences.
gc_bounds (tuple or float): GC content range.
length_bounds (tuple or int): length range.
quality_threshold (int): min average quality.

Returns:
None
"""

filtered = {}

if type(gc_bounds) in (int, float):
if isinstance(gc_bounds, (int, float)):
gc_min, gc_max = 0, float(gc_bounds)
else:
gc_min, gc_max = gc_bounds

if type(length_bounds) in (int, float):
if isinstance(length_bounds, (int, float)):
len_min, len_max = 0, int(length_bounds)
else:
len_min, len_max = length_bounds

for name, (seq, qual) in seqs.items():
seq_gc = gc_content(seq)
seq_len = len(seq)
seq_qual = mean_quality(qual)

if (gc_min <= seq_gc <= gc_max and
len_min <= seq_len <= len_max and
seq_qual >= quality_threshold):
filtered[name] = (seq, qual)
mode = "w" if overwrite else "a"

return filtered
with open(output_fastq, mode) as out:

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Она принимает из главной фунции сиквенсы на запись и output_fastq, и сохраняет отфильтрованные данные в папку filtered (ее может и не быть) по имени output_fastq.

Не реализована часть задания. Совсем для красоты было бы еще проверить, output_fastq уже .fastq или нет, но это так.

for name, seq, qual in read_fastq(input_fastq):
if (
gc_min <= gc_content(seq) <= gc_max
and len_min <= len(seq) <= len_max
and mean_quality(qual) >= quality_threshold
):
out.write(f"@{name}\n{seq}\n+\n{qual}\n")
53 changes: 30 additions & 23 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,23 +1,30 @@
# BioToolkit

This is small munbers of bioinformatics toolkit for work with DNA and RNA sequences.

## What it can do

- DNA tools: check sequence is DNA, transcribe DNA to RNA, reverse, complement, reverse complement.
- RNA tools: check sequence is RNA, reverse transcribe RNA to DNA, reverse, complement, reverse complement.
- Filter FASTQ sequences by GC content, length, or quality.

## How to use it

Clone repo:

```bash
git clone https://github.com/YOUR_USERNAME/BioToolkit.git
cd BioToolkit

## or

from BioToolkit import run_dna_rna_tools
from BioToolkit import filter_fastq

# BioToolkit

This is a small bioinformatics toolkit for working with DNA and RNA sequences be dadaist2001.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
This is a small bioinformatics toolkit for working with DNA and RNA sequences be dadaist2001.
This is a small bioinformatics toolkit for working with DNA and RNA sequences by dadaist2001.


## What it can do

- DNA tools: check if a sequence is DNA, transcribe DNA to RNA, reverse, complement, reverse complement.
- RNA tools: check if a sequence is RNA, reverse transcribe RNA to DNA, reverse, complement, reverse complement.
- Filter FASTQ sequences by GC content, length, or quality..
- Bioinformatics file utilities (HW5):
- `convert_multiline_fasta_to_oneline(input_fasta, output_fasta=None)` – converts multi-line FASTA sequences to single-line format.
- `parse_blast_output(input_file, output_file)` – extracts top hits from BLAST reports and saves them sorted.
- `select_genes_from_gbk_to_fasta` – extracts neighboring genes from GenBank files into FASTA format (monster-function).

## How to use it

Clone the repository:

```bash
git clone https://github.com/YOUR_USERNAME/BioToolkit.git
cd BioToolkit
```

## For Python

from BioToolkit import run_dna_rna_tools
from BioToolkit import filter_fastq
from BioToolkit import convert_multiline_fasta_to_oneline
from BioToolkit import parse_blast_output
from BioToolkit import select_genes_from_gbk_to_fasta
123 changes: 123 additions & 0 deletions bio_files_processor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
def convert_multiline_fasta_to_oneline(input_fasta, output_fasta=None):
"""
A function that converts a FASTA file where sequences
are split across multiple lines
into a format where each sequence is on a single line.

Args:
input_fasta (str): path to the input file.
output_fasta (str, optional): path for the converted file.

Returns:
str: path to the converted FASTA file.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Неправда, не возвращает такое.

"""

if output_fasta is None:
output_fasta = "converted_" + input_fasta.split("/")[-1]
Comment on lines +15 to +16

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Да, хороший способ справиться. Единственное что на свой вкус я бы подписала как "oneline" или еще с каким-то более явным указанием на то, что конвертед то.


with open(input_fasta, "r") as infile, open(output_fasta, "w") as outfile:

header = None
seq = ""

for line in infile:
line = line.strip()

if not line:
continue

if line.startswith(">"):
if header is not None:
outfile.write(f"{header}\n{seq}\n")

header = line
seq = ""
else:
seq += line

if header is not None:
outfile.write(f"{header}\n{seq}\n")


def parse_blast_output(input_file, output_file):

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Не хватает аннотаций типов.

"""
A function for extracting the first name for QUERY
from a BLAST report and saves all found names.

Args:
input_file (str): path to the BLAST file.
output_file (str): path to the file will be saved.

Returns:
None
"""

results = []

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Список вместо множества. Возможны дубликаты при нескольких QUERY с одинаковым топ-хитом.


with open(input_file, "r") as f:
lines = f.readlines()

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Для файлов BLAST это может быть большим объемом, не стоить загружать их целиком в память.


for i in range(len(lines)):
line = lines[i].strip()

if "Sequences producing significant alignments" in line:
if i + 1 < len(lines):
next_line = lines[i + 1].strip()

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Вот из-за этой части в ответе не ожидаемый формат, а
Scientific Common Max Total Query E Per. Acc.
Нужно искать не следующую строку, а следующую не пустую строку, которая при этом не заголовки таблиц. Тут может быть несколько подходов, например, при итерации по строкам можно найти строку, начинающуюся с Description, и взять следующую за ней.

if next_line:
results.append(next_line)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

В задании нужен только Description, а тут добавится полная строка, в которой есть и другие вещи (даже если исправить комментарий выше).

results.sort()

with open(output_file, "w") as out:
for res in results:
out.write(res + "\n")


def select_genes_from_gbk_to_fasta(
input_gbk: str,
genes: "str | list[str]",
n_before: int = 1,
n_after: int = 1,
output_fasta: str = "selected_genes.fasta",
) -> None:
"""
The function for extracting protein sequences
for genes near specified genes of interest.
"""

with open(input_gbk, "r") as f:
all_lines = f.readlines()

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Аналогично, не стоит читать файлы целиком в память. Через readline можно работать с ними на лету, читая по 1 строке за раз.


all_genes = []
gene = ""
translation = ""

for line in all_lines:
line = line.rstrip()

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

rstrip это rights strip, он не удалит отступы слева и строка никогда не будет начинаться с того, что ищется дальше.

if line.startswith("/gene="):
gene = line.split('"')[1]
elif line.startswith("/translation="):

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Перевод часто многострочный и это потеряется в таком подходе.

translation = line.split('"')[1]

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

А в таком формате потеряется всегда, потому что в строке просто не будет закрывающей кавычки.

all_genes.append({"gene_name": gene, "protein": translation})
gene = ""
translation = ""

if isinstance(genes, str):
genes_to_find = [genes]
else:
genes_to_find = genes

genes_to_write = []
index = 0
while index < len(all_genes):
g = all_genes[index]
if g["gene_name"] in genes_to_find:
start_index = max(0, index - n_before)
end_index = min(len(all_genes), index + n_after + 1)
for k in range(start_index, end_index):
if k != index:
genes_to_write.append(all_genes[k])
index += 1

with open(output_fasta, "w") as out_file:
for item in genes_to_write:
out_file.write(f">{item['gene_name']}\n{item['protein']}\n")
Loading