Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
317 changes: 317 additions & 0 deletions ELMOD1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,317 @@
from Bio import SeqIO
from Bio.SeqUtils import gc_fraction
from Bio.SeqRecord import SeqRecord
from typing import List, Dict, Union, Tuple
from abc import ABC, abstractmethod
import os
Comment on lines +1 to +6
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Секунда духоты. По PEP8 правильнее так, но это не критично совсем :)

Suggested change
from Bio import SeqIO
from Bio.SeqUtils import gc_fraction
from Bio.SeqRecord import SeqRecord
from typing import List, Dict, Union, Tuple
from abc import ABC, abstractmethod
import os
from typing import List, Dict, Union, Tuple
from abc import ABC, abstractmethod
from Bio import SeqIO
from Bio.SeqUtils import gc_fraction
from Bio.SeqRecord import SeqRecord

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Совершенно верно, ревью это то место где душнить можно и нужно
Я бы еще пустую строку поставил между встроенными и сторонними либами



class FastQFilter:
def __init__(self, input_path: str, output_filename: str, gc_bounds: Union[int, Tuple[int, int]] = (0, 100),
length_bounds: Union[int, Tuple[int, int]] = (0, 2**32), quality_threshold: int = 0):
Comment on lines +10 to +11
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Классно, что по PEP8!

"""
Initialize FastQFilter instance.

Parameters:
- input_path (str): Path to the input FastQ file.
- output_filename (str): Name for the output FastQ file.
- gc_bounds (Union[int, Tuple[int, int]]): GC content bounds for filtering.
- length_bounds (Union[int, Tuple[int, int]]): Length bounds for filtering.
- quality_threshold (int): Quality threshold for filtering.
Comment on lines +10 to +20
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Круто, что есть аннотация типов и докстринга

"""
self.input_path = input_path
self.output_filename = output_filename
self.gc_bounds = gc_bounds
self.length_bounds = length_bounds
self.quality_threshold = quality_threshold

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Было бы здорово прописать проверку вводного файла:

Suggested change
if not os.path.isfile(self.input_path):
raise FileNotFoundError(f"Input file '{self.input_path}' not found.")

def filter_fastq(self) -> None:
"""
Read, filter, and write FastQ records based on specified criteria.
"""
records = self.read_fastq()
filtered_records = self.apply_filters(records)
self.write_fastq(filtered_records)

def read_fastq(self) -> List[SeqRecord]:
"""
Read FastQ file and return a list of SeqRecord objects.
"""
records = list(SeqIO.parse(self.input_path, "fastq"))
Comment on lines +36 to +40
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

На мой взгляд, было бы лучше читать FASTQ файл по одному сиквенсу, сразу проверять и записывать последовательность в файл, если она прошла проверку. Чтобы не хранить в памяти сразу все последовательности.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Совершенно верно

return records
Comment on lines +36 to +41

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Это уже из разряда "что можно сделать, чтобы вообще конфетка была". Можно добавить блок исключений при чтении fastq файла для более корректной обработки возможных ошибок ввода-вывода

Suggested change
def read_fastq(self) -> List[SeqRecord]:
"""
Read FastQ file and return a list of SeqRecord objects.
"""
records = list(SeqIO.parse(self.input_path, "fastq"))
return records
def read_fastq(self) -> List[SeqRecord]:
"""
Read FastQ file and return a list of SeqRecord objects.
"""
try:
records = list(SeqIO.parse(self.input_path, "fastq"))
return records
except IOError as e:
print(f"Error reading FastQ file: {e}")
return []


def apply_filters(self, records: List[SeqRecord]) -> List[SeqRecord]:
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Здорово, что прописаны типы данных везде!

"""
Filter SeqRecord objects based on specified criteria.

Parameters:
- records (List[SeqRecord]): List of SeqRecord objects to filter.

Returns:
- List[SeqRecord]: Filtered list of SeqRecord objects.
"""
filtered_records = []

for record in records:
gc_content = gc_fraction(record.seq)
length = len(record.seq)
avg_quality = sum(record.letter_annotations["phred_quality"]) / len(record.letter_annotations["phred_quality"])

gc_pass = self.check_bounds(gc_content, self.gc_bounds)
length_pass = self.check_bounds(length, self.length_bounds)
quality_pass = avg_quality >= self.quality_threshold

if gc_pass and length_pass and quality_pass:
filtered_records.append(record)
Comment on lines +64 to +65

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Вместо if ___ and ___ and ___: можно всё запихнуть в if all:

Хотя это очень сильно дискутабельно, поскольку то, что предложил я выглядит монструозно и не элегантно. Просто весь остальной код вполне хорошо, и я пока не знаю, что можно отревьюить...

Suggested change
if gc_pass and length_pass and quality_pass:
filtered_records.append(record)
if all([
self.check_bounds(gc_content, self.gc_bounds),
self.check_bounds(length, self.length_bounds),
avg_quality >= self.quality_threshold
]):
filtered_records.append(record)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Отличное предложение!


return filtered_records
Comment on lines +43 to +67

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Вместо вызова self.check_bounds для каждого критерия, можно вычислить результаты сразу:

Suggested change
def apply_filters(self, records: List[SeqRecord]) -> List[SeqRecord]:
"""
Filter SeqRecord objects based on specified criteria.
Parameters:
- records (List[SeqRecord]): List of SeqRecord objects to filter.
Returns:
- List[SeqRecord]: Filtered list of SeqRecord objects.
"""
filtered_records = []
for record in records:
gc_content = gc_fraction(record.seq)
length = len(record.seq)
avg_quality = sum(record.letter_annotations["phred_quality"]) / len(record.letter_annotations["phred_quality"])
gc_pass = self.check_bounds(gc_content, self.gc_bounds)
length_pass = self.check_bounds(length, self.length_bounds)
quality_pass = avg_quality >= self.quality_threshold
if gc_pass and length_pass and quality_pass:
filtered_records.append(record)
return filtered_records
def apply_filters(self, records: List[SeqRecord]) -> List[SeqRecord]:
"""
Filter SeqRecord objects based on specified criteria.
Parameters:
- records (List[SeqRecord]): List of SeqRecord objects to filter.
Returns:
- List[SeqRecord]: Filtered list of SeqRecord objects.
"""
return [record for record in records if
self.check_bounds(gc_fraction(record.seq), self.gc_bounds)
and self.check_bounds(len(record.seq), self.length_bounds)
and sum(record.letter_annotations["phred_quality"]) / len(record.letter_annotations["phred_quality"]) >= self.quality_threshold]


def check_bounds(self, value: int, bounds: Union[int, Tuple[int, int]]) -> bool:
"""
Check if a value is within the specified bounds.

Parameters:
- value (int): Value to check.
- bounds (Union[int, Tuple[int, int]]): Bounds to check against.

Returns:
- bool: True if value is within bounds, False otherwise.
"""
if isinstance(bounds, int):
return value <= bounds
else:
return bounds[0] <= value <= bounds[1]

def write_fastq(self, records: List[SeqRecord]) -> None:
"""
Write SeqRecord objects to a new FastQ file.

Parameters:
- records (List[SeqRecord]): List of SeqRecord objects to write.
"""
output_dir = "fastq_filtrator_results"
output_path = f"{output_dir}/{self.output_filename}.fastq"

os.makedirs(output_dir, exist_ok=True)

SeqIO.write(records, output_path, "fastq")
print(f"Filtered sequences written to {output_path}")
Comment on lines +95 to +98
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Очень удобно, что для результатов создается отдельная папка!


class BiologicalSequence(ABC):

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Для абстрактного класса он слишком не абстрактный как-то... В нём много функционала в общем!

"""
Abstract base class for biological sequences.
"""

def __init__(self, sequence: str):
"""
Initialize a BiologicalSequence instance.

Parameters:
- sequence (str): The biological sequence.

The sequence is automatically converted to uppercase.
"""
self.sequence = sequence.upper()
self.validate_alphabet()

def __len__(self) -> int:
"""
Return the length of the sequence.

Returns:
- int: Length of the sequence.
"""
return len(self.sequence)

def __getitem__(self, index: int) -> str:
"""
Get the character at the specified index in the sequence.

Parameters:
- index (int): The index to retrieve.

Returns:
- str: The character at the specified index.
"""
return self.sequence[index]
Comment on lines +117 to +136
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Поскольку это абстрактный класс, думаю, что не стоит прописывать здесь какой-то смысловой код.


def __str__(self) -> str:
"""
Return the string representation of the sequence.

Returns:
- str: String representation of the sequence.
"""
return self.sequence

@abstractmethod
def is_valid_alphabet(self) -> bool:
"""
Check if the sequence contains a valid alphabet.

Returns:
- bool: True if the alphabet is valid, False otherwise.
"""
pass

def validate_alphabet(self) -> None:
"""
Validate the alphabet of the sequence.

Raises:
- ValueError: If the alphabet is not valid.
"""
if not self.is_valid_alphabet():
raise ValueError("Invalid alphabet for {}: {}".format(self.__class__.__name__, str(self)))
Comment on lines +164 to +165
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Хорошо, что вызывается ошибка, в случае если подается несуществующая последовательность


class NucleicAcidSequence(BiologicalSequence):
"""
Class representing nucleic acid sequences.
"""
def complement(self) -> 'NucleicAcidSequence':
"""
Return the complement of the sequence.

Returns:
- NucleicAcidSequence: Complement of the sequence.
"""
self.validate_alphabet()
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Так как мы делаем валидацию при инициализации, то повторять ее тут не нужно

Suggested change
self.validate_alphabet()

return self.__class__(''.join([self.complement_base(base) for base in self.sequence]))

def gc_content(self) -> float:
"""
Calculate the GC content of the sequence.

Returns:
- float: GC content of the sequence.
"""
self.validate_alphabet()
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Аналогично предыдущему комментарию

Suggested change
self.validate_alphabet()

gc_count = sum(1 for base in self.sequence if base in {'G', 'C'})
return gc_count / len(self)
Comment on lines +188 to +190

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Мне кажется, такая запись будет более понятна

Suggested change
self.validate_alphabet()
gc_count = sum(1 for base in self.sequence if base in {'G', 'C'})
return gc_count / len(self)
self.validate_alphabet()
gc_count = self.sequence.count('G') + self.sequence.count('C')
return gc_count / len(self)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Но так получается подсчет за 2 прогона. В общем получается:

  • Вариант немного эффективнее
  • Вариант немного понятнее

Какой лучше я тут сказать не могу, ахах, на ваш вкус)


class DNASequence(NucleicAcidSequence):
"""
Class representing DNA sequences.
"""
def transcribe(self) -> 'RNASequence':
"""
Transcribe the DNA sequence into RNA.

Returns:
- RNASequence: Transcribed RNA sequence.
"""
self.validate_alphabet()
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Аналогично предыдущему комментарию

Suggested change
self.validate_alphabet()

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

В 3 предыдущих функция также происходит проверка на валидность алфавита. Наверное будет лучше 1 раз проверить алфавит при инициализации экземпляра, чтобы потом уже не проверять.

Suggested change
self.validate_alphabet()

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Присоединяюсь к другим ревьюерам.

return RNASequence(''.join(['U' if base == 'T' else base for base in self.sequence]))

def complement_base(self, base: str) -> str:
"""
Return the complement of a DNA base.

Parameters:
- base (str): The DNA base.

Returns:
- str: Complement of the DNA base.
"""
if base == 'A':
return 'T'
elif base == 'T':
return 'A'
elif base == 'C':
return 'G'
elif base == 'G':
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Если реализовать это через словарь - будет более красиво

return 'C'
Comment on lines +216 to +223
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Мне кажется, что логичнее и правильнее реализовать это через словарь:

Suggested change
if base == 'A':
return 'T'
elif base == 'T':
return 'A'
elif base == 'C':
return 'G'
elif base == 'G':
return 'C'
complement_dict = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
return complement_dict.get(base)

Comment on lines +216 to +223

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Да, нам говорили скрывать комментарии других ревьюеров при составлении своего ревью! Но, признаюсь честно, это бросается в глаза... Лучше сделать через словарь.
Хотя у меня в коде наверное всё также и реализовано...

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

А что с другим ревьюером? Там вроде тоже советовали убрать это в словарь и совершенно верно, правильно что вы это написали


def is_valid_alphabet(self) -> bool:
"""
Check if the DNA sequence contains a valid alphabet.

Returns:
- bool: True if the alphabet is valid, False otherwise.
"""
return set(self.sequence) <= {'A', 'T', 'C', 'G'}

class RNASequence(NucleicAcidSequence):
"""
Class representing RNA sequences.
"""
def is_valid_alphabet(self) -> bool:
"""
Check if the RNA sequence contains a valid alphabet.

Returns:
- bool: True if the alphabet is valid, False otherwise.
"""
return set(self.sequence) <= {'A', 'U', 'C', 'G'}
Comment on lines +225 to +245
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Для того, чтобы не прописывать в каждом классе, который наследуется от NucleicAcidSequence метод is_valid_alphabet можно сделать классовый атрибут alphabet для RNASequence и для DNASequence. И метод для проверки прописать в NucleicAcidSequence:

   def is_valid_alphabet(self, seq: str) -> bool:
       """Checks the sequence to match the alphabet"""
       return set(seq).issubset(type(self).alphabet)

В данном случае в качестве alphabet возьмется алфавит в зависимости от класса.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

У маркдауновского кода добавляйте python в первой строчке после ``` - так будет подсвечиваться синтакс:)


def complement_base(self, base: str) -> str:
"""
Return the complement of an RNA base.

Parameters:
- base (str): The RNA base.

Returns:
- str: Complement of the RNA base.
"""
if base == 'A':
return 'U'
elif base == 'U':
return 'A'
elif base == 'C':
return 'G'
elif base == 'G':
return 'C'
Comment on lines +257 to +264
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Аналогичное замечание, как в классе с ДНК. Словарь будет лучше, чем циклы. Это улучшает читаемость кода, повышает его производительность. Такую форму записи легче редактировать и вносить в нее изменения.


class AminoAcidSequence(BiologicalSequence):
"""
Class representing amino acid sequences.
"""
amino_brutto = {
"A": (3, 7, 1, 2, 0),
"R": (6, 14, 4, 2, 0),
"N": (4, 8, 2, 3, 0),
"D": (4, 7, 1, 4, 0),
"V": (5, 11, 1, 2, 0),
"H": (6, 9, 3, 2, 0),
"G": (2, 5, 1, 2, 0),
"Q": (5, 10, 2, 3, 0),
"E": (5, 9, 1, 4, 0),
"I": (6, 13, 1, 2, 0),
"L": (6, 13, 1, 2, 0),
"K": (6, 14, 2, 2, 0),
"M": (5, 11, 1, 2, 1),
"P": (5, 9, 1, 2, 0),
"S": (3, 7, 1, 3, 0),
"Y": (9, 11, 1, 3, 0),
"T": (4, 9, 11, 1, 3, 0),
"W": (11, 12, 2, 2, 0),
"F": (9, 11, 1, 2, 0),
"C": (3, 7, 1, 2, 1),
}
Comment on lines +270 to +291
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Я бы этот словарь добавила в init_ и так как мы вносим в него изменения, обязательно добавляем метод super()

Suggested change
amino_brutto = {
"A": (3, 7, 1, 2, 0),
"R": (6, 14, 4, 2, 0),
"N": (4, 8, 2, 3, 0),
"D": (4, 7, 1, 4, 0),
"V": (5, 11, 1, 2, 0),
"H": (6, 9, 3, 2, 0),
"G": (2, 5, 1, 2, 0),
"Q": (5, 10, 2, 3, 0),
"E": (5, 9, 1, 4, 0),
"I": (6, 13, 1, 2, 0),
"L": (6, 13, 1, 2, 0),
"K": (6, 14, 2, 2, 0),
"M": (5, 11, 1, 2, 1),
"P": (5, 9, 1, 2, 0),
"S": (3, 7, 1, 3, 0),
"Y": (9, 11, 1, 3, 0),
"T": (4, 9, 11, 1, 3, 0),
"W": (11, 12, 2, 2, 0),
"F": (9, 11, 1, 2, 0),
"C": (3, 7, 1, 2, 1),
}
def __init__(self, sequence: str):
"""
Initialize an AminoAcidSequence instance.
"""
super().__init__(sequence)
amino_brutto = {
"A": (3, 7, 1, 2, 0),
"R": (6, 14, 4, 2, 0),
"N": (4, 8, 2, 3, 0),
"D": (4, 7, 1, 4, 0),
"V": (5, 11, 1, 2, 0),
"H": (6, 9, 3, 2, 0),
"G": (2, 5, 1, 2, 0),
"Q": (5, 10, 2, 3, 0),
"E": (5, 9, 1, 4, 0),
"I": (6, 13, 1, 2, 0),
"L": (6, 13, 1, 2, 0),
"K": (6, 14, 2, 2, 0),
"M": (5, 11, 1, 2, 1),
"P": (5, 9, 1, 2, 0),
"S": (3, 7, 1, 3, 0),
"Y": (9, 11, 1, 3, 0),
"T": (4, 9, 11, 1, 3, 0),
"W": (11, 12, 2, 2, 0),
"F": (9, 11, 1, 2, 0),
"C": (3, 7, 1, 2, 1),
}

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Я бы этот словарь добавила в init_ и так как мы вносим в него изменения, обязательно добавляем метод super()

Обрати внимание, тут маркдаун немного закофликтовал. Такие вещи надо сперва оформлять как код, а потом уже андерскоры


def is_valid_alphabet(self) -> bool:
"""
Check if the amino acid sequence contains a valid alphabet.

Returns:
- bool: True if the alphabet is valid, False otherwise.
"""
return set(self.sequence) <= {'A', 'R', 'N', 'D', 'V', 'H', 'G', 'Q', 'E', 'I', 'L', 'K', 'M',
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Так как эти буквы уже прописаны в словаре amino_brutto, можно сделать немного хитрее:

Suggested change
return set(self.sequence) <= {'A', 'R', 'N', 'D', 'V', 'H', 'G', 'Q', 'E', 'I', 'L', 'K', 'M',
return set(self) <= set(self.amino_brutto.keys())

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

При проверке на ключи можно даже не ставить keys(), так как словарь по умолчанию итерируется по ключам. Хотя не знаю как лучше

'P', 'S', 'Y', 'T', 'W', 'F', 'C'}

def brutto_count(self) -> List[Dict[str, int]]:
"""
Calculate the Brutto formula values for each amino acid in the sequence.

Returns:
- List[Dict[str, int]]: List of dictionaries containing Brutto formula values for each amino acid.
"""
self.validate_alphabet()
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Так как мы мы делаем валидацию при инициализации, нам это не нужно

Suggested change
self.validate_alphabet()

elements = ["C", "H", "N", "O", "S"]
result = []
for letter in self.sequence:
brutto_values = self.amino_brutto.get(letter, (0, 0, 0, 0, 0))
result.append(dict(zip(elements, brutto_values)))
return result
Comment on lines +303 to +316

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Вместо использования цикла for и append, можно использовать генератор списков для более компактного кода

Suggested change
def brutto_count(self) -> List[Dict[str, int]]:
"""
Calculate the Brutto formula values for each amino acid in the sequence.
Returns:
- List[Dict[str, int]]: List of dictionaries containing Brutto formula values for each amino acid.
"""
self.validate_alphabet()
elements = ["C", "H", "N", "O", "S"]
result = []
for letter in self.sequence:
brutto_values = self.amino_brutto.get(letter, (0, 0, 0, 0, 0))
result.append(dict(zip(elements, brutto_values)))
return result
def brutto_count(self) -> List[Dict[str, int]]:
"""
Calculate the Brutto formula values for each amino acid in the sequence.
Returns:
- List[Dict[str, int]]: List of dictionaries containing Brutto formula values for each amino acid.
"""
self.validate_alphabet()
elements = ["C", "H", "N", "O", "S"]
return [dict(zip(elements, self.amino_brutto.get(letter, (0, 0, 0, 0, 0)))) for letter in self.sequence]

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Хорошее предложение! Хотя в таком случае надо следить за длиной строки