-
Notifications
You must be signed in to change notification settings - Fork 0
Hw14 #5
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Hw14 #5
Changes from all commits
0496a2a
5f3ef9f
e022375
fc02f82
e44c87b
92e6910
b8cd252
abc3bd6
3b1af5c
92774b2
ca3388d
0fca5dc
8a853c6
3eb47f6
8886a58
c08ab84
d962c09
597f3c0
3b0861e
ca88ffc
840cbb7
75d7629
29ce70b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -0,0 +1,121 @@ | ||||||
| from Bio import SeqIO | ||||||
| from Bio.SeqUtils import GC | ||||||
|
|
||||||
| def filter_fastq(input_path: str, quality_threshold: int, output_filename="final_filtered.fastq", gc_bounds=(40, 60), length_bounds=(50, 350)): | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Тут не хватает докстринги (тем более же что по идее увас они были в прошлом семестре |
||||||
| filename = input_path | ||||||
| records = SeqIO.parse(filename, "fastq") | ||||||
| ###quality filter | ||||||
| good_reads = (rec for rec in records if min(rec.letter_annotations["phred_quality"]) >= quality_threshold) | ||||||
| result_quality = SeqIO.write(good_reads, "good_quality.fastq", "fastq") | ||||||
| result_quality_GC = SeqIO.parse("good_quality.fastq", "fastq") | ||||||
| ###GC content filter | ||||||
| min_gc_content = gc_bounds[0] | ||||||
| max_gc_content = gc_bounds[1] | ||||||
| GC_quality_filt = [] | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Переменные только с маленькими буквами! |
||||||
|
|
||||||
| for sequence in result_quality_GC: | ||||||
| if min_gc_content <= GC(sequence.seq) <= max_gc_content: | ||||||
| GC_quality_filt.append(sequence) | ||||||
|
|
||||||
| result_quality = SeqIO.write(GC_quality_filt, "good_quality_GC.fastq", "fastq") | ||||||
| result_quality_GC_length = SeqIO.parse("good_quality_GC.fastq", "fastq") | ||||||
|
Comment on lines
+20
to
+21
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. С точки зрения использования функционала биопитона - все супер. Но как код организован - не очень хорошо к сожалению. У тебя получается как-то разбросаны проверки, при чем после каждой проверке ты создаешь новый файл на компьютере куда записываешь сиквенсы. Ну и между этими промежутками чтения и записи все файлы ты держишь в памяти. В этом плане все можно было бы организовать гораздо проще и еще и лучше. Сперва мы открываем файл на запись. После этого мы в цикле for читаем записи из входного файла. На каждой итерации цикла мы держим в памяти одну запись, мы проверяем ее всеми нужными проверками и если все ок - то записыаем в файл. При этом никаких промежуточных файлов в большом количестве не создается. Псведокод: with open(output) as outf:
for seq in SeqIO.parse(input):
if ok and ok and ok:
SeqIO.write(seq, outf) |
||||||
|
|
||||||
| ##length filter | ||||||
| filtered_GC_quality_length = [] | ||||||
|
|
||||||
| for sequence in result_quality_GC_length: | ||||||
| if len(sequence.seq) >= length_bounds[0] and len(sequence.seq) <= length_bounds[1]: | ||||||
| filtered_GC_quality_length.append(sequence) | ||||||
|
|
||||||
| result_quality = SeqIO.write(filtered_GC_quality_length, output_filename, "fastq") | ||||||
|
|
||||||
| print(result_quality) | ||||||
|
|
||||||
| #filter_fastq("example_fastq.fastq", 15) | ||||||
|
|
||||||
|
|
||||||
| from abc import ABC, abstractmethod | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Импорты должны быть в начале скрипта |
||||||
|
|
||||||
| class InvalidInputError(ValueError): | ||||||
| pass | ||||||
|
|
||||||
| class BiologicalSequence(ABC, str): | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Тут у тебя получается не только фильтратор:) |
||||||
| @abstractmethod | ||||||
| def __init__(self, seq): | ||||||
| self.seq = seq | ||||||
|
|
||||||
| def __len__(self): | ||||||
| return len(self.seq) | ||||||
|
|
||||||
| def __getitem__(self, index): | ||||||
| return self.seq[int(index)] | ||||||
|
|
||||||
| def __repr__(self): | ||||||
| return __str__(self.seq) | ||||||
|
|
||||||
| def check_nucleic_acid(self): | ||||||
| unique_chars = set(self.seq) | ||||||
| nucleotides_dna = set('ATGCatgc') | ||||||
| nucleotides_rna = set('AUGCaugc') | ||||||
| if unique_chars <= nucleotides_dna: | ||||||
| seq = 'dna' | ||||||
| elif unique_chars <= nucleotides_rna: | ||||||
| seq = 'rna' | ||||||
|
Comment on lines
+57
to
+63
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Тут эта проверка не нужна. Какая у нас последовательность, такой класс и используем для создания экземпляра |
||||||
| else: | ||||||
| raise InvalidInputError() | ||||||
| return seq_type | ||||||
|
Comment on lines
+43
to
+66
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Это все должны быть абстрактные методы абстрактного класса, то есть никакого функционала тут быть не должно. Абстрактные классы создают только шаблон структуры.
Comment on lines
+65
to
+66
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Кажется в 100% если попадем в else то остановимся на raise и return не сработает |
||||||
|
|
||||||
| class NucleicAcidSequence(BiologicalSequence): | ||||||
| def __init__(self, seq): | ||||||
| super().__init__(seq) | ||||||
| self.check_nucleic_acid() | ||||||
| self.length = len(self.seq) | ||||||
|
|
||||||
| def complement(self): | ||||||
| list_input = list(self.seq) | ||||||
| for i in range(len(self.seq)): | ||||||
|
Comment on lines
+75
to
+76
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. не надо итерироваться по индексам и потом брать [i] если можно сразу делать цикл по буквам |
||||||
| if list_input[i] in self.complement_dict: | ||||||
| list_input[i] = self.complement_dict[list_input[i]] | ||||||
|
Comment on lines
+77
to
+78
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. А что если |
||||||
| return "".join(list_input) | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. У тебя получается была ДНК, а после complement стала просто строка. Чтобы тип данных оставался как есть можно сделать:
Suggested change
Если не понимаешь что тут - можем на консультации еще обсудить |
||||||
|
|
||||||
| class DNASequence(NucleicAcidSequence): | ||||||
| complement_dict = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', 'a': 't', 't': 'a', 'g': 'c', 'c': 'g'} | ||||||
| def __init__(self, seq): | ||||||
| super().__init__(seq) | ||||||
| #self.complement() | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
|
|
||||||
| def transcribe(self): | ||||||
| list_input = list(self.seq) | ||||||
| for i in range(len(self.seq)): | ||||||
| if (list_input[i] == 'T'): | ||||||
| list_input[i] = 'U' | ||||||
| elif (list_input[i] == 't'): | ||||||
| list_input[i]='u' | ||||||
| return "".join(list_input) | ||||||
|
|
||||||
| class RNASequence(NucleicAcidSequence): | ||||||
| complement_dict = {'A': 'U', 'U': 'A', 'G': 'C', 'C': 'G', 'a': 'u', 'u': 'a', 'g': 'c', 'c': 'g'} | ||||||
| def __init__(self, seq): | ||||||
| super().__init__(seq) | ||||||
|
Comment on lines
+98
to
+99
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. в целом это можно не писать, так как по дефолту инит родителя и будет работать |
||||||
| #self.complement() | ||||||
|
|
||||||
| class AminoAcidSequence(BiologicalSequence): | ||||||
| def __init__(self, seq): | ||||||
| self.seq = seq | ||||||
|
|
||||||
| def amino_acid_frequency(self): | ||||||
| """Calculates molecular weight of a protein | ||||||
| Arguments: | ||||||
| - seq (str) 1-letter coded protein sequence | ||||||
| Return: | ||||||
| - int, molecular weight (g/mol) rounded to integer""" | ||||||
| freq_dict = {} | ||||||
| for letter in self.seq: | ||||||
| if letter in freq_dict: | ||||||
| freq_dict[letter] += 1 | ||||||
| else: | ||||||
| freq_dict[letter] = 1 | ||||||
| for letter in freq_dict: | ||||||
| freq_dict[letter] = round(freq_dict[letter] / len(self.seq) * 100, 2) | ||||||
| return freq_dict | ||||||
|
|
||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Раз файл пустой можно удалить:)