diff --git a/README.md b/README.md index 79c801a..69985e2 100644 --- a/README.md +++ b/README.md @@ -1,143 +1,31 @@ # my_first_tool -**This program consists of 3 tools:** +**This program consists of several of my homework assignments:** +## Main script +**main.py** consists of several tools in the form of +1) functions: - 'dna_rna_tools' - List of procedures: `transcribe` (return the transcribed sequence), `reverse` (return the reversed sequence), `complement` (return the complementary sequence), `reverse_complement` (return the reverse complementary sequence); -- 'protein_tools'- List of procedures: `length` (return the number of amino acids in protein sequence(s)), `percentage` (return percentage of each amino acid in sequence), `pattern` (return all non-overlaping instances of a given pattern in sequences), `3Letter_name` (return three-letter amino acids into a three-letter amino acids), `DNA_code` ( return transformed protein sequence(s) to DNA sequence(s)), `fastq_tools` (return percentage of each amino acid in sequence); -- 'fastq_tools' - Procedure: filtering of the dictionary of sequences by the length, quality of sequencing of each nucleotides and GC% content. +- 'protein_tools' - List of procedures: `length` (return the number of amino acids in protein sequence(s)), `percentage` (return the percentage of each amino acid in the sequence), `pattern` (return all non-overlapping instances of a given pattern in sequences), `3Letter_name` (convert one-letter amino acid sequences into three-letter amino acid sequences), `DNA_code` (return protein sequence(s) transformed into DNA sequence(s)); +2) a class that uses BioPython: +- 'filter_fastq' - Procedure: filtering of the dictionary of sequences by length, sequencing quality of each nucleotide, and GC% content. -## run_dna_rna_tools.py -> *description of how the run_dna_rna_tools.py program works* +Here you can also find the ability to work with an API and a Telegram bot. -This program contains the function `run_dna_rna_tools`. 
The `run_dna_rna_tools` function takes as input an arbitrary number of arguments containing DNA or RNA sequences in the form (*str*), as well as the name of the procedure to be performed, specified as the last argument. After this, the command performs the specified action on all transmitted sequences. If one sequence is supplied, a string with the result is returned. If several are submitted, a list of strings is returned. -**Use example** -```python -run_dna_rna_tools('ATG', 'transcribe') # 'AUG' -run_dna_rna_tools('ATG', 'reverse') # 'GTA' -run_dna_rna_tools('AtG', 'complement') # 'TaC' -run_dna_rna_tools('ATg', 'reverse_complement') # 'cAT' -run_dna_rna_tools('ATG', 'aT', 'reverse') # ['GTA', 'Ta'] -``` +## bio_files_processor +**bio_files_processor.py** consists of a function for converting a multi-line FASTA file to a one-line FASTA file. -## protein_tools.py -> *Discription how the protein_tools.py works:* -This program contains the function `protein_tool`. The `protein_tool` function takes as input an arbitrary number of arguments in the form of amino acid (aa)/protein sequences of type *str*, as well as the name for the procedure to be performed. After this, the function performs the specified action on all provided sequences. Carefully read the rules of usage for each option, because they specify correct ways of entering arguments, as well as the output and the type of data in the output. -### :warning: Attention: 1) The programm is register-dependent; 2) Before using some of the options read 'Procedures description' carefully. 3) If you input sequenses or 'options' incorrectly, the program will provide you with helpful error messages. +## custom_random_forest +**custom_random_forest.py** consists of a custom class for creating a CustomRandomForest which, unlike the usual RandomForest from the sklearn library, has the ability to work in parallel. -***compare*** -**Introduction** -The **compare** procedure compares the first amino acid sequence provided with the following ones. 
-***Inputs*** -To start using the length procedure, enter sevreal arguments: -- _an arbitrary number_ of sequences, where the first sequence is a reference to which the following sequences are compared; each argument should be of type 'str'. -- _second-to-last_ argument is the number of decimals to round the number to; type 'int' -- _last_ argument determines whether percentages are returned instead of fractions; type 'bool' -**Outputs** -It returns a 'dict' object where: -- *keys* are compared-to sequences (type str) -- *values* are either fractions or percentages (type float). -**Usage example** -```python -protein_tool('LAlLAlwWGPdPA', 'LAlLAl', 3, False, options = 'compare') # {'LAlLAl': 1.0} -protein_tool('LAlLAlwWGPdPA', 'LAlLAl', 'GPdPA', 3, True, options = 'compare')) # {'LAlLAl': 100.0, 'GPdPA': 20.0} -``` +## test_my_first_tool +**test_my_first_tool.py** consists of tests that check for mistakes in the code and in several processes. -***length*** -**Introduction** -The **length** procedure calculates the length of protein sequence(s) (equal to the number of amino acids). -**Inputs** -To start using the length procedure, enter one or more protein sequences for which you want to get a summary, and at the end add `options = ‘length’`. -**Outputs** -The result of the procedure is a list with the numbers of amino acids in each sequence. The list contains only numbers of amico acids in the sequence. 
-**Usage example*** -```python -protein_tool('LAlLAlwWGPdPA', options = 'length') # [13] -protein_tool('RRRrrrR', 'WGPdPA', 'LAlLAlw', options = 'length') # [7, 6, 7] -``` +## Showcases.ipynb +**Showcases.ipynb** demonstrates how the CustomRandomForest works. -***percentage*** -**Introduction** -The **percentage** procedure calculates the percentage of all 20 proteinogenic amino acid residues, case-sensitive in the protein sequences -**Input** -To start using the count_percentage procedure, enter one or more protein sequences for which you want to get a summary, and at the end add `options = ‘percentage’`. -**Outputs** -The result of the procedure is a list of dictionaries with the percentages of the corresponding amino acids in each sequence. The dictionary contains only amino acid residues whose percentage in the sequence is not equal to 0 (which are contained in the sequence at all). Also, the dictionary is ordered from the largest percentage of content to the smallest. Cases of amino acid residues are taken into account. -> :warning: Attention: We use rounding to 2 decimal places. In some cases, **the sum of percentages** of all amino acid residues for sequence **may not be exactly 100%** due to rounding. -**Usage example** -```python -protein_tool('LAlLAlwWGPdPA', options = 'percentage') # [{'A': 23.08, 'L': 15.38, 'l': 15.38, 'P': 15.38, 'w': 7.69, 'W': 7.69, 'G': 7.69, 'd': 7.69}] -protein_tool('RRRrrrR', 'WGPdPA', 'LAlLAlw', options = 'percentage') # [{'R': 57.14, 'r': 42.86}, {'P': 33.33, 'W': 16.67, 'G': 16.67, 'd': 16.67, 'A': 16.67}, {'L': 28.57, 'A': 28.57, 'l': 28.57, 'w': 14.29}] -``` -***pattern*** -**Introduction** -The **pattern** procedure finds all non-overlaping cases of a given pattern in amino acid sequence(s) provided. 
-**Inputs** -To start using the pattern procedure, enter one or more protein sequences for which you want to get a summary, where the first sequence is a pattern, which is searched for in the following sequences; each argument should be of type 'str' and at the end add `options = ‘pattern’`. -The *find_pattern()* function goes through a sequence in the following way: it takes a subsequence of amino acids in front of an index equal in length to the pattern and compares it to the pattern. If there is no match, index is moved one amino acid to the end of the sequence. If there is a match, the index is saved, and the function jumps to an aminoacid next to the end of the subsequence, then the algorithm repeats. Comparison is performed by *compare_pattern* subfunction. -The image explanation of that function. -![The image explanation of that function **pattern**](https://github.com/GlebBobkov/HW4_Bobkov/raw/HW4_Bobkov/HW4_Bobkov/explanation.jpg) -**Outputs** -The result of this procedure is a 'dict' object where: -- *keys* are amino acid sequences (type 'str') -- _values_ are lists where the first element is a number of pattern instances in a given sequence, and the following elements are indexes of these occurances -**Usage example** -```python -protein_tool('LAlLAlwWGPdPA', 'LAlLAl', 'GPdPA', options = 'pattern') # {'LAlLAl': [2, 0, 3], 'GPdPA': [0]} -protein_tool('LAlLAlwWGPdPA', 'AlLAl', options = 'pattern') # {'AlLAl': [1, 2]} -``` -***3Letter_name*** -**Introduction** -The **3Letter_name** procedure transforms one-letter amino acid entry sequences to three-letter amino acid sequences, separated by a specified separator. It is a case-sensitive procedure. -**Inputs** -To start using the rename_three_letter_name procedure, enter one or more protein sequences for which you want to get three-letter sequences. After the protein sequences put a symbol (type 'str') that will be a separator. And specify the `options = ‘3Letter_name’`. 
-**Outputs** -The result of the procedure is a list of three-letter sequences. Each amino acid is separated by the specified separator. The case of the three-letter amino acid coincides with the case of the one-letter designation at the input. -**Usage example** -```python -protein_tool('wWGPdPA', '', options = '3Letter_name') # ['trpTRPGLYPROaspPROALA'] -protein_tool('LAlLAlwWGPdPA', '-', options = '3Letter_name') # ['LEU-ALA-leu-LEU-ALA-leu-trp-TRP-GLY-PRO-asp-PRO-ALA'] -protein_tool('RRRrrrR', 'WGPdPA', 'LAlLAlw', options = 'percentage') # [{'R': 57.14, 'r': 42.86}, {'P': 33.33, 'W': 16.67, 'G': 16.67, 'd': 16.67, 'A': 16.67}, {'L': 28.57, 'A': 28.57, 'l': 28.57, 'w': 14.29}] -protein_tool('qwerty', 'G', options = '3Letter_name') # ['glnGtrpGgluGargGthrGtyr'] -``` - -***DNA_code*** -**Introduction** -The **DNA_code** procedure transforms a protein into a DNA sequence that may encode it (this can be used in genetic ingeneering). -P.S. codons chosen at the discretion of the tool authors. -**Inputs** -To start using the DNA_code procedure, enter one or more protein sequences for which you want to get a summary, and at the end add `options = ‘DNA_code’`. -**Outputs** -The result of the procedure is a list with type 'str' elements - nucleotide sequence that corresponds to the amino acid sequence. -**Usage example** -```python -protein_tool('LAlLAlwWGPdPA', options = 'DNA_code') # ['TTAGCAttaTTAGCAttatggTGGGGGCCCgcaCCCGCA'] -protein_tool('RRRrrrR', 'WGPdPA', 'LAlLAlw', options = 'DNA_code') # ['CGACGACGAcgacgacgaCGA', 'TGGGGGCCCgcaCCCGCA', 'TTAGCAttaTTAGCAttatgg'] -``` - -## fastq_tools.py -> *description of how the run_dna_rna_tools.py program works* - -This program contains the function `fastq_tool`. The `fastq_tools` function takes as input a dictionary of the fastaq data and sort it by the length, quality of sequencing of each nucleotides and GC% content. -The result of the running of the tool is sorted dictionary of the dictionary from the input. 
-'gc_bounds' - GC composition interval (in percentage) for filtering (by default results (0, 100), i.e. all reads in the direction). If you pass one number as an argument, it is considered to be the upper limit. Examples: gc_bounds = (20, 80) - save only reads with GC content from 20 to 80%, gc_bounds = 44.4 - save reads with GC content less than 44.4%. -'length_bounds' - length of the interval for filtering, still gc_bounds, but by default it is (0, 2**32). -'quality_threshold' - threshold value of average read quality for the filter, default is 0 (phred33 scale). Reads contribute to quality for all nucleotides below the threshold are discarded. - -**Use example** -```python -seqs = { - # 'name' : ('sequence', 'quality') - '@SRX079804:1:SRR292678:1:1101:21885:21885': ('ACAGCAACATAAACATGATGGGATGGCGTAAGCCCCCGAGATATCAGTTTACCCAGGATAAGAGATTAAATTATGAGCAACATTATTAA', 'FGGGFGGGFGGGFGDFGCEBB@CCDFDDFFFFBFFGFGEFDFFFF;D@DD>C@DDGGGDFGDGG?GFGFEGFGGEF@FDGGGFGFBGGD'), - '@SRX079804:1:SRR292678:1:1101:24563:24563': ('ATTAGCGAGGAGGAGTGCTGAGAAGATGTCGCCTACGCCGTTGAAATTCCCTTCAATCAGGGGGTACTGGAGGATACGAGTTTGTGTG', 'BFFFFFFFB@B@A<@D>BDDACDDDEBEDEFFFBFFFEFFDFFF=CC@DDFD8FFFFFFF8/+.2,@7<<:?B/:<><-><@.A*C>D'), - '@SRX079804:1:SRR292678:1:1101:30161:30161': ('GAACGACAGCAGCTCCTGCATAACCGCGTCCTTCTTCTTTAGCGTTGTGCAAAGCATGTTTTGTATTACGGGCATCTCGAGCGAATC', 'DFFFEGDGGGGFGGEDCCDCEFFFFCCCCCB>CEBFGFBGGG?DE=:6@=>AD?D8DCEE:>EEABE5D@5:DDCA;EEE-DCD'), - '@SRX079804:1:SRR292678:1:1101:171075:171075': ('CATTATAGTAATACGGAAGATGACTTGCTGTTATCATTACAGCTCCATCGCATGAATAATTCTCTAATATAGTTGTCAT', 'HGHHHHGFHHHHFHHEHHHHFGEHFGFGGGHHEEGHHEEHBHHFGDDECEGGGEFGFC@DDGGGDFGDGG?GFGFEGFGGEF@FDGGGFGFBGGD'), '@SRX079804:1:SRR292678:1:1101:171075:171075': ('CATTATAGTAATACGGAAGATGACTTGCTGTTATCATTACAGCTCCATCGCATGAATAATTCTCTAATATAGTTGTCAT', 'HGHHHHGFHHHHFHHEHHHHFGEHFGFGGGHHEEGHHEEHBHHFGDDECEGGGEFGF'): + if header is not None: + return FastaRecord(fasta_id=header[0], description=header[1], sequence=''.join(lines)) + else: + header = 
self.parse_header(line) + lines = [] + else: + lines.append(line) + + def parse_header(self, header_line): + parts = header_line[1:].split(maxsplit=1) + fasta_id = parts[0] + description = parts[1] if len(parts) > 1 else '' + return fasta_id, description + + def read_record(self): + return next(self) + + def read_records(self): + return list(self) diff --git a/custom_random_forest.py b/custom_random_forest.py new file mode 100644 index 0000000..94e042f --- /dev/null +++ b/custom_random_forest.py @@ -0,0 +1,52 @@ +import numpy as np +import multiprocessing as mp +import time +from functools import partial +from sklearn.base import BaseEstimator +from sklearn.tree import DecisionTreeClassifier +from sklearn.datasets import make_classification +from sklearn.model_selection import train_test_split + +class RandomForestClassifierCustom(BaseEstimator): + def __init__( + self, n_estimators=10, max_depth=None, max_features=None, random_state=None, n_jobs=-1 + ): + self.n_estimators = n_estimators + self.max_depth = max_depth + self.max_features = max_features + self.random_state = random_state + self.n_jobs = max(1, n_jobs) # Ensure n_jobs is at least 1 + + self.trees = [] + self.feat_ids_by_tree = [] + + def fit(self, X, y): + self.classes_ = np.unique(y) + with mp.Pool(processes=self.n_jobs) as pool: + results = pool.map(partial(self._fit_tree, X=X, y=y), range(self.n_estimators)) + # Extract trees and feat_ids_by_tree from results + self.trees = [result[0] for result in results] + self.feat_ids_by_tree = [result[1] for result in results] + return self + + def _fit_tree(self, i, X, y): + np.random.seed(self.random_state + i) + feat_ids = np.random.choice(range(X.shape[1]), self.max_features, replace=False) + self.feat_ids_by_tree.append(feat_ids) + sample_indices = np.random.choice(range(X.shape[0]), X.shape[0], replace=True) + X_sampled = X[sample_indices][:, feat_ids] + y_sampled = y[sample_indices] + tree = DecisionTreeClassifier( + max_depth=self.max_depth, 
max_features=self.max_features, random_state=self.random_state + ) + tree.fit(X_sampled, y_sampled) + return tree, feat_ids + + def predict(self, X): + predictions = [self._predict_tree(X, i) for i in range(self.n_estimators)] + return np.mean(predictions, axis=0) + + def _predict_tree(self, X, i): + feat_ids = self.feat_ids_by_tree[i] + X_subset = X[:, feat_ids] + return self.trees[i].predict(X_subset) \ No newline at end of file diff --git a/main.py b/main.py index a0a22d5..a3f6d07 100644 --- a/main.py +++ b/main.py @@ -1,5 +1,530 @@ -import moduls.dna_rna_tools -import moduls.fastq_tools -import moduls.protein_tools - - +import os +import sys +import time +import datetime +import requests +import traceback +import functools +from io import StringIO +from dotenv import load_dotenv + + +def transcribe(sequns: str) -> str: + """ + That function transcribe DNA to RNA + :param sequns: DNA sequence + :return: RNA sequence + """ + return "".join([transcription_dict[i] for i in sequns]) + + +def reverse(sequns: str) -> str: + """ + That function reverses DNA sequence + :param sequns:DNA sequense + :return: reversed DNA sequence + """ + return sequns[::-1] + + +def complement(sequns: str) -> str: + """ + That function create a complementarity chain fot the DNA sequence + :param sequns: DNA sequense + :return: complementarity chain to the DNA sequence + """ + return "".join([complement_dict[i] for i in sequns]) + + +def reverse_complement(sequns: str) -> str: + """ + That function create reverse complementarity chain for the DNA sequence + :param sequns: sequense + :return: reversed complimentarity chain to the DNA sequence + """ + rev_sequns = sequns[::-1] + return "".join([complement_dict[i] for i in rev_sequns]) + + +operations = { + "transcribe": transcribe, + "reverse": reverse, + "complement": complement, + "reverse_complement": reverse_complement, +} + + +transcription_dict = { + "A": "U", + "a": "u", + "T": "A", + "t": "a", + "G": "C", + "g": "c", + "C": "g", + 
"c": "g", +} + + +complement_dict = { + "A": "T", + "a": "t", + "T": "A", + "t": "a", + "G": "C", + "g": "c", + "C": "g", + "c": "g", +} + + +def dna_rna_tools(*args): + *sequnses, def_name = args + for sequns in sequnses: + if RNA_checking(sequns) == True: + continue + else: + raise ValueError("Incorrect options input, please try again") + result = [] + for sequns in sequnses: + res = operations[def_name](sequns) + result.append(res) + return result + + +from os import makedirs +import os.path + + +def GC_cont_check(sequence_for_filtering: str, gc_bounds: list) -> bool: + """ + Check the GC content of the sequence for filtering + :param sequence_for_filtering: analyzed sequence + :param gc_bounds: list with limitation for GC content + :return: bool argument for filtering in fasta_filtering function + """ + GC_counter = 0 + for i in sequence_for_filtering: + if i == "C" or i == "G" or i == "c" or i == "g": + GC_counter += 1 + GC_calculc = GC_counter / len(sequence_for_filtering) * 100 + if len(gc_bounds) == 1: + if GC_calculc <= gc_bounds[0]: + return True + if len(gc_bounds) == 2: + if GC_calculc <= gc_bounds[1] and GC_calculc >= gc_bounds[0]: + return True + + +def lenght_chech(sequence_for_filtering: str, length_bounds: list) -> bool: + """ + Check the lenght of the sequence for filtering + :param sequence_for_filtering: analyzed sequence + :param length_bounds: list with limitations for lenght + :return: bool argument for filtering in fasta_filtering function + """ + if len(length_bounds) == 1: + if len(sequence_for_filtering) <= length_bounds[0]: + return True + if len(length_bounds) == 2: + if ( + len(sequence_for_filtering) <= length_bounds[1] + and len(sequence_for_filtering) >= length_bounds[0] + ): + return True + + +def quality_chech( + quality_of_sequence_for_filterring: str, quality_threshold: int +) -> bool: + """ + Check the quality of the sequence for filtering + :param quality_of_sequence_for_filterring: 2nd key for the sequense with quality of 
each nucleotide reading + :param quality_threshold: limitation for the quality of nucleotides reading + :return: bool argument for filtering in fasta_filtering function + """ + quality_threshold = str(quality_threshold) + quality_counter = 0 + for i in quality_of_sequence_for_filterring: + quality_counter += ord(i) + quality = quality_counter / len(quality_of_sequence_for_filterring) + if quality >= ord(quality_threshold): + return True + + +def seqs_creation(input_path_input: str) -> dict: + """ + Reading of the input file and creating the dictinary of the seqs + with structure + {name' : ('sequence', 'comment' 'quality') + } + :param input_path_input: + :return: dictinary of esquenses + """ + + path_input = str(input_path_input) + inline_new_dit_fasta = {} + outline_new_dict_fasta = {} + py_file = open(path_input) + lines = py_file.readlines() + i = 0 + seq = {} + seqs = {} + while i != (len(lines)): + key = lines[i] + key = key[:-1] + value_1 = lines[i + 1] + value_1 = value_1[:-1] + value_3 = lines[i + 2] + value_3 = value_3[:-1] + value_2 = lines[i + 3] + if value_2[-1] == "\n": + value_2 = value_2[:-1] + value = [value_1, value_3, value_2] + seq[key] = value + seqs = {**seq} + i += 4 + return seqs + + +def output_creating(input_path: str, output_filename: str, outline_new_dict_fasta: str): + """ + Create the output file and folder for the file + :param input_path: the name of input file for extracting sequenses + :param output_filename: the name of output file for writing sequenses + :param outline_new_dict_fasta: the name of the folder fot the keeping output filename + :return: nothing + """ + if output_filename != None: + os.makedirs("fastq_filtrator_resuls", exist_ok=True) + file_for_output_filename = ( + "fastq_filtrator_resuls/" + output_filename + ".fastq" + ) + with open(file_for_output_filename, mode="w") as f: + for key, value in outline_new_dict_fasta.items(): + f.write(key + "\n") + f.write(value[0] + "\n") + f.write(value[1] + "\n") + 
f.write(value[2] + "\n") + else: + os.makedirs("fastq_filtrator_resuls", exist_ok=True) + file_for_output_filename = "fastq_filtrator_resuls/" + input_path + with open(file_for_output_filename, mode="w") as f: + for key, value in outline_new_dict_fasta.items(): + f.write(key + "\n") + f.write(value[0] + "\n") + f.write(value[1] + "\n") + f.write(value[2] + "\n") + + +def fasta_filtering( + seqs, gc_bounds=(0, 100), length_bounds=(0, 2**32), quality_threshold=0 +): + if type(gc_bounds) != tuple: + gc_bounds = (0, gc_bounds) + if type(length_bounds) != tuple: + length_bounds = (0, length_bounds) + inline_new_dit_fasta = {} + for key, value in seqs.items(): + sequence_for_filtering = value[0] + quality_of_sequence_for_filterring = value[1] + if ( + GC_cont_check(sequence_for_filtering, gc_bounds) + and lenght_chech(sequence_for_filtering, length_bounds) + and quality_chech(quality_of_sequence_for_filterring, quality_threshold) + ): + inline_new_dit_fasta[key] = value + return inline_new_dict_fasta + + +def length_info(protein: str) -> int: + """ + Сounting the length of an amino acid sequence/protein in the number of amino acids + :param protein: sequence of protein + :return: number of amino acids in an amino acid sequence/protein + """ + return len(protein) + + +def count_percentage_aa(seq: str) -> dict: + """ + Count percentage of each amino acid in sequence + arguments: + - seq (str): sequence for counting + return: + - dict: dictionary with counted percentage + """ + l = count_length(seq) + result = {} + for aa in seq: + if aa not in result: + result[aa] = 1 + else: + result[aa] += 1 + result.update((key, round(value / l * 100, 2)) for key, value in result.items()) + res = { + key: value + for key, value in sorted(result.items(), key=lambda item: item[1], reverse=True) + } + return res + + +def compare_pattern(sequence: str, pattern: str) -> bool: + """ + Compare a given pattern to a fragment of sequence of the same length + arguments: + - sequence (str): 
sequence fragment to compare with the pattern + - pattern (str): pattern for comparison + return: + - (bool): whether pattern and fragment match + """ + for i in range(0, len(sequence)): + if not sequence[i] == pattern[i]: + return False + return True + + +def find_pattern(sequences: list, pattern: str) -> dict: + """ + Find all non-overlaping instances of a given pattern in sequences + arguments: + - sequences (list): sequences to find the pattern in + - pattern (str): pattern in question + return + - finds(dict): dictionary with sequences as keys and lists of indexes of patterns and the number of patterns as values + """ + finds = {} + for j in range(0, len(sequences)): + find = [] + for i in range(0, len(sequences[j])): + if compare_pattern(sequences[j][i : i + len(pattern)], pattern): + find.append(i) + i += len(pattern) + finds[sequences[j]] = [len(find)] + find + return finds + + +def get_protein_gene(protein): + """ + Transforming of an amino acid sequence/protein to DNA sequence + :param protein: amino acid sequence of protein + :return: sequence of protein in the DNA sequence form + """ + return "".join([retrnaslation_dict[aa] for i in protein]) + + +def rename_three_letter_name(seqs: list, sep="") -> list: + """ + Transform into a three-letter amino acids entry. 
+ arguments: + - seqs (list): list of sequences for transforming to three-letter entire + - sep (str): separator between aminoacids, default = '' + return: + - list: transformed sequences with separators + """ + res = [] + for seq in seqs: + threel_form = "" + for aa in seq: + threel_form = threel_form + threel[aa] + sep + if sep: + threel_form = threel_form[:-1] + res.append(threel_form) + return res + + +coperations = { + "length": length_info, + "percentage": count_percentage_aa, + "pattern": find_pattern, + "3Letter_name": rename_three_letter_name, + "DNA_code": get_protein_gene, +} + + +retrnaslation_dict = { + "F": "TTC", + "f": "ttc", + "L": "TTA", + "l": "tta", + "S": "TCG", + "s": "tcg", + "Y": "TAC", + "y": "tac", + "C": "TGC", + "c": "tgc", + "W": "TGG", + "w": "tgg", + "P": "CCC", + "p": "ccc", + "H": "CAT", + "h": "cat", + "Q": "GAA", + "q": "gaa", + "R": "CGA", + "r": "cga", + "I": "ATT", + "i": "att", + "M": "ATG", + "m": "atg", + "T": "ACC", + "t": "acc", + "N": "AAT", + "n": "aat", + "K": "AAA", + "k": "aaa", + "V": "GTT", + "v": "gtt", + "A": "GCA", + "a": "gca", + "D": "GAT", + "d": "gca", + "E": "GAG", + "e": "gag", + "G": "GGG", + "g": "ggg", +} + + +threel = { + "A": "ALA", + "R": "ARG", + "N": "ASN", + "D": "ASP", + "V": "VAL", + "H": "HIS", + "G": "GLY", + "Q": "GLN", + "E": "GLU", + "I": "ILE", + "L": "LEU", + "K": "LYS", + "M": "MET", + "P": "PRO", + "S": "SER", + "Y": "TYR", + "T": "THR", + "W": "TRP", + "F": "PHE", + "C": "CYS", + "a": "ala", + "r": "arg", + "n": "asn", + "d": "asp", + "v": "val", + "h": "his", + "g": "gly", + "q": "gln", + "e": "glu", + "i": "ile", + "l": "leu", + "k": "lys", + "m": "met", + "p": "pro", + "s": "ser", + "y": "tyr", + "t": "thr", + "w": "trp", + "f": "phe", + "c": "cys", +} + + +def protein_tool(*proteins, options=None): + proteins = list(proteins) + + operations = { + "compare": compare, + "length": length_info, + "percentage": count_percentage_aa, + "pattern": find_pattern, + "3Letter_name": 
rename_three_letter_name, + "DNA_code": get_protein_gene, + } + + if options == "compare": + result = operations[options](proteins[:-2], proteins[-2], proteins[-1]) + return result + elif options == "3Letter_name": + result = operations[options](proteins[:-1], proteins[-1]) + return result + elif options == "length" or options == "percentage" or options == "DNA_code": + result = [] + for protein in proteins: + res = operations[options](protein) + result.append(res) + return result + else: + raise ValueError("Incorrect options input, please try again") + + +from Bio import SeqIO +class FastQFilter: + def __init__(self, input_file, output_file, min_length, min_quality, min_gc): + self.input_file = input_file + self.output_file = output_file + self.min_length = min_length + self.min_quality = min_quality + self.min_gc = min_gc + + def filter_fastq(self): + with open(self.output_file, "w") as output_handle: + for record in SeqIO.parse(self.input_file, "fastq"): + if ( + len(record.seq) >= self.min_length + and min(record.letter_annotations["phred_quality"]) + >= self.min_quality + and Bio.SeqUtils.GC(record.seq) >= self.min_gc + ): + SeqIO.write(record, output_handle, "fastq") + + +load_dotenv("bot.env") + +TG_API_TOKEN = os.getenv("TG_API_TOKEN") + +TELEGRAM_API_URL = f"https://api.telegram.org/bot{TG_API_TOKEN}/sendMessage" + + +def telegram_logger(chat_id): + def decorator(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + start_time = time.time() + stdout = StringIO() + stderr = StringIO() + sys.stdout = stdout + sys.stderr = stderr + try: + result = func(*args, **kwargs) + status = "completed successfully" + except Exception as e: + error_info = "".join(traceback.format_exception_only(type(e), e)) + status = f"failed with error: {error_info}" + finally: + sys.stdout = sys.__stdout__ + sys.stderr = sys.__stderr__ + stdout_str = stdout.getvalue().strip() + stderr_str = stderr.getvalue().strip() + elapsed_time = time.time() - start_time + 
elapsed_time_str = str(datetime.timedelta(seconds=elapsed_time)) + message = ( + f"Function '{func.__name__}' {status} in {elapsed_time_str}\n\n" + ) + if stdout_str: + message += f"STDOUT:\n{stdout_str}\n\n" + if stderr_str: + message += f"STDERR:\n{stderr_str}\n\n" + send_telegram_message(chat_id, message) + return result + + return wrapper + + return decorator + + +def send_telegram_message(chat_id, text): + params = {"chat_id": chat_id, "text": text, "parse_mode": "HTML"} + response = requests.post(TELEGRAM_API_URL, params=params) + if not response.ok: + print(f"Failed to send Telegram message: {response.text}") diff --git a/moduls/dna_rna_tools.py b/moduls/dna_rna_tools.py deleted file mode 100644 index 49fbe83..0000000 --- a/moduls/dna_rna_tools.py +++ /dev/null @@ -1,71 +0,0 @@ -operations = { - 'transcribe': transcribe, - 'reverse': reverse, - 'complement': complement, - 'reverse_complement': reverse_complement - } - -transcription_dict = { - 'A': 'U', 'a': 'u', - 'T': 'A', 't': 'a', - 'G': 'C', 'g': 'c', - 'C': 'g', 'c': 'g' -} - - -complement_dict = { - 'A': 'T', 'a': 't', - 'T': 'A', 't': 'a', - 'G': 'C', 'g': 'c', - 'C': 'g', 'c': 'g' -} - - -def transcribe(sequns: str) -> str: - """ - That function transcribe DNA to RNA - :param sequns: DNA sequence - :return: RNA sequence - """ - return ''.join([transcription_dict[i] for i in sequns]) - - -def reverse (sequns: str) -> str: - """ - That function reverses DNA sequence - :param sequns:DNA sequense - :return: reversed DNA sequence - """ - return sequns[::-1] - -def complement (sequns: str) -> str: - """ - That function create a complementarity chain fot the DNA sequence - :param sequns: DNA sequense - :return: complementarity chain to the DNA sequence - """ - return ''.join([complement_dict[i] for i in sequns]) - - -def reverse_complement (sequns: str) -> str: - """ - That function create reverse complementarity chain for the DNA sequence - :param sequns: sequense - :return: reversed complimentarity chain 
to the DNA sequence - """ - rev_sequns = sequns[::-1] - return ''.join([complement_dict[i] for i in rev_sequns]) - - -def dna_rna_tools (*args): - *sequnses, def_name = args - for sequns in sequnses: - if RNA_checking(sequns) == True: - continue - else: - raise ValueError('Incorrect options input, please try again') - result = [] - for sequns in sequnses: - res = operations[def_name](sequns) - result.append(res) - return (result) diff --git a/moduls/fastq_tools.py b/moduls/fastq_tools.py deleted file mode 100644 index 20a76aa..0000000 --- a/moduls/fastq_tools.py +++ /dev/null @@ -1,129 +0,0 @@ -from os import makedirs -import os.path - -def GC_cont_check(sequence_for_filtering: str , gc_bounds: list) -> bool: - """ - Check the GC content of the sequence for filtering - :param sequence_for_filtering: analyzed sequence - :param gc_bounds: list with limitation for GC content - :return: bool argument for filtering in fasta_filtering function - """ - GC_counter = 0 - for i in sequence_for_filtering: - if i == "C" or i == "G" or i == "c" or i == "g": - GC_counter += 1 - GC_calculc = GC_counter/len(sequence_for_filtering)*100 - if len(gc_bounds)==1: - if GC_calculc <= gc_bounds[0]: - return True - if len(gc_bounds) == 2: - if GC_calculc <= gc_bounds[1] and GC_calculc >= gc_bounds[0]: - return True - -def lenght_chech (sequence_for_filtering: str, length_bounds: list) -> bool: - """ - Check the lenght of the sequence for filtering - :param sequence_for_filtering: analyzed sequence - :param length_bounds: list with limitations for lenght - :return: bool argument for filtering in fasta_filtering function - """ - if len(length_bounds)==1: - if len(sequence_for_filtering) <= length_bounds[0]: - return True - if len(length_bounds) == 2: - if len(sequence_for_filtering) <= length_bounds[1] and len(sequence_for_filtering) >= length_bounds[0]: - return True - -def quality_chech (quality_of_sequence_for_filterring: str, quality_threshold: int) -> bool: - """ - Check the quality of 
the sequence for filtering - :param quality_of_sequence_for_filterring: 2nd key for the sequense with quality of each nucleotide reading - :param quality_threshold: limitation for the quality of nucleotides reading - :return: bool argument for filtering in fasta_filtering function - """ - quality_threshold = str(quality_threshold) - quality_counter = 0 - for i in quality_of_sequence_for_filterring: - quality_counter += ord(i) - quality = quality_counter/len(quality_of_sequence_for_filterring) - if quality >= ord(quality_threshold): - return True - - -def seqs_creation(input_path_input: str) -> dict: - """ - Reading of the input file and creating the dictinary of the seqs - with structure - {name' : ('sequence', 'comment' 'quality') - } - :param input_path_input: - :return: dictinary of esquenses - """ - - path_input = str(input_path_input) - inline_new_dit_fasta = {} - outline_new_dict_fasta = {} - py_file = open (path_input) - lines = py_file.readlines() - i = 0 - seq = {} - seqs = {} - while i != (len(lines)): - key = lines[i] - key = key[:-1] - value_1 = lines[i+1] - value_1 = value_1 [:-1] - value_3 = lines[i+2] - value_3 = value_3[:-1] - value_2 = lines[i+3] - if value_2[-1] == '\n': - value_2 = value_2[:-1] - value = [value_1, value_3 ,value_2] - seq[key] = value - seqs = {**seq} - i += 4 - return seqs - - -def output_creating (input_path: str, output_filename: str, outline_new_dict_fasta: str): - """ - Create the output file and folder for the file - :param input_path: the name of input file for extracting sequenses - :param output_filename: the name of output file for writing sequenses - :param outline_new_dict_fasta: the name of the folder fot the keeping output filename - :return: nothing - """ - if output_filename != None: - os.makedirs('fastq_filtrator_resuls', exist_ok=True) - file_for_output_filename = 'fastq_filtrator_resuls/' + output_filename + '.fastq' - with open(file_for_output_filename, mode='w') as f: - for key, value in 
outline_new_dict_fasta.items(): - f.write(key + '\n') - f.write(value[0] + '\n') - f.write(value[1] + '\n') - f.write(value[2] + '\n') - else: - os.makedirs('fastq_filtrator_resuls', exist_ok=True) - file_for_output_filename = 'fastq_filtrator_resuls/' + input_path - with open(file_for_output_filename, mode='w') as f: - for key, value in outline_new_dict_fasta.items(): - f.write(key + '\n') - f.write(value[0] + '\n') - f.write(value[1] + '\n') - f.write(value[2] + '\n') - - -def fasta_filtering(seqs, gc_bounds = (0, 100), length_bounds = (0, 2**32), quality_threshold = 0): - if type(gc_bounds) != tuple: - gc_bounds = (0, gc_bounds) - if type(length_bounds) != tuple: - length_bounds = (0, length_bounds) - inline_new_dit_fasta = {} - for key, value in seqs.items(): - sequence_for_filtering = value[0] - quality_of_sequence_for_filterring = value[1] - if GC_cont_check(sequence_for_filtering, gc_bounds) and lenght_chech(sequence_for_filtering, length_bounds) and quality_chech(quality_of_sequence_for_filterring, quality_threshold): - inline_new_dit_fasta[key]= value - return (inline_new_dict_fasta) - - diff --git a/moduls/protein_tools.py b/moduls/protein_tools.py deleted file mode 100644 index 9e7382e..0000000 --- a/moduls/protein_tools.py +++ /dev/null @@ -1,163 +0,0 @@ -coperations = { - 'length': length_info, - 'percentage': count_percentage_aa, - 'pattern': find_pattern, - '3Letter_name': rename_three_letter_name, - 'DNA_code': get_protein_gene - } - - -retrnaslation_dict = { - 'F': 'TTC', 'f': 'ttc', - 'L': 'TTA', 'l': 'tta', - 'S': 'TCG', 's': 'tcg', - 'Y': 'TAC', 'y': 'tac', - 'C': 'TGC', 'c': 'tgc', - 'W': 'TGG', 'w': 'tgg', - 'P': 'CCC', 'p': 'ccc', - 'H': 'CAT', 'h': 'cat', - 'Q': 'GAA', 'q': 'gaa', - 'R': 'CGA', 'r': 'cga', - 'I': 'ATT', 'i': 'att', - 'M': 'ATG', 'm': 'atg', - 'T': 'ACC', 't': 'acc', - 'N': 'AAT', 'n': 'aat', - 'K': 'AAA', 'k': 'aaa', - 'V': 'GTT', 'v': 'gtt', - 'A': 'GCA', 'a': 'gca', - 'D': 'GAT', 'd': 'gca', - 'E': 'GAG', 'e': 'gag', - 
'G': 'GGG', 'g': 'ggg' - } - - -threel = {'A': 'ALA', 'R': 'ARG', 'N': 'ASN', 'D': "ASP", 'V': 'VAL', - 'H': 'HIS', 'G': "GLY", 'Q': "GLN", 'E': 'GLU', 'I': 'ILE', - 'L': 'LEU', 'K': 'LYS', 'M': 'MET', 'P': 'PRO', 'S': 'SER', - 'Y': 'TYR', 'T': 'THR', 'W': 'TRP', 'F': 'PHE', 'C': 'CYS', - 'a': 'ala', 'r': 'arg', 'n': 'asn', 'd': "asp", 'v': 'val', - 'h': 'his', 'g': "gly", 'q': "gln", 'e': 'glu', 'i': 'ile', - 'l': 'leu', 'k': 'lys', 'm': 'met', 'p': 'pro', 's': 'ser', - 'y': 'tyr', 't': 'thr', 'w': 'trp', 'f': 'phe', 'c': 'cys' -} - - -def length_info (protein: str) -> int: - """ - Сounting the length of an amino acid sequence/protein in the number of amino acids - :param protein: sequence of protein - :return: number of amino acids in an amino acid sequence/protein - """ - return len(protein) - - -def count_percentage_aa(seq: str) -> dict: - """ - Count percentage of each amino acid in sequence - arguments: - - seq (str): sequence for counting - return: - - dict: dictionary with counted percentage - """ - l = count_length(seq) - result = {} - for aa in seq: - if aa not in result: - result[aa] = 1 - else: - result[aa] += 1 - result.update((key, round(value / l * 100, 2)) for key, value in result.items()) - res = {key: value for key, value in sorted(result.items(), key=lambda item: item[1], reverse=True)} - return res - - -def compare_pattern(sequence: str, pattern: str) -> bool: - """ - Compare a given pattern to a fragment of sequence of the same length - arguments: - - sequence (str): sequence fragment to compare with the pattern - - pattern (str): pattern for comparison - return: - - (bool): whether pattern and fragment match - """ - for i in range(0, len(sequence)): - if not sequence[i] == pattern[i]: - return False - return True - - -def find_pattern(sequences: list, pattern: str) -> dict: - """ - Find all non-overlaping instances of a given pattern in sequences - arguments: - - sequences (list): sequences to find the pattern in - - pattern (str): pattern in 
question - return - - finds(dict): dictionary with sequences as keys and lists of indexes of patterns and the number of patterns as values - """ - finds = {} - for j in range(0, len(sequences)): - find = [] - for i in range(0, len(sequences[j])): - if compare_pattern(sequences[j][i:i + len(pattern)], pattern): - find.append(i) - i += len(pattern) - finds[sequences[j]] = [len(find)] + find - return finds - - -def get_protein_gene(protein): - """ - Transforming of an amino acid sequence/protein to DNA sequence - :param protein: amino acid sequence of protein - :return: sequence of protein in the DNA sequence form - """ - return ''.join([retrnaslation_dict[aa] for i in protein]) - - -def rename_three_letter_name(seqs: list, sep='') -> list: - """ - Transform into a three-letter amino acids entry. - arguments: - - seqs (list): list of sequences for transforming to three-letter entire - - sep (str): separator between aminoacids, default = '' - return: - - list: transformed sequences with separators - """ - res = [] - for seq in seqs: - threel_form = '' - for aa in seq: - threel_form = threel_form + threel[aa] + sep - if sep: - threel_form = threel_form[:-1] - res.append(threel_form) - return res - - -def protein_tool(*proteins, options=None): - proteins = list(proteins) - - operations = { - 'compare': compare, - 'length': length_info, - 'percentage': count_percentage_aa, - 'pattern': find_pattern, - '3Letter_name': rename_three_letter_name, - 'DNA_code': get_protein_gene - } - - if options == 'compare': - result = operations[options](proteins[:-2], proteins[-2], proteins[-1]) - return (result) - elif options == '3Letter_name': - result = operations[options](proteins[:-1], proteins[-1]) - return result - elif options == 'length' or options == 'percentage' or options == 'DNA_code': - result = [] - for protein in proteins: - res = operations[options](protein) - result.append(res) - return (result) - else: - raise ValueError('Incorrect options input, please try again') diff 
--git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..621c910 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,404 @@ +adal==1.2.2 +aiohttp==3.8.1 +aiosignal==1.2.0 +appdirs==1.4.4 +arcp==0.2.1 +argcomplete==1.8.1 +async-timeout==4.0.1 +attrs==21.2.0 +avro==1.11.0 +azure-agrifood-farming==1.0.0b1 +azure-ai-anomalydetector==3.0.0b3 +azure-ai-formrecognizer==3.2.0b2 +azure-ai-language-conversations==1.0.0b1 +azure-ai-language-questionanswering==1.0.0 +azure-ai-metricsadvisor==1.0.1 +azure-ai-textanalytics==5.2.0b3 +azure-ai-translation-document==1.0.0b6 +azure-appconfiguration==1.3.0 +azure-applicationinsights==0.1.0 +azure-batch==11.0.0 +azure-cognitiveservices-anomalydetector==0.3.0 +azure-cognitiveservices-formrecognizer==0.1.1 +azure-cognitiveservices-knowledge-qnamaker==0.3.0 +azure-cognitiveservices-language-luis==0.7.0 +azure-cognitiveservices-language-spellcheck==2.0.0 +azure-cognitiveservices-language-textanalytics==0.2.0 +azure-cognitiveservices-personalizer==0.1.0 +azure-cognitiveservices-search-autosuggest==0.2.0 +azure-cognitiveservices-search-customimagesearch==0.2.0 +azure-cognitiveservices-search-customsearch==0.3.0 +azure-cognitiveservices-search-entitysearch==2.0.0 +azure-cognitiveservices-search-imagesearch==2.0.0 +azure-cognitiveservices-search-newssearch==2.0.0 +azure-cognitiveservices-search-videosearch==2.0.0 +azure-cognitiveservices-search-visualsearch==0.2.0 +azure-cognitiveservices-search-websearch==2.0.0 +azure-cognitiveservices-vision-computervision==0.9.0 +azure-cognitiveservices-vision-contentmoderator==1.0.0 +azure-cognitiveservices-vision-customvision==3.1.0 +azure-cognitiveservices-vision-face==0.5.0 +azure-common==1.1.28 +azure-communication-chat==1.1.0 +azure-communication-identity==1.0.1 +azure-communication-networktraversal==1.0.0b1 +azure-communication-phonenumbers==1.0.1 +azure-communication-sms==1.0.1 +azure-confidentialledger==1.0.0b1 +azure-containerregistry==1.0.0b7 +azure-core==1.20.1 
+azure-data-tables==12.1.1 +azure-digitaltwins-core==1.1.0 +azure-eventgrid==4.7.0 +azure-eventhub==5.6.2 +azure-eventhub-checkpointstoreblob==1.1.4 +azure-eventhub-checkpointstoreblob-aio==1.1.4 +azure-eventhub-checkpointstoretable==1.0.0b1 +azure-graphrbac==0.61.1 +azure-identity==1.7.1 +azure-iot-deviceupdate==1.0.0b1 +azure-iot-modelsrepository==1.0.0b2 +azure-keyvault==4.1.0 +azure-keyvault-administration==4.1.0b2 +azure-keyvault-certificates==4.4.0b2 +azure-keyvault-keys==4.5.0b5 +azure-keyvault-secrets==4.4.0b2 +azure-loganalytics==0.1.0 +azure-media-analytics-edge==1.0.0b1 +azure-media-videoanalyzer-edge==1.0.0b3 +azure-messaging-webpubsubservice==1.0.0b3 +azure-mgmt-advisor==9.0.0 +azure-mgmt-agfood==1.0.0b1 +azure-mgmt-agrifood==1.0.0b1 +azure-mgmt-alertsmanagement==1.0.0 +azure-mgmt-apimanagement==2.1.0 +azure-mgmt-appconfiguration==2.0.0 +azure-mgmt-applicationinsights==1.0.0 +azure-mgmt-appplatform==6.1.0 +azure-mgmt-attestation==1.0.0 +azure-mgmt-authorization==2.0.0 +azure-mgmt-automanage==1.0.0b2 +azure-mgmt-automation==1.1.0b1 +azure-mgmt-avs==7.0.0b1 +azure-mgmt-azureadb2c==1.0.0b1 +azure-mgmt-azurearcdata==1.0.0 +azure-mgmt-azurestack==1.0.0 +azure-mgmt-azurestackhci==6.1.0b1 +azure-mgmt-baremetalinfrastructure==1.0.0 +azure-mgmt-batch==16.0.0 +azure-mgmt-batchai==7.0.0b1 +azure-mgmt-billing==6.0.0 +azure-mgmt-botservice==1.0.0 +azure-mgmt-cdn==11.0.0 +azure-mgmt-changeanalysis==1.0.0 +azure-mgmt-chaos==1.0.0b2 +azure-mgmt-cognitiveservices==12.0.0 +azure-mgmt-commerce==6.0.0 +azure-mgmt-communication==1.0.0 +azure-mgmt-compute==23.1.0 +azure-mgmt-confidentialledger==1.0.0b1 +azure-mgmt-confluent==2.0.0b1 +azure-mgmt-consumption==8.0.0 +azure-mgmt-containerinstance==9.1.0 +azure-mgmt-containerregistry==8.2.0 +azure-mgmt-containerservice==16.3.0 +azure-mgmt-core==1.3.0 +azure-mgmt-cosmosdb==7.0.0b2 +azure-mgmt-costmanagement==3.0.0 +azure-mgmt-customproviders==1.0.0 +azure-mgmt-databox==1.0.0 +azure-mgmt-databoxedge==1.0.0 
+azure-mgmt-databricks==1.1.0b1 +azure-mgmt-datadog==2.0.0 +azure-mgmt-datafactory==2.0.0 +azure-mgmt-datalake-analytics==0.6.0 +azure-mgmt-datalake-store==1.0.0 +azure-mgmt-datamigration==10.0.0 +azure-mgmt-dataprotection==1.0.0b1 +azure-mgmt-datashare==1.0.0 +azure-mgmt-deploymentmanager==1.0.0 +azure-mgmt-deviceupdate==1.0.0b3 +azure-mgmt-devspaces==1.0.0b1 +azure-mgmt-devtestlabs==9.0.0 +azure-mgmt-digitaltwins==6.0.0 +azure-mgmt-dns==8.0.0 +azure-mgmt-documentdb==0.1.3 +azure-mgmt-edgegateway==0.1.0 +azure-mgmt-edgeorder==1.0.0b1 +azure-mgmt-elastic==1.0.0 +azure-mgmt-eventgrid==9.0.0 +azure-mgmt-eventhub==10.0.0 +azure-mgmt-extendedlocation==1.0.0 +azure-mgmt-fluidrelay==1.0.0b1 +azure-mgmt-frontdoor==1.0.0 +azure-mgmt-guestconfig==1.0.0b1 +azure-mgmt-hanaonazure==1.0.0 +azure-mgmt-hdinsight==9.0.0 +azure-mgmt-healthbot==1.0.0b1 +azure-mgmt-healthcareapis==1.1.0b1 +azure-mgmt-hybridcompute==7.0.0 +azure-mgmt-hybridkubernetes==1.1.0 +azure-mgmt-hybridnetwork==1.0.0 +azure-mgmt-imagebuilder==1.0.0b1 +azure-mgmt-iotcentral==9.0.0b1 +azure-mgmt-iothub==2.1.0 +azure-mgmt-iothubprovisioningservices==1.0.0 +azure-mgmt-keyvault==9.2.0 +azure-mgmt-kubernetesconfiguration==1.0.0b1 +azure-mgmt-kusto==2.1.0 +azure-mgmt-labservices==1.0.0 +azure-mgmt-loganalytics==11.0.0 +azure-mgmt-logic==9.0.0 +azure-mgmt-logz==1.0.0 +azure-mgmt-machinelearningcompute==1.0.0b1 +azure-mgmt-machinelearningservices==1.0.0 +azure-mgmt-maintenance==2.0.0 +azure-mgmt-managedservices==6.0.0 +azure-mgmt-managementgroups==1.0.0 +azure-mgmt-managementpartner==1.0.0 +azure-mgmt-maps==2.0.0 +azure-mgmt-marketplaceordering==1.1.0 +azure-mgmt-media==8.0.0 +azure-mgmt-mixedreality==1.0.0 +azure-mgmt-monitor==2.0.0 +azure-mgmt-msi==6.0.0b1 +azure-mgmt-netapp==5.1.0 +azure-mgmt-network==19.2.0 +azure-mgmt-notificationhubs==7.0.0 +azure-mgmt-operationsmanagement==1.0.0 +azure-mgmt-peering==1.0.0 +azure-mgmt-policyinsights==1.1.0b1 +azure-mgmt-portal==1.0.0 +azure-mgmt-powerbidedicated==1.0.0 
+azure-mgmt-powerbiembedded==2.0.0 +azure-mgmt-privatedns==1.0.0 +azure-mgmt-purview==1.0.0 +azure-mgmt-quantum==1.0.0b2 +azure-mgmt-quota==1.0.0b2 +azure-mgmt-rdbms==10.0.0 +azure-mgmt-recoveryservices==2.0.0 +azure-mgmt-recoveryservicesbackup==3.0.0 +azure-mgmt-recoveryservicessiterecovery==1.0.0b1 +azure-mgmt-redhatopenshift==1.0.0 +azure-mgmt-redis==13.0.0 +azure-mgmt-redisenterprise==1.0.0 +azure-mgmt-regionmove==1.0.0b1 +azure-mgmt-relay==1.1.0 +azure-mgmt-reservations==1.0.0 +azure-mgmt-resource==20.0.0 +azure-mgmt-resourcegraph==8.1.0b1 +azure-mgmt-resourcehealth==1.0.0b1 +azure-mgmt-resourcemover==1.1.0b2 +azure-mgmt-scheduler==7.0.0b1 +azure-mgmt-search==8.0.0 +azure-mgmt-security==2.0.0b1 +azure-mgmt-securityinsight==1.0.0b1 +azure-mgmt-serialconsole==1.0.0 +azure-mgmt-servermanager==2.0.0 +azure-mgmt-servicebus==7.1.0 +azure-mgmt-servicefabric==2.0.0 +azure-mgmt-servicefabricmanagedclusters==1.0.0 +azure-mgmt-servicelinker==1.0.0b1 +azure-mgmt-signalr==1.0.0 +azure-mgmt-sql==3.0.1 +azure-mgmt-sqlvirtualmachine==1.0.0b1 +azure-mgmt-storage==19.0.0 +azure-mgmt-storagecache==1.1.0 +azure-mgmt-storageimportexport==1.0.0b1 +azure-mgmt-storagepool==1.0.0 +azure-mgmt-storagesync==1.0.0 +azure-mgmt-streamanalytics==1.0.0rc1 +azure-mgmt-subscription==1.0.0 +azure-mgmt-support==6.0.0 +azure-mgmt-synapse==2.1.0b2 +azure-mgmt-testbase==1.0.0b1 +azure-mgmt-timeseriesinsights==1.0.0 +azure-mgmt-trafficmanager==1.0.0b1 +azure-mgmt-videoanalyzer==1.0.0b3 +azure-mgmt-vmwarecloudsimple==1.0.0b1 +azure-mgmt-web==5.0.0 +azure-mgmt-webpubsub==1.0.0 +azure-mgmt-workloadmonitor==1.0.0b2 +azure-mixedreality-authentication==1.0.0b1 +azure-monitor-opentelemetry-exporter==1.0.0b6 +azure-monitor-query==1.0.1 +azure-purview-administration==1.0.0b2 +azure-purview-catalog==1.0.0b3 +azure-purview-scanning==1.0.0b3 +azure-schemaregistry==1.0.0 +azure-schemaregistry-avroserializer==1.0.0b4 +azure-search-documents==11.3.0b5 +azure-security-attestation==1.0.1 +azure-servicebus==7.4.0 
+azure-servicefabric==8.0.0.0 +azure-servicemanagement-legacy==0.20.7 +azure-storage-blob==12.9.0 +azure-storage-blob-changefeed==12.0.0b3 +azure-storage-file-datalake==12.5.0 +azure-storage-file-share==12.6.0 +azure-storage-queue==12.1.6 +azure-synapse==0.1.1 +azure-synapse-accesscontrol==0.8.0 +azure-synapse-artifacts==0.10.0 +azure-synapse-managedprivateendpoints==0.5.0 +azure-synapse-monitoring==0.3.0 +azure-synapse-spark==0.8.0 +azure-template==0.0.18b3 +Babel==2.8.0 +bcrypt==3.2.0 +beautifulsoup4==4.10.0 +biopython==1.81 +black==21.10b0 +blinker==1.4 +boto3==1.20.34 +botocore==1.23.34 +CacheControl==0.12.10 +cachetools==5.0.0 +certifi==2020.6.20 +chardet==4.0.0 +charset-normalizer==2.0.6 +click==8.0.3 +colorama==0.4.4 +coloredlogs==7.3 +command-not-found==0.3 +ConfigArgParse==1.5.3 +connection-pool==0.0.3 +contourpy==1.2.1 +cryptography==3.4.8 +cwltool==3.1.20220224085855 +cycler==0.12.1 +datrie==0.8.2 +dbus-python==1.2.18 +decorator==4.4.2 +distro==1.7.0 +distro-info==1.1+ubuntu0.2 +docutils==0.17.1 +drmaa==0.7.9 +dropbox==11.26.0 +filelock==3.6.0 +Flask==2.0.1 +fonttools==4.51.0 +frozenlist==1.2.0 +ftputil==5.0.3 +gevent==21.8.0 +gitdb==4.0.9 +GitPython==3.1.24 +google-auth==1.5.1 +greenlet==1.1.2 +gyp==0.1 +html5lib==1.1 +httplib2==0.20.2 +humanfriendly==10.0 +idna==3.3 +importlib-metadata==4.6.4 +ipython_genutils==0.2.0 +isodate==0.6.1 +itsdangerous==2.1.0 +jeepney==0.7.1 +Jinja2==3.0.3 +jmespath==0.10.0 +joblib==1.4.0 +jsonschema==3.2.0 +jupyter-core==4.9.1 +keyring==23.5.0 +kiwisolver==1.4.5 +kubernetes==12.0.1 +launchpadlib==1.10.16 +lazr.restfulclient==0.14.4 +lazr.uri==1.0.6 +lockfile==0.12.2 +lxml==4.8.0 +MarkupSafe==2.0.1 +matplotlib==3.8.4 +mistune==0.8.4 +more-itertools==8.10.0 +msal==1.17.0 +msal-extensions==1.0.0 +msgpack==1.0.3 +msrest==0.6.21 +msrestazure==0.6.4 +multidict==5.1.0 +mypy-extensions==0.4.3 +nbformat==5.1.3 +netifaces==0.11.0 +networkx==2.4 +numpy==1.26.1 +oauthlib==3.2.0 +olefile==0.46 +packaging==24.0 +paramiko==2.9.3 
+pathspec==0.9.0 +Pillow==9.0.1 +platformdirs==2.5.1 +ply==3.11 +portalocker==2.2.1 +prettytable==2.5.0 +prov==2.0.0 +psutil==5.9.0 +PuLP==2.5.1 +pyasn1==0.4.8 +pyasn1-modules==0.2.1 +pydot==1.4.2 +Pygments==2.11.2 +PyGObject==3.42.1 +pygraphviz==1.7 +pyinotify==0.9.6 +PyJWT==2.3.0 +PyNaCl==1.5.0 +pyOpenSSL==21.0.0 +pyparsing==2.4.7 +pyrsistent==0.18.1 +python-apt==2.4.0+ubuntu3 +python-dateutil==2.8.1 +python-dotenv==1.0.1 +python-irodsclient==0.8.1 +pytz==2022.1 +PyYAML==5.4.1 +ratelimiter==1.2.0.post0 +rdflib==6.1.1 +rdflib-jsonld==0.6.1 +reportlab==3.6.8 +requests==2.25.1 +requests-oauthlib==1.3.0 +roman==3.3 +rsa==4.8 +ruamel.yaml==0.17.16 +ruamel.yaml.clib==0.2.6 +s3transfer==0.5.0 +schema-salad==8.2.20220103095339 +scikit-learn==1.4.2 +scipy==1.13.0 +SecretStorage==3.3.1 +shellescape==3.4.1 +simplejson==3.17.6 +six==1.16.0 +smart-open==5.2.1 +smmap==5.0.0 +snakemake==6.15.1 +soupsieve==2.3.1 +SPARQLWrapper==1.8.5 +stone==3.3.1 +stopit==1.1.2 +systemd-python==234 +tabulate==0.8.9 +threadpoolctl==3.5.0 +tomli==1.2.2 +toposort==1.6 +traitlets==5.1.1 +typing-extensions==3.10.0.2 +uamqp==1.5.1 +ubuntu-advantage-tools==8001 +ufw==0.36.1 +unattended-upgrades==0.1 +urllib3==1.26.5 +wadllib==1.3.6 +wcwidth==0.2.5 +webencodings==0.5.1 +websocket-client==1.2.3 +Werkzeug==2.0.2 +wrapt==1.13.3 +yappi==1.3.3 +yarl==1.7.2 +zipp==1.0.0 +zope.event==4.4 +zope.interface==5.4.0 diff --git a/test_my_first_tool.py b/test_my_first_tool.py new file mode 100644 index 0000000..859b8a8 --- /dev/null +++ b/test_my_first_tool.py @@ -0,0 +1,95 @@ +import unittest +from main_script import (FastQFilter, + protein_tool, + fasta_filtering, + output_creating, + lenght_chech, + quality_chech, + dna_rna_tools) +#проверка чтения fastq +class TestFastQFilter(unittest.TestCase): + def setUp(self): + self.input_file = "test_input.fastq" + self.output_file = "test_output.fastq" + self.min_length = 50 + self.min_quality = 20 + self.min_gc = 40 + self.filter_obj = FastQFilter(self.input_file, 
self.output_file, self.min_length, self.min_quality, self.min_gc) + def tearDown(self): + import os + os.remove(self.output_file) + def test_filter_fastq(self): + with open(self.input_file, "w") as f: + f.write("@test_sequence\nACTG\n+\n{}\n".format("".join(chr(self.min_quality + 10) for _ in range(4)))) + self.filter_obj.filter_fastq() + with open(self.output_file, "r") as f: + filtered_record = f.read().strip() + self.assertEqual(filtered_record, "@test_sequence\nACTG\n+\n{}\n".format("".join(chr(self.min_quality + 10) for _ in range(4)))) + +#Проверка ошибок, для функции с ValueError +class TestProteinTool(unittest.TestCase): + def test_incorrect_options_input(self): + with self.assertRaises(ValueError): + protein_tool("protein1", "protein2", options="incorrect_option") + +class TestFastaFiltering(unittest.TestCase): + def test_incorrect_gc_bounds_input(self): + seqs = {"seq1": ("ATGC", "good_quality")} + with self.assertRaises(ValueError): + fasta_filtering(seqs, gc_bounds="incorrect_input") + + def test_incorrect_length_bounds_input(self): + seqs = {"seq1": ("ATGC", "good_quality")} + with self.assertRaises(ValueError): + fasta_filtering(seqs, length_bounds="incorrect_input") + +class TestOutputCreating(unittest.TestCase): + def test_output_creation_with_filename(self): + # Проверяем, что создается выходной файл с заданным именем + outline_new_dict_fasta = {"seq1": ("ATGC", "comment", "quality")} + output_filename = "output_file" + output_creating("input_file.fasta", output_filename, outline_new_dict_fasta) + self.assertTrue(os.path.exists("fastq_filtrator_resuls/" + output_filename + ".fastq")) + + def test_output_creation_without_filename(self): + # Проверяем, что создается выходной файл с именем входного файла + outline_new_dict_fasta = {"seq1": ("ATGC", "comment", "quality")} + input_path = "input_file.fasta" + output_creating(input_path, None, outline_new_dict_fasta) + self.assertTrue(os.path.exists("fastq_filtrator_resuls/" + input_path)) + +class 
TestLengthCheck(unittest.TestCase): + def test_length_check_within_bounds(self): + # Проверяем, что функция возвращает True, если длина последовательности внутри заданных границ + sequence = "ATGC" + length_bounds = (1, 10) + self.assertTrue(lenght_chech(sequence, length_bounds)) + + def test_length_check_outside_bounds(self): + # Проверяем, что функция возвращает False, если длина последовательности вне заданных границ + sequence = "ATGC" + length_bounds = (10, 20) + self.assertFalse(lenght_chech(sequence, length_bounds)) + +class TestQualityCheck(unittest.TestCase): + def test_quality_check_above_threshold(self): + # Проверяем, что функция возвращает True, если качество последовательности выше порога + quality_sequence = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + quality_threshold = 60 + self.assertTrue(quality_chech(quality_sequence, quality_threshold)) + + def test_quality_check_below_threshold(self): + # Проверяем, что функция возвращает False, если качество последовательности ниже порога + quality_sequence = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + quality_threshold = 100 + self.assertFalse(quality_chech(quality_sequence, quality_threshold)) + +class TestDNARnaTools(unittest.TestCase): + def test_input_rna_sequence(self): + # Проверяем, что функция выбрасывает ValueError, если на вход подается РНК + with self.assertRaises(ValueError): + dna_rna_tools("AUGC", "length") + + +if __name__ == '__main__': + unittest.main()