diff --git a/README.md b/README.md index ba78686..06139ab 100644 --- a/README.md +++ b/README.md @@ -1 +1,83 @@ -# misc_module \ No newline at end of file +# misc_module + +This module contains assignments for the BI course on python. The scripts include tasks on OOP, API and various functions useful for basic bioinformatics. +The following functions can be found in `bio_files_processor.py`: + +``` +convert_multiline_fasta_to_oneline +``` + +Example: + +` +convert_multiline_fasta_to_oneline(input_fasta='example_multiline_fasta.fasta') +` + +Input file: + +![before](https://github.com/sme229/misc_module/assets/104040609/65e68a7a-a47c-4335-8d10-a88387fa3bdd) + +After conversion to a single line: + +![after](https://github.com/sme229/misc_module/assets/104040609/c85e4283-295e-4689-a156-5c464cec2164) + +``` +OpenFasta +``` + +This is a context manager that works with fasta files. + +- It returns records as `FastaRecord` class objects +- Includes `read_record` and `read_records` methods + +Input and output example: + +```fasta +>GTD326487.1 Species anonymous 24 chromosome +ATCGACTACGACTAGCATCACGATCACGATACG +ATGCATCAGTAGCACTAGATCA +``` + +```python +id = 'GTD326487.1' +description = 'Species anonymous 24 chromosome' +sequence = 'ATCGACTACGACTAGCATCACGATCACGATACGATGCATCAGTAGCACTAGATCA' +``` + +In biopython_fastq_filter.py the following functions are located: + +``` +fastq_filter +``` + +This function uses BioPython and filters fastq sequences by GC content, sequence length and quality score. + + +``` +BiologicalSequence +``` + +This is an abstract class that includes: + +Class NucleicAcidSequence which has `complement` and `gc_content` methods. It's a parent class to DNASequence and RNASequence classes. + +Class AminoAcidSequence has `amino_acid_frequency` method. + + +``` +telegram_logger +``` + +This function send a message from a telegram bot about the status of some process: + +![Untitled](https://github.com/sme229/misc_module/assets/104040609/141f1cd1-1430-48c7-b8ab-dda41db214ea) + + +``` +run_genscan +``` + +This is a python API for this web tool http://hollywood.mit.edu/GENSCAN.html + + +In `custom_random_forest.py` there is a `RandomForestClassifierCustom` class that works with a custom number of threads which makes it fast. diff --git a/Showcases.ipynb b/Showcases.ipynb new file mode 100644 index 0000000..676ef72 --- /dev/null +++ b/Showcases.ipynb @@ -0,0 +1,449 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "39e3aac3-ed97-4c58-871e-a334ee17c2a4", + "metadata": {}, + "source": [ + "# OpenFasta" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "fec18e02-01e2-41ef-97c0-26aeffd2f057", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id = 'GTD323452'\n", + " description = '5S_rRNA'\n", + " sequence = 'ACGGCCATAGGACTTTGAAAGCACCGCATCCCGTCCGATCTGCGAAGTTAACCAAGATGCCGCCTGGTTAGTACCATGGTGGGGGACCACATGGGAATCCCTGGTGCTGTG'\n" + ] + } + ], + "source": [ + "import time\n", + "import os\n", + "from typing import Optional\n", + "\n", + "from bio_files_processor import OpenFasta\n", + "\n", + "fasta_file = \"data/example_fasta.fasta\"\n", + "\n", + "with OpenFasta(fasta_file) as fasta:\n", + " for record in fasta.read_records():\n", + " print(record) " + ] + }, + { + "cell_type": "markdown", + "id": "f2cff184-adc1-4396-aabe-5634dd654efc", + "metadata": {}, + "source": [ + "# Run_genscan" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "2bf845cb-ff8a-4dac-b121-c40f5351dc99", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Cannot provide both a sequence containing file and a sequence in the form. Please choose one.\n", + "\n", + "GENSCAN Output\n", + "\n", + "\n", + "\n", + "

GENSCAN Output

\n", + "
\n", + "View gene model output: PS | PDF\n", + "
\n",
+       "GENSCAN 1.0\tDate run: 29-Apr-124\tTime: 18:45:12\n",
+       "\n",
+       "\n",
+       "\n",
+       "Sequence /tmp/04_29_24-18:45:12.fasta : 18651 bp : 48.67% C+G : Isochore 2 (43 - 51 C+G%)\n",
+       "\n",
+       "\n",
+       "\n",
+       "Parameter matrix: HumanIso.smat\n",
+       "\n",
+       "\n",
+       "\n",
+       "Predicted genes/exons:\n",
+       "\n",
+       "\n",
+       "\n",
+       "Gn.Ex Type S .Begin ...End .Len Fr Ph I/Ac Do/T CodRg P.... Tscr..\n",
+       "\n",
+       "----- ---- - ------ ------ ---- -- -- ---- ---- ----- ----- ------\n",
+       "\n",
+       "\n",
+       "\n",
+       " 1.01 Intr +     64    144   81  2  0   40   99   113 0.455   7.21\n",
+       "\n",
+       " 1.02 Intr +   8152   8490  339  2  0   86  110   352 0.992  32.65\n",
+       "\n",
+       " 1.03 Intr +   9870   9933   64  1  1   88  115    68 0.994   7.28\n",
+       "\n",
+       " 1.04 Intr +  14487  14599  113  0  2   71  100   118 0.742  11.32\n",
+       "\n",
+       " 1.05 Intr +  16751  16773   23  0  2   84  110     7 0.471  -0.24\n",
+       "\n",
+       " 1.06 Intr +  17109  17243  135  2  0  101   74   173 0.993  17.96\n",
+       "\n",
+       " 1.07 Term +  17717  17872  156  1  0  109   38    97 0.989   4.73\n",
+       "\n",
+       " 1.08 PlyA +  18627  18632    6                               1.05\n",
+       "\n",
+       "\n",
+       "\n",
+       "\n",
+       "\n",
+       "Suboptimal exons with probability > 1.000\n",
+       "\n",
+       "\n",
+       "\n",
+       "Exnum Type S .Begin ...End .Len Fr Ph B/Ac Do/T CodRg P.... Tscr..\n",
+       "\n",
+       "----- ---- - ------ ------ ---- -- -- ---- ---- ----- ----- ------\n",
+       "\n",
+       "\n",
+       "\n",
+       "\n",
+       "\n",
+       "NO EXONS FOUND AT GIVEN PROBABILITY CUTOFF\n",
+       "\n",
+       "\n",
+       "\n",
+       "\n",
+       "\n",
+       "Predicted peptide sequence(s):\n",
+       "\n",
+       "\n",
+       "\n",
+       "\n",
+       "\n",
+       ">/tmp/04_29_24-18:45:12.fasta|GENSCAN_predicted_peptide_1|303_aa\n",
+       "\n",
+       "XSTEGNGDLSEEKMPLLTLYLLLFWLSGYSIVTQITGPTTVNGLERGSLTVQCVYRSGWE\n",
+       "\n",
+       "TYLKWWCRGAIWRDCKILVKTSGSEQEVKRDRVSIKDNQKNRTFTVTMEDLMKTDADTYW\n",
+       "\n",
+       "CGIEKTGNDLGVTVQVTIDPAPVTQEETSSSPTLTGHHLDNRHKLLKLSVLLPLIFTILL\n",
+       "\n",
+       "LLLVAASLLAWRMMKYQQKAAGMSPEQVLQPLEGDLCYADLTLQLAGTSPQKATTKLSSA\n",
+       "\n",
+       "QVDQVEVEYVTMASLPKEDISYASLTLGAEDQEPTYCNMGHLSSHLPGRGPEEPTEYSTI\n",
+       "\n",
+       "SRP\n",
+       "\n",
+       "
\n", + "
\n", + "
\n", + "

Back to GENSCAN\n", + "

\n" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from bs4 import BeautifulSoup\n", + "from biopython_fastq_filter import run_genscan\n", + "\n", + "output = run_genscan(sequence=None, sequence_file=\"data/sequence.fasta\", organism=\"Vertebrate\", exon_cutoff=1.00, sequence_name=\"\")\n", + "soup = BeautifulSoup(output, 'html.parser')\n", + "lines = soup.prettify().split(\"\\n\")\n", + "cds_list = []\n", + "soup" + ] + }, + { + "cell_type": "markdown", + "id": "aea07324-6768-4c42-8371-c7ba75436049", + "metadata": {}, + "source": [ + "# RNASequence, DNASequence, AminoAcidSequence " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "2aafe0ba-fecb-467c-af55-977c5850c59d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "AAUU\n", + "{'K': 33.33, 'P': 33.33, 'L': 33.33}\n", + "CCTT\n" + ] + }, + { + "data": { + "text/plain": [ + "str" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from biopython_fastq_filter import NucleicAcidSequence, BiologicalSequence, RNASequence, DNASequence, AminoAcidSequence \n", + "\n", + "new_rna = RNASequence('UUAA')\n", + "print(new_rna.complement())\n", + "\n", + "new_protein = AminoAcidSequence('KKPPLL')\n", + "print(new_protein.amino_acid_frequency())\n", + "\n", + "new_dna = DNASequence('GGAA')\n", + "print(new_dna.complement())\n", + "type(new_dna.complement())" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "134df12f-398c-49f7-9b32-c95ebfe3f89e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1m============================= test session starts ==============================\u001b[0m\n", + "platform linux -- Python 3.12.0, pytest-8.2.0, pluggy-1.5.0\n", + "rootdir: /home/lsmertina/misc_module\n", + "plugins: anyio-4.3.0, requests-mock-1.12.1\n", + "collected 8 items \u001b[0m\n", + "\n", + "test_my_tools.py \u001b[32m.\u001b[0m\u001b[32m.\u001b[0m\u001b[32m.\u001b[0m\u001b[32m.\u001b[0m\u001b[32m.\u001b[0m\u001b[32m.\u001b[0m\u001b[32m.\u001b[0m\u001b[32m.\u001b[0m\u001b[33m [100%]\u001b[0m\n", + "\n", + "\u001b[33m=============================== warnings summary ===============================\u001b[0m\n", + "test_my_tools.py::test_filter_fastq\n", + " /home/lsmertina/miniforge3/envs/testing/lib/python3.12/site-packages/Bio/SeqUtils/__init__.py:144: BiopythonDeprecationWarning: GC is deprecated; please use gc_fraction instead.\n", + " warnings.warn(\n", + "\n", + "-- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html\n", + "\u001b[33m========================= \u001b[32m8 passed\u001b[0m, \u001b[33m\u001b[1m1 warning\u001b[0m\u001b[33m in 0.29s\u001b[0m\u001b[33m =========================\u001b[0m\n" + ] + } + ], + "source": [ + "! python -m pytest" + ] + }, + { + "cell_type": "markdown", + "id": "5ef9fe6c-3fb6-41e4-97fe-eecde2c97028", + "metadata": {}, + "source": [ + "# RandomForestClassifierCustom" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "d9581b19-6f73-474d-9159-b74c97c3ab40", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "import warnings\n", + "import random\n", + "import math\n", + "import pandas as pd\n", + "import xgboost\n", + "import lightgbm\n", + "import catboost\n", + "\n", + "from matplotlib.colors import ListedColormap\n", + "from scipy.stats import pearsonr\n", + "from itertools import combinations\n", + "from sklearn.base import BaseEstimator\n", + "from sklearn import datasets\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.ensemble import (RandomForestClassifier,\n", + " ExtraTreesClassifier,\n", + " VotingClassifier)\n", + "from sklearn.tree import (DecisionTreeRegressor,\n", + " DecisionTreeClassifier)\n", + "from custom_random_forest import RandomForestClassifierCustom\n", + "import time\n", + "from sklearn.datasets import make_classification" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "f03467db-99fa-4b07-9e18-9dc015b2b28d", + "metadata": {}, + "outputs": [], + "source": [ + "X, y = make_classification(n_samples=100000)\n", + "random_forest = RandomForestClassifierCustom(max_depth=30, n_estimators=10, \n", + " max_features=2, random_state=42)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "7b735b11-70e9-4b19-b82f-fa7cca0aa901", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2.9622790813446045\n" + ] + } + ], + "source": [ + "import time\n", + "start_time = time.time()\n", + "random_forest.fit(X, y, n_processes=1)\n", + "fit_time_1_process = time.time() - start_time\n", + "print(fit_time_1_process)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "3c7c0196-8209-4518-95f0-753972c2a22e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2.2971296310424805\n" + ] + } + ], + "source": [ + "#Fit with 2 processes\n", + "start_time = time.time()\n", + "random_forest.fit(X, y, n_processes=2)\n", + "fit_time_2_processes = time.time() - start_time\n", + "print(fit_time_2_processes)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "e66deca9-1f5a-44a6-98bf-bff0ea6cade6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.21371793746948242\n" + ] + } + ], + "source": [ + "#Predict with 1 process\n", + "start_time = time.time()\n", + "predictions_1_process = random_forest.predict(X, n_processes=1)\n", + "predict_time_1_process = time.time() - start_time\n", + "print(predict_time_1_process)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "d6bfb90d-51bd-4a76-a405-67172981d061", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.2030501365661621\n" + ] + } + ], + "source": [ + "#Predict with 2 processes\n", + "start_time = time.time()\n", + "predictions_2_processes = random_forest.predict(X, n_processes=2)\n", + "predict_time_2_processes = time.time() - start_time\n", + "print(predict_time_2_processes)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "c9ea23c3-3e68-4208-82c3-64be50b47992", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fit time with 1 process: 2.9622790813446045\n", + "Fit time with 2 processes: 2.2971296310424805\n", + "Predict time with 1 process: 0.21371793746948242\n", + "Predict time with 2 processes: 0.2030501365661621\n", + "Predictions match: True\n" + ] + } + ], + "source": [ + "#Check if predictions are the same\n", + "predictions_match = np.array_equal(predictions_1_process, predictions_2_processes)\n", + "\n", + "print(\"Fit time with 1 process:\", fit_time_1_process)\n", + "print(\"Fit time with 2 processes:\", fit_time_2_processes)\n", + "print(\"Predict time with 1 process:\", predict_time_1_process)\n", + "print(\"Predict time with 2 processes:\", predict_time_2_processes)\n", + "print(\"Predictions match:\", predictions_match)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/bio_files_processor.py b/bio_files_processor.py new file mode 100644 index 0000000..8a9f12d --- /dev/null +++ b/bio_files_processor.py @@ -0,0 +1,73 @@ +import os +from typing import Optional +from dataclasses import dataclass + +def convert_multiline_fasta_to_oneline(input_fasta: str, output_fasta: Optional[str]=None) -> str: + """ + This function converts a multiline fasta file into one line fasta + Arguments: input file and output file name (optional) + Returns a file in current working directory + """ + if output_fasta == None: + output_fasta = 'output_fasta.fasta' + current_dir = str(os.getcwd()) + output_file = os.path.join(current_dir, output_fasta) + with open(input_fasta) as input_file, open(output_fasta, mode='w') as output_file: + help_list = [] + for line in input_file: + if line.startswith('>'): + if len(help_list)!=0: + output_file.write(''.join(help_list) + '\n') + help_list = [] + output_file.write(line) + else: + help_list.append(line.strip()) + if len(help_list)!=0: + output_file.write(''.join(help_list) + '\n') + return output_file + +@dataclass +class FastaRecord: + id: str + description: str + sequence: str + + def __repr__(self): + return f"id = '{self.id}'\n description = '{self.description}'\n sequence = '{self.sequence}'" + +class OpenFasta: + def __init__(self, filename): + self.filename = filename + self.file = None + + def __enter__(self): + self.file = open(self.filename, 'r') + return self + + def __exit__(self, exc_type, exc_value, traceback): + if self.file: + self.file.close() + + def read_record(self): + name_line = self.file.readline().strip() + if not name_line: + return None + parts = name_line.split(' ', 2) + if len(parts) < 2: + return None + id = parts[0][1:] + desc = parts[1] + seq = '' + line = self.file.readline().strip() + while line and not line.startswith(">"): + seq += line + line = self.file.readline().strip() + return FastaRecord(id=id, description=desc, sequence=seq) + + def read_records(self): + records = [] + record = self.read_record() + while record: + records.append(record) + record = self.read_record() + return records \ No newline at end of file diff --git a/biopython_fastq_filter.py b/biopython_fastq_filter.py new file mode 100644 index 0000000..1f525b8 --- /dev/null +++ b/biopython_fastq_filter.py @@ -0,0 +1,195 @@ +from Bio import SeqIO +from Bio.SeqUtils import GC +import requests +from bs4 import BeautifulSoup +import time +import os +from dotenv import load_dotenv +from dataclasses import dataclass +from typing import List + +load_dotenv() + + +def filter_fastq(input_path: str, quality_threshold: int, output_filename="final_filtered.fastq", gc_bounds=(40, 60), length_bounds=(30, 350)): + filename = input_path + records = SeqIO.parse(filename, "fastq") + ###quality filter + good_reads = (rec for rec in records if min(rec.letter_annotations["phred_quality"]) >= quality_threshold) + result_quality = SeqIO.write(good_reads, "good_quality.fastq", "fastq") + result_quality_GC = SeqIO.parse("good_quality.fastq", "fastq") + ###GC content filter + min_gc_content = gc_bounds[0] + max_gc_content = gc_bounds[1] + GC_quality_filt = [] + + for sequence in result_quality_GC: + if min_gc_content <= GC(sequence.seq) <= max_gc_content: + GC_quality_filt.append(sequence) + + result_quality = SeqIO.write(GC_quality_filt, "good_quality_GC.fastq", "fastq") + result_quality_GC_length = SeqIO.parse("good_quality_GC.fastq", "fastq") + + ##length filter + filtered_GC_quality_length = [] + + for sequence in result_quality_GC_length: + if len(sequence.seq) >= length_bounds[0] and len(sequence.seq) <= length_bounds[1]: + filtered_GC_quality_length.append(sequence) + + result_quality = SeqIO.write(filtered_GC_quality_length, output_filename, "fastq") + + return result_quality + + +from abc import ABC, abstractmethod + +class InvalidInputError(ValueError): + pass + +class BiologicalSequence(ABC, str): + @abstractmethod + def __init__(self, seq): + self.seq = seq + + def __len__(self): + return len(self.seq) + + def __getitem__(self, index): + return self.seq[int(index)] + + def __repr__(self): + return __str__(self.seq) + + def check_nucleic_acid(self): + unique_chars = set(self.seq) + nucleotides_dna = set('ATGCatgc') + nucleotides_rna = set('AUGCaugc') + if unique_chars <= nucleotides_dna: + seq = 'dna' + elif unique_chars <= nucleotides_rna: + seq = 'rna' + else: + raise InvalidInputError() + return seq_type + +class NucleicAcidSequence(BiologicalSequence): + complement_dict = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', 'a': 't', 't': 'a', 'g': 'c', 'c': 'g'} + def __init__(self, seq): + super().__init__(seq) + self.check_nucleic_acid() + self.length = len(self.seq) + + def complement(self): + list_input = list(self.seq) + for i in range(len(self.seq)): + if list_input[i] in self.complement_dict: + list_input[i] = self.complement_dict[list_input[i]] + return "".join(list_input) + +class DNASequence(NucleicAcidSequence): + complement_dict = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', 'a': 't', 't': 'a', 'g': 'c', 'c': 'g'} + def __init__(self, seq): + super().__init__(seq) + self.complement() + + def transcribe(self): + list_input = list(self.seq) + for i in range(len(self.seq)): + if (list_input[i] == 'T'): + list_input[i] = 'U' + elif (list_input[i] == 't'): + list_input[i]='u' + return "".join(list_input) + +class RNASequence(NucleicAcidSequence): + complement_dict = {'A': 'U', 'U': 'A', 'G': 'C', 'C': 'G', 'a': 'u', 'u': 'a', 'g': 'c', 'c': 'g'} + def __init__(self, seq): + super().__init__(seq) + self.complement() + +class AminoAcidSequence(BiologicalSequence): + def __init__(self, seq): + self.seq = seq + + def amino_acid_frequency(self): + """Calculates molecular weight of a protein + Arguments: + - seq (str) 1-letter coded protein sequence + Return: + - int, molecular weight (g/mol) rounded to integer""" + unique_aa = set(self.seq) + freq_dict = {} + amino_acids = set('GAVLITSMCPFYWHKRDENQgavlitsmcpfywhkrdenq') + if unique_aa <= amino_acids: + seq = 'peptide' + else: + raise InvalidInputError() + for letter in self.seq: + if letter in freq_dict: + freq_dict[letter] += 1 + else: + freq_dict[letter] = 1 + for letter in freq_dict: + freq_dict[letter] = round(freq_dict[letter] / len(self.seq) * 100, 2) + return freq_dict + +token = os.environ.get('TG_API_TOKEN') + +def send_telegram_message(chat_id: str, message:str): + """this function uses the bot token and a message generated in telegram_logger function + and sends this message through the telegram bot""" + url = f"https://api.telegram.org/bot{token}/sendMessage" + data = {"chat_id": chat_id, "text": message, "parse_mode": "Markdown"} + response = requests.post(url, data=data) + return response.json() + +def telegram_logger(chat_id: str): + """this function is a decorator that times a function and generates a message + regarding its execution result""" + def decorator(func): + def inner_func(*args, **kwargs): + start = time.time() + try: + result = func(*args, **kwargs) + end = time.time() + execution_time = end - start + if execution_time < 86400: + time_str = time.strftime('%H:%M:%S.%f', time.gmtime(execution_time))[:-3] + message = f"🎉Function `{func.__name__}` has finished in `{time_str}` " + else: + days = int(execution_time // 86400) + time_str = str(timedelta(seconds=execution_time)) + message = f"Function `{func.__name__}` has finished in `{days}` days, `{str(timedelta(seconds=execution_time))}` " + send_telegram_message(chat_id, message) + return result + except Exception as error: + message = f"😞Function `{func.__name__}` failed with an exception:\nType: `{type(error).__name__}`\nError: `{str(error)}` " + send_telegram_message(chat_id, message) + return inner_func + return decorator + +@dataclass +class GenscanOutput: + status: str + cds_list: List[str] + intron_list: List[dict] + exon_list: List[dict] + +def run_genscan(sequence=None, sequence_file=None, organism="Vertebrate", exon_cutoff=1.00, sequence_name=""): + url = "http://argonaute.mit.edu/cgi-bin/genscanw_py.cgi" + + if sequence_file: + with open(sequence_file, 'rb') as file: + sequence = file.read().strip() + data = { + "-o": organism, + "-e": exon_cutoff, + "-n": sequence_name, + "-p": "Predicted peptides only", + "-u": sequence_file, + "-s": sequence + } + response = requests.post(url, data=data) + status = response.status_code + return response.content \ No newline at end of file diff --git a/custom_random_forest.py b/custom_random_forest.py new file mode 100644 index 0000000..3d27abd --- /dev/null +++ b/custom_random_forest.py @@ -0,0 +1,78 @@ +import multiprocessing +import random +import numpy as np +from sklearn.base import BaseEstimator +import time +from sklearn.datasets import make_classification +from sklearn.ensemble import (RandomForestClassifier, + ExtraTreesClassifier, + VotingClassifier) +from sklearn.tree import (DecisionTreeRegressor, + DecisionTreeClassifier) + +SEED = 111 +random.seed(SEED) +np.random.seed(SEED) + +import multiprocessing +from sklearn.base import BaseEstimator + +class RandomForestClassifierCustom(BaseEstimator): + def __init__(self, n_estimators=10, max_depth=None, max_features=None, random_state=SEED): + self.n_estimators = n_estimators + self.max_depth = max_depth + self.max_features = max_features + self.random_state = random_state + self.trees = [] + self.feat_ids_by_tree = [] + + def fit(self, X, y, n_processes=1): + self.classes_ = sorted(np.unique(y)) + def fit_tree_process(i, queue): + np.random.seed(self.random_state + i) + feat_ids = np.random.choice(range(X.shape[1]), size=self.max_features, replace=False) + pseudo_ids = np.random.choice(range(X.shape[0]), size=X.shape[0], replace=True) + pseudo_X = X[pseudo_ids, :][:, feat_ids] + pseudo_y = y[pseudo_ids] + dt_clf = DecisionTreeClassifier(max_depth=self.max_depth, + max_features=self.max_features, + random_state=self.random_state + i) + dt_clf.fit(pseudo_X, pseudo_y) + queue.put((dt_clf, feat_ids)) + queue = multiprocessing.Queue() + processes = [] + for i in range(self.n_estimators): + p = multiprocessing.Process(target=fit_tree_process, args=(i, queue)) + processes.append(p) + p.start() + results = [] + for _ in range(self.n_estimators): + results.append(queue.get()) + for p in processes: + p.join() + self.trees, self.feat_ids_by_tree = zip(*results) + return self + + def predict_proba(self, X, n_processes=1): + def predict_proba_process(tree, feat_ids, queue): + proba = tree.predict_proba(X[:, feat_ids]) + queue.put(proba) + queue = multiprocessing.Queue() + processes = [] + for tree, feat_ids in zip(self.trees, self.feat_ids_by_tree): + p = multiprocessing.Process(target=predict_proba_process, args=(tree, feat_ids, queue)) + processes.append(p) + p.start() + probas = [] + for _ in range(self.n_estimators): + probas.append(queue.get()) + + for p in processes: + p.join() + + return sum(probas) / self.n_estimators + + def predict(self, X, n_processes=1): + probas = self.predict_proba(X, n_processes=n_processes) + return np.argmax(probas, axis=1) + \ No newline at end of file diff --git a/data/example_fasta.fasta b/data/example_fasta.fasta new file mode 100644 index 0000000..ae5498d --- /dev/null +++ b/data/example_fasta.fasta @@ -0,0 +1,18 @@ +>GTD323452 5S_rRNA NODE_272_length_223_cov_0.720238:18-129(+) +ACGGCCATAGGACTTTGAAAGCACCGCATCCCGTCCGATCTGCGAAGTTAACCAAGATGCCGCCTGGTTAGTACCATGGTGGGGGACCACATGGGAATCCCT +GGTGCTGTG +>GTD678345 16S_rRNA NODE_80_length_720_cov_1.094737:313-719(+) +TTGGCTTCTTAGAGGGACTTTTGATGTTTAATCAAAGGAAGTTTGAGGCAATAACAGGTCTGTGATGCCCTTAGATGTTCTGGGCCGCACGCGCGCTACACT +GAGCCCTTGGGAGTGGTCCATTTGAGCCGGCAACGGCACGTTTGGACTGCAAACTTGGGCAAACTTGGTCATTTAGAGGAAGTAAAAGTCGTAACAAGGT +>GTD174893 16S_rRNA NODE_1_length_2558431_cov_75.185164:2153860-2155398(+) +TTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAACGGTAACAGGAAACAGCTTGCTGTTTCGCTGACGAGTGG +GAAGTAGGTAGCTTAACCTTCGGGAGGGCGCTTACCACTTTGTGATTCATGACTGGGGTGAAGTCGTAACAAGGTAACCGTAGGGGAACCTGCGGTTGGATC +ACCTCCTT +>GTD906783 16S_rRNA NODE_1_length_2558431_cov_75.185164:793941-795479(-) +TTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAACGGTAACAGGAAACAGCTTGCTGTTTCGCTGACGAGTGG +GAAGTAGGTAGCTTAACCTTCGGGAGGGCGCTTACCACTTTGTGATTCATGACTGGGGTGAAGTCGTAACAAGGTAACCGTAGGGGAACCTGCGGTTGGATC +ACCTCCTT +>GTD129563 16S_rRNA NODE_4_length_428221_cov_75.638017:281055-282593(-) +CGGACGGGTGAGTAATGTCTGGGAAACTGCCTGATGGAGGGGGATAACTACTGGAAACGGTAGCTAATACCGCATAACGTCGCAAGACCAAAGAGGGGGACC +GAAGTAGGTAGCTTAACCTTCGGGAGGGCGCTTACCACTTTGTGATTCATGACTGGGGTGAAGTCGTAACAAGGTAACCGTAGGGGAACCTGCGGTTGGATC +ACCTCCTT \ No newline at end of file diff --git a/data/example_fasta.fasta:Zone.Identifier b/data/example_fasta.fasta:Zone.Identifier new file mode 100644 index 0000000..1bf0b28 --- /dev/null +++ b/data/example_fasta.fasta:Zone.Identifier @@ -0,0 +1,3 @@ +[ZoneTransfer] +ZoneId=3 +HostUrl=https://github.com/ diff --git a/data/example_fastq.fastq b/data/example_fastq.fastq new file mode 100644 index 0000000..883b51f --- /dev/null +++ b/data/example_fastq.fastq @@ -0,0 +1,356 @@ +@SRX079804:1:SRR292678:1:1101:21885:21885 1:N:0:1 BH:ok +ACAGCAACATAAACATGATGGGATGGCGTAAGCCCCCGAGATATCAGTTTACCCAGGATAAGAGATTAAATTATGAGCAACATTATTAA ++SRX079804:1:SRR292678:1:1101:21885:21885 1:N:0:1 BH:ok +FGGGFGGGFGGGFGDFGCEBB@CCDFDDFFFFBFFGFGEFDFFFF;D@DD>C@DDGGGDFGDGG?GFGFEGFGGEF@FDGGGFGFBGGD +@SRX079804:1:SRR292678:1:1101:24563:24563 1:N:0:1 BH:failed +ATTAGCGAGGAGGAGTGCTGAGAAGATGTCGCCTACGCCGTTGAAATTCCCTTCAATCAGGGGGTACTGGAGGATACGAGTTTGTGTG ++SRX079804:1:SRR292678:1:1101:24563:24563 1:N:0:1 BH:failed +BFFFFFFFB@B@A<@D>BDDACDDDEBEDEFFFBFFFEFFDFFF=CC@DDFD8FFFFFFF8/+.2,@7<<:?B/:<><-><@.A*C>D +@SRX079804:1:SRR292678:1:1101:30161:30161 1:N:0:1 BH:failed +GAACGACAGCAGCTCCTGCATAACCGCGTCCTTCTTCTTTAGCGTTGTGCAAAGCATGTTTTGTATTACGGGCATCTCGAGCGAATC ++SRX079804:1:SRR292678:1:1101:30161:30161 1:N:0:1 BH:failed +DFFFEGDGGGGFGGEDCCDCEFFFFCCCCCB>CEBFGFBGGG?DE=:6@=>AD?D8DCEE:>EEABE5D@5:DDCA;EEE-DCD +@SRX079804:1:SRR292678:1:1101:47176:47176 1:N:0:1 BH:failed +TGAAGCGTCGATAGAAGTTAGCAAACCCGCGGAACTTCCGTACATCAGACACATTCCGGGGGGTGGGCCAATCCATGATGCCTTTG ++SRX079804:1:SRR292678:1:1101:47176:47176 1:N:0:1 BH:failed +FF@FFBEEEEFFEFFD@EDEFFB=DFEEFFFE8FFE8EEDBFDFEEBE+E46.'8.5::EE:?E>A6@?)>;>9D<C9DEBAAB=5C?<@0=A?D@BDB;:BA?BDDFH?B@DCB6BEBDA??AA9. +@SRX079804:1:SRR292678:1:1101:52180:52180 1:N:0:1 BH:failed +TTTTTGGTTTTAGGTCTAACATGTAAGTCTTTAATCTATTTTGAATTAATTTTTGGATGAGGTGTAAGGAAGGGATCCAGTTTC ++SRX079804:1:SRR292678:1:1101:52180:52180 1:N:0:1 BH:failed +FEFFFFFF=FEEFDFD>EE:?<5@BFAFCFEE7>C>:/6:2<344DA:6DDCDC>)34773DD?@DBA9B +@SRX079804:1:SRR292678:1:1101:105156:105156 1:N:0:1 BH:ok +ACTGCTGAGCTTAAATGGCGGCAGTCTGACGGTTACCAACGGGGGCACTTCAACCGGTTCGTTAACGGGGAGCGGAGAGCTGA ++SRX079804:1:SRR292678:1:1101:105156:105156 1:N:0:1 BH:ok +GFFEGGFGGGGEGGGGGGGGGFDD=DDE7EDD6CD?FEDEE@EBEFEE.DD5DDD@B<7>/0543C?BEE?@@BE@; +@SRX079804:1:SRR292678:1:1101:135168:135168 1:N:0:1 BH:ok +CCTTCCTAAAAATTAAGAATCTTAACAATTAGCAGCACAACCAAAATTATTACCGAAAGGACTTACTCCTCCGCCAAATCCA ++SRX079804:1:SRR292678:1:1101:135168:135168 1:N:0:1 BH:ok +GGGEGGGEGGBFFFFFGCFFEGG6AEEEDEEGGFGGCG4EGFFGDFFBGEGFGGFECBDEGGEFGBFF?CGFBFGGGGFGFF +@SRX079804:1:SRR292678:1:1101:149302:149302 1:N:0:1 BH:changed:1 +TAGGGTTGTATTTGCAGATCCATGGCATGCCAAAAAGAACATCGTCCCGTCCAATATCTGCAACATACCAGTTGGTTGGTA ++SRX079804:1:SRR292678:1:1101:149302:149302 1:N:0:1 BH:changed:1 +@;CBA=:@;@DBDCDEEE/EEEEEEF@>FBEEB=EFA>EEBD=DAEEEEB9)99>B99BC)@,@<9CDD=C,5;B::?@;A +@SRX079804:1:SRR292678:1:1101:170868:170868 2:N:0:1 BH:failed +CTGCCGAGACTGTTCTCAGACATGGAAAGCTCGATTCGCATACACTCGCTGAGTAAGAGAGTCACACCAAATCACAGATT ++SRX079804:1:SRR292678:1:1101:170868:170868 2:N:0:1 BH:failed +E;FFFEGFGIGGFBG;C6D<@C7CDGFEFGFHDFEHHHBBHHFDFEFBAEEEEDE@A2=DA:??C3:@>EEBEEHEFEHHFFHH?FGBGFBBD77B;;C?FFFFGGFED.BBABBG@DBBE +@SRX079804:1:SRR292678:1:1101:190845:190845 1:N:0:1 BH:changed:1 +CCTCAGCGTGGATTGCCGCTCATGCAGGAGCAGATAATCCCTTCGCCATCCCATTAAGCGCCGTTGTCGGTATTCC ++SRX079804:1:SRR292678:1:1101:190845:190845 1:N:0:1 BH:changed:1 +FF@FFCFEECEBEC@@BBBBDFBBFFDFFEFFEB8FFFFFFFFEFCEB/>BBA@AFFFEEEEECE;ACD@DBBEEE +@SRX079804:1:SRR292678:1:1101:198993:198993 2:N:0:1 BH:failed +AGTTATTTATGCATCATTCTCATGTATGAGCCAACAAGATAGTACAAGTTTTATTGCTATGAGTTCAGTACAACA ++SRX079804:1:SRR292678:1:1101:198993:198993 2:N:0:1 BH:failed +<<<=;@B??@<>@><48876EADEG6B.BB@.?+98204<:<>@?A=@EFEFFFEEFB +@SRX079804:1:SRR292678:1:1101:204480:204480 1:N:0:1 BH:failed +AGTGAGACACCCCTGAACATTCCTAGTAAGACATCTTTGAATATTACTAGTTAGCCACACTTTAAAATGACCCG ++SRX079804:1:SRR292678:1:1101:204480:204480 1:N:0:1 BH:failed +<98;<@@@:@CD@BCCDD=DBBCEBBAAA@9???@BCDBCGF=GEGDFGDBEEEEEFFFF=EDEE=DCD@@BBC +@SRX079804:1:SRR292678:1:1101:212327:212327 2:N:0:1 BH:failed +TTACCTCTGCTTTTTCGCCTGTTACTTCTACTAATCCTTCATCTATTGCGAATGGCCCTACTACTGACGAAAT ++SRX079804:1:SRR292678:1:1101:212327:212327 2:N:0:1 BH:failed +DBCCC@@;A@BDCCCE>BBED>GDCDBFBFFEBEECFGGD@@BCB<<8@;09746:@@>@EEECEEDE/FEED +@SRX079804:1:SRR292678:1:1101:230386:230386 1:N:0:1 BH:failed +TTCTGATTGGAGTGAGAGTGCCATTTGTTTCGCTGATTGGACGTTGGAAAGCGCCTTGACCTTTGACAGCAG ++SRX079804:1:SRR292678:1:1101:230386:230386 1:N:0:1 BH:failed +=BDD=DCDBBCDADD@@B;B@CC7C@B@>=BACDD,=??8DDCDD7CCCCDBDEDBDDDDDEEBDDDBD?DB +@SRX079804:1:SRR292678:1:1101:236093:236093 1:N:0:1 BH:failed +CATCTAACTCTCATTCTAGATTCTTAAGTTGGCTACACTTTGCCGTCATTCTCGGTGGATTAGCTATTGGG ++SRX079804:1:SRR292678:1:1101:236093:236093 1:N:0:1 BH:failed +A:@A@;@BB@GGFGG@A@@817729B +@SRX079804:1:SRR292678:1:1101:251912:251912 2:N:0:1 BH:ok +TGTAAATGGAAATGAACCTAATATGTATGCACAAACTATTAAAGCATATCTTGCAAAAGGAGCAATGGCG ++SRX079804:1:SRR292678:1:1101:251912:251912 2:N:0:1 BH:ok +GHFDHHHGHHHFFHHGHHHGGGGGGDEFFEGGGGEFHHFHGFGHHHHFHHDDD6@=DCACFFGGEFBBG +@SRX079804:1:SRR292678:1:1101:278698:278698 1:N:0:1 BH:ok +CTAATAATGGTAATTGAACCATAGAAGATAAGTTCATAATGTAATAAATACATCCATAGAGTTATTAA ++SRX079804:1:SRR292678:1:1101:278698:278698 1:N:0:1 BH:ok +DDBDBCCCDD@FFFB9<<<@DA=DA@B:@=@@AC@GGFCGECFFDGGCGFFGGFFCEBF9>?@>BDFF +@SRX079804:1:SRR292678:1:1101:295878:295878 1:N:0:1 BH:changed:1 +AATGCAAACAGGATGATATTTGAATCCGTAATACTGTTCTTTCATCATAAATAATTTATGCAGATAC ++SRX079804:1:SRR292678:1:1101:295878:295878 1:N:0:1 BH:changed:1 +HHHHHEGFHDGFEGBCBEEEGGGG@EDGCGBBBEGF4?EFDBDDBFE8DEE-E?EE;B@EFC=;FDE +@SRX079804:1:SRR292678:1:1101:306575:306575 1:N:0:1 BH:failed +ATCTTTGAACTTTGTCTCACTTTCCCCCATCTCACCATTCCTCCTGTTCTGTGAACCCCAGTTTCA ++SRX079804:1:SRR292678:1:1101:306575:306575 1:N:0:1 BH:failed +E::EA@E<6B8>97:<6084649?@:?EDED=BEBEGGFEDGECECBDEFDDEEGGEDDFFF +@SRX079804:1:SRR292678:1:1101:403661:403661 1:N:0:1 BH:failed +GATGGCTTTGCTTTCTCATTCTCCTCTCCATCGTTCCCATCTTCGCCCTCAGACGCTGATTGAT ++SRX079804:1:SRR292678:1:1101:403661:403661 1:N:0:1 BH:failed +EG=DFFDFFDDGDGGGGGFFGGGGBADADCEEE5EC>CCCE6BEEEGGGGCBEGGE@9BCEF;>>D@D +@SRX079804:1:SRR292678:1:1101:425870:425870 1:N:0:1 BH:failed +GTACAGCTCTCCTCGTTACCAGCACATCTTGGACACCCGACGAAGACTTTGACATGCTCCTC ++SRX079804:1:SRR292678:1:1101:425870:425870 1:N:0:1 BH:failed +E@EED@EF=D>=EED@D@7DBF +@SRX079804:1:SRR292678:1:1101:429745:429745 1:N:0:1 BH:failed +GGCGATTGTGAAGGCATAAGAGTGGGACATAGTTCAAGTCCAGAACGAATTAAACGCACAA ++SRX079804:1:SRR292678:1:1101:429745:429745 1:N:0:1 BH:failed +BB2B=A@A:BFBFFFFF +@SRX079804:1:SRR292678:1:1101:475293:475293 2:N:0:1 BH:ok +TACATTTGATTTCTTTATAAGATTTCTTACTGTAAAATCATCGCTATTTAACAGCTTATT ++SRX079804:1:SRR292678:1:1101:475293:475293 2:N:0:1 BH:ok +FHFFFFDC@FGFEDGE?EEDC6EEEDEF?EEEE8EHHGGHFGFFEGGGGBFBDBDEBCBCA +@SRX079804:1:SRR292678:1:1101:511594:511594 1:N:0:1 BH:ok +AGCACAATCAGATTCGCTTATGACGGCGATGAAGAAATTGCGATGAAATGTGAGGTGA ++SRX079804:1:SRR292678:1:1101:511594:511594 1:N:0:1 BH:ok +HCHFFHHHGFHBFFFEGFFEFHFEHGBGECHEHB?CDDEFCDBFF9DFCD.BC +@SRX079804:1:SRR292678:1:1101:527839:527839 2:N:0:1 BH:changed:4 +GACATTTCTTTCTGGGAAGGCTTATTAAACGATAAAGATAATGATATACGTTTTGAT ++SRX079804:1:SRR292678:1:1101:527839:527839 2:N:0:1 BH:changed:4 +>35/-;,><04%'A.?4?:>BE.DC-@???CE:@EDFDBDG7B;=<)?1.@?2A5<5 +@SRX079804:1:SRR292678:1:1101:547309:547309 1:N:0:1 BH:failed +AGGTCTGCACTGGCTCTTCAGAGCGCAAGCGAGGGAAGAGAAATATAGTGACGCAC ++SRX079804:1:SRR292678:1:1101:547309:547309 1:N:0:1 BH:failed +GEGFGFD=FDB8B7DDFFFF@/DC@+2:66>-@>9);<2: +@SRX079804:1:SRR292678:1:1101:570367:570367 1:N:0:1 BH:failed +CTATGGCCACTTTTGTATTCTCGATTGAGGTTATTCGCTCACCCATCTTTTCCAA ++SRX079804:1:SRR292678:1:1101:570367:570367 1:N:0:1 BH:failed +FFEE;FFFFBFFFFF<9BEDEBEACDD3DD0B5>>0?:@>FFBEEBEFFFFGCCBCDGIGDGHEFGG=GGGGHFHEHF@FEFE?CGEEEFADEAEC +@SRX079804:1:SRR292678:1:1101:590521:590521 1:N:0:1 BH:failed +CCTCTTCTCTTTCGCGGTCATCTTGGGTTTCGCGCCTTTCTTCTTGACGACAC ++SRX079804:1:SRR292678:1:1101:590521:590521 1:N:0:1 BH:failed +EGFDD@FDF=FDFFEEGGBEFDGF=FGAGEEEEEDFBGGEE8EEE@>5;>90< +@SRX079804:1:SRR292678:1:1101:601307:601307 1:N:0:1 BH:changed:1 +TTGGCGTGCTGATGATTATCGGTATCTTCAAAGGCGCGCAGCCTGCGGGCTG ++SRX079804:1:SRR292678:1:1101:601307:601307 1:N:0:1 BH:changed:1 +GGGEGFGGEGE:EE>GFFGGGGDCGEBFFF>G=EBFFEC?DFGAD?DDECBE +@SRX079804:1:SRR292678:1:1101:631057:631057 1:N:0:1 BH:failed +CACCATTATCCTATTTCTGAACACATTTGACAGTCACGGCACTAGCATTGG ++SRX079804:1:SRR292678:1:1101:631057:631057 1:N:0:1 BH:failed +GGGGGGGGBFGGGEGFGGGGGFGEFFDFFFF?EBFEEDBFGEE@BE;E?E7 +@SRX079804:1:SRR292678:1:1101:654270:654270 1:N:0:1 BH:failed +ATCTTTCTCTCTCTGCTGCATTCTCCGCTTCAGCTCCTCAATTTCAATCA ++SRX079804:1:SRR292678:1:1101:654270:654270 1:N:0:1 BH:failed +GGGGGGEDGGFEGDGCGGBCEFGGGGFBFGGEGGGGGEGFCFEEGGFEDB +@SRX079804:1:SRR292678:1:1101:667761:667761 1:N:0:1 BH:failed +CAGCCTTTTGAGGTCGTCTATCGCAGCGTGTCCGCGACGTTTTGTTGCG ++SRX079804:1:SRR292678:1:1101:667761:667761 1:N:0:1 BH:failed +GGFGGG=GGGG@GFGGG@GGEGFGGGEGGGFGGEG@EEEDE8EE=E=DE +@SRX079804:1:SRR292678:1:1101:671526:671526 1:N:0:1 BH:ok +CAAGTAGAAGGTTAGCGCCTCTCTGTAAAAGGAGTCAAGCGCTATGTC ++SRX079804:1:SRR292678:1:1101:671526:671526 1:N:0:1 BH:ok +HHHHHGHEHHDGGGCGGGGHFGGGGEDGGGGGBEBEBEEEFGHFHFHH +@SRX079804:1:SRR292678:1:1101:685633:685633 2:N:0:1 BH:failed +GCTGTAATGGATCCACTAATTGGGACAGTGGTGGATAAAACGAATAC ++SRX079804:1:SRR292678:1:1101:685633:685633 2:N:0:1 BH:failed +DADADCGFEFEEEGEGGEFDEEEEBDAC;C10<9?AGBGGGFF7DFB +@SRX079804:1:SRR292678:1:1101:769626:769626 1:N:0:1 BH:failed +GTATTCCTTGACAGTCGAAAGAATCACTGCTAACCCAGGC ++SRX079804:1:SRR292678:1:1101:769626:769626 1:N:0:1 BH:failed +C>5BBCCCD=ACDD@A7@@B@A?B?=8B??EDEEDEBDEE +@SRX079804:1:SRR292678:1:1101:776222:776222 1:N:0:1 BH:failed +AGAACGTACACCCTACGCTAAGCAGTGGCTCCATGCCAA ++SRX079804:1:SRR292678:1:1101:776222:776222 1:N:0:1 BH:failed +BGDGGGGFEGGGGEGGGGFGFF6=FCFAFFEE,=C?EDD +@SRX079804:1:SRR292678:1:1101:782183:782183 2:N:0:1 BH:failed +TACGCGTAACGACGTCATAGCCATGACGCTTCAATAAA ++SRX079804:1:SRR292678:1:1101:782183:782183 2:N:0:1 BH:failed +D +@SRX079804:1:SRR292678:1:1101:828383:828383 1:N:0:1 BH:failed +AAACCACATATGACATGAGTGACGGGACTAAAGTTC ++SRX079804:1:SRR292678:1:1101:828383:828383 1:N:0:1 BH:failed +FFEEFEEB=E,C>CDEEEECEBEEEC?F;BDDDDAE +@SRX079804:1:SRR292678:1:1101:829239:829239 1:N:0:1 BH:ok +TCGATCCTTCTGCCTCAAAGTATACTAGGACGCAT ++SRX079804:1:SRR292678:1:1101:829239:829239 1:N:0:1 BH:ok +GGGDFGGBGFFEBFEDCBCDCGGGGBEEE=GE?EE +@SRX079804:1:SRR292678:1:1101:868419:868419 1:N:0:1 BH:ok +ATTCGTCAGGCCCAATAACATCATGAATTTCCAG ++SRX079804:1:SRR292678:1:1101:868419:868419 1:N:0:1 BH:ok +DEEEEEEEBDFFFFFFFF8FEED8@FFFBFFEFF +@SRX079804:1:SRR292678:1:1101:892716:892716 2:N:0:1 BH:ok +AATTTACCTAATGGAATCAATGAGGCTACTCCA ++SRX079804:1:SRR292678:1:1101:892716:892716 2:N:0:1 BH:ok +@BCBBBCCBCCCCCCCDDAAFFGEEEDBF@EE@ +@SRX079804:1:SRR292678:1:1101:893159:893159 2:N:0:1 BH:ok +TTCTCTGCTTTTCATATCTTGTCATAAAAATT ++SRX079804:1:SRR292678:1:1101:893159:893159 2:N:0:1 BH:ok +CBDCDEEEEEBEEEEGDDFDEEEEDGFFEGFE +@SRX079804:1:SRR292678:1:1101:918742:918742 1:N:0:1 BH:failed +CTCTCCATGCACAAAGAATATCACAGCCAAA ++SRX079804:1:SRR292678:1:1101:918742:918742 1:N:0:1 BH:failed +EEEBA?@;B@EEE@BEE=?EDDDDADCDA?E +@SRX079804:1:SRR292678:1:1101:923787:923787 2:N:0:1 BH:ok +TTGTGAAGGATGGGATATTAGTGTAGATGA ++SRX079804:1:SRR292678:1:1101:923787:923787 2:N:0:1 BH:ok +EEBBEGEEE=BBB<@DCDCGD@D>=DEGEE +@SRX079804:1:SRR292678:1:1101:933189:933189 1:N:0:1 BH:failed +GTCTGCACTATCGAGGGCTGTGCCTTTGC ++SRX079804:1:SRR292678:1:1101:933189:933189 1:N:0:1 BH:failed +FEFFDBFF8FE>?DFFFCEBCEEBBEDE6 +@SRX079804:1:SRR292678:1:1101:937136:937136 1:N:0:1 BH:failed +TTTCTTTGGCTTAAAGATAGTTTTAGTC ++SRX079804:1:SRR292678:1:1101:937136:937136 1:N:0:1 BH:failed +EFFFEEEEFCBCDDDDE@/E?@@7@@3< +@SRX079804:1:SRR292678:1:1101:940351:940351 1:N:0:1 BH:changed:1 +TGCCGTGGGAATGACAAACAAGCATCC ++SRX079804:1:SRR292678:1:1101:940351:940351 1:N:0:1 BH:changed:1 +DECC@GFFBF=EBEAFDFGD?FFF8FF +@SRX079804:1:SRR292678:1:1101:940693:940693 1:N:0:1 BH:failed +CACATTATGAACTATGGGCACTGCAT ++SRX079804:1:SRR292678:1:1101:940693:940693 1:N:0:1 BH:failed +EEEGFDEDFEGGGGGFEGBGGGFGGG +@SRX079804:1:SRR292678:1:1101:955819:955819 1:N:0:1 BH:failed +CACCTAGCAGCAACGGACGAGTCAG ++SRX079804:1:SRR292678:1:1101:955819:955819 1:N:0:1 BH:failed +GGGGGEEEGGEGGGFGEGG;F@EFF +@SRX079804:1:SRR292678:1:1101:958051:958051 2:N:0:1 BH:ok +TTAATATTTCCATCTGAACTTCGC ++SRX079804:1:SRR292678:1:1101:958051:958051 2:N:0:1 BH:ok +EDDBGFEGFGHHFHGGEDEGBGDB +@SRX079804:1:SRR292678:1:1101:996098:996098 1:N:0:1 BH:failed +CTAAGAGAGTTTGTAATGCGGAC ++SRX079804:1:SRR292678:1:1101:996098:996098 1:N:0:1 BH:failed +DD=DBDBDC4EFFFD@?CD@ACD +@SRX079804:1:SRR292678:1:1101:1020278:1020278 2:N:0:1 BH:ok +AAAGTGCAGAACATGCAGATAT ++SRX079804:1:SRR292678:1:1101:1020278:1020278 2:N:0:1 BH:ok +D>AC?GDDCD?DDADE@GABDG +@SRX079804:1:SRR292678:1:1101:1022234:1022234 1:N:0:1 BH:ok +GACCTTTCCGCAAGCTGTCGC ++SRX079804:1:SRR292678:1:1101:1022234:1022234 1:N:0:1 BH:ok +HHHHFEHHHHGGHHHGDHEEG +@SRX079804:1:SRR292678:1:1101:1024144:1024144 1:N:0:1 BH:failed +TCCATTATGAAAGAAGAAAA ++SRX079804:1:SRR292678:1:1101:1024144:1024144 1:N:0:1 BH:failed +@A><96:6: +@SRX079804:1:SRR292678:1:1101:1175112:1175112 1:N:0:1 BH:failed +AGGCC ++SRX079804:1:SRR292678:1:1101:1175112:1175112 1:N:0:1 BH:failed +EC8EE +@SRX079804:1:SRR292678:1:1101:1182927:1182927 1:N:0:1 BH:changed:1 +TGAA ++SRX079804:1:SRR292678:1:1101:1182927:1182927 1:N:0:1 BH:changed:1 +GGBH +@SRX079804:1:SRR292678:1:1101:1243474:1243474 1:N:0:1 BH:ok +ACG ++SRX079804:1:SRR292678:1:1101:1243474:1243474 1:N:0:1 BH:ok +EEE +@SRX079804:1:SRR292678:1:1101:1266246:1266246 1:N:0:1 BH:failed +AA ++SRX079804:1:SRR292678:1:1101:1266246:1266246 1:N:0:1 BH:failed +C@ +@SRX079804:1:SRR292678:1:1101:1269735:1269735 1:N:0:1 BH:failed +C ++SRX079804:1:SRR292678:1:1101:1269735:1269735 1:N:0:1 BH:failed +G diff --git a/data/example_fastq.fastq:Zone.Identifier b/data/example_fastq.fastq:Zone.Identifier new file mode 100644 index 0000000..1bf0b28 --- /dev/null +++ b/data/example_fastq.fastq:Zone.Identifier @@ -0,0 +1,3 @@ +[ZoneTransfer] +ZoneId=3 +HostUrl=https://github.com/ diff --git a/data/sequence.fasta b/data/sequence.fasta new file mode 100644 index 0000000..434d31e --- /dev/null +++ b/data/sequence.fasta @@ -0,0 +1,268 @@ +>NC_000017.11:c74712923-74694317 Homo sapiens chromosome 17, GRCh38.p14 Primary Assembly +AGTTTGTTCCTGCTGCCAGGCTCCACTGAGGGGAACGGGGACCTGTCTGAAGAGAAGATGCCCCTGCTGA +CACTCTACCTGCTCCTCTTCTGGCTCTCAGGTGAGCGGGCCTGGGTCTGTCTTCTTGGGGAAGCTTAGCA +AGCAGGAGGAGGTGGCTGAGAGAGGGAAGAAGGGACCCGGCCAGAAGGTGTCCTTAATGGCATGAGCCTT +GTGCGTTTCATCCACATACCCACCATTCACCTAGCATTTCCGTTGCCAAGGCAGGAATACAGGGCATGAC +CTTCATAATGAAGCCAGTGCAAGGCAGAATTCCTGACCCCCAGGGGCTGATCATGTGAAATGGACAGACT +GTAGGCCCCGCCGGCCAGGGAAAAGCAGAGGAGGAACATGGGCTGCTCAGGAGAGCAGGCGACTGCCTGA +TCCTGGACTGGGTGATCTGTCACCTGCGTGGAGGTGTTCTGAGCAAGGGCAGGCGTCAGGGAAGAGGTGA +GGCTGGAGGACTTTGAAAGGAGGGAAGAGGGAGTTTGCTGGGCAGAGAGGGAGTGGTGGTTCTAGGCATT +TGGGGCAGAAGGAAATAGTAGAAGTTGCTGAGGGACGGAAAGGTCAAGAGGAGATAAGACTGGCAGGGAC +GTAGAGGAAAGGAGGGACTGAGATGTGGGCAGAGCCAGAGGGCAGGAGACACCTTAAGTTGGTGCATACA +ATTTTTATTAAAAATGAAGGGAGGCCAGGGAAAGAGGCTCATTCCTGTAATCCCAGCACTTTGGGAGGCT +GATTGCTTGAGCTCAGGGATTCGAGACCAGCTTGGGTAACATGTGGAAACCCTGTCTCTACAAAAAGTAC +AAAAATTAGCCAGGTATGGTGGCATGTGCCTATGGTCCCAGCTACTTGGGAGGCTGGAGGCTGAGGCATG +AGGATCACTTGAGCCCAGGAGTTGGAGGCTGCAGTGACCTGTAATCACACTACTGCCTTCCAGCATGGGT +GACAGAGCGAGACTCTGTTTCAAAAAAAAAAAAAAAAGAAAAAAGAAAAAAGAAAAAAAAAGAAAGAAAA +CAAAAAAATAATTTCAAGAAGTGAGAAGTGATAAGTACTGTGAGGAAAACAGAACAAGATGATATGTTAC +AGTGGGACCCAGGTAGGGGGATGCTGTAAGTCAGGTGGGCTGAAACAACCCCCCTGAAGAGGGTCCCAGA +TGCCTGTCCGAGCCGAGACCTAGGAAAAGGATCAGACACAGGATCGGGACCAGCTTTCCAGGCACAGGGA +ACAGTAGATACGAGGGCTCTTGTGTAGGAATGAGCTTGGACTGGGCACGTCTGAGGAAGAGGGAGGTCAG +CACGGCTTTGCCTCATGCTTCAGAAGATCCTGGTGGGACAGATTGGAGAAAAGGTCTTGGGGAAGAAAGT +GACTTGTCCAGGACTGCTAGGCTGGTACAGGAAGAGCAGGGAGTGGCTCATGGACACTTGGCCCCGAGTT +TCAGCATGTAAGTGGAGAAGGCAGACTTTGGGTGAGATGTTATGAAATGCATTGGCACAGTGAGACCAGG +ACAGGTATGGCTGCTTTTTTTCTGACACCTGGAAATGAGAGAATTTGGTAGTGAGCAGATGTGAGAGTCG +TGTTGGGAGTGGGAGTGGTTTGGGACCTCTGTCTGAAGTGGGCAGAAAAGCCAGCAGGCACAGCTATGGG +CGCCAACATGGAGCCCGGATACCACTGTGAACCTGCATGCATCAGCTCTTGCCAGCCTCTCCTGCAGAAC +TTCCACCCACAAGCAGCAAGTGTGCTTACCCTGTTTCCCTCTTCTAAATAATTTATCGTTTTAAAACACC +CAAGAAAAAAAATACCCATCCGGAAGTTTGTTTTCTGTGAGAAAGGAAGGTTTTTTATTTTTTTTGAGTT +GGGGTCTCAGTCTGTTGCCTAGGCTGGAGTGTGGTGGCATGATCATAGCTCACTGCAGCCTCAACTCCTG +GGCTCAAGTGATCCTCCTGCCTTGGCCGTCTGAGTTGCTGGGATTAAAAGCATGAGCCACCGTGCCTGGC +AAGAACAGATTTTTTGTTTTTTTTTTTGAGACAGAGTCTCGCTCTGTCACTCAGGCTGGAGTGCAGTGGC +GCGATCTTGGCTCACTGCAACCTCTGCCTCCCAGATTCAAGTGATTCTTCTGCCTCAGCCTCCCAAGTAG +CTGGGATTACAGGTGTGCCAGCATGCCTGGCTATTTTTTTTTTTTTTTCGTATTTTTAGTAGAGATGGGG +TTTTACCATGTTGGTTAGGCTGGTCTCGAACTCCTGACCTCAAGTGATCCACCCGCCTTGGCCTCCCAAA +GTGTTGGGATTACAGGCGTGAGCCACCGCGCCCAGCCTCACTGTGCTTTTTTTCCATAAGTCTTTATTGT +TTCTGTTAAAGTTATTCCTAGGAATTTAATACACTTTTATTGCTAATGTAAATGAGCACTTTTCTACTTT +ACTTTCCAACTATTTATTAGCAGTTTATAGGAAAGCCATTGATTTTTACATGTTTATTATTTAGCCACTC +TATTAAATTCAATTATGAATTAAAATACTTTTTCATTTAAAATTAAAAAATTTTCTCTAACTTTTTTCCC +TTTTGATAATTATACATTTTATGTTTTTCCTTATCTTCCTATATTGATTAGAATTTCCAGGCTGGGCACA +GTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCAAGGCAGGCGGATCACAAGGTCAGGAGATCGAG +ACCATCCTGGCTACGGTGAAACCCCATCTCTACCAAAAGTACAAAAAATTAGCTGGGCGTGGTGGTGGGC +GCCTGTAGTCCCAGCTACTCAGGAGGCTGAGGCAGGAGAATGGCGTGAACCCGGGAGGCGGAACTTGCAG +TGAGCCGAGATTGAGCCACTGCACTCCAGCCTGGGCGACAGAGCCTGACTCCATCTCAAAAAAACAAACA +AACAAAAAAGAACTTCCAGAATCATGTTATTATATATATTGGGAATTCATGTTTTGTTTATGATGTCAAT +AGGAATGCCTATAGTATTTCACTGTTAAGAATAATTTGGCTGGTGATGCATGATGGCTCAAGCCTGTAAT +CCCAGCACTTTGTGAGGCTAAGGCAGGCAGATCGGTTGAGTCCAGGAGTTTGAGACCAGCTTGGGCAACA +AAGTGAGACCTTGTCTCTCCAAAAAAAAAAATACTAGCCTGGTATGGTGGTGCATGCCTGTGGTCCTGTC +GTCCCAGCTACATAAGAGGTTACAGCAAAAGGATCACTTGAGCCCAGGACTTGAGGCTGCAGTGACCCAT +GTTCGTACCAGTGCACTCCGGCCTGGGTGACAGGCGACCAAGCAAGACTCCATCTCAAAAAAAAAAGAAT +AATTTGGCTCTTGGTTTATGGTGTCCTAAAAGTCTTTTTAATTGTGGAAAATAAAATGTACAAAAAAGTG +CATAAAACTTCAACATCCATGTTAACAAATTATTATAAAGTACATATGAATGTCTCGCACTAAGTGTGGT +GCTTGCTGAGGTTTTTGATAGACAATCTTTATCAGATAAAGGCAGTTCTCTTAGTTTGCTAAGAGTTTTT +AAAAAATCATGAATGAGCGTTGATTCCTATCACTACTTCTCTCTGCACCTAAGATGATCACAAGATTTTT +CTCCTTGACTCTGTTAATATGGCATATTACCCTTATTGATTTTCTTTTTTCCTTTTTTTTTTTGAGTCAG +TCTTGCTCTGTCTCCAGGCTGGAGTGCAGTGGTGTGATCTTGGCTCACTGCAACCTCCGCCTCCTGGGTT +TAAGCGATTCTCCTGCCTCAGCCTCCCGAGCAGCTGGGACTACAGGCGCCCGCCACCACGCCCAGCTAAT +TTTTTGAATTTTTAGTAGAGACGGGGTTTCACCATGTTGGCCAGGATTGTCTCGATCTCTTGGCCTCGTG +ATCTGCCTGACTCGGACTCCCAAAGTGCTGGGATTGCAGGTGTAAACCACCGTGCCTGGCCTCCTTATTG +ATTTTCCTTTTTTTTTTTGAGACGGAGTCTTGCTCTGTCGCCCAGGCTGGAGTGCAGTGGCGCGATCTCG +GCTCACTGCAAGCTCCGCCCCCCCGGGTTCACGTCATTCTCCTGCCTCAGCCTCCGAGTAGCTGGGACTA +CAGGCGCCCGCCACCACGCCTGGCTAATTTTTTGTATTTTTAGTAGAGACAGGGTTTCACCGTGTTAGCC +AGGATGGTCTCGATCTCCTCACCTCCTGATCCGCCCGCCTTGGCCTCCCAAAATGCTGGGATTACCGGTG +TAAGCCACCGCGTCCAGCCTCCTTATTGATTTTTTAATGTTAAAAACCATTTTGGGTGTTTGGCATAAAC +CCAACTTGGTGATAATGTTTTATCCTGTTTATGTATCACCAGATTCCGTATGCTGATCTTTTGTTTCAGG +TTTTTGCATTTATGTTAATGAGTGATATTTGCCTACAATACTTTCTTGCTTTGCCCTTGTCAGGTTTTGG +CATCAAGATTATGTTAATCTCGTGAAATGGATTTGGGATTTCATCTTCTTTTTCTATGCTCTGGAAAATT +ATCTATAAAATAGACATTATTTCTTCTTTACGTGTTTGGTAAATATTTCCATTGAAGCTGTATGGGTCTG +GAGTTTTATTCTGGCGGAAGTTTTAAACAAGGGATTTGATTTATTTAAATATAATAGCTGTGGAATGATT +CAGGTTTTCTATTTATTTTGTGCTGATTTTAGCATATTGTATTTTTTTTCTTTTCTTTCTTTTTCTTTTT +TCTTTTTTTTTTTTTTGAGATGGAGTCTTGTTCTGTTGTTCAGCCTGGAGTGCAGTGATGTGATCTCGGC +TCACTGAAACTTCCACCTCCTGGGTTCAAGCGATTCTCCTGTCTCAGTCTCCCAAGTGGCTGGGGCTACA +GGTGTGTGCCACCACGCCAGGCTAATTTTTGTATTTTTAGTAGAGATGGGGTTTCACCATGTTGGCCAGG +CTGGTCTTGAACTCCTGACTTCAGGTGATCCACCTGCCTCGGCCTCTCACAGTGCCTGGCCTGTATAATT +TTTTTCTAGGAATTTGTTTATTCCATCTCAAATTGTAAATTCATTGGCATAACATTGTTTAAATATCCTT +TTATTATTATATCCTTTTATCTTATTCATGGTTTATTAACTTTTTAATATGTGCAGGATATACAGTCATA +TACCACTTTTAATTTTTTTTTTTTTTTCTGAGATGGAGTCTCACTCTGTTGCTCGGGCTGGAATGCAGTG +GCACGATCTTGGCTCACTGCAACCTCCGCCTCCCAGGTTCAGACGATTCTCCCACCTCAGCCTCCCAAGT +AGCTGGGATTACAGGTGCACTCCACCACACCTGGTTAATTTTTGTATTTTTAGTAGAGACAAGATTTCAC +CATGTTGCCTAGGCTGGTCTCAAACTCCTGACTTCAGATGATCCACCCAATGTGGCCTCCCAAAGTGCTG +GAATTACAAGTGTGAATACTGTGCCTGACCCTCTTTTGGTTTTAATTTGCATTTTTCTGGTGATCAGAGA +TGTTGAGCATTTAAAAATAGATCTGTTGCCCATTTATAGCTCCTCTTTTGAGAAATATCTGGTCAGATTC +CTTGCCCAGTTTTTAATTAGATTATTTGTTTTCTTGTTACTGATTTGTTTGAGTTCCTTATGTATTTTGG +ATATCAGCTCTTTGTCAGATTTATGGTTTGCAAATATTTTCTCTCACTCTGTAGGTTGTCGCTTCATTCT +CTTGGTTGTTTCCTTTGCTGTGCAGAAGCTCTTTGGTTTTGTGAGTCCCATTTGTCTAGTTTTGCTTTTG +TTGCCTGTGCTTTTGGCGTCCTCTCCAAGACATCATCGCCAAGACCAATGTTGTGGAGCTTTTCCATTAG +GTTTTCTTCTGAGAGATTTAAGTTTCAGGTCTTACATTTACATCTTCTCTATTGTGAGTTGATTTTTGTA +TATGTTGTGAGATAAAGGTCCGGCTTCATTCTTCTGGACGTGGATGTCGAGTTTTCCCAGCACTGTTTAT +TGAAGAGACTGTCCTTTCCCCATTGTGTGTTCTTGTCACAGGGCCCGCTTCTTTCCCCAGGACCCGGGCT +GGAGTGGAAGGAGGGCTGTGCCTTCGCATGGATACCCTGCTCTCCAATCCAGGCTGCCGTGGTCCTCACC +CCTTCCCAGGGTCTCTCTGACACTGAGGCCCTCTGTCCGCGCCGTTGGTTGTTGTGCTTGAACTGCTGGT +TTTTGTTTTGTTTTGTTTTGTTTTTGAGATGGAGTTTTGCTCTTGTTGCCCAGGCTGGAGTGCTATGGCG +CCATCTTGGCTCATTGCAACCTCTGCCTCCCGGGTTCAAGCGATTCTCTTGCCTCAGCCTCCCGAGTAGC +TGGGATTACAGGCATGTGCCATCATGCCAGGCTAATTTTTCTATGTTTAGTAGAGATGGCGTTTCTCCAT +GTTGGTCAGGCTGATCTCGAACTCCCAACCTCATGTTGGGAATTTATATATATATGTTCTTTTTATTCTT +TCTCTCATATTCTCACCTTTGAAAGATTCCGGATACAGGGAAATATTTCTTTACTTAAAAAAAAAAAAAA +GGCCAGGCACAGTTGCTCACACCTATAATCGCAGCACTTTGGGATATATATATATATATATTTTTTTTTT +CCTTTAAAATGTTTTTTATTTTTAGAGTCAGTGGGTACCTGTGCAGGTTTGTTACATGGATATATTGCAT +GATGCTGAGGTTTGAGCCTCAATCAAACCTGTCACCCAGATAGTGAACATGGTACCCAATTGGTAGTTTT +TTCAGTGCTTGCTCCCTTCCCTCCTCTGTTTTGTTCTTGCCCCTTTCCCTCCTCTGCCTGTTGTTCCCAT +CTTTATGTCCATGCATAACCAATGTTTAGCTCCCACTTATAAGTGAGAACATGCGGTATTTGGTTTTCTG +TTTCTGCGTTAATTCACTTAGGATAATGGCCTCCAGCCACATCCATGTTGCTGCAAAGGACATGATTTTG +AAGGAAAACATGTATTTTAAGAAAATACAGGCCAGGCGTTGTGGGGCCCGCCTGTAATCCCAACACTTTG +TGAGGCTGAGGTGGGTGGATCGCTTGAGCCCGGGAGTTTGAGACCAGCTTGGACAACATGGCAAAATCCT +GTTTCTACTAAAACTACAAAAATTAGCCAGGCGTGGAGGCATGTGCCTGTAGTCCCAGCTACTTGGGAGG +CTGAAGTGGGAGGATCAATTAAGCCTGGGAGGTGAAGGCTTCAGTGAGCCATGATTGAGCCATGATTGTG +CCACTGTGTTCCAGCCTGGGCGACAGAGTGAGAGCCTGTTTCAAAAGAAAAAAAGAGAAAAAGAAAAAAA +AAAGTAAATACTGCCTTACTGTCTATAATACCTGCTTTTCTACAAGGTGATAGTAATCAGGAGAATGTAG +GATTGGCAGAAAGGTGGACAAATAGATCAATGGAACAGAATGGGAAATCCACAAATTTGCAATTTGACAA +AGGGGTCAAAGCAATTCATTAAAGAAAGTGGGATCTGTGTTGTATGGATCCTATAGCACATGGAGACCAG +TTGCCGTTGCTGTTTTGATCAACTTCAAAGGCATTCATCCAGGGTGGTTTATTCCAGAACCACTAGAAAT +CATTAGGACCACCAGGACAGCCTCTCCACAGCCATCTGATGTGGCTGCCTCAGCTCGACCTCATGAGGGT +CATCAGTGCTAAGAGTCAAGGTAAGACCCTGTGGAAAGGGGTCTCCTGGTATTTCCTCCCCACCACACTG +GACTAACACAAAGATGGCTGCAAAAGGACATTCCTTGAAATATACAGATTACCTTGGCCGTAGGGTGGAG +AGAGGGGTGAACTGCAGGAAGTTATTATTGCTTATGCCAATATTTTTATTTTTGGGGGGATCCGTTGTTG +TGCTCTTCAGTGATTTTCCTGGTGGCTTTGACCCAGTGGAAAGCTGAAGGACAAAAGTTTTGTCTGCGGA +AGGTGTATTTCTTCCTTGACTGTTATTTAACCTGTGAAACTTGGATTTGGGAGCTATTCCCTAAACAGAA +AGCTGTGGCCCTGCCCTTGGGGTGGGAGGTGACAGCACATGAATTTGTGTTTTCCAGGCTACTCCATTGT +CACTCAAATCACCGGTCCAACAACAGTGAATGGCTTGGAGCGGGGCTCCTTGACCGTGCAGTGTGTTTAC +AGATCAGGCTGGGAGACCTACTTGAAGTGGTGGTGTCGAGGAGCTATTTGGCGTGACTGCAAGATCCTTG +TTAAAACCAGTGGGTCAGAGCAGGAGGTGAAGAGGGACCGGGTGTCCATCAAGGACAATCAGAAAAACCG +CACGTTCACTGTGACCATGGAGGATCTCATGAAAACTGATGCTGACACTTACTGGTGTGGAATTGAGAAA +ACTGGAAATGACCTTGGGGTCACAGTTCAAGTGACCATTGACCCAGGTAAGAGGGAGTGTATATATGTGT +GTGTCTCTCAGGGCCTGCTCTGTCCTGGTCTCTGAGGTCCTACTCAAGTGATTTAATTGTCACTGAGTGA +TCTATCACTTGAGTCCCGAGTCTCATAGAACCCTGACTGACCACCTGGGACTGGGGGAGCAGGGCCTCTC +TTCAATGCCCCCATGGCTCCCAGGGCTCCCTCCACGACGGGATTAAGCCTTTCTAGGCACATTTTTTACC +TCTGCACAGCTCAGTGCCTGAGGACAAGGTGATGGTCCCAGTTTCAGCCCATGGGCCAAAGGGATGCCTT +CCCATGGTACCCAGAGACCCCAGTCCTTGCCATTGCTTTTTGGAGACCTCTGGTGCCCAGGTGTGTCCTG +CTTTTGGTGGAGTTCTGGGCTGGGGCTTCATAGCTGCTGTTCCCATCTCCCATTCCATATCTAACCTGGG +GGAAAAAGAGGCTCAGCATAGTCGGGGTGCTGGGGTCTAATTCCCAAGTCCAGAAAGGTCTTCACAGTCA +AACGGCTGCATTCCTGGTCCCTGACATCCCACCATGAGCCATTTGTGTGTGTGGACTTAATGATATTTCT +TCCTATGTCCTTCTAGAATGCAGACACTCCTGGGAGCGTTAGCTCAGGGGTCCCAGTGCCTCGGTTTGCA +TGGGTTTATTTGTTGGGCTTGTTCTGCTGTCCCTCAGAAGATAAGCAGGATGGCTCGAGGGCCCTTGTGG +CATGGAAGGAGCACCACAGTCCTTGGGGCCCCAGGTTGCACCCGGAGCCTCTGGGGCATATTCTAGTTTG +CAAAGTGCTGAACGTGCATTGTCTCAATGGGTCCTCACAATACCCCTGAGAAGGGGGCATCCCCATCATT +GAAGTTTTGTAACTTGCCCAAAGTCAAAGTTCTTCCTGAATATGATGATTTCATTCAGGGCTGGTCCTGA +CCTTTGATGTCACAGATCTAGGGTCACTGTGCAAGTTCCCACTCTGCTCCTGTTTTTGCTGGATAAATAG +GGCTGGGGGTGGAGATTGACCAAGGCACTGTCATAAGCAGCAGGTTGCTCAAGTTCAGCTGGTCTCCAGC +TGCCCTTAGGCTCTTCCCAGCCCCCTCTAACCCTCCCAGGGCCAGCTCAGCGTCCTCCCTGGACCCTGCC +CAGCCCCACCTGCCTATGAGGCTCTGGAGTCTCCTAGAGACCAGGGAGACATGAGTGGTCCAGGCTGCAG +CCCCTTCCCATAGTAGCAGTCCAGACCCCCGCCCAGGCTCAGAGCTGTGTGCAGGGCTGCGGGTGGGCTC +ATGGGCAGGGGCATCTGTGATCTGGTTTATACACCAAGCATCAACTACAGCGTCGACTCCTGCCCCCACC +ACGCCTACCTCCACTACGTTTACAGCACCAGTCACCCAAGAAGAAACTAGCAGCTCCCCAACTCTGACCG +GCCACCACTTGGACAACAGGTAAGCCAGCTCTGGTTCCACTGTGGCCTACTTGGCCCAAACTCCTCCAAT +GGAGAACTCTTTGAAGTCCCATTTTCCAGCCTGAGCTTTGTCTGGGACCTGCATGCTCCTTGGTGAGGAT +GAGGAAGCTGGGAGCCCTTCCCACTTGCTGTCAGGCCACATGGTGTCCTTTCAGTCTCCACAGATCCTTC +ATTCTGCAGCTCAGTGCTGTTTGTAGGTTCATTCATTCGTTCGCCCCACTTTTACTGTGTTATAATAACT +TCTACATGGCTGCACCGTGCTGAGCTCTGGGGACACAGACATGGACTGTCCCCGTGGAGCTCAGTGTGGG +AGACCGATGAGTTGGCAGTCAGTGAGGACAGTGTGTCTCATGTGAGTGTGGGCGTGGGGTCATGGAGCCG +CAGGGATGACTCTTGGTTCAACTTCTGGGAGGACAAGGAAAGGAGGGCTTGGGGAAGTCCTGTTTCCCTT +AATTTTCCTCTGATTACTATTATGGCAAGAACAGTGATGCCTATAGTTTCTCACATCTCTAGAGCAACTG +ACAGCTCACAAAAGGCTTTCCTGTGCATTATTTTGTTTTCTTTGACCTTTCCTACAACCACCCCATGTGG +AAGAATGGGCAGATACAAATGTTCCCATTTTACAGAAGAAGAAACTGAGCCTCAGTCCCTTCCTCAGTGA +AGGGTGGTGCTGGAGCTTGGATGGAAGTCTCTGGGCTCTACACCATAGCTCAGGTCATGCCTCTTAGTGC +CCTGCCCAACCATCTGCAGTCAGCTGGGCTTTCTAGAAGAGGAACGTCCCCTTTAATTTCTTCCTTTCAC +TCTCCTTGGCCCTCTTGACTTTGTTTTTAGATCTGCCAAGTGCCAAGTTCTCCTTTTGCTGGGACCTTGC +CACAAAATCCCTATGGGAGGTCCCCGGTATTCTAACCACCTGAGACCCTTCTACTCCCCCTCTCCCATTT +GCCTTCTGTTTTTTTTTTAAATTAAACTTTTTAAAGTTTTCTTTTTTTTTATTTTTAGACAGTCTTGCTC +TGTAGCTCAGACTTGAGTGCATGGTGCCATCTTGGCTCACTGCAACCTCTGCCTCTCATGTTCAAGCAAT +TCTCATGCCTCAGTCTCCTGAGTAGCTGGGGGAACTATAGTCGCCCACCACCACACCCAGCTAATTGTTT +TTTTTTTTTTTTTTTTGACGGATTCTCACTCTGTTGCCCAGGTTGGAGTGCAGTAGTATAATCTCAGCTC +ACTGCAACTTCCACCTCCCAGGTTCATGTGATTCTCATACCTCAGCTTCTCGAGTAGCTGCAACTACAGG +CGTAAGCCACCATGCCCGGCCAATTTTTAGTAGCGATGGGGTTTTGTTATGTTGGCCAAGCTGGTCTCCA +ACTCCTGTCCTCAAGTGATTTGCCCACCTCAGCCTCCCAAAGTGCTGAGATTAAAAGCATAAGCCACCTT +GCTTTGTCCCTATTTTTCTTTTTCCTAAGCAAATCAATACAGGATACCCTCTGATTTGAAAGTTTATTTT +CTACCAAACTGCTTTGAATAAGTAATAGTTAAGGTTATTTATTTATTTATTATTTTATGAATCGAAGCCT +TATGTAAGTGCCCGATTAGGGCCTTTGATTAGCATGAGATGGTTTGTTCACTCATTTATTTAACACATAA +CTCTTGAGCACTCACTCTAGCCTGTCACTGTTTAAGGGAATTAAAAAGGGTACAAGAATGGACCAAATAA +TCAAAATGCCTGCCCTCGAGGTACTTACAATCTAGAGCTAATTAGTGTAACCAAATAGGGTTGACACAAT +TAATTGACTGATGAATTTCTTTCTCAAATTTCTGCACATCCAGTGATGAAGGAGATAATCCTTATATTAG +TTTGCTTGGGTTGCCCCTAACAAAATACCTCAGAATGGGGGCTTAAATCACAAAAATTTATTTTCTCACA +ATTCTGGAGGCTAAAAGTCTGAGATCAAGTGGTCCACAAGGTTGGTTTCTTCTGAGTCTTCTCAGCTTGT +AGATGGCTATCTTTTTCCTATGTGTTCACGTGGTCGTCCCTCTGCGTGTGTCTGTGTCCTGATCTCTTCT +AATGACACCAGTCATAATGGATTAGGGCCTGCCCCAGTGACCTCACTTAACCTCAATCACCTCTTTAAAG +ATCCTATCTTCAAATACAATCACATTCTGAGATACTAAAGGGTGGGACTTCAACATATGGATTGGGGTGG +GGGGGTGGTGGTGTACAATCCAGCCCAGCCCACAACAGTTTTTTTTGTTGTTTGTTTGTTTTGAGACGGA +GTGTTGCTCTGTCACCCATGCTGGAGTGCAGTGGCAGGATCTCGGCTCACTGCAACCTACGCCTCCCAGG +TTCAAGCGATTCTCCTGCCTCAGCCTCCCAAGCAGCTGGGAGTAGCCAGGATTACAGGAATGCACCACCA +CACCCGGCTGTTATTTTGTATTTTTAGTAGAGATGGGGTTTTGCCATGTTGGCCAGGCTGGTCTCGAACT +CCCGACCTCATGTGATCAGCCTGCCTCGGCCTCCCAAAGTGCTGGGATTACAGGTGTAAGCCACCGCGCT +CAGTCGAGTGATTCTTGAGCAAAGACTGGAAGAGGTGAGAAAGTGAGCCACACAGAGCTCTGGAGGGAGC +ATGTTCCTGGAGAAAAATGCAACTGGAATAAGGTCCGTAATGTGGGGATGTGCCTGGAATATTTGAGGAG +CAGCAGGAAGTGAGGCTGGAGTGAAGCAGGCAGGGGTGAGTCGTAGGCAGGAGTGGAGGCCATTGTCAGC +CCTGGCCATTTTCATGGGCTGAAATGGGGCTGTTACGGGATTTATTTATTTATTTATTTTATTTTGATTG +TTTGTTTGTTTGTTTATTTATTTATTTATTTATTTATTTATTGAGACAGGGTCTTGCTCTGTCGACCAGG +CTGGAGTGCAGTGGCGCGATCTCAGCTCACTGCAACCTCCGCCTCCTGGGTTCAAGCAATTCTCCCACCT +CAGCTTCCCAAGTAGCTGGGATTACAGACACACGCCACCCCACCCAGCTAATTTTTGTATTTTTTGGTAG +AGACGGAGTTTCACCATGTTGGTCAGGCTGGTCTTGAACTCCTGACCTCAAGTGACCTGCCTACTTCAGC +CTCCCAAGTTGCTGGGATTACAGGTATGAGCCACTGTGCCTGGCCTGTTGCAGAATTTTGAGAAAGAGAG +GGGTGTGCTTTGACGTTTGGTTCACAGGCCCCCCGGCTGCTGTGGCAATGATCCACTACAGTAGGGCCTG +TCAGTGTCCTGTGGCCGCCGTTACGAGTGGCTCAAAGCAAAAGAAATACGTTCTCTCACAGCTAAGGAGG +CCAGGAATTGGAAATCAAGGTGTCACCAGGGCTGTTGCCACCCTGGGGGCTCAAAGGGAGAGTCTGTCTC +ATGCCTCTGTTCCAGCTGCTGGTGGGCGCCTGCGATCCTTTGTGTTCCCGGTCTTGTGGCGGCATCACTC +TGATCCCTGTCTCTGTCGTCTCATGGTCTTCTTCCTGTGTGTGTTTATGTGTCCAAATTTCCCTCTTGTA +AGGACACCAGTCGCTGGATCAGGGCCCATGCTAATCAACTATGACCTCATCTTAACTTGATTACATCCAC +AAATAGGTCACATTCACGGGTCTGAGTGGATGTGAATTCGAGGGGAAGAGGACGCTATTCAACCCAGTAG +AGAGGGCAGGTGTAGAGGCAGGAGGTGTTTTAGGAGTCTGCTGCCCTACAATTTCTGTTAGGCTTCTAGA +CACATTAAAACCACCTCATGGAACCCTTTTCTCCTTTTAAAAACTATCTTTTGTTAAGGCCGGGCGCTGT +GGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCTGAGGTGGGCGGATCATGAGGTTAGGAGTTCGAGAC +CAGCCTGGCTTACATGGCAAAATCCTGTCTCTACTAAAAATACAAAAATTAGCTAGGTGTGGTGGTGGGT +GCCTGTAATCCCCGCTACTTGGGAGGCTGAGGCAGAAGAATCTCTTGAACCTGGGAGGTGGAGGTTGCAG +TGAGCCGAGATCATGCCACTGTACTCCAGCCTGGGTGACAGAGCAAGACTCTGTGTCAAAAAAATGAAAA +ACAAAGAAAACCAAAAACAACTATCTTTTGTTAAAATTCTTAAATTTTAAAAACTTTTATTTTAGCTTCA +GGGGTACACGTACAGATTTGTTATATCGGTAAATTGCACGTCACCGGGGTTTTGTGTACAGATTATTTTG +TCACCCAGGTAATAGGTATAGTACCCAATAGGTAGTTTGTAGTTTTTTGATTCTTTGCTTCCTCCCACCC +TCCATCCTCAAGTAGGCCCTGGTGTCTCTTGTTCCATTCCCTTCTTTGTGTCTATGTGTACTCATTGTAA +ACCCATCTTTGAGGATCCCCAGGAGTGAGGGTTCCCAGGCTGCAAACTACAGACCTGCAGCTCATCAGGT +GTGTGGCCCTCCCCTGCTGCTGGGTGGGTGGGATTGGGCTGAGAGGCAGGTGGTGAGCCTGGGATGCAGG +GGACGCTTGCCATTGTTTCTAGGCACAAGCTCCTGAAGCTCAGTGTCCTCCTGCCCCTCATCTTCACCAT +ATTGCTGCTGCTTTTGGTGGCCGCCTCACTCTTGGCTTGGAGGATGATGAAGTACCAGCAGAAAGGTGAG +AGGACCTGGGTGAGGCTGGGCTGAGACTGGGCCGGGTGGCTGGGGATGTGGCATCTGGGAGCTATCTGGT +CCAAGGGATTACCCACACAATCAGGAAATGGCACAGTCCCAAAGGCCCTCAGCAGCTGGCAGTGCCAGGG +GACCGGGGGGCGCCCTGGAGCTTGCAGGGGAGCCCCAGGAGGTTTTGGCATTCGCCGCCTGCCCTGCACC +TCCGTCCTTCCCCTTCTCTTCTCTCTGATCCACCTCCCTGTCTCTTTCCTGGATTGACTCTTGATGGCAT +AGGTGGCAATGGTCTTAGGTGGGTGGTCCAGTCTGGGGTGTCTAGACTAGACGCTAGAGTTGGGTCTATG +TGGTTGTCAAGGAGGTACTGAGCTGACATGGAATGGGGAGGGAGAAACAGACACAAACACACCAGTGGAT +GCACACACACATGCGCGATCATGGGCACACACACTGCATGGATCCCACAGTGCATGTGCACTCATCACAT +TACACAAGCATGCACACACACACACAGCCCTCCAAGTCTGATCAGGAAATTGGAATGGCCTTTCCTTCTG +TCCCTCCCTTGCCCACTGGACTTTCCCAGGCTGACCTACTTGCCCTTTGTGAACTAGCTCTTCAGGGAGG +AGCCTCTTGGCGCAGGAGTCTACACTCACAGATCATGAGGCCTGGCCTCAACAAAACCCCAGTTGTGTGA +GAATGTGGCTTCACTTCCACATTCTCACACAATTGTACCTCTTTCTCTTCTAGAACAGCTGGGTCCTAAC +TCTCACAGGGTTCCCTCCAACACTTCTCCCTCCCCACAGCCTGGGAAATCTTTCGCCTGGGAAGAGGGGA +GCCTGACTCTCTGTTAGTTTGTTTTCTGTGATGCTTTTTCAAAATTAGAGCCTGACTTTTATTCATCGCT +TTTTCTCCCAAATGTATTGCCCACGGCCTAAGGTCTAAGTGCTTTCTGTGTTCTCCAGCATCTAGACTTT +TGAAACCAGGGGAAAACCCCCCATTTCTTTGTTGTCCCTAGGGCAGGGATGTGGGGCTGGTGGCTGGGGA +GGGGAGAAAGAAGACAGGGACCAGCAACCACCTGAAAATCGGTCCCAGTGGAAAAACAATCCAATCCAAT +CTCTGGGTGTCGTGGCTCATGCCTATAATCTCCGCACTTAGGGAGGTTGAGGCGGGTGGATCACTTGAGG +TCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCTTCTCTACTAAAAATGCAAAAAAATTAGC +TGGATATGGTGGCACACATCTGTAGTTCCAGCTACTCAGGAGGCTAAGGCAGGAGAATCGCTTGAACCCA +GGAGGTGGAGGTTGCAGTGAGCCAAGATGGTGCCATTGCCCTCCGGCATGGGTGACAAGAGAGAAACTCC +ATCTCAAAACCAAACCAAACCAAACCAAACCAAACCAAACCAAACCAAACCAAACCAAACAAAATCGGTC +CCTACTGTGCCGACCCAGCTTGCACATTCTACTCGGGGTTGGGGGACAGCTTGGTGCATTCCTGTCTCTG +CAGACAACTTTTACTGATGGAGCACCACCTGCTCAGCACAGCCAAGCACCCAGGTTGTAAGGAAGGGACA +GGCCCAGCCACCCTGCAGCTTACAGCCCACCTGAGGACTAACTAGAATTCAGTAGGACGAAGATGATGAG +ACAGACATAGAGCAAGTGCCCCCCGGGGGCCCGGGAGGGGGATTAGACAGCCTGGGTAGCTGCAGAAGGC +AGGCATGACTTTGAATTGACCCCTGAAGGATGAATCAGGTAGAGAGACATGGTGGAATTCCAGGCCGAGT +TGGGATTCTGCAGAAGCAAACTTACTCCCAGGGGAGCCGATGGCTGGCAGGGCCACCCGATCAGCTGAGT +GCACAGTTCTATGATGTGGGGAGCAGGGGGCTGGGGTGTCCCTCTTTTCCTGGTGGAGAAGTGAAAGGTC +AGAGAGGTGACATGACGAATGTCACAGACTGTGTCATCTGAGGAATCAGAATTAGAACCAGGTCCCTGTC +TCTGAGCCTGGACCCAATTCACCTCTTTTCTTCCATATTTCTCTTCCCTAGACCTGGGGGTTCTTGTGAG +TCTGGGGAATGGGCTTTCTAATTCTGAATTGTTGTTTGTCTTTTAGCAGCCGGGATGTCCCCAGAGCAGG +TAAGATAGCCCCCAAGGTCGGACCAGAGAGACCAGGCAGCTTCTTTCCCCTTCCTCATTTTCCATCTCTC +AGAAGGTCCAAAATATCAGAGAAGTCCTGTGCCAAAGTATCCTGTCCTCATGGCACCCTGCAGGGCCTGG +GGGCTGAGGGCCTGGAAACAGCCAGGGGGACAAGGGCTGAGGGGTACAGGAGCCTGAGAAGTGGAGGACA +CGGAGGGAAATGGGAGAGGGGCACCTGGGGAAGTGGAGAGGCTCTCGTCCCGTCCCACCGTGGAAAAGGA +ACCTGTGTCCACAGACAGATCTTGTGACTCCCCAGCCTGTGTTCCCTGTCCCAGGTACTGCAGCCCCTGG +AGGGCGACCTCTGCTATGCAGACCTGACCCTGCAGCTGGCCGGAACCTCCCCGCAAAAGGCTACCACGAA +GCTTTCCTCTGCCCAGGTTGACCAGGTGGAAGTGGAATATGTCACCATGGTGCGTCCTCCATGGGGGCTG +CTGTGAGGCTGGGGCAGGGGGACACAGGCCTGCCGTTGCACCCCGTTGGAGGGTGGGCTTTCTCCTGCTC +ATGTGGGCATAGAGGAGGCAGGTGTGGGGCAAAGCCATAGTCACTGCAGGACTCGCCTAGGAGCTCCCAC +AGGCAAGTCTCAGTCCTGGGGACACAGGGACTGGCAAACACAGTCCCCACGTGGGCAGCAGAGTCCCAGA +CAGGCTGAGTGACTTGGGATTCCACAGGAAATGTGCTGCACTGGTGCAGGGTTGGGGGAGAATCTGTTCT +GTTGTGGTTGAAGCTCAGGGTGCTTGGCAGAGTGAAGGGGGAAGCTGGAGGGCTTGAGTGGGATTAGAAG +CCCTGAATGGTCCATCCCCACTATCCTCCTCCGAGGGCTCCTCTGCCCCGTGACCGTCACTTCCTGCCGT +CCTCTCTGACCTGGAGCCCGCCTTTGCTGCAGGCTTCCTTGCCGAAGGAGGACATTTCCTATGCATCTCT +GACCTTGGGTGCTGAGGATCAGGAACCGACCTACTGCAACATGGGCCACCTCAGTAGCCACCTCCCCGGC +AGGGGCCCTGAGGAGCCCACGGAATACAGCACCATCAGCAGGCCTTAGCCTGCACTCCAGGCTCCTTCTT +GGACCCCAGGCTGTGAGCACACTCCTGCCTCATCGACCGTCTGCCCCCTGCTCCCCTCATCAGGACCAAC +CCGGGGACTGGTGCCTCTGCCTGATCAGCCAGCATTGCCCCTAGCTCTGGGTTGGGCTTGGGGCCAAGTC +TCAGGGGGCTTCTAGGAGTTGGGGTTTTCTAAACGTCCCCTCCTCTCCTACATAGTTGAGGAGGGGGCTA +GGGATATGCTCTGGGGCTTTCATGGGAATGATGAAGATGATAATGAGAAAAATGTTATCATTATTATCAT +GAAGTACCATTATCGTAATACAATGAACCTTTATTTATTGCCTACCACATGTTATGGGCTGAATAATGGC +CCCCAAAGATATCTGTGTCCTAATCCTCAGAACCTGTGACTGTTACCTTCTGTGGCAGAAAGGGACAGTG +CAGATGTATGTAAGTTAAGGACTTTGAGATAGAGAGGTTATTCTTGCTGATTCAGGTGGGCCCAAAATAT +CACCACAAGGGTCCTCATAAGAAAGAGGCCAGAAGGTCAAAGAGGTAGAGACAAAGTGATGATGGAAGTG +GACGTGGGTGTGACGTGAGCAGGGGCCATGAATGCCGCAGCCTTCAGATGCCAGAAAGGGAAAGGAATGG +ATTCCCCTGCCTGGAGCCTCCAAAAGAAACCAGCCCTGCCCACGCCTTGACTTGAGCCCATTGAAACTGA +TCTTGAGCTCCTGGCCTCCAGAATTGCAGGAGAATAAATTTGTGTTGTTTTTAATGA + diff --git a/misc_module b/misc_module new file mode 160000 index 0000000..d29d55c --- /dev/null +++ b/misc_module @@ -0,0 +1 @@ +Subproject commit d29d55c0582e79c291546bcf9594f22155295f5e diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..29abb64 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,9 @@ +beautifulsoup4==4.12.3 +Bio==1.7.0 +biopython==1.81 +numpy==1.26.4 +pytest==8.2.0 +python-dotenv==1.0.1 +Requests==2.31.0 +requests_mock==1.12.1 +scikit_learn==1.4.1.post1 diff --git a/test_my_tools.py b/test_my_tools.py new file mode 100644 index 0000000..18081df --- /dev/null +++ b/test_my_tools.py @@ -0,0 +1,112 @@ +import unittest +import pytest +import os + +from Bio import SeqIO +from biopython_fastq_filter import BiologicalSequence, NucleicAcidSequence, InvalidInputError, DNASequence, RNASequence, AminoAcidSequence, filter_fastq, send_telegram_message +from bio_files_processor import convert_multiline_fasta_to_oneline +import requests_mock +import requests + +def test_complement(): + inp = NucleicAcidSequence('ATGC') + target = 'TACG' + result = NucleicAcidSequence(inp).complement() + assert target == result + +def test_amino_acid_frequency(): + peptide = AminoAcidSequence('AAGG') + result = peptide.amino_acid_frequency() + target = {'A': 50.0, 'G': 50.0} + assert target == result + +def test_transcribe(): + dna = 'ATGC' + target = 'AUGC' + result = DNASequence(dna).transcribe() + assert target == result + +def test_check_nucleic_acid(): + input = 'ATGCU' + with pytest.raises(InvalidInputError): + DNASequence(input) + +def test_amino_acid_type(): + input = 'AAGGZZ' + with pytest.raises(InvalidInputError): + AminoAcidSequence(input).amino_acid_frequency() + +test_fasta_data = """>seq1 +ATGC +ATGC +ATGC +""" +expected_fasta_data = """>seq1 +ATGCATGCATGC +""" + +@pytest.fixture +def tmp_fasta_file(tmp_path): + fasta_file = tmp_path / "test.fasta" + fasta_file.write_text(test_fasta_data) + return fasta_file + +def test_convert_multiline_fasta_to_oneline(tmp_fasta_file): + output_file = tmp_fasta_file.parent / "output.fasta" + result_file = convert_multiline_fasta_to_oneline(tmp_fasta_file, output_file) + + with open(output_file) as f: + result_data = f.read() + assert result_data == expected_fasta_data + output_file.unlink() + + +test_fastq_data = """@SRX079804:1:SRR292678:1:1101:21885:21885 +ACAGCAACATAAACATGATGGGATGGCGTAAGCCCCCGAGATATCAGTTTACCCAGGATAAGAGATTAAATTATGAGCAACATTATTAA ++SRX079804:1:SRR292678:1:1101:21885:21885 +FGGGFGGGFGGGFGDFGCEBB@CCDFDDFFFFBFFGFGEFDFFFF;D@DD>C@DDGGGDFGDGG?GFGFEGFGGEF@FDGGGFGFBGGD +@SRX079804:1:SRR292678:1:1101:212327:212327 +TTACCTCTGCTTTTTCGCCTGTTACTTCTACTAATCCTTCATCTATTGCGAATGGCCCTACTACTGACGAAAT ++SRX079804:1:SRR292678:1:1101:212327:212327 +DBCCC@@;A@BDCCCE>BBED>GDCDBFBFFEBEECFGGD@@BCB<<8@;09746:@@>@EEECEEDE/FEED +@SRX079804:1:SRR292678:1:1101:230386:230386 +TTCTGATTGGAGTGAGAGTGCCATTTGTTTCGCTGATTGGACGTTGGAAAGCGCCTTGACCTTTGACAGCAG ++SRX079804:1:SRR292678:1:1101:230386:230386 +=BDD=DCDBBCDADD@@B;B@CC7C@B@>=BACDD,=??8DDCDD7CCCCDBDEDBDDDDDEEBDDDBD?DB +""" + +@pytest.fixture +def tmp_fastq_file(tmp_path): + fastq_file = tmp_path / "test.fastq" + fastq_file.write_text(test_fastq_data) + return fastq_file + +def test_filter_fastq(tmp_fastq_file, tmp_path): + quality_threshold = 10 + gc_bounds = (30, 70) + length_bounds = (10, 100) + + output_file = tmp_path / "filtered.fastq" + result = filter_fastq(tmp_fastq_file, quality_threshold, output_file, gc_bounds, length_bounds) + with open(output_file) as f: + filtered_records = list(SeqIO.parse(f, "fastq")) + assert len(filtered_records) == int(result) + +token = os.environ.get('TG_API_TOKEN') +def test_send_telegram_message(requests_mock): + chat_id = "508988457" + message = "Test message" + expected_json_response = { + "ok": True, + "result": { + "message_id": 1234, + } + } + requests_mock.post( + f"https://api.telegram.org/bot{os.environ['TG_API_TOKEN']}/sendMessage", + json=expected_json_response, + status_code=200, + ) + response = send_telegram_message(chat_id, message) + assert response == expected_json_response +