diff --git a/README.md b/README.md index ba78686..06139ab 100644 --- a/README.md +++ b/README.md @@ -1 +1,83 @@ -# misc_module \ No newline at end of file +# misc_module + +This module contains assignments for the BI course on python. The scripts include tasks on OOP, API and various functions useful for basic bioinformatics. +The following functions can be found in `bio_files_processor.py`: + +``` +convert_multiline_fasta_to_oneline +``` + +Example: + +` +convert_multiline_fasta_to_oneline(input_fasta='example_multiline_fasta.fasta') +` + +Input file: + + + +After conversion to a single line: + + + +``` +OpenFasta +``` + +This is a context manager that works with fasta files. + +- It returns records as `FastaRecord` class objects +- Includes `read_record` and `read_records` methods + +Input and output example: + +```fasta +>GTD326487.1 Species anonymous 24 chromosome +ATCGACTACGACTAGCATCACGATCACGATACG +ATGCATCAGTAGCACTAGATCA +``` + +```python +id = 'GTD326487.1' +description = 'Species anonymous 24 chromosome' +sequence = 'ATCGACTACGACTAGCATCACGATCACGATACGATGCATCAGTAGCACTAGATCA' +``` + +In biopython_fastq_filter.py the following functions are located: + +``` +fastq_filter +``` + +This function uses BioPython and filters fastq sequences by GC content, sequence length and quality score. + + +``` +BiologicalSequence +``` + +This is an abstract class that includes: + +Class NucleicAcidSequence which has `complement` and `gc_content` methods. It's a parent class to DNASequence and RNASequence classes. + +Class AminoAcidSequence has `amino_acid_frequency` method. + + +``` +telegram_logger +``` + +This function send a message from a telegram bot about the status of some process: + + + + +``` +run_genscan +``` + +This is a python API for this web tool http://hollywood.mit.edu/GENSCAN.html + + +In `custom_random_forest.py` there is a `RandomForestClassifierCustom` class that works with a custom number of threads which makes it fast. diff --git a/Showcases.ipynb b/Showcases.ipynb new file mode 100644 index 0000000..676ef72 --- /dev/null +++ b/Showcases.ipynb @@ -0,0 +1,449 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "39e3aac3-ed97-4c58-871e-a334ee17c2a4", + "metadata": {}, + "source": [ + "# OpenFasta" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "fec18e02-01e2-41ef-97c0-26aeffd2f057", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id = 'GTD323452'\n", + " description = '5S_rRNA'\n", + " sequence = 'ACGGCCATAGGACTTTGAAAGCACCGCATCCCGTCCGATCTGCGAAGTTAACCAAGATGCCGCCTGGTTAGTACCATGGTGGGGGACCACATGGGAATCCCTGGTGCTGTG'\n" + ] + } + ], + "source": [ + "import time\n", + "import os\n", + "from typing import Optional\n", + "\n", + "from bio_files_processor import OpenFasta\n", + "\n", + "fasta_file = \"data/example_fasta.fasta\"\n", + "\n", + "with OpenFasta(fasta_file) as fasta:\n", + " for record in fasta.read_records():\n", + " print(record) " + ] + }, + { + "cell_type": "markdown", + "id": "f2cff184-adc1-4396-aabe-5634dd654efc", + "metadata": {}, + "source": [ + "# Run_genscan" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "2bf845cb-ff8a-4dac-b121-c40f5351dc99", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Cannot provide both a sequence containing file and a sequence in the form. Please choose one.\n", + "\n", + "
\n", + "View gene model output: PS | PDF\n", + "\n", + "\n", + "GENSCAN 1.0\tDate run: 29-Apr-124\tTime: 18:45:12\n", + "\n", + "\n", + "\n", + "Sequence /tmp/04_29_24-18:45:12.fasta : 18651 bp : 48.67% C+G : Isochore 2 (43 - 51 C+G%)\n", + "\n", + "\n", + "\n", + "Parameter matrix: HumanIso.smat\n", + "\n", + "\n", + "\n", + "Predicted genes/exons:\n", + "\n", + "\n", + "\n", + "Gn.Ex Type S .Begin ...End .Len Fr Ph I/Ac Do/T CodRg P.... Tscr..\n", + "\n", + "----- ---- - ------ ------ ---- -- -- ---- ---- ----- ----- ------\n", + "\n", + "\n", + "\n", + " 1.01 Intr + 64 144 81 2 0 40 99 113 0.455 7.21\n", + "\n", + " 1.02 Intr + 8152 8490 339 2 0 86 110 352 0.992 32.65\n", + "\n", + " 1.03 Intr + 9870 9933 64 1 1 88 115 68 0.994 7.28\n", + "\n", + " 1.04 Intr + 14487 14599 113 0 2 71 100 118 0.742 11.32\n", + "\n", + " 1.05 Intr + 16751 16773 23 0 2 84 110 7 0.471 -0.24\n", + "\n", + " 1.06 Intr + 17109 17243 135 2 0 101 74 173 0.993 17.96\n", + "\n", + " 1.07 Term + 17717 17872 156 1 0 109 38 97 0.989 4.73\n", + "\n", + " 1.08 PlyA + 18627 18632 6 1.05\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "Suboptimal exons with probability > 1.000\n", + "\n", + "\n", + "\n", + "Exnum Type S .Begin ...End .Len Fr Ph B/Ac Do/T CodRg P.... Tscr..\n", + "\n", + "----- ---- - ------ ------ ---- -- -- ---- ---- ----- ----- ------\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "NO EXONS FOUND AT GIVEN PROBABILITY CUTOFF\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "Predicted peptide sequence(s):\n", + "\n", + "\n", + "\n", + "\n", + "\n", + ">/tmp/04_29_24-18:45:12.fasta|GENSCAN_predicted_peptide_1|303_aa\n", + "\n", + "XSTEGNGDLSEEKMPLLTLYLLLFWLSGYSIVTQITGPTTVNGLERGSLTVQCVYRSGWE\n", + "\n", + "TYLKWWCRGAIWRDCKILVKTSGSEQEVKRDRVSIKDNQKNRTFTVTMEDLMKTDADTYW\n", + "\n", + "CGIEKTGNDLGVTVQVTIDPAPVTQEETSSSPTLTGHHLDNRHKLLKLSVLLPLIFTILL\n", + "\n", + "LLLVAASLLAWRMMKYQQKAAGMSPEQVLQPLEGDLCYADLTLQLAGTSPQKATTKLSSA\n", + "\n", + "QVDQVEVEYVTMASLPKEDISYASLTLGAEDQEPTYCNMGHLSSHLPGRGPEEPTEYSTI\n", + "\n", + "SRP\n", + "\n", + "\n", + "
Back to GENSCAN\n", + "
\n" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from bs4 import BeautifulSoup\n", + "from biopython_fastq_filter import run_genscan\n", + "\n", + "output = run_genscan(sequence=None, sequence_file=\"data/sequence.fasta\", organism=\"Vertebrate\", exon_cutoff=1.00, sequence_name=\"\")\n", + "soup = BeautifulSoup(output, 'html.parser')\n", + "lines = soup.prettify().split(\"\\n\")\n", + "cds_list = []\n", + "soup" + ] + }, + { + "cell_type": "markdown", + "id": "aea07324-6768-4c42-8371-c7ba75436049", + "metadata": {}, + "source": [ + "# RNASequence, DNASequence, AminoAcidSequence " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "2aafe0ba-fecb-467c-af55-977c5850c59d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "AAUU\n", + "{'K': 33.33, 'P': 33.33, 'L': 33.33}\n", + "CCTT\n" + ] + }, + { + "data": { + "text/plain": [ + "str" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from biopython_fastq_filter import NucleicAcidSequence, BiologicalSequence, RNASequence, DNASequence, AminoAcidSequence \n", + "\n", + "new_rna = RNASequence('UUAA')\n", + "print(new_rna.complement())\n", + "\n", + "new_protein = AminoAcidSequence('KKPPLL')\n", + "print(new_protein.amino_acid_frequency())\n", + "\n", + "new_dna = DNASequence('GGAA')\n", + "print(new_dna.complement())\n", + "type(new_dna.complement())" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "134df12f-398c-49f7-9b32-c95ebfe3f89e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1m============================= test session starts ==============================\u001b[0m\n", + "platform linux -- Python 3.12.0, pytest-8.2.0, pluggy-1.5.0\n", + "rootdir: /home/lsmertina/misc_module\n", + "plugins: anyio-4.3.0, requests-mock-1.12.1\n", + "collected 8 items \u001b[0m\n", + "\n", + "test_my_tools.py \u001b[32m.\u001b[0m\u001b[32m.\u001b[0m\u001b[32m.\u001b[0m\u001b[32m.\u001b[0m\u001b[32m.\u001b[0m\u001b[32m.\u001b[0m\u001b[32m.\u001b[0m\u001b[32m.\u001b[0m\u001b[33m [100%]\u001b[0m\n", + "\n", + "\u001b[33m=============================== warnings summary ===============================\u001b[0m\n", + "test_my_tools.py::test_filter_fastq\n", + " /home/lsmertina/miniforge3/envs/testing/lib/python3.12/site-packages/Bio/SeqUtils/__init__.py:144: BiopythonDeprecationWarning: GC is deprecated; please use gc_fraction instead.\n", + " warnings.warn(\n", + "\n", + "-- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html\n", + "\u001b[33m========================= \u001b[32m8 passed\u001b[0m, \u001b[33m\u001b[1m1 warning\u001b[0m\u001b[33m in 0.29s\u001b[0m\u001b[33m =========================\u001b[0m\n" + ] + } + ], + "source": [ + "! python -m pytest" + ] + }, + { + "cell_type": "markdown", + "id": "5ef9fe6c-3fb6-41e4-97fe-eecde2c97028", + "metadata": {}, + "source": [ + "# RandomForestClassifierCustom" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "d9581b19-6f73-474d-9159-b74c97c3ab40", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "import warnings\n", + "import random\n", + "import math\n", + "import pandas as pd\n", + "import xgboost\n", + "import lightgbm\n", + "import catboost\n", + "\n", + "from matplotlib.colors import ListedColormap\n", + "from scipy.stats import pearsonr\n", + "from itertools import combinations\n", + "from sklearn.base import BaseEstimator\n", + "from sklearn import datasets\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.ensemble import (RandomForestClassifier,\n", + " ExtraTreesClassifier,\n", + " VotingClassifier)\n", + "from sklearn.tree import (DecisionTreeRegressor,\n", + " DecisionTreeClassifier)\n", + "from custom_random_forest import RandomForestClassifierCustom\n", + "import time\n", + "from sklearn.datasets import make_classification" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "f03467db-99fa-4b07-9e18-9dc015b2b28d", + "metadata": {}, + "outputs": [], + "source": [ + "X, y = make_classification(n_samples=100000)\n", + "random_forest = RandomForestClassifierCustom(max_depth=30, n_estimators=10, \n", + " max_features=2, random_state=42)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "7b735b11-70e9-4b19-b82f-fa7cca0aa901", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2.9622790813446045\n" + ] + } + ], + "source": [ + "import time\n", + "start_time = time.time()\n", + "random_forest.fit(X, y, n_processes=1)\n", + "fit_time_1_process = time.time() - start_time\n", + "print(fit_time_1_process)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "3c7c0196-8209-4518-95f0-753972c2a22e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2.2971296310424805\n" + ] + } + ], + "source": [ + "#Fit with 2 processes\n", + "start_time = time.time()\n", + "random_forest.fit(X, y, n_processes=2)\n", + "fit_time_2_processes = time.time() - start_time\n", + "print(fit_time_2_processes)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "e66deca9-1f5a-44a6-98bf-bff0ea6cade6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.21371793746948242\n" + ] + } + ], + "source": [ + "#Predict with 1 process\n", + "start_time = time.time()\n", + "predictions_1_process = random_forest.predict(X, n_processes=1)\n", + "predict_time_1_process = time.time() - start_time\n", + "print(predict_time_1_process)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "d6bfb90d-51bd-4a76-a405-67172981d061", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.2030501365661621\n" + ] + } + ], + "source": [ + "#Predict with 2 processes\n", + "start_time = time.time()\n", + "predictions_2_processes = random_forest.predict(X, n_processes=2)\n", + "predict_time_2_processes = time.time() - start_time\n", + "print(predict_time_2_processes)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "c9ea23c3-3e68-4208-82c3-64be50b47992", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fit time with 1 process: 2.9622790813446045\n", + "Fit time with 2 processes: 2.2971296310424805\n", + "Predict time with 1 process: 0.21371793746948242\n", + "Predict time with 2 processes: 0.2030501365661621\n", + "Predictions match: True\n" + ] + } + ], + "source": [ + "#Check if predictions are the same\n", + "predictions_match = np.array_equal(predictions_1_process, predictions_2_processes)\n", + "\n", + "print(\"Fit time with 1 process:\", fit_time_1_process)\n", + "print(\"Fit time with 2 processes:\", fit_time_2_processes)\n", + "print(\"Predict time with 1 process:\", predict_time_1_process)\n", + "print(\"Predict time with 2 processes:\", predict_time_2_processes)\n", + "print(\"Predictions match:\", predictions_match)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/bio_files_processor.py b/bio_files_processor.py new file mode 100644 index 0000000..8a9f12d --- /dev/null +++ b/bio_files_processor.py @@ -0,0 +1,73 @@ +import os +from typing import Optional +from dataclasses import dataclass + +def convert_multiline_fasta_to_oneline(input_fasta: str, output_fasta: Optional[str]=None) -> str: + """ + This function converts a multiline fasta file into one line fasta + Arguments: input file and output file name (optional) + Returns a file in current working directory + """ + if output_fasta == None: + output_fasta = 'output_fasta.fasta' + current_dir = str(os.getcwd()) + output_file = os.path.join(current_dir, output_fasta) + with open(input_fasta) as input_file, open(output_fasta, mode='w') as output_file: + help_list = [] + for line in input_file: + if line.startswith('>'): + if len(help_list)!=0: + output_file.write(''.join(help_list) + '\n') + help_list = [] + output_file.write(line) + else: + help_list.append(line.strip()) + if len(help_list)!=0: + output_file.write(''.join(help_list) + '\n') + return output_file + +@dataclass +class FastaRecord: + id: str + description: str + sequence: str + + def __repr__(self): + return f"id = '{self.id}'\n description = '{self.description}'\n sequence = '{self.sequence}'" + +class OpenFasta: + def __init__(self, filename): + self.filename = filename + self.file = None + + def __enter__(self): + self.file = open(self.filename, 'r') + return self + + def __exit__(self, exc_type, exc_value, traceback): + if self.file: + self.file.close() + + def read_record(self): + name_line = self.file.readline().strip() + if not name_line: + return None + parts = name_line.split(' ', 2) + if len(parts) < 2: + return None + id = parts[0][1:] + desc = parts[1] + seq = '' + line = self.file.readline().strip() + while line and not line.startswith(">"): + seq += line + line = self.file.readline().strip() + return FastaRecord(id=id, description=desc, sequence=seq) + + def read_records(self): + records = [] + record = self.read_record() + while record: + records.append(record) + record = self.read_record() + return records \ No newline at end of file diff --git a/biopython_fastq_filter.py b/biopython_fastq_filter.py new file mode 100644 index 0000000..1f525b8 --- /dev/null +++ b/biopython_fastq_filter.py @@ -0,0 +1,195 @@ +from Bio import SeqIO +from Bio.SeqUtils import GC +import requests +from bs4 import BeautifulSoup +import time +import os +from dotenv import load_dotenv +from dataclasses import dataclass +from typing import List + +load_dotenv() + + +def filter_fastq(input_path: str, quality_threshold: int, output_filename="final_filtered.fastq", gc_bounds=(40, 60), length_bounds=(30, 350)): + filename = input_path + records = SeqIO.parse(filename, "fastq") + ###quality filter + good_reads = (rec for rec in records if min(rec.letter_annotations["phred_quality"]) >= quality_threshold) + result_quality = SeqIO.write(good_reads, "good_quality.fastq", "fastq") + result_quality_GC = SeqIO.parse("good_quality.fastq", "fastq") + ###GC content filter + min_gc_content = gc_bounds[0] + max_gc_content = gc_bounds[1] + GC_quality_filt = [] + + for sequence in result_quality_GC: + if min_gc_content <= GC(sequence.seq) <= max_gc_content: + GC_quality_filt.append(sequence) + + result_quality = SeqIO.write(GC_quality_filt, "good_quality_GC.fastq", "fastq") + result_quality_GC_length = SeqIO.parse("good_quality_GC.fastq", "fastq") + + ##length filter + filtered_GC_quality_length = [] + + for sequence in result_quality_GC_length: + if len(sequence.seq) >= length_bounds[0] and len(sequence.seq) <= length_bounds[1]: + filtered_GC_quality_length.append(sequence) + + result_quality = SeqIO.write(filtered_GC_quality_length, output_filename, "fastq") + + return result_quality + + +from abc import ABC, abstractmethod + +class InvalidInputError(ValueError): + pass + +class BiologicalSequence(ABC, str): + @abstractmethod + def __init__(self, seq): + self.seq = seq + + def __len__(self): + return len(self.seq) + + def __getitem__(self, index): + return self.seq[int(index)] + + def __repr__(self): + return __str__(self.seq) + + def check_nucleic_acid(self): + unique_chars = set(self.seq) + nucleotides_dna = set('ATGCatgc') + nucleotides_rna = set('AUGCaugc') + if unique_chars <= nucleotides_dna: + seq = 'dna' + elif unique_chars <= nucleotides_rna: + seq = 'rna' + else: + raise InvalidInputError() + return seq_type + +class NucleicAcidSequence(BiologicalSequence): + complement_dict = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', 'a': 't', 't': 'a', 'g': 'c', 'c': 'g'} + def __init__(self, seq): + super().__init__(seq) + self.check_nucleic_acid() + self.length = len(self.seq) + + def complement(self): + list_input = list(self.seq) + for i in range(len(self.seq)): + if list_input[i] in self.complement_dict: + list_input[i] = self.complement_dict[list_input[i]] + return "".join(list_input) + +class DNASequence(NucleicAcidSequence): + complement_dict = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', 'a': 't', 't': 'a', 'g': 'c', 'c': 'g'} + def __init__(self, seq): + super().__init__(seq) + self.complement() + + def transcribe(self): + list_input = list(self.seq) + for i in range(len(self.seq)): + if (list_input[i] == 'T'): + list_input[i] = 'U' + elif (list_input[i] == 't'): + list_input[i]='u' + return "".join(list_input) + +class RNASequence(NucleicAcidSequence): + complement_dict = {'A': 'U', 'U': 'A', 'G': 'C', 'C': 'G', 'a': 'u', 'u': 'a', 'g': 'c', 'c': 'g'} + def __init__(self, seq): + super().__init__(seq) + self.complement() + +class AminoAcidSequence(BiologicalSequence): + def __init__(self, seq): + self.seq = seq + + def amino_acid_frequency(self): + """Calculates molecular weight of a protein + Arguments: + - seq (str) 1-letter coded protein sequence + Return: + - int, molecular weight (g/mol) rounded to integer""" + unique_aa = set(self.seq) + freq_dict = {} + amino_acids = set('GAVLITSMCPFYWHKRDENQgavlitsmcpfywhkrdenq') + if unique_aa <= amino_acids: + seq = 'peptide' + else: + raise InvalidInputError() + for letter in self.seq: + if letter in freq_dict: + freq_dict[letter] += 1 + else: + freq_dict[letter] = 1 + for letter in freq_dict: + freq_dict[letter] = round(freq_dict[letter] / len(self.seq) * 100, 2) + return freq_dict + +token = os.environ.get('TG_API_TOKEN') + +def send_telegram_message(chat_id: str, message:str): + """this function uses the bot token and a message generated in telegram_logger function + and sends this message through the telegram bot""" + url = f"https://api.telegram.org/bot{token}/sendMessage" + data = {"chat_id": chat_id, "text": message, "parse_mode": "Markdown"} + response = requests.post(url, data=data) + return response.json() + +def telegram_logger(chat_id: str): + """this function is a decorator that times a function and generates a message + regarding its execution result""" + def decorator(func): + def inner_func(*args, **kwargs): + start = time.time() + try: + result = func(*args, **kwargs) + end = time.time() + execution_time = end - start + if execution_time < 86400: + time_str = time.strftime('%H:%M:%S.%f', time.gmtime(execution_time))[:-3] + message = f"🎉Function `{func.__name__}` has finished in `{time_str}` " + else: + days = int(execution_time // 86400) + time_str = str(timedelta(seconds=execution_time)) + message = f"Function `{func.__name__}` has finished in `{days}` days, `{str(timedelta(seconds=execution_time))}` " + send_telegram_message(chat_id, message) + return result + except Exception as error: + message = f"😞Function `{func.__name__}` failed with an exception:\nType: `{type(error).__name__}`\nError: `{str(error)}` " + send_telegram_message(chat_id, message) + return inner_func + return decorator + +@dataclass +class GenscanOutput: + status: str + cds_list: List[str] + intron_list: List[dict] + exon_list: List[dict] + +def run_genscan(sequence=None, sequence_file=None, organism="Vertebrate", exon_cutoff=1.00, sequence_name=""): + url = "http://argonaute.mit.edu/cgi-bin/genscanw_py.cgi" + + if sequence_file: + with open(sequence_file, 'rb') as file: + sequence = file.read().strip() + data = { + "-o": organism, + "-e": exon_cutoff, + "-n": sequence_name, + "-p": "Predicted peptides only", + "-u": sequence_file, + "-s": sequence + } + response = requests.post(url, data=data) + status = response.status_code + return response.content \ No newline at end of file diff --git a/custom_random_forest.py b/custom_random_forest.py new file mode 100644 index 0000000..3d27abd --- /dev/null +++ b/custom_random_forest.py @@ -0,0 +1,78 @@ +import multiprocessing +import random +import numpy as np +from sklearn.base import BaseEstimator +import time +from sklearn.datasets import make_classification +from sklearn.ensemble import (RandomForestClassifier, + ExtraTreesClassifier, + VotingClassifier) +from sklearn.tree import (DecisionTreeRegressor, + DecisionTreeClassifier) + +SEED = 111 +random.seed(SEED) +np.random.seed(SEED) + +import multiprocessing +from sklearn.base import BaseEstimator + +class RandomForestClassifierCustom(BaseEstimator): + def __init__(self, n_estimators=10, max_depth=None, max_features=None, random_state=SEED): + self.n_estimators = n_estimators + self.max_depth = max_depth + self.max_features = max_features + self.random_state = random_state + self.trees = [] + self.feat_ids_by_tree = [] + + def fit(self, X, y, n_processes=1): + self.classes_ = sorted(np.unique(y)) + def fit_tree_process(i, queue): + np.random.seed(self.random_state + i) + feat_ids = np.random.choice(range(X.shape[1]), size=self.max_features, replace=False) + pseudo_ids = np.random.choice(range(X.shape[0]), size=X.shape[0], replace=True) + pseudo_X = X[pseudo_ids, :][:, feat_ids] + pseudo_y = y[pseudo_ids] + dt_clf = DecisionTreeClassifier(max_depth=self.max_depth, + max_features=self.max_features, + random_state=self.random_state + i) + dt_clf.fit(pseudo_X, pseudo_y) + queue.put((dt_clf, feat_ids)) + queue = multiprocessing.Queue() + processes = [] + for i in range(self.n_estimators): + p = multiprocessing.Process(target=fit_tree_process, args=(i, queue)) + processes.append(p) + p.start() + results = [] + for _ in range(self.n_estimators): + results.append(queue.get()) + for p in processes: + p.join() + self.trees, self.feat_ids_by_tree = zip(*results) + return self + + def predict_proba(self, X, n_processes=1): + def predict_proba_process(tree, feat_ids, queue): + proba = tree.predict_proba(X[:, feat_ids]) + queue.put(proba) + queue = multiprocessing.Queue() + processes = [] + for tree, feat_ids in zip(self.trees, self.feat_ids_by_tree): + p = multiprocessing.Process(target=predict_proba_process, args=(tree, feat_ids, queue)) + processes.append(p) + p.start() + probas = [] + for _ in range(self.n_estimators): + probas.append(queue.get()) + + for p in processes: + p.join() + + return sum(probas) / self.n_estimators + + def predict(self, X, n_processes=1): + probas = self.predict_proba(X, n_processes=n_processes) + return np.argmax(probas, axis=1) + \ No newline at end of file diff --git a/data/example_fasta.fasta b/data/example_fasta.fasta new file mode 100644 index 0000000..ae5498d --- /dev/null +++ b/data/example_fasta.fasta @@ -0,0 +1,18 @@ +>GTD323452 5S_rRNA NODE_272_length_223_cov_0.720238:18-129(+) +ACGGCCATAGGACTTTGAAAGCACCGCATCCCGTCCGATCTGCGAAGTTAACCAAGATGCCGCCTGGTTAGTACCATGGTGGGGGACCACATGGGAATCCCT +GGTGCTGTG +>GTD678345 16S_rRNA NODE_80_length_720_cov_1.094737:313-719(+) +TTGGCTTCTTAGAGGGACTTTTGATGTTTAATCAAAGGAAGTTTGAGGCAATAACAGGTCTGTGATGCCCTTAGATGTTCTGGGCCGCACGCGCGCTACACT +GAGCCCTTGGGAGTGGTCCATTTGAGCCGGCAACGGCACGTTTGGACTGCAAACTTGGGCAAACTTGGTCATTTAGAGGAAGTAAAAGTCGTAACAAGGT +>GTD174893 16S_rRNA NODE_1_length_2558431_cov_75.185164:2153860-2155398(+) +TTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAACGGTAACAGGAAACAGCTTGCTGTTTCGCTGACGAGTGG +GAAGTAGGTAGCTTAACCTTCGGGAGGGCGCTTACCACTTTGTGATTCATGACTGGGGTGAAGTCGTAACAAGGTAACCGTAGGGGAACCTGCGGTTGGATC +ACCTCCTT +>GTD906783 16S_rRNA NODE_1_length_2558431_cov_75.185164:793941-795479(-) +TTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAACGGTAACAGGAAACAGCTTGCTGTTTCGCTGACGAGTGG +GAAGTAGGTAGCTTAACCTTCGGGAGGGCGCTTACCACTTTGTGATTCATGACTGGGGTGAAGTCGTAACAAGGTAACCGTAGGGGAACCTGCGGTTGGATC +ACCTCCTT +>GTD129563 16S_rRNA NODE_4_length_428221_cov_75.638017:281055-282593(-) +CGGACGGGTGAGTAATGTCTGGGAAACTGCCTGATGGAGGGGGATAACTACTGGAAACGGTAGCTAATACCGCATAACGTCGCAAGACCAAAGAGGGGGACC +GAAGTAGGTAGCTTAACCTTCGGGAGGGCGCTTACCACTTTGTGATTCATGACTGGGGTGAAGTCGTAACAAGGTAACCGTAGGGGAACCTGCGGTTGGATC +ACCTCCTT \ No newline at end of file diff --git a/data/example_fasta.fasta:Zone.Identifier b/data/example_fasta.fasta:Zone.Identifier new file mode 100644 index 0000000..1bf0b28 --- /dev/null +++ b/data/example_fasta.fasta:Zone.Identifier @@ -0,0 +1,3 @@ +[ZoneTransfer] +ZoneId=3 +HostUrl=https://github.com/ diff --git a/data/example_fastq.fastq b/data/example_fastq.fastq new file mode 100644 index 0000000..883b51f --- /dev/null +++ b/data/example_fastq.fastq @@ -0,0 +1,356 @@ +@SRX079804:1:SRR292678:1:1101:21885:21885 1:N:0:1 BH:ok +ACAGCAACATAAACATGATGGGATGGCGTAAGCCCCCGAGATATCAGTTTACCCAGGATAAGAGATTAAATTATGAGCAACATTATTAA ++SRX079804:1:SRR292678:1:1101:21885:21885 1:N:0:1 BH:ok +FGGGFGGGFGGGFGDFGCEBB@CCDFDDFFFFBFFGFGEFDFFFF;D@DD>C@DDGGGDFGDGG?GFGFEGFGGEF@FDGGGFGFBGGD +@SRX079804:1:SRR292678:1:1101:24563:24563 1:N:0:1 BH:failed +ATTAGCGAGGAGGAGTGCTGAGAAGATGTCGCCTACGCCGTTGAAATTCCCTTCAATCAGGGGGTACTGGAGGATACGAGTTTGTGTG ++SRX079804:1:SRR292678:1:1101:24563:24563 1:N:0:1 BH:failed +BFFFFFFFB@B@A<@D>BDDACDDDEBEDEFFFBFFFEFFDFFF=CC@DDFD8FFFFFFF8/+.2,@7<<:?B/:<><-><@.A*C>D +@SRX079804:1:SRR292678:1:1101:30161:30161 1:N:0:1 BH:failed +GAACGACAGCAGCTCCTGCATAACCGCGTCCTTCTTCTTTAGCGTTGTGCAAAGCATGTTTTGTATTACGGGCATCTCGAGCGAATC ++SRX079804:1:SRR292678:1:1101:30161:30161 1:N:0:1 BH:failed +DFFFEGDGGGGFGGEDCCDCEFFFFCCCCCB>CEBFGFBGGG?DE=:6@=>AD?D8DCEE:>EEABE5D@5:DDCA;EEE-DCD +@SRX079804:1:SRR292678:1:1101:47176:47176 1:N:0:1 BH:failed +TGAAGCGTCGATAGAAGTTAGCAAACCCGCGGAACTTCCGTACATCAGACACATTCCGGGGGGTGGGCCAATCCATGATGCCTTTG ++SRX079804:1:SRR292678:1:1101:47176:47176 1:N:0:1 BH:failed +FF@FFBEEEEFFEFFD@EDEFFB=DFEEFFFE8FFE8EEDBFDFEEBE+E