From 0ca259cf2b2021a742d7832a2452f7818696d882 Mon Sep 17 00:00:00 2001 From: kpotoh Date: Mon, 25 Nov 2024 18:19:04 +0200 Subject: [PATCH 01/14] new states classes --- pymutspec/_legacy/states.py | 298 +++++++++++ pymutspec/io/__init__.py | 9 +- pymutspec/io/auxiliary.py | 68 +-- pymutspec/io/gb.py | 67 +++ pymutspec/io/states.py | 708 ++++++++++++++++---------- scripts/collect_mutations_parallel.py | 521 +++++++++++++++++++ 6 files changed, 1335 insertions(+), 336 deletions(-) create mode 100644 pymutspec/_legacy/states.py create mode 100644 pymutspec/io/gb.py create mode 100644 scripts/collect_mutations_parallel.py diff --git a/pymutspec/_legacy/states.py b/pymutspec/_legacy/states.py new file mode 100644 index 0000000..a1dacb4 --- /dev/null +++ b/pymutspec/_legacy/states.py @@ -0,0 +1,298 @@ +import os +import sys +import sqlite3 +from collections import defaultdict +from typing import List + +import numpy as np +import pandas as pd +from Bio import SeqIO +import tqdm + + + +class OldGenesStates: + """ + Load genome (gene) states files and access gene by record name + + tips: + - use mode="dict" if your mtDNA tree is small (~1500 nodes x 10,000 nucleotides require ~350MB of RAM); + big trees with 100,000 nodes require tens of GB of RAM, therefore use mode="db" + + Arguments + --------- + path_states: list or iterable of string + files with states; if some records not presented in all files table will be filled by '-' (gaps) + TODO + states_fmt: string + format of files aligment; supported any format from biopython (fasta, phylip, etc.) + + Return + --------- + states_obj: GenesStates - + + """ + def __init__( + self, path_states: List[str], path_to_db=None, mode="dict", rewrite=False, + use_proba=True, path_to_rates=None, cat_cutoff=1, states_fmt="table", + ): + fmt_variants = ["fasta", "phylip", "table"] + if states_fmt not in fmt_variants: + raise ValueError(f"Appropriate states_fmt: {repr(fmt_variants)}") + if states_fmt != "table" and use_proba == True: + print("Ignore use_proba option and forcely set use_proba=False due to input alignment that doesn't contain probabilities", file=sys.stderr) + use_proba = False + + self.mode = mode + self.input_states_fmt = states_fmt + self.use_proba = use_proba + print(f"Genome states storage mode = '{mode}'", file=sys.stderr) + self.path_to_db = path_to_db + if mode == "dict": + if states_fmt == "table": + self._prepare_node2genome(path_states) + else: + states = self.read_alignment(path_states) + self._prepare_node2genome(states) + elif mode == "db": + if states_fmt in ["fasta", "phylip"]: + raise NotImplementedError + + if path_to_db is None: + raise ValueError("Pass the path to database to use or write it") + else: + self._prepare_db(path_states, rewrite) + else: + raise ValueError("Mode must be 'dict' or 'db'") + + genes_sizes = {part: len(x) for part, x in self.get_random_genome().items()} + self.genome_size = sum(genes_sizes.values()) + self.category = self.read_rates(path_to_rates) if path_to_rates else None + self.mask = None + + if self.category: + for part in genes_sizes: + if genes_sizes[part] != len(self.category[part]): + raise RuntimeError("Wrong rates: number of positions in alignment are not equal to number of rates", file=sys.stderr) + self.mask = self.get_mask(self.category, cat_cutoff) + + def get_genome(self, node: str): + if self.mode == "dict": + return self.node2genome[node] + + elif self.mode == "db": + genome_raw = defaultdict(list) + cur = self.con.cursor() + if self.use_proba: + query = f"""SELECT Part, Site, p_A, p_C, p_G, p_T FROM states WHERE Node='{node}'""" + dtype = [ + ("Site", np.int32), + ("p_A", np.float32), ("p_C", np.float32), + ("p_G", np.float32), ("p_T", np.float32), + ] + else: + query = f"""SELECT Part, Site, State FROM states WHERE Node='{node}'""" + dtype = [("Site", np.int32), ("State", np.object_)] + + for row in cur.execute(query): + part = str(row[0]) + state = row[1:] + genome_raw[part].append(state) + + genome = {} + for part, state_tup in genome_raw.items(): + state = np.array(state_tup, dtype=dtype) + state_sorted = np.sort(state, order="Site") + state_sorted_devoid = np.array(list(map(list, state_sorted))) + state = state_sorted_devoid[:, 1:] if self.use_proba else state_sorted_devoid[:, 1] + genome[part] = state + return genome + + else: + raise ValueError("mode must be 'dict' or 'db'") + + def get_random_genome(self): + node = next(iter(self.nodes)) + return self.get_genome(node) + + def _prepare_db(self, path_states, rewrite=False): + """sequentially read tsv and write to db""" + done = False + if os.path.exists(self.path_to_db): + if rewrite: + os.remove(self.path_to_db) + else: + print(f"Loading existing database from {self.path_to_db}", file=sys.stderr) + done = True + try: + con = sqlite3.connect(self.path_to_db) + except: + raise ValueError("Cannot connect to database by given path") + + cur = con.cursor() + self.con = con + if done: + self.nodes = self._get_nodes_from_db() + return + + print("Writing new database file", file=sys.stderr) + cur.execute('''CREATE TABLE states + (Node TEXT, Part INTEGER, Site INTEGER, State TEXT, p_A REAL, p_C REAL, p_G REAL, p_T REAL)''' + ) + for path in path_states: + if path is None: + continue + print(f"Loading {path} ...", file=sys.stderr) + handle = open(path, "r") + header = handle.readline().strip() + if header != "Node\tPart\tSite\tState\tp_A\tp_C\tp_G\tp_T": + handle.close() + con.close() + raise ValueError(f"Inappropriate type of table, expected another columns order,\ngot {repr(header)}") + + for line in tqdm.tqdm(handle, total=8652300): # TODO estimate total + row = line.strip().split() + query = "INSERT INTO states VALUES ('{}',{},{},'{}',{},{},{},{})".format(*row) + cur.execute(query) + + con.commit() + handle.close() + + self.nodes = self._get_nodes_from_db() + # con.close() + + def _get_nodes_from_db(self): + nodes = set() + cur = self.con.cursor() + for node in cur.execute("SELECT DISTINCT Node from states"): + nodes.add(node[0]) + return nodes + + def states2dct(self, states: pd.DataFrame, out=None): + # all genes (genomes) must have same length + aln_sizes = states.groupby("Node").apply(len) + assert aln_sizes.nunique() == 1, "uncomplete state table: some site states absent in some node genomes" + + node2genome = defaultdict(dict) if out is None else out + gr = states.sort_values(["Node", "Part", "Site"]).groupby(["Node", "Part"]) + for (node, part), gene_df in gr: + if self.use_proba: + gene_states = gene_df[["p_A", "p_C", "p_G", "p_T"]].values + else: + gene_states = gene_df.State.values + node2genome[node][part] = gene_states + return node2genome + + def _prepare_node2genome(self, path_states, states_dtype=np.float32): + node2genome = defaultdict(dict) + if isinstance(path_states, pd.DataFrame): + states = path_states + self.states2dct(states, node2genome) + elif isinstance(path_states, (list, tuple)): + dtypes = { + "p_A": states_dtype, "p_C": states_dtype, + "p_G": states_dtype, "p_T": states_dtype, + "Site": np.int32, "Node": str, "Part": str, + } + if self.use_proba: + usecols = ["Node", "Part", "Site", "p_A", "p_C", "p_G", "p_T"] + else: + usecols = ["Node", "Part", "Site", "State"] + + for path in path_states: + if path is None: + continue + print(f"Loading {path}...", file=sys.stderr) + states = pd.read_csv(path, sep="\t", comment='#', usecols=usecols, dtype=dtypes) + self.states2dct(states, node2genome) + + # aln_sizes = states.groupby("Node").apply(len) + # assert aln_sizes.nunique() == 1, "uncomplete leaves state table" + # states = states.sort_values(["Node", "Part", "Site"]) + # gr = states.groupby(["Node", "Part"]) + # for (node, part), gene_pos_ids in tqdm.tqdm(gr.groups.items()): + # gene_df = states.loc[gene_pos_ids] # TODO simplify: iterate over gr + # if self.use_proba: + # gene_states = gene_df[["p_A", "p_C", "p_G", "p_T"]].values + # else: + # gene_states = gene_df.State.values + # node2genome[node][part] = gene_states + else: + + raise NotImplementedError('path_to_states must be list of paths or pd.DataFrame') + + self.node2genome = node2genome + self.nodes = set(node2genome.keys()) + + @staticmethod + def read_alignment(files: list, fmt="fasta"): + """ + Read files alignments and prepare states table + + Arguments + --------- + files: list or iterable of string + files with alignments; if some records not presented in all files table will be filled by '-' (gaps) + fmt: string + format of files aligment; supported any format from biopython (fasta, phylip, etc.) + + Return + --------- + states: pd.DataFrame + """ + ngenes = len(files) + columns = "Node Part Site State p_A p_C p_G p_T".split() + nucls = "ACGT" + aln_lens = dict() + files = set(files) + history = defaultdict(list) + data = [] + for filepath in files: + part = "1" if ngenes == 1 else os.path.basename(filepath).replace(".fna", "") + alignment = SeqIO.parse(filepath, fmt) + for rec in alignment: + node = rec.name + history[node].append(part) + seq = str(rec.seq) + for site, state in enumerate(seq, 1): + site_data = [node, part, site, state] + for nucl in nucls: + p = int(nucl == state) + site_data.append(p) + data.append(site_data) + aln_lens[part] = len(seq) + + if ngenes > 1: + # get max set of parts. Only for multiple files + for node, parts in history.items(): + if len(parts) == ngenes: + full_parts = parts.copy() + break + + # fill missing genes by '-'. Only for multiple files + for node, parts in history.items(): + if len(parts) != ngenes: + unseen_parts = set(full_parts).difference(parts) + for unp in unseen_parts: + print(f"Gap filling for node {node}, part {unp}...", file=sys.stderr) + for site in range(1, aln_lens[unp] + 1): + site_data = [node, unp, site, "-", 0, 0, 0, 0] + data.append(site_data) + + df = pd.DataFrame(data, columns=columns) + return df + + @staticmethod + def read_rates(path: str): + """ TODO write for many genes """ + category = read_rates(path) + category = {"1": category} # for each aln part + return category + + @staticmethod + def get_mask(category: dict, cat_cutoff=1): + mask = {part: np.array(cats) >= cat_cutoff for part, cats in category.items()} + return mask + + def close_db(self): + self.con.close() diff --git a/pymutspec/io/__init__.py b/pymutspec/io/__init__.py index 6c4a9d1..e91a905 100644 --- a/pymutspec/io/__init__.py +++ b/pymutspec/io/__init__.py @@ -1,6 +1,9 @@ -from .auxiliary import get_aln_files, load_scheme, read_genbank_ref -from .states import GenesStates, read_rates +from .auxiliary import get_aln_files, load_scheme +from .gb import read_genbank_ref +from .states import GenomeStates, GenomeStatesTotal, read_rates, read_alignment __all__ = [ - "get_aln_files", "load_scheme", "read_genbank_ref", "GenesStates", "read_rates", + 'GenomeStates', 'GenomeStatesTotal', + 'read_alignment', 'read_genbank_ref', 'read_rates', + 'get_aln_files', 'load_scheme', ] diff --git a/pymutspec/io/auxiliary.py b/pymutspec/io/auxiliary.py index 20dc86a..2818112 100644 --- a/pymutspec/io/auxiliary.py +++ b/pymutspec/io/auxiliary.py @@ -1,11 +1,6 @@ import os import re -from typing import Dict, Union - -import numpy as np -import pandas as pd -from Bio.SeqRecord import SeqRecord -from Bio import SeqIO +from typing import Dict def load_scheme(path: str) -> Dict[str, str]: @@ -28,64 +23,3 @@ def get_aln_files(path: str): [os.path.join(path, x) for x in raw_files if x.endswith(".fna")] ) return files - - -def read_genbank_ref(gb: Union[str, SeqRecord]): - if isinstance(gb, str): - genome = next(SeqIO.parse(gb, "genbank")) - elif isinstance(gb, SeqRecord): - genome = gb - else: - raise NotImplementedError - - ftypes = {"CDS", "rRNA", "tRNA"} - full_nucls = set("ACGT") - data = [] - df: pd.DataFrame = None - gene_qualifier = None - for ftr in genome.features: - if ftr.type == "source": - source = ftr.extract(genome) - seq = str(source.seq) - for pos, nuc in enumerate(seq): - context = seq[pos - 1: pos + 2] - if len(context) < 3 or len(set(context).difference(full_nucls)) != 0: - context = None - if nuc not in full_nucls: - nuc = context = None - data.append({"Pos": pos + 1, "Nuc": nuc, "Context": context}) - df = pd.DataFrame(data) - df["Strand"] = 0 - continue - elif gene_qualifier is None and ftr.type in ftypes: - for qualifier in ["gene", "product", "protein_id"]: - if qualifier in ftr.qualifiers: - gene_qualifier = qualifier - break - if gene_qualifier is None: - raise RuntimeError(f"Cannot find any expected qualifier of feature: {ftr}; with following qualifiers: {ftr.qualifiers}") - - for pos in list(ftr.location): - df.at[pos, "Type"] = ftr.type - df.at[pos, "Strand"] = ftr.strand - if ftr.type in ftypes: - df.at[pos, qualifier] = ftr.qualifiers[qualifier][0] - - # add codon features - df["PosInGene"] = -1 - df["PosInCodon"] = -1 - for gene_name in df[(df.Type == "CDS") & (df.Strand == 1)][gene_qualifier].unique(): - gdf = df[df[gene_qualifier] == gene_name] - seq = gdf.Nuc.values - for pos_in_gene, pos in enumerate(gdf.index): - pic = pos_in_gene % 3 - cdn = seq[pos_in_gene - pic: pos_in_gene - pic + 3] - cdn = "".join(cdn) if len(set(cdn).difference(full_nucls)) == 0 else None - df.at[pos, "Codon"] = cdn - df.at[pos, "PosInGene"] = pos_in_gene + 1 - df.at[pos, "PosInCodon"] = pic + 1 - - df["Strand"] = df["Strand"].astype(np.int8) - df["PosInCodon"] = df["PosInCodon"].astype(np.int8) - df["PosInGene"] = df["PosInGene"].astype(np.int32) - return df diff --git a/pymutspec/io/gb.py b/pymutspec/io/gb.py new file mode 100644 index 0000000..d576831 --- /dev/null +++ b/pymutspec/io/gb.py @@ -0,0 +1,67 @@ +from typing import Union + +import numpy as np +import pandas as pd +from Bio.SeqRecord import SeqRecord +from Bio import SeqIO + + +def read_genbank_ref(gb: Union[str, SeqRecord]): + if isinstance(gb, str): + genome = next(SeqIO.parse(gb, "genbank")) + elif isinstance(gb, SeqRecord): + genome = gb + else: + raise NotImplementedError + + ftypes = {"CDS", "rRNA", "tRNA"} + full_nucls = set("ACGT") + data = [] + df: pd.DataFrame = None + gene_qualifier = None + for ftr in genome.features: + if ftr.type == "source": + source = ftr.extract(genome) + seq = str(source.seq) + for pos, nuc in enumerate(seq): + context = seq[pos - 1: pos + 2] + if len(context) < 3 or len(set(context).difference(full_nucls)) != 0: + context = None + if nuc not in full_nucls: + nuc = context = None + data.append({"Pos": pos + 1, "Nuc": nuc, "Context": context}) + df = pd.DataFrame(data) + df["Strand"] = 0 + continue + elif gene_qualifier is None and ftr.type in ftypes: + for qualifier in ["gene", "product", "protein_id"]: + if qualifier in ftr.qualifiers: + gene_qualifier = qualifier + break + if gene_qualifier is None: + raise RuntimeError(f"Cannot find any expected qualifier of feature: {ftr}; with following qualifiers: {ftr.qualifiers}") + + for pos in list(ftr.location): + df.at[pos, "Type"] = ftr.type + df.at[pos, "Strand"] = ftr.strand + if ftr.type in ftypes: + df.at[pos, qualifier] = ftr.qualifiers[qualifier][0] + + # add codon features + df["PosInGene"] = -1 + df["PosInCodon"] = -1 + for gene_name in df[(df.Type == "CDS") & (df.Strand == 1)][gene_qualifier].unique(): + gdf = df[df[gene_qualifier] == gene_name] + seq = gdf.Nuc.values + for pos_in_gene, pos in enumerate(gdf.index): + pic = pos_in_gene % 3 + cdn = seq[pos_in_gene - pic: pos_in_gene - pic + 3] + cdn = "".join(cdn) if len(set(cdn).difference(full_nucls)) == 0 else None + df.at[pos, "Codon"] = cdn + df.at[pos, "PosInGene"] = pos_in_gene + 1 + df.at[pos, "PosInCodon"] = pic + 1 + + df["Strand"] = df["Strand"].astype(np.int8) + df["PosInCodon"] = df["PosInCodon"].astype(np.int8) + df["PosInGene"] = df["PosInGene"].astype(np.int32) + return df diff --git a/pymutspec/io/states.py b/pymutspec/io/states.py index 26f609a..407e1b3 100644 --- a/pymutspec/io/states.py +++ b/pymutspec/io/states.py @@ -1,302 +1,420 @@ import os import sys +import random import sqlite3 from collections import defaultdict from typing import List -from unicodedata import category +import tqdm import numpy as np import pandas as pd from Bio import SeqIO -import tqdm +from ..utils import basic_logger -class GenesStates: - """ - Load genome (gene) states files and access gene by record name +class GenomeStates: + def __init__(self, path_to_states, path_to_gappy_sites=None, format='tsv', logger=None): + for path in list([path_to_states, path_to_gappy_sites]): + if not os.path.exists(path): + raise ValueError(f"Path to states doesn't exist: '{path}'") + + self.logger = logger or basic_logger() + + self.logger.info(f'Reading states "{path_to_states}"') + states = self.read_states(path, format) + + nodes_arr = states['Node'].unique() + self.node2id = dict(zip(nodes_arr, range(len(nodes_arr)))) + self.nodes = set(nodes_arr) + self.logger.info(f'Loaded {len(self.nodes)} genomes') + + states['NodeId'] = states['Node'].map(self.node2id) + if len(self.nodes) < 2 ** 31 - 1: + states['NodeId'] = states['NodeId'].astype(np.int32) + + states.drop('Node', axis=1, inplace=True) + self.logger.debug('Node column deleted') + + states.set_index('NodeId', inplace=True) + self.logger.debug('States reindexed') + + self.genome_size = len(states.loc[0]) + self.logger.info(f'Total alignement size = {self.genome_size}') + + self.nongappy_sites = self.read_nongappy_sites(path_to_gappy_sites) + self.states = states + self.logger.info('States initialiaztion is done') + + def __contains__(self, item: str): + return item in self.nodes + + def __getitem__(self, item: str): + return self.get_genome(item) + + def read_states(self, path, fmt='iqtree'): + fpt = np.float32 + dtype = { + "p_A": fpt, "p_C": fpt, "p_G": fpt, "p_T": fpt, + "Site": np.int32, "Node": str, + } + if fmt == 'iqtree': + states = pd.read_csv(path, sep='\t', comment='#', dtype=dtype) + if fmt == 'csv': + states = pd.read_csv(path, dtype=dtype) + elif fmt == 'tsv': + states = pd.read_csv(path, sep='\t', dtype=dtype) + elif fmt == 'parquet': + states = pd.read_parquet(path) + return states + + def read_nongappy_sites(self, path): + '''path must contain list of 1-based sites of alignment that must be excluded from loaded alignment + + return array of sorted non-gappy sites in 1-based mode + ''' + if path is None: + nongappy_sites = np.arange(1, self.genome_size + 1) + self.logger.debug('Use all alignment sites') + else: + self.logger.debug(f'Reading gappy sites "{path}"') + gappy_sites = set(pd.read_csv(path, header=None).values.flatten().tolist()) + nongappy_sites = np.array([i for i in range(1, self.genome_size + 1) \ + if i not in gappy_sites]) + self.logger.debug(f'Number of used (non-gappy) sites = {len(nongappy_sites)}') + return nongappy_sites + + def get_genome(self, node: str) -> pd.DataFrame: + if node not in self.nodes: + raise ValueError(f'Input node "{node}" have no genome') + node_id = self.node2id[node] + genome = self.states.loc[node_id].set_index('Site').loc[self.nongappy_sites] + return genome + + +class GenomeStatesTotal(): + def __init__(self, path_to_states1: str, path_to_states2: str, path_to_gappy_sites: str): + gs1 = GenomeStates(path_to_states1, path_to_gappy_sites) + gs2 = GenomeStates(path_to_states2, path_to_gappy_sites) + + assert gs1.genome_size == gs2.genome_size + self.genome_size = gs1.genome_size + + self.nodes = gs1.nodes.union(gs2.nodes) + self.gs1 = gs1 + self.gs2 = gs2 + + def __contains__(self, item): + return item in self.nodes + + def __getitem__(self, item): + if item in self.gs1: + genome = self.gs1[item] + elif item in self.gs2: + genome = self.gs2[item] + else: + raise ValueError('node does not exist') + return genome + + def get_genome(self, node: str) -> pd.DataFrame: + if node in self.gs1: + genome = self.gs1[node] + elif node in self.gs2: + genome = self.gs2[node] + else: + raise ValueError('node does not exist') + return genome + + def get_random_genome(self) -> pd.DataFrame: + node = random.choice(list(self.nodes)) + return self[node] + + +# class GenesStates: +# """ +# Load genome (gene) states files and access gene by record name - tips: - - use mode="dict" if your mtDNA tree is small (~1500 nodes x 10,000 nucleotides require ~350MB of RAM); - big trees with 100,000 nodes require tens of GB of RAM, therefore use mode="db" +# tips: +# - use mode="dict" if your mtDNA tree is small (~1500 nodes x 10,000 nucleotides require ~350MB of RAM); +# big trees with 100,000 nodes require tens of GB of RAM, therefore use mode="db" - Arguments - --------- - path_states: list or iterable of string - files with states; if some records not presented in all files table will be filled by '-' (gaps) - TODO - states_fmt: string - format of files aligment; supported any format from biopython (fasta, phylip, etc.) +# Arguments +# --------- +# path_states: list or iterable of string +# files with states; if some records not presented in all files table will be filled by '-' (gaps) +# TODO +# states_fmt: string +# format of files aligment; supported any format from biopython (fasta, phylip, etc.) - Return - --------- - states_obj: GenesStates - +# Return +# --------- +# states_obj: GenesStates - - """ - def __init__( - self, path_states: List[str], path_to_db=None, mode="dict", rewrite=False, - use_proba=True, path_to_rates=None, cat_cutoff=1, states_fmt="table", - ): - fmt_variants = ["fasta", "phylip", "table"] - if states_fmt not in fmt_variants: - raise ValueError(f"Appropriate states_fmt: {repr(fmt_variants)}") - if states_fmt != "table" and use_proba == True: - print("Ignore use_proba option and forcely set use_proba=False due to input alignment that doesn't contain probabilities", file=sys.stderr) - use_proba = False +# """ +# def __init__( +# self, path_states: List[str], path_to_db=None, mode="dict", rewrite=False, +# use_proba=True, path_to_rates=None, cat_cutoff=1, states_fmt="table", +# ): +# fmt_variants = ["fasta", "phylip", "table"] +# if states_fmt not in fmt_variants: +# raise ValueError(f"Appropriate states_fmt: {repr(fmt_variants)}") +# if states_fmt != "table" and use_proba == True: +# print("Ignore use_proba option and forcely set use_proba=False due to input alignment that doesn't contain probabilities", file=sys.stderr) +# use_proba = False - self.mode = mode - self.input_states_fmt = states_fmt - self.use_proba = use_proba - print(f"Genome states storage mode = '{mode}'", file=sys.stderr) - self.path_to_db = path_to_db - if mode == "dict": - if states_fmt == "table": - self._prepare_node2genome(path_states) - else: - states = self.read_alignment(path_states) - self._prepare_node2genome(states) - elif mode == "db": - if states_fmt in ["fasta", "phylip"]: - raise NotImplementedError +# self.mode = mode +# self.input_states_fmt = states_fmt +# self.use_proba = use_proba +# print(f"Genome states storage mode = '{mode}'", file=sys.stderr) +# self.path_to_db = path_to_db +# if mode == "dict": +# if states_fmt == "table": +# self._prepare_node2genome(path_states) +# else: +# states = self.read_alignment(path_states) +# self._prepare_node2genome(states) +# elif mode == "db": +# if states_fmt in ["fasta", "phylip"]: +# raise NotImplementedError - if path_to_db is None: - raise ValueError("Pass the path to database to use or write it") - else: - self._prepare_db(path_states, rewrite) - else: - raise ValueError("Mode must be 'dict' or 'db'") +# if path_to_db is None: +# raise ValueError("Pass the path to database to use or write it") +# else: +# self._prepare_db(path_states, rewrite) +# else: +# raise ValueError("Mode must be 'dict' or 'db'") - genes_sizes = {part: len(x) for part, x in self.get_random_genome().items()} - self.genome_size = sum(genes_sizes.values()) - self.category = self.read_rates(path_to_rates) if path_to_rates else None - self.mask = None - - if self.category: - for part in genes_sizes: - if genes_sizes[part] != len(self.category[part]): - raise RuntimeError("Wrong rates: number of positions in alignment are not equal to number of rates", file=sys.stderr) - self.mask = self.get_mask(self.category, cat_cutoff) +# genes_sizes = {part: len(x) for part, x in self.get_random_genome().items()} +# self.genome_size = sum(genes_sizes.values()) +# self.category = self.read_rates(path_to_rates) if path_to_rates else None +# self.mask = None + +# if self.category: +# for part in genes_sizes: +# if genes_sizes[part] != len(self.category[part]): +# raise RuntimeError("Wrong rates: number of positions in alignment are not equal to number of rates", file=sys.stderr) +# self.mask = self.get_mask(self.category, cat_cutoff) - def get_genome(self, node: str): - if self.mode == "dict": - return self.node2genome[node] +# def get_genome(self, node: str): +# if self.mode == "dict": +# return self.node2genome[node] - elif self.mode == "db": - genome_raw = defaultdict(list) - cur = self.con.cursor() - if self.use_proba: - query = f"""SELECT Part, Site, p_A, p_C, p_G, p_T FROM states WHERE Node='{node}'""" - dtype = [ - ("Site", np.int32), - ("p_A", np.float32), ("p_C", np.float32), - ("p_G", np.float32), ("p_T", np.float32), - ] - else: - query = f"""SELECT Part, Site, State FROM states WHERE Node='{node}'""" - dtype = [("Site", np.int32), ("State", np.object_)] - - for row in cur.execute(query): - part = str(row[0]) - state = row[1:] - genome_raw[part].append(state) +# elif self.mode == "db": +# genome_raw = defaultdict(list) +# cur = self.con.cursor() +# if self.use_proba: +# query = f"""SELECT Part, Site, p_A, p_C, p_G, p_T FROM states WHERE Node='{node}'""" +# dtype = [ +# ("Site", np.int32), +# ("p_A", np.float32), ("p_C", np.float32), +# ("p_G", np.float32), ("p_T", np.float32), +# ] +# else: +# query = f"""SELECT Part, Site, State FROM states WHERE Node='{node}'""" +# dtype = [("Site", np.int32), ("State", np.object_)] + +# for row in cur.execute(query): +# part = str(row[0]) +# state = row[1:] +# genome_raw[part].append(state) - genome = {} - for part, state_tup in genome_raw.items(): - state = np.array(state_tup, dtype=dtype) - state_sorted = np.sort(state, order="Site") - state_sorted_devoid = np.array(list(map(list, state_sorted))) - state = state_sorted_devoid[:, 1:] if self.use_proba else state_sorted_devoid[:, 1] - genome[part] = state - return genome +# genome = {} +# for part, state_tup in genome_raw.items(): +# state = np.array(state_tup, dtype=dtype) +# state_sorted = np.sort(state, order="Site") +# state_sorted_devoid = np.array(list(map(list, state_sorted))) +# state = state_sorted_devoid[:, 1:] if self.use_proba else state_sorted_devoid[:, 1] +# genome[part] = state +# return genome - else: - raise ValueError("mode must be 'dict' or 'db'") +# else: +# raise ValueError("mode must be 'dict' or 'db'") - def get_random_genome(self): - node = next(iter(self.nodes)) - return self.get_genome(node) - - def _prepare_db(self, path_states, rewrite=False): - """sequentially read tsv and write to db""" - done = False - if os.path.exists(self.path_to_db): - if rewrite: - os.remove(self.path_to_db) - else: - print(f"Loading existing database from {self.path_to_db}", file=sys.stderr) - done = True - try: - con = sqlite3.connect(self.path_to_db) - except: - raise ValueError("Cannot connect to database by given path") +# def get_random_genome(self): +# node = next(iter(self.nodes)) +# return self.get_genome(node) + +# def _prepare_db(self, path_states, rewrite=False): +# """sequentially read tsv and write to db""" +# done = False +# if os.path.exists(self.path_to_db): +# if rewrite: +# os.remove(self.path_to_db) +# else: +# print(f"Loading existing database from {self.path_to_db}", file=sys.stderr) +# done = True +# try: +# con = sqlite3.connect(self.path_to_db) +# except: +# raise ValueError("Cannot connect to database by given path") - cur = con.cursor() - self.con = con - if done: - self.nodes = self._get_nodes_from_db() - return - - print("Writing new database file", file=sys.stderr) - cur.execute('''CREATE TABLE states - (Node TEXT, Part INTEGER, Site INTEGER, State TEXT, p_A REAL, p_C REAL, p_G REAL, p_T REAL)''' - ) - for path in path_states: - if path is None: - continue - print(f"Loading {path} ...", file=sys.stderr) - handle = open(path, "r") - header = handle.readline().strip() - if header != "Node\tPart\tSite\tState\tp_A\tp_C\tp_G\tp_T": - handle.close() - con.close() - raise ValueError(f"Inappropriate type of table, expected another columns order,\ngot {repr(header)}") - - for line in tqdm.tqdm(handle, total=8652300): # TODO estimate total - row = line.strip().split() - query = "INSERT INTO states VALUES ('{}',{},{},'{}',{},{},{},{})".format(*row) - cur.execute(query) - - con.commit() - handle.close() - - self.nodes = self._get_nodes_from_db() - # con.close() +# cur = con.cursor() +# self.con = con +# if done: +# self.nodes = self._get_nodes_from_db() +# return + +# print("Writing new database file", file=sys.stderr) +# cur.execute('''CREATE TABLE states +# (Node TEXT, Part INTEGER, Site INTEGER, State TEXT, p_A REAL, p_C REAL, p_G REAL, p_T REAL)''' +# ) +# for path in path_states: +# if path is None: +# continue +# print(f"Loading {path} ...", file=sys.stderr) +# handle = open(path, "r") +# header = handle.readline().strip() +# if header != "Node\tPart\tSite\tState\tp_A\tp_C\tp_G\tp_T": +# handle.close() +# con.close() +# raise ValueError(f"Inappropriate type of table, expected another columns order,\ngot {repr(header)}") + +# for line in tqdm.tqdm(handle, total=8652300): # TODO estimate total +# row = line.strip().split() +# query = "INSERT INTO states VALUES ('{}',{},{},'{}',{},{},{},{})".format(*row) +# cur.execute(query) + +# con.commit() +# handle.close() + +# self.nodes = self._get_nodes_from_db() +# # con.close() - def _get_nodes_from_db(self): - nodes = set() - cur = self.con.cursor() - for node in cur.execute("SELECT DISTINCT Node from states"): - nodes.add(node[0]) - return nodes +# def _get_nodes_from_db(self): +# nodes = set() +# cur = self.con.cursor() +# for node in cur.execute("SELECT DISTINCT Node from states"): +# nodes.add(node[0]) +# return nodes - def states2dct(self, states: pd.DataFrame, out=None): - # all genes (genomes) must have same length - aln_sizes = states.groupby("Node").apply(len) - assert aln_sizes.nunique() == 1, "uncomplete state table: some site states absent in some node genomes" +# def states2dct(self, states: pd.DataFrame, out=None): +# # all genes (genomes) must have same length +# aln_sizes = states.groupby("Node").apply(len) +# assert aln_sizes.nunique() == 1, "uncomplete state table: some site states absent in some node genomes" - node2genome = defaultdict(dict) if out is None else out - gr = states.sort_values(["Node", "Part", "Site"]).groupby(["Node", "Part"]) - for (node, part), gene_df in gr: - if self.use_proba: - gene_states = gene_df[["p_A", "p_C", "p_G", "p_T"]].values - else: - gene_states = gene_df.State.values - node2genome[node][part] = gene_states - return node2genome - - def _prepare_node2genome(self, path_states, states_dtype=np.float32): - node2genome = defaultdict(dict) - if isinstance(path_states, pd.DataFrame): - states = path_states - self.states2dct(states, node2genome) - elif isinstance(path_states, (list, tuple)): - dtypes = { - "p_A": states_dtype, "p_C": states_dtype, - "p_G": states_dtype, "p_T": states_dtype, - "Site": np.int32, "Node": str, "Part": str, - } - if self.use_proba: - usecols = ["Node", "Part", "Site", "p_A", "p_C", "p_G", "p_T"] - else: - usecols = ["Node", "Part", "Site", "State"] - - for path in path_states: - if path is None: - continue - print(f"Loading {path}...", file=sys.stderr) - states = pd.read_csv(path, sep="\t", comment='#', usecols=usecols, dtype=dtypes) - self.states2dct(states, node2genome) - - # aln_sizes = states.groupby("Node").apply(len) - # assert aln_sizes.nunique() == 1, "uncomplete leaves state table" - # states = states.sort_values(["Node", "Part", "Site"]) - # gr = states.groupby(["Node", "Part"]) - # for (node, part), gene_pos_ids in tqdm.tqdm(gr.groups.items()): - # gene_df = states.loc[gene_pos_ids] # TODO simplify: iterate over gr - # if self.use_proba: - # gene_states = gene_df[["p_A", "p_C", "p_G", "p_T"]].values - # else: - # gene_states = gene_df.State.values - # node2genome[node][part] = gene_states - else: +# node2genome = defaultdict(dict) if out is None else out +# gr = states.sort_values(["Node", "Part", "Site"]).groupby(["Node", "Part"]) +# for (node, part), gene_df in gr: +# if self.use_proba: +# gene_states = gene_df[["p_A", "p_C", "p_G", "p_T"]].values +# else: +# gene_states = gene_df.State.values +# node2genome[node][part] = gene_states +# return node2genome + +# def _prepare_node2genome(self, path_states, states_dtype=np.float32): +# node2genome = defaultdict(dict) +# if isinstance(path_states, pd.DataFrame): +# states = path_states +# self.states2dct(states, node2genome) +# elif isinstance(path_states, (list, tuple)): +# dtypes = { +# "p_A": states_dtype, "p_C": states_dtype, +# "p_G": states_dtype, "p_T": states_dtype, +# "Site": np.int32, "Node": str, "Part": str, +# } +# if self.use_proba: +# usecols = ["Node", "Part", "Site", "p_A", "p_C", "p_G", "p_T"] +# else: +# usecols = ["Node", "Part", "Site", "State"] + +# for path in path_states: +# if path is None: +# continue +# print(f"Loading {path}...", file=sys.stderr) +# states = pd.read_csv(path, sep="\t", comment='#', usecols=usecols, dtype=dtypes) +# self.states2dct(states, node2genome) - raise NotImplementedError('path_to_states must be list of paths or pd.DataFrame') +# # aln_sizes = states.groupby("Node").apply(len) +# # assert aln_sizes.nunique() == 1, "uncomplete leaves state table" +# # states = states.sort_values(["Node", "Part", "Site"]) +# # gr = states.groupby(["Node", "Part"]) +# # for (node, part), gene_pos_ids in tqdm.tqdm(gr.groups.items()): +# # gene_df = states.loc[gene_pos_ids] # TODO simplify: iterate over gr +# # if self.use_proba: +# # gene_states = gene_df[["p_A", "p_C", "p_G", "p_T"]].values +# # else: +# # gene_states = gene_df.State.values +# # node2genome[node][part] = gene_states +# else: + +# raise NotImplementedError('path_to_states must be list of paths or pd.DataFrame') - self.node2genome = node2genome - self.nodes = set(node2genome.keys()) - - @staticmethod - def read_alignment(files: list, fmt="fasta"): - """ - Read files alignments and prepare states table - - Arguments - --------- - files: list or iterable of string - files with alignments; if some records not presented in all files table will be filled by '-' (gaps) - fmt: string - format of files aligment; supported any format from biopython (fasta, phylip, etc.) +# self.node2genome = node2genome +# self.nodes = set(node2genome.keys()) + +# @staticmethod +# def read_alignment(files: list, fmt="fasta"): +# """ +# Read files alignments and prepare states table + +# Arguments +# --------- +# files: list or iterable of string +# files with alignments; if some records not presented in all files table will be filled by '-' (gaps) +# fmt: string +# format of files aligment; supported any format from biopython (fasta, phylip, etc.) - Return - --------- - states: pd.DataFrame - """ - ngenes = len(files) - columns = "Node Part Site State p_A p_C p_G p_T".split() - nucls = "ACGT" - aln_lens = dict() - files = set(files) - history = defaultdict(list) - data = [] - for filepath in files: - part = "1" if ngenes == 1 else os.path.basename(filepath).replace(".fna", "") - alignment = SeqIO.parse(filepath, fmt) - for rec in alignment: - node = rec.name - history[node].append(part) - seq = str(rec.seq) - for site, state in enumerate(seq, 1): - site_data = [node, part, site, state] - for nucl in nucls: - p = int(nucl == state) - site_data.append(p) - data.append(site_data) - aln_lens[part] = len(seq) - - if ngenes > 1: - # get max set of parts. Only for multiple files - for node, parts in history.items(): - if len(parts) == ngenes: - full_parts = parts.copy() - break - - # fill missing genes by '-'. Only for multiple files - for node, parts in history.items(): - if len(parts) != ngenes: - unseen_parts = set(full_parts).difference(parts) - for unp in unseen_parts: - print(f"Gap filling for node {node}, part {unp}...", file=sys.stderr) - for site in range(1, aln_lens[unp] + 1): - site_data = [node, unp, site, "-", 0, 0, 0, 0] - data.append(site_data) - - df = pd.DataFrame(data, columns=columns) - return df - - @staticmethod - def read_rates(path: str): - """ TODO write for many genes """ - category = read_rates(path) - category = {"1": category} # for each aln part - return category +# Return +# --------- +# states: pd.DataFrame +# """ +# ngenes = len(files) +# columns = "Node Part Site State p_A p_C p_G p_T".split() +# nucls = "ACGT" +# aln_lens = dict() +# files = set(files) +# history = defaultdict(list) +# data = [] +# for filepath in files: +# part = "1" if ngenes == 1 else os.path.basename(filepath).replace(".fna", "") +# alignment = SeqIO.parse(filepath, fmt) +# for rec in alignment: +# node = rec.name +# history[node].append(part) +# seq = str(rec.seq) +# for site, state in enumerate(seq, 1): +# site_data = [node, part, site, state] +# for nucl in nucls: +# p = int(nucl == state) +# site_data.append(p) +# data.append(site_data) +# aln_lens[part] = len(seq) + +# if ngenes > 1: +# # get max set of parts. Only for multiple files +# for node, parts in history.items(): +# if len(parts) == ngenes: +# full_parts = parts.copy() +# break + +# # fill missing genes by '-'. Only for multiple files +# for node, parts in history.items(): +# if len(parts) != ngenes: +# unseen_parts = set(full_parts).difference(parts) +# for unp in unseen_parts: +# print(f"Gap filling for node {node}, part {unp}...", file=sys.stderr) +# for site in range(1, aln_lens[unp] + 1): +# site_data = [node, unp, site, "-", 0, 0, 0, 0] +# data.append(site_data) + +# df = pd.DataFrame(data, columns=columns) +# return df + +# @staticmethod +# def read_rates(path: str): +# """ TODO write for many genes """ +# category = read_rates(path) +# category = {"1": category} # for each aln part +# return category - @staticmethod - def get_mask(category: dict, cat_cutoff=1): - mask = {part: np.array(cats) >= cat_cutoff for part, cats in category.items()} - return mask +# @staticmethod +# def get_mask(category: dict, cat_cutoff=1): +# mask = {part: np.array(cats) >= cat_cutoff for part, cats in category.items()} +# return mask - def close_db(self): - self.con.close() +# def close_db(self): +# self.con.close() def read_rates(path: str): @@ -304,3 +422,61 @@ def read_rates(path: str): df = pd.read_csv(path, sep="\t", comment="#").sort_values("Site") category = df.Cat.values return category + + +def read_alignment(files: list, fmt="fasta"): + """ + Read files alignments and prepare states table + + Arguments + --------- + files: list or iterable of string + files with alignments; if some records not presented in all files table will be filled by '-' (gaps) + fmt: string + format of files aligment; supported any format from biopython (fasta, phylip, etc.) + + Return + --------- + states: pd.DataFrame + """ + ngenes = len(files) + columns = "Node Part Site State p_A p_C p_G p_T".split() + nucls = "ACGT" + aln_lens = dict() + files = set(files) + history = defaultdict(list) + data = [] + for filepath in files: + part = "1" if ngenes == 1 else os.path.basename(filepath).replace(".fna", "") + alignment = SeqIO.parse(filepath, fmt) + for rec in alignment: + node = rec.name + history[node].append(part) + seq = str(rec.seq) + for site, state in enumerate(seq, 1): + site_data = [node, part, site, state] + for nucl in nucls: + p = int(nucl == state) + site_data.append(p) + data.append(site_data) + aln_lens[part] = len(seq) + + if ngenes > 1: + # get max set of parts. Only for multiple files + for node, parts in history.items(): + if len(parts) == ngenes: + full_parts = parts.copy() + break + + # fill missing genes by '-'. Only for multiple files + for node, parts in history.items(): + if len(parts) != ngenes: + unseen_parts = set(full_parts).difference(parts) + for unp in unseen_parts: + print(f"Gap filling for node {node}, part {unp}...", file=sys.stderr) + for site in range(1, aln_lens[unp] + 1): + site_data = [node, unp, site, "-", 0, 0, 0, 0] + data.append(site_data) + + df = pd.DataFrame(data, columns=columns) + return df \ No newline at end of file diff --git a/scripts/collect_mutations_parallel.py b/scripts/collect_mutations_parallel.py new file mode 100644 index 0000000..2f202c3 --- /dev/null +++ b/scripts/collect_mutations_parallel.py @@ -0,0 +1,521 @@ +#!/usr/bin/env python3 +""" +pic is position in codon + +""" + +import os +import sys +import time +import random +from shutil import rmtree +from typing import Dict +from warnings import warn +import multiprocessing as mp +from datetime import datetime +from collections import defaultdict + +import numpy as np +import pandas as pd +from ete3 import PhyloTree +from pymutspec.annotation import ( + CodonAnnotation, iter_tree_edges, get_tree_len, +) +from pymutspec.utils import load_logger, basic_logger + +logger = basic_logger() + +nucl_order5 = ['A', 'C', 'G', 'T', '-'] + + +def calc_phylocoefs(tree: PhyloTree): + tree_len = get_tree_len(tree, 'geom_mean') + logger.info(f'Tree len = {tree_len:.3f}') + phylocoefs = {tree.name: 1 - min(0.999, tree.get_closest_leaf()[1] / tree_len)} + for node in tree.iter_descendants(): + _closest, d = node.get_closest_leaf() + phylocoefs[node.name] = 1 - min(0.99999, d / tree_len) + logger.info(f'Phylocoefs range: [{min(phylocoefs.values()):.3f}, {max(phylocoefs.values()):.3f}]') + return phylocoefs + + +class GenomeStates: + def __init__(self, path_to_states, path_to_gappy_sites): + for path in list([path_to_states, path_to_gappy_sites]): + if not os.path.exists(path): + raise ValueError(f"Path to states doesn't exist: '{path}'") + + logger.info(f'Reading states "{path_to_states}"') + states = pd.read_parquet(path_to_states) + + nodes_arr = states['Node'].unique() + self.node2id = dict(zip(nodes_arr, range(len(nodes_arr)))) + self.nodes = set(nodes_arr) + logger.debug('Node2id prepared') + + logger.info(f'Loaded {len(self.nodes)} genomes') + + states['NodeId'] = states['Node'].map(self.node2id).astype(np.int32) + states.drop('Node', axis=1, inplace=True) + logger.debug('Node column deleted') + states.set_index('NodeId', inplace=True) + logger.debug('States indexed') + + self.genome_size = len(states.loc[0]) + logger.info(f'Total alignement size = {self.genome_size}') + + self.nongappy_sites = self.read_nongappy_sites(path_to_gappy_sites) + self.states = states + logger.info('States initialiaztion is done') + + def __contains__(self, item: str): + return item in self.nodes + + def __getitem__(self, item: str): + return self.get_genome(item) + + def read_nongappy_sites(self, path): + logger.debug(f'Reading gappy sites "{path}"') + gappy_sites = set(pd.read_csv(path, header=None).values.flatten().tolist()) + nongappy_sites = np.array([i for i in range(self.genome_size) \ + if i not in gappy_sites]) + 1 # 1-based + logger.debug(f'Number of used (non-gappy) sites = {len(nongappy_sites)}') + return nongappy_sites + + def get_genome(self, node: str) -> pd.DataFrame: + if node not in self.nodes: + raise ValueError(f'Passed node "{node}" have no genome') + node_id = self.node2id[node] + genome = self.states.loc[node_id].set_index('Site').loc[self.nongappy_sites] + return genome + + +class GenomeStatesTotal(): + def __init__(self, path_to_states1: str, path_to_states2: str, path_to_gappy_sites: str): + gs1 = GenomeStates(path_to_states1, path_to_gappy_sites) + gs2 = GenomeStates(path_to_states2, path_to_gappy_sites) + + assert gs1.genome_size == gs2.genome_size + self.genome_size = gs1.genome_size + + self.nodes = gs1.nodes.union(gs2.nodes) + self.gs1 = gs1 + self.gs2 = gs2 + + def __contains__(self, item): + return item in self.nodes + + def __getitem__(self, item): + if item in self.gs1: + genome = self.gs1[item] + elif item in self.gs2: + genome = self.gs2[item] + else: + raise ValueError('node does not exist') + return genome + + def get_genome(self, node: str) -> pd.DataFrame: + if node in self.gs1: + genome = self.gs1[node] + elif node in self.gs2: + genome = self.gs2[node] + else: + raise ValueError('node does not exist') + return genome + + def get_random_genome(self) -> pd.DataFrame: + node = random.choice(list(self.nodes)) + return self[node] + + +class Branch: + def __init__(self, index: int, ref_name: str, alt_name: str, + ref_genome: pd.DataFrame, alt_genome: pd.DataFrame, + phylocoef: float, proba_cutoff: float, genome_size: int, + site2index: Dict[int, int]) -> None: + self.index = index + self.ref_name = ref_name + self.alt_name = alt_name + self.ref_genome = ref_genome + self.alt_genome = alt_genome + self.phylocoef = phylocoef + self.proba_cutoff = proba_cutoff + self.genome_size = genome_size + self.site2index = site2index + + def is_substitution(self, cxt1, cxt2): + """ + check that contexts contain substitution. + + Example inputs and outputs: + - (CAT,CGG) -> False (T != G) + - (CAT,CAG) -> False (central nuclaotide are constant) + - (ACTAC,ACAAC) -> True + - (ACTAC,AC-AC) -> False (indel != substitution) + """ + ok = True + center_pos = len(cxt1) // 2 + if cxt1[center_pos] == cxt2[center_pos]: + ok = False + elif cxt1[center_pos] == '-' or cxt2[center_pos] == '-': + ok = False + elif cxt1[center_pos-1] != cxt2[center_pos-1] or \ + cxt1[center_pos+1] != cxt2[center_pos+1]: + logger.warning(f"Neighbouring positions of substitution are different: {cxt1}, {cxt2}") + ok = False + return ok + + def process_branch(self): + edge_mutations = self.extract_mutations_proba( + self.ref_genome, self.alt_genome, self.phylocoef, self.proba_cutoff) + + edge_mutations["RefNode"] = self.ref_name + edge_mutations["AltNode"] = self.alt_name + + mut_num = edge_mutations['ProbaFull'].sum() if 'ProbaFull' in edge_mutations.columns else len(edge_mutations) + if mut_num == 0: + logger.info(f"No mutations from branch {self.index:03} ({self.ref_name} - {self.alt_name})") + elif mut_num > self.genome_size * 0.1: + logger.warning(f"Observed too many mutations ({mut_num} > {self.genome_size} * 0.1) " + f"for branch ({self.ref_name} - {self.alt_name})") + else: + logger.info(f"{mut_num:.2f} mutations from branch {self.index:03} ({self.ref_name} - {self.alt_name})") + + return edge_mutations + + # @profile + def extract_mutations_proba(self, g1: pd.DataFrame, g2: pd.DataFrame, + phylocoef: float, mut_proba_cutoff: float): + """ + Extract alterations of g2 comparing to g1 + + conditions: + - in one codon could be only sbs + - in the context of one mutation couldn't be other sbs + - indels are not sbs and codons and contexts with sbs are not considered + + Arguments + - g1 - reference sequence (parent node) + - g2 - alternative sequence (child node) + + return: + - mut - dataframe of mutations + """ + n, m = len(g1), len(g2) + assert n == m, f"genomes lengths are not equal: {n} != {m}" + + mutations = [] + g1_most_prob = g1.values.argmax(1) + g2_most_prob = g2.values.argmax(1) + # pass initial and last nucleotides without context + for site in g1.index[2:-2]: + site_mutations = self.process_site( + g1, g2, site, g1_most_prob, g2_most_prob, mut_proba_cutoff, phylocoef) + mutations.extend(site_mutations) + mut_df = pd.DataFrame(mutations) + return mut_df + + def process_site(self, g1: pd.DataFrame, g2: pd.DataFrame, site: int, + g1_most_prob: np.ndarray, g2_most_prob: np.ndarray, + mut_proba_cutoff: float, phylocoef: float): + pos_in_states = self.site2index[site] + states1, states2 = self.prepare_local_states( + g1.values, g2.values, pos_in_states, g1_most_prob, g2_most_prob) + if states1 is None: + return [] + + mutations = [] + for cxt1, proba1 in self.sample_context5(2, states1, mut_proba_cutoff): + for cxt2, proba2 in self.sample_context5(2, states2, mut_proba_cutoff): + if not self.is_substitution(cxt1, cxt2): + continue + + p = proba1 * proba2 + if p < mut_proba_cutoff: + continue + + p_adj = p * phylocoef + + cxt1 = ''.join(cxt1) + cxt2 = ''.join(cxt2) + muttype = f'{cxt1[0:2]}[{cxt1[2]}>{cxt2[2]}]{cxt1[3:]}' + sbs = { + "Mut": muttype, + "Site": site, + "ProbaRef": proba1, + "ProbaMut": p, + "ProbaFull": p_adj, + } + mutations.append(sbs) + return mutations + + def prepare_local_states_simple(self, g1: np.ndarray, g2: np.ndarray, pos: int): + return g1[pos-2: pos+3], g2[pos-2: pos+3] + + def prepare_local_states(self, g1: np.ndarray, g2: np.ndarray, pos: int, + g1_most_prob: np.ndarray, g2_most_prob: np.ndarray): + # prepare local states: exclude gaps from both seqs + ## first, search upstream + up_states1, up_states2 = [g1[pos]], [g2[pos]] + down_states1, down_states2 = [], [] + for i in range(1, 10): # don't collect gaps at all + cur_state1, cur_state2 = g1[pos-i], g2[pos-i] + state_id1, state_id2 = g1_most_prob[pos-i], g2_most_prob[pos-i] + if not (state_id1 == state_id2 == 4): # both states are not equal to gap + up_states1.append(cur_state1) + up_states2.append(cur_state2) + if len(up_states1) == 3 or pos-i-1 <= 0: + break + + ## second, search downstream + for i in range(1, 10): + cur_state1, cur_state2 = g1[pos+i], g2[pos+i] + state_id1, state_id2 = g1_most_prob[pos+i], g2_most_prob[pos+i] + if not (state_id1 == state_id2 == 4): # not equal to gap + down_states1.append(cur_state1) + down_states2.append(cur_state2) + if len(down_states1) == 2 or pos+i+1 >= len(g1): + break + + if any([len(up_states1) != 3, + len(up_states2) != 3, + len(down_states1) != 2 , + len(down_states2) != 2]): + logger.warning(f'Cannot collect local contexts for position {pos}') + return None, None + + up_states1.reverse() + up_states2.reverse() + up_states1.extend(down_states1) + up_states2.extend(down_states2) + states1 = np.array(up_states1) + states2 = np.array(up_states2) + + return states1, states2 + + def sample_context5(self, pos: int, g: np.ndarray, cutoff=0.25): + probas = g[pos-2] * g[pos-1][:, None] * g[pos][:, None, None] * \ + g[pos+1][:, None, None, None] * g[pos+2][:, None, None, None, None] + + indexes = np.where(probas > cutoff) + for idx in range(len(indexes[0])): + cxt_ids = [indexes[i][idx] for i in list(range(len(indexes)))[::-1]] + cxt = tuple(nucl_order5[x] for x in cxt_ids) + pr = probas[tuple(cxt_ids[::-1])] + yield cxt, pr + + def sample_context7(self, pos: int, g: np.ndarray, cutoff=0.25): + probas = g[pos-3] * g[pos-2][:, None] * g[pos-1][:, None, None] * \ + g[pos][:, None, None, None] * g[pos+1][:, None, None, None, None] * \ + g[pos+2][:, None, None, None, None, None] * \ + g[pos+3][:, None, None, None, None, None, None] + + indexes = np.where(probas > cutoff) + for idx in range(len(indexes[0])): + cxt_ids = [indexes[i][idx] for i in list(range(len(indexes)))[::-1]] + cxt = tuple(nucl_order5[x] for x in cxt_ids) + pr = probas[tuple(cxt_ids[::-1])] + yield cxt, pr + + +class MutSpec(CodonAnnotation): + def __init__( + self, path_to_tree, outdir, gcode=2, + use_proba=False, proba_cutoff=0.25, use_phylocoef=False, + genomes: GenomeStatesTotal = None, num_processes=8. + ): + if not os.path.exists(path_to_tree): + raise ValueError(f"Path to tree doesn't exist: '{path_to_tree}'") + + CodonAnnotation.__init__(self, gencode=gcode) + + self.num_processes = num_processes + self.genomes = genomes + self.gcode = gcode + self.use_proba = use_proba + self.proba_cutoff = proba_cutoff + self.use_phylocoef = use_phylocoef if use_proba else False + self.outdir = outdir + logger.info(f"Using gencode {gcode}") + logger.info(f"Use probabilities of genomic states: {use_proba}") + logger.info(f"Use phylogenetic uncertainty coefficient: {use_phylocoef}") + logger.info(f"Minimal probability for mutations to use: {proba_cutoff}") + + self.fp_format = np.float32 + self.tree = PhyloTree(path_to_tree, format=1) + logger.info( + f"Tree loaded, number of leaf nodes: {len(self.tree)}, " + f"total number of nodes: {len(self.tree.get_cached_content())}, " + ) + rnd_genome = self.genomes.get_random_genome() + logger.info(f"Number of MSA sites: {len(rnd_genome)}") + + # mapping of MSA sites to truncated states ids + self.site2index = dict(zip(rnd_genome.index, range(len(rnd_genome)))) + + def open_handles(self, outdir): + self.handle = dict() + self.handle["mut"] = open(os.path.join(outdir, "mutations.tsv"), "w") + logger.debug("Handles opened") + + def close_handles(self): + for file in self.handle.values(): + file.close() + logger.debug("Handles closed") + + def extract_mutspec_from_tree(self): + t = self.num_processes + if not isinstance(t, int) or t < 1: + raise ValueError('num_processes must be positive integer') + + if t == 1: + self._derive_mutspec() + elif t > 1: + self._derive_mutspec_parallel() + + def _derive_mutspec(self): + logger.info("Start mutation extraction from tree") + self.open_handles(self.outdir) + add_header = defaultdict(lambda: True) + total_mut_num = 0 + + for edge_data in self.iter_branches(): + edge_mutations = self.process_branch(*edge_data[1:]) + mut_num = edge_mutations['ProbaFull'].sum() if self.use_proba and \ + 'ProbaFull' in edge_mutations.columns else len(edge_mutations) + total_mut_num += mut_num + + # dump current edge mutations + self.dump_table(edge_mutations, self.handle["mut"], add_header["mut"]) + add_header["mut"] = False + + self.close_handles() + logger.info(f"Processed {edge_data[1]} tree edges") + logger.info(f"Observed {total_mut_num:.3f} substitutions") + logger.info("Extraction of mutations from phylogenetic tree completed succesfully") + + def _derive_mutspec_parallel(self): + logger.info("Start parallel mutation extraction from tree") + + with mp.Pool(processes=self.num_processes) as pool: + genome_mutations_lst = pool.map(Branch.process_branch, self.iter_branches()) + + genome_mutations = pd.concat(genome_mutations_lst) + + self.open_handles(self.outdir) + self.dump_table(genome_mutations, self.handle["mut"], True) + self.close_handles() + + total_mut_num = genome_mutations['ProbaFull'].sum() if self.use_proba and \ + 'ProbaFull' in genome_mutations.columns else len(genome_mutations) + logger.info(f"Observed {total_mut_num:.3f} substitutions") + logger.info("Extraction of mutations from phylogenetic tree completed succesfully") + + # def iter_chunks(self, n): + # lst = self.get_edges_data() + # for i in range(0, len(lst), n): + # yield lst[i:i + n] + + def iter_branches(self): + """yield (self, edge_id, ref_node_name, alt_node_name, ref_genome, alt_genome, phylocoef)""" + # calculate phylogenetic uncertainty correction + if self.use_phylocoef: + phylocoefs = calc_phylocoefs(self.tree) + + for ei, (ref_node, alt_node) in enumerate(iter_tree_edges(self.tree), 1): + if alt_node.name not in self.genomes.nodes: + logger.warning(f"Skip edge '{ref_node.name}'-'{alt_node.name}' due to absence of '{alt_node.name}' genome") + continue + if ref_node.name not in self.genomes.nodes: + logger.warning(f"Skip edge '{ref_node.name}'-'{alt_node.name}' due to absence of '{ref_node.name}' genome") + continue + + # calculate phylogenetic uncertainty correction: + # coef for ref edge * coef for alt edge + if self.use_phylocoef: + phylocoef = self.fp_format(phylocoefs[ref_node.name] * phylocoefs[alt_node.name]) + else: + phylocoef = self.fp_format(1) + + # get genomes from storage + ref_genome = self.genomes.get_genome(ref_node.name) + alt_genome = self.genomes.get_genome(alt_node.name) + logger.debug(f'Genomes retrieved for branch {ei}') + + yield Branch(ei, ref_node.name, alt_node.name, ref_genome, alt_genome, + phylocoef, self.proba_cutoff, self.genomes.genome_size, self.site2index) + + def get_edges_data(self): + return list(self.iter_branches()) + + @staticmethod + def dump_table(df: pd.DataFrame, handle, header=False): + if header: + handle.write("\t".join(df.columns) + "\n") + handle.write(df.to_csv(sep="\t", index=None, header=None, float_format='%g')) + + def turn_to_MAP(self, states: np.ndarray): + if isinstance(states, pd.DataFrame): + warn('pd.DataFrame used as input instead of np.ndarray', ResourceWarning) + states = states.values + return ''.join([nucl_order5[x] for x in states.argmax(1)]) + + +def main(): + debug = False + current_datetime = str(datetime.now()).split('.')[0].replace(' ', '_') + if debug: + path_to_tree = 'data/tiny_tree2.nwk' + path_to_states = 'data/tiny_states2.parquet' + path_to_alignment = 'data/tiny_alignment2.parquet' + outdir = f'data/tmp_{current_datetime}' + else: + path_to_tree = 'data/ancrec.root.nwk' + path_to_states = 'data/states2.parquet' + path_to_alignment = 'data/alignment2.parquet' + outdir = 'data/multicore_run_v4' + + path_to_gappy_sites = './data/gappy_sites.csv' + gencode = 2 + proba = phylocoef = True + proba_cutoff = 0.25 + force = False + num_processes = 64 + + if os.path.exists(outdir): + if not force: + answer = input(f"Delete existing directory '{outdir}'? [Y/n] ") + if answer.upper() != "Y" and answer != "": + print("Interapted", file=sys.stderr) + exit(0) + rmtree(outdir) + print(f"Directory '{outdir}' deleted", file=sys.stderr) + os.makedirs(outdir) + + global logger + logfile = os.path.join(outdir, "run.log") + logger = load_logger(filename=logfile, stream_level='INFO') + + logger.info(f"Writing logs to '{logfile}'") + logger.debug("Command: " + " ".join(sys.argv)) + logger.debug(f"Output directory '{outdir}' created") + if debug: + logger.warning('DEBUG MODE ACTIVATED') + + start_time = time.time() + + genomes = GenomeStatesTotal(path_to_states, path_to_alignment, path_to_gappy_sites) + + MutSpec( + path_to_tree, outdir, gcode=gencode, + use_proba=proba, proba_cutoff=proba_cutoff, use_phylocoef=phylocoef, + genomes=genomes, num_processes=num_processes, + ).extract_mutspec_from_tree() + + end_time = time.time() + execution_time = end_time - start_time + logger.info(f"Script completed taking {execution_time:.2f} seconds ({execution_time/3600:.2f} hours)") + + +if __name__ == "__main__": + main() From c0191b23b9e4327033075ae2066e5962db8bd1dc Mon Sep 17 00:00:00 2001 From: kpotoh Date: Mon, 25 Nov 2024 18:36:23 +0200 Subject: [PATCH 02/14] MutSpecExtractor added --- pymutspec/annotation/__init__.py | 2 +- pymutspec/annotation/mut.py | 342 ++++++++++++++++++++++++++++++- pymutspec/annotation/tree.py | 6 +- 3 files changed, 345 insertions(+), 5 deletions(-) diff --git a/pymutspec/annotation/__init__.py b/pymutspec/annotation/__init__.py index eb50453..a3bc16d 100644 --- a/pymutspec/annotation/__init__.py +++ b/pymutspec/annotation/__init__.py @@ -6,6 +6,6 @@ get_cossim, get_eucdist, filter_outlier_branches, sample_spectrum, get_iqr_bounds, ) -from .mut import CodonAnnotation, mutations_summary +from .mut import Branch, MutSpec, CodonAnnotation, mutations_summary translator = transcriptor diff --git a/pymutspec/annotation/mut.py b/pymutspec/annotation/mut.py index 35b1be2..812d837 100644 --- a/pymutspec/annotation/mut.py +++ b/pymutspec/annotation/mut.py @@ -1,13 +1,22 @@ +import os +import sys from collections import defaultdict from typing import Set, Union, Dict, Iterable -import sys +import multiprocessing as mp +from warnings import warn import numpy as np import pandas as pd from Bio.Data import CodonTable from Bio.Data.CodonTable import NCBICodonTableDNA +from ete3 import PhyloTree from pymutspec.constants import * +from pymutspec.utils import basic_logger +from ..io import GenomeStatesTotal +from .tree import iter_tree_edges, calc_phylocoefs + +nucl_order5 = ['A', 'C', 'G', 'T', '-'] class CodonAnnotation: @@ -749,6 +758,337 @@ def read_start_stop_codons(codontable: Union[NCBICodonTableDNA, int]): return set(codontable.start_codons), set(codontable.stop_codons) +class Branch: + def __init__(self, index: int, ref_name: str, alt_name: str, + ref_genome: pd.DataFrame, alt_genome: pd.DataFrame, + phylocoef: float, proba_cutoff: float, genome_size: int, + site2index: Dict[int, int], logger=None) -> None: + self.index = index + self.ref_name = ref_name + self.alt_name = alt_name + self.ref_genome = ref_genome + self.alt_genome = alt_genome + self.phylocoef = phylocoef + self.proba_cutoff = proba_cutoff + self.genome_size = genome_size + self.site2index = site2index + self.logger = logger or basic_logger() + + def is_substitution(self, cxt1, cxt2): + """ + check that contexts contain substitution. + + Example inputs and outputs: + - (CAT,CGG) -> False (T != G) + - (CAT,CAG) -> False (central nuclaotide are constant) + - (ACTAC,ACAAC) -> True + - (ACTAC,AC-AC) -> False (indel != substitution) + """ + ok = True + center_pos = len(cxt1) // 2 + if cxt1[center_pos] == cxt2[center_pos]: + ok = False + elif cxt1[center_pos] == '-' or cxt2[center_pos] == '-': + ok = False + elif cxt1[center_pos-1] != cxt2[center_pos-1] or \ + cxt1[center_pos+1] != cxt2[center_pos+1]: + self.logger.warning(f"Neighbouring positions of substitution are different: {cxt1}, {cxt2}") + ok = False + return ok + + def process_branch(self): + edge_mutations = self.extract_mutations_proba( + self.ref_genome, self.alt_genome, self.phylocoef, self.proba_cutoff) + + edge_mutations["RefNode"] = self.ref_name + edge_mutations["AltNode"] = self.alt_name + + mut_num = edge_mutations['ProbaFull'].sum() if 'ProbaFull' in edge_mutations.columns else len(edge_mutations) + if mut_num == 0: + self.logger.info(f"No mutations from branch {self.index:03} ({self.ref_name} - {self.alt_name})") + elif mut_num > self.genome_size * 0.1: + self.logger.warning(f"Observed too many mutations ({mut_num} > {self.genome_size} * 0.1) " + f"for branch ({self.ref_name} - {self.alt_name})") + else: + self.logger.info(f"{mut_num:.2f} mutations from branch {self.index:03} ({self.ref_name} - {self.alt_name})") + + return edge_mutations + + def extract_mutations_proba(self, g1: pd.DataFrame, g2: pd.DataFrame, + phylocoef: float, mut_proba_cutoff: float): + """ + Extract alterations of g2 comparing to g1 + + conditions: + - in one codon could be only sbs + - in the context of one mutation couldn't be other sbs + - indels are not sbs and codons and contexts with sbs are not considered + + Arguments + - g1 - reference sequence (parent node) + - g2 - alternative sequence (child node) + + return: + - mut - dataframe of mutations + """ + n, m = len(g1), len(g2) + assert n == m, f"genomes lengths are not equal: {n} != {m}" + + mutations = [] + g1_most_prob = g1.values.argmax(1) + g2_most_prob = g2.values.argmax(1) + # pass initial and last nucleotides without context + for site in g1.index[2:-2]: + site_mutations = self.process_site( + g1, g2, site, g1_most_prob, g2_most_prob, mut_proba_cutoff, phylocoef) + mutations.extend(site_mutations) + mut_df = pd.DataFrame(mutations) + return mut_df + + def process_site(self, g1: pd.DataFrame, g2: pd.DataFrame, site: int, + g1_most_prob: np.ndarray, g2_most_prob: np.ndarray, + mut_proba_cutoff: float, phylocoef: float): + pos_in_states = self.site2index[site] + states1, states2 = self.prepare_local_states( + g1.values, g2.values, pos_in_states, g1_most_prob, g2_most_prob) + if states1 is None: + return [] + + mutations = [] + for cxt1, proba1 in self.sample_context5(2, states1, mut_proba_cutoff): + for cxt2, proba2 in self.sample_context5(2, states2, mut_proba_cutoff): + if not self.is_substitution(cxt1, cxt2): + continue + + p = proba1 * proba2 + if p < mut_proba_cutoff: + continue + + p_adj = p * phylocoef + + cxt1 = ''.join(cxt1) + cxt2 = ''.join(cxt2) + muttype = f'{cxt1[0:2]}[{cxt1[2]}>{cxt2[2]}]{cxt1[3:]}' + sbs = { + "Mut": muttype, + "Site": site, + "ProbaRef": proba1, + "ProbaMut": p, + "ProbaFull": p_adj, + } + mutations.append(sbs) + return mutations + + def prepare_local_states_simple(self, g1: np.ndarray, g2: np.ndarray, pos: int): + return g1[pos-2: pos+3], g2[pos-2: pos+3] + + def prepare_local_states(self, g1: np.ndarray, g2: np.ndarray, pos: int, + g1_most_prob: np.ndarray, g2_most_prob: np.ndarray): + # prepare local states: exclude gaps from both seqs + ## first, search upstream + up_states1, up_states2 = [g1[pos]], [g2[pos]] + down_states1, down_states2 = [], [] + for i in range(1, 10): # don't collect gaps at all + cur_state1, cur_state2 = g1[pos-i], g2[pos-i] + state_id1, state_id2 = g1_most_prob[pos-i], g2_most_prob[pos-i] + if not (state_id1 == state_id2 == 4): # both states are not equal to gap + up_states1.append(cur_state1) + up_states2.append(cur_state2) + if len(up_states1) == 3 or pos-i-1 <= 0: + break + + ## second, search downstream + for i in range(1, 10): + cur_state1, cur_state2 = g1[pos+i], g2[pos+i] + state_id1, state_id2 = g1_most_prob[pos+i], g2_most_prob[pos+i] + if not (state_id1 == state_id2 == 4): # not equal to gap + down_states1.append(cur_state1) + down_states2.append(cur_state2) + if len(down_states1) == 2 or pos+i+1 >= len(g1): + break + + if any([len(up_states1) != 3, + len(up_states2) != 3, + len(down_states1) != 2 , + len(down_states2) != 2]): + self.logger.warning(f'Cannot collect local contexts for position {pos}') + return None, None + + up_states1.reverse() + up_states2.reverse() + up_states1.extend(down_states1) + up_states2.extend(down_states2) + states1 = np.array(up_states1) + states2 = np.array(up_states2) + + return states1, states2 + + def sample_context5(self, pos: int, g: np.ndarray, cutoff=0.25): + probas = g[pos-2] * g[pos-1][:, None] * g[pos][:, None, None] * \ + g[pos+1][:, None, None, None] * g[pos+2][:, None, None, None, None] + + indexes = np.where(probas > cutoff) + for idx in range(len(indexes[0])): + cxt_ids = [indexes[i][idx] for i in list(range(len(indexes)))[::-1]] + cxt = tuple(nucl_order5[x] for x in cxt_ids) + pr = probas[tuple(cxt_ids[::-1])] + yield cxt, pr + + def sample_context7(self, pos: int, g: np.ndarray, cutoff=0.25): + probas = g[pos-3] * g[pos-2][:, None] * g[pos-1][:, None, None] * \ + g[pos][:, None, None, None] * g[pos+1][:, None, None, None, None] * \ + g[pos+2][:, None, None, None, None, None] * \ + g[pos+3][:, None, None, None, None, None, None] + + indexes = np.where(probas > cutoff) + for idx in range(len(indexes[0])): + cxt_ids = [indexes[i][idx] for i in list(range(len(indexes)))[::-1]] + cxt = tuple(nucl_order5[x] for x in cxt_ids) + pr = probas[tuple(cxt_ids[::-1])] + yield cxt, pr + + +class MutSpecExtractor(CodonAnnotation): + def __init__( + self, path_to_tree, outdir, gcode=2, + use_proba=False, proba_cutoff=0.25, use_phylocoef=False, + genomes: GenomeStatesTotal = None, num_processes=8, + logger=None, + ): + if not os.path.exists(path_to_tree): + raise ValueError(f"Path to tree doesn't exist: '{path_to_tree}'") + + CodonAnnotation.__init__(self, gencode=gcode) + + self.num_processes = num_processes + self.genomes = genomes + self.gcode = gcode + self.use_proba = use_proba + self.proba_cutoff = proba_cutoff + self.use_phylocoef = use_phylocoef if use_proba else False + self.outdir = outdir + self.logger = logger or basic_logger() + + self.logger.info(f"Using gencode {gcode}") + self.logger.info(f"Use probabilities of genomic states: {use_proba}") + self.logger.info(f"Use phylogenetic uncertainty coefficient: {use_phylocoef}") + self.logger.info(f"Minimal probability for mutations to use: {proba_cutoff}") + + self.fp_format = np.float32 + self.tree = PhyloTree(path_to_tree, format=1) + self.logger.info( + f"Tree loaded, number of leaf nodes: {len(self.tree)}, " + f"total number of nodes: {len(self.tree.get_cached_content())}, " + ) + rnd_genome = self.genomes.get_random_genome() + self.logger.info(f"Number of MSA sites: {len(rnd_genome)}") + + # mapping of MSA sites to truncated states ids + self.site2index = dict(zip(rnd_genome.index, range(len(rnd_genome)))) + + def open_handles(self, outdir): + self.handle = dict() + self.handle["mut"] = open(os.path.join(outdir, "mutations.tsv"), "w") + self.logger.debug("Handles opened") + + def close_handles(self): + for file in self.handle.values(): + file.close() + self.logger.debug("Handles closed") + + def extract_mutspec_from_tree(self): + t = self.num_processes + if not isinstance(t, int) or t < 1: + raise ValueError('num_processes must be positive integer') + + if t == 1: + self._derive_mutspec() + elif t > 1: + self._derive_mutspec_parallel() + + def _derive_mutspec(self): + self.logger.info("Start mutation extraction from tree") + self.open_handles(self.outdir) + add_header = defaultdict(lambda: True) + total_mut_num = 0 + + for edge_data in self.iter_branches(): + edge_mutations = self.process_branch(*edge_data[1:]) + mut_num = edge_mutations['ProbaFull'].sum() if self.use_proba and \ + 'ProbaFull' in edge_mutations.columns else len(edge_mutations) + total_mut_num += mut_num + + # dump current edge mutations + self.dump_table(edge_mutations, self.handle["mut"], add_header["mut"]) + add_header["mut"] = False + + self.close_handles() + self.logger.info(f"Processed {edge_data[1]} tree edges") + self.logger.info(f"Observed {total_mut_num:.3f} substitutions") + self.logger.info("Extraction of mutations from phylogenetic tree completed succesfully") + + def _derive_mutspec_parallel(self): + self.logger.info("Start parallel mutation extraction from tree") + + with mp.Pool(processes=self.num_processes) as pool: + genome_mutations_lst = pool.map(Branch.process_branch, self.iter_branches()) + + genome_mutations = pd.concat(genome_mutations_lst) + + self.open_handles(self.outdir) + self.dump_table(genome_mutations, self.handle["mut"], True) + self.close_handles() + + total_mut_num = genome_mutations['ProbaFull'].sum() if self.use_proba and \ + 'ProbaFull' in genome_mutations.columns else len(genome_mutations) + self.logger.info(f"Observed {total_mut_num:.3f} substitutions") + self.logger.info("Extraction of mutations from phylogenetic tree completed succesfully") + + def iter_branches(self): + """yield (self, edge_id, ref_node_name, alt_node_name, ref_genome, alt_genome, phylocoef)""" + # calculate phylogenetic uncertainty correction + if self.use_phylocoef: + phylocoefs = calc_phylocoefs(self.tree) + + for ei, (ref_node, alt_node) in enumerate(iter_tree_edges(self.tree), 1): + if alt_node.name not in self.genomes.nodes: + self.logger.warning(f"Skip edge '{ref_node.name}'-'{alt_node.name}' due to absence of '{alt_node.name}' genome") + continue + if ref_node.name not in self.genomes.nodes: + self.logger.warning(f"Skip edge '{ref_node.name}'-'{alt_node.name}' due to absence of '{ref_node.name}' genome") + continue + + # calculate phylogenetic uncertainty correction: + # coef for ref edge * coef for alt edge + if self.use_phylocoef: + phylocoef = self.fp_format(phylocoefs[ref_node.name] * phylocoefs[alt_node.name]) + else: + phylocoef = self.fp_format(1) + + # get genomes from storage + ref_genome = self.genomes.get_genome(ref_node.name) + alt_genome = self.genomes.get_genome(alt_node.name) + self.logger.debug(f'Genomes retrieved for branch {ei}') + + yield Branch(ei, ref_node.name, alt_node.name, ref_genome, alt_genome, + phylocoef, self.proba_cutoff, self.genomes.genome_size, self.site2index) + + def get_edges_data(self): + return list(self.iter_branches()) + + @staticmethod + def dump_table(df: pd.DataFrame, handle, header=False): + if header: + handle.write("\t".join(df.columns) + "\n") + handle.write(df.to_csv(sep="\t", index=None, header=None, float_format='%g')) + + def turn_to_MAP(self, states: np.ndarray): + if isinstance(states, pd.DataFrame): + warn('pd.DataFrame used as input instead of np.ndarray', ResourceWarning) + states = states.values + return ''.join([nucl_order5[x] for x in states.argmax(1)]) + + def mutations_summary(mutations: pd.DataFrame, gene_col=None, proba_col=None, gene_name_mapper: dict = None): """ form mutations annotation: how many synonymous, fourfold or stop loss/gain observed in the table diff --git a/pymutspec/annotation/tree.py b/pymutspec/annotation/tree.py index 813927d..526505c 100644 --- a/pymutspec/annotation/tree.py +++ b/pymutspec/annotation/tree.py @@ -75,10 +75,10 @@ def get_ingroup_root(tree: PhyloTree, outgrp='OUTGRP'): def calc_phylocoefs(tree: PhyloTree, outgrp='OUTGRP'): tree_len = get_tree_len(get_ingroup_root(tree, outgrp ), 'geom_mean') - phylocoefs = {} + phylocoefs = {tree.name: 1 - min(0.999, tree.get_closest_leaf()[1] / tree_len)} for node in tree.iter_descendants(): - d = node.get_closest_leaf()[1] - phylocoefs[node.name] = 1 - min(0.9999, d / tree_len) + _closest, d = node.get_closest_leaf() + phylocoefs[node.name] = 1 - min(0.99999, d / tree_len) return phylocoefs From f5ec4c7950a4078303eb7d59c6ff545f8f55ee89 Mon Sep 17 00:00:00 2001 From: kpotoh Date: Thu, 27 Mar 2025 17:58:51 +0200 Subject: [PATCH 03/14] init update collect_mutations step; Outgroup is not required now --- pymutspec/annotation/__init__.py | 2 +- pymutspec/annotation/tree.py | 41 ++++++++++---------------------- scripts/collect_mutations.py | 8 +++---- tests/test_tree_annotation.py | 15 ++++-------- 4 files changed, 22 insertions(+), 44 deletions(-) diff --git a/pymutspec/annotation/__init__.py b/pymutspec/annotation/__init__.py index a3bc16d..29a5c74 100644 --- a/pymutspec/annotation/__init__.py +++ b/pymutspec/annotation/__init__.py @@ -1,5 +1,5 @@ from .auxiliary import lbl2lbl_id, lbl_id2lbl, rev_comp, transcriptor -from .tree import iter_tree_edges, node_parent, calc_phylocoefs, get_ingroup_root, get_tree_len, get_tree_outgrp_name +from .tree import iter_tree_edges, node_parent, calc_phylocoefs, get_ingroup_root, get_tree_len from .spectra import ( calculate_mutspec, collapse_mutspec, calc_edgewise_spectra, complete_sbs192_columns, jackknife_spectra_sampling, collapse_sbs192, diff --git a/pymutspec/annotation/tree.py b/pymutspec/annotation/tree.py index 526505c..556958b 100644 --- a/pymutspec/annotation/tree.py +++ b/pymutspec/annotation/tree.py @@ -58,40 +58,25 @@ def get_tree_len(tree: PhyloTree, mode='geom_mean'): return md -def get_ingroup_root(tree: PhyloTree, outgrp='OUTGRP'): - found_outgrp, found_ingroup = False, False - ingroup = None - for n in tree.children: - if n.name == outgrp: - found_outgrp = True +def get_ingroup_root(tree: PhyloTree) -> PhyloTree: + assert len(tree.children) == 2, 'Tree must be binary' + found_outgroup = False + for node in tree.children: + if node.is_leaf(): + found_outgroup = True else: - found_ingroup = True - ingroup = n - if found_ingroup and found_outgrp: - return ingroup + ingrp = node + + if found_outgroup: + return ingrp else: - raise Exception('Cannot extract ingroup root') + return tree -def calc_phylocoefs(tree: PhyloTree, outgrp='OUTGRP'): - tree_len = get_tree_len(get_ingroup_root(tree, outgrp ), 'geom_mean') +def calc_phylocoefs(tree: PhyloTree): + tree_len = get_tree_len(get_ingroup_root(tree), 'geom_mean') phylocoefs = {tree.name: 1 - min(0.999, tree.get_closest_leaf()[1] / tree_len)} for node in tree.iter_descendants(): _closest, d = node.get_closest_leaf() phylocoefs[node.name] = 1 - min(0.99999, d / tree_len) return phylocoefs - - -def get_tree_outgrp_name(tree: PhyloTree): - c1, c2 = tree.children - done = False - if len(c1.children) == 0: - done = True - outgrp = c1 - if not done and len(c2.children) == 0: - done = True - outgrp = c2 - if not done: - raise Exception("Cannot get outgroup from tree") - - return outgrp.name diff --git a/scripts/collect_mutations.py b/scripts/collect_mutations.py index 631162c..a8b9a27 100644 --- a/scripts/collect_mutations.py +++ b/scripts/collect_mutations.py @@ -17,7 +17,7 @@ from pymutspec.annotation import ( CodonAnnotation, calculate_mutspec, - iter_tree_edges, lbl2lbl_id, calc_phylocoefs, get_tree_outgrp_name, + iter_tree_edges, lbl2lbl_id, calc_phylocoefs, ) from pymutspec.constants import possible_sbs12, possible_sbs192 from pymutspec.io import GenesStates @@ -78,11 +78,9 @@ def __init__( logger.info(f"Types of mutations to collect and process: {self.mut_labels}") self.fp_format = np.float32 self.tree = PhyloTree(path_to_tree, format=1) - self.outgrp_name = get_tree_outgrp_name(self.tree) logger.info( f"Tree loaded, number of leaf nodes: {len(self.tree)}, " - f"total number of nodes: {len(self.tree.get_cached_content())}, " - f"outgroup name: {self.outgrp_name}" + f"total number of nodes: {len(self.tree.get_cached_content())}" ) rnd_genome = self.get_random_genome() logger.info(f"Number of genes: {len(rnd_genome)}, number of sites: {[len(x) for x in rnd_genome.values()]}") @@ -119,7 +117,7 @@ def extract_mutspec_from_tree(self): # calculate phylogenetic uncertainty correction if self.use_phylocoef: - phylocoefs = calc_phylocoefs(self.tree, self.outgrp_name) + phylocoefs = calc_phylocoefs(self.tree) for ei, (ref_node, alt_node) in enumerate(iter_tree_edges(self.tree), 1): if alt_node.name not in self.nodes: diff --git a/tests/test_tree_annotation.py b/tests/test_tree_annotation.py index d2dec82..8a0b76d 100644 --- a/tests/test_tree_annotation.py +++ b/tests/test_tree_annotation.py @@ -1,20 +1,15 @@ import pytest -from pymutspec.annotation import get_tree_outgrp_name, get_ingroup_root, get_tree_len, calc_phylocoefs - - -def test_get_tree_outgrp_name(tree_rooted): - outgrp = get_tree_outgrp_name(tree_rooted) - assert outgrp == 'Acanthisitta_chloris' +from pymutspec.annotation import get_ingroup_root, get_tree_len, calc_phylocoefs def test_get_ingroup_root(tree_rooted): - ingroup = get_ingroup_root(tree_rooted, 'Acanthisitta_chloris') + ingroup = get_ingroup_root(tree_rooted) assert ingroup.name == 'Node1' assert round(ingroup.dist, 4) == 0.0739 def test_get_tree_len(tree_rooted): - ingroup = get_ingroup_root(tree_rooted, 'Acanthisitta_chloris') + ingroup = get_ingroup_root(tree_rooted) l1 = get_tree_len(tree_rooted, 'geom_mean') l2 = get_tree_len(ingroup, 'geom_mean') @@ -24,9 +19,9 @@ def test_get_tree_len(tree_rooted): def test_calc_phylocoefs(tree_rooted): - phylocoefs = calc_phylocoefs(tree_rooted, 'Acanthisitta_chloris') + phylocoefs = calc_phylocoefs(tree_rooted) - ingroup = get_ingroup_root(tree_rooted, 'Acanthisitta_chloris') + ingroup = get_ingroup_root(tree_rooted) tl = get_tree_len(ingroup, 'geom_mean') node = 'Node1' From 0da4ecfdf3b968ff229da78dc7ed48410b4aa90e Mon Sep 17 00:00:00 2001 From: kpotoh Date: Fri, 28 Mar 2025 14:37:38 +0200 Subject: [PATCH 04/14] tests repaired --- pymutspec/annotation/__init__.py | 2 +- pymutspec/io/__init__.py | 8 +- pymutspec/io/states.py | 534 +++++++++++++++---------------- 3 files changed, 269 insertions(+), 275 deletions(-) diff --git a/pymutspec/annotation/__init__.py b/pymutspec/annotation/__init__.py index 29a5c74..3a9c792 100644 --- a/pymutspec/annotation/__init__.py +++ b/pymutspec/annotation/__init__.py @@ -6,6 +6,6 @@ get_cossim, get_eucdist, filter_outlier_branches, sample_spectrum, get_iqr_bounds, ) -from .mut import Branch, MutSpec, CodonAnnotation, mutations_summary +from .mut import CodonAnnotation, mutations_summary translator = transcriptor diff --git a/pymutspec/io/__init__.py b/pymutspec/io/__init__.py index e91a905..8f9e8c9 100644 --- a/pymutspec/io/__init__.py +++ b/pymutspec/io/__init__.py @@ -1,9 +1,3 @@ from .auxiliary import get_aln_files, load_scheme from .gb import read_genbank_ref -from .states import GenomeStates, GenomeStatesTotal, read_rates, read_alignment - -__all__ = [ - 'GenomeStates', 'GenomeStatesTotal', - 'read_alignment', 'read_genbank_ref', 'read_rates', - 'get_aln_files', 'load_scheme', -] +from .states import GenesStates, GenomeStates, GenomeStatesTotal, read_rates, read_alignment diff --git a/pymutspec/io/states.py b/pymutspec/io/states.py index 407e1b3..9eb7417 100644 --- a/pymutspec/io/states.py +++ b/pymutspec/io/states.py @@ -130,291 +130,291 @@ def get_random_genome(self) -> pd.DataFrame: return self[node] -# class GenesStates: -# """ -# Load genome (gene) states files and access gene by record name +class GenesStates: + """ + Load genome (gene) states files and access gene by record name -# tips: -# - use mode="dict" if your mtDNA tree is small (~1500 nodes x 10,000 nucleotides require ~350MB of RAM); -# big trees with 100,000 nodes require tens of GB of RAM, therefore use mode="db" - -# Arguments -# --------- -# path_states: list or iterable of string -# files with states; if some records not presented in all files table will be filled by '-' (gaps) -# TODO -# states_fmt: string -# format of files aligment; supported any format from biopython (fasta, phylip, etc.) + tips: + - use mode="dict" if your mtDNA tree is small (~1500 nodes x 10,000 nucleotides require ~350MB of RAM); + big trees with 100,000 nodes require tens of GB of RAM, therefore use mode="db" + + Arguments + --------- + path_states: list or iterable of string + files with states; if some records not presented in all files table will be filled by '-' (gaps) + TODO + states_fmt: string + format of files aligment; supported any format from biopython (fasta, phylip, etc.) -# Return -# --------- -# states_obj: GenesStates - + Return + --------- + states_obj: GenesStates - -# """ -# def __init__( -# self, path_states: List[str], path_to_db=None, mode="dict", rewrite=False, -# use_proba=True, path_to_rates=None, cat_cutoff=1, states_fmt="table", -# ): -# fmt_variants = ["fasta", "phylip", "table"] -# if states_fmt not in fmt_variants: -# raise ValueError(f"Appropriate states_fmt: {repr(fmt_variants)}") -# if states_fmt != "table" and use_proba == True: -# print("Ignore use_proba option and forcely set use_proba=False due to input alignment that doesn't contain probabilities", file=sys.stderr) -# use_proba = False + """ + def __init__( + self, path_states: List[str], path_to_db=None, mode="dict", rewrite=False, + use_proba=True, path_to_rates=None, cat_cutoff=1, states_fmt="table", + ): + fmt_variants = ["fasta", "phylip", "table"] + if states_fmt not in fmt_variants: + raise ValueError(f"Appropriate states_fmt: {repr(fmt_variants)}") + if states_fmt != "table" and use_proba == True: + print("Ignore use_proba option and forcely set use_proba=False due to input alignment that doesn't contain probabilities", file=sys.stderr) + use_proba = False -# self.mode = mode -# self.input_states_fmt = states_fmt -# self.use_proba = use_proba -# print(f"Genome states storage mode = '{mode}'", file=sys.stderr) -# self.path_to_db = path_to_db -# if mode == "dict": -# if states_fmt == "table": -# self._prepare_node2genome(path_states) -# else: -# states = self.read_alignment(path_states) -# self._prepare_node2genome(states) -# elif mode == "db": -# if states_fmt in ["fasta", "phylip"]: -# raise NotImplementedError + self.mode = mode + self.input_states_fmt = states_fmt + self.use_proba = use_proba + print(f"Genome states storage mode = '{mode}'", file=sys.stderr) + self.path_to_db = path_to_db + if mode == "dict": + if states_fmt == "table": + self._prepare_node2genome(path_states) + else: + states = self.read_alignment(path_states) + self._prepare_node2genome(states) + elif mode == "db": + if states_fmt in ["fasta", "phylip"]: + raise NotImplementedError -# if path_to_db is None: -# raise ValueError("Pass the path to database to use or write it") -# else: -# self._prepare_db(path_states, rewrite) -# else: -# raise ValueError("Mode must be 'dict' or 'db'") + if path_to_db is None: + raise ValueError("Pass the path to database to use or write it") + else: + self._prepare_db(path_states, rewrite) + else: + raise ValueError("Mode must be 'dict' or 'db'") -# genes_sizes = {part: len(x) for part, x in self.get_random_genome().items()} -# self.genome_size = sum(genes_sizes.values()) -# self.category = self.read_rates(path_to_rates) if path_to_rates else None -# self.mask = None - -# if self.category: -# for part in genes_sizes: -# if genes_sizes[part] != len(self.category[part]): -# raise RuntimeError("Wrong rates: number of positions in alignment are not equal to number of rates", file=sys.stderr) -# self.mask = self.get_mask(self.category, cat_cutoff) + genes_sizes = {part: len(x) for part, x in self.get_random_genome().items()} + self.genome_size = sum(genes_sizes.values()) + self.category = self.read_rates(path_to_rates) if path_to_rates else None + self.mask = None + + if self.category: + for part in genes_sizes: + if genes_sizes[part] != len(self.category[part]): + raise RuntimeError("Wrong rates: number of positions in alignment are not equal to number of rates", file=sys.stderr) + self.mask = self.get_mask(self.category, cat_cutoff) -# def get_genome(self, node: str): -# if self.mode == "dict": -# return self.node2genome[node] + def get_genome(self, node: str): + if self.mode == "dict": + return self.node2genome[node] -# elif self.mode == "db": -# genome_raw = defaultdict(list) -# cur = self.con.cursor() -# if self.use_proba: -# query = f"""SELECT Part, Site, p_A, p_C, p_G, p_T FROM states WHERE Node='{node}'""" -# dtype = [ -# ("Site", np.int32), -# ("p_A", np.float32), ("p_C", np.float32), -# ("p_G", np.float32), ("p_T", np.float32), -# ] -# else: -# query = f"""SELECT Part, Site, State FROM states WHERE Node='{node}'""" -# dtype = [("Site", np.int32), ("State", np.object_)] - -# for row in cur.execute(query): -# part = str(row[0]) -# state = row[1:] -# genome_raw[part].append(state) + elif self.mode == "db": + genome_raw = defaultdict(list) + cur = self.con.cursor() + if self.use_proba: + query = f"""SELECT Part, Site, p_A, p_C, p_G, p_T FROM states WHERE Node='{node}'""" + dtype = [ + ("Site", np.int32), + ("p_A", np.float32), ("p_C", np.float32), + ("p_G", np.float32), ("p_T", np.float32), + ] + else: + query = f"""SELECT Part, Site, State FROM states WHERE Node='{node}'""" + dtype = [("Site", np.int32), ("State", np.object_)] + + for row in cur.execute(query): + part = str(row[0]) + state = row[1:] + genome_raw[part].append(state) -# genome = {} -# for part, state_tup in genome_raw.items(): -# state = np.array(state_tup, dtype=dtype) -# state_sorted = np.sort(state, order="Site") -# state_sorted_devoid = np.array(list(map(list, state_sorted))) -# state = state_sorted_devoid[:, 1:] if self.use_proba else state_sorted_devoid[:, 1] -# genome[part] = state -# return genome - -# else: -# raise ValueError("mode must be 'dict' or 'db'") + genome = {} + for part, state_tup in genome_raw.items(): + state = np.array(state_tup, dtype=dtype) + state_sorted = np.sort(state, order="Site") + state_sorted_devoid = np.array(list(map(list, state_sorted))) + state = state_sorted_devoid[:, 1:] if self.use_proba else state_sorted_devoid[:, 1] + genome[part] = state + return genome + + else: + raise ValueError("mode must be 'dict' or 'db'") -# def get_random_genome(self): -# node = next(iter(self.nodes)) -# return self.get_genome(node) - -# def _prepare_db(self, path_states, rewrite=False): -# """sequentially read tsv and write to db""" -# done = False -# if os.path.exists(self.path_to_db): -# if rewrite: -# os.remove(self.path_to_db) -# else: -# print(f"Loading existing database from {self.path_to_db}", file=sys.stderr) -# done = True -# try: -# con = sqlite3.connect(self.path_to_db) -# except: -# raise ValueError("Cannot connect to database by given path") + def get_random_genome(self): + node = next(iter(self.nodes)) + return self.get_genome(node) + + def _prepare_db(self, path_states, rewrite=False): + """sequentially read tsv and write to db""" + done = False + if os.path.exists(self.path_to_db): + if rewrite: + os.remove(self.path_to_db) + else: + print(f"Loading existing database from {self.path_to_db}", file=sys.stderr) + done = True + try: + con = sqlite3.connect(self.path_to_db) + except: + raise ValueError("Cannot connect to database by given path") -# cur = con.cursor() -# self.con = con -# if done: -# self.nodes = self._get_nodes_from_db() -# return - -# print("Writing new database file", file=sys.stderr) -# cur.execute('''CREATE TABLE states -# (Node TEXT, Part INTEGER, Site INTEGER, State TEXT, p_A REAL, p_C REAL, p_G REAL, p_T REAL)''' -# ) -# for path in path_states: -# if path is None: -# continue -# print(f"Loading {path} ...", file=sys.stderr) -# handle = open(path, "r") -# header = handle.readline().strip() -# if header != "Node\tPart\tSite\tState\tp_A\tp_C\tp_G\tp_T": -# handle.close() -# con.close() -# raise ValueError(f"Inappropriate type of table, expected another columns order,\ngot {repr(header)}") - -# for line in tqdm.tqdm(handle, total=8652300): # TODO estimate total -# row = line.strip().split() -# query = "INSERT INTO states VALUES ('{}',{},{},'{}',{},{},{},{})".format(*row) -# cur.execute(query) - -# con.commit() -# handle.close() - -# self.nodes = self._get_nodes_from_db() -# # con.close() + cur = con.cursor() + self.con = con + if done: + self.nodes = self._get_nodes_from_db() + return + + print("Writing new database file", file=sys.stderr) + cur.execute('''CREATE TABLE states + (Node TEXT, Part INTEGER, Site INTEGER, State TEXT, p_A REAL, p_C REAL, p_G REAL, p_T REAL)''' + ) + for path in path_states: + if path is None: + continue + print(f"Loading {path} ...", file=sys.stderr) + handle = open(path, "r") + header = handle.readline().strip() + if header != "Node\tPart\tSite\tState\tp_A\tp_C\tp_G\tp_T": + handle.close() + con.close() + raise ValueError(f"Inappropriate type of table, expected another columns order,\ngot {repr(header)}") + + for line in tqdm.tqdm(handle, total=8652300): # TODO estimate total + row = line.strip().split() + query = "INSERT INTO states VALUES ('{}',{},{},'{}',{},{},{},{})".format(*row) + cur.execute(query) + + con.commit() + handle.close() + + self.nodes = self._get_nodes_from_db() + # con.close() -# def _get_nodes_from_db(self): -# nodes = set() -# cur = self.con.cursor() -# for node in cur.execute("SELECT DISTINCT Node from states"): -# nodes.add(node[0]) -# return nodes + def _get_nodes_from_db(self): + nodes = set() + cur = self.con.cursor() + for node in cur.execute("SELECT DISTINCT Node from states"): + nodes.add(node[0]) + return nodes -# def states2dct(self, states: pd.DataFrame, out=None): -# # all genes (genomes) must have same length -# aln_sizes = states.groupby("Node").apply(len) -# assert aln_sizes.nunique() == 1, "uncomplete state table: some site states absent in some node genomes" + def states2dct(self, states: pd.DataFrame, out=None): + # all genes (genomes) must have same length + aln_sizes = states.groupby("Node").apply(len) + assert aln_sizes.nunique() == 1, "uncomplete state table: some site states absent in some node genomes" -# node2genome = defaultdict(dict) if out is None else out -# gr = states.sort_values(["Node", "Part", "Site"]).groupby(["Node", "Part"]) -# for (node, part), gene_df in gr: -# if self.use_proba: -# gene_states = gene_df[["p_A", "p_C", "p_G", "p_T"]].values -# else: -# gene_states = gene_df.State.values -# node2genome[node][part] = gene_states -# return node2genome - -# def _prepare_node2genome(self, path_states, states_dtype=np.float32): -# node2genome = defaultdict(dict) -# if isinstance(path_states, pd.DataFrame): -# states = path_states -# self.states2dct(states, node2genome) -# elif isinstance(path_states, (list, tuple)): -# dtypes = { -# "p_A": states_dtype, "p_C": states_dtype, -# "p_G": states_dtype, "p_T": states_dtype, -# "Site": np.int32, "Node": str, "Part": str, -# } -# if self.use_proba: -# usecols = ["Node", "Part", "Site", "p_A", "p_C", "p_G", "p_T"] -# else: -# usecols = ["Node", "Part", "Site", "State"] - -# for path in path_states: -# if path is None: -# continue -# print(f"Loading {path}...", file=sys.stderr) -# states = pd.read_csv(path, sep="\t", comment='#', usecols=usecols, dtype=dtypes) -# self.states2dct(states, node2genome) - -# # aln_sizes = states.groupby("Node").apply(len) -# # assert aln_sizes.nunique() == 1, "uncomplete leaves state table" -# # states = states.sort_values(["Node", "Part", "Site"]) -# # gr = states.groupby(["Node", "Part"]) -# # for (node, part), gene_pos_ids in tqdm.tqdm(gr.groups.items()): -# # gene_df = states.loc[gene_pos_ids] # TODO simplify: iterate over gr -# # if self.use_proba: -# # gene_states = gene_df[["p_A", "p_C", "p_G", "p_T"]].values -# # else: -# # gene_states = gene_df.State.values -# # node2genome[node][part] = gene_states -# else: - -# raise NotImplementedError('path_to_states must be list of paths or pd.DataFrame') + node2genome = defaultdict(dict) if out is None else out + gr = states.sort_values(["Node", "Part", "Site"]).groupby(["Node", "Part"]) + for (node, part), gene_df in gr: + if self.use_proba: + gene_states = gene_df[["p_A", "p_C", "p_G", "p_T"]].values + else: + gene_states = gene_df.State.values + node2genome[node][part] = gene_states + return node2genome + + def _prepare_node2genome(self, path_states, states_dtype=np.float32): + node2genome = defaultdict(dict) + if isinstance(path_states, pd.DataFrame): + states = path_states + self.states2dct(states, node2genome) + elif isinstance(path_states, (list, tuple)): + dtypes = { + "p_A": states_dtype, "p_C": states_dtype, + "p_G": states_dtype, "p_T": states_dtype, + "Site": np.int32, "Node": str, "Part": str, + } + if self.use_proba: + usecols = ["Node", "Part", "Site", "p_A", "p_C", "p_G", "p_T"] + else: + usecols = ["Node", "Part", "Site", "State"] + + for path in path_states: + if path is None: + continue + print(f"Loading {path}...", file=sys.stderr) + states = pd.read_csv(path, sep="\t", comment='#', usecols=usecols, dtype=dtypes) + self.states2dct(states, node2genome) + + # aln_sizes = states.groupby("Node").apply(len) + # assert aln_sizes.nunique() == 1, "uncomplete leaves state table" + # states = states.sort_values(["Node", "Part", "Site"]) + # gr = states.groupby(["Node", "Part"]) + # for (node, part), gene_pos_ids in tqdm.tqdm(gr.groups.items()): + # gene_df = states.loc[gene_pos_ids] # TODO simplify: iterate over gr + # if self.use_proba: + # gene_states = gene_df[["p_A", "p_C", "p_G", "p_T"]].values + # else: + # gene_states = gene_df.State.values + # node2genome[node][part] = gene_states + else: + + raise NotImplementedError('path_to_states must be list of paths or pd.DataFrame') -# self.node2genome = node2genome -# self.nodes = set(node2genome.keys()) - -# @staticmethod -# def read_alignment(files: list, fmt="fasta"): -# """ -# Read files alignments and prepare states table - -# Arguments -# --------- -# files: list or iterable of string -# files with alignments; if some records not presented in all files table will be filled by '-' (gaps) -# fmt: string -# format of files aligment; supported any format from biopython (fasta, phylip, etc.) + self.node2genome = node2genome + self.nodes = set(node2genome.keys()) + + @staticmethod + def read_alignment(files: list, fmt="fasta"): + """ + Read files alignments and prepare states table + + Arguments + --------- + files: list or iterable of string + files with alignments; if some records not presented in all files table will be filled by '-' (gaps) + fmt: string + format of files aligment; supported any format from biopython (fasta, phylip, etc.) -# Return -# --------- -# states: pd.DataFrame -# """ -# ngenes = len(files) -# columns = "Node Part Site State p_A p_C p_G p_T".split() -# nucls = "ACGT" -# aln_lens = dict() -# files = set(files) -# history = defaultdict(list) -# data = [] -# for filepath in files: -# part = "1" if ngenes == 1 else os.path.basename(filepath).replace(".fna", "") -# alignment = SeqIO.parse(filepath, fmt) -# for rec in alignment: -# node = rec.name -# history[node].append(part) -# seq = str(rec.seq) -# for site, state in enumerate(seq, 1): -# site_data = [node, part, site, state] -# for nucl in nucls: -# p = int(nucl == state) -# site_data.append(p) -# data.append(site_data) -# aln_lens[part] = len(seq) - -# if ngenes > 1: -# # get max set of parts. Only for multiple files -# for node, parts in history.items(): -# if len(parts) == ngenes: -# full_parts = parts.copy() -# break - -# # fill missing genes by '-'. Only for multiple files -# for node, parts in history.items(): -# if len(parts) != ngenes: -# unseen_parts = set(full_parts).difference(parts) -# for unp in unseen_parts: -# print(f"Gap filling for node {node}, part {unp}...", file=sys.stderr) -# for site in range(1, aln_lens[unp] + 1): -# site_data = [node, unp, site, "-", 0, 0, 0, 0] -# data.append(site_data) - -# df = pd.DataFrame(data, columns=columns) -# return df - -# @staticmethod -# def read_rates(path: str): -# """ TODO write for many genes """ -# category = read_rates(path) -# category = {"1": category} # for each aln part -# return category + Return + --------- + states: pd.DataFrame + """ + ngenes = len(files) + columns = "Node Part Site State p_A p_C p_G p_T".split() + nucls = "ACGT" + aln_lens = dict() + files = set(files) + history = defaultdict(list) + data = [] + for filepath in files: + part = "1" if ngenes == 1 else os.path.basename(filepath).replace(".fna", "") + alignment = SeqIO.parse(filepath, fmt) + for rec in alignment: + node = rec.name + history[node].append(part) + seq = str(rec.seq) + for site, state in enumerate(seq, 1): + site_data = [node, part, site, state] + for nucl in nucls: + p = int(nucl == state) + site_data.append(p) + data.append(site_data) + aln_lens[part] = len(seq) + + if ngenes > 1: + # get max set of parts. Only for multiple files + for node, parts in history.items(): + if len(parts) == ngenes: + full_parts = parts.copy() + break + + # fill missing genes by '-'. Only for multiple files + for node, parts in history.items(): + if len(parts) != ngenes: + unseen_parts = set(full_parts).difference(parts) + for unp in unseen_parts: + print(f"Gap filling for node {node}, part {unp}...", file=sys.stderr) + for site in range(1, aln_lens[unp] + 1): + site_data = [node, unp, site, "-", 0, 0, 0, 0] + data.append(site_data) + + df = pd.DataFrame(data, columns=columns) + return df + + @staticmethod + def read_rates(path: str): + """ TODO write for many genes """ + category = read_rates(path) + category = {"1": category} # for each aln part + return category -# @staticmethod -# def get_mask(category: dict, cat_cutoff=1): -# mask = {part: np.array(cats) >= cat_cutoff for part, cats in category.items()} -# return mask + @staticmethod + def get_mask(category: dict, cat_cutoff=1): + mask = {part: np.array(cats) >= cat_cutoff for part, cats in category.items()} + return mask -# def close_db(self): -# self.con.close() + def close_db(self): + self.con.close() def read_rates(path: str): From b07c9d23c0e51aa893b4b8b6f2c67beb8e13bb23 Mon Sep 17 00:00:00 2001 From: kpotoh Date: Fri, 28 Mar 2025 14:53:58 +0200 Subject: [PATCH 05/14] drop useless versions from reqs --- requirements.txt | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/requirements.txt b/requirements.txt index ba98d4d..f2b1047 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,12 +1,12 @@ -numpy~=1.22.2 -pandas~=1.4.0 +numpy<2.0 +pandas<2.0 matplotlib~=3.5.1 seaborn==0.12.2 -biopython==1.79 -ete3==3.1.2 -tqdm~=4.62.3 -PyYAML~=6.0 -pytest~=7.1.2 -click~=8.1.3 +biopython>=1.75 +ete3>=3 +tqdm +PyYAML +pytest +click # scipy~=1.8.1 # statsmodels~=0.13.2 From d018cc49e89f3b60560f58fb95b5f7a2040c5726 Mon Sep 17 00:00:00 2001 From: kpotoh Date: Fri, 28 Mar 2025 16:09:47 +0200 Subject: [PATCH 06/14] delete _legacy with old genes_states --- pymutspec/_legacy/states.py | 298 ------------------------------------ 1 file changed, 298 deletions(-) delete mode 100644 pymutspec/_legacy/states.py diff --git a/pymutspec/_legacy/states.py b/pymutspec/_legacy/states.py deleted file mode 100644 index a1dacb4..0000000 --- a/pymutspec/_legacy/states.py +++ /dev/null @@ -1,298 +0,0 @@ -import os -import sys -import sqlite3 -from collections import defaultdict -from typing import List - -import numpy as np -import pandas as pd -from Bio import SeqIO -import tqdm - - - -class OldGenesStates: - """ - Load genome (gene) states files and access gene by record name - - tips: - - use mode="dict" if your mtDNA tree is small (~1500 nodes x 10,000 nucleotides require ~350MB of RAM); - big trees with 100,000 nodes require tens of GB of RAM, therefore use mode="db" - - Arguments - --------- - path_states: list or iterable of string - files with states; if some records not presented in all files table will be filled by '-' (gaps) - TODO - states_fmt: string - format of files aligment; supported any format from biopython (fasta, phylip, etc.) - - Return - --------- - states_obj: GenesStates - - - """ - def __init__( - self, path_states: List[str], path_to_db=None, mode="dict", rewrite=False, - use_proba=True, path_to_rates=None, cat_cutoff=1, states_fmt="table", - ): - fmt_variants = ["fasta", "phylip", "table"] - if states_fmt not in fmt_variants: - raise ValueError(f"Appropriate states_fmt: {repr(fmt_variants)}") - if states_fmt != "table" and use_proba == True: - print("Ignore use_proba option and forcely set use_proba=False due to input alignment that doesn't contain probabilities", file=sys.stderr) - use_proba = False - - self.mode = mode - self.input_states_fmt = states_fmt - self.use_proba = use_proba - print(f"Genome states storage mode = '{mode}'", file=sys.stderr) - self.path_to_db = path_to_db - if mode == "dict": - if states_fmt == "table": - self._prepare_node2genome(path_states) - else: - states = self.read_alignment(path_states) - self._prepare_node2genome(states) - elif mode == "db": - if states_fmt in ["fasta", "phylip"]: - raise NotImplementedError - - if path_to_db is None: - raise ValueError("Pass the path to database to use or write it") - else: - self._prepare_db(path_states, rewrite) - else: - raise ValueError("Mode must be 'dict' or 'db'") - - genes_sizes = {part: len(x) for part, x in self.get_random_genome().items()} - self.genome_size = sum(genes_sizes.values()) - self.category = self.read_rates(path_to_rates) if path_to_rates else None - self.mask = None - - if self.category: - for part in genes_sizes: - if genes_sizes[part] != len(self.category[part]): - raise RuntimeError("Wrong rates: number of positions in alignment are not equal to number of rates", file=sys.stderr) - self.mask = self.get_mask(self.category, cat_cutoff) - - def get_genome(self, node: str): - if self.mode == "dict": - return self.node2genome[node] - - elif self.mode == "db": - genome_raw = defaultdict(list) - cur = self.con.cursor() - if self.use_proba: - query = f"""SELECT Part, Site, p_A, p_C, p_G, p_T FROM states WHERE Node='{node}'""" - dtype = [ - ("Site", np.int32), - ("p_A", np.float32), ("p_C", np.float32), - ("p_G", np.float32), ("p_T", np.float32), - ] - else: - query = f"""SELECT Part, Site, State FROM states WHERE Node='{node}'""" - dtype = [("Site", np.int32), ("State", np.object_)] - - for row in cur.execute(query): - part = str(row[0]) - state = row[1:] - genome_raw[part].append(state) - - genome = {} - for part, state_tup in genome_raw.items(): - state = np.array(state_tup, dtype=dtype) - state_sorted = np.sort(state, order="Site") - state_sorted_devoid = np.array(list(map(list, state_sorted))) - state = state_sorted_devoid[:, 1:] if self.use_proba else state_sorted_devoid[:, 1] - genome[part] = state - return genome - - else: - raise ValueError("mode must be 'dict' or 'db'") - - def get_random_genome(self): - node = next(iter(self.nodes)) - return self.get_genome(node) - - def _prepare_db(self, path_states, rewrite=False): - """sequentially read tsv and write to db""" - done = False - if os.path.exists(self.path_to_db): - if rewrite: - os.remove(self.path_to_db) - else: - print(f"Loading existing database from {self.path_to_db}", file=sys.stderr) - done = True - try: - con = sqlite3.connect(self.path_to_db) - except: - raise ValueError("Cannot connect to database by given path") - - cur = con.cursor() - self.con = con - if done: - self.nodes = self._get_nodes_from_db() - return - - print("Writing new database file", file=sys.stderr) - cur.execute('''CREATE TABLE states - (Node TEXT, Part INTEGER, Site INTEGER, State TEXT, p_A REAL, p_C REAL, p_G REAL, p_T REAL)''' - ) - for path in path_states: - if path is None: - continue - print(f"Loading {path} ...", file=sys.stderr) - handle = open(path, "r") - header = handle.readline().strip() - if header != "Node\tPart\tSite\tState\tp_A\tp_C\tp_G\tp_T": - handle.close() - con.close() - raise ValueError(f"Inappropriate type of table, expected another columns order,\ngot {repr(header)}") - - for line in tqdm.tqdm(handle, total=8652300): # TODO estimate total - row = line.strip().split() - query = "INSERT INTO states VALUES ('{}',{},{},'{}',{},{},{},{})".format(*row) - cur.execute(query) - - con.commit() - handle.close() - - self.nodes = self._get_nodes_from_db() - # con.close() - - def _get_nodes_from_db(self): - nodes = set() - cur = self.con.cursor() - for node in cur.execute("SELECT DISTINCT Node from states"): - nodes.add(node[0]) - return nodes - - def states2dct(self, states: pd.DataFrame, out=None): - # all genes (genomes) must have same length - aln_sizes = states.groupby("Node").apply(len) - assert aln_sizes.nunique() == 1, "uncomplete state table: some site states absent in some node genomes" - - node2genome = defaultdict(dict) if out is None else out - gr = states.sort_values(["Node", "Part", "Site"]).groupby(["Node", "Part"]) - for (node, part), gene_df in gr: - if self.use_proba: - gene_states = gene_df[["p_A", "p_C", "p_G", "p_T"]].values - else: - gene_states = gene_df.State.values - node2genome[node][part] = gene_states - return node2genome - - def _prepare_node2genome(self, path_states, states_dtype=np.float32): - node2genome = defaultdict(dict) - if isinstance(path_states, pd.DataFrame): - states = path_states - self.states2dct(states, node2genome) - elif isinstance(path_states, (list, tuple)): - dtypes = { - "p_A": states_dtype, "p_C": states_dtype, - "p_G": states_dtype, "p_T": states_dtype, - "Site": np.int32, "Node": str, "Part": str, - } - if self.use_proba: - usecols = ["Node", "Part", "Site", "p_A", "p_C", "p_G", "p_T"] - else: - usecols = ["Node", "Part", "Site", "State"] - - for path in path_states: - if path is None: - continue - print(f"Loading {path}...", file=sys.stderr) - states = pd.read_csv(path, sep="\t", comment='#', usecols=usecols, dtype=dtypes) - self.states2dct(states, node2genome) - - # aln_sizes = states.groupby("Node").apply(len) - # assert aln_sizes.nunique() == 1, "uncomplete leaves state table" - # states = states.sort_values(["Node", "Part", "Site"]) - # gr = states.groupby(["Node", "Part"]) - # for (node, part), gene_pos_ids in tqdm.tqdm(gr.groups.items()): - # gene_df = states.loc[gene_pos_ids] # TODO simplify: iterate over gr - # if self.use_proba: - # gene_states = gene_df[["p_A", "p_C", "p_G", "p_T"]].values - # else: - # gene_states = gene_df.State.values - # node2genome[node][part] = gene_states - else: - - raise NotImplementedError('path_to_states must be list of paths or pd.DataFrame') - - self.node2genome = node2genome - self.nodes = set(node2genome.keys()) - - @staticmethod - def read_alignment(files: list, fmt="fasta"): - """ - Read files alignments and prepare states table - - Arguments - --------- - files: list or iterable of string - files with alignments; if some records not presented in all files table will be filled by '-' (gaps) - fmt: string - format of files aligment; supported any format from biopython (fasta, phylip, etc.) - - Return - --------- - states: pd.DataFrame - """ - ngenes = len(files) - columns = "Node Part Site State p_A p_C p_G p_T".split() - nucls = "ACGT" - aln_lens = dict() - files = set(files) - history = defaultdict(list) - data = [] - for filepath in files: - part = "1" if ngenes == 1 else os.path.basename(filepath).replace(".fna", "") - alignment = SeqIO.parse(filepath, fmt) - for rec in alignment: - node = rec.name - history[node].append(part) - seq = str(rec.seq) - for site, state in enumerate(seq, 1): - site_data = [node, part, site, state] - for nucl in nucls: - p = int(nucl == state) - site_data.append(p) - data.append(site_data) - aln_lens[part] = len(seq) - - if ngenes > 1: - # get max set of parts. Only for multiple files - for node, parts in history.items(): - if len(parts) == ngenes: - full_parts = parts.copy() - break - - # fill missing genes by '-'. Only for multiple files - for node, parts in history.items(): - if len(parts) != ngenes: - unseen_parts = set(full_parts).difference(parts) - for unp in unseen_parts: - print(f"Gap filling for node {node}, part {unp}...", file=sys.stderr) - for site in range(1, aln_lens[unp] + 1): - site_data = [node, unp, site, "-", 0, 0, 0, 0] - data.append(site_data) - - df = pd.DataFrame(data, columns=columns) - return df - - @staticmethod - def read_rates(path: str): - """ TODO write for many genes """ - category = read_rates(path) - category = {"1": category} # for each aln part - return category - - @staticmethod - def get_mask(category: dict, cat_cutoff=1): - mask = {part: np.array(cats) >= cat_cutoff for part, cats in category.items()} - return mask - - def close_db(self): - self.con.close() From 841bd2e89607f030699420a48914c36f44c8f270 Mon Sep 17 00:00:00 2001 From: kpotoh Date: Fri, 28 Mar 2025 16:51:54 +0200 Subject: [PATCH 07/14] delete setup.py. All into pyproject.toml --- pymutspec/__init__.py | 2 +- pyproject.toml | 19 ++++++++++--------- requirements.dev.txt | 18 +++++++++--------- requirements.txt | 12 ------------ setup.py | 15 --------------- 5 files changed, 20 insertions(+), 46 deletions(-) delete mode 100644 requirements.txt delete mode 100644 setup.py diff --git a/pymutspec/__init__.py b/pymutspec/__init__.py index 7d0b805..d08d943 100644 --- a/pymutspec/__init__.py +++ b/pymutspec/__init__.py @@ -1,4 +1,4 @@ from pymutspec import annotation, draw, io, constants, utils # Define the pymutspec version -__version__ = "0.0.12" \ No newline at end of file +__version__ = "0.0.13" \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 7c93600..f6ed865 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "PyMutSpec" -version = "0.0.12" +version = "0.0.13" description = "Mutational spectra analysis package" authors = [ { name="kpotoh", email="axepon@mail.ru" }, @@ -25,16 +25,17 @@ classifiers = [ "License :: OSI Approved :: MIT License", ] dependencies = [ - "numpy~=1.22.2", - "pandas~=1.4.0", + "numpy<2.0", + "pandas<2.0", "seaborn==0.12.2", - "biopython==1.79", - "ete3==3.1.2", - "tqdm~=4.62.3", - "PyYAML~=6.0", - "pytest~=7.1.2", - "click~=8.1.3", + "biopython>=1.75", + "ete3>=3", + "tqdm", + "PyYAML", + "pytest", + "click", ] +# TODO incluce scripts [project.urls] "Homepage" = "https://github.com/mitoclub/PyMutSpec" diff --git a/requirements.dev.txt b/requirements.dev.txt index 470e307..eabcf45 100644 --- a/requirements.dev.txt +++ b/requirements.dev.txt @@ -1,13 +1,13 @@ -numpy~=1.22.2 -pandas~=1.4.0 +numpy<2.0 +pandas<2.0 matplotlib~=3.5.1 -seaborn~=0.12.2 -biopython~=1.79 -ete3~=3.1.2 -tqdm~=4.62.3 -PyYAML~=6.0 -pytest~=7.1.2 -click~=8.1.3 +seaborn==0.12.2 +biopython>=1.75 +ete3>=3 +tqdm +PyYAML +pytest +click # scipy~=1.8.1 # statsmodels~=0.13.2 pytest diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index f2b1047..0000000 --- a/requirements.txt +++ /dev/null @@ -1,12 +0,0 @@ -numpy<2.0 -pandas<2.0 -matplotlib~=3.5.1 -seaborn==0.12.2 -biopython>=1.75 -ete3>=3 -tqdm -PyYAML -pytest -click -# scipy~=1.8.1 -# statsmodels~=0.13.2 diff --git a/setup.py b/setup.py deleted file mode 100644 index e264b64..0000000 --- a/setup.py +++ /dev/null @@ -1,15 +0,0 @@ -import os -from setuptools import setup, find_packages - -scripts = [] -if os.path.exists("scripts"): - for file in os.listdir("scripts"): - if file.endswith(".py") and not file.startswith("_"): - fp = os.path.join("scripts", file) - scripts.append(fp) - -setup( - packages=find_packages(), - scripts=scripts, - package_data={'pymutspec': ['utils/configs/log_settings.yaml']}, -) From 7571ac5c85edd0a51224d08608ab1540fecb87c0 Mon Sep 17 00:00:00 2001 From: kpotoh Date: Mon, 31 Mar 2025 16:17:52 +0200 Subject: [PATCH 08/14] fix pyproject.toml and add setup.py for compartibility; scripts fixed and now all have main func --- .github/workflows/python-package.yml | 3 +-- LICENSE | 2 +- pyproject.toml | 23 ++++++++++++++++++----- scripts/iqtree_states_add_part.py | 13 +++++++------ scripts/raxml_states2iqtree_states.py | 12 +++++++----- scripts/rename_internal_nodes.py | 14 ++++++++------ setup.py | 3 +++ 7 files changed, 45 insertions(+), 25 deletions(-) create mode 100644 setup.py diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index bc71bb9..8000090 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -27,8 +27,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - python -m pip install flake8 pytest - python setup.py install + pip install ".[dev]" - name: Lint with flake8 run: | # stop the build if there are Python syntax errors or undefined names diff --git a/LICENSE b/LICENSE index 335ea9d..b6f22dd 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2018 The Python Packaging Authority +Copyright (c) 2025 mitofungen.com Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/pyproject.toml b/pyproject.toml index f6ed865..1146ec0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,14 +4,15 @@ build-backend = "setuptools.build_meta" [project] name = "PyMutSpec" -version = "0.0.13" +dynamic = ["version"] description = "Mutational spectra analysis package" authors = [ { name="kpotoh", email="axepon@mail.ru" }, ] -license = { file = "LICENSE" } +license = "MIT" +license-files = ["LICEN[CS]E"] readme = {file = "README.md", content-type = "text/markdown"} -requires-python = ">=3.8" +keywords = ["Bioinformatics", "Genetics", "Mutational spectra"] classifiers = [ "Operating System :: OS Independent", "Development Status :: 3 - Alpha", @@ -22,8 +23,8 @@ classifiers = [ "Topic :: Scientific/Engineering", "Topic :: Scientific/Engineering :: Bio-Informatics", "Topic :: Software Development :: Libraries :: Python Modules", - "License :: OSI Approved :: MIT License", ] +requires-python = ">=3.8" dependencies = [ "numpy<2.0", "pandas<2.0", @@ -35,8 +36,20 @@ dependencies = [ "pytest", "click", ] -# TODO incluce scripts + +[project.optional-dependencies] +dev = ["pytest", "pytest-cov", "flake8"] [project.urls] "Homepage" = "https://github.com/mitoclub/PyMutSpec" "Bug Tracker" = "https://github.com/mitoclub/PyMutSpec/issues" + +[tool.setuptools] +packages = ["pymutspec"] +script-files = ["scripts/multifasta_coding.py", "scripts/alignment2iqtree_states.py", + "scripts/select_records.py", "scripts/iqtree_states_add_part.py", + "scripts/collect_mutations.py", "scripts/calculate_mutspec.py"] + + +[tool.setuptools.dynamic] +version = {attr = "pymutspec.__version__"} # any module attribute compatible with ast.literal_eval diff --git a/scripts/iqtree_states_add_part.py b/scripts/iqtree_states_add_part.py index 1585b77..a5bbe24 100644 --- a/scripts/iqtree_states_add_part.py +++ b/scripts/iqtree_states_add_part.py @@ -6,7 +6,12 @@ import sys -def main(path_to_states, path_to_out): +def main(): + try: + path_to_states, path_to_out = sys.argv[1:] + except: + print("ERROR\nUSAGE: script.py path_to_states path_to_out_states", file=sys.stderr) + with open(path_to_out, "w") as fout: with open(path_to_states) as fin: for line in fin: @@ -22,8 +27,4 @@ def main(path_to_states, path_to_out): if __name__ == "__main__": - try: - path_to_states, path_to_out = sys.argv[1:] - main(path_to_states, path_to_out) - except: - print("ERROR\nUSAGE: script.py path_to_states path_to_out_states", file=sys.stderr) + main() diff --git a/scripts/raxml_states2iqtree_states.py b/scripts/raxml_states2iqtree_states.py index 8f1deda..d7df672 100644 --- a/scripts/raxml_states2iqtree_states.py +++ b/scripts/raxml_states2iqtree_states.py @@ -23,7 +23,12 @@ def agrmax(a): return imax -def main(path_to_raxml_states, out_path): +def main(): + try: + path_to_raxml_states, out_path = sys.argv[1], sys.argv[2] + except: + RuntimeError("USAGE: script.py PATH_TO_RAxML_STATES PATH_TO_OUT_IQTREE_STATES") + with open(out_path, "w") as fout: fout.write("\t".join(header) + "\n") with open(path_to_raxml_states) as fin: @@ -45,7 +50,4 @@ def main(path_to_raxml_states, out_path): if __name__ == "__main__": # path_to_raxml_states = "data/RAxML_marginalAncestralProbabilities.tsv" # out_path = "data/out.state" - try: - main(sys.argv[1], sys.argv[2]) - except: - RuntimeError("USAGE: script.py PATH_TO_RAxML_STATES PATH_TO_OUT_IQTREE_STATES") + main() \ No newline at end of file diff --git a/scripts/rename_internal_nodes.py b/scripts/rename_internal_nodes.py index 3c21804..d3be9ef 100644 --- a/scripts/rename_internal_nodes.py +++ b/scripts/rename_internal_nodes.py @@ -13,8 +13,13 @@ def node_parent(node: PhyloNode): return None -def main(path_to_dist_tree, path_to_named_tree, path_to_out): +def main(): """Iteratively going from leaves to root and add names to dist tree""" + try: + _, path_to_dist_tree, path_to_named_tree, path_to_out = sys.argv + except: + print("ERROR\nUSAGE: script.py path_to_dist_tree path_to_named_tree path_to_out_tree", file=sys.stderr) + tree_dist = PhyloTree(path_to_dist_tree, format=0) tree_named = PhyloTree(path_to_named_tree, format=8) @@ -45,8 +50,5 @@ def main(path_to_dist_tree, path_to_named_tree, path_to_out): if __name__ == "__main__": - try: - _, path_to_dist_tree, path_to_named_tree, path_to_out = sys.argv - main(path_to_dist_tree, path_to_named_tree, path_to_out) - except: - print("ERROR\nUSAGE: script.py path_to_dist_tree path_to_named_tree path_to_out_tree", file=sys.stderr) + main() + diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..fc1f76c --- /dev/null +++ b/setup.py @@ -0,0 +1,3 @@ +from setuptools import setup + +setup() \ No newline at end of file From 311b7762e9440ac63d0e5c417c2056acbd7bf4e2 Mon Sep 17 00:00:00 2001 From: kpotoh Date: Mon, 31 Mar 2025 16:25:45 +0200 Subject: [PATCH 09/14] fix license issue in toml --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 1146ec0..ff0608c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,8 +9,7 @@ description = "Mutational spectra analysis package" authors = [ { name="kpotoh", email="axepon@mail.ru" }, ] -license = "MIT" -license-files = ["LICEN[CS]E"] +license = {file = "LICENSE"} readme = {file = "README.md", content-type = "text/markdown"} keywords = ["Bioinformatics", "Genetics", "Mutational spectra"] classifiers = [ @@ -23,6 +22,7 @@ classifiers = [ "Topic :: Scientific/Engineering", "Topic :: Scientific/Engineering :: Bio-Informatics", "Topic :: Software Development :: Libraries :: Python Modules", + "License :: OSI Approved :: MIT License", ] requires-python = ">=3.8" dependencies = [ From 61aa42a99a76400595ab9cf823b0fb92f1a8c565 Mon Sep 17 00:00:00 2001 From: kpotoh Date: Mon, 31 Mar 2025 16:55:55 +0200 Subject: [PATCH 10/14] fixed circular imports and added changelog file --- CHANGELOG.md | 64 +++++++++++++++++++++++++++++++++ pymutspec/__init__.py | 2 +- pymutspec/annotation/mut.py | 4 +-- pymutspec/annotation/spectra.py | 2 +- pymutspec/draw/sbs_orders.py | 4 +-- pymutspec/draw/spectra.py | 2 +- pyproject.toml | 3 +- 7 files changed, 73 insertions(+), 8 deletions(-) create mode 100644 CHANGELOG.md diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..9bb100f --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,64 @@ +# CHANGELOG + + + +## 0.0.13 (2025-03-31) + +Features: + +- Added new functions for parallel mutations extraction +- Outgroup now is optional node in the tree; ingroup now is just the sister node of outgroup and outgroup is the node that grow up from the root + +Fixes: + +- Repair package; pyproject.toml is work now. Setup.py deprecated, but simplified version saved. +- circular imports fixed +- all scripts now have main func + +## 0.0.11 (2024-10-24) + +Features: + +- Edges sampling by @kpotoh in https://github.com/mitoclub/PyMutSpec/pull/9 +- added barplots with errorbars +- citation added to readme + +Fixes: + +- fixed bug that allow short sequences be used in expected freqs calculation + +**Full Changelog**: https://github.com/mitoclub/PyMutSpec/compare/0.0.10...0.0.11 + +## 0.0.10 (2024-04-14) + +Features: + +- added availability to collect non-syn mutational spectrum +- added a couple of arguments to calculate_mutspec func: scale, drop_underrepresented (this will drop mut types less than 0.9 by default) + +Fixes: + +- drop Times New Roman from default font for spectrum barplots +- deleted legacy log/doc files from repo +- added few tests for expected mutations collecting funcs +- fix scripts/calculate_mutspec.py : previously it failed when there was no mutations on branches, and reduced mutnum thresholds + +## 0.0.8 (2023-11-13) + +Features: + +- Added little tests for plot functions +- Added few new functions from analysis notebooks, that used to tree spectra analysis + +Fixes: + +- Fixed uncertainty coefficient (phylocoef) calculation: based on only ingroup and geometric mean of distances from root to leaves +- General test for collect_mutations.py main class diff --git a/pymutspec/__init__.py b/pymutspec/__init__.py index d08d943..a27b048 100644 --- a/pymutspec/__init__.py +++ b/pymutspec/__init__.py @@ -1,4 +1,4 @@ -from pymutspec import annotation, draw, io, constants, utils +from . import annotation, draw, io, constants, utils # Define the pymutspec version __version__ = "0.0.13" \ No newline at end of file diff --git a/pymutspec/annotation/mut.py b/pymutspec/annotation/mut.py index 812d837..d6d7479 100644 --- a/pymutspec/annotation/mut.py +++ b/pymutspec/annotation/mut.py @@ -11,8 +11,8 @@ from Bio.Data.CodonTable import NCBICodonTableDNA from ete3 import PhyloTree -from pymutspec.constants import * -from pymutspec.utils import basic_logger +from ..constants import * +from ..utils import basic_logger from ..io import GenomeStatesTotal from .tree import iter_tree_edges, calc_phylocoefs diff --git a/pymutspec/annotation/spectra.py b/pymutspec/annotation/spectra.py index 827613f..5d2e55c 100644 --- a/pymutspec/annotation/spectra.py +++ b/pymutspec/annotation/spectra.py @@ -4,7 +4,7 @@ import numpy as np import pandas as pd -from pymutspec.constants import ( +from ..constants import ( possible_sbs12_set, possible_sbs192_set, possible_sbs192, possible_sbs12 ) diff --git a/pymutspec/draw/sbs_orders.py b/pymutspec/draw/sbs_orders.py index 39ec257..c5ed8bc 100644 --- a/pymutspec/draw/sbs_orders.py +++ b/pymutspec/draw/sbs_orders.py @@ -1,7 +1,7 @@ import pandas as pd -from pymutspec.constants import possible_sbs192 -from pymutspec.annotation import rev_comp, transcriptor +from ..constants import possible_sbs192 +from ..annotation import rev_comp, transcriptor kk_lbls = "A>C A>G A>T C>T G>C G>T".split() cosmic_lbls = "C>A C>G C>T T>A T>C T>G".split() diff --git a/pymutspec/draw/spectra.py b/pymutspec/draw/spectra.py index 772f07d..cd4bc43 100644 --- a/pymutspec/draw/spectra.py +++ b/pymutspec/draw/spectra.py @@ -9,7 +9,7 @@ import matplotlib.pyplot as plt import seaborn as sns -from pymutspec.constants import possible_sbs192, possible_sbs96 +from ..constants import possible_sbs192, possible_sbs96 from .sbs_orders import ordered_sbs192_kk, ordered_sbs192_kp color_mapping6 = { diff --git a/pyproject.toml b/pyproject.toml index ff0608c..196cb89 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,7 +41,8 @@ dependencies = [ dev = ["pytest", "pytest-cov", "flake8"] [project.urls] -"Homepage" = "https://github.com/mitoclub/PyMutSpec" +Homepage = "https://github.com/mitoclub/PyMutSpec" +Changelog = "https://github.com/mitoclub/PyMutSpec/blob/master/CHANGELOG.md" "Bug Tracker" = "https://github.com/mitoclub/PyMutSpec/issues" [tool.setuptools] From d171524b89b7b5db8cd3a798062f0b5052674961 Mon Sep 17 00:00:00 2001 From: kpotoh Date: Mon, 31 Mar 2025 17:01:27 +0200 Subject: [PATCH 11/14] fix main init file --- pymutspec/__init__.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pymutspec/__init__.py b/pymutspec/__init__.py index a27b048..31898f5 100644 --- a/pymutspec/__init__.py +++ b/pymutspec/__init__.py @@ -1,4 +1 @@ -from . import annotation, draw, io, constants, utils - -# Define the pymutspec version __version__ = "0.0.13" \ No newline at end of file From 94374283f3713e05868f596c0bfd9ccc463db87c Mon Sep 17 00:00:00 2001 From: kpotoh Date: Mon, 31 Mar 2025 17:12:37 +0200 Subject: [PATCH 12/14] update python-package.yml --- .github/workflows/python-package.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 8000090..c51e64c 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -1,5 +1,5 @@ # This workflow will install Python dependencies, run tests and lint with a variety of Python versions -# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python name: Python package @@ -16,10 +16,10 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.8", "3.9", "3.10"] + python-version: ["3.9", "3.10", "3.11"] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v3 with: From 3416c3af38c4ddd6d07cc03e975facc57bd90f3a Mon Sep 17 00:00:00 2001 From: kpotoh Date: Mon, 31 Mar 2025 17:16:36 +0200 Subject: [PATCH 13/14] tool.pytest.ini_options added --- pyproject.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 196cb89..36205e4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -54,3 +54,6 @@ script-files = ["scripts/multifasta_coding.py", "scripts/alignment2iqtree_states [tool.setuptools.dynamic] version = {attr = "pymutspec.__version__"} # any module attribute compatible with ast.literal_eval + +[tool.pytest.ini_options] +pythonpath = ["pymutspec"] From 25124005ef1b8976d26cf2c4aa122a2dc83bb4de Mon Sep 17 00:00:00 2001 From: kpotoh Date: Mon, 31 Mar 2025 17:18:27 +0200 Subject: [PATCH 14/14] kek --- .github/workflows/python-package.yml | 2 +- pyproject.toml | 3 --- tests/__init__.py | 0 3 files changed, 1 insertion(+), 4 deletions(-) create mode 100644 tests/__init__.py diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index c51e64c..2d5b631 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -16,7 +16,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.9", "3.10", "3.11"] + python-version: ["3.9"] steps: - uses: actions/checkout@v4 diff --git a/pyproject.toml b/pyproject.toml index 36205e4..196cb89 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -54,6 +54,3 @@ script-files = ["scripts/multifasta_coding.py", "scripts/alignment2iqtree_states [tool.setuptools.dynamic] version = {attr = "pymutspec.__version__"} # any module attribute compatible with ast.literal_eval - -[tool.pytest.ini_options] -pythonpath = ["pymutspec"] diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29