diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index bc71bb9..2d5b631 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -1,5 +1,5 @@ # This workflow will install Python dependencies, run tests and lint with a variety of Python versions -# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python name: Python package @@ -16,10 +16,10 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.8", "3.9", "3.10"] + python-version: ["3.9"] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v3 with: @@ -27,8 +27,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - python -m pip install flake8 pytest - python setup.py install + pip install ".[dev]" - name: Lint with flake8 run: | # stop the build if there are Python syntax errors or undefined names diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..9bb100f --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,64 @@ +# CHANGELOG + + + +## 0.0.13 (2025-03-31) + +Features: + +- Added new functions for parallel mutation extraction +- The outgroup is now an optional node in the tree; the ingroup is simply the sister node of the outgroup, and the outgroup is the node branching directly off the root + +Fixes: + +- Repaired packaging; pyproject.toml now works. setup.py is deprecated, but a simplified version is kept. +- Fixed circular imports +- All scripts now have a main function + +## 0.0.11 (2024-10-24) + +Features: + +- Edges sampling by @kpotoh in https://github.com/mitoclub/PyMutSpec/pull/9 +- Added barplots with error bars +- Added a citation to the README + +Fixes: + +- Fixed a bug that allowed short sequences to be used in expected frequency calculations + +**Full Changelog**: https://github.com/mitoclub/PyMutSpec/compare/0.0.10...0.0.11 + +## 0.0.10 (2024-04-14) + +Features: + +- Added the ability to collect non-synonymous mutational spectra +- Added a couple of arguments to the calculate_mutspec function: scale and drop_underrepresented (the latter drops mutation types below 0.9 by default) + +Fixes: + +- Dropped Times New Roman from the default font for spectrum barplots + +- Deleted legacy log/doc files from the repo +- Added a few tests for the expected-mutation collection functions +- Fixed scripts/calculate_mutspec.py: previously it failed when there were no mutations on branches; also reduced mutation-count thresholds + +## 0.0.8 (2023-11-13) + +Features: + +- Added small tests for plot functions +- Added a few new functions from analysis notebooks used for tree spectra analysis + +Fixes: + +- Fixed the uncertainty coefficient (phylocoef) calculation: it is now based only on the ingroup and the geometric mean of root-to-leaf distances +- Added a general test for the collect_mutations.py main class diff --git a/LICENSE b/LICENSE index 335ea9d..b6f22dd 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2018 The Python Packaging Authority +Copyright (c) 2025 mitofungen.com Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/pymutspec/__init__.py b/pymutspec/__init__.py index 7d0b805..31898f5 100644 --- a/pymutspec/__init__.py +++ b/pymutspec/__init__.py @@ -1,4 +1 @@ -from pymutspec import annotation, draw, io, constants, utils
- -# Define the pymutspec version -__version__ = "0.0.12" \ No newline at end of file +__version__ = "0.0.13" \ No newline at end of file diff --git a/pymutspec/annotation/__init__.py b/pymutspec/annotation/__init__.py index eb50453..3a9c792 100644 --- a/pymutspec/annotation/__init__.py +++ b/pymutspec/annotation/__init__.py @@ -1,5 +1,5 @@ from .auxiliary import lbl2lbl_id, lbl_id2lbl, rev_comp, transcriptor -from .tree import iter_tree_edges, node_parent, calc_phylocoefs, get_ingroup_root, get_tree_len, get_tree_outgrp_name +from .tree import iter_tree_edges, node_parent, calc_phylocoefs, get_ingroup_root, get_tree_len from .spectra import ( calculate_mutspec, collapse_mutspec, calc_edgewise_spectra, complete_sbs192_columns, jackknife_spectra_sampling, collapse_sbs192, diff --git a/pymutspec/annotation/mut.py b/pymutspec/annotation/mut.py index 35b1be2..d6d7479 100644 --- a/pymutspec/annotation/mut.py +++ b/pymutspec/annotation/mut.py @@ -1,13 +1,22 @@ +import os +import sys from collections import defaultdict from typing import Set, Union, Dict, Iterable -import sys +import multiprocessing as mp +from warnings import warn import numpy as np import pandas as pd from Bio.Data import CodonTable from Bio.Data.CodonTable import NCBICodonTableDNA +from ete3 import PhyloTree -from pymutspec.constants import * +from ..constants import * +from ..utils import basic_logger +from ..io import GenomeStatesTotal +from .tree import iter_tree_edges, calc_phylocoefs + +nucl_order5 = ['A', 'C', 'G', 'T', '-'] class CodonAnnotation: @@ -749,6 +758,337 @@ def read_start_stop_codons(codontable: Union[NCBICodonTableDNA, int]): return set(codontable.start_codons), set(codontable.stop_codons) +class Branch: + def __init__(self, index: int, ref_name: str, alt_name: str, + ref_genome: pd.DataFrame, alt_genome: pd.DataFrame, + phylocoef: float, proba_cutoff: float, genome_size: int, + site2index: Dict[int, int], logger=None) -> None: + self.index = index + self.ref_name = ref_name + self.alt_name = alt_name + self.ref_genome = ref_genome + self.alt_genome = alt_genome + self.phylocoef = phylocoef + self.proba_cutoff = proba_cutoff + self.genome_size = genome_size + self.site2index = site2index + self.logger = logger or basic_logger() + + def is_substitution(self, cxt1, cxt2): + """ + check that contexts contain substitution. 
+ + Example inputs and outputs: + - (CAT,CGG) -> False (T != G) + - (CAT,CAG) -> False (central nuclaotide are constant) + - (ACTAC,ACAAC) -> True + - (ACTAC,AC-AC) -> False (indel != substitution) + """ + ok = True + center_pos = len(cxt1) // 2 + if cxt1[center_pos] == cxt2[center_pos]: + ok = False + elif cxt1[center_pos] == '-' or cxt2[center_pos] == '-': + ok = False + elif cxt1[center_pos-1] != cxt2[center_pos-1] or \ + cxt1[center_pos+1] != cxt2[center_pos+1]: + self.logger.warning(f"Neighbouring positions of substitution are different: {cxt1}, {cxt2}") + ok = False + return ok + + def process_branch(self): + edge_mutations = self.extract_mutations_proba( + self.ref_genome, self.alt_genome, self.phylocoef, self.proba_cutoff) + + edge_mutations["RefNode"] = self.ref_name + edge_mutations["AltNode"] = self.alt_name + + mut_num = edge_mutations['ProbaFull'].sum() if 'ProbaFull' in edge_mutations.columns else len(edge_mutations) + if mut_num == 0: + self.logger.info(f"No mutations from branch {self.index:03} ({self.ref_name} - {self.alt_name})") + elif mut_num > self.genome_size * 0.1: + self.logger.warning(f"Observed too many mutations ({mut_num} > {self.genome_size} * 0.1) " + f"for branch ({self.ref_name} - {self.alt_name})") + else: + self.logger.info(f"{mut_num:.2f} mutations from branch {self.index:03} ({self.ref_name} - {self.alt_name})") + + return edge_mutations + + def extract_mutations_proba(self, g1: pd.DataFrame, g2: pd.DataFrame, + phylocoef: float, mut_proba_cutoff: float): + """ + Extract alterations of g2 comparing to g1 + + conditions: + - in one codon could be only sbs + - in the context of one mutation couldn't be other sbs + - indels are not sbs and codons and contexts with sbs are not considered + + Arguments + - g1 - reference sequence (parent node) + - g2 - alternative sequence (child node) + + return: + - mut - dataframe of mutations + """ + n, m = len(g1), len(g2) + assert n == m, f"genomes lengths are not equal: {n} != {m}" + + mutations = [] + g1_most_prob = g1.values.argmax(1) + g2_most_prob = g2.values.argmax(1) + # pass initial and last nucleotides without context + for site in g1.index[2:-2]: + site_mutations = self.process_site( + g1, g2, site, g1_most_prob, g2_most_prob, mut_proba_cutoff, phylocoef) + mutations.extend(site_mutations) + mut_df = pd.DataFrame(mutations) + return mut_df + + def process_site(self, g1: pd.DataFrame, g2: pd.DataFrame, site: int, + g1_most_prob: np.ndarray, g2_most_prob: np.ndarray, + mut_proba_cutoff: float, phylocoef: float): + pos_in_states = self.site2index[site] + states1, states2 = self.prepare_local_states( + g1.values, g2.values, pos_in_states, g1_most_prob, g2_most_prob) + if states1 is None: + return [] + + mutations = [] + for cxt1, proba1 in self.sample_context5(2, states1, mut_proba_cutoff): + for cxt2, proba2 in self.sample_context5(2, states2, mut_proba_cutoff): + if not self.is_substitution(cxt1, cxt2): + continue + + p = proba1 * proba2 + if p < mut_proba_cutoff: + continue + + p_adj = p * phylocoef + + cxt1 = ''.join(cxt1) + cxt2 = ''.join(cxt2) + muttype = f'{cxt1[0:2]}[{cxt1[2]}>{cxt2[2]}]{cxt1[3:]}' + sbs = { + "Mut": muttype, + "Site": site, + "ProbaRef": proba1, + "ProbaMut": p, + "ProbaFull": p_adj, + } + mutations.append(sbs) + return mutations + + def prepare_local_states_simple(self, g1: np.ndarray, g2: np.ndarray, pos: int): + return g1[pos-2: pos+3], g2[pos-2: pos+3] + + def prepare_local_states(self, g1: np.ndarray, g2: np.ndarray, pos: int, + g1_most_prob: np.ndarray, g2_most_prob: np.ndarray): + 
# prepare local states: exclude gaps from both seqs + ## first, search upstream + up_states1, up_states2 = [g1[pos]], [g2[pos]] + down_states1, down_states2 = [], [] + for i in range(1, 10): # don't collect gaps at all + cur_state1, cur_state2 = g1[pos-i], g2[pos-i] + state_id1, state_id2 = g1_most_prob[pos-i], g2_most_prob[pos-i] + if not (state_id1 == state_id2 == 4): # both states are not equal to gap + up_states1.append(cur_state1) + up_states2.append(cur_state2) + if len(up_states1) == 3 or pos-i-1 <= 0: + break + + ## second, search downstream + for i in range(1, 10): + cur_state1, cur_state2 = g1[pos+i], g2[pos+i] + state_id1, state_id2 = g1_most_prob[pos+i], g2_most_prob[pos+i] + if not (state_id1 == state_id2 == 4): # not equal to gap + down_states1.append(cur_state1) + down_states2.append(cur_state2) + if len(down_states1) == 2 or pos+i+1 >= len(g1): + break + + if any([len(up_states1) != 3, + len(up_states2) != 3, + len(down_states1) != 2 , + len(down_states2) != 2]): + self.logger.warning(f'Cannot collect local contexts for position {pos}') + return None, None + + up_states1.reverse() + up_states2.reverse() + up_states1.extend(down_states1) + up_states2.extend(down_states2) + states1 = np.array(up_states1) + states2 = np.array(up_states2) + + return states1, states2 + + def sample_context5(self, pos: int, g: np.ndarray, cutoff=0.25): + probas = g[pos-2] * g[pos-1][:, None] * g[pos][:, None, None] * \ + g[pos+1][:, None, None, None] * g[pos+2][:, None, None, None, None] + + indexes = np.where(probas > cutoff) + for idx in range(len(indexes[0])): + cxt_ids = [indexes[i][idx] for i in list(range(len(indexes)))[::-1]] + cxt = tuple(nucl_order5[x] for x in cxt_ids) + pr = probas[tuple(cxt_ids[::-1])] + yield cxt, pr + + def sample_context7(self, pos: int, g: np.ndarray, cutoff=0.25): + probas = g[pos-3] * g[pos-2][:, None] * g[pos-1][:, None, None] * \ + g[pos][:, None, None, None] * g[pos+1][:, None, None, None, None] * \ + g[pos+2][:, None, None, None, None, None] * \ + g[pos+3][:, None, None, None, None, None, None] + + indexes = np.where(probas > cutoff) + for idx in range(len(indexes[0])): + cxt_ids = [indexes[i][idx] for i in list(range(len(indexes)))[::-1]] + cxt = tuple(nucl_order5[x] for x in cxt_ids) + pr = probas[tuple(cxt_ids[::-1])] + yield cxt, pr + + +class MutSpecExtractor(CodonAnnotation): + def __init__( + self, path_to_tree, outdir, gcode=2, + use_proba=False, proba_cutoff=0.25, use_phylocoef=False, + genomes: GenomeStatesTotal = None, num_processes=8, + logger=None, + ): + if not os.path.exists(path_to_tree): + raise ValueError(f"Path to tree doesn't exist: '{path_to_tree}'") + + CodonAnnotation.__init__(self, gencode=gcode) + + self.num_processes = num_processes + self.genomes = genomes + self.gcode = gcode + self.use_proba = use_proba + self.proba_cutoff = proba_cutoff + self.use_phylocoef = use_phylocoef if use_proba else False + self.outdir = outdir + self.logger = logger or basic_logger() + + self.logger.info(f"Using gencode {gcode}") + self.logger.info(f"Use probabilities of genomic states: {use_proba}") + self.logger.info(f"Use phylogenetic uncertainty coefficient: {use_phylocoef}") + self.logger.info(f"Minimal probability for mutations to use: {proba_cutoff}") + + self.fp_format = np.float32 + self.tree = PhyloTree(path_to_tree, format=1) + self.logger.info( + f"Tree loaded, number of leaf nodes: {len(self.tree)}, " + f"total number of nodes: {len(self.tree.get_cached_content())}, " + ) + rnd_genome = self.genomes.get_random_genome() + 
self.logger.info(f"Number of MSA sites: {len(rnd_genome)}") + + # mapping of MSA sites to truncated states ids + self.site2index = dict(zip(rnd_genome.index, range(len(rnd_genome)))) + + def open_handles(self, outdir): + self.handle = dict() + self.handle["mut"] = open(os.path.join(outdir, "mutations.tsv"), "w") + self.logger.debug("Handles opened") + + def close_handles(self): + for file in self.handle.values(): + file.close() + self.logger.debug("Handles closed") + + def extract_mutspec_from_tree(self): + t = self.num_processes + if not isinstance(t, int) or t < 1: + raise ValueError('num_processes must be positive integer') + + if t == 1: + self._derive_mutspec() + elif t > 1: + self._derive_mutspec_parallel() + + def _derive_mutspec(self): + self.logger.info("Start mutation extraction from tree") + self.open_handles(self.outdir) + add_header = defaultdict(lambda: True) + total_mut_num = 0 + + for edge_data in self.iter_branches(): + edge_mutations = self.process_branch(*edge_data[1:]) + mut_num = edge_mutations['ProbaFull'].sum() if self.use_proba and \ + 'ProbaFull' in edge_mutations.columns else len(edge_mutations) + total_mut_num += mut_num + + # dump current edge mutations + self.dump_table(edge_mutations, self.handle["mut"], add_header["mut"]) + add_header["mut"] = False + + self.close_handles() + self.logger.info(f"Processed {edge_data[1]} tree edges") + self.logger.info(f"Observed {total_mut_num:.3f} substitutions") + self.logger.info("Extraction of mutations from phylogenetic tree completed succesfully") + + def _derive_mutspec_parallel(self): + self.logger.info("Start parallel mutation extraction from tree") + + with mp.Pool(processes=self.num_processes) as pool: + genome_mutations_lst = pool.map(Branch.process_branch, self.iter_branches()) + + genome_mutations = pd.concat(genome_mutations_lst) + + self.open_handles(self.outdir) + self.dump_table(genome_mutations, self.handle["mut"], True) + self.close_handles() + + total_mut_num = genome_mutations['ProbaFull'].sum() if self.use_proba and \ + 'ProbaFull' in genome_mutations.columns else len(genome_mutations) + self.logger.info(f"Observed {total_mut_num:.3f} substitutions") + self.logger.info("Extraction of mutations from phylogenetic tree completed succesfully") + + def iter_branches(self): + """yield (self, edge_id, ref_node_name, alt_node_name, ref_genome, alt_genome, phylocoef)""" + # calculate phylogenetic uncertainty correction + if self.use_phylocoef: + phylocoefs = calc_phylocoefs(self.tree) + + for ei, (ref_node, alt_node) in enumerate(iter_tree_edges(self.tree), 1): + if alt_node.name not in self.genomes.nodes: + self.logger.warning(f"Skip edge '{ref_node.name}'-'{alt_node.name}' due to absence of '{alt_node.name}' genome") + continue + if ref_node.name not in self.genomes.nodes: + self.logger.warning(f"Skip edge '{ref_node.name}'-'{alt_node.name}' due to absence of '{ref_node.name}' genome") + continue + + # calculate phylogenetic uncertainty correction: + # coef for ref edge * coef for alt edge + if self.use_phylocoef: + phylocoef = self.fp_format(phylocoefs[ref_node.name] * phylocoefs[alt_node.name]) + else: + phylocoef = self.fp_format(1) + + # get genomes from storage + ref_genome = self.genomes.get_genome(ref_node.name) + alt_genome = self.genomes.get_genome(alt_node.name) + self.logger.debug(f'Genomes retrieved for branch {ei}') + + yield Branch(ei, ref_node.name, alt_node.name, ref_genome, alt_genome, + phylocoef, self.proba_cutoff, self.genomes.genome_size, self.site2index) + + def get_edges_data(self): + 
return list(self.iter_branches()) + + @staticmethod + def dump_table(df: pd.DataFrame, handle, header=False): + if header: + handle.write("\t".join(df.columns) + "\n") + handle.write(df.to_csv(sep="\t", index=None, header=None, float_format='%g')) + + def turn_to_MAP(self, states: np.ndarray): + if isinstance(states, pd.DataFrame): + warn('pd.DataFrame used as input instead of np.ndarray', ResourceWarning) + states = states.values + return ''.join([nucl_order5[x] for x in states.argmax(1)]) + + def mutations_summary(mutations: pd.DataFrame, gene_col=None, proba_col=None, gene_name_mapper: dict = None): """ form mutations annotation: how many synonymous, fourfold or stop loss/gain observed in the table diff --git a/pymutspec/annotation/spectra.py b/pymutspec/annotation/spectra.py index 827613f..5d2e55c 100644 --- a/pymutspec/annotation/spectra.py +++ b/pymutspec/annotation/spectra.py @@ -4,7 +4,7 @@ import numpy as np import pandas as pd -from pymutspec.constants import ( +from ..constants import ( possible_sbs12_set, possible_sbs192_set, possible_sbs192, possible_sbs12 ) diff --git a/pymutspec/annotation/tree.py b/pymutspec/annotation/tree.py index 813927d..556958b 100644 --- a/pymutspec/annotation/tree.py +++ b/pymutspec/annotation/tree.py @@ -58,40 +58,25 @@ def get_tree_len(tree: PhyloTree, mode='geom_mean'): return md -def get_ingroup_root(tree: PhyloTree, outgrp='OUTGRP'): - found_outgrp, found_ingroup = False, False - ingroup = None - for n in tree.children: - if n.name == outgrp: - found_outgrp = True +def get_ingroup_root(tree: PhyloTree) -> PhyloTree: + assert len(tree.children) == 2, 'Tree must be binary' + found_outgroup = False + for node in tree.children: + if node.is_leaf(): + found_outgroup = True else: - found_ingroup = True - ingroup = n - if found_ingroup and found_outgrp: - return ingroup + ingrp = node + + if found_outgroup: + return ingrp else: - raise Exception('Cannot extract ingroup root') + return tree -def calc_phylocoefs(tree: PhyloTree, outgrp='OUTGRP'): - tree_len = get_tree_len(get_ingroup_root(tree, outgrp ), 'geom_mean') - phylocoefs = {} +def calc_phylocoefs(tree: PhyloTree): + tree_len = get_tree_len(get_ingroup_root(tree), 'geom_mean') + phylocoefs = {tree.name: 1 - min(0.999, tree.get_closest_leaf()[1] / tree_len)} for node in tree.iter_descendants(): - d = node.get_closest_leaf()[1] - phylocoefs[node.name] = 1 - min(0.9999, d / tree_len) + _closest, d = node.get_closest_leaf() + phylocoefs[node.name] = 1 - min(0.99999, d / tree_len) return phylocoefs - - -def get_tree_outgrp_name(tree: PhyloTree): - c1, c2 = tree.children - done = False - if len(c1.children) == 0: - done = True - outgrp = c1 - if not done and len(c2.children) == 0: - done = True - outgrp = c2 - if not done: - raise Exception("Cannot get outgroup from tree") - - return outgrp.name diff --git a/pymutspec/draw/sbs_orders.py b/pymutspec/draw/sbs_orders.py index 39ec257..c5ed8bc 100644 --- a/pymutspec/draw/sbs_orders.py +++ b/pymutspec/draw/sbs_orders.py @@ -1,7 +1,7 @@ import pandas as pd -from pymutspec.constants import possible_sbs192 -from pymutspec.annotation import rev_comp, transcriptor +from ..constants import possible_sbs192 +from ..annotation import rev_comp, transcriptor kk_lbls = "A>C A>G A>T C>T G>C G>T".split() cosmic_lbls = "C>A C>G C>T T>A T>C T>G".split() diff --git a/pymutspec/draw/spectra.py b/pymutspec/draw/spectra.py index 772f07d..cd4bc43 100644 --- a/pymutspec/draw/spectra.py +++ b/pymutspec/draw/spectra.py @@ -9,7 +9,7 @@ import matplotlib.pyplot as plt import seaborn 
as sns -from pymutspec.constants import possible_sbs192, possible_sbs96 +from ..constants import possible_sbs192, possible_sbs96 from .sbs_orders import ordered_sbs192_kk, ordered_sbs192_kp color_mapping6 = { diff --git a/pymutspec/io/__init__.py b/pymutspec/io/__init__.py index 6c4a9d1..8f9e8c9 100644 --- a/pymutspec/io/__init__.py +++ b/pymutspec/io/__init__.py @@ -1,6 +1,3 @@ -from .auxiliary import get_aln_files, load_scheme, read_genbank_ref -from .states import GenesStates, read_rates - -__all__ = [ - "get_aln_files", "load_scheme", "read_genbank_ref", "GenesStates", "read_rates", -] +from .auxiliary import get_aln_files, load_scheme +from .gb import read_genbank_ref +from .states import GenesStates, GenomeStates, GenomeStatesTotal, read_rates, read_alignment diff --git a/pymutspec/io/auxiliary.py b/pymutspec/io/auxiliary.py index 20dc86a..2818112 100644 --- a/pymutspec/io/auxiliary.py +++ b/pymutspec/io/auxiliary.py @@ -1,11 +1,6 @@ import os import re -from typing import Dict, Union - -import numpy as np -import pandas as pd -from Bio.SeqRecord import SeqRecord -from Bio import SeqIO +from typing import Dict def load_scheme(path: str) -> Dict[str, str]: @@ -28,64 +23,3 @@ def get_aln_files(path: str): [os.path.join(path, x) for x in raw_files if x.endswith(".fna")] ) return files - - -def read_genbank_ref(gb: Union[str, SeqRecord]): - if isinstance(gb, str): - genome = next(SeqIO.parse(gb, "genbank")) - elif isinstance(gb, SeqRecord): - genome = gb - else: - raise NotImplementedError - - ftypes = {"CDS", "rRNA", "tRNA"} - full_nucls = set("ACGT") - data = [] - df: pd.DataFrame = None - gene_qualifier = None - for ftr in genome.features: - if ftr.type == "source": - source = ftr.extract(genome) - seq = str(source.seq) - for pos, nuc in enumerate(seq): - context = seq[pos - 1: pos + 2] - if len(context) < 3 or len(set(context).difference(full_nucls)) != 0: - context = None - if nuc not in full_nucls: - nuc = context = None - data.append({"Pos": pos + 1, "Nuc": nuc, "Context": context}) - df = pd.DataFrame(data) - df["Strand"] = 0 - continue - elif gene_qualifier is None and ftr.type in ftypes: - for qualifier in ["gene", "product", "protein_id"]: - if qualifier in ftr.qualifiers: - gene_qualifier = qualifier - break - if gene_qualifier is None: - raise RuntimeError(f"Cannot find any expected qualifier of feature: {ftr}; with following qualifiers: {ftr.qualifiers}") - - for pos in list(ftr.location): - df.at[pos, "Type"] = ftr.type - df.at[pos, "Strand"] = ftr.strand - if ftr.type in ftypes: - df.at[pos, qualifier] = ftr.qualifiers[qualifier][0] - - # add codon features - df["PosInGene"] = -1 - df["PosInCodon"] = -1 - for gene_name in df[(df.Type == "CDS") & (df.Strand == 1)][gene_qualifier].unique(): - gdf = df[df[gene_qualifier] == gene_name] - seq = gdf.Nuc.values - for pos_in_gene, pos in enumerate(gdf.index): - pic = pos_in_gene % 3 - cdn = seq[pos_in_gene - pic: pos_in_gene - pic + 3] - cdn = "".join(cdn) if len(set(cdn).difference(full_nucls)) == 0 else None - df.at[pos, "Codon"] = cdn - df.at[pos, "PosInGene"] = pos_in_gene + 1 - df.at[pos, "PosInCodon"] = pic + 1 - - df["Strand"] = df["Strand"].astype(np.int8) - df["PosInCodon"] = df["PosInCodon"].astype(np.int8) - df["PosInGene"] = df["PosInGene"].astype(np.int32) - return df diff --git a/pymutspec/io/gb.py b/pymutspec/io/gb.py new file mode 100644 index 0000000..d576831 --- /dev/null +++ b/pymutspec/io/gb.py @@ -0,0 +1,67 @@ +from typing import Union + +import numpy as np +import pandas as pd +from Bio.SeqRecord import 
SeqRecord +from Bio import SeqIO + + +def read_genbank_ref(gb: Union[str, SeqRecord]): + if isinstance(gb, str): + genome = next(SeqIO.parse(gb, "genbank")) + elif isinstance(gb, SeqRecord): + genome = gb + else: + raise NotImplementedError + + ftypes = {"CDS", "rRNA", "tRNA"} + full_nucls = set("ACGT") + data = [] + df: pd.DataFrame = None + gene_qualifier = None + for ftr in genome.features: + if ftr.type == "source": + source = ftr.extract(genome) + seq = str(source.seq) + for pos, nuc in enumerate(seq): + context = seq[pos - 1: pos + 2] + if len(context) < 3 or len(set(context).difference(full_nucls)) != 0: + context = None + if nuc not in full_nucls: + nuc = context = None + data.append({"Pos": pos + 1, "Nuc": nuc, "Context": context}) + df = pd.DataFrame(data) + df["Strand"] = 0 + continue + elif gene_qualifier is None and ftr.type in ftypes: + for qualifier in ["gene", "product", "protein_id"]: + if qualifier in ftr.qualifiers: + gene_qualifier = qualifier + break + if gene_qualifier is None: + raise RuntimeError(f"Cannot find any expected qualifier of feature: {ftr}; with following qualifiers: {ftr.qualifiers}") + + for pos in list(ftr.location): + df.at[pos, "Type"] = ftr.type + df.at[pos, "Strand"] = ftr.strand + if ftr.type in ftypes: + df.at[pos, qualifier] = ftr.qualifiers[qualifier][0] + + # add codon features + df["PosInGene"] = -1 + df["PosInCodon"] = -1 + for gene_name in df[(df.Type == "CDS") & (df.Strand == 1)][gene_qualifier].unique(): + gdf = df[df[gene_qualifier] == gene_name] + seq = gdf.Nuc.values + for pos_in_gene, pos in enumerate(gdf.index): + pic = pos_in_gene % 3 + cdn = seq[pos_in_gene - pic: pos_in_gene - pic + 3] + cdn = "".join(cdn) if len(set(cdn).difference(full_nucls)) == 0 else None + df.at[pos, "Codon"] = cdn + df.at[pos, "PosInGene"] = pos_in_gene + 1 + df.at[pos, "PosInCodon"] = pic + 1 + + df["Strand"] = df["Strand"].astype(np.int8) + df["PosInCodon"] = df["PosInCodon"].astype(np.int8) + df["PosInGene"] = df["PosInGene"].astype(np.int32) + return df diff --git a/pymutspec/io/states.py b/pymutspec/io/states.py index 26f609a..9eb7417 100644 --- a/pymutspec/io/states.py +++ b/pymutspec/io/states.py @@ -1,15 +1,133 @@ import os import sys +import random import sqlite3 from collections import defaultdict from typing import List -from unicodedata import category +import tqdm import numpy as np import pandas as pd from Bio import SeqIO -import tqdm +from ..utils import basic_logger + + +class GenomeStates: + def __init__(self, path_to_states, path_to_gappy_sites=None, format='tsv', logger=None): + for path in list([path_to_states, path_to_gappy_sites]): + if not os.path.exists(path): + raise ValueError(f"Path to states doesn't exist: '{path}'") + + self.logger = logger or basic_logger() + + self.logger.info(f'Reading states "{path_to_states}"') + states = self.read_states(path, format) + + nodes_arr = states['Node'].unique() + self.node2id = dict(zip(nodes_arr, range(len(nodes_arr)))) + self.nodes = set(nodes_arr) + self.logger.info(f'Loaded {len(self.nodes)} genomes') + + states['NodeId'] = states['Node'].map(self.node2id) + if len(self.nodes) < 2 ** 31 - 1: + states['NodeId'] = states['NodeId'].astype(np.int32) + + states.drop('Node', axis=1, inplace=True) + self.logger.debug('Node column deleted') + + states.set_index('NodeId', inplace=True) + self.logger.debug('States reindexed') + + self.genome_size = len(states.loc[0]) + self.logger.info(f'Total alignement size = {self.genome_size}') + + self.nongappy_sites = 
self.read_nongappy_sites(path_to_gappy_sites) + self.states = states + self.logger.info('States initialiaztion is done') + + def __contains__(self, item: str): + return item in self.nodes + + def __getitem__(self, item: str): + return self.get_genome(item) + + def read_states(self, path, fmt='iqtree'): + fpt = np.float32 + dtype = { + "p_A": fpt, "p_C": fpt, "p_G": fpt, "p_T": fpt, + "Site": np.int32, "Node": str, + } + if fmt == 'iqtree': + states = pd.read_csv(path, sep='\t', comment='#', dtype=dtype) + if fmt == 'csv': + states = pd.read_csv(path, dtype=dtype) + elif fmt == 'tsv': + states = pd.read_csv(path, sep='\t', dtype=dtype) + elif fmt == 'parquet': + states = pd.read_parquet(path) + return states + + def read_nongappy_sites(self, path): + '''path must contain list of 1-based sites of alignment that must be excluded from loaded alignment + + return array of sorted non-gappy sites in 1-based mode + ''' + if path is None: + nongappy_sites = np.arange(1, self.genome_size + 1) + self.logger.debug('Use all alignment sites') + else: + self.logger.debug(f'Reading gappy sites "{path}"') + gappy_sites = set(pd.read_csv(path, header=None).values.flatten().tolist()) + nongappy_sites = np.array([i for i in range(1, self.genome_size + 1) \ + if i not in gappy_sites]) + self.logger.debug(f'Number of used (non-gappy) sites = {len(nongappy_sites)}') + return nongappy_sites + + def get_genome(self, node: str) -> pd.DataFrame: + if node not in self.nodes: + raise ValueError(f'Input node "{node}" have no genome') + node_id = self.node2id[node] + genome = self.states.loc[node_id].set_index('Site').loc[self.nongappy_sites] + return genome + + +class GenomeStatesTotal(): + def __init__(self, path_to_states1: str, path_to_states2: str, path_to_gappy_sites: str): + gs1 = GenomeStates(path_to_states1, path_to_gappy_sites) + gs2 = GenomeStates(path_to_states2, path_to_gappy_sites) + + assert gs1.genome_size == gs2.genome_size + self.genome_size = gs1.genome_size + + self.nodes = gs1.nodes.union(gs2.nodes) + self.gs1 = gs1 + self.gs2 = gs2 + + def __contains__(self, item): + return item in self.nodes + + def __getitem__(self, item): + if item in self.gs1: + genome = self.gs1[item] + elif item in self.gs2: + genome = self.gs2[item] + else: + raise ValueError('node does not exist') + return genome + + def get_genome(self, node: str) -> pd.DataFrame: + if node in self.gs1: + genome = self.gs1[node] + elif node in self.gs2: + genome = self.gs2[node] + else: + raise ValueError('node does not exist') + return genome + + def get_random_genome(self) -> pd.DataFrame: + node = random.choice(list(self.nodes)) + return self[node] class GenesStates: @@ -304,3 +422,61 @@ def read_rates(path: str): df = pd.read_csv(path, sep="\t", comment="#").sort_values("Site") category = df.Cat.values return category + + +def read_alignment(files: list, fmt="fasta"): + """ + Read files alignments and prepare states table + + Arguments + --------- + files: list or iterable of string + files with alignments; if some records not presented in all files table will be filled by '-' (gaps) + fmt: string + format of files aligment; supported any format from biopython (fasta, phylip, etc.) 
+ + Return + --------- + states: pd.DataFrame + """ + ngenes = len(files) + columns = "Node Part Site State p_A p_C p_G p_T".split() + nucls = "ACGT" + aln_lens = dict() + files = set(files) + history = defaultdict(list) + data = [] + for filepath in files: + part = "1" if ngenes == 1 else os.path.basename(filepath).replace(".fna", "") + alignment = SeqIO.parse(filepath, fmt) + for rec in alignment: + node = rec.name + history[node].append(part) + seq = str(rec.seq) + for site, state in enumerate(seq, 1): + site_data = [node, part, site, state] + for nucl in nucls: + p = int(nucl == state) + site_data.append(p) + data.append(site_data) + aln_lens[part] = len(seq) + + if ngenes > 1: + # get max set of parts. Only for multiple files + for node, parts in history.items(): + if len(parts) == ngenes: + full_parts = parts.copy() + break + + # fill missing genes by '-'. Only for multiple files + for node, parts in history.items(): + if len(parts) != ngenes: + unseen_parts = set(full_parts).difference(parts) + for unp in unseen_parts: + print(f"Gap filling for node {node}, part {unp}...", file=sys.stderr) + for site in range(1, aln_lens[unp] + 1): + site_data = [node, unp, site, "-", 0, 0, 0, 0] + data.append(site_data) + + df = pd.DataFrame(data, columns=columns) + return df \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 7c93600..196cb89 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,14 +4,14 @@ build-backend = "setuptools.build_meta" [project] name = "PyMutSpec" -version = "0.0.12" +dynamic = ["version"] description = "Mutational spectra analysis package" authors = [ { name="kpotoh", email="axepon@mail.ru" }, ] -license = { file = "LICENSE" } +license = {file = "LICENSE"} readme = {file = "README.md", content-type = "text/markdown"} -requires-python = ">=3.8" +keywords = ["Bioinformatics", "Genetics", "Mutational spectra"] classifiers = [ "Operating System :: OS Independent", "Development Status :: 3 - Alpha", @@ -24,18 +24,33 @@ classifiers = [ "Topic :: Software Development :: Libraries :: Python Modules", "License :: OSI Approved :: MIT License", ] +requires-python = ">=3.8" dependencies = [ - "numpy~=1.22.2", - "pandas~=1.4.0", + "numpy<2.0", + "pandas<2.0", "seaborn==0.12.2", - "biopython==1.79", - "ete3==3.1.2", - "tqdm~=4.62.3", - "PyYAML~=6.0", - "pytest~=7.1.2", - "click~=8.1.3", + "biopython>=1.75", + "ete3>=3", + "tqdm", + "PyYAML", + "pytest", + "click", ] +[project.optional-dependencies] +dev = ["pytest", "pytest-cov", "flake8"] + [project.urls] -"Homepage" = "https://github.com/mitoclub/PyMutSpec" +Homepage = "https://github.com/mitoclub/PyMutSpec" +Changelog = "https://github.com/mitoclub/PyMutSpec/blob/master/CHANGELOG.md" "Bug Tracker" = "https://github.com/mitoclub/PyMutSpec/issues" + +[tool.setuptools] +packages = ["pymutspec"] +script-files = ["scripts/multifasta_coding.py", "scripts/alignment2iqtree_states.py", + "scripts/select_records.py", "scripts/iqtree_states_add_part.py", + "scripts/collect_mutations.py", "scripts/calculate_mutspec.py"] + + +[tool.setuptools.dynamic] +version = {attr = "pymutspec.__version__"} # any module attribute compatible with ast.literal_eval diff --git a/requirements.dev.txt b/requirements.dev.txt index 470e307..eabcf45 100644 --- a/requirements.dev.txt +++ b/requirements.dev.txt @@ -1,13 +1,13 @@ -numpy~=1.22.2 -pandas~=1.4.0 +numpy<2.0 +pandas<2.0 matplotlib~=3.5.1 -seaborn~=0.12.2 -biopython~=1.79 -ete3~=3.1.2 -tqdm~=4.62.3 -PyYAML~=6.0 -pytest~=7.1.2 -click~=8.1.3 +seaborn==0.12.2 +biopython>=1.75 
+ete3>=3 +tqdm +PyYAML +pytest +click # scipy~=1.8.1 # statsmodels~=0.13.2 pytest diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index ba98d4d..0000000 --- a/requirements.txt +++ /dev/null @@ -1,12 +0,0 @@ -numpy~=1.22.2 -pandas~=1.4.0 -matplotlib~=3.5.1 -seaborn==0.12.2 -biopython==1.79 -ete3==3.1.2 -tqdm~=4.62.3 -PyYAML~=6.0 -pytest~=7.1.2 -click~=8.1.3 -# scipy~=1.8.1 -# statsmodels~=0.13.2 diff --git a/scripts/collect_mutations.py b/scripts/collect_mutations.py index 631162c..a8b9a27 100644 --- a/scripts/collect_mutations.py +++ b/scripts/collect_mutations.py @@ -17,7 +17,7 @@ from pymutspec.annotation import ( CodonAnnotation, calculate_mutspec, - iter_tree_edges, lbl2lbl_id, calc_phylocoefs, get_tree_outgrp_name, + iter_tree_edges, lbl2lbl_id, calc_phylocoefs, ) from pymutspec.constants import possible_sbs12, possible_sbs192 from pymutspec.io import GenesStates @@ -78,11 +78,9 @@ def __init__( logger.info(f"Types of mutations to collect and process: {self.mut_labels}") self.fp_format = np.float32 self.tree = PhyloTree(path_to_tree, format=1) - self.outgrp_name = get_tree_outgrp_name(self.tree) logger.info( f"Tree loaded, number of leaf nodes: {len(self.tree)}, " - f"total number of nodes: {len(self.tree.get_cached_content())}, " - f"outgroup name: {self.outgrp_name}" + f"total number of nodes: {len(self.tree.get_cached_content())}" ) rnd_genome = self.get_random_genome() logger.info(f"Number of genes: {len(rnd_genome)}, number of sites: {[len(x) for x in rnd_genome.values()]}") @@ -119,7 +117,7 @@ def extract_mutspec_from_tree(self): # calculate phylogenetic uncertainty correction if self.use_phylocoef: - phylocoefs = calc_phylocoefs(self.tree, self.outgrp_name) + phylocoefs = calc_phylocoefs(self.tree) for ei, (ref_node, alt_node) in enumerate(iter_tree_edges(self.tree), 1): if alt_node.name not in self.nodes: diff --git a/scripts/collect_mutations_parallel.py b/scripts/collect_mutations_parallel.py new file mode 100644 index 0000000..2f202c3 --- /dev/null +++ b/scripts/collect_mutations_parallel.py @@ -0,0 +1,521 @@ +#!/usr/bin/env python3 +""" +pic is position in codon + +""" + +import os +import sys +import time +import random +from shutil import rmtree +from typing import Dict +from warnings import warn +import multiprocessing as mp +from datetime import datetime +from collections import defaultdict + +import numpy as np +import pandas as pd +from ete3 import PhyloTree +from pymutspec.annotation import ( + CodonAnnotation, iter_tree_edges, get_tree_len, +) +from pymutspec.utils import load_logger, basic_logger + +logger = basic_logger() + +nucl_order5 = ['A', 'C', 'G', 'T', '-'] + + +def calc_phylocoefs(tree: PhyloTree): + tree_len = get_tree_len(tree, 'geom_mean') + logger.info(f'Tree len = {tree_len:.3f}') + phylocoefs = {tree.name: 1 - min(0.999, tree.get_closest_leaf()[1] / tree_len)} + for node in tree.iter_descendants(): + _closest, d = node.get_closest_leaf() + phylocoefs[node.name] = 1 - min(0.99999, d / tree_len) + logger.info(f'Phylocoefs range: [{min(phylocoefs.values()):.3f}, {max(phylocoefs.values()):.3f}]') + return phylocoefs + + +class GenomeStates: + def __init__(self, path_to_states, path_to_gappy_sites): + for path in list([path_to_states, path_to_gappy_sites]): + if not os.path.exists(path): + raise ValueError(f"Path to states doesn't exist: '{path}'") + + logger.info(f'Reading states "{path_to_states}"') + states = pd.read_parquet(path_to_states) + + nodes_arr = states['Node'].unique() + self.node2id = dict(zip(nodes_arr, 
range(len(nodes_arr)))) + self.nodes = set(nodes_arr) + logger.debug('Node2id prepared') + + logger.info(f'Loaded {len(self.nodes)} genomes') + + states['NodeId'] = states['Node'].map(self.node2id).astype(np.int32) + states.drop('Node', axis=1, inplace=True) + logger.debug('Node column deleted') + states.set_index('NodeId', inplace=True) + logger.debug('States indexed') + + self.genome_size = len(states.loc[0]) + logger.info(f'Total alignement size = {self.genome_size}') + + self.nongappy_sites = self.read_nongappy_sites(path_to_gappy_sites) + self.states = states + logger.info('States initialiaztion is done') + + def __contains__(self, item: str): + return item in self.nodes + + def __getitem__(self, item: str): + return self.get_genome(item) + + def read_nongappy_sites(self, path): + logger.debug(f'Reading gappy sites "{path}"') + gappy_sites = set(pd.read_csv(path, header=None).values.flatten().tolist()) + nongappy_sites = np.array([i for i in range(self.genome_size) \ + if i not in gappy_sites]) + 1 # 1-based + logger.debug(f'Number of used (non-gappy) sites = {len(nongappy_sites)}') + return nongappy_sites + + def get_genome(self, node: str) -> pd.DataFrame: + if node not in self.nodes: + raise ValueError(f'Passed node "{node}" have no genome') + node_id = self.node2id[node] + genome = self.states.loc[node_id].set_index('Site').loc[self.nongappy_sites] + return genome + + +class GenomeStatesTotal(): + def __init__(self, path_to_states1: str, path_to_states2: str, path_to_gappy_sites: str): + gs1 = GenomeStates(path_to_states1, path_to_gappy_sites) + gs2 = GenomeStates(path_to_states2, path_to_gappy_sites) + + assert gs1.genome_size == gs2.genome_size + self.genome_size = gs1.genome_size + + self.nodes = gs1.nodes.union(gs2.nodes) + self.gs1 = gs1 + self.gs2 = gs2 + + def __contains__(self, item): + return item in self.nodes + + def __getitem__(self, item): + if item in self.gs1: + genome = self.gs1[item] + elif item in self.gs2: + genome = self.gs2[item] + else: + raise ValueError('node does not exist') + return genome + + def get_genome(self, node: str) -> pd.DataFrame: + if node in self.gs1: + genome = self.gs1[node] + elif node in self.gs2: + genome = self.gs2[node] + else: + raise ValueError('node does not exist') + return genome + + def get_random_genome(self) -> pd.DataFrame: + node = random.choice(list(self.nodes)) + return self[node] + + +class Branch: + def __init__(self, index: int, ref_name: str, alt_name: str, + ref_genome: pd.DataFrame, alt_genome: pd.DataFrame, + phylocoef: float, proba_cutoff: float, genome_size: int, + site2index: Dict[int, int]) -> None: + self.index = index + self.ref_name = ref_name + self.alt_name = alt_name + self.ref_genome = ref_genome + self.alt_genome = alt_genome + self.phylocoef = phylocoef + self.proba_cutoff = proba_cutoff + self.genome_size = genome_size + self.site2index = site2index + + def is_substitution(self, cxt1, cxt2): + """ + check that contexts contain substitution. 
+ + Example inputs and outputs: + - (CAT,CGG) -> False (T != G) + - (CAT,CAG) -> False (central nuclaotide are constant) + - (ACTAC,ACAAC) -> True + - (ACTAC,AC-AC) -> False (indel != substitution) + """ + ok = True + center_pos = len(cxt1) // 2 + if cxt1[center_pos] == cxt2[center_pos]: + ok = False + elif cxt1[center_pos] == '-' or cxt2[center_pos] == '-': + ok = False + elif cxt1[center_pos-1] != cxt2[center_pos-1] or \ + cxt1[center_pos+1] != cxt2[center_pos+1]: + logger.warning(f"Neighbouring positions of substitution are different: {cxt1}, {cxt2}") + ok = False + return ok + + def process_branch(self): + edge_mutations = self.extract_mutations_proba( + self.ref_genome, self.alt_genome, self.phylocoef, self.proba_cutoff) + + edge_mutations["RefNode"] = self.ref_name + edge_mutations["AltNode"] = self.alt_name + + mut_num = edge_mutations['ProbaFull'].sum() if 'ProbaFull' in edge_mutations.columns else len(edge_mutations) + if mut_num == 0: + logger.info(f"No mutations from branch {self.index:03} ({self.ref_name} - {self.alt_name})") + elif mut_num > self.genome_size * 0.1: + logger.warning(f"Observed too many mutations ({mut_num} > {self.genome_size} * 0.1) " + f"for branch ({self.ref_name} - {self.alt_name})") + else: + logger.info(f"{mut_num:.2f} mutations from branch {self.index:03} ({self.ref_name} - {self.alt_name})") + + return edge_mutations + + # @profile + def extract_mutations_proba(self, g1: pd.DataFrame, g2: pd.DataFrame, + phylocoef: float, mut_proba_cutoff: float): + """ + Extract alterations of g2 comparing to g1 + + conditions: + - in one codon could be only sbs + - in the context of one mutation couldn't be other sbs + - indels are not sbs and codons and contexts with sbs are not considered + + Arguments + - g1 - reference sequence (parent node) + - g2 - alternative sequence (child node) + + return: + - mut - dataframe of mutations + """ + n, m = len(g1), len(g2) + assert n == m, f"genomes lengths are not equal: {n} != {m}" + + mutations = [] + g1_most_prob = g1.values.argmax(1) + g2_most_prob = g2.values.argmax(1) + # pass initial and last nucleotides without context + for site in g1.index[2:-2]: + site_mutations = self.process_site( + g1, g2, site, g1_most_prob, g2_most_prob, mut_proba_cutoff, phylocoef) + mutations.extend(site_mutations) + mut_df = pd.DataFrame(mutations) + return mut_df + + def process_site(self, g1: pd.DataFrame, g2: pd.DataFrame, site: int, + g1_most_prob: np.ndarray, g2_most_prob: np.ndarray, + mut_proba_cutoff: float, phylocoef: float): + pos_in_states = self.site2index[site] + states1, states2 = self.prepare_local_states( + g1.values, g2.values, pos_in_states, g1_most_prob, g2_most_prob) + if states1 is None: + return [] + + mutations = [] + for cxt1, proba1 in self.sample_context5(2, states1, mut_proba_cutoff): + for cxt2, proba2 in self.sample_context5(2, states2, mut_proba_cutoff): + if not self.is_substitution(cxt1, cxt2): + continue + + p = proba1 * proba2 + if p < mut_proba_cutoff: + continue + + p_adj = p * phylocoef + + cxt1 = ''.join(cxt1) + cxt2 = ''.join(cxt2) + muttype = f'{cxt1[0:2]}[{cxt1[2]}>{cxt2[2]}]{cxt1[3:]}' + sbs = { + "Mut": muttype, + "Site": site, + "ProbaRef": proba1, + "ProbaMut": p, + "ProbaFull": p_adj, + } + mutations.append(sbs) + return mutations + + def prepare_local_states_simple(self, g1: np.ndarray, g2: np.ndarray, pos: int): + return g1[pos-2: pos+3], g2[pos-2: pos+3] + + def prepare_local_states(self, g1: np.ndarray, g2: np.ndarray, pos: int, + g1_most_prob: np.ndarray, g2_most_prob: np.ndarray): + # 
prepare local states: exclude gaps from both seqs + ## first, search upstream + up_states1, up_states2 = [g1[pos]], [g2[pos]] + down_states1, down_states2 = [], [] + for i in range(1, 10): # don't collect gaps at all + cur_state1, cur_state2 = g1[pos-i], g2[pos-i] + state_id1, state_id2 = g1_most_prob[pos-i], g2_most_prob[pos-i] + if not (state_id1 == state_id2 == 4): # both states are not equal to gap + up_states1.append(cur_state1) + up_states2.append(cur_state2) + if len(up_states1) == 3 or pos-i-1 <= 0: + break + + ## second, search downstream + for i in range(1, 10): + cur_state1, cur_state2 = g1[pos+i], g2[pos+i] + state_id1, state_id2 = g1_most_prob[pos+i], g2_most_prob[pos+i] + if not (state_id1 == state_id2 == 4): # not equal to gap + down_states1.append(cur_state1) + down_states2.append(cur_state2) + if len(down_states1) == 2 or pos+i+1 >= len(g1): + break + + if any([len(up_states1) != 3, + len(up_states2) != 3, + len(down_states1) != 2 , + len(down_states2) != 2]): + logger.warning(f'Cannot collect local contexts for position {pos}') + return None, None + + up_states1.reverse() + up_states2.reverse() + up_states1.extend(down_states1) + up_states2.extend(down_states2) + states1 = np.array(up_states1) + states2 = np.array(up_states2) + + return states1, states2 + + def sample_context5(self, pos: int, g: np.ndarray, cutoff=0.25): + probas = g[pos-2] * g[pos-1][:, None] * g[pos][:, None, None] * \ + g[pos+1][:, None, None, None] * g[pos+2][:, None, None, None, None] + + indexes = np.where(probas > cutoff) + for idx in range(len(indexes[0])): + cxt_ids = [indexes[i][idx] for i in list(range(len(indexes)))[::-1]] + cxt = tuple(nucl_order5[x] for x in cxt_ids) + pr = probas[tuple(cxt_ids[::-1])] + yield cxt, pr + + def sample_context7(self, pos: int, g: np.ndarray, cutoff=0.25): + probas = g[pos-3] * g[pos-2][:, None] * g[pos-1][:, None, None] * \ + g[pos][:, None, None, None] * g[pos+1][:, None, None, None, None] * \ + g[pos+2][:, None, None, None, None, None] * \ + g[pos+3][:, None, None, None, None, None, None] + + indexes = np.where(probas > cutoff) + for idx in range(len(indexes[0])): + cxt_ids = [indexes[i][idx] for i in list(range(len(indexes)))[::-1]] + cxt = tuple(nucl_order5[x] for x in cxt_ids) + pr = probas[tuple(cxt_ids[::-1])] + yield cxt, pr + + +class MutSpec(CodonAnnotation): + def __init__( + self, path_to_tree, outdir, gcode=2, + use_proba=False, proba_cutoff=0.25, use_phylocoef=False, + genomes: GenomeStatesTotal = None, num_processes=8. 
+ ): + if not os.path.exists(path_to_tree): + raise ValueError(f"Path to tree doesn't exist: '{path_to_tree}'") + + CodonAnnotation.__init__(self, gencode=gcode) + + self.num_processes = num_processes + self.genomes = genomes + self.gcode = gcode + self.use_proba = use_proba + self.proba_cutoff = proba_cutoff + self.use_phylocoef = use_phylocoef if use_proba else False + self.outdir = outdir + logger.info(f"Using gencode {gcode}") + logger.info(f"Use probabilities of genomic states: {use_proba}") + logger.info(f"Use phylogenetic uncertainty coefficient: {use_phylocoef}") + logger.info(f"Minimal probability for mutations to use: {proba_cutoff}") + + self.fp_format = np.float32 + self.tree = PhyloTree(path_to_tree, format=1) + logger.info( + f"Tree loaded, number of leaf nodes: {len(self.tree)}, " + f"total number of nodes: {len(self.tree.get_cached_content())}, " + ) + rnd_genome = self.genomes.get_random_genome() + logger.info(f"Number of MSA sites: {len(rnd_genome)}") + + # mapping of MSA sites to truncated states ids + self.site2index = dict(zip(rnd_genome.index, range(len(rnd_genome)))) + + def open_handles(self, outdir): + self.handle = dict() + self.handle["mut"] = open(os.path.join(outdir, "mutations.tsv"), "w") + logger.debug("Handles opened") + + def close_handles(self): + for file in self.handle.values(): + file.close() + logger.debug("Handles closed") + + def extract_mutspec_from_tree(self): + t = self.num_processes + if not isinstance(t, int) or t < 1: + raise ValueError('num_processes must be positive integer') + + if t == 1: + self._derive_mutspec() + elif t > 1: + self._derive_mutspec_parallel() + + def _derive_mutspec(self): + logger.info("Start mutation extraction from tree") + self.open_handles(self.outdir) + add_header = defaultdict(lambda: True) + total_mut_num = 0 + + for edge_data in self.iter_branches(): + edge_mutations = self.process_branch(*edge_data[1:]) + mut_num = edge_mutations['ProbaFull'].sum() if self.use_proba and \ + 'ProbaFull' in edge_mutations.columns else len(edge_mutations) + total_mut_num += mut_num + + # dump current edge mutations + self.dump_table(edge_mutations, self.handle["mut"], add_header["mut"]) + add_header["mut"] = False + + self.close_handles() + logger.info(f"Processed {edge_data[1]} tree edges") + logger.info(f"Observed {total_mut_num:.3f} substitutions") + logger.info("Extraction of mutations from phylogenetic tree completed succesfully") + + def _derive_mutspec_parallel(self): + logger.info("Start parallel mutation extraction from tree") + + with mp.Pool(processes=self.num_processes) as pool: + genome_mutations_lst = pool.map(Branch.process_branch, self.iter_branches()) + + genome_mutations = pd.concat(genome_mutations_lst) + + self.open_handles(self.outdir) + self.dump_table(genome_mutations, self.handle["mut"], True) + self.close_handles() + + total_mut_num = genome_mutations['ProbaFull'].sum() if self.use_proba and \ + 'ProbaFull' in genome_mutations.columns else len(genome_mutations) + logger.info(f"Observed {total_mut_num:.3f} substitutions") + logger.info("Extraction of mutations from phylogenetic tree completed succesfully") + + # def iter_chunks(self, n): + # lst = self.get_edges_data() + # for i in range(0, len(lst), n): + # yield lst[i:i + n] + + def iter_branches(self): + """yield (self, edge_id, ref_node_name, alt_node_name, ref_genome, alt_genome, phylocoef)""" + # calculate phylogenetic uncertainty correction + if self.use_phylocoef: + phylocoefs = calc_phylocoefs(self.tree) + + for ei, (ref_node, alt_node) in 
enumerate(iter_tree_edges(self.tree), 1): + if alt_node.name not in self.genomes.nodes: + logger.warning(f"Skip edge '{ref_node.name}'-'{alt_node.name}' due to absence of '{alt_node.name}' genome") + continue + if ref_node.name not in self.genomes.nodes: + logger.warning(f"Skip edge '{ref_node.name}'-'{alt_node.name}' due to absence of '{ref_node.name}' genome") + continue + + # calculate phylogenetic uncertainty correction: + # coef for ref edge * coef for alt edge + if self.use_phylocoef: + phylocoef = self.fp_format(phylocoefs[ref_node.name] * phylocoefs[alt_node.name]) + else: + phylocoef = self.fp_format(1) + + # get genomes from storage + ref_genome = self.genomes.get_genome(ref_node.name) + alt_genome = self.genomes.get_genome(alt_node.name) + logger.debug(f'Genomes retrieved for branch {ei}') + + yield Branch(ei, ref_node.name, alt_node.name, ref_genome, alt_genome, + phylocoef, self.proba_cutoff, self.genomes.genome_size, self.site2index) + + def get_edges_data(self): + return list(self.iter_branches()) + + @staticmethod + def dump_table(df: pd.DataFrame, handle, header=False): + if header: + handle.write("\t".join(df.columns) + "\n") + handle.write(df.to_csv(sep="\t", index=None, header=None, float_format='%g')) + + def turn_to_MAP(self, states: np.ndarray): + if isinstance(states, pd.DataFrame): + warn('pd.DataFrame used as input instead of np.ndarray', ResourceWarning) + states = states.values + return ''.join([nucl_order5[x] for x in states.argmax(1)]) + + +def main(): + debug = False + current_datetime = str(datetime.now()).split('.')[0].replace(' ', '_') + if debug: + path_to_tree = 'data/tiny_tree2.nwk' + path_to_states = 'data/tiny_states2.parquet' + path_to_alignment = 'data/tiny_alignment2.parquet' + outdir = f'data/tmp_{current_datetime}' + else: + path_to_tree = 'data/ancrec.root.nwk' + path_to_states = 'data/states2.parquet' + path_to_alignment = 'data/alignment2.parquet' + outdir = 'data/multicore_run_v4' + + path_to_gappy_sites = './data/gappy_sites.csv' + gencode = 2 + proba = phylocoef = True + proba_cutoff = 0.25 + force = False + num_processes = 64 + + if os.path.exists(outdir): + if not force: + answer = input(f"Delete existing directory '{outdir}'? 
[Y/n] ") + if answer.upper() != "Y" and answer != "": + print("Interapted", file=sys.stderr) + exit(0) + rmtree(outdir) + print(f"Directory '{outdir}' deleted", file=sys.stderr) + os.makedirs(outdir) + + global logger + logfile = os.path.join(outdir, "run.log") + logger = load_logger(filename=logfile, stream_level='INFO') + + logger.info(f"Writing logs to '{logfile}'") + logger.debug("Command: " + " ".join(sys.argv)) + logger.debug(f"Output directory '{outdir}' created") + if debug: + logger.warning('DEBUG MODE ACTIVATED') + + start_time = time.time() + + genomes = GenomeStatesTotal(path_to_states, path_to_alignment, path_to_gappy_sites) + + MutSpec( + path_to_tree, outdir, gcode=gencode, + use_proba=proba, proba_cutoff=proba_cutoff, use_phylocoef=phylocoef, + genomes=genomes, num_processes=num_processes, + ).extract_mutspec_from_tree() + + end_time = time.time() + execution_time = end_time - start_time + logger.info(f"Script completed taking {execution_time:.2f} seconds ({execution_time/3600:.2f} hours)") + + +if __name__ == "__main__": + main() diff --git a/scripts/iqtree_states_add_part.py b/scripts/iqtree_states_add_part.py index 1585b77..a5bbe24 100644 --- a/scripts/iqtree_states_add_part.py +++ b/scripts/iqtree_states_add_part.py @@ -6,7 +6,12 @@ import sys -def main(path_to_states, path_to_out): +def main(): + try: + path_to_states, path_to_out = sys.argv[1:] + except: + print("ERROR\nUSAGE: script.py path_to_states path_to_out_states", file=sys.stderr) + with open(path_to_out, "w") as fout: with open(path_to_states) as fin: for line in fin: @@ -22,8 +27,4 @@ def main(path_to_states, path_to_out): if __name__ == "__main__": - try: - path_to_states, path_to_out = sys.argv[1:] - main(path_to_states, path_to_out) - except: - print("ERROR\nUSAGE: script.py path_to_states path_to_out_states", file=sys.stderr) + main() diff --git a/scripts/raxml_states2iqtree_states.py b/scripts/raxml_states2iqtree_states.py index 8f1deda..d7df672 100644 --- a/scripts/raxml_states2iqtree_states.py +++ b/scripts/raxml_states2iqtree_states.py @@ -23,7 +23,12 @@ def agrmax(a): return imax -def main(path_to_raxml_states, out_path): +def main(): + try: + path_to_raxml_states, out_path = sys.argv[1], sys.argv[2] + except: + RuntimeError("USAGE: script.py PATH_TO_RAxML_STATES PATH_TO_OUT_IQTREE_STATES") + with open(out_path, "w") as fout: fout.write("\t".join(header) + "\n") with open(path_to_raxml_states) as fin: @@ -45,7 +50,4 @@ def main(path_to_raxml_states, out_path): if __name__ == "__main__": # path_to_raxml_states = "data/RAxML_marginalAncestralProbabilities.tsv" # out_path = "data/out.state" - try: - main(sys.argv[1], sys.argv[2]) - except: - RuntimeError("USAGE: script.py PATH_TO_RAxML_STATES PATH_TO_OUT_IQTREE_STATES") + main() \ No newline at end of file diff --git a/scripts/rename_internal_nodes.py b/scripts/rename_internal_nodes.py index 3c21804..d3be9ef 100644 --- a/scripts/rename_internal_nodes.py +++ b/scripts/rename_internal_nodes.py @@ -13,8 +13,13 @@ def node_parent(node: PhyloNode): return None -def main(path_to_dist_tree, path_to_named_tree, path_to_out): +def main(): """Iteratively going from leaves to root and add names to dist tree""" + try: + _, path_to_dist_tree, path_to_named_tree, path_to_out = sys.argv + except: + print("ERROR\nUSAGE: script.py path_to_dist_tree path_to_named_tree path_to_out_tree", file=sys.stderr) + tree_dist = PhyloTree(path_to_dist_tree, format=0) tree_named = PhyloTree(path_to_named_tree, format=8) @@ -45,8 +50,5 @@ def main(path_to_dist_tree, 
path_to_named_tree, path_to_out): if __name__ == "__main__": - try: - _, path_to_dist_tree, path_to_named_tree, path_to_out = sys.argv - main(path_to_dist_tree, path_to_named_tree, path_to_out) - except: - print("ERROR\nUSAGE: script.py path_to_dist_tree path_to_named_tree path_to_out_tree", file=sys.stderr) + main() + diff --git a/setup.py b/setup.py index e264b64..fc1f76c 100644 --- a/setup.py +++ b/setup.py @@ -1,15 +1,3 @@ -import os -from setuptools import setup, find_packages +from setuptools import setup -scripts = [] -if os.path.exists("scripts"): - for file in os.listdir("scripts"): - if file.endswith(".py") and not file.startswith("_"): - fp = os.path.join("scripts", file) - scripts.append(fp) - -setup( - packages=find_packages(), - scripts=scripts, - package_data={'pymutspec': ['utils/configs/log_settings.yaml']}, -) +setup() \ No newline at end of file diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_tree_annotation.py b/tests/test_tree_annotation.py index d2dec82..8a0b76d 100644 --- a/tests/test_tree_annotation.py +++ b/tests/test_tree_annotation.py @@ -1,20 +1,15 @@ import pytest -from pymutspec.annotation import get_tree_outgrp_name, get_ingroup_root, get_tree_len, calc_phylocoefs - - -def test_get_tree_outgrp_name(tree_rooted): - outgrp = get_tree_outgrp_name(tree_rooted) - assert outgrp == 'Acanthisitta_chloris' +from pymutspec.annotation import get_ingroup_root, get_tree_len, calc_phylocoefs def test_get_ingroup_root(tree_rooted): - ingroup = get_ingroup_root(tree_rooted, 'Acanthisitta_chloris') + ingroup = get_ingroup_root(tree_rooted) assert ingroup.name == 'Node1' assert round(ingroup.dist, 4) == 0.0739 def test_get_tree_len(tree_rooted): - ingroup = get_ingroup_root(tree_rooted, 'Acanthisitta_chloris') + ingroup = get_ingroup_root(tree_rooted) l1 = get_tree_len(tree_rooted, 'geom_mean') l2 = get_tree_len(ingroup, 'geom_mean') @@ -24,9 +19,9 @@ def test_get_tree_len(tree_rooted): def test_calc_phylocoefs(tree_rooted): - phylocoefs = calc_phylocoefs(tree_rooted, 'Acanthisitta_chloris') + phylocoefs = calc_phylocoefs(tree_rooted) - ingroup = get_ingroup_root(tree_rooted, 'Acanthisitta_chloris') + ingroup = get_ingroup_root(tree_rooted) tl = get_tree_len(ingroup, 'geom_mean') node = 'Node1'