From bf52a8fcca41d2de775e4b6a089aed885b0e8ae4 Mon Sep 17 00:00:00 2001 From: Philip W Fowler Date: Wed, 15 Nov 2023 06:55:31 +0000 Subject: [PATCH] created Sample class to abstract some logic --- src/sbmlsim/Batch.py | 88 +++++++---------------------------------- src/sbmlsim/Sample.py | 86 ++++++++++++++++++++++++++++++++++++++++ src/sbmlsim/__init__.py | 1 + 3 files changed, 102 insertions(+), 73 deletions(-) create mode 100644 src/sbmlsim/Sample.py diff --git a/src/sbmlsim/Batch.py b/src/sbmlsim/Batch.py index 5b57b5b..e922cbb 100644 --- a/src/sbmlsim/Batch.py +++ b/src/sbmlsim/Batch.py @@ -1,11 +1,12 @@ import numpy -import copy import random import pandas as pd import gumpy import piezo +import sbmlsim + class Batch: """ @@ -97,8 +98,7 @@ def generate( # iterate through the required number of samples for n_sample in range(n_samples): - # make a copy of the reference gene - sample_gene = copy.deepcopy(self.reference_gene) + sample = sbmlsim.Sample(self.reference_gene) # decide the phenotype of the sample label = self._get_sample_type(proportion_resistant) @@ -109,12 +109,12 @@ def generate( number_resistant, selected_resistant_mutations, remaining_aa_positions, - ) = self._get_resistant_mutations(n_res, distribution, sample_gene) + ) = self._get_resistant_mutations(n_res, distribution, sample) else: number_resistant = 0 selected_resistant_mutations = [] - remaining_aa_positions = sample_gene.amino_acid_number + remaining_aa_positions = sample.gene.amino_acid_number # determine the susceptible mutations, if any if n_sus > 0: @@ -122,7 +122,7 @@ def generate( number_susceptible, selected_susceptible_mutations, ) = self._get_susceptible_mutations( - n_sus, remaining_aa_positions, sample_gene + n_sus, remaining_aa_positions, sample ) else: selected_susceptible_mutations = [] @@ -134,20 +134,12 @@ def generate( ) # apply the mutations to the sample gene - self._apply_mutations(selected_mutations, sample_gene) - - # translate the nucleotide sequence to amino acids - sample_gene._translate_sequence() - - # create a string of the amino acid sequence - sample_amino_acid_sequence = "".join( - i for i in sample_gene.amino_acid_sequence - ) + sample.apply_mutations(selected_mutations) # construct the row for the sample table sample_metadata = [ n_sample, - sample_amino_acid_sequence, + sample.amino_acid_sequence, label, number_resistant, number_susceptible, @@ -156,8 +148,7 @@ def generate( samples_rows.append(sample_metadata) # mutations dataframe - diff = self.reference_gene - sample_gene - for i in diff.mutations: + for i in sample.mutations: if i in selected_resistant_mutations: mut_label = "R" else: @@ -205,9 +196,6 @@ def _define_lookups(self): [a + b + c for a in bases for b in bases for c in bases] ) self.codon_to_amino_acid = dict(zip(all_codons, aminoacids)) - self.amino_acid_to_codon = {} - for amino_acid, codon in zip(aminoacids, all_codons): - self.amino_acid_to_codon.setdefault(amino_acid, []).append(codon) def _get_mutations(self): # subset down to mutations that are missense (not premature stop) SNPs in the CDS of the gene of interest @@ -238,7 +226,7 @@ def _get_sample_type(self, proportion_resistant): return label - def _get_resistant_mutations(self, n_res, distribution, sample_gene): + def _get_resistant_mutations(self, n_res, distribution, sample): # choose resistance-conferring mutations for a sample if distribution == "poisson": @@ -276,13 +264,13 @@ def _get_resistant_mutations(self, n_res, distribution, sample_gene): assert mutation[-1] != "!", "cannot mutate to STOP codon" # Get amino acid positions that are not altered by selected resistant mutations - remaining_aa_positions = sample_gene.amino_acid_number[ - ~numpy.isin(sample_gene.amino_acid_number, selected_resistant_positions) + remaining_aa_positions = sample.gene.amino_acid_number[ + ~numpy.isin(sample.gene.amino_acid_number, selected_resistant_positions) ] return number_resistant, selected_resistant_mutations, remaining_aa_positions - def _get_susceptible_mutations(self, n_sus, remaining_aa_positions, sample_gene): + def _get_susceptible_mutations(self, n_sus, remaining_aa_positions, sample): # choose susceptible mutations for a sample # randomly choose a number of susceptible mutations @@ -295,8 +283,8 @@ def _get_susceptible_mutations(self, n_sus, remaining_aa_positions, sample_gene) list(remaining_aa_positions), k=number_susceptible ): # find out the wildtype codon - ref_codon = sample_gene.codons[ - sample_gene.amino_acid_number == susceptible_position + ref_codon = sample.gene.codons[ + sample.gene.amino_acid_number == susceptible_position ][0] # find out the wildtype amino acid @@ -325,49 +313,3 @@ def _get_susceptible_mutations(self, n_sus, remaining_aa_positions, sample_gene) ) return number_susceptible, selected_susceptible_mutations - - def _apply_mutations(self, selected_mutations, sample_gene): - # apply mutations to sample gene - - for mutation in selected_mutations: - ref_aa = mutation[0] - alt_aa = mutation[-1] - aa_pos = int(mutation[1:-1]) - - assert ( - ref_aa - == sample_gene.amino_acid_sequence[ - sample_gene.amino_acid_number == aa_pos - ][0] - ), "reference amino acid supplied in mutation does not match gene!" - - assert alt_aa != "!", "cannot mutate to STOP codon" - - ref_codon = sample_gene.codons[sample_gene.amino_acid_number == aa_pos][0] - - # Get base changes - alt_codon = None - for codon in self.amino_acid_to_codon[alt_aa]: - counter = sum(1 for a, b in zip(ref_codon, codon) if a != b) - if counter == 1: - alt_codon = codon - break - - base_pos = 3 * aa_pos - 2 - for i, j in zip(ref_codon, alt_codon): - if i != j: - ref_base = i - alt_base = j - break - base_pos += 1 - - assert ( - self.reference_gene.nucleotide_sequence[ - self.reference_gene.nucleotide_number == base_pos - ][0] - == ref_base - ) - - sample_gene.nucleotide_sequence[ - sample_gene.nucleotide_number == base_pos - ] = alt_base diff --git a/src/sbmlsim/Sample.py b/src/sbmlsim/Sample.py new file mode 100644 index 0000000..3bc1608 --- /dev/null +++ b/src/sbmlsim/Sample.py @@ -0,0 +1,86 @@ +import copy +import numpy + + +class Sample: + """ + Class for a single sample + + Args: + reference: reference gumpy.Gene object + """ + + def __init__(self, reference): + self.reference = reference + # make a copy of the reference gene + self.gene = copy.deepcopy(reference) + + aminoacids = "FFLLSSSSYY!!CC!WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG" + bases = ["t", "c", "a", "g"] + all_codons = numpy.array( + [a + b + c for a in bases for b in bases for c in bases] + ) + self.amino_acid_to_codon = {} + for amino_acid, codon in zip(aminoacids, all_codons): + self.amino_acid_to_codon.setdefault(amino_acid, []).append(codon) + + def apply_mutations(self, mutations): + """ + Apply mutations to the sample + + Args: + mutations: list of mutations to apply to the sample + """ + + for mutation in mutations: + ref_aa = mutation[0] + alt_aa = mutation[-1] + aa_pos = int(mutation[1:-1]) + + assert ( + ref_aa + == self.gene.amino_acid_sequence[self.gene.amino_acid_number == aa_pos][ + 0 + ] + ), "reference amino acid supplied in mutation does not match gene!" + + assert alt_aa != "!", "cannot mutate to STOP codon" + + ref_codon = self.gene.codons[self.gene.amino_acid_number == aa_pos][0] + + # Get base changes + alt_codon = None + for codon in self.amino_acid_to_codon[alt_aa]: + counter = sum(1 for a, b in zip(ref_codon, codon) if a != b) + if counter == 1: + alt_codon = codon + break + + base_pos = 3 * aa_pos - 2 + for i, j in zip(ref_codon, alt_codon): + if i != j: + ref_base = i + alt_base = j + break + base_pos += 1 + + assert ( + self.reference.nucleotide_sequence[ + self.reference.nucleotide_number == base_pos + ][0] + == ref_base + ) + + self.gene.nucleotide_sequence[ + self.gene.nucleotide_number == base_pos + ] = alt_base + + # translate the nucleotide sequence to amino acids + self.gene._translate_sequence() + + # create a string of the amino acid sequence + self.amino_acid_sequence = "".join(i for i in self.gene.amino_acid_sequence) + + diff = self.reference - self.gene + + self.mutations = diff.mutations diff --git a/src/sbmlsim/__init__.py b/src/sbmlsim/__init__.py index 662e9e2..8b3f9dc 100644 --- a/src/sbmlsim/__init__.py +++ b/src/sbmlsim/__init__.py @@ -3,3 +3,4 @@ __version__ = "0.0.1" from .Batch import Batch +from .Sample import Sample