From 213ae9b30c77216ccf956ba449723a53c61712cb Mon Sep 17 00:00:00 2001 From: clairenoble Date: Fri, 11 Jul 2025 10:39:59 +0100 Subject: [PATCH] Updated DNA blocks property. I found issues with some sequences where the blocks from different parents were being assigned different overhangs when I ran generate_libraries function when the amino acids in different variants were different at the breakpoints, leading to incompatible assemblies. The updated function takes into account the amino acids at the breakpoints for all the parents, so all the blocks for all the homologues can be made cross-compatible --- src/schemarecomb/libraries.py | 185 +++++++++++++++++++--------------- tests/unit/test_libraries.py | 49 +++++++-- 2 files changed, 144 insertions(+), 90 deletions(-) diff --git a/src/schemarecomb/libraries.py b/src/schemarecomb/libraries.py index a246c5f..1e77724 100644 --- a/src/schemarecomb/libraries.py +++ b/src/schemarecomb/libraries.py @@ -403,126 +403,143 @@ def _schemarecomb_version(self): @property def dna_blocks(self): - # Not cached_property for construction purposes. + """ + Generates DNA blocks for Golden Gate assembly. + + Finds a single 'design' codon/amino acid combination chosen by + the optimizer for each junction by testing all possibilities for all + homologues at the correct sequence position, ensuring consistent + overhangs. + """ try: return self._dna_blocks except AttributeError: - # Must calculate dna_blocks for first time. parents = self.energy_function.parents - aln_parents = zip(*parents.alignment) - + breakpoints = self.breakpoints blk_indices = self.block_indices - overhangs = self.gg_overhangs.copy() - # Need to add dummy start and end overhangs if blk_indices[0] or - # [-1] do not match breakpoints. - DUMMY_OVERHANG = Overhang(-1, '') - if blk_indices[0] != self.breakpoints[0].position: - overhangs.insert(0, DUMMY_OVERHANG) - if blk_indices[-1] != self.breakpoints[-1].position: - overhangs.append(DUMMY_OVERHANG) + breakpoint_map = breakpoints.copy() + if blk_indices[0] != breakpoints[0].position: + overhangs.insert(0, None) + breakpoint_map.insert(0, None) + + if blk_indices[-1] != breakpoints[-1].position: + overhangs.append(None) + breakpoint_map.append(None) - start_end_overhangs = [ohs for ohs in zip(overhangs, - overhangs[1:])] + start_end_overhangs = list(zip(overhangs, overhangs[1:])) + start_end_breakpoints = list( + zip(breakpoint_map, breakpoint_map[1:])) resenz = self.gg_enzyme oh_len = resenz.overhang_len - dna_blocks = [] - for parent_sr, parent_aa in zip(parents.records, aln_parents): + + for parent_sr in parents.records: + parent_aa = parent_sr.seq par_blocks = [] - for blk_num, (ohs, positions) \ - in enumerate(zip(start_end_overhangs, blk_indices)): - oh_start, oh_end = ohs + for blk_num, positions in enumerate(blk_indices): + oh_start, oh_end = start_end_overhangs[blk_num] + bp_start, bp_end = start_end_breakpoints[blk_num] start, end = positions - aa_block = [aa for aa in parent_aa[start:end] if aa != '-'] - - # Start and end are affected by the GG sites. - aa_start, *aa_block_internal, aa_end = aa_block - - cdn_options = [self.amino_to_cdn[aa] for aa - in aa_block_internal] - dna_block_internal = ''.join(choice(tuple(cdn_set)) - for cdn_set in cdn_options) - - # Handle start. - if oh_start == DUMMY_OVERHANG: - beg_dna = choice(tuple(self.amino_to_cdn[aa_start])) + aa_block = [aa for aa + in parent_aa[start:end] if aa != '-'] + aa_start_parent, *aa_block_internal, \ + aa_end_parent = aa_block + cdn_options = [self.amino_to_cdn[aa] for aa in + aa_block_internal] + dna_block_internal = ''.join( + choice(tuple(cdn_set)) for cdn_set in cdn_options) + + # Handle start of the block + if oh_start is None: + beg_dna = choice( + tuple(self.amino_to_cdn[aa_start_parent])) else: - # Number of dots at end of pattern. end_dot = 6 - oh_start.ind - oh_len + pattern = (oh_start.seq + '.' * end_dot)[-3:] + + pos = bp_start.position + possible_aas = {p.seq[pos] for p in parents.records if + p.seq[pos] != '-'} + + matching_codon = '' + for aa in possible_aas: + if aa in self.amino_to_cdn: + for codon in self.amino_to_cdn[aa]: + if fullmatch(pattern, codon): + matching_codon = codon + break + if matching_codon: + break + + if not matching_codon: + # Should never happen + raise ValueError( + f"Could not find a matching codon for pattern " + f"'{pattern}' at position {pos}." + f" Searched AAs: {possible_aas}") if end_dot: - # Find the valid codon pattern, e.g. ATG, CG* - pattern = (oh_start.seq + '.'*end_dot)[-3:] - matching_codon = '' - for cand_codon in self.amino_to_cdn[aa_start]: - if fullmatch(pattern, cand_codon) is not None: - matching_codon = cand_codon - break - if matching_codon == '': - # This should never happen. - raise ValueError('No matching codon found.') - - # Eliminate parts of codon overlapping with - # overhang. - oh_seq = oh_start.seq.lower() - overhang_amino = oh_seq + matching_codon[-end_dot:] + overhang_amino = oh_start.seq.lower()\ + + matching_codon[-end_dot:] else: - # Overhang encompasses amino. overhang_amino = oh_start.seq.lower() - - # TODO: add additional bases at the ends? beg_dna = resenz.for_restriction_site + overhang_amino - if oh_end == DUMMY_OVERHANG: - end_dna = choice(tuple(self.amino_to_cdn[aa_end])) + # Handle end of the block + if oh_end is None: + end_dna = choice( + tuple(self.amino_to_cdn[aa_end_parent])) else: - # Number of dots at beginning of pattern. begin_dot = oh_end.ind + pattern = ('.' * begin_dot + oh_end.seq)[:3] + + #: The amino acid being modified is at position-1 of + # the breakpoint. + pos = bp_end.position - 1 + + possible_aas = {p.seq[pos] for p in parents.records if + p.seq[pos] != '-'} + + matching_codon = '' + for aa in possible_aas: + if aa in self.amino_to_cdn: + for codon in self.amino_to_cdn[aa]: + if fullmatch(pattern, codon): + matching_codon = codon + break + if matching_codon: + break + + if not matching_codon: + # Should never raise + raise ValueError( + f"Could not find a matching codon for pattern" + f" '{pattern}' at position {pos + 1} " + f"(AA at pos {pos}). " + f"Searched AAs: {possible_aas}") if begin_dot: - pattern = ('.'*begin_dot + oh_end.seq)[:3] - matching_codon = '' - for cand_codon in self.amino_to_cdn[aa_end]: - if fullmatch(pattern, cand_codon) is not None: - matching_codon = cand_codon - break - if matching_codon == '': - # This should never happen. - raise ValueError('No matching codon found for' - ' end_dna.') - - oh_seq = oh_end.seq.lower() - amino_overhang = (matching_codon[:begin_dot] - + oh_seq) + amino_overhang = matching_codon[ + :begin_dot] + oh_end.seq.lower() else: - amino_overhang = oh_start.seq.lower() - + amino_overhang = oh_end.seq.lower() end_dna = amino_overhang + resenz.rev_restriction_site - # Add gg sites. block_seq = Seq(beg_dna + dna_block_internal + end_dna) - - # TODO: Maybe want to change description to include block - # info? - block_name = parent_sr.id + f'_block-{blk_num}' - block_sr = SeqRecord( - block_seq, - id=block_name, - name=block_name, - description=parent_sr.description - ) - + block_name = f'{parent_sr.id}_block-{blk_num}' + block_sr = SeqRecord(block_seq, id=block_name, + name=block_name, + description=f'{parent_sr.description}' + f' | block {blk_num}') par_blocks.append(block_sr) dna_blocks.extend(par_blocks) - # Cache so that the generated blocks don't change. self._dna_blocks = dna_blocks - return dna_blocks def find_best_overhangs(self) -> None: diff --git a/tests/unit/test_libraries.py b/tests/unit/test_libraries.py index 4a9025a..150fbf6 100644 --- a/tests/unit/test_libraries.py +++ b/tests/unit/test_libraries.py @@ -7,6 +7,7 @@ import pytest import schemarecomb as sr +from Bio import Seq @pytest.fixture @@ -159,6 +160,30 @@ def test_json(lib_config, breakpoints): assert in_lib.amino_to_cdn == lib.amino_to_cdn +def bsai_cut(frag): + """BsaI digest fragment""" + if frag[:6] == "ggtctc": + frag = frag[7:] + + if frag[-6:] == "gagacc": + frag = frag[:-7] + + return frag + + +def assemble(cut_frags): + """ Assemble compatible overhangs""" + overhangs = [] + assembly = cut_frags[0] + for frag in cut_frags[1:]: + if assembly[-4:] == frag[:4]: + assembly += frag[4:] + overhangs.append(frag[:4].upper()) + else: + raise ValueError + return assembly, overhangs + + def test_dna_blocks(lib_config, breakpoints): parents = lib_config.energy_function.parents max_ind = len(parents.alignment) @@ -168,13 +193,25 @@ def test_dna_blocks(lib_config, breakpoints): lib = sr.Library.calc_from_config(bp_set, Decimal(1.0), lib_config) - # TODO: Simulate Golden Gate and compare to parents. + # see how many fragments and parents in assembly. + no_frags = len(lib.block_indices) + hom_seq = parents.records + + # BsaI digest + cut_frags = [bsai_cut(frag.seq) for frag in lib.dna_blocks] + overhangs_list = [] + + for hom in range(0, len(hom_seq)): + start = (hom) * no_frags + end = (hom + 1) * no_frags + print(f'assembling {start} to {end}') + assembly, overhangs = assemble(cut_frags[start:end]) + overhangs_list.append(overhangs) + assembly_aa_seq = str(Seq.Seq(assembly).translate()) + + br_aa_seq = str(hom_seq[hom].seq).replace("-", "") - assert lib.dna_blocks - for a in lib.dna_blocks: - assert a - for b in a: - assert b + assert assembly_aa_seq == br_aa_seq # TODO: Test find_best_overhangs.