Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
185 changes: 101 additions & 84 deletions src/schemarecomb/libraries.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,126 +403,143 @@ def _schemarecomb_version(self):

@property
def dna_blocks(self):
# Not cached_property for construction purposes.
"""
Generates DNA blocks for Golden Gate assembly.

Finds a single 'design' codon/amino acid combination chosen by
the optimizer for each junction by testing all possibilities for all
homologues at the correct sequence position, ensuring consistent
overhangs.
"""
try:
return self._dna_blocks
except AttributeError:
# Must calculate dna_blocks for first time.
parents = self.energy_function.parents
aln_parents = zip(*parents.alignment)

breakpoints = self.breakpoints
blk_indices = self.block_indices

overhangs = self.gg_overhangs.copy()

# Need to add dummy start and end overhangs if blk_indices[0] or
# [-1] do not match breakpoints.
DUMMY_OVERHANG = Overhang(-1, '')
if blk_indices[0] != self.breakpoints[0].position:
overhangs.insert(0, DUMMY_OVERHANG)
if blk_indices[-1] != self.breakpoints[-1].position:
overhangs.append(DUMMY_OVERHANG)
breakpoint_map = breakpoints.copy()
if blk_indices[0] != breakpoints[0].position:
overhangs.insert(0, None)
breakpoint_map.insert(0, None)

if blk_indices[-1] != breakpoints[-1].position:
overhangs.append(None)
breakpoint_map.append(None)

start_end_overhangs = [ohs for ohs in zip(overhangs,
overhangs[1:])]
start_end_overhangs = list(zip(overhangs, overhangs[1:]))
start_end_breakpoints = list(
zip(breakpoint_map, breakpoint_map[1:]))

resenz = self.gg_enzyme
oh_len = resenz.overhang_len

dna_blocks = []
for parent_sr, parent_aa in zip(parents.records, aln_parents):

for parent_sr in parents.records:
parent_aa = parent_sr.seq
par_blocks = []
for blk_num, (ohs, positions) \
in enumerate(zip(start_end_overhangs, blk_indices)):
oh_start, oh_end = ohs
for blk_num, positions in enumerate(blk_indices):
oh_start, oh_end = start_end_overhangs[blk_num]
bp_start, bp_end = start_end_breakpoints[blk_num]
start, end = positions

aa_block = [aa for aa in parent_aa[start:end] if aa != '-']

# Start and end are affected by the GG sites.
aa_start, *aa_block_internal, aa_end = aa_block

cdn_options = [self.amino_to_cdn[aa] for aa
in aa_block_internal]
dna_block_internal = ''.join(choice(tuple(cdn_set))
for cdn_set in cdn_options)

# Handle start.
if oh_start == DUMMY_OVERHANG:
beg_dna = choice(tuple(self.amino_to_cdn[aa_start]))
aa_block = [aa for aa
in parent_aa[start:end] if aa != '-']
aa_start_parent, *aa_block_internal, \
aa_end_parent = aa_block
cdn_options = [self.amino_to_cdn[aa] for aa in
aa_block_internal]
dna_block_internal = ''.join(
choice(tuple(cdn_set)) for cdn_set in cdn_options)

# Handle start of the block
if oh_start is None:
beg_dna = choice(
tuple(self.amino_to_cdn[aa_start_parent]))
else:
# Number of dots at end of pattern.
end_dot = 6 - oh_start.ind - oh_len
pattern = (oh_start.seq + '.' * end_dot)[-3:]

pos = bp_start.position
possible_aas = {p.seq[pos] for p in parents.records if
p.seq[pos] != '-'}

matching_codon = ''
for aa in possible_aas:
if aa in self.amino_to_cdn:
for codon in self.amino_to_cdn[aa]:
if fullmatch(pattern, codon):
matching_codon = codon
break
if matching_codon:
break

if not matching_codon:
# Should never happen
raise ValueError(
f"Could not find a matching codon for pattern "
f"'{pattern}' at position {pos}."
f" Searched AAs: {possible_aas}")

if end_dot:
# Find the valid codon pattern, e.g. ATG, CG*
pattern = (oh_start.seq + '.'*end_dot)[-3:]
matching_codon = ''
for cand_codon in self.amino_to_cdn[aa_start]:
if fullmatch(pattern, cand_codon) is not None:
matching_codon = cand_codon
break
if matching_codon == '':
# This should never happen.
raise ValueError('No matching codon found.')

# Eliminate parts of codon overlapping with
# overhang.
oh_seq = oh_start.seq.lower()
overhang_amino = oh_seq + matching_codon[-end_dot:]
overhang_amino = oh_start.seq.lower()\
+ matching_codon[-end_dot:]
else:
# Overhang encompasses amino.
overhang_amino = oh_start.seq.lower()

# TODO: add additional bases at the ends?
beg_dna = resenz.for_restriction_site + overhang_amino

if oh_end == DUMMY_OVERHANG:
end_dna = choice(tuple(self.amino_to_cdn[aa_end]))
# Handle end of the block
if oh_end is None:
end_dna = choice(
tuple(self.amino_to_cdn[aa_end_parent]))
else:
# Number of dots at beginning of pattern.
begin_dot = oh_end.ind
pattern = ('.' * begin_dot + oh_end.seq)[:3]

#: The amino acid being modified is at position-1 of
# the breakpoint.
pos = bp_end.position - 1

possible_aas = {p.seq[pos] for p in parents.records if
p.seq[pos] != '-'}

matching_codon = ''
for aa in possible_aas:
if aa in self.amino_to_cdn:
for codon in self.amino_to_cdn[aa]:
if fullmatch(pattern, codon):
matching_codon = codon
break
if matching_codon:
break

if not matching_codon:
# Should never raise
raise ValueError(
f"Could not find a matching codon for pattern"
f" '{pattern}' at position {pos + 1} "
f"(AA at pos {pos}). "
f"Searched AAs: {possible_aas}")

if begin_dot:
pattern = ('.'*begin_dot + oh_end.seq)[:3]
matching_codon = ''
for cand_codon in self.amino_to_cdn[aa_end]:
if fullmatch(pattern, cand_codon) is not None:
matching_codon = cand_codon
break
if matching_codon == '':
# This should never happen.
raise ValueError('No matching codon found for'
' end_dna.')

oh_seq = oh_end.seq.lower()
amino_overhang = (matching_codon[:begin_dot]
+ oh_seq)
amino_overhang = matching_codon[
:begin_dot] + oh_end.seq.lower()
else:
amino_overhang = oh_start.seq.lower()

amino_overhang = oh_end.seq.lower()
end_dna = amino_overhang + resenz.rev_restriction_site

# Add gg sites.
block_seq = Seq(beg_dna + dna_block_internal + end_dna)

# TODO: Maybe want to change description to include block
# info?
block_name = parent_sr.id + f'_block-{blk_num}'
block_sr = SeqRecord(
block_seq,
id=block_name,
name=block_name,
description=parent_sr.description
)

block_name = f'{parent_sr.id}_block-{blk_num}'
block_sr = SeqRecord(block_seq, id=block_name,
name=block_name,
description=f'{parent_sr.description}'
f' | block {blk_num}')
par_blocks.append(block_sr)

dna_blocks.extend(par_blocks)

# Cache so that the generated blocks don't change.
self._dna_blocks = dna_blocks

return dna_blocks

def find_best_overhangs(self) -> None:
Expand Down
49 changes: 43 additions & 6 deletions tests/unit/test_libraries.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import pytest

import schemarecomb as sr
from Bio import Seq


@pytest.fixture
Expand Down Expand Up @@ -159,6 +160,30 @@ def test_json(lib_config, breakpoints):
assert in_lib.amino_to_cdn == lib.amino_to_cdn


def bsai_cut(frag):
"""BsaI digest fragment"""
if frag[:6] == "ggtctc":
frag = frag[7:]

if frag[-6:] == "gagacc":
frag = frag[:-7]

return frag


def assemble(cut_frags):
""" Assemble compatible overhangs"""
overhangs = []
assembly = cut_frags[0]
for frag in cut_frags[1:]:
if assembly[-4:] == frag[:4]:
assembly += frag[4:]
overhangs.append(frag[:4].upper())
else:
raise ValueError
return assembly, overhangs


def test_dna_blocks(lib_config, breakpoints):
parents = lib_config.energy_function.parents
max_ind = len(parents.alignment)
Expand All @@ -168,13 +193,25 @@ def test_dna_blocks(lib_config, breakpoints):

lib = sr.Library.calc_from_config(bp_set, Decimal(1.0), lib_config)

# TODO: Simulate Golden Gate and compare to parents.
# see how many fragments and parents in assembly.
no_frags = len(lib.block_indices)
hom_seq = parents.records

# BsaI digest
cut_frags = [bsai_cut(frag.seq) for frag in lib.dna_blocks]
overhangs_list = []

for hom in range(0, len(hom_seq)):
start = (hom) * no_frags
end = (hom + 1) * no_frags
print(f'assembling {start} to {end}')
assembly, overhangs = assemble(cut_frags[start:end])
overhangs_list.append(overhangs)
assembly_aa_seq = str(Seq.Seq(assembly).translate())

br_aa_seq = str(hom_seq[hom].seq).replace("-", "")

assert lib.dna_blocks
for a in lib.dna_blocks:
assert a
for b in a:
assert b
assert assembly_aa_seq == br_aa_seq


# TODO: Test find_best_overhangs.