From dd35cf2f1a2b38a30f0c5607356f82faa0a66f56 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Thu, 22 Jan 2026 22:41:37 +0300 Subject: [PATCH 01/41] transferred universal barcode detector, in progress --- detect_barcodes.py | 56 ++-- src/barcode_calling/__init__.py | 5 + src/barcode_calling/callers/__init__.py | 6 + src/barcode_calling/callers/base.py | 11 +- src/barcode_calling/callers/curio.py | 6 + .../callers/extraction_result.py | 121 +++++++ .../callers/molecule_structure.py | 169 ++++++++++ src/barcode_calling/callers/stereo.py | 6 + src/barcode_calling/callers/tenx.py | 5 + .../callers/universal_extraction.py | 305 ++++++++++++++++++ src/barcode_calling/common.py | 82 ++++- src/barcode_calling/indexers/base.py | 36 +++ src/modes.py | 7 +- 13 files changed, 774 insertions(+), 41 deletions(-) create mode 100644 src/barcode_calling/callers/extraction_result.py create mode 100644 src/barcode_calling/callers/molecule_structure.py create mode 100644 src/barcode_calling/callers/universal_extraction.py diff --git a/detect_barcodes.py b/detect_barcodes.py index de20df22..81597666 100755 --- a/detect_barcodes.py +++ b/detect_barcodes.py @@ -16,14 +16,13 @@ from traceback import print_exc import shutil from collections import defaultdict -import numpy import pysam from Bio import SeqIO, Seq, SeqRecord import logging from src.modes import IsoQuantMode from src.error_codes import IsoQuantExitCode -from src.barcode_calling.common import bit_to_str, reverese_complement +from src.barcode_calling.common import reverese_complement, load_barcodes from src.barcode_calling import ( TenXBarcodeDetector, CurioBarcodeDetector, @@ -31,6 +30,8 @@ SharedMemoryStereoSplittingBarcodeDetector, ReadStats, VisiumHDBarcodeDetector, + UniversalSingleMoleculeExtractor, + MoleculeStructure ) logger = logging.getLogger('IsoQuant') @@ -43,7 +44,8 @@ IsoQuantMode.stereoseq_nosplit: SharedMemoryStereoBarcodeDetector, IsoQuantMode.stereoseq: SharedMemoryStereoSplittingBarcodeDetector, IsoQuantMode.visium_5prime: TenXBarcodeDetector, - IsoQuantMode.visium_hd: VisiumHDBarcodeDetector + IsoQuantMode.visium_hd: VisiumHDBarcodeDetector, + IsoQuantMode.custom_sc: UniversalSingleMoleculeExtractor } BARCODE_FILES_REQUIRED = {IsoQuantMode.tenX_v3: [1], @@ -51,7 +53,8 @@ IsoQuantMode.stereoseq_nosplit: [1], IsoQuantMode.stereoseq: [1], IsoQuantMode.visium_5prime: [1], - IsoQuantMode.visium_hd: [2] + IsoQuantMode.visium_hd: [2], + IsoQuantMode.custom_sc: [0] } @@ -108,7 +111,7 @@ def __init__(self, output_file_name, barcode_detector, header=False, output_sequ self.output_sequences_file = open(self.output_sequences, "w") self.process_function = self._process_read_split if header: - self.output_file.write(barcode_detector.result_type().header() + "\n") + self.output_file.write(barcode_detector.header() + "\n") self.read_stat = ReadStats() def get_stats(self): @@ -261,12 +264,23 @@ def process_chunk(barcode_detector, read_chunk, output_file, num, out_fasta=None def prepare_barcodes(args): + if args.mode == IsoQuantMode.custom_sc: + if not args.molecule: + logger.critical("Custom single-cell/spatial mode requires molecule description to be provided via --molecule") + sys.exit(IsoQuantExitCode.INCOMPATIBLE_OPTIONS) + if not os.path.isfile(args.molecule): + logger.critical("Molecule file %s does not exist" % args.molecule) + sys.exit(IsoQuantExitCode.INPUT_FILE_NOT_FOUND) + + return MoleculeStructure(open(args.molecule)) + logger.info("Using barcodes from %s" % ", ".join(args.barcodes)) barcode_files = len(args.barcodes) if barcode_files not in BARCODE_FILES_REQUIRED[args.mode]: logger.critical("Barcode calling mode %s requires %s files, %d provided" % (args.mode.name, " or ".join([str(x) for x in BARCODE_FILES_REQUIRED[args.mode]]), barcode_files)) sys.exit(IsoQuantExitCode.BARCODE_FILE_COUNT_MISMATCH) + barcodes = [] for bc in args.barcodes: barcodes.append(load_barcodes(bc, needs_iterator=args.mode.needs_barcode_iterator())) @@ -399,7 +413,7 @@ def _process_single_file_in_parallel(input_file, output_tsv, out_fasta, args, ba with open(output_tsv, "w") as final_output_tsv: final_output_fasta = open(out_fasta, "w") if out_fasta else None - header = BARCODE_CALLING_MODES[args.mode].result_type().header() + header = BARCODE_CALLING_MODES[args.mode].header() final_output_tsv.write(header + "\n") stat_dict = defaultdict(int) for tmp_file, tmp_fasta in output_files: @@ -439,34 +453,6 @@ def process_in_parallel(args): logger.info("Finished barcode calling") -def load_barcodes(inf, needs_iterator=False): - if inf.endswith("h5") or inf.endswith("hdf5"): - return load_h5_barcodes_bit(inf) - - if inf.endswith("gz") or inf.endswith("gzip"): - handle = gzip.open(inf, "rt") - else: - handle = open(inf, "r") - - barcode_iterator = iter(l.strip().split()[0] for l in handle) - if needs_iterator: - return barcode_iterator - - return [b for b in barcode_iterator] - - -def load_h5_barcodes_bit(h5_file_path, dataset_name='bpMatrix_1'): - raise NotImplementedError() - import h5py - barcode_list = [] - with h5py.File(h5_file_path, 'r') as h5_file: - dataset = numpy.array(h5_file[dataset_name]) - for row in dataset: - for col in row: - barcode_list.append(bit_to_str(int(col[0]))) - return barcode_list - - def set_logger(logger_instance, args): logger_instance.setLevel(logging.INFO) if args.debug: @@ -492,6 +478,8 @@ def add_hidden_option(*args, **kwargs): # show command only with --full-help # parser.add_argument("--umi", "-u", type=str, help="potential UMIs, detected de novo if not set") parser.add_argument("--mode", type=str, help="mode to be used", choices=[x.name for x in BARCODE_CALLING_MODES.keys()], default=IsoQuantMode.stereoseq.name) + parser.add_argument("--molecule", type=str, help="MDF files with molecule description (for custom_sc mode only)") + parser.add_argument("--input", "-i", nargs='+', type=str, help="input reads in [gzipped] FASTA, FASTQ, BAM, SAM", required=True) parser.add_argument("--threads", "-t", type=int, help="threads to use (16)", default=16) diff --git a/src/barcode_calling/__init__.py b/src/barcode_calling/__init__.py index f04b78c6..ae7cca81 100644 --- a/src/barcode_calling/__init__.py +++ b/src/barcode_calling/__init__.py @@ -49,6 +49,8 @@ SharedMemoryWrapper, TenXBarcodeDetector, VisiumHDBarcodeDetector, + UniversalSingleMoleculeExtractor, + MoleculeStructure ) # Utilities @@ -82,6 +84,9 @@ # 10x detectors 'TenXBarcodeDetector', 'VisiumHDBarcodeDetector', + # universal calling + 'UniversalSingleMoleculeExtractor', + 'MoleculeStructure', # Utilities 'str_to_2bit', 'bit_to_str', diff --git a/src/barcode_calling/callers/__init__.py b/src/barcode_calling/callers/__init__.py index 4263cf5d..76ff7930 100644 --- a/src/barcode_calling/callers/__init__.py +++ b/src/barcode_calling/callers/__init__.py @@ -55,6 +55,10 @@ VisiumHDBarcodeDetector, ) +from .molecule_structure import MoleculeStructure + +from .universal_extraction import UniversalSingleMoleculeExtractor + __all__ = [ # Result classes 'BarcodeDetectionResult', @@ -76,4 +80,6 @@ # 10x detectors 'TenXBarcodeDetector', 'VisiumHDBarcodeDetector', + 'MoleculeStructure', + 'UniversalSingleMoleculeExtractor', ] diff --git a/src/barcode_calling/callers/base.py b/src/barcode_calling/callers/base.py index bc6a9258..0f86ca5d 100644 --- a/src/barcode_calling/callers/base.py +++ b/src/barcode_calling/callers/base.py @@ -30,6 +30,9 @@ def increase_if_valid(val: Optional[int], delta: int) -> Optional[int]: return val + + + class BarcodeDetectionResult: """ Base class for barcode detection results. @@ -333,11 +336,11 @@ def add_read(self, barcode_detection_result: BarcodeDetectionResult) -> None: for a in barcode_detection_result.get_additional_attributes(): self.additional_attributes_counts[a] += 1 # Count valid barcode - if barcode_detection_result.barcode != BarcodeDetectionResult.NOSEQ: - self.bc_count += 1 +# if barcode_detection_result.barcode != BarcodeDetectionResult.NOSEQ: +# self.bc_count += 1 # Count valid UMI - if barcode_detection_result.UMI_good: - self.umi_count += 1 +# if barcode_detection_result.UMI_good: +# self.umi_count += 1 def add_custom_stats(self, stat_name: str, val: int) -> None: """ diff --git a/src/barcode_calling/callers/curio.py b/src/barcode_calling/callers/curio.py index 24434359..3335060c 100644 --- a/src/barcode_calling/callers/curio.py +++ b/src/barcode_calling/callers/curio.py @@ -193,6 +193,9 @@ def _find_barcode_umi_fwd(self, read_id: str, sequence: str) -> LinkerBarcodeDet def result_type(): return LinkerBarcodeDetectionResult + def header(self): + return self.result_type().header() + class CurioIlluminaDetector: """ @@ -329,3 +332,6 @@ def _find_barcode_umi_fwd(self, read_id: str, sequence: str) -> LinkerBarcodeDet @staticmethod def result_type(): return LinkerBarcodeDetectionResult + + def header(self): + return self.result_type().header() \ No newline at end of file diff --git a/src/barcode_calling/callers/extraction_result.py b/src/barcode_calling/callers/extraction_result.py new file mode 100644 index 00000000..8df54072 --- /dev/null +++ b/src/barcode_calling/callers/extraction_result.py @@ -0,0 +1,121 @@ +########################################################################### +# Copyright (c) 2026 University of Helsinki +# # All Rights Reserved +# See file LICENSE for details. +############################################################################ + +import logging +from collections import defaultdict +from .molecule_structure import MoleculeStructure, ElementType + +logger = logging.getLogger('IsoQuant') + + +class DetectedElement: + def __init__(self, start=-1, end=-1, score=-1, seq=None): + self.start = start + self.end = end + self.score = score + self.seq = seq + + +class ExtractionResult: + def __init__(self, molecule_structure: MoleculeStructure, read_id: str, strand: str, detected_results: dict): + self.molecule_structure = molecule_structure + self.read_id = read_id + self.strand = strand + self.detected_results = detected_results + + def is_valid(self) -> bool: + """Check if a valid barcode was detected.""" + return True + + def update_coordinates(self, delta: int) -> None: + """ + Shift all genomic coordinates by delta. + + Used when processing read subsequences. + + Args: + delta: Amount to shift coordinates + """ + pass + + def more_informative_than(self, that) -> bool: + """ + Compare two results to determine which is more informative. + + Args: + that: Another detection result + + Returns: + True if this result is more informative + + Raises: + NotImplementedError: Must be implemented by subclasses + """ + raise NotImplementedError() + + def get_additional_attributes(self): + """ + Get list of detected additional features (primer, linker, etc.). + + Returns: + List of detected feature names + + Raises: + NotImplementedError: Must be implemented by subclasses + """ + return [] + + def set_strand(self, strand: str) -> None: + """Set the detected strand.""" + self.strand = strand + + def __str__(self) -> str: + res_str = "%s\t%s" % (self.read_id, self.strand) + for el in self.molecule_structure: + if el.element_type == ElementType.cDNA: continue + if el.element_type == ElementType.PolyT: + if ElementType.PolyT.name not in self.detected_results: + res_str += "\t-1\t-1" + continue + detected_element = self.detected_results[ElementType.PolyT.name] + res_str += "\t%d\t%d" % (detected_element.start, detected_element.end) + elif el.element_type.needs_only_coordinates(): + if el.element_name not in self.detected_results: + res_str += "\t-1\t-1" + continue + detected_element = self.detected_results[el.element_name] + res_str += "\t%d\t%d" % (detected_element.start, detected_element.end) + elif el.element_type.needs_sequence_extraction() or el.element_type.needs_correction(): + if el.element_name not in self.detected_results: + res_str += "\t-1\t-1\t*" + continue + detected_element = self.detected_results[el.element_name] + res_str += "\t%d\t%d\t%s" % (detected_element.start, detected_element.end, detected_element.seq) + return res_str + + def header(self) -> str: + """Get TSV header for result output.""" + return self.molecule_structure.header() + + # TODO: add simple serialization + + +class ReadStats: + def __init__(self): + self.read_count = 0 + self.pattern_counts = defaultdict(int) + + def add_read(self, barcode_detection_result): + self.read_count += 1 + for el in barcode_detection_result.detected_results: + if barcode_detection_result.detected_results[el].start != -1: + self.pattern_counts[el] += 1 + + def __str__(self): + human_readable_str = ("Total reads:\t%d\n" % self.read_count) + for a in self.pattern_counts: + human_readable_str += "%s:\t%d\n" % (a, self.pattern_counts[a]) + return human_readable_str diff --git a/src/barcode_calling/callers/molecule_structure.py b/src/barcode_calling/callers/molecule_structure.py new file mode 100644 index 00000000..0bf29f17 --- /dev/null +++ b/src/barcode_calling/callers/molecule_structure.py @@ -0,0 +1,169 @@ +########################################################################### +# Copyright (c) 2026 University of Helsinki +# # All Rights Reserved +# See file LICENSE for details. +############################################################################ + +import logging +import sys +from enum import unique, Enum +from typing import Iterator + +from ...error_codes import IsoQuantExitCode +from ..common import load_barcodes + +logger = logging.getLogger('IsoQuant') + + +@unique +class ElementType(Enum): + PolyT = 1 + cDNA = 2 + CONST = 10 + VAR_ANY = 20 + VAR_LIST = 21 + VAR_FILE = 22 + VAR_ANY_SEPARATOR = 31 + VAR_ANY_NON_T_SEPARATOR = 32 + + def needs_correction(self): + return self in (ElementType.VAR_FILE, ElementType.VAR_LIST) + + def needs_sequence_extraction(self): + return self == ElementType.VAR_ANY + + def needs_only_coordinates(self): + return self in (ElementType.CONST, ElementType.VAR_ANY_SEPARATOR, ElementType.VAR_ANY_NON_T_SEPARATOR) + + +class MoleculeElement: + def __init__(self, element_name: str, element_type: ElementType, element_value_1 = None, element_value_2 = None): + self.element_name = element_name + self.element_type = element_type + + if self.element_type in [ElementType.PolyT, ElementType.cDNA]: + self.element_value = None + self.element_length = -1 + elif self.element_type == ElementType.CONST: + self.element_value = element_value_1 + self.element_length = len(self.element_value) + elif self.element_type == ElementType.VAR_FILE: + self.element_value = load_barcodes(element_value_1, needs_iterator=False) + if element_value_2 is None: + self.element_length = len(self.element_value[0]) + else: + try: + self.element_length = int(element_value_1) + except ValueError: + logger.critical("Incorrectly specified length for element %s: %s" % (self.element_name, element_value_1)) + sys.exit(IsoQuantExitCode.INVALID_FILE_FORMAT) + elif self.element_type == ElementType.VAR_LIST: + self.element_value = element_value_1.split(',') + if element_value_2 is None: + self.element_length = len(self.element_value[0]) + else: + try: + self.element_length = int(element_value_1) + except ValueError: + logger.critical( + "Incorrectly specified length for element %s: %s" % (self.element_name, element_value_1)) + sys.exit(IsoQuantExitCode.INVALID_FILE_FORMAT) + elif self.element_type in (ElementType.VAR_ANY, ElementType.VAR_ANY_SEPARATOR, ElementType.VAR_ANY_NON_T_SEPARATOR): + self.element_value = None + try: + self.element_length = int(element_value_1) + except ValueError: + logger.critical("Incorrectly specified length for element %s: %s" % (self.element_name, element_value_1)) + sys.exit(IsoQuantExitCode.INVALID_FILE_FORMAT) + else: + logger.critical("Wrong element type %s" % element_type) + sys.exit(IsoQuantExitCode.INVALID_FILE_FORMAT) + + def is_variable(self): + return self.element_type in {ElementType.VAR_ANY, ElementType.VAR_LIST, ElementType.VAR_FILE} + + +class MoleculeStructure: + CONCAT_DELIM = "|" + DUPL_DELIM = "/" + + def __init__(self, str_iterator): + self.ordered_elements: list[MoleculeElement] = [] + self.elements_to_concatenate = {} + self.duplicated_elements = {} + + l = next(str_iterator) + elements = list(map(lambda x: x.strip(), l.strip().split(':'))) + element_properties = {} + for l in str_iterator: + v = l.strip().split() + element_name = v[0] + if element_name in element_properties: + logger.critical("Duplicated element name %s" % element_name) + sys.exit(IsoQuantExitCode.INVALID_FILE_FORMAT) + + if len(v) == 3: + element_properties[element_name] = (v[1], v[2], None) + elif len(v) == 4: + element_properties[element_name] = (v[1], v[2], v[3]) + else: + logger.critical("Incorrect number of properties in line %s" % l.strip()) + sys.exit(IsoQuantExitCode.INVALID_FILE_FORMAT) + + for el in elements: + if el not in element_properties: + if el not in ElementType.__dict__: + logger.critical("Molecule element %s was not described in the format file" % el) + sys.exit(IsoQuantExitCode.INVALID_FILE_FORMAT) + element_type = ElementType[el] + self.ordered_elements.append(MoleculeElement(el, element_type)) + else: + element_type, element_val1, element_val2 = element_properties[el] + if element_type not in ElementType.__dict__: + logger.critical("Molecule element type %s is not among the possible types" % element_type) + sys.exit(IsoQuantExitCode.INVALID_FILE_FORMAT) + element_type = ElementType[element_type] + self.ordered_elements.append(MoleculeElement(el, element_type, element_val1, element_val2)) + + if self.CONCAT_DELIM in el: + v = el.split(self.CONCAT_DELIM) + if len(v) != 2: + logger.critical("Incorrect concatenated element %s" % el) + sys.exit(IsoQuantExitCode.INVALID_FILE_FORMAT) + self.elements_to_concatenate[el] = v[0] + elif self.DUPL_DELIM in el: + v = el.split(self.DUPL_DELIM) + if len(v) != 2: + logger.critical("Incorrect duplicated element %s" % el) + sys.exit(IsoQuantExitCode.INVALID_FILE_FORMAT) + self.duplicated_elements[el] = v[0] + + @classmethod + def from_element_list(cls, element_list): + ms = cls.__new__(cls) + ms.ordered_elements = element_list + return ms + + def __iter__(self) -> Iterator[MoleculeElement]: + for e in self.ordered_elements: + yield e + + def header(self): + printed_elements = set() + header = "#read_id\tstrand" + for el in self.ordered_elements: + if el.element_type == ElementType.cDNA: continue + if el.element_type == ElementType.PolyT: + header += "\tpolyT_start\tpolyT_end" + elif el.element_type.needs_only_coordinates(): + header += "\t%s_start\t%s_end" % (el.element_name, el.element_name) + elif el.element_type.needs_sequence_extraction() or el.element_type.needs_correction(): + if el.element_name in self.elements_to_concatenate.keys(): + if self.elements_to_concatenate[el.element_name] in printed_elements: + continue + printed_elements.add(self.elements_to_concatenate[el.element_name]) + header += "\t%s_start\t%s_end\t%s_sequence" % (el.element_name, el.element_name, + self.elements_to_concatenate[el.element_name]) + else: + header += "\t%s_start\t%s_end\t%s_sequence" % (el.element_name, el.element_name, el.element_name) + return header diff --git a/src/barcode_calling/callers/stereo.py b/src/barcode_calling/callers/stereo.py index bdcbe8c0..953f96e2 100644 --- a/src/barcode_calling/callers/stereo.py +++ b/src/barcode_calling/callers/stereo.py @@ -203,6 +203,9 @@ def _find_barcode_umi_fwd(self, read_id: str, sequence: str) -> LinkerBarcodeDet def result_type(): return LinkerBarcodeDetectionResult + def header(self): + return self.result_type().header() + class SharedMemoryStereoBarcodeDetector(StereoBarcodeDetector): """Stereo-seq detector with shared memory for large barcode sets.""" @@ -490,6 +493,9 @@ def _find_barcode_umi_fwd(self, read_id: str, sequence: str) -> TSOBarcodeDetect def result_type(): return SplittingBarcodeDetectionResult + def header(self): + return self.result_type().header() + class SharedMemoryStereoSplittingBarcodeDetector(StereoSplittingBarcodeDetector): """Stereo-seq splitting detector with shared memory for large barcode sets.""" diff --git a/src/barcode_calling/callers/tenx.py b/src/barcode_calling/callers/tenx.py index 449b7798..35bbc839 100644 --- a/src/barcode_calling/callers/tenx.py +++ b/src/barcode_calling/callers/tenx.py @@ -175,6 +175,8 @@ def find_barcode_umi_no_polya(self, read_id: str, sequence: str) -> TenXBarcodeD def result_type(): return TenXBarcodeDetectionResult + def header(self): + return self.result_type().header() class VisiumHDBarcodeDetector: """Visium HD spatial transcriptomics barcode detector.""" @@ -344,3 +346,6 @@ def find_barcode_umi_no_polya(self, read_id: str, sequence: str) -> TenXBarcodeD @staticmethod def result_type(): return TenXBarcodeDetectionResult + + def header(self): + return self.result_type().header() \ No newline at end of file diff --git a/src/barcode_calling/callers/universal_extraction.py b/src/barcode_calling/callers/universal_extraction.py new file mode 100644 index 00000000..24911894 --- /dev/null +++ b/src/barcode_calling/callers/universal_extraction.py @@ -0,0 +1,305 @@ +########################################################################### +# Copyright (c) 2026 University of Helsinki +# # All Rights Reserved +# See file LICENSE for details. +############################################################################ + +import logging +import sys + +from ...error_codes import IsoQuantExitCode +from .extraction_result import DetectedElement, ExtractionResult +from ..indexers.base import KmerIndexer +from ..common import find_polyt, reverese_complement, detect_exact_positions +from .molecule_structure import ElementType, MoleculeStructure, MoleculeElement + +logger = logging.getLogger('IsoQuant') + + +class UniversalSingleMoleculeExtractor: + MIN_SCORE_COEFF = 0.75 + MIN_SCORE_COEFF_TERMMINAL = 0.5 + TERMINAL_MATCH_DELTA = 3 + MAX_LEN_DIFF = 0.2 + + def __init__(self, molecule_structure: MoleculeStructure): + self.correct_sequences = True + self.molecule_structure = molecule_structure + self.index_dict = {} + self.has_polyt = False + self.has_cdna = False + self.elements_to_detect = set() + self.elements_to_extract = set() + self.elements_to_correct = set() + + for el in self.molecule_structure: + if el.element_type == ElementType.CONST: + self.index_dict[el.element_name] = KmerIndexer([el.element_value], + max(6, int(el.element_length / 2) - 2)) + elif el.element_type == ElementType.PolyT: + if self.has_polyt: + logger.critical( + "Current version only supports a single polyT, extraction results may be suboptimal") + sys.exit(IsoQuantExitCode.INVALID_PARAMETER) + self.has_polyt = True + elif el.element_type == ElementType.cDNA: + if self.has_cdna: + logger.critical( + "Current version only supports a single cDNA, reads will not be split into molecules") + sys.exit(IsoQuantExitCode.INVALID_PARAMETER) + self.has_cdna = True + elif el.element_type.needs_only_coordinates(): + self.elements_to_detect.add(el.element_name) + elif el.element_type.needs_sequence_extraction(): + self.elements_to_extract.add(el.element_name) + elif el.element_type.needs_sequence_extraction(): + if self.correct_sequences: + self.index_dict[el.element_name] = self.prepare_barcode_index(el) + self.elements_to_correct.add(el.element_name) + else: + self.elements_to_extract.add(el.element_name) + + if not self.has_cdna: + logger.critical("Molecule must include a cDNA") + sys.exit(IsoQuantExitCode.INVALID_PARAMETER) + + def prepare_barcode_index(self, element: MoleculeElement): + if element.element_type not in (ElementType.VAR_LIST, ElementType.VAR_FILE): + return None + barcode_list = element.element_value + + # TODO: use optimal indices for long barocde lists + return KmerIndexer(barcode_list, max(6, int(element.element_length / 2) - 2)) + + def result_type(self): + return ExtractionResult + + def header(self): + return self.molecule_structure.header() + + def find_barcode_umi(self, read_id, sequence): + detected_elements_fwd = self._find_patterns_fwd(read_id, sequence) + rev_seq = reverese_complement(sequence) + detected_elements_rev = self._find_patterns_fwd(read_id, rev_seq) + + if self.has_polyt: + fwd_has_polyt = ElementType.PolyT.name in detected_elements_fwd + rev_has_polyt = ElementType.PolyT.name in detected_elements_rev + if fwd_has_polyt and not rev_has_polyt: + return ExtractionResult(self.molecule_structure, read_id, '+', detected_elements_fwd) + if not fwd_has_polyt and rev_has_polyt: + return ExtractionResult(self.molecule_structure, read_id, '-', detected_elements_rev) + + if len(detected_elements_fwd) >= len(detected_elements_rev): + return ExtractionResult(self.molecule_structure, read_id, '+', detected_elements_fwd) + return ExtractionResult(self.molecule_structure, read_id, '-', detected_elements_rev) + + def _find_patterns_fwd(self, read_id, sequence): + logger.debug("== read id %s ==" % read_id) + # TODO: make a list of DetectedElement rather that dict of str->DetectedElement for speed + detected_elements = {} + polyt_start, polyt_end = -1, -1 + if self.has_polyt: + polyt_start, polyt_end = find_polyt(sequence) + detected_elements[ElementType.PolyT.name] = DetectedElement(polyt_start, polyt_end, 0) + logger.debug("PolyT %d" % polyt_start) + + self.detect_const_elements(sequence, polyt_start, polyt_end, detected_elements) + self.extract_variable_elements5(detected_elements, sequence) + self.extract_variable_elements3(detected_elements, sequence) + logger.debug("== end read id %s ==" % read_id) + return detected_elements + + def concatenate_elements(self, detected_results): + pass + + def extract_variable_elements5(self, detected_elements, sequence): + first_detected_const_element = None + for i, el in enumerate(self.molecule_structure): + if el.element_type == ElementType.cDNA: + break + if el.element_type in [ElementType.CONST, ElementType.PolyT] and el.element_name in detected_elements: + first_detected_const_element = i + break + + if first_detected_const_element is None: return + + # extracting elements preceding first detected const element + current_pos = detected_elements[ + self.molecule_structure.ordered_elements[first_detected_const_element].element_name].start - 1 + for i in range(first_detected_const_element - 1, -1, -1): + el: MoleculeElement = self.molecule_structure.ordered_elements[i] + if not el.is_variable(): break + potential_end = current_pos + potential_start = potential_end - el.element_length + 1 + if potential_start < 0: break + detected_elements[el.element_name] = DetectedElement(potential_start, potential_end, 0, + seq=sequence[potential_start:potential_end + 1]) + current_pos = potential_start - 1 + + current_pos = detected_elements[ + self.molecule_structure.ordered_elements[first_detected_const_element].element_name].end + 1 + for i in range(first_detected_const_element + 1, len(self.molecule_structure.ordered_elements)): + if current_pos >= len(sequence): + break + + el = self.molecule_structure.ordered_elements[i] + if el.element_type in [ElementType.cDNA, ElementType.PolyT]: + break + elif el.element_type == ElementType.CONST: + if el.element_name in detected_elements: + current_pos = detected_elements[el.element_name].end + 1 + else: + current_pos += el.element_length + continue + + potential_start = current_pos + if i + 1 == len(self.molecule_structure.ordered_elements): + potential_end = potential_start + el.element_length - 1 + else: + next_el = self.molecule_structure.ordered_elements[i + 1] + if next_el.element_name in detected_elements: + potential_end = detected_elements[next_el.element_name].start - 1 + else: + potential_end = potential_start + el.element_length - 1 + if potential_end >= len(sequence): + potential_end = len(sequence) + 1 + + potential_len = potential_end - potential_start + 1 + if abs(potential_len - el.element_length) <= self.MAX_LEN_DIFF * el.element_length: + detected_elements[el.elemenst_name] = DetectedElement(potential_start, potential_end, 0, + seq=sequence[potential_start:potential_end + 1]) + current_pos = potential_end + 1 + + def extract_variable_elements3(self, detected_elements, sequence): + last_detected_const_element = None + for i in range(len(self.molecule_structure.ordered_elements) - 1, -1, -1): + el = self.molecule_structure.ordered_elements[i] + if el.element_type == ElementType.cDNA: + break + if el.element_type in [ElementType.CONST, ElementType.PolyT] and el.element_name in detected_elements: + last_detected_const_element = i + break + + if last_detected_const_element is None: return + + # extracting elements following last detected const element + current_pos = detected_elements[ + self.molecule_structure.ordered_elements[last_detected_const_element].element_name].end + 1 + for i in range(last_detected_const_element + 1, len(self.molecule_structure.ordered_elements)): + el = self.molecule_structure.ordered_elements[i] + if not el.is_variable(): break + potential_start = current_pos + potential_end = potential_start + el.element_length - 1 + if potential_end >= len(sequence): break + detected_elements[el.element_name] = DetectedElement(potential_start, potential_end, 0, + seq=sequence[potential_start:potential_end + 1]) + current_pos = potential_end + 1 + + current_pos = detected_elements[ + self.molecule_structure.ordered_elements[last_detected_const_element].element_name].start - 1 + for i in range(last_detected_const_element - 1, -1, -1): + if current_pos <= 0: + break + el = self.molecule_structure.ordered_elements[i] + if el.element_type in [ElementType.cDNA, ElementType.PolyT]: + break + elif el.element_type == ElementType.CONST: + if el.element_name in detected_elements: + current_pos = detected_elements[el.element_name].start - 1 + else: + current_pos -= el.element_length + continue + + potential_end = current_pos + if i == 0: + potential_start = potential_end - el.element_length + 1 + else: + prev_el = self.molecule_structure.ordered_elements[i - 1] + if prev_el.element_name in detected_elements: + potential_start = detected_elements[prev_el].end + 1 + else: + potential_start = potential_end - el.element_length + 1 + if potential_start < 0: + potential_start = 0 + + potential_len = potential_end - potential_start + 1 + if abs(potential_len - el.element_length) <= self.MAX_LEN_DIFF * el.element_length: + detected_elements[el.element_name] = DetectedElement(potential_start, potential_end, 0, + seq=sequence[potential_start:potential_end + 1]) + current_pos = potential_start - 1 + + def detect_const_elements(self, sequence, polyt_start, polyt_end, detected_elements): + # searching left of cDNA + first_element_detected = False + current_search_start = 0 + for i, el in enumerate(self.molecule_structure): + if el.element_type in [ElementType.cDNA, ElementType.PolyT]: + break + + if el.element_type != ElementType.CONST: + continue + + search_start = current_search_start + search_end = len(sequence) if polyt_start == -1 else polyt_start + 1 + element_occurrences = self.index_dict[el.element_name].get_occurrences_substr(sequence, search_start, + search_end) + if not first_element_detected: + min_score = int(el.element_length * self.MIN_SCORE_COEFF_TERMMINAL) + start_delta = -1 + else: + min_score = int(el.element_length * self.MIN_SCORE_COEFF) + start_delta = self.TERMINAL_MATCH_DELTA + + element_start, element_end = detect_exact_positions(sequence, + search_start, search_end, + self.index_dict[el.element_name].k, + el.element_value, + element_occurrences, + min_score=min_score, + start_delta=start_delta, + end_delta=self.TERMINAL_MATCH_DELTA) + + if element_start is not None: + current_search_start = element_end + if not first_element_detected: + first_element_detected = True + + detected_elements[el.element_name] = DetectedElement(element_start, element_end) + + last_element_detected = False + current_search_end = len(sequence) + for i in range(len(self.molecule_structure.ordered_elements) - 1, -1, -1): + el = self.molecule_structure.ordered_elements[i] + if el.element_type in [ElementType.cDNA, ElementType.PolyT]: + break + + if el.element_type != ElementType.CONST: + continue + + search_start = 0 if polyt_start == -1 else polyt_end + 1 + search_end = current_search_end + element_occurrences = self.index_dict[el.element_name].get_occurrences_substr(sequence, search_start, + search_end) + + if not last_element_detected: + min_score = int(el.element_length * self.MIN_SCORE_COEFF_TERMMINAL) + end_delta = -1 + else: + min_score = int(el.element_length * self.MIN_SCORE_COEFF) + end_delta = self.TERMINAL_MATCH_DELTA + + element_start, element_end = detect_exact_positions(sequence, + search_start, search_end, + self.index_dict[el.element_name].k, + el.element_value, + element_occurrences, + min_score=min_score, + start_delta=self.TERMINAL_MATCH_DELTA, + end_delta=end_delta) + if element_start is not None: + current_search_end = element_end + if not last_element_detected: + last_element_detected = True + + detected_elements[el.element_name] = DetectedElement(element_start, element_end) diff --git a/src/barcode_calling/common.py b/src/barcode_calling/common.py index f1b88d7f..d2b2fbe4 100644 --- a/src/barcode_calling/common.py +++ b/src/barcode_calling/common.py @@ -4,11 +4,16 @@ # See file LICENSE for details. ############################################################################ import copy +import os +import sys import logging - +import gzip import numpy as np from ssw import AlignmentMgr +from ..error_codes import IsoQuantExitCode + + logger = logging.getLogger('IsoQuant') # Try to import numba for JIT compilation of barcode conversion @@ -56,6 +61,48 @@ def find_polyt_start(seq, window_size = 16, polya_fraction = 0.75): return i + max(0, seq[i:].find('TTTT')) +def find_polyt(seq, window_size = 16, polya_fraction = 0.75): + polyA_count = int(window_size * polya_fraction) + + if len(seq) < window_size: + return -1, -1 + i = 0 + a_count = seq[0:window_size].count('T') + while i < len(seq) - window_size: + if a_count >= polyA_count: + break + first_base_a = seq[i] == 'T' + new_base_a = i + window_size < len(seq) and seq[i + window_size] == 'T' + if first_base_a and not new_base_a: + a_count -= 1 + elif not first_base_a and new_base_a: + a_count += 1 + i += 1 + + if i >= len(seq) - window_size: + return -1, -1 + polyt_start = i + max(0, seq[i:].find('TTTT')) + + while i < len(seq) - window_size: + if a_count < polyA_count: + break + first_base_a = seq[i] == 'T' + new_base_a = i + window_size < len(seq) and seq[i + window_size] == 'T' + if first_base_a and not new_base_a: + a_count -= 1 + elif not first_base_a and new_base_a: + a_count += 1 + i += 1 + + if i >= len(seq) - window_size: + polyt_end = len(seq) - 1 + else: + last_t_pos = seq[i:i + window_size].rfind('TTTT') + polyt_end = i + window_size - 1 if last_t_pos == -1 else i + last_t_pos + 4 + + return polyt_start, polyt_end + + base_comp = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N', " ": " "} @@ -369,3 +416,36 @@ def bit_to_str(seq, seq_len): for i in range(seq_len): str_seq += BIN2NUCL[(seq >> ((seq_len - i - 1) * 2)) & 3] return str_seq + + + +def load_h5_barcodes_bit(h5_file_path, dataset_name='bpMatrix_1'): + raise NotImplementedError() + import h5py + barcode_list = [] + with h5py.File(h5_file_path, 'r') as h5_file: + dataset = numpy.array(h5_file[dataset_name]) + for row in dataset: + for col in row: + barcode_list.append(bit_to_str(int(col[0]))) + return barcode_list + + +def load_barcodes(inf, needs_iterator=False): + if not os.path.isfile(inf): + logger.critical("Barcode file '%s' does not exist.", inf) + sys.exit(IsoQuantExitCode.INPUT_FILE_NOT_FOUND) + + if inf.endswith("h5") or inf.endswith("hdf5"): + return load_h5_barcodes_bit(inf) + + if inf.endswith("gz") or inf.endswith("gzip"): + handle = gzip.open(inf, "rt") + else: + handle = open(inf, "r") + + barcode_iterator = iter(l.strip().split()[0] for l in handle) + if needs_iterator: + return barcode_iterator + + return list(barcode_iterator) diff --git a/src/barcode_calling/indexers/base.py b/src/barcode_calling/indexers/base.py index 1f190f7b..32a01f6f 100644 --- a/src/barcode_calling/indexers/base.py +++ b/src/barcode_calling/indexers/base.py @@ -108,6 +108,42 @@ def get_occurrences(self, sequence: str, max_hits: int = 0, min_kmers: int = 1, return result return result[:max_hits] + def _get_kmers_substr(self, seq, start, end): + if end - start + 1 < self.k: + return + kmer = seq[start:start + self.k] + yield kmer + for i in range(start + self.k, min(len(seq), end + 1)): + kmer = kmer[1:] + seq[i] + yield kmer + + # [start, end] + def get_occurrences_substr(self, sequence, start, end, max_hits=0, min_kmers=1, hits_delta=1): + barcode_counts = defaultdict(int) + barcode_positions = defaultdict(list) + + for pos, kmer in enumerate(self._get_kmers_substr(sequence, start, end)): + for i in self.index[kmer]: + barcode_counts[i] += 1 + barcode_positions[i].append(pos) + + result = [] + for i in barcode_counts.keys(): + count = barcode_counts[i] + if count < min_kmers: + continue + result.append((self.seq_list[i], count, barcode_positions[i])) + + if not result: + return {} + + top_hits = max(result, key=lambda x: x[1])[1] + result = filter(lambda x: x[1] >= top_hits - hits_delta, result) + result = sorted(result, reverse=True, key=lambda x: x[1]) + + if max_hits == 0: + return {x[0]:x for x in result} + return {x[0]:x for x in list(result)[:max_hits]} class ArrayKmerIndexer: """ diff --git a/src/modes.py b/src/modes.py index 632667f7..ff83ce83 100644 --- a/src/modes.py +++ b/src/modes.py @@ -16,6 +16,7 @@ class IsoQuantMode(Enum): stereoseq = 5 visium_hd = 6 visium_5prime = 7 + custom_sc = 10 def needs_barcode_calling(self): return self in [IsoQuantMode.tenX_v3, @@ -23,7 +24,8 @@ def needs_barcode_calling(self): IsoQuantMode.stereoseq_nosplit, IsoQuantMode.stereoseq, IsoQuantMode.visium_hd, - IsoQuantMode.visium_5prime] + IsoQuantMode.visium_5prime, + IsoQuantMode.custom_sc] def needs_pcr_deduplication(self): return self in [IsoQuantMode.tenX_v3, @@ -31,7 +33,8 @@ def needs_pcr_deduplication(self): IsoQuantMode.stereoseq_nosplit, IsoQuantMode.stereoseq, IsoQuantMode.visium_hd, - IsoQuantMode.visium_5prime] + IsoQuantMode.visium_5prime, + IsoQuantMode.custom_sc] def produces_new_fasta(self): return self in [IsoQuantMode.stereoseq] From c58d46c94faeffcf9528de8745560be4c291cd00 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Wed, 28 Jan 2026 15:59:19 +0200 Subject: [PATCH 02/41] unify barcode calling result classes --- src/barcode_calling/callers/__init__.py | 21 ++- src/barcode_calling/callers/base.py | 145 +++++++++++---- .../callers/extraction_result.py | 141 ++++++++++++--- src/barcode_calling/callers/protocol.py | 168 ++++++++++++++++++ 4 files changed, 418 insertions(+), 57 deletions(-) create mode 100644 src/barcode_calling/callers/protocol.py diff --git a/src/barcode_calling/callers/__init__.py b/src/barcode_calling/callers/__init__.py index 76ff7930..512f1045 100644 --- a/src/barcode_calling/callers/__init__.py +++ b/src/barcode_calling/callers/__init__.py @@ -7,12 +7,16 @@ """ Barcode detection classes for single-cell and spatial transcriptomics. +Protocol: + BarcodeResult: Protocol interface for all detection results + Classes: BarcodeDetectionResult: Base result class - CurioBarcodeDetectionResult: Result for Curio (double barcode) platform - StereoBarcodeDetectionResult: Result for Stereo-seq platform + LinkerBarcodeDetectionResult: Result for platforms with linker sequences + TSOBarcodeDetectionResult: Result for Stereo-seq with TSO detection TenXBarcodeDetectionResult: Result for 10x Genomics platform SplittingBarcodeDetectionResult: Result for read splitting modes + ExtractionResult: Dict-based result for universal molecule extraction ReadStats: Statistics tracker for barcode detection Detectors: @@ -25,8 +29,11 @@ SharedMemoryWrapper: Generic shared memory wrapper TenXBarcodeDetector: 10x Genomics v3 detector VisiumHDBarcodeDetector: Visium HD detector + UniversalSingleMoleculeExtractor: Universal barcode detector for custom molecules """ +from .protocol import BarcodeResult + from .base import ( BarcodeDetectionResult, LinkerBarcodeDetectionResult, @@ -37,6 +44,11 @@ increase_if_valid, ) +from .extraction_result import ( + ExtractionResult, + DetectedElement, +) + from .curio import ( CurioBarcodeDetector, CurioIlluminaDetector, @@ -60,12 +72,16 @@ from .universal_extraction import UniversalSingleMoleculeExtractor __all__ = [ + # Protocol + 'BarcodeResult', # Result classes 'BarcodeDetectionResult', 'LinkerBarcodeDetectionResult', 'TSOBarcodeDetectionResult', 'TenXBarcodeDetectionResult', 'SplittingBarcodeDetectionResult', + 'ExtractionResult', + 'DetectedElement', 'ReadStats', 'increase_if_valid', # Curio detectors @@ -80,6 +96,7 @@ # 10x detectors 'TenXBarcodeDetector', 'VisiumHDBarcodeDetector', + # Universal extraction 'MoleculeStructure', 'UniversalSingleMoleculeExtractor', ] diff --git a/src/barcode_calling/callers/base.py b/src/barcode_calling/callers/base.py index 0f86ca5d..67ceb28c 100644 --- a/src/barcode_calling/callers/base.py +++ b/src/barcode_calling/callers/base.py @@ -38,6 +38,7 @@ class BarcodeDetectionResult: Base class for barcode detection results. Stores detected barcode, UMI, and quality scores for a single read. + Implements the BarcodeResult protocol. """ NOSEQ = "*" # Sentinel for missing/undetected sequence @@ -56,19 +57,43 @@ def __init__(self, read_id: str, barcode: str = NOSEQ, UMI: str = NOSEQ, strand: Detected strand ('+', '-', or '.') """ self.read_id: str = read_id - self.barcode: str = barcode - if not self.barcode: - self.barcode = BarcodeDetectionResult.NOSEQ - self.UMI: str = UMI - if not self.UMI: - self.UMI = BarcodeDetectionResult.NOSEQ + self._barcode: str = barcode if barcode else BarcodeDetectionResult.NOSEQ + self._umi: str = UMI if UMI else BarcodeDetectionResult.NOSEQ self.BC_score: int = BC_score self.UMI_good: bool = UMI_good self.strand: str = strand + # Primary getters (preferred interface) + def get_barcode(self) -> str: + """Get the detected barcode sequence.""" + return self._barcode + + def get_umi(self) -> str: + """Get the detected UMI sequence.""" + return self._umi + + # Backward-compatible property accessors + @property + def barcode(self) -> str: + """Barcode sequence. Prefer get_barcode() for new code.""" + return self._barcode + + @barcode.setter + def barcode(self, value: str) -> None: + self._barcode = value if value else BarcodeDetectionResult.NOSEQ + + @property + def UMI(self) -> str: + """UMI sequence. Prefer get_umi() for new code.""" + return self._umi + + @UMI.setter + def UMI(self, value: str) -> None: + self._umi = value if value else BarcodeDetectionResult.NOSEQ + def is_valid(self) -> bool: """Check if a valid barcode was detected.""" - return self.barcode != BarcodeDetectionResult.NOSEQ + return self._barcode != BarcodeDetectionResult.NOSEQ def update_coordinates(self, delta: int) -> None: """ @@ -114,12 +139,16 @@ def set_strand(self, strand: str) -> None: def __str__(self) -> str: """Format result as TSV line.""" - return "%s\t%s\t%s\t%d\t%s\t%s" % (self.read_id, self.barcode, self.UMI, + return "%s\t%s\t%s\t%d\t%s\t%s" % (self.read_id, self._barcode, self._umi, self.BC_score, self.UMI_good, self.strand) - @staticmethod - def header() -> str: + def header(self) -> str: """Get TSV header for result output.""" + return BarcodeDetectionResult._header() + + @staticmethod + def _header() -> str: + """Static header for class-level access.""" return "#read_id\tbarcode\tUMI\tBC_score\tvalid_UMI\tstrand" @@ -156,9 +185,6 @@ def __init__(self, read_id: str, barcode: str = BarcodeDetectionResult.NOSEQ, self.linker_end: int = linker_end self.polyT: int = polyT - def is_valid(self) -> bool: - return self.barcode != BarcodeDetectionResult.NOSEQ - def update_coordinates(self, delta: int) -> None: self.primer = increase_if_valid(self.primer, delta) self.linker_start = increase_if_valid(self.linker_start, delta) @@ -184,16 +210,18 @@ def get_additional_attributes(self) -> List[str]: attr.append("Linker detected") return attr - def set_strand(self, strand: str) -> None: - self.strand = strand - def __str__(self) -> str: return (BarcodeDetectionResult.__str__(self) + "\t%d\t%d\t%d\t%d" % (self.polyT, self.primer, self.linker_start, self.linker_end)) + def header(self) -> str: + """Get TSV header for result output.""" + return LinkerBarcodeDetectionResult._header() + @staticmethod - def header() -> str: - return BarcodeDetectionResult.header() + "\tpolyT_start\tprimer_end\tlinker_start\tlinker_end" + def _header() -> str: + """Static header for class-level access.""" + return BarcodeDetectionResult._header() + "\tpolyT_start\tprimer_end\tlinker_start\tlinker_end" class TSOBarcodeDetectionResult(LinkerBarcodeDetectionResult): @@ -228,9 +256,14 @@ def get_additional_attributes(self) -> List[str]: attr.append("TSO detected") return attr + def header(self) -> str: + """Get TSV header for result output.""" + return TSOBarcodeDetectionResult._header() + @staticmethod - def header() -> str: - return LinkerBarcodeDetectionResult.header() + "\tTSO5" + def _header() -> str: + """Static header for class-level access.""" + return LinkerBarcodeDetectionResult._header() + "\tTSO5" class TenXBarcodeDetectionResult(BarcodeDetectionResult): @@ -244,9 +277,6 @@ def __init__(self, read_id: str, barcode: str = BarcodeDetectionResult.NOSEQ, self.r1: int = r1 self.polyT: int = polyT - def is_valid(self) -> bool: - return self.barcode != BarcodeDetectionResult.NOSEQ - def update_coordinates(self, delta: int) -> None: self.r1 = increase_if_valid(self.r1, delta) self.polyT = increase_if_valid(self.polyT, delta) @@ -266,23 +296,28 @@ def get_additional_attributes(self) -> List[str]: attr.append("R1 detected") return attr - def set_strand(self, strand: str) -> None: - self.strand = strand - def __str__(self) -> str: return (BarcodeDetectionResult.__str__(self) + "\t%d\t%d" % (self.polyT, self.r1)) + def header(self) -> str: + """Get TSV header for result output.""" + return TenXBarcodeDetectionResult._header() + @staticmethod - def header() -> str: - return BarcodeDetectionResult.header() + "\tpolyT_start\tR1_end" + def _header() -> str: + """Static header for class-level access.""" + return BarcodeDetectionResult._header() + "\tpolyT_start\tR1_end" class SplittingBarcodeDetectionResult: """Result container for read splitting modes (multiple barcodes per read).""" + NOSEQ = BarcodeDetectionResult.NOSEQ # For consistency with protocol + def __init__(self, read_id: str): self.read_id: str = read_id + self.strand: str = "." # For protocol compatibility self.detected_patterns: List[TSOBarcodeDetectionResult] = [] def append(self, barcode_detection_result: TSOBarcodeDetectionResult) -> None: @@ -296,7 +331,7 @@ def filter(self) -> None: return barcoded_results = [] for r in self.detected_patterns: - if r.barcode != BarcodeDetectionResult.NOSEQ: + if r.is_valid(): barcoded_results.append(r) if not barcoded_results: @@ -304,9 +339,55 @@ def filter(self) -> None: else: self.detected_patterns = barcoded_results - @staticmethod - def header() -> str: - return TSOBarcodeDetectionResult.header() + def get_barcode(self) -> str: + """Get barcode from first valid pattern, or NOSEQ if none.""" + for r in self.detected_patterns: + if r.is_valid(): + return r.get_barcode() + return self.NOSEQ + + def get_umi(self) -> str: + """Get UMI from first valid pattern, or NOSEQ if none.""" + for r in self.detected_patterns: + if r.is_valid(): + return r.get_umi() + return self.NOSEQ + + def is_valid(self) -> bool: + """Check if any pattern has a valid barcode.""" + return any(r.is_valid() for r in self.detected_patterns) + + def set_strand(self, strand: str) -> None: + """Set strand for all patterns.""" + self.strand = strand + for r in self.detected_patterns: + r.set_strand(strand) + + def update_coordinates(self, delta: int) -> None: + """Update coordinates for all patterns.""" + for r in self.detected_patterns: + r.update_coordinates(delta) + + def more_informative_than(self, other: 'SplittingBarcodeDetectionResult') -> bool: + """Compare by number of valid patterns.""" + self_valid = sum(1 for r in self.detected_patterns if r.is_valid()) + other_valid = sum(1 for r in other.detected_patterns if r.is_valid()) + return self_valid > other_valid + + def get_additional_attributes(self) -> List[str]: + """Get combined attributes from all patterns.""" + attrs = set() + for r in self.detected_patterns: + attrs.update(r.get_additional_attributes()) + return list(attrs) + + def __str__(self) -> str: + """Format all patterns as TSV lines.""" + return "\n".join(str(r) for r in self.detected_patterns) + + def header(self) -> str: + """Get TSV header for result output.""" + return TSOBarcodeDetectionResult._header() class ReadStats: diff --git a/src/barcode_calling/callers/extraction_result.py b/src/barcode_calling/callers/extraction_result.py index 8df54072..9bb44098 100644 --- a/src/barcode_calling/callers/extraction_result.py +++ b/src/barcode_calling/callers/extraction_result.py @@ -1,18 +1,30 @@ -########################################################################### -# Copyright (c) 2026 University of Helsinki +############################################################################ +# Copyright (c) 2023-2026 University of Helsinki # # All Rights Reserved # See file LICENSE for details. ############################################################################ +""" +Dict-based flexible detection result for universal molecule extraction. + +Unlike the attribute-based result classes in base.py, ExtractionResult stores +detected elements in a dictionary, allowing flexible molecule structures. +Implements the BarcodeResult protocol. +""" + import logging from collections import defaultdict +from typing import List + from .molecule_structure import MoleculeStructure, ElementType logger = logging.getLogger('IsoQuant') class DetectedElement: - def __init__(self, start=-1, end=-1, score=-1, seq=None): + """Container for a single detected element's coordinates and sequence.""" + + def __init__(self, start: int = -1, end: int = -1, score: int = -1, seq: str = None): self.start = start self.end = end self.score = score @@ -20,15 +32,89 @@ def __init__(self, start=-1, end=-1, score=-1, seq=None): class ExtractionResult: + """ + Dict-based flexible detection result for universal molecule extraction. + + Unlike the attribute-based result classes (BarcodeDetectionResult, etc.), + ExtractionResult stores detected elements in a dictionary, allowing + flexible molecule structures defined by MoleculeStructure. + + Implements the BarcodeResult protocol. + + Barcode/UMI elements are identified by naming convention: + - Elements with prefix "barcode" (case-insensitive) are barcodes + - Elements with prefix "umi" (case-insensitive) are UMIs + """ + + NOSEQ = "*" # Sentinel for missing/undetected sequence (matches BarcodeDetectionResult.NOSEQ) + def __init__(self, molecule_structure: MoleculeStructure, read_id: str, strand: str, detected_results: dict): + """ + Initialize extraction result. + + Args: + molecule_structure: Molecule structure definition + read_id: Read identifier + strand: Detected strand ('+', '-', or '.') + detected_results: Dict mapping element names to DetectedElement objects + """ self.molecule_structure = molecule_structure self.read_id = read_id self.strand = strand self.detected_results = detected_results + # Cache barcode/UMI element names for efficiency + self._barcode_elements: List[str] = [] + self._umi_elements: List[str] = [] + self._identify_barcode_umi_elements() + + def _identify_barcode_umi_elements(self) -> None: + """Identify barcode/UMI elements by naming convention.""" + for el in self.molecule_structure: + name_lower = el.element_name.lower() + if name_lower.startswith("barcode"): + self._barcode_elements.append(el.element_name) + elif name_lower.startswith("umi"): + self._umi_elements.append(el.element_name) + + def get_barcode(self) -> str: + """ + Get the detected barcode sequence. + + For molecule structures with multiple barcode elements (e.g., barcode1, barcode2), + returns concatenated barcodes using '|' delimiter. + + Returns: + str: Concatenated barcode sequence or NOSEQ if not detected. + """ + parts = [] + for el_name in self._barcode_elements: + if el_name in self.detected_results: + detected = self.detected_results[el_name] + if detected.seq and detected.seq != self.NOSEQ: + parts.append(detected.seq) + return "|".join(parts) if parts else self.NOSEQ + + def get_umi(self) -> str: + """ + Get the detected UMI sequence. + + For molecule structures with multiple UMI elements, + returns concatenated UMIs using '|' delimiter. + + Returns: + str: Concatenated UMI sequence or NOSEQ if not detected. + """ + parts = [] + for el_name in self._umi_elements: + if el_name in self.detected_results: + detected = self.detected_results[el_name] + if detected.seq and detected.seq != self.NOSEQ: + parts.append(detected.seq) + return "|".join(parts) if parts else self.NOSEQ def is_valid(self) -> bool: """Check if a valid barcode was detected.""" - return True + return self.get_barcode() != self.NOSEQ def update_coordinates(self, delta: int) -> None: """ @@ -39,43 +125,51 @@ def update_coordinates(self, delta: int) -> None: Args: delta: Amount to shift coordinates """ - pass + for detected in self.detected_results.values(): + if detected.start != -1: + detected.start += delta + if detected.end != -1: + detected.end += delta - def more_informative_than(self, that) -> bool: + def more_informative_than(self, other: 'ExtractionResult') -> bool: """ Compare two results to determine which is more informative. + Compares by number of detected elements, then total score. + Args: - that: Another detection result + other: Another detection result Returns: - True if this result is more informative - - Raises: - NotImplementedError: Must be implemented by subclasses + True if this result is more informative than other """ - raise NotImplementedError() - - def get_additional_attributes(self): + self_count = sum(1 for d in self.detected_results.values() if d.start != -1) + other_count = sum(1 for d in other.detected_results.values() if d.start != -1) + if self_count != other_count: + return self_count > other_count + self_score = sum(d.score for d in self.detected_results.values() if d.score > 0) + other_score = sum(d.score for d in other.detected_results.values() if d.score > 0) + return self_score > other_score + + def get_additional_attributes(self) -> List[str]: """ - Get list of detected additional features (primer, linker, etc.). + Get list of detected additional features. Returns: - List of detected feature names - - Raises: - NotImplementedError: Must be implemented by subclasses + List of detected element names with " detected" suffix. """ - return [] + return ["%s detected" % el_name for el_name, d in self.detected_results.items() if d.start != -1] def set_strand(self, strand: str) -> None: """Set the detected strand.""" self.strand = strand def __str__(self) -> str: + """Format result as TSV line.""" res_str = "%s\t%s" % (self.read_id, self.strand) for el in self.molecule_structure: - if el.element_type == ElementType.cDNA: continue + if el.element_type == ElementType.cDNA: + continue if el.element_type == ElementType.PolyT: if ElementType.PolyT.name not in self.detected_results: res_str += "\t-1\t-1" @@ -90,10 +184,11 @@ def __str__(self) -> str: res_str += "\t%d\t%d" % (detected_element.start, detected_element.end) elif el.element_type.needs_sequence_extraction() or el.element_type.needs_correction(): if el.element_name not in self.detected_results: - res_str += "\t-1\t-1\t*" + res_str += "\t-1\t-1\t%s" % self.NOSEQ continue detected_element = self.detected_results[el.element_name] - res_str += "\t%d\t%d\t%s" % (detected_element.start, detected_element.end, detected_element.seq) + seq = detected_element.seq if detected_element.seq else self.NOSEQ + res_str += "\t%d\t%d\t%s" % (detected_element.start, detected_element.end, seq) return res_str def header(self) -> str: diff --git a/src/barcode_calling/callers/protocol.py b/src/barcode_calling/callers/protocol.py new file mode 100644 index 00000000..cbdd96de --- /dev/null +++ b/src/barcode_calling/callers/protocol.py @@ -0,0 +1,168 @@ +############################################################################ +# Copyright (c) 2023-2026 University of Helsinki +# # All Rights Reserved +# See file LICENSE for details. +############################################################################ + +""" +Protocol definition for barcode detection results. + +This module defines the BarcodeResult protocol that all barcode detection +result classes implement. Using Protocol (structural typing) allows +existing classes to automatically conform without explicit inheritance. +""" + +from typing import Protocol, List, ClassVar, runtime_checkable + + +@runtime_checkable +class BarcodeResult(Protocol): + """ + Protocol defining the interface for barcode detection results. + + All barcode detection result classes (BarcodeDetectionResult, + LinkerBarcodeDetectionResult, TSOBarcodeDetectionResult, + TenXBarcodeDetectionResult, ExtractionResult) implement this interface. + + Using Protocol (PEP 544) enables structural subtyping - classes + don't need to explicitly inherit from this protocol to be + considered compatible; they just need to implement the required + methods and attributes. + + Attributes: + NOSEQ: Class variable sentinel for missing/undetected sequence ("*"). + read_id: The unique identifier of the read being analyzed. + strand: The detected strand orientation ('+', '-', or '.'). + + Example: + def process_result(result: BarcodeResult) -> None: + if result.is_valid(): + barcode = result.get_barcode() + umi = result.get_umi() + print(f"{result.read_id}: {barcode}/{umi}") + """ + + NOSEQ: ClassVar[str] + read_id: str + strand: str + + def get_barcode(self) -> str: + """ + Get the detected barcode sequence. + + Returns the barcode sequence if detected, or NOSEQ ("*") if no + barcode was found. For platforms with multiple barcode segments + (e.g., Visium HD with part1|part2), returns the concatenated + barcode with "|" delimiter. + + Returns: + str: Detected barcode sequence or NOSEQ if not found. + """ + ... + + def get_umi(self) -> str: + """ + Get the detected UMI (Unique Molecular Identifier) sequence. + + Returns the UMI sequence if detected, or NOSEQ ("*") if no + UMI was found. For platforms with multiple UMI elements, + returns concatenated UMIs with "|" delimiter. + + Returns: + str: Detected UMI sequence or NOSEQ if not found. + """ + ... + + def is_valid(self) -> bool: + """ + Check if a valid barcode was detected. + + A result is considered valid if get_barcode() returns something + other than NOSEQ. This is the primary method to check if barcode + detection was successful. + + Returns: + bool: True if a valid barcode was detected, False otherwise. + """ + ... + + def set_strand(self, strand: str) -> None: + """ + Set the detected strand orientation. + + Args: + strand: Strand orientation. Should be '+' for forward, + '-' for reverse, or '.' for unknown/unstranded. + """ + ... + + def update_coordinates(self, delta: int) -> None: + """ + Shift all genomic coordinates by a delta value. + + Used when processing read subsequences (e.g., after splitting + a concatenated read). Implementations should update all position + attributes (polyT, primer, linker_start, linker_end, etc.) + by adding delta. Coordinates that are -1 (invalid/not detected) + should remain -1. + + Args: + delta: Amount to add to all coordinate values. + """ + ... + + def more_informative_than(self, other: 'BarcodeResult') -> bool: + """ + Compare two results to determine which is more informative. + + Used when both forward and reverse complement orientations + produce results, to select the better one. The comparison + criteria vary by platform but typically consider: + - Barcode alignment score + - Presence of additional features (polyT, primer, linker) + - Feature positions + + Args: + other: Another detection result to compare against. + + Returns: + bool: True if this result is more informative than `other`. + """ + ... + + def get_additional_attributes(self) -> List[str]: + """ + Get list of detected additional features. + + Returns human-readable strings describing which optional + features were detected (e.g., "PolyT detected", "Primer detected", + "Linker detected", "TSO detected", "R1 detected"). + + Used for statistics collection and quality reporting. + + Returns: + List[str]: Names of detected features. Empty list if none. + """ + ... + + def __str__(self) -> str: + """ + Format result as a tab-separated string for TSV output. + + The format should match what header() returns. + + Returns: + str: Tab-separated values for TSV output. + """ + ... + + def header(self) -> str: + """ + Get TSV header line for result output. + + Returns the column names matching the fields output by __str__(). + + Returns: + str: Tab-separated column names with leading '#'. + """ + ... From ca5032e87710668d11526256c0e313e5175a86dc Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Wed, 28 Jan 2026 17:17:26 +0200 Subject: [PATCH 03/41] minor refactoring --- detect_barcodes.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/detect_barcodes.py b/detect_barcodes.py index 81597666..b3ee45d5 100755 --- a/detect_barcodes.py +++ b/detect_barcodes.py @@ -263,7 +263,9 @@ def process_chunk(barcode_detector, read_chunk, output_file, num, out_fasta=None return output_file, out_fasta, counter -def prepare_barcodes(args): +def create_barcode_caller(args): + logger.info("Creating barcode detector for mode %s" % args.mode.name + (", min_score=%d" % args.min_score if args.min_score else "")) + if args.mode == IsoQuantMode.custom_sc: if not args.molecule: logger.critical("Custom single-cell/spatial mode requires molecule description to be provided via --molecule") @@ -271,8 +273,10 @@ def prepare_barcodes(args): if not os.path.isfile(args.molecule): logger.critical("Molecule file %s does not exist" % args.molecule) sys.exit(IsoQuantExitCode.INPUT_FILE_NOT_FOUND) + if args.min_score: + logger.warning("Custom single-cell/spatial mode does not support --min-score, ignoring") - return MoleculeStructure(open(args.molecule)) + return BARCODE_CALLING_MODES[args.mode](MoleculeStructure(open(args.molecule))) logger.info("Using barcodes from %s" % ", ".join(args.barcodes)) barcode_files = len(args.barcodes) @@ -294,16 +298,17 @@ def prepare_barcodes(args): for i, bc in enumerate(barcodes): logger.info("Loaded %d barcodes from %s" % (len(bc), args.barcodes[i])) barcodes = tuple(barcodes) - return barcodes - -def process_single_thread(args): - logger.info("Preparing barcodes indices") - barcodes = prepare_barcodes(args) barcode_detector = BARCODE_CALLING_MODES[args.mode](barcodes) if args.min_score: barcode_detector.min_score = args.min_score + return barcode_detector + + +def process_single_thread(args): + barcode_detector = create_barcode_caller(args) + # args.input, args.output_tsv are always lists # args.out_fasta is a list or None out_fastas = args.out_fasta if args.out_fasta else [None] * len(args.input) @@ -443,7 +448,7 @@ def process_in_parallel(args): out_fastas = args.out_fasta if args.out_fasta else [None] * len(args.input) # Prepare barcodes once for all files - barcodes = prepare_barcodes(args) + barcodes = create_barcode_caller(args) for idx, (input_file, output_tsv, out_fasta) in enumerate(zip(args.input, args.output_tsv, out_fastas)): if len(args.input) > 1: From 60f3a8089ad62a757c23e2e06a823603ce3224ba Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Wed, 28 Jan 2026 18:47:00 +0200 Subject: [PATCH 04/41] add sequence correction --- .../callers/extraction_result.py | 2 +- .../callers/molecule_structure.py | 15 +- .../callers/universal_extraction.py | 168 ++++++++++++++---- 3 files changed, 143 insertions(+), 42 deletions(-) diff --git a/src/barcode_calling/callers/extraction_result.py b/src/barcode_calling/callers/extraction_result.py index 9bb44098..28b4e642 100644 --- a/src/barcode_calling/callers/extraction_result.py +++ b/src/barcode_calling/callers/extraction_result.py @@ -176,7 +176,7 @@ def __str__(self) -> str: continue detected_element = self.detected_results[ElementType.PolyT.name] res_str += "\t%d\t%d" % (detected_element.start, detected_element.end) - elif el.element_type.needs_only_coordinates(): + elif el.element_type.is_constant(): if el.element_name not in self.detected_results: res_str += "\t-1\t-1" continue diff --git a/src/barcode_calling/callers/molecule_structure.py b/src/barcode_calling/callers/molecule_structure.py index 0bf29f17..94c26db0 100644 --- a/src/barcode_calling/callers/molecule_structure.py +++ b/src/barcode_calling/callers/molecule_structure.py @@ -32,8 +32,14 @@ def needs_correction(self): def needs_sequence_extraction(self): return self == ElementType.VAR_ANY - def needs_only_coordinates(self): - return self in (ElementType.CONST, ElementType.VAR_ANY_SEPARATOR, ElementType.VAR_ANY_NON_T_SEPARATOR) + def is_base_separator(self): + return self in (ElementType.VAR_ANY_SEPARATOR, ElementType.VAR_ANY_NON_T_SEPARATOR) + + def is_variable(self): + return self in (ElementType.VAR_ANY, ElementType.VAR_LIST, ElementType.VAR_FILE) + + def is_constant(self): + return self == ElementType.CONST class MoleculeElement: @@ -79,9 +85,6 @@ def __init__(self, element_name: str, element_type: ElementType, element_value_1 logger.critical("Wrong element type %s" % element_type) sys.exit(IsoQuantExitCode.INVALID_FILE_FORMAT) - def is_variable(self): - return self.element_type in {ElementType.VAR_ANY, ElementType.VAR_LIST, ElementType.VAR_FILE} - class MoleculeStructure: CONCAT_DELIM = "|" @@ -155,7 +158,7 @@ def header(self): if el.element_type == ElementType.cDNA: continue if el.element_type == ElementType.PolyT: header += "\tpolyT_start\tpolyT_end" - elif el.element_type.needs_only_coordinates(): + elif el.element_type.is_constant(): header += "\t%s_start\t%s_end" % (el.element_name, el.element_name) elif el.element_type.needs_sequence_extraction() or el.element_type.needs_correction(): if el.element_name in self.elements_to_concatenate.keys(): diff --git a/src/barcode_calling/callers/universal_extraction.py b/src/barcode_calling/callers/universal_extraction.py index 24911894..6ef64866 100644 --- a/src/barcode_calling/callers/universal_extraction.py +++ b/src/barcode_calling/callers/universal_extraction.py @@ -10,7 +10,7 @@ from ...error_codes import IsoQuantExitCode from .extraction_result import DetectedElement, ExtractionResult from ..indexers.base import KmerIndexer -from ..common import find_polyt, reverese_complement, detect_exact_positions +from ..common import find_polyt, reverese_complement, detect_exact_positions, find_candidate_with_max_score_ssw from .molecule_structure import ElementType, MoleculeStructure, MoleculeElement logger = logging.getLogger('IsoQuant') @@ -20,23 +20,21 @@ class UniversalSingleMoleculeExtractor: MIN_SCORE_COEFF = 0.75 MIN_SCORE_COEFF_TERMMINAL = 0.5 TERMINAL_MATCH_DELTA = 3 - MAX_LEN_DIFF = 0.2 + MAX_LEN_DIFF = 0.25 def __init__(self, molecule_structure: MoleculeStructure): self.correct_sequences = True self.molecule_structure = molecule_structure self.index_dict = {} + self.min_scores = {} self.has_polyt = False self.has_cdna = False - self.elements_to_detect = set() + self.constant_elements_to_detect = set() self.elements_to_extract = set() self.elements_to_correct = set() for el in self.molecule_structure: - if el.element_type == ElementType.CONST: - self.index_dict[el.element_name] = KmerIndexer([el.element_value], - max(6, int(el.element_length / 2) - 2)) - elif el.element_type == ElementType.PolyT: + if el.element_type == ElementType.PolyT: if self.has_polyt: logger.critical( "Current version only supports a single polyT, extraction results may be suboptimal") @@ -48,13 +46,18 @@ def __init__(self, molecule_structure: MoleculeStructure): "Current version only supports a single cDNA, reads will not be split into molecules") sys.exit(IsoQuantExitCode.INVALID_PARAMETER) self.has_cdna = True - elif el.element_type.needs_only_coordinates(): - self.elements_to_detect.add(el.element_name) + elif el.element_type.is_constant(): + self.constant_elements_to_detect.add(el.element_name) + self.index_dict[el.element_name] = KmerIndexer([el.element_value], + max(6, int(el.element_length / 2) - 2)) elif el.element_type.needs_sequence_extraction(): self.elements_to_extract.add(el.element_name) elif el.element_type.needs_sequence_extraction(): if self.correct_sequences: self.index_dict[el.element_name] = self.prepare_barcode_index(el) + # FIXME: min score should be based on the length of the barcode and their sparcity + # e.g. find a way to estimate min_score so that error rate does not exceed 1% + self.min_scores[el.element_name] = el.element_length - 2 self.elements_to_correct.add(el.element_name) else: self.elements_to_extract.add(el.element_name) @@ -118,7 +121,7 @@ def extract_variable_elements5(self, detected_elements, sequence): for i, el in enumerate(self.molecule_structure): if el.element_type == ElementType.cDNA: break - if el.element_type in [ElementType.CONST, ElementType.PolyT] and el.element_name in detected_elements: + if (el.element_type == ElementType.PolyT or el.element_type.is_constant()) and el.element_name in detected_elements: first_detected_const_element = i break @@ -129,13 +132,37 @@ def extract_variable_elements5(self, detected_elements, sequence): self.molecule_structure.ordered_elements[first_detected_const_element].element_name].start - 1 for i in range(first_detected_const_element - 1, -1, -1): el: MoleculeElement = self.molecule_structure.ordered_elements[i] - if not el.is_variable(): break + + if el.element_type.is_base_separator(): + current_pos -= el.element_length + continue + if not el.element_type.is_variable(): + # we do not skip undetected const elements on the open side (e.g. here we go to the left of the first detected const element) + break + potential_end = current_pos potential_start = potential_end - el.element_length + 1 if potential_start < 0: break - detected_elements[el.element_name] = DetectedElement(potential_start, potential_end, 0, - seq=sequence[potential_start:potential_end + 1]) + current_pos = potential_start - 1 + if el.element_name in self.elements_to_extract: + detected_elements[el.element_name] = DetectedElement(potential_start, potential_end, 0, + seq=sequence[potential_start:potential_end + 1]) + + elif el.element_name in self.elements_to_correct: + potential_seq = sequence[potential_start:potential_end + 1] + matching_sequences = self.index_dict[el.element_name].get_occurrences(potential_seq) + corrected_seq, seq_score, seq_start, seq_end = \ + find_candidate_with_max_score_ssw(matching_sequences, potential_seq, min_score=self.min_scores[el.element_name]) + + if corrected_seq is not None: + read_seq_start = potential_start + seq_start + read_seq_end = potential_start + seq_end - 1 + current_pos = read_seq_start - 1 + detected_elements[el.element_name] = DetectedElement(read_seq_start, read_seq_end, seq_score, + corrected_seq) + else: + detected_elements[el.element_name] = DetectedElement(-1, -1, -1, ExtractionResult.NOSEQ) current_pos = detected_elements[ self.molecule_structure.ordered_elements[first_detected_const_element].element_name].end + 1 @@ -143,10 +170,13 @@ def extract_variable_elements5(self, detected_elements, sequence): if current_pos >= len(sequence): break - el = self.molecule_structure.ordered_elements[i] + el: MoleculeElement = self.molecule_structure.ordered_elements[i] if el.element_type in [ElementType.cDNA, ElementType.PolyT]: break - elif el.element_type == ElementType.CONST: + elif el.element_type.is_base_separator(): + current_pos += el.element_length + continue + elif el.element_type.is_constant(): if el.element_name in detected_elements: current_pos = detected_elements[el.element_name].end + 1 else: @@ -163,13 +193,34 @@ def extract_variable_elements5(self, detected_elements, sequence): else: potential_end = potential_start + el.element_length - 1 if potential_end >= len(sequence): - potential_end = len(sequence) + 1 + potential_end = len(sequence) - 1 potential_len = potential_end - potential_start + 1 - if abs(potential_len - el.element_length) <= self.MAX_LEN_DIFF * el.element_length: - detected_elements[el.elemenst_name] = DetectedElement(potential_start, potential_end, 0, - seq=sequence[potential_start:potential_end + 1]) current_pos = potential_end + 1 + if abs(potential_len - el.element_length) <= self.MAX_LEN_DIFF * el.element_length: + if el.element_name in self.elements_to_extract: + detected_elements[el.element_name] = DetectedElement(potential_start, potential_end, 0, + seq=sequence[ + potential_start:potential_end + 1]) + + elif el.element_name in self.elements_to_correct: + potential_seq = sequence[potential_start:potential_end + 1] + matching_sequences = self.index_dict[el.element_name].get_occurrences(potential_seq) + corrected_seq, seq_score, seq_start, seq_end = \ + find_candidate_with_max_score_ssw(matching_sequences, potential_seq, + min_score=self.min_scores[el.element_name]) + + if corrected_seq is not None: + read_seq_start = potential_start + seq_start + read_seq_end = potential_start + seq_end - 1 + current_pos = read_seq_end + 1 + detected_elements[el.element_name] = DetectedElement(read_seq_start, read_seq_end, seq_score, + corrected_seq) + else: + detected_elements[el.element_name] = DetectedElement(-1, -1, -1, ExtractionResult.NOSEQ) + else: + detected_elements[el.element_name] = DetectedElement(-1, -1, -1, ExtractionResult.NOSEQ) + def extract_variable_elements3(self, detected_elements, sequence): last_detected_const_element = None @@ -177,34 +228,62 @@ def extract_variable_elements3(self, detected_elements, sequence): el = self.molecule_structure.ordered_elements[i] if el.element_type == ElementType.cDNA: break - if el.element_type in [ElementType.CONST, ElementType.PolyT] and el.element_name in detected_elements: + if (el.element_type == ElementType.PolyT or el.element_type.is_constant()) and el.element_name in detected_elements: last_detected_const_element = i break if last_detected_const_element is None: return - # extracting elements following last detected const element + # extracting elements following the last detected const element current_pos = detected_elements[ self.molecule_structure.ordered_elements[last_detected_const_element].element_name].end + 1 for i in range(last_detected_const_element + 1, len(self.molecule_structure.ordered_elements)): - el = self.molecule_structure.ordered_elements[i] - if not el.is_variable(): break + el: MoleculeElement = self.molecule_structure.ordered_elements[i] + + if el.element_type.is_base_separator(): + current_pos += el.element_length + continue + if not el.element_type.is_variable(): + # we do not skip undetected const elements on the open side (e.g. here we go to the left of the first detected const element) + break + potential_start = current_pos potential_end = potential_start + el.element_length - 1 if potential_end >= len(sequence): break - detected_elements[el.element_name] = DetectedElement(potential_start, potential_end, 0, - seq=sequence[potential_start:potential_end + 1]) current_pos = potential_end + 1 + if el.element_name in self.elements_to_extract: + detected_elements[el.element_name] = DetectedElement(potential_start, potential_end, 0, + seq=sequence[potential_start:potential_end + 1]) + + elif el.element_name in self.elements_to_correct: + potential_seq = sequence[potential_start:potential_end + 1] + matching_sequences = self.index_dict[el.element_name].get_occurrences(potential_seq) + corrected_seq, seq_score, seq_start, seq_end = \ + find_candidate_with_max_score_ssw(matching_sequences, potential_seq, min_score=self.min_scores[el.element_name]) + + if corrected_seq is not None: + read_seq_start = potential_start + seq_start + read_seq_end = potential_start + seq_end - 1 + current_pos = read_seq_end + 1 + detected_elements[el.element_name] = DetectedElement(read_seq_start, read_seq_end, seq_score, + corrected_seq) + else: + detected_elements[el.element_name] = DetectedElement(-1, -1, -1, ExtractionResult.NOSEQ) + current_pos = detected_elements[ self.molecule_structure.ordered_elements[last_detected_const_element].element_name].start - 1 for i in range(last_detected_const_element - 1, -1, -1): if current_pos <= 0: break - el = self.molecule_structure.ordered_elements[i] + el: MoleculeElement = self.molecule_structure.ordered_elements[i] + if el.element_type in [ElementType.cDNA, ElementType.PolyT]: break - elif el.element_type == ElementType.CONST: + elif el.element_type.is_base_separator(): + current_pos -= el.element_length + continue + elif el.element_type.is_constant(): if el.element_name in detected_elements: current_pos = detected_elements[el.element_name].start - 1 else: @@ -224,20 +303,40 @@ def extract_variable_elements3(self, detected_elements, sequence): potential_start = 0 potential_len = potential_end - potential_start + 1 - if abs(potential_len - el.element_length) <= self.MAX_LEN_DIFF * el.element_length: - detected_elements[el.element_name] = DetectedElement(potential_start, potential_end, 0, - seq=sequence[potential_start:potential_end + 1]) current_pos = potential_start - 1 + if abs(potential_len - el.element_length) <= self.MAX_LEN_DIFF * el.element_length: + if el.element_name in self.elements_to_extract: + detected_elements[el.element_name] = DetectedElement(potential_start, potential_end, 0, + seq=sequence[ + potential_start:potential_end + 1]) + + elif el.element_name in self.elements_to_correct: + potential_seq = sequence[potential_start:potential_end + 1] + matching_sequences = self.index_dict[el.element_name].get_occurrences(potential_seq) + corrected_seq, seq_score, seq_start, seq_end = \ + find_candidate_with_max_score_ssw(matching_sequences, potential_seq, + min_score=self.min_scores[el.element_name]) + + if corrected_seq is not None: + read_seq_start = potential_start + seq_start + read_seq_end = potential_start + seq_end - 1 + current_pos = read_seq_start - 1 + detected_elements[el.element_name] = DetectedElement(read_seq_start, read_seq_end, seq_score, + corrected_seq) + else: + detected_elements[el.element_name] = DetectedElement(-1, -1, -1, ExtractionResult.NOSEQ) + else: + detected_elements[el.element_name] = DetectedElement(-1, -1, -1, ExtractionResult.NOSEQ) + def detect_const_elements(self, sequence, polyt_start, polyt_end, detected_elements): # searching left of cDNA first_element_detected = False current_search_start = 0 - for i, el in enumerate(self.molecule_structure): + for el in self.molecule_structure: if el.element_type in [ElementType.cDNA, ElementType.PolyT]: break - - if el.element_type != ElementType.CONST: + if el.element_name not in self.constant_elements_to_detect: continue search_start = current_search_start @@ -273,8 +372,7 @@ def detect_const_elements(self, sequence, polyt_start, polyt_end, detected_eleme el = self.molecule_structure.ordered_elements[i] if el.element_type in [ElementType.cDNA, ElementType.PolyT]: break - - if el.element_type != ElementType.CONST: + if el.element_name not in self.constant_elements_to_detect: continue search_start = 0 if polyt_start == -1 else polyt_end + 1 From 1e2196373cea5cc290e1204e46a1a485d381ec59 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Wed, 28 Jan 2026 22:10:19 +0200 Subject: [PATCH 05/41] add BarcodeResult protocol and standard output format for ExtractionResult --- src/barcode_calling/callers/base.py | 35 ++++-- .../callers/extraction_result.py | 100 ++++++++++++++++-- src/barcode_calling/callers/protocol.py | 23 ++++ 3 files changed, 140 insertions(+), 18 deletions(-) diff --git a/src/barcode_calling/callers/base.py b/src/barcode_calling/callers/base.py index 67ceb28c..e8a1c77a 100644 --- a/src/barcode_calling/callers/base.py +++ b/src/barcode_calling/callers/base.py @@ -95,6 +95,14 @@ def is_valid(self) -> bool: """Check if a valid barcode was detected.""" return self._barcode != BarcodeDetectionResult.NOSEQ + def has_barcode(self) -> bool: + """Check if a barcode was detected (alias for is_valid()).""" + return self._barcode != BarcodeDetectionResult.NOSEQ + + def has_umi(self) -> bool: + """Check if a valid UMI was detected.""" + return self._umi != BarcodeDetectionResult.NOSEQ + def update_coordinates(self, delta: int) -> None: """ Shift all genomic coordinates by delta. @@ -126,12 +134,9 @@ def get_additional_attributes(self) -> List[str]: Get list of detected additional features (primer, linker, etc.). Returns: - List of detected feature names - - Raises: - NotImplementedError: Must be implemented by subclasses + List of detected feature names. Empty list for base class. """ - raise NotImplementedError() + return [] def set_strand(self, strand: str) -> None: """Set the detected strand.""" @@ -357,6 +362,14 @@ def is_valid(self) -> bool: """Check if any pattern has a valid barcode.""" return any(r.is_valid() for r in self.detected_patterns) + def has_barcode(self) -> bool: + """Check if any pattern has a valid barcode.""" + return any(r.has_barcode() for r in self.detected_patterns) + + def has_umi(self) -> bool: + """Check if any pattern has a valid UMI.""" + return any(r.has_umi() for r in self.detected_patterns) + def set_strand(self, strand: str) -> None: """Set strand for all patterns.""" self.strand = strand @@ -405,23 +418,23 @@ def __init__(self): self.umi_count: int = 0 self.additional_attributes_counts: Dict[str, int] = defaultdict(int) - def add_read(self, barcode_detection_result: BarcodeDetectionResult) -> None: + def add_read(self, barcode_detection_result) -> None: """ Add a read result to statistics. Args: - barcode_detection_result: Detection result to accumulate + barcode_detection_result: Detection result to accumulate (implements BarcodeResult protocol) """ self.read_count += 1 # Count detected features (primer, linker, etc.) for a in barcode_detection_result.get_additional_attributes(): self.additional_attributes_counts[a] += 1 # Count valid barcode -# if barcode_detection_result.barcode != BarcodeDetectionResult.NOSEQ: -# self.bc_count += 1 + if barcode_detection_result.has_barcode(): + self.bc_count += 1 # Count valid UMI -# if barcode_detection_result.UMI_good: -# self.umi_count += 1 + if barcode_detection_result.has_umi(): + self.umi_count += 1 def add_custom_stats(self, stat_name: str, val: int) -> None: """ diff --git a/src/barcode_calling/callers/extraction_result.py b/src/barcode_calling/callers/extraction_result.py index 28b4e642..94015a84 100644 --- a/src/barcode_calling/callers/extraction_result.py +++ b/src/barcode_calling/callers/extraction_result.py @@ -116,6 +116,28 @@ def is_valid(self) -> bool: """Check if a valid barcode was detected.""" return self.get_barcode() != self.NOSEQ + def has_barcode(self) -> bool: + """Check if a barcode was detected (alias for is_valid()).""" + return self.get_barcode() != self.NOSEQ + + def has_umi(self) -> bool: + """Check if a valid UMI was detected.""" + return self.get_umi() != self.NOSEQ + + def get_barcode_score(self) -> int: + """ + Get the best barcode alignment score. + + Returns the maximum score from all barcode elements, or -1 if none detected. + """ + max_score = -1 + for el_name in self._barcode_elements: + if el_name in self.detected_results: + detected = self.detected_results[el_name] + if detected.score > max_score: + max_score = detected.score + return max_score + def update_coordinates(self, delta: int) -> None: """ Shift all genomic coordinates by delta. @@ -164,12 +186,42 @@ def set_strand(self, strand: str) -> None: """Set the detected strand.""" self.strand = strand + def _get_supplementary_elements(self) -> List[str]: + """Get list of element names that are not barcode or UMI elements.""" + supplementary = [] + barcode_umi_set = set(self._barcode_elements + self._umi_elements) + for el in self.molecule_structure: + if el.element_type == ElementType.cDNA: + continue + if el.element_name not in barcode_umi_set: + supplementary.append(el.element_name) + return supplementary + def __str__(self) -> str: - """Format result as TSV line.""" - res_str = "%s\t%s" % (self.read_id, self.strand) + """ + Format result as TSV line in standard format. + + Format: read_id, barcode, UMI, BC_score, valid_UMI, strand, [supplementary elements] + This matches the format expected by the rest of the IsoQuant pipeline. + """ + # Standard fields: read_id, barcode, UMI, BC_score, valid_UMI, strand + res_str = "%s\t%s\t%s\t%d\t%s\t%s" % ( + self.read_id, + self.get_barcode(), + self.get_umi(), + self.get_barcode_score(), + self.has_umi(), + self.strand + ) + + # Supplementary elements (non-barcode, non-UMI) for el in self.molecule_structure: if el.element_type == ElementType.cDNA: continue + # Skip barcode and UMI elements (already printed above) + if el.element_name in self._barcode_elements or el.element_name in self._umi_elements: + continue + if el.element_type == ElementType.PolyT: if ElementType.PolyT.name not in self.detected_results: res_str += "\t-1\t-1" @@ -192,25 +244,59 @@ def __str__(self) -> str: return res_str def header(self) -> str: - """Get TSV header for result output.""" - return self.molecule_structure.header() + """ + Get TSV header for result output in standard format. + + Format: #read_id, barcode, UMI, BC_score, valid_UMI, strand, [supplementary elements] + """ + # Standard header fields + header = "#read_id\tbarcode\tUMI\tBC_score\tvalid_UMI\tstrand" + + # Supplementary elements (non-barcode, non-UMI) + for el in self.molecule_structure: + if el.element_type == ElementType.cDNA: + continue + # Skip barcode and UMI elements (already in standard header) + if el.element_name in self._barcode_elements or el.element_name in self._umi_elements: + continue + + if el.element_type == ElementType.PolyT: + header += "\tpolyT_start\tpolyT_end" + elif el.element_type.needs_only_coordinates(): + header += "\t%s_start\t%s_end" % (el.element_name, el.element_name) + elif el.element_type.needs_sequence_extraction() or el.element_type.needs_correction(): + header += "\t%s_start\t%s_end\t%s_seq" % (el.element_name, el.element_name, el.element_name) + return header # TODO: add simple serialization class ReadStats: + """Statistics tracker for ExtractionResult objects.""" + def __init__(self): self.read_count = 0 + self.bc_count = 0 + self.umi_count = 0 self.pattern_counts = defaultdict(int) - def add_read(self, barcode_detection_result): + def add_read(self, barcode_detection_result: ExtractionResult) -> None: + """Add an ExtractionResult to statistics.""" self.read_count += 1 + # Count valid barcodes and UMIs + if barcode_detection_result.has_barcode(): + self.bc_count += 1 + if barcode_detection_result.has_umi(): + self.umi_count += 1 + # Count individual detected elements for el in barcode_detection_result.detected_results: if barcode_detection_result.detected_results[el].start != -1: self.pattern_counts[el] += 1 - def __str__(self): - human_readable_str = ("Total reads:\t%d\n" % self.read_count) + def __str__(self) -> str: + human_readable_str = "Total reads:\t%d\n" % self.read_count + human_readable_str += "Barcode detected:\t%d\n" % self.bc_count + human_readable_str += "UMI detected:\t%d\n" % self.umi_count for a in self.pattern_counts: human_readable_str += "%s:\t%d\n" % (a, self.pattern_counts[a]) return human_readable_str diff --git a/src/barcode_calling/callers/protocol.py b/src/barcode_calling/callers/protocol.py index cbdd96de..a76f6717 100644 --- a/src/barcode_calling/callers/protocol.py +++ b/src/barcode_calling/callers/protocol.py @@ -86,6 +86,29 @@ def is_valid(self) -> bool: """ ... + def has_barcode(self) -> bool: + """ + Check if a barcode was detected (alias for is_valid()). + + Used by ReadStats for counting detected barcodes. + + Returns: + bool: True if barcode was detected. + """ + ... + + def has_umi(self) -> bool: + """ + Check if a valid UMI was detected. + + Returns True if get_umi() returns something other than NOSEQ. + Used by ReadStats for counting detected UMIs. + + Returns: + bool: True if UMI was detected. + """ + ... + def set_strand(self, strand: str) -> None: """ Set the detected strand orientation. From fe449bcd5c3219205102186fc397ae6ac746e548 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Thu, 29 Jan 2026 10:51:20 +0200 Subject: [PATCH 06/41] universal barcode calling tests --- tests/test_barcode_callers.py | 7 +- tests/test_molecule_structure.py | 354 ++++++++++++++ tests/test_universal_extraction.py | 727 +++++++++++++++++++++++++++++ 3 files changed, 1086 insertions(+), 2 deletions(-) create mode 100644 tests/test_molecule_structure.py create mode 100644 tests/test_universal_extraction.py diff --git a/tests/test_barcode_callers.py b/tests/test_barcode_callers.py index a0b1cbf0..1cf3f7f2 100644 --- a/tests/test_barcode_callers.py +++ b/tests/test_barcode_callers.py @@ -100,7 +100,8 @@ def test_str_format(self): def test_header(self): """Test TSV header.""" - header = BarcodeDetectionResult.header() + # header() is now an instance method, use _header() for static access + header = BarcodeDetectionResult._header() assert "#read_id" in header assert "barcode" in header assert "UMI" in header @@ -407,6 +408,7 @@ def test_add_read_with_barcode(self): result = LinkerBarcodeDetectionResult( "read_001", barcode="ACTG", + UMI="GGGGGGGG", # Need actual UMI for has_umi() to return True UMI_good=True, polyT=100 ) @@ -458,7 +460,8 @@ def test_add_custom_stats(self): def test_str_format(self): """Test string formatting.""" stats = ReadStats() - result = LinkerBarcodeDetectionResult("read_001", barcode="ACTG", UMI_good=True) + # Need actual UMI for has_umi() to return True + result = LinkerBarcodeDetectionResult("read_001", barcode="ACTG", UMI="GGGGGGGG", UMI_good=True) stats.add_read(result) output = str(stats) diff --git a/tests/test_molecule_structure.py b/tests/test_molecule_structure.py new file mode 100644 index 00000000..097dce42 --- /dev/null +++ b/tests/test_molecule_structure.py @@ -0,0 +1,354 @@ +############################################################################ +# Copyright (c) 2025-2026 University of Helsinki +# All Rights Reserved +# See file LICENSE for details. +############################################################################ + +""" +Tests for molecule structure parsing and element types. + +Tests the universal barcode calling infrastructure: +- ElementType enum and its method classification +- MoleculeElement initialization for different element types +- MoleculeStructure parsing from MDF format +""" + +import os +import pytest +from io import StringIO + +from src.barcode_calling.callers.molecule_structure import ( + ElementType, + MoleculeElement, + MoleculeStructure +) + +# Test data directory +TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), "universal_data") + + +class TestElementType: + """Test ElementType enum methods.""" + + def test_needs_correction_var_file(self): + """VAR_FILE elements need barcode correction.""" + assert ElementType.VAR_FILE.needs_correction() is True + + def test_needs_correction_var_list(self): + """VAR_LIST elements need barcode correction.""" + assert ElementType.VAR_LIST.needs_correction() is True + + def test_needs_correction_var_any(self): + """VAR_ANY elements do NOT need correction.""" + assert ElementType.VAR_ANY.needs_correction() is False + + def test_needs_correction_const(self): + """CONST elements do NOT need correction.""" + assert ElementType.CONST.needs_correction() is False + + def test_needs_sequence_extraction_var_any(self): + """VAR_ANY elements need sequence extraction.""" + assert ElementType.VAR_ANY.needs_sequence_extraction() is True + + def test_needs_sequence_extraction_var_file(self): + """VAR_FILE elements do NOT need sequence extraction (they get corrected).""" + assert ElementType.VAR_FILE.needs_sequence_extraction() is False + + def test_needs_sequence_extraction_const(self): + """CONST elements do NOT need sequence extraction.""" + assert ElementType.CONST.needs_sequence_extraction() is False + + def test_is_constant_const(self): + """CONST is a constant element.""" + assert ElementType.CONST.is_constant() is True + + def test_is_constant_var_any(self): + """VAR_ANY is NOT a constant element.""" + assert ElementType.VAR_ANY.is_constant() is False + + def test_is_constant_polyt(self): + """PolyT is NOT a constant element.""" + assert ElementType.PolyT.is_constant() is False + + def test_is_variable_var_any(self): + """VAR_ANY is a variable element.""" + assert ElementType.VAR_ANY.is_variable() is True + + def test_is_variable_var_list(self): + """VAR_LIST is a variable element.""" + assert ElementType.VAR_LIST.is_variable() is True + + def test_is_variable_var_file(self): + """VAR_FILE is a variable element.""" + assert ElementType.VAR_FILE.is_variable() is True + + def test_is_variable_const(self): + """CONST is NOT a variable element.""" + assert ElementType.CONST.is_variable() is False + + def test_is_variable_polyt(self): + """PolyT is NOT a variable element.""" + assert ElementType.PolyT.is_variable() is False + + def test_is_base_separator_var_any_separator(self): + """VAR_ANY_SEPARATOR is a separator.""" + assert ElementType.VAR_ANY_SEPARATOR.is_base_separator() is True + + def test_is_base_separator_var_any_non_t_separator(self): + """VAR_ANY_NON_T_SEPARATOR is a separator.""" + assert ElementType.VAR_ANY_NON_T_SEPARATOR.is_base_separator() is True + + def test_is_base_separator_var_any(self): + """VAR_ANY is NOT a separator.""" + assert ElementType.VAR_ANY.is_base_separator() is False + + +class TestMoleculeElement: + """Test MoleculeElement initialization.""" + + def test_init_polyt(self): + """Test PolyT element initialization.""" + element = MoleculeElement("PolyT", ElementType.PolyT) + + assert element.element_name == "PolyT" + assert element.element_type == ElementType.PolyT + assert element.element_value is None + assert element.element_length == -1 + + def test_init_cdna(self): + """Test cDNA element initialization.""" + element = MoleculeElement("cDNA", ElementType.cDNA) + + assert element.element_name == "cDNA" + assert element.element_type == ElementType.cDNA + assert element.element_value is None + assert element.element_length == -1 + + def test_init_const(self): + """Test CONST element initialization.""" + sequence = "CTACACGACGCTCTTCCGATCT" + element = MoleculeElement("R1", ElementType.CONST, sequence) + + assert element.element_name == "R1" + assert element.element_type == ElementType.CONST + assert element.element_value == sequence + assert element.element_length == len(sequence) + + def test_init_var_list(self): + """Test VAR_LIST element initialization.""" + barcodes = "AAAA,CCCC,GGGG,TTTT" + element = MoleculeElement("Barcode", ElementType.VAR_LIST, barcodes) + + assert element.element_name == "Barcode" + assert element.element_type == ElementType.VAR_LIST + assert element.element_value == ["AAAA", "CCCC", "GGGG", "TTTT"] + assert element.element_length == 4 # Length of first barcode + + def test_init_var_any(self): + """Test VAR_ANY element initialization.""" + element = MoleculeElement("UMI", ElementType.VAR_ANY, "12") + + assert element.element_name == "UMI" + assert element.element_type == ElementType.VAR_ANY + assert element.element_value is None + assert element.element_length == 12 + + def test_init_var_file(self): + """Test VAR_FILE element initialization with real file.""" + barcode_file = os.path.join(TEST_DATA_DIR, "barcodes_small.tsv") + element = MoleculeElement("Barcode", ElementType.VAR_FILE, barcode_file) + + assert element.element_name == "Barcode" + assert element.element_type == ElementType.VAR_FILE + assert len(element.element_value) == 8 # 8 barcodes in test file + assert element.element_length == 16 # 16bp barcodes + + def test_init_var_any_separator(self): + """Test VAR_ANY_SEPARATOR element initialization.""" + element = MoleculeElement("Sep", ElementType.VAR_ANY_SEPARATOR, "5") + + assert element.element_name == "Sep" + assert element.element_type == ElementType.VAR_ANY_SEPARATOR + assert element.element_value is None + assert element.element_length == 5 + + +class TestMoleculeStructure: + """Test MoleculeStructure parsing.""" + + def test_parse_simple_structure(self): + """Test parsing simple molecule structure.""" + mdf_content = """Barcode:UMI:PolyT:cDNA +Barcode\tVAR_LIST\tAAAA,CCCC,GGGG +UMI\tVAR_ANY\t10 +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + elements = list(structure) + assert len(elements) == 4 + + assert elements[0].element_name == "Barcode" + assert elements[0].element_type == ElementType.VAR_LIST + + assert elements[1].element_name == "UMI" + assert elements[1].element_type == ElementType.VAR_ANY + + assert elements[2].element_name == "PolyT" + assert elements[2].element_type == ElementType.PolyT + + assert elements[3].element_name == "cDNA" + assert elements[3].element_type == ElementType.cDNA + + def test_parse_with_const(self): + """Test parsing structure with constant element.""" + mdf_content = """R1:Barcode:UMI:PolyT:cDNA +R1\tCONST\tCTACACGACGCTCTTCCGATCT +Barcode\tVAR_LIST\tAAAA,CCCC,GGGG +UMI\tVAR_ANY\t12 +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + elements = list(structure) + assert len(elements) == 5 + + assert elements[0].element_name == "R1" + assert elements[0].element_type == ElementType.CONST + assert elements[0].element_value == "CTACACGACGCTCTTCCGATCT" + + def test_parse_from_file_simple(self): + """Test parsing simple.mdf file.""" + mdf_path = os.path.join(TEST_DATA_DIR, "simple.mdf") + + with open(mdf_path) as f: + structure = MoleculeStructure(f) + + elements = list(structure) + + # Simple.mdf: Barcode:UMI:PolyT:cDNA + assert len(elements) == 4 + assert elements[0].element_name == "Barcode" + assert elements[0].element_type == ElementType.VAR_LIST + assert elements[1].element_name == "UMI" + assert elements[1].element_type == ElementType.VAR_ANY + assert elements[1].element_length == 10 + + def test_parse_from_file_10x(self): + """Test parsing 10x-like structure. + + Uses inline definition to avoid relative path issues with VAR_FILE. + """ + # Use inline definition to avoid file path issues + mdf_content = """R1:Barcode:UMI:PolyT:cDNA:TSO +R1\tCONST\tCTACACGACGCTCTTCCGATCT +Barcode\tVAR_LIST\tATCCTTAGTGTTTGTC,CTTAGGAGTGGTTAGC,TGAGGGCCAACGTGCT +UMI\tVAR_ANY\t12 +TSO\tCONST\tCCCATGTACTCTGCGTTGATACCACTGCTT +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + elements = list(structure) + + # R1:Barcode:UMI:PolyT:cDNA:TSO + assert len(elements) == 6 + assert elements[0].element_name == "R1" + assert elements[0].element_type == ElementType.CONST + assert elements[1].element_name == "Barcode" + assert elements[1].element_type == ElementType.VAR_LIST + + def test_iteration(self): + """Test iterating over structure elements.""" + mdf_content = """Barcode:UMI:cDNA +Barcode\tVAR_LIST\tAAAA,CCCC +UMI\tVAR_ANY\t8 +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + names = [el.element_name for el in structure] + assert names == ["Barcode", "UMI", "cDNA"] + + def test_header_generation(self): + """Test TSV header generation.""" + mdf_content = """R1:Barcode:UMI:PolyT:cDNA +R1\tCONST\tCTACACGACGCTCTTCCGATCT +Barcode\tVAR_LIST\tAAAA,CCCC,GGGG +UMI\tVAR_ANY\t12 +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + header = structure.header() + + assert "#read_id" in header + assert "strand" in header + assert "R1_start" in header + assert "R1_end" in header + assert "Barcode_start" in header + assert "Barcode_end" in header + assert "Barcode_sequence" in header + assert "UMI_start" in header + assert "UMI_end" in header + assert "UMI_sequence" in header + assert "polyT_start" in header + assert "polyT_end" in header + + def test_from_element_list(self): + """Test creating structure from element list.""" + elements = [ + MoleculeElement("Barcode", ElementType.VAR_LIST, "AAAA,CCCC"), + MoleculeElement("UMI", ElementType.VAR_ANY, "10"), + MoleculeElement("cDNA", ElementType.cDNA) + ] + + structure = MoleculeStructure.from_element_list(elements) + + assert len(list(structure)) == 3 + assert list(structure)[0].element_name == "Barcode" + + def test_concat_delimiter(self): + """Test concatenation delimiter constant.""" + assert MoleculeStructure.CONCAT_DELIM == "|" + + def test_dupl_delimiter(self): + """Test duplication delimiter constant.""" + assert MoleculeStructure.DUPL_DELIM == "/" + + +class TestMoleculeStructureEdgeCases: + """Test edge cases and error handling.""" + + def test_element_order_preserved(self): + """Test that element order from header line is preserved.""" + mdf_content = """cDNA:UMI:Barcode:PolyT +Barcode\tVAR_LIST\tAAAA,CCCC +UMI\tVAR_ANY\t8 +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + names = [el.element_name for el in structure] + # Order should match header line, not definition order + assert names == ["cDNA", "UMI", "Barcode", "PolyT"] + + def test_whitespace_handling(self): + """Test that whitespace in MDF is handled correctly.""" + mdf_content = """ Barcode : UMI : cDNA +Barcode VAR_LIST AAAA,CCCC +UMI VAR_ANY 8 +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + elements = list(structure) + assert len(elements) == 3 + + def test_multiple_var_list_barcodes(self): + """Test VAR_LIST with many barcodes.""" + barcodes = ",".join(["ACGT" * 4] * 100) # 100 barcodes + mdf_content = f"""Barcode:cDNA +Barcode\tVAR_LIST\t{barcodes} +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + elements = list(structure) + assert len(elements[0].element_value) == 100 + + +if __name__ == '__main__': + pytest.main([__file__, '-v']) diff --git a/tests/test_universal_extraction.py b/tests/test_universal_extraction.py new file mode 100644 index 00000000..841d409c --- /dev/null +++ b/tests/test_universal_extraction.py @@ -0,0 +1,727 @@ +############################################################################ +# Copyright (c) 2025-2026 University of Helsinki +# All Rights Reserved +# See file LICENSE for details. +############################################################################ + +""" +Tests for universal barcode extraction. + +Tests the universal barcode calling infrastructure: +- DetectedElement container class +- ExtractionResult dict-based detection result +- UniversalSingleMoleculeExtractor for pattern detection +- BarcodeResult protocol compliance +""" + +import os +import pytest + +from src.barcode_calling.callers.extraction_result import ( + DetectedElement, + ExtractionResult, + ReadStats +) +from src.barcode_calling.callers.molecule_structure import ( + ElementType, + MoleculeElement, + MoleculeStructure +) +from src.barcode_calling.callers.universal_extraction import ( + UniversalSingleMoleculeExtractor +) +from src.barcode_calling.callers.protocol import BarcodeResult + +# Test data directory +TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), "universal_data") + + +# ============================================================================= +# Test Fixtures +# ============================================================================= + +def create_simple_structure(): + """Create simple molecule structure: Barcode:UMI:PolyT:cDNA""" + mdf_content = """Barcode:UMI:PolyT:cDNA +Barcode\tVAR_LIST\tATCCTTAGTGTTTGTC,CTTAGGAGTGGTTAGC,TGAGGGCCAACGTGCT +UMI\tVAR_ANY\t12 +""" + return MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + +def create_10x_structure(): + """Create 10x-like molecule structure with R1 adapter and TSO. + + Uses inline barcodes instead of file reference to avoid path issues. + """ + # Use inline barcodes to avoid relative path issues with VAR_FILE + mdf_content = """R1:Barcode:UMI:PolyT:cDNA:TSO +R1\tCONST\tCTACACGACGCTCTTCCGATCT +Barcode\tVAR_LIST\tATCCTTAGTGTTTGTC,CTTAGGAGTGGTTAGC,TGAGGGCCAACGTGCT,AAACCCGGGTTTAAAC +UMI\tVAR_ANY\t12 +TSO\tCONST\tCCCATGTACTCTGCGTTGATACCACTGCTT +""" + return MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + +def create_detected_elements(): + """Create sample detected elements for testing.""" + return { + "Barcode": DetectedElement(22, 37, 16, "ATCCTTAGTGTTTGTC"), + "UMI": DetectedElement(38, 49, 0, "AACCTTAGACCG"), + "PolyT": DetectedElement(50, 79, 0) + } + + +# ============================================================================= +# Test DetectedElement +# ============================================================================= + +class TestDetectedElement: + """Test DetectedElement container class.""" + + def test_default_init(self): + """Test initialization with default values.""" + element = DetectedElement() + + assert element.start == -1 + assert element.end == -1 + assert element.score == -1 + assert element.seq is None + + def test_custom_init(self): + """Test initialization with custom values.""" + element = DetectedElement(start=10, end=25, score=15, seq="ACTGACTG") + + assert element.start == 10 + assert element.end == 25 + assert element.score == 15 + assert element.seq == "ACTGACTG" + + def test_partial_init(self): + """Test initialization with some values.""" + element = DetectedElement(start=100, end=150) + + assert element.start == 100 + assert element.end == 150 + assert element.score == -1 + assert element.seq is None + + def test_coordinates_only(self): + """Test element with only coordinates (no sequence).""" + element = DetectedElement(50, 70, 0) + + assert element.start == 50 + assert element.end == 70 + assert element.seq is None + + +# ============================================================================= +# Test ExtractionResult +# ============================================================================= + +class TestExtractionResult: + """Test ExtractionResult dict-based detection result.""" + + def test_init(self): + """Test initialization.""" + structure = create_simple_structure() + detected = create_detected_elements() + + result = ExtractionResult(structure, "read_001", "+", detected) + + assert result.read_id == "read_001" + assert result.strand == "+" + assert result.molecule_structure is structure + assert "Barcode" in result.detected_results + + def test_barcode_identification_by_prefix(self): + """Test barcode elements are identified by 'barcode' prefix.""" + structure = create_simple_structure() + detected = create_detected_elements() + + result = ExtractionResult(structure, "read_001", "+", detected) + + # Should identify 'Barcode' element as barcode (case-insensitive prefix) + assert "Barcode" in result._barcode_elements + + def test_umi_identification_by_prefix(self): + """Test UMI elements are identified by 'umi' prefix.""" + structure = create_simple_structure() + detected = create_detected_elements() + + result = ExtractionResult(structure, "read_001", "+", detected) + + # Should identify 'UMI' element as UMI (case-insensitive prefix) + assert "UMI" in result._umi_elements + + def test_get_barcode_single(self): + """Test getting single barcode.""" + structure = create_simple_structure() + detected = create_detected_elements() + + result = ExtractionResult(structure, "read_001", "+", detected) + + assert result.get_barcode() == "ATCCTTAGTGTTTGTC" + + def test_get_barcode_missing(self): + """Test getting barcode when not detected.""" + structure = create_simple_structure() + detected = { + "UMI": DetectedElement(38, 49, 0, "AACCTTAGACCG"), + "PolyT": DetectedElement(50, 79, 0) + } + + result = ExtractionResult(structure, "read_001", "+", detected) + + assert result.get_barcode() == ExtractionResult.NOSEQ + + def test_get_umi(self): + """Test getting UMI.""" + structure = create_simple_structure() + detected = create_detected_elements() + + result = ExtractionResult(structure, "read_001", "+", detected) + + assert result.get_umi() == "AACCTTAGACCG" + + def test_get_umi_missing(self): + """Test getting UMI when not detected.""" + structure = create_simple_structure() + detected = { + "Barcode": DetectedElement(22, 37, 16, "ATCCTTAGTGTTTGTC"), + "PolyT": DetectedElement(50, 79, 0) + } + + result = ExtractionResult(structure, "read_001", "+", detected) + + assert result.get_umi() == ExtractionResult.NOSEQ + + def test_is_valid_with_barcode(self): + """Test is_valid returns True when barcode detected.""" + structure = create_simple_structure() + detected = create_detected_elements() + + result = ExtractionResult(structure, "read_001", "+", detected) + + assert result.is_valid() is True + + def test_is_valid_without_barcode(self): + """Test is_valid returns False when no barcode.""" + structure = create_simple_structure() + detected = {"PolyT": DetectedElement(50, 79, 0)} + + result = ExtractionResult(structure, "read_001", "+", detected) + + assert result.is_valid() is False + + def test_has_barcode(self): + """Test has_barcode method.""" + structure = create_simple_structure() + detected = create_detected_elements() + + result = ExtractionResult(structure, "read_001", "+", detected) + + assert result.has_barcode() is True + + def test_has_umi(self): + """Test has_umi method.""" + structure = create_simple_structure() + detected = create_detected_elements() + + result = ExtractionResult(structure, "read_001", "+", detected) + + assert result.has_umi() is True + + def test_has_umi_false(self): + """Test has_umi returns False when no UMI.""" + structure = create_simple_structure() + detected = { + "Barcode": DetectedElement(22, 37, 16, "ATCCTTAGTGTTTGTC"), + "PolyT": DetectedElement(50, 79, 0) + } + + result = ExtractionResult(structure, "read_001", "+", detected) + + assert result.has_umi() is False + + def test_get_barcode_score(self): + """Test getting barcode score.""" + structure = create_simple_structure() + detected = create_detected_elements() + + result = ExtractionResult(structure, "read_001", "+", detected) + + assert result.get_barcode_score() == 16 + + def test_get_barcode_score_no_barcode(self): + """Test barcode score when no barcode detected.""" + structure = create_simple_structure() + detected = {"PolyT": DetectedElement(50, 79, 0)} + + result = ExtractionResult(structure, "read_001", "+", detected) + + assert result.get_barcode_score() == -1 + + def test_set_strand(self): + """Test setting strand.""" + structure = create_simple_structure() + detected = create_detected_elements() + + result = ExtractionResult(structure, "read_001", ".", detected) + result.set_strand("-") + + assert result.strand == "-" + + def test_update_coordinates(self): + """Test shifting all coordinates.""" + structure = create_simple_structure() + detected = { + "Barcode": DetectedElement(22, 37, 16, "ATCCTTAGTGTTTGTC"), + "UMI": DetectedElement(38, 49, 0, "AACCTTAGACCG"), + } + + result = ExtractionResult(structure, "read_001", "+", detected) + result.update_coordinates(100) + + assert result.detected_results["Barcode"].start == 122 + assert result.detected_results["Barcode"].end == 137 + assert result.detected_results["UMI"].start == 138 + assert result.detected_results["UMI"].end == 149 + + def test_update_coordinates_invalid(self): + """Test coordinate shift preserves -1 values.""" + structure = create_simple_structure() + detected = { + "Barcode": DetectedElement(-1, -1, -1, ExtractionResult.NOSEQ), + } + + result = ExtractionResult(structure, "read_001", "+", detected) + result.update_coordinates(100) + + # -1 values (invalid/not detected) should remain -1 + assert result.detected_results["Barcode"].start == -1 + assert result.detected_results["Barcode"].end == -1 + + def test_more_informative_than_by_count(self): + """Test comparison by detected element count.""" + structure = create_simple_structure() + + detected1 = { + "Barcode": DetectedElement(22, 37, 16, "ATCCTTAGTGTTTGTC"), + "UMI": DetectedElement(38, 49, 0, "AACCTTAGACCG"), + "PolyT": DetectedElement(50, 79, 0), + } + detected2 = { + "Barcode": DetectedElement(22, 37, 16, "ATCCTTAGTGTTTGTC"), + } + + result1 = ExtractionResult(structure, "read_001", "+", detected1) + result2 = ExtractionResult(structure, "read_001", "+", detected2) + + assert result1.more_informative_than(result2) is True + assert result2.more_informative_than(result1) is False + + def test_more_informative_than_by_score(self): + """Test comparison by total score when counts equal.""" + structure = create_simple_structure() + + detected1 = { + "Barcode": DetectedElement(22, 37, 20, "ATCCTTAGTGTTTGTC"), + } + detected2 = { + "Barcode": DetectedElement(22, 37, 10, "ATCCTTAGTGTTTGTC"), + } + + result1 = ExtractionResult(structure, "read_001", "+", detected1) + result2 = ExtractionResult(structure, "read_001", "+", detected2) + + assert result1.more_informative_than(result2) is True + + def test_str_format(self): + """Test string formatting for TSV output.""" + structure = create_simple_structure() + detected = create_detected_elements() + + result = ExtractionResult(structure, "read_001", "+", detected) + output = str(result) + + # Standard format: read_id, barcode, UMI, BC_score, valid_UMI, strand + assert "read_001" in output + assert "ATCCTTAGTGTTTGTC" in output + assert "AACCTTAGACCG" in output + assert "16" in output + assert "True" in output + assert "+" in output + + def test_header_format(self): + """Test header generation.""" + structure = create_simple_structure() + detected = create_detected_elements() + + result = ExtractionResult(structure, "read_001", "+", detected) + header = result.header() + + assert "#read_id" in header + assert "barcode" in header + assert "UMI" in header + assert "BC_score" in header + assert "valid_UMI" in header + assert "strand" in header + + def test_get_additional_attributes(self): + """Test getting additional attributes.""" + structure = create_simple_structure() + detected = create_detected_elements() + + result = ExtractionResult(structure, "read_001", "+", detected) + attrs = result.get_additional_attributes() + + assert "Barcode detected" in attrs + assert "UMI detected" in attrs + assert "PolyT detected" in attrs + + def test_noseq_constant(self): + """Test NOSEQ constant.""" + assert ExtractionResult.NOSEQ == "*" + + +class TestExtractionResultProtocolCompliance: + """Test that ExtractionResult conforms to BarcodeResult protocol.""" + + def test_isinstance_check(self): + """Test that ExtractionResult is recognized as BarcodeResult.""" + structure = create_simple_structure() + detected = create_detected_elements() + + result = ExtractionResult(structure, "read_001", "+", detected) + + # Protocol is runtime_checkable + assert isinstance(result, BarcodeResult) + + def test_has_required_attributes(self): + """Test that all protocol attributes exist.""" + structure = create_simple_structure() + detected = create_detected_elements() + + result = ExtractionResult(structure, "read_001", "+", detected) + + # Required attributes + assert hasattr(result, 'NOSEQ') + assert hasattr(result, 'read_id') + assert hasattr(result, 'strand') + + def test_has_required_methods(self): + """Test that all protocol methods exist.""" + structure = create_simple_structure() + detected = create_detected_elements() + + result = ExtractionResult(structure, "read_001", "+", detected) + + # Required methods + assert callable(getattr(result, 'get_barcode', None)) + assert callable(getattr(result, 'get_umi', None)) + assert callable(getattr(result, 'is_valid', None)) + assert callable(getattr(result, 'has_barcode', None)) + assert callable(getattr(result, 'has_umi', None)) + assert callable(getattr(result, 'set_strand', None)) + assert callable(getattr(result, 'update_coordinates', None)) + assert callable(getattr(result, 'more_informative_than', None)) + assert callable(getattr(result, 'get_additional_attributes', None)) + assert callable(getattr(result, 'header', None)) + + +# ============================================================================= +# Test ExtractionResult ReadStats +# ============================================================================= + +class TestExtractionReadStats: + """Test ReadStats class for ExtractionResult.""" + + def test_init(self): + """Test initialization.""" + stats = ReadStats() + + assert stats.read_count == 0 + assert stats.bc_count == 0 + assert stats.umi_count == 0 + assert len(stats.pattern_counts) == 0 + + def test_add_read_with_barcode(self): + """Test adding read with barcode detected.""" + stats = ReadStats() + structure = create_simple_structure() + detected = create_detected_elements() + result = ExtractionResult(structure, "read_001", "+", detected) + + stats.add_read(result) + + assert stats.read_count == 1 + assert stats.bc_count == 1 + assert stats.umi_count == 1 + assert stats.pattern_counts["Barcode"] == 1 + assert stats.pattern_counts["UMI"] == 1 + assert stats.pattern_counts["PolyT"] == 1 + + def test_add_read_without_barcode(self): + """Test adding read without barcode.""" + stats = ReadStats() + structure = create_simple_structure() + detected = {"PolyT": DetectedElement(50, 79, 0)} + result = ExtractionResult(structure, "read_001", "+", detected) + + stats.add_read(result) + + assert stats.read_count == 1 + assert stats.bc_count == 0 + assert stats.umi_count == 0 + assert stats.pattern_counts["PolyT"] == 1 + + def test_add_multiple_reads(self): + """Test adding multiple reads.""" + stats = ReadStats() + structure = create_simple_structure() + + # Read 1: has barcode, UMI, polyT + detected1 = create_detected_elements() + result1 = ExtractionResult(structure, "read_001", "+", detected1) + + # Read 2: has only barcode + detected2 = {"Barcode": DetectedElement(22, 37, 16, "ATCCTTAGTGTTTGTC")} + result2 = ExtractionResult(structure, "read_002", "+", detected2) + + # Read 3: no barcode + detected3 = {"PolyT": DetectedElement(50, 79, 0)} + result3 = ExtractionResult(structure, "read_003", "+", detected3) + + stats.add_read(result1) + stats.add_read(result2) + stats.add_read(result3) + + assert stats.read_count == 3 + assert stats.bc_count == 2 # reads 1 and 2 + assert stats.umi_count == 1 # only read 1 + assert stats.pattern_counts["Barcode"] == 2 + assert stats.pattern_counts["PolyT"] == 2 + + def test_str_format(self): + """Test string formatting.""" + stats = ReadStats() + structure = create_simple_structure() + detected = create_detected_elements() + result = ExtractionResult(structure, "read_001", "+", detected) + + stats.add_read(result) + output = str(stats) + + assert "Total reads:\t1" in output + assert "Barcode detected:\t1" in output + assert "UMI detected:\t1" in output + + +# ============================================================================= +# Test UniversalSingleMoleculeExtractor +# ============================================================================= + +class TestUniversalSingleMoleculeExtractor: + """Test UniversalSingleMoleculeExtractor class.""" + + def test_init_simple_structure(self): + """Test initialization with simple structure.""" + structure = create_simple_structure() + extractor = UniversalSingleMoleculeExtractor(structure) + + assert extractor.has_polyt is True + assert extractor.has_cdna is True + assert "UMI" in extractor.elements_to_extract + + def test_init_10x_structure(self): + """Test initialization with 10x structure.""" + structure = create_10x_structure() + extractor = UniversalSingleMoleculeExtractor(structure) + + assert extractor.has_polyt is True + assert extractor.has_cdna is True + assert "R1" in extractor.constant_elements_to_detect + assert "TSO" in extractor.constant_elements_to_detect + + def test_result_type(self): + """Test result_type returns ExtractionResult.""" + structure = create_simple_structure() + extractor = UniversalSingleMoleculeExtractor(structure) + + assert extractor.result_type() is ExtractionResult + + def test_header(self): + """Test header generation.""" + structure = create_simple_structure() + extractor = UniversalSingleMoleculeExtractor(structure) + + header = extractor.header() + + assert "#read_id" in header + assert "strand" in header + + +class TestPatternDetection: + """Test actual barcode/UMI detection with synthetic sequences.""" + + def test_find_barcode_umi_forward(self): + """Test detection in forward strand read.""" + structure = create_10x_structure() + extractor = UniversalSingleMoleculeExtractor(structure) + + # Test read with known structure (from test_reads.fq READ_1) + # Structure: R1(22bp) + Barcode(16bp) + UMI(12bp) + PolyT + cDNA + read_id = "READ_1" + sequence = "TACACGACGCTCTTCCGATCTATCCTTAGTGTTTGTCAACCTTAGACCGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTGGGCCTCAGCGTAATCCTATTGTGCATG" + + result = extractor.find_barcode_umi(read_id, sequence) + + assert result.read_id == read_id + assert result.strand == "+" + # Should detect polyT + assert "PolyT" in result.detected_results + polyt = result.detected_results["PolyT"] + assert polyt.start != -1 + + def test_find_barcode_umi_reverse(self): + """Test detection in reverse strand read.""" + structure = create_10x_structure() + extractor = UniversalSingleMoleculeExtractor(structure) + + # Test read with TSO at beginning (from test_reads.fq READ_2) + # This indicates reverse strand - TSO is at 3' end + read_id = "READ_2" + # Starts with TSO adapter, ends with R1 + sequence = "AAGCAGTGGTATCAACGCAGAGTACATGGGAAGCAAGGAGGCAAGGCCCGCGCCAAGGCCAAGTCGCGGTCTTCCCGGGCCCGGGCTACCTATTCCCGGTGGGGCGTGTGCACCGGCTGCTGCGCAAGGGCAACTACGCGGAGCGTGTGGGCGCCGGCGCGCCGGTATACATGGCGGCGGTGCTGGAGTACCTAACGGCCGAGATCCTGGAGCTGGCGGGCAACGCGGCCCGCGACAACAAGAAGACTCGCATCATCCCGCGCCACCTGCAGCTGGCCATCCGCAACGACGAGGAGCTCAACAAGCTGCTGGGCAAAGTGACGATCGCGCAGGGCGGCGTCCTGCCCAACATCCAGGCCGTGCTGCTGCCCAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATCAAGGAGGCTCGCTAACCACTCCTAAGAGATCGGAAGAGCGTCGTGTA" + + result = extractor.find_barcode_umi(read_id, sequence) + + assert result.read_id == read_id + # Should detect in reverse orientation (TSO at start means polyA/polyT would be internal) + + def test_no_match_random_sequence(self): + """Test detection with random sequence (no patterns).""" + structure = create_simple_structure() + extractor = UniversalSingleMoleculeExtractor(structure) + + read_id = "RANDOM" + sequence = "ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT" * 5 + + result = extractor.find_barcode_umi(read_id, sequence) + + assert result.read_id == read_id + # May or may not detect anything, but should not crash + + +class TestMultipleBarcodeNaming: + """Test barcode/UMI identification with various naming conventions.""" + + def test_barcode_prefix_lowercase(self): + """Test 'barcode' prefix detection (lowercase).""" + mdf_content = """barcode1:cDNA +barcode1\tVAR_LIST\tAAAA,CCCC,GGGG +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + detected = {"barcode1": DetectedElement(0, 3, 10, "AAAA")} + + result = ExtractionResult(structure, "read_001", "+", detected) + + assert result.get_barcode() == "AAAA" + assert "barcode1" in result._barcode_elements + + def test_barcode_prefix_mixedcase(self): + """Test 'Barcode' prefix detection (mixed case).""" + mdf_content = """Barcode:cDNA +Barcode\tVAR_LIST\tAAAA,CCCC,GGGG +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + detected = {"Barcode": DetectedElement(0, 3, 10, "CCCC")} + + result = ExtractionResult(structure, "read_001", "+", detected) + + assert result.get_barcode() == "CCCC" + + def test_umi_prefix_lowercase(self): + """Test 'umi' prefix detection (lowercase).""" + mdf_content = """umi:cDNA +umi\tVAR_ANY\t10 +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + detected = {"umi": DetectedElement(0, 9, 0, "ACGTACGTAC")} + + result = ExtractionResult(structure, "read_001", "+", detected) + + assert result.get_umi() == "ACGTACGTAC" + assert "umi" in result._umi_elements + + def test_multiple_barcodes_concatenated(self): + """Test multiple barcode elements are concatenated with '|'.""" + mdf_content = """Barcode1:Barcode2:cDNA +Barcode1\tVAR_LIST\tAAAA,CCCC +Barcode2\tVAR_LIST\tGGGG,TTTT +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + detected = { + "Barcode1": DetectedElement(0, 3, 10, "AAAA"), + "Barcode2": DetectedElement(4, 7, 10, "GGGG") + } + + result = ExtractionResult(structure, "read_001", "+", detected) + + assert result.get_barcode() == "AAAA|GGGG" + + def test_multiple_umis_concatenated(self): + """Test multiple UMI elements are concatenated with '|'.""" + mdf_content = """UMI1:UMI2:cDNA +UMI1\tVAR_ANY\t6 +UMI2\tVAR_ANY\t6 +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + detected = { + "UMI1": DetectedElement(0, 5, 0, "AAAAAA"), + "UMI2": DetectedElement(6, 11, 0, "CCCCCC") + } + + result = ExtractionResult(structure, "read_001", "+", detected) + + assert result.get_umi() == "AAAAAA|CCCCCC" + + +class TestIntegration: + """Integration tests with test data files.""" + + def test_process_test_reads_file(self): + """Test processing reads from test_reads.fq.""" + structure = create_10x_structure() + extractor = UniversalSingleMoleculeExtractor(structure) + + # Load test reads + test_file = os.path.join(TEST_DATA_DIR, "test_reads.fq") + reads = [] + with open(test_file) as f: + lines = f.readlines() + for i in range(0, len(lines), 4): + if i + 1 < len(lines): + read_id = lines[i].strip()[1:].split()[0] # Remove @ and take first part + sequence = lines[i + 1].strip() + reads.append((read_id, sequence)) + + # Process each read + results = [] + for read_id, sequence in reads: + result = extractor.find_barcode_umi(read_id, sequence) + results.append(result) + + # Should process all reads without error + assert len(results) == len(reads) + + # Check results are ExtractionResult instances + for result in results: + assert isinstance(result, ExtractionResult) + assert result.read_id in [r[0] for r in reads] + + +if __name__ == '__main__': + pytest.main([__file__, '-v']) From 6d4543c1d2b2b960de9844ad702967683068b8c3 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Thu, 29 Jan 2026 10:57:55 +0200 Subject: [PATCH 07/41] factor out element knowledge to MoleculeStructure --- .../callers/extraction_result.py | 21 ++-- .../callers/molecule_structure.py | 50 ++++++++- tests/test_molecule_structure.py | 105 ++++++++++++++++++ 3 files changed, 162 insertions(+), 14 deletions(-) diff --git a/src/barcode_calling/callers/extraction_result.py b/src/barcode_calling/callers/extraction_result.py index 94015a84..14982cb4 100644 --- a/src/barcode_calling/callers/extraction_result.py +++ b/src/barcode_calling/callers/extraction_result.py @@ -62,19 +62,16 @@ def __init__(self, molecule_structure: MoleculeStructure, read_id: str, strand: self.read_id = read_id self.strand = strand self.detected_results = detected_results - # Cache barcode/UMI element names for efficiency - self._barcode_elements: List[str] = [] - self._umi_elements: List[str] = [] - self._identify_barcode_umi_elements() - def _identify_barcode_umi_elements(self) -> None: - """Identify barcode/UMI elements by naming convention.""" - for el in self.molecule_structure: - name_lower = el.element_name.lower() - if name_lower.startswith("barcode"): - self._barcode_elements.append(el.element_name) - elif name_lower.startswith("umi"): - self._umi_elements.append(el.element_name) + @property + def _barcode_elements(self) -> List[str]: + """Get barcode element names from molecule structure.""" + return self.molecule_structure.barcode_elements + + @property + def _umi_elements(self) -> List[str]: + """Get UMI element names from molecule structure.""" + return self.molecule_structure.umi_elements def get_barcode(self) -> str: """ diff --git a/src/barcode_calling/callers/molecule_structure.py b/src/barcode_calling/callers/molecule_structure.py index 94c26db0..4b9157b9 100644 --- a/src/barcode_calling/callers/molecule_structure.py +++ b/src/barcode_calling/callers/molecule_structure.py @@ -7,7 +7,7 @@ import logging import sys from enum import unique, Enum -from typing import Iterator +from typing import Iterator, List from ...error_codes import IsoQuantExitCode from ..common import load_barcodes @@ -87,13 +87,25 @@ def __init__(self, element_name: str, element_type: ElementType, element_value_1 class MoleculeStructure: + """ + Defines the structure of a molecule for barcode/UMI extraction. + + Parses molecule definition files (MDF) and identifies which elements + are barcodes and UMIs based on naming convention: + - Elements with prefix "barcode" (case-insensitive) are barcodes + - Elements with prefix "umi" (case-insensitive) are UMIs + """ + CONCAT_DELIM = "|" DUPL_DELIM = "/" def __init__(self, str_iterator): - self.ordered_elements: list[MoleculeElement] = [] + self.ordered_elements: List[MoleculeElement] = [] self.elements_to_concatenate = {} self.duplicated_elements = {} + # Barcode/UMI element names identified by naming convention + self._barcode_elements: List[str] = [] + self._umi_elements: List[str] = [] l = next(str_iterator) elements = list(map(lambda x: x.strip(), l.strip().split(':'))) @@ -141,10 +153,44 @@ def __init__(self, str_iterator): sys.exit(IsoQuantExitCode.INVALID_FILE_FORMAT) self.duplicated_elements[el] = v[0] + self._identify_barcode_umi_elements() + + def _identify_barcode_umi_elements(self) -> None: + """ + Identify barcode/UMI elements by naming convention. + + Elements with prefix "barcode" (case-insensitive) are barcodes. + Elements with prefix "umi" (case-insensitive) are UMIs. + """ + self._barcode_elements = [] + self._umi_elements = [] + for el in self.ordered_elements: + name_lower = el.element_name.lower() + if name_lower.startswith("barcode"): + self._barcode_elements.append(el.element_name) + elif name_lower.startswith("umi"): + self._umi_elements.append(el.element_name) + + @property + def barcode_elements(self) -> List[str]: + """Get list of barcode element names (identified by 'barcode' prefix).""" + return self._barcode_elements + + @property + def umi_elements(self) -> List[str]: + """Get list of UMI element names (identified by 'umi' prefix).""" + return self._umi_elements + @classmethod def from_element_list(cls, element_list): + """Create MoleculeStructure from a list of MoleculeElement objects.""" ms = cls.__new__(cls) ms.ordered_elements = element_list + ms.elements_to_concatenate = {} + ms.duplicated_elements = {} + ms._barcode_elements = [] + ms._umi_elements = [] + ms._identify_barcode_umi_elements() return ms def __iter__(self) -> Iterator[MoleculeElement]: diff --git a/tests/test_molecule_structure.py b/tests/test_molecule_structure.py index 097dce42..46694016 100644 --- a/tests/test_molecule_structure.py +++ b/tests/test_molecule_structure.py @@ -312,6 +312,111 @@ def test_dupl_delimiter(self): assert MoleculeStructure.DUPL_DELIM == "/" +class TestMoleculeStructureBarcodeUMI: + """Test barcode/UMI element identification.""" + + def test_barcode_elements_single(self): + """Test single barcode element identification.""" + mdf_content = """Barcode:UMI:cDNA +Barcode\tVAR_LIST\tAAAA,CCCC +UMI\tVAR_ANY\t10 +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + assert structure.barcode_elements == ["Barcode"] + + def test_umi_elements_single(self): + """Test single UMI element identification.""" + mdf_content = """Barcode:UMI:cDNA +Barcode\tVAR_LIST\tAAAA,CCCC +UMI\tVAR_ANY\t10 +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + assert structure.umi_elements == ["UMI"] + + def test_barcode_elements_multiple(self): + """Test multiple barcode elements identification.""" + mdf_content = """barcode1:barcode2:UMI:cDNA +barcode1\tVAR_LIST\tAAAA,CCCC +barcode2\tVAR_LIST\tGGGG,TTTT +UMI\tVAR_ANY\t10 +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + assert structure.barcode_elements == ["barcode1", "barcode2"] + + def test_umi_elements_multiple(self): + """Test multiple UMI elements identification.""" + mdf_content = """Barcode:UMI1:UMI2:cDNA +Barcode\tVAR_LIST\tAAAA,CCCC +UMI1\tVAR_ANY\t6 +UMI2\tVAR_ANY\t6 +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + assert structure.umi_elements == ["UMI1", "UMI2"] + + def test_barcode_case_insensitive(self): + """Test that barcode prefix is case-insensitive.""" + mdf_content = """BARCODE:Barcode2:barcode3:cDNA +BARCODE\tVAR_LIST\tAAAA,CCCC +Barcode2\tVAR_LIST\tGGGG,TTTT +barcode3\tVAR_LIST\tACGT,TGCA +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + assert len(structure.barcode_elements) == 3 + assert "BARCODE" in structure.barcode_elements + assert "Barcode2" in structure.barcode_elements + assert "barcode3" in structure.barcode_elements + + def test_umi_case_insensitive(self): + """Test that UMI prefix is case-insensitive.""" + mdf_content = """Barcode:UMI:umi2:cDNA +Barcode\tVAR_LIST\tAAAA,CCCC +UMI\tVAR_ANY\t6 +umi2\tVAR_ANY\t6 +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + assert len(structure.umi_elements) == 2 + assert "UMI" in structure.umi_elements + assert "umi2" in structure.umi_elements + + def test_no_barcode_elements(self): + """Test structure with no barcode elements.""" + mdf_content = """R1:UMI:cDNA +R1\tCONST\tACGT +UMI\tVAR_ANY\t10 +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + assert structure.barcode_elements == [] + + def test_no_umi_elements(self): + """Test structure with no UMI elements.""" + mdf_content = """Barcode:cDNA +Barcode\tVAR_LIST\tAAAA,CCCC +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + assert structure.umi_elements == [] + + def test_from_element_list_identifies_barcodes(self): + """Test that from_element_list also identifies barcode/UMI elements.""" + elements = [ + MoleculeElement("Barcode", ElementType.VAR_LIST, "AAAA,CCCC"), + MoleculeElement("UMI", ElementType.VAR_ANY, "10"), + MoleculeElement("cDNA", ElementType.cDNA) + ] + + structure = MoleculeStructure.from_element_list(elements) + + assert structure.barcode_elements == ["Barcode"] + assert structure.umi_elements == ["UMI"] + + class TestMoleculeStructureEdgeCases: """Test edge cases and error handling.""" From 998b9de5bb704009e6c616a7c7235c8f17b2952c Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Thu, 29 Jan 2026 21:27:41 +0200 Subject: [PATCH 08/41] fix parallel version --- detect_barcodes.py | 25 ++++++------------- .../callers/extraction_result.py | 9 +++++-- .../callers/molecule_structure.py | 4 +++ .../callers/universal_extraction.py | 7 ++++-- tests/test_universal_extraction.py | 8 +++--- 5 files changed, 28 insertions(+), 25 deletions(-) diff --git a/detect_barcodes.py b/detect_barcodes.py index b3ee45d5..f90c985b 100755 --- a/detect_barcodes.py +++ b/detect_barcodes.py @@ -247,13 +247,12 @@ def bam_file_chunk_reader(handler): yield current_chunk -def process_chunk(barcode_detector, read_chunk, output_file, num, out_fasta=None, min_score=None): +def process_chunk(barcode_detector, read_chunk, output_file, num, out_fasta=None): output_file += "_" + str(num) if out_fasta: out_fasta += "_" + str(num) counter = 0 - if min_score: - barcode_detector.min_score = min_score + barcode_caller = BarcodeCaller(output_file, barcode_detector, output_sequences=out_fasta) counter += barcode_caller.process_chunk(read_chunk) read_chunk.clear() @@ -325,7 +324,7 @@ def process_single_thread(args): logger.info("Finished barcode calling") -def _process_single_file_in_parallel(input_file, output_tsv, out_fasta, args, barcodes): +def _process_single_file_in_parallel(input_file, output_tsv, out_fasta, args, barcode_detector): """Process a single file in parallel (internal helper function).""" logger.info("Processing " + input_file) fname, outer_ext = os.path.splitext(os.path.basename(input_file)) @@ -355,13 +354,6 @@ def _process_single_file_in_parallel(input_file, output_tsv, out_fasta, args, ba tmp_dir = os.path.join(args.tmp_dir, tmp_dir) os.makedirs(tmp_dir) - barcode_detector = BARCODE_CALLING_MODES[args.mode](barcodes) - logger.info("Barcode caller created") - - min_score = None - if args.min_score: - min_score = args.min_score - tmp_barcode_file = os.path.join(tmp_dir, "bc") tmp_fasta_file = os.path.join(tmp_dir, "subreads") if out_fasta else None chunk_counter = 0 @@ -382,8 +374,7 @@ def _process_single_file_in_parallel(input_file, output_tsv, out_fasta, args, ba chunk, tmp_barcode_file, chunk_counter, - tmp_fasta_file, - min_score)) + tmp_fasta_file)) chunk_counter += 1 if chunk_counter >= args.threads: break @@ -418,7 +409,7 @@ def _process_single_file_in_parallel(input_file, output_tsv, out_fasta, args, ba with open(output_tsv, "w") as final_output_tsv: final_output_fasta = open(out_fasta, "w") if out_fasta else None - header = BARCODE_CALLING_MODES[args.mode].header() + header = barcode_detector.header() final_output_tsv.write(header + "\n") stat_dict = defaultdict(int) for tmp_file, tmp_fasta in output_files: @@ -446,14 +437,12 @@ def process_in_parallel(args): # args.input, args.output_tsv are always lists # args.out_fasta is a list or None out_fastas = args.out_fasta if args.out_fasta else [None] * len(args.input) - - # Prepare barcodes once for all files - barcodes = create_barcode_caller(args) + barcode_detector = create_barcode_caller(args) for idx, (input_file, output_tsv, out_fasta) in enumerate(zip(args.input, args.output_tsv, out_fastas)): if len(args.input) > 1: logger.info("Processing file %d/%d: %s" % (idx + 1, len(args.input), input_file)) - _process_single_file_in_parallel(input_file, output_tsv, out_fasta, args, barcodes) + _process_single_file_in_parallel(input_file, output_tsv, out_fasta, args, barcode_detector) logger.info("Finished barcode calling") diff --git a/src/barcode_calling/callers/extraction_result.py b/src/barcode_calling/callers/extraction_result.py index 14982cb4..82f91af4 100644 --- a/src/barcode_calling/callers/extraction_result.py +++ b/src/barcode_calling/callers/extraction_result.py @@ -172,12 +172,17 @@ def more_informative_than(self, other: 'ExtractionResult') -> bool: def get_additional_attributes(self) -> List[str]: """ - Get list of detected additional features. + Get list of detected supplementary features (excluding barcode/UMI). + + Barcode and UMI elements are excluded because they are already + tracked by ReadStats.bc_count and ReadStats.umi_count respectively. Returns: List of detected element names with " detected" suffix. """ - return ["%s detected" % el_name for el_name, d in self.detected_results.items() if d.start != -1] + barcode_umi_set = set(self._barcode_elements + self._umi_elements) + return ["%s detected" % el_name for el_name, d in self.detected_results.items() + if d.start != -1 and el_name not in barcode_umi_set] def set_strand(self, strand: str) -> None: """Set the detected strand.""" diff --git a/src/barcode_calling/callers/molecule_structure.py b/src/barcode_calling/callers/molecule_structure.py index 4b9157b9..1e1e95f8 100644 --- a/src/barcode_calling/callers/molecule_structure.py +++ b/src/barcode_calling/callers/molecule_structure.py @@ -41,6 +41,10 @@ def is_variable(self): def is_constant(self): return self == ElementType.CONST + def needs_only_coordinates(self): + """Return True if element only needs start/end coordinates (no sequence extraction).""" + return self.is_constant() + class MoleculeElement: def __init__(self, element_name: str, element_type: ElementType, element_value_1 = None, element_value_2 = None): diff --git a/src/barcode_calling/callers/universal_extraction.py b/src/barcode_calling/callers/universal_extraction.py index 6ef64866..8201681e 100644 --- a/src/barcode_calling/callers/universal_extraction.py +++ b/src/barcode_calling/callers/universal_extraction.py @@ -52,7 +52,7 @@ def __init__(self, molecule_structure: MoleculeStructure): max(6, int(el.element_length / 2) - 2)) elif el.element_type.needs_sequence_extraction(): self.elements_to_extract.add(el.element_name) - elif el.element_type.needs_sequence_extraction(): + elif el.element_type.needs_correction(): if self.correct_sequences: self.index_dict[el.element_name] = self.prepare_barcode_index(el) # FIXME: min score should be based on the length of the barcode and their sparcity @@ -78,7 +78,10 @@ def result_type(self): return ExtractionResult def header(self): - return self.molecule_structure.header() + """Get header matching ExtractionResult output format.""" + # Create a dummy result to get the correct header format + dummy_result = ExtractionResult(self.molecule_structure, "", ".", {}) + return dummy_result.header() def find_barcode_umi(self, read_id, sequence): detected_elements_fwd = self._find_patterns_fwd(read_id, sequence) diff --git a/tests/test_universal_extraction.py b/tests/test_universal_extraction.py index 841d409c..58cad5f7 100644 --- a/tests/test_universal_extraction.py +++ b/tests/test_universal_extraction.py @@ -370,15 +370,17 @@ def test_header_format(self): assert "strand" in header def test_get_additional_attributes(self): - """Test getting additional attributes.""" + """Test getting additional attributes excludes barcode/UMI elements.""" structure = create_simple_structure() detected = create_detected_elements() result = ExtractionResult(structure, "read_001", "+", detected) attrs = result.get_additional_attributes() - assert "Barcode detected" in attrs - assert "UMI detected" in attrs + # Barcode and UMI are excluded (tracked separately by ReadStats) + assert "Barcode detected" not in attrs + assert "UMI detected" not in attrs + # Supplementary elements are included assert "PolyT detected" in attrs def test_noseq_constant(self): From 8f2fd3d7a81e94640d2d66d666b03204b772a683 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Thu, 29 Jan 2026 21:46:36 +0200 Subject: [PATCH 09/41] fix headers --- detect_barcodes.py | 3 +-- src/barcode_calling/callers/curio.py | 4 ++-- .../callers/molecule_structure.py | 19 ------------------- src/barcode_calling/callers/tenx.py | 4 ++-- 4 files changed, 5 insertions(+), 25 deletions(-) diff --git a/detect_barcodes.py b/detect_barcodes.py index f90c985b..2a8f88fe 100755 --- a/detect_barcodes.py +++ b/detect_barcodes.py @@ -401,8 +401,7 @@ def _process_single_file_in_parallel(input_file, output_tsv, out_fasta, args, ba chunk, tmp_barcode_file, chunk_counter, - tmp_fasta_file, - min_score)) + tmp_fasta_file)) chunk_counter += 1 except StopIteration: reads_left = False diff --git a/src/barcode_calling/callers/curio.py b/src/barcode_calling/callers/curio.py index 3335060c..feb2edc4 100644 --- a/src/barcode_calling/callers/curio.py +++ b/src/barcode_calling/callers/curio.py @@ -194,7 +194,7 @@ def result_type(): return LinkerBarcodeDetectionResult def header(self): - return self.result_type().header() + return self.result_type()._header() class CurioIlluminaDetector: @@ -334,4 +334,4 @@ def result_type(): return LinkerBarcodeDetectionResult def header(self): - return self.result_type().header() \ No newline at end of file + return self.result_type()._header() \ No newline at end of file diff --git a/src/barcode_calling/callers/molecule_structure.py b/src/barcode_calling/callers/molecule_structure.py index 1e1e95f8..7ca44534 100644 --- a/src/barcode_calling/callers/molecule_structure.py +++ b/src/barcode_calling/callers/molecule_structure.py @@ -201,22 +201,3 @@ def __iter__(self) -> Iterator[MoleculeElement]: for e in self.ordered_elements: yield e - def header(self): - printed_elements = set() - header = "#read_id\tstrand" - for el in self.ordered_elements: - if el.element_type == ElementType.cDNA: continue - if el.element_type == ElementType.PolyT: - header += "\tpolyT_start\tpolyT_end" - elif el.element_type.is_constant(): - header += "\t%s_start\t%s_end" % (el.element_name, el.element_name) - elif el.element_type.needs_sequence_extraction() or el.element_type.needs_correction(): - if el.element_name in self.elements_to_concatenate.keys(): - if self.elements_to_concatenate[el.element_name] in printed_elements: - continue - printed_elements.add(self.elements_to_concatenate[el.element_name]) - header += "\t%s_start\t%s_end\t%s_sequence" % (el.element_name, el.element_name, - self.elements_to_concatenate[el.element_name]) - else: - header += "\t%s_start\t%s_end\t%s_sequence" % (el.element_name, el.element_name, el.element_name) - return header diff --git a/src/barcode_calling/callers/tenx.py b/src/barcode_calling/callers/tenx.py index 35bbc839..e506b63d 100644 --- a/src/barcode_calling/callers/tenx.py +++ b/src/barcode_calling/callers/tenx.py @@ -176,7 +176,7 @@ def result_type(): return TenXBarcodeDetectionResult def header(self): - return self.result_type().header() + return self.result_type()._header() class VisiumHDBarcodeDetector: """Visium HD spatial transcriptomics barcode detector.""" @@ -348,4 +348,4 @@ def result_type(): return TenXBarcodeDetectionResult def header(self): - return self.result_type().header() \ No newline at end of file + return self.result_type()._header() \ No newline at end of file From e480859d73a1f2472c93ef61bf753fc8dd5846f0 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Thu, 29 Jan 2026 22:04:09 +0200 Subject: [PATCH 10/41] fix const element detection --- .../callers/universal_extraction.py | 4 ++- src/barcode_calling/indexers/base.py | 29 ++++++++++++++----- 2 files changed, 25 insertions(+), 8 deletions(-) diff --git a/src/barcode_calling/callers/universal_extraction.py b/src/barcode_calling/callers/universal_extraction.py index 8201681e..5fe83017 100644 --- a/src/barcode_calling/callers/universal_extraction.py +++ b/src/barcode_calling/callers/universal_extraction.py @@ -344,7 +344,8 @@ def detect_const_elements(self, sequence, polyt_start, polyt_end, detected_eleme search_start = current_search_start search_end = len(sequence) if polyt_start == -1 else polyt_start + 1 - element_occurrences = self.index_dict[el.element_name].get_occurrences_substr(sequence, search_start, + element_occurrences = self.index_dict[el.element_name].get_occurrences_substr(sequence, + search_start, search_end) if not first_element_detected: min_score = int(el.element_length * self.MIN_SCORE_COEFF_TERMMINAL) @@ -362,6 +363,7 @@ def detect_const_elements(self, sequence, polyt_start, polyt_end, detected_eleme start_delta=start_delta, end_delta=self.TERMINAL_MATCH_DELTA) + if element_start is not None: current_search_start = element_end if not first_element_detected: diff --git a/src/barcode_calling/indexers/base.py b/src/barcode_calling/indexers/base.py index 32a01f6f..5e3ba232 100644 --- a/src/barcode_calling/indexers/base.py +++ b/src/barcode_calling/indexers/base.py @@ -118,9 +118,23 @@ def _get_kmers_substr(self, seq, start, end): yield kmer # [start, end] - def get_occurrences_substr(self, sequence, start, end, max_hits=0, min_kmers=1, hits_delta=1): - barcode_counts = defaultdict(int) - barcode_positions = defaultdict(list) + def get_occurrences_substr(self, sequence, start, end, max_hits=0, min_kmers=1, hits_delta=1) -> List[Tuple[str, int, List[int]]]: + """ + Find indexed sequences with shared k-mers. + + Args: + sequence: Query sequence to search + max_hits: Maximum number of results (0 = unlimited) + min_kmers: Minimum shared k-mers required + hits_delta: Include results within this many k-mers of top hit + ignore_equal: Skip exact matches + + Returns: + List of (sequence, shared_kmer_count, kmer_positions) tuples, + sorted by shared k-mer count (descending) + """ + barcode_counts: DefaultDict[int, int] = defaultdict(int) + barcode_positions: DefaultDict[int, List[int]] = defaultdict(list) for pos, kmer in enumerate(self._get_kmers_substr(sequence, start, end)): for i in self.index[kmer]: @@ -135,15 +149,16 @@ def get_occurrences_substr(self, sequence, start, end, max_hits=0, min_kmers=1, result.append((self.seq_list[i], count, barcode_positions[i])) if not result: - return {} + return [] top_hits = max(result, key=lambda x: x[1])[1] result = filter(lambda x: x[1] >= top_hits - hits_delta, result) - result = sorted(result, reverse=True, key=lambda x: x[1]) + result = list(sorted(result, reverse=True, key=lambda x: x[1])) if max_hits == 0: - return {x[0]:x for x in result} - return {x[0]:x for x in list(result)[:max_hits]} + return result + return result[:max_hits] + class ArrayKmerIndexer: """ From 6cf053f79eda694a12ca147824a1aced56ff13d3 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Thu, 29 Jan 2026 22:05:38 +0200 Subject: [PATCH 11/41] remove obsolete test --- tests/test_molecule_structure.py | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/tests/test_molecule_structure.py b/tests/test_molecule_structure.py index 46694016..ed5af7b5 100644 --- a/tests/test_molecule_structure.py +++ b/tests/test_molecule_structure.py @@ -266,30 +266,6 @@ def test_iteration(self): names = [el.element_name for el in structure] assert names == ["Barcode", "UMI", "cDNA"] - def test_header_generation(self): - """Test TSV header generation.""" - mdf_content = """R1:Barcode:UMI:PolyT:cDNA -R1\tCONST\tCTACACGACGCTCTTCCGATCT -Barcode\tVAR_LIST\tAAAA,CCCC,GGGG -UMI\tVAR_ANY\t12 -""" - structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) - - header = structure.header() - - assert "#read_id" in header - assert "strand" in header - assert "R1_start" in header - assert "R1_end" in header - assert "Barcode_start" in header - assert "Barcode_end" in header - assert "Barcode_sequence" in header - assert "UMI_start" in header - assert "UMI_end" in header - assert "UMI_sequence" in header - assert "polyT_start" in header - assert "polyT_end" in header - def test_from_element_list(self): """Test creating structure from element list.""" elements = [ From 1c5eb36c9bfbf2d7906ad066fa9df1846c2f99ae Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Thu, 29 Jan 2026 22:16:53 +0200 Subject: [PATCH 12/41] do not allow exact positions to go out of bounds --- src/barcode_calling/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/barcode_calling/common.py b/src/barcode_calling/common.py index d2b2fbe4..431d748c 100644 --- a/src/barcode_calling/common.py +++ b/src/barcode_calling/common.py @@ -228,7 +228,7 @@ def detect_exact_positions(sequence, start, end, kmer_size, pattern, pattern_occ return None, None leftover_bases = len(pattern) - pattern_end - 1 skipped_bases = pattern_start - return start_pos - skipped_bases, end_pos + leftover_bases + return max(0, start_pos - skipped_bases), min(end_pos + leftover_bases, len(sequence) - 1) def detect_first_exact_positions(sequence, start, end, kmer_size, pattern, pattern_occurrences: list, From 8f101f292c2f80625248075c9a93b6374ece5530 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Thu, 29 Jan 2026 22:37:19 +0200 Subject: [PATCH 13/41] factor out element correction --- .../callers/universal_extraction.py | 85 +++++++------------ 1 file changed, 31 insertions(+), 54 deletions(-) diff --git a/src/barcode_calling/callers/universal_extraction.py b/src/barcode_calling/callers/universal_extraction.py index 5fe83017..0ce76639 100644 --- a/src/barcode_calling/callers/universal_extraction.py +++ b/src/barcode_calling/callers/universal_extraction.py @@ -119,6 +119,20 @@ def _find_patterns_fwd(self, read_id, sequence): def concatenate_elements(self, detected_results): pass + def correct_element(self, element: MoleculeElement, sequence: str, potential_start: int, potential_end: int) -> DetectedElement: + potential_seq = sequence[potential_start:potential_end + 1] + matching_sequences = self.index_dict[element.element_name].get_occurrences(potential_seq) + corrected_seq, seq_score, seq_start, seq_end = \ + find_candidate_with_max_score_ssw(matching_sequences, potential_seq, + min_score=self.min_scores[element.element_name]) + + if corrected_seq is not None: + read_seq_start = potential_start + seq_start + read_seq_end = potential_start + seq_end - 1 + return DetectedElement(read_seq_start, read_seq_end, seq_score, corrected_seq) + else: + return DetectedElement(-1, -1, -1, ExtractionResult.NOSEQ) + def extract_variable_elements5(self, detected_elements, sequence): first_detected_const_element = None for i, el in enumerate(self.molecule_structure): @@ -153,19 +167,10 @@ def extract_variable_elements5(self, detected_elements, sequence): seq=sequence[potential_start:potential_end + 1]) elif el.element_name in self.elements_to_correct: - potential_seq = sequence[potential_start:potential_end + 1] - matching_sequences = self.index_dict[el.element_name].get_occurrences(potential_seq) - corrected_seq, seq_score, seq_start, seq_end = \ - find_candidate_with_max_score_ssw(matching_sequences, potential_seq, min_score=self.min_scores[el.element_name]) - - if corrected_seq is not None: - read_seq_start = potential_start + seq_start - read_seq_end = potential_start + seq_end - 1 - current_pos = read_seq_start - 1 - detected_elements[el.element_name] = DetectedElement(read_seq_start, read_seq_end, seq_score, - corrected_seq) - else: - detected_elements[el.element_name] = DetectedElement(-1, -1, -1, ExtractionResult.NOSEQ) + detected_element = self.correct_element(el, sequence, potential_start, potential_end) + if detected_element.start != -1: + current_pos = detected_element.start - 1 + detected_elements[el.element_name] = detected_element current_pos = detected_elements[ self.molecule_structure.ordered_elements[first_detected_const_element].element_name].end + 1 @@ -207,20 +212,10 @@ def extract_variable_elements5(self, detected_elements, sequence): potential_start:potential_end + 1]) elif el.element_name in self.elements_to_correct: - potential_seq = sequence[potential_start:potential_end + 1] - matching_sequences = self.index_dict[el.element_name].get_occurrences(potential_seq) - corrected_seq, seq_score, seq_start, seq_end = \ - find_candidate_with_max_score_ssw(matching_sequences, potential_seq, - min_score=self.min_scores[el.element_name]) - - if corrected_seq is not None: - read_seq_start = potential_start + seq_start - read_seq_end = potential_start + seq_end - 1 - current_pos = read_seq_end + 1 - detected_elements[el.element_name] = DetectedElement(read_seq_start, read_seq_end, seq_score, - corrected_seq) - else: - detected_elements[el.element_name] = DetectedElement(-1, -1, -1, ExtractionResult.NOSEQ) + detected_element = self.correct_element(el, sequence, potential_start, potential_end) + if detected_element.end != -1: + current_pos = detected_element.end + 1 + detected_elements[el.element_name] = detected_element else: detected_elements[el.element_name] = DetectedElement(-1, -1, -1, ExtractionResult.NOSEQ) @@ -260,19 +255,10 @@ def extract_variable_elements3(self, detected_elements, sequence): seq=sequence[potential_start:potential_end + 1]) elif el.element_name in self.elements_to_correct: - potential_seq = sequence[potential_start:potential_end + 1] - matching_sequences = self.index_dict[el.element_name].get_occurrences(potential_seq) - corrected_seq, seq_score, seq_start, seq_end = \ - find_candidate_with_max_score_ssw(matching_sequences, potential_seq, min_score=self.min_scores[el.element_name]) - - if corrected_seq is not None: - read_seq_start = potential_start + seq_start - read_seq_end = potential_start + seq_end - 1 - current_pos = read_seq_end + 1 - detected_elements[el.element_name] = DetectedElement(read_seq_start, read_seq_end, seq_score, - corrected_seq) - else: - detected_elements[el.element_name] = DetectedElement(-1, -1, -1, ExtractionResult.NOSEQ) + detected_element = self.correct_element(el, sequence, potential_start, potential_end) + if detected_element.end != -1: + current_pos = detected_element.end + 1 + detected_elements[el.element_name] = detected_element current_pos = detected_elements[ self.molecule_structure.ordered_elements[last_detected_const_element].element_name].start - 1 @@ -314,20 +300,11 @@ def extract_variable_elements3(self, detected_elements, sequence): potential_start:potential_end + 1]) elif el.element_name in self.elements_to_correct: - potential_seq = sequence[potential_start:potential_end + 1] - matching_sequences = self.index_dict[el.element_name].get_occurrences(potential_seq) - corrected_seq, seq_score, seq_start, seq_end = \ - find_candidate_with_max_score_ssw(matching_sequences, potential_seq, - min_score=self.min_scores[el.element_name]) - - if corrected_seq is not None: - read_seq_start = potential_start + seq_start - read_seq_end = potential_start + seq_end - 1 - current_pos = read_seq_start - 1 - detected_elements[el.element_name] = DetectedElement(read_seq_start, read_seq_end, seq_score, - corrected_seq) - else: - detected_elements[el.element_name] = DetectedElement(-1, -1, -1, ExtractionResult.NOSEQ) + detected_element = self.correct_element(el, sequence, potential_start, potential_end) + if detected_element.start != -1: + current_pos = detected_element.start - 1 + detected_elements[el.element_name] = detected_element + else: detected_elements[el.element_name] = DetectedElement(-1, -1, -1, ExtractionResult.NOSEQ) From c2a63d7fdf973f51e42da00f1d7e34df1720165e Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Fri, 30 Jan 2026 14:36:27 +0200 Subject: [PATCH 14/41] add log errors for non-variable concat and dupl elements --- src/barcode_calling/callers/molecule_structure.py | 6 ++++++ src/barcode_calling/callers/universal_extraction.py | 2 ++ 2 files changed, 8 insertions(+) diff --git a/src/barcode_calling/callers/molecule_structure.py b/src/barcode_calling/callers/molecule_structure.py index 7ca44534..ff5ba04f 100644 --- a/src/barcode_calling/callers/molecule_structure.py +++ b/src/barcode_calling/callers/molecule_structure.py @@ -145,12 +145,18 @@ def __init__(self, str_iterator): self.ordered_elements.append(MoleculeElement(el, element_type, element_val1, element_val2)) if self.CONCAT_DELIM in el: + if not (element_type.needs_sequence_extraction() or element_type.needs_correction()): + logger.critical("Concatenated elements must be variable (fix element %s)" % el) + sys.exit(IsoQuantExitCode.INVALID_FILE_FORMAT) v = el.split(self.CONCAT_DELIM) if len(v) != 2: logger.critical("Incorrect concatenated element %s" % el) sys.exit(IsoQuantExitCode.INVALID_FILE_FORMAT) self.elements_to_concatenate[el] = v[0] elif self.DUPL_DELIM in el: + if not (element_type.needs_sequence_extraction() or element_type.needs_correction()): + logger.critical("Concatenated elements must be variable (fix element %s)" % el) + sys.exit(IsoQuantExitCode.INVALID_FILE_FORMAT) v = el.split(self.DUPL_DELIM) if len(v) != 2: logger.critical("Incorrect duplicated element %s" % el) diff --git a/src/barcode_calling/callers/universal_extraction.py b/src/barcode_calling/callers/universal_extraction.py index 0ce76639..51194469 100644 --- a/src/barcode_calling/callers/universal_extraction.py +++ b/src/barcode_calling/callers/universal_extraction.py @@ -163,6 +163,8 @@ def extract_variable_elements5(self, detected_elements, sequence): current_pos = potential_start - 1 if el.element_name in self.elements_to_extract: + # TODO: for concatenated elements, store them in a separate dict and extract/correct them only when all of them are found + # TODO: more or less the same for duplicated elements, except the procedure is different detected_elements[el.element_name] = DetectedElement(potential_start, potential_end, 0, seq=sequence[potential_start:potential_end + 1]) From 26a9c0212eec75518e15bbb794cb5d33e56fac22 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Fri, 30 Jan 2026 16:32:02 +0100 Subject: [PATCH 15/41] implement collection of duplicated and concatenated elements, make code simpler --- .../callers/molecule_structure.py | 31 ++- .../callers/universal_extraction.py | 182 ++++++++++-------- 2 files changed, 130 insertions(+), 83 deletions(-) diff --git a/src/barcode_calling/callers/molecule_structure.py b/src/barcode_calling/callers/molecule_structure.py index ff5ba04f..ff7e1cb5 100644 --- a/src/barcode_calling/callers/molecule_structure.py +++ b/src/barcode_calling/callers/molecule_structure.py @@ -8,6 +8,7 @@ import sys from enum import unique, Enum from typing import Iterator, List +from collections import defaultdict from ...error_codes import IsoQuantExitCode from ..common import load_barcodes @@ -106,7 +107,9 @@ class MoleculeStructure: def __init__(self, str_iterator): self.ordered_elements: List[MoleculeElement] = [] self.elements_to_concatenate = {} + self.concatenated_elements_counts = {} self.duplicated_elements = {} + self.duplicated_elements_counts = {} # Barcode/UMI element names identified by naming convention self._barcode_elements: List[str] = [] self._umi_elements: List[str] = [] @@ -152,7 +155,12 @@ def __init__(self, str_iterator): if len(v) != 2: logger.critical("Incorrect concatenated element %s" % el) sys.exit(IsoQuantExitCode.INVALID_FILE_FORMAT) - self.elements_to_concatenate[el] = v[0] + try: + self.elements_to_concatenate[el] = (v[0], int(v[1])) + except ValueError: + logger.critical("Incorrectly specified index for concatenated element %s: %s" % (el, v[1])) + sys.exit(IsoQuantExitCode.INVALID_FILE_FORMAT) + elif self.DUPL_DELIM in el: if not (element_type.needs_sequence_extraction() or element_type.needs_correction()): logger.critical("Concatenated elements must be variable (fix element %s)" % el) @@ -161,10 +169,29 @@ def __init__(self, str_iterator): if len(v) != 2: logger.critical("Incorrect duplicated element %s" % el) sys.exit(IsoQuantExitCode.INVALID_FILE_FORMAT) - self.duplicated_elements[el] = v[0] + try: + self.duplicated_elements[el] = (v[0], int(v[1])) + except ValueError: + logger.critical("Incorrectly specified index for duplicated element %s: %s" % (el, v[1])) + sys.exit(IsoQuantExitCode.INVALID_FILE_FORMAT) + self.concatenated_elements_counts = self._check_indices(self.elements_to_concatenate) + self.duplicated_elements_counts = self._check_indices(self.duplicated_elements) self._identify_barcode_umi_elements() + def _check_indices(self, element_dict): + # check for consecutive indices + index_dict = defaultdict(list) + for el in element_dict: + index_dict[self.elements_to_concatenate[el][0]].append(self.elements_to_concatenate[el][1]) + + for el in index_dict: + if sorted(index_dict[el]) != list(range(1, len(index_dict[el]) + 1)): + logger.critical("Concatenated elements must be consecutive from 1 to %d (fix element %s, indices: %s)" % (len(index_dict[el]), el, str(index_dict[el]))) + sys.exit(IsoQuantExitCode.INVALID_FILE_FORMAT) + + return {el : len(val) for el, val in element_dict.items()} + def _identify_barcode_umi_elements(self) -> None: """ Identify barcode/UMI elements by naming convention. diff --git a/src/barcode_calling/callers/universal_extraction.py b/src/barcode_calling/callers/universal_extraction.py index 51194469..3c86f20a 100644 --- a/src/barcode_calling/callers/universal_extraction.py +++ b/src/barcode_calling/callers/universal_extraction.py @@ -111,12 +111,19 @@ def _find_patterns_fwd(self, read_id, sequence): logger.debug("PolyT %d" % polyt_start) self.detect_const_elements(sequence, polyt_start, polyt_end, detected_elements) - self.extract_variable_elements5(detected_elements, sequence) - self.extract_variable_elements3(detected_elements, sequence) + concat_element_storage = {} + dupl_element_storage = {} + self.extract_variable_elements5(sequence, detected_elements, concat_element_storage, dupl_element_storage) + self.extract_variable_elements3(sequence, detected_elements, concat_element_storage, dupl_element_storage) + self.process_concatenated_elements(concat_element_storage, detected_elements) + self.process_duplicated_elements(dupl_element_storage, detected_elements) logger.debug("== end read id %s ==" % read_id) return detected_elements - def concatenate_elements(self, detected_results): + def process_concatenated_elements(self, concatenated_element_storage, detected_results): + pass + + def process_duplicated_elements(self, duplicated_element_storage, detected_results): pass def correct_element(self, element: MoleculeElement, sequence: str, potential_start: int, potential_end: int) -> DetectedElement: @@ -125,7 +132,6 @@ def correct_element(self, element: MoleculeElement, sequence: str, potential_sta corrected_seq, seq_score, seq_start, seq_end = \ find_candidate_with_max_score_ssw(matching_sequences, potential_seq, min_score=self.min_scores[element.element_name]) - if corrected_seq is not None: read_seq_start = potential_start + seq_start read_seq_end = potential_start + seq_end - 1 @@ -133,7 +139,79 @@ def correct_element(self, element: MoleculeElement, sequence: str, potential_sta else: return DetectedElement(-1, -1, -1, ExtractionResult.NOSEQ) - def extract_variable_elements5(self, detected_elements, sequence): + def _backup_concatenated_element(self, element, potential_start, potential_end, element_storage): + concatenated_element_base_name, concatenated_element_index \ + = self.molecule_structure.elements_to_concatenate[element.element_name] + if concatenated_element_base_name not in element_storage: + # initialize the list of empty concatenated elements for the first time + element_storage[concatenated_element_base_name] = [(-1, -1) for _ in range( + len(self.molecule_structure.concatenated_elements_counts[concatenated_element_base_name]))] + element_storage[concatenated_element_base_name][concatenated_element_index] = (potential_start, potential_end) + + def _backup_duplicated_element(self, element, potential_start, potential_end, element_storage): + concatenated_element_base_name, concatenated_element_index \ + = self.molecule_structure.elements_to_concatenate[element.element_name] + if concatenated_element_base_name not in element_storage: + # initialize the list of empty concatenated elements for the first time + element_storage[concatenated_element_base_name] = [(-1, -1) for _ in range( + len(self.molecule_structure.concatenated_elements_counts[concatenated_element_base_name]))] + element_storage[concatenated_element_base_name][concatenated_element_index] = (potential_start, potential_end) + + def _detect_variable_element(self, el, sequence, potential_start, potential_end, + detected_elements, concat_element_storage, dupl_element_storage, + len_diff_threshold=None): + potential_len = potential_end - potential_start + 1 + if len_diff_threshold is None or abs(potential_len - el.element_length) <= len_diff_threshold * el.element_length: + if el.element_name in self.molecule_structure.elements_to_concatenate: + self._backup_concatenated_element(el, potential_start, potential_end, concat_element_storage) + return potential_start, potential_end + + if el.element_name in self.molecule_structure.duplicated_elements: + self._backup_duplicated_element(el, potential_start, potential_end, dupl_element_storage) + return potential_start, potential_end + + if el.element_name in self.elements_to_extract: + detected_elements[el.element_name] = DetectedElement(potential_start, potential_end, 0, + seq=sequence[potential_start:potential_end + 1]) + + elif el.element_name in self.elements_to_correct: + detected_element = self.correct_element(el, sequence, potential_start, potential_end) + if detected_element.start != -1: + detected_elements[el.element_name] = detected_element + return detected_element.start, detected_element.end + else: + detected_elements[el.element_name] = DetectedElement(-1, -1, -1, ExtractionResult.NOSEQ) + + return potential_start, potential_end + + def _detect_potential_start(self, element, element_index, potential_end, detected_elements): + if element_index == 0: + potential_start = potential_end - element.element_length + 1 + else: + prev_el = self.molecule_structure.ordered_elements[element_index - 1] + if prev_el.element_name in detected_elements: + potential_start = detected_elements[prev_el].end + 1 + else: + potential_start = potential_end - element.element_length + 1 + if potential_start < 0: + potential_start = 0 + + return potential_start + + def _detect_potential_end(self, sequence, element, element_index, potential_start, detected_elements): + if element_index + 1 == len(self.molecule_structure.ordered_elements): + potential_end = potential_start + element.element_length - 1 + else: + next_el = self.molecule_structure.ordered_elements[element_index + 1] + if next_el.element_name in detected_elements: + potential_end = detected_elements[next_el.element_name].start - 1 + else: + potential_end = potential_start + element.element_length - 1 + if potential_end >= len(sequence): + potential_end = len(sequence) - 1 + return potential_end + + def extract_variable_elements5(self, sequence, detected_elements, concat_element_storage, dupl_element_storage): first_detected_const_element = None for i, el in enumerate(self.molecule_structure): if el.element_type == ElementType.cDNA: @@ -161,18 +239,9 @@ def extract_variable_elements5(self, detected_elements, sequence): potential_start = potential_end - el.element_length + 1 if potential_start < 0: break - current_pos = potential_start - 1 - if el.element_name in self.elements_to_extract: - # TODO: for concatenated elements, store them in a separate dict and extract/correct them only when all of them are found - # TODO: more or less the same for duplicated elements, except the procedure is different - detected_elements[el.element_name] = DetectedElement(potential_start, potential_end, 0, - seq=sequence[potential_start:potential_end + 1]) - - elif el.element_name in self.elements_to_correct: - detected_element = self.correct_element(el, sequence, potential_start, potential_end) - if detected_element.start != -1: - current_pos = detected_element.start - 1 - detected_elements[el.element_name] = detected_element + el_start, _ = self._detect_variable_element(el, sequence, potential_start, potential_end, + detected_elements, concat_element_storage, dupl_element_storage) + current_pos = el_start - 1 current_pos = detected_elements[ self.molecule_structure.ordered_elements[first_detected_const_element].element_name].end + 1 @@ -194,35 +263,15 @@ def extract_variable_elements5(self, detected_elements, sequence): continue potential_start = current_pos - if i + 1 == len(self.molecule_structure.ordered_elements): - potential_end = potential_start + el.element_length - 1 - else: - next_el = self.molecule_structure.ordered_elements[i + 1] - if next_el.element_name in detected_elements: - potential_end = detected_elements[next_el.element_name].start - 1 - else: - potential_end = potential_start + el.element_length - 1 - if potential_end >= len(sequence): - potential_end = len(sequence) - 1 - - potential_len = potential_end - potential_start + 1 - current_pos = potential_end + 1 - if abs(potential_len - el.element_length) <= self.MAX_LEN_DIFF * el.element_length: - if el.element_name in self.elements_to_extract: - detected_elements[el.element_name] = DetectedElement(potential_start, potential_end, 0, - seq=sequence[ - potential_start:potential_end + 1]) - - elif el.element_name in self.elements_to_correct: - detected_element = self.correct_element(el, sequence, potential_start, potential_end) - if detected_element.end != -1: - current_pos = detected_element.end + 1 - detected_elements[el.element_name] = detected_element - else: - detected_elements[el.element_name] = DetectedElement(-1, -1, -1, ExtractionResult.NOSEQ) + potential_end = self._detect_potential_end(sequence, el, i, potential_start, detected_elements) + + _, el_end = self._detect_variable_element(el, sequence, potential_start, potential_end, + detected_elements, concat_element_storage, dupl_element_storage, + len_diff_threshold=self.MAX_LEN_DIFF) + current_pos = el_end + 1 - def extract_variable_elements3(self, detected_elements, sequence): + def extract_variable_elements3(self, sequence, detected_elements, concat_element_storage, dupl_element_storage): last_detected_const_element = None for i in range(len(self.molecule_structure.ordered_elements) - 1, -1, -1): el = self.molecule_structure.ordered_elements[i] @@ -250,17 +299,10 @@ def extract_variable_elements3(self, detected_elements, sequence): potential_start = current_pos potential_end = potential_start + el.element_length - 1 if potential_end >= len(sequence): break - current_pos = potential_end + 1 - if el.element_name in self.elements_to_extract: - detected_elements[el.element_name] = DetectedElement(potential_start, potential_end, 0, - seq=sequence[potential_start:potential_end + 1]) - - elif el.element_name in self.elements_to_correct: - detected_element = self.correct_element(el, sequence, potential_start, potential_end) - if detected_element.end != -1: - current_pos = detected_element.end + 1 - detected_elements[el.element_name] = detected_element + _, el_end = self._detect_variable_element(el, sequence, potential_start, potential_end, + detected_elements, concat_element_storage, dupl_element_storage) + current_pos = el_end + 1 current_pos = detected_elements[ self.molecule_structure.ordered_elements[last_detected_const_element].element_name].start - 1 @@ -282,34 +324,12 @@ def extract_variable_elements3(self, detected_elements, sequence): continue potential_end = current_pos - if i == 0: - potential_start = potential_end - el.element_length + 1 - else: - prev_el = self.molecule_structure.ordered_elements[i - 1] - if prev_el.element_name in detected_elements: - potential_start = detected_elements[prev_el].end + 1 - else: - potential_start = potential_end - el.element_length + 1 - if potential_start < 0: - potential_start = 0 - - potential_len = potential_end - potential_start + 1 - current_pos = potential_start - 1 - if abs(potential_len - el.element_length) <= self.MAX_LEN_DIFF * el.element_length: - if el.element_name in self.elements_to_extract: - detected_elements[el.element_name] = DetectedElement(potential_start, potential_end, 0, - seq=sequence[ - potential_start:potential_end + 1]) - - elif el.element_name in self.elements_to_correct: - detected_element = self.correct_element(el, sequence, potential_start, potential_end) - if detected_element.start != -1: - current_pos = detected_element.start - 1 - detected_elements[el.element_name] = detected_element - - else: - detected_elements[el.element_name] = DetectedElement(-1, -1, -1, ExtractionResult.NOSEQ) + potential_start = self._detect_potential_start(el, i, potential_end, detected_elements) + el_start, _ = self._detect_variable_element(el, sequence, potential_start, potential_end, + detected_elements, concat_element_storage, dupl_element_storage, + len_diff_threshold=self.MAX_LEN_DIFF) + current_pos = el_start - 1 def detect_const_elements(self, sequence, polyt_start, polyt_end, detected_elements): # searching left of cDNA From cc5c3dedba05b8a695e6b08c13b943d11743464d Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Fri, 30 Jan 2026 16:52:08 +0100 Subject: [PATCH 16/41] additional checks on linked elements --- .../callers/molecule_structure.py | 63 ++++++++++++------- 1 file changed, 42 insertions(+), 21 deletions(-) diff --git a/src/barcode_calling/callers/molecule_structure.py b/src/barcode_calling/callers/molecule_structure.py index ff7e1cb5..4b6cbc2d 100644 --- a/src/barcode_calling/callers/molecule_structure.py +++ b/src/barcode_calling/callers/molecule_structure.py @@ -132,58 +132,79 @@ def __init__(self, str_iterator): logger.critical("Incorrect number of properties in line %s" % l.strip()) sys.exit(IsoQuantExitCode.INVALID_FILE_FORMAT) - for el in elements: - if el not in element_properties: - if el not in ElementType.__dict__: - logger.critical("Molecule element %s was not described in the format file" % el) + element_name: str + for element_name in elements: + if element_name not in element_properties: + if element_name not in ElementType.__dict__: + logger.critical("Molecule element %s was not described in the format file" % element_name) sys.exit(IsoQuantExitCode.INVALID_FILE_FORMAT) - element_type = ElementType[el] - self.ordered_elements.append(MoleculeElement(el, element_type)) + element_type = ElementType[element_name] + self.ordered_elements.append(MoleculeElement(element_name, element_type)) else: - element_type, element_val1, element_val2 = element_properties[el] + element_type, element_val1, element_val2 = element_properties[element_name] if element_type not in ElementType.__dict__: logger.critical("Molecule element type %s is not among the possible types" % element_type) sys.exit(IsoQuantExitCode.INVALID_FILE_FORMAT) element_type = ElementType[element_type] - self.ordered_elements.append(MoleculeElement(el, element_type, element_val1, element_val2)) + self.ordered_elements.append(MoleculeElement(element_name, element_type, element_val1, element_val2)) - if self.CONCAT_DELIM in el: + if self.CONCAT_DELIM in element_name: if not (element_type.needs_sequence_extraction() or element_type.needs_correction()): - logger.critical("Concatenated elements must be variable (fix element %s)" % el) + logger.critical("Concatenated elements must be variable (fix element %s)" % element_name) sys.exit(IsoQuantExitCode.INVALID_FILE_FORMAT) - v = el.split(self.CONCAT_DELIM) + v = element_name.split(self.CONCAT_DELIM) if len(v) != 2: - logger.critical("Incorrect concatenated element %s" % el) + logger.critical("Incorrect concatenated element %s" % element_name) sys.exit(IsoQuantExitCode.INVALID_FILE_FORMAT) try: - self.elements_to_concatenate[el] = (v[0], int(v[1])) + self.elements_to_concatenate[element_name] = (v[0], int(v[1])) except ValueError: - logger.critical("Incorrectly specified index for concatenated element %s: %s" % (el, v[1])) + logger.critical("Incorrectly specified index for concatenated element %s: %s" % (element_name, v[1])) sys.exit(IsoQuantExitCode.INVALID_FILE_FORMAT) - elif self.DUPL_DELIM in el: + elif self.DUPL_DELIM in element_name: if not (element_type.needs_sequence_extraction() or element_type.needs_correction()): - logger.critical("Concatenated elements must be variable (fix element %s)" % el) + logger.critical("Concatenated elements must be variable (fix element %s)" % element_name) sys.exit(IsoQuantExitCode.INVALID_FILE_FORMAT) - v = el.split(self.DUPL_DELIM) + v = element_name.split(self.DUPL_DELIM) if len(v) != 2: - logger.critical("Incorrect duplicated element %s" % el) + logger.critical("Incorrect duplicated element %s" % element_name) sys.exit(IsoQuantExitCode.INVALID_FILE_FORMAT) try: - self.duplicated_elements[el] = (v[0], int(v[1])) + self.duplicated_elements[element_name] = (v[0], int(v[1])) except ValueError: - logger.critical("Incorrectly specified index for duplicated element %s: %s" % (el, v[1])) + logger.critical("Incorrectly specified index for duplicated element %s: %s" % (element_name, v[1])) sys.exit(IsoQuantExitCode.INVALID_FILE_FORMAT) + self._check_linked_elements(self.elements_to_concatenate) self.concatenated_elements_counts = self._check_indices(self.elements_to_concatenate) + self._check_linked_elements(self.duplicated_elements) self.duplicated_elements_counts = self._check_indices(self.duplicated_elements) self._identify_barcode_umi_elements() + def _check_linked_elements(self, element_dict): + # check that duplicated or concatenated elements have the same values (e.g. barcode whitelist) + # base element name -> List[MoleculeElement] + linked_elements = defaultdict(list) + for el in self.ordered_elements: + if el.element_name in element_dict: + base_element_name = element_dict[el][0] + linked_elements[base_element_name].append(el) + + for element_list in linked_elements.values(): + if len(set(el.element_value for el in element_list)) != 1: + logger.critical("Linked elements must have the same type (fix elements: %s)" % ", ".join(el.element_name for el in element_list)) + sys.exit(IsoQuantExitCode.INVALID_FILE_FORMAT) + if len(set(el.element_value for el in element_list)) != 1: + logger.critical("Linked elements must have the same values (fix elements: %s)" % ", ".join(el.element_name for el in element_list)) + sys.exit(IsoQuantExitCode.INVALID_FILE_FORMAT) + def _check_indices(self, element_dict): # check for consecutive indices index_dict = defaultdict(list) for el in element_dict: - index_dict[self.elements_to_concatenate[el][0]].append(self.elements_to_concatenate[el][1]) + base_element_name = element_dict[el][0] + index_dict[base_element_name].append(element_dict[el][1]) for el in index_dict: if sorted(index_dict[el]) != list(range(1, len(index_dict[el]) + 1)): From 10428f7c3ed5ba7608e2a92ca7a947ea59b6ba81 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Fri, 30 Jan 2026 17:36:01 +0100 Subject: [PATCH 17/41] implement concatenated elements --- .../callers/extraction_result.py | 22 ++- .../callers/molecule_structure.py | 44 +++-- .../callers/universal_extraction.py | 79 +++++++-- tests/test_molecule_structure.py | 101 ++++++++++++ tests/test_universal_extraction.py | 155 ++++++++++++++++++ 5 files changed, 367 insertions(+), 34 deletions(-) diff --git a/src/barcode_calling/callers/extraction_result.py b/src/barcode_calling/callers/extraction_result.py index 82f91af4..4c671d22 100644 --- a/src/barcode_calling/callers/extraction_result.py +++ b/src/barcode_calling/callers/extraction_result.py @@ -189,12 +189,16 @@ def set_strand(self, strand: str) -> None: self.strand = strand def _get_supplementary_elements(self) -> List[str]: - """Get list of element names that are not barcode or UMI elements.""" + """Get list of element names that are not barcode, UMI, or concatenated/duplicated parts.""" supplementary = [] barcode_umi_set = set(self._barcode_elements + self._umi_elements) for el in self.molecule_structure: if el.element_type == ElementType.cDNA: continue + if el.element_name in self.molecule_structure.elements_to_concatenate: + continue + if el.element_name in self.molecule_structure.duplicated_elements: + continue if el.element_name not in barcode_umi_set: supplementary.append(el.element_name) return supplementary @@ -217,11 +221,17 @@ def __str__(self) -> str: ) # Supplementary elements (non-barcode, non-UMI) + barcode_umi_set = set(self._barcode_elements + self._umi_elements) for el in self.molecule_structure: if el.element_type == ElementType.cDNA: continue + # Skip concatenated/duplicated parts (data lives under base name) + if el.element_name in self.molecule_structure.elements_to_concatenate: + continue + if el.element_name in self.molecule_structure.duplicated_elements: + continue # Skip barcode and UMI elements (already printed above) - if el.element_name in self._barcode_elements or el.element_name in self._umi_elements: + if el.element_name in barcode_umi_set: continue if el.element_type == ElementType.PolyT: @@ -255,11 +265,17 @@ def header(self) -> str: header = "#read_id\tbarcode\tUMI\tBC_score\tvalid_UMI\tstrand" # Supplementary elements (non-barcode, non-UMI) + barcode_umi_set = set(self._barcode_elements + self._umi_elements) for el in self.molecule_structure: if el.element_type == ElementType.cDNA: continue + # Skip concatenated/duplicated parts (data lives under base name) + if el.element_name in self.molecule_structure.elements_to_concatenate: + continue + if el.element_name in self.molecule_structure.duplicated_elements: + continue # Skip barcode and UMI elements (already in standard header) - if el.element_name in self._barcode_elements or el.element_name in self._umi_elements: + if el.element_name in barcode_umi_set: continue if el.element_type == ElementType.PolyT: diff --git a/src/barcode_calling/callers/molecule_structure.py b/src/barcode_calling/callers/molecule_structure.py index 4b6cbc2d..e5c6828c 100644 --- a/src/barcode_calling/callers/molecule_structure.py +++ b/src/barcode_calling/callers/molecule_structure.py @@ -64,9 +64,9 @@ def __init__(self, element_name: str, element_type: ElementType, element_value_1 self.element_length = len(self.element_value[0]) else: try: - self.element_length = int(element_value_1) + self.element_length = int(element_value_2) except ValueError: - logger.critical("Incorrectly specified length for element %s: %s" % (self.element_name, element_value_1)) + logger.critical("Incorrectly specified length for element %s: %s" % (self.element_name, element_value_2)) sys.exit(IsoQuantExitCode.INVALID_FILE_FORMAT) elif self.element_type == ElementType.VAR_LIST: self.element_value = element_value_1.split(',') @@ -74,10 +74,10 @@ def __init__(self, element_name: str, element_type: ElementType, element_value_1 self.element_length = len(self.element_value[0]) else: try: - self.element_length = int(element_value_1) + self.element_length = int(element_value_2) except ValueError: logger.critical( - "Incorrectly specified length for element %s: %s" % (self.element_name, element_value_1)) + "Incorrectly specified length for element %s: %s" % (self.element_name, element_value_2)) sys.exit(IsoQuantExitCode.INVALID_FILE_FORMAT) elif self.element_type in (ElementType.VAR_ANY, ElementType.VAR_ANY_SEPARATOR, ElementType.VAR_ANY_NON_T_SEPARATOR): self.element_value = None @@ -188,16 +188,18 @@ def _check_linked_elements(self, element_dict): linked_elements = defaultdict(list) for el in self.ordered_elements: if el.element_name in element_dict: - base_element_name = element_dict[el][0] + base_element_name = element_dict[el.element_name][0] linked_elements[base_element_name].append(el) for element_list in linked_elements.values(): - if len(set(el.element_value for el in element_list)) != 1: + if len(set(el.element_type for el in element_list)) != 1: logger.critical("Linked elements must have the same type (fix elements: %s)" % ", ".join(el.element_name for el in element_list)) sys.exit(IsoQuantExitCode.INVALID_FILE_FORMAT) - if len(set(el.element_value for el in element_list)) != 1: - logger.critical("Linked elements must have the same values (fix elements: %s)" % ", ".join(el.element_name for el in element_list)) - sys.exit(IsoQuantExitCode.INVALID_FILE_FORMAT) + first_value = element_list[0].element_value + for el in element_list[1:]: + if el.element_value != first_value: + logger.critical("Linked elements must have the same values (fix elements: %s)" % ", ".join(e.element_name for e in element_list)) + sys.exit(IsoQuantExitCode.INVALID_FILE_FORMAT) def _check_indices(self, element_dict): # check for consecutive indices @@ -211,7 +213,7 @@ def _check_indices(self, element_dict): logger.critical("Concatenated elements must be consecutive from 1 to %d (fix element %s, indices: %s)" % (len(index_dict[el]), el, str(index_dict[el]))) sys.exit(IsoQuantExitCode.INVALID_FILE_FORMAT) - return {el : len(val) for el, val in element_dict.items()} + return {el: len(val) for el, val in index_dict.items()} def _identify_barcode_umi_elements(self) -> None: """ @@ -219,15 +221,31 @@ def _identify_barcode_umi_elements(self) -> None: Elements with prefix "barcode" (case-insensitive) are barcodes. Elements with prefix "umi" (case-insensitive) are UMIs. + + For concatenated/duplicated elements (e.g., barcode|1, barcode|2), + the base name (e.g., "barcode") is used and deduplicated. """ self._barcode_elements = [] self._umi_elements = [] + seen = set() for el in self.ordered_elements: - name_lower = el.element_name.lower() + # For concatenated/duplicated elements, use the base name + if el.element_name in self.elements_to_concatenate: + name = self.elements_to_concatenate[el.element_name][0] + elif el.element_name in self.duplicated_elements: + name = self.duplicated_elements[el.element_name][0] + else: + name = el.element_name + + if name in seen: + continue + seen.add(name) + + name_lower = name.lower() if name_lower.startswith("barcode"): - self._barcode_elements.append(el.element_name) + self._barcode_elements.append(name) elif name_lower.startswith("umi"): - self._umi_elements.append(el.element_name) + self._umi_elements.append(name) @property def barcode_elements(self) -> List[str]: diff --git a/src/barcode_calling/callers/universal_extraction.py b/src/barcode_calling/callers/universal_extraction.py index 3c86f20a..96bd128d 100644 --- a/src/barcode_calling/callers/universal_extraction.py +++ b/src/barcode_calling/callers/universal_extraction.py @@ -53,7 +53,10 @@ def __init__(self, molecule_structure: MoleculeStructure): elif el.element_type.needs_sequence_extraction(): self.elements_to_extract.add(el.element_name) elif el.element_type.needs_correction(): - if self.correct_sequences: + if el.element_name in self.molecule_structure.elements_to_concatenate: + # Concatenated parts: skip individual index, handled after loop + pass + elif self.correct_sequences: self.index_dict[el.element_name] = self.prepare_barcode_index(el) # FIXME: min score should be based on the length of the barcode and their sparcity # e.g. find a way to estimate min_score so that error rate does not exceed 1% @@ -62,6 +65,23 @@ def __init__(self, molecule_structure: MoleculeStructure): else: self.elements_to_extract.add(el.element_name) + # Build one index per concatenated base element using the full whitelist + if self.correct_sequences: + for base_name, count in self.molecule_structure.concatenated_elements_counts.items(): + first_part_el = None + total_length = 0 + for el in self.molecule_structure: + if el.element_name in self.molecule_structure.elements_to_concatenate: + if self.molecule_structure.elements_to_concatenate[el.element_name][0] == base_name: + if first_part_el is None: + first_part_el = el + total_length += el.element_length + if first_part_el is not None and first_part_el.element_value is not None: + barcode_list = first_part_el.element_value + k = max(6, int(total_length / 2) - 2) + self.index_dict[base_name] = KmerIndexer(barcode_list, k) + self.min_scores[base_name] = total_length - 2 + if not self.has_cdna: logger.critical("Molecule must include a cDNA") sys.exit(IsoQuantExitCode.INVALID_PARAMETER) @@ -115,13 +135,38 @@ def _find_patterns_fwd(self, read_id, sequence): dupl_element_storage = {} self.extract_variable_elements5(sequence, detected_elements, concat_element_storage, dupl_element_storage) self.extract_variable_elements3(sequence, detected_elements, concat_element_storage, dupl_element_storage) - self.process_concatenated_elements(concat_element_storage, detected_elements) + self.process_concatenated_elements(concat_element_storage, detected_elements, sequence) self.process_duplicated_elements(dupl_element_storage, detected_elements) logger.debug("== end read id %s ==" % read_id) return detected_elements - def process_concatenated_elements(self, concatenated_element_storage, detected_results): - pass + def process_concatenated_elements(self, concatenated_element_storage, detected_results, sequence): + for base_name, parts in concatenated_element_storage.items(): + # Check all parts were detected + if any(start == -1 for start, end in parts): + detected_results[base_name] = DetectedElement(-1, -1, -1, ExtractionResult.NOSEQ) + continue + + # Concatenate sequences from all parts + concatenated_seq = "" + for start, end in parts: + concatenated_seq += sequence[start:end + 1] + + # If correction is enabled and an index exists, correct against the whitelist + if self.correct_sequences and base_name in self.index_dict: + matching_sequences = self.index_dict[base_name].get_occurrences(concatenated_seq) + corrected_seq, seq_score, _, _ = find_candidate_with_max_score_ssw( + matching_sequences, concatenated_seq, + min_score=self.min_scores[base_name]) + if corrected_seq is not None: + detected_results[base_name] = DetectedElement( + parts[0][0], parts[-1][1], seq_score, corrected_seq) + else: + detected_results[base_name] = DetectedElement(-1, -1, -1, ExtractionResult.NOSEQ) + else: + # No correction: just store the raw concatenated sequence + detected_results[base_name] = DetectedElement( + parts[0][0], parts[-1][1], 0, concatenated_seq) def process_duplicated_elements(self, duplicated_element_storage, detected_results): pass @@ -140,22 +185,20 @@ def correct_element(self, element: MoleculeElement, sequence: str, potential_sta return DetectedElement(-1, -1, -1, ExtractionResult.NOSEQ) def _backup_concatenated_element(self, element, potential_start, potential_end, element_storage): - concatenated_element_base_name, concatenated_element_index \ - = self.molecule_structure.elements_to_concatenate[element.element_name] - if concatenated_element_base_name not in element_storage: - # initialize the list of empty concatenated elements for the first time - element_storage[concatenated_element_base_name] = [(-1, -1) for _ in range( - len(self.molecule_structure.concatenated_elements_counts[concatenated_element_base_name]))] - element_storage[concatenated_element_base_name][concatenated_element_index] = (potential_start, potential_end) + base_name, part_index = self.molecule_structure.elements_to_concatenate[element.element_name] + if base_name not in element_storage: + element_storage[base_name] = [(-1, -1) for _ in range( + self.molecule_structure.concatenated_elements_counts[base_name])] + # part_index is 1-based in MDF, convert to 0-based + element_storage[base_name][part_index - 1] = (potential_start, potential_end) def _backup_duplicated_element(self, element, potential_start, potential_end, element_storage): - concatenated_element_base_name, concatenated_element_index \ - = self.molecule_structure.elements_to_concatenate[element.element_name] - if concatenated_element_base_name not in element_storage: - # initialize the list of empty concatenated elements for the first time - element_storage[concatenated_element_base_name] = [(-1, -1) for _ in range( - len(self.molecule_structure.concatenated_elements_counts[concatenated_element_base_name]))] - element_storage[concatenated_element_base_name][concatenated_element_index] = (potential_start, potential_end) + base_name, part_index = self.molecule_structure.duplicated_elements[element.element_name] + if base_name not in element_storage: + element_storage[base_name] = [(-1, -1) for _ in range( + self.molecule_structure.duplicated_elements_counts[base_name])] + # part_index is 1-based in MDF, convert to 0-based + element_storage[base_name][part_index - 1] = (potential_start, potential_end) def _detect_variable_element(self, el, sequence, potential_start, potential_end, detected_elements, concat_element_storage, dupl_element_storage, diff --git a/tests/test_molecule_structure.py b/tests/test_molecule_structure.py index ed5af7b5..ba1ae6c3 100644 --- a/tests/test_molecule_structure.py +++ b/tests/test_molecule_structure.py @@ -431,5 +431,106 @@ def test_multiple_var_list_barcodes(self): assert len(elements[0].element_value) == 100 +class TestConcatenatedElements: + """Test concatenated element parsing and identification.""" + + def test_parse_concatenated_elements(self): + """Test parsing MDF with concatenated barcode parts.""" + mdf_content = """Barcode|1:Barcode|2:UMI:PolyT:cDNA +Barcode|1\tVAR_LIST\tAAAACCCC,GGGGTTTT\t4 +Barcode|2\tVAR_LIST\tAAAACCCC,GGGGTTTT\t4 +UMI\tVAR_ANY\t8 +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + assert len(structure.ordered_elements) == 5 + assert "Barcode|1" in structure.elements_to_concatenate + assert "Barcode|2" in structure.elements_to_concatenate + assert structure.elements_to_concatenate["Barcode|1"] == ("Barcode", 1) + assert structure.elements_to_concatenate["Barcode|2"] == ("Barcode", 2) + + def test_concatenated_elements_counts(self): + """Test that concatenated_elements_counts has correct base name and count.""" + mdf_content = """Barcode|1:Barcode|2:cDNA +Barcode|1\tVAR_LIST\tAAAACCCC,GGGGTTTT\t4 +Barcode|2\tVAR_LIST\tAAAACCCC,GGGGTTTT\t4 +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + assert "Barcode" in structure.concatenated_elements_counts + assert structure.concatenated_elements_counts["Barcode"] == 2 + + def test_concatenated_three_parts(self): + """Test three-part concatenated element.""" + mdf_content = """Barcode|1:Barcode|2:Barcode|3:cDNA +Barcode|1\tVAR_LIST\tAAAACCCCGGGG,TTTTAAAACCCC\t4 +Barcode|2\tVAR_LIST\tAAAACCCCGGGG,TTTTAAAACCCC\t4 +Barcode|3\tVAR_LIST\tAAAACCCCGGGG,TTTTAAAACCCC\t4 +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + assert structure.concatenated_elements_counts["Barcode"] == 3 + + def test_concatenated_barcode_umi_identification(self): + """Test that concatenated barcode elements are identified by base name.""" + mdf_content = """Barcode|1:Barcode|2:UMI:cDNA +Barcode|1\tVAR_LIST\tAAAACCCC,GGGGTTTT\t4 +Barcode|2\tVAR_LIST\tAAAACCCC,GGGGTTTT\t4 +UMI\tVAR_ANY\t8 +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + # Base name "Barcode" should be in barcode_elements, not "Barcode|1", "Barcode|2" + assert structure.barcode_elements == ["Barcode"] + assert "Barcode|1" not in structure.barcode_elements + assert "Barcode|2" not in structure.barcode_elements + assert structure.umi_elements == ["UMI"] + + def test_concatenated_umi_identification(self): + """Test that concatenated UMI elements are identified by base name.""" + mdf_content = """Barcode:UMI|1:UMI|2:cDNA +Barcode\tVAR_LIST\tAAAA,CCCC +UMI|1\tVAR_ANY\t4 +UMI|2\tVAR_ANY\t4 +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + assert structure.barcode_elements == ["Barcode"] + assert structure.umi_elements == ["UMI"] + + def test_concatenated_explicit_length(self): + """Test that explicit length (4th field) is correctly parsed for concatenated parts.""" + mdf_content = """Barcode|1:Barcode|2:cDNA +Barcode|1\tVAR_LIST\tAAAACCCC,GGGGTTTT\t4 +Barcode|2\tVAR_LIST\tAAAACCCC,GGGGTTTT\t4 +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + # Each part should have length 4, not the full barcode length 8 + for el in structure: + if el.element_name == "Barcode|1": + assert el.element_length == 4 + elif el.element_name == "Barcode|2": + assert el.element_length == 4 + + def test_concatenated_elements_must_be_variable(self): + """Test that non-variable concatenated elements are rejected.""" + mdf_content = """Adapter|1:Adapter|2:cDNA +Adapter|1\tCONST\tACGT +Adapter|2\tCONST\tTGCA +""" + with pytest.raises(SystemExit): + MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + def test_concatenated_nonconsecutive_indices_rejected(self): + """Test that non-consecutive indices (e.g., 1 and 3) are rejected.""" + mdf_content = """Barcode|1:Barcode|3:cDNA +Barcode|1\tVAR_LIST\tAAAA,CCCC\t4 +Barcode|3\tVAR_LIST\tAAAA,CCCC\t4 +""" + with pytest.raises(SystemExit): + MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + if __name__ == '__main__': pytest.main([__file__, '-v']) diff --git a/tests/test_universal_extraction.py b/tests/test_universal_extraction.py index 58cad5f7..8f58505a 100644 --- a/tests/test_universal_extraction.py +++ b/tests/test_universal_extraction.py @@ -725,5 +725,160 @@ def test_process_test_reads_file(self): assert result.read_id in [r[0] for r in reads] +class TestConcatenatedElementProcessing: + """Test concatenated element extraction and processing.""" + + def _create_concat_structure(self): + """Create structure with concatenated barcode: Barcode|1:Barcode|2:UMI:PolyT:cDNA""" + # Use 8bp barcodes split into two 4bp parts + mdf_content = """Barcode|1:Barcode|2:UMI:PolyT:cDNA +Barcode|1\tVAR_LIST\tAAAACCCC,GGGGTTTT,ACGTACGT\t4 +Barcode|2\tVAR_LIST\tAAAACCCC,GGGGTTTT,ACGTACGT\t4 +UMI\tVAR_ANY\t8 +""" + return MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + def test_concat_structure_barcode_elements(self): + """Test that concatenated barcode is identified by base name.""" + structure = self._create_concat_structure() + assert structure.barcode_elements == ["Barcode"] + assert structure.umi_elements == ["UMI"] + + def test_concat_structure_counts(self): + """Test concatenated elements counts.""" + structure = self._create_concat_structure() + assert structure.concatenated_elements_counts["Barcode"] == 2 + + def test_concat_extractor_init(self): + """Test extractor initialization with concatenated elements.""" + structure = self._create_concat_structure() + extractor = UniversalSingleMoleculeExtractor(structure) + + # Should have one index for the base name "Barcode", not individual parts + assert "Barcode" in extractor.index_dict + assert "Barcode|1" not in extractor.index_dict + assert "Barcode|2" not in extractor.index_dict + + def test_concat_extractor_header(self): + """Test header output for concatenated elements.""" + structure = self._create_concat_structure() + extractor = UniversalSingleMoleculeExtractor(structure) + header = extractor.header() + + # Header should have standard fields, not individual parts + assert "barcode" in header + assert "UMI" in header + # Should NOT contain individual concatenated part names + assert "Barcode|1" not in header + assert "Barcode|2" not in header + + def test_process_concatenated_elements_all_detected(self): + """Test process_concatenated_elements when all parts are found.""" + structure = self._create_concat_structure() + extractor = UniversalSingleMoleculeExtractor(structure) + + # Simulate detected parts: AAAA at [10,13] and CCCC at [14,17] + sequence = "XXXXXXXXXX" + "AAAA" + "CCCC" + "XXXXXXXX" + concat_storage = {"Barcode": [(10, 13), (14, 17)]} + detected_results = {} + + extractor.process_concatenated_elements(concat_storage, detected_results, sequence) + + assert "Barcode" in detected_results + result = detected_results["Barcode"] + # Should find corrected match for "AAAACCCC" in whitelist + assert result.seq == "AAAACCCC" + assert result.start == 10 + assert result.end == 17 + + def test_process_concatenated_elements_missing_part(self): + """Test process_concatenated_elements when a part is missing.""" + structure = self._create_concat_structure() + extractor = UniversalSingleMoleculeExtractor(structure) + + # Part 2 not detected (start=-1) + concat_storage = {"Barcode": [(10, 13), (-1, -1)]} + detected_results = {} + + extractor.process_concatenated_elements(concat_storage, detected_results, "AAAA" * 10) + + assert "Barcode" in detected_results + assert detected_results["Barcode"].seq == ExtractionResult.NOSEQ + assert detected_results["Barcode"].start == -1 + + def test_process_concatenated_no_correction(self): + """Test process_concatenated_elements without correction (raw concatenation).""" + structure = self._create_concat_structure() + extractor = UniversalSingleMoleculeExtractor(structure) + extractor.correct_sequences = False + + sequence = "XXXXXXXXXX" + "ACGT" + "TGCA" + "XXXXXXXX" + concat_storage = {"Barcode": [(10, 13), (14, 17)]} + detected_results = {} + + extractor.process_concatenated_elements(concat_storage, detected_results, sequence) + + assert "Barcode" in detected_results + result = detected_results["Barcode"] + # Without correction, raw concatenated sequence is stored + assert result.seq == "ACGTTGCA" + assert result.start == 10 + assert result.end == 17 + + def test_concat_extraction_result_get_barcode(self): + """Test ExtractionResult.get_barcode() with concatenated element result.""" + structure = self._create_concat_structure() + detected = {"Barcode": DetectedElement(10, 17, 14, "AAAACCCC")} + result = ExtractionResult(structure, "read_001", "+", detected) + + assert result.get_barcode() == "AAAACCCC" + assert result.has_barcode() is True + assert result.is_valid() is True + + def test_concat_extraction_result_str_format(self): + """Test ExtractionResult __str__ output with concatenated elements.""" + structure = self._create_concat_structure() + detected = { + "Barcode": DetectedElement(10, 17, 14, "AAAACCCC"), + "UMI": DetectedElement(18, 25, 0, "ACGTACGT"), + "PolyT": DetectedElement(26, 40, 0) + } + result = ExtractionResult(structure, "read_001", "+", detected) + output = str(result) + + fields = output.split('\t') + assert fields[0] == "read_001" + assert fields[1] == "AAAACCCC" # barcode + assert fields[2] == "ACGTACGT" # UMI + # Should NOT contain individual part names + + def test_concat_extraction_result_no_barcode(self): + """Test ExtractionResult when concatenated barcode correction fails.""" + structure = self._create_concat_structure() + detected = { + "Barcode": DetectedElement(-1, -1, -1, ExtractionResult.NOSEQ), + "UMI": DetectedElement(18, 25, 0, "ACGTACGT") + } + result = ExtractionResult(structure, "read_001", "+", detected) + + assert result.get_barcode() == ExtractionResult.NOSEQ + assert result.has_barcode() is False + + def test_concat_additional_attributes_exclude_barcode(self): + """Test that concatenated barcode is excluded from additional attributes.""" + structure = self._create_concat_structure() + detected = { + "Barcode": DetectedElement(10, 17, 14, "AAAACCCC"), + "PolyT": DetectedElement(26, 40, 0) + } + result = ExtractionResult(structure, "read_001", "+", detected) + attrs = result.get_additional_attributes() + + assert "Barcode detected" not in attrs + assert "Barcode|1 detected" not in attrs + assert "Barcode|2 detected" not in attrs + assert "PolyT detected" in attrs + + if __name__ == '__main__': pytest.main([__file__, '-v']) From b4b84226220d79f6b65f748170245957610031d3 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Sat, 31 Jan 2026 00:31:22 +0100 Subject: [PATCH 18/41] implement duplicarted elements --- .../callers/universal_extraction.py | 95 +++++++- tests/test_molecule_structure.py | 95 ++++++++ tests/test_universal_extraction.py | 220 ++++++++++++++++++ 3 files changed, 407 insertions(+), 3 deletions(-) diff --git a/src/barcode_calling/callers/universal_extraction.py b/src/barcode_calling/callers/universal_extraction.py index 96bd128d..918a3219 100644 --- a/src/barcode_calling/callers/universal_extraction.py +++ b/src/barcode_calling/callers/universal_extraction.py @@ -56,6 +56,9 @@ def __init__(self, molecule_structure: MoleculeStructure): if el.element_name in self.molecule_structure.elements_to_concatenate: # Concatenated parts: skip individual index, handled after loop pass + elif el.element_name in self.molecule_structure.duplicated_elements: + # Duplicated parts: skip individual index, handled after loop + pass elif self.correct_sequences: self.index_dict[el.element_name] = self.prepare_barcode_index(el) # FIXME: min score should be based on the length of the barcode and their sparcity @@ -82,6 +85,21 @@ def __init__(self, molecule_structure: MoleculeStructure): self.index_dict[base_name] = KmerIndexer(barcode_list, k) self.min_scores[base_name] = total_length - 2 + # Build one index per duplicated base element using the full whitelist + if self.correct_sequences: + for base_name, count in self.molecule_structure.duplicated_elements_counts.items(): + first_part_el = None + for el in self.molecule_structure: + if el.element_name in self.molecule_structure.duplicated_elements: + if self.molecule_structure.duplicated_elements[el.element_name][0] == base_name: + first_part_el = el + break + if first_part_el is not None and first_part_el.element_value is not None: + barcode_list = first_part_el.element_value + k = max(6, int(first_part_el.element_length / 2) - 2) + self.index_dict[base_name] = KmerIndexer(barcode_list, k) + self.min_scores[base_name] = first_part_el.element_length - 2 + if not self.has_cdna: logger.critical("Molecule must include a cDNA") sys.exit(IsoQuantExitCode.INVALID_PARAMETER) @@ -136,7 +154,7 @@ def _find_patterns_fwd(self, read_id, sequence): self.extract_variable_elements5(sequence, detected_elements, concat_element_storage, dupl_element_storage) self.extract_variable_elements3(sequence, detected_elements, concat_element_storage, dupl_element_storage) self.process_concatenated_elements(concat_element_storage, detected_elements, sequence) - self.process_duplicated_elements(dupl_element_storage, detected_elements) + self.process_duplicated_elements(dupl_element_storage, detected_elements, sequence) logger.debug("== end read id %s ==" % read_id) return detected_elements @@ -168,8 +186,79 @@ def process_concatenated_elements(self, concatenated_element_storage, detected_r detected_results[base_name] = DetectedElement( parts[0][0], parts[-1][1], 0, concatenated_seq) - def process_duplicated_elements(self, duplicated_element_storage, detected_results): - pass + def process_duplicated_elements(self, duplicated_element_storage, detected_results, sequence): + for base_name, parts in duplicated_element_storage.items(): + # Extract sequences for detected parts + part_seqs = [] + part_coords = [] + for start, end in parts: + if start != -1: + part_seqs.append(sequence[start:end + 1]) + part_coords.append((start, end)) + else: + part_seqs.append(None) + part_coords.append((-1, -1)) + + detected_seqs = [s for s in part_seqs if s is not None] + if not detected_seqs: + detected_results[base_name] = DetectedElement(-1, -1, -1, ExtractionResult.NOSEQ) + continue + + # Use first detected copy's coordinates for reporting + first_detected_idx = next(i for i, s in enumerate(part_seqs) if s is not None) + + if self.correct_sequences and base_name in self.index_dict: + # Correct each copy independently + corrected = [] + for seq in part_seqs: + if seq is None: + corrected.append((None, -1)) + continue + matching = self.index_dict[base_name].get_occurrences(seq) + corrected_seq, score, _, _ = find_candidate_with_max_score_ssw( + matching, seq, min_score=self.min_scores[base_name]) + corrected.append((corrected_seq, score)) + + successful = [(seq, score) for seq, score in corrected if seq is not None] + + if not successful: + # Neither copy corrected + detected_results[base_name] = DetectedElement(-1, -1, -1, ExtractionResult.NOSEQ) + elif len(successful) == 1: + # One succeeded + seq, score = successful[0] + detected_results[base_name] = DetectedElement( + part_coords[first_detected_idx][0], + part_coords[first_detected_idx][1], + score, seq) + else: + # Multiple succeeded - check agreement + unique_seqs = set(seq for seq, _ in successful) + if len(unique_seqs) == 1: + # All agree + best_score = max(score for _, score in successful) + detected_results[base_name] = DetectedElement( + part_coords[first_detected_idx][0], + part_coords[first_detected_idx][1], + best_score, successful[0][0]) + else: + # Copies disagree after correction + detected_results[base_name] = DetectedElement(-1, -1, -1, ExtractionResult.NOSEQ) + else: + # No correction: compare raw sequences + if len(set(detected_seqs)) == 1: + # All match + detected_results[base_name] = DetectedElement( + part_coords[first_detected_idx][0], + part_coords[first_detected_idx][1], + 0, detected_seqs[0]) + else: + # Differ: report all comma-separated + comma_sep = ",".join(detected_seqs) + detected_results[base_name] = DetectedElement( + part_coords[first_detected_idx][0], + part_coords[first_detected_idx][1], + 0, comma_sep) def correct_element(self, element: MoleculeElement, sequence: str, potential_start: int, potential_end: int) -> DetectedElement: potential_seq = sequence[potential_start:potential_end + 1] diff --git a/tests/test_molecule_structure.py b/tests/test_molecule_structure.py index ba1ae6c3..127d2e71 100644 --- a/tests/test_molecule_structure.py +++ b/tests/test_molecule_structure.py @@ -532,5 +532,100 @@ def test_concatenated_nonconsecutive_indices_rejected(self): MoleculeStructure(iter(mdf_content.strip().split('\n'))) +class TestDuplicatedElements: + """Test duplicated element parsing and identification.""" + + def test_parse_duplicated_elements(self): + """Test parsing MDF with duplicated barcode parts.""" + mdf_content = """Barcode/1:Barcode/2:UMI:PolyT:cDNA +Barcode/1\tVAR_LIST\tAAAACCCC,GGGGTTTT +Barcode/2\tVAR_LIST\tAAAACCCC,GGGGTTTT +UMI\tVAR_ANY\t8 +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + assert len(structure.ordered_elements) == 5 + assert "Barcode/1" in structure.duplicated_elements + assert "Barcode/2" in structure.duplicated_elements + assert structure.duplicated_elements["Barcode/1"] == ("Barcode", 1) + assert structure.duplicated_elements["Barcode/2"] == ("Barcode", 2) + + def test_duplicated_elements_counts(self): + """Test that duplicated_elements_counts has correct base name and count.""" + mdf_content = """Barcode/1:Barcode/2:cDNA +Barcode/1\tVAR_LIST\tAAAACCCC,GGGGTTTT +Barcode/2\tVAR_LIST\tAAAACCCC,GGGGTTTT +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + assert "Barcode" in structure.duplicated_elements_counts + assert structure.duplicated_elements_counts["Barcode"] == 2 + + def test_duplicated_three_parts(self): + """Test three-part duplicated element.""" + mdf_content = """Barcode/1:Barcode/2:Barcode/3:cDNA +Barcode/1\tVAR_LIST\tAAAACCCC,GGGGTTTT +Barcode/2\tVAR_LIST\tAAAACCCC,GGGGTTTT +Barcode/3\tVAR_LIST\tAAAACCCC,GGGGTTTT +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + assert structure.duplicated_elements_counts["Barcode"] == 3 + + def test_duplicated_barcode_umi_identification(self): + """Test that duplicated barcode elements are identified by base name.""" + mdf_content = """Barcode/1:Barcode/2:UMI:cDNA +Barcode/1\tVAR_LIST\tAAAACCCC,GGGGTTTT +Barcode/2\tVAR_LIST\tAAAACCCC,GGGGTTTT +UMI\tVAR_ANY\t8 +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + # Base name "Barcode" should be in barcode_elements, not "Barcode/1", "Barcode/2" + assert structure.barcode_elements == ["Barcode"] + assert "Barcode/1" not in structure.barcode_elements + assert "Barcode/2" not in structure.barcode_elements + assert structure.umi_elements == ["UMI"] + + def test_duplicated_umi_identification(self): + """Test that duplicated UMI elements are identified by base name.""" + mdf_content = """Barcode:UMI/1:UMI/2:cDNA +Barcode\tVAR_LIST\tAAAA,CCCC +UMI/1\tVAR_ANY\t8 +UMI/2\tVAR_ANY\t8 +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + assert structure.barcode_elements == ["Barcode"] + assert structure.umi_elements == ["UMI"] + + def test_duplicated_elements_must_be_variable(self): + """Test that non-variable duplicated elements are rejected.""" + mdf_content = """Adapter/1:Adapter/2:cDNA +Adapter/1\tCONST\tACGT +Adapter/2\tCONST\tACGT +""" + with pytest.raises(SystemExit): + MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + def test_duplicated_nonconsecutive_indices_rejected(self): + """Test that non-consecutive indices (e.g., 1 and 3) are rejected.""" + mdf_content = """Barcode/1:Barcode/3:cDNA +Barcode/1\tVAR_LIST\tAAAA,CCCC +Barcode/3\tVAR_LIST\tAAAA,CCCC +""" + with pytest.raises(SystemExit): + MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + def test_duplicated_different_types_rejected(self): + """Test that duplicated elements with different types are rejected.""" + mdf_content = """Barcode/1:Barcode/2:cDNA +Barcode/1\tVAR_LIST\tAAAA,CCCC +Barcode/2\tVAR_ANY\t4 +""" + with pytest.raises(SystemExit): + MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + if __name__ == '__main__': pytest.main([__file__, '-v']) diff --git a/tests/test_universal_extraction.py b/tests/test_universal_extraction.py index 8f58505a..0bc8d064 100644 --- a/tests/test_universal_extraction.py +++ b/tests/test_universal_extraction.py @@ -880,5 +880,225 @@ def test_concat_additional_attributes_exclude_barcode(self): assert "PolyT detected" in attrs +class TestDuplicatedElementProcessing: + """Test duplicated element extraction and processing (majority vote).""" + + def _create_dupl_structure(self): + """Create structure with duplicated barcode: Barcode/1:Barcode/2:UMI:PolyT:cDNA""" + mdf_content = """Barcode/1:Barcode/2:UMI:PolyT:cDNA +Barcode/1\tVAR_LIST\tAAAACCCC,GGGGTTTT,ACGTACGT +Barcode/2\tVAR_LIST\tAAAACCCC,GGGGTTTT,ACGTACGT +UMI\tVAR_ANY\t8 +""" + return MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + def test_dupl_structure_barcode_elements(self): + """Test that duplicated barcode is identified by base name.""" + structure = self._create_dupl_structure() + assert structure.barcode_elements == ["Barcode"] + assert structure.umi_elements == ["UMI"] + + def test_dupl_structure_counts(self): + """Test duplicated elements counts.""" + structure = self._create_dupl_structure() + assert structure.duplicated_elements_counts["Barcode"] == 2 + + def test_dupl_extractor_init(self): + """Test extractor initialization with duplicated elements.""" + structure = self._create_dupl_structure() + extractor = UniversalSingleMoleculeExtractor(structure) + + # Should have one index for the base name "Barcode", not individual parts + assert "Barcode" in extractor.index_dict + assert "Barcode/1" not in extractor.index_dict + assert "Barcode/2" not in extractor.index_dict + + def test_dupl_extractor_header(self): + """Test header output for duplicated elements.""" + structure = self._create_dupl_structure() + extractor = UniversalSingleMoleculeExtractor(structure) + header = extractor.header() + + assert "barcode" in header + assert "UMI" in header + # Should NOT contain individual duplicated part names + assert "Barcode/1" not in header + assert "Barcode/2" not in header + + def test_process_duplicated_both_agree(self): + """Test process_duplicated_elements when both copies correct to the same barcode.""" + structure = self._create_dupl_structure() + extractor = UniversalSingleMoleculeExtractor(structure) + + # Both copies have AAAACCCC (exact match to whitelist) + sequence = "XXXXXXXXXX" + "AAAACCCC" + "XXXXXXXX" + "AAAACCCC" + "XXXXXXXX" + dupl_storage = {"Barcode": [(10, 17), (26, 33)]} + detected_results = {} + + extractor.process_duplicated_elements(dupl_storage, detected_results, sequence) + + assert "Barcode" in detected_results + result = detected_results["Barcode"] + assert result.seq == "AAAACCCC" + assert result.start == 10 + assert result.end == 17 + + def test_process_duplicated_one_succeeds(self): + """Test process_duplicated_elements when one copy corrects, other fails.""" + structure = self._create_dupl_structure() + extractor = UniversalSingleMoleculeExtractor(structure) + + # Copy 1: AAAACCCC (exact match), Copy 2: totally wrong sequence + sequence = "XXXXXXXXXX" + "AAAACCCC" + "XXXXXXXX" + "NNNNNNNN" + "XXXXXXXX" + dupl_storage = {"Barcode": [(10, 17), (26, 33)]} + detected_results = {} + + extractor.process_duplicated_elements(dupl_storage, detected_results, sequence) + + assert "Barcode" in detected_results + result = detected_results["Barcode"] + assert result.seq == "AAAACCCC" + + def test_process_duplicated_both_fail(self): + """Test process_duplicated_elements when neither copy corrects.""" + structure = self._create_dupl_structure() + extractor = UniversalSingleMoleculeExtractor(structure) + + # Both copies have garbage sequences that don't match any whitelist entry + sequence = "XXXXXXXXXX" + "NNNNNNNN" + "XXXXXXXX" + "NNNNNNNN" + "XXXXXXXX" + dupl_storage = {"Barcode": [(10, 17), (26, 33)]} + detected_results = {} + + extractor.process_duplicated_elements(dupl_storage, detected_results, sequence) + + assert "Barcode" in detected_results + assert detected_results["Barcode"].seq == ExtractionResult.NOSEQ + assert detected_results["Barcode"].start == -1 + + def test_process_duplicated_disagree(self): + """Test process_duplicated_elements when copies correct to different barcodes.""" + structure = self._create_dupl_structure() + extractor = UniversalSingleMoleculeExtractor(structure) + + # Copy 1: AAAACCCC, Copy 2: GGGGTTTT (both valid but different) + sequence = "XXXXXXXXXX" + "AAAACCCC" + "XXXXXXXX" + "GGGGTTTT" + "XXXXXXXX" + dupl_storage = {"Barcode": [(10, 17), (26, 33)]} + detected_results = {} + + extractor.process_duplicated_elements(dupl_storage, detected_results, sequence) + + assert "Barcode" in detected_results + assert detected_results["Barcode"].seq == ExtractionResult.NOSEQ + assert detected_results["Barcode"].start == -1 + + def test_process_duplicated_none_detected(self): + """Test process_duplicated_elements when no copies are detected.""" + structure = self._create_dupl_structure() + extractor = UniversalSingleMoleculeExtractor(structure) + + dupl_storage = {"Barcode": [(-1, -1), (-1, -1)]} + detected_results = {} + + extractor.process_duplicated_elements(dupl_storage, detected_results, "AAAA" * 20) + + assert "Barcode" in detected_results + assert detected_results["Barcode"].seq == ExtractionResult.NOSEQ + assert detected_results["Barcode"].start == -1 + + def test_process_duplicated_one_detected(self): + """Test process_duplicated_elements when only one copy is detected.""" + structure = self._create_dupl_structure() + extractor = UniversalSingleMoleculeExtractor(structure) + + # Only copy 1 detected with AAAACCCC + sequence = "XXXXXXXXXX" + "AAAACCCC" + "XXXXXXXXXXXXXXXX" + dupl_storage = {"Barcode": [(10, 17), (-1, -1)]} + detected_results = {} + + extractor.process_duplicated_elements(dupl_storage, detected_results, sequence) + + assert "Barcode" in detected_results + result = detected_results["Barcode"] + assert result.seq == "AAAACCCC" + assert result.start == 10 + + def test_process_duplicated_no_correction_match(self): + """Test process_duplicated_elements without correction when copies match.""" + structure = self._create_dupl_structure() + extractor = UniversalSingleMoleculeExtractor(structure) + extractor.correct_sequences = False + + sequence = "XXXXXXXXXX" + "ACGTACGT" + "XXXXXXXX" + "ACGTACGT" + "XXXXXXXX" + dupl_storage = {"Barcode": [(10, 17), (26, 33)]} + detected_results = {} + + extractor.process_duplicated_elements(dupl_storage, detected_results, sequence) + + assert "Barcode" in detected_results + result = detected_results["Barcode"] + assert result.seq == "ACGTACGT" + assert result.start == 10 + + def test_process_duplicated_no_correction_differ(self): + """Test process_duplicated_elements without correction when copies differ.""" + structure = self._create_dupl_structure() + extractor = UniversalSingleMoleculeExtractor(structure) + extractor.correct_sequences = False + + sequence = "XXXXXXXXXX" + "ACGTACGT" + "XXXXXXXX" + "TGCATGCA" + "XXXXXXXX" + dupl_storage = {"Barcode": [(10, 17), (26, 33)]} + detected_results = {} + + extractor.process_duplicated_elements(dupl_storage, detected_results, sequence) + + assert "Barcode" in detected_results + result = detected_results["Barcode"] + # Should be comma-separated when they differ + assert result.seq == "ACGTACGT,TGCATGCA" + assert result.start == 10 + + def test_process_duplicated_no_correction_one_detected(self): + """Test process_duplicated_elements without correction, one copy detected.""" + structure = self._create_dupl_structure() + extractor = UniversalSingleMoleculeExtractor(structure) + extractor.correct_sequences = False + + sequence = "XXXXXXXXXX" + "ACGTACGT" + "XXXXXXXXXXXXXXXX" + dupl_storage = {"Barcode": [(10, 17), (-1, -1)]} + detected_results = {} + + extractor.process_duplicated_elements(dupl_storage, detected_results, sequence) + + assert "Barcode" in detected_results + result = detected_results["Barcode"] + assert result.seq == "ACGTACGT" + assert result.start == 10 + + def test_dupl_extraction_result_get_barcode(self): + """Test ExtractionResult.get_barcode() with duplicated element result.""" + structure = self._create_dupl_structure() + detected = {"Barcode": DetectedElement(10, 17, 14, "AAAACCCC")} + result = ExtractionResult(structure, "read_001", "+", detected) + + assert result.get_barcode() == "AAAACCCC" + assert result.has_barcode() is True + assert result.is_valid() is True + + def test_dupl_additional_attributes_exclude_barcode(self): + """Test that duplicated barcode is excluded from additional attributes.""" + structure = self._create_dupl_structure() + detected = { + "Barcode": DetectedElement(10, 17, 14, "AAAACCCC"), + "PolyT": DetectedElement(26, 40, 0) + } + result = ExtractionResult(structure, "read_001", "+", detected) + attrs = result.get_additional_attributes() + + assert "Barcode detected" not in attrs + assert "Barcode/1 detected" not in attrs + assert "Barcode/2 detected" not in attrs + assert "PolyT detected" in attrs + + if __name__ == '__main__': pytest.main([__file__, '-v']) From c94e49a36959e4e74222955e3fdde8e1be59f28d Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Sun, 1 Feb 2026 16:37:02 +0100 Subject: [PATCH 19/41] --molecule to isoquant.py --- isoquant.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/isoquant.py b/isoquant.py index 59f68cfa..c7d39e90 100755 --- a/isoquant.py +++ b/isoquant.py @@ -178,6 +178,9 @@ def add_hidden_option(*args, **kwargs): # show command only with --full-help help='TSV file mapping barcode to cell type / spot id. ' 'Format: file.tsv or file.tsv:barcode_col:spot_cols ' '(e.g., file.tsv:0:1,2,3 for multiple spot columns)') + add_additional_option_to_group(sc_args_group, "--molecule", type=str, + help='molecule definition file (MDF) for custom_sc mode; ' + 'defines molecule structure for universal barcode extraction') # ALGORITHM add_additional_option_to_group(algo_args_group, "--report_novel_unspliced", "-u", type=bool_str, @@ -531,7 +534,11 @@ def check_input_params(args): args.umi_length = 0 if args.mode.needs_barcode_calling(): - if not args.barcode_whitelist and not args.barcoded_reads: + if args.mode == IsoQuantMode.custom_sc: + if not args.molecule and not args.barcoded_reads: + logger.critical("custom_sc mode requires --molecule or --barcoded_reads") + sys.exit(IsoQuantExitCode.BARCODE_WHITELIST_MISSING) + elif not args.barcode_whitelist and not args.barcoded_reads: logger.critical("You have chosen single-cell/spatial mode %s, please specify barcode whitelist or file with " "barcoded reads" % args.mode.name) sys.exit(IsoQuantExitCode.BARCODE_WHITELIST_MISSING) @@ -607,6 +614,10 @@ def check_input_files(args): else: check_file_exists(args.barcoded_reads, "Barcoded reads file") + # Check molecule definition file + if hasattr(args, 'molecule') and args.molecule: + check_file_exists(args.molecule, "Molecule definition file") + # Check barcode whitelist files if hasattr(args, 'barcode_whitelist') and args.barcode_whitelist: for wl_file in args.barcode_whitelist: @@ -958,7 +969,8 @@ def prepare_reference_genome(args): class BarcodeCallingArgs: - def __init__(self, input, barcode_whitelist, mode, output, out_fasta, tmp_dir, threads): + def __init__(self, input, barcode_whitelist, mode, output, out_fasta, tmp_dir, threads, + molecule: str | None = None): self.input = input # Can be a single file (str) or list of files self.barcodes = barcode_whitelist self.mode = mode @@ -967,6 +979,7 @@ def __init__(self, input, barcode_whitelist, mode, output, out_fasta, tmp_dir, t self.tmp_dir = tmp_dir self.threads = threads self.min_score = None + self.molecule = molecule def call_barcodes(args): @@ -999,7 +1012,8 @@ def call_barcodes(args): bc_threads = 1 if args.mode.enforces_single_thread() else args.threads bc_args = BarcodeCallingArgs(input_files, args.barcode_whitelist, args.mode, - output_barcodes_list, output_fasta_list, sample.aux_dir, bc_threads) + output_barcodes_list, output_fasta_list, sample.aux_dir, bc_threads, + molecule=getattr(args, 'molecule', None)) # Launching barcode calling in a separate process has the following reason: # Read chunks are not cleared by the GC in the end of barcode calling, leaving the main # IsoQuant process to consume ~2,5 GB even when barcode calling is done. From e309166d1e9b29be07516a44b21636834f351f72 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Mon, 2 Feb 2026 13:16:12 +0100 Subject: [PATCH 20/41] single-cell documentation base --- docs/barcode_calling.md | 236 ++++++++++++++++++++++++++++++++++++++ docs/cmd.md | 84 ++++++++------ docs/single_cell.md | 244 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 527 insertions(+), 37 deletions(-) create mode 100644 docs/barcode_calling.md create mode 100644 docs/single_cell.md diff --git a/docs/barcode_calling.md b/docs/barcode_calling.md new file mode 100644 index 00000000..64f03a9d --- /dev/null +++ b/docs/barcode_calling.md @@ -0,0 +1,236 @@ +# Barcode calling + +IsoQuant includes a built-in barcode calling module (`detect_barcodes.py`) that extracts +cell barcodes and UMIs from raw long reads. This module supports multiple single-cell and +spatial transcriptomics platforms and can also be used as a standalone tool. + +See [single-cell and spatial transcriptomics](single_cell.md) for supported platforms, +pipeline integration, MDF format, and UMI deduplication. + +## How it works + +The barcode calling module processes each read by: + +1. Searching for known constant sequences (linkers, primers, TSO) on the read +2. Locating the polyT tail to determine read orientation +3. Extracting the barcode region based on its expected position relative to the anchoring constant sequences +4. Matching the extracted barcode against a whitelist +5. Extracting the UMI sequence from its expected position + +Each read is assigned: +- A **barcode** (cell identity) corrected against the whitelist, or `*` if no match is found +- A **UMI** (unique molecular identifier), or `*` if not detected +- A **strand** orientation (`+`, `-`, or `.` if unknown) +- Platform-specific features (polyT position, linker positions, etc.) + +## Integration with IsoQuant + +When running IsoQuant with a single-cell or spatial mode (`--mode tenX_v3`, etc.), +barcode calling is performed automatically as the first pipeline step. +The barcode detection output is a TSV file with one line per read, which is then used +throughout the rest of the pipeline for grouping and UMI deduplication. + +If barcodes have already been called externally (e.g., by Cell Ranger), +use `--barcoded_reads` to skip the barcode calling step. + +## Standalone usage + +`detect_barcodes.py` can be run independently from the IsoQuant pipeline: + +```bash +python detect_barcodes.py \ + --input reads.fastq.gz \ + --barcodes barcode_whitelist.txt \ + --mode tenX_v3 \ + --output output_prefix \ + --threads 16 +``` + +### Command line options + +`--input` or `-i` (required) + +One or more input read files in FASTQ, FASTA, BAM, or SAM format. Gzipped FASTQ/FASTA files are supported. + +`--output` or `-o` (required) + +Output prefix. The barcode calling results will be written to `.barcoded_reads.tsv`. +When multiple input files are provided, outputs are numbered: `_0.barcoded_reads.tsv`, etc. + +`--barcodes` or `-b` + +One or more barcode whitelist files. The number of files depends on the mode: + +* 1 file: `tenX_v3`, `visium_5prime`, `stereoseq`, `stereoseq_nosplit` +* 1 or 2 files: `curio` +* 2 files: `visium_hd` +* Not needed for `custom_sc` (barcodes defined in MDF file) + +`--mode` + +Barcode calling mode. Available modes: `tenX_v3`, `curio`, `stereoseq`, `stereoseq_nosplit`, +`visium_5prime`, `visium_hd`, `custom_sc`. Default: `stereoseq`. + +`--molecule` + +Path to a molecule definition file (MDF) for `custom_sc` mode. +See [MDF format](single_cell.md#molecule-definition-file-mdf-format) for the format specification. + +`--threads` or `-t` + +Number of threads for parallel processing (default: 16). + +`--min_score` + +Override the minimum alignment score for barcode matching. +The scoring system uses +1 for match, -1 for mismatch, -1 for gap open, -1 for gap extension. +By default, the minimum score is set automatically based on barcode length. + +`--tmp_dir` + +Folder for temporary files during parallel processing. + +### Example commands + +10x Genomics: +```bash +python detect_barcodes.py \ + -i reads.fastq.gz \ + -b 3M-february-2018.txt.gz \ + --mode tenX_v3 \ + -o results/my_sample \ + -t 16 +``` + +Stereo-seq: +```bash +python detect_barcodes.py \ + -i reads.fastq.gz \ + -b stereo_barcodes.txt \ + --mode stereoseq \ + -o results/stereo_sample \ + -t 16 +``` + +Curio with two barcode files: +```bash +python detect_barcodes.py \ + -i reads.fastq.gz \ + -b left_barcodes.txt right_barcodes.txt \ + --mode curio \ + -o results/curio_sample +``` + +Custom molecule: +```bash +python detect_barcodes.py \ + -i reads.fastq.gz \ + --mode custom_sc \ + --molecule my_platform.mdf \ + -o results/custom_sample +``` + +## Output format + +The main output is a TSV file (`*.barcoded_reads.tsv`) with a header line followed by one line per read. +The columns depend on the platform mode. + +### Common columns (all modes) + +| Column | Description | +|--------|-------------| +| read_id | Read identifier | +| barcode | Corrected barcode sequence, or `*` if not found | +| UMI | UMI sequence, or `*` if not detected | +| BC_score | Barcode alignment score (-1 if not matched) | +| strand | Read orientation: `+`, `-`, or `.` | + +### Additional columns by mode + +**10x Genomics** (`tenX_v3`, `visium_5prime`, `visium_hd`): + +| Column | Description | +|--------|-------------| +| polyT | Position of polyT tail start (-1 if not found) | +| R1 | Position of R1 primer end (-1 if not found) | + +**Curio** (`curio`): + +| Column | Description | +|--------|-------------| +| polyT | PolyT tail position | +| primer | PCR primer position | +| linker_start | Linker start position | +| linker_end | Linker end position | + +**Stereo-seq** (`stereoseq_nosplit`, `stereoseq`): + +| Column | Description | +|--------|-------------| +| polyT | PolyT tail position | +| primer | Primer position | +| linker_start | Linker start position | +| linker_end | Linker end position | +| TSO | TSO position (-1 if not found) | + +**Custom** (`custom_sc`): columns depend on the elements defined in the MDF file. +Each variable element produces a column with its detected sequence and score. + +### Statistics file + +A statistics file (`*.barcoded_reads.tsv.stats`) is generated alongside the TSV output, +reporting the total number of reads processed, barcodes detected, and UMIs found. + +## Supported platforms + +### 10x Genomics (`tenX_v3`, `visium_5prime`) + +Molecule structure (3' to 5'): +``` +[R1 primer]---[Barcode (16bp)]---[UMI (12bp)]---[PolyT]---[cDNA]---[TSO] +``` + +Requires 1 barcode whitelist file (e.g., the 10x `3M-february-2018.txt.gz`). + +### Visium HD (`visium_hd`) + +Molecule structure (3' to 5'): +``` +[R1 primer]---[BC_part1 (16bp)]---[separator]---[BC_part2 (15bp)]---[UMI (9bp)]---[PolyT]---[cDNA]---[TSO] +``` + +Requires 2 barcode whitelist files (part 1 and part 2). +The final barcode is the concatenation of both parts. + +### Curio Bioscience (`curio`) + +Molecule structure (3' to 5'): +``` +[PCR primer]---[Left BC (8bp)]---[Linker]---[Right BC (6bp)]---[UMI (9bp)]---[PolyT]---[cDNA] +``` + +The linker sequence anchors the barcode detection. +Accepts either 1 combined whitelist file (14bp barcodes) or 2 files (8bp left + 6bp right). + +### Stereo-seq (`stereoseq`, `stereoseq_nosplit`) + +Molecule structure (3' to 5'): +``` +[Primer]---[Barcode (25bp)]---[Linker]---[UMI (10bp)]---[PolyT]---[cDNA]---[TSO] +``` + +- `stereoseq` mode: splits concatenated reads at TSO boundaries, producing a new FASTA with individual subreads +- `stereoseq_nosplit` mode: processes reads without splitting + +### Custom platform (`custom_sc`) + +Uses a molecule definition file (MDF) to describe arbitrary molecule structures. +See [MDF format](single_cell.md#molecule-definition-file-mdf-format). + +The universal extractor: +1. Tries both forward and reverse complement orientations +2. Detects polyT tail to determine strand +3. Finds constant elements (primers, linkers) on the read +4. Extracts variable elements (barcodes, UMIs) from their expected positions +5. Corrects barcodes against whitelists when provided (`VAR_FILE` or `VAR_LIST`) +6. Handles linked elements: concatenated (`|`) parts are joined then corrected; duplicated (`/`) parts use majority vote diff --git a/docs/cmd.md b/docs/cmd.md index a9bb7aef..800bbb80 100644 --- a/docs/cmd.md +++ b/docs/cmd.md @@ -124,43 +124,6 @@ Useful for grouping single-cell/spatial data by cell type or spatial region inst **Example**: `--read_group tag:CB file_name barcode_spot` creates multi-level grouping by cell barcode tag, file name, and cell type. -### Single-cell and spatial transcriptomics options - -**NB! This functionality is not officially released yet, use it at your own risk.** - -`--mode` or `-m` -IsoQuant mode for processing single-cell or spatial transcriptomics data. Available modes: - -* `bulk` - standard bulk RNA-seq mode (default) -* `tenX_v3` - 10x Genomics single-cell 3' gene expression -* `curio` - Curio Bioscience single-cell -* `visium_hd` - 10x Genomics Visium HD spatial transcriptomics -* `visium_5prime` - 10x Genomics Visium 5' spatial transcriptomics -* `stereoseq` - Stereo-seq spatial transcriptomics (BGI) -* `stereoseq_nosplit` - Stereo-seq without barcode splitting - -Single-cell and spatial modes enable automatic barcode calling and UMI-based deduplication. - -`--barcode_whitelist` -Path to file(s) with barcode whitelist for barcode calling. -Required for single-cell/spatial modes unless `--barcoded_reads` is provided. -File should contain one barcode per line. - -`--barcoded_reads` -Path to TSV file(s) with barcoded reads. -Format: `read_idbarcodeumi` (one read per line). -If not provided, barcodes will be called automatically from raw reads using `--barcode_whitelist`. - -`--barcode_column` -Column index for barcodes in the `--barcoded_reads` file (default: 1). -Read ID column is 0, barcode column is 1, UMI column is 2. - -`--barcode2spot` -Path to TSV file(s) mapping barcodes to cell types or spatial spots. -Format: `barcodecell_type` (one barcode per line). -Used with `--read_group barcode_spot` to group counts by cell type/spot instead of individual barcodes. -Useful for reducing output dimensions from thousands of barcodes to tens of cell types. - ### Output options `--sqanti_output` @@ -200,6 +163,53 @@ Useful for reducing output dimensions from thousands of barcodes to tens of cell Align reads to the reference without running IsoQuant itself. +## Single-cell and spatial transcriptomics options + +**NB! This feature is experimental and is not part of the official IsoQuant release.** + +See [single-cell and spatial transcriptomics](single_cell.md) for a detailed guide on +supported platforms, examples, molecule definition files, and UMI deduplication. + +`--mode` or `-m` +IsoQuant mode for processing single-cell or spatial transcriptomics data. Available modes: + +* `bulk` - standard bulk RNA-seq mode (default) +* `tenX_v3` - 10x Genomics single-cell 3' gene expression +* `curio` - Curio Bioscience single-cell +* `visium_hd` - 10x Genomics Visium HD spatial transcriptomics +* `visium_5prime` - 10x Genomics Visium 5' spatial transcriptomics +* `stereoseq` - Stereo-seq spatial transcriptomics (BGI) +* `stereoseq_nosplit` - Stereo-seq without read splitting +* `custom_sc` - custom single-cell/spatial mode using a [molecule definition file](single_cell.md#molecule-definition-file-mdf-format) + +Single-cell and spatial modes enable automatic barcode calling and UMI-based deduplication. + +`--barcode_whitelist` +Path to file(s) with barcode whitelist for barcode calling. +Required for single-cell/spatial modes unless `--barcoded_reads` is provided. +File should contain one barcode per line. + +`--barcoded_reads` +Path to TSV file(s) with pre-called barcoded reads. +Format: `read_idbarcodeumi` (one read per line). +If provided, IsoQuant skips barcode calling and uses these assignments directly. + +`--barcode_column` +Column index for barcodes in the `--barcoded_reads` file (default: 1). +Read ID column is 0, barcode column is 1, UMI column is 2. + +`--barcode2spot` +Path to TSV file mapping barcodes to cell types or spatial spots. +Format: `barcodecell_type` (one barcode per line). +Used with `--read_group barcode_spot` to group counts by cell type/spot instead of individual barcodes. + +`--molecule` +Path to a molecule definition file (MDF) for `custom_sc` mode. +Defines molecule structure for universal barcode extraction from any platform. +See [MDF format](single_cell.md#molecule-definition-file-mdf-format) for details. + + + ## Algorithm parameters ### Quantification diff --git a/docs/single_cell.md b/docs/single_cell.md new file mode 100644 index 00000000..6077c6f8 --- /dev/null +++ b/docs/single_cell.md @@ -0,0 +1,244 @@ +# Single-cell and spatial transcriptomics + +**NB! This feature is experimental and is not part of the official IsoQuant release.** + +IsoQuant supports single-cell and spatial transcriptomics data from multiple platforms. +When a single-cell or spatial mode is selected, IsoQuant automatically performs +barcode calling and UMI-based PCR deduplication as part of the pipeline. + +## Overview + +The single-cell/spatial pipeline extends the standard bulk pipeline with these additional steps: + +1. **Barcode calling** -- extract cell barcodes and UMIs from raw reads +2. **Standard IsoQuant processing** -- alignment, isoform assignment, transcript model construction +3. **UMI deduplication** -- remove PCR/RT duplicates within each barcode group +4. **Grouped quantification** -- produce per-cell/per-spot count matrices + +Barcode calling is handled by the built-in [barcode calling module](barcode_calling.md), +which can also be used as a standalone tool. + +## Supported platforms + +| Mode | Platform | Barcode whitelist files | UMI length | Notes | +|------|----------|------------------------|------------|-------| +| `tenX_v3` | 10x Genomics 3' v3 | 1 | 12 | Single 16bp barcode | +| `visium_5prime` | 10x Genomics Visium 5' | 1 | 12 | Same detector as tenX_v3 | +| `visium_hd` | 10x Genomics Visium HD | 2 | 9 | Two-part barcode (16bp + 15bp) | +| `curio` | Curio Bioscience | 1 or 2 | 9 | Double barcode (8bp + 6bp) with linker | +| `stereoseq` | BGI Stereo-seq | 1 | 10 | 25bp barcode, read splitting mode | +| `stereoseq_nosplit` | BGI Stereo-seq | 1 | 10 | 25bp barcode, no read splitting | +| `custom_sc` | Any platform | 0 (uses MDF) | varies | User-defined molecule structure via [MDF file](#molecule-definition-file-mdf-format) | + +## Quick start examples + +10x Genomics single-cell: +```bash +isoquant.py --reference genome.fa --genedb genes.gtf --complete_genedb \ + --fastq reads.fastq.gz --data_type nanopore \ + --mode tenX_v3 --barcode_whitelist 3M-february-2018.txt.gz \ + -o sc_output +``` + +Stereo-seq spatial: +```bash +isoquant.py --reference genome.fa --genedb genes.gtf --complete_genedb \ + --fastq reads.fastq.gz --data_type nanopore \ + --mode stereoseq --barcode_whitelist barcodes.txt \ + -o stereo_output +``` + +Custom platform with molecule definition file: +```bash +isoquant.py --reference genome.fa --genedb genes.gtf --complete_genedb \ + --fastq reads.fastq.gz --data_type nanopore \ + --mode custom_sc --molecule molecule_definition.mdf \ + -o custom_output +``` + +Pre-called barcodes (skip barcode calling): +```bash +isoquant.py --reference genome.fa --genedb genes.gtf --complete_genedb \ + --bam aligned.bam --data_type nanopore \ + --mode tenX_v3 --barcoded_reads barcodes.tsv \ + -o sc_output +``` + +## Command line options + +`--mode` or `-m` + +IsoQuant processing mode. Available modes: + +* `bulk` -- standard bulk RNA-seq mode (default) +* `tenX_v3` -- 10x Genomics single-cell 3' gene expression +* `curio` -- Curio Bioscience single-cell +* `visium_hd` -- 10x Genomics Visium HD spatial transcriptomics +* `visium_5prime` -- 10x Genomics Visium 5' spatial transcriptomics +* `stereoseq` -- Stereo-seq spatial transcriptomics (BGI), with read splitting +* `stereoseq_nosplit` -- Stereo-seq without read splitting +* `custom_sc` -- custom single-cell/spatial mode using a molecule definition file (MDF) + +All modes except `bulk` enable automatic barcode calling and UMI-based deduplication. + +`--barcode_whitelist` + +Path to file(s) with barcode whitelist for barcode calling. +Required for all single-cell/spatial modes except `custom_sc` (unless `--barcoded_reads` is provided). +File should contain one barcode per line. Supports plain text and gzipped files. + +The number of whitelist files depends on the mode: + +* 1 file: `tenX_v3`, `visium_5prime`, `stereoseq`, `stereoseq_nosplit` +* 1 or 2 files: `curio` (1 file with combined 14bp barcodes, or 2 files with 8bp left + 6bp right) +* 2 files: `visium_hd` (part 1 and part 2 barcode lists) +* Not needed: `custom_sc` (barcode lists are specified inside the MDF file) + +`--barcoded_reads` + +Path to TSV file(s) with pre-called barcoded reads. +Format: `read_idbarcodeumi` (one read per line). +When provided, IsoQuant skips the barcode calling step and uses these assignments directly. +This is useful when barcodes have already been called by an external tool (e.g., Cell Ranger). + +`--barcode_column` + +Column index for barcodes in the `--barcoded_reads` file (default: 1). +Column 0 is the read ID, column 1 is the barcode, column 2 is the UMI. + +`--barcode2spot` + +Path to TSV file mapping barcodes to cell types or spatial spots. +Format: `barcodecell_type` (one barcode per line). +Supports extended format: `file.tsv:barcode_col:spot_cols` (e.g., `mapping.tsv:0:1,2,3` for multiple spot columns). + +Used with `--read_group barcode_spot` to group counts by cell type or spatial region +instead of individual barcodes, reducing output from thousands of barcodes to +a manageable number of cell types or regions. + +`--molecule` + +Path to a molecule definition file (MDF) for `custom_sc` mode. +This file defines the structure of the sequencing molecule (barcodes, UMIs, linkers, polyT, cDNA) +and allows IsoQuant to process reads from any single-cell or spatial platform without code changes. +See the [MDF format](#molecule-definition-file-mdf-format) section below for details. + +## Molecule definition file (MDF) format + +The MDF format allows users to describe the structure of their sequencing molecule +so that IsoQuant can extract barcodes and UMIs from any platform. +The molecule is described in the 3' to 5' direction (primer end first, cDNA last). + +An MDF file has two parts: + +1. **Header line**: colon-separated list of element names defining the order of elements on the molecule (3' to 5'). +2. **Element definitions**: one line per element, tab-separated: `name type value [length]`. + +### Element types + +| Type | Description | Value field | +|------|-------------|-------------| +| `CONST` | Constant/known sequence (primer, linker, TSO) | DNA sequence | +| `VAR_FILE` | Variable sequence matched against a whitelist file | Path to TSV file | +| `VAR_LIST` | Variable sequence matched against an inline list | Comma-separated sequences | +| `VAR_ANY` | Variable-length sequence extracted as-is | Length (integer) | +| `PolyT` | PolyT/PolyA tract | (none) | +| `cDNA` | cDNA region | (none) | + +### Barcode and UMI identification + +Elements are identified as barcodes or UMIs by their name prefix (case-insensitive): +- Names starting with `barcode` are treated as barcode elements (e.g., `Barcode`, `barcode1`) +- Names starting with `umi` are treated as UMI elements (e.g., `UMI`, `umi1`) + +When multiple barcode (or UMI) elements are present, their sequences are concatenated +in the order they appear. + +### Examples + +**10x-like molecule** (3' to 5'): +``` +R1:Barcode:UMI:PolyT:cDNA:TSO +R1 CONST CTACACGACGCTCTTCCGATCT +Barcode VAR_FILE barcodes.tsv +UMI VAR_ANY 12 +TSO CONST CCCATGTACTCTGCGTTGATACCACTGCTT +``` + +**Simple molecule with inline barcodes:** +``` +Barcode:UMI:PolyT:cDNA +Barcode VAR_LIST AAACCCGGGTTTAAAC,TTTGGGCCCAAATTTG,GGGGAAAACCCCTTTT +UMI VAR_ANY 10 +``` + +### Linked elements for multi-part barcodes + +When a barcode or UMI is split into multiple parts on the molecule, use linked element notation. +All linked parts must be of the same variable type and reference the same whitelist. +Parts are numbered consecutively starting from 1. + +**Concatenated** (`|`): parts of a single barcode separated by other elements. +Parts are extracted independently, then concatenated and corrected as one sequence against the full whitelist. +``` +Barcode|1:Linker:Barcode|2:UMI:PolyT:cDNA +Barcode|1 VAR_FILE barcodes.tsv 8 +Linker CONST TCTTCAGCGTTCCCGAGA +Barcode|2 VAR_FILE barcodes.tsv 6 +UMI VAR_ANY 9 +``` + +**Duplicated** (`/`): multiple redundant copies of the same barcode on the molecule. +Each copy is corrected independently against the whitelist; +a majority vote determines the final barcode. +``` +Barcode/1:Barcode/2:UMI:PolyT:cDNA +Barcode/1 VAR_FILE barcodes.tsv 16 +Barcode/2 VAR_FILE barcodes.tsv 16 +UMI VAR_ANY 12 +``` + +## UMI deduplication + +All single-cell and spatial modes perform UMI-based PCR deduplication after isoform assignment. +Within each cell barcode and gene, reads with similar UMIs (within edit distance 3 by default) +are collapsed into a single representative read. + +The representative read is selected based on: +1. Unique isoform assignment over ambiguous +2. More exons +3. Longer transcript alignment + +## Output + +### Count matrices + +Single-cell/spatial modes produce grouped count matrices in addition to the standard IsoQuant output. +Use `--counts_format` to control the output format: + +* `default` -- automatic selection: matrix format for small numbers of groups (<=100), MTX for larger datasets +* `matrix` -- standard matrix format with genes/transcripts as rows and barcodes as columns +* `mtx` -- Matrix Market (MTX) format compatible with Seurat and Scanpy +* `none` -- no conversion (only internal linear format is produced) + +Grouped counts can also be converted after the run using `src/convert_grouped_counts.py`. + +### Grouping reads + +Use `--read_group` to control how reads are grouped for quantification. +Multiple grouping strategies can be combined (space-separated), producing separate count tables for each. + +Common configurations for single-cell/spatial data: + +```bash +# Group by barcode (from BAM CB tag) +--read_group tag:CB + +# Group by barcode and also by cell type +--read_group tag:CB barcode_spot + +# Group by barcode from barcoded_reads file and by file name +--read_group barcode file_name +``` + +See the [read grouping options](cmd.md#other-input-options) for the full list of grouping strategies. From f495136244928bd9c091af5a12365c739d486187 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Mon, 2 Feb 2026 17:15:27 +0100 Subject: [PATCH 21/41] fix header function --- src/barcode_calling/callers/base.py | 36 ++++++++------------------- src/barcode_calling/callers/curio.py | 11 +++++--- src/barcode_calling/callers/stereo.py | 10 +++++--- src/barcode_calling/callers/tenx.py | 11 +++++--- 4 files changed, 30 insertions(+), 38 deletions(-) diff --git a/src/barcode_calling/callers/base.py b/src/barcode_calling/callers/base.py index e8a1c77a..5da16187 100644 --- a/src/barcode_calling/callers/base.py +++ b/src/barcode_calling/callers/base.py @@ -147,12 +147,8 @@ def __str__(self) -> str: return "%s\t%s\t%s\t%d\t%s\t%s" % (self.read_id, self._barcode, self._umi, self.BC_score, self.UMI_good, self.strand) - def header(self) -> str: - """Get TSV header for result output.""" - return BarcodeDetectionResult._header() - @staticmethod - def _header() -> str: + def header() -> str: """Static header for class-level access.""" return "#read_id\tbarcode\tUMI\tBC_score\tvalid_UMI\tstrand" @@ -219,14 +215,10 @@ def __str__(self) -> str: return (BarcodeDetectionResult.__str__(self) + "\t%d\t%d\t%d\t%d" % (self.polyT, self.primer, self.linker_start, self.linker_end)) - def header(self) -> str: - """Get TSV header for result output.""" - return LinkerBarcodeDetectionResult._header() - @staticmethod - def _header() -> str: + def header() -> str: """Static header for class-level access.""" - return BarcodeDetectionResult._header() + "\tpolyT_start\tprimer_end\tlinker_start\tlinker_end" + return BarcodeDetectionResult.header() + "\tpolyT_start\tprimer_end\tlinker_start\tlinker_end" class TSOBarcodeDetectionResult(LinkerBarcodeDetectionResult): @@ -259,16 +251,11 @@ def get_additional_attributes(self) -> List[str]: attr.append("Linker detected") if self.tso5 != -1: attr.append("TSO detected") - return attr - - def header(self) -> str: - """Get TSV header for result output.""" - return TSOBarcodeDetectionResult._header() @staticmethod - def _header() -> str: + def header() -> str: """Static header for class-level access.""" - return LinkerBarcodeDetectionResult._header() + "\tTSO5" + return LinkerBarcodeDetectionResult.header() + "\tTSO5" class TenXBarcodeDetectionResult(BarcodeDetectionResult): @@ -305,14 +292,10 @@ def __str__(self) -> str: return (BarcodeDetectionResult.__str__(self) + "\t%d\t%d" % (self.polyT, self.r1)) - def header(self) -> str: - """Get TSV header for result output.""" - return TenXBarcodeDetectionResult._header() - @staticmethod - def _header() -> str: + def header() -> str: """Static header for class-level access.""" - return BarcodeDetectionResult._header() + "\tpolyT_start\tR1_end" + return BarcodeDetectionResult.header() + "\tpolyT_start\tR1_end" class SplittingBarcodeDetectionResult: @@ -398,9 +381,10 @@ def __str__(self) -> str: """Format all patterns as TSV lines.""" return "\n".join(str(r) for r in self.detected_patterns) - def header(self) -> str: + @staticmethod + def header() -> str: """Get TSV header for result output.""" - return TSOBarcodeDetectionResult._header() + return TSOBarcodeDetectionResult.header() class ReadStats: diff --git a/src/barcode_calling/callers/curio.py b/src/barcode_calling/callers/curio.py index feb2edc4..c4dfe645 100644 --- a/src/barcode_calling/callers/curio.py +++ b/src/barcode_calling/callers/curio.py @@ -193,8 +193,9 @@ def _find_barcode_umi_fwd(self, read_id: str, sequence: str) -> LinkerBarcodeDet def result_type(): return LinkerBarcodeDetectionResult - def header(self): - return self.result_type()._header() + @classmethod + def header(cls): + return cls.result_type().header() class CurioIlluminaDetector: @@ -333,5 +334,7 @@ def _find_barcode_umi_fwd(self, read_id: str, sequence: str) -> LinkerBarcodeDet def result_type(): return LinkerBarcodeDetectionResult - def header(self): - return self.result_type()._header() \ No newline at end of file + @classmethod + def header(cls): + return cls.result_type().header() + \ No newline at end of file diff --git a/src/barcode_calling/callers/stereo.py b/src/barcode_calling/callers/stereo.py index 953f96e2..0d2a42c4 100644 --- a/src/barcode_calling/callers/stereo.py +++ b/src/barcode_calling/callers/stereo.py @@ -203,8 +203,9 @@ def _find_barcode_umi_fwd(self, read_id: str, sequence: str) -> LinkerBarcodeDet def result_type(): return LinkerBarcodeDetectionResult - def header(self): - return self.result_type().header() + @classmethod + def header(cls): + return cls.result_type().header() class SharedMemoryStereoBarcodeDetector(StereoBarcodeDetector): @@ -493,8 +494,9 @@ def _find_barcode_umi_fwd(self, read_id: str, sequence: str) -> TSOBarcodeDetect def result_type(): return SplittingBarcodeDetectionResult - def header(self): - return self.result_type().header() + @classmethod + def header(cls): + return cls.result_type().header() class SharedMemoryStereoSplittingBarcodeDetector(StereoSplittingBarcodeDetector): diff --git a/src/barcode_calling/callers/tenx.py b/src/barcode_calling/callers/tenx.py index e506b63d..afe74469 100644 --- a/src/barcode_calling/callers/tenx.py +++ b/src/barcode_calling/callers/tenx.py @@ -175,8 +175,10 @@ def find_barcode_umi_no_polya(self, read_id: str, sequence: str) -> TenXBarcodeD def result_type(): return TenXBarcodeDetectionResult - def header(self): - return self.result_type()._header() + @classmethod + def header(cls): + return cls.result_type().header() + class VisiumHDBarcodeDetector: """Visium HD spatial transcriptomics barcode detector.""" @@ -347,5 +349,6 @@ def find_barcode_umi_no_polya(self, read_id: str, sequence: str) -> TenXBarcodeD def result_type(): return TenXBarcodeDetectionResult - def header(self): - return self.result_type()._header() \ No newline at end of file + @classmethod + def header(cls): + return cls.result_type().header() From daf0b720b845a340838a165a5b7765c027c14059 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Mon, 2 Feb 2026 19:19:19 +0100 Subject: [PATCH 22/41] fix attr --- src/barcode_calling/callers/base.py | 1 + src/barcode_calling/callers/curio.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/barcode_calling/callers/base.py b/src/barcode_calling/callers/base.py index 5da16187..d9d3714c 100644 --- a/src/barcode_calling/callers/base.py +++ b/src/barcode_calling/callers/base.py @@ -251,6 +251,7 @@ def get_additional_attributes(self) -> List[str]: attr.append("Linker detected") if self.tso5 != -1: attr.append("TSO detected") + return attr @staticmethod def header() -> str: diff --git a/src/barcode_calling/callers/curio.py b/src/barcode_calling/callers/curio.py index c4dfe645..edc2542c 100644 --- a/src/barcode_calling/callers/curio.py +++ b/src/barcode_calling/callers/curio.py @@ -337,4 +337,4 @@ def result_type(): @classmethod def header(cls): return cls.result_type().header() - \ No newline at end of file + From fb6b84f2968ebf6cb34868813028e9d07f71293f Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Tue, 3 Feb 2026 14:29:20 +0100 Subject: [PATCH 23/41] man cosmetics, rename detect_barcodes.py --- docs/barcode_calling.md | 57 ++++---- docs/cmd.md | 52 ++++--- docs/single_cell.md | 133 ++++++++++-------- isoquant.py | 8 +- ...barcodes.py => isoquant_detect_barcodes.py | 3 +- misc/assess_barcode_quality.py | 2 +- .../barcode_test_templates/stereo_single.cfg | 4 +- .../barcode_test_templates/stereo_split.cfg | 2 +- tests/github/run_barcode_test.py | 16 +-- 9 files changed, 147 insertions(+), 130 deletions(-) rename detect_barcodes.py => isoquant_detect_barcodes.py (99%) diff --git a/docs/barcode_calling.md b/docs/barcode_calling.md index 64f03a9d..13e83432 100644 --- a/docs/barcode_calling.md +++ b/docs/barcode_calling.md @@ -1,7 +1,7 @@ # Barcode calling -IsoQuant includes a built-in barcode calling module (`detect_barcodes.py`) that extracts -cell barcodes and UMIs from raw long reads. This module supports multiple single-cell and +IsoQuant includes a built-in barcode calling module (`isoquant_detect_barcodes.py`) that extracts + barcodes and UMIs from raw long reads. This module supports multiple single-cell and spatial transcriptomics platforms and can also be used as a standalone tool. See [single-cell and spatial transcriptomics](single_cell.md) for supported platforms, @@ -18,14 +18,14 @@ The barcode calling module processes each read by: 5. Extracting the UMI sequence from its expected position Each read is assigned: -- A **barcode** (cell identity) corrected against the whitelist, or `*` if no match is found +- A **barcode** (cell/spot identity) corrected against the whitelist, or `*` if no match is found - A **UMI** (unique molecular identifier), or `*` if not detected - A **strand** orientation (`+`, `-`, or `.` if unknown) - Platform-specific features (polyT position, linker positions, etc.) ## Integration with IsoQuant -When running IsoQuant with a single-cell or spatial mode (`--mode tenX_v3`, etc.), +When running IsoQuant with any single-cell or spatial mode, barcode calling is performed automatically as the first pipeline step. The barcode detection output is a TSV file with one line per read, which is then used throughout the rest of the pipeline for grouping and UMI deduplication. @@ -35,10 +35,10 @@ use `--barcoded_reads` to skip the barcode calling step. ## Standalone usage -`detect_barcodes.py` can be run independently from the IsoQuant pipeline: +`isoquant_detect_barcodes.py` can be run independently: ```bash -python detect_barcodes.py \ +python isoquant_detect_barcodes.py \ --input reads.fastq.gz \ --barcodes barcode_whitelist.txt \ --mode tenX_v3 \ @@ -50,7 +50,7 @@ python detect_barcodes.py \ `--input` or `-i` (required) -One or more input read files in FASTQ, FASTA, BAM, or SAM format. Gzipped FASTQ/FASTA files are supported. +One or more input read files in FASTQ, FASTA or BAM format. Gzipped FASTQ/FASTA files are supported. `--output` or `-o` (required) @@ -61,19 +61,25 @@ When multiple input files are provided, outputs are numbered: `_0.barcod One or more barcode whitelist files. The number of files depends on the mode: -* 1 file: `tenX_v3`, `visium_5prime`, `stereoseq`, `stereoseq_nosplit` -* 1 or 2 files: `curio` +* 1 file: `tenX_v3`, `visium_5prime`, `stereoseq`, `stereoseq_nosplit`, `curio` * 2 files: `visium_hd` * Not needed for `custom_sc` (barcodes defined in MDF file) +File should contain one barcode sequence per line. +More than 1 tab-separated column is allowed, but only the first will be used. +Supports plain text and gzipped files. + +_Note: barcode calling is performed much better if the whitelist contains a small number of barcodes. +If you have a subset of barcodes from short-read data, provide them instead of the full whitelist._ + `--mode` Barcode calling mode. Available modes: `tenX_v3`, `curio`, `stereoseq`, `stereoseq_nosplit`, -`visium_5prime`, `visium_hd`, `custom_sc`. Default: `stereoseq`. +`visium_5prime`, `visium_hd`, `custom_sc`. `--molecule` - -Path to a molecule definition file (MDF) for `custom_sc` mode. +Path to a molecule description format (MDF) file for `custom_sc` mode. +This file defines the structure of the sequencing molecule (barcodes, UMIs, linkers, polyT, cDNA)ÑŽ See [MDF format](single_cell.md#molecule-definition-file-mdf-format) for the format specification. `--threads` or `-t` @@ -94,7 +100,7 @@ Folder for temporary files during parallel processing. 10x Genomics: ```bash -python detect_barcodes.py \ +python isoquant_detect_barcodes.py \ -i reads.fastq.gz \ -b 3M-february-2018.txt.gz \ --mode tenX_v3 \ @@ -104,7 +110,7 @@ python detect_barcodes.py \ Stereo-seq: ```bash -python detect_barcodes.py \ +python isoquant_detect_barcodes.py \ -i reads.fastq.gz \ -b stereo_barcodes.txt \ --mode stereoseq \ @@ -112,25 +118,16 @@ python detect_barcodes.py \ -t 16 ``` -Curio with two barcode files: -```bash -python detect_barcodes.py \ - -i reads.fastq.gz \ - -b left_barcodes.txt right_barcodes.txt \ - --mode curio \ - -o results/curio_sample -``` - Custom molecule: ```bash -python detect_barcodes.py \ +python isoquant_detect_barcodes.py \ -i reads.fastq.gz \ --mode custom_sc \ --molecule my_platform.mdf \ -o results/custom_sample ``` -## Output format +## Output The main output is a TSV file (`*.barcoded_reads.tsv`) with a header line followed by one line per read. The columns depend on the platform mode. @@ -187,7 +184,7 @@ reporting the total number of reads processed, barcodes detected, and UMIs found Molecule structure (3' to 5'): ``` -[R1 primer]---[Barcode (16bp)]---[UMI (12bp)]---[PolyT]---[cDNA]---[TSO] +[R1 primer] [Barcode (16bp)] [UMI (12bp)] [PolyT] [cDNA] [TSO] ``` Requires 1 barcode whitelist file (e.g., the 10x `3M-february-2018.txt.gz`). @@ -196,7 +193,7 @@ Requires 1 barcode whitelist file (e.g., the 10x `3M-february-2018.txt.gz`). Molecule structure (3' to 5'): ``` -[R1 primer]---[BC_part1 (16bp)]---[separator]---[BC_part2 (15bp)]---[UMI (9bp)]---[PolyT]---[cDNA]---[TSO] +[R1 primer] [UMI (9bp)] [BC_part1 (15-16bp)] [BC_part2 (14-15bp)] VV [PolyT] [cDNA] [TSO] ``` Requires 2 barcode whitelist files (part 1 and part 2). @@ -206,17 +203,17 @@ The final barcode is the concatenation of both parts. Molecule structure (3' to 5'): ``` -[PCR primer]---[Left BC (8bp)]---[Linker]---[Right BC (6bp)]---[UMI (9bp)]---[PolyT]---[cDNA] +[PCR primer] [Left BC (8bp)] [Linker] [Right BC (6bp)] [UMI (9bp)] [PolyT] [cDNA] ``` The linker sequence anchors the barcode detection. -Accepts either 1 combined whitelist file (14bp barcodes) or 2 files (8bp left + 6bp right). +Accepts 1 combined whitelist file (14bp barcodes). ### Stereo-seq (`stereoseq`, `stereoseq_nosplit`) Molecule structure (3' to 5'): ``` -[Primer]---[Barcode (25bp)]---[Linker]---[UMI (10bp)]---[PolyT]---[cDNA]---[TSO] +[Primer] [Barcode (25bp)] [Linker] [UMI (10bp)] [PolyT] [cDNA] [TSO] ``` - `stereoseq` mode: splits concatenated reads at TSO boundaries, producing a new FASTA with individual subreads diff --git a/docs/cmd.md b/docs/cmd.md index 800bbb80..462d530e 100644 --- a/docs/cmd.md +++ b/docs/cmd.md @@ -96,7 +96,7 @@ Use this option at your own risk. Input file names are used as labels if not set. `--read_group` - Sets one or more ways to group feature counts (e.g. by cell type, file, or barcode). + Sets one or more ways to group feature counts (e.g. by cell type, file name, BAM tag or barcode). Multiple grouping strategies can be combined (space-separated). Available grouping options: @@ -106,8 +106,8 @@ If multiple BAM/FASTQ files are provided and `--read_group` option is not set, I by default. * `tag:TAG` - groups reads by BAM file read tag, where `TAG` is the tag name -(e.g. `tag:CB` uses `CB` tag values as groups, commonly used for cell barcodes in single-cell data). - +(e.g. `tag:RG` uses `RG` tag values as groups, commonly used for read group information). +q * `read_id:DELIM` - groups reads by read name suffix, where `DELIM` is the symbol/string by which the read id will be split (e.g. if `DELIM` is `_`, for read `m54158_180727_042959_59310706_ccs_NEU` the group will be `NEU`). @@ -117,11 +117,16 @@ where `FILE` is the file path, `READ_COL` is column with read ids (default: 0), `GROUP_COL(S)` is column(s) with group ids (default: 1; use comma-separated columns for multi-column grouping, e.g., `1,2,3`), `DELIM` is separator symbol (default: tab). File can be gzipped. - * `barcode_spot` or `barcode_spot:FILE` - maps barcodes to spots/cell types. -Uses barcode-to-spot mapping from `--barcode2spot` file by default, or from explicit `FILE` if specified. + * `barcode_spot` - group by barcode properties, such as spots/cell types. +Set barcode-to-spot mapping via `--barcode2spot` option. Useful for grouping single-cell/spatial data by cell type or spatial region instead of individual barcodes. -**Example**: `--read_group tag:CB file_name barcode_spot` creates multi-level grouping by cell barcode tag, file name, and cell type. +* `barcode` - groups reads by cell barcode. +When the number of barcodes is large, grouping counts by individual barcodes may take a long time. +It is recommended to use `--barcode2spot` file instead. + +**Example**: `--read_group tag:RG file_name barcode_spot` creates multi-level grouping by read group tag, +original file name, and barcode property (e.g. cell type). ### Output options @@ -168,43 +173,52 @@ Useful for grouping single-cell/spatial data by cell type or spatial region inst **NB! This feature is experimental and is not part of the official IsoQuant release.** See [single-cell and spatial transcriptomics](single_cell.md) for a detailed guide on -supported platforms, examples, molecule definition files, and UMI deduplication. +supported platforms, examples, molecule description format, and UMI deduplication. `--mode` or `-m` IsoQuant mode for processing single-cell or spatial transcriptomics data. Available modes: * `bulk` - standard bulk RNA-seq mode (default) * `tenX_v3` - 10x Genomics single-cell 3' gene expression -* `curio` - Curio Bioscience single-cell -* `visium_hd` - 10x Genomics Visium HD spatial transcriptomics * `visium_5prime` - 10x Genomics Visium 5' spatial transcriptomics -* `stereoseq` - Stereo-seq spatial transcriptomics (BGI) +* `visium_hd` - 10x Genomics Visium HD spatial transcriptomics +* `curio` - Curio Bioscience spatial data +* `stereoseq` - Stereo-seq spatial data * `stereoseq_nosplit` - Stereo-seq without read splitting * `custom_sc` - custom single-cell/spatial mode using a [molecule definition file](single_cell.md#molecule-definition-file-mdf-format) Single-cell and spatial modes enable automatic barcode calling and UMI-based deduplication. `--barcode_whitelist` -Path to file(s) with barcode whitelist for barcode calling. +Path to file(s) with barcode whitelist(s) for barcode calling. Required for single-cell/spatial modes unless `--barcoded_reads` is provided. -File should contain one barcode per line. + +File should contain one barcode sequence per line. +More than 1 tab-separated column is allowed, but only the first will be used. +Supports plain text and gzipped files. + +_Note: barcode calling is performed much better if the whitelist contains a small number of barcodes. +If you have a subset of barcodes from short-read data, provide them instead of the full whitelist._ `--barcoded_reads` Path to TSV file(s) with pre-called barcoded reads. Format: `read_idbarcodeumi` (one read per line). If provided, IsoQuant skips barcode calling and uses these assignments directly. +More than 3 columns are allowed, but only the first 3 will be used. -`--barcode_column` -Column index for barcodes in the `--barcoded_reads` file (default: 1). -Read ID column is 0, barcode column is 1, UMI column is 2. +Note! IsoQuant does not read barcodes or UMIs from BAM file tags. `--barcode2spot` -Path to TSV file mapping barcodes to cell types or spatial spots. -Format: `barcodecell_type` (one barcode per line). -Used with `--read_group barcode_spot` to group counts by cell type/spot instead of individual barcodes. +Path to TSV file mapping barcodes to cell types, spatial spots, or other barcode properties. +By default, barcode is in the first column, cell type in the second. +However, you can specify one or more columns via colon symbol (similar to `--read_group`): +`file.tsv:barcode_column:spot_column(s)` (e.g., `cell_types.tsv:0:1,2,3` for multiple barcode properties). + +When `--barcode2spot` is set, `--read_group barcode_spot` will be set automatically +to group counts by cell type, spatial regions, or other provided properties. `--molecule` -Path to a molecule definition file (MDF) for `custom_sc` mode. +Path to a molecule description file (MDF) for `custom_sc` mode. Defines molecule structure for universal barcode extraction from any platform. See [MDF format](single_cell.md#molecule-definition-file-mdf-format) for details. diff --git a/docs/single_cell.md b/docs/single_cell.md index 6077c6f8..aea0438e 100644 --- a/docs/single_cell.md +++ b/docs/single_cell.md @@ -10,24 +10,28 @@ barcode calling and UMI-based PCR deduplication as part of the pipeline. The single-cell/spatial pipeline extends the standard bulk pipeline with these additional steps: -1. **Barcode calling** -- extract cell barcodes and UMIs from raw reads -2. **Standard IsoQuant processing** -- alignment, isoform assignment, transcript model construction +1. **Barcode calling** -- extract cell/spot barcodes and UMIs from raw reads +2. **Standard IsoQuant processing** -- alignment, read-to-isoform assignment 3. **UMI deduplication** -- remove PCR/RT duplicates within each barcode group 4. **Grouped quantification** -- produce per-cell/per-spot count matrices +Note: UMI deduplication relies on read-to-gene assignment. Reads that are not assigned to any gene are discarded. +Hence, novel gene discovery will not be performed in single-cell/spatial mode. +We recommend using `bulk` mode for novel gene and transcript discovery. + Barcode calling is handled by the built-in [barcode calling module](barcode_calling.md), which can also be used as a standalone tool. ## Supported platforms -| Mode | Platform | Barcode whitelist files | UMI length | Notes | -|------|----------|------------------------|------------|-------| -| `tenX_v3` | 10x Genomics 3' v3 | 1 | 12 | Single 16bp barcode | -| `visium_5prime` | 10x Genomics Visium 5' | 1 | 12 | Same detector as tenX_v3 | -| `visium_hd` | 10x Genomics Visium HD | 2 | 9 | Two-part barcode (16bp + 15bp) | -| `curio` | Curio Bioscience | 1 or 2 | 9 | Double barcode (8bp + 6bp) with linker | -| `stereoseq` | BGI Stereo-seq | 1 | 10 | 25bp barcode, read splitting mode | -| `stereoseq_nosplit` | BGI Stereo-seq | 1 | 10 | 25bp barcode, no read splitting | +| Mode | Platform | Barcode whitelist files | UMI length | Notes | +|------|----------|--------------------|------------|--------------------------------------------------------------------------------------| +| `tenX_v3` | 10x Genomics 3' v3 | 1 | 12 | Single 16bp barcode | +| `visium_5prime` | 10x Genomics Visium 5' | 1 | 12 | Same detector as tenX_v3 | +| `visium_hd` | 10x Genomics Visium HD | 2 | 9 | Two barcodes (15/16bp + 14/15bp) | +| `curio` | Curio Bioscience | 1 | 9 | Double barcode (8bp + 6bp) with linker | +| `stereoseq` | Stereo-seq | 1 | 10 | 25bp barcode, read splitting mode | +| `stereoseq_nosplit` | Stereo-seq | 1 | 10 | 25bp barcode, no read splitting | | `custom_sc` | Any platform | 0 (uses MDF) | varies | User-defined molecule structure via [MDF file](#molecule-definition-file-mdf-format) | ## Quick start examples @@ -81,49 +85,48 @@ IsoQuant processing mode. Available modes: All modes except `bulk` enable automatic barcode calling and UMI-based deduplication. + `--barcode_whitelist` +Path to file(s) with barcode whitelist(s) for barcode calling. +Required for single-cell/spatial modes unless `--barcoded_reads` is provided. + +File should contain one barcode sequence per line. +More than 1 tab-separated column is allowed, but only the first will be used. +Supports plain text and gzipped files. -Path to file(s) with barcode whitelist for barcode calling. -Required for all single-cell/spatial modes except `custom_sc` (unless `--barcoded_reads` is provided). -File should contain one barcode per line. Supports plain text and gzipped files. +Note: barcode calling is performed much better if the whitelist contains a small number of barcodes. +If you have a subset of barcodes from short-read data, provide them instead of the full whitelist. The number of whitelist files depends on the mode: -* 1 file: `tenX_v3`, `visium_5prime`, `stereoseq`, `stereoseq_nosplit` -* 1 or 2 files: `curio` (1 file with combined 14bp barcodes, or 2 files with 8bp left + 6bp right) +* 1 file: `tenX_v3`, `visium_5prime`, `stereoseq`, `stereoseq_nosplit`, `curio` (combined 14bp barcodes) * 2 files: `visium_hd` (part 1 and part 2 barcode lists) * Not needed: `custom_sc` (barcode lists are specified inside the MDF file) `--barcoded_reads` - Path to TSV file(s) with pre-called barcoded reads. Format: `read_idbarcodeumi` (one read per line). -When provided, IsoQuant skips the barcode calling step and uses these assignments directly. -This is useful when barcodes have already been called by an external tool (e.g., Cell Ranger). - -`--barcode_column` +If provided, IsoQuant skips barcode calling and uses these assignments directly. +More than 3 columns are allowed, but only the first 3 will be used. -Column index for barcodes in the `--barcoded_reads` file (default: 1). -Column 0 is the read ID, column 1 is the barcode, column 2 is the UMI. +Note! IsoQuant does not read barcodes or UMIs from BAM file tags. `--barcode2spot` +Path to TSV file mapping barcodes to cell types, spatial spots, or other barcode properties. +By default, barcode is in the first column, cell type in the second. +However, you can specify one or more columns via colon symbol (similar to `--read_group`): +`file.tsv:barcode_column:spot_column(s)` (e.g., `cell_types.tsv:0:1,2,3` for multiple barcode properties). -Path to TSV file mapping barcodes to cell types or spatial spots. -Format: `barcodecell_type` (one barcode per line). -Supports extended format: `file.tsv:barcode_col:spot_cols` (e.g., `mapping.tsv:0:1,2,3` for multiple spot columns). - -Used with `--read_group barcode_spot` to group counts by cell type or spatial region -instead of individual barcodes, reducing output from thousands of barcodes to -a manageable number of cell types or regions. +When `--barcode2spot` is set, `--read_group barcode_spot` will be set automatically +to group counts by cell type, spatial regions, or other provided properties. `--molecule` - -Path to a molecule definition file (MDF) for `custom_sc` mode. +Path to a molecule description format (MDF) file for `custom_sc` mode. This file defines the structure of the sequencing molecule (barcodes, UMIs, linkers, polyT, cDNA) -and allows IsoQuant to process reads from any single-cell or spatial platform without code changes. +and allows IsoQuant to process reads from any single-cell or spatial platform. See the [MDF format](#molecule-definition-file-mdf-format) section below for details. -## Molecule definition file (MDF) format +## Molecule description format (MDF) The MDF format allows users to describe the structure of their sequencing molecule so that IsoQuant can extract barcodes and UMIs from any platform. @@ -136,14 +139,18 @@ An MDF file has two parts: ### Element types -| Type | Description | Value field | -|------|-------------|-------------| -| `CONST` | Constant/known sequence (primer, linker, TSO) | DNA sequence | -| `VAR_FILE` | Variable sequence matched against a whitelist file | Path to TSV file | -| `VAR_LIST` | Variable sequence matched against an inline list | Comma-separated sequences | -| `VAR_ANY` | Variable-length sequence extracted as-is | Length (integer) | -| `PolyT` | PolyT/PolyA tract | (none) | -| `cDNA` | cDNA region | (none) | +| Type | Description | Value field | +|------|-------------------------------------------------------------------------------|--------------------------| +| `CONST` | Constant/known sequence (primer, linker, TSO) | Sequence | +| `VAR_FILE` | Variable sequence matched against a whitelist file | Path to TSV file | +| `VAR_LIST` | Variable sequence matched against an inline list | Comma-separated sequences | +| `VAR_ANY` | Variable-length sequence extracted as-is | Length (integer) | +| `VAR_ANY_SEPARATOR` | Fixed-length variable separator sequence (not extracted) | Length (integer) | +| `VAR_ANY_NON_T_SEPARATOR` | Fixed-length variable separator sequence without T nucleoties (not extracted) | Length (integer) | +| `PolyT` | PolyT tail | (none) | +| `cDNA` | cDNA region | (none) | + +At the moment, only a single `cDNA` and a single `PolyT` are supported. ### Barcode and UMI identification @@ -156,7 +163,7 @@ in the order they appear. ### Examples -**10x-like molecule** (3' to 5'): +**10x 3' single-cell v3**: ``` R1:Barcode:UMI:PolyT:cDNA:TSO R1 CONST CTACACGACGCTCTTCCGATCT @@ -165,43 +172,51 @@ UMI VAR_ANY 12 TSO CONST CCCATGTACTCTGCGTTGATACCACTGCTT ``` -**Simple molecule with inline barcodes:** +**10x 3' single-cell v3** (inline barcodes): ``` -Barcode:UMI:PolyT:cDNA -Barcode VAR_LIST AAACCCGGGTTTAAAC,TTTGGGCCCAAATTTG,GGGGAAAACCCCTTTT -UMI VAR_ANY 10 +R1:Barcode:UMI:PolyT:cDNA:TSO +R1 CONST CTACACGACGCTCTTCCGATCT +Barcode VAR_FILE AAACCCGGGTTTAAAC,TTTGGGCCCAAATTTG,GGGGAAAACCCCTTTT +UMI VAR_ANY 12 +TSO CONST CCCATGTACTCTGCGTTGATACCACTGCTT ``` ### Linked elements for multi-part barcodes -When a barcode or UMI is split into multiple parts on the molecule, use linked element notation. +When a barcode is split into multiple parts in the molecule, use linked element notation. All linked parts must be of the same variable type and reference the same whitelist. Parts are numbered consecutively starting from 1. -**Concatenated** (`|`): parts of a single barcode separated by other elements. +**Concatenated** (`|`): parts of the barcode separated by other elements. Parts are extracted independently, then concatenated and corrected as one sequence against the full whitelist. +Such structure appears in, for example, Curio Bioscience single-cell protocol: + ``` -Barcode|1:Linker:Barcode|2:UMI:PolyT:cDNA +PCR_PRIMER:Barcode|1:Linker:Barcode|2:UMI:PolyT:cDNA +PCR_PRIMER TACACGACGCTCTTCCGATCT Barcode|1 VAR_FILE barcodes.tsv 8 Linker CONST TCTTCAGCGTTCCCGAGA Barcode|2 VAR_FILE barcodes.tsv 6 UMI VAR_ANY 9 ``` -**Duplicated** (`/`): multiple redundant copies of the same barcode on the molecule. +**Duplicated** (`/`): multiple redundant copies of the same barcode in the molecule. Each copy is corrected independently against the whitelist; a majority vote determines the final barcode. +For example: ``` -Barcode/1:Barcode/2:UMI:PolyT:cDNA +PCR_PRIMER:Barcode/1:UMI:PolyT:cDNA:Barcode/2:TSO +PCR_PRIMER TACACGACGCTCTTCCGATCT Barcode/1 VAR_FILE barcodes.tsv 16 Barcode/2 VAR_FILE barcodes.tsv 16 UMI VAR_ANY 12 +TSO CONST CCCATGTACTCTGCGTTGATACCACTGCTT ``` ## UMI deduplication All single-cell and spatial modes perform UMI-based PCR deduplication after isoform assignment. -Within each cell barcode and gene, reads with similar UMIs (within edit distance 3 by default) +Within each cell barcode and gene, reads with similar UMIs (similarity criteria depends on the UMI length) are collapsed into a single representative read. The representative read is selected based on: @@ -228,17 +243,13 @@ Grouped counts can also be converted after the run using `src/convert_grouped_co Use `--read_group` to control how reads are grouped for quantification. Multiple grouping strategies can be combined (space-separated), producing separate count tables for each. -Common configurations for single-cell/spatial data: +The most common use-cases for single-cell/spatial data are grouping by barcode property (cell type, spot): -```bash -# Group by barcode (from BAM CB tag) ---read_group tag:CB +`--read_group barcode_spot` (requires `--barcode2spot`) -# Group by barcode and also by cell type ---read_group tag:CB barcode_spot +or grouping by individual barcode (not recommended for datasets with many barcodes): + +`--read_group barcode file_name` -# Group by barcode from barcoded_reads file and by file name ---read_group barcode file_name -``` See the [read grouping options](cmd.md#other-input-options) for the full list of grouping strategies. diff --git a/isoquant.py b/isoquant.py index c7d39e90..f710b8b0 100755 --- a/isoquant.py +++ b/isoquant.py @@ -46,7 +46,7 @@ from src.input_data_storage import InputDataStorage, InputDataType from src.multimap_resolver import MultimapResolvingStrategy from src.stats import combine_counts -from detect_barcodes import process_single_thread, process_in_parallel, get_umi_length +from isoquant_detect_barcodes import process_single_thread, process_in_parallel, get_umi_length logger = logging.getLogger('IsoQuant') @@ -167,13 +167,9 @@ def add_hidden_option(*args, **kwargs): # show command only with --full-help help="IsoQuant modes: " + ", ".join(ISOQUANT_MODES) + "; default:%s" % IsoQuantMode.bulk.name, default=IsoQuantMode.bulk.name) add_additional_option_to_group(sc_args_group, '--barcode_whitelist', type=str, nargs='+', - help='file(s) with barcode whitelist for barcode calling') + help='file(s) with barcode whitelist(s) for barcode calling') add_additional_option_to_group(sc_args_group, "--barcoded_reads", type=str, nargs='+', help='TSV file(s) with barcoded reads; barcodes will be called automatically if not provided') - # TODO: add UMI column, support various formats - add_additional_option_to_group(sc_args_group, "--barcode_column", type=str, - help='column with barcodes in barcoded_reads file, default=1; read id column is 0', - default=1) add_additional_option_to_group(sc_args_group, "--barcode2spot", type=str, help='TSV file mapping barcode to cell type / spot id. ' 'Format: file.tsv or file.tsv:barcode_col:spot_cols ' diff --git a/detect_barcodes.py b/isoquant_detect_barcodes.py similarity index 99% rename from detect_barcodes.py rename to isoquant_detect_barcodes.py index 2a8f88fe..ce031c4a 100755 --- a/detect_barcodes.py +++ b/isoquant_detect_barcodes.py @@ -469,8 +469,7 @@ def add_hidden_option(*args, **kwargs): # show command only with --full-help parser.add_argument("--output", "-o", type=str, help="output prefix name", required=True) parser.add_argument("--barcodes", "-b", nargs='+', type=str, help="barcode whitelist(s)", required=False) # parser.add_argument("--umi", "-u", type=str, help="potential UMIs, detected de novo if not set") - parser.add_argument("--mode", type=str, help="mode to be used", choices=[x.name for x in BARCODE_CALLING_MODES.keys()], - default=IsoQuantMode.stereoseq.name) + parser.add_argument("--mode", type=str, help="mode to be used", choices=[x.name for x in BARCODE_CALLING_MODES.keys()]) parser.add_argument("--molecule", type=str, help="MDF files with molecule description (for custom_sc mode only)") parser.add_argument("--input", "-i", nargs='+', type=str, help="input reads in [gzipped] FASTA, FASTQ, BAM, SAM", diff --git a/misc/assess_barcode_quality.py b/misc/assess_barcode_quality.py index 5db838a0..66a537f5 100644 --- a/misc/assess_barcode_quality.py +++ b/misc/assess_barcode_quality.py @@ -384,7 +384,7 @@ def parse_args(): choices=['tenX_v3', 'visium_hd', 'curio', 'stereo', 'stereo_split'], help='Barcode calling mode') parser.add_argument('--input', '-i', required=True, - help='Input barcode TSV file from detect_barcodes.py') + help='Input barcode TSV file from isoquant_detect_barcodes.py') parser.add_argument('--output', '-o', help='Output metrics TSV file (default: stdout)') parser.add_argument('--barcode_col', type=int, default=1, diff --git a/tests/github/barcode_test_templates/stereo_single.cfg b/tests/github/barcode_test_templates/stereo_single.cfg index dd2b4b42..b618bf1b 100644 --- a/tests/github/barcode_test_templates/stereo_single.cfg +++ b/tests/github/barcode_test_templates/stereo_single.cfg @@ -12,13 +12,13 @@ run_type single reads /path/to/simulated_reads.fq.gz barcodes /path/to/whitelist.txt -# Barcode calling mode (for detect_barcodes.py) +# Barcode calling mode (for isoquant_detect_barcodes.py) mode stereoseq # Optional: number of threads threads 8 -# Optional: additional detect_barcodes.py options +# Optional: additional isoquant_detect_barcodes.py options # detect_options "--min_score 21" # Quality assessment mode (for assess_barcode_quality.py) diff --git a/tests/github/barcode_test_templates/stereo_split.cfg b/tests/github/barcode_test_templates/stereo_split.cfg index 5d3061cb..38959b69 100644 --- a/tests/github/barcode_test_templates/stereo_split.cfg +++ b/tests/github/barcode_test_templates/stereo_split.cfg @@ -18,7 +18,7 @@ mode stereoseq_split # Optional: number of threads threads 8 -# Optional: additional detect_barcodes.py options +# Optional: additional isoquant_detect_barcodes.py options # detect_options "" # Quality assessment mode diff --git a/tests/github/run_barcode_test.py b/tests/github/run_barcode_test.py index 6a49b845..03c0c9cf 100644 --- a/tests/github/run_barcode_test.py +++ b/tests/github/run_barcode_test.py @@ -8,7 +8,7 @@ """ Test runner for barcode calling quality assessment. -Takes a configuration file as input, runs detect_barcodes.py on simulated data, +Takes a configuration file as input, runs isoquant_detect_barcodes.py on simulated data, then runs quality assessment and compares results against etalon values. Usage: @@ -91,7 +91,7 @@ def parse_args(): parser.add_argument("--output", "-o", type=str, help="Output directory (overrides config)") parser.add_argument("--additional_options", "-a", type=str, - help="Additional options for detect_barcodes.py") + help="Additional options for isoquant_detect_barcodes.py") parser.add_argument("--debug", "-d", action="store_true", help="Enable debug output") @@ -99,13 +99,13 @@ def parse_args(): def run_detect_barcodes(args, config_dict): - """Run detect_barcodes.py with configuration options.""" + """Run isoquant_detect_barcodes.py with configuration options.""" source_dir = os.path.dirname(os.path.realpath(__file__)) isoquant_dir = os.path.join(source_dir, "../../") config_file = args.config_file run_name = config_dict["name"] - # detect_barcodes.py uses -o as a file prefix, not a directory + # isoquant_detect_barcodes.py uses -o as a file prefix, not a directory # Output will be: {output_folder}/{run_name}.barcoded_reads.tsv output_folder = args.output if args.output else config_dict["output"] output_prefix = os.path.join(output_folder, run_name) @@ -113,11 +113,11 @@ def run_detect_barcodes(args, config_dict): if not os.path.exists(output_folder): os.makedirs(output_folder) - log.info('== Running detect_barcodes.py ==') + log.info('== Running isoquant_detect_barcodes.py ==') # Build command detect_command = [ - "python3", os.path.join(isoquant_dir, "detect_barcodes.py"), + "python3", os.path.join(isoquant_dir, "isoquant_detect_barcodes.py"), "-o", output_prefix, "--mode", config_dict["mode"] ] @@ -156,7 +156,7 @@ def run_detect_barcodes(args, config_dict): result = subprocess.run(detect_command) if result.returncode != 0: - log.error("detect_barcodes.py exited with non-zero status: %d" % result.returncode) + log.error("isoquant_detect_barcodes.py exited with non-zero status: %d" % result.returncode) return -11 return 0 @@ -173,7 +173,7 @@ def run_barcode_quality(args, config_dict): output_prefix = os.path.join(output_folder, run_name) # Find barcode output file - # detect_barcodes.py outputs: {prefix}.barcoded_reads.tsv + # isoquant_detect_barcodes.py outputs: {prefix}.barcoded_reads.tsv barcode_output = output_prefix + ".barcoded_reads.tsv" if not os.path.exists(barcode_output): barcode_output = output_prefix + ".barcoded_reads.tsv.gz" From 03697081dcc33f3f0e90ec98db6379a3891a6a76 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Tue, 3 Feb 2026 15:35:11 +0100 Subject: [PATCH 24/41] remove min_score, add params for speeding up barcode callers for large barcode sets --- docs/barcode_calling.md | 6 -- isoquant.py | 1 - isoquant_detect_barcodes.py | 8 +- src/barcode_calling/callers/curio.py | 81 +++++++++---------- src/barcode_calling/callers/stereo.py | 34 ++++---- src/barcode_calling/callers/tenx.py | 58 ++++++------- .../barcode_test_templates/stereo_single.cfg | 2 +- tests/test_barcode_callers.py | 4 +- tests/test_barcode_detectors.py | 34 ++++---- 9 files changed, 103 insertions(+), 125 deletions(-) diff --git a/docs/barcode_calling.md b/docs/barcode_calling.md index 13e83432..e0d64884 100644 --- a/docs/barcode_calling.md +++ b/docs/barcode_calling.md @@ -86,12 +86,6 @@ See [MDF format](single_cell.md#molecule-definition-file-mdf-format) for the for Number of threads for parallel processing (default: 16). -`--min_score` - -Override the minimum alignment score for barcode matching. -The scoring system uses +1 for match, -1 for mismatch, -1 for gap open, -1 for gap extension. -By default, the minimum score is set automatically based on barcode length. - `--tmp_dir` Folder for temporary files during parallel processing. diff --git a/isoquant.py b/isoquant.py index f710b8b0..60aebc22 100755 --- a/isoquant.py +++ b/isoquant.py @@ -974,7 +974,6 @@ def __init__(self, input, barcode_whitelist, mode, output, out_fasta, tmp_dir, t self.out_fasta = out_fasta # Can be a single filename (str), list of filenames, or None self.tmp_dir = tmp_dir self.threads = threads - self.min_score = None self.molecule = molecule diff --git a/isoquant_detect_barcodes.py b/isoquant_detect_barcodes.py index ce031c4a..74ec47bb 100755 --- a/isoquant_detect_barcodes.py +++ b/isoquant_detect_barcodes.py @@ -263,7 +263,7 @@ def process_chunk(barcode_detector, read_chunk, output_file, num, out_fasta=None def create_barcode_caller(args): - logger.info("Creating barcode detector for mode %s" % args.mode.name + (", min_score=%d" % args.min_score if args.min_score else "")) + logger.info("Creating barcode detector for mode %s" % args.mode.name) if args.mode == IsoQuantMode.custom_sc: if not args.molecule: @@ -272,8 +272,6 @@ def create_barcode_caller(args): if not os.path.isfile(args.molecule): logger.critical("Molecule file %s does not exist" % args.molecule) sys.exit(IsoQuantExitCode.INPUT_FILE_NOT_FOUND) - if args.min_score: - logger.warning("Custom single-cell/spatial mode does not support --min-score, ignoring") return BARCODE_CALLING_MODES[args.mode](MoleculeStructure(open(args.molecule))) @@ -299,8 +297,6 @@ def create_barcode_caller(args): barcodes = tuple(barcodes) barcode_detector = BARCODE_CALLING_MODES[args.mode](barcodes) - if args.min_score: - barcode_detector.min_score = args.min_score return barcode_detector @@ -476,8 +472,6 @@ def add_hidden_option(*args, **kwargs): # show command only with --full-help required=True) parser.add_argument("--threads", "-t", type=int, help="threads to use (16)", default=16) parser.add_argument("--tmp_dir", type=str, help="folder for temporary files") - parser.add_argument("--min_score", type=int, help="minimal barcode score " - "(scoring system is +1, -1, -1, -1)") add_hidden_option('--debug', action='store_true', default=False, help='Debug log output.') args = parser.parse_args(sys_argv) diff --git a/src/barcode_calling/callers/curio.py b/src/barcode_calling/callers/curio.py index edc2542c..df2aa3ca 100644 --- a/src/barcode_calling/callers/curio.py +++ b/src/barcode_calling/callers/curio.py @@ -13,10 +13,11 @@ import logging from typing import List, Optional +from ..indexers import Array2BitKmerIndexer from ..indexers import KmerIndexer, ArrayKmerIndexer from ..common import ( find_polyt_start, reverese_complement, - find_candidate_with_max_score_ssw, detect_exact_positions, + find_candidate_with_max_score_ssw, detect_exact_positions, str_to_2bit, ) from .base import LinkerBarcodeDetectionResult @@ -41,8 +42,7 @@ class CurioBarcodeDetector: TERMINAL_MATCH_DELTA = 2 STRICT_TERMINAL_MATCH_DELTA = 1 - def __init__(self, barcode_list: List[str], umi_list: Optional[List[str]] = None, - min_score: int = 13): + def __init__(self, barcode_list: List[str]): """ Initialize Curio barcode detector. @@ -61,13 +61,15 @@ def __init__(self, barcode_list: List[str], umi_list: Optional[List[str]] = None joint_barcode_list.append(b1 + b2) else: joint_barcode_list = barcode_list - self.barcode_indexer = ArrayKmerIndexer(joint_barcode_list, kmer_size=6) - self.umi_set = None - if umi_list: - self.umi_set = set(umi_list) - logger.debug("Loaded %d UMIs" % len(umi_list)) - self.umi_indexer = KmerIndexer(umi_list, kmer_size=5) - self.min_score = min_score + if len(joint_barcode_list) <= 100000: + self.barcode_indexer = ArrayKmerIndexer(joint_barcode_list, kmer_size=6) + self.max_barcodes_hits = 20 + self.min_matching_kmers = 1 + else: + self.barcode_indexer = Array2BitKmerIndexer([str_to_2bit(b) for b in joint_barcode_list], kmer_size=6, seq_len=self.BC_LENGTH) + self.max_barcodes_hits = 10 + self.min_matching_kmers = 2 + self.min_score = 13 def find_barcode_umi(self, read_id: str, sequence: str) -> LinkerBarcodeDetectionResult: """ @@ -149,7 +151,9 @@ def _find_barcode_umi_fwd(self, read_id: str, sequence: str) -> LinkerBarcodeDet barcode_end = linker_end + self.RIGHT_BC_LENGTH + 1 potential_barcode = sequence[barcode_start:linker_start] + sequence[linker_end + 1:barcode_end + 1] logger.debug("Barcode: %s" % (potential_barcode)) - matching_barcodes = self.barcode_indexer.get_occurrences(potential_barcode) + matching_barcodes = self.barcode_indexer.get_occurrences(potential_barcode, + max_hits=self.max_barcodes_hits, + min_kmers=self.min_matching_kmers) barcode, bc_score, bc_start, bc_end = \ find_candidate_with_max_score_ssw(matching_barcodes, potential_barcode, min_score=self.min_score) @@ -167,19 +171,11 @@ def _find_barcode_umi_fwd(self, read_id: str, sequence: str) -> LinkerBarcodeDet potential_umi = sequence[potential_umi_start:potential_umi_end + 1] logger.debug("Potential UMI: %s" % potential_umi) - umi = None good_umi = False - if self.umi_set: - matching_umis = self.umi_indexer.get_occurrences(potential_umi) - umi, umi_score, umi_start, umi_end = \ - find_candidate_with_max_score_ssw(matching_umis, potential_umi, min_score=7) - logger.debug("Found UMI %s %d-%d" % (umi, umi_start, umi_end)) - - if not umi: - umi = potential_umi - if self.UMI_LEN - self.UMI_LEN_DELTA <= len(umi) <= self.UMI_LEN + self.UMI_LEN_DELTA and \ - all(x != "T" for x in umi[-self.NON_T_UMI_BASES:]): - good_umi = True + umi = potential_umi + if self.UMI_LEN - self.UMI_LEN_DELTA <= len(umi) <= self.UMI_LEN + self.UMI_LEN_DELTA and \ + all(x != "T" for x in umi[-self.NON_T_UMI_BASES:]): + good_umi = True if not umi: return LinkerBarcodeDetectionResult(read_id, barcode, BC_score=bc_score, @@ -219,8 +215,7 @@ class CurioIlluminaDetector: TERMINAL_MATCH_DELTA = 1 STRICT_TERMINAL_MATCH_DELTA = 0 - def __init__(self, joint_barcode_list: List[str], umi_list: Optional[List[str]] = None, - min_score: int = 14): + def __init__(self, joint_barcode_list: List[str]): """ Initialize Illumina Curio detector. @@ -231,13 +226,15 @@ def __init__(self, joint_barcode_list: List[str], umi_list: Optional[List[str]] """ self.pcr_primer_indexer = KmerIndexer([CurioBarcodeDetector.PCR_PRIMER], kmer_size=6) self.linker_indexer = KmerIndexer([CurioBarcodeDetector.LINKER], kmer_size=5) - self.barcode_indexer = KmerIndexer(joint_barcode_list, kmer_size=5) - self.umi_set = None - if umi_list: - self.umi_set = set(umi_list) - logger.debug("Loaded %d UMIs" % len(umi_list)) - self.umi_indexer = KmerIndexer(umi_list, kmer_size=5) - self.min_score = min_score + if len(joint_barcode_list) <= 100000: + self.barcode_indexer = ArrayKmerIndexer(joint_barcode_list, kmer_size=5) + self.max_barcodes_hits = 20 + self.min_matching_kmers = 2 + else: + self.barcode_indexer = Array2BitKmerIndexer([str_to_2bit(b) for b in joint_barcode_list], kmer_size=5, seq_len=self.BC_LENGTH) + self.max_barcodes_hits = 10 + self.min_matching_kmers = 2 + self.min_score = 14 def find_barcode_umi(self, read_id: str, sequence: str) -> LinkerBarcodeDetectionResult: """Detect barcode and UMI.""" @@ -289,7 +286,9 @@ def _find_barcode_umi_fwd(self, read_id: str, sequence: str) -> LinkerBarcodeDet logger.debug("Barcode: %s" % (potential_barcode)) if len(potential_barcode) < self.MIN_BC_LEN: return LinkerBarcodeDetectionResult(read_id, linker_start=linker_start, linker_end=linker_end) - matching_barcodes = self.barcode_indexer.get_occurrences(potential_barcode) + matching_barcodes = self.barcode_indexer.get_occurrences(potential_barcode, + max_hits=self.max_barcodes_hits, + min_kmers=self.min_matching_kmers) barcode, bc_score, bc_start, bc_end = \ find_candidate_with_max_score_ssw(matching_barcodes, potential_barcode, min_score=len(potential_barcode) - 1, score_diff=self.SCORE_DIFF) @@ -308,19 +307,11 @@ def _find_barcode_umi_fwd(self, read_id: str, sequence: str) -> LinkerBarcodeDet potential_umi = sequence[potential_umi_start:min(potential_umi_end + 1, len(sequence))] logger.debug("Potential UMI: %s" % potential_umi) - umi = None good_umi = False - if self.umi_set: - matching_umis = self.umi_indexer.get_occurrences(potential_umi) - umi, umi_score, umi_start, umi_end = \ - find_candidate_with_max_score_ssw(matching_umis, potential_umi, min_score=7) - logger.debug("Found UMI %s %d-%d" % (umi, umi_start, umi_end)) - - if not umi: - umi = potential_umi - if self.UMI_LEN - self.UMI_LEN_DELTA <= len(umi) <= self.UMI_LEN + self.UMI_LEN_DELTA and \ - all(x != "T" for x in umi[-self.NON_T_UMI_BASES:]): - good_umi = True + umi = potential_umi + if self.UMI_LEN - self.UMI_LEN_DELTA <= len(umi) <= self.UMI_LEN + self.UMI_LEN_DELTA and \ + all(x != "T" for x in umi[-self.NON_T_UMI_BASES:]): + good_umi = True if not umi: return LinkerBarcodeDetectionResult(read_id, barcode, BC_score=bc_score, diff --git a/src/barcode_calling/callers/stereo.py b/src/barcode_calling/callers/stereo.py index 0d2a42c4..58a65b96 100644 --- a/src/barcode_calling/callers/stereo.py +++ b/src/barcode_calling/callers/stereo.py @@ -36,13 +36,12 @@ class StereoBarcodeDetector: TERMINAL_MATCH_DELTA = 3 STRICT_TERMINAL_MATCH_DELTA = 1 - def __init__(self, barcodes: List[str], min_score: int = 21): + def __init__(self, barcodes: List[str]): """ Initialize Stereo-seq detector. Args: barcodes: List of known barcode sequences - min_score: Minimum alignment score for barcode matching """ self.main_primer = StereoBarcodeDetector.PC1_PRIMER self.pcr_primer_indexer = ArrayKmerIndexer([self.main_primer], kmer_size=6) @@ -55,8 +54,7 @@ def __init__(self, barcodes: List[str], min_score: int = 21): self.barcode_indexer = Dict2BitKmerIndexer(bit_barcodes, kmer_size=14, seq_len=self.BC_LENGTH) logger.info("Indexed %d barcodes" % len(bit_barcodes)) - self.umi_set = None - self.min_score = min_score + self.min_score = 21 def find_barcode_umi_multiple(self, read_id: str, sequence: str) -> List[LinkerBarcodeDetectionResult]: """Find multiple barcodes in a single read.""" @@ -213,8 +211,8 @@ class SharedMemoryStereoBarcodeDetector(StereoBarcodeDetector): MIN_BARCODES_FOR_SHARED_MEM = 1000000 - def __init__(self, barcodes: List[str], min_score: int = 21): - super().__init__([], min_score=min_score) + def __init__(self, barcodes: List[str]): + super().__init__([]) # Convert to 2-bit encoding first (memory efficient) bit_barcodes = batch_str_to_2bit_chunked(iter(barcodes), seq_len=self.BC_LENGTH) self.barcode_count = len(bit_barcodes) @@ -242,7 +240,7 @@ def __getstate__(self): def __setstate__(self, state): self.min_score = state[0] - super().__init__([], min_score=self.min_score) + super().__init__([]) self.bit_barcodes = None self.barcode_count = state[1] if self.barcode_count < self.MIN_BARCODES_FOR_SHARED_MEM: @@ -266,13 +264,12 @@ class StereoSplittingBarcodeDetector: TERMINAL_MATCH_DELTA = 3 STRICT_TERMINAL_MATCH_DELTA = 1 - def __init__(self, barcodes: List[str], min_score: int = 21): + def __init__(self, barcodes: List[str]): """ Initialize splitting detector. Args: barcodes: List of known barcode sequences - min_score: Minimum alignment score """ self.main_primer = self.PC1_PRIMER self.tso5_indexer = ArrayKmerIndexer([self.TSO5], kmer_size=8) @@ -285,8 +282,7 @@ def __init__(self, barcodes: List[str], min_score: int = 21): bit_barcodes = batch_str_to_2bit_chunked(iter(barcodes), seq_len=self.BC_LENGTH) self.barcode_indexer = Dict2BitKmerIndexer(bit_barcodes, kmer_size=14, seq_len=self.BC_LENGTH) logger.info("Indexed %d barcodes" % len(bit_barcodes)) - self.umi_set = None - self.min_score = min_score + self.min_score = 21 def find_barcode_umi(self, read_id: str, sequence: str) -> SplittingBarcodeDetectionResult: """Find multiple barcodes in a concatenated read.""" @@ -504,8 +500,8 @@ class SharedMemoryStereoSplittingBarcodeDetector(StereoSplittingBarcodeDetector) MIN_BARCODES_FOR_SHARED_MEM = 1000000 - def __init__(self, barcodes: List[str], min_score: int = 21): - super().__init__([], min_score=min_score) + def __init__(self, barcodes: List[str]): + super().__init__([]) # Convert to 2-bit encoding first (memory efficient) bit_barcodes = batch_str_to_2bit_chunked(iter(barcodes), seq_len=self.BC_LENGTH) self.barcode_count = len(bit_barcodes) @@ -533,7 +529,7 @@ def __getstate__(self): def __setstate__(self, state): self.min_score = state[0] - super().__init__([], min_score=self.min_score) + super().__init__([]) self.bit_barcodes = None self.barcode_count = state[1] if self.barcode_count < self.MIN_BARCODES_FOR_SHARED_MEM: @@ -546,18 +542,17 @@ def __setstate__(self, state): class SharedMemoryWrapper: """Generic wrapper to add shared memory support to any barcode detector.""" - def __init__(self, barcode_detector_class, barcodes: List[str], min_score: int = 21): + def __init__(self, barcode_detector_class, barcodes: List[str]): """ Initialize wrapper. Args: barcode_detector_class: Detector class to wrap barcodes: List of known barcodes - min_score: Minimum alignment score """ self.barcode_detector_class = barcode_detector_class - self.min_score = min_score - self.barcode_detector = self.barcode_detector_class([], self.min_score) + self.barcode_detector = self.barcode_detector_class([]) + self.min_score = self.barcode_detector.min_score # Convert to 2-bit and use shared memory indexer barcode_iter = iter(barcodes) if isinstance(barcodes, list) else barcodes bit_barcodes = batch_str_to_2bit_chunked(barcode_iter, seq_len=self.barcode_detector_class.BC_LENGTH) @@ -574,5 +569,6 @@ def __getstate__(self): def __setstate__(self, state): self.barcode_detector_class = state[0] self.min_score = state[1] - self.barcode_detector = self.barcode_detector_class([], self.min_score) + self.barcode_detector = self.barcode_detector_class([]) + self.barcode_detector.min_score = self.min_score self.barcode_detector.barcode_indexer = SharedMemoryArray2BitKmerIndexer.from_sharable_info(state[2]) diff --git a/src/barcode_calling/callers/tenx.py b/src/barcode_calling/callers/tenx.py index afe74469..5732466d 100644 --- a/src/barcode_calling/callers/tenx.py +++ b/src/barcode_calling/callers/tenx.py @@ -13,11 +13,12 @@ import logging from typing import List, Optional +from .. import ArrayKmerIndexer, Array2BitKmerIndexer from ..indexers import KmerIndexer from ..common import ( find_polyt_start, reverese_complement, find_candidate_with_max_score_ssw, find_candidate_with_max_score_ssw_var_len, - detect_exact_positions, + detect_exact_positions, str_to_2bit, ) from .base import TenXBarcodeDetectionResult @@ -36,7 +37,7 @@ class TenXBarcodeDetector: TERMINAL_MATCH_DELTA = 2 STRICT_TERMINAL_MATCH_DELTA = 1 - def __init__(self, barcode_list: List[str], umi_list: Optional[List[str]] = None): + def __init__(self, barcode_list: List[str]): """ Initialize 10x detector. @@ -45,15 +46,19 @@ def __init__(self, barcode_list: List[str], umi_list: Optional[List[str]] = None umi_list: Optional list of known UMIs for validation """ self.r1_indexer = KmerIndexer([TenXBarcodeDetector.R1], kmer_size=7) - self.barcode_indexer = KmerIndexer(barcode_list, kmer_size=6) - self.umi_set = None - if umi_list: - self.umi_set = set(umi_list) - logger.debug("Loaded %d UMIs" % len(umi_list)) - self.umi_indexer = KmerIndexer(umi_list, kmer_size=5) - self.min_score = 14 - if len(self.barcode_indexer.seq_list) > 100000: - self.min_score = 16 + if len(barcode_list) <= 100000: + self.barcode_indexer = ArrayKmerIndexer(barcode_list, kmer_size=6) + self.max_barcodes_hits = 20 + self.min_matching_kmers = 1 + self.min_score = 14 + self.score_diff = 0 + else: + self.barcode_indexer = Array2BitKmerIndexer([str_to_2bit(b) for b in barcode_list], kmer_size=6, seq_len=self.BARCODE_LEN_10X) + self.max_barcodes_hits = 10 + self.min_matching_kmers = 2 + self.min_score = 15 + self.score_diff = 1 + logger.debug("Min score set to %d" % self.min_score) def find_barcode_umi(self, read_id: str, sequence: str) -> TenXBarcodeDetectionResult: @@ -121,9 +126,13 @@ def _find_barcode_umi_fwd(self, read_id: str, sequence: str) -> TenXBarcodeDetec barcode_end = r1_end + self.BARCODE_LEN_10X + 1 potential_barcode = sequence[barcode_start:barcode_end + 1] logger.debug("Barcode: %s" % (potential_barcode)) - matching_barcodes = self.barcode_indexer.get_occurrences(potential_barcode) + matching_barcodes = self.barcode_indexer.get_occurrences(potential_barcode, + max_hits=self.max_barcodes_hits, + min_kmers=self.min_matching_kmers) barcode, bc_score, bc_start, bc_end = \ - find_candidate_with_max_score_ssw(matching_barcodes, potential_barcode, min_score=self.min_score) + find_candidate_with_max_score_ssw(matching_barcodes, potential_barcode, + min_score=self.min_score, + score_diff=self.score_diff) if barcode is None: return TenXBarcodeDetectionResult(read_id, polyT=polyt_start, r1=r1_end) @@ -137,18 +146,10 @@ def _find_barcode_umi_fwd(self, read_id: str, sequence: str) -> TenXBarcodeDetec potential_umi = sequence[potential_umi_start:potential_umi_end + 1] logger.debug("Potential UMI: %s" % potential_umi) - umi = None good_umi = False - if self.umi_set: - matching_umis = self.umi_indexer.get_occurrences(potential_umi) - umi, umi_score, umi_start, umi_end = \ - find_candidate_with_max_score_ssw(matching_umis, potential_umi, min_score=7) - logger.debug("Found UMI %s %d-%d" % (umi, umi_start, umi_end)) - - if not umi: - umi = potential_umi - if self.UMI_LEN - self.UMI_LEN_DELTA <= len(umi) <= self.UMI_LEN + self.UMI_LEN_DELTA: - good_umi = True + umi = potential_umi + if self.UMI_LEN - self.UMI_LEN_DELTA <= len(umi) <= self.UMI_LEN + self.UMI_LEN_DELTA: + good_umi = True if not umi: return TenXBarcodeDetectionResult(read_id, barcode, BC_score=bc_score, polyT=polyt_start, r1=r1_end) @@ -205,10 +206,13 @@ def __init__(self, barcode_pair_list: List[List[str]]): self.r1_indexer = KmerIndexer([VisiumHDBarcodeDetector.R1], kmer_size=7) self.part1_list = barcode_pair_list[0] self.part2_list = barcode_pair_list[1] - self.part1_barcode_indexer = KmerIndexer(self.part1_list, kmer_size=7) - self.part2_barcode_indexer = KmerIndexer(self.part2_list, kmer_size=7) - self.umi_set = None + + self.part1_barcode_indexer = ArrayKmerIndexer(self.part1_list, kmer_size=7) + self.part2_barcode_indexer = ArrayKmerIndexer(self.part2_list, kmer_size=7) + self.max_barcodes_hits = 20 + self.min_matching_kmers = 2 self.min_score = 13 + logger.debug("Min score set to %d" % self.min_score) def find_barcode_umi(self, read_id: str, sequence: str) -> TenXBarcodeDetectionResult: diff --git a/tests/github/barcode_test_templates/stereo_single.cfg b/tests/github/barcode_test_templates/stereo_single.cfg index b618bf1b..9a9ae74e 100644 --- a/tests/github/barcode_test_templates/stereo_single.cfg +++ b/tests/github/barcode_test_templates/stereo_single.cfg @@ -19,7 +19,7 @@ mode stereoseq threads 8 # Optional: additional isoquant_detect_barcodes.py options -# detect_options "--min_score 21" +# detect_options "" # Quality assessment mode (for assess_barcode_quality.py) # If not specified, derived from 'mode' diff --git a/tests/test_barcode_callers.py b/tests/test_barcode_callers.py index 1cf3f7f2..e5fdc5eb 100644 --- a/tests/test_barcode_callers.py +++ b/tests/test_barcode_callers.py @@ -100,8 +100,8 @@ def test_str_format(self): def test_header(self): """Test TSV header.""" - # header() is now an instance method, use _header() for static access - header = BarcodeDetectionResult._header() + # header() is a static method + header = BarcodeDetectionResult.header() assert "#read_id" in header assert "barcode" in header assert "UMI" in header diff --git a/tests/test_barcode_detectors.py b/tests/test_barcode_detectors.py index fc09bba3..a08b7754 100644 --- a/tests/test_barcode_detectors.py +++ b/tests/test_barcode_detectors.py @@ -93,18 +93,18 @@ class TestStereoBarcodeDetector: def test_init(self): """Test basic initialization.""" - detector = StereoBarcodeDetector(DUMMY_BARCODES_25, min_score=21) + detector = StereoBarcodeDetector(DUMMY_BARCODES_25) assert detector.barcode_indexer is not None assert detector.min_score == 21 def test_init_empty_barcodes(self): """Test initialization with empty barcode list.""" - detector = StereoBarcodeDetector([], min_score=21) + detector = StereoBarcodeDetector([]) assert detector.barcode_indexer is None def test_no_match_random_sequence(self): """Test that random sequence returns no valid barcode.""" - detector = StereoBarcodeDetector(DUMMY_BARCODES_25, min_score=21) + detector = StereoBarcodeDetector(DUMMY_BARCODES_25) result = detector.find_barcode_umi("test_read", RANDOM_SEQUENCE) assert isinstance(result, (LinkerBarcodeDetectionResult, TSOBarcodeDetectionResult)) @@ -141,7 +141,7 @@ def test_real_sequence(self, read_sequence, true_barcode, true_umi): "CATCAAAGGTTCACCGAATCTCCAC", "GTCGCCTTCAACGTATACTGATATA", "ACCAATTAGCAGCCATGGGAACTCC"] - detector = StereoBarcodeDetector(barcodes, min_score=21) + detector = StereoBarcodeDetector(barcodes) result = detector.find_barcode_umi("test_read", read_sequence) assert isinstance(result, (LinkerBarcodeDetectionResult, TSOBarcodeDetectionResult)) @@ -196,7 +196,7 @@ def test_real_sequence(self, read_sequence, true_barcodes, true_umis): "GGGCACATTCCTCAGAAAGCGAAAA", "GGCAGGCCATTAGTTACTCGTCGAG", "CTCCCCTAATCGTTTGTCGGTGGAC"] - detector = StereoSplittingBarcodeDetector(barcodes, min_score=21) + detector = StereoSplittingBarcodeDetector(barcodes) result = detector.find_barcode_umi("test_read", read_sequence) assert isinstance(result, SplittingBarcodeDetectionResult) @@ -214,13 +214,13 @@ class TestSharedMemStereoBarcodeDetector: def test_init(self): """Test basic initialization.""" - detector = SharedMemoryStereoBarcodeDetector(DUMMY_BARCODES_25, min_score=21) + detector = SharedMemoryStereoBarcodeDetector(DUMMY_BARCODES_25) assert detector.barcode_indexer is not None assert detector.min_score == 21 def test_no_match_random_sequence(self): """Test that random sequence returns no valid barcode.""" - detector = SharedMemoryStereoBarcodeDetector(DUMMY_BARCODES_25, min_score=21) + detector = SharedMemoryStereoBarcodeDetector(DUMMY_BARCODES_25) result = detector.find_barcode_umi("test_read", RANDOM_SEQUENCE) assert isinstance(result, (LinkerBarcodeDetectionResult, TSOBarcodeDetectionResult)) @@ -266,7 +266,7 @@ def test_real_sequence(self, read_sequence, true_barcode, true_umi): "CATCAAAGGTTCACCGAATCTCCAC", "GTCGCCTTCAACGTATACTGATATA", "ACCAATTAGCAGCCATGGGAACTCC"] - detector = SharedMemoryStereoBarcodeDetector(barcodes, min_score=21) + detector = SharedMemoryStereoBarcodeDetector(barcodes) result = detector.find_barcode_umi("test_read", read_sequence) assert isinstance(result, (LinkerBarcodeDetectionResult, TSOBarcodeDetectionResult)) @@ -289,7 +289,7 @@ def test_real_sequence_no_bc(self, read_sequence): "CATCAAAGGTTCACCGAATCTCCAC", "GTCGCCTTCAACGTATACTGATATA", "ACCAATTAGCAGCCATGGGAACTCC"] - detector = SharedMemoryStereoBarcodeDetector(barcodes, min_score=21) + detector = SharedMemoryStereoBarcodeDetector(barcodes) result = detector.find_barcode_umi("test_read", read_sequence) assert isinstance(result, (LinkerBarcodeDetectionResult, TSOBarcodeDetectionResult)) @@ -338,7 +338,7 @@ def test_real_sequence(self, read_sequence, true_barcodes, true_umis): "GGGCACATTCCTCAGAAAGCGAAAA", "GGCAGGCCATTAGTTACTCGTCGAG", "CTCCCCTAATCGTTTGTCGGTGGAC"] - detector = SharedMemoryStereoSplittingBarcodeDetector(barcodes, min_score=21) + detector = SharedMemoryStereoSplittingBarcodeDetector(barcodes) result = detector.find_barcode_umi("test_read", read_sequence) assert isinstance(result, SplittingBarcodeDetectionResult) @@ -370,7 +370,7 @@ def test_real_sequence_no_bc(self, read_sequence): "GGGCACATTCCTCAGAAAGCGAAAA", "GGCAGGCCATTAGTTACTCGTCGAG", "CTCCCCTAATCGTTTGTCGGTGGAC"] - detector = SharedMemoryStereoSplittingBarcodeDetector(barcodes, min_score=21) + detector = SharedMemoryStereoSplittingBarcodeDetector(barcodes) result = detector.find_barcode_umi("test_read", read_sequence) assert isinstance(result, SplittingBarcodeDetectionResult) @@ -399,7 +399,7 @@ def test_init_large_whitelist(self): """Test that min_score increases for large whitelists.""" large_barcodes = [f"ACTG{i:012d}"[:16] for i in range(100001)] detector = TenXBarcodeDetector(large_barcodes) - assert detector.min_score == 16 + assert detector.min_score == 15 def test_no_match_random_sequence(self): """Test that random sequence returns no valid barcode.""" @@ -567,20 +567,20 @@ class TestCurioBarcodeDetector: def test_init_joint_barcodes(self): """Test initialization with joint barcode list.""" - detector = CurioBarcodeDetector(DUMMY_BARCODES_14, min_score=13) + detector = CurioBarcodeDetector(DUMMY_BARCODES_14) assert detector.barcode_indexer is not None assert detector.min_score == 13 def test_init_split_barcodes(self): """Test initialization with separate left/right barcode lists.""" barcode_tuple = (DUMMY_BARCODES_8, DUMMY_BARCODES_6) - detector = CurioBarcodeDetector(barcode_tuple, min_score=13) + detector = CurioBarcodeDetector(barcode_tuple) # Should have 3*3 = 9 combined barcodes assert len(detector.barcode_indexer.seq_list) == 9 def test_no_match_random_sequence(self): """Test that random sequence returns no valid barcode.""" - detector = CurioBarcodeDetector(DUMMY_BARCODES_14, min_score=13) + detector = CurioBarcodeDetector(DUMMY_BARCODES_14) result = detector.find_barcode_umi("test_read", RANDOM_SEQUENCE) assert isinstance(result, LinkerBarcodeDetectionResult) @@ -637,13 +637,13 @@ class TestCurioIlluminaDetector: def test_init(self): """Test basic initialization.""" - detector = CurioIlluminaDetector(DUMMY_BARCODES_14, min_score=14) + detector = CurioIlluminaDetector(DUMMY_BARCODES_14) assert detector.barcode_indexer is not None assert detector.min_score == 14 def test_no_match_random_sequence(self): """Test that random sequence returns no valid barcode.""" - detector = CurioIlluminaDetector(DUMMY_BARCODES_14, min_score=14) + detector = CurioIlluminaDetector(DUMMY_BARCODES_14) result = detector.find_barcode_umi("test_read", RANDOM_SEQUENCE) assert isinstance(result, LinkerBarcodeDetectionResult) From 0b192d23fb54d5fd66e653efa8da280b568d42ea Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Tue, 3 Feb 2026 21:01:54 +0100 Subject: [PATCH 25/41] proper indexing params for universal barcode calling algorithm --- .../callers/universal_extraction.py | 69 +++++++++++++------ src/dataset_processor.py | 9 +-- 2 files changed, 52 insertions(+), 26 deletions(-) diff --git a/src/barcode_calling/callers/universal_extraction.py b/src/barcode_calling/callers/universal_extraction.py index 918a3219..920ab53a 100644 --- a/src/barcode_calling/callers/universal_extraction.py +++ b/src/barcode_calling/callers/universal_extraction.py @@ -5,11 +5,13 @@ ############################################################################ import logging +import math import sys +from ..indexers import ArrayKmerIndexer, Array2BitKmerIndexer, Dict2BitKmerIndexer, KmerIndexer +from ..common import batch_str_to_2bit from ...error_codes import IsoQuantExitCode from .extraction_result import DetectedElement, ExtractionResult -from ..indexers.base import KmerIndexer from ..common import find_polyt, reverese_complement, detect_exact_positions, find_candidate_with_max_score_ssw from .molecule_structure import ElementType, MoleculeStructure, MoleculeElement @@ -21,6 +23,9 @@ class UniversalSingleMoleculeExtractor: MIN_SCORE_COEFF_TERMMINAL = 0.5 TERMINAL_MATCH_DELTA = 3 MAX_LEN_DIFF = 0.25 + DEFAULT_ERROR_RATE = 0.03 # rought estimate for modern nanopore, doesn't matter that much + MAX_HITS = 10 + SCORE_DIFF = 1 def __init__(self, molecule_structure: MoleculeStructure): self.correct_sequences = True @@ -60,10 +65,7 @@ def __init__(self, molecule_structure: MoleculeStructure): # Duplicated parts: skip individual index, handled after loop pass elif self.correct_sequences: - self.index_dict[el.element_name] = self.prepare_barcode_index(el) - # FIXME: min score should be based on the length of the barcode and their sparcity - # e.g. find a way to estimate min_score so that error rate does not exceed 1% - self.min_scores[el.element_name] = el.element_length - 2 + self.prepare_barcode_index_for_element(el) self.elements_to_correct.add(el.element_name) else: self.elements_to_extract.add(el.element_name) @@ -81,9 +83,7 @@ def __init__(self, molecule_structure: MoleculeStructure): total_length += el.element_length if first_part_el is not None and first_part_el.element_value is not None: barcode_list = first_part_el.element_value - k = max(6, int(total_length / 2) - 2) - self.index_dict[base_name] = KmerIndexer(barcode_list, k) - self.min_scores[base_name] = total_length - 2 + self.prepare_barcode_index(base_name, barcode_list, total_length) # Build one index per duplicated base element using the full whitelist if self.correct_sequences: @@ -96,21 +96,46 @@ def __init__(self, molecule_structure: MoleculeStructure): break if first_part_el is not None and first_part_el.element_value is not None: barcode_list = first_part_el.element_value - k = max(6, int(first_part_el.element_length / 2) - 2) - self.index_dict[base_name] = KmerIndexer(barcode_list, k) - self.min_scores[base_name] = first_part_el.element_length - 2 + self.prepare_barcode_index(base_name, barcode_list, first_part_el.element_length) if not self.has_cdna: logger.critical("Molecule must include a cDNA") sys.exit(IsoQuantExitCode.INVALID_PARAMETER) - def prepare_barcode_index(self, element: MoleculeElement): + def prepare_barcode_index(self, base_name, barcode_list, barcode_length): + barcode_count = len(barcode_list) + error_rate = self.DEFAULT_ERROR_RATE + barcode_sparsity = math.pow(4, barcode_length) / barcode_count + filling_edit_distance = math.ceil(math.log(barcode_sparsity, 8 * barcode_length)) + error_probability = math.pow(1 - error_rate, barcode_length - filling_edit_distance) * math.pow(error_rate, filling_edit_distance) * math.comb(barcode_length, filling_edit_distance) + density = barcode_count * math.pow(8 * barcode_length, filling_edit_distance) / math.pow(4, barcode_length) + if error_probability > 0.01 or density > 1: + filling_edit_distance -= 1 + + if filling_edit_distance == 0: + self.min_scores[base_name] = barcode_length - 1 + self.min_scores[base_name] = barcode_length - filling_edit_distance + + if barcode_count > 1000000: + logger.warning("The number of barcodes for element %s is too large: %d, barcode calling may take substantial amount of time and RAM", (base_name, barcode_count)) + + kmer_size = min(max(6, int(barcode_length / 2) - 1), 15) + if kmer_size <= 8: + if barcode_count < 100000: + self.index_dict[base_name] = ArrayKmerIndexer(barcode_list, kmer_size) + else: + self.index_dict[base_name] = Array2BitKmerIndexer(batch_str_to_2bit(barcode_list), kmer_size) + else: + if barcode_count < 100000: + self.index_dict[base_name] = KmerIndexer(barcode_list, kmer_size) + else: + self.index_dict[base_name] = Dict2BitKmerIndexer(batch_str_to_2bit(barcode_list), kmer_size) + + def prepare_barcode_index_for_element(self, element: MoleculeElement): if element.element_type not in (ElementType.VAR_LIST, ElementType.VAR_FILE): - return None + return barcode_list = element.element_value - - # TODO: use optimal indices for long barocde lists - return KmerIndexer(barcode_list, max(6, int(element.element_length / 2) - 2)) + self.prepare_barcode_index(element.element_name, barcode_list, element.element_length) def result_type(self): return ExtractionResult @@ -172,10 +197,10 @@ def process_concatenated_elements(self, concatenated_element_storage, detected_r # If correction is enabled and an index exists, correct against the whitelist if self.correct_sequences and base_name in self.index_dict: - matching_sequences = self.index_dict[base_name].get_occurrences(concatenated_seq) + matching_sequences = self.index_dict[base_name].get_occurrences(concatenated_seq, max_hits=self.MAX_HITS) corrected_seq, seq_score, _, _ = find_candidate_with_max_score_ssw( matching_sequences, concatenated_seq, - min_score=self.min_scores[base_name]) + min_score=self.min_scores[base_name], score_diff=self.SCORE_DIFF) if corrected_seq is not None: detected_results[base_name] = DetectedElement( parts[0][0], parts[-1][1], seq_score, corrected_seq) @@ -214,9 +239,9 @@ def process_duplicated_elements(self, duplicated_element_storage, detected_resul if seq is None: corrected.append((None, -1)) continue - matching = self.index_dict[base_name].get_occurrences(seq) + matching = self.index_dict[base_name].get_occurrences(seq, max_hits=self.MAX_HITS) corrected_seq, score, _, _ = find_candidate_with_max_score_ssw( - matching, seq, min_score=self.min_scores[base_name]) + matching, seq, min_score=self.min_scores[base_name], score_diff=self.SCORE_DIFF) corrected.append((corrected_seq, score)) successful = [(seq, score) for seq, score in corrected if seq is not None] @@ -262,10 +287,10 @@ def process_duplicated_elements(self, duplicated_element_storage, detected_resul def correct_element(self, element: MoleculeElement, sequence: str, potential_start: int, potential_end: int) -> DetectedElement: potential_seq = sequence[potential_start:potential_end + 1] - matching_sequences = self.index_dict[element.element_name].get_occurrences(potential_seq) + matching_sequences = self.index_dict[element.element_name].get_occurrences(potential_seq, max_hits=self.MAX_HITS) corrected_seq, seq_score, seq_start, seq_end = \ find_candidate_with_max_score_ssw(matching_sequences, potential_seq, - min_score=self.min_scores[element.element_name]) + min_score=self.min_scores[element.element_name], score_diff=self.SCORE_DIFF) if corrected_seq is not None: read_seq_start = potential_start + seq_start read_seq_end = potential_start + seq_end - 1 diff --git a/src/dataset_processor.py b/src/dataset_processor.py index 9182c06c..445ea237 100644 --- a/src/dataset_processor.py +++ b/src/dataset_processor.py @@ -133,10 +133,11 @@ def process_sample(self, sample): len(sample.file_list) > 1 and self.args.read_group is not None and "file_name" in self.args.read_group) - if sample.use_technical_replicas: - logger.info("Technical replicas filtering enabled: novel transcripts must be confirmed by at least 2 files") - elif len(sample.file_list) > 1 and not self.args.use_replicas: - logger.info("Technical replicas filtering disabled by --use_replicas false") + if not self.args.no_model_construction: + if sample.use_technical_replicas: + logger.info("Technical replicas filtering enabled: novel transcripts must be confirmed by at least 2 files") + elif len(sample.file_list) > 1 and not self.args.use_replicas: + logger.info("Technical replicas filtering disabled by --use_replicas false") # Initialize all_read_groups as list of sets (one per grouping strategy) self.all_read_groups = [set() for _ in self.grouping_strategy_names] From 0601cbca60fb63e234d6557bf46a79a1cc33f4c8 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Tue, 3 Feb 2026 21:52:14 +0100 Subject: [PATCH 26/41] fix parser error handling --- src/barcode_calling/callers/molecule_structure.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/barcode_calling/callers/molecule_structure.py b/src/barcode_calling/callers/molecule_structure.py index e5c6828c..478bc28e 100644 --- a/src/barcode_calling/callers/molecule_structure.py +++ b/src/barcode_calling/callers/molecule_structure.py @@ -119,6 +119,8 @@ def __init__(self, str_iterator): element_properties = {} for l in str_iterator: v = l.strip().split() + if not v: + continue element_name = v[0] if element_name in element_properties: logger.critical("Duplicated element name %s" % element_name) @@ -135,7 +137,7 @@ def __init__(self, str_iterator): element_name: str for element_name in elements: if element_name not in element_properties: - if element_name not in ElementType.__dict__: + if element_name not in (ElementType.cDNA.name, ElementType.PolyT.name): logger.critical("Molecule element %s was not described in the format file" % element_name) sys.exit(IsoQuantExitCode.INVALID_FILE_FORMAT) element_type = ElementType[element_name] From 1bd719d8aa7ce76589a75220e396c178fad3c084 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Tue, 3 Feb 2026 22:09:45 +0100 Subject: [PATCH 27/41] report only all barcodes, fix tests --- .../callers/extraction_result.py | 25 ++++++++++++++----- tests/test_universal_extraction.py | 4 +-- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/src/barcode_calling/callers/extraction_result.py b/src/barcode_calling/callers/extraction_result.py index 4c671d22..2c3fb4b7 100644 --- a/src/barcode_calling/callers/extraction_result.py +++ b/src/barcode_calling/callers/extraction_result.py @@ -78,10 +78,11 @@ def get_barcode(self) -> str: Get the detected barcode sequence. For molecule structures with multiple barcode elements (e.g., barcode1, barcode2), - returns concatenated barcodes using '|' delimiter. + returns concatenated barcodes using '|' delimiter only when ALL parts are detected. + Returns NOSEQ if any part is missing (partial barcodes are not usable). Returns: - str: Concatenated barcode sequence or NOSEQ if not detected. + str: Concatenated barcode sequence or NOSEQ if any part is not detected. """ parts = [] for el_name in self._barcode_elements: @@ -89,6 +90,8 @@ def get_barcode(self) -> str: detected = self.detected_results[el_name] if detected.seq and detected.seq != self.NOSEQ: parts.append(detected.seq) + continue + return self.NOSEQ return "|".join(parts) if parts else self.NOSEQ def get_umi(self) -> str: @@ -96,21 +99,31 @@ def get_umi(self) -> str: Get the detected UMI sequence. For molecule structures with multiple UMI elements, - returns concatenated UMIs using '|' delimiter. + returns all parts using '|' delimiter, with NOSEQ ('*') for undetected elements. Returns: str: Concatenated UMI sequence or NOSEQ if not detected. """ + if len(self._umi_elements) <= 1: + for el_name in self._umi_elements: + if el_name in self.detected_results: + detected = self.detected_results[el_name] + if detected.seq and detected.seq != self.NOSEQ: + return detected.seq + return self.NOSEQ + parts = [] for el_name in self._umi_elements: if el_name in self.detected_results: detected = self.detected_results[el_name] if detected.seq and detected.seq != self.NOSEQ: parts.append(detected.seq) - return "|".join(parts) if parts else self.NOSEQ + continue + parts.append(self.NOSEQ) + return "|".join(parts) def is_valid(self) -> bool: - """Check if a valid barcode was detected.""" + """Check if a valid barcode was detected (all parts must be present).""" return self.get_barcode() != self.NOSEQ def has_barcode(self) -> bool: @@ -118,7 +131,7 @@ def has_barcode(self) -> bool: return self.get_barcode() != self.NOSEQ def has_umi(self) -> bool: - """Check if a valid UMI was detected.""" + """Check if any valid UMI was detected.""" return self.get_umi() != self.NOSEQ def get_barcode_score(self) -> int: diff --git a/tests/test_universal_extraction.py b/tests/test_universal_extraction.py index 0bc8d064..e9363393 100644 --- a/tests/test_universal_extraction.py +++ b/tests/test_universal_extraction.py @@ -949,7 +949,7 @@ def test_process_duplicated_one_succeeds(self): extractor = UniversalSingleMoleculeExtractor(structure) # Copy 1: AAAACCCC (exact match), Copy 2: totally wrong sequence - sequence = "XXXXXXXXXX" + "AAAACCCC" + "XXXXXXXX" + "NNNNNNNN" + "XXXXXXXX" + sequence = "XXXXXXXXXX" + "AAAACCCC" + "XXXXXXXX" + "TTTTTTTT" + "XXXXXXXX" dupl_storage = {"Barcode": [(10, 17), (26, 33)]} detected_results = {} @@ -965,7 +965,7 @@ def test_process_duplicated_both_fail(self): extractor = UniversalSingleMoleculeExtractor(structure) # Both copies have garbage sequences that don't match any whitelist entry - sequence = "XXXXXXXXXX" + "NNNNNNNN" + "XXXXXXXX" + "NNNNNNNN" + "XXXXXXXX" + sequence = "XXXXXXXXXX" + "TTTTTTTT" + "XXXXXXXX" + "TTTTTTTT" + "XXXXXXXX" dupl_storage = {"Barcode": [(10, 17), (26, 33)]} detected_results = {} From bf4c5416383ebebc8221eace690f29257fbab5fb Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Wed, 4 Feb 2026 00:21:16 +0100 Subject: [PATCH 28/41] add umi filtering for custom sc --- src/dataset_processor.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/dataset_processor.py b/src/dataset_processor.py index 445ea237..026b6f6f 100644 --- a/src/dataset_processor.py +++ b/src/dataset_processor.py @@ -497,7 +497,8 @@ def filter_umis(self, sample): IsoQuantMode.curio: [3], IsoQuantMode.visium_hd: [4], IsoQuantMode.stereoseq: [4], - IsoQuantMode.stereoseq_nosplit: [4]} + IsoQuantMode.stereoseq_nosplit: [4], + IsoQuantMode.custom_sc: [4]} for i, edit_distance in enumerate(umi_ed_dict[self.args.mode]): logger.info("Filtering PCR duplicates with edit distance %d" % edit_distance) From 6ef1228bff0ef3aaccdfe1b49de536fa8992dd5b Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Wed, 4 Feb 2026 11:16:17 +0100 Subject: [PATCH 29/41] log barcode index --- src/barcode_calling/callers/universal_extraction.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/barcode_calling/callers/universal_extraction.py b/src/barcode_calling/callers/universal_extraction.py index 920ab53a..6cfc614d 100644 --- a/src/barcode_calling/callers/universal_extraction.py +++ b/src/barcode_calling/callers/universal_extraction.py @@ -130,6 +130,7 @@ def prepare_barcode_index(self, base_name, barcode_list, barcode_length): self.index_dict[base_name] = KmerIndexer(barcode_list, kmer_size) else: self.index_dict[base_name] = Dict2BitKmerIndexer(batch_str_to_2bit(barcode_list), kmer_size) + logger.info("Indexed %d barcodes for element %s" % (barcode_count, base_name)) def prepare_barcode_index_for_element(self, element: MoleculeElement): if element.element_type not in (ElementType.VAR_LIST, ElementType.VAR_FILE): From f5730fbadae302f5034a6b66aeb2d0ffbc778709 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Wed, 4 Feb 2026 11:21:01 +0100 Subject: [PATCH 30/41] create tmp dir inside output folder if tmp is not set --- isoquant_detect_barcodes.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/isoquant_detect_barcodes.py b/isoquant_detect_barcodes.py index 74ec47bb..c56d0f5c 100755 --- a/isoquant_detect_barcodes.py +++ b/isoquant_detect_barcodes.py @@ -348,6 +348,8 @@ def _process_single_file_in_parallel(input_file, output_tsv, out_fasta, args, ba tmp_dir = "barcode_calling_%x" % random.randint(0, 1 << 32) if args.tmp_dir: tmp_dir = os.path.join(args.tmp_dir, tmp_dir) + else: + tmp_dir = os.path.join(os.path.dirname(args.output), tmp_dir) os.makedirs(tmp_dir) tmp_barcode_file = os.path.join(tmp_dir, "bc") From aefecf96fa5832afe0b3d50eebfd4d3ed2d5a160 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Wed, 4 Feb 2026 11:59:07 +0100 Subject: [PATCH 31/41] fix seq_len and remove default values --- src/barcode_calling/callers/universal_extraction.py | 4 ++-- src/barcode_calling/indexers/two_bit.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/barcode_calling/callers/universal_extraction.py b/src/barcode_calling/callers/universal_extraction.py index 6cfc614d..c502287f 100644 --- a/src/barcode_calling/callers/universal_extraction.py +++ b/src/barcode_calling/callers/universal_extraction.py @@ -124,12 +124,12 @@ def prepare_barcode_index(self, base_name, barcode_list, barcode_length): if barcode_count < 100000: self.index_dict[base_name] = ArrayKmerIndexer(barcode_list, kmer_size) else: - self.index_dict[base_name] = Array2BitKmerIndexer(batch_str_to_2bit(barcode_list), kmer_size) + self.index_dict[base_name] = Array2BitKmerIndexer(batch_str_to_2bit(barcode_list), kmer_size, seq_len=barcode_length) else: if barcode_count < 100000: self.index_dict[base_name] = KmerIndexer(barcode_list, kmer_size) else: - self.index_dict[base_name] = Dict2BitKmerIndexer(batch_str_to_2bit(barcode_list), kmer_size) + self.index_dict[base_name] = Dict2BitKmerIndexer(batch_str_to_2bit(barcode_list), kmer_size, seq_len=barcode_length) logger.info("Indexed %d barcodes for element %s" % (barcode_count, base_name)) def prepare_barcode_index_for_element(self, element: MoleculeElement): diff --git a/src/barcode_calling/indexers/two_bit.py b/src/barcode_calling/indexers/two_bit.py index 2c5d5290..7c9dbbb9 100644 --- a/src/barcode_calling/indexers/two_bit.py +++ b/src/barcode_calling/indexers/two_bit.py @@ -26,7 +26,7 @@ class Dict2BitKmerIndexer: Best for medium to large barcode sets. """ - def __init__(self, known_bit_seqs: Iterable[int], kmer_size: int = 6, seq_len: int = 25): + def __init__(self, known_bit_seqs: Iterable[int], kmer_size: int, seq_len: int): """ Initialize k-mer index with 2-bit encoded sequences. @@ -131,7 +131,7 @@ class Array2BitKmerIndexer: Best for large barcode sets (e.g., single-cell whitelists). """ - def __init__(self, known_bin_seq: Iterable[int], kmer_size: int = 12, seq_len: int = 25): + def __init__(self, known_bin_seq: Iterable[int], kmer_size: int, seq_len: int): """ Initialize 2-bit k-mer index. From 8cc6367223e4b4624554f98e505c3629ea06e09d Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Wed, 4 Feb 2026 12:28:37 +0100 Subject: [PATCH 32/41] fix seq len in str_to_bit conversion --- src/barcode_calling/callers/universal_extraction.py | 6 ++++-- src/barcode_calling/common.py | 4 ++-- tests/test_barcode_common.py | 4 ++-- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/barcode_calling/callers/universal_extraction.py b/src/barcode_calling/callers/universal_extraction.py index c502287f..9e5c6ec3 100644 --- a/src/barcode_calling/callers/universal_extraction.py +++ b/src/barcode_calling/callers/universal_extraction.py @@ -124,12 +124,14 @@ def prepare_barcode_index(self, base_name, barcode_list, barcode_length): if barcode_count < 100000: self.index_dict[base_name] = ArrayKmerIndexer(barcode_list, kmer_size) else: - self.index_dict[base_name] = Array2BitKmerIndexer(batch_str_to_2bit(barcode_list), kmer_size, seq_len=barcode_length) + self.index_dict[base_name] = Array2BitKmerIndexer(batch_str_to_2bit(barcode_list, barcode_length), + kmer_size, seq_len=barcode_length) else: if barcode_count < 100000: self.index_dict[base_name] = KmerIndexer(barcode_list, kmer_size) else: - self.index_dict[base_name] = Dict2BitKmerIndexer(batch_str_to_2bit(barcode_list), kmer_size, seq_len=barcode_length) + self.index_dict[base_name] = Dict2BitKmerIndexer(batch_str_to_2bit(barcode_list, barcode_length), + kmer_size, seq_len=barcode_length) logger.info("Indexed %d barcodes for element %s" % (barcode_count, base_name)) def prepare_barcode_index_for_element(self, element: MoleculeElement): diff --git a/src/barcode_calling/common.py b/src/barcode_calling/common.py index 431d748c..4349a5bf 100644 --- a/src/barcode_calling/common.py +++ b/src/barcode_calling/common.py @@ -281,7 +281,7 @@ def str_to_2bit(seq): return kmer_idx -def batch_str_to_2bit(barcodes, seq_len=25): +def batch_str_to_2bit(barcodes, seq_len): """ Convert a list/iterator of barcode strings to 2-bit encoded integers. @@ -338,7 +338,7 @@ def batch_str_to_2bit(barcodes, seq_len=25): return result -def batch_str_to_2bit_chunked(barcode_iterator, seq_len=25, chunk_size=50_000_000): +def batch_str_to_2bit_chunked(barcode_iterator, seq_len, chunk_size=50_000_000): """ Convert barcodes to 2-bit encoding in memory-efficient chunks. diff --git a/tests/test_barcode_common.py b/tests/test_barcode_common.py index 19a89533..2e04a94d 100644 --- a/tests/test_barcode_common.py +++ b/tests/test_barcode_common.py @@ -303,7 +303,7 @@ class TestBatchStrTo2Bit: def test_empty_input(self): """Test empty input returns empty array.""" import numpy as np - result = batch_str_to_2bit([]) + result = batch_str_to_2bit([], 25) assert len(result) == 0 assert result.dtype == np.uint64 @@ -361,7 +361,7 @@ class TestBatchStrTo2BitChunked: def test_empty_input(self): """Test empty iterator returns empty array.""" import numpy as np - result = batch_str_to_2bit_chunked(iter([])) + result = batch_str_to_2bit_chunked(iter([]), 25) assert len(result) == 0 assert result.dtype == np.uint64 From 889069588805c9f1e129422e3af25e02a6ec9101 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Wed, 4 Feb 2026 14:46:16 +0100 Subject: [PATCH 33/41] account for variable length barcode --- docs/single_cell.md | 5 ++++- .../callers/universal_extraction.py | 16 ++++++++++++++-- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/docs/single_cell.md b/docs/single_cell.md index aea0438e..1d88402d 100644 --- a/docs/single_cell.md +++ b/docs/single_cell.md @@ -144,7 +144,7 @@ An MDF file has two parts: | `CONST` | Constant/known sequence (primer, linker, TSO) | Sequence | | `VAR_FILE` | Variable sequence matched against a whitelist file | Path to TSV file | | `VAR_LIST` | Variable sequence matched against an inline list | Comma-separated sequences | -| `VAR_ANY` | Variable-length sequence extracted as-is | Length (integer) | +| `VAR_ANY` | Variable fixed-length sequence extracted as-is | Length (integer) | | `VAR_ANY_SEPARATOR` | Fixed-length variable separator sequence (not extracted) | Length (integer) | | `VAR_ANY_NON_T_SEPARATOR` | Fixed-length variable separator sequence without T nucleoties (not extracted) | Length (integer) | | `PolyT` | PolyT tail | (none) | @@ -152,6 +152,9 @@ An MDF file has two parts: At the moment, only a single `cDNA` and a single `PolyT` are supported. +Variable elements are expected to have a fixed length (`VAR_FILE` and `VAR_LIST`). +Using variable-length barcodes may result in suboptimal performance. + ### Barcode and UMI identification Elements are identified as barcodes or UMIs by their name prefix (case-insensitive): diff --git a/src/barcode_calling/callers/universal_extraction.py b/src/barcode_calling/callers/universal_extraction.py index 9e5c6ec3..d62ef581 100644 --- a/src/barcode_calling/callers/universal_extraction.py +++ b/src/barcode_calling/callers/universal_extraction.py @@ -103,6 +103,15 @@ def __init__(self, molecule_structure: MoleculeStructure): sys.exit(IsoQuantExitCode.INVALID_PARAMETER) def prepare_barcode_index(self, base_name, barcode_list, barcode_length): + # Check whether all barcodes have the same length + barcode_lengths = set(len(b) for b in barcode_list) + variable_length = len(barcode_lengths) > 1 + if variable_length: + logger.warning("Barcodes for element %s have variable lengths (%s), " + "performance may be suboptimal" % + (base_name, ", ".join(str(l) for l in sorted(barcode_lengths)))) + barcode_length = min(barcode_lengths) + barcode_count = len(barcode_list) error_rate = self.DEFAULT_ERROR_RATE barcode_sparsity = math.pow(4, barcode_length) / barcode_count @@ -115,19 +124,22 @@ def prepare_barcode_index(self, base_name, barcode_list, barcode_length): if filling_edit_distance == 0: self.min_scores[base_name] = barcode_length - 1 self.min_scores[base_name] = barcode_length - filling_edit_distance + if variable_length: + self.min_scores[base_name] = max(self.min_scores[base_name] - 1, 0) + logger.info("Minimal score for element %s is set to %d" % (base_name, self.min_scores[base_name])) if barcode_count > 1000000: logger.warning("The number of barcodes for element %s is too large: %d, barcode calling may take substantial amount of time and RAM", (base_name, barcode_count)) kmer_size = min(max(6, int(barcode_length / 2) - 1), 15) if kmer_size <= 8: - if barcode_count < 100000: + if barcode_count < 100000 or variable_length: self.index_dict[base_name] = ArrayKmerIndexer(barcode_list, kmer_size) else: self.index_dict[base_name] = Array2BitKmerIndexer(batch_str_to_2bit(barcode_list, barcode_length), kmer_size, seq_len=barcode_length) else: - if barcode_count < 100000: + if barcode_count < 100000 or variable_length: self.index_dict[base_name] = KmerIndexer(barcode_list, kmer_size) else: self.index_dict[base_name] = Dict2BitKmerIndexer(batch_str_to_2bit(barcode_list, barcode_length), From f219f3a44e97c7dca3324dddc25e388bcb7bdbed Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Wed, 4 Feb 2026 12:59:47 +0200 Subject: [PATCH 34/41] add molecule to barcdoe detection test config --- tests/github/run_barcode_test.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/github/run_barcode_test.py b/tests/github/run_barcode_test.py index 03c0c9cf..a08d45dc 100644 --- a/tests/github/run_barcode_test.py +++ b/tests/github/run_barcode_test.py @@ -139,6 +139,11 @@ def run_detect_barcodes(args, config_dict): barcode_files = fix_paths(config_file, config_dict["barcodes"]) detect_command.extend(["--barcodes"] + barcode_files) + # Add molecule description file (for custom_sc mode) + if "molecule" in config_dict: + molecule_file = fix_path(config_file, config_dict["molecule"]) + detect_command.extend(["--molecule", molecule_file]) + # Add optional parameters if "threads" in config_dict: detect_command.extend(["-t", config_dict["threads"]]) From f089752d088a46354bc27fd703ce93dcca67e48a Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Wed, 4 Feb 2026 15:47:44 +0200 Subject: [PATCH 35/41] custom sc barocode callign tests --- .github/workflows/Barcode.Custom.10x.yml | 82 +++++++++++++++++++ .github/workflows/Barcode.Custom.Curio.yml | 82 +++++++++++++++++++ .../workflows/Barcode.Custom.StereoSeq.yml | 82 +++++++++++++++++++ 3 files changed, 246 insertions(+) create mode 100644 .github/workflows/Barcode.Custom.10x.yml create mode 100644 .github/workflows/Barcode.Custom.Curio.yml create mode 100644 .github/workflows/Barcode.Custom.StereoSeq.yml diff --git a/.github/workflows/Barcode.Custom.10x.yml b/.github/workflows/Barcode.Custom.10x.yml new file mode 100644 index 00000000..92c1f606 --- /dev/null +++ b/.github/workflows/Barcode.Custom.10x.yml @@ -0,0 +1,82 @@ +name: Barcode Custom SC 10x + +on: + workflow_dispatch: + schedule: + - cron: '0 4 * * 1' + +env: + LAUNCHER: ${{github.workspace}}/tests/github/run_barcode_test.py + CFG_DIR: /abga/work/andreyp/ci_isoquant/data/barcodes + BIN_PATH: /abga/work/andreyp/ci_isoquant/bin/ + OUTPUT_BASE: /abga/work/andreyp/ci_isoquant/output/${{github.ref_name}}/barcodes/ + +concurrency: + group: ${{github.workflow}} + cancel-in-progress: false + +jobs: + check-changes: + runs-on: + labels: [isoquant] + name: 'Check for recent changes' + outputs: + has_changes: ${{steps.check.outputs.has_changes}} + steps: + - name: 'Checkout' + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: 'Check for commits in last 7 days' + id: check + run: | + # Always run on manual trigger + if [ "${{github.event_name}}" = "workflow_dispatch" ]; then + echo "has_changes=true" >> $GITHUB_OUTPUT + exit 0 + fi + # Check for commits in last 7 days + COMMITS=$(git log --oneline --since="7 days ago" | wc -l) + if [ "$COMMITS" -gt 0 ]; then + echo "Found $COMMITS commits in last 7 days" + echo "has_changes=true" >> $GITHUB_OUTPUT + else + echo "No commits in last 7 days, skipping" + echo "has_changes=false" >> $GITHUB_OUTPUT + fi + + launch-runner: + needs: check-changes + if: needs.check-changes.outputs.has_changes == 'true' + runs-on: + labels: [isoquant] + name: 'Running custom_sc 10x barcode detection QC' + + steps: + - name: 'Cleanup' + run: > + set -e && + shopt -s dotglob && + rm -rf * + + - name: 'Checkout' + uses: actions/checkout@v3 + with: + fetch-depth: 1 + + - name: 'Custom SC 10x small (ArrayKmerIndexer)' + shell: bash + env: + STEP_NAME: Mouse.10x.custom_sc.small + run: | + export PATH=$PATH:${{env.BIN_PATH}} + python3 ${{env.LAUNCHER}} ${{env.CFG_DIR}}/${{env.STEP_NAME}}.cfg -o ${{env.OUTPUT_BASE}} + + - name: 'Custom SC 10x large (Array2BitKmerIndexer)' + shell: bash + env: + STEP_NAME: Mouse.10x.custom_sc.large + run: | + export PATH=$PATH:${{env.BIN_PATH}} + python3 ${{env.LAUNCHER}} ${{env.CFG_DIR}}/${{env.STEP_NAME}}.cfg -o ${{env.OUTPUT_BASE}} diff --git a/.github/workflows/Barcode.Custom.Curio.yml b/.github/workflows/Barcode.Custom.Curio.yml new file mode 100644 index 00000000..3c2b0c69 --- /dev/null +++ b/.github/workflows/Barcode.Custom.Curio.yml @@ -0,0 +1,82 @@ +name: Barcode Custom SC Curio + +on: + workflow_dispatch: + schedule: + - cron: '0 3 * * 3' + +env: + LAUNCHER: ${{github.workspace}}/tests/github/run_barcode_test.py + CFG_DIR: /abga/work/andreyp/ci_isoquant/data/barcodes + BIN_PATH: /abga/work/andreyp/ci_isoquant/bin/ + OUTPUT_BASE: /abga/work/andreyp/ci_isoquant/output/${{github.ref_name}}/barcodes/ + +concurrency: + group: ${{github.workflow}} + cancel-in-progress: false + +jobs: + check-changes: + runs-on: + labels: [isoquant] + name: 'Check for recent changes' + outputs: + has_changes: ${{steps.check.outputs.has_changes}} + steps: + - name: 'Checkout' + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: 'Check for commits in last 7 days' + id: check + run: | + # Always run on manual trigger + if [ "${{github.event_name}}" = "workflow_dispatch" ]; then + echo "has_changes=true" >> $GITHUB_OUTPUT + exit 0 + fi + # Check for commits in last 7 days + COMMITS=$(git log --oneline --since="7 days ago" | wc -l) + if [ "$COMMITS" -gt 0 ]; then + echo "Found $COMMITS commits in last 7 days" + echo "has_changes=true" >> $GITHUB_OUTPUT + else + echo "No commits in last 7 days, skipping" + echo "has_changes=false" >> $GITHUB_OUTPUT + fi + + launch-runner: + needs: check-changes + if: needs.check-changes.outputs.has_changes == 'true' + runs-on: + labels: [isoquant] + name: 'Running custom_sc Curio barcode detection QC' + + steps: + - name: 'Cleanup' + run: > + set -e && + shopt -s dotglob && + rm -rf * + + - name: 'Checkout' + uses: actions/checkout@v3 + with: + fetch-depth: 1 + + - name: 'Custom SC Curio small (ArrayKmerIndexer)' + shell: bash + env: + STEP_NAME: Human.Curio.custom_sc.small + run: | + export PATH=$PATH:${{env.BIN_PATH}} + python3 ${{env.LAUNCHER}} ${{env.CFG_DIR}}/${{env.STEP_NAME}}.cfg -o ${{env.OUTPUT_BASE}} + + - name: 'Custom SC Curio large (Array2BitKmerIndexer)' + shell: bash + env: + STEP_NAME: Human.Curio.custom_sc.large + run: | + export PATH=$PATH:${{env.BIN_PATH}} + python3 ${{env.LAUNCHER}} ${{env.CFG_DIR}}/${{env.STEP_NAME}}.cfg -o ${{env.OUTPUT_BASE}} diff --git a/.github/workflows/Barcode.Custom.StereoSeq.yml b/.github/workflows/Barcode.Custom.StereoSeq.yml new file mode 100644 index 00000000..161521d3 --- /dev/null +++ b/.github/workflows/Barcode.Custom.StereoSeq.yml @@ -0,0 +1,82 @@ +name: Barcode Custom SC StereoSeq + +on: + workflow_dispatch: + schedule: + - cron: '0 1 * * 5' + +env: + LAUNCHER: ${{github.workspace}}/tests/github/run_barcode_test.py + CFG_DIR: /abga/work/andreyp/ci_isoquant/data/barcodes + BIN_PATH: /abga/work/andreyp/ci_isoquant/bin/ + OUTPUT_BASE: /abga/work/andreyp/ci_isoquant/output/${{github.ref_name}}/barcodes/ + +concurrency: + group: ${{github.workflow}} + cancel-in-progress: false + +jobs: + check-changes: + runs-on: + labels: [isoquant] + name: 'Check for recent changes' + outputs: + has_changes: ${{steps.check.outputs.has_changes}} + steps: + - name: 'Checkout' + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: 'Check for commits in last 7 days' + id: check + run: | + # Always run on manual trigger + if [ "${{github.event_name}}" = "workflow_dispatch" ]; then + echo "has_changes=true" >> $GITHUB_OUTPUT + exit 0 + fi + # Check for commits in last 7 days + COMMITS=$(git log --oneline --since="7 days ago" | wc -l) + if [ "$COMMITS" -gt 0 ]; then + echo "Found $COMMITS commits in last 7 days" + echo "has_changes=true" >> $GITHUB_OUTPUT + else + echo "No commits in last 7 days, skipping" + echo "has_changes=false" >> $GITHUB_OUTPUT + fi + + launch-runner: + needs: check-changes + if: needs.check-changes.outputs.has_changes == 'true' + runs-on: + labels: [isoquant] + name: 'Running custom_sc StereoSeq barcode detection QC' + + steps: + - name: 'Cleanup' + run: > + set -e && + shopt -s dotglob && + rm -rf * + + - name: 'Checkout' + uses: actions/checkout@v3 + with: + fetch-depth: 1 + + - name: 'Custom SC StereoSeq small (KmerIndexer)' + shell: bash + env: + STEP_NAME: Mouse.StereoSeq.custom_sc.small + run: | + export PATH=$PATH:${{env.BIN_PATH}} + python3 ${{env.LAUNCHER}} ${{env.CFG_DIR}}/${{env.STEP_NAME}}.cfg -o ${{env.OUTPUT_BASE}} + + - name: 'Custom SC StereoSeq large (Dict2BitKmerIndexer)' + shell: bash + env: + STEP_NAME: Mouse.StereoSeq.custom_sc.large + run: | + export PATH=$PATH:${{env.BIN_PATH}} + python3 ${{env.LAUNCHER}} ${{env.CFG_DIR}}/${{env.STEP_NAME}}.cfg -o ${{env.OUTPUT_BASE}} From a8c6ff92feb3ffcb054d66427e7431e97cbdb41d Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Wed, 4 Feb 2026 14:50:05 +0100 Subject: [PATCH 36/41] fix var len min score --- src/barcode_calling/callers/universal_extraction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/barcode_calling/callers/universal_extraction.py b/src/barcode_calling/callers/universal_extraction.py index d62ef581..de774ae7 100644 --- a/src/barcode_calling/callers/universal_extraction.py +++ b/src/barcode_calling/callers/universal_extraction.py @@ -125,7 +125,7 @@ def prepare_barcode_index(self, base_name, barcode_list, barcode_length): self.min_scores[base_name] = barcode_length - 1 self.min_scores[base_name] = barcode_length - filling_edit_distance if variable_length: - self.min_scores[base_name] = max(self.min_scores[base_name] - 1, 0) + self.min_scores[base_name] = min(self.min_scores[base_name] + 1, barcode_length) logger.info("Minimal score for element %s is set to %d" % (base_name, self.min_scores[base_name])) if barcode_count > 1000000: From 66d3ce1f4d6aaed763790a756f0dba89a05c54aa Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Wed, 4 Feb 2026 16:38:24 +0200 Subject: [PATCH 37/41] visium custom testing --- .github/workflows/Barcode.Custom.VisiumHD.yml | 82 +++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 .github/workflows/Barcode.Custom.VisiumHD.yml diff --git a/.github/workflows/Barcode.Custom.VisiumHD.yml b/.github/workflows/Barcode.Custom.VisiumHD.yml new file mode 100644 index 00000000..5e32ed2a --- /dev/null +++ b/.github/workflows/Barcode.Custom.VisiumHD.yml @@ -0,0 +1,82 @@ +name: Barcode Custom SC Visium HD + +on: + workflow_dispatch: + schedule: + - cron: '0 2 * * 4' + +env: + LAUNCHER: ${{github.workspace}}/tests/github/run_barcode_test.py + CFG_DIR: /abga/work/andreyp/ci_isoquant/data/barcodes + BIN_PATH: /abga/work/andreyp/ci_isoquant/bin/ + OUTPUT_BASE: /abga/work/andreyp/ci_isoquant/output/${{github.ref_name}}/barcodes/ + +concurrency: + group: ${{github.workflow}} + cancel-in-progress: false + +jobs: + check-changes: + runs-on: + labels: [isoquant] + name: 'Check for recent changes' + outputs: + has_changes: ${{steps.check.outputs.has_changes}} + steps: + - name: 'Checkout' + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: 'Check for commits in last 7 days' + id: check + run: | + # Always run on manual trigger + if [ "${{github.event_name}}" = "workflow_dispatch" ]; then + echo "has_changes=true" >> $GITHUB_OUTPUT + exit 0 + fi + # Check for commits in last 7 days + COMMITS=$(git log --oneline --since="7 days ago" | wc -l) + if [ "$COMMITS" -gt 0 ]; then + echo "Found $COMMITS commits in last 7 days" + echo "has_changes=true" >> $GITHUB_OUTPUT + else + echo "No commits in last 7 days, skipping" + echo "has_changes=false" >> $GITHUB_OUTPUT + fi + + launch-runner: + needs: check-changes + if: needs.check-changes.outputs.has_changes == 'true' + runs-on: + labels: [isoquant] + name: 'Running custom_sc Visium HD barcode detection QC' + + steps: + - name: 'Cleanup' + run: > + set -e && + shopt -s dotglob && + rm -rf * + + - name: 'Checkout' + uses: actions/checkout@v3 + with: + fetch-depth: 1 + + - name: 'Custom SC Visium HD small (ArrayKmerIndexer)' + shell: bash + env: + STEP_NAME: Mouse.VisiumHD.custom_sc.small + run: | + export PATH=$PATH:${{env.BIN_PATH}} + python3 ${{env.LAUNCHER}} ${{env.CFG_DIR}}/${{env.STEP_NAME}}.cfg -o ${{env.OUTPUT_BASE}} + + - name: 'Custom SC Visium HD large (Array2BitKmerIndexer)' + shell: bash + env: + STEP_NAME: Mouse.VisiumHD.custom_sc.large + run: | + export PATH=$PATH:${{env.BIN_PATH}} + python3 ${{env.LAUNCHER}} ${{env.CFG_DIR}}/${{env.STEP_NAME}}.cfg -o ${{env.OUTPUT_BASE}} From 5228cf87e0dce0e987198550a3547afe301f07f3 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Wed, 4 Feb 2026 16:29:00 +0100 Subject: [PATCH 38/41] test data --- tests/universal_data/10x.mdf | 5 +++++ tests/universal_data/barcodes_small.tsv | 8 ++++++++ tests/universal_data/simple.mdf | 3 +++ tests/universal_data/test_reads.fq | 16 ++++++++++++++++ 4 files changed, 32 insertions(+) create mode 100644 tests/universal_data/10x.mdf create mode 100644 tests/universal_data/barcodes_small.tsv create mode 100644 tests/universal_data/simple.mdf create mode 100644 tests/universal_data/test_reads.fq diff --git a/tests/universal_data/10x.mdf b/tests/universal_data/10x.mdf new file mode 100644 index 00000000..b14e4004 --- /dev/null +++ b/tests/universal_data/10x.mdf @@ -0,0 +1,5 @@ +R1:Barcode:UMI:PolyT:cDNA:TSO +R1 CONST CTACACGACGCTCTTCCGATCT +Barcode VAR_FILE barcodes_small.tsv +UMI VAR_ANY 12 +TSO CONST CCCATGTACTCTGCGTTGATACCACTGCTT diff --git a/tests/universal_data/barcodes_small.tsv b/tests/universal_data/barcodes_small.tsv new file mode 100644 index 00000000..3fd4bd57 --- /dev/null +++ b/tests/universal_data/barcodes_small.tsv @@ -0,0 +1,8 @@ +AAACCCGGGTTTAAAC +TTTGGGCCCAAATTTG +GGGGAAAACCCCTTTT +ATCCTTAGTGTTTGTC +CTTAGGAGTGGTTAGC +TGAGGGCCAACGTGCT +GCGCTTGCATAGCAGG +ATCATGTCATTGTGAT diff --git a/tests/universal_data/simple.mdf b/tests/universal_data/simple.mdf new file mode 100644 index 00000000..f1ce3ecb --- /dev/null +++ b/tests/universal_data/simple.mdf @@ -0,0 +1,3 @@ +Barcode:UMI:PolyT:cDNA +Barcode VAR_LIST AAACCCGGGTTTAAAC,TTTGGGCCCAAATTTG,GGGGAAAACCCCTTTT +UMI VAR_ANY 10 diff --git a/tests/universal_data/test_reads.fq b/tests/universal_data/test_reads.fq new file mode 100644 index 00000000..281eb1e2 --- /dev/null +++ b/tests/universal_data/test_reads.fq @@ -0,0 +1,16 @@ +@READ_1_ATCCTTAGTGTTTGTC_AACCTTAGACCG_forward +TACACGACGCTCTTCCGATCTATCCTTAGTGTTTGTCAACCTTAGACCGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTGGGCCTCAGCGTAATCCTATTGTGCATGAATTTTATGTCATCACTTGGCTTCAACATTGGGTTGTCATCATTGAACCCGCCCGGTTGCGTTGATACAATGCTT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@READ_2_CTTAGGAGTGGTTAGC_GAGCCTCCTTGA_reverse +AAGCAGTGGTATCAACGCAGAGTACATGGGAAGCAAGGAGGCAAGGCCCGCGCCAAGGCCAAGTCGCGGTCTTCCCGGGCCCGGGCTACCTATTCCCGGTGGGGCGTGTGCACCGGCTGCTGCGCAAGGGCAACTACGCGGAGCGTGTGGGCGCCGGCGCGCCGGTATACATGGCGGCGGTGCTGGAGTACCTAACGGCCGAGATCCTGGAGCTGGCGGGCAACGCGGCCCGCGACAACAAGAAGACTCGCATCATCCCGCGCCACCTGCAGCTGGCCATCCGCAACGACGAGGAGCTCAACAAGCTGCTGGGCAAAGTGACGATCGCGCAGGGCGGCGTCCTGCCCAACATCCAGGCCGTGCTGCTGCCCAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATCAAGGAGGCTCGCTAACCACTCCTAAGAGATCGGAAGAGCGTCGTGTA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@READ_3_TGAGGGCCAACGTGCT_CAAAGCCTTCCC_forward +TACACGACGCTCTTCCGATCTTGAGGGCCAACGTGCTCAAAGCCTTCCCTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTGAAGAGCAGGACACAGGAAGATGGACTCTGAGATCCAGCAGGAAGCGGAGAAGGCACTGCAGCAAAGGCTCTGCTCTGGAAGCAAAGGGAAGCAGCTTCCAAGAGAGCAGGGCAGGAGGAAGCCGACTGAGGACTGCTGTGCCGCAGGCGAGGCTTAAAGCCCAGAAAGCAATATGCAAGGTAGAAGAGACCAATGTGTACCGCCGAGAAGCTCAACGGAGGCACCTTGTGCCATGGTCTCAACTTAATCTAGGATGTTTCTGAAGCCGCCCAGACTCAACAAACTTGTCTACAAACTTTGTTTCTATTTGTAATGCTTAATCTTGTCTTCTTATACTTTTCAAATGTTAAGGTTTGTGCATAGGCAAGACTTTTTATATCAAAAGGACAGGAGTCTCTGTATACCCTGCACATGTATGTGAGCCTCTGTACCCTGCGGTACACATGTATGTGTGGCTCTGTACCCTGCAGTGCACACGTGTGTGTGGCTCTGTACCCTGCAGTGCACATGTGTGTGTGGCTCTGTACCCTGGCAGTGCAGAATGGCGGTGTGCGATGTACCCAGCATGCACTATTGTGTGTGGCCTCTAGAAGATCTGATGAAATGAGCATTCTCACCCTGCTTCCAGCTACATTATGATTTAAAAATATTTTTACATTTATTTGTATGTACTTGTATTACTGCATGAAAACTACTGCCACATTTGATCAGAGGACATGTACTTGTATCACAGCATGAAAACTACTGCAACATTTGATCAGAGGACATGTGTGTGTGTGTGCATGTGCGCTTGCATGTTTGTATCCACAGTGTATGTGCAGAAGTCAAAAGACAACTTGCACAAGTCAGCTCTATCTTTCTTTCTACTACATAGGGTCCAGGGGTAAAATTCAGGTCCTCAAATTTGCTGGCGAGCACCTATACTCGAATTGGACCACCTCACTTGGGACTTCCTAAAGACATGCTTAGACACTTACACCCCCGGATCAACATCTACACTAAAAGAATAGAAAGGCCATATTGACATCAACTAATCAGTTCTCCCATTATGACTCACACAAATCCTCAAGTGGATTACAGGATTAGGAGATCTGAATTTTATCTTTCTCCGACCCACTCTACCGAGTTTCCACACTTACCCTGCTCTGGTGGTAGAAGGCTCCCTAGCATATAGCCCAAGAGGCCACTCTTGGAGGAAAAGATTAGTTACAGTCAGTGAACCCCAGGAGAGAAAGTTCCTACATACTCACCCTCCCTGGGATAGCCTTCTCTCCCTGCACACAAAGCCCACCAACTATCCTGCAGCATTTCCTCCCTAAGTTACCTGTCTCCTCCTAATTACTCTCTTCCCAGTTTATTCATACATTCCCCTACAGAAGCTAGGAGGGCTGGGATTTCTGGCTGGCCTGCCCACTGCTGAAGTCCTTGGGAGTTCGTTGGAACATTGGGGTGCAATGTCAACAGAACAGTTTATTTGTCTGTATTCTTTCCATTGCACCCCATCTTCTCTATCACAATAGGTGTTTTTTCTTTCAAAGTAGCTACCACTGGACTCTTTCAAAATCTATCACAAAGATGGCCAGTTCTGCTGTGGGTTGCTAAGTCTGTCGTGTACCCTCAGAGTCTGCCTTATAAGGACTTGACAGAAATTATTTTTAACAACATTCTGTTGTGATATTATATGCAAGGCAGACCACATCTTCATGTATACATTCTGTTAAAACAACTAACCCTAAAGGCCAGGCATGGCCAGATGTTCATAGTCATTTGTGCATCTAGCCCTTGGCCATCCTGTGGTTCATAACTGAAGAAGACAAGACATGACTTGCTTCCCCCTAGACCATTGTGGCTGCTAAGGCCTACTTGGTGCTTGTAAAAATGAATCTCATTTTATTAATTGTGTGACTAAAGGTAAGTCACATAGTCCATCAGACCTTCATGTCTATCAACTGGAAAACAGTATCAATAGTAATACATTAGTGCTATAAGCCATTAGTCACTCTATATAAAACAAGCTCCTAAATGGTACTAAGTGAATAGAAGCTGTAATTATAGCTAGCATGATGACTGGTACATTTAATTGTAGGCTTTTGTCCCTATATTGTTTTGTAACAGGTTTTAGTTTATGCATCTAGACTTAGTTTGTCCAATTAAATCTTGAGTGTAACAACTGCAGAAAGTCTCCATGTTGCAACCACTTTGCAACATGCCTCTGAATGCTGTCTTCACCAGGTAATTCCCTTGGCTAATAGTTCTTAGGTTCATACGATATGCCAGACACAGTCTCTGTTTTGAAGGAGCTGCAGATCTCAAGAGTGAAAGGAAAGTGTCCACATGGATCTGGGGTGCATATACTCCCATGCTCCAACACAGCTGAGAAGATGGACTTTCTAACATTTACCACTTCAAATCCCACGGGAAAAGCACGAATGTATGAGCCACAGATTCTAAATAAAGAACTTAGACTATCTAACACGGAGTCAAGAAATAGGTGGAAGAAGGAAGATAAAGTTTCACCTCCTCAAGTAACGAATCCCATACCAGTCTGTTTGTAGATAGAACTGGTTAGAGTTAGCATCTTCCGTCCCCCTCGTTGGCTTGAGCTGGGGCAACCTTGGGTCTTTGGGTGGGTCCCATGTACTCTGCGTTGATACCACTGCTT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@READ_4_UNKNOWN_BARCODE_UNKNOWNUMI_random +ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII From 4701595fe83a8ca5a7dc541501ea9fcf036c3422 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Wed, 4 Feb 2026 16:34:22 +0100 Subject: [PATCH 39/41] fix flake test but changing numpy import --- src/barcode_calling/common.py | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/src/barcode_calling/common.py b/src/barcode_calling/common.py index 4349a5bf..6ed04bfc 100644 --- a/src/barcode_calling/common.py +++ b/src/barcode_calling/common.py @@ -8,7 +8,7 @@ import sys import logging import gzip -import numpy as np +import numpy from ssw import AlignmentMgr from ..error_codes import IsoQuantExitCode @@ -28,11 +28,11 @@ @njit(cache=True, parallel=True) def _compute_2bit_numba(encoded, shifts, n_barcodes, seq_len): """Numba JIT-compiled 2-bit encoding computation with parallel execution.""" - result = np.zeros(n_barcodes, dtype=np.uint64) + result = numpy.zeros(n_barcodes, dtype=numpy.uint64) for i in prange(n_barcodes): - val = np.uint64(0) + val = numpy.uint64(0) for pos in range(seq_len): - val |= np.uint64(encoded[i, pos]) << shifts[pos] + val |= numpy.uint64(encoded[i, pos]) << shifts[pos] result[i] = val return result @@ -296,7 +296,7 @@ def batch_str_to_2bit(barcodes, seq_len): numpy.ndarray of uint64 containing 2-bit encoded barcodes """ # Convert iterator to list if needed, reading in chunks to show progress - if not isinstance(barcodes, (list, np.ndarray)): + if not isinstance(barcodes, (list, numpy.ndarray)): logger.info("Loading barcodes from iterator...") barcode_list = [] chunk_size = 10_000_000 @@ -309,30 +309,30 @@ def batch_str_to_2bit(barcodes, seq_len): n_barcodes = len(barcodes) if n_barcodes == 0: - return np.array([], dtype=np.uint64) + return numpy.array([], dtype=numpy.uint64) logger.info("Converting %d barcodes to 2-bit encoding..." % n_barcodes) # Fast path: join all barcodes into single string, convert to bytes at once # This avoids Python loop overhead for encode() all_bytes = ''.join(barcodes).encode('ascii') - byte_array = np.frombuffer(all_bytes, dtype=np.uint8).reshape(n_barcodes, seq_len) + byte_array = numpy.frombuffer(all_bytes, dtype=numpy.uint8).reshape(n_barcodes, seq_len) # Apply the encoding: (ord(char) & 6) >> 1 # A=65 -> 0, C=67 -> 1, T=84 -> 2, G=71 -> 3 encoded = (byte_array & 6) >> 1 # Compute bit shifts for each position (highest position = leftmost) - shifts = np.arange(seq_len - 1, -1, -1, dtype=np.uint64) * 2 + shifts = numpy.arange(seq_len - 1, -1, -1, dtype=numpy.uint64) * 2 # Compute 2-bit encoding - use Numba if available for parallel execution if NUMBA_AVAILABLE: result = _compute_2bit_numba(encoded, shifts, n_barcodes, seq_len) else: # Fallback: vectorized NumPy loop - result = np.zeros(n_barcodes, dtype=np.uint64) + result = numpy.zeros(n_barcodes, dtype=numpy.uint64) for pos in range(seq_len): - result |= encoded[:, pos].astype(np.uint64) << shifts[pos] + result |= encoded[:, pos].astype(numpy.uint64) << shifts[pos] logger.info("Barcode conversion complete") return result @@ -378,33 +378,33 @@ def batch_str_to_2bit_chunked(barcode_iterator, seq_len, chunk_size=50_000_000): logger.info("Barcode conversion complete: %d barcodes" % total_processed) if not all_results: - return np.array([], dtype=np.uint64) - return np.concatenate(all_results) + return numpy.array([], dtype=numpy.uint64) + return numpy.concatenate(all_results) def _convert_chunk_to_2bit(barcodes, seq_len): """Convert a chunk of barcodes to 2-bit encoding (internal helper).""" n_barcodes = len(barcodes) if n_barcodes == 0: - return np.array([], dtype=np.uint64) + return numpy.array([], dtype=numpy.uint64) # Fast path: join all barcodes into single string, convert to bytes at once all_bytes = ''.join(barcodes).encode('ascii') - byte_array = np.frombuffer(all_bytes, dtype=np.uint8).reshape(n_barcodes, seq_len) + byte_array = numpy.frombuffer(all_bytes, dtype=numpy.uint8).reshape(n_barcodes, seq_len) # Apply encoding: (ord(char) & 6) >> 1 encoded = (byte_array & 6) >> 1 # Compute shifts - shifts = np.arange(seq_len - 1, -1, -1, dtype=np.uint64) * 2 + shifts = numpy.arange(seq_len - 1, -1, -1, dtype=numpy.uint64) * 2 # Compute result - use Numba if available for parallel execution if NUMBA_AVAILABLE: result = _compute_2bit_numba(encoded, shifts, n_barcodes, seq_len) else: - result = np.zeros(n_barcodes, dtype=np.uint64) + result = numpy.zeros(n_barcodes, dtype=numpy.uint64) for pos in range(seq_len): - result |= encoded[:, pos].astype(np.uint64) << shifts[pos] + result |= encoded[:, pos].astype(numpy.uint64) << shifts[pos] return result From 15fdf2fac2ba1127a80b37fdca105ede53e871cd Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Thu, 5 Feb 2026 12:17:34 +0200 Subject: [PATCH 40/41] add single-cell 10x pipeline test --- .github/workflows/SC.Mouse.10x.yml | 74 +++++++++++ misc/assess_allinfo_quality.py | 191 +++++++++++++++++++++++++++++ misc/create_stereo_test_subsets.py | 118 ++++++++++++++++++ misc/create_test_barcodes.py | 82 +++++++++++++ tests/github/run_pipeline.py | 75 +++++++++++ 5 files changed, 540 insertions(+) create mode 100644 .github/workflows/SC.Mouse.10x.yml create mode 100644 misc/assess_allinfo_quality.py create mode 100644 misc/create_stereo_test_subsets.py create mode 100644 misc/create_test_barcodes.py diff --git a/.github/workflows/SC.Mouse.10x.yml b/.github/workflows/SC.Mouse.10x.yml new file mode 100644 index 00000000..d13de1dc --- /dev/null +++ b/.github/workflows/SC.Mouse.10x.yml @@ -0,0 +1,74 @@ +name: SC Mouse 10x Allinfo + +on: + workflow_dispatch: + schedule: + - cron: '0 5 * * 6' + +env: + LAUNCHER: ${{github.workspace}}/tests/github/run_pipeline.py + CFG_DIR: /abga/work/andreyp/ci_isoquant/data + BIN_PATH: /abga/work/andreyp/ci_isoquant/bin/ + OUTPUT_BASE: /abga/work/andreyp/ci_isoquant/output/${{github.ref_name}}/ + +concurrency: + group: ${{github.workflow}} + cancel-in-progress: false + +jobs: + check-changes: + runs-on: + labels: [isoquant] + name: 'Check for recent changes' + outputs: + has_changes: ${{steps.check.outputs.has_changes}} + steps: + - name: 'Checkout' + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: 'Check for commits in last 7 days' + id: check + run: | + # Always run on manual trigger + if [ "${{github.event_name}}" = "workflow_dispatch" ]; then + echo "has_changes=true" >> $GITHUB_OUTPUT + exit 0 + fi + # Check for commits in last 7 days + COMMITS=$(git log --oneline --since="7 days ago" | wc -l) + if [ "$COMMITS" -gt 0 ]; then + echo "Found $COMMITS commits in last 7 days" + echo "has_changes=true" >> $GITHUB_OUTPUT + else + echo "No commits in last 7 days, skipping" + echo "has_changes=false" >> $GITHUB_OUTPUT + fi + + launch-runner: + needs: check-changes + if: needs.check-changes.outputs.has_changes == 'true' + runs-on: + labels: [isoquant] + name: 'Running SC Mouse 10x pipeline with allinfo QC' + + steps: + - name: 'Cleanup' + run: > + set -e && + shopt -s dotglob && + rm -rf * + + - name: 'Checkout' + uses: actions/checkout@v3 + with: + fetch-depth: 1 + + - name: 'SC Mouse 10x allinfo' + shell: bash + env: + STEP_NAME: SC.Mouse.10x.allinfo + run: | + export PATH=$PATH:${{env.BIN_PATH}} + python3 ${{env.LAUNCHER}} ${{env.CFG_DIR}}/${{env.STEP_NAME}}.cfg -o ${{env.OUTPUT_BASE}} diff --git a/misc/assess_allinfo_quality.py b/misc/assess_allinfo_quality.py new file mode 100644 index 00000000..f71f938c --- /dev/null +++ b/misc/assess_allinfo_quality.py @@ -0,0 +1,191 @@ +#!/usr/bin/env python3 + +############################################################################ +# Copyright (c) 2022-2026 University of Helsinki +# Copyright (c) 2020-2022 Saint Petersburg State University +# # All Rights Reserved +# See file LICENSE for details. +############################################################################ + +# Assess quality of UMI-filtered allinfo output by independently computing +# statistics from the allinfo file and cross-validating against stats.tsv. + +import argparse +import gzip +import logging +import sys +from collections import Counter +from typing import Dict, Tuple + +logger = logging.getLogger('AllinfoQA') + + +def setup_logger(): + logger.setLevel(logging.INFO) + ch = logging.StreamHandler(sys.stdout) + ch.setLevel(logging.INFO) + formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') + ch.setFormatter(formatter) + logger.addHandler(ch) + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Assess quality of UMI-filtered allinfo output") + parser.add_argument("--allinfo", type=str, required=True, + help="Path to allinfo file (plain or .gz)") + parser.add_argument("--stats", type=str, required=True, + help="Path to stats.tsv file from UMI filtering") + parser.add_argument("--output", "-o", type=str, required=True, + help="Output quality report file (TSV)") + return parser.parse_args() + + +def load_tsv_config(filepath: str) -> Dict[str, str]: + config = {} + with open(filepath) as f: + for line in f: + if line.startswith("#"): + continue + tokens = line.strip().split('\t') + if len(tokens) >= 2: + config[tokens[0]] = tokens[1] + return config + + +def compute_allinfo_stats(allinfo_path: str) -> Tuple[Dict[str, int], int]: + """Parse allinfo file and compute statistics. + + Returns: + Tuple of (stats_dict, duplicate_triplet_count) + """ + open_fn = gzip.open if allinfo_path.endswith('.gz') else open + mode = 'rt' if allinfo_path.endswith('.gz') else 'r' + + total_reads = 0 + spliced_reads = 0 + gene_barcode_pairs: set = set() + triplet_counts: Counter = Counter() + + with open_fn(allinfo_path, mode) as f: + for line in f: + line = line.strip() + if not line: + continue + + fields = line.split('\t') + if len(fields) < 13: + logger.warning("Skipping malformed line with %d fields: %s", + len(fields), line[:100]) + continue + + total_reads += 1 + + # Columns (0-indexed): + # 0: read_id, 1: gene_id, 2: cell_type, 3: barcode, 4: umi, + # 5: introns, 6: TSS, 7: polyA, 8: exons, 9: read_type, + # 10: num_introns, 11: transcript_id, 12: transcript_type + gene_id = fields[1] + barcode = fields[3] + umi = fields[4] + num_introns = int(fields[10]) + + if num_introns > 0: + spliced_reads += 1 + + gene_barcode_pairs.add((gene_id, barcode)) + triplet_counts[(gene_id, barcode, umi)] += 1 + + duplicate_triplets = sum(1 for count in triplet_counts.values() if count > 1) + + stats = { + "Total reads saved": total_reads, + "Spliced reads saved": spliced_reads, + "Unique gene-barcodes pairs": len(gene_barcode_pairs), + } + return stats, duplicate_triplets + + +def cross_validate(computed_stats: Dict[str, int], + stats_tsv: Dict[str, str]) -> bool: + """Cross-validate computed allinfo stats against stats.tsv values. + + Returns True if all checks pass. + """ + all_match = True + + checks = [ + ("Total reads saved", "Total reads saved"), + ("Spliced reads saved", "Spliced reads saved"), + ("Unique gene-barcodes pairs", "Unique gene-barcodes pairs"), + ] + + for computed_key, stats_key in checks: + if stats_key not in stats_tsv: + logger.error("Key '%s' not found in stats.tsv", stats_key) + all_match = False + continue + + computed_val = computed_stats[computed_key] + stats_val = int(stats_tsv[stats_key]) + + if computed_val != stats_val: + logger.error("Mismatch for '%s': allinfo=%d, stats.tsv=%d", + computed_key, computed_val, stats_val) + all_match = False + else: + logger.info("OK '%s': %d", computed_key, computed_val) + + return all_match + + +def main(): + args = parse_args() + setup_logger() + + logger.info("Loading allinfo from %s", args.allinfo) + computed_stats, duplicate_triplets = compute_allinfo_stats(args.allinfo) + logger.info("Computed stats from allinfo: %s", computed_stats) + + logger.info("Loading stats from %s", args.stats) + stats_tsv = load_tsv_config(args.stats) + logger.info("Stats.tsv contains %d entries", len(stats_tsv)) + + # Cross-validate + stats_match = cross_validate(computed_stats, stats_tsv) + + if duplicate_triplets > 0: + logger.error("Found %d duplicate gene-barcode-UMI triplets in allinfo", + duplicate_triplets) + else: + logger.info("No duplicate gene-barcode-UMI triplets found") + + # Write quality report + exit_code = 0 + with open(args.output, "w") as outf: + # Pass through all stats.tsv entries + for key, val in stats_tsv.items(): + outf.write("%s\t%s\n" % (key, val)) + + # Add computed validation metrics + outf.write("duplicate_triplets\t%d\n" % duplicate_triplets) + allinfo_match_val = 1 if stats_match else 0 + outf.write("allinfo_stats_match\t%d\n" % allinfo_match_val) + + logger.info("Quality report written to %s", args.output) + + if not stats_match: + logger.error("FAILED: allinfo stats do not match stats.tsv") + exit_code = 1 + if duplicate_triplets > 0: + logger.error("FAILED: duplicate gene-barcode-UMI triplets detected") + exit_code = 1 + + if exit_code == 0: + logger.info("All checks passed") + + return exit_code + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/misc/create_stereo_test_subsets.py b/misc/create_stereo_test_subsets.py new file mode 100644 index 00000000..d72ef807 --- /dev/null +++ b/misc/create_stereo_test_subsets.py @@ -0,0 +1,118 @@ +#!/usr/bin/env python3 +""" +Create stereo-seq barcode subsets and corresponding filtered read files. + +From the full 10M stereo barcode whitelist: +1. Randomly select 1M barcodes -> stereo.1M.tsv +2. Randomly select 80K barcodes (subset of the 1M) -> stereo.80K.tsv +3. Filter reads to keep only those matching each subset + +Read ID format: >READ_____... +Barcode is field 3 (0-indexed, underscore-delimited). +""" + +import os +import random +import sys + +DATA_DIR = "/abga/work/andreyp/ci_isoquant/data/barcodes" + +FULL_BARCODE_FILE = os.path.join(DATA_DIR, "D04620C2.10M.tsv") +FULL_READS_FILE = os.path.join(DATA_DIR, "Mouse.StereoSeq.D04620C2.10M.800K.fa") + +SUBSET_1M_FILE = os.path.join(DATA_DIR, "stereo.1M.tsv") +SUBSET_80K_FILE = os.path.join(DATA_DIR, "stereo.80K.tsv") +READS_1M_FILE = os.path.join(DATA_DIR, "Mouse.StereoSeq.custom_sc.1M.fa") +READS_80K_FILE = os.path.join(DATA_DIR, "Mouse.StereoSeq.custom_sc.80K.fa") + +SUBSET_1M_SIZE = 1000000 +SUBSET_80K_SIZE = 80000 + + +def load_barcodes_with_lines(filepath: str) -> tuple[list[str], list[str]]: + """Load barcodes (first column) and full lines from TSV file.""" + barcodes = [] + lines = [] + with open(filepath) as f: + for line in f: + stripped = line.strip() + if not stripped: + continue + bc = stripped.split()[0] + barcodes.append(bc) + lines.append(line) + return barcodes, lines + + +def extract_barcode_from_read_id(read_id: str) -> str: + """Extract barcode from FASTA read ID. + + Format: >READ_idx_transcript_BARCODE_UMI_... + """ + parts = read_id.lstrip(">").split("_") + if len(parts) >= 4: + return parts[3] + return "" + + +def filter_reads_fasta(input_fasta: str, output_fasta: str, barcode_set: set[str]) -> int: + """Filter FASTA file, keeping reads with barcodes in the given set.""" + kept = 0 + total = 0 + with open(input_fasta) as inf, open(output_fasta, "w") as outf: + keep_current = False + for line in inf: + if line.startswith(">"): + total += 1 + bc = extract_barcode_from_read_id(line.strip()) + keep_current = bc in barcode_set + if keep_current: + kept += 1 + if keep_current: + outf.write(line) + return kept, total + + +def main(): + random.seed(42) + + print("Loading full barcode set...") + barcodes, lines = load_barcodes_with_lines(FULL_BARCODE_FILE) + print(f" Loaded {len(barcodes)} barcodes") + + # Select 1M subset + print(f"Selecting {SUBSET_1M_SIZE} barcodes for 1M subset...") + indices_1m = sorted(random.sample(range(len(barcodes)), SUBSET_1M_SIZE)) + barcodes_1m = set(barcodes[i] for i in indices_1m) + + with open(SUBSET_1M_FILE, "w") as f: + for i in indices_1m: + f.write(lines[i]) + print(f" Written to {SUBSET_1M_FILE}") + + # Select 80K subset from the 1M subset + print(f"Selecting {SUBSET_80K_SIZE} barcodes for 80K subset (from 1M)...") + indices_80k_in_1m = sorted(random.sample(range(len(indices_1m)), SUBSET_80K_SIZE)) + barcodes_80k = set(barcodes[indices_1m[i]] for i in indices_80k_in_1m) + + with open(SUBSET_80K_FILE, "w") as f: + for i in indices_80k_in_1m: + orig_idx = indices_1m[i] + f.write(lines[orig_idx]) + print(f" Written to {SUBSET_80K_FILE}") + + # Filter reads for 1M subset + print("Filtering reads for 1M subset...") + kept_1m, total = filter_reads_fasta(FULL_READS_FILE, READS_1M_FILE, barcodes_1m) + print(f" Kept {kept_1m}/{total} reads -> {READS_1M_FILE}") + + # Filter reads for 80K subset + print("Filtering reads for 80K subset...") + kept_80k, total = filter_reads_fasta(FULL_READS_FILE, READS_80K_FILE, barcodes_80k) + print(f" Kept {kept_80k}/{total} reads -> {READS_80K_FILE}") + + print("Done.") + + +if __name__ == "__main__": + main() diff --git a/misc/create_test_barcodes.py b/misc/create_test_barcodes.py new file mode 100644 index 00000000..a27ef373 --- /dev/null +++ b/misc/create_test_barcodes.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python3 +""" +Generate inflated barcode whitelist files for testing large k-mer indexer paths. + +Appends random unique DNA sequences to existing barcode files to push +the total count above 100K, triggering Array2BitKmerIndexer / Dict2BitKmerIndexer +in the universal barcode calling algorithm. +""" + +import os +import random +import sys + +NUCLEOTIDES = "ACGT" +DATA_DIR = "/abga/work/andreyp/ci_isoquant/data/barcodes" + +# Configuration: (input_file, output_file, dummy_count, barcode_length) +CONFIGS = [ + ("10xMultiome_5K.tsv", "10x.large_barcodes.tsv", 120000, 16), + ("A0079_044_BeadBarcodes.tsv", "curio.large_barcodes.tsv", 50000, 14), + ("visium_v1_hd.slide1.barcodes1.tsv", "visium.large_barcodes1.tsv", 120000, 15), + ("visium_v1_hd.slide1.barcodes2.tsv", "visium.large_barcodes2.tsv", 120000, 14), +] + + +def load_existing_barcodes(filepath: str) -> set[str]: + """Load first column from TSV barcode file.""" + barcodes = set() + with open(filepath) as f: + for line in f: + bc = line.strip().split()[0] + if bc: + barcodes.add(bc) + return barcodes + + +def generate_random_barcode(length: int) -> str: + return "".join(random.choice(NUCLEOTIDES) for _ in range(length)) + + +def generate_unique_barcodes(count: int, length: int, existing: set[str]) -> list[str]: + """Generate random unique barcodes not in the existing set.""" + new_barcodes = [] + seen = set(existing) + while len(new_barcodes) < count: + bc = generate_random_barcode(length) + if bc not in seen: + seen.add(bc) + new_barcodes.append(bc) + return new_barcodes + + +def main(): + random.seed(42) # Reproducible results + + for input_name, output_name, dummy_count, bc_length in CONFIGS: + input_path = os.path.join(DATA_DIR, input_name) + output_path = os.path.join(DATA_DIR, output_name) + + print(f"Processing {input_name} -> {output_name}") + existing = load_existing_barcodes(input_path) + print(f" Loaded {len(existing)} existing barcodes") + + dummy = generate_unique_barcodes(dummy_count, bc_length, existing) + print(f" Generated {len(dummy)} dummy barcodes ({bc_length}bp)") + + # Write output: copy original file + append dummy barcodes + with open(output_path, "w") as out: + with open(input_path) as inp: + for line in inp: + out.write(line) + for bc in dummy: + out.write(bc + "\n") + + total = len(existing) + len(dummy) + print(f" Total: {total} barcodes -> {output_path}") + + print("Done.") + + +if __name__ == "__main__": + main() diff --git a/tests/github/run_pipeline.py b/tests/github/run_pipeline.py index e749932a..3259e77a 100755 --- a/tests/github/run_pipeline.py +++ b/tests/github/run_pipeline.py @@ -31,6 +31,7 @@ RT_QUANTIFICATION_NOVEL = "quantification_novel" RT_QUANTIFICATION_GENES = "quantification_genes" RT_PERFORMANCE = "performance" +RT_ALLINFO = "allinfo" log = logging.getLogger('GitHubRunner') @@ -433,6 +434,78 @@ def run_performance_assessment(args, config_dict): return exit_code +def run_allinfo_quality(args, config_dict): + log.info('== Running allinfo quality assessment ==') + config_file = args.config_file + source_dir = os.path.dirname(os.path.realpath(__file__)) + isoquant_dir = os.path.join(source_dir, "../../") + + run_name = config_dict["name"] + label = config_dict["label"] + output_folder = os.path.join(args.output if args.output else config_dict["output"], run_name) + internal_output_dir = os.path.join(output_folder, label) + + # Edit distance from config or default (3 for 10x/curio, 4 for stereo/visium_hd) + edit_distance = int(config_dict.get("edit_distance", "3")) + allinfo_file = os.path.join(internal_output_dir, "%s.UMI_filtered.ED%d.allinfo" % (label, edit_distance)) + stats_file = os.path.join(internal_output_dir, "%s.UMI_filtered.ED%d.stats.tsv" % (label, edit_distance)) + + # Support gzipped allinfo + if not os.path.exists(allinfo_file) and os.path.exists(allinfo_file + ".gz"): + allinfo_file = allinfo_file + ".gz" + + if not os.path.exists(allinfo_file): + log.error("Allinfo file not found: %s" % allinfo_file) + return -5 + if not os.path.exists(stats_file): + log.error("Stats file not found: %s" % stats_file) + return -5 + + quality_report = os.path.join(output_folder, "allinfo_quality_report.tsv") + + qa_command = ["python3", os.path.join(isoquant_dir, "misc/assess_allinfo_quality.py"), + "--allinfo", allinfo_file, "--stats", stats_file, + "--output", quality_report] + + log.info("Running: %s" % " ".join(qa_command)) + result = subprocess.run(qa_command) + if result.returncode != 0: + log.error("Allinfo QA script failed with code %d" % result.returncode) + return -11 + + # Compare against etalon if provided + if "etalon_allinfo" not in config_dict: + return 0 + + etalon_file = fix_path(config_file, config_dict["etalon_allinfo"]) + if not os.path.exists(etalon_file): + log.error("Etalon file not found: %s" % etalon_file) + return -35 + + etalon_dict = load_tsv_config(etalon_file) + quality_dict = load_tsv_config(quality_report) + + exit_code = 0 + tolerance = float(config_dict.get("tolerance", "0.01")) + new_etalon_path = os.path.join(output_folder, "new_allinfo_etalon.tsv") + with open(new_etalon_path, "w") as new_etalon_outf: + for metric_name, val_str in quality_dict.items(): + new_etalon_outf.write("%s\t%s\n" % (metric_name, val_str)) + + for metric_name, etalon_val_str in etalon_dict.items(): + etalon_val = float(etalon_val_str) + if metric_name not in quality_dict: + log.error("Metric %s not found in quality report" % metric_name) + exit_code = -36 + continue + output_val = float(quality_dict[metric_name]) + err = check_value(etalon_val, output_val, metric_name, tolerance) + if err != 0: + exit_code = err + + return exit_code + + def check_output_files(out_dir, label, file_list): missing_files = [] internal_output_dir = os.path.join(out_dir, label) @@ -488,6 +561,8 @@ def main(): err_codes.append(run_quantification(args, config_dict, "gene")) if RT_PERFORMANCE in run_types: err_codes.append(run_performance_assessment(args, config_dict)) + if RT_ALLINFO in run_types: + err_codes.append(run_allinfo_quality(args, config_dict)) if "check_input_files" in config_dict: files_list = config_dict["check_input_files"].split() From 8723b783357ffa9686c3ac47b522fbda3429a3e1 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Thu, 5 Feb 2026 14:21:32 +0200 Subject: [PATCH 41/41] custom sc test --- .../workflows/SC.Human.Curio.custom_sc.yml | 74 +++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 .github/workflows/SC.Human.Curio.custom_sc.yml diff --git a/.github/workflows/SC.Human.Curio.custom_sc.yml b/.github/workflows/SC.Human.Curio.custom_sc.yml new file mode 100644 index 00000000..f1bcbd5e --- /dev/null +++ b/.github/workflows/SC.Human.Curio.custom_sc.yml @@ -0,0 +1,74 @@ +name: SC Human Curio Custom SC Allinfo + +on: + workflow_dispatch: + schedule: + - cron: '0 6 * * 0' + +env: + LAUNCHER: ${{github.workspace}}/tests/github/run_pipeline.py + CFG_DIR: /abga/work/andreyp/ci_isoquant/data + BIN_PATH: /abga/work/andreyp/ci_isoquant/bin/ + OUTPUT_BASE: /abga/work/andreyp/ci_isoquant/output/${{github.ref_name}}/ + +concurrency: + group: ${{github.workflow}} + cancel-in-progress: false + +jobs: + check-changes: + runs-on: + labels: [isoquant] + name: 'Check for recent changes' + outputs: + has_changes: ${{steps.check.outputs.has_changes}} + steps: + - name: 'Checkout' + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: 'Check for commits in last 7 days' + id: check + run: | + # Always run on manual trigger + if [ "${{github.event_name}}" = "workflow_dispatch" ]; then + echo "has_changes=true" >> $GITHUB_OUTPUT + exit 0 + fi + # Check for commits in last 7 days + COMMITS=$(git log --oneline --since="7 days ago" | wc -l) + if [ "$COMMITS" -gt 0 ]; then + echo "Found $COMMITS commits in last 7 days" + echo "has_changes=true" >> $GITHUB_OUTPUT + else + echo "No commits in last 7 days, skipping" + echo "has_changes=false" >> $GITHUB_OUTPUT + fi + + launch-runner: + needs: check-changes + if: needs.check-changes.outputs.has_changes == 'true' + runs-on: + labels: [isoquant] + name: 'Running SC Human Curio custom_sc pipeline with allinfo QC' + + steps: + - name: 'Cleanup' + run: > + set -e && + shopt -s dotglob && + rm -rf * + + - name: 'Checkout' + uses: actions/checkout@v3 + with: + fetch-depth: 1 + + - name: 'SC Human Curio custom_sc allinfo' + shell: bash + env: + STEP_NAME: SC.Human.Curio.custom_sc.allinfo + run: | + export PATH=$PATH:${{env.BIN_PATH}} + python3 ${{env.LAUNCHER}} ${{env.CFG_DIR}}/${{env.STEP_NAME}}.cfg -o ${{env.OUTPUT_BASE}}