diff --git a/.github/workflows/Barcode.Custom.10x.yml b/.github/workflows/Barcode.Custom.10x.yml new file mode 100644 index 00000000..92c1f606 --- /dev/null +++ b/.github/workflows/Barcode.Custom.10x.yml @@ -0,0 +1,82 @@ +name: Barcode Custom SC 10x + +on: + workflow_dispatch: + schedule: + - cron: '0 4 * * 1' + +env: + LAUNCHER: ${{github.workspace}}/tests/github/run_barcode_test.py + CFG_DIR: /abga/work/andreyp/ci_isoquant/data/barcodes + BIN_PATH: /abga/work/andreyp/ci_isoquant/bin/ + OUTPUT_BASE: /abga/work/andreyp/ci_isoquant/output/${{github.ref_name}}/barcodes/ + +concurrency: + group: ${{github.workflow}} + cancel-in-progress: false + +jobs: + check-changes: + runs-on: + labels: [isoquant] + name: 'Check for recent changes' + outputs: + has_changes: ${{steps.check.outputs.has_changes}} + steps: + - name: 'Checkout' + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: 'Check for commits in last 7 days' + id: check + run: | + # Always run on manual trigger + if [ "${{github.event_name}}" = "workflow_dispatch" ]; then + echo "has_changes=true" >> $GITHUB_OUTPUT + exit 0 + fi + # Check for commits in last 7 days + COMMITS=$(git log --oneline --since="7 days ago" | wc -l) + if [ "$COMMITS" -gt 0 ]; then + echo "Found $COMMITS commits in last 7 days" + echo "has_changes=true" >> $GITHUB_OUTPUT + else + echo "No commits in last 7 days, skipping" + echo "has_changes=false" >> $GITHUB_OUTPUT + fi + + launch-runner: + needs: check-changes + if: needs.check-changes.outputs.has_changes == 'true' + runs-on: + labels: [isoquant] + name: 'Running custom_sc 10x barcode detection QC' + + steps: + - name: 'Cleanup' + run: > + set -e && + shopt -s dotglob && + rm -rf * + + - name: 'Checkout' + uses: actions/checkout@v3 + with: + fetch-depth: 1 + + - name: 'Custom SC 10x small (ArrayKmerIndexer)' + shell: bash + env: + STEP_NAME: Mouse.10x.custom_sc.small + run: | + export PATH=$PATH:${{env.BIN_PATH}} + python3 ${{env.LAUNCHER}} ${{env.CFG_DIR}}/${{env.STEP_NAME}}.cfg -o ${{env.OUTPUT_BASE}} + + - name: 'Custom SC 10x large (Array2BitKmerIndexer)' + shell: bash + env: + STEP_NAME: Mouse.10x.custom_sc.large + run: | + export PATH=$PATH:${{env.BIN_PATH}} + python3 ${{env.LAUNCHER}} ${{env.CFG_DIR}}/${{env.STEP_NAME}}.cfg -o ${{env.OUTPUT_BASE}} diff --git a/.github/workflows/Barcode.Custom.Curio.yml b/.github/workflows/Barcode.Custom.Curio.yml new file mode 100644 index 00000000..3c2b0c69 --- /dev/null +++ b/.github/workflows/Barcode.Custom.Curio.yml @@ -0,0 +1,82 @@ +name: Barcode Custom SC Curio + +on: + workflow_dispatch: + schedule: + - cron: '0 3 * * 3' + +env: + LAUNCHER: ${{github.workspace}}/tests/github/run_barcode_test.py + CFG_DIR: /abga/work/andreyp/ci_isoquant/data/barcodes + BIN_PATH: /abga/work/andreyp/ci_isoquant/bin/ + OUTPUT_BASE: /abga/work/andreyp/ci_isoquant/output/${{github.ref_name}}/barcodes/ + +concurrency: + group: ${{github.workflow}} + cancel-in-progress: false + +jobs: + check-changes: + runs-on: + labels: [isoquant] + name: 'Check for recent changes' + outputs: + has_changes: ${{steps.check.outputs.has_changes}} + steps: + - name: 'Checkout' + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: 'Check for commits in last 7 days' + id: check + run: | + # Always run on manual trigger + if [ "${{github.event_name}}" = "workflow_dispatch" ]; then + echo "has_changes=true" >> $GITHUB_OUTPUT + exit 0 + fi + # Check for commits in last 7 days + COMMITS=$(git log --oneline --since="7 days ago" | wc -l) + if [ "$COMMITS" -gt 0 ]; then + echo "Found $COMMITS commits in last 7 days" + echo "has_changes=true" >> $GITHUB_OUTPUT + else + echo "No commits in last 7 days, skipping" + echo "has_changes=false" >> $GITHUB_OUTPUT + fi + + launch-runner: + needs: check-changes + if: needs.check-changes.outputs.has_changes == 'true' + runs-on: + labels: [isoquant] + name: 'Running custom_sc Curio barcode detection QC' + + steps: + - name: 'Cleanup' + run: > + set -e && + shopt -s dotglob && + rm -rf * + + - name: 'Checkout' + uses: actions/checkout@v3 + with: + fetch-depth: 1 + + - name: 'Custom SC Curio small (ArrayKmerIndexer)' + shell: bash + env: + STEP_NAME: Human.Curio.custom_sc.small + run: | + export PATH=$PATH:${{env.BIN_PATH}} + python3 ${{env.LAUNCHER}} ${{env.CFG_DIR}}/${{env.STEP_NAME}}.cfg -o ${{env.OUTPUT_BASE}} + + - name: 'Custom SC Curio large (Array2BitKmerIndexer)' + shell: bash + env: + STEP_NAME: Human.Curio.custom_sc.large + run: | + export PATH=$PATH:${{env.BIN_PATH}} + python3 ${{env.LAUNCHER}} ${{env.CFG_DIR}}/${{env.STEP_NAME}}.cfg -o ${{env.OUTPUT_BASE}} diff --git a/.github/workflows/Barcode.Custom.StereoSeq.yml b/.github/workflows/Barcode.Custom.StereoSeq.yml new file mode 100644 index 00000000..161521d3 --- /dev/null +++ b/.github/workflows/Barcode.Custom.StereoSeq.yml @@ -0,0 +1,82 @@ +name: Barcode Custom SC StereoSeq + +on: + workflow_dispatch: + schedule: + - cron: '0 1 * * 5' + +env: + LAUNCHER: ${{github.workspace}}/tests/github/run_barcode_test.py + CFG_DIR: /abga/work/andreyp/ci_isoquant/data/barcodes + BIN_PATH: /abga/work/andreyp/ci_isoquant/bin/ + OUTPUT_BASE: /abga/work/andreyp/ci_isoquant/output/${{github.ref_name}}/barcodes/ + +concurrency: + group: ${{github.workflow}} + cancel-in-progress: false + +jobs: + check-changes: + runs-on: + labels: [isoquant] + name: 'Check for recent changes' + outputs: + has_changes: ${{steps.check.outputs.has_changes}} + steps: + - name: 'Checkout' + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: 'Check for commits in last 7 days' + id: check + run: | + # Always run on manual trigger + if [ "${{github.event_name}}" = "workflow_dispatch" ]; then + echo "has_changes=true" >> $GITHUB_OUTPUT + exit 0 + fi + # Check for commits in last 7 days + COMMITS=$(git log --oneline --since="7 days ago" | wc -l) + if [ "$COMMITS" -gt 0 ]; then + echo "Found $COMMITS commits in last 7 days" + echo "has_changes=true" >> $GITHUB_OUTPUT + else + echo "No commits in last 7 days, skipping" + echo "has_changes=false" >> $GITHUB_OUTPUT + fi + + launch-runner: + needs: check-changes + if: needs.check-changes.outputs.has_changes == 'true' + runs-on: + labels: [isoquant] + name: 'Running custom_sc StereoSeq barcode detection QC' + + steps: + - name: 'Cleanup' + run: > + set -e && + shopt -s dotglob && + rm -rf * + + - name: 'Checkout' + uses: actions/checkout@v3 + with: + fetch-depth: 1 + + - name: 'Custom SC StereoSeq small (KmerIndexer)' + shell: bash + env: + STEP_NAME: Mouse.StereoSeq.custom_sc.small + run: | + export PATH=$PATH:${{env.BIN_PATH}} + python3 ${{env.LAUNCHER}} ${{env.CFG_DIR}}/${{env.STEP_NAME}}.cfg -o ${{env.OUTPUT_BASE}} + + - name: 'Custom SC StereoSeq large (Dict2BitKmerIndexer)' + shell: bash + env: + STEP_NAME: Mouse.StereoSeq.custom_sc.large + run: | + export PATH=$PATH:${{env.BIN_PATH}} + python3 ${{env.LAUNCHER}} ${{env.CFG_DIR}}/${{env.STEP_NAME}}.cfg -o ${{env.OUTPUT_BASE}} diff --git a/.github/workflows/Barcode.Custom.VisiumHD.yml b/.github/workflows/Barcode.Custom.VisiumHD.yml new file mode 100644 index 00000000..5e32ed2a --- /dev/null +++ b/.github/workflows/Barcode.Custom.VisiumHD.yml @@ -0,0 +1,82 @@ +name: Barcode Custom SC Visium HD + +on: + workflow_dispatch: + schedule: + - cron: '0 2 * * 4' + +env: + LAUNCHER: ${{github.workspace}}/tests/github/run_barcode_test.py + CFG_DIR: /abga/work/andreyp/ci_isoquant/data/barcodes + BIN_PATH: /abga/work/andreyp/ci_isoquant/bin/ + OUTPUT_BASE: /abga/work/andreyp/ci_isoquant/output/${{github.ref_name}}/barcodes/ + +concurrency: + group: ${{github.workflow}} + cancel-in-progress: false + +jobs: + check-changes: + runs-on: + labels: [isoquant] + name: 'Check for recent changes' + outputs: + has_changes: ${{steps.check.outputs.has_changes}} + steps: + - name: 'Checkout' + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: 'Check for commits in last 7 days' + id: check + run: | + # Always run on manual trigger + if [ "${{github.event_name}}" = "workflow_dispatch" ]; then + echo "has_changes=true" >> $GITHUB_OUTPUT + exit 0 + fi + # Check for commits in last 7 days + COMMITS=$(git log --oneline --since="7 days ago" | wc -l) + if [ "$COMMITS" -gt 0 ]; then + echo "Found $COMMITS commits in last 7 days" + echo "has_changes=true" >> $GITHUB_OUTPUT + else + echo "No commits in last 7 days, skipping" + echo "has_changes=false" >> $GITHUB_OUTPUT + fi + + launch-runner: + needs: check-changes + if: needs.check-changes.outputs.has_changes == 'true' + runs-on: + labels: [isoquant] + name: 'Running custom_sc Visium HD barcode detection QC' + + steps: + - name: 'Cleanup' + run: > + set -e && + shopt -s dotglob && + rm -rf * + + - name: 'Checkout' + uses: actions/checkout@v3 + with: + fetch-depth: 1 + + - name: 'Custom SC Visium HD small (ArrayKmerIndexer)' + shell: bash + env: + STEP_NAME: Mouse.VisiumHD.custom_sc.small + run: | + export PATH=$PATH:${{env.BIN_PATH}} + python3 ${{env.LAUNCHER}} ${{env.CFG_DIR}}/${{env.STEP_NAME}}.cfg -o ${{env.OUTPUT_BASE}} + + - name: 'Custom SC Visium HD large (Array2BitKmerIndexer)' + shell: bash + env: + STEP_NAME: Mouse.VisiumHD.custom_sc.large + run: | + export PATH=$PATH:${{env.BIN_PATH}} + python3 ${{env.LAUNCHER}} ${{env.CFG_DIR}}/${{env.STEP_NAME}}.cfg -o ${{env.OUTPUT_BASE}} diff --git a/.github/workflows/SC.Mouse.10x.yml b/.github/workflows/SC.Mouse.10x.yml new file mode 100644 index 00000000..d13de1dc --- /dev/null +++ b/.github/workflows/SC.Mouse.10x.yml @@ -0,0 +1,74 @@ +name: SC Mouse 10x Allinfo + +on: + workflow_dispatch: + schedule: + - cron: '0 5 * * 6' + +env: + LAUNCHER: ${{github.workspace}}/tests/github/run_pipeline.py + CFG_DIR: /abga/work/andreyp/ci_isoquant/data + BIN_PATH: /abga/work/andreyp/ci_isoquant/bin/ + OUTPUT_BASE: /abga/work/andreyp/ci_isoquant/output/${{github.ref_name}}/ + +concurrency: + group: ${{github.workflow}} + cancel-in-progress: false + +jobs: + check-changes: + runs-on: + labels: [isoquant] + name: 'Check for recent changes' + outputs: + has_changes: ${{steps.check.outputs.has_changes}} + steps: + - name: 'Checkout' + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: 'Check for commits in last 7 days' + id: check + run: | + # Always run on manual trigger + if [ "${{github.event_name}}" = "workflow_dispatch" ]; then + echo "has_changes=true" >> $GITHUB_OUTPUT + exit 0 + fi + # Check for commits in last 7 days + COMMITS=$(git log --oneline --since="7 days ago" | wc -l) + if [ "$COMMITS" -gt 0 ]; then + echo "Found $COMMITS commits in last 7 days" + echo "has_changes=true" >> $GITHUB_OUTPUT + else + echo "No commits in last 7 days, skipping" + echo "has_changes=false" >> $GITHUB_OUTPUT + fi + + launch-runner: + needs: check-changes + if: needs.check-changes.outputs.has_changes == 'true' + runs-on: + labels: [isoquant] + name: 'Running SC Mouse 10x pipeline with allinfo QC' + + steps: + - name: 'Cleanup' + run: > + set -e && + shopt -s dotglob && + rm -rf * + + - name: 'Checkout' + uses: actions/checkout@v3 + with: + fetch-depth: 1 + + - name: 'SC Mouse 10x allinfo' + shell: bash + env: + STEP_NAME: SC.Mouse.10x.allinfo + run: | + export PATH=$PATH:${{env.BIN_PATH}} + python3 ${{env.LAUNCHER}} ${{env.CFG_DIR}}/${{env.STEP_NAME}}.cfg -o ${{env.OUTPUT_BASE}} diff --git a/docs/barcode_calling.md b/docs/barcode_calling.md new file mode 100644 index 00000000..e0d64884 --- /dev/null +++ b/docs/barcode_calling.md @@ -0,0 +1,227 @@ +# Barcode calling + +IsoQuant includes a built-in barcode calling module (`isoquant_detect_barcodes.py`) that extracts + barcodes and UMIs from raw long reads. This module supports multiple single-cell and +spatial transcriptomics platforms and can also be used as a standalone tool. + +See [single-cell and spatial transcriptomics](single_cell.md) for supported platforms, +pipeline integration, MDF format, and UMI deduplication. + +## How it works + +The barcode calling module processes each read by: + +1. Searching for known constant sequences (linkers, primers, TSO) on the read +2. Locating the polyT tail to determine read orientation +3. Extracting the barcode region based on its expected position relative to the anchoring constant sequences +4. Matching the extracted barcode against a whitelist +5. Extracting the UMI sequence from its expected position + +Each read is assigned: +- A **barcode** (cell/spot identity) corrected against the whitelist, or `*` if no match is found +- A **UMI** (unique molecular identifier), or `*` if not detected +- A **strand** orientation (`+`, `-`, or `.` if unknown) +- Platform-specific features (polyT position, linker positions, etc.) + +## Integration with IsoQuant + +When running IsoQuant with any single-cell or spatial mode, +barcode calling is performed automatically as the first pipeline step. +The barcode detection output is a TSV file with one line per read, which is then used +throughout the rest of the pipeline for grouping and UMI deduplication. + +If barcodes have already been called externally (e.g., by Cell Ranger), +use `--barcoded_reads` to skip the barcode calling step. + +## Standalone usage + +`isoquant_detect_barcodes.py` can be run independently: + +```bash +python isoquant_detect_barcodes.py \ + --input reads.fastq.gz \ + --barcodes barcode_whitelist.txt \ + --mode tenX_v3 \ + --output output_prefix \ + --threads 16 +``` + +### Command line options + +`--input` or `-i` (required) + +One or more input read files in FASTQ, FASTA or BAM format. Gzipped FASTQ/FASTA files are supported. + +`--output` or `-o` (required) + +Output prefix. The barcode calling results will be written to `.barcoded_reads.tsv`. +When multiple input files are provided, outputs are numbered: `_0.barcoded_reads.tsv`, etc. + +`--barcodes` or `-b` + +One or more barcode whitelist files. The number of files depends on the mode: + +* 1 file: `tenX_v3`, `visium_5prime`, `stereoseq`, `stereoseq_nosplit`, `curio` +* 2 files: `visium_hd` +* Not needed for `custom_sc` (barcodes defined in MDF file) + +File should contain one barcode sequence per line. +More than 1 tab-separated column is allowed, but only the first will be used. +Supports plain text and gzipped files. + +_Note: barcode calling is performed much better if the whitelist contains a small number of barcodes. +If you have a subset of barcodes from short-read data, provide them instead of the full whitelist._ + +`--mode` + +Barcode calling mode. Available modes: `tenX_v3`, `curio`, `stereoseq`, `stereoseq_nosplit`, +`visium_5prime`, `visium_hd`, `custom_sc`. + +`--molecule` +Path to a molecule description format (MDF) file for `custom_sc` mode. +This file defines the structure of the sequencing molecule (barcodes, UMIs, linkers, polyT, cDNA)ю +See [MDF format](single_cell.md#molecule-definition-file-mdf-format) for the format specification. + +`--threads` or `-t` + +Number of threads for parallel processing (default: 16). + +`--tmp_dir` + +Folder for temporary files during parallel processing. + +### Example commands + +10x Genomics: +```bash +python isoquant_detect_barcodes.py \ + -i reads.fastq.gz \ + -b 3M-february-2018.txt.gz \ + --mode tenX_v3 \ + -o results/my_sample \ + -t 16 +``` + +Stereo-seq: +```bash +python isoquant_detect_barcodes.py \ + -i reads.fastq.gz \ + -b stereo_barcodes.txt \ + --mode stereoseq \ + -o results/stereo_sample \ + -t 16 +``` + +Custom molecule: +```bash +python isoquant_detect_barcodes.py \ + -i reads.fastq.gz \ + --mode custom_sc \ + --molecule my_platform.mdf \ + -o results/custom_sample +``` + +## Output + +The main output is a TSV file (`*.barcoded_reads.tsv`) with a header line followed by one line per read. +The columns depend on the platform mode. + +### Common columns (all modes) + +| Column | Description | +|--------|-------------| +| read_id | Read identifier | +| barcode | Corrected barcode sequence, or `*` if not found | +| UMI | UMI sequence, or `*` if not detected | +| BC_score | Barcode alignment score (-1 if not matched) | +| strand | Read orientation: `+`, `-`, or `.` | + +### Additional columns by mode + +**10x Genomics** (`tenX_v3`, `visium_5prime`, `visium_hd`): + +| Column | Description | +|--------|-------------| +| polyT | Position of polyT tail start (-1 if not found) | +| R1 | Position of R1 primer end (-1 if not found) | + +**Curio** (`curio`): + +| Column | Description | +|--------|-------------| +| polyT | PolyT tail position | +| primer | PCR primer position | +| linker_start | Linker start position | +| linker_end | Linker end position | + +**Stereo-seq** (`stereoseq_nosplit`, `stereoseq`): + +| Column | Description | +|--------|-------------| +| polyT | PolyT tail position | +| primer | Primer position | +| linker_start | Linker start position | +| linker_end | Linker end position | +| TSO | TSO position (-1 if not found) | + +**Custom** (`custom_sc`): columns depend on the elements defined in the MDF file. +Each variable element produces a column with its detected sequence and score. + +### Statistics file + +A statistics file (`*.barcoded_reads.tsv.stats`) is generated alongside the TSV output, +reporting the total number of reads processed, barcodes detected, and UMIs found. + +## Supported platforms + +### 10x Genomics (`tenX_v3`, `visium_5prime`) + +Molecule structure (3' to 5'): +``` +[R1 primer] [Barcode (16bp)] [UMI (12bp)] [PolyT] [cDNA] [TSO] +``` + +Requires 1 barcode whitelist file (e.g., the 10x `3M-february-2018.txt.gz`). + +### Visium HD (`visium_hd`) + +Molecule structure (3' to 5'): +``` +[R1 primer] [UMI (9bp)] [BC_part1 (15-16bp)] [BC_part2 (14-15bp)] VV [PolyT] [cDNA] [TSO] +``` + +Requires 2 barcode whitelist files (part 1 and part 2). +The final barcode is the concatenation of both parts. + +### Curio Bioscience (`curio`) + +Molecule structure (3' to 5'): +``` +[PCR primer] [Left BC (8bp)] [Linker] [Right BC (6bp)] [UMI (9bp)] [PolyT] [cDNA] +``` + +The linker sequence anchors the barcode detection. +Accepts 1 combined whitelist file (14bp barcodes). + +### Stereo-seq (`stereoseq`, `stereoseq_nosplit`) + +Molecule structure (3' to 5'): +``` +[Primer] [Barcode (25bp)] [Linker] [UMI (10bp)] [PolyT] [cDNA] [TSO] +``` + +- `stereoseq` mode: splits concatenated reads at TSO boundaries, producing a new FASTA with individual subreads +- `stereoseq_nosplit` mode: processes reads without splitting + +### Custom platform (`custom_sc`) + +Uses a molecule definition file (MDF) to describe arbitrary molecule structures. +See [MDF format](single_cell.md#molecule-definition-file-mdf-format). + +The universal extractor: +1. Tries both forward and reverse complement orientations +2. Detects polyT tail to determine strand +3. Finds constant elements (primers, linkers) on the read +4. Extracts variable elements (barcodes, UMIs) from their expected positions +5. Corrects barcodes against whitelists when provided (`VAR_FILE` or `VAR_LIST`) +6. Handles linked elements: concatenated (`|`) parts are joined then corrected; duplicated (`/`) parts use majority vote diff --git a/docs/cmd.md b/docs/cmd.md index a9bb7aef..462d530e 100644 --- a/docs/cmd.md +++ b/docs/cmd.md @@ -96,7 +96,7 @@ Use this option at your own risk. Input file names are used as labels if not set. `--read_group` - Sets one or more ways to group feature counts (e.g. by cell type, file, or barcode). + Sets one or more ways to group feature counts (e.g. by cell type, file name, BAM tag or barcode). Multiple grouping strategies can be combined (space-separated). Available grouping options: @@ -106,8 +106,8 @@ If multiple BAM/FASTQ files are provided and `--read_group` option is not set, I by default. * `tag:TAG` - groups reads by BAM file read tag, where `TAG` is the tag name -(e.g. `tag:CB` uses `CB` tag values as groups, commonly used for cell barcodes in single-cell data). - +(e.g. `tag:RG` uses `RG` tag values as groups, commonly used for read group information). +q * `read_id:DELIM` - groups reads by read name suffix, where `DELIM` is the symbol/string by which the read id will be split (e.g. if `DELIM` is `_`, for read `m54158_180727_042959_59310706_ccs_NEU` the group will be `NEU`). @@ -117,49 +117,17 @@ where `FILE` is the file path, `READ_COL` is column with read ids (default: 0), `GROUP_COL(S)` is column(s) with group ids (default: 1; use comma-separated columns for multi-column grouping, e.g., `1,2,3`), `DELIM` is separator symbol (default: tab). File can be gzipped. - * `barcode_spot` or `barcode_spot:FILE` - maps barcodes to spots/cell types. -Uses barcode-to-spot mapping from `--barcode2spot` file by default, or from explicit `FILE` if specified. + * `barcode_spot` - group by barcode properties, such as spots/cell types. +Set barcode-to-spot mapping via `--barcode2spot` option. Useful for grouping single-cell/spatial data by cell type or spatial region instead of individual barcodes. -**Example**: `--read_group tag:CB file_name barcode_spot` creates multi-level grouping by cell barcode tag, file name, and cell type. - +* `barcode` - groups reads by cell barcode. +When the number of barcodes is large, grouping counts by individual barcodes may take a long time. +It is recommended to use `--barcode2spot` file instead. -### Single-cell and spatial transcriptomics options +**Example**: `--read_group tag:RG file_name barcode_spot` creates multi-level grouping by read group tag, +original file name, and barcode property (e.g. cell type). -**NB! This functionality is not officially released yet, use it at your own risk.** - -`--mode` or `-m` -IsoQuant mode for processing single-cell or spatial transcriptomics data. Available modes: - -* `bulk` - standard bulk RNA-seq mode (default) -* `tenX_v3` - 10x Genomics single-cell 3' gene expression -* `curio` - Curio Bioscience single-cell -* `visium_hd` - 10x Genomics Visium HD spatial transcriptomics -* `visium_5prime` - 10x Genomics Visium 5' spatial transcriptomics -* `stereoseq` - Stereo-seq spatial transcriptomics (BGI) -* `stereoseq_nosplit` - Stereo-seq without barcode splitting - -Single-cell and spatial modes enable automatic barcode calling and UMI-based deduplication. - -`--barcode_whitelist` -Path to file(s) with barcode whitelist for barcode calling. -Required for single-cell/spatial modes unless `--barcoded_reads` is provided. -File should contain one barcode per line. - -`--barcoded_reads` -Path to TSV file(s) with barcoded reads. -Format: `read_idbarcodeumi` (one read per line). -If not provided, barcodes will be called automatically from raw reads using `--barcode_whitelist`. - -`--barcode_column` -Column index for barcodes in the `--barcoded_reads` file (default: 1). -Read ID column is 0, barcode column is 1, UMI column is 2. - -`--barcode2spot` -Path to TSV file(s) mapping barcodes to cell types or spatial spots. -Format: `barcodecell_type` (one barcode per line). -Used with `--read_group barcode_spot` to group counts by cell type/spot instead of individual barcodes. -Useful for reducing output dimensions from thousands of barcodes to tens of cell types. ### Output options @@ -200,6 +168,62 @@ Useful for reducing output dimensions from thousands of barcodes to tens of cell Align reads to the reference without running IsoQuant itself. +## Single-cell and spatial transcriptomics options + +**NB! This feature is experimental and is not part of the official IsoQuant release.** + +See [single-cell and spatial transcriptomics](single_cell.md) for a detailed guide on +supported platforms, examples, molecule description format, and UMI deduplication. + +`--mode` or `-m` +IsoQuant mode for processing single-cell or spatial transcriptomics data. Available modes: + +* `bulk` - standard bulk RNA-seq mode (default) +* `tenX_v3` - 10x Genomics single-cell 3' gene expression +* `visium_5prime` - 10x Genomics Visium 5' spatial transcriptomics +* `visium_hd` - 10x Genomics Visium HD spatial transcriptomics +* `curio` - Curio Bioscience spatial data +* `stereoseq` - Stereo-seq spatial data +* `stereoseq_nosplit` - Stereo-seq without read splitting +* `custom_sc` - custom single-cell/spatial mode using a [molecule definition file](single_cell.md#molecule-definition-file-mdf-format) + +Single-cell and spatial modes enable automatic barcode calling and UMI-based deduplication. + +`--barcode_whitelist` +Path to file(s) with barcode whitelist(s) for barcode calling. +Required for single-cell/spatial modes unless `--barcoded_reads` is provided. + +File should contain one barcode sequence per line. +More than 1 tab-separated column is allowed, but only the first will be used. +Supports plain text and gzipped files. + +_Note: barcode calling is performed much better if the whitelist contains a small number of barcodes. +If you have a subset of barcodes from short-read data, provide them instead of the full whitelist._ + +`--barcoded_reads` +Path to TSV file(s) with pre-called barcoded reads. +Format: `read_idbarcodeumi` (one read per line). +If provided, IsoQuant skips barcode calling and uses these assignments directly. +More than 3 columns are allowed, but only the first 3 will be used. + +Note! IsoQuant does not read barcodes or UMIs from BAM file tags. + +`--barcode2spot` +Path to TSV file mapping barcodes to cell types, spatial spots, or other barcode properties. +By default, barcode is in the first column, cell type in the second. +However, you can specify one or more columns via colon symbol (similar to `--read_group`): +`file.tsv:barcode_column:spot_column(s)` (e.g., `cell_types.tsv:0:1,2,3` for multiple barcode properties). + +When `--barcode2spot` is set, `--read_group barcode_spot` will be set automatically +to group counts by cell type, spatial regions, or other provided properties. + +`--molecule` +Path to a molecule description file (MDF) for `custom_sc` mode. +Defines molecule structure for universal barcode extraction from any platform. +See [MDF format](single_cell.md#molecule-definition-file-mdf-format) for details. + + + ## Algorithm parameters ### Quantification diff --git a/docs/single_cell.md b/docs/single_cell.md new file mode 100644 index 00000000..1d88402d --- /dev/null +++ b/docs/single_cell.md @@ -0,0 +1,258 @@ +# Single-cell and spatial transcriptomics + +**NB! This feature is experimental and is not part of the official IsoQuant release.** + +IsoQuant supports single-cell and spatial transcriptomics data from multiple platforms. +When a single-cell or spatial mode is selected, IsoQuant automatically performs +barcode calling and UMI-based PCR deduplication as part of the pipeline. + +## Overview + +The single-cell/spatial pipeline extends the standard bulk pipeline with these additional steps: + +1. **Barcode calling** -- extract cell/spot barcodes and UMIs from raw reads +2. **Standard IsoQuant processing** -- alignment, read-to-isoform assignment +3. **UMI deduplication** -- remove PCR/RT duplicates within each barcode group +4. **Grouped quantification** -- produce per-cell/per-spot count matrices + +Note: UMI deduplication relies on read-to-gene assignment. Reads that are not assigned to any gene are discarded. +Hence, novel gene discovery will not be performed in single-cell/spatial mode. +We recommend using `bulk` mode for novel gene and transcript discovery. + +Barcode calling is handled by the built-in [barcode calling module](barcode_calling.md), +which can also be used as a standalone tool. + +## Supported platforms + +| Mode | Platform | Barcode whitelist files | UMI length | Notes | +|------|----------|--------------------|------------|--------------------------------------------------------------------------------------| +| `tenX_v3` | 10x Genomics 3' v3 | 1 | 12 | Single 16bp barcode | +| `visium_5prime` | 10x Genomics Visium 5' | 1 | 12 | Same detector as tenX_v3 | +| `visium_hd` | 10x Genomics Visium HD | 2 | 9 | Two barcodes (15/16bp + 14/15bp) | +| `curio` | Curio Bioscience | 1 | 9 | Double barcode (8bp + 6bp) with linker | +| `stereoseq` | Stereo-seq | 1 | 10 | 25bp barcode, read splitting mode | +| `stereoseq_nosplit` | Stereo-seq | 1 | 10 | 25bp barcode, no read splitting | +| `custom_sc` | Any platform | 0 (uses MDF) | varies | User-defined molecule structure via [MDF file](#molecule-definition-file-mdf-format) | + +## Quick start examples + +10x Genomics single-cell: +```bash +isoquant.py --reference genome.fa --genedb genes.gtf --complete_genedb \ + --fastq reads.fastq.gz --data_type nanopore \ + --mode tenX_v3 --barcode_whitelist 3M-february-2018.txt.gz \ + -o sc_output +``` + +Stereo-seq spatial: +```bash +isoquant.py --reference genome.fa --genedb genes.gtf --complete_genedb \ + --fastq reads.fastq.gz --data_type nanopore \ + --mode stereoseq --barcode_whitelist barcodes.txt \ + -o stereo_output +``` + +Custom platform with molecule definition file: +```bash +isoquant.py --reference genome.fa --genedb genes.gtf --complete_genedb \ + --fastq reads.fastq.gz --data_type nanopore \ + --mode custom_sc --molecule molecule_definition.mdf \ + -o custom_output +``` + +Pre-called barcodes (skip barcode calling): +```bash +isoquant.py --reference genome.fa --genedb genes.gtf --complete_genedb \ + --bam aligned.bam --data_type nanopore \ + --mode tenX_v3 --barcoded_reads barcodes.tsv \ + -o sc_output +``` + +## Command line options + +`--mode` or `-m` + +IsoQuant processing mode. Available modes: + +* `bulk` -- standard bulk RNA-seq mode (default) +* `tenX_v3` -- 10x Genomics single-cell 3' gene expression +* `curio` -- Curio Bioscience single-cell +* `visium_hd` -- 10x Genomics Visium HD spatial transcriptomics +* `visium_5prime` -- 10x Genomics Visium 5' spatial transcriptomics +* `stereoseq` -- Stereo-seq spatial transcriptomics (BGI), with read splitting +* `stereoseq_nosplit` -- Stereo-seq without read splitting +* `custom_sc` -- custom single-cell/spatial mode using a molecule definition file (MDF) + +All modes except `bulk` enable automatic barcode calling and UMI-based deduplication. + + +`--barcode_whitelist` +Path to file(s) with barcode whitelist(s) for barcode calling. +Required for single-cell/spatial modes unless `--barcoded_reads` is provided. + +File should contain one barcode sequence per line. +More than 1 tab-separated column is allowed, but only the first will be used. +Supports plain text and gzipped files. + +Note: barcode calling is performed much better if the whitelist contains a small number of barcodes. +If you have a subset of barcodes from short-read data, provide them instead of the full whitelist. + +The number of whitelist files depends on the mode: + +* 1 file: `tenX_v3`, `visium_5prime`, `stereoseq`, `stereoseq_nosplit`, `curio` (combined 14bp barcodes) +* 2 files: `visium_hd` (part 1 and part 2 barcode lists) +* Not needed: `custom_sc` (barcode lists are specified inside the MDF file) + +`--barcoded_reads` +Path to TSV file(s) with pre-called barcoded reads. +Format: `read_idbarcodeumi` (one read per line). +If provided, IsoQuant skips barcode calling and uses these assignments directly. +More than 3 columns are allowed, but only the first 3 will be used. + +Note! IsoQuant does not read barcodes or UMIs from BAM file tags. + +`--barcode2spot` +Path to TSV file mapping barcodes to cell types, spatial spots, or other barcode properties. +By default, barcode is in the first column, cell type in the second. +However, you can specify one or more columns via colon symbol (similar to `--read_group`): +`file.tsv:barcode_column:spot_column(s)` (e.g., `cell_types.tsv:0:1,2,3` for multiple barcode properties). + +When `--barcode2spot` is set, `--read_group barcode_spot` will be set automatically +to group counts by cell type, spatial regions, or other provided properties. + +`--molecule` +Path to a molecule description format (MDF) file for `custom_sc` mode. +This file defines the structure of the sequencing molecule (barcodes, UMIs, linkers, polyT, cDNA) +and allows IsoQuant to process reads from any single-cell or spatial platform. +See the [MDF format](#molecule-definition-file-mdf-format) section below for details. + +## Molecule description format (MDF) + +The MDF format allows users to describe the structure of their sequencing molecule +so that IsoQuant can extract barcodes and UMIs from any platform. +The molecule is described in the 3' to 5' direction (primer end first, cDNA last). + +An MDF file has two parts: + +1. **Header line**: colon-separated list of element names defining the order of elements on the molecule (3' to 5'). +2. **Element definitions**: one line per element, tab-separated: `name type value [length]`. + +### Element types + +| Type | Description | Value field | +|------|-------------------------------------------------------------------------------|--------------------------| +| `CONST` | Constant/known sequence (primer, linker, TSO) | Sequence | +| `VAR_FILE` | Variable sequence matched against a whitelist file | Path to TSV file | +| `VAR_LIST` | Variable sequence matched against an inline list | Comma-separated sequences | +| `VAR_ANY` | Variable fixed-length sequence extracted as-is | Length (integer) | +| `VAR_ANY_SEPARATOR` | Fixed-length variable separator sequence (not extracted) | Length (integer) | +| `VAR_ANY_NON_T_SEPARATOR` | Fixed-length variable separator sequence without T nucleoties (not extracted) | Length (integer) | +| `PolyT` | PolyT tail | (none) | +| `cDNA` | cDNA region | (none) | + +At the moment, only a single `cDNA` and a single `PolyT` are supported. + +Variable elements are expected to have a fixed length (`VAR_FILE` and `VAR_LIST`). +Using variable-length barcodes may result in suboptimal performance. + +### Barcode and UMI identification + +Elements are identified as barcodes or UMIs by their name prefix (case-insensitive): +- Names starting with `barcode` are treated as barcode elements (e.g., `Barcode`, `barcode1`) +- Names starting with `umi` are treated as UMI elements (e.g., `UMI`, `umi1`) + +When multiple barcode (or UMI) elements are present, their sequences are concatenated +in the order they appear. + +### Examples + +**10x 3' single-cell v3**: +``` +R1:Barcode:UMI:PolyT:cDNA:TSO +R1 CONST CTACACGACGCTCTTCCGATCT +Barcode VAR_FILE barcodes.tsv +UMI VAR_ANY 12 +TSO CONST CCCATGTACTCTGCGTTGATACCACTGCTT +``` + +**10x 3' single-cell v3** (inline barcodes): +``` +R1:Barcode:UMI:PolyT:cDNA:TSO +R1 CONST CTACACGACGCTCTTCCGATCT +Barcode VAR_FILE AAACCCGGGTTTAAAC,TTTGGGCCCAAATTTG,GGGGAAAACCCCTTTT +UMI VAR_ANY 12 +TSO CONST CCCATGTACTCTGCGTTGATACCACTGCTT +``` + +### Linked elements for multi-part barcodes + +When a barcode is split into multiple parts in the molecule, use linked element notation. +All linked parts must be of the same variable type and reference the same whitelist. +Parts are numbered consecutively starting from 1. + +**Concatenated** (`|`): parts of the barcode separated by other elements. +Parts are extracted independently, then concatenated and corrected as one sequence against the full whitelist. +Such structure appears in, for example, Curio Bioscience single-cell protocol: + +``` +PCR_PRIMER:Barcode|1:Linker:Barcode|2:UMI:PolyT:cDNA +PCR_PRIMER TACACGACGCTCTTCCGATCT +Barcode|1 VAR_FILE barcodes.tsv 8 +Linker CONST TCTTCAGCGTTCCCGAGA +Barcode|2 VAR_FILE barcodes.tsv 6 +UMI VAR_ANY 9 +``` + +**Duplicated** (`/`): multiple redundant copies of the same barcode in the molecule. +Each copy is corrected independently against the whitelist; +a majority vote determines the final barcode. +For example: +``` +PCR_PRIMER:Barcode/1:UMI:PolyT:cDNA:Barcode/2:TSO +PCR_PRIMER TACACGACGCTCTTCCGATCT +Barcode/1 VAR_FILE barcodes.tsv 16 +Barcode/2 VAR_FILE barcodes.tsv 16 +UMI VAR_ANY 12 +TSO CONST CCCATGTACTCTGCGTTGATACCACTGCTT +``` + +## UMI deduplication + +All single-cell and spatial modes perform UMI-based PCR deduplication after isoform assignment. +Within each cell barcode and gene, reads with similar UMIs (similarity criteria depends on the UMI length) +are collapsed into a single representative read. + +The representative read is selected based on: +1. Unique isoform assignment over ambiguous +2. More exons +3. Longer transcript alignment + +## Output + +### Count matrices + +Single-cell/spatial modes produce grouped count matrices in addition to the standard IsoQuant output. +Use `--counts_format` to control the output format: + +* `default` -- automatic selection: matrix format for small numbers of groups (<=100), MTX for larger datasets +* `matrix` -- standard matrix format with genes/transcripts as rows and barcodes as columns +* `mtx` -- Matrix Market (MTX) format compatible with Seurat and Scanpy +* `none` -- no conversion (only internal linear format is produced) + +Grouped counts can also be converted after the run using `src/convert_grouped_counts.py`. + +### Grouping reads + +Use `--read_group` to control how reads are grouped for quantification. +Multiple grouping strategies can be combined (space-separated), producing separate count tables for each. + +The most common use-cases for single-cell/spatial data are grouping by barcode property (cell type, spot): + +`--read_group barcode_spot` (requires `--barcode2spot`) + +or grouping by individual barcode (not recommended for datasets with many barcodes): + +`--read_group barcode file_name` + + +See the [read grouping options](cmd.md#other-input-options) for the full list of grouping strategies. diff --git a/isoquant.py b/isoquant.py index 59f68cfa..60aebc22 100755 --- a/isoquant.py +++ b/isoquant.py @@ -46,7 +46,7 @@ from src.input_data_storage import InputDataStorage, InputDataType from src.multimap_resolver import MultimapResolvingStrategy from src.stats import combine_counts -from detect_barcodes import process_single_thread, process_in_parallel, get_umi_length +from isoquant_detect_barcodes import process_single_thread, process_in_parallel, get_umi_length logger = logging.getLogger('IsoQuant') @@ -167,17 +167,16 @@ def add_hidden_option(*args, **kwargs): # show command only with --full-help help="IsoQuant modes: " + ", ".join(ISOQUANT_MODES) + "; default:%s" % IsoQuantMode.bulk.name, default=IsoQuantMode.bulk.name) add_additional_option_to_group(sc_args_group, '--barcode_whitelist', type=str, nargs='+', - help='file(s) with barcode whitelist for barcode calling') + help='file(s) with barcode whitelist(s) for barcode calling') add_additional_option_to_group(sc_args_group, "--barcoded_reads", type=str, nargs='+', help='TSV file(s) with barcoded reads; barcodes will be called automatically if not provided') - # TODO: add UMI column, support various formats - add_additional_option_to_group(sc_args_group, "--barcode_column", type=str, - help='column with barcodes in barcoded_reads file, default=1; read id column is 0', - default=1) add_additional_option_to_group(sc_args_group, "--barcode2spot", type=str, help='TSV file mapping barcode to cell type / spot id. ' 'Format: file.tsv or file.tsv:barcode_col:spot_cols ' '(e.g., file.tsv:0:1,2,3 for multiple spot columns)') + add_additional_option_to_group(sc_args_group, "--molecule", type=str, + help='molecule definition file (MDF) for custom_sc mode; ' + 'defines molecule structure for universal barcode extraction') # ALGORITHM add_additional_option_to_group(algo_args_group, "--report_novel_unspliced", "-u", type=bool_str, @@ -531,7 +530,11 @@ def check_input_params(args): args.umi_length = 0 if args.mode.needs_barcode_calling(): - if not args.barcode_whitelist and not args.barcoded_reads: + if args.mode == IsoQuantMode.custom_sc: + if not args.molecule and not args.barcoded_reads: + logger.critical("custom_sc mode requires --molecule or --barcoded_reads") + sys.exit(IsoQuantExitCode.BARCODE_WHITELIST_MISSING) + elif not args.barcode_whitelist and not args.barcoded_reads: logger.critical("You have chosen single-cell/spatial mode %s, please specify barcode whitelist or file with " "barcoded reads" % args.mode.name) sys.exit(IsoQuantExitCode.BARCODE_WHITELIST_MISSING) @@ -607,6 +610,10 @@ def check_input_files(args): else: check_file_exists(args.barcoded_reads, "Barcoded reads file") + # Check molecule definition file + if hasattr(args, 'molecule') and args.molecule: + check_file_exists(args.molecule, "Molecule definition file") + # Check barcode whitelist files if hasattr(args, 'barcode_whitelist') and args.barcode_whitelist: for wl_file in args.barcode_whitelist: @@ -958,7 +965,8 @@ def prepare_reference_genome(args): class BarcodeCallingArgs: - def __init__(self, input, barcode_whitelist, mode, output, out_fasta, tmp_dir, threads): + def __init__(self, input, barcode_whitelist, mode, output, out_fasta, tmp_dir, threads, + molecule: str | None = None): self.input = input # Can be a single file (str) or list of files self.barcodes = barcode_whitelist self.mode = mode @@ -966,7 +974,7 @@ def __init__(self, input, barcode_whitelist, mode, output, out_fasta, tmp_dir, t self.out_fasta = out_fasta # Can be a single filename (str), list of filenames, or None self.tmp_dir = tmp_dir self.threads = threads - self.min_score = None + self.molecule = molecule def call_barcodes(args): @@ -999,7 +1007,8 @@ def call_barcodes(args): bc_threads = 1 if args.mode.enforces_single_thread() else args.threads bc_args = BarcodeCallingArgs(input_files, args.barcode_whitelist, args.mode, - output_barcodes_list, output_fasta_list, sample.aux_dir, bc_threads) + output_barcodes_list, output_fasta_list, sample.aux_dir, bc_threads, + molecule=getattr(args, 'molecule', None)) # Launching barcode calling in a separate process has the following reason: # Read chunks are not cleared by the GC in the end of barcode calling, leaving the main # IsoQuant process to consume ~2,5 GB even when barcode calling is done. diff --git a/detect_barcodes.py b/isoquant_detect_barcodes.py similarity index 89% rename from detect_barcodes.py rename to isoquant_detect_barcodes.py index de20df22..c56d0f5c 100755 --- a/detect_barcodes.py +++ b/isoquant_detect_barcodes.py @@ -16,14 +16,13 @@ from traceback import print_exc import shutil from collections import defaultdict -import numpy import pysam from Bio import SeqIO, Seq, SeqRecord import logging from src.modes import IsoQuantMode from src.error_codes import IsoQuantExitCode -from src.barcode_calling.common import bit_to_str, reverese_complement +from src.barcode_calling.common import reverese_complement, load_barcodes from src.barcode_calling import ( TenXBarcodeDetector, CurioBarcodeDetector, @@ -31,6 +30,8 @@ SharedMemoryStereoSplittingBarcodeDetector, ReadStats, VisiumHDBarcodeDetector, + UniversalSingleMoleculeExtractor, + MoleculeStructure ) logger = logging.getLogger('IsoQuant') @@ -43,7 +44,8 @@ IsoQuantMode.stereoseq_nosplit: SharedMemoryStereoBarcodeDetector, IsoQuantMode.stereoseq: SharedMemoryStereoSplittingBarcodeDetector, IsoQuantMode.visium_5prime: TenXBarcodeDetector, - IsoQuantMode.visium_hd: VisiumHDBarcodeDetector + IsoQuantMode.visium_hd: VisiumHDBarcodeDetector, + IsoQuantMode.custom_sc: UniversalSingleMoleculeExtractor } BARCODE_FILES_REQUIRED = {IsoQuantMode.tenX_v3: [1], @@ -51,7 +53,8 @@ IsoQuantMode.stereoseq_nosplit: [1], IsoQuantMode.stereoseq: [1], IsoQuantMode.visium_5prime: [1], - IsoQuantMode.visium_hd: [2] + IsoQuantMode.visium_hd: [2], + IsoQuantMode.custom_sc: [0] } @@ -108,7 +111,7 @@ def __init__(self, output_file_name, barcode_detector, header=False, output_sequ self.output_sequences_file = open(self.output_sequences, "w") self.process_function = self._process_read_split if header: - self.output_file.write(barcode_detector.result_type().header() + "\n") + self.output_file.write(barcode_detector.header() + "\n") self.read_stat = ReadStats() def get_stats(self): @@ -244,13 +247,12 @@ def bam_file_chunk_reader(handler): yield current_chunk -def process_chunk(barcode_detector, read_chunk, output_file, num, out_fasta=None, min_score=None): +def process_chunk(barcode_detector, read_chunk, output_file, num, out_fasta=None): output_file += "_" + str(num) if out_fasta: out_fasta += "_" + str(num) counter = 0 - if min_score: - barcode_detector.min_score = min_score + barcode_caller = BarcodeCaller(output_file, barcode_detector, output_sequences=out_fasta) counter += barcode_caller.process_chunk(read_chunk) read_chunk.clear() @@ -260,13 +262,26 @@ def process_chunk(barcode_detector, read_chunk, output_file, num, out_fasta=None return output_file, out_fasta, counter -def prepare_barcodes(args): +def create_barcode_caller(args): + logger.info("Creating barcode detector for mode %s" % args.mode.name) + + if args.mode == IsoQuantMode.custom_sc: + if not args.molecule: + logger.critical("Custom single-cell/spatial mode requires molecule description to be provided via --molecule") + sys.exit(IsoQuantExitCode.INCOMPATIBLE_OPTIONS) + if not os.path.isfile(args.molecule): + logger.critical("Molecule file %s does not exist" % args.molecule) + sys.exit(IsoQuantExitCode.INPUT_FILE_NOT_FOUND) + + return BARCODE_CALLING_MODES[args.mode](MoleculeStructure(open(args.molecule))) + logger.info("Using barcodes from %s" % ", ".join(args.barcodes)) barcode_files = len(args.barcodes) if barcode_files not in BARCODE_FILES_REQUIRED[args.mode]: logger.critical("Barcode calling mode %s requires %s files, %d provided" % (args.mode.name, " or ".join([str(x) for x in BARCODE_FILES_REQUIRED[args.mode]]), barcode_files)) sys.exit(IsoQuantExitCode.BARCODE_FILE_COUNT_MISMATCH) + barcodes = [] for bc in args.barcodes: barcodes.append(load_barcodes(bc, needs_iterator=args.mode.needs_barcode_iterator())) @@ -280,15 +295,14 @@ def prepare_barcodes(args): for i, bc in enumerate(barcodes): logger.info("Loaded %d barcodes from %s" % (len(bc), args.barcodes[i])) barcodes = tuple(barcodes) - return barcodes + + barcode_detector = BARCODE_CALLING_MODES[args.mode](barcodes) + + return barcode_detector def process_single_thread(args): - logger.info("Preparing barcodes indices") - barcodes = prepare_barcodes(args) - barcode_detector = BARCODE_CALLING_MODES[args.mode](barcodes) - if args.min_score: - barcode_detector.min_score = args.min_score + barcode_detector = create_barcode_caller(args) # args.input, args.output_tsv are always lists # args.out_fasta is a list or None @@ -306,7 +320,7 @@ def process_single_thread(args): logger.info("Finished barcode calling") -def _process_single_file_in_parallel(input_file, output_tsv, out_fasta, args, barcodes): +def _process_single_file_in_parallel(input_file, output_tsv, out_fasta, args, barcode_detector): """Process a single file in parallel (internal helper function).""" logger.info("Processing " + input_file) fname, outer_ext = os.path.splitext(os.path.basename(input_file)) @@ -334,15 +348,10 @@ def _process_single_file_in_parallel(input_file, output_tsv, out_fasta, args, ba tmp_dir = "barcode_calling_%x" % random.randint(0, 1 << 32) if args.tmp_dir: tmp_dir = os.path.join(args.tmp_dir, tmp_dir) + else: + tmp_dir = os.path.join(os.path.dirname(args.output), tmp_dir) os.makedirs(tmp_dir) - barcode_detector = BARCODE_CALLING_MODES[args.mode](barcodes) - logger.info("Barcode caller created") - - min_score = None - if args.min_score: - min_score = args.min_score - tmp_barcode_file = os.path.join(tmp_dir, "bc") tmp_fasta_file = os.path.join(tmp_dir, "subreads") if out_fasta else None chunk_counter = 0 @@ -363,8 +372,7 @@ def _process_single_file_in_parallel(input_file, output_tsv, out_fasta, args, ba chunk, tmp_barcode_file, chunk_counter, - tmp_fasta_file, - min_score)) + tmp_fasta_file)) chunk_counter += 1 if chunk_counter >= args.threads: break @@ -391,15 +399,14 @@ def _process_single_file_in_parallel(input_file, output_tsv, out_fasta, args, ba chunk, tmp_barcode_file, chunk_counter, - tmp_fasta_file, - min_score)) + tmp_fasta_file)) chunk_counter += 1 except StopIteration: reads_left = False with open(output_tsv, "w") as final_output_tsv: final_output_fasta = open(out_fasta, "w") if out_fasta else None - header = BARCODE_CALLING_MODES[args.mode].result_type().header() + header = barcode_detector.header() final_output_tsv.write(header + "\n") stat_dict = defaultdict(int) for tmp_file, tmp_fasta in output_files: @@ -427,46 +434,16 @@ def process_in_parallel(args): # args.input, args.output_tsv are always lists # args.out_fasta is a list or None out_fastas = args.out_fasta if args.out_fasta else [None] * len(args.input) - - # Prepare barcodes once for all files - barcodes = prepare_barcodes(args) + barcode_detector = create_barcode_caller(args) for idx, (input_file, output_tsv, out_fasta) in enumerate(zip(args.input, args.output_tsv, out_fastas)): if len(args.input) > 1: logger.info("Processing file %d/%d: %s" % (idx + 1, len(args.input), input_file)) - _process_single_file_in_parallel(input_file, output_tsv, out_fasta, args, barcodes) + _process_single_file_in_parallel(input_file, output_tsv, out_fasta, args, barcode_detector) logger.info("Finished barcode calling") -def load_barcodes(inf, needs_iterator=False): - if inf.endswith("h5") or inf.endswith("hdf5"): - return load_h5_barcodes_bit(inf) - - if inf.endswith("gz") or inf.endswith("gzip"): - handle = gzip.open(inf, "rt") - else: - handle = open(inf, "r") - - barcode_iterator = iter(l.strip().split()[0] for l in handle) - if needs_iterator: - return barcode_iterator - - return [b for b in barcode_iterator] - - -def load_h5_barcodes_bit(h5_file_path, dataset_name='bpMatrix_1'): - raise NotImplementedError() - import h5py - barcode_list = [] - with h5py.File(h5_file_path, 'r') as h5_file: - dataset = numpy.array(h5_file[dataset_name]) - for row in dataset: - for col in row: - barcode_list.append(bit_to_str(int(col[0]))) - return barcode_list - - def set_logger(logger_instance, args): logger_instance.setLevel(logging.INFO) if args.debug: @@ -490,14 +467,13 @@ def add_hidden_option(*args, **kwargs): # show command only with --full-help parser.add_argument("--output", "-o", type=str, help="output prefix name", required=True) parser.add_argument("--barcodes", "-b", nargs='+', type=str, help="barcode whitelist(s)", required=False) # parser.add_argument("--umi", "-u", type=str, help="potential UMIs, detected de novo if not set") - parser.add_argument("--mode", type=str, help="mode to be used", choices=[x.name for x in BARCODE_CALLING_MODES.keys()], - default=IsoQuantMode.stereoseq.name) + parser.add_argument("--mode", type=str, help="mode to be used", choices=[x.name for x in BARCODE_CALLING_MODES.keys()]) + parser.add_argument("--molecule", type=str, help="MDF files with molecule description (for custom_sc mode only)") + parser.add_argument("--input", "-i", nargs='+', type=str, help="input reads in [gzipped] FASTA, FASTQ, BAM, SAM", required=True) parser.add_argument("--threads", "-t", type=int, help="threads to use (16)", default=16) parser.add_argument("--tmp_dir", type=str, help="folder for temporary files") - parser.add_argument("--min_score", type=int, help="minimal barcode score " - "(scoring system is +1, -1, -1, -1)") add_hidden_option('--debug', action='store_true', default=False, help='Debug log output.') args = parser.parse_args(sys_argv) diff --git a/misc/assess_allinfo_quality.py b/misc/assess_allinfo_quality.py new file mode 100644 index 00000000..f71f938c --- /dev/null +++ b/misc/assess_allinfo_quality.py @@ -0,0 +1,191 @@ +#!/usr/bin/env python3 + +############################################################################ +# Copyright (c) 2022-2026 University of Helsinki +# Copyright (c) 2020-2022 Saint Petersburg State University +# # All Rights Reserved +# See file LICENSE for details. +############################################################################ + +# Assess quality of UMI-filtered allinfo output by independently computing +# statistics from the allinfo file and cross-validating against stats.tsv. + +import argparse +import gzip +import logging +import sys +from collections import Counter +from typing import Dict, Tuple + +logger = logging.getLogger('AllinfoQA') + + +def setup_logger(): + logger.setLevel(logging.INFO) + ch = logging.StreamHandler(sys.stdout) + ch.setLevel(logging.INFO) + formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') + ch.setFormatter(formatter) + logger.addHandler(ch) + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Assess quality of UMI-filtered allinfo output") + parser.add_argument("--allinfo", type=str, required=True, + help="Path to allinfo file (plain or .gz)") + parser.add_argument("--stats", type=str, required=True, + help="Path to stats.tsv file from UMI filtering") + parser.add_argument("--output", "-o", type=str, required=True, + help="Output quality report file (TSV)") + return parser.parse_args() + + +def load_tsv_config(filepath: str) -> Dict[str, str]: + config = {} + with open(filepath) as f: + for line in f: + if line.startswith("#"): + continue + tokens = line.strip().split('\t') + if len(tokens) >= 2: + config[tokens[0]] = tokens[1] + return config + + +def compute_allinfo_stats(allinfo_path: str) -> Tuple[Dict[str, int], int]: + """Parse allinfo file and compute statistics. + + Returns: + Tuple of (stats_dict, duplicate_triplet_count) + """ + open_fn = gzip.open if allinfo_path.endswith('.gz') else open + mode = 'rt' if allinfo_path.endswith('.gz') else 'r' + + total_reads = 0 + spliced_reads = 0 + gene_barcode_pairs: set = set() + triplet_counts: Counter = Counter() + + with open_fn(allinfo_path, mode) as f: + for line in f: + line = line.strip() + if not line: + continue + + fields = line.split('\t') + if len(fields) < 13: + logger.warning("Skipping malformed line with %d fields: %s", + len(fields), line[:100]) + continue + + total_reads += 1 + + # Columns (0-indexed): + # 0: read_id, 1: gene_id, 2: cell_type, 3: barcode, 4: umi, + # 5: introns, 6: TSS, 7: polyA, 8: exons, 9: read_type, + # 10: num_introns, 11: transcript_id, 12: transcript_type + gene_id = fields[1] + barcode = fields[3] + umi = fields[4] + num_introns = int(fields[10]) + + if num_introns > 0: + spliced_reads += 1 + + gene_barcode_pairs.add((gene_id, barcode)) + triplet_counts[(gene_id, barcode, umi)] += 1 + + duplicate_triplets = sum(1 for count in triplet_counts.values() if count > 1) + + stats = { + "Total reads saved": total_reads, + "Spliced reads saved": spliced_reads, + "Unique gene-barcodes pairs": len(gene_barcode_pairs), + } + return stats, duplicate_triplets + + +def cross_validate(computed_stats: Dict[str, int], + stats_tsv: Dict[str, str]) -> bool: + """Cross-validate computed allinfo stats against stats.tsv values. + + Returns True if all checks pass. + """ + all_match = True + + checks = [ + ("Total reads saved", "Total reads saved"), + ("Spliced reads saved", "Spliced reads saved"), + ("Unique gene-barcodes pairs", "Unique gene-barcodes pairs"), + ] + + for computed_key, stats_key in checks: + if stats_key not in stats_tsv: + logger.error("Key '%s' not found in stats.tsv", stats_key) + all_match = False + continue + + computed_val = computed_stats[computed_key] + stats_val = int(stats_tsv[stats_key]) + + if computed_val != stats_val: + logger.error("Mismatch for '%s': allinfo=%d, stats.tsv=%d", + computed_key, computed_val, stats_val) + all_match = False + else: + logger.info("OK '%s': %d", computed_key, computed_val) + + return all_match + + +def main(): + args = parse_args() + setup_logger() + + logger.info("Loading allinfo from %s", args.allinfo) + computed_stats, duplicate_triplets = compute_allinfo_stats(args.allinfo) + logger.info("Computed stats from allinfo: %s", computed_stats) + + logger.info("Loading stats from %s", args.stats) + stats_tsv = load_tsv_config(args.stats) + logger.info("Stats.tsv contains %d entries", len(stats_tsv)) + + # Cross-validate + stats_match = cross_validate(computed_stats, stats_tsv) + + if duplicate_triplets > 0: + logger.error("Found %d duplicate gene-barcode-UMI triplets in allinfo", + duplicate_triplets) + else: + logger.info("No duplicate gene-barcode-UMI triplets found") + + # Write quality report + exit_code = 0 + with open(args.output, "w") as outf: + # Pass through all stats.tsv entries + for key, val in stats_tsv.items(): + outf.write("%s\t%s\n" % (key, val)) + + # Add computed validation metrics + outf.write("duplicate_triplets\t%d\n" % duplicate_triplets) + allinfo_match_val = 1 if stats_match else 0 + outf.write("allinfo_stats_match\t%d\n" % allinfo_match_val) + + logger.info("Quality report written to %s", args.output) + + if not stats_match: + logger.error("FAILED: allinfo stats do not match stats.tsv") + exit_code = 1 + if duplicate_triplets > 0: + logger.error("FAILED: duplicate gene-barcode-UMI triplets detected") + exit_code = 1 + + if exit_code == 0: + logger.info("All checks passed") + + return exit_code + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/misc/assess_barcode_quality.py b/misc/assess_barcode_quality.py index 5db838a0..66a537f5 100644 --- a/misc/assess_barcode_quality.py +++ b/misc/assess_barcode_quality.py @@ -384,7 +384,7 @@ def parse_args(): choices=['tenX_v3', 'visium_hd', 'curio', 'stereo', 'stereo_split'], help='Barcode calling mode') parser.add_argument('--input', '-i', required=True, - help='Input barcode TSV file from detect_barcodes.py') + help='Input barcode TSV file from isoquant_detect_barcodes.py') parser.add_argument('--output', '-o', help='Output metrics TSV file (default: stdout)') parser.add_argument('--barcode_col', type=int, default=1, diff --git a/misc/create_stereo_test_subsets.py b/misc/create_stereo_test_subsets.py new file mode 100644 index 00000000..d72ef807 --- /dev/null +++ b/misc/create_stereo_test_subsets.py @@ -0,0 +1,118 @@ +#!/usr/bin/env python3 +""" +Create stereo-seq barcode subsets and corresponding filtered read files. + +From the full 10M stereo barcode whitelist: +1. Randomly select 1M barcodes -> stereo.1M.tsv +2. Randomly select 80K barcodes (subset of the 1M) -> stereo.80K.tsv +3. Filter reads to keep only those matching each subset + +Read ID format: >READ_____... +Barcode is field 3 (0-indexed, underscore-delimited). +""" + +import os +import random +import sys + +DATA_DIR = "/abga/work/andreyp/ci_isoquant/data/barcodes" + +FULL_BARCODE_FILE = os.path.join(DATA_DIR, "D04620C2.10M.tsv") +FULL_READS_FILE = os.path.join(DATA_DIR, "Mouse.StereoSeq.D04620C2.10M.800K.fa") + +SUBSET_1M_FILE = os.path.join(DATA_DIR, "stereo.1M.tsv") +SUBSET_80K_FILE = os.path.join(DATA_DIR, "stereo.80K.tsv") +READS_1M_FILE = os.path.join(DATA_DIR, "Mouse.StereoSeq.custom_sc.1M.fa") +READS_80K_FILE = os.path.join(DATA_DIR, "Mouse.StereoSeq.custom_sc.80K.fa") + +SUBSET_1M_SIZE = 1000000 +SUBSET_80K_SIZE = 80000 + + +def load_barcodes_with_lines(filepath: str) -> tuple[list[str], list[str]]: + """Load barcodes (first column) and full lines from TSV file.""" + barcodes = [] + lines = [] + with open(filepath) as f: + for line in f: + stripped = line.strip() + if not stripped: + continue + bc = stripped.split()[0] + barcodes.append(bc) + lines.append(line) + return barcodes, lines + + +def extract_barcode_from_read_id(read_id: str) -> str: + """Extract barcode from FASTA read ID. + + Format: >READ_idx_transcript_BARCODE_UMI_... + """ + parts = read_id.lstrip(">").split("_") + if len(parts) >= 4: + return parts[3] + return "" + + +def filter_reads_fasta(input_fasta: str, output_fasta: str, barcode_set: set[str]) -> int: + """Filter FASTA file, keeping reads with barcodes in the given set.""" + kept = 0 + total = 0 + with open(input_fasta) as inf, open(output_fasta, "w") as outf: + keep_current = False + for line in inf: + if line.startswith(">"): + total += 1 + bc = extract_barcode_from_read_id(line.strip()) + keep_current = bc in barcode_set + if keep_current: + kept += 1 + if keep_current: + outf.write(line) + return kept, total + + +def main(): + random.seed(42) + + print("Loading full barcode set...") + barcodes, lines = load_barcodes_with_lines(FULL_BARCODE_FILE) + print(f" Loaded {len(barcodes)} barcodes") + + # Select 1M subset + print(f"Selecting {SUBSET_1M_SIZE} barcodes for 1M subset...") + indices_1m = sorted(random.sample(range(len(barcodes)), SUBSET_1M_SIZE)) + barcodes_1m = set(barcodes[i] for i in indices_1m) + + with open(SUBSET_1M_FILE, "w") as f: + for i in indices_1m: + f.write(lines[i]) + print(f" Written to {SUBSET_1M_FILE}") + + # Select 80K subset from the 1M subset + print(f"Selecting {SUBSET_80K_SIZE} barcodes for 80K subset (from 1M)...") + indices_80k_in_1m = sorted(random.sample(range(len(indices_1m)), SUBSET_80K_SIZE)) + barcodes_80k = set(barcodes[indices_1m[i]] for i in indices_80k_in_1m) + + with open(SUBSET_80K_FILE, "w") as f: + for i in indices_80k_in_1m: + orig_idx = indices_1m[i] + f.write(lines[orig_idx]) + print(f" Written to {SUBSET_80K_FILE}") + + # Filter reads for 1M subset + print("Filtering reads for 1M subset...") + kept_1m, total = filter_reads_fasta(FULL_READS_FILE, READS_1M_FILE, barcodes_1m) + print(f" Kept {kept_1m}/{total} reads -> {READS_1M_FILE}") + + # Filter reads for 80K subset + print("Filtering reads for 80K subset...") + kept_80k, total = filter_reads_fasta(FULL_READS_FILE, READS_80K_FILE, barcodes_80k) + print(f" Kept {kept_80k}/{total} reads -> {READS_80K_FILE}") + + print("Done.") + + +if __name__ == "__main__": + main() diff --git a/misc/create_test_barcodes.py b/misc/create_test_barcodes.py new file mode 100644 index 00000000..a27ef373 --- /dev/null +++ b/misc/create_test_barcodes.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python3 +""" +Generate inflated barcode whitelist files for testing large k-mer indexer paths. + +Appends random unique DNA sequences to existing barcode files to push +the total count above 100K, triggering Array2BitKmerIndexer / Dict2BitKmerIndexer +in the universal barcode calling algorithm. +""" + +import os +import random +import sys + +NUCLEOTIDES = "ACGT" +DATA_DIR = "/abga/work/andreyp/ci_isoquant/data/barcodes" + +# Configuration: (input_file, output_file, dummy_count, barcode_length) +CONFIGS = [ + ("10xMultiome_5K.tsv", "10x.large_barcodes.tsv", 120000, 16), + ("A0079_044_BeadBarcodes.tsv", "curio.large_barcodes.tsv", 50000, 14), + ("visium_v1_hd.slide1.barcodes1.tsv", "visium.large_barcodes1.tsv", 120000, 15), + ("visium_v1_hd.slide1.barcodes2.tsv", "visium.large_barcodes2.tsv", 120000, 14), +] + + +def load_existing_barcodes(filepath: str) -> set[str]: + """Load first column from TSV barcode file.""" + barcodes = set() + with open(filepath) as f: + for line in f: + bc = line.strip().split()[0] + if bc: + barcodes.add(bc) + return barcodes + + +def generate_random_barcode(length: int) -> str: + return "".join(random.choice(NUCLEOTIDES) for _ in range(length)) + + +def generate_unique_barcodes(count: int, length: int, existing: set[str]) -> list[str]: + """Generate random unique barcodes not in the existing set.""" + new_barcodes = [] + seen = set(existing) + while len(new_barcodes) < count: + bc = generate_random_barcode(length) + if bc not in seen: + seen.add(bc) + new_barcodes.append(bc) + return new_barcodes + + +def main(): + random.seed(42) # Reproducible results + + for input_name, output_name, dummy_count, bc_length in CONFIGS: + input_path = os.path.join(DATA_DIR, input_name) + output_path = os.path.join(DATA_DIR, output_name) + + print(f"Processing {input_name} -> {output_name}") + existing = load_existing_barcodes(input_path) + print(f" Loaded {len(existing)} existing barcodes") + + dummy = generate_unique_barcodes(dummy_count, bc_length, existing) + print(f" Generated {len(dummy)} dummy barcodes ({bc_length}bp)") + + # Write output: copy original file + append dummy barcodes + with open(output_path, "w") as out: + with open(input_path) as inp: + for line in inp: + out.write(line) + for bc in dummy: + out.write(bc + "\n") + + total = len(existing) + len(dummy) + print(f" Total: {total} barcodes -> {output_path}") + + print("Done.") + + +if __name__ == "__main__": + main() diff --git a/src/barcode_calling/__init__.py b/src/barcode_calling/__init__.py index f04b78c6..ae7cca81 100644 --- a/src/barcode_calling/__init__.py +++ b/src/barcode_calling/__init__.py @@ -49,6 +49,8 @@ SharedMemoryWrapper, TenXBarcodeDetector, VisiumHDBarcodeDetector, + UniversalSingleMoleculeExtractor, + MoleculeStructure ) # Utilities @@ -82,6 +84,9 @@ # 10x detectors 'TenXBarcodeDetector', 'VisiumHDBarcodeDetector', + # universal calling + 'UniversalSingleMoleculeExtractor', + 'MoleculeStructure', # Utilities 'str_to_2bit', 'bit_to_str', diff --git a/src/barcode_calling/callers/__init__.py b/src/barcode_calling/callers/__init__.py index 4263cf5d..512f1045 100644 --- a/src/barcode_calling/callers/__init__.py +++ b/src/barcode_calling/callers/__init__.py @@ -7,12 +7,16 @@ """ Barcode detection classes for single-cell and spatial transcriptomics. +Protocol: + BarcodeResult: Protocol interface for all detection results + Classes: BarcodeDetectionResult: Base result class - CurioBarcodeDetectionResult: Result for Curio (double barcode) platform - StereoBarcodeDetectionResult: Result for Stereo-seq platform + LinkerBarcodeDetectionResult: Result for platforms with linker sequences + TSOBarcodeDetectionResult: Result for Stereo-seq with TSO detection TenXBarcodeDetectionResult: Result for 10x Genomics platform SplittingBarcodeDetectionResult: Result for read splitting modes + ExtractionResult: Dict-based result for universal molecule extraction ReadStats: Statistics tracker for barcode detection Detectors: @@ -25,8 +29,11 @@ SharedMemoryWrapper: Generic shared memory wrapper TenXBarcodeDetector: 10x Genomics v3 detector VisiumHDBarcodeDetector: Visium HD detector + UniversalSingleMoleculeExtractor: Universal barcode detector for custom molecules """ +from .protocol import BarcodeResult + from .base import ( BarcodeDetectionResult, LinkerBarcodeDetectionResult, @@ -37,6 +44,11 @@ increase_if_valid, ) +from .extraction_result import ( + ExtractionResult, + DetectedElement, +) + from .curio import ( CurioBarcodeDetector, CurioIlluminaDetector, @@ -55,13 +67,21 @@ VisiumHDBarcodeDetector, ) +from .molecule_structure import MoleculeStructure + +from .universal_extraction import UniversalSingleMoleculeExtractor + __all__ = [ + # Protocol + 'BarcodeResult', # Result classes 'BarcodeDetectionResult', 'LinkerBarcodeDetectionResult', 'TSOBarcodeDetectionResult', 'TenXBarcodeDetectionResult', 'SplittingBarcodeDetectionResult', + 'ExtractionResult', + 'DetectedElement', 'ReadStats', 'increase_if_valid', # Curio detectors @@ -76,4 +96,7 @@ # 10x detectors 'TenXBarcodeDetector', 'VisiumHDBarcodeDetector', + # Universal extraction + 'MoleculeStructure', + 'UniversalSingleMoleculeExtractor', ] diff --git a/src/barcode_calling/callers/base.py b/src/barcode_calling/callers/base.py index bc6a9258..d9d3714c 100644 --- a/src/barcode_calling/callers/base.py +++ b/src/barcode_calling/callers/base.py @@ -30,11 +30,15 @@ def increase_if_valid(val: Optional[int], delta: int) -> Optional[int]: return val + + + class BarcodeDetectionResult: """ Base class for barcode detection results. Stores detected barcode, UMI, and quality scores for a single read. + Implements the BarcodeResult protocol. """ NOSEQ = "*" # Sentinel for missing/undetected sequence @@ -53,19 +57,51 @@ def __init__(self, read_id: str, barcode: str = NOSEQ, UMI: str = NOSEQ, strand: Detected strand ('+', '-', or '.') """ self.read_id: str = read_id - self.barcode: str = barcode - if not self.barcode: - self.barcode = BarcodeDetectionResult.NOSEQ - self.UMI: str = UMI - if not self.UMI: - self.UMI = BarcodeDetectionResult.NOSEQ + self._barcode: str = barcode if barcode else BarcodeDetectionResult.NOSEQ + self._umi: str = UMI if UMI else BarcodeDetectionResult.NOSEQ self.BC_score: int = BC_score self.UMI_good: bool = UMI_good self.strand: str = strand + # Primary getters (preferred interface) + def get_barcode(self) -> str: + """Get the detected barcode sequence.""" + return self._barcode + + def get_umi(self) -> str: + """Get the detected UMI sequence.""" + return self._umi + + # Backward-compatible property accessors + @property + def barcode(self) -> str: + """Barcode sequence. Prefer get_barcode() for new code.""" + return self._barcode + + @barcode.setter + def barcode(self, value: str) -> None: + self._barcode = value if value else BarcodeDetectionResult.NOSEQ + + @property + def UMI(self) -> str: + """UMI sequence. Prefer get_umi() for new code.""" + return self._umi + + @UMI.setter + def UMI(self, value: str) -> None: + self._umi = value if value else BarcodeDetectionResult.NOSEQ + def is_valid(self) -> bool: """Check if a valid barcode was detected.""" - return self.barcode != BarcodeDetectionResult.NOSEQ + return self._barcode != BarcodeDetectionResult.NOSEQ + + def has_barcode(self) -> bool: + """Check if a barcode was detected (alias for is_valid()).""" + return self._barcode != BarcodeDetectionResult.NOSEQ + + def has_umi(self) -> bool: + """Check if a valid UMI was detected.""" + return self._umi != BarcodeDetectionResult.NOSEQ def update_coordinates(self, delta: int) -> None: """ @@ -98,12 +134,9 @@ def get_additional_attributes(self) -> List[str]: Get list of detected additional features (primer, linker, etc.). Returns: - List of detected feature names - - Raises: - NotImplementedError: Must be implemented by subclasses + List of detected feature names. Empty list for base class. """ - raise NotImplementedError() + return [] def set_strand(self, strand: str) -> None: """Set the detected strand.""" @@ -111,12 +144,12 @@ def set_strand(self, strand: str) -> None: def __str__(self) -> str: """Format result as TSV line.""" - return "%s\t%s\t%s\t%d\t%s\t%s" % (self.read_id, self.barcode, self.UMI, + return "%s\t%s\t%s\t%d\t%s\t%s" % (self.read_id, self._barcode, self._umi, self.BC_score, self.UMI_good, self.strand) @staticmethod def header() -> str: - """Get TSV header for result output.""" + """Static header for class-level access.""" return "#read_id\tbarcode\tUMI\tBC_score\tvalid_UMI\tstrand" @@ -153,9 +186,6 @@ def __init__(self, read_id: str, barcode: str = BarcodeDetectionResult.NOSEQ, self.linker_end: int = linker_end self.polyT: int = polyT - def is_valid(self) -> bool: - return self.barcode != BarcodeDetectionResult.NOSEQ - def update_coordinates(self, delta: int) -> None: self.primer = increase_if_valid(self.primer, delta) self.linker_start = increase_if_valid(self.linker_start, delta) @@ -181,15 +211,13 @@ def get_additional_attributes(self) -> List[str]: attr.append("Linker detected") return attr - def set_strand(self, strand: str) -> None: - self.strand = strand - def __str__(self) -> str: return (BarcodeDetectionResult.__str__(self) + "\t%d\t%d\t%d\t%d" % (self.polyT, self.primer, self.linker_start, self.linker_end)) @staticmethod def header() -> str: + """Static header for class-level access.""" return BarcodeDetectionResult.header() + "\tpolyT_start\tprimer_end\tlinker_start\tlinker_end" @@ -227,6 +255,7 @@ def get_additional_attributes(self) -> List[str]: @staticmethod def header() -> str: + """Static header for class-level access.""" return LinkerBarcodeDetectionResult.header() + "\tTSO5" @@ -241,9 +270,6 @@ def __init__(self, read_id: str, barcode: str = BarcodeDetectionResult.NOSEQ, self.r1: int = r1 self.polyT: int = polyT - def is_valid(self) -> bool: - return self.barcode != BarcodeDetectionResult.NOSEQ - def update_coordinates(self, delta: int) -> None: self.r1 = increase_if_valid(self.r1, delta) self.polyT = increase_if_valid(self.polyT, delta) @@ -263,23 +289,24 @@ def get_additional_attributes(self) -> List[str]: attr.append("R1 detected") return attr - def set_strand(self, strand: str) -> None: - self.strand = strand - def __str__(self) -> str: return (BarcodeDetectionResult.__str__(self) + "\t%d\t%d" % (self.polyT, self.r1)) @staticmethod def header() -> str: + """Static header for class-level access.""" return BarcodeDetectionResult.header() + "\tpolyT_start\tR1_end" class SplittingBarcodeDetectionResult: """Result container for read splitting modes (multiple barcodes per read).""" + NOSEQ = BarcodeDetectionResult.NOSEQ # For consistency with protocol + def __init__(self, read_id: str): self.read_id: str = read_id + self.strand: str = "." # For protocol compatibility self.detected_patterns: List[TSOBarcodeDetectionResult] = [] def append(self, barcode_detection_result: TSOBarcodeDetectionResult) -> None: @@ -293,7 +320,7 @@ def filter(self) -> None: return barcoded_results = [] for r in self.detected_patterns: - if r.barcode != BarcodeDetectionResult.NOSEQ: + if r.is_valid(): barcoded_results.append(r) if not barcoded_results: @@ -301,8 +328,63 @@ def filter(self) -> None: else: self.detected_patterns = barcoded_results + def get_barcode(self) -> str: + """Get barcode from first valid pattern, or NOSEQ if none.""" + for r in self.detected_patterns: + if r.is_valid(): + return r.get_barcode() + return self.NOSEQ + + def get_umi(self) -> str: + """Get UMI from first valid pattern, or NOSEQ if none.""" + for r in self.detected_patterns: + if r.is_valid(): + return r.get_umi() + return self.NOSEQ + + def is_valid(self) -> bool: + """Check if any pattern has a valid barcode.""" + return any(r.is_valid() for r in self.detected_patterns) + + def has_barcode(self) -> bool: + """Check if any pattern has a valid barcode.""" + return any(r.has_barcode() for r in self.detected_patterns) + + def has_umi(self) -> bool: + """Check if any pattern has a valid UMI.""" + return any(r.has_umi() for r in self.detected_patterns) + + def set_strand(self, strand: str) -> None: + """Set strand for all patterns.""" + self.strand = strand + for r in self.detected_patterns: + r.set_strand(strand) + + def update_coordinates(self, delta: int) -> None: + """Update coordinates for all patterns.""" + for r in self.detected_patterns: + r.update_coordinates(delta) + + def more_informative_than(self, other: 'SplittingBarcodeDetectionResult') -> bool: + """Compare by number of valid patterns.""" + self_valid = sum(1 for r in self.detected_patterns if r.is_valid()) + other_valid = sum(1 for r in other.detected_patterns if r.is_valid()) + return self_valid > other_valid + + def get_additional_attributes(self) -> List[str]: + """Get combined attributes from all patterns.""" + attrs = set() + for r in self.detected_patterns: + attrs.update(r.get_additional_attributes()) + return list(attrs) + + def __str__(self) -> str: + """Format all patterns as TSV lines.""" + return "\n".join(str(r) for r in self.detected_patterns) + @staticmethod def header() -> str: + """Get TSV header for result output.""" return TSOBarcodeDetectionResult.header() @@ -321,22 +403,22 @@ def __init__(self): self.umi_count: int = 0 self.additional_attributes_counts: Dict[str, int] = defaultdict(int) - def add_read(self, barcode_detection_result: BarcodeDetectionResult) -> None: + def add_read(self, barcode_detection_result) -> None: """ Add a read result to statistics. Args: - barcode_detection_result: Detection result to accumulate + barcode_detection_result: Detection result to accumulate (implements BarcodeResult protocol) """ self.read_count += 1 # Count detected features (primer, linker, etc.) for a in barcode_detection_result.get_additional_attributes(): self.additional_attributes_counts[a] += 1 # Count valid barcode - if barcode_detection_result.barcode != BarcodeDetectionResult.NOSEQ: + if barcode_detection_result.has_barcode(): self.bc_count += 1 # Count valid UMI - if barcode_detection_result.UMI_good: + if barcode_detection_result.has_umi(): self.umi_count += 1 def add_custom_stats(self, stat_name: str, val: int) -> None: diff --git a/src/barcode_calling/callers/curio.py b/src/barcode_calling/callers/curio.py index 24434359..df2aa3ca 100644 --- a/src/barcode_calling/callers/curio.py +++ b/src/barcode_calling/callers/curio.py @@ -13,10 +13,11 @@ import logging from typing import List, Optional +from ..indexers import Array2BitKmerIndexer from ..indexers import KmerIndexer, ArrayKmerIndexer from ..common import ( find_polyt_start, reverese_complement, - find_candidate_with_max_score_ssw, detect_exact_positions, + find_candidate_with_max_score_ssw, detect_exact_positions, str_to_2bit, ) from .base import LinkerBarcodeDetectionResult @@ -41,8 +42,7 @@ class CurioBarcodeDetector: TERMINAL_MATCH_DELTA = 2 STRICT_TERMINAL_MATCH_DELTA = 1 - def __init__(self, barcode_list: List[str], umi_list: Optional[List[str]] = None, - min_score: int = 13): + def __init__(self, barcode_list: List[str]): """ Initialize Curio barcode detector. @@ -61,13 +61,15 @@ def __init__(self, barcode_list: List[str], umi_list: Optional[List[str]] = None joint_barcode_list.append(b1 + b2) else: joint_barcode_list = barcode_list - self.barcode_indexer = ArrayKmerIndexer(joint_barcode_list, kmer_size=6) - self.umi_set = None - if umi_list: - self.umi_set = set(umi_list) - logger.debug("Loaded %d UMIs" % len(umi_list)) - self.umi_indexer = KmerIndexer(umi_list, kmer_size=5) - self.min_score = min_score + if len(joint_barcode_list) <= 100000: + self.barcode_indexer = ArrayKmerIndexer(joint_barcode_list, kmer_size=6) + self.max_barcodes_hits = 20 + self.min_matching_kmers = 1 + else: + self.barcode_indexer = Array2BitKmerIndexer([str_to_2bit(b) for b in joint_barcode_list], kmer_size=6, seq_len=self.BC_LENGTH) + self.max_barcodes_hits = 10 + self.min_matching_kmers = 2 + self.min_score = 13 def find_barcode_umi(self, read_id: str, sequence: str) -> LinkerBarcodeDetectionResult: """ @@ -149,7 +151,9 @@ def _find_barcode_umi_fwd(self, read_id: str, sequence: str) -> LinkerBarcodeDet barcode_end = linker_end + self.RIGHT_BC_LENGTH + 1 potential_barcode = sequence[barcode_start:linker_start] + sequence[linker_end + 1:barcode_end + 1] logger.debug("Barcode: %s" % (potential_barcode)) - matching_barcodes = self.barcode_indexer.get_occurrences(potential_barcode) + matching_barcodes = self.barcode_indexer.get_occurrences(potential_barcode, + max_hits=self.max_barcodes_hits, + min_kmers=self.min_matching_kmers) barcode, bc_score, bc_start, bc_end = \ find_candidate_with_max_score_ssw(matching_barcodes, potential_barcode, min_score=self.min_score) @@ -167,19 +171,11 @@ def _find_barcode_umi_fwd(self, read_id: str, sequence: str) -> LinkerBarcodeDet potential_umi = sequence[potential_umi_start:potential_umi_end + 1] logger.debug("Potential UMI: %s" % potential_umi) - umi = None good_umi = False - if self.umi_set: - matching_umis = self.umi_indexer.get_occurrences(potential_umi) - umi, umi_score, umi_start, umi_end = \ - find_candidate_with_max_score_ssw(matching_umis, potential_umi, min_score=7) - logger.debug("Found UMI %s %d-%d" % (umi, umi_start, umi_end)) - - if not umi: - umi = potential_umi - if self.UMI_LEN - self.UMI_LEN_DELTA <= len(umi) <= self.UMI_LEN + self.UMI_LEN_DELTA and \ - all(x != "T" for x in umi[-self.NON_T_UMI_BASES:]): - good_umi = True + umi = potential_umi + if self.UMI_LEN - self.UMI_LEN_DELTA <= len(umi) <= self.UMI_LEN + self.UMI_LEN_DELTA and \ + all(x != "T" for x in umi[-self.NON_T_UMI_BASES:]): + good_umi = True if not umi: return LinkerBarcodeDetectionResult(read_id, barcode, BC_score=bc_score, @@ -193,6 +189,10 @@ def _find_barcode_umi_fwd(self, read_id: str, sequence: str) -> LinkerBarcodeDet def result_type(): return LinkerBarcodeDetectionResult + @classmethod + def header(cls): + return cls.result_type().header() + class CurioIlluminaDetector: """ @@ -215,8 +215,7 @@ class CurioIlluminaDetector: TERMINAL_MATCH_DELTA = 1 STRICT_TERMINAL_MATCH_DELTA = 0 - def __init__(self, joint_barcode_list: List[str], umi_list: Optional[List[str]] = None, - min_score: int = 14): + def __init__(self, joint_barcode_list: List[str]): """ Initialize Illumina Curio detector. @@ -227,13 +226,15 @@ def __init__(self, joint_barcode_list: List[str], umi_list: Optional[List[str]] """ self.pcr_primer_indexer = KmerIndexer([CurioBarcodeDetector.PCR_PRIMER], kmer_size=6) self.linker_indexer = KmerIndexer([CurioBarcodeDetector.LINKER], kmer_size=5) - self.barcode_indexer = KmerIndexer(joint_barcode_list, kmer_size=5) - self.umi_set = None - if umi_list: - self.umi_set = set(umi_list) - logger.debug("Loaded %d UMIs" % len(umi_list)) - self.umi_indexer = KmerIndexer(umi_list, kmer_size=5) - self.min_score = min_score + if len(joint_barcode_list) <= 100000: + self.barcode_indexer = ArrayKmerIndexer(joint_barcode_list, kmer_size=5) + self.max_barcodes_hits = 20 + self.min_matching_kmers = 2 + else: + self.barcode_indexer = Array2BitKmerIndexer([str_to_2bit(b) for b in joint_barcode_list], kmer_size=5, seq_len=self.BC_LENGTH) + self.max_barcodes_hits = 10 + self.min_matching_kmers = 2 + self.min_score = 14 def find_barcode_umi(self, read_id: str, sequence: str) -> LinkerBarcodeDetectionResult: """Detect barcode and UMI.""" @@ -285,7 +286,9 @@ def _find_barcode_umi_fwd(self, read_id: str, sequence: str) -> LinkerBarcodeDet logger.debug("Barcode: %s" % (potential_barcode)) if len(potential_barcode) < self.MIN_BC_LEN: return LinkerBarcodeDetectionResult(read_id, linker_start=linker_start, linker_end=linker_end) - matching_barcodes = self.barcode_indexer.get_occurrences(potential_barcode) + matching_barcodes = self.barcode_indexer.get_occurrences(potential_barcode, + max_hits=self.max_barcodes_hits, + min_kmers=self.min_matching_kmers) barcode, bc_score, bc_start, bc_end = \ find_candidate_with_max_score_ssw(matching_barcodes, potential_barcode, min_score=len(potential_barcode) - 1, score_diff=self.SCORE_DIFF) @@ -304,19 +307,11 @@ def _find_barcode_umi_fwd(self, read_id: str, sequence: str) -> LinkerBarcodeDet potential_umi = sequence[potential_umi_start:min(potential_umi_end + 1, len(sequence))] logger.debug("Potential UMI: %s" % potential_umi) - umi = None good_umi = False - if self.umi_set: - matching_umis = self.umi_indexer.get_occurrences(potential_umi) - umi, umi_score, umi_start, umi_end = \ - find_candidate_with_max_score_ssw(matching_umis, potential_umi, min_score=7) - logger.debug("Found UMI %s %d-%d" % (umi, umi_start, umi_end)) - - if not umi: - umi = potential_umi - if self.UMI_LEN - self.UMI_LEN_DELTA <= len(umi) <= self.UMI_LEN + self.UMI_LEN_DELTA and \ - all(x != "T" for x in umi[-self.NON_T_UMI_BASES:]): - good_umi = True + umi = potential_umi + if self.UMI_LEN - self.UMI_LEN_DELTA <= len(umi) <= self.UMI_LEN + self.UMI_LEN_DELTA and \ + all(x != "T" for x in umi[-self.NON_T_UMI_BASES:]): + good_umi = True if not umi: return LinkerBarcodeDetectionResult(read_id, barcode, BC_score=bc_score, @@ -329,3 +324,8 @@ def _find_barcode_umi_fwd(self, read_id: str, sequence: str) -> LinkerBarcodeDet @staticmethod def result_type(): return LinkerBarcodeDetectionResult + + @classmethod + def header(cls): + return cls.result_type().header() + diff --git a/src/barcode_calling/callers/extraction_result.py b/src/barcode_calling/callers/extraction_result.py new file mode 100644 index 00000000..2c3fb4b7 --- /dev/null +++ b/src/barcode_calling/callers/extraction_result.py @@ -0,0 +1,333 @@ +############################################################################ +# Copyright (c) 2023-2026 University of Helsinki +# # All Rights Reserved +# See file LICENSE for details. +############################################################################ + +""" +Dict-based flexible detection result for universal molecule extraction. + +Unlike the attribute-based result classes in base.py, ExtractionResult stores +detected elements in a dictionary, allowing flexible molecule structures. +Implements the BarcodeResult protocol. +""" + +import logging +from collections import defaultdict +from typing import List + +from .molecule_structure import MoleculeStructure, ElementType + +logger = logging.getLogger('IsoQuant') + + +class DetectedElement: + """Container for a single detected element's coordinates and sequence.""" + + def __init__(self, start: int = -1, end: int = -1, score: int = -1, seq: str = None): + self.start = start + self.end = end + self.score = score + self.seq = seq + + +class ExtractionResult: + """ + Dict-based flexible detection result for universal molecule extraction. + + Unlike the attribute-based result classes (BarcodeDetectionResult, etc.), + ExtractionResult stores detected elements in a dictionary, allowing + flexible molecule structures defined by MoleculeStructure. + + Implements the BarcodeResult protocol. + + Barcode/UMI elements are identified by naming convention: + - Elements with prefix "barcode" (case-insensitive) are barcodes + - Elements with prefix "umi" (case-insensitive) are UMIs + """ + + NOSEQ = "*" # Sentinel for missing/undetected sequence (matches BarcodeDetectionResult.NOSEQ) + + def __init__(self, molecule_structure: MoleculeStructure, read_id: str, strand: str, detected_results: dict): + """ + Initialize extraction result. + + Args: + molecule_structure: Molecule structure definition + read_id: Read identifier + strand: Detected strand ('+', '-', or '.') + detected_results: Dict mapping element names to DetectedElement objects + """ + self.molecule_structure = molecule_structure + self.read_id = read_id + self.strand = strand + self.detected_results = detected_results + + @property + def _barcode_elements(self) -> List[str]: + """Get barcode element names from molecule structure.""" + return self.molecule_structure.barcode_elements + + @property + def _umi_elements(self) -> List[str]: + """Get UMI element names from molecule structure.""" + return self.molecule_structure.umi_elements + + def get_barcode(self) -> str: + """ + Get the detected barcode sequence. + + For molecule structures with multiple barcode elements (e.g., barcode1, barcode2), + returns concatenated barcodes using '|' delimiter only when ALL parts are detected. + Returns NOSEQ if any part is missing (partial barcodes are not usable). + + Returns: + str: Concatenated barcode sequence or NOSEQ if any part is not detected. + """ + parts = [] + for el_name in self._barcode_elements: + if el_name in self.detected_results: + detected = self.detected_results[el_name] + if detected.seq and detected.seq != self.NOSEQ: + parts.append(detected.seq) + continue + return self.NOSEQ + return "|".join(parts) if parts else self.NOSEQ + + def get_umi(self) -> str: + """ + Get the detected UMI sequence. + + For molecule structures with multiple UMI elements, + returns all parts using '|' delimiter, with NOSEQ ('*') for undetected elements. + + Returns: + str: Concatenated UMI sequence or NOSEQ if not detected. + """ + if len(self._umi_elements) <= 1: + for el_name in self._umi_elements: + if el_name in self.detected_results: + detected = self.detected_results[el_name] + if detected.seq and detected.seq != self.NOSEQ: + return detected.seq + return self.NOSEQ + + parts = [] + for el_name in self._umi_elements: + if el_name in self.detected_results: + detected = self.detected_results[el_name] + if detected.seq and detected.seq != self.NOSEQ: + parts.append(detected.seq) + continue + parts.append(self.NOSEQ) + return "|".join(parts) + + def is_valid(self) -> bool: + """Check if a valid barcode was detected (all parts must be present).""" + return self.get_barcode() != self.NOSEQ + + def has_barcode(self) -> bool: + """Check if a barcode was detected (alias for is_valid()).""" + return self.get_barcode() != self.NOSEQ + + def has_umi(self) -> bool: + """Check if any valid UMI was detected.""" + return self.get_umi() != self.NOSEQ + + def get_barcode_score(self) -> int: + """ + Get the best barcode alignment score. + + Returns the maximum score from all barcode elements, or -1 if none detected. + """ + max_score = -1 + for el_name in self._barcode_elements: + if el_name in self.detected_results: + detected = self.detected_results[el_name] + if detected.score > max_score: + max_score = detected.score + return max_score + + def update_coordinates(self, delta: int) -> None: + """ + Shift all genomic coordinates by delta. + + Used when processing read subsequences. + + Args: + delta: Amount to shift coordinates + """ + for detected in self.detected_results.values(): + if detected.start != -1: + detected.start += delta + if detected.end != -1: + detected.end += delta + + def more_informative_than(self, other: 'ExtractionResult') -> bool: + """ + Compare two results to determine which is more informative. + + Compares by number of detected elements, then total score. + + Args: + other: Another detection result + + Returns: + True if this result is more informative than other + """ + self_count = sum(1 for d in self.detected_results.values() if d.start != -1) + other_count = sum(1 for d in other.detected_results.values() if d.start != -1) + if self_count != other_count: + return self_count > other_count + self_score = sum(d.score for d in self.detected_results.values() if d.score > 0) + other_score = sum(d.score for d in other.detected_results.values() if d.score > 0) + return self_score > other_score + + def get_additional_attributes(self) -> List[str]: + """ + Get list of detected supplementary features (excluding barcode/UMI). + + Barcode and UMI elements are excluded because they are already + tracked by ReadStats.bc_count and ReadStats.umi_count respectively. + + Returns: + List of detected element names with " detected" suffix. + """ + barcode_umi_set = set(self._barcode_elements + self._umi_elements) + return ["%s detected" % el_name for el_name, d in self.detected_results.items() + if d.start != -1 and el_name not in barcode_umi_set] + + def set_strand(self, strand: str) -> None: + """Set the detected strand.""" + self.strand = strand + + def _get_supplementary_elements(self) -> List[str]: + """Get list of element names that are not barcode, UMI, or concatenated/duplicated parts.""" + supplementary = [] + barcode_umi_set = set(self._barcode_elements + self._umi_elements) + for el in self.molecule_structure: + if el.element_type == ElementType.cDNA: + continue + if el.element_name in self.molecule_structure.elements_to_concatenate: + continue + if el.element_name in self.molecule_structure.duplicated_elements: + continue + if el.element_name not in barcode_umi_set: + supplementary.append(el.element_name) + return supplementary + + def __str__(self) -> str: + """ + Format result as TSV line in standard format. + + Format: read_id, barcode, UMI, BC_score, valid_UMI, strand, [supplementary elements] + This matches the format expected by the rest of the IsoQuant pipeline. + """ + # Standard fields: read_id, barcode, UMI, BC_score, valid_UMI, strand + res_str = "%s\t%s\t%s\t%d\t%s\t%s" % ( + self.read_id, + self.get_barcode(), + self.get_umi(), + self.get_barcode_score(), + self.has_umi(), + self.strand + ) + + # Supplementary elements (non-barcode, non-UMI) + barcode_umi_set = set(self._barcode_elements + self._umi_elements) + for el in self.molecule_structure: + if el.element_type == ElementType.cDNA: + continue + # Skip concatenated/duplicated parts (data lives under base name) + if el.element_name in self.molecule_structure.elements_to_concatenate: + continue + if el.element_name in self.molecule_structure.duplicated_elements: + continue + # Skip barcode and UMI elements (already printed above) + if el.element_name in barcode_umi_set: + continue + + if el.element_type == ElementType.PolyT: + if ElementType.PolyT.name not in self.detected_results: + res_str += "\t-1\t-1" + continue + detected_element = self.detected_results[ElementType.PolyT.name] + res_str += "\t%d\t%d" % (detected_element.start, detected_element.end) + elif el.element_type.is_constant(): + if el.element_name not in self.detected_results: + res_str += "\t-1\t-1" + continue + detected_element = self.detected_results[el.element_name] + res_str += "\t%d\t%d" % (detected_element.start, detected_element.end) + elif el.element_type.needs_sequence_extraction() or el.element_type.needs_correction(): + if el.element_name not in self.detected_results: + res_str += "\t-1\t-1\t%s" % self.NOSEQ + continue + detected_element = self.detected_results[el.element_name] + seq = detected_element.seq if detected_element.seq else self.NOSEQ + res_str += "\t%d\t%d\t%s" % (detected_element.start, detected_element.end, seq) + return res_str + + def header(self) -> str: + """ + Get TSV header for result output in standard format. + + Format: #read_id, barcode, UMI, BC_score, valid_UMI, strand, [supplementary elements] + """ + # Standard header fields + header = "#read_id\tbarcode\tUMI\tBC_score\tvalid_UMI\tstrand" + + # Supplementary elements (non-barcode, non-UMI) + barcode_umi_set = set(self._barcode_elements + self._umi_elements) + for el in self.molecule_structure: + if el.element_type == ElementType.cDNA: + continue + # Skip concatenated/duplicated parts (data lives under base name) + if el.element_name in self.molecule_structure.elements_to_concatenate: + continue + if el.element_name in self.molecule_structure.duplicated_elements: + continue + # Skip barcode and UMI elements (already in standard header) + if el.element_name in barcode_umi_set: + continue + + if el.element_type == ElementType.PolyT: + header += "\tpolyT_start\tpolyT_end" + elif el.element_type.needs_only_coordinates(): + header += "\t%s_start\t%s_end" % (el.element_name, el.element_name) + elif el.element_type.needs_sequence_extraction() or el.element_type.needs_correction(): + header += "\t%s_start\t%s_end\t%s_seq" % (el.element_name, el.element_name, el.element_name) + return header + + # TODO: add simple serialization + + +class ReadStats: + """Statistics tracker for ExtractionResult objects.""" + + def __init__(self): + self.read_count = 0 + self.bc_count = 0 + self.umi_count = 0 + self.pattern_counts = defaultdict(int) + + def add_read(self, barcode_detection_result: ExtractionResult) -> None: + """Add an ExtractionResult to statistics.""" + self.read_count += 1 + # Count valid barcodes and UMIs + if barcode_detection_result.has_barcode(): + self.bc_count += 1 + if barcode_detection_result.has_umi(): + self.umi_count += 1 + # Count individual detected elements + for el in barcode_detection_result.detected_results: + if barcode_detection_result.detected_results[el].start != -1: + self.pattern_counts[el] += 1 + + def __str__(self) -> str: + human_readable_str = "Total reads:\t%d\n" % self.read_count + human_readable_str += "Barcode detected:\t%d\n" % self.bc_count + human_readable_str += "UMI detected:\t%d\n" % self.umi_count + for a in self.pattern_counts: + human_readable_str += "%s:\t%d\n" % (a, self.pattern_counts[a]) + return human_readable_str diff --git a/src/barcode_calling/callers/molecule_structure.py b/src/barcode_calling/callers/molecule_structure.py new file mode 100644 index 00000000..478bc28e --- /dev/null +++ b/src/barcode_calling/callers/molecule_structure.py @@ -0,0 +1,277 @@ +########################################################################### +# Copyright (c) 2026 University of Helsinki +# # All Rights Reserved +# See file LICENSE for details. +############################################################################ + +import logging +import sys +from enum import unique, Enum +from typing import Iterator, List +from collections import defaultdict + +from ...error_codes import IsoQuantExitCode +from ..common import load_barcodes + +logger = logging.getLogger('IsoQuant') + + +@unique +class ElementType(Enum): + PolyT = 1 + cDNA = 2 + CONST = 10 + VAR_ANY = 20 + VAR_LIST = 21 + VAR_FILE = 22 + VAR_ANY_SEPARATOR = 31 + VAR_ANY_NON_T_SEPARATOR = 32 + + def needs_correction(self): + return self in (ElementType.VAR_FILE, ElementType.VAR_LIST) + + def needs_sequence_extraction(self): + return self == ElementType.VAR_ANY + + def is_base_separator(self): + return self in (ElementType.VAR_ANY_SEPARATOR, ElementType.VAR_ANY_NON_T_SEPARATOR) + + def is_variable(self): + return self in (ElementType.VAR_ANY, ElementType.VAR_LIST, ElementType.VAR_FILE) + + def is_constant(self): + return self == ElementType.CONST + + def needs_only_coordinates(self): + """Return True if element only needs start/end coordinates (no sequence extraction).""" + return self.is_constant() + + +class MoleculeElement: + def __init__(self, element_name: str, element_type: ElementType, element_value_1 = None, element_value_2 = None): + self.element_name = element_name + self.element_type = element_type + + if self.element_type in [ElementType.PolyT, ElementType.cDNA]: + self.element_value = None + self.element_length = -1 + elif self.element_type == ElementType.CONST: + self.element_value = element_value_1 + self.element_length = len(self.element_value) + elif self.element_type == ElementType.VAR_FILE: + self.element_value = load_barcodes(element_value_1, needs_iterator=False) + if element_value_2 is None: + self.element_length = len(self.element_value[0]) + else: + try: + self.element_length = int(element_value_2) + except ValueError: + logger.critical("Incorrectly specified length for element %s: %s" % (self.element_name, element_value_2)) + sys.exit(IsoQuantExitCode.INVALID_FILE_FORMAT) + elif self.element_type == ElementType.VAR_LIST: + self.element_value = element_value_1.split(',') + if element_value_2 is None: + self.element_length = len(self.element_value[0]) + else: + try: + self.element_length = int(element_value_2) + except ValueError: + logger.critical( + "Incorrectly specified length for element %s: %s" % (self.element_name, element_value_2)) + sys.exit(IsoQuantExitCode.INVALID_FILE_FORMAT) + elif self.element_type in (ElementType.VAR_ANY, ElementType.VAR_ANY_SEPARATOR, ElementType.VAR_ANY_NON_T_SEPARATOR): + self.element_value = None + try: + self.element_length = int(element_value_1) + except ValueError: + logger.critical("Incorrectly specified length for element %s: %s" % (self.element_name, element_value_1)) + sys.exit(IsoQuantExitCode.INVALID_FILE_FORMAT) + else: + logger.critical("Wrong element type %s" % element_type) + sys.exit(IsoQuantExitCode.INVALID_FILE_FORMAT) + + +class MoleculeStructure: + """ + Defines the structure of a molecule for barcode/UMI extraction. + + Parses molecule definition files (MDF) and identifies which elements + are barcodes and UMIs based on naming convention: + - Elements with prefix "barcode" (case-insensitive) are barcodes + - Elements with prefix "umi" (case-insensitive) are UMIs + """ + + CONCAT_DELIM = "|" + DUPL_DELIM = "/" + + def __init__(self, str_iterator): + self.ordered_elements: List[MoleculeElement] = [] + self.elements_to_concatenate = {} + self.concatenated_elements_counts = {} + self.duplicated_elements = {} + self.duplicated_elements_counts = {} + # Barcode/UMI element names identified by naming convention + self._barcode_elements: List[str] = [] + self._umi_elements: List[str] = [] + + l = next(str_iterator) + elements = list(map(lambda x: x.strip(), l.strip().split(':'))) + element_properties = {} + for l in str_iterator: + v = l.strip().split() + if not v: + continue + element_name = v[0] + if element_name in element_properties: + logger.critical("Duplicated element name %s" % element_name) + sys.exit(IsoQuantExitCode.INVALID_FILE_FORMAT) + + if len(v) == 3: + element_properties[element_name] = (v[1], v[2], None) + elif len(v) == 4: + element_properties[element_name] = (v[1], v[2], v[3]) + else: + logger.critical("Incorrect number of properties in line %s" % l.strip()) + sys.exit(IsoQuantExitCode.INVALID_FILE_FORMAT) + + element_name: str + for element_name in elements: + if element_name not in element_properties: + if element_name not in (ElementType.cDNA.name, ElementType.PolyT.name): + logger.critical("Molecule element %s was not described in the format file" % element_name) + sys.exit(IsoQuantExitCode.INVALID_FILE_FORMAT) + element_type = ElementType[element_name] + self.ordered_elements.append(MoleculeElement(element_name, element_type)) + else: + element_type, element_val1, element_val2 = element_properties[element_name] + if element_type not in ElementType.__dict__: + logger.critical("Molecule element type %s is not among the possible types" % element_type) + sys.exit(IsoQuantExitCode.INVALID_FILE_FORMAT) + element_type = ElementType[element_type] + self.ordered_elements.append(MoleculeElement(element_name, element_type, element_val1, element_val2)) + + if self.CONCAT_DELIM in element_name: + if not (element_type.needs_sequence_extraction() or element_type.needs_correction()): + logger.critical("Concatenated elements must be variable (fix element %s)" % element_name) + sys.exit(IsoQuantExitCode.INVALID_FILE_FORMAT) + v = element_name.split(self.CONCAT_DELIM) + if len(v) != 2: + logger.critical("Incorrect concatenated element %s" % element_name) + sys.exit(IsoQuantExitCode.INVALID_FILE_FORMAT) + try: + self.elements_to_concatenate[element_name] = (v[0], int(v[1])) + except ValueError: + logger.critical("Incorrectly specified index for concatenated element %s: %s" % (element_name, v[1])) + sys.exit(IsoQuantExitCode.INVALID_FILE_FORMAT) + + elif self.DUPL_DELIM in element_name: + if not (element_type.needs_sequence_extraction() or element_type.needs_correction()): + logger.critical("Concatenated elements must be variable (fix element %s)" % element_name) + sys.exit(IsoQuantExitCode.INVALID_FILE_FORMAT) + v = element_name.split(self.DUPL_DELIM) + if len(v) != 2: + logger.critical("Incorrect duplicated element %s" % element_name) + sys.exit(IsoQuantExitCode.INVALID_FILE_FORMAT) + try: + self.duplicated_elements[element_name] = (v[0], int(v[1])) + except ValueError: + logger.critical("Incorrectly specified index for duplicated element %s: %s" % (element_name, v[1])) + sys.exit(IsoQuantExitCode.INVALID_FILE_FORMAT) + + self._check_linked_elements(self.elements_to_concatenate) + self.concatenated_elements_counts = self._check_indices(self.elements_to_concatenate) + self._check_linked_elements(self.duplicated_elements) + self.duplicated_elements_counts = self._check_indices(self.duplicated_elements) + self._identify_barcode_umi_elements() + + def _check_linked_elements(self, element_dict): + # check that duplicated or concatenated elements have the same values (e.g. barcode whitelist) + # base element name -> List[MoleculeElement] + linked_elements = defaultdict(list) + for el in self.ordered_elements: + if el.element_name in element_dict: + base_element_name = element_dict[el.element_name][0] + linked_elements[base_element_name].append(el) + + for element_list in linked_elements.values(): + if len(set(el.element_type for el in element_list)) != 1: + logger.critical("Linked elements must have the same type (fix elements: %s)" % ", ".join(el.element_name for el in element_list)) + sys.exit(IsoQuantExitCode.INVALID_FILE_FORMAT) + first_value = element_list[0].element_value + for el in element_list[1:]: + if el.element_value != first_value: + logger.critical("Linked elements must have the same values (fix elements: %s)" % ", ".join(e.element_name for e in element_list)) + sys.exit(IsoQuantExitCode.INVALID_FILE_FORMAT) + + def _check_indices(self, element_dict): + # check for consecutive indices + index_dict = defaultdict(list) + for el in element_dict: + base_element_name = element_dict[el][0] + index_dict[base_element_name].append(element_dict[el][1]) + + for el in index_dict: + if sorted(index_dict[el]) != list(range(1, len(index_dict[el]) + 1)): + logger.critical("Concatenated elements must be consecutive from 1 to %d (fix element %s, indices: %s)" % (len(index_dict[el]), el, str(index_dict[el]))) + sys.exit(IsoQuantExitCode.INVALID_FILE_FORMAT) + + return {el: len(val) for el, val in index_dict.items()} + + def _identify_barcode_umi_elements(self) -> None: + """ + Identify barcode/UMI elements by naming convention. + + Elements with prefix "barcode" (case-insensitive) are barcodes. + Elements with prefix "umi" (case-insensitive) are UMIs. + + For concatenated/duplicated elements (e.g., barcode|1, barcode|2), + the base name (e.g., "barcode") is used and deduplicated. + """ + self._barcode_elements = [] + self._umi_elements = [] + seen = set() + for el in self.ordered_elements: + # For concatenated/duplicated elements, use the base name + if el.element_name in self.elements_to_concatenate: + name = self.elements_to_concatenate[el.element_name][0] + elif el.element_name in self.duplicated_elements: + name = self.duplicated_elements[el.element_name][0] + else: + name = el.element_name + + if name in seen: + continue + seen.add(name) + + name_lower = name.lower() + if name_lower.startswith("barcode"): + self._barcode_elements.append(name) + elif name_lower.startswith("umi"): + self._umi_elements.append(name) + + @property + def barcode_elements(self) -> List[str]: + """Get list of barcode element names (identified by 'barcode' prefix).""" + return self._barcode_elements + + @property + def umi_elements(self) -> List[str]: + """Get list of UMI element names (identified by 'umi' prefix).""" + return self._umi_elements + + @classmethod + def from_element_list(cls, element_list): + """Create MoleculeStructure from a list of MoleculeElement objects.""" + ms = cls.__new__(cls) + ms.ordered_elements = element_list + ms.elements_to_concatenate = {} + ms.duplicated_elements = {} + ms._barcode_elements = [] + ms._umi_elements = [] + ms._identify_barcode_umi_elements() + return ms + + def __iter__(self) -> Iterator[MoleculeElement]: + for e in self.ordered_elements: + yield e + diff --git a/src/barcode_calling/callers/protocol.py b/src/barcode_calling/callers/protocol.py new file mode 100644 index 00000000..a76f6717 --- /dev/null +++ b/src/barcode_calling/callers/protocol.py @@ -0,0 +1,191 @@ +############################################################################ +# Copyright (c) 2023-2026 University of Helsinki +# # All Rights Reserved +# See file LICENSE for details. +############################################################################ + +""" +Protocol definition for barcode detection results. + +This module defines the BarcodeResult protocol that all barcode detection +result classes implement. Using Protocol (structural typing) allows +existing classes to automatically conform without explicit inheritance. +""" + +from typing import Protocol, List, ClassVar, runtime_checkable + + +@runtime_checkable +class BarcodeResult(Protocol): + """ + Protocol defining the interface for barcode detection results. + + All barcode detection result classes (BarcodeDetectionResult, + LinkerBarcodeDetectionResult, TSOBarcodeDetectionResult, + TenXBarcodeDetectionResult, ExtractionResult) implement this interface. + + Using Protocol (PEP 544) enables structural subtyping - classes + don't need to explicitly inherit from this protocol to be + considered compatible; they just need to implement the required + methods and attributes. + + Attributes: + NOSEQ: Class variable sentinel for missing/undetected sequence ("*"). + read_id: The unique identifier of the read being analyzed. + strand: The detected strand orientation ('+', '-', or '.'). + + Example: + def process_result(result: BarcodeResult) -> None: + if result.is_valid(): + barcode = result.get_barcode() + umi = result.get_umi() + print(f"{result.read_id}: {barcode}/{umi}") + """ + + NOSEQ: ClassVar[str] + read_id: str + strand: str + + def get_barcode(self) -> str: + """ + Get the detected barcode sequence. + + Returns the barcode sequence if detected, or NOSEQ ("*") if no + barcode was found. For platforms with multiple barcode segments + (e.g., Visium HD with part1|part2), returns the concatenated + barcode with "|" delimiter. + + Returns: + str: Detected barcode sequence or NOSEQ if not found. + """ + ... + + def get_umi(self) -> str: + """ + Get the detected UMI (Unique Molecular Identifier) sequence. + + Returns the UMI sequence if detected, or NOSEQ ("*") if no + UMI was found. For platforms with multiple UMI elements, + returns concatenated UMIs with "|" delimiter. + + Returns: + str: Detected UMI sequence or NOSEQ if not found. + """ + ... + + def is_valid(self) -> bool: + """ + Check if a valid barcode was detected. + + A result is considered valid if get_barcode() returns something + other than NOSEQ. This is the primary method to check if barcode + detection was successful. + + Returns: + bool: True if a valid barcode was detected, False otherwise. + """ + ... + + def has_barcode(self) -> bool: + """ + Check if a barcode was detected (alias for is_valid()). + + Used by ReadStats for counting detected barcodes. + + Returns: + bool: True if barcode was detected. + """ + ... + + def has_umi(self) -> bool: + """ + Check if a valid UMI was detected. + + Returns True if get_umi() returns something other than NOSEQ. + Used by ReadStats for counting detected UMIs. + + Returns: + bool: True if UMI was detected. + """ + ... + + def set_strand(self, strand: str) -> None: + """ + Set the detected strand orientation. + + Args: + strand: Strand orientation. Should be '+' for forward, + '-' for reverse, or '.' for unknown/unstranded. + """ + ... + + def update_coordinates(self, delta: int) -> None: + """ + Shift all genomic coordinates by a delta value. + + Used when processing read subsequences (e.g., after splitting + a concatenated read). Implementations should update all position + attributes (polyT, primer, linker_start, linker_end, etc.) + by adding delta. Coordinates that are -1 (invalid/not detected) + should remain -1. + + Args: + delta: Amount to add to all coordinate values. + """ + ... + + def more_informative_than(self, other: 'BarcodeResult') -> bool: + """ + Compare two results to determine which is more informative. + + Used when both forward and reverse complement orientations + produce results, to select the better one. The comparison + criteria vary by platform but typically consider: + - Barcode alignment score + - Presence of additional features (polyT, primer, linker) + - Feature positions + + Args: + other: Another detection result to compare against. + + Returns: + bool: True if this result is more informative than `other`. + """ + ... + + def get_additional_attributes(self) -> List[str]: + """ + Get list of detected additional features. + + Returns human-readable strings describing which optional + features were detected (e.g., "PolyT detected", "Primer detected", + "Linker detected", "TSO detected", "R1 detected"). + + Used for statistics collection and quality reporting. + + Returns: + List[str]: Names of detected features. Empty list if none. + """ + ... + + def __str__(self) -> str: + """ + Format result as a tab-separated string for TSV output. + + The format should match what header() returns. + + Returns: + str: Tab-separated values for TSV output. + """ + ... + + def header(self) -> str: + """ + Get TSV header line for result output. + + Returns the column names matching the fields output by __str__(). + + Returns: + str: Tab-separated column names with leading '#'. + """ + ... diff --git a/src/barcode_calling/callers/stereo.py b/src/barcode_calling/callers/stereo.py index bdcbe8c0..58a65b96 100644 --- a/src/barcode_calling/callers/stereo.py +++ b/src/barcode_calling/callers/stereo.py @@ -36,13 +36,12 @@ class StereoBarcodeDetector: TERMINAL_MATCH_DELTA = 3 STRICT_TERMINAL_MATCH_DELTA = 1 - def __init__(self, barcodes: List[str], min_score: int = 21): + def __init__(self, barcodes: List[str]): """ Initialize Stereo-seq detector. Args: barcodes: List of known barcode sequences - min_score: Minimum alignment score for barcode matching """ self.main_primer = StereoBarcodeDetector.PC1_PRIMER self.pcr_primer_indexer = ArrayKmerIndexer([self.main_primer], kmer_size=6) @@ -55,8 +54,7 @@ def __init__(self, barcodes: List[str], min_score: int = 21): self.barcode_indexer = Dict2BitKmerIndexer(bit_barcodes, kmer_size=14, seq_len=self.BC_LENGTH) logger.info("Indexed %d barcodes" % len(bit_barcodes)) - self.umi_set = None - self.min_score = min_score + self.min_score = 21 def find_barcode_umi_multiple(self, read_id: str, sequence: str) -> List[LinkerBarcodeDetectionResult]: """Find multiple barcodes in a single read.""" @@ -203,14 +201,18 @@ def _find_barcode_umi_fwd(self, read_id: str, sequence: str) -> LinkerBarcodeDet def result_type(): return LinkerBarcodeDetectionResult + @classmethod + def header(cls): + return cls.result_type().header() + class SharedMemoryStereoBarcodeDetector(StereoBarcodeDetector): """Stereo-seq detector with shared memory for large barcode sets.""" MIN_BARCODES_FOR_SHARED_MEM = 1000000 - def __init__(self, barcodes: List[str], min_score: int = 21): - super().__init__([], min_score=min_score) + def __init__(self, barcodes: List[str]): + super().__init__([]) # Convert to 2-bit encoding first (memory efficient) bit_barcodes = batch_str_to_2bit_chunked(iter(barcodes), seq_len=self.BC_LENGTH) self.barcode_count = len(bit_barcodes) @@ -238,7 +240,7 @@ def __getstate__(self): def __setstate__(self, state): self.min_score = state[0] - super().__init__([], min_score=self.min_score) + super().__init__([]) self.bit_barcodes = None self.barcode_count = state[1] if self.barcode_count < self.MIN_BARCODES_FOR_SHARED_MEM: @@ -262,13 +264,12 @@ class StereoSplittingBarcodeDetector: TERMINAL_MATCH_DELTA = 3 STRICT_TERMINAL_MATCH_DELTA = 1 - def __init__(self, barcodes: List[str], min_score: int = 21): + def __init__(self, barcodes: List[str]): """ Initialize splitting detector. Args: barcodes: List of known barcode sequences - min_score: Minimum alignment score """ self.main_primer = self.PC1_PRIMER self.tso5_indexer = ArrayKmerIndexer([self.TSO5], kmer_size=8) @@ -281,8 +282,7 @@ def __init__(self, barcodes: List[str], min_score: int = 21): bit_barcodes = batch_str_to_2bit_chunked(iter(barcodes), seq_len=self.BC_LENGTH) self.barcode_indexer = Dict2BitKmerIndexer(bit_barcodes, kmer_size=14, seq_len=self.BC_LENGTH) logger.info("Indexed %d barcodes" % len(bit_barcodes)) - self.umi_set = None - self.min_score = min_score + self.min_score = 21 def find_barcode_umi(self, read_id: str, sequence: str) -> SplittingBarcodeDetectionResult: """Find multiple barcodes in a concatenated read.""" @@ -490,14 +490,18 @@ def _find_barcode_umi_fwd(self, read_id: str, sequence: str) -> TSOBarcodeDetect def result_type(): return SplittingBarcodeDetectionResult + @classmethod + def header(cls): + return cls.result_type().header() + class SharedMemoryStereoSplittingBarcodeDetector(StereoSplittingBarcodeDetector): """Stereo-seq splitting detector with shared memory for large barcode sets.""" MIN_BARCODES_FOR_SHARED_MEM = 1000000 - def __init__(self, barcodes: List[str], min_score: int = 21): - super().__init__([], min_score=min_score) + def __init__(self, barcodes: List[str]): + super().__init__([]) # Convert to 2-bit encoding first (memory efficient) bit_barcodes = batch_str_to_2bit_chunked(iter(barcodes), seq_len=self.BC_LENGTH) self.barcode_count = len(bit_barcodes) @@ -525,7 +529,7 @@ def __getstate__(self): def __setstate__(self, state): self.min_score = state[0] - super().__init__([], min_score=self.min_score) + super().__init__([]) self.bit_barcodes = None self.barcode_count = state[1] if self.barcode_count < self.MIN_BARCODES_FOR_SHARED_MEM: @@ -538,18 +542,17 @@ def __setstate__(self, state): class SharedMemoryWrapper: """Generic wrapper to add shared memory support to any barcode detector.""" - def __init__(self, barcode_detector_class, barcodes: List[str], min_score: int = 21): + def __init__(self, barcode_detector_class, barcodes: List[str]): """ Initialize wrapper. Args: barcode_detector_class: Detector class to wrap barcodes: List of known barcodes - min_score: Minimum alignment score """ self.barcode_detector_class = barcode_detector_class - self.min_score = min_score - self.barcode_detector = self.barcode_detector_class([], self.min_score) + self.barcode_detector = self.barcode_detector_class([]) + self.min_score = self.barcode_detector.min_score # Convert to 2-bit and use shared memory indexer barcode_iter = iter(barcodes) if isinstance(barcodes, list) else barcodes bit_barcodes = batch_str_to_2bit_chunked(barcode_iter, seq_len=self.barcode_detector_class.BC_LENGTH) @@ -566,5 +569,6 @@ def __getstate__(self): def __setstate__(self, state): self.barcode_detector_class = state[0] self.min_score = state[1] - self.barcode_detector = self.barcode_detector_class([], self.min_score) + self.barcode_detector = self.barcode_detector_class([]) + self.barcode_detector.min_score = self.min_score self.barcode_detector.barcode_indexer = SharedMemoryArray2BitKmerIndexer.from_sharable_info(state[2]) diff --git a/src/barcode_calling/callers/tenx.py b/src/barcode_calling/callers/tenx.py index 449b7798..5732466d 100644 --- a/src/barcode_calling/callers/tenx.py +++ b/src/barcode_calling/callers/tenx.py @@ -13,11 +13,12 @@ import logging from typing import List, Optional +from .. import ArrayKmerIndexer, Array2BitKmerIndexer from ..indexers import KmerIndexer from ..common import ( find_polyt_start, reverese_complement, find_candidate_with_max_score_ssw, find_candidate_with_max_score_ssw_var_len, - detect_exact_positions, + detect_exact_positions, str_to_2bit, ) from .base import TenXBarcodeDetectionResult @@ -36,7 +37,7 @@ class TenXBarcodeDetector: TERMINAL_MATCH_DELTA = 2 STRICT_TERMINAL_MATCH_DELTA = 1 - def __init__(self, barcode_list: List[str], umi_list: Optional[List[str]] = None): + def __init__(self, barcode_list: List[str]): """ Initialize 10x detector. @@ -45,15 +46,19 @@ def __init__(self, barcode_list: List[str], umi_list: Optional[List[str]] = None umi_list: Optional list of known UMIs for validation """ self.r1_indexer = KmerIndexer([TenXBarcodeDetector.R1], kmer_size=7) - self.barcode_indexer = KmerIndexer(barcode_list, kmer_size=6) - self.umi_set = None - if umi_list: - self.umi_set = set(umi_list) - logger.debug("Loaded %d UMIs" % len(umi_list)) - self.umi_indexer = KmerIndexer(umi_list, kmer_size=5) - self.min_score = 14 - if len(self.barcode_indexer.seq_list) > 100000: - self.min_score = 16 + if len(barcode_list) <= 100000: + self.barcode_indexer = ArrayKmerIndexer(barcode_list, kmer_size=6) + self.max_barcodes_hits = 20 + self.min_matching_kmers = 1 + self.min_score = 14 + self.score_diff = 0 + else: + self.barcode_indexer = Array2BitKmerIndexer([str_to_2bit(b) for b in barcode_list], kmer_size=6, seq_len=self.BARCODE_LEN_10X) + self.max_barcodes_hits = 10 + self.min_matching_kmers = 2 + self.min_score = 15 + self.score_diff = 1 + logger.debug("Min score set to %d" % self.min_score) def find_barcode_umi(self, read_id: str, sequence: str) -> TenXBarcodeDetectionResult: @@ -121,9 +126,13 @@ def _find_barcode_umi_fwd(self, read_id: str, sequence: str) -> TenXBarcodeDetec barcode_end = r1_end + self.BARCODE_LEN_10X + 1 potential_barcode = sequence[barcode_start:barcode_end + 1] logger.debug("Barcode: %s" % (potential_barcode)) - matching_barcodes = self.barcode_indexer.get_occurrences(potential_barcode) + matching_barcodes = self.barcode_indexer.get_occurrences(potential_barcode, + max_hits=self.max_barcodes_hits, + min_kmers=self.min_matching_kmers) barcode, bc_score, bc_start, bc_end = \ - find_candidate_with_max_score_ssw(matching_barcodes, potential_barcode, min_score=self.min_score) + find_candidate_with_max_score_ssw(matching_barcodes, potential_barcode, + min_score=self.min_score, + score_diff=self.score_diff) if barcode is None: return TenXBarcodeDetectionResult(read_id, polyT=polyt_start, r1=r1_end) @@ -137,18 +146,10 @@ def _find_barcode_umi_fwd(self, read_id: str, sequence: str) -> TenXBarcodeDetec potential_umi = sequence[potential_umi_start:potential_umi_end + 1] logger.debug("Potential UMI: %s" % potential_umi) - umi = None good_umi = False - if self.umi_set: - matching_umis = self.umi_indexer.get_occurrences(potential_umi) - umi, umi_score, umi_start, umi_end = \ - find_candidate_with_max_score_ssw(matching_umis, potential_umi, min_score=7) - logger.debug("Found UMI %s %d-%d" % (umi, umi_start, umi_end)) - - if not umi: - umi = potential_umi - if self.UMI_LEN - self.UMI_LEN_DELTA <= len(umi) <= self.UMI_LEN + self.UMI_LEN_DELTA: - good_umi = True + umi = potential_umi + if self.UMI_LEN - self.UMI_LEN_DELTA <= len(umi) <= self.UMI_LEN + self.UMI_LEN_DELTA: + good_umi = True if not umi: return TenXBarcodeDetectionResult(read_id, barcode, BC_score=bc_score, polyT=polyt_start, r1=r1_end) @@ -175,6 +176,10 @@ def find_barcode_umi_no_polya(self, read_id: str, sequence: str) -> TenXBarcodeD def result_type(): return TenXBarcodeDetectionResult + @classmethod + def header(cls): + return cls.result_type().header() + class VisiumHDBarcodeDetector: """Visium HD spatial transcriptomics barcode detector.""" @@ -201,10 +206,13 @@ def __init__(self, barcode_pair_list: List[List[str]]): self.r1_indexer = KmerIndexer([VisiumHDBarcodeDetector.R1], kmer_size=7) self.part1_list = barcode_pair_list[0] self.part2_list = barcode_pair_list[1] - self.part1_barcode_indexer = KmerIndexer(self.part1_list, kmer_size=7) - self.part2_barcode_indexer = KmerIndexer(self.part2_list, kmer_size=7) - self.umi_set = None + + self.part1_barcode_indexer = ArrayKmerIndexer(self.part1_list, kmer_size=7) + self.part2_barcode_indexer = ArrayKmerIndexer(self.part2_list, kmer_size=7) + self.max_barcodes_hits = 20 + self.min_matching_kmers = 2 self.min_score = 13 + logger.debug("Min score set to %d" % self.min_score) def find_barcode_umi(self, read_id: str, sequence: str) -> TenXBarcodeDetectionResult: @@ -344,3 +352,7 @@ def find_barcode_umi_no_polya(self, read_id: str, sequence: str) -> TenXBarcodeD @staticmethod def result_type(): return TenXBarcodeDetectionResult + + @classmethod + def header(cls): + return cls.result_type().header() diff --git a/src/barcode_calling/callers/universal_extraction.py b/src/barcode_calling/callers/universal_extraction.py new file mode 100644 index 00000000..de774ae7 --- /dev/null +++ b/src/barcode_calling/callers/universal_extraction.py @@ -0,0 +1,579 @@ +########################################################################### +# Copyright (c) 2026 University of Helsinki +# # All Rights Reserved +# See file LICENSE for details. +############################################################################ + +import logging +import math +import sys + +from ..indexers import ArrayKmerIndexer, Array2BitKmerIndexer, Dict2BitKmerIndexer, KmerIndexer +from ..common import batch_str_to_2bit +from ...error_codes import IsoQuantExitCode +from .extraction_result import DetectedElement, ExtractionResult +from ..common import find_polyt, reverese_complement, detect_exact_positions, find_candidate_with_max_score_ssw +from .molecule_structure import ElementType, MoleculeStructure, MoleculeElement + +logger = logging.getLogger('IsoQuant') + + +class UniversalSingleMoleculeExtractor: + MIN_SCORE_COEFF = 0.75 + MIN_SCORE_COEFF_TERMMINAL = 0.5 + TERMINAL_MATCH_DELTA = 3 + MAX_LEN_DIFF = 0.25 + DEFAULT_ERROR_RATE = 0.03 # rought estimate for modern nanopore, doesn't matter that much + MAX_HITS = 10 + SCORE_DIFF = 1 + + def __init__(self, molecule_structure: MoleculeStructure): + self.correct_sequences = True + self.molecule_structure = molecule_structure + self.index_dict = {} + self.min_scores = {} + self.has_polyt = False + self.has_cdna = False + self.constant_elements_to_detect = set() + self.elements_to_extract = set() + self.elements_to_correct = set() + + for el in self.molecule_structure: + if el.element_type == ElementType.PolyT: + if self.has_polyt: + logger.critical( + "Current version only supports a single polyT, extraction results may be suboptimal") + sys.exit(IsoQuantExitCode.INVALID_PARAMETER) + self.has_polyt = True + elif el.element_type == ElementType.cDNA: + if self.has_cdna: + logger.critical( + "Current version only supports a single cDNA, reads will not be split into molecules") + sys.exit(IsoQuantExitCode.INVALID_PARAMETER) + self.has_cdna = True + elif el.element_type.is_constant(): + self.constant_elements_to_detect.add(el.element_name) + self.index_dict[el.element_name] = KmerIndexer([el.element_value], + max(6, int(el.element_length / 2) - 2)) + elif el.element_type.needs_sequence_extraction(): + self.elements_to_extract.add(el.element_name) + elif el.element_type.needs_correction(): + if el.element_name in self.molecule_structure.elements_to_concatenate: + # Concatenated parts: skip individual index, handled after loop + pass + elif el.element_name in self.molecule_structure.duplicated_elements: + # Duplicated parts: skip individual index, handled after loop + pass + elif self.correct_sequences: + self.prepare_barcode_index_for_element(el) + self.elements_to_correct.add(el.element_name) + else: + self.elements_to_extract.add(el.element_name) + + # Build one index per concatenated base element using the full whitelist + if self.correct_sequences: + for base_name, count in self.molecule_structure.concatenated_elements_counts.items(): + first_part_el = None + total_length = 0 + for el in self.molecule_structure: + if el.element_name in self.molecule_structure.elements_to_concatenate: + if self.molecule_structure.elements_to_concatenate[el.element_name][0] == base_name: + if first_part_el is None: + first_part_el = el + total_length += el.element_length + if first_part_el is not None and first_part_el.element_value is not None: + barcode_list = first_part_el.element_value + self.prepare_barcode_index(base_name, barcode_list, total_length) + + # Build one index per duplicated base element using the full whitelist + if self.correct_sequences: + for base_name, count in self.molecule_structure.duplicated_elements_counts.items(): + first_part_el = None + for el in self.molecule_structure: + if el.element_name in self.molecule_structure.duplicated_elements: + if self.molecule_structure.duplicated_elements[el.element_name][0] == base_name: + first_part_el = el + break + if first_part_el is not None and first_part_el.element_value is not None: + barcode_list = first_part_el.element_value + self.prepare_barcode_index(base_name, barcode_list, first_part_el.element_length) + + if not self.has_cdna: + logger.critical("Molecule must include a cDNA") + sys.exit(IsoQuantExitCode.INVALID_PARAMETER) + + def prepare_barcode_index(self, base_name, barcode_list, barcode_length): + # Check whether all barcodes have the same length + barcode_lengths = set(len(b) for b in barcode_list) + variable_length = len(barcode_lengths) > 1 + if variable_length: + logger.warning("Barcodes for element %s have variable lengths (%s), " + "performance may be suboptimal" % + (base_name, ", ".join(str(l) for l in sorted(barcode_lengths)))) + barcode_length = min(barcode_lengths) + + barcode_count = len(barcode_list) + error_rate = self.DEFAULT_ERROR_RATE + barcode_sparsity = math.pow(4, barcode_length) / barcode_count + filling_edit_distance = math.ceil(math.log(barcode_sparsity, 8 * barcode_length)) + error_probability = math.pow(1 - error_rate, barcode_length - filling_edit_distance) * math.pow(error_rate, filling_edit_distance) * math.comb(barcode_length, filling_edit_distance) + density = barcode_count * math.pow(8 * barcode_length, filling_edit_distance) / math.pow(4, barcode_length) + if error_probability > 0.01 or density > 1: + filling_edit_distance -= 1 + + if filling_edit_distance == 0: + self.min_scores[base_name] = barcode_length - 1 + self.min_scores[base_name] = barcode_length - filling_edit_distance + if variable_length: + self.min_scores[base_name] = min(self.min_scores[base_name] + 1, barcode_length) + logger.info("Minimal score for element %s is set to %d" % (base_name, self.min_scores[base_name])) + + if barcode_count > 1000000: + logger.warning("The number of barcodes for element %s is too large: %d, barcode calling may take substantial amount of time and RAM", (base_name, barcode_count)) + + kmer_size = min(max(6, int(barcode_length / 2) - 1), 15) + if kmer_size <= 8: + if barcode_count < 100000 or variable_length: + self.index_dict[base_name] = ArrayKmerIndexer(barcode_list, kmer_size) + else: + self.index_dict[base_name] = Array2BitKmerIndexer(batch_str_to_2bit(barcode_list, barcode_length), + kmer_size, seq_len=barcode_length) + else: + if barcode_count < 100000 or variable_length: + self.index_dict[base_name] = KmerIndexer(barcode_list, kmer_size) + else: + self.index_dict[base_name] = Dict2BitKmerIndexer(batch_str_to_2bit(barcode_list, barcode_length), + kmer_size, seq_len=barcode_length) + logger.info("Indexed %d barcodes for element %s" % (barcode_count, base_name)) + + def prepare_barcode_index_for_element(self, element: MoleculeElement): + if element.element_type not in (ElementType.VAR_LIST, ElementType.VAR_FILE): + return + barcode_list = element.element_value + self.prepare_barcode_index(element.element_name, barcode_list, element.element_length) + + def result_type(self): + return ExtractionResult + + def header(self): + """Get header matching ExtractionResult output format.""" + # Create a dummy result to get the correct header format + dummy_result = ExtractionResult(self.molecule_structure, "", ".", {}) + return dummy_result.header() + + def find_barcode_umi(self, read_id, sequence): + detected_elements_fwd = self._find_patterns_fwd(read_id, sequence) + rev_seq = reverese_complement(sequence) + detected_elements_rev = self._find_patterns_fwd(read_id, rev_seq) + + if self.has_polyt: + fwd_has_polyt = ElementType.PolyT.name in detected_elements_fwd + rev_has_polyt = ElementType.PolyT.name in detected_elements_rev + if fwd_has_polyt and not rev_has_polyt: + return ExtractionResult(self.molecule_structure, read_id, '+', detected_elements_fwd) + if not fwd_has_polyt and rev_has_polyt: + return ExtractionResult(self.molecule_structure, read_id, '-', detected_elements_rev) + + if len(detected_elements_fwd) >= len(detected_elements_rev): + return ExtractionResult(self.molecule_structure, read_id, '+', detected_elements_fwd) + return ExtractionResult(self.molecule_structure, read_id, '-', detected_elements_rev) + + def _find_patterns_fwd(self, read_id, sequence): + logger.debug("== read id %s ==" % read_id) + # TODO: make a list of DetectedElement rather that dict of str->DetectedElement for speed + detected_elements = {} + polyt_start, polyt_end = -1, -1 + if self.has_polyt: + polyt_start, polyt_end = find_polyt(sequence) + detected_elements[ElementType.PolyT.name] = DetectedElement(polyt_start, polyt_end, 0) + logger.debug("PolyT %d" % polyt_start) + + self.detect_const_elements(sequence, polyt_start, polyt_end, detected_elements) + concat_element_storage = {} + dupl_element_storage = {} + self.extract_variable_elements5(sequence, detected_elements, concat_element_storage, dupl_element_storage) + self.extract_variable_elements3(sequence, detected_elements, concat_element_storage, dupl_element_storage) + self.process_concatenated_elements(concat_element_storage, detected_elements, sequence) + self.process_duplicated_elements(dupl_element_storage, detected_elements, sequence) + logger.debug("== end read id %s ==" % read_id) + return detected_elements + + def process_concatenated_elements(self, concatenated_element_storage, detected_results, sequence): + for base_name, parts in concatenated_element_storage.items(): + # Check all parts were detected + if any(start == -1 for start, end in parts): + detected_results[base_name] = DetectedElement(-1, -1, -1, ExtractionResult.NOSEQ) + continue + + # Concatenate sequences from all parts + concatenated_seq = "" + for start, end in parts: + concatenated_seq += sequence[start:end + 1] + + # If correction is enabled and an index exists, correct against the whitelist + if self.correct_sequences and base_name in self.index_dict: + matching_sequences = self.index_dict[base_name].get_occurrences(concatenated_seq, max_hits=self.MAX_HITS) + corrected_seq, seq_score, _, _ = find_candidate_with_max_score_ssw( + matching_sequences, concatenated_seq, + min_score=self.min_scores[base_name], score_diff=self.SCORE_DIFF) + if corrected_seq is not None: + detected_results[base_name] = DetectedElement( + parts[0][0], parts[-1][1], seq_score, corrected_seq) + else: + detected_results[base_name] = DetectedElement(-1, -1, -1, ExtractionResult.NOSEQ) + else: + # No correction: just store the raw concatenated sequence + detected_results[base_name] = DetectedElement( + parts[0][0], parts[-1][1], 0, concatenated_seq) + + def process_duplicated_elements(self, duplicated_element_storage, detected_results, sequence): + for base_name, parts in duplicated_element_storage.items(): + # Extract sequences for detected parts + part_seqs = [] + part_coords = [] + for start, end in parts: + if start != -1: + part_seqs.append(sequence[start:end + 1]) + part_coords.append((start, end)) + else: + part_seqs.append(None) + part_coords.append((-1, -1)) + + detected_seqs = [s for s in part_seqs if s is not None] + if not detected_seqs: + detected_results[base_name] = DetectedElement(-1, -1, -1, ExtractionResult.NOSEQ) + continue + + # Use first detected copy's coordinates for reporting + first_detected_idx = next(i for i, s in enumerate(part_seqs) if s is not None) + + if self.correct_sequences and base_name in self.index_dict: + # Correct each copy independently + corrected = [] + for seq in part_seqs: + if seq is None: + corrected.append((None, -1)) + continue + matching = self.index_dict[base_name].get_occurrences(seq, max_hits=self.MAX_HITS) + corrected_seq, score, _, _ = find_candidate_with_max_score_ssw( + matching, seq, min_score=self.min_scores[base_name], score_diff=self.SCORE_DIFF) + corrected.append((corrected_seq, score)) + + successful = [(seq, score) for seq, score in corrected if seq is not None] + + if not successful: + # Neither copy corrected + detected_results[base_name] = DetectedElement(-1, -1, -1, ExtractionResult.NOSEQ) + elif len(successful) == 1: + # One succeeded + seq, score = successful[0] + detected_results[base_name] = DetectedElement( + part_coords[first_detected_idx][0], + part_coords[first_detected_idx][1], + score, seq) + else: + # Multiple succeeded - check agreement + unique_seqs = set(seq for seq, _ in successful) + if len(unique_seqs) == 1: + # All agree + best_score = max(score for _, score in successful) + detected_results[base_name] = DetectedElement( + part_coords[first_detected_idx][0], + part_coords[first_detected_idx][1], + best_score, successful[0][0]) + else: + # Copies disagree after correction + detected_results[base_name] = DetectedElement(-1, -1, -1, ExtractionResult.NOSEQ) + else: + # No correction: compare raw sequences + if len(set(detected_seqs)) == 1: + # All match + detected_results[base_name] = DetectedElement( + part_coords[first_detected_idx][0], + part_coords[first_detected_idx][1], + 0, detected_seqs[0]) + else: + # Differ: report all comma-separated + comma_sep = ",".join(detected_seqs) + detected_results[base_name] = DetectedElement( + part_coords[first_detected_idx][0], + part_coords[first_detected_idx][1], + 0, comma_sep) + + def correct_element(self, element: MoleculeElement, sequence: str, potential_start: int, potential_end: int) -> DetectedElement: + potential_seq = sequence[potential_start:potential_end + 1] + matching_sequences = self.index_dict[element.element_name].get_occurrences(potential_seq, max_hits=self.MAX_HITS) + corrected_seq, seq_score, seq_start, seq_end = \ + find_candidate_with_max_score_ssw(matching_sequences, potential_seq, + min_score=self.min_scores[element.element_name], score_diff=self.SCORE_DIFF) + if corrected_seq is not None: + read_seq_start = potential_start + seq_start + read_seq_end = potential_start + seq_end - 1 + return DetectedElement(read_seq_start, read_seq_end, seq_score, corrected_seq) + else: + return DetectedElement(-1, -1, -1, ExtractionResult.NOSEQ) + + def _backup_concatenated_element(self, element, potential_start, potential_end, element_storage): + base_name, part_index = self.molecule_structure.elements_to_concatenate[element.element_name] + if base_name not in element_storage: + element_storage[base_name] = [(-1, -1) for _ in range( + self.molecule_structure.concatenated_elements_counts[base_name])] + # part_index is 1-based in MDF, convert to 0-based + element_storage[base_name][part_index - 1] = (potential_start, potential_end) + + def _backup_duplicated_element(self, element, potential_start, potential_end, element_storage): + base_name, part_index = self.molecule_structure.duplicated_elements[element.element_name] + if base_name not in element_storage: + element_storage[base_name] = [(-1, -1) for _ in range( + self.molecule_structure.duplicated_elements_counts[base_name])] + # part_index is 1-based in MDF, convert to 0-based + element_storage[base_name][part_index - 1] = (potential_start, potential_end) + + def _detect_variable_element(self, el, sequence, potential_start, potential_end, + detected_elements, concat_element_storage, dupl_element_storage, + len_diff_threshold=None): + potential_len = potential_end - potential_start + 1 + if len_diff_threshold is None or abs(potential_len - el.element_length) <= len_diff_threshold * el.element_length: + if el.element_name in self.molecule_structure.elements_to_concatenate: + self._backup_concatenated_element(el, potential_start, potential_end, concat_element_storage) + return potential_start, potential_end + + if el.element_name in self.molecule_structure.duplicated_elements: + self._backup_duplicated_element(el, potential_start, potential_end, dupl_element_storage) + return potential_start, potential_end + + if el.element_name in self.elements_to_extract: + detected_elements[el.element_name] = DetectedElement(potential_start, potential_end, 0, + seq=sequence[potential_start:potential_end + 1]) + + elif el.element_name in self.elements_to_correct: + detected_element = self.correct_element(el, sequence, potential_start, potential_end) + if detected_element.start != -1: + detected_elements[el.element_name] = detected_element + return detected_element.start, detected_element.end + else: + detected_elements[el.element_name] = DetectedElement(-1, -1, -1, ExtractionResult.NOSEQ) + + return potential_start, potential_end + + def _detect_potential_start(self, element, element_index, potential_end, detected_elements): + if element_index == 0: + potential_start = potential_end - element.element_length + 1 + else: + prev_el = self.molecule_structure.ordered_elements[element_index - 1] + if prev_el.element_name in detected_elements: + potential_start = detected_elements[prev_el].end + 1 + else: + potential_start = potential_end - element.element_length + 1 + if potential_start < 0: + potential_start = 0 + + return potential_start + + def _detect_potential_end(self, sequence, element, element_index, potential_start, detected_elements): + if element_index + 1 == len(self.molecule_structure.ordered_elements): + potential_end = potential_start + element.element_length - 1 + else: + next_el = self.molecule_structure.ordered_elements[element_index + 1] + if next_el.element_name in detected_elements: + potential_end = detected_elements[next_el.element_name].start - 1 + else: + potential_end = potential_start + element.element_length - 1 + if potential_end >= len(sequence): + potential_end = len(sequence) - 1 + return potential_end + + def extract_variable_elements5(self, sequence, detected_elements, concat_element_storage, dupl_element_storage): + first_detected_const_element = None + for i, el in enumerate(self.molecule_structure): + if el.element_type == ElementType.cDNA: + break + if (el.element_type == ElementType.PolyT or el.element_type.is_constant()) and el.element_name in detected_elements: + first_detected_const_element = i + break + + if first_detected_const_element is None: return + + # extracting elements preceding first detected const element + current_pos = detected_elements[ + self.molecule_structure.ordered_elements[first_detected_const_element].element_name].start - 1 + for i in range(first_detected_const_element - 1, -1, -1): + el: MoleculeElement = self.molecule_structure.ordered_elements[i] + + if el.element_type.is_base_separator(): + current_pos -= el.element_length + continue + if not el.element_type.is_variable(): + # we do not skip undetected const elements on the open side (e.g. here we go to the left of the first detected const element) + break + + potential_end = current_pos + potential_start = potential_end - el.element_length + 1 + if potential_start < 0: break + + el_start, _ = self._detect_variable_element(el, sequence, potential_start, potential_end, + detected_elements, concat_element_storage, dupl_element_storage) + current_pos = el_start - 1 + + current_pos = detected_elements[ + self.molecule_structure.ordered_elements[first_detected_const_element].element_name].end + 1 + for i in range(first_detected_const_element + 1, len(self.molecule_structure.ordered_elements)): + if current_pos >= len(sequence): + break + + el: MoleculeElement = self.molecule_structure.ordered_elements[i] + if el.element_type in [ElementType.cDNA, ElementType.PolyT]: + break + elif el.element_type.is_base_separator(): + current_pos += el.element_length + continue + elif el.element_type.is_constant(): + if el.element_name in detected_elements: + current_pos = detected_elements[el.element_name].end + 1 + else: + current_pos += el.element_length + continue + + potential_start = current_pos + potential_end = self._detect_potential_end(sequence, el, i, potential_start, detected_elements) + + _, el_end = self._detect_variable_element(el, sequence, potential_start, potential_end, + detected_elements, concat_element_storage, dupl_element_storage, + len_diff_threshold=self.MAX_LEN_DIFF) + current_pos = el_end + 1 + + + def extract_variable_elements3(self, sequence, detected_elements, concat_element_storage, dupl_element_storage): + last_detected_const_element = None + for i in range(len(self.molecule_structure.ordered_elements) - 1, -1, -1): + el = self.molecule_structure.ordered_elements[i] + if el.element_type == ElementType.cDNA: + break + if (el.element_type == ElementType.PolyT or el.element_type.is_constant()) and el.element_name in detected_elements: + last_detected_const_element = i + break + + if last_detected_const_element is None: return + + # extracting elements following the last detected const element + current_pos = detected_elements[ + self.molecule_structure.ordered_elements[last_detected_const_element].element_name].end + 1 + for i in range(last_detected_const_element + 1, len(self.molecule_structure.ordered_elements)): + el: MoleculeElement = self.molecule_structure.ordered_elements[i] + + if el.element_type.is_base_separator(): + current_pos += el.element_length + continue + if not el.element_type.is_variable(): + # we do not skip undetected const elements on the open side (e.g. here we go to the left of the first detected const element) + break + + potential_start = current_pos + potential_end = potential_start + el.element_length - 1 + if potential_end >= len(sequence): break + + _, el_end = self._detect_variable_element(el, sequence, potential_start, potential_end, + detected_elements, concat_element_storage, dupl_element_storage) + current_pos = el_end + 1 + + current_pos = detected_elements[ + self.molecule_structure.ordered_elements[last_detected_const_element].element_name].start - 1 + for i in range(last_detected_const_element - 1, -1, -1): + if current_pos <= 0: + break + el: MoleculeElement = self.molecule_structure.ordered_elements[i] + + if el.element_type in [ElementType.cDNA, ElementType.PolyT]: + break + elif el.element_type.is_base_separator(): + current_pos -= el.element_length + continue + elif el.element_type.is_constant(): + if el.element_name in detected_elements: + current_pos = detected_elements[el.element_name].start - 1 + else: + current_pos -= el.element_length + continue + + potential_end = current_pos + potential_start = self._detect_potential_start(el, i, potential_end, detected_elements) + + el_start, _ = self._detect_variable_element(el, sequence, potential_start, potential_end, + detected_elements, concat_element_storage, dupl_element_storage, + len_diff_threshold=self.MAX_LEN_DIFF) + current_pos = el_start - 1 + + def detect_const_elements(self, sequence, polyt_start, polyt_end, detected_elements): + # searching left of cDNA + first_element_detected = False + current_search_start = 0 + for el in self.molecule_structure: + if el.element_type in [ElementType.cDNA, ElementType.PolyT]: + break + if el.element_name not in self.constant_elements_to_detect: + continue + + search_start = current_search_start + search_end = len(sequence) if polyt_start == -1 else polyt_start + 1 + element_occurrences = self.index_dict[el.element_name].get_occurrences_substr(sequence, + search_start, + search_end) + if not first_element_detected: + min_score = int(el.element_length * self.MIN_SCORE_COEFF_TERMMINAL) + start_delta = -1 + else: + min_score = int(el.element_length * self.MIN_SCORE_COEFF) + start_delta = self.TERMINAL_MATCH_DELTA + + element_start, element_end = detect_exact_positions(sequence, + search_start, search_end, + self.index_dict[el.element_name].k, + el.element_value, + element_occurrences, + min_score=min_score, + start_delta=start_delta, + end_delta=self.TERMINAL_MATCH_DELTA) + + + if element_start is not None: + current_search_start = element_end + if not first_element_detected: + first_element_detected = True + + detected_elements[el.element_name] = DetectedElement(element_start, element_end) + + last_element_detected = False + current_search_end = len(sequence) + for i in range(len(self.molecule_structure.ordered_elements) - 1, -1, -1): + el = self.molecule_structure.ordered_elements[i] + if el.element_type in [ElementType.cDNA, ElementType.PolyT]: + break + if el.element_name not in self.constant_elements_to_detect: + continue + + search_start = 0 if polyt_start == -1 else polyt_end + 1 + search_end = current_search_end + element_occurrences = self.index_dict[el.element_name].get_occurrences_substr(sequence, search_start, + search_end) + + if not last_element_detected: + min_score = int(el.element_length * self.MIN_SCORE_COEFF_TERMMINAL) + end_delta = -1 + else: + min_score = int(el.element_length * self.MIN_SCORE_COEFF) + end_delta = self.TERMINAL_MATCH_DELTA + + element_start, element_end = detect_exact_positions(sequence, + search_start, search_end, + self.index_dict[el.element_name].k, + el.element_value, + element_occurrences, + min_score=min_score, + start_delta=self.TERMINAL_MATCH_DELTA, + end_delta=end_delta) + if element_start is not None: + current_search_end = element_end + if not last_element_detected: + last_element_detected = True + + detected_elements[el.element_name] = DetectedElement(element_start, element_end) diff --git a/src/barcode_calling/common.py b/src/barcode_calling/common.py index f1b88d7f..6ed04bfc 100644 --- a/src/barcode_calling/common.py +++ b/src/barcode_calling/common.py @@ -4,11 +4,16 @@ # See file LICENSE for details. ############################################################################ import copy +import os +import sys import logging - -import numpy as np +import gzip +import numpy from ssw import AlignmentMgr +from ..error_codes import IsoQuantExitCode + + logger = logging.getLogger('IsoQuant') # Try to import numba for JIT compilation of barcode conversion @@ -23,11 +28,11 @@ @njit(cache=True, parallel=True) def _compute_2bit_numba(encoded, shifts, n_barcodes, seq_len): """Numba JIT-compiled 2-bit encoding computation with parallel execution.""" - result = np.zeros(n_barcodes, dtype=np.uint64) + result = numpy.zeros(n_barcodes, dtype=numpy.uint64) for i in prange(n_barcodes): - val = np.uint64(0) + val = numpy.uint64(0) for pos in range(seq_len): - val |= np.uint64(encoded[i, pos]) << shifts[pos] + val |= numpy.uint64(encoded[i, pos]) << shifts[pos] result[i] = val return result @@ -56,6 +61,48 @@ def find_polyt_start(seq, window_size = 16, polya_fraction = 0.75): return i + max(0, seq[i:].find('TTTT')) +def find_polyt(seq, window_size = 16, polya_fraction = 0.75): + polyA_count = int(window_size * polya_fraction) + + if len(seq) < window_size: + return -1, -1 + i = 0 + a_count = seq[0:window_size].count('T') + while i < len(seq) - window_size: + if a_count >= polyA_count: + break + first_base_a = seq[i] == 'T' + new_base_a = i + window_size < len(seq) and seq[i + window_size] == 'T' + if first_base_a and not new_base_a: + a_count -= 1 + elif not first_base_a and new_base_a: + a_count += 1 + i += 1 + + if i >= len(seq) - window_size: + return -1, -1 + polyt_start = i + max(0, seq[i:].find('TTTT')) + + while i < len(seq) - window_size: + if a_count < polyA_count: + break + first_base_a = seq[i] == 'T' + new_base_a = i + window_size < len(seq) and seq[i + window_size] == 'T' + if first_base_a and not new_base_a: + a_count -= 1 + elif not first_base_a and new_base_a: + a_count += 1 + i += 1 + + if i >= len(seq) - window_size: + polyt_end = len(seq) - 1 + else: + last_t_pos = seq[i:i + window_size].rfind('TTTT') + polyt_end = i + window_size - 1 if last_t_pos == -1 else i + last_t_pos + 4 + + return polyt_start, polyt_end + + base_comp = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N', " ": " "} @@ -181,7 +228,7 @@ def detect_exact_positions(sequence, start, end, kmer_size, pattern, pattern_occ return None, None leftover_bases = len(pattern) - pattern_end - 1 skipped_bases = pattern_start - return start_pos - skipped_bases, end_pos + leftover_bases + return max(0, start_pos - skipped_bases), min(end_pos + leftover_bases, len(sequence) - 1) def detect_first_exact_positions(sequence, start, end, kmer_size, pattern, pattern_occurrences: list, @@ -234,7 +281,7 @@ def str_to_2bit(seq): return kmer_idx -def batch_str_to_2bit(barcodes, seq_len=25): +def batch_str_to_2bit(barcodes, seq_len): """ Convert a list/iterator of barcode strings to 2-bit encoded integers. @@ -249,7 +296,7 @@ def batch_str_to_2bit(barcodes, seq_len=25): numpy.ndarray of uint64 containing 2-bit encoded barcodes """ # Convert iterator to list if needed, reading in chunks to show progress - if not isinstance(barcodes, (list, np.ndarray)): + if not isinstance(barcodes, (list, numpy.ndarray)): logger.info("Loading barcodes from iterator...") barcode_list = [] chunk_size = 10_000_000 @@ -262,36 +309,36 @@ def batch_str_to_2bit(barcodes, seq_len=25): n_barcodes = len(barcodes) if n_barcodes == 0: - return np.array([], dtype=np.uint64) + return numpy.array([], dtype=numpy.uint64) logger.info("Converting %d barcodes to 2-bit encoding..." % n_barcodes) # Fast path: join all barcodes into single string, convert to bytes at once # This avoids Python loop overhead for encode() all_bytes = ''.join(barcodes).encode('ascii') - byte_array = np.frombuffer(all_bytes, dtype=np.uint8).reshape(n_barcodes, seq_len) + byte_array = numpy.frombuffer(all_bytes, dtype=numpy.uint8).reshape(n_barcodes, seq_len) # Apply the encoding: (ord(char) & 6) >> 1 # A=65 -> 0, C=67 -> 1, T=84 -> 2, G=71 -> 3 encoded = (byte_array & 6) >> 1 # Compute bit shifts for each position (highest position = leftmost) - shifts = np.arange(seq_len - 1, -1, -1, dtype=np.uint64) * 2 + shifts = numpy.arange(seq_len - 1, -1, -1, dtype=numpy.uint64) * 2 # Compute 2-bit encoding - use Numba if available for parallel execution if NUMBA_AVAILABLE: result = _compute_2bit_numba(encoded, shifts, n_barcodes, seq_len) else: # Fallback: vectorized NumPy loop - result = np.zeros(n_barcodes, dtype=np.uint64) + result = numpy.zeros(n_barcodes, dtype=numpy.uint64) for pos in range(seq_len): - result |= encoded[:, pos].astype(np.uint64) << shifts[pos] + result |= encoded[:, pos].astype(numpy.uint64) << shifts[pos] logger.info("Barcode conversion complete") return result -def batch_str_to_2bit_chunked(barcode_iterator, seq_len=25, chunk_size=50_000_000): +def batch_str_to_2bit_chunked(barcode_iterator, seq_len, chunk_size=50_000_000): """ Convert barcodes to 2-bit encoding in memory-efficient chunks. @@ -331,33 +378,33 @@ def batch_str_to_2bit_chunked(barcode_iterator, seq_len=25, chunk_size=50_000_00 logger.info("Barcode conversion complete: %d barcodes" % total_processed) if not all_results: - return np.array([], dtype=np.uint64) - return np.concatenate(all_results) + return numpy.array([], dtype=numpy.uint64) + return numpy.concatenate(all_results) def _convert_chunk_to_2bit(barcodes, seq_len): """Convert a chunk of barcodes to 2-bit encoding (internal helper).""" n_barcodes = len(barcodes) if n_barcodes == 0: - return np.array([], dtype=np.uint64) + return numpy.array([], dtype=numpy.uint64) # Fast path: join all barcodes into single string, convert to bytes at once all_bytes = ''.join(barcodes).encode('ascii') - byte_array = np.frombuffer(all_bytes, dtype=np.uint8).reshape(n_barcodes, seq_len) + byte_array = numpy.frombuffer(all_bytes, dtype=numpy.uint8).reshape(n_barcodes, seq_len) # Apply encoding: (ord(char) & 6) >> 1 encoded = (byte_array & 6) >> 1 # Compute shifts - shifts = np.arange(seq_len - 1, -1, -1, dtype=np.uint64) * 2 + shifts = numpy.arange(seq_len - 1, -1, -1, dtype=numpy.uint64) * 2 # Compute result - use Numba if available for parallel execution if NUMBA_AVAILABLE: result = _compute_2bit_numba(encoded, shifts, n_barcodes, seq_len) else: - result = np.zeros(n_barcodes, dtype=np.uint64) + result = numpy.zeros(n_barcodes, dtype=numpy.uint64) for pos in range(seq_len): - result |= encoded[:, pos].astype(np.uint64) << shifts[pos] + result |= encoded[:, pos].astype(numpy.uint64) << shifts[pos] return result @@ -369,3 +416,36 @@ def bit_to_str(seq, seq_len): for i in range(seq_len): str_seq += BIN2NUCL[(seq >> ((seq_len - i - 1) * 2)) & 3] return str_seq + + + +def load_h5_barcodes_bit(h5_file_path, dataset_name='bpMatrix_1'): + raise NotImplementedError() + import h5py + barcode_list = [] + with h5py.File(h5_file_path, 'r') as h5_file: + dataset = numpy.array(h5_file[dataset_name]) + for row in dataset: + for col in row: + barcode_list.append(bit_to_str(int(col[0]))) + return barcode_list + + +def load_barcodes(inf, needs_iterator=False): + if not os.path.isfile(inf): + logger.critical("Barcode file '%s' does not exist.", inf) + sys.exit(IsoQuantExitCode.INPUT_FILE_NOT_FOUND) + + if inf.endswith("h5") or inf.endswith("hdf5"): + return load_h5_barcodes_bit(inf) + + if inf.endswith("gz") or inf.endswith("gzip"): + handle = gzip.open(inf, "rt") + else: + handle = open(inf, "r") + + barcode_iterator = iter(l.strip().split()[0] for l in handle) + if needs_iterator: + return barcode_iterator + + return list(barcode_iterator) diff --git a/src/barcode_calling/indexers/base.py b/src/barcode_calling/indexers/base.py index 1f190f7b..5e3ba232 100644 --- a/src/barcode_calling/indexers/base.py +++ b/src/barcode_calling/indexers/base.py @@ -108,6 +108,57 @@ def get_occurrences(self, sequence: str, max_hits: int = 0, min_kmers: int = 1, return result return result[:max_hits] + def _get_kmers_substr(self, seq, start, end): + if end - start + 1 < self.k: + return + kmer = seq[start:start + self.k] + yield kmer + for i in range(start + self.k, min(len(seq), end + 1)): + kmer = kmer[1:] + seq[i] + yield kmer + + # [start, end] + def get_occurrences_substr(self, sequence, start, end, max_hits=0, min_kmers=1, hits_delta=1) -> List[Tuple[str, int, List[int]]]: + """ + Find indexed sequences with shared k-mers. + + Args: + sequence: Query sequence to search + max_hits: Maximum number of results (0 = unlimited) + min_kmers: Minimum shared k-mers required + hits_delta: Include results within this many k-mers of top hit + ignore_equal: Skip exact matches + + Returns: + List of (sequence, shared_kmer_count, kmer_positions) tuples, + sorted by shared k-mer count (descending) + """ + barcode_counts: DefaultDict[int, int] = defaultdict(int) + barcode_positions: DefaultDict[int, List[int]] = defaultdict(list) + + for pos, kmer in enumerate(self._get_kmers_substr(sequence, start, end)): + for i in self.index[kmer]: + barcode_counts[i] += 1 + barcode_positions[i].append(pos) + + result = [] + for i in barcode_counts.keys(): + count = barcode_counts[i] + if count < min_kmers: + continue + result.append((self.seq_list[i], count, barcode_positions[i])) + + if not result: + return [] + + top_hits = max(result, key=lambda x: x[1])[1] + result = filter(lambda x: x[1] >= top_hits - hits_delta, result) + result = list(sorted(result, reverse=True, key=lambda x: x[1])) + + if max_hits == 0: + return result + return result[:max_hits] + class ArrayKmerIndexer: """ diff --git a/src/barcode_calling/indexers/two_bit.py b/src/barcode_calling/indexers/two_bit.py index 2c5d5290..7c9dbbb9 100644 --- a/src/barcode_calling/indexers/two_bit.py +++ b/src/barcode_calling/indexers/two_bit.py @@ -26,7 +26,7 @@ class Dict2BitKmerIndexer: Best for medium to large barcode sets. """ - def __init__(self, known_bit_seqs: Iterable[int], kmer_size: int = 6, seq_len: int = 25): + def __init__(self, known_bit_seqs: Iterable[int], kmer_size: int, seq_len: int): """ Initialize k-mer index with 2-bit encoded sequences. @@ -131,7 +131,7 @@ class Array2BitKmerIndexer: Best for large barcode sets (e.g., single-cell whitelists). """ - def __init__(self, known_bin_seq: Iterable[int], kmer_size: int = 12, seq_len: int = 25): + def __init__(self, known_bin_seq: Iterable[int], kmer_size: int, seq_len: int): """ Initialize 2-bit k-mer index. diff --git a/src/dataset_processor.py b/src/dataset_processor.py index 9182c06c..026b6f6f 100644 --- a/src/dataset_processor.py +++ b/src/dataset_processor.py @@ -133,10 +133,11 @@ def process_sample(self, sample): len(sample.file_list) > 1 and self.args.read_group is not None and "file_name" in self.args.read_group) - if sample.use_technical_replicas: - logger.info("Technical replicas filtering enabled: novel transcripts must be confirmed by at least 2 files") - elif len(sample.file_list) > 1 and not self.args.use_replicas: - logger.info("Technical replicas filtering disabled by --use_replicas false") + if not self.args.no_model_construction: + if sample.use_technical_replicas: + logger.info("Technical replicas filtering enabled: novel transcripts must be confirmed by at least 2 files") + elif len(sample.file_list) > 1 and not self.args.use_replicas: + logger.info("Technical replicas filtering disabled by --use_replicas false") # Initialize all_read_groups as list of sets (one per grouping strategy) self.all_read_groups = [set() for _ in self.grouping_strategy_names] @@ -496,7 +497,8 @@ def filter_umis(self, sample): IsoQuantMode.curio: [3], IsoQuantMode.visium_hd: [4], IsoQuantMode.stereoseq: [4], - IsoQuantMode.stereoseq_nosplit: [4]} + IsoQuantMode.stereoseq_nosplit: [4], + IsoQuantMode.custom_sc: [4]} for i, edit_distance in enumerate(umi_ed_dict[self.args.mode]): logger.info("Filtering PCR duplicates with edit distance %d" % edit_distance) diff --git a/src/modes.py b/src/modes.py index 632667f7..ff83ce83 100644 --- a/src/modes.py +++ b/src/modes.py @@ -16,6 +16,7 @@ class IsoQuantMode(Enum): stereoseq = 5 visium_hd = 6 visium_5prime = 7 + custom_sc = 10 def needs_barcode_calling(self): return self in [IsoQuantMode.tenX_v3, @@ -23,7 +24,8 @@ def needs_barcode_calling(self): IsoQuantMode.stereoseq_nosplit, IsoQuantMode.stereoseq, IsoQuantMode.visium_hd, - IsoQuantMode.visium_5prime] + IsoQuantMode.visium_5prime, + IsoQuantMode.custom_sc] def needs_pcr_deduplication(self): return self in [IsoQuantMode.tenX_v3, @@ -31,7 +33,8 @@ def needs_pcr_deduplication(self): IsoQuantMode.stereoseq_nosplit, IsoQuantMode.stereoseq, IsoQuantMode.visium_hd, - IsoQuantMode.visium_5prime] + IsoQuantMode.visium_5prime, + IsoQuantMode.custom_sc] def produces_new_fasta(self): return self in [IsoQuantMode.stereoseq] diff --git a/tests/github/barcode_test_templates/stereo_single.cfg b/tests/github/barcode_test_templates/stereo_single.cfg index dd2b4b42..9a9ae74e 100644 --- a/tests/github/barcode_test_templates/stereo_single.cfg +++ b/tests/github/barcode_test_templates/stereo_single.cfg @@ -12,14 +12,14 @@ run_type single reads /path/to/simulated_reads.fq.gz barcodes /path/to/whitelist.txt -# Barcode calling mode (for detect_barcodes.py) +# Barcode calling mode (for isoquant_detect_barcodes.py) mode stereoseq # Optional: number of threads threads 8 -# Optional: additional detect_barcodes.py options -# detect_options "--min_score 21" +# Optional: additional isoquant_detect_barcodes.py options +# detect_options "" # Quality assessment mode (for assess_barcode_quality.py) # If not specified, derived from 'mode' diff --git a/tests/github/barcode_test_templates/stereo_split.cfg b/tests/github/barcode_test_templates/stereo_split.cfg index 5d3061cb..38959b69 100644 --- a/tests/github/barcode_test_templates/stereo_split.cfg +++ b/tests/github/barcode_test_templates/stereo_split.cfg @@ -18,7 +18,7 @@ mode stereoseq_split # Optional: number of threads threads 8 -# Optional: additional detect_barcodes.py options +# Optional: additional isoquant_detect_barcodes.py options # detect_options "" # Quality assessment mode diff --git a/tests/github/run_barcode_test.py b/tests/github/run_barcode_test.py index 6a49b845..a08d45dc 100644 --- a/tests/github/run_barcode_test.py +++ b/tests/github/run_barcode_test.py @@ -8,7 +8,7 @@ """ Test runner for barcode calling quality assessment. -Takes a configuration file as input, runs detect_barcodes.py on simulated data, +Takes a configuration file as input, runs isoquant_detect_barcodes.py on simulated data, then runs quality assessment and compares results against etalon values. Usage: @@ -91,7 +91,7 @@ def parse_args(): parser.add_argument("--output", "-o", type=str, help="Output directory (overrides config)") parser.add_argument("--additional_options", "-a", type=str, - help="Additional options for detect_barcodes.py") + help="Additional options for isoquant_detect_barcodes.py") parser.add_argument("--debug", "-d", action="store_true", help="Enable debug output") @@ -99,13 +99,13 @@ def parse_args(): def run_detect_barcodes(args, config_dict): - """Run detect_barcodes.py with configuration options.""" + """Run isoquant_detect_barcodes.py with configuration options.""" source_dir = os.path.dirname(os.path.realpath(__file__)) isoquant_dir = os.path.join(source_dir, "../../") config_file = args.config_file run_name = config_dict["name"] - # detect_barcodes.py uses -o as a file prefix, not a directory + # isoquant_detect_barcodes.py uses -o as a file prefix, not a directory # Output will be: {output_folder}/{run_name}.barcoded_reads.tsv output_folder = args.output if args.output else config_dict["output"] output_prefix = os.path.join(output_folder, run_name) @@ -113,11 +113,11 @@ def run_detect_barcodes(args, config_dict): if not os.path.exists(output_folder): os.makedirs(output_folder) - log.info('== Running detect_barcodes.py ==') + log.info('== Running isoquant_detect_barcodes.py ==') # Build command detect_command = [ - "python3", os.path.join(isoquant_dir, "detect_barcodes.py"), + "python3", os.path.join(isoquant_dir, "isoquant_detect_barcodes.py"), "-o", output_prefix, "--mode", config_dict["mode"] ] @@ -139,6 +139,11 @@ def run_detect_barcodes(args, config_dict): barcode_files = fix_paths(config_file, config_dict["barcodes"]) detect_command.extend(["--barcodes"] + barcode_files) + # Add molecule description file (for custom_sc mode) + if "molecule" in config_dict: + molecule_file = fix_path(config_file, config_dict["molecule"]) + detect_command.extend(["--molecule", molecule_file]) + # Add optional parameters if "threads" in config_dict: detect_command.extend(["-t", config_dict["threads"]]) @@ -156,7 +161,7 @@ def run_detect_barcodes(args, config_dict): result = subprocess.run(detect_command) if result.returncode != 0: - log.error("detect_barcodes.py exited with non-zero status: %d" % result.returncode) + log.error("isoquant_detect_barcodes.py exited with non-zero status: %d" % result.returncode) return -11 return 0 @@ -173,7 +178,7 @@ def run_barcode_quality(args, config_dict): output_prefix = os.path.join(output_folder, run_name) # Find barcode output file - # detect_barcodes.py outputs: {prefix}.barcoded_reads.tsv + # isoquant_detect_barcodes.py outputs: {prefix}.barcoded_reads.tsv barcode_output = output_prefix + ".barcoded_reads.tsv" if not os.path.exists(barcode_output): barcode_output = output_prefix + ".barcoded_reads.tsv.gz" diff --git a/tests/github/run_pipeline.py b/tests/github/run_pipeline.py index e749932a..3259e77a 100755 --- a/tests/github/run_pipeline.py +++ b/tests/github/run_pipeline.py @@ -31,6 +31,7 @@ RT_QUANTIFICATION_NOVEL = "quantification_novel" RT_QUANTIFICATION_GENES = "quantification_genes" RT_PERFORMANCE = "performance" +RT_ALLINFO = "allinfo" log = logging.getLogger('GitHubRunner') @@ -433,6 +434,78 @@ def run_performance_assessment(args, config_dict): return exit_code +def run_allinfo_quality(args, config_dict): + log.info('== Running allinfo quality assessment ==') + config_file = args.config_file + source_dir = os.path.dirname(os.path.realpath(__file__)) + isoquant_dir = os.path.join(source_dir, "../../") + + run_name = config_dict["name"] + label = config_dict["label"] + output_folder = os.path.join(args.output if args.output else config_dict["output"], run_name) + internal_output_dir = os.path.join(output_folder, label) + + # Edit distance from config or default (3 for 10x/curio, 4 for stereo/visium_hd) + edit_distance = int(config_dict.get("edit_distance", "3")) + allinfo_file = os.path.join(internal_output_dir, "%s.UMI_filtered.ED%d.allinfo" % (label, edit_distance)) + stats_file = os.path.join(internal_output_dir, "%s.UMI_filtered.ED%d.stats.tsv" % (label, edit_distance)) + + # Support gzipped allinfo + if not os.path.exists(allinfo_file) and os.path.exists(allinfo_file + ".gz"): + allinfo_file = allinfo_file + ".gz" + + if not os.path.exists(allinfo_file): + log.error("Allinfo file not found: %s" % allinfo_file) + return -5 + if not os.path.exists(stats_file): + log.error("Stats file not found: %s" % stats_file) + return -5 + + quality_report = os.path.join(output_folder, "allinfo_quality_report.tsv") + + qa_command = ["python3", os.path.join(isoquant_dir, "misc/assess_allinfo_quality.py"), + "--allinfo", allinfo_file, "--stats", stats_file, + "--output", quality_report] + + log.info("Running: %s" % " ".join(qa_command)) + result = subprocess.run(qa_command) + if result.returncode != 0: + log.error("Allinfo QA script failed with code %d" % result.returncode) + return -11 + + # Compare against etalon if provided + if "etalon_allinfo" not in config_dict: + return 0 + + etalon_file = fix_path(config_file, config_dict["etalon_allinfo"]) + if not os.path.exists(etalon_file): + log.error("Etalon file not found: %s" % etalon_file) + return -35 + + etalon_dict = load_tsv_config(etalon_file) + quality_dict = load_tsv_config(quality_report) + + exit_code = 0 + tolerance = float(config_dict.get("tolerance", "0.01")) + new_etalon_path = os.path.join(output_folder, "new_allinfo_etalon.tsv") + with open(new_etalon_path, "w") as new_etalon_outf: + for metric_name, val_str in quality_dict.items(): + new_etalon_outf.write("%s\t%s\n" % (metric_name, val_str)) + + for metric_name, etalon_val_str in etalon_dict.items(): + etalon_val = float(etalon_val_str) + if metric_name not in quality_dict: + log.error("Metric %s not found in quality report" % metric_name) + exit_code = -36 + continue + output_val = float(quality_dict[metric_name]) + err = check_value(etalon_val, output_val, metric_name, tolerance) + if err != 0: + exit_code = err + + return exit_code + + def check_output_files(out_dir, label, file_list): missing_files = [] internal_output_dir = os.path.join(out_dir, label) @@ -488,6 +561,8 @@ def main(): err_codes.append(run_quantification(args, config_dict, "gene")) if RT_PERFORMANCE in run_types: err_codes.append(run_performance_assessment(args, config_dict)) + if RT_ALLINFO in run_types: + err_codes.append(run_allinfo_quality(args, config_dict)) if "check_input_files" in config_dict: files_list = config_dict["check_input_files"].split() diff --git a/tests/test_barcode_callers.py b/tests/test_barcode_callers.py index a0b1cbf0..e5fdc5eb 100644 --- a/tests/test_barcode_callers.py +++ b/tests/test_barcode_callers.py @@ -100,6 +100,7 @@ def test_str_format(self): def test_header(self): """Test TSV header.""" + # header() is a static method header = BarcodeDetectionResult.header() assert "#read_id" in header assert "barcode" in header @@ -407,6 +408,7 @@ def test_add_read_with_barcode(self): result = LinkerBarcodeDetectionResult( "read_001", barcode="ACTG", + UMI="GGGGGGGG", # Need actual UMI for has_umi() to return True UMI_good=True, polyT=100 ) @@ -458,7 +460,8 @@ def test_add_custom_stats(self): def test_str_format(self): """Test string formatting.""" stats = ReadStats() - result = LinkerBarcodeDetectionResult("read_001", barcode="ACTG", UMI_good=True) + # Need actual UMI for has_umi() to return True + result = LinkerBarcodeDetectionResult("read_001", barcode="ACTG", UMI="GGGGGGGG", UMI_good=True) stats.add_read(result) output = str(stats) diff --git a/tests/test_barcode_common.py b/tests/test_barcode_common.py index 19a89533..2e04a94d 100644 --- a/tests/test_barcode_common.py +++ b/tests/test_barcode_common.py @@ -303,7 +303,7 @@ class TestBatchStrTo2Bit: def test_empty_input(self): """Test empty input returns empty array.""" import numpy as np - result = batch_str_to_2bit([]) + result = batch_str_to_2bit([], 25) assert len(result) == 0 assert result.dtype == np.uint64 @@ -361,7 +361,7 @@ class TestBatchStrTo2BitChunked: def test_empty_input(self): """Test empty iterator returns empty array.""" import numpy as np - result = batch_str_to_2bit_chunked(iter([])) + result = batch_str_to_2bit_chunked(iter([]), 25) assert len(result) == 0 assert result.dtype == np.uint64 diff --git a/tests/test_barcode_detectors.py b/tests/test_barcode_detectors.py index fc09bba3..a08b7754 100644 --- a/tests/test_barcode_detectors.py +++ b/tests/test_barcode_detectors.py @@ -93,18 +93,18 @@ class TestStereoBarcodeDetector: def test_init(self): """Test basic initialization.""" - detector = StereoBarcodeDetector(DUMMY_BARCODES_25, min_score=21) + detector = StereoBarcodeDetector(DUMMY_BARCODES_25) assert detector.barcode_indexer is not None assert detector.min_score == 21 def test_init_empty_barcodes(self): """Test initialization with empty barcode list.""" - detector = StereoBarcodeDetector([], min_score=21) + detector = StereoBarcodeDetector([]) assert detector.barcode_indexer is None def test_no_match_random_sequence(self): """Test that random sequence returns no valid barcode.""" - detector = StereoBarcodeDetector(DUMMY_BARCODES_25, min_score=21) + detector = StereoBarcodeDetector(DUMMY_BARCODES_25) result = detector.find_barcode_umi("test_read", RANDOM_SEQUENCE) assert isinstance(result, (LinkerBarcodeDetectionResult, TSOBarcodeDetectionResult)) @@ -141,7 +141,7 @@ def test_real_sequence(self, read_sequence, true_barcode, true_umi): "CATCAAAGGTTCACCGAATCTCCAC", "GTCGCCTTCAACGTATACTGATATA", "ACCAATTAGCAGCCATGGGAACTCC"] - detector = StereoBarcodeDetector(barcodes, min_score=21) + detector = StereoBarcodeDetector(barcodes) result = detector.find_barcode_umi("test_read", read_sequence) assert isinstance(result, (LinkerBarcodeDetectionResult, TSOBarcodeDetectionResult)) @@ -196,7 +196,7 @@ def test_real_sequence(self, read_sequence, true_barcodes, true_umis): "GGGCACATTCCTCAGAAAGCGAAAA", "GGCAGGCCATTAGTTACTCGTCGAG", "CTCCCCTAATCGTTTGTCGGTGGAC"] - detector = StereoSplittingBarcodeDetector(barcodes, min_score=21) + detector = StereoSplittingBarcodeDetector(barcodes) result = detector.find_barcode_umi("test_read", read_sequence) assert isinstance(result, SplittingBarcodeDetectionResult) @@ -214,13 +214,13 @@ class TestSharedMemStereoBarcodeDetector: def test_init(self): """Test basic initialization.""" - detector = SharedMemoryStereoBarcodeDetector(DUMMY_BARCODES_25, min_score=21) + detector = SharedMemoryStereoBarcodeDetector(DUMMY_BARCODES_25) assert detector.barcode_indexer is not None assert detector.min_score == 21 def test_no_match_random_sequence(self): """Test that random sequence returns no valid barcode.""" - detector = SharedMemoryStereoBarcodeDetector(DUMMY_BARCODES_25, min_score=21) + detector = SharedMemoryStereoBarcodeDetector(DUMMY_BARCODES_25) result = detector.find_barcode_umi("test_read", RANDOM_SEQUENCE) assert isinstance(result, (LinkerBarcodeDetectionResult, TSOBarcodeDetectionResult)) @@ -266,7 +266,7 @@ def test_real_sequence(self, read_sequence, true_barcode, true_umi): "CATCAAAGGTTCACCGAATCTCCAC", "GTCGCCTTCAACGTATACTGATATA", "ACCAATTAGCAGCCATGGGAACTCC"] - detector = SharedMemoryStereoBarcodeDetector(barcodes, min_score=21) + detector = SharedMemoryStereoBarcodeDetector(barcodes) result = detector.find_barcode_umi("test_read", read_sequence) assert isinstance(result, (LinkerBarcodeDetectionResult, TSOBarcodeDetectionResult)) @@ -289,7 +289,7 @@ def test_real_sequence_no_bc(self, read_sequence): "CATCAAAGGTTCACCGAATCTCCAC", "GTCGCCTTCAACGTATACTGATATA", "ACCAATTAGCAGCCATGGGAACTCC"] - detector = SharedMemoryStereoBarcodeDetector(barcodes, min_score=21) + detector = SharedMemoryStereoBarcodeDetector(barcodes) result = detector.find_barcode_umi("test_read", read_sequence) assert isinstance(result, (LinkerBarcodeDetectionResult, TSOBarcodeDetectionResult)) @@ -338,7 +338,7 @@ def test_real_sequence(self, read_sequence, true_barcodes, true_umis): "GGGCACATTCCTCAGAAAGCGAAAA", "GGCAGGCCATTAGTTACTCGTCGAG", "CTCCCCTAATCGTTTGTCGGTGGAC"] - detector = SharedMemoryStereoSplittingBarcodeDetector(barcodes, min_score=21) + detector = SharedMemoryStereoSplittingBarcodeDetector(barcodes) result = detector.find_barcode_umi("test_read", read_sequence) assert isinstance(result, SplittingBarcodeDetectionResult) @@ -370,7 +370,7 @@ def test_real_sequence_no_bc(self, read_sequence): "GGGCACATTCCTCAGAAAGCGAAAA", "GGCAGGCCATTAGTTACTCGTCGAG", "CTCCCCTAATCGTTTGTCGGTGGAC"] - detector = SharedMemoryStereoSplittingBarcodeDetector(barcodes, min_score=21) + detector = SharedMemoryStereoSplittingBarcodeDetector(barcodes) result = detector.find_barcode_umi("test_read", read_sequence) assert isinstance(result, SplittingBarcodeDetectionResult) @@ -399,7 +399,7 @@ def test_init_large_whitelist(self): """Test that min_score increases for large whitelists.""" large_barcodes = [f"ACTG{i:012d}"[:16] for i in range(100001)] detector = TenXBarcodeDetector(large_barcodes) - assert detector.min_score == 16 + assert detector.min_score == 15 def test_no_match_random_sequence(self): """Test that random sequence returns no valid barcode.""" @@ -567,20 +567,20 @@ class TestCurioBarcodeDetector: def test_init_joint_barcodes(self): """Test initialization with joint barcode list.""" - detector = CurioBarcodeDetector(DUMMY_BARCODES_14, min_score=13) + detector = CurioBarcodeDetector(DUMMY_BARCODES_14) assert detector.barcode_indexer is not None assert detector.min_score == 13 def test_init_split_barcodes(self): """Test initialization with separate left/right barcode lists.""" barcode_tuple = (DUMMY_BARCODES_8, DUMMY_BARCODES_6) - detector = CurioBarcodeDetector(barcode_tuple, min_score=13) + detector = CurioBarcodeDetector(barcode_tuple) # Should have 3*3 = 9 combined barcodes assert len(detector.barcode_indexer.seq_list) == 9 def test_no_match_random_sequence(self): """Test that random sequence returns no valid barcode.""" - detector = CurioBarcodeDetector(DUMMY_BARCODES_14, min_score=13) + detector = CurioBarcodeDetector(DUMMY_BARCODES_14) result = detector.find_barcode_umi("test_read", RANDOM_SEQUENCE) assert isinstance(result, LinkerBarcodeDetectionResult) @@ -637,13 +637,13 @@ class TestCurioIlluminaDetector: def test_init(self): """Test basic initialization.""" - detector = CurioIlluminaDetector(DUMMY_BARCODES_14, min_score=14) + detector = CurioIlluminaDetector(DUMMY_BARCODES_14) assert detector.barcode_indexer is not None assert detector.min_score == 14 def test_no_match_random_sequence(self): """Test that random sequence returns no valid barcode.""" - detector = CurioIlluminaDetector(DUMMY_BARCODES_14, min_score=14) + detector = CurioIlluminaDetector(DUMMY_BARCODES_14) result = detector.find_barcode_umi("test_read", RANDOM_SEQUENCE) assert isinstance(result, LinkerBarcodeDetectionResult) diff --git a/tests/test_molecule_structure.py b/tests/test_molecule_structure.py new file mode 100644 index 00000000..127d2e71 --- /dev/null +++ b/tests/test_molecule_structure.py @@ -0,0 +1,631 @@ +############################################################################ +# Copyright (c) 2025-2026 University of Helsinki +# All Rights Reserved +# See file LICENSE for details. +############################################################################ + +""" +Tests for molecule structure parsing and element types. + +Tests the universal barcode calling infrastructure: +- ElementType enum and its method classification +- MoleculeElement initialization for different element types +- MoleculeStructure parsing from MDF format +""" + +import os +import pytest +from io import StringIO + +from src.barcode_calling.callers.molecule_structure import ( + ElementType, + MoleculeElement, + MoleculeStructure +) + +# Test data directory +TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), "universal_data") + + +class TestElementType: + """Test ElementType enum methods.""" + + def test_needs_correction_var_file(self): + """VAR_FILE elements need barcode correction.""" + assert ElementType.VAR_FILE.needs_correction() is True + + def test_needs_correction_var_list(self): + """VAR_LIST elements need barcode correction.""" + assert ElementType.VAR_LIST.needs_correction() is True + + def test_needs_correction_var_any(self): + """VAR_ANY elements do NOT need correction.""" + assert ElementType.VAR_ANY.needs_correction() is False + + def test_needs_correction_const(self): + """CONST elements do NOT need correction.""" + assert ElementType.CONST.needs_correction() is False + + def test_needs_sequence_extraction_var_any(self): + """VAR_ANY elements need sequence extraction.""" + assert ElementType.VAR_ANY.needs_sequence_extraction() is True + + def test_needs_sequence_extraction_var_file(self): + """VAR_FILE elements do NOT need sequence extraction (they get corrected).""" + assert ElementType.VAR_FILE.needs_sequence_extraction() is False + + def test_needs_sequence_extraction_const(self): + """CONST elements do NOT need sequence extraction.""" + assert ElementType.CONST.needs_sequence_extraction() is False + + def test_is_constant_const(self): + """CONST is a constant element.""" + assert ElementType.CONST.is_constant() is True + + def test_is_constant_var_any(self): + """VAR_ANY is NOT a constant element.""" + assert ElementType.VAR_ANY.is_constant() is False + + def test_is_constant_polyt(self): + """PolyT is NOT a constant element.""" + assert ElementType.PolyT.is_constant() is False + + def test_is_variable_var_any(self): + """VAR_ANY is a variable element.""" + assert ElementType.VAR_ANY.is_variable() is True + + def test_is_variable_var_list(self): + """VAR_LIST is a variable element.""" + assert ElementType.VAR_LIST.is_variable() is True + + def test_is_variable_var_file(self): + """VAR_FILE is a variable element.""" + assert ElementType.VAR_FILE.is_variable() is True + + def test_is_variable_const(self): + """CONST is NOT a variable element.""" + assert ElementType.CONST.is_variable() is False + + def test_is_variable_polyt(self): + """PolyT is NOT a variable element.""" + assert ElementType.PolyT.is_variable() is False + + def test_is_base_separator_var_any_separator(self): + """VAR_ANY_SEPARATOR is a separator.""" + assert ElementType.VAR_ANY_SEPARATOR.is_base_separator() is True + + def test_is_base_separator_var_any_non_t_separator(self): + """VAR_ANY_NON_T_SEPARATOR is a separator.""" + assert ElementType.VAR_ANY_NON_T_SEPARATOR.is_base_separator() is True + + def test_is_base_separator_var_any(self): + """VAR_ANY is NOT a separator.""" + assert ElementType.VAR_ANY.is_base_separator() is False + + +class TestMoleculeElement: + """Test MoleculeElement initialization.""" + + def test_init_polyt(self): + """Test PolyT element initialization.""" + element = MoleculeElement("PolyT", ElementType.PolyT) + + assert element.element_name == "PolyT" + assert element.element_type == ElementType.PolyT + assert element.element_value is None + assert element.element_length == -1 + + def test_init_cdna(self): + """Test cDNA element initialization.""" + element = MoleculeElement("cDNA", ElementType.cDNA) + + assert element.element_name == "cDNA" + assert element.element_type == ElementType.cDNA + assert element.element_value is None + assert element.element_length == -1 + + def test_init_const(self): + """Test CONST element initialization.""" + sequence = "CTACACGACGCTCTTCCGATCT" + element = MoleculeElement("R1", ElementType.CONST, sequence) + + assert element.element_name == "R1" + assert element.element_type == ElementType.CONST + assert element.element_value == sequence + assert element.element_length == len(sequence) + + def test_init_var_list(self): + """Test VAR_LIST element initialization.""" + barcodes = "AAAA,CCCC,GGGG,TTTT" + element = MoleculeElement("Barcode", ElementType.VAR_LIST, barcodes) + + assert element.element_name == "Barcode" + assert element.element_type == ElementType.VAR_LIST + assert element.element_value == ["AAAA", "CCCC", "GGGG", "TTTT"] + assert element.element_length == 4 # Length of first barcode + + def test_init_var_any(self): + """Test VAR_ANY element initialization.""" + element = MoleculeElement("UMI", ElementType.VAR_ANY, "12") + + assert element.element_name == "UMI" + assert element.element_type == ElementType.VAR_ANY + assert element.element_value is None + assert element.element_length == 12 + + def test_init_var_file(self): + """Test VAR_FILE element initialization with real file.""" + barcode_file = os.path.join(TEST_DATA_DIR, "barcodes_small.tsv") + element = MoleculeElement("Barcode", ElementType.VAR_FILE, barcode_file) + + assert element.element_name == "Barcode" + assert element.element_type == ElementType.VAR_FILE + assert len(element.element_value) == 8 # 8 barcodes in test file + assert element.element_length == 16 # 16bp barcodes + + def test_init_var_any_separator(self): + """Test VAR_ANY_SEPARATOR element initialization.""" + element = MoleculeElement("Sep", ElementType.VAR_ANY_SEPARATOR, "5") + + assert element.element_name == "Sep" + assert element.element_type == ElementType.VAR_ANY_SEPARATOR + assert element.element_value is None + assert element.element_length == 5 + + +class TestMoleculeStructure: + """Test MoleculeStructure parsing.""" + + def test_parse_simple_structure(self): + """Test parsing simple molecule structure.""" + mdf_content = """Barcode:UMI:PolyT:cDNA +Barcode\tVAR_LIST\tAAAA,CCCC,GGGG +UMI\tVAR_ANY\t10 +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + elements = list(structure) + assert len(elements) == 4 + + assert elements[0].element_name == "Barcode" + assert elements[0].element_type == ElementType.VAR_LIST + + assert elements[1].element_name == "UMI" + assert elements[1].element_type == ElementType.VAR_ANY + + assert elements[2].element_name == "PolyT" + assert elements[2].element_type == ElementType.PolyT + + assert elements[3].element_name == "cDNA" + assert elements[3].element_type == ElementType.cDNA + + def test_parse_with_const(self): + """Test parsing structure with constant element.""" + mdf_content = """R1:Barcode:UMI:PolyT:cDNA +R1\tCONST\tCTACACGACGCTCTTCCGATCT +Barcode\tVAR_LIST\tAAAA,CCCC,GGGG +UMI\tVAR_ANY\t12 +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + elements = list(structure) + assert len(elements) == 5 + + assert elements[0].element_name == "R1" + assert elements[0].element_type == ElementType.CONST + assert elements[0].element_value == "CTACACGACGCTCTTCCGATCT" + + def test_parse_from_file_simple(self): + """Test parsing simple.mdf file.""" + mdf_path = os.path.join(TEST_DATA_DIR, "simple.mdf") + + with open(mdf_path) as f: + structure = MoleculeStructure(f) + + elements = list(structure) + + # Simple.mdf: Barcode:UMI:PolyT:cDNA + assert len(elements) == 4 + assert elements[0].element_name == "Barcode" + assert elements[0].element_type == ElementType.VAR_LIST + assert elements[1].element_name == "UMI" + assert elements[1].element_type == ElementType.VAR_ANY + assert elements[1].element_length == 10 + + def test_parse_from_file_10x(self): + """Test parsing 10x-like structure. + + Uses inline definition to avoid relative path issues with VAR_FILE. + """ + # Use inline definition to avoid file path issues + mdf_content = """R1:Barcode:UMI:PolyT:cDNA:TSO +R1\tCONST\tCTACACGACGCTCTTCCGATCT +Barcode\tVAR_LIST\tATCCTTAGTGTTTGTC,CTTAGGAGTGGTTAGC,TGAGGGCCAACGTGCT +UMI\tVAR_ANY\t12 +TSO\tCONST\tCCCATGTACTCTGCGTTGATACCACTGCTT +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + elements = list(structure) + + # R1:Barcode:UMI:PolyT:cDNA:TSO + assert len(elements) == 6 + assert elements[0].element_name == "R1" + assert elements[0].element_type == ElementType.CONST + assert elements[1].element_name == "Barcode" + assert elements[1].element_type == ElementType.VAR_LIST + + def test_iteration(self): + """Test iterating over structure elements.""" + mdf_content = """Barcode:UMI:cDNA +Barcode\tVAR_LIST\tAAAA,CCCC +UMI\tVAR_ANY\t8 +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + names = [el.element_name for el in structure] + assert names == ["Barcode", "UMI", "cDNA"] + + def test_from_element_list(self): + """Test creating structure from element list.""" + elements = [ + MoleculeElement("Barcode", ElementType.VAR_LIST, "AAAA,CCCC"), + MoleculeElement("UMI", ElementType.VAR_ANY, "10"), + MoleculeElement("cDNA", ElementType.cDNA) + ] + + structure = MoleculeStructure.from_element_list(elements) + + assert len(list(structure)) == 3 + assert list(structure)[0].element_name == "Barcode" + + def test_concat_delimiter(self): + """Test concatenation delimiter constant.""" + assert MoleculeStructure.CONCAT_DELIM == "|" + + def test_dupl_delimiter(self): + """Test duplication delimiter constant.""" + assert MoleculeStructure.DUPL_DELIM == "/" + + +class TestMoleculeStructureBarcodeUMI: + """Test barcode/UMI element identification.""" + + def test_barcode_elements_single(self): + """Test single barcode element identification.""" + mdf_content = """Barcode:UMI:cDNA +Barcode\tVAR_LIST\tAAAA,CCCC +UMI\tVAR_ANY\t10 +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + assert structure.barcode_elements == ["Barcode"] + + def test_umi_elements_single(self): + """Test single UMI element identification.""" + mdf_content = """Barcode:UMI:cDNA +Barcode\tVAR_LIST\tAAAA,CCCC +UMI\tVAR_ANY\t10 +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + assert structure.umi_elements == ["UMI"] + + def test_barcode_elements_multiple(self): + """Test multiple barcode elements identification.""" + mdf_content = """barcode1:barcode2:UMI:cDNA +barcode1\tVAR_LIST\tAAAA,CCCC +barcode2\tVAR_LIST\tGGGG,TTTT +UMI\tVAR_ANY\t10 +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + assert structure.barcode_elements == ["barcode1", "barcode2"] + + def test_umi_elements_multiple(self): + """Test multiple UMI elements identification.""" + mdf_content = """Barcode:UMI1:UMI2:cDNA +Barcode\tVAR_LIST\tAAAA,CCCC +UMI1\tVAR_ANY\t6 +UMI2\tVAR_ANY\t6 +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + assert structure.umi_elements == ["UMI1", "UMI2"] + + def test_barcode_case_insensitive(self): + """Test that barcode prefix is case-insensitive.""" + mdf_content = """BARCODE:Barcode2:barcode3:cDNA +BARCODE\tVAR_LIST\tAAAA,CCCC +Barcode2\tVAR_LIST\tGGGG,TTTT +barcode3\tVAR_LIST\tACGT,TGCA +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + assert len(structure.barcode_elements) == 3 + assert "BARCODE" in structure.barcode_elements + assert "Barcode2" in structure.barcode_elements + assert "barcode3" in structure.barcode_elements + + def test_umi_case_insensitive(self): + """Test that UMI prefix is case-insensitive.""" + mdf_content = """Barcode:UMI:umi2:cDNA +Barcode\tVAR_LIST\tAAAA,CCCC +UMI\tVAR_ANY\t6 +umi2\tVAR_ANY\t6 +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + assert len(structure.umi_elements) == 2 + assert "UMI" in structure.umi_elements + assert "umi2" in structure.umi_elements + + def test_no_barcode_elements(self): + """Test structure with no barcode elements.""" + mdf_content = """R1:UMI:cDNA +R1\tCONST\tACGT +UMI\tVAR_ANY\t10 +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + assert structure.barcode_elements == [] + + def test_no_umi_elements(self): + """Test structure with no UMI elements.""" + mdf_content = """Barcode:cDNA +Barcode\tVAR_LIST\tAAAA,CCCC +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + assert structure.umi_elements == [] + + def test_from_element_list_identifies_barcodes(self): + """Test that from_element_list also identifies barcode/UMI elements.""" + elements = [ + MoleculeElement("Barcode", ElementType.VAR_LIST, "AAAA,CCCC"), + MoleculeElement("UMI", ElementType.VAR_ANY, "10"), + MoleculeElement("cDNA", ElementType.cDNA) + ] + + structure = MoleculeStructure.from_element_list(elements) + + assert structure.barcode_elements == ["Barcode"] + assert structure.umi_elements == ["UMI"] + + +class TestMoleculeStructureEdgeCases: + """Test edge cases and error handling.""" + + def test_element_order_preserved(self): + """Test that element order from header line is preserved.""" + mdf_content = """cDNA:UMI:Barcode:PolyT +Barcode\tVAR_LIST\tAAAA,CCCC +UMI\tVAR_ANY\t8 +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + names = [el.element_name for el in structure] + # Order should match header line, not definition order + assert names == ["cDNA", "UMI", "Barcode", "PolyT"] + + def test_whitespace_handling(self): + """Test that whitespace in MDF is handled correctly.""" + mdf_content = """ Barcode : UMI : cDNA +Barcode VAR_LIST AAAA,CCCC +UMI VAR_ANY 8 +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + elements = list(structure) + assert len(elements) == 3 + + def test_multiple_var_list_barcodes(self): + """Test VAR_LIST with many barcodes.""" + barcodes = ",".join(["ACGT" * 4] * 100) # 100 barcodes + mdf_content = f"""Barcode:cDNA +Barcode\tVAR_LIST\t{barcodes} +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + elements = list(structure) + assert len(elements[0].element_value) == 100 + + +class TestConcatenatedElements: + """Test concatenated element parsing and identification.""" + + def test_parse_concatenated_elements(self): + """Test parsing MDF with concatenated barcode parts.""" + mdf_content = """Barcode|1:Barcode|2:UMI:PolyT:cDNA +Barcode|1\tVAR_LIST\tAAAACCCC,GGGGTTTT\t4 +Barcode|2\tVAR_LIST\tAAAACCCC,GGGGTTTT\t4 +UMI\tVAR_ANY\t8 +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + assert len(structure.ordered_elements) == 5 + assert "Barcode|1" in structure.elements_to_concatenate + assert "Barcode|2" in structure.elements_to_concatenate + assert structure.elements_to_concatenate["Barcode|1"] == ("Barcode", 1) + assert structure.elements_to_concatenate["Barcode|2"] == ("Barcode", 2) + + def test_concatenated_elements_counts(self): + """Test that concatenated_elements_counts has correct base name and count.""" + mdf_content = """Barcode|1:Barcode|2:cDNA +Barcode|1\tVAR_LIST\tAAAACCCC,GGGGTTTT\t4 +Barcode|2\tVAR_LIST\tAAAACCCC,GGGGTTTT\t4 +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + assert "Barcode" in structure.concatenated_elements_counts + assert structure.concatenated_elements_counts["Barcode"] == 2 + + def test_concatenated_three_parts(self): + """Test three-part concatenated element.""" + mdf_content = """Barcode|1:Barcode|2:Barcode|3:cDNA +Barcode|1\tVAR_LIST\tAAAACCCCGGGG,TTTTAAAACCCC\t4 +Barcode|2\tVAR_LIST\tAAAACCCCGGGG,TTTTAAAACCCC\t4 +Barcode|3\tVAR_LIST\tAAAACCCCGGGG,TTTTAAAACCCC\t4 +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + assert structure.concatenated_elements_counts["Barcode"] == 3 + + def test_concatenated_barcode_umi_identification(self): + """Test that concatenated barcode elements are identified by base name.""" + mdf_content = """Barcode|1:Barcode|2:UMI:cDNA +Barcode|1\tVAR_LIST\tAAAACCCC,GGGGTTTT\t4 +Barcode|2\tVAR_LIST\tAAAACCCC,GGGGTTTT\t4 +UMI\tVAR_ANY\t8 +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + # Base name "Barcode" should be in barcode_elements, not "Barcode|1", "Barcode|2" + assert structure.barcode_elements == ["Barcode"] + assert "Barcode|1" not in structure.barcode_elements + assert "Barcode|2" not in structure.barcode_elements + assert structure.umi_elements == ["UMI"] + + def test_concatenated_umi_identification(self): + """Test that concatenated UMI elements are identified by base name.""" + mdf_content = """Barcode:UMI|1:UMI|2:cDNA +Barcode\tVAR_LIST\tAAAA,CCCC +UMI|1\tVAR_ANY\t4 +UMI|2\tVAR_ANY\t4 +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + assert structure.barcode_elements == ["Barcode"] + assert structure.umi_elements == ["UMI"] + + def test_concatenated_explicit_length(self): + """Test that explicit length (4th field) is correctly parsed for concatenated parts.""" + mdf_content = """Barcode|1:Barcode|2:cDNA +Barcode|1\tVAR_LIST\tAAAACCCC,GGGGTTTT\t4 +Barcode|2\tVAR_LIST\tAAAACCCC,GGGGTTTT\t4 +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + # Each part should have length 4, not the full barcode length 8 + for el in structure: + if el.element_name == "Barcode|1": + assert el.element_length == 4 + elif el.element_name == "Barcode|2": + assert el.element_length == 4 + + def test_concatenated_elements_must_be_variable(self): + """Test that non-variable concatenated elements are rejected.""" + mdf_content = """Adapter|1:Adapter|2:cDNA +Adapter|1\tCONST\tACGT +Adapter|2\tCONST\tTGCA +""" + with pytest.raises(SystemExit): + MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + def test_concatenated_nonconsecutive_indices_rejected(self): + """Test that non-consecutive indices (e.g., 1 and 3) are rejected.""" + mdf_content = """Barcode|1:Barcode|3:cDNA +Barcode|1\tVAR_LIST\tAAAA,CCCC\t4 +Barcode|3\tVAR_LIST\tAAAA,CCCC\t4 +""" + with pytest.raises(SystemExit): + MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + +class TestDuplicatedElements: + """Test duplicated element parsing and identification.""" + + def test_parse_duplicated_elements(self): + """Test parsing MDF with duplicated barcode parts.""" + mdf_content = """Barcode/1:Barcode/2:UMI:PolyT:cDNA +Barcode/1\tVAR_LIST\tAAAACCCC,GGGGTTTT +Barcode/2\tVAR_LIST\tAAAACCCC,GGGGTTTT +UMI\tVAR_ANY\t8 +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + assert len(structure.ordered_elements) == 5 + assert "Barcode/1" in structure.duplicated_elements + assert "Barcode/2" in structure.duplicated_elements + assert structure.duplicated_elements["Barcode/1"] == ("Barcode", 1) + assert structure.duplicated_elements["Barcode/2"] == ("Barcode", 2) + + def test_duplicated_elements_counts(self): + """Test that duplicated_elements_counts has correct base name and count.""" + mdf_content = """Barcode/1:Barcode/2:cDNA +Barcode/1\tVAR_LIST\tAAAACCCC,GGGGTTTT +Barcode/2\tVAR_LIST\tAAAACCCC,GGGGTTTT +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + assert "Barcode" in structure.duplicated_elements_counts + assert structure.duplicated_elements_counts["Barcode"] == 2 + + def test_duplicated_three_parts(self): + """Test three-part duplicated element.""" + mdf_content = """Barcode/1:Barcode/2:Barcode/3:cDNA +Barcode/1\tVAR_LIST\tAAAACCCC,GGGGTTTT +Barcode/2\tVAR_LIST\tAAAACCCC,GGGGTTTT +Barcode/3\tVAR_LIST\tAAAACCCC,GGGGTTTT +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + assert structure.duplicated_elements_counts["Barcode"] == 3 + + def test_duplicated_barcode_umi_identification(self): + """Test that duplicated barcode elements are identified by base name.""" + mdf_content = """Barcode/1:Barcode/2:UMI:cDNA +Barcode/1\tVAR_LIST\tAAAACCCC,GGGGTTTT +Barcode/2\tVAR_LIST\tAAAACCCC,GGGGTTTT +UMI\tVAR_ANY\t8 +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + # Base name "Barcode" should be in barcode_elements, not "Barcode/1", "Barcode/2" + assert structure.barcode_elements == ["Barcode"] + assert "Barcode/1" not in structure.barcode_elements + assert "Barcode/2" not in structure.barcode_elements + assert structure.umi_elements == ["UMI"] + + def test_duplicated_umi_identification(self): + """Test that duplicated UMI elements are identified by base name.""" + mdf_content = """Barcode:UMI/1:UMI/2:cDNA +Barcode\tVAR_LIST\tAAAA,CCCC +UMI/1\tVAR_ANY\t8 +UMI/2\tVAR_ANY\t8 +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + assert structure.barcode_elements == ["Barcode"] + assert structure.umi_elements == ["UMI"] + + def test_duplicated_elements_must_be_variable(self): + """Test that non-variable duplicated elements are rejected.""" + mdf_content = """Adapter/1:Adapter/2:cDNA +Adapter/1\tCONST\tACGT +Adapter/2\tCONST\tACGT +""" + with pytest.raises(SystemExit): + MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + def test_duplicated_nonconsecutive_indices_rejected(self): + """Test that non-consecutive indices (e.g., 1 and 3) are rejected.""" + mdf_content = """Barcode/1:Barcode/3:cDNA +Barcode/1\tVAR_LIST\tAAAA,CCCC +Barcode/3\tVAR_LIST\tAAAA,CCCC +""" + with pytest.raises(SystemExit): + MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + def test_duplicated_different_types_rejected(self): + """Test that duplicated elements with different types are rejected.""" + mdf_content = """Barcode/1:Barcode/2:cDNA +Barcode/1\tVAR_LIST\tAAAA,CCCC +Barcode/2\tVAR_ANY\t4 +""" + with pytest.raises(SystemExit): + MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + +if __name__ == '__main__': + pytest.main([__file__, '-v']) diff --git a/tests/test_universal_extraction.py b/tests/test_universal_extraction.py new file mode 100644 index 00000000..e9363393 --- /dev/null +++ b/tests/test_universal_extraction.py @@ -0,0 +1,1104 @@ +############################################################################ +# Copyright (c) 2025-2026 University of Helsinki +# All Rights Reserved +# See file LICENSE for details. +############################################################################ + +""" +Tests for universal barcode extraction. + +Tests the universal barcode calling infrastructure: +- DetectedElement container class +- ExtractionResult dict-based detection result +- UniversalSingleMoleculeExtractor for pattern detection +- BarcodeResult protocol compliance +""" + +import os +import pytest + +from src.barcode_calling.callers.extraction_result import ( + DetectedElement, + ExtractionResult, + ReadStats +) +from src.barcode_calling.callers.molecule_structure import ( + ElementType, + MoleculeElement, + MoleculeStructure +) +from src.barcode_calling.callers.universal_extraction import ( + UniversalSingleMoleculeExtractor +) +from src.barcode_calling.callers.protocol import BarcodeResult + +# Test data directory +TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), "universal_data") + + +# ============================================================================= +# Test Fixtures +# ============================================================================= + +def create_simple_structure(): + """Create simple molecule structure: Barcode:UMI:PolyT:cDNA""" + mdf_content = """Barcode:UMI:PolyT:cDNA +Barcode\tVAR_LIST\tATCCTTAGTGTTTGTC,CTTAGGAGTGGTTAGC,TGAGGGCCAACGTGCT +UMI\tVAR_ANY\t12 +""" + return MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + +def create_10x_structure(): + """Create 10x-like molecule structure with R1 adapter and TSO. + + Uses inline barcodes instead of file reference to avoid path issues. + """ + # Use inline barcodes to avoid relative path issues with VAR_FILE + mdf_content = """R1:Barcode:UMI:PolyT:cDNA:TSO +R1\tCONST\tCTACACGACGCTCTTCCGATCT +Barcode\tVAR_LIST\tATCCTTAGTGTTTGTC,CTTAGGAGTGGTTAGC,TGAGGGCCAACGTGCT,AAACCCGGGTTTAAAC +UMI\tVAR_ANY\t12 +TSO\tCONST\tCCCATGTACTCTGCGTTGATACCACTGCTT +""" + return MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + +def create_detected_elements(): + """Create sample detected elements for testing.""" + return { + "Barcode": DetectedElement(22, 37, 16, "ATCCTTAGTGTTTGTC"), + "UMI": DetectedElement(38, 49, 0, "AACCTTAGACCG"), + "PolyT": DetectedElement(50, 79, 0) + } + + +# ============================================================================= +# Test DetectedElement +# ============================================================================= + +class TestDetectedElement: + """Test DetectedElement container class.""" + + def test_default_init(self): + """Test initialization with default values.""" + element = DetectedElement() + + assert element.start == -1 + assert element.end == -1 + assert element.score == -1 + assert element.seq is None + + def test_custom_init(self): + """Test initialization with custom values.""" + element = DetectedElement(start=10, end=25, score=15, seq="ACTGACTG") + + assert element.start == 10 + assert element.end == 25 + assert element.score == 15 + assert element.seq == "ACTGACTG" + + def test_partial_init(self): + """Test initialization with some values.""" + element = DetectedElement(start=100, end=150) + + assert element.start == 100 + assert element.end == 150 + assert element.score == -1 + assert element.seq is None + + def test_coordinates_only(self): + """Test element with only coordinates (no sequence).""" + element = DetectedElement(50, 70, 0) + + assert element.start == 50 + assert element.end == 70 + assert element.seq is None + + +# ============================================================================= +# Test ExtractionResult +# ============================================================================= + +class TestExtractionResult: + """Test ExtractionResult dict-based detection result.""" + + def test_init(self): + """Test initialization.""" + structure = create_simple_structure() + detected = create_detected_elements() + + result = ExtractionResult(structure, "read_001", "+", detected) + + assert result.read_id == "read_001" + assert result.strand == "+" + assert result.molecule_structure is structure + assert "Barcode" in result.detected_results + + def test_barcode_identification_by_prefix(self): + """Test barcode elements are identified by 'barcode' prefix.""" + structure = create_simple_structure() + detected = create_detected_elements() + + result = ExtractionResult(structure, "read_001", "+", detected) + + # Should identify 'Barcode' element as barcode (case-insensitive prefix) + assert "Barcode" in result._barcode_elements + + def test_umi_identification_by_prefix(self): + """Test UMI elements are identified by 'umi' prefix.""" + structure = create_simple_structure() + detected = create_detected_elements() + + result = ExtractionResult(structure, "read_001", "+", detected) + + # Should identify 'UMI' element as UMI (case-insensitive prefix) + assert "UMI" in result._umi_elements + + def test_get_barcode_single(self): + """Test getting single barcode.""" + structure = create_simple_structure() + detected = create_detected_elements() + + result = ExtractionResult(structure, "read_001", "+", detected) + + assert result.get_barcode() == "ATCCTTAGTGTTTGTC" + + def test_get_barcode_missing(self): + """Test getting barcode when not detected.""" + structure = create_simple_structure() + detected = { + "UMI": DetectedElement(38, 49, 0, "AACCTTAGACCG"), + "PolyT": DetectedElement(50, 79, 0) + } + + result = ExtractionResult(structure, "read_001", "+", detected) + + assert result.get_barcode() == ExtractionResult.NOSEQ + + def test_get_umi(self): + """Test getting UMI.""" + structure = create_simple_structure() + detected = create_detected_elements() + + result = ExtractionResult(structure, "read_001", "+", detected) + + assert result.get_umi() == "AACCTTAGACCG" + + def test_get_umi_missing(self): + """Test getting UMI when not detected.""" + structure = create_simple_structure() + detected = { + "Barcode": DetectedElement(22, 37, 16, "ATCCTTAGTGTTTGTC"), + "PolyT": DetectedElement(50, 79, 0) + } + + result = ExtractionResult(structure, "read_001", "+", detected) + + assert result.get_umi() == ExtractionResult.NOSEQ + + def test_is_valid_with_barcode(self): + """Test is_valid returns True when barcode detected.""" + structure = create_simple_structure() + detected = create_detected_elements() + + result = ExtractionResult(structure, "read_001", "+", detected) + + assert result.is_valid() is True + + def test_is_valid_without_barcode(self): + """Test is_valid returns False when no barcode.""" + structure = create_simple_structure() + detected = {"PolyT": DetectedElement(50, 79, 0)} + + result = ExtractionResult(structure, "read_001", "+", detected) + + assert result.is_valid() is False + + def test_has_barcode(self): + """Test has_barcode method.""" + structure = create_simple_structure() + detected = create_detected_elements() + + result = ExtractionResult(structure, "read_001", "+", detected) + + assert result.has_barcode() is True + + def test_has_umi(self): + """Test has_umi method.""" + structure = create_simple_structure() + detected = create_detected_elements() + + result = ExtractionResult(structure, "read_001", "+", detected) + + assert result.has_umi() is True + + def test_has_umi_false(self): + """Test has_umi returns False when no UMI.""" + structure = create_simple_structure() + detected = { + "Barcode": DetectedElement(22, 37, 16, "ATCCTTAGTGTTTGTC"), + "PolyT": DetectedElement(50, 79, 0) + } + + result = ExtractionResult(structure, "read_001", "+", detected) + + assert result.has_umi() is False + + def test_get_barcode_score(self): + """Test getting barcode score.""" + structure = create_simple_structure() + detected = create_detected_elements() + + result = ExtractionResult(structure, "read_001", "+", detected) + + assert result.get_barcode_score() == 16 + + def test_get_barcode_score_no_barcode(self): + """Test barcode score when no barcode detected.""" + structure = create_simple_structure() + detected = {"PolyT": DetectedElement(50, 79, 0)} + + result = ExtractionResult(structure, "read_001", "+", detected) + + assert result.get_barcode_score() == -1 + + def test_set_strand(self): + """Test setting strand.""" + structure = create_simple_structure() + detected = create_detected_elements() + + result = ExtractionResult(structure, "read_001", ".", detected) + result.set_strand("-") + + assert result.strand == "-" + + def test_update_coordinates(self): + """Test shifting all coordinates.""" + structure = create_simple_structure() + detected = { + "Barcode": DetectedElement(22, 37, 16, "ATCCTTAGTGTTTGTC"), + "UMI": DetectedElement(38, 49, 0, "AACCTTAGACCG"), + } + + result = ExtractionResult(structure, "read_001", "+", detected) + result.update_coordinates(100) + + assert result.detected_results["Barcode"].start == 122 + assert result.detected_results["Barcode"].end == 137 + assert result.detected_results["UMI"].start == 138 + assert result.detected_results["UMI"].end == 149 + + def test_update_coordinates_invalid(self): + """Test coordinate shift preserves -1 values.""" + structure = create_simple_structure() + detected = { + "Barcode": DetectedElement(-1, -1, -1, ExtractionResult.NOSEQ), + } + + result = ExtractionResult(structure, "read_001", "+", detected) + result.update_coordinates(100) + + # -1 values (invalid/not detected) should remain -1 + assert result.detected_results["Barcode"].start == -1 + assert result.detected_results["Barcode"].end == -1 + + def test_more_informative_than_by_count(self): + """Test comparison by detected element count.""" + structure = create_simple_structure() + + detected1 = { + "Barcode": DetectedElement(22, 37, 16, "ATCCTTAGTGTTTGTC"), + "UMI": DetectedElement(38, 49, 0, "AACCTTAGACCG"), + "PolyT": DetectedElement(50, 79, 0), + } + detected2 = { + "Barcode": DetectedElement(22, 37, 16, "ATCCTTAGTGTTTGTC"), + } + + result1 = ExtractionResult(structure, "read_001", "+", detected1) + result2 = ExtractionResult(structure, "read_001", "+", detected2) + + assert result1.more_informative_than(result2) is True + assert result2.more_informative_than(result1) is False + + def test_more_informative_than_by_score(self): + """Test comparison by total score when counts equal.""" + structure = create_simple_structure() + + detected1 = { + "Barcode": DetectedElement(22, 37, 20, "ATCCTTAGTGTTTGTC"), + } + detected2 = { + "Barcode": DetectedElement(22, 37, 10, "ATCCTTAGTGTTTGTC"), + } + + result1 = ExtractionResult(structure, "read_001", "+", detected1) + result2 = ExtractionResult(structure, "read_001", "+", detected2) + + assert result1.more_informative_than(result2) is True + + def test_str_format(self): + """Test string formatting for TSV output.""" + structure = create_simple_structure() + detected = create_detected_elements() + + result = ExtractionResult(structure, "read_001", "+", detected) + output = str(result) + + # Standard format: read_id, barcode, UMI, BC_score, valid_UMI, strand + assert "read_001" in output + assert "ATCCTTAGTGTTTGTC" in output + assert "AACCTTAGACCG" in output + assert "16" in output + assert "True" in output + assert "+" in output + + def test_header_format(self): + """Test header generation.""" + structure = create_simple_structure() + detected = create_detected_elements() + + result = ExtractionResult(structure, "read_001", "+", detected) + header = result.header() + + assert "#read_id" in header + assert "barcode" in header + assert "UMI" in header + assert "BC_score" in header + assert "valid_UMI" in header + assert "strand" in header + + def test_get_additional_attributes(self): + """Test getting additional attributes excludes barcode/UMI elements.""" + structure = create_simple_structure() + detected = create_detected_elements() + + result = ExtractionResult(structure, "read_001", "+", detected) + attrs = result.get_additional_attributes() + + # Barcode and UMI are excluded (tracked separately by ReadStats) + assert "Barcode detected" not in attrs + assert "UMI detected" not in attrs + # Supplementary elements are included + assert "PolyT detected" in attrs + + def test_noseq_constant(self): + """Test NOSEQ constant.""" + assert ExtractionResult.NOSEQ == "*" + + +class TestExtractionResultProtocolCompliance: + """Test that ExtractionResult conforms to BarcodeResult protocol.""" + + def test_isinstance_check(self): + """Test that ExtractionResult is recognized as BarcodeResult.""" + structure = create_simple_structure() + detected = create_detected_elements() + + result = ExtractionResult(structure, "read_001", "+", detected) + + # Protocol is runtime_checkable + assert isinstance(result, BarcodeResult) + + def test_has_required_attributes(self): + """Test that all protocol attributes exist.""" + structure = create_simple_structure() + detected = create_detected_elements() + + result = ExtractionResult(structure, "read_001", "+", detected) + + # Required attributes + assert hasattr(result, 'NOSEQ') + assert hasattr(result, 'read_id') + assert hasattr(result, 'strand') + + def test_has_required_methods(self): + """Test that all protocol methods exist.""" + structure = create_simple_structure() + detected = create_detected_elements() + + result = ExtractionResult(structure, "read_001", "+", detected) + + # Required methods + assert callable(getattr(result, 'get_barcode', None)) + assert callable(getattr(result, 'get_umi', None)) + assert callable(getattr(result, 'is_valid', None)) + assert callable(getattr(result, 'has_barcode', None)) + assert callable(getattr(result, 'has_umi', None)) + assert callable(getattr(result, 'set_strand', None)) + assert callable(getattr(result, 'update_coordinates', None)) + assert callable(getattr(result, 'more_informative_than', None)) + assert callable(getattr(result, 'get_additional_attributes', None)) + assert callable(getattr(result, 'header', None)) + + +# ============================================================================= +# Test ExtractionResult ReadStats +# ============================================================================= + +class TestExtractionReadStats: + """Test ReadStats class for ExtractionResult.""" + + def test_init(self): + """Test initialization.""" + stats = ReadStats() + + assert stats.read_count == 0 + assert stats.bc_count == 0 + assert stats.umi_count == 0 + assert len(stats.pattern_counts) == 0 + + def test_add_read_with_barcode(self): + """Test adding read with barcode detected.""" + stats = ReadStats() + structure = create_simple_structure() + detected = create_detected_elements() + result = ExtractionResult(structure, "read_001", "+", detected) + + stats.add_read(result) + + assert stats.read_count == 1 + assert stats.bc_count == 1 + assert stats.umi_count == 1 + assert stats.pattern_counts["Barcode"] == 1 + assert stats.pattern_counts["UMI"] == 1 + assert stats.pattern_counts["PolyT"] == 1 + + def test_add_read_without_barcode(self): + """Test adding read without barcode.""" + stats = ReadStats() + structure = create_simple_structure() + detected = {"PolyT": DetectedElement(50, 79, 0)} + result = ExtractionResult(structure, "read_001", "+", detected) + + stats.add_read(result) + + assert stats.read_count == 1 + assert stats.bc_count == 0 + assert stats.umi_count == 0 + assert stats.pattern_counts["PolyT"] == 1 + + def test_add_multiple_reads(self): + """Test adding multiple reads.""" + stats = ReadStats() + structure = create_simple_structure() + + # Read 1: has barcode, UMI, polyT + detected1 = create_detected_elements() + result1 = ExtractionResult(structure, "read_001", "+", detected1) + + # Read 2: has only barcode + detected2 = {"Barcode": DetectedElement(22, 37, 16, "ATCCTTAGTGTTTGTC")} + result2 = ExtractionResult(structure, "read_002", "+", detected2) + + # Read 3: no barcode + detected3 = {"PolyT": DetectedElement(50, 79, 0)} + result3 = ExtractionResult(structure, "read_003", "+", detected3) + + stats.add_read(result1) + stats.add_read(result2) + stats.add_read(result3) + + assert stats.read_count == 3 + assert stats.bc_count == 2 # reads 1 and 2 + assert stats.umi_count == 1 # only read 1 + assert stats.pattern_counts["Barcode"] == 2 + assert stats.pattern_counts["PolyT"] == 2 + + def test_str_format(self): + """Test string formatting.""" + stats = ReadStats() + structure = create_simple_structure() + detected = create_detected_elements() + result = ExtractionResult(structure, "read_001", "+", detected) + + stats.add_read(result) + output = str(stats) + + assert "Total reads:\t1" in output + assert "Barcode detected:\t1" in output + assert "UMI detected:\t1" in output + + +# ============================================================================= +# Test UniversalSingleMoleculeExtractor +# ============================================================================= + +class TestUniversalSingleMoleculeExtractor: + """Test UniversalSingleMoleculeExtractor class.""" + + def test_init_simple_structure(self): + """Test initialization with simple structure.""" + structure = create_simple_structure() + extractor = UniversalSingleMoleculeExtractor(structure) + + assert extractor.has_polyt is True + assert extractor.has_cdna is True + assert "UMI" in extractor.elements_to_extract + + def test_init_10x_structure(self): + """Test initialization with 10x structure.""" + structure = create_10x_structure() + extractor = UniversalSingleMoleculeExtractor(structure) + + assert extractor.has_polyt is True + assert extractor.has_cdna is True + assert "R1" in extractor.constant_elements_to_detect + assert "TSO" in extractor.constant_elements_to_detect + + def test_result_type(self): + """Test result_type returns ExtractionResult.""" + structure = create_simple_structure() + extractor = UniversalSingleMoleculeExtractor(structure) + + assert extractor.result_type() is ExtractionResult + + def test_header(self): + """Test header generation.""" + structure = create_simple_structure() + extractor = UniversalSingleMoleculeExtractor(structure) + + header = extractor.header() + + assert "#read_id" in header + assert "strand" in header + + +class TestPatternDetection: + """Test actual barcode/UMI detection with synthetic sequences.""" + + def test_find_barcode_umi_forward(self): + """Test detection in forward strand read.""" + structure = create_10x_structure() + extractor = UniversalSingleMoleculeExtractor(structure) + + # Test read with known structure (from test_reads.fq READ_1) + # Structure: R1(22bp) + Barcode(16bp) + UMI(12bp) + PolyT + cDNA + read_id = "READ_1" + sequence = "TACACGACGCTCTTCCGATCTATCCTTAGTGTTTGTCAACCTTAGACCGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTGGGCCTCAGCGTAATCCTATTGTGCATG" + + result = extractor.find_barcode_umi(read_id, sequence) + + assert result.read_id == read_id + assert result.strand == "+" + # Should detect polyT + assert "PolyT" in result.detected_results + polyt = result.detected_results["PolyT"] + assert polyt.start != -1 + + def test_find_barcode_umi_reverse(self): + """Test detection in reverse strand read.""" + structure = create_10x_structure() + extractor = UniversalSingleMoleculeExtractor(structure) + + # Test read with TSO at beginning (from test_reads.fq READ_2) + # This indicates reverse strand - TSO is at 3' end + read_id = "READ_2" + # Starts with TSO adapter, ends with R1 + sequence = "AAGCAGTGGTATCAACGCAGAGTACATGGGAAGCAAGGAGGCAAGGCCCGCGCCAAGGCCAAGTCGCGGTCTTCCCGGGCCCGGGCTACCTATTCCCGGTGGGGCGTGTGCACCGGCTGCTGCGCAAGGGCAACTACGCGGAGCGTGTGGGCGCCGGCGCGCCGGTATACATGGCGGCGGTGCTGGAGTACCTAACGGCCGAGATCCTGGAGCTGGCGGGCAACGCGGCCCGCGACAACAAGAAGACTCGCATCATCCCGCGCCACCTGCAGCTGGCCATCCGCAACGACGAGGAGCTCAACAAGCTGCTGGGCAAAGTGACGATCGCGCAGGGCGGCGTCCTGCCCAACATCCAGGCCGTGCTGCTGCCCAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATCAAGGAGGCTCGCTAACCACTCCTAAGAGATCGGAAGAGCGTCGTGTA" + + result = extractor.find_barcode_umi(read_id, sequence) + + assert result.read_id == read_id + # Should detect in reverse orientation (TSO at start means polyA/polyT would be internal) + + def test_no_match_random_sequence(self): + """Test detection with random sequence (no patterns).""" + structure = create_simple_structure() + extractor = UniversalSingleMoleculeExtractor(structure) + + read_id = "RANDOM" + sequence = "ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT" * 5 + + result = extractor.find_barcode_umi(read_id, sequence) + + assert result.read_id == read_id + # May or may not detect anything, but should not crash + + +class TestMultipleBarcodeNaming: + """Test barcode/UMI identification with various naming conventions.""" + + def test_barcode_prefix_lowercase(self): + """Test 'barcode' prefix detection (lowercase).""" + mdf_content = """barcode1:cDNA +barcode1\tVAR_LIST\tAAAA,CCCC,GGGG +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + detected = {"barcode1": DetectedElement(0, 3, 10, "AAAA")} + + result = ExtractionResult(structure, "read_001", "+", detected) + + assert result.get_barcode() == "AAAA" + assert "barcode1" in result._barcode_elements + + def test_barcode_prefix_mixedcase(self): + """Test 'Barcode' prefix detection (mixed case).""" + mdf_content = """Barcode:cDNA +Barcode\tVAR_LIST\tAAAA,CCCC,GGGG +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + detected = {"Barcode": DetectedElement(0, 3, 10, "CCCC")} + + result = ExtractionResult(structure, "read_001", "+", detected) + + assert result.get_barcode() == "CCCC" + + def test_umi_prefix_lowercase(self): + """Test 'umi' prefix detection (lowercase).""" + mdf_content = """umi:cDNA +umi\tVAR_ANY\t10 +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + detected = {"umi": DetectedElement(0, 9, 0, "ACGTACGTAC")} + + result = ExtractionResult(structure, "read_001", "+", detected) + + assert result.get_umi() == "ACGTACGTAC" + assert "umi" in result._umi_elements + + def test_multiple_barcodes_concatenated(self): + """Test multiple barcode elements are concatenated with '|'.""" + mdf_content = """Barcode1:Barcode2:cDNA +Barcode1\tVAR_LIST\tAAAA,CCCC +Barcode2\tVAR_LIST\tGGGG,TTTT +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + detected = { + "Barcode1": DetectedElement(0, 3, 10, "AAAA"), + "Barcode2": DetectedElement(4, 7, 10, "GGGG") + } + + result = ExtractionResult(structure, "read_001", "+", detected) + + assert result.get_barcode() == "AAAA|GGGG" + + def test_multiple_umis_concatenated(self): + """Test multiple UMI elements are concatenated with '|'.""" + mdf_content = """UMI1:UMI2:cDNA +UMI1\tVAR_ANY\t6 +UMI2\tVAR_ANY\t6 +""" + structure = MoleculeStructure(iter(mdf_content.strip().split('\n'))) + detected = { + "UMI1": DetectedElement(0, 5, 0, "AAAAAA"), + "UMI2": DetectedElement(6, 11, 0, "CCCCCC") + } + + result = ExtractionResult(structure, "read_001", "+", detected) + + assert result.get_umi() == "AAAAAA|CCCCCC" + + +class TestIntegration: + """Integration tests with test data files.""" + + def test_process_test_reads_file(self): + """Test processing reads from test_reads.fq.""" + structure = create_10x_structure() + extractor = UniversalSingleMoleculeExtractor(structure) + + # Load test reads + test_file = os.path.join(TEST_DATA_DIR, "test_reads.fq") + reads = [] + with open(test_file) as f: + lines = f.readlines() + for i in range(0, len(lines), 4): + if i + 1 < len(lines): + read_id = lines[i].strip()[1:].split()[0] # Remove @ and take first part + sequence = lines[i + 1].strip() + reads.append((read_id, sequence)) + + # Process each read + results = [] + for read_id, sequence in reads: + result = extractor.find_barcode_umi(read_id, sequence) + results.append(result) + + # Should process all reads without error + assert len(results) == len(reads) + + # Check results are ExtractionResult instances + for result in results: + assert isinstance(result, ExtractionResult) + assert result.read_id in [r[0] for r in reads] + + +class TestConcatenatedElementProcessing: + """Test concatenated element extraction and processing.""" + + def _create_concat_structure(self): + """Create structure with concatenated barcode: Barcode|1:Barcode|2:UMI:PolyT:cDNA""" + # Use 8bp barcodes split into two 4bp parts + mdf_content = """Barcode|1:Barcode|2:UMI:PolyT:cDNA +Barcode|1\tVAR_LIST\tAAAACCCC,GGGGTTTT,ACGTACGT\t4 +Barcode|2\tVAR_LIST\tAAAACCCC,GGGGTTTT,ACGTACGT\t4 +UMI\tVAR_ANY\t8 +""" + return MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + def test_concat_structure_barcode_elements(self): + """Test that concatenated barcode is identified by base name.""" + structure = self._create_concat_structure() + assert structure.barcode_elements == ["Barcode"] + assert structure.umi_elements == ["UMI"] + + def test_concat_structure_counts(self): + """Test concatenated elements counts.""" + structure = self._create_concat_structure() + assert structure.concatenated_elements_counts["Barcode"] == 2 + + def test_concat_extractor_init(self): + """Test extractor initialization with concatenated elements.""" + structure = self._create_concat_structure() + extractor = UniversalSingleMoleculeExtractor(structure) + + # Should have one index for the base name "Barcode", not individual parts + assert "Barcode" in extractor.index_dict + assert "Barcode|1" not in extractor.index_dict + assert "Barcode|2" not in extractor.index_dict + + def test_concat_extractor_header(self): + """Test header output for concatenated elements.""" + structure = self._create_concat_structure() + extractor = UniversalSingleMoleculeExtractor(structure) + header = extractor.header() + + # Header should have standard fields, not individual parts + assert "barcode" in header + assert "UMI" in header + # Should NOT contain individual concatenated part names + assert "Barcode|1" not in header + assert "Barcode|2" not in header + + def test_process_concatenated_elements_all_detected(self): + """Test process_concatenated_elements when all parts are found.""" + structure = self._create_concat_structure() + extractor = UniversalSingleMoleculeExtractor(structure) + + # Simulate detected parts: AAAA at [10,13] and CCCC at [14,17] + sequence = "XXXXXXXXXX" + "AAAA" + "CCCC" + "XXXXXXXX" + concat_storage = {"Barcode": [(10, 13), (14, 17)]} + detected_results = {} + + extractor.process_concatenated_elements(concat_storage, detected_results, sequence) + + assert "Barcode" in detected_results + result = detected_results["Barcode"] + # Should find corrected match for "AAAACCCC" in whitelist + assert result.seq == "AAAACCCC" + assert result.start == 10 + assert result.end == 17 + + def test_process_concatenated_elements_missing_part(self): + """Test process_concatenated_elements when a part is missing.""" + structure = self._create_concat_structure() + extractor = UniversalSingleMoleculeExtractor(structure) + + # Part 2 not detected (start=-1) + concat_storage = {"Barcode": [(10, 13), (-1, -1)]} + detected_results = {} + + extractor.process_concatenated_elements(concat_storage, detected_results, "AAAA" * 10) + + assert "Barcode" in detected_results + assert detected_results["Barcode"].seq == ExtractionResult.NOSEQ + assert detected_results["Barcode"].start == -1 + + def test_process_concatenated_no_correction(self): + """Test process_concatenated_elements without correction (raw concatenation).""" + structure = self._create_concat_structure() + extractor = UniversalSingleMoleculeExtractor(structure) + extractor.correct_sequences = False + + sequence = "XXXXXXXXXX" + "ACGT" + "TGCA" + "XXXXXXXX" + concat_storage = {"Barcode": [(10, 13), (14, 17)]} + detected_results = {} + + extractor.process_concatenated_elements(concat_storage, detected_results, sequence) + + assert "Barcode" in detected_results + result = detected_results["Barcode"] + # Without correction, raw concatenated sequence is stored + assert result.seq == "ACGTTGCA" + assert result.start == 10 + assert result.end == 17 + + def test_concat_extraction_result_get_barcode(self): + """Test ExtractionResult.get_barcode() with concatenated element result.""" + structure = self._create_concat_structure() + detected = {"Barcode": DetectedElement(10, 17, 14, "AAAACCCC")} + result = ExtractionResult(structure, "read_001", "+", detected) + + assert result.get_barcode() == "AAAACCCC" + assert result.has_barcode() is True + assert result.is_valid() is True + + def test_concat_extraction_result_str_format(self): + """Test ExtractionResult __str__ output with concatenated elements.""" + structure = self._create_concat_structure() + detected = { + "Barcode": DetectedElement(10, 17, 14, "AAAACCCC"), + "UMI": DetectedElement(18, 25, 0, "ACGTACGT"), + "PolyT": DetectedElement(26, 40, 0) + } + result = ExtractionResult(structure, "read_001", "+", detected) + output = str(result) + + fields = output.split('\t') + assert fields[0] == "read_001" + assert fields[1] == "AAAACCCC" # barcode + assert fields[2] == "ACGTACGT" # UMI + # Should NOT contain individual part names + + def test_concat_extraction_result_no_barcode(self): + """Test ExtractionResult when concatenated barcode correction fails.""" + structure = self._create_concat_structure() + detected = { + "Barcode": DetectedElement(-1, -1, -1, ExtractionResult.NOSEQ), + "UMI": DetectedElement(18, 25, 0, "ACGTACGT") + } + result = ExtractionResult(structure, "read_001", "+", detected) + + assert result.get_barcode() == ExtractionResult.NOSEQ + assert result.has_barcode() is False + + def test_concat_additional_attributes_exclude_barcode(self): + """Test that concatenated barcode is excluded from additional attributes.""" + structure = self._create_concat_structure() + detected = { + "Barcode": DetectedElement(10, 17, 14, "AAAACCCC"), + "PolyT": DetectedElement(26, 40, 0) + } + result = ExtractionResult(structure, "read_001", "+", detected) + attrs = result.get_additional_attributes() + + assert "Barcode detected" not in attrs + assert "Barcode|1 detected" not in attrs + assert "Barcode|2 detected" not in attrs + assert "PolyT detected" in attrs + + +class TestDuplicatedElementProcessing: + """Test duplicated element extraction and processing (majority vote).""" + + def _create_dupl_structure(self): + """Create structure with duplicated barcode: Barcode/1:Barcode/2:UMI:PolyT:cDNA""" + mdf_content = """Barcode/1:Barcode/2:UMI:PolyT:cDNA +Barcode/1\tVAR_LIST\tAAAACCCC,GGGGTTTT,ACGTACGT +Barcode/2\tVAR_LIST\tAAAACCCC,GGGGTTTT,ACGTACGT +UMI\tVAR_ANY\t8 +""" + return MoleculeStructure(iter(mdf_content.strip().split('\n'))) + + def test_dupl_structure_barcode_elements(self): + """Test that duplicated barcode is identified by base name.""" + structure = self._create_dupl_structure() + assert structure.barcode_elements == ["Barcode"] + assert structure.umi_elements == ["UMI"] + + def test_dupl_structure_counts(self): + """Test duplicated elements counts.""" + structure = self._create_dupl_structure() + assert structure.duplicated_elements_counts["Barcode"] == 2 + + def test_dupl_extractor_init(self): + """Test extractor initialization with duplicated elements.""" + structure = self._create_dupl_structure() + extractor = UniversalSingleMoleculeExtractor(structure) + + # Should have one index for the base name "Barcode", not individual parts + assert "Barcode" in extractor.index_dict + assert "Barcode/1" not in extractor.index_dict + assert "Barcode/2" not in extractor.index_dict + + def test_dupl_extractor_header(self): + """Test header output for duplicated elements.""" + structure = self._create_dupl_structure() + extractor = UniversalSingleMoleculeExtractor(structure) + header = extractor.header() + + assert "barcode" in header + assert "UMI" in header + # Should NOT contain individual duplicated part names + assert "Barcode/1" not in header + assert "Barcode/2" not in header + + def test_process_duplicated_both_agree(self): + """Test process_duplicated_elements when both copies correct to the same barcode.""" + structure = self._create_dupl_structure() + extractor = UniversalSingleMoleculeExtractor(structure) + + # Both copies have AAAACCCC (exact match to whitelist) + sequence = "XXXXXXXXXX" + "AAAACCCC" + "XXXXXXXX" + "AAAACCCC" + "XXXXXXXX" + dupl_storage = {"Barcode": [(10, 17), (26, 33)]} + detected_results = {} + + extractor.process_duplicated_elements(dupl_storage, detected_results, sequence) + + assert "Barcode" in detected_results + result = detected_results["Barcode"] + assert result.seq == "AAAACCCC" + assert result.start == 10 + assert result.end == 17 + + def test_process_duplicated_one_succeeds(self): + """Test process_duplicated_elements when one copy corrects, other fails.""" + structure = self._create_dupl_structure() + extractor = UniversalSingleMoleculeExtractor(structure) + + # Copy 1: AAAACCCC (exact match), Copy 2: totally wrong sequence + sequence = "XXXXXXXXXX" + "AAAACCCC" + "XXXXXXXX" + "TTTTTTTT" + "XXXXXXXX" + dupl_storage = {"Barcode": [(10, 17), (26, 33)]} + detected_results = {} + + extractor.process_duplicated_elements(dupl_storage, detected_results, sequence) + + assert "Barcode" in detected_results + result = detected_results["Barcode"] + assert result.seq == "AAAACCCC" + + def test_process_duplicated_both_fail(self): + """Test process_duplicated_elements when neither copy corrects.""" + structure = self._create_dupl_structure() + extractor = UniversalSingleMoleculeExtractor(structure) + + # Both copies have garbage sequences that don't match any whitelist entry + sequence = "XXXXXXXXXX" + "TTTTTTTT" + "XXXXXXXX" + "TTTTTTTT" + "XXXXXXXX" + dupl_storage = {"Barcode": [(10, 17), (26, 33)]} + detected_results = {} + + extractor.process_duplicated_elements(dupl_storage, detected_results, sequence) + + assert "Barcode" in detected_results + assert detected_results["Barcode"].seq == ExtractionResult.NOSEQ + assert detected_results["Barcode"].start == -1 + + def test_process_duplicated_disagree(self): + """Test process_duplicated_elements when copies correct to different barcodes.""" + structure = self._create_dupl_structure() + extractor = UniversalSingleMoleculeExtractor(structure) + + # Copy 1: AAAACCCC, Copy 2: GGGGTTTT (both valid but different) + sequence = "XXXXXXXXXX" + "AAAACCCC" + "XXXXXXXX" + "GGGGTTTT" + "XXXXXXXX" + dupl_storage = {"Barcode": [(10, 17), (26, 33)]} + detected_results = {} + + extractor.process_duplicated_elements(dupl_storage, detected_results, sequence) + + assert "Barcode" in detected_results + assert detected_results["Barcode"].seq == ExtractionResult.NOSEQ + assert detected_results["Barcode"].start == -1 + + def test_process_duplicated_none_detected(self): + """Test process_duplicated_elements when no copies are detected.""" + structure = self._create_dupl_structure() + extractor = UniversalSingleMoleculeExtractor(structure) + + dupl_storage = {"Barcode": [(-1, -1), (-1, -1)]} + detected_results = {} + + extractor.process_duplicated_elements(dupl_storage, detected_results, "AAAA" * 20) + + assert "Barcode" in detected_results + assert detected_results["Barcode"].seq == ExtractionResult.NOSEQ + assert detected_results["Barcode"].start == -1 + + def test_process_duplicated_one_detected(self): + """Test process_duplicated_elements when only one copy is detected.""" + structure = self._create_dupl_structure() + extractor = UniversalSingleMoleculeExtractor(structure) + + # Only copy 1 detected with AAAACCCC + sequence = "XXXXXXXXXX" + "AAAACCCC" + "XXXXXXXXXXXXXXXX" + dupl_storage = {"Barcode": [(10, 17), (-1, -1)]} + detected_results = {} + + extractor.process_duplicated_elements(dupl_storage, detected_results, sequence) + + assert "Barcode" in detected_results + result = detected_results["Barcode"] + assert result.seq == "AAAACCCC" + assert result.start == 10 + + def test_process_duplicated_no_correction_match(self): + """Test process_duplicated_elements without correction when copies match.""" + structure = self._create_dupl_structure() + extractor = UniversalSingleMoleculeExtractor(structure) + extractor.correct_sequences = False + + sequence = "XXXXXXXXXX" + "ACGTACGT" + "XXXXXXXX" + "ACGTACGT" + "XXXXXXXX" + dupl_storage = {"Barcode": [(10, 17), (26, 33)]} + detected_results = {} + + extractor.process_duplicated_elements(dupl_storage, detected_results, sequence) + + assert "Barcode" in detected_results + result = detected_results["Barcode"] + assert result.seq == "ACGTACGT" + assert result.start == 10 + + def test_process_duplicated_no_correction_differ(self): + """Test process_duplicated_elements without correction when copies differ.""" + structure = self._create_dupl_structure() + extractor = UniversalSingleMoleculeExtractor(structure) + extractor.correct_sequences = False + + sequence = "XXXXXXXXXX" + "ACGTACGT" + "XXXXXXXX" + "TGCATGCA" + "XXXXXXXX" + dupl_storage = {"Barcode": [(10, 17), (26, 33)]} + detected_results = {} + + extractor.process_duplicated_elements(dupl_storage, detected_results, sequence) + + assert "Barcode" in detected_results + result = detected_results["Barcode"] + # Should be comma-separated when they differ + assert result.seq == "ACGTACGT,TGCATGCA" + assert result.start == 10 + + def test_process_duplicated_no_correction_one_detected(self): + """Test process_duplicated_elements without correction, one copy detected.""" + structure = self._create_dupl_structure() + extractor = UniversalSingleMoleculeExtractor(structure) + extractor.correct_sequences = False + + sequence = "XXXXXXXXXX" + "ACGTACGT" + "XXXXXXXXXXXXXXXX" + dupl_storage = {"Barcode": [(10, 17), (-1, -1)]} + detected_results = {} + + extractor.process_duplicated_elements(dupl_storage, detected_results, sequence) + + assert "Barcode" in detected_results + result = detected_results["Barcode"] + assert result.seq == "ACGTACGT" + assert result.start == 10 + + def test_dupl_extraction_result_get_barcode(self): + """Test ExtractionResult.get_barcode() with duplicated element result.""" + structure = self._create_dupl_structure() + detected = {"Barcode": DetectedElement(10, 17, 14, "AAAACCCC")} + result = ExtractionResult(structure, "read_001", "+", detected) + + assert result.get_barcode() == "AAAACCCC" + assert result.has_barcode() is True + assert result.is_valid() is True + + def test_dupl_additional_attributes_exclude_barcode(self): + """Test that duplicated barcode is excluded from additional attributes.""" + structure = self._create_dupl_structure() + detected = { + "Barcode": DetectedElement(10, 17, 14, "AAAACCCC"), + "PolyT": DetectedElement(26, 40, 0) + } + result = ExtractionResult(structure, "read_001", "+", detected) + attrs = result.get_additional_attributes() + + assert "Barcode detected" not in attrs + assert "Barcode/1 detected" not in attrs + assert "Barcode/2 detected" not in attrs + assert "PolyT detected" in attrs + + +if __name__ == '__main__': + pytest.main([__file__, '-v']) diff --git a/tests/universal_data/10x.mdf b/tests/universal_data/10x.mdf new file mode 100644 index 00000000..b14e4004 --- /dev/null +++ b/tests/universal_data/10x.mdf @@ -0,0 +1,5 @@ +R1:Barcode:UMI:PolyT:cDNA:TSO +R1 CONST CTACACGACGCTCTTCCGATCT +Barcode VAR_FILE barcodes_small.tsv +UMI VAR_ANY 12 +TSO CONST CCCATGTACTCTGCGTTGATACCACTGCTT diff --git a/tests/universal_data/barcodes_small.tsv b/tests/universal_data/barcodes_small.tsv new file mode 100644 index 00000000..3fd4bd57 --- /dev/null +++ b/tests/universal_data/barcodes_small.tsv @@ -0,0 +1,8 @@ +AAACCCGGGTTTAAAC +TTTGGGCCCAAATTTG +GGGGAAAACCCCTTTT +ATCCTTAGTGTTTGTC +CTTAGGAGTGGTTAGC +TGAGGGCCAACGTGCT +GCGCTTGCATAGCAGG +ATCATGTCATTGTGAT diff --git a/tests/universal_data/simple.mdf b/tests/universal_data/simple.mdf new file mode 100644 index 00000000..f1ce3ecb --- /dev/null +++ b/tests/universal_data/simple.mdf @@ -0,0 +1,3 @@ +Barcode:UMI:PolyT:cDNA +Barcode VAR_LIST AAACCCGGGTTTAAAC,TTTGGGCCCAAATTTG,GGGGAAAACCCCTTTT +UMI VAR_ANY 10 diff --git a/tests/universal_data/test_reads.fq b/tests/universal_data/test_reads.fq new file mode 100644 index 00000000..281eb1e2 --- /dev/null +++ b/tests/universal_data/test_reads.fq @@ -0,0 +1,16 @@ +@READ_1_ATCCTTAGTGTTTGTC_AACCTTAGACCG_forward +TACACGACGCTCTTCCGATCTATCCTTAGTGTTTGTCAACCTTAGACCGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTGGGCCTCAGCGTAATCCTATTGTGCATGAATTTTATGTCATCACTTGGCTTCAACATTGGGTTGTCATCATTGAACCCGCCCGGTTGCGTTGATACAATGCTT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@READ_2_CTTAGGAGTGGTTAGC_GAGCCTCCTTGA_reverse +AAGCAGTGGTATCAACGCAGAGTACATGGGAAGCAAGGAGGCAAGGCCCGCGCCAAGGCCAAGTCGCGGTCTTCCCGGGCCCGGGCTACCTATTCCCGGTGGGGCGTGTGCACCGGCTGCTGCGCAAGGGCAACTACGCGGAGCGTGTGGGCGCCGGCGCGCCGGTATACATGGCGGCGGTGCTGGAGTACCTAACGGCCGAGATCCTGGAGCTGGCGGGCAACGCGGCCCGCGACAACAAGAAGACTCGCATCATCCCGCGCCACCTGCAGCTGGCCATCCGCAACGACGAGGAGCTCAACAAGCTGCTGGGCAAAGTGACGATCGCGCAGGGCGGCGTCCTGCCCAACATCCAGGCCGTGCTGCTGCCCAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATCAAGGAGGCTCGCTAACCACTCCTAAGAGATCGGAAGAGCGTCGTGTA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@READ_3_TGAGGGCCAACGTGCT_CAAAGCCTTCCC_forward +TACACGACGCTCTTCCGATCTTGAGGGCCAACGTGCTCAAAGCCTTCCCTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTGAAGAGCAGGACACAGGAAGATGGACTCTGAGATCCAGCAGGAAGCGGAGAAGGCACTGCAGCAAAGGCTCTGCTCTGGAAGCAAAGGGAAGCAGCTTCCAAGAGAGCAGGGCAGGAGGAAGCCGACTGAGGACTGCTGTGCCGCAGGCGAGGCTTAAAGCCCAGAAAGCAATATGCAAGGTAGAAGAGACCAATGTGTACCGCCGAGAAGCTCAACGGAGGCACCTTGTGCCATGGTCTCAACTTAATCTAGGATGTTTCTGAAGCCGCCCAGACTCAACAAACTTGTCTACAAACTTTGTTTCTATTTGTAATGCTTAATCTTGTCTTCTTATACTTTTCAAATGTTAAGGTTTGTGCATAGGCAAGACTTTTTATATCAAAAGGACAGGAGTCTCTGTATACCCTGCACATGTATGTGAGCCTCTGTACCCTGCGGTACACATGTATGTGTGGCTCTGTACCCTGCAGTGCACACGTGTGTGTGGCTCTGTACCCTGCAGTGCACATGTGTGTGTGGCTCTGTACCCTGGCAGTGCAGAATGGCGGTGTGCGATGTACCCAGCATGCACTATTGTGTGTGGCCTCTAGAAGATCTGATGAAATGAGCATTCTCACCCTGCTTCCAGCTACATTATGATTTAAAAATATTTTTACATTTATTTGTATGTACTTGTATTACTGCATGAAAACTACTGCCACATTTGATCAGAGGACATGTACTTGTATCACAGCATGAAAACTACTGCAACATTTGATCAGAGGACATGTGTGTGTGTGTGCATGTGCGCTTGCATGTTTGTATCCACAGTGTATGTGCAGAAGTCAAAAGACAACTTGCACAAGTCAGCTCTATCTTTCTTTCTACTACATAGGGTCCAGGGGTAAAATTCAGGTCCTCAAATTTGCTGGCGAGCACCTATACTCGAATTGGACCACCTCACTTGGGACTTCCTAAAGACATGCTTAGACACTTACACCCCCGGATCAACATCTACACTAAAAGAATAGAAAGGCCATATTGACATCAACTAATCAGTTCTCCCATTATGACTCACACAAATCCTCAAGTGGATTACAGGATTAGGAGATCTGAATTTTATCTTTCTCCGACCCACTCTACCGAGTTTCCACACTTACCCTGCTCTGGTGGTAGAAGGCTCCCTAGCATATAGCCCAAGAGGCCACTCTTGGAGGAAAAGATTAGTTACAGTCAGTGAACCCCAGGAGAGAAAGTTCCTACATACTCACCCTCCCTGGGATAGCCTTCTCTCCCTGCACACAAAGCCCACCAACTATCCTGCAGCATTTCCTCCCTAAGTTACCTGTCTCCTCCTAATTACTCTCTTCCCAGTTTATTCATACATTCCCCTACAGAAGCTAGGAGGGCTGGGATTTCTGGCTGGCCTGCCCACTGCTGAAGTCCTTGGGAGTTCGTTGGAACATTGGGGTGCAATGTCAACAGAACAGTTTATTTGTCTGTATTCTTTCCATTGCACCCCATCTTCTCTATCACAATAGGTGTTTTTTCTTTCAAAGTAGCTACCACTGGACTCTTTCAAAATCTATCACAAAGATGGCCAGTTCTGCTGTGGGTTGCTAAGTCTGTCGTGTACCCTCAGAGTCTGCCTTATAAGGACTTGACAGAAATTATTTTTAACAACATTCTGTTGTGATATTATATGCAAGGCAGACCACATCTTCATGTATACATTCTGTTAAAACAACTAACCCTAAAGGCCAGGCATGGCCAGATGTTCATAGTCATTTGTGCATCTAGCCCTTGGCCATCCTGTGGTTCATAACTGAAGAAGACAAGACATGACTTGCTTCCCCCTAGACCATTGTGGCTGCTAAGGCCTACTTGGTGCTTGTAAAAATGAATCTCATTTTATTAATTGTGTGACTAAAGGTAAGTCACATAGTCCATCAGACCTTCATGTCTATCAACTGGAAAACAGTATCAATAGTAATACATTAGTGCTATAAGCCATTAGTCACTCTATATAAAACAAGCTCCTAAATGGTACTAAGTGAATAGAAGCTGTAATTATAGCTAGCATGATGACTGGTACATTTAATTGTAGGCTTTTGTCCCTATATTGTTTTGTAACAGGTTTTAGTTTATGCATCTAGACTTAGTTTGTCCAATTAAATCTTGAGTGTAACAACTGCAGAAAGTCTCCATGTTGCAACCACTTTGCAACATGCCTCTGAATGCTGTCTTCACCAGGTAATTCCCTTGGCTAATAGTTCTTAGGTTCATACGATATGCCAGACACAGTCTCTGTTTTGAAGGAGCTGCAGATCTCAAGAGTGAAAGGAAAGTGTCCACATGGATCTGGGGTGCATATACTCCCATGCTCCAACACAGCTGAGAAGATGGACTTTCTAACATTTACCACTTCAAATCCCACGGGAAAAGCACGAATGTATGAGCCACAGATTCTAAATAAAGAACTTAGACTATCTAACACGGAGTCAAGAAATAGGTGGAAGAAGGAAGATAAAGTTTCACCTCCTCAAGTAACGAATCCCATACCAGTCTGTTTGTAGATAGAACTGGTTAGAGTTAGCATCTTCCGTCCCCCTCGTTGGCTTGAGCTGGGGCAACCTTGGGTCTTTGGGTGGGTCCCATGTACTCTGCGTTGATACCACTGCTT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@READ_4_UNKNOWN_BARCODE_UNKNOWNUMI_random +ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII