From 45a259648dacd135ececb88d21ca89dc3eceded1 Mon Sep 17 00:00:00 2001 From: Charlie Date: Mon, 26 Jan 2026 12:55:39 -0500 Subject: [PATCH 1/2] Move standard references to TSV data --- machine_types.tsv | 8 + pyproject.toml | 29 ++-- sample_registry/app.py | 147 ++++++++--------- sample_registry/data/__init__.py | 1 + sample_registry/data/machine_types.tsv | 8 + .../data/standard_host_species.tsv | 13 ++ .../data/standard_sample_types.tsv | 73 +++++++++ sample_registry/db.py | 77 --------- sample_registry/models.py | 19 --- sample_registry/register.py | 47 ------ sample_registry/registrar.py | 54 +------ sample_registry/standards.py | 149 ++++++++++++++++++ tests/test_register.py | 79 ---------- tests/test_registrar.py | 38 +---- tests/test_standards.py | 25 +++ 15 files changed, 376 insertions(+), 391 deletions(-) create mode 100644 machine_types.tsv create mode 100644 sample_registry/data/__init__.py create mode 100644 sample_registry/data/machine_types.tsv create mode 100644 sample_registry/data/standard_host_species.tsv create mode 100644 sample_registry/data/standard_sample_types.tsv create mode 100644 sample_registry/standards.py create mode 100644 tests/test_standards.py diff --git a/machine_types.tsv b/machine_types.tsv new file mode 100644 index 0000000..140d633 --- /dev/null +++ b/machine_types.tsv @@ -0,0 +1,8 @@ +prefix machine_type +VH Illumina-NextSeq +D Illumina-HiSeq +M Illumina-MiSeq +A Illumina-NovaSeq +NB Illumina-MiniSeq +LH Illumina-NovaSeqX +SH Illumina-MiSeq diff --git a/pyproject.toml b/pyproject.toml index 7b69390..07684ea 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -54,23 +54,22 @@ register_run_file = "sample_registry.register:register_illumina_file" unregister_samples = "sample_registry.register:unregister_samples" register_samples = "sample_registry.register:register_samples" modify_sample = "sample_registry.register:modify_sample" -register_annotations = "sample_registry.register:register_annotations" -modify_annotation = "sample_registry.register:modify_annotation" -register_host_species = "sample_registry.register:register_host_species" -register_sample_types = "sample_registry.register:register_sample_types" -export_samples = "sample_registry.export:export_samples" -create_test_db = "sample_registry.db:create_test_db" -sample_registry_version = "sample_registry:sample_registry_version" +register_annotations = "sample_registry.register:register_annotations" +modify_annotation = "sample_registry.register:modify_annotation" +export_samples = "sample_registry.export:export_samples" +create_test_db = "sample_registry.db:create_test_db" +sample_registry_version = "sample_registry:sample_registry_version" -[tool.setuptools] -packages = ["sample_registry"] +[tool.setuptools] +packages = ["sample_registry", "sample_registry.data"] -[tool.setuptools.package-data] -"sample_registry" = [ - "templates/*.html", - "static/*", - "static/img/*", -] +[tool.setuptools.package-data] +"sample_registry" = [ + "templates/*.html", + "static/*", + "static/img/*", + "data/*.tsv", +] [build-system] requires = ["setuptools>=61.0.0", "wheel"] diff --git a/sample_registry/app.py b/sample_registry/app.py index 966201d..bc9ea72 100644 --- a/sample_registry/app.py +++ b/sample_registry/app.py @@ -18,15 +18,9 @@ from io import StringIO from pathlib import Path from sample_registry import ARCHIVE_ROOT, SQLALCHEMY_DATABASE_URI -from sample_registry.models import ( - Base, - Annotation, - Run, - Sample, - StandardHostSpecies, - StandardSampleType, -) -from sample_registry.db import run_to_dataframe, query_tag_stats, STANDARD_TAGS +from sample_registry.models import Base, Annotation, Run, Sample +from sample_registry.db import run_to_dataframe, query_tag_stats, STANDARD_TAGS +from sample_registry.standards import STANDARD_HOST_SPECIES, STANDARD_SAMPLE_TYPES from werkzeug.middleware.proxy_fix import ProxyFix app = Flask(__name__) @@ -191,39 +185,46 @@ def show_runs(run_acc=None): @app.route("/stats") -def show_stats(): - num_samples = db.session.query(Sample).count() - num_samples_with_sampletype = ( - db.session.query(Sample).filter(Sample.sample_type is not None).count() - ) - num_samples_with_standard_sampletype = ( - db.session.query(Sample) - .join(StandardSampleType, Sample.sample_type == StandardSampleType.sample_type) - .count() - ) - standard_sampletype_counts = ( - db.session.query( - StandardSampleType.sample_type, - db.func.count(Sample.sample_accession), - StandardSampleType.host_associated, - ) - .join(Sample, Sample.sample_type == StandardSampleType.sample_type) - .group_by(StandardSampleType.sample_type) - .order_by( - db.func.count(Sample.sample_accession).desc(), - StandardSampleType.sample_type, - ) - .all() - ) - standard_sampletypes = set( - s.sample_type for s in db.session.query(StandardSampleType.sample_type).all() - ) - nonstandard_sampletype_counts = ( - db.session.query(Sample.sample_type, db.func.count(Sample.sample_accession)) - .filter(Sample.sample_type.notin_(standard_sampletypes)) - .group_by(Sample.sample_type) - .order_by(db.func.count(Sample.sample_accession).desc(), Sample.sample_type) - ) +def show_stats(): + standard_sampletypes = set(STANDARD_SAMPLE_TYPES.names()) + standard_hostspecies = set(STANDARD_HOST_SPECIES.names()) + + num_samples = db.session.query(Sample).count() + num_samples_with_sampletype = ( + db.session.query(Sample).filter(Sample.sample_type is not None).count() + ) + num_samples_with_standard_sampletype = ( + db.session.query(Sample) + .filter(Sample.sample_type.in_(standard_sampletypes)) + .count() + if standard_sampletypes + else 0 + ) + standard_sampletype_counts = ( + db.session.query(Sample.sample_type, db.func.count(Sample.sample_accession)) + .filter(Sample.sample_type.in_(standard_sampletypes)) + .group_by(Sample.sample_type) + .order_by(db.func.count(Sample.sample_accession).desc(), Sample.sample_type) + .all() + if standard_sampletypes + else [] + ) + nonstandard_sampletype_counts = ( + db.session.query(Sample.sample_type, db.func.count(Sample.sample_accession)) + .filter( + Sample.sample_type.isnot(None), + Sample.sample_type.notin_(standard_sampletypes), + ) + .group_by(Sample.sample_type) + .order_by(db.func.count(Sample.sample_accession).desc(), Sample.sample_type) + .all() + if standard_sampletypes + else db.session.query(Sample.sample_type, db.func.count(Sample.sample_accession)) + .filter(Sample.sample_type.isnot(None)) + .group_by(Sample.sample_type) + .order_by(db.func.count(Sample.sample_accession).desc(), Sample.sample_type) + .all() + ) num_subjectid = ( db.session.query(Sample.subject_id) @@ -239,36 +240,38 @@ def show_stats(): num_samples_with_hostspecies = ( db.session.query(Sample).filter(Sample.host_species is not None).count() ) - num_samples_with_standard_hostspecies = ( - db.session.query(Sample) - .join( - StandardHostSpecies, Sample.host_species == StandardHostSpecies.host_species - ) - .count() - ) - standard_hostspecies_counts = ( - db.session.query( - StandardHostSpecies.host_species, - db.func.count(Sample.sample_accession), - StandardHostSpecies.ncbi_taxon_id, - ) - .join(Sample, Sample.host_species == StandardHostSpecies.host_species) - .group_by(StandardHostSpecies.host_species) - .order_by( - db.func.count(Sample.sample_accession).desc(), - StandardHostSpecies.host_species, - ) - .all() - ) - standard_hostspecies = set( - s.host_species for s in db.session.query(StandardHostSpecies.host_species).all() - ) - nonstandard_hostspecies_counts = ( - db.session.query(Sample.host_species, db.func.count(Sample.sample_accession)) - .filter(Sample.host_species.notin_(standard_hostspecies)) - .group_by(Sample.host_species) - .order_by(db.func.count(Sample.sample_accession).desc(), Sample.host_species) - ) + num_samples_with_standard_hostspecies = ( + db.session.query(Sample) + .filter(Sample.host_species.in_(standard_hostspecies)) + .count() + if standard_hostspecies + else 0 + ) + standard_hostspecies_counts = ( + db.session.query(Sample.host_species, db.func.count(Sample.sample_accession)) + .filter(Sample.host_species.in_(standard_hostspecies)) + .group_by(Sample.host_species) + .order_by(db.func.count(Sample.sample_accession).desc(), Sample.host_species) + .all() + if standard_hostspecies + else [] + ) + nonstandard_hostspecies_counts = ( + db.session.query(Sample.host_species, db.func.count(Sample.sample_accession)) + .filter( + Sample.host_species.isnot(None), + Sample.host_species.notin_(standard_hostspecies), + ) + .group_by(Sample.host_species) + .order_by(db.func.count(Sample.sample_accession).desc(), Sample.host_species) + .all() + if standard_hostspecies + else db.session.query(Sample.host_species, db.func.count(Sample.sample_accession)) + .filter(Sample.host_species.isnot(None)) + .group_by(Sample.host_species) + .order_by(db.func.count(Sample.sample_accession).desc(), Sample.host_species) + .all() + ) num_samples_with_primer = ( db.session.query(Sample).filter(Sample.primer_sequence != "").count() diff --git a/sample_registry/data/__init__.py b/sample_registry/data/__init__.py new file mode 100644 index 0000000..9c34099 --- /dev/null +++ b/sample_registry/data/__init__.py @@ -0,0 +1 @@ +"""Package data for SampleRegistry reference tables.""" diff --git a/sample_registry/data/machine_types.tsv b/sample_registry/data/machine_types.tsv new file mode 100644 index 0000000..140d633 --- /dev/null +++ b/sample_registry/data/machine_types.tsv @@ -0,0 +1,8 @@ +prefix machine_type +VH Illumina-NextSeq +D Illumina-HiSeq +M Illumina-MiSeq +A Illumina-NovaSeq +NB Illumina-MiniSeq +LH Illumina-NovaSeqX +SH Illumina-MiSeq diff --git a/sample_registry/data/standard_host_species.tsv b/sample_registry/data/standard_host_species.tsv new file mode 100644 index 0000000..dc4c9e0 --- /dev/null +++ b/sample_registry/data/standard_host_species.tsv @@ -0,0 +1,13 @@ +host_species scientific_name ncbi_taxid +Dog Canis lupus familiaris 9615 +Fruit fly Drosophila melanogaster 7227 +Human Homo sapiens 9606 +Mouse Mus musculus 10090 +Naked mole rat Heterocephalus glaber 10181 +Pig Sus scrofa domesticus 9825 +Pigeon Columba livia 8932 +Rabbit Oryctolagus cuniculus 9986 +Rat Rattus norvegicus 10116 +Rhesus macaque Macaca mulatta 9544 +Cow Bos taurus 9913 +Sheep Ovis aries 9940 diff --git a/sample_registry/data/standard_sample_types.tsv b/sample_registry/data/standard_sample_types.tsv new file mode 100644 index 0000000..ed59c06 --- /dev/null +++ b/sample_registry/data/standard_sample_types.tsv @@ -0,0 +1,73 @@ +sample_type rarity host_associated description +Amnoitic fluid Rare 1 NA +BAL Uncommon 1 NA +Bedding Uncommon 0 NA +Biofilm Rare 1 NA +Bioreactor Uncommon 0 NA +Blank swab Common 0 Swab taken out of packaging in sequencing lab immediately before extraction. +Blood Uncommon 1 NA +Breast milk Rare 1 NA +Buffer Uncommon 0 NA +Cecum Uncommon 1 NA +Cell lysate Uncommon 1 NA +Cervical swab Rare 1 NA +Cheek swab Uncommon 1 NA +Crop Rare 1 NA +Dental plaque Uncommon 1 NA +Duodenum Rare 1 NA +Dust Uncommon 0 NA +Elution buffer Common 0 NA +Empty well Common 0 NA +Endometrial swab Rare 1 NA +Environmental control Common 0 Includes Air swab, Environmental swab, Environmental blank +Esophageal biopsy Rare 1 NA +Esophagus Rare 1 NA +Feces Common 1 Human and animal fecal material. +Feed Uncommon 0 NA +Fistula Common 1 NA +Fistula swab Uncommon 1 NA +Fly food Rare 0 NA +Fruit fly Rare 1 NA +Ileostomy fluid Uncommon 1 NA +Ileum Uncommon 1 NA +Kveim reagent Uncommon 0 NA +Lab water Uncommon 0 NA +Macular Retina Rare 1 NA +Meconium Rare 1 NA +Medium Rare 0 NA +Microbial culture Common 0 NA +Mock DNA Common 0 NA +Mouse chow Rare 0 NA +Nasal swab Common 1 NA +Nasopharyngeal swab Uncommon 1 NA +Oral swab Common 1 NA +Oral wash Uncommon 1 NA +Oropharyngeal swab Uncommon 1 NA +Ostomy fluid Uncommon 1 NA +Pancreatic fluid Rare 1 NA +PCR water Uncommon 0 NA +Peripheral retina Rare 1 NA +Placenta Rare 1 NA +Plasma Uncommon 1 NA +Rectal biopsy Common 1 NA +Rectal swab Common 1 We have observed that results are sensitive to exact collection method employed, please include notes in publication. +Saline Uncommon 0 NA +Saliva Uncommon 1 NA +Sediment Uncommon 0 NA +Serum Uncommon 1 NA +Skin swab Common 1 NA +Small intestine Uncommon 1 NA +Soil Rare 1 NA +Sputum Common 1 NA +Surface swab Common 0 NA +Tongue swab Common 1 NA +Tonsil Rare 1 NA +Tracheal aspirate Common 1 NA +Tracheal control Uncommon 1 NA +Urethral swab Rare 1 NA +Urine Uncommon 1 NA +Water Uncommon 0 NA +Weighing paper Common 0 NA +Whole gut Uncommon 1 From dissection. +Large intestine mucosa Common 1 "NA" +Large intestine lumen Common 1 "NA" diff --git a/sample_registry/db.py b/sample_registry/db.py index 6322348..b55a28b 100644 --- a/sample_registry/db.py +++ b/sample_registry/db.py @@ -1,4 +1,3 @@ -import csv import sys from typing import Optional from flask_sqlalchemy import SQLAlchemy @@ -9,8 +8,6 @@ Run, Sample, Annotation, - StandardSampleType, - StandardHostSpecies, ) STANDARD_TAGS = { @@ -125,83 +122,9 @@ def create_test_db(session: Optional[sessionmaker] = None): ] session.bulk_save_objects(annotations) - try: - init_standard_sample_types(session) - except FileNotFoundError: - session.add( - StandardSampleType( - sample_type="Stool", - rarity="Uncommon", - host_associated=True, - comment="Poo", - ) - ) - session.add( - StandardSampleType( - sample_type="Blood", - rarity="Common", - host_associated=True, - comment="Red stuff", - ) - ) - - try: - init_standard_host_species(session) - except FileNotFoundError: - session.add( - StandardHostSpecies( - host_species="Human", scientific_name="Person", ncbi_taxon_id=1 - ) - ) - session.add( - StandardHostSpecies( - host_species="Mouse", scientific_name="FurryLittleDude", ncbi_taxon_id=2 - ) - ) - session.commit() -def init_standard_sample_types(session: sessionmaker): - with open("standard_sample_types.tsv", "r") as file: - reader = csv.reader(file, delimiter="\t") - next(reader) # Skip header row - sample_types = [] - for row in reader: - sample_type = row[0] - rarity = row[1] - host_associated = bool(row[2]) - comment = row[3] - sample_types.append( - StandardSampleType( - sample_type=sample_type, - rarity=rarity, - host_associated=host_associated, - comment=comment, - ) - ) - session.bulk_save_objects(sample_types) - - -def init_standard_host_species(session: sessionmaker): - with open("standard_host_species.tsv", "r") as file: - reader = csv.reader(file, delimiter="\t") - next(reader) # Skip header row - host_species_list = [] - for row in reader: - host_species = row[0] - scientific_name = row[1] - ncbi_taxon_id = row[2] - host_species_list.append( - StandardHostSpecies( - host_species=host_species, - scientific_name=scientific_name, - ncbi_taxon_id=ncbi_taxon_id, - ) - ) - session.bulk_save_objects(host_species_list) - - def query_tag_stats(db: SQLAlchemy, tag: str) -> list[dict]: if tag in STANDARD_TAGS.keys(): return ( diff --git a/sample_registry/models.py b/sample_registry/models.py index 37d5c14..8b4c9df 100644 --- a/sample_registry/models.py +++ b/sample_registry/models.py @@ -49,22 +49,3 @@ def __repr__(self): return f"Annotation(sample_accession={self.sample_accession}, key={self.key}, val={self.val})" -class StandardSampleType(Base): - __tablename__ = "standard_sample_types" - sample_type: Mapped[str] = mapped_column(primary_key=True) - rarity: Mapped[str] - host_associated: Mapped[bool] - comment: Mapped[Optional[str]] = mapped_column(nullable=True) - - def __repr__(self): - return f"StandardSampleType(sample_type={self.sample_type}, rarity={self.rarity}, host_associated={self.host_associated}, comment={self.comment})" - - -class StandardHostSpecies(Base): - __tablename__ = "standard_host_species" - host_species: Mapped[str] = mapped_column(primary_key=True) - scientific_name: Mapped[str] - ncbi_taxon_id: Mapped[int] - - def __repr__(self): - return f"StandardHostSpecies(host_species={self.host_species}, scientific_name={self.scientific_name}, ncbi_taxon_id={self.ncbi_taxon_id})" diff --git a/sample_registry/register.py b/sample_registry/register.py index 1530470..66fe790 100644 --- a/sample_registry/register.py +++ b/sample_registry/register.py @@ -4,7 +4,6 @@ import sys import gzip from sqlalchemy.orm import Session -from typing import Generator from sample_registry.mapping import SampleTable from sample_registry.registrar import SampleRegistry from seqBackupLib.illumina import IlluminaFastq @@ -87,52 +86,6 @@ def register_sample_annotations( registry.session.commit() -def parse_tsv_ncol(f, ncol: int) -> Generator[tuple[str], None, None]: - assert ncol > 0 - # Skip header - next(f) - for line in f: - line = line.rstrip("\n") - if line.startswith("#"): - continue - if not line.strip(): - continue - vals = line.split("\t") - if len(vals) < ncol: - raise ValueError("Each line must contain at least {0} fields".format(ncol)) - yield tuple(vals[:ncol]) - - -def register_sample_types(argv=None, session: Session = None): - p = argparse.ArgumentParser( - description=("Update the list of standard sample types in the registry") - ) - p.add_argument("file", type=argparse.FileType("r")) - args = p.parse_args(argv) - - registry = SampleRegistry(session) - sample_types = list(parse_tsv_ncol(args.file, 4)) - registry.remove_standard_sample_types() - registry.register_standard_sample_types(sample_types) - - registry.session.commit() - - -def register_host_species(argv=None, session: Session = None): - p = argparse.ArgumentParser( - description=("Update the list of standard host species in the registry") - ) - p.add_argument("file", type=argparse.FileType("r")) - args = p.parse_args(argv) - - registry = SampleRegistry(session) - host_species = list(parse_tsv_ncol(args.file, 3)) - registry.remove_standard_host_species() - registry.register_standard_host_species(host_species) - - registry.session.commit() - - def register_illumina_file(argv=None, session: Session = None, out=sys.stdout): p = argparse.ArgumentParser( description=("Add a new run to the registry from a gzipped Illumina FASTQ file") diff --git a/sample_registry/registrar.py b/sample_registry/registrar.py index a533d5e..e53aa15 100644 --- a/sample_registry/registrar.py +++ b/sample_registry/registrar.py @@ -3,18 +3,12 @@ from sqlalchemy.orm import Session, sessionmaker from sample_registry.db import STANDARD_TAGS from sample_registry.mapping import SampleTable -from sample_registry.models import ( - Annotation, - Sample, - StandardSampleType, - StandardHostSpecies, - Run, -) -from seqBackupLib.illumina import MACHINE_TYPES - - -class SampleRegistry: - machines = MACHINE_TYPES.values() +from sample_registry.models import Annotation, Sample, Run +from sample_registry.standards import MACHINE_TYPE_MAPPINGS + + +class SampleRegistry: + machines = MACHINE_TYPE_MAPPINGS.values() kits = ["Nextera XT"] def __init__(self, session: Optional[Session] = None, uri: Optional[str] = None): @@ -296,39 +290,3 @@ def modify_annotation(self, sample_accession: int, key: str, val: str): .values({"val": val}) ) - def remove_standard_sample_types(self): - self.session.execute(delete(StandardSampleType)) - - def register_standard_sample_types( - self, sample_types: list[tuple[str, str, bool, str]] - ): - self.session.execute( - insert(StandardSampleType).values( - [ - { - "sample_type": sample_type, - "rarity": rarity, - "host_associated": bool(host_associated), - "comment": comment, - } - for sample_type, rarity, host_associated, comment in sample_types - ] - ) - ) - - def remove_standard_host_species(self): - self.session.execute(delete(StandardHostSpecies)) - - def register_standard_host_species(self, host_species): - self.session.execute( - insert(StandardHostSpecies).values( - [ - { - "host_species": host_species, - "scientific_name": scientific_name, - "ncbi_taxon_id": int(ncbi_taxon_id), - } - for host_species, scientific_name, ncbi_taxon_id in host_species - ] - ) - ) diff --git a/sample_registry/standards.py b/sample_registry/standards.py new file mode 100644 index 0000000..525bcf2 --- /dev/null +++ b/sample_registry/standards.py @@ -0,0 +1,149 @@ +from __future__ import annotations + +import csv +from dataclasses import dataclass +from importlib import resources +from typing import Iterable + +DATA_PACKAGE = "sample_registry.data" + + +@dataclass(frozen=True) +class StandardSampleType: + sample_type: str + rarity: str + host_associated: bool + description: str + + +@dataclass(frozen=True) +class StandardHostSpecies: + host_species: str + scientific_name: str + ncbi_taxon_id: int + + +@dataclass(frozen=True) +class MachineType: + prefix: str + machine_type: str + + +def _parse_bool(value: str) -> bool: + return value.strip().lower() in {"1", "true", "t", "yes", "y"} + + +def _read_tsv_rows(filename: str, min_columns: int) -> list[list[str]]: + data_path = resources.files(DATA_PACKAGE) / filename + rows: list[list[str]] = [] + with resources.as_file(data_path) as path: + with path.open("r", encoding="utf-8") as handle: + reader = csv.reader(handle, delimiter="\t") + for row_index, row in enumerate(reader): + if row_index == 0: + continue + if not row: + continue + if row[0].startswith("#"): + continue + if len(row) < min_columns: + raise ValueError( + f"Expected at least {min_columns} columns in {filename}, got {len(row)}" + ) + rows.append(row) + return rows + + +class StandardSampleTypes: + def __init__(self, entries: Iterable[StandardSampleType]): + self._entries = list(entries) + self._by_sample_type = {entry.sample_type: entry for entry in self._entries} + + @classmethod + def load(cls) -> "StandardSampleTypes": + rows = _read_tsv_rows("standard_sample_types.tsv", 4) + entries = [ + StandardSampleType( + sample_type=row[0], + rarity=row[1], + host_associated=_parse_bool(row[2]), + description=row[3], + ) + for row in rows + ] + return cls(entries) + + def all(self) -> list[StandardSampleType]: + return list(self._entries) + + def get(self, sample_type: str) -> StandardSampleType | None: + return self._by_sample_type.get(sample_type) + + def names(self) -> list[str]: + return list(self._by_sample_type.keys()) + + def is_standard(self, sample_type: str) -> bool: + return sample_type in self._by_sample_type + + +class StandardHostSpeciesList: + def __init__(self, entries: Iterable[StandardHostSpecies]): + self._entries = list(entries) + self._by_host_species = {entry.host_species: entry for entry in self._entries} + + @classmethod + def load(cls) -> "StandardHostSpeciesList": + rows = _read_tsv_rows("standard_host_species.tsv", 3) + entries = [ + StandardHostSpecies( + host_species=row[0], + scientific_name=row[1], + ncbi_taxon_id=int(row[2]), + ) + for row in rows + ] + return cls(entries) + + def all(self) -> list[StandardHostSpecies]: + return list(self._entries) + + def get(self, host_species: str) -> StandardHostSpecies | None: + return self._by_host_species.get(host_species) + + def names(self) -> list[str]: + return list(self._by_host_species.keys()) + + def is_standard(self, host_species: str) -> bool: + return host_species in self._by_host_species + + +class MachineTypeMappings: + def __init__(self, entries: Iterable[MachineType]): + self._entries = list(entries) + self._by_prefix = {entry.prefix: entry.machine_type for entry in self._entries} + + @classmethod + def load(cls) -> "MachineTypeMappings": + rows = _read_tsv_rows("machine_types.tsv", 2) + entries = [MachineType(prefix=row[0], machine_type=row[1]) for row in rows] + return cls(entries) + + def all(self) -> list[MachineType]: + return list(self._entries) + + def get(self, prefix: str) -> str | None: + return self._by_prefix.get(prefix) + + def prefixes(self) -> list[str]: + return list(self._by_prefix.keys()) + + def values(self) -> list[str]: + return list(self._by_prefix.values()) + + def as_dict(self) -> dict[str, str]: + return dict(self._by_prefix) + + +STANDARD_SAMPLE_TYPES = StandardSampleTypes.load() +STANDARD_HOST_SPECIES = StandardHostSpeciesList.load() +MACHINE_TYPE_MAPPINGS = MachineTypeMappings.load() diff --git a/tests/test_register.py b/tests/test_register.py index 12ccfbf..579e2b5 100644 --- a/tests/test_register.py +++ b/tests/test_register.py @@ -13,16 +13,12 @@ Base, Run, Sample, - StandardHostSpecies, - StandardSampleType, ) from sample_registry.register import ( register_run, register_sample_annotations, unregister_samples, register_illumina_file, - register_sample_types, - register_host_species, ) samples = [ @@ -236,78 +232,3 @@ def test_unregister_samples(db, temp_sample_file): assert not db.scalar(select(Annotation).where(Annotation.sample_accession == 6)) assert not db.scalar(select(Annotation).where(Annotation.sample_accession == 7)) - -def test_register_sample_types(db): - f = tempfile.NamedTemporaryFile("wt") - f.write(SAMPLE_TYPES_TSV) - f.seek(0) - - register_sample_types([f.name], db) - assert db.scalars(select(StandardSampleType.sample_type)).all() == [ - "Colonic biopsy", - "Feces", - "Oral wash", - "Ostomy fluid", - "Rectal swab", - ] - - # Add a new sample type and re-register - new_line = "Extra type\tCommon\t1\tJust to test" - f2 = tempfile.NamedTemporaryFile("wt") - f2.write(SAMPLE_TYPES_TSV + new_line) - f2.seek(0) - - register_sample_types([f2.name], db) - assert db.scalars(select(StandardSampleType.sample_type)).all() == [ - "Colonic biopsy", - "Extra type", - "Feces", - "Oral wash", - "Ostomy fluid", - "Rectal swab", - ] - - -def test_register_host_species(db): - f = tempfile.NamedTemporaryFile("wt") - f.write(HOST_SPECIES_TSV) - f.seek(0) - - register_host_species([f.name], db) - assert db.scalars(select(StandardHostSpecies.host_species)).all() == [ - "Human", - "Mouse", - ] - - # Add a new host species and re-register - new_line = "Dog\tCanis lupus\t9615" - f2 = tempfile.NamedTemporaryFile("wt") - f2.write(HOST_SPECIES_TSV + new_line) - f2.seek(0) - - register_host_species([f2.name], db) - assert db.scalars(select(StandardHostSpecies.host_species)).all() == [ - "Dog", - "Human", - "Mouse", - ] - - -SAMPLE_TYPES_TSV = """\ -sample_type\trarity\thost_associated\tdescription - -# Gut -Feces\tCommon\t1\tHuman and animal fecal material.\tNA -Rectal swab\tCommon\t1\tResults are sensitive to collection method.\tNA -Ostomy fluid\tCommon\t1\tNA -Colonic biopsy\tCommon\t1\tNA - -# Oral -Oral wash\tCommon\t1\tNA -""" - -HOST_SPECIES_TSV = """\ -host_species\tscientific_name\tncbi_taxid -Human\tHomo sapiens\t9606 -Mouse\tMus musculus\t10090 -""" diff --git a/tests/test_registrar.py b/tests/test_registrar.py index 99960fc..4f525a0 100644 --- a/tests/test_registrar.py +++ b/tests/test_registrar.py @@ -5,12 +5,10 @@ from sample_registry.db import create_test_db from sample_registry.mapping import SampleTable from sample_registry.models import ( - Annotation, - Base, - Run, - Sample, - StandardHostSpecies, - StandardSampleType, + Annotation, + Base, + Run, + Sample, ) from sample_registry.registrar import SampleRegistry @@ -185,31 +183,3 @@ def test_modify_annotation(db): ) -def test_register_standard_sample_types(db): - registry = SampleRegistry(db) - registry.register_standard_sample_types([("type1", "common", False, "NA")]) - assert db.scalar( - select(StandardSampleType).where(StandardSampleType.sample_type == "type1") - ) - - -def test_remove_standard_sample_types(db): - registry = SampleRegistry(db) - registry.remove_standard_sample_types() - assert not db.scalar(select(StandardSampleType)) - - -def test_register_standard_host_species(db): - registry = SampleRegistry(db) - registry.register_standard_host_species([("species1", "Species specius", 1)]) - assert db.scalar( - select(StandardHostSpecies).where( - StandardHostSpecies.host_species == "species1" - ) - ) - - -def test_remove_standard_host_species(db): - registry = SampleRegistry(db) - registry.remove_standard_host_species() - assert not db.scalar(select(StandardHostSpecies)) diff --git a/tests/test_standards.py b/tests/test_standards.py new file mode 100644 index 0000000..8cf7569 --- /dev/null +++ b/tests/test_standards.py @@ -0,0 +1,25 @@ +from sample_registry.standards import ( + MACHINE_TYPE_MAPPINGS, + STANDARD_HOST_SPECIES, + STANDARD_SAMPLE_TYPES, +) + + +def test_standard_sample_types_loaded(): + feces = STANDARD_SAMPLE_TYPES.get("Feces") + assert feces is not None + assert feces.host_associated is True + assert "fecal" in feces.description.lower() + + +def test_standard_host_species_loaded(): + human = STANDARD_HOST_SPECIES.get("Human") + assert human is not None + assert human.scientific_name == "Homo sapiens" + assert human.ncbi_taxon_id == 9606 + + +def test_machine_type_mappings_loaded(): + assert MACHINE_TYPE_MAPPINGS.get("VH") == "Illumina-NextSeq" + assert MACHINE_TYPE_MAPPINGS.get("SH") == "Illumina-MiSeq" + assert "Illumina-NovaSeq" in MACHINE_TYPE_MAPPINGS.values() From 4b2e898d71f69ce5d6d0f339466556236c6cf0be Mon Sep 17 00:00:00 2001 From: Charlie Date: Mon, 26 Jan 2026 13:59:00 -0500 Subject: [PATCH 2/2] Update sample_registry/data/standard_sample_types.tsv Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- sample_registry/data/standard_sample_types.tsv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sample_registry/data/standard_sample_types.tsv b/sample_registry/data/standard_sample_types.tsv index ed59c06..8908cb9 100644 --- a/sample_registry/data/standard_sample_types.tsv +++ b/sample_registry/data/standard_sample_types.tsv @@ -1,5 +1,5 @@ sample_type rarity host_associated description -Amnoitic fluid Rare 1 NA +Amniotic fluid Rare 1 NA BAL Uncommon 1 NA Bedding Uncommon 0 NA Biofilm Rare 1 NA