diff --git a/pyproject.toml b/pyproject.toml index 11f2ab8..414fab5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -149,6 +149,7 @@ ignore = [ "tests/**/*.py" = ["S101", "T201"] # use of assert "**/__init__.py" = ["D104"] + [tool.ruff.lint.mccabe] # Flag errors (`C901`) whenever the complexity level exceeds 15. max-complexity = 15 @@ -160,7 +161,7 @@ convention = "google" requires = ["uv_build>=0.9.9,<0.10.0"] build-backend = "uv_build" -[tool.pytest] +[tool.pytest.ini_options] pythonpath = ["src"] log_cli = true log_cli_level = "INFO" diff --git a/src/cdm_data_loader_utils/parsers/annotation_parse.py b/src/cdm_data_loader_utils/parsers/annotation_parse.py new file mode 100644 index 0000000..d2a06d5 --- /dev/null +++ b/src/cdm_data_loader_utils/parsers/annotation_parse.py @@ -0,0 +1,446 @@ +""" + +RefSeq annotation parser for transforming NCBI Datasets API JSON into CDM-formatted Delta Lake tables. + +Usage: +PYTHONPATH=src python src/cdm_data_loader_utils/parsers/annotation_parse.py \ + --accession GCF_000869125.1 \ + --namespace refseq_api \ + --query + +""" + +from __future__ import annotations +import argparse +import json +from pathlib import Path +from typing import Optional + +import requests +from pyspark.sql import SparkSession +from pyspark.sql.types import StructType +from delta import configure_spark_with_delta_pip + +from cdm_data_loader_utils.parsers.kbase_cdm_pyspark import schema as cdm_schemas + + +# --------------------------------------------------------------------- +# Accession-based annotation fetch +# --------------------------------------------------------------------- +def fetch_annotation_json(accession: str) -> dict: + """Fetch annotation JSON from NCBI Datasets API.""" + url = f"https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/{accession}/annotation_report" + resp = requests.get(url, headers={"Accept": "application/json"}, timeout=60) + resp.raise_for_status() + return resp.json() + + +# --------------------------------------------------------------------- +# Spark initialization with Delta support +# --------------------------------------------------------------------- +def build_spark_session(app_name: str = "RefSeqAnnotationToCDM") -> SparkSession: + builder = ( + SparkSession.builder.appName(app_name) + .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") + .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") + .enableHiveSupport() + ) + return configure_spark_with_delta_pip(builder).getOrCreate() + + +def init_spark_and_db(app_name: str, database: str) -> SparkSession: + spark = build_spark_session(app_name) + spark.sql(f"CREATE DATABASE IF NOT EXISTS {database}") + spark.sql(f"USE {database}") + return spark + + +# --------------------------------------------------------------------- +# CDM PREFIX NORMALIZATION +# --------------------------------------------------------------------- +def apply_prefix(identifier: str | None) -> str | None: + if not identifier: + return None + + if identifier.startswith("GeneID:"): + return identifier.replace("GeneID:", "ncbigene:") + + if identifier.startswith(("YP_", "XP_", "WP_", "NP_", "NC_")): + return f"refseq:{identifier}" + + if identifier.startswith("GCF_"): + return f"insdc.gcf:{identifier}" + + return identifier + + +# --------------------------------------------------------------------- +# Safe integer conversion +# --------------------------------------------------------------------- +def to_int(val: str) -> int | None: + try: + return 
int(val) + except Exception: + return None + + +# --------------------------------------------------------------------- +# For repeat section markers +# --------------------------------------------------------------------- +def unique_annotations(data: dict): + seen = set() + for report in data.get("reports", []): + ann = report.get("annotation", {}) + gene_id = ann.get("gene_id") + if gene_id and gene_id not in seen: + seen.add(gene_id) + yield gene_id, ann + + +# --------------------------------------------------------------------- +# IDENTIFIERS +# --------------------------------------------------------------------- +def load_identifiers(data: dict) -> list[tuple[str, str, str, str, str | None]]: + """Extract Identifier table records.""" + out = [] + + for gene_id, ann in unique_annotations(data): + entity_id = f"ncbigene:{gene_id}" + out.append((entity_id, gene_id, ann.get("name"), "RefSeq", ann.get("relationship"))) + return list({tuple(row) for row in out}) # deduplicate + + +# --------------------------------------------------------------------- +# NAME EXTRACTION +# --------------------------------------------------------------------- +def load_names(data: dict) -> list[tuple[str, str, str, str]]: + """Extract Name table records.""" + out = [] + + for gene_id, ann in unique_annotations(data): + entity_id = f"ncbigene:{gene_id}" + for label, desc in [ + ("symbol", "RefSeq gene symbol"), + ("name", "RefSeq gene name"), + ("locus_tag", "RefSeq locus tag"), + ]: + val = ann.get(label) + if val: + out.append((entity_id, val, desc, "RefSeq")) + return list({tuple(row) for row in out}) + + +# --------------------------------------------------------------------- +# FEATURE LOCATIONS +# --------------------------------------------------------------------- +def load_feature_records(data: dict) -> list[tuple]: + """Extract Feature table records.""" + features = [] + + for gene_id, ann in unique_annotations(data): + feature_id = f"ncbigene:{gene_id}" + for region in ann.get("genomic_regions", []): + for r in region.get("gene_range", {}).get("range", []): + strand = { + "plus": "positive", + "minus": "negative", + "unstranded": "unstranded", + }.get(r.get("orientation"), "unknown") + features.append(( + feature_id, + None, + None, + None, + to_int(r.get("end")), + None, + to_int(r.get("begin")), + strand, + "RefSeq", + None, + "gene", + )) + return list({tuple(row) for row in features}) + + +# --------------------------------------------------------------------- +# PARSE CONTIG_COLLECTION <-> FEATURE +# --------------------------------------------------------------------- +def load_contig_collection_x_feature(data: dict) -> list[tuple[str, str]]: + """Parse ContigCollection Feature links.""" + links = [] + + for gene_id, ann in unique_annotations(data): + regions = ann.get("genomic_regions", []) + + if not regions: + continue + + acc = regions[0].get("gene_range", {}).get("accession_version") + if acc: + links.append((apply_prefix(acc), f"ncbigene:{gene_id}")) + + return list(set(links)) + + +# --------------------------------------------------------------------- +# PARSE CONTIG_COLLECTION <-> PROTEIN +# --------------------------------------------------------------------- +def load_contig_collection_x_protein(data: dict) -> list[tuple[str, str]]: + links = [] + + for report in data.get("reports", []): + ann = report.get("annotation", {}) + assembly = ann.get("annotations", [{}])[0].get("assembly_accession") + if not assembly: + continue + + contig_id = apply_prefix(assembly) + for p in 
ann.get("proteins", []): + pid = p.get("accession_version") + if pid: + links.append((contig_id, apply_prefix(pid))) + + return list(set(links)) + + +# --------------------------------------------------------------------- +# PARSE FEATURE <-> PROTEIN +# --------------------------------------------------------------------- +def load_feature_x_protein(data: dict) -> list[tuple[str, str]]: + links = [] + + for gene_id, ann in unique_annotations(data): + feature_id = f"ncbigene:{gene_id}" + + for p in ann.get("proteins", []): + pid = p.get("accession_version") + if pid: + protein_id = apply_prefix(pid) + links.append((feature_id, protein_id)) + + return list(set(links)) + + +# --------------------------------------------------------------------- +# PARSE CONTIGS +# --------------------------------------------------------------------- +def load_contigs(data: dict) -> list[tuple[str, str | None, float | None, int | None]]: + contigs = {} + + for report in data.get("reports", []): + for region in report.get("annotation", {}).get("genomic_regions", []): + acc = region.get("gene_range", {}).get("accession_version") + if acc: + contig_id = apply_prefix(acc) + # Only track first occurrence of each contig + contigs.setdefault(contig_id, {"hash": None, "gc_content": None, "length": None}) + + return [(cid, meta["hash"], meta["gc_content"], meta["length"]) for cid, meta in contigs.items()] + + +# --------------------------------------------------------------------- +# PARSE CONTIG <-> CONTIG_COLLECTION +# --------------------------------------------------------------------- +def load_contig_x_contig_collection(data: dict) -> list[tuple[str, str]]: + links = [] + + for report in data.get("reports", []): + ann = report.get("annotation", {}) + regions = ann.get("genomic_regions", []) + annotations = ann.get("annotations", []) + + if not regions or not annotations: + continue + + contig = regions[0].get("gene_range", {}).get("accession_version") + assembly = annotations[0].get("assembly_accession") + + if contig and assembly: + links.append(( + f"refseq:{contig}", + apply_prefix(assembly), + )) + + return list(set(links)) + + +# --------------------------------------------------------------------- +# DELTA TABLE +# --------------------------------------------------------------------- +def write_to_table( + spark: SparkSession, + records: list[tuple], + table_name: str, + database: str = "default", +) -> None: + if records: + spark.createDataFrame(records, cdm_schemas[table_name]).write.format("delta").mode("overwrite").option( + "overwriteSchema", "true" + ).saveAsTable(f"{database}.{table_name}") + + +# --------------------------------------------------------------------- +# SQL PREVIEW +# --------------------------------------------------------------------- + +CDM_TABLES = [ + "Identifier", + "Name", + "Feature", + "ContigCollection_x_Feature", + "ContigCollection_x_Protein", + "Feature_x_Protein", + "Contig", + "Contig_x_ContigCollection", +] + + +def run_sql_query(spark: SparkSession, database: str = "default") -> None: + spark.sql(f"USE {database}") + for table in CDM_TABLES: + print(f"\n[SQL Preview] {table}") + spark.sql(f"SELECT * FROM {table} LIMIT 20").show(truncate=False) + + +def parse_annotation_data(spark: SparkSession, datasets: list[dict], namespace: str) -> None: + # ----------------------------------------- + # Parse and write CDM tables + # ----------------------------------------- + for data in datasets: + write_to_table( + spark, + load_identifiers(data), + "Identifier", + namespace, + 
) + + write_to_table( + spark, + load_names(data), + "Name", + namespace, + ) + + write_to_table( + spark, + load_feature_records(data), + "Feature", + namespace, + ) + + write_to_table( + spark, + load_contig_collection_x_feature(data), + "ContigCollection_x_Feature", + namespace, + ) + + write_to_table( + spark, + load_contig_collection_x_protein(data), + "ContigCollection_x_Protein", + namespace, + ) + + write_to_table( + spark, + load_feature_x_protein(data), + "Feature_x_Protein", + namespace, + ) + + write_to_table( + spark, + load_contigs(data), + "Contig", + namespace, + ) + + write_to_table( + spark, + load_contig_x_contig_collection(data), + "Contig_x_ContigCollection", + namespace, + ) + + +# --------------------------------------------------------------------- +# CLI ENTRY +# --------------------------------------------------------------------- +def main(): + parser = argparse.ArgumentParser(description="RefSeq Annotation Parser to CDM") + + # ------------------------- + # Input options + # ------------------------- + parser.add_argument("--accession", type=str, help="RefSeq genome accession (e.g. GCF_000869125.1)") + parser.add_argument("--input_file", type=str, help="Path to a RefSeq annotation JSON file.") + parser.add_argument("--input_dir", type=str, help="Directory containing RefSeq annotation JSON files.") + + # ------------------------- + # Output / runtime options + # ------------------------- + parser.add_argument( + "--namespace", + default="refseq_api", + help="Database to write Delta tables.", + ) + parser.add_argument( + "--tenant", + default=None, + help="Tenant SQL warehouse to use.", + ) + parser.add_argument( + "--query", + action="store_true", + help="Preview SQL output after writing.", + ) + + args = parser.parse_args() + + # ----------------------------------------- + # Input validation + # ----------------------------------------- + if not args.accession and not args.input_file and not args.input_dir: + raise ValueError("provide --accession, --input_file, or --input_dir.") + + # ----------------------------------------- + # Initialize Spark + # ----------------------------------------- + spark = init_spark_and_db("RefSeq Annotation Parser", args.namespace) + + if args.tenant: + spark.sql(f"USE CATALOG {args.tenant}") + + # ----------------------------------------- + # Load annotation data + # ----------------------------------------- + datasets: list[dict] = [] + + if args.accession: + # Fetch from NCBI Datasets API + data = fetch_annotation_json(args.accession) + datasets.append(data) + + if args.input_file: + with open(args.input_file) as f: + datasets.append(json.load(f)) + + if args.input_dir: + for path in Path(args.input_dir).rglob("*.json"): + with open(path) as f: + datasets.append(json.load(f)) + + parse_annotation_data(spark, datasets, args.namespace) + + # ----------------------------------------- + # SQL preview + # ----------------------------------------- + if args.query: + run_sql_query(spark, args.namespace) + + spark.stop() + + +if __name__ == "__main__": + main() diff --git a/src/cdm_data_loader_utils/parsers/gene_association_file.py b/src/cdm_data_loader_utils/parsers/gene_association_file.py index 548de56..cd81647 100644 --- a/src/cdm_data_loader_utils/parsers/gene_association_file.py +++ b/src/cdm_data_loader_utils/parsers/gene_association_file.py @@ -273,7 +273,7 @@ def run( if register: register_table(spark, output_path, table_name=table_name, permanent=permanent) - except Exception as e: + except Exception: 
logger.exception("Pipeline failed") sys.exit(1) finally: diff --git a/src/cdm_data_loader_utils/parsers/kbase_cdm_pyspark.py b/src/cdm_data_loader_utils/parsers/kbase_cdm_pyspark.py new file mode 100644 index 0000000..19be5e8 --- /dev/null +++ b/src/cdm_data_loader_utils/parsers/kbase_cdm_pyspark.py @@ -0,0 +1,610 @@ +"""Automated conversion of cdm_schema to PySpark.""" + +from pyspark.sql.types import BooleanType, DateType, FloatType, IntegerType, StringType, StructField, StructType + +schema = { + "Association": StructType( + [ + StructField("association_id", StringType(), nullable=False), + StructField("subject", StringType(), nullable=False), + StructField("object", StringType(), nullable=False), + StructField("predicate", StringType(), nullable=False), + StructField("negated", BooleanType(), nullable=True), + StructField("evidence_type", StringType(), nullable=True), + StructField("primary_knowledge_source", StringType(), nullable=True), + StructField("aggregator_knowledge_source", StringType(), nullable=True), + StructField("annotation_date", DateType(), nullable=True), + StructField("comments", StringType(), nullable=True), + ] + ), + "Association_x_SupportingObject": StructType( + [ + StructField("association_id", StringType(), nullable=False), + StructField("entity_id", StringType(), nullable=False), + ] + ), + "Cluster": StructType( + [ + StructField("cluster_id", StringType(), nullable=False), + StructField("description", StringType(), nullable=True), + StructField("name", StringType(), nullable=True), + StructField("entity_type", StringType(), nullable=False), + StructField("protocol_id", StringType(), nullable=True), + ] + ), + "ClusterMember": StructType( + [ + StructField("cluster_id", StringType(), nullable=False), + StructField("entity_id", StringType(), nullable=False), + StructField("is_representative", BooleanType(), nullable=True), + StructField("is_seed", BooleanType(), nullable=True), + StructField("score", FloatType(), nullable=True), + ] + ), + "Contig": StructType( + [ + StructField("contig_id", StringType(), nullable=False), + StructField("hash", StringType(), nullable=True), + StructField("gc_content", FloatType(), nullable=True), + StructField("length", IntegerType(), nullable=True), + ] + ), + "ContigCollection": StructType( + [ + StructField("contig_collection_id", StringType(), nullable=False), + StructField("hash", StringType(), nullable=True), + StructField("asm_score", FloatType(), nullable=True), + StructField("checkm_completeness", FloatType(), nullable=True), + StructField("checkm_contamination", FloatType(), nullable=True), + StructField("checkm_version", StringType(), nullable=True), + StructField("contig_bp", IntegerType(), nullable=True), + StructField("contig_collection_type", StringType(), nullable=True), + StructField("contig_l50", IntegerType(), nullable=True), + StructField("contig_l90", IntegerType(), nullable=True), + StructField("contig_n50", IntegerType(), nullable=True), + StructField("contig_n90", IntegerType(), nullable=True), + StructField("contig_logsum", FloatType(), nullable=True), + StructField("contig_max", IntegerType(), nullable=True), + StructField("contig_powersum", FloatType(), nullable=True), + StructField("gap_percent", FloatType(), nullable=True), + StructField("gc_average", FloatType(), nullable=True), + StructField("gc_std", FloatType(), nullable=True), + StructField("gtdb_taxon_id", StringType(), nullable=True), + StructField("n_chromosomes", IntegerType(), nullable=True), + StructField("n_contigs", IntegerType(), 
nullable=True), + StructField("n_scaffolds", IntegerType(), nullable=True), + StructField("ncbi_taxon_id", StringType(), nullable=True), + StructField("scaffold_l50", IntegerType(), nullable=True), + StructField("scaffold_l90", IntegerType(), nullable=True), + StructField("scaffold_n50", IntegerType(), nullable=True), + StructField("scaffold_n90", IntegerType(), nullable=True), + StructField("scaffold_bp", IntegerType(), nullable=True), + StructField("scaffold_logsum", FloatType(), nullable=True), + StructField("scaffold_maximum_length", IntegerType(), nullable=True), + StructField("scaffold_powersum", FloatType(), nullable=True), + StructField("scaffolds_n_over_50K", IntegerType(), nullable=True), + StructField("scaffolds_percent_over_50K", FloatType(), nullable=True), + StructField("scaffolds_total_length_over_50k", IntegerType(), nullable=True), + ] + ), + "ContigCollection_x_EncodedFeature": StructType( + [ + StructField("contig_collection_id", StringType(), nullable=False), + StructField("encoded_feature_id", StringType(), nullable=False), + ] + ), + "ContigCollection_x_Feature": StructType( + [ + StructField("contig_collection_id", StringType(), nullable=False), + StructField("feature_id", StringType(), nullable=False), + ] + ), + "ContigCollection_x_Protein": StructType( + [ + StructField("contig_collection_id", StringType(), nullable=False), + StructField("protein_id", StringType(), nullable=False), + ] + ), + "Contig_x_ContigCollection": StructType( + [ + StructField("contig_id", StringType(), nullable=False), + StructField("contig_collection_id", StringType(), nullable=False), + ] + ), + "Contig_x_EncodedFeature": StructType( + [ + StructField("contig_id", StringType(), nullable=False), + StructField("encoded_feature_id", StringType(), nullable=False), + ] + ), + "Contig_x_Feature": StructType( + [ + StructField("contig_id", StringType(), nullable=False), + StructField("feature_id", StringType(), nullable=False), + ] + ), + "Contig_x_Protein": StructType( + [ + StructField("contig_id", StringType(), nullable=False), + StructField("protein_id", StringType(), nullable=False), + ] + ), + "Contributor": StructType( + [ + StructField("contributor_id", StringType(), nullable=False), + StructField("contributor_type", StringType(), nullable=True), + StructField("name", StringType(), nullable=True), + StructField("given_name", StringType(), nullable=True), + StructField("family_name", StringType(), nullable=True), + ] + ), + "ContributorAffiliation": StructType( + [ + StructField("contributor_id", StringType(), nullable=False), + StructField("affiliation_id", StringType(), nullable=True), + ] + ), + "Contributor_x_DataSource": StructType( + [ + StructField("contributor_id", StringType(), nullable=False), + StructField("data_source_id", StringType(), nullable=False), + StructField("contributor_role", StringType(), nullable=True), + ] + ), + "Contributor_x_Role_x_Project": StructType( + [ + StructField("contributor_id", StringType(), nullable=False), + StructField("project_id", StringType(), nullable=False), + StructField("contributor_role", StringType(), nullable=True), + ] + ), + "ControlledTermValue": StructType( + [ + StructField("value_cv_label", StringType(), nullable=False), + StructField("raw_value", StringType(), nullable=True), + StructField("type", StringType(), nullable=True), + StructField("attribute_cv_id", StringType(), nullable=True), + StructField("attribute_cv_label", StringType(), nullable=True), + StructField("attribute_string", StringType(), nullable=True), + 
StructField("entity_id", StringType(), nullable=False), + ] + ), + "ControlledVocabularyTermValue": StructType( + [ + StructField("value_cv_label", StringType(), nullable=True), + StructField("value_cv_id", StringType(), nullable=True), + StructField("raw_value", StringType(), nullable=True), + StructField("type", StringType(), nullable=True), + StructField("attribute_cv_id", StringType(), nullable=True), + StructField("attribute_cv_label", StringType(), nullable=True), + StructField("attribute_string", StringType(), nullable=True), + StructField("entity_id", StringType(), nullable=False), + ] + ), + "DataSource": StructType( + [ + StructField("data_source_id", StringType(), nullable=False), + StructField("name", StringType(), nullable=True), + ] + ), + "DataSourceNew": StructType( + [ + StructField("data_source_id", StringType(), nullable=False), + StructField("name", StringType(), nullable=True), + StructField("comments", StringType(), nullable=True), + StructField("date_accessed", DateType(), nullable=False), + StructField("date_published", DateType(), nullable=True), + StructField("date_updated", DateType(), nullable=True), + StructField("license", StringType(), nullable=True), + StructField("publisher", StringType(), nullable=True), + StructField("resource_type", StringType(), nullable=False), + StructField("url", StringType(), nullable=True), + StructField("version", StringType(), nullable=True), + ] + ), + "DataSource_x_Description": StructType( + [ + StructField("data_source_id", StringType(), nullable=False), + StructField("resource_description_id", StringType(), nullable=False), + ] + ), + "DataSource_x_FundingReference": StructType( + [ + StructField("data_source_id", StringType(), nullable=False), + StructField("funding_reference_id", StringType(), nullable=False), + ] + ), + "DataSource_x_License": StructType( + [ + StructField("data_source_id", StringType(), nullable=False), + StructField("license_id", StringType(), nullable=False), + ] + ), + "DataSource_x_Title": StructType( + [ + StructField("data_source_id", StringType(), nullable=False), + StructField("resource_title_id", StringType(), nullable=False), + ] + ), + "DateTimeValue": StructType( + [ + StructField("date_time", DateType(), nullable=False), + StructField("raw_value", StringType(), nullable=True), + StructField("type", StringType(), nullable=True), + StructField("attribute_cv_id", StringType(), nullable=True), + StructField("attribute_cv_label", StringType(), nullable=True), + StructField("attribute_string", StringType(), nullable=True), + StructField("entity_id", StringType(), nullable=False), + ] + ), + "EncodedFeature": StructType( + [ + StructField("encoded_feature_id", StringType(), nullable=False), + StructField("hash", StringType(), nullable=True), + StructField("has_stop_codon", BooleanType(), nullable=True), + StructField("type", StringType(), nullable=True), + ] + ), + "EncodedFeature_x_Feature": StructType( + [ + StructField("encoded_feature_id", StringType(), nullable=False), + StructField("feature_id", StringType(), nullable=False), + ] + ), + "EncodedFeature_x_Protein": StructType( + [ + StructField("encoded_feature_id", StringType(), nullable=False), + StructField("protein_id", StringType(), nullable=False), + ] + ), + "EntailedEdge": StructType( + [ + StructField("subject", StringType(), nullable=True), + StructField("predicate", StringType(), nullable=True), + StructField("object", StringType(), nullable=True), + ] + ), + "Entity": StructType( + [ + StructField("entity_id", StringType(), 
nullable=False), + StructField("entity_type", StringType(), nullable=False), + StructField("data_source_id", StringType(), nullable=True), + StructField("data_source_entity_id", StringType(), nullable=True), + StructField("data_source_created", DateType(), nullable=False), + StructField("data_source_updated", DateType(), nullable=True), + StructField("created", DateType(), nullable=False), + StructField("updated", DateType(), nullable=False), + ] + ), + "Event": StructType( + [ + StructField("event_id", StringType(), nullable=False), + StructField("created_at", DateType(), nullable=True), + StructField("description", StringType(), nullable=True), + StructField("name", StringType(), nullable=True), + StructField("location", StringType(), nullable=True), + ] + ), + "Experiment": StructType( + [ + StructField("experiment_id", StringType(), nullable=False), + StructField("protocol_id", StringType(), nullable=False), + StructField("name", StringType(), nullable=True), + StructField("description", StringType(), nullable=True), + StructField("created_at", DateType(), nullable=True), + ] + ), + "ExperimentCondition": StructType( + [ + StructField("experiment_condition_id", StringType(), nullable=False), + StructField("experiment_id", StringType(), nullable=False), + StructField("variable_id", StringType(), nullable=False), + StructField("value", StringType(), nullable=True), + ] + ), + "ExperimentConditionSet": StructType( + [ + StructField("experiment_condition_set_id", StringType(), nullable=False), + StructField("experiment_condition_id", StringType(), nullable=False), + ] + ), + "Feature": StructType( + [ + StructField("feature_id", StringType(), nullable=False), + StructField("hash", StringType(), nullable=True), + StructField("cds_phase", StringType(), nullable=True), + StructField("e_value", FloatType(), nullable=True), + StructField("end", IntegerType(), nullable=True), + StructField("p_value", FloatType(), nullable=True), + StructField("start", IntegerType(), nullable=True), + StructField("strand", StringType(), nullable=True), + StructField("source_database", StringType(), nullable=True), + StructField("protocol_id", StringType(), nullable=True), + StructField("type", StringType(), nullable=True), + ] + ), + "Feature_x_Protein": StructType( + [ + StructField("feature_id", StringType(), nullable=False), + StructField("protein_id", StringType(), nullable=False), + ] + ), + "FundingReference": StructType( + [ + StructField("funding_reference_id", StringType(), nullable=False), + StructField("funder", StringType(), nullable=True), + StructField("grant_id", StringType(), nullable=True), + StructField("grant_title", StringType(), nullable=True), + StructField("grant_url", StringType(), nullable=True), + ] + ), + "Geolocation": StructType( + [ + StructField("latitude", FloatType(), nullable=False), + StructField("longitude", FloatType(), nullable=False), + StructField("raw_value", StringType(), nullable=True), + StructField("type", StringType(), nullable=True), + StructField("attribute_cv_id", StringType(), nullable=True), + StructField("attribute_cv_label", StringType(), nullable=True), + StructField("attribute_string", StringType(), nullable=True), + StructField("entity_id", StringType(), nullable=False), + ] + ), + "GoldEnvironmentalContext": StructType( + [ + StructField("gold_environmental_context_id", StringType(), nullable=False), + StructField("ecosystem", StringType(), nullable=True), + StructField("ecosystem_category", StringType(), nullable=True), + StructField("ecosystem_subtype", 
StringType(), nullable=True), + StructField("ecosystem_type", StringType(), nullable=True), + StructField("specific_ecosystem", StringType(), nullable=True), + ] + ), + "Identifier": StructType( + [ + StructField("entity_id", StringType(), nullable=False), + StructField("identifier", StringType(), nullable=False), + StructField("description", StringType(), nullable=True), + StructField("source", StringType(), nullable=True), + StructField("relationship", StringType(), nullable=True), + ] + ), + "License": StructType( + [ + StructField("license_id", StringType(), nullable=False), + StructField("id", StringType(), nullable=True), + StructField("name", StringType(), nullable=True), + StructField("url", StringType(), nullable=True), + ] + ), + "Measurement": StructType( + [ + StructField("measurement_id", StringType(), nullable=False), + StructField("measurement_set_id", StringType(), nullable=False), + StructField("experiment_condition_set_id", StringType(), nullable=False), + StructField("value", StringType(), nullable=True), + ] + ), + "MeasurementSet": StructType( + [ + StructField("measurement_set_id", StringType(), nullable=False), + StructField("variable_id", StringType(), nullable=False), + StructField("quality", StringType(), nullable=True), + StructField("created_at", DateType(), nullable=True), + ] + ), + "MixsEnvironmentalContext": StructType( + [ + StructField("mixs_environmental_context_id", StringType(), nullable=False), + StructField("env_broad_scale", StringType(), nullable=True), + StructField("env_local_scale", StringType(), nullable=True), + StructField("env_medium", StringType(), nullable=True), + ] + ), + "Name": StructType( + [ + StructField("entity_id", StringType(), nullable=False), + StructField("name", StringType(), nullable=False), + StructField("description", StringType(), nullable=True), + StructField("source", StringType(), nullable=True), + ] + ), + "OrderedProtocolStep": StructType( + [ + StructField("protocol_id", StringType(), nullable=False), + StructField("protocol_step_id", StringType(), nullable=False), + StructField("step_index", IntegerType(), nullable=False), + ] + ), + "Parameter": StructType( + [ + StructField("parameter_id", StringType(), nullable=False), + StructField("name", StringType(), nullable=True), + StructField("description", StringType(), nullable=True), + StructField("value_type", StringType(), nullable=True), + StructField("required", BooleanType(), nullable=True), + StructField("cardinality", StringType(), nullable=True), + StructField("default", StringType(), nullable=True), + StructField("parameter_type", StringType(), nullable=True), + ] + ), + "Prefix": StructType( + [ + StructField("prefix", StringType(), nullable=True), + StructField("base", StringType(), nullable=True), + ] + ), + "Project": StructType( + [ + StructField("project_id", StringType(), nullable=False), + StructField("description", StringType(), nullable=True), + ] + ), + "Protein": StructType( + [ + StructField("protein_id", StringType(), nullable=False), + StructField("hash", StringType(), nullable=True), + StructField("description", StringType(), nullable=True), + StructField("evidence_for_existence", StringType(), nullable=True), + StructField("length", IntegerType(), nullable=True), + StructField("sequence", StringType(), nullable=True), + ] + ), + "Protocol": StructType( + [ + StructField("protocol_id", StringType(), nullable=False), + StructField("name", StringType(), nullable=True), + StructField("description", StringType(), nullable=True), + 
StructField("doi", StringType(), nullable=True), + StructField("url", StringType(), nullable=True), + StructField("version", StringType(), nullable=True), + ] + ), + "ProtocolExecution": StructType( + [ + StructField("protocol_execution_id", StringType(), nullable=False), + StructField("protocol_id", StringType(), nullable=False), + StructField("name", StringType(), nullable=True), + StructField("description", StringType(), nullable=True), + StructField("created_at", DateType(), nullable=True), + ] + ), + "ProtocolInput": StructType( + [ + StructField("parameter_id", StringType(), nullable=False), + StructField("protocol_input_id", StringType(), nullable=False), + StructField("protocol_execution_id", StringType(), nullable=False), + StructField("value", StringType(), nullable=False), + ] + ), + "ProtocolInputSet": StructType( + [ + StructField("protocol_input_id", StringType(), nullable=False), + StructField("protocol_input_set_id", StringType(), nullable=False), + ] + ), + "ProtocolOutput": StructType( + [ + StructField("protocol_output_id", StringType(), nullable=False), + StructField("protocol_input_set_id", StringType(), nullable=False), + StructField("value", StringType(), nullable=False), + ] + ), + "ProtocolStep": StructType( + [ + StructField("protocol_step_id", StringType(), nullable=False), + StructField("step", StringType(), nullable=True), + ] + ), + "ProtocolVariable": StructType( + [ + StructField("protocol_id", StringType(), nullable=False), + StructField("variable_id", StringType(), nullable=False), + ] + ), + "Publication": StructType( + [ + StructField("publication_id", StringType(), nullable=False), + ] + ), + "QuantityRangeValue": StructType( + [ + StructField("maximum_numeric_value", FloatType(), nullable=False), + StructField("minimum_numeric_value", FloatType(), nullable=False), + StructField("unit_cv_id", StringType(), nullable=True), + StructField("unit_cv_label", StringType(), nullable=True), + StructField("unit_string", StringType(), nullable=True), + StructField("raw_value", StringType(), nullable=True), + StructField("type", StringType(), nullable=True), + StructField("attribute_cv_id", StringType(), nullable=True), + StructField("attribute_cv_label", StringType(), nullable=True), + StructField("attribute_string", StringType(), nullable=True), + StructField("entity_id", StringType(), nullable=False), + ] + ), + "QuantityValue": StructType( + [ + StructField("numeric_value", FloatType(), nullable=False), + StructField("unit_cv_id", StringType(), nullable=True), + StructField("unit_cv_label", StringType(), nullable=True), + StructField("unit_string", StringType(), nullable=True), + StructField("raw_value", StringType(), nullable=True), + StructField("type", StringType(), nullable=True), + StructField("attribute_cv_id", StringType(), nullable=True), + StructField("attribute_cv_label", StringType(), nullable=True), + StructField("attribute_string", StringType(), nullable=True), + StructField("entity_id", StringType(), nullable=False), + ] + ), + "ResourceDescription": StructType( + [ + StructField("resource_description_id", StringType(), nullable=False), + StructField("description_text", StringType(), nullable=False), + StructField("description_type", StringType(), nullable=True), + StructField("language", StringType(), nullable=True), + ] + ), + "ResourceTitle": StructType( + [ + StructField("resource_title_id", StringType(), nullable=False), + StructField("language", StringType(), nullable=True), + StructField("title", StringType(), nullable=False), + 
StructField("title_type", StringType(), nullable=True), + ] + ), + "Sample": StructType( + [ + StructField("sample_id", StringType(), nullable=False), + StructField("description", StringType(), nullable=True), + StructField("type", StringType(), nullable=True), + ] + ), + "Sequence": StructType( + [ + StructField("sequence_id", StringType(), nullable=False), + StructField("entity_id", StringType(), nullable=False), + StructField("type", StringType(), nullable=True), + StructField("length", IntegerType(), nullable=True), + StructField("checksum", StringType(), nullable=True), + ] + ), + "Statement": StructType( + [ + StructField("subject", StringType(), nullable=True), + StructField("predicate", StringType(), nullable=True), + StructField("object", StringType(), nullable=True), + StructField("value", StringType(), nullable=True), + StructField("datatype", StringType(), nullable=True), + StructField("language", StringType(), nullable=True), + ] + ), + "TextValue": StructType( + [ + StructField("text_value", StringType(), nullable=False), + StructField("language", StringType(), nullable=True), + StructField("raw_value", StringType(), nullable=True), + StructField("type", StringType(), nullable=True), + StructField("attribute_cv_id", StringType(), nullable=True), + StructField("attribute_cv_label", StringType(), nullable=True), + StructField("attribute_string", StringType(), nullable=True), + StructField("entity_id", StringType(), nullable=False), + ] + ), + "Variable": StructType( + [ + StructField("variable_id", StringType(), nullable=False), + StructField("name", StringType(), nullable=True), + StructField("description", StringType(), nullable=True), + StructField("name_cv_id", StringType(), nullable=True), + StructField("unit", StringType(), nullable=True), + StructField("value_type", StringType(), nullable=False), + ] + ), + "VariableValue": StructType( + [ + StructField("variable_value_id", StringType(), nullable=False), + StructField("variable_id", StringType(), nullable=False), + StructField("value_type", StringType(), nullable=True), + ] + ), +} diff --git a/src/cdm_data_loader_utils/parsers/shared_identifiers.py b/src/cdm_data_loader_utils/parsers/shared_identifiers.py new file mode 100644 index 0000000..33b865f --- /dev/null +++ b/src/cdm_data_loader_utils/parsers/shared_identifiers.py @@ -0,0 +1,11 @@ +from cdm_data_loader_utils.parsers.xml_utils import get_text + + +def parse_identifiers_generic(entry, xpath, prefix, ns): + result = [] + for node in entry.findall(xpath, ns): + text = get_text(node) + if not text: + continue + result.append({"identifier": f"{prefix}:{text}", "source": prefix, "description": f"{prefix} accession"}) + return result diff --git a/src/cdm_data_loader_utils/parsers/uniprot.py b/src/cdm_data_loader_utils/parsers/uniprot.py index fa6d6a4..ca4516e 100644 --- a/src/cdm_data_loader_utils/parsers/uniprot.py +++ b/src/cdm_data_loader_utils/parsers/uniprot.py @@ -1,28 +1,37 @@ """ -UniProt XML Delta Lake Ingestion Pipeline. +UniProt XML Delta Lake Ingestion Pipeline ========================================= This script parses UniProt XML (.xml.gz) file and ingests the data into structured Delta Lake tables. 
Typical usage: -------------- +Use it in Berdle as: python3 src/parsers/uniprot.py \ --xml-url "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/uniprot_sprot_archaea.xml.gz" \ --output-dir "./output" \ --namespace "uniprot_db" \ --batch-size 5000 + +python -m cdm_data_loader_utils.parsers.uniprot \ + --xml-url "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/uniprot_sprot_archaea.xml.gz" \ + --output-dir "tests/data/uniprot_archaea" \ + --namespace "uniprot_db" \ + --batch-size 5000 + + Arguments: ---------- ---xml-url: URL to the UniProt XML .gz file +--xml-url: URL to the UniProt XML .gz file --output-dir: Output directory for Delta tables and logs (default: './output') --namespace: Delta Lake database name (default: 'uniprot_db') ---target-date: Process entries modified/updated since specific date +--target-date: Process entries modified/updated since specific date --batch-size: Number of UniProt entries to process per write batch (default: 5000) Functionality: -------------- -- Downloads the XML file if not present locally +- Downloads the XML file if not present locally - Parses UniProt entries in a memory-efficient streaming fashion - Maps parsed data into standardized CDM tables - Writes all tables as Delta Lake tables, supporting incremental import @@ -38,6 +47,7 @@ import datetime import gzip import json +import logging import os import uuid import xml.etree.ElementTree as ET @@ -46,49 +56,61 @@ import requests from delta import configure_spark_with_delta_pip from pyspark.sql import SparkSession +from pyspark.sql.functions import col, split from pyspark.sql.types import ArrayType, StringType, StructField, StructType -## XML namespace mapping for UniProt entries (used for all XPath queries) -NS = {"u": "https://uniprot.org/uniprot"} +from cdm_data_loader_utils.parsers.shared_identifiers import parse_identifiers_generic +from cdm_data_loader_utils.parsers.xml_utils import clean_dict, find_all_text, get_attr, get_text, parse_db_references +# --------------------------------------------------------------------- +# Logging +# --------------------------------------------------------------------- +logger = logging.getLogger(__name__) +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s | %(levelname)s | %(name)s | %(message)s", +) -def load_existing_identifiers(spark, output_dir, namespace): - """ - Load the existing 'identifiers' Delta table and build a mapping from UniProt accession to CDM entity ID. - This function enables consistent mapping of accessions to CDM IDs across multiple imports, supporting upsert and idempotent workflows. 
- Returns: - dict: {accession: entity_id} - """ - access_to_cdm_id = {} - id_path = os.path.abspath(os.path.join(output_dir, f"{namespace}_identifiers_delta")) - if os.path.exists(id_path): - try: - # Read identifier and entity_id columns from the Delta table - df = spark.read.format("delta").load(id_path).select("identifier", "entity_id") - for row in df.collect(): - # Identifier field: UniProt:Pxxxxx, extract the actual accession part after the colon - accession = row["identifier"].split(":", 1)[1] - access_to_cdm_id[accession] = row["entity_id"] - except Exception as e: - print(f"Couldn't load identifiers table: {e}") - else: - print(f"No previous identifiers delta at {id_path}.") - return access_to_cdm_id +# --------------------------------------------------------------------- +# XML namespace mapping for UniProt entries (used for all XPath queries) +# --------------------------------------------------------------------- +NS = {"ns": "https://uniprot.org/uniprot"} -def generate_cdm_id() -> str: - """ - Generate a CDM entity_id directly from UniProt accession, using 'CDM:' prefix - Ensures that each accession is mapped to stable and unique CDM entity ID, making it easy to join across different tables by accession. - """ - return f"CDM:{uuid.uuid4()}" +# --------------------------------------------------------------------- +# Stable ID namespace (UUIDv5) +# --------------------------------------------------------------------- +CDM_UUID_NAMESPACE = uuid.UUID("2d3f6e2a-4d7b-4a8c-9c5a-0e0f7b7d9b3a") -def build_datasource_record(xml_url): - """ - Build a provenance record for the UniProt datasource without version extraction. - """ +# --------------------------------------------------------------------- +# CURIE prefixes +# --------------------------------------------------------------------- +PREFIX_TRANSLATION: dict[str, str] = { + "UniProtKB": "UniProt", + "UniProtKB/Swiss-Prot": "UniProt", + "UniProtKB/TrEMBL": "UniProt", + "UniParc": "UniParc", + "RefSeq": "RefSeq", + "EMBL": "EMBL", + "PDB": "PDB", + "ChEBI": "ChEBI", + "Rhea": "Rhea", + "NCBI Taxonomy": "NCBITaxon", + "GeneID": "NCBIGene", + "Ensembl": "Ensembl", + "GO": "GO", +} + + +# ================================ HELPERS ================================= +def delta_table_path(output_dir: str, namespace: str, table: str) -> str: + return os.path.abspath(os.path.join(output_dir, namespace, table)) + + +def build_datasource_record(xml_url: str) -> dict: + """Build a provenance record for the UniProt datasource.""" return { "name": "UniProt import", "source": "UniProt", @@ -98,162 +120,323 @@ def build_datasource_record(xml_url): } -def parse_identifiers(entry, cdm_id): - """ - Extract all accession numbers in the UniProt entry and format them into a CDM identifier structure. 
- """ - return [ - { - "entity_id": cdm_id, - "identifier": f"UniProt:{acc.text}", - "source": "UniProt", - "description": "UniProt accession", - } - for acc in entry.findall("u:accession", NS) - ] +def save_datasource_record(xml_url: str, output_dir: str) -> dict: + """Generate and save the datasource provenance record as a JSON file.""" + datasource = build_datasource_record(xml_url) + + os.makedirs(output_dir, exist_ok=True) + output_path = os.path.join(output_dir, "datasource.json") + + with open(output_path, "w", encoding="utf-8") as f: + json.dump(datasource, f, indent=2) + + logger.info("Saved datasource record to %s", output_path) + return datasource + + +def download_file( + url: str, + output_path: str, + chunk_size: int = 1024 * 1024, + overwrite: bool = False, +) -> None: + """Download URL -> output_path (streaming)""" + if os.path.exists(output_path) and not overwrite: + logger.info("File already exists, skip download: %s", output_path) + return + + tmp_path = output_path + ".part" + if os.path.exists(tmp_path): + try: + os.remove(tmp_path) + except Exception: + pass + + try: + logger.info("Downloading %s -> %s", url, output_path) + with requests.get(url, stream=True, timeout=120) as r: + r.raise_for_status() + with open(tmp_path, "wb") as f: + for chunk in r.iter_content(chunk_size=chunk_size): + if chunk: + f.write(chunk) + os.replace(tmp_path, output_path) + logger.info("Download complete: %s", output_path) + except Exception: + logger.exception("Failed to download %s", url) + try: + if os.path.exists(tmp_path): + os.remove(tmp_path) + except Exception: + logger.exception("Failed to remove partial download: %s", tmp_path) + raise + + +def prepare_local_xml(xml_url: str, output_dir: str, overwrite: bool = False) -> str: + os.makedirs(output_dir, exist_ok=True) + local_path = os.path.join(output_dir, os.path.basename(xml_url)) + download_file(xml_url, local_path, overwrite=overwrite) + return local_path + + +def stream_uniprot_xml(filepath: str): + """Stream gzipped UniProt XML entries.""" + logger.info("Streaming UniProt XML from: %s", filepath) + with gzip.open(filepath, "rb") as f: + for _, elem in ET.iterparse(f, events=("end",)): + if elem.tag.endswith("entry"): + yield elem + elem.clear() + + +def get_spark_session(namespace: str) -> SparkSession: + """Initialize SparkSession with Delta Lake support, and ensure the target database exists.""" + builder = ( + SparkSession.builder.appName("UniProtDeltaIngestion") + .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") + .config( + "spark.sql.catalog.spark_catalog", + "org.apache.spark.sql.delta.catalog.DeltaCatalog", + ) + .config("spark.databricks.delta.schema.autoMerge.enabled", "true") + ) + spark = configure_spark_with_delta_pip(builder).getOrCreate() + spark.sql(f"CREATE DATABASE IF NOT EXISTS {namespace}") + return spark + + +def normalize_prefix(db_type: str) -> str: + """Map UniProt dbReference @type to a normalized CURIE prefix.""" + return PREFIX_TRANSLATION.get(db_type, db_type.replace(" ", "")) + +def make_curie(db_type: str, db_id: str) -> str: + """Create CURIE with normalized prefix.""" + return f"{normalize_prefix(db_type)}:{db_id}" -def parse_names(entry, cdm_id): + +# ================================ STABLE ID ================================= +def stable_cdm_id_from_uniprot_accession(accession: str, prefix: str = "cdm_prot_") -> str: + u = uuid.uuid5(CDM_UUID_NAMESPACE, f"UniProt:{accession}") + return f"{prefix}{u}" + + +def load_existing_maps( + spark: SparkSession, + output_dir: 
str, + namespace: str, +) -> tuple[dict[str, str], dict[str, str]]: """ - Extract all protein names from a UniProt element, including - - Top-level elements (generic names) - - and blocks within (full and short names). + Returns: + accession_to_entity_id: accession -> entity_id (from identifiers) + entity_id_to_created: entity_id -> created (from entities) """ - names = [] + accession_to_entity_id: dict[str, str] = {} + entity_id_to_created: dict[str, str] = {} - # Extract all top-level tags - for name_element in entry.findall("u:name", NS): - if name_element.text: - names.append( - { - "entity_id": cdm_id, - "name": name_element.text, - "description": "UniProt protein name", - "source": "UniProt", - } + id_path = os.path.join(output_dir, namespace, "identifiers") + ent_path = os.path.join(output_dir, namespace, "entities") + + if os.path.exists(id_path): + try: + df = ( + spark.read.format("delta") + .load(id_path) + .filter(col("identifier").startswith("UniProt:")) + .select( + split(col("identifier"), ":").getItem(1).alias("accession"), + col("entity_id"), + ) ) + for row in df.toLocalIterator(): + acc = row["accession"] + eid = row["entity_id"] + if acc and eid: + accession_to_entity_id[acc] = eid + logger.info( + "Loaded %d accession->entity_id from %s", + len(accession_to_entity_id), + id_path, + ) + except Exception: + logger.exception("Couldn't load identifiers from %s", id_path) - # Extract recommended and alternative names from block - protein = entry.find("u:protein", NS) - if protein is not None: - for name_type in ["recommended", "alternative"]: - # Directly use findall for simplicity (recommendedName returns single-element list) - name_blocks = protein.findall(f"u:{name_type}Name", NS) - for name in name_blocks: - for name_length in ["full", "short"]: - name_string = name.find(f"u:{name_length}Name", NS) - if name_string is None or not name_string.text: - continue + if os.path.exists(ent_path): + try: + df = spark.read.format("delta").load(ent_path).select("entity_id", "created") + for row in df.toLocalIterator(): + if row["entity_id"] and row["created"]: + entity_id_to_created[row["entity_id"]] = row["created"] + logger.info( + "Loaded %d entity_id->created from %s", + len(entity_id_to_created), + ent_path, + ) + except Exception: + logger.exception("Couldn't load entities from %s", ent_path) + + return accession_to_entity_id, entity_id_to_created + + +# ================================ PARSERS ================================= +def parse_identifiers(entry, cdm_id: str) -> list[dict]: + out = parse_identifiers_generic(entry=entry, xpath="ns:accession", prefix="UniProt", ns=NS) + for row in out: + row["entity_id"] = cdm_id + row.setdefault("source", "UniProt") + row.setdefault("description", "UniProt accession") + return out - names.append( - { - "entity_id": cdm_id, - "name": name_string.text, - "description": f"UniProt {name_type} {name_length} name", - "source": "UniProt", - } - ) + +def _make_name_record(cdm_id: str, name_text: str, description: str) -> dict: + return { + "entity_id": cdm_id, + "name": name_text, + "description": description, + "source": "UniProt", + } + + +def parse_names(entry, cdm_id: str) -> list[dict]: + names: list[dict] = [] + + for txt in find_all_text(entry, "ns:name", NS): + names.append(_make_name_record(cdm_id, txt, "UniProt entry name")) + + protein = entry.find("ns:protein", NS) + if protein is not None: + for tag_name, logical_type in [ + ("recommendedName", "recommended"), + ("alternativeName", "alternative"), + ]: + for name_block in 
protein.findall(f"ns:{tag_name}", NS): + for xml_tag, length_label in [ + ("fullName", "full"), + ("shortName", "short"), + ]: + elem = name_block.find(f"ns:{xml_tag}", NS) + text = get_text(elem) + if text: + names.append( + _make_name_record( + cdm_id, + text, + f"UniProt {logical_type} {length_label} name", + ) + ) return names -def parse_protein_info(entry, cdm_id): - """ - Extract protein-level metadata from a UniProt XML element. - """ - protein_info = {} - ec_numbers = [] +def parse_protein_info(entry, cdm_id: str) -> dict | None: + protein_info: dict = {} - # Extract EC numbers from and in - protein = entry.find("u:protein", NS) + protein = entry.find("ns:protein", NS) if protein is not None: - # Find EC numbers in recommendedName - rec = protein.find("u:recommendedName", NS) - if rec is not None: - for ec in rec.findall("u:ecNumber", NS): - if ec.text: - ec_numbers.append(ec.text) - - # Find EC numbers in all alternativeNames - for alt in protein.findall("u:alternativeName", NS): - for ec in alt.findall("u:ecNumber", NS): - if ec.text: - ec_numbers.append(ec.text) + ec_paths = ["ns:recommendedName/ns:ecNumber", "ns:alternativeName/ns:ecNumber"] + ec_numbers: list[str] = [] + for path in ec_paths: + ec_numbers.extend(find_all_text(protein, path, NS)) if ec_numbers: - protein_info["ec_numbers"] = ec_numbers + protein_info["ec_numbers"] = ";".join(ec_numbers) - # Extract protein existence evidence type - protein_existence = entry.find("u:proteinExistence", NS) + protein_existence = entry.find("ns:proteinExistence", NS) if protein_existence is not None: protein_info["protein_id"] = cdm_id - protein_info["evidence_for_existence"] = protein_existence.get("type") - - # Extract sequence and sequence-related attributes - seq_elem = entry.find("u:sequence", NS) - if seq_elem is not None and seq_elem.text: - protein_info["length"] = seq_elem.get("length") - protein_info["mass"] = seq_elem.get("mass") - protein_info["checksum"] = seq_elem.get("checksum") - protein_info["modified"] = seq_elem.get("modified") - protein_info["sequence_version"] = seq_elem.get("version") - protein_info["sequence"] = seq_elem.text.strip() - - # Capture the entry's modified/updated date for tracking - entry_modified = entry.attrib.get("modified") or entry.attrib.get("updated") + protein_info["evidence_for_existence"] = get_attr(protein_existence, "type") + + seq_elem = entry.find("ns:sequence", NS) + if seq_elem is not None: + protein_info.update( + clean_dict( + { + "length": get_attr(seq_elem, "length"), + "mass": get_attr(seq_elem, "mass"), + "checksum": get_attr(seq_elem, "checksum"), + "modified": get_attr(seq_elem, "modified"), + "sequence_version": get_attr(seq_elem, "version"), + "sequence": get_text(seq_elem), + } + ) + ) + + entry_modified = get_attr(entry, "modified") or get_attr(entry, "updated") if entry_modified: protein_info["entry_modified"] = entry_modified - # Return the dictionary if any protein info was extracted return protein_info if protein_info else None -def parse_evidence_map(entry): - """ - Parse all elements from a UniProt XML entry and build a mapping - from evidence key to metadata (type, supporting objects, publications). 
- """ - evidence_map = {} +def parse_evidence_map(entry) -> dict[str, dict]: + evidence_map: dict[str, dict] = {} - # Loop through every element in the entry - for evidence in entry.findall("u:evidence", NS): - key = evidence.get("key") # Unique evidence key (string) - evidence_type = evidence.get("type") # Evidence code/type (e.g., ECO:0000255) + for ev in entry.findall("ns:evidence", NS): + key = get_attr(ev, "key") + if not key: + continue - supporting_objects = [] - publications = [] + evidence_type = get_attr(ev, "type") + pubs: list[str] = [] + others: list[str] = [] - # Check if this evidence has a element with children - source = evidence.find("u:source", NS) + source = ev.find("ns:source", NS) if source is not None: - for dbref in source.findall("u:dbReference", NS): - db_type = dbref.get("type") - db_id = dbref.get("id") - # Add publication references as PubMed or DOI; others as supporting objects - if db_type == "PubMed": - publications.append(f"PMID:{db_id}") - elif db_type == "DOI": - publications.append(f"DOI:{db_id}") + raw_pubs, raw_others = parse_db_references(source, NS) + + normalized_pubs: list[str] = [] + for p in raw_pubs: + up = p.upper() + if up.startswith("PUBMED:"): + _, acc = p.split(":", 1) + normalized_pubs.append(f"PMID:{acc}") else: - supporting_objects.append(f"{db_type}:{db_id}") + normalized_pubs.append(p) - # Store evidence metadata, omitting empty lists for cleanliness - evidence_map[key] = { - "evidence_type": evidence_type, - "supporting_objects": supporting_objects if supporting_objects else None, - "publications": publications if publications else None, - } + pubs = normalized_pubs + others = raw_others + + evidence_map[key] = clean_dict( + { + "evidence_type": evidence_type, + "publications": pubs or None, + "supporting_objects": others or None, + } + ) return evidence_map -def parse_reaction_association(reaction, cdm_id, evidence_map): - associations = [] - for dbref in reaction.findall("u:dbReference", NS): +def _make_association( + cdm_id: str, + obj: str, + predicate: str | None = None, + evidence_key: str | None = None, + evidence_map: dict | None = None, +) -> dict: + assoc = { + "subject": cdm_id, + "object": obj, + "predicate": predicate, + "evidence_type": None, + "supporting_objects": None, + "publications": None, + } + if evidence_key and evidence_map and evidence_key in evidence_map: + assoc.update(evidence_map[evidence_key]) + return clean_dict(assoc) + + +def parse_reaction_association(reaction, cdm_id: str, evidence_map: dict[str, dict]) -> list[dict]: + associations: list[dict] = [] + for dbref in reaction.findall("ns:dbReference", NS): db_type = dbref.get("type") db_id = dbref.get("id") + if not db_type or not db_id: + continue + assoc = { "subject": cdm_id, "predicate": "catalyzes", - "object": f"{db_type}:{db_id}", + "object": make_curie(db_type, db_id), "evidence_type": None, "supporting_objects": None, "publications": None, @@ -261,124 +444,127 @@ def parse_reaction_association(reaction, cdm_id, evidence_map): evidence_key = reaction.get("evidence") if evidence_key and evidence_key in evidence_map: assoc.update(evidence_map[evidence_key]) - associations.append(assoc) + associations.append(clean_dict(assoc)) return associations -def parse_cofactor_association(cofactor, cdm_id): - associations = [] - for dbref in cofactor.findall("u:dbReference", NS): +def parse_cofactor_association(cofactor, cdm_id: str) -> list[dict]: + associations: list[dict] = [] + for dbref in cofactor.findall("ns:dbReference", NS): db_type = 
dbref.get("type") db_id = dbref.get("id") - assoc = { - "subject": cdm_id, - "predicate": "requires_cofactor", - "object": f"{db_type}:{db_id}", - "evidence_type": None, - "supporting_objects": None, - "publications": None, - } - associations.append(assoc) + if not db_type or not db_id: + continue + associations.append( + clean_dict( + { + "subject": cdm_id, + "predicate": "requires_cofactor", + "object": make_curie(db_type, db_id), + "evidence_type": None, + "supporting_objects": None, + "publications": None, + } + ) + ) return associations -def parse_associations(entry, cdm_id, evidence_map): +def parse_associations(entry, cdm_id: str, evidence_map: dict[str, dict]) -> list[dict]: """ - Parse all relevant associations from a UniProt XML entry for the CDM model. - Only include fields that are not None for each association. + Only keep: + - taxonomy association + - catalytic activity / cofactor associations """ - associations = [] - - def clean(d): - """Remove None-value keys from a dict.""" - return {k: v for k, v in d.items() if v is not None} + associations: list[dict] = [] # Taxonomy association - organism = entry.find("u:organism", NS) + organism = entry.find("ns:organism", NS) if organism is not None: - taxon_ref = organism.find('u:dbReference[@type="NCBI Taxonomy"]', NS) + taxon_ref = organism.find('ns:dbReference[@type="NCBI Taxonomy"]', NS) if taxon_ref is not None: - associations.append( - clean( - { - "subject": cdm_id, - "object": f"NCBITaxon:{taxon_ref.get('id')}", - "predicate": None, - "evidence_type": None, - "supporting_objects": None, - "publications": None, - } - ) - ) - - # Database cross-references with evidence - for dbref in entry.findall("u:dbReference", NS): - db_type = dbref.get("type") - db_id = dbref.get("id") - association = { - "subject": cdm_id, - "object": f"{db_type}:{db_id}", - "predicate": None, - "evidence_type": None, - "supporting_objects": None, - "publications": None, - } - evidence_key = dbref.get("evidence") - if evidence_key and evidence_key in evidence_map: - association.update(evidence_map[evidence_key]) - associations.append(clean(association)) + tax_id = taxon_ref.get("id") + if tax_id: + associations.append(_make_association(cdm_id, f"NCBITaxon:{tax_id}", predicate="in_taxon")) - # Catalytic/cofactor - for comment in entry.findall("u:comment", NS): + # Catalytic activity / cofactor + for comment in entry.findall("ns:comment", NS): comment_type = comment.get("type") if comment_type == "catalytic activity": - # extract catalytic associations - for reaction in comment.findall("u:reaction", NS): - for assoc in parse_reaction_association(reaction, cdm_id, evidence_map): - associations.append(clean(assoc)) + for reaction in comment.findall("ns:reaction", NS): + associations.extend(parse_reaction_association(reaction, cdm_id, evidence_map)) elif comment_type == "cofactor": - # extract cofactor associations - for cofactor in comment.findall("u:cofactor", NS): - for assoc in parse_cofactor_association(cofactor, cdm_id): - associations.append(clean(assoc)) + for cofactor in comment.findall("ns:cofactor", NS): + associations.extend(parse_cofactor_association(cofactor, cdm_id)) + return associations -def parse_publications(entry): - """ - Extract all publication references from a UniProt XML - Returns a list of standardized publication IDs (PMID and DOI). 
- """ - publications = [] - - # Iterate through all blocks in the entry - for reference in entry.findall("u:reference", NS): - citation = reference.find("u:citation", NS) - if citation is not None: - # Each may have multiple elements (e.g., PubMed, DOI) - for dbref in citation.findall("u:dbReference", NS): - db_type = dbref.get("type") - db_id = dbref.get("id") - # Standardize format for known publication types - if db_type == "PubMed": - publications.append(f"PMID:{db_id}") - elif db_type == "DOI": - publications.append(f"DOI:{db_id}") - - return publications - - -def parse_uniprot_entry(entry, cdm_id, current_timestamp, datasource_name="UniProt import", prev_created=None): - if prev_created: - entity_created = prev_created - entity_updated = current_timestamp - else: - entity_created = current_timestamp - entity_updated = current_timestamp +def parse_cross_references(entry, cdm_id: str) -> list[dict]: + """Generic -> cross_references table.""" + rows: list[dict] = [] + + for dbref in entry.findall("ns:dbReference", NS): + db_type = dbref.get("type") + db_id = dbref.get("id") + if not db_type or not db_id: + continue + + xref_type = normalize_prefix(db_type) + + if ":" in db_id: + xref = db_id + else: + xref = f"{xref_type}:{db_id}" + + rows.append( + clean_dict( + { + "entity_id": cdm_id, + "xref_type": xref_type, + "xref_value": db_id, + "xref": xref, + } + ) + ) + + return rows + + +def parse_publications(entry) -> list[str]: + publications: list[str] = [] + for reference in entry.findall("ns:reference", NS): + citation = reference.find("ns:citation", NS) + if citation is None: + continue + + raw_pubs, _ = parse_db_references(citation, NS) + for p in raw_pubs: + up = p.upper() + if up.startswith("PUBMED:"): + _, acc = p.split(":", 1) + publications.append(f"PMID:{acc}") + elif up.startswith("DOI:"): + _, acc = p.split(":", 1) + publications.append(f"DOI:{acc}") + + return list(dict.fromkeys(publications)) + + +def parse_uniprot_entry( + entry, + cdm_id: str, + current_timestamp: str, + datasource_name: str = "UniProt import", + prev_created: str | None = None, +) -> dict: + entity_created = prev_created or current_timestamp + entity_updated = current_timestamp uniprot_created = entry.attrib.get("created") uniprot_modified = entry.attrib.get("modified") or entry.attrib.get("updated") uniprot_version = entry.attrib.get("version") + entity = { "entity_id": cdm_id, "entity_type": "protein", @@ -389,65 +575,21 @@ def parse_uniprot_entry(entry, cdm_id, current_timestamp, datasource_name="UniPr "uniprot_created": uniprot_created, "uniprot_modified": uniprot_modified, } + evidence_map = parse_evidence_map(entry) + return { "entity": entity, "identifiers": parse_identifiers(entry, cdm_id), "names": parse_names(entry, cdm_id), "protein": parse_protein_info(entry, cdm_id), "associations": parse_associations(entry, cdm_id, evidence_map), + "cross_references": parse_cross_references(entry, cdm_id), "publications": parse_publications(entry), } -def download_file(url, output_path, chunk_size=8192, overwrite=False) -> None: - """ - Download a file from a given URL to a local output path. 
- """ - # Skip download if file already exists and not overwriting - if os.path.exists(output_path) and not overwrite: - print(f"File '{output_path}' already exists.") - return - - # Stream download to avoid high memory usage - try: - with requests.get(url, stream=True, timeout=60) as response: - response.raise_for_status() - with open(output_path, "wb") as f: - for chunk in response.iter_content(chunk_size=chunk_size): - if chunk: - f.write(chunk) - print(f"Downloaded '{url}' to '{output_path}'") - except Exception as e: - print(f"Failed to download '{url}': {e}") - - if os.path.exists(output_path): - os.remove(output_path) # Remove incomplete file - raise - - -def stream_uniprot_xml(filepath): - """ - Stream and parse UniProt XML entries from a local gzipped file. - Yields each element as soon as it is parsed to avoid loading the entire XML into memory. - """ - # Open the gzipped XML file for reading in binary mode - with gzip.open(filepath, "rb") as f: - # Use iterparse to process XML incrementally, triggering on element end events - context = ET.iterparse(f, events=("end",)) - for _event, element in context: - # Check tag name, ignoring namespace - if element.tag.endswith("entry"): - yield element - element.clear() - - -## ================================ SCHEMA ================================= -""" -Defines the Spark schema for all major CDM tables derived from UniProt XML. -Each schema is tailored for protein entities, identifiers, protein details, names, associations, and linked publications. -""" - +# ================================ SCHEMA ================================= schema_entities = StructType( [ StructField("entity_id", StringType(), False), @@ -505,6 +647,15 @@ def stream_uniprot_xml(filepath): ] ) +schema_cross_references = StructType( + [ + StructField("entity_id", StringType(), False), + StructField("xref_type", StringType(), True), + StructField("xref_value", StringType(), True), + StructField("xref", StringType(), True), + ] +) + schema_publications = StructType( [ StructField("entity_id", StringType(), False), @@ -513,126 +664,74 @@ def stream_uniprot_xml(filepath): ) -def save_batches_to_delta(spark, tables, output_dir, namespace) -> None: - """ - Persist batches of parsed records for each CDM table into Delta Lake format. - - - Each table is saved into a Delta directory named '{namespace}_{table}_delta' in the output folder. - - If the Delta directory exists, append new records. Otherwise, overwrite it. - - Registers the table in the Spark SQL for downstream query. - """ - for table, (records, schema) in tables.items(): - if not records: - continue # Skip all empty tables - - delta_dir = os.path.abspath(os.path.join(output_dir, f"{namespace}_{table}_delta")) - # Use "append" mode if the Delta directory already exists, otherwise "overwrite" - mode = "append" if os.path.exists(delta_dir) else "overwrite" - - print( - f"[DEBUG] Registering table: {namespace}.{table} at {delta_dir} with mode={mode}, record count: {len(records)}" - ) - - try: - df = spark.createDataFrame(records, schema) - df.write.format("delta").mode(mode).option("overwriteSchema", "true").save(delta_dir) - spark.sql(f""" - CREATE TABLE IF NOT EXISTS {namespace}.{table} - USING DELTA - LOCATION '{delta_dir}' - """) - except Exception as e: - print(f"Failed to save {table} to Delta: {e}") - - -def prepare_local_xml(xml_url, output_dir): - """ - Download the remote UniProt XML (.xml.gz) file to the specified local output directory, - unless the file already exists locally. 
Returns the full local file path. - """ - # Ensure output directory exists - os.makedirs(output_dir, exist_ok=True) - local_xml_path = os.path.join(output_dir, os.path.basename(xml_url)) - # Download only if file does not exist - download_file(xml_url, local_xml_path) - return local_xml_path - - -def save_datasource_record(xml_url, output_dir): - """ - Generate and save the datasource provenance record as a JSON file in the output directory. - """ - datasource = build_datasource_record(xml_url) - os.makedirs(output_dir, exist_ok=True) # Ensure output directory exists - output_path = os.path.join(output_dir, "datasource.json") - with open(output_path, "w") as f: - json.dump(datasource, f, indent=4) - return datasource - - -def get_spark_session(namespace): - """ - Initialize SparkSession with Delta Lake support, and ensure the target database exists. - """ - # Build SparkSession with Delta extensions enabled - builder = ( - SparkSession.builder.appName("DeltaIngestion") - .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") - .config( - "spark.sql.catalog.spark_catalog", - "org.apache.spark.sql.delta.catalog.DeltaCatalog", - ) - ) - spark = configure_spark_with_delta_pip(builder).getOrCreate() - # Ensure the target namespace (database) exists +# ================================ DELTA WRITE ================================= +def ensure_tables_registered(spark: SparkSession, output_dir: str, namespace: str, table_names: list[str]) -> None: spark.sql(f"CREATE DATABASE IF NOT EXISTS {namespace}") - return spark - - -def load_existing_entity(spark, output_dir, namespace): - """ - Load the existing entities_delta Delta table and build a mapping of entity_id to created timestamp. - This mapping is used to support upserts and idempotent writes. - """ - old_created_dict = {} - entities_table_path = os.path.abspath(os.path.join(output_dir, f"{namespace}_entities_delta")) - if os.path.exists(entities_table_path): - try: - # Read only the required columns for efficiency - old_df = spark.read.format("delta").load(entities_table_path).select("entity_id", "created") - for row in old_df.collect(): - old_created_dict[row["entity_id"]] = row["created"] - print(f"Loaded {len(old_created_dict)} existing entity_id records for upsert.") - except Exception as e: - print(f"Couldn't load previous entities delta table: {e}") - else: - print(f"No previous entities delta at {entities_table_path}.") - return old_created_dict + for tbl in table_names: + # delta_dir = os.path.abspath(os.path.join(output_dir, namespace, tbl)) + delta_dir = delta_table_path(output_dir, namespace, tbl) + spark.sql( + f""" + CREATE TABLE IF NOT EXISTS {namespace}.{tbl} + USING DELTA + LOCATION '{delta_dir}' + """ + ) -def parse_entries(local_xml_path, target_date, batch_size, spark, tables, output_dir, namespace, current_timestamp): - """ - Parse UniProt XML entries, write to Delta Lake in batches - Return (processed_entry_count, skipped_entry_count). 
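Before the reworked save_batches_to_delta below, a hedged sketch of the Delta write options it leans on (mergeSchema for appends, overwriteSchema for overwrites); this assumes pyspark and delta-spark are installed, and the path and rows are illustrative:

from delta import configure_spark_with_delta_pip
from pyspark.sql import SparkSession

builder = (
    SparkSession.builder.appName("delta-write-sketch")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)
spark = configure_spark_with_delta_pip(builder).getOrCreate()

df = spark.createDataFrame([("CDM:example-1", "protein")], ["entity_id", "entity_type"])

# append keeps existing rows; mergeSchema lets later batches add nullable columns
df.write.format("delta").mode("append").option("mergeSchema", "true").save("/tmp/entities_delta_sketch")

# overwrite replaces the data; overwriteSchema allows the stored schema itself to change
df.write.format("delta").mode("overwrite").option("overwriteSchema", "true").save("/tmp/entities_delta_sketch")

spark.stop()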
+def save_batches_to_delta( + spark: SparkSession, + tables: dict[str, tuple[list, StructType]], + output_dir: str, + namespace: str, + mode: str = "append", +) -> None: + for table_name, (records, schema) in tables.items(): + if not records: + continue - """ + # delta_dir = os.path.abspath(os.path.join(output_dir, namespace, table_name)) + delta_dir = delta_table_path(output_dir, namespace, table_name) + df = spark.createDataFrame(records, schema) + writer = df.write.format("delta").mode(mode) + + if mode == "append": + writer = writer.option("mergeSchema", "true") + if mode == "overwrite": + writer = writer.option("overwriteSchema", "true") + + writer.save(delta_dir) + + +## =============================== MAIN PARSING LOOP ================================= +def parse_entries( + local_xml_path: str, + target_date: str | None, + batch_size: int, + spark: SparkSession, + tables: dict[str, tuple[list, StructType]], + output_dir: str, + namespace: str, + current_timestamp: str, + accession_to_entity_id: dict[str, str], + entity_id_to_created: dict[str, str], + mode: str, +) -> tuple[int, int]: target_date_dt = None - - # Convert target_date string to datetime for comparison if provided if target_date: try: target_date_dt = datetime.datetime.strptime(target_date, "%Y-%m-%d") + logger.info("Target date filter enabled: >= %s", target_date) except Exception: - print(f"Invalid target date is {target_date}") + logger.warning("Invalid target date provided: %s (ignored)", target_date) + target_date_dt = None entry_count, skipped = 0, 0 - # Iterate over each element in the XML file for entry_elem in stream_uniprot_xml(local_xml_path): try: - # Get the modification date of the entry mod_date = entry_elem.attrib.get("modified") or entry_elem.attrib.get("updated") - # If target_date is set, skip entries older than target_date + if target_date_dt and mod_date: try: entry_date_dt = datetime.datetime.strptime(mod_date[:10], "%Y-%m-%d") @@ -643,110 +742,197 @@ def parse_entries(local_xml_path, target_date, batch_size, spark, tables, output skipped += 1 continue - # Extract main accession (skip entry if not present) - main_accession_elem = entry_elem.find("u:accession", NS) - if main_accession_elem is None or main_accession_elem.text is None: + main_accession_elem = entry_elem.find("ns:accession", NS) + if main_accession_elem is None or not main_accession_elem.text: skipped += 1 continue - # Generate a unique CDM ID (UUID) for this entry - cdm_id = generate_cdm_id() + accession = main_accession_elem.text.strip() + + cdm_id = accession_to_entity_id.get(accession) or stable_cdm_id_from_uniprot_accession(accession) + prev_created = entity_id_to_created.get(cdm_id) + + record = parse_uniprot_entry(entry_elem, cdm_id, current_timestamp, prev_created=prev_created) - # Parse all sub-objects: entity, identifiers, names, protein, associations, publications - record = parse_uniprot_entry(entry_elem, cdm_id, current_timestamp) tables["entities"][0].append(record["entity"]) tables["identifiers"][0].extend(record["identifiers"]) tables["names"][0].extend(record["names"]) if record["protein"]: tables["proteins"][0].append(record["protein"]) + tables["associations"][0].extend(record["associations"]) - tables["publications"][0].extend( - {"entity_id": record["entity"]["entity_id"], "publication": pub} for pub in record["publications"] - ) + tables["cross_references"][0].extend(record["cross_references"]) + + for pub in record["publications"]: + tables["publications"][0].append( + { + "entity_id": cdm_id, + "publication": 
pub, + } + ) entry_count += 1 - # Write batch to Delta and clear lists every batch_size entries + if entry_count % batch_size == 0: - save_batches_to_delta(spark, tables, output_dir, namespace) + save_batches_to_delta(spark, tables, output_dir, namespace, mode=mode) for v in tables.values(): v[0].clear() - print(f"{entry_count} entries processed and saved") - except Exception as e: - # If any error occurs in parsing this entry, skip it and count - print(f"Error parsing entry: {e}") + logger.info("Processed and saved %d entries...", entry_count) + + except Exception: + logger.exception("Error parsing UniProt entry, skipping") skipped += 1 - continue - # write remaining records - save_batches_to_delta(spark, tables, output_dir, namespace) + save_batches_to_delta(spark, tables, output_dir, namespace, mode=mode) return entry_count, skipped -def ingest_uniprot(xml_url, output_dir, namespace, target_date=None, batch_size=5000) -> None: - # Generate the timestamp for the current run +def ingest_uniprot( + xml_url: str, + output_dir: str, + namespace: str, + target_date: str | None = None, + batch_size: int = 5000, + mode: str = "append", + overwrite_download: bool = False, +) -> None: current_timestamp = datetime.datetime.now(datetime.UTC).isoformat() - # Prepare local XML - local_xml_path = prepare_local_xml(xml_url, output_dir) - - # Save data source meta information + local_xml_path = prepare_local_xml(xml_url, output_dir, overwrite=overwrite_download) save_datasource_record(xml_url, output_dir) - # Get Spark and the existing CDM entity_id spark = get_spark_session(namespace) + if mode == "append": + accession_to_entity_id, entity_id_to_created = load_existing_maps(spark, output_dir, namespace) + else: + accession_to_entity_id, entity_id_to_created = {}, {} - # Define the table structure (batch storage) - entities, identifiers, names, proteins, associations, publications = ( - [], - [], - [], - [], - [], - [], - ) - tables = { + # accession_to_entity_id, entity_id_to_created = load_existing_maps(spark, output_dir, namespace) + + entities: list[dict] = [] + identifiers: list[dict] = [] + names: list[dict] = [] + proteins: list[dict] = [] + associations: list[dict] = [] + cross_references: list[dict] = [] + publications: list[dict] = [] + + tables: dict[str, tuple[list, StructType]] = { "entities": (entities, schema_entities), "identifiers": (identifiers, schema_identifiers), "names": (names, schema_names), "proteins": (proteins, schema_proteins), "associations": (associations, schema_associations), + "cross_references": (cross_references, schema_cross_references), "publications": (publications, schema_publications), } - # Main cycle processing, transfer to current timestamp + ensure_tables_registered( + spark, + output_dir, + namespace, + [ + "entities", + "identifiers", + "names", + "proteins", + "associations", + "cross_references", + "publications", + ], + ) + + logger.info( + "Starting UniProt ingestion: xml=%s | namespace=%s | mode=%s | batch_size=%d", + xml_url, + namespace, + mode, + batch_size, + ) + entry_count, skipped = parse_entries( - local_xml_path, target_date, batch_size, spark, tables, output_dir, namespace, current_timestamp + local_xml_path=local_xml_path, + target_date=target_date, + batch_size=batch_size, + spark=spark, + tables=tables, + output_dir=output_dir, + namespace=namespace, + current_timestamp=current_timestamp, + accession_to_entity_id=accession_to_entity_id, + entity_id_to_created=entity_id_to_created, + mode=mode, ) - print(f"All entries processed 
({entry_count}), skipped {skipped}, writing complete tables.") - spark.sql(f"SHOW TABLES IN {namespace}").show() - spark.sql(f"SELECT COUNT(*) FROM {namespace}.entities").show() - # make sql test in entity table - spark.sql(f"SELECT * FROM {namespace}.entities LIMIT 10").show(truncate=False) + logger.info("Completed parsing UniProt XML. processed=%d skipped=%d", entry_count, skipped) - spark.stop() + logger.info("Verifying Delta tables in namespace `%s`", namespace) + spark.sql(f"SHOW TABLES IN {namespace}").show(truncate=False) - print(f"All Delta tables are created and registered in Spark SQL under `{namespace}`.") + for tbl in [ + "entities", + "identifiers", + "names", + "proteins", + "associations", + "cross_references", + "publications", + ]: + logger.info("Verifying table: %s.%s", namespace, tbl) + spark.sql(f"SELECT COUNT(*) AS row_count FROM {namespace}.{tbl}").show(truncate=False) + spark.sql(f"SELECT * FROM {namespace}.{tbl} LIMIT 5").show(truncate=False) + + spark.stop() + logger.info("Done") +# ================================ CLI ================================= @click.command() @click.option("--xml-url", required=True, help="URL to UniProt XML (.xml.gz)") -@click.option("--output-dir", default="output", help="Output directory for Delta tables") -@click.option("--namespace", default="uniprot_db", help="Delta Lake database name") +@click.option( + "--output-dir", + default="output", + show_default=True, + help="Output directory for Delta tables", +) +@click.option( + "--namespace", + default="uniprot_db", + show_default=True, + help="Delta Lake database name", +) @click.option( "--target-date", default=None, help="Only process entries modified/updated since this date (YYYY-MM-DD)", ) -@click.option("--batch-size", default=5000, help="Batch size for writing Delta tables") -def main(xml_url, output_dir, namespace, target_date, batch_size) -> None: +@click.option( + "--batch-size", + default=5000, + show_default=True, + help="Batch size for writing Delta tables", +) +@click.option( + "--mode", + type=click.Choice(["append", "overwrite"]), + default="append", + show_default=True, +) +@click.option( + "--overwrite-download", + is_flag=True, + help="Force re-download XML even if file exists", +) +def main(xml_url, output_dir, namespace, target_date, batch_size, mode, overwrite_download): ingest_uniprot( xml_url=xml_url, output_dir=output_dir, namespace=namespace, target_date=target_date, batch_size=int(batch_size), + mode=mode, + overwrite_download=overwrite_download, ) diff --git a/src/cdm_data_loader_utils/parsers/uniref.py b/src/cdm_data_loader_utils/parsers/uniref.py index da3327c..6e1cdf3 100644 --- a/src/cdm_data_loader_utils/parsers/uniref.py +++ b/src/cdm_data_loader_utils/parsers/uniref.py @@ -1,5 +1,5 @@ """ -UniRef XML Cluster ETL Pipeline. +UniRef XML Cluster ETL Pipeline This script downloads a UniRef100 XML file, parses cluster and member information, and writes the extracted data into Delta Lake tables for downstream analysis. @@ -22,7 +22,12 @@ --output-dir cdm-data-loader-utils/output/uniref100_clusters \ --batch-size 1000 -**Parameters:** +python3 uniref.py \ + --ftp-url https://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref100/uniref100.xml.gz \ + --output-dir output_uniref \ + --batch-size 1000 + +Parameters: - --ftp-url: UniProt FTP URL to the UniRef100 gzipped XML file. - --output-dir: Output directory where Delta tables will be written. - --batch-size: Number of UniRef entries to process. 
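One note before the UniRef changes below: the new id helper derives CDM ids deterministically with UUIDv5, so re-importing the same cluster accession yields the same entity id, which keeps appends idempotent. A tiny self-contained illustration with a hypothetical accession:

import uuid

def cdm_id(value: str, prefix: str = "CDM:") -> str:
    # same scheme as cdm_entity_id below: UUIDv5 in the OID namespace
    return f"{prefix}{uuid.uuid5(uuid.NAMESPACE_OID, value)}"

assert cdm_id("UniRef100_P12345") == cdm_id("UniRef100_P12345")  # stable across runs
assert cdm_id("UniRef100_P12345") != cdm_id("UniRef100_Q67890")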
@@ -30,118 +35,211 @@ """ import gzip + +### ===== logging setup ===== ### +import logging import os import uuid import xml.etree.ElementTree as ET -from datetime import datetime -from urllib.request import URLError, urlretrieve +from datetime import UTC, datetime +from pathlib import Path +from urllib.error import URLError +from urllib.request import urlretrieve import click from delta import configure_spark_with_delta_pip from pyspark.sql import SparkSession from pyspark.sql.types import StringType, StructField, StructType +from cdm_data_loader_utils.parsers.xml_utils import get_text, parse_properties + +logger = logging.getLogger(__name__) + + +UNIREF_NS = {"ns": "http://uniprot.org/uniref"} +DATA_SOURCE = "UniRef 100/90/50" + + +PREFIX_TRANSLATION = { + "UniProtKB ID": "UniProt", + "UniProtKB accession": "UniProt", + "UniParc ID": "UniParc", + "UniRef90 ID": "UniRef90", + "UniRef50 ID": "UniRef50", + "UniRef100 ID": "UniRef100", +} -# Generate a unique CDM entity_id based on accession -def cdm_entity_id(accession) -> str | None: - if not accession: - return None - uuid_part = uuid.uuid5(uuid.NAMESPACE_OID, accession) - return f"CDM:{uuid_part}" +def generate_dbxref(db: str, acc: str) -> str: + """Generate a database reference that uses BioRegistry prefixes.""" + return f"{PREFIX_TRANSLATION[db]}:{acc}" -# Download a file from the specified URL to the local path if it does not already exist -def download_file(url, local_path) -> None: + +def cdm_entity_id(value: str, prefix: str = "CDM:") -> str: """ - If the file is already present at local, the function does nothing. - If the download fails, any partially downloaded file will be removed. + Deterministic UUIDv5-based CDM id generator. + + value must be non-empty. """ - if not os.path.exists(local_path): - print(f"Downloading from URL link: {url}") + if not value: + raise ValueError("Value must be a non-empty string") + + return f"{prefix}{uuid.uuid5(uuid.NAMESPACE_OID, value)}" + + +# timestamp helper +def get_timestamps( + uniref_id: str, + existing_created: dict[str, str], + now: datetime | None = None, +) -> tuple[str, str]: + """ + Return (updated_time, created_time) for a given UniRef cluster ID. + - All timestamps are UTC ISO8601 with timezone (e.g., 2026-01-05T12:34:56+00:00) + - uniref_id must be non-empty (schema invariant) + """ + if not uniref_id: + raise ValueError("get_timestamps: uniref_id must be a non-empty string") + + now_dt = now or datetime.now(UTC) + updated_time = now_dt.isoformat(timespec="seconds") + + created_time = existing_created.get(uniref_id) or updated_time + return updated_time, created_time + + +def download_file(url: str, local_path: str, overwrite: bool = False) -> str: + """ + Download URL -> local_path. + - Atomic: downloads to .part then os.replace + - Idempotent: skips if exists unless overwrite=True + Returns the final local_path. 
+ """ + dst = Path(local_path) + dst.parent.mkdir(parents=True, exist_ok=True) + + if dst.exists() and not overwrite: + logger.info("File already exists, skip download: %s", dst) + return str(dst) + + tmp = dst.with_suffix(dst.suffix + ".part") + + try: + if tmp.exists(): + tmp.unlink() + except Exception: + logger.exception("Failed to remove partial download: %s", tmp) + + logger.info("Downloading %s -> %s", url, dst) + try: + urlretrieve(url, str(tmp)) + os.replace(tmp, dst) + logger.info("Download complete: %s", dst) + return str(dst) + except Exception: + logger.exception("Failed to download %s", url) try: - urlretrieve(url, local_path) - print("Download completed!") - except Exception as e: - print(f"Failed to download {url}: {e}") - if os.path.exists(local_path): - os.remove(local_path) - raise - else: - print(f"File already exists: {local_path}") + if tmp.exists(): + tmp.unlink() + except Exception: + logger.exception("Failed to cleanup tmp file: %s", tmp) + raise -# Load mapping from data_source_entity_id to created timestamp from Delta table -def load_existing_created(spark, entity_table): - existing_created = {} +def load_existing_created(spark: SparkSession, entity_table: str | None) -> dict[str, str]: + """ + Load mapping data_source_entity_id -> created timestamp from the Entity Delta table. + Returns an empty dict if the table does not exist. + """ + existing_created: dict[str, str] = {} if not entity_table: - print("Entity table path not specified.") + logger.warning("Entity table path not specified.") return existing_created try: df = spark.read.format("delta").load(entity_table).select("data_source_entity_id", "created") existing_created = {row["data_source_entity_id"]: row["created"] for row in df.collect()} - print(f"Loaded {len(existing_created)} existing created timestamps.") + logger.info(f"Loaded {len(existing_created)} existing created timestamps from {entity_table}.") except Exception as e: - print(f"No existing Delta table found at {entity_table}. Starting fresh. ({e.__class__.__name__})") + logger.warning(f"No existing Delta table found at {entity_table}. Starting fresh. ({e.__class__.__name__})") return existing_created ##### -------------- List utility function --------------- ##### - - -# Helper function to extract basic cluster info from XML entry element -def extract_cluster(elem, ns): - cluster_id = f"CDM:{uuid.uuid4()}" +def extract_cluster( + elem: ET.Element, + ns: dict[str, str], + uniref_id: str, +) -> tuple[str, str]: + """ + Extract a deterministic CDM cluster_id and the UniRef cluster name. + """ + cluster_id = cdm_entity_id(value=uniref_id) or f"CDM:{uuid.uuid4()}" name_elem = elem.find("ns:name", ns) - name = name_elem.text if name_elem is not None else "UNKNOWN" - return cluster_id, name - + name = get_text(elem=name_elem, default="UNKNOWN") or "UNKNOWN" -# Returns tuple of (updated_time, created_time) -def get_timestamps(uniref_id, existing_created, now=None): - now_dt = now or datetime.now() - formatted_now = now_dt.strftime("%Y-%m-%dT%H:%M:%S") - created = existing_created.get(uniref_id) - created_time = (created.split(".")[0] if "." 
in created else created) if created else formatted_now - return formatted_now, created_time + return cluster_id, name -# Extract UniProtKB accession and is_seed status from a dbReference element -def get_accession_and_seed(dbref, ns): +def get_accession_and_seed(dbref: ET.Element | None, ns: dict[str, str]) -> tuple[str | None, bool]: + """ + Extract UniProtKB accession and is_seed status from a dbReference element. + """ if dbref is None: return None, False - prop_elems = dbref.findall("ns:property", ns) - props = {} - for prop in prop_elems: - t = prop.attrib["type"] - v = prop.attrib["value"] - props[t] = v + props = parse_properties(dbref, ns) + + raw_acc = props.get("UniProtKB accession") + if isinstance(raw_acc, list): + accession = raw_acc[0] if raw_acc else None + else: + accession = raw_acc # string or None + + raw_seed = props.get("isSeed") + if isinstance(raw_seed, list): + is_seed = bool(raw_seed) and raw_seed[0].lower() == "true" + else: + is_seed = raw_seed is not None and raw_seed.lower() == "true" - acc = props.get("UniProtKB accession") or dbref.attrib.get("id") - is_seed = props.get("isSeed", "false").lower() == "true" - return acc, is_seed + return accession, is_seed -# Add both representative and other cluster members into cluster_member_data list -def add_cluster_members(cluster_id, repr_db, elem, cluster_member_data, ns) -> None: - dbrefs = [] +def add_cluster_members( + cluster_id: str, + repr_db: ET.Element | None, + elem: ET.Element, + cluster_member_rows: list[tuple[str, str, str, str, str]], + ns: dict[str, str], +) -> None: + """Populate cluster_member_rows with representative, member records.""" + dbrefs: list[tuple[ET.Element, bool]] = [] if repr_db is not None: dbrefs.append((repr_db, True)) for mem in elem.findall("ns:member/ns:dbReference", ns): dbrefs.append((mem, False)) for dbref, is_representative in dbrefs: - acc, is_seed = get_accession_and_seed(dbref, ns) - if acc: - member_entity_id = cdm_entity_id(acc) - cluster_member_data.append( - (cluster_id, member_entity_id, str(is_representative).lower(), str(is_seed).lower(), "1.0") + accession, is_seed = get_accession_and_seed(dbref, ns) + if not accession: + continue + + member_entity_id = cdm_entity_id(accession) + if not member_entity_id: + continue + + cluster_member_rows.append( + ( + cluster_id, + member_entity_id, + str(is_representative).lower(), + str(is_seed).lower(), + "1.0", # score placeholder ) + ) -# Extract cross-references (UniRef90/50/UniParc) from a dbReference element def extract_cross_refs(dbref, cross_reference_data, ns) -> None: if dbref is None: return @@ -153,83 +251,95 @@ def extract_cross_refs(dbref, cross_reference_data, ns) -> None: cross_reference_data.append((entity_id, i, props[i])) -##### -------------- Parse Uniref XML --------------- ##### +def parse_uniref_entry( + elem: ET.Element, existing_created: dict[str, str], ns: dict[str, str] +) -> dict[str, list[tuple]]: + """ + Parse a single UniRef element into CDM-friendly row tuples. 
+ """ + cluster_rows: list[tuple[str, str, str, str | None, str]] = [] + entity_rows: list[tuple[str, str, str, str, str, str]] = [] + member_rows: list[tuple[str, str, str, str, str]] = [] + xref_rows: list[tuple[str, str, str]] = [] + + uniref_id = elem.attrib.get("id") or "" + + cluster_id, name = extract_cluster(elem, ns, uniref_id) + updated_time, created_time = get_timestamps(uniref_id, existing_created) + + # Cluster table + cluster_rows.append( + ( + cluster_id, + name, + "protein", + None, + DATA_SOURCE, + ) + ) + # Entity table + entity_rows.append( + ( + cluster_id, + uniref_id, + "Cluster", + DATA_SOURCE, + updated_time, + created_time, + ) + ) -def parse_uniref_xml(local_gz, batch_size, existing_created): - """ - Parse UniRef XML (gzipped) and extract cluster, entity, cluster member, UniProtKB member, and cross-reference info. + # Cross references from representative and members + repr_db = elem.find("ns:representativeMember/ns:dbReference", ns) + extract_cross_refs(repr_db, xref_rows, ns) + + for mem in elem.findall("ns:member/ns:dbReference", ns): + extract_cross_refs(mem, xref_rows, ns) - Args: - local_gz (str): Local gzipped UniRef XML path. - batch_size (int): Maximum number of entries to parse. - existing_created (dict): Mapping from UniRef cluster ID to 'created' timestamp for idempotent imports. + # Cluster members (representative + members) + add_cluster_members(cluster_id, repr_db, elem, member_rows, ns) - Returns: - dict: Dictionary with lists for each CDM table + return { + "cluster_data": cluster_rows, + "entity_data": entity_rows, + "cluster_member_data": member_rows, + "cross_reference_data": xref_rows, + } + + +##### -------------- Parse Uniref XML --------------- ##### +def parse_uniref_xml(local_gz: str, batch_size: int, existing_created: dict[str, str]) -> dict[str, list[tuple]]: + """ + Stream-parse UniRef XML (gzipped) and extract CDM-like row tuples. 
""" - ns = {"ns": "http://uniprot.org/uniref"} # Namespace for XML parsing + ns = UNIREF_NS entry_count = 0 - # Initialize lists to collect parsed rows for different tables - cluster_data = [] - entity_data = [] - cluster_member_data = [] - cross_reference_data = [] + cluster_data: list[tuple] = [] + entity_data: list[tuple] = [] + cluster_member_data: list[tuple] = [] + cross_reference_data: list[tuple] = [] with gzip.open(local_gz, "rb") as f: - # Stream parse the XML to avoid memory issues with big files context = ET.iterparse(f, events=("end",)) for _, elem in context: - if elem.tag.endswith("entry"): - # Cluster basic info - cluster_id, name = extract_cluster(elem, ns) - - # Get UniRef cluster id and timestamps - uniref_id = elem.attrib.get("id") - updated_time, created_time = get_timestamps(uniref_id, existing_created) - - # Populate Cluster and Entity table data - cluster_data.append( - ( - cluster_id, # cluster_id - name, # cluster name - "protein", # entity_type (fixed value) - None, # description (not present) - "UniRef 100", # protocol_id - ) - ) - - entity_data.append( - ( - cluster_id, # entity_id (matches cluster_id) - uniref_id, # data_source_entity_id (UniRef100_xxx) - "Cluster", # entity_type - "UniRef 100", # data_source - updated_time, # updated - created_time, # created - ) - ) - - # Extract UniProtKB member attributes and cross-references - repr_db = elem.find("ns:representativeMember/ns:dbReference", ns) - extract_cross_refs(repr_db, cross_reference_data, ns) - - for mem in elem.findall("ns:member/ns:dbReference", ns): - extract_cross_refs(mem, cross_reference_data, ns) - - # ClusterMember table (representative + members) - add_cluster_members(cluster_id, repr_db, elem, cluster_member_data, ns) - - # Batch size limit - entry_count += 1 - if entry_count >= batch_size: - break - - # Release element to save memory - elem.clear() - - print(f"Parsed {entry_count} clusters") + if not elem.tag.endswith("entry"): + continue + + parsed = parse_uniref_entry(elem, existing_created, ns) + cluster_data.extend(parsed["cluster_data"]) + entity_data.extend(parsed["entity_data"]) + cluster_member_data.extend(parsed["cluster_member_data"]) + cross_reference_data.extend(parsed["cross_reference_data"]) + + entry_count += 1 + if entry_count >= batch_size: + break + + elem.clear() + + logger.info(f"Parsed {entry_count} clusters") return { "cluster_data": cluster_data, "entity_data": entity_data, @@ -238,10 +348,8 @@ def parse_uniref_xml(local_gz, batch_size, existing_created): } -##### -------------- Save dalta table and print the preview --------------- ##### - - -def save_delta_tables(spark, output_dir, data_dict) -> None: +##### -------------- Save delta table and print the preview --------------- ##### +def save_delta_tables(spark, output_dir, data_dict): # Cluster cluster_schema = StructType( [ @@ -255,7 +363,7 @@ def save_delta_tables(spark, output_dir, data_dict) -> None: cluster_df = spark.createDataFrame(data_dict["cluster_data"], cluster_schema) cluster_df.write.format("delta").mode("overwrite").save(os.path.join(output_dir, "Cluster")) - print(f"Cluster Delta table written to: {os.path.join(output_dir, 'Cluster')}") + logger.info(f"Cluster Delta table written to: {os.path.join(output_dir, 'Cluster')}") # Entity entity_schema = StructType( @@ -272,7 +380,7 @@ def save_delta_tables(spark, output_dir, data_dict) -> None: entity_df = spark.createDataFrame(data_dict["entity_data"], entity_schema) entity_table_path = os.path.join(output_dir, "Entity") 
entity_df.write.format("delta").mode("overwrite").save(entity_table_path) - print(f"Entity Delta table written to: {entity_table_path}") + logger.info(f"Entity Delta table written to: {entity_table_path}") # ClusterMember cluster_member_schema = StructType( @@ -288,7 +396,7 @@ def save_delta_tables(spark, output_dir, data_dict) -> None: cluster_member_df = spark.createDataFrame(data_dict["cluster_member_data"], cluster_member_schema) cluster_member_path = os.path.join(output_dir, "ClusterMember") cluster_member_df.write.format("delta").mode("overwrite").save(cluster_member_path) - print(f"ClusterMember Delta table written to: {cluster_member_path}") + logger.info(f"ClusterMember Delta table written to: {cluster_member_path}") # CrossReference cross_reference_schema = StructType( @@ -302,22 +410,22 @@ def save_delta_tables(spark, output_dir, data_dict) -> None: cross_reference_df = spark.createDataFrame(data_dict["cross_reference_data"], cross_reference_schema) cross_reference_path = os.path.join(output_dir, "CrossReference") cross_reference_df.write.format("delta").mode("overwrite").save(cross_reference_path) - print(f"CrossReference Delta table written to: {cross_reference_path}") + logger.info(f"CrossReference Delta table written to: {cross_reference_path}") # Previews - print("Sample Clusters:") + logger.info("Sample Clusters:") cluster_df.createOrReplaceTempView("Cluster") spark.sql("SELECT * FROM Cluster LIMIT 20").show(truncate=False) - print("Sample Entities:") + logger.info("Sample Entities:") entity_df.createOrReplaceTempView("Entity") spark.sql("SELECT * FROM Entity LIMIT 20").show(truncate=False) - print("Sample ClusterMembers:") + logger.info("Sample ClusterMembers:") cluster_member_df.createOrReplaceTempView("ClusterMember") spark.sql("SELECT * FROM ClusterMember LIMIT 20").show(truncate=False) - print("Sample CrossReferences:") + logger.info("Sample CrossReferences:") cross_reference_df.createOrReplaceTempView("CrossReference") spark.sql("SELECT * FROM CrossReference LIMIT 20").show(truncate=False) @@ -327,17 +435,27 @@ def build_spark_session(): builder = ( SparkSession.builder.appName("UniRef Cluster Extractor") .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") - .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") + .config( + "spark.sql.catalog.spark_catalog", + "org.apache.spark.sql.delta.catalog.DeltaCatalog", + ) ) return configure_spark_with_delta_pip(builder).getOrCreate() -# Click command-line interface for parameter parsing @click.command() @click.option("--ftp-url", required=True, help="FTP URL to UniRef100 XML file") @click.option("--output-dir", required=True, help="Output directory for Delta table") @click.option("--batch-size", default=1000, help="Number of UniRef entries to parse (limit)") -def main(ftp_url, output_dir, batch_size) -> None: +def main(ftp_url, output_dir, batch_size): + # set up logging in CLI context + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] (%(name)s:%(lineno)d %(message)s", + ) + + logger.info("Starting UniRef100/90/50 Import Pipeline") + # Set local path for downloaded gzipped XML file local_gz = os.path.join("/tmp", os.path.basename(ftp_url)) @@ -345,23 +463,31 @@ def main(ftp_url, output_dir, batch_size) -> None: try: download_file(ftp_url, local_gz) except URLError as e: - print(f"Error! Cannot download file: {e.reason}") + logger.error(f"Error! 
Cannot download file: {e.reason}") return # Start Spark session with Delta Lake support + logger.info("Building Spark session:") spark = build_spark_session() # Load existing entity creation timestamps - entity_table_path = os.path.join(output_dir, "Entity") - existing_created = load_existing_created(spark, entity_table_path) + try: + entity_table_path = os.path.join(output_dir, "Entity") + existing_created = load_existing_created(spark, entity_table_path) + + # Parse the UniRef XML and extract all CDM table data + logger.info("Parsing UniRef XML:") + data_dict = parse_uniref_xml(local_gz, batch_size, existing_created) - # Parse the UniRef XML and extract all CDM table data - data_dict = parse_uniref_xml(local_gz, batch_size, existing_created) + # Write parsed data to Delta tables in output directory + logger.info("Saving Delta tables:") + save_delta_tables(spark, output_dir, data_dict) - # Write parsed data to Delta tables in output directory - save_delta_tables(spark, output_dir, data_dict) + logger.info("UniRef100/90/50 Import Pipeline completed successfully.") - spark.stop() + finally: + spark.stop() + logger.info("Spark session stopped.") if __name__ == "__main__": diff --git a/src/cdm_data_loader_utils/parsers/xml_utils.py b/src/cdm_data_loader_utils/parsers/xml_utils.py new file mode 100644 index 0000000..d916799 --- /dev/null +++ b/src/cdm_data_loader_utils/parsers/xml_utils.py @@ -0,0 +1,124 @@ +""" +Shared XML helper utilities used by UniProt and UniRef parsers. + +This module centralizes common operations: +- Safe text extraction +- Safe attribute extraction +- Property parsing +- Evidence / dbReference parsing +- Cleaning dictionaries +- Deduplicating lists +""" + +import xml.etree.ElementTree as ET +from typing import Any + +# ============================================================ +# Basic Safe Accessors +# ============================================================ + + +def get_text(elem: ET.Element | None, default: str | None = None) -> str | None: + """Return elem.text if exists and non-empty.""" + if elem is None: + return default + if elem.text is None: + return default + text = elem.text.strip() + return text if text else default + + +def get_attr(elem: ET.Element | None, name: str, default: str | None = None) -> str | None: + """Return elem.get(name) safely.""" + if elem is None: + return default + val = elem.get(name) + return val.strip() if isinstance(val, str) else default + + +# ============================================================ +# List / Node Finders +# ============================================================ + + +def find_one(elem: ET.Element, xpath: str, ns: dict[str, str]): + """Return first element matching xpath or None.""" + results = elem.findall(xpath, ns) + return results[0] if results else None + + +def find_all_text(elem: ET.Element, xpath: str, ns: dict[str, str]) -> list[str]: + """Return list of text values from xpath matches (deduped).""" + texts = [] + for node in elem.findall(xpath, ns): + txt = get_text(node) + if txt: + texts.append(txt) + return list(dict.fromkeys(texts)) # preserve order, dedupe + + +def safe_list(x) -> list[Any]: + """Convert None → [].""" + if x is None: + return [] + if isinstance(x, list): + return x + return [x] + + +# ============================================================ +# dbReference / property parsing (shared by UniProt + UniRef) +# ============================================================ + + +def parse_properties(dbref: ET.Element | None, ns: dict[str, str]) -> dict[str, list[str]]: + 
""" + Extract key/value pairs from blocks. + """ + if dbref is None: + return {} + props = {} + for prop in dbref.findall("ns:property", ns): + ptype = prop.attrib.get("type") + pval = prop.attrib.get("value") + if ptype and pval: + if ptype not in props: + props[ptype] = [] + props[ptype].append(pval) + return props + + +def parse_db_references(elem: ET.Element, ns: dict[str, str], pub_types=("PubMed", "DOI")): + """ + Generic dbReference parser: + - Identify publication IDs (PubMed, DOI) + - Identify other cross-references (dbType:dbId) + """ + publications = [] + others = [] + + for dbref in elem.findall("ns:dbReference", ns): + db_type = dbref.get("type") + db_id = dbref.get("id") + + if not db_type or not db_id: + continue + + if db_type in pub_types: + publications.append(f"{db_type.upper()}:{db_id}") + else: + others.append(f"{db_type}:{db_id}") + + return publications, others + + +# ============================================================ +# Dict Cleaning +# ============================================================ + + +def clean_dict(d: dict[str, Any]) -> dict[str, Any]: + """ + Remove keys whose value is None or empty list. + """ + return {k: v for k, v in d.items() if v not in (None, [], {})} diff --git a/tests/conftest.py b/tests/conftest.py index 049dbb9..bbd7507 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -7,7 +7,7 @@ from typing import Any import pytest -from pyspark.sql import SparkSession, DataFrame +from pyspark.sql import DataFrame, SparkSession from pyspark.sql.types import ( ArrayType, BooleanType, diff --git a/tests/data/refseq/annotation_report.json b/tests/data/refseq/annotation_report.json new file mode 100644 index 0000000..53cd0d6 --- /dev/null +++ b/tests/data/refseq/annotation_report.json @@ -0,0 +1,105 @@ +{ + "reports": [ + { + "annotation": { + "gene_id": "4156250", + "name": "hypothetical protein", + "gene_type": "protein-coding", + "locus_tag": "MIV001R", + "genomic_regions": [ + { + "gene_range": { + "accession_version": "NC_008187.1", + "range": [ + { + "begin": "2620", + "end": "3066", + "orientation": "plus" + } + ] + } + } + ], + "proteins": [ + { + "accession_version": "YP_654573.1", + "name": "hypothetical protein", + "length": 148 + } + ], + "annotations": [ + { + "assembly_accession": "GCF_000869125.1" + } + ] + }, + "row_id": "1" + }, + { + "annotation": { + "gene_id": "4156251", + "name": "hypodermical protein", + "gene_type": "protein-coding", + "locus_tag": "MIV002R", + "genomic_regions": [ + { + "gene_range": { + "accession_version": "NC_008187.1", + "range": [ + { + "begin": "3603", + "end": "4979", + "orientation": "plus" + } + ] + } + } + ], + "proteins": [ + { + "accession_version": "YP_654574.1", + "name": "hypothetical protein", + "length": 458 + } + ], + "annotations": [ + { + "assembly_accession": "GCF_000869125.1" + } + ] + }, + "row_id": "2" + }, + { + "annotation": { + "gene_id": "4156252", + "name": "very hypothetical protein", + "gene_type": "protein-coding", + "locus_tag": "MIV003R", + "symbol": "kappa-delta-phi", + "genomic_regions": [ + { + "gene_range": { + "accession_version": "NC_008187.1", + "range": [ + { + "begin": "5168", + "end": "5638", + "orientation": "minus" + } + ] + } + } + ], + "proteins": [], + "annotations": [ + { + "assembly_accession": "GCF_000869125.1" + } + ] + }, + "row_id": "3" + } + ], + "total_count": 3 +} diff --git a/tests/data/refseq/annotation_report.parsed.json b/tests/data/refseq/annotation_report.parsed.json new file mode 100644 index 0000000..178e040 --- /dev/null +++ 
b/tests/data/refseq/annotation_report.parsed.json @@ -0,0 +1,239 @@ +{ + "contig": [ + { + "contig_id": "refseq:NC_008187.1", + "hash": null, + "gc_content": null, + "length": null + } + ], + "contig_x_contigcollection": [ + { + "contig_id": "refseq:NC_008187.1", + "contig_collection_id": "insdc.gcf:GCF_000869125.1" + } + ], + "contig_x_feature": [ + { + "contig_id": "refseq:NC_008187.1", + "feature_id": "ncbigene:4156250" + }, + { + "contig_id": "refseq:NC_008187.1", + "feature_id": "ncbigene:4156251" + }, + { + "contig_id": "refseq:NC_008187.1", + "feature_id": "ncbigene:4156252" + } + ], + "contig_x_protein": [ + { + "contig_id": "refseq:NC_008187.1", + "protein_id": "refseq:YP_654573.1" + }, + { + "contig_id": "refseq:NC_008187.1", + "protein_id": "refseq:YP_654574.1" + } + ], + "contigcollection": [ + { + "contig_collection_id": "insdc.gcf:GCF_000869125.1", + "hash": null + } + ], + "contigcollection_x_feature": [ + { + "contig_collection_id": "insdc.gcf:GCF_000869125.1", + "feature_id": "ncbigene:4156250" + }, + { + "contig_collection_id": "insdc.gcf:GCF_000869125.1", + "feature_id": "ncbigene:4156251" + }, + { + "contig_collection_id": "insdc.gcf:GCF_000869125.1", + "feature_id": "ncbigene:4156252" + } + ], + "contigcollection_x_protein": [ + { + "contig_collection_id": "insdc.gcf:GCF_000869125.1", + "protein_id": "refseq:YP_654573.1" + }, + { + "contig_collection_id": "insdc.gcf:GCF_000869125.1", + "protein_id": "refseq:YP_654574.1" + } + ], + "feature_x_protein": [ + { + "feature_id": "ncbigene:4156250", + "protein_id": "refseq:YP_654573.1" + }, + { + "feature_id": "ncbigene:4156251", + "protein_id": "refseq:YP_654574.1" + } + ], + "feature": [ + { + "feature_id": "ncbigene:4156250", + "hash": null, + "cds_phase": null, + "e_value": null, + "end": 3066, + "p_value": null, + "start": 2620, + "strand": "positive", + "source_database": "ncbigene", + "protocol_id": null, + "type": "protein-coding" + }, + { + "feature_id": "ncbigene:4156251", + "hash": null, + "cds_phase": null, + "e_value": null, + "end": 4979, + "p_value": null, + "start": 3603, + "strand": "positive", + "source_database": "ncbigene", + "protocol_id": null, + "type": "protein-coding" + }, + { + "feature_id": "ncbigene:4156251", + "hash": null, + "cds_phase": null, + "e_value": null, + "end": 5638, + "p_value": null, + "start": 5168, + "strand": "negative", + "source_database": "ncbigene", + "protocol_id": null, + "type": "protein-coding" + } + ], + "identifier": [ + { + "entity_id": "insdc.gcf:GCF_000869125", + "identifier": "insdc.gcf:GCF_000869125", + "description": "RefSeq genome ID", + "source": "RefSeq", + "relationship": null + }, + { + "entity_id": "refseq:NC_008187.1", + "identifier": "refseq:NC_008187.1", + "description": "RefSeq assembly ID", + "source": "RefSeq", + "relationship": null + }, + { + "entity_id": "ncbigene:4156250", + "identifier": "ncbigene:4156250", + "description": "NCBI gene ID", + "source": "RefSeq", + "relationship": null + }, + { + "entity_id": "ncbigene:4156251", + "identifier": "ncbigene:4156251", + "description": "NCBI gene ID", + "source": "RefSeq", + "relationship": null + }, + { + "entity_id": "ncbigene:4156252", + "identifier": "ncbigene:4156252", + "description": "NCBI gene ID", + "source": "RefSeq", + "relationship": null + }, + { + "entity_id": "refseq:YP_654573.1", + "identifier": "refseq:YP_654573.1", + "description": "RefSeq protein ID", + "source": "RefSeq", + "relationship": null + } + ], + "name": [ + { + "entity_id": "ncbigene:4156250", + "name": "hypothetical protein", 
+ "description": "RefSeq gene name", + "source": "RefSeq" + }, + { + "entity_id": "ncbigene:4156251", + "name": "hypodermical protein", + "description": "RefSeq gene name", + "source": "RefSeq" + }, + { + "entity_id": "ncbigene:4156252", + "name": "very hypothetical protein", + "description": "RefSeq gene name", + "source": "RefSeq" + }, + { + "entity_id": "ncbigene:4156250", + "name": "MIV001R", + "description": "RefSeq locus tag", + "source": "RefSeq" + }, + { + "entity_id": "ncbigene:4156251", + "name": "MIV002R", + "description": "RefSeq locus tag", + "source": "RefSeq" + }, + { + "entity_id": "ncbigene:4156252", + "name": "MIV003R", + "description": "RefSeq locus tag", + "source": "RefSeq" + }, + { + "entity_id": "ncbigene:4156252", + "name": "kappa-delta-phi", + "description": "RefSeq symbol", + "source": "RefSeq" + }, + { + "entity_id": "refseq:YP_654573.1", + "name": "hypothetical protein", + "description": "RefSeq protein name", + "source": "RefSeq" + }, + { + "entity_id": "refseq:YP_654574.1", + "name": "hypothetical protein", + "description": "RefSeq protein name", + "source": "RefSeq" + } + ], + "protein": [ + { + "protein_id": "refseq:YP_654573.1", + "hash": null, + "description": null, + "evidence_for_existence": null, + "length": null, + "sequence": null + }, + { + "protein_id": "refseq:YP_654574.1", + "hash": null, + "description": null, + "evidence_for_existence": null, + "length": null, + "sequence": null + } + ] +} diff --git a/tests/parsers/refseq_importer/__init__.py b/tests/parsers/refseq_importer/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/parsers/refseq_importer/test_spark_delta.py b/tests/parsers/refseq_importer/test_spark_delta.py index b5cd9d0..0f5751e 100644 --- a/tests/parsers/refseq_importer/test_spark_delta.py +++ b/tests/parsers/refseq_importer/test_spark_delta.py @@ -114,14 +114,12 @@ def test_write_delta_contig_collection_schema(spark) -> None: db = "cdmdb" spark.sql(f"CREATE DATABASE IF NOT EXISTS {db}") - schema = StructType( - [ - StructField("collection_id", StringType(), True), - StructField("contig_collection_type", StringType(), True), - StructField("ncbi_taxon_id", StringType(), True), - StructField("gtdb_taxon_id", StringType(), True), - ] - ) + schema = StructType([ + StructField("collection_id", StringType(), True), + StructField("contig_collection_type", StringType(), True), + StructField("ncbi_taxon_id", StringType(), True), + StructField("gtdb_taxon_id", StringType(), True), + ]) df = spark.createDataFrame( [("C1", "isolate", "NCBITaxon:123", None)], diff --git a/tests/parsers/refseq_importer/test_tables_finalize.py b/tests/parsers/refseq_importer/test_tables_finalize.py index c71911c..d9151fd 100644 --- a/tests/parsers/refseq_importer/test_tables_finalize.py +++ b/tests/parsers/refseq_importer/test_tables_finalize.py @@ -20,12 +20,10 @@ def spark(): # ------------------------------------------------------------------- @pytest.mark.requires_spark def test_list_of_dicts_to_spark(spark) -> None: - schema = StructType( - [ - StructField("a", StringType(), True), - StructField("b", StringType(), True), - ] - ) + schema = StructType([ + StructField("a", StringType(), True), + StructField("b", StringType(), True), + ]) rows = [{"a": "1", "b": "x"}, {"a": "2", "b": "y"}] df = list_of_dicts_to_spark(spark, rows, schema) @@ -40,15 +38,13 @@ def test_list_of_dicts_to_spark(spark) -> None: @pytest.mark.requires_spark def test_finalize_tables_basic(spark) -> None: # ---------- entity ---------- - e_schema = StructType( 
- [ - StructField("entity_id", StringType(), True), - StructField("entity_type", StringType(), True), - StructField("data_source", StringType(), True), - StructField("created", StringType(), True), - StructField("updated", StringType(), True), - ] - ) + e_schema = StructType([ + StructField("entity_id", StringType(), True), + StructField("entity_type", StringType(), True), + StructField("data_source", StringType(), True), + StructField("created", StringType(), True), + StructField("updated", StringType(), True), + ]) e1 = spark.createDataFrame( [Row(entity_id="E1", entity_type="genome", data_source="RefSeq", created="2020", updated="2021")], @@ -60,14 +56,12 @@ def test_finalize_tables_basic(spark) -> None: ) # ---------- contig_collection (schema REQUIRED due to None!) ---------- - coll_schema = StructType( - [ - StructField("collection_id", StringType(), True), - StructField("contig_collection_type", StringType(), True), - StructField("ncbi_taxon_id", StringType(), True), - StructField("gtdb_taxon_id", StringType(), True), - ] - ) + coll_schema = StructType([ + StructField("collection_id", StringType(), True), + StructField("contig_collection_type", StringType(), True), + StructField("ncbi_taxon_id", StringType(), True), + StructField("gtdb_taxon_id", StringType(), True), + ]) c1 = spark.createDataFrame( [ diff --git a/tests/parsers/test_annotation_parse.py b/tests/parsers/test_annotation_parse.py new file mode 100644 index 0000000..0173504 --- /dev/null +++ b/tests/parsers/test_annotation_parse.py @@ -0,0 +1,767 @@ +### pytest tests/parsers/test_annotation_parse.py + +import json +from pathlib import Path + +import pytest +from pyspark.sql import SparkSession + +from src.cdm_data_loader_utils.parsers.annotation_parse import parse_annotation_data +from src.cdm_data_loader_utils.parsers.kbase_cdm_pyspark import schema as cdm_schemas + +from tests.validation.assertions import ( + assertDataFrameEqual, + assertDataFrameSchemaEqual, +) + +from src.cdm_data_loader_utils.parsers.annotation_parse import ( + apply_prefix, + load_contig_collection_x_feature, + load_contig_collection_x_protein, + load_contig_x_contig_collection, + load_contigs, + load_feature_records, + load_feature_x_protein, + load_identifiers, + load_names, + parse_annotation_data, + to_int, +) +from tests.conftest import TEST_NS + + +@pytest.mark.parametrize( + "input_data, expected_output", + [ + ( + { + "reports": [ + { + "annotation": { + "gene_id": "1234", + "name": "hypothetical protein", + "relationship": "RefSeq gene symbol", + } + } + ] + }, + [ + ( + "ncbigene:1234", + "1234", + "hypothetical protein", + "RefSeq", + "RefSeq gene symbol", + ) + ], + ), + ( + {"reports": [{"annotation": {"gene_id": "5678", "name": "some protein"}}]}, + [("ncbigene:5678", "5678", "some protein", "RefSeq", None)], + ), + ( + { + "reports": [ + { + "annotation": { + "name": "no gene id here", + "relationship": "RefSeq locus tag", + } + } + ] + }, + [], + ), + ], +) +def test_load_identifiers(input_data, expected_output): + result = load_identifiers(input_data) + assert result == expected_output + + +@pytest.mark.parametrize( + "input_data, expected_output", + [ + # Case 1: all name fields present + ( + { + "reports": [ + { + "annotation": { + "gene_id": "1234", + "symbol": "abc", + "name": "ABC protein", + "locus_tag": "LTG_1234", + } + } + ] + }, + [ + ("ncbigene:1234", "abc", "RefSeq gene symbol", "RefSeq"), + ("ncbigene:1234", "ABC protein", "RefSeq gene name", "RefSeq"), + ("ncbigene:1234", "LTG_1234", "RefSeq locus tag", "RefSeq"), + 
], + ), + # Case 2: only gene_name present + ( + {"reports": [{"annotation": {"gene_id": "5678", "name": "Hypothetical protein"}}]}, + [ + ( + "ncbigene:5678", + "Hypothetical protein", + "RefSeq gene name", + "RefSeq", + ) + ], + ), + # Case 3: no gene_id + ( + {"reports": [{"annotation": {"name": "Unnamed", "symbol": "XYZ"}}]}, + [], + ), + # Case 4: only locus_tag present + ( + {"reports": [{"annotation": {"gene_id": "8888", "locus_tag": "LTG_8888"}}]}, + [("ncbigene:8888", "LTG_8888", "RefSeq locus tag", "RefSeq")], + ), + # Case 5: multiple reports + ( + { + "reports": [ + {"annotation": {"gene_id": "1001", "symbol": "DEF"}}, + {"annotation": {"gene_id": "1002", "name": "DEF protein"}}, + ] + }, + [ + ("ncbigene:1001", "DEF", "RefSeq gene symbol", "RefSeq"), + ("ncbigene:1002", "DEF protein", "RefSeq gene name", "RefSeq"), + ], + ), + ], +) +def test_load_names(input_data, expected_output): + result = load_names(input_data) + assert sorted(result) == sorted(expected_output) + + +@pytest.mark.parametrize( + "input_data, expected_output", + [ + # Case 1: basic valid input with plus strand + ( + { + "reports": [ + { + "annotation": { + "gene_id": "1234", + "genomic_regions": [ + { + "gene_range": { + "range": [ + { + "begin": "100", + "end": "200", + "orientation": "plus", + } + ] + } + } + ], + } + } + ] + }, + [ + ( + "ncbigene:1234", + None, + None, + None, + 200, + None, + 100, + "positive", + "RefSeq", + None, + "gene", + ) + ], + ), + # Case 2: multiple ranges, different strands + ( + { + "reports": [ + { + "annotation": { + "gene_id": "5678", + "genomic_regions": [ + { + "gene_range": { + "range": [ + { + "begin": "300", + "end": "500", + "orientation": "minus", + }, + { + "begin": "600", + "end": "800", + "orientation": "plus", + }, + ] + } + } + ], + } + } + ] + }, + [ + ( + "ncbigene:5678", + None, + None, + None, + 500, + None, + 300, + "negative", + "RefSeq", + None, + "gene", + ), + ( + "ncbigene:5678", + None, + None, + None, + 800, + None, + 600, + "positive", + "RefSeq", + None, + "gene", + ), + ], + ), + # Case 3: missing orientation + ( + { + "reports": [ + { + "annotation": { + "gene_id": "9999", + "genomic_regions": [{"gene_range": {"range": [{"begin": "1", "end": "2"}]}}], + } + } + ] + }, + [ + ( + "ncbigene:9999", + None, + None, + None, + 2, + None, + 1, + "unknown", + "RefSeq", + None, + "gene", + ) + ], + ), + # Case 4: no gene_id + ( + { + "reports": [ + { + "annotation": { + "genomic_regions": [ + { + "gene_range": { + "range": [ + { + "begin": "100", + "end": "200", + "orientation": "plus", + } + ] + } + } + ] + } + } + ] + }, + [], + ), + # Case 5: non-integer start/end + ( + { + "reports": [ + { + "annotation": { + "gene_id": "1111", + "genomic_regions": [ + { + "gene_range": { + "range": [ + { + "begin": "abc", + "end": "xyz", + "orientation": "plus", + } + ] + } + } + ], + } + } + ] + }, + [ + ( + "ncbigene:1111", + None, + None, + None, + None, + None, + None, + "positive", + "RefSeq", + None, + "gene", + ) + ], + ), + ], +) +def test_load_feature_records(input_data, expected_output): + result = load_feature_records(input_data) + assert sorted(result) == sorted(expected_output) + + +@pytest.mark.parametrize( + "input_data, expected_output", + [ + # Case 1: valid mapping + ( + { + "reports": [ + { + "annotation": { + "gene_id": "12345", + "genomic_regions": [{"gene_range": {"accession_version": "NC_000001.11"}}], + } + } + ] + }, + [("refseq:NC_000001.11", "ncbigene:12345")], + ), + # Case 2: no gene_id + ( + {"reports": [{"annotation": 
{"genomic_regions": [{"gene_range": {"accession_version": "NC_000002.11"}}]}}]}, + [], + ), + # Case 3: no genomic_regions + ( + {"reports": [{"annotation": {"gene_id": "67890"}}]}, + [], + ), + # Case 4: empty genomic_regions list + ( + {"reports": [{"annotation": {"gene_id": "99999", "genomic_regions": []}}]}, + [], + ), + # Case 5: missing accession_version + ( + { + "reports": [ + { + "annotation": { + "gene_id": "13579", + "genomic_regions": [{"gene_range": {}}], + } + } + ] + }, + [], + ), + ], +) +def test_load_contig_collection_x_feature(input_data, expected_output): + result = load_contig_collection_x_feature(input_data) + assert result == expected_output + + +@pytest.mark.parametrize( + "input_data, expected_output", + [ + # Case 1: Valid report with multiple proteins + ( + { + "reports": [ + { + "annotation": { + "proteins": [ + {"accession_version": "XP_123"}, + {"accession_version": "XP_456"}, + ], + "annotations": [{"assembly_accession": "GCF_000001"}], + } + } + ] + }, + [ + ("insdc.gcf:GCF_000001", "refseq:XP_123"), + ("insdc.gcf:GCF_000001", "refseq:XP_456"), + ], + ), + # Case 2: No proteins + ( + { + "reports": [ + { + "annotation": { + "proteins": [], + "annotations": [{"assembly_accession": "GCF_000002"}], + } + } + ] + }, + [], + ), + # Case 3: No annotations + ( + {"reports": [{"annotation": {"proteins": [{"accession_version": "XP_789"}]}}]}, + [], + ), + # Case 4: Missing assembly_accession + ( + { + "reports": [ + { + "annotation": { + "proteins": [{"accession_version": "XP_789"}], + "annotations": [{}], + } + } + ] + }, + [], + ), + # Case 5: Some proteins missing accession_version + ( + { + "reports": [ + { + "annotation": { + "proteins": [ + {"accession_version": "XP_111"}, + {}, + {"accession_version": "XP_222"}, + ], + "annotations": [{"assembly_accession": "GCF_000003"}], + } + } + ] + }, + [ + ("insdc.gcf:GCF_000003", "refseq:XP_111"), + ("insdc.gcf:GCF_000003", "refseq:XP_222"), + ], + ), + ], +) +def test_load_contig_collection_x_protein(input_data, expected_output): + result = load_contig_collection_x_protein(input_data) + assert sorted(result) == sorted(expected_output) + + +@pytest.mark.parametrize( + "input_data, expected_output", + [ + # Case 1: valid gene with multiple proteins + ( + { + "reports": [ + { + "annotation": { + "gene_id": "4156311", + "proteins": [ + {"accession_version": "XP_001"}, + {"accession_version": "XP_002"}, + ], + } + } + ] + }, + [ + ("ncbigene:4156311", "refseq:XP_001"), + ("ncbigene:4156311", "refseq:XP_002"), + ], + ), + # Case 2: no gene_id + ( + {"reports": [{"annotation": {"proteins": [{"accession_version": "XP_999"}]}}]}, + [], + ), + # Case 3: gene with no proteins + ( + {"reports": [{"annotation": {"gene_id": "4156312"}}]}, + [], + ), + # Case 4: some proteins missing accession_version + ( + { + "reports": [ + { + "annotation": { + "gene_id": "4156313", + "proteins": [ + {"accession_version": "XP_777"}, + {}, + {"accession_version": "XP_888"}, + ], + } + } + ] + }, + [ + ("ncbigene:4156313", "refseq:XP_777"), + ("ncbigene:4156313", "refseq:XP_888"), + ], + ), + # Case 5: empty report list + ({"reports": []}, []), + ], +) +def test_load_feature_x_protein(input_data, expected_output): + result = load_feature_x_protein(input_data) + assert sorted(result) == sorted(expected_output) + + +@pytest.mark.parametrize( + "input_data, expected_output", + [ + # Case 1: Valid contig and assembly + ( + { + "reports": [ + { + "annotation": { + "genomic_regions": [{"gene_range": {"accession_version": "NC_000001.11"}}], + 
"annotations": [{"assembly_accession": "GCF_000001.1"}], + } + } + ] + }, + [("refseq:NC_000001.11", "insdc.gcf:GCF_000001.1")], + ), + # Case 2: Missing genomic_regions + ( + {"reports": [{"annotation": {"annotations": [{"assembly_accession": "GCF_000002.1"}]}}]}, + [], + ), + # Case 3: Missing annotations + ( + {"reports": [{"annotation": {"genomic_regions": [{"gene_range": {"accession_version": "NC_000003.11"}}]}}]}, + [], + ), + # Case 4: Missing accession_version in region + ( + { + "reports": [ + { + "annotation": { + "genomic_regions": [{"gene_range": {}}], + "annotations": [{"assembly_accession": "GCF_000004.1"}], + } + } + ] + }, + [], + ), + # Case 5: Missing assembly_accession in annotations + ( + { + "reports": [ + { + "annotation": { + "genomic_regions": [{"gene_range": {"accession_version": "NC_000005.11"}}], + "annotations": [{}], + } + } + ] + }, + [], + ), + # Case 6: Multiple reports, one valid + ( + { + "reports": [ + { + "annotation": { + "genomic_regions": [{"gene_range": {"accession_version": "NC_000006.11"}}], + "annotations": [{"assembly_accession": "GCF_000006.1"}], + } + }, + { + "annotation": { + "genomic_regions": [{"gene_range": {"accession_version": "NC_000007.11"}}], + "annotations": [{}], + } + }, + ] + }, + [("refseq:NC_000006.11", "insdc.gcf:GCF_000006.1")], + ), + ], +) +def test_load_contig_x_contig_collection(input_data, expected_output): + result = load_contig_x_contig_collection(input_data) + assert sorted(result) == sorted(expected_output) + + +@pytest.mark.parametrize( + "input_data, expected_output", + [ + # Case 1: Valid contig with accession_version + ( + {"reports": [{"annotation": {"genomic_regions": [{"gene_range": {"accession_version": "NC_000001.11"}}]}}]}, + [("refseq:NC_000001.11", None, None, None)], + ), + # Case 2: Multiple contigs, different accession_versions + ( + { + "reports": [ + { + "annotation": { + "genomic_regions": [ + {"gene_range": {"accession_version": "NC_000001.11"}}, + {"gene_range": {"accession_version": "NC_000002.12"}}, + ] + } + } + ] + }, + [ + ("refseq:NC_000001.11", None, None, None), + ("refseq:NC_000002.12", None, None, None), + ], + ), + # Case 3: Duplicate accession versions + ( + { + "reports": [ + { + "annotation": { + "genomic_regions": [ + {"gene_range": {"accession_version": "NC_000003.13"}}, + {"gene_range": {"accession_version": "NC_000003.13"}}, + ] + } + } + ] + }, + [("refseq:NC_000003.13", None, None, None)], + ), + # Case 4: Missing accession_version + ( + {"reports": [{"annotation": {"genomic_regions": [{"gene_range": {}}]}}]}, + [], + ), + # Case 5: Empty reports + ( + {"reports": []}, + [], + ), + ], +) +def test_load_contigs(input_data, expected_output): + result = load_contigs(input_data) + assert sorted(result) == sorted(expected_output) + + +### add new test: to_int +@pytest.mark.parametrize( + "input_id, expected", + [ + ("GeneID:123", "ncbigene:123"), + ("YP_009725307.1", "refseq:YP_009725307.1"), + ("GCF_000001405.39", "insdc.gcf:GCF_000001405.39"), + ("random", "random"), + ], +) +def test_apply_prefix(input_id, expected): + assert apply_prefix(input_id) == expected + + +@pytest.mark.parametrize("val, expected", [("123", 123), ("abc", None), ("", None)]) +def test_to_int(val, expected): + assert to_int(val) == expected + + +TABLE_NAME_MAP = { + "contig": "Contig", + "feature": "Feature", + "identifier": "Identifier", + "name": "Name", + "contig_x_contigcollection": "Contig_x_ContigCollection", + "contigcollection_x_feature": "ContigCollection_x_Feature", + "contigcollection_x_protein": 
"ContigCollection_x_Protein", + "feature_x_protein": "Feature_x_Protein", +} + + +@pytest.mark.requires_spark +def test_parse_annotation_data(spark: SparkSession, test_data_dir: Path) -> None: + """Test the parsing of the annotation data with direct Delta table.""" + + test_ns = TEST_NS.lower() + spark.sql(f"CREATE DATABASE IF NOT EXISTS {TEST_NS}") + + # Load NCBI dataset from NCBI API + sample_api_response = test_data_dir / "refseq" / "annotation_report.json" + dataset = json.load(sample_api_response.open()) + + # Run parse function + parse_annotation_data(spark, [dataset], test_ns) + + # Expected tables to validate from output + expected_tables = [ + "contig", + "contig_x_contigcollection", + "contigcollection_x_feature", + "contigcollection_x_protein", + "feature", + "feature_x_protein", + "identifier", + "name", + ] + + for table_name in expected_tables: + result_df = spark.table(f"{test_ns}.{table_name}") + schema_key = TABLE_NAME_MAP[table_name] + + # Construct expected_df just for schema comparison + rows = [r.asDict() for r in result_df.collect()] + expected_df = spark.createDataFrame(rows, schema=cdm_schemas[schema_key]) + + # Assert schema match + assertDataFrameSchemaEqual( + expected_df, + result_df, + msg=f"{table_name}: schema mismatch", + ) + # Assert content match + assertDataFrameEqual( + expected_df, + result_df, + ignore_row_order=True, + msg=f"{table_name}: content mismatch", + ) diff --git a/tests/parsers/test_shared_identifiers.py b/tests/parsers/test_shared_identifiers.py new file mode 100644 index 0000000..b76e9af --- /dev/null +++ b/tests/parsers/test_shared_identifiers.py @@ -0,0 +1,34 @@ +import xml.etree.ElementTree as ET + +from cdm_data_loader_utils.parsers.shared_identifiers import parse_identifiers_generic + + +def test_parse_identifiers_generic_basic() -> None: + # + # P12345 + # Q99999 + # + ns = {"ns": "dummy"} + entry = ET.Element("entry") + + a1 = ET.SubElement(entry, "accession") + a1.text = "P12345" + a2 = ET.SubElement(entry, "accession") + a2.text = "Q99999" + + # Add namespace prefix to match xpath + a1.tag = "{dummy}accession" + a2.tag = "{dummy}accession" + + rows = parse_identifiers_generic( + entry=entry, + xpath="ns:accession", + prefix="UniProt", + ns=ns, + ) + + assert len(rows) == 2 + assert rows[0]["identifier"] == "UniProt:P12345" + assert rows[1]["identifier"] == "UniProt:Q99999" + assert rows[0]["source"] == "UniProt" + assert rows[0]["description"] == "UniProt accession" diff --git a/tests/parsers/test_xml_utils.py b/tests/parsers/test_xml_utils.py new file mode 100644 index 0000000..fc6e3ba --- /dev/null +++ b/tests/parsers/test_xml_utils.py @@ -0,0 +1,49 @@ +import xml.etree.ElementTree as ET + +from cdm_data_loader_utils.parsers.xml_utils import ( + clean_dict, + get_attr, + get_text, + parse_db_references, +) + + +def test_get_text_and_get_attr_basic() -> None: + elem = ET.Element("tag", attrib={"id": "123"}) + elem.text = " hello " + + assert get_text(elem) == "hello" + assert get_text(None) is None + assert get_attr(elem, "id") == "123" + assert get_attr(elem, "missing") is None + + +def test_parse_db_references_pub_and_others() -> None: + ns = {"ns": "dummy"} + source = ET.Element("source") + db1 = ET.SubElement(source, "dbReference", attrib={"type": "PubMed", "id": "12345"}) + db2 = ET.SubElement(source, "dbReference", attrib={"type": "DOI", "id": "10.1000/xyz"}) + db3 = ET.SubElement(source, "dbReference", attrib={"type": "PDB", "id": "1ABC"}) + + db1.tag = "{dummy}dbReference" + db2.tag = "{dummy}dbReference" + db3.tag = 
"{dummy}dbReference" + + pubs, others = parse_db_references(source, ns) + + assert "PUBMED:12345" in pubs + assert "DOI:10.1000/xyz" in pubs + assert "PDB:1ABC" in others + + +def test_clean_dict_removes_nones_and_empty() -> None: + """Test that clean_dict removes None and empty values.""" + d = { + "a": 1, + "b": None, + "c": [], + "d": {}, + "e": "ok", + } + cleaned = clean_dict(d) + assert cleaned == {"a": 1, "e": "ok"} diff --git a/tests/validation/assertions.py b/tests/validation/assertions.py new file mode 100644 index 0000000..b8ef6f1 --- /dev/null +++ b/tests/validation/assertions.py @@ -0,0 +1,40 @@ +from typing import List, Optional +from pyspark.sql import DataFrame +from pyspark.sql.types import StructType +from math import isclose + + +def assertDataFrameSchemaEqual(df1: DataFrame, df2: DataFrame, msg: str = "") -> None: + fields1 = [(f.name, f.dataType) for f in df1.schema.fields] + fields2 = [(f.name, f.dataType) for f in df2.schema.fields] + + assert fields1 == fields2, f"{msg}\nSchema mismatch:\n{fields1}\n!=\n{fields2}" + + +def assertDataFrameEqual( + df1: DataFrame, + df2: DataFrame, + msg: str = "", + ignore_row_order: bool = False, + float_tol: Optional[float] = None, +) -> None: + """ + Assert two DataFrames are equal in content. + """ + if ignore_row_order: + df1_rows = sorted([tuple(row) for row in df1.collect()]) + df2_rows = sorted([tuple(row) for row in df2.collect()]) + else: + df1_rows = df1.collect() + df2_rows = df2.collect() + + assert len(df1_rows) == len(df2_rows), f"{msg}\nRow count mismatch: {len(df1_rows)} != {len(df2_rows)}" + + for i, (r1, r2) in enumerate(zip(df1_rows, df2_rows)): + for j, (v1, v2) in enumerate(zip(r1, r2)): + if float_tol is not None and isinstance(v1, float) and isinstance(v2, float): + assert isclose(v1, v2, rel_tol=float_tol), ( + f"{msg}\nFloat mismatch at row {i}, col {j}: {v1} != {v2} within tol {float_tol}" + ) + else: + assert v1 == v2, f"{msg}\nValue mismatch at row {i}, col {j}: {v1} != {v2}" diff --git a/tests/validation/test_dataframe_validator.py b/tests/validation/test_dataframe_validator.py index 63a316c..f78dcd2 100644 --- a/tests/validation/test_dataframe_validator.py +++ b/tests/validation/test_dataframe_validator.py @@ -1,4 +1,11 @@ -"""Tests for parser error handling, schema compliance, and so on.""" +""" + +Tests for DataFrameValidator behavior: +- empty dataframe handling +- mocked validation flow +- integration validation on real RefSeq CDM outputs + +""" from typing import Any from unittest.mock import MagicMock @@ -6,6 +13,7 @@ import pytest from pyspark.sql import DataFrame, SparkSession from pyspark.sql.types import StructField, StructType +from pyspark.sql.functions import col, when, lit from cdm_data_loader_utils.audit.schema import METRICS, REJECTS, ROW_ERRORS from cdm_data_loader_utils.core.constants import INVALID_DATA_FIELD_NAME @@ -14,6 +22,9 @@ from tests.audit.conftest import create_table +# ------------------------------------------------------------------------------ +# Unit tests +# ------------------------------------------------------------------------------ @pytest.mark.requires_spark def test_validate_dataframe_empty_df(pipeline_run: PipelineRun, empty_df: DataFrame) -> None: """Assert that an empty dataframe does not perform any validation.""" @@ -80,3 +91,57 @@ def test_validate_dataframe_no_validation( # noqa: PLR0913 assert metrics.count() == 1 rejects = spark.table(f"{pipeline_run.namespace}.{REJECTS}") assert rejects.count() == output.records_invalid + + +# 
------------------------------------------------------------------------------ +# Integration-style test (real RefSeq CDM output) +# ------------------------------------------------------------------------------ + + +@pytest.mark.requires_spark +def test_validate_refseq_cdm( + spark: SparkSession, + pipeline_run: PipelineRun, +) -> None: + # Prepare audit tables from scratch + for t in (METRICS, REJECTS): + create_table(spark, t, add_default_data=False) + + # Load real pipeline output + df = spark.table(f"{pipeline_run.namespace}.cdm_identifiers") + + # Sanity check: pipeline actually produced data + assert df.count() > 0 + assert "identifier" in df.columns + + # Simple validation rule: identifier cannot be null + def validation_fn(df: DataFrame) -> DataFrame: + return df.withColumn( + INVALID_DATA_FIELD_NAME, + when(col("identifier").isNull(), lit("identifier is null")), + ) + + validator = Validator(validation_fn, {}) + + dfv = DataFrameValidator(spark) + output = dfv.validate_dataframe( + data_to_validate=df, + schema=df.schema.fields, + run=pipeline_run, + validator=validator, + invalid_col=INVALID_DATA_FIELD_NAME, + ) + + # Records accounting + assert output.records_read == df.count() + assert output.records_valid + output.records_invalid == output.records_read + + # valid_df must not contain invalid rows + assert output.valid_df.filter(col(INVALID_DATA_FIELD_NAME).isNotNull()).count() == 0 + + # Audit tables written + metrics = spark.table(f"{pipeline_run.namespace}.{METRICS}") + rejects = spark.table(f"{pipeline_run.namespace}.{REJECTS}") + + assert metrics.count() == 1 + assert rejects.count() == output.records_invalid diff --git a/uv.lock b/uv.lock index 0110326..cec3518 100644 --- a/uv.lock +++ b/uv.lock @@ -95,6 +95,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/fb/76/641ae371508676492379f16e2fa48f4e2c11741bd63c48be4b12a6b09cba/aiosignal-1.4.0-py3-none-any.whl", hash = "sha256:053243f8b92b990551949e63930a839ff0cf0b0ebbe0597b0f3fb19e1a0fe82e", size = 7490, upload-time = "2025-07-03T22:54:42.156Z" }, ] +[[package]] +name = "alabaster" +version = "1.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a6/f8/d9c74d0daf3f742840fd818d69cfae176fa332022fd44e3469487d5a9420/alabaster-1.0.0.tar.gz", hash = "sha256:c00dca57bca26fa62a6d7d0a9fcce65f3e026e9bfe33e9c538fd3fbb2144fd9e", size = 24210, upload-time = "2024-07-26T18:15:03.762Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7e/b3/6b4067be973ae96ba0d615946e314c5ae35f9f993eca561b356540bb0c2b/alabaster-1.0.0-py3-none-any.whl", hash = "sha256:fc6786402dc3fcb2de3cabd5fe455a2db534b371124f1f21de8731783dec828b", size = 13929, upload-time = "2024-07-26T18:15:02.05Z" }, +] + [[package]] name = "annotated-types" version = "0.7.0" @@ -236,6 +245,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b7/b8/3fe70c75fe32afc4bb507f75563d39bc5642255d1d94f1f23604725780bf/babel-2.17.0-py3-none-any.whl", hash = "sha256:4d0b53093fdfb4b21c92b5213dba5a1b23885afa8383709427046b21c366e5f2", size = 10182537, upload-time = "2025-02-01T15:17:37.39Z" }, ] +[[package]] +name = "backrefs" +version = "6.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/86/e3/bb3a439d5cb255c4774724810ad8073830fac9c9dee123555820c1bcc806/backrefs-6.1.tar.gz", hash = "sha256:3bba1749aafe1db9b915f00e0dd166cba613b6f788ffd63060ac3485dc9be231", size = 7011962, upload-time = "2025-11-15T14:52:08.323Z" } +wheels = [ + { url 
= "https://files.pythonhosted.org/packages/3b/ee/c216d52f58ea75b5e1841022bbae24438b19834a29b163cb32aa3a2a7c6e/backrefs-6.1-py310-none-any.whl", hash = "sha256:2a2ccb96302337ce61ee4717ceacfbf26ba4efb1d55af86564b8bbaeda39cac1", size = 381059, upload-time = "2025-11-15T14:51:59.758Z" }, + { url = "https://files.pythonhosted.org/packages/e6/9a/8da246d988ded941da96c7ed945d63e94a445637eaad985a0ed88787cb89/backrefs-6.1-py311-none-any.whl", hash = "sha256:e82bba3875ee4430f4de4b6db19429a27275d95a5f3773c57e9e18abc23fd2b7", size = 392854, upload-time = "2025-11-15T14:52:01.194Z" }, + { url = "https://files.pythonhosted.org/packages/37/c9/fd117a6f9300c62bbc33bc337fd2b3c6bfe28b6e9701de336b52d7a797ad/backrefs-6.1-py312-none-any.whl", hash = "sha256:c64698c8d2269343d88947c0735cb4b78745bd3ba590e10313fbf3f78c34da5a", size = 398770, upload-time = "2025-11-15T14:52:02.584Z" }, + { url = "https://files.pythonhosted.org/packages/eb/95/7118e935b0b0bd3f94dfec2d852fd4e4f4f9757bdb49850519acd245cd3a/backrefs-6.1-py313-none-any.whl", hash = "sha256:4c9d3dc1e2e558965202c012304f33d4e0e477e1c103663fd2c3cc9bb18b0d05", size = 400726, upload-time = "2025-11-15T14:52:04.093Z" }, + { url = "https://files.pythonhosted.org/packages/1d/72/6296bad135bfafd3254ae3648cd152980a424bd6fed64a101af00cc7ba31/backrefs-6.1-py314-none-any.whl", hash = "sha256:13eafbc9ccd5222e9c1f0bec563e6d2a6d21514962f11e7fc79872fd56cbc853", size = 412584, upload-time = "2025-11-15T14:52:05.233Z" }, + { url = "https://files.pythonhosted.org/packages/02/e3/a4fa1946722c4c7b063cc25043a12d9ce9b4323777f89643be74cef2993c/backrefs-6.1-py39-none-any.whl", hash = "sha256:a9e99b8a4867852cad177a6430e31b0f6e495d65f8c6c134b68c14c3c95bf4b0", size = 381058, upload-time = "2025-11-15T14:52:06.698Z" }, +] + [[package]] name = "beautifulsoup4" version = "4.14.3" @@ -376,6 +399,7 @@ source = { editable = "." 
} dependencies = [ { name = "berdl-notebook-utils" }, { name = "biopython" }, + { name = "cdm-schema" }, { name = "click" }, { name = "lxml" }, { name = "pytest-asyncio" }, @@ -428,6 +452,19 @@ wheels = [ [package.metadata] requires-dist = [{ name = "jupyterlab", specifier = ">=3.0" }] +[[package]] +name = "cdm-schema" +version = "0.1.0" +source = { git = "https://github.com/kbase/cdm-schema.git#ae0e5d60f826d53507b117149c33a0f98051296b" } +dependencies = [ + { name = "linkml" }, + { name = "linkml-runtime" }, + { name = "mkdocs-material" }, + { name = "mkdocs-mermaid2-plugin" }, + { name = "pyspark" }, + { name = "ruff" }, +] + [[package]] name = "cdm-spark-manager-client" version = "0.0.1" @@ -507,6 +544,24 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ae/3a/dbeec9d1ee0844c679f6bb5d6ad4e9f198b1224f4e7a32825f47f6192b0c/cffi-2.0.0-cp314-cp314t-win_arm64.whl", hash = "sha256:0a1527a803f0a659de1af2e1fd700213caba79377e27e4693648c2923da066f9", size = 184195, upload-time = "2025-09-08T23:23:43.004Z" }, ] +[[package]] +name = "cfgraph" +version = "0.2.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "rdflib" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/cb/51/3e7e021920cfe2f7d18b672642e13f7dc4f53545d530b52ee6533b6681ca/CFGraph-0.2.1.tar.gz", hash = "sha256:b57fe7044a10b8ff65aa3a8a8ddc7d4cd77bf511b42e57289cd52cbc29f8fe74", size = 2630, upload-time = "2018-11-20T15:27:28.69Z" } + +[[package]] +name = "chardet" +version = "5.2.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f3/0d/f7b6ab21ec75897ed80c17d79b15951a719226b9fababf1e40ea74d69079/chardet-5.2.0.tar.gz", hash = "sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7", size = 2069618, upload-time = "2023-08-01T19:23:02.662Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/38/6f/f5fbc992a329ee4e0f288c1fe0e2ad9485ed064cac731ed2fe47dcc38cbf/chardet-5.2.0-py3-none-any.whl", hash = "sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970", size = 199385, upload-time = "2023-08-01T19:23:00.661Z" }, +] + [[package]] name = "charset-normalizer" version = "3.4.4" @@ -550,14 +605,14 @@ wheels = [ [[package]] name = "click" -version = "8.3.1" +version = "8.1.8" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "colorama", marker = "sys_platform == 'win32'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/3d/fa/656b739db8587d7b5dfa22e22ed02566950fbfbcdc20311993483657a5c0/click-8.3.1.tar.gz", hash = "sha256:12ff4785d337a1bb490bb7e9c2b1ee5da3112e94a8622f26a6c77f5d2fc6842a", size = 295065, upload-time = "2025-11-15T20:45:42.706Z" } +sdist = { url = "https://files.pythonhosted.org/packages/b9/2e/0090cbf739cee7d23781ad4b89a9894a41538e4fcf4c31dcdd705b78eb8b/click-8.1.8.tar.gz", hash = "sha256:ed53c9d8990d83c2a27deae68e4ee337473f6330c040a31d4225c9574d16096a", size = 226593, upload-time = "2024-12-21T18:38:44.339Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/98/78/01c019cdb5d6498122777c1a43056ebb3ebfeef2076d9d026bfe15583b2b/click-8.3.1-py3-none-any.whl", hash = "sha256:981153a64e25f12d547d3426c367a4857371575ee7ad18df2a6183ab0545b2a6", size = 108274, upload-time = "2025-11-15T20:45:41.139Z" }, + { url = "https://files.pythonhosted.org/packages/7e/d4/7ebdbd03970677812aac39c869717059dbb71a4cfc033ca6e5221787892c/click-8.1.8-py3-none-any.whl", hash = "sha256:63c132bbbed01578a06712a2d1f497bb62d9c1c0d329b7903a866228027263b2", size 
= 98188, upload-time = "2024-12-21T18:38:41.666Z" }, ] [[package]] @@ -704,6 +759,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e8/cb/2da4cc83f5edb9c3257d09e1e7ab7b23f049c7962cae8d842bbef0a9cec9/cryptography-46.0.3-cp38-abi3-win_arm64.whl", hash = "sha256:d89c3468de4cdc4f08a57e214384d0471911a3830fcdaf7a8cc587e42a866372", size = 2918740, upload-time = "2025-10-15T23:18:12.277Z" }, ] +[[package]] +name = "curies" +version = "0.12.7" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pydantic" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ef/fc/8f73cbde9b2034e4b4f8524b4c5b7ce2a68d052ede8a486c0bc806c1f54d/curies-0.12.7.tar.gz", hash = "sha256:b51f422f6f8b93b35b583195222563327a00629d0ef8e889dc14606e31950e4f", size = 283292, upload-time = "2025-12-22T15:48:33.554Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c6/65/c6118987bc902a1a5941d2028c49d91c2db55d5bec148b46d155a125543b/curies-0.12.7-py3-none-any.whl", hash = "sha256:9038d6afd6311328b072db51488af1ce162cb26c1a3cc497d2d00871ddb824a9", size = 70042, upload-time = "2025-12-22T15:48:32.508Z" }, +] + [[package]] name = "dask" version = "2026.1.1" @@ -806,6 +874,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/3b/d8/265a93d22ae79262cdff701496a6f5676926a342153f3855ae6060430660/delta_spark-4.0.0-py3-none-any.whl", hash = "sha256:4e4ded07bb9ee4f6a0df45606d84395239d4b82001e765a627fecc1e914f3029", size = 39756, upload-time = "2025-06-06T01:41:44.815Z" }, ] +[[package]] +name = "deprecated" +version = "1.3.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "wrapt" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/49/85/12f0a49a7c4ffb70572b6c2ef13c90c88fd190debda93b23f026b25f9634/deprecated-1.3.1.tar.gz", hash = "sha256:b1b50e0ff0c1fddaa5708a2c6b0a6588bb09b892825ab2b214ac9ea9d92a5223", size = 2932523, upload-time = "2025-10-30T08:19:02.757Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/84/d0/205d54408c08b13550c733c4b85429e7ead111c7f0014309637425520a9a/deprecated-1.3.1-py2.py3-none-any.whl", hash = "sha256:597bfef186b6f60181535a29fbe44865ce137a5079f295b479886c82729d5f3f", size = 11298, upload-time = "2025-10-30T08:19:00.758Z" }, +] + [[package]] name = "distributed" version = "2026.1.1" @@ -1010,6 +1090,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c4/ab/09169d5a4612a5f92490806649ac8d41e3ec9129c636754575b3553f4ea4/googleapis_common_protos-1.72.0-py3-none-any.whl", hash = "sha256:4299c5a82d5ae1a9702ada957347726b167f9f8d1fc352477702a1e851ff4038", size = 297515, upload-time = "2025-11-06T18:29:13.14Z" }, ] +[[package]] +name = "graphviz" +version = "0.21" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f8/b3/3ac91e9be6b761a4b30d66ff165e54439dcd48b83f4e20d644867215f6ca/graphviz-0.21.tar.gz", hash = "sha256:20743e7183be82aaaa8ad6c93f8893c923bd6658a04c32ee115edb3c8a835f78", size = 200434, upload-time = "2025-06-15T09:35:05.824Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/91/4c/e0ce1ef95d4000ebc1c11801f9b944fa5910ecc15b5e351865763d8657f8/graphviz-0.21-py3-none-any.whl", hash = "sha256:54f33de9f4f911d7e84e4191749cac8cc5653f815b06738c54db9a15ab8b1e42", size = 47300, upload-time = "2025-06-15T09:35:04.433Z" }, +] + [[package]] name = "greenlet" version = "3.3.0" @@ -1095,6 +1184,15 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515, upload-time = "2025-04-24T03:35:24.344Z" }, ] +[[package]] +name = "hbreader" +version = "0.9.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/54/66/3a649ce125e03d1d43727a8b833cd211f0b9fe54a7e5be326f50d6f1d951/hbreader-0.9.1.tar.gz", hash = "sha256:d2c132f8ba6276d794c66224c3297cec25c8079d0a4cf019c061611e0a3b94fa", size = 19016, upload-time = "2021-02-25T19:22:32.799Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7b/24/61844afbf38acf419e01ca2639f7bd079584523d34471acbc4152ee991c5/hbreader-0.9.1-py3-none-any.whl", hash = "sha256:9a6e76c9d1afc1b977374a5dc430a1ebb0ea0488205546d4678d6e31cc5f6801", size = 7595, upload-time = "2021-02-25T19:22:31.944Z" }, +] + [[package]] name = "hf-xet" version = "1.2.0" @@ -1192,7 +1290,7 @@ dependencies = [ ] sdist = { url = "https://files.pythonhosted.org/packages/6f/fa/a1a94c55637f2b7cfeb05263ac3881aa87c82df92d8b4b31c909079f4419/huggingface_hub-1.1.7.tar.gz", hash = "sha256:3c84b6283caca928595f08fd42e9a572f17ec3501dec508c3f2939d94bfbd9d2", size = 607537, upload-time = "2025-12-01T11:05:28.137Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/dd/4f/82e5ab009089a2c48472bf4248391fe4091cf0b9c3e951dbb8afe3b23d76/huggingface_hub-1.1.7-py3-none-any.whl", hash = "sha256:f3efa4779f4890e44c957bbbb0f197e6028887ad09f0cf95a21659fa7753605d", size = 516239, upload-time = "2025-12-01T11:05:25.981Z" }, + { url = "https://files.pythonhosted.org/packages/33/3f/969137c9d9428ed8bf171d27604243dd950a47cac82414826e2aebbc0a4c/huggingface_hub-1.1.4-py3-none-any.whl", hash = "sha256:867799fbd2ef338b7f8b03d038d9c0e09415dfe45bb2893b48a510d1d746daa5", size = 515580, upload-time = "2025-11-13T10:51:55.742Z" }, ] [[package]] @@ -1216,6 +1314,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/0e/61/66938bbb5fc52dbdf84594873d5b51fb1f7c7794e9c0f5bd885f30bc507b/idna-3.11-py3-none-any.whl", hash = "sha256:771a87f49d9defaf64091e6e6fe9c18d4833f140bd19464795bc32d966ca37ea", size = 71008, upload-time = "2025-10-12T14:55:18.883Z" }, ] +[[package]] +name = "imagesize" +version = "1.4.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a7/84/62473fb57d61e31fef6e36d64a179c8781605429fd927b5dd608c997be31/imagesize-1.4.1.tar.gz", hash = "sha256:69150444affb9cb0d5cc5a92b3676f0b2fb7cd9ae39e947a5e11a36b4497cd4a", size = 1280026, upload-time = "2022-07-01T12:21:05.687Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ff/62/85c4c919272577931d407be5ba5d71c20f0b616d31a0befe0ae45bb79abd/imagesize-1.4.1-py2.py3-none-any.whl", hash = "sha256:0d8d18d08f840c19d0ee7ca1fd82490fdc3729b7ac93f49870406ddde8ef8d8b", size = 8769, upload-time = "2022-07-01T12:21:02.467Z" }, +] + [[package]] name = "importlib-metadata" version = "8.7.1" @@ -1310,6 +1417,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/56/6d/0d9848617b9f753b87f214f1c682592f7ca42de085f564352f10f0843026/ipywidgets-8.1.8-py3-none-any.whl", hash = "sha256:ecaca67aed704a338f88f67b1181b58f821ab5dc89c1f0f5ef99db43c1c2921e", size = 139808, upload-time = "2025-11-01T21:18:10.956Z" }, ] +[[package]] +name = "isodate" +version = "0.7.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/54/4d/e940025e2ce31a8ce1202635910747e5a87cc3a6a6bb2d00973375014749/isodate-0.7.2.tar.gz", hash = "sha256:4cd1aa0f43ca76f4a6c6c0292a85f40b35ec2e43e315b59f06e6d32171a953e6", size = 29705, upload-time = "2024-10-08T23:04:11.5Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/15/aa/0aca39a37d3c7eb941ba736ede56d689e7be91cab5d9ca846bde3999eba6/isodate-0.7.2-py3-none-any.whl", hash = "sha256:28009937d8031054830160fce6d409ed342816b543597cece116d966c6d99e15", size = 22320, upload-time = "2024-10-08T23:04:09.501Z" }, +] + [[package]] name = "isoduration" version = "20.11.0" @@ -1420,6 +1536,32 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/31/b4/b9b800c45527aadd64d5b442f9b932b00648617eb5d63d2c7a6587b7cafc/jmespath-1.0.1-py3-none-any.whl", hash = "sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980", size = 20256, upload-time = "2022-06-17T18:00:10.251Z" }, ] +[[package]] +name = "jsbeautifier" +version = "1.15.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "editorconfig" }, + { name = "six" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ea/98/d6cadf4d5a1c03b2136837a435682418c29fdeb66be137128544cecc5b7a/jsbeautifier-1.15.4.tar.gz", hash = "sha256:5bb18d9efb9331d825735fbc5360ee8f1aac5e52780042803943aa7f854f7592", size = 75257, upload-time = "2025-02-27T17:53:53.252Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2d/14/1c65fccf8413d5f5c6e8425f84675169654395098000d8bddc4e9d3390e1/jsbeautifier-1.15.4-py3-none-any.whl", hash = "sha256:72f65de312a3f10900d7685557f84cb61a9733c50dcc27271a39f5b0051bf528", size = 94707, upload-time = "2025-02-27T17:53:46.152Z" }, +] + +[[package]] +name = "json-flattener" +version = "0.1.9" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "click" }, + { name = "pyyaml" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/6d/77/b00e46d904818826275661a690532d3a3a43a4ded0264b2d7fcdb5c0feea/json_flattener-0.1.9.tar.gz", hash = "sha256:84cf8523045ffb124301a602602201665fcb003a171ece87e6f46ed02f7f0c15", size = 11479, upload-time = "2022-02-26T01:36:04.545Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/00/cc/7fbd75d3362e939eb98bcf9bd22f3f7df8c237a85148899ed3d38e5614e5/json_flattener-0.1.9-py3-none-any.whl", hash = "sha256:6b027746f08bf37a75270f30c6690c7149d5f704d8af1740c346a3a1236bc941", size = 10799, upload-time = "2022-02-26T01:36:03.06Z" }, +] + [[package]] name = "json5" version = "0.13.0" @@ -1478,6 +1620,16 @@ wheels = [ ] [package.optional-dependencies] +format = [ + { name = "fqdn" }, + { name = "idna" }, + { name = "isoduration" }, + { name = "jsonpointer" }, + { name = "rfc3339-validator" }, + { name = "rfc3987" }, + { name = "uri-template" }, + { name = "webcolors" }, +] format-nongpl = [ { name = "fqdn" }, { name = "idna" }, @@ -2533,6 +2685,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/20/12/38679034af332785aac8774540895e234f4d07f7545804097de4b666afd8/packaging-25.0-py3-none-any.whl", hash = "sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484", size = 66469, upload-time = "2025-04-19T11:48:57.875Z" }, ] +[[package]] +name = "paginate" +version = "0.5.7" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ec/46/68dde5b6bc00c1296ec6466ab27dddede6aec9af1b99090e1107091b3b84/paginate-0.5.7.tar.gz", hash = 
"sha256:22bd083ab41e1a8b4f3690544afb2c60c25e5c9a63a30fa2f483f6c60c8e5945", size = 19252, upload-time = "2024-08-25T14:17:24.139Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/90/96/04b8e52da071d28f5e21a805b19cb9390aa17a47462ac87f5e2696b9566d/paginate-0.5.7-py2.py3-none-any.whl", hash = "sha256:b885e2af73abcf01d9559fd5216b57ef722f8c42affbb63942377668e35c7591", size = 13746, upload-time = "2024-08-25T14:17:22.55Z" }, +] + [[package]] name = "pandas" version = "2.3.3" @@ -2582,6 +2743,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ef/af/4fbc8cab944db5d21b7e2a5b8e9211a03a79852b1157e2c102fcc61ac440/pandocfilters-1.5.1-py2.py3-none-any.whl", hash = "sha256:93be382804a9cdb0a7267585f157e5d1731bbe5545a85b268d6f5fe6232de2bc", size = 8663, upload-time = "2024-01-18T20:08:11.28Z" }, ] +[[package]] +name = "parse" +version = "1.20.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/4f/78/d9b09ba24bb36ef8b83b71be547e118d46214735b6dfb39e4bfde0e9b9dd/parse-1.20.2.tar.gz", hash = "sha256:b41d604d16503c79d81af5165155c0b20f6c8d6c559efa66b4b695c3e5a0a0ce", size = 29391, upload-time = "2024-06-11T04:41:57.34Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d0/31/ba45bf0b2aa7898d81cbbfac0e88c267befb59ad91a19e36e1bc5578ddb1/parse-1.20.2-py2.py3-none-any.whl", hash = "sha256:967095588cb802add9177d0c0b6133b5ba33b1ea9007ca800e526f42a85af558", size = 20126, upload-time = "2024-06-11T04:41:55.057Z" }, +] + [[package]] name = "parso" version = "0.8.5" @@ -2604,6 +2774,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/71/e7/40fb618334dcdf7c5a316c0e7343c5cd82d3d866edc100d98e29bc945ecd/partd-1.4.2-py3-none-any.whl", hash = "sha256:978e4ac767ec4ba5b86c6eaa52e5a2a3bc748a2ca839e8cc798f1cc6ce6efb0f", size = 18905, upload-time = "2024-05-06T19:51:39.271Z" }, ] +[[package]] +name = "pathspec" +version = "0.12.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ca/bc/f35b8446f4531a7cb215605d100cd88b7ac6f44ab3fc94870c120ab3adbf/pathspec-0.12.1.tar.gz", hash = "sha256:a482d51503a1ab33b1c67a6c3813a26953dbdc71c31dacaef9a838c4e29f5712", size = 51043, upload-time = "2023-12-10T22:30:45Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/cc/20/ff623b09d963f88bfde16306a54e12ee5ea43e9b597108672ff3a408aad6/pathspec-0.12.1-py3-none-any.whl", hash = "sha256:a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08", size = 31191, upload-time = "2023-12-10T22:30:43.14Z" }, +] + [[package]] name = "pexpect" version = "4.9.0" @@ -2652,6 +2831,34 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a3/58/35da89ee790598a0700ea49b2a66594140f44dec458c07e8e3d4979137fc/ply-3.11-py2.py3-none-any.whl", hash = "sha256:096f9b8350b65ebd2fd1346b12452efe5b9607f7482813ffca50c22722a807ce", size = 49567, upload-time = "2018-02-15T19:01:27.172Z" }, ] +[[package]] +name = "prefixcommons" +version = "0.1.12" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "click" }, + { name = "pytest-logging" }, + { name = "pyyaml" }, + { name = "requests" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/7d/b5/c5b63a4bf5dedb36567181fdb98dbcc7aaa025faebabaaffa2f5eb4b8feb/prefixcommons-0.1.12.tar.gz", hash = "sha256:22c4e2d37b63487b3ab48f0495b70f14564cb346a15220f23919eb0c1851f69f", size = 24063, upload-time = "2022-07-19T00:06:12.478Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/31/e8/715b09df3dab02b07809d812042dc47a46236b5603d9d3a2572dbd1d8a97/prefixcommons-0.1.12-py3-none-any.whl", hash = "sha256:16dbc0a1f775e003c724f19a694fcfa3174608f5c8b0e893d494cf8098ac7f8b", size = 29482, upload-time = "2022-07-19T00:06:08.709Z" }, +] + +[[package]] +name = "prefixmaps" +version = "0.2.6" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "curies" }, + { name = "pyyaml" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/4d/cf/f588bcdfd2c841839b9d59ce219a46695da56aa2805faff937bbafb9ee2b/prefixmaps-0.2.6.tar.gz", hash = "sha256:7421e1244eea610217fa1ba96c9aebd64e8162a930dc0626207cd8bf62ecf4b9", size = 709899, upload-time = "2024-10-17T16:30:57.738Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/89/b2/2b2153173f2819e3d7d1949918612981bc6bd895b75ffa392d63d115f327/prefixmaps-0.2.6-py3-none-any.whl", hash = "sha256:f6cef28a7320fc6337cf411be212948ce570333a0ce958940ef684c7fb192a62", size = 754732, upload-time = "2024-10-17T16:30:55.731Z" }, +] + [[package]] name = "prometheus-client" version = "0.24.1" @@ -2978,6 +3185,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c7/21/705964c7812476f378728bdf590ca4b771ec72385c533964653c68e86bdc/pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b", size = 1225217, upload-time = "2025-06-21T13:39:07.939Z" }, ] +[[package]] +name = "pyjsg" +version = "0.11.10" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "antlr4-python3-runtime" }, + { name = "jsonasobj" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/90/61/e001a4b679a171f84783deb8e215a91c9f614cb498807e24e4f73ea4e5ed/PyJSG-0.11.10.tar.gz", hash = "sha256:4bd6e3ff2833fa2b395bbe803a2d72a5f0bab5b7285bccd0da1a1bc0aee88bfa", size = 130742, upload-time = "2022-04-14T17:18:24.511Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/96/ee/370c3b1908327dac967841ff723db391a02f3637c95c6898160e5ffe1060/PyJSG-0.11.10-py3-none-any.whl", hash = "sha256:10af60ff42219be7e85bf7f11c19b648715b0b29eb2ddbd269e87069a7c3f26d", size = 80763, upload-time = "2022-04-14T17:18:23.169Z" }, +] + [[package]] name = "pyjwt" version = "2.10.1" @@ -2992,6 +3212,65 @@ crypto = [ { name = "cryptography" }, ] +[[package]] +name = "pymdown-extensions" +version = "10.19.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "markdown" }, + { name = "pyyaml" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/72/2d/9f30cee56d4d6d222430d401e85b0a6a1ae229819362f5786943d1a8c03b/pymdown_extensions-10.19.1.tar.gz", hash = "sha256:4969c691009a389fb1f9712dd8e7bd70dcc418d15a0faf70acb5117d022f7de8", size = 847839, upload-time = "2025-12-14T17:25:24.42Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fb/35/b763e8fbcd51968329b9adc52d188fc97859f85f2ee15fe9f379987d99c5/pymdown_extensions-10.19.1-py3-none-any.whl", hash = "sha256:e8698a66055b1dc0dca2a7f2c9d0ea6f5faa7834a9c432e3535ab96c0c4e509b", size = 266693, upload-time = "2025-12-14T17:25:22.999Z" }, +] + +[[package]] +name = "pyparsing" +version = "3.3.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/33/c1/1d9de9aeaa1b89b0186e5fe23294ff6517fce1bc69149185577cd31016b2/pyparsing-3.3.1.tar.gz", hash = "sha256:47fad0f17ac1e2cad3de3b458570fbc9b03560aa029ed5e16ee5554da9a2251c", size = 1550512, upload-time = "2025-12-23T03:14:04.391Z" } +wheels = 
[ + { url = "https://files.pythonhosted.org/packages/8b/40/2614036cdd416452f5bf98ec037f38a1afb17f327cb8e6b652d4729e0af8/pyparsing-3.3.1-py3-none-any.whl", hash = "sha256:023b5e7e5520ad96642e2c6db4cb683d3970bd640cdf7115049a6e9c3682df82", size = 121793, upload-time = "2025-12-23T03:14:02.103Z" }, +] + +[[package]] +name = "pyshex" +version = "0.8.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cfgraph" }, + { name = "chardet" }, + { name = "pyshexc" }, + { name = "rdflib-shim" }, + { name = "requests" }, + { name = "shexjsg" }, + { name = "sparqlslurper" }, + { name = "sparqlwrapper" }, + { name = "urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/97/d7/420ce2df4e8688e06fa8e1fc353fdf3875eb70f6fc2e17493d0526d778ff/PyShEx-0.8.1.tar.gz", hash = "sha256:3c5c4d45fe27faaadae803cb008c41acf8ee784da7868b04fd84967e75be70d0", size = 475611, upload-time = "2022-04-14T21:14:58.769Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9b/48/efb1b1d3f3aee8cfc9f256738ca6e79ec362edbfc3a3abecbaf84db04643/PyShEx-0.8.1-py3-none-any.whl", hash = "sha256:6da1b10123e191abf8dcb6bf3e54aa3e1fcf771df5d1a0ed453217c8900c8e6a", size = 51861, upload-time = "2022-04-14T21:14:57.254Z" }, +] + +[[package]] +name = "pyshexc" +version = "0.9.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "antlr4-python3-runtime" }, + { name = "chardet" }, + { name = "jsonasobj" }, + { name = "pyjsg" }, + { name = "rdflib-shim" }, + { name = "shexjsg" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/2a/31/95c590e8ed6e8cff141b6dd2a3de93b540f9dc3fba54621a20fd1cdb11e4/PyShExC-0.9.1.tar.gz", hash = "sha256:35a9975d4b9afeb20ef710fb6680871756381d0c39fbb5470b3b506581a304d3", size = 96070, upload-time = "2022-04-14T18:51:45.979Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/39/7d/ff5000e0882f2b3995bef20b667945d3faa9289b556295e4cc5d2e91f104/PyShExC-0.9.1-py2.py3-none-any.whl", hash = "sha256:efc55ed5cb2453e9df569b03e282505e96bb06597934288f3b23dd980ef10028", size = 69792, upload-time = "2022-04-14T18:51:44.148Z" }, +] + [[package]] name = "pyspark" version = "4.0.1" @@ -3065,6 +3344,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/27/98/822b924a4a3eb58aacba84444c7439fce32680592f394de26af9c76e2569/pytest_env-1.2.0-py3-none-any.whl", hash = "sha256:d7e5b7198f9b83c795377c09feefa45d56083834e60d04767efd64819fc9da00", size = 6251, upload-time = "2025-10-09T19:15:46.077Z" }, ] +[[package]] +name = "pytest-logging" +version = "2015.11.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pytest" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/dc/1e/fb11174c9eaebcec27d36e9e994b90ffa168bc3226925900b9dbbf16c9da/pytest-logging-2015.11.4.tar.gz", hash = "sha256:cec5c85ecf18aab7b2ead5498a31b9f758680ef5a902b9054ab3f2bdbb77c896", size = 3916, upload-time = "2015-11-04T12:15:54.122Z" } + [[package]] name = "python-dateutil" version = "2.9.0.post0" @@ -3241,6 +3529,43 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/81/d6/4bfbb40c9a0b42fc53c7cf442f6385db70b40f74a783130c5d0a5aa62228/pyzmq-27.1.0-cp314-cp314t-win_arm64.whl", hash = "sha256:dc5dbf68a7857b59473f7df42650c621d7e8923fb03fa74a526890f4d33cc4d7", size = 575170, upload-time = "2025-09-08T23:09:01.418Z" }, ] +[[package]] +name = "rdflib" +version = "7.5.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pyparsing" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/ec/1b/4cd9a29841951371304828d13282e27a5f25993702c7c87dcb7e0604bd25/rdflib-7.5.0.tar.gz", hash = "sha256:663083443908b1830e567350d72e74d9948b310f827966358d76eebdc92bf592", size = 4903859, upload-time = "2025-11-28T05:51:54.562Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b9/20/35d2baebacf357b562bd081936b66cd845775442973cb033a377fd639a84/rdflib-7.5.0-py3-none-any.whl", hash = "sha256:b011dfc40d0fc8a44252e906dcd8fc806a7859bc231be190c37e9568a31ac572", size = 587215, upload-time = "2025-11-28T05:51:38.178Z" }, +] + +[[package]] +name = "rdflib-jsonld" +version = "0.6.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "rdflib" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/5a/48/9eaecac5f5ba6b31dd932fbbe67206afcbd24a7a696c03c6c920ac7ddc39/rdflib-jsonld-0.6.1.tar.gz", hash = "sha256:eda5a42a2e09f80d4da78e32b5c684bccdf275368f1541e6b7bcddfb1382a0e0", size = 130465, upload-time = "2021-09-14T12:22:20.082Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d0/d2/760527679057a7dad67f4e41f3e0c463b247f0bdbffc594e0add7c9077d6/rdflib_jsonld-0.6.1-py2.py3-none-any.whl", hash = "sha256:bcf84317e947a661bae0a3f2aee1eced697075fc4ac4db6065a3340ea0f10fc2", size = 16381, upload-time = "2021-09-14T12:22:17.805Z" }, +] + +[[package]] +name = "rdflib-shim" +version = "1.0.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "rdflib" }, + { name = "rdflib-jsonld" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/1b/c8/1014ec6b5f4428c630deffba1f9851043ae378eb1d6ef52a03bd492cea99/rdflib_shim-1.0.3.tar.gz", hash = "sha256:d955d11e2986aab42b6830ca56ac6bc9c893abd1d049a161c6de2f1b99d4fc0d", size = 7783, upload-time = "2021-12-21T16:31:06.945Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5f/97/d8a785d2c7131c731c90cb0e65af9400081af4380bea4ec04868dc21aa92/rdflib_shim-1.0.3-py3-none-any.whl", hash = "sha256:7a853e7750ef1e9bf4e35dea27d54e02d4ed087de5a9e0c329c4a6d82d647081", size = 5190, upload-time = "2021-12-21T16:31:05.719Z" }, +] + [[package]] name = "referencing" version = "0.37.0" @@ -3374,6 +3699,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/9e/51/17023c0f8f1869d8806b979a2bffa3f861f26a3f1a66b094288323fba52f/rfc3986_validator-0.1.1-py2.py3-none-any.whl", hash = "sha256:2f235c432ef459970b4306369336b9d5dbdda31b510ca1e327636e01f528bfa9", size = 4242, upload-time = "2019-10-28T16:00:13.976Z" }, ] +[[package]] +name = "rfc3987" +version = "1.3.8" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/14/bb/f1395c4b62f251a1cb503ff884500ebd248eed593f41b469f89caa3547bd/rfc3987-1.3.8.tar.gz", hash = "sha256:d3c4d257a560d544e9826b38bc81db676890c79ab9d7ac92b39c7a253d5ca733", size = 20700, upload-time = "2018-07-29T17:23:47.954Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/65/d4/f7407c3d15d5ac779c3dd34fbbc6ea2090f77bd7dd12f207ccf881551208/rfc3987-1.3.8-py2.py3-none-any.whl", hash = "sha256:10702b1e51e5658843460b189b185c0366d2cf4cff716f13111b0ea9fd2dce53", size = 13377, upload-time = "2018-07-29T17:23:45.313Z" }, +] + [[package]] name = "rfc3987-syntax" version = "1.1.0" @@ -3578,6 +3912,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e0/f9/0595336914c5619e5f28a1fb793285925a8cd4b432c9da0a987836c7f822/shellingham-1.5.4-py2.py3-none-any.whl", hash = "sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686", size = 9755, 
upload-time = "2023-10-24T04:13:38.866Z" }, ] +[[package]] +name = "shexjsg" +version = "0.8.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pyjsg" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/30/c9/34224e3c8fd9d466535626e3c2f6e01f6adae3e82acaed353d42add509ec/ShExJSG-0.8.2.tar.gz", hash = "sha256:f17a629fc577fa344382bdee143cd9ff86588537f9f811f66cea6f63cdbcd0b6", size = 33550, upload-time = "2022-04-14T20:23:13.75Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/18/6e/d23bcde21d4ef0250a74e7505d2990d429f862be65810a3b650a69def7f0/ShExJSG-0.8.2-py2.py3-none-any.whl", hash = "sha256:3b0d8432dd313bee9e1343382c5e02e9908dd941a7dd7342bf8c0200fe523766", size = 14381, upload-time = "2022-04-14T20:23:12.515Z" }, +] + [[package]] name = "sidecar" version = "0.8.0" @@ -3608,6 +3954,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" }, ] +[[package]] +name = "snowballstemmer" +version = "3.0.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/75/a7/9810d872919697c9d01295633f5d574fb416d47e535f258272ca1f01f447/snowballstemmer-3.0.1.tar.gz", hash = "sha256:6d5eeeec8e9f84d4d56b847692bacf79bc2c8e90c7f80ca4444ff8b6f2e52895", size = 105575, upload-time = "2025-05-09T16:34:51.843Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c8/78/3565d011c61f5a43488987ee32b6f3f656e7f107ac2782dd57bdd7d91d9a/snowballstemmer-3.0.1-py3-none-any.whl", hash = "sha256:6cd7b3897da8d6c9ffb968a6781fa6532dce9c3618a4b127d920dab764a19064", size = 103274, upload-time = "2025-05-09T16:34:50.371Z" }, +] + [[package]] name = "sortedcontainers" version = "2.4.0" @@ -3977,6 +4332,27 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/3d/d8/2083a1daa7439a66f3a48589a57d576aa117726762618f6bb09fe3798796/uvicorn-0.40.0-py3-none-any.whl", hash = "sha256:c6c8f55bc8bf13eb6fa9ff87ad62308bbbc33d0b67f84293151efe87e0d5f2ee", size = 68502, upload-time = "2025-12-21T14:16:21.041Z" }, ] +[[package]] +name = "watchdog" +version = "6.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/db/7d/7f3d619e951c88ed75c6037b246ddcf2d322812ee8ea189be89511721d54/watchdog-6.0.0.tar.gz", hash = "sha256:9ddf7c82fda3ae8e24decda1338ede66e1c99883db93711d8fb941eaa2d8c282", size = 131220, upload-time = "2024-11-01T14:07:13.037Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/68/98/b0345cabdce2041a01293ba483333582891a3bd5769b08eceb0d406056ef/watchdog-6.0.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:490ab2ef84f11129844c23fb14ecf30ef3d8a6abafd3754a6f75ca1e6654136c", size = 96480, upload-time = "2024-11-01T14:06:42.952Z" }, + { url = "https://files.pythonhosted.org/packages/85/83/cdf13902c626b28eedef7ec4f10745c52aad8a8fe7eb04ed7b1f111ca20e/watchdog-6.0.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:76aae96b00ae814b181bb25b1b98076d5fc84e8a53cd8885a318b42b6d3a5134", size = 88451, upload-time = "2024-11-01T14:06:45.084Z" }, + { url = "https://files.pythonhosted.org/packages/fe/c4/225c87bae08c8b9ec99030cd48ae9c4eca050a59bf5c2255853e18c87b50/watchdog-6.0.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a175f755fc2279e0b7312c0035d52e27211a5bc39719dd529625b1930917345b", size = 
89057, upload-time = "2024-11-01T14:06:47.324Z" }, + { url = "https://files.pythonhosted.org/packages/a9/c7/ca4bf3e518cb57a686b2feb4f55a1892fd9a3dd13f470fca14e00f80ea36/watchdog-6.0.0-py3-none-manylinux2014_aarch64.whl", hash = "sha256:7607498efa04a3542ae3e05e64da8202e58159aa1fa4acddf7678d34a35d4f13", size = 79079, upload-time = "2024-11-01T14:06:59.472Z" }, + { url = "https://files.pythonhosted.org/packages/5c/51/d46dc9332f9a647593c947b4b88e2381c8dfc0942d15b8edc0310fa4abb1/watchdog-6.0.0-py3-none-manylinux2014_armv7l.whl", hash = "sha256:9041567ee8953024c83343288ccc458fd0a2d811d6a0fd68c4c22609e3490379", size = 79078, upload-time = "2024-11-01T14:07:01.431Z" }, + { url = "https://files.pythonhosted.org/packages/d4/57/04edbf5e169cd318d5f07b4766fee38e825d64b6913ca157ca32d1a42267/watchdog-6.0.0-py3-none-manylinux2014_i686.whl", hash = "sha256:82dc3e3143c7e38ec49d61af98d6558288c415eac98486a5c581726e0737c00e", size = 79076, upload-time = "2024-11-01T14:07:02.568Z" }, + { url = "https://files.pythonhosted.org/packages/ab/cc/da8422b300e13cb187d2203f20b9253e91058aaf7db65b74142013478e66/watchdog-6.0.0-py3-none-manylinux2014_ppc64.whl", hash = "sha256:212ac9b8bf1161dc91bd09c048048a95ca3a4c4f5e5d4a7d1b1a7d5752a7f96f", size = 79077, upload-time = "2024-11-01T14:07:03.893Z" }, + { url = "https://files.pythonhosted.org/packages/2c/3b/b8964e04ae1a025c44ba8e4291f86e97fac443bca31de8bd98d3263d2fcf/watchdog-6.0.0-py3-none-manylinux2014_ppc64le.whl", hash = "sha256:e3df4cbb9a450c6d49318f6d14f4bbc80d763fa587ba46ec86f99f9e6876bb26", size = 79078, upload-time = "2024-11-01T14:07:05.189Z" }, + { url = "https://files.pythonhosted.org/packages/62/ae/a696eb424bedff7407801c257d4b1afda455fe40821a2be430e173660e81/watchdog-6.0.0-py3-none-manylinux2014_s390x.whl", hash = "sha256:2cce7cfc2008eb51feb6aab51251fd79b85d9894e98ba847408f662b3395ca3c", size = 79077, upload-time = "2024-11-01T14:07:06.376Z" }, + { url = "https://files.pythonhosted.org/packages/b5/e8/dbf020b4d98251a9860752a094d09a65e1b436ad181faf929983f697048f/watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl", hash = "sha256:20ffe5b202af80ab4266dcd3e91aae72bf2da48c0d33bdb15c66658e685e94e2", size = 79078, upload-time = "2024-11-01T14:07:07.547Z" }, + { url = "https://files.pythonhosted.org/packages/07/f6/d0e5b343768e8bcb4cda79f0f2f55051bf26177ecd5651f84c07567461cf/watchdog-6.0.0-py3-none-win32.whl", hash = "sha256:07df1fdd701c5d4c8e55ef6cf55b8f0120fe1aef7ef39a1c6fc6bc2e606d517a", size = 79065, upload-time = "2024-11-01T14:07:09.525Z" }, + { url = "https://files.pythonhosted.org/packages/db/d9/c495884c6e548fce18a8f40568ff120bc3a4b7b99813081c8ac0c936fa64/watchdog-6.0.0-py3-none-win_amd64.whl", hash = "sha256:cbafb470cf848d93b5d013e2ecb245d4aa1c8fd0504e863ccefa32445359d680", size = 79070, upload-time = "2024-11-01T14:07:10.686Z" }, + { url = "https://files.pythonhosted.org/packages/33/e8/e40370e6d74ddba47f002a32919d91310d6074130fe4e17dabcafc15cbf1/watchdog-6.0.0-py3-none-win_ia64.whl", hash = "sha256:a1914259fa9e1454315171103c6a30961236f508b9b623eae470268bbcc6a22f", size = 79067, upload-time = "2024-11-01T14:07:11.845Z" }, +] + [[package]] name = "wcwidth" version = "0.2.14"
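
Reviewer note (not part of the diff): the parametrized cases in tests/parsers/test_annotation_parse.py pin down the input/output contract of the new extractor helpers. As a reading aid, here is a minimal sketch consistent with the test_load_identifiers cases; the function name and structure are taken from the tests, but the actual implementation added in src/cdm_data_loader_utils/parsers/annotation_parse.py (only partially shown in this diff) may differ in detail.

    # Illustrative sketch only -- reconstructed from the parametrized cases in
    # tests/parsers/test_annotation_parse.py; the real load_identifiers() may differ.
    def load_identifiers(data: dict) -> list[tuple]:
        """Extract (identifier, gene_id, name, source, relationship) rows per report."""
        rows = []
        for report in data.get("reports", []):
            ann = report.get("annotation", {})
            gene_id = ann.get("gene_id")
            if not gene_id:
                # Reports without a gene_id yield no identifier row (test case 3).
                continue
            rows.append(
                (
                    f"ncbigene:{gene_id}",    # CURIE-prefixed identifier
                    gene_id,                  # raw NCBI GeneID
                    ann.get("name"),          # gene/protein name, may be None
                    "RefSeq",                 # data source
                    ann.get("relationship"),  # e.g. "RefSeq gene symbol", else None
                )
            )
        return rows

The other load_* helpers exercised above (load_names, load_feature_records, load_contig_x_contig_collection, etc.) appear to follow the same pattern: iterate over "reports", guard on the required keys, and emit CURIE-prefixed tuples matching the CDM table columns.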