diff --git a/pyproject.toml b/pyproject.toml index 11f2ab8..414fab5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -149,6 +149,7 @@ ignore = [ "tests/**/*.py" = ["S101", "T201"] # use of assert "**/__init__.py" = ["D104"] + [tool.ruff.lint.mccabe] # Flag errors (`C901`) whenever the complexity level exceeds 15. max-complexity = 15 @@ -160,7 +161,7 @@ convention = "google" requires = ["uv_build>=0.9.9,<0.10.0"] build-backend = "uv_build" -[tool.pytest] +[tool.pytest.ini_options] pythonpath = ["src"] log_cli = true log_cli_level = "INFO" diff --git a/src/cdm_data_loader_utils/parsers/annotation_parse.py b/src/cdm_data_loader_utils/parsers/annotation_parse.py new file mode 100644 index 0000000..d2a06d5 --- /dev/null +++ b/src/cdm_data_loader_utils/parsers/annotation_parse.py @@ -0,0 +1,446 @@ +""" + +RefSeq annotation parser for transforming NCBI Datasets API JSON into CDM-formatted Delta Lake tables. + +Usage: +PYTHONPATH=src python src/cdm_data_loader_utils/parsers/annotation_parse.py \ + --accession GCF_000869125.1 \ + --namespace refseq_api \ + --query + +""" + +from __future__ import annotations +import argparse +import json +from pathlib import Path +from typing import Optional + +import requests +from pyspark.sql import SparkSession +from pyspark.sql.types import StructType +from delta import configure_spark_with_delta_pip + +from cdm_data_loader_utils.parsers.kbase_cdm_pyspark import schema as cdm_schemas + + +# --------------------------------------------------------------------- +# Accession-based annotation fetch +# --------------------------------------------------------------------- +def fetch_annotation_json(accession: str) -> dict: + """Fetch annotation JSON from NCBI Datasets API.""" + url = f"https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/{accession}/annotation_report" + resp = requests.get(url, headers={"Accept": "application/json"}, timeout=60) + resp.raise_for_status() + return resp.json() + + +# --------------------------------------------------------------------- +# Spark initialization with Delta support +# --------------------------------------------------------------------- +def build_spark_session(app_name: str = "RefSeqAnnotationToCDM") -> SparkSession: + builder = ( + SparkSession.builder.appName(app_name) + .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") + .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") + .enableHiveSupport() + ) + return configure_spark_with_delta_pip(builder).getOrCreate() + + +def init_spark_and_db(app_name: str, database: str) -> SparkSession: + spark = build_spark_session(app_name) + spark.sql(f"CREATE DATABASE IF NOT EXISTS {database}") + spark.sql(f"USE {database}") + return spark + + +# --------------------------------------------------------------------- +# CDM PREFIX NORMALIZATION +# --------------------------------------------------------------------- +def apply_prefix(identifier: str | None) -> str | None: + if not identifier: + return None + + if identifier.startswith("GeneID:"): + return identifier.replace("GeneID:", "ncbigene:") + + if identifier.startswith(("YP_", "XP_", "WP_", "NP_", "NC_")): + return f"refseq:{identifier}" + + if identifier.startswith("GCF_"): + return f"insdc.gcf:{identifier}" + + return identifier + + +# --------------------------------------------------------------------- +# Safe integer conversion +# --------------------------------------------------------------------- +def to_int(val: str) -> int | None: + try: + return 
int(val) + except Exception: + return None + + +# --------------------------------------------------------------------- +# For repeat section markers +# --------------------------------------------------------------------- +def unique_annotations(data: dict): + seen = set() + for report in data.get("reports", []): + ann = report.get("annotation", {}) + gene_id = ann.get("gene_id") + if gene_id and gene_id not in seen: + seen.add(gene_id) + yield gene_id, ann + + +# --------------------------------------------------------------------- +# IDENTIFIERS +# --------------------------------------------------------------------- +def load_identifiers(data: dict) -> list[tuple[str, str, str, str, str | None]]: + """Extract Identifier table records.""" + out = [] + + for gene_id, ann in unique_annotations(data): + entity_id = f"ncbigene:{gene_id}" + out.append((entity_id, gene_id, ann.get("name"), "RefSeq", ann.get("relationship"))) + return list({tuple(row) for row in out}) # deduplicate + + +# --------------------------------------------------------------------- +# NAME EXTRACTION +# --------------------------------------------------------------------- +def load_names(data: dict) -> list[tuple[str, str, str, str]]: + """Extract Name table records.""" + out = [] + + for gene_id, ann in unique_annotations(data): + entity_id = f"ncbigene:{gene_id}" + for label, desc in [ + ("symbol", "RefSeq gene symbol"), + ("name", "RefSeq gene name"), + ("locus_tag", "RefSeq locus tag"), + ]: + val = ann.get(label) + if val: + out.append((entity_id, val, desc, "RefSeq")) + return list({tuple(row) for row in out}) + + +# --------------------------------------------------------------------- +# FEATURE LOCATIONS +# --------------------------------------------------------------------- +def load_feature_records(data: dict) -> list[tuple]: + """Extract Feature table records.""" + features = [] + + for gene_id, ann in unique_annotations(data): + feature_id = f"ncbigene:{gene_id}" + for region in ann.get("genomic_regions", []): + for r in region.get("gene_range", {}).get("range", []): + strand = { + "plus": "positive", + "minus": "negative", + "unstranded": "unstranded", + }.get(r.get("orientation"), "unknown") + features.append(( + feature_id, + None, + None, + None, + to_int(r.get("end")), + None, + to_int(r.get("begin")), + strand, + "RefSeq", + None, + "gene", + )) + return list({tuple(row) for row in features}) + + +# --------------------------------------------------------------------- +# PARSE CONTIG_COLLECTION <-> FEATURE +# --------------------------------------------------------------------- +def load_contig_collection_x_feature(data: dict) -> list[tuple[str, str]]: + """Parse ContigCollection Feature links.""" + links = [] + + for gene_id, ann in unique_annotations(data): + regions = ann.get("genomic_regions", []) + + if not regions: + continue + + acc = regions[0].get("gene_range", {}).get("accession_version") + if acc: + links.append((apply_prefix(acc), f"ncbigene:{gene_id}")) + + return list(set(links)) + + +# --------------------------------------------------------------------- +# PARSE CONTIG_COLLECTION <-> PROTEIN +# --------------------------------------------------------------------- +def load_contig_collection_x_protein(data: dict) -> list[tuple[str, str]]: + links = [] + + for report in data.get("reports", []): + ann = report.get("annotation", {}) + assembly = ann.get("annotations", [{}])[0].get("assembly_accession") + if not assembly: + continue + + contig_id = apply_prefix(assembly) + for p in 
ann.get("proteins", []): + pid = p.get("accession_version") + if pid: + links.append((contig_id, apply_prefix(pid))) + + return list(set(links)) + + +# --------------------------------------------------------------------- +# PARSE FEATURE <-> PROTEIN +# --------------------------------------------------------------------- +def load_feature_x_protein(data: dict) -> list[tuple[str, str]]: + links = [] + + for gene_id, ann in unique_annotations(data): + feature_id = f"ncbigene:{gene_id}" + + for p in ann.get("proteins", []): + pid = p.get("accession_version") + if pid: + protein_id = apply_prefix(pid) + links.append((feature_id, protein_id)) + + return list(set(links)) + + +# --------------------------------------------------------------------- +# PARSE CONTIGS +# --------------------------------------------------------------------- +def load_contigs(data: dict) -> list[tuple[str, str | None, float | None, int | None]]: + contigs = {} + + for report in data.get("reports", []): + for region in report.get("annotation", {}).get("genomic_regions", []): + acc = region.get("gene_range", {}).get("accession_version") + if acc: + contig_id = apply_prefix(acc) + # Only track first occurrence of each contig + contigs.setdefault(contig_id, {"hash": None, "gc_content": None, "length": None}) + + return [(cid, meta["hash"], meta["gc_content"], meta["length"]) for cid, meta in contigs.items()] + + +# --------------------------------------------------------------------- +# PARSE CONTIG <-> CONTIG_COLLECTION +# --------------------------------------------------------------------- +def load_contig_x_contig_collection(data: dict) -> list[tuple[str, str]]: + links = [] + + for report in data.get("reports", []): + ann = report.get("annotation", {}) + regions = ann.get("genomic_regions", []) + annotations = ann.get("annotations", []) + + if not regions or not annotations: + continue + + contig = regions[0].get("gene_range", {}).get("accession_version") + assembly = annotations[0].get("assembly_accession") + + if contig and assembly: + links.append(( + f"refseq:{contig}", + apply_prefix(assembly), + )) + + return list(set(links)) + + +# --------------------------------------------------------------------- +# DELTA TABLE +# --------------------------------------------------------------------- +def write_to_table( + spark: SparkSession, + records: list[tuple], + table_name: str, + database: str = "default", +) -> None: + if records: + spark.createDataFrame(records, cdm_schemas[table_name]).write.format("delta").mode("overwrite").option( + "overwriteSchema", "true" + ).saveAsTable(f"{database}.{table_name}") + + +# --------------------------------------------------------------------- +# SQL PREVIEW +# --------------------------------------------------------------------- + +CDM_TABLES = [ + "Identifier", + "Name", + "Feature", + "ContigCollection_x_Feature", + "ContigCollection_x_Protein", + "Feature_x_Protein", + "Contig", + "Contig_x_ContigCollection", +] + + +def run_sql_query(spark: SparkSession, database: str = "default") -> None: + spark.sql(f"USE {database}") + for table in CDM_TABLES: + print(f"\n[SQL Preview] {table}") + spark.sql(f"SELECT * FROM {table} LIMIT 20").show(truncate=False) + + +def parse_annotation_data(spark: SparkSession, datasets: list[dict], namespace: str) -> None: + # ----------------------------------------- + # Parse and write CDM tables + # ----------------------------------------- + for data in datasets: + write_to_table( + spark, + load_identifiers(data), + "Identifier", + namespace, + 
) + + write_to_table( + spark, + load_names(data), + "Name", + namespace, + ) + + write_to_table( + spark, + load_feature_records(data), + "Feature", + namespace, + ) + + write_to_table( + spark, + load_contig_collection_x_feature(data), + "ContigCollection_x_Feature", + namespace, + ) + + write_to_table( + spark, + load_contig_collection_x_protein(data), + "ContigCollection_x_Protein", + namespace, + ) + + write_to_table( + spark, + load_feature_x_protein(data), + "Feature_x_Protein", + namespace, + ) + + write_to_table( + spark, + load_contigs(data), + "Contig", + namespace, + ) + + write_to_table( + spark, + load_contig_x_contig_collection(data), + "Contig_x_ContigCollection", + namespace, + ) + + +# --------------------------------------------------------------------- +# CLI ENTRY +# --------------------------------------------------------------------- +def main(): + parser = argparse.ArgumentParser(description="RefSeq Annotation Parser to CDM") + + # ------------------------- + # Input options + # ------------------------- + parser.add_argument("--accession", type=str, help="RefSeq genome accession (e.g. GCF_000869125.1)") + parser.add_argument("--input_file", type=str, help="Path to a RefSeq annotation JSON file.") + parser.add_argument("--input_dir", type=str, help="Directory containing RefSeq annotation JSON files.") + + # ------------------------- + # Output / runtime options + # ------------------------- + parser.add_argument( + "--namespace", + default="refseq_api", + help="Database to write Delta tables.", + ) + parser.add_argument( + "--tenant", + default=None, + help="Tenant SQL warehouse to use.", + ) + parser.add_argument( + "--query", + action="store_true", + help="Preview SQL output after writing.", + ) + + args = parser.parse_args() + + # ----------------------------------------- + # Input validation + # ----------------------------------------- + if not args.accession and not args.input_file and not args.input_dir: + raise ValueError("provide --accession, --input_file, or --input_dir.") + + # ----------------------------------------- + # Initialize Spark + # ----------------------------------------- + spark = init_spark_and_db("RefSeq Annotation Parser", args.namespace) + + if args.tenant: + spark.sql(f"USE CATALOG {args.tenant}") + + # ----------------------------------------- + # Load annotation data + # ----------------------------------------- + datasets: list[dict] = [] + + if args.accession: + # Fetch from NCBI Datasets API + data = fetch_annotation_json(args.accession) + datasets.append(data) + + if args.input_file: + with open(args.input_file) as f: + datasets.append(json.load(f)) + + if args.input_dir: + for path in Path(args.input_dir).rglob("*.json"): + with open(path) as f: + datasets.append(json.load(f)) + + parse_annotation_data(spark, datasets, args.namespace) + + # ----------------------------------------- + # SQL preview + # ----------------------------------------- + if args.query: + run_sql_query(spark, args.namespace) + + spark.stop() + + +if __name__ == "__main__": + main() diff --git a/src/cdm_data_loader_utils/parsers/gene_association_file.py b/src/cdm_data_loader_utils/parsers/gene_association_file.py index 548de56..cd81647 100644 --- a/src/cdm_data_loader_utils/parsers/gene_association_file.py +++ b/src/cdm_data_loader_utils/parsers/gene_association_file.py @@ -273,7 +273,7 @@ def run( if register: register_table(spark, output_path, table_name=table_name, permanent=permanent) - except Exception as e: + except Exception: 
logger.exception("Pipeline failed") sys.exit(1) finally: diff --git a/src/cdm_data_loader_utils/parsers/kbase_cdm_pyspark.py b/src/cdm_data_loader_utils/parsers/kbase_cdm_pyspark.py new file mode 100644 index 0000000..19be5e8 --- /dev/null +++ b/src/cdm_data_loader_utils/parsers/kbase_cdm_pyspark.py @@ -0,0 +1,610 @@ +"""Automated conversion of cdm_schema to PySpark.""" + +from pyspark.sql.types import BooleanType, DateType, FloatType, IntegerType, StringType, StructField, StructType + +schema = { + "Association": StructType( + [ + StructField("association_id", StringType(), nullable=False), + StructField("subject", StringType(), nullable=False), + StructField("object", StringType(), nullable=False), + StructField("predicate", StringType(), nullable=False), + StructField("negated", BooleanType(), nullable=True), + StructField("evidence_type", StringType(), nullable=True), + StructField("primary_knowledge_source", StringType(), nullable=True), + StructField("aggregator_knowledge_source", StringType(), nullable=True), + StructField("annotation_date", DateType(), nullable=True), + StructField("comments", StringType(), nullable=True), + ] + ), + "Association_x_SupportingObject": StructType( + [ + StructField("association_id", StringType(), nullable=False), + StructField("entity_id", StringType(), nullable=False), + ] + ), + "Cluster": StructType( + [ + StructField("cluster_id", StringType(), nullable=False), + StructField("description", StringType(), nullable=True), + StructField("name", StringType(), nullable=True), + StructField("entity_type", StringType(), nullable=False), + StructField("protocol_id", StringType(), nullable=True), + ] + ), + "ClusterMember": StructType( + [ + StructField("cluster_id", StringType(), nullable=False), + StructField("entity_id", StringType(), nullable=False), + StructField("is_representative", BooleanType(), nullable=True), + StructField("is_seed", BooleanType(), nullable=True), + StructField("score", FloatType(), nullable=True), + ] + ), + "Contig": StructType( + [ + StructField("contig_id", StringType(), nullable=False), + StructField("hash", StringType(), nullable=True), + StructField("gc_content", FloatType(), nullable=True), + StructField("length", IntegerType(), nullable=True), + ] + ), + "ContigCollection": StructType( + [ + StructField("contig_collection_id", StringType(), nullable=False), + StructField("hash", StringType(), nullable=True), + StructField("asm_score", FloatType(), nullable=True), + StructField("checkm_completeness", FloatType(), nullable=True), + StructField("checkm_contamination", FloatType(), nullable=True), + StructField("checkm_version", StringType(), nullable=True), + StructField("contig_bp", IntegerType(), nullable=True), + StructField("contig_collection_type", StringType(), nullable=True), + StructField("contig_l50", IntegerType(), nullable=True), + StructField("contig_l90", IntegerType(), nullable=True), + StructField("contig_n50", IntegerType(), nullable=True), + StructField("contig_n90", IntegerType(), nullable=True), + StructField("contig_logsum", FloatType(), nullable=True), + StructField("contig_max", IntegerType(), nullable=True), + StructField("contig_powersum", FloatType(), nullable=True), + StructField("gap_percent", FloatType(), nullable=True), + StructField("gc_average", FloatType(), nullable=True), + StructField("gc_std", FloatType(), nullable=True), + StructField("gtdb_taxon_id", StringType(), nullable=True), + StructField("n_chromosomes", IntegerType(), nullable=True), + StructField("n_contigs", IntegerType(), 
nullable=True), + StructField("n_scaffolds", IntegerType(), nullable=True), + StructField("ncbi_taxon_id", StringType(), nullable=True), + StructField("scaffold_l50", IntegerType(), nullable=True), + StructField("scaffold_l90", IntegerType(), nullable=True), + StructField("scaffold_n50", IntegerType(), nullable=True), + StructField("scaffold_n90", IntegerType(), nullable=True), + StructField("scaffold_bp", IntegerType(), nullable=True), + StructField("scaffold_logsum", FloatType(), nullable=True), + StructField("scaffold_maximum_length", IntegerType(), nullable=True), + StructField("scaffold_powersum", FloatType(), nullable=True), + StructField("scaffolds_n_over_50K", IntegerType(), nullable=True), + StructField("scaffolds_percent_over_50K", FloatType(), nullable=True), + StructField("scaffolds_total_length_over_50k", IntegerType(), nullable=True), + ] + ), + "ContigCollection_x_EncodedFeature": StructType( + [ + StructField("contig_collection_id", StringType(), nullable=False), + StructField("encoded_feature_id", StringType(), nullable=False), + ] + ), + "ContigCollection_x_Feature": StructType( + [ + StructField("contig_collection_id", StringType(), nullable=False), + StructField("feature_id", StringType(), nullable=False), + ] + ), + "ContigCollection_x_Protein": StructType( + [ + StructField("contig_collection_id", StringType(), nullable=False), + StructField("protein_id", StringType(), nullable=False), + ] + ), + "Contig_x_ContigCollection": StructType( + [ + StructField("contig_id", StringType(), nullable=False), + StructField("contig_collection_id", StringType(), nullable=False), + ] + ), + "Contig_x_EncodedFeature": StructType( + [ + StructField("contig_id", StringType(), nullable=False), + StructField("encoded_feature_id", StringType(), nullable=False), + ] + ), + "Contig_x_Feature": StructType( + [ + StructField("contig_id", StringType(), nullable=False), + StructField("feature_id", StringType(), nullable=False), + ] + ), + "Contig_x_Protein": StructType( + [ + StructField("contig_id", StringType(), nullable=False), + StructField("protein_id", StringType(), nullable=False), + ] + ), + "Contributor": StructType( + [ + StructField("contributor_id", StringType(), nullable=False), + StructField("contributor_type", StringType(), nullable=True), + StructField("name", StringType(), nullable=True), + StructField("given_name", StringType(), nullable=True), + StructField("family_name", StringType(), nullable=True), + ] + ), + "ContributorAffiliation": StructType( + [ + StructField("contributor_id", StringType(), nullable=False), + StructField("affiliation_id", StringType(), nullable=True), + ] + ), + "Contributor_x_DataSource": StructType( + [ + StructField("contributor_id", StringType(), nullable=False), + StructField("data_source_id", StringType(), nullable=False), + StructField("contributor_role", StringType(), nullable=True), + ] + ), + "Contributor_x_Role_x_Project": StructType( + [ + StructField("contributor_id", StringType(), nullable=False), + StructField("project_id", StringType(), nullable=False), + StructField("contributor_role", StringType(), nullable=True), + ] + ), + "ControlledTermValue": StructType( + [ + StructField("value_cv_label", StringType(), nullable=False), + StructField("raw_value", StringType(), nullable=True), + StructField("type", StringType(), nullable=True), + StructField("attribute_cv_id", StringType(), nullable=True), + StructField("attribute_cv_label", StringType(), nullable=True), + StructField("attribute_string", StringType(), nullable=True), + 
StructField("entity_id", StringType(), nullable=False), + ] + ), + "ControlledVocabularyTermValue": StructType( + [ + StructField("value_cv_label", StringType(), nullable=True), + StructField("value_cv_id", StringType(), nullable=True), + StructField("raw_value", StringType(), nullable=True), + StructField("type", StringType(), nullable=True), + StructField("attribute_cv_id", StringType(), nullable=True), + StructField("attribute_cv_label", StringType(), nullable=True), + StructField("attribute_string", StringType(), nullable=True), + StructField("entity_id", StringType(), nullable=False), + ] + ), + "DataSource": StructType( + [ + StructField("data_source_id", StringType(), nullable=False), + StructField("name", StringType(), nullable=True), + ] + ), + "DataSourceNew": StructType( + [ + StructField("data_source_id", StringType(), nullable=False), + StructField("name", StringType(), nullable=True), + StructField("comments", StringType(), nullable=True), + StructField("date_accessed", DateType(), nullable=False), + StructField("date_published", DateType(), nullable=True), + StructField("date_updated", DateType(), nullable=True), + StructField("license", StringType(), nullable=True), + StructField("publisher", StringType(), nullable=True), + StructField("resource_type", StringType(), nullable=False), + StructField("url", StringType(), nullable=True), + StructField("version", StringType(), nullable=True), + ] + ), + "DataSource_x_Description": StructType( + [ + StructField("data_source_id", StringType(), nullable=False), + StructField("resource_description_id", StringType(), nullable=False), + ] + ), + "DataSource_x_FundingReference": StructType( + [ + StructField("data_source_id", StringType(), nullable=False), + StructField("funding_reference_id", StringType(), nullable=False), + ] + ), + "DataSource_x_License": StructType( + [ + StructField("data_source_id", StringType(), nullable=False), + StructField("license_id", StringType(), nullable=False), + ] + ), + "DataSource_x_Title": StructType( + [ + StructField("data_source_id", StringType(), nullable=False), + StructField("resource_title_id", StringType(), nullable=False), + ] + ), + "DateTimeValue": StructType( + [ + StructField("date_time", DateType(), nullable=False), + StructField("raw_value", StringType(), nullable=True), + StructField("type", StringType(), nullable=True), + StructField("attribute_cv_id", StringType(), nullable=True), + StructField("attribute_cv_label", StringType(), nullable=True), + StructField("attribute_string", StringType(), nullable=True), + StructField("entity_id", StringType(), nullable=False), + ] + ), + "EncodedFeature": StructType( + [ + StructField("encoded_feature_id", StringType(), nullable=False), + StructField("hash", StringType(), nullable=True), + StructField("has_stop_codon", BooleanType(), nullable=True), + StructField("type", StringType(), nullable=True), + ] + ), + "EncodedFeature_x_Feature": StructType( + [ + StructField("encoded_feature_id", StringType(), nullable=False), + StructField("feature_id", StringType(), nullable=False), + ] + ), + "EncodedFeature_x_Protein": StructType( + [ + StructField("encoded_feature_id", StringType(), nullable=False), + StructField("protein_id", StringType(), nullable=False), + ] + ), + "EntailedEdge": StructType( + [ + StructField("subject", StringType(), nullable=True), + StructField("predicate", StringType(), nullable=True), + StructField("object", StringType(), nullable=True), + ] + ), + "Entity": StructType( + [ + StructField("entity_id", StringType(), 
nullable=False), + StructField("entity_type", StringType(), nullable=False), + StructField("data_source_id", StringType(), nullable=True), + StructField("data_source_entity_id", StringType(), nullable=True), + StructField("data_source_created", DateType(), nullable=False), + StructField("data_source_updated", DateType(), nullable=True), + StructField("created", DateType(), nullable=False), + StructField("updated", DateType(), nullable=False), + ] + ), + "Event": StructType( + [ + StructField("event_id", StringType(), nullable=False), + StructField("created_at", DateType(), nullable=True), + StructField("description", StringType(), nullable=True), + StructField("name", StringType(), nullable=True), + StructField("location", StringType(), nullable=True), + ] + ), + "Experiment": StructType( + [ + StructField("experiment_id", StringType(), nullable=False), + StructField("protocol_id", StringType(), nullable=False), + StructField("name", StringType(), nullable=True), + StructField("description", StringType(), nullable=True), + StructField("created_at", DateType(), nullable=True), + ] + ), + "ExperimentCondition": StructType( + [ + StructField("experiment_condition_id", StringType(), nullable=False), + StructField("experiment_id", StringType(), nullable=False), + StructField("variable_id", StringType(), nullable=False), + StructField("value", StringType(), nullable=True), + ] + ), + "ExperimentConditionSet": StructType( + [ + StructField("experiment_condition_set_id", StringType(), nullable=False), + StructField("experiment_condition_id", StringType(), nullable=False), + ] + ), + "Feature": StructType( + [ + StructField("feature_id", StringType(), nullable=False), + StructField("hash", StringType(), nullable=True), + StructField("cds_phase", StringType(), nullable=True), + StructField("e_value", FloatType(), nullable=True), + StructField("end", IntegerType(), nullable=True), + StructField("p_value", FloatType(), nullable=True), + StructField("start", IntegerType(), nullable=True), + StructField("strand", StringType(), nullable=True), + StructField("source_database", StringType(), nullable=True), + StructField("protocol_id", StringType(), nullable=True), + StructField("type", StringType(), nullable=True), + ] + ), + "Feature_x_Protein": StructType( + [ + StructField("feature_id", StringType(), nullable=False), + StructField("protein_id", StringType(), nullable=False), + ] + ), + "FundingReference": StructType( + [ + StructField("funding_reference_id", StringType(), nullable=False), + StructField("funder", StringType(), nullable=True), + StructField("grant_id", StringType(), nullable=True), + StructField("grant_title", StringType(), nullable=True), + StructField("grant_url", StringType(), nullable=True), + ] + ), + "Geolocation": StructType( + [ + StructField("latitude", FloatType(), nullable=False), + StructField("longitude", FloatType(), nullable=False), + StructField("raw_value", StringType(), nullable=True), + StructField("type", StringType(), nullable=True), + StructField("attribute_cv_id", StringType(), nullable=True), + StructField("attribute_cv_label", StringType(), nullable=True), + StructField("attribute_string", StringType(), nullable=True), + StructField("entity_id", StringType(), nullable=False), + ] + ), + "GoldEnvironmentalContext": StructType( + [ + StructField("gold_environmental_context_id", StringType(), nullable=False), + StructField("ecosystem", StringType(), nullable=True), + StructField("ecosystem_category", StringType(), nullable=True), + StructField("ecosystem_subtype", 
StringType(), nullable=True), + StructField("ecosystem_type", StringType(), nullable=True), + StructField("specific_ecosystem", StringType(), nullable=True), + ] + ), + "Identifier": StructType( + [ + StructField("entity_id", StringType(), nullable=False), + StructField("identifier", StringType(), nullable=False), + StructField("description", StringType(), nullable=True), + StructField("source", StringType(), nullable=True), + StructField("relationship", StringType(), nullable=True), + ] + ), + "License": StructType( + [ + StructField("license_id", StringType(), nullable=False), + StructField("id", StringType(), nullable=True), + StructField("name", StringType(), nullable=True), + StructField("url", StringType(), nullable=True), + ] + ), + "Measurement": StructType( + [ + StructField("measurement_id", StringType(), nullable=False), + StructField("measurement_set_id", StringType(), nullable=False), + StructField("experiment_condition_set_id", StringType(), nullable=False), + StructField("value", StringType(), nullable=True), + ] + ), + "MeasurementSet": StructType( + [ + StructField("measurement_set_id", StringType(), nullable=False), + StructField("variable_id", StringType(), nullable=False), + StructField("quality", StringType(), nullable=True), + StructField("created_at", DateType(), nullable=True), + ] + ), + "MixsEnvironmentalContext": StructType( + [ + StructField("mixs_environmental_context_id", StringType(), nullable=False), + StructField("env_broad_scale", StringType(), nullable=True), + StructField("env_local_scale", StringType(), nullable=True), + StructField("env_medium", StringType(), nullable=True), + ] + ), + "Name": StructType( + [ + StructField("entity_id", StringType(), nullable=False), + StructField("name", StringType(), nullable=False), + StructField("description", StringType(), nullable=True), + StructField("source", StringType(), nullable=True), + ] + ), + "OrderedProtocolStep": StructType( + [ + StructField("protocol_id", StringType(), nullable=False), + StructField("protocol_step_id", StringType(), nullable=False), + StructField("step_index", IntegerType(), nullable=False), + ] + ), + "Parameter": StructType( + [ + StructField("parameter_id", StringType(), nullable=False), + StructField("name", StringType(), nullable=True), + StructField("description", StringType(), nullable=True), + StructField("value_type", StringType(), nullable=True), + StructField("required", BooleanType(), nullable=True), + StructField("cardinality", StringType(), nullable=True), + StructField("default", StringType(), nullable=True), + StructField("parameter_type", StringType(), nullable=True), + ] + ), + "Prefix": StructType( + [ + StructField("prefix", StringType(), nullable=True), + StructField("base", StringType(), nullable=True), + ] + ), + "Project": StructType( + [ + StructField("project_id", StringType(), nullable=False), + StructField("description", StringType(), nullable=True), + ] + ), + "Protein": StructType( + [ + StructField("protein_id", StringType(), nullable=False), + StructField("hash", StringType(), nullable=True), + StructField("description", StringType(), nullable=True), + StructField("evidence_for_existence", StringType(), nullable=True), + StructField("length", IntegerType(), nullable=True), + StructField("sequence", StringType(), nullable=True), + ] + ), + "Protocol": StructType( + [ + StructField("protocol_id", StringType(), nullable=False), + StructField("name", StringType(), nullable=True), + StructField("description", StringType(), nullable=True), + 
StructField("doi", StringType(), nullable=True), + StructField("url", StringType(), nullable=True), + StructField("version", StringType(), nullable=True), + ] + ), + "ProtocolExecution": StructType( + [ + StructField("protocol_execution_id", StringType(), nullable=False), + StructField("protocol_id", StringType(), nullable=False), + StructField("name", StringType(), nullable=True), + StructField("description", StringType(), nullable=True), + StructField("created_at", DateType(), nullable=True), + ] + ), + "ProtocolInput": StructType( + [ + StructField("parameter_id", StringType(), nullable=False), + StructField("protocol_input_id", StringType(), nullable=False), + StructField("protocol_execution_id", StringType(), nullable=False), + StructField("value", StringType(), nullable=False), + ] + ), + "ProtocolInputSet": StructType( + [ + StructField("protocol_input_id", StringType(), nullable=False), + StructField("protocol_input_set_id", StringType(), nullable=False), + ] + ), + "ProtocolOutput": StructType( + [ + StructField("protocol_output_id", StringType(), nullable=False), + StructField("protocol_input_set_id", StringType(), nullable=False), + StructField("value", StringType(), nullable=False), + ] + ), + "ProtocolStep": StructType( + [ + StructField("protocol_step_id", StringType(), nullable=False), + StructField("step", StringType(), nullable=True), + ] + ), + "ProtocolVariable": StructType( + [ + StructField("protocol_id", StringType(), nullable=False), + StructField("variable_id", StringType(), nullable=False), + ] + ), + "Publication": StructType( + [ + StructField("publication_id", StringType(), nullable=False), + ] + ), + "QuantityRangeValue": StructType( + [ + StructField("maximum_numeric_value", FloatType(), nullable=False), + StructField("minimum_numeric_value", FloatType(), nullable=False), + StructField("unit_cv_id", StringType(), nullable=True), + StructField("unit_cv_label", StringType(), nullable=True), + StructField("unit_string", StringType(), nullable=True), + StructField("raw_value", StringType(), nullable=True), + StructField("type", StringType(), nullable=True), + StructField("attribute_cv_id", StringType(), nullable=True), + StructField("attribute_cv_label", StringType(), nullable=True), + StructField("attribute_string", StringType(), nullable=True), + StructField("entity_id", StringType(), nullable=False), + ] + ), + "QuantityValue": StructType( + [ + StructField("numeric_value", FloatType(), nullable=False), + StructField("unit_cv_id", StringType(), nullable=True), + StructField("unit_cv_label", StringType(), nullable=True), + StructField("unit_string", StringType(), nullable=True), + StructField("raw_value", StringType(), nullable=True), + StructField("type", StringType(), nullable=True), + StructField("attribute_cv_id", StringType(), nullable=True), + StructField("attribute_cv_label", StringType(), nullable=True), + StructField("attribute_string", StringType(), nullable=True), + StructField("entity_id", StringType(), nullable=False), + ] + ), + "ResourceDescription": StructType( + [ + StructField("resource_description_id", StringType(), nullable=False), + StructField("description_text", StringType(), nullable=False), + StructField("description_type", StringType(), nullable=True), + StructField("language", StringType(), nullable=True), + ] + ), + "ResourceTitle": StructType( + [ + StructField("resource_title_id", StringType(), nullable=False), + StructField("language", StringType(), nullable=True), + StructField("title", StringType(), nullable=False), + 
StructField("title_type", StringType(), nullable=True), + ] + ), + "Sample": StructType( + [ + StructField("sample_id", StringType(), nullable=False), + StructField("description", StringType(), nullable=True), + StructField("type", StringType(), nullable=True), + ] + ), + "Sequence": StructType( + [ + StructField("sequence_id", StringType(), nullable=False), + StructField("entity_id", StringType(), nullable=False), + StructField("type", StringType(), nullable=True), + StructField("length", IntegerType(), nullable=True), + StructField("checksum", StringType(), nullable=True), + ] + ), + "Statement": StructType( + [ + StructField("subject", StringType(), nullable=True), + StructField("predicate", StringType(), nullable=True), + StructField("object", StringType(), nullable=True), + StructField("value", StringType(), nullable=True), + StructField("datatype", StringType(), nullable=True), + StructField("language", StringType(), nullable=True), + ] + ), + "TextValue": StructType( + [ + StructField("text_value", StringType(), nullable=False), + StructField("language", StringType(), nullable=True), + StructField("raw_value", StringType(), nullable=True), + StructField("type", StringType(), nullable=True), + StructField("attribute_cv_id", StringType(), nullable=True), + StructField("attribute_cv_label", StringType(), nullable=True), + StructField("attribute_string", StringType(), nullable=True), + StructField("entity_id", StringType(), nullable=False), + ] + ), + "Variable": StructType( + [ + StructField("variable_id", StringType(), nullable=False), + StructField("name", StringType(), nullable=True), + StructField("description", StringType(), nullable=True), + StructField("name_cv_id", StringType(), nullable=True), + StructField("unit", StringType(), nullable=True), + StructField("value_type", StringType(), nullable=False), + ] + ), + "VariableValue": StructType( + [ + StructField("variable_value_id", StringType(), nullable=False), + StructField("variable_id", StringType(), nullable=False), + StructField("value_type", StringType(), nullable=True), + ] + ), +} diff --git a/src/cdm_data_loader_utils/parsers/shared_identifiers.py b/src/cdm_data_loader_utils/parsers/shared_identifiers.py new file mode 100644 index 0000000..33b865f --- /dev/null +++ b/src/cdm_data_loader_utils/parsers/shared_identifiers.py @@ -0,0 +1,11 @@ +from cdm_data_loader_utils.parsers.xml_utils import get_text + + +def parse_identifiers_generic(entry, xpath, prefix, ns): + result = [] + for node in entry.findall(xpath, ns): + text = get_text(node) + if not text: + continue + result.append({"identifier": f"{prefix}:{text}", "source": prefix, "description": f"{prefix} accession"}) + return result diff --git a/src/cdm_data_loader_utils/parsers/uniprot.py b/src/cdm_data_loader_utils/parsers/uniprot.py index fa6d6a4..ca4516e 100644 --- a/src/cdm_data_loader_utils/parsers/uniprot.py +++ b/src/cdm_data_loader_utils/parsers/uniprot.py @@ -1,28 +1,37 @@ """ -UniProt XML Delta Lake Ingestion Pipeline. +UniProt XML Delta Lake Ingestion Pipeline ========================================= This script parses UniProt XML (.xml.gz) file and ingests the data into structured Delta Lake tables. 
Typical usage: -------------- +Use it in Berdle as: python3 src/parsers/uniprot.py \ --xml-url "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/uniprot_sprot_archaea.xml.gz" \ --output-dir "./output" \ --namespace "uniprot_db" \ --batch-size 5000 + +python -m cdm_data_loader_utils.parsers.uniprot \ + --xml-url "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/uniprot_sprot_archaea.xml.gz" \ + --output-dir "tests/data/uniprot_archaea" \ + --namespace "uniprot_db" \ + --batch-size 5000 + + Arguments: ---------- ---xml-url: URL to the UniProt XML .gz file +--xml-url: URL to the UniProt XML .gz file --output-dir: Output directory for Delta tables and logs (default: './output') --namespace: Delta Lake database name (default: 'uniprot_db') ---target-date: Process entries modified/updated since specific date +--target-date: Process entries modified/updated since specific date --batch-size: Number of UniProt entries to process per write batch (default: 5000) Functionality: -------------- -- Downloads the XML file if not present locally +- Downloads the XML file if not present locally - Parses UniProt entries in a memory-efficient streaming fashion - Maps parsed data into standardized CDM tables - Writes all tables as Delta Lake tables, supporting incremental import @@ -38,6 +47,7 @@ import datetime import gzip import json +import logging import os import uuid import xml.etree.ElementTree as ET @@ -46,49 +56,61 @@ import requests from delta import configure_spark_with_delta_pip from pyspark.sql import SparkSession +from pyspark.sql.functions import col, split from pyspark.sql.types import ArrayType, StringType, StructField, StructType -## XML namespace mapping for UniProt entries (used for all XPath queries) -NS = {"u": "https://uniprot.org/uniprot"} +from cdm_data_loader_utils.parsers.shared_identifiers import parse_identifiers_generic +from cdm_data_loader_utils.parsers.xml_utils import clean_dict, find_all_text, get_attr, get_text, parse_db_references +# --------------------------------------------------------------------- +# Logging +# --------------------------------------------------------------------- +logger = logging.getLogger(__name__) +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s | %(levelname)s | %(name)s | %(message)s", +) -def load_existing_identifiers(spark, output_dir, namespace): - """ - Load the existing 'identifiers' Delta table and build a mapping from UniProt accession to CDM entity ID. - This function enables consistent mapping of accessions to CDM IDs across multiple imports, supporting upsert and idempotent workflows. 
- Returns: - dict: {accession: entity_id} - """ - access_to_cdm_id = {} - id_path = os.path.abspath(os.path.join(output_dir, f"{namespace}_identifiers_delta")) - if os.path.exists(id_path): - try: - # Read identifier and entity_id columns from the Delta table - df = spark.read.format("delta").load(id_path).select("identifier", "entity_id") - for row in df.collect(): - # Identifier field: UniProt:Pxxxxx, extract the actual accession part after the colon - accession = row["identifier"].split(":", 1)[1] - access_to_cdm_id[accession] = row["entity_id"] - except Exception as e: - print(f"Couldn't load identifiers table: {e}") - else: - print(f"No previous identifiers delta at {id_path}.") - return access_to_cdm_id +# --------------------------------------------------------------------- +# XML namespace mapping for UniProt entries (used for all XPath queries) +# --------------------------------------------------------------------- +NS = {"ns": "https://uniprot.org/uniprot"} -def generate_cdm_id() -> str: - """ - Generate a CDM entity_id directly from UniProt accession, using 'CDM:' prefix - Ensures that each accession is mapped to stable and unique CDM entity ID, making it easy to join across different tables by accession. - """ - return f"CDM:{uuid.uuid4()}" +# --------------------------------------------------------------------- +# Stable ID namespace (UUIDv5) +# --------------------------------------------------------------------- +CDM_UUID_NAMESPACE = uuid.UUID("2d3f6e2a-4d7b-4a8c-9c5a-0e0f7b7d9b3a") -def build_datasource_record(xml_url): - """ - Build a provenance record for the UniProt datasource without version extraction. - """ +# --------------------------------------------------------------------- +# CURIE prefixes +# --------------------------------------------------------------------- +PREFIX_TRANSLATION: dict[str, str] = { + "UniProtKB": "UniProt", + "UniProtKB/Swiss-Prot": "UniProt", + "UniProtKB/TrEMBL": "UniProt", + "UniParc": "UniParc", + "RefSeq": "RefSeq", + "EMBL": "EMBL", + "PDB": "PDB", + "ChEBI": "ChEBI", + "Rhea": "Rhea", + "NCBI Taxonomy": "NCBITaxon", + "GeneID": "NCBIGene", + "Ensembl": "Ensembl", + "GO": "GO", +} + + +# ================================ HELPERS ================================= +def delta_table_path(output_dir: str, namespace: str, table: str) -> str: + return os.path.abspath(os.path.join(output_dir, namespace, table)) + + +def build_datasource_record(xml_url: str) -> dict: + """Build a provenance record for the UniProt datasource.""" return { "name": "UniProt import", "source": "UniProt", @@ -98,162 +120,323 @@ def build_datasource_record(xml_url): } -def parse_identifiers(entry, cdm_id): - """ - Extract all accession numbers in the UniProt entry and format them into a CDM identifier structure. 
- """ - return [ - { - "entity_id": cdm_id, - "identifier": f"UniProt:{acc.text}", - "source": "UniProt", - "description": "UniProt accession", - } - for acc in entry.findall("u:accession", NS) - ] +def save_datasource_record(xml_url: str, output_dir: str) -> dict: + """Generate and save the datasource provenance record as a JSON file.""" + datasource = build_datasource_record(xml_url) + + os.makedirs(output_dir, exist_ok=True) + output_path = os.path.join(output_dir, "datasource.json") + + with open(output_path, "w", encoding="utf-8") as f: + json.dump(datasource, f, indent=2) + + logger.info("Saved datasource record to %s", output_path) + return datasource + + +def download_file( + url: str, + output_path: str, + chunk_size: int = 1024 * 1024, + overwrite: bool = False, +) -> None: + """Download URL -> output_path (streaming)""" + if os.path.exists(output_path) and not overwrite: + logger.info("File already exists, skip download: %s", output_path) + return + + tmp_path = output_path + ".part" + if os.path.exists(tmp_path): + try: + os.remove(tmp_path) + except Exception: + pass + + try: + logger.info("Downloading %s -> %s", url, output_path) + with requests.get(url, stream=True, timeout=120) as r: + r.raise_for_status() + with open(tmp_path, "wb") as f: + for chunk in r.iter_content(chunk_size=chunk_size): + if chunk: + f.write(chunk) + os.replace(tmp_path, output_path) + logger.info("Download complete: %s", output_path) + except Exception: + logger.exception("Failed to download %s", url) + try: + if os.path.exists(tmp_path): + os.remove(tmp_path) + except Exception: + logger.exception("Failed to remove partial download: %s", tmp_path) + raise + + +def prepare_local_xml(xml_url: str, output_dir: str, overwrite: bool = False) -> str: + os.makedirs(output_dir, exist_ok=True) + local_path = os.path.join(output_dir, os.path.basename(xml_url)) + download_file(xml_url, local_path, overwrite=overwrite) + return local_path + + +def stream_uniprot_xml(filepath: str): + """Stream gzipped UniProt XML entries.""" + logger.info("Streaming UniProt XML from: %s", filepath) + with gzip.open(filepath, "rb") as f: + for _, elem in ET.iterparse(f, events=("end",)): + if elem.tag.endswith("entry"): + yield elem + elem.clear() + + +def get_spark_session(namespace: str) -> SparkSession: + """Initialize SparkSession with Delta Lake support, and ensure the target database exists.""" + builder = ( + SparkSession.builder.appName("UniProtDeltaIngestion") + .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") + .config( + "spark.sql.catalog.spark_catalog", + "org.apache.spark.sql.delta.catalog.DeltaCatalog", + ) + .config("spark.databricks.delta.schema.autoMerge.enabled", "true") + ) + spark = configure_spark_with_delta_pip(builder).getOrCreate() + spark.sql(f"CREATE DATABASE IF NOT EXISTS {namespace}") + return spark + + +def normalize_prefix(db_type: str) -> str: + """Map UniProt dbReference @type to a normalized CURIE prefix.""" + return PREFIX_TRANSLATION.get(db_type, db_type.replace(" ", "")) + +def make_curie(db_type: str, db_id: str) -> str: + """Create CURIE with normalized prefix.""" + return f"{normalize_prefix(db_type)}:{db_id}" -def parse_names(entry, cdm_id): + +# ================================ STABLE ID ================================= +def stable_cdm_id_from_uniprot_accession(accession: str, prefix: str = "cdm_prot_") -> str: + u = uuid.uuid5(CDM_UUID_NAMESPACE, f"UniProt:{accession}") + return f"{prefix}{u}" + + +def load_existing_maps( + spark: SparkSession, + output_dir: 
str, + namespace: str, +) -> tuple[dict[str, str], dict[str, str]]: """ - Extract all protein names from a UniProt element, including - - Top-level elements (generic names) - - and blocks within (full and short names). + Returns: + accession_to_entity_id: accession -> entity_id (from identifiers) + entity_id_to_created: entity_id -> created (from entities) """ - names = [] + accession_to_entity_id: dict[str, str] = {} + entity_id_to_created: dict[str, str] = {} - # Extract all top-level tags - for name_element in entry.findall("u:name", NS): - if name_element.text: - names.append( - { - "entity_id": cdm_id, - "name": name_element.text, - "description": "UniProt protein name", - "source": "UniProt", - } + id_path = os.path.join(output_dir, namespace, "identifiers") + ent_path = os.path.join(output_dir, namespace, "entities") + + if os.path.exists(id_path): + try: + df = ( + spark.read.format("delta") + .load(id_path) + .filter(col("identifier").startswith("UniProt:")) + .select( + split(col("identifier"), ":").getItem(1).alias("accession"), + col("entity_id"), + ) ) + for row in df.toLocalIterator(): + acc = row["accession"] + eid = row["entity_id"] + if acc and eid: + accession_to_entity_id[acc] = eid + logger.info( + "Loaded %d accession->entity_id from %s", + len(accession_to_entity_id), + id_path, + ) + except Exception: + logger.exception("Couldn't load identifiers from %s", id_path) - # Extract recommended and alternative names from block - protein = entry.find("u:protein", NS) - if protein is not None: - for name_type in ["recommended", "alternative"]: - # Directly use findall for simplicity (recommendedName returns single-element list) - name_blocks = protein.findall(f"u:{name_type}Name", NS) - for name in name_blocks: - for name_length in ["full", "short"]: - name_string = name.find(f"u:{name_length}Name", NS) - if name_string is None or not name_string.text: - continue + if os.path.exists(ent_path): + try: + df = spark.read.format("delta").load(ent_path).select("entity_id", "created") + for row in df.toLocalIterator(): + if row["entity_id"] and row["created"]: + entity_id_to_created[row["entity_id"]] = row["created"] + logger.info( + "Loaded %d entity_id->created from %s", + len(entity_id_to_created), + ent_path, + ) + except Exception: + logger.exception("Couldn't load entities from %s", ent_path) + + return accession_to_entity_id, entity_id_to_created + + +# ================================ PARSERS ================================= +def parse_identifiers(entry, cdm_id: str) -> list[dict]: + out = parse_identifiers_generic(entry=entry, xpath="ns:accession", prefix="UniProt", ns=NS) + for row in out: + row["entity_id"] = cdm_id + row.setdefault("source", "UniProt") + row.setdefault("description", "UniProt accession") + return out - names.append( - { - "entity_id": cdm_id, - "name": name_string.text, - "description": f"UniProt {name_type} {name_length} name", - "source": "UniProt", - } - ) + +def _make_name_record(cdm_id: str, name_text: str, description: str) -> dict: + return { + "entity_id": cdm_id, + "name": name_text, + "description": description, + "source": "UniProt", + } + + +def parse_names(entry, cdm_id: str) -> list[dict]: + names: list[dict] = [] + + for txt in find_all_text(entry, "ns:name", NS): + names.append(_make_name_record(cdm_id, txt, "UniProt entry name")) + + protein = entry.find("ns:protein", NS) + if protein is not None: + for tag_name, logical_type in [ + ("recommendedName", "recommended"), + ("alternativeName", "alternative"), + ]: + for name_block in 
protein.findall(f"ns:{tag_name}", NS): + for xml_tag, length_label in [ + ("fullName", "full"), + ("shortName", "short"), + ]: + elem = name_block.find(f"ns:{xml_tag}", NS) + text = get_text(elem) + if text: + names.append( + _make_name_record( + cdm_id, + text, + f"UniProt {logical_type} {length_label} name", + ) + ) return names -def parse_protein_info(entry, cdm_id): - """ - Extract protein-level metadata from a UniProt XML element. - """ - protein_info = {} - ec_numbers = [] +def parse_protein_info(entry, cdm_id: str) -> dict | None: + protein_info: dict = {} - # Extract EC numbers from and in - protein = entry.find("u:protein", NS) + protein = entry.find("ns:protein", NS) if protein is not None: - # Find EC numbers in recommendedName - rec = protein.find("u:recommendedName", NS) - if rec is not None: - for ec in rec.findall("u:ecNumber", NS): - if ec.text: - ec_numbers.append(ec.text) - - # Find EC numbers in all alternativeNames - for alt in protein.findall("u:alternativeName", NS): - for ec in alt.findall("u:ecNumber", NS): - if ec.text: - ec_numbers.append(ec.text) + ec_paths = ["ns:recommendedName/ns:ecNumber", "ns:alternativeName/ns:ecNumber"] + ec_numbers: list[str] = [] + for path in ec_paths: + ec_numbers.extend(find_all_text(protein, path, NS)) if ec_numbers: - protein_info["ec_numbers"] = ec_numbers + protein_info["ec_numbers"] = ";".join(ec_numbers) - # Extract protein existence evidence type - protein_existence = entry.find("u:proteinExistence", NS) + protein_existence = entry.find("ns:proteinExistence", NS) if protein_existence is not None: protein_info["protein_id"] = cdm_id - protein_info["evidence_for_existence"] = protein_existence.get("type") - - # Extract sequence and sequence-related attributes - seq_elem = entry.find("u:sequence", NS) - if seq_elem is not None and seq_elem.text: - protein_info["length"] = seq_elem.get("length") - protein_info["mass"] = seq_elem.get("mass") - protein_info["checksum"] = seq_elem.get("checksum") - protein_info["modified"] = seq_elem.get("modified") - protein_info["sequence_version"] = seq_elem.get("version") - protein_info["sequence"] = seq_elem.text.strip() - - # Capture the entry's modified/updated date for tracking - entry_modified = entry.attrib.get("modified") or entry.attrib.get("updated") + protein_info["evidence_for_existence"] = get_attr(protein_existence, "type") + + seq_elem = entry.find("ns:sequence", NS) + if seq_elem is not None: + protein_info.update( + clean_dict( + { + "length": get_attr(seq_elem, "length"), + "mass": get_attr(seq_elem, "mass"), + "checksum": get_attr(seq_elem, "checksum"), + "modified": get_attr(seq_elem, "modified"), + "sequence_version": get_attr(seq_elem, "version"), + "sequence": get_text(seq_elem), + } + ) + ) + + entry_modified = get_attr(entry, "modified") or get_attr(entry, "updated") if entry_modified: protein_info["entry_modified"] = entry_modified - # Return the dictionary if any protein info was extracted return protein_info if protein_info else None -def parse_evidence_map(entry): - """ - Parse all elements from a UniProt XML entry and build a mapping - from evidence key to metadata (type, supporting objects, publications). 
- """ - evidence_map = {} +def parse_evidence_map(entry) -> dict[str, dict]: + evidence_map: dict[str, dict] = {} - # Loop through every element in the entry - for evidence in entry.findall("u:evidence", NS): - key = evidence.get("key") # Unique evidence key (string) - evidence_type = evidence.get("type") # Evidence code/type (e.g., ECO:0000255) + for ev in entry.findall("ns:evidence", NS): + key = get_attr(ev, "key") + if not key: + continue - supporting_objects = [] - publications = [] + evidence_type = get_attr(ev, "type") + pubs: list[str] = [] + others: list[str] = [] - # Check if this evidence has a element with children - source = evidence.find("u:source", NS) + source = ev.find("ns:source", NS) if source is not None: - for dbref in source.findall("u:dbReference", NS): - db_type = dbref.get("type") - db_id = dbref.get("id") - # Add publication references as PubMed or DOI; others as supporting objects - if db_type == "PubMed": - publications.append(f"PMID:{db_id}") - elif db_type == "DOI": - publications.append(f"DOI:{db_id}") + raw_pubs, raw_others = parse_db_references(source, NS) + + normalized_pubs: list[str] = [] + for p in raw_pubs: + up = p.upper() + if up.startswith("PUBMED:"): + _, acc = p.split(":", 1) + normalized_pubs.append(f"PMID:{acc}") else: - supporting_objects.append(f"{db_type}:{db_id}") + normalized_pubs.append(p) - # Store evidence metadata, omitting empty lists for cleanliness - evidence_map[key] = { - "evidence_type": evidence_type, - "supporting_objects": supporting_objects if supporting_objects else None, - "publications": publications if publications else None, - } + pubs = normalized_pubs + others = raw_others + + evidence_map[key] = clean_dict( + { + "evidence_type": evidence_type, + "publications": pubs or None, + "supporting_objects": others or None, + } + ) return evidence_map -def parse_reaction_association(reaction, cdm_id, evidence_map): - associations = [] - for dbref in reaction.findall("u:dbReference", NS): +def _make_association( + cdm_id: str, + obj: str, + predicate: str | None = None, + evidence_key: str | None = None, + evidence_map: dict | None = None, +) -> dict: + assoc = { + "subject": cdm_id, + "object": obj, + "predicate": predicate, + "evidence_type": None, + "supporting_objects": None, + "publications": None, + } + if evidence_key and evidence_map and evidence_key in evidence_map: + assoc.update(evidence_map[evidence_key]) + return clean_dict(assoc) + + +def parse_reaction_association(reaction, cdm_id: str, evidence_map: dict[str, dict]) -> list[dict]: + associations: list[dict] = [] + for dbref in reaction.findall("ns:dbReference", NS): db_type = dbref.get("type") db_id = dbref.get("id") + if not db_type or not db_id: + continue + assoc = { "subject": cdm_id, "predicate": "catalyzes", - "object": f"{db_type}:{db_id}", + "object": make_curie(db_type, db_id), "evidence_type": None, "supporting_objects": None, "publications": None, @@ -261,124 +444,127 @@ def parse_reaction_association(reaction, cdm_id, evidence_map): evidence_key = reaction.get("evidence") if evidence_key and evidence_key in evidence_map: assoc.update(evidence_map[evidence_key]) - associations.append(assoc) + associations.append(clean_dict(assoc)) return associations -def parse_cofactor_association(cofactor, cdm_id): - associations = [] - for dbref in cofactor.findall("u:dbReference", NS): +def parse_cofactor_association(cofactor, cdm_id: str) -> list[dict]: + associations: list[dict] = [] + for dbref in cofactor.findall("ns:dbReference", NS): db_type = 
dbref.get("type") db_id = dbref.get("id") - assoc = { - "subject": cdm_id, - "predicate": "requires_cofactor", - "object": f"{db_type}:{db_id}", - "evidence_type": None, - "supporting_objects": None, - "publications": None, - } - associations.append(assoc) + if not db_type or not db_id: + continue + associations.append( + clean_dict( + { + "subject": cdm_id, + "predicate": "requires_cofactor", + "object": make_curie(db_type, db_id), + "evidence_type": None, + "supporting_objects": None, + "publications": None, + } + ) + ) return associations -def parse_associations(entry, cdm_id, evidence_map): +def parse_associations(entry, cdm_id: str, evidence_map: dict[str, dict]) -> list[dict]: """ - Parse all relevant associations from a UniProt XML entry for the CDM model. - Only include fields that are not None for each association. + Only keep: + - taxonomy association + - catalytic activity / cofactor associations """ - associations = [] - - def clean(d): - """Remove None-value keys from a dict.""" - return {k: v for k, v in d.items() if v is not None} + associations: list[dict] = [] # Taxonomy association - organism = entry.find("u:organism", NS) + organism = entry.find("ns:organism", NS) if organism is not None: - taxon_ref = organism.find('u:dbReference[@type="NCBI Taxonomy"]', NS) + taxon_ref = organism.find('ns:dbReference[@type="NCBI Taxonomy"]', NS) if taxon_ref is not None: - associations.append( - clean( - { - "subject": cdm_id, - "object": f"NCBITaxon:{taxon_ref.get('id')}", - "predicate": None, - "evidence_type": None, - "supporting_objects": None, - "publications": None, - } - ) - ) - - # Database cross-references with evidence - for dbref in entry.findall("u:dbReference", NS): - db_type = dbref.get("type") - db_id = dbref.get("id") - association = { - "subject": cdm_id, - "object": f"{db_type}:{db_id}", - "predicate": None, - "evidence_type": None, - "supporting_objects": None, - "publications": None, - } - evidence_key = dbref.get("evidence") - if evidence_key and evidence_key in evidence_map: - association.update(evidence_map[evidence_key]) - associations.append(clean(association)) + tax_id = taxon_ref.get("id") + if tax_id: + associations.append(_make_association(cdm_id, f"NCBITaxon:{tax_id}", predicate="in_taxon")) - # Catalytic/cofactor - for comment in entry.findall("u:comment", NS): + # Catalytic activity / cofactor + for comment in entry.findall("ns:comment", NS): comment_type = comment.get("type") if comment_type == "catalytic activity": - # extract catalytic associations - for reaction in comment.findall("u:reaction", NS): - for assoc in parse_reaction_association(reaction, cdm_id, evidence_map): - associations.append(clean(assoc)) + for reaction in comment.findall("ns:reaction", NS): + associations.extend(parse_reaction_association(reaction, cdm_id, evidence_map)) elif comment_type == "cofactor": - # extract cofactor associations - for cofactor in comment.findall("u:cofactor", NS): - for assoc in parse_cofactor_association(cofactor, cdm_id): - associations.append(clean(assoc)) + for cofactor in comment.findall("ns:cofactor", NS): + associations.extend(parse_cofactor_association(cofactor, cdm_id)) + return associations -def parse_publications(entry): - """ - Extract all publication references from a UniProt XML - Returns a list of standardized publication IDs (PMID and DOI). 
- """ - publications = [] - - # Iterate through all blocks in the entry - for reference in entry.findall("u:reference", NS): - citation = reference.find("u:citation", NS) - if citation is not None: - # Each may have multiple elements (e.g., PubMed, DOI) - for dbref in citation.findall("u:dbReference", NS): - db_type = dbref.get("type") - db_id = dbref.get("id") - # Standardize format for known publication types - if db_type == "PubMed": - publications.append(f"PMID:{db_id}") - elif db_type == "DOI": - publications.append(f"DOI:{db_id}") - - return publications - - -def parse_uniprot_entry(entry, cdm_id, current_timestamp, datasource_name="UniProt import", prev_created=None): - if prev_created: - entity_created = prev_created - entity_updated = current_timestamp - else: - entity_created = current_timestamp - entity_updated = current_timestamp +def parse_cross_references(entry, cdm_id: str) -> list[dict]: + """Generic -> cross_references table.""" + rows: list[dict] = [] + + for dbref in entry.findall("ns:dbReference", NS): + db_type = dbref.get("type") + db_id = dbref.get("id") + if not db_type or not db_id: + continue + + xref_type = normalize_prefix(db_type) + + if ":" in db_id: + xref = db_id + else: + xref = f"{xref_type}:{db_id}" + + rows.append( + clean_dict( + { + "entity_id": cdm_id, + "xref_type": xref_type, + "xref_value": db_id, + "xref": xref, + } + ) + ) + + return rows + + +def parse_publications(entry) -> list[str]: + publications: list[str] = [] + for reference in entry.findall("ns:reference", NS): + citation = reference.find("ns:citation", NS) + if citation is None: + continue + + raw_pubs, _ = parse_db_references(citation, NS) + for p in raw_pubs: + up = p.upper() + if up.startswith("PUBMED:"): + _, acc = p.split(":", 1) + publications.append(f"PMID:{acc}") + elif up.startswith("DOI:"): + _, acc = p.split(":", 1) + publications.append(f"DOI:{acc}") + + return list(dict.fromkeys(publications)) + + +def parse_uniprot_entry( + entry, + cdm_id: str, + current_timestamp: str, + datasource_name: str = "UniProt import", + prev_created: str | None = None, +) -> dict: + entity_created = prev_created or current_timestamp + entity_updated = current_timestamp uniprot_created = entry.attrib.get("created") uniprot_modified = entry.attrib.get("modified") or entry.attrib.get("updated") uniprot_version = entry.attrib.get("version") + entity = { "entity_id": cdm_id, "entity_type": "protein", @@ -389,65 +575,21 @@ def parse_uniprot_entry(entry, cdm_id, current_timestamp, datasource_name="UniPr "uniprot_created": uniprot_created, "uniprot_modified": uniprot_modified, } + evidence_map = parse_evidence_map(entry) + return { "entity": entity, "identifiers": parse_identifiers(entry, cdm_id), "names": parse_names(entry, cdm_id), "protein": parse_protein_info(entry, cdm_id), "associations": parse_associations(entry, cdm_id, evidence_map), + "cross_references": parse_cross_references(entry, cdm_id), "publications": parse_publications(entry), } -def download_file(url, output_path, chunk_size=8192, overwrite=False) -> None: - """ - Download a file from a given URL to a local output path. 
- """ - # Skip download if file already exists and not overwriting - if os.path.exists(output_path) and not overwrite: - print(f"File '{output_path}' already exists.") - return - - # Stream download to avoid high memory usage - try: - with requests.get(url, stream=True, timeout=60) as response: - response.raise_for_status() - with open(output_path, "wb") as f: - for chunk in response.iter_content(chunk_size=chunk_size): - if chunk: - f.write(chunk) - print(f"Downloaded '{url}' to '{output_path}'") - except Exception as e: - print(f"Failed to download '{url}': {e}") - - if os.path.exists(output_path): - os.remove(output_path) # Remove incomplete file - raise - - -def stream_uniprot_xml(filepath): - """ - Stream and parse UniProt XML entries from a local gzipped file. - Yields each element as soon as it is parsed to avoid loading the entire XML into memory. - """ - # Open the gzipped XML file for reading in binary mode - with gzip.open(filepath, "rb") as f: - # Use iterparse to process XML incrementally, triggering on element end events - context = ET.iterparse(f, events=("end",)) - for _event, element in context: - # Check tag name, ignoring namespace - if element.tag.endswith("entry"): - yield element - element.clear() - - -## ================================ SCHEMA ================================= -""" -Defines the Spark schema for all major CDM tables derived from UniProt XML. -Each schema is tailored for protein entities, identifiers, protein details, names, associations, and linked publications. -""" - +# ================================ SCHEMA ================================= schema_entities = StructType( [ StructField("entity_id", StringType(), False), @@ -505,6 +647,15 @@ def stream_uniprot_xml(filepath): ] ) +schema_cross_references = StructType( + [ + StructField("entity_id", StringType(), False), + StructField("xref_type", StringType(), True), + StructField("xref_value", StringType(), True), + StructField("xref", StringType(), True), + ] +) + schema_publications = StructType( [ StructField("entity_id", StringType(), False), @@ -513,126 +664,74 @@ def stream_uniprot_xml(filepath): ) -def save_batches_to_delta(spark, tables, output_dir, namespace) -> None: - """ - Persist batches of parsed records for each CDM table into Delta Lake format. - - - Each table is saved into a Delta directory named '{namespace}_{table}_delta' in the output folder. - - If the Delta directory exists, append new records. Otherwise, overwrite it. - - Registers the table in the Spark SQL for downstream query. - """ - for table, (records, schema) in tables.items(): - if not records: - continue # Skip all empty tables - - delta_dir = os.path.abspath(os.path.join(output_dir, f"{namespace}_{table}_delta")) - # Use "append" mode if the Delta directory already exists, otherwise "overwrite" - mode = "append" if os.path.exists(delta_dir) else "overwrite" - - print( - f"[DEBUG] Registering table: {namespace}.{table} at {delta_dir} with mode={mode}, record count: {len(records)}" - ) - - try: - df = spark.createDataFrame(records, schema) - df.write.format("delta").mode(mode).option("overwriteSchema", "true").save(delta_dir) - spark.sql(f""" - CREATE TABLE IF NOT EXISTS {namespace}.{table} - USING DELTA - LOCATION '{delta_dir}' - """) - except Exception as e: - print(f"Failed to save {table} to Delta: {e}") - - -def prepare_local_xml(xml_url, output_dir): - """ - Download the remote UniProt XML (.xml.gz) file to the specified local output directory, - unless the file already exists locally. 
Returns the full local file path. - """ - # Ensure output directory exists - os.makedirs(output_dir, exist_ok=True) - local_xml_path = os.path.join(output_dir, os.path.basename(xml_url)) - # Download only if file does not exist - download_file(xml_url, local_xml_path) - return local_xml_path - - -def save_datasource_record(xml_url, output_dir): - """ - Generate and save the datasource provenance record as a JSON file in the output directory. - """ - datasource = build_datasource_record(xml_url) - os.makedirs(output_dir, exist_ok=True) # Ensure output directory exists - output_path = os.path.join(output_dir, "datasource.json") - with open(output_path, "w") as f: - json.dump(datasource, f, indent=4) - return datasource - - -def get_spark_session(namespace): - """ - Initialize SparkSession with Delta Lake support, and ensure the target database exists. - """ - # Build SparkSession with Delta extensions enabled - builder = ( - SparkSession.builder.appName("DeltaIngestion") - .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") - .config( - "spark.sql.catalog.spark_catalog", - "org.apache.spark.sql.delta.catalog.DeltaCatalog", - ) - ) - spark = configure_spark_with_delta_pip(builder).getOrCreate() - # Ensure the target namespace (database) exists +# ================================ DELTA WRITE ================================= +def ensure_tables_registered(spark: SparkSession, output_dir: str, namespace: str, table_names: list[str]) -> None: spark.sql(f"CREATE DATABASE IF NOT EXISTS {namespace}") - return spark - - -def load_existing_entity(spark, output_dir, namespace): - """ - Load the existing entities_delta Delta table and build a mapping of entity_id to created timestamp. - This mapping is used to support upserts and idempotent writes. - """ - old_created_dict = {} - entities_table_path = os.path.abspath(os.path.join(output_dir, f"{namespace}_entities_delta")) - if os.path.exists(entities_table_path): - try: - # Read only the required columns for efficiency - old_df = spark.read.format("delta").load(entities_table_path).select("entity_id", "created") - for row in old_df.collect(): - old_created_dict[row["entity_id"]] = row["created"] - print(f"Loaded {len(old_created_dict)} existing entity_id records for upsert.") - except Exception as e: - print(f"Couldn't load previous entities delta table: {e}") - else: - print(f"No previous entities delta at {entities_table_path}.") - return old_created_dict + for tbl in table_names: + # delta_dir = os.path.abspath(os.path.join(output_dir, namespace, tbl)) + delta_dir = delta_table_path(output_dir, namespace, tbl) + spark.sql( + f""" + CREATE TABLE IF NOT EXISTS {namespace}.{tbl} + USING DELTA + LOCATION '{delta_dir}' + """ + ) -def parse_entries(local_xml_path, target_date, batch_size, spark, tables, output_dir, namespace, current_timestamp): - """ - Parse UniProt XML entries, write to Delta Lake in batches - Return (processed_entry_count, skipped_entry_count). 
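Before the reworked save_batches_to_delta below, a hedged sketch of the Delta write options it leans on (mergeSchema for appends, overwriteSchema for overwrites); this assumes pyspark and delta-spark are installed, and the path and rows are illustrative:

from delta import configure_spark_with_delta_pip
from pyspark.sql import SparkSession

builder = (
    SparkSession.builder.appName("delta-write-sketch")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)
spark = configure_spark_with_delta_pip(builder).getOrCreate()

df = spark.createDataFrame([("CDM:example-1", "protein")], ["entity_id", "entity_type"])

# append keeps existing rows; mergeSchema lets later batches add nullable columns
df.write.format("delta").mode("append").option("mergeSchema", "true").save("/tmp/entities_delta_sketch")

# overwrite replaces the data; overwriteSchema allows the stored schema itself to change
df.write.format("delta").mode("overwrite").option("overwriteSchema", "true").save("/tmp/entities_delta_sketch")

spark.stop()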
+def save_batches_to_delta( + spark: SparkSession, + tables: dict[str, tuple[list, StructType]], + output_dir: str, + namespace: str, + mode: str = "append", +) -> None: + for table_name, (records, schema) in tables.items(): + if not records: + continue - """ + # delta_dir = os.path.abspath(os.path.join(output_dir, namespace, table_name)) + delta_dir = delta_table_path(output_dir, namespace, table_name) + df = spark.createDataFrame(records, schema) + writer = df.write.format("delta").mode(mode) + + if mode == "append": + writer = writer.option("mergeSchema", "true") + if mode == "overwrite": + writer = writer.option("overwriteSchema", "true") + + writer.save(delta_dir) + + +## =============================== MAIN PARSING LOOP ================================= +def parse_entries( + local_xml_path: str, + target_date: str | None, + batch_size: int, + spark: SparkSession, + tables: dict[str, tuple[list, StructType]], + output_dir: str, + namespace: str, + current_timestamp: str, + accession_to_entity_id: dict[str, str], + entity_id_to_created: dict[str, str], + mode: str, +) -> tuple[int, int]: target_date_dt = None - - # Convert target_date string to datetime for comparison if provided if target_date: try: target_date_dt = datetime.datetime.strptime(target_date, "%Y-%m-%d") + logger.info("Target date filter enabled: >= %s", target_date) except Exception: - print(f"Invalid target date is {target_date}") + logger.warning("Invalid target date provided: %s (ignored)", target_date) + target_date_dt = None entry_count, skipped = 0, 0 - # Iterate over each element in the XML file for entry_elem in stream_uniprot_xml(local_xml_path): try: - # Get the modification date of the entry mod_date = entry_elem.attrib.get("modified") or entry_elem.attrib.get("updated") - # If target_date is set, skip entries older than target_date + if target_date_dt and mod_date: try: entry_date_dt = datetime.datetime.strptime(mod_date[:10], "%Y-%m-%d") @@ -643,110 +742,197 @@ def parse_entries(local_xml_path, target_date, batch_size, spark, tables, output skipped += 1 continue - # Extract main accession (skip entry if not present) - main_accession_elem = entry_elem.find("u:accession", NS) - if main_accession_elem is None or main_accession_elem.text is None: + main_accession_elem = entry_elem.find("ns:accession", NS) + if main_accession_elem is None or not main_accession_elem.text: skipped += 1 continue - # Generate a unique CDM ID (UUID) for this entry - cdm_id = generate_cdm_id() + accession = main_accession_elem.text.strip() + + cdm_id = accession_to_entity_id.get(accession) or stable_cdm_id_from_uniprot_accession(accession) + prev_created = entity_id_to_created.get(cdm_id) + + record = parse_uniprot_entry(entry_elem, cdm_id, current_timestamp, prev_created=prev_created) - # Parse all sub-objects: entity, identifiers, names, protein, associations, publications - record = parse_uniprot_entry(entry_elem, cdm_id, current_timestamp) tables["entities"][0].append(record["entity"]) tables["identifiers"][0].extend(record["identifiers"]) tables["names"][0].extend(record["names"]) if record["protein"]: tables["proteins"][0].append(record["protein"]) + tables["associations"][0].extend(record["associations"]) - tables["publications"][0].extend( - {"entity_id": record["entity"]["entity_id"], "publication": pub} for pub in record["publications"] - ) + tables["cross_references"][0].extend(record["cross_references"]) + + for pub in record["publications"]: + tables["publications"][0].append( + { + "entity_id": cdm_id, + "publication": 
pub, + } + ) entry_count += 1 - # Write batch to Delta and clear lists every batch_size entries + if entry_count % batch_size == 0: - save_batches_to_delta(spark, tables, output_dir, namespace) + save_batches_to_delta(spark, tables, output_dir, namespace, mode=mode) for v in tables.values(): v[0].clear() - print(f"{entry_count} entries processed and saved") - except Exception as e: - # If any error occurs in parsing this entry, skip it and count - print(f"Error parsing entry: {e}") + logger.info("Processed and saved %d entries...", entry_count) + + except Exception: + logger.exception("Error parsing UniProt entry, skipping") skipped += 1 - continue - # write remaining records - save_batches_to_delta(spark, tables, output_dir, namespace) + save_batches_to_delta(spark, tables, output_dir, namespace, mode=mode) return entry_count, skipped -def ingest_uniprot(xml_url, output_dir, namespace, target_date=None, batch_size=5000) -> None: - # Generate the timestamp for the current run +def ingest_uniprot( + xml_url: str, + output_dir: str, + namespace: str, + target_date: str | None = None, + batch_size: int = 5000, + mode: str = "append", + overwrite_download: bool = False, +) -> None: current_timestamp = datetime.datetime.now(datetime.UTC).isoformat() - # Prepare local XML - local_xml_path = prepare_local_xml(xml_url, output_dir) - - # Save data source meta information + local_xml_path = prepare_local_xml(xml_url, output_dir, overwrite=overwrite_download) save_datasource_record(xml_url, output_dir) - # Get Spark and the existing CDM entity_id spark = get_spark_session(namespace) + if mode == "append": + accession_to_entity_id, entity_id_to_created = load_existing_maps(spark, output_dir, namespace) + else: + accession_to_entity_id, entity_id_to_created = {}, {} - # Define the table structure (batch storage) - entities, identifiers, names, proteins, associations, publications = ( - [], - [], - [], - [], - [], - [], - ) - tables = { + # accession_to_entity_id, entity_id_to_created = load_existing_maps(spark, output_dir, namespace) + + entities: list[dict] = [] + identifiers: list[dict] = [] + names: list[dict] = [] + proteins: list[dict] = [] + associations: list[dict] = [] + cross_references: list[dict] = [] + publications: list[dict] = [] + + tables: dict[str, tuple[list, StructType]] = { "entities": (entities, schema_entities), "identifiers": (identifiers, schema_identifiers), "names": (names, schema_names), "proteins": (proteins, schema_proteins), "associations": (associations, schema_associations), + "cross_references": (cross_references, schema_cross_references), "publications": (publications, schema_publications), } - # Main cycle processing, transfer to current timestamp + ensure_tables_registered( + spark, + output_dir, + namespace, + [ + "entities", + "identifiers", + "names", + "proteins", + "associations", + "cross_references", + "publications", + ], + ) + + logger.info( + "Starting UniProt ingestion: xml=%s | namespace=%s | mode=%s | batch_size=%d", + xml_url, + namespace, + mode, + batch_size, + ) + entry_count, skipped = parse_entries( - local_xml_path, target_date, batch_size, spark, tables, output_dir, namespace, current_timestamp + local_xml_path=local_xml_path, + target_date=target_date, + batch_size=batch_size, + spark=spark, + tables=tables, + output_dir=output_dir, + namespace=namespace, + current_timestamp=current_timestamp, + accession_to_entity_id=accession_to_entity_id, + entity_id_to_created=entity_id_to_created, + mode=mode, ) - print(f"All entries processed 
({entry_count}), skipped {skipped}, writing complete tables.") - spark.sql(f"SHOW TABLES IN {namespace}").show() - spark.sql(f"SELECT COUNT(*) FROM {namespace}.entities").show() - # make sql test in entity table - spark.sql(f"SELECT * FROM {namespace}.entities LIMIT 10").show(truncate=False) + logger.info("Completed parsing UniProt XML. processed=%d skipped=%d", entry_count, skipped) - spark.stop() + logger.info("Verifying Delta tables in namespace `%s`", namespace) + spark.sql(f"SHOW TABLES IN {namespace}").show(truncate=False) - print(f"All Delta tables are created and registered in Spark SQL under `{namespace}`.") + for tbl in [ + "entities", + "identifiers", + "names", + "proteins", + "associations", + "cross_references", + "publications", + ]: + logger.info("Verifying table: %s.%s", namespace, tbl) + spark.sql(f"SELECT COUNT(*) AS row_count FROM {namespace}.{tbl}").show(truncate=False) + spark.sql(f"SELECT * FROM {namespace}.{tbl} LIMIT 5").show(truncate=False) + + spark.stop() + logger.info("Done") +# ================================ CLI ================================= @click.command() @click.option("--xml-url", required=True, help="URL to UniProt XML (.xml.gz)") -@click.option("--output-dir", default="output", help="Output directory for Delta tables") -@click.option("--namespace", default="uniprot_db", help="Delta Lake database name") +@click.option( + "--output-dir", + default="output", + show_default=True, + help="Output directory for Delta tables", +) +@click.option( + "--namespace", + default="uniprot_db", + show_default=True, + help="Delta Lake database name", +) @click.option( "--target-date", default=None, help="Only process entries modified/updated since this date (YYYY-MM-DD)", ) -@click.option("--batch-size", default=5000, help="Batch size for writing Delta tables") -def main(xml_url, output_dir, namespace, target_date, batch_size) -> None: +@click.option( + "--batch-size", + default=5000, + show_default=True, + help="Batch size for writing Delta tables", +) +@click.option( + "--mode", + type=click.Choice(["append", "overwrite"]), + default="append", + show_default=True, +) +@click.option( + "--overwrite-download", + is_flag=True, + help="Force re-download XML even if file exists", +) +def main(xml_url, output_dir, namespace, target_date, batch_size, mode, overwrite_download): ingest_uniprot( xml_url=xml_url, output_dir=output_dir, namespace=namespace, target_date=target_date, batch_size=int(batch_size), + mode=mode, + overwrite_download=overwrite_download, ) diff --git a/src/cdm_data_loader_utils/parsers/uniref.py b/src/cdm_data_loader_utils/parsers/uniref.py index da3327c..6e1cdf3 100644 --- a/src/cdm_data_loader_utils/parsers/uniref.py +++ b/src/cdm_data_loader_utils/parsers/uniref.py @@ -1,5 +1,5 @@ """ -UniRef XML Cluster ETL Pipeline. +UniRef XML Cluster ETL Pipeline This script downloads a UniRef100 XML file, parses cluster and member information, and writes the extracted data into Delta Lake tables for downstream analysis. @@ -22,7 +22,12 @@ --output-dir cdm-data-loader-utils/output/uniref100_clusters \ --batch-size 1000 -**Parameters:** +python3 uniref.py \ + --ftp-url https://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref100/uniref100.xml.gz \ + --output-dir output_uniref \ + --batch-size 1000 + +Parameters: - --ftp-url: UniProt FTP URL to the UniRef100 gzipped XML file. - --output-dir: Output directory where Delta tables will be written. - --batch-size: Number of UniRef entries to process. 
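One note before the UniRef changes below: the new id helper derives CDM ids deterministically with UUIDv5, so re-importing the same cluster accession yields the same entity id, which keeps appends idempotent. A tiny self-contained illustration with a hypothetical accession:

import uuid

def cdm_id(value: str, prefix: str = "CDM:") -> str:
    # same scheme as cdm_entity_id below: UUIDv5 in the OID namespace
    return f"{prefix}{uuid.uuid5(uuid.NAMESPACE_OID, value)}"

assert cdm_id("UniRef100_P12345") == cdm_id("UniRef100_P12345")  # stable across runs
assert cdm_id("UniRef100_P12345") != cdm_id("UniRef100_Q67890")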
@@ -30,118 +35,211 @@ """ import gzip + +### ===== logging setup ===== ### +import logging import os import uuid import xml.etree.ElementTree as ET -from datetime import datetime -from urllib.request import URLError, urlretrieve +from datetime import UTC, datetime +from pathlib import Path +from urllib.error import URLError +from urllib.request import urlretrieve import click from delta import configure_spark_with_delta_pip from pyspark.sql import SparkSession from pyspark.sql.types import StringType, StructField, StructType +from cdm_data_loader_utils.parsers.xml_utils import get_text, parse_properties + +logger = logging.getLogger(__name__) + + +UNIREF_NS = {"ns": "http://uniprot.org/uniref"} +DATA_SOURCE = "UniRef 100/90/50" + + +PREFIX_TRANSLATION = { + "UniProtKB ID": "UniProt", + "UniProtKB accession": "UniProt", + "UniParc ID": "UniParc", + "UniRef90 ID": "UniRef90", + "UniRef50 ID": "UniRef50", + "UniRef100 ID": "UniRef100", +} -# Generate a unique CDM entity_id based on accession -def cdm_entity_id(accession) -> str | None: - if not accession: - return None - uuid_part = uuid.uuid5(uuid.NAMESPACE_OID, accession) - return f"CDM:{uuid_part}" +def generate_dbxref(db: str, acc: str) -> str: + """Generate a database reference that uses BioRegistry prefixes.""" + return f"{PREFIX_TRANSLATION[db]}:{acc}" -# Download a file from the specified URL to the local path if it does not already exist -def download_file(url, local_path) -> None: + +def cdm_entity_id(value: str, prefix: str = "CDM:") -> str: """ - If the file is already present at local, the function does nothing. - If the download fails, any partially downloaded file will be removed. + Deterministic UUIDv5-based CDM id generator. + + value must be non-empty. """ - if not os.path.exists(local_path): - print(f"Downloading from URL link: {url}") + if not value: + raise ValueError("Value must be a non-empty string") + + return f"{prefix}{uuid.uuid5(uuid.NAMESPACE_OID, value)}" + + +# timestamp helper +def get_timestamps( + uniref_id: str, + existing_created: dict[str, str], + now: datetime | None = None, +) -> tuple[str, str]: + """ + Return (updated_time, created_time) for a given UniRef cluster ID. + - All timestamps are UTC ISO8601 with timezone (e.g., 2026-01-05T12:34:56+00:00) + - uniref_id must be non-empty (schema invariant) + """ + if not uniref_id: + raise ValueError("get_timestamps: uniref_id must be a non-empty string") + + now_dt = now or datetime.now(UTC) + updated_time = now_dt.isoformat(timespec="seconds") + + created_time = existing_created.get(uniref_id) or updated_time + return updated_time, created_time + + +def download_file(url: str, local_path: str, overwrite: bool = False) -> str: + """ + Download URL -> local_path. + - Atomic: downloads to .part then os.replace + - Idempotent: skips if exists unless overwrite=True + Returns the final local_path. 
+ """ + dst = Path(local_path) + dst.parent.mkdir(parents=True, exist_ok=True) + + if dst.exists() and not overwrite: + logger.info("File already exists, skip download: %s", dst) + return str(dst) + + tmp = dst.with_suffix(dst.suffix + ".part") + + try: + if tmp.exists(): + tmp.unlink() + except Exception: + logger.exception("Failed to remove partial download: %s", tmp) + + logger.info("Downloading %s -> %s", url, dst) + try: + urlretrieve(url, str(tmp)) + os.replace(tmp, dst) + logger.info("Download complete: %s", dst) + return str(dst) + except Exception: + logger.exception("Failed to download %s", url) try: - urlretrieve(url, local_path) - print("Download completed!") - except Exception as e: - print(f"Failed to download {url}: {e}") - if os.path.exists(local_path): - os.remove(local_path) - raise - else: - print(f"File already exists: {local_path}") + if tmp.exists(): + tmp.unlink() + except Exception: + logger.exception("Failed to cleanup tmp file: %s", tmp) + raise -# Load mapping from data_source_entity_id to created timestamp from Delta table -def load_existing_created(spark, entity_table): - existing_created = {} +def load_existing_created(spark: SparkSession, entity_table: str | None) -> dict[str, str]: + """ + Load mapping data_source_entity_id -> created timestamp from the Entity Delta table. + Returns an empty dict if the table does not exist. + """ + existing_created: dict[str, str] = {} if not entity_table: - print("Entity table path not specified.") + logger.warning("Entity table path not specified.") return existing_created try: df = spark.read.format("delta").load(entity_table).select("data_source_entity_id", "created") existing_created = {row["data_source_entity_id"]: row["created"] for row in df.collect()} - print(f"Loaded {len(existing_created)} existing created timestamps.") + logger.info(f"Loaded {len(existing_created)} existing created timestamps from {entity_table}.") except Exception as e: - print(f"No existing Delta table found at {entity_table}. Starting fresh. ({e.__class__.__name__})") + logger.warning(f"No existing Delta table found at {entity_table}. Starting fresh. ({e.__class__.__name__})") return existing_created ##### -------------- List utility function --------------- ##### - - -# Helper function to extract basic cluster info from XML entry element -def extract_cluster(elem, ns): - cluster_id = f"CDM:{uuid.uuid4()}" +def extract_cluster( + elem: ET.Element, + ns: dict[str, str], + uniref_id: str, +) -> tuple[str, str]: + """ + Extract a deterministic CDM cluster_id and the UniRef cluster name. + """ + cluster_id = cdm_entity_id(value=uniref_id) or f"CDM:{uuid.uuid4()}" name_elem = elem.find("ns:name", ns) - name = name_elem.text if name_elem is not None else "UNKNOWN" - return cluster_id, name - + name = get_text(elem=name_elem, default="UNKNOWN") or "UNKNOWN" -# Returns tuple of (updated_time, created_time) -def get_timestamps(uniref_id, existing_created, now=None): - now_dt = now or datetime.now() - formatted_now = now_dt.strftime("%Y-%m-%dT%H:%M:%S") - created = existing_created.get(uniref_id) - created_time = (created.split(".")[0] if "." 
in created else created) if created else formatted_now - return formatted_now, created_time + return cluster_id, name -# Extract UniProtKB accession and is_seed status from a dbReference element -def get_accession_and_seed(dbref, ns): +def get_accession_and_seed(dbref: ET.Element | None, ns: dict[str, str]) -> tuple[str | None, bool]: + """ + Extract UniProtKB accession and is_seed status from a dbReference element. + """ if dbref is None: return None, False - prop_elems = dbref.findall("ns:property", ns) - props = {} - for prop in prop_elems: - t = prop.attrib["type"] - v = prop.attrib["value"] - props[t] = v + props = parse_properties(dbref, ns) + + raw_acc = props.get("UniProtKB accession") + if isinstance(raw_acc, list): + accession = raw_acc[0] if raw_acc else None + else: + accession = raw_acc # string or None + + raw_seed = props.get("isSeed") + if isinstance(raw_seed, list): + is_seed = bool(raw_seed) and raw_seed[0].lower() == "true" + else: + is_seed = raw_seed is not None and raw_seed.lower() == "true" - acc = props.get("UniProtKB accession") or dbref.attrib.get("id") - is_seed = props.get("isSeed", "false").lower() == "true" - return acc, is_seed + return accession, is_seed -# Add both representative and other cluster members into cluster_member_data list -def add_cluster_members(cluster_id, repr_db, elem, cluster_member_data, ns) -> None: - dbrefs = [] +def add_cluster_members( + cluster_id: str, + repr_db: ET.Element | None, + elem: ET.Element, + cluster_member_rows: list[tuple[str, str, str, str, str]], + ns: dict[str, str], +) -> None: + """Populate cluster_member_rows with representative, member records.""" + dbrefs: list[tuple[ET.Element, bool]] = [] if repr_db is not None: dbrefs.append((repr_db, True)) for mem in elem.findall("ns:member/ns:dbReference", ns): dbrefs.append((mem, False)) for dbref, is_representative in dbrefs: - acc, is_seed = get_accession_and_seed(dbref, ns) - if acc: - member_entity_id = cdm_entity_id(acc) - cluster_member_data.append( - (cluster_id, member_entity_id, str(is_representative).lower(), str(is_seed).lower(), "1.0") + accession, is_seed = get_accession_and_seed(dbref, ns) + if not accession: + continue + + member_entity_id = cdm_entity_id(accession) + if not member_entity_id: + continue + + cluster_member_rows.append( + ( + cluster_id, + member_entity_id, + str(is_representative).lower(), + str(is_seed).lower(), + "1.0", # score placeholder ) + ) -# Extract cross-references (UniRef90/50/UniParc) from a dbReference element def extract_cross_refs(dbref, cross_reference_data, ns) -> None: if dbref is None: return @@ -153,83 +251,95 @@ def extract_cross_refs(dbref, cross_reference_data, ns) -> None: cross_reference_data.append((entity_id, i, props[i])) -##### -------------- Parse Uniref XML --------------- ##### +def parse_uniref_entry( + elem: ET.Element, existing_created: dict[str, str], ns: dict[str, str] +) -> dict[str, list[tuple]]: + """ + Parse a single UniRef element into CDM-friendly row tuples. 
+ """ + cluster_rows: list[tuple[str, str, str, str | None, str]] = [] + entity_rows: list[tuple[str, str, str, str, str, str]] = [] + member_rows: list[tuple[str, str, str, str, str]] = [] + xref_rows: list[tuple[str, str, str]] = [] + + uniref_id = elem.attrib.get("id") or "" + + cluster_id, name = extract_cluster(elem, ns, uniref_id) + updated_time, created_time = get_timestamps(uniref_id, existing_created) + + # Cluster table + cluster_rows.append( + ( + cluster_id, + name, + "protein", + None, + DATA_SOURCE, + ) + ) + # Entity table + entity_rows.append( + ( + cluster_id, + uniref_id, + "Cluster", + DATA_SOURCE, + updated_time, + created_time, + ) + ) -def parse_uniref_xml(local_gz, batch_size, existing_created): - """ - Parse UniRef XML (gzipped) and extract cluster, entity, cluster member, UniProtKB member, and cross-reference info. + # Cross references from representative and members + repr_db = elem.find("ns:representativeMember/ns:dbReference", ns) + extract_cross_refs(repr_db, xref_rows, ns) + + for mem in elem.findall("ns:member/ns:dbReference", ns): + extract_cross_refs(mem, xref_rows, ns) - Args: - local_gz (str): Local gzipped UniRef XML path. - batch_size (int): Maximum number of entries to parse. - existing_created (dict): Mapping from UniRef cluster ID to 'created' timestamp for idempotent imports. + # Cluster members (representative + members) + add_cluster_members(cluster_id, repr_db, elem, member_rows, ns) - Returns: - dict: Dictionary with lists for each CDM table + return { + "cluster_data": cluster_rows, + "entity_data": entity_rows, + "cluster_member_data": member_rows, + "cross_reference_data": xref_rows, + } + + +##### -------------- Parse Uniref XML --------------- ##### +def parse_uniref_xml(local_gz: str, batch_size: int, existing_created: dict[str, str]) -> dict[str, list[tuple]]: + """ + Stream-parse UniRef XML (gzipped) and extract CDM-like row tuples. 
""" - ns = {"ns": "http://uniprot.org/uniref"} # Namespace for XML parsing + ns = UNIREF_NS entry_count = 0 - # Initialize lists to collect parsed rows for different tables - cluster_data = [] - entity_data = [] - cluster_member_data = [] - cross_reference_data = [] + cluster_data: list[tuple] = [] + entity_data: list[tuple] = [] + cluster_member_data: list[tuple] = [] + cross_reference_data: list[tuple] = [] with gzip.open(local_gz, "rb") as f: - # Stream parse the XML to avoid memory issues with big files context = ET.iterparse(f, events=("end",)) for _, elem in context: - if elem.tag.endswith("entry"): - # Cluster basic info - cluster_id, name = extract_cluster(elem, ns) - - # Get UniRef cluster id and timestamps - uniref_id = elem.attrib.get("id") - updated_time, created_time = get_timestamps(uniref_id, existing_created) - - # Populate Cluster and Entity table data - cluster_data.append( - ( - cluster_id, # cluster_id - name, # cluster name - "protein", # entity_type (fixed value) - None, # description (not present) - "UniRef 100", # protocol_id - ) - ) - - entity_data.append( - ( - cluster_id, # entity_id (matches cluster_id) - uniref_id, # data_source_entity_id (UniRef100_xxx) - "Cluster", # entity_type - "UniRef 100", # data_source - updated_time, # updated - created_time, # created - ) - ) - - # Extract UniProtKB member attributes and cross-references - repr_db = elem.find("ns:representativeMember/ns:dbReference", ns) - extract_cross_refs(repr_db, cross_reference_data, ns) - - for mem in elem.findall("ns:member/ns:dbReference", ns): - extract_cross_refs(mem, cross_reference_data, ns) - - # ClusterMember table (representative + members) - add_cluster_members(cluster_id, repr_db, elem, cluster_member_data, ns) - - # Batch size limit - entry_count += 1 - if entry_count >= batch_size: - break - - # Release element to save memory - elem.clear() - - print(f"Parsed {entry_count} clusters") + if not elem.tag.endswith("entry"): + continue + + parsed = parse_uniref_entry(elem, existing_created, ns) + cluster_data.extend(parsed["cluster_data"]) + entity_data.extend(parsed["entity_data"]) + cluster_member_data.extend(parsed["cluster_member_data"]) + cross_reference_data.extend(parsed["cross_reference_data"]) + + entry_count += 1 + if entry_count >= batch_size: + break + + elem.clear() + + logger.info(f"Parsed {entry_count} clusters") return { "cluster_data": cluster_data, "entity_data": entity_data, @@ -238,10 +348,8 @@ def parse_uniref_xml(local_gz, batch_size, existing_created): } -##### -------------- Save dalta table and print the preview --------------- ##### - - -def save_delta_tables(spark, output_dir, data_dict) -> None: +##### -------------- Save delta table and print the preview --------------- ##### +def save_delta_tables(spark, output_dir, data_dict): # Cluster cluster_schema = StructType( [ @@ -255,7 +363,7 @@ def save_delta_tables(spark, output_dir, data_dict) -> None: cluster_df = spark.createDataFrame(data_dict["cluster_data"], cluster_schema) cluster_df.write.format("delta").mode("overwrite").save(os.path.join(output_dir, "Cluster")) - print(f"Cluster Delta table written to: {os.path.join(output_dir, 'Cluster')}") + logger.info(f"Cluster Delta table written to: {os.path.join(output_dir, 'Cluster')}") # Entity entity_schema = StructType( @@ -272,7 +380,7 @@ def save_delta_tables(spark, output_dir, data_dict) -> None: entity_df = spark.createDataFrame(data_dict["entity_data"], entity_schema) entity_table_path = os.path.join(output_dir, "Entity") 
entity_df.write.format("delta").mode("overwrite").save(entity_table_path) - print(f"Entity Delta table written to: {entity_table_path}") + logger.info(f"Entity Delta table written to: {entity_table_path}") # ClusterMember cluster_member_schema = StructType( @@ -288,7 +396,7 @@ def save_delta_tables(spark, output_dir, data_dict) -> None: cluster_member_df = spark.createDataFrame(data_dict["cluster_member_data"], cluster_member_schema) cluster_member_path = os.path.join(output_dir, "ClusterMember") cluster_member_df.write.format("delta").mode("overwrite").save(cluster_member_path) - print(f"ClusterMember Delta table written to: {cluster_member_path}") + logger.info(f"ClusterMember Delta table written to: {cluster_member_path}") # CrossReference cross_reference_schema = StructType( @@ -302,22 +410,22 @@ def save_delta_tables(spark, output_dir, data_dict) -> None: cross_reference_df = spark.createDataFrame(data_dict["cross_reference_data"], cross_reference_schema) cross_reference_path = os.path.join(output_dir, "CrossReference") cross_reference_df.write.format("delta").mode("overwrite").save(cross_reference_path) - print(f"CrossReference Delta table written to: {cross_reference_path}") + logger.info(f"CrossReference Delta table written to: {cross_reference_path}") # Previews - print("Sample Clusters:") + logger.info("Sample Clusters:") cluster_df.createOrReplaceTempView("Cluster") spark.sql("SELECT * FROM Cluster LIMIT 20").show(truncate=False) - print("Sample Entities:") + logger.info("Sample Entities:") entity_df.createOrReplaceTempView("Entity") spark.sql("SELECT * FROM Entity LIMIT 20").show(truncate=False) - print("Sample ClusterMembers:") + logger.info("Sample ClusterMembers:") cluster_member_df.createOrReplaceTempView("ClusterMember") spark.sql("SELECT * FROM ClusterMember LIMIT 20").show(truncate=False) - print("Sample CrossReferences:") + logger.info("Sample CrossReferences:") cross_reference_df.createOrReplaceTempView("CrossReference") spark.sql("SELECT * FROM CrossReference LIMIT 20").show(truncate=False) @@ -327,17 +435,27 @@ def build_spark_session(): builder = ( SparkSession.builder.appName("UniRef Cluster Extractor") .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") - .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") + .config( + "spark.sql.catalog.spark_catalog", + "org.apache.spark.sql.delta.catalog.DeltaCatalog", + ) ) return configure_spark_with_delta_pip(builder).getOrCreate() -# Click command-line interface for parameter parsing @click.command() @click.option("--ftp-url", required=True, help="FTP URL to UniRef100 XML file") @click.option("--output-dir", required=True, help="Output directory for Delta table") @click.option("--batch-size", default=1000, help="Number of UniRef entries to parse (limit)") -def main(ftp_url, output_dir, batch_size) -> None: +def main(ftp_url, output_dir, batch_size): + # set up logging in CLI context + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] (%(name)s:%(lineno)d %(message)s", + ) + + logger.info("Starting UniRef100/90/50 Import Pipeline") + # Set local path for downloaded gzipped XML file local_gz = os.path.join("/tmp", os.path.basename(ftp_url)) @@ -345,23 +463,31 @@ def main(ftp_url, output_dir, batch_size) -> None: try: download_file(ftp_url, local_gz) except URLError as e: - print(f"Error! Cannot download file: {e.reason}") + logger.error(f"Error! 
Cannot download file: {e.reason}") return # Start Spark session with Delta Lake support + logger.info("Building Spark session:") spark = build_spark_session() # Load existing entity creation timestamps - entity_table_path = os.path.join(output_dir, "Entity") - existing_created = load_existing_created(spark, entity_table_path) + try: + entity_table_path = os.path.join(output_dir, "Entity") + existing_created = load_existing_created(spark, entity_table_path) + + # Parse the UniRef XML and extract all CDM table data + logger.info("Parsing UniRef XML:") + data_dict = parse_uniref_xml(local_gz, batch_size, existing_created) - # Parse the UniRef XML and extract all CDM table data - data_dict = parse_uniref_xml(local_gz, batch_size, existing_created) + # Write parsed data to Delta tables in output directory + logger.info("Saving Delta tables:") + save_delta_tables(spark, output_dir, data_dict) - # Write parsed data to Delta tables in output directory - save_delta_tables(spark, output_dir, data_dict) + logger.info("UniRef100/90/50 Import Pipeline completed successfully.") - spark.stop() + finally: + spark.stop() + logger.info("Spark session stopped.") if __name__ == "__main__": diff --git a/src/cdm_data_loader_utils/parsers/xml_utils.py b/src/cdm_data_loader_utils/parsers/xml_utils.py new file mode 100644 index 0000000..d916799 --- /dev/null +++ b/src/cdm_data_loader_utils/parsers/xml_utils.py @@ -0,0 +1,124 @@ +""" +Shared XML helper utilities used by UniProt and UniRef parsers. + +This module centralizes common operations: +- Safe text extraction +- Safe attribute extraction +- Property parsing +- Evidence / dbReference parsing +- Cleaning dictionaries +- Deduplicating lists +""" + +import xml.etree.ElementTree as ET +from typing import Any + +# ============================================================ +# Basic Safe Accessors +# ============================================================ + + +def get_text(elem: ET.Element | None, default: str | None = None) -> str | None: + """Return elem.text if exists and non-empty.""" + if elem is None: + return default + if elem.text is None: + return default + text = elem.text.strip() + return text if text else default + + +def get_attr(elem: ET.Element | None, name: str, default: str | None = None) -> str | None: + """Return elem.get(name) safely.""" + if elem is None: + return default + val = elem.get(name) + return val.strip() if isinstance(val, str) else default + + +# ============================================================ +# List / Node Finders +# ============================================================ + + +def find_one(elem: ET.Element, xpath: str, ns: dict[str, str]): + """Return first element matching xpath or None.""" + results = elem.findall(xpath, ns) + return results[0] if results else None + + +def find_all_text(elem: ET.Element, xpath: str, ns: dict[str, str]) -> list[str]: + """Return list of text values from xpath matches (deduped).""" + texts = [] + for node in elem.findall(xpath, ns): + txt = get_text(node) + if txt: + texts.append(txt) + return list(dict.fromkeys(texts)) # preserve order, dedupe + + +def safe_list(x) -> list[Any]: + """Convert None → [].""" + if x is None: + return [] + if isinstance(x, list): + return x + return [x] + + +# ============================================================ +# dbReference / property parsing (shared by UniProt + UniRef) +# ============================================================ + + +def parse_properties(dbref: ET.Element | None, ns: dict[str, str]) -> dict[str, list[str]]: + 
""" + Extract key/value pairs from blocks. + """ + if dbref is None: + return {} + props = {} + for prop in dbref.findall("ns:property", ns): + ptype = prop.attrib.get("type") + pval = prop.attrib.get("value") + if ptype and pval: + if ptype not in props: + props[ptype] = [] + props[ptype].append(pval) + return props + + +def parse_db_references(elem: ET.Element, ns: dict[str, str], pub_types=("PubMed", "DOI")): + """ + Generic dbReference parser: + - Identify publication IDs (PubMed, DOI) + - Identify other cross-references (dbType:dbId) + """ + publications = [] + others = [] + + for dbref in elem.findall("ns:dbReference", ns): + db_type = dbref.get("type") + db_id = dbref.get("id") + + if not db_type or not db_id: + continue + + if db_type in pub_types: + publications.append(f"{db_type.upper()}:{db_id}") + else: + others.append(f"{db_type}:{db_id}") + + return publications, others + + +# ============================================================ +# Dict Cleaning +# ============================================================ + + +def clean_dict(d: dict[str, Any]) -> dict[str, Any]: + """ + Remove keys whose value is None or empty list. + """ + return {k: v for k, v in d.items() if v not in (None, [], {})} diff --git a/tests/conftest.py b/tests/conftest.py index 049dbb9..bbd7507 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -7,7 +7,7 @@ from typing import Any import pytest -from pyspark.sql import SparkSession, DataFrame +from pyspark.sql import DataFrame, SparkSession from pyspark.sql.types import ( ArrayType, BooleanType, diff --git a/tests/data/refseq/annotation_report.json b/tests/data/refseq/annotation_report.json new file mode 100644 index 0000000..53cd0d6 --- /dev/null +++ b/tests/data/refseq/annotation_report.json @@ -0,0 +1,105 @@ +{ + "reports": [ + { + "annotation": { + "gene_id": "4156250", + "name": "hypothetical protein", + "gene_type": "protein-coding", + "locus_tag": "MIV001R", + "genomic_regions": [ + { + "gene_range": { + "accession_version": "NC_008187.1", + "range": [ + { + "begin": "2620", + "end": "3066", + "orientation": "plus" + } + ] + } + } + ], + "proteins": [ + { + "accession_version": "YP_654573.1", + "name": "hypothetical protein", + "length": 148 + } + ], + "annotations": [ + { + "assembly_accession": "GCF_000869125.1" + } + ] + }, + "row_id": "1" + }, + { + "annotation": { + "gene_id": "4156251", + "name": "hypodermical protein", + "gene_type": "protein-coding", + "locus_tag": "MIV002R", + "genomic_regions": [ + { + "gene_range": { + "accession_version": "NC_008187.1", + "range": [ + { + "begin": "3603", + "end": "4979", + "orientation": "plus" + } + ] + } + } + ], + "proteins": [ + { + "accession_version": "YP_654574.1", + "name": "hypothetical protein", + "length": 458 + } + ], + "annotations": [ + { + "assembly_accession": "GCF_000869125.1" + } + ] + }, + "row_id": "2" + }, + { + "annotation": { + "gene_id": "4156252", + "name": "very hypothetical protein", + "gene_type": "protein-coding", + "locus_tag": "MIV003R", + "symbol": "kappa-delta-phi", + "genomic_regions": [ + { + "gene_range": { + "accession_version": "NC_008187.1", + "range": [ + { + "begin": "5168", + "end": "5638", + "orientation": "minus" + } + ] + } + } + ], + "proteins": [], + "annotations": [ + { + "assembly_accession": "GCF_000869125.1" + } + ] + }, + "row_id": "3" + } + ], + "total_count": 3 +} diff --git a/tests/data/refseq/annotation_report.parsed.json b/tests/data/refseq/annotation_report.parsed.json new file mode 100644 index 0000000..178e040 --- /dev/null +++ 
b/tests/data/refseq/annotation_report.parsed.json @@ -0,0 +1,239 @@ +{ + "contig": [ + { + "contig_id": "refseq:NC_008187.1", + "hash": null, + "gc_content": null, + "length": null + } + ], + "contig_x_contigcollection": [ + { + "contig_id": "refseq:NC_008187.1", + "contig_collection_id": "insdc.gcf:GCF_000869125.1" + } + ], + "contig_x_feature": [ + { + "contig_id": "refseq:NC_008187.1", + "feature_id": "ncbigene:4156250" + }, + { + "contig_id": "refseq:NC_008187.1", + "feature_id": "ncbigene:4156251" + }, + { + "contig_id": "refseq:NC_008187.1", + "feature_id": "ncbigene:4156252" + } + ], + "contig_x_protein": [ + { + "contig_id": "refseq:NC_008187.1", + "protein_id": "refseq:YP_654573.1" + }, + { + "contig_id": "refseq:NC_008187.1", + "protein_id": "refseq:YP_654574.1" + } + ], + "contigcollection": [ + { + "contig_collection_id": "insdc.gcf:GCF_000869125.1", + "hash": null + } + ], + "contigcollection_x_feature": [ + { + "contig_collection_id": "insdc.gcf:GCF_000869125.1", + "feature_id": "ncbigene:4156250" + }, + { + "contig_collection_id": "insdc.gcf:GCF_000869125.1", + "feature_id": "ncbigene:4156251" + }, + { + "contig_collection_id": "insdc.gcf:GCF_000869125.1", + "feature_id": "ncbigene:4156252" + } + ], + "contigcollection_x_protein": [ + { + "contig_collection_id": "insdc.gcf:GCF_000869125.1", + "protein_id": "refseq:YP_654573.1" + }, + { + "contig_collection_id": "insdc.gcf:GCF_000869125.1", + "protein_id": "refseq:YP_654574.1" + } + ], + "feature_x_protein": [ + { + "feature_id": "ncbigene:4156250", + "protein_id": "refseq:YP_654573.1" + }, + { + "feature_id": "ncbigene:4156251", + "protein_id": "refseq:YP_654574.1" + } + ], + "feature": [ + { + "feature_id": "ncbigene:4156250", + "hash": null, + "cds_phase": null, + "e_value": null, + "end": 3066, + "p_value": null, + "start": 2620, + "strand": "positive", + "source_database": "ncbigene", + "protocol_id": null, + "type": "protein-coding" + }, + { + "feature_id": "ncbigene:4156251", + "hash": null, + "cds_phase": null, + "e_value": null, + "end": 4979, + "p_value": null, + "start": 3603, + "strand": "positive", + "source_database": "ncbigene", + "protocol_id": null, + "type": "protein-coding" + }, + { + "feature_id": "ncbigene:4156251", + "hash": null, + "cds_phase": null, + "e_value": null, + "end": 5638, + "p_value": null, + "start": 5168, + "strand": "negative", + "source_database": "ncbigene", + "protocol_id": null, + "type": "protein-coding" + } + ], + "identifier": [ + { + "entity_id": "insdc.gcf:GCF_000869125", + "identifier": "insdc.gcf:GCF_000869125", + "description": "RefSeq genome ID", + "source": "RefSeq", + "relationship": null + }, + { + "entity_id": "refseq:NC_008187.1", + "identifier": "refseq:NC_008187.1", + "description": "RefSeq assembly ID", + "source": "RefSeq", + "relationship": null + }, + { + "entity_id": "ncbigene:4156250", + "identifier": "ncbigene:4156250", + "description": "NCBI gene ID", + "source": "RefSeq", + "relationship": null + }, + { + "entity_id": "ncbigene:4156251", + "identifier": "ncbigene:4156251", + "description": "NCBI gene ID", + "source": "RefSeq", + "relationship": null + }, + { + "entity_id": "ncbigene:4156252", + "identifier": "ncbigene:4156252", + "description": "NCBI gene ID", + "source": "RefSeq", + "relationship": null + }, + { + "entity_id": "refseq:YP_654573.1", + "identifier": "refseq:YP_654573.1", + "description": "RefSeq protein ID", + "source": "RefSeq", + "relationship": null + } + ], + "name": [ + { + "entity_id": "ncbigene:4156250", + "name": "hypothetical protein", 
+ "description": "RefSeq gene name", + "source": "RefSeq" + }, + { + "entity_id": "ncbigene:4156251", + "name": "hypodermical protein", + "description": "RefSeq gene name", + "source": "RefSeq" + }, + { + "entity_id": "ncbigene:4156252", + "name": "very hypothetical protein", + "description": "RefSeq gene name", + "source": "RefSeq" + }, + { + "entity_id": "ncbigene:4156250", + "name": "MIV001R", + "description": "RefSeq locus tag", + "source": "RefSeq" + }, + { + "entity_id": "ncbigene:4156251", + "name": "MIV002R", + "description": "RefSeq locus tag", + "source": "RefSeq" + }, + { + "entity_id": "ncbigene:4156252", + "name": "MIV003R", + "description": "RefSeq locus tag", + "source": "RefSeq" + }, + { + "entity_id": "ncbigene:4156252", + "name": "kappa-delta-phi", + "description": "RefSeq symbol", + "source": "RefSeq" + }, + { + "entity_id": "refseq:YP_654573.1", + "name": "hypothetical protein", + "description": "RefSeq protein name", + "source": "RefSeq" + }, + { + "entity_id": "refseq:YP_654574.1", + "name": "hypothetical protein", + "description": "RefSeq protein name", + "source": "RefSeq" + } + ], + "protein": [ + { + "protein_id": "refseq:YP_654573.1", + "hash": null, + "description": null, + "evidence_for_existence": null, + "length": null, + "sequence": null + }, + { + "protein_id": "refseq:YP_654574.1", + "hash": null, + "description": null, + "evidence_for_existence": null, + "length": null, + "sequence": null + } + ] +} diff --git a/tests/parsers/refseq_importer/__init__.py b/tests/parsers/refseq_importer/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/parsers/refseq_importer/test_spark_delta.py b/tests/parsers/refseq_importer/test_spark_delta.py index b5cd9d0..0f5751e 100644 --- a/tests/parsers/refseq_importer/test_spark_delta.py +++ b/tests/parsers/refseq_importer/test_spark_delta.py @@ -114,14 +114,12 @@ def test_write_delta_contig_collection_schema(spark) -> None: db = "cdmdb" spark.sql(f"CREATE DATABASE IF NOT EXISTS {db}") - schema = StructType( - [ - StructField("collection_id", StringType(), True), - StructField("contig_collection_type", StringType(), True), - StructField("ncbi_taxon_id", StringType(), True), - StructField("gtdb_taxon_id", StringType(), True), - ] - ) + schema = StructType([ + StructField("collection_id", StringType(), True), + StructField("contig_collection_type", StringType(), True), + StructField("ncbi_taxon_id", StringType(), True), + StructField("gtdb_taxon_id", StringType(), True), + ]) df = spark.createDataFrame( [("C1", "isolate", "NCBITaxon:123", None)], diff --git a/tests/parsers/refseq_importer/test_tables_finalize.py b/tests/parsers/refseq_importer/test_tables_finalize.py index c71911c..d9151fd 100644 --- a/tests/parsers/refseq_importer/test_tables_finalize.py +++ b/tests/parsers/refseq_importer/test_tables_finalize.py @@ -20,12 +20,10 @@ def spark(): # ------------------------------------------------------------------- @pytest.mark.requires_spark def test_list_of_dicts_to_spark(spark) -> None: - schema = StructType( - [ - StructField("a", StringType(), True), - StructField("b", StringType(), True), - ] - ) + schema = StructType([ + StructField("a", StringType(), True), + StructField("b", StringType(), True), + ]) rows = [{"a": "1", "b": "x"}, {"a": "2", "b": "y"}] df = list_of_dicts_to_spark(spark, rows, schema) @@ -40,15 +38,13 @@ def test_list_of_dicts_to_spark(spark) -> None: @pytest.mark.requires_spark def test_finalize_tables_basic(spark) -> None: # ---------- entity ---------- - e_schema = StructType( 
- [ - StructField("entity_id", StringType(), True), - StructField("entity_type", StringType(), True), - StructField("data_source", StringType(), True), - StructField("created", StringType(), True), - StructField("updated", StringType(), True), - ] - ) + e_schema = StructType([ + StructField("entity_id", StringType(), True), + StructField("entity_type", StringType(), True), + StructField("data_source", StringType(), True), + StructField("created", StringType(), True), + StructField("updated", StringType(), True), + ]) e1 = spark.createDataFrame( [Row(entity_id="E1", entity_type="genome", data_source="RefSeq", created="2020", updated="2021")], @@ -60,14 +56,12 @@ def test_finalize_tables_basic(spark) -> None: ) # ---------- contig_collection (schema REQUIRED due to None!) ---------- - coll_schema = StructType( - [ - StructField("collection_id", StringType(), True), - StructField("contig_collection_type", StringType(), True), - StructField("ncbi_taxon_id", StringType(), True), - StructField("gtdb_taxon_id", StringType(), True), - ] - ) + coll_schema = StructType([ + StructField("collection_id", StringType(), True), + StructField("contig_collection_type", StringType(), True), + StructField("ncbi_taxon_id", StringType(), True), + StructField("gtdb_taxon_id", StringType(), True), + ]) c1 = spark.createDataFrame( [ diff --git a/tests/parsers/test_annotation_parse.py b/tests/parsers/test_annotation_parse.py new file mode 100644 index 0000000..0173504 --- /dev/null +++ b/tests/parsers/test_annotation_parse.py @@ -0,0 +1,767 @@ +### pytest tests/parsers/test_annotation_parse.py + +import json +from pathlib import Path + +import pytest +from pyspark.sql import SparkSession + +from src.cdm_data_loader_utils.parsers.annotation_parse import parse_annotation_data +from src.cdm_data_loader_utils.parsers.kbase_cdm_pyspark import schema as cdm_schemas + +from tests.validation.assertions import ( + assertDataFrameEqual, + assertDataFrameSchemaEqual, +) + +from src.cdm_data_loader_utils.parsers.annotation_parse import ( + apply_prefix, + load_contig_collection_x_feature, + load_contig_collection_x_protein, + load_contig_x_contig_collection, + load_contigs, + load_feature_records, + load_feature_x_protein, + load_identifiers, + load_names, + parse_annotation_data, + to_int, +) +from tests.conftest import TEST_NS + + +@pytest.mark.parametrize( + "input_data, expected_output", + [ + ( + { + "reports": [ + { + "annotation": { + "gene_id": "1234", + "name": "hypothetical protein", + "relationship": "RefSeq gene symbol", + } + } + ] + }, + [ + ( + "ncbigene:1234", + "1234", + "hypothetical protein", + "RefSeq", + "RefSeq gene symbol", + ) + ], + ), + ( + {"reports": [{"annotation": {"gene_id": "5678", "name": "some protein"}}]}, + [("ncbigene:5678", "5678", "some protein", "RefSeq", None)], + ), + ( + { + "reports": [ + { + "annotation": { + "name": "no gene id here", + "relationship": "RefSeq locus tag", + } + } + ] + }, + [], + ), + ], +) +def test_load_identifiers(input_data, expected_output): + result = load_identifiers(input_data) + assert result == expected_output + + +@pytest.mark.parametrize( + "input_data, expected_output", + [ + # Case 1: all name fields present + ( + { + "reports": [ + { + "annotation": { + "gene_id": "1234", + "symbol": "abc", + "name": "ABC protein", + "locus_tag": "LTG_1234", + } + } + ] + }, + [ + ("ncbigene:1234", "abc", "RefSeq gene symbol", "RefSeq"), + ("ncbigene:1234", "ABC protein", "RefSeq gene name", "RefSeq"), + ("ncbigene:1234", "LTG_1234", "RefSeq locus tag", "RefSeq"), + 
], + ), + # Case 2: only gene_name present + ( + {"reports": [{"annotation": {"gene_id": "5678", "name": "Hypothetical protein"}}]}, + [ + ( + "ncbigene:5678", + "Hypothetical protein", + "RefSeq gene name", + "RefSeq", + ) + ], + ), + # Case 3: no gene_id + ( + {"reports": [{"annotation": {"name": "Unnamed", "symbol": "XYZ"}}]}, + [], + ), + # Case 4: only locus_tag present + ( + {"reports": [{"annotation": {"gene_id": "8888", "locus_tag": "LTG_8888"}}]}, + [("ncbigene:8888", "LTG_8888", "RefSeq locus tag", "RefSeq")], + ), + # Case 5: multiple reports + ( + { + "reports": [ + {"annotation": {"gene_id": "1001", "symbol": "DEF"}}, + {"annotation": {"gene_id": "1002", "name": "DEF protein"}}, + ] + }, + [ + ("ncbigene:1001", "DEF", "RefSeq gene symbol", "RefSeq"), + ("ncbigene:1002", "DEF protein", "RefSeq gene name", "RefSeq"), + ], + ), + ], +) +def test_load_names(input_data, expected_output): + result = load_names(input_data) + assert sorted(result) == sorted(expected_output) + + +@pytest.mark.parametrize( + "input_data, expected_output", + [ + # Case 1: basic valid input with plus strand + ( + { + "reports": [ + { + "annotation": { + "gene_id": "1234", + "genomic_regions": [ + { + "gene_range": { + "range": [ + { + "begin": "100", + "end": "200", + "orientation": "plus", + } + ] + } + } + ], + } + } + ] + }, + [ + ( + "ncbigene:1234", + None, + None, + None, + 200, + None, + 100, + "positive", + "RefSeq", + None, + "gene", + ) + ], + ), + # Case 2: multiple ranges, different strands + ( + { + "reports": [ + { + "annotation": { + "gene_id": "5678", + "genomic_regions": [ + { + "gene_range": { + "range": [ + { + "begin": "300", + "end": "500", + "orientation": "minus", + }, + { + "begin": "600", + "end": "800", + "orientation": "plus", + }, + ] + } + } + ], + } + } + ] + }, + [ + ( + "ncbigene:5678", + None, + None, + None, + 500, + None, + 300, + "negative", + "RefSeq", + None, + "gene", + ), + ( + "ncbigene:5678", + None, + None, + None, + 800, + None, + 600, + "positive", + "RefSeq", + None, + "gene", + ), + ], + ), + # Case 3: missing orientation + ( + { + "reports": [ + { + "annotation": { + "gene_id": "9999", + "genomic_regions": [{"gene_range": {"range": [{"begin": "1", "end": "2"}]}}], + } + } + ] + }, + [ + ( + "ncbigene:9999", + None, + None, + None, + 2, + None, + 1, + "unknown", + "RefSeq", + None, + "gene", + ) + ], + ), + # Case 4: no gene_id + ( + { + "reports": [ + { + "annotation": { + "genomic_regions": [ + { + "gene_range": { + "range": [ + { + "begin": "100", + "end": "200", + "orientation": "plus", + } + ] + } + } + ] + } + } + ] + }, + [], + ), + # Case 5: non-integer start/end + ( + { + "reports": [ + { + "annotation": { + "gene_id": "1111", + "genomic_regions": [ + { + "gene_range": { + "range": [ + { + "begin": "abc", + "end": "xyz", + "orientation": "plus", + } + ] + } + } + ], + } + } + ] + }, + [ + ( + "ncbigene:1111", + None, + None, + None, + None, + None, + None, + "positive", + "RefSeq", + None, + "gene", + ) + ], + ), + ], +) +def test_load_feature_records(input_data, expected_output): + result = load_feature_records(input_data) + assert sorted(result) == sorted(expected_output) + + +@pytest.mark.parametrize( + "input_data, expected_output", + [ + # Case 1: valid mapping + ( + { + "reports": [ + { + "annotation": { + "gene_id": "12345", + "genomic_regions": [{"gene_range": {"accession_version": "NC_000001.11"}}], + } + } + ] + }, + [("refseq:NC_000001.11", "ncbigene:12345")], + ), + # Case 2: no gene_id + ( + {"reports": [{"annotation": 
{"genomic_regions": [{"gene_range": {"accession_version": "NC_000002.11"}}]}}]}, + [], + ), + # Case 3: no genomic_regions + ( + {"reports": [{"annotation": {"gene_id": "67890"}}]}, + [], + ), + # Case 4: empty genomic_regions list + ( + {"reports": [{"annotation": {"gene_id": "99999", "genomic_regions": []}}]}, + [], + ), + # Case 5: missing accession_version + ( + { + "reports": [ + { + "annotation": { + "gene_id": "13579", + "genomic_regions": [{"gene_range": {}}], + } + } + ] + }, + [], + ), + ], +) +def test_load_contig_collection_x_feature(input_data, expected_output): + result = load_contig_collection_x_feature(input_data) + assert result == expected_output + + +@pytest.mark.parametrize( + "input_data, expected_output", + [ + # Case 1: Valid report with multiple proteins + ( + { + "reports": [ + { + "annotation": { + "proteins": [ + {"accession_version": "XP_123"}, + {"accession_version": "XP_456"}, + ], + "annotations": [{"assembly_accession": "GCF_000001"}], + } + } + ] + }, + [ + ("insdc.gcf:GCF_000001", "refseq:XP_123"), + ("insdc.gcf:GCF_000001", "refseq:XP_456"), + ], + ), + # Case 2: No proteins + ( + { + "reports": [ + { + "annotation": { + "proteins": [], + "annotations": [{"assembly_accession": "GCF_000002"}], + } + } + ] + }, + [], + ), + # Case 3: No annotations + ( + {"reports": [{"annotation": {"proteins": [{"accession_version": "XP_789"}]}}]}, + [], + ), + # Case 4: Missing assembly_accession + ( + { + "reports": [ + { + "annotation": { + "proteins": [{"accession_version": "XP_789"}], + "annotations": [{}], + } + } + ] + }, + [], + ), + # Case 5: Some proteins missing accession_version + ( + { + "reports": [ + { + "annotation": { + "proteins": [ + {"accession_version": "XP_111"}, + {}, + {"accession_version": "XP_222"}, + ], + "annotations": [{"assembly_accession": "GCF_000003"}], + } + } + ] + }, + [ + ("insdc.gcf:GCF_000003", "refseq:XP_111"), + ("insdc.gcf:GCF_000003", "refseq:XP_222"), + ], + ), + ], +) +def test_load_contig_collection_x_protein(input_data, expected_output): + result = load_contig_collection_x_protein(input_data) + assert sorted(result) == sorted(expected_output) + + +@pytest.mark.parametrize( + "input_data, expected_output", + [ + # Case 1: valid gene with multiple proteins + ( + { + "reports": [ + { + "annotation": { + "gene_id": "4156311", + "proteins": [ + {"accession_version": "XP_001"}, + {"accession_version": "XP_002"}, + ], + } + } + ] + }, + [ + ("ncbigene:4156311", "refseq:XP_001"), + ("ncbigene:4156311", "refseq:XP_002"), + ], + ), + # Case 2: no gene_id + ( + {"reports": [{"annotation": {"proteins": [{"accession_version": "XP_999"}]}}]}, + [], + ), + # Case 3: gene with no proteins + ( + {"reports": [{"annotation": {"gene_id": "4156312"}}]}, + [], + ), + # Case 4: some proteins missing accession_version + ( + { + "reports": [ + { + "annotation": { + "gene_id": "4156313", + "proteins": [ + {"accession_version": "XP_777"}, + {}, + {"accession_version": "XP_888"}, + ], + } + } + ] + }, + [ + ("ncbigene:4156313", "refseq:XP_777"), + ("ncbigene:4156313", "refseq:XP_888"), + ], + ), + # Case 5: empty report list + ({"reports": []}, []), + ], +) +def test_load_feature_x_protein(input_data, expected_output): + result = load_feature_x_protein(input_data) + assert sorted(result) == sorted(expected_output) + + +@pytest.mark.parametrize( + "input_data, expected_output", + [ + # Case 1: Valid contig and assembly + ( + { + "reports": [ + { + "annotation": { + "genomic_regions": [{"gene_range": {"accession_version": "NC_000001.11"}}], + 
"annotations": [{"assembly_accession": "GCF_000001.1"}], + } + } + ] + }, + [("refseq:NC_000001.11", "insdc.gcf:GCF_000001.1")], + ), + # Case 2: Missing genomic_regions + ( + {"reports": [{"annotation": {"annotations": [{"assembly_accession": "GCF_000002.1"}]}}]}, + [], + ), + # Case 3: Missing annotations + ( + {"reports": [{"annotation": {"genomic_regions": [{"gene_range": {"accession_version": "NC_000003.11"}}]}}]}, + [], + ), + # Case 4: Missing accession_version in region + ( + { + "reports": [ + { + "annotation": { + "genomic_regions": [{"gene_range": {}}], + "annotations": [{"assembly_accession": "GCF_000004.1"}], + } + } + ] + }, + [], + ), + # Case 5: Missing assembly_accession in annotations + ( + { + "reports": [ + { + "annotation": { + "genomic_regions": [{"gene_range": {"accession_version": "NC_000005.11"}}], + "annotations": [{}], + } + } + ] + }, + [], + ), + # Case 6: Multiple reports, one valid + ( + { + "reports": [ + { + "annotation": { + "genomic_regions": [{"gene_range": {"accession_version": "NC_000006.11"}}], + "annotations": [{"assembly_accession": "GCF_000006.1"}], + } + }, + { + "annotation": { + "genomic_regions": [{"gene_range": {"accession_version": "NC_000007.11"}}], + "annotations": [{}], + } + }, + ] + }, + [("refseq:NC_000006.11", "insdc.gcf:GCF_000006.1")], + ), + ], +) +def test_load_contig_x_contig_collection(input_data, expected_output): + result = load_contig_x_contig_collection(input_data) + assert sorted(result) == sorted(expected_output) + + +@pytest.mark.parametrize( + "input_data, expected_output", + [ + # Case 1: Valid contig with accession_version + ( + {"reports": [{"annotation": {"genomic_regions": [{"gene_range": {"accession_version": "NC_000001.11"}}]}}]}, + [("refseq:NC_000001.11", None, None, None)], + ), + # Case 2: Multiple contigs, different accession_versions + ( + { + "reports": [ + { + "annotation": { + "genomic_regions": [ + {"gene_range": {"accession_version": "NC_000001.11"}}, + {"gene_range": {"accession_version": "NC_000002.12"}}, + ] + } + } + ] + }, + [ + ("refseq:NC_000001.11", None, None, None), + ("refseq:NC_000002.12", None, None, None), + ], + ), + # Case 3: Duplicate accession versions + ( + { + "reports": [ + { + "annotation": { + "genomic_regions": [ + {"gene_range": {"accession_version": "NC_000003.13"}}, + {"gene_range": {"accession_version": "NC_000003.13"}}, + ] + } + } + ] + }, + [("refseq:NC_000003.13", None, None, None)], + ), + # Case 4: Missing accession_version + ( + {"reports": [{"annotation": {"genomic_regions": [{"gene_range": {}}]}}]}, + [], + ), + # Case 5: Empty reports + ( + {"reports": []}, + [], + ), + ], +) +def test_load_contigs(input_data, expected_output): + result = load_contigs(input_data) + assert sorted(result) == sorted(expected_output) + + +### add new test: to_int +@pytest.mark.parametrize( + "input_id, expected", + [ + ("GeneID:123", "ncbigene:123"), + ("YP_009725307.1", "refseq:YP_009725307.1"), + ("GCF_000001405.39", "insdc.gcf:GCF_000001405.39"), + ("random", "random"), + ], +) +def test_apply_prefix(input_id, expected): + assert apply_prefix(input_id) == expected + + +@pytest.mark.parametrize("val, expected", [("123", 123), ("abc", None), ("", None)]) +def test_to_int(val, expected): + assert to_int(val) == expected + + +TABLE_NAME_MAP = { + "contig": "Contig", + "feature": "Feature", + "identifier": "Identifier", + "name": "Name", + "contig_x_contigcollection": "Contig_x_ContigCollection", + "contigcollection_x_feature": "ContigCollection_x_Feature", + "contigcollection_x_protein": 
"ContigCollection_x_Protein", + "feature_x_protein": "Feature_x_Protein", +} + + +@pytest.mark.requires_spark +def test_parse_annotation_data(spark: SparkSession, test_data_dir: Path) -> None: + """Test the parsing of the annotation data with direct Delta table.""" + + test_ns = TEST_NS.lower() + spark.sql(f"CREATE DATABASE IF NOT EXISTS {TEST_NS}") + + # Load NCBI dataset from NCBI API + sample_api_response = test_data_dir / "refseq" / "annotation_report.json" + dataset = json.load(sample_api_response.open()) + + # Run parse function + parse_annotation_data(spark, [dataset], test_ns) + + # Expected tables to validate from output + expected_tables = [ + "contig", + "contig_x_contigcollection", + "contigcollection_x_feature", + "contigcollection_x_protein", + "feature", + "feature_x_protein", + "identifier", + "name", + ] + + for table_name in expected_tables: + result_df = spark.table(f"{test_ns}.{table_name}") + schema_key = TABLE_NAME_MAP[table_name] + + # Construct expected_df just for schema comparison + rows = [r.asDict() for r in result_df.collect()] + expected_df = spark.createDataFrame(rows, schema=cdm_schemas[schema_key]) + + # Assert schema match + assertDataFrameSchemaEqual( + expected_df, + result_df, + msg=f"{table_name}: schema mismatch", + ) + # Assert content match + assertDataFrameEqual( + expected_df, + result_df, + ignore_row_order=True, + msg=f"{table_name}: content mismatch", + ) diff --git a/tests/parsers/test_shared_identifiers.py b/tests/parsers/test_shared_identifiers.py new file mode 100644 index 0000000..b76e9af --- /dev/null +++ b/tests/parsers/test_shared_identifiers.py @@ -0,0 +1,34 @@ +import xml.etree.ElementTree as ET + +from cdm_data_loader_utils.parsers.shared_identifiers import parse_identifiers_generic + + +def test_parse_identifiers_generic_basic() -> None: + # + # P12345 + # Q99999 + # + ns = {"ns": "dummy"} + entry = ET.Element("entry") + + a1 = ET.SubElement(entry, "accession") + a1.text = "P12345" + a2 = ET.SubElement(entry, "accession") + a2.text = "Q99999" + + # Add namespace prefix to match xpath + a1.tag = "{dummy}accession" + a2.tag = "{dummy}accession" + + rows = parse_identifiers_generic( + entry=entry, + xpath="ns:accession", + prefix="UniProt", + ns=ns, + ) + + assert len(rows) == 2 + assert rows[0]["identifier"] == "UniProt:P12345" + assert rows[1]["identifier"] == "UniProt:Q99999" + assert rows[0]["source"] == "UniProt" + assert rows[0]["description"] == "UniProt accession" diff --git a/tests/parsers/test_xml_utils.py b/tests/parsers/test_xml_utils.py new file mode 100644 index 0000000..fc6e3ba --- /dev/null +++ b/tests/parsers/test_xml_utils.py @@ -0,0 +1,49 @@ +import xml.etree.ElementTree as ET + +from cdm_data_loader_utils.parsers.xml_utils import ( + clean_dict, + get_attr, + get_text, + parse_db_references, +) + + +def test_get_text_and_get_attr_basic() -> None: + elem = ET.Element("tag", attrib={"id": "123"}) + elem.text = " hello " + + assert get_text(elem) == "hello" + assert get_text(None) is None + assert get_attr(elem, "id") == "123" + assert get_attr(elem, "missing") is None + + +def test_parse_db_references_pub_and_others() -> None: + ns = {"ns": "dummy"} + source = ET.Element("source") + db1 = ET.SubElement(source, "dbReference", attrib={"type": "PubMed", "id": "12345"}) + db2 = ET.SubElement(source, "dbReference", attrib={"type": "DOI", "id": "10.1000/xyz"}) + db3 = ET.SubElement(source, "dbReference", attrib={"type": "PDB", "id": "1ABC"}) + + db1.tag = "{dummy}dbReference" + db2.tag = "{dummy}dbReference" + db3.tag = 
"{dummy}dbReference" + + pubs, others = parse_db_references(source, ns) + + assert "PUBMED:12345" in pubs + assert "DOI:10.1000/xyz" in pubs + assert "PDB:1ABC" in others + + +def test_clean_dict_removes_nones_and_empty() -> None: + """Test that clean_dict removes None and empty values.""" + d = { + "a": 1, + "b": None, + "c": [], + "d": {}, + "e": "ok", + } + cleaned = clean_dict(d) + assert cleaned == {"a": 1, "e": "ok"} diff --git a/tests/validation/assertions.py b/tests/validation/assertions.py new file mode 100644 index 0000000..b8ef6f1 --- /dev/null +++ b/tests/validation/assertions.py @@ -0,0 +1,40 @@ +from typing import List, Optional +from pyspark.sql import DataFrame +from pyspark.sql.types import StructType +from math import isclose + + +def assertDataFrameSchemaEqual(df1: DataFrame, df2: DataFrame, msg: str = "") -> None: + fields1 = [(f.name, f.dataType) for f in df1.schema.fields] + fields2 = [(f.name, f.dataType) for f in df2.schema.fields] + + assert fields1 == fields2, f"{msg}\nSchema mismatch:\n{fields1}\n!=\n{fields2}" + + +def assertDataFrameEqual( + df1: DataFrame, + df2: DataFrame, + msg: str = "", + ignore_row_order: bool = False, + float_tol: Optional[float] = None, +) -> None: + """ + Assert two DataFrames are equal in content. + """ + if ignore_row_order: + df1_rows = sorted([tuple(row) for row in df1.collect()]) + df2_rows = sorted([tuple(row) for row in df2.collect()]) + else: + df1_rows = df1.collect() + df2_rows = df2.collect() + + assert len(df1_rows) == len(df2_rows), f"{msg}\nRow count mismatch: {len(df1_rows)} != {len(df2_rows)}" + + for i, (r1, r2) in enumerate(zip(df1_rows, df2_rows)): + for j, (v1, v2) in enumerate(zip(r1, r2)): + if float_tol is not None and isinstance(v1, float) and isinstance(v2, float): + assert isclose(v1, v2, rel_tol=float_tol), ( + f"{msg}\nFloat mismatch at row {i}, col {j}: {v1} != {v2} within tol {float_tol}" + ) + else: + assert v1 == v2, f"{msg}\nValue mismatch at row {i}, col {j}: {v1} != {v2}" diff --git a/tests/validation/test_dataframe_validator.py b/tests/validation/test_dataframe_validator.py index 63a316c..f78dcd2 100644 --- a/tests/validation/test_dataframe_validator.py +++ b/tests/validation/test_dataframe_validator.py @@ -1,4 +1,11 @@ -"""Tests for parser error handling, schema compliance, and so on.""" +""" + +Tests for DataFrameValidator behavior: +- empty dataframe handling +- mocked validation flow +- integration validation on real RefSeq CDM outputs + +""" from typing import Any from unittest.mock import MagicMock @@ -6,6 +13,7 @@ import pytest from pyspark.sql import DataFrame, SparkSession from pyspark.sql.types import StructField, StructType +from pyspark.sql.functions import col, when, lit from cdm_data_loader_utils.audit.schema import METRICS, REJECTS, ROW_ERRORS from cdm_data_loader_utils.core.constants import INVALID_DATA_FIELD_NAME @@ -14,6 +22,9 @@ from tests.audit.conftest import create_table +# ------------------------------------------------------------------------------ +# Unit tests +# ------------------------------------------------------------------------------ @pytest.mark.requires_spark def test_validate_dataframe_empty_df(pipeline_run: PipelineRun, empty_df: DataFrame) -> None: """Assert that an empty dataframe does not perform any validation.""" @@ -80,3 +91,57 @@ def test_validate_dataframe_no_validation( # noqa: PLR0913 assert metrics.count() == 1 rejects = spark.table(f"{pipeline_run.namespace}.{REJECTS}") assert rejects.count() == output.records_invalid + + +# 
------------------------------------------------------------------------------ +# Integration-style test (real RefSeq CDM output) +# ------------------------------------------------------------------------------ + + +@pytest.mark.requires_spark +def test_validate_refseq_cdm( + spark: SparkSession, + pipeline_run: PipelineRun, +) -> None: + # Prepare audit tables from scratch + for t in (METRICS, REJECTS): + create_table(spark, t, add_default_data=False) + + # Load real pipeline output + df = spark.table(f"{pipeline_run.namespace}.cdm_identifiers") + + # Sanity check: pipeline actually produced data + assert df.count() > 0 + assert "identifier" in df.columns + + # Simple validation rule: identifier cannot be null + def validation_fn(df: DataFrame) -> DataFrame: + return df.withColumn( + INVALID_DATA_FIELD_NAME, + when(col("identifier").isNull(), lit("identifier is null")), + ) + + validator = Validator(validation_fn, {}) + + dfv = DataFrameValidator(spark) + output = dfv.validate_dataframe( + data_to_validate=df, + schema=df.schema.fields, + run=pipeline_run, + validator=validator, + invalid_col=INVALID_DATA_FIELD_NAME, + ) + + # Records accounting + assert output.records_read == df.count() + assert output.records_valid + output.records_invalid == output.records_read + + # valid_df must not contain invalid rows + assert output.valid_df.filter(col(INVALID_DATA_FIELD_NAME).isNotNull()).count() == 0 + + # Audit tables written + metrics = spark.table(f"{pipeline_run.namespace}.{METRICS}") + rejects = spark.table(f"{pipeline_run.namespace}.{REJECTS}") + + assert metrics.count() == 1 + assert rejects.count() == output.records_invalid diff --git a/uv.lock b/uv.lock index 0110326..cec3518 100644 --- a/uv.lock +++ b/uv.lock @@ -95,6 +95,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/fb/76/641ae371508676492379f16e2fa48f4e2c11741bd63c48be4b12a6b09cba/aiosignal-1.4.0-py3-none-any.whl", hash = "sha256:053243f8b92b990551949e63930a839ff0cf0b0ebbe0597b0f3fb19e1a0fe82e", size = 7490, upload-time = "2025-07-03T22:54:42.156Z" }, ] +[[package]] +name = "alabaster" +version = "1.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a6/f8/d9c74d0daf3f742840fd818d69cfae176fa332022fd44e3469487d5a9420/alabaster-1.0.0.tar.gz", hash = "sha256:c00dca57bca26fa62a6d7d0a9fcce65f3e026e9bfe33e9c538fd3fbb2144fd9e", size = 24210, upload-time = "2024-07-26T18:15:03.762Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7e/b3/6b4067be973ae96ba0d615946e314c5ae35f9f993eca561b356540bb0c2b/alabaster-1.0.0-py3-none-any.whl", hash = "sha256:fc6786402dc3fcb2de3cabd5fe455a2db534b371124f1f21de8731783dec828b", size = 13929, upload-time = "2024-07-26T18:15:02.05Z" }, +] + [[package]] name = "annotated-types" version = "0.7.0" @@ -236,6 +245,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b7/b8/3fe70c75fe32afc4bb507f75563d39bc5642255d1d94f1f23604725780bf/babel-2.17.0-py3-none-any.whl", hash = "sha256:4d0b53093fdfb4b21c92b5213dba5a1b23885afa8383709427046b21c366e5f2", size = 10182537, upload-time = "2025-02-01T15:17:37.39Z" }, ] +[[package]] +name = "backrefs" +version = "6.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/86/e3/bb3a439d5cb255c4774724810ad8073830fac9c9dee123555820c1bcc806/backrefs-6.1.tar.gz", hash = "sha256:3bba1749aafe1db9b915f00e0dd166cba613b6f788ffd63060ac3485dc9be231", size = 7011962, upload-time = "2025-11-15T14:52:08.323Z" } +wheels = [ + { url 
= "https://files.pythonhosted.org/packages/3b/ee/c216d52f58ea75b5e1841022bbae24438b19834a29b163cb32aa3a2a7c6e/backrefs-6.1-py310-none-any.whl", hash = "sha256:2a2ccb96302337ce61ee4717ceacfbf26ba4efb1d55af86564b8bbaeda39cac1", size = 381059, upload-time = "2025-11-15T14:51:59.758Z" }, + { url = "https://files.pythonhosted.org/packages/e6/9a/8da246d988ded941da96c7ed945d63e94a445637eaad985a0ed88787cb89/backrefs-6.1-py311-none-any.whl", hash = "sha256:e82bba3875ee4430f4de4b6db19429a27275d95a5f3773c57e9e18abc23fd2b7", size = 392854, upload-time = "2025-11-15T14:52:01.194Z" }, + { url = "https://files.pythonhosted.org/packages/37/c9/fd117a6f9300c62bbc33bc337fd2b3c6bfe28b6e9701de336b52d7a797ad/backrefs-6.1-py312-none-any.whl", hash = "sha256:c64698c8d2269343d88947c0735cb4b78745bd3ba590e10313fbf3f78c34da5a", size = 398770, upload-time = "2025-11-15T14:52:02.584Z" }, + { url = "https://files.pythonhosted.org/packages/eb/95/7118e935b0b0bd3f94dfec2d852fd4e4f4f9757bdb49850519acd245cd3a/backrefs-6.1-py313-none-any.whl", hash = "sha256:4c9d3dc1e2e558965202c012304f33d4e0e477e1c103663fd2c3cc9bb18b0d05", size = 400726, upload-time = "2025-11-15T14:52:04.093Z" }, + { url = "https://files.pythonhosted.org/packages/1d/72/6296bad135bfafd3254ae3648cd152980a424bd6fed64a101af00cc7ba31/backrefs-6.1-py314-none-any.whl", hash = "sha256:13eafbc9ccd5222e9c1f0bec563e6d2a6d21514962f11e7fc79872fd56cbc853", size = 412584, upload-time = "2025-11-15T14:52:05.233Z" }, + { url = "https://files.pythonhosted.org/packages/02/e3/a4fa1946722c4c7b063cc25043a12d9ce9b4323777f89643be74cef2993c/backrefs-6.1-py39-none-any.whl", hash = "sha256:a9e99b8a4867852cad177a6430e31b0f6e495d65f8c6c134b68c14c3c95bf4b0", size = 381058, upload-time = "2025-11-15T14:52:06.698Z" }, +] + [[package]] name = "beautifulsoup4" version = "4.14.3" @@ -376,6 +399,7 @@ source = { editable = "." 
} dependencies = [ { name = "berdl-notebook-utils" }, { name = "biopython" }, + { name = "cdm-schema" }, { name = "click" }, { name = "lxml" }, { name = "pytest-asyncio" }, @@ -428,6 +452,19 @@ wheels = [ [package.metadata] requires-dist = [{ name = "jupyterlab", specifier = ">=3.0" }] +[[package]] +name = "cdm-schema" +version = "0.1.0" +source = { git = "https://github.com/kbase/cdm-schema.git#ae0e5d60f826d53507b117149c33a0f98051296b" } +dependencies = [ + { name = "linkml" }, + { name = "linkml-runtime" }, + { name = "mkdocs-material" }, + { name = "mkdocs-mermaid2-plugin" }, + { name = "pyspark" }, + { name = "ruff" }, +] + [[package]] name = "cdm-spark-manager-client" version = "0.0.1" @@ -507,6 +544,24 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ae/3a/dbeec9d1ee0844c679f6bb5d6ad4e9f198b1224f4e7a32825f47f6192b0c/cffi-2.0.0-cp314-cp314t-win_arm64.whl", hash = "sha256:0a1527a803f0a659de1af2e1fd700213caba79377e27e4693648c2923da066f9", size = 184195, upload-time = "2025-09-08T23:23:43.004Z" }, ] +[[package]] +name = "cfgraph" +version = "0.2.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "rdflib" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/cb/51/3e7e021920cfe2f7d18b672642e13f7dc4f53545d530b52ee6533b6681ca/CFGraph-0.2.1.tar.gz", hash = "sha256:b57fe7044a10b8ff65aa3a8a8ddc7d4cd77bf511b42e57289cd52cbc29f8fe74", size = 2630, upload-time = "2018-11-20T15:27:28.69Z" } + +[[package]] +name = "chardet" +version = "5.2.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f3/0d/f7b6ab21ec75897ed80c17d79b15951a719226b9fababf1e40ea74d69079/chardet-5.2.0.tar.gz", hash = "sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7", size = 2069618, upload-time = "2023-08-01T19:23:02.662Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/38/6f/f5fbc992a329ee4e0f288c1fe0e2ad9485ed064cac731ed2fe47dcc38cbf/chardet-5.2.0-py3-none-any.whl", hash = "sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970", size = 199385, upload-time = "2023-08-01T19:23:00.661Z" }, +] + [[package]] name = "charset-normalizer" version = "3.4.4" @@ -550,14 +605,14 @@ wheels = [ [[package]] name = "click" -version = "8.3.1" +version = "8.1.8" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "colorama", marker = "sys_platform == 'win32'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/3d/fa/656b739db8587d7b5dfa22e22ed02566950fbfbcdc20311993483657a5c0/click-8.3.1.tar.gz", hash = "sha256:12ff4785d337a1bb490bb7e9c2b1ee5da3112e94a8622f26a6c77f5d2fc6842a", size = 295065, upload-time = "2025-11-15T20:45:42.706Z" } +sdist = { url = "https://files.pythonhosted.org/packages/b9/2e/0090cbf739cee7d23781ad4b89a9894a41538e4fcf4c31dcdd705b78eb8b/click-8.1.8.tar.gz", hash = "sha256:ed53c9d8990d83c2a27deae68e4ee337473f6330c040a31d4225c9574d16096a", size = 226593, upload-time = "2024-12-21T18:38:44.339Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/98/78/01c019cdb5d6498122777c1a43056ebb3ebfeef2076d9d026bfe15583b2b/click-8.3.1-py3-none-any.whl", hash = "sha256:981153a64e25f12d547d3426c367a4857371575ee7ad18df2a6183ab0545b2a6", size = 108274, upload-time = "2025-11-15T20:45:41.139Z" }, + { url = "https://files.pythonhosted.org/packages/7e/d4/7ebdbd03970677812aac39c869717059dbb71a4cfc033ca6e5221787892c/click-8.1.8-py3-none-any.whl", hash = "sha256:63c132bbbed01578a06712a2d1f497bb62d9c1c0d329b7903a866228027263b2", size 
= 98188, upload-time = "2024-12-21T18:38:41.666Z" }, ] [[package]] @@ -704,6 +759,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e8/cb/2da4cc83f5edb9c3257d09e1e7ab7b23f049c7962cae8d842bbef0a9cec9/cryptography-46.0.3-cp38-abi3-win_arm64.whl", hash = "sha256:d89c3468de4cdc4f08a57e214384d0471911a3830fcdaf7a8cc587e42a866372", size = 2918740, upload-time = "2025-10-15T23:18:12.277Z" }, ] +[[package]] +name = "curies" +version = "0.12.7" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pydantic" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ef/fc/8f73cbde9b2034e4b4f8524b4c5b7ce2a68d052ede8a486c0bc806c1f54d/curies-0.12.7.tar.gz", hash = "sha256:b51f422f6f8b93b35b583195222563327a00629d0ef8e889dc14606e31950e4f", size = 283292, upload-time = "2025-12-22T15:48:33.554Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c6/65/c6118987bc902a1a5941d2028c49d91c2db55d5bec148b46d155a125543b/curies-0.12.7-py3-none-any.whl", hash = "sha256:9038d6afd6311328b072db51488af1ce162cb26c1a3cc497d2d00871ddb824a9", size = 70042, upload-time = "2025-12-22T15:48:32.508Z" }, +] + [[package]] name = "dask" version = "2026.1.1" @@ -806,6 +874,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/3b/d8/265a93d22ae79262cdff701496a6f5676926a342153f3855ae6060430660/delta_spark-4.0.0-py3-none-any.whl", hash = "sha256:4e4ded07bb9ee4f6a0df45606d84395239d4b82001e765a627fecc1e914f3029", size = 39756, upload-time = "2025-06-06T01:41:44.815Z" }, ] +[[package]] +name = "deprecated" +version = "1.3.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "wrapt" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/49/85/12f0a49a7c4ffb70572b6c2ef13c90c88fd190debda93b23f026b25f9634/deprecated-1.3.1.tar.gz", hash = "sha256:b1b50e0ff0c1fddaa5708a2c6b0a6588bb09b892825ab2b214ac9ea9d92a5223", size = 2932523, upload-time = "2025-10-30T08:19:02.757Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/84/d0/205d54408c08b13550c733c4b85429e7ead111c7f0014309637425520a9a/deprecated-1.3.1-py2.py3-none-any.whl", hash = "sha256:597bfef186b6f60181535a29fbe44865ce137a5079f295b479886c82729d5f3f", size = 11298, upload-time = "2025-10-30T08:19:00.758Z" }, +] + [[package]] name = "distributed" version = "2026.1.1" @@ -1010,6 +1090,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c4/ab/09169d5a4612a5f92490806649ac8d41e3ec9129c636754575b3553f4ea4/googleapis_common_protos-1.72.0-py3-none-any.whl", hash = "sha256:4299c5a82d5ae1a9702ada957347726b167f9f8d1fc352477702a1e851ff4038", size = 297515, upload-time = "2025-11-06T18:29:13.14Z" }, ] +[[package]] +name = "graphviz" +version = "0.21" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f8/b3/3ac91e9be6b761a4b30d66ff165e54439dcd48b83f4e20d644867215f6ca/graphviz-0.21.tar.gz", hash = "sha256:20743e7183be82aaaa8ad6c93f8893c923bd6658a04c32ee115edb3c8a835f78", size = 200434, upload-time = "2025-06-15T09:35:05.824Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/91/4c/e0ce1ef95d4000ebc1c11801f9b944fa5910ecc15b5e351865763d8657f8/graphviz-0.21-py3-none-any.whl", hash = "sha256:54f33de9f4f911d7e84e4191749cac8cc5653f815b06738c54db9a15ab8b1e42", size = 47300, upload-time = "2025-06-15T09:35:04.433Z" }, +] + [[package]] name = "greenlet" version = "3.3.0" @@ -1095,6 +1184,15 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515, upload-time = "2025-04-24T03:35:24.344Z" }, ] +[[package]] +name = "hbreader" +version = "0.9.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/54/66/3a649ce125e03d1d43727a8b833cd211f0b9fe54a7e5be326f50d6f1d951/hbreader-0.9.1.tar.gz", hash = "sha256:d2c132f8ba6276d794c66224c3297cec25c8079d0a4cf019c061611e0a3b94fa", size = 19016, upload-time = "2021-02-25T19:22:32.799Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7b/24/61844afbf38acf419e01ca2639f7bd079584523d34471acbc4152ee991c5/hbreader-0.9.1-py3-none-any.whl", hash = "sha256:9a6e76c9d1afc1b977374a5dc430a1ebb0ea0488205546d4678d6e31cc5f6801", size = 7595, upload-time = "2021-02-25T19:22:31.944Z" }, +] + [[package]] name = "hf-xet" version = "1.2.0" @@ -1192,7 +1290,7 @@ dependencies = [ ] sdist = { url = "https://files.pythonhosted.org/packages/6f/fa/a1a94c55637f2b7cfeb05263ac3881aa87c82df92d8b4b31c909079f4419/huggingface_hub-1.1.7.tar.gz", hash = "sha256:3c84b6283caca928595f08fd42e9a572f17ec3501dec508c3f2939d94bfbd9d2", size = 607537, upload-time = "2025-12-01T11:05:28.137Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/dd/4f/82e5ab009089a2c48472bf4248391fe4091cf0b9c3e951dbb8afe3b23d76/huggingface_hub-1.1.7-py3-none-any.whl", hash = "sha256:f3efa4779f4890e44c957bbbb0f197e6028887ad09f0cf95a21659fa7753605d", size = 516239, upload-time = "2025-12-01T11:05:25.981Z" }, + { url = "https://files.pythonhosted.org/packages/33/3f/969137c9d9428ed8bf171d27604243dd950a47cac82414826e2aebbc0a4c/huggingface_hub-1.1.4-py3-none-any.whl", hash = "sha256:867799fbd2ef338b7f8b03d038d9c0e09415dfe45bb2893b48a510d1d746daa5", size = 515580, upload-time = "2025-11-13T10:51:55.742Z" }, ] [[package]] @@ -1216,6 +1314,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/0e/61/66938bbb5fc52dbdf84594873d5b51fb1f7c7794e9c0f5bd885f30bc507b/idna-3.11-py3-none-any.whl", hash = "sha256:771a87f49d9defaf64091e6e6fe9c18d4833f140bd19464795bc32d966ca37ea", size = 71008, upload-time = "2025-10-12T14:55:18.883Z" }, ] +[[package]] +name = "imagesize" +version = "1.4.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a7/84/62473fb57d61e31fef6e36d64a179c8781605429fd927b5dd608c997be31/imagesize-1.4.1.tar.gz", hash = "sha256:69150444affb9cb0d5cc5a92b3676f0b2fb7cd9ae39e947a5e11a36b4497cd4a", size = 1280026, upload-time = "2022-07-01T12:21:05.687Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ff/62/85c4c919272577931d407be5ba5d71c20f0b616d31a0befe0ae45bb79abd/imagesize-1.4.1-py2.py3-none-any.whl", hash = "sha256:0d8d18d08f840c19d0ee7ca1fd82490fdc3729b7ac93f49870406ddde8ef8d8b", size = 8769, upload-time = "2022-07-01T12:21:02.467Z" }, +] + [[package]] name = "importlib-metadata" version = "8.7.1" @@ -1310,6 +1417,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/56/6d/0d9848617b9f753b87f214f1c682592f7ca42de085f564352f10f0843026/ipywidgets-8.1.8-py3-none-any.whl", hash = "sha256:ecaca67aed704a338f88f67b1181b58f821ab5dc89c1f0f5ef99db43c1c2921e", size = 139808, upload-time = "2025-11-01T21:18:10.956Z" }, ] +[[package]] +name = "isodate" +version = "0.7.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/54/4d/e940025e2ce31a8ce1202635910747e5a87cc3a6a6bb2d00973375014749/isodate-0.7.2.tar.gz", hash = "sha256:4cd1aa0f43ca76f4a6c6c0292a85f40b35ec2e43e315b59f06e6d32171a953e6", size = 29705, upload-time = "2024-10-08T23:04:11.5Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/15/aa/0aca39a37d3c7eb941ba736ede56d689e7be91cab5d9ca846bde3999eba6/isodate-0.7.2-py3-none-any.whl", hash = "sha256:28009937d8031054830160fce6d409ed342816b543597cece116d966c6d99e15", size = 22320, upload-time = "2024-10-08T23:04:09.501Z" }, +] + [[package]] name = "isoduration" version = "20.11.0" @@ -1420,6 +1536,32 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/31/b4/b9b800c45527aadd64d5b442f9b932b00648617eb5d63d2c7a6587b7cafc/jmespath-1.0.1-py3-none-any.whl", hash = "sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980", size = 20256, upload-time = "2022-06-17T18:00:10.251Z" }, ] +[[package]] +name = "jsbeautifier" +version = "1.15.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "editorconfig" }, + { name = "six" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ea/98/d6cadf4d5a1c03b2136837a435682418c29fdeb66be137128544cecc5b7a/jsbeautifier-1.15.4.tar.gz", hash = "sha256:5bb18d9efb9331d825735fbc5360ee8f1aac5e52780042803943aa7f854f7592", size = 75257, upload-time = "2025-02-27T17:53:53.252Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2d/14/1c65fccf8413d5f5c6e8425f84675169654395098000d8bddc4e9d3390e1/jsbeautifier-1.15.4-py3-none-any.whl", hash = "sha256:72f65de312a3f10900d7685557f84cb61a9733c50dcc27271a39f5b0051bf528", size = 94707, upload-time = "2025-02-27T17:53:46.152Z" }, +] + +[[package]] +name = "json-flattener" +version = "0.1.9" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "click" }, + { name = "pyyaml" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/6d/77/b00e46d904818826275661a690532d3a3a43a4ded0264b2d7fcdb5c0feea/json_flattener-0.1.9.tar.gz", hash = "sha256:84cf8523045ffb124301a602602201665fcb003a171ece87e6f46ed02f7f0c15", size = 11479, upload-time = "2022-02-26T01:36:04.545Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/00/cc/7fbd75d3362e939eb98bcf9bd22f3f7df8c237a85148899ed3d38e5614e5/json_flattener-0.1.9-py3-none-any.whl", hash = "sha256:6b027746f08bf37a75270f30c6690c7149d5f704d8af1740c346a3a1236bc941", size = 10799, upload-time = "2022-02-26T01:36:03.06Z" }, +] + [[package]] name = "json5" version = "0.13.0" @@ -1478,6 +1620,16 @@ wheels = [ ] [package.optional-dependencies] +format = [ + { name = "fqdn" }, + { name = "idna" }, + { name = "isoduration" }, + { name = "jsonpointer" }, + { name = "rfc3339-validator" }, + { name = "rfc3987" }, + { name = "uri-template" }, + { name = "webcolors" }, +] format-nongpl = [ { name = "fqdn" }, { name = "idna" }, @@ -2533,6 +2685,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/20/12/38679034af332785aac8774540895e234f4d07f7545804097de4b666afd8/packaging-25.0-py3-none-any.whl", hash = "sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484", size = 66469, upload-time = "2025-04-19T11:48:57.875Z" }, ] +[[package]] +name = "paginate" +version = "0.5.7" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ec/46/68dde5b6bc00c1296ec6466ab27dddede6aec9af1b99090e1107091b3b84/paginate-0.5.7.tar.gz", hash = 
"sha256:22bd083ab41e1a8b4f3690544afb2c60c25e5c9a63a30fa2f483f6c60c8e5945", size = 19252, upload-time = "2024-08-25T14:17:24.139Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/90/96/04b8e52da071d28f5e21a805b19cb9390aa17a47462ac87f5e2696b9566d/paginate-0.5.7-py2.py3-none-any.whl", hash = "sha256:b885e2af73abcf01d9559fd5216b57ef722f8c42affbb63942377668e35c7591", size = 13746, upload-time = "2024-08-25T14:17:22.55Z" }, +] + [[package]] name = "pandas" version = "2.3.3" @@ -2582,6 +2743,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ef/af/4fbc8cab944db5d21b7e2a5b8e9211a03a79852b1157e2c102fcc61ac440/pandocfilters-1.5.1-py2.py3-none-any.whl", hash = "sha256:93be382804a9cdb0a7267585f157e5d1731bbe5545a85b268d6f5fe6232de2bc", size = 8663, upload-time = "2024-01-18T20:08:11.28Z" }, ] +[[package]] +name = "parse" +version = "1.20.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/4f/78/d9b09ba24bb36ef8b83b71be547e118d46214735b6dfb39e4bfde0e9b9dd/parse-1.20.2.tar.gz", hash = "sha256:b41d604d16503c79d81af5165155c0b20f6c8d6c559efa66b4b695c3e5a0a0ce", size = 29391, upload-time = "2024-06-11T04:41:57.34Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d0/31/ba45bf0b2aa7898d81cbbfac0e88c267befb59ad91a19e36e1bc5578ddb1/parse-1.20.2-py2.py3-none-any.whl", hash = "sha256:967095588cb802add9177d0c0b6133b5ba33b1ea9007ca800e526f42a85af558", size = 20126, upload-time = "2024-06-11T04:41:55.057Z" }, +] + [[package]] name = "parso" version = "0.8.5" @@ -2604,6 +2774,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/71/e7/40fb618334dcdf7c5a316c0e7343c5cd82d3d866edc100d98e29bc945ecd/partd-1.4.2-py3-none-any.whl", hash = "sha256:978e4ac767ec4ba5b86c6eaa52e5a2a3bc748a2ca839e8cc798f1cc6ce6efb0f", size = 18905, upload-time = "2024-05-06T19:51:39.271Z" }, ] +[[package]] +name = "pathspec" +version = "0.12.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ca/bc/f35b8446f4531a7cb215605d100cd88b7ac6f44ab3fc94870c120ab3adbf/pathspec-0.12.1.tar.gz", hash = "sha256:a482d51503a1ab33b1c67a6c3813a26953dbdc71c31dacaef9a838c4e29f5712", size = 51043, upload-time = "2023-12-10T22:30:45Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/cc/20/ff623b09d963f88bfde16306a54e12ee5ea43e9b597108672ff3a408aad6/pathspec-0.12.1-py3-none-any.whl", hash = "sha256:a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08", size = 31191, upload-time = "2023-12-10T22:30:43.14Z" }, +] + [[package]] name = "pexpect" version = "4.9.0" @@ -2652,6 +2831,34 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a3/58/35da89ee790598a0700ea49b2a66594140f44dec458c07e8e3d4979137fc/ply-3.11-py2.py3-none-any.whl", hash = "sha256:096f9b8350b65ebd2fd1346b12452efe5b9607f7482813ffca50c22722a807ce", size = 49567, upload-time = "2018-02-15T19:01:27.172Z" }, ] +[[package]] +name = "prefixcommons" +version = "0.1.12" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "click" }, + { name = "pytest-logging" }, + { name = "pyyaml" }, + { name = "requests" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/7d/b5/c5b63a4bf5dedb36567181fdb98dbcc7aaa025faebabaaffa2f5eb4b8feb/prefixcommons-0.1.12.tar.gz", hash = "sha256:22c4e2d37b63487b3ab48f0495b70f14564cb346a15220f23919eb0c1851f69f", size = 24063, upload-time = "2022-07-19T00:06:12.478Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/31/e8/715b09df3dab02b07809d812042dc47a46236b5603d9d3a2572dbd1d8a97/prefixcommons-0.1.12-py3-none-any.whl", hash = "sha256:16dbc0a1f775e003c724f19a694fcfa3174608f5c8b0e893d494cf8098ac7f8b", size = 29482, upload-time = "2022-07-19T00:06:08.709Z" }, +] + +[[package]] +name = "prefixmaps" +version = "0.2.6" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "curies" }, + { name = "pyyaml" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/4d/cf/f588bcdfd2c841839b9d59ce219a46695da56aa2805faff937bbafb9ee2b/prefixmaps-0.2.6.tar.gz", hash = "sha256:7421e1244eea610217fa1ba96c9aebd64e8162a930dc0626207cd8bf62ecf4b9", size = 709899, upload-time = "2024-10-17T16:30:57.738Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/89/b2/2b2153173f2819e3d7d1949918612981bc6bd895b75ffa392d63d115f327/prefixmaps-0.2.6-py3-none-any.whl", hash = "sha256:f6cef28a7320fc6337cf411be212948ce570333a0ce958940ef684c7fb192a62", size = 754732, upload-time = "2024-10-17T16:30:55.731Z" }, +] + [[package]] name = "prometheus-client" version = "0.24.1" @@ -2978,6 +3185,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c7/21/705964c7812476f378728bdf590ca4b771ec72385c533964653c68e86bdc/pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b", size = 1225217, upload-time = "2025-06-21T13:39:07.939Z" }, ] +[[package]] +name = "pyjsg" +version = "0.11.10" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "antlr4-python3-runtime" }, + { name = "jsonasobj" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/90/61/e001a4b679a171f84783deb8e215a91c9f614cb498807e24e4f73ea4e5ed/PyJSG-0.11.10.tar.gz", hash = "sha256:4bd6e3ff2833fa2b395bbe803a2d72a5f0bab5b7285bccd0da1a1bc0aee88bfa", size = 130742, upload-time = "2022-04-14T17:18:24.511Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/96/ee/370c3b1908327dac967841ff723db391a02f3637c95c6898160e5ffe1060/PyJSG-0.11.10-py3-none-any.whl", hash = "sha256:10af60ff42219be7e85bf7f11c19b648715b0b29eb2ddbd269e87069a7c3f26d", size = 80763, upload-time = "2022-04-14T17:18:23.169Z" }, +] + [[package]] name = "pyjwt" version = "2.10.1" @@ -2992,6 +3212,65 @@ crypto = [ { name = "cryptography" }, ] +[[package]] +name = "pymdown-extensions" +version = "10.19.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "markdown" }, + { name = "pyyaml" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/72/2d/9f30cee56d4d6d222430d401e85b0a6a1ae229819362f5786943d1a8c03b/pymdown_extensions-10.19.1.tar.gz", hash = "sha256:4969c691009a389fb1f9712dd8e7bd70dcc418d15a0faf70acb5117d022f7de8", size = 847839, upload-time = "2025-12-14T17:25:24.42Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fb/35/b763e8fbcd51968329b9adc52d188fc97859f85f2ee15fe9f379987d99c5/pymdown_extensions-10.19.1-py3-none-any.whl", hash = "sha256:e8698a66055b1dc0dca2a7f2c9d0ea6f5faa7834a9c432e3535ab96c0c4e509b", size = 266693, upload-time = "2025-12-14T17:25:22.999Z" }, +] + +[[package]] +name = "pyparsing" +version = "3.3.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/33/c1/1d9de9aeaa1b89b0186e5fe23294ff6517fce1bc69149185577cd31016b2/pyparsing-3.3.1.tar.gz", hash = "sha256:47fad0f17ac1e2cad3de3b458570fbc9b03560aa029ed5e16ee5554da9a2251c", size = 1550512, upload-time = "2025-12-23T03:14:04.391Z" } +wheels = 
[ + { url = "https://files.pythonhosted.org/packages/8b/40/2614036cdd416452f5bf98ec037f38a1afb17f327cb8e6b652d4729e0af8/pyparsing-3.3.1-py3-none-any.whl", hash = "sha256:023b5e7e5520ad96642e2c6db4cb683d3970bd640cdf7115049a6e9c3682df82", size = 121793, upload-time = "2025-12-23T03:14:02.103Z" }, +] + +[[package]] +name = "pyshex" +version = "0.8.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cfgraph" }, + { name = "chardet" }, + { name = "pyshexc" }, + { name = "rdflib-shim" }, + { name = "requests" }, + { name = "shexjsg" }, + { name = "sparqlslurper" }, + { name = "sparqlwrapper" }, + { name = "urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/97/d7/420ce2df4e8688e06fa8e1fc353fdf3875eb70f6fc2e17493d0526d778ff/PyShEx-0.8.1.tar.gz", hash = "sha256:3c5c4d45fe27faaadae803cb008c41acf8ee784da7868b04fd84967e75be70d0", size = 475611, upload-time = "2022-04-14T21:14:58.769Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9b/48/efb1b1d3f3aee8cfc9f256738ca6e79ec362edbfc3a3abecbaf84db04643/PyShEx-0.8.1-py3-none-any.whl", hash = "sha256:6da1b10123e191abf8dcb6bf3e54aa3e1fcf771df5d1a0ed453217c8900c8e6a", size = 51861, upload-time = "2022-04-14T21:14:57.254Z" }, +] + +[[package]] +name = "pyshexc" +version = "0.9.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "antlr4-python3-runtime" }, + { name = "chardet" }, + { name = "jsonasobj" }, + { name = "pyjsg" }, + { name = "rdflib-shim" }, + { name = "shexjsg" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/2a/31/95c590e8ed6e8cff141b6dd2a3de93b540f9dc3fba54621a20fd1cdb11e4/PyShExC-0.9.1.tar.gz", hash = "sha256:35a9975d4b9afeb20ef710fb6680871756381d0c39fbb5470b3b506581a304d3", size = 96070, upload-time = "2022-04-14T18:51:45.979Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/39/7d/ff5000e0882f2b3995bef20b667945d3faa9289b556295e4cc5d2e91f104/PyShExC-0.9.1-py2.py3-none-any.whl", hash = "sha256:efc55ed5cb2453e9df569b03e282505e96bb06597934288f3b23dd980ef10028", size = 69792, upload-time = "2022-04-14T18:51:44.148Z" }, +] + [[package]] name = "pyspark" version = "4.0.1" @@ -3065,6 +3344,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/27/98/822b924a4a3eb58aacba84444c7439fce32680592f394de26af9c76e2569/pytest_env-1.2.0-py3-none-any.whl", hash = "sha256:d7e5b7198f9b83c795377c09feefa45d56083834e60d04767efd64819fc9da00", size = 6251, upload-time = "2025-10-09T19:15:46.077Z" }, ] +[[package]] +name = "pytest-logging" +version = "2015.11.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pytest" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/dc/1e/fb11174c9eaebcec27d36e9e994b90ffa168bc3226925900b9dbbf16c9da/pytest-logging-2015.11.4.tar.gz", hash = "sha256:cec5c85ecf18aab7b2ead5498a31b9f758680ef5a902b9054ab3f2bdbb77c896", size = 3916, upload-time = "2015-11-04T12:15:54.122Z" } + [[package]] name = "python-dateutil" version = "2.9.0.post0" @@ -3241,6 +3529,43 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/81/d6/4bfbb40c9a0b42fc53c7cf442f6385db70b40f74a783130c5d0a5aa62228/pyzmq-27.1.0-cp314-cp314t-win_arm64.whl", hash = "sha256:dc5dbf68a7857b59473f7df42650c621d7e8923fb03fa74a526890f4d33cc4d7", size = 575170, upload-time = "2025-09-08T23:09:01.418Z" }, ] +[[package]] +name = "rdflib" +version = "7.5.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pyparsing" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/ec/1b/4cd9a29841951371304828d13282e27a5f25993702c7c87dcb7e0604bd25/rdflib-7.5.0.tar.gz", hash = "sha256:663083443908b1830e567350d72e74d9948b310f827966358d76eebdc92bf592", size = 4903859, upload-time = "2025-11-28T05:51:54.562Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b9/20/35d2baebacf357b562bd081936b66cd845775442973cb033a377fd639a84/rdflib-7.5.0-py3-none-any.whl", hash = "sha256:b011dfc40d0fc8a44252e906dcd8fc806a7859bc231be190c37e9568a31ac572", size = 587215, upload-time = "2025-11-28T05:51:38.178Z" }, +] + +[[package]] +name = "rdflib-jsonld" +version = "0.6.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "rdflib" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/5a/48/9eaecac5f5ba6b31dd932fbbe67206afcbd24a7a696c03c6c920ac7ddc39/rdflib-jsonld-0.6.1.tar.gz", hash = "sha256:eda5a42a2e09f80d4da78e32b5c684bccdf275368f1541e6b7bcddfb1382a0e0", size = 130465, upload-time = "2021-09-14T12:22:20.082Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d0/d2/760527679057a7dad67f4e41f3e0c463b247f0bdbffc594e0add7c9077d6/rdflib_jsonld-0.6.1-py2.py3-none-any.whl", hash = "sha256:bcf84317e947a661bae0a3f2aee1eced697075fc4ac4db6065a3340ea0f10fc2", size = 16381, upload-time = "2021-09-14T12:22:17.805Z" }, +] + +[[package]] +name = "rdflib-shim" +version = "1.0.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "rdflib" }, + { name = "rdflib-jsonld" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/1b/c8/1014ec6b5f4428c630deffba1f9851043ae378eb1d6ef52a03bd492cea99/rdflib_shim-1.0.3.tar.gz", hash = "sha256:d955d11e2986aab42b6830ca56ac6bc9c893abd1d049a161c6de2f1b99d4fc0d", size = 7783, upload-time = "2021-12-21T16:31:06.945Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5f/97/d8a785d2c7131c731c90cb0e65af9400081af4380bea4ec04868dc21aa92/rdflib_shim-1.0.3-py3-none-any.whl", hash = "sha256:7a853e7750ef1e9bf4e35dea27d54e02d4ed087de5a9e0c329c4a6d82d647081", size = 5190, upload-time = "2021-12-21T16:31:05.719Z" }, +] + [[package]] name = "referencing" version = "0.37.0" @@ -3374,6 +3699,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/9e/51/17023c0f8f1869d8806b979a2bffa3f861f26a3f1a66b094288323fba52f/rfc3986_validator-0.1.1-py2.py3-none-any.whl", hash = "sha256:2f235c432ef459970b4306369336b9d5dbdda31b510ca1e327636e01f528bfa9", size = 4242, upload-time = "2019-10-28T16:00:13.976Z" }, ] +[[package]] +name = "rfc3987" +version = "1.3.8" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/14/bb/f1395c4b62f251a1cb503ff884500ebd248eed593f41b469f89caa3547bd/rfc3987-1.3.8.tar.gz", hash = "sha256:d3c4d257a560d544e9826b38bc81db676890c79ab9d7ac92b39c7a253d5ca733", size = 20700, upload-time = "2018-07-29T17:23:47.954Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/65/d4/f7407c3d15d5ac779c3dd34fbbc6ea2090f77bd7dd12f207ccf881551208/rfc3987-1.3.8-py2.py3-none-any.whl", hash = "sha256:10702b1e51e5658843460b189b185c0366d2cf4cff716f13111b0ea9fd2dce53", size = 13377, upload-time = "2018-07-29T17:23:45.313Z" }, +] + [[package]] name = "rfc3987-syntax" version = "1.1.0" @@ -3578,6 +3912,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e0/f9/0595336914c5619e5f28a1fb793285925a8cd4b432c9da0a987836c7f822/shellingham-1.5.4-py2.py3-none-any.whl", hash = "sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686", size = 9755, 
upload-time = "2023-10-24T04:13:38.866Z" }, ] +[[package]] +name = "shexjsg" +version = "0.8.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pyjsg" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/30/c9/34224e3c8fd9d466535626e3c2f6e01f6adae3e82acaed353d42add509ec/ShExJSG-0.8.2.tar.gz", hash = "sha256:f17a629fc577fa344382bdee143cd9ff86588537f9f811f66cea6f63cdbcd0b6", size = 33550, upload-time = "2022-04-14T20:23:13.75Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/18/6e/d23bcde21d4ef0250a74e7505d2990d429f862be65810a3b650a69def7f0/ShExJSG-0.8.2-py2.py3-none-any.whl", hash = "sha256:3b0d8432dd313bee9e1343382c5e02e9908dd941a7dd7342bf8c0200fe523766", size = 14381, upload-time = "2022-04-14T20:23:12.515Z" }, +] + [[package]] name = "sidecar" version = "0.8.0" @@ -3608,6 +3954,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" }, ] +[[package]] +name = "snowballstemmer" +version = "3.0.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/75/a7/9810d872919697c9d01295633f5d574fb416d47e535f258272ca1f01f447/snowballstemmer-3.0.1.tar.gz", hash = "sha256:6d5eeeec8e9f84d4d56b847692bacf79bc2c8e90c7f80ca4444ff8b6f2e52895", size = 105575, upload-time = "2025-05-09T16:34:51.843Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c8/78/3565d011c61f5a43488987ee32b6f3f656e7f107ac2782dd57bdd7d91d9a/snowballstemmer-3.0.1-py3-none-any.whl", hash = "sha256:6cd7b3897da8d6c9ffb968a6781fa6532dce9c3618a4b127d920dab764a19064", size = 103274, upload-time = "2025-05-09T16:34:50.371Z" }, +] + [[package]] name = "sortedcontainers" version = "2.4.0" @@ -3977,6 +4332,27 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/3d/d8/2083a1daa7439a66f3a48589a57d576aa117726762618f6bb09fe3798796/uvicorn-0.40.0-py3-none-any.whl", hash = "sha256:c6c8f55bc8bf13eb6fa9ff87ad62308bbbc33d0b67f84293151efe87e0d5f2ee", size = 68502, upload-time = "2025-12-21T14:16:21.041Z" }, ] +[[package]] +name = "watchdog" +version = "6.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/db/7d/7f3d619e951c88ed75c6037b246ddcf2d322812ee8ea189be89511721d54/watchdog-6.0.0.tar.gz", hash = "sha256:9ddf7c82fda3ae8e24decda1338ede66e1c99883db93711d8fb941eaa2d8c282", size = 131220, upload-time = "2024-11-01T14:07:13.037Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/68/98/b0345cabdce2041a01293ba483333582891a3bd5769b08eceb0d406056ef/watchdog-6.0.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:490ab2ef84f11129844c23fb14ecf30ef3d8a6abafd3754a6f75ca1e6654136c", size = 96480, upload-time = "2024-11-01T14:06:42.952Z" }, + { url = "https://files.pythonhosted.org/packages/85/83/cdf13902c626b28eedef7ec4f10745c52aad8a8fe7eb04ed7b1f111ca20e/watchdog-6.0.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:76aae96b00ae814b181bb25b1b98076d5fc84e8a53cd8885a318b42b6d3a5134", size = 88451, upload-time = "2024-11-01T14:06:45.084Z" }, + { url = "https://files.pythonhosted.org/packages/fe/c4/225c87bae08c8b9ec99030cd48ae9c4eca050a59bf5c2255853e18c87b50/watchdog-6.0.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a175f755fc2279e0b7312c0035d52e27211a5bc39719dd529625b1930917345b", size = 
89057, upload-time = "2024-11-01T14:06:47.324Z" }, + { url = "https://files.pythonhosted.org/packages/a9/c7/ca4bf3e518cb57a686b2feb4f55a1892fd9a3dd13f470fca14e00f80ea36/watchdog-6.0.0-py3-none-manylinux2014_aarch64.whl", hash = "sha256:7607498efa04a3542ae3e05e64da8202e58159aa1fa4acddf7678d34a35d4f13", size = 79079, upload-time = "2024-11-01T14:06:59.472Z" }, + { url = "https://files.pythonhosted.org/packages/5c/51/d46dc9332f9a647593c947b4b88e2381c8dfc0942d15b8edc0310fa4abb1/watchdog-6.0.0-py3-none-manylinux2014_armv7l.whl", hash = "sha256:9041567ee8953024c83343288ccc458fd0a2d811d6a0fd68c4c22609e3490379", size = 79078, upload-time = "2024-11-01T14:07:01.431Z" }, + { url = "https://files.pythonhosted.org/packages/d4/57/04edbf5e169cd318d5f07b4766fee38e825d64b6913ca157ca32d1a42267/watchdog-6.0.0-py3-none-manylinux2014_i686.whl", hash = "sha256:82dc3e3143c7e38ec49d61af98d6558288c415eac98486a5c581726e0737c00e", size = 79076, upload-time = "2024-11-01T14:07:02.568Z" }, + { url = "https://files.pythonhosted.org/packages/ab/cc/da8422b300e13cb187d2203f20b9253e91058aaf7db65b74142013478e66/watchdog-6.0.0-py3-none-manylinux2014_ppc64.whl", hash = "sha256:212ac9b8bf1161dc91bd09c048048a95ca3a4c4f5e5d4a7d1b1a7d5752a7f96f", size = 79077, upload-time = "2024-11-01T14:07:03.893Z" }, + { url = "https://files.pythonhosted.org/packages/2c/3b/b8964e04ae1a025c44ba8e4291f86e97fac443bca31de8bd98d3263d2fcf/watchdog-6.0.0-py3-none-manylinux2014_ppc64le.whl", hash = "sha256:e3df4cbb9a450c6d49318f6d14f4bbc80d763fa587ba46ec86f99f9e6876bb26", size = 79078, upload-time = "2024-11-01T14:07:05.189Z" }, + { url = "https://files.pythonhosted.org/packages/62/ae/a696eb424bedff7407801c257d4b1afda455fe40821a2be430e173660e81/watchdog-6.0.0-py3-none-manylinux2014_s390x.whl", hash = "sha256:2cce7cfc2008eb51feb6aab51251fd79b85d9894e98ba847408f662b3395ca3c", size = 79077, upload-time = "2024-11-01T14:07:06.376Z" }, + { url = "https://files.pythonhosted.org/packages/b5/e8/dbf020b4d98251a9860752a094d09a65e1b436ad181faf929983f697048f/watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl", hash = "sha256:20ffe5b202af80ab4266dcd3e91aae72bf2da48c0d33bdb15c66658e685e94e2", size = 79078, upload-time = "2024-11-01T14:07:07.547Z" }, + { url = "https://files.pythonhosted.org/packages/07/f6/d0e5b343768e8bcb4cda79f0f2f55051bf26177ecd5651f84c07567461cf/watchdog-6.0.0-py3-none-win32.whl", hash = "sha256:07df1fdd701c5d4c8e55ef6cf55b8f0120fe1aef7ef39a1c6fc6bc2e606d517a", size = 79065, upload-time = "2024-11-01T14:07:09.525Z" }, + { url = "https://files.pythonhosted.org/packages/db/d9/c495884c6e548fce18a8f40568ff120bc3a4b7b99813081c8ac0c936fa64/watchdog-6.0.0-py3-none-win_amd64.whl", hash = "sha256:cbafb470cf848d93b5d013e2ecb245d4aa1c8fd0504e863ccefa32445359d680", size = 79070, upload-time = "2024-11-01T14:07:10.686Z" }, + { url = "https://files.pythonhosted.org/packages/33/e8/e40370e6d74ddba47f002a32919d91310d6074130fe4e17dabcafc15cbf1/watchdog-6.0.0-py3-none-win_ia64.whl", hash = "sha256:a1914259fa9e1454315171103c6a30961236f508b9b623eae470268bbcc6a22f", size = 79067, upload-time = "2024-11-01T14:07:11.845Z" }, +] + [[package]] name = "wcwidth" version = "0.2.14"
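
Reviewer note (not part of the diff): the parametrized cases in tests/parsers/test_annotation_parse.py pin down the input/output contract of the new extractor helpers. As a reading aid, here is a minimal sketch consistent with the test_load_identifiers cases; the function name and structure are taken from the tests, but the actual implementation added in src/cdm_data_loader_utils/parsers/annotation_parse.py (only partially shown in this diff) may differ in detail.

    # Illustrative sketch only -- reconstructed from the parametrized cases in
    # tests/parsers/test_annotation_parse.py; the real load_identifiers() may differ.
    def load_identifiers(data: dict) -> list[tuple]:
        """Extract (identifier, gene_id, name, source, relationship) rows per report."""
        rows = []
        for report in data.get("reports", []):
            ann = report.get("annotation", {})
            gene_id = ann.get("gene_id")
            if not gene_id:
                # Reports without a gene_id yield no identifier row (test case 3).
                continue
            rows.append(
                (
                    f"ncbigene:{gene_id}",    # CURIE-prefixed identifier
                    gene_id,                  # raw NCBI GeneID
                    ann.get("name"),          # gene/protein name, may be None
                    "RefSeq",                 # data source
                    ann.get("relationship"),  # e.g. "RefSeq gene symbol", else None
                )
            )
        return rows

The other load_* helpers exercised above (load_names, load_feature_records, load_contig_x_contig_collection, etc.) appear to follow the same pattern: iterate over "reports", guard on the required keys, and emit CURIE-prefixed tuples matching the CDM table columns.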