Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 58 additions & 0 deletions bin/trait_mapping/check_latest_mappings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#!/usr/bin/env python3

import argparse
import csv
import json
import os
import re


def get_ontology_id_regex(ot_schema_file):
    """
    Extract the regular expression used to validate ontology term IDs.

    :param ot_schema_file: path to Open Targets JSON schema, or None/empty to
        accept any identifier
    :return: pattern string for the diseaseFromSourceMappedId field
    """
    if not ot_schema_file:
        # No schema supplied: fall back to a pattern that matches everything.
        return '.*'
    with open(ot_schema_file, 'r') as f:
        schema = json.load(f)
    return schema['definitions']['diseaseFromSourceMappedId']['pattern']


def check_mappings(mappings_file, ot_schema_file):
    """
    Check mappings for conformity against regex in latest OT schema.
    Outputs a new mappings file (<basename>_updated.tsv) and a file of the
    mappings that have been removed (<basename>_nonmatching.tsv), both written
    to the current working directory.

    :param mappings_file: path to mappings file (tab-delimited, no header,
        columns: trait name, term URI, term label)
    :param ot_schema_file: path to Open Targets JSON schema
    """
    with open(mappings_file, 'r', newline='') as f:
        mappings = list(csv.reader(f, delimiter='\t'))
    ontology_id_regex = get_ontology_id_regex(ot_schema_file)
    updated_mappings = set()
    nonmatching_mappings = set()

    for trait_name, uri, label in mappings:
        # Only the final path component of the URI (the term ID, e.g.
        # EFO_0000001) is validated against the schema pattern.
        if re.match(ontology_id_regex, uri.split('/')[-1]):
            updated_mappings.add((label, uri))
        else:
            nonmatching_mappings.add((label, uri))

    # Output files are named after the input file's base name (extension
    # stripped) — this is the contract the Nextflow checkMappings process
    # relies on to collect its outputs.
    filename = '.'.join(os.path.basename(mappings_file).split('.')[:-1])
    with open(f'{filename}_nonmatching.tsv', 'w', newline='') as outfile:
        writer = csv.writer(outfile, delimiter='\t')
        print(f'Removed {len(nonmatching_mappings)} nonmatching mappings')
        writer.writerows(sorted(nonmatching_mappings))

    with open(f'{filename}_updated.tsv', 'w', newline='') as outfile:
        writer = csv.writer(outfile, delimiter='\t')
        print(f'{len(updated_mappings)} mappings remaining')
        writer.writerows(sorted(updated_mappings))


if __name__ == '__main__':
    # Command-line entry point: validate the latest mappings file against the
    # identifier pattern from the Open Targets JSON schema.
    arg_parser = argparse.ArgumentParser(
        description='Check latest mappings for obsolete terms and (optionally) conformity'
                    ' against latest OT schema')
    arg_parser.add_argument('--mappings-file', required=True,
                            help='File of latest ontology mappings to process')
    arg_parser.add_argument('--ot-schema', required=True,
                            help='Open Targets schema JSON')
    cli_args = arg_parser.parse_args()
    check_mappings(cli_args.mappings_file, cli_args.ot_schema)
36 changes: 2 additions & 34 deletions pipelines/annotation_pipeline.nf
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

nextflow.enable.dsl=2

include { downloadClinvar; downloadJsonSchema } from './utils.nf'

def helpMessage() {
log.info"""
Expand Down Expand Up @@ -57,7 +58,7 @@ workflow {

if (params.schema != null) {
// Open Targets evidence string output
downloadJsonSchema()
downloadJsonSchema(params.schema)
// Get start/end indices to break XML into chunks
countClinvarRecords(clinvarXml)
.map { strN ->
Expand Down Expand Up @@ -94,39 +95,6 @@ workflow {
}
}

/*
* Download ClinVar data, using the most recent XML dump.
*/
process downloadClinvar {
label 'small_mem'

output:
path "clinvar.xml.gz", emit: clinvarXml

script:
"""
wget -O clinvar.xml.gz \
https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/RCV_release/ClinVarRCVRelease_00-latest.xml.gz
"""
}

/*
* Download the Open Targets JSON schema.
*/
process downloadJsonSchema {
label 'short_time'
label 'small_mem'

output:
path "opentargets-${params.schema}.json", emit: jsonSchema

script:
"""
wget -O opentargets-${params.schema}.json \
https://raw.githubusercontent.com/opentargets/json_schema/${params.schema}/schemas/disease_target_evidence.json
"""
}

/*
* Run simple variants (SNPs and other variants with complete coordinates) through VEP and map them
* to genes and functional consequences.
Expand Down
47 changes: 44 additions & 3 deletions pipelines/export_curation_spreadsheet.nf
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

nextflow.enable.dsl=2

include { getTargetOntology } from './utils.nf'
include { getTargetOntology; downloadJsonSchema } from './utils.nf'


def helpMessage() {
Expand All @@ -13,6 +13,7 @@ def helpMessage() {
--curation_root Directory for current batch
--input_csv Input csv file
--mappings Current mappings file (optional, will use a default path if omitted)
--schema Open Targets JSON schema version (optional, will check term IDs match schema if included)
--with_feedback Whether to generate EFO/Zooma feedback and final symlinking (default false)
"""
}
Expand All @@ -21,6 +22,7 @@ params.help = null
params.curation_root = null
params.input_csv = null
params.mappings = "\${BATCH_ROOT_BASE}/manual_curation/latest_mappings.tsv"
params.schema = null
params.with_feedback = false

if (params.help) {
Expand All @@ -37,13 +39,25 @@ codeRoot = "${projectDir}/.."
* Main workflow.
*/
workflow {
// Generate latest mappings
exportTable()
combineManualAndAutomated(exportTable.out.finishedMappings)
getTargetOntology(params.mappings)
stripMappingsHeader()
mergeWithLatestMappings(combineManualAndAutomated.out.newMappings, stripMappingsHeader.out.previousMappings)
checkDuplicates(mergeWithLatestMappings.out.newMappings)
addMappingsHeader(checkDuplicates.out.duplicatesOk, mergeWithLatestMappings.out.newMappings, getTargetOntology.out.targetOntology)

// Perform checks on latest mappings
if (params.schema != null) {
downloadJsonSchema(params.schema)
checkMappings(mergeWithLatestMappings.out.newMappings, downloadJsonSchema.out.jsonSchema)
updatedMappings = checkMappings.out.updatedMappings
} else {
updatedMappings = mergeWithLatestMappings.out.newMappings
}
checkDuplicates(updatedMappings)

// Finalise latest mappings file
addMappingsHeader(checkDuplicates.out.duplicatesOk, updatedMappings, getTargetOntology.out.targetOntology)
if (params.with_feedback) {
generateZoomaFeedback(addMappingsHeader.out.finalMappings)
updateLinks(addMappingsHeader.out.finalMappings, generateZoomaFeedback.out.zoomaFeedback)
Expand Down Expand Up @@ -168,6 +182,33 @@ process generateZoomaFeedback {
"""
}

/*
 * Check latest mappings conformity against latest OT schema.
 * Delegates to bin/trait_mapping/check_latest_mappings.py, which splits the
 * merged mappings into a conforming set and a nonmatching set; only the
 * nonmatching file is published (to the curation directory) for review.
 */
process checkMappings {
label 'short_time'
label 'small_mem'
// Publish only the removed mappings for curator inspection.
publishDir "${curationRoot}",
overwrite: true,
mode: "copy",
pattern: "*_nonmatching.tsv"

input:
path mappingsFile
path schemaFile

output:
// Names must match what check_latest_mappings.py writes: it derives them from
// the input file's base name plus the _updated / _nonmatching suffixes.
path "${mappingsFile.getBaseName()}_updated.tsv", emit: updatedMappings
path "${mappingsFile.getBaseName()}_nonmatching.tsv", emit: nonmatchingMappings

script:
"""
\${PYTHON_BIN} ${codeRoot}/bin/trait_mapping/check_latest_mappings.py \
--mappings-file ${mappingsFile} \
--ot-schema ${schemaFile}
"""
}

/*
* Check there are no complete duplicates in the final mappings file.
*/
Expand Down
18 changes: 1 addition & 17 deletions pipelines/generate_curation_spreadsheet.nf
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

nextflow.enable.dsl=2

include { getTargetOntology } from './utils.nf'
include { getTargetOntology; downloadClinvar } from './utils.nf'


def helpMessage() {
Expand Down Expand Up @@ -55,22 +55,6 @@ workflow {
createCurationTable(collectCurationTraits.out.curationTraits)
}

/*
* Download ClinVar data, using the most recent XML dump.
*/
process downloadClinvar {
label 'small_mem'

output:
path "clinvar.xml.gz", emit: clinvarXml

script:
"""
wget -O clinvar.xml.gz \
https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/RCV_release/ClinVarRCVRelease_00-latest.xml.gz
"""
}

/*
* Parse traits from ClinVar XML.
*/
Expand Down
36 changes: 36 additions & 0 deletions pipelines/utils.nf
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,39 @@ process getTargetOntology {
ONTOLOGY=\${ONTOLOGY:-EFO}
"""
}

/*
 * Download ClinVar data, using the most recent XML dump.
 * Shared helper process: included from utils.nf by the annotation and
 * curation-spreadsheet pipelines instead of being duplicated in each.
 */
process downloadClinvar {
label 'small_mem'

output:
// Compressed RCV release XML; passed downstream as-is (not decompressed here).
path "clinvar.xml.gz", emit: clinvarXml

script:
"""
wget -O clinvar.xml.gz \
https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/RCV_release/ClinVarRCVRelease_00-latest.xml.gz
"""
}

/*
 * Download the Open Targets JSON schema.
 * The schema version is taken as an input value (callers pass params.schema)
 * rather than read from params directly, so the process can be shared between
 * pipelines via utils.nf.
 */
process downloadJsonSchema {
label 'short_time'
label 'small_mem'

input:
// Git ref (e.g. a release tag) in the opentargets/json_schema repository.
val schemaVersion

output:
path "opentargets-${schemaVersion}.json", emit: jsonSchema

script:
"""
wget -O opentargets-${schemaVersion}.json \
https://raw.githubusercontent.com/opentargets/json_schema/${schemaVersion}/schemas/disease_target_evidence.json
"""
}
Loading