Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 105 additions & 0 deletions bin/trait_mapping/create_latest_mappings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
#!/usr/bin/env python3
import argparse
from copy import deepcopy
from datetime import datetime

import yaml

from cmat.output_generation.clinvar_to_evidence_strings import load_ontology_mapping
from cmat.trait_mapping.ols import is_current_and_in_ontology


OUTPUT_FILE_NAME = 'trait_names_to_ontology_mappings.tsv'
OBSOLETE_FILE_NAME = 'obsolete_mappings.tsv'
COUNTS_FILE_NAME = 'trait_counts.yml'


def create_latest_mappings(automated_mappings, curated_mappings, previous_mappings, target_ontology):
    """
    Combine automated, curated and previous mappings into the latest mapping set.

    Curated mappings take precedence over automated ones, which in turn take precedence over
    previous mappings. Also counts how many traits have their mappings updated or added by
    automation or curation (before filtering out duplicates or obsolete terms).

    :param automated_mappings: dict of trait name to mappings produced by automation
    :param curated_mappings: dict of trait name to mappings produced by manual curation
    :param previous_mappings: dict of trait name to mappings from the previous release
    :param target_ontology: ontology identifier used to check whether a term is current
    :return: tuple of (current_rows, obsolete_rows, counts), where each row list contains sorted
             (trait_name, uri, label) triples and counts is a dict of per-category trait counts
    """
    counts = {
        'n_previous_unchanged': 0,
        'n_automated_updated': 0,
        'n_automated_new': 0,
        'n_curated_updated': 0,
        'n_curated_new': 0
    }
    latest_mappings = deepcopy(previous_mappings)
    counts['n_previous_unchanged'] = len(previous_mappings)
    for trait_name in automated_mappings:
        if trait_name in latest_mappings:
            if latest_mappings[trait_name] != automated_mappings[trait_name]:
                latest_mappings[trait_name] = automated_mappings[trait_name]
                counts['n_automated_updated'] += 1
                counts['n_previous_unchanged'] -= 1
        else:
            latest_mappings[trait_name] = automated_mappings[trait_name]
            counts['n_automated_new'] += 1
    for trait_name in curated_mappings:
        if trait_name in latest_mappings:
            if latest_mappings[trait_name] != curated_mappings[trait_name]:
                latest_mappings[trait_name] = curated_mappings[trait_name]
                counts['n_curated_updated'] += 1
                # Curation overrides this trait, so decrement whichever category it was
                # previously counted in to keep the totals consistent.
                if trait_name not in previous_mappings:
                    # Newly added by automation, now overridden by curation. (Indexing
                    # previous_mappings directly here would raise KeyError.)
                    counts['n_automated_new'] -= 1
                elif trait_name in automated_mappings \
                        and previous_mappings[trait_name] != automated_mappings[trait_name]:
                    counts['n_automated_updated'] -= 1
                else:
                    counts['n_previous_unchanged'] -= 1
        else:
            latest_mappings[trait_name] = curated_mappings[trait_name]
            counts['n_curated_new'] += 1

    assert sum(counts.values()) == len(latest_mappings), 'Trait counts not consistent'

    # Ensure no duplicate rows or obsolete mappings
    current_rows = set()
    obsolete_rows = set()
    for trait_name, mappings in latest_mappings.items():
        for uri, label in mappings:
            if is_current_and_in_ontology(uri, target_ontology):
                current_rows.add((trait_name, uri, label))
            else:
                obsolete_rows.add((trait_name, uri, label))

    current_rows = sorted(list(current_rows))
    obsolete_rows = sorted(list(obsolete_rows))
    return current_rows, obsolete_rows, counts


def output_and_report(current_mappings, obsolete_mappings, target_ontology, counts):
    """
    Write the current and obsolete mapping files, print a summary, and dump counts to YAML.

    :param current_mappings: sorted (trait_name, uri, label) triples that are current in the ontology
    :param obsolete_mappings: sorted (trait_name, uri, label) triples that are obsolete or not found
    :param target_ontology: ontology identifier written into the output file header
    :param counts: dict of per-category trait counts
    """
    with open(OUTPUT_FILE_NAME, 'w') as out_file:
        out_file.write(f'#generated-date={datetime.today().strftime("%Y-%m-%d")}\n')
        # No '$' before the brace — '${target_ontology}' was a leftover of Nextflow/Groovy string
        # interpolation and wrote a literal '$' into the header.
        out_file.write(f'#ontology={target_ontology}\n')
        out_file.write('#clinvar_trait_name\turi\tlabel\n')
        for trait_name, uri, label in current_mappings:
            out_file.write(f'{trait_name}\t{uri}\t{label}\n')

    with open(OBSOLETE_FILE_NAME, 'w') as obs_file:
        for trait_name, uri, label in obsolete_mappings:
            obs_file.write(f'{trait_name}\t{uri}\t{label}\n')

    print('Number of traits with mappings:')
    print(f'\tUnchanged from previous: {counts["n_previous_unchanged"]}')
    print(f'\tUpdated by automation: {counts["n_automated_updated"]}')
    print(f'\tAdded by automation: {counts["n_automated_new"]}')
    print(f'\tUpdated by curation: {counts["n_curated_updated"]}')
    print(f'\tAdded by curation: {counts["n_curated_new"]}')
    with open(COUNTS_FILE_NAME, 'w') as counts_file:
        yaml.dump(counts, counts_file)


def main():
    """Parse arguments, load the three mapping files, and generate the latest mapping outputs."""
    parser = argparse.ArgumentParser(
        description='Create latest mapping file by combining automated, curated, and previous mappings')
    parser.add_argument('--automated', required=True, help='Path to automated mappings TSV file')
    parser.add_argument('--curated', required=True, help='Path to curated mappings TSV file')
    parser.add_argument('--previous', required=True, help='Path to previous mappings TSV file')
    args = parser.parse_args()

    automated_mappings, _, _ = load_ontology_mapping(args.automated)
    curated_mappings, _, _ = load_ontology_mapping(args.curated)
    # The target ontology is taken from the previous mappings file header.
    previous_mappings, target_ontology, _ = load_ontology_mapping(args.previous)

    current_mappings, obsolete_mappings, counts = create_latest_mappings(automated_mappings, curated_mappings,
                                                                         previous_mappings, target_ontology)
    output_and_report(current_mappings, obsolete_mappings, target_ontology, counts)


if __name__ == '__main__':
    main()
29 changes: 29 additions & 0 deletions docs/manual-curation/step2-manual-curation.md
Original file line number Diff line number Diff line change
Expand Up @@ -118,9 +118,38 @@ to http://www.ebi.ac.uk/efo/EFO_0000612 “Myocardial infarction”.
To do this, **duplicate** the row containing the disease string, assign different mappings in each of the rows, and mark
them both with an appropriate status. This will be handled downstream during export and evidence string generation.

The result in the spreadsheet might look like this (some columns omitted for brevity):

| Mapping to use | Status | ClinVar label | Previous mapping | Replacement mapping | Exact matches |
|-|-|-|-|-|-|
| `http://www.ebi.ac.uk/efo/EFO_0001645\|Coronary artery disease\|\|EFO_CURRENT` | DONE | coronary artery disease/myocardial infarction | | | |
| `http://www.ebi.ac.uk/efo/EFO_0000612\|Myocardial infarction\|\|EFO_CURRENT` | DONE | coronary artery disease/myocardial infarction | | | |

This provision does _not_ apply to cases where the source string contains additional semantic context, such as
“susceptibility to...” or “resistance to...”, or drug response terms.

If a disease string was previously mapped to multiple ontology terms, it will appear as two nearly identical rows with
different values in the "Previous mapping" column. These rows can be kept and curated as usual if the above case
applies.

However, if the multiple mapping is not appropriate (i.e. the string really does contain only a single trait and was
multiply mapped due to an error), then you can **delete** any unnecessary row(s) and proceed with curation as usual.
The export pipeline will handle this modification accordingly.

For example (some columns omitted for brevity):

| Mapping to use | Status | ClinVar label | Previous mapping | Replacement mapping | Exact matches |
|-|-|-|-|-|-|
| | | lissencephaly 8 | `http://purl.obolibrary.org/obo/HP_0001339\|Lissencephaly\|TOKEN_MATCH_LABEL\|EFO_CURRENT` | | `http://purl.obolibrary.org/obo/MONDO_0014992\|lissencephaly 8\|EXACT_MATCH_LABEL\|MONDO_HP_NOT_EFO` |
| | | lissencephaly 8 | `http://purl.obolibrary.org/obo/MONDO_0018838\|lissencephaly spectrum disorders\|TOKEN_MATCH_LABEL\|EFO_CURRENT` | | `http://purl.obolibrary.org/obo/MONDO_0014992\|lissencephaly 8\|EXACT_MATCH_LABEL\|MONDO_HP_NOT_EFO` |

Here "lissencephaly 8" refers to a single disease, and has an exact label match in MONDO that we can import. So we
should delete one of the rows (in this case it doesn't matter which) and use the mapping string from "Exact matches":

| Mapping to use | Status | ClinVar label | Previous mapping | Replacement mapping | Exact matches |
|-|-|-|-|-|-|
| `http://purl.obolibrary.org/obo/MONDO_0014992\|lissencephaly 8\|EXACT_MATCH_LABEL\|MONDO_HP_NOT_EFO` | IMPORT | lissencephaly 8 | `http://purl.obolibrary.org/obo/HP_0001339\|Lissencephaly\|TOKEN_MATCH_LABEL\|EFO_CURRENT` | | `http://purl.obolibrary.org/obo/MONDO_0014992\|lissencephaly 8\|EXACT_MATCH_LABEL\|MONDO_HP_NOT_EFO` |

### Note on spaces and line breaks

Sometimes, especially when copy-pasting information from external sources, a mapping label or URL can contain an
Expand Down
3 changes: 0 additions & 3 deletions docs/manual-curation/step3-export-results.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,6 @@ nextflow run ${CODE_ROOT}/pipelines/export_curation_spreadsheet.nf \
-resume
```

### Duplication checks
The automated pipeline checks for complete duplicates in the list of text-to-ontology mappings. If this check fails, resolve this by editing the `${BATCH_ROOT_BASE}/manual_curation/latest_mappings.tsv` file directly.

## Check and correct known problematic mappings
There is a [spreadsheet](https://docs.google.com/spreadsheets/d/1m4ld3y3Pfust5JSOJOX9ZmImRCKRGi-fGYj_dExoGj8/edit) which was created to track trait-to-ontology mappings which were especially problematic in the past to users of Open Targets platform. Prior to running subsequent steps, make sure that all traits mentioned in that spreadsheet are mapped to the correct ontology terms in `${BATCH_ROOT_BASE}/manual_curation/latest_mappings.tsv`.

Expand Down
135 changes: 24 additions & 111 deletions pipelines/export_curation_spreadsheet.nf
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@

nextflow.enable.dsl=2

include { getTargetOntology } from './utils.nf'


def helpMessage() {
log.info"""
Expand All @@ -26,7 +24,7 @@ params.with_feedback = false
if (params.help) {
exit 0, helpMessage()
}
if (!params.curation_root or !params.input_csv) {
if (!params.curation_root || !params.input_csv) {
exit 1, helpMessage()
}
curationRoot = params.curation_root
Expand All @@ -37,18 +35,13 @@ codeRoot = "${projectDir}/.."
* Main workflow.
*/
workflow {
// Generate latest mappings
exportTable()
combineManualAndAutomated(exportTable.out.finishedMappings)
getTargetOntology(params.mappings)
stripMappingsHeader()
mergeWithLatestMappings(combineManualAndAutomated.out.newMappings, stripMappingsHeader.out.previousMappings)

// Perform checks on latest mappings
checkDuplicates(mergeWithLatestMappings.out.newMappings)
createLatestMappings(
Channel.of("${curationRoot}/automated_trait_mappings.tsv"),
exportTable.out.finishedMappings,
Channel.of(params.mappings)
)

// Finalise latest mappings file
addMappingsHeader(checkDuplicates.out.duplicatesOk, mergeWithLatestMappings.out.newMappings, getTargetOntology.out.targetOntology)
if (params.with_feedback) {
generateZoomaFeedback(addMappingsHeader.out.finalMappings)
updateLinks(addMappingsHeader.out.finalMappings, generateZoomaFeedback.out.zoomaFeedback)
Expand Down Expand Up @@ -81,66 +74,33 @@ process exportTable {
}

/*
* Strip header from existing mappings file.
* Create latest mappings file.
*/
process stripMappingsHeader {
label 'short_time'
process createLatestMappings {
label 'default_time'
label 'small_mem'

output:
path "previous_mappings.tsv", emit: previousMappings

script:
"""
grep -v "^#" ${params.mappings} > previous_mappings.tsv
"""
}

/*
* Concatenate finished automated and manual mappings into a single file.
*/
process combineManualAndAutomated {
label 'short_time'
label 'small_mem'

input:
path finishedMappings

output:
path "mappings_no_header.tsv", emit: newMappings

script:
"""
cat ${curationRoot}/automated_trait_mappings.tsv ${finishedMappings} \
| sort -u > mappings_no_header.tsv
"""
}

/*
* Add all mappings from the database which are *not* present in the results of the current curation iteration (automated
* + manually curated). This is done in order to never lose mappings, even if they are not present in ClinVar during the
* latest curation iteration.
*/
process mergeWithLatestMappings {
label 'short_time'
label 'small_mem'
publishDir "${curationRoot}",
overwrite: true,
mode: "copy",
pattern: "*.{tsv,yml}"

input:
path newMappings
path previousMappings
val automatedMappings
path curatedMappings
val previousMappings

output:
path newMappings, emit: newMappings
path "trait_names_to_ontology_mappings.tsv", emit: finalMappings
path "obsolete_mappings.tsv", emit: obsoleteMappings
path "trait_counts.yml", emit: counts

script:
"""
# The first file operand is the list of mappings in the current database; and the second is the list of trait names
# which are only present in the existing database and not in the new mappings.
export LC_ALL=C
join -j 1 -t \$'\t' \
<(sort -t \$'\t' -k 1,1 ${previousMappings}) \
<(comm -23 <(cut -d \$'\t' -f 1 ${previousMappings} | sort -u) <(cut -d \$'\t' -f 1 ${newMappings} | sort -u)) \
>> ${newMappings}
\${PYTHON_BIN} ${codeRoot}/bin/trait_mapping/create_latest_mappings.py \
--automated ${automatedMappings} \
--curated ${curatedMappings} \
--previous ${previousMappings}
"""
}

Expand All @@ -150,6 +110,7 @@ process mergeWithLatestMappings {
process generateZoomaFeedback {
label 'short_time'
label 'small_mem'

publishDir "${curationRoot}",
overwrite: true,
mode: "copy",
Expand All @@ -173,54 +134,6 @@ process generateZoomaFeedback {
"""
}

/*
* Check there are no complete duplicates in the final mappings file.
*/
process checkDuplicates {
label 'short_time'
label 'small_mem'

input:
path newMappings

output:
val true, emit: duplicatesOk // ensure we don't do the final linking if this check fails

script:
"""
sort ${newMappings} | uniq -c | awk '\$1 > 1' > duplicates.tsv
[[ ! -s duplicates.tsv ]]
"""
}

/*
* Add generated date and target ontology to header of final mappings file.
*/
process addMappingsHeader {
label 'short_time'
label 'small_mem'
publishDir "${curationRoot}",
overwrite: true,
mode: "copy",
pattern: "*.tsv"

input:
val duplicatesOk
path newMappings
val targetOntology

output:
path "trait_names_to_ontology_mappings.tsv", emit: finalMappings

script:
"""
printf '#generated-date=%(%Y-%m-%d)T\n' > trait_names_to_ontology_mappings.tsv
printf '#ontology=${targetOntology}\n' >> trait_names_to_ontology_mappings.tsv
printf '#clinvar_trait_name\turi\tlabel\n' >> trait_names_to_ontology_mappings.tsv
cat ${newMappings} >> trait_names_to_ontology_mappings.tsv
"""
}

/*
* Update the symbolic links pointing to the location of the most recent curation result and ZOOMA feedback dataset.
*/
Expand Down
Loading