Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 105 additions & 0 deletions bin/trait_mapping/create_latest_mappings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
#!/usr/bin/env python3
import argparse
from copy import deepcopy
from datetime import datetime

import yaml

from cmat.output_generation.clinvar_to_evidence_strings import load_ontology_mapping
from cmat.trait_mapping.ols import is_current_and_in_ontology


OUTPUT_FILE_NAME = 'trait_names_to_ontology_mappings.tsv'
OBSOLETE_FILE_NAME = 'obsolete_mappings.tsv'
COUNTS_FILE_NAME = 'trait_counts.yml'


def create_latest_mappings(automated_mappings, curated_mappings, previous_mappings, target_ontology):
    """
    Combine automated, curated and previous mappings into the latest mapping set.

    Curated mappings take precedence over automated ones, which in turn take precedence over
    previous mappings. Also counts how many traits have their mappings updated or added by
    automation or curation (before filtering out duplicates or obsolete terms).

    :param automated_mappings: dict of trait name to mappings produced by automation
    :param curated_mappings: dict of trait name to mappings produced by manual curation
    :param previous_mappings: dict of trait name to mappings from the previous release
    :param target_ontology: ontology identifier used to check whether a term is current
    :return: tuple of (current_rows, obsolete_rows, counts), where each row list contains sorted
             (trait_name, uri, label) triples and counts is a dict of per-category trait counts
    """
    counts = {
        'n_previous_unchanged': 0,
        'n_automated_updated': 0,
        'n_automated_new': 0,
        'n_curated_updated': 0,
        'n_curated_new': 0
    }
    latest_mappings = deepcopy(previous_mappings)
    counts['n_previous_unchanged'] = len(previous_mappings)
    for trait_name in automated_mappings:
        if trait_name in latest_mappings:
            if latest_mappings[trait_name] != automated_mappings[trait_name]:
                latest_mappings[trait_name] = automated_mappings[trait_name]
                counts['n_automated_updated'] += 1
                counts['n_previous_unchanged'] -= 1
        else:
            latest_mappings[trait_name] = automated_mappings[trait_name]
            counts['n_automated_new'] += 1
    for trait_name in curated_mappings:
        if trait_name in latest_mappings:
            if latest_mappings[trait_name] != curated_mappings[trait_name]:
                latest_mappings[trait_name] = curated_mappings[trait_name]
                counts['n_curated_updated'] += 1
                # Curation overrides this trait, so decrement whichever category it was
                # previously counted in to keep the totals consistent.
                if trait_name not in previous_mappings:
                    # Newly added by automation, now overridden by curation. (Indexing
                    # previous_mappings directly here would raise KeyError.)
                    counts['n_automated_new'] -= 1
                elif trait_name in automated_mappings \
                        and previous_mappings[trait_name] != automated_mappings[trait_name]:
                    counts['n_automated_updated'] -= 1
                else:
                    counts['n_previous_unchanged'] -= 1
        else:
            latest_mappings[trait_name] = curated_mappings[trait_name]
            counts['n_curated_new'] += 1

    assert sum(counts.values()) == len(latest_mappings), 'Trait counts not consistent'

    # Ensure no duplicate rows or obsolete mappings
    current_rows = set()
    obsolete_rows = set()
    for trait_name, mappings in latest_mappings.items():
        for uri, label in mappings:
            if is_current_and_in_ontology(uri, target_ontology):
                current_rows.add((trait_name, uri, label))
            else:
                obsolete_rows.add((trait_name, uri, label))

    current_rows = sorted(list(current_rows))
    obsolete_rows = sorted(list(obsolete_rows))
    return current_rows, obsolete_rows, counts


def output_and_report(current_mappings, obsolete_mappings, target_ontology, counts):
    """
    Write the current and obsolete mapping files, print a summary, and dump counts to YAML.

    :param current_mappings: sorted (trait_name, uri, label) triples that are current in the ontology
    :param obsolete_mappings: sorted (trait_name, uri, label) triples that are obsolete or not found
    :param target_ontology: ontology identifier written into the output file header
    :param counts: dict of per-category trait counts
    """
    with open(OUTPUT_FILE_NAME, 'w') as out_file:
        out_file.write(f'#generated-date={datetime.today().strftime("%Y-%m-%d")}\n')
        # No '$' before the brace — '${target_ontology}' was a leftover of Nextflow/Groovy string
        # interpolation and wrote a literal '$' into the header.
        out_file.write(f'#ontology={target_ontology}\n')
        out_file.write('#clinvar_trait_name\turi\tlabel\n')
        for trait_name, uri, label in current_mappings:
            out_file.write(f'{trait_name}\t{uri}\t{label}\n')

    with open(OBSOLETE_FILE_NAME, 'w') as obs_file:
        for trait_name, uri, label in obsolete_mappings:
            obs_file.write(f'{trait_name}\t{uri}\t{label}\n')

    print('Number of traits with mappings:')
    print(f'\tUnchanged from previous: {counts["n_previous_unchanged"]}')
    print(f'\tUpdated by automation: {counts["n_automated_updated"]}')
    print(f'\tAdded by automation: {counts["n_automated_new"]}')
    print(f'\tUpdated by curation: {counts["n_curated_updated"]}')
    print(f'\tAdded by curation: {counts["n_curated_new"]}')
    with open(COUNTS_FILE_NAME, 'w') as counts_file:
        yaml.dump(counts, counts_file)


def main():
    """Parse arguments, load the three mapping files, and generate the latest mapping outputs."""
    parser = argparse.ArgumentParser(
        description='Create latest mapping file by combining automated, curated, and previous mappings')
    parser.add_argument('--automated', required=True, help='Path to automated mappings TSV file')
    parser.add_argument('--curated', required=True, help='Path to curated mappings TSV file')
    parser.add_argument('--previous', required=True, help='Path to previous mappings TSV file')
    args = parser.parse_args()

    automated_mappings, _, _ = load_ontology_mapping(args.automated)
    curated_mappings, _, _ = load_ontology_mapping(args.curated)
    # The target ontology is taken from the previous mappings file header.
    previous_mappings, target_ontology, _ = load_ontology_mapping(args.previous)

    current_mappings, obsolete_mappings, counts = create_latest_mappings(automated_mappings, curated_mappings,
                                                                         previous_mappings, target_ontology)
    output_and_report(current_mappings, obsolete_mappings, target_ontology, counts)


if __name__ == '__main__':
    main()
29 changes: 29 additions & 0 deletions docs/manual-curation/step2-manual-curation.md
Original file line number Diff line number Diff line change
Expand Up @@ -118,9 +118,38 @@ to http://www.ebi.ac.uk/efo/EFO_0000612 “Myocardial infarction”.
To do this, **duplicate** the row containing the disease string, assign different mappings in each of the rows, and mark
them both with an appropriate status. This will be handled downstream during export and evidence string generation.

The result in the spreadsheet might look like this (some columns omitted for brevity):

| Mapping to use | Status | ClinVar label | Previous mapping | Replacement mapping | Exact matches |
|-|-|-|-|-|-|
| `http://www.ebi.ac.uk/efo/EFO_0001645\|Coronary artery disease\|\|EFO_CURRENT` | DONE | coronary artery disease/myocardial infarction | | | |
| `http://www.ebi.ac.uk/efo/EFO_0000612\|Myocardial infarction\|\|EFO_CURRENT` | DONE | coronary artery disease/myocardial infarction | | | |

This provision does _not_ apply to cases where the source string contains additional semantic context, such as
“susceptibility to...” or “resistance to...”, or drug response terms.

If a disease string was previously mapped to multiple ontology terms, it will appear as two nearly identical rows with
different values in the "Previous mapping" column. These rows can be kept and curated as usual if the above case
applies.

However, if the multiple mapping is not appropriate (i.e. the string really does contain only a single trait and was
multiply mapped due to an error), then you can **delete** any unnecessary row(s) and proceed with curation as usual.
The export pipeline will handle this modification accordingly.

For example (some columns omitted for brevity):

| Mapping to use | Status | ClinVar label | Previous mapping | Replacement mapping | Exact matches |
|-|-|-|-|-|-|
| | | lissencephaly 8 | `http://purl.obolibrary.org/obo/HP_0001339\|Lissencephaly\|TOKEN_MATCH_LABEL\|EFO_CURRENT` | | `http://purl.obolibrary.org/obo/MONDO_0014992\|lissencephaly 8\|EXACT_MATCH_LABEL\|MONDO_HP_NOT_EFO` |
| | | lissencephaly 8 | `http://purl.obolibrary.org/obo/MONDO_0018838\|lissencephaly spectrum disorders\|TOKEN_MATCH_LABEL\|EFO_CURRENT` | | `http://purl.obolibrary.org/obo/MONDO_0014992\|lissencephaly 8\|EXACT_MATCH_LABEL\|MONDO_HP_NOT_EFO` |

Here "lissencephaly 8" refers to a single disease, and has an exact label match in MONDO that we can import. So we
should delete one of the rows (in this case it doesn't matter which) and use the mapping string from "Exact matches":

| Mapping to use | Status | ClinVar label | Previous mapping | Replacement mapping | Exact matches |
|-|-|-|-|-|-|
| `http://purl.obolibrary.org/obo/MONDO_0014992\|lissencephaly 8\|EXACT_MATCH_LABEL\|MONDO_HP_NOT_EFO` | IMPORT | lissencephaly 8 | `http://purl.obolibrary.org/obo/HP_0001339\|Lissencephaly\|TOKEN_MATCH_LABEL\|EFO_CURRENT` | | `http://purl.obolibrary.org/obo/MONDO_0014992\|lissencephaly 8\|EXACT_MATCH_LABEL\|MONDO_HP_NOT_EFO` |

### Note on spaces and line breaks

Sometimes, especially when copy-pasting information from external sources, a mapping label or URL can contain an
Expand Down
3 changes: 0 additions & 3 deletions docs/manual-curation/step3-export-results.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,6 @@ nextflow run ${CODE_ROOT}/pipelines/export_curation_spreadsheet.nf \
-resume
```

### Duplication checks
The automated pipeline checks for complete duplicates in the list of text-to-ontology mappings. If this check fails, resolve this by editing the `${BATCH_ROOT_BASE}/manual_curation/latest_mappings.tsv` file directly.

## Check and correct known problematic mappings
There is a [spreadsheet](https://docs.google.com/spreadsheets/d/1m4ld3y3Pfust5JSOJOX9ZmImRCKRGi-fGYj_dExoGj8/edit) which was created to track trait-to-ontology mappings which were especially problematic in the past to users of Open Targets platform. Prior to running subsequent steps, make sure that all traits mentioned in that spreadsheet are mapped to the correct ontology terms in `${BATCH_ROOT_BASE}/manual_curation/latest_mappings.tsv`.

Expand Down
135 changes: 24 additions & 111 deletions pipelines/export_curation_spreadsheet.nf
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@

nextflow.enable.dsl=2

include { getTargetOntology } from './utils.nf'


def helpMessage() {
log.info"""
Expand All @@ -26,7 +24,7 @@ params.with_feedback = false
if (params.help) {
exit 0, helpMessage()
}
if (!params.curation_root or !params.input_csv) {
if (!params.curation_root || !params.input_csv) {
exit 1, helpMessage()
}
curationRoot = params.curation_root
Expand All @@ -37,18 +35,13 @@ codeRoot = "${projectDir}/.."
* Main workflow.
*/
workflow {
// Generate latest mappings
exportTable()
combineManualAndAutomated(exportTable.out.finishedMappings)
getTargetOntology(params.mappings)
stripMappingsHeader()
mergeWithLatestMappings(combineManualAndAutomated.out.newMappings, stripMappingsHeader.out.previousMappings)

// Perform checks on latest mappings
checkDuplicates(mergeWithLatestMappings.out.newMappings)
createLatestMappings(
Channel.of("${curationRoot}/automated_trait_mappings.tsv"),
exportTable.out.finishedMappings,
Channel.of(params.mappings)
)

// Finalise latest mappings file
addMappingsHeader(checkDuplicates.out.duplicatesOk, mergeWithLatestMappings.out.newMappings, getTargetOntology.out.targetOntology)
if (params.with_feedback) {
generateZoomaFeedback(addMappingsHeader.out.finalMappings)
updateLinks(addMappingsHeader.out.finalMappings, generateZoomaFeedback.out.zoomaFeedback)
Expand Down Expand Up @@ -81,66 +74,33 @@ process exportTable {
}

/*
* Strip header from existing mappings file.
* Create latest mappings file.
*/
process stripMappingsHeader {
label 'short_time'
process createLatestMappings {
label 'default_time'
label 'small_mem'

output:
path "previous_mappings.tsv", emit: previousMappings

script:
"""
grep -v "^#" ${params.mappings} > previous_mappings.tsv
"""
}

/*
* Concatenate finished automated and manual mappings into a single file.
*/
process combineManualAndAutomated {
label 'short_time'
label 'small_mem'

input:
path finishedMappings

output:
path "mappings_no_header.tsv", emit: newMappings

script:
"""
cat ${curationRoot}/automated_trait_mappings.tsv ${finishedMappings} \
| sort -u > mappings_no_header.tsv
"""
}

/*
* Add all mappings from the database which are *not* present in the results of the current curation iteration (automated
* + manually curated). This is done in order to never lose mappings, even if they are not present in ClinVar during the
* latest curation iteration.
*/
process mergeWithLatestMappings {
label 'short_time'
label 'small_mem'
publishDir "${curationRoot}",
overwrite: true,
mode: "copy",
pattern: "*.{tsv,yml}"

input:
path newMappings
path previousMappings
val automatedMappings
path curatedMappings
val previousMappings

output:
path newMappings, emit: newMappings
path "trait_names_to_ontology_mappings.tsv", emit: finalMappings
path "obsolete_mappings.tsv", emit: obsoleteMappings
path "trait_counts.yml", emit: counts

script:
"""
# The first file operand is the list of mappings in the current database; and the second is the list of trait names
# which are only present in the existing database and not in the new mappings.
export LC_ALL=C
join -j 1 -t \$'\t' \
<(sort -t \$'\t' -k 1,1 ${previousMappings}) \
<(comm -23 <(cut -d \$'\t' -f 1 ${previousMappings} | sort -u) <(cut -d \$'\t' -f 1 ${newMappings} | sort -u)) \
>> ${newMappings}
\${PYTHON_BIN} ${codeRoot}/bin/trait_mapping/create_latest_mappings.py \
--automated ${automatedMappings} \
--curated ${curatedMappings} \
--previous ${previousMappings}
"""
}

Expand All @@ -150,6 +110,7 @@ process mergeWithLatestMappings {
process generateZoomaFeedback {
label 'short_time'
label 'small_mem'

publishDir "${curationRoot}",
overwrite: true,
mode: "copy",
Expand All @@ -173,54 +134,6 @@ process generateZoomaFeedback {
"""
}

/*
* Check there are no complete duplicates in the final mappings file.
*/
process checkDuplicates {
label 'short_time'
label 'small_mem'

input:
path newMappings

output:
val true, emit: duplicatesOk // ensure we don't do the final linking if this check fails

script:
"""
sort ${newMappings} | uniq -c | awk '\$1 > 1' > duplicates.tsv
[[ ! -s duplicates.tsv ]]
"""
}

/*
* Add generated date and target ontology to header of final mappings file.
*/
process addMappingsHeader {
label 'short_time'
label 'small_mem'
publishDir "${curationRoot}",
overwrite: true,
mode: "copy",
pattern: "*.tsv"

input:
val duplicatesOk
path newMappings
val targetOntology

output:
path "trait_names_to_ontology_mappings.tsv", emit: finalMappings

script:
"""
printf '#generated-date=%(%Y-%m-%d)T\n' > trait_names_to_ontology_mappings.tsv
printf '#ontology=${targetOntology}\n' >> trait_names_to_ontology_mappings.tsv
printf '#clinvar_trait_name\turi\tlabel\n' >> trait_names_to_ontology_mappings.tsv
cat ${newMappings} >> trait_names_to_ontology_mappings.tsv
"""
}

/*
* Update the symbolic links pointing to the location of the most recent curation result and ZOOMA feedback dataset.
*/
Expand Down
Loading