Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 58 additions & 0 deletions bin/trait_mapping/check_latest_mappings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#!/usr/bin/env python3

import argparse
import csv
import json
import os
import re


def get_ontology_id_regex(ot_schema_file):
    """
    Extract the regular expression used to validate ontology term IDs.

    :param ot_schema_file: path to Open Targets JSON schema, or None/empty to
        accept any identifier
    :return: pattern string for the diseaseFromSourceMappedId field
    """
    if not ot_schema_file:
        # No schema supplied: fall back to a pattern that matches everything.
        return '.*'
    with open(ot_schema_file, 'r') as f:
        schema = json.load(f)
    return schema['definitions']['diseaseFromSourceMappedId']['pattern']


def check_mappings(mappings_file, ot_schema_file):
    """
    Check mappings for conformity against regex in latest OT schema.
    Outputs a new mappings file (<basename>_updated.tsv) and a file of the
    mappings that have been removed (<basename>_nonmatching.tsv), both written
    to the current working directory.

    :param mappings_file: path to mappings file (tab-delimited, no header,
        columns: trait name, term URI, term label)
    :param ot_schema_file: path to Open Targets JSON schema
    """
    with open(mappings_file, 'r', newline='') as f:
        mappings = list(csv.reader(f, delimiter='\t'))
    ontology_id_regex = get_ontology_id_regex(ot_schema_file)
    updated_mappings = set()
    nonmatching_mappings = set()

    for trait_name, uri, label in mappings:
        # Only the final path component of the URI (the term ID, e.g.
        # EFO_0000001) is validated against the schema pattern.
        if re.match(ontology_id_regex, uri.split('/')[-1]):
            updated_mappings.add((label, uri))
        else:
            nonmatching_mappings.add((label, uri))

    # Output files are named after the input file's base name (extension
    # stripped) — this is the contract the Nextflow checkMappings process
    # relies on to collect its outputs.
    filename = '.'.join(os.path.basename(mappings_file).split('.')[:-1])
    with open(f'{filename}_nonmatching.tsv', 'w', newline='') as outfile:
        writer = csv.writer(outfile, delimiter='\t')
        print(f'Removed {len(nonmatching_mappings)} nonmatching mappings')
        writer.writerows(sorted(nonmatching_mappings))

    with open(f'{filename}_updated.tsv', 'w', newline='') as outfile:
        writer = csv.writer(outfile, delimiter='\t')
        print(f'{len(updated_mappings)} mappings remaining')
        writer.writerows(sorted(updated_mappings))


if __name__ == '__main__':
    # Command-line entry point: validate the latest mappings file against the
    # identifier pattern from the Open Targets JSON schema.
    arg_parser = argparse.ArgumentParser(
        description='Check latest mappings for obsolete terms and (optionally) conformity'
                    ' against latest OT schema')
    arg_parser.add_argument('--mappings-file', required=True,
                            help='File of latest ontology mappings to process')
    arg_parser.add_argument('--ot-schema', required=True,
                            help='Open Targets schema JSON')
    cli_args = arg_parser.parse_args()
    check_mappings(cli_args.mappings_file, cli_args.ot_schema)
36 changes: 2 additions & 34 deletions pipelines/annotation_pipeline.nf
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

nextflow.enable.dsl=2

include { downloadClinvar; downloadJsonSchema } from './utils.nf'

def helpMessage() {
log.info"""
Expand Down Expand Up @@ -57,7 +58,7 @@ workflow {

if (params.schema != null) {
// Open Targets evidence string output
downloadJsonSchema()
downloadJsonSchema(params.schema)
// Get start/end indices to break XML into chunks
countClinvarRecords(clinvarXml)
.map { strN ->
Expand Down Expand Up @@ -94,39 +95,6 @@ workflow {
}
}

/*
* Download ClinVar data, using the most recent XML dump.
*/
process downloadClinvar {
label 'small_mem'

output:
path "clinvar.xml.gz", emit: clinvarXml

script:
"""
wget -O clinvar.xml.gz \
https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/RCV_release/ClinVarRCVRelease_00-latest.xml.gz
"""
}

/*
* Download the Open Targets JSON schema.
*/
process downloadJsonSchema {
label 'short_time'
label 'small_mem'

output:
path "opentargets-${params.schema}.json", emit: jsonSchema

script:
"""
wget -O opentargets-${params.schema}.json \
https://raw.githubusercontent.com/opentargets/json_schema/${params.schema}/schemas/disease_target_evidence.json
"""
}

/*
* Run simple variants (SNPs and other variants with complete coordinates) through VEP and map them
* to genes and functional consequences.
Expand Down
47 changes: 44 additions & 3 deletions pipelines/export_curation_spreadsheet.nf
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

nextflow.enable.dsl=2

include { getTargetOntology } from './utils.nf'
include { getTargetOntology; downloadJsonSchema } from './utils.nf'


def helpMessage() {
Expand All @@ -13,6 +13,7 @@ def helpMessage() {
--curation_root Directory for current batch
--input_csv Input csv file
--mappings Current mappings file (optional, will use a default path if omitted)
--schema Open Targets JSON schema version (optional, will check term IDs match schema if included)
--with_feedback Whether to generate EFO/Zooma feedback and final symlinking (default false)
"""
}
Expand All @@ -21,6 +22,7 @@ params.help = null
params.curation_root = null
params.input_csv = null
params.mappings = "\${BATCH_ROOT_BASE}/manual_curation/latest_mappings.tsv"
params.schema = null
params.with_feedback = false

if (params.help) {
Expand All @@ -37,13 +39,25 @@ codeRoot = "${projectDir}/.."
* Main workflow.
*/
workflow {
// Generate latest mappings
exportTable()
combineManualAndAutomated(exportTable.out.finishedMappings)
getTargetOntology(params.mappings)
stripMappingsHeader()
mergeWithLatestMappings(combineManualAndAutomated.out.newMappings, stripMappingsHeader.out.previousMappings)
checkDuplicates(mergeWithLatestMappings.out.newMappings)
addMappingsHeader(checkDuplicates.out.duplicatesOk, mergeWithLatestMappings.out.newMappings, getTargetOntology.out.targetOntology)

// Perform checks on latest mappings
if (params.schema != null) {
downloadJsonSchema(params.schema)
checkMappings(mergeWithLatestMappings.out.newMappings, downloadJsonSchema.out.jsonSchema)
updatedMappings = checkMappings.out.updatedMappings
} else {
updatedMappings = mergeWithLatestMappings.out.newMappings
}
checkDuplicates(updatedMappings)

// Finalise latest mappings file
addMappingsHeader(checkDuplicates.out.duplicatesOk, updatedMappings, getTargetOntology.out.targetOntology)
if (params.with_feedback) {
generateZoomaFeedback(addMappingsHeader.out.finalMappings)
updateLinks(addMappingsHeader.out.finalMappings, generateZoomaFeedback.out.zoomaFeedback)
Expand Down Expand Up @@ -168,6 +182,33 @@ process generateZoomaFeedback {
"""
}

/*
 * Check latest mappings conformity against latest OT schema.
 * Delegates to bin/trait_mapping/check_latest_mappings.py, which splits the
 * merged mappings into a conforming set and a nonmatching set; only the
 * nonmatching file is published (to the curation directory) for review.
 */
process checkMappings {
label 'short_time'
label 'small_mem'
// Publish only the removed mappings for curator inspection.
publishDir "${curationRoot}",
overwrite: true,
mode: "copy",
pattern: "*_nonmatching.tsv"

input:
path mappingsFile
path schemaFile

output:
// Names must match what check_latest_mappings.py writes: it derives them from
// the input file's base name plus the _updated / _nonmatching suffixes.
path "${mappingsFile.getBaseName()}_updated.tsv", emit: updatedMappings
path "${mappingsFile.getBaseName()}_nonmatching.tsv", emit: nonmatchingMappings

script:
"""
\${PYTHON_BIN} ${codeRoot}/bin/trait_mapping/check_latest_mappings.py \
--mappings-file ${mappingsFile} \
--ot-schema ${schemaFile}
"""
}

/*
* Check there are no complete duplicates in the final mappings file.
*/
Expand Down
18 changes: 1 addition & 17 deletions pipelines/generate_curation_spreadsheet.nf
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

nextflow.enable.dsl=2

include { getTargetOntology } from './utils.nf'
include { getTargetOntology; downloadClinvar } from './utils.nf'


def helpMessage() {
Expand Down Expand Up @@ -55,22 +55,6 @@ workflow {
createCurationTable(collectCurationTraits.out.curationTraits)
}

/*
* Download ClinVar data, using the most recent XML dump.
*/
process downloadClinvar {
label 'small_mem'

output:
path "clinvar.xml.gz", emit: clinvarXml

script:
"""
wget -O clinvar.xml.gz \
https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/RCV_release/ClinVarRCVRelease_00-latest.xml.gz
"""
}

/*
* Parse traits from ClinVar XML.
*/
Expand Down
36 changes: 36 additions & 0 deletions pipelines/utils.nf
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,39 @@ process getTargetOntology {
ONTOLOGY=\${ONTOLOGY:-EFO}
"""
}

/*
 * Download ClinVar data, using the most recent XML dump.
 * Shared helper process: included from utils.nf by the annotation and
 * curation-spreadsheet pipelines instead of being duplicated in each.
 */
process downloadClinvar {
label 'small_mem'

output:
// Compressed RCV release XML; passed downstream as-is (not decompressed here).
path "clinvar.xml.gz", emit: clinvarXml

script:
"""
wget -O clinvar.xml.gz \
https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/RCV_release/ClinVarRCVRelease_00-latest.xml.gz
"""
}

/*
 * Download the Open Targets JSON schema.
 * The schema version is taken as an input value (callers pass params.schema)
 * rather than read from params directly, so the process can be shared between
 * pipelines via utils.nf.
 */
process downloadJsonSchema {
label 'short_time'
label 'small_mem'

input:
// Git ref (e.g. a release tag) in the opentargets/json_schema repository.
val schemaVersion

output:
path "opentargets-${schemaVersion}.json", emit: jsonSchema

script:
"""
wget -O opentargets-${schemaVersion}.json \
https://raw.githubusercontent.com/opentargets/json_schema/${schemaVersion}/schemas/disease_target_evidence.json
"""
}
Loading