diff --git a/bin/cmat/VERSION b/bin/cmat/VERSION index 8cf6caf5..7387b31e 100644 --- a/bin/cmat/VERSION +++ b/bin/cmat/VERSION @@ -1 +1 @@ -3.4.1 \ No newline at end of file +3.4.2.dev0 \ No newline at end of file diff --git a/cmat/clinvar_xml_io/clinical_classification.py b/cmat/clinvar_xml_io/clinical_classification.py index 6db60679..5139a994 100644 --- a/cmat/clinvar_xml_io/clinical_classification.py +++ b/cmat/clinvar_xml_io/clinical_classification.py @@ -29,6 +29,13 @@ class ClinicalClassification: 'reviewed by expert panel': 3, 'practice guideline': 4, } + # Map clinical classification types from XML tags to more readable names + type_map = { + 'GermlineClassification': 'germline', + 'SomaticClinicalImpact': 'somatic', + 'OncogenicityClassification': 'oncogenicity', + 'NoClassification': 'none' + } # Some records have been flagged by ClinVar and should not be used. INVALID_CLINICAL_SIGNIFICANCES = {'no classifications from unflagged records'} @@ -37,8 +44,7 @@ def __init__(self, class_xml, clinvar_record): self.class_xml = class_xml self.clinvar_record = clinvar_record self.xsd_version = clinvar_record.xsd_version - # Type of clinical classification: germline, somatic, or oncogenicity - self.type = class_xml.tag + self.type = self.type_map.get(class_xml.tag, class_xml.tag) @property def last_evaluated_date(self): @@ -69,7 +75,7 @@ def clinical_significance_raw(self): return find_mandatory_unique_element(self.class_xml, './Description').text except AssertionError as e: raise MultipleClinicalClassificationsError(f'Found multiple descriptions for one ClinicalClassification in ' - f'{self.clinvar_record.accession}') + f'{self.clinvar_record.accession}') @property def clinical_significance_list(self): @@ -82,3 +88,19 @@ def clinical_significance_list(self): @property def valid_clinical_significances(self): return [cs for cs in self.clinical_significance_list if cs.lower() not in self.INVALID_CLINICAL_SIGNIFICANCES] + + @property + def 
somatic_assertion_type(self): + try: + return find_mandatory_unique_element(self.class_xml, './Description').attrib.get('ClinicalImpactAssertionType') + except AssertionError as e: + raise MultipleClinicalClassificationsError(f'Found multiple descriptions for one ClinicalClassification in ' + f'{self.clinvar_record.accession}') + + @property + def somatic_clinical_impact(self): + try: + return find_mandatory_unique_element(self.class_xml, './Description').attrib.get('ClinicalImpactClinicalSignificance') + except AssertionError as e: + raise MultipleClinicalClassificationsError(f'Found multiple descriptions for one ClinicalClassification in ' + f'{self.clinvar_record.accession}') diff --git a/cmat/clinvar_xml_io/clinvar_record.py b/cmat/clinvar_xml_io/clinvar_record.py index 9d67fb5b..32488d77 100644 --- a/cmat/clinvar_xml_io/clinvar_record.py +++ b/cmat/clinvar_xml_io/clinvar_record.py @@ -153,3 +153,7 @@ def valid_clinical_significances(self): if len(self.clinical_classifications) > 1: raise MultipleClinicalClassificationsError(f'Found multiple ClinicalClassifications for {self.accession}') return self.clinical_classifications[0].valid_clinical_significances + + @property + def collection_method_types(self): + return [elem.text for elem in find_elements(self.record_xml, './ObservedIn/Method/MethodType')] diff --git a/data-exploration/clinvar-variant-types/README.md b/data-exploration/clinvar-variant-types/README.md index 45a51246..6343941c 100644 --- a/data-exploration/clinvar-variant-types/README.md +++ b/data-exploration/clinvar-variant-types/README.md @@ -25,10 +25,7 @@ In addition, the source code for diagrams and tables will be printed to STDOUT. **RCV** is the top level of ClinVar data organisation. It is a record which associates one or more traits (usually diseases) with exactly one _VCV record,_ which can be one of two types: * **MeasureSet** contains one or more _Measures._ (Each Measure is essentially an individual, isolated variant.) 
The MeasureSet can be one of four types: - **Variant.** This means that the measure “set” has the size of 1 and contains just a single isolated variant. This variant can be one of the subtypes illustrated on the diagram. - - Three other complex types, which were not investigated further in this analysis. They may contain multiple Measures (variants), which must all be interpreted together: - + **Haplotype.** A collection of variants phased on the same chromosome copy and usually inherited together. - + **Phase unknown** - + **Distinct chromosomes** + - Several other complex types, which were not investigated further in this analysis. They may contain multiple Measures (variants), which must all be interpreted together. * **GenotypeSet** represents the cases when the variants which are interpreted together are located on different chromosomal copies (paternal/maternal), that is, when they include _trans_ phasing. The GenotypeSet can be one of two types, which were not investigated further in this analysis: - **CompoundHeterozygote.** Presumably this should include exactly two variants which are _trans_ phased and interpreted together. - **Diplotype.** Similar, but at least one of the _trans_ phased alleles includes a haplotype. An example of this would be three variants located on one copy of the gene, and one variant in the second one, all interpreted together. @@ -51,25 +48,79 @@ The diagram above demonstrates all these relationships. For a trait set with mul * “One name per trait” = _every_ trait in a trait set has at most one name; * “Multiple names per trait” = at least one trait in a trait set has multiple names. +![](diagrams/trait-quality.png) +The traits are described with varying levels of informativeness, which is shown in the above diagram. -## Clinical significance +Some trait names we consider invalid as they do not describe anything meaningful (e.g. "not provided" or "disease"). 
+Others are meaningful but less informative than specific disease terms, e.g. "CLCN4-related disorder" or other +gene-related disorder terms. The diagram uses the following categories: +* "No valid trait name" = no trait in the trait set has any valid names +* "Some gene related disorder" = some (but not all) trait names within the trait set are gene-related disorder terms +* "All gene related disorder" = all valid trait names within the trait set are gene-related disorder terms +* "Regular trait" = all other traits -![](diagrams/clinical-significance.png) +Finally, each trait also has one or multiple cross-references associated with it. +These are identifiers from ontologies like MONDO or databases like MedGen. -Clinical significance can be either “Simple” (only one level present per Variant record) or “Complex” (multiple levels are present, separated by slashes and/or commas). +Of these we are especially interested in those that are "EFO-aligned", i.e. from EFO, MONDO, HP, and Orphanet. +The above diagram shows which records have such cross-references. For records with multiple traits: +* "No EFO-aligned xrefs" = no trait in the trait set has any EFO-aligned cross-references; +* "Has EFO-aligned xrefs" = at least one trait in the trait set has at least one EFO-aligned cross-reference. + +Supplementary table: [**All trait cross-references**](supplementary-tables.md#all-trait-cross-references), which counts +the total number of cross-references from each source across all of ClinVar. + +## Clinical classification + +![](diagrams/clinical-classification.png) + +Clinical classification (formerly known as "clinical significance") can be of three types: +* Germline +* Somatic for clinical impact, such as prognosis or therapeutic response (called just "somatic") +* Somatic for oncogenicity (called just "oncogenicity") + +Records can have one or more clinical classifications, but classifications of the same type are aggregated across submissions.
+ +Each clinical classification description can be either “Simple” (only one level described) or “Complex” (multiple levels are present, separated by slashes and/or commas). + +![](diagrams/somatic-classification.png) + +Somatic classifications for clinical impact can in turn contain multiple assertions, relating to different types +of impacts and each with their own clinical significance level. +These are described in the above diagram, which includes only somatic classifications for simplicity as this granularity +is not present for other types of clinical classifications. + +For more about the terminology and aggregation process used for clinical classifications, see [ClinVar's documentation](https://www.ncbi.nlm.nih.gov/clinvar/docs/clinsig/). Supplementary tables: * [**Complex clinical significance levels**](supplementary-tables.md#complex-clinical-significance-levels). This is simply the part of the distribution which is not shown on the diagram above for readability. -* [**All clinical significance levels.**](supplementary-tables.md#all-clinical-significance-levels) This is the cumulative count for both simple and complex cases. For complex cases, the levels are split and counted individually. Hence, the total in this table will be higher than the total number of Variant records. +* [**All clinical significance levels**](supplementary-tables.md#all-clinical-significance-levels). This is the cumulative count for both simple and complex cases. For complex cases, the levels are split and counted individually. Hence, the total in this table will be higher than the total number of Variant records. ## Star rating and review status +![](diagrams/star-rating.png) + These fields reflect the strength of evidence supporting the assertion of variant/disease association contained in the ClinVar record. -![](diagrams/star-rating.png) +Star rating and review status are aggregated by ClinVar across submissions for each clinical classification. 
+For simplicity, the diagram focuses on the majority of records which have a single clinical classification. +The correspondence between star rating and review status is defined by ClinVar [here](https://www.ncbi.nlm.nih.gov/clinvar/docs/review_status/#review-status-on-aggregate-recor). + +The far right side of the diagram shows collection method types, which describe how the data used to make the +classification was collected. +These are (mostly) described by ClinVar [here](https://www.ncbi.nlm.nih.gov/clinvar/docs/spreadsheet/#collection). + +Collection method type is not aggregated by ClinVar, so the diagram shows a comma-separated list of all method types +associated with a record, deduplicated and sorted alphabetically. +For readability, only groups with at least 1000 records are included, with the rest being categorised under "Other". + +Supplementary tables: +* [**Collection method types**](supplementary-tables.md#collection-method-types). This simply counts all collection method types across the entire dataset. +* [**Distribution of records by collection method type**](supplementary-tables.md#distribution-of-records-by-collection-method-type). + This shows the same information as in the rightmost part of the diagram but without thresholding at 1000, i.e. all groupings within "Other" are listed.
diff --git a/data-exploration/clinvar-variant-types/clinvar-variant-types.py b/data-exploration/clinvar-variant-types/clinvar-variant-types.py index 6d38c96e..a39e429d 100644 --- a/data-exploration/clinvar-variant-types/clinvar-variant-types.py +++ b/data-exploration/clinvar-variant-types/clinvar-variant-types.py @@ -10,6 +10,7 @@ from cmat import clinvar_xml_io from cmat.clinvar_xml_io.clinical_classification import MultipleClinicalClassificationsError +from cmat.clinvar_xml_io.xml_parsing import find_elements logging.basicConfig() logger = logging.getLogger(__name__) @@ -33,27 +34,72 @@ def add_transitions(self, *transition_chain): for t_from, t_to in zip(transition_chain, transition_chain[1:]): self[(t_from, t_to)] += 1 - def generate_diagram(self): + def generate_diagram(self, threshold=None): """Generate and save a Sankey diagram directly to file.""" dpi = 144 + font_size = 6 plt.figure(figsize=(self.width/dpi, self.height/dpi), dpi=dpi) - flows = [(t_from, t_to, t_count) for (t_from, t_to), t_count in sorted(self.items(), key=lambda x: -x[1])] + flows = [(self._format_label(t_from), self._format_label(t_to), t_count) + for (t_from, t_to), t_count in sorted(self.items(), key=lambda x: -x[1])] + # TODO: figure out how to show extremely thin flows better try: logger.info(f'Generating diagram: {self.name}') - s = Sankey(flows=flows, + nodes = Sankey.infer_nodes(flows) + if threshold is not None: + nodes, flows = self._apply_threshold(nodes, flows, threshold=threshold) + s = Sankey(flows=flows, nodes=nodes, node_pad_y_min=0.04, node_pad_y_max=0.08, - node_opts=dict(label_format='{label}: {value}', label_opts=dict(fontsize=8))) + node_opts=dict(label_format='{label}: {value}', label_opts=dict(fontsize=font_size))) except FloatingPointError: # Perturb values to avoid divide-by-zero errors. 
TODO: come up with a better solution to this plt.figure(figsize=(self.width/dpi, (self.height+1)/dpi), dpi=dpi) s = Sankey(flows=flows, node_pad_y_min=0.03, node_pad_y_max=0.08, - node_opts=dict(label_format='{label}: {value}', label_opts=dict(fontsize=8))) + node_opts=dict(label_format='{label}: {value}', label_opts=dict(fontsize=font_size))) s.draw() plt.savefig(self.name, bbox_inches='tight') + def _format_label(self, label): + max_len = 20 + words = label.split() + lines = [] + line = [] + while len(words) > 0: + while len(' '.join(line)) < max_len and len(words) > 0: + line.append(words.pop(0)) + lines.append(' '.join(line)) + line = [] + return '\n'.join(lines) + + def _apply_threshold(self, nodes, flows, threshold): + """Recompute nodes and flows by combining nodes at the final level with value less than threshold.""" + new_node_name = 'Other' + new_nodes = nodes[:-1] + new_final_level = [] + new_value = 0 + # Recompute final level + for node, value in nodes[-1]: + if value < threshold: + new_value += value + else: + new_final_level.append([node, value]) + if new_value > 0: + new_final_level.append([new_node_name, new_value]) + else: + return nodes, flows + new_nodes.append(new_final_level) + # Update flows + all_endpoints = [node for level in new_nodes for node, _ in level] + new_flows = [] + for start, end, count in flows: + if end not in all_endpoints: + new_flows.append((start, new_node_name, count)) + else: + new_flows.append((start, end, count)) + return new_nodes, new_flows + def __str__(self): lines = [f'========== SANKEY DIAGRAM: {self.name} ==========', f'Build using http://sankeymatic.com/build/ with width={self.width}, height={self.height}'] @@ -126,17 +172,22 @@ def main(clinvar_xml, process_items=None): # Sankey diagrams for visualisation sankey_variation_representation = SankeyDiagram('variant-types.png', 1200, 600) sankey_trait_representation = SankeyDiagram('traits.png', 1200, 400) - sankey_clinical_significance = 
SankeyDiagram('clinical-significance.png', 1200, 800) - sankey_star_rating = SankeyDiagram('star-rating.png', 1200, 600) + sankey_trait_quality = SankeyDiagram('trait-quality.png', 1500, 400) + sankey_clinical_classification = SankeyDiagram('clinical-classification.png', 1400, 800) + sankey_somatic_classification = SankeyDiagram('somatic-classification.png', 1200, 400) + sankey_star_rating = SankeyDiagram('star-rating.png', 1400, 800) sankey_mode_of_inheritance = SankeyDiagram('mode-of-inheritance.png', 1200, 500) sankey_allele_origin = SankeyDiagram('allele-origin.png', 1200, 600) sankey_inheritance_origin = SankeyDiagram('inheritance-origin.png', 1200, 400) # Supplementary tables and counters for the report - counter_clin_sig_complex = SupplementaryTableCounter('Complex clinical significance levels', 'Clinical significance') - counter_clin_sig_all = SupplementaryTableCounter('All clinical significance levels', 'Clinical significance') + counter_trait_xrefs = SupplementaryTableCounter('All trait cross-references', 'Source') + counter_clin_class_complex = SupplementaryTableCounter('Complex clinical classification levels', 'Clinical classification') + counter_clin_class_all = SupplementaryTableCounter('All clinical classification levels', 'Clinical classification') counter_star_rating = SupplementaryTableCounter( 'Distribution of records by star rating', 'Star rating', sort_lambda=lambda x: x[0]) + counter_coll_method_type = SupplementaryTableCounter('Collection method types', 'Collection method type') + counter_full_coll_method_type = SupplementaryTableCounter('Distribution of records by collection method type', 'Collection method type') table_multiple_mode_of_inheritance = SupplementaryTable( 'Multiple mode of inheritance', ['RCV', 'Modes of inheritance'], sort_lambda=lambda x: (x[1], x[0])) counter_multiple_allele_origin = SupplementaryTableCounter('Multiple allele origins', 'Allele origins') @@ -157,105 +208,160 @@ def main(clinvar_xml, process_items=None): 
measure_set_type = measure_set.attrib['Type'] sankey_variation_representation.add_transitions('RCV', 'MeasureSet', measure_set_type) - if measure_set_type == 'Variant': - # Most common case, accounting for >99.97% of all ClinVar records. Here, we go into details on various - # attribute distributions. - - # Variation representation - measures = measure_set.findall('Measure') - assert len(measures) == 1, 'MeasureSet of type Variant must contain exactly one Measure' - sankey_variation_representation.add_transitions(measure_set_type, measures[0].attrib['Type']) - - # Trait representation - traits = clinvar_record.traits - if len(traits) == 0: - raise AssertionError('There must always be at least one trait') - elif len(traits) == 1: - traits_category = 'One trait' - else: - traits_category = 'Multiple traits' - names_category = 'One name per trait' - for trait in traits: - if len(trait.all_names) > 1: - names_category = 'Multiple names per trait' - sankey_trait_representation.add_transitions('Variant', clinvar_record.trait_set_type, traits_category) - if traits_category != 'No traits': - sankey_trait_representation.add_transitions(traits_category, names_category) - - # Clinical significance + # Only go into details for single variants, the most common type + if measure_set_type != 'Variant': + continue + + # Variation representation + measures = measure_set.findall('Measure') + assert len(measures) == 1, 'MeasureSet of type Variant must contain exactly one Measure' + sankey_variation_representation.add_transitions(measure_set_type, measures[0].attrib['Type']) + + # Trait representation + traits = clinvar_record.traits + if len(traits) == 0: + raise AssertionError('There must always be at least one trait') + elif len(traits) == 1: + traits_category = 'One trait' + else: + traits_category = 'Multiple traits' + names_category = 'One name per trait' + ontology_category = 'No xrefs' + trait_quality_category = 'No valid trait name' if not 
clinvar_record.traits_with_valid_names else 'Regular trait' + for trait in traits: + if len(trait.all_names) > 1: + names_category = 'Multiple names per trait' + if any('related disorder' in name for name in trait.all_valid_names): + trait_quality_category = 'Some gene related disorder' + if trait.xrefs and ontology_category == 'No xrefs': + ontology_category = 'No EFO-aligned xrefs' + if len(trait.current_efo_aligned_xrefs) > 0: + ontology_category = 'Has EFO-aligned xrefs' + # Count all xref sources for each trait + for db, _, _ in trait.xrefs: + counter_trait_xrefs.add_count(db) + if clinvar_record.traits_with_valid_names and all( + 'related disorder' in name + for trait in clinvar_record.traits_with_valid_names + for name in trait.all_valid_names + ): + trait_quality_category = 'All gene related disorder' + + sankey_trait_representation.add_transitions('Variant', clinvar_record.trait_set_type, traits_category, names_category) + sankey_trait_quality.add_transitions('Variant', clinvar_record.trait_set_type, traits_category, names_category, trait_quality_category, ontology_category) + + # Clinical classification + class_cardinality = 'Single classification' + if len(clinvar_record.clinical_classifications) > 1: + class_cardinality = 'Multiple classifications' + # Somatic, germline, oncogenic, or combinations thereof + clin_class_types = list(sorted(clin_class.type for clin_class in clinvar_record.clinical_classifications)) + clin_class_string = ', '.join(clin_class_types) + for clin_class in clinvar_record.clinical_classifications: try: - clin_sig_split = clinvar_record.clinical_significance_list - clinical_significance = clinvar_record.clinical_classifications[0].clinical_significance_raw - for clin_sig in clin_sig_split: # Count all clinical significance levels after splitting - counter_clin_sig_all.add_count(clin_sig) - if len(clin_sig_split) == 1: - sankey_clinical_significance.add_transitions('Variant', 'Single classification', 'Simple', 
clinical_significance) - else: - sankey_clinical_significance.add_transitions('Variant', 'Single classification', 'Complex') - counter_clin_sig_complex.add_count(clinical_significance) - except MultipleClinicalClassificationsError: - sankey_clinical_significance.add_transitions('Variant', 'Multiple classification') - - # Review status - try: - review_status = clinvar_record.review_status - star_rating = review_status_stars(clinvar_record.score) - sankey_star_rating.add_transitions('Variant', 'Single classification', star_rating, review_status) - counter_star_rating.add_count(star_rating) - except MultipleClinicalClassificationsError: - sankey_star_rating.add_transitions('Variant', 'Multiple classification') - - # Mode of inheritance - modes_of_inheritance = clinvar_record.mode_of_inheritance - modes_of_inheritance_text = ', '.join(sorted(modes_of_inheritance)) - if len(modes_of_inheritance) == 0: - mode_of_inheritance_category = 'Missing' - elif 'Somatic mutation' in modes_of_inheritance: - if len(modes_of_inheritance) > 1: - mode_of_inheritance_category = 'Germline & somatic' - else: - mode_of_inheritance_category = 'Somatic' - else: - mode_of_inheritance_category = 'Germline' - sankey_mode_of_inheritance.add_transitions('Variant', mode_of_inheritance_category) - if mode_of_inheritance_category == 'Germline': - if len(modes_of_inheritance) == 1: - sankey_mode_of_inheritance.add_transitions('Germline', 'Single', modes_of_inheritance_text) + clin_class_split = clin_class.clinical_significance_list + clin_class_raw = clin_class.clinical_significance_raw + # Count all individual clinical classification terms + for clin_class_term in clin_class_split: + counter_clin_class_all.add_count(clin_class_term) + # Simple terms included in the diagram + if len(clin_class_split) == 1: + sankey_clinical_classification.add_transitions( + 'Variant', class_cardinality, clin_class_string, 'Simple', + clin_class_raw) + # Compound terms included in supplementary tables only else: - 
sankey_mode_of_inheritance.add_transitions('Germline', 'Multiple') - # Log multiple ModeOfInheritance cases in a separate table + sankey_clinical_classification.add_transitions( + 'Variant', class_cardinality, clin_class_string, 'Complex') + counter_clin_class_complex.add_count(clin_class_raw) + except MultipleClinicalClassificationsError as e: + # Multiple descriptions within a single clinical classification - only occurs for somatic + # classifications, which are handled in the next diagram. + continue + + # Focus on somatic classifications + if 'somatic' in clin_class_types: + for clin_class in clinvar_record.clinical_classifications: + if clin_class.type == 'somatic': + try: + assertion_type = clin_class.somatic_assertion_type or 'no assertion type' + clinical_impact = clin_class.somatic_clinical_impact or 'no clinical impact' + sankey_somatic_classification.add_transitions( + 'Somatic [CC]', 'Single assertion', assertion_type, + clinical_impact, clin_class.clinical_significance_raw) + except MultipleClinicalClassificationsError as e: + # Not supported by the main parsers yet + for elem in find_elements(clin_class.class_xml, './Description'): + sankey_somatic_classification.add_transitions( + 'Somatic [CC]', 'Multiple assertions', elem.attrib.get('ClinicalImpactAssertionType', 'no assertion type'), + elem.attrib.get('ClinicalImpactClinicalSignificance', 'no clinical impact'), elem.text + ) + + # Review status, star rating, and collection method type + try: + review_status = clinvar_record.review_status + star_rating = review_status_stars(clinvar_record.score) + collection_method_type = ', '.join(sorted(set(clinvar_record.collection_method_types))) or 'missing' + sankey_star_rating.add_transitions('Variant', 'Single classification', star_rating, review_status, + collection_method_type) + counter_star_rating.add_count(star_rating) + for coll_method_type in clinvar_record.collection_method_types: + counter_coll_method_type.add_count(coll_method_type) + 
counter_full_coll_method_type.add_count(collection_method_type) + except MultipleClinicalClassificationsError: + sankey_star_rating.add_transitions('Variant', 'Multiple classifications') + + # Mode of inheritance + modes_of_inheritance = clinvar_record.mode_of_inheritance + modes_of_inheritance_text = ', '.join(sorted(modes_of_inheritance)) + if len(modes_of_inheritance) == 0: + mode_of_inheritance_category = 'Missing' + elif 'Somatic mutation' in modes_of_inheritance: if len(modes_of_inheritance) > 1: - table_multiple_mode_of_inheritance.add_row([rcv_to_link(rcv_id), modes_of_inheritance_text]) - - # Allele origins - allele_origins = clinvar_record.allele_origins - allele_origin_text = ', '.join(sorted(allele_origins)) - if len(allele_origins) == 0: - allele_origin_category = 'Missing' - elif 'somatic' in allele_origins: - if len(allele_origins) > 1: - allele_origin_category = 'Germline & somatic' - else: - allele_origin_category = 'Somatic' + mode_of_inheritance_category = 'Germline & somatic' else: - allele_origin_category = 'Germline' - sankey_allele_origin.add_transitions('Variant', allele_origin_category) - if allele_origin_category == 'Germline': - if len(allele_origins) == 1: - sankey_allele_origin.add_transitions(allele_origin_category, 'Single', allele_origin_text) - else: - sankey_allele_origin.add_transitions(allele_origin_category, 'Multiple') - # Log multiple allele of origin values in a separate table + mode_of_inheritance_category = 'Somatic' + else: + mode_of_inheritance_category = 'Germline' + sankey_mode_of_inheritance.add_transitions('Variant', mode_of_inheritance_category) + if mode_of_inheritance_category == 'Germline': + if len(modes_of_inheritance) == 1: + sankey_mode_of_inheritance.add_transitions('Germline', 'Single', modes_of_inheritance_text) + else: + sankey_mode_of_inheritance.add_transitions('Germline', 'Multiple') + # Log multiple ModeOfInheritance cases in a separate table + if len(modes_of_inheritance) > 1: + 
table_multiple_mode_of_inheritance.add_row([rcv_to_link(rcv_id), modes_of_inheritance_text]) + + # Allele origins + allele_origins = clinvar_record.allele_origins + allele_origin_text = ', '.join(sorted(allele_origins)) + if len(allele_origins) == 0: + allele_origin_category = 'Missing' + elif 'somatic' in allele_origins: if len(allele_origins) > 1: - counter_multiple_allele_origin.add_count(allele_origin_text) - - # Mode of inheritance and allele origin mapping - if mode_of_inheritance_category != 'Missing' and allele_origin_category != 'Missing': - sankey_inheritance_origin.add_transitions( - f'[MoI] {mode_of_inheritance_category}', f'{allele_origin_category} [AO]') - if mode_of_inheritance_category != allele_origin_category: - table_inconsistent_moi_ao.add_row([rcv_to_link(rcv_id), modes_of_inheritance_text, - allele_origin_text]) + allele_origin_category = 'Germline & somatic' + else: + allele_origin_category = 'Somatic' + else: + allele_origin_category = 'Germline' + sankey_allele_origin.add_transitions('Variant', allele_origin_category) + if allele_origin_category == 'Germline': + if len(allele_origins) == 1: + sankey_allele_origin.add_transitions(allele_origin_category, 'Single', allele_origin_text) + else: + sankey_allele_origin.add_transitions(allele_origin_category, 'Multiple') + # Log multiple allele of origin values in a separate table + if len(allele_origins) > 1: + counter_multiple_allele_origin.add_count(allele_origin_text) + + # Mode of inheritance and allele origin mapping + if mode_of_inheritance_category != 'Missing' and allele_origin_category != 'Missing': + sankey_inheritance_origin.add_transitions( + f'[MoI] {mode_of_inheritance_category}', f'{allele_origin_category} [AO]') + if mode_of_inheritance_category != allele_origin_category: + table_inconsistent_moi_ao.add_row([rcv_to_link(rcv_id), modes_of_inheritance_text, + allele_origin_text]) elif len(measure_sets) == 0 and len(genotype_sets) == 1: # RCV directly contains one genotype set. 
@@ -275,14 +381,28 @@ def main(clinvar_xml, process_items=None): # Output the code for Sankey diagrams. Transitions are sorted in decreasing number of counts, so that the most frequent # cases are on top. - for sankey_diagram in (sankey_variation_representation, sankey_trait_representation, sankey_clinical_significance, - sankey_star_rating, sankey_mode_of_inheritance, sankey_allele_origin, sankey_inheritance_origin): + for sankey_diagram in (sankey_variation_representation, sankey_trait_representation, sankey_trait_quality, + sankey_clinical_classification, sankey_somatic_classification, + sankey_mode_of_inheritance, sankey_allele_origin, sankey_inheritance_origin): print('\n') print(sankey_diagram) - sankey_diagram.generate_diagram() + try: + sankey_diagram.generate_diagram() + except Exception as e: + print(e) + continue + + # Sankey diagrams requiring thresholding + print('\n') + print(sankey_star_rating) + try: + sankey_star_rating.generate_diagram(threshold=1000) + except Exception as e: + print(e) # Output the supplementary tables for the report. 
- for supplementary_table in (counter_clin_sig_complex, counter_clin_sig_all, counter_star_rating, + for supplementary_table in (counter_trait_xrefs, counter_clin_class_complex, counter_clin_class_all, + counter_star_rating, counter_coll_method_type, counter_full_coll_method_type, table_multiple_mode_of_inheritance, counter_multiple_allele_origin, table_inconsistent_moi_ao): print('\n') diff --git a/data-exploration/clinvar-variant-types/diagrams/allele-origin.png b/data-exploration/clinvar-variant-types/diagrams/allele-origin.png index 14d37110..9a0bc72b 100644 Binary files a/data-exploration/clinvar-variant-types/diagrams/allele-origin.png and b/data-exploration/clinvar-variant-types/diagrams/allele-origin.png differ diff --git a/data-exploration/clinvar-variant-types/diagrams/clinical-classification.png b/data-exploration/clinvar-variant-types/diagrams/clinical-classification.png new file mode 100644 index 00000000..c093b21f Binary files /dev/null and b/data-exploration/clinvar-variant-types/diagrams/clinical-classification.png differ diff --git a/data-exploration/clinvar-variant-types/diagrams/clinical-significance.png b/data-exploration/clinvar-variant-types/diagrams/clinical-significance.png deleted file mode 100644 index 586eb557..00000000 Binary files a/data-exploration/clinvar-variant-types/diagrams/clinical-significance.png and /dev/null differ diff --git a/data-exploration/clinvar-variant-types/diagrams/inheritance-origin.png b/data-exploration/clinvar-variant-types/diagrams/inheritance-origin.png index 63239d6e..feabf97b 100644 Binary files a/data-exploration/clinvar-variant-types/diagrams/inheritance-origin.png and b/data-exploration/clinvar-variant-types/diagrams/inheritance-origin.png differ diff --git a/data-exploration/clinvar-variant-types/diagrams/mode-of-inheritance.png b/data-exploration/clinvar-variant-types/diagrams/mode-of-inheritance.png index e146f3f2..919d7f05 100644 Binary files 
a/data-exploration/clinvar-variant-types/diagrams/mode-of-inheritance.png and b/data-exploration/clinvar-variant-types/diagrams/mode-of-inheritance.png differ diff --git a/data-exploration/clinvar-variant-types/diagrams/somatic-classification.png b/data-exploration/clinvar-variant-types/diagrams/somatic-classification.png new file mode 100644 index 00000000..29424285 Binary files /dev/null and b/data-exploration/clinvar-variant-types/diagrams/somatic-classification.png differ diff --git a/data-exploration/clinvar-variant-types/diagrams/star-rating.png b/data-exploration/clinvar-variant-types/diagrams/star-rating.png index 7465f3fe..53584ad8 100644 Binary files a/data-exploration/clinvar-variant-types/diagrams/star-rating.png and b/data-exploration/clinvar-variant-types/diagrams/star-rating.png differ diff --git a/data-exploration/clinvar-variant-types/diagrams/trait-quality.png b/data-exploration/clinvar-variant-types/diagrams/trait-quality.png new file mode 100644 index 00000000..0a87df3f Binary files /dev/null and b/data-exploration/clinvar-variant-types/diagrams/trait-quality.png differ diff --git a/data-exploration/clinvar-variant-types/diagrams/traits.png b/data-exploration/clinvar-variant-types/diagrams/traits.png index aef99418..a233ae4a 100644 Binary files a/data-exploration/clinvar-variant-types/diagrams/traits.png and b/data-exploration/clinvar-variant-types/diagrams/traits.png differ diff --git a/data-exploration/clinvar-variant-types/diagrams/variant-types.png b/data-exploration/clinvar-variant-types/diagrams/variant-types.png index bdb0c204..ecc81c2d 100644 Binary files a/data-exploration/clinvar-variant-types/diagrams/variant-types.png and b/data-exploration/clinvar-variant-types/diagrams/variant-types.png differ diff --git a/data-exploration/clinvar-variant-types/supplementary-tables.md b/data-exploration/clinvar-variant-types/supplementary-tables.md index cc9b1680..ad4487fa 100644 --- a/data-exploration/clinvar-variant-types/supplementary-tables.md 
+++ b/data-exploration/clinvar-variant-types/supplementary-tables.md @@ -1,3 +1,16 @@ +## All trait cross-references +Source|Count +:--|:-- +MedGen|5199733 +MONDO|2395103 +OMIM|1988569 +Orphanet|1676005 +MeSH|636549 +Human Phenotype Ontology|513248 +EFO|7556 +Gene|4432 +EFO: The Experimental Factor Ontology|448 + ## Complex clinical significance levels Clinical significance|Count :--|:-- @@ -88,6 +101,180 @@ Star rating|Count ★☆☆☆|4245851 ☆☆☆☆|319376 +## Collection method types +Collection method type|Count +:--|:-- +clinical testing|5082906 +not provided|129440 +research|72668 +literature only|65779 +curation|54414 +reference population|19804 +phenotyping only|5115 +case-control|3332 +provider interpretation|2179 +in vitro|1058 +in vivo|388 + +## Distribution of records by collection method type +collection method type|Count +:--|:-- +clinical testing|4609337 +clinical testing, not provided|107333 +research|51544 +literature only|42612 +clinical testing, curation|25303 +curation|23901 +not provided|20936 +clinical testing, literature only|12609 +clinical testing, research|11728 +missing|6720 +phenotyping only|2846 +case-control|2755 +reference population|2380 +clinical testing, literature only, research|1727 +clinical testing, phenotyping only|1310 +provider interpretation|1274 +literature only, research|1268 +clinical testing, reference population|1147 +clinical testing, curation, literature only|1131 +curation, literature only|990 +clinical testing, curation, research|621 +clinical testing, provider interpretation|610 +clinical testing, curation, literature only, research|534 +clinical testing, literature only, not provided|514 +clinical testing, in vitro|280 +curation, research|279 +in vitro, research|269 +in vivo, research|233 +case-control, clinical testing|202 +clinical testing, not provided, phenotyping only|185 +clinical testing, in vitro, research|159 +clinical testing, not provided, research|123 +curation, literature only, research|101 +literature only, 
not provided|96 +in vitro|95 +clinical testing, phenotyping only, research|94 +clinical testing, in vivo|88 +not provided, phenotyping only|74 +clinical testing, literature only, phenotyping only|73 +case-control, clinical testing, curation|71 +clinical testing, literature only, provider interpretation|70 +clinical testing, curation, not provided|62 +clinical testing, provider interpretation, research|51 +clinical testing, in vitro, literature only|44 +clinical testing, literature only, phenotyping only, research|43 +clinical testing, reference population, research|41 +literature only, provider interpretation|38 +clinical testing, curation, literature only, phenotyping only, research|34 +clinical testing, literature only, reference population|32 +curation, not provided|31 +case-control, literature only|29 +clinical testing, curation, phenotyping only|26 +curation, in vitro|25 +clinical testing, literature only, provider interpretation, research|24 +case-control, clinical testing, research|24 +not provided, research|23 +case-control, clinical testing, literature only|23 +clinical testing, curation, literature only, phenotyping only|21 +in vivo|21 +clinical testing, curation, reference population|20 +provider interpretation, research|19 +phenotyping only, research|19 +case-control, research|18 +literature only, reference population|17 +case-control, clinical testing, literature only, research|15 +literature only, phenotyping only|13 +clinical testing, in vitro, literature only, research|13 +in vitro, literature only|12 +case-control, in vitro|11 +case-control, clinical testing, curation, research|11 +clinical testing, curation, literature only, not provided, research|11 +in vitro, in vivo, research|10 +in vitro, literature only, research|10 +case-control, clinical testing, curation, literature only, research|10 +clinical testing, curation, phenotyping only, research|10 +clinical testing, in vivo, research|10 +clinical testing, in vitro, in vivo|10 +clinical testing, 
curation, in vitro|9 +clinical testing, literature only, not provided, research|9 +clinical testing, literature only, reference population, research|9 +reference population, research|8 +curation, provider interpretation, research|8 +clinical testing, curation, provider interpretation|7 +curation, literature only, provider interpretation|6 +clinical testing, curation, reference population, research|6 +clinical testing, curation, literature only, not provided|6 +clinical testing, curation, in vitro, research|6 +case-control, curation|5 +curation, phenotyping only|5 +clinical testing, curation, literature only, provider interpretation|5 +curation, reference population|5 +case-control, clinical testing, in vitro|5 +clinical testing, curation, literature only, provider interpretation, research|4 +clinical testing, phenotyping only, provider interpretation|4 +clinical testing, not provided, provider interpretation|4 +clinical testing, curation, literature only, reference population|4 +case-control, clinical testing, curation, literature only|4 +case-control, clinical testing, in vitro, literature only|4 +clinical testing, in vitro, not provided|4 +curation, provider interpretation|4 +case-control, in vivo|3 +literature only, not provided, research|3 +clinical testing, curation, in vitro, literature only|3 +clinical testing, in vitro, provider interpretation|3 +clinical testing, literature only, not provided, phenotyping only|3 +in vitro, in vivo|2 +clinical testing, curation, literature only, reference population, research|2 +clinical testing, curation, provider interpretation, research|2 +case-control, literature only, research|2 +case-control, clinical testing, literature only, phenotyping only, research|2 +clinical testing, curation, literature only, phenotyping only, provider interpretation, research|2 +clinical testing, in vitro, phenotyping only, research|2 +clinical testing, not provided, provider interpretation, research|2 +clinical testing, in vivo, literature 
only|2 +case-control, clinical testing, curation, literature only, phenotyping only|2 +clinical testing, literature only, not provided, provider interpretation, research|2 +clinical testing, curation, in vitro, literature only, research|2 +curation, literature only, not provided|1 +in vivo, literature only, research|1 +curation, not provided, research|1 +phenotyping only, reference population, research|1 +case-control, provider interpretation|1 +clinical testing, curation, in vivo|1 +clinical testing, literature only, phenotyping only, provider interpretation, research|1 +phenotyping only, provider interpretation|1 +case-control, clinical testing, reference population|1 +literature only, provider interpretation, research|1 +case-control, clinical testing, literature only, not provided, research|1 +clinical testing, curation, literature only, not provided, phenotyping only, research|1 +clinical testing, curation, phenotyping only, provider interpretation|1 +case-control, clinical testing, not provided|1 +clinical testing, in vivo, literature only, research|1 +case-control, clinical testing, phenotyping only, research|1 +clinical testing, curation, in vitro, phenotyping only, research|1 +not provided, provider interpretation|1 +case-control, clinical testing, literature only, provider interpretation, research|1 +clinical testing, curation, not provided, provider interpretation|1 +clinical testing, literature only, not provided, provider interpretation|1 +in vitro, provider interpretation|1 +clinical testing, not provided, phenotyping only, research|1 +case-control, in vitro, in vivo, research|1 +in vitro, literature only, phenotyping only|1 +in vitro, in vivo, literature only|1 +in vivo, not provided, research|1 +case-control, reference population|1 +curation, in vitro, research|1 +case-control, clinical testing, curation, literature only, phenotyping only, provider interpretation, research|1 +clinical testing, not provided, phenotyping only, provider 
interpretation|1 +literature only, phenotyping only, research|1 +clinical testing, literature only, phenotyping only, provider interpretation|1 +case-control, not provided|1 +clinical testing, phenotyping only, provider interpretation, research|1 +curation, reference population, research|1 +in vitro, reference population|1 + + ## Multiple mode of inheritance RCV|Modes of inheritance :--|:-- diff --git a/tests/clinvar_xml_io/test_clinvar_record.py b/tests/clinvar_xml_io/test_clinvar_record.py index 282dfda7..dd36e6bc 100644 --- a/tests/clinvar_xml_io/test_clinvar_record.py +++ b/tests/clinvar_xml_io/test_clinvar_record.py @@ -29,7 +29,7 @@ def test_multiple_clinical_classifications_record(): record = next(iter(ClinVarDataset(input_file))) assert len(record.clinical_classifications) == 2 - assert set(cc.type for cc in record.clinical_classifications) == {'GermlineClassification', 'SomaticClinicalImpact'} + assert set(cc.type for cc in record.clinical_classifications) == {'germline', 'somatic'} with pytest.raises(MultipleClinicalClassificationsError): print(record.valid_clinical_significances) @@ -76,3 +76,6 @@ def test_valid_allele_origins(self): def test_trait_efo_ids(self): assert self.test_clinvar_record.traits[0].current_efo_aligned_xrefs == [('MONDO', 'MONDO:0012990', 'current')] + + def test_observation_method_types(self): + assert self.test_clinvar_record.collection_method_types == ['literature only', 'clinical testing', 'clinical testing', 'research'] diff --git a/tests/clinvar_xml_io/test_clinvar_submitted_record.py b/tests/clinvar_xml_io/test_clinvar_submitted_record.py index 7a7be2b5..f2711931 100644 --- a/tests/clinvar_xml_io/test_clinvar_submitted_record.py +++ b/tests/clinvar_xml_io/test_clinvar_submitted_record.py @@ -33,6 +33,7 @@ def test_clinvar_submitted_record(submitted_record): assert submitted_record.accession == 'SCV000022285' assert submitted_record.valid_allele_origins == {'germline'} assert submitted_record.evidence_support_pubmed_refs == 
[15258582, 15322982] + assert submitted_record.collection_method_types == ['literature only'] assert submitted_record.created_date == '2013-04-04' # submission first publicly available assert submitted_record.submission_date == '2015-07-02' # submission last revised