From 3d7f1a0f1d847a4d21782b69a15faa63a930aa52 Mon Sep 17 00:00:00 2001 From: April Shen Date: Mon, 1 Dec 2025 11:05:29 +0000 Subject: [PATCH 1/2] script to clean up mappings file --- bin/trait_mapping/clean_up_mappings.py | 69 ++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 bin/trait_mapping/clean_up_mappings.py diff --git a/bin/trait_mapping/clean_up_mappings.py b/bin/trait_mapping/clean_up_mappings.py new file mode 100644 index 00000000..74868b6f --- /dev/null +++ b/bin/trait_mapping/clean_up_mappings.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python3 +import argparse +import csv +import os.path +from collections import defaultdict + +from cmat.trait_mapping.ols import is_current_and_in_ontology + +from cmat.output_generation.clinvar_to_evidence_strings import load_ontology_mapping + + +def flatten_dict(d): + # key -> [(x1, y1), (x2, y2)] ==> [(key, x1, y1), (key, x2, y2)] + result = [] + for key in d: + for (x, y) in d[key]: + result.append((key, x, y)) + return result + + +def main(mappings_file): + mappings, target_ontology = load_ontology_mapping(mappings_file) + + obsolete_mappings = defaultdict(list) + current_mappings = defaultdict(list) + multiple_mappings = {} + single_current_mappings = {} + + # Separate obsolete mappings + for trait_name in mappings: + uri, label = mappings[trait_name] + if not is_current_and_in_ontology(uri, target_ontology): + obsolete_mappings[trait_name].append((uri, label)) + continue + current_mappings[trait_name].append((uri, label)) + + # Separate multiple mappings + for trait_name, mapping_list in current_mappings.items(): + if len(mapping_list) > 1: + multiple_mappings[trait_name] = mapping_list + else: + single_current_mappings[trait_name] = mapping_list + + # Output files + basename = os.path.basename(mappings_file) + with (open(f'{basename}_obsolete.csv'), 'w') as outfile: + writer = csv.writer(outfile, delimiter='\t') + obsolete_rows = flatten_dict(obsolete_mappings) + print(f'Removed {len(obsolete_rows)} obsolete mappings') + writer.writerows(obsolete_rows) + + with (open(f'{basename}_multiple.csv'), 'w') as outfile: + writer = csv.writer(outfile, delimiter='\t') + multiple_rows = flatten_dict(multiple_mappings) + print(f'Removed {len(multiple_rows)} multiple mappings') + writer.writerows(multiple_rows) + + with (open(f'{basename}_current.csv'), 'w') as outfile: + writer = csv.writer(outfile, delimiter='\t') + current_rows = flatten_dict(single_current_mappings) + print(f'{len(current_rows)} mappings remaining') + writer.writerows(current_rows) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--mappings-file', required=True, help='File of latest ontology mappings to process') + args = parser.parse_args() + main(args.mappings_file) From 7c6b34ed19f133269b082a50d96faa94a5632a8d Mon Sep 17 00:00:00 2001 From: April Shen Date: Mon, 1 Dec 2025 13:06:55 +0000 Subject: [PATCH 2/2] fix script --- bin/trait_mapping/clean_up_mappings.py | 41 +++++++++++++------------- 1 file changed, 20 insertions(+), 21 deletions(-) mode change 100644 => 100755 bin/trait_mapping/clean_up_mappings.py diff --git a/bin/trait_mapping/clean_up_mappings.py b/bin/trait_mapping/clean_up_mappings.py old mode 100644 new mode 100755 index 74868b6f..c6f2f1a1 --- a/bin/trait_mapping/clean_up_mappings.py +++ b/bin/trait_mapping/clean_up_mappings.py @@ -1,4 +1,5 @@ #!/usr/bin/env python3 + import argparse import csv import os.path @@ -22,48 +23,46 @@ def main(mappings_file): mappings, target_ontology = load_ontology_mapping(mappings_file) obsolete_mappings = defaultdict(list) - current_mappings = defaultdict(list) multiple_mappings = {} - single_current_mappings = {} - - # Separate obsolete mappings - for trait_name in mappings: - uri, label = mappings[trait_name] - if not is_current_and_in_ontology(uri, target_ontology): - obsolete_mappings[trait_name].append((uri, label)) - continue - current_mappings[trait_name].append((uri, label)) + current_mappings = {} - # Separate multiple mappings - for trait_name, mapping_list in current_mappings.items(): - if len(mapping_list) > 1: - multiple_mappings[trait_name] = mapping_list + for trait_name, mapping_list in mappings.items(): + keep_mappings = [] + # Remove obsolete mappings + for uri, label in mapping_list: + if not is_current_and_in_ontology(uri, target_ontology): + obsolete_mappings[trait_name].append((uri, label)) + else: + keep_mappings.append((uri, label)) + # Remove multiple mappings + if len(keep_mappings) > 1: + multiple_mappings[trait_name] = keep_mappings else: - single_current_mappings[trait_name] = mapping_list + current_mappings[trait_name] = keep_mappings # Output files - basename = os.path.basename(mappings_file) - with (open(f'{basename}_obsolete.csv'), 'w') as outfile: + filename = '.'.join(os.path.basename(mappings_file).split('.')[:-1]) + with open(f'{filename}_obsolete.csv', 'w+') as outfile: writer = csv.writer(outfile, delimiter='\t') obsolete_rows = flatten_dict(obsolete_mappings) print(f'Removed {len(obsolete_rows)} obsolete mappings') writer.writerows(obsolete_rows) - with (open(f'{basename}_multiple.csv'), 'w') as outfile: + with open(f'{filename}_multiple.csv', 'w+') as outfile: writer = csv.writer(outfile, delimiter='\t') multiple_rows = flatten_dict(multiple_mappings) print(f'Removed {len(multiple_rows)} multiple mappings') writer.writerows(multiple_rows) - with (open(f'{basename}_current.csv'), 'w') as outfile: + with open(f'{filename}_current.csv', 'w+') as outfile: writer = csv.writer(outfile, delimiter='\t') - current_rows = flatten_dict(single_current_mappings) + current_rows = flatten_dict(current_mappings) print(f'{len(current_rows)} mappings remaining') writer.writerows(current_rows) if __name__ == '__main__': - parser = argparse.ArgumentParser() + parser = argparse.ArgumentParser('Clean up obsolete and multiple mappings') parser.add_argument('--mappings-file', required=True, help='File of latest ontology mappings to process') args = parser.parse_args() main(args.mappings_file)