From e4843f3c14e700ae12bc3f330585d15ef0cd0c79 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 15 Dec 2025 14:26:15 -0500 Subject: [PATCH 01/11] First stab at adding clique leaders. --- node_normalizer/model/input.py | 5 +++++ node_normalizer/normalizer.py | 30 +++++++++++++++++++++++++----- node_normalizer/server.py | 4 +++- node_normalizer/set_id.py | 2 +- 4 files changed, 34 insertions(+), 7 deletions(-) diff --git a/node_normalizer/model/input.py b/node_normalizer/model/input.py index ea7820e..b6bf757 100644 --- a/node_normalizer/model/input.py +++ b/node_normalizer/model/input.py @@ -41,6 +41,11 @@ class CurieList(BaseModel): title="Whether to return taxa for equivalent identifiers" ) + include_clique_leaders: bool = Field( + default=False, + title="Whether to return clique leaders for conflated identifiers" + ) + class Config: schema_extra = { "example": { diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index 32d9126..713c877 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -532,6 +532,7 @@ async def get_normalized_nodes( include_descriptions: bool = False, include_individual_types: bool = True, include_taxa: bool = True, + include_clique_leaders: bool = False, ) -> Dict[str, Optional[str]]: """ Get value(s) for key(s) using redis MGET @@ -555,6 +556,7 @@ async def get_normalized_nodes( canonical_ids = await app.state.eq_id_to_id_db.mget(*upper_curies, encoding='utf-8') canonical_nonan = [canonical_id for canonical_id in canonical_ids if canonical_id is not None] info_contents = {} + clique_leaders = {} # did we get some canonical ids if canonical_nonan: @@ -569,14 +571,18 @@ async def get_normalized_nodes( other_ids = [] if conflate_gene_protein: - other_ids.extend(await app.state.gene_protein_db.mget(*canonical_nonan, encoding='utf8')) + gene_protein_clique_leaders = await app.state.gene_protein_db.mget(*canonical_nonan, encoding='utf8') + other_ids.extend(gene_protein_clique_leaders) + clique_leaders.update(zip(canonical_nonan, gene_protein_clique_leaders)) # logger.error(f"After conflate_gene_protein: {other_ids}") if conflate_chemical_drug: - other_ids.extend(await app.state.chemical_drug_db.mget(*canonical_nonan, encoding='utf8')) + drug_chemical_clique_leaders = await app.state.chemical_drug_db.mget(*canonical_nonan, encoding='utf8') + other_ids.extend(drug_chemical_clique_leaders) + clique_leaders.update(zip(canonical_nonan, drug_chemical_clique_leaders)) - # logger.error(f"After conflate_chemical_drug: {other_ids}") + # logger.error(f"After conflate_chemical_drug: {other_ids}") # if there are other ids, then we want to rebuild eqids and types. That's because even though we have them, # they're not necessarily first. For instance if what came in and got canonicalized was a protein id @@ -635,9 +641,13 @@ async def get_normalized_nodes( dereference_ids = dict() dereference_types = dict() + # Don't write out clique leaders unless its requested. + if not include_clique_leaders: + clique_leaders = None + # output the final result normal_nodes = { - input_curie: await create_node(app, canonical_id, dereference_ids, dereference_types, info_contents, + input_curie: await create_node(app, canonical_id, dereference_ids, dereference_types, info_contents, clique_leaders, include_descriptions=include_descriptions, include_individual_types=include_individual_types, include_taxa=include_taxa, @@ -680,7 +690,7 @@ async def get_info_content_attribute(app, canonical_nonan) -> dict: return new_attrib -async def create_node(app, canonical_id, equivalent_ids, types, info_contents, include_descriptions=True, +async def create_node(app, canonical_id, equivalent_ids, types, info_contents, clique_leaders, include_descriptions=True, include_individual_types=False, include_taxa=False, conflations=None): """Construct the output format given the compressed redis data""" # It's possible that we didn't find a canonical_id @@ -828,6 +838,16 @@ async def create_node(app, canonical_id, equivalent_ids, types, info_contents, i if include_taxa and node_taxa: node["taxa"] = sorted(node_taxa, key=get_numerical_curie_suffix) + # Add clique leaders if available. + if clique_leaders: + clique_leaders_for_node = clique_leaders.get(canonical_id, []) + clique_leaders_with_labels_and_types = [{ + 'identifier': cl, + 'labels': [eid['l'] for eid in eids if eid['i'] == cl], + 'types': [eid['t'] for eid in eids if eid['i'] == cl], + } for cl in clique_leaders_for_node] + node["clique_leaders"] = clique_leaders_with_labels_and_types + # We need to remove `biolink:Entity` from the types returned. # (See explanation at https://github.com/TranslatorSRI/NodeNormalization/issues/173) if 'biolink:Entity' in types[canonical_id]: diff --git a/node_normalizer/server.py b/node_normalizer/server.py index 18ca7ca..2fc1430 100644 --- a/node_normalizer/server.py +++ b/node_normalizer/server.py @@ -265,6 +265,7 @@ async def get_normalized_node_handler( description: bool = fastapi.Query(False, description="Whether to return curie descriptions when possible"), individual_types: bool = fastapi.Query(False, description="Whether to return individual types for equivalent identifiers"), include_taxa: bool = fastapi.Query(True, description="Whether to return taxa for equivalent identifiers"), + include_clique_leaders: bool = fastapi.Query(False, description="Whether to return clique leaders for conflated identifiers"), ): """ Get value(s) for key(s) using redis MGET @@ -274,6 +275,7 @@ async def get_normalized_node_handler( include_descriptions=description, include_individual_types=individual_types, include_taxa=include_taxa, + include_clique_leaders=include_clique_leaders, ) # If curie contains at least one entry, then the only way normalized_nodes could be blank @@ -295,7 +297,7 @@ async def get_normalized_node_handler_post(curies: CurieList): """ normalized_nodes = await get_normalized_nodes(app, curies.curies, curies.conflate, curies.drug_chemical_conflate, curies.description, include_individual_types=curies.individual_types, - include_taxa=curies.include_taxa, + include_taxa=curies.include_taxa, include_clique_leaders=curies.include_clique_leaders, ) # If curies.curies contains at least one entry, then the only way normalized_nodes could be blank diff --git a/node_normalizer/set_id.py b/node_normalizer/set_id.py index 3c3dc30..37a2745 100644 --- a/node_normalizer/set_id.py +++ b/node_normalizer/set_id.py @@ -41,7 +41,7 @@ async def generate_setid(app, curies, conflations) -> SetIDResponse: # We use get_normalized_nodes() to normalize all the CURIEs for us. normalization_results = await get_normalized_nodes( - app, curies, gene_protein_conflation, drug_chemical_conflation, include_descriptions=False, include_individual_types=False, include_taxa=False + app, curies, gene_protein_conflation, drug_chemical_conflation, include_descriptions=False, include_individual_types=False, include_taxa=False, include_clique_leaders=False ) # We prepare a set of sorted, deduplicated curies. From db9f2a32b932994bd32bc5176a53b64c91bcddfb Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 15 Dec 2025 14:26:41 -0500 Subject: [PATCH 02/11] Added on:push trigger for testing. --- .github/workflows/release.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 800d57b..e9d8f8f 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,6 +1,7 @@ name: 'Publish to GitHub Packages' on: + push: release: types: [published] From 5351dc67f6c1eb85377dcedc808e8f1e73da4f1c Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 15 Dec 2025 14:36:15 -0500 Subject: [PATCH 03/11] Attempt at fix. --- node_normalizer/normalizer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index 713c877..430a083 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -573,14 +573,14 @@ async def get_normalized_nodes( if conflate_gene_protein: gene_protein_clique_leaders = await app.state.gene_protein_db.mget(*canonical_nonan, encoding='utf8') other_ids.extend(gene_protein_clique_leaders) - clique_leaders.update(zip(canonical_nonan, gene_protein_clique_leaders)) + clique_leaders.update(zip(*canonical_nonan, gene_protein_clique_leaders)) # logger.error(f"After conflate_gene_protein: {other_ids}") if conflate_chemical_drug: drug_chemical_clique_leaders = await app.state.chemical_drug_db.mget(*canonical_nonan, encoding='utf8') other_ids.extend(drug_chemical_clique_leaders) - clique_leaders.update(zip(canonical_nonan, drug_chemical_clique_leaders)) + clique_leaders.update(zip(*canonical_nonan, drug_chemical_clique_leaders)) # logger.error(f"After conflate_chemical_drug: {other_ids}") @@ -661,7 +661,7 @@ async def get_normalized_nodes( end_time = time.time_ns() logger.info(f"Normalized {len(curies)} nodes in {(end_time - start_time)/1_000_000:.2f} ms with arguments " + f"(curies={curies}, conflate_gene_protein={conflate_gene_protein}, conflate_chemical_drug={conflate_chemical_drug}, " + - f"include_descriptions={include_descriptions}, include_individual_types={include_individual_types})") + f"include_descriptions={include_descriptions}, include_individual_types={include_individual_types}, include_clique_leaders={include_clique_leaders})") return normal_nodes From 55753fb85834ee4331b779328de37cff082dcc57 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 15 Dec 2025 14:46:18 -0500 Subject: [PATCH 04/11] Added logging for debugging. --- node_normalizer/normalizer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index 430a083..b876f46 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -840,6 +840,7 @@ async def create_node(app, canonical_id, equivalent_ids, types, info_contents, c # Add clique leaders if available. if clique_leaders: + logger.info(f"Getting clique_leaders from {clique_leaders} for canonical ID {canonical_id}") clique_leaders_for_node = clique_leaders.get(canonical_id, []) clique_leaders_with_labels_and_types = [{ 'identifier': cl, From dda3646b789e7f1f79c755b6865333545c21943a Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 15 Dec 2025 14:47:15 -0500 Subject: [PATCH 05/11] Replaced logging with logger. --- node_normalizer/normalizer.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index b876f46..a921bb2 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -511,7 +511,7 @@ async def get_eqids_and_types( types_with_ancestors = [] for index, typ in enumerate(types): if not typ: - logging.error(f"No type information found for '{canonical_nonan[index]}' with eqids: {eqids[index]}, " + logger.error(f"No type information found for '{canonical_nonan[index]}' with eqids: {eqids[index]}, " f"replacing with {BIOLINK_NAMED_THING}") types_with_ancestors.append([BIOLINK_NAMED_THING]) else: @@ -625,7 +625,7 @@ async def get_normalized_nodes( t = [] for other in dereference_others[canonical_id]: - # logging.debug(f"e = {e}, other = {other}, deref_others_eqs = {deref_others_eqs}") + # logger.debug(f"e = {e}, other = {other}, deref_others_eqs = {deref_others_eqs}") e += deref_others_eqs[other] t += deref_others_typ[other] @@ -703,16 +703,16 @@ async def create_node(app, canonical_id, equivalent_ids, types, info_contents, c # If we have 'None' in the equivalent IDs, skip it so we don't confuse things further down the line. if None in equivalent_ids[canonical_id]: - logging.warning(f"Skipping None in canonical ID {canonical_id} among eqids: {equivalent_ids}") + logger.warning(f"Skipping None in canonical ID {canonical_id} among eqids: {equivalent_ids}") equivalent_ids[canonical_id] = [x for x in equivalent_ids[canonical_id] if x is not None] if not equivalent_ids[canonical_id]: - logging.warning(f"No non-None values found for ID {canonical_id} among filtered eqids: {equivalent_ids}") + logger.warning(f"No non-None values found for ID {canonical_id} among filtered eqids: {equivalent_ids}") return None # If we have 'None' in the canonical types, something went horribly wrong (specifically: we couldn't # find the type information for all the eqids for this clique). Return None. if None in types[canonical_id]: - logging.error(f"No types found for canonical ID {canonical_id} among types: {types}") + logger.error(f"No types found for canonical ID {canonical_id} among types: {types}") return None # OK, now we should have id's in the format [ {"i": "MONDO:12312", "l": "Scrofula"}, {},...] From de096d2599a6cbbd549e529d3a233cd9764ece71 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 15 Dec 2025 14:59:12 -0500 Subject: [PATCH 06/11] Attempt to fix clique leader querying. --- node_normalizer/normalizer.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index a921bb2..4aaad64 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -573,14 +573,16 @@ async def get_normalized_nodes( if conflate_gene_protein: gene_protein_clique_leaders = await app.state.gene_protein_db.mget(*canonical_nonan, encoding='utf8') other_ids.extend(gene_protein_clique_leaders) - clique_leaders.update(zip(*canonical_nonan, gene_protein_clique_leaders)) + if include_clique_leaders: + clique_leaders.update(zip(canonical_nonan, json.loads(gene_protein_clique_leaders))) # logger.error(f"After conflate_gene_protein: {other_ids}") if conflate_chemical_drug: drug_chemical_clique_leaders = await app.state.chemical_drug_db.mget(*canonical_nonan, encoding='utf8') other_ids.extend(drug_chemical_clique_leaders) - clique_leaders.update(zip(*canonical_nonan, drug_chemical_clique_leaders)) + if include_clique_leaders: + clique_leaders.update(zip(canonical_nonan, json.loads(drug_chemical_clique_leaders))) # logger.error(f"After conflate_chemical_drug: {other_ids}") @@ -641,10 +643,6 @@ async def get_normalized_nodes( dereference_ids = dict() dereference_types = dict() - # Don't write out clique leaders unless its requested. - if not include_clique_leaders: - clique_leaders = None - # output the final result normal_nodes = { input_curie: await create_node(app, canonical_id, dereference_ids, dereference_types, info_contents, clique_leaders, From 53a82cb673383c5f439dc696538d154d83c209ea Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 15 Dec 2025 15:54:49 -0500 Subject: [PATCH 07/11] Get the clique leaders translated again. --- node_normalizer/normalizer.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index 4aaad64..aa1dcae 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -571,25 +571,27 @@ async def get_normalized_nodes( other_ids = [] if conflate_gene_protein: - gene_protein_clique_leaders = await app.state.gene_protein_db.mget(*canonical_nonan, encoding='utf8') + gene_protein_clique_leaders_strings = await app.state.gene_protein_db.mget(*canonical_nonan, encoding='utf8') + gene_protein_clique_leaders = [json.loads(oids) if oids else [] for oids in gene_protein_clique_leaders_strings] other_ids.extend(gene_protein_clique_leaders) if include_clique_leaders: - clique_leaders.update(zip(canonical_nonan, json.loads(gene_protein_clique_leaders))) + clique_leaders.update(zip(canonical_nonan, gene_protein_clique_leaders)) # logger.error(f"After conflate_gene_protein: {other_ids}") if conflate_chemical_drug: - drug_chemical_clique_leaders = await app.state.chemical_drug_db.mget(*canonical_nonan, encoding='utf8') + drug_chemical_clique_leaders_strings = await app.state.chemical_drug_db.mget(*canonical_nonan, encoding='utf8') + drug_chemical_clique_leaders = [json.loads(oids) if oids else [] for oids in drug_chemical_clique_leaders_strings] other_ids.extend(drug_chemical_clique_leaders) if include_clique_leaders: - clique_leaders.update(zip(canonical_nonan, json.loads(drug_chemical_clique_leaders))) + clique_leaders.update(zip(canonical_nonan, drug_chemical_clique_leaders)) - # logger.error(f"After conflate_chemical_drug: {other_ids}") + # logger.error(f"After conflate_chemical_drug: {other_ids}") # if there are other ids, then we want to rebuild eqids and types. That's because even though we have them, # they're not necessarily first. For instance if what came in and got canonicalized was a protein id # and we want gene first, then we're relying on the order of the other_ids to put it back in the right place. - other_ids = [json.loads(oids) if oids else [] for oids in other_ids] + # other_ids = [json.loads(oids) if oids else [] for oids in other_ids] # Until we added conflate_chemical_drug, canonical_nonan and other_ids would always have the same # length, so we could figure out mappings from one to the other just by doing: From b304088549209b30e9c0d0fb919acab61001a4e3 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 15 Dec 2025 16:38:22 -0500 Subject: [PATCH 08/11] Fixed up output. --- node_normalizer/normalizer.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index aa1dcae..ec6c1ee 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -731,8 +731,7 @@ async def create_node(app, canonical_id, equivalent_ids, types, info_contents, c identifiers_with_labels = eids else: # We have a conflation going on! To replicate Babel's behavior, we need to run the algorithem - # on the list of labels corresponding to the first - # So we need to run the algorithm on the first set of identifiers that have any + # on the list of labels corresponding to the first set of identifiers that have any # label whatsoever. identifiers_with_labels = [] curies_already_checked = set() @@ -810,12 +809,19 @@ async def create_node(app, canonical_id, equivalent_ids, types, info_contents, c # now need to reformat the identifier keys. It could be cleaner but we have to worry about if there is a label descriptions = [] + clique_leaders_output = {} node_taxa = set() node["equivalent_identifiers"] = [] for eqid in eids: eq_item = {"identifier": eqid["i"]} if "l" in eqid and eqid["l"]: eq_item["label"] = eqid["l"] + if clique_leaders and eqid["i"] in clique_leaders: + clique_leaders_output[eqid["i"]] = { + "identifier": eqid["i"], + "label": eqid["l"], + "biolink_type": types.get(eqid["i"], ["UNKNOWN"])[0], + } # if descriptions is enabled, add it to descriptions. if include_descriptions and "d" in eqid and len(eqid["d"]) > 0: desc = eqid["d"][0] @@ -840,14 +846,12 @@ async def create_node(app, canonical_id, equivalent_ids, types, info_contents, c # Add clique leaders if available. if clique_leaders: - logger.info(f"Getting clique_leaders from {clique_leaders} for canonical ID {canonical_id}") - clique_leaders_for_node = clique_leaders.get(canonical_id, []) - clique_leaders_with_labels_and_types = [{ - 'identifier': cl, - 'labels': [eid['l'] for eid in eids if eid['i'] == cl], - 'types': [eid['t'] for eid in eids if eid['i'] == cl], - } for cl in clique_leaders_for_node] - node["clique_leaders"] = clique_leaders_with_labels_and_types + # If there are any clique leader IDs we haven't included in clique_leaders_output, + # insert it anyway at this point. This shouldn't happen, but let's be careful. + missing_clique_leaders = (clique_leaders_output.keys() - clique_leaders) + for cl_id in missing_clique_leaders: + clique_leaders_output[cl_id] = {"identifier": cl_id, "biolink_type": types.get(cl_id, ["UNKNOWN"])[0]} + node["clique_leaders"] = clique_leaders_output # We need to remove `biolink:Entity` from the types returned. # (See explanation at https://github.com/TranslatorSRI/NodeNormalization/issues/173) From 6abcd84d103aac2c1aa3017075d220d460e1ac97 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 15 Dec 2025 16:52:31 -0500 Subject: [PATCH 09/11] Bugfixes. --- node_normalizer/normalizer.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index ec6c1ee..e6a7486 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -816,12 +816,6 @@ async def create_node(app, canonical_id, equivalent_ids, types, info_contents, c eq_item = {"identifier": eqid["i"]} if "l" in eqid and eqid["l"]: eq_item["label"] = eqid["l"] - if clique_leaders and eqid["i"] in clique_leaders: - clique_leaders_output[eqid["i"]] = { - "identifier": eqid["i"], - "label": eqid["l"], - "biolink_type": types.get(eqid["i"], ["UNKNOWN"])[0], - } # if descriptions is enabled, add it to descriptions. if include_descriptions and "d" in eqid and len(eqid["d"]) > 0: desc = eqid["d"][0] @@ -837,6 +831,15 @@ async def create_node(app, canonical_id, equivalent_ids, types, info_contents, c eq_item["type"] = eqid['types'][-1] node["equivalent_identifiers"].append(eq_item) + if clique_leaders and eqid["i"] in clique_leaders: + clique_leaders_output[eqid["i"]] = { + "identifier": eqid["i"], + "label": eq_item.get("label", ""), + "description": eq_item.get("description", ""), + "taxa": eq_item.get("taxa", []), + "type": eq_item.get("type", "UNKNOWN") + } + if include_descriptions and descriptions: node["descriptions"] = descriptions node["id"]["description"] = descriptions[0] @@ -848,7 +851,7 @@ async def create_node(app, canonical_id, equivalent_ids, types, info_contents, c if clique_leaders: # If there are any clique leader IDs we haven't included in clique_leaders_output, # insert it anyway at this point. This shouldn't happen, but let's be careful. - missing_clique_leaders = (clique_leaders_output.keys() - clique_leaders) + missing_clique_leaders = (clique_leaders - clique_leaders_output.keys()) for cl_id in missing_clique_leaders: clique_leaders_output[cl_id] = {"identifier": cl_id, "biolink_type": types.get(cl_id, ["UNKNOWN"])[0]} node["clique_leaders"] = clique_leaders_output From 321fb85233ad4126145417934840857dfce6c57e Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 15 Dec 2025 17:15:07 -0500 Subject: [PATCH 10/11] Fix clique leader output. --- node_normalizer/normalizer.py | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index e6a7486..b3361e6 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -831,14 +831,16 @@ async def create_node(app, canonical_id, equivalent_ids, types, info_contents, c eq_item["type"] = eqid['types'][-1] node["equivalent_identifiers"].append(eq_item) - if clique_leaders and eqid["i"] in clique_leaders: - clique_leaders_output[eqid["i"]] = { - "identifier": eqid["i"], - "label": eq_item.get("label", ""), - "description": eq_item.get("description", ""), - "taxa": eq_item.get("taxa", []), - "type": eq_item.get("type", "UNKNOWN") - } + if clique_leaders and canonical_id in clique_leaders and eqid["i"] in clique_leaders[canonical_id]: + clique_leaders_output[eqid["i"]] = { "identifier": eqid["i"] } + if "label" in eq_item: + clique_leaders_output[eqid["i"]]["label"] = eq_item["label"] + if "description" in eq_item: + clique_leaders_output[eqid["i"]]["description"] = eq_item["description"] + if "taxa" in eq_item: + clique_leaders_output[eqid["i"]]["taxa"] = eqid["taxa"] + if "type" in eq_item: + clique_leaders_output[eqid["i"]]["type"] = eqid["type"] if include_descriptions and descriptions: node["descriptions"] = descriptions @@ -849,12 +851,14 @@ async def create_node(app, canonical_id, equivalent_ids, types, info_contents, c # Add clique leaders if available. if clique_leaders: - # If there are any clique leader IDs we haven't included in clique_leaders_output, - # insert it anyway at this point. This shouldn't happen, but let's be careful. - missing_clique_leaders = (clique_leaders - clique_leaders_output.keys()) - for cl_id in missing_clique_leaders: - clique_leaders_output[cl_id] = {"identifier": cl_id, "biolink_type": types.get(cl_id, ["UNKNOWN"])[0]} - node["clique_leaders"] = clique_leaders_output + node["clique_leaders"] = [] + for cl_id in clique_leaders: + if cl_id in clique_leaders_output: + node["clique_leaders"].append(clique_leaders_output[cl_id]) + else: + node["clique_leaders"].append({ + "identifier": cl_id, + }) # We need to remove `biolink:Entity` from the types returned. # (See explanation at https://github.com/TranslatorSRI/NodeNormalization/issues/173) From 1ad6572b498cc98a2fa05a418e4ff6903c320717 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 15 Dec 2025 17:22:54 -0500 Subject: [PATCH 11/11] Bugfix. --- node_normalizer/normalizer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index b3361e6..4b9c007 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -816,6 +816,7 @@ async def create_node(app, canonical_id, equivalent_ids, types, info_contents, c eq_item = {"identifier": eqid["i"]} if "l" in eqid and eqid["l"]: eq_item["label"] = eqid["l"] + # if descriptions is enabled, add it to descriptions. if include_descriptions and "d" in eqid and len(eqid["d"]) > 0: desc = eqid["d"][0] @@ -838,9 +839,9 @@ async def create_node(app, canonical_id, equivalent_ids, types, info_contents, c if "description" in eq_item: clique_leaders_output[eqid["i"]]["description"] = eq_item["description"] if "taxa" in eq_item: - clique_leaders_output[eqid["i"]]["taxa"] = eqid["taxa"] + clique_leaders_output[eqid["i"]]["taxa"] = eq_item["taxa"] if "type" in eq_item: - clique_leaders_output[eqid["i"]]["type"] = eqid["type"] + clique_leaders_output[eqid["i"]]["type"] = eq_item["type"] if include_descriptions and descriptions: node["descriptions"] = descriptions