diff --git a/grma/donorsgraph/build_donors_graph.py b/grma/donorsgraph/build_donors_graph.py
index 66b06c4..5c5faaf 100644
--- a/grma/donorsgraph/build_donors_graph.py
+++ b/grma/donorsgraph/build_donors_graph.py
@@ -10,7 +10,8 @@
 from grma.donorsgraph.create_lol import LolBuilder
 from grma.match.graph_wrapper import Graph
 from grma.utilities.geno_representation import HashableArray
-from grma.utilities.utils import gl_string_to_integers, tuple_geno_to_int, print_time
+from grma.utilities.utils import print_time, geno_to_int, gl_string_to_hash
+from bidict import bidict
 
 
 class BuildMatchingGraph:
@@ -19,7 +20,7 @@ class BuildMatchingGraph:
     It gets a path to directory with the donors' file,
     builds the graph and saved it as LOL graph using Cython.
     """
-    __slots__ = "_verbose", "_graph", "_edges"
+    __slots__ = "_verbose", "_graph", "_edges", "bidirectional_dict"
 
     def __init__(self, path_to_donors_directory: str, verbose: bool = False):
         """
@@ -31,19 +32,20 @@ def __init__(self, path_to_donors_directory: str, verbose: bool = False):
         """
         self._verbose = verbose
         self._graph = None  # LOL dict-representation
         self._edges: List[Edge] = []  # edge-list
+        self.bidirectional_dict = bidict()
 
         self._save_graph_as_edges(path_to_donors_directory)
 
     def _create_classes_edges(self, geno, class_, layers):
-        int_class = tuple_geno_to_int(class_)
+        hash_class = gl_string_to_hash(str(class_)) % 1000000000 + 1000000000
 
-        self._edges.append(Edge(int_class, geno, 0))
+        self._edges.append(Edge(hash_class, geno, 0))
 
         # check if the class node was created
-        if int_class not in layers["CLASS"]:
-            layers["CLASS"].add(int_class)
-            self._create_subclass_edges(class_, int_class, layers)
+        if hash_class not in layers["CLASS"]:
+            layers["CLASS"].add(hash_class)
+            self._create_subclass_edges(class_, hash_class, layers)
 
-    def _create_subclass_edges(self, class_, int_class, layers):
+    def _create_subclass_edges(self, class_, hash_class, layers):
         """ subclasses edges are created by dropping an allele from a class.
         each allele we drop, will be replaced with zero,
@@ -56,19 +58,18 @@ def _create_subclass_edges(self, class_, int_class, layers):
         # set the missing allele to always be the second allele in the locus
         for i in range(num_of_alleles):
             if i % 2 == 0:
-                subclass_alleles.add(
-                    tuple_geno_to_int(tuple(class_[0:i] + (0,) + class_[i + 1 :]))
-                )
+                subclass = tuple(class_[0:i] + (0,) + class_[i + 1 :])
             else:
-                subclass_alleles.add(
-                    tuple_geno_to_int(
-                        tuple(class_[0 : i - 1] + (0, class_[i - 1]) + class_[i + 1 :])
-                    )
+                subclass = tuple(
+                    class_[0 : i - 1] + (0, class_[i - 1]) + class_[i + 1 :]
                 )
+            hash_subclass = gl_string_to_hash(str(subclass)) % 1000000000 + 2000000000
+            subclass_alleles.add(hash_subclass)
+
         # add subclass->class edges
         for sub in subclass_alleles:
-            self._edges.append(Edge(sub, int_class, 0))
+            self._edges.append(Edge(sub, hash_class, 0))
 
             if sub not in layers["SUBCLASS"]:
                 layers["SUBCLASS"].add(sub)
@@ -101,16 +102,27 @@ def _save_graph_as_edges(self, path_to_donors_directory: str | os.PathLike):
         ):
             # retrieve all line's parameters
             donor_id, geno, probability, index = line.strip().split(",")
-            donor_id = int(donor_id)
+            donor_id = -1 * int(donor_id)
             index = int(index)
             probability = float(probability)
 
             # convert geno to list of integers
-            geno = gl_string_to_integers(geno)
-
-            # sort alleles for each HLA-X
-            for x in range(0, len(geno), 2):
-                geno[x : x + 2] = sorted(geno[x : x + 2])
+            alleles = [
+                allele
+                for locus in geno.split("^")
+                for allele in locus.split("+")
+            ]
+            geno = []
+            for allele in alleles:
+                if "N" in allele:
+                    geno.append(0)
+                elif allele in self.bidirectional_dict:
+                    geno.append(self.bidirectional_dict[allele])
+                else:
+                    self.bidirectional_dict[allele] = (
+                        len(self.bidirectional_dict) + 3
+                    )
+                    geno.append(self.bidirectional_dict[allele])
 
             geno = HashableArray(geno)
 
             # handle new donor appearance in file
@@ -135,7 +147,7 @@ def _save_graph_as_edges(self, path_to_donors_directory: str | os.PathLike):
                 # continue creation of classes and subclasses
                 if geno not in layers["GENOTYPE"]:
                     layers["GENOTYPE"].add(geno)
-                    CLASS_I_END = -2 * int(-len(geno)/4 - 0.5)
+                    CLASS_I_END = -2 * int(-len(geno) / 4 - 0.5)
                     geno_class1 = tuple(geno[:CLASS_I_END])
                     geno_class2 = tuple(geno[CLASS_I_END:])
                     self._create_classes_edges(geno, geno_class1, layers)
@@ -150,6 +162,8 @@ def _save_graph_as_edges(self, path_to_donors_directory: str | os.PathLike):
 
         # add the last donor to edgelist
         for HLA, probability in probability_dict.items():
+            if last_id == 0:
+                print("last_id = 0")
             self._edges.append(Edge(HLA, last_id, probability / total_probability))
             self._edges.append(Edge(last_id, HLA, probability / total_probability))
 
@@ -176,4 +190,5 @@ def to_pickle(self, path: Union[str, os.PathLike]):
 
         :param path: A path to save the pickled object
         """
-        pickle.dump(self._graph, open(path, "wb"))
+        graph_bdict = [self._graph, self.bidirectional_dict]
+        pickle.dump(graph_bdict, open(path, "wb"))
diff --git a/grma/donorsgraph/create_lol.py b/grma/donorsgraph/create_lol.py
index 8365227..0789a4d 100644
--- a/grma/donorsgraph/create_lol.py
+++ b/grma/donorsgraph/create_lol.py
@@ -6,7 +6,7 @@
 from collections import OrderedDict
 
 from grma.donorsgraph import Edge
-from grma.utilities.utils import print_time, tuple_geno_to_int
+from grma.utilities.utils import print_time
 
 
 class LolBuilder:
@@ -76,10 +76,10 @@ def _convert(self, layers: Dict[str, Set]):
         arrays_start = free
         # map lol-ids to arrays
         # given an lol_id, the mapping will be map_number_to_arr_node[lol_id - arrays_start, :]
-        geno = layers['GENOTYPE'].pop()
-        layers['GENOTYPE'].add(geno)
+        geno = layers["GENOTYPE"].pop()
+        layers["GENOTYPE"].add(geno)
         map_number_to_arr_node = np.zeros(
-            (len(layers["GENOTYPE"]), len(geno)), dtype=np.uint16
+            (len(layers["GENOTYPE"]), len(geno)), dtype=np.uint32
         )
         for i, geno in tqdm(
             enumerate(layers["GENOTYPE"]),
@@ -87,7 +87,7 @@ def _convert(self, layers: Dict[str, Set]):
             disable=not self._verbose,
         ):
             map_node_to_number[geno] = free
-            map_number_to_arr_node[i, :] = geno.np()
+            map_number_to_arr_node[i] = geno
             free += 1
 
         # map classes to lol-id.
@@ -112,7 +112,7 @@ def _convert(self, layers: Dict[str, Set]):
                 )
                 if y < subclasses_start
             ],
-            dtype=np.uint32,
+            dtype=np.int32,
         )
 
         print_time("(3/6) Create the index list")
@@ -186,12 +186,6 @@ def _convert(self, layers: Dict[str, Set]):
             weights_list=weights_list,
         )
 
-        # replace geno hashable array to more efficient representation.
-        for array_geno in layers["GENOTYPE"]:
-            int_geno = tuple_geno_to_int(array_geno)
-            map_node_to_number[int_geno] = map_node_to_number[array_geno]
-            del map_node_to_number[array_geno]
-
         del self._graph
         del layers
         gc.collect()
diff --git a/grma/match/donors_matching.py b/grma/match/donors_matching.py
index cbd0dc2..bf19534 100644
--- a/grma/match/donors_matching.py
+++ b/grma/match/donors_matching.py
@@ -1,7 +1,11 @@
 from collections.abc import Iterator
 from typing import List, Tuple, Set, Iterable, Dict
 from typing import Sequence
 
+from grma.match.hla_match import batch_locuses_match_between_genos
+from ..utilities.utils import geno_to_int, gl_string_to_hash
+import ast
+from bidict import bidict
 import networkx as nx
 import numpy as np
 import pandas as pd
@@ -13,8 +17,6 @@
     donor_mismatch_format,
     drop_less_than_7_matches,
     check_similarity,
-    gl_string_to_integers,
-    tuple_geno_to_int,
     print_time,
 )
 
@@ -37,15 +39,24 @@ def _init_results_df(donors_info):
     fields_in_results = {
         "Patient_ID": [],
         "Donor_ID": [],
-        "GvH_Mismatches": [],
-        "HvG_Mismatches": [],
-        "Number_Of_Mismatches": [],
         "Matching_Probability": [],
         "Match_Probability": [],
         "Permissive/Non-Permissive": [],
-        "Match_Between_Most_Commons": [],
+        # "Match_Between_Most_Commons": [],
     }
 
+    # Define the mismatch types and their order
+    mismatch_types = ["", "HvG", "GvH"]
+    for t in mismatch_types:
+        for i in range(4):  # 0 to 3 mismatches
+            name = (
+                f"chance_for_{i}_{t}_Mismatches" if t else f"chance_for_{i}_Mismatches"
+            )
+            fields_in_results[name] = []
+        # Add "more than 3" for each mismatch type
+        name = f"chance_for_3+_{t}_Mismatches" if t else f"chance_for_3+_Mismatches"
+        fields_in_results[name] = []
+
     donors_db_fields = DONORS_DB.columns.values.tolist()
     for di in donors_info:
         if di in donors_db_fields:
@@ -53,26 +64,28 @@
     return pd.DataFrame(fields_in_results)
 
 
-def locuses_match_between_genos(geno_pat, geno_don):
-    matches = []
-    total_gvh = 0
-    total_hvg = 0
-
-    for i in range(0, len(geno_pat), 2):
-        a1, b1 = geno_pat[i], geno_pat[i + 1]
-        a2, b2 = geno_don[i], geno_don[i + 1]
-
-        s1 = int(a1 == a2) + int(b1 == b2)
-        s2 = int(a1 == b2) + int(b1 == a2)
-        matches.append(max(s1, s2))
-
-        p_set = {x for x in (a1, b1) if x not in (None, 0)}
-        d_set = {x for x in (a2, b2) if x not in (None, 0)}
-
-        total_gvh += len(p_set - d_set)  # patient has, donor lacks
-        total_hvg += len(d_set - p_set)  # donor has, patient lacks
-
-    return matches, total_gvh, total_hvg
+# def locuses_match_between_genos(geno_pat, geno_don):
+#     matches = []
+#     total_gvh = 0
+#     total_hvg = 0
+#
+#     for i in range(0, len(geno_pat), 2):
+#         a1, b1 = geno_pat[i], geno_pat[i + 1]
+#         a2, b2 = geno_don[i], geno_don[i + 1]
+#
+#         P = {x for x in (a1, b1) if x not in (None, 0)}
+#         D = {x for x in (a2, b2) if x not in (None, 0)}
+#
+#         gvh_i = len(P - D)  # patient has, donor lacks
+#         hvg_i = len(D - P)  # donor has, patient lacks
+#
+#         total_gvh += gvh_i
+#         total_hvg += hvg_i
+#
+#         mismatch_i = max(gvh_i, hvg_i)  # table's #Max
+#         matches.append(mismatch_i)
+#
+#     return matches, total_gvh, total_hvg
 
 
 class DonorsMatching(object):
@@ -84,16 +97,20 @@ class DonorsMatching(object):
         "_genotype_candidates",
         "patients",
         "verbose",
+        "bidirectional_dict",
+        "_donor_neighbors_cache",
     )
 
-    def __init__(self, graph: Graph, verbose: bool = False):
+    def __init__(self, graph: Graph, verbose: bool = False, bdict: bidict = None):
         self._graph: Graph = graph
         self._patients_graph: nx.DiGraph = nx.DiGraph()
         self._genotype_candidates: Dict[
             int, Dict[int, List[Tuple[float, int]]]
         ] = {}  # AMIT ADD
-        self.patients: Dict[int, Sequence[int]] = {}
+        self.patients: Dict[int, Dict[int, HashableArray]] = {}
         self.verbose = verbose
+        self.bidirectional_dict = bdict
+        self._donor_neighbors_cache = {}
 
     def get_most_common_genotype(self, donor_id):
         """Takes a donor ID and return his/her most common genotype."""
@@ -144,7 +161,9 @@ def __find_genotype_candidates_from_class(
     ) -> Tuple[np.ndarray, np.ndarray]:
         """Takes an integer subclass.
         Returns the genotypes (ids and values) which are connected to it in the graph"""
-        return self._graph.class_neighbors(clss, Len = len(list(self.patients.values())[0]))
+        return self._graph.class_neighbors(
+            clss, Len=len(next(iter(next(iter(self.patients.values())).values())))
+        )
 
     def __find_donor_from_geno(self, geno_id: int) -> Sequence[int]:
         """Gets the LOL ID of a genotype.
@@ -210,14 +229,21 @@ def __add_matched_genos_to_graph(
                                weight={geno_num: [probability, similarity]})
         """
 
-    def __classes_and_subclasses_from_genotype(self, genotype: HashableArray):
+    def __classes_and_subclasses_from_genotype(self, genotype):
         subclasses = []
-        ALLELES_IN_CLASS_I = -2*int(-len(genotype)/4-0.5)
+        ALLELES_IN_CLASS_I = -2 * int(-len(genotype) / 4 - 0.5)
         ALLELES_IN_CLASS_II = len(genotype) - ALLELES_IN_CLASS_I
-        classes = [(genotype[:ALLELES_IN_CLASS_I], 0), (genotype[ALLELES_IN_CLASS_I:], 1)]
+        classes = [
+            (tuple(genotype[:ALLELES_IN_CLASS_I]), 0),
+            (tuple(genotype[ALLELES_IN_CLASS_I:]), 1),
+        ]
         num_of_alleles_in_class = [ALLELES_IN_CLASS_I, ALLELES_IN_CLASS_II]
 
-        int_classes = [(tuple_geno_to_int(tuple(clss[0])), clss[1]) for clss in classes]
+        int_classes = []
+        for clss in classes:
+            hash_clss = gl_string_to_hash(str(clss[0])) % 1000000000 + 1000000000
+            int_classes.append((hash_clss, clss[1]))
+
         for clss in int_classes:
             self._patients_graph.add_edge(clss[0], genotype)
@@ -228,22 +254,29 @@ def __classes_and_subclasses_from_genotype(self, genotype: HashableArray):
             for k in range(0, num_of_alleles_in_class[class_num]):
                 # set the missing allele to always be the second allele in the locus
                 if k % 2 == 0:
-                    sub = tuple_geno_to_int(
-                        classes[class_num][0][0:k] + ZEROS + classes[class_num][0][k + 1 :]
+                    subclass = tuple(
+                        classes[class_num][0][0:k]
+                        + (0,)
+                        + classes[class_num][0][k + 1 :]
                     )
                 else:
-                    sub = tuple_geno_to_int(
+                    subclass = tuple(
                         classes[class_num][0][0 : k - 1]
-                        + ZEROS
+                        + (0,)
                         + classes[class_num][0][k - 1 : k]
                         + classes[class_num][0][k + 1 :]
                     )
+                hash_subclass = (
+                    gl_string_to_hash(str(subclass)) % 1000000000 + 2000000000
+                )
 
                 # missing allele number is the index of the first allele of the locus the missing allele belongs to.
                 # Could be [0, 2, 4, 6, 8]
                 missing_allele_num = ALLELES_IN_CLASS_I * class_num + 2 * (k // 2)
                 subclass = ClassMinusOne(
-                    subclass=sub, class_num=class_num, allele_num=missing_allele_num
+                    subclass=hash_subclass,
+                    class_num=class_num,
+                    allele_num=missing_allele_num,
                 )
 
                 # add subclass -> genotype edge to patients graph
@@ -253,8 +286,6 @@ def __classes_and_subclasses_from_genotype(self, genotype: HashableArray):
 
         return int_classes, subclasses
 
-
-
     def create_patients_graph(self, f_patients: str):
         """
         create patients graph. \n
@@ -273,13 +304,52 @@ def create_patients_graph(self, f_patients: str):
             # retrieve all line's parameters
             line_values = line.strip().split(",")
             patient_id, geno, prob, index = line_values
-
-            geno = gl_string_to_integers(geno)
-            patient_id = int(patient_id)
+            patient_id = -1 * int(patient_id)
             index = int(index)
             prob = float(prob)
 
-            # handle new patient appearance in file
+            encoded = []
+            for locus in geno.split("^"):
+                parts = locus.split("+")
+                aL = parts[0].strip() if len(parts) > 0 else "N"
+                aR = parts[1].strip() if len(parts) > 1 else "N"
+
+                # Classify left allele
+                if "N" in aL:
+                    tL, vL = "missing", 0
+                elif aL in self.bidirectional_dict:
+                    tL, vL = "known", self.bidirectional_dict[aL]
+                else:
+                    tL, vL = "unknown", None
+
+                # Classify right allele
+                if "N" in aR:
+                    tR, vR = "missing", 0
+                elif aR in self.bidirectional_dict:
+                    tR, vR = "known", self.bidirectional_dict[aR]
+                else:
+                    tR, vR = "unknown", None
+
+                # Decide per-locus encoding (two outputs per locus, in order)
+                if tL == "unknown" and tR == "unknown":
+                    if aL == aR:
+                        encoded.extend(
+                            [1, 1]
+                        )  # same unknown twice (homozygous unknown)
+                    else:
+                        encoded.extend([1, 2])  # two different unknowns
+                elif tL == "unknown" and tR != "unknown":
+                    encoded.extend([1, vR])  # left unknown, right known/missing
+                elif tL != "unknown" and tR == "unknown":
+                    encoded.extend([vL, 1])  # left known/missing, right unknown
+                else:
+                    encoded.extend([vL, vR])  # both known/missing
+
+            geno = HashableArray(encoded)
+            if patient_id in self.patients.keys():
+                self.patients[patient_id][index] = geno
+            else:
+                self.patients[patient_id] = {index: geno}
+
             if index == 0:
                 # set normalized probabilities
                 for HLA, probability in prob_dict.items():
@@ -290,7 +360,6 @@ def create_patients_graph(self, f_patients: str):
                 # initialize parameters
                 prob_dict = {}
                 total_prob = 0
-                self.patients[patient_id] = geno
                 self._genotype_candidates[
                     patient_id
                 ] = {}  # AMIT ADD - initialize _genotype_candidates
@@ -299,13 +368,6 @@ def create_patients_graph(self, f_patients: str):
                 subclasses_by_patient[patient_id] = set()
                 classes_by_patient[patient_id] = set()
 
-            # sort alleles for each HLA-X
-            l = len(geno)
-            for x in range(0, l, 2):
-                geno[x : x + 2] = sorted(geno[x : x + 2])
-
-            geno = HashableArray(geno)
-
             # add probabilities to probability dict
             total_prob += prob
             if geno not in prob_dict:
@@ -352,14 +414,22 @@ def find_geno_candidates_by_subclasses(self, subclasses):
                 genotypes_value,
             ) = self.__find_genotype_candidates_from_subclass(subclass.subclass)
 
-            geno = genotypes_value[0]
-            ALLELES_IN_CLASS_I = -2*int(-len(geno)/4-0.5)
-            ALLELES_IN_CLASS_II = len(geno) - ALLELES_IN_CLASS_I
+            len_geno = len(genotypes_value[0])
+            ALLELES_IN_CLASS_I = -2 * int(-len_geno / 4 - 0.5)
+            ALLELES_IN_CLASS_II = len_geno - ALLELES_IN_CLASS_I
 
             # Checks only the locuses that are not certain to match
             if subclass.class_num == 0:
-                allele_range_to_check = np.array([x for x in range(len(geno)//2 + (len(geno)//2 & 1), len(geno), 2)] + [subclass.allele_num], dtype=np.uint8)
+                allele_range_to_check = np.array(
+                    [x for x in range(ALLELES_IN_CLASS_I, len_geno, 2)]
+                    + [subclass.allele_num],
+                    dtype=np.uint8,
+                )
             else:
-                allele_range_to_check = np.array([x for x in range(0, len(geno)//2 + (len(geno)//2 & 1), 2)] + [subclass.allele_num], dtype=np.uint8)
+                allele_range_to_check = np.array(
+                    [x for x in range(0, ALLELES_IN_CLASS_I, 2)]
+                    + [subclass.allele_num],
+                    dtype=np.uint8,
+                )
 
             # number of alleles that already match due to match in subclass
             matched_alleles: int = (
@@ -395,14 +465,34 @@ def find_geno_candidates_by_classes(self, classes):
             # Checks only the locuses that are not certain to match (the locuses of the other class)
             # Class I appearances: 3 locuses = 6 alleles = 23/24 digits
             # Class II appearances: 2 locuses = 4 alleles = 15/16 digits
-            geno = list(self.patients.values())[0]
+            num_alleles = len(
+                next(iter(next(iter(self.patients.values())).values()))
+            )
             if clss[1] == 0:
-                allele_range_to_check = np.array([x for x in range(len(geno)//2 + (len(geno)//2 & 1), len(geno), 2)], dtype=np.uint8)
-                matched_alleles: int = len(geno)//2 + (len(geno)//2 & 1)
+                allele_range_to_check = np.array(
+                    [
+                        x
+                        for x in range(
+                            num_alleles // 2 + (num_alleles // 2 & 1),
+                            num_alleles,
+                            2,
+                        )
+                    ],
+                    dtype=np.uint8,
+                )
+                matched_alleles: int = num_alleles // 2 + (num_alleles // 2 & 1)
             else:
-                allele_range_to_check = np.array([x for x in range(0, len(geno)//2 + (len(geno)//2 & 1), 2)], dtype=np.uint8)
-                matched_alleles: int = len(geno)//2 - (len(geno)//2 & 1)
+                allele_range_to_check = np.array(
+                    [
+                        x
+                        for x in range(
+                            0, num_alleles // 2 + (num_alleles // 2 & 1), 2
+                        )
+                    ],
+                    dtype=np.uint8,
+                )
+                matched_alleles: int = num_alleles // 2 - (num_alleles // 2 & 1)
 
             # Compares the candidate to the patient's genotypes, and adds the match geno candidates to the graph.
             self.__add_matched_genos_to_graph(
@@ -434,8 +524,7 @@ def find_geno_candidates_by_genotypes(self, patient_id: int):
                 "probability"
             ]  # patient's geno probability
 
-            int_geno = tuple_geno_to_int(geno)
-            geno_id = self._graph.get_node_id(int_geno)
+            geno_id = self._graph.get_node_id(geno)
             if not geno_id:
                 continue
 
@@ -485,7 +574,7 @@ def score_matches(
         matched: Set[int],
     ) -> Tuple[Set[int], int, pd.DataFrame]:
         """
-        Given a number of mismatches and a patient, this function will return a dictionary
+        Given a number of mismatches and a patient, this function will return a tuple
        of all matching donors found in the data
        with the specific number of mismatches,
        sorted by their probability for a match.
@@ -496,10 +585,11 @@ def score_matches(
         :param threshold: Minimal score value for a valid match. default is 0.1.
         :param cutoff: Maximum number of matches to return. default is 50.
         :param matched: A set of donors ID that have already matched for this patient.
-        :return: a dictionary of all matching donors and their matching properties.
+        :return: a tuple of (matched set, count of matches, updated results DataFrame).
         """
         if len(matched) >= cutoff:
             return matched, 0, results_df
+        num_alleles = len(next(iter(self.patients[patient].values())))
 
         # a loop that set the scores for all the matching candidates.
         patient_scores = {}
@@ -509,7 +599,7 @@ def score_matches(
         ].items():  # AMIT ADD
             for prob, matches in genotype_matches.values():  # AMIT CHANGE
                 # match_info = (probability of patient's genotype, number of matches to patient's genotype)
-                if matches != len(list(self.patients.values())[0]) - mismatch:
+                if matches != num_alleles - mismatch:
                     continue
 
                 # add the probabilities multiplication of the patient and all the donors that has this genotype
@@ -522,7 +612,6 @@ def score_matches(
                         patient_scores[donor][0] += prob * donor_prob
                         if donor_prob > patient_scores[donor][2]:
                             patient_scores[donor][1:] = [hla_id, donor_prob]
-
                     else:
                         patient_scores[donor] = [prob * donor_prob, hla_id, donor_prob]
 
@@ -547,10 +636,80 @@ def score_matches(
                 if len(matched) >= cutoff:
                     break
                 matched.add(donor)
-                self.__append_matching_donor(
-                    add_donors, donors_info, patient, donor, score * 100, mismatch
+                # Compute chances for gvh, hvg, and mis
+                pats = self.patients[patient]
+                chances = {
+                    "gvh": [0, 0, 0, 0, 0],
+                    "hvg": [0, 0, 0, 0, 0],
+                    "mis": [0, 0, 0, 0, 0],
+                }
+
+                for index, pat in pats.items():
+                    pat_prob = self._patients_graph[pat][patient]["probability"]
+
+                    # --- cache donor genotype neighbors once per donor ---
+                    if not hasattr(self, "_donor_neighbors_cache"):
+                        self._donor_neighbors_cache = {}
+                    donor_neighbors = self._donor_neighbors_cache.setdefault(
+                        donor, list(self._graph.neighbors(donor))
+                    )
+
+                    if not donor_neighbors:
+                        continue
+
+                    # Build 2D array of donor genotypes (rows) and 1D array of their probs
+                    # Assumes each geno[0] is a fixed-length tuple/list of ints (same len as pat.np()).
+                    don_genos = np.array(
+                        [geno[0] for geno in donor_neighbors], dtype=np.uint32
+                    )
+                    don_probs = np.array([geno[1] for geno in donor_neighbors], dtype=float)
+
+                    # Ensure C-contiguous (should already be, but just to be safe)
+                    if not don_genos.flags.c_contiguous:
+                        don_genos = np.ascontiguousarray(don_genos, dtype=np.uint32)
+
+                    pat_np = np.asarray(
+                        pat.np(), dtype=np.uint32
+                    )  # no copy if already uint32 + contiguous
+
+                    # One Cython call for all donor genos
+                    # out[:,0]=mis, out[:,1]=gvh, out[:,2]=hvg
+                    out = batch_locuses_match_between_genos(pat_np, don_genos)
+
+                    # Accumulate chances
+                    # (same logic as before; clamp at 4 for "3+")
+                    for j in range(out.shape[0]):
+                        mis = int(out[j, 0])
+                        gvh = int(out[j, 1])
+                        hvg = int(out[j, 2])
+                        w = pat_prob * don_probs[j]
+                        chances["gvh"][min(gvh, 4)] += w
+                        chances["hvg"][min(hvg, 4)] += w
+                        chances["mis"][min(mis, 4)] += w
+
+                # Compute allele probability
+                allele_prob = self.probability_to_allele(
+                    don_id=donor, pat_geno=self.patients[patient][0].np()
                 )
 
+                # Append donor data to add_donors
+                add_donors["Patient_ID"].append(-1 * patient)
+                add_donors["Donor_ID"].append(-1 * donor)
+                add_donors["Match_Probability"].append(allele_prob)
+                # add_donors["Match_Between_Most_Commons"].append(compare_commons)
+                add_donors["Matching_Probability"].append(score)
+                for t in ("GvH", "HvG", ""):
+                    for i in range(5):
+                        key = f"chance_for_{'3+' if i == 4 else i}_{t + '_Mismatches' if t else 'Mismatches'}"
+                        add_donors[key].append(chances[t.lower() if t else "mis"][i])
+
+                add_donors["Permissive/Non-Permissive"].append("-")
+                # TODO: add permissiveness algorithm
+
+                # add the other given fields to the results
+                for field in donors_info:
+                    add_donors[field].append(DONORS_DB.loc[donor, field])
+
             results_df = pd.concat(
                 [results_df, pd.DataFrame(add_donors)], ignore_index=True
             )
@@ -558,48 +717,7 @@ def score_matches(
         if self.verbose:
             print_time(f"({mismatch} MMs) Found {count_matches} matches")
matches") - return matched, count_matches, results_df # 3433825 - - def __append_matching_donor( - self, - add_donors: Dict, - donors_info: Iterable[str], - patient: int, - donor: int, - match_prob: float, - mm_number: int, - ) -> None: - """add a donor to the matches dictionary""" - pat = self.patients[patient] - don = self.get_most_common_genotype(donor) - compare_commons, gvh, hvg = locuses_match_between_genos( - pat, don - ) - - add_donors["Patient_ID"].append(patient) - add_donors["Donor_ID"].append(donor) - allele_prob = self.probability_to_allele( - don_id=donor, pat_geno=self.patients[patient] - ) - add_donors["Match_Probability"].append(allele_prob) - add_donors["Match_Between_Most_Commons"].append(compare_commons) - - add_donors["Matching_Probability"].append(match_prob) - actual_mismatches = 0 - for match_score in compare_commons: - if match_score != 2: - actual_mismatches += (2 - match_score) - - add_donors["Number_Of_Mismatches"].append(actual_mismatches) - add_donors["GvH_Mismatches"].append(gvh) - add_donors["HvG_Mismatches"].append(hvg) - - add_donors["Permissive/Non-Permissive"].append("-") - # TODO: add permissiveness algorithm - - # add the other given fields to the results - for field in donors_info: - add_donors[field].append(DONORS_DB.loc[donor, field]) + return matched, count_matches, results_df @property def patients_graph(self): diff --git a/grma/match/graph_wrapper.py b/grma/match/graph_wrapper.py index 6f1fe07..feb3080 100644 --- a/grma/match/graph_wrapper.py +++ b/grma/match/graph_wrapper.py @@ -7,6 +7,7 @@ import numpy as np from grma.utilities.geno_representation import HashableArray from grma.match.lol_graph import LolGraph +from bidict import bidict NODES_TYPES = Union[int, HashableArray] @@ -57,10 +58,12 @@ def get_edge_data( ret = self._graph.get_edge_data(node1_num, node2_num) return default if ret == exception_val else ret - def class_neighbors(self, node: NODES_TYPES | int, search_lol_id: bool = False, Len: int = 10): + def class_neighbors( + self, node: NODES_TYPES | int, search_lol_id: bool = False, Len: int = 10 + ): node_num = self._map_node_to_number[node[0]] if not search_lol_id else node neighbors_list = self._graph.neighbors_unweighted(node_num) - neighbors_list_values = np.ndarray([len(neighbors_list), Len], dtype=np.uint16) + neighbors_list_values = np.ndarray([len(neighbors_list), Len], dtype=np.uint32) for i, neighbor in enumerate(neighbors_list): neighbors_list_values[i, :] = self._graph.arr_node_value_from_id(neighbor) @@ -120,5 +123,5 @@ def node_value_from_id(self, node_id: int) -> NODES_TYPES: @classmethod def from_pickle(cls, path: Union[str, PathLike]): - graph_dict = pickle.load(open(path, "rb")) - return cls(graph_dict) + graph_bdict = pickle.load(open(path, "rb")) + return cls(graph_bdict[0]), bidict(graph_bdict[1]) diff --git a/grma/match/lol_graph.pyx b/grma/match/lol_graph.pyx index ee2679f..c123d15 100644 --- a/grma/match/lol_graph.pyx +++ b/grma/match/lol_graph.pyx @@ -15,8 +15,8 @@ cdef class LolGraph: UINT[:] _index_list UINT[:] _neighbors_list FLOAT[:] _weights_list - UINT[:] _map_number_to_num_node - UINT16[:, :] _map_number_to_arr_node + INT[:] _map_number_to_num_node + UINT[:, :] _map_number_to_arr_node # Changed from UINT16[:, :] UINT _arrays_start bint directed bint weighted @@ -24,8 +24,8 @@ cdef class LolGraph: def __init__(self, UINT[:] index_list, UINT[:] neighbors_list, FLOAT[:] weights_list, - UINT[:] map_number_to_num_node, - UINT16[:, :] map_number_to_arr_node, + INT[:] map_number_to_num_node, + UINT[:, :] 
map_number_to_arr_node, # Changed from UINT16[:, :] UINT arrays_start, bint directed, bint weighted): self._index_list = index_list @@ -49,12 +49,12 @@ cdef class LolGraph: @cython.boundscheck(False) @cython.wraparound(False) - cpdef UINT16[:] arr_node_value_from_id(self, UINT node_id): + cpdef UINT[:] arr_node_value_from_id(self, UINT node_id): # Changed from UINT16[:] return self._map_number_to_arr_node[node_id - self._arrays_start] @cython.boundscheck(False) @cython.wraparound(False) - cpdef UINT num_node_value_from_id(self, UINT node_id): + cpdef INT num_node_value_from_id(self, UINT node_id): return self._map_number_to_num_node[node_id] @cython.boundscheck(False) @@ -108,7 +108,6 @@ cdef class LolGraph: return self._weights_list[idx + node2_index] return -1 - # get neighbors of specific node n @cython.boundscheck(False) @cython.wraparound(False) cpdef tuple neighbors_weighted(self, UINT node): @@ -152,8 +151,8 @@ cdef class LolGraph: """return the second degree neighbors of a node - neighbors of neighbors.""" cdef UINT idx, idx_end, i, j, pointer, neighbor_1st, idx_1st_neigh, idx_end_1st_neigh, neighbor_id cdef np.ndarray[UINT, ndim=1] neighbors_list_id, neighbors_id - cdef UINT16[:] arr - cdef np.ndarray[UINT16, ndim=2] neighbors_value + cdef UINT[:] arr # Changed from UINT16[:] + cdef np.ndarray[UINT, ndim=2] neighbors_value # Changed from UINT16 cdef UINT num_of_neighbors_2nd cdef UINT loci_len @@ -178,7 +177,7 @@ cdef class LolGraph: pointer += 1 loci_len = self._map_number_to_arr_node.shape[1] - neighbors_value = np.zeros((num_of_neighbors_2nd, loci_len), dtype=np.uint16) + neighbors_value = np.zeros((num_of_neighbors_2nd, loci_len), dtype=np.uint32) # Changed from uint16 for i in range(len(neighbors_id)): neighbor_id = neighbors_id[i] arr = self.arr_node_value_from_id(neighbor_id) diff --git a/grma/match/match.py b/grma/match/match.py index 99a55ae..1ba0f66 100644 --- a/grma/match/match.py +++ b/grma/match/match.py @@ -9,7 +9,8 @@ from grim import grim import csv import ast - +from bidict import bidict +import re from grma.match import Graph as MatchingGraph from grma.match.donors_matching import DonorsMatching, _init_results_df @@ -120,6 +121,7 @@ def find_matches( save_to_csv: bool = False, calculate_time: bool = False, output_dir: str = "output", + bdict: bidict = None, ): """ The main function responsible for performing the matching. @@ -148,7 +150,7 @@ def find_matches( if verbose: print_time("Start graph matching") - g_m = DonorsMatching(match_graph, verbose=verbose) + g_m = DonorsMatching(match_graph, verbose=verbose, bdict=bdict) # create patients graph and find all candidates start_build_graph = time.time() @@ -199,13 +201,20 @@ def find_matches( patients_results = {patient: None for patient in patients} with open(imputation_filename, "r") as f: line = f.readline().strip() - loci = list(dict.fromkeys([item for item in line.split(',')[1].replace('*', ',').replace('+', ',').replace('^', ',').replace(':', ',').split(',') if not item.isdigit() and item])) - + text = line.split(",")[1] + raw_loci = re.findall(r"\b([A-Za-z0-9]+)\*", text) + loci = [] + for locus in raw_loci: + if locus in ("DRB3", "DRB4", "DRB5", "DRBX"): + locus = "DRB3/4/5/X" + loci.append(locus) + loci = list(dict.fromkeys(loci)) if patients: avg_build_time = (end_build_graph - start_build_graph) / len(patients) else: avg_build_time = 0 + times = [] for patient in patients: # print("\n","Patient", patient, "Verbose", verbose) # For each patient we search matches in the donor graph. 
@@ -222,11 +232,14 @@ def find_matches(
             patient, g_m, donors_info, threshold, cutoff, classes, subclasses
         )
 
         match_Probability = results_df["Match_Probability"]
-        match_Between_Most_Commons = results_df["Match_Between_Most_Commons"]
+        # match_Between_Most_Commons = results_df["Match_Between_Most_Commons"]
         Permissive = results_df["Permissive/Non-Permissive"]
 
-        df_new = results_df.drop(columns=['Match_Probability', 'Match_Between_Most_Commons', 'Permissive/Non-Permissive']).copy()
+        # df_new = results_df.drop(columns=['Match_Probability', 'Match_Between_Most_Commons', 'Permissive/Non-Permissive']).copy()
+        df_new = results_df.drop(
+            columns=["Match_Probability", "Permissive/Non-Permissive"]
+        ).copy()
 
         for idx, row in enumerate(match_Probability):
             k = 0
@@ -234,19 +247,30 @@ def find_matches(
             for i in [1, 2]:
                 df_new.loc[idx, f"Match_Probability_{locus}_{i}"] = row[k]
                 k += 1
-        df_new['Permissive/Non-Permissive'] = Permissive
-        for idx, row in enumerate(match_Between_Most_Commons):
-            for l, locus in enumerate(loci):
-                df_new.loc[idx, f"Match_Between_Most_Commons_{locus}"] = row[l]
+        df_new["Permissive/Non-Permissive"] = Permissive
+        # for idx, row in enumerate(match_Between_Most_Commons):
+        #     for l, locus in enumerate(loci):
+        #         df_new.loc[idx, f"Match_Between_Most_Commons_{locus}"] = row[l]
         results_df = df_new.copy()
+
+        # Normalize mismatch probability columns
+        # for cols in [
+        #     ['chance_for_0_GvH_Mismatches', 'chance_for_1_GvH_Mismatches', 'chance_for_2_GvH_Mismatches', 'chance_for_3_GvH_Mismatches', 'chance_for_3+_GvH_Mismatches'],
+        #     ['chance_for_0_HvG_Mismatches', 'chance_for_1_HvG_Mismatches', 'chance_for_2_HvG_Mismatches', 'chance_for_3_HvG_Mismatches', 'chance_for_3+_HvG_Mismatches'],
+        #     ['chance_for_0_Mismatches', 'chance_for_1_Mismatches', 'chance_for_2_Mismatches', 'chance_for_3_Mismatches', 'chance_for_3+_Mismatches']
+        # ]:
+        #     prob_sum = results_df[cols].sum(axis=1)
+        #     for col in cols:
+        #         results_df[col] = results_df[col].div(prob_sum).fillna(0.0).round(2)
+
         end = time.time()
         patient_time = end - start + avg_build_time
-
+        times.append(end - start)
         if calculate_time:
             patients_results[patient] = (results_df, patient_time)
         else:
             patients_results[patient] = results_df
-
+        patient = -1 * patient
         if save_to_csv:
             # save results to csv
             results_df.to_csv(
@@ -254,13 +278,15 @@ def find_matches(
                     f"{output_dir}/search_{search_id}", f"Patient_{patient}.csv"
                 ),
                 index=True,
-                float_format="%.2f",
+                # float_format="%.2f",
             )
             if verbose:
                 print_time(
                     f"Saved Matching results for {patient} in "
                     f"{os.path.join(f'{output_dir}/search_{search_id}', f'Patient_{patient}.csv')}"
                 )
+    with open("times_grma.json", "w") as f:
+        json.dump(times, f, indent=4)
 
     return patients_results
 
diff --git a/grma/utilities/cutils.pyx b/grma/utilities/cutils.pyx
index 9072d3f..895bf06 100644
--- a/grma/utilities/cutils.pyx
+++ b/grma/utilities/cutils.pyx
@@ -9,8 +9,6 @@
 ctypedef np.int8_t INT8
 ctypedef np.uint16_t UINT16
 ctypedef np.uint32_t UINT32
 
-
-
 @cython.boundscheck(False)
 @cython.wraparound(False)
 cpdef np.ndarray[UINT32, ndim=2] cdrop_less_than_7_matches(np.ndarray[UINT32, ndim=1] ids,
@@ -28,11 +26,10 @@ cpdef np.ndarray[UINT32, ndim=2] cdrop_less_than_7_matches(np.ndarray[UINT32, nd
             count += 1
     return ret[:count, :]
 
-
 @cython.boundscheck(False)
 @cython.wraparound(False)
-cpdef np.ndarray[INT8, ndim=1] ccheck_similarity(np.ndarray[UINT16, ndim=1] patients_geno,
-                                                 np.ndarray[UINT16, ndim=2] donors_genos,
+cpdef np.ndarray[INT8, ndim=1] ccheck_similarity(np.ndarray[UINT32, ndim=1] patients_geno,  # Changed from UINT16
+                                                 np.ndarray[UINT32, ndim=2] donors_genos,  # Changed from UINT16
                                                  np.ndarray[UINT8, ndim=1] allele_range,
                                                  UINT8 init_count_similar):
     """
     Takes 2 genotypes to check similarity between \n
@@ -45,9 +42,9 @@ cpdef np.ndarray[INT8, ndim=1] ccheck_similarity(np.ndarray[UINT16, ndim=1] pati
     """
     cdef:
         np.ndarray[INT8, ndim=1] similarities
-        np.ndarray[UINT16, ndim=1] donors_geno
+        np.ndarray[UINT32, ndim=1] donors_geno  # Changed from UINT16
         UINT8 count_similar, counted, number_of_alleles, allele_num
-        UINT16 patient_alleles0, patient_alleles1, donor_alleles0, donor_alleles1
+        UINT32 patient_alleles0, patient_alleles1, donor_alleles0, donor_alleles1  # Changed from UINT16
 
     similarities = np.zeros(len(donors_genos), dtype=np.int8)
     number_of_alleles = len(allele_range)
@@ -90,10 +87,9 @@ cpdef np.ndarray[INT8, ndim=1] ccheck_similarity(np.ndarray[UINT16, ndim=1] pati
 
     return similarities
 
-
 @cython.boundscheck(False)
 @cython.wraparound(False)
-cpdef UINT32 chash(np.ndarray[UINT16, ndim=1] arr):
+cpdef UINT32 chash(np.ndarray[UINT32, ndim=1] arr):
     cdef UINT32 h = 17
     cdef UINT8 i
     for i in range(len(arr)):
diff --git a/grma/utilities/geno_representation.py b/grma/utilities/geno_representation.py
index 1773874..f1ec8d7 100644
--- a/grma/utilities/geno_representation.py
+++ b/grma/utilities/geno_representation.py
@@ -27,7 +27,7 @@ class HashableArray:
 
     def __init__(self, arr):
         self.it = None
-        self.arr = np.array(arr, dtype=np.uint16)
+        self.arr = np.array(arr, dtype=np.uint32)
 
     def __hash__(self):
         return chash(self.arr)
diff --git a/grma/utilities/utils.py b/grma/utilities/utils.py
index 3f52a1b..5de97d2 100644
--- a/grma/utilities/utils.py
+++ b/grma/utilities/utils.py
@@ -3,9 +3,16 @@
 from datetime import datetime
 from typing import Iterable
 
+import numpy as np
+
 from grma.utilities.cutils import cdrop_less_than_7_matches, ccheck_similarity
 from collections.abc import Sequence
 
+import re
+import ast
+from typing import List
+import hashlib
+from grma.utilities.geno_representation import HashableArray
 
 LENGTH_OF_NUMBERS_IN_ALLELE: int = 4
 TO_SEROLOGY: dict[str, int] = {
@@ -225,9 +232,8 @@ def allele_style(
 
 def tuple_geno_to_int(tuple_geno: Iterable[int]):
     """convert genotype from Iterable to integer"""
     string = ""
-    fill: int = 4
     for i in tuple_geno:
-        string += str(i).zfill(fill)
+        string += str(i)
     return int(string)
 
@@ -243,7 +249,10 @@ def drop_less_than_7_matches(ids, similarities):
 
 def check_similarity(patients_geno, donors_genos, allele_range, init_count_similar):
     return ccheck_similarity(
-        patients_geno, donors_genos, allele_range, init_count_similar
+        np.fromiter(patients_geno, dtype=np.uint32),
+        donors_genos,
+        allele_range,
+        init_count_similar,
     )
 
@@ -255,3 +264,25 @@ def gl_string_to_integers(genotype: str) -> Sequence[int]:
     ]
 
     return genotype
+
+
+def gl_string_to_hash(s: str) -> int:
+    return int.from_bytes(hashlib.blake2b(s.encode(), digest_size=8).digest(), "big")
+
+
+def geno_to_int(geno_str):
+    geno = tuple([a for block in geno_str.split("^") for a in block.split("+")])
+    allele_list = []
+    for item in geno:
+        locus, allele = item.split("*")
+        a1, a2 = allele.split(":")
+        if locus == "DRB3":
+            a1 = "3" + a1
+        elif locus == "DRB4":
+            a1 = "4" + a1
+        elif locus == "DRB5":
+            a1 = "5" + a1
+        else:
+            a1 = "0" + a1
+        allele_list.append(int("1" + a1 + a2))
+    return HashableArray(allele_list)
diff --git a/pyproject.toml b/pyproject.toml
index 440f358..8403ba1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,5 +1,5 @@
 [build-system]
"Cython>=0.29.32", "numpy>=1.20.2"] +requires = ["setuptools>=42", "wheel", "Cython>=0.29.32", "numpy==2.3.4"] build-backend = "setuptools.build_meta" [project] diff --git a/requirements.txt b/requirements.txt index 34542f6..701176f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,8 @@ toml==0.10.2 -py-graph-imputation==0.1.1 +py-graph-imputation>=0.0.12 py-ard==1.5.5 tqdm==4.66.1 +bidict +networkx +numpy +pandas diff --git a/setup.py b/setup.py index d33e6fa..343429d 100644 --- a/setup.py +++ b/setup.py @@ -87,12 +87,18 @@ def build_include_dirs(): "grma.utilities.cutils", ["grma/utilities/cutils.pyx"], define_macros=[("NPY_NO_DEPRECATED_API", "NPY_1_7_API_VERSION")], + include_dirs=[numpy.get_include()], ), Extension( "grma.match.lol_graph", ["grma/match/lol_graph.pyx"], define_macros=[("NPY_NO_DEPRECATED_API", "NPY_1_7_API_VERSION")], ), + Extension( + "grma.match.hla_match", + ["grma/match/hla_match.pyx"], + include_dirs=[numpy.get_include()], + ), ] ), )