diff --git a/machine_learning_hep/analysis/analyzerdhadrons.py b/machine_learning_hep/analysis/analyzerdhadrons.py index 24672b0303..0d5b35dfeb 100644 --- a/machine_learning_hep/analysis/analyzerdhadrons.py +++ b/machine_learning_hep/analysis/analyzerdhadrons.py @@ -55,10 +55,13 @@ from machine_learning_hep.utils.hist import get_dim, project_hist # pylint: disable=too-few-public-methods, too-many-instance-attributes, too-many-statements, fixme -# pylint: disable=consider-using-enumerate fixme +# pylint: disable=consider-using-enumerate, missing-function-docstring class AnalyzerDhadrons(Analyzer): # pylint: disable=invalid-name + """ + An analyzer for D and L hadrons. + """ species = "analyzer" def __init__(self, datap, case, typean, period): @@ -146,6 +149,8 @@ def __init__(self, datap, case, typean, period): self.root_objects = [] + self.do_ptshape = datap.get("do_ptshape", False) + # Fitting self.p_performval = datap["analysis"].get("event_cand_validation", None) @@ -418,60 +423,43 @@ def efficiency(self): print(self.n_fileff) lfileeff = TFile.Open(self.n_fileff) lfileeff.ls() - fileouteff = TFile.Open(f"{self.d_resultsallpmc}/{self.efficiency_filename}{self.case}{self.typean}.root", "recreate") - cEff = TCanvas("cEff", "The Fit Canvas") - cEff.SetCanvasSize(1900, 1500) - cEff.SetWindowSize(500, 500) - - legeff = TLegend(0.5, 0.65, 0.7, 0.85) - legeff.SetBorderSize(0) - legeff.SetFillColor(0) - legeff.SetFillStyle(0) - legeff.SetTextFont(42) - legeff.SetTextSize(0.035) - - h_gen_pr = lfileeff.Get("h_gen_pr") - h_sel_pr = lfileeff.Get("h_sel_pr") - h_sel_pr.Divide(h_sel_pr, h_gen_pr, 1.0, 1.0, "B") - h_sel_pr.Draw("same") - fileouteff.cd() - h_sel_pr.SetName("eff") - h_sel_pr.Write() - h_sel_pr.GetXaxis().SetTitle("#it{p}_{T} (GeV/#it{c})") - h_sel_pr.GetYaxis().SetTitle(f"Acc x efficiency (prompt) {self.p_latexnhadron} {self.typean} (1/GeV)") - h_sel_pr.SetMinimum(0.001) - h_sel_pr.SetMaximum(1.0) - gPad.SetLogy() - 
cEff.SaveAs(f"{self.d_resultsallpmc}/Eff{self.case}{self.typean}.eps") - - cEffFD = TCanvas("cEffFD", "The Fit Canvas") - cEffFD.SetCanvasSize(1900, 1500) - cEffFD.SetWindowSize(500, 500) - - legeffFD = TLegend(0.5, 0.65, 0.7, 0.85) - legeffFD.SetBorderSize(0) - legeffFD.SetFillColor(0) - legeffFD.SetFillStyle(0) - legeffFD.SetTextFont(42) - legeffFD.SetTextSize(0.035) - - h_gen_fd = lfileeff.Get("h_gen_fd") - h_sel_fd = lfileeff.Get("h_sel_fd") - h_sel_fd.Divide(h_sel_fd, h_gen_fd, 1.0, 1.0, "B") - h_sel_fd.Draw("same") - fileouteff.cd() - h_sel_fd.SetName("eff_fd") - h_sel_fd.Write() - h_sel_fd.GetXaxis().SetTitle("#it{p}_{T} (GeV/#it{c})") - h_sel_fd.GetYaxis().SetTitle(f"Acc x efficiency feed-down {self.p_latexnhadron} {self.typean} (1/GeV)") - h_sel_fd.SetMinimum(0.001) - h_sel_fd.SetMaximum(1.0) - gPad.SetLogy() - legeffFD.Draw() - cEffFD.SaveAs(f"{self.d_resultsallpmc}/EffFD{self.case}{self.typean}.eps") + fileouteff = TFile.Open(f"{self.d_resultsallpmc}/{self.efficiency_filename}{self.case}{self.typean}.root", + "recreate") + + def do_eff(gen_hist, sel_hist, histname, outname, eff_case): + cEff = TCanvas(f"c{outname}", "The Fit Canvas") + cEff.SetCanvasSize(1900, 1500) + cEff.SetWindowSize(500, 500) + + legeff = TLegend(0.5, 0.65, 0.7, 0.85) + legeff.SetBorderSize(0) + legeff.SetFillColor(0) + legeff.SetFillStyle(0) + legeff.SetTextFont(42) + legeff.SetTextSize(0.035) + + h_gen = lfileeff.Get(gen_hist) + h_sel = lfileeff.Get(sel_hist) + h_sel.Divide(h_sel, h_gen, 1.0, 1.0, "B") + h_sel.Draw("same") + fileouteff.cd() + h_sel.SetName(histname) + h_sel.Write() + h_sel.GetXaxis().SetTitle("#it{p}_{T} (GeV/#it{c})") + h_sel.GetYaxis().SetTitle(f"Acc x efficiency ({eff_case}) {self.p_latexnhadron} {self.typean} (1/GeV)") + h_sel.SetMinimum(0.001) + h_sel.SetMaximum(1.0) + gPad.SetLogy() + legeff.Draw() + cEff.SaveAs(f"{self.d_resultsallpmc}/{outname}{self.case}{self.typean}.eps") + + do_eff("h_gen_pr", "h_sel_pr", "eff", "Eff", "prompt") + do_eff("h_gen_fd", 
"h_sel_fd", "eff_fd", "EffFD", "feed-down") + if self.do_ptshape: + do_eff("h_gen_fd_ptshape", "h_sel_fd_ptshape", "eff_fd_ptshape", "EffFDPtShape", "feed-down") @staticmethod - def calculate_norm(logger, hevents, hselevents): # TO BE FIXED WITH EV SEL + def calculate_norm(logger, hevents, hselevents): # TO BE FIXED WITH EV SEL if not hevents: # pylint: disable=undefined-variable logger.error("Missing hevents") @@ -498,11 +486,13 @@ def makenormyields(self): # pylint: disable=import-outside-toplevel, too-many-b if not os.path.exists(fileouteff): self.logger.fatal("Efficiency file %s could not be found", fileouteff) - fileoutcross = f"{self.d_resultsallpdata}/finalcross{self.case}{self.typean}.root" + ptshape = "_ptshape" if self.do_ptshape else "" + fileoutcross = f"{self.d_resultsallpdata}/finalcross{self.case}{self.typean}{ptshape}.root" namehistoeffprompt = "eff" - namehistoefffeed = "eff_fd" + namehistoefffeed = f"eff_fd{ptshape}" nameyield = "hyields0" + self.logger.info("Using efficiency histos %s, %s", namehistoeffprompt, namehistoefffeed) histonorm = TH1F("histonorm", "histonorm", 1, 0, 1) @@ -549,7 +539,8 @@ def makenormyields(self): # pylint: disable=import-outside-toplevel, too-many-b fileoutcross, ) - fileoutcrosstot = TFile.Open(f"{self.d_resultsallpdata}/finalcross{self.case}{self.typean}tot.root", "recreate") + fileoutcrosstot = TFile.Open(f"{self.d_resultsallpdata}/finalcross{self.case}{self.typean}tot{ptshape}.root", + "recreate") f_fileoutcross = TFile.Open(fileoutcross) if f_fileoutcross: diff --git a/machine_learning_hep/processer.py b/machine_learning_hep/processer.py index b2766d3cbd..678e83fe33 100644 --- a/machine_learning_hep/processer.py +++ b/machine_learning_hep/processer.py @@ -42,6 +42,7 @@ mergerootfiles, openfile, read_df, + reweight, seldf_singlevar, write_df, ) @@ -49,8 +50,13 @@ pd.options.mode.chained_assignment = None +# pylint: disable=missing-function-docstring + class Processer: # pylint: disable=too-many-instance-attributes 
+ """ + The main class for data processing, machine learning, and analysis. + """ # Class Attribute species = "processer" logger = get_logger() @@ -149,6 +155,7 @@ def __init__( self.n_fileeff = datap["files_names"]["efffilename"] self.n_fileresp = datap["files_names"]["respfilename"] self.n_mcreweights = datap["files_names"]["namefile_mcweights"] + self.n_weights = datap["files_names"]["histoweights"] # selections self.s_reco_skim = datap["sel_reco_skim"] @@ -172,6 +179,8 @@ def __init__( self.v_ismcbkg = datap["bitmap_sel"]["var_ismcbkg"] # used in hadrons self.v_ismcrefl = datap["bitmap_sel"]["var_ismcrefl"] # used in hadrons self.v_var_binning = datap["var_binning"] + self.do_ptshape = datap.get("do_ptshape", False) + self.v_var_binning_ptshape = datap.get("var_binning_ptshape", None) self.v_invmass = datap["variables"].get("var_inv_mass", "inv_mass") # self.v_rapy = datap["variables"].get("var_y", "y_cand") @@ -202,6 +211,11 @@ def __init__( self.l_gen_sl = createlist(self.d_pkl, self.l_path, self.n_gen_sl) self.f_totevt = os.path.join(self.d_pkl, self.n_evt) self.f_totevtorig = os.path.join(self.d_pkl, self.n_evtorig) + self.f_weights = os.path.join(self.d_mcreweights, self.n_mcreweights) + + if self.do_ptshape: + with uproot.open(self.f_weights) as fin: + self.v_hist_weights = fin[self.n_weights].to_numpy() self.p_modelname = datap["mlapplication"]["modelname"] # Analysis pT bins @@ -293,7 +307,7 @@ def __init__( ) self.lpt_recodec = None - if self.doml is True: + if self.doml: if self.mltype == "MultiClassification": self.lpt_recodec = [ self.n_reco.replace( @@ -347,6 +361,22 @@ def __init__( # self.triggerbit = datap["analysis"][self.typean]["triggerbit"] self.runlistrigger = runlisttrigger + if self.do_ptshape and self.mcordata == "mc": + lpt_recosk_ptshape = [None] * self.p_nptbins + lpt_gensk_ptshape = [None] * self.p_nptbins + lpt_recodec_ptshape = [None] * self.p_nptbins + + for ipt in range(self.p_nptbins): + lpt_recosk_ptshape[ipt] = 
self.v_var_binning_ptshape + self.lpt_recosk[ipt] + lpt_gensk_ptshape[ipt] = self.v_var_binning_ptshape + self.lpt_gensk[ipt] + lpt_recodec_ptshape[ipt] = self.v_var_binning_ptshape + self.lpt_recodec[ipt] + self.mptfiles_recosk_ptshape = [createlist(d_pklsk, self.l_path, \ + lpt_recosk_ptshape[ipt]) for ipt in range(self.p_nptbins)] + self.mptfiles_gensk_ptshape = [createlist(d_pklsk, self.l_path, \ + lpt_gensk_ptshape[ipt]) for ipt in range(self.p_nptbins)] + self.mptfiles_recoskmldec_ptshape = [createlist(self.d_pkl_dec, self.l_path, \ + lpt_recodec_ptshape[ipt]) for ipt in range(self.p_nptbins)] + # if os.path.exists(self.d_root) is False: # self.logger.warning("ROOT tree folder is not there. Is it intentional?") @@ -387,7 +417,7 @@ def dfread(rdir, trees, cols, idx_name=None): if idx_name: # df.rename_axis(idx_name, inplace=True) df[idx_name] = df.index - df.set_index(["df", idx_name], inplace=True) + df = df.set_index(["df", idx_name]) return df except Exception as e: self.logger.exception("Failed to read data from trees: %s", str(e)) @@ -474,7 +504,8 @@ def dfuse(df_spec): dfs[df_name][tag] = (var == value["req"]).astype(int) # dfs[df_name][tag] = np.array( - # tag_bit_df(dfs[df_name], value["var"], value["req"], value.get("abs", False)), dtype=int) + # tag_bit_df(dfs[df_name], value["var"], value["req"], value.get("abs", False)), + # dtype=int) if "swap" in df_spec: self.logger.debug(" %s -> swap", df_name) @@ -531,21 +562,25 @@ def dfuse(df_spec): path = os.path.join(self.d_pkl, self.l_path[file_index], df_spec["file"]) write_df(dfo, path) - def skim(self, file_index): - dfreco = read_df(self.l_reco[file_index]) - dfgen = read_df(self.l_gen[file_index]) if self.mcordata == "mc" else None - dfgen_sl = read_df(self.l_gen_sl[file_index]) if self.n_gen_sl and self.mcordata == "mc" else None - + def do_skim(self, dfreco, dfgen, var_binning, filenames_reco, filenames_gen, file_index): for ipt in range(self.p_nptbins): - dfrecosk = seldf_singlevar(dfreco, 
self.v_var_binning, self.lpt_anbinmin[ipt], self.lpt_anbinmax[ipt]) + dfrecosk = seldf_singlevar(dfreco, var_binning, + self.lpt_anbinmin[ipt], self.lpt_anbinmax[ipt]) dfrecosk = dfquery(dfrecosk, self.s_reco_skim[ipt]) - write_df(dfrecosk, self.mptfiles_recosk[ipt][file_index]) + write_df(dfrecosk, filenames_reco[ipt][file_index]) if dfgen is not None: - dfgensk = seldf_singlevar(dfgen, self.v_var_binning, self.lpt_anbinmin[ipt], self.lpt_anbinmax[ipt]) + dfgensk = seldf_singlevar(dfgen, var_binning, + self.lpt_anbinmin[ipt], self.lpt_anbinmax[ipt]) dfgensk = dfquery(dfgensk, self.s_gen_skim[ipt]) - write_df(dfgensk, self.mptfiles_gensk[ipt][file_index]) + write_df(dfgensk, filenames_gen[ipt][file_index]) + def skim(self, file_index): + dfreco = read_df(self.l_reco[file_index]) + dfgen = read_df(self.l_gen[file_index]) if self.mcordata == "mc" else None + dfgen_sl = read_df(self.l_gen_sl[file_index]) if self.n_gen_sl and self.mcordata == "mc" else None + + for ipt in range(self.p_nptbins): if dfgen_sl is not None: dfgensk_sl = seldf_singlevar( dfgen_sl, self.v_var_binning, self.lpt_anbinmin[ipt], self.lpt_anbinmax[ipt] @@ -553,12 +588,20 @@ def skim(self, file_index): dfgensk_sl = dfquery(dfgensk_sl, self.s_gen_skim[ipt]) write_df(dfgensk_sl, self.mptfiles_gensk_sl[ipt][file_index]) + self.do_skim(dfreco, dfgen, self.v_var_binning, self.mptfiles_recosk, self.mptfiles_gensk, file_index) + + if self.do_ptshape and self.mcordata == 'mc': + reweight(self.v_hist_weights, dfreco, self.v_var_binning, self.v_var_binning_ptshape) + reweight(self.v_hist_weights, dfgen, self.v_var_binning, self.v_var_binning_ptshape) + + self.do_skim(dfreco, dfgen, self.v_var_binning_ptshape, self.mptfiles_recosk_ptshape,\ + self.mptfiles_gensk_ptshape, file_index) + + + # pylint: disable=too-many-branches def applymodel(self, file_index): - for ipt in range(self.p_nptbins): - if os.path.exists(self.mptfiles_recoskmldec[ipt][file_index]): - if 
os.stat(self.mptfiles_recoskmldec[ipt][file_index]).st_size != 0: - continue - dfrecosk = read_df(self.mptfiles_recosk[ipt][file_index]) + def do_apply_model(in_filename, out_filename, ipt): + dfrecosk = read_df(in_filename) if self.p_mask_values: mask_df(dfrecosk, self.p_mask_values) if self.doml is True: @@ -584,7 +627,21 @@ def applymodel(self, file_index): dfrecoskml = dfrecoskml.loc[dfrecoskml[probvar] > self.lpt_probcutpre[ipt]] else: dfrecoskml = dfrecosk.query("isstd == 1") - write_df(dfrecoskml, self.mptfiles_recoskmldec[ipt][file_index]) + write_df(dfrecoskml, out_filename) + + for ipt in range(self.p_nptbins): + if os.path.exists(self.mptfiles_recoskmldec[ipt][file_index]): + if os.stat(self.mptfiles_recoskmldec[ipt][file_index]).st_size != 0: + continue + + do_apply_model(self.mptfiles_recosk[ipt][file_index], + self.mptfiles_recoskmldec[ipt][file_index], + ipt) + + if self.do_ptshape and self.mcordata == 'mc': + do_apply_model(self.mptfiles_recosk_ptshape[ipt][file_index], + self.mptfiles_recoskmldec_ptshape[ipt][file_index], + ipt) @staticmethod def callback(ex): @@ -717,8 +774,8 @@ def apply_cut_for_ipt(df_full, ipt: int): def process_histomass(self): self.logger.debug("Doing masshisto %s %s", self.mcordata, self.period) - self.logger.debug("Using run selection for mass histo %s %s %s", self.runlistrigger, "for period", self.period) - if self.doml is True: + self.logger.debug("Using run selection for mass histo %s for period %s", self.runlistrigger, self.period) + if self.doml: self.logger.debug("Doing ml analysis") elif self.do_custom_analysis_cuts: self.logger.debug("Using custom cuts") @@ -736,8 +793,8 @@ def process_histomass(self): def process_efficiency(self): print("Doing efficiencies", self.mcordata, self.period) - print("Using run selection for eff histo", self.runlistrigger, "for period", self.period) - if self.doml is True: + print(f"Using run selection for eff histo {self.runlistrigger} for period {self.period}") + if self.doml:
print("Doing ml analysis") elif self.do_custom_analysis_cuts: print("Using custom cuts") diff --git a/machine_learning_hep/processerdhadrons.py b/machine_learning_hep/processerdhadrons.py index a46e90ee37..ce03c2f472 100755 --- a/machine_learning_hep/processerdhadrons.py +++ b/machine_learning_hep/processerdhadrons.py @@ -31,6 +31,9 @@ class ProcesserDhadrons(Processer): # pylint: disable=too-many-instance-attributes + """ + The class for specializations of data processing, machine learning and analysis for D and L hadrons. + """ # Class Attribute species = "processer" @@ -89,8 +92,8 @@ def __init__( self.p_mass_fit_lim = datap["analysis"][self.typean]["mass_fit_lim"] self.p_bin_width = datap["analysis"][self.typean]["bin_width"] limits_mass = datap["analysis"][self.typean]["mass_fit_lim"] - nbins_mass = int(round((limits_mass[1] - limits_mass[0]) / self.p_bin_width)) - self.p_num_bins = int(round((self.p_mass_fit_lim[1] - self.p_mass_fit_lim[0]) / self.p_bin_width)) + nbins_mass = round((limits_mass[1] - limits_mass[0]) / self.p_bin_width) + self.p_num_bins = round((self.p_mass_fit_lim[1] - self.p_mass_fit_lim[0]) / self.p_bin_width) self.s_presel_gen_eff = datap["analysis"][self.typean]["presel_gen_eff"] self.lpt_finbinmin = datap["analysis"][self.typean]["sel_an_binmin"] @@ -104,13 +107,13 @@ def __init__( # pylint: disable=too-many-branches def process_histomass_single(self, index): + """ + Generate invariant mass histograms for MC or real data. 
+ """ myfile = TFile.Open(self.l_histomass[index], "recreate") dfevtorig = read_df(self.l_evtorig[index]) neventsorig = len(dfevtorig) - if self.s_evtsel is not None: - dfevtevtsel = dfevtorig.query(self.s_evtsel) - else: - dfevtevtsel = dfevtorig + dfevtevtsel = dfevtorig.query(self.s_evtsel) if self.s_evtsel is not None else dfevtorig neventsafterevtsel = len(dfevtevtsel) # validation plot for event selection @@ -138,8 +141,8 @@ def process_histomass_single(self, index): if self.s_evtsel is not None: df = df.query(self.s_evtsel) - if self.doml is True: - df = df.query(self.l_selml[bin_id]) + if self.doml: + df = df.query(self.l_selml[ipt]) df = seldf_singlevar(df, self.v_var_binning, self.lpt_finbinmin[ipt], self.lpt_finbinmax[ipt]) if self.do_custom_analysis_cuts: @@ -200,77 +203,69 @@ def process_histomass_single(self, index): # pylint: disable=line-too-long def process_efficiency_single(self, index): - # TO UPDATE TO DHADRON_MULT VERSION + """ + Generate efficiency histograms for MC or real data. 
+ """ + # TODO: Unify this and the dhadrons_mult version out_file = TFile.Open(self.l_histoeff[index], "recreate") n_bins = len(self.lpt_finbinmin) analysis_bin_lims_temp = self.lpt_finbinmin.copy() analysis_bin_lims_temp.append(self.lpt_finbinmax[n_bins - 1]) analysis_bin_lims = array.array("f", analysis_bin_lims_temp) + h_gen_pr = TH1F("h_gen_pr", "Prompt Generated in acceptance |y|<0.5", n_bins, analysis_bin_lims) h_presel_pr = TH1F("h_presel_pr", "Prompt Reco in acc |#eta|<0.8 and sel", n_bins, analysis_bin_lims) h_sel_pr = TH1F("h_sel_pr", "Prompt Reco and sel in acc |#eta|<0.8 and sel", n_bins, analysis_bin_lims) h_gen_fd = TH1F("h_gen_fd", "FD Generated in acceptance |y|<0.5", n_bins, analysis_bin_lims) h_presel_fd = TH1F("h_presel_fd", "FD Reco in acc |#eta|<0.8 and sel", n_bins, analysis_bin_lims) h_sel_fd = TH1F("h_sel_fd", "FD Reco and sel in acc |#eta|<0.8 and sel", n_bins, analysis_bin_lims) - - bincounter = 0 - for ipt in range(self.p_nptfinbins): - bin_id = self.bin_matching[ipt] - df_mc_reco = read_df(self.mptfiles_recoskmldec[bin_id][index]) + if self.do_ptshape: # pylint: disable=no-member + h_gen_fd_ptshape = TH1F("h_gen_fd_ptshape", "FD Generated in acceptance |y|<0.5", \ + n_bins, analysis_bin_lims) + h_presel_fd_ptshape = TH1F("h_presel_fd_ptshape", "FD Reco in acc |#eta|<0.8 and sel", \ + n_bins, analysis_bin_lims) + h_sel_fd_ptshape = TH1F("h_sel_fd_ptshape", "FD Reco and sel in acc |#eta|<0.8 and sel", \ + n_bins, analysis_bin_lims) + + def get_eff_dfs(ipt, bin_id, gen_files, reco_files, var_binning): + df_mc_reco = read_df(reco_files[bin_id][index]) if self.s_evtsel is not None: df_mc_reco = df_mc_reco.query(self.s_evtsel) - df_mc_gen = read_df(self.mptfiles_gensk[bin_id][index]) + df_mc_gen = read_df(gen_files[bin_id][index]) df_mc_gen = df_mc_gen.query(self.s_presel_gen_eff) df_mc_reco = seldf_singlevar( - df_mc_reco, self.v_var_binning, self.lpt_finbinmin[ipt], self.lpt_finbinmax[ipt] + df_mc_reco, var_binning, self.lpt_finbinmin[ipt], 
self.lpt_finbinmax[ipt] ) - df_mc_gen = seldf_singlevar(df_mc_gen, self.v_var_binning, self.lpt_finbinmin[ipt], self.lpt_finbinmax[ipt]) - df_gen_sel_pr = df_mc_gen.loc[(df_mc_gen.ismcprompt == 1) & (df_mc_gen.ismcsignal == 1)] - df_reco_presel_pr = df_mc_reco.loc[(df_mc_reco.ismcprompt == 1) & (df_mc_reco.ismcsignal == 1)] - df_reco_sel_pr = None - if self.doml is True: - df_reco_sel_pr = df_reco_presel_pr.query(self.l_selml[bin_id]) - else: - df_reco_sel_pr = df_reco_presel_pr.copy() - df_gen_sel_fd = df_mc_gen.loc[(df_mc_gen.ismcfd == 1) & (df_mc_gen.ismcsignal == 1)] - df_reco_presel_fd = df_mc_reco.loc[(df_mc_reco.ismcfd == 1) & (df_mc_reco.ismcsignal == 1)] - df_reco_sel_fd = None - if self.doml is True: - df_reco_sel_fd = df_reco_presel_fd.query(self.l_selml[bin_id]) - else: - df_reco_sel_fd = df_reco_presel_fd.copy() - + df_mc_gen = seldf_singlevar(df_mc_gen, var_binning, self.lpt_finbinmin[ipt], self.lpt_finbinmax[ipt]) + return df_mc_gen, df_mc_reco + + def do_eff_single(ipt, bin_id, df_mc_gen, df_mc_reco, sel_column, bincounter, hists): + df_gen_sel = df_mc_gen.loc[(df_mc_gen[sel_column] == 1) & (df_mc_gen.ismcsignal == 1)] + df_reco_presel = df_mc_reco.loc[(df_mc_reco[sel_column] == 1) & (df_mc_reco.ismcsignal == 1)] + df_reco_sel = None + df_reco_sel = df_reco_presel.query(self.l_selml[bin_id]) if self.doml is True else df_reco_presel.copy() if self.do_custom_analysis_cuts: - df_reco_sel_pr = self.apply_cuts_ptbin(df_reco_sel_pr, ipt) - df_reco_sel_fd = self.apply_cuts_ptbin(df_reco_sel_fd, ipt) - - val = len(df_gen_sel_pr) - err = math.sqrt(val) - h_gen_pr.SetBinContent(bincounter + 1, val) - h_gen_pr.SetBinError(bincounter + 1, err) - val = len(df_reco_presel_pr) - err = math.sqrt(val) - h_presel_pr.SetBinContent(bincounter + 1, val) - h_presel_pr.SetBinError(bincounter + 1, err) - val = len(df_reco_sel_pr) - err = math.sqrt(val) - h_sel_pr.SetBinContent(bincounter + 1, val) - h_sel_pr.SetBinError(bincounter + 1, err) - - val = len(df_gen_sel_fd) - 
err = math.sqrt(val) -        h_gen_fd.SetBinContent(bincounter + 1, val) -        h_gen_fd.SetBinError(bincounter + 1, err) -        val = len(df_reco_presel_fd) -        err = math.sqrt(val) -        h_presel_fd.SetBinContent(bincounter + 1, val) -        h_presel_fd.SetBinError(bincounter + 1, err) -        val = len(df_reco_sel_fd) -        err = math.sqrt(val) -        h_sel_fd.SetBinContent(bincounter + 1, val) -        h_sel_fd.SetBinError(bincounter + 1, err) +            df_reco_sel = self.apply_cuts_ptbin(df_reco_sel, ipt) + +            for df, hist in zip((df_gen_sel, df_reco_presel, df_reco_sel), hists, strict=False): +                val = len(df) +                err = math.sqrt(val) +                hist.SetBinContent(ipt + 1, val) +                hist.SetBinError(ipt + 1, err) bincounter = bincounter + 1 + bincounter_pr = bincounter_fd = bincounter_ptshape = 0 + for ipt in range(self.p_nptfinbins): + bin_id = self.bin_matching[ipt] + df_mc_gen, df_mc_reco = get_eff_dfs(ipt, bin_id, self.mptfiles_gensk, self.mptfiles_recoskmldec, self.v_var_binning) + + do_eff_single(ipt, bin_id, df_mc_gen, df_mc_reco, "ismcprompt", bincounter_pr, (h_gen_pr, h_presel_pr, h_sel_pr)) + do_eff_single(ipt, bin_id, df_mc_gen, df_mc_reco, "ismcfd", bincounter_fd, (h_gen_fd, h_presel_fd, h_sel_fd)) + + if self.do_ptshape: # pylint: disable=no-member + df_mc_gen_ptshape, df_mc_reco_ptshape = get_eff_dfs(ipt, bin_id, self.mptfiles_gensk_ptshape, self.mptfiles_recoskmldec_ptshape, self.v_var_binning_ptshape) + do_eff_single(ipt, bin_id, df_mc_gen_ptshape, df_mc_reco_ptshape, "ismcfd", bincounter_ptshape, (h_gen_fd_ptshape, h_presel_fd_ptshape, h_sel_fd_ptshape)) + out_file.cd() h_gen_pr.Write() h_presel_pr.Write() @@ -278,3 +273,8 @@ def process_efficiency_single(self, index): h_gen_fd.Write() h_presel_fd.Write() h_sel_fd.Write() + + if self.do_ptshape: # pylint: disable=no-member + h_gen_fd_ptshape.Write() + h_presel_fd_ptshape.Write() + h_sel_fd_ptshape.Write() diff --git a/machine_learning_hep/utilities.py b/machine_learning_hep/utilities.py index f70a02b26d..e6c720c730 100644 --- 
a/machine_learning_hep/utilities.py +++ b/machine_learning_hep/utilities.py @@ -74,7 +74,7 @@ from machine_learning_hep.logger import get_logger from machine_learning_hep.selectionutils import select_runs -# pylint: disable=too-many-lines +# pylint: disable=too-many-lines, missing-function-docstring logger = get_logger() @@ -128,10 +128,7 @@ def write_df(dfo, path): def read_df(path, **kwargs): try: - if path.endswith(".parquet"): - df = pd.read_parquet(path, **kwargs) - else: - df = pickle.load(openfile(path, "rb")) + df = pd.read_parquet(path, **kwargs) if path.endswith(".parquet") else pickle.load(openfile(path, "rb")) except Exception as e: # pylint: disable=broad-except logger.critical("failed to open file <%s>: %s", path, str(e)) sys.exit() @@ -160,7 +157,7 @@ def conv_none(value): mask_by_value_dict[mc["column"]] = {mc["find_by_value"]: conv_none(mc["mask_with"])} # Mask all we can do by value - df_to_mask.replace(mask_by_value_dict, inplace=True) + df_to_mask = df_to_mask.replace(mask_by_value_dict) # Now mask all which are searched by query for mc in mask_by_query_list: @@ -172,7 +169,7 @@ def conv_none(value): df_to_mask.loc[mask_indices, [mc["column"]]] = conv_none(mc["mask_with"]) -def dfquery(df, selection, **kwargs): # pylint: disable=invalid-name +def dfquery(df, selection, **kwargs): # pylint: disable=invalid-name return df.query(selection, **kwargs) if selection is not None else df @@ -187,6 +184,20 @@ def selectdfrunlist(dfr, runlist, runvar): dfr = dfr[issel] return dfr +def reweight(hist_weights, df, var, weighted_var): # pylint: disable=invalid-name + weights = hist_weights[0] + bin_edges = hist_weights[1] + + # Extract pT values from DataFrame + var = df[var] + + # Determine bin indices for each pT value + bin_index = np.digitize(var, bin_edges) - 1 + + # Ensure bin indices are within the valid range + bin_index = np.clip(bin_index, 0, len(weights) - 1) + df['weights'] = weights[bin_index] + df[weighted_var] = df['weights'] * var def 
count_df_length_pkl(*pkls): """ @@ -294,12 +305,12 @@ def make_latex_table(column_names, row_names, rows, caption=None, save_path="./t columns = "|".join(["c"] * (len(column_names) + 1)) f.write("\\begin{tabular}{" + columns + "}\n") f.write("\\hline\n") - columns = "&".join([""] + column_names) + columns = "&".join(["", *column_names]) columns = columns.replace("_", "\\_") f.write(columns + "\\\\\n") f.write("\\hline\\hline\n") - for rn, row in zip(row_names, rows): - row_string = "&".join([rn] + row) + for rn, row in zip(row_names, rows, strict=False): + row_string = "&".join([rn, *row]) row_string = row_string.replace("_", "\\_") f.write(row_string + "\\\\\n") f.write("\\end{tabular}\n") @@ -397,10 +408,7 @@ def equal_axis_list(axis1, list2, precision=10): bins = get_bins(axis1) if len(bins) != len(list2): return False - for i, j in zip(bins, list2): - if round(i, precision) != round(j, precision): - return False - return True + return all(round(i, precision) == round(j, precision) for i, j in zip(bins, list2, strict=False)) def equal_binning(his1, his2): @@ -418,9 +426,7 @@ def equal_binning_lists(his, list_x=None, list_y=None, list_z=None): return False if list_y is not None and not equal_axis_list(his.GetYaxis(), list_y): return False - if list_z is not None and not equal_axis_list(his.GetZaxis(), list_z): - return False - return True + return not (list_z is not None and not equal_axis_list(his.GetZaxis(), list_z)) def folding(h_input, response_matrix, h_output): @@ -929,13 +935,13 @@ def plot_latex(latex): # set canvas margins if isinstance(margins_c, list) and len(margins_c) > 0: for setter, value in zip( - [can.SetBottomMargin, can.SetLeftMargin, can.SetTopMargin, can.SetRightMargin], margins_c + [can.SetBottomMargin, can.SetLeftMargin, can.SetTopMargin, can.SetRightMargin], margins_c, strict=False ): setter(value) # set logarithmic scale for selected axes log_y = False if isinstance(logscale, str) and len(logscale) > 0: - for setter, axis in 
zip([can.SetLogx, can.SetLogy, can.SetLogz], ["x", "y", "z"]): + for setter, axis in zip([can.SetLogx, can.SetLogy, can.SetLogz], ["x", "y", "z"], strict=False): if axis in logscale: setter() if axis == "y":