From 99d1907b266cdc1b49e228f32931a58fd18677ab Mon Sep 17 00:00:00 2001 From: saganatt <8majak8@gmail.com> Date: Mon, 28 Oct 2024 12:17:53 +0100 Subject: [PATCH 01/14] Add pt shape reweighting to the hadron workflow --- .../analysis/analyzerdhadrons.py | 29 +++++++++++++++++ machine_learning_hep/processer.py | 31 ++++++++++++++++--- 2 files changed, 56 insertions(+), 4 deletions(-) diff --git a/machine_learning_hep/analysis/analyzerdhadrons.py b/machine_learning_hep/analysis/analyzerdhadrons.py index 24672b0303..7d4ab89287 100644 --- a/machine_learning_hep/analysis/analyzerdhadrons.py +++ b/machine_learning_hep/analysis/analyzerdhadrons.py @@ -470,6 +470,35 @@ def efficiency(self): legeffFD.Draw() cEffFD.SaveAs(f"{self.d_resultsallpmc}/EffFD{self.case}{self.typean}.eps") + if self.do_ptshape: + cEffFDPtShape = TCanvas('cEffFDPtShape', 'The Fit Canvas') + cEffFDPtShape.SetCanvasSize(1900, 1500) + cEffFDPtShape.SetWindowSize(500, 500) + + legeffFDPtShape = TLegend(.5, .65, .7, .85) + legeffFDPtShape.SetBorderSize(0) + legeffFDPtShape.SetFillColor(0) + legeffFDPtShape.SetFillStyle(0) + legeffFDPtShape.SetTextFont(42) + legeffFDPtShape.SetTextSize(0.035) + + h_gen_fd = lfileeff.Get("h_gen_fd_ptshape") + h_sel_fd = lfileeff.Get("h_sel_fd_ptshape") + h_sel_fd.Divide(h_sel_fd, h_gen_fd, 1.0, 1.0, "B") + h_sel_fd.Draw("same") + fileouteff.cd() + h_sel_fd.SetName("eff_fd_ptshape") + h_sel_fd.Write() + h_sel_fd.GetXaxis().SetTitle("#it{p}_{T} (GeV/#it{c})") + h_sel_fd.GetYaxis().SetTitle("Acc x efficiency feed-down %s %s (1/GeV)" + % (self.p_latexnhadron, self.typean)) + h_sel_fd.SetMinimum(0.001) + h_sel_fd.SetMaximum(1.) + gPad.SetLogy() + legeffFDPtShape.Draw() + cEffFDPtShape.SaveAs("%s/EffFDPtShape%s%s.eps" % (self.d_resultsallpmc, + self.case, self.typean)) + @staticmethod def calculate_norm(logger, hevents, hselevents): # TO BE FIXED WITH EV SEL if not hevents: diff --git a/machine_learning_hep/processer.py b/machine_learning_hep/processer.py index b2766d3cbd..f4971b944f 100644 --- a/machine_learning_hep/processer.py +++ b/machine_learning_hep/processer.py @@ -586,6 +586,29 @@ def applymodel(self, file_index): dfrecoskml = dfrecosk.query("isstd == 1") write_df(dfrecoskml, self.mptfiles_recoskmldec[ipt][file_index]) + if(self.do_ptshape and self.mcordata == 'mc'): + dfrecosk_ptshape = read_df(self.mptfiles_recosk_ptshape[ipt][file_index]) + if self.doml is True: + if self.mltype == "MultiClassification": + dfrecoskml_ptshape = apply(self.mltype, [self.p_modelname], [mod], + dfrecosk_ptshape, self.v_train[ipt], self.class_labels) + probs = [f'y_test_prob{self.p_modelname}{label.replace("-", "_")}' \ + for label in self.class_labels] + dfrecoskml_ptshape = dfrecoskml_ptshape[ + (dfrecoskml_ptshape[probs[0]] <= self.lpt_probcutpre[ipt][0]) & + (dfrecoskml_ptshape[probs[1]] >= self.lpt_probcutpre[ipt][1]) & + (dfrecoskml_ptshape[probs[2]] >= self.lpt_probcutpre[ipt][2])] + else: + dfrecoskml_ptshape = apply("BinaryClassification", [self.p_modelname], [mod], + dfrecosk_ptshape, self.v_train[ipt]) + probvar = f"y_test_prob{self.p_modelname}" + dfrecoskml_ptshape = dfrecoskml_ptshape.loc[ + dfrecoskml_ptshape[probvar] > self.lpt_probcutpre[ipt]] + + else: + dfrecoskml_ptshape = dfrecosk_ptshape.query("isstd == 1") + write_df(dfrecoskml_ptshape, self.mptfiles_recoskmldec_ptshape[ipt][file_index]) + @staticmethod def callback(ex): get_logger().exception("Error callback: %s", ex) @@ -717,8 +740,8 @@ def apply_cut_for_ipt(df_full, ipt: int): def process_histomass(self): self.logger.debug("Doing masshisto %s %s", self.mcordata, self.period) - self.logger.debug("Using run selection for mass histo %s %s %s", self.runlistrigger, "for period", self.period) - if self.doml is True: + self.logger.debug("Using run selection for mass histo %s for period %s", self.runlistrigger, self.period) + if self.doml: self.logger.debug("Doing ml analysis") elif self.do_custom_analysis_cuts: self.logger.debug("Using custom cuts") @@ -736,8 +759,8 @@ def process_histomass(self): def process_efficiency(self): print("Doing efficiencies", self.mcordata, self.period) - print("Using run selection for eff histo", self.runlistrigger, "for period", self.period) - if self.doml is True: + print("Using run selection for eff histo %s for period %s", self.runlistrigger, self.period) + if self.doml: print("Doing ml analysis") elif self.do_custom_analysis_cuts: print("Using custom cuts") From fb2ffe81431653d15e4e55d6491d09681bbe88aa Mon Sep 17 00:00:00 2001 From: saganatt <8majak8@gmail.com> Date: Tue, 29 Oct 2024 18:39:35 +0100 Subject: [PATCH 02/14] Ptshape changes refactored to avoid repetitions. TODO: processerdhadrons. --- .../analysis/analyzerdhadrons.py | 109 +++++------------- machine_learning_hep/processer.py | 98 ++++++++++------ machine_learning_hep/utilities.py | 16 ++- 3 files changed, 107 insertions(+), 116 deletions(-) diff --git a/machine_learning_hep/analysis/analyzerdhadrons.py b/machine_learning_hep/analysis/analyzerdhadrons.py index 7d4ab89287..fc7e3432aa 100644 --- a/machine_learning_hep/analysis/analyzerdhadrons.py +++ b/machine_learning_hep/analysis/analyzerdhadrons.py @@ -419,88 +419,41 @@ def efficiency(self): lfileeff = TFile.Open(self.n_fileff) lfileeff.ls() fileouteff = TFile.Open(f"{self.d_resultsallpmc}/{self.efficiency_filename}{self.case}{self.typean}.root", "recreate") - cEff = TCanvas("cEff", "The Fit Canvas") - cEff.SetCanvasSize(1900, 1500) - cEff.SetWindowSize(500, 500) - - legeff = TLegend(0.5, 0.65, 0.7, 0.85) - legeff.SetBorderSize(0) - legeff.SetFillColor(0) - legeff.SetFillStyle(0) - legeff.SetTextFont(42) - legeff.SetTextSize(0.035) - - h_gen_pr = lfileeff.Get("h_gen_pr") - h_sel_pr = lfileeff.Get("h_sel_pr") - h_sel_pr.Divide(h_sel_pr, h_gen_pr, 1.0, 1.0, "B") - h_sel_pr.Draw("same") - fileouteff.cd() - h_sel_pr.SetName("eff") - h_sel_pr.Write() - h_sel_pr.GetXaxis().SetTitle("#it{p}_{T} (GeV/#it{c})") - h_sel_pr.GetYaxis().SetTitle(f"Acc x efficiency (prompt) {self.p_latexnhadron} {self.typean} (1/GeV)") - h_sel_pr.SetMinimum(0.001) - h_sel_pr.SetMaximum(1.0) - gPad.SetLogy() - cEff.SaveAs(f"{self.d_resultsallpmc}/Eff{self.case}{self.typean}.eps") - - cEffFD = TCanvas("cEffFD", "The Fit Canvas") - cEffFD.SetCanvasSize(1900, 1500) - cEffFD.SetWindowSize(500, 500) - - legeffFD = TLegend(0.5, 0.65, 0.7, 0.85) - legeffFD.SetBorderSize(0) - legeffFD.SetFillColor(0) - legeffFD.SetFillStyle(0) - legeffFD.SetTextFont(42) - legeffFD.SetTextSize(0.035) - - h_gen_fd = lfileeff.Get("h_gen_fd") - h_sel_fd = lfileeff.Get("h_sel_fd") - h_sel_fd.Divide(h_sel_fd, h_gen_fd, 1.0, 1.0, "B") - h_sel_fd.Draw("same") - fileouteff.cd() - h_sel_fd.SetName("eff_fd") - h_sel_fd.Write() - h_sel_fd.GetXaxis().SetTitle("#it{p}_{T} (GeV/#it{c})") - h_sel_fd.GetYaxis().SetTitle(f"Acc x efficiency feed-down {self.p_latexnhadron} {self.typean} (1/GeV)") - h_sel_fd.SetMinimum(0.001) - h_sel_fd.SetMaximum(1.0) - gPad.SetLogy() - legeffFD.Draw() - cEffFD.SaveAs(f"{self.d_resultsallpmc}/EffFD{self.case}{self.typean}.eps") - - if self.do_ptshape: - cEffFDPtShape = TCanvas('cEffFDPtShape', 'The Fit Canvas') - cEffFDPtShape.SetCanvasSize(1900, 1500) - cEffFDPtShape.SetWindowSize(500, 500) - - legeffFDPtShape = TLegend(.5, .65, .7, .85) - legeffFDPtShape.SetBorderSize(0) - legeffFDPtShape.SetFillColor(0) - legeffFDPtShape.SetFillStyle(0) - legeffFDPtShape.SetTextFont(42) - legeffFDPtShape.SetTextSize(0.035) - - h_gen_fd = lfileeff.Get("h_gen_fd_ptshape") - h_sel_fd = lfileeff.Get("h_sel_fd_ptshape") - h_sel_fd.Divide(h_sel_fd, h_gen_fd, 1.0, 1.0, "B") - h_sel_fd.Draw("same") + + def do_eff(gen_hist, sel_hist, histname, outname, eff_case): + cEff = TCanvas(f"c{outname}", "The Fit Canvas") + cEff.SetCanvasSize(1900, 1500) + cEff.SetWindowSize(500, 500) + + legeff = TLegend(0.5, 0.65, 0.7, 0.85) + legeff.SetBorderSize(0) + legeff.SetFillColor(0) + legeff.SetFillStyle(0) + legeff.SetTextFont(42) + legeff.SetTextSize(0.035) + + h_gen = lfileeff.Get(gen_hist) + h_sel = lfileeff.Get(sel_hist) + h_sel.Divide(h_sel, h_gen, 1.0, 1.0, "B") + h_sel.Draw("same") fileouteff.cd() - h_sel_fd.SetName("eff_fd_ptshape") - h_sel_fd.Write() - h_sel_fd.GetXaxis().SetTitle("#it{p}_{T} (GeV/#it{c})") - h_sel_fd.GetYaxis().SetTitle("Acc x efficiency feed-down %s %s (1/GeV)" - % (self.p_latexnhadron, self.typean)) - h_sel_fd.SetMinimum(0.001) - h_sel_fd.SetMaximum(1.) + h_sel.SetName(histname) + h_sel.Write() + h_sel.GetXaxis().SetTitle("#it{p}_{T} (GeV/#it{c})") + h_sel.GetYaxis().SetTitle(f"Acc x efficiency ({eff_case}) {self.p_latexnhadron} {self.typean} (1/GeV)") + h_sel.SetMinimum(0.001) + h_sel.SetMaximum(1.0) gPad.SetLogy() - legeffFDPtShape.Draw() - cEffFDPtShape.SaveAs("%s/EffFDPtShape%s%s.eps" % (self.d_resultsallpmc, - self.case, self.typean)) + legeff.Draw() + cEff.SaveAs(f"{self.d_resultsallpmc}/{outname}{self.case}{self.typean}.eps") + + do_eff("h_gen_pr", "h_sel_pr", "eff", "Eff", "prompt") + do_eff("h_gen_fd", "h_sel_fd", "eff_fd", "EffFD", "feed-down") + if self.do_ptshape: + do_eff("h_gen_fd_ptshape", "h_sel_fd_ptshape", "eff_fd_ptshape", "EffFDPtShape", "feed-down") @staticmethod - def calculate_norm(logger, hevents, hselevents): # TO BE FIXED WITH EV SEL + def calculate_norm(logger, hevents, hselevents): # TO BE FIXED WITH EV SEL if not hevents: # pylint: disable=undefined-variable logger.error("Missing hevents") diff --git a/machine_learning_hep/processer.py b/machine_learning_hep/processer.py index f4971b944f..e526516ba5 100644 --- a/machine_learning_hep/processer.py +++ b/machine_learning_hep/processer.py @@ -202,6 +202,11 @@ def __init__( self.l_gen_sl = createlist(self.d_pkl, self.l_path, self.n_gen_sl) self.f_totevt = os.path.join(self.d_pkl, self.n_evt) self.f_totevtorig = os.path.join(self.d_pkl, self.n_evtorig) + self.f_weights = os.path.join(self.d_mcreweights, self.n_mcreweights) + + if self.do_ptshape: + with uproot.open(self.f_weights) as fin: + self.v_hist_weights = fin[self.n_weights].to_numpy() self.p_modelname = datap["mlapplication"]["modelname"] # Analysis pT bins @@ -347,6 +352,22 @@ def __init__( # self.triggerbit = datap["analysis"][self.typean]["triggerbit"] self.runlistrigger = runlisttrigger + if self.do_ptshape and self.mcordata == "mc": + lpt_recosk_ptshape = [None] * self.p_nptbins + lpt_gensk_ptshape = [None] * self.p_nptbins + lpt_recodec_ptshape = [None] * self.p_nptbins + + for ipt in range(self.p_nptbins): + lpt_recosk_ptshape[ipt] = self.v_var_binning_ptshape + self.lpt_recosk[ipt] + lpt_gensk_ptshape[ipt] = self.v_var_binning_ptshape + self.lpt_gensk[ipt] + lpt_recodec_ptshape[ipt] = self.v_var_binning_ptshape + self.lpt_recodec[ipt] + self.mptfiles_recosk_ptshape = [createlist(d_pklsk, self.l_path, \ + lpt_recosk_ptshape[ipt]) for ipt in range(self.p_nptbins)] + self.mptfiles_gensk_ptshape = [createlist(d_pklsk, self.l_path, \ + lpt_gensk_ptshape[ipt]) for ipt in range(self.p_nptbins)] + self.mptfiles_recoskmldec_ptshape = [createlist(self.d_pkl_dec, self.l_path, \ + lpt_recodec_ptshape[ipt]) for ipt in range(self.p_nptbins)] + # if os.path.exists(self.d_root) is False: # self.logger.warning("ROOT tree folder is not there. Is it intentional?") @@ -531,21 +552,25 @@ def dfuse(df_spec): path = os.path.join(self.d_pkl, self.l_path[file_index], df_spec["file"]) write_df(dfo, path) - def skim(self, file_index): - dfreco = read_df(self.l_reco[file_index]) - dfgen = read_df(self.l_gen[file_index]) if self.mcordata == "mc" else None - dfgen_sl = read_df(self.l_gen_sl[file_index]) if self.n_gen_sl and self.mcordata == "mc" else None - + def do_skim(self, dfreco, dfgen, var_binning, filenames_reco, filenames_gen, file_index): for ipt in range(self.p_nptbins): - dfrecosk = seldf_singlevar(dfreco, self.v_var_binning, self.lpt_anbinmin[ipt], self.lpt_anbinmax[ipt]) + dfrecosk = seldf_singlevar(dfreco, var_binning, + self.lpt_anbinmin[ipt], self.lpt_anbinmax[ipt]) dfrecosk = dfquery(dfrecosk, self.s_reco_skim[ipt]) - write_df(dfrecosk, self.mptfiles_recosk[ipt][file_index]) + write_df(dfrecosk, filenames_reco[ipt][file_index]) if dfgen is not None: - dfgensk = seldf_singlevar(dfgen, self.v_var_binning, self.lpt_anbinmin[ipt], self.lpt_anbinmax[ipt]) + dfgensk = seldf_singlevar(dfgen, var_binning, + self.lpt_anbinmin[ipt], self.lpt_anbinmax[ipt]) dfgensk = dfquery(dfgensk, self.s_gen_skim[ipt]) - write_df(dfgensk, self.mptfiles_gensk[ipt][file_index]) + write_df(dfgensk, filenames_gen[ipt][file_index]) + def skim(self, file_index): + dfreco = read_df(self.l_reco[file_index]) + dfgen = read_df(self.l_gen[file_index]) if self.mcordata == "mc" else None + dfgen_sl = read_df(self.l_gen_sl[file_index]) if self.n_gen_sl and self.mcordata == "mc" else None + + for ipt in range(self.p_nptbins): if dfgen_sl is not None: dfgensk_sl = seldf_singlevar( dfgen_sl, self.v_var_binning, self.lpt_anbinmin[ipt], self.lpt_anbinmax[ipt] @@ -553,12 +578,20 @@ def skim(self, file_index): dfgensk_sl = dfquery(dfgensk_sl, self.s_gen_skim[ipt]) write_df(dfgensk_sl, self.mptfiles_gensk_sl[ipt][file_index]) + self.do_skim(dfreco, dfgen, self.v_var_binning, self.mptfiles_recosk, self.mptfiles_gensk, file_index) + + if self.do_ptshape and self.mcordata == 'mc': + reweight(self.v_hist_weights, dfreco, self.v_var_binning, self.v_var_binning_ptshape) + reweight(self.v_hist_weights, dfgen, self.v_var_binning, self.v_var_binning_ptshape) + + self.do_skim(dfreco, dfgen, self.v_var_binning_ptshape, self.mptfiles_recosk_ptshape,\ + self.mptfiles_gensk_ptshape, file_index) + + + # pylint: disable=too-many-branches def applymodel(self, file_index): - for ipt in range(self.p_nptbins): - if os.path.exists(self.mptfiles_recoskmldec[ipt][file_index]): - if os.stat(self.mptfiles_recoskmldec[ipt][file_index]).st_size != 0: - continue - dfrecosk = read_df(self.mptfiles_recosk[ipt][file_index]) + def do_apply_model(in_filename, out_filename, ipt): + dfrecosk = read_df(in_filename) if self.p_mask_values: mask_df(dfrecosk, self.p_mask_values) if self.doml is True: @@ -584,30 +617,21 @@ def applymodel(self, file_index): dfrecoskml = dfrecoskml.loc[dfrecoskml[probvar] > self.lpt_probcutpre[ipt]] else: dfrecoskml = dfrecosk.query("isstd == 1") - write_df(dfrecoskml, self.mptfiles_recoskmldec[ipt][file_index]) - - if(self.do_ptshape and self.mcordata == 'mc'): - dfrecosk_ptshape = read_df(self.mptfiles_recosk_ptshape[ipt][file_index]) - if self.doml is True: - if self.mltype == "MultiClassification": - dfrecoskml_ptshape = apply(self.mltype, [self.p_modelname], [mod], - dfrecosk_ptshape, self.v_train[ipt], self.class_labels) - probs = [f'y_test_prob{self.p_modelname}{label.replace("-", "_")}' \ - for label in self.class_labels] - dfrecoskml_ptshape = dfrecoskml_ptshape[ - (dfrecoskml_ptshape[probs[0]] <= self.lpt_probcutpre[ipt][0]) & - (dfrecoskml_ptshape[probs[1]] >= self.lpt_probcutpre[ipt][1]) & - (dfrecoskml_ptshape[probs[2]] >= self.lpt_probcutpre[ipt][2])] - else: - dfrecoskml_ptshape = apply("BinaryClassification", [self.p_modelname], [mod], - dfrecosk_ptshape, self.v_train[ipt]) - probvar = f"y_test_prob{self.p_modelname}" - dfrecoskml_ptshape = dfrecoskml_ptshape.loc[ - dfrecoskml_ptshape[probvar] > self.lpt_probcutpre[ipt]] + write_df(dfrecoskml, out_filename) - else: - dfrecoskml_ptshape = dfrecosk_ptshape.query("isstd == 1") - write_df(dfrecoskml_ptshape, self.mptfiles_recoskmldec_ptshape[ipt][file_index]) + for ipt in range(self.p_nptbins): + if os.path.exists(self.mptfiles_recoskmldec[ipt][file_index]): + if os.stat(self.mptfiles_recoskmldec[ipt][file_index]).st_size != 0: + continue + + do_apply_model(self.mptfiles_recosk[ipt][file_index], + self.mptfiles_recoskmldec[ipt][file_index], + ipt) + + if self.do_ptshape and self.mcordata == 'mc': + do_apply_model(self.mptfiles_recosk_ptshape[ipt][file_index], + self.mptfiles_recoskmldec_ptshape[ipt][file_index], + ipt) @staticmethod def callback(ex): diff --git a/machine_learning_hep/utilities.py b/machine_learning_hep/utilities.py index f70a02b26d..f8aa04ce26 100644 --- a/machine_learning_hep/utilities.py +++ b/machine_learning_hep/utilities.py @@ -172,7 +172,7 @@ def conv_none(value): df_to_mask.loc[mask_indices, [mc["column"]]] = conv_none(mc["mask_with"]) -def dfquery(df, selection, **kwargs): # pylint: disable=invalid-name +def dfquery(df, selection, **kwargs): # pylint: disable=invalid-name return df.query(selection, **kwargs) if selection is not None else df @@ -187,6 +187,20 @@ def selectdfrunlist(dfr, runlist, runvar): dfr = dfr[issel] return dfr +def reweight(hist_weights, df, var, weighted_var): # pylint: disable=invalid-name + weights = hist_weights[0] + bin_edges = hist_weights[1] + + # Extract pT values from DataFrame + var = df[var] + + # Determine bin indices for each pT value + bin_index = np.digitize(var, bin_edges) - 1 + + # Ensure bin indices are within the valid range + bin_index = np.clip(bin_index, 0, len(weights) - 1) + df['weights'] = weights[bin_index] + df[weighted_var] = df['weights'] * var def count_df_length_pkl(*pkls): """ From cdf8f673f977d525705c5c8f47f02054e6c342d4 Mon Sep 17 00:00:00 2001 From: saganatt <8majak8@gmail.com> Date: Mon, 14 Jul 2025 09:06:15 +0200 Subject: [PATCH 03/14] Add ptshape code to processerdhadrons --- machine_learning_hep/processerdhadrons.py | 56 ++++++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/machine_learning_hep/processerdhadrons.py b/machine_learning_hep/processerdhadrons.py index a46e90ee37..08484d783b 100755 --- a/machine_learning_hep/processerdhadrons.py +++ b/machine_learning_hep/processerdhadrons.py @@ -200,7 +200,7 @@ def process_histomass_single(self, index): # pylint: disable=line-too-long def process_efficiency_single(self, index): - # TO UPDATE TO DHADRON_MULT VERSION + # TODO: UPDATE TO DHADRON_MULT VERSION out_file = TFile.Open(self.l_histoeff[index], "recreate") n_bins = len(self.lpt_finbinmin) analysis_bin_lims_temp = self.lpt_finbinmin.copy() @@ -212,8 +212,16 @@ def process_efficiency_single(self, index): h_gen_fd = TH1F("h_gen_fd", "FD Generated in acceptance |y|<0.5", n_bins, analysis_bin_lims) h_presel_fd = TH1F("h_presel_fd", "FD Reco in acc |#eta|<0.8 and sel", n_bins, analysis_bin_lims) h_sel_fd = TH1F("h_sel_fd", "FD Reco and sel in acc |#eta|<0.8 and sel", n_bins, analysis_bin_lims) + if self.do_ptshape: + h_gen_fd_ptshape = TH1F("h_gen_fd_ptshape", "FD Generated in acceptance |y|<0.5", \ + n_bins, analysis_bin_lims) + h_presel_fd_ptshape = TH1F("h_presel_fd_ptshape", "FD Reco in acc |#eta|<0.8 and sel", \ + n_bins, analysis_bin_lims) + h_sel_fd_ptshape = TH1F("h_sel_fd_ptshape", "FD Reco and sel in acc |#eta|<0.8 and sel", \ + n_bins, analysis_bin_lims) bincounter = 0 + bincounter2 = 0 for ipt in range(self.p_nptfinbins): bin_id = self.bin_matching[ipt] df_mc_reco = read_df(self.mptfiles_recoskmldec[bin_id][index]) @@ -271,6 +279,47 @@ def process_efficiency_single(self, index): h_sel_fd.SetBinError(bincounter + 1, err) bincounter = bincounter + 1 + if self.do_ptshape: + + df_mc_reco_ptshape = read_df(self.mptfiles_recoskmldec_ptshape[bin_id][index]) + + if self.s_evtsel is not None: + df_mc_reco_ptshape = df_mc_reco_ptshape.query(self.s_evtsel) + + df_mc_gen_ptshape = read_df(self.mptfiles_gensk_ptshape[bin_id][index]) + + df_mc_gen_ptshape = df_mc_gen_ptshape.query(self.s_presel_gen_eff) + + df_mc_reco_ptshape = seldf_singlevar(df_mc_reco_ptshape, self.v_var_binning_ptshape, \ + self.lpt_finbinmin[ipt], self.lpt_finbinmax[ipt]) + df_mc_gen_ptshape = seldf_singlevar(df_mc_gen_ptshape, self.v_var_binning_ptshape, \ + self.lpt_finbinmin[ipt], self.lpt_finbinmax[ipt]) + + df_gen_sel_fd_ptshape = df_mc_gen_ptshape.loc[(df_mc_gen_ptshape.ismcfd == 1) & (df_mc_gen_ptshape.ismcsignal == 1)] + df_reco_presel_fd_ptshape = df_mc_reco_ptshape.loc[(df_mc_reco_ptshape.ismcfd == 1) & (df_mc_reco_ptshape.ismcsignal == 1)] + df_reco_sel_fd_ptshape = None + if self.doml is True: + df_reco_sel_fd_ptshape = df_reco_presel_fd_ptshape.query(self.l_selml[bin_id]) + else: + df_reco_sel_fd_ptshape = df_reco_presel_fd_ptshape.copy() + + if self.do_custom_analysis_cuts: + df_reco_sel_fd_ptshape = self.apply_cuts_ptbin(df_reco_sel_fd_ptshape, ipt) + + val2 = len(df_gen_sel_fd_ptshape) + err2 = math.sqrt(val2) + h_gen_fd_ptshape.SetBinContent(bincounter2 + 1, val2) + h_gen_fd_ptshape.SetBinError(bincounter2 + 1, err2) + val2 = len(df_reco_presel_fd_ptshape) + err2 = math.sqrt(val2) + h_presel_fd_ptshape.SetBinContent(bincounter2 + 1, val2) + h_presel_fd_ptshape.SetBinError(bincounter2 + 1, err2) + val2 = len(df_reco_sel_fd_ptshape) + err2 = math.sqrt(val2) + h_sel_fd_ptshape.SetBinContent(bincounter2 + 1, val2) + h_sel_fd_ptshape.SetBinError(bincounter2 + 1, err2) + bincounter2 = bincounter2 + 1 + out_file.cd() h_gen_pr.Write() h_presel_pr.Write() @@ -278,3 +327,8 @@ def process_efficiency_single(self, index): h_gen_fd.Write() h_presel_fd.Write() h_sel_fd.Write() + + if self.do_ptshape: + h_gen_fd_ptshape.Write() + h_presel_fd_ptshape.Write() + h_sel_fd_ptshape.Write() From 61ed9abedfaf11035be1a941a2c3fea90bcf86c4 Mon Sep 17 00:00:00 2001 From: saganatt <8majak8@gmail.com> Date: Mon, 14 Jul 2025 09:35:25 +0200 Subject: [PATCH 04/14] Refactored ptshape in processerdhadrons --- machine_learning_hep/processerdhadrons.py | 119 ++++++---------------- 1 file changed, 31 insertions(+), 88 deletions(-) diff --git a/machine_learning_hep/processerdhadrons.py b/machine_learning_hep/processerdhadrons.py index 08484d783b..cc5e6e6b27 100755 --- a/machine_learning_hep/processerdhadrons.py +++ b/machine_learning_hep/processerdhadrons.py @@ -206,6 +206,7 @@ def process_efficiency_single(self, index): analysis_bin_lims_temp = self.lpt_finbinmin.copy() analysis_bin_lims_temp.append(self.lpt_finbinmax[n_bins - 1]) analysis_bin_lims = array.array("f", analysis_bin_lims_temp) + h_gen_pr = TH1F("h_gen_pr", "Prompt Generated in acceptance |y|<0.5", n_bins, analysis_bin_lims) h_presel_pr = TH1F("h_presel_pr", "Prompt Reco in acc |#eta|<0.8 and sel", n_bins, analysis_bin_lims) h_sel_pr = TH1F("h_sel_pr", "Prompt Reco and sel in acc |#eta|<0.8 and sel", n_bins, analysis_bin_lims) @@ -220,105 +221,47 @@ def process_efficiency_single(self, index): h_sel_fd_ptshape = TH1F("h_sel_fd_ptshape", "FD Reco and sel in acc |#eta|<0.8 and sel", \ n_bins, analysis_bin_lims) - bincounter = 0 - bincounter2 = 0 - for ipt in range(self.p_nptfinbins): - bin_id = self.bin_matching[ipt] - df_mc_reco = read_df(self.mptfiles_recoskmldec[bin_id][index]) + def get_eff_dfs(ipt, bin_id, gen_files, reco_files, var_binning): + df_mc_reco = read_df(reco_files[bin_id][index]) if self.s_evtsel is not None: df_mc_reco = df_mc_reco.query(self.s_evtsel) - df_mc_gen = read_df(self.mptfiles_gensk[bin_id][index]) + df_mc_gen = read_df(gen_files[bin_id][index]) df_mc_gen = df_mc_gen.query(self.s_presel_gen_eff) df_mc_reco = seldf_singlevar( - df_mc_reco, self.v_var_binning, self.lpt_finbinmin[ipt], self.lpt_finbinmax[ipt] + df_mc_reco, var_binning, self.lpt_finbinmin[ipt], self.lpt_finbinmax[ipt] ) - df_mc_gen = seldf_singlevar(df_mc_gen, self.v_var_binning, self.lpt_finbinmin[ipt], self.lpt_finbinmax[ipt]) - df_gen_sel_pr = df_mc_gen.loc[(df_mc_gen.ismcprompt == 1) & (df_mc_gen.ismcsignal == 1)] - df_reco_presel_pr = df_mc_reco.loc[(df_mc_reco.ismcprompt == 1) & (df_mc_reco.ismcsignal == 1)] - df_reco_sel_pr = None - if self.doml is True: - df_reco_sel_pr = df_reco_presel_pr.query(self.l_selml[bin_id]) - else: - df_reco_sel_pr = df_reco_presel_pr.copy() - df_gen_sel_fd = df_mc_gen.loc[(df_mc_gen.ismcfd == 1) & (df_mc_gen.ismcsignal == 1)] - df_reco_presel_fd = df_mc_reco.loc[(df_mc_reco.ismcfd == 1) & (df_mc_reco.ismcsignal == 1)] - df_reco_sel_fd = None + df_mc_gen = seldf_singlevar(df_mc_gen, var_binning, self.lpt_finbinmin[ipt], self.lpt_finbinmax[ipt]) + return df_mc_gen, df_mc_reco + + def do_eff_single(ipt, bin_id, df_mc_gen, df_mc_reco, sel_column, bincounter, hists): + df_gen_sel = df_mc_gen.loc[(df_mc_gen[sel_column] == 1) & (df_mc_gen.ismcsignal == 1)] + df_reco_presel = df_mc_reco.loc[(df_mc_reco[sel_column] == 1) & (df_mc_reco.ismcsignal == 1)] + df_reco_sel = None if self.doml is True: - df_reco_sel_fd = df_reco_presel_fd.query(self.l_selml[bin_id]) + df_reco_sel = df_reco_presel.query(self.l_selml[bin_id]) else: - df_reco_sel_fd = df_reco_presel_fd.copy() - + df_reco_sel = df_reco_presel.copy() if self.do_custom_analysis_cuts: - df_reco_sel_pr = self.apply_cuts_ptbin(df_reco_sel_pr, ipt) - df_reco_sel_fd = self.apply_cuts_ptbin(df_reco_sel_fd, ipt) - - val = len(df_gen_sel_pr) - err = math.sqrt(val) - h_gen_pr.SetBinContent(bincounter + 1, val) - h_gen_pr.SetBinError(bincounter + 1, err) - val = len(df_reco_presel_pr) - err = math.sqrt(val) - h_presel_pr.SetBinContent(bincounter + 1, val) - h_presel_pr.SetBinError(bincounter + 1, err) - val = len(df_reco_sel_pr) - err = math.sqrt(val) - h_sel_pr.SetBinContent(bincounter + 1, val) - h_sel_pr.SetBinError(bincounter + 1, err) - - val = len(df_gen_sel_fd) - err = math.sqrt(val) - h_gen_fd.SetBinContent(bincounter + 1, val) - h_gen_fd.SetBinError(bincounter + 1, err) - val = len(df_reco_presel_fd) - err = math.sqrt(val) - h_presel_fd.SetBinContent(bincounter + 1, val) - h_presel_fd.SetBinError(bincounter + 1, err) - val = len(df_reco_sel_fd) - err = math.sqrt(val) - h_sel_fd.SetBinContent(bincounter + 1, val) - h_sel_fd.SetBinError(bincounter + 1, err) + df_reco_sel = self.apply_cuts_ptbin(df_reco_sel, ipt) + + for df, hist in zip((df_gen_sel, df_reco_presel, df_reco_sel), hists): + val = len(df) + err = math.sqrt(val) + hist.SetBinContent(bincounter + 1, val) + hist.SetBinError(bincounter + 1, err) bincounter = bincounter + 1 - if self.do_ptshape: + bincounter_pr = bincounter_fd = bincounter_ptshape = 0 + for ipt in range(self.p_nptfinbins): + bin_id = self.bin_matching[ipt] + df_mc_gen, df_mc_reco = get_eff_dfs(ipt, bin_id, self.mptfiles_gensk, self.mptfiles_recoskmldec, self.v_var_binning) + + do_eff_single(ipt, bin_id, df_mc_gen, df_mc_reco, "ismcprompt", bincounter_pr, (h_gen_pr, h_presel_pr, h_sel_pr)) + do_eff_single(ipt, bin_id, df_mc_gen, df_mc_reco, "ismcfd", bincounter_fd, (h_gen_fd, h_presel_fd, h_sel_fd)) - df_mc_reco_ptshape = read_df(self.mptfiles_recoskmldec_ptshape[bin_id][index]) - - if self.s_evtsel is not None: - df_mc_reco_ptshape = df_mc_reco_ptshape.query(self.s_evtsel) - - df_mc_gen_ptshape = read_df(self.mptfiles_gensk_ptshape[bin_id][index]) - - df_mc_gen_ptshape = df_mc_gen_ptshape.query(self.s_presel_gen_eff) - - df_mc_reco_ptshape = seldf_singlevar(df_mc_reco_ptshape, self.v_var_binning_ptshape, \ - self.lpt_finbinmin[ipt], self.lpt_finbinmax[ipt]) - df_mc_gen_ptshape = seldf_singlevar(df_mc_gen_ptshape, self.v_var_binning_ptshape, \ - self.lpt_finbinmin[ipt], self.lpt_finbinmax[ipt]) - - df_gen_sel_fd_ptshape = df_mc_gen_ptshape.loc[(df_mc_gen_ptshape.ismcfd == 1) & (df_mc_gen_ptshape.ismcsignal == 1)] - df_reco_presel_fd_ptshape = df_mc_reco_ptshape.loc[(df_mc_reco_ptshape.ismcfd == 1) & (df_mc_reco_ptshape.ismcsignal == 1)] - df_reco_sel_fd_ptshape = None - if self.doml is True: - df_reco_sel_fd_ptshape = df_reco_presel_fd_ptshape.query(self.l_selml[bin_id]) - else: - df_reco_sel_fd_ptshape = df_reco_presel_fd_ptshape.copy() - - if self.do_custom_analysis_cuts: - df_reco_sel_fd_ptshape = self.apply_cuts_ptbin(df_reco_sel_fd_ptshape, ipt) - - val2 = len(df_gen_sel_fd_ptshape) - err2 = math.sqrt(val2) - h_gen_fd_ptshape.SetBinContent(bincounter2 + 1, val2) - h_gen_fd_ptshape.SetBinError(bincounter2 + 1, err2) - val2 = len(df_reco_presel_fd_ptshape) - err2 = math.sqrt(val2) - h_presel_fd_ptshape.SetBinContent(bincounter2 + 1, val2) - h_presel_fd_ptshape.SetBinError(bincounter2 + 1, err2) - val2 = len(df_reco_sel_fd_ptshape) - err2 = math.sqrt(val2) - h_sel_fd_ptshape.SetBinContent(bincounter2 + 1, val2) - h_sel_fd_ptshape.SetBinError(bincounter2 + 1, err2) - bincounter2 = bincounter2 + 1 + if self.do_ptshape: + df_mc_gen_ptshape, df_mc_reco_ptshape = get_eff_dfs(ipt, bin_id, self.mptfiles_gensk_ptshape, self.mptfiles_recoskmldec_ptshape, self.v_var_binning_ptshape) + do_eff_single(ipt, bin_id, df_mc_gen_ptshape, df_mc_reco_ptshape, "ismcfd", bincounter_ptshape, (h_gen_fd_ptshape, h_presel_fd_ptshape, h_sel_fd_ptshape)) out_file.cd() h_gen_pr.Write() From be9bb25970fd2251cf3a572b7e74ee5f03727af2 Mon Sep 17 00:00:00 2001 From: saganatt <8majak8@gmail.com> Date: Tue, 15 Jul 2025 14:20:54 +0200 Subject: [PATCH 05/14] Leftover fixes --- machine_learning_hep/processer.py | 6 +++++- machine_learning_hep/processerdhadrons.py | 4 ++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/machine_learning_hep/processer.py b/machine_learning_hep/processer.py index e526516ba5..9d1192d98b 100644 --- a/machine_learning_hep/processer.py +++ b/machine_learning_hep/processer.py @@ -42,6 +42,7 @@ mergerootfiles, openfile, read_df, + reweight, seldf_singlevar, write_df, ) @@ -149,6 +150,7 @@ def __init__( self.n_fileeff = datap["files_names"]["efffilename"] self.n_fileresp = datap["files_names"]["respfilename"] self.n_mcreweights = datap["files_names"]["namefile_mcweights"] + self.n_weights = datap["files_names"]["histoweights"] # selections self.s_reco_skim = datap["sel_reco_skim"] @@ -172,6 +174,8 @@ def __init__( self.v_ismcbkg = datap["bitmap_sel"]["var_ismcbkg"] # used in hadrons self.v_ismcrefl = datap["bitmap_sel"]["var_ismcrefl"] # used in hadrons self.v_var_binning = datap["var_binning"] + self.do_ptshape = datap.get("do_ptshape", False) + self.v_var_binning_ptshape = datap.get("var_binning_ptshape", None) self.v_invmass = datap["variables"].get("var_inv_mass", "inv_mass") # self.v_rapy = datap["variables"].get("var_y", "y_cand") @@ -298,7 +302,7 @@ def __init__( ) self.lpt_recodec = None - if self.doml is True: + if self.doml: if self.mltype == "MultiClassification": self.lpt_recodec = [ self.n_reco.replace( diff --git a/machine_learning_hep/processerdhadrons.py b/machine_learning_hep/processerdhadrons.py index cc5e6e6b27..908fff35a6 100755 --- a/machine_learning_hep/processerdhadrons.py +++ b/machine_learning_hep/processerdhadrons.py @@ -138,8 +138,8 @@ def process_histomass_single(self, index): if self.s_evtsel is not None: df = df.query(self.s_evtsel) - if self.doml is True: - df = df.query(self.l_selml[bin_id]) + if self.doml: + df = df.query(self.l_selml[ipt]) df = seldf_singlevar(df, self.v_var_binning, self.lpt_finbinmin[ipt], self.lpt_finbinmax[ipt]) if self.do_custom_analysis_cuts: From 5d196e1a2fd1b8ce235f4c094191765c7f48b9cc Mon Sep 17 00:00:00 2001 From: saganatt <8majak8@gmail.com> Date: Mon, 22 Sep 2025 10:35:32 +0200 Subject: [PATCH 06/14] Remove trailing whitespaces --- machine_learning_hep/analysis/analyzerdhadrons.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/machine_learning_hep/analysis/analyzerdhadrons.py b/machine_learning_hep/analysis/analyzerdhadrons.py index fc7e3432aa..0566465daf 100644 --- a/machine_learning_hep/analysis/analyzerdhadrons.py +++ b/machine_learning_hep/analysis/analyzerdhadrons.py @@ -419,7 +419,7 @@ def efficiency(self): lfileeff = TFile.Open(self.n_fileff) lfileeff.ls() fileouteff = TFile.Open(f"{self.d_resultsallpmc}/{self.efficiency_filename}{self.case}{self.typean}.root", "recreate") - + def do_eff(gen_hist, sel_hist, histname, outname, eff_case): cEff = TCanvas(f"c{outname}", "The Fit Canvas") cEff.SetCanvasSize(1900, 1500) @@ -446,7 +446,7 @@ def do_eff(gen_hist, sel_hist, histname, outname, eff_case): gPad.SetLogy() legeff.Draw() cEff.SaveAs(f"{self.d_resultsallpmc}/{outname}{self.case}{self.typean}.eps") - + do_eff("h_gen_pr", "h_sel_pr", "eff", "Eff", "prompt") do_eff("h_gen_fd", "h_sel_fd", "eff_fd", "EffFD", "feed-down") if self.do_ptshape: From 145bde40fbcac7dd49f2b21488174ef1f23cb069 Mon Sep 17 00:00:00 2001 From: saganatt <8majak8@gmail.com> Date: Mon, 22 Sep 2025 10:42:18 +0200 Subject: [PATCH 07/14] Change of TODO comment --- machine_learning_hep/processerdhadrons.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine_learning_hep/processerdhadrons.py b/machine_learning_hep/processerdhadrons.py index 908fff35a6..12df785d46 100755 --- a/machine_learning_hep/processerdhadrons.py +++ b/machine_learning_hep/processerdhadrons.py @@ -200,7 +200,7 @@ def process_histomass_single(self, index): # pylint: disable=line-too-long def process_efficiency_single(self, index): - # TODO: UPDATE TO DHADRON_MULT VERSION + # TODO: Unify this and the dhadrons_mult version out_file = TFile.Open(self.l_histoeff[index], "recreate") n_bins = len(self.lpt_finbinmin) analysis_bin_lims_temp = self.lpt_finbinmin.copy() From 0d5082ff1acfa50843fd2d65a700c9d656627ac0 Mon Sep 17 00:00:00 2001 From: saganatt <8majak8@gmail.com> Date: Mon, 22 Sep 2025 15:53:06 +0200 Subject: [PATCH 08/14] Solve most of pylint errors --- machine_learning_hep/analysis/analyzerdhadrons.py | 10 +++++++--- machine_learning_hep/processer.py | 8 +++++++- machine_learning_hep/processerdhadrons.py | 11 ++++++++++- machine_learning_hep/utilities.py | 2 +- 4 files changed, 25 insertions(+), 6 deletions(-) diff --git a/machine_learning_hep/analysis/analyzerdhadrons.py b/machine_learning_hep/analysis/analyzerdhadrons.py index 0566465daf..8aff80c804 100644 --- a/machine_learning_hep/analysis/analyzerdhadrons.py +++ b/machine_learning_hep/analysis/analyzerdhadrons.py @@ -55,10 +55,13 @@ from machine_learning_hep.utils.hist import get_dim, project_hist # pylint: disable=too-few-public-methods, too-many-instance-attributes, too-many-statements, fixme -# pylint: disable=consider-using-enumerate fixme +# pylint: disable=consider-using-enumerate, missing-function-docstring class AnalyzerDhadrons(Analyzer): # pylint: disable=invalid-name + """ + An analyzer for D and L hadrons. + """ species = "analyzer" def __init__(self, datap, case, typean, period): @@ -288,7 +291,7 @@ def fit(self): lpt_probcutfin[ipt] = self.lpt_probcutfin_tmp[self.bin_matching[ipt]] self.logger.debug("fitting %s - %i", level, ipt) roows = self.roows.get(ipt) - if self.mltype == "MultiClassification": + if self.mltype == "MultiClassification": # pylint: ignore=duplicate-code suffix = "%s%d_%d_%.2f%.2f%.2f" % ( self.v_var_binning, self.lpt_finbinmin[ipt], @@ -418,7 +421,8 @@ def efficiency(self): print(self.n_fileff) lfileeff = TFile.Open(self.n_fileff) lfileeff.ls() - fileouteff = TFile.Open(f"{self.d_resultsallpmc}/{self.efficiency_filename}{self.case}{self.typean}.root", "recreate") + fileouteff = TFile.Open(f"{self.d_resultsallpmc}/{self.efficiency_filename}{self.case}{self.typean}.root", + "recreate") def do_eff(gen_hist, sel_hist, histname, outname, eff_case): cEff = TCanvas(f"c{outname}", "The Fit Canvas") diff --git a/machine_learning_hep/processer.py b/machine_learning_hep/processer.py index 9d1192d98b..7dc7e2e32f 100644 --- a/machine_learning_hep/processer.py +++ b/machine_learning_hep/processer.py @@ -50,8 +50,13 @@ pd.options.mode.chained_assignment = None +# pylint: disable=missing-function-docstring + class Processer: # pylint: disable=too-many-instance-attributes + """ + The main class for data processing, machine learning, and analysis. + """ # Class Attribute species = "processer" logger = get_logger() @@ -499,7 +504,8 @@ def dfuse(df_spec): dfs[df_name][tag] = (var == value["req"]).astype(int) # dfs[df_name][tag] = np.array( - # tag_bit_df(dfs[df_name], value["var"], value["req"], value.get("abs", False)), dtype=int) + # tag_bit_df(dfs[df_name], value["var"], value["req"], value.get("abs", False)), + # dtype=int) if "swap" in df_spec: self.logger.debug(" %s -> swap", df_name) diff --git a/machine_learning_hep/processerdhadrons.py b/machine_learning_hep/processerdhadrons.py index 12df785d46..cd36f89f35 100755 --- a/machine_learning_hep/processerdhadrons.py +++ b/machine_learning_hep/processerdhadrons.py @@ -31,6 +31,9 @@ class ProcesserDhadrons(Processer): # pylint: disable=too-many-instance-attributes + """ + The class for specializations of data processing, machine learning and analysis for D and L hadrons. + """ # Class Attribute species = "processer" @@ -104,6 +107,9 @@ def __init__( # pylint: disable=too-many-branches def process_histomass_single(self, index): + """ + Generate invariant mass histograms for MC or real data. + """ myfile = TFile.Open(self.l_histomass[index], "recreate") dfevtorig = read_df(self.l_evtorig[index]) neventsorig = len(dfevtorig) @@ -147,7 +153,7 @@ def process_histomass_single(self, index): df_ptmerged = pd.concat([df_ptmerged, df], ignore_index=True) - if self.mltype == "MultiClassification": + if self.mltype == "MultiClassification": # pylint: ignore=duplicate-code suffix = "%s%d_%d_%.2f%.2f%.2f" % ( self.v_var_binning, self.lpt_finbinmin[ipt], @@ -200,6 +206,9 @@ def process_histomass_single(self, index): # pylint: disable=line-too-long def process_efficiency_single(self, index): + """ + Generate efficiency histograms for MC or real data. + """ # TODO: Unify this and the dhadrons_mult version out_file = TFile.Open(self.l_histoeff[index], "recreate") n_bins = len(self.lpt_finbinmin) diff --git a/machine_learning_hep/utilities.py b/machine_learning_hep/utilities.py index f8aa04ce26..8af345e276 100644 --- a/machine_learning_hep/utilities.py +++ b/machine_learning_hep/utilities.py @@ -74,7 +74,7 @@ from machine_learning_hep.logger import get_logger from machine_learning_hep.selectionutils import select_runs -# pylint: disable=too-many-lines +# pylint: disable=too-many-lines, missing-function-docstring logger = get_logger() From 86e65eff3df84bbec1cb84c06d6f530146eb5963 Mon Sep 17 00:00:00 2001 From: saganatt <8majak8@gmail.com> Date: Mon, 22 Sep 2025 15:56:53 +0200 Subject: [PATCH 09/14] No-member error solved --- machine_learning_hep/processerdhadrons.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/machine_learning_hep/processerdhadrons.py b/machine_learning_hep/processerdhadrons.py index cd36f89f35..17f6037196 100755 --- a/machine_learning_hep/processerdhadrons.py +++ b/machine_learning_hep/processerdhadrons.py @@ -222,7 +222,7 @@ def process_efficiency_single(self, index): h_gen_fd = TH1F("h_gen_fd", "FD Generated in acceptance |y|<0.5", n_bins, analysis_bin_lims) h_presel_fd = TH1F("h_presel_fd", "FD Reco in acc |#eta|<0.8 and sel", n_bins, analysis_bin_lims) h_sel_fd = TH1F("h_sel_fd", "FD Reco and sel in acc |#eta|<0.8 and sel", n_bins, analysis_bin_lims) - if self.do_ptshape: + if self.do_ptshape: # pylint: disable=no-member h_gen_fd_ptshape = TH1F("h_gen_fd_ptshape", "FD Generated in acceptance |y|<0.5", \ n_bins, analysis_bin_lims) h_presel_fd_ptshape = TH1F("h_presel_fd_ptshape", "FD Reco in acc |#eta|<0.8 and sel", \ @@ -268,7 +268,7 @@ def do_eff_single(ipt, bin_id, df_mc_gen, df_mc_reco, sel_column, bincounter, hi do_eff_single(ipt, bin_id, df_mc_gen, df_mc_reco, "ismcprompt", bincounter_pr, (h_gen_pr, h_presel_pr, h_sel_pr)) do_eff_single(ipt, bin_id, df_mc_gen, df_mc_reco, "ismcfd", bincounter_fd, (h_gen_fd, h_presel_fd, h_sel_fd)) - if self.do_ptshape: + if self.do_ptshape: # pylint: disable=no-member df_mc_gen_ptshape, df_mc_reco_ptshape = get_eff_dfs(ipt, bin_id, self.mptfiles_gensk_ptshape, self.mptfiles_recoskmldec_ptshape, self.v_var_binning_ptshape) do_eff_single(ipt, bin_id, df_mc_gen_ptshape, df_mc_reco_ptshape, "ismcfd", bincounter_ptshape, (h_gen_fd_ptshape, h_presel_fd_ptshape, h_sel_fd_ptshape)) @@ -280,7 +280,7 @@ def do_eff_single(ipt, bin_id, df_mc_gen, df_mc_reco, sel_column, bincounter, hi h_presel_fd.Write() h_sel_fd.Write() - if self.do_ptshape: + if self.do_ptshape: # pylint: disable=no-member h_gen_fd_ptshape.Write() h_presel_fd_ptshape.Write() h_sel_fd_ptshape.Write() From 33fe9eeaf8b24dabc06fe525b29d1776ad1de236 Mon Sep 17 00:00:00 2001 From: saganatt <8majak8@gmail.com> Date: Mon, 22 Sep 2025 16:18:04 +0200 Subject: [PATCH 10/14] Nitpick and remaining pylint errors --- machine_learning_hep/analysis/analyzerdhadrons.py | 4 ++-- machine_learning_hep/processerdhadrons.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/machine_learning_hep/analysis/analyzerdhadrons.py b/machine_learning_hep/analysis/analyzerdhadrons.py index 8aff80c804..0a86b1ab8d 100644 --- a/machine_learning_hep/analysis/analyzerdhadrons.py +++ b/machine_learning_hep/analysis/analyzerdhadrons.py @@ -291,7 +291,7 @@ def fit(self): lpt_probcutfin[ipt] = self.lpt_probcutfin_tmp[self.bin_matching[ipt]] self.logger.debug("fitting %s - %i", level, ipt) roows = self.roows.get(ipt) - if self.mltype == "MultiClassification": # pylint: ignore=duplicate-code + if self.mltype == "MultiClassification": # pylint: disable=duplicate-code suffix = "%s%d_%d_%.2f%.2f%.2f" % ( self.v_var_binning, self.lpt_finbinmin[ipt], @@ -453,7 +453,7 @@ def do_eff(gen_hist, sel_hist, histname, outname, eff_case): do_eff("h_gen_pr", "h_sel_pr", "eff", "Eff", "prompt") do_eff("h_gen_fd", "h_sel_fd", "eff_fd", "EffFD", "feed-down") - if self.do_ptshape: + if self.do_ptshape: # pylint: disable=no-member do_eff("h_gen_fd_ptshape", "h_sel_fd_ptshape", "eff_fd_ptshape", "EffFDPtShape", "feed-down") @staticmethod diff --git a/machine_learning_hep/processerdhadrons.py b/machine_learning_hep/processerdhadrons.py index 17f6037196..83af88ecc9 100755 --- a/machine_learning_hep/processerdhadrons.py +++ b/machine_learning_hep/processerdhadrons.py @@ -153,7 +153,7 @@ def process_histomass_single(self, index): df_ptmerged = pd.concat([df_ptmerged, df], ignore_index=True) - if self.mltype == "MultiClassification": # pylint: ignore=duplicate-code + if self.mltype == "MultiClassification": # pylint: disable=duplicate-code suffix = "%s%d_%d_%.2f%.2f%.2f" % ( self.v_var_binning, self.lpt_finbinmin[ipt], From 17931a0170885b67dd3870cb038ee93910abf803 Mon Sep 17 00:00:00 2001 From: saganatt <8majak8@gmail.com> Date: Mon, 22 Sep 2025 16:22:55 +0200 Subject: [PATCH 11/14] Remove the pylint disable which doesn't work --- machine_learning_hep/analysis/analyzerdhadrons.py | 2 +- machine_learning_hep/processerdhadrons.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/machine_learning_hep/analysis/analyzerdhadrons.py b/machine_learning_hep/analysis/analyzerdhadrons.py index 0a86b1ab8d..e80204f115 100644 --- a/machine_learning_hep/analysis/analyzerdhadrons.py +++ b/machine_learning_hep/analysis/analyzerdhadrons.py @@ -291,7 +291,7 @@ def fit(self): lpt_probcutfin[ipt] = self.lpt_probcutfin_tmp[self.bin_matching[ipt]] self.logger.debug("fitting %s - %i", level, ipt) roows = self.roows.get(ipt) - if self.mltype == "MultiClassification": # pylint: disable=duplicate-code + if self.mltype == "MultiClassification": suffix = "%s%d_%d_%.2f%.2f%.2f" % ( self.v_var_binning, self.lpt_finbinmin[ipt], diff --git a/machine_learning_hep/processerdhadrons.py b/machine_learning_hep/processerdhadrons.py index 83af88ecc9..20f96dde33 100755 --- a/machine_learning_hep/processerdhadrons.py +++ b/machine_learning_hep/processerdhadrons.py @@ -153,7 +153,7 @@ def process_histomass_single(self, index): df_ptmerged = pd.concat([df_ptmerged, df], ignore_index=True) - if self.mltype == "MultiClassification": # pylint: disable=duplicate-code + if self.mltype == "MultiClassification": suffix = "%s%d_%d_%.2f%.2f%.2f" % ( self.v_var_binning, self.lpt_finbinmin[ipt], From 3b2c568ecb542138ad6283b68b3a125486f9b21d Mon Sep 17 00:00:00 2001 From: saganatt <8majak8@gmail.com> Date: Wed, 24 Sep 2025 12:12:07 +0200 Subject: [PATCH 12/14] Add missing ptshape init --- machine_learning_hep/analysis/analyzerdhadrons.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/machine_learning_hep/analysis/analyzerdhadrons.py b/machine_learning_hep/analysis/analyzerdhadrons.py index e80204f115..bac1a157ca 100644 --- a/machine_learning_hep/analysis/analyzerdhadrons.py +++ b/machine_learning_hep/analysis/analyzerdhadrons.py @@ -149,6 +149,8 @@ def __init__(self, datap, case, typean, period): self.root_objects = [] + self.do_ptshape = datap.get("do_ptshape", False) + # Fitting self.p_performval = datap["analysis"].get("event_cand_validation", None) @@ -453,7 +455,7 @@ def do_eff(gen_hist, sel_hist, histname, outname, eff_case): do_eff("h_gen_pr", "h_sel_pr", "eff", "Eff", "prompt") do_eff("h_gen_fd", "h_sel_fd", "eff_fd", "EffFD", "feed-down") - if self.do_ptshape: # pylint: disable=no-member + if self.do_ptshape: do_eff("h_gen_fd_ptshape", "h_sel_fd_ptshape", "eff_fd_ptshape", "EffFDPtShape", "feed-down") @staticmethod From e45b4e01f7f75dc65acddbe116602983c620806a Mon Sep 17 00:00:00 2001 From: saganatt <8majak8@gmail.com> Date: Wed, 24 Sep 2025 12:24:57 +0200 Subject: [PATCH 13/14] Add ptshape suffix to input histos and output files in makenormyields --- machine_learning_hep/analysis/analyzerdhadrons.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/machine_learning_hep/analysis/analyzerdhadrons.py b/machine_learning_hep/analysis/analyzerdhadrons.py index bac1a157ca..0d5b35dfeb 100644 --- a/machine_learning_hep/analysis/analyzerdhadrons.py +++ b/machine_learning_hep/analysis/analyzerdhadrons.py @@ -486,11 +486,13 @@ def makenormyields(self): # pylint: disable=import-outside-toplevel, too-many-b if not os.path.exists(fileouteff): self.logger.fatal("Efficiency file %s could not be found", fileouteff) - fileoutcross = f"{self.d_resultsallpdata}/finalcross{self.case}{self.typean}.root" + ptshape = "_ptshape" if self.do_ptshape else "" + fileoutcross = f"{self.d_resultsallpdata}/finalcross{self.case}{self.typean}{ptshape}.root" namehistoeffprompt = "eff" - namehistoefffeed = "eff_fd" + namehistoefffeed = f"eff_fd{ptshape}" nameyield = "hyields0" + self.logger.info("Using efficiency histos %s, %s", namehistoeffprompt, namehistoefffeed) histonorm = TH1F("histonorm", "histonorm", 1, 0, 1) @@ -537,7 +539,8 @@ def makenormyields(self): # pylint: disable=import-outside-toplevel, too-many-b fileoutcross, ) - fileoutcrosstot = TFile.Open(f"{self.d_resultsallpdata}/finalcross{self.case}{self.typean}tot.root", "recreate") + fileoutcrosstot = TFile.Open(f"{self.d_resultsallpdata}/finalcross{self.case}{self.typean}tot{ptshape}.root", + "recreate") f_fileoutcross = TFile.Open(fileoutcross) if f_fileoutcross: From 3aeb5676447723ebae5cf512290ee3a41c69cb34 Mon Sep 17 00:00:00 2001 From: saganatt <8majak8@gmail.com> Date: Fri, 3 Oct 2025 12:12:00 +0200 Subject: [PATCH 14/14] Mega linter autofix --- machine_learning_hep/processer.py | 2 +- machine_learning_hep/processerdhadrons.py | 16 +++++--------- machine_learning_hep/utilities.py | 26 ++++++++--------------- 3 files changed, 15 insertions(+), 29 deletions(-) diff --git a/machine_learning_hep/processer.py b/machine_learning_hep/processer.py index 7dc7e2e32f..678e83fe33 100644 --- a/machine_learning_hep/processer.py +++ b/machine_learning_hep/processer.py @@ -417,7 +417,7 @@ def dfread(rdir, trees, cols, idx_name=None): if idx_name: # df.rename_axis(idx_name, inplace=True) df[idx_name] = df.index - df.set_index(["df", idx_name], inplace=True) + df = df.set_index(["df", idx_name]) return df except Exception as e: self.logger.exception("Failed to read data from trees: %s", str(e)) diff --git a/machine_learning_hep/processerdhadrons.py b/machine_learning_hep/processerdhadrons.py index 20f96dde33..ce03c2f472 100755 --- a/machine_learning_hep/processerdhadrons.py +++ b/machine_learning_hep/processerdhadrons.py @@ -92,8 +92,8 @@ def __init__( self.p_mass_fit_lim = datap["analysis"][self.typean]["mass_fit_lim"] self.p_bin_width = datap["analysis"][self.typean]["bin_width"] limits_mass = datap["analysis"][self.typean]["mass_fit_lim"] - nbins_mass = int(round((limits_mass[1] - limits_mass[0]) / self.p_bin_width)) - self.p_num_bins = int(round((self.p_mass_fit_lim[1] - self.p_mass_fit_lim[0]) / self.p_bin_width)) + nbins_mass = round((limits_mass[1] - limits_mass[0]) / self.p_bin_width) + self.p_num_bins = round((self.p_mass_fit_lim[1] - self.p_mass_fit_lim[0]) / self.p_bin_width) self.s_presel_gen_eff = datap["analysis"][self.typean]["presel_gen_eff"] self.lpt_finbinmin = datap["analysis"][self.typean]["sel_an_binmin"] @@ -113,10 +113,7 @@ def process_histomass_single(self, index): myfile = TFile.Open(self.l_histomass[index], "recreate") dfevtorig = read_df(self.l_evtorig[index]) neventsorig = len(dfevtorig) - if self.s_evtsel is not None: - dfevtevtsel = dfevtorig.query(self.s_evtsel) - else: - dfevtevtsel = dfevtorig + dfevtevtsel = dfevtorig.query(self.s_evtsel) if self.s_evtsel is not None else dfevtorig neventsafterevtsel = len(dfevtevtsel) # validation plot for event selection @@ -246,14 +243,11 @@ def do_eff_single(ipt, bin_id, df_mc_gen, df_mc_reco, sel_column, bincounter, hi df_gen_sel = df_mc_gen.loc[(df_mc_gen[sel_column] == 1) & (df_mc_gen.ismcsignal == 1)] df_reco_presel = df_mc_reco.loc[(df_mc_reco[sel_column] == 1) & (df_mc_reco.ismcsignal == 1)] df_reco_sel = None - if self.doml is True: - df_reco_sel = df_reco_presel.query(self.l_selml[bin_id]) - else: - df_reco_sel = df_reco_presel.copy() + df_reco_sel = df_reco_presel.query(self.l_selml[bin_id]) if self.doml is True else df_reco_presel.copy() if self.do_custom_analysis_cuts: df_reco_sel = self.apply_cuts_ptbin(df_reco_sel, ipt) - for df, hist in zip((df_gen_sel, df_reco_presel, df_reco_sel), hists): + for df, hist in zip((df_gen_sel, df_reco_presel, df_reco_sel), hists, strict=False): val = len(df) err = math.sqrt(val) hist.SetBinContent(bincounter + 1, val) diff --git a/machine_learning_hep/utilities.py b/machine_learning_hep/utilities.py index 8af345e276..e6c720c730 100644 --- a/machine_learning_hep/utilities.py +++ b/machine_learning_hep/utilities.py @@ -128,10 +128,7 @@ def write_df(dfo, path): def read_df(path, **kwargs): try: - if path.endswith(".parquet"): - df = pd.read_parquet(path, **kwargs) - else: - df = pickle.load(openfile(path, "rb")) + df = pd.read_parquet(path, **kwargs) if path.endswith(".parquet") else pickle.load(openfile(path, "rb")) except Exception as e: # pylint: disable=broad-except logger.critical("failed to open file <%s>: %s", path, str(e)) sys.exit() @@ -160,7 +157,7 @@ def conv_none(value): mask_by_value_dict[mc["column"]] = {mc["find_by_value"]: conv_none(mc["mask_with"])} # Mask all we can do by value - df_to_mask.replace(mask_by_value_dict, inplace=True) + df_to_mask = df_to_mask.replace(mask_by_value_dict) # Now mask all which are searched by query for mc in mask_by_query_list: @@ -308,12 +305,12 @@ def make_latex_table(column_names, row_names, rows, caption=None, save_path="./t columns = "|".join(["c"] * (len(column_names) + 1)) f.write("\\begin{tabular}{" + columns + "}\n") f.write("\\hline\n") - columns = "&".join([""] + column_names) + columns = "&".join(["", *column_names]) columns = columns.replace("_", "\\_") f.write(columns + "\\\\\n") f.write("\\hline\\hline\n") - for rn, row in zip(row_names, rows): - row_string = "&".join([rn] + row) + for rn, row in zip(row_names, rows, strict=False): + row_string = "&".join([rn, *row]) row_string = row_string.replace("_", "\\_") f.write(row_string + "\\\\\n") f.write("\\end{tabular}\n") @@ -411,10 +408,7 @@ def equal_axis_list(axis1, list2, precision=10): bins = get_bins(axis1) if len(bins) != len(list2): return False - for i, j in zip(bins, list2): - if round(i, precision) != round(j, precision): - return False - return True + return all(round(i, precision) == round(j, precision) for i, j in zip(bins, list2, strict=False)) def equal_binning(his1, his2): @@ -432,9 +426,7 @@ def equal_binning_lists(his, list_x=None, list_y=None, list_z=None): return False if list_y is not None and not equal_axis_list(his.GetYaxis(), list_y): return False - if list_z is not None and not equal_axis_list(his.GetZaxis(), list_z): - return False - return True + return not (list_z is not None and not equal_axis_list(his.GetZaxis(), list_z)) def folding(h_input, response_matrix, h_output): @@ -943,13 +935,13 @@ def plot_latex(latex): # set canvas margins if isinstance(margins_c, list) and len(margins_c) > 0: for setter, value in zip( - [can.SetBottomMargin, can.SetLeftMargin, can.SetTopMargin, can.SetRightMargin], margins_c + [can.SetBottomMargin, can.SetLeftMargin, can.SetTopMargin, can.SetRightMargin], margins_c, strict=False ): setter(value) # set logarithmic scale for selected axes log_y = False if isinstance(logscale, str) and len(logscale) > 0: - for setter, axis in zip([can.SetLogx, can.SetLogy, can.SetLogz], ["x", "y", "z"]): + for setter, axis in zip([can.SetLogx, can.SetLogy, can.SetLogz], ["x", "y", "z"], strict=False): if axis in logscale: setter() if axis == "y":