Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 47 additions & 56 deletions machine_learning_hep/analysis/analyzerdhadrons.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,13 @@
from machine_learning_hep.utils.hist import get_dim, project_hist

# pylint: disable=too-few-public-methods, too-many-instance-attributes, too-many-statements, fixme
# pylint: disable=consider-using-enumerate fixme
# pylint: disable=consider-using-enumerate, missing-function-docstring


class AnalyzerDhadrons(Analyzer): # pylint: disable=invalid-name
"""
An analyzer for D and L hadrons.
"""
species = "analyzer"

def __init__(self, datap, case, typean, period):
Expand Down Expand Up @@ -146,6 +149,8 @@ def __init__(self, datap, case, typean, period):

self.root_objects = []

self.do_ptshape = datap.get("do_ptshape", False)

# Fitting
self.p_performval = datap["analysis"].get("event_cand_validation", None)

Expand Down Expand Up @@ -418,60 +423,43 @@ def efficiency(self):
print(self.n_fileff)
lfileeff = TFile.Open(self.n_fileff)
lfileeff.ls()
fileouteff = TFile.Open(f"{self.d_resultsallpmc}/{self.efficiency_filename}{self.case}{self.typean}.root", "recreate")
cEff = TCanvas("cEff", "The Fit Canvas")
cEff.SetCanvasSize(1900, 1500)
cEff.SetWindowSize(500, 500)

legeff = TLegend(0.5, 0.65, 0.7, 0.85)
legeff.SetBorderSize(0)
legeff.SetFillColor(0)
legeff.SetFillStyle(0)
legeff.SetTextFont(42)
legeff.SetTextSize(0.035)

h_gen_pr = lfileeff.Get("h_gen_pr")
h_sel_pr = lfileeff.Get("h_sel_pr")
h_sel_pr.Divide(h_sel_pr, h_gen_pr, 1.0, 1.0, "B")
h_sel_pr.Draw("same")
fileouteff.cd()
h_sel_pr.SetName("eff")
h_sel_pr.Write()
h_sel_pr.GetXaxis().SetTitle("#it{p}_{T} (GeV/#it{c})")
h_sel_pr.GetYaxis().SetTitle(f"Acc x efficiency (prompt) {self.p_latexnhadron} {self.typean} (1/GeV)")
h_sel_pr.SetMinimum(0.001)
h_sel_pr.SetMaximum(1.0)
gPad.SetLogy()
cEff.SaveAs(f"{self.d_resultsallpmc}/Eff{self.case}{self.typean}.eps")

cEffFD = TCanvas("cEffFD", "The Fit Canvas")
cEffFD.SetCanvasSize(1900, 1500)
cEffFD.SetWindowSize(500, 500)

legeffFD = TLegend(0.5, 0.65, 0.7, 0.85)
legeffFD.SetBorderSize(0)
legeffFD.SetFillColor(0)
legeffFD.SetFillStyle(0)
legeffFD.SetTextFont(42)
legeffFD.SetTextSize(0.035)

h_gen_fd = lfileeff.Get("h_gen_fd")
h_sel_fd = lfileeff.Get("h_sel_fd")
h_sel_fd.Divide(h_sel_fd, h_gen_fd, 1.0, 1.0, "B")
h_sel_fd.Draw("same")
fileouteff.cd()
h_sel_fd.SetName("eff_fd")
h_sel_fd.Write()
h_sel_fd.GetXaxis().SetTitle("#it{p}_{T} (GeV/#it{c})")
h_sel_fd.GetYaxis().SetTitle(f"Acc x efficiency feed-down {self.p_latexnhadron} {self.typean} (1/GeV)")
h_sel_fd.SetMinimum(0.001)
h_sel_fd.SetMaximum(1.0)
gPad.SetLogy()
legeffFD.Draw()
cEffFD.SaveAs(f"{self.d_resultsallpmc}/EffFD{self.case}{self.typean}.eps")
fileouteff = TFile.Open(f"{self.d_resultsallpmc}/{self.efficiency_filename}{self.case}{self.typean}.root",
"recreate")

def do_eff(gen_hist, sel_hist, histname, outname, eff_case):
    """
    Build one efficiency histogram, store it and save a plot of it.

    The selected histogram is divided in place by the generated one
    (ROOT option "B"), renamed to `histname`, written to `fileouteff`
    and drawn on a canvas that is saved as an EPS file.

    Args:
        gen_hist: key of the generated-level histogram in `lfileeff`.
        sel_hist: key of the selected histogram in `lfileeff`.
        histname: name given to the ratio histogram before writing it.
        outname: tag used in the canvas name and in the EPS file name.
        eff_case: label inserted into the y-axis title (e.g. "prompt").
    """
    # Fetch the inputs first so that a missing histogram is reported by name
    # instead of surfacing later as an obscure failure inside Divide().
    h_gen = lfileeff.Get(gen_hist)
    h_sel = lfileeff.Get(sel_hist)
    if not h_gen or not h_sel:
        # NOTE(review): assumes logger.fatal aborts the run - confirm.
        self.logger.fatal("Missing histogram %s or %s in %s", gen_hist, sel_hist, self.n_fileff)

    cEff = TCanvas(f"c{outname}", "The Fit Canvas")
    cEff.SetCanvasSize(1900, 1500)
    cEff.SetWindowSize(500, 500)

    # NOTE(review): no entry is ever added to this legend, so it is drawn empty.
    legeff = TLegend(0.5, 0.65, 0.7, 0.85)
    legeff.SetBorderSize(0)
    legeff.SetFillColor(0)
    legeff.SetFillStyle(0)
    legeff.SetTextFont(42)
    legeff.SetTextSize(0.035)

    # h_sel becomes the efficiency: selected / generated.
    h_sel.Divide(h_sel, h_gen, 1.0, 1.0, "B")
    h_sel.Draw("same")
    # Make the output file the current directory before writing.
    fileouteff.cd()
    h_sel.SetName(histname)
    h_sel.Write()
    # The cosmetics below are applied after Write(), so they only affect the plot.
    h_sel.GetXaxis().SetTitle("#it{p}_{T} (GeV/#it{c})")
    h_sel.GetYaxis().SetTitle(f"Acc x efficiency ({eff_case}) {self.p_latexnhadron} {self.typean} (1/GeV)")
    h_sel.SetMinimum(0.001)
    h_sel.SetMaximum(1.0)
    gPad.SetLogy()
    legeff.Draw()
    cEff.SaveAs(f"{self.d_resultsallpmc}/{outname}{self.case}{self.typean}.eps")

do_eff("h_gen_pr", "h_sel_pr", "eff", "Eff", "prompt")
do_eff("h_gen_fd", "h_sel_fd", "eff_fd", "EffFD", "feed-down")
if self.do_ptshape:
do_eff("h_gen_fd_ptshape", "h_sel_fd_ptshape", "eff_fd_ptshape", "EffFDPtShape", "feed-down")

@staticmethod
def calculate_norm(logger, hevents, hselevents): # TO BE FIXED WITH EV SEL
def calculate_norm(logger, hevents, hselevents): # TO BE FIXED WITH EV SEL
if not hevents:
# pylint: disable=undefined-variable
logger.error("Missing hevents")
Expand All @@ -498,11 +486,13 @@ def makenormyields(self): # pylint: disable=import-outside-toplevel, too-many-b
if not os.path.exists(fileouteff):
self.logger.fatal("Efficiency file %s could not be found", fileouteff)

fileoutcross = f"{self.d_resultsallpdata}/finalcross{self.case}{self.typean}.root"
ptshape = "_ptshape" if self.do_ptshape else ""
fileoutcross = f"{self.d_resultsallpdata}/finalcross{self.case}{self.typean}{ptshape}.root"

namehistoeffprompt = "eff"
namehistoefffeed = "eff_fd"
namehistoefffeed = f"eff_fd{ptshape}"
nameyield = "hyields0"
self.logger.info("Using efficiency histos %s, %s", namehistoeffprompt, namehistoefffeed)

histonorm = TH1F("histonorm", "histonorm", 1, 0, 1)

Expand Down Expand Up @@ -549,7 +539,8 @@ def makenormyields(self): # pylint: disable=import-outside-toplevel, too-many-b
fileoutcross,
)

fileoutcrosstot = TFile.Open(f"{self.d_resultsallpdata}/finalcross{self.case}{self.typean}tot.root", "recreate")
fileoutcrosstot = TFile.Open(f"{self.d_resultsallpdata}/finalcross{self.case}{self.typean}tot{ptshape}.root",
"recreate")

f_fileoutcross = TFile.Open(fileoutcross)
if f_fileoutcross:
Expand Down
101 changes: 79 additions & 22 deletions machine_learning_hep/processer.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,15 +42,21 @@
mergerootfiles,
openfile,
read_df,
reweight,
seldf_singlevar,
write_df,
)
from .utilities_files import appendmainfoldertolist, create_folder_struc, createlist, list_folders

pd.options.mode.chained_assignment = None

# pylint: disable=missing-function-docstring


class Processer: # pylint: disable=too-many-instance-attributes
"""
The main class for data processing, machine learning, and analysis.
"""
# Class Attribute
species = "processer"
logger = get_logger()
Expand Down Expand Up @@ -149,6 +155,7 @@ def __init__(
self.n_fileeff = datap["files_names"]["efffilename"]
self.n_fileresp = datap["files_names"]["respfilename"]
self.n_mcreweights = datap["files_names"]["namefile_mcweights"]
self.n_weights = datap["files_names"]["histoweights"]

# selections
self.s_reco_skim = datap["sel_reco_skim"]
Expand All @@ -172,6 +179,8 @@ def __init__(
self.v_ismcbkg = datap["bitmap_sel"]["var_ismcbkg"] # used in hadrons
self.v_ismcrefl = datap["bitmap_sel"]["var_ismcrefl"] # used in hadrons
self.v_var_binning = datap["var_binning"]
self.do_ptshape = datap.get("do_ptshape", False)
self.v_var_binning_ptshape = datap.get("var_binning_ptshape", None)
self.v_invmass = datap["variables"].get("var_inv_mass", "inv_mass")
# self.v_rapy = datap["variables"].get("var_y", "y_cand")

Expand Down Expand Up @@ -202,6 +211,11 @@ def __init__(
self.l_gen_sl = createlist(self.d_pkl, self.l_path, self.n_gen_sl)
self.f_totevt = os.path.join(self.d_pkl, self.n_evt)
self.f_totevtorig = os.path.join(self.d_pkl, self.n_evtorig)
self.f_weights = os.path.join(self.d_mcreweights, self.n_mcreweights)

if self.do_ptshape:
with uproot.open(self.f_weights) as fin:
self.v_hist_weights = fin[self.n_weights].to_numpy()

self.p_modelname = datap["mlapplication"]["modelname"]
# Analysis pT bins
Expand Down Expand Up @@ -293,7 +307,7 @@ def __init__(
)

self.lpt_recodec = None
if self.doml is True:
if self.doml:
if self.mltype == "MultiClassification":
self.lpt_recodec = [
self.n_reco.replace(
Expand Down Expand Up @@ -347,6 +361,22 @@ def __init__(
# self.triggerbit = datap["analysis"][self.typean]["triggerbit"]
self.runlistrigger = runlisttrigger

if self.do_ptshape and self.mcordata == "mc":
lpt_recosk_ptshape = [None] * self.p_nptbins
lpt_gensk_ptshape = [None] * self.p_nptbins
lpt_recodec_ptshape = [None] * self.p_nptbins

for ipt in range(self.p_nptbins):
lpt_recosk_ptshape[ipt] = self.v_var_binning_ptshape + self.lpt_recosk[ipt]
lpt_gensk_ptshape[ipt] = self.v_var_binning_ptshape + self.lpt_gensk[ipt]
lpt_recodec_ptshape[ipt] = self.v_var_binning_ptshape + self.lpt_recodec[ipt]
self.mptfiles_recosk_ptshape = [createlist(d_pklsk, self.l_path, \
lpt_recosk_ptshape[ipt]) for ipt in range(self.p_nptbins)]
self.mptfiles_gensk_ptshape = [createlist(d_pklsk, self.l_path, \
lpt_gensk_ptshape[ipt]) for ipt in range(self.p_nptbins)]
self.mptfiles_recoskmldec_ptshape = [createlist(self.d_pkl_dec, self.l_path, \
lpt_recodec_ptshape[ipt]) for ipt in range(self.p_nptbins)]

# if os.path.exists(self.d_root) is False:
# self.logger.warning("ROOT tree folder is not there. Is it intentional?")

Expand Down Expand Up @@ -387,7 +417,7 @@ def dfread(rdir, trees, cols, idx_name=None):
if idx_name:
# df.rename_axis(idx_name, inplace=True)
df[idx_name] = df.index
df.set_index(["df", idx_name], inplace=True)
df = df.set_index(["df", idx_name])
return df
except Exception as e:
self.logger.exception("Failed to read data from trees: %s", str(e))
Expand Down Expand Up @@ -474,7 +504,8 @@ def dfuse(df_spec):
dfs[df_name][tag] = (var == value["req"]).astype(int)

# dfs[df_name][tag] = np.array(
# tag_bit_df(dfs[df_name], value["var"], value["req"], value.get("abs", False)), dtype=int)
# tag_bit_df(dfs[df_name], value["var"], value["req"], value.get("abs", False)),
# dtype=int)

if "swap" in df_spec:
self.logger.debug(" %s -> swap", df_name)
Expand Down Expand Up @@ -531,34 +562,46 @@ def dfuse(df_spec):
path = os.path.join(self.d_pkl, self.l_path[file_index], df_spec["file"])
write_df(dfo, path)

def skim(self, file_index):
dfreco = read_df(self.l_reco[file_index])
dfgen = read_df(self.l_gen[file_index]) if self.mcordata == "mc" else None
dfgen_sl = read_df(self.l_gen_sl[file_index]) if self.n_gen_sl and self.mcordata == "mc" else None

def do_skim(self, dfreco, dfgen, var_binning, filenames_reco, filenames_gen, file_index):
    """
    Skim reconstructed (and, if given, generated) candidates per pT bin.

    For every analysis pT bin the input dataframe is restricted to the bin
    range in `var_binning`, filtered with the bin's skimming query and
    written to the matching output file.

    Args:
        dfreco: dataframe of reconstructed candidates.
        dfgen: dataframe of generated candidates, or None to skip it.
        var_binning: name of the variable the bin limits are applied to.
        filenames_reco: output paths for reco, indexed [ipt][file_index].
        filenames_gen: output paths for gen, indexed [ipt][file_index].
        file_index: index of the input file being processed.
    """
    for ipt in range(self.p_nptbins):
        bin_min = self.lpt_anbinmin[ipt]
        bin_max = self.lpt_anbinmax[ipt]

        # Only the caller-supplied binning variable and destinations are used,
        # so the same routine serves both the nominal and the pt-shape pass.
        dfrecosk = seldf_singlevar(dfreco, var_binning, bin_min, bin_max)
        dfrecosk = dfquery(dfrecosk, self.s_reco_skim[ipt])
        write_df(dfrecosk, filenames_reco[ipt][file_index])

        if dfgen is not None:
            dfgensk = seldf_singlevar(dfgen, var_binning, bin_min, bin_max)
            dfgensk = dfquery(dfgensk, self.s_gen_skim[ipt])
            write_df(dfgensk, filenames_gen[ipt][file_index])

def skim(self, file_index):
    """
    Skim the dataframes of one input file into per-pT-bin files.

    Reads the reco dataframe and, for MC, the gen dataframe(s); skims the
    optional `gen_sl` dataframe bin by bin, then delegates the reco/gen
    skimming to `do_skim`. When pt-shape reweighting is enabled for MC, the
    reco and gen dataframes are passed through `reweight` and skimmed a
    second time using the pt-shape binning variable and output files.

    Args:
        file_index: index of the input file to process.
    """
    dfreco = read_df(self.l_reco[file_index])
    is_mc = self.mcordata == "mc"
    dfgen = read_df(self.l_gen[file_index]) if is_mc else None
    dfgen_sl = None
    if self.n_gen_sl and is_mc:
        dfgen_sl = read_df(self.l_gen_sl[file_index])

    if dfgen_sl is not None:
        for ipt in range(self.p_nptbins):
            skimmed_sl = seldf_singlevar(
                dfgen_sl, self.v_var_binning, self.lpt_anbinmin[ipt], self.lpt_anbinmax[ipt]
            )
            skimmed_sl = dfquery(skimmed_sl, self.s_gen_skim[ipt])
            write_df(skimmed_sl, self.mptfiles_gensk_sl[ipt][file_index])

    # Nominal skim.
    self.do_skim(dfreco, dfgen, self.v_var_binning, self.mptfiles_recosk, self.mptfiles_gensk, file_index)

    if self.do_ptshape and self.mcordata == 'mc':
        # NOTE(review): presumably reweight() updates the dataframe in place
        # (its return value is not used) - confirm against its definition.
        for frame in (dfreco, dfgen):
            reweight(self.v_hist_weights, frame, self.v_var_binning, self.v_var_binning_ptshape)

        # Second skim on the pt-shape variable, written to the pt-shape files.
        self.do_skim(
            dfreco,
            dfgen,
            self.v_var_binning_ptshape,
            self.mptfiles_recosk_ptshape,
            self.mptfiles_gensk_ptshape,
            file_index,
        )


# pylint: disable=too-many-branches
def applymodel(self, file_index):
for ipt in range(self.p_nptbins):
if os.path.exists(self.mptfiles_recoskmldec[ipt][file_index]):
if os.stat(self.mptfiles_recoskmldec[ipt][file_index]).st_size != 0:
continue
dfrecosk = read_df(self.mptfiles_recosk[ipt][file_index])
def do_apply_model(in_filename, out_filename, ipt):
dfrecosk = read_df(in_filename)
if self.p_mask_values:
mask_df(dfrecosk, self.p_mask_values)
if self.doml is True:
Expand All @@ -584,7 +627,21 @@ def applymodel(self, file_index):
dfrecoskml = dfrecoskml.loc[dfrecoskml[probvar] > self.lpt_probcutpre[ipt]]
else:
dfrecoskml = dfrecosk.query("isstd == 1")
write_df(dfrecoskml, self.mptfiles_recoskmldec[ipt][file_index])
write_df(dfrecoskml, out_filename)

for ipt in range(self.p_nptbins):
if os.path.exists(self.mptfiles_recoskmldec[ipt][file_index]):
if os.stat(self.mptfiles_recoskmldec[ipt][file_index]).st_size != 0:
continue

do_apply_model(self.mptfiles_recosk[ipt][file_index],
self.mptfiles_recoskmldec[ipt][file_index],
ipt)

if self.do_ptshape and self.mcordata == 'mc':
do_apply_model(self.mptfiles_recosk_ptshape[ipt][file_index],
self.mptfiles_recoskmldec_ptshape[ipt][file_index],
ipt)

@staticmethod
def callback(ex):
Expand Down Expand Up @@ -717,8 +774,8 @@ def apply_cut_for_ipt(df_full, ipt: int):

def process_histomass(self):
self.logger.debug("Doing masshisto %s %s", self.mcordata, self.period)
self.logger.debug("Using run selection for mass histo %s %s %s", self.runlistrigger, "for period", self.period)
if self.doml is True:
self.logger.debug("Using run selection for mass histo %s for period %s", self.runlistrigger, self.period)
if self.doml:
self.logger.debug("Doing ml analysis")
elif self.do_custom_analysis_cuts:
self.logger.debug("Using custom cuts")
Expand All @@ -736,8 +793,8 @@ def process_histomass(self):

def process_efficiency(self):
    """
    Efficiency step entry point; begins by printing the active configuration.
    """
    print("Doing efficiencies", self.mcordata, self.period)
    # print() does not interpolate logging-style "%s" placeholders, so the
    # message is formatted explicitly.
    print(f"Using run selection for eff histo {self.runlistrigger} for period {self.period}")
    if self.doml:
        print("Doing ml analysis")
    elif self.do_custom_analysis_cuts:
        print("Using custom cuts")
Expand Down
Loading
Loading