From aa4340d97f554c7434a73a58a02fc4ee994f33dc Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 17 Nov 2025 10:59:52 -0800 Subject: [PATCH 001/150] Moved readtsq to tdt_step2.py. --- src/guppy/saveStoresList.py | 21 ++------------------- src/guppy/tdt_step2.py | 26 ++++++++++++++++++++++++++ 2 files changed, 28 insertions(+), 19 deletions(-) create mode 100644 src/guppy/tdt_step2.py diff --git a/src/guppy/saveStoresList.py b/src/guppy/saveStoresList.py index ed3a7cf..a837aa9 100755 --- a/src/guppy/saveStoresList.py +++ b/src/guppy/saveStoresList.py @@ -21,6 +21,8 @@ import panel as pn from numpy import float32, float64, int32, int64, uint16 +from guppy.tdt_step2 import readtsq + # hv.extension() pn.extension() @@ -86,25 +88,6 @@ def check_header(df): return arr, check_float -# function to read 'tsq' file -def readtsq(filepath): - names = ("size", "type", "name", "chan", "sort_code", "timestamp", "fp_loc", "strobe", "format", "frequency") - formats = (int32, int32, "S4", uint16, uint16, float64, int64, float64, int32, float32) - offsets = 0, 4, 8, 12, 14, 16, 24, 24, 32, 36 - tsq_dtype = np.dtype({"names": names, "formats": formats, "offsets": offsets}, align=True) - path = glob.glob(os.path.join(filepath, "*.tsq")) - if len(path) > 1: - logger.error("Two tsq files are present at the location.") - raise Exception("Two tsq files are present at the location.") - elif len(path) == 0: - return 0 - else: - path = path[0] - tsq = np.fromfile(path, dtype=tsq_dtype) - df = pd.DataFrame(tsq) - return df - - # function to show GUI and save def saveStorenames(inputParameters, data, event_name, flag, filepath): diff --git a/src/guppy/tdt_step2.py b/src/guppy/tdt_step2.py new file mode 100644 index 0000000..09456a7 --- /dev/null +++ b/src/guppy/tdt_step2.py @@ -0,0 +1,26 @@ +import glob +import logging +import os +import numpy as np +from numpy import float32, float64, int32, int64, uint16 +import pandas as pd + +logger = logging.getLogger(__name__) + +# function to read 'tsq' file +def readtsq(filepath): + names = ("size", "type", "name", "chan", "sort_code", "timestamp", "fp_loc", "strobe", "format", "frequency") + formats = (int32, int32, "S4", uint16, uint16, float64, int64, float64, int32, float32) + offsets = 0, 4, 8, 12, 14, 16, 24, 24, 32, 36 + tsq_dtype = np.dtype({"names": names, "formats": formats, "offsets": offsets}, align=True) + path = glob.glob(os.path.join(filepath, "*.tsq")) + if len(path) > 1: + logger.error("Two tsq files are present at the location.") + raise Exception("Two tsq files are present at the location.") + elif len(path) == 0: + return 0 + else: + path = path[0] + tsq = np.fromfile(path, dtype=tsq_dtype) + df = pd.DataFrame(tsq) + return df \ No newline at end of file From c868823138945399df1fb2b043f1026b2099e3b6 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 17 Nov 2025 11:19:34 -0800 Subject: [PATCH 002/150] Moved import_np_doric_csv to np_doric_csv_step2.py. 
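
This commit continues the step-2 extraction started in PATCH 001 (readtsq ->
tdt_step2.py) by moving import_np_doric_csv into np_doric_csv_step2.py. A
minimal usage sketch of the relocated helpers follows; the data folder and the
parameter values are illustrative placeholders, not paths or settings taken
from the repository.

    # Sketch: calling the extracted step-2 readers after this refactor.
    # "/path/to/session_folder" and the parameter values below are hypothetical.
    from guppy.tdt_step2 import readtsq
    from guppy.np_doric_csv_step2 import import_np_doric_csv

    filepath = "/path/to/session_folder"

    # readtsq returns a DataFrame of TDT *.tsq headers, or 0 if no *.tsq file is found.
    data = readtsq(filepath)

    # import_np_doric_csv scans the folder for NPM/Doric/custom csv files and
    # returns the recognized event/store names plus a per-file flag describing its type.
    event_name, flag = import_np_doric_csv(
        filepath,
        isosbestic_control=False,  # illustrative value
        num_ch=2,                  # illustrative value
        inputParameters=None,
    )
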
--- src/guppy/np_doric_csv_step2.py | 523 ++++++++++++++++++++++++++++++++ src/guppy/saveStoresList.py | 498 +----------------------------- 2 files changed, 524 insertions(+), 497 deletions(-) create mode 100644 src/guppy/np_doric_csv_step2.py diff --git a/src/guppy/np_doric_csv_step2.py b/src/guppy/np_doric_csv_step2.py new file mode 100644 index 0000000..d06dcc1 --- /dev/null +++ b/src/guppy/np_doric_csv_step2.py @@ -0,0 +1,523 @@ +import glob +import logging +import os +import tkinter as tk +from tkinter import StringVar, messagebox, ttk + +import h5py +import numpy as np +import pandas as pd +import panel as pn + +pn.extension() + +logger = logging.getLogger(__name__) + +# function to see if there are 'csv' files present +# and recognize type of 'csv' files either from +# Neurophotometrics, Doric systems or custom made 'csv' files +# and read data accordingly +def import_np_doric_csv(filepath, isosbestic_control, num_ch, inputParameters=None): + + logger.debug("If it exists, importing either NPM or Doric or csv file based on the structure of file") + # Headless configuration (used to avoid any UI prompts when running tests) + headless = bool(os.environ.get("GUPPY_BASE_DIR")) + npm_timestamp_column_name = None + npm_time_unit = None + npm_split_events = None + if isinstance(inputParameters, dict): + npm_timestamp_column_name = inputParameters.get("npm_timestamp_column_name") + npm_time_unit = inputParameters.get("npm_time_unit", "seconds") + npm_split_events = inputParameters.get("npm_split_events", True) + path = sorted(glob.glob(os.path.join(filepath, "*.csv"))) + sorted(glob.glob(os.path.join(filepath, "*.doric"))) + path_chev = glob.glob(os.path.join(filepath, "*chev*")) + path_chod = glob.glob(os.path.join(filepath, "*chod*")) + path_chpr = glob.glob(os.path.join(filepath, "*chpr*")) + path_event = glob.glob(os.path.join(filepath, "event*")) + # path_sig = glob.glob(os.path.join(filepath, 'sig*')) + path_chev_chod_event = path_chev + path_chod + path_event + path_chpr + + path = sorted(list(set(path) - set(path_chev_chod_event))) + flag = "None" + event_from_filename = [] + flag_arr = [] + for i in range(len(path)): + dirname = os.path.dirname(path[i]) + ext = os.path.basename(path[i]).split(".")[-1] + if ext == "doric": + key_names = read_doric(path[i]) + event_from_filename.extend(key_names) + flag = "doric_doric" + else: + df = pd.read_csv(path[i], header=None, nrows=2, index_col=False, dtype=str) + df = df.dropna(axis=1, how="all") + df_arr = np.array(df).flatten() + check_all_str = [] + for element in df_arr: + try: + float(element) + except: + check_all_str.append(i) + if len(check_all_str) == len(df_arr): + df = pd.read_csv(path[i], header=1, index_col=False, nrows=10) + df = df.drop(["Time(s)"], axis=1) + event_from_filename.extend(list(df.columns)) + flag = "doric_csv" + logger.info(flag) + else: + df = pd.read_csv(path[i], index_col=False) + # with warnings.catch_warnings(): + # warnings.simplefilter("error") + # try: + # df = pd.read_csv(path[i], index_col=False, dtype=float) + # except: + # df = pd.read_csv(path[i], header=1, index_col=False, nrows=10) # to make process faster reading just first 10 rows + # df = df.drop(['Time(s)'], axis=1) + # event_from_filename.extend(list(df.columns)) + # flag = 'doric_csv' + if flag == "doric_csv" or flag == "doric_doric": + continue + else: + colnames, value = check_header(df) + # logger.info(len(colnames), len(value)) + + # check dataframe structure and read data accordingly + if len(value) > 0: + columns_isstr = False + df = 
pd.read_csv(path[i], header=None) + cols = np.array(list(df.columns), dtype=str) + else: + df = df + columns_isstr = True + cols = np.array(list(df.columns), dtype=str) + # check the structure of dataframe and assign flag to the type of file + if len(cols) == 1: + if cols[0].lower() != "timestamps": + logger.error("\033[1m" + "Column name should be timestamps (all lower-cases)" + "\033[0m") + raise Exception("\033[1m" + "Column name should be timestamps (all lower-cases)" + "\033[0m") + else: + flag = "event_csv" + elif len(cols) == 3: + arr1 = np.array(["timestamps", "data", "sampling_rate"]) + arr2 = np.char.lower(np.array(cols)) + if (np.sort(arr1) == np.sort(arr2)).all() == False: + logger.error( + "\033[1m" + + "Column names should be timestamps, data and sampling_rate (all lower-cases)" + + "\033[0m" + ) + raise Exception( + "\033[1m" + + "Column names should be timestamps, data and sampling_rate (all lower-cases)" + + "\033[0m" + ) + else: + flag = "data_csv" + elif len(cols) == 2: + flag = "event_or_data_np" + elif len(cols) >= 2: + flag = "data_np" + else: + logger.error("Number of columns in csv file does not make sense.") + raise Exception("Number of columns in csv file does not make sense.") + + if columns_isstr == True and ( + "flags" in np.char.lower(np.array(cols)) or "ledstate" in np.char.lower(np.array(cols)) + ): + flag = flag + "_v2" + else: + flag = flag + + # used assigned flags to process the files and read the data + if flag == "event_or_data_np": + arr = list(df.iloc[:, 1]) + check_float = [True for i in arr if isinstance(i, float)] + if len(arr) == len(check_float) and columns_isstr == False: + flag = "data_np" + elif columns_isstr == True and ("value" in np.char.lower(np.array(cols))): + flag = "event_np" + else: + flag = "event_np" + else: + pass + + flag_arr.append(flag) + logger.info(flag) + if flag == "event_csv" or flag == "data_csv": + name = os.path.basename(path[i]).split(".")[0] + event_from_filename.append(name) + elif flag == "data_np": + file = f"file{str(i)}_" + df, indices_dict, num_channels = decide_indices(file, df, flag, num_ch) + keys = list(indices_dict.keys()) + for k in range(len(keys)): + for j in range(df.shape[1]): + if j == 0: + timestamps = df.iloc[:, j][indices_dict[keys[k]]] + # timestamps_odd = df.iloc[:,j][odd_indices] + else: + d = dict() + d["timestamps"] = timestamps + d["data"] = df.iloc[:, j][indices_dict[keys[k]]] + + df_ch = pd.DataFrame(d) + df_ch.to_csv(os.path.join(dirname, keys[k] + str(j) + ".csv"), index=False) + event_from_filename.append(keys[k] + str(j)) + + elif flag == "event_np": + type_val = np.array(df.iloc[:, 1]) + type_val_unique = np.unique(type_val) + if headless: + response = 1 if bool(npm_split_events) else 0 + else: + window = tk.Tk() + if len(type_val_unique) > 1: + response = messagebox.askyesno( + "Multiple event TTLs", + "Based on the TTL file,\ + it looks like TTLs \ + belongs to multiple behavior type. 
\ + Do you want to create multiple files for each \ + behavior type ?", + ) + else: + response = 0 + window.destroy() + if response == 1: + timestamps = np.array(df.iloc[:, 0]) + for j in range(len(type_val_unique)): + idx = np.where(type_val == type_val_unique[j]) + d = dict() + d["timestamps"] = timestamps[idx] + df_new = pd.DataFrame(d) + df_new.to_csv(os.path.join(dirname, "event" + str(type_val_unique[j]) + ".csv"), index=False) + event_from_filename.append("event" + str(type_val_unique[j])) + else: + timestamps = np.array(df.iloc[:, 0]) + d = dict() + d["timestamps"] = timestamps + df_new = pd.DataFrame(d) + df_new.to_csv(os.path.join(dirname, "event" + str(0) + ".csv"), index=False) + event_from_filename.append("event" + str(0)) + else: + file = f"file{str(i)}_" + df, ts_unit = decide_ts_unit_for_npm( + df, timestamp_column_name=npm_timestamp_column_name, time_unit=npm_time_unit, headless=headless + ) + df, indices_dict, num_channels = decide_indices(file, df, flag) + keys = list(indices_dict.keys()) + for k in range(len(keys)): + for j in range(df.shape[1]): + if j == 0: + timestamps = df.iloc[:, j][indices_dict[keys[k]]] + # timestamps_odd = df.iloc[:,j][odd_indices] + else: + d = dict() + d["timestamps"] = timestamps + d["data"] = df.iloc[:, j][indices_dict[keys[k]]] + + df_ch = pd.DataFrame(d) + df_ch.to_csv(os.path.join(dirname, keys[k] + str(j) + ".csv"), index=False) + event_from_filename.append(keys[k] + str(j)) + + path_chev = glob.glob(os.path.join(filepath, "*chev*")) + path_chod = glob.glob(os.path.join(filepath, "*chod*")) + path_chpr = glob.glob(os.path.join(filepath, "*chpr*")) + path_event = glob.glob(os.path.join(filepath, "event*")) + # path_sig = glob.glob(os.path.join(filepath, 'sig*')) + path_chev_chod_chpr = [path_chev, path_chod, path_chpr] + if ( + ("data_np_v2" in flag_arr or "data_np" in flag_arr) + and ("event_np" in flag_arr) + and (i == len(path) - 1) + ) or ( + ("data_np_v2" in flag_arr or "data_np" in flag_arr) and (i == len(path) - 1) + ): # i==len(path)-1 and or 'event_np' in flag + num_path_chev, num_path_chod, num_path_chpr = len(path_chev), len(path_chod), len(path_chpr) + arr_len, no_ch = [], [] + for i in range(len(path_chev_chod_chpr)): + if len(path_chev_chod_chpr[i]) > 0: + arr_len.append(len(path_chev_chod_chpr[i])) + else: + continue + + unique_arr_len = np.unique(np.array(arr_len)) + if "data_np_v2" in flag_arr: + if ts_unit == "seconds": + divisor = 1 + elif ts_unit == "milliseconds": + divisor = 1e3 + else: + divisor = 1e6 + else: + divisor = 1000 + + for j in range(len(path_event)): + df_event = pd.read_csv(path_event[j]) + df_chev = pd.read_csv(path_chev[0]) + df_event["timestamps"] = (df_event["timestamps"] - df_chev["timestamps"][0]) / divisor + df_event.to_csv(path_event[j], index=False) + if unique_arr_len.shape[0] == 1: + for j in range(len(path_chev)): + if file + "chev" in indices_dict.keys(): + df_chev = pd.read_csv(path_chev[j]) + df_chev["timestamps"] = (df_chev["timestamps"] - df_chev["timestamps"][0]) / divisor + df_chev["sampling_rate"] = np.full(df_chev.shape[0], np.nan) + df_chev.at[0, "sampling_rate"] = df_chev.shape[0] / ( + df_chev["timestamps"].iloc[-1] - df_chev["timestamps"].iloc[0] + ) + df_chev.to_csv(path_chev[j], index=False) + + if file + "chod" in indices_dict.keys(): + df_chod = pd.read_csv(path_chod[j]) + df_chod["timestamps"] = df_chev["timestamps"] + df_chod["sampling_rate"] = np.full(df_chod.shape[0], np.nan) + df_chod.at[0, "sampling_rate"] = df_chev["sampling_rate"][0] + df_chod.to_csv(path_chod[j], 
index=False) + + if file + "chpr" in indices_dict.keys(): + df_chpr = pd.read_csv(path_chpr[j]) + df_chpr["timestamps"] = df_chev["timestamps"] + df_chpr["sampling_rate"] = np.full(df_chpr.shape[0], np.nan) + df_chpr.at[0, "sampling_rate"] = df_chev["sampling_rate"][0] + df_chpr.to_csv(path_chpr[j], index=False) + else: + logger.error("Number of channels should be same for all regions.") + raise Exception("Number of channels should be same for all regions.") + else: + pass + logger.info("Importing of either NPM or Doric or csv file is done.") + return event_from_filename, flag_arr + +# ---------------------------------------------------------------------------------------------------------------------- +# Functions that import_np_doric_csv uses +# ---------------------------------------------------------------------------------------------------------------------- + +def read_doric(filepath): + with h5py.File(filepath, "r") as f: + if "Traces" in list(f.keys()): + keys = access_keys_doricV1(f) + elif list(f.keys()) == ["Configurations", "DataAcquisition"]: + keys = access_keys_doricV6(f) + + return keys + + +def check_header(df): + arr = list(df.columns) + check_float = [] + for i in arr: + try: + check_float.append(float(i)) + except: + pass + + return arr, check_float + +# function to decide indices of interleaved channels +# in neurophotometrics data +def decide_indices(file, df, flag, num_ch=2): + ch_name = [file + "chev", file + "chod", file + "chpr"] + if len(ch_name) < num_ch: + logger.error( + "Number of channels parameters in Input Parameters GUI is more than 3. \ + Looks like there are more than 3 channels in the file. Reading of these files\ + are not supported. Reach out to us if you get this error message." + ) + raise Exception( + "Number of channels parameters in Input Parameters GUI is more than 3. \ + Looks like there are more than 3 channels in the file. Reading of these files\ + are not supported. Reach out to us if you get this error message." 
+ ) + if flag == "data_np": + indices_dict = dict() + for i in range(num_ch): + indices_dict[ch_name[i]] = np.arange(i, df.shape[0], num_ch) + + else: + cols = np.array(list(df.columns)) + if "flags" in np.char.lower(np.array(cols)): + arr = ["FrameCounter", "Flags"] + state = np.array(df["Flags"]) + elif "ledstate" in np.char.lower(np.array(cols)): + arr = ["FrameCounter", "LedState"] + state = np.array(df["LedState"]) + else: + logger.error( + "File type shows Neurophotometrics newer version \ + data but column names does not have Flags or LedState" + ) + raise Exception( + "File type shows Neurophotometrics newer version \ + data but column names does not have Flags or LedState" + ) + + num_ch, ch = check_channels(state) + indices_dict = dict() + for i in range(num_ch): + first_occurrence = np.where(state == ch[i])[0] + indices_dict[ch_name[i]] = np.arange(first_occurrence[0], df.shape[0], num_ch) + + df = df.drop(arr, axis=1) + + return df, indices_dict, num_ch + + +# function to decide NPM timestamps unit (seconds, ms or us) +def decide_ts_unit_for_npm(df, timestamp_column_name=None, time_unit=None, headless=False): + col_names = np.array(list(df.columns)) + col_names_ts = [""] + for name in col_names: + if "timestamp" in name.lower(): + col_names_ts.append(name) + + ts_unit = "seconds" + if len(col_names_ts) > 2: + # Headless path: auto-select column/unit without any UI + if headless: + if timestamp_column_name is not None: + assert ( + timestamp_column_name in col_names_ts + ), f"Provided timestamp_column_name '{timestamp_column_name}' not found in columns {col_names_ts[1:]}" + chosen = timestamp_column_name + else: + chosen = col_names_ts[1] + df.insert(1, "Timestamp", df[chosen]) + df = df.drop(col_names_ts[1:], axis=1) + valid_units = {"seconds", "milliseconds", "microseconds"} + ts_unit = time_unit if (isinstance(time_unit, str) and time_unit in valid_units) else "seconds" + return df, ts_unit + # def comboBoxSelected(event): + # logger.info(event.widget.get()) + + window = tk.Tk() + window.title("Select appropriate options for timestamps") + window.geometry("500x200") + holdComboboxValues = dict() + + timestamps_label = ttk.Label(window, text="Select which timestamps to use : ").grid( + row=0, column=1, pady=25, padx=25 + ) + holdComboboxValues["timestamps"] = StringVar() + timestamps_combo = ttk.Combobox(window, values=col_names_ts, textvariable=holdComboboxValues["timestamps"]) + timestamps_combo.grid(row=0, column=2, pady=25, padx=25) + timestamps_combo.current(0) + # timestamps_combo.bind("<>", comboBoxSelected) + + time_unit_label = ttk.Label(window, text="Select timestamps unit : ").grid(row=1, column=1, pady=25, padx=25) + holdComboboxValues["time_unit"] = StringVar() + time_unit_combo = ttk.Combobox( + window, values=["", "seconds", "milliseconds", "microseconds"], textvariable=holdComboboxValues["time_unit"] + ) + time_unit_combo.grid(row=1, column=2, pady=25, padx=25) + time_unit_combo.current(0) + # time_unit_combo.bind("<>", comboBoxSelected) + window.lift() + window.after(500, lambda: window.lift()) + window.mainloop() + + if holdComboboxValues["timestamps"].get(): + df.insert(1, "Timestamp", df[holdComboboxValues["timestamps"].get()]) + df = df.drop(col_names_ts[1:], axis=1) + else: + messagebox.showerror( + "All options not selected", + "All the options for timestamps \ + were not selected. Please select appropriate options", + ) + logger.error( + "All the options for timestamps \ + were not selected. 
Please select appropriate options" + ) + raise Exception( + "All the options for timestamps \ + were not selected. Please select appropriate options" + ) + if holdComboboxValues["time_unit"].get(): + if holdComboboxValues["time_unit"].get() == "seconds": + ts_unit = holdComboboxValues["time_unit"].get() + elif holdComboboxValues["time_unit"].get() == "milliseconds": + ts_unit = holdComboboxValues["time_unit"].get() + else: + ts_unit = holdComboboxValues["time_unit"].get() + else: + messagebox.showerror( + "All options not selected", + "All the options for timestamps \ + were not selected. Please select appropriate options", + ) + logger.error( + "All the options for timestamps \ + were not selected. Please select appropriate options" + ) + raise Exception( + "All the options for timestamps \ + were not selected. Please select appropriate options" + ) + else: + pass + + return df, ts_unit + + +# ---------------------------------------------------------------------------------------------------------------------- +# Functions that read_doric uses +# ---------------------------------------------------------------------------------------------------------------------- + +def access_keys_doricV6(doric_file): + data = [doric_file["DataAcquisition"]] + res = [] + while len(data) != 0: + members = len(data) + while members != 0: + members -= 1 + data, last_element = separate_last_element(data) + if isinstance(last_element, h5py.Dataset) and not last_element.name.endswith("/Time"): + res.append(last_element.name) + elif isinstance(last_element, h5py.Group): + data.extend(reversed([last_element[k] for k in last_element.keys()])) + + keys = [] + for element in res: + sep_values = element.split("/") + if sep_values[-1] == "Values": + keys.append(f"{sep_values[-3]}/{sep_values[-2]}") + else: + keys.append(f"{sep_values[-2]}/{sep_values[-1]}") + + return keys + + +def access_keys_doricV1(doric_file): + keys = list(doric_file["Traces"]["Console"].keys()) + keys.remove("Time(s)") + + return keys + +# ---------------------------------------------------------------------------------------------------------------------- +# Functions that decide_indices uses +# ---------------------------------------------------------------------------------------------------------------------- + +# check flag consistency in neurophotometrics data +def check_channels(state): + state = state.astype(int) + unique_state = np.unique(state[2:12]) + if unique_state.shape[0] > 3: + logger.error( + "Looks like there are more than 3 channels in the file. Reading of these files\ + are not supported. Reach out to us if you get this error message." + ) + raise Exception( + "Looks like there are more than 3 channels in the file. Reading of these files\ + are not supported. Reach out to us if you get this error message." 
+ ) + + return unique_state.shape[0], unique_state + + +# ---------------------------------------------------------------------------------------------------------------------- +# Functions that access_keys_doricV6 uses +# ---------------------------------------------------------------------------------------------------------------------- +def separate_last_element(arr): + l = arr[-1] + return arr[:-1], l \ No newline at end of file diff --git a/src/guppy/saveStoresList.py b/src/guppy/saveStoresList.py index a837aa9..d7380ec 100755 --- a/src/guppy/saveStoresList.py +++ b/src/guppy/saveStoresList.py @@ -22,6 +22,7 @@ from numpy import float32, float64, int32, int64, uint16 from guppy.tdt_step2 import readtsq +from guppy.np_doric_csv_step2 import import_np_doric_csv # hv.extension() pn.extension() @@ -76,18 +77,6 @@ def make_dir(filepath): return op -def check_header(df): - arr = list(df.columns) - check_float = [] - for i in arr: - try: - check_float.append(float(i)) - except: - pass - - return arr, check_float - - # function to show GUI and save def saveStorenames(inputParameters, data, event_name, flag, filepath): @@ -582,491 +571,6 @@ def save_button(event=None): template.show(port=number) -# check flag consistency in neurophotometrics data -def check_channels(state): - state = state.astype(int) - unique_state = np.unique(state[2:12]) - if unique_state.shape[0] > 3: - logger.error( - "Looks like there are more than 3 channels in the file. Reading of these files\ - are not supported. Reach out to us if you get this error message." - ) - raise Exception( - "Looks like there are more than 3 channels in the file. Reading of these files\ - are not supported. Reach out to us if you get this error message." - ) - - return unique_state.shape[0], unique_state - - -# function to decide NPM timestamps unit (seconds, ms or us) -def decide_ts_unit_for_npm(df, timestamp_column_name=None, time_unit=None, headless=False): - col_names = np.array(list(df.columns)) - col_names_ts = [""] - for name in col_names: - if "timestamp" in name.lower(): - col_names_ts.append(name) - - ts_unit = "seconds" - if len(col_names_ts) > 2: - # Headless path: auto-select column/unit without any UI - if headless: - if timestamp_column_name is not None: - assert ( - timestamp_column_name in col_names_ts - ), f"Provided timestamp_column_name '{timestamp_column_name}' not found in columns {col_names_ts[1:]}" - chosen = timestamp_column_name - else: - chosen = col_names_ts[1] - df.insert(1, "Timestamp", df[chosen]) - df = df.drop(col_names_ts[1:], axis=1) - valid_units = {"seconds", "milliseconds", "microseconds"} - ts_unit = time_unit if (isinstance(time_unit, str) and time_unit in valid_units) else "seconds" - return df, ts_unit - # def comboBoxSelected(event): - # logger.info(event.widget.get()) - - window = tk.Tk() - window.title("Select appropriate options for timestamps") - window.geometry("500x200") - holdComboboxValues = dict() - - timestamps_label = ttk.Label(window, text="Select which timestamps to use : ").grid( - row=0, column=1, pady=25, padx=25 - ) - holdComboboxValues["timestamps"] = StringVar() - timestamps_combo = ttk.Combobox(window, values=col_names_ts, textvariable=holdComboboxValues["timestamps"]) - timestamps_combo.grid(row=0, column=2, pady=25, padx=25) - timestamps_combo.current(0) - # timestamps_combo.bind("<>", comboBoxSelected) - - time_unit_label = ttk.Label(window, text="Select timestamps unit : ").grid(row=1, column=1, pady=25, padx=25) - holdComboboxValues["time_unit"] = StringVar() - 
time_unit_combo = ttk.Combobox( - window, values=["", "seconds", "milliseconds", "microseconds"], textvariable=holdComboboxValues["time_unit"] - ) - time_unit_combo.grid(row=1, column=2, pady=25, padx=25) - time_unit_combo.current(0) - # time_unit_combo.bind("<>", comboBoxSelected) - window.lift() - window.after(500, lambda: window.lift()) - window.mainloop() - - if holdComboboxValues["timestamps"].get(): - df.insert(1, "Timestamp", df[holdComboboxValues["timestamps"].get()]) - df = df.drop(col_names_ts[1:], axis=1) - else: - messagebox.showerror( - "All options not selected", - "All the options for timestamps \ - were not selected. Please select appropriate options", - ) - logger.error( - "All the options for timestamps \ - were not selected. Please select appropriate options" - ) - raise Exception( - "All the options for timestamps \ - were not selected. Please select appropriate options" - ) - if holdComboboxValues["time_unit"].get(): - if holdComboboxValues["time_unit"].get() == "seconds": - ts_unit = holdComboboxValues["time_unit"].get() - elif holdComboboxValues["time_unit"].get() == "milliseconds": - ts_unit = holdComboboxValues["time_unit"].get() - else: - ts_unit = holdComboboxValues["time_unit"].get() - else: - messagebox.showerror( - "All options not selected", - "All the options for timestamps \ - were not selected. Please select appropriate options", - ) - logger.error( - "All the options for timestamps \ - were not selected. Please select appropriate options" - ) - raise Exception( - "All the options for timestamps \ - were not selected. Please select appropriate options" - ) - else: - pass - - return df, ts_unit - - -# function to decide indices of interleaved channels -# in neurophotometrics data -def decide_indices(file, df, flag, num_ch=2): - ch_name = [file + "chev", file + "chod", file + "chpr"] - if len(ch_name) < num_ch: - logger.error( - "Number of channels parameters in Input Parameters GUI is more than 3. \ - Looks like there are more than 3 channels in the file. Reading of these files\ - are not supported. Reach out to us if you get this error message." - ) - raise Exception( - "Number of channels parameters in Input Parameters GUI is more than 3. \ - Looks like there are more than 3 channels in the file. Reading of these files\ - are not supported. Reach out to us if you get this error message." 
- ) - if flag == "data_np": - indices_dict = dict() - for i in range(num_ch): - indices_dict[ch_name[i]] = np.arange(i, df.shape[0], num_ch) - - else: - cols = np.array(list(df.columns)) - if "flags" in np.char.lower(np.array(cols)): - arr = ["FrameCounter", "Flags"] - state = np.array(df["Flags"]) - elif "ledstate" in np.char.lower(np.array(cols)): - arr = ["FrameCounter", "LedState"] - state = np.array(df["LedState"]) - else: - logger.error( - "File type shows Neurophotometrics newer version \ - data but column names does not have Flags or LedState" - ) - raise Exception( - "File type shows Neurophotometrics newer version \ - data but column names does not have Flags or LedState" - ) - - num_ch, ch = check_channels(state) - indices_dict = dict() - for i in range(num_ch): - first_occurrence = np.where(state == ch[i])[0] - indices_dict[ch_name[i]] = np.arange(first_occurrence[0], df.shape[0], num_ch) - - df = df.drop(arr, axis=1) - - return df, indices_dict, num_ch - - -def separate_last_element(arr): - l = arr[-1] - return arr[:-1], l - - -def access_keys_doricV6(doric_file): - data = [doric_file["DataAcquisition"]] - res = [] - while len(data) != 0: - members = len(data) - while members != 0: - members -= 1 - data, last_element = separate_last_element(data) - if isinstance(last_element, h5py.Dataset) and not last_element.name.endswith("/Time"): - res.append(last_element.name) - elif isinstance(last_element, h5py.Group): - data.extend(reversed([last_element[k] for k in last_element.keys()])) - - keys = [] - for element in res: - sep_values = element.split("/") - if sep_values[-1] == "Values": - keys.append(f"{sep_values[-3]}/{sep_values[-2]}") - else: - keys.append(f"{sep_values[-2]}/{sep_values[-1]}") - - return keys - - -def access_keys_doricV1(doric_file): - keys = list(doric_file["Traces"]["Console"].keys()) - keys.remove("Time(s)") - - return keys - - -def read_doric(filepath): - with h5py.File(filepath, "r") as f: - if "Traces" in list(f.keys()): - keys = access_keys_doricV1(f) - elif list(f.keys()) == ["Configurations", "DataAcquisition"]: - keys = access_keys_doricV6(f) - - return keys - - -# function to see if there are 'csv' files present -# and recognize type of 'csv' files either from -# Neurophotometrics, Doric systems or custom made 'csv' files -# and read data accordingly -def import_np_doric_csv(filepath, isosbestic_control, num_ch, inputParameters=None): - - logger.debug("If it exists, importing either NPM or Doric or csv file based on the structure of file") - # Headless configuration (used to avoid any UI prompts when running tests) - headless = bool(os.environ.get("GUPPY_BASE_DIR")) - npm_timestamp_column_name = None - npm_time_unit = None - npm_split_events = None - if isinstance(inputParameters, dict): - npm_timestamp_column_name = inputParameters.get("npm_timestamp_column_name") - npm_time_unit = inputParameters.get("npm_time_unit", "seconds") - npm_split_events = inputParameters.get("npm_split_events", True) - path = sorted(glob.glob(os.path.join(filepath, "*.csv"))) + sorted(glob.glob(os.path.join(filepath, "*.doric"))) - path_chev = glob.glob(os.path.join(filepath, "*chev*")) - path_chod = glob.glob(os.path.join(filepath, "*chod*")) - path_chpr = glob.glob(os.path.join(filepath, "*chpr*")) - path_event = glob.glob(os.path.join(filepath, "event*")) - # path_sig = glob.glob(os.path.join(filepath, 'sig*')) - path_chev_chod_event = path_chev + path_chod + path_event + path_chpr - - path = sorted(list(set(path) - set(path_chev_chod_event))) - flag = "None" - 
event_from_filename = [] - flag_arr = [] - for i in range(len(path)): - dirname = os.path.dirname(path[i]) - ext = os.path.basename(path[i]).split(".")[-1] - if ext == "doric": - key_names = read_doric(path[i]) - event_from_filename.extend(key_names) - flag = "doric_doric" - else: - df = pd.read_csv(path[i], header=None, nrows=2, index_col=False, dtype=str) - df = df.dropna(axis=1, how="all") - df_arr = np.array(df).flatten() - check_all_str = [] - for element in df_arr: - try: - float(element) - except: - check_all_str.append(i) - if len(check_all_str) == len(df_arr): - df = pd.read_csv(path[i], header=1, index_col=False, nrows=10) - df = df.drop(["Time(s)"], axis=1) - event_from_filename.extend(list(df.columns)) - flag = "doric_csv" - logger.info(flag) - else: - df = pd.read_csv(path[i], index_col=False) - # with warnings.catch_warnings(): - # warnings.simplefilter("error") - # try: - # df = pd.read_csv(path[i], index_col=False, dtype=float) - # except: - # df = pd.read_csv(path[i], header=1, index_col=False, nrows=10) # to make process faster reading just first 10 rows - # df = df.drop(['Time(s)'], axis=1) - # event_from_filename.extend(list(df.columns)) - # flag = 'doric_csv' - if flag == "doric_csv" or flag == "doric_doric": - continue - else: - colnames, value = check_header(df) - # logger.info(len(colnames), len(value)) - - # check dataframe structure and read data accordingly - if len(value) > 0: - columns_isstr = False - df = pd.read_csv(path[i], header=None) - cols = np.array(list(df.columns), dtype=str) - else: - df = df - columns_isstr = True - cols = np.array(list(df.columns), dtype=str) - # check the structure of dataframe and assign flag to the type of file - if len(cols) == 1: - if cols[0].lower() != "timestamps": - logger.error("\033[1m" + "Column name should be timestamps (all lower-cases)" + "\033[0m") - raise Exception("\033[1m" + "Column name should be timestamps (all lower-cases)" + "\033[0m") - else: - flag = "event_csv" - elif len(cols) == 3: - arr1 = np.array(["timestamps", "data", "sampling_rate"]) - arr2 = np.char.lower(np.array(cols)) - if (np.sort(arr1) == np.sort(arr2)).all() == False: - logger.error( - "\033[1m" - + "Column names should be timestamps, data and sampling_rate (all lower-cases)" - + "\033[0m" - ) - raise Exception( - "\033[1m" - + "Column names should be timestamps, data and sampling_rate (all lower-cases)" - + "\033[0m" - ) - else: - flag = "data_csv" - elif len(cols) == 2: - flag = "event_or_data_np" - elif len(cols) >= 2: - flag = "data_np" - else: - logger.error("Number of columns in csv file does not make sense.") - raise Exception("Number of columns in csv file does not make sense.") - - if columns_isstr == True and ( - "flags" in np.char.lower(np.array(cols)) or "ledstate" in np.char.lower(np.array(cols)) - ): - flag = flag + "_v2" - else: - flag = flag - - # used assigned flags to process the files and read the data - if flag == "event_or_data_np": - arr = list(df.iloc[:, 1]) - check_float = [True for i in arr if isinstance(i, float)] - if len(arr) == len(check_float) and columns_isstr == False: - flag = "data_np" - elif columns_isstr == True and ("value" in np.char.lower(np.array(cols))): - flag = "event_np" - else: - flag = "event_np" - else: - pass - - flag_arr.append(flag) - logger.info(flag) - if flag == "event_csv" or flag == "data_csv": - name = os.path.basename(path[i]).split(".")[0] - event_from_filename.append(name) - elif flag == "data_np": - file = f"file{str(i)}_" - df, indices_dict, num_channels = decide_indices(file, df, 
flag, num_ch) - keys = list(indices_dict.keys()) - for k in range(len(keys)): - for j in range(df.shape[1]): - if j == 0: - timestamps = df.iloc[:, j][indices_dict[keys[k]]] - # timestamps_odd = df.iloc[:,j][odd_indices] - else: - d = dict() - d["timestamps"] = timestamps - d["data"] = df.iloc[:, j][indices_dict[keys[k]]] - - df_ch = pd.DataFrame(d) - df_ch.to_csv(os.path.join(dirname, keys[k] + str(j) + ".csv"), index=False) - event_from_filename.append(keys[k] + str(j)) - - elif flag == "event_np": - type_val = np.array(df.iloc[:, 1]) - type_val_unique = np.unique(type_val) - if headless: - response = 1 if bool(npm_split_events) else 0 - else: - window = tk.Tk() - if len(type_val_unique) > 1: - response = messagebox.askyesno( - "Multiple event TTLs", - "Based on the TTL file,\ - it looks like TTLs \ - belongs to multiple behavior type. \ - Do you want to create multiple files for each \ - behavior type ?", - ) - else: - response = 0 - window.destroy() - if response == 1: - timestamps = np.array(df.iloc[:, 0]) - for j in range(len(type_val_unique)): - idx = np.where(type_val == type_val_unique[j]) - d = dict() - d["timestamps"] = timestamps[idx] - df_new = pd.DataFrame(d) - df_new.to_csv(os.path.join(dirname, "event" + str(type_val_unique[j]) + ".csv"), index=False) - event_from_filename.append("event" + str(type_val_unique[j])) - else: - timestamps = np.array(df.iloc[:, 0]) - d = dict() - d["timestamps"] = timestamps - df_new = pd.DataFrame(d) - df_new.to_csv(os.path.join(dirname, "event" + str(0) + ".csv"), index=False) - event_from_filename.append("event" + str(0)) - else: - file = f"file{str(i)}_" - df, ts_unit = decide_ts_unit_for_npm( - df, timestamp_column_name=npm_timestamp_column_name, time_unit=npm_time_unit, headless=headless - ) - df, indices_dict, num_channels = decide_indices(file, df, flag) - keys = list(indices_dict.keys()) - for k in range(len(keys)): - for j in range(df.shape[1]): - if j == 0: - timestamps = df.iloc[:, j][indices_dict[keys[k]]] - # timestamps_odd = df.iloc[:,j][odd_indices] - else: - d = dict() - d["timestamps"] = timestamps - d["data"] = df.iloc[:, j][indices_dict[keys[k]]] - - df_ch = pd.DataFrame(d) - df_ch.to_csv(os.path.join(dirname, keys[k] + str(j) + ".csv"), index=False) - event_from_filename.append(keys[k] + str(j)) - - path_chev = glob.glob(os.path.join(filepath, "*chev*")) - path_chod = glob.glob(os.path.join(filepath, "*chod*")) - path_chpr = glob.glob(os.path.join(filepath, "*chpr*")) - path_event = glob.glob(os.path.join(filepath, "event*")) - # path_sig = glob.glob(os.path.join(filepath, 'sig*')) - path_chev_chod_chpr = [path_chev, path_chod, path_chpr] - if ( - ("data_np_v2" in flag_arr or "data_np" in flag_arr) - and ("event_np" in flag_arr) - and (i == len(path) - 1) - ) or ( - ("data_np_v2" in flag_arr or "data_np" in flag_arr) and (i == len(path) - 1) - ): # i==len(path)-1 and or 'event_np' in flag - num_path_chev, num_path_chod, num_path_chpr = len(path_chev), len(path_chod), len(path_chpr) - arr_len, no_ch = [], [] - for i in range(len(path_chev_chod_chpr)): - if len(path_chev_chod_chpr[i]) > 0: - arr_len.append(len(path_chev_chod_chpr[i])) - else: - continue - - unique_arr_len = np.unique(np.array(arr_len)) - if "data_np_v2" in flag_arr: - if ts_unit == "seconds": - divisor = 1 - elif ts_unit == "milliseconds": - divisor = 1e3 - else: - divisor = 1e6 - else: - divisor = 1000 - - for j in range(len(path_event)): - df_event = pd.read_csv(path_event[j]) - df_chev = pd.read_csv(path_chev[0]) - df_event["timestamps"] = 
(df_event["timestamps"] - df_chev["timestamps"][0]) / divisor - df_event.to_csv(path_event[j], index=False) - if unique_arr_len.shape[0] == 1: - for j in range(len(path_chev)): - if file + "chev" in indices_dict.keys(): - df_chev = pd.read_csv(path_chev[j]) - df_chev["timestamps"] = (df_chev["timestamps"] - df_chev["timestamps"][0]) / divisor - df_chev["sampling_rate"] = np.full(df_chev.shape[0], np.nan) - df_chev.at[0, "sampling_rate"] = df_chev.shape[0] / ( - df_chev["timestamps"].iloc[-1] - df_chev["timestamps"].iloc[0] - ) - df_chev.to_csv(path_chev[j], index=False) - - if file + "chod" in indices_dict.keys(): - df_chod = pd.read_csv(path_chod[j]) - df_chod["timestamps"] = df_chev["timestamps"] - df_chod["sampling_rate"] = np.full(df_chod.shape[0], np.nan) - df_chod.at[0, "sampling_rate"] = df_chev["sampling_rate"][0] - df_chod.to_csv(path_chod[j], index=False) - - if file + "chpr" in indices_dict.keys(): - df_chpr = pd.read_csv(path_chpr[j]) - df_chpr["timestamps"] = df_chev["timestamps"] - df_chpr["sampling_rate"] = np.full(df_chpr.shape[0], np.nan) - df_chpr.at[0, "sampling_rate"] = df_chev["sampling_rate"][0] - df_chpr.to_csv(path_chpr[j], index=False) - else: - logger.error("Number of channels should be same for all regions.") - raise Exception("Number of channels should be same for all regions.") - else: - pass - logger.info("Importing of either NPM or Doric or csv file is done.") - return event_from_filename, flag_arr - # function to read input parameters and run the saveStorenames function def execute(inputParameters): From a06cae4233657f52a77d5935928bcab9bceb6de7 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 17 Nov 2025 12:04:16 -0800 Subject: [PATCH 003/150] Split import_csv out from import_np_doric_csv --- src/guppy/csv_step2.py | 99 +++++++++++++++++++++++++++++++++++++ src/guppy/saveStoresList.py | 16 ++++-- 2 files changed, 111 insertions(+), 4 deletions(-) create mode 100644 src/guppy/csv_step2.py diff --git a/src/guppy/csv_step2.py b/src/guppy/csv_step2.py new file mode 100644 index 0000000..4d9b800 --- /dev/null +++ b/src/guppy/csv_step2.py @@ -0,0 +1,99 @@ +import glob +import logging +import os +import numpy as np +import pandas as pd + +logger = logging.getLogger(__name__) + +def check_header(df): + arr = list(df.columns) + check_float = [] + for i in arr: + try: + check_float.append(float(i)) + except: + pass + + return arr, check_float + +def import_csv_step2(filepath): + logger.debug("If it exists, importing either NPM or Doric or csv file based on the structure of file") + path = sorted(glob.glob(os.path.join(filepath, "*.csv"))) + + path = sorted(list(set(path))) + flag = "None" + event_from_filename = [] + flag_arr = [] + for i in range(len(path)): + ext = os.path.basename(path[i]).split(".")[-1] + assert ext == "csv", "Only .csv files are supported by import_csv function." + df = pd.read_csv(path[i], header=None, nrows=2, index_col=False, dtype=str) + df = df.dropna(axis=1, how="all") + df_arr = np.array(df).flatten() + check_all_str = [] + for element in df_arr: + try: + float(element) + except: + check_all_str.append(i) + assert len(check_all_str) != len(df_arr), "This file appears to be doric .csv. This function only supports standard .csv files." 
+ df = pd.read_csv(path[i], index_col=False) + + _, value = check_header(df) + + # check dataframe structure and read data accordingly + if len(value) > 0: + columns_isstr = False + df = pd.read_csv(path[i], header=None) + cols = np.array(list(df.columns), dtype=str) + else: + df = df + columns_isstr = True + cols = np.array(list(df.columns), dtype=str) + # check the structure of dataframe and assign flag to the type of file + if len(cols) == 1: + if cols[0].lower() != "timestamps": + logger.error("\033[1m" + "Column name should be timestamps (all lower-cases)" + "\033[0m") + raise Exception("\033[1m" + "Column name should be timestamps (all lower-cases)" + "\033[0m") + else: + flag = "event_csv" + elif len(cols) == 3: + arr1 = np.array(["timestamps", "data", "sampling_rate"]) + arr2 = np.char.lower(np.array(cols)) + if (np.sort(arr1) == np.sort(arr2)).all() == False: + logger.error( + "\033[1m" + + "Column names should be timestamps, data and sampling_rate (all lower-cases)" + + "\033[0m" + ) + raise Exception( + "\033[1m" + + "Column names should be timestamps, data and sampling_rate (all lower-cases)" + + "\033[0m" + ) + else: + flag = "data_csv" + elif len(cols) == 2: + raise ValueError("Data appears to be Neurophotometrics csv. Please use import_npm_csv function to import the data.") + elif len(cols) >= 2: + raise ValueError("Data appears to be Neurophotometrics csv. Please use import_npm_csv function to import the data.") + else: + logger.error("Number of columns in csv file does not make sense.") + raise Exception("Number of columns in csv file does not make sense.") + + if columns_isstr == True and ( + "flags" in np.char.lower(np.array(cols)) or "ledstate" in np.char.lower(np.array(cols)) + ): + flag = flag + "_v2" + else: + flag = flag + + flag_arr.append(flag) + logger.info(flag) + assert flag == "event_csv" or flag == "data_csv", "This function only supports standard event_csv and data_csv files." + name = os.path.basename(path[i]).split(".")[0] + event_from_filename.append(name) + + logger.info("Importing of csv file is done.") + return event_from_filename, flag_arr \ No newline at end of file diff --git a/src/guppy/saveStoresList.py b/src/guppy/saveStoresList.py index d7380ec..1f6bae7 100755 --- a/src/guppy/saveStoresList.py +++ b/src/guppy/saveStoresList.py @@ -21,8 +21,10 @@ import panel as pn from numpy import float32, float64, int32, int64, uint16 +from guppy.readTevTsq import import_csv from guppy.tdt_step2 import readtsq from guppy.np_doric_csv_step2 import import_np_doric_csv +from guppy.csv_step2 import import_csv_step2 # hv.extension() pn.extension() @@ -585,10 +587,16 @@ def execute(inputParameters): try: for i in folderNames: filepath = os.path.join(inputParameters["abspath"], i) - data = readtsq(filepath) - event_name, flag = import_np_doric_csv( - filepath, isosbestic_control, num_ch, inputParameters=inputParameters - ) + modality = "csv" # TODO: ask for modality from the user + if modality == "tdt": + data = readtsq(filepath) + event_name, flag = None, None + elif modality == "csv": + data = 0 + event_name, flag = import_csv_step2(filepath) + else: + raise ValueError("Modality not recognized. 
Please use 'tdt' or 'csv'.") + saveStorenames(inputParameters, data, event_name, flag, filepath) logger.info("#" * 400) except Exception as e: From 66d60e2aabf95eac48556d747dd8bbf2a26b0dd6 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 17 Nov 2025 12:58:33 -0800 Subject: [PATCH 004/150] Fixed TDT --- src/guppy/saveStoresList.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/guppy/saveStoresList.py b/src/guppy/saveStoresList.py index 1f6bae7..392c04e 100755 --- a/src/guppy/saveStoresList.py +++ b/src/guppy/saveStoresList.py @@ -587,10 +587,10 @@ def execute(inputParameters): try: for i in folderNames: filepath = os.path.join(inputParameters["abspath"], i) - modality = "csv" # TODO: ask for modality from the user + modality = "tdt" # TODO: ask for modality from the user if modality == "tdt": data = readtsq(filepath) - event_name, flag = None, None + event_name, flag = [], [] elif modality == "csv": data = 0 event_name, flag = import_csv_step2(filepath) From 4f4e1c921da919e28d5d595827f8bf397c74c5e4 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 17 Nov 2025 13:15:47 -0800 Subject: [PATCH 005/150] Split import_doric out from import_np_doric_csv --- src/guppy/doric_step2.py | 92 +++++++++++++++++++++++++++++++++++++ src/guppy/saveStoresList.py | 6 ++- 2 files changed, 97 insertions(+), 1 deletion(-) create mode 100644 src/guppy/doric_step2.py diff --git a/src/guppy/doric_step2.py b/src/guppy/doric_step2.py new file mode 100644 index 0000000..69022aa --- /dev/null +++ b/src/guppy/doric_step2.py @@ -0,0 +1,92 @@ +import glob +import logging +import os +import tkinter as tk +from tkinter import StringVar, messagebox, ttk + +import h5py +import numpy as np +import pandas as pd +import panel as pn + +pn.extension() + +logger = logging.getLogger(__name__) + +def import_doric(filepath): + + logger.debug("If it exists, importing Doric file based on the structure of file") + path = sorted(glob.glob(os.path.join(filepath, "*.csv"))) + sorted(glob.glob(os.path.join(filepath, "*.doric"))) + + path = sorted(list(set(path))) + flag = "None" + event_from_filename = [] + flag_arr = [] + for i in range(len(path)): + ext = os.path.basename(path[i]).split(".")[-1] + if ext == "doric": + key_names = read_doric(path[i]) + event_from_filename.extend(key_names) + flag = "doric_doric" + else: + df = pd.read_csv(path[i], header=None, nrows=2, index_col=False, dtype=str) + df = df.dropna(axis=1, how="all") + df_arr = np.array(df).flatten() + check_all_str = [] + for element in df_arr: + try: + float(element) + except: + check_all_str.append(i) + assert len(check_all_str) == len(df_arr), "This file appears to be standard .csv. This function only supports doric .csv files." 
+ df = pd.read_csv(path[i], header=1, index_col=False, nrows=10) + df = df.drop(["Time(s)"], axis=1) + event_from_filename.extend(list(df.columns)) + flag = "doric_csv" + logger.info(flag) + logger.info("Importing of Doric file is done.") + return event_from_filename, flag_arr + + +def read_doric(filepath): + with h5py.File(filepath, "r") as f: + if "Traces" in list(f.keys()): + keys = access_keys_doricV1(f) + elif list(f.keys()) == ["Configurations", "DataAcquisition"]: + keys = access_keys_doricV6(f) + + return keys + +def access_keys_doricV6(doric_file): + data = [doric_file["DataAcquisition"]] + res = [] + while len(data) != 0: + members = len(data) + while members != 0: + members -= 1 + data, last_element = separate_last_element(data) + if isinstance(last_element, h5py.Dataset) and not last_element.name.endswith("/Time"): + res.append(last_element.name) + elif isinstance(last_element, h5py.Group): + data.extend(reversed([last_element[k] for k in last_element.keys()])) + + keys = [] + for element in res: + sep_values = element.split("/") + if sep_values[-1] == "Values": + keys.append(f"{sep_values[-3]}/{sep_values[-2]}") + else: + keys.append(f"{sep_values[-2]}/{sep_values[-1]}") + + return keys + + +def access_keys_doricV1(doric_file): + keys = list(doric_file["Traces"]["Console"].keys()) + keys.remove("Time(s)") + + return keys + +def separate_last_element(arr): + l = arr[-1] + return arr[:-1], l \ No newline at end of file diff --git a/src/guppy/saveStoresList.py b/src/guppy/saveStoresList.py index 392c04e..26065e4 100755 --- a/src/guppy/saveStoresList.py +++ b/src/guppy/saveStoresList.py @@ -25,6 +25,7 @@ from guppy.tdt_step2 import readtsq from guppy.np_doric_csv_step2 import import_np_doric_csv from guppy.csv_step2 import import_csv_step2 +from guppy.doric_step2 import import_doric # hv.extension() pn.extension() @@ -587,13 +588,16 @@ def execute(inputParameters): try: for i in folderNames: filepath = os.path.join(inputParameters["abspath"], i) - modality = "tdt" # TODO: ask for modality from the user + modality = "doric" # TODO: ask for modality from the user if modality == "tdt": data = readtsq(filepath) event_name, flag = [], [] elif modality == "csv": data = 0 event_name, flag = import_csv_step2(filepath) + elif modality == "doric": + data = 0 + event_name, flag = import_doric(filepath) else: raise ValueError("Modality not recognized. 
Please use 'tdt' or 'csv'.") From 341d77d722844c63fdbfd4c189e446adf390c3f0 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 17 Nov 2025 13:18:35 -0800 Subject: [PATCH 006/150] Removed unnecessary imports --- src/guppy/doric_step2.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/guppy/doric_step2.py b/src/guppy/doric_step2.py index 69022aa..bf402d1 100644 --- a/src/guppy/doric_step2.py +++ b/src/guppy/doric_step2.py @@ -1,15 +1,10 @@ import glob import logging import os -import tkinter as tk -from tkinter import StringVar, messagebox, ttk import h5py import numpy as np import pandas as pd -import panel as pn - -pn.extension() logger = logging.getLogger(__name__) From 0bcd4fee319ba485519bd71f75a7c756bea36157 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 17 Nov 2025 13:40:01 -0800 Subject: [PATCH 007/150] Split import_npm out from import_np_doric_csv --- src/guppy/npm_step2.py | 408 ++++++++++++++++++++++++++++++++++++ src/guppy/saveStoresList.py | 6 +- 2 files changed, 413 insertions(+), 1 deletion(-) create mode 100644 src/guppy/npm_step2.py diff --git a/src/guppy/npm_step2.py b/src/guppy/npm_step2.py new file mode 100644 index 0000000..f0fafec --- /dev/null +++ b/src/guppy/npm_step2.py @@ -0,0 +1,408 @@ +import glob +import logging +import os +import tkinter as tk +from tkinter import StringVar, messagebox, ttk + +import numpy as np +import pandas as pd +import panel as pn + +pn.extension() + +logger = logging.getLogger(__name__) + +def import_npm(filepath, num_ch, inputParameters=None): + + logger.debug("If it exists, importing NPM file based on the structure of file") + # Headless configuration (used to avoid any UI prompts when running tests) + headless = bool(os.environ.get("GUPPY_BASE_DIR")) + npm_timestamp_column_name = None + npm_time_unit = None + npm_split_events = None + if isinstance(inputParameters, dict): + npm_timestamp_column_name = inputParameters.get("npm_timestamp_column_name") + npm_time_unit = inputParameters.get("npm_time_unit", "seconds") + npm_split_events = inputParameters.get("npm_split_events", True) + path = sorted(glob.glob(os.path.join(filepath, "*.csv"))) + sorted(glob.glob(os.path.join(filepath, "*.doric"))) + path_chev = glob.glob(os.path.join(filepath, "*chev*")) + path_chod = glob.glob(os.path.join(filepath, "*chod*")) + path_chpr = glob.glob(os.path.join(filepath, "*chpr*")) + path_event = glob.glob(os.path.join(filepath, "event*")) + # path_sig = glob.glob(os.path.join(filepath, 'sig*')) # TODO: what is this for? + path_chev_chod_event = path_chev + path_chod + path_event + path_chpr + + path = sorted(list(set(path) - set(path_chev_chod_event))) + flag = "None" + event_from_filename = [] + flag_arr = [] + for i in range(len(path)): + dirname = os.path.dirname(path[i]) + ext = os.path.basename(path[i]).split(".")[-1] + assert ext != "doric", "Doric files are not supported by import_npm function." + df = pd.read_csv(path[i], header=None, nrows=2, index_col=False, dtype=str) + df = df.dropna(axis=1, how="all") + df_arr = np.array(df).flatten() + check_all_str = [] + for element in df_arr: + try: + float(element) + except: + check_all_str.append(i) + assert len(check_all_str) != len(df_arr), "This file appears to be doric .csv. This function only supports NPM .csv files." 
+ df = pd.read_csv(path[i], index_col=False) + _, value = check_header(df) + + # check dataframe structure and read data accordingly + if len(value) > 0: + columns_isstr = False + df = pd.read_csv(path[i], header=None) + cols = np.array(list(df.columns), dtype=str) + else: + df = df + columns_isstr = True + cols = np.array(list(df.columns), dtype=str) + # check the structure of dataframe and assign flag to the type of file + assert len(cols) != 1, "File appears to be event .csv. This function only supports NPM .csv files." + assert len(cols) != 3, "File appears to be data .csv. This function only supports NPM .csv files." + if len(cols) == 2: + flag = "event_or_data_np" + elif len(cols) >= 2: + flag = "data_np" + else: + logger.error("Number of columns in csv file does not make sense.") + raise Exception("Number of columns in csv file does not make sense.") + + if columns_isstr == True and ( + "flags" in np.char.lower(np.array(cols)) or "ledstate" in np.char.lower(np.array(cols)) + ): + flag = flag + "_v2" + else: + flag = flag + + # used assigned flags to process the files and read the data + if flag == "event_or_data_np": + arr = list(df.iloc[:, 1]) + check_float = [True for i in arr if isinstance(i, float)] + if len(arr) == len(check_float) and columns_isstr == False: + flag = "data_np" + elif columns_isstr == True and ("value" in np.char.lower(np.array(cols))): + flag = "event_np" + else: + flag = "event_np" + + flag_arr.append(flag) + logger.info(flag) + if flag == "data_np": + file = f"file{str(i)}_" + df, indices_dict, _ = decide_indices(file, df, flag, num_ch) + keys = list(indices_dict.keys()) + for k in range(len(keys)): + for j in range(df.shape[1]): + if j == 0: + timestamps = df.iloc[:, j][indices_dict[keys[k]]] + # timestamps_odd = df.iloc[:,j][odd_indices] + else: + d = dict() + d["timestamps"] = timestamps + d["data"] = df.iloc[:, j][indices_dict[keys[k]]] + + df_ch = pd.DataFrame(d) + df_ch.to_csv(os.path.join(dirname, keys[k] + str(j) + ".csv"), index=False) + event_from_filename.append(keys[k] + str(j)) + + elif flag == "event_np": + type_val = np.array(df.iloc[:, 1]) + type_val_unique = np.unique(type_val) + if headless: + response = 1 if bool(npm_split_events) else 0 + else: + window = tk.Tk() + if len(type_val_unique) > 1: + response = messagebox.askyesno( + "Multiple event TTLs", + "Based on the TTL file,\ + it looks like TTLs \ + belongs to multiple behavior type. 
\ + Do you want to create multiple files for each \ + behavior type ?", + ) + else: + response = 0 + window.destroy() + if response == 1: + timestamps = np.array(df.iloc[:, 0]) + for j in range(len(type_val_unique)): + idx = np.where(type_val == type_val_unique[j]) + d = dict() + d["timestamps"] = timestamps[idx] + df_new = pd.DataFrame(d) + df_new.to_csv(os.path.join(dirname, "event" + str(type_val_unique[j]) + ".csv"), index=False) + event_from_filename.append("event" + str(type_val_unique[j])) + else: + timestamps = np.array(df.iloc[:, 0]) + d = dict() + d["timestamps"] = timestamps + df_new = pd.DataFrame(d) + df_new.to_csv(os.path.join(dirname, "event" + str(0) + ".csv"), index=False) + event_from_filename.append("event" + str(0)) + else: + file = f"file{str(i)}_" + df, ts_unit = decide_ts_unit_for_npm( + df, timestamp_column_name=npm_timestamp_column_name, time_unit=npm_time_unit, headless=headless + ) + df, indices_dict, _ = decide_indices(file, df, flag) + keys = list(indices_dict.keys()) + for k in range(len(keys)): + for j in range(df.shape[1]): + if j == 0: + timestamps = df.iloc[:, j][indices_dict[keys[k]]] + # timestamps_odd = df.iloc[:,j][odd_indices] + else: + d = dict() + d["timestamps"] = timestamps + d["data"] = df.iloc[:, j][indices_dict[keys[k]]] + + df_ch = pd.DataFrame(d) + df_ch.to_csv(os.path.join(dirname, keys[k] + str(j) + ".csv"), index=False) + event_from_filename.append(keys[k] + str(j)) + + path_chev = glob.glob(os.path.join(filepath, "*chev*")) + path_chod = glob.glob(os.path.join(filepath, "*chod*")) + path_chpr = glob.glob(os.path.join(filepath, "*chpr*")) + path_event = glob.glob(os.path.join(filepath, "event*")) + # path_sig = glob.glob(os.path.join(filepath, 'sig*')) + path_chev_chod_chpr = [path_chev, path_chod, path_chpr] + if ( + ("data_np_v2" in flag_arr or "data_np" in flag_arr) + and ("event_np" in flag_arr) + and (i == len(path) - 1) + ) or ( + ("data_np_v2" in flag_arr or "data_np" in flag_arr) and (i == len(path) - 1) + ): # i==len(path)-1 and or 'event_np' in flag + num_path_chev, num_path_chod, num_path_chpr = len(path_chev), len(path_chod), len(path_chpr) + arr_len, no_ch = [], [] + for i in range(len(path_chev_chod_chpr)): + if len(path_chev_chod_chpr[i]) > 0: + arr_len.append(len(path_chev_chod_chpr[i])) + else: + continue + + unique_arr_len = np.unique(np.array(arr_len)) + if "data_np_v2" in flag_arr: + if ts_unit == "seconds": + divisor = 1 + elif ts_unit == "milliseconds": + divisor = 1e3 + else: + divisor = 1e6 + else: + divisor = 1000 + + for j in range(len(path_event)): + df_event = pd.read_csv(path_event[j]) + df_chev = pd.read_csv(path_chev[0]) + df_event["timestamps"] = (df_event["timestamps"] - df_chev["timestamps"][0]) / divisor + df_event.to_csv(path_event[j], index=False) + if unique_arr_len.shape[0] == 1: + for j in range(len(path_chev)): + if file + "chev" in indices_dict.keys(): + df_chev = pd.read_csv(path_chev[j]) + df_chev["timestamps"] = (df_chev["timestamps"] - df_chev["timestamps"][0]) / divisor + df_chev["sampling_rate"] = np.full(df_chev.shape[0], np.nan) + df_chev.at[0, "sampling_rate"] = df_chev.shape[0] / ( + df_chev["timestamps"].iloc[-1] - df_chev["timestamps"].iloc[0] + ) + df_chev.to_csv(path_chev[j], index=False) + + if file + "chod" in indices_dict.keys(): + df_chod = pd.read_csv(path_chod[j]) + df_chod["timestamps"] = df_chev["timestamps"] + df_chod["sampling_rate"] = np.full(df_chod.shape[0], np.nan) + df_chod.at[0, "sampling_rate"] = df_chev["sampling_rate"][0] + df_chod.to_csv(path_chod[j], index=False) 
+ + if file + "chpr" in indices_dict.keys(): + df_chpr = pd.read_csv(path_chpr[j]) + df_chpr["timestamps"] = df_chev["timestamps"] + df_chpr["sampling_rate"] = np.full(df_chpr.shape[0], np.nan) + df_chpr.at[0, "sampling_rate"] = df_chev["sampling_rate"][0] + df_chpr.to_csv(path_chpr[j], index=False) + else: + logger.error("Number of channels should be same for all regions.") + raise Exception("Number of channels should be same for all regions.") + logger.info("Importing of NPM file is done.") + return event_from_filename, flag_arr + +def check_header(df): + arr = list(df.columns) + check_float = [] + for i in arr: + try: + check_float.append(float(i)) + except: + pass + + return arr, check_float + + +# function to decide indices of interleaved channels +# in neurophotometrics data +def decide_indices(file, df, flag, num_ch=2): + ch_name = [file + "chev", file + "chod", file + "chpr"] + if len(ch_name) < num_ch: + logger.error( + "Number of channels parameters in Input Parameters GUI is more than 3. \ + Looks like there are more than 3 channels in the file. Reading of these files\ + are not supported. Reach out to us if you get this error message." + ) + raise Exception( + "Number of channels parameters in Input Parameters GUI is more than 3. \ + Looks like there are more than 3 channels in the file. Reading of these files\ + are not supported. Reach out to us if you get this error message." + ) + if flag == "data_np": + indices_dict = dict() + for i in range(num_ch): + indices_dict[ch_name[i]] = np.arange(i, df.shape[0], num_ch) + + else: + cols = np.array(list(df.columns)) + if "flags" in np.char.lower(np.array(cols)): + arr = ["FrameCounter", "Flags"] + state = np.array(df["Flags"]) + elif "ledstate" in np.char.lower(np.array(cols)): + arr = ["FrameCounter", "LedState"] + state = np.array(df["LedState"]) + else: + logger.error( + "File type shows Neurophotometrics newer version \ + data but column names does not have Flags or LedState" + ) + raise Exception( + "File type shows Neurophotometrics newer version \ + data but column names does not have Flags or LedState" + ) + + num_ch, ch = check_channels(state) + indices_dict = dict() + for i in range(num_ch): + first_occurrence = np.where(state == ch[i])[0] + indices_dict[ch_name[i]] = np.arange(first_occurrence[0], df.shape[0], num_ch) + + df = df.drop(arr, axis=1) + + return df, indices_dict, num_ch + +# check flag consistency in neurophotometrics data +def check_channels(state): + state = state.astype(int) + unique_state = np.unique(state[2:12]) + if unique_state.shape[0] > 3: + logger.error( + "Looks like there are more than 3 channels in the file. Reading of these files\ + are not supported. Reach out to us if you get this error message." + ) + raise Exception( + "Looks like there are more than 3 channels in the file. Reading of these files\ + are not supported. Reach out to us if you get this error message." 
+ ) + + return unique_state.shape[0], unique_state + + +# function to decide NPM timestamps unit (seconds, ms or us) +def decide_ts_unit_for_npm(df, timestamp_column_name=None, time_unit=None, headless=False): + col_names = np.array(list(df.columns)) + col_names_ts = [""] + for name in col_names: + if "timestamp" in name.lower(): + col_names_ts.append(name) + + ts_unit = "seconds" + if len(col_names_ts) > 2: + # Headless path: auto-select column/unit without any UI + if headless: + if timestamp_column_name is not None: + assert ( + timestamp_column_name in col_names_ts + ), f"Provided timestamp_column_name '{timestamp_column_name}' not found in columns {col_names_ts[1:]}" + chosen = timestamp_column_name + else: + chosen = col_names_ts[1] + df.insert(1, "Timestamp", df[chosen]) + df = df.drop(col_names_ts[1:], axis=1) + valid_units = {"seconds", "milliseconds", "microseconds"} + ts_unit = time_unit if (isinstance(time_unit, str) and time_unit in valid_units) else "seconds" + return df, ts_unit + # def comboBoxSelected(event): + # logger.info(event.widget.get()) + + window = tk.Tk() + window.title("Select appropriate options for timestamps") + window.geometry("500x200") + holdComboboxValues = dict() + + timestamps_label = ttk.Label(window, text="Select which timestamps to use : ").grid( + row=0, column=1, pady=25, padx=25 + ) + holdComboboxValues["timestamps"] = StringVar() + timestamps_combo = ttk.Combobox(window, values=col_names_ts, textvariable=holdComboboxValues["timestamps"]) + timestamps_combo.grid(row=0, column=2, pady=25, padx=25) + timestamps_combo.current(0) + # timestamps_combo.bind("<>", comboBoxSelected) + + time_unit_label = ttk.Label(window, text="Select timestamps unit : ").grid(row=1, column=1, pady=25, padx=25) + holdComboboxValues["time_unit"] = StringVar() + time_unit_combo = ttk.Combobox( + window, values=["", "seconds", "milliseconds", "microseconds"], textvariable=holdComboboxValues["time_unit"] + ) + time_unit_combo.grid(row=1, column=2, pady=25, padx=25) + time_unit_combo.current(0) + # time_unit_combo.bind("<>", comboBoxSelected) + window.lift() + window.after(500, lambda: window.lift()) + window.mainloop() + + if holdComboboxValues["timestamps"].get(): + df.insert(1, "Timestamp", df[holdComboboxValues["timestamps"].get()]) + df = df.drop(col_names_ts[1:], axis=1) + else: + messagebox.showerror( + "All options not selected", + "All the options for timestamps \ + were not selected. Please select appropriate options", + ) + logger.error( + "All the options for timestamps \ + were not selected. Please select appropriate options" + ) + raise Exception( + "All the options for timestamps \ + were not selected. Please select appropriate options" + ) + if holdComboboxValues["time_unit"].get(): + if holdComboboxValues["time_unit"].get() == "seconds": + ts_unit = holdComboboxValues["time_unit"].get() + elif holdComboboxValues["time_unit"].get() == "milliseconds": + ts_unit = holdComboboxValues["time_unit"].get() + else: + ts_unit = holdComboboxValues["time_unit"].get() + else: + messagebox.showerror( + "All options not selected", + "All the options for timestamps \ + were not selected. Please select appropriate options", + ) + logger.error( + "All the options for timestamps \ + were not selected. Please select appropriate options" + ) + raise Exception( + "All the options for timestamps \ + were not selected. 
Please select appropriate options" + ) + else: + pass + + return df, ts_unit \ No newline at end of file diff --git a/src/guppy/saveStoresList.py b/src/guppy/saveStoresList.py index 26065e4..db9a4fc 100755 --- a/src/guppy/saveStoresList.py +++ b/src/guppy/saveStoresList.py @@ -26,6 +26,7 @@ from guppy.np_doric_csv_step2 import import_np_doric_csv from guppy.csv_step2 import import_csv_step2 from guppy.doric_step2 import import_doric +from guppy.npm_step2 import import_npm # hv.extension() pn.extension() @@ -588,7 +589,7 @@ def execute(inputParameters): try: for i in folderNames: filepath = os.path.join(inputParameters["abspath"], i) - modality = "doric" # TODO: ask for modality from the user + modality = "npm" # TODO: ask for modality from the user if modality == "tdt": data = readtsq(filepath) event_name, flag = [], [] @@ -598,6 +599,9 @@ def execute(inputParameters): elif modality == "doric": data = 0 event_name, flag = import_doric(filepath) + elif modality == "npm": + data = 0 + event_name, flag = import_npm(filepath, num_ch) else: raise ValueError("Modality not recognized. Please use 'tdt' or 'csv'.") From 7b36f64266a7b5c35b78310272f65fcecd6a6d3b Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 17 Nov 2025 13:50:35 -0800 Subject: [PATCH 008/150] Added modality selector to the GUI. --- src/guppy/saveStoresList.py | 4 ++-- src/guppy/savingInputParameters.py | 18 +++++++++++++++++- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/src/guppy/saveStoresList.py b/src/guppy/saveStoresList.py index db9a4fc..f9921f9 100755 --- a/src/guppy/saveStoresList.py +++ b/src/guppy/saveStoresList.py @@ -589,7 +589,7 @@ def execute(inputParameters): try: for i in folderNames: filepath = os.path.join(inputParameters["abspath"], i) - modality = "npm" # TODO: ask for modality from the user + modality = inputParameters.get("modality", "tdt") if modality == "tdt": data = readtsq(filepath) event_name, flag = [], [] @@ -603,7 +603,7 @@ def execute(inputParameters): data = 0 event_name, flag = import_npm(filepath, num_ch) else: - raise ValueError("Modality not recognized. Please use 'tdt' or 'csv'.") + raise ValueError("Modality not recognized. Please use 'tdt', 'csv', 'doric', or 'npm'.") saveStorenames(inputParameters, data, event_name, flag, filepath) logger.info("#" * 400) diff --git a/src/guppy/savingInputParameters.py b/src/guppy/savingInputParameters.py index cd515ab..b0a5feb 100644 --- a/src/guppy/savingInputParameters.py +++ b/src/guppy/savingInputParameters.py @@ -119,6 +119,21 @@ def readPBIncrementValues(progressBar): files_1 = pn.widgets.FileSelector(folder_path, name="folderNames", width=950) + explain_modality = pn.pane.Markdown( + """ + **Data Modality:** Select the type of data acquisition system used for your recordings: + - **tdt**: Tucker-Davis Technologies system + - **csv**: Generic CSV format + - **doric**: Doric Photometry system + - **npm**: Neurophotometrics system + """, + width=600, + ) + + modality_selector = pn.widgets.Select( + name="Data Modality", value="tdt", options=["tdt", "csv", "doric", "npm"], width=320 + ) + explain_time_artifacts = pn.pane.Markdown( """ - ***Number of cores :*** Number of cores used for analysis. 
Try to @@ -357,6 +372,7 @@ def getInputParameters(): inputParameters = { "abspath": abspath[0], "folderNames": files_1.value, + "modality": modality_selector.value, "numberOfCores": numberOfCores.value, "combine_data": combine_data.value, "isosbestic_control": isosbestic_control.value, @@ -538,7 +554,7 @@ def onclickpsth(event=None): psth_baseline_param = pn.Column(zscore_param_wd, psth_param_wd, baseline_param_wd, peak_param_wd) - widget = pn.Column(mark_down_1, files_1, pn.Row(individual_analysis_wd_2, psth_baseline_param)) + widget = pn.Column(mark_down_1, files_1, explain_modality, modality_selector, pn.Row(individual_analysis_wd_2, psth_baseline_param)) # file_selector = pn.WidgetBox(files_1) styles = dict(background="WhiteSmoke") From 100ad14058e8f07a48aee74083e6f04d46a027fa Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 17 Nov 2025 13:52:00 -0800 Subject: [PATCH 009/150] Added modality selector to the GUI. --- src/guppy/saveStoresList.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/guppy/saveStoresList.py b/src/guppy/saveStoresList.py index f9921f9..72dc604 100755 --- a/src/guppy/saveStoresList.py +++ b/src/guppy/saveStoresList.py @@ -583,13 +583,13 @@ def execute(inputParameters): folderNames = inputParameters["folderNames"] isosbestic_control = inputParameters["isosbestic_control"] num_ch = inputParameters["noChannels"] + modality = inputParameters.get("modality", "tdt") logger.info(folderNames) try: for i in folderNames: filepath = os.path.join(inputParameters["abspath"], i) - modality = inputParameters.get("modality", "tdt") if modality == "tdt": data = readtsq(filepath) event_name, flag = [], [] From ef978ec2cb8f7e51b9eacb8ce2d6f88bf73e01ea Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 17 Nov 2025 14:17:33 -0800 Subject: [PATCH 010/150] Added modality option to the api and tests --- src/guppy/testing/api.py | 7 +++++++ tests/test_step2.py | 17 +++++++++++++++-- tests/test_step3.py | 18 ++++++++++++++++-- tests/test_step4.py | 19 +++++++++++++++++-- tests/test_step5.py | 20 ++++++++++++++++++-- 5 files changed, 73 insertions(+), 8 deletions(-) diff --git a/src/guppy/testing/api.py b/src/guppy/testing/api.py index 587a022..0e16f23 100644 --- a/src/guppy/testing/api.py +++ b/src/guppy/testing/api.py @@ -68,6 +68,7 @@ def step2( base_dir: str, selected_folders: Iterable[str], storenames_map: dict[str, str], + modality: str = "tdt", npm_timestamp_column_name: str | None = None, npm_time_unit: str = "seconds", npm_split_events: bool = True, @@ -150,6 +151,9 @@ def step2( # Inject storenames mapping for headless execution input_params["storenames_map"] = dict(storenames_map) + # Inject modality + input_params["modality"] = modality + # Add npm parameters input_params["npm_timestamp_column_name"] = npm_timestamp_column_name input_params["npm_time_unit"] = npm_time_unit @@ -163,6 +167,7 @@ def step3( *, base_dir: str, selected_folders: Iterable[str], + modality: str = "tdt", npm_timestamp_column_name: str | None = None, npm_time_unit: str = "seconds", npm_split_events: bool = True, @@ -240,6 +245,7 @@ def step4( *, base_dir: str, selected_folders: Iterable[str], + modality: str = "tdt", npm_timestamp_column_name: str | None = None, npm_time_unit: str = "seconds", npm_split_events: bool = True, @@ -317,6 +323,7 @@ def step5( *, base_dir: str, selected_folders: Iterable[str], + modality: str = "tdt", npm_timestamp_column_name: str | None = None, npm_time_unit: str = "seconds", npm_split_events: bool = True, diff --git a/tests/test_step2.py 
b/tests/test_step2.py index 55181ab..34777be 100644 --- a/tests/test_step2.py +++ b/tests/test_step2.py @@ -10,7 +10,7 @@ @pytest.mark.parametrize( - "session_subdir, storenames_map", + "session_subdir, storenames_map, modality", [ ( "SampleData_csv/sample_data_csv_1", @@ -19,6 +19,7 @@ "Sample_Signal_Channel": "signal_region", "Sample_TTL": "ttl", }, + "csv", ), ( "SampleData_Doric/sample_doric_1", @@ -27,6 +28,7 @@ "AIn-2 - Raw": "signal_region", "DI--O-1": "ttl", }, + "doric", ), ( "SampleData_Doric/sample_doric_2", @@ -35,6 +37,7 @@ "AIn-1 - Dem (da)": "signal_region", "DI/O-1": "ttl", }, + "doric", ), ( "SampleData_Doric/sample_doric_3", @@ -43,6 +46,7 @@ "CAM1_EXC2/ROI01": "signal_region", "DigitalIO/CAM1": "ttl", }, + "doric", ), ( "SampleData_Doric/sample_doric_4", @@ -50,6 +54,7 @@ "Series0001/AIN01xAOUT01-LockIn": "control_region", "Series0001/AIN01xAOUT02-LockIn": "signal_region", }, + "doric", ), ( "SampleData_Doric/sample_doric_5", @@ -57,6 +62,7 @@ "Series0001/AIN01xAOUT01-LockIn": "control_region", "Series0001/AIN01xAOUT02-LockIn": "signal_region", }, + "doric", ), ( "SampleData_Clean/Photo_63_207-181030-103332", @@ -65,6 +71,7 @@ "Dv2A": "signal_dms", "PrtN": "port_entries_dms", }, + "tdt", ), ( "SampleData_with_artifacts/Photo_048_392-200728-121222", @@ -73,6 +80,7 @@ "Dv2A": "signal_dms", "PrtN": "port_entries_dms", }, + "tdt", ), # TODO: Add sampleData_NPM_1 after fixing Doric vs. NPM determination bug. ( @@ -81,6 +89,7 @@ "file0_chev6": "control_region", "file1_chev6": "signal_region", }, + "npm", ), ( "SampleData_Neurophotometrics/sampleData_NPM_3", @@ -89,6 +98,7 @@ "file0_chod3": "signal_region3", "event3": "ttl_region3", }, + "npm", ), ( "SampleData_Neurophotometrics/sampleData_NPM_4", @@ -97,6 +107,7 @@ "file0_chod1": "signal_region1", "eventTrue": "ttl_true_region1", }, + "npm", ), ( "SampleData_Neurophotometrics/sampleData_NPM_5", @@ -105,6 +116,7 @@ "file0_chod1": "signal_region1", "event0": "ttl_region1", }, + "npm", ), ], ids=[ @@ -122,7 +134,7 @@ "sample_npm_5", ], ) -def test_step2(tmp_path, session_subdir, storenames_map): +def test_step2(tmp_path, session_subdir, storenames_map, modality): """ Step 2 integration test (Save Storenames) using real sample data, isolated to a temporary workspace. 
For each dataset: @@ -170,6 +182,7 @@ def test_step2(tmp_path, session_subdir, storenames_map): base_dir=str(tmp_base), selected_folders=[str(session_copy)], storenames_map=storenames_map, + modality=modality, npm_timestamp_column_name=npm_timestamp_column_name, npm_time_unit=npm_time_unit, npm_split_events=npm_split_events, diff --git a/tests/test_step3.py b/tests/test_step3.py index 655fb10..d167585 100644 --- a/tests/test_step3.py +++ b/tests/test_step3.py @@ -20,7 +20,7 @@ def storenames_map(): @pytest.mark.parametrize( - "session_subdir, storenames_map", + "session_subdir, storenames_map, modality", [ ( "SampleData_csv/sample_data_csv_1", @@ -29,6 +29,7 @@ def storenames_map(): "Sample_Signal_Channel": "signal_region", "Sample_TTL": "ttl", }, + "csv", ), ( "SampleData_Doric/sample_doric_1", @@ -37,6 +38,7 @@ def storenames_map(): "AIn-2 - Raw": "signal_region", "DI--O-1": "ttl", }, + "doric", ), ( "SampleData_Doric/sample_doric_2", @@ -45,6 +47,7 @@ def storenames_map(): "AIn-1 - Dem (da)": "signal_region", "DI/O-1": "ttl", }, + "doric", ), ( "SampleData_Doric/sample_doric_3", @@ -53,6 +56,7 @@ def storenames_map(): "CAM1_EXC2/ROI01": "signal_region", "DigitalIO/CAM1": "ttl", }, + "doric", ), ( "SampleData_Doric/sample_doric_4", @@ -60,6 +64,7 @@ def storenames_map(): "Series0001/AIN01xAOUT01-LockIn": "control_region", "Series0001/AIN01xAOUT02-LockIn": "signal_region", }, + "doric", ), ( "SampleData_Doric/sample_doric_5", @@ -67,6 +72,7 @@ def storenames_map(): "Series0001/AIN01xAOUT01-LockIn": "control_region", "Series0001/AIN01xAOUT02-LockIn": "signal_region", }, + "doric", ), ( "SampleData_Clean/Photo_63_207-181030-103332", @@ -75,6 +81,7 @@ def storenames_map(): "Dv2A": "signal_dms", "PrtN": "port_entries_dms", }, + "tdt", ), ( "SampleData_with_artifacts/Photo_048_392-200728-121222", @@ -83,6 +90,7 @@ def storenames_map(): "Dv2A": "signal_dms", "PrtN": "port_entries_dms", }, + "tdt", ), ( "SampleData_Neurophotometrics/sampleData_NPM_2", @@ -90,6 +98,7 @@ def storenames_map(): "file0_chev6": "control_region", "file1_chev6": "signal_region", }, + "npm", ), ( "SampleData_Neurophotometrics/sampleData_NPM_3", @@ -98,6 +107,7 @@ def storenames_map(): "file0_chod3": "signal_region3", "event3": "ttl_region3", }, + "npm", ), ( "SampleData_Neurophotometrics/sampleData_NPM_4", @@ -106,6 +116,7 @@ def storenames_map(): "file0_chod1": "signal_region1", "eventTrue": "ttl_true_region1", }, + "npm", ), ( "SampleData_Neurophotometrics/sampleData_NPM_5", @@ -114,6 +125,7 @@ def storenames_map(): "file0_chod1": "signal_region1", "event0": "ttl_region1", }, + "npm", ), ], ids=[ @@ -131,7 +143,7 @@ def storenames_map(): "sample_npm_5", ], ) -def test_step3(tmp_path, storenames_map, session_subdir): +def test_step3(tmp_path, storenames_map, session_subdir, modality): """ Full integration test for Step 3 (Read Raw Data) using real CSV sample data, isolated to a temporary workspace to avoid mutating shared sample data. 
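# With the new `modality` keyword threaded through guppy.testing.api, a headless
# Step 2 -> Step 3 run can be sketched as below. This is a minimal sketch assuming
# the keyword-only signatures added in this patch; the workspace root, session
# folder, and store names are placeholders rather than paths from the sample data.
from guppy.testing.api import step2, step3

base_dir = "/data/guppy_workspace"                 # assumed workspace root
session = "/data/guppy_workspace/Photo_session_1"  # assumed session folder

step2(
    base_dir=base_dir,
    selected_folders=[session],
    storenames_map={"Dv1A": "control_dms", "Dv2A": "signal_dms", "PrtN": "port_entries_dms"},
    modality="tdt",
)
step3(
    base_dir=base_dir,
    selected_folders=[session],
    modality="tdt",
)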
@@ -182,6 +194,7 @@ def test_step3(tmp_path, storenames_map, session_subdir): base_dir=str(tmp_base), selected_folders=[str(session_copy)], storenames_map=storenames_map, + modality=modality, npm_timestamp_column_name=npm_timestamp_column_name, npm_time_unit=npm_time_unit, npm_split_events=npm_split_events, @@ -191,6 +204,7 @@ def test_step3(tmp_path, storenames_map, session_subdir): step3( base_dir=str(tmp_base), selected_folders=[str(session_copy)], + modality=modality, npm_timestamp_column_name=npm_timestamp_column_name, npm_time_unit=npm_time_unit, npm_split_events=npm_split_events, diff --git a/tests/test_step4.py b/tests/test_step4.py index 9a2e9bb..80c2d3f 100644 --- a/tests/test_step4.py +++ b/tests/test_step4.py @@ -10,7 +10,7 @@ @pytest.mark.parametrize( - "session_subdir, storenames_map, expected_region, expected_ttl", + "session_subdir, storenames_map, expected_region, expected_ttl, modality", [ ( "SampleData_csv/sample_data_csv_1", @@ -21,6 +21,7 @@ }, "region", "ttl", + "csv", ), ( "SampleData_Doric/sample_doric_1", @@ -31,6 +32,7 @@ }, "region", "ttl", + "doric", ), ( "SampleData_Doric/sample_doric_2", @@ -41,6 +43,7 @@ }, "region", "ttl", + "doric", ), ( "SampleData_Doric/sample_doric_3", @@ -51,6 +54,7 @@ }, "region", "ttl", + "doric", ), ( "SampleData_Doric/sample_doric_4", @@ -60,6 +64,7 @@ }, "region", None, + "doric", ), ( "SampleData_Doric/sample_doric_5", @@ -69,6 +74,7 @@ }, "region", None, + "doric", ), ( "SampleData_Clean/Photo_63_207-181030-103332", @@ -79,6 +85,7 @@ }, "dms", "port_entries_dms", + "tdt", ), ( "SampleData_with_artifacts/Photo_048_392-200728-121222", @@ -89,6 +96,7 @@ }, "dms", "port_entries_dms", + "tdt", ), ( "SampleData_Neurophotometrics/sampleData_NPM_2", @@ -98,6 +106,7 @@ }, "region", None, + "npm", ), ( "SampleData_Neurophotometrics/sampleData_NPM_3", @@ -108,6 +117,7 @@ }, "region3", "ttl_region3", + "npm", ), ( "SampleData_Neurophotometrics/sampleData_NPM_4", @@ -118,6 +128,7 @@ }, "region1", "ttl_true_region1", + "npm", ), ( "SampleData_Neurophotometrics/sampleData_NPM_5", @@ -128,6 +139,7 @@ }, "region1", "ttl_region1", + "npm", ), ], ids=[ @@ -146,7 +158,7 @@ ], ) @pytest.mark.filterwarnings("ignore::UserWarning") -def test_step4(tmp_path, monkeypatch, session_subdir, storenames_map, expected_region, expected_ttl): +def test_step4(tmp_path, monkeypatch, session_subdir, storenames_map, expected_region, expected_ttl, modality): """ Full integration test for Step 4 (Extract timestamps and signal) using real CSV sample data, isolated to a temporary workspace to avoid mutating shared sample data. 
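# Each parametrized case now carries the modality as its final element, and the
# pytest `ids` list must stay the same length as the case list. A minimal
# self-contained sketch of the same pattern; the session folder, store names, and
# expected values below are invented for illustration, not real sample data.
import pytest

@pytest.mark.parametrize(
    "session_subdir, storenames_map, expected_region, expected_ttl, modality",
    [
        (
            "SampleData_TDT/hypothetical_session",
            {"Dv1A": "control_nac", "Dv2A": "signal_nac", "PrtN": "port_entries_nac"},
            "nac",
            "port_entries_nac",
            "tdt",
        ),
    ],
    ids=["hypothetical_tdt_session"],
)
def test_modality_is_threaded_through(session_subdir, storenames_map, expected_region, expected_ttl, modality):
    assert modality in {"tdt", "csv", "doric", "npm"}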
@@ -202,6 +214,7 @@ def test_step4(tmp_path, monkeypatch, session_subdir, storenames_map, expected_r base_dir=str(tmp_base), selected_folders=[str(session_copy)], storenames_map=storenames_map, + modality=modality, npm_timestamp_column_name=npm_timestamp_column_name, npm_time_unit=npm_time_unit, npm_split_events=npm_split_events, @@ -211,6 +224,7 @@ def test_step4(tmp_path, monkeypatch, session_subdir, storenames_map, expected_r step3( base_dir=str(tmp_base), selected_folders=[str(session_copy)], + modality=modality, npm_timestamp_column_name=npm_timestamp_column_name, npm_time_unit=npm_time_unit, npm_split_events=npm_split_events, @@ -220,6 +234,7 @@ def test_step4(tmp_path, monkeypatch, session_subdir, storenames_map, expected_r step4( base_dir=str(tmp_base), selected_folders=[str(session_copy)], + modality=modality, npm_timestamp_column_name=npm_timestamp_column_name, npm_time_unit=npm_time_unit, npm_split_events=npm_split_events, diff --git a/tests/test_step5.py b/tests/test_step5.py index 5593ee0..d2de1f5 100644 --- a/tests/test_step5.py +++ b/tests/test_step5.py @@ -10,7 +10,7 @@ @pytest.mark.parametrize( - "session_subdir, storenames_map, expected_region, expected_ttl", + "session_subdir, storenames_map, expected_region, expected_ttl, modality", [ ( "SampleData_csv/sample_data_csv_1", @@ -21,6 +21,7 @@ }, "region", "ttl", + "csv", ), ( "SampleData_Doric/sample_doric_1", @@ -31,6 +32,7 @@ }, "region", "ttl", + "doric", ), ( "SampleData_Doric/sample_doric_2", @@ -41,6 +43,7 @@ }, "region", "ttl", + "doric", ), ( "SampleData_Doric/sample_doric_3", @@ -51,6 +54,7 @@ }, "region", "ttl", + "doric", ), ( "SampleData_Doric/sample_doric_4", @@ -60,6 +64,7 @@ }, "region", None, + "doric", ), ( "SampleData_Doric/sample_doric_5", @@ -69,6 +74,7 @@ }, "region", None, + "doric", ), ( "SampleData_Clean/Photo_63_207-181030-103332", @@ -79,6 +85,7 @@ }, "dms", "port_entries_dms", + "tdt", ), ( "SampleData_with_artifacts/Photo_048_392-200728-121222", @@ -89,6 +96,7 @@ }, "dms", "port_entries_dms", + "tdt", ), ( "SampleData_Neurophotometrics/sampleData_NPM_2", @@ -98,6 +106,7 @@ }, "region", None, + "npm", ), ( "SampleData_Neurophotometrics/sampleData_NPM_3", @@ -108,6 +117,7 @@ }, "region3", "ttl_region3", + "npm", ), ( "SampleData_Neurophotometrics/sampleData_NPM_4", @@ -118,6 +128,7 @@ }, "region1", "ttl_true_region1", + "npm", ), ( "SampleData_Neurophotometrics/sampleData_NPM_5", @@ -128,6 +139,7 @@ }, "region1", "ttl_region1", + "npm", ), ], ids=[ @@ -146,7 +158,7 @@ ], ) @pytest.mark.filterwarnings("ignore::UserWarning") -def test_step5(tmp_path, monkeypatch, session_subdir, storenames_map, expected_region, expected_ttl): +def test_step5(tmp_path, monkeypatch, session_subdir, storenames_map, expected_region, expected_ttl, modality): """ Full integration test for Step 5 (PSTH Computation) using real CSV sample data, isolated to a temporary workspace to avoid mutating shared sample data. 
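# For a Neurophotometrics session the same headless chain runs Steps 2 through 5
# and threads the NPM-specific options alongside the modality. A sketch under the
# keyword-only signatures visible in this patch; the workspace path is a placeholder
# and the store names mirror the sampleData_NPM_3 case above.
from guppy.testing.api import step2, step3, step4, step5

common = dict(
    base_dir="/data/npm_workspace",                             # assumed workspace root
    selected_folders=["/data/npm_workspace/sampleData_NPM_3"],  # assumed session copy
    modality="npm",
    npm_timestamp_column_name=None,  # headless path then falls back to the first Timestamp column
    npm_time_unit="seconds",
    npm_split_events=True,
)

step2(
    storenames_map={"file0_chev3": "control_region3", "file0_chod3": "signal_region3", "event3": "ttl_region3"},
    **common,
)
step3(**common)
step4(**common)
step5(**common)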
@@ -204,6 +216,7 @@ def test_step5(tmp_path, monkeypatch, session_subdir, storenames_map, expected_r base_dir=str(tmp_base), selected_folders=[str(session_copy)], storenames_map=storenames_map, + modality=modality, npm_timestamp_column_name=npm_timestamp_column_name, npm_time_unit=npm_time_unit, npm_split_events=npm_split_events, @@ -213,6 +226,7 @@ def test_step5(tmp_path, monkeypatch, session_subdir, storenames_map, expected_r step3( base_dir=str(tmp_base), selected_folders=[str(session_copy)], + modality=modality, npm_timestamp_column_name=npm_timestamp_column_name, npm_time_unit=npm_time_unit, npm_split_events=npm_split_events, @@ -222,6 +236,7 @@ def test_step5(tmp_path, monkeypatch, session_subdir, storenames_map, expected_r step4( base_dir=str(tmp_base), selected_folders=[str(session_copy)], + modality=modality, npm_timestamp_column_name=npm_timestamp_column_name, npm_time_unit=npm_time_unit, npm_split_events=npm_split_events, @@ -231,6 +246,7 @@ def test_step5(tmp_path, monkeypatch, session_subdir, storenames_map, expected_r step5( base_dir=str(tmp_base), selected_folders=[str(session_copy)], + modality=modality, npm_timestamp_column_name=npm_timestamp_column_name, npm_time_unit=npm_time_unit, npm_split_events=npm_split_events, From 6589139f61a55ac673c5867e165c3a5a4cb3d657 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Tue, 18 Nov 2025 11:15:21 -0800 Subject: [PATCH 011/150] Removed intermediate np_doric_csv_step2 module. --- src/guppy/np_doric_csv_step2.py | 523 -------------------------------- 1 file changed, 523 deletions(-) delete mode 100644 src/guppy/np_doric_csv_step2.py diff --git a/src/guppy/np_doric_csv_step2.py b/src/guppy/np_doric_csv_step2.py deleted file mode 100644 index d06dcc1..0000000 --- a/src/guppy/np_doric_csv_step2.py +++ /dev/null @@ -1,523 +0,0 @@ -import glob -import logging -import os -import tkinter as tk -from tkinter import StringVar, messagebox, ttk - -import h5py -import numpy as np -import pandas as pd -import panel as pn - -pn.extension() - -logger = logging.getLogger(__name__) - -# function to see if there are 'csv' files present -# and recognize type of 'csv' files either from -# Neurophotometrics, Doric systems or custom made 'csv' files -# and read data accordingly -def import_np_doric_csv(filepath, isosbestic_control, num_ch, inputParameters=None): - - logger.debug("If it exists, importing either NPM or Doric or csv file based on the structure of file") - # Headless configuration (used to avoid any UI prompts when running tests) - headless = bool(os.environ.get("GUPPY_BASE_DIR")) - npm_timestamp_column_name = None - npm_time_unit = None - npm_split_events = None - if isinstance(inputParameters, dict): - npm_timestamp_column_name = inputParameters.get("npm_timestamp_column_name") - npm_time_unit = inputParameters.get("npm_time_unit", "seconds") - npm_split_events = inputParameters.get("npm_split_events", True) - path = sorted(glob.glob(os.path.join(filepath, "*.csv"))) + sorted(glob.glob(os.path.join(filepath, "*.doric"))) - path_chev = glob.glob(os.path.join(filepath, "*chev*")) - path_chod = glob.glob(os.path.join(filepath, "*chod*")) - path_chpr = glob.glob(os.path.join(filepath, "*chpr*")) - path_event = glob.glob(os.path.join(filepath, "event*")) - # path_sig = glob.glob(os.path.join(filepath, 'sig*')) - path_chev_chod_event = path_chev + path_chod + path_event + path_chpr - - path = sorted(list(set(path) - set(path_chev_chod_event))) - flag = "None" - event_from_filename = [] - flag_arr = [] - for i in range(len(path)): - dirname = 
os.path.dirname(path[i]) - ext = os.path.basename(path[i]).split(".")[-1] - if ext == "doric": - key_names = read_doric(path[i]) - event_from_filename.extend(key_names) - flag = "doric_doric" - else: - df = pd.read_csv(path[i], header=None, nrows=2, index_col=False, dtype=str) - df = df.dropna(axis=1, how="all") - df_arr = np.array(df).flatten() - check_all_str = [] - for element in df_arr: - try: - float(element) - except: - check_all_str.append(i) - if len(check_all_str) == len(df_arr): - df = pd.read_csv(path[i], header=1, index_col=False, nrows=10) - df = df.drop(["Time(s)"], axis=1) - event_from_filename.extend(list(df.columns)) - flag = "doric_csv" - logger.info(flag) - else: - df = pd.read_csv(path[i], index_col=False) - # with warnings.catch_warnings(): - # warnings.simplefilter("error") - # try: - # df = pd.read_csv(path[i], index_col=False, dtype=float) - # except: - # df = pd.read_csv(path[i], header=1, index_col=False, nrows=10) # to make process faster reading just first 10 rows - # df = df.drop(['Time(s)'], axis=1) - # event_from_filename.extend(list(df.columns)) - # flag = 'doric_csv' - if flag == "doric_csv" or flag == "doric_doric": - continue - else: - colnames, value = check_header(df) - # logger.info(len(colnames), len(value)) - - # check dataframe structure and read data accordingly - if len(value) > 0: - columns_isstr = False - df = pd.read_csv(path[i], header=None) - cols = np.array(list(df.columns), dtype=str) - else: - df = df - columns_isstr = True - cols = np.array(list(df.columns), dtype=str) - # check the structure of dataframe and assign flag to the type of file - if len(cols) == 1: - if cols[0].lower() != "timestamps": - logger.error("\033[1m" + "Column name should be timestamps (all lower-cases)" + "\033[0m") - raise Exception("\033[1m" + "Column name should be timestamps (all lower-cases)" + "\033[0m") - else: - flag = "event_csv" - elif len(cols) == 3: - arr1 = np.array(["timestamps", "data", "sampling_rate"]) - arr2 = np.char.lower(np.array(cols)) - if (np.sort(arr1) == np.sort(arr2)).all() == False: - logger.error( - "\033[1m" - + "Column names should be timestamps, data and sampling_rate (all lower-cases)" - + "\033[0m" - ) - raise Exception( - "\033[1m" - + "Column names should be timestamps, data and sampling_rate (all lower-cases)" - + "\033[0m" - ) - else: - flag = "data_csv" - elif len(cols) == 2: - flag = "event_or_data_np" - elif len(cols) >= 2: - flag = "data_np" - else: - logger.error("Number of columns in csv file does not make sense.") - raise Exception("Number of columns in csv file does not make sense.") - - if columns_isstr == True and ( - "flags" in np.char.lower(np.array(cols)) or "ledstate" in np.char.lower(np.array(cols)) - ): - flag = flag + "_v2" - else: - flag = flag - - # used assigned flags to process the files and read the data - if flag == "event_or_data_np": - arr = list(df.iloc[:, 1]) - check_float = [True for i in arr if isinstance(i, float)] - if len(arr) == len(check_float) and columns_isstr == False: - flag = "data_np" - elif columns_isstr == True and ("value" in np.char.lower(np.array(cols))): - flag = "event_np" - else: - flag = "event_np" - else: - pass - - flag_arr.append(flag) - logger.info(flag) - if flag == "event_csv" or flag == "data_csv": - name = os.path.basename(path[i]).split(".")[0] - event_from_filename.append(name) - elif flag == "data_np": - file = f"file{str(i)}_" - df, indices_dict, num_channels = decide_indices(file, df, flag, num_ch) - keys = list(indices_dict.keys()) - for k in range(len(keys)): - for 
j in range(df.shape[1]): - if j == 0: - timestamps = df.iloc[:, j][indices_dict[keys[k]]] - # timestamps_odd = df.iloc[:,j][odd_indices] - else: - d = dict() - d["timestamps"] = timestamps - d["data"] = df.iloc[:, j][indices_dict[keys[k]]] - - df_ch = pd.DataFrame(d) - df_ch.to_csv(os.path.join(dirname, keys[k] + str(j) + ".csv"), index=False) - event_from_filename.append(keys[k] + str(j)) - - elif flag == "event_np": - type_val = np.array(df.iloc[:, 1]) - type_val_unique = np.unique(type_val) - if headless: - response = 1 if bool(npm_split_events) else 0 - else: - window = tk.Tk() - if len(type_val_unique) > 1: - response = messagebox.askyesno( - "Multiple event TTLs", - "Based on the TTL file,\ - it looks like TTLs \ - belongs to multiple behavior type. \ - Do you want to create multiple files for each \ - behavior type ?", - ) - else: - response = 0 - window.destroy() - if response == 1: - timestamps = np.array(df.iloc[:, 0]) - for j in range(len(type_val_unique)): - idx = np.where(type_val == type_val_unique[j]) - d = dict() - d["timestamps"] = timestamps[idx] - df_new = pd.DataFrame(d) - df_new.to_csv(os.path.join(dirname, "event" + str(type_val_unique[j]) + ".csv"), index=False) - event_from_filename.append("event" + str(type_val_unique[j])) - else: - timestamps = np.array(df.iloc[:, 0]) - d = dict() - d["timestamps"] = timestamps - df_new = pd.DataFrame(d) - df_new.to_csv(os.path.join(dirname, "event" + str(0) + ".csv"), index=False) - event_from_filename.append("event" + str(0)) - else: - file = f"file{str(i)}_" - df, ts_unit = decide_ts_unit_for_npm( - df, timestamp_column_name=npm_timestamp_column_name, time_unit=npm_time_unit, headless=headless - ) - df, indices_dict, num_channels = decide_indices(file, df, flag) - keys = list(indices_dict.keys()) - for k in range(len(keys)): - for j in range(df.shape[1]): - if j == 0: - timestamps = df.iloc[:, j][indices_dict[keys[k]]] - # timestamps_odd = df.iloc[:,j][odd_indices] - else: - d = dict() - d["timestamps"] = timestamps - d["data"] = df.iloc[:, j][indices_dict[keys[k]]] - - df_ch = pd.DataFrame(d) - df_ch.to_csv(os.path.join(dirname, keys[k] + str(j) + ".csv"), index=False) - event_from_filename.append(keys[k] + str(j)) - - path_chev = glob.glob(os.path.join(filepath, "*chev*")) - path_chod = glob.glob(os.path.join(filepath, "*chod*")) - path_chpr = glob.glob(os.path.join(filepath, "*chpr*")) - path_event = glob.glob(os.path.join(filepath, "event*")) - # path_sig = glob.glob(os.path.join(filepath, 'sig*')) - path_chev_chod_chpr = [path_chev, path_chod, path_chpr] - if ( - ("data_np_v2" in flag_arr or "data_np" in flag_arr) - and ("event_np" in flag_arr) - and (i == len(path) - 1) - ) or ( - ("data_np_v2" in flag_arr or "data_np" in flag_arr) and (i == len(path) - 1) - ): # i==len(path)-1 and or 'event_np' in flag - num_path_chev, num_path_chod, num_path_chpr = len(path_chev), len(path_chod), len(path_chpr) - arr_len, no_ch = [], [] - for i in range(len(path_chev_chod_chpr)): - if len(path_chev_chod_chpr[i]) > 0: - arr_len.append(len(path_chev_chod_chpr[i])) - else: - continue - - unique_arr_len = np.unique(np.array(arr_len)) - if "data_np_v2" in flag_arr: - if ts_unit == "seconds": - divisor = 1 - elif ts_unit == "milliseconds": - divisor = 1e3 - else: - divisor = 1e6 - else: - divisor = 1000 - - for j in range(len(path_event)): - df_event = pd.read_csv(path_event[j]) - df_chev = pd.read_csv(path_chev[0]) - df_event["timestamps"] = (df_event["timestamps"] - df_chev["timestamps"][0]) / divisor - df_event.to_csv(path_event[j], 
index=False) - if unique_arr_len.shape[0] == 1: - for j in range(len(path_chev)): - if file + "chev" in indices_dict.keys(): - df_chev = pd.read_csv(path_chev[j]) - df_chev["timestamps"] = (df_chev["timestamps"] - df_chev["timestamps"][0]) / divisor - df_chev["sampling_rate"] = np.full(df_chev.shape[0], np.nan) - df_chev.at[0, "sampling_rate"] = df_chev.shape[0] / ( - df_chev["timestamps"].iloc[-1] - df_chev["timestamps"].iloc[0] - ) - df_chev.to_csv(path_chev[j], index=False) - - if file + "chod" in indices_dict.keys(): - df_chod = pd.read_csv(path_chod[j]) - df_chod["timestamps"] = df_chev["timestamps"] - df_chod["sampling_rate"] = np.full(df_chod.shape[0], np.nan) - df_chod.at[0, "sampling_rate"] = df_chev["sampling_rate"][0] - df_chod.to_csv(path_chod[j], index=False) - - if file + "chpr" in indices_dict.keys(): - df_chpr = pd.read_csv(path_chpr[j]) - df_chpr["timestamps"] = df_chev["timestamps"] - df_chpr["sampling_rate"] = np.full(df_chpr.shape[0], np.nan) - df_chpr.at[0, "sampling_rate"] = df_chev["sampling_rate"][0] - df_chpr.to_csv(path_chpr[j], index=False) - else: - logger.error("Number of channels should be same for all regions.") - raise Exception("Number of channels should be same for all regions.") - else: - pass - logger.info("Importing of either NPM or Doric or csv file is done.") - return event_from_filename, flag_arr - -# ---------------------------------------------------------------------------------------------------------------------- -# Functions that import_np_doric_csv uses -# ---------------------------------------------------------------------------------------------------------------------- - -def read_doric(filepath): - with h5py.File(filepath, "r") as f: - if "Traces" in list(f.keys()): - keys = access_keys_doricV1(f) - elif list(f.keys()) == ["Configurations", "DataAcquisition"]: - keys = access_keys_doricV6(f) - - return keys - - -def check_header(df): - arr = list(df.columns) - check_float = [] - for i in arr: - try: - check_float.append(float(i)) - except: - pass - - return arr, check_float - -# function to decide indices of interleaved channels -# in neurophotometrics data -def decide_indices(file, df, flag, num_ch=2): - ch_name = [file + "chev", file + "chod", file + "chpr"] - if len(ch_name) < num_ch: - logger.error( - "Number of channels parameters in Input Parameters GUI is more than 3. \ - Looks like there are more than 3 channels in the file. Reading of these files\ - are not supported. Reach out to us if you get this error message." - ) - raise Exception( - "Number of channels parameters in Input Parameters GUI is more than 3. \ - Looks like there are more than 3 channels in the file. Reading of these files\ - are not supported. Reach out to us if you get this error message." 
- ) - if flag == "data_np": - indices_dict = dict() - for i in range(num_ch): - indices_dict[ch_name[i]] = np.arange(i, df.shape[0], num_ch) - - else: - cols = np.array(list(df.columns)) - if "flags" in np.char.lower(np.array(cols)): - arr = ["FrameCounter", "Flags"] - state = np.array(df["Flags"]) - elif "ledstate" in np.char.lower(np.array(cols)): - arr = ["FrameCounter", "LedState"] - state = np.array(df["LedState"]) - else: - logger.error( - "File type shows Neurophotometrics newer version \ - data but column names does not have Flags or LedState" - ) - raise Exception( - "File type shows Neurophotometrics newer version \ - data but column names does not have Flags or LedState" - ) - - num_ch, ch = check_channels(state) - indices_dict = dict() - for i in range(num_ch): - first_occurrence = np.where(state == ch[i])[0] - indices_dict[ch_name[i]] = np.arange(first_occurrence[0], df.shape[0], num_ch) - - df = df.drop(arr, axis=1) - - return df, indices_dict, num_ch - - -# function to decide NPM timestamps unit (seconds, ms or us) -def decide_ts_unit_for_npm(df, timestamp_column_name=None, time_unit=None, headless=False): - col_names = np.array(list(df.columns)) - col_names_ts = [""] - for name in col_names: - if "timestamp" in name.lower(): - col_names_ts.append(name) - - ts_unit = "seconds" - if len(col_names_ts) > 2: - # Headless path: auto-select column/unit without any UI - if headless: - if timestamp_column_name is not None: - assert ( - timestamp_column_name in col_names_ts - ), f"Provided timestamp_column_name '{timestamp_column_name}' not found in columns {col_names_ts[1:]}" - chosen = timestamp_column_name - else: - chosen = col_names_ts[1] - df.insert(1, "Timestamp", df[chosen]) - df = df.drop(col_names_ts[1:], axis=1) - valid_units = {"seconds", "milliseconds", "microseconds"} - ts_unit = time_unit if (isinstance(time_unit, str) and time_unit in valid_units) else "seconds" - return df, ts_unit - # def comboBoxSelected(event): - # logger.info(event.widget.get()) - - window = tk.Tk() - window.title("Select appropriate options for timestamps") - window.geometry("500x200") - holdComboboxValues = dict() - - timestamps_label = ttk.Label(window, text="Select which timestamps to use : ").grid( - row=0, column=1, pady=25, padx=25 - ) - holdComboboxValues["timestamps"] = StringVar() - timestamps_combo = ttk.Combobox(window, values=col_names_ts, textvariable=holdComboboxValues["timestamps"]) - timestamps_combo.grid(row=0, column=2, pady=25, padx=25) - timestamps_combo.current(0) - # timestamps_combo.bind("<>", comboBoxSelected) - - time_unit_label = ttk.Label(window, text="Select timestamps unit : ").grid(row=1, column=1, pady=25, padx=25) - holdComboboxValues["time_unit"] = StringVar() - time_unit_combo = ttk.Combobox( - window, values=["", "seconds", "milliseconds", "microseconds"], textvariable=holdComboboxValues["time_unit"] - ) - time_unit_combo.grid(row=1, column=2, pady=25, padx=25) - time_unit_combo.current(0) - # time_unit_combo.bind("<>", comboBoxSelected) - window.lift() - window.after(500, lambda: window.lift()) - window.mainloop() - - if holdComboboxValues["timestamps"].get(): - df.insert(1, "Timestamp", df[holdComboboxValues["timestamps"].get()]) - df = df.drop(col_names_ts[1:], axis=1) - else: - messagebox.showerror( - "All options not selected", - "All the options for timestamps \ - were not selected. Please select appropriate options", - ) - logger.error( - "All the options for timestamps \ - were not selected. 
Please select appropriate options" - ) - raise Exception( - "All the options for timestamps \ - were not selected. Please select appropriate options" - ) - if holdComboboxValues["time_unit"].get(): - if holdComboboxValues["time_unit"].get() == "seconds": - ts_unit = holdComboboxValues["time_unit"].get() - elif holdComboboxValues["time_unit"].get() == "milliseconds": - ts_unit = holdComboboxValues["time_unit"].get() - else: - ts_unit = holdComboboxValues["time_unit"].get() - else: - messagebox.showerror( - "All options not selected", - "All the options for timestamps \ - were not selected. Please select appropriate options", - ) - logger.error( - "All the options for timestamps \ - were not selected. Please select appropriate options" - ) - raise Exception( - "All the options for timestamps \ - were not selected. Please select appropriate options" - ) - else: - pass - - return df, ts_unit - - -# ---------------------------------------------------------------------------------------------------------------------- -# Functions that read_doric uses -# ---------------------------------------------------------------------------------------------------------------------- - -def access_keys_doricV6(doric_file): - data = [doric_file["DataAcquisition"]] - res = [] - while len(data) != 0: - members = len(data) - while members != 0: - members -= 1 - data, last_element = separate_last_element(data) - if isinstance(last_element, h5py.Dataset) and not last_element.name.endswith("/Time"): - res.append(last_element.name) - elif isinstance(last_element, h5py.Group): - data.extend(reversed([last_element[k] for k in last_element.keys()])) - - keys = [] - for element in res: - sep_values = element.split("/") - if sep_values[-1] == "Values": - keys.append(f"{sep_values[-3]}/{sep_values[-2]}") - else: - keys.append(f"{sep_values[-2]}/{sep_values[-1]}") - - return keys - - -def access_keys_doricV1(doric_file): - keys = list(doric_file["Traces"]["Console"].keys()) - keys.remove("Time(s)") - - return keys - -# ---------------------------------------------------------------------------------------------------------------------- -# Functions that decide_indices uses -# ---------------------------------------------------------------------------------------------------------------------- - -# check flag consistency in neurophotometrics data -def check_channels(state): - state = state.astype(int) - unique_state = np.unique(state[2:12]) - if unique_state.shape[0] > 3: - logger.error( - "Looks like there are more than 3 channels in the file. Reading of these files\ - are not supported. Reach out to us if you get this error message." - ) - raise Exception( - "Looks like there are more than 3 channels in the file. Reading of these files\ - are not supported. Reach out to us if you get this error message." - ) - - return unique_state.shape[0], unique_state - - -# ---------------------------------------------------------------------------------------------------------------------- -# Functions that access_keys_doricV6 uses -# ---------------------------------------------------------------------------------------------------------------------- -def separate_last_element(arr): - l = arr[-1] - return arr[:-1], l \ No newline at end of file From e7ac4d8982da9383b1d50dccfde9c50f7171e90c Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Tue, 18 Nov 2025 11:15:58 -0800 Subject: [PATCH 012/150] Split tdt_step3.py off from read_raw_data.py. 
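This commit moves the shared HDF5 writer into guppy.common_step3 and the TDT tev reader into guppy.tdt_step3, with readTevTsq.py importing both from their new homes, as the diff below shows. A minimal usage sketch of the relocated writer, assuming the signature shown in the diff; the store name and output directory are placeholders:

import numpy as np

from guppy.common_step3 import write_hdf5

# Writes a 'timestamps' dataset into <outputPath>/PrtN.hdf5, creating the file if
# needed and resizing/overwriting the dataset if it already exists; '/' and '\'
# in the store name are replaced with '_' when building the filename.
write_hdf5(np.array([12.5, 48.0, 96.2]), "PrtN", "/data/session1_output_1", "timestamps")

readTevTsq.py likewise imports the TDT-specific entry point, execute_readtev, from guppy.tdt_step3.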
--- src/guppy/common_step3.py | 51 +++++++++ src/guppy/readTevTsq.py | 204 +----------------------------------- src/guppy/saveStoresList.py | 1 - src/guppy/tdt_step3.py | 183 ++++++++++++++++++++++++++++++++ 4 files changed, 237 insertions(+), 202 deletions(-) create mode 100644 src/guppy/common_step3.py create mode 100644 src/guppy/tdt_step3.py diff --git a/src/guppy/common_step3.py b/src/guppy/common_step3.py new file mode 100644 index 0000000..4ea5c95 --- /dev/null +++ b/src/guppy/common_step3.py @@ -0,0 +1,51 @@ +import glob +import json +import logging +import multiprocessing as mp +import os +import re +import sys +import time +import warnings +from itertools import repeat + +import h5py +import numpy as np +import pandas as pd +from numpy import float32, float64, int32, int64, uint16 + +logger = logging.getLogger(__name__) + +# function to write data to a hdf5 file +def write_hdf5(data, event, filepath, key): + + # replacing \\ or / in storenames with _ (to avoid errors while saving data) + event = event.replace("\\", "_") + event = event.replace("/", "_") + + op = os.path.join(filepath, event + ".hdf5") + + # if file does not exist create a new file + if not os.path.exists(op): + with h5py.File(op, "w") as f: + if type(data) is np.ndarray: + f.create_dataset(key, data=data, maxshape=(None,), chunks=True) + else: + f.create_dataset(key, data=data) + + # if file already exists, append data to it or add a new key to it + else: + with h5py.File(op, "r+") as f: + if key in list(f.keys()): + if type(data) is np.ndarray: + f[key].resize(data.shape) + arr = f[key] + arr[:] = data + else: + arr = f[key] + arr = data + else: + if type(data) is np.ndarray: + f.create_dataset(key, data=data, maxshape=(None,), chunks=True) + else: + f.create_dataset(key, data=data) \ No newline at end of file diff --git a/src/guppy/readTevTsq.py b/src/guppy/readTevTsq.py index 6deb3b1..fe16add 100755 --- a/src/guppy/readTevTsq.py +++ b/src/guppy/readTevTsq.py @@ -14,6 +14,9 @@ import pandas as pd from numpy import float32, float64, int32, int64, uint16 +from guppy.common_step3 import write_hdf5 +from guppy.tdt_step3 import execute_readtev + logger = logging.getLogger(__name__) @@ -91,47 +94,6 @@ def check_doric(filepath): return flag_arr[0] -# check if a particular element is there in an array or not -def ismember(arr, element): - res = [1 if i == element else 0 for i in arr] - return np.asarray(res) - - -# function to write data to a hdf5 file -def write_hdf5(data, event, filepath, key): - - # replacing \\ or / in storenames with _ (to avoid errors while saving data) - event = event.replace("\\", "_") - event = event.replace("/", "_") - - op = os.path.join(filepath, event + ".hdf5") - - # if file does not exist create a new file - if not os.path.exists(op): - with h5py.File(op, "w") as f: - if type(data) is np.ndarray: - f.create_dataset(key, data=data, maxshape=(None,), chunks=True) - else: - f.create_dataset(key, data=data) - - # if file already exists, append data to it or add a new key to it - else: - with h5py.File(op, "r+") as f: - if key in list(f.keys()): - if type(data) is np.ndarray: - f[key].resize(data.shape) - arr = f[key] - arr[:] = data - else: - arr = f[key] - arr = data - else: - if type(data) is np.ndarray: - f.create_dataset(key, data=data, maxshape=(None,), chunks=True) - else: - f.create_dataset(key, data=data) - - # function to read event timestamps csv file. 
def import_csv(filepath, event, outputPath): logger.debug("\033[1m" + "Trying to read data for {} from csv file.".format(event) + "\033[0m") @@ -177,166 +139,6 @@ def import_csv(filepath, event, outputPath): return data, key -# function to save data read from tev file to hdf5 file -def save_dict_to_hdf5(S, event, outputPath): - write_hdf5(S["storename"], event, outputPath, "storename") - write_hdf5(S["sampling_rate"], event, outputPath, "sampling_rate") - write_hdf5(S["timestamps"], event, outputPath, "timestamps") - - write_hdf5(S["data"], event, outputPath, "data") - write_hdf5(S["npoints"], event, outputPath, "npoints") - write_hdf5(S["channels"], event, outputPath, "channels") - - -# function to check event data (checking whether event timestamps belongs to same event or multiple events) -def check_data(S, filepath, event, outputPath): - # logger.info("Checking event storename data for creating multiple event names from single event storename...") - new_event = event.replace("\\", "") - new_event = event.replace("/", "") - diff = np.diff(S["data"]) - arr = np.full(diff.shape[0], 1) - - storesList = np.genfromtxt(os.path.join(outputPath, "storesList.csv"), dtype="str", delimiter=",").reshape(2, -1) - - if diff.shape[0] == 0: - return 0 - - if S["sampling_rate"] == 0 and np.all(diff == diff[0]) == False: - logger.info("\033[1m" + "Data in event {} belongs to multiple behavior".format(event) + "\033[0m") - logger.debug( - "\033[1m" + "Create timestamp files for individual new event and change the stores list file." + "\033[0m" - ) - i_d = np.unique(S["data"]) - for i in range(i_d.shape[0]): - new_S = dict() - idx = np.where(S["data"] == i_d[i])[0] - new_S["timestamps"] = S["timestamps"][idx] - new_S["storename"] = new_event + str(int(i_d[i])) - new_S["sampling_rate"] = S["sampling_rate"] - new_S["data"] = S["data"] - new_S["npoints"] = S["npoints"] - new_S["channels"] = S["channels"] - storesList = np.concatenate( - (storesList, [[new_event + str(int(i_d[i]))], [new_event + "_" + str(int(i_d[i]))]]), axis=1 - ) - save_dict_to_hdf5(new_S, new_event + str(int(i_d[i])), outputPath) - - idx = np.where(storesList[0] == event)[0] - storesList = np.delete(storesList, idx, axis=1) - if not os.path.exists(os.path.join(outputPath, ".cache_storesList.csv")): - os.rename(os.path.join(outputPath, "storesList.csv"), os.path.join(outputPath, ".cache_storesList.csv")) - if idx.shape[0] == 0: - pass - else: - np.savetxt(os.path.join(outputPath, "storesList.csv"), storesList, delimiter=",", fmt="%s") - logger.info( - "\033[1m" - + "Timestamp files for individual new event are created \ - and the stores list file is changed." - + "\033[0m" - ) - - -# function to read tev file -def readtev(data, filepath, event, outputPath): - - logger.debug("Reading data for event {} ...".format(event)) - tevfilepath = glob.glob(os.path.join(filepath, "*.tev")) - if len(tevfilepath) > 1: - raise Exception("Two tev files are present at the location.") - else: - tevfilepath = tevfilepath[0] - - data["name"] = np.asarray(data["name"], dtype=str) - - allnames = np.unique(data["name"]) - - index = [] - for i in range(len(allnames)): - length = len(str(allnames[i])) - if length < 4: - index.append(i) - - allnames = np.delete(allnames, index, 0) - - eventNew = np.array(list(event)) - - # logger.info(allnames) - # logger.info(eventNew) - row = ismember(data["name"], event) - - if sum(row) == 0: - logger.info("\033[1m" + "Requested store name " + event + " not found (case-sensitive)." 
+ "\033[0m") - logger.info("\033[1m" + "File contains the following TDT store names:" + "\033[0m") - logger.info("\033[1m" + str(allnames) + "\033[0m") - logger.info("\033[1m" + "TDT store name " + str(event) + " not found." + "\033[0m") - import_csv(filepath, event, outputPath) - - return 0 - - allIndexesWhereEventIsPresent = np.where(row == 1) - first_row = allIndexesWhereEventIsPresent[0][0] - - formatNew = data["format"][first_row] + 1 - - table = np.array( - [ - [0, 0, 0, 0], - [0, "float", 1, np.float32], - [0, "long", 1, np.int32], - [0, "short", 2, np.int16], - [0, "byte", 4, np.int8], - ] - ) - - S = dict() - - S["storename"] = str(event) - S["sampling_rate"] = data["frequency"][first_row] - S["timestamps"] = np.asarray(data["timestamp"][allIndexesWhereEventIsPresent[0]]) - S["channels"] = np.asarray(data["chan"][allIndexesWhereEventIsPresent[0]]) - - fp_loc = np.asarray(data["fp_loc"][allIndexesWhereEventIsPresent[0]]) - data_size = np.asarray(data["size"]) - - if formatNew != 5: - nsample = (data_size[first_row,] - 10) * int(table[formatNew, 2]) - S["data"] = np.zeros((len(fp_loc), nsample)) - for i in range(0, len(fp_loc)): - with open(tevfilepath, "rb") as fp: - fp.seek(fp_loc[i], os.SEEK_SET) - S["data"][i, :] = np.fromfile(fp, dtype=table[formatNew, 3], count=nsample).reshape( - 1, nsample, order="F" - ) - # S['data'] = S['data'].swapaxes() - S["npoints"] = nsample - else: - S["data"] = np.asarray(data["strobe"][allIndexesWhereEventIsPresent[0]]) - S["npoints"] = 1 - S["channels"] = np.tile(1, (S["data"].shape[0],)) - - S["data"] = (S["data"].T).reshape(-1, order="F") - - save_dict_to_hdf5(S, event, outputPath) - - check_data(S, filepath, event, outputPath) - - logger.info("Data for event {} fetched and stored.".format(event)) - - -# function to execute readtev function using multiprocessing to make it faster -def execute_readtev(data, filepath, event, outputPath, numProcesses=mp.cpu_count()): - - start = time.time() - with mp.Pool(numProcesses) as p: - p.starmap(readtev, zip(repeat(data), repeat(filepath), event, repeat(outputPath))) - # p = mp.Pool(mp.cpu_count()) - # p.starmap(readtev, zip(repeat(data), repeat(filepath), event, repeat(outputPath))) - # p.close() - # p.join() - logger.info("Time taken = {0:.5f}".format(time.time() - start)) - - def execute_import_csv(filepath, event, outputPath, numProcesses=mp.cpu_count()): # logger.info("Reading data for event {} ...".format(event)) diff --git a/src/guppy/saveStoresList.py b/src/guppy/saveStoresList.py index 72dc604..c2867ba 100755 --- a/src/guppy/saveStoresList.py +++ b/src/guppy/saveStoresList.py @@ -23,7 +23,6 @@ from guppy.readTevTsq import import_csv from guppy.tdt_step2 import readtsq -from guppy.np_doric_csv_step2 import import_np_doric_csv from guppy.csv_step2 import import_csv_step2 from guppy.doric_step2 import import_doric from guppy.npm_step2 import import_npm diff --git a/src/guppy/tdt_step3.py b/src/guppy/tdt_step3.py new file mode 100644 index 0000000..04ba0dd --- /dev/null +++ b/src/guppy/tdt_step3.py @@ -0,0 +1,183 @@ +import glob +import json +import logging +import multiprocessing as mp +import os +import re +import sys +import time +import warnings +from itertools import repeat + +import h5py +import numpy as np +import pandas as pd +from numpy import float32, float64, int32, int64, uint16 + +from guppy.common_step3 import write_hdf5 + +logger = logging.getLogger(__name__) + +# function to execute readtev function using multiprocessing to make it faster +def execute_readtev(data, filepath, event, 
outputPath, numProcesses=mp.cpu_count()): + + start = time.time() + with mp.Pool(numProcesses) as p: + p.starmap(readtev, zip(repeat(data), repeat(filepath), event, repeat(outputPath))) + # p = mp.Pool(mp.cpu_count()) + # p.starmap(readtev, zip(repeat(data), repeat(filepath), event, repeat(outputPath))) + # p.close() + # p.join() + logger.info("Time taken = {0:.5f}".format(time.time() - start)) + + +# function to read tev file +def readtev(data, filepath, event, outputPath): + + logger.debug("Reading data for event {} ...".format(event)) + tevfilepath = glob.glob(os.path.join(filepath, "*.tev")) + if len(tevfilepath) > 1: + raise Exception("Two tev files are present at the location.") + else: + tevfilepath = tevfilepath[0] + + data["name"] = np.asarray(data["name"], dtype=str) + + allnames = np.unique(data["name"]) + + index = [] + for i in range(len(allnames)): + length = len(str(allnames[i])) + if length < 4: + index.append(i) + + allnames = np.delete(allnames, index, 0) + + eventNew = np.array(list(event)) + + # logger.info(allnames) + # logger.info(eventNew) + row = ismember(data["name"], event) + + if sum(row) == 0: + logger.info("\033[1m" + "Requested store name " + event + " not found (case-sensitive)." + "\033[0m") + logger.info("\033[1m" + "File contains the following TDT store names:" + "\033[0m") + logger.info("\033[1m" + str(allnames) + "\033[0m") + logger.info("\033[1m" + "TDT store name " + str(event) + " not found." + "\033[0m") + import_csv(filepath, event, outputPath) + + return 0 + + allIndexesWhereEventIsPresent = np.where(row == 1) + first_row = allIndexesWhereEventIsPresent[0][0] + + formatNew = data["format"][first_row] + 1 + + table = np.array( + [ + [0, 0, 0, 0], + [0, "float", 1, np.float32], + [0, "long", 1, np.int32], + [0, "short", 2, np.int16], + [0, "byte", 4, np.int8], + ] + ) + + S = dict() + + S["storename"] = str(event) + S["sampling_rate"] = data["frequency"][first_row] + S["timestamps"] = np.asarray(data["timestamp"][allIndexesWhereEventIsPresent[0]]) + S["channels"] = np.asarray(data["chan"][allIndexesWhereEventIsPresent[0]]) + + fp_loc = np.asarray(data["fp_loc"][allIndexesWhereEventIsPresent[0]]) + data_size = np.asarray(data["size"]) + + if formatNew != 5: + nsample = (data_size[first_row,] - 10) * int(table[formatNew, 2]) + S["data"] = np.zeros((len(fp_loc), nsample)) + for i in range(0, len(fp_loc)): + with open(tevfilepath, "rb") as fp: + fp.seek(fp_loc[i], os.SEEK_SET) + S["data"][i, :] = np.fromfile(fp, dtype=table[formatNew, 3], count=nsample).reshape( + 1, nsample, order="F" + ) + # S['data'] = S['data'].swapaxes() + S["npoints"] = nsample + else: + S["data"] = np.asarray(data["strobe"][allIndexesWhereEventIsPresent[0]]) + S["npoints"] = 1 + S["channels"] = np.tile(1, (S["data"].shape[0],)) + + S["data"] = (S["data"].T).reshape(-1, order="F") + + save_dict_to_hdf5(S, event, outputPath) + + check_data(S, filepath, event, outputPath) + + logger.info("Data for event {} fetched and stored.".format(event)) + +# check if a particular element is there in an array or not +def ismember(arr, element): + res = [1 if i == element else 0 for i in arr] + return np.asarray(res) + + +# function to save data read from tev file to hdf5 file +def save_dict_to_hdf5(S, event, outputPath): + write_hdf5(S["storename"], event, outputPath, "storename") + write_hdf5(S["sampling_rate"], event, outputPath, "sampling_rate") + write_hdf5(S["timestamps"], event, outputPath, "timestamps") + + write_hdf5(S["data"], event, outputPath, "data") + write_hdf5(S["npoints"], event, 
outputPath, "npoints") + write_hdf5(S["channels"], event, outputPath, "channels") + + +# function to check event data (checking whether event timestamps belongs to same event or multiple events) +def check_data(S, filepath, event, outputPath): + # logger.info("Checking event storename data for creating multiple event names from single event storename...") + new_event = event.replace("\\", "") + new_event = event.replace("/", "") + diff = np.diff(S["data"]) + arr = np.full(diff.shape[0], 1) + + storesList = np.genfromtxt(os.path.join(outputPath, "storesList.csv"), dtype="str", delimiter=",").reshape(2, -1) + + if diff.shape[0] == 0: + return 0 + + if S["sampling_rate"] == 0 and np.all(diff == diff[0]) == False: + logger.info("\033[1m" + "Data in event {} belongs to multiple behavior".format(event) + "\033[0m") + logger.debug( + "\033[1m" + "Create timestamp files for individual new event and change the stores list file." + "\033[0m" + ) + i_d = np.unique(S["data"]) + for i in range(i_d.shape[0]): + new_S = dict() + idx = np.where(S["data"] == i_d[i])[0] + new_S["timestamps"] = S["timestamps"][idx] + new_S["storename"] = new_event + str(int(i_d[i])) + new_S["sampling_rate"] = S["sampling_rate"] + new_S["data"] = S["data"] + new_S["npoints"] = S["npoints"] + new_S["channels"] = S["channels"] + storesList = np.concatenate( + (storesList, [[new_event + str(int(i_d[i]))], [new_event + "_" + str(int(i_d[i]))]]), axis=1 + ) + save_dict_to_hdf5(new_S, new_event + str(int(i_d[i])), outputPath) + + idx = np.where(storesList[0] == event)[0] + storesList = np.delete(storesList, idx, axis=1) + if not os.path.exists(os.path.join(outputPath, ".cache_storesList.csv")): + os.rename(os.path.join(outputPath, "storesList.csv"), os.path.join(outputPath, ".cache_storesList.csv")) + if idx.shape[0] == 0: + pass + else: + np.savetxt(os.path.join(outputPath, "storesList.csv"), storesList, delimiter=",", fmt="%s") + logger.info( + "\033[1m" + + "Timestamp files for individual new event are created \ + and the stores list file is changed." + + "\033[0m" + ) \ No newline at end of file From 2f57867030294aae62a6e931864b30c0e341c8d2 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Tue, 18 Nov 2025 11:41:03 -0800 Subject: [PATCH 013/150] Hard-coded modality to simplify read. --- src/guppy/readTevTsq.py | 50 +++++++---------------------------------- src/guppy/tdt_step3.py | 42 ++++++++++++++++++++++++++++------ 2 files changed, 43 insertions(+), 49 deletions(-) diff --git a/src/guppy/readTevTsq.py b/src/guppy/readTevTsq.py index fe16add..96fd59e 100755 --- a/src/guppy/readTevTsq.py +++ b/src/guppy/readTevTsq.py @@ -33,34 +33,6 @@ def writeToFile(value: str): file.write(value) -# function to read tsq file -def readtsq(filepath): - logger.debug("Trying to read tsq file.") - names = ("size", "type", "name", "chan", "sort_code", "timestamp", "fp_loc", "strobe", "format", "frequency") - formats = (int32, int32, "S4", uint16, uint16, float64, int64, float64, int32, float32) - offsets = 0, 4, 8, 12, 14, 16, 24, 24, 32, 36 - tsq_dtype = np.dtype({"names": names, "formats": formats, "offsets": offsets}, align=True) - path = glob.glob(os.path.join(filepath, "*.tsq")) - if len(path) > 1: - logger.error("Two tsq files are present at the location.") - raise Exception("Two tsq files are present at the location.") - elif len(path) == 0: - logger.info("\033[1m" + "tsq file not found." 
+ "\033[1m") - return 0, 0 - else: - path = path[0] - flag = "tsq" - - # reading tsq file - tsq = np.fromfile(path, dtype=tsq_dtype) - - # creating dataframe of the data - df = pd.DataFrame(tsq) - - logger.info("Data from tsq file fetched.") - return df, flag - - # function to check if doric file exists def check_doric(filepath): logger.debug("Checking if doric file exists") @@ -294,13 +266,7 @@ def readRawData(inputParameters): filepath = folderNames[i] logger.debug(f"### Reading raw data for folder {folderNames[i]}") storesListPath = takeOnlyDirs(glob.glob(os.path.join(filepath, "*_output_*"))) - # reading tsq file - data, flag = readtsq(filepath) - # checking if doric file exists - if flag == "tsq": - pass - else: - flag = check_doric(filepath) + modality = "tdt" # read data corresponding to each storename selected by user while saving the storeslist file for j in range(len(storesListPath)): @@ -314,14 +280,14 @@ def readRawData(inputParameters): 2, -1 ) - if isinstance(data, pd.DataFrame) and flag == "tsq": - execute_readtev(data, filepath, np.unique(storesList[0, :]), op, numProcesses) - elif flag == "doric_csv": - execute_import_doric(filepath, storesList, flag, op) - elif flag == "doric_doric": - execute_import_doric(filepath, storesList, flag, op) - else: + if modality == "tdt": + execute_readtev(filepath, np.unique(storesList[0, :]), op, numProcesses) + elif modality == "doric": + execute_import_doric(filepath, storesList, modality, op) + elif modality == "csv" or modality == "npm": execute_import_csv(filepath, np.unique(storesList[0, :]), op, numProcesses) + else: + raise ValueError("Modality not recognized. Please use 'tdt', 'csv', 'doric', or 'npm'.") writeToFile(str(10 + ((step + 1) * 10)) + "\n") step += 1 diff --git a/src/guppy/tdt_step3.py b/src/guppy/tdt_step3.py index 04ba0dd..bc629f0 100644 --- a/src/guppy/tdt_step3.py +++ b/src/guppy/tdt_step3.py @@ -18,8 +18,36 @@ logger = logging.getLogger(__name__) +# function to read tsq file +def readtsq(filepath): + logger.debug("Trying to read tsq file.") + names = ("size", "type", "name", "chan", "sort_code", "timestamp", "fp_loc", "strobe", "format", "frequency") + formats = (int32, int32, "S4", uint16, uint16, float64, int64, float64, int32, float32) + offsets = 0, 4, 8, 12, 14, 16, 24, 24, 32, 36 + tsq_dtype = np.dtype({"names": names, "formats": formats, "offsets": offsets}, align=True) + path = glob.glob(os.path.join(filepath, "*.tsq")) + if len(path) > 1: + logger.error("Two tsq files are present at the location.") + raise Exception("Two tsq files are present at the location.") + elif len(path) == 0: + logger.info("\033[1m" + "tsq file not found." + "\033[1m") + return 0, 0 + else: + path = path[0] + flag = "tsq" + + # reading tsq file + tsq = np.fromfile(path, dtype=tsq_dtype) + + # creating dataframe of the data + df = pd.DataFrame(tsq) + + logger.info("Data from tsq file fetched.") + return df, flag + # function to execute readtev function using multiprocessing to make it faster -def execute_readtev(data, filepath, event, outputPath, numProcesses=mp.cpu_count()): +def execute_readtev(filepath, event, outputPath, numProcesses=mp.cpu_count()): + data, _ = readtsq(filepath) start = time.time() with mp.Pool(numProcesses) as p: @@ -60,13 +88,13 @@ def readtev(data, filepath, event, outputPath): row = ismember(data["name"], event) if sum(row) == 0: - logger.info("\033[1m" + "Requested store name " + event + " not found (case-sensitive)." 
+ "\033[0m") - logger.info("\033[1m" + "File contains the following TDT store names:" + "\033[0m") - logger.info("\033[1m" + str(allnames) + "\033[0m") - logger.info("\033[1m" + "TDT store name " + str(event) + " not found." + "\033[0m") - import_csv(filepath, event, outputPath) + logger.error("\033[1m" + "Requested store name " + event + " not found (case-sensitive)." + "\033[0m") + logger.error("\033[1m" + "File contains the following TDT store names:" + "\033[0m") + logger.error("\033[1m" + str(allnames) + "\033[0m") + logger.error("\033[1m" + "TDT store name " + str(event) + " not found." + "\033[0m") + raise ValueError("Requested store name not found.") + - return 0 allIndexesWhereEventIsPresent = np.where(row == 1) first_row = allIndexesWhereEventIsPresent[0][0] From 092e1b7f40934bae192fd428b77e0486c9f516c0 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Tue, 18 Nov 2025 13:23:54 -0800 Subject: [PATCH 014/150] Split doric_step3.py off from read_raw_data.py. --- src/guppy/doric_step3.py | 126 +++++++++++++++++++++++++++++++++ src/guppy/readTevTsq.py | 146 +-------------------------------------- 2 files changed, 128 insertions(+), 144 deletions(-) create mode 100644 src/guppy/doric_step3.py diff --git a/src/guppy/doric_step3.py b/src/guppy/doric_step3.py new file mode 100644 index 0000000..792c54e --- /dev/null +++ b/src/guppy/doric_step3.py @@ -0,0 +1,126 @@ +import glob +import logging +import os +import re + +import h5py +import numpy as np +import pandas as pd + +from guppy.common_step3 import write_hdf5 + +logger = logging.getLogger(__name__) + + +def execute_import_doric(filepath, storesList, flag, outputPath): + + if flag == "doric_csv": + path = glob.glob(os.path.join(filepath, "*.csv")) + if len(path) > 1: + logger.error("An error occurred : More than one Doric csv file present at the location") + raise Exception("More than one Doric csv file present at the location") + else: + df = pd.read_csv(path[0], header=1, index_col=False) + df = df.dropna(axis=1, how="all") + df = df.dropna(axis=0, how="any") + df["Time(s)"] = df["Time(s)"] - df["Time(s)"].to_numpy()[0] + for i in range(storesList.shape[1]): + if "control" in storesList[1, i] or "signal" in storesList[1, i]: + timestamps = np.array(df["Time(s)"]) + sampling_rate = np.array([1 / (timestamps[-1] - timestamps[-2])]) + write_hdf5(sampling_rate, storesList[0, i], outputPath, "sampling_rate") + write_hdf5(df["Time(s)"].to_numpy(), storesList[0, i], outputPath, "timestamps") + write_hdf5(df[storesList[0, i]].to_numpy(), storesList[0, i], outputPath, "data") + else: + ttl = df[storesList[0, i]] + indices = np.where(ttl <= 0)[0] + diff_indices = np.where(np.diff(indices) > 1)[0] + write_hdf5( + df["Time(s)"][indices[diff_indices] + 1].to_numpy(), storesList[0, i], outputPath, "timestamps" + ) + else: + path = glob.glob(os.path.join(filepath, "*.doric")) + if len(path) > 1: + logger.error("An error occurred : More than one Doric file present at the location") + raise Exception("More than one Doric file present at the location") + else: + with h5py.File(path[0], "r") as f: + if "Traces" in list(f.keys()): + keys = access_data_doricV1(f, storesList, outputPath) + elif list(f.keys()) == ["Configurations", "DataAcquisition"]: + keys = access_data_doricV6(f, storesList, outputPath) + + + +def access_data_doricV6(doric_file, storesList, outputPath): + data = [doric_file["DataAcquisition"]] + res = [] + while len(data) != 0: + members = len(data) + while members != 0: + members -= 1 + data, last_element = separate_last_element(data) 
+ if isinstance(last_element, h5py.Dataset) and not last_element.name.endswith("/Time"): + res.append(last_element.name) + elif isinstance(last_element, h5py.Group): + data.extend(reversed([last_element[k] for k in last_element.keys()])) + + decide_path = [] + for element in res: + sep_values = element.split("/") + if sep_values[-1] == "Values": + if f"{sep_values[-3]}/{sep_values[-2]}" in storesList[0, :]: + decide_path.append(element) + else: + if f"{sep_values[-2]}/{sep_values[-1]}" in storesList[0, :]: + decide_path.append(element) + + for i in range(storesList.shape[1]): + if "control" in storesList[1, i] or "signal" in storesList[1, i]: + regex = re.compile("(.*?)" + str(storesList[0, i]) + "(.*?)") + idx = [i for i in range(len(decide_path)) if regex.match(decide_path[i])] + if len(idx) > 1: + logger.error("More than one string matched (which should not be the case)") + raise Exception("More than one string matched (which should not be the case)") + idx = idx[0] + data = np.array(doric_file[decide_path[idx]]) + timestamps = np.array(doric_file[decide_path[idx].rsplit("/", 1)[0] + "/Time"]) + sampling_rate = np.array([1 / (timestamps[-1] - timestamps[-2])]) + write_hdf5(sampling_rate, storesList[0, i], outputPath, "sampling_rate") + write_hdf5(timestamps, storesList[0, i], outputPath, "timestamps") + write_hdf5(data, storesList[0, i], outputPath, "data") + else: + regex = re.compile("(.*?)" + storesList[0, i] + "$") + idx = [i for i in range(len(decide_path)) if regex.match(decide_path[i])] + if len(idx) > 1: + logger.error("More than one string matched (which should not be the case)") + raise Exception("More than one string matched (which should not be the case)") + idx = idx[0] + ttl = np.array(doric_file[decide_path[idx]]) + timestamps = np.array(doric_file[decide_path[idx].rsplit("/", 1)[0] + "/Time"]) + indices = np.where(ttl <= 0)[0] + diff_indices = np.where(np.diff(indices) > 1)[0] + write_hdf5(timestamps[indices[diff_indices] + 1], storesList[0, i], outputPath, "timestamps") + + +def access_data_doricV1(doric_file, storesList, outputPath): + keys = list(doric_file["Traces"]["Console"].keys()) + for i in range(storesList.shape[1]): + if "control" in storesList[1, i] or "signal" in storesList[1, i]: + timestamps = np.array(doric_file["Traces"]["Console"]["Time(s)"]["Console_time(s)"]) + sampling_rate = np.array([1 / (timestamps[-1] - timestamps[-2])]) + data = np.array(doric_file["Traces"]["Console"][storesList[0, i]][storesList[0, i]]) + write_hdf5(sampling_rate, storesList[0, i], outputPath, "sampling_rate") + write_hdf5(timestamps, storesList[0, i], outputPath, "timestamps") + write_hdf5(data, storesList[0, i], outputPath, "data") + else: + timestamps = np.array(doric_file["Traces"]["Console"]["Time(s)"]["Console_time(s)"]) + ttl = np.array(doric_file["Traces"]["Console"][storesList[0, i]][storesList[0, i]]) + indices = np.where(ttl <= 0)[0] + diff_indices = np.where(np.diff(indices) > 1)[0] + write_hdf5(timestamps[indices[diff_indices] + 1], storesList[0, i], outputPath, "timestamps") + + +def separate_last_element(arr): + l = arr[-1] + return arr[:-1], l \ No newline at end of file diff --git a/src/guppy/readTevTsq.py b/src/guppy/readTevTsq.py index 96fd59e..6fdee1e 100755 --- a/src/guppy/readTevTsq.py +++ b/src/guppy/readTevTsq.py @@ -16,6 +16,7 @@ from guppy.common_step3 import write_hdf5 from guppy.tdt_step3 import execute_readtev +from guppy.doric_step3 import execute_import_doric logger = logging.getLogger(__name__) @@ -33,39 +34,6 @@ def writeToFile(value: str): 
file.write(value) -# function to check if doric file exists -def check_doric(filepath): - logger.debug("Checking if doric file exists") - path = glob.glob(os.path.join(filepath, "*.csv")) + glob.glob(os.path.join(filepath, "*.doric")) - - flag_arr = [] - for i in range(len(path)): - ext = os.path.basename(path[i]).split(".")[-1] - if ext == "csv": - with warnings.catch_warnings(): - warnings.simplefilter("error") - try: - df = pd.read_csv(path[i], index_col=False, dtype=float) - except: - df = pd.read_csv(path[i], header=1, index_col=False, nrows=10) - flag = "doric_csv" - flag_arr.append(flag) - elif ext == "doric": - flag = "doric_doric" - flag_arr.append(flag) - else: - pass - - if len(flag_arr) > 1: - logger.error("Two doric files are present at the same location") - raise Exception("Two doric files are present at the same location") - if len(flag_arr) == 0: - logger.error("\033[1m" + "Doric file not found." + "\033[1m") - return 0 - logger.info("Doric file found.") - return flag_arr[0] - - # function to read event timestamps csv file. def import_csv(filepath, event, outputPath): logger.debug("\033[1m" + "Trying to read data for {} from csv file.".format(event) + "\033[0m") @@ -120,27 +88,7 @@ def execute_import_csv(filepath, event, outputPath, numProcesses=mp.cpu_count()) logger.info("Time taken = {0:.5f}".format(time.time() - start)) -def access_data_doricV1(doric_file, storesList, outputPath): - keys = list(doric_file["Traces"]["Console"].keys()) - for i in range(storesList.shape[1]): - if "control" in storesList[1, i] or "signal" in storesList[1, i]: - timestamps = np.array(doric_file["Traces"]["Console"]["Time(s)"]["Console_time(s)"]) - sampling_rate = np.array([1 / (timestamps[-1] - timestamps[-2])]) - data = np.array(doric_file["Traces"]["Console"][storesList[0, i]][storesList[0, i]]) - write_hdf5(sampling_rate, storesList[0, i], outputPath, "sampling_rate") - write_hdf5(timestamps, storesList[0, i], outputPath, "timestamps") - write_hdf5(data, storesList[0, i], outputPath, "data") - else: - timestamps = np.array(doric_file["Traces"]["Console"]["Time(s)"]["Console_time(s)"]) - ttl = np.array(doric_file["Traces"]["Console"][storesList[0, i]][storesList[0, i]]) - indices = np.where(ttl <= 0)[0] - diff_indices = np.where(np.diff(indices) > 1)[0] - write_hdf5(timestamps[indices[diff_indices] + 1], storesList[0, i], outputPath, "timestamps") - -def separate_last_element(arr): - l = arr[-1] - return arr[:-1], l def find_string(regex, arr): @@ -149,96 +97,6 @@ def find_string(regex, arr): return i -def access_data_doricV6(doric_file, storesList, outputPath): - data = [doric_file["DataAcquisition"]] - res = [] - while len(data) != 0: - members = len(data) - while members != 0: - members -= 1 - data, last_element = separate_last_element(data) - if isinstance(last_element, h5py.Dataset) and not last_element.name.endswith("/Time"): - res.append(last_element.name) - elif isinstance(last_element, h5py.Group): - data.extend(reversed([last_element[k] for k in last_element.keys()])) - - decide_path = [] - for element in res: - sep_values = element.split("/") - if sep_values[-1] == "Values": - if f"{sep_values[-3]}/{sep_values[-2]}" in storesList[0, :]: - decide_path.append(element) - else: - if f"{sep_values[-2]}/{sep_values[-1]}" in storesList[0, :]: - decide_path.append(element) - - for i in range(storesList.shape[1]): - if "control" in storesList[1, i] or "signal" in storesList[1, i]: - regex = re.compile("(.*?)" + str(storesList[0, i]) + "(.*?)") - idx = [i for i in range(len(decide_path)) 
if regex.match(decide_path[i])] - if len(idx) > 1: - logger.error("More than one string matched (which should not be the case)") - raise Exception("More than one string matched (which should not be the case)") - idx = idx[0] - data = np.array(doric_file[decide_path[idx]]) - timestamps = np.array(doric_file[decide_path[idx].rsplit("/", 1)[0] + "/Time"]) - sampling_rate = np.array([1 / (timestamps[-1] - timestamps[-2])]) - write_hdf5(sampling_rate, storesList[0, i], outputPath, "sampling_rate") - write_hdf5(timestamps, storesList[0, i], outputPath, "timestamps") - write_hdf5(data, storesList[0, i], outputPath, "data") - else: - regex = re.compile("(.*?)" + storesList[0, i] + "$") - idx = [i for i in range(len(decide_path)) if regex.match(decide_path[i])] - if len(idx) > 1: - logger.error("More than one string matched (which should not be the case)") - raise Exception("More than one string matched (which should not be the case)") - idx = idx[0] - ttl = np.array(doric_file[decide_path[idx]]) - timestamps = np.array(doric_file[decide_path[idx].rsplit("/", 1)[0] + "/Time"]) - indices = np.where(ttl <= 0)[0] - diff_indices = np.where(np.diff(indices) > 1)[0] - write_hdf5(timestamps[indices[diff_indices] + 1], storesList[0, i], outputPath, "timestamps") - - -def execute_import_doric(filepath, storesList, flag, outputPath): - - if flag == "doric_csv": - path = glob.glob(os.path.join(filepath, "*.csv")) - if len(path) > 1: - logger.error("An error occurred : More than one Doric csv file present at the location") - raise Exception("More than one Doric csv file present at the location") - else: - df = pd.read_csv(path[0], header=1, index_col=False) - df = df.dropna(axis=1, how="all") - df = df.dropna(axis=0, how="any") - df["Time(s)"] = df["Time(s)"] - df["Time(s)"].to_numpy()[0] - for i in range(storesList.shape[1]): - if "control" in storesList[1, i] or "signal" in storesList[1, i]: - timestamps = np.array(df["Time(s)"]) - sampling_rate = np.array([1 / (timestamps[-1] - timestamps[-2])]) - write_hdf5(sampling_rate, storesList[0, i], outputPath, "sampling_rate") - write_hdf5(df["Time(s)"].to_numpy(), storesList[0, i], outputPath, "timestamps") - write_hdf5(df[storesList[0, i]].to_numpy(), storesList[0, i], outputPath, "data") - else: - ttl = df[storesList[0, i]] - indices = np.where(ttl <= 0)[0] - diff_indices = np.where(np.diff(indices) > 1)[0] - write_hdf5( - df["Time(s)"][indices[diff_indices] + 1].to_numpy(), storesList[0, i], outputPath, "timestamps" - ) - else: - path = glob.glob(os.path.join(filepath, "*.doric")) - if len(path) > 1: - logger.error("An error occurred : More than one Doric file present at the location") - raise Exception("More than one Doric file present at the location") - else: - with h5py.File(path[0], "r") as f: - if "Traces" in list(f.keys()): - keys = access_data_doricV1(f, storesList, outputPath) - elif list(f.keys()) == ["Configurations", "DataAcquisition"]: - keys = access_data_doricV6(f, storesList, outputPath) - - # function to read data from 'tsq' and 'tev' files def readRawData(inputParameters): @@ -266,7 +124,7 @@ def readRawData(inputParameters): filepath = folderNames[i] logger.debug(f"### Reading raw data for folder {folderNames[i]}") storesListPath = takeOnlyDirs(glob.glob(os.path.join(filepath, "*_output_*"))) - modality = "tdt" + modality = "doric" # read data corresponding to each storename selected by user while saving the storeslist file for j in range(len(storesListPath)): From 7abb8e09dd475ffcc1fb15156393ec04ac2c5c94 Mon Sep 17 00:00:00 2001 From: 
pauladkisson Date: Tue, 18 Nov 2025 13:31:15 -0800 Subject: [PATCH 015/150] Added check_doric to doric_step3.py. --- src/guppy/doric_step3.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/src/guppy/doric_step3.py b/src/guppy/doric_step3.py index 792c54e..2c30887 100644 --- a/src/guppy/doric_step3.py +++ b/src/guppy/doric_step3.py @@ -2,6 +2,7 @@ import logging import os import re +import warnings import h5py import numpy as np @@ -11,8 +12,39 @@ logger = logging.getLogger(__name__) +def check_doric(filepath): + logger.debug("Checking if doric file exists") + path = glob.glob(os.path.join(filepath, "*.csv")) + glob.glob(os.path.join(filepath, "*.doric")) + + flag_arr = [] + for i in range(len(path)): + ext = os.path.basename(path[i]).split(".")[-1] + if ext == "csv": + with warnings.catch_warnings(): + warnings.simplefilter("error") + try: + df = pd.read_csv(path[i], index_col=False, dtype=float) + except: + df = pd.read_csv(path[i], header=1, index_col=False, nrows=10) + flag = "doric_csv" + flag_arr.append(flag) + elif ext == "doric": + flag = "doric_doric" + flag_arr.append(flag) + else: + pass + + if len(flag_arr) > 1: + logger.error("Two doric files are present at the same location") + raise Exception("Two doric files are present at the same location") + if len(flag_arr) == 0: + logger.error("\033[1m" + "Doric file not found." + "\033[1m") + return 0 + logger.info("Doric file found.") + return flag_arr[0] def execute_import_doric(filepath, storesList, flag, outputPath): + flag = check_doric(filepath) if flag == "doric_csv": path = glob.glob(os.path.join(filepath, "*.csv")) From b653538fad3acf017085b8943d846a6b633d7d99 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Tue, 18 Nov 2025 13:45:55 -0800 Subject: [PATCH 016/150] Split csv_step3.py off from read_raw_data.py. --- src/guppy/csv_step3.py | 73 +++++++++++++++++++++++++++++++++++++ src/guppy/readTevTsq.py | 67 +--------------------------------- src/guppy/saveStoresList.py | 1 - 3 files changed, 75 insertions(+), 66 deletions(-) create mode 100644 src/guppy/csv_step3.py diff --git a/src/guppy/csv_step3.py b/src/guppy/csv_step3.py new file mode 100644 index 0000000..97d3eb5 --- /dev/null +++ b/src/guppy/csv_step3.py @@ -0,0 +1,73 @@ +import glob +import json +import logging +import multiprocessing as mp +import os +import re +import sys +import time +import warnings +from itertools import repeat + +import h5py +import numpy as np +import pandas as pd +from numpy import float32, float64, int32, int64, uint16 + +from guppy.common_step3 import write_hdf5 + +logger = logging.getLogger(__name__) + + +def execute_import_csv(filepath, event, outputPath, numProcesses=mp.cpu_count()): + # logger.info("Reading data for event {} ...".format(event)) + + start = time.time() + with mp.Pool(numProcesses) as p: + p.starmap(import_csv, zip(repeat(filepath), event, repeat(outputPath))) + logger.info("Time taken = {0:.5f}".format(time.time() - start)) + + +# function to read event timestamps csv file. 
+def import_csv(filepath, event, outputPath): + logger.debug("\033[1m" + "Trying to read data for {} from csv file.".format(event) + "\033[0m") + if not os.path.exists(os.path.join(filepath, event + ".csv")): + logger.error("\033[1m" + "No csv file found for event {}".format(event) + "\033[0m") + raise Exception("\033[1m" + "No csv file found for event {}".format(event) + "\033[0m") + + df = pd.read_csv(os.path.join(filepath, event + ".csv"), index_col=False) + data = df + key = list(df.columns) + + if len(key) == 3: + arr1 = np.array(["timestamps", "data", "sampling_rate"]) + arr2 = np.char.lower(np.array(key)) + if (np.sort(arr1) == np.sort(arr2)).all() == False: + logger.error("\033[1m" + "Column names should be timestamps, data and sampling_rate" + "\033[0m") + raise Exception("\033[1m" + "Column names should be timestamps, data and sampling_rate" + "\033[0m") + + if len(key) == 1: + if key[0].lower() != "timestamps": + logger.error("\033[1m" + "Column names should be timestamps, data and sampling_rate" + "\033[0m") + raise Exception("\033[1m" + "Column name should be timestamps" + "\033[0m") + + if len(key) != 3 and len(key) != 1: + logger.error( + "\033[1m" + + "Number of columns in csv file should be either three or one. Three columns if \ + the file is for control or signal data or one column if the file is for event TTLs." + + "\033[0m" + ) + raise Exception( + "\033[1m" + + "Number of columns in csv file should be either three or one. Three columns if \ + the file is for control or signal data or one column if the file is for event TTLs." + + "\033[0m" + ) + + for i in range(len(key)): + write_hdf5(data[key[i]].dropna(), event, outputPath, key[i].lower()) + + logger.info("\033[1m" + "Reading data for {} from csv file is completed.".format(event) + "\033[0m") + + return data, key \ No newline at end of file diff --git a/src/guppy/readTevTsq.py b/src/guppy/readTevTsq.py index 6fdee1e..c080b58 100755 --- a/src/guppy/readTevTsq.py +++ b/src/guppy/readTevTsq.py @@ -17,6 +17,7 @@ from guppy.common_step3 import write_hdf5 from guppy.tdt_step3 import execute_readtev from guppy.doric_step3 import execute_import_doric +from guppy.csv_step3 import execute_import_csv logger = logging.getLogger(__name__) @@ -33,70 +34,6 @@ def writeToFile(value: str): with open(os.path.join(os.path.expanduser("~"), "pbSteps.txt"), "a") as file: file.write(value) - -# function to read event timestamps csv file. 
-def import_csv(filepath, event, outputPath): - logger.debug("\033[1m" + "Trying to read data for {} from csv file.".format(event) + "\033[0m") - if not os.path.exists(os.path.join(filepath, event + ".csv")): - logger.error("\033[1m" + "No csv file found for event {}".format(event) + "\033[0m") - raise Exception("\033[1m" + "No csv file found for event {}".format(event) + "\033[0m") - - df = pd.read_csv(os.path.join(filepath, event + ".csv"), index_col=False) - data = df - key = list(df.columns) - - if len(key) == 3: - arr1 = np.array(["timestamps", "data", "sampling_rate"]) - arr2 = np.char.lower(np.array(key)) - if (np.sort(arr1) == np.sort(arr2)).all() == False: - logger.error("\033[1m" + "Column names should be timestamps, data and sampling_rate" + "\033[0m") - raise Exception("\033[1m" + "Column names should be timestamps, data and sampling_rate" + "\033[0m") - - if len(key) == 1: - if key[0].lower() != "timestamps": - logger.error("\033[1m" + "Column names should be timestamps, data and sampling_rate" + "\033[0m") - raise Exception("\033[1m" + "Column name should be timestamps" + "\033[0m") - - if len(key) != 3 and len(key) != 1: - logger.error( - "\033[1m" - + "Number of columns in csv file should be either three or one. Three columns if \ - the file is for control or signal data or one column if the file is for event TTLs." - + "\033[0m" - ) - raise Exception( - "\033[1m" - + "Number of columns in csv file should be either three or one. Three columns if \ - the file is for control or signal data or one column if the file is for event TTLs." - + "\033[0m" - ) - - for i in range(len(key)): - write_hdf5(data[key[i]].dropna(), event, outputPath, key[i].lower()) - - logger.info("\033[1m" + "Reading data for {} from csv file is completed.".format(event) + "\033[0m") - - return data, key - - -def execute_import_csv(filepath, event, outputPath, numProcesses=mp.cpu_count()): - # logger.info("Reading data for event {} ...".format(event)) - - start = time.time() - with mp.Pool(numProcesses) as p: - p.starmap(import_csv, zip(repeat(filepath), event, repeat(outputPath))) - logger.info("Time taken = {0:.5f}".format(time.time() - start)) - - - - - -def find_string(regex, arr): - for i in range(len(arr)): - if regex.match(arr[i]): - return i - - # function to read data from 'tsq' and 'tev' files def readRawData(inputParameters): @@ -124,7 +61,7 @@ def readRawData(inputParameters): filepath = folderNames[i] logger.debug(f"### Reading raw data for folder {folderNames[i]}") storesListPath = takeOnlyDirs(glob.glob(os.path.join(filepath, "*_output_*"))) - modality = "doric" + modality = "csv" # read data corresponding to each storename selected by user while saving the storeslist file for j in range(len(storesListPath)): diff --git a/src/guppy/saveStoresList.py b/src/guppy/saveStoresList.py index c2867ba..a432546 100755 --- a/src/guppy/saveStoresList.py +++ b/src/guppy/saveStoresList.py @@ -21,7 +21,6 @@ import panel as pn from numpy import float32, float64, int32, int64, uint16 -from guppy.readTevTsq import import_csv from guppy.tdt_step2 import readtsq from guppy.csv_step2 import import_csv_step2 from guppy.doric_step2 import import_doric From 6d661c291a389f16d7e0c109b2510622c7892289 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Tue, 18 Nov 2025 15:11:32 -0800 Subject: [PATCH 017/150] Added modality to Step 3. 
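
Step 3 now takes the modality from the input parameters instead of sniffing the raw
files, mirroring what guppy.testing.api now does when it injects input_params["modality"].
A minimal sketch of driving the step with the new key (the folder path is hypothetical,
and any keys beyond the three shown are whatever the rest of readRawData already expects):

    from guppy.readTevTsq import readRawData

    input_params = {
        "folderNames": ["/data/photometry/session1"],  # hypothetical session folder
        "numberOfCores": 0,            # 0 falls back to mp.cpu_count()
        "modality": "tdt",             # one of "tdt", "doric", "csv", "npm"
    }
    readRawData(input_params)
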
--- src/guppy/readTevTsq.py | 2 +- src/guppy/testing/api.py | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/guppy/readTevTsq.py b/src/guppy/readTevTsq.py index c080b58..e0bedfa 100755 --- a/src/guppy/readTevTsq.py +++ b/src/guppy/readTevTsq.py @@ -42,6 +42,7 @@ def readRawData(inputParameters): inputParameters = inputParameters folderNames = inputParameters["folderNames"] numProcesses = inputParameters["numberOfCores"] + modality = inputParameters["modality"] storesListPath = [] if numProcesses == 0: numProcesses = mp.cpu_count() @@ -61,7 +62,6 @@ def readRawData(inputParameters): filepath = folderNames[i] logger.debug(f"### Reading raw data for folder {folderNames[i]}") storesListPath = takeOnlyDirs(glob.glob(os.path.join(filepath, "*_output_*"))) - modality = "csv" # read data corresponding to each storename selected by user while saving the storeslist file for j in range(len(storesListPath)): diff --git a/src/guppy/testing/api.py b/src/guppy/testing/api.py index 0e16f23..d7e390d 100644 --- a/src/guppy/testing/api.py +++ b/src/guppy/testing/api.py @@ -237,6 +237,9 @@ def step3( input_params["npm_time_unit"] = npm_time_unit input_params["npm_split_events"] = npm_split_events + # Inject modality + input_params["modality"] = modality + # Call the underlying Step 3 worker directly (no subprocess) readRawData(input_params) @@ -315,6 +318,9 @@ def step4( input_params["npm_time_unit"] = npm_time_unit input_params["npm_split_events"] = npm_split_events + # Inject modality + input_params["modality"] = modality + # Call the underlying Step 4 worker directly (no subprocess) extractTsAndSignal(input_params) @@ -393,6 +399,9 @@ def step5( input_params["npm_time_unit"] = npm_time_unit input_params["npm_split_events"] = npm_split_events + # Inject modality + input_params["modality"] = modality + # Call the underlying Step 5 worker directly (no subprocess) psthForEachStorename(input_params) From a4f6583ecbd3071929551ecf71df4a5716a49791 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 19 Nov 2025 09:12:07 -0800 Subject: [PATCH 018/150] Added tdtRecordingExtractor --- src/guppy/extractors/__init__.py | 1 + .../extractors/tdt_recording_extractor.py | 197 ++++++++++++++++++ src/guppy/readTevTsq.py | 11 +- src/guppy/saveStoresList.py | 5 +- 4 files changed, 211 insertions(+), 3 deletions(-) create mode 100644 src/guppy/extractors/__init__.py create mode 100644 src/guppy/extractors/tdt_recording_extractor.py diff --git a/src/guppy/extractors/__init__.py b/src/guppy/extractors/__init__.py new file mode 100644 index 0000000..249daf9 --- /dev/null +++ b/src/guppy/extractors/__init__.py @@ -0,0 +1 @@ +from .tdt_recording_extractor import TdtRecordingExtractor \ No newline at end of file diff --git a/src/guppy/extractors/tdt_recording_extractor.py b/src/guppy/extractors/tdt_recording_extractor.py new file mode 100644 index 0000000..98ae3cd --- /dev/null +++ b/src/guppy/extractors/tdt_recording_extractor.py @@ -0,0 +1,197 @@ +import glob +import logging +import os +import numpy as np +from numpy import float32, float64, int32, int64, uint16 +import pandas as pd +import multiprocessing as mp +import time +from itertools import repeat + +from guppy.common_step3 import write_hdf5 + +logger = logging.getLogger(__name__) + +class TdtRecordingExtractor: + + def __init__(self, folder_path): + self.folder_path = folder_path + self.header_df, _ = self.readtsq(folder_path) + + def readtsq(self, folder_path): + logger.debug("Trying to read tsq file.") + names = ("size", "type", "name", 
"chan", "sort_code", "timestamp", "fp_loc", "strobe", "format", "frequency") + formats = (int32, int32, "S4", uint16, uint16, float64, int64, float64, int32, float32) + offsets = 0, 4, 8, 12, 14, 16, 24, 24, 32, 36 + tsq_dtype = np.dtype({"names": names, "formats": formats, "offsets": offsets}, align=True) + path = glob.glob(os.path.join(folder_path, "*.tsq")) + if len(path) > 1: + logger.error("Two tsq files are present at the location.") + raise Exception("Two tsq files are present at the location.") + elif len(path) == 0: + logger.info("\033[1m" + "tsq file not found." + "\033[1m") + return 0, 0 + else: + path = path[0] + flag = "tsq" + + # reading tsq file + tsq = np.fromfile(path, dtype=tsq_dtype) + + # creating dataframe of the data + df = pd.DataFrame(tsq) + + logger.info("Data from tsq file fetched.") + return df, flag + + # function to execute readtev function using multiprocessing to make it faster + def execute_readtev(self, filepath, event, outputPath, numProcesses=mp.cpu_count()): + start = time.time() + with mp.Pool(numProcesses) as p: + p.starmap(self.readtev, zip(repeat(self.header_df), repeat(filepath), event, repeat(outputPath))) + logger.info("Time taken = {0:.5f}".format(time.time() - start)) + + + # function to read tev file + def readtev(self, event): + data = self.header_df + filepath = self.folder_path + + logger.debug("Reading data for event {} ...".format(event)) + tevfilepath = glob.glob(os.path.join(filepath, "*.tev")) + if len(tevfilepath) > 1: + raise Exception("Two tev files are present at the location.") + else: + tevfilepath = tevfilepath[0] + + data["name"] = np.asarray(data["name"], dtype=str) + + allnames = np.unique(data["name"]) + + index = [] + for i in range(len(allnames)): + length = len(str(allnames[i])) + if length < 4: + index.append(i) + + allnames = np.delete(allnames, index, 0) + + eventNew = np.array(list(event)) + + # logger.info(allnames) + # logger.info(eventNew) + row = self.ismember(data["name"], event) + + if sum(row) == 0: + logger.error("\033[1m" + "Requested store name " + event + " not found (case-sensitive)." + "\033[0m") + logger.error("\033[1m" + "File contains the following TDT store names:" + "\033[0m") + logger.error("\033[1m" + str(allnames) + "\033[0m") + logger.error("\033[1m" + "TDT store name " + str(event) + " not found." 
+ "\033[0m") + raise ValueError("Requested store name not found.") + + + + allIndexesWhereEventIsPresent = np.where(row == 1) + first_row = allIndexesWhereEventIsPresent[0][0] + + formatNew = data["format"][first_row] + 1 + + table = np.array( + [ + [0, 0, 0, 0], + [0, "float", 1, np.float32], + [0, "long", 1, np.int32], + [0, "short", 2, np.int16], + [0, "byte", 4, np.int8], + ] + ) + + S = dict() + + S["storename"] = str(event) + S["sampling_rate"] = data["frequency"][first_row] + S["timestamps"] = np.asarray(data["timestamp"][allIndexesWhereEventIsPresent[0]]) + S["channels"] = np.asarray(data["chan"][allIndexesWhereEventIsPresent[0]]) + + fp_loc = np.asarray(data["fp_loc"][allIndexesWhereEventIsPresent[0]]) + data_size = np.asarray(data["size"]) + + if formatNew != 5: + nsample = (data_size[first_row,] - 10) * int(table[formatNew, 2]) + S["data"] = np.zeros((len(fp_loc), nsample)) + for i in range(0, len(fp_loc)): + with open(tevfilepath, "rb") as fp: + fp.seek(fp_loc[i], os.SEEK_SET) + S["data"][i, :] = np.fromfile(fp, dtype=table[formatNew, 3], count=nsample).reshape( + 1, nsample, order="F" + ) + # S['data'] = S['data'].swapaxes() + S["npoints"] = nsample + else: + S["data"] = np.asarray(data["strobe"][allIndexesWhereEventIsPresent[0]]) + S["npoints"] = 1 + S["channels"] = np.tile(1, (S["data"].shape[0],)) + + S["data"] = (S["data"].T).reshape(-1, order="F") + + return S + + # check if a particular element is there in an array or not + def ismember(self, arr, element): # TODO: replace this function with more standard usage + res = [1 if i == element else 0 for i in arr] + return np.asarray(res) + + + # function to save data read from tev file to hdf5 file + def save_dict_to_hdf5(self, S, event, outputPath): + write_hdf5(S["storename"], event, outputPath, "storename") + write_hdf5(S["sampling_rate"], event, outputPath, "sampling_rate") + write_hdf5(S["timestamps"], event, outputPath, "timestamps") + + write_hdf5(S["data"], event, outputPath, "data") + write_hdf5(S["npoints"], event, outputPath, "npoints") + write_hdf5(S["channels"], event, outputPath, "channels") + + + # function to check event data (checking whether event timestamps belongs to same event or multiple events) + def check_data(self, S, event, outputPath): + # logger.info("Checking event storename data for creating multiple event names from single event storename...") + new_event = event.replace("\\", "") + new_event = event.replace("/", "") + diff = np.diff(S["data"]) + arr = np.full(diff.shape[0], 1) + + storesList = np.genfromtxt(os.path.join(outputPath, "storesList.csv"), dtype="str", delimiter=",").reshape(2, -1) + + if diff.shape[0] == 0: + return 0 + + if S["sampling_rate"] == 0 and np.all(diff == diff[0]) == False: + logger.info("\033[1m" + "Data in event {} belongs to multiple behavior".format(event) + "\033[0m") + logger.debug( + "\033[1m" + "Create timestamp files for individual new event and change the stores list file." 
+ "\033[0m" + ) + i_d = np.unique(S["data"]) + for i in range(i_d.shape[0]): + new_S = dict() + idx = np.where(S["data"] == i_d[i])[0] + new_S["timestamps"] = S["timestamps"][idx] + new_S["storename"] = new_event + str(int(i_d[i])) + new_S["sampling_rate"] = S["sampling_rate"] + new_S["data"] = S["data"] + new_S["npoints"] = S["npoints"] + new_S["channels"] = S["channels"] + storesList = np.concatenate( + (storesList, [[new_event + str(int(i_d[i]))], [new_event + "_" + str(int(i_d[i]))]]), axis=1 + ) + self.save_dict_to_hdf5(new_S, new_event + str(int(i_d[i])), outputPath) + + idx = np.where(storesList[0] == event)[0] + storesList = np.delete(storesList, idx, axis=1) + if not os.path.exists(os.path.join(outputPath, ".cache_storesList.csv")): + os.rename(os.path.join(outputPath, "storesList.csv"), os.path.join(outputPath, ".cache_storesList.csv")) + if idx.shape[0] == 0: + pass + else: + np.savetxt(os.path.join(outputPath, "storesList.csv"), storesList, delimiter=",", fmt="%s") + logger.info("\033[1m Timestamp files for individual new event are created and the stores list file is changed.\033[0m") \ No newline at end of file diff --git a/src/guppy/readTevTsq.py b/src/guppy/readTevTsq.py index e0bedfa..d3c9147 100755 --- a/src/guppy/readTevTsq.py +++ b/src/guppy/readTevTsq.py @@ -16,6 +16,7 @@ from guppy.common_step3 import write_hdf5 from guppy.tdt_step3 import execute_readtev +from guppy.extractors import TdtRecordingExtractor from guppy.doric_step3 import execute_import_doric from guppy.csv_step3 import execute_import_csv @@ -76,7 +77,15 @@ def readRawData(inputParameters): ) if modality == "tdt": - execute_readtev(filepath, np.unique(storesList[0, :]), op, numProcesses) + # execute_readtev(filepath, np.unique(storesList[0, :]), op, numProcesses) + extractor = TdtRecordingExtractor(folder_path=filepath) + event = np.unique(storesList[0, :]) + for e in event: + S = extractor.readtev(event=e) + extractor.save_dict_to_hdf5(S=S, event=e, outputPath=op) + extractor.check_data(S=S, event=e, outputPath=op) + logger.info("Data for event {} fetched and stored.".format(e)) + elif modality == "doric": execute_import_doric(filepath, storesList, modality, op) elif modality == "csv" or modality == "npm": diff --git a/src/guppy/saveStoresList.py b/src/guppy/saveStoresList.py index a432546..79fa71a 100755 --- a/src/guppy/saveStoresList.py +++ b/src/guppy/saveStoresList.py @@ -21,7 +21,7 @@ import panel as pn from numpy import float32, float64, int32, int64, uint16 -from guppy.tdt_step2 import readtsq +from guppy.extractors import TdtRecordingExtractor from guppy.csv_step2 import import_csv_step2 from guppy.doric_step2 import import_doric from guppy.npm_step2 import import_npm @@ -589,7 +589,8 @@ def execute(inputParameters): for i in folderNames: filepath = os.path.join(inputParameters["abspath"], i) if modality == "tdt": - data = readtsq(filepath) + extractor = TdtRecordingExtractor(folder_path=filepath) + data = extractor.header_df event_name, flag = [], [] elif modality == "csv": data = 0 From 882556e8b72fca014f51f3cd71ec2b51b9368b4d Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 19 Nov 2025 09:49:16 -0800 Subject: [PATCH 019/150] Adapted parallel execute function to use new extractor. 
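
With the pool helper now living at module level next to the extractor, the parallel TDT
read can be called directly. A small usage sketch (paths and store names are hypothetical;
the output path is the *_output_* folder created when the stores list was saved in Step 2):

    from guppy.extractors.tdt_recording_extractor import execute_readtev

    execute_readtev(
        folder_path="/data/tdt_session",              # folder holding the .tsq/.tev pair
        events=["Dv1A", "Dv2A", "PrtN"],              # store names chosen in Step 2
        outputPath="/data/tdt_session/sub1_output_1",
        numProcesses=4,
    )
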
--- .../extractors/tdt_recording_extractor.py | 23 +++++++++++-------- src/guppy/readTevTsq.py | 11 ++------- 2 files changed, 16 insertions(+), 18 deletions(-) diff --git a/src/guppy/extractors/tdt_recording_extractor.py b/src/guppy/extractors/tdt_recording_extractor.py index 98ae3cd..c0b01f9 100644 --- a/src/guppy/extractors/tdt_recording_extractor.py +++ b/src/guppy/extractors/tdt_recording_extractor.py @@ -12,6 +12,19 @@ logger = logging.getLogger(__name__) +# function to execute readtev function using multiprocessing to make it faster +def execute_readtev(folder_path, events, outputPath, numProcesses=mp.cpu_count()): + extractor = TdtRecordingExtractor(folder_path=folder_path) + start = time.time() + with mp.Pool(numProcesses) as p: + p.starmap(read_tdt_and_save_hdf5, zip(repeat(extractor), events, repeat(outputPath))) + logger.info("Time taken = {0:.5f}".format(time.time() - start)) + +def read_tdt_and_save_hdf5(extractor, event, outputPath): + S = extractor.readtev(event=event) + extractor.save_dict_to_hdf5(S=S, event=event, outputPath=outputPath) + logger.info("Data for event {} fetched and stored.".format(event)) + class TdtRecordingExtractor: def __init__(self, folder_path): @@ -43,14 +56,6 @@ def readtsq(self, folder_path): logger.info("Data from tsq file fetched.") return df, flag - - # function to execute readtev function using multiprocessing to make it faster - def execute_readtev(self, filepath, event, outputPath, numProcesses=mp.cpu_count()): - start = time.time() - with mp.Pool(numProcesses) as p: - p.starmap(self.readtev, zip(repeat(self.header_df), repeat(filepath), event, repeat(outputPath))) - logger.info("Time taken = {0:.5f}".format(time.time() - start)) - # function to read tev file def readtev(self, event): @@ -154,7 +159,7 @@ def save_dict_to_hdf5(self, S, event, outputPath): # function to check event data (checking whether event timestamps belongs to same event or multiple events) - def check_data(self, S, event, outputPath): + def check_data(self, S, event, outputPath): # TODO: fold this function into the main read/get function # logger.info("Checking event storename data for creating multiple event names from single event storename...") new_event = event.replace("\\", "") new_event = event.replace("/", "") diff --git a/src/guppy/readTevTsq.py b/src/guppy/readTevTsq.py index d3c9147..47b7962 100755 --- a/src/guppy/readTevTsq.py +++ b/src/guppy/readTevTsq.py @@ -16,7 +16,6 @@ from guppy.common_step3 import write_hdf5 from guppy.tdt_step3 import execute_readtev -from guppy.extractors import TdtRecordingExtractor from guppy.doric_step3 import execute_import_doric from guppy.csv_step3 import execute_import_csv @@ -77,14 +76,8 @@ def readRawData(inputParameters): ) if modality == "tdt": - # execute_readtev(filepath, np.unique(storesList[0, :]), op, numProcesses) - extractor = TdtRecordingExtractor(folder_path=filepath) - event = np.unique(storesList[0, :]) - for e in event: - S = extractor.readtev(event=e) - extractor.save_dict_to_hdf5(S=S, event=e, outputPath=op) - extractor.check_data(S=S, event=e, outputPath=op) - logger.info("Data for event {} fetched and stored.".format(e)) + events = np.unique(storesList[0, :]) + execute_readtev(filepath, events, op, numProcesses) elif modality == "doric": execute_import_doric(filepath, storesList, modality, op) From df7b9e160a46723c12193946d2aebaa156fe336c Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 19 Nov 2025 11:09:51 -0800 Subject: [PATCH 020/150] Added CsvRecordingExtractor for step 2 --- 
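For reference, the two plain csv layouts this extractor classifies, and the events/flags
it derives from them. A self-contained sketch with throwaway files; the file names and
values are made up:

    import os
    import tempfile

    import pandas as pd

    from guppy.extractors import CsvRecordingExtractor

    folder = tempfile.mkdtemp()

    # "data_csv": exactly the columns timestamps, data, sampling_rate (lower-case)
    pd.DataFrame(
        {"timestamps": [0.0, 0.1], "data": [1.2, 1.3], "sampling_rate": [10.0, None]}
    ).to_csv(os.path.join(folder, "signal_A.csv"), index=False)

    # "event_csv": a single timestamps column
    pd.DataFrame({"timestamps": [2.5, 7.1]}).to_csv(os.path.join(folder, "lick.csv"), index=False)

    extractor = CsvRecordingExtractor(folder_path=folder)
    print(extractor.events)  # ['lick', 'signal_A']  (file basenames, sorted)
    print(extractor.flags)   # ['event_csv', 'data_csv']
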
src/guppy/extractors/__init__.py | 3 +- .../extractors/csv_recording_extractor.py | 115 ++++++++++++++++++ src/guppy/saveStoresList.py | 15 +-- 3 files changed, 123 insertions(+), 10 deletions(-) create mode 100644 src/guppy/extractors/csv_recording_extractor.py diff --git a/src/guppy/extractors/__init__.py b/src/guppy/extractors/__init__.py index 249daf9..812622b 100644 --- a/src/guppy/extractors/__init__.py +++ b/src/guppy/extractors/__init__.py @@ -1 +1,2 @@ -from .tdt_recording_extractor import TdtRecordingExtractor \ No newline at end of file +from .tdt_recording_extractor import TdtRecordingExtractor +from .csv_recording_extractor import CsvRecordingExtractor diff --git a/src/guppy/extractors/csv_recording_extractor.py b/src/guppy/extractors/csv_recording_extractor.py new file mode 100644 index 0000000..f5a73e9 --- /dev/null +++ b/src/guppy/extractors/csv_recording_extractor.py @@ -0,0 +1,115 @@ +import glob +import logging +import os + +import numpy as np +import pandas as pd + +logger = logging.getLogger(__name__) + + +class CsvRecordingExtractor: + + def __init__(self, folder_path): + self.folder_path = folder_path + + logger.debug("If it exists, importing either NPM or Doric or csv file based on the structure of file") + path = sorted(glob.glob(os.path.join(self.folder_path, "*.csv"))) + + path = sorted(list(set(path))) + flag = "None" + event_from_filename = [] + flag_arr = [] + for i in range(len(path)): + ext = os.path.basename(path[i]).split(".")[-1] + assert ext == "csv", "Only .csv files are supported by import_csv function." + df = pd.read_csv(path[i], header=None, nrows=2, index_col=False, dtype=str) + df = df.dropna(axis=1, how="all") + df_arr = np.array(df).flatten() + check_all_str = [] + for element in df_arr: + try: + float(element) + except: + check_all_str.append(i) + assert len(check_all_str) != len( + df_arr + ), "This file appears to be doric .csv. This function only supports standard .csv files." + df = pd.read_csv(path[i], index_col=False) + + _, value = self.check_header(df) + + # check dataframe structure and read data accordingly + if len(value) > 0: + columns_isstr = False + df = pd.read_csv(path[i], header=None) + cols = np.array(list(df.columns), dtype=str) + else: + df = df + columns_isstr = True + cols = np.array(list(df.columns), dtype=str) + # check the structure of dataframe and assign flag to the type of file + if len(cols) == 1: + if cols[0].lower() != "timestamps": + logger.error("\033[1m" + "Column name should be timestamps (all lower-cases)" + "\033[0m") + raise Exception("\033[1m" + "Column name should be timestamps (all lower-cases)" + "\033[0m") + else: + flag = "event_csv" + elif len(cols) == 3: + arr1 = np.array(["timestamps", "data", "sampling_rate"]) + arr2 = np.char.lower(np.array(cols)) + if (np.sort(arr1) == np.sort(arr2)).all() == False: + logger.error( + "\033[1m" + + "Column names should be timestamps, data and sampling_rate (all lower-cases)" + + "\033[0m" + ) + raise Exception( + "\033[1m" + + "Column names should be timestamps, data and sampling_rate (all lower-cases)" + + "\033[0m" + ) + else: + flag = "data_csv" + elif len(cols) == 2: + raise ValueError( + "Data appears to be Neurophotometrics csv. Please use import_npm_csv function to import the data." + ) + elif len(cols) >= 2: + raise ValueError( + "Data appears to be Neurophotometrics csv. Please use import_npm_csv function to import the data." 
+ ) + else: + logger.error("Number of columns in csv file does not make sense.") + raise Exception("Number of columns in csv file does not make sense.") + + if columns_isstr == True and ( + "flags" in np.char.lower(np.array(cols)) or "ledstate" in np.char.lower(np.array(cols)) + ): + flag = flag + "_v2" + else: + flag = flag + + flag_arr.append(flag) + logger.info(flag) + assert ( + flag == "event_csv" or flag == "data_csv" + ), "This function only supports standard event_csv and data_csv files." + name = os.path.basename(path[i]).split(".")[0] + event_from_filename.append(name) + + logger.info("Importing of csv file is done.") + + self.events = event_from_filename + self.flags = flag_arr + + def check_header(self, df): + arr = list(df.columns) + check_float = [] + for i in arr: + try: + check_float.append(float(i)) + except: + pass + + return arr, check_float diff --git a/src/guppy/saveStoresList.py b/src/guppy/saveStoresList.py index 79fa71a..e64be8c 100755 --- a/src/guppy/saveStoresList.py +++ b/src/guppy/saveStoresList.py @@ -9,21 +9,16 @@ import logging import os import socket -import tkinter as tk from pathlib import Path from random import randint -from tkinter import StringVar, messagebox, ttk -import h5py import holoviews as hv import numpy as np import pandas as pd import panel as pn -from numpy import float32, float64, int32, int64, uint16 -from guppy.extractors import TdtRecordingExtractor -from guppy.csv_step2 import import_csv_step2 from guppy.doric_step2 import import_doric +from guppy.extractors import CsvRecordingExtractor, TdtRecordingExtractor from guppy.npm_step2 import import_npm # hv.extension() @@ -573,7 +568,6 @@ def save_button(event=None): template.show(port=number) - # function to read input parameters and run the saveStorenames function def execute(inputParameters): @@ -594,7 +588,10 @@ def execute(inputParameters): event_name, flag = [], [] elif modality == "csv": data = 0 - event_name, flag = import_csv_step2(filepath) + extractor = CsvRecordingExtractor(folder_path=filepath) + event_name = extractor.events + flag = extractor.flags + elif modality == "doric": data = 0 event_name, flag = import_doric(filepath) @@ -603,7 +600,7 @@ def execute(inputParameters): event_name, flag = import_npm(filepath, num_ch) else: raise ValueError("Modality not recognized. Please use 'tdt', 'csv', 'doric', or 'npm'.") - + saveStorenames(inputParameters, data, event_name, flag, filepath) logger.info("#" * 400) except Exception as e: From bcb78a51d52b54f3126d50260e735c4929da3a4e Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 19 Nov 2025 11:10:24 -0800 Subject: [PATCH 021/150] Installed pre-commit. 
--- src/guppy/common_step3.py | 13 ++------ src/guppy/csv_step2.py | 21 +++++++++--- src/guppy/csv_step3.py | 9 +----- src/guppy/doric_step2.py | 9 ++++-- src/guppy/doric_step3.py | 5 +-- .../extractors/tdt_recording_extractor.py | 32 +++++++++++-------- src/guppy/npm_step2.py | 13 +++++--- src/guppy/readTevTsq.py | 13 ++------ src/guppy/savingInputParameters.py | 4 ++- src/guppy/tdt_step2.py | 6 ++-- src/guppy/tdt_step3.py | 12 +++---- 11 files changed, 70 insertions(+), 67 deletions(-) diff --git a/src/guppy/common_step3.py b/src/guppy/common_step3.py index 4ea5c95..09e763f 100644 --- a/src/guppy/common_step3.py +++ b/src/guppy/common_step3.py @@ -1,21 +1,12 @@ -import glob -import json import logging -import multiprocessing as mp import os -import re -import sys -import time -import warnings -from itertools import repeat import h5py import numpy as np -import pandas as pd -from numpy import float32, float64, int32, int64, uint16 logger = logging.getLogger(__name__) + # function to write data to a hdf5 file def write_hdf5(data, event, filepath, key): @@ -48,4 +39,4 @@ def write_hdf5(data, event, filepath, key): if type(data) is np.ndarray: f.create_dataset(key, data=data, maxshape=(None,), chunks=True) else: - f.create_dataset(key, data=data) \ No newline at end of file + f.create_dataset(key, data=data) diff --git a/src/guppy/csv_step2.py b/src/guppy/csv_step2.py index 4d9b800..ba4b34f 100644 --- a/src/guppy/csv_step2.py +++ b/src/guppy/csv_step2.py @@ -1,11 +1,13 @@ import glob import logging import os + import numpy as np import pandas as pd logger = logging.getLogger(__name__) + def check_header(df): arr = list(df.columns) check_float = [] @@ -17,6 +19,7 @@ def check_header(df): return arr, check_float + def import_csv_step2(filepath): logger.debug("If it exists, importing either NPM or Doric or csv file based on the structure of file") path = sorted(glob.glob(os.path.join(filepath, "*.csv"))) @@ -37,7 +40,9 @@ def import_csv_step2(filepath): float(element) except: check_all_str.append(i) - assert len(check_all_str) != len(df_arr), "This file appears to be doric .csv. This function only supports standard .csv files." + assert len(check_all_str) != len( + df_arr + ), "This file appears to be doric .csv. This function only supports standard .csv files." df = pd.read_csv(path[i], index_col=False) _, value = check_header(df) @@ -75,9 +80,13 @@ def import_csv_step2(filepath): else: flag = "data_csv" elif len(cols) == 2: - raise ValueError("Data appears to be Neurophotometrics csv. Please use import_npm_csv function to import the data.") + raise ValueError( + "Data appears to be Neurophotometrics csv. Please use import_npm_csv function to import the data." + ) elif len(cols) >= 2: - raise ValueError("Data appears to be Neurophotometrics csv. Please use import_npm_csv function to import the data.") + raise ValueError( + "Data appears to be Neurophotometrics csv. Please use import_npm_csv function to import the data." + ) else: logger.error("Number of columns in csv file does not make sense.") raise Exception("Number of columns in csv file does not make sense.") @@ -91,9 +100,11 @@ def import_csv_step2(filepath): flag_arr.append(flag) logger.info(flag) - assert flag == "event_csv" or flag == "data_csv", "This function only supports standard event_csv and data_csv files." + assert ( + flag == "event_csv" or flag == "data_csv" + ), "This function only supports standard event_csv and data_csv files." 
name = os.path.basename(path[i]).split(".")[0] event_from_filename.append(name) logger.info("Importing of csv file is done.") - return event_from_filename, flag_arr \ No newline at end of file + return event_from_filename, flag_arr diff --git a/src/guppy/csv_step3.py b/src/guppy/csv_step3.py index 97d3eb5..985959a 100644 --- a/src/guppy/csv_step3.py +++ b/src/guppy/csv_step3.py @@ -1,18 +1,11 @@ -import glob -import json import logging import multiprocessing as mp import os -import re -import sys import time -import warnings from itertools import repeat -import h5py import numpy as np import pandas as pd -from numpy import float32, float64, int32, int64, uint16 from guppy.common_step3 import write_hdf5 @@ -70,4 +63,4 @@ def import_csv(filepath, event, outputPath): logger.info("\033[1m" + "Reading data for {} from csv file is completed.".format(event) + "\033[0m") - return data, key \ No newline at end of file + return data, key diff --git a/src/guppy/doric_step2.py b/src/guppy/doric_step2.py index bf402d1..26ab22e 100644 --- a/src/guppy/doric_step2.py +++ b/src/guppy/doric_step2.py @@ -8,6 +8,7 @@ logger = logging.getLogger(__name__) + def import_doric(filepath): logger.debug("If it exists, importing Doric file based on the structure of file") @@ -33,7 +34,9 @@ def import_doric(filepath): float(element) except: check_all_str.append(i) - assert len(check_all_str) == len(df_arr), "This file appears to be standard .csv. This function only supports doric .csv files." + assert len(check_all_str) == len( + df_arr + ), "This file appears to be standard .csv. This function only supports doric .csv files." df = pd.read_csv(path[i], header=1, index_col=False, nrows=10) df = df.drop(["Time(s)"], axis=1) event_from_filename.extend(list(df.columns)) @@ -52,6 +55,7 @@ def read_doric(filepath): return keys + def access_keys_doricV6(doric_file): data = [doric_file["DataAcquisition"]] res = [] @@ -82,6 +86,7 @@ def access_keys_doricV1(doric_file): return keys + def separate_last_element(arr): l = arr[-1] - return arr[:-1], l \ No newline at end of file + return arr[:-1], l diff --git a/src/guppy/doric_step3.py b/src/guppy/doric_step3.py index 2c30887..e9fd7cc 100644 --- a/src/guppy/doric_step3.py +++ b/src/guppy/doric_step3.py @@ -12,6 +12,7 @@ logger = logging.getLogger(__name__) + def check_doric(filepath): logger.debug("Checking if doric file exists") path = glob.glob(os.path.join(filepath, "*.csv")) + glob.glob(os.path.join(filepath, "*.doric")) @@ -43,6 +44,7 @@ def check_doric(filepath): logger.info("Doric file found.") return flag_arr[0] + def execute_import_doric(filepath, storesList, flag, outputPath): flag = check_doric(filepath) @@ -83,7 +85,6 @@ def execute_import_doric(filepath, storesList, flag, outputPath): keys = access_data_doricV6(f, storesList, outputPath) - def access_data_doricV6(doric_file, storesList, outputPath): data = [doric_file["DataAcquisition"]] res = [] @@ -155,4 +156,4 @@ def access_data_doricV1(doric_file, storesList, outputPath): def separate_last_element(arr): l = arr[-1] - return arr[:-1], l \ No newline at end of file + return arr[:-1], l diff --git a/src/guppy/extractors/tdt_recording_extractor.py b/src/guppy/extractors/tdt_recording_extractor.py index c0b01f9..1d46b1e 100644 --- a/src/guppy/extractors/tdt_recording_extractor.py +++ b/src/guppy/extractors/tdt_recording_extractor.py @@ -1,17 +1,19 @@ import glob import logging -import os -import numpy as np -from numpy import float32, float64, int32, int64, uint16 -import pandas as pd import multiprocessing as mp 
+import os import time from itertools import repeat +import numpy as np +import pandas as pd +from numpy import float32, float64, int32, int64, uint16 + from guppy.common_step3 import write_hdf5 logger = logging.getLogger(__name__) + # function to execute readtev function using multiprocessing to make it faster def execute_readtev(folder_path, events, outputPath, numProcesses=mp.cpu_count()): extractor = TdtRecordingExtractor(folder_path=folder_path) @@ -20,11 +22,13 @@ def execute_readtev(folder_path, events, outputPath, numProcesses=mp.cpu_count() p.starmap(read_tdt_and_save_hdf5, zip(repeat(extractor), events, repeat(outputPath))) logger.info("Time taken = {0:.5f}".format(time.time() - start)) + def read_tdt_and_save_hdf5(extractor, event, outputPath): S = extractor.readtev(event=event) extractor.save_dict_to_hdf5(S=S, event=event, outputPath=outputPath) logger.info("Data for event {} fetched and stored.".format(event)) + class TdtRecordingExtractor: def __init__(self, folder_path): @@ -94,8 +98,6 @@ def readtev(self, event): logger.error("\033[1m" + "TDT store name " + str(event) + " not found." + "\033[0m") raise ValueError("Requested store name not found.") - - allIndexesWhereEventIsPresent = np.where(row == 1) first_row = allIndexesWhereEventIsPresent[0][0] @@ -142,11 +144,10 @@ def readtev(self, event): return S # check if a particular element is there in an array or not - def ismember(self, arr, element): # TODO: replace this function with more standard usage + def ismember(self, arr, element): # TODO: replace this function with more standard usage res = [1 if i == element else 0 for i in arr] return np.asarray(res) - # function to save data read from tev file to hdf5 file def save_dict_to_hdf5(self, S, event, outputPath): write_hdf5(S["storename"], event, outputPath, "storename") @@ -157,16 +158,17 @@ def save_dict_to_hdf5(self, S, event, outputPath): write_hdf5(S["npoints"], event, outputPath, "npoints") write_hdf5(S["channels"], event, outputPath, "channels") - # function to check event data (checking whether event timestamps belongs to same event or multiple events) - def check_data(self, S, event, outputPath): # TODO: fold this function into the main read/get function + def check_data(self, S, event, outputPath): # TODO: fold this function into the main read/get function # logger.info("Checking event storename data for creating multiple event names from single event storename...") new_event = event.replace("\\", "") new_event = event.replace("/", "") diff = np.diff(S["data"]) arr = np.full(diff.shape[0], 1) - storesList = np.genfromtxt(os.path.join(outputPath, "storesList.csv"), dtype="str", delimiter=",").reshape(2, -1) + storesList = np.genfromtxt(os.path.join(outputPath, "storesList.csv"), dtype="str", delimiter=",").reshape( + 2, -1 + ) if diff.shape[0] == 0: return 0 @@ -174,7 +176,9 @@ def check_data(self, S, event, outputPath): # TODO: fold this function into the if S["sampling_rate"] == 0 and np.all(diff == diff[0]) == False: logger.info("\033[1m" + "Data in event {} belongs to multiple behavior".format(event) + "\033[0m") logger.debug( - "\033[1m" + "Create timestamp files for individual new event and change the stores list file." + "\033[0m" + "\033[1m" + + "Create timestamp files for individual new event and change the stores list file." 
+ + "\033[0m" ) i_d = np.unique(S["data"]) for i in range(i_d.shape[0]): @@ -199,4 +203,6 @@ def check_data(self, S, event, outputPath): # TODO: fold this function into the pass else: np.savetxt(os.path.join(outputPath, "storesList.csv"), storesList, delimiter=",", fmt="%s") - logger.info("\033[1m Timestamp files for individual new event are created and the stores list file is changed.\033[0m") \ No newline at end of file + logger.info( + "\033[1m Timestamp files for individual new event are created and the stores list file is changed.\033[0m" + ) diff --git a/src/guppy/npm_step2.py b/src/guppy/npm_step2.py index f0fafec..14b776f 100644 --- a/src/guppy/npm_step2.py +++ b/src/guppy/npm_step2.py @@ -12,6 +12,7 @@ logger = logging.getLogger(__name__) + def import_npm(filepath, num_ch, inputParameters=None): logger.debug("If it exists, importing NPM file based on the structure of file") @@ -49,7 +50,9 @@ def import_npm(filepath, num_ch, inputParameters=None): float(element) except: check_all_str.append(i) - assert len(check_all_str) != len(df_arr), "This file appears to be doric .csv. This function only supports NPM .csv files." + assert len(check_all_str) != len( + df_arr + ), "This file appears to be doric .csv. This function only supports NPM .csv files." df = pd.read_csv(path[i], index_col=False) _, value = check_header(df) @@ -174,9 +177,7 @@ def import_npm(filepath, num_ch, inputParameters=None): # path_sig = glob.glob(os.path.join(filepath, 'sig*')) path_chev_chod_chpr = [path_chev, path_chod, path_chpr] if ( - ("data_np_v2" in flag_arr or "data_np" in flag_arr) - and ("event_np" in flag_arr) - and (i == len(path) - 1) + ("data_np_v2" in flag_arr or "data_np" in flag_arr) and ("event_np" in flag_arr) and (i == len(path) - 1) ) or ( ("data_np_v2" in flag_arr or "data_np" in flag_arr) and (i == len(path) - 1) ): # i==len(path)-1 and or 'event_np' in flag @@ -234,6 +235,7 @@ def import_npm(filepath, num_ch, inputParameters=None): logger.info("Importing of NPM file is done.") return event_from_filename, flag_arr + def check_header(df): arr = list(df.columns) check_float = [] @@ -294,6 +296,7 @@ def decide_indices(file, df, flag, num_ch=2): return df, indices_dict, num_ch + # check flag consistency in neurophotometrics data def check_channels(state): state = state.astype(int) @@ -405,4 +408,4 @@ def decide_ts_unit_for_npm(df, timestamp_column_name=None, time_unit=None, headl else: pass - return df, ts_unit \ No newline at end of file + return df, ts_unit diff --git a/src/guppy/readTevTsq.py b/src/guppy/readTevTsq.py index 47b7962..b86f6a2 100755 --- a/src/guppy/readTevTsq.py +++ b/src/guppy/readTevTsq.py @@ -3,21 +3,13 @@ import logging import multiprocessing as mp import os -import re import sys -import time -import warnings -from itertools import repeat -import h5py import numpy as np -import pandas as pd -from numpy import float32, float64, int32, int64, uint16 -from guppy.common_step3 import write_hdf5 -from guppy.tdt_step3 import execute_readtev -from guppy.doric_step3 import execute_import_doric from guppy.csv_step3 import execute_import_csv +from guppy.doric_step3 import execute_import_doric +from guppy.tdt_step3 import execute_readtev logger = logging.getLogger(__name__) @@ -34,6 +26,7 @@ def writeToFile(value: str): with open(os.path.join(os.path.expanduser("~"), "pbSteps.txt"), "a") as file: file.write(value) + # function to read data from 'tsq' and 'tev' files def readRawData(inputParameters): diff --git a/src/guppy/savingInputParameters.py b/src/guppy/savingInputParameters.py 
index b0a5feb..a1bd35e 100644 --- a/src/guppy/savingInputParameters.py +++ b/src/guppy/savingInputParameters.py @@ -554,7 +554,9 @@ def onclickpsth(event=None): psth_baseline_param = pn.Column(zscore_param_wd, psth_param_wd, baseline_param_wd, peak_param_wd) - widget = pn.Column(mark_down_1, files_1, explain_modality, modality_selector, pn.Row(individual_analysis_wd_2, psth_baseline_param)) + widget = pn.Column( + mark_down_1, files_1, explain_modality, modality_selector, pn.Row(individual_analysis_wd_2, psth_baseline_param) + ) # file_selector = pn.WidgetBox(files_1) styles = dict(background="WhiteSmoke") diff --git a/src/guppy/tdt_step2.py b/src/guppy/tdt_step2.py index 09456a7..130ace8 100644 --- a/src/guppy/tdt_step2.py +++ b/src/guppy/tdt_step2.py @@ -1,12 +1,14 @@ import glob import logging import os + import numpy as np -from numpy import float32, float64, int32, int64, uint16 import pandas as pd +from numpy import float32, float64, int32, int64, uint16 logger = logging.getLogger(__name__) + # function to read 'tsq' file def readtsq(filepath): names = ("size", "type", "name", "chan", "sort_code", "timestamp", "fp_loc", "strobe", "format", "frequency") @@ -23,4 +25,4 @@ def readtsq(filepath): path = path[0] tsq = np.fromfile(path, dtype=tsq_dtype) df = pd.DataFrame(tsq) - return df \ No newline at end of file + return df diff --git a/src/guppy/tdt_step3.py b/src/guppy/tdt_step3.py index bc629f0..be92d4c 100644 --- a/src/guppy/tdt_step3.py +++ b/src/guppy/tdt_step3.py @@ -1,15 +1,10 @@ import glob -import json import logging import multiprocessing as mp import os -import re -import sys import time -import warnings from itertools import repeat -import h5py import numpy as np import pandas as pd from numpy import float32, float64, int32, int64, uint16 @@ -18,6 +13,7 @@ logger = logging.getLogger(__name__) + # function to read tsq file def readtsq(filepath): logger.debug("Trying to read tsq file.") @@ -45,6 +41,7 @@ def readtsq(filepath): logger.info("Data from tsq file fetched.") return df, flag + # function to execute readtev function using multiprocessing to make it faster def execute_readtev(filepath, event, outputPath, numProcesses=mp.cpu_count()): data, _ = readtsq(filepath) @@ -94,8 +91,6 @@ def readtev(data, filepath, event, outputPath): logger.error("\033[1m" + "TDT store name " + str(event) + " not found." + "\033[0m") raise ValueError("Requested store name not found.") - - allIndexesWhereEventIsPresent = np.where(row == 1) first_row = allIndexesWhereEventIsPresent[0][0] @@ -145,6 +140,7 @@ def readtev(data, filepath, event, outputPath): logger.info("Data for event {} fetched and stored.".format(event)) + # check if a particular element is there in an array or not def ismember(arr, element): res = [1 if i == element else 0 for i in arr] @@ -208,4 +204,4 @@ def check_data(S, filepath, event, outputPath): + "Timestamp files for individual new event are created \ and the stores list file is changed." 
+ "\033[0m" - ) \ No newline at end of file + ) From 1c8ee07e09d566578219e08e41bc87a54bb9854a Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 19 Nov 2025 11:33:31 -0800 Subject: [PATCH 022/150] Added CsvRecordingExtractor for step 3 --- src/guppy/extractors/__init__.py | 4 +- .../extractors/csv_recording_extractor.py | 65 +++++++++++++++++++ src/guppy/readTevTsq.py | 7 +- 3 files changed, 71 insertions(+), 5 deletions(-) diff --git a/src/guppy/extractors/__init__.py b/src/guppy/extractors/__init__.py index 812622b..a421290 100644 --- a/src/guppy/extractors/__init__.py +++ b/src/guppy/extractors/__init__.py @@ -1,2 +1,2 @@ -from .tdt_recording_extractor import TdtRecordingExtractor -from .csv_recording_extractor import CsvRecordingExtractor +from .tdt_recording_extractor import TdtRecordingExtractor, execute_readtev +from .csv_recording_extractor import CsvRecordingExtractor, execute_import_csv diff --git a/src/guppy/extractors/csv_recording_extractor.py b/src/guppy/extractors/csv_recording_extractor.py index f5a73e9..3df76f6 100644 --- a/src/guppy/extractors/csv_recording_extractor.py +++ b/src/guppy/extractors/csv_recording_extractor.py @@ -1,13 +1,34 @@ import glob import logging +import multiprocessing as mp import os +import time +from itertools import repeat import numpy as np import pandas as pd +from guppy.common_step3 import write_hdf5 + logger = logging.getLogger(__name__) +def execute_import_csv(filepath, events, outputPath, numProcesses=mp.cpu_count()): + logger.info("Reading data for event {} ...".format(events)) + + extractor = CsvRecordingExtractor(folder_path=filepath) + start = time.time() + with mp.Pool(numProcesses) as p: + p.starmap(read_csv_and_save_hdf5, zip(repeat(extractor), events, repeat(outputPath))) + logger.info("Time taken = {0:.5f}".format(time.time() - start)) + + +def read_csv_and_save_hdf5(extractor, event, outputPath): + df = extractor.read_csv(event=event) + extractor.save_to_hdf5(df=df, event=event, outputPath=outputPath) + logger.info("Data for event {} fetched and stored.".format(event)) + + class CsvRecordingExtractor: def __init__(self, folder_path): @@ -113,3 +134,47 @@ def check_header(self, df): pass return arr, check_float + + def read_csv(self, event): + logger.debug("\033[1m" + "Trying to read data for {} from csv file.".format(event) + "\033[0m") + if not os.path.exists(os.path.join(self.folder_path, event + ".csv")): + logger.error("\033[1m" + "No csv file found for event {}".format(event) + "\033[0m") + raise Exception("\033[1m" + "No csv file found for event {}".format(event) + "\033[0m") + + df = pd.read_csv(os.path.join(self.folder_path, event + ".csv"), index_col=False) + return df + + def save_to_hdf5(self, df, event, outputPath): + key = list(df.columns) + + # TODO: clean up these if branches + if len(key) == 3: + arr1 = np.array(["timestamps", "data", "sampling_rate"]) + arr2 = np.char.lower(np.array(key)) + if (np.sort(arr1) == np.sort(arr2)).all() == False: + logger.error("\033[1m" + "Column names should be timestamps, data and sampling_rate" + "\033[0m") + raise Exception("\033[1m" + "Column names should be timestamps, data and sampling_rate" + "\033[0m") + + if len(key) == 1: + if key[0].lower() != "timestamps": + logger.error("\033[1m" + "Column names should be timestamps, data and sampling_rate" + "\033[0m") + raise Exception("\033[1m" + "Column name should be timestamps" + "\033[0m") + + if len(key) != 3 and len(key) != 1: + logger.error( + "\033[1m" + + "Number of columns in csv file should be either three or one. 
Three columns if \ + the file is for control or signal data or one column if the file is for event TTLs." + + "\033[0m" + ) + raise Exception( + "\033[1m" + + "Number of columns in csv file should be either three or one. Three columns if \ + the file is for control or signal data or one column if the file is for event TTLs." + + "\033[0m" + ) + + for i in range(len(key)): + write_hdf5(df[key[i]].dropna(), event, outputPath, key[i].lower()) + + logger.info("\033[1m" + "Reading data for {} from csv file is completed.".format(event) + "\033[0m") diff --git a/src/guppy/readTevTsq.py b/src/guppy/readTevTsq.py index b86f6a2..c67f075 100755 --- a/src/guppy/readTevTsq.py +++ b/src/guppy/readTevTsq.py @@ -7,9 +7,8 @@ import numpy as np -from guppy.csv_step3 import execute_import_csv from guppy.doric_step3 import execute_import_doric -from guppy.tdt_step3 import execute_readtev +from guppy.extractors import execute_import_csv, execute_readtev logger = logging.getLogger(__name__) @@ -74,8 +73,10 @@ def readRawData(inputParameters): elif modality == "doric": execute_import_doric(filepath, storesList, modality, op) - elif modality == "csv" or modality == "npm": + elif modality == "csv": execute_import_csv(filepath, np.unique(storesList[0, :]), op, numProcesses) + elif modality == "npm": + raise NotImplementedError("NPM modality is not yet implemented.") else: raise ValueError("Modality not recognized. Please use 'tdt', 'csv', 'doric', or 'npm'.") From 9262a5ad3cf21497a6a489183a7b093768cd15cb Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 19 Nov 2025 11:51:24 -0800 Subject: [PATCH 023/150] Added DoricRecordingExtractor for step 2 --- src/guppy/extractors/__init__.py | 1 + src/guppy/saveStoresList.py | 12 +++++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/guppy/extractors/__init__.py b/src/guppy/extractors/__init__.py index a421290..ebb9fb0 100644 --- a/src/guppy/extractors/__init__.py +++ b/src/guppy/extractors/__init__.py @@ -1,2 +1,3 @@ from .tdt_recording_extractor import TdtRecordingExtractor, execute_readtev from .csv_recording_extractor import CsvRecordingExtractor, execute_import_csv +from .doric_recording_extractor import DoricRecordingExtractor diff --git a/src/guppy/saveStoresList.py b/src/guppy/saveStoresList.py index e64be8c..baec41e 100755 --- a/src/guppy/saveStoresList.py +++ b/src/guppy/saveStoresList.py @@ -17,8 +17,11 @@ import pandas as pd import panel as pn -from guppy.doric_step2 import import_doric -from guppy.extractors import CsvRecordingExtractor, TdtRecordingExtractor +from guppy.extractors import ( + CsvRecordingExtractor, + DoricRecordingExtractor, + TdtRecordingExtractor, +) from guppy.npm_step2 import import_npm # hv.extension() @@ -594,7 +597,10 @@ def execute(inputParameters): elif modality == "doric": data = 0 - event_name, flag = import_doric(filepath) + extractor = DoricRecordingExtractor(folder_path=filepath) + event_name = extractor.events + flag = extractor.flags + elif modality == "npm": data = 0 event_name, flag = import_npm(filepath, num_ch) From 9c5afced4ccabb31edc87eaafaa1b54df5d95eb9 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 19 Nov 2025 11:51:49 -0800 Subject: [PATCH 024/150] Added DoricRecordingExtractor for step 2 --- .../extractors/doric_recording_extractor.py | 94 +++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 src/guppy/extractors/doric_recording_extractor.py diff --git a/src/guppy/extractors/doric_recording_extractor.py b/src/guppy/extractors/doric_recording_extractor.py new file 
mode 100644 index 0000000..f45df50 --- /dev/null +++ b/src/guppy/extractors/doric_recording_extractor.py @@ -0,0 +1,94 @@ +import glob +import logging +import os + +import h5py +import numpy as np +import pandas as pd + +logger = logging.getLogger(__name__) + + +class DoricRecordingExtractor: + + def __init__(self, folder_path): + self.folder_path = folder_path + logger.debug("If it exists, importing Doric file based on the structure of file") + path = sorted(glob.glob(os.path.join(self.folder_path, "*.csv"))) + sorted( + glob.glob(os.path.join(self.folder_path, "*.doric")) + ) + + path = sorted(list(set(path))) + flag = "None" + event_from_filename = [] + flag_arr = [] + for i in range(len(path)): + ext = os.path.basename(path[i]).split(".")[-1] + if ext == "doric": + key_names = self.read_doric(path[i]) + event_from_filename.extend(key_names) + flag = "doric_doric" + else: + df = pd.read_csv(path[i], header=None, nrows=2, index_col=False, dtype=str) + df = df.dropna(axis=1, how="all") + df_arr = np.array(df).flatten() + check_all_str = [] + for element in df_arr: + try: + float(element) + except: + check_all_str.append(i) + assert len(check_all_str) == len( + df_arr + ), "This file appears to be standard .csv. This function only supports doric .csv files." + df = pd.read_csv(path[i], header=1, index_col=False, nrows=10) + df = df.drop(["Time(s)"], axis=1) + event_from_filename.extend(list(df.columns)) + flag = "doric_csv" + logger.info(flag) + logger.info("Importing of Doric file is done.") + + self.events = event_from_filename + self.flags = flag_arr + + def read_doric(self, filepath): + with h5py.File(filepath, "r") as f: + if "Traces" in list(f.keys()): + keys = self.access_keys_doricV1(f) + elif list(f.keys()) == ["Configurations", "DataAcquisition"]: + keys = self.access_keys_doricV6(f) + + return keys + + def access_keys_doricV6(self, doric_file): + data = [doric_file["DataAcquisition"]] + res = [] + while len(data) != 0: + members = len(data) + while members != 0: + members -= 1 + data, last_element = self.separate_last_element(data) + if isinstance(last_element, h5py.Dataset) and not last_element.name.endswith("/Time"): + res.append(last_element.name) + elif isinstance(last_element, h5py.Group): + data.extend(reversed([last_element[k] for k in last_element.keys()])) + + keys = [] + for element in res: + sep_values = element.split("/") + if sep_values[-1] == "Values": + keys.append(f"{sep_values[-3]}/{sep_values[-2]}") + else: + keys.append(f"{sep_values[-2]}/{sep_values[-1]}") + + return keys + + def access_keys_doricV1(self, doric_file): + keys = list(doric_file["Traces"]["Console"].keys()) + keys.remove("Time(s)") + + return keys + + def separate_last_element(self, arr): + l = arr[-1] + return arr[:-1], l From 914f23f36b7a4adc9a4edeb9af1a316c146e9586 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 19 Nov 2025 14:22:40 -0800 Subject: [PATCH 025/150] Added DoricRecordingExtractor for step 3 --- src/guppy/extractors/__init__.py | 2 +- .../extractors/doric_recording_extractor.py | 152 ++++++++++++++++++ 2 files changed, 153 insertions(+), 1 deletion(-) diff --git a/src/guppy/extractors/__init__.py b/src/guppy/extractors/__init__.py index ebb9fb0..b3c2c3a 100644 --- a/src/guppy/extractors/__init__.py +++ b/src/guppy/extractors/__init__.py @@ -1,3 +1,3 @@ from .tdt_recording_extractor import TdtRecordingExtractor, execute_readtev from .csv_recording_extractor import CsvRecordingExtractor, execute_import_csv -from .doric_recording_extractor import DoricRecordingExtractor +from 
.doric_recording_extractor import DoricRecordingExtractor, execute_import_doric diff --git a/src/guppy/extractors/doric_recording_extractor.py b/src/guppy/extractors/doric_recording_extractor.py index f45df50..cbade8b 100644 --- a/src/guppy/extractors/doric_recording_extractor.py +++ b/src/guppy/extractors/doric_recording_extractor.py @@ -1,14 +1,31 @@ import glob import logging import os +import re +import warnings import h5py import numpy as np import pandas as pd +from guppy.common_step3 import write_hdf5 + logger = logging.getLogger(__name__) +def execute_import_doric(folder_path, storesList, flag, outputPath): + extractor = DoricRecordingExtractor(folder_path=folder_path) + flag = extractor.check_doric(folder_path) + + if flag == "doric_csv": + extractor.read_doric_csv(folder_path, storesList, outputPath) + elif flag == "doric_doric": + extractor.read_doric_doric(folder_path, storesList, outputPath) + else: + logger.error("Doric file not found or not recognized.") + raise FileNotFoundError("Doric file not found or not recognized.") + + class DoricRecordingExtractor: def __init__(self, folder_path): @@ -92,3 +109,138 @@ def access_keys_doricV1(self, doric_file): def separate_last_element(self, arr): l = arr[-1] return arr[:-1], l + + def check_doric(self, filepath): + logger.debug("Checking if doric file exists") + path = glob.glob(os.path.join(filepath, "*.csv")) + glob.glob(os.path.join(filepath, "*.doric")) + + flag_arr = [] + for i in range(len(path)): + ext = os.path.basename(path[i]).split(".")[-1] + if ext == "csv": + with warnings.catch_warnings(): + warnings.simplefilter("error") + try: + df = pd.read_csv(path[i], index_col=False, dtype=float) + except: # TODO: fix this bare try-except + df = pd.read_csv(path[i], header=1, index_col=False, nrows=10) + flag = "doric_csv" + flag_arr.append(flag) + elif ext == "doric": + flag = "doric_doric" + flag_arr.append(flag) + else: + pass + + if len(flag_arr) > 1: + logger.error("Two doric files are present at the same location") + raise Exception("Two doric files are present at the same location") + if len(flag_arr) == 0: + logger.error("\033[1m" + "Doric file not found." 
+ "\033[1m") + return 0 + logger.info("Doric file found.") + return flag_arr[0] + + def read_doric_csv(self, filepath, storesList, outputPath): + path = glob.glob(os.path.join(filepath, "*.csv")) + if len(path) > 1: + logger.error("An error occurred : More than one Doric csv file present at the location") + raise Exception("More than one Doric csv file present at the location") + else: + df = pd.read_csv(path[0], header=1, index_col=False) + df = df.dropna(axis=1, how="all") + df = df.dropna(axis=0, how="any") + df["Time(s)"] = df["Time(s)"] - df["Time(s)"].to_numpy()[0] + for i in range(storesList.shape[1]): + if "control" in storesList[1, i] or "signal" in storesList[1, i]: + timestamps = np.array(df["Time(s)"]) + sampling_rate = np.array([1 / (timestamps[-1] - timestamps[-2])]) + write_hdf5(sampling_rate, storesList[0, i], outputPath, "sampling_rate") + write_hdf5(df["Time(s)"].to_numpy(), storesList[0, i], outputPath, "timestamps") + write_hdf5(df[storesList[0, i]].to_numpy(), storesList[0, i], outputPath, "data") + else: + ttl = df[storesList[0, i]] + indices = np.where(ttl <= 0)[0] + diff_indices = np.where(np.diff(indices) > 1)[0] + write_hdf5( + df["Time(s)"][indices[diff_indices] + 1].to_numpy(), storesList[0, i], outputPath, "timestamps" + ) + + def read_doric_doric(self, filepath, storesList, outputPath): + path = glob.glob(os.path.join(filepath, "*.doric")) + if len(path) > 1: + logger.error("An error occurred : More than one Doric file present at the location") + raise Exception("More than one Doric file present at the location") + else: + with h5py.File(path[0], "r") as f: + if "Traces" in list(f.keys()): + keys = self.access_data_doricV1(f, storesList, outputPath) + elif list(f.keys()) == ["Configurations", "DataAcquisition"]: + keys = self.access_data_doricV6(f, storesList, outputPath) + + def access_data_doricV6(self, doric_file, storesList, outputPath): + data = [doric_file["DataAcquisition"]] + res = [] + while len(data) != 0: + members = len(data) + while members != 0: + members -= 1 + data, last_element = self.separate_last_element(data) + if isinstance(last_element, h5py.Dataset) and not last_element.name.endswith("/Time"): + res.append(last_element.name) + elif isinstance(last_element, h5py.Group): + data.extend(reversed([last_element[k] for k in last_element.keys()])) + + decide_path = [] + for element in res: + sep_values = element.split("/") + if sep_values[-1] == "Values": + if f"{sep_values[-3]}/{sep_values[-2]}" in storesList[0, :]: + decide_path.append(element) + else: + if f"{sep_values[-2]}/{sep_values[-1]}" in storesList[0, :]: + decide_path.append(element) + + for i in range(storesList.shape[1]): + if "control" in storesList[1, i] or "signal" in storesList[1, i]: + regex = re.compile("(.*?)" + str(storesList[0, i]) + "(.*?)") + idx = [i for i in range(len(decide_path)) if regex.match(decide_path[i])] + if len(idx) > 1: + logger.error("More than one string matched (which should not be the case)") + raise Exception("More than one string matched (which should not be the case)") + idx = idx[0] + data = np.array(doric_file[decide_path[idx]]) + timestamps = np.array(doric_file[decide_path[idx].rsplit("/", 1)[0] + "/Time"]) + sampling_rate = np.array([1 / (timestamps[-1] - timestamps[-2])]) + write_hdf5(sampling_rate, storesList[0, i], outputPath, "sampling_rate") + write_hdf5(timestamps, storesList[0, i], outputPath, "timestamps") + write_hdf5(data, storesList[0, i], outputPath, "data") + else: + regex = re.compile("(.*?)" + storesList[0, i] + "$") + idx = [i 
for i in range(len(decide_path)) if regex.match(decide_path[i])] + if len(idx) > 1: + logger.error("More than one string matched (which should not be the case)") + raise Exception("More than one string matched (which should not be the case)") + idx = idx[0] + ttl = np.array(doric_file[decide_path[idx]]) + timestamps = np.array(doric_file[decide_path[idx].rsplit("/", 1)[0] + "/Time"]) + indices = np.where(ttl <= 0)[0] + diff_indices = np.where(np.diff(indices) > 1)[0] + write_hdf5(timestamps[indices[diff_indices] + 1], storesList[0, i], outputPath, "timestamps") + + def access_data_doricV1(self, doric_file, storesList, outputPath): + keys = list(doric_file["Traces"]["Console"].keys()) + for i in range(storesList.shape[1]): + if "control" in storesList[1, i] or "signal" in storesList[1, i]: + timestamps = np.array(doric_file["Traces"]["Console"]["Time(s)"]["Console_time(s)"]) + sampling_rate = np.array([1 / (timestamps[-1] - timestamps[-2])]) + data = np.array(doric_file["Traces"]["Console"][storesList[0, i]][storesList[0, i]]) + write_hdf5(sampling_rate, storesList[0, i], outputPath, "sampling_rate") + write_hdf5(timestamps, storesList[0, i], outputPath, "timestamps") + write_hdf5(data, storesList[0, i], outputPath, "data") + else: + timestamps = np.array(doric_file["Traces"]["Console"]["Time(s)"]["Console_time(s)"]) + ttl = np.array(doric_file["Traces"]["Console"][storesList[0, i]][storesList[0, i]]) + indices = np.where(ttl <= 0)[0] + diff_indices = np.where(np.diff(indices) > 1)[0] + write_hdf5(timestamps[indices[diff_indices] + 1], storesList[0, i], outputPath, "timestamps") From cd966ae4acf2c07bcae716ed69b403758d7e819f Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 19 Nov 2025 14:34:52 -0800 Subject: [PATCH 026/150] streamlined inputs --- src/guppy/extractors/doric_recording_extractor.py | 2 +- src/guppy/readTevTsq.py | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/guppy/extractors/doric_recording_extractor.py b/src/guppy/extractors/doric_recording_extractor.py index cbade8b..e5a97cb 100644 --- a/src/guppy/extractors/doric_recording_extractor.py +++ b/src/guppy/extractors/doric_recording_extractor.py @@ -13,7 +13,7 @@ logger = logging.getLogger(__name__) -def execute_import_doric(folder_path, storesList, flag, outputPath): +def execute_import_doric(folder_path, storesList, outputPath): extractor = DoricRecordingExtractor(folder_path=folder_path) flag = extractor.check_doric(folder_path) diff --git a/src/guppy/readTevTsq.py b/src/guppy/readTevTsq.py index c67f075..c5c52da 100755 --- a/src/guppy/readTevTsq.py +++ b/src/guppy/readTevTsq.py @@ -67,14 +67,13 @@ def readRawData(inputParameters): 2, -1 ) + events = np.unique(storesList[0, :]) if modality == "tdt": - events = np.unique(storesList[0, :]) execute_readtev(filepath, events, op, numProcesses) - elif modality == "doric": - execute_import_doric(filepath, storesList, modality, op) + execute_import_doric(filepath, storesList, op) elif modality == "csv": - execute_import_csv(filepath, np.unique(storesList[0, :]), op, numProcesses) + execute_import_csv(filepath, events, op, numProcesses) elif modality == "npm": raise NotImplementedError("NPM modality is not yet implemented.") else: From ac158de53025dbe370238a0080c71f1dbf9fb9d1 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 19 Nov 2025 15:14:44 -0800 Subject: [PATCH 027/150] Added NpmRecordingExtractor for step 2 --- src/guppy/extractors/__init__.py | 1 + .../extractors/npm_recording_extractor.py | 429 ++++++++++++++++++ 
src/guppy/saveStoresList.py | 6 +- 3 files changed, 434 insertions(+), 2 deletions(-) create mode 100644 src/guppy/extractors/npm_recording_extractor.py diff --git a/src/guppy/extractors/__init__.py b/src/guppy/extractors/__init__.py index b3c2c3a..b876012 100644 --- a/src/guppy/extractors/__init__.py +++ b/src/guppy/extractors/__init__.py @@ -1,3 +1,4 @@ from .tdt_recording_extractor import TdtRecordingExtractor, execute_readtev from .csv_recording_extractor import CsvRecordingExtractor, execute_import_csv from .doric_recording_extractor import DoricRecordingExtractor, execute_import_doric +from .npm_recording_extractor import NpmRecordingExtractor, execute_import_npm diff --git a/src/guppy/extractors/npm_recording_extractor.py b/src/guppy/extractors/npm_recording_extractor.py new file mode 100644 index 0000000..c15987f --- /dev/null +++ b/src/guppy/extractors/npm_recording_extractor.py @@ -0,0 +1,429 @@ +import glob +import logging +import os +import tkinter as tk +from tkinter import StringVar, messagebox, ttk + +import numpy as np +import pandas as pd +import panel as pn + +pn.extension() + +logger = logging.getLogger(__name__) + + +def execute_import_npm(): + raise NotImplementedError("This function is a placeholder for execute_import_npm functionality.") + + +class NpmRecordingExtractor: + + def __init__(self, folder_path, num_ch, inputParameters=None): + self.folder_path = folder_path + self.num_ch = num_ch + self.inputParameters = inputParameters + self.events, self.flags = self.import_npm( + folder_path=folder_path, num_ch=num_ch, inputParameters=inputParameters + ) + + def import_npm(self, folder_path, num_ch, inputParameters=None): + + logger.debug("If it exists, importing NPM file based on the structure of file") + # Headless configuration (used to avoid any UI prompts when running tests) + headless = bool(os.environ.get("GUPPY_BASE_DIR")) + npm_timestamp_column_name = None + npm_time_unit = None + npm_split_events = None + if isinstance(inputParameters, dict): + npm_timestamp_column_name = inputParameters.get("npm_timestamp_column_name") + npm_time_unit = inputParameters.get("npm_time_unit", "seconds") + npm_split_events = inputParameters.get("npm_split_events", True) + path = sorted(glob.glob(os.path.join(folder_path, "*.csv"))) + sorted( + glob.glob(os.path.join(folder_path, "*.doric")) + ) + path_chev = glob.glob(os.path.join(folder_path, "*chev*")) + path_chod = glob.glob(os.path.join(folder_path, "*chod*")) + path_chpr = glob.glob(os.path.join(folder_path, "*chpr*")) + path_event = glob.glob(os.path.join(folder_path, "event*")) + # path_sig = glob.glob(os.path.join(filepath, 'sig*')) # TODO: what is this for? + path_chev_chod_event = path_chev + path_chod + path_event + path_chpr + + path = sorted(list(set(path) - set(path_chev_chod_event))) + flag = "None" + event_from_filename = [] + flag_arr = [] + for i in range(len(path)): + dirname = os.path.dirname(path[i]) + ext = os.path.basename(path[i]).split(".")[-1] + assert ext != "doric", "Doric files are not supported by import_npm function." + df = pd.read_csv(path[i], header=None, nrows=2, index_col=False, dtype=str) + df = df.dropna(axis=1, how="all") + df_arr = np.array(df).flatten() + check_all_str = [] + for element in df_arr: + try: + float(element) + except: + check_all_str.append(i) + assert len(check_all_str) != len( + df_arr + ), "This file appears to be doric .csv. This function only supports NPM .csv files." 
+ df = pd.read_csv(path[i], index_col=False) + _, value = self.check_header(df) + + # check dataframe structure and read data accordingly + if len(value) > 0: + columns_isstr = False + df = pd.read_csv(path[i], header=None) + cols = np.array(list(df.columns), dtype=str) + else: + df = df + columns_isstr = True + cols = np.array(list(df.columns), dtype=str) + # check the structure of dataframe and assign flag to the type of file + assert len(cols) != 1, "File appears to be event .csv. This function only supports NPM .csv files." + assert len(cols) != 3, "File appears to be data .csv. This function only supports NPM .csv files." + if len(cols) == 2: + flag = "event_or_data_np" + elif len(cols) >= 2: + flag = "data_np" + else: + logger.error("Number of columns in csv file does not make sense.") + raise Exception("Number of columns in csv file does not make sense.") + + if columns_isstr == True and ( + "flags" in np.char.lower(np.array(cols)) or "ledstate" in np.char.lower(np.array(cols)) + ): + flag = flag + "_v2" + else: + flag = flag + + # used assigned flags to process the files and read the data + if flag == "event_or_data_np": + arr = list(df.iloc[:, 1]) + check_float = [True for i in arr if isinstance(i, float)] + if len(arr) == len(check_float) and columns_isstr == False: + flag = "data_np" + elif columns_isstr == True and ("value" in np.char.lower(np.array(cols))): + flag = "event_np" + else: + flag = "event_np" + + flag_arr.append(flag) + logger.info(flag) + if flag == "data_np": + file = f"file{str(i)}_" + df, indices_dict, _ = self.decide_indices(file, df, flag, num_ch) + keys = list(indices_dict.keys()) + for k in range(len(keys)): + for j in range(df.shape[1]): + if j == 0: + timestamps = df.iloc[:, j][indices_dict[keys[k]]] + # timestamps_odd = df.iloc[:,j][odd_indices] + else: + d = dict() + d["timestamps"] = timestamps + d["data"] = df.iloc[:, j][indices_dict[keys[k]]] + + df_ch = pd.DataFrame(d) + df_ch.to_csv(os.path.join(dirname, keys[k] + str(j) + ".csv"), index=False) + event_from_filename.append(keys[k] + str(j)) + + elif flag == "event_np": + type_val = np.array(df.iloc[:, 1]) + type_val_unique = np.unique(type_val) + if headless: + response = 1 if bool(npm_split_events) else 0 + else: + window = tk.Tk() + if len(type_val_unique) > 1: + response = messagebox.askyesno( + "Multiple event TTLs", + "Based on the TTL file,\ + it looks like TTLs \ + belongs to multiple behavior type. 
\ + Do you want to create multiple files for each \ + behavior type ?", + ) + else: + response = 0 + window.destroy() + if response == 1: + timestamps = np.array(df.iloc[:, 0]) + for j in range(len(type_val_unique)): + idx = np.where(type_val == type_val_unique[j]) + d = dict() + d["timestamps"] = timestamps[idx] + df_new = pd.DataFrame(d) + df_new.to_csv(os.path.join(dirname, "event" + str(type_val_unique[j]) + ".csv"), index=False) + event_from_filename.append("event" + str(type_val_unique[j])) + else: + timestamps = np.array(df.iloc[:, 0]) + d = dict() + d["timestamps"] = timestamps + df_new = pd.DataFrame(d) + df_new.to_csv(os.path.join(dirname, "event" + str(0) + ".csv"), index=False) + event_from_filename.append("event" + str(0)) + else: + file = f"file{str(i)}_" + df, ts_unit = self.decide_ts_unit_for_npm( + df, timestamp_column_name=npm_timestamp_column_name, time_unit=npm_time_unit, headless=headless + ) + df, indices_dict, _ = self.decide_indices(file, df, flag) + keys = list(indices_dict.keys()) + for k in range(len(keys)): + for j in range(df.shape[1]): + if j == 0: + timestamps = df.iloc[:, j][indices_dict[keys[k]]] + # timestamps_odd = df.iloc[:,j][odd_indices] + else: + d = dict() + d["timestamps"] = timestamps + d["data"] = df.iloc[:, j][indices_dict[keys[k]]] + + df_ch = pd.DataFrame(d) + df_ch.to_csv(os.path.join(dirname, keys[k] + str(j) + ".csv"), index=False) + event_from_filename.append(keys[k] + str(j)) + + path_chev = glob.glob(os.path.join(folder_path, "*chev*")) + path_chod = glob.glob(os.path.join(folder_path, "*chod*")) + path_chpr = glob.glob(os.path.join(folder_path, "*chpr*")) + path_event = glob.glob(os.path.join(folder_path, "event*")) + # path_sig = glob.glob(os.path.join(filepath, 'sig*')) + path_chev_chod_chpr = [path_chev, path_chod, path_chpr] + if ( + ("data_np_v2" in flag_arr or "data_np" in flag_arr) + and ("event_np" in flag_arr) + and (i == len(path) - 1) + ) or ( + ("data_np_v2" in flag_arr or "data_np" in flag_arr) and (i == len(path) - 1) + ): # i==len(path)-1 and or 'event_np' in flag + num_path_chev, num_path_chod, num_path_chpr = len(path_chev), len(path_chod), len(path_chpr) + arr_len, no_ch = [], [] + for i in range(len(path_chev_chod_chpr)): + if len(path_chev_chod_chpr[i]) > 0: + arr_len.append(len(path_chev_chod_chpr[i])) + else: + continue + + unique_arr_len = np.unique(np.array(arr_len)) + if "data_np_v2" in flag_arr: + if ts_unit == "seconds": + divisor = 1 + elif ts_unit == "milliseconds": + divisor = 1e3 + else: + divisor = 1e6 + else: + divisor = 1000 + + for j in range(len(path_event)): + df_event = pd.read_csv(path_event[j]) + df_chev = pd.read_csv(path_chev[0]) + df_event["timestamps"] = (df_event["timestamps"] - df_chev["timestamps"][0]) / divisor + df_event.to_csv(path_event[j], index=False) + if unique_arr_len.shape[0] == 1: + for j in range(len(path_chev)): + if file + "chev" in indices_dict.keys(): + df_chev = pd.read_csv(path_chev[j]) + df_chev["timestamps"] = (df_chev["timestamps"] - df_chev["timestamps"][0]) / divisor + df_chev["sampling_rate"] = np.full(df_chev.shape[0], np.nan) + df_chev.at[0, "sampling_rate"] = df_chev.shape[0] / ( + df_chev["timestamps"].iloc[-1] - df_chev["timestamps"].iloc[0] + ) + df_chev.to_csv(path_chev[j], index=False) + + if file + "chod" in indices_dict.keys(): + df_chod = pd.read_csv(path_chod[j]) + df_chod["timestamps"] = df_chev["timestamps"] + df_chod["sampling_rate"] = np.full(df_chod.shape[0], np.nan) + df_chod.at[0, "sampling_rate"] = df_chev["sampling_rate"][0] + 
df_chod.to_csv(path_chod[j], index=False) + + if file + "chpr" in indices_dict.keys(): + df_chpr = pd.read_csv(path_chpr[j]) + df_chpr["timestamps"] = df_chev["timestamps"] + df_chpr["sampling_rate"] = np.full(df_chpr.shape[0], np.nan) + df_chpr.at[0, "sampling_rate"] = df_chev["sampling_rate"][0] + df_chpr.to_csv(path_chpr[j], index=False) + else: + logger.error("Number of channels should be same for all regions.") + raise Exception("Number of channels should be same for all regions.") + logger.info("Importing of NPM file is done.") + return event_from_filename, flag_arr + + def check_header(self, df): + arr = list(df.columns) + check_float = [] + for i in arr: + try: + check_float.append(float(i)) + except: + pass + + return arr, check_float + + # function to decide indices of interleaved channels + # in neurophotometrics data + def decide_indices(self, file, df, flag, num_ch=2): + ch_name = [file + "chev", file + "chod", file + "chpr"] + if len(ch_name) < num_ch: + logger.error( + "Number of channels parameters in Input Parameters GUI is more than 3. \ + Looks like there are more than 3 channels in the file. Reading of these files\ + are not supported. Reach out to us if you get this error message." + ) + raise Exception( + "Number of channels parameters in Input Parameters GUI is more than 3. \ + Looks like there are more than 3 channels in the file. Reading of these files\ + are not supported. Reach out to us if you get this error message." + ) + if flag == "data_np": + indices_dict = dict() + for i in range(num_ch): + indices_dict[ch_name[i]] = np.arange(i, df.shape[0], num_ch) + + else: + cols = np.array(list(df.columns)) + if "flags" in np.char.lower(np.array(cols)): + arr = ["FrameCounter", "Flags"] + state = np.array(df["Flags"]) + elif "ledstate" in np.char.lower(np.array(cols)): + arr = ["FrameCounter", "LedState"] + state = np.array(df["LedState"]) + else: + logger.error( + "File type shows Neurophotometrics newer version \ + data but column names does not have Flags or LedState" + ) + raise Exception( + "File type shows Neurophotometrics newer version \ + data but column names does not have Flags or LedState" + ) + + num_ch, ch = self.check_channels(state) + indices_dict = dict() + for i in range(num_ch): + first_occurrence = np.where(state == ch[i])[0] + indices_dict[ch_name[i]] = np.arange(first_occurrence[0], df.shape[0], num_ch) + + df = df.drop(arr, axis=1) + + return df, indices_dict, num_ch + + # check flag consistency in neurophotometrics data + def check_channels(self, state): + state = state.astype(int) + unique_state = np.unique(state[2:12]) + if unique_state.shape[0] > 3: + logger.error( + "Looks like there are more than 3 channels in the file. Reading of these files\ + are not supported. Reach out to us if you get this error message." + ) + raise Exception( + "Looks like there are more than 3 channels in the file. Reading of these files\ + are not supported. Reach out to us if you get this error message." 
+ ) + + return unique_state.shape[0], unique_state + + # function to decide NPM timestamps unit (seconds, ms or us) + def decide_ts_unit_for_npm(self, df, timestamp_column_name=None, time_unit=None, headless=False): + col_names = np.array(list(df.columns)) + col_names_ts = [""] + for name in col_names: + if "timestamp" in name.lower(): + col_names_ts.append(name) + + ts_unit = "seconds" + if len(col_names_ts) > 2: + # Headless path: auto-select column/unit without any UI + if headless: + if timestamp_column_name is not None: + assert ( + timestamp_column_name in col_names_ts + ), f"Provided timestamp_column_name '{timestamp_column_name}' not found in columns {col_names_ts[1:]}" + chosen = timestamp_column_name + else: + chosen = col_names_ts[1] + df.insert(1, "Timestamp", df[chosen]) + df = df.drop(col_names_ts[1:], axis=1) + valid_units = {"seconds", "milliseconds", "microseconds"} + ts_unit = time_unit if (isinstance(time_unit, str) and time_unit in valid_units) else "seconds" + return df, ts_unit + # def comboBoxSelected(event): + # logger.info(event.widget.get()) + + window = tk.Tk() + window.title("Select appropriate options for timestamps") + window.geometry("500x200") + holdComboboxValues = dict() + + timestamps_label = ttk.Label(window, text="Select which timestamps to use : ").grid( + row=0, column=1, pady=25, padx=25 + ) + holdComboboxValues["timestamps"] = StringVar() + timestamps_combo = ttk.Combobox(window, values=col_names_ts, textvariable=holdComboboxValues["timestamps"]) + timestamps_combo.grid(row=0, column=2, pady=25, padx=25) + timestamps_combo.current(0) + # timestamps_combo.bind("<>", comboBoxSelected) + + time_unit_label = ttk.Label(window, text="Select timestamps unit : ").grid( + row=1, column=1, pady=25, padx=25 + ) + holdComboboxValues["time_unit"] = StringVar() + time_unit_combo = ttk.Combobox( + window, + values=["", "seconds", "milliseconds", "microseconds"], + textvariable=holdComboboxValues["time_unit"], + ) + time_unit_combo.grid(row=1, column=2, pady=25, padx=25) + time_unit_combo.current(0) + # time_unit_combo.bind("<>", comboBoxSelected) + window.lift() + window.after(500, lambda: window.lift()) + window.mainloop() + + if holdComboboxValues["timestamps"].get(): + df.insert(1, "Timestamp", df[holdComboboxValues["timestamps"].get()]) + df = df.drop(col_names_ts[1:], axis=1) + else: + messagebox.showerror( + "All options not selected", + "All the options for timestamps \ + were not selected. Please select appropriate options", + ) + logger.error( + "All the options for timestamps \ + were not selected. Please select appropriate options" + ) + raise Exception( + "All the options for timestamps \ + were not selected. Please select appropriate options" + ) + if holdComboboxValues["time_unit"].get(): + if holdComboboxValues["time_unit"].get() == "seconds": + ts_unit = holdComboboxValues["time_unit"].get() + elif holdComboboxValues["time_unit"].get() == "milliseconds": + ts_unit = holdComboboxValues["time_unit"].get() + else: + ts_unit = holdComboboxValues["time_unit"].get() + else: + messagebox.showerror( + "All options not selected", + "All the options for timestamps \ + were not selected. Please select appropriate options", + ) + logger.error( + "All the options for timestamps \ + were not selected. Please select appropriate options" + ) + raise Exception( + "All the options for timestamps \ + were not selected. 
Please select appropriate options" + ) + else: + pass + + return df, ts_unit diff --git a/src/guppy/saveStoresList.py b/src/guppy/saveStoresList.py index baec41e..daf7457 100755 --- a/src/guppy/saveStoresList.py +++ b/src/guppy/saveStoresList.py @@ -20,9 +20,9 @@ from guppy.extractors import ( CsvRecordingExtractor, DoricRecordingExtractor, + NpmRecordingExtractor, TdtRecordingExtractor, ) -from guppy.npm_step2 import import_npm # hv.extension() pn.extension() @@ -603,7 +603,9 @@ def execute(inputParameters): elif modality == "npm": data = 0 - event_name, flag = import_npm(filepath, num_ch) + extractor = NpmRecordingExtractor(folder_path=filepath, num_ch=num_ch, inputParameters=inputParameters) + event_name = extractor.events + flag = extractor.flags else: raise ValueError("Modality not recognized. Please use 'tdt', 'csv', 'doric', or 'npm'.") From 6a470a1a9d11c8e1abd6a32de7eaf7390cad1472 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 19 Nov 2025 16:48:57 -0800 Subject: [PATCH 028/150] Added NpmRecordingExtractor for step 3 --- .../extractors/npm_recording_extractor.py | 65 ++++++++++++++++++- src/guppy/readTevTsq.py | 11 +++- 2 files changed, 71 insertions(+), 5 deletions(-) diff --git a/src/guppy/extractors/npm_recording_extractor.py b/src/guppy/extractors/npm_recording_extractor.py index c15987f..a8cfd98 100644 --- a/src/guppy/extractors/npm_recording_extractor.py +++ b/src/guppy/extractors/npm_recording_extractor.py @@ -1,20 +1,37 @@ import glob import logging +import multiprocessing as mp import os +import time import tkinter as tk +from itertools import repeat from tkinter import StringVar, messagebox, ttk import numpy as np import pandas as pd import panel as pn +from guppy.common_step3 import write_hdf5 + pn.extension() logger = logging.getLogger(__name__) -def execute_import_npm(): - raise NotImplementedError("This function is a placeholder for execute_import_npm functionality.") +def execute_import_npm(folder_path, num_ch, inputParameters, events, outputPath, numProcesses=mp.cpu_count()): + logger.info("Reading data for event {} ...".format(events)) + + extractor = NpmRecordingExtractor(folder_path=folder_path, num_ch=num_ch, inputParameters=inputParameters) + start = time.time() + with mp.Pool(numProcesses) as p: + p.starmap(read_npm_and_save_hdf5, zip(repeat(extractor), events, repeat(outputPath))) + logger.info("Time taken = {0:.5f}".format(time.time() - start)) + + +def read_npm_and_save_hdf5(extractor, event, outputPath): + df = extractor.read_npm(event=event) + extractor.save_to_hdf5(df=df, event=event, outputPath=outputPath) + logger.info("Data for event {} fetched and stored.".format(event)) class NpmRecordingExtractor: @@ -427,3 +444,47 @@ def decide_ts_unit_for_npm(self, df, timestamp_column_name=None, time_unit=None, pass return df, ts_unit + + def read_npm(self, event): + logger.debug("\033[1m" + "Trying to read data for {} from csv file.".format(event) + "\033[0m") + if not os.path.exists(os.path.join(self.folder_path, event + ".csv")): + logger.error("\033[1m" + "No csv file found for event {}".format(event) + "\033[0m") + raise Exception("\033[1m" + "No csv file found for event {}".format(event) + "\033[0m") + + df = pd.read_csv(os.path.join(self.folder_path, event + ".csv"), index_col=False) + return df + + def save_to_hdf5(self, df, event, outputPath): + key = list(df.columns) + + # TODO: clean up these if branches + if len(key) == 3: + arr1 = np.array(["timestamps", "data", "sampling_rate"]) + arr2 = np.char.lower(np.array(key)) + if (np.sort(arr1) == 
np.sort(arr2)).all() == False: + logger.error("\033[1m" + "Column names should be timestamps, data and sampling_rate" + "\033[0m") + raise Exception("\033[1m" + "Column names should be timestamps, data and sampling_rate" + "\033[0m") + + if len(key) == 1: + if key[0].lower() != "timestamps": + logger.error("\033[1m" + "Column names should be timestamps, data and sampling_rate" + "\033[0m") + raise Exception("\033[1m" + "Column name should be timestamps" + "\033[0m") + + if len(key) != 3 and len(key) != 1: + logger.error( + "\033[1m" + + "Number of columns in csv file should be either three or one. Three columns if \ + the file is for control or signal data or one column if the file is for event TTLs." + + "\033[0m" + ) + raise Exception( + "\033[1m" + + "Number of columns in csv file should be either three or one. Three columns if \ + the file is for control or signal data or one column if the file is for event TTLs." + + "\033[0m" + ) + + for i in range(len(key)): + write_hdf5(df[key[i]].dropna(), event, outputPath, key[i].lower()) + + logger.info("\033[1m" + "Reading data for {} from csv file is completed.".format(event) + "\033[0m") diff --git a/src/guppy/readTevTsq.py b/src/guppy/readTevTsq.py index c5c52da..f2c9419 100755 --- a/src/guppy/readTevTsq.py +++ b/src/guppy/readTevTsq.py @@ -7,8 +7,12 @@ import numpy as np -from guppy.doric_step3 import execute_import_doric -from guppy.extractors import execute_import_csv, execute_readtev +from guppy.extractors import ( + execute_import_csv, + execute_import_doric, + execute_import_npm, + execute_readtev, +) logger = logging.getLogger(__name__) @@ -35,6 +39,7 @@ def readRawData(inputParameters): folderNames = inputParameters["folderNames"] numProcesses = inputParameters["numberOfCores"] modality = inputParameters["modality"] + num_ch = inputParameters["noChannels"] storesListPath = [] if numProcesses == 0: numProcesses = mp.cpu_count() @@ -75,7 +80,7 @@ def readRawData(inputParameters): elif modality == "csv": execute_import_csv(filepath, events, op, numProcesses) elif modality == "npm": - raise NotImplementedError("NPM modality is not yet implemented.") + execute_import_npm(filepath, num_ch, inputParameters, events, op, numProcesses) else: raise ValueError("Modality not recognized. Please use 'tdt', 'csv', 'doric', or 'npm'.") From 9b88cad73cbf64fa7648b4210682aeefff9d2782 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Thu, 20 Nov 2025 16:45:18 -0800 Subject: [PATCH 029/150] Add a tdt_check_data example session to the tests. --- tests/test_step2.py | 3 ++- tests/test_step3.py | 3 ++- tests/test_step4.py | 3 ++- tests/test_step5.py | 3 ++- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/tests/test_step2.py b/tests/test_step2.py index 01d32e2..b34fe64 100644 --- a/tests/test_step2.py +++ b/tests/test_step2.py @@ -87,8 +87,9 @@ { "405R": "control_region", "490R": "signal_region", - "Tick": "ttl", + "PAB/": "ttl", }, + "tdt", ), # TODO: Add sampleData_NPM_1 after fixing Doric vs. NPM determination bug. 
( diff --git a/tests/test_step3.py b/tests/test_step3.py index cfe2294..330d017 100644 --- a/tests/test_step3.py +++ b/tests/test_step3.py @@ -88,8 +88,9 @@ def storenames_map(): { "405R": "control_region", "490R": "signal_region", - "Tick": "ttl", + "PAB/": "ttl", }, + "tdt", ), ( "SampleData_with_artifacts/Photo_048_392-200728-121222", diff --git a/tests/test_step4.py b/tests/test_step4.py index d691d06..cdaf0ec 100644 --- a/tests/test_step4.py +++ b/tests/test_step4.py @@ -92,10 +92,11 @@ { "405R": "control_region", "490R": "signal_region", - "Tick": "ttl", + "PAB/": "ttl", }, "region", "ttl", + "tdt", ), ( "SampleData_with_artifacts/Photo_048_392-200728-121222", diff --git a/tests/test_step5.py b/tests/test_step5.py index ddd6935..4bed772 100644 --- a/tests/test_step5.py +++ b/tests/test_step5.py @@ -92,10 +92,11 @@ { "405R": "control_region", "490R": "signal_region", - "Tick": "ttl", + "PAB/": "ttl", }, "region", "ttl", + "tdt", ), ( "SampleData_with_artifacts/Photo_048_392-200728-121222", From 73e6a1c3586ec361155bde7cec610729412d7041 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 21 Nov 2025 09:28:59 -0800 Subject: [PATCH 030/150] Added event-splitting to tdt --- .../extractors/tdt_recording_extractor.py | 49 +++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/src/guppy/extractors/tdt_recording_extractor.py b/src/guppy/extractors/tdt_recording_extractor.py index 1d46b1e..2cc2f15 100644 --- a/src/guppy/extractors/tdt_recording_extractor.py +++ b/src/guppy/extractors/tdt_recording_extractor.py @@ -26,6 +26,8 @@ def execute_readtev(folder_path, events, outputPath, numProcesses=mp.cpu_count() def read_tdt_and_save_hdf5(extractor, event, outputPath): S = extractor.readtev(event=event) extractor.save_dict_to_hdf5(S=S, event=event, outputPath=outputPath) + if extractor.event_needs_splitting(data=S["data"], sampling_rate=S["sampling_rate"]): + extractor.split_event_data(S, event, outputPath) logger.info("Data for event {} fetched and stored.".format(event)) @@ -148,6 +150,53 @@ def ismember(self, arr, element): # TODO: replace this function with more stand res = [1 if i == element else 0 for i in arr] return np.asarray(res) + # TODO: this is broken, and I need to fix it. + def event_needs_splitting(self, data, sampling_rate): + diff = np.diff(data) + if diff.shape[0] == 0: + return False + if sampling_rate == 0 and not (np.all(diff == diff[0])): + return True + return False + + def split_event_data(self, S, event, outputPath): + event = event.replace("\\", "") + event = event.replace("/", "") + logger.info("Checking event storename data for creating multiple event names from single event storename...") + storesList = np.genfromtxt(os.path.join(outputPath, "storesList.csv"), dtype="str", delimiter=",").reshape( + 2, -1 + ) + logger.info("\033[1m" + "Data in event {} belongs to multiple behavior".format(event) + "\033[0m") + logger.debug( + "\033[1m" + "Create timestamp files for individual new event and change the stores list file." 
+ "\033[0m" + ) + i_d = np.unique(S["data"]) + for i in range(i_d.shape[0]): + new_S = dict() + idx = np.where(S["data"] == i_d[i])[0] + new_S["timestamps"] = S["timestamps"][idx] + new_S["storename"] = event + str(int(i_d[i])) + new_S["sampling_rate"] = S["sampling_rate"] + new_S["data"] = S["data"] + new_S["npoints"] = S["npoints"] + new_S["channels"] = S["channels"] + storesList = np.concatenate( + (storesList, [[event + str(int(i_d[i]))], [event + "_" + str(int(i_d[i]))]]), axis=1 + ) + self.save_dict_to_hdf5(new_S, event + str(int(i_d[i])), outputPath) + + idx = np.where(storesList[0] == event)[0] + storesList = np.delete(storesList, idx, axis=1) + if not os.path.exists(os.path.join(outputPath, ".cache_storesList.csv")): + os.rename(os.path.join(outputPath, "storesList.csv"), os.path.join(outputPath, ".cache_storesList.csv")) + if idx.shape[0] == 0: + pass + else: + np.savetxt(os.path.join(outputPath, "storesList.csv"), storesList, delimiter=",", fmt="%s") + logger.info( + "\033[1m Timestamp files for individual new event are created and the stores list file is changed.\033[0m" + ) + # function to save data read from tev file to hdf5 file def save_dict_to_hdf5(self, S, event, outputPath): write_hdf5(S["storename"], event, outputPath, "storename") From a036090c79e166a6d454e3997d2b984867e4d469 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 21 Nov 2025 10:12:05 -0800 Subject: [PATCH 031/150] Fixed event vs. new event bug. --- src/guppy/extractors/tdt_recording_extractor.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/guppy/extractors/tdt_recording_extractor.py b/src/guppy/extractors/tdt_recording_extractor.py index 2cc2f15..527235f 100644 --- a/src/guppy/extractors/tdt_recording_extractor.py +++ b/src/guppy/extractors/tdt_recording_extractor.py @@ -160,8 +160,9 @@ def event_needs_splitting(self, data, sampling_rate): return False def split_event_data(self, S, event, outputPath): - event = event.replace("\\", "") - event = event.replace("/", "") + # Note that new_event is only used for the new storesList and event is still used for the old storesList + new_event = event.replace("\\", "") + new_event = event.replace("/", "") logger.info("Checking event storename data for creating multiple event names from single event storename...") storesList = np.genfromtxt(os.path.join(outputPath, "storesList.csv"), dtype="str", delimiter=",").reshape( 2, -1 @@ -175,15 +176,15 @@ def split_event_data(self, S, event, outputPath): new_S = dict() idx = np.where(S["data"] == i_d[i])[0] new_S["timestamps"] = S["timestamps"][idx] - new_S["storename"] = event + str(int(i_d[i])) + new_S["storename"] = new_event + str(int(i_d[i])) new_S["sampling_rate"] = S["sampling_rate"] new_S["data"] = S["data"] new_S["npoints"] = S["npoints"] new_S["channels"] = S["channels"] storesList = np.concatenate( - (storesList, [[event + str(int(i_d[i]))], [event + "_" + str(int(i_d[i]))]]), axis=1 + (storesList, [[new_event + str(int(i_d[i]))], [new_event + "_" + str(int(i_d[i]))]]), axis=1 ) - self.save_dict_to_hdf5(new_S, event + str(int(i_d[i])), outputPath) + self.save_dict_to_hdf5(new_S, new_event + str(int(i_d[i])), outputPath) idx = np.where(storesList[0] == event)[0] storesList = np.delete(storesList, idx, axis=1) From 7ecdf7809454bd5aee9b9b3a3a9164437784edd1 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 21 Nov 2025 10:12:54 -0800 Subject: [PATCH 032/150] Fixed event vs. new event bug. 
--- src/guppy/extractors/tdt_recording_extractor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/guppy/extractors/tdt_recording_extractor.py b/src/guppy/extractors/tdt_recording_extractor.py index 527235f..71c8d29 100644 --- a/src/guppy/extractors/tdt_recording_extractor.py +++ b/src/guppy/extractors/tdt_recording_extractor.py @@ -150,7 +150,6 @@ def ismember(self, arr, element): # TODO: replace this function with more stand res = [1 if i == element else 0 for i in arr] return np.asarray(res) - # TODO: this is broken, and I need to fix it. def event_needs_splitting(self, data, sampling_rate): diff = np.diff(data) if diff.shape[0] == 0: From b87e79ff4409d889fcdb4536d328f4189043aec8 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 21 Nov 2025 12:07:35 -0800 Subject: [PATCH 033/150] Refactored save_dict_to_hdf5 to compute event from S. --- .../extractors/tdt_recording_extractor.py | 67 +++++-------------- 1 file changed, 15 insertions(+), 52 deletions(-) diff --git a/src/guppy/extractors/tdt_recording_extractor.py b/src/guppy/extractors/tdt_recording_extractor.py index 71c8d29..530ccc5 100644 --- a/src/guppy/extractors/tdt_recording_extractor.py +++ b/src/guppy/extractors/tdt_recording_extractor.py @@ -25,7 +25,7 @@ def execute_readtev(folder_path, events, outputPath, numProcesses=mp.cpu_count() def read_tdt_and_save_hdf5(extractor, event, outputPath): S = extractor.readtev(event=event) - extractor.save_dict_to_hdf5(S=S, event=event, outputPath=outputPath) + extractor.save_dict_to_hdf5(S=S, outputPath=outputPath) if extractor.event_needs_splitting(data=S["data"], sampling_rate=S["sampling_rate"]): extractor.split_event_data(S, event, outputPath) logger.info("Data for event {} fetched and stored.".format(event)) @@ -145,6 +145,17 @@ def readtev(self, event): return S + def read(self, events): + output_dicts = [] + for event in events: + S = self.readtev(event=event) + if self.event_needs_splitting(data=S["data"], sampling_rate=S["sampling_rate"]): + event_dicts = self.split_event_data(S, event, None) + else: + event_dicts = [S] + output_dicts.extend(event_dicts) + return output_dicts + # check if a particular element is there in an array or not def ismember(self, arr, element): # TODO: replace this function with more standard usage res = [1 if i == element else 0 for i in arr] @@ -183,7 +194,7 @@ def split_event_data(self, S, event, outputPath): storesList = np.concatenate( (storesList, [[new_event + str(int(i_d[i]))], [new_event + "_" + str(int(i_d[i]))]]), axis=1 ) - self.save_dict_to_hdf5(new_S, new_event + str(int(i_d[i])), outputPath) + self.save_dict_to_hdf5(new_S, outputPath) idx = np.where(storesList[0] == event)[0] storesList = np.delete(storesList, idx, axis=1) @@ -198,7 +209,8 @@ def split_event_data(self, S, event, outputPath): ) # function to save data read from tev file to hdf5 file - def save_dict_to_hdf5(self, S, event, outputPath): + def save_dict_to_hdf5(self, S, outputPath): + event = S["storename"] write_hdf5(S["storename"], event, outputPath, "storename") write_hdf5(S["sampling_rate"], event, outputPath, "sampling_rate") write_hdf5(S["timestamps"], event, outputPath, "timestamps") @@ -206,52 +218,3 @@ def save_dict_to_hdf5(self, S, event, outputPath): write_hdf5(S["data"], event, outputPath, "data") write_hdf5(S["npoints"], event, outputPath, "npoints") write_hdf5(S["channels"], event, outputPath, "channels") - - # function to check event data (checking whether event timestamps belongs to same event or multiple events) - def check_data(self, S, event, 
outputPath): # TODO: fold this function into the main read/get function - # logger.info("Checking event storename data for creating multiple event names from single event storename...") - new_event = event.replace("\\", "") - new_event = event.replace("/", "") - diff = np.diff(S["data"]) - arr = np.full(diff.shape[0], 1) - - storesList = np.genfromtxt(os.path.join(outputPath, "storesList.csv"), dtype="str", delimiter=",").reshape( - 2, -1 - ) - - if diff.shape[0] == 0: - return 0 - - if S["sampling_rate"] == 0 and np.all(diff == diff[0]) == False: - logger.info("\033[1m" + "Data in event {} belongs to multiple behavior".format(event) + "\033[0m") - logger.debug( - "\033[1m" - + "Create timestamp files for individual new event and change the stores list file." - + "\033[0m" - ) - i_d = np.unique(S["data"]) - for i in range(i_d.shape[0]): - new_S = dict() - idx = np.where(S["data"] == i_d[i])[0] - new_S["timestamps"] = S["timestamps"][idx] - new_S["storename"] = new_event + str(int(i_d[i])) - new_S["sampling_rate"] = S["sampling_rate"] - new_S["data"] = S["data"] - new_S["npoints"] = S["npoints"] - new_S["channels"] = S["channels"] - storesList = np.concatenate( - (storesList, [[new_event + str(int(i_d[i]))], [new_event + "_" + str(int(i_d[i]))]]), axis=1 - ) - self.save_dict_to_hdf5(new_S, new_event + str(int(i_d[i])), outputPath) - - idx = np.where(storesList[0] == event)[0] - storesList = np.delete(storesList, idx, axis=1) - if not os.path.exists(os.path.join(outputPath, ".cache_storesList.csv")): - os.rename(os.path.join(outputPath, "storesList.csv"), os.path.join(outputPath, ".cache_storesList.csv")) - if idx.shape[0] == 0: - pass - else: - np.savetxt(os.path.join(outputPath, "storesList.csv"), storesList, delimiter=",", fmt="%s") - logger.info( - "\033[1m Timestamp files for individual new event are created and the stores list file is changed.\033[0m" - ) From 11922663bd537b4c6dddf5f460ddb959ff1cc993 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 21 Nov 2025 12:27:33 -0800 Subject: [PATCH 034/150] Peeled split_event_storesList from split_event_data. 
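With storesList handling peeled off, reading and saving can be driven as two separate calls. A minimal usage sketch of the code as of this patch (the paths are placeholders, the event names are taken from the sample session used in the tests, and the output folder is assumed to already hold a storesList.csv from Step 2, since split_event_storesList() reads and rewrites that file as a side effect):

    from guppy.extractors.tdt_recording_extractor import TdtRecordingExtractor

    folder_path = "path/to/tdt_session"    # placeholder: folder containing the .tsq/.tev files
    output_path = "path/to/output_folder"  # placeholder: must already contain storesList.csv

    extractor = TdtRecordingExtractor(folder_path=folder_path)
    # read() returns one plain dict per store, plus any split sub-events;
    # storesList.csv in output_path is rewritten when an event gets split.
    output_dicts = extractor.read(events=["405R", "490R", "PAB/"], outputPath=output_path)
    for S in output_dicts:
        extractor.save_dict_to_hdf5(S=S, outputPath=output_path)

This mirrors what execute_readtev() does in this patch; later patches wrap the same read/save pair back into the multiprocessing pool.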
--- .../extractors/tdt_recording_extractor.py | 65 +++++++++++++------ 1 file changed, 46 insertions(+), 19 deletions(-) diff --git a/src/guppy/extractors/tdt_recording_extractor.py b/src/guppy/extractors/tdt_recording_extractor.py index 530ccc5..0659d3a 100644 --- a/src/guppy/extractors/tdt_recording_extractor.py +++ b/src/guppy/extractors/tdt_recording_extractor.py @@ -3,7 +3,6 @@ import multiprocessing as mp import os import time -from itertools import repeat import numpy as np import pandas as pd @@ -14,23 +13,32 @@ logger = logging.getLogger(__name__) -# function to execute readtev function using multiprocessing to make it faster +# # function to execute readtev function using multiprocessing to make it faster +# def execute_readtev(folder_path, events, outputPath, numProcesses=mp.cpu_count()): +# extractor = TdtRecordingExtractor(folder_path=folder_path) +# start = time.time() +# with mp.Pool(numProcesses) as p: +# p.starmap(read_tdt_and_save_hdf5, zip(repeat(extractor), events, repeat(outputPath))) +# logger.info("Time taken = {0:.5f}".format(time.time() - start)) + + +# def read_tdt_and_save_hdf5(extractor, event, outputPath): +# S = extractor.readtev(event=event) +# extractor.save_dict_to_hdf5(S=S, outputPath=outputPath) +# if extractor.event_needs_splitting(data=S["data"], sampling_rate=S["sampling_rate"]): +# extractor.split_event_data(S, event, outputPath) +# logger.info("Data for event {} fetched and stored.".format(event)) + + def execute_readtev(folder_path, events, outputPath, numProcesses=mp.cpu_count()): extractor = TdtRecordingExtractor(folder_path=folder_path) start = time.time() - with mp.Pool(numProcesses) as p: - p.starmap(read_tdt_and_save_hdf5, zip(repeat(extractor), events, repeat(outputPath))) + output_dicts = extractor.read(events=events, outputPath=outputPath) + for S in output_dicts: + extractor.save_dict_to_hdf5(S=S, outputPath=outputPath) logger.info("Time taken = {0:.5f}".format(time.time() - start)) -def read_tdt_and_save_hdf5(extractor, event, outputPath): - S = extractor.readtev(event=event) - extractor.save_dict_to_hdf5(S=S, outputPath=outputPath) - if extractor.event_needs_splitting(data=S["data"], sampling_rate=S["sampling_rate"]): - extractor.split_event_data(S, event, outputPath) - logger.info("Data for event {} fetched and stored.".format(event)) - - class TdtRecordingExtractor: def __init__(self, folder_path): @@ -145,12 +153,13 @@ def readtev(self, event): return S - def read(self, events): + def read(self, events, outputPath): output_dicts = [] for event in events: S = self.readtev(event=event) if self.event_needs_splitting(data=S["data"], sampling_rate=S["sampling_rate"]): - event_dicts = self.split_event_data(S, event, None) + event_dicts = self.split_event_data(S, event) + self.split_event_storesList(S, event, outputPath) else: event_dicts = [S] output_dicts.extend(event_dicts) @@ -169,19 +178,17 @@ def event_needs_splitting(self, data, sampling_rate): return True return False - def split_event_data(self, S, event, outputPath): + def split_event_data(self, S, event): # Note that new_event is only used for the new storesList and event is still used for the old storesList new_event = event.replace("\\", "") new_event = event.replace("/", "") logger.info("Checking event storename data for creating multiple event names from single event storename...") - storesList = np.genfromtxt(os.path.join(outputPath, "storesList.csv"), dtype="str", delimiter=",").reshape( - 2, -1 - ) logger.info("\033[1m" + "Data in event {} belongs to multiple 
behavior".format(event) + "\033[0m") logger.debug( "\033[1m" + "Create timestamp files for individual new event and change the stores list file." + "\033[0m" ) i_d = np.unique(S["data"]) + event_dicts = [S] for i in range(i_d.shape[0]): new_S = dict() idx = np.where(S["data"] == i_d[i])[0] @@ -191,10 +198,30 @@ def split_event_data(self, S, event, outputPath): new_S["data"] = S["data"] new_S["npoints"] = S["npoints"] new_S["channels"] = S["channels"] + event_dicts.append(new_S) + logger.info( + "\033[1m Timestamp files for individual new event are created and the stores list file is changed.\033[0m" + ) + + return event_dicts + + def split_event_storesList(self, S, event, outputPath): + # Note that new_event is only used for the new storesList and event is still used for the old storesList + new_event = event.replace("\\", "") + new_event = event.replace("/", "") + logger.info("Checking event storename data for creating multiple event names from single event storename...") + storesList = np.genfromtxt(os.path.join(outputPath, "storesList.csv"), dtype="str", delimiter=",").reshape( + 2, -1 + ) + logger.info("\033[1m" + "Data in event {} belongs to multiple behavior".format(event) + "\033[0m") + logger.debug( + "\033[1m" + "Create timestamp files for individual new event and change the stores list file." + "\033[0m" + ) + i_d = np.unique(S["data"]) + for i in range(i_d.shape[0]): storesList = np.concatenate( (storesList, [[new_event + str(int(i_d[i]))], [new_event + "_" + str(int(i_d[i]))]]), axis=1 ) - self.save_dict_to_hdf5(new_S, outputPath) idx = np.where(storesList[0] == event)[0] storesList = np.delete(storesList, idx, axis=1) From 9231f5fc01192b810eb82d216bee819a06bc934e Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 21 Nov 2025 12:31:49 -0800 Subject: [PATCH 035/150] updated logging. --- .../extractors/tdt_recording_extractor.py | 21 ++++++------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/src/guppy/extractors/tdt_recording_extractor.py b/src/guppy/extractors/tdt_recording_extractor.py index 0659d3a..4743185 100644 --- a/src/guppy/extractors/tdt_recording_extractor.py +++ b/src/guppy/extractors/tdt_recording_extractor.py @@ -171,6 +171,7 @@ def ismember(self, arr, element): # TODO: replace this function with more stand return np.asarray(res) def event_needs_splitting(self, data, sampling_rate): + logger.info("Checking event storename data for creating multiple event names from single event storename...") diff = np.diff(data) if diff.shape[0] == 0: return False @@ -182,11 +183,8 @@ def split_event_data(self, S, event): # Note that new_event is only used for the new storesList and event is still used for the old storesList new_event = event.replace("\\", "") new_event = event.replace("/", "") - logger.info("Checking event storename data for creating multiple event names from single event storename...") logger.info("\033[1m" + "Data in event {} belongs to multiple behavior".format(event) + "\033[0m") - logger.debug( - "\033[1m" + "Create timestamp files for individual new event and change the stores list file." + "\033[0m" - ) + logger.debug("\033[1m" + "Create timestamp files for individual new event." 
+ "\033[0m") i_d = np.unique(S["data"]) event_dicts = [S] for i in range(i_d.shape[0]): @@ -199,9 +197,7 @@ def split_event_data(self, S, event): new_S["npoints"] = S["npoints"] new_S["channels"] = S["channels"] event_dicts.append(new_S) - logger.info( - "\033[1m Timestamp files for individual new event are created and the stores list file is changed.\033[0m" - ) + logger.info("\033[1m Timestamp files for individual new event are created.\033[0m") return event_dicts @@ -209,14 +205,11 @@ def split_event_storesList(self, S, event, outputPath): # Note that new_event is only used for the new storesList and event is still used for the old storesList new_event = event.replace("\\", "") new_event = event.replace("/", "") - logger.info("Checking event storename data for creating multiple event names from single event storename...") storesList = np.genfromtxt(os.path.join(outputPath, "storesList.csv"), dtype="str", delimiter=",").reshape( 2, -1 ) - logger.info("\033[1m" + "Data in event {} belongs to multiple behavior".format(event) + "\033[0m") - logger.debug( - "\033[1m" + "Create timestamp files for individual new event and change the stores list file." + "\033[0m" - ) + logger.info("\033[1m" + "StoresList in event {} belongs to multiple behavior".format(event) + "\033[0m") + logger.debug("\033[1m" + "Change the stores list file for individual new event." + "\033[0m") i_d = np.unique(S["data"]) for i in range(i_d.shape[0]): storesList = np.concatenate( @@ -231,9 +224,7 @@ def split_event_storesList(self, S, event, outputPath): pass else: np.savetxt(os.path.join(outputPath, "storesList.csv"), storesList, delimiter=",", fmt="%s") - logger.info( - "\033[1m Timestamp files for individual new event are created and the stores list file is changed.\033[0m" - ) + logger.info("\033[1m The stores list file is changed.\033[0m") # function to save data read from tev file to hdf5 file def save_dict_to_hdf5(self, S, outputPath): From ddf6ae5a34effe3e835e5107b18242307dcaa42c Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 21 Nov 2025 12:37:10 -0800 Subject: [PATCH 036/150] Added high-level save --- src/guppy/extractors/tdt_recording_extractor.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/guppy/extractors/tdt_recording_extractor.py b/src/guppy/extractors/tdt_recording_extractor.py index 4743185..a503bb2 100644 --- a/src/guppy/extractors/tdt_recording_extractor.py +++ b/src/guppy/extractors/tdt_recording_extractor.py @@ -34,8 +34,7 @@ def execute_readtev(folder_path, events, outputPath, numProcesses=mp.cpu_count() extractor = TdtRecordingExtractor(folder_path=folder_path) start = time.time() output_dicts = extractor.read(events=events, outputPath=outputPath) - for S in output_dicts: - extractor.save_dict_to_hdf5(S=S, outputPath=outputPath) + extractor.save(output_dicts=output_dicts, outputPath=outputPath) logger.info("Time taken = {0:.5f}".format(time.time() - start)) @@ -236,3 +235,7 @@ def save_dict_to_hdf5(self, S, outputPath): write_hdf5(S["data"], event, outputPath, "data") write_hdf5(S["npoints"], event, outputPath, "npoints") write_hdf5(S["channels"], event, outputPath, "channels") + + def save(self, output_dicts, outputPath): + for S in output_dicts: + self.save_dict_to_hdf5(S=S, outputPath=outputPath) From 212c7c5a7cf3f22e84804d77762978493d06aa5c Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 21 Nov 2025 12:40:03 -0800 Subject: [PATCH 037/150] Added TODO --- src/guppy/extractors/tdt_recording_extractor.py | 3 +++ 1 file changed, 3 insertions(+) diff 
--git a/src/guppy/extractors/tdt_recording_extractor.py b/src/guppy/extractors/tdt_recording_extractor.py index a503bb2..b5dc670 100644 --- a/src/guppy/extractors/tdt_recording_extractor.py +++ b/src/guppy/extractors/tdt_recording_extractor.py @@ -200,6 +200,9 @@ def split_event_data(self, S, event): return event_dicts + # This function saves a new storesList.csv file, which is a bit of a side effect in the overall read path, + # which is supposed to just return a list of dictionaries. + # TODO: long term I'd like to move these storesList shenanigans somewhere else, likely outside of the extractor. def split_event_storesList(self, S, event, outputPath): # Note that new_event is only used for the new storesList and event is still used for the old storesList new_event = event.replace("\\", "") From 33682d26b074ac9f44bc8fd64f9c9bcae5171656 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 21 Nov 2025 12:55:18 -0800 Subject: [PATCH 038/150] Added multi-processing back in. --- .../extractors/tdt_recording_extractor.py | 23 +++++-------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/src/guppy/extractors/tdt_recording_extractor.py b/src/guppy/extractors/tdt_recording_extractor.py index b5dc670..58cde99 100644 --- a/src/guppy/extractors/tdt_recording_extractor.py +++ b/src/guppy/extractors/tdt_recording_extractor.py @@ -3,6 +3,7 @@ import multiprocessing as mp import os import time +from itertools import repeat import numpy as np import pandas as pd @@ -13,28 +14,16 @@ logger = logging.getLogger(__name__) -# # function to execute readtev function using multiprocessing to make it faster -# def execute_readtev(folder_path, events, outputPath, numProcesses=mp.cpu_count()): -# extractor = TdtRecordingExtractor(folder_path=folder_path) -# start = time.time() -# with mp.Pool(numProcesses) as p: -# p.starmap(read_tdt_and_save_hdf5, zip(repeat(extractor), events, repeat(outputPath))) -# logger.info("Time taken = {0:.5f}".format(time.time() - start)) - - -# def read_tdt_and_save_hdf5(extractor, event, outputPath): -# S = extractor.readtev(event=event) -# extractor.save_dict_to_hdf5(S=S, outputPath=outputPath) -# if extractor.event_needs_splitting(data=S["data"], sampling_rate=S["sampling_rate"]): -# extractor.split_event_data(S, event, outputPath) -# logger.info("Data for event {} fetched and stored.".format(event)) +def read_and_save_tdt(extractor, event, outputPath): + output_dicts = extractor.read(events=[event], outputPath=outputPath) + extractor.save(output_dicts=output_dicts, outputPath=outputPath) def execute_readtev(folder_path, events, outputPath, numProcesses=mp.cpu_count()): extractor = TdtRecordingExtractor(folder_path=folder_path) start = time.time() - output_dicts = extractor.read(events=events, outputPath=outputPath) - extractor.save(output_dicts=output_dicts, outputPath=outputPath) + with mp.Pool(numProcesses) as p: + p.starmap(read_and_save_tdt, zip(repeat(extractor), events, repeat(outputPath))) logger.info("Time taken = {0:.5f}".format(time.time() - start)) From f84c550bb181fa51a53587fd4374266746c6c88e Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 21 Nov 2025 14:47:45 -0800 Subject: [PATCH 039/150] Fixed test_step5.py for tdt_check_data --- tests/test_step5.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/test_step5.py b/tests/test_step5.py index 4bed772..870fb7c 100644 --- a/tests/test_step5.py +++ b/tests/test_step5.py @@ -95,7 +95,7 @@ "PAB/": "ttl", }, "region", - "ttl", + ["PAB_0", "PAB_16", "PAB_2064"], # This 
session has an event which gets split into three sub-events. "tdt", ), ( @@ -278,7 +278,13 @@ def test_step5(tmp_path, monkeypatch, session_subdir, storenames_map, expected_r assert os.path.exists(stores_fp), "Missing storesList.csv after Steps 2-5" # Expected PSTH outputs (defaults compute z_score PSTH) - only for datasets with TTLs - if expected_ttl is not None: + if expected_ttl is None: + expected_ttls = [] + elif isinstance(expected_ttl, str): + expected_ttls = [expected_ttl] + else: + expected_ttls = expected_ttl + for expected_ttl in expected_ttls: psth_h5 = os.path.join(out_dir, f"{expected_ttl}_{expected_region}_z_score_{expected_region}.h5") psth_baseline_uncorr_h5 = os.path.join( out_dir, f"{expected_ttl}_{expected_region}_baselineUncorrected_z_score_{expected_region}.h5" From c55a230bd8034a608d5e7cbd259bed5d20a4b282 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 21 Nov 2025 14:50:04 -0800 Subject: [PATCH 040/150] Fixed test_step4.py for tdt_check_data --- tests/test_step4.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/test_step4.py b/tests/test_step4.py index cdaf0ec..109e7da 100644 --- a/tests/test_step4.py +++ b/tests/test_step4.py @@ -95,7 +95,7 @@ "PAB/": "ttl", }, "region", - "ttl", + ["PAB_0", "PAB_16", "PAB_2064"], # This session has an event which gets split into three sub-events. "tdt", ), ( @@ -272,7 +272,13 @@ def test_step4(tmp_path, monkeypatch, session_subdir, storenames_map, expected_r assert "timestampNew" in f, f"Expected 'timestampNew' dataset in {timecorr}" # If TTLs exist, check their per-region 'ts' outputs - if expected_ttl is not None: + if expected_ttl is None: + expected_ttls = [] + elif isinstance(expected_ttl, str): + expected_ttls = [expected_ttl] + else: + expected_ttls = expected_ttl + for expected_ttl in expected_ttls: ttl_fp = os.path.join(out_dir, f"{expected_ttl}_{expected_region}.hdf5") assert os.path.exists(ttl_fp), f"Missing TTL-aligned file {ttl_fp}" with h5py.File(ttl_fp, "r") as f: From 03ffd54c7c61d449cab7ac077f3bf1e746e206fb Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 21 Nov 2025 15:00:59 -0800 Subject: [PATCH 041/150] Renamed test_case from tdt_check_data to tdt_split_event. 
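The new id reflects what the case actually exercises: the "PAB/" store is split into the PAB_0 / PAB_16 / PAB_2064 sub-events, and the step 4/5 tests now accept expected_ttl as None, a single string, or a list. For reference, the same normalization the tests inline, written as a small standalone helper (the helper name is invented for illustration):

    def as_ttl_list(expected_ttl):
        # None -> no TTL outputs, str -> single TTL, list -> already-split sub-events.
        if expected_ttl is None:
            return []
        if isinstance(expected_ttl, str):
            return [expected_ttl]
        return list(expected_ttl)

    assert as_ttl_list(None) == []
    assert as_ttl_list("ttl") == ["ttl"]
    assert as_ttl_list(["PAB_0", "PAB_16", "PAB_2064"]) == ["PAB_0", "PAB_16", "PAB_2064"]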
--- tests/test_step2.py | 2 +- tests/test_step3.py | 2 +- tests/test_step4.py | 2 +- tests/test_step5.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_step2.py b/tests/test_step2.py index b34fe64..f7e34d1 100644 --- a/tests/test_step2.py +++ b/tests/test_step2.py @@ -136,7 +136,7 @@ "sample_doric_4", "sample_doric_5", "tdt_clean", - "tdt_check_data", + "tdt_split_event", "tdt_with_artifacts", "sample_npm_2", "sample_npm_3", diff --git a/tests/test_step3.py b/tests/test_step3.py index 330d017..26dac14 100644 --- a/tests/test_step3.py +++ b/tests/test_step3.py @@ -145,7 +145,7 @@ def storenames_map(): "sample_doric_4", "sample_doric_5", "tdt_clean", - "tdt_check_data", + "tdt_split_event", "tdt_with_artifacts", "sample_npm_2", "sample_npm_3", diff --git a/tests/test_step4.py b/tests/test_step4.py index 109e7da..df18f75 100644 --- a/tests/test_step4.py +++ b/tests/test_step4.py @@ -161,7 +161,7 @@ "sample_doric_4", "sample_doric_5", "tdt_clean", - "tdt_check_data", + "tdt_split_event", "tdt_with_artifacts", "sample_npm_2", "sample_npm_3", diff --git a/tests/test_step5.py b/tests/test_step5.py index 870fb7c..a8cdeb4 100644 --- a/tests/test_step5.py +++ b/tests/test_step5.py @@ -161,7 +161,7 @@ "sample_doric_4", "sample_doric_5", "tdt_clean", - "tdt_check_data", + "tdt_split_event", "tdt_with_artifacts", "sample_npm_2", "sample_npm_3", From 27acc6cecee233b83dd8d6961bbf5fc5bb669a74 Mon Sep 17 00:00:00 2001 From: Paul Adkisson-Floro Date: Mon, 1 Dec 2025 19:43:14 -0500 Subject: [PATCH 042/150] Standardize read and save (#188) --- .../extractors/csv_recording_extractor.py | 23 ++- .../extractors/doric_recording_extractor.py | 139 +++++++++++------- .../extractors/npm_recording_extractor.py | 23 ++- 3 files changed, 126 insertions(+), 59 deletions(-) diff --git a/src/guppy/extractors/csv_recording_extractor.py b/src/guppy/extractors/csv_recording_extractor.py index 3df76f6..5a42bd1 100644 --- a/src/guppy/extractors/csv_recording_extractor.py +++ b/src/guppy/extractors/csv_recording_extractor.py @@ -19,13 +19,13 @@ def execute_import_csv(filepath, events, outputPath, numProcesses=mp.cpu_count() extractor = CsvRecordingExtractor(folder_path=filepath) start = time.time() with mp.Pool(numProcesses) as p: - p.starmap(read_csv_and_save_hdf5, zip(repeat(extractor), events, repeat(outputPath))) + p.starmap(read_and_save_csv, zip(repeat(extractor), events, repeat(outputPath))) logger.info("Time taken = {0:.5f}".format(time.time() - start)) -def read_csv_and_save_hdf5(extractor, event, outputPath): - df = extractor.read_csv(event=event) - extractor.save_to_hdf5(df=df, event=event, outputPath=outputPath) +def read_and_save_csv(extractor, event, outputPath): + output_dicts = extractor.read(events=[event], outputPath=outputPath) + extractor.save(output_dicts=output_dicts, outputPath=outputPath) logger.info("Data for event {} fetched and stored.".format(event)) @@ -178,3 +178,18 @@ def save_to_hdf5(self, df, event, outputPath): write_hdf5(df[key[i]].dropna(), event, outputPath, key[i].lower()) logger.info("\033[1m" + "Reading data for {} from csv file is completed.".format(event) + "\033[0m") + + def read(self, events, outputPath): + output_dicts = [] + for event in events: + df = self.read_csv(event=event) + S = df.to_dict() + S["storename"] = event + output_dicts.append(S) + return output_dicts + + def save(self, output_dicts, outputPath): + for S in output_dicts: + event = S.pop("storename") + df = pd.DataFrame.from_dict(S) + self.save_to_hdf5(df=df, event=event, 
outputPath=outputPath) diff --git a/src/guppy/extractors/doric_recording_extractor.py b/src/guppy/extractors/doric_recording_extractor.py index e5a97cb..2966ec6 100644 --- a/src/guppy/extractors/doric_recording_extractor.py +++ b/src/guppy/extractors/doric_recording_extractor.py @@ -15,18 +15,12 @@ def execute_import_doric(folder_path, storesList, outputPath): extractor = DoricRecordingExtractor(folder_path=folder_path) - flag = extractor.check_doric(folder_path) - - if flag == "doric_csv": - extractor.read_doric_csv(folder_path, storesList, outputPath) - elif flag == "doric_doric": - extractor.read_doric_doric(folder_path, storesList, outputPath) - else: - logger.error("Doric file not found or not recognized.") - raise FileNotFoundError("Doric file not found or not recognized.") + output_dicts = extractor.read(storesList=storesList) + extractor.save(output_dicts=output_dicts, outputPath=outputPath) class DoricRecordingExtractor: + # TODO: consolidate duplicate flag logic between the `__init__` and the `check_doric` method. def __init__(self, folder_path): self.folder_path = folder_path @@ -110,9 +104,9 @@ def separate_last_element(self, arr): l = arr[-1] return arr[:-1], l - def check_doric(self, filepath): + def check_doric(self): logger.debug("Checking if doric file exists") - path = glob.glob(os.path.join(filepath, "*.csv")) + glob.glob(os.path.join(filepath, "*.doric")) + path = glob.glob(os.path.join(self.folder_path, "*.csv")) + glob.glob(os.path.join(self.folder_path, "*.doric")) flag_arr = [] for i in range(len(path)): @@ -141,44 +135,50 @@ def check_doric(self, filepath): logger.info("Doric file found.") return flag_arr[0] - def read_doric_csv(self, filepath, storesList, outputPath): - path = glob.glob(os.path.join(filepath, "*.csv")) + def read_doric_csv(self, storesList): + path = glob.glob(os.path.join(self.folder_path, "*.csv")) if len(path) > 1: logger.error("An error occurred : More than one Doric csv file present at the location") raise Exception("More than one Doric csv file present at the location") - else: - df = pd.read_csv(path[0], header=1, index_col=False) - df = df.dropna(axis=1, how="all") - df = df.dropna(axis=0, how="any") - df["Time(s)"] = df["Time(s)"] - df["Time(s)"].to_numpy()[0] - for i in range(storesList.shape[1]): - if "control" in storesList[1, i] or "signal" in storesList[1, i]: - timestamps = np.array(df["Time(s)"]) - sampling_rate = np.array([1 / (timestamps[-1] - timestamps[-2])]) - write_hdf5(sampling_rate, storesList[0, i], outputPath, "sampling_rate") - write_hdf5(df["Time(s)"].to_numpy(), storesList[0, i], outputPath, "timestamps") - write_hdf5(df[storesList[0, i]].to_numpy(), storesList[0, i], outputPath, "data") - else: - ttl = df[storesList[0, i]] - indices = np.where(ttl <= 0)[0] - diff_indices = np.where(np.diff(indices) > 1)[0] - write_hdf5( - df["Time(s)"][indices[diff_indices] + 1].to_numpy(), storesList[0, i], outputPath, "timestamps" - ) - - def read_doric_doric(self, filepath, storesList, outputPath): - path = glob.glob(os.path.join(filepath, "*.doric")) + + df = pd.read_csv(path[0], header=1, index_col=False) + df = df.dropna(axis=1, how="all") + df = df.dropna(axis=0, how="any") + df["Time(s)"] = df["Time(s)"] - df["Time(s)"].to_numpy()[0] + + output_dicts = [] + for i in range(storesList.shape[1]): + if "control" in storesList[1, i] or "signal" in storesList[1, i]: + timestamps = np.array(df["Time(s)"]) + sampling_rate = np.array([1 / (timestamps[-1] - timestamps[-2])]) + data = np.array(df[storesList[0, i]]) + storename = 
storesList[0, i] + S = {"storename": storename, "sampling_rate": sampling_rate, "timestamps": timestamps, "data": data} + output_dicts.append(S) + else: + ttl = df[storesList[0, i]] + indices = np.where(ttl <= 0)[0] + diff_indices = np.where(np.diff(indices) > 1)[0] + timestamps = df["Time(s)"][indices[diff_indices] + 1].to_numpy() + storename = storesList[0, i] + S = {"storename": storename, "timestamps": timestamps} + output_dicts.append(S) + + return output_dicts + + def read_doric_doric(self, storesList): + path = glob.glob(os.path.join(self.folder_path, "*.doric")) if len(path) > 1: logger.error("An error occurred : More than one Doric file present at the location") raise Exception("More than one Doric file present at the location") - else: - with h5py.File(path[0], "r") as f: - if "Traces" in list(f.keys()): - keys = self.access_data_doricV1(f, storesList, outputPath) - elif list(f.keys()) == ["Configurations", "DataAcquisition"]: - keys = self.access_data_doricV6(f, storesList, outputPath) + with h5py.File(path[0], "r") as f: + if "Traces" in list(f.keys()): + output_dicts = self.access_data_doricV1(f, storesList) + elif list(f.keys()) == ["Configurations", "DataAcquisition"]: + output_dicts = self.access_data_doricV6(f, storesList) + return output_dicts - def access_data_doricV6(self, doric_file, storesList, outputPath): + def access_data_doricV6(self, doric_file, storesList): data = [doric_file["DataAcquisition"]] res = [] while len(data) != 0: @@ -201,6 +201,7 @@ def access_data_doricV6(self, doric_file, storesList, outputPath): if f"{sep_values[-2]}/{sep_values[-1]}" in storesList[0, :]: decide_path.append(element) + output_dicts = [] for i in range(storesList.shape[1]): if "control" in storesList[1, i] or "signal" in storesList[1, i]: regex = re.compile("(.*?)" + str(storesList[0, i]) + "(.*?)") @@ -212,9 +213,9 @@ def access_data_doricV6(self, doric_file, storesList, outputPath): data = np.array(doric_file[decide_path[idx]]) timestamps = np.array(doric_file[decide_path[idx].rsplit("/", 1)[0] + "/Time"]) sampling_rate = np.array([1 / (timestamps[-1] - timestamps[-2])]) - write_hdf5(sampling_rate, storesList[0, i], outputPath, "sampling_rate") - write_hdf5(timestamps, storesList[0, i], outputPath, "timestamps") - write_hdf5(data, storesList[0, i], outputPath, "data") + storename = storesList[0, i] + S = {"storename": storename, "sampling_rate": sampling_rate, "timestamps": timestamps, "data": data} + output_dicts.append(S) else: regex = re.compile("(.*?)" + storesList[0, i] + "$") idx = [i for i in range(len(decide_path)) if regex.match(decide_path[i])] @@ -226,21 +227,57 @@ def access_data_doricV6(self, doric_file, storesList, outputPath): timestamps = np.array(doric_file[decide_path[idx].rsplit("/", 1)[0] + "/Time"]) indices = np.where(ttl <= 0)[0] diff_indices = np.where(np.diff(indices) > 1)[0] - write_hdf5(timestamps[indices[diff_indices] + 1], storesList[0, i], outputPath, "timestamps") + timestamps = timestamps[indices[diff_indices] + 1] + storename = storesList[0, i] + S = {"storename": storename, "timestamps": timestamps} + output_dicts.append(S) - def access_data_doricV1(self, doric_file, storesList, outputPath): + return output_dicts + + def access_data_doricV1(self, doric_file, storesList): keys = list(doric_file["Traces"]["Console"].keys()) + output_dicts = [] for i in range(storesList.shape[1]): if "control" in storesList[1, i] or "signal" in storesList[1, i]: timestamps = np.array(doric_file["Traces"]["Console"]["Time(s)"]["Console_time(s)"]) sampling_rate = 
np.array([1 / (timestamps[-1] - timestamps[-2])]) data = np.array(doric_file["Traces"]["Console"][storesList[0, i]][storesList[0, i]]) - write_hdf5(sampling_rate, storesList[0, i], outputPath, "sampling_rate") - write_hdf5(timestamps, storesList[0, i], outputPath, "timestamps") - write_hdf5(data, storesList[0, i], outputPath, "data") + storename = storesList[0, i] + S = {"storename": storename, "sampling_rate": sampling_rate, "timestamps": timestamps, "data": data} + output_dicts.append(S) else: timestamps = np.array(doric_file["Traces"]["Console"]["Time(s)"]["Console_time(s)"]) ttl = np.array(doric_file["Traces"]["Console"][storesList[0, i]][storesList[0, i]]) indices = np.where(ttl <= 0)[0] diff_indices = np.where(np.diff(indices) > 1)[0] - write_hdf5(timestamps[indices[diff_indices] + 1], storesList[0, i], outputPath, "timestamps") + timestamps = timestamps[indices[diff_indices] + 1] + storename = storesList[0, i] + S = {"storename": storename, "timestamps": timestamps} + output_dicts.append(S) + + return output_dicts + + def save_dict_to_hdf5(self, S, outputPath): + event = S["storename"] + write_hdf5(S["timestamps"], event, outputPath, "timestamps") + + if "sampling_rate" in S: + write_hdf5(S["sampling_rate"], event, outputPath, "sampling_rate") + if "data" in S: + write_hdf5(S["data"], event, outputPath, "data") + + def read(self, storesList): + flag = self.check_doric() + if flag == "doric_csv": + output_dicts = self.read_doric_csv(storesList) + elif flag == "doric_doric": + output_dicts = self.read_doric_doric(storesList) + else: + logger.error("Doric file not found or not recognized.") + raise FileNotFoundError("Doric file not found or not recognized.") + + return output_dicts + + def save(self, output_dicts, outputPath): + for S in output_dicts: + self.save_dict_to_hdf5(S=S, outputPath=outputPath) diff --git a/src/guppy/extractors/npm_recording_extractor.py b/src/guppy/extractors/npm_recording_extractor.py index a8cfd98..bc9b210 100644 --- a/src/guppy/extractors/npm_recording_extractor.py +++ b/src/guppy/extractors/npm_recording_extractor.py @@ -24,13 +24,13 @@ def execute_import_npm(folder_path, num_ch, inputParameters, events, outputPath, extractor = NpmRecordingExtractor(folder_path=folder_path, num_ch=num_ch, inputParameters=inputParameters) start = time.time() with mp.Pool(numProcesses) as p: - p.starmap(read_npm_and_save_hdf5, zip(repeat(extractor), events, repeat(outputPath))) + p.starmap(read_and_save_npm, zip(repeat(extractor), events, repeat(outputPath))) logger.info("Time taken = {0:.5f}".format(time.time() - start)) -def read_npm_and_save_hdf5(extractor, event, outputPath): - df = extractor.read_npm(event=event) - extractor.save_to_hdf5(df=df, event=event, outputPath=outputPath) +def read_and_save_npm(extractor, event, outputPath): + output_dicts = extractor.read(events=[event], outputPath=outputPath) + extractor.save(output_dicts=output_dicts, outputPath=outputPath) logger.info("Data for event {} fetched and stored.".format(event)) @@ -488,3 +488,18 @@ def save_to_hdf5(self, df, event, outputPath): write_hdf5(df[key[i]].dropna(), event, outputPath, key[i].lower()) logger.info("\033[1m" + "Reading data for {} from csv file is completed.".format(event) + "\033[0m") + + def read(self, events, outputPath): + output_dicts = [] + for event in events: + df = self.read_npm(event=event) + S = df.to_dict() + S["storename"] = event + output_dicts.append(S) + return output_dicts + + def save(self, output_dicts, outputPath): + for S in output_dicts: + event = S.pop("storename") + 
df = pd.DataFrame.from_dict(S) + self.save_to_hdf5(df=df, event=event, outputPath=outputPath) From a633550144b26b2ed6cc1a4d86696f1296a6e9f1 Mon Sep 17 00:00:00 2001 From: Paul Adkisson-Floro Date: Wed, 3 Dec 2025 13:18:21 -0500 Subject: [PATCH 043/150] Remove tkinter from NPM (#189) Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .../extractors/npm_recording_extractor.py | 293 ++++++++++-------- src/guppy/saveStoresList.py | 118 +++++++ src/guppy/testing/api.py | 78 +++-- tests/test_step2.py | 19 +- tests/test_step3.py | 22 +- tests/test_step4.py | 26 +- tests/test_step5.py | 30 +- 7 files changed, 387 insertions(+), 199 deletions(-) diff --git a/src/guppy/extractors/npm_recording_extractor.py b/src/guppy/extractors/npm_recording_extractor.py index bc9b210..ae4f540 100644 --- a/src/guppy/extractors/npm_recording_extractor.py +++ b/src/guppy/extractors/npm_recording_extractor.py @@ -3,9 +3,7 @@ import multiprocessing as mp import os import time -import tkinter as tk from itertools import repeat -from tkinter import StringVar, messagebox, ttk import numpy as np import pandas as pd @@ -36,7 +34,7 @@ def read_and_save_npm(extractor, event, outputPath): class NpmRecordingExtractor: - def __init__(self, folder_path, num_ch, inputParameters=None): + def __init__(self, folder_path, num_ch, inputParameters=None): # TODO: make inputParameters mandatory self.folder_path = folder_path self.num_ch = num_ch self.inputParameters = inputParameters @@ -44,18 +42,70 @@ def __init__(self, folder_path, num_ch, inputParameters=None): folder_path=folder_path, num_ch=num_ch, inputParameters=inputParameters ) + @classmethod + def has_multiple_event_ttls(cls, folder_path): + path = sorted(glob.glob(os.path.join(folder_path, "*.csv"))) + path_chev = glob.glob(os.path.join(folder_path, "*chev*")) + path_chod = glob.glob(os.path.join(folder_path, "*chod*")) + path_chpr = glob.glob(os.path.join(folder_path, "*chpr*")) + path_event = glob.glob(os.path.join(folder_path, "event*")) + path_chev_chod_event = path_chev + path_chod + path_event + path_chpr + + path = sorted(list(set(path) - set(path_chev_chod_event))) + multiple_event_ttls = [] + for i in range(len(path)): + df = pd.read_csv(path[i], index_col=False) + _, value = cls.check_header(df) + + # check dataframe structure and read data accordingly + if len(value) > 0: + columns_isstr = False + df = pd.read_csv(path[i], header=None) + cols = np.array(list(df.columns), dtype=str) + else: + columns_isstr = True + cols = np.array(list(df.columns), dtype=str) + if len(cols) == 2: + flag = "event_or_data_np" + elif len(cols) > 2: + flag = "data_np" + else: + logger.error("Number of columns in csv file does not make sense.") + raise Exception("Number of columns in csv file does not make sense.") + + # used assigned flags to process the files and read the data + if flag == "event_or_data_np": + arr = list(df.iloc[:, 1]) + check_float = [True for i in arr if isinstance(i, float)] + if len(arr) == len(check_float) and columns_isstr == False: + flag = "data_np" + elif columns_isstr == True and ("value" in np.char.lower(np.array(cols))): + flag = "event_np" + else: + flag = "event_np" + + if flag == "event_np": + type_val = np.array(df.iloc[:, 1]) + type_val_unique = np.unique(type_val) + if len(type_val_unique) > 1: + multiple_event_ttls.append(True) + else: + multiple_event_ttls.append(False) + else: + multiple_event_ttls.append(False) + + return multiple_event_ttls + def import_npm(self, folder_path, num_ch, inputParameters=None): logger.debug("If 
it exists, importing NPM file based on the structure of file") # Headless configuration (used to avoid any UI prompts when running tests) headless = bool(os.environ.get("GUPPY_BASE_DIR")) - npm_timestamp_column_name = None - npm_time_unit = None - npm_split_events = None if isinstance(inputParameters, dict): - npm_timestamp_column_name = inputParameters.get("npm_timestamp_column_name") - npm_time_unit = inputParameters.get("npm_time_unit", "seconds") - npm_split_events = inputParameters.get("npm_split_events", True) + npm_timestamp_column_names = inputParameters.get("npm_timestamp_column_names") + npm_time_units = inputParameters.get("npm_time_units") + # TODO: come up with a better name for npm_split_events that can be appropriately pluralized for a list + npm_split_events = inputParameters.get("npm_split_events") path = sorted(glob.glob(os.path.join(folder_path, "*.csv"))) + sorted( glob.glob(os.path.join(folder_path, "*.doric")) ) @@ -71,6 +121,20 @@ def import_npm(self, folder_path, num_ch, inputParameters=None): event_from_filename = [] flag_arr = [] for i in range(len(path)): + # TODO: validate npm_timestamp_column_names, npm_time_units, npm_split_events lengths + if npm_timestamp_column_names is None: + npm_timestamp_column_name = None + else: + npm_timestamp_column_name = npm_timestamp_column_names[i] + if npm_time_units is None: + npm_time_unit = "seconds" + else: + npm_time_unit = npm_time_units[i] + if npm_split_events is None: + split_events = False + else: + split_events = npm_split_events[i] + dirname = os.path.dirname(path[i]) ext = os.path.basename(path[i]).split(".")[-1] assert ext != "doric", "Doric files are not supported by import_npm function." @@ -103,7 +167,7 @@ def import_npm(self, folder_path, num_ch, inputParameters=None): assert len(cols) != 3, "File appears to be data .csv. This function only supports NPM .csv files." if len(cols) == 2: flag = "event_or_data_np" - elif len(cols) >= 2: + elif len(cols) > 2: flag = "data_np" else: logger.error("Number of columns in csv file does not make sense.") @@ -150,23 +214,7 @@ def import_npm(self, folder_path, num_ch, inputParameters=None): elif flag == "event_np": type_val = np.array(df.iloc[:, 1]) type_val_unique = np.unique(type_val) - if headless: - response = 1 if bool(npm_split_events) else 0 - else: - window = tk.Tk() - if len(type_val_unique) > 1: - response = messagebox.askyesno( - "Multiple event TTLs", - "Based on the TTL file,\ - it looks like TTLs \ - belongs to multiple behavior type. 
\ - Do you want to create multiple files for each \ - behavior type ?", - ) - else: - response = 0 - window.destroy() - if response == 1: + if split_events: timestamps = np.array(df.iloc[:, 0]) for j in range(len(type_val_unique)): idx = np.where(type_val == type_val_unique[j]) @@ -184,9 +232,8 @@ def import_npm(self, folder_path, num_ch, inputParameters=None): event_from_filename.append("event" + str(0)) else: file = f"file{str(i)}_" - df, ts_unit = self.decide_ts_unit_for_npm( - df, timestamp_column_name=npm_timestamp_column_name, time_unit=npm_time_unit, headless=headless - ) + ts_unit = npm_time_unit + df = self.update_df_with_timestamp_columns(df, timestamp_column_name=npm_timestamp_column_name) df, indices_dict, _ = self.decide_indices(file, df, flag) keys = list(indices_dict.keys()) for k in range(len(keys)): @@ -270,7 +317,8 @@ def import_npm(self, folder_path, num_ch, inputParameters=None): logger.info("Importing of NPM file is done.") return event_from_filename, flag_arr - def check_header(self, df): + @classmethod + def check_header(cls, df): arr = list(df.columns) check_float = [] for i in arr: @@ -283,7 +331,8 @@ def check_header(self, df): # function to decide indices of interleaved channels # in neurophotometrics data - def decide_indices(self, file, df, flag, num_ch=2): + @classmethod + def decide_indices(cls, file, df, flag, num_ch=2): ch_name = [file + "chev", file + "chod", file + "chpr"] if len(ch_name) < num_ch: logger.error( @@ -319,7 +368,7 @@ def decide_indices(self, file, df, flag, num_ch=2): data but column names does not have Flags or LedState" ) - num_ch, ch = self.check_channels(state) + num_ch, ch = cls.check_channels(state) indices_dict = dict() for i in range(num_ch): first_occurrence = np.where(state == ch[i])[0] @@ -330,7 +379,8 @@ def decide_indices(self, file, df, flag, num_ch=2): return df, indices_dict, num_ch # check flag consistency in neurophotometrics data - def check_channels(self, state): + @classmethod + def check_channels(cls, state): state = state.astype(int) unique_state = np.unique(state[2:12]) if unique_state.shape[0] > 3: @@ -345,105 +395,94 @@ def check_channels(self, state): return unique_state.shape[0], unique_state - # function to decide NPM timestamps unit (seconds, ms or us) - def decide_ts_unit_for_npm(self, df, timestamp_column_name=None, time_unit=None, headless=False): - col_names = np.array(list(df.columns)) + @classmethod + def needs_ts_unit(cls, folder_path, num_ch): + path = sorted(glob.glob(os.path.join(folder_path, "*.csv"))) + sorted( + glob.glob(os.path.join(folder_path, "*.doric")) + ) + path_chev = glob.glob(os.path.join(folder_path, "*chev*")) + path_chod = glob.glob(os.path.join(folder_path, "*chod*")) + path_chpr = glob.glob(os.path.join(folder_path, "*chpr*")) + path_event = glob.glob(os.path.join(folder_path, "event*")) + # path_sig = glob.glob(os.path.join(filepath, 'sig*')) # TODO: what is this for? 
+ path_chev_chod_event = path_chev + path_chod + path_event + path_chpr + + path = sorted(list(set(path) - set(path_chev_chod_event))) + ts_unit_needs = [] col_names_ts = [""] - for name in col_names: - if "timestamp" in name.lower(): - col_names_ts.append(name) + for i in range(len(path)): + df = pd.read_csv(path[i], index_col=False) + _, value = cls.check_header(df) - ts_unit = "seconds" - if len(col_names_ts) > 2: - # Headless path: auto-select column/unit without any UI - if headless: - if timestamp_column_name is not None: - assert ( - timestamp_column_name in col_names_ts - ), f"Provided timestamp_column_name '{timestamp_column_name}' not found in columns {col_names_ts[1:]}" - chosen = timestamp_column_name - else: - chosen = col_names_ts[1] - df.insert(1, "Timestamp", df[chosen]) - df = df.drop(col_names_ts[1:], axis=1) - valid_units = {"seconds", "milliseconds", "microseconds"} - ts_unit = time_unit if (isinstance(time_unit, str) and time_unit in valid_units) else "seconds" - return df, ts_unit - # def comboBoxSelected(event): - # logger.info(event.widget.get()) - - window = tk.Tk() - window.title("Select appropriate options for timestamps") - window.geometry("500x200") - holdComboboxValues = dict() - - timestamps_label = ttk.Label(window, text="Select which timestamps to use : ").grid( - row=0, column=1, pady=25, padx=25 - ) - holdComboboxValues["timestamps"] = StringVar() - timestamps_combo = ttk.Combobox(window, values=col_names_ts, textvariable=holdComboboxValues["timestamps"]) - timestamps_combo.grid(row=0, column=2, pady=25, padx=25) - timestamps_combo.current(0) - # timestamps_combo.bind("<>", comboBoxSelected) - - time_unit_label = ttk.Label(window, text="Select timestamps unit : ").grid( - row=1, column=1, pady=25, padx=25 - ) - holdComboboxValues["time_unit"] = StringVar() - time_unit_combo = ttk.Combobox( - window, - values=["", "seconds", "milliseconds", "microseconds"], - textvariable=holdComboboxValues["time_unit"], - ) - time_unit_combo.grid(row=1, column=2, pady=25, padx=25) - time_unit_combo.current(0) - # time_unit_combo.bind("<>", comboBoxSelected) - window.lift() - window.after(500, lambda: window.lift()) - window.mainloop() - - if holdComboboxValues["timestamps"].get(): - df.insert(1, "Timestamp", df[holdComboboxValues["timestamps"].get()]) - df = df.drop(col_names_ts[1:], axis=1) + # check dataframe structure and read data accordingly + if len(value) > 0: + df = pd.read_csv(path[i], header=None) + cols = np.array(list(df.columns), dtype=str) + columns_isstr = False else: - messagebox.showerror( - "All options not selected", - "All the options for timestamps \ - were not selected. Please select appropriate options", - ) - logger.error( - "All the options for timestamps \ - were not selected. Please select appropriate options" - ) - raise Exception( - "All the options for timestamps \ - were not selected. 
Please select appropriate options" - ) - if holdComboboxValues["time_unit"].get(): - if holdComboboxValues["time_unit"].get() == "seconds": - ts_unit = holdComboboxValues["time_unit"].get() - elif holdComboboxValues["time_unit"].get() == "milliseconds": - ts_unit = holdComboboxValues["time_unit"].get() + columns_isstr = True + cols = np.array(list(df.columns), dtype=str) + # check the structure of dataframe and assign flag to the type of file + if len(cols) == 2: + flag = "event_or_data_np" + elif len(cols) > 2: + flag = "data_np" + else: + logger.error("Number of columns in csv file does not make sense.") + raise Exception("Number of columns in csv file does not make sense.") + + if columns_isstr == True and ( + "flags" in np.char.lower(np.array(cols)) or "ledstate" in np.char.lower(np.array(cols)) + ): + flag = flag + "_v2" + + # used assigned flags to process the files and read the data + if flag == "event_or_data_np": + arr = list(df.iloc[:, 1]) + check_float = [True for i in arr if isinstance(i, float)] + if len(arr) == len(check_float) and columns_isstr == False: + flag = "data_np" + elif columns_isstr == True and ("value" in np.char.lower(np.array(cols))): + flag = "event_np" else: - ts_unit = holdComboboxValues["time_unit"].get() + flag = "event_np" + + if flag == "data_np": + file = f"file{str(i)}_" + df, _, _ = cls.decide_indices(file, df, flag, num_ch) + + if flag == "event_np" or flag == "data_np": + ts_unit_needs.append(False) + continue + + col_names = np.array(list(df.columns)) + for name in col_names: + if "timestamp" in name.lower(): + col_names_ts.append(name) + + if len(col_names_ts) > 2: + ts_unit_needs.append(True) else: - messagebox.showerror( - "All options not selected", - "All the options for timestamps \ - were not selected. Please select appropriate options", - ) - logger.error( - "All the options for timestamps \ - were not selected. Please select appropriate options" - ) - raise Exception( - "All the options for timestamps \ - were not selected. 
Please select appropriate options" - ) - else: - pass + ts_unit_needs.append(False) - return df, ts_unit + return ts_unit_needs, col_names_ts + + def update_df_with_timestamp_columns(self, df, timestamp_column_name): + col_names = np.array(list(df.columns)) + col_names_ts = [""] + for name in col_names: + if "timestamp" in name.lower(): + col_names_ts.append(name) + if len(col_names_ts) <= 2: + return df + + timestamp_column_name = timestamp_column_name if timestamp_column_name is not None else col_names_ts[1] + assert ( + timestamp_column_name in col_names_ts + ), f"Provided timestamp_column_name '{timestamp_column_name}' not found in columns {col_names_ts[1:]}" + df.insert(1, "Timestamp", df[timestamp_column_name]) + df = df.drop(col_names_ts[1:], axis=1) + return df def read_npm(self, event): logger.debug("\033[1m" + "Trying to read data for {} from csv file.".format(event) + "\033[0m") diff --git a/src/guppy/saveStoresList.py b/src/guppy/saveStoresList.py index daf7457..552d76c 100755 --- a/src/guppy/saveStoresList.py +++ b/src/guppy/saveStoresList.py @@ -9,8 +9,10 @@ import logging import os import socket +import tkinter as tk from pathlib import Path from random import randint +from tkinter import StringVar, messagebox, ttk import holoviews as hv import numpy as np @@ -602,6 +604,23 @@ def execute(inputParameters): flag = extractor.flags elif modality == "npm": + headless = bool(os.environ.get("GUPPY_BASE_DIR")) + if not headless: + # Resolve multiple event TTLs + multiple_event_ttls = NpmRecordingExtractor.has_multiple_event_ttls(folder_path=filepath) + responses = get_multi_event_responses(multiple_event_ttls) + inputParameters["npm_split_events"] = responses + + # Resolve timestamp units and columns + ts_unit_needs, col_names_ts = NpmRecordingExtractor.needs_ts_unit( + folder_path=filepath, num_ch=num_ch + ) + ts_units, npm_timestamp_column_names = get_timestamp_configuration(ts_unit_needs, col_names_ts) + inputParameters["npm_time_units"] = ts_units if ts_units else None + inputParameters["npm_timestamp_column_names"] = ( + npm_timestamp_column_names if npm_timestamp_column_names else None + ) + data = 0 extractor = NpmRecordingExtractor(folder_path=filepath, num_ch=num_ch, inputParameters=inputParameters) event_name = extractor.events @@ -614,3 +633,102 @@ def execute(inputParameters): except Exception as e: logger.error(str(e)) raise e + + +def get_multi_event_responses(multiple_event_ttls): + responses = [] + for has_multiple in multiple_event_ttls: + if not has_multiple: + responses.append(False) + continue + window = tk.Tk() + response = messagebox.askyesno( + "Multiple event TTLs", + ( + "Based on the TTL file, " + "it looks like TTLs " + "belong to multiple behavior types. " + "Do you want to create multiple files for each " + "behavior type?" 
+ ), + ) + window.destroy() + responses.append(response) + return responses + + +def get_timestamp_configuration(ts_unit_needs, col_names_ts): + ts_units, npm_timestamp_column_names = [], [] + for need in ts_unit_needs: + if not need: + ts_units.append("seconds") + npm_timestamp_column_names.append(None) + continue + window = tk.Tk() + window.title("Select appropriate options for timestamps") + window.geometry("500x200") + holdComboboxValues = dict() + + timestamps_label = ttk.Label(window, text="Select which timestamps to use : ").grid( + row=0, column=1, pady=25, padx=25 + ) + holdComboboxValues["timestamps"] = StringVar() + timestamps_combo = ttk.Combobox(window, values=col_names_ts, textvariable=holdComboboxValues["timestamps"]) + timestamps_combo.grid(row=0, column=2, pady=25, padx=25) + timestamps_combo.current(0) + # timestamps_combo.bind("<>", comboBoxSelected) + + time_unit_label = ttk.Label(window, text="Select timestamps unit : ").grid(row=1, column=1, pady=25, padx=25) + holdComboboxValues["time_unit"] = StringVar() + time_unit_combo = ttk.Combobox( + window, + values=["", "seconds", "milliseconds", "microseconds"], + textvariable=holdComboboxValues["time_unit"], + ) + time_unit_combo.grid(row=1, column=2, pady=25, padx=25) + time_unit_combo.current(0) + # time_unit_combo.bind("<>", comboBoxSelected) + window.lift() + window.after(500, lambda: window.lift()) + window.mainloop() + + if holdComboboxValues["timestamps"].get(): + npm_timestamp_column_name = holdComboboxValues["timestamps"].get() + else: + messagebox.showerror( + "All options not selected", + "All the options for timestamps \ + were not selected. Please select appropriate options", + ) + logger.error( + "All the options for timestamps \ + were not selected. Please select appropriate options" + ) + raise Exception( + "All the options for timestamps \ + were not selected. Please select appropriate options" + ) + if holdComboboxValues["time_unit"].get(): + if holdComboboxValues["time_unit"].get() == "seconds": + ts_unit = holdComboboxValues["time_unit"].get() + elif holdComboboxValues["time_unit"].get() == "milliseconds": + ts_unit = holdComboboxValues["time_unit"].get() + else: + ts_unit = holdComboboxValues["time_unit"].get() + else: + messagebox.showerror( + "All options not selected", + "All the options for timestamps \ + were not selected. Please select appropriate options", + ) + logger.error( + "All the options for timestamps \ + were not selected. Please select appropriate options" + ) + raise Exception( + "All the options for timestamps \ + were not selected. Please select appropriate options" + ) + ts_units.append(ts_unit) + npm_timestamp_column_names.append(npm_timestamp_column_name) + return ts_units, npm_timestamp_column_names diff --git a/src/guppy/testing/api.py b/src/guppy/testing/api.py index d7e390d..c647907 100644 --- a/src/guppy/testing/api.py +++ b/src/guppy/testing/api.py @@ -69,9 +69,9 @@ def step2( selected_folders: Iterable[str], storenames_map: dict[str, str], modality: str = "tdt", - npm_timestamp_column_name: str | None = None, - npm_time_unit: str = "seconds", - npm_split_events: bool = True, + npm_timestamp_column_names: list[str | None] | None = None, + npm_time_units: list[str] | None = None, + npm_split_events: list[bool] | None = None, ) -> None: """ Run pipeline Step 2 (Save Storenames) via the actual Panel-backed logic. @@ -94,6 +94,14 @@ def step2( storenames_map : dict[str, str] Mapping from raw storenames (e.g., "Dv1A") to semantic names (e.g., "control_DMS"). 
Insertion order is preserved. + modality : str + Data acquisition modality (e.g., 'tdt', 'csv', 'doric', 'npm'). + npm_timestamp_column_names : list[str | None] | None + List of timestamp column names for NPM files, one per CSV file. None if not applicable. + npm_time_units : list[str] | None + List of time units for NPM files, one per CSV file (e.g., 'seconds', 'milliseconds'). None if not applicable. + npm_split_events : list[bool] | None + List of booleans indicating whether to split events for NPM files, one per CSV file. None if not applicable. Raises ------ @@ -155,8 +163,8 @@ def step2( input_params["modality"] = modality # Add npm parameters - input_params["npm_timestamp_column_name"] = npm_timestamp_column_name - input_params["npm_time_unit"] = npm_time_unit + input_params["npm_timestamp_column_names"] = npm_timestamp_column_names + input_params["npm_time_units"] = npm_time_units input_params["npm_split_events"] = npm_split_events # Call the underlying Step 2 executor (now headless-aware) @@ -168,9 +176,9 @@ def step3( base_dir: str, selected_folders: Iterable[str], modality: str = "tdt", - npm_timestamp_column_name: str | None = None, - npm_time_unit: str = "seconds", - npm_split_events: bool = True, + npm_timestamp_column_names: list[str | None] | None = None, + npm_time_units: list[str] | None = None, + npm_split_events: list[bool] | None = None, ) -> None: """ Run pipeline Step 3 (Read Raw Data) via the actual Panel-backed logic, headlessly. @@ -188,6 +196,14 @@ def step3( must reside directly under this path. selected_folders : Iterable[str] Absolute paths to the session directories to process. + modality : str + Data acquisition modality (e.g., 'tdt', 'csv', 'doric', 'npm'). + npm_timestamp_column_names : list[str | None] | None + List of timestamp column names for NPM files, one per CSV file. None if not applicable. + npm_time_units : list[str] | None + List of time units for NPM files, one per CSV file (e.g., 'seconds', 'milliseconds'). None if not applicable. + npm_split_events : list[bool] | None + List of booleans indicating whether to split events for NPM files, one per CSV file. None if not applicable. Raises ------ @@ -232,9 +248,9 @@ def step3( template._widgets["files_1"].value = abs_sessions input_params = template._hooks["getInputParameters"]() - # Inject explicit NPM parameters (match Step 2 style) - input_params["npm_timestamp_column_name"] = npm_timestamp_column_name - input_params["npm_time_unit"] = npm_time_unit + # Inject explicit NPM parameters + input_params["npm_timestamp_column_names"] = npm_timestamp_column_names + input_params["npm_time_units"] = npm_time_units input_params["npm_split_events"] = npm_split_events # Inject modality @@ -249,9 +265,9 @@ def step4( base_dir: str, selected_folders: Iterable[str], modality: str = "tdt", - npm_timestamp_column_name: str | None = None, - npm_time_unit: str = "seconds", - npm_split_events: bool = True, + npm_timestamp_column_names: list[str | None] | None = None, + npm_time_units: list[str] | None = None, + npm_split_events: list[bool] | None = None, ) -> None: """ Run pipeline Step 4 (Extract timestamps and signal) via the Panel-backed logic, headlessly. @@ -269,6 +285,14 @@ def step4( must reside directly under this path. selected_folders : Iterable[str] Absolute paths to the session directories to process. + modality : str + Data acquisition modality (e.g., 'tdt', 'csv', 'doric', 'npm'). + npm_timestamp_column_names : list[str | None] | None + List of timestamp column names for NPM files, one per CSV file. 
None if not applicable. + npm_time_units : list[str] | None + List of time units for NPM files, one per CSV file (e.g., 'seconds', 'milliseconds'). None if not applicable. + npm_split_events : list[bool] | None + List of booleans indicating whether to split events for NPM files, one per CSV file. None if not applicable. Raises ------ @@ -313,9 +337,9 @@ def step4( template._widgets["files_1"].value = abs_sessions input_params = template._hooks["getInputParameters"]() - # Inject explicit NPM parameters (match Step 2 style) - input_params["npm_timestamp_column_name"] = npm_timestamp_column_name - input_params["npm_time_unit"] = npm_time_unit + # Inject explicit NPM parameters + input_params["npm_timestamp_column_names"] = npm_timestamp_column_names + input_params["npm_time_units"] = npm_time_units input_params["npm_split_events"] = npm_split_events # Inject modality @@ -330,9 +354,9 @@ def step5( base_dir: str, selected_folders: Iterable[str], modality: str = "tdt", - npm_timestamp_column_name: str | None = None, - npm_time_unit: str = "seconds", - npm_split_events: bool = True, + npm_timestamp_column_names: list[str | None] | None = None, + npm_time_units: list[str] | None = None, + npm_split_events: list[bool] | None = None, ) -> None: """ Run pipeline Step 5 (PSTH Computation) via the Panel-backed logic, headlessly. @@ -350,6 +374,14 @@ def step5( must reside directly under this path. selected_folders : Iterable[str] Absolute paths to the session directories to process. + modality : str + Data acquisition modality (e.g., 'tdt', 'csv', 'doric', 'npm'). + npm_timestamp_column_names : list[str | None] | None + List of timestamp column names for NPM files, one per CSV file. None if not applicable. + npm_time_units : list[str] | None + List of time units for NPM files, one per CSV file (e.g., 'seconds', 'milliseconds'). None if not applicable. + npm_split_events : list[bool] | None + List of booleans indicating whether to split events for NPM files, one per CSV file. None if not applicable. 
Raises ------ @@ -394,9 +426,9 @@ def step5( template._widgets["files_1"].value = abs_sessions input_params = template._hooks["getInputParameters"]() - # Inject explicit NPM parameters (match Step 2 style) - input_params["npm_timestamp_column_name"] = npm_timestamp_column_name - input_params["npm_time_unit"] = npm_time_unit + # Inject explicit NPM parameters + input_params["npm_timestamp_column_names"] = npm_timestamp_column_names + input_params["npm_time_units"] = npm_time_units input_params["npm_split_events"] = npm_split_events # Inject modality diff --git a/tests/test_step2.py b/tests/test_step2.py index f7e34d1..6ab85eb 100644 --- a/tests/test_step2.py +++ b/tests/test_step2.py @@ -154,16 +154,15 @@ def test_step2(tmp_path, session_subdir, storenames_map, modality): - Asserts storesList.csv exists and exactly matches the provided mapping (2xN) """ if session_subdir == "SampleData_Neurophotometrics/sampleData_NPM_3": - npm_timestamp_column_name = "ComputerTimestamp" - npm_time_unit = "milliseconds" + npm_timestamp_column_names = ["ComputerTimestamp", None] + npm_time_units = ["milliseconds", "seconds"] + npm_split_events = [False, True] else: - npm_timestamp_column_name = None - npm_time_unit = None + npm_timestamp_column_names = None + npm_time_units = None + npm_split_events = [True, True] if session_subdir == "SampleData_Neurophotometrics/sampleData_NPM_5": - npm_split_events = False - else: - npm_split_events = True - + npm_split_events = None # Source sample data src_base_dir = str(Path(".") / "testing_data") src_session = os.path.join(src_base_dir, session_subdir) @@ -193,8 +192,8 @@ def test_step2(tmp_path, session_subdir, storenames_map, modality): selected_folders=[str(session_copy)], storenames_map=storenames_map, modality=modality, - npm_timestamp_column_name=npm_timestamp_column_name, - npm_time_unit=npm_time_unit, + npm_timestamp_column_names=npm_timestamp_column_names, + npm_time_units=npm_time_units, npm_split_events=npm_split_events, ) diff --git a/tests/test_step3.py b/tests/test_step3.py index 26dac14..e4b5150 100644 --- a/tests/test_step3.py +++ b/tests/test_step3.py @@ -167,15 +167,15 @@ def test_step3(tmp_path, storenames_map, session_subdir, modality): the temp copy (never touching the original sample path). 
""" if session_subdir == "SampleData_Neurophotometrics/sampleData_NPM_3": - npm_timestamp_column_name = "ComputerTimestamp" - npm_time_unit = "milliseconds" + npm_timestamp_column_names = ["ComputerTimestamp", None] + npm_time_units = ["milliseconds", "seconds"] + npm_split_events = [False, True] else: - npm_timestamp_column_name = None - npm_time_unit = None + npm_timestamp_column_names = None + npm_time_units = None + npm_split_events = [True, True] if session_subdir == "SampleData_Neurophotometrics/sampleData_NPM_5": - npm_split_events = False - else: - npm_split_events = True + npm_split_events = None src_base_dir = str(Path(".") / "testing_data") src_session = os.path.join(src_base_dir, session_subdir) @@ -205,8 +205,8 @@ def test_step3(tmp_path, storenames_map, session_subdir, modality): selected_folders=[str(session_copy)], storenames_map=storenames_map, modality=modality, - npm_timestamp_column_name=npm_timestamp_column_name, - npm_time_unit=npm_time_unit, + npm_timestamp_column_names=npm_timestamp_column_names, + npm_time_units=npm_time_units, npm_split_events=npm_split_events, ) @@ -215,8 +215,8 @@ def test_step3(tmp_path, storenames_map, session_subdir, modality): base_dir=str(tmp_base), selected_folders=[str(session_copy)], modality=modality, - npm_timestamp_column_name=npm_timestamp_column_name, - npm_time_unit=npm_time_unit, + npm_timestamp_column_names=npm_timestamp_column_names, + npm_time_units=npm_time_units, npm_split_events=npm_split_events, ) diff --git a/tests/test_step4.py b/tests/test_step4.py index df18f75..8e5f989 100644 --- a/tests/test_step4.py +++ b/tests/test_step4.py @@ -185,15 +185,15 @@ def test_step4(tmp_path, monkeypatch, session_subdir, storenames_map, expected_r - Assertions confirm creation of key HDF5 outputs expected from Step 4. 
""" if session_subdir == "SampleData_Neurophotometrics/sampleData_NPM_3": - npm_timestamp_column_name = "ComputerTimestamp" - npm_time_unit = "milliseconds" + npm_timestamp_column_names = ["ComputerTimestamp", None] + npm_time_units = ["milliseconds", "seconds"] + npm_split_events = [False, True] else: - npm_timestamp_column_name = None - npm_time_unit = None + npm_timestamp_column_names = None + npm_time_units = None + npm_split_events = [True, True] if session_subdir == "SampleData_Neurophotometrics/sampleData_NPM_5": - npm_split_events = False - else: - npm_split_events = True + npm_split_events = None # Use the CSV sample session src_base_dir = str(Path(".") / "testing_data") @@ -227,8 +227,8 @@ def test_step4(tmp_path, monkeypatch, session_subdir, storenames_map, expected_r selected_folders=[str(session_copy)], storenames_map=storenames_map, modality=modality, - npm_timestamp_column_name=npm_timestamp_column_name, - npm_time_unit=npm_time_unit, + npm_timestamp_column_names=npm_timestamp_column_names, + npm_time_units=npm_time_units, npm_split_events=npm_split_events, ) @@ -237,8 +237,8 @@ def test_step4(tmp_path, monkeypatch, session_subdir, storenames_map, expected_r base_dir=str(tmp_base), selected_folders=[str(session_copy)], modality=modality, - npm_timestamp_column_name=npm_timestamp_column_name, - npm_time_unit=npm_time_unit, + npm_timestamp_column_names=npm_timestamp_column_names, + npm_time_units=npm_time_units, npm_split_events=npm_split_events, ) @@ -247,8 +247,8 @@ def test_step4(tmp_path, monkeypatch, session_subdir, storenames_map, expected_r base_dir=str(tmp_base), selected_folders=[str(session_copy)], modality=modality, - npm_timestamp_column_name=npm_timestamp_column_name, - npm_time_unit=npm_time_unit, + npm_timestamp_column_names=npm_timestamp_column_names, + npm_time_units=npm_time_units, npm_split_events=npm_split_events, ) diff --git a/tests/test_step5.py b/tests/test_step5.py index a8cdeb4..1837ebf 100644 --- a/tests/test_step5.py +++ b/tests/test_step5.py @@ -187,15 +187,15 @@ def test_step5(tmp_path, monkeypatch, session_subdir, storenames_map, expected_r - Defaults are used for input parameters; PSTH computation defaults to z_score. 
""" if session_subdir == "SampleData_Neurophotometrics/sampleData_NPM_3": - npm_timestamp_column_name = "ComputerTimestamp" - npm_time_unit = "milliseconds" + npm_timestamp_column_names = ["ComputerTimestamp", None] + npm_time_units = ["milliseconds", "seconds"] + npm_split_events = [False, True] else: - npm_timestamp_column_name = None - npm_time_unit = None + npm_timestamp_column_names = None + npm_time_units = None + npm_split_events = [True, True] if session_subdir == "SampleData_Neurophotometrics/sampleData_NPM_5": - npm_split_events = False - else: - npm_split_events = True + npm_split_events = None # Use the sample session src_base_dir = str(Path(".") / "testing_data") @@ -229,8 +229,8 @@ def test_step5(tmp_path, monkeypatch, session_subdir, storenames_map, expected_r selected_folders=[str(session_copy)], storenames_map=storenames_map, modality=modality, - npm_timestamp_column_name=npm_timestamp_column_name, - npm_time_unit=npm_time_unit, + npm_timestamp_column_names=npm_timestamp_column_names, + npm_time_units=npm_time_units, npm_split_events=npm_split_events, ) @@ -239,8 +239,8 @@ def test_step5(tmp_path, monkeypatch, session_subdir, storenames_map, expected_r base_dir=str(tmp_base), selected_folders=[str(session_copy)], modality=modality, - npm_timestamp_column_name=npm_timestamp_column_name, - npm_time_unit=npm_time_unit, + npm_timestamp_column_names=npm_timestamp_column_names, + npm_time_units=npm_time_units, npm_split_events=npm_split_events, ) @@ -249,8 +249,8 @@ def test_step5(tmp_path, monkeypatch, session_subdir, storenames_map, expected_r base_dir=str(tmp_base), selected_folders=[str(session_copy)], modality=modality, - npm_timestamp_column_name=npm_timestamp_column_name, - npm_time_unit=npm_time_unit, + npm_timestamp_column_names=npm_timestamp_column_names, + npm_time_units=npm_time_units, npm_split_events=npm_split_events, ) @@ -259,8 +259,8 @@ def test_step5(tmp_path, monkeypatch, session_subdir, storenames_map, expected_r base_dir=str(tmp_base), selected_folders=[str(session_copy)], modality=modality, - npm_timestamp_column_name=npm_timestamp_column_name, - npm_time_unit=npm_time_unit, + npm_timestamp_column_names=npm_timestamp_column_names, + npm_time_units=npm_time_units, npm_split_events=npm_split_events, ) From d55bba7887bfc2c94c05b6c26214dc4350495395 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 3 Dec 2025 10:59:09 -0800 Subject: [PATCH 044/150] Defined BaseRecordingExtractor. 
--- src/guppy/extractors/__init__.py | 1 + .../extractors/base_recording_extractor.py | 128 ++++++++++++++++++ 2 files changed, 129 insertions(+) create mode 100644 src/guppy/extractors/base_recording_extractor.py diff --git a/src/guppy/extractors/__init__.py b/src/guppy/extractors/__init__.py index b876012..75933c7 100644 --- a/src/guppy/extractors/__init__.py +++ b/src/guppy/extractors/__init__.py @@ -1,3 +1,4 @@ +from .base_recording_extractor import BaseRecordingExtractor from .tdt_recording_extractor import TdtRecordingExtractor, execute_readtev from .csv_recording_extractor import CsvRecordingExtractor, execute_import_csv from .doric_recording_extractor import DoricRecordingExtractor, execute_import_doric diff --git a/src/guppy/extractors/base_recording_extractor.py b/src/guppy/extractors/base_recording_extractor.py new file mode 100644 index 0000000..7058a0a --- /dev/null +++ b/src/guppy/extractors/base_recording_extractor.py @@ -0,0 +1,128 @@ +"""Base class for recording extractors.""" + +import os +from abc import ABC, abstractmethod +from typing import Any + +import h5py +import numpy as np + + +class BaseRecordingExtractor(ABC): + """ + Abstract base class for recording extractors. + + Defines the interface contract for reading and saving fiber photometry + data from various acquisition formats (TDT, Doric, CSV, NPM, etc.). + """ + + @property + @abstractmethod + def events(self) -> list[str]: + """ + List of available event/store names in the data. + + Returns + ------- + list of str + Names of all events or stores available in the dataset. + """ + pass + + @property + @abstractmethod + def flags(self) -> list: + """ + Format indicators or file type flags. + + Returns + ------- + list + Flags indicating file types or data formats. + """ + pass + + @abstractmethod + def read(self, *, events: list[str], outputPath: str, **kwargs) -> list[dict[str, Any]]: + """ + Read data from source files for specified events. + + Parameters + ---------- + events : list of str + List of event/store names to extract from the data. + outputPath : str + Path to the output directory. + **kwargs + Additional extractor-specific parameters. + + Returns + ------- + list of dict + List of dictionaries containing extracted data. Each dictionary + represents one event/store and contains keys such as 'storename', + 'timestamps', 'data', 'sampling_rate', etc. + """ + pass + + @abstractmethod + def save(self, *, output_dicts: list[dict[str, Any]], outputPath: str, **kwargs) -> None: + """ + Save extracted data dictionaries to HDF5 format. + + Parameters + ---------- + output_dicts : list of dict + List of data dictionaries from read(). + outputPath : str + Path to the output directory. + **kwargs + Additional extractor-specific parameters. + """ + pass + + @staticmethod + def _write_hdf5(data: Any, storename: str, output_path: str, key: str) -> None: + """ + Write data to HDF5 file. + + Parameters + ---------- + data : array-like + Data to write to the HDF5 file. + storename : str + Name of the store/event. + output_path : str + Directory path where HDF5 file will be written. + key : str + Key name for this data field in the HDF5 file. 
+ """ + # Replace invalid characters in storename to avoid filesystem errors + storename = storename.replace("\\", "_") + storename = storename.replace("/", "_") + + filepath = os.path.join(output_path, storename + ".hdf5") + + # Create new file if it doesn't exist + if not os.path.exists(filepath): + with h5py.File(filepath, "w") as f: + if isinstance(data, np.ndarray): + f.create_dataset(key, data=data, maxshape=(None,), chunks=True) + else: + f.create_dataset(key, data=data) + # Append to existing file + else: + with h5py.File(filepath, "r+") as f: + if key in list(f.keys()): + if isinstance(data, np.ndarray): + f[key].resize(data.shape) + arr = f[key] + arr[:] = data + else: + arr = f[key] + arr[()] = data + else: + if isinstance(data, np.ndarray): + f.create_dataset(key, data=data, maxshape=(None,), chunks=True) + else: + f.create_dataset(key, data=data) From 1689b7ef15c188e62f9f3a38fa63cc7329b08d2c Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 3 Dec 2025 11:00:47 -0800 Subject: [PATCH 045/150] Removed obsolete intermediates extractor steps --- src/guppy/csv_step2.py | 110 ----------- src/guppy/csv_step3.py | 66 ------- src/guppy/doric_step2.py | 92 --------- src/guppy/doric_step3.py | 159 --------------- src/guppy/npm_step2.py | 411 --------------------------------------- src/guppy/tdt_step2.py | 28 --- src/guppy/tdt_step3.py | 207 -------------------- 7 files changed, 1073 deletions(-) delete mode 100644 src/guppy/csv_step2.py delete mode 100644 src/guppy/csv_step3.py delete mode 100644 src/guppy/doric_step2.py delete mode 100644 src/guppy/doric_step3.py delete mode 100644 src/guppy/npm_step2.py delete mode 100644 src/guppy/tdt_step2.py delete mode 100644 src/guppy/tdt_step3.py diff --git a/src/guppy/csv_step2.py b/src/guppy/csv_step2.py deleted file mode 100644 index ba4b34f..0000000 --- a/src/guppy/csv_step2.py +++ /dev/null @@ -1,110 +0,0 @@ -import glob -import logging -import os - -import numpy as np -import pandas as pd - -logger = logging.getLogger(__name__) - - -def check_header(df): - arr = list(df.columns) - check_float = [] - for i in arr: - try: - check_float.append(float(i)) - except: - pass - - return arr, check_float - - -def import_csv_step2(filepath): - logger.debug("If it exists, importing either NPM or Doric or csv file based on the structure of file") - path = sorted(glob.glob(os.path.join(filepath, "*.csv"))) - - path = sorted(list(set(path))) - flag = "None" - event_from_filename = [] - flag_arr = [] - for i in range(len(path)): - ext = os.path.basename(path[i]).split(".")[-1] - assert ext == "csv", "Only .csv files are supported by import_csv function." - df = pd.read_csv(path[i], header=None, nrows=2, index_col=False, dtype=str) - df = df.dropna(axis=1, how="all") - df_arr = np.array(df).flatten() - check_all_str = [] - for element in df_arr: - try: - float(element) - except: - check_all_str.append(i) - assert len(check_all_str) != len( - df_arr - ), "This file appears to be doric .csv. This function only supports standard .csv files." 
- df = pd.read_csv(path[i], index_col=False) - - _, value = check_header(df) - - # check dataframe structure and read data accordingly - if len(value) > 0: - columns_isstr = False - df = pd.read_csv(path[i], header=None) - cols = np.array(list(df.columns), dtype=str) - else: - df = df - columns_isstr = True - cols = np.array(list(df.columns), dtype=str) - # check the structure of dataframe and assign flag to the type of file - if len(cols) == 1: - if cols[0].lower() != "timestamps": - logger.error("\033[1m" + "Column name should be timestamps (all lower-cases)" + "\033[0m") - raise Exception("\033[1m" + "Column name should be timestamps (all lower-cases)" + "\033[0m") - else: - flag = "event_csv" - elif len(cols) == 3: - arr1 = np.array(["timestamps", "data", "sampling_rate"]) - arr2 = np.char.lower(np.array(cols)) - if (np.sort(arr1) == np.sort(arr2)).all() == False: - logger.error( - "\033[1m" - + "Column names should be timestamps, data and sampling_rate (all lower-cases)" - + "\033[0m" - ) - raise Exception( - "\033[1m" - + "Column names should be timestamps, data and sampling_rate (all lower-cases)" - + "\033[0m" - ) - else: - flag = "data_csv" - elif len(cols) == 2: - raise ValueError( - "Data appears to be Neurophotometrics csv. Please use import_npm_csv function to import the data." - ) - elif len(cols) >= 2: - raise ValueError( - "Data appears to be Neurophotometrics csv. Please use import_npm_csv function to import the data." - ) - else: - logger.error("Number of columns in csv file does not make sense.") - raise Exception("Number of columns in csv file does not make sense.") - - if columns_isstr == True and ( - "flags" in np.char.lower(np.array(cols)) or "ledstate" in np.char.lower(np.array(cols)) - ): - flag = flag + "_v2" - else: - flag = flag - - flag_arr.append(flag) - logger.info(flag) - assert ( - flag == "event_csv" or flag == "data_csv" - ), "This function only supports standard event_csv and data_csv files." - name = os.path.basename(path[i]).split(".")[0] - event_from_filename.append(name) - - logger.info("Importing of csv file is done.") - return event_from_filename, flag_arr diff --git a/src/guppy/csv_step3.py b/src/guppy/csv_step3.py deleted file mode 100644 index 985959a..0000000 --- a/src/guppy/csv_step3.py +++ /dev/null @@ -1,66 +0,0 @@ -import logging -import multiprocessing as mp -import os -import time -from itertools import repeat - -import numpy as np -import pandas as pd - -from guppy.common_step3 import write_hdf5 - -logger = logging.getLogger(__name__) - - -def execute_import_csv(filepath, event, outputPath, numProcesses=mp.cpu_count()): - # logger.info("Reading data for event {} ...".format(event)) - - start = time.time() - with mp.Pool(numProcesses) as p: - p.starmap(import_csv, zip(repeat(filepath), event, repeat(outputPath))) - logger.info("Time taken = {0:.5f}".format(time.time() - start)) - - -# function to read event timestamps csv file. 
-def import_csv(filepath, event, outputPath): - logger.debug("\033[1m" + "Trying to read data for {} from csv file.".format(event) + "\033[0m") - if not os.path.exists(os.path.join(filepath, event + ".csv")): - logger.error("\033[1m" + "No csv file found for event {}".format(event) + "\033[0m") - raise Exception("\033[1m" + "No csv file found for event {}".format(event) + "\033[0m") - - df = pd.read_csv(os.path.join(filepath, event + ".csv"), index_col=False) - data = df - key = list(df.columns) - - if len(key) == 3: - arr1 = np.array(["timestamps", "data", "sampling_rate"]) - arr2 = np.char.lower(np.array(key)) - if (np.sort(arr1) == np.sort(arr2)).all() == False: - logger.error("\033[1m" + "Column names should be timestamps, data and sampling_rate" + "\033[0m") - raise Exception("\033[1m" + "Column names should be timestamps, data and sampling_rate" + "\033[0m") - - if len(key) == 1: - if key[0].lower() != "timestamps": - logger.error("\033[1m" + "Column names should be timestamps, data and sampling_rate" + "\033[0m") - raise Exception("\033[1m" + "Column name should be timestamps" + "\033[0m") - - if len(key) != 3 and len(key) != 1: - logger.error( - "\033[1m" - + "Number of columns in csv file should be either three or one. Three columns if \ - the file is for control or signal data or one column if the file is for event TTLs." - + "\033[0m" - ) - raise Exception( - "\033[1m" - + "Number of columns in csv file should be either three or one. Three columns if \ - the file is for control or signal data or one column if the file is for event TTLs." - + "\033[0m" - ) - - for i in range(len(key)): - write_hdf5(data[key[i]].dropna(), event, outputPath, key[i].lower()) - - logger.info("\033[1m" + "Reading data for {} from csv file is completed.".format(event) + "\033[0m") - - return data, key diff --git a/src/guppy/doric_step2.py b/src/guppy/doric_step2.py deleted file mode 100644 index 26ab22e..0000000 --- a/src/guppy/doric_step2.py +++ /dev/null @@ -1,92 +0,0 @@ -import glob -import logging -import os - -import h5py -import numpy as np -import pandas as pd - -logger = logging.getLogger(__name__) - - -def import_doric(filepath): - - logger.debug("If it exists, importing Doric file based on the structure of file") - path = sorted(glob.glob(os.path.join(filepath, "*.csv"))) + sorted(glob.glob(os.path.join(filepath, "*.doric"))) - - path = sorted(list(set(path))) - flag = "None" - event_from_filename = [] - flag_arr = [] - for i in range(len(path)): - ext = os.path.basename(path[i]).split(".")[-1] - if ext == "doric": - key_names = read_doric(path[i]) - event_from_filename.extend(key_names) - flag = "doric_doric" - else: - df = pd.read_csv(path[i], header=None, nrows=2, index_col=False, dtype=str) - df = df.dropna(axis=1, how="all") - df_arr = np.array(df).flatten() - check_all_str = [] - for element in df_arr: - try: - float(element) - except: - check_all_str.append(i) - assert len(check_all_str) == len( - df_arr - ), "This file appears to be standard .csv. This function only supports doric .csv files." 
- df = pd.read_csv(path[i], header=1, index_col=False, nrows=10) - df = df.drop(["Time(s)"], axis=1) - event_from_filename.extend(list(df.columns)) - flag = "doric_csv" - logger.info(flag) - logger.info("Importing of Doric file is done.") - return event_from_filename, flag_arr - - -def read_doric(filepath): - with h5py.File(filepath, "r") as f: - if "Traces" in list(f.keys()): - keys = access_keys_doricV1(f) - elif list(f.keys()) == ["Configurations", "DataAcquisition"]: - keys = access_keys_doricV6(f) - - return keys - - -def access_keys_doricV6(doric_file): - data = [doric_file["DataAcquisition"]] - res = [] - while len(data) != 0: - members = len(data) - while members != 0: - members -= 1 - data, last_element = separate_last_element(data) - if isinstance(last_element, h5py.Dataset) and not last_element.name.endswith("/Time"): - res.append(last_element.name) - elif isinstance(last_element, h5py.Group): - data.extend(reversed([last_element[k] for k in last_element.keys()])) - - keys = [] - for element in res: - sep_values = element.split("/") - if sep_values[-1] == "Values": - keys.append(f"{sep_values[-3]}/{sep_values[-2]}") - else: - keys.append(f"{sep_values[-2]}/{sep_values[-1]}") - - return keys - - -def access_keys_doricV1(doric_file): - keys = list(doric_file["Traces"]["Console"].keys()) - keys.remove("Time(s)") - - return keys - - -def separate_last_element(arr): - l = arr[-1] - return arr[:-1], l diff --git a/src/guppy/doric_step3.py b/src/guppy/doric_step3.py deleted file mode 100644 index e9fd7cc..0000000 --- a/src/guppy/doric_step3.py +++ /dev/null @@ -1,159 +0,0 @@ -import glob -import logging -import os -import re -import warnings - -import h5py -import numpy as np -import pandas as pd - -from guppy.common_step3 import write_hdf5 - -logger = logging.getLogger(__name__) - - -def check_doric(filepath): - logger.debug("Checking if doric file exists") - path = glob.glob(os.path.join(filepath, "*.csv")) + glob.glob(os.path.join(filepath, "*.doric")) - - flag_arr = [] - for i in range(len(path)): - ext = os.path.basename(path[i]).split(".")[-1] - if ext == "csv": - with warnings.catch_warnings(): - warnings.simplefilter("error") - try: - df = pd.read_csv(path[i], index_col=False, dtype=float) - except: - df = pd.read_csv(path[i], header=1, index_col=False, nrows=10) - flag = "doric_csv" - flag_arr.append(flag) - elif ext == "doric": - flag = "doric_doric" - flag_arr.append(flag) - else: - pass - - if len(flag_arr) > 1: - logger.error("Two doric files are present at the same location") - raise Exception("Two doric files are present at the same location") - if len(flag_arr) == 0: - logger.error("\033[1m" + "Doric file not found." 
+ "\033[1m") - return 0 - logger.info("Doric file found.") - return flag_arr[0] - - -def execute_import_doric(filepath, storesList, flag, outputPath): - flag = check_doric(filepath) - - if flag == "doric_csv": - path = glob.glob(os.path.join(filepath, "*.csv")) - if len(path) > 1: - logger.error("An error occurred : More than one Doric csv file present at the location") - raise Exception("More than one Doric csv file present at the location") - else: - df = pd.read_csv(path[0], header=1, index_col=False) - df = df.dropna(axis=1, how="all") - df = df.dropna(axis=0, how="any") - df["Time(s)"] = df["Time(s)"] - df["Time(s)"].to_numpy()[0] - for i in range(storesList.shape[1]): - if "control" in storesList[1, i] or "signal" in storesList[1, i]: - timestamps = np.array(df["Time(s)"]) - sampling_rate = np.array([1 / (timestamps[-1] - timestamps[-2])]) - write_hdf5(sampling_rate, storesList[0, i], outputPath, "sampling_rate") - write_hdf5(df["Time(s)"].to_numpy(), storesList[0, i], outputPath, "timestamps") - write_hdf5(df[storesList[0, i]].to_numpy(), storesList[0, i], outputPath, "data") - else: - ttl = df[storesList[0, i]] - indices = np.where(ttl <= 0)[0] - diff_indices = np.where(np.diff(indices) > 1)[0] - write_hdf5( - df["Time(s)"][indices[diff_indices] + 1].to_numpy(), storesList[0, i], outputPath, "timestamps" - ) - else: - path = glob.glob(os.path.join(filepath, "*.doric")) - if len(path) > 1: - logger.error("An error occurred : More than one Doric file present at the location") - raise Exception("More than one Doric file present at the location") - else: - with h5py.File(path[0], "r") as f: - if "Traces" in list(f.keys()): - keys = access_data_doricV1(f, storesList, outputPath) - elif list(f.keys()) == ["Configurations", "DataAcquisition"]: - keys = access_data_doricV6(f, storesList, outputPath) - - -def access_data_doricV6(doric_file, storesList, outputPath): - data = [doric_file["DataAcquisition"]] - res = [] - while len(data) != 0: - members = len(data) - while members != 0: - members -= 1 - data, last_element = separate_last_element(data) - if isinstance(last_element, h5py.Dataset) and not last_element.name.endswith("/Time"): - res.append(last_element.name) - elif isinstance(last_element, h5py.Group): - data.extend(reversed([last_element[k] for k in last_element.keys()])) - - decide_path = [] - for element in res: - sep_values = element.split("/") - if sep_values[-1] == "Values": - if f"{sep_values[-3]}/{sep_values[-2]}" in storesList[0, :]: - decide_path.append(element) - else: - if f"{sep_values[-2]}/{sep_values[-1]}" in storesList[0, :]: - decide_path.append(element) - - for i in range(storesList.shape[1]): - if "control" in storesList[1, i] or "signal" in storesList[1, i]: - regex = re.compile("(.*?)" + str(storesList[0, i]) + "(.*?)") - idx = [i for i in range(len(decide_path)) if regex.match(decide_path[i])] - if len(idx) > 1: - logger.error("More than one string matched (which should not be the case)") - raise Exception("More than one string matched (which should not be the case)") - idx = idx[0] - data = np.array(doric_file[decide_path[idx]]) - timestamps = np.array(doric_file[decide_path[idx].rsplit("/", 1)[0] + "/Time"]) - sampling_rate = np.array([1 / (timestamps[-1] - timestamps[-2])]) - write_hdf5(sampling_rate, storesList[0, i], outputPath, "sampling_rate") - write_hdf5(timestamps, storesList[0, i], outputPath, "timestamps") - write_hdf5(data, storesList[0, i], outputPath, "data") - else: - regex = re.compile("(.*?)" + storesList[0, i] + "$") - idx = [i for i in 
range(len(decide_path)) if regex.match(decide_path[i])] - if len(idx) > 1: - logger.error("More than one string matched (which should not be the case)") - raise Exception("More than one string matched (which should not be the case)") - idx = idx[0] - ttl = np.array(doric_file[decide_path[idx]]) - timestamps = np.array(doric_file[decide_path[idx].rsplit("/", 1)[0] + "/Time"]) - indices = np.where(ttl <= 0)[0] - diff_indices = np.where(np.diff(indices) > 1)[0] - write_hdf5(timestamps[indices[diff_indices] + 1], storesList[0, i], outputPath, "timestamps") - - -def access_data_doricV1(doric_file, storesList, outputPath): - keys = list(doric_file["Traces"]["Console"].keys()) - for i in range(storesList.shape[1]): - if "control" in storesList[1, i] or "signal" in storesList[1, i]: - timestamps = np.array(doric_file["Traces"]["Console"]["Time(s)"]["Console_time(s)"]) - sampling_rate = np.array([1 / (timestamps[-1] - timestamps[-2])]) - data = np.array(doric_file["Traces"]["Console"][storesList[0, i]][storesList[0, i]]) - write_hdf5(sampling_rate, storesList[0, i], outputPath, "sampling_rate") - write_hdf5(timestamps, storesList[0, i], outputPath, "timestamps") - write_hdf5(data, storesList[0, i], outputPath, "data") - else: - timestamps = np.array(doric_file["Traces"]["Console"]["Time(s)"]["Console_time(s)"]) - ttl = np.array(doric_file["Traces"]["Console"][storesList[0, i]][storesList[0, i]]) - indices = np.where(ttl <= 0)[0] - diff_indices = np.where(np.diff(indices) > 1)[0] - write_hdf5(timestamps[indices[diff_indices] + 1], storesList[0, i], outputPath, "timestamps") - - -def separate_last_element(arr): - l = arr[-1] - return arr[:-1], l diff --git a/src/guppy/npm_step2.py b/src/guppy/npm_step2.py deleted file mode 100644 index 14b776f..0000000 --- a/src/guppy/npm_step2.py +++ /dev/null @@ -1,411 +0,0 @@ -import glob -import logging -import os -import tkinter as tk -from tkinter import StringVar, messagebox, ttk - -import numpy as np -import pandas as pd -import panel as pn - -pn.extension() - -logger = logging.getLogger(__name__) - - -def import_npm(filepath, num_ch, inputParameters=None): - - logger.debug("If it exists, importing NPM file based on the structure of file") - # Headless configuration (used to avoid any UI prompts when running tests) - headless = bool(os.environ.get("GUPPY_BASE_DIR")) - npm_timestamp_column_name = None - npm_time_unit = None - npm_split_events = None - if isinstance(inputParameters, dict): - npm_timestamp_column_name = inputParameters.get("npm_timestamp_column_name") - npm_time_unit = inputParameters.get("npm_time_unit", "seconds") - npm_split_events = inputParameters.get("npm_split_events", True) - path = sorted(glob.glob(os.path.join(filepath, "*.csv"))) + sorted(glob.glob(os.path.join(filepath, "*.doric"))) - path_chev = glob.glob(os.path.join(filepath, "*chev*")) - path_chod = glob.glob(os.path.join(filepath, "*chod*")) - path_chpr = glob.glob(os.path.join(filepath, "*chpr*")) - path_event = glob.glob(os.path.join(filepath, "event*")) - # path_sig = glob.glob(os.path.join(filepath, 'sig*')) # TODO: what is this for? - path_chev_chod_event = path_chev + path_chod + path_event + path_chpr - - path = sorted(list(set(path) - set(path_chev_chod_event))) - flag = "None" - event_from_filename = [] - flag_arr = [] - for i in range(len(path)): - dirname = os.path.dirname(path[i]) - ext = os.path.basename(path[i]).split(".")[-1] - assert ext != "doric", "Doric files are not supported by import_npm function." 
- df = pd.read_csv(path[i], header=None, nrows=2, index_col=False, dtype=str) - df = df.dropna(axis=1, how="all") - df_arr = np.array(df).flatten() - check_all_str = [] - for element in df_arr: - try: - float(element) - except: - check_all_str.append(i) - assert len(check_all_str) != len( - df_arr - ), "This file appears to be doric .csv. This function only supports NPM .csv files." - df = pd.read_csv(path[i], index_col=False) - _, value = check_header(df) - - # check dataframe structure and read data accordingly - if len(value) > 0: - columns_isstr = False - df = pd.read_csv(path[i], header=None) - cols = np.array(list(df.columns), dtype=str) - else: - df = df - columns_isstr = True - cols = np.array(list(df.columns), dtype=str) - # check the structure of dataframe and assign flag to the type of file - assert len(cols) != 1, "File appears to be event .csv. This function only supports NPM .csv files." - assert len(cols) != 3, "File appears to be data .csv. This function only supports NPM .csv files." - if len(cols) == 2: - flag = "event_or_data_np" - elif len(cols) >= 2: - flag = "data_np" - else: - logger.error("Number of columns in csv file does not make sense.") - raise Exception("Number of columns in csv file does not make sense.") - - if columns_isstr == True and ( - "flags" in np.char.lower(np.array(cols)) or "ledstate" in np.char.lower(np.array(cols)) - ): - flag = flag + "_v2" - else: - flag = flag - - # used assigned flags to process the files and read the data - if flag == "event_or_data_np": - arr = list(df.iloc[:, 1]) - check_float = [True for i in arr if isinstance(i, float)] - if len(arr) == len(check_float) and columns_isstr == False: - flag = "data_np" - elif columns_isstr == True and ("value" in np.char.lower(np.array(cols))): - flag = "event_np" - else: - flag = "event_np" - - flag_arr.append(flag) - logger.info(flag) - if flag == "data_np": - file = f"file{str(i)}_" - df, indices_dict, _ = decide_indices(file, df, flag, num_ch) - keys = list(indices_dict.keys()) - for k in range(len(keys)): - for j in range(df.shape[1]): - if j == 0: - timestamps = df.iloc[:, j][indices_dict[keys[k]]] - # timestamps_odd = df.iloc[:,j][odd_indices] - else: - d = dict() - d["timestamps"] = timestamps - d["data"] = df.iloc[:, j][indices_dict[keys[k]]] - - df_ch = pd.DataFrame(d) - df_ch.to_csv(os.path.join(dirname, keys[k] + str(j) + ".csv"), index=False) - event_from_filename.append(keys[k] + str(j)) - - elif flag == "event_np": - type_val = np.array(df.iloc[:, 1]) - type_val_unique = np.unique(type_val) - if headless: - response = 1 if bool(npm_split_events) else 0 - else: - window = tk.Tk() - if len(type_val_unique) > 1: - response = messagebox.askyesno( - "Multiple event TTLs", - "Based on the TTL file,\ - it looks like TTLs \ - belongs to multiple behavior type. 
\ - Do you want to create multiple files for each \ - behavior type ?", - ) - else: - response = 0 - window.destroy() - if response == 1: - timestamps = np.array(df.iloc[:, 0]) - for j in range(len(type_val_unique)): - idx = np.where(type_val == type_val_unique[j]) - d = dict() - d["timestamps"] = timestamps[idx] - df_new = pd.DataFrame(d) - df_new.to_csv(os.path.join(dirname, "event" + str(type_val_unique[j]) + ".csv"), index=False) - event_from_filename.append("event" + str(type_val_unique[j])) - else: - timestamps = np.array(df.iloc[:, 0]) - d = dict() - d["timestamps"] = timestamps - df_new = pd.DataFrame(d) - df_new.to_csv(os.path.join(dirname, "event" + str(0) + ".csv"), index=False) - event_from_filename.append("event" + str(0)) - else: - file = f"file{str(i)}_" - df, ts_unit = decide_ts_unit_for_npm( - df, timestamp_column_name=npm_timestamp_column_name, time_unit=npm_time_unit, headless=headless - ) - df, indices_dict, _ = decide_indices(file, df, flag) - keys = list(indices_dict.keys()) - for k in range(len(keys)): - for j in range(df.shape[1]): - if j == 0: - timestamps = df.iloc[:, j][indices_dict[keys[k]]] - # timestamps_odd = df.iloc[:,j][odd_indices] - else: - d = dict() - d["timestamps"] = timestamps - d["data"] = df.iloc[:, j][indices_dict[keys[k]]] - - df_ch = pd.DataFrame(d) - df_ch.to_csv(os.path.join(dirname, keys[k] + str(j) + ".csv"), index=False) - event_from_filename.append(keys[k] + str(j)) - - path_chev = glob.glob(os.path.join(filepath, "*chev*")) - path_chod = glob.glob(os.path.join(filepath, "*chod*")) - path_chpr = glob.glob(os.path.join(filepath, "*chpr*")) - path_event = glob.glob(os.path.join(filepath, "event*")) - # path_sig = glob.glob(os.path.join(filepath, 'sig*')) - path_chev_chod_chpr = [path_chev, path_chod, path_chpr] - if ( - ("data_np_v2" in flag_arr or "data_np" in flag_arr) and ("event_np" in flag_arr) and (i == len(path) - 1) - ) or ( - ("data_np_v2" in flag_arr or "data_np" in flag_arr) and (i == len(path) - 1) - ): # i==len(path)-1 and or 'event_np' in flag - num_path_chev, num_path_chod, num_path_chpr = len(path_chev), len(path_chod), len(path_chpr) - arr_len, no_ch = [], [] - for i in range(len(path_chev_chod_chpr)): - if len(path_chev_chod_chpr[i]) > 0: - arr_len.append(len(path_chev_chod_chpr[i])) - else: - continue - - unique_arr_len = np.unique(np.array(arr_len)) - if "data_np_v2" in flag_arr: - if ts_unit == "seconds": - divisor = 1 - elif ts_unit == "milliseconds": - divisor = 1e3 - else: - divisor = 1e6 - else: - divisor = 1000 - - for j in range(len(path_event)): - df_event = pd.read_csv(path_event[j]) - df_chev = pd.read_csv(path_chev[0]) - df_event["timestamps"] = (df_event["timestamps"] - df_chev["timestamps"][0]) / divisor - df_event.to_csv(path_event[j], index=False) - if unique_arr_len.shape[0] == 1: - for j in range(len(path_chev)): - if file + "chev" in indices_dict.keys(): - df_chev = pd.read_csv(path_chev[j]) - df_chev["timestamps"] = (df_chev["timestamps"] - df_chev["timestamps"][0]) / divisor - df_chev["sampling_rate"] = np.full(df_chev.shape[0], np.nan) - df_chev.at[0, "sampling_rate"] = df_chev.shape[0] / ( - df_chev["timestamps"].iloc[-1] - df_chev["timestamps"].iloc[0] - ) - df_chev.to_csv(path_chev[j], index=False) - - if file + "chod" in indices_dict.keys(): - df_chod = pd.read_csv(path_chod[j]) - df_chod["timestamps"] = df_chev["timestamps"] - df_chod["sampling_rate"] = np.full(df_chod.shape[0], np.nan) - df_chod.at[0, "sampling_rate"] = df_chev["sampling_rate"][0] - df_chod.to_csv(path_chod[j], index=False) - - 
if file + "chpr" in indices_dict.keys(): - df_chpr = pd.read_csv(path_chpr[j]) - df_chpr["timestamps"] = df_chev["timestamps"] - df_chpr["sampling_rate"] = np.full(df_chpr.shape[0], np.nan) - df_chpr.at[0, "sampling_rate"] = df_chev["sampling_rate"][0] - df_chpr.to_csv(path_chpr[j], index=False) - else: - logger.error("Number of channels should be same for all regions.") - raise Exception("Number of channels should be same for all regions.") - logger.info("Importing of NPM file is done.") - return event_from_filename, flag_arr - - -def check_header(df): - arr = list(df.columns) - check_float = [] - for i in arr: - try: - check_float.append(float(i)) - except: - pass - - return arr, check_float - - -# function to decide indices of interleaved channels -# in neurophotometrics data -def decide_indices(file, df, flag, num_ch=2): - ch_name = [file + "chev", file + "chod", file + "chpr"] - if len(ch_name) < num_ch: - logger.error( - "Number of channels parameters in Input Parameters GUI is more than 3. \ - Looks like there are more than 3 channels in the file. Reading of these files\ - are not supported. Reach out to us if you get this error message." - ) - raise Exception( - "Number of channels parameters in Input Parameters GUI is more than 3. \ - Looks like there are more than 3 channels in the file. Reading of these files\ - are not supported. Reach out to us if you get this error message." - ) - if flag == "data_np": - indices_dict = dict() - for i in range(num_ch): - indices_dict[ch_name[i]] = np.arange(i, df.shape[0], num_ch) - - else: - cols = np.array(list(df.columns)) - if "flags" in np.char.lower(np.array(cols)): - arr = ["FrameCounter", "Flags"] - state = np.array(df["Flags"]) - elif "ledstate" in np.char.lower(np.array(cols)): - arr = ["FrameCounter", "LedState"] - state = np.array(df["LedState"]) - else: - logger.error( - "File type shows Neurophotometrics newer version \ - data but column names does not have Flags or LedState" - ) - raise Exception( - "File type shows Neurophotometrics newer version \ - data but column names does not have Flags or LedState" - ) - - num_ch, ch = check_channels(state) - indices_dict = dict() - for i in range(num_ch): - first_occurrence = np.where(state == ch[i])[0] - indices_dict[ch_name[i]] = np.arange(first_occurrence[0], df.shape[0], num_ch) - - df = df.drop(arr, axis=1) - - return df, indices_dict, num_ch - - -# check flag consistency in neurophotometrics data -def check_channels(state): - state = state.astype(int) - unique_state = np.unique(state[2:12]) - if unique_state.shape[0] > 3: - logger.error( - "Looks like there are more than 3 channels in the file. Reading of these files\ - are not supported. Reach out to us if you get this error message." - ) - raise Exception( - "Looks like there are more than 3 channels in the file. Reading of these files\ - are not supported. Reach out to us if you get this error message." 
- ) - - return unique_state.shape[0], unique_state - - -# function to decide NPM timestamps unit (seconds, ms or us) -def decide_ts_unit_for_npm(df, timestamp_column_name=None, time_unit=None, headless=False): - col_names = np.array(list(df.columns)) - col_names_ts = [""] - for name in col_names: - if "timestamp" in name.lower(): - col_names_ts.append(name) - - ts_unit = "seconds" - if len(col_names_ts) > 2: - # Headless path: auto-select column/unit without any UI - if headless: - if timestamp_column_name is not None: - assert ( - timestamp_column_name in col_names_ts - ), f"Provided timestamp_column_name '{timestamp_column_name}' not found in columns {col_names_ts[1:]}" - chosen = timestamp_column_name - else: - chosen = col_names_ts[1] - df.insert(1, "Timestamp", df[chosen]) - df = df.drop(col_names_ts[1:], axis=1) - valid_units = {"seconds", "milliseconds", "microseconds"} - ts_unit = time_unit if (isinstance(time_unit, str) and time_unit in valid_units) else "seconds" - return df, ts_unit - # def comboBoxSelected(event): - # logger.info(event.widget.get()) - - window = tk.Tk() - window.title("Select appropriate options for timestamps") - window.geometry("500x200") - holdComboboxValues = dict() - - timestamps_label = ttk.Label(window, text="Select which timestamps to use : ").grid( - row=0, column=1, pady=25, padx=25 - ) - holdComboboxValues["timestamps"] = StringVar() - timestamps_combo = ttk.Combobox(window, values=col_names_ts, textvariable=holdComboboxValues["timestamps"]) - timestamps_combo.grid(row=0, column=2, pady=25, padx=25) - timestamps_combo.current(0) - # timestamps_combo.bind("<>", comboBoxSelected) - - time_unit_label = ttk.Label(window, text="Select timestamps unit : ").grid(row=1, column=1, pady=25, padx=25) - holdComboboxValues["time_unit"] = StringVar() - time_unit_combo = ttk.Combobox( - window, values=["", "seconds", "milliseconds", "microseconds"], textvariable=holdComboboxValues["time_unit"] - ) - time_unit_combo.grid(row=1, column=2, pady=25, padx=25) - time_unit_combo.current(0) - # time_unit_combo.bind("<>", comboBoxSelected) - window.lift() - window.after(500, lambda: window.lift()) - window.mainloop() - - if holdComboboxValues["timestamps"].get(): - df.insert(1, "Timestamp", df[holdComboboxValues["timestamps"].get()]) - df = df.drop(col_names_ts[1:], axis=1) - else: - messagebox.showerror( - "All options not selected", - "All the options for timestamps \ - were not selected. Please select appropriate options", - ) - logger.error( - "All the options for timestamps \ - were not selected. Please select appropriate options" - ) - raise Exception( - "All the options for timestamps \ - were not selected. Please select appropriate options" - ) - if holdComboboxValues["time_unit"].get(): - if holdComboboxValues["time_unit"].get() == "seconds": - ts_unit = holdComboboxValues["time_unit"].get() - elif holdComboboxValues["time_unit"].get() == "milliseconds": - ts_unit = holdComboboxValues["time_unit"].get() - else: - ts_unit = holdComboboxValues["time_unit"].get() - else: - messagebox.showerror( - "All options not selected", - "All the options for timestamps \ - were not selected. Please select appropriate options", - ) - logger.error( - "All the options for timestamps \ - were not selected. Please select appropriate options" - ) - raise Exception( - "All the options for timestamps \ - were not selected. 
Please select appropriate options" - ) - else: - pass - - return df, ts_unit diff --git a/src/guppy/tdt_step2.py b/src/guppy/tdt_step2.py deleted file mode 100644 index 130ace8..0000000 --- a/src/guppy/tdt_step2.py +++ /dev/null @@ -1,28 +0,0 @@ -import glob -import logging -import os - -import numpy as np -import pandas as pd -from numpy import float32, float64, int32, int64, uint16 - -logger = logging.getLogger(__name__) - - -# function to read 'tsq' file -def readtsq(filepath): - names = ("size", "type", "name", "chan", "sort_code", "timestamp", "fp_loc", "strobe", "format", "frequency") - formats = (int32, int32, "S4", uint16, uint16, float64, int64, float64, int32, float32) - offsets = 0, 4, 8, 12, 14, 16, 24, 24, 32, 36 - tsq_dtype = np.dtype({"names": names, "formats": formats, "offsets": offsets}, align=True) - path = glob.glob(os.path.join(filepath, "*.tsq")) - if len(path) > 1: - logger.error("Two tsq files are present at the location.") - raise Exception("Two tsq files are present at the location.") - elif len(path) == 0: - return 0 - else: - path = path[0] - tsq = np.fromfile(path, dtype=tsq_dtype) - df = pd.DataFrame(tsq) - return df diff --git a/src/guppy/tdt_step3.py b/src/guppy/tdt_step3.py deleted file mode 100644 index be92d4c..0000000 --- a/src/guppy/tdt_step3.py +++ /dev/null @@ -1,207 +0,0 @@ -import glob -import logging -import multiprocessing as mp -import os -import time -from itertools import repeat - -import numpy as np -import pandas as pd -from numpy import float32, float64, int32, int64, uint16 - -from guppy.common_step3 import write_hdf5 - -logger = logging.getLogger(__name__) - - -# function to read tsq file -def readtsq(filepath): - logger.debug("Trying to read tsq file.") - names = ("size", "type", "name", "chan", "sort_code", "timestamp", "fp_loc", "strobe", "format", "frequency") - formats = (int32, int32, "S4", uint16, uint16, float64, int64, float64, int32, float32) - offsets = 0, 4, 8, 12, 14, 16, 24, 24, 32, 36 - tsq_dtype = np.dtype({"names": names, "formats": formats, "offsets": offsets}, align=True) - path = glob.glob(os.path.join(filepath, "*.tsq")) - if len(path) > 1: - logger.error("Two tsq files are present at the location.") - raise Exception("Two tsq files are present at the location.") - elif len(path) == 0: - logger.info("\033[1m" + "tsq file not found." 
+ "\033[1m") - return 0, 0 - else: - path = path[0] - flag = "tsq" - - # reading tsq file - tsq = np.fromfile(path, dtype=tsq_dtype) - - # creating dataframe of the data - df = pd.DataFrame(tsq) - - logger.info("Data from tsq file fetched.") - return df, flag - - -# function to execute readtev function using multiprocessing to make it faster -def execute_readtev(filepath, event, outputPath, numProcesses=mp.cpu_count()): - data, _ = readtsq(filepath) - - start = time.time() - with mp.Pool(numProcesses) as p: - p.starmap(readtev, zip(repeat(data), repeat(filepath), event, repeat(outputPath))) - # p = mp.Pool(mp.cpu_count()) - # p.starmap(readtev, zip(repeat(data), repeat(filepath), event, repeat(outputPath))) - # p.close() - # p.join() - logger.info("Time taken = {0:.5f}".format(time.time() - start)) - - -# function to read tev file -def readtev(data, filepath, event, outputPath): - - logger.debug("Reading data for event {} ...".format(event)) - tevfilepath = glob.glob(os.path.join(filepath, "*.tev")) - if len(tevfilepath) > 1: - raise Exception("Two tev files are present at the location.") - else: - tevfilepath = tevfilepath[0] - - data["name"] = np.asarray(data["name"], dtype=str) - - allnames = np.unique(data["name"]) - - index = [] - for i in range(len(allnames)): - length = len(str(allnames[i])) - if length < 4: - index.append(i) - - allnames = np.delete(allnames, index, 0) - - eventNew = np.array(list(event)) - - # logger.info(allnames) - # logger.info(eventNew) - row = ismember(data["name"], event) - - if sum(row) == 0: - logger.error("\033[1m" + "Requested store name " + event + " not found (case-sensitive)." + "\033[0m") - logger.error("\033[1m" + "File contains the following TDT store names:" + "\033[0m") - logger.error("\033[1m" + str(allnames) + "\033[0m") - logger.error("\033[1m" + "TDT store name " + str(event) + " not found." 
+ "\033[0m") - raise ValueError("Requested store name not found.") - - allIndexesWhereEventIsPresent = np.where(row == 1) - first_row = allIndexesWhereEventIsPresent[0][0] - - formatNew = data["format"][first_row] + 1 - - table = np.array( - [ - [0, 0, 0, 0], - [0, "float", 1, np.float32], - [0, "long", 1, np.int32], - [0, "short", 2, np.int16], - [0, "byte", 4, np.int8], - ] - ) - - S = dict() - - S["storename"] = str(event) - S["sampling_rate"] = data["frequency"][first_row] - S["timestamps"] = np.asarray(data["timestamp"][allIndexesWhereEventIsPresent[0]]) - S["channels"] = np.asarray(data["chan"][allIndexesWhereEventIsPresent[0]]) - - fp_loc = np.asarray(data["fp_loc"][allIndexesWhereEventIsPresent[0]]) - data_size = np.asarray(data["size"]) - - if formatNew != 5: - nsample = (data_size[first_row,] - 10) * int(table[formatNew, 2]) - S["data"] = np.zeros((len(fp_loc), nsample)) - for i in range(0, len(fp_loc)): - with open(tevfilepath, "rb") as fp: - fp.seek(fp_loc[i], os.SEEK_SET) - S["data"][i, :] = np.fromfile(fp, dtype=table[formatNew, 3], count=nsample).reshape( - 1, nsample, order="F" - ) - # S['data'] = S['data'].swapaxes() - S["npoints"] = nsample - else: - S["data"] = np.asarray(data["strobe"][allIndexesWhereEventIsPresent[0]]) - S["npoints"] = 1 - S["channels"] = np.tile(1, (S["data"].shape[0],)) - - S["data"] = (S["data"].T).reshape(-1, order="F") - - save_dict_to_hdf5(S, event, outputPath) - - check_data(S, filepath, event, outputPath) - - logger.info("Data for event {} fetched and stored.".format(event)) - - -# check if a particular element is there in an array or not -def ismember(arr, element): - res = [1 if i == element else 0 for i in arr] - return np.asarray(res) - - -# function to save data read from tev file to hdf5 file -def save_dict_to_hdf5(S, event, outputPath): - write_hdf5(S["storename"], event, outputPath, "storename") - write_hdf5(S["sampling_rate"], event, outputPath, "sampling_rate") - write_hdf5(S["timestamps"], event, outputPath, "timestamps") - - write_hdf5(S["data"], event, outputPath, "data") - write_hdf5(S["npoints"], event, outputPath, "npoints") - write_hdf5(S["channels"], event, outputPath, "channels") - - -# function to check event data (checking whether event timestamps belongs to same event or multiple events) -def check_data(S, filepath, event, outputPath): - # logger.info("Checking event storename data for creating multiple event names from single event storename...") - new_event = event.replace("\\", "") - new_event = event.replace("/", "") - diff = np.diff(S["data"]) - arr = np.full(diff.shape[0], 1) - - storesList = np.genfromtxt(os.path.join(outputPath, "storesList.csv"), dtype="str", delimiter=",").reshape(2, -1) - - if diff.shape[0] == 0: - return 0 - - if S["sampling_rate"] == 0 and np.all(diff == diff[0]) == False: - logger.info("\033[1m" + "Data in event {} belongs to multiple behavior".format(event) + "\033[0m") - logger.debug( - "\033[1m" + "Create timestamp files for individual new event and change the stores list file." 
+ "\033[0m" - ) - i_d = np.unique(S["data"]) - for i in range(i_d.shape[0]): - new_S = dict() - idx = np.where(S["data"] == i_d[i])[0] - new_S["timestamps"] = S["timestamps"][idx] - new_S["storename"] = new_event + str(int(i_d[i])) - new_S["sampling_rate"] = S["sampling_rate"] - new_S["data"] = S["data"] - new_S["npoints"] = S["npoints"] - new_S["channels"] = S["channels"] - storesList = np.concatenate( - (storesList, [[new_event + str(int(i_d[i]))], [new_event + "_" + str(int(i_d[i]))]]), axis=1 - ) - save_dict_to_hdf5(new_S, new_event + str(int(i_d[i])), outputPath) - - idx = np.where(storesList[0] == event)[0] - storesList = np.delete(storesList, idx, axis=1) - if not os.path.exists(os.path.join(outputPath, ".cache_storesList.csv")): - os.rename(os.path.join(outputPath, "storesList.csv"), os.path.join(outputPath, ".cache_storesList.csv")) - if idx.shape[0] == 0: - pass - else: - np.savetxt(os.path.join(outputPath, "storesList.csv"), storesList, delimiter=",", fmt="%s") - logger.info( - "\033[1m" - + "Timestamp files for individual new event are created \ - and the stores list file is changed." - + "\033[0m" - ) From b35e04b0db575f6ca72ea198d9db12bde06e6b68 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 3 Dec 2025 11:08:20 -0800 Subject: [PATCH 046/150] Refactored csv_recording_extractor to inherit from base_recording_extractor. --- .../extractors/csv_recording_extractor.py | 35 ++++++++++++------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/src/guppy/extractors/csv_recording_extractor.py b/src/guppy/extractors/csv_recording_extractor.py index 5a42bd1..792ad01 100644 --- a/src/guppy/extractors/csv_recording_extractor.py +++ b/src/guppy/extractors/csv_recording_extractor.py @@ -4,11 +4,12 @@ import os import time from itertools import repeat +from typing import Any import numpy as np import pandas as pd -from guppy.common_step3 import write_hdf5 +from guppy.extractors import BaseRecordingExtractor logger = logging.getLogger(__name__) @@ -29,7 +30,7 @@ def read_and_save_csv(extractor, event, outputPath): logger.info("Data for event {} fetched and stored.".format(event)) -class CsvRecordingExtractor: +class CsvRecordingExtractor(BaseRecordingExtractor): def __init__(self, folder_path): self.folder_path = folder_path @@ -58,7 +59,7 @@ def __init__(self, folder_path): ), "This file appears to be doric .csv. This function only supports standard .csv files." 
df = pd.read_csv(path[i], index_col=False) - _, value = self.check_header(df) + _, value = self._check_header(df) # check dataframe structure and read data accordingly if len(value) > 0: @@ -121,10 +122,18 @@ def __init__(self, folder_path): logger.info("Importing of csv file is done.") - self.events = event_from_filename - self.flags = flag_arr + self._events = event_from_filename + self._flags = flag_arr - def check_header(self, df): + @property + def events(self) -> list[str]: + return self._events + + @property + def flags(self) -> list: + return self._flags + + def _check_header(self, df): arr = list(df.columns) check_float = [] for i in arr: @@ -135,7 +144,7 @@ def check_header(self, df): return arr, check_float - def read_csv(self, event): + def _read_csv(self, event): logger.debug("\033[1m" + "Trying to read data for {} from csv file.".format(event) + "\033[0m") if not os.path.exists(os.path.join(self.folder_path, event + ".csv")): logger.error("\033[1m" + "No csv file found for event {}".format(event) + "\033[0m") @@ -144,7 +153,7 @@ def read_csv(self, event): df = pd.read_csv(os.path.join(self.folder_path, event + ".csv"), index_col=False) return df - def save_to_hdf5(self, df, event, outputPath): + def _save_to_hdf5(self, df, event, outputPath): key = list(df.columns) # TODO: clean up these if branches @@ -175,21 +184,21 @@ def save_to_hdf5(self, df, event, outputPath): ) for i in range(len(key)): - write_hdf5(df[key[i]].dropna(), event, outputPath, key[i].lower()) + self._write_hdf5(df[key[i]].dropna(), event, outputPath, key[i].lower()) logger.info("\033[1m" + "Reading data for {} from csv file is completed.".format(event) + "\033[0m") - def read(self, events, outputPath): + def read(self, *, events: list[str], outputPath: str, **kwargs) -> list[dict[str, Any]]: output_dicts = [] for event in events: - df = self.read_csv(event=event) + df = self._read_csv(event=event) S = df.to_dict() S["storename"] = event output_dicts.append(S) return output_dicts - def save(self, output_dicts, outputPath): + def save(self, *, output_dicts: list[dict[str, Any]], outputPath: str, **kwargs) -> None: for S in output_dicts: event = S.pop("storename") df = pd.DataFrame.from_dict(S) - self.save_to_hdf5(df=df, event=event, outputPath=outputPath) + self._save_to_hdf5(df=df, event=event, outputPath=outputPath) From b330a64b43e87ec20536e3cdfa815efcb3b7f054 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 3 Dec 2025 11:46:39 -0800 Subject: [PATCH 047/150] Refactored tdt_recording_extractor to inherit from base_recording_extractor. 
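As with the CSV extractor in the previous commit, the TDT extractor now inherits from
BaseRecordingExtractor: helper methods gain a leading underscore, events and flags become
read-only properties (events populated from the tsq header), and read() takes the keyword-only
base-class signature. A rough usage sketch under the new interface is shown below (the paths and
store name are placeholders, the output folder is assumed to exist, and save() is assumed to
follow the same keyword-only signature as the CSV extractor):

    from guppy.extractors import TdtRecordingExtractor

    session_path = "/data/example_tdt_session"              # folder holding one .tsq/.tev pair
    output_path = "/data/example_tdt_session/example_output"

    extractor = TdtRecordingExtractor(session_path)
    print(extractor.events)  # store names parsed from the tsq header, e.g. ['Dv1A', ...]

    # Read the requested stores, then persist each one as <storename>.hdf5 in output_path.
    # If a TTL store encodes several behaviors, read() also rewrites storesList.csv there.
    output_dicts = extractor.read(events=["Dv1A"], outputPath=output_path)
    extractor.save(output_dicts=output_dicts, outputPath=output_path)
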
--- .../extractors/tdt_recording_extractor.py | 84 +++++++++++-------- src/guppy/saveStoresList.py | 30 ++----- 2 files changed, 57 insertions(+), 57 deletions(-) diff --git a/src/guppy/extractors/tdt_recording_extractor.py b/src/guppy/extractors/tdt_recording_extractor.py index 58cde99..6e712fb 100644 --- a/src/guppy/extractors/tdt_recording_extractor.py +++ b/src/guppy/extractors/tdt_recording_extractor.py @@ -4,12 +4,13 @@ import os import time from itertools import repeat +from typing import Any import numpy as np import pandas as pd from numpy import float32, float64, int32, int64, uint16 -from guppy.common_step3 import write_hdf5 +from guppy.extractors import BaseRecordingExtractor logger = logging.getLogger(__name__) @@ -27,13 +28,37 @@ def execute_readtev(folder_path, events, outputPath, numProcesses=mp.cpu_count() logger.info("Time taken = {0:.5f}".format(time.time() - start)) -class TdtRecordingExtractor: +class TdtRecordingExtractor(BaseRecordingExtractor): def __init__(self, folder_path): self.folder_path = folder_path - self.header_df, _ = self.readtsq(folder_path) + self._header_df, _ = self._readtsq(folder_path) + + # Populate events from header_df + if isinstance(self._header_df, pd.DataFrame): + self._header_df["name"] = np.asarray(self._header_df["name"], dtype=str) + allnames = np.unique(self._header_df["name"]) + index = [] + for i in range(len(allnames)): + length = len(str(allnames[i])) + if length < 4: + index.append(i) + allnames = np.delete(allnames, index, 0) + self._events = list(allnames) + else: + self._events = [] + + self._flags = [] + + @property + def events(self) -> list[str]: + return self._events - def readtsq(self, folder_path): + @property + def flags(self) -> list: + return self._flags + + def _readtsq(self, folder_path): logger.debug("Trying to read tsq file.") names = ("size", "type", "name", "chan", "sort_code", "timestamp", "fp_loc", "strobe", "format", "frequency") formats = (int32, int32, "S4", uint16, uint16, float64, int64, float64, int32, float32) @@ -59,9 +84,8 @@ def readtsq(self, folder_path): logger.info("Data from tsq file fetched.") return df, flag - # function to read tev file - def readtev(self, event): - data = self.header_df + def _readtev(self, event): + data = self._header_df filepath = self.folder_path logger.debug("Reading data for event {} ...".format(event)) @@ -87,7 +111,7 @@ def readtev(self, event): # logger.info(allnames) # logger.info(eventNew) - row = self.ismember(data["name"], event) + row = self._ismember(data["name"], event) if sum(row) == 0: logger.error("\033[1m" + "Requested store name " + event + " not found (case-sensitive)." 
+ "\033[0m") @@ -141,24 +165,23 @@ def readtev(self, event): return S - def read(self, events, outputPath): + def read(self, *, events: list[str], outputPath: str, **kwargs) -> list[dict[str, Any]]: output_dicts = [] for event in events: - S = self.readtev(event=event) - if self.event_needs_splitting(data=S["data"], sampling_rate=S["sampling_rate"]): - event_dicts = self.split_event_data(S, event) - self.split_event_storesList(S, event, outputPath) + S = self._readtev(event=event) + if self._event_needs_splitting(data=S["data"], sampling_rate=S["sampling_rate"]): + event_dicts = self._split_event_data(S, event) + self._split_event_storesList(S, event, outputPath) else: event_dicts = [S] output_dicts.extend(event_dicts) return output_dicts - # check if a particular element is there in an array or not - def ismember(self, arr, element): # TODO: replace this function with more standard usage + def _ismember(self, arr, element): res = [1 if i == element else 0 for i in arr] return np.asarray(res) - def event_needs_splitting(self, data, sampling_rate): + def _event_needs_splitting(self, data, sampling_rate): logger.info("Checking event storename data for creating multiple event names from single event storename...") diff = np.diff(data) if diff.shape[0] == 0: @@ -167,7 +190,7 @@ def event_needs_splitting(self, data, sampling_rate): return True return False - def split_event_data(self, S, event): + def _split_event_data(self, S, event): # Note that new_event is only used for the new storesList and event is still used for the old storesList new_event = event.replace("\\", "") new_event = event.replace("/", "") @@ -189,10 +212,7 @@ def split_event_data(self, S, event): return event_dicts - # This function saves a new storesList.csv file, which is a bit of a side effect in the overall read path, - # which is supposed to just return a list of dictionaries. - # TODO: long term I'd like to move these storesList shenanigans somewhere else, likely outside of the extractor. 
- def split_event_storesList(self, S, event, outputPath): + def _split_event_storesList(self, S, event, outputPath): # Note that new_event is only used for the new storesList and event is still used for the old storesList new_event = event.replace("\\", "") new_event = event.replace("/", "") @@ -217,17 +237,15 @@ def split_event_storesList(self, S, event, outputPath): np.savetxt(os.path.join(outputPath, "storesList.csv"), storesList, delimiter=",", fmt="%s") logger.info("\033[1m The stores list file is changed.\033[0m") - # function to save data read from tev file to hdf5 file - def save_dict_to_hdf5(self, S, outputPath): + def _save_dict_to_hdf5(self, S, outputPath): event = S["storename"] - write_hdf5(S["storename"], event, outputPath, "storename") - write_hdf5(S["sampling_rate"], event, outputPath, "sampling_rate") - write_hdf5(S["timestamps"], event, outputPath, "timestamps") - - write_hdf5(S["data"], event, outputPath, "data") - write_hdf5(S["npoints"], event, outputPath, "npoints") - write_hdf5(S["channels"], event, outputPath, "channels") - - def save(self, output_dicts, outputPath): + self._write_hdf5(S["storename"], event, outputPath, "storename") + self._write_hdf5(S["sampling_rate"], event, outputPath, "sampling_rate") + self._write_hdf5(S["timestamps"], event, outputPath, "timestamps") + self._write_hdf5(S["data"], event, outputPath, "data") + self._write_hdf5(S["npoints"], event, outputPath, "npoints") + self._write_hdf5(S["channels"], event, outputPath, "channels") + + def save(self, *, output_dicts: list[dict[str, Any]], outputPath: str, **kwargs) -> None: for S in output_dicts: - self.save_dict_to_hdf5(S=S, outputPath=outputPath) + self._save_dict_to_hdf5(S=S, outputPath=outputPath) diff --git a/src/guppy/saveStoresList.py b/src/guppy/saveStoresList.py index 552d76c..74602a5 100755 --- a/src/guppy/saveStoresList.py +++ b/src/guppy/saveStoresList.py @@ -80,7 +80,7 @@ def make_dir(filepath): # function to show GUI and save -def saveStorenames(inputParameters, data, event_name, flag, filepath): +def saveStorenames(inputParameters, event_name, flag, filepath): logger.debug("Saving stores list file.") # getting input parameters @@ -96,20 +96,8 @@ def saveStorenames(inputParameters, data, event_name, flag, filepath): logger.info("Storeslist : \n" + str(arr)) return - # reading storenames from the data fetched using 'readtsq' function - if isinstance(data, pd.DataFrame): - data["name"] = np.asarray(data["name"], dtype=str) - allnames = np.unique(data["name"]) - index = [] - for i in range(len(allnames)): - length = len(str(allnames[i])) - if length < 4: - index.append(i) - allnames = np.delete(allnames, index, 0) - allnames = list(allnames) - - else: - allnames = [] + # Get storenames from extractor's events property + allnames = event_name if "data_np_v2" in flag or "data_np" in flag or "event_np" in flag: path_chev = glob.glob(os.path.join(filepath, "*chev*")) @@ -152,9 +140,6 @@ def plot(plot_select): else: pass - # finalizing all the storenames - allnames = allnames + event_name - # instructions about how to save the storeslist file mark_down = pn.pane.Markdown( """ @@ -589,16 +574,14 @@ def execute(inputParameters): filepath = os.path.join(inputParameters["abspath"], i) if modality == "tdt": extractor = TdtRecordingExtractor(folder_path=filepath) - data = extractor.header_df - event_name, flag = [], [] + event_name = extractor.events + flag = extractor.flags elif modality == "csv": - data = 0 extractor = CsvRecordingExtractor(folder_path=filepath) event_name = 
extractor.events flag = extractor.flags elif modality == "doric": - data = 0 extractor = DoricRecordingExtractor(folder_path=filepath) event_name = extractor.events flag = extractor.flags @@ -621,14 +604,13 @@ def execute(inputParameters): npm_timestamp_column_names if npm_timestamp_column_names else None ) - data = 0 extractor = NpmRecordingExtractor(folder_path=filepath, num_ch=num_ch, inputParameters=inputParameters) event_name = extractor.events flag = extractor.flags else: raise ValueError("Modality not recognized. Please use 'tdt', 'csv', 'doric', or 'npm'.") - saveStorenames(inputParameters, data, event_name, flag, filepath) + saveStorenames(inputParameters, event_name, flag, filepath) logger.info("#" * 400) except Exception as e: logger.error(str(e)) From 8af3b2be7e73eeaa326de65344bb36e8955f4207 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 3 Dec 2025 11:49:11 -0800 Subject: [PATCH 048/150] Updated parameter names for saveStoresList. --- src/guppy/saveStoresList.py | 62 ++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 29 deletions(-) diff --git a/src/guppy/saveStoresList.py b/src/guppy/saveStoresList.py index 74602a5..318bc5f 100755 --- a/src/guppy/saveStoresList.py +++ b/src/guppy/saveStoresList.py @@ -80,7 +80,7 @@ def make_dir(filepath): # function to show GUI and save -def saveStorenames(inputParameters, event_name, flag, filepath): +def saveStorenames(inputParameters, events, flags, folder_path): logger.debug("Saving stores list file.") # getting input parameters @@ -89,7 +89,7 @@ def saveStorenames(inputParameters, event_name, flag, filepath): # Headless path: if storenames_map provided, write storesList.csv without building the Panel UI storenames_map = inputParameters.get("storenames_map") if isinstance(storenames_map, dict) and len(storenames_map) > 0: - op = make_dir(filepath) + op = make_dir(folder_path) arr = np.asarray([list(storenames_map.keys()), list(storenames_map.values())], dtype=str) np.savetxt(os.path.join(op, "storesList.csv"), arr, delimiter=",", fmt="%s") logger.info(f"Storeslist file saved at {op}") @@ -97,12 +97,12 @@ def saveStorenames(inputParameters, event_name, flag, filepath): return # Get storenames from extractor's events property - allnames = event_name + allnames = events - if "data_np_v2" in flag or "data_np" in flag or "event_np" in flag: - path_chev = glob.glob(os.path.join(filepath, "*chev*")) - path_chod = glob.glob(os.path.join(filepath, "*chod*")) - path_chpr = glob.glob(os.path.join(filepath, "*chpr*")) + if "data_np_v2" in flags or "data_np" in flags or "event_np" in flags: + path_chev = glob.glob(os.path.join(folder_path, "*chev*")) + path_chod = glob.glob(os.path.join(folder_path, "*chod*")) + path_chpr = glob.glob(os.path.join(folder_path, "*chpr*")) combine_paths = path_chev + path_chod + path_chpr d = dict() for i in range(len(combine_paths)): @@ -179,7 +179,9 @@ def plot(plot_select): ) # creating GUI template - template = pn.template.BootstrapTemplate(title="Storenames GUI - {}".format(os.path.basename(filepath), mark_down)) + template = pn.template.BootstrapTemplate( + title="Storenames GUI - {}".format(os.path.basename(folder_path), mark_down) + ) # creating different buttons and selectors for the GUI cross_selector = pn.widgets.CrossSelector(name="Store Names Selection", value=[], options=allnames, width=600) @@ -253,10 +255,10 @@ def callback(target, event): # on clicking overwrite_button, following function is executed def overwrite_button_actions(event): if event.new == "over_write_file": - 
select_location.options = takeOnlyDirs(glob.glob(os.path.join(filepath, "*_output_*"))) + select_location.options = takeOnlyDirs(glob.glob(os.path.join(folder_path, "*_output_*"))) # select_location.value = select_location.options[0] else: - select_location.options = [show_dir(filepath)] + select_location.options = [show_dir(folder_path)] # select_location.value = select_location.options[0] def fetchValues(event): @@ -513,8 +515,8 @@ def save_button(event=None): # creating widgets, adding them to template and showing a GUI on a new browser window number = scanPortsAndFind(start_port=5000, end_port=5200) - if "data_np_v2" in flag or "data_np" in flag or "event_np" in flag: - widget_1 = pn.Column("# " + os.path.basename(filepath), mark_down, mark_down_np, plot_select, plot) + if "data_np_v2" in flags or "data_np" in flags or "event_np" in flags: + widget_1 = pn.Column("# " + os.path.basename(folder_path), mark_down, mark_down_np, plot_select, plot) widget_2 = pn.Column( repeat_storenames, repeat_storename_wd, @@ -535,7 +537,7 @@ def save_button(event=None): template.main.append(pn.Row(widget_1, widget_2)) else: - widget_1 = pn.Column("# " + os.path.basename(filepath), mark_down) + widget_1 = pn.Column("# " + os.path.basename(folder_path), mark_down) widget_2 = pn.Column( repeat_storenames, repeat_storename_wd, @@ -571,32 +573,32 @@ def execute(inputParameters): try: for i in folderNames: - filepath = os.path.join(inputParameters["abspath"], i) + folder_path = os.path.join(inputParameters["abspath"], i) if modality == "tdt": - extractor = TdtRecordingExtractor(folder_path=filepath) - event_name = extractor.events - flag = extractor.flags + extractor = TdtRecordingExtractor(folder_path=folder_path) + events = extractor.events + flags = extractor.flags elif modality == "csv": - extractor = CsvRecordingExtractor(folder_path=filepath) - event_name = extractor.events - flag = extractor.flags + extractor = CsvRecordingExtractor(folder_path=folder_path) + events = extractor.events + flags = extractor.flags elif modality == "doric": - extractor = DoricRecordingExtractor(folder_path=filepath) - event_name = extractor.events - flag = extractor.flags + extractor = DoricRecordingExtractor(folder_path=folder_path) + events = extractor.events + flags = extractor.flags elif modality == "npm": headless = bool(os.environ.get("GUPPY_BASE_DIR")) if not headless: # Resolve multiple event TTLs - multiple_event_ttls = NpmRecordingExtractor.has_multiple_event_ttls(folder_path=filepath) + multiple_event_ttls = NpmRecordingExtractor.has_multiple_event_ttls(folder_path=folder_path) responses = get_multi_event_responses(multiple_event_ttls) inputParameters["npm_split_events"] = responses # Resolve timestamp units and columns ts_unit_needs, col_names_ts = NpmRecordingExtractor.needs_ts_unit( - folder_path=filepath, num_ch=num_ch + folder_path=folder_path, num_ch=num_ch ) ts_units, npm_timestamp_column_names = get_timestamp_configuration(ts_unit_needs, col_names_ts) inputParameters["npm_time_units"] = ts_units if ts_units else None @@ -604,13 +606,15 @@ def execute(inputParameters): npm_timestamp_column_names if npm_timestamp_column_names else None ) - extractor = NpmRecordingExtractor(folder_path=filepath, num_ch=num_ch, inputParameters=inputParameters) - event_name = extractor.events - flag = extractor.flags + extractor = NpmRecordingExtractor( + folder_path=folder_path, num_ch=num_ch, inputParameters=inputParameters + ) + events = extractor.events + flags = extractor.flags else: raise ValueError("Modality not 
recognized. Please use 'tdt', 'csv', 'doric', or 'npm'.") - saveStorenames(inputParameters, event_name, flag, filepath) + saveStorenames(inputParameters, events, flags, folder_path) logger.info("#" * 400) except Exception as e: logger.error(str(e)) From 5dc6d78626a796bb2fde267ec9996b372f040eba Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 3 Dec 2025 13:22:03 -0800 Subject: [PATCH 049/150] Refactored npm_recording_extractor to inherit from base_recording_extractor. --- .../extractors/npm_recording_extractor.py | 35 ++++++++++++------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/src/guppy/extractors/npm_recording_extractor.py b/src/guppy/extractors/npm_recording_extractor.py index ae4f540..6d9b26a 100644 --- a/src/guppy/extractors/npm_recording_extractor.py +++ b/src/guppy/extractors/npm_recording_extractor.py @@ -4,12 +4,13 @@ import os import time from itertools import repeat +from typing import Any import numpy as np import pandas as pd import panel as pn -from guppy.common_step3 import write_hdf5 +from guppy.extractors import BaseRecordingExtractor pn.extension() @@ -32,16 +33,24 @@ def read_and_save_npm(extractor, event, outputPath): logger.info("Data for event {} fetched and stored.".format(event)) -class NpmRecordingExtractor: +class NpmRecordingExtractor(BaseRecordingExtractor): def __init__(self, folder_path, num_ch, inputParameters=None): # TODO: make inputParameters mandatory self.folder_path = folder_path self.num_ch = num_ch self.inputParameters = inputParameters - self.events, self.flags = self.import_npm( + self._events, self._flags = self._import_npm( folder_path=folder_path, num_ch=num_ch, inputParameters=inputParameters ) + @property + def events(self) -> list[str]: + return self._events + + @property + def flags(self) -> list: + return self._flags + @classmethod def has_multiple_event_ttls(cls, folder_path): path = sorted(glob.glob(os.path.join(folder_path, "*.csv"))) @@ -96,7 +105,7 @@ def has_multiple_event_ttls(cls, folder_path): return multiple_event_ttls - def import_npm(self, folder_path, num_ch, inputParameters=None): + def _import_npm(self, folder_path, num_ch, inputParameters=None): logger.debug("If it exists, importing NPM file based on the structure of file") # Headless configuration (used to avoid any UI prompts when running tests) @@ -233,7 +242,7 @@ def import_npm(self, folder_path, num_ch, inputParameters=None): else: file = f"file{str(i)}_" ts_unit = npm_time_unit - df = self.update_df_with_timestamp_columns(df, timestamp_column_name=npm_timestamp_column_name) + df = self._update_df_with_timestamp_columns(df, timestamp_column_name=npm_timestamp_column_name) df, indices_dict, _ = self.decide_indices(file, df, flag) keys = list(indices_dict.keys()) for k in range(len(keys)): @@ -467,7 +476,7 @@ def needs_ts_unit(cls, folder_path, num_ch): return ts_unit_needs, col_names_ts - def update_df_with_timestamp_columns(self, df, timestamp_column_name): + def _update_df_with_timestamp_columns(self, df, timestamp_column_name): col_names = np.array(list(df.columns)) col_names_ts = [""] for name in col_names: @@ -484,7 +493,7 @@ def update_df_with_timestamp_columns(self, df, timestamp_column_name): df = df.drop(col_names_ts[1:], axis=1) return df - def read_npm(self, event): + def _read_npm(self, event): logger.debug("\033[1m" + "Trying to read data for {} from csv file.".format(event) + "\033[0m") if not os.path.exists(os.path.join(self.folder_path, event + ".csv")): logger.error("\033[1m" + "No csv file found for event {}".format(event) + 
"\033[0m") @@ -493,7 +502,7 @@ def read_npm(self, event): df = pd.read_csv(os.path.join(self.folder_path, event + ".csv"), index_col=False) return df - def save_to_hdf5(self, df, event, outputPath): + def _save_to_hdf5(self, df, event, outputPath): key = list(df.columns) # TODO: clean up these if branches @@ -524,21 +533,21 @@ def save_to_hdf5(self, df, event, outputPath): ) for i in range(len(key)): - write_hdf5(df[key[i]].dropna(), event, outputPath, key[i].lower()) + self._write_hdf5(data=df[key[i]].dropna(), storename=event, output_path=outputPath, key=key[i].lower()) logger.info("\033[1m" + "Reading data for {} from csv file is completed.".format(event) + "\033[0m") - def read(self, events, outputPath): + def read(self, *, events: list[str], outputPath: str, **kwargs) -> list[dict[str, Any]]: output_dicts = [] for event in events: - df = self.read_npm(event=event) + df = self._read_npm(event=event) S = df.to_dict() S["storename"] = event output_dicts.append(S) return output_dicts - def save(self, output_dicts, outputPath): + def save(self, *, output_dicts: list[dict[str, Any]], outputPath: str, **kwargs) -> None: for S in output_dicts: event = S.pop("storename") df = pd.DataFrame.from_dict(S) - self.save_to_hdf5(df=df, event=event, outputPath=outputPath) + self._save_to_hdf5(df=df, event=event, outputPath=outputPath) From 861e991cbacaf8165d3023b33ada9961e0f14424 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 3 Dec 2025 13:44:03 -0800 Subject: [PATCH 050/150] Refactored doric_recording_extractor to inherit from base_recording_extractor. --- .../extractors/doric_recording_extractor.py | 128 ++++++++++-------- 1 file changed, 72 insertions(+), 56 deletions(-) diff --git a/src/guppy/extractors/doric_recording_extractor.py b/src/guppy/extractors/doric_recording_extractor.py index 2966ec6..51c22ca 100644 --- a/src/guppy/extractors/doric_recording_extractor.py +++ b/src/guppy/extractors/doric_recording_extractor.py @@ -3,23 +3,28 @@ import os import re import warnings +from typing import Any import h5py import numpy as np import pandas as pd -from guppy.common_step3 import write_hdf5 +from guppy.extractors import BaseRecordingExtractor logger = logging.getLogger(__name__) def execute_import_doric(folder_path, storesList, outputPath): + # Parse storesList into events and event_types + events = list(storesList[0, :]) + event_types = {storesList[0, i]: storesList[1, i] for i in range(storesList.shape[1])} + extractor = DoricRecordingExtractor(folder_path=folder_path) - output_dicts = extractor.read(storesList=storesList) + output_dicts = extractor.read(events=events, outputPath=outputPath, event_types=event_types) extractor.save(output_dicts=output_dicts, outputPath=outputPath) -class DoricRecordingExtractor: +class DoricRecordingExtractor(BaseRecordingExtractor): # TODO: consolidate duplicate flag logic between the `__init__` and the `check_doric` method. 
def __init__(self, folder_path): @@ -36,7 +41,7 @@ def __init__(self, folder_path): for i in range(len(path)): ext = os.path.basename(path[i]).split(".")[-1] if ext == "doric": - key_names = self.read_doric(path[i]) + key_names = self._read_doric(path[i]) event_from_filename.extend(key_names) flag = "doric_doric" else: @@ -59,26 +64,34 @@ def __init__(self, folder_path): logger.info(flag) logger.info("Importing of Doric file is done.") - self.events = event_from_filename - self.flags = flag_arr + self._events = event_from_filename + self._flags = flag_arr + + @property + def events(self) -> list[str]: + return self._events + + @property + def flags(self) -> list: + return self._flags - def read_doric(self, filepath): + def _read_doric(self, filepath): with h5py.File(filepath, "r") as f: if "Traces" in list(f.keys()): - keys = self.access_keys_doricV1(f) + keys = self._access_keys_doricV1(f) elif list(f.keys()) == ["Configurations", "DataAcquisition"]: - keys = self.access_keys_doricV6(f) + keys = self._access_keys_doricV6(f) return keys - def access_keys_doricV6(self, doric_file): + def _access_keys_doricV6(self, doric_file): data = [doric_file["DataAcquisition"]] res = [] while len(data) != 0: members = len(data) while members != 0: members -= 1 - data, last_element = self.separate_last_element(data) + data, last_element = self._separate_last_element(data) if isinstance(last_element, h5py.Dataset) and not last_element.name.endswith("/Time"): res.append(last_element.name) elif isinstance(last_element, h5py.Group): @@ -94,17 +107,17 @@ def access_keys_doricV6(self, doric_file): return keys - def access_keys_doricV1(self, doric_file): + def _access_keys_doricV1(self, doric_file): keys = list(doric_file["Traces"]["Console"].keys()) keys.remove("Time(s)") return keys - def separate_last_element(self, arr): + def _separate_last_element(self, arr): l = arr[-1] return arr[:-1], l - def check_doric(self): + def _check_doric(self): logger.debug("Checking if doric file exists") path = glob.glob(os.path.join(self.folder_path, "*.csv")) + glob.glob(os.path.join(self.folder_path, "*.doric")) @@ -135,7 +148,7 @@ def check_doric(self): logger.info("Doric file found.") return flag_arr[0] - def read_doric_csv(self, storesList): + def _read_doric_csv(self, events, event_types): path = glob.glob(os.path.join(self.folder_path, "*.csv")) if len(path) > 1: logger.error("An error occurred : More than one Doric csv file present at the location") @@ -147,45 +160,46 @@ def read_doric_csv(self, storesList): df["Time(s)"] = df["Time(s)"] - df["Time(s)"].to_numpy()[0] output_dicts = [] - for i in range(storesList.shape[1]): - if "control" in storesList[1, i] or "signal" in storesList[1, i]: + for event in events: + event_type = event_types[event] + if "control" in event_type or "signal" in event_type: timestamps = np.array(df["Time(s)"]) sampling_rate = np.array([1 / (timestamps[-1] - timestamps[-2])]) - data = np.array(df[storesList[0, i]]) - storename = storesList[0, i] + data = np.array(df[event]) + storename = event S = {"storename": storename, "sampling_rate": sampling_rate, "timestamps": timestamps, "data": data} output_dicts.append(S) else: - ttl = df[storesList[0, i]] + ttl = df[event] indices = np.where(ttl <= 0)[0] diff_indices = np.where(np.diff(indices) > 1)[0] timestamps = df["Time(s)"][indices[diff_indices] + 1].to_numpy() - storename = storesList[0, i] + storename = event S = {"storename": storename, "timestamps": timestamps} output_dicts.append(S) return output_dicts - def read_doric_doric(self, 
storesList): + def _read_doric_doric(self, events, event_types): path = glob.glob(os.path.join(self.folder_path, "*.doric")) if len(path) > 1: logger.error("An error occurred : More than one Doric file present at the location") raise Exception("More than one Doric file present at the location") with h5py.File(path[0], "r") as f: if "Traces" in list(f.keys()): - output_dicts = self.access_data_doricV1(f, storesList) + output_dicts = self._access_data_doricV1(f, events, event_types) elif list(f.keys()) == ["Configurations", "DataAcquisition"]: - output_dicts = self.access_data_doricV6(f, storesList) + output_dicts = self._access_data_doricV6(f, events, event_types) return output_dicts - def access_data_doricV6(self, doric_file, storesList): + def _access_data_doricV6(self, doric_file, events, event_types): data = [doric_file["DataAcquisition"]] res = [] while len(data) != 0: members = len(data) while members != 0: members -= 1 - data, last_element = self.separate_last_element(data) + data, last_element = self._separate_last_element(data) if isinstance(last_element, h5py.Dataset) and not last_element.name.endswith("/Time"): res.append(last_element.name) elif isinstance(last_element, h5py.Group): @@ -195,16 +209,17 @@ def access_data_doricV6(self, doric_file, storesList): for element in res: sep_values = element.split("/") if sep_values[-1] == "Values": - if f"{sep_values[-3]}/{sep_values[-2]}" in storesList[0, :]: + if f"{sep_values[-3]}/{sep_values[-2]}" in events: decide_path.append(element) else: - if f"{sep_values[-2]}/{sep_values[-1]}" in storesList[0, :]: + if f"{sep_values[-2]}/{sep_values[-1]}" in events: decide_path.append(element) output_dicts = [] - for i in range(storesList.shape[1]): - if "control" in storesList[1, i] or "signal" in storesList[1, i]: - regex = re.compile("(.*?)" + str(storesList[0, i]) + "(.*?)") + for event in events: + event_type = event_types[event] + if "control" in event_type or "signal" in event_type: + regex = re.compile("(.*?)" + str(event) + "(.*?)") idx = [i for i in range(len(decide_path)) if regex.match(decide_path[i])] if len(idx) > 1: logger.error("More than one string matched (which should not be the case)") @@ -213,11 +228,11 @@ def access_data_doricV6(self, doric_file, storesList): data = np.array(doric_file[decide_path[idx]]) timestamps = np.array(doric_file[decide_path[idx].rsplit("/", 1)[0] + "/Time"]) sampling_rate = np.array([1 / (timestamps[-1] - timestamps[-2])]) - storename = storesList[0, i] + storename = event S = {"storename": storename, "sampling_rate": sampling_rate, "timestamps": timestamps, "data": data} output_dicts.append(S) else: - regex = re.compile("(.*?)" + storesList[0, i] + "$") + regex = re.compile("(.*?)" + event + "$") idx = [i for i in range(len(decide_path)) if regex.match(decide_path[i])] if len(idx) > 1: logger.error("More than one string matched (which should not be the case)") @@ -228,56 +243,57 @@ def access_data_doricV6(self, doric_file, storesList): indices = np.where(ttl <= 0)[0] diff_indices = np.where(np.diff(indices) > 1)[0] timestamps = timestamps[indices[diff_indices] + 1] - storename = storesList[0, i] + storename = event S = {"storename": storename, "timestamps": timestamps} output_dicts.append(S) return output_dicts - def access_data_doricV1(self, doric_file, storesList): + def _access_data_doricV1(self, doric_file, events, event_types): keys = list(doric_file["Traces"]["Console"].keys()) output_dicts = [] - for i in range(storesList.shape[1]): - if "control" in storesList[1, i] or "signal" in 
storesList[1, i]: + for event in events: + event_type = event_types[event] + if "control" in event_type or "signal" in event_type: timestamps = np.array(doric_file["Traces"]["Console"]["Time(s)"]["Console_time(s)"]) sampling_rate = np.array([1 / (timestamps[-1] - timestamps[-2])]) - data = np.array(doric_file["Traces"]["Console"][storesList[0, i]][storesList[0, i]]) - storename = storesList[0, i] + data = np.array(doric_file["Traces"]["Console"][event][event]) + storename = event S = {"storename": storename, "sampling_rate": sampling_rate, "timestamps": timestamps, "data": data} output_dicts.append(S) else: timestamps = np.array(doric_file["Traces"]["Console"]["Time(s)"]["Console_time(s)"]) - ttl = np.array(doric_file["Traces"]["Console"][storesList[0, i]][storesList[0, i]]) + ttl = np.array(doric_file["Traces"]["Console"][event][event]) indices = np.where(ttl <= 0)[0] diff_indices = np.where(np.diff(indices) > 1)[0] timestamps = timestamps[indices[diff_indices] + 1] - storename = storesList[0, i] + storename = event S = {"storename": storename, "timestamps": timestamps} output_dicts.append(S) return output_dicts - def save_dict_to_hdf5(self, S, outputPath): - event = S["storename"] - write_hdf5(S["timestamps"], event, outputPath, "timestamps") - - if "sampling_rate" in S: - write_hdf5(S["sampling_rate"], event, outputPath, "sampling_rate") - if "data" in S: - write_hdf5(S["data"], event, outputPath, "data") - - def read(self, storesList): - flag = self.check_doric() + def read(self, *, events: list[str], outputPath: str, **kwargs) -> list[dict[str, Any]]: + event_types = kwargs["event_types"] + flag = self._check_doric() if flag == "doric_csv": - output_dicts = self.read_doric_csv(storesList) + output_dicts = self._read_doric_csv(events, event_types) elif flag == "doric_doric": - output_dicts = self.read_doric_doric(storesList) + output_dicts = self._read_doric_doric(events, event_types) else: logger.error("Doric file not found or not recognized.") raise FileNotFoundError("Doric file not found or not recognized.") return output_dicts - def save(self, output_dicts, outputPath): + def save(self, *, output_dicts: list[dict[str, Any]], outputPath: str, **kwargs) -> None: for S in output_dicts: - self.save_dict_to_hdf5(S=S, outputPath=outputPath) + storename = S["storename"] + self._write_hdf5(data=S["timestamps"], storename=storename, output_path=outputPath, key="timestamps") + + if "sampling_rate" in S: + self._write_hdf5( + data=S["sampling_rate"], storename=storename, output_path=outputPath, key="sampling_rate" + ) + if "data" in S: + self._write_hdf5(data=S["data"], storename=storename, output_path=outputPath, key="data") From dd40cb4aa0629e539322e100c908f05509acdd07 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 3 Dec 2025 14:23:53 -0800 Subject: [PATCH 051/150] Refactored doric_recording_extractor to use class method for events and flags. 
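Sketch of the intended call pattern after this change, for reviewers: discovery is now a classmethod that needs only the folder path, while extraction takes the event-name-to-event-type mapping that the stores-list step normally provides. The paths and the example mapping below are hypothetical placeholders, not part of this change:

    from guppy.extractors.doric_recording_extractor import DoricRecordingExtractor

    # Hypothetical paths for illustration only.
    folder_path = "/data/doric/session1"
    output_path = "/data/doric/session1_output_1"

    # Discovery no longer needs an instance or the event-type mapping.
    events, flags = DoricRecordingExtractor.discover_events_and_flags(folder_path)

    # Extraction still needs to know which stores are signal/control vs. TTL events;
    # this mapping is a made-up example of what storesList.csv normally provides.
    event_name_to_event_type = {events[0]: "signal_A", events[1]: "control_A"}

    extractor = DoricRecordingExtractor(
        folder_path=folder_path,
        event_name_to_event_type=event_name_to_event_type,
    )
    output_dicts = extractor.read(events=list(event_name_to_event_type), outputPath=output_path)
    extractor.save(output_dicts=output_dicts, outputPath=output_path)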
--- .../extractors/doric_recording_extractor.py | 87 ++++++++++++------- src/guppy/saveStoresList.py | 4 +- 2 files changed, 56 insertions(+), 35 deletions(-) diff --git a/src/guppy/extractors/doric_recording_extractor.py b/src/guppy/extractors/doric_recording_extractor.py index 51c22ca..f67e3f1 100644 --- a/src/guppy/extractors/doric_recording_extractor.py +++ b/src/guppy/extractors/doric_recording_extractor.py @@ -15,33 +15,48 @@ def execute_import_doric(folder_path, storesList, outputPath): - # Parse storesList into events and event_types events = list(storesList[0, :]) - event_types = {storesList[0, i]: storesList[1, i] for i in range(storesList.shape[1])} + event_name_to_event_type = {storesList[0, i]: storesList[1, i] for i in range(storesList.shape[1])} - extractor = DoricRecordingExtractor(folder_path=folder_path) - output_dicts = extractor.read(events=events, outputPath=outputPath, event_types=event_types) + extractor = DoricRecordingExtractor(folder_path=folder_path, event_name_to_event_type=event_name_to_event_type) + output_dicts = extractor.read(events=events, outputPath=outputPath) extractor.save(output_dicts=output_dicts, outputPath=outputPath) class DoricRecordingExtractor(BaseRecordingExtractor): # TODO: consolidate duplicate flag logic between the `__init__` and the `check_doric` method. - def __init__(self, folder_path): - self.folder_path = folder_path - logger.debug("If it exists, importing Doric file based on the structure of file") - path = sorted(glob.glob(os.path.join(self.folder_path, "*.csv"))) + sorted( - glob.glob(os.path.join(self.folder_path, "*.doric")) + @classmethod + def discover_events_and_flags(cls, folder_path): + """ + Discover available events and file format flags from Doric files. + + Parameters + ---------- + folder_path : str + Path to the folder containing Doric files + + Returns + ------- + events : list + List of discovered event names + flags : list + List of format flags (e.g., 'doric_csv', 'doric_doric') + """ + logger.debug("Discovering Doric events from file headers") + path = sorted(glob.glob(os.path.join(folder_path, "*.csv"))) + sorted( + glob.glob(os.path.join(folder_path, "*.doric")) ) path = sorted(list(set(path))) flag = "None" event_from_filename = [] flag_arr = [] + for i in range(len(path)): ext = os.path.basename(path[i]).split(".")[-1] if ext == "doric": - key_names = self._read_doric(path[i]) + key_names = cls._read_doric_file(path[i]) event_from_filename.extend(key_names) flag = "doric_doric" else: @@ -62,10 +77,14 @@ def __init__(self, folder_path): event_from_filename.extend(list(df.columns)) flag = "doric_csv" logger.info(flag) - logger.info("Importing of Doric file is done.") - self._events = event_from_filename - self._flags = flag_arr + logger.info("Doric event discovery complete.") + return event_from_filename, flag_arr + + def __init__(self, folder_path, event_name_to_event_type): + self.folder_path = folder_path + self._event_name_to_event_type = event_name_to_event_type + self._events, self._flags = self.discover_events_and_flags(folder_path) @property def events(self) -> list[str]: @@ -75,23 +94,26 @@ def events(self) -> list[str]: def flags(self) -> list: return self._flags - def _read_doric(self, filepath): + @staticmethod + def _read_doric_file(filepath): + """Static helper to read Doric file headers for event discovery.""" with h5py.File(filepath, "r") as f: if "Traces" in list(f.keys()): - keys = self._access_keys_doricV1(f) + keys = DoricRecordingExtractor._access_keys_doricV1(f) elif list(f.keys()) == 
["Configurations", "DataAcquisition"]: - keys = self._access_keys_doricV6(f) + keys = DoricRecordingExtractor._access_keys_doricV6(f) return keys - def _access_keys_doricV6(self, doric_file): + @staticmethod + def _access_keys_doricV6(doric_file): data = [doric_file["DataAcquisition"]] res = [] while len(data) != 0: members = len(data) while members != 0: members -= 1 - data, last_element = self._separate_last_element(data) + data, last_element = DoricRecordingExtractor._separate_last_element(data) if isinstance(last_element, h5py.Dataset) and not last_element.name.endswith("/Time"): res.append(last_element.name) elif isinstance(last_element, h5py.Group): @@ -107,13 +129,15 @@ def _access_keys_doricV6(self, doric_file): return keys - def _access_keys_doricV1(self, doric_file): + @staticmethod + def _access_keys_doricV1(doric_file): keys = list(doric_file["Traces"]["Console"].keys()) keys.remove("Time(s)") return keys - def _separate_last_element(self, arr): + @staticmethod + def _separate_last_element(arr): l = arr[-1] return arr[:-1], l @@ -148,7 +172,7 @@ def _check_doric(self): logger.info("Doric file found.") return flag_arr[0] - def _read_doric_csv(self, events, event_types): + def _read_doric_csv(self, events): path = glob.glob(os.path.join(self.folder_path, "*.csv")) if len(path) > 1: logger.error("An error occurred : More than one Doric csv file present at the location") @@ -161,7 +185,7 @@ def _read_doric_csv(self, events, event_types): output_dicts = [] for event in events: - event_type = event_types[event] + event_type = self._event_name_to_event_type[event] if "control" in event_type or "signal" in event_type: timestamps = np.array(df["Time(s)"]) sampling_rate = np.array([1 / (timestamps[-1] - timestamps[-2])]) @@ -180,19 +204,19 @@ def _read_doric_csv(self, events, event_types): return output_dicts - def _read_doric_doric(self, events, event_types): + def _read_doric_doric(self, events): path = glob.glob(os.path.join(self.folder_path, "*.doric")) if len(path) > 1: logger.error("An error occurred : More than one Doric file present at the location") raise Exception("More than one Doric file present at the location") with h5py.File(path[0], "r") as f: if "Traces" in list(f.keys()): - output_dicts = self._access_data_doricV1(f, events, event_types) + output_dicts = self._access_data_doricV1(f, events) elif list(f.keys()) == ["Configurations", "DataAcquisition"]: - output_dicts = self._access_data_doricV6(f, events, event_types) + output_dicts = self._access_data_doricV6(f, events) return output_dicts - def _access_data_doricV6(self, doric_file, events, event_types): + def _access_data_doricV6(self, doric_file, events): data = [doric_file["DataAcquisition"]] res = [] while len(data) != 0: @@ -217,7 +241,7 @@ def _access_data_doricV6(self, doric_file, events, event_types): output_dicts = [] for event in events: - event_type = event_types[event] + event_type = self._event_name_to_event_type[event] if "control" in event_type or "signal" in event_type: regex = re.compile("(.*?)" + str(event) + "(.*?)") idx = [i for i in range(len(decide_path)) if regex.match(decide_path[i])] @@ -249,11 +273,11 @@ def _access_data_doricV6(self, doric_file, events, event_types): return output_dicts - def _access_data_doricV1(self, doric_file, events, event_types): + def _access_data_doricV1(self, doric_file, events): keys = list(doric_file["Traces"]["Console"].keys()) output_dicts = [] for event in events: - event_type = event_types[event] + event_type = self._event_name_to_event_type[event] if "control" 
in event_type or "signal" in event_type: timestamps = np.array(doric_file["Traces"]["Console"]["Time(s)"]["Console_time(s)"]) sampling_rate = np.array([1 / (timestamps[-1] - timestamps[-2])]) @@ -274,12 +298,11 @@ def _access_data_doricV1(self, doric_file, events, event_types): return output_dicts def read(self, *, events: list[str], outputPath: str, **kwargs) -> list[dict[str, Any]]: - event_types = kwargs["event_types"] flag = self._check_doric() if flag == "doric_csv": - output_dicts = self._read_doric_csv(events, event_types) + output_dicts = self._read_doric_csv(events) elif flag == "doric_doric": - output_dicts = self._read_doric_doric(events, event_types) + output_dicts = self._read_doric_doric(events) else: logger.error("Doric file not found or not recognized.") raise FileNotFoundError("Doric file not found or not recognized.") diff --git a/src/guppy/saveStoresList.py b/src/guppy/saveStoresList.py index 318bc5f..acc62f4 100755 --- a/src/guppy/saveStoresList.py +++ b/src/guppy/saveStoresList.py @@ -584,9 +584,7 @@ def execute(inputParameters): flags = extractor.flags elif modality == "doric": - extractor = DoricRecordingExtractor(folder_path=folder_path) - events = extractor.events - flags = extractor.flags + events, flags = DoricRecordingExtractor.discover_events_and_flags(folder_path=folder_path) elif modality == "npm": headless = bool(os.environ.get("GUPPY_BASE_DIR")) From 4619964733e040b64f375254dce8d6dde99d94d1 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 3 Dec 2025 16:23:19 -0800 Subject: [PATCH 052/150] Refactored Extractors to use class method discover_events and flags instead of properties. --- .../extractors/base_recording_extractor.py | 25 +-- .../extractors/csv_recording_extractor.py | 40 +++-- .../extractors/npm_recording_extractor.py | 166 +++++++++--------- .../extractors/tdt_recording_extractor.py | 47 +++-- src/guppy/readTevTsq.py | 2 +- src/guppy/saveStoresList.py | 12 +- 6 files changed, 151 insertions(+), 141 deletions(-) diff --git a/src/guppy/extractors/base_recording_extractor.py b/src/guppy/extractors/base_recording_extractor.py index 7058a0a..76d4f3c 100644 --- a/src/guppy/extractors/base_recording_extractor.py +++ b/src/guppy/extractors/base_recording_extractor.py @@ -16,29 +16,18 @@ class BaseRecordingExtractor(ABC): data from various acquisition formats (TDT, Doric, CSV, NPM, etc.). """ - @property + @classmethod @abstractmethod - def events(self) -> list[str]: + def discover_events_and_flags(cls) -> tuple[list[str], list[str]]: """ - List of available event/store names in the data. + Discover available events and format flags from data files. Returns ------- - list of str - Names of all events or stores available in the dataset. - """ - pass - - @property - @abstractmethod - def flags(self) -> list: - """ - Format indicators or file type flags. - - Returns - ------- - list - Flags indicating file types or data formats. + events : list of str + Names of all events/stores available in the dataset. + flags : list of str + Format indicators or file type flags. 
""" pass diff --git a/src/guppy/extractors/csv_recording_extractor.py b/src/guppy/extractors/csv_recording_extractor.py index 792ad01..41ee7ab 100644 --- a/src/guppy/extractors/csv_recording_extractor.py +++ b/src/guppy/extractors/csv_recording_extractor.py @@ -32,11 +32,25 @@ def read_and_save_csv(extractor, event, outputPath): class CsvRecordingExtractor(BaseRecordingExtractor): - def __init__(self, folder_path): - self.folder_path = folder_path - + @classmethod + def discover_events_and_flags(cls, folder_path) -> tuple[list[str], list[str]]: + """ + Discover available events and format flags from CSV files. + + Parameters + ---------- + folder_path : str + Path to the folder containing CSV files. + + Returns + ------- + events : list of str + Names of all events/stores available in the dataset. + flags : list of str + Format indicators or file type flags. + """ logger.debug("If it exists, importing either NPM or Doric or csv file based on the structure of file") - path = sorted(glob.glob(os.path.join(self.folder_path, "*.csv"))) + path = sorted(glob.glob(os.path.join(folder_path, "*.csv"))) path = sorted(list(set(path))) flag = "None" @@ -59,7 +73,7 @@ def __init__(self, folder_path): ), "This file appears to be doric .csv. This function only supports standard .csv files." df = pd.read_csv(path[i], index_col=False) - _, value = self._check_header(df) + _, value = cls._check_header(df) # check dataframe structure and read data accordingly if len(value) > 0: @@ -121,19 +135,13 @@ def __init__(self, folder_path): event_from_filename.append(name) logger.info("Importing of csv file is done.") + return event_from_filename, flag_arr - self._events = event_from_filename - self._flags = flag_arr - - @property - def events(self) -> list[str]: - return self._events - - @property - def flags(self) -> list: - return self._flags + def __init__(self, folder_path): + self.folder_path = folder_path - def _check_header(self, df): + @staticmethod + def _check_header(df): arr = list(df.columns) check_float = [] for i in arr: diff --git a/src/guppy/extractors/npm_recording_extractor.py b/src/guppy/extractors/npm_recording_extractor.py index 6d9b26a..110ba56 100644 --- a/src/guppy/extractors/npm_recording_extractor.py +++ b/src/guppy/extractors/npm_recording_extractor.py @@ -17,10 +17,10 @@ logger = logging.getLogger(__name__) -def execute_import_npm(folder_path, num_ch, inputParameters, events, outputPath, numProcesses=mp.cpu_count()): +def execute_import_npm(folder_path, events, outputPath, numProcesses=mp.cpu_count()): logger.info("Reading data for event {} ...".format(events)) - extractor = NpmRecordingExtractor(folder_path=folder_path, num_ch=num_ch, inputParameters=inputParameters) + extractor = NpmRecordingExtractor(folder_path=folder_path) start = time.time() with mp.Pool(numProcesses) as p: p.starmap(read_and_save_npm, zip(repeat(extractor), events, repeat(outputPath))) @@ -35,81 +35,29 @@ def read_and_save_npm(extractor, event, outputPath): class NpmRecordingExtractor(BaseRecordingExtractor): - def __init__(self, folder_path, num_ch, inputParameters=None): # TODO: make inputParameters mandatory - self.folder_path = folder_path - self.num_ch = num_ch - self.inputParameters = inputParameters - self._events, self._flags = self._import_npm( - folder_path=folder_path, num_ch=num_ch, inputParameters=inputParameters - ) - - @property - def events(self) -> list[str]: - return self._events - - @property - def flags(self) -> list: - return self._flags - + # TODO: make inputParameters mandatory @classmethod - 
def has_multiple_event_ttls(cls, folder_path): - path = sorted(glob.glob(os.path.join(folder_path, "*.csv"))) - path_chev = glob.glob(os.path.join(folder_path, "*chev*")) - path_chod = glob.glob(os.path.join(folder_path, "*chod*")) - path_chpr = glob.glob(os.path.join(folder_path, "*chpr*")) - path_event = glob.glob(os.path.join(folder_path, "event*")) - path_chev_chod_event = path_chev + path_chod + path_event + path_chpr - - path = sorted(list(set(path) - set(path_chev_chod_event))) - multiple_event_ttls = [] - for i in range(len(path)): - df = pd.read_csv(path[i], index_col=False) - _, value = cls.check_header(df) - - # check dataframe structure and read data accordingly - if len(value) > 0: - columns_isstr = False - df = pd.read_csv(path[i], header=None) - cols = np.array(list(df.columns), dtype=str) - else: - columns_isstr = True - cols = np.array(list(df.columns), dtype=str) - if len(cols) == 2: - flag = "event_or_data_np" - elif len(cols) > 2: - flag = "data_np" - else: - logger.error("Number of columns in csv file does not make sense.") - raise Exception("Number of columns in csv file does not make sense.") - - # used assigned flags to process the files and read the data - if flag == "event_or_data_np": - arr = list(df.iloc[:, 1]) - check_float = [True for i in arr if isinstance(i, float)] - if len(arr) == len(check_float) and columns_isstr == False: - flag = "data_np" - elif columns_isstr == True and ("value" in np.char.lower(np.array(cols))): - flag = "event_np" - else: - flag = "event_np" - - if flag == "event_np": - type_val = np.array(df.iloc[:, 1]) - type_val_unique = np.unique(type_val) - if len(type_val_unique) > 1: - multiple_event_ttls.append(True) - else: - multiple_event_ttls.append(False) - else: - multiple_event_ttls.append(False) - - return multiple_event_ttls - - def _import_npm(self, folder_path, num_ch, inputParameters=None): - + def discover_events_and_flags(cls, folder_path, num_ch, inputParameters=None) -> tuple[list[str], list[str]]: + """ + Discover available events and format flags from NPM files. + + Parameters + ---------- + folder_path : str + Path to the folder containing NPM files. + num_ch : int + Number of channels in the recording. + inputParameters : dict, optional + Input parameters containing NPM-specific configuration. + + Returns + ------- + events : list of str + Names of all events/stores available in the dataset. + flags : list of str + Format indicators or file type flags. + """ logger.debug("If it exists, importing NPM file based on the structure of file") - # Headless configuration (used to avoid any UI prompts when running tests) - headless = bool(os.environ.get("GUPPY_BASE_DIR")) if isinstance(inputParameters, dict): npm_timestamp_column_names = inputParameters.get("npm_timestamp_column_names") npm_time_units = inputParameters.get("npm_time_units") @@ -160,7 +108,7 @@ def _import_npm(self, folder_path, num_ch, inputParameters=None): df_arr ), "This file appears to be doric .csv. This function only supports NPM .csv files." 
df = pd.read_csv(path[i], index_col=False) - _, value = self.check_header(df) + _, value = cls.check_header(df) # check dataframe structure and read data accordingly if len(value) > 0: @@ -204,7 +152,7 @@ def _import_npm(self, folder_path, num_ch, inputParameters=None): logger.info(flag) if flag == "data_np": file = f"file{str(i)}_" - df, indices_dict, _ = self.decide_indices(file, df, flag, num_ch) + df, indices_dict, _ = cls.decide_indices(file, df, flag, num_ch) keys = list(indices_dict.keys()) for k in range(len(keys)): for j in range(df.shape[1]): @@ -242,8 +190,8 @@ def _import_npm(self, folder_path, num_ch, inputParameters=None): else: file = f"file{str(i)}_" ts_unit = npm_time_unit - df = self._update_df_with_timestamp_columns(df, timestamp_column_name=npm_timestamp_column_name) - df, indices_dict, _ = self.decide_indices(file, df, flag) + df = cls._update_df_with_timestamp_columns(df, timestamp_column_name=npm_timestamp_column_name) + df, indices_dict, _ = cls.decide_indices(file, df, flag) keys = list(indices_dict.keys()) for k in range(len(keys)): for j in range(df.shape[1]): @@ -326,6 +274,63 @@ def _import_npm(self, folder_path, num_ch, inputParameters=None): logger.info("Importing of NPM file is done.") return event_from_filename, flag_arr + def __init__(self, folder_path): + self.folder_path = folder_path + + @classmethod + def has_multiple_event_ttls(cls, folder_path): + path = sorted(glob.glob(os.path.join(folder_path, "*.csv"))) + path_chev = glob.glob(os.path.join(folder_path, "*chev*")) + path_chod = glob.glob(os.path.join(folder_path, "*chod*")) + path_chpr = glob.glob(os.path.join(folder_path, "*chpr*")) + path_event = glob.glob(os.path.join(folder_path, "event*")) + path_chev_chod_event = path_chev + path_chod + path_event + path_chpr + + path = sorted(list(set(path) - set(path_chev_chod_event))) + multiple_event_ttls = [] + for i in range(len(path)): + df = pd.read_csv(path[i], index_col=False) + _, value = cls.check_header(df) + + # check dataframe structure and read data accordingly + if len(value) > 0: + columns_isstr = False + df = pd.read_csv(path[i], header=None) + cols = np.array(list(df.columns), dtype=str) + else: + columns_isstr = True + cols = np.array(list(df.columns), dtype=str) + if len(cols) == 2: + flag = "event_or_data_np" + elif len(cols) > 2: + flag = "data_np" + else: + logger.error("Number of columns in csv file does not make sense.") + raise Exception("Number of columns in csv file does not make sense.") + + # used assigned flags to process the files and read the data + if flag == "event_or_data_np": + arr = list(df.iloc[:, 1]) + check_float = [True for i in arr if isinstance(i, float)] + if len(arr) == len(check_float) and columns_isstr == False: + flag = "data_np" + elif columns_isstr == True and ("value" in np.char.lower(np.array(cols))): + flag = "event_np" + else: + flag = "event_np" + + if flag == "event_np": + type_val = np.array(df.iloc[:, 1]) + type_val_unique = np.unique(type_val) + if len(type_val_unique) > 1: + multiple_event_ttls.append(True) + else: + multiple_event_ttls.append(False) + else: + multiple_event_ttls.append(False) + + return multiple_event_ttls + @classmethod def check_header(cls, df): arr = list(df.columns) @@ -476,7 +481,8 @@ def needs_ts_unit(cls, folder_path, num_ch): return ts_unit_needs, col_names_ts - def _update_df_with_timestamp_columns(self, df, timestamp_column_name): + @staticmethod + def _update_df_with_timestamp_columns(df, timestamp_column_name): col_names = np.array(list(df.columns)) col_names_ts = 
[""] for name in col_names: diff --git a/src/guppy/extractors/tdt_recording_extractor.py b/src/guppy/extractors/tdt_recording_extractor.py index 6e712fb..949c9ec 100644 --- a/src/guppy/extractors/tdt_recording_extractor.py +++ b/src/guppy/extractors/tdt_recording_extractor.py @@ -30,35 +30,48 @@ def execute_readtev(folder_path, events, outputPath, numProcesses=mp.cpu_count() class TdtRecordingExtractor(BaseRecordingExtractor): - def __init__(self, folder_path): - self.folder_path = folder_path - self._header_df, _ = self._readtsq(folder_path) + @classmethod + def discover_events_and_flags(cls, folder_path) -> tuple[list[str], list[str]]: + """ + Discover available events and format flags from TDT files. + + Parameters + ---------- + folder_path : str + Path to the folder containing TDT files. + + Returns + ------- + events : list of str + Names of all events/stores available in the dataset. + flags : list of str + Format indicators or file type flags. + """ + header_df, _ = cls._readtsq(folder_path) # Populate events from header_df - if isinstance(self._header_df, pd.DataFrame): - self._header_df["name"] = np.asarray(self._header_df["name"], dtype=str) - allnames = np.unique(self._header_df["name"]) + if isinstance(header_df, pd.DataFrame): + header_df["name"] = np.asarray(header_df["name"], dtype=str) + allnames = np.unique(header_df["name"]) index = [] for i in range(len(allnames)): length = len(str(allnames[i])) if length < 4: index.append(i) allnames = np.delete(allnames, index, 0) - self._events = list(allnames) + events = list(allnames) else: - self._events = [] - - self._flags = [] + events = [] - @property - def events(self) -> list[str]: - return self._events + flags = [] + return events, flags - @property - def flags(self) -> list: - return self._flags + def __init__(self, folder_path): + self.folder_path = folder_path + self._header_df, _ = self._readtsq(folder_path) - def _readtsq(self, folder_path): + @staticmethod + def _readtsq(folder_path): logger.debug("Trying to read tsq file.") names = ("size", "type", "name", "chan", "sort_code", "timestamp", "fp_loc", "strobe", "format", "frequency") formats = (int32, int32, "S4", uint16, uint16, float64, int64, float64, int32, float32) diff --git a/src/guppy/readTevTsq.py b/src/guppy/readTevTsq.py index f2c9419..2ae0c59 100755 --- a/src/guppy/readTevTsq.py +++ b/src/guppy/readTevTsq.py @@ -80,7 +80,7 @@ def readRawData(inputParameters): elif modality == "csv": execute_import_csv(filepath, events, op, numProcesses) elif modality == "npm": - execute_import_npm(filepath, num_ch, inputParameters, events, op, numProcesses) + execute_import_npm(filepath, events, op, numProcesses) else: raise ValueError("Modality not recognized. 
Please use 'tdt', 'csv', 'doric', or 'npm'.") diff --git a/src/guppy/saveStoresList.py b/src/guppy/saveStoresList.py index acc62f4..20a5c94 100755 --- a/src/guppy/saveStoresList.py +++ b/src/guppy/saveStoresList.py @@ -575,13 +575,9 @@ def execute(inputParameters): for i in folderNames: folder_path = os.path.join(inputParameters["abspath"], i) if modality == "tdt": - extractor = TdtRecordingExtractor(folder_path=folder_path) - events = extractor.events - flags = extractor.flags + events, flags = TdtRecordingExtractor.discover_events_and_flags(folder_path=folder_path) elif modality == "csv": - extractor = CsvRecordingExtractor(folder_path=folder_path) - events = extractor.events - flags = extractor.flags + events, flags = CsvRecordingExtractor.discover_events_and_flags(folder_path=folder_path) elif modality == "doric": events, flags = DoricRecordingExtractor.discover_events_and_flags(folder_path=folder_path) @@ -604,11 +600,9 @@ def execute(inputParameters): npm_timestamp_column_names if npm_timestamp_column_names else None ) - extractor = NpmRecordingExtractor( + events, flags = NpmRecordingExtractor.discover_events_and_flags( folder_path=folder_path, num_ch=num_ch, inputParameters=inputParameters ) - events = extractor.events - flags = extractor.flags else: raise ValueError("Modality not recognized. Please use 'tdt', 'csv', 'doric', or 'npm'.") From beb585fab80a38d728f354364d91a7875680db0b Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 3 Dec 2025 16:34:09 -0800 Subject: [PATCH 053/150] Refactored Extractors to use class method discover_events and flags instead of properties. --- src/guppy/extractors/doric_recording_extractor.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/src/guppy/extractors/doric_recording_extractor.py b/src/guppy/extractors/doric_recording_extractor.py index f67e3f1..dd0ecdd 100644 --- a/src/guppy/extractors/doric_recording_extractor.py +++ b/src/guppy/extractors/doric_recording_extractor.py @@ -84,15 +84,6 @@ def discover_events_and_flags(cls, folder_path): def __init__(self, folder_path, event_name_to_event_type): self.folder_path = folder_path self._event_name_to_event_type = event_name_to_event_type - self._events, self._flags = self.discover_events_and_flags(folder_path) - - @property - def events(self) -> list[str]: - return self._events - - @property - def flags(self) -> list: - return self._flags @staticmethod def _read_doric_file(filepath): From 1b5e8ca6b4ea454636978e295eab8ca70a38027e Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 3 Dec 2025 16:42:36 -0800 Subject: [PATCH 054/150] Added comment about discover_events_and_flags signature --- src/guppy/extractors/base_recording_extractor.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/guppy/extractors/base_recording_extractor.py b/src/guppy/extractors/base_recording_extractor.py index 76d4f3c..c71297b 100644 --- a/src/guppy/extractors/base_recording_extractor.py +++ b/src/guppy/extractors/base_recording_extractor.py @@ -29,6 +29,11 @@ def discover_events_and_flags(cls) -> tuple[list[str], list[str]]: flags : list of str Format indicators or file type flags. """ + # NOTE: This method signature is intentionally minimal and flexible. + # Different formats have different discovery requirements: + # - TDT/CSV/Doric: need only folder_path parameter + # - NPM: needs folder_path, num_ch, and optional inputParameters for interleaved channels + # Each child class defines its own signature with the parameters it needs. 
pass @abstractmethod From 2e38ee8afd6b5c061c0f4618ba606619b1ce142c Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 3 Dec 2025 16:46:28 -0800 Subject: [PATCH 055/150] Removed unused quarks. --- src/guppy/extractors/base_recording_extractor.py | 8 ++------ src/guppy/extractors/csv_recording_extractor.py | 4 ++-- src/guppy/extractors/doric_recording_extractor.py | 4 ++-- src/guppy/extractors/npm_recording_extractor.py | 4 ++-- src/guppy/extractors/tdt_recording_extractor.py | 4 ++-- 5 files changed, 10 insertions(+), 14 deletions(-) diff --git a/src/guppy/extractors/base_recording_extractor.py b/src/guppy/extractors/base_recording_extractor.py index c71297b..839c3db 100644 --- a/src/guppy/extractors/base_recording_extractor.py +++ b/src/guppy/extractors/base_recording_extractor.py @@ -37,7 +37,7 @@ def discover_events_and_flags(cls) -> tuple[list[str], list[str]]: pass @abstractmethod - def read(self, *, events: list[str], outputPath: str, **kwargs) -> list[dict[str, Any]]: + def read(self, *, events: list[str], outputPath: str) -> list[dict[str, Any]]: """ Read data from source files for specified events. @@ -47,8 +47,6 @@ def read(self, *, events: list[str], outputPath: str, **kwargs) -> list[dict[str List of event/store names to extract from the data. outputPath : str Path to the output directory. - **kwargs - Additional extractor-specific parameters. Returns ------- @@ -60,7 +58,7 @@ def read(self, *, events: list[str], outputPath: str, **kwargs) -> list[dict[str pass @abstractmethod - def save(self, *, output_dicts: list[dict[str, Any]], outputPath: str, **kwargs) -> None: + def save(self, *, output_dicts: list[dict[str, Any]], outputPath: str) -> None: """ Save extracted data dictionaries to HDF5 format. @@ -70,8 +68,6 @@ def save(self, *, output_dicts: list[dict[str, Any]], outputPath: str, **kwargs) List of data dictionaries from read(). outputPath : str Path to the output directory. - **kwargs - Additional extractor-specific parameters. 
""" pass diff --git a/src/guppy/extractors/csv_recording_extractor.py b/src/guppy/extractors/csv_recording_extractor.py index 41ee7ab..d74cfde 100644 --- a/src/guppy/extractors/csv_recording_extractor.py +++ b/src/guppy/extractors/csv_recording_extractor.py @@ -196,7 +196,7 @@ def _save_to_hdf5(self, df, event, outputPath): logger.info("\033[1m" + "Reading data for {} from csv file is completed.".format(event) + "\033[0m") - def read(self, *, events: list[str], outputPath: str, **kwargs) -> list[dict[str, Any]]: + def read(self, *, events: list[str], outputPath: str) -> list[dict[str, Any]]: output_dicts = [] for event in events: df = self._read_csv(event=event) @@ -205,7 +205,7 @@ def read(self, *, events: list[str], outputPath: str, **kwargs) -> list[dict[str output_dicts.append(S) return output_dicts - def save(self, *, output_dicts: list[dict[str, Any]], outputPath: str, **kwargs) -> None: + def save(self, *, output_dicts: list[dict[str, Any]], outputPath: str) -> None: for S in output_dicts: event = S.pop("storename") df = pd.DataFrame.from_dict(S) diff --git a/src/guppy/extractors/doric_recording_extractor.py b/src/guppy/extractors/doric_recording_extractor.py index dd0ecdd..62a8586 100644 --- a/src/guppy/extractors/doric_recording_extractor.py +++ b/src/guppy/extractors/doric_recording_extractor.py @@ -288,7 +288,7 @@ def _access_data_doricV1(self, doric_file, events): return output_dicts - def read(self, *, events: list[str], outputPath: str, **kwargs) -> list[dict[str, Any]]: + def read(self, *, events: list[str], outputPath: str) -> list[dict[str, Any]]: flag = self._check_doric() if flag == "doric_csv": output_dicts = self._read_doric_csv(events) @@ -300,7 +300,7 @@ def read(self, *, events: list[str], outputPath: str, **kwargs) -> list[dict[str return output_dicts - def save(self, *, output_dicts: list[dict[str, Any]], outputPath: str, **kwargs) -> None: + def save(self, *, output_dicts: list[dict[str, Any]], outputPath: str) -> None: for S in output_dicts: storename = S["storename"] self._write_hdf5(data=S["timestamps"], storename=storename, output_path=outputPath, key="timestamps") diff --git a/src/guppy/extractors/npm_recording_extractor.py b/src/guppy/extractors/npm_recording_extractor.py index 110ba56..e3042c2 100644 --- a/src/guppy/extractors/npm_recording_extractor.py +++ b/src/guppy/extractors/npm_recording_extractor.py @@ -543,7 +543,7 @@ def _save_to_hdf5(self, df, event, outputPath): logger.info("\033[1m" + "Reading data for {} from csv file is completed.".format(event) + "\033[0m") - def read(self, *, events: list[str], outputPath: str, **kwargs) -> list[dict[str, Any]]: + def read(self, *, events: list[str], outputPath: str) -> list[dict[str, Any]]: output_dicts = [] for event in events: df = self._read_npm(event=event) @@ -552,7 +552,7 @@ def read(self, *, events: list[str], outputPath: str, **kwargs) -> list[dict[str output_dicts.append(S) return output_dicts - def save(self, *, output_dicts: list[dict[str, Any]], outputPath: str, **kwargs) -> None: + def save(self, *, output_dicts: list[dict[str, Any]], outputPath: str) -> None: for S in output_dicts: event = S.pop("storename") df = pd.DataFrame.from_dict(S) diff --git a/src/guppy/extractors/tdt_recording_extractor.py b/src/guppy/extractors/tdt_recording_extractor.py index 949c9ec..a877a8b 100644 --- a/src/guppy/extractors/tdt_recording_extractor.py +++ b/src/guppy/extractors/tdt_recording_extractor.py @@ -178,7 +178,7 @@ def _readtev(self, event): return S - def read(self, *, events: list[str], outputPath: str, 
**kwargs) -> list[dict[str, Any]]: + def read(self, *, events: list[str], outputPath: str) -> list[dict[str, Any]]: output_dicts = [] for event in events: S = self._readtev(event=event) @@ -259,6 +259,6 @@ def _save_dict_to_hdf5(self, S, outputPath): self._write_hdf5(S["npoints"], event, outputPath, "npoints") self._write_hdf5(S["channels"], event, outputPath, "channels") - def save(self, *, output_dicts: list[dict[str, Any]], outputPath: str, **kwargs) -> None: + def save(self, *, output_dicts: list[dict[str, Any]], outputPath: str) -> None: for S in output_dicts: self._save_dict_to_hdf5(S=S, outputPath=outputPath) From cdecf428d97d4db8e36dbc9cd44510c7d529016f Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 3 Dec 2025 16:51:26 -0800 Subject: [PATCH 056/150] Refactored NpmRecordingExtractor to inherit from CsvRecordingExtractor. --- .../extractors/npm_recording_extractor.py | 87 ++----------------- 1 file changed, 7 insertions(+), 80 deletions(-) diff --git a/src/guppy/extractors/npm_recording_extractor.py b/src/guppy/extractors/npm_recording_extractor.py index e3042c2..68d13f7 100644 --- a/src/guppy/extractors/npm_recording_extractor.py +++ b/src/guppy/extractors/npm_recording_extractor.py @@ -4,13 +4,12 @@ import os import time from itertools import repeat -from typing import Any import numpy as np import pandas as pd import panel as pn -from guppy.extractors import BaseRecordingExtractor +from guppy.extractors import CsvRecordingExtractor pn.extension() @@ -33,7 +32,9 @@ def read_and_save_npm(extractor, event, outputPath): logger.info("Data for event {} fetched and stored.".format(event)) -class NpmRecordingExtractor(BaseRecordingExtractor): +class NpmRecordingExtractor(CsvRecordingExtractor): + # Inherits from CsvRecordingExtractor to reuse identical read/save logic. + # Only overrides discover_events_and_flags() and adds NPM-specific helper methods. # TODO: make inputParameters mandatory @classmethod @@ -108,7 +109,7 @@ def discover_events_and_flags(cls, folder_path, num_ch, inputParameters=None) -> df_arr ), "This file appears to be doric .csv. This function only supports NPM .csv files." 
df = pd.read_csv(path[i], index_col=False) - _, value = cls.check_header(df) + _, value = cls._check_header(df) # check dataframe structure and read data accordingly if len(value) > 0: @@ -274,9 +275,6 @@ def discover_events_and_flags(cls, folder_path, num_ch, inputParameters=None) -> logger.info("Importing of NPM file is done.") return event_from_filename, flag_arr - def __init__(self, folder_path): - self.folder_path = folder_path - @classmethod def has_multiple_event_ttls(cls, folder_path): path = sorted(glob.glob(os.path.join(folder_path, "*.csv"))) @@ -290,7 +288,7 @@ def has_multiple_event_ttls(cls, folder_path): multiple_event_ttls = [] for i in range(len(path)): df = pd.read_csv(path[i], index_col=False) - _, value = cls.check_header(df) + _, value = cls._check_header(df) # check dataframe structure and read data accordingly if len(value) > 0: @@ -331,18 +329,6 @@ def has_multiple_event_ttls(cls, folder_path): return multiple_event_ttls - @classmethod - def check_header(cls, df): - arr = list(df.columns) - check_float = [] - for i in arr: - try: - check_float.append(float(i)) - except: - pass - - return arr, check_float - # function to decide indices of interleaved channels # in neurophotometrics data @classmethod @@ -426,7 +412,7 @@ def needs_ts_unit(cls, folder_path, num_ch): col_names_ts = [""] for i in range(len(path)): df = pd.read_csv(path[i], index_col=False) - _, value = cls.check_header(df) + _, value = cls._check_header(df) # check dataframe structure and read data accordingly if len(value) > 0: @@ -498,62 +484,3 @@ def _update_df_with_timestamp_columns(df, timestamp_column_name): df.insert(1, "Timestamp", df[timestamp_column_name]) df = df.drop(col_names_ts[1:], axis=1) return df - - def _read_npm(self, event): - logger.debug("\033[1m" + "Trying to read data for {} from csv file.".format(event) + "\033[0m") - if not os.path.exists(os.path.join(self.folder_path, event + ".csv")): - logger.error("\033[1m" + "No csv file found for event {}".format(event) + "\033[0m") - raise Exception("\033[1m" + "No csv file found for event {}".format(event) + "\033[0m") - - df = pd.read_csv(os.path.join(self.folder_path, event + ".csv"), index_col=False) - return df - - def _save_to_hdf5(self, df, event, outputPath): - key = list(df.columns) - - # TODO: clean up these if branches - if len(key) == 3: - arr1 = np.array(["timestamps", "data", "sampling_rate"]) - arr2 = np.char.lower(np.array(key)) - if (np.sort(arr1) == np.sort(arr2)).all() == False: - logger.error("\033[1m" + "Column names should be timestamps, data and sampling_rate" + "\033[0m") - raise Exception("\033[1m" + "Column names should be timestamps, data and sampling_rate" + "\033[0m") - - if len(key) == 1: - if key[0].lower() != "timestamps": - logger.error("\033[1m" + "Column names should be timestamps, data and sampling_rate" + "\033[0m") - raise Exception("\033[1m" + "Column name should be timestamps" + "\033[0m") - - if len(key) != 3 and len(key) != 1: - logger.error( - "\033[1m" - + "Number of columns in csv file should be either three or one. Three columns if \ - the file is for control or signal data or one column if the file is for event TTLs." - + "\033[0m" - ) - raise Exception( - "\033[1m" - + "Number of columns in csv file should be either three or one. Three columns if \ - the file is for control or signal data or one column if the file is for event TTLs." 
- + "\033[0m" - ) - - for i in range(len(key)): - self._write_hdf5(data=df[key[i]].dropna(), storename=event, output_path=outputPath, key=key[i].lower()) - - logger.info("\033[1m" + "Reading data for {} from csv file is completed.".format(event) + "\033[0m") - - def read(self, *, events: list[str], outputPath: str) -> list[dict[str, Any]]: - output_dicts = [] - for event in events: - df = self._read_npm(event=event) - S = df.to_dict() - S["storename"] = event - output_dicts.append(S) - return output_dicts - - def save(self, *, output_dicts: list[dict[str, Any]], outputPath: str) -> None: - for S in output_dicts: - event = S.pop("storename") - df = pd.DataFrame.from_dict(S) - self._save_to_hdf5(df=df, event=event, outputPath=outputPath) From d43670ffa39f5a7668867b0f21307d99bd240c48 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 3 Dec 2025 17:41:09 -0800 Subject: [PATCH 057/150] Updated TODO --- src/guppy/extractors/doric_recording_extractor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/guppy/extractors/doric_recording_extractor.py b/src/guppy/extractors/doric_recording_extractor.py index 62a8586..13f7fdb 100644 --- a/src/guppy/extractors/doric_recording_extractor.py +++ b/src/guppy/extractors/doric_recording_extractor.py @@ -24,7 +24,7 @@ def execute_import_doric(folder_path, storesList, outputPath): class DoricRecordingExtractor(BaseRecordingExtractor): - # TODO: consolidate duplicate flag logic between the `__init__` and the `check_doric` method. + # TODO: consolidate duplicate flag logic between the `discover_events_and_flags` and the `check_doric` method. @classmethod def discover_events_and_flags(cls, folder_path): From cd245a165ba8afea06780fbd12e007f33a99f218 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 3 Dec 2025 18:01:33 -0800 Subject: [PATCH 058/150] Centralized read_and_save_all_events and read_and_save_event functions into the base_recording_extractor and removed all duplicates. 
--- src/guppy/extractors/__init__.py | 10 ++++----- .../extractors/base_recording_extractor.py | 21 +++++++++++++++++++ .../extractors/csv_recording_extractor.py | 19 ----------------- .../extractors/doric_recording_extractor.py | 9 -------- .../extractors/npm_recording_extractor.py | 19 ----------------- .../extractors/tdt_recording_extractor.py | 16 -------------- src/guppy/readTevTsq.py | 21 ++++++++++++------- 7 files changed, 39 insertions(+), 76 deletions(-) diff --git a/src/guppy/extractors/__init__.py b/src/guppy/extractors/__init__.py index 75933c7..ca2fbe0 100644 --- a/src/guppy/extractors/__init__.py +++ b/src/guppy/extractors/__init__.py @@ -1,5 +1,5 @@ -from .base_recording_extractor import BaseRecordingExtractor -from .tdt_recording_extractor import TdtRecordingExtractor, execute_readtev -from .csv_recording_extractor import CsvRecordingExtractor, execute_import_csv -from .doric_recording_extractor import DoricRecordingExtractor, execute_import_doric -from .npm_recording_extractor import NpmRecordingExtractor, execute_import_npm +from .base_recording_extractor import BaseRecordingExtractor, read_and_save_event, read_and_save_all_events +from .tdt_recording_extractor import TdtRecordingExtractor +from .csv_recording_extractor import CsvRecordingExtractor +from .doric_recording_extractor import DoricRecordingExtractor +from .npm_recording_extractor import NpmRecordingExtractor diff --git a/src/guppy/extractors/base_recording_extractor.py b/src/guppy/extractors/base_recording_extractor.py index 839c3db..a8f274b 100644 --- a/src/guppy/extractors/base_recording_extractor.py +++ b/src/guppy/extractors/base_recording_extractor.py @@ -1,12 +1,18 @@ """Base class for recording extractors.""" +import logging +import multiprocessing as mp import os +import time from abc import ABC, abstractmethod +from itertools import repeat from typing import Any import h5py import numpy as np +logger = logging.getLogger(__name__) + class BaseRecordingExtractor(ABC): """ @@ -116,3 +122,18 @@ def _write_hdf5(data: Any, storename: str, output_path: str, key: str) -> None: f.create_dataset(key, data=data, maxshape=(None,), chunks=True) else: f.create_dataset(key, data=data) + + +def read_and_save_event(extractor, event, outputPath): + output_dicts = extractor.read(events=[event], outputPath=outputPath) + extractor.save(output_dicts=output_dicts, outputPath=outputPath) + logger.info("Data for event {} fetched and stored.".format(event)) + + +def read_and_save_all_events(extractor, events, outputPath, numProcesses=mp.cpu_count()): + logger.info("Reading data for event {} ...".format(events)) + + start = time.time() + with mp.Pool(numProcesses) as p: + p.starmap(read_and_save_event, zip(repeat(extractor), events, repeat(outputPath))) + logger.info("Time taken = {0:.5f}".format(time.time() - start)) diff --git a/src/guppy/extractors/csv_recording_extractor.py b/src/guppy/extractors/csv_recording_extractor.py index d74cfde..cfa9a8d 100644 --- a/src/guppy/extractors/csv_recording_extractor.py +++ b/src/guppy/extractors/csv_recording_extractor.py @@ -1,9 +1,6 @@ import glob import logging -import multiprocessing as mp import os -import time -from itertools import repeat from typing import Any import numpy as np @@ -14,22 +11,6 @@ logger = logging.getLogger(__name__) -def execute_import_csv(filepath, events, outputPath, numProcesses=mp.cpu_count()): - logger.info("Reading data for event {} ...".format(events)) - - extractor = CsvRecordingExtractor(folder_path=filepath) - start = time.time() - with 
mp.Pool(numProcesses) as p: - p.starmap(read_and_save_csv, zip(repeat(extractor), events, repeat(outputPath))) - logger.info("Time taken = {0:.5f}".format(time.time() - start)) - - -def read_and_save_csv(extractor, event, outputPath): - output_dicts = extractor.read(events=[event], outputPath=outputPath) - extractor.save(output_dicts=output_dicts, outputPath=outputPath) - logger.info("Data for event {} fetched and stored.".format(event)) - - class CsvRecordingExtractor(BaseRecordingExtractor): @classmethod diff --git a/src/guppy/extractors/doric_recording_extractor.py b/src/guppy/extractors/doric_recording_extractor.py index 13f7fdb..047e087 100644 --- a/src/guppy/extractors/doric_recording_extractor.py +++ b/src/guppy/extractors/doric_recording_extractor.py @@ -14,15 +14,6 @@ logger = logging.getLogger(__name__) -def execute_import_doric(folder_path, storesList, outputPath): - events = list(storesList[0, :]) - event_name_to_event_type = {storesList[0, i]: storesList[1, i] for i in range(storesList.shape[1])} - - extractor = DoricRecordingExtractor(folder_path=folder_path, event_name_to_event_type=event_name_to_event_type) - output_dicts = extractor.read(events=events, outputPath=outputPath) - extractor.save(output_dicts=output_dicts, outputPath=outputPath) - - class DoricRecordingExtractor(BaseRecordingExtractor): # TODO: consolidate duplicate flag logic between the `discover_events_and_flags` and the `check_doric` method. diff --git a/src/guppy/extractors/npm_recording_extractor.py b/src/guppy/extractors/npm_recording_extractor.py index 68d13f7..e3455b2 100644 --- a/src/guppy/extractors/npm_recording_extractor.py +++ b/src/guppy/extractors/npm_recording_extractor.py @@ -1,9 +1,6 @@ import glob import logging -import multiprocessing as mp import os -import time -from itertools import repeat import numpy as np import pandas as pd @@ -16,22 +13,6 @@ logger = logging.getLogger(__name__) -def execute_import_npm(folder_path, events, outputPath, numProcesses=mp.cpu_count()): - logger.info("Reading data for event {} ...".format(events)) - - extractor = NpmRecordingExtractor(folder_path=folder_path) - start = time.time() - with mp.Pool(numProcesses) as p: - p.starmap(read_and_save_npm, zip(repeat(extractor), events, repeat(outputPath))) - logger.info("Time taken = {0:.5f}".format(time.time() - start)) - - -def read_and_save_npm(extractor, event, outputPath): - output_dicts = extractor.read(events=[event], outputPath=outputPath) - extractor.save(output_dicts=output_dicts, outputPath=outputPath) - logger.info("Data for event {} fetched and stored.".format(event)) - - class NpmRecordingExtractor(CsvRecordingExtractor): # Inherits from CsvRecordingExtractor to reuse identical read/save logic. # Only overrides discover_events_and_flags() and adds NPM-specific helper methods. 
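The inheritance note above is the crux of this refactor, so a short sketch of the resulting hierarchy and the shared `read`/`save` API may help; it assumes the classes as defined in this patch series, and the folder paths and channel count are placeholders.

```python
# NPM reuses the CSV read/save implementation; only discovery is format-specific.
from guppy.extractors import (
    BaseRecordingExtractor,
    CsvRecordingExtractor,
    NpmRecordingExtractor,
)

assert issubclass(CsvRecordingExtractor, BaseRecordingExtractor)
assert issubclass(NpmRecordingExtractor, CsvRecordingExtractor)

folder_path = "/data/npm_session"              # placeholder
output_path = "/data/npm_session_output_1"     # placeholder

# NPM-specific discovery (handles interleaved channels) ...
events, flags = NpmRecordingExtractor.discover_events_and_flags(folder_path, num_ch=2)

# ... while construction, read() and save() are inherited from CsvRecordingExtractor.
extractor = NpmRecordingExtractor(folder_path=folder_path)
output_dicts = extractor.read(events=events, outputPath=output_path)
extractor.save(output_dicts=output_dicts, outputPath=output_path)
```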
diff --git a/src/guppy/extractors/tdt_recording_extractor.py b/src/guppy/extractors/tdt_recording_extractor.py index a877a8b..f65f7a9 100644 --- a/src/guppy/extractors/tdt_recording_extractor.py +++ b/src/guppy/extractors/tdt_recording_extractor.py @@ -1,9 +1,6 @@ import glob import logging -import multiprocessing as mp import os -import time -from itertools import repeat from typing import Any import numpy as np @@ -15,19 +12,6 @@ logger = logging.getLogger(__name__) -def read_and_save_tdt(extractor, event, outputPath): - output_dicts = extractor.read(events=[event], outputPath=outputPath) - extractor.save(output_dicts=output_dicts, outputPath=outputPath) - - -def execute_readtev(folder_path, events, outputPath, numProcesses=mp.cpu_count()): - extractor = TdtRecordingExtractor(folder_path=folder_path) - start = time.time() - with mp.Pool(numProcesses) as p: - p.starmap(read_and_save_tdt, zip(repeat(extractor), events, repeat(outputPath))) - logger.info("Time taken = {0:.5f}".format(time.time() - start)) - - class TdtRecordingExtractor(BaseRecordingExtractor): @classmethod diff --git a/src/guppy/readTevTsq.py b/src/guppy/readTevTsq.py index 2ae0c59..19a0a4a 100755 --- a/src/guppy/readTevTsq.py +++ b/src/guppy/readTevTsq.py @@ -8,10 +8,11 @@ import numpy as np from guppy.extractors import ( - execute_import_csv, - execute_import_doric, - execute_import_npm, - execute_readtev, + CsvRecordingExtractor, + DoricRecordingExtractor, + NpmRecordingExtractor, + TdtRecordingExtractor, + read_and_save_all_events, ) logger = logging.getLogger(__name__) @@ -74,15 +75,19 @@ def readRawData(inputParameters): events = np.unique(storesList[0, :]) if modality == "tdt": - execute_readtev(filepath, events, op, numProcesses) + extractor = TdtRecordingExtractor(folder_path=filepath) elif modality == "doric": - execute_import_doric(filepath, storesList, op) + event_name_to_event_type = {storesList[0, i]: storesList[1, i] for i in range(storesList.shape[1])} + extractor = DoricRecordingExtractor( + folder_path=filepath, event_name_to_event_type=event_name_to_event_type + ) elif modality == "csv": - execute_import_csv(filepath, events, op, numProcesses) + extractor = CsvRecordingExtractor(folder_path=filepath) elif modality == "npm": - execute_import_npm(filepath, events, op, numProcesses) + extractor = NpmRecordingExtractor(folder_path=filepath) else: raise ValueError("Modality not recognized. Please use 'tdt', 'csv', 'doric', or 'npm'.") + read_and_save_all_events(extractor, events, op, numProcesses) writeToFile(str(10 + ((step + 1) * 10)) + "\n") step += 1 From 7e69cc747dfff63d93dd733ff584c6cdbd459b03 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 3 Dec 2025 18:04:56 -0800 Subject: [PATCH 059/150] Removed redundant intermediate common_step3.py. 
--- src/guppy/common_step3.py | 42 --------------------------------------- 1 file changed, 42 deletions(-) delete mode 100644 src/guppy/common_step3.py diff --git a/src/guppy/common_step3.py b/src/guppy/common_step3.py deleted file mode 100644 index 09e763f..0000000 --- a/src/guppy/common_step3.py +++ /dev/null @@ -1,42 +0,0 @@ -import logging -import os - -import h5py -import numpy as np - -logger = logging.getLogger(__name__) - - -# function to write data to a hdf5 file -def write_hdf5(data, event, filepath, key): - - # replacing \\ or / in storenames with _ (to avoid errors while saving data) - event = event.replace("\\", "_") - event = event.replace("/", "_") - - op = os.path.join(filepath, event + ".hdf5") - - # if file does not exist create a new file - if not os.path.exists(op): - with h5py.File(op, "w") as f: - if type(data) is np.ndarray: - f.create_dataset(key, data=data, maxshape=(None,), chunks=True) - else: - f.create_dataset(key, data=data) - - # if file already exists, append data to it or add a new key to it - else: - with h5py.File(op, "r+") as f: - if key in list(f.keys()): - if type(data) is np.ndarray: - f[key].resize(data.shape) - arr = f[key] - arr[:] = data - else: - arr = f[key] - arr = data - else: - if type(data) is np.ndarray: - f.create_dataset(key, data=data, maxshape=(None,), chunks=True) - else: - f.create_dataset(key, data=data) From 792e421ba5c6d22674e6b6558f480524a5f0c461 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 5 Dec 2025 09:27:31 -0800 Subject: [PATCH 060/150] Added Claude code docs to gitignore. --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 0628429..f684eec 100755 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,5 @@ GuPPy/runFiberPhotometryAnalysis.ipynb .clinerules/ testing_data/ + +CLAUDE.md From 60fa0bc67761ed648e08c2944f0da9a413ca5a53 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 5 Dec 2025 11:46:09 -0800 Subject: [PATCH 061/150] Pulled out analysis-specific functions and io_utils from preprocess.py. 
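Because this commit is what makes the numerical core importable on its own, a self-contained sketch of the extracted helpers is included here; the synthetic traces, filter window, and z-score settings are illustrative placeholders, not GuPPy defaults.

```python
# Minimal end-to-end use of the pure analysis helpers moved into guppy/analysis/analysis.py.
import numpy as np

from guppy.analysis.analysis import execute_controlFit_dff, z_score_computation

rng = np.random.default_rng(0)
timestamps = np.arange(0, 60, 0.01)                                 # 100 Hz, 60 s (synthetic)
control = 1.0 + 0.05 * rng.standard_normal(timestamps.size)         # isosbestic-like trace
signal = control + 0.2 * np.exp(-((timestamps - 30.0) ** 2) / 2.0)  # transient riding on control

# Smooth both channels, fit control to signal, and compute dF/F (in percent).
dff, control_fit = execute_controlFit_dff(control, signal, isosbestic_control=True, filter_window=10)

# Convert dF/F to a z-score; the baseline window keys are read even for the standard method.
params = {"zscore_method": "standard z-score", "baselineWindowStart": 0, "baselineWindowEnd": 10}
zscore = z_score_computation(dff, timestamps, params)
```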
--- src/guppy/analysis/__init__.py | 0 src/guppy/analysis/analysis.py | 268 ++++++++++++++++++++ src/guppy/analysis/io_utils.py | 163 ++++++++++++ src/guppy/preprocess.py | 441 +++++---------------------------- step4_data_flow_analysis.md | 348 ++++++++++++++++++++++++++ 5 files changed, 841 insertions(+), 379 deletions(-) create mode 100644 src/guppy/analysis/__init__.py create mode 100644 src/guppy/analysis/analysis.py create mode 100644 src/guppy/analysis/io_utils.py create mode 100644 step4_data_flow_analysis.md diff --git a/src/guppy/analysis/__init__.py b/src/guppy/analysis/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/guppy/analysis/analysis.py b/src/guppy/analysis/analysis.py new file mode 100644 index 0000000..4ec8960 --- /dev/null +++ b/src/guppy/analysis/analysis.py @@ -0,0 +1,268 @@ +import logging + +import numpy as np +from scipy import signal as ss +from scipy.optimize import curve_fit + +from .io_utils import fetchCoords, read_hdf5 + +logger = logging.getLogger(__name__) + + +# Category: Analysis +# Reason: Pure mathematical function for exponential curve fitting - no dependencies, pure computation +# curve fit exponential function +def curveFitFn(x, a, b, c): + return a + (b * np.exp(-(1 / c) * x)) + + +# Category: Analysis +# Reason: Pure algorithmic function - applies Savitzky-Golay filter and curve fitting to generate synthetic control channel +# helper function to create control channel using signal channel +# by curve fitting signal channel to exponential function +# when there is no isosbestic control channel is present +def helper_create_control_channel(signal, timestamps, window): + # check if window is greater than signal shape + if window > signal.shape[0]: + window = ((signal.shape[0] + 1) / 2) + 1 + if window % 2 != 0: + window = window + else: + window = window + 1 + + filtered_signal = ss.savgol_filter(signal, window_length=window, polyorder=3) + + p0 = [5, 50, 60] + + try: + popt, pcov = curve_fit(curveFitFn, timestamps, filtered_signal, p0) + except Exception as e: + logger.error(str(e)) + + # logger.info('Curve Fit Parameters : ', popt) + control = curveFitFn(timestamps, *popt) + + return control + + +# Category: Analysis +# Reason: Data validation function - compares array lengths and returns indices for processing +# function to check control and signal channel has same length +# if not, take a smaller length and do pre-processing +def check_cntrl_sig_length(filepath, channels_arr, storenames, storesList): + + indices = [] + for i in range(channels_arr.shape[1]): + idx_c = np.where(storesList == channels_arr[0, i])[0] + idx_s = np.where(storesList == channels_arr[1, i])[0] + control = read_hdf5(storenames[idx_c[0]], filepath, "data") + signal = read_hdf5(storenames[idx_s[0]], filepath, "data") + if control.shape[0] < signal.shape[0]: + indices.append(storesList[idx_c[0]]) + elif control.shape[0] > signal.shape[0]: + indices.append(storesList[idx_s[0]]) + else: + indices.append(storesList[idx_s[0]]) + + return indices + + +# Category: Analysis +# Reason: Pure algorithmic function - concatenates data chunks based on coordinate boundaries, adjusts timestamps mathematically +# helper function to process control and signal timestamps +def eliminateData(filepath, timeForLightsTurnOn, event, sampling_rate, naming): + + ts = read_hdf5("timeCorrection_" + naming, filepath, "timestampNew") + data = read_hdf5(event, filepath, "data").reshape(-1) + coords = fetchCoords(filepath, naming, ts) + + if (data == 0).all() == True: + data = 
np.zeros(ts.shape[0]) + + arr = np.array([]) + ts_arr = np.array([]) + for i in range(coords.shape[0]): + + index = np.where((ts > coords[i, 0]) & (ts < coords[i, 1]))[0] + + if len(arr) == 0: + arr = np.concatenate((arr, data[index])) + sub = ts[index][0] - timeForLightsTurnOn + new_ts = ts[index] - sub + ts_arr = np.concatenate((ts_arr, new_ts)) + else: + temp = data[index] + # new = temp + (arr[-1]-temp[0]) + temp_ts = ts[index] + new_ts = temp_ts - (temp_ts[0] - ts_arr[-1]) + arr = np.concatenate((arr, temp)) + ts_arr = np.concatenate((ts_arr, new_ts + (1 / sampling_rate))) + + # logger.info(arr.shape, ts_arr.shape) + return arr, ts_arr + + +# Category: Analysis +# Reason: Pure algorithmic function - processes event timestamps based on coordinate boundaries, aligns with data timeline +# helper function to align event timestamps with the control and signal timestamps +def eliminateTs(filepath, timeForLightsTurnOn, event, sampling_rate, naming): + + tsNew = read_hdf5("timeCorrection_" + naming, filepath, "timestampNew") + ts = read_hdf5(event + "_" + naming, filepath, "ts").reshape(-1) + coords = fetchCoords(filepath, naming, tsNew) + + ts_arr = np.array([]) + tsNew_arr = np.array([]) + for i in range(coords.shape[0]): + tsNew_index = np.where((tsNew > coords[i, 0]) & (tsNew < coords[i, 1]))[0] + ts_index = np.where((ts > coords[i, 0]) & (ts < coords[i, 1]))[0] + + if len(tsNew_arr) == 0: + sub = tsNew[tsNew_index][0] - timeForLightsTurnOn + tsNew_arr = np.concatenate((tsNew_arr, tsNew[tsNew_index] - sub)) + ts_arr = np.concatenate((ts_arr, ts[ts_index] - sub)) + else: + temp_tsNew = tsNew[tsNew_index] + temp_ts = ts[ts_index] + new_ts = temp_ts - (temp_tsNew[0] - tsNew_arr[-1]) + new_tsNew = temp_tsNew - (temp_tsNew[0] - tsNew_arr[-1]) + tsNew_arr = np.concatenate((tsNew_arr, new_tsNew + (1 / sampling_rate))) + ts_arr = np.concatenate((ts_arr, new_ts + (1 / sampling_rate))) + + return ts_arr + + +# Category: Analysis +# Reason: Pure algorithmic function - replaces specified data chunks with NaN based on coordinate boundaries +# adding nan values to removed chunks +# when using artifacts removal method - replace with NaN +def addingNaNValues(filepath, event, naming): + + ts = read_hdf5("timeCorrection_" + naming, filepath, "timestampNew") + data = read_hdf5(event, filepath, "data").reshape(-1) + coords = fetchCoords(filepath, naming, ts) + + if (data == 0).all() == True: + data = np.zeros(ts.shape[0]) + + arr = np.array([]) + ts_index = np.arange(ts.shape[0]) + for i in range(coords.shape[0]): + + index = np.where((ts > coords[i, 0]) & (ts < coords[i, 1]))[0] + arr = np.concatenate((arr, index)) + + nan_indices = list(set(ts_index).symmetric_difference(arr)) + data[nan_indices] = np.nan + + return data + + +# Category: Analysis +# Reason: Pure algorithmic function - filters event timestamps to exclude artifact regions based on coordinates +# remove event TTLs which falls in the removed chunks +# when using artifacts removal method - replace with NaN +def removeTTLs(filepath, event, naming): + tsNew = read_hdf5("timeCorrection_" + naming, filepath, "timestampNew") + ts = read_hdf5(event + "_" + naming, filepath, "ts").reshape(-1) + coords = fetchCoords(filepath, naming, tsNew) + + ts_arr = np.array([]) + for i in range(coords.shape[0]): + ts_index = np.where((ts > coords[i, 0]) & (ts < coords[i, 1]))[0] + ts_arr = np.concatenate((ts_arr, ts[ts_index])) + + return ts_arr + + +# Category: Analysis +# Reason: Pure mathematical computation - calculates deltaF/F from signal and control arrays 
with simple formula +# function to compute deltaF/F using fitted control channel and filtered signal channel +def deltaFF(signal, control): + + res = np.subtract(signal, control) + normData = np.divide(res, control) + # deltaFF = normData + normData = normData * 100 + + return normData + + +# Category: Analysis +# Reason: Pure algorithmic function - performs polynomial linear regression to fit control to signal +# function to fit control channel to signal channel +def controlFit(control, signal): + + p = np.polyfit(control, signal, 1) + arr = (p[0] * control) + p[1] + return arr + + +# Category: Analysis +# Reason: Pure signal processing function - applies moving average filter using scipy filtfilt +def filterSignal(filter_window, signal): + if filter_window == 0: + return signal + elif filter_window > 1: + b = np.divide(np.ones((filter_window,)), filter_window) + a = 1 + filtered_signal = ss.filtfilt(b, a, signal) + return filtered_signal + else: + raise Exception("Moving average filter window value is not correct.") + + +# Category: Routing +# Reason: Orchestrates signal processing pipeline - sequences calls to filterSignal, controlFit, and deltaFF with conditional logic +# function to filter control and signal channel, also execute above two function : controlFit and deltaFF +# function will also take care if there is only signal channel and no control channel +# if there is only signal channel, z-score will be computed using just signal channel +def execute_controlFit_dff(control, signal, isosbestic_control, filter_window): + + if isosbestic_control == False: + signal_smooth = filterSignal(filter_window, signal) # ss.filtfilt(b, a, signal) + control_fit = controlFit(control, signal_smooth) + norm_data = deltaFF(signal_smooth, control_fit) + else: + control_smooth = filterSignal(filter_window, control) # ss.filtfilt(b, a, control) + signal_smooth = filterSignal(filter_window, signal) # ss.filtfilt(b, a, signal) + control_fit = controlFit(control_smooth, signal_smooth) + norm_data = deltaFF(signal_smooth, control_fit) + + return norm_data, control_fit + + +# Category: Analysis +# Reason: Pure statistical computation - calculates z-score from deltaF/F using different methods (standard, baseline, robust) +# function to compute z-score based on z-score computation method +def z_score_computation(dff, timestamps, inputParameters): + + zscore_method = inputParameters["zscore_method"] + baseline_start, baseline_end = inputParameters["baselineWindowStart"], inputParameters["baselineWindowEnd"] + + if zscore_method == "standard z-score": + numerator = np.subtract(dff, np.nanmean(dff)) + zscore = np.divide(numerator, np.nanstd(dff)) + elif zscore_method == "baseline z-score": + idx = np.where((timestamps > baseline_start) & (timestamps < baseline_end))[0] + if idx.shape[0] == 0: + logger.error( + "Baseline Window Parameters for baseline z-score computation zscore_method \ + are not correct." + ) + raise Exception( + "Baseline Window Parameters for baseline z-score computation zscore_method \ + are not correct." 
+ ) + else: + baseline_mean = np.nanmean(dff[idx]) + baseline_std = np.nanstd(dff[idx]) + numerator = np.subtract(dff, baseline_mean) + zscore = np.divide(numerator, baseline_std) + else: + median = np.median(dff) + mad = np.median(np.abs(dff - median)) + numerator = 0.6745 * (dff - median) + zscore = np.divide(numerator, mad) + + return zscore diff --git a/src/guppy/analysis/io_utils.py b/src/guppy/analysis/io_utils.py new file mode 100644 index 0000000..33b6650 --- /dev/null +++ b/src/guppy/analysis/io_utils.py @@ -0,0 +1,163 @@ +import fnmatch +import glob +import logging +import os +import re + +import h5py +import numpy as np + +logger = logging.getLogger(__name__) + + +# Category: Analysis +# Reason: Utility function for path filtering - pure data transformation with no GUI or orchestration +def takeOnlyDirs(paths): + removePaths = [] + for p in paths: + if os.path.isfile(p): + removePaths.append(p) + return list(set(paths) - set(removePaths)) + + +# Category: Analysis +# Reason: File system utility for case-insensitive file discovery - pure I/O helper with no orchestration +# find files by ignoring the case sensitivity +def find_files(path, glob_path, ignore_case=False): + rule = ( + re.compile(fnmatch.translate(glob_path), re.IGNORECASE) + if ignore_case + else re.compile(fnmatch.translate(glob_path)) + ) + + no_bytes_path = os.listdir(os.path.expanduser(path)) + str_path = [] + + # converting byte object to string + for x in no_bytes_path: + try: + str_path.append(x.decode("utf-8")) + except: + str_path.append(x) + return [os.path.join(path, n) for n in str_path if rule.match(n)] + + +# Category: Analysis +# Reason: Simple file type detection utility - pure file system check with no orchestration +# check if dealing with TDT files or csv files +def check_TDT(filepath): + path = glob.glob(os.path.join(filepath, "*.tsq")) + if len(path) > 0: + return True + else: + return False + + +# Category: Analysis +# Reason: I/O utility function for reading HDF5 files - pure file access with no business logic or orchestration +# function to read hdf5 file +def read_hdf5(event, filepath, key): + if event: + event = event.replace("\\", "_") + event = event.replace("/", "_") + op = os.path.join(filepath, event + ".hdf5") + else: + op = filepath + + if os.path.exists(op): + with h5py.File(op, "r") as f: + arr = np.asarray(f[key]) + else: + logger.error(f"{event}.hdf5 file does not exist") + raise Exception("{}.hdf5 file does not exist".format(event)) + + return arr + + +# Category: Analysis +# Reason: I/O utility function for writing HDF5 files - pure file access with no business logic or orchestration +# function to write hdf5 file +def write_hdf5(data, event, filepath, key): + event = event.replace("\\", "_") + event = event.replace("/", "_") + op = os.path.join(filepath, event + ".hdf5") + + # if file does not exist create a new file + if not os.path.exists(op): + with h5py.File(op, "w") as f: + if type(data) is np.ndarray: + f.create_dataset(key, data=data, maxshape=(None,), chunks=True) + else: + f.create_dataset(key, data=data) + + # if file already exists, append data to it or add a new key to it + else: + with h5py.File(op, "r+") as f: + if key in list(f.keys()): + if type(data) is np.ndarray: + f[key].resize(data.shape) + arr = f[key] + arr[:] = data + else: + arr = f[key] + arr = data + else: + if type(data) is np.ndarray: + f.create_dataset(key, data=data, maxshape=(None,), chunks=True) + else: + f.create_dataset(key, data=data) + + +# Category: Analysis +# Reason: Validation utility - 
checks file naming conventions and returns structured path array with no orchestration +# function to check if the naming convention for saving storeslist file was followed or not +def decide_naming_convention(filepath): + path_1 = find_files(filepath, "control_*", ignore_case=True) # glob.glob(os.path.join(filepath, 'control*')) + + path_2 = find_files(filepath, "signal_*", ignore_case=True) # glob.glob(os.path.join(filepath, 'signal*')) + + path = sorted(path_1 + path_2, key=str.casefold) + if len(path) % 2 != 0: + logger.error("There are not equal number of Control and Signal data") + raise Exception("There are not equal number of Control and Signal data") + + path = np.asarray(path).reshape(2, -1) + + return path + + +# Category: Analysis +# Reason: I/O utility that loads artifact coordinates from .npy file or provides default - pure file loading with simple logic +# function to read coordinates file which was saved by selecting chunks for artifacts removal +def fetchCoords(filepath, naming, data): + + path = os.path.join(filepath, "coordsForPreProcessing_" + naming + ".npy") + + if not os.path.exists(path): + coords = np.array([0, data[-1]]) + else: + coords = np.load(os.path.join(filepath, "coordsForPreProcessing_" + naming + ".npy"))[:, 0] + + if coords.shape[0] % 2 != 0: + logger.error("Number of values in coordsForPreProcessing file is not even.") + raise Exception("Number of values in coordsForPreProcessing file is not even.") + + coords = coords.reshape(-1, 2) + + return coords + + +# Category: Routing +# Reason: Organizes output folders for data combination - loops through numbered outputs and groups related folders +def get_all_stores_for_combining_data(folderNames): + op = [] + for i in range(100): + temp = [] + match = r"[\s\S]*" + "_output_" + str(i) + for j in folderNames: + temp.append(re.findall(match, j)) + temp = sorted(list(np.concatenate(temp).flatten()), key=str.casefold) + if len(temp) > 0: + op.append(temp) + + return op diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index 8b79039..69616d9 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -1,95 +1,52 @@ -import fnmatch import glob import json import logging import os -import re import shutil import sys -import h5py import matplotlib.pyplot as plt import numpy as np import pandas as pd -from scipy import signal as ss -from scipy.optimize import curve_fit +from .analysis.analysis import ( + addingNaNValues, + check_cntrl_sig_length, + eliminateData, + eliminateTs, + execute_controlFit_dff, + helper_create_control_channel, + removeTTLs, + z_score_computation, +) +from .analysis.io_utils import ( + check_TDT, + decide_naming_convention, + fetchCoords, + find_files, + get_all_stores_for_combining_data, + read_hdf5, + takeOnlyDirs, + write_hdf5, +) from .combineDataFn import processTimestampsForCombiningData logger = logging.getLogger(__name__) -logger = logging.getLogger(__name__) - # Only set matplotlib backend if not in CI environment if not os.getenv("CI"): plt.switch_backend("TKAgg") -def takeOnlyDirs(paths): - removePaths = [] - for p in paths: - if os.path.isfile(p): - removePaths.append(p) - return list(set(paths) - set(removePaths)) - - +# Category: Visualization/User Input +# Reason: Writes progress updates to file for GUI progress bar - couples backend to GUI feedback mechanism def writeToFile(value: str): with open(os.path.join(os.path.expanduser("~"), "pbSteps.txt"), "a") as file: file.write(value) -# find files by ignoring the case sensitivity -def find_files(path, 
glob_path, ignore_case=False): - rule = ( - re.compile(fnmatch.translate(glob_path), re.IGNORECASE) - if ignore_case - else re.compile(fnmatch.translate(glob_path)) - ) - - no_bytes_path = os.listdir(os.path.expanduser(path)) - str_path = [] - - # converting byte object to string - for x in no_bytes_path: - try: - str_path.append(x.decode("utf-8")) - except: - str_path.append(x) - return [os.path.join(path, n) for n in str_path if rule.match(n)] - - -# curve fit exponential function -def curveFitFn(x, a, b, c): - return a + (b * np.exp(-(1 / c) * x)) - - -# helper function to create control channel using signal channel -# by curve fitting signal channel to exponential function -# when there is no isosbestic control channel is present -def helper_create_control_channel(signal, timestamps, window): - # check if window is greater than signal shape - if window > signal.shape[0]: - window = ((signal.shape[0] + 1) / 2) + 1 - if window % 2 != 0: - window = window - else: - window = window + 1 - - filtered_signal = ss.savgol_filter(signal, window_length=window, polyorder=3) - - p0 = [5, 50, 60] - - try: - popt, pcov = curve_fit(curveFitFn, timestamps, filtered_signal, p0) - except Exception as e: - logger.error(str(e)) - - # logger.info('Curve Fit Parameters : ', popt) - control = curveFitFn(timestamps, *popt) - - return control - - +# Category: Routing +# Reason: Orchestrates reading HDF5 files, calling helper_create_control_channel, and writing results - coordinates I/O with computation # main function to create control channel using # signal channel and save it to a file def create_control_channel(filepath, arr, window=5001): @@ -116,6 +73,8 @@ def create_control_channel(filepath, arr, window=5001): logger.info("Control channel from signal channel created using curve-fitting") +# Category: Routing +# Reason: Orchestrates validation logic, file copying, and storesList updates - coordinates multiple operations and file manipulations # function to add control channel when there is no # isosbestic control channel and update the storeslist file def add_control_channel(filepath, arr): @@ -162,86 +121,8 @@ def add_control_channel(filepath, arr): return arr -# check if dealing with TDT files or csv files -def check_TDT(filepath): - path = glob.glob(os.path.join(filepath, "*.tsq")) - if len(path) > 0: - return True - else: - return False - - -# function to read hdf5 file -def read_hdf5(event, filepath, key): - if event: - event = event.replace("\\", "_") - event = event.replace("/", "_") - op = os.path.join(filepath, event + ".hdf5") - else: - op = filepath - - if os.path.exists(op): - with h5py.File(op, "r") as f: - arr = np.asarray(f[key]) - else: - logger.error(f"{event}.hdf5 file does not exist") - raise Exception("{}.hdf5 file does not exist".format(event)) - - return arr - - -# function to write hdf5 file -def write_hdf5(data, event, filepath, key): - event = event.replace("\\", "_") - event = event.replace("/", "_") - op = os.path.join(filepath, event + ".hdf5") - - # if file does not exist create a new file - if not os.path.exists(op): - with h5py.File(op, "w") as f: - if type(data) is np.ndarray: - f.create_dataset(key, data=data, maxshape=(None,), chunks=True) - else: - f.create_dataset(key, data=data) - - # if file already exists, append data to it or add a new key to it - else: - with h5py.File(op, "r+") as f: - if key in list(f.keys()): - if type(data) is np.ndarray: - f[key].resize(data.shape) - arr = f[key] - arr[:] = data - else: - arr = f[key] - arr = data - else: - if type(data) is 
np.ndarray: - f.create_dataset(key, data=data, maxshape=(None,), chunks=True) - else: - f.create_dataset(key, data=data) - - -# function to check control and signal channel has same length -# if not, take a smaller length and do pre-processing -def check_cntrl_sig_length(filepath, channels_arr, storenames, storesList): - - indices = [] - for i in range(channels_arr.shape[1]): - idx_c = np.where(storesList == channels_arr[0, i])[0] - idx_s = np.where(storesList == channels_arr[1, i])[0] - control = read_hdf5(storenames[idx_c[0]], filepath, "data") - signal = read_hdf5(storenames[idx_s[0]], filepath, "data") - if control.shape[0] < signal.shape[0]: - indices.append(storesList[idx_c[0]]) - elif control.shape[0] > signal.shape[0]: - indices.append(storesList[idx_s[0]]) - else: - indices.append(storesList[idx_s[0]]) - - return indices - - +# Category: Routing +# Reason: Orchestrates timestamp correction workflow - loops through stores, coordinates reading/writing, calls validation and correction logic # function to correct timestamps after eliminating first few seconds of the data (for csv data) def timestampCorrection_csv(filepath, timeForLightsTurnOn, storesList): @@ -292,6 +173,8 @@ def timestampCorrection_csv(filepath, timeForLightsTurnOn, storesList): logger.info("Timestamps corrected and converted to seconds.") +# Category: Routing +# Reason: Orchestrates timestamp correction workflow for TDT format - loops through stores, coordinates timestamp expansion algorithm with I/O # function to correct timestamps after eliminating first few seconds of the data (for TDT data) def timestampCorrection_tdt(filepath, timeForLightsTurnOn, storesList): @@ -354,6 +237,8 @@ def timestampCorrection_tdt(filepath, timeForLightsTurnOn, storesList): # return timeRecStart, correctionIndex, timestampNew +# Category: Routing +# Reason: Orchestrates applying timestamp corrections - reads correction indices, applies different logic based on data type, writes results # function to apply correction to control, signal and event timestamps def applyCorrection(filepath, timeForLightsTurnOn, event, displayName, naming): @@ -395,6 +280,8 @@ def applyCorrection(filepath, timeForLightsTurnOn, event, displayName, naming): # write_hdf5(control, displayName, filepath, 'data') +# Category: Routing +# Reason: Orchestrates naming validation and correction application - loops through channel pairs and delegates to applyCorrection # function to check if naming convention was followed while saving storeslist file # and apply timestamps correction using the function applyCorrection def decide_naming_convention_and_applyCorrection(filepath, timeForLightsTurnOn, event, displayName, storesList): @@ -423,6 +310,8 @@ def decide_naming_convention_and_applyCorrection(filepath, timeForLightsTurnOn, logger.info("Timestamps corrections applied to the data and event timestamps.") +# Category: Visualization/User Input +# Reason: Creates matplotlib plots to display z-score results - pure visualization with no computation # function to plot z_score def visualize_z_score(filepath): @@ -445,6 +334,8 @@ def visualize_z_score(filepath): # plt.show() +# Category: Visualization/User Input +# Reason: Creates matplotlib plots to display deltaF/F results - pure visualization with no computation # function to plot deltaF/F def visualize_dff(filepath): name = os.path.basename(filepath) @@ -466,6 +357,8 @@ def visualize_dff(filepath): # plt.show() +# Category: Visualization/User Input +# Reason: Interactive matplotlib GUI with keyboard event handlers for 
artifact selection - core user input mechanism that saves coordinates to disk def visualize(filepath, x, y1, y2, y3, plot_name, removeArtifacts): # plotting control and signal data @@ -555,6 +448,8 @@ def plt_close_event(event): # return fig +# Category: Visualization/User Input +# Reason: Orchestrates visualization of all control/signal pairs - reads data and delegates to visualize() for user interaction # function to plot control and signal, also provide a feature to select chunks for artifacts removal def visualizeControlAndSignal(filepath, removeArtifacts): path_1 = find_files(filepath, "control_*", ignore_case=True) # glob.glob(os.path.join(filepath, 'control*')) @@ -590,141 +485,8 @@ def visualizeControlAndSignal(filepath, removeArtifacts): visualize(filepath, ts, control, signal, cntrl_sig_fit, plot_name, removeArtifacts) -# function to check if the naming convention for saving storeslist file was followed or not -def decide_naming_convention(filepath): - path_1 = find_files(filepath, "control_*", ignore_case=True) # glob.glob(os.path.join(filepath, 'control*')) - - path_2 = find_files(filepath, "signal_*", ignore_case=True) # glob.glob(os.path.join(filepath, 'signal*')) - - path = sorted(path_1 + path_2, key=str.casefold) - if len(path) % 2 != 0: - logger.error("There are not equal number of Control and Signal data") - raise Exception("There are not equal number of Control and Signal data") - - path = np.asarray(path).reshape(2, -1) - - return path - - -# function to read coordinates file which was saved by selecting chunks for artifacts removal -def fetchCoords(filepath, naming, data): - - path = os.path.join(filepath, "coordsForPreProcessing_" + naming + ".npy") - - if not os.path.exists(path): - coords = np.array([0, data[-1]]) - else: - coords = np.load(os.path.join(filepath, "coordsForPreProcessing_" + naming + ".npy"))[:, 0] - - if coords.shape[0] % 2 != 0: - logger.error("Number of values in coordsForPreProcessing file is not even.") - raise Exception("Number of values in coordsForPreProcessing file is not even.") - - coords = coords.reshape(-1, 2) - - return coords - - -# helper function to process control and signal timestamps -def eliminateData(filepath, timeForLightsTurnOn, event, sampling_rate, naming): - - ts = read_hdf5("timeCorrection_" + naming, filepath, "timestampNew") - data = read_hdf5(event, filepath, "data").reshape(-1) - coords = fetchCoords(filepath, naming, ts) - - if (data == 0).all() == True: - data = np.zeros(ts.shape[0]) - - arr = np.array([]) - ts_arr = np.array([]) - for i in range(coords.shape[0]): - - index = np.where((ts > coords[i, 0]) & (ts < coords[i, 1]))[0] - - if len(arr) == 0: - arr = np.concatenate((arr, data[index])) - sub = ts[index][0] - timeForLightsTurnOn - new_ts = ts[index] - sub - ts_arr = np.concatenate((ts_arr, new_ts)) - else: - temp = data[index] - # new = temp + (arr[-1]-temp[0]) - temp_ts = ts[index] - new_ts = temp_ts - (temp_ts[0] - ts_arr[-1]) - arr = np.concatenate((arr, temp)) - ts_arr = np.concatenate((ts_arr, new_ts + (1 / sampling_rate))) - - # logger.info(arr.shape, ts_arr.shape) - return arr, ts_arr - - -# helper function to align event timestamps with the control and signal timestamps -def eliminateTs(filepath, timeForLightsTurnOn, event, sampling_rate, naming): - - tsNew = read_hdf5("timeCorrection_" + naming, filepath, "timestampNew") - ts = read_hdf5(event + "_" + naming, filepath, "ts").reshape(-1) - coords = fetchCoords(filepath, naming, tsNew) - - ts_arr = np.array([]) - tsNew_arr = np.array([]) - for i in 
range(coords.shape[0]): - tsNew_index = np.where((tsNew > coords[i, 0]) & (tsNew < coords[i, 1]))[0] - ts_index = np.where((ts > coords[i, 0]) & (ts < coords[i, 1]))[0] - - if len(tsNew_arr) == 0: - sub = tsNew[tsNew_index][0] - timeForLightsTurnOn - tsNew_arr = np.concatenate((tsNew_arr, tsNew[tsNew_index] - sub)) - ts_arr = np.concatenate((ts_arr, ts[ts_index] - sub)) - else: - temp_tsNew = tsNew[tsNew_index] - temp_ts = ts[ts_index] - new_ts = temp_ts - (temp_tsNew[0] - tsNew_arr[-1]) - new_tsNew = temp_tsNew - (temp_tsNew[0] - tsNew_arr[-1]) - tsNew_arr = np.concatenate((tsNew_arr, new_tsNew + (1 / sampling_rate))) - ts_arr = np.concatenate((ts_arr, new_ts + (1 / sampling_rate))) - - return ts_arr - - -# adding nan values to removed chunks -# when using artifacts removal method - replace with NaN -def addingNaNValues(filepath, event, naming): - - ts = read_hdf5("timeCorrection_" + naming, filepath, "timestampNew") - data = read_hdf5(event, filepath, "data").reshape(-1) - coords = fetchCoords(filepath, naming, ts) - - if (data == 0).all() == True: - data = np.zeros(ts.shape[0]) - - arr = np.array([]) - ts_index = np.arange(ts.shape[0]) - for i in range(coords.shape[0]): - - index = np.where((ts > coords[i, 0]) & (ts < coords[i, 1]))[0] - arr = np.concatenate((arr, index)) - - nan_indices = list(set(ts_index).symmetric_difference(arr)) - data[nan_indices] = np.nan - - return data - - -# remove event TTLs which falls in the removed chunks -# when using artifacts removal method - replace with NaN -def removeTTLs(filepath, event, naming): - tsNew = read_hdf5("timeCorrection_" + naming, filepath, "timestampNew") - ts = read_hdf5(event + "_" + naming, filepath, "ts").reshape(-1) - coords = fetchCoords(filepath, naming, tsNew) - - ts_arr = np.array([]) - for i in range(coords.shape[0]): - ts_index = np.where((ts > coords[i, 0]) & (ts < coords[i, 1]))[0] - ts_arr = np.concatenate((ts_arr, ts[ts_index])) - - return ts_arr - - +# Category: Routing +# Reason: Orchestrates NaN replacement for all stores - loops through channels and coordinates calls to addingNaNValues and removeTTLs def addingNaNtoChunksWithArtifacts(filepath, events): logger.debug("Replacing chunks with artifacts by NaN values.") @@ -759,6 +521,8 @@ def addingNaNtoChunksWithArtifacts(filepath, events): logger.info("Chunks with artifacts are replaced by NaN values.") +# Category: Routing +# Reason: Orchestrates timestamp concatenation for artifact removal - loops through stores, coordinates eliminateData/eliminateTs calls and writes results # main function to align timestamps for control, signal and event timestamps for artifacts removal def processTimestampsForArtifacts(filepath, timeForLightsTurnOn, events): @@ -800,89 +564,8 @@ def processTimestampsForArtifacts(filepath, timeForLightsTurnOn, events): logger.info("Timestamps processed, artifacts are removed and good chunks are concatenated.") -# function to compute deltaF/F using fitted control channel and filtered signal channel -def deltaFF(signal, control): - - res = np.subtract(signal, control) - normData = np.divide(res, control) - # deltaFF = normData - normData = normData * 100 - - return normData - - -# function to fit control channel to signal channel -def controlFit(control, signal): - - p = np.polyfit(control, signal, 1) - arr = (p[0] * control) + p[1] - return arr - - -def filterSignal(filter_window, signal): - if filter_window == 0: - return signal - elif filter_window > 1: - b = np.divide(np.ones((filter_window,)), filter_window) - a = 1 - filtered_signal = 
ss.filtfilt(b, a, signal) - return filtered_signal - else: - raise Exception("Moving average filter window value is not correct.") - - -# function to filter control and signal channel, also execute above two function : controlFit and deltaFF -# function will also take care if there is only signal channel and no control channel -# if there is only signal channel, z-score will be computed using just signal channel -def execute_controlFit_dff(control, signal, isosbestic_control, filter_window): - - if isosbestic_control == False: - signal_smooth = filterSignal(filter_window, signal) # ss.filtfilt(b, a, signal) - control_fit = controlFit(control, signal_smooth) - norm_data = deltaFF(signal_smooth, control_fit) - else: - control_smooth = filterSignal(filter_window, control) # ss.filtfilt(b, a, control) - signal_smooth = filterSignal(filter_window, signal) # ss.filtfilt(b, a, signal) - control_fit = controlFit(control_smooth, signal_smooth) - norm_data = deltaFF(signal_smooth, control_fit) - - return norm_data, control_fit - - -# function to compute z-score based on z-score computation method -def z_score_computation(dff, timestamps, inputParameters): - - zscore_method = inputParameters["zscore_method"] - baseline_start, baseline_end = inputParameters["baselineWindowStart"], inputParameters["baselineWindowEnd"] - - if zscore_method == "standard z-score": - numerator = np.subtract(dff, np.nanmean(dff)) - zscore = np.divide(numerator, np.nanstd(dff)) - elif zscore_method == "baseline z-score": - idx = np.where((timestamps > baseline_start) & (timestamps < baseline_end))[0] - if idx.shape[0] == 0: - logger.error( - "Baseline Window Parameters for baseline z-score computation zscore_method \ - are not correct." - ) - raise Exception( - "Baseline Window Parameters for baseline z-score computation zscore_method \ - are not correct." 
- ) - else: - baseline_mean = np.nanmean(dff[idx]) - baseline_std = np.nanstd(dff[idx]) - numerator = np.subtract(dff, baseline_mean) - zscore = np.divide(numerator, baseline_std) - else: - median = np.median(dff) - mad = np.median(np.abs(dff - median)) - numerator = 0.6745 * (dff - median) - zscore = np.divide(numerator, mad) - - return zscore - - +# Category: Routing +# Reason: Orchestrates z-score computation for one channel - handles artifact removal logic, coordinates calls to execute_controlFit_dff and z_score_computation # helper function to compute z-score and deltaF/F def helper_z_score(control, signal, filepath, name, inputParameters): # helper_z_score(control_smooth, signal_smooth): @@ -957,6 +640,8 @@ def helper_z_score(control, signal, filepath, name, inputParameters): # helper_ return z_score_arr, norm_data_arr, control_fit_arr +# Category: Routing +# Reason: Orchestrates z-score computation for all channels in a session - loops through control/signal pairs, calls helper_z_score, writes results # compute z-score and deltaF/F and save it to hdf5 file def compute_z_score(filepath, inputParameters): @@ -1005,6 +690,8 @@ def compute_z_score(filepath, inputParameters): logger.info(f"z-score for the data in {filepath} computed.") +# Category: Routing +# Reason: Top-level orchestrator for timestamp correction across all sessions - loops through folders, coordinates timestamp correction workflow # function to execute timestamps corrections using functions timestampCorrection and decide_naming_convention_and_applyCorrection def execute_timestamp_correction(folderNames, inputParameters): @@ -1044,6 +731,8 @@ def execute_timestamp_correction(folderNames, inputParameters): logger.info(f"Timestamps corrections finished for {filepath}") +# Category: Routing +# Reason: Orchestrates reading and merging storeslist files from multiple sessions - loops through folders and consolidates results # for combining data, reading storeslist file from both data and create a new storeslist array def check_storeslistfile(folderNames): storesList = np.array([[], []]) @@ -1065,20 +754,8 @@ def check_storeslistfile(folderNames): return storesList -def get_all_stores_for_combining_data(folderNames): - op = [] - for i in range(100): - temp = [] - match = r"[\s\S]*" + "_output_" + str(i) - for j in folderNames: - temp.append(re.findall(match, j)) - temp = sorted(list(np.concatenate(temp).flatten()), key=str.casefold) - if len(temp) > 0: - op.append(temp) - - return op - - +# Category: Routing +# Reason: Orchestrates data combination workflow - validates sampling rates, coordinates processTimestampsForCombiningData, manages multi-session I/O # function to combine data when there are two different data files for the same recording session # it will combine the data, do timestamps processing and save the combined data in the first output folder. 
def combineData(folderNames, inputParameters, storesList): @@ -1123,6 +800,8 @@ def combineData(folderNames, inputParameters, storesList): return op +# Category: Routing +# Reason: Top-level orchestrator for z-score computation and artifact removal - coordinates compute_z_score, artifact processing, and visualization calls # function to compute z-score and deltaF/F using functions : compute_z_score and/or processTimestampsForArtifacts def execute_zscore(folderNames, inputParameters): @@ -1175,6 +854,8 @@ def execute_zscore(folderNames, inputParameters): logger.info("Signal data and event timestamps are extracted.") +# Category: Routing +# Reason: Main entry point for Step 4 - orchestrates entire preprocessing workflow including timestamp correction, data combination, and z-score computation def extractTsAndSignal(inputParameters): logger.debug("Extracting signal data and event timestamps...") @@ -1212,6 +893,8 @@ def extractTsAndSignal(inputParameters): execute_zscore(op_folder, inputParameters) +# Category: Routing +# Reason: Top-level entry point wrapper - handles error catching and calls extractTsAndSignal def main(input_parameters): try: extractTsAndSignal(input_parameters) diff --git a/step4_data_flow_analysis.md b/step4_data_flow_analysis.md new file mode 100644 index 0000000..d86e938 --- /dev/null +++ b/step4_data_flow_analysis.md @@ -0,0 +1,348 @@ +# Step 4 (preprocess.py) Data Flow Analysis + +## Overview + +Step 4 processes timestamp-corrected photometry data and computes normalized signals (ΔF/F and z-scores). It handles artifact removal, data combination from multiple sessions, and generates quality control visualizations. + +## High-Level Data Flow + +```mermaid +flowchart TD + A[Entry: extractTsAndSignal] --> B{combine_data?} + + B -->|False| C[execute_timestamp_correction] + B -->|True| D[execute_timestamp_correction] + + C --> E[execute_zscore] + + D --> F[check_storeslistfile] + F --> G[combineData] + G --> H[execute_zscore] + + E --> I[Output: z_score, dff, cntrl_sig_fit HDF5 files] + H --> I + + style A fill:#e1f5ff + style I fill:#d4edda +``` + +## Main Processing Paths + +### Entry Point +**`extractTsAndSignal(inputParameters)`** (line 1178) is the main entry point called by the GUI or API. + +### Path 1: Normal Processing (combine_data = False) +1. `execute_timestamp_correction()` → Correct timestamps and align data +2. `execute_zscore()` → Compute z-scores and ΔF/F + +### Path 2: Combined Data Processing (combine_data = True) +1. `execute_timestamp_correction()` → Correct timestamps for each file +2. `check_storeslistfile()` → Merge store lists from multiple files +3. `combineData()` → Combine data from multiple recording sessions +4. 
`execute_zscore()` → Compute z-scores and ΔF/F on combined data + +## Detailed Processing Stages + +### Stage 1: Timestamp Correction + +```mermaid +flowchart LR + A[Raw HDF5 files] --> B[Read storesList.csv] + B --> C{isosbestic_control?} + C -->|No| D[add_control_channel] + C -->|Yes| E[timestampCorrection_tdt/csv] + D --> E + E --> F[Eliminate first N seconds] + F --> G[decide_naming_convention_and_applyCorrection] + G --> H[applyCorrection for each store] + H --> I{isosbestic_control?} + I -->|No| J[create_control_channel via curve fitting] + I -->|Yes| K[timeCorrection_*.hdf5 files] + J --> K + + style A fill:#e1f5ff + style K fill:#d4edda +``` + +#### Function: `execute_timestamp_correction(folderNames, inputParameters)` + +**Input:** +- Raw HDF5 files from extractors: `control_*.hdf5`, `signal_*.hdf5`, `event_*.hdf5` + +**Process:** +1. For each session folder: + - Read `storesList.csv` (mapping of raw names to semantic names) + - If no isosbestic control: `add_control_channel()` creates placeholder control files + - **`timestampCorrection_tdt()`** or **`timestampCorrection_csv()`**: + - Eliminates first N seconds (`timeForLightsTurnOn`) + - For TDT: expands timestamps from block timestamps + sampling rate + - For CSV: uses timestamps as-is + - Writes `timeCorrection_*.hdf5` with keys: `timestampNew`, `correctionIndex`, `sampling_rate` + - **`decide_naming_convention_and_applyCorrection()`**: + - For each store, calls `applyCorrection()` to crop data using `correctionIndex` + - For control/signal channels: crops data arrays + - For event channels: subtracts time offset from timestamps + - If no isosbestic control: **`create_control_channel()`** generates synthetic control via curve fitting + +**Output:** +- Timestamp-corrected HDF5 files with trimmed data +- `timeCorrection_*.hdf5` files containing corrected timestamps + +### Stage 2: Z-Score Computation + +```mermaid +flowchart TD + A[Timestamp-corrected HDF5] --> B[compute_z_score] + B --> C{removeArtifacts?} + + C -->|No| D[helper_z_score: full data] + C -->|Yes| E[helper_z_score: chunk-by-chunk] + + D --> F[filterSignal] + E --> F + + F --> G[controlFit: linear regression] + G --> H[deltaFF: compute ΔF/F] + H --> I[z_score_computation] + + I --> J{removeArtifacts?} + + J -->|No| K[Write z_score, dff, cntrl_sig_fit] + J -->|Yes| L{artifactsRemovalMethod?} + + L -->|concatenate| M[processTimestampsForArtifacts] + L -->|NaN| N[addingNaNtoChunksWithArtifacts] + + M --> K + N --> K + + K --> O[visualizeControlAndSignal] + + style A fill:#e1f5ff + style K fill:#d4edda + style O fill:#fff3cd +``` + +#### Function: `execute_zscore(folderNames, inputParameters)` + +**Input:** +- Timestamp-corrected HDF5 files + +**Process:** +1. 
For each output folder: + + **`compute_z_score(filepath, inputParameters)`**: + - For each control/signal pair: + - **`helper_z_score(control, signal, filepath, name, inputParameters)`**: + + **Without artifacts removal:** + - `execute_controlFit_dff()`: Filter signals → fit control to signal → compute ΔF/F + - `z_score_computation()`: Compute z-score from ΔF/F + + **With artifacts removal:** + - For each user-selected chunk (from `coordsForPreProcessing_*.npy`): + - If no isosbestic: `helper_create_control_channel()` creates synthetic control + - `execute_controlFit_dff()` on chunk + - Concatenate or NaN-fill between chunks + - `z_score_computation()` on processed data + + - Writes: `z_score_*.hdf5`, `dff_*.hdf5`, `cntrl_sig_fit_*.hdf5` + + **If artifacts removal with concatenate method:** + - **`processTimestampsForArtifacts()`**: + - `eliminateData()`: Concatenates good chunks, adjusts timestamps to be continuous + - `eliminateTs()`: Aligns event timestamps with new timeline + - Overwrites data files with concatenated versions + + **If artifacts removal with NaN method:** + - **`addingNaNtoChunksWithArtifacts()`**: + - `addingNaNValues()`: Replaces bad chunks with NaN + - `removeTTLs()`: Filters event timestamps to keep only valid times + + - **`visualizeControlAndSignal()`**: Plots control, signal, cntrl_sig_fit for QC + +**Output:** +- `z_score_*.hdf5` (z-scored signal) +- `dff_*.hdf5` (ΔF/F) +- `cntrl_sig_fit_*.hdf5` (fitted control channel) + +## Key Data Transformations + +### Signal Processing Pipeline + +```mermaid +flowchart LR + A[Raw Signal] --> B[filterSignal: Moving Average] + C[Raw Control] --> D[filterSignal: Moving Average] + + B --> E[controlFit: Linear Regression] + D --> E + + E --> F[control_fit = p0*control + p1] + F --> G[deltaFF] + + B --> G + + G --> H[ΔF/F = signal - control_fit / control_fit * 100] + H --> I[z_score_computation] + + I --> J{zscore_method?} + J -->|standard| K[z = ΔF/F - mean / std] + J -->|baseline| L[z = ΔF/F - baseline_mean / baseline_std] + J -->|robust| M[z = 0.6745 * ΔF/F - median / MAD] + + K --> N[Z-Score Output] + L --> N + M --> N + + style A fill:#e1f5ff + style C fill:#e1f5ff + style N fill:#d4edda +``` + +### Transformation Functions + +1. **`filterSignal(filter_window, signal)`** (line 822) + - Applies moving average filter with configurable window + - Uses `scipy.signal.filtfilt` for zero-phase filtering + +2. **`controlFit(control, signal)`** (line 815) + - Linear regression: fits control to signal + - Returns: `fitted_control = p[0] * control + p[1]` + +3. **`deltaFF(signal, control)`** (line 804) + - Formula: `((signal - control) / control) * 100` + - Computes normalized fluorescence change + +4. 
**`z_score_computation(dff, timestamps, inputParameters)`** (line 853) + - **Standard z-score:** `(ΔF/F - mean(ΔF/F)) / std(ΔF/F)` + - **Baseline z-score:** `(ΔF/F - mean(baseline)) / std(baseline)` + - **Robust z-score:** `0.6745 * (ΔF/F - median) / MAD` + +## Artifact Removal Workflow + +### Interactive Artifact Selection + +The `visualize()` function (line 469) provides an interactive matplotlib plot: +- **Space key:** Mark artifact boundary (vertical line drawn) +- **'d' key:** Delete last marked boundary +- **Close plot:** Save coordinates to `coordsForPreProcessing_*.npy` + +### Two Removal Methods + +**Concatenate Method:** +- Removes artifact chunks completely +- Concatenates good chunks end-to-end +- Adjusts timestamps to be continuous +- Event timestamps realigned to new timeline + +**NaN Method:** +- Replaces artifact chunks with NaN values +- Preserves original timeline +- Filters out event timestamps in artifact regions + +## Supporting Functions + +### Control Channel Creation + +**`helper_create_control_channel(signal, timestamps, window)`** (line 69) +- Used when no isosbestic control is available +- Applies Savitzky-Golay filter to signal +- Fits to exponential function: `f(x) = a + b * exp(-(1/c) * x)` +- Returns synthetic control channel + +### Data Combination + +**`combineData(folderNames, inputParameters, storesList)`** (line 1084) +- Merges data from multiple recording sessions +- Validates that sampling rates match across sessions +- Calls `processTimestampsForCombiningData()` to align timelines +- Saves combined data to first output folder + +### Coordinate Fetching + +**`fetchCoords(filepath, naming, data)`** (line 610) +- Reads `coordsForPreProcessing_*.npy` (artifact boundary coordinates) +- If file doesn't exist: uses `[0, data[-1]]` (entire recording) +- Validates even number of coordinates (pairs of boundaries) +- Returns reshaped array of coordinate pairs + +## File I/O Summary + +### Files Read + +| File Pattern | Content | Source | +|-------------|---------|--------| +| `control_*.hdf5` | Control channel data | Extractors (Step 3) | +| `signal_*.hdf5` | Signal channel data | Extractors (Step 3) | +| `event_*.hdf5` | Event timestamps | Extractors (Step 3) | +| `storesList.csv` | Channel name mapping | Step 2 | +| `coordsForPreProcessing_*.npy` | Artifact boundaries | User selection (optional) | + +### Files Written + +| File Pattern | Content | Keys | +|-------------|---------|------| +| `timeCorrection_*.hdf5` | Corrected timestamps | `timestampNew`, `correctionIndex`, `sampling_rate`, `timeRecStart` (TDT only) | +| `z_score_*.hdf5` | Z-scored signal | `data` | +| `dff_*.hdf5` | ΔF/F signal | `data` | +| `cntrl_sig_fit_*.hdf5` | Fitted control | `data` | +| `event_*_*.hdf5` | Corrected event timestamps | `ts` | + +## Key Parameters from inputParameters + +| Parameter | Purpose | Default/Options | +|-----------|---------|-----------------| +| `timeForLightsTurnOn` | Seconds to eliminate from start | 1 | +| `filter_window` | Moving average window size | 100 | +| `isosbestic_control` | Use isosbestic control channel? | True/False | +| `removeArtifacts` | Enable artifact removal? | True/False | +| `artifactsRemovalMethod` | How to handle artifacts | "concatenate" / "NaN" | +| `zscore_method` | Z-score computation method | "standard z-score" / "baseline z-score" / "robust z-score" | +| `baselineWindowStart` | Baseline window start (seconds) | 0 | +| `baselineWindowEnd` | Baseline window end (seconds) | 0 | +| `combine_data` | Combine multiple recordings? 
| True/False | + +## Architecture Notes for Refactoring + +### Current Coupling Issues + +1. **GUI Progress Tracking:** `writeToFile()` writes to `~/pbSteps.txt` for progress bar updates (lines 36-38, 1042, 1171, 1203, 1208, 1220) +2. **Interactive Plotting:** `visualize()` requires user interaction (matplotlib event handlers) +3. **File Path Assumptions:** Hard-coded path patterns (`*_output_*`, naming conventions) +4. **Mixed Responsibilities:** Single functions handle both computation and I/O + +### Recommended Separation Points + +**Backend Analysis Layer Should Include:** +- `filterSignal()` - pure signal processing +- `controlFit()` - pure regression +- `deltaFF()` - pure computation +- `z_score_computation()` - pure statistical computation +- `helper_create_control_channel()` - algorithmic control generation +- Core timestamp correction logic (separated from I/O) +- Core artifact removal logic (separated from I/O) + +**Data I/O Layer Should Include:** +- `read_hdf5()`, `write_hdf5()` - file operations +- Store list reading/writing +- Coordinate file handling +- HDF5 file discovery and path management + +**Frontend Visualization Layer Should Include:** +- `visualize()` - interactive artifact selection +- `visualizeControlAndSignal()` - QC plots +- `visualize_z_score()`, `visualize_dff()` - result visualization +- Progress tracking callbacks (replace `writeToFile()`) + +### Potential Refactoring Strategy + +1. **Extract pure computation functions** into a `signal_processing` module +2. **Create data models** (dataclasses) for: + - TimeCorrectionResult + - ProcessedSignal (with z_score, dff, control_fit) + - ArtifactRegions +3. **Separate I/O operations** into `io_utils` module with consistent interfaces +4. **Create processing pipelines** that accept data objects, return data objects +5. **Move visualization to separate module** with callbacks for progress/interaction +6. **Use dependency injection** for progress callbacks instead of hard-coded file writes From eadb22f62670ffd10301ae85eb08060c45f6a133 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 5 Dec 2025 13:26:52 -0800 Subject: [PATCH 062/150] Organized step 4 analysis functions into various conceptual sub-steps. 
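
The z_score sub-step keeps the pure computations (filterSignal, controlFit, deltaFF, z_score_computation) separate from the routing code that reads and writes HDF5. As a rough sketch of how those pieces compose on in-memory arrays (assuming NumPy/SciPy as used throughout GuPPy; the standalone function names and synthetic traces below are illustrative, not the shipped module):

```python
import numpy as np
from scipy import signal as ss


def filter_signal(filter_window, signal):
    # Zero-phase moving-average filter; a window of 0 means "no filtering",
    # mirroring filterSignal in z_score.py.
    if filter_window == 0:
        return signal
    b = np.ones(filter_window) / filter_window
    return ss.filtfilt(b, 1, signal)


def control_fit(control, signal):
    # Least-squares fit of the signal as a linear function of the control,
    # as in controlFit: fitted = p[0] * control + p[1].
    p = np.polyfit(control, signal, 1)
    return p[0] * control + p[1]


def delta_ff(signal, fitted_control):
    # Percent change of the signal relative to the fitted control (deltaFF).
    return (signal - fitted_control) / fitted_control * 100.0


def standard_z_score(dff):
    # The "standard z-score" branch of z_score_computation.
    return (dff - np.nanmean(dff)) / np.nanstd(dff)


# Toy usage on synthetic traces (hypothetical data, 100 Hz for 60 s).
rng = np.random.default_rng(0)
t = np.arange(0, 60, 0.01)
control = 1.0 + 0.001 * t + 0.01 * rng.standard_normal(t.size)
signal = 2.0 + 0.002 * t + 0.05 * rng.standard_normal(t.size)

sig_smooth = filter_signal(100, signal)
ctrl_smooth = filter_signal(100, control)
fit = control_fit(ctrl_smooth, sig_smooth)
z = standard_z_score(delta_ff(sig_smooth, fit))
```

In the shipped execute_controlFit_dff, both channels are smoothed when an isosbestic control is present; when the control is synthesized from the signal, only the signal is smoothed before fitting.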
--- src/guppy/analysis/analysis.py | 268 ---------- src/guppy/analysis/artifact_removal.py | 200 ++++++++ src/guppy/analysis/combine_data.py | 398 ++++++++++++++ src/guppy/analysis/control_channel.py | 42 ++ src/guppy/analysis/io_utils.py | 23 + src/guppy/analysis/timestamp_correction.py | 302 +++++++++++ src/guppy/analysis/z_score.py | 234 +++++++++ src/guppy/preprocess.py | 570 +-------------------- 8 files changed, 1213 insertions(+), 824 deletions(-) delete mode 100644 src/guppy/analysis/analysis.py create mode 100644 src/guppy/analysis/artifact_removal.py create mode 100644 src/guppy/analysis/combine_data.py create mode 100644 src/guppy/analysis/control_channel.py create mode 100644 src/guppy/analysis/timestamp_correction.py create mode 100644 src/guppy/analysis/z_score.py diff --git a/src/guppy/analysis/analysis.py b/src/guppy/analysis/analysis.py deleted file mode 100644 index 4ec8960..0000000 --- a/src/guppy/analysis/analysis.py +++ /dev/null @@ -1,268 +0,0 @@ -import logging - -import numpy as np -from scipy import signal as ss -from scipy.optimize import curve_fit - -from .io_utils import fetchCoords, read_hdf5 - -logger = logging.getLogger(__name__) - - -# Category: Analysis -# Reason: Pure mathematical function for exponential curve fitting - no dependencies, pure computation -# curve fit exponential function -def curveFitFn(x, a, b, c): - return a + (b * np.exp(-(1 / c) * x)) - - -# Category: Analysis -# Reason: Pure algorithmic function - applies Savitzky-Golay filter and curve fitting to generate synthetic control channel -# helper function to create control channel using signal channel -# by curve fitting signal channel to exponential function -# when there is no isosbestic control channel is present -def helper_create_control_channel(signal, timestamps, window): - # check if window is greater than signal shape - if window > signal.shape[0]: - window = ((signal.shape[0] + 1) / 2) + 1 - if window % 2 != 0: - window = window - else: - window = window + 1 - - filtered_signal = ss.savgol_filter(signal, window_length=window, polyorder=3) - - p0 = [5, 50, 60] - - try: - popt, pcov = curve_fit(curveFitFn, timestamps, filtered_signal, p0) - except Exception as e: - logger.error(str(e)) - - # logger.info('Curve Fit Parameters : ', popt) - control = curveFitFn(timestamps, *popt) - - return control - - -# Category: Analysis -# Reason: Data validation function - compares array lengths and returns indices for processing -# function to check control and signal channel has same length -# if not, take a smaller length and do pre-processing -def check_cntrl_sig_length(filepath, channels_arr, storenames, storesList): - - indices = [] - for i in range(channels_arr.shape[1]): - idx_c = np.where(storesList == channels_arr[0, i])[0] - idx_s = np.where(storesList == channels_arr[1, i])[0] - control = read_hdf5(storenames[idx_c[0]], filepath, "data") - signal = read_hdf5(storenames[idx_s[0]], filepath, "data") - if control.shape[0] < signal.shape[0]: - indices.append(storesList[idx_c[0]]) - elif control.shape[0] > signal.shape[0]: - indices.append(storesList[idx_s[0]]) - else: - indices.append(storesList[idx_s[0]]) - - return indices - - -# Category: Analysis -# Reason: Pure algorithmic function - concatenates data chunks based on coordinate boundaries, adjusts timestamps mathematically -# helper function to process control and signal timestamps -def eliminateData(filepath, timeForLightsTurnOn, event, sampling_rate, naming): - - ts = read_hdf5("timeCorrection_" + naming, filepath, "timestampNew") 
- data = read_hdf5(event, filepath, "data").reshape(-1) - coords = fetchCoords(filepath, naming, ts) - - if (data == 0).all() == True: - data = np.zeros(ts.shape[0]) - - arr = np.array([]) - ts_arr = np.array([]) - for i in range(coords.shape[0]): - - index = np.where((ts > coords[i, 0]) & (ts < coords[i, 1]))[0] - - if len(arr) == 0: - arr = np.concatenate((arr, data[index])) - sub = ts[index][0] - timeForLightsTurnOn - new_ts = ts[index] - sub - ts_arr = np.concatenate((ts_arr, new_ts)) - else: - temp = data[index] - # new = temp + (arr[-1]-temp[0]) - temp_ts = ts[index] - new_ts = temp_ts - (temp_ts[0] - ts_arr[-1]) - arr = np.concatenate((arr, temp)) - ts_arr = np.concatenate((ts_arr, new_ts + (1 / sampling_rate))) - - # logger.info(arr.shape, ts_arr.shape) - return arr, ts_arr - - -# Category: Analysis -# Reason: Pure algorithmic function - processes event timestamps based on coordinate boundaries, aligns with data timeline -# helper function to align event timestamps with the control and signal timestamps -def eliminateTs(filepath, timeForLightsTurnOn, event, sampling_rate, naming): - - tsNew = read_hdf5("timeCorrection_" + naming, filepath, "timestampNew") - ts = read_hdf5(event + "_" + naming, filepath, "ts").reshape(-1) - coords = fetchCoords(filepath, naming, tsNew) - - ts_arr = np.array([]) - tsNew_arr = np.array([]) - for i in range(coords.shape[0]): - tsNew_index = np.where((tsNew > coords[i, 0]) & (tsNew < coords[i, 1]))[0] - ts_index = np.where((ts > coords[i, 0]) & (ts < coords[i, 1]))[0] - - if len(tsNew_arr) == 0: - sub = tsNew[tsNew_index][0] - timeForLightsTurnOn - tsNew_arr = np.concatenate((tsNew_arr, tsNew[tsNew_index] - sub)) - ts_arr = np.concatenate((ts_arr, ts[ts_index] - sub)) - else: - temp_tsNew = tsNew[tsNew_index] - temp_ts = ts[ts_index] - new_ts = temp_ts - (temp_tsNew[0] - tsNew_arr[-1]) - new_tsNew = temp_tsNew - (temp_tsNew[0] - tsNew_arr[-1]) - tsNew_arr = np.concatenate((tsNew_arr, new_tsNew + (1 / sampling_rate))) - ts_arr = np.concatenate((ts_arr, new_ts + (1 / sampling_rate))) - - return ts_arr - - -# Category: Analysis -# Reason: Pure algorithmic function - replaces specified data chunks with NaN based on coordinate boundaries -# adding nan values to removed chunks -# when using artifacts removal method - replace with NaN -def addingNaNValues(filepath, event, naming): - - ts = read_hdf5("timeCorrection_" + naming, filepath, "timestampNew") - data = read_hdf5(event, filepath, "data").reshape(-1) - coords = fetchCoords(filepath, naming, ts) - - if (data == 0).all() == True: - data = np.zeros(ts.shape[0]) - - arr = np.array([]) - ts_index = np.arange(ts.shape[0]) - for i in range(coords.shape[0]): - - index = np.where((ts > coords[i, 0]) & (ts < coords[i, 1]))[0] - arr = np.concatenate((arr, index)) - - nan_indices = list(set(ts_index).symmetric_difference(arr)) - data[nan_indices] = np.nan - - return data - - -# Category: Analysis -# Reason: Pure algorithmic function - filters event timestamps to exclude artifact regions based on coordinates -# remove event TTLs which falls in the removed chunks -# when using artifacts removal method - replace with NaN -def removeTTLs(filepath, event, naming): - tsNew = read_hdf5("timeCorrection_" + naming, filepath, "timestampNew") - ts = read_hdf5(event + "_" + naming, filepath, "ts").reshape(-1) - coords = fetchCoords(filepath, naming, tsNew) - - ts_arr = np.array([]) - for i in range(coords.shape[0]): - ts_index = np.where((ts > coords[i, 0]) & (ts < coords[i, 1]))[0] - ts_arr = np.concatenate((ts_arr, 
ts[ts_index])) - - return ts_arr - - -# Category: Analysis -# Reason: Pure mathematical computation - calculates deltaF/F from signal and control arrays with simple formula -# function to compute deltaF/F using fitted control channel and filtered signal channel -def deltaFF(signal, control): - - res = np.subtract(signal, control) - normData = np.divide(res, control) - # deltaFF = normData - normData = normData * 100 - - return normData - - -# Category: Analysis -# Reason: Pure algorithmic function - performs polynomial linear regression to fit control to signal -# function to fit control channel to signal channel -def controlFit(control, signal): - - p = np.polyfit(control, signal, 1) - arr = (p[0] * control) + p[1] - return arr - - -# Category: Analysis -# Reason: Pure signal processing function - applies moving average filter using scipy filtfilt -def filterSignal(filter_window, signal): - if filter_window == 0: - return signal - elif filter_window > 1: - b = np.divide(np.ones((filter_window,)), filter_window) - a = 1 - filtered_signal = ss.filtfilt(b, a, signal) - return filtered_signal - else: - raise Exception("Moving average filter window value is not correct.") - - -# Category: Routing -# Reason: Orchestrates signal processing pipeline - sequences calls to filterSignal, controlFit, and deltaFF with conditional logic -# function to filter control and signal channel, also execute above two function : controlFit and deltaFF -# function will also take care if there is only signal channel and no control channel -# if there is only signal channel, z-score will be computed using just signal channel -def execute_controlFit_dff(control, signal, isosbestic_control, filter_window): - - if isosbestic_control == False: - signal_smooth = filterSignal(filter_window, signal) # ss.filtfilt(b, a, signal) - control_fit = controlFit(control, signal_smooth) - norm_data = deltaFF(signal_smooth, control_fit) - else: - control_smooth = filterSignal(filter_window, control) # ss.filtfilt(b, a, control) - signal_smooth = filterSignal(filter_window, signal) # ss.filtfilt(b, a, signal) - control_fit = controlFit(control_smooth, signal_smooth) - norm_data = deltaFF(signal_smooth, control_fit) - - return norm_data, control_fit - - -# Category: Analysis -# Reason: Pure statistical computation - calculates z-score from deltaF/F using different methods (standard, baseline, robust) -# function to compute z-score based on z-score computation method -def z_score_computation(dff, timestamps, inputParameters): - - zscore_method = inputParameters["zscore_method"] - baseline_start, baseline_end = inputParameters["baselineWindowStart"], inputParameters["baselineWindowEnd"] - - if zscore_method == "standard z-score": - numerator = np.subtract(dff, np.nanmean(dff)) - zscore = np.divide(numerator, np.nanstd(dff)) - elif zscore_method == "baseline z-score": - idx = np.where((timestamps > baseline_start) & (timestamps < baseline_end))[0] - if idx.shape[0] == 0: - logger.error( - "Baseline Window Parameters for baseline z-score computation zscore_method \ - are not correct." - ) - raise Exception( - "Baseline Window Parameters for baseline z-score computation zscore_method \ - are not correct." 
- ) - else: - baseline_mean = np.nanmean(dff[idx]) - baseline_std = np.nanstd(dff[idx]) - numerator = np.subtract(dff, baseline_mean) - zscore = np.divide(numerator, baseline_std) - else: - median = np.median(dff) - mad = np.median(np.abs(dff - median)) - numerator = 0.6745 * (dff - median) - zscore = np.divide(numerator, mad) - - return zscore diff --git a/src/guppy/analysis/artifact_removal.py b/src/guppy/analysis/artifact_removal.py new file mode 100644 index 0000000..3c51830 --- /dev/null +++ b/src/guppy/analysis/artifact_removal.py @@ -0,0 +1,200 @@ +import logging +import os + +import numpy as np + +from .io_utils import ( + decide_naming_convention, + fetchCoords, + read_hdf5, + write_hdf5, +) + +logger = logging.getLogger(__name__) + + +# Category: Routing +# Reason: Orchestrates NaN replacement for all stores - loops through channels and coordinates calls to addingNaNValues and removeTTLs +def addingNaNtoChunksWithArtifacts(filepath, events): + + logger.debug("Replacing chunks with artifacts by NaN values.") + storesList = events[1, :] + + path = decide_naming_convention(filepath) + + for j in range(path.shape[1]): + name_1 = ((os.path.basename(path[0, j])).split(".")[0]).split("_") + name_2 = ((os.path.basename(path[1, j])).split(".")[0]).split("_") + # dirname = os.path.dirname(path[i]) + if name_1[-1] == name_2[-1]: + name = name_1[-1] + sampling_rate = read_hdf5("timeCorrection_" + name, filepath, "sampling_rate")[0] + for i in range(len(storesList)): + if ( + "control_" + name.lower() in storesList[i].lower() + or "signal_" + name.lower() in storesList[i].lower() + ): # changes done + data = addingNaNValues(filepath, storesList[i], name) + write_hdf5(data, storesList[i], filepath, "data") + else: + if "control" in storesList[i].lower() or "signal" in storesList[i].lower(): + continue + else: + ts = removeTTLs(filepath, storesList[i], name) + write_hdf5(ts, storesList[i] + "_" + name, filepath, "ts") + + else: + logger.error("Error in naming convention of files or Error in storesList file") + raise Exception("Error in naming convention of files or Error in storesList file") + logger.info("Chunks with artifacts are replaced by NaN values.") + + +# Category: Routing +# Reason: Orchestrates timestamp concatenation for artifact removal - loops through stores, coordinates eliminateData/eliminateTs calls and writes results +# main function to align timestamps for control, signal and event timestamps for artifacts removal +def processTimestampsForArtifacts(filepath, timeForLightsTurnOn, events): + + logger.debug("Processing timestamps to get rid of artifacts using concatenate method...") + storesList = events[1, :] + + path = decide_naming_convention(filepath) + + timestamp_dict = dict() + for j in range(path.shape[1]): + name_1 = ((os.path.basename(path[0, j])).split(".")[0]).split("_") + name_2 = ((os.path.basename(path[1, j])).split(".")[0]).split("_") + # dirname = os.path.dirname(path[i]) + if name_1[-1] == name_2[-1]: + name = name_1[-1] + sampling_rate = read_hdf5("timeCorrection_" + name, filepath, "sampling_rate")[0] + + for i in range(len(storesList)): + if ( + "control_" + name.lower() in storesList[i].lower() + or "signal_" + name.lower() in storesList[i].lower() + ): # changes done + data, timestampNew = eliminateData( + filepath, timeForLightsTurnOn, storesList[i], sampling_rate, name + ) + write_hdf5(data, storesList[i], filepath, "data") + else: + if "control" in storesList[i].lower() or "signal" in storesList[i].lower(): + continue + else: + ts = 
eliminateTs(filepath, timeForLightsTurnOn, storesList[i], sampling_rate, name) + write_hdf5(ts, storesList[i] + "_" + name, filepath, "ts") + + # timestamp_dict[name] = timestampNew + write_hdf5(timestampNew, "timeCorrection_" + name, filepath, "timestampNew") + else: + logger.error("Error in naming convention of files or Error in storesList file") + raise Exception("Error in naming convention of files or Error in storesList file") + logger.info("Timestamps processed, artifacts are removed and good chunks are concatenated.") + + +# Category: Analysis +# Reason: Pure algorithmic function - concatenates data chunks based on coordinate boundaries, adjusts timestamps mathematically +# helper function to process control and signal timestamps +def eliminateData(filepath, timeForLightsTurnOn, event, sampling_rate, naming): + + ts = read_hdf5("timeCorrection_" + naming, filepath, "timestampNew") + data = read_hdf5(event, filepath, "data").reshape(-1) + coords = fetchCoords(filepath, naming, ts) + + if (data == 0).all() == True: + data = np.zeros(ts.shape[0]) + + arr = np.array([]) + ts_arr = np.array([]) + for i in range(coords.shape[0]): + + index = np.where((ts > coords[i, 0]) & (ts < coords[i, 1]))[0] + + if len(arr) == 0: + arr = np.concatenate((arr, data[index])) + sub = ts[index][0] - timeForLightsTurnOn + new_ts = ts[index] - sub + ts_arr = np.concatenate((ts_arr, new_ts)) + else: + temp = data[index] + # new = temp + (arr[-1]-temp[0]) + temp_ts = ts[index] + new_ts = temp_ts - (temp_ts[0] - ts_arr[-1]) + arr = np.concatenate((arr, temp)) + ts_arr = np.concatenate((ts_arr, new_ts + (1 / sampling_rate))) + + # logger.info(arr.shape, ts_arr.shape) + return arr, ts_arr + + +# Category: Analysis +# Reason: Pure algorithmic function - processes event timestamps based on coordinate boundaries, aligns with data timeline +# helper function to align event timestamps with the control and signal timestamps +def eliminateTs(filepath, timeForLightsTurnOn, event, sampling_rate, naming): + + tsNew = read_hdf5("timeCorrection_" + naming, filepath, "timestampNew") + ts = read_hdf5(event + "_" + naming, filepath, "ts").reshape(-1) + coords = fetchCoords(filepath, naming, tsNew) + + ts_arr = np.array([]) + tsNew_arr = np.array([]) + for i in range(coords.shape[0]): + tsNew_index = np.where((tsNew > coords[i, 0]) & (tsNew < coords[i, 1]))[0] + ts_index = np.where((ts > coords[i, 0]) & (ts < coords[i, 1]))[0] + + if len(tsNew_arr) == 0: + sub = tsNew[tsNew_index][0] - timeForLightsTurnOn + tsNew_arr = np.concatenate((tsNew_arr, tsNew[tsNew_index] - sub)) + ts_arr = np.concatenate((ts_arr, ts[ts_index] - sub)) + else: + temp_tsNew = tsNew[tsNew_index] + temp_ts = ts[ts_index] + new_ts = temp_ts - (temp_tsNew[0] - tsNew_arr[-1]) + new_tsNew = temp_tsNew - (temp_tsNew[0] - tsNew_arr[-1]) + tsNew_arr = np.concatenate((tsNew_arr, new_tsNew + (1 / sampling_rate))) + ts_arr = np.concatenate((ts_arr, new_ts + (1 / sampling_rate))) + + return ts_arr + + +# Category: Analysis +# Reason: Pure algorithmic function - replaces specified data chunks with NaN based on coordinate boundaries +# adding nan values to removed chunks +# when using artifacts removal method - replace with NaN +def addingNaNValues(filepath, event, naming): + + ts = read_hdf5("timeCorrection_" + naming, filepath, "timestampNew") + data = read_hdf5(event, filepath, "data").reshape(-1) + coords = fetchCoords(filepath, naming, ts) + + if (data == 0).all() == True: + data = np.zeros(ts.shape[0]) + + arr = np.array([]) + ts_index = np.arange(ts.shape[0]) + 
for i in range(coords.shape[0]):
+
+        index = np.where((ts > coords[i, 0]) & (ts < coords[i, 1]))[0]
+        arr = np.concatenate((arr, index))
+
+    nan_indices = list(set(ts_index).symmetric_difference(arr))
+    data[nan_indices] = np.nan
+
+    return data
+
+
+# Category: Analysis
+# Reason: Pure algorithmic function - filters event timestamps to exclude artifact regions based on coordinates
+# remove event TTLs which fall in the removed chunks
+# when using artifacts removal method - replace with NaN
+def removeTTLs(filepath, event, naming):
+    tsNew = read_hdf5("timeCorrection_" + naming, filepath, "timestampNew")
+    ts = read_hdf5(event + "_" + naming, filepath, "ts").reshape(-1)
+    coords = fetchCoords(filepath, naming, tsNew)
+
+    ts_arr = np.array([])
+    for i in range(coords.shape[0]):
+        ts_index = np.where((ts > coords[i, 0]) & (ts < coords[i, 1]))[0]
+        ts_arr = np.concatenate((ts_arr, ts[ts_index]))
+
+    return ts_arr
diff --git a/src/guppy/analysis/combine_data.py b/src/guppy/analysis/combine_data.py
new file mode 100644
index 0000000..29e4b9d
--- /dev/null
+++ b/src/guppy/analysis/combine_data.py
@@ -0,0 +1,398 @@
+# TODO: remove redundant function implementations such as eliminateData, eliminateTs, read_hdf5, et cetera.
+
+import fnmatch
+import glob
+import h5py
+import logging
+import os
+import re
+
+import numpy as np
+
+from .io_utils import (
+    get_all_stores_for_combining_data,
+    read_hdf5,
+    takeOnlyDirs,
+)
+
+logger = logging.getLogger(__name__)
+
+
+# Category: Routing
+# Reason: Orchestrates data combination workflow - validates sampling rates, coordinates processTimestampsForCombiningData, manages multi-session I/O
+# function to combine data when there are two different data files for the same recording session
+# it will combine the data, do timestamps processing and save the combined data in the first output folder.
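+# combineData expects each session folder to contain one or more "*_output_*"
+# directories produced by the earlier steps, and it requires every session to
+# share the same sampling rate (read from the timeCorrection_* files); a
+# sampling-rate mismatch raises an exception before any data is combined.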
+def combineData(folderNames, inputParameters, storesList): + + logger.debug("Combining Data from different data files...") + timeForLightsTurnOn = inputParameters["timeForLightsTurnOn"] + op_folder = [] + for i in range(len(folderNames)): + filepath = folderNames[i] + op_folder.append(takeOnlyDirs(glob.glob(os.path.join(filepath, "*_output_*")))) + + op_folder = list(np.concatenate(op_folder).flatten()) + sampling_rate_fp = [] + for i in range(len(folderNames)): + filepath = folderNames[i] + storesListPath = takeOnlyDirs(glob.glob(os.path.join(filepath, "*_output_*"))) + for j in range(len(storesListPath)): + filepath = storesListPath[j] + storesList_new = np.genfromtxt( + os.path.join(filepath, "storesList.csv"), dtype="str", delimiter="," + ).reshape(2, -1) + sampling_rate_fp.append(glob.glob(os.path.join(filepath, "timeCorrection_*"))) + + # check if sampling rate is same for both data + sampling_rate_fp = np.concatenate(sampling_rate_fp) + sampling_rate = [] + for i in range(sampling_rate_fp.shape[0]): + sampling_rate.append(read_hdf5("", sampling_rate_fp[i], "sampling_rate")) + + res = all(i == sampling_rate[0] for i in sampling_rate) + if res == False: + logger.error("To combine the data, sampling rate for both the data should be same.") + raise Exception("To combine the data, sampling rate for both the data should be same.") + + # get the output folders informatinos + op = get_all_stores_for_combining_data(op_folder) + + # processing timestamps for combining the data + processTimestampsForCombiningData(op, timeForLightsTurnOn, storesList, sampling_rate[0]) + logger.info("Data is combined from different data files.") + + return op + + +def find_files(path, glob_path, ignore_case=False): + rule = ( + re.compile(fnmatch.translate(glob_path), re.IGNORECASE) + if ignore_case + else re.compile(fnmatch.translate(glob_path)) + ) + no_bytes_path = os.listdir(os.path.expanduser(path)) + str_path = [] + + # converting byte object to string + for x in no_bytes_path: + try: + str_path.append(x.decode("utf-8")) + except: + str_path.append(x) + + return [os.path.join(path, n) for n in str_path if rule.match(n)] + + +def read_hdf5(event, filepath, key): + if event: + op = os.path.join(filepath, event + ".hdf5") + else: + op = filepath + + if os.path.exists(op): + with h5py.File(op, "r") as f: + arr = np.asarray(f[key]) + else: + raise Exception("{}.hdf5 file does not exist".format(event)) + + return arr + + +def write_hdf5(data, event, filepath, key): + op = os.path.join(filepath, event + ".hdf5") + + if not os.path.exists(op): + with h5py.File(op, "w") as f: + if type(data) is np.ndarray: + f.create_dataset(key, data=data, maxshape=(None,), chunks=True) + else: + f.create_dataset(key, data=data) + else: + with h5py.File(op, "r+") as f: + if key in list(f.keys()): + if type(data) is np.ndarray: + f[key].resize(data.shape) + arr = f[key] + arr[:] = data + else: + arr = f[key] + arr = data + else: + f.create_dataset(key, data=data, maxshape=(None,), chunks=True) + + +def decide_naming_convention(filepath): + path_1 = find_files(filepath, "control*", ignore_case=True) # glob.glob(os.path.join(filepath, 'control*')) + + path_2 = find_files(filepath, "signal*", ignore_case=True) # glob.glob(os.path.join(filepath, 'signal*')) + + path = sorted(path_1 + path_2, key=str.casefold) + + if len(path) % 2 != 0: + raise Exception("There are not equal number of Control and Signal data") + + path = np.asarray(path).reshape(2, -1) + + return path + + +def eliminateData(filepath, timeForLightsTurnOn, event, 
sampling_rate, naming): + + arr = np.array([]) + ts_arr = np.array([]) + for i in range(len(filepath)): + ts = read_hdf5("timeCorrection_" + naming, filepath[i], "timestampNew") + data = read_hdf5(event, filepath[i], "data").reshape(-1) + + # index = np.where((ts>coords[i,0]) & (tscoords[i,0]) & (ts signal.shape[0]: + window = ((signal.shape[0] + 1) / 2) + 1 + if window % 2 != 0: + window = window + else: + window = window + 1 + + filtered_signal = ss.savgol_filter(signal, window_length=window, polyorder=3) + + p0 = [5, 50, 60] + + try: + popt, pcov = curve_fit(curveFitFn, timestamps, filtered_signal, p0) + except Exception as e: + logger.error(str(e)) + + # logger.info('Curve Fit Parameters : ', popt) + control = curveFitFn(timestamps, *popt) + + return control + + +# Category: Analysis +# Reason: Pure mathematical function for exponential curve fitting - no dependencies, pure computation +# curve fit exponential function +def curveFitFn(x, a, b, c): + return a + (b * np.exp(-(1 / c) * x)) diff --git a/src/guppy/analysis/io_utils.py b/src/guppy/analysis/io_utils.py index 33b6650..999c190 100644 --- a/src/guppy/analysis/io_utils.py +++ b/src/guppy/analysis/io_utils.py @@ -161,3 +161,26 @@ def get_all_stores_for_combining_data(folderNames): op.append(temp) return op + + +# Category: Routing +# Reason: Orchestrates reading and merging storeslist files from multiple sessions - loops through folders and consolidates results +# for combining data, reading storeslist file from both data and create a new storeslist array +def check_storeslistfile(folderNames): + storesList = np.array([[], []]) + for i in range(len(folderNames)): + filepath = folderNames[i] + storesListPath = takeOnlyDirs(glob.glob(os.path.join(filepath, "*_output_*"))) + for j in range(len(storesListPath)): + filepath = storesListPath[j] + storesList = np.concatenate( + ( + storesList, + np.genfromtxt(os.path.join(filepath, "storesList.csv"), dtype="str", delimiter=",").reshape(2, -1), + ), + axis=1, + ) + + storesList = np.unique(storesList, axis=1) + + return storesList diff --git a/src/guppy/analysis/timestamp_correction.py b/src/guppy/analysis/timestamp_correction.py new file mode 100644 index 0000000..350dd5d --- /dev/null +++ b/src/guppy/analysis/timestamp_correction.py @@ -0,0 +1,302 @@ +import logging +import os +import shutil + +import numpy as np +import pandas as pd + +from .control_channel import helper_create_control_channel +from .io_utils import ( + check_TDT, + read_hdf5, + write_hdf5, +) + +logger = logging.getLogger(__name__) + + +# Category: Routing +# Reason: Orchestrates validation logic, file copying, and storesList updates - coordinates multiple operations and file manipulations +# function to add control channel when there is no +# isosbestic control channel and update the storeslist file +def add_control_channel(filepath, arr): + + storenames = arr[0, :] + storesList = np.char.lower(arr[1, :]) + + keep_control = np.array([]) + # check a case if there is isosbestic control channel present + for i in range(storesList.shape[0]): + if "control" in storesList[i].lower(): + name = storesList[i].split("_")[-1] + new_str = "signal_" + str(name).lower() + find_signal = [True for i in storesList if i == new_str] + if len(find_signal) > 1: + logger.error("Error in naming convention of files or Error in storesList file") + raise Exception("Error in naming convention of files or Error in storesList file") + if len(find_signal) == 0: + logger.error( + "Isosbectic control channel parameter is set to False and still \ + 
storeslist file shows there is control channel present" + ) + raise Exception( + "Isosbectic control channel parameter is set to False and still \ + storeslist file shows there is control channel present" + ) + else: + continue + + for i in range(storesList.shape[0]): + if "signal" in storesList[i].lower(): + name = storesList[i].split("_")[-1] + new_str = "control_" + str(name).lower() + find_signal = [True for i in storesList if i == new_str] + if len(find_signal) == 0: + src, dst = os.path.join(filepath, arr[0, i] + ".hdf5"), os.path.join( + filepath, "cntrl" + str(i) + ".hdf5" + ) + shutil.copyfile(src, dst) + arr = np.concatenate((arr, [["cntrl" + str(i)], ["control_" + str(arr[1, i].split("_")[-1])]]), axis=1) + + np.savetxt(os.path.join(filepath, "storesList.csv"), arr, delimiter=",", fmt="%s") + + return arr + + +# Category: Routing +# Reason: Orchestrates timestamp correction workflow - loops through stores, coordinates reading/writing, calls validation and correction logic +# function to correct timestamps after eliminating first few seconds of the data (for csv data) +def timestampCorrection_csv(filepath, timeForLightsTurnOn, storesList): + + logger.debug( + f"Correcting timestamps by getting rid of the first {timeForLightsTurnOn} seconds and convert timestamps to seconds" + ) + storenames = storesList[0, :] + storesList = storesList[1, :] + + arr = [] + for i in range(storesList.shape[0]): + if "control" in storesList[i].lower() or "signal" in storesList[i].lower(): + arr.append(storesList[i]) + + arr = sorted(arr, key=str.casefold) + try: + arr = np.asarray(arr).reshape(2, -1) + except: + logger.error("Error in saving stores list file or spelling mistake for control or signal") + raise Exception("Error in saving stores list file or spelling mistake for control or signal") + + indices = check_cntrl_sig_length(filepath, arr, storenames, storesList) + + for i in range(arr.shape[1]): + name_1 = arr[0, i].split("_")[-1] + name_2 = arr[1, i].split("_")[-1] + # dirname = os.path.dirname(path[i]) + idx = np.where(storesList == indices[i])[0] + + if idx.shape[0] == 0: + logger.error(f"{arr[0,i]} does not exist in the stores list file.") + raise Exception("{} does not exist in the stores list file.".format(arr[0, i])) + + timestamp = read_hdf5(storenames[idx][0], filepath, "timestamps") + sampling_rate = read_hdf5(storenames[idx][0], filepath, "sampling_rate") + + if name_1 == name_2: + correctionIndex = np.where(timestamp >= timeForLightsTurnOn)[0] + timestampNew = timestamp[correctionIndex] + write_hdf5(timestampNew, "timeCorrection_" + name_1, filepath, "timestampNew") + write_hdf5(correctionIndex, "timeCorrection_" + name_1, filepath, "correctionIndex") + write_hdf5(np.asarray(sampling_rate), "timeCorrection_" + name_1, filepath, "sampling_rate") + + else: + logger.error("Error in naming convention of files or Error in storesList file") + raise Exception("Error in naming convention of files or Error in storesList file") + + logger.info("Timestamps corrected and converted to seconds.") + + +# Category: Routing +# Reason: Orchestrates timestamp correction workflow for TDT format - loops through stores, coordinates timestamp expansion algorithm with I/O +# function to correct timestamps after eliminating first few seconds of the data (for TDT data) +def timestampCorrection_tdt(filepath, timeForLightsTurnOn, storesList): + + logger.debug( + f"Correcting timestamps by getting rid of the first {timeForLightsTurnOn} seconds and convert timestamps to seconds" + ) + storenames = 
storesList[0, :] + storesList = storesList[1, :] + + arr = [] + for i in range(storesList.shape[0]): + if "control" in storesList[i].lower() or "signal" in storesList[i].lower(): + arr.append(storesList[i]) + + arr = sorted(arr, key=str.casefold) + + try: + arr = np.asarray(arr).reshape(2, -1) + except: + logger.error("Error in saving stores list file or spelling mistake for control or signal") + raise Exception("Error in saving stores list file or spelling mistake for control or signal") + + indices = check_cntrl_sig_length(filepath, arr, storenames, storesList) + + for i in range(arr.shape[1]): + name_1 = arr[0, i].split("_")[-1] + name_2 = arr[1, i].split("_")[-1] + # dirname = os.path.dirname(path[i]) + idx = np.where(storesList == indices[i])[0] + + if idx.shape[0] == 0: + logger.error(f"{arr[0,i]} does not exist in the stores list file.") + raise Exception("{} does not exist in the stores list file.".format(arr[0, i])) + + timestamp = read_hdf5(storenames[idx][0], filepath, "timestamps") + npoints = read_hdf5(storenames[idx][0], filepath, "npoints") + sampling_rate = read_hdf5(storenames[idx][0], filepath, "sampling_rate") + + if name_1 == name_2: + timeRecStart = timestamp[0] + timestamps = np.subtract(timestamp, timeRecStart) + adder = np.arange(npoints) / sampling_rate + lengthAdder = adder.shape[0] + timestampNew = np.zeros((len(timestamps), lengthAdder)) + for i in range(lengthAdder): + timestampNew[:, i] = np.add(timestamps, adder[i]) + timestampNew = (timestampNew.T).reshape(-1, order="F") + correctionIndex = np.where(timestampNew >= timeForLightsTurnOn)[0] + timestampNew = timestampNew[correctionIndex] + + write_hdf5(np.asarray([timeRecStart]), "timeCorrection_" + name_1, filepath, "timeRecStart") + write_hdf5(timestampNew, "timeCorrection_" + name_1, filepath, "timestampNew") + write_hdf5(correctionIndex, "timeCorrection_" + name_1, filepath, "correctionIndex") + write_hdf5(np.asarray([sampling_rate]), "timeCorrection_" + name_1, filepath, "sampling_rate") + else: + logger.error("Error in naming convention of files or Error in storesList file") + raise Exception("Error in naming convention of files or Error in storesList file") + + logger.info("Timestamps corrected and converted to seconds.") + # return timeRecStart, correctionIndex, timestampNew + + +# Category: Routing +# Reason: Orchestrates naming validation and correction application - loops through channel pairs and delegates to applyCorrection +# function to check if naming convention was followed while saving storeslist file +# and apply timestamps correction using the function applyCorrection +def decide_naming_convention_and_applyCorrection(filepath, timeForLightsTurnOn, event, displayName, storesList): + + logger.debug("Applying correction of timestamps to the data and event timestamps") + storesList = storesList[1, :] + + arr = [] + for i in range(storesList.shape[0]): + if "control" in storesList[i].lower() or "signal" in storesList[i].lower(): + arr.append(storesList[i]) + + arr = sorted(arr, key=str.casefold) + arr = np.asarray(arr).reshape(2, -1) + + for i in range(arr.shape[1]): + name_1 = arr[0, i].split("_")[-1] + name_2 = arr[1, i].split("_")[-1] + # dirname = os.path.dirname(path[i]) + if name_1 == name_2: + applyCorrection(filepath, timeForLightsTurnOn, event, displayName, name_1) + else: + logger.error("Error in naming convention of files or Error in storesList file") + raise Exception("Error in naming convention of files or Error in storesList file") + + logger.info("Timestamps corrections applied to 
the data and event timestamps.") + + +# Category: Routing +# Reason: Orchestrates applying timestamp corrections - reads correction indices, applies different logic based on data type, writes results +# function to apply correction to control, signal and event timestamps +def applyCorrection(filepath, timeForLightsTurnOn, event, displayName, naming): + + cond = check_TDT(os.path.dirname(filepath)) + + if cond == True: + timeRecStart = read_hdf5("timeCorrection_" + naming, filepath, "timeRecStart")[0] + + timestampNew = read_hdf5("timeCorrection_" + naming, filepath, "timestampNew") + correctionIndex = read_hdf5("timeCorrection_" + naming, filepath, "correctionIndex") + + if "control" in displayName.lower() or "signal" in displayName.lower(): + split_name = displayName.split("_")[-1] + if split_name == naming: + pass + else: + correctionIndex = read_hdf5("timeCorrection_" + split_name, filepath, "correctionIndex") + arr = read_hdf5(event, filepath, "data") + if (arr == 0).all() == True: + arr = arr + else: + arr = arr[correctionIndex] + write_hdf5(arr, displayName, filepath, "data") + else: + arr = read_hdf5(event, filepath, "timestamps") + if cond == True: + res = (arr >= timeRecStart).all() + if res == True: + arr = np.subtract(arr, timeRecStart) + arr = np.subtract(arr, timeForLightsTurnOn) + else: + arr = np.subtract(arr, timeForLightsTurnOn) + else: + arr = np.subtract(arr, timeForLightsTurnOn) + write_hdf5(arr, displayName + "_" + naming, filepath, "ts") + + # if isosbestic_control==False and 'control' in displayName.lower(): + # control = create_control_channel(filepath, displayName) + # write_hdf5(control, displayName, filepath, 'data') + + +# Category: Routing +# Reason: Orchestrates reading HDF5 files, calling helper_create_control_channel, and writing results - coordinates I/O with computation +# main function to create control channel using +# signal channel and save it to a file +def create_control_channel(filepath, arr, window=5001): + + storenames = arr[0, :] + storesList = arr[1, :] + + for i in range(storesList.shape[0]): + event_name, event = storesList[i], storenames[i] + if "control" in event_name.lower() and "cntrl" in event.lower(): + logger.debug("Creating control channel from signal channel using curve-fitting") + name = event_name.split("_")[-1] + signal = read_hdf5("signal_" + name, filepath, "data") + timestampNew = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") + sampling_rate = np.full(timestampNew.shape, np.nan) + sampling_rate[0] = read_hdf5("timeCorrection_" + name, filepath, "sampling_rate")[0] + + control = helper_create_control_channel(signal, timestampNew, window) + + write_hdf5(control, event_name, filepath, "data") + d = {"timestamps": timestampNew, "data": control, "sampling_rate": sampling_rate} + df = pd.DataFrame(d) + df.to_csv(os.path.join(os.path.dirname(filepath), event.lower() + ".csv"), index=False) + logger.info("Control channel from signal channel created using curve-fitting") + + +# Category: Analysis +# Reason: Data validation function - compares array lengths and returns indices for processing +# function to check control and signal channel has same length +# if not, take a smaller length and do pre-processing +def check_cntrl_sig_length(filepath, channels_arr, storenames, storesList): + + indices = [] + for i in range(channels_arr.shape[1]): + idx_c = np.where(storesList == channels_arr[0, i])[0] + idx_s = np.where(storesList == channels_arr[1, i])[0] + control = read_hdf5(storenames[idx_c[0]], filepath, "data") + signal = 
read_hdf5(storenames[idx_s[0]], filepath, "data") + if control.shape[0] < signal.shape[0]: + indices.append(storesList[idx_c[0]]) + elif control.shape[0] > signal.shape[0]: + indices.append(storesList[idx_s[0]]) + else: + indices.append(storesList[idx_s[0]]) + + return indices diff --git a/src/guppy/analysis/z_score.py b/src/guppy/analysis/z_score.py new file mode 100644 index 0000000..d8cc1bc --- /dev/null +++ b/src/guppy/analysis/z_score.py @@ -0,0 +1,234 @@ +import logging +import os + +import numpy as np +from scipy import signal as ss + +from .control_channel import helper_create_control_channel +from .io_utils import ( + fetchCoords, + find_files, + read_hdf5, + write_hdf5, +) + +logger = logging.getLogger(__name__) + + +# Category: Routing +# Reason: Orchestrates z-score computation for all channels in a session - loops through control/signal pairs, calls helper_z_score, writes results +# compute z-score and deltaF/F and save it to hdf5 file +def compute_z_score(filepath, inputParameters): + + logger.debug(f"Computing z-score for each of the data in {filepath}") + remove_artifacts = inputParameters["removeArtifacts"] + + path_1 = find_files(filepath, "control_*", ignore_case=True) # glob.glob(os.path.join(filepath, 'control*')) + path_2 = find_files(filepath, "signal_*", ignore_case=True) # glob.glob(os.path.join(filepath, 'signal*')) + + path = sorted(path_1 + path_2, key=str.casefold) + + b = np.divide(np.ones((100,)), 100) + a = 1 + + if len(path) % 2 != 0: + logger.error("There are not equal number of Control and Signal data") + raise Exception("There are not equal number of Control and Signal data") + + path = np.asarray(path).reshape(2, -1) + + for i in range(path.shape[1]): + name_1 = ((os.path.basename(path[0, i])).split(".")[0]).split("_") + name_2 = ((os.path.basename(path[1, i])).split(".")[0]).split("_") + # dirname = os.path.dirname(path[i]) + + if name_1[-1] == name_2[-1]: + name = name_1[-1] + control = read_hdf5("", path[0, i], "data").reshape(-1) + signal = read_hdf5("", path[1, i], "data").reshape(-1) + # control_smooth = ss.filtfilt(b, a, control) + # signal_smooth = ss.filtfilt(b, a, signal) + # _score, dff = helper_z_score(control_smooth, signal_smooth) + z_score, dff, control_fit = helper_z_score(control, signal, filepath, name, inputParameters) + if remove_artifacts == True: + write_hdf5(z_score, "z_score_" + name, filepath, "data") + write_hdf5(dff, "dff_" + name, filepath, "data") + write_hdf5(control_fit, "cntrl_sig_fit_" + name, filepath, "data") + else: + write_hdf5(z_score, "z_score_" + name, filepath, "data") + write_hdf5(dff, "dff_" + name, filepath, "data") + write_hdf5(control_fit, "cntrl_sig_fit_" + name, filepath, "data") + else: + logger.error("Error in naming convention of files or Error in storesList file") + raise Exception("Error in naming convention of files or Error in storesList file") + + logger.info(f"z-score for the data in {filepath} computed.") + + +# Category: Routing +# Reason: Orchestrates z-score computation for one channel - handles artifact removal logic, coordinates calls to execute_controlFit_dff and z_score_computation +# helper function to compute z-score and deltaF/F +def helper_z_score(control, signal, filepath, name, inputParameters): # helper_z_score(control_smooth, signal_smooth): + + removeArtifacts = inputParameters["removeArtifacts"] + artifactsRemovalMethod = inputParameters["artifactsRemovalMethod"] + filter_window = inputParameters["filter_window"] + + isosbestic_control = inputParameters["isosbestic_control"] + 
tsNew = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") + coords_path = os.path.join(filepath, "coordsForPreProcessing_" + name + ".npy") + + logger.info("Remove Artifacts : ", removeArtifacts) + + if (control == 0).all() == True: + control = np.zeros(tsNew.shape[0]) + + z_score_arr = np.array([]) + norm_data_arr = np.full(tsNew.shape[0], np.nan) + control_fit_arr = np.full(tsNew.shape[0], np.nan) + temp_control_arr = np.full(tsNew.shape[0], np.nan) + + if removeArtifacts == True: + coords = fetchCoords(filepath, name, tsNew) + + # for artifacts removal, each chunk which was selected by user is being processed individually and then + # z-score is calculated + for i in range(coords.shape[0]): + tsNew_index = np.where((tsNew > coords[i, 0]) & (tsNew < coords[i, 1]))[0] + if isosbestic_control == False: + control_arr = helper_create_control_channel(signal[tsNew_index], tsNew[tsNew_index], window=101) + signal_arr = signal[tsNew_index] + norm_data, control_fit = execute_controlFit_dff( + control_arr, signal_arr, isosbestic_control, filter_window + ) + temp_control_arr[tsNew_index] = control_arr + if i < coords.shape[0] - 1: + blank_index = np.where((tsNew > coords[i, 1]) & (tsNew < coords[i + 1, 0]))[0] + temp_control_arr[blank_index] = np.full(blank_index.shape[0], np.nan) + else: + control_arr = control[tsNew_index] + signal_arr = signal[tsNew_index] + norm_data, control_fit = execute_controlFit_dff( + control_arr, signal_arr, isosbestic_control, filter_window + ) + norm_data_arr[tsNew_index] = norm_data + control_fit_arr[tsNew_index] = control_fit + + if artifactsRemovalMethod == "concatenate": + norm_data_arr = norm_data_arr[~np.isnan(norm_data_arr)] + control_fit_arr = control_fit_arr[~np.isnan(control_fit_arr)] + z_score = z_score_computation(norm_data_arr, tsNew, inputParameters) + z_score_arr = np.concatenate((z_score_arr, z_score)) + else: + tsNew_index = np.arange(tsNew.shape[0]) + norm_data, control_fit = execute_controlFit_dff(control, signal, isosbestic_control, filter_window) + z_score = z_score_computation(norm_data, tsNew, inputParameters) + z_score_arr = np.concatenate((z_score_arr, z_score)) + norm_data_arr[tsNew_index] = norm_data # np.concatenate((norm_data_arr, norm_data)) + control_fit_arr[tsNew_index] = control_fit # np.concatenate((control_fit_arr, control_fit)) + + # handle the case if there are chunks being cut in the front and the end + if isosbestic_control == False and removeArtifacts == True: + coords = coords.flatten() + # front chunk + idx = np.where((tsNew >= tsNew[0]) & (tsNew < coords[0]))[0] + temp_control_arr[idx] = np.full(idx.shape[0], np.nan) + # end chunk + idx = np.where((tsNew > coords[-1]) & (tsNew <= tsNew[-1]))[0] + temp_control_arr[idx] = np.full(idx.shape[0], np.nan) + write_hdf5(temp_control_arr, "control_" + name, filepath, "data") + + return z_score_arr, norm_data_arr, control_fit_arr + + +# Category: Routing +# Reason: Orchestrates signal processing pipeline - sequences calls to filterSignal, controlFit, and deltaFF with conditional logic +# function to filter control and signal channel, also execute above two function : controlFit and deltaFF +# function will also take care if there is only signal channel and no control channel +# if there is only signal channel, z-score will be computed using just signal channel +def execute_controlFit_dff(control, signal, isosbestic_control, filter_window): + + if isosbestic_control == False: + signal_smooth = filterSignal(filter_window, signal) # ss.filtfilt(b, a, signal) + control_fit = 
controlFit(control, signal_smooth) + norm_data = deltaFF(signal_smooth, control_fit) + else: + control_smooth = filterSignal(filter_window, control) # ss.filtfilt(b, a, control) + signal_smooth = filterSignal(filter_window, signal) # ss.filtfilt(b, a, signal) + control_fit = controlFit(control_smooth, signal_smooth) + norm_data = deltaFF(signal_smooth, control_fit) + + return norm_data, control_fit + + +# Category: Analysis +# Reason: Pure mathematical computation - calculates deltaF/F from signal and control arrays with simple formula +# function to compute deltaF/F using fitted control channel and filtered signal channel +def deltaFF(signal, control): + + res = np.subtract(signal, control) + normData = np.divide(res, control) + # deltaFF = normData + normData = normData * 100 + + return normData + + +# Category: Analysis +# Reason: Pure algorithmic function - performs polynomial linear regression to fit control to signal +# function to fit control channel to signal channel +def controlFit(control, signal): + + p = np.polyfit(control, signal, 1) + arr = (p[0] * control) + p[1] + return arr + + +# Category: Analysis +# Reason: Pure signal processing function - applies moving average filter using scipy filtfilt +def filterSignal(filter_window, signal): + if filter_window == 0: + return signal + elif filter_window > 1: + b = np.divide(np.ones((filter_window,)), filter_window) + a = 1 + filtered_signal = ss.filtfilt(b, a, signal) + return filtered_signal + else: + raise Exception("Moving average filter window value is not correct.") + + +# Category: Analysis +# Reason: Pure statistical computation - calculates z-score from deltaF/F using different methods (standard, baseline, robust) +# function to compute z-score based on z-score computation method +def z_score_computation(dff, timestamps, inputParameters): + + zscore_method = inputParameters["zscore_method"] + baseline_start, baseline_end = inputParameters["baselineWindowStart"], inputParameters["baselineWindowEnd"] + + if zscore_method == "standard z-score": + numerator = np.subtract(dff, np.nanmean(dff)) + zscore = np.divide(numerator, np.nanstd(dff)) + elif zscore_method == "baseline z-score": + idx = np.where((timestamps > baseline_start) & (timestamps < baseline_end))[0] + if idx.shape[0] == 0: + logger.error( + "Baseline Window Parameters for baseline z-score computation zscore_method \ + are not correct." + ) + raise Exception( + "Baseline Window Parameters for baseline z-score computation zscore_method \ + are not correct." 
+ ) + else: + baseline_mean = np.nanmean(dff[idx]) + baseline_std = np.nanstd(dff[idx]) + numerator = np.subtract(dff, baseline_mean) + zscore = np.divide(numerator, baseline_std) + else: + median = np.median(dff) + mad = np.median(np.abs(dff - median)) + numerator = 0.6745 * (dff - median) + zscore = np.divide(numerator, mad) + + return zscore diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index 69616d9..78f046a 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -2,34 +2,31 @@ import json import logging import os -import shutil import sys import matplotlib.pyplot as plt import numpy as np -import pandas as pd - -from .analysis.analysis import ( - addingNaNValues, - check_cntrl_sig_length, - eliminateData, - eliminateTs, - execute_controlFit_dff, - helper_create_control_channel, - removeTTLs, - z_score_computation, + +from .analysis.artifact_removal import ( + addingNaNtoChunksWithArtifacts, + processTimestampsForArtifacts, ) +from .analysis.combine_data import combineData from .analysis.io_utils import ( + check_storeslistfile, check_TDT, - decide_naming_convention, - fetchCoords, find_files, - get_all_stores_for_combining_data, read_hdf5, takeOnlyDirs, - write_hdf5, +) # Necessary for other modules that depend on preprocess.py +from .analysis.timestamp_correction import ( + add_control_channel, + create_control_channel, + decide_naming_convention_and_applyCorrection, + timestampCorrection_csv, + timestampCorrection_tdt, ) -from .combineDataFn import processTimestampsForCombiningData +from .analysis.z_score import compute_z_score logger = logging.getLogger(__name__) @@ -45,271 +42,6 @@ def writeToFile(value: str): file.write(value) -# Category: Routing -# Reason: Orchestrates reading HDF5 files, calling helper_create_control_channel, and writing results - coordinates I/O with computation -# main function to create control channel using -# signal channel and save it to a file -def create_control_channel(filepath, arr, window=5001): - - storenames = arr[0, :] - storesList = arr[1, :] - - for i in range(storesList.shape[0]): - event_name, event = storesList[i], storenames[i] - if "control" in event_name.lower() and "cntrl" in event.lower(): - logger.debug("Creating control channel from signal channel using curve-fitting") - name = event_name.split("_")[-1] - signal = read_hdf5("signal_" + name, filepath, "data") - timestampNew = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") - sampling_rate = np.full(timestampNew.shape, np.nan) - sampling_rate[0] = read_hdf5("timeCorrection_" + name, filepath, "sampling_rate")[0] - - control = helper_create_control_channel(signal, timestampNew, window) - - write_hdf5(control, event_name, filepath, "data") - d = {"timestamps": timestampNew, "data": control, "sampling_rate": sampling_rate} - df = pd.DataFrame(d) - df.to_csv(os.path.join(os.path.dirname(filepath), event.lower() + ".csv"), index=False) - logger.info("Control channel from signal channel created using curve-fitting") - - -# Category: Routing -# Reason: Orchestrates validation logic, file copying, and storesList updates - coordinates multiple operations and file manipulations -# function to add control channel when there is no -# isosbestic control channel and update the storeslist file -def add_control_channel(filepath, arr): - - storenames = arr[0, :] - storesList = np.char.lower(arr[1, :]) - - keep_control = np.array([]) - # check a case if there is isosbestic control channel present - for i in range(storesList.shape[0]): - if "control" in 
storesList[i].lower(): - name = storesList[i].split("_")[-1] - new_str = "signal_" + str(name).lower() - find_signal = [True for i in storesList if i == new_str] - if len(find_signal) > 1: - logger.error("Error in naming convention of files or Error in storesList file") - raise Exception("Error in naming convention of files or Error in storesList file") - if len(find_signal) == 0: - logger.error( - "Isosbectic control channel parameter is set to False and still \ - storeslist file shows there is control channel present" - ) - raise Exception( - "Isosbectic control channel parameter is set to False and still \ - storeslist file shows there is control channel present" - ) - else: - continue - - for i in range(storesList.shape[0]): - if "signal" in storesList[i].lower(): - name = storesList[i].split("_")[-1] - new_str = "control_" + str(name).lower() - find_signal = [True for i in storesList if i == new_str] - if len(find_signal) == 0: - src, dst = os.path.join(filepath, arr[0, i] + ".hdf5"), os.path.join( - filepath, "cntrl" + str(i) + ".hdf5" - ) - shutil.copyfile(src, dst) - arr = np.concatenate((arr, [["cntrl" + str(i)], ["control_" + str(arr[1, i].split("_")[-1])]]), axis=1) - - np.savetxt(os.path.join(filepath, "storesList.csv"), arr, delimiter=",", fmt="%s") - - return arr - - -# Category: Routing -# Reason: Orchestrates timestamp correction workflow - loops through stores, coordinates reading/writing, calls validation and correction logic -# function to correct timestamps after eliminating first few seconds of the data (for csv data) -def timestampCorrection_csv(filepath, timeForLightsTurnOn, storesList): - - logger.debug( - f"Correcting timestamps by getting rid of the first {timeForLightsTurnOn} seconds and convert timestamps to seconds" - ) - storenames = storesList[0, :] - storesList = storesList[1, :] - - arr = [] - for i in range(storesList.shape[0]): - if "control" in storesList[i].lower() or "signal" in storesList[i].lower(): - arr.append(storesList[i]) - - arr = sorted(arr, key=str.casefold) - try: - arr = np.asarray(arr).reshape(2, -1) - except: - logger.error("Error in saving stores list file or spelling mistake for control or signal") - raise Exception("Error in saving stores list file or spelling mistake for control or signal") - - indices = check_cntrl_sig_length(filepath, arr, storenames, storesList) - - for i in range(arr.shape[1]): - name_1 = arr[0, i].split("_")[-1] - name_2 = arr[1, i].split("_")[-1] - # dirname = os.path.dirname(path[i]) - idx = np.where(storesList == indices[i])[0] - - if idx.shape[0] == 0: - logger.error(f"{arr[0,i]} does not exist in the stores list file.") - raise Exception("{} does not exist in the stores list file.".format(arr[0, i])) - - timestamp = read_hdf5(storenames[idx][0], filepath, "timestamps") - sampling_rate = read_hdf5(storenames[idx][0], filepath, "sampling_rate") - - if name_1 == name_2: - correctionIndex = np.where(timestamp >= timeForLightsTurnOn)[0] - timestampNew = timestamp[correctionIndex] - write_hdf5(timestampNew, "timeCorrection_" + name_1, filepath, "timestampNew") - write_hdf5(correctionIndex, "timeCorrection_" + name_1, filepath, "correctionIndex") - write_hdf5(np.asarray(sampling_rate), "timeCorrection_" + name_1, filepath, "sampling_rate") - - else: - logger.error("Error in naming convention of files or Error in storesList file") - raise Exception("Error in naming convention of files or Error in storesList file") - - logger.info("Timestamps corrected and converted to seconds.") - - -# Category: Routing -# Reason: 
Orchestrates timestamp correction workflow for TDT format - loops through stores, coordinates timestamp expansion algorithm with I/O -# function to correct timestamps after eliminating first few seconds of the data (for TDT data) -def timestampCorrection_tdt(filepath, timeForLightsTurnOn, storesList): - - logger.debug( - f"Correcting timestamps by getting rid of the first {timeForLightsTurnOn} seconds and convert timestamps to seconds" - ) - storenames = storesList[0, :] - storesList = storesList[1, :] - - arr = [] - for i in range(storesList.shape[0]): - if "control" in storesList[i].lower() or "signal" in storesList[i].lower(): - arr.append(storesList[i]) - - arr = sorted(arr, key=str.casefold) - - try: - arr = np.asarray(arr).reshape(2, -1) - except: - logger.error("Error in saving stores list file or spelling mistake for control or signal") - raise Exception("Error in saving stores list file or spelling mistake for control or signal") - - indices = check_cntrl_sig_length(filepath, arr, storenames, storesList) - - for i in range(arr.shape[1]): - name_1 = arr[0, i].split("_")[-1] - name_2 = arr[1, i].split("_")[-1] - # dirname = os.path.dirname(path[i]) - idx = np.where(storesList == indices[i])[0] - - if idx.shape[0] == 0: - logger.error(f"{arr[0,i]} does not exist in the stores list file.") - raise Exception("{} does not exist in the stores list file.".format(arr[0, i])) - - timestamp = read_hdf5(storenames[idx][0], filepath, "timestamps") - npoints = read_hdf5(storenames[idx][0], filepath, "npoints") - sampling_rate = read_hdf5(storenames[idx][0], filepath, "sampling_rate") - - if name_1 == name_2: - timeRecStart = timestamp[0] - timestamps = np.subtract(timestamp, timeRecStart) - adder = np.arange(npoints) / sampling_rate - lengthAdder = adder.shape[0] - timestampNew = np.zeros((len(timestamps), lengthAdder)) - for i in range(lengthAdder): - timestampNew[:, i] = np.add(timestamps, adder[i]) - timestampNew = (timestampNew.T).reshape(-1, order="F") - correctionIndex = np.where(timestampNew >= timeForLightsTurnOn)[0] - timestampNew = timestampNew[correctionIndex] - - write_hdf5(np.asarray([timeRecStart]), "timeCorrection_" + name_1, filepath, "timeRecStart") - write_hdf5(timestampNew, "timeCorrection_" + name_1, filepath, "timestampNew") - write_hdf5(correctionIndex, "timeCorrection_" + name_1, filepath, "correctionIndex") - write_hdf5(np.asarray([sampling_rate]), "timeCorrection_" + name_1, filepath, "sampling_rate") - else: - logger.error("Error in naming convention of files or Error in storesList file") - raise Exception("Error in naming convention of files or Error in storesList file") - - logger.info("Timestamps corrected and converted to seconds.") - # return timeRecStart, correctionIndex, timestampNew - - -# Category: Routing -# Reason: Orchestrates applying timestamp corrections - reads correction indices, applies different logic based on data type, writes results -# function to apply correction to control, signal and event timestamps -def applyCorrection(filepath, timeForLightsTurnOn, event, displayName, naming): - - cond = check_TDT(os.path.dirname(filepath)) - - if cond == True: - timeRecStart = read_hdf5("timeCorrection_" + naming, filepath, "timeRecStart")[0] - - timestampNew = read_hdf5("timeCorrection_" + naming, filepath, "timestampNew") - correctionIndex = read_hdf5("timeCorrection_" + naming, filepath, "correctionIndex") - - if "control" in displayName.lower() or "signal" in displayName.lower(): - split_name = displayName.split("_")[-1] - if split_name == naming: - 
pass - else: - correctionIndex = read_hdf5("timeCorrection_" + split_name, filepath, "correctionIndex") - arr = read_hdf5(event, filepath, "data") - if (arr == 0).all() == True: - arr = arr - else: - arr = arr[correctionIndex] - write_hdf5(arr, displayName, filepath, "data") - else: - arr = read_hdf5(event, filepath, "timestamps") - if cond == True: - res = (arr >= timeRecStart).all() - if res == True: - arr = np.subtract(arr, timeRecStart) - arr = np.subtract(arr, timeForLightsTurnOn) - else: - arr = np.subtract(arr, timeForLightsTurnOn) - else: - arr = np.subtract(arr, timeForLightsTurnOn) - write_hdf5(arr, displayName + "_" + naming, filepath, "ts") - - # if isosbestic_control==False and 'control' in displayName.lower(): - # control = create_control_channel(filepath, displayName) - # write_hdf5(control, displayName, filepath, 'data') - - -# Category: Routing -# Reason: Orchestrates naming validation and correction application - loops through channel pairs and delegates to applyCorrection -# function to check if naming convention was followed while saving storeslist file -# and apply timestamps correction using the function applyCorrection -def decide_naming_convention_and_applyCorrection(filepath, timeForLightsTurnOn, event, displayName, storesList): - - logger.debug("Applying correction of timestamps to the data and event timestamps") - storesList = storesList[1, :] - - arr = [] - for i in range(storesList.shape[0]): - if "control" in storesList[i].lower() or "signal" in storesList[i].lower(): - arr.append(storesList[i]) - - arr = sorted(arr, key=str.casefold) - arr = np.asarray(arr).reshape(2, -1) - - for i in range(arr.shape[1]): - name_1 = arr[0, i].split("_")[-1] - name_2 = arr[1, i].split("_")[-1] - # dirname = os.path.dirname(path[i]) - if name_1 == name_2: - applyCorrection(filepath, timeForLightsTurnOn, event, displayName, name_1) - else: - logger.error("Error in naming convention of files or Error in storesList file") - raise Exception("Error in naming convention of files or Error in storesList file") - - logger.info("Timestamps corrections applied to the data and event timestamps.") - - # Category: Visualization/User Input # Reason: Creates matplotlib plots to display z-score results - pure visualization with no computation # function to plot z_score @@ -485,211 +217,6 @@ def visualizeControlAndSignal(filepath, removeArtifacts): visualize(filepath, ts, control, signal, cntrl_sig_fit, plot_name, removeArtifacts) -# Category: Routing -# Reason: Orchestrates NaN replacement for all stores - loops through channels and coordinates calls to addingNaNValues and removeTTLs -def addingNaNtoChunksWithArtifacts(filepath, events): - - logger.debug("Replacing chunks with artifacts by NaN values.") - storesList = events[1, :] - - path = decide_naming_convention(filepath) - - for j in range(path.shape[1]): - name_1 = ((os.path.basename(path[0, j])).split(".")[0]).split("_") - name_2 = ((os.path.basename(path[1, j])).split(".")[0]).split("_") - # dirname = os.path.dirname(path[i]) - if name_1[-1] == name_2[-1]: - name = name_1[-1] - sampling_rate = read_hdf5("timeCorrection_" + name, filepath, "sampling_rate")[0] - for i in range(len(storesList)): - if ( - "control_" + name.lower() in storesList[i].lower() - or "signal_" + name.lower() in storesList[i].lower() - ): # changes done - data = addingNaNValues(filepath, storesList[i], name) - write_hdf5(data, storesList[i], filepath, "data") - else: - if "control" in storesList[i].lower() or "signal" in storesList[i].lower(): - continue - else: - 
ts = removeTTLs(filepath, storesList[i], name) - write_hdf5(ts, storesList[i] + "_" + name, filepath, "ts") - - else: - logger.error("Error in naming convention of files or Error in storesList file") - raise Exception("Error in naming convention of files or Error in storesList file") - logger.info("Chunks with artifacts are replaced by NaN values.") - - -# Category: Routing -# Reason: Orchestrates timestamp concatenation for artifact removal - loops through stores, coordinates eliminateData/eliminateTs calls and writes results -# main function to align timestamps for control, signal and event timestamps for artifacts removal -def processTimestampsForArtifacts(filepath, timeForLightsTurnOn, events): - - logger.debug("Processing timestamps to get rid of artifacts using concatenate method...") - storesList = events[1, :] - - path = decide_naming_convention(filepath) - - timestamp_dict = dict() - for j in range(path.shape[1]): - name_1 = ((os.path.basename(path[0, j])).split(".")[0]).split("_") - name_2 = ((os.path.basename(path[1, j])).split(".")[0]).split("_") - # dirname = os.path.dirname(path[i]) - if name_1[-1] == name_2[-1]: - name = name_1[-1] - sampling_rate = read_hdf5("timeCorrection_" + name, filepath, "sampling_rate")[0] - - for i in range(len(storesList)): - if ( - "control_" + name.lower() in storesList[i].lower() - or "signal_" + name.lower() in storesList[i].lower() - ): # changes done - data, timestampNew = eliminateData( - filepath, timeForLightsTurnOn, storesList[i], sampling_rate, name - ) - write_hdf5(data, storesList[i], filepath, "data") - else: - if "control" in storesList[i].lower() or "signal" in storesList[i].lower(): - continue - else: - ts = eliminateTs(filepath, timeForLightsTurnOn, storesList[i], sampling_rate, name) - write_hdf5(ts, storesList[i] + "_" + name, filepath, "ts") - - # timestamp_dict[name] = timestampNew - write_hdf5(timestampNew, "timeCorrection_" + name, filepath, "timestampNew") - else: - logger.error("Error in naming convention of files or Error in storesList file") - raise Exception("Error in naming convention of files or Error in storesList file") - logger.info("Timestamps processed, artifacts are removed and good chunks are concatenated.") - - -# Category: Routing -# Reason: Orchestrates z-score computation for one channel - handles artifact removal logic, coordinates calls to execute_controlFit_dff and z_score_computation -# helper function to compute z-score and deltaF/F -def helper_z_score(control, signal, filepath, name, inputParameters): # helper_z_score(control_smooth, signal_smooth): - - removeArtifacts = inputParameters["removeArtifacts"] - artifactsRemovalMethod = inputParameters["artifactsRemovalMethod"] - filter_window = inputParameters["filter_window"] - - isosbestic_control = inputParameters["isosbestic_control"] - tsNew = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") - coords_path = os.path.join(filepath, "coordsForPreProcessing_" + name + ".npy") - - logger.info("Remove Artifacts : ", removeArtifacts) - - if (control == 0).all() == True: - control = np.zeros(tsNew.shape[0]) - - z_score_arr = np.array([]) - norm_data_arr = np.full(tsNew.shape[0], np.nan) - control_fit_arr = np.full(tsNew.shape[0], np.nan) - temp_control_arr = np.full(tsNew.shape[0], np.nan) - - if removeArtifacts == True: - coords = fetchCoords(filepath, name, tsNew) - - # for artifacts removal, each chunk which was selected by user is being processed individually and then - # z-score is calculated - for i in range(coords.shape[0]): - tsNew_index 
= np.where((tsNew > coords[i, 0]) & (tsNew < coords[i, 1]))[0] - if isosbestic_control == False: - control_arr = helper_create_control_channel(signal[tsNew_index], tsNew[tsNew_index], window=101) - signal_arr = signal[tsNew_index] - norm_data, control_fit = execute_controlFit_dff( - control_arr, signal_arr, isosbestic_control, filter_window - ) - temp_control_arr[tsNew_index] = control_arr - if i < coords.shape[0] - 1: - blank_index = np.where((tsNew > coords[i, 1]) & (tsNew < coords[i + 1, 0]))[0] - temp_control_arr[blank_index] = np.full(blank_index.shape[0], np.nan) - else: - control_arr = control[tsNew_index] - signal_arr = signal[tsNew_index] - norm_data, control_fit = execute_controlFit_dff( - control_arr, signal_arr, isosbestic_control, filter_window - ) - norm_data_arr[tsNew_index] = norm_data - control_fit_arr[tsNew_index] = control_fit - - if artifactsRemovalMethod == "concatenate": - norm_data_arr = norm_data_arr[~np.isnan(norm_data_arr)] - control_fit_arr = control_fit_arr[~np.isnan(control_fit_arr)] - z_score = z_score_computation(norm_data_arr, tsNew, inputParameters) - z_score_arr = np.concatenate((z_score_arr, z_score)) - else: - tsNew_index = np.arange(tsNew.shape[0]) - norm_data, control_fit = execute_controlFit_dff(control, signal, isosbestic_control, filter_window) - z_score = z_score_computation(norm_data, tsNew, inputParameters) - z_score_arr = np.concatenate((z_score_arr, z_score)) - norm_data_arr[tsNew_index] = norm_data # np.concatenate((norm_data_arr, norm_data)) - control_fit_arr[tsNew_index] = control_fit # np.concatenate((control_fit_arr, control_fit)) - - # handle the case if there are chunks being cut in the front and the end - if isosbestic_control == False and removeArtifacts == True: - coords = coords.flatten() - # front chunk - idx = np.where((tsNew >= tsNew[0]) & (tsNew < coords[0]))[0] - temp_control_arr[idx] = np.full(idx.shape[0], np.nan) - # end chunk - idx = np.where((tsNew > coords[-1]) & (tsNew <= tsNew[-1]))[0] - temp_control_arr[idx] = np.full(idx.shape[0], np.nan) - write_hdf5(temp_control_arr, "control_" + name, filepath, "data") - - return z_score_arr, norm_data_arr, control_fit_arr - - -# Category: Routing -# Reason: Orchestrates z-score computation for all channels in a session - loops through control/signal pairs, calls helper_z_score, writes results -# compute z-score and deltaF/F and save it to hdf5 file -def compute_z_score(filepath, inputParameters): - - logger.debug(f"Computing z-score for each of the data in {filepath}") - remove_artifacts = inputParameters["removeArtifacts"] - - path_1 = find_files(filepath, "control_*", ignore_case=True) # glob.glob(os.path.join(filepath, 'control*')) - path_2 = find_files(filepath, "signal_*", ignore_case=True) # glob.glob(os.path.join(filepath, 'signal*')) - - path = sorted(path_1 + path_2, key=str.casefold) - - b = np.divide(np.ones((100,)), 100) - a = 1 - - if len(path) % 2 != 0: - logger.error("There are not equal number of Control and Signal data") - raise Exception("There are not equal number of Control and Signal data") - - path = np.asarray(path).reshape(2, -1) - - for i in range(path.shape[1]): - name_1 = ((os.path.basename(path[0, i])).split(".")[0]).split("_") - name_2 = ((os.path.basename(path[1, i])).split(".")[0]).split("_") - # dirname = os.path.dirname(path[i]) - - if name_1[-1] == name_2[-1]: - name = name_1[-1] - control = read_hdf5("", path[0, i], "data").reshape(-1) - signal = read_hdf5("", path[1, i], "data").reshape(-1) - # control_smooth = ss.filtfilt(b, a, control) - # 
signal_smooth = ss.filtfilt(b, a, signal) - # _score, dff = helper_z_score(control_smooth, signal_smooth) - z_score, dff, control_fit = helper_z_score(control, signal, filepath, name, inputParameters) - if remove_artifacts == True: - write_hdf5(z_score, "z_score_" + name, filepath, "data") - write_hdf5(dff, "dff_" + name, filepath, "data") - write_hdf5(control_fit, "cntrl_sig_fit_" + name, filepath, "data") - else: - write_hdf5(z_score, "z_score_" + name, filepath, "data") - write_hdf5(dff, "dff_" + name, filepath, "data") - write_hdf5(control_fit, "cntrl_sig_fit_" + name, filepath, "data") - else: - logger.error("Error in naming convention of files or Error in storesList file") - raise Exception("Error in naming convention of files or Error in storesList file") - - logger.info(f"z-score for the data in {filepath} computed.") - - # Category: Routing # Reason: Top-level orchestrator for timestamp correction across all sessions - loops through folders, coordinates timestamp correction workflow # function to execute timestamps corrections using functions timestampCorrection and decide_naming_convention_and_applyCorrection @@ -731,75 +258,6 @@ def execute_timestamp_correction(folderNames, inputParameters): logger.info(f"Timestamps corrections finished for {filepath}") -# Category: Routing -# Reason: Orchestrates reading and merging storeslist files from multiple sessions - loops through folders and consolidates results -# for combining data, reading storeslist file from both data and create a new storeslist array -def check_storeslistfile(folderNames): - storesList = np.array([[], []]) - for i in range(len(folderNames)): - filepath = folderNames[i] - storesListPath = takeOnlyDirs(glob.glob(os.path.join(filepath, "*_output_*"))) - for j in range(len(storesListPath)): - filepath = storesListPath[j] - storesList = np.concatenate( - ( - storesList, - np.genfromtxt(os.path.join(filepath, "storesList.csv"), dtype="str", delimiter=",").reshape(2, -1), - ), - axis=1, - ) - - storesList = np.unique(storesList, axis=1) - - return storesList - - -# Category: Routing -# Reason: Orchestrates data combination workflow - validates sampling rates, coordinates processTimestampsForCombiningData, manages multi-session I/O -# function to combine data when there are two different data files for the same recording session -# it will combine the data, do timestamps processing and save the combined data in the first output folder. 
-def combineData(folderNames, inputParameters, storesList): - - logger.debug("Combining Data from different data files...") - timeForLightsTurnOn = inputParameters["timeForLightsTurnOn"] - op_folder = [] - for i in range(len(folderNames)): - filepath = folderNames[i] - op_folder.append(takeOnlyDirs(glob.glob(os.path.join(filepath, "*_output_*")))) - - op_folder = list(np.concatenate(op_folder).flatten()) - sampling_rate_fp = [] - for i in range(len(folderNames)): - filepath = folderNames[i] - storesListPath = takeOnlyDirs(glob.glob(os.path.join(filepath, "*_output_*"))) - for j in range(len(storesListPath)): - filepath = storesListPath[j] - storesList_new = np.genfromtxt( - os.path.join(filepath, "storesList.csv"), dtype="str", delimiter="," - ).reshape(2, -1) - sampling_rate_fp.append(glob.glob(os.path.join(filepath, "timeCorrection_*"))) - - # check if sampling rate is same for both data - sampling_rate_fp = np.concatenate(sampling_rate_fp) - sampling_rate = [] - for i in range(sampling_rate_fp.shape[0]): - sampling_rate.append(read_hdf5("", sampling_rate_fp[i], "sampling_rate")) - - res = all(i == sampling_rate[0] for i in sampling_rate) - if res == False: - logger.error("To combine the data, sampling rate for both the data should be same.") - raise Exception("To combine the data, sampling rate for both the data should be same.") - - # get the output folders informatinos - op = get_all_stores_for_combining_data(op_folder) - - # processing timestamps for combining the data - processTimestampsForCombiningData(op, timeForLightsTurnOn, storesList, sampling_rate[0]) - logger.info("Data is combined from different data files.") - - return op - - # Category: Routing # Reason: Top-level orchestrator for z-score computation and artifact removal - coordinates compute_z_score, artifact processing, and visualization calls # function to compute z-score and deltaF/F using functions : compute_z_score and/or processTimestampsForArtifacts From 29d5f9ac7f700957e2c0171e835c5201edb53442 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 5 Dec 2025 13:36:21 -0800 Subject: [PATCH 063/150] Removed categorization comments. 
--- src/guppy/analysis/artifact_removal.py | 12 ------------ src/guppy/analysis/combine_data.py | 2 -- src/guppy/analysis/control_channel.py | 2 -- src/guppy/analysis/io_utils.py | 18 ------------------ src/guppy/analysis/timestamp_correction.py | 14 -------------- src/guppy/analysis/z_score.py | 14 -------------- src/guppy/preprocess.py | 18 ------------------ 7 files changed, 80 deletions(-) diff --git a/src/guppy/analysis/artifact_removal.py b/src/guppy/analysis/artifact_removal.py index 3c51830..ac483bb 100644 --- a/src/guppy/analysis/artifact_removal.py +++ b/src/guppy/analysis/artifact_removal.py @@ -13,8 +13,6 @@ logger = logging.getLogger(__name__) -# Category: Routing -# Reason: Orchestrates NaN replacement for all stores - loops through channels and coordinates calls to addingNaNValues and removeTTLs def addingNaNtoChunksWithArtifacts(filepath, events): logger.debug("Replacing chunks with artifacts by NaN values.") @@ -49,8 +47,6 @@ def addingNaNtoChunksWithArtifacts(filepath, events): logger.info("Chunks with artifacts are replaced by NaN values.") -# Category: Routing -# Reason: Orchestrates timestamp concatenation for artifact removal - loops through stores, coordinates eliminateData/eliminateTs calls and writes results # main function to align timestamps for control, signal and event timestamps for artifacts removal def processTimestampsForArtifacts(filepath, timeForLightsTurnOn, events): @@ -92,8 +88,6 @@ def processTimestampsForArtifacts(filepath, timeForLightsTurnOn, events): logger.info("Timestamps processed, artifacts are removed and good chunks are concatenated.") -# Category: Analysis -# Reason: Pure algorithmic function - concatenates data chunks based on coordinate boundaries, adjusts timestamps mathematically # helper function to process control and signal timestamps def eliminateData(filepath, timeForLightsTurnOn, event, sampling_rate, naming): @@ -127,8 +121,6 @@ def eliminateData(filepath, timeForLightsTurnOn, event, sampling_rate, naming): return arr, ts_arr -# Category: Analysis -# Reason: Pure algorithmic function - processes event timestamps based on coordinate boundaries, aligns with data timeline # helper function to align event timestamps with the control and signal timestamps def eliminateTs(filepath, timeForLightsTurnOn, event, sampling_rate, naming): @@ -157,8 +149,6 @@ def eliminateTs(filepath, timeForLightsTurnOn, event, sampling_rate, naming): return ts_arr -# Category: Analysis -# Reason: Pure algorithmic function - replaces specified data chunks with NaN based on coordinate boundaries # adding nan values to removed chunks # when using artifacts removal method - replace with NaN def addingNaNValues(filepath, event, naming): @@ -183,8 +173,6 @@ def addingNaNValues(filepath, event, naming): return data -# Category: Analysis -# Reason: Pure algorithmic function - filters event timestamps to exclude artifact regions based on coordinates # remove event TTLs which falls in the removed chunks # when using artifacts removal method - replace with NaN def removeTTLs(filepath, event, naming): diff --git a/src/guppy/analysis/combine_data.py b/src/guppy/analysis/combine_data.py index 29e4b9d..d8f0ce6 100644 --- a/src/guppy/analysis/combine_data.py +++ b/src/guppy/analysis/combine_data.py @@ -17,8 +17,6 @@ logger = logging.getLogger(__name__) -# Category: Routing -# Reason: Orchestrates data combination workflow - validates sampling rates, coordinates processTimestampsForCombiningData, manages multi-session I/O # function to combine data when there are two 
different data files for the same recording session # it will combine the data, do timestamps processing and save the combined data in the first output folder. def combineData(folderNames, inputParameters, storesList): diff --git a/src/guppy/analysis/control_channel.py b/src/guppy/analysis/control_channel.py index 96665f2..2da82e2 100644 --- a/src/guppy/analysis/control_channel.py +++ b/src/guppy/analysis/control_channel.py @@ -35,8 +35,6 @@ def helper_create_control_channel(signal, timestamps, window): return control -# Category: Analysis -# Reason: Pure mathematical function for exponential curve fitting - no dependencies, pure computation # curve fit exponential function def curveFitFn(x, a, b, c): return a + (b * np.exp(-(1 / c) * x)) diff --git a/src/guppy/analysis/io_utils.py b/src/guppy/analysis/io_utils.py index 999c190..8b10127 100644 --- a/src/guppy/analysis/io_utils.py +++ b/src/guppy/analysis/io_utils.py @@ -10,8 +10,6 @@ logger = logging.getLogger(__name__) -# Category: Analysis -# Reason: Utility function for path filtering - pure data transformation with no GUI or orchestration def takeOnlyDirs(paths): removePaths = [] for p in paths: @@ -20,8 +18,6 @@ def takeOnlyDirs(paths): return list(set(paths) - set(removePaths)) -# Category: Analysis -# Reason: File system utility for case-insensitive file discovery - pure I/O helper with no orchestration # find files by ignoring the case sensitivity def find_files(path, glob_path, ignore_case=False): rule = ( @@ -42,8 +38,6 @@ def find_files(path, glob_path, ignore_case=False): return [os.path.join(path, n) for n in str_path if rule.match(n)] -# Category: Analysis -# Reason: Simple file type detection utility - pure file system check with no orchestration # check if dealing with TDT files or csv files def check_TDT(filepath): path = glob.glob(os.path.join(filepath, "*.tsq")) @@ -53,8 +47,6 @@ def check_TDT(filepath): return False -# Category: Analysis -# Reason: I/O utility function for reading HDF5 files - pure file access with no business logic or orchestration # function to read hdf5 file def read_hdf5(event, filepath, key): if event: @@ -74,8 +66,6 @@ def read_hdf5(event, filepath, key): return arr -# Category: Analysis -# Reason: I/O utility function for writing HDF5 files - pure file access with no business logic or orchestration # function to write hdf5 file def write_hdf5(data, event, filepath, key): event = event.replace("\\", "_") @@ -108,8 +98,6 @@ def write_hdf5(data, event, filepath, key): f.create_dataset(key, data=data) -# Category: Analysis -# Reason: Validation utility - checks file naming conventions and returns structured path array with no orchestration # function to check if the naming convention for saving storeslist file was followed or not def decide_naming_convention(filepath): path_1 = find_files(filepath, "control_*", ignore_case=True) # glob.glob(os.path.join(filepath, 'control*')) @@ -126,8 +114,6 @@ def decide_naming_convention(filepath): return path -# Category: Analysis -# Reason: I/O utility that loads artifact coordinates from .npy file or provides default - pure file loading with simple logic # function to read coordinates file which was saved by selecting chunks for artifacts removal def fetchCoords(filepath, naming, data): @@ -147,8 +133,6 @@ def fetchCoords(filepath, naming, data): return coords -# Category: Routing -# Reason: Organizes output folders for data combination - loops through numbered outputs and groups related folders def get_all_stores_for_combining_data(folderNames): op = [] for i 
in range(100): @@ -163,8 +147,6 @@ def get_all_stores_for_combining_data(folderNames): return op -# Category: Routing -# Reason: Orchestrates reading and merging storeslist files from multiple sessions - loops through folders and consolidates results # for combining data, reading storeslist file from both data and create a new storeslist array def check_storeslistfile(folderNames): storesList = np.array([[], []]) diff --git a/src/guppy/analysis/timestamp_correction.py b/src/guppy/analysis/timestamp_correction.py index 350dd5d..2e3185a 100644 --- a/src/guppy/analysis/timestamp_correction.py +++ b/src/guppy/analysis/timestamp_correction.py @@ -15,8 +15,6 @@ logger = logging.getLogger(__name__) -# Category: Routing -# Reason: Orchestrates validation logic, file copying, and storesList updates - coordinates multiple operations and file manipulations # function to add control channel when there is no # isosbestic control channel and update the storeslist file def add_control_channel(filepath, arr): @@ -63,8 +61,6 @@ def add_control_channel(filepath, arr): return arr -# Category: Routing -# Reason: Orchestrates timestamp correction workflow - loops through stores, coordinates reading/writing, calls validation and correction logic # function to correct timestamps after eliminating first few seconds of the data (for csv data) def timestampCorrection_csv(filepath, timeForLightsTurnOn, storesList): @@ -115,8 +111,6 @@ def timestampCorrection_csv(filepath, timeForLightsTurnOn, storesList): logger.info("Timestamps corrected and converted to seconds.") -# Category: Routing -# Reason: Orchestrates timestamp correction workflow for TDT format - loops through stores, coordinates timestamp expansion algorithm with I/O # function to correct timestamps after eliminating first few seconds of the data (for TDT data) def timestampCorrection_tdt(filepath, timeForLightsTurnOn, storesList): @@ -179,8 +173,6 @@ def timestampCorrection_tdt(filepath, timeForLightsTurnOn, storesList): # return timeRecStart, correctionIndex, timestampNew -# Category: Routing -# Reason: Orchestrates naming validation and correction application - loops through channel pairs and delegates to applyCorrection # function to check if naming convention was followed while saving storeslist file # and apply timestamps correction using the function applyCorrection def decide_naming_convention_and_applyCorrection(filepath, timeForLightsTurnOn, event, displayName, storesList): @@ -209,8 +201,6 @@ def decide_naming_convention_and_applyCorrection(filepath, timeForLightsTurnOn, logger.info("Timestamps corrections applied to the data and event timestamps.") -# Category: Routing -# Reason: Orchestrates applying timestamp corrections - reads correction indices, applies different logic based on data type, writes results # function to apply correction to control, signal and event timestamps def applyCorrection(filepath, timeForLightsTurnOn, event, displayName, naming): @@ -252,8 +242,6 @@ def applyCorrection(filepath, timeForLightsTurnOn, event, displayName, naming): # write_hdf5(control, displayName, filepath, 'data') -# Category: Routing -# Reason: Orchestrates reading HDF5 files, calling helper_create_control_channel, and writing results - coordinates I/O with computation # main function to create control channel using # signal channel and save it to a file def create_control_channel(filepath, arr, window=5001): @@ -280,8 +268,6 @@ def create_control_channel(filepath, arr, window=5001): logger.info("Control channel from signal channel created using 
curve-fitting") -# Category: Analysis -# Reason: Data validation function - compares array lengths and returns indices for processing # function to check control and signal channel has same length # if not, take a smaller length and do pre-processing def check_cntrl_sig_length(filepath, channels_arr, storenames, storesList): diff --git a/src/guppy/analysis/z_score.py b/src/guppy/analysis/z_score.py index d8cc1bc..b5032be 100644 --- a/src/guppy/analysis/z_score.py +++ b/src/guppy/analysis/z_score.py @@ -15,8 +15,6 @@ logger = logging.getLogger(__name__) -# Category: Routing -# Reason: Orchestrates z-score computation for all channels in a session - loops through control/signal pairs, calls helper_z_score, writes results # compute z-score and deltaF/F and save it to hdf5 file def compute_z_score(filepath, inputParameters): @@ -65,8 +63,6 @@ def compute_z_score(filepath, inputParameters): logger.info(f"z-score for the data in {filepath} computed.") -# Category: Routing -# Reason: Orchestrates z-score computation for one channel - handles artifact removal logic, coordinates calls to execute_controlFit_dff and z_score_computation # helper function to compute z-score and deltaF/F def helper_z_score(control, signal, filepath, name, inputParameters): # helper_z_score(control_smooth, signal_smooth): @@ -141,8 +137,6 @@ def helper_z_score(control, signal, filepath, name, inputParameters): # helper_ return z_score_arr, norm_data_arr, control_fit_arr -# Category: Routing -# Reason: Orchestrates signal processing pipeline - sequences calls to filterSignal, controlFit, and deltaFF with conditional logic # function to filter control and signal channel, also execute above two function : controlFit and deltaFF # function will also take care if there is only signal channel and no control channel # if there is only signal channel, z-score will be computed using just signal channel @@ -161,8 +155,6 @@ def execute_controlFit_dff(control, signal, isosbestic_control, filter_window): return norm_data, control_fit -# Category: Analysis -# Reason: Pure mathematical computation - calculates deltaF/F from signal and control arrays with simple formula # function to compute deltaF/F using fitted control channel and filtered signal channel def deltaFF(signal, control): @@ -174,8 +166,6 @@ def deltaFF(signal, control): return normData -# Category: Analysis -# Reason: Pure algorithmic function - performs polynomial linear regression to fit control to signal # function to fit control channel to signal channel def controlFit(control, signal): @@ -184,8 +174,6 @@ def controlFit(control, signal): return arr -# Category: Analysis -# Reason: Pure signal processing function - applies moving average filter using scipy filtfilt def filterSignal(filter_window, signal): if filter_window == 0: return signal @@ -198,8 +186,6 @@ def filterSignal(filter_window, signal): raise Exception("Moving average filter window value is not correct.") -# Category: Analysis -# Reason: Pure statistical computation - calculates z-score from deltaF/F using different methods (standard, baseline, robust) # function to compute z-score based on z-score computation method def z_score_computation(dff, timestamps, inputParameters): diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index 78f046a..5ff8de6 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -35,15 +35,11 @@ plt.switch_backend("TKAgg") -# Category: Visualization/User Input -# Reason: Writes progress updates to file for GUI progress bar - couples backend to GUI 
feedback mechanism def writeToFile(value: str): with open(os.path.join(os.path.expanduser("~"), "pbSteps.txt"), "a") as file: file.write(value) -# Category: Visualization/User Input -# Reason: Creates matplotlib plots to display z-score results - pure visualization with no computation # function to plot z_score def visualize_z_score(filepath): @@ -66,8 +62,6 @@ def visualize_z_score(filepath): # plt.show() -# Category: Visualization/User Input -# Reason: Creates matplotlib plots to display deltaF/F results - pure visualization with no computation # function to plot deltaF/F def visualize_dff(filepath): name = os.path.basename(filepath) @@ -89,8 +83,6 @@ def visualize_dff(filepath): # plt.show() -# Category: Visualization/User Input -# Reason: Interactive matplotlib GUI with keyboard event handlers for artifact selection - core user input mechanism that saves coordinates to disk def visualize(filepath, x, y1, y2, y3, plot_name, removeArtifacts): # plotting control and signal data @@ -180,8 +172,6 @@ def plt_close_event(event): # return fig -# Category: Visualization/User Input -# Reason: Orchestrates visualization of all control/signal pairs - reads data and delegates to visualize() for user interaction # function to plot control and signal, also provide a feature to select chunks for artifacts removal def visualizeControlAndSignal(filepath, removeArtifacts): path_1 = find_files(filepath, "control_*", ignore_case=True) # glob.glob(os.path.join(filepath, 'control*')) @@ -217,8 +207,6 @@ def visualizeControlAndSignal(filepath, removeArtifacts): visualize(filepath, ts, control, signal, cntrl_sig_fit, plot_name, removeArtifacts) -# Category: Routing -# Reason: Top-level orchestrator for timestamp correction across all sessions - loops through folders, coordinates timestamp correction workflow # function to execute timestamps corrections using functions timestampCorrection and decide_naming_convention_and_applyCorrection def execute_timestamp_correction(folderNames, inputParameters): @@ -258,8 +246,6 @@ def execute_timestamp_correction(folderNames, inputParameters): logger.info(f"Timestamps corrections finished for {filepath}") -# Category: Routing -# Reason: Top-level orchestrator for z-score computation and artifact removal - coordinates compute_z_score, artifact processing, and visualization calls # function to compute z-score and deltaF/F using functions : compute_z_score and/or processTimestampsForArtifacts def execute_zscore(folderNames, inputParameters): @@ -312,8 +298,6 @@ def execute_zscore(folderNames, inputParameters): logger.info("Signal data and event timestamps are extracted.") -# Category: Routing -# Reason: Main entry point for Step 4 - orchestrates entire preprocessing workflow including timestamp correction, data combination, and z-score computation def extractTsAndSignal(inputParameters): logger.debug("Extracting signal data and event timestamps...") @@ -351,8 +335,6 @@ def extractTsAndSignal(inputParameters): execute_zscore(op_folder, inputParameters) -# Category: Routing -# Reason: Top-level entry point wrapper - handles error catching and calls extractTsAndSignal def main(input_parameters): try: extractTsAndSignal(input_parameters) From a9a65abf0b31e1aca2bc874efd6c4187c0801634 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 5 Dec 2025 15:00:15 -0800 Subject: [PATCH 064/150] Removed redundant fns --- src/guppy/analysis/combine_data.py | 251 +---------------------------- src/guppy/preprocess.py | 3 +- 2 files changed, 4 insertions(+), 250 deletions(-) diff --git 
a/src/guppy/analysis/combine_data.py b/src/guppy/analysis/combine_data.py index d8f0ce6..aa5a1dd 100644 --- a/src/guppy/analysis/combine_data.py +++ b/src/guppy/analysis/combine_data.py @@ -1,17 +1,17 @@ # TODO: remove redundant function implementations such as eliminateData, eliminateTs, read_hdf5, et cetera. -import fnmatch import glob import logging import os -import re import numpy as np from .io_utils import ( + decide_naming_convention, get_all_stores_for_combining_data, read_hdf5, takeOnlyDirs, + write_hdf5, ) logger = logging.getLogger(__name__) @@ -61,78 +61,6 @@ def combineData(folderNames, inputParameters, storesList): return op -def find_files(path, glob_path, ignore_case=False): - rule = ( - re.compile(fnmatch.translate(glob_path), re.IGNORECASE) - if ignore_case - else re.compile(fnmatch.translate(glob_path)) - ) - no_bytes_path = os.listdir(os.path.expanduser(path)) - str_path = [] - - # converting byte object to string - for x in no_bytes_path: - try: - str_path.append(x.decode("utf-8")) - except: - str_path.append(x) - - return [os.path.join(path, n) for n in str_path if rule.match(n)] - - -def read_hdf5(event, filepath, key): - if event: - op = os.path.join(filepath, event + ".hdf5") - else: - op = filepath - - if os.path.exists(op): - with h5py.File(op, "r") as f: - arr = np.asarray(f[key]) - else: - raise Exception("{}.hdf5 file does not exist".format(event)) - - return arr - - -def write_hdf5(data, event, filepath, key): - op = os.path.join(filepath, event + ".hdf5") - - if not os.path.exists(op): - with h5py.File(op, "w") as f: - if type(data) is np.ndarray: - f.create_dataset(key, data=data, maxshape=(None,), chunks=True) - else: - f.create_dataset(key, data=data) - else: - with h5py.File(op, "r+") as f: - if key in list(f.keys()): - if type(data) is np.ndarray: - f[key].resize(data.shape) - arr = f[key] - arr[:] = data - else: - arr = f[key] - arr = data - else: - f.create_dataset(key, data=data, maxshape=(None,), chunks=True) - - -def decide_naming_convention(filepath): - path_1 = find_files(filepath, "control*", ignore_case=True) # glob.glob(os.path.join(filepath, 'control*')) - - path_2 = find_files(filepath, "signal*", ignore_case=True) # glob.glob(os.path.join(filepath, 'signal*')) - - path = sorted(path_1 + path_2, key=str.casefold) - - if len(path) % 2 != 0: - raise Exception("There are not equal number of Control and Signal data") - - path = np.asarray(path).reshape(2, -1) - - return path - - def eliminateData(filepath, timeForLightsTurnOn, event, sampling_rate, naming): arr = np.array([]) @@ -219,178 +147,3 @@ def processTimestampsForCombiningData(filepath, timeForLightsTurnOn, events, sam else: ts = eliminateTs(filepath[k], timeForLightsTurnOn, storesList[i], sampling_rate, name) write_hdf5(ts, storesList[i] + "_" + name, filepath[k][0], "ts") - - -import h5py -import numpy as np - -logger = logging.getLogger(__name__) - - -def find_files(path, glob_path, ignore_case=False): - rule = ( - re.compile(fnmatch.translate(glob_path), re.IGNORECASE) - if ignore_case - else re.compile(fnmatch.translate(glob_path)) - ) - no_bytes_path = os.listdir(os.path.expanduser(path)) - str_path = [] - - # converting byte object to string - for x in no_bytes_path: - try: - str_path.append(x.decode("utf-8")) - except: - str_path.append(x) - - return [os.path.join(path, n) for n in str_path if rule.match(n)] - - -def read_hdf5(event, filepath, key): - if event: - op = os.path.join(filepath, event + ".hdf5") - else: - op = filepath - - if os.path.exists(op): - with h5py.File(op, 
"r") as f: - arr = np.asarray(f[key]) - else: - raise Exception("{}.hdf5 file does not exist".format(event)) - - return arr - - -def write_hdf5(data, event, filepath, key): - op = os.path.join(filepath, event + ".hdf5") - - if not os.path.exists(op): - with h5py.File(op, "w") as f: - if type(data) is np.ndarray: - f.create_dataset(key, data=data, maxshape=(None,), chunks=True) - else: - f.create_dataset(key, data=data) - else: - with h5py.File(op, "r+") as f: - if key in list(f.keys()): - if type(data) is np.ndarray: - f[key].resize(data.shape) - arr = f[key] - arr[:] = data - else: - arr = f[key] - arr = data - else: - f.create_dataset(key, data=data, maxshape=(None,), chunks=True) - - -def decide_naming_convention(filepath): - path_1 = find_files(filepath, "control*", ignore_case=True) # glob.glob(os.path.join(filepath, 'control*')) - - path_2 = find_files(filepath, "signal*", ignore_case=True) # glob.glob(os.path.join(filepath, 'signal*')) - - path = sorted(path_1 + path_2, key=str.casefold) - - if len(path) % 2 != 0: - raise Exception("There are not equal number of Control and Signal data") - - path = np.asarray(path).reshape(2, -1) - - return path - - -def eliminateData(filepath, timeForLightsTurnOn, event, sampling_rate, naming): - - arr = np.array([]) - ts_arr = np.array([]) - for i in range(len(filepath)): - ts = read_hdf5("timeCorrection_" + naming, filepath[i], "timestampNew") - data = read_hdf5(event, filepath[i], "data").reshape(-1) - - # index = np.where((ts>coords[i,0]) & (ts Date: Fri, 5 Dec 2025 15:19:35 -0800 Subject: [PATCH 065/150] Removed redundant fns --- src/guppy/analysis/combine_data.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/guppy/analysis/combine_data.py b/src/guppy/analysis/combine_data.py index aa5a1dd..f89315f 100644 --- a/src/guppy/analysis/combine_data.py +++ b/src/guppy/analysis/combine_data.py @@ -1,5 +1,3 @@ -# TODO: remove redundant function implementations such as eliminateData, eliminateTs, read_hdf5, et cetera. - import glob import logging import os From 1bb8de4a2df3544f656bc4f52c48c75c0f0b338e Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 5 Dec 2025 17:54:37 -0800 Subject: [PATCH 066/150] Peeled off read operations from timestamp_correction CSV function. 
--- src/guppy/analysis/timestamp_correction.py | 146 ++--- src/guppy/preprocess.py | 58 +- timestamp_correction_analysis.md | 723 +++++++++++++++++++++ 3 files changed, 851 insertions(+), 76 deletions(-) create mode 100644 timestamp_correction_analysis.md diff --git a/src/guppy/analysis/timestamp_correction.py b/src/guppy/analysis/timestamp_correction.py index 2e3185a..e179d26 100644 --- a/src/guppy/analysis/timestamp_correction.py +++ b/src/guppy/analysis/timestamp_correction.py @@ -1,6 +1,5 @@ import logging import os -import shutil import numpy as np import pandas as pd @@ -15,91 +14,37 @@ logger = logging.getLogger(__name__) -# function to add control channel when there is no -# isosbestic control channel and update the storeslist file -def add_control_channel(filepath, arr): - - storenames = arr[0, :] - storesList = np.char.lower(arr[1, :]) - - keep_control = np.array([]) - # check a case if there is isosbestic control channel present - for i in range(storesList.shape[0]): - if "control" in storesList[i].lower(): - name = storesList[i].split("_")[-1] - new_str = "signal_" + str(name).lower() - find_signal = [True for i in storesList if i == new_str] - if len(find_signal) > 1: - logger.error("Error in naming convention of files or Error in storesList file") - raise Exception("Error in naming convention of files or Error in storesList file") - if len(find_signal) == 0: - logger.error( - "Isosbectic control channel parameter is set to False and still \ - storeslist file shows there is control channel present" - ) - raise Exception( - "Isosbectic control channel parameter is set to False and still \ - storeslist file shows there is control channel present" - ) - else: - continue - - for i in range(storesList.shape[0]): - if "signal" in storesList[i].lower(): - name = storesList[i].split("_")[-1] - new_str = "control_" + str(name).lower() - find_signal = [True for i in storesList if i == new_str] - if len(find_signal) == 0: - src, dst = os.path.join(filepath, arr[0, i] + ".hdf5"), os.path.join( - filepath, "cntrl" + str(i) + ".hdf5" - ) - shutil.copyfile(src, dst) - arr = np.concatenate((arr, [["cntrl" + str(i)], ["control_" + str(arr[1, i].split("_")[-1])]]), axis=1) - - np.savetxt(os.path.join(filepath, "storesList.csv"), arr, delimiter=",", fmt="%s") - - return arr - - # function to correct timestamps after eliminating first few seconds of the data (for csv data) -def timestampCorrection_csv(filepath, timeForLightsTurnOn, storesList): - +def timestampCorrection_csv( + filepath, timeForLightsTurnOn, storesList, name_to_data, name_to_timestamps, name_to_sampling_rate +): logger.debug( f"Correcting timestamps by getting rid of the first {timeForLightsTurnOn} seconds and convert timestamps to seconds" ) storenames = storesList[0, :] - storesList = storesList[1, :] + names_for_storenames = storesList[1, :] + arr = get_control_and_signal_channel_names(storesList) - arr = [] - for i in range(storesList.shape[0]): - if "control" in storesList[i].lower() or "signal" in storesList[i].lower(): - arr.append(storesList[i]) - - arr = sorted(arr, key=str.casefold) - try: - arr = np.asarray(arr).reshape(2, -1) - except: - logger.error("Error in saving stores list file or spelling mistake for control or signal") - raise Exception("Error in saving stores list file or spelling mistake for control or signal") - - indices = check_cntrl_sig_length(filepath, arr, storenames, storesList) + indices = check_cntrl_sig_length(arr, name_to_data) for i in range(arr.shape[1]): name_1 = arr[0, i].split("_")[-1] name_2 
= arr[1, i].split("_")[-1] # dirname = os.path.dirname(path[i]) - idx = np.where(storesList == indices[i])[0] + idx = np.where(names_for_storenames == indices[i])[0] if idx.shape[0] == 0: logger.error(f"{arr[0,i]} does not exist in the stores list file.") raise Exception("{} does not exist in the stores list file.".format(arr[0, i])) - timestamp = read_hdf5(storenames[idx][0], filepath, "timestamps") - sampling_rate = read_hdf5(storenames[idx][0], filepath, "sampling_rate") + name = names_for_storenames[idx][0] + timestamp = name_to_timestamps[name] + sampling_rate = name_to_sampling_rate[name] if name_1 == name_2: correctionIndex = np.where(timestamp >= timeForLightsTurnOn)[0] timestampNew = timestamp[correctionIndex] + # TODO: Pull out write operations into preprocess.py write_hdf5(timestampNew, "timeCorrection_" + name_1, filepath, "timestampNew") write_hdf5(correctionIndex, "timeCorrection_" + name_1, filepath, "correctionIndex") write_hdf5(np.asarray(sampling_rate), "timeCorrection_" + name_1, filepath, "sampling_rate") @@ -270,19 +215,72 @@ def create_control_channel(filepath, arr, window=5001): # function to check control and signal channel has same length # if not, take a smaller length and do pre-processing -def check_cntrl_sig_length(filepath, channels_arr, storenames, storesList): +def check_cntrl_sig_length(channels_arr, name_to_data): indices = [] for i in range(channels_arr.shape[1]): - idx_c = np.where(storesList == channels_arr[0, i])[0] - idx_s = np.where(storesList == channels_arr[1, i])[0] - control = read_hdf5(storenames[idx_c[0]], filepath, "data") - signal = read_hdf5(storenames[idx_s[0]], filepath, "data") + control_name = channels_arr[0, i] + signal_name = channels_arr[1, i] + control = name_to_data[control_name] + signal = name_to_data[signal_name] if control.shape[0] < signal.shape[0]: - indices.append(storesList[idx_c[0]]) + indices.append(control_name) elif control.shape[0] > signal.shape[0]: - indices.append(storesList[idx_s[0]]) + indices.append(signal_name) else: - indices.append(storesList[idx_s[0]]) + indices.append(signal_name) return indices + + +def get_control_and_signal_channel_names(storesList): + storenames = storesList[0, :] + names_for_storenames = storesList[1, :] + + channels_arr = [] + for i in range(names_for_storenames.shape[0]): + if "control" in names_for_storenames[i].lower() or "signal" in names_for_storenames[i].lower(): + channels_arr.append(names_for_storenames[i]) + + channels_arr = sorted(channels_arr, key=str.casefold) + try: + channels_arr = np.asarray(channels_arr).reshape(2, -1) + except: + logger.error("Error in saving stores list file or spelling mistake for control or signal") + raise Exception("Error in saving stores list file or spelling mistake for control or signal") + + return channels_arr + + +def read_control_and_signal(filepath, storesList): + channels_arr = get_control_and_signal_channel_names(storesList) + storenames = storesList[0, :] + names_for_storenames = storesList[1, :] + + name_to_data = {} + name_to_timestamps = {} + name_to_sampling_rate = {} + + for i in range(channels_arr.shape[1]): + control_name = channels_arr[0, i] + signal_name = channels_arr[1, i] + idx_c = np.where(storesList == control_name)[0] + idx_s = np.where(storesList == signal_name)[0] + control_storename = storenames[idx_c[0]] + signal_storename = storenames[idx_s[0]] + + control_data = read_hdf5(control_storename, filepath, "data") + signal_data = read_hdf5(signal_storename, filepath, "data") + control_timestamps = 
read_hdf5(control_storename, filepath, "timestamps") + signal_timestamps = read_hdf5(signal_storename, filepath, "timestamps") + control_sampling_rate = read_hdf5(control_storename, filepath, "sampling_rate") + signal_sampling_rate = read_hdf5(signal_storename, filepath, "sampling_rate") + + name_to_data[control_name] = control_data + name_to_data[signal_name] = signal_data + name_to_timestamps[control_name] = control_timestamps + name_to_timestamps[signal_name] = signal_timestamps + name_to_sampling_rate[control_name] = control_sampling_rate + name_to_sampling_rate[signal_name] = signal_sampling_rate + + return name_to_data, name_to_timestamps, name_to_sampling_rate diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index 15c547f..74033f8 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -2,6 +2,7 @@ import json import logging import os +import shutil import sys import matplotlib.pyplot as plt @@ -21,9 +22,9 @@ takeOnlyDirs, ) from .analysis.timestamp_correction import ( - add_control_channel, create_control_channel, decide_naming_convention_and_applyCorrection, + read_control_and_signal, timestampCorrection_csv, timestampCorrection_tdt, ) @@ -208,6 +209,54 @@ def visualizeControlAndSignal(filepath, removeArtifacts): visualize(filepath, ts, control, signal, cntrl_sig_fit, plot_name, removeArtifacts) +# This function just creates placeholder Control-HDF5 files that are then immediately overwritten later on in the pipeline. +# TODO: Refactor this function to avoid unnecessary file creation. +# function to add control channel when there is no +# isosbestic control channel and update the storeslist file +def add_control_channel(filepath, arr): + + storenames = arr[0, :] + storesList = np.char.lower(arr[1, :]) + + keep_control = np.array([]) + # check a case if there is isosbestic control channel present + for i in range(storesList.shape[0]): + if "control" in storesList[i].lower(): + name = storesList[i].split("_")[-1] + new_str = "signal_" + str(name).lower() + find_signal = [True for i in storesList if i == new_str] + if len(find_signal) > 1: + logger.error("Error in naming convention of files or Error in storesList file") + raise Exception("Error in naming convention of files or Error in storesList file") + if len(find_signal) == 0: + logger.error( + "Isosbectic control channel parameter is set to False and still \ + storeslist file shows there is control channel present" + ) + raise Exception( + "Isosbectic control channel parameter is set to False and still \ + storeslist file shows there is control channel present" + ) + else: + continue + + for i in range(storesList.shape[0]): + if "signal" in storesList[i].lower(): + name = storesList[i].split("_")[-1] + new_str = "control_" + str(name).lower() + find_signal = [True for i in storesList if i == new_str] + if len(find_signal) == 0: + src, dst = os.path.join(filepath, arr[0, i] + ".hdf5"), os.path.join( + filepath, "cntrl" + str(i) + ".hdf5" + ) + shutil.copyfile(src, dst) + arr = np.concatenate((arr, [["cntrl" + str(i)], ["control_" + str(arr[1, i].split("_")[-1])]]), axis=1) + + np.savetxt(os.path.join(filepath, "storesList.csv"), arr, delimiter=",", fmt="%s") + + return arr + + # function to execute timestamps corrections using functions timestampCorrection and decide_naming_convention_and_applyCorrection def execute_timestamp_correction(folderNames, inputParameters): @@ -231,7 +280,12 @@ def execute_timestamp_correction(folderNames, inputParameters): if cond == True: timestampCorrection_tdt(filepath, 
timeForLightsTurnOn, storesList) else: - timestampCorrection_csv(filepath, timeForLightsTurnOn, storesList) + + control_and_signal_dicts = read_control_and_signal(filepath, storesList) + name_to_data, name_to_timestamps, name_to_sampling_rate = control_and_signal_dicts + timestampCorrection_csv( + filepath, timeForLightsTurnOn, storesList, name_to_data, name_to_timestamps, name_to_sampling_rate + ) for k in range(storesList.shape[1]): decide_naming_convention_and_applyCorrection( diff --git a/timestamp_correction_analysis.md b/timestamp_correction_analysis.md new file mode 100644 index 0000000..121aa3f --- /dev/null +++ b/timestamp_correction_analysis.md @@ -0,0 +1,723 @@ +# Timestamp Correction Module Analysis + +## Overview + +The `timestamp_correction.py` module handles the correction of timestamps for photometry data, including: +- Eliminating the first N seconds of recording (light stabilization period) +- Expanding TDT block timestamps into continuous timestamps +- Creating synthetic control channels when no isosbestic control is present +- Applying corrections to both data channels and event markers + +## Module Structure + +### Entry Point from preprocess.py + +```python +execute_timestamp_correction(folderNames, inputParameters) # preprocess.py:212 +``` + +This orchestrator loops through all session folders and calls functions in this module. + +## Two-Phase Control Channel Creation Pattern + +### Understanding add_control_channel vs create_control_channel + +These two functions work together in a **two-phase process** to handle synthetic control channel generation. They are **not redundant** but serve distinct purposes: + +#### Phase 1: `add_control_channel` (Called BEFORE timestamp correction) + +**Execution:** Line 229 in `execute_timestamp_correction` + +**Purpose:** Create **PLACEHOLDER** control files to satisfy workflow requirements + +**What it does:** +1. Validates that if `isosbestic_control=False`, no real control channels exist +2. For each signal channel without a matching control: + - Copies the raw signal HDF5 file to `cntrl{i}.hdf5` (placeholder) + - Adds entry to storesList: `[["cntrl{i}"], ["control_{region}"]]` +3. Saves updated `storesList.csv` + +**Files created:** +- `cntrl0.hdf5`, `cntrl1.hdf5`, etc. (copies of **RAW** signal data) +- Updated `storesList.csv` with placeholder entries + +**Why it's needed:** +- Timestamp correction workflow expects **paired** control/signal channels in storesList +- Without placeholders, the pairing logic in `timestampCorrection_xxx` and `check_cntrl_sig_length` would fail +- The placeholder **data is never actually used** - it just satisfies structural requirements + +#### Phase 2: `create_control_channel` (Called AFTER timestamp correction) + +**Execution:** Line 243 in `execute_timestamp_correction` + +**Purpose:** Generate **ACTUAL** synthetic control via curve fitting and overwrite placeholders + +**What it does:** +1. Looks for placeholder files (checks: `"control" in event_name.lower() and "cntrl" in event.lower()`) +2. Reads the **CORRECTED** signal data: `signal_{region}.hdf5` (after timestamp correction) +3. Calls `helper_create_control_channel()` to: + - Apply Savitzky-Golay filter to cleaned signal + - Fit to exponential function: `f(x) = a + b * exp(-(1/c) * x)` +4. **OVERWRITES** the placeholder `control_{region}.hdf5` with real synthetic control +5. 
Also exports to CSV format (legacy) + +**Files written:** +- `control_{region}.hdf5` → `data` (replaces placeholder with curve-fitted control) +- `{raw_name}.csv` (timestamps, data, sampling_rate columns) + +**Why it's separate:** +- Requires **timestamp-corrected** signal data (doesn't exist until after lines 232-239) +- Curve fitting algorithm needs clean timestamps (first N seconds eliminated) +- Cannot be done before timestamp correction without re-correcting the synthetic control + +#### Execution Timeline + +```python +# When isosbestic_control == False: + +# ========== PHASE 1: BEFORE TIMESTAMP CORRECTION ========== +# Line 229: Create placeholders (just file copies) +storesList = add_control_channel(filepath, storesList) +# Result: storesList now has paired structure +# [["Dv1A", "cntrl0"], ["signal_dms", "control_dms"]] +# Files: cntrl0.hdf5 (copy of raw signal, never used) + +# ========== TIMESTAMP CORRECTION PHASE ========== +# Lines 232-234: Process both signal AND placeholder control +timestampCorrection_tdt(filepath, timeForLightsTurnOn, storesList) +# Result: Creates timeCorrection_dms.hdf5 with correctionIndex + +# Lines 236-239: Apply corrections to all channels +decide_naming_convention_and_applyCorrection(...) +# Result: signal_dms.hdf5 now contains corrected signal data +# control_dms.hdf5 still contains uncorrected placeholder copy + +# ========== PHASE 2: AFTER TIMESTAMP CORRECTION ========== +# Line 243: Generate REAL synthetic controls +create_control_channel(filepath, storesList, window=101) +# Result: control_dms.hdf5 OVERWRITTEN with curve-fitted synthetic control +# Now contains valid control data derived from corrected signal +``` + +#### Why This Design Exists + +This is a **chicken-and-egg problem solved with placeholders:** + +1. **Requirement:** Timestamp correction expects paired control/signal channels +2. **Constraint:** Synthetic control generation requires timestamp-corrected signal data +3. **Solution:** Create dummy placeholders → correct everything → replace placeholders with real data + +#### Visual Flow + +```mermaid +flowchart TD + A[isosbestic_control = False] --> B[add_control_channel] + B --> C[Copy signal.hdf5 to cntrl0.hdf5] + C --> D[Update storesList.csv] + + D --> E[timestampCorrection_xxx] + E --> F[Creates timeCorrection_dms.hdf5] + + F --> G[decide_naming_convention_and_applyCorrection] + G --> H[Corrects signal_dms.hdf5] + G --> I[Corrects control_dms.hdf5
still contains placeholder] + + I --> J[create_control_channel] + J --> K[Read corrected signal_dms.hdf5] + K --> L[helper_create_control_channel<br>curve fit] + L --> M[OVERWRITE control_dms.hdf5
with synthetic control] + + style C fill:#fff3cd + style I fill:#fff3cd + style M fill:#d4edda +``` + +#### Refactoring Opportunity + +This placeholder pattern is a **code smell** indicating potential design improvements: + +**Issues:** +1. **Unnecessary I/O:** Placeholder files are written and then overwritten +2. **Confusing flow:** Hard to understand that placeholders are temporary +3. **Tight coupling:** Timestamp correction assumes paired files exist +4. **Wasted computation:** Placeholder controls get timestamp-corrected unnecessarily + +**Potential Improvements:** + +**Option 1: Lazy Control Creation** +- Modify timestamp correction to handle missing controls gracefully +- Only create synthetic controls after all corrections complete +- Remove placeholder file creation entirely + +**Option 2: Data Structure Refactoring** +- Use a data structure that doesn't require physical paired files upfront +- Track "needs synthetic control" as metadata rather than file presence +- Generate and write controls only once at the end + +**Option 3: Two-Pass Workflow** +- First pass: Correct only signal channels +- Second pass: Generate synthetic controls from corrected signals +- Would require refactoring `check_cntrl_sig_length` and pairing logic + +## Function Catalog + +### 1. add_control_channel +**Location:** `timestamp_correction.py:20` +**Purpose:** Create placeholder control channel files when no isosbestic control exists + +```python +def add_control_channel(filepath, arr) -> arr +``` + +**Input:** +- `filepath`: Path to session output folder +- `arr`: 2D array `[[storenames], [storesList]]` from storesList.csv + +**Process:** +1. Validates that control/signal pairs match (raises error if mismatched) +2. For each signal channel without a matching control: + - Copies signal HDF5 file to `cntrl{i}.hdf5` (placeholder) + - Adds entry to storesList array: `[["cntrl{i}"], ["control_{region}"]]` +3. Writes updated storesList.csv + +**Output:** +- Updated `arr` with new control channel entries +- **Files Written:** Updated `storesList.csv`, copied `cntrl*.hdf5` files + +**I/O Summary:** +- **Reads:** Signal HDF5 files (via shutil.copyfile) +- **Writes:** `storesList.csv`, placeholder `cntrl*.hdf5` files + +--- + +### 2. timestampCorrection_csv +**Location:** `timestamp_correction.py:65` +**Purpose:** Correct timestamps for CSV-format data (Doric, NPM, custom CSV) + +```python +def timestampCorrection_csv(filepath, timeForLightsTurnOn, storesList) +``` + +**Input:** +- `filepath`: Path to session output folder +- `timeForLightsTurnOn`: Seconds to eliminate from start (default: 1) +- `storesList`: 2D array `[[storenames], [storesList]]` + +**Process:** +1. Filters storesList to control/signal channels only +2. Pairs control/signal channels, validates naming matches +3. Calls `check_cntrl_sig_length()` to determine which channel to use (shorter one) +4. For each control/signal pair: + - **Reads:** `timestamps` and `sampling_rate` from raw HDF5 + - **Computes:** `correctionIndex = np.where(timestamp >= timeForLightsTurnOn)` + - **Writes:** `timeCorrection_{region}.hdf5` with keys: + - `timestampNew`: Corrected timestamps + - `correctionIndex`: Indices to keep + - `sampling_rate`: Sampling rate + +**Output:** +- **Files Written:** `timeCorrection_{region}.hdf5` for each control/signal pair + +**I/O Summary:** +- **Reads:** `{storename}.hdf5` → `timestamps`, `sampling_rate` +- **Writes:** `timeCorrection_{region}.hdf5` → `timestampNew`, `correctionIndex`, `sampling_rate` + +--- + +### 3. 
timestampCorrection_tdt +**Location:** `timestamp_correction.py:115` +**Purpose:** Correct timestamps for TDT-format data (expands block timestamps) + +```python +def timestampCorrection_tdt(filepath, timeForLightsTurnOn, storesList) +``` + +**Input:** Same as `timestampCorrection_csv` + +**Process:** +1. Filters storesList to control/signal channels only +2. Pairs control/signal channels, validates naming matches +3. Calls `check_cntrl_sig_length()` to determine which channel to use +4. For each control/signal pair: + - **Reads:** `timestamps`, `npoints`, `sampling_rate` from raw HDF5 + - **TDT-specific expansion algorithm:** + ```python + timeRecStart = timestamp[0] + timestamps = np.subtract(timestamp, timeRecStart) # Zero-base + adder = np.arange(npoints) / sampling_rate # Within-block offsets + # Expand: for each block timestamp, add within-block offsets + timestampNew = np.zeros((len(timestamps), lengthAdder)) + for i in range(lengthAdder): + timestampNew[:, i] = np.add(timestamps, adder[i]) + timestampNew = (timestampNew.T).reshape(-1, order="F") # Flatten + correctionIndex = np.where(timestampNew >= timeForLightsTurnOn) + timestampNew = timestampNew[correctionIndex] + ``` + - **Writes:** `timeCorrection_{region}.hdf5` with keys: + - `timeRecStart`: Recording start time (TDT-specific) + - `timestampNew`: Expanded, corrected timestamps + - `correctionIndex`: Indices to keep + - `sampling_rate`: Sampling rate + +**Output:** +- **Files Written:** `timeCorrection_{region}.hdf5` with TDT-specific `timeRecStart` key + +**I/O Summary:** +- **Reads:** `{storename}.hdf5` → `timestamps`, `npoints`, `sampling_rate` +- **Writes:** `timeCorrection_{region}.hdf5` → `timeRecStart`, `timestampNew`, `correctionIndex`, `sampling_rate` + +--- + +### 4. check_cntrl_sig_length +**Location:** `timestamp_correction.py:273` +**Purpose:** Determine which channel (control or signal) to use as reference based on length + +```python +def check_cntrl_sig_length(filepath, channels_arr, storenames, storesList) -> indices +``` + +**Input:** +- `filepath`: Path to session output folder +- `channels_arr`: Paired control/signal array `[["control_A", "control_B"], ["signal_A", "signal_B"]]` +- `storenames`: Raw HDF5 filenames +- `storesList`: Semantic channel names + +**Process:** +1. For each control/signal pair: + - **Reads:** `data` from both control and signal HDF5 + - Compares lengths: `control.shape[0]` vs `signal.shape[0]` + - Returns the shorter one's storename (or signal if equal) + +**Output:** +- List of storenames to use for timestamp correction (one per pair) + +**I/O Summary:** +- **Reads:** `{control_storename}.hdf5` → `data`, `{signal_storename}.hdf5` → `data` + +**Note:** This is a pure analysis function but performs I/O to determine which data to use. + +--- + +### 5. decide_naming_convention_and_applyCorrection +**Location:** `timestamp_correction.py:178` +**Purpose:** Loop through all channels and apply timestamp corrections + +```python +def decide_naming_convention_and_applyCorrection(filepath, timeForLightsTurnOn, event, displayName, storesList) +``` + +**Input:** +- `filepath`: Path to session output folder +- `timeForLightsTurnOn`: Seconds eliminated from start +- `event`: Raw storename (e.g., "Dv1A") +- `displayName`: Semantic name (e.g., "control_DMS") +- `storesList`: Full storesList array + +**Process:** +1. Filters storesList to control/signal channels +2. Pairs channels and validates naming conventions +3. 
For each pair, calls `applyCorrection(filepath, timeForLightsTurnOn, event, displayName, region)` + +**Output:** +- Delegates to `applyCorrection()` (no direct I/O) + +--- + +### 6. applyCorrection +**Location:** `timestamp_correction.py:205` +**Purpose:** Apply timestamp corrections to data channels or event markers + +```python +def applyCorrection(filepath, timeForLightsTurnOn, event, displayName, naming) +``` + +**Input:** +- `filepath`: Path to session output folder +- `timeForLightsTurnOn`: Seconds eliminated from start +- `event`: Raw storename +- `displayName`: Semantic display name +- `naming`: Region identifier (e.g., "dms") + +**Process:** + +**For Control/Signal Channels:** +1. **Reads:** `timeCorrection_{naming}.hdf5` → `correctionIndex` +2. **Reads:** `{event}.hdf5` → `data` +3. **Applies:** `arr = arr[correctionIndex]` (crops data) +4. **Writes:** `{displayName}.hdf5` → `data` (overwrites with corrected data) + +**For Event Channels:** +1. Detects TDT format: `check_TDT(os.path.dirname(filepath))` +2. **Reads:** `timeCorrection_{naming}.hdf5` → `timeRecStart` (if TDT) +3. **Reads:** `{event}.hdf5` → `timestamps` +4. **Applies corrections:** + - If TDT and timestamps >= timeRecStart: subtract both `timeRecStart` and `timeForLightsTurnOn` + - Otherwise: subtract only `timeForLightsTurnOn` +5. **Writes:** `{event}_{naming}.hdf5` → `ts` (corrected event timestamps) + +**Output:** +- **Files Written:** + - `{displayName}.hdf5` → `data` (for control/signal) + - `{event}_{naming}.hdf5` → `ts` (for events) + +**I/O Summary:** +- **Reads:** `timeCorrection_{naming}.hdf5`, `{event}.hdf5` +- **Writes:** `{displayName}.hdf5` or `{event}_{naming}.hdf5` + +--- + +### 7. create_control_channel +**Location:** `timestamp_correction.py:247` +**Purpose:** Generate synthetic control channel using curve fitting (when no isosbestic control exists) + +```python +def create_control_channel(filepath, arr, window=5001) +``` + +**Input:** +- `filepath`: Path to session output folder +- `arr`: storesList array `[[storenames], [storesList]]` +- `window`: Savitzky-Golay filter window (default: 5001) + +**Process:** +1. Loops through storesList to find placeholder control channels (`cntrl` in storename) +2. 
For each placeholder: + - **Reads:** `signal_{region}.hdf5` → `data` (corrected signal) + - **Reads:** `timeCorrection_{region}.hdf5` → `timestampNew`, `sampling_rate` + - **Calls:** `helper_create_control_channel(signal, timestampNew, window)` from `control_channel.py` + - Applies Savitzky-Golay filter + - Fits to exponential: `f(x) = a + b * exp(-(1/c) * x)` + - **Writes:** `{control_name}.hdf5` → `data` (synthetic control) + - **Writes:** `{event_name}.csv` with columns: `timestamps`, `data`, `sampling_rate` + +**Output:** +- **Files Written:** + - `control_{region}.hdf5` → `data` (replaces placeholder) + - `{raw_name}.csv` (legacy format export) + +**I/O Summary:** +- **Reads:** `signal_{region}.hdf5` → `data`, `timeCorrection_{region}.hdf5` → `timestampNew`, `sampling_rate` +- **Writes:** `control_{region}.hdf5` → `data`, `{raw_name}.csv` + +--- + +## Data Flow Diagram + +### High-Level Flow (called from execute_timestamp_correction) + +```mermaid +flowchart TD + A[execute_timestamp_correction] --> B[Read storesList.csv] + B --> C{isosbestic_control?} + + C -->|False| D[add_control_channel] + C -->|True| E{Check format} + D --> E + + E -->|TDT| F[timestampCorrection_tdt] + E -->|CSV/Doric/NPM| G[timestampCorrection_csv] + + F --> H[Loop: decide_naming_convention_and_applyCorrection] + G --> H + + H --> I[For each store: applyCorrection] + + I --> J{isosbestic_control?} + J -->|False| K[create_control_channel] + J -->|True| L[Done] + K --> L + + style A fill:#e1f5ff + style L fill:#d4edda +``` + +### Detailed Flow: timestampCorrection Functions + +```mermaid +flowchart LR + A[Raw HDF5 files] --> B[check_cntrl_sig_length] + B --> C[Read control & signal data] + C --> D[Return shorter channel name] + + D --> E{Format?} + E -->|CSV| F[timestampCorrection_csv] + E -->|TDT| G[timestampCorrection_tdt] + + F --> H[Read timestamps from selected channel] + G --> I[Read timestamps, npoints, sampling_rate] + + H --> J[correctionIndex = where >= timeForLightsTurnOn] + I --> K[Expand block timestamps] + K --> J + + J --> L[Write timeCorrection_{region}.hdf5] + + style A fill:#e1f5ff + style L fill:#d4edda +``` + +### Detailed Flow: applyCorrection + +```mermaid +flowchart TD + A[applyCorrection called] --> B{Channel type?} + + B -->|control/signal| C[Read correctionIndex] + B -->|event| D[Read event timestamps] + + C --> E[Read raw data] + E --> F[data = data correctionIndex] + F --> G[Write displayName.hdf5] + + D --> H{TDT format?} + H -->|Yes| I[Read timeRecStart] + H -->|No| J[ts -= timeForLightsTurnOn] + + I --> K[ts -= timeRecStart] + K --> J + J --> L[Write event_region.hdf5] + + style A fill:#e1f5ff + style G fill:#d4edda + style L fill:#d4edda +``` + +### Detailed Flow: Control Channel Creation + +```mermaid +flowchart LR + A[add_control_channel] --> B[For each signal without control] + B --> C[Copy signal.hdf5 to cntrl_i.hdf5] + C --> D[Update storesList.csv] + + D --> E[... timestamp correction ...] 
+ + E --> F[create_control_channel] + F --> G[For each cntrl_i placeholder] + G --> H[Read signal_{region}.hdf5] + H --> I[helper_create_control_channel] + I --> J[Savitzky-Golay filter] + J --> K[Curve fit to exponential] + K --> L[Write control_{region}.hdf5] + L --> M[Export to CSV] + + style A fill:#fff3cd + style M fill:#d4edda +``` + +## Execution Order in execute_timestamp_correction + +```python +# preprocess.py:212-247 +for each session in folderNames: + for each output_folder in session: + # Step 1: Read metadata + storesList = np.genfromtxt("storesList.csv") + + # Step 2: Add placeholder controls if needed + if isosbestic_control == False: + storesList = add_control_channel(filepath, storesList) + + # Step 3: Compute correctionIndex and timestampNew + if check_TDT(folderName): + timestampCorrection_tdt(filepath, timeForLightsTurnOn, storesList) + else: + timestampCorrection_csv(filepath, timeForLightsTurnOn, storesList) + + # Step 4: Apply corrections to all channels/events + for each store in storesList: + decide_naming_convention_and_applyCorrection( + filepath, timeForLightsTurnOn, storename, displayName, storesList + ) + # ^ This calls applyCorrection for each channel + + # Step 5: Generate synthetic controls via curve fitting + if isosbestic_control == False: + create_control_channel(filepath, storesList, window=101) +``` + +## File I/O Summary + +### Files Read + +| Function | Files Read | Keys | +|----------|-----------|------| +| `add_control_channel` | `signal_*.hdf5` (for copying) | - | +| `timestampCorrection_csv` | `{storename}.hdf5` | `timestamps`, `sampling_rate` | +| `timestampCorrection_tdt` | `{storename}.hdf5` | `timestamps`, `npoints`, `sampling_rate` | +| `check_cntrl_sig_length` | `control_*.hdf5`, `signal_*.hdf5` | `data` | +| `applyCorrection` | `timeCorrection_{region}.hdf5`
`{event}.hdf5` | `correctionIndex`, `timeRecStart` (TDT)<br>`data` or `timestamps` | +| `create_control_channel` | `signal_{region}.hdf5`<br>`timeCorrection_{region}.hdf5` | `data`<br>`timestampNew`, `sampling_rate` | + +### Files Written + +| Function | Files Written | Keys | Notes | +|----------|--------------|------|-------| +| `add_control_channel` | `storesList.csv`<br>`cntrl{i}.hdf5` | -<br>(copy of signal) | Placeholder files | +| `timestampCorrection_csv` | `timeCorrection_{region}.hdf5` | `timestampNew`, `correctionIndex`, `sampling_rate` | One per region | +| `timestampCorrection_tdt` | `timeCorrection_{region}.hdf5` | `timeRecStart`, `timestampNew`, `correctionIndex`, `sampling_rate` | TDT-specific | +| `applyCorrection` | `{displayName}.hdf5`<br>`{event}_{region}.hdf5` | `data`<br>`ts` | Overwrites with corrected data | +| `create_control_channel` | `control_{region}.hdf5`<br>`{raw_name}.csv` | `data`
timestamps, data, sampling_rate | Replaces placeholder | + +## Key Transformations + +### 1. Timestamp Expansion (TDT only) + +**Input:** Block timestamps (one per acquisition block) +**Algorithm:** +```python +timeRecStart = timestamp[0] +timestamps = timestamp - timeRecStart # Zero-base +adder = np.arange(npoints) / sampling_rate # Within-block offsets [0, 1/fs, 2/fs, ...] +# Matrix multiplication to expand: +timestampNew = zeros((n_blocks, npoints)) +for i in range(npoints): + timestampNew[:, i] = timestamps + adder[i] +timestampNew = timestampNew.T.reshape(-1, order='F') # Column-major flatten +``` +**Output:** Continuous timestamps at full sampling rate + +### 2. Correction Index Computation + +**Input:** Timestamps array, `timeForLightsTurnOn` +**Algorithm:** +```python +correctionIndex = np.where(timestamp >= timeForLightsTurnOn)[0] +``` +**Output:** Indices of timestamps to keep (after eliminating first N seconds) + +### 3. Data Cropping + +**Applied to:** Control/signal data channels +**Algorithm:** +```python +data_corrected = data[correctionIndex] +``` + +### 4. Event Timestamp Adjustment + +**Applied to:** Event markers (TTL pulses) +**Algorithm:** +```python +# CSV format: +ts_corrected = ts - timeForLightsTurnOn + +# TDT format (if ts >= timeRecStart): +ts_corrected = ts - timeRecStart - timeForLightsTurnOn +``` + +### 5. Synthetic Control Generation + +**Input:** Signal channel (already corrected) +**Algorithm:** +1. Apply Savitzky-Golay filter: `filtered_signal = savgol_filter(signal, window, polyorder=3)` +2. Curve fit to exponential: `control = a + b * exp(-(1/c) * t)` +3. Return fitted curve as synthetic control + +## Analysis for I/O Separation + +### Pure Analysis Functions (Minimal I/O) +These could be extracted with I/O injected: +- ❌ None - all functions perform substantial I/O + +### Orchestration Functions (Heavy I/O, Light Analysis) +These coordinate reading/writing and delegate computation: +- `add_control_channel` - File copying and CSV writing +- `decide_naming_convention_and_applyCorrection` - Loops and delegates +- `create_control_channel` - Orchestrates read → process → write + +### Mixed Functions (I/O + Analysis) +These perform both I/O and computation inline: +- `timestampCorrection_csv` - Reads data, computes correctionIndex, writes results +- `timestampCorrection_tdt` - Reads data, expands timestamps, computes correctionIndex, writes +- `applyCorrection` - Reads multiple files, applies transformations, writes +- `check_cntrl_sig_length` - Reads data just to compare lengths + +## Refactoring Recommendations for I/O Separation + +### Option 1: Extract Pure Computation Functions + +Create new pure functions: +```python +# Pure analysis (no I/O) +def compute_correction_index(timestamps, timeForLightsTurnOn): + return np.where(timestamps >= timeForLightsTurnOn)[0] + +def expand_tdt_timestamps(block_timestamps, npoints, sampling_rate): + # TDT expansion algorithm + ... + return expanded_timestamps + +def crop_data_by_index(data, correctionIndex): + return data[correctionIndex] + +def adjust_event_timestamps(ts, timeRecStart, timeForLightsTurnOn, is_tdt): + # Event adjustment logic + ... + return adjusted_ts +``` + +Then modify existing functions to use these pure functions, keeping I/O separate. + +### Option 2: Reader/Writer Pattern + +Create dedicated I/O classes: +```python +class TimestampCorrectionReader: + def read_raw_timestamps(self, filepath, storename): + ... + + def read_correction_data(self, filepath, region): + ... 
+ +class TimestampCorrectionWriter: + def write_correction_file(self, filepath, region, data): + ... + + def write_corrected_data(self, filepath, displayName, data): + ... +``` + +### Option 3: Data Class Pattern + +Return data objects instead of writing directly: +```python +@dataclass +class TimestampCorrection: + timestampNew: np.ndarray + correctionIndex: np.ndarray + sampling_rate: float + timeRecStart: Optional[float] = None # TDT only + +def timestampCorrection_tdt(...) -> TimestampCorrection: + # Compute all values + return TimestampCorrection( + timestampNew=..., + correctionIndex=..., + sampling_rate=..., + timeRecStart=... + ) + +# Separate writer function +def write_timestamp_correction(filepath, region, correction: TimestampCorrection): + write_hdf5(correction.timestampNew, f"timeCorrection_{region}", filepath, "timestampNew") + # ... etc +``` + +## Current I/O Patterns to Refactor + +1. **Inline writes in computation functions:** + - `timestampCorrection_csv` and `timestampCorrection_tdt` compute AND write + - Should separate: compute → return data → write in caller + +2. **Reading for validation only:** + - `check_cntrl_sig_length` reads full data arrays just to compare shapes + - Could be optimized to read only array metadata/shapes + +3. **Side-effect file creation:** + - `add_control_channel` creates files as side effect + - `create_control_channel` both generates data AND writes multiple formats (HDF5 + CSV) + +4. **Mixed responsibilities in applyCorrection:** + - Handles both control/signal cropping AND event timestamp adjustment + - Could be split into two separate functions From aa36e330f790eaccea333f89703e8d49bfb31bfd Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Thu, 11 Dec 2025 15:07:26 -0800 Subject: [PATCH 067/150] Inverted name check --- src/guppy/analysis/timestamp_correction.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/src/guppy/analysis/timestamp_correction.py b/src/guppy/analysis/timestamp_correction.py index e179d26..71b4760 100644 --- a/src/guppy/analysis/timestamp_correction.py +++ b/src/guppy/analysis/timestamp_correction.py @@ -30,6 +30,10 @@ def timestampCorrection_csv( for i in range(arr.shape[1]): name_1 = arr[0, i].split("_")[-1] name_2 = arr[1, i].split("_")[-1] + if name_1 != name_2: + logger.error("Error in naming convention of files or Error in storesList file") + raise Exception("Error in naming convention of files or Error in storesList file") + # dirname = os.path.dirname(path[i]) idx = np.where(names_for_storenames == indices[i])[0] @@ -41,17 +45,12 @@ def timestampCorrection_csv( timestamp = name_to_timestamps[name] sampling_rate = name_to_sampling_rate[name] - if name_1 == name_2: - correctionIndex = np.where(timestamp >= timeForLightsTurnOn)[0] - timestampNew = timestamp[correctionIndex] - # TODO: Pull out write operations into preprocess.py - write_hdf5(timestampNew, "timeCorrection_" + name_1, filepath, "timestampNew") - write_hdf5(correctionIndex, "timeCorrection_" + name_1, filepath, "correctionIndex") - write_hdf5(np.asarray(sampling_rate), "timeCorrection_" + name_1, filepath, "sampling_rate") - - else: - logger.error("Error in naming convention of files or Error in storesList file") - raise Exception("Error in naming convention of files or Error in storesList file") + correctionIndex = np.where(timestamp >= timeForLightsTurnOn)[0] + timestampNew = timestamp[correctionIndex] + # TODO: Pull out write operations into preprocess.py + write_hdf5(timestampNew, "timeCorrection_" + 
name_1, filepath, "timestampNew") + write_hdf5(correctionIndex, "timeCorrection_" + name_1, filepath, "correctionIndex") + write_hdf5(np.asarray(sampling_rate), "timeCorrection_" + name_1, filepath, "sampling_rate") logger.info("Timestamps corrected and converted to seconds.") From 2049c4a5bd2337324a8b2bde0a7da88ba2922013 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Thu, 11 Dec 2025 15:40:52 -0800 Subject: [PATCH 068/150] Refactored out write --- src/guppy/analysis/timestamp_correction.py | 26 +++++++++++++++------- src/guppy/preprocess.py | 9 +++++--- 2 files changed, 24 insertions(+), 11 deletions(-) diff --git a/src/guppy/analysis/timestamp_correction.py b/src/guppy/analysis/timestamp_correction.py index 71b4760..8fbb8f9 100644 --- a/src/guppy/analysis/timestamp_correction.py +++ b/src/guppy/analysis/timestamp_correction.py @@ -15,12 +15,11 @@ # function to correct timestamps after eliminating first few seconds of the data (for csv data) -def timestampCorrection_csv( - filepath, timeForLightsTurnOn, storesList, name_to_data, name_to_timestamps, name_to_sampling_rate -): +def timestampCorrection_csv(timeForLightsTurnOn, storesList, name_to_data, name_to_timestamps): logger.debug( f"Correcting timestamps by getting rid of the first {timeForLightsTurnOn} seconds and convert timestamps to seconds" ) + name_to_timestamps = name_to_timestamps.copy() storenames = storesList[0, :] names_for_storenames = storesList[1, :] arr = get_control_and_signal_channel_names(storesList) @@ -43,16 +42,27 @@ def timestampCorrection_csv( name = names_for_storenames[idx][0] timestamp = name_to_timestamps[name] - sampling_rate = name_to_sampling_rate[name] correctionIndex = np.where(timestamp >= timeForLightsTurnOn)[0] timestampNew = timestamp[correctionIndex] - # TODO: Pull out write operations into preprocess.py - write_hdf5(timestampNew, "timeCorrection_" + name_1, filepath, "timestampNew") - write_hdf5(correctionIndex, "timeCorrection_" + name_1, filepath, "correctionIndex") - write_hdf5(np.asarray(sampling_rate), "timeCorrection_" + name_1, filepath, "sampling_rate") + name_to_timestamps[name] = timestampNew logger.info("Timestamps corrected and converted to seconds.") + return name_to_timestamps + + +def write_corrected_timestamps(filepath, corrected_name_to_timestamps, name_to_timestamps, name_to_sampling_rate): + for name, timestamps in name_to_timestamps.items(): + corrected_timestamps = corrected_name_to_timestamps[name] + correctionIndex = np.where(timestamps >= corrected_timestamps[0])[0] + sampling_rate = name_to_sampling_rate[name] + name_1 = name.split("_")[-1] + assert np.array_equal( + corrected_timestamps, timestamps[correctionIndex] + ), "Timestamps do not match after correction" + write_hdf5(corrected_timestamps, "timeCorrection_" + name_1, filepath, "timestampNew") + write_hdf5(correctionIndex, "timeCorrection_" + name_1, filepath, "correctionIndex") + write_hdf5(np.asarray(sampling_rate), "timeCorrection_" + name_1, filepath, "sampling_rate") # function to correct timestamps after eliminating first few seconds of the data (for TDT data) diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index 74033f8..413246d 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -27,6 +27,7 @@ read_control_and_signal, timestampCorrection_csv, timestampCorrection_tdt, + write_corrected_timestamps, ) from .analysis.z_score import compute_z_score @@ -280,11 +281,13 @@ def execute_timestamp_correction(folderNames, inputParameters): if cond == True: 
timestampCorrection_tdt(filepath, timeForLightsTurnOn, storesList) else: - control_and_signal_dicts = read_control_and_signal(filepath, storesList) name_to_data, name_to_timestamps, name_to_sampling_rate = control_and_signal_dicts - timestampCorrection_csv( - filepath, timeForLightsTurnOn, storesList, name_to_data, name_to_timestamps, name_to_sampling_rate + corrected_name_to_timestamps = timestampCorrection_csv( + timeForLightsTurnOn, storesList, name_to_data, name_to_timestamps + ) + write_corrected_timestamps( + filepath, corrected_name_to_timestamps, name_to_timestamps, name_to_sampling_rate ) for k in range(storesList.shape[1]): From 8b50fb70522732a3413c60262262845548c4e4da Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 12 Dec 2025 11:44:59 -0800 Subject: [PATCH 069/150] Refactored read and write out of timestampcorrection_tdt --- src/guppy/analysis/timestamp_correction.py | 103 +++++++++++---------- src/guppy/preprocess.py | 29 +++++- 2 files changed, 78 insertions(+), 54 deletions(-) diff --git a/src/guppy/analysis/timestamp_correction.py b/src/guppy/analysis/timestamp_correction.py index 8fbb8f9..4e37efe 100644 --- a/src/guppy/analysis/timestamp_correction.py +++ b/src/guppy/analysis/timestamp_correction.py @@ -20,6 +20,7 @@ def timestampCorrection_csv(timeForLightsTurnOn, storesList, name_to_data, name_ f"Correcting timestamps by getting rid of the first {timeForLightsTurnOn} seconds and convert timestamps to seconds" ) name_to_timestamps = name_to_timestamps.copy() + name_to_correctionIndex = {} storenames = storesList[0, :] names_for_storenames = storesList[1, :] arr = get_control_and_signal_channel_names(storesList) @@ -46,85 +47,78 @@ def timestampCorrection_csv(timeForLightsTurnOn, storesList, name_to_data, name_ correctionIndex = np.where(timestamp >= timeForLightsTurnOn)[0] timestampNew = timestamp[correctionIndex] name_to_timestamps[name] = timestampNew + name_to_correctionIndex[name] = correctionIndex logger.info("Timestamps corrected and converted to seconds.") - return name_to_timestamps + return name_to_timestamps, name_to_correctionIndex -def write_corrected_timestamps(filepath, corrected_name_to_timestamps, name_to_timestamps, name_to_sampling_rate): - for name, timestamps in name_to_timestamps.items(): +def write_corrected_timestamps( + filepath, corrected_name_to_timestamps, name_to_timestamps, name_to_sampling_rate, name_to_correctionIndex +): + for name, correctionIndex in name_to_correctionIndex.items(): + timestamps = name_to_timestamps[name] corrected_timestamps = corrected_name_to_timestamps[name] - correctionIndex = np.where(timestamps >= corrected_timestamps[0])[0] sampling_rate = name_to_sampling_rate[name] + if sampling_rate.shape == (): # numpy scalar + sampling_rate = np.asarray([sampling_rate]) name_1 = name.split("_")[-1] - assert np.array_equal( - corrected_timestamps, timestamps[correctionIndex] - ), "Timestamps do not match after correction" + write_hdf5(np.asarray([timestamps[0]]), "timeCorrection_" + name_1, filepath, "timeRecStart") write_hdf5(corrected_timestamps, "timeCorrection_" + name_1, filepath, "timestampNew") write_hdf5(correctionIndex, "timeCorrection_" + name_1, filepath, "correctionIndex") - write_hdf5(np.asarray(sampling_rate), "timeCorrection_" + name_1, filepath, "sampling_rate") + write_hdf5(sampling_rate, "timeCorrection_" + name_1, filepath, "sampling_rate") # function to correct timestamps after eliminating first few seconds of the data (for TDT data) -def timestampCorrection_tdt(filepath, timeForLightsTurnOn, 
storesList): - +def timestampCorrection_tdt( + filepath, timeForLightsTurnOn, storesList, name_to_timestamps, name_to_data, name_to_sampling_rate, name_to_npoints +): logger.debug( f"Correcting timestamps by getting rid of the first {timeForLightsTurnOn} seconds and convert timestamps to seconds" ) + name_to_timestamps = name_to_timestamps.copy() + name_to_correctionIndex = {} storenames = storesList[0, :] - storesList = storesList[1, :] - - arr = [] - for i in range(storesList.shape[0]): - if "control" in storesList[i].lower() or "signal" in storesList[i].lower(): - arr.append(storesList[i]) - - arr = sorted(arr, key=str.casefold) - - try: - arr = np.asarray(arr).reshape(2, -1) - except: - logger.error("Error in saving stores list file or spelling mistake for control or signal") - raise Exception("Error in saving stores list file or spelling mistake for control or signal") + names_for_storenames = storesList[1, :] + arr = get_control_and_signal_channel_names(storesList) - indices = check_cntrl_sig_length(filepath, arr, storenames, storesList) + indices = check_cntrl_sig_length(arr, name_to_data) for i in range(arr.shape[1]): name_1 = arr[0, i].split("_")[-1] name_2 = arr[1, i].split("_")[-1] + if name_1 != name_2: + logger.error("Error in naming convention of files or Error in storesList file") + raise Exception("Error in naming convention of files or Error in storesList file") + # dirname = os.path.dirname(path[i]) - idx = np.where(storesList == indices[i])[0] + idx = np.where(names_for_storenames == indices[i])[0] if idx.shape[0] == 0: logger.error(f"{arr[0,i]} does not exist in the stores list file.") raise Exception("{} does not exist in the stores list file.".format(arr[0, i])) - timestamp = read_hdf5(storenames[idx][0], filepath, "timestamps") - npoints = read_hdf5(storenames[idx][0], filepath, "npoints") - sampling_rate = read_hdf5(storenames[idx][0], filepath, "sampling_rate") + name = names_for_storenames[idx][0] + timestamp = name_to_timestamps[name] + sampling_rate = name_to_sampling_rate[name] + npoints = name_to_npoints[name] + + timeRecStart = timestamp[0] + timestamps = np.subtract(timestamp, timeRecStart) + adder = np.arange(npoints) / sampling_rate + lengthAdder = adder.shape[0] + timestampNew = np.zeros((len(timestamps), lengthAdder)) + for i in range(lengthAdder): + timestampNew[:, i] = np.add(timestamps, adder[i]) + timestampNew = (timestampNew.T).reshape(-1, order="F") + correctionIndex = np.where(timestampNew >= timeForLightsTurnOn)[0] + timestampNew = timestampNew[correctionIndex] - if name_1 == name_2: - timeRecStart = timestamp[0] - timestamps = np.subtract(timestamp, timeRecStart) - adder = np.arange(npoints) / sampling_rate - lengthAdder = adder.shape[0] - timestampNew = np.zeros((len(timestamps), lengthAdder)) - for i in range(lengthAdder): - timestampNew[:, i] = np.add(timestamps, adder[i]) - timestampNew = (timestampNew.T).reshape(-1, order="F") - correctionIndex = np.where(timestampNew >= timeForLightsTurnOn)[0] - timestampNew = timestampNew[correctionIndex] - - write_hdf5(np.asarray([timeRecStart]), "timeCorrection_" + name_1, filepath, "timeRecStart") - write_hdf5(timestampNew, "timeCorrection_" + name_1, filepath, "timestampNew") - write_hdf5(correctionIndex, "timeCorrection_" + name_1, filepath, "correctionIndex") - write_hdf5(np.asarray([sampling_rate]), "timeCorrection_" + name_1, filepath, "sampling_rate") - else: - logger.error("Error in naming convention of files or Error in storesList file") - raise Exception("Error in naming convention of files or 
Error in storesList file") + name_to_timestamps[name] = timestampNew + name_to_correctionIndex[name] = correctionIndex logger.info("Timestamps corrected and converted to seconds.") - # return timeRecStart, correctionIndex, timestampNew + return name_to_timestamps, name_to_correctionIndex # function to check if naming convention was followed while saving storeslist file @@ -269,6 +263,7 @@ def read_control_and_signal(filepath, storesList): name_to_data = {} name_to_timestamps = {} name_to_sampling_rate = {} + name_to_npoints = {} for i in range(channels_arr.shape[1]): control_name = channels_arr[0, i] @@ -284,6 +279,12 @@ def read_control_and_signal(filepath, storesList): signal_timestamps = read_hdf5(signal_storename, filepath, "timestamps") control_sampling_rate = read_hdf5(control_storename, filepath, "sampling_rate") signal_sampling_rate = read_hdf5(signal_storename, filepath, "sampling_rate") + try: # TODO: define npoints for csv datasets + control_npoints = read_hdf5(control_storename, filepath, "npoints") + signal_npoints = read_hdf5(signal_storename, filepath, "npoints") + except KeyError: # npoints is not defined for csv datasets + control_npoints = None + signal_npoints = None name_to_data[control_name] = control_data name_to_data[signal_name] = signal_data @@ -291,5 +292,7 @@ def read_control_and_signal(filepath, storesList): name_to_timestamps[signal_name] = signal_timestamps name_to_sampling_rate[control_name] = control_sampling_rate name_to_sampling_rate[signal_name] = signal_sampling_rate + name_to_npoints[control_name] = control_npoints + name_to_npoints[signal_name] = signal_npoints - return name_to_data, name_to_timestamps, name_to_sampling_rate + return name_to_data, name_to_timestamps, name_to_sampling_rate, name_to_npoints diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index 413246d..db9d8d0 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -279,15 +279,36 @@ def execute_timestamp_correction(folderNames, inputParameters): storesList = add_control_channel(filepath, storesList) if cond == True: - timestampCorrection_tdt(filepath, timeForLightsTurnOn, storesList) + control_and_signal_dicts = read_control_and_signal(filepath, storesList) + name_to_data, name_to_timestamps, name_to_sampling_rate, name_to_npoints = control_and_signal_dicts + corrected_name_to_timestamps, name_to_correctionIndex = timestampCorrection_tdt( + filepath, + timeForLightsTurnOn, + storesList, + name_to_timestamps, + name_to_data, + name_to_sampling_rate, + name_to_npoints, + ) + write_corrected_timestamps( + filepath, + corrected_name_to_timestamps, + name_to_timestamps, + name_to_sampling_rate, + name_to_correctionIndex, + ) else: control_and_signal_dicts = read_control_and_signal(filepath, storesList) - name_to_data, name_to_timestamps, name_to_sampling_rate = control_and_signal_dicts - corrected_name_to_timestamps = timestampCorrection_csv( + name_to_data, name_to_timestamps, name_to_sampling_rate, _ = control_and_signal_dicts + corrected_name_to_timestamps, name_to_correctionIndex = timestampCorrection_csv( timeForLightsTurnOn, storesList, name_to_data, name_to_timestamps ) write_corrected_timestamps( - filepath, corrected_name_to_timestamps, name_to_timestamps, name_to_sampling_rate + filepath, + corrected_name_to_timestamps, + name_to_timestamps, + name_to_sampling_rate, + name_to_correctionIndex, ) for k in range(storesList.shape[1]): From b73417063e15f8a1dafe9615bbc6abafcdcbcb23 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 12 Dec 2025 11:48:19 
-0800 Subject: [PATCH 070/150] Removed, now unused file path parameter. --- src/guppy/analysis/timestamp_correction.py | 2 +- src/guppy/preprocess.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/guppy/analysis/timestamp_correction.py b/src/guppy/analysis/timestamp_correction.py index 4e37efe..cd662bd 100644 --- a/src/guppy/analysis/timestamp_correction.py +++ b/src/guppy/analysis/timestamp_correction.py @@ -71,7 +71,7 @@ def write_corrected_timestamps( # function to correct timestamps after eliminating first few seconds of the data (for TDT data) def timestampCorrection_tdt( - filepath, timeForLightsTurnOn, storesList, name_to_timestamps, name_to_data, name_to_sampling_rate, name_to_npoints + timeForLightsTurnOn, storesList, name_to_timestamps, name_to_data, name_to_sampling_rate, name_to_npoints ): logger.debug( f"Correcting timestamps by getting rid of the first {timeForLightsTurnOn} seconds and convert timestamps to seconds" diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index db9d8d0..83659bf 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -282,7 +282,6 @@ def execute_timestamp_correction(folderNames, inputParameters): control_and_signal_dicts = read_control_and_signal(filepath, storesList) name_to_data, name_to_timestamps, name_to_sampling_rate, name_to_npoints = control_and_signal_dicts corrected_name_to_timestamps, name_to_correctionIndex = timestampCorrection_tdt( - filepath, timeForLightsTurnOn, storesList, name_to_timestamps, From 4402cbb20f02273c78020b2aa0d20f98236e1c9c Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 12 Dec 2025 12:10:39 -0800 Subject: [PATCH 071/150] Consolidated TDT and CSV timestamp correction functions into a single timestamp_correction function with a mode parameter. 
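Usage note (illustrative sketch only, not one of the applied hunks): the snippet below condenses the new loop body from the preprocess.py change in this patch. Any name not defined here (folderNames, i, filepath, storesList, timeForLightsTurnOn, check_TDT) is assumed to come from the surrounding execute_timestamp_correction code, exactly as in the existing function.

```python
# Sketch of the consolidated call site inside execute_timestamp_correction's loop.
from guppy.analysis.timestamp_correction import (
    read_control_and_signal,
    timestampCorrection,
    write_corrected_timestamps,
)

# Pick the acquisition format once; a single entry point now handles both branches.
mode = "tdt" if check_TDT(folderNames[i]) else "csv"

# Read the control/signal stores up front; the correction itself no longer touches disk.
name_to_data, name_to_timestamps, name_to_sampling_rate, name_to_npoints = read_control_and_signal(
    filepath, storesList
)

# mode="tdt" expands block timestamps; mode="csv" only crops the first timeForLightsTurnOn seconds.
corrected_name_to_timestamps, name_to_correctionIndex = timestampCorrection(
    timeForLightsTurnOn,
    storesList,
    name_to_timestamps,
    name_to_data,
    name_to_sampling_rate,
    name_to_npoints,
    mode=mode,
)

# Writes stay in the caller, keeping file I/O separate from the analysis step.
write_corrected_timestamps(
    filepath,
    corrected_name_to_timestamps,
    name_to_timestamps,
    name_to_sampling_rate,
    name_to_correctionIndex,
)
```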
--- src/guppy/analysis/timestamp_correction.py | 72 ++++++---------------- src/guppy/preprocess.py | 54 ++++++---------- 2 files changed, 40 insertions(+), 86 deletions(-) diff --git a/src/guppy/analysis/timestamp_correction.py b/src/guppy/analysis/timestamp_correction.py index cd662bd..df72800 100644 --- a/src/guppy/analysis/timestamp_correction.py +++ b/src/guppy/analysis/timestamp_correction.py @@ -14,45 +14,6 @@ logger = logging.getLogger(__name__) -# function to correct timestamps after eliminating first few seconds of the data (for csv data) -def timestampCorrection_csv(timeForLightsTurnOn, storesList, name_to_data, name_to_timestamps): - logger.debug( - f"Correcting timestamps by getting rid of the first {timeForLightsTurnOn} seconds and convert timestamps to seconds" - ) - name_to_timestamps = name_to_timestamps.copy() - name_to_correctionIndex = {} - storenames = storesList[0, :] - names_for_storenames = storesList[1, :] - arr = get_control_and_signal_channel_names(storesList) - - indices = check_cntrl_sig_length(arr, name_to_data) - - for i in range(arr.shape[1]): - name_1 = arr[0, i].split("_")[-1] - name_2 = arr[1, i].split("_")[-1] - if name_1 != name_2: - logger.error("Error in naming convention of files or Error in storesList file") - raise Exception("Error in naming convention of files or Error in storesList file") - - # dirname = os.path.dirname(path[i]) - idx = np.where(names_for_storenames == indices[i])[0] - - if idx.shape[0] == 0: - logger.error(f"{arr[0,i]} does not exist in the stores list file.") - raise Exception("{} does not exist in the stores list file.".format(arr[0, i])) - - name = names_for_storenames[idx][0] - timestamp = name_to_timestamps[name] - - correctionIndex = np.where(timestamp >= timeForLightsTurnOn)[0] - timestampNew = timestamp[correctionIndex] - name_to_timestamps[name] = timestampNew - name_to_correctionIndex[name] = correctionIndex - - logger.info("Timestamps corrected and converted to seconds.") - return name_to_timestamps, name_to_correctionIndex - - def write_corrected_timestamps( filepath, corrected_name_to_timestamps, name_to_timestamps, name_to_sampling_rate, name_to_correctionIndex ): @@ -69,13 +30,16 @@ def write_corrected_timestamps( write_hdf5(sampling_rate, "timeCorrection_" + name_1, filepath, "sampling_rate") -# function to correct timestamps after eliminating first few seconds of the data (for TDT data) -def timestampCorrection_tdt( - timeForLightsTurnOn, storesList, name_to_timestamps, name_to_data, name_to_sampling_rate, name_to_npoints +# function to correct timestamps after eliminating first few seconds of the data (for csv or TDT data depending on mode) +def timestampCorrection( + timeForLightsTurnOn, storesList, name_to_timestamps, name_to_data, name_to_sampling_rate, name_to_npoints, mode ): logger.debug( f"Correcting timestamps by getting rid of the first {timeForLightsTurnOn} seconds and convert timestamps to seconds" ) + if mode not in ["tdt", "csv"]: + logger.error("Mode should be either 'tdt' or 'csv'") + raise ValueError("Mode should be either 'tdt' or 'csv'") name_to_timestamps = name_to_timestamps.copy() name_to_correctionIndex = {} storenames = storesList[0, :] @@ -103,16 +67,20 @@ def timestampCorrection_tdt( sampling_rate = name_to_sampling_rate[name] npoints = name_to_npoints[name] - timeRecStart = timestamp[0] - timestamps = np.subtract(timestamp, timeRecStart) - adder = np.arange(npoints) / sampling_rate - lengthAdder = adder.shape[0] - timestampNew = np.zeros((len(timestamps), lengthAdder)) - for i in 
range(lengthAdder): - timestampNew[:, i] = np.add(timestamps, adder[i]) - timestampNew = (timestampNew.T).reshape(-1, order="F") - correctionIndex = np.where(timestampNew >= timeForLightsTurnOn)[0] - timestampNew = timestampNew[correctionIndex] + if mode == "tdt": + timeRecStart = timestamp[0] + timestamps = np.subtract(timestamp, timeRecStart) + adder = np.arange(npoints) / sampling_rate + lengthAdder = adder.shape[0] + timestampNew = np.zeros((len(timestamps), lengthAdder)) + for i in range(lengthAdder): + timestampNew[:, i] = np.add(timestamps, adder[i]) + timestampNew = (timestampNew.T).reshape(-1, order="F") + correctionIndex = np.where(timestampNew >= timeForLightsTurnOn)[0] + timestampNew = timestampNew[correctionIndex] + elif mode == "csv": + correctionIndex = np.where(timestamp >= timeForLightsTurnOn)[0] + timestampNew = timestamp[correctionIndex] name_to_timestamps[name] = timestampNew name_to_correctionIndex[name] = correctionIndex diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index 83659bf..19626dd 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -25,8 +25,7 @@ create_control_channel, decide_naming_convention_and_applyCorrection, read_control_and_signal, - timestampCorrection_csv, - timestampCorrection_tdt, + timestampCorrection, write_corrected_timestamps, ) from .analysis.z_score import compute_z_score @@ -267,7 +266,7 @@ def execute_timestamp_correction(folderNames, inputParameters): for i in range(len(folderNames)): filepath = folderNames[i] storesListPath = takeOnlyDirs(glob.glob(os.path.join(filepath, "*_output_*"))) - cond = check_TDT(folderNames[i]) + mode = "tdt" if check_TDT(folderNames[i]) else "csv" logger.debug(f"Timestamps corrections started for {filepath}") for j in range(len(storesListPath)): filepath = storesListPath[j] @@ -278,37 +277,24 @@ def execute_timestamp_correction(folderNames, inputParameters): if isosbestic_control == False: storesList = add_control_channel(filepath, storesList) - if cond == True: - control_and_signal_dicts = read_control_and_signal(filepath, storesList) - name_to_data, name_to_timestamps, name_to_sampling_rate, name_to_npoints = control_and_signal_dicts - corrected_name_to_timestamps, name_to_correctionIndex = timestampCorrection_tdt( - timeForLightsTurnOn, - storesList, - name_to_timestamps, - name_to_data, - name_to_sampling_rate, - name_to_npoints, - ) - write_corrected_timestamps( - filepath, - corrected_name_to_timestamps, - name_to_timestamps, - name_to_sampling_rate, - name_to_correctionIndex, - ) - else: - control_and_signal_dicts = read_control_and_signal(filepath, storesList) - name_to_data, name_to_timestamps, name_to_sampling_rate, _ = control_and_signal_dicts - corrected_name_to_timestamps, name_to_correctionIndex = timestampCorrection_csv( - timeForLightsTurnOn, storesList, name_to_data, name_to_timestamps - ) - write_corrected_timestamps( - filepath, - corrected_name_to_timestamps, - name_to_timestamps, - name_to_sampling_rate, - name_to_correctionIndex, - ) + control_and_signal_dicts = read_control_and_signal(filepath, storesList) + name_to_data, name_to_timestamps, name_to_sampling_rate, name_to_npoints = control_and_signal_dicts + corrected_name_to_timestamps, name_to_correctionIndex = timestampCorrection( + timeForLightsTurnOn, + storesList, + name_to_timestamps, + name_to_data, + name_to_sampling_rate, + name_to_npoints, + mode=mode, + ) + write_corrected_timestamps( + filepath, + corrected_name_to_timestamps, + name_to_timestamps, + name_to_sampling_rate, + 
name_to_correctionIndex, + ) for k in range(storesList.shape[1]): decide_naming_convention_and_applyCorrection( From ca735ce723a870e972308131f7cb1cd020a6ab61 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 12 Dec 2025 12:21:50 -0800 Subject: [PATCH 072/150] Cleaned up some inefficient code --- src/guppy/analysis/timestamp_correction.py | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/src/guppy/analysis/timestamp_correction.py b/src/guppy/analysis/timestamp_correction.py index df72800..efa4c52 100644 --- a/src/guppy/analysis/timestamp_correction.py +++ b/src/guppy/analysis/timestamp_correction.py @@ -94,25 +94,16 @@ def timestampCorrection( def decide_naming_convention_and_applyCorrection(filepath, timeForLightsTurnOn, event, displayName, storesList): logger.debug("Applying correction of timestamps to the data and event timestamps") - storesList = storesList[1, :] - - arr = [] - for i in range(storesList.shape[0]): - if "control" in storesList[i].lower() or "signal" in storesList[i].lower(): - arr.append(storesList[i]) - - arr = sorted(arr, key=str.casefold) - arr = np.asarray(arr).reshape(2, -1) + arr = get_control_and_signal_channel_names(storesList) for i in range(arr.shape[1]): name_1 = arr[0, i].split("_")[-1] name_2 = arr[1, i].split("_")[-1] - # dirname = os.path.dirname(path[i]) - if name_1 == name_2: - applyCorrection(filepath, timeForLightsTurnOn, event, displayName, name_1) - else: + if name_1 != name_2: logger.error("Error in naming convention of files or Error in storesList file") raise Exception("Error in naming convention of files or Error in storesList file") + else: + applyCorrection(filepath, timeForLightsTurnOn, event, displayName, name_1) logger.info("Timestamps corrections applied to the data and event timestamps.") @@ -153,10 +144,6 @@ def applyCorrection(filepath, timeForLightsTurnOn, event, displayName, naming): arr = np.subtract(arr, timeForLightsTurnOn) write_hdf5(arr, displayName + "_" + naming, filepath, "ts") - # if isosbestic_control==False and 'control' in displayName.lower(): - # control = create_control_channel(filepath, displayName) - # write_hdf5(control, displayName, filepath, 'data') - # main function to create control channel using # signal channel and save it to a file From 262681bcab890d51f73d65856ade3533a6b97842 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 12 Dec 2025 13:12:16 -0800 Subject: [PATCH 073/150] Pulled read operations out of the applyCorrection functions. 
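
Now that the read side lives in the caller, preprocess.py builds small lookup
dicts keyed by the trailing channel suffix before handing plain arrays to
applyCorrection. A minimal, self-contained sketch of that keying (the store
names and values below are made up; the dict comprehension itself matches the
ones added to preprocess.py in this patch):

    import numpy as np

    # hypothetical control/signal pair for one fiber, keyed by display name
    name_to_timestamps = {
        "control_dLight": np.array([0.0, 0.1, 0.2, 0.3]),
        "signal_dLight": np.array([0.0, 0.1, 0.2, 0.3]),
    }

    # key by the trailing suffix ("dLight"), the same suffix used for the
    # timeCorrection_<suffix> entries written elsewhere in this module
    name_1_to_timestamps = {
        name.split("_")[-1]: ts for name, ts in name_to_timestamps.items()
    }

    assert list(name_1_to_timestamps) == ["dLight"]

Both members of a pair collapse onto the same suffix key, which is why a
single correction index per pair can be applied to control and signal alike.
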
--- src/guppy/analysis/timestamp_correction.py | 84 +++++++++++++++++----- src/guppy/preprocess.py | 26 ++++++- 2 files changed, 92 insertions(+), 18 deletions(-) diff --git a/src/guppy/analysis/timestamp_correction.py b/src/guppy/analysis/timestamp_correction.py index efa4c52..2da2020 100644 --- a/src/guppy/analysis/timestamp_correction.py +++ b/src/guppy/analysis/timestamp_correction.py @@ -91,7 +91,19 @@ def timestampCorrection( # function to check if naming convention was followed while saving storeslist file # and apply timestamps correction using the function applyCorrection -def decide_naming_convention_and_applyCorrection(filepath, timeForLightsTurnOn, event, displayName, storesList): +def decide_naming_convention_and_applyCorrection( + filepath, + timeForLightsTurnOn, + event, + displayName, + storesList, + name_1_to_corrected_timestamps, + name_1_to_timestamps, + name_1_to_sampling_rate, + name_1_to_correctionIndex, + data, + ttl_timestamps, +): logger.debug("Applying correction of timestamps to the data and event timestamps") arr = get_control_and_signal_channel_names(storesList) @@ -103,36 +115,61 @@ def decide_naming_convention_and_applyCorrection(filepath, timeForLightsTurnOn, logger.error("Error in naming convention of files or Error in storesList file") raise Exception("Error in naming convention of files or Error in storesList file") else: - applyCorrection(filepath, timeForLightsTurnOn, event, displayName, name_1) + corrected_timestamps = name_1_to_corrected_timestamps[name_1] + timestamps = name_1_to_timestamps[name_1] + timeRecStart = timestamps[0] + sampling_rate = name_1_to_sampling_rate[name_1] + correctionIndex = name_1_to_correctionIndex[name_1] + applyCorrection( + filepath, + timeForLightsTurnOn, + event, + displayName, + name_1, + corrected_timestamps, + sampling_rate, + correctionIndex, + timeRecStart, + data, + ttl_timestamps, + ) logger.info("Timestamps corrections applied to the data and event timestamps.") # function to apply correction to control, signal and event timestamps -def applyCorrection(filepath, timeForLightsTurnOn, event, displayName, naming): +def applyCorrection( + filepath, + timeForLightsTurnOn, + event, + displayName, + naming, + corrected_timestamps, + sampling_rate, + correctionIndex, + timeRecStart, + data, + ttl_timestamps, +): cond = check_TDT(os.path.dirname(filepath)) - if cond == True: - timeRecStart = read_hdf5("timeCorrection_" + naming, filepath, "timeRecStart")[0] - - timestampNew = read_hdf5("timeCorrection_" + naming, filepath, "timestampNew") - correctionIndex = read_hdf5("timeCorrection_" + naming, filepath, "correctionIndex") - + timestampNew = corrected_timestamps if "control" in displayName.lower() or "signal" in displayName.lower(): - split_name = displayName.split("_")[-1] - if split_name == naming: - pass - else: - correctionIndex = read_hdf5("timeCorrection_" + split_name, filepath, "correctionIndex") - arr = read_hdf5(event, filepath, "data") + # TODO: double-check that this code is not reachable + # split_name = displayName.split("_")[-1] + # if split_name == naming: + # pass + # else: + # correctionIndex = read_hdf5("timeCorrection_" + split_name, filepath, "correctionIndex") + arr = data if (arr == 0).all() == True: arr = arr else: arr = arr[correctionIndex] write_hdf5(arr, displayName, filepath, "data") else: - arr = read_hdf5(event, filepath, "timestamps") + arr = ttl_timestamps if cond == True: res = (arr >= timeRecStart).all() if res == True: @@ -251,3 +288,18 @@ def read_control_and_signal(filepath, 
storesList): name_to_npoints[signal_name] = signal_npoints return name_to_data, name_to_timestamps, name_to_sampling_rate, name_to_npoints + + +def read_ttl(filepath, storesList): + channels_arr = get_control_and_signal_channel_names(storesList) + storenames = storesList[0, :] + names_for_storenames = storesList[1, :] + + name_to_timestamps = {} + for storename, name in zip(storenames, names_for_storenames): + if storename in channels_arr: + continue + timestamps = read_hdf5(storename, filepath, "timestamps") + name_to_timestamps[name] = timestamps + + return name_to_timestamps diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index 19626dd..1715cfc 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -25,6 +25,7 @@ create_control_channel, decide_naming_convention_and_applyCorrection, read_control_and_signal, + read_ttl, timestampCorrection, write_corrected_timestamps, ) @@ -295,10 +296,31 @@ def execute_timestamp_correction(folderNames, inputParameters): name_to_sampling_rate, name_to_correctionIndex, ) - + name_1_to_corrected_timestamps = { + name.split("_")[-1]: ts for name, ts in corrected_name_to_timestamps.items() + } + name_1_to_timestamps = {name.split("_")[-1]: ts for name, ts in name_to_timestamps.items()} + name_1_to_sampling_rate = {name.split("_")[-1]: sr for name, sr in name_to_sampling_rate.items()} + name_1_to_correctionIndex = {name.split("_")[-1]: idx for name, idx in name_to_correctionIndex.items()} + + name_to_timestamps_ttl = read_ttl(filepath, storesList) for k in range(storesList.shape[1]): + data = name_to_data[storesList[1, k]] if storesList[1, k] in name_to_data else None + ttl_timestamps = ( + name_to_timestamps_ttl[storesList[1, k]] if storesList[1, k] in name_to_timestamps_ttl else None + ) decide_naming_convention_and_applyCorrection( - filepath, timeForLightsTurnOn, storesList[0, k], storesList[1, k], storesList + filepath, + timeForLightsTurnOn, + storesList[0, k], + storesList[1, k], + storesList, + name_1_to_corrected_timestamps, + name_1_to_timestamps, + name_1_to_sampling_rate, + name_1_to_correctionIndex, + data, + ttl_timestamps, ) # check if isosbestic control is false and also if new control channel is added From b6173dd889e892f65f7e2c2f096dd10c88acee17 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 12 Dec 2025 16:41:46 -0800 Subject: [PATCH 074/150] split up applyCorrection by ttl vs signal_and_control --- src/guppy/analysis/timestamp_correction.py | 112 ++++++++++++++++++++- src/guppy/preprocess.py | 70 ++++++++----- 2 files changed, 154 insertions(+), 28 deletions(-) diff --git a/src/guppy/analysis/timestamp_correction.py b/src/guppy/analysis/timestamp_correction.py index 2da2020..a1088c9 100644 --- a/src/guppy/analysis/timestamp_correction.py +++ b/src/guppy/analysis/timestamp_correction.py @@ -137,6 +137,116 @@ def decide_naming_convention_and_applyCorrection( logger.info("Timestamps corrections applied to the data and event timestamps.") +def decide_naming_and_applyCorrection_signal_and_control( + filepath, + storesList, + name_to_correctionIndex, + name_to_data, +): + logger.debug("Applying correction of timestamps to the data and event timestamps") + storenames = storesList[0, :] + names_for_storenames = storesList[1, :] + arr = get_control_and_signal_channel_names(storesList) + indices = check_cntrl_sig_length(arr, name_to_data) + + for i in range(arr.shape[1]): + name_1 = arr[0, i].split("_")[-1] + name_2 = arr[1, i].split("_")[-1] + if name_1 != name_2: + logger.error("Error in naming convention of 
files or Error in storesList file") + raise Exception("Error in naming convention of files or Error in storesList file") + + idx = np.where(names_for_storenames == indices[i])[0] + if idx.shape[0] == 0: + logger.error(f"{arr[0,i]} does not exist in the stores list file.") + raise Exception("{} does not exist in the stores list file.".format(arr[0, i])) + + name = names_for_storenames[idx][0] + correctionIndex = name_to_correctionIndex[name] + control_name = arr[0, i] + signal_name = arr[1, i] + control_data = name_to_data[control_name] + signal_data = name_to_data[signal_name] + applyCorrection_signal_and_control(filepath, control_name, correctionIndex, control_data) + applyCorrection_signal_and_control(filepath, signal_name, correctionIndex, signal_data) + + logger.info("Timestamps corrections applied to the data and event timestamps.") + + +def applyCorrection_signal_and_control(filepath, displayName, correctionIndex, data): + arr = data + if (arr == 0).all() == True: + arr = arr + else: + arr = arr[correctionIndex] + write_hdf5(arr, displayName, filepath, "data") + + +def decide_naming_and_applyCorrection_ttl( + filepath, + timeForLightsTurnOn, + storesList, + name_to_timestamps_ttl, + name_to_timestamps, + name_to_data, +): + logger.debug("Applying correction of timestamps to the data and event timestamps") + storenames = storesList[0, :] + names_for_storenames = storesList[1, :] + arr = get_control_and_signal_channel_names(storesList) + indices = check_cntrl_sig_length(arr, name_to_data) + + for ttl_name, ttl_timestamps in name_to_timestamps_ttl.items(): + displayName = ttl_name + for i in range(arr.shape[1]): + name_1 = arr[0, i].split("_")[-1] + name_2 = arr[1, i].split("_")[-1] + if name_1 != name_2: + logger.error("Error in naming convention of files or Error in storesList file") + raise Exception("Error in naming convention of files or Error in storesList file") + + idx = np.where(names_for_storenames == indices[i])[0] + if idx.shape[0] == 0: + logger.error(f"{arr[0,i]} does not exist in the stores list file.") + raise Exception("{} does not exist in the stores list file.".format(arr[0, i])) + + name = names_for_storenames[idx][0] + timestamps = name_to_timestamps[name] + timeRecStart = timestamps[0] + applyCorrection_ttl( + filepath, + timeForLightsTurnOn, + displayName, + name_1, + timeRecStart, + ttl_timestamps, + ) + + logger.info("Timestamps corrections applied to the data and event timestamps.") + + +def applyCorrection_ttl( + filepath, + timeForLightsTurnOn, + displayName, + naming, + timeRecStart, + ttl_timestamps, +): + cond = check_TDT(os.path.dirname(filepath)) + arr = ttl_timestamps + if cond == True: + res = (arr >= timeRecStart).all() + if res == True: + arr = np.subtract(arr, timeRecStart) + arr = np.subtract(arr, timeForLightsTurnOn) + else: + arr = np.subtract(arr, timeForLightsTurnOn) + else: + arr = np.subtract(arr, timeForLightsTurnOn) + write_hdf5(arr, displayName + "_" + naming, filepath, "ts") + + # function to apply correction to control, signal and event timestamps def applyCorrection( filepath, @@ -297,7 +407,7 @@ def read_ttl(filepath, storesList): name_to_timestamps = {} for storename, name in zip(storenames, names_for_storenames): - if storename in channels_arr: + if name in channels_arr: continue timestamps = read_hdf5(storename, filepath, "timestamps") name_to_timestamps[name] = timestamps diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index 1715cfc..acea813 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -23,7 
+23,8 @@ ) from .analysis.timestamp_correction import ( create_control_channel, - decide_naming_convention_and_applyCorrection, + decide_naming_and_applyCorrection_signal_and_control, + decide_naming_and_applyCorrection_ttl, read_control_and_signal, read_ttl, timestampCorrection, @@ -280,7 +281,7 @@ def execute_timestamp_correction(folderNames, inputParameters): control_and_signal_dicts = read_control_and_signal(filepath, storesList) name_to_data, name_to_timestamps, name_to_sampling_rate, name_to_npoints = control_and_signal_dicts - corrected_name_to_timestamps, name_to_correctionIndex = timestampCorrection( + name_to_corrected_timestamps, name_to_correctionIndex = timestampCorrection( timeForLightsTurnOn, storesList, name_to_timestamps, @@ -291,37 +292,52 @@ def execute_timestamp_correction(folderNames, inputParameters): ) write_corrected_timestamps( filepath, - corrected_name_to_timestamps, + name_to_corrected_timestamps, name_to_timestamps, name_to_sampling_rate, name_to_correctionIndex, ) - name_1_to_corrected_timestamps = { - name.split("_")[-1]: ts for name, ts in corrected_name_to_timestamps.items() - } - name_1_to_timestamps = {name.split("_")[-1]: ts for name, ts in name_to_timestamps.items()} - name_1_to_sampling_rate = {name.split("_")[-1]: sr for name, sr in name_to_sampling_rate.items()} - name_1_to_correctionIndex = {name.split("_")[-1]: idx for name, idx in name_to_correctionIndex.items()} name_to_timestamps_ttl = read_ttl(filepath, storesList) - for k in range(storesList.shape[1]): - data = name_to_data[storesList[1, k]] if storesList[1, k] in name_to_data else None - ttl_timestamps = ( - name_to_timestamps_ttl[storesList[1, k]] if storesList[1, k] in name_to_timestamps_ttl else None - ) - decide_naming_convention_and_applyCorrection( - filepath, - timeForLightsTurnOn, - storesList[0, k], - storesList[1, k], - storesList, - name_1_to_corrected_timestamps, - name_1_to_timestamps, - name_1_to_sampling_rate, - name_1_to_correctionIndex, - data, - ttl_timestamps, - ) + decide_naming_and_applyCorrection_signal_and_control( + filepath, + storesList, + name_to_correctionIndex, + name_to_data, + ) + decide_naming_and_applyCorrection_ttl( + filepath, + timeForLightsTurnOn, + storesList, + name_to_timestamps_ttl, + name_to_timestamps, + name_to_data, + ) + + # name_1_to_corrected_timestamps = { + # name.split("_")[-1]: ts for name, ts in name_to_corrected_timestamps.items() + # } + # name_1_to_timestamps = {name.split("_")[-1]: ts for name, ts in name_to_timestamps.items()} + # name_1_to_sampling_rate = {name.split("_")[-1]: sr for name, sr in name_to_sampling_rate.items()} + # name_1_to_correctionIndex = {name.split("_")[-1]: idx for name, idx in name_to_correctionIndex.items()} + # for k in range(storesList.shape[1]): # TODO: Refactor nested loops for clarity + # data = name_to_data[storesList[1, k]] if storesList[1, k] in name_to_data else None + # ttl_timestamps = ( + # name_to_timestamps_ttl[storesList[1, k]] if storesList[1, k] in name_to_timestamps_ttl else None + # ) + # decide_naming_convention_and_applyCorrection( + # filepath, + # timeForLightsTurnOn, + # storesList[0, k], + # storesList[1, k], + # storesList, + # name_1_to_corrected_timestamps, + # name_1_to_timestamps, + # name_1_to_sampling_rate, + # name_1_to_correctionIndex, + # data, + # ttl_timestamps, + # ) # check if isosbestic control is false and also if new control channel is added if isosbestic_control == False: From 4bfc1a7c41ca9ab792b4484f1fa68b5f06b8b23e Mon Sep 17 00:00:00 2001 From: pauladkisson Date: 
Fri, 12 Dec 2025 16:42:47 -0800 Subject: [PATCH 075/150] Removed commented section. --- src/guppy/preprocess.py | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index acea813..543f565 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -314,31 +314,6 @@ def execute_timestamp_correction(folderNames, inputParameters): name_to_data, ) - # name_1_to_corrected_timestamps = { - # name.split("_")[-1]: ts for name, ts in name_to_corrected_timestamps.items() - # } - # name_1_to_timestamps = {name.split("_")[-1]: ts for name, ts in name_to_timestamps.items()} - # name_1_to_sampling_rate = {name.split("_")[-1]: sr for name, sr in name_to_sampling_rate.items()} - # name_1_to_correctionIndex = {name.split("_")[-1]: idx for name, idx in name_to_correctionIndex.items()} - # for k in range(storesList.shape[1]): # TODO: Refactor nested loops for clarity - # data = name_to_data[storesList[1, k]] if storesList[1, k] in name_to_data else None - # ttl_timestamps = ( - # name_to_timestamps_ttl[storesList[1, k]] if storesList[1, k] in name_to_timestamps_ttl else None - # ) - # decide_naming_convention_and_applyCorrection( - # filepath, - # timeForLightsTurnOn, - # storesList[0, k], - # storesList[1, k], - # storesList, - # name_1_to_corrected_timestamps, - # name_1_to_timestamps, - # name_1_to_sampling_rate, - # name_1_to_correctionIndex, - # data, - # ttl_timestamps, - # ) - # check if isosbestic control is false and also if new control channel is added if isosbestic_control == False: create_control_channel(filepath, storesList, window=101) From b01a58f525f20a9a0f29c06b01e30c4672fa3f57 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 12 Dec 2025 16:57:31 -0800 Subject: [PATCH 076/150] Refactored applyCorrection inside timestampCorrection for signal and control --- src/guppy/analysis/timestamp_correction.py | 25 +++++++++++++++++++++- src/guppy/preprocess.py | 8 +------ 2 files changed, 25 insertions(+), 8 deletions(-) diff --git a/src/guppy/analysis/timestamp_correction.py b/src/guppy/analysis/timestamp_correction.py index a1088c9..3d5c73c 100644 --- a/src/guppy/analysis/timestamp_correction.py +++ b/src/guppy/analysis/timestamp_correction.py @@ -32,7 +32,14 @@ def write_corrected_timestamps( # function to correct timestamps after eliminating first few seconds of the data (for csv or TDT data depending on mode) def timestampCorrection( - timeForLightsTurnOn, storesList, name_to_timestamps, name_to_data, name_to_sampling_rate, name_to_npoints, mode + filepath, + timeForLightsTurnOn, + storesList, + name_to_timestamps, + name_to_data, + name_to_sampling_rate, + name_to_npoints, + mode, ): logger.debug( f"Correcting timestamps by getting rid of the first {timeForLightsTurnOn} seconds and convert timestamps to seconds" @@ -49,6 +56,8 @@ def timestampCorrection( indices = check_cntrl_sig_length(arr, name_to_data) for i in range(arr.shape[1]): + control_name = arr[0, i] + signal_name = arr[1, i] name_1 = arr[0, i].split("_")[-1] name_2 = arr[1, i].split("_")[-1] if name_1 != name_2: @@ -85,6 +94,20 @@ def timestampCorrection( name_to_timestamps[name] = timestampNew name_to_correctionIndex[name] = correctionIndex + arr = name_to_data[control_name] + if (arr == 0).all() == True: + arr = arr + else: + arr = arr[correctionIndex] + write_hdf5(arr, control_name, filepath, "data") + + arr = name_to_data[signal_name] + if (arr == 0).all() == True: + arr = arr + else: + arr = arr[correctionIndex] + write_hdf5(arr, signal_name, 
filepath, "data") + logger.info("Timestamps corrected and converted to seconds.") return name_to_timestamps, name_to_correctionIndex diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index 543f565..df07c21 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -23,7 +23,6 @@ ) from .analysis.timestamp_correction import ( create_control_channel, - decide_naming_and_applyCorrection_signal_and_control, decide_naming_and_applyCorrection_ttl, read_control_and_signal, read_ttl, @@ -282,6 +281,7 @@ def execute_timestamp_correction(folderNames, inputParameters): control_and_signal_dicts = read_control_and_signal(filepath, storesList) name_to_data, name_to_timestamps, name_to_sampling_rate, name_to_npoints = control_and_signal_dicts name_to_corrected_timestamps, name_to_correctionIndex = timestampCorrection( + filepath, timeForLightsTurnOn, storesList, name_to_timestamps, @@ -299,12 +299,6 @@ def execute_timestamp_correction(folderNames, inputParameters): ) name_to_timestamps_ttl = read_ttl(filepath, storesList) - decide_naming_and_applyCorrection_signal_and_control( - filepath, - storesList, - name_to_correctionIndex, - name_to_data, - ) decide_naming_and_applyCorrection_ttl( filepath, timeForLightsTurnOn, From 62cb84f921fbb26c6a7b78e76ca037d68a12bb18 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 12 Dec 2025 17:19:28 -0800 Subject: [PATCH 077/150] Pulled write operations back out of timestamp_correction. --- src/guppy/analysis/timestamp_correction.py | 53 ++++++++++------------ src/guppy/preprocess.py | 5 +- 2 files changed, 28 insertions(+), 30 deletions(-) diff --git a/src/guppy/analysis/timestamp_correction.py b/src/guppy/analysis/timestamp_correction.py index 3d5c73c..e8144f3 100644 --- a/src/guppy/analysis/timestamp_correction.py +++ b/src/guppy/analysis/timestamp_correction.py @@ -30,9 +30,13 @@ def write_corrected_timestamps( write_hdf5(sampling_rate, "timeCorrection_" + name_1, filepath, "sampling_rate") +def write_corrected_data(filepath, name_to_corrected_data): + for name, data in name_to_corrected_data.items(): + write_hdf5(data, name, filepath, "data") + + # function to correct timestamps after eliminating first few seconds of the data (for csv or TDT data depending on mode) def timestampCorrection( - filepath, timeForLightsTurnOn, storesList, name_to_timestamps, @@ -47,19 +51,20 @@ def timestampCorrection( if mode not in ["tdt", "csv"]: logger.error("Mode should be either 'tdt' or 'csv'") raise ValueError("Mode should be either 'tdt' or 'csv'") - name_to_timestamps = name_to_timestamps.copy() + name_to_corrected_timestamps = {} name_to_correctionIndex = {} + name_to_corrected_data = {} storenames = storesList[0, :] names_for_storenames = storesList[1, :] - arr = get_control_and_signal_channel_names(storesList) + data = get_control_and_signal_channel_names(storesList) - indices = check_cntrl_sig_length(arr, name_to_data) + indices = check_cntrl_sig_length(data, name_to_data) - for i in range(arr.shape[1]): - control_name = arr[0, i] - signal_name = arr[1, i] - name_1 = arr[0, i].split("_")[-1] - name_2 = arr[1, i].split("_")[-1] + for i in range(data.shape[1]): + control_name = data[0, i] + signal_name = data[1, i] + name_1 = data[0, i].split("_")[-1] + name_2 = data[1, i].split("_")[-1] if name_1 != name_2: logger.error("Error in naming convention of files or Error in storesList file") raise Exception("Error in naming convention of files or Error in storesList file") @@ -68,8 +73,8 @@ def timestampCorrection( idx = np.where(names_for_storenames == 
indices[i])[0] if idx.shape[0] == 0: - logger.error(f"{arr[0,i]} does not exist in the stores list file.") - raise Exception("{} does not exist in the stores list file.".format(arr[0, i])) + logger.error(f"{data[0,i]} does not exist in the stores list file.") + raise Exception("{} does not exist in the stores list file.".format(data[0, i])) name = names_for_storenames[idx][0] timestamp = name_to_timestamps[name] @@ -91,25 +96,17 @@ def timestampCorrection( correctionIndex = np.where(timestamp >= timeForLightsTurnOn)[0] timestampNew = timestamp[correctionIndex] - name_to_timestamps[name] = timestampNew - name_to_correctionIndex[name] = correctionIndex - - arr = name_to_data[control_name] - if (arr == 0).all() == True: - arr = arr - else: - arr = arr[correctionIndex] - write_hdf5(arr, control_name, filepath, "data") - - arr = name_to_data[signal_name] - if (arr == 0).all() == True: - arr = arr - else: - arr = arr[correctionIndex] - write_hdf5(arr, signal_name, filepath, "data") + for displayName in [control_name, signal_name]: + name_to_corrected_timestamps[displayName] = timestampNew + name_to_correctionIndex[displayName] = correctionIndex + data = name_to_data[displayName] + if (data == 0).all() == True: + name_to_corrected_data[displayName] = data + else: + name_to_corrected_data[displayName] = data[correctionIndex] logger.info("Timestamps corrected and converted to seconds.") - return name_to_timestamps, name_to_correctionIndex + return name_to_corrected_timestamps, name_to_correctionIndex, name_to_corrected_data # function to check if naming convention was followed while saving storeslist file diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index df07c21..4653ce3 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -27,6 +27,7 @@ read_control_and_signal, read_ttl, timestampCorrection, + write_corrected_data, write_corrected_timestamps, ) from .analysis.z_score import compute_z_score @@ -280,8 +281,7 @@ def execute_timestamp_correction(folderNames, inputParameters): control_and_signal_dicts = read_control_and_signal(filepath, storesList) name_to_data, name_to_timestamps, name_to_sampling_rate, name_to_npoints = control_and_signal_dicts - name_to_corrected_timestamps, name_to_correctionIndex = timestampCorrection( - filepath, + name_to_corrected_timestamps, name_to_correctionIndex, name_to_corrected_data = timestampCorrection( timeForLightsTurnOn, storesList, name_to_timestamps, @@ -297,6 +297,7 @@ def execute_timestamp_correction(folderNames, inputParameters): name_to_sampling_rate, name_to_correctionIndex, ) + write_corrected_data(filepath, name_to_corrected_data) name_to_timestamps_ttl = read_ttl(filepath, storesList) decide_naming_and_applyCorrection_ttl( From 36ba6b848362e827a489d9700e6ae41d29f6f974 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 15 Dec 2025 09:13:33 -0800 Subject: [PATCH 078/150] Pulled write operations out of applyCorrection_ttl. 
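
applyCorrection_ttl now returns the corrected event times instead of writing
them; the caller persists the result under <event>_<suffix> via
write_corrected_ttl_timestamps. The arithmetic is unchanged; a standalone
sketch of it with made-up numbers:

    import numpy as np

    timeForLightsTurnOn = 1.0              # seconds trimmed from the recording start
    timeRecStart = 100.0                   # first control/signal timestamp
    ttl = np.array([105.0, 110.0, 120.0])  # hypothetical TTL event times

    # tdt mode: if the events look like absolute clock times, re-reference
    # them to the recording start before trimming; csv mode only trims the
    # lead-in period
    if (ttl >= timeRecStart).all():
        corrected = ttl - timeRecStart - timeForLightsTurnOn
    else:
        corrected = ttl - timeForLightsTurnOn

    print(corrected)  # [ 4.  9. 19.]
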
--- src/guppy/analysis/timestamp_correction.py | 46 +++++++++++++--------- src/guppy/preprocess.py | 7 ++-- 2 files changed, 31 insertions(+), 22 deletions(-) diff --git a/src/guppy/analysis/timestamp_correction.py b/src/guppy/analysis/timestamp_correction.py index e8144f3..d9d873f 100644 --- a/src/guppy/analysis/timestamp_correction.py +++ b/src/guppy/analysis/timestamp_correction.py @@ -203,12 +203,12 @@ def applyCorrection_signal_and_control(filepath, displayName, correctionIndex, d def decide_naming_and_applyCorrection_ttl( - filepath, timeForLightsTurnOn, storesList, name_to_timestamps_ttl, name_to_timestamps, name_to_data, + mode, ): logger.debug("Applying correction of timestamps to the data and event timestamps") storenames = storesList[0, :] @@ -216,8 +216,8 @@ def decide_naming_and_applyCorrection_ttl( arr = get_control_and_signal_channel_names(storesList) indices = check_cntrl_sig_length(arr, name_to_data) + compound_name_to_corrected_ttl_timestamps = {} for ttl_name, ttl_timestamps in name_to_timestamps_ttl.items(): - displayName = ttl_name for i in range(arr.shape[1]): name_1 = arr[0, i].split("_")[-1] name_2 = arr[1, i].split("_")[-1] @@ -233,38 +233,46 @@ def decide_naming_and_applyCorrection_ttl( name = names_for_storenames[idx][0] timestamps = name_to_timestamps[name] timeRecStart = timestamps[0] - applyCorrection_ttl( - filepath, + corrected_ttl_timestamps = applyCorrection_ttl( timeForLightsTurnOn, - displayName, - name_1, timeRecStart, ttl_timestamps, + mode, ) + compound_name = ttl_name + "_" + name_1 + compound_name_to_corrected_ttl_timestamps[compound_name] = corrected_ttl_timestamps logger.info("Timestamps corrections applied to the data and event timestamps.") + return compound_name_to_corrected_ttl_timestamps def applyCorrection_ttl( - filepath, timeForLightsTurnOn, - displayName, - naming, timeRecStart, ttl_timestamps, + mode, ): - cond = check_TDT(os.path.dirname(filepath)) - arr = ttl_timestamps - if cond == True: - res = (arr >= timeRecStart).all() + corrected_ttl_timestamps = ttl_timestamps + if mode == "tdt": + res = (corrected_ttl_timestamps >= timeRecStart).all() if res == True: - arr = np.subtract(arr, timeRecStart) - arr = np.subtract(arr, timeForLightsTurnOn) + corrected_ttl_timestamps = np.subtract(corrected_ttl_timestamps, timeRecStart) + corrected_ttl_timestamps = np.subtract(corrected_ttl_timestamps, timeForLightsTurnOn) else: - arr = np.subtract(arr, timeForLightsTurnOn) - else: - arr = np.subtract(arr, timeForLightsTurnOn) - write_hdf5(arr, displayName + "_" + naming, filepath, "ts") + corrected_ttl_timestamps = np.subtract(corrected_ttl_timestamps, timeForLightsTurnOn) + elif mode == "csv": + corrected_ttl_timestamps = np.subtract(corrected_ttl_timestamps, timeForLightsTurnOn) + return corrected_ttl_timestamps + + +def write_corrected_ttl_timestamps( + filepath, + compound_name_to_corrected_ttl_timestamps, +): + logger.debug("Applying correction of timestamps to the data and event timestamps") + for compound_name, corrected_ttl_timestamps in compound_name_to_corrected_ttl_timestamps.items(): + write_hdf5(corrected_ttl_timestamps, compound_name, filepath, "ts") + logger.info("Timestamps corrections applied to the data and event timestamps.") # function to apply correction to control, signal and event timestamps diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index 4653ce3..127e929 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -29,6 +29,7 @@ timestampCorrection, write_corrected_data, 
write_corrected_timestamps, + write_corrected_ttl_timestamps, ) from .analysis.z_score import compute_z_score @@ -300,15 +301,15 @@ def execute_timestamp_correction(folderNames, inputParameters): write_corrected_data(filepath, name_to_corrected_data) name_to_timestamps_ttl = read_ttl(filepath, storesList) - decide_naming_and_applyCorrection_ttl( - filepath, + compound_name_to_corrected_ttl_timestamps = decide_naming_and_applyCorrection_ttl( timeForLightsTurnOn, storesList, name_to_timestamps_ttl, name_to_timestamps, name_to_data, + mode=mode, ) - + write_corrected_ttl_timestamps(filepath, compound_name_to_corrected_ttl_timestamps) # check if isosbestic control is false and also if new control channel is added if isosbestic_control == False: create_control_channel(filepath, storesList, window=101) From 05d855ec34dd29adde0d21c1f0685571000adf74 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 15 Dec 2025 09:34:51 -0800 Subject: [PATCH 079/150] Move add_control_channel and create_control_channel to the control_channel module --- src/guppy/analysis/control_channel.py | 81 ++++++++++++++++++++++ src/guppy/analysis/timestamp_correction.py | 28 -------- src/guppy/preprocess.py | 51 +------------- 3 files changed, 82 insertions(+), 78 deletions(-) diff --git a/src/guppy/analysis/control_channel.py b/src/guppy/analysis/control_channel.py index 2da82e2..d9f6ad8 100644 --- a/src/guppy/analysis/control_channel.py +++ b/src/guppy/analysis/control_channel.py @@ -1,12 +1,93 @@ import logging +import os import numpy as np +import pandas as pd from scipy import signal as ss from scipy.optimize import curve_fit +from .io_utils import ( + read_hdf5, + write_hdf5, +) + logger = logging.getLogger(__name__) +# This function just creates placeholder Control-HDF5 files that are then immediately overwritten later on in the pipeline. +# TODO: Refactor this function to avoid unnecessary file creation. 
+# function to add control channel when there is no +# isosbestic control channel and update the storeslist file +def add_control_channel(filepath, arr): + + storenames = arr[0, :] + storesList = np.char.lower(arr[1, :]) + + keep_control = np.array([]) + # check a case if there is isosbestic control channel present + for i in range(storesList.shape[0]): + if "control" in storesList[i].lower(): + name = storesList[i].split("_")[-1] + new_str = "signal_" + str(name).lower() + find_signal = [True for i in storesList if i == new_str] + if len(find_signal) > 1: + logger.error("Error in naming convention of files or Error in storesList file") + raise Exception("Error in naming convention of files or Error in storesList file") + if len(find_signal) == 0: + logger.error( + "Isosbectic control channel parameter is set to False and still \ + storeslist file shows there is control channel present" + ) + raise Exception( + "Isosbectic control channel parameter is set to False and still \ + storeslist file shows there is control channel present" + ) + else: + continue + + for i in range(storesList.shape[0]): + if "signal" in storesList[i].lower(): + name = storesList[i].split("_")[-1] + new_str = "control_" + str(name).lower() + find_signal = [True for i in storesList if i == new_str] + if len(find_signal) == 0: + src, dst = os.path.join(filepath, arr[0, i] + ".hdf5"), os.path.join( + filepath, "cntrl" + str(i) + ".hdf5" + ) + shutil.copyfile(src, dst) + arr = np.concatenate((arr, [["cntrl" + str(i)], ["control_" + str(arr[1, i].split("_")[-1])]]), axis=1) + + np.savetxt(os.path.join(filepath, "storesList.csv"), arr, delimiter=",", fmt="%s") + + return arr + + +# main function to create control channel using +# signal channel and save it to a file +def create_control_channel(filepath, arr, window=5001): + + storenames = arr[0, :] + storesList = arr[1, :] + + for i in range(storesList.shape[0]): + event_name, event = storesList[i], storenames[i] + if "control" in event_name.lower() and "cntrl" in event.lower(): + logger.debug("Creating control channel from signal channel using curve-fitting") + name = event_name.split("_")[-1] + signal = read_hdf5("signal_" + name, filepath, "data") + timestampNew = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") + sampling_rate = np.full(timestampNew.shape, np.nan) + sampling_rate[0] = read_hdf5("timeCorrection_" + name, filepath, "sampling_rate")[0] + + control = helper_create_control_channel(signal, timestampNew, window) + + write_hdf5(control, event_name, filepath, "data") + d = {"timestamps": timestampNew, "data": control, "sampling_rate": sampling_rate} + df = pd.DataFrame(d) + df.to_csv(os.path.join(os.path.dirname(filepath), event.lower() + ".csv"), index=False) + logger.info("Control channel from signal channel created using curve-fitting") + + # TODO: figure out why a control channel is created for both timestamp correction and z-score steps. 
# helper function to create control channel using signal channel # by curve fitting signal channel to exponential function diff --git a/src/guppy/analysis/timestamp_correction.py b/src/guppy/analysis/timestamp_correction.py index d9d873f..709deca 100644 --- a/src/guppy/analysis/timestamp_correction.py +++ b/src/guppy/analysis/timestamp_correction.py @@ -2,9 +2,7 @@ import os import numpy as np -import pandas as pd -from .control_channel import helper_create_control_channel from .io_utils import ( check_TDT, read_hdf5, @@ -320,32 +318,6 @@ def applyCorrection( write_hdf5(arr, displayName + "_" + naming, filepath, "ts") -# main function to create control channel using -# signal channel and save it to a file -def create_control_channel(filepath, arr, window=5001): - - storenames = arr[0, :] - storesList = arr[1, :] - - for i in range(storesList.shape[0]): - event_name, event = storesList[i], storenames[i] - if "control" in event_name.lower() and "cntrl" in event.lower(): - logger.debug("Creating control channel from signal channel using curve-fitting") - name = event_name.split("_")[-1] - signal = read_hdf5("signal_" + name, filepath, "data") - timestampNew = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") - sampling_rate = np.full(timestampNew.shape, np.nan) - sampling_rate[0] = read_hdf5("timeCorrection_" + name, filepath, "sampling_rate")[0] - - control = helper_create_control_channel(signal, timestampNew, window) - - write_hdf5(control, event_name, filepath, "data") - d = {"timestamps": timestampNew, "data": control, "sampling_rate": sampling_rate} - df = pd.DataFrame(d) - df.to_csv(os.path.join(os.path.dirname(filepath), event.lower() + ".csv"), index=False) - logger.info("Control channel from signal channel created using curve-fitting") - - # function to check control and signal channel has same length # if not, take a smaller length and do pre-processing def check_cntrl_sig_length(channels_arr, name_to_data): diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index 127e929..9f1f14e 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -2,7 +2,6 @@ import json import logging import os -import shutil import sys import matplotlib.pyplot as plt @@ -13,6 +12,7 @@ processTimestampsForArtifacts, ) from .analysis.combine_data import combineData +from .analysis.control_channel import add_control_channel, create_control_channel from .analysis.io_utils import ( check_storeslistfile, check_TDT, @@ -22,7 +22,6 @@ takeOnlyDirs, ) from .analysis.timestamp_correction import ( - create_control_channel, decide_naming_and_applyCorrection_ttl, read_control_and_signal, read_ttl, @@ -212,54 +211,6 @@ def visualizeControlAndSignal(filepath, removeArtifacts): visualize(filepath, ts, control, signal, cntrl_sig_fit, plot_name, removeArtifacts) -# This function just creates placeholder Control-HDF5 files that are then immediately overwritten later on in the pipeline. -# TODO: Refactor this function to avoid unnecessary file creation. 
-# function to add control channel when there is no -# isosbestic control channel and update the storeslist file -def add_control_channel(filepath, arr): - - storenames = arr[0, :] - storesList = np.char.lower(arr[1, :]) - - keep_control = np.array([]) - # check a case if there is isosbestic control channel present - for i in range(storesList.shape[0]): - if "control" in storesList[i].lower(): - name = storesList[i].split("_")[-1] - new_str = "signal_" + str(name).lower() - find_signal = [True for i in storesList if i == new_str] - if len(find_signal) > 1: - logger.error("Error in naming convention of files or Error in storesList file") - raise Exception("Error in naming convention of files or Error in storesList file") - if len(find_signal) == 0: - logger.error( - "Isosbectic control channel parameter is set to False and still \ - storeslist file shows there is control channel present" - ) - raise Exception( - "Isosbectic control channel parameter is set to False and still \ - storeslist file shows there is control channel present" - ) - else: - continue - - for i in range(storesList.shape[0]): - if "signal" in storesList[i].lower(): - name = storesList[i].split("_")[-1] - new_str = "control_" + str(name).lower() - find_signal = [True for i in storesList if i == new_str] - if len(find_signal) == 0: - src, dst = os.path.join(filepath, arr[0, i] + ".hdf5"), os.path.join( - filepath, "cntrl" + str(i) + ".hdf5" - ) - shutil.copyfile(src, dst) - arr = np.concatenate((arr, [["cntrl" + str(i)], ["control_" + str(arr[1, i].split("_")[-1])]]), axis=1) - - np.savetxt(os.path.join(filepath, "storesList.csv"), arr, delimiter=",", fmt="%s") - - return arr - - # function to execute timestamps corrections using functions timestampCorrection and decide_naming_convention_and_applyCorrection def execute_timestamp_correction(folderNames, inputParameters): From 1f65c14b838096c4625e5895a791fb5d0976a64e Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 15 Dec 2025 10:57:02 -0800 Subject: [PATCH 080/150] Moved read and write to standard_io.py. 
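
get_control_and_signal_channel_names, now in io_utils, pairs channels with
nothing more than a case-insensitive sort and a reshape. A self-contained
sketch of that pairing with made-up store names (the callers still verify
that the suffixes in each column match and raise if they do not):

    import numpy as np

    names = ["signal_regionA", "control_regionA", "Signal_regionB", "Control_regionB"]

    # the case-insensitive sort puts every control_* entry ahead of every
    # signal_* entry, so reshape(2, -1) leaves controls in row 0 and signals
    # in row 1, column-aligned by suffix
    channels_arr = np.asarray(sorted(names, key=str.casefold)).reshape(2, -1)

    print(channels_arr[0])  # ['control_regionA' 'Control_regionB']
    print(channels_arr[1])  # ['signal_regionA' 'Signal_regionB']
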
--- src/guppy/analysis/io_utils.py | 19 +++ src/guppy/analysis/timestamp_correction.py | 138 +++++---------------- src/guppy/preprocess.py | 28 ++--- 3 files changed, 66 insertions(+), 119 deletions(-) diff --git a/src/guppy/analysis/io_utils.py b/src/guppy/analysis/io_utils.py index 8b10127..c11edba 100644 --- a/src/guppy/analysis/io_utils.py +++ b/src/guppy/analysis/io_utils.py @@ -166,3 +166,22 @@ def check_storeslistfile(folderNames): storesList = np.unique(storesList, axis=1) return storesList + + +def get_control_and_signal_channel_names(storesList): + storenames = storesList[0, :] + names_for_storenames = storesList[1, :] + + channels_arr = [] + for i in range(names_for_storenames.shape[0]): + if "control" in names_for_storenames[i].lower() or "signal" in names_for_storenames[i].lower(): + channels_arr.append(names_for_storenames[i]) + + channels_arr = sorted(channels_arr, key=str.casefold) + try: + channels_arr = np.asarray(channels_arr).reshape(2, -1) + except: + logger.error("Error in saving stores list file or spelling mistake for control or signal") + raise Exception("Error in saving stores list file or spelling mistake for control or signal") + + return channels_arr diff --git a/src/guppy/analysis/timestamp_correction.py b/src/guppy/analysis/timestamp_correction.py index 709deca..f48a255 100644 --- a/src/guppy/analysis/timestamp_correction.py +++ b/src/guppy/analysis/timestamp_correction.py @@ -5,32 +5,47 @@ from .io_utils import ( check_TDT, - read_hdf5, + get_control_and_signal_channel_names, write_hdf5, ) logger = logging.getLogger(__name__) -def write_corrected_timestamps( - filepath, corrected_name_to_timestamps, name_to_timestamps, name_to_sampling_rate, name_to_correctionIndex +def correct_timestamps( + timeForLightsTurnOn, + storesList, + name_to_timestamps, + name_to_data, + name_to_sampling_rate, + name_to_npoints, + name_to_timestamps_ttl, + mode, ): - for name, correctionIndex in name_to_correctionIndex.items(): - timestamps = name_to_timestamps[name] - corrected_timestamps = corrected_name_to_timestamps[name] - sampling_rate = name_to_sampling_rate[name] - if sampling_rate.shape == (): # numpy scalar - sampling_rate = np.asarray([sampling_rate]) - name_1 = name.split("_")[-1] - write_hdf5(np.asarray([timestamps[0]]), "timeCorrection_" + name_1, filepath, "timeRecStart") - write_hdf5(corrected_timestamps, "timeCorrection_" + name_1, filepath, "timestampNew") - write_hdf5(correctionIndex, "timeCorrection_" + name_1, filepath, "correctionIndex") - write_hdf5(sampling_rate, "timeCorrection_" + name_1, filepath, "sampling_rate") - + name_to_corrected_timestamps, name_to_correctionIndex, name_to_corrected_data = timestampCorrection( + timeForLightsTurnOn, + storesList, + name_to_timestamps, + name_to_data, + name_to_sampling_rate, + name_to_npoints, + mode=mode, + ) + compound_name_to_corrected_ttl_timestamps = decide_naming_and_applyCorrection_ttl( + timeForLightsTurnOn, + storesList, + name_to_timestamps_ttl, + name_to_timestamps, + name_to_data, + mode=mode, + ) -def write_corrected_data(filepath, name_to_corrected_data): - for name, data in name_to_corrected_data.items(): - write_hdf5(data, name, filepath, "data") + return ( + name_to_corrected_timestamps, + name_to_correctionIndex, + name_to_corrected_data, + compound_name_to_corrected_ttl_timestamps, + ) # function to correct timestamps after eliminating first few seconds of the data (for csv or TDT data depending on mode) @@ -263,16 +278,6 @@ def applyCorrection_ttl( return corrected_ttl_timestamps -def 
write_corrected_ttl_timestamps( - filepath, - compound_name_to_corrected_ttl_timestamps, -): - logger.debug("Applying correction of timestamps to the data and event timestamps") - for compound_name, corrected_ttl_timestamps in compound_name_to_corrected_ttl_timestamps.items(): - write_hdf5(corrected_ttl_timestamps, compound_name, filepath, "ts") - logger.info("Timestamps corrections applied to the data and event timestamps.") - - # function to apply correction to control, signal and event timestamps def applyCorrection( filepath, @@ -336,80 +341,3 @@ def check_cntrl_sig_length(channels_arr, name_to_data): indices.append(signal_name) return indices - - -def get_control_and_signal_channel_names(storesList): - storenames = storesList[0, :] - names_for_storenames = storesList[1, :] - - channels_arr = [] - for i in range(names_for_storenames.shape[0]): - if "control" in names_for_storenames[i].lower() or "signal" in names_for_storenames[i].lower(): - channels_arr.append(names_for_storenames[i]) - - channels_arr = sorted(channels_arr, key=str.casefold) - try: - channels_arr = np.asarray(channels_arr).reshape(2, -1) - except: - logger.error("Error in saving stores list file or spelling mistake for control or signal") - raise Exception("Error in saving stores list file or spelling mistake for control or signal") - - return channels_arr - - -def read_control_and_signal(filepath, storesList): - channels_arr = get_control_and_signal_channel_names(storesList) - storenames = storesList[0, :] - names_for_storenames = storesList[1, :] - - name_to_data = {} - name_to_timestamps = {} - name_to_sampling_rate = {} - name_to_npoints = {} - - for i in range(channels_arr.shape[1]): - control_name = channels_arr[0, i] - signal_name = channels_arr[1, i] - idx_c = np.where(storesList == control_name)[0] - idx_s = np.where(storesList == signal_name)[0] - control_storename = storenames[idx_c[0]] - signal_storename = storenames[idx_s[0]] - - control_data = read_hdf5(control_storename, filepath, "data") - signal_data = read_hdf5(signal_storename, filepath, "data") - control_timestamps = read_hdf5(control_storename, filepath, "timestamps") - signal_timestamps = read_hdf5(signal_storename, filepath, "timestamps") - control_sampling_rate = read_hdf5(control_storename, filepath, "sampling_rate") - signal_sampling_rate = read_hdf5(signal_storename, filepath, "sampling_rate") - try: # TODO: define npoints for csv datasets - control_npoints = read_hdf5(control_storename, filepath, "npoints") - signal_npoints = read_hdf5(signal_storename, filepath, "npoints") - except KeyError: # npoints is not defined for csv datasets - control_npoints = None - signal_npoints = None - - name_to_data[control_name] = control_data - name_to_data[signal_name] = signal_data - name_to_timestamps[control_name] = control_timestamps - name_to_timestamps[signal_name] = signal_timestamps - name_to_sampling_rate[control_name] = control_sampling_rate - name_to_sampling_rate[signal_name] = signal_sampling_rate - name_to_npoints[control_name] = control_npoints - name_to_npoints[signal_name] = signal_npoints - - return name_to_data, name_to_timestamps, name_to_sampling_rate, name_to_npoints - - -def read_ttl(filepath, storesList): - channels_arr = get_control_and_signal_channel_names(storesList) - storenames = storesList[0, :] - names_for_storenames = storesList[1, :] - - name_to_timestamps = {} - for storename, name in zip(storenames, names_for_storenames): - if name in channels_arr: - continue - timestamps = read_hdf5(storename, filepath, "timestamps") - 
name_to_timestamps[name] = timestamps - - return name_to_timestamps diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index 9f1f14e..aa0c761 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -21,15 +21,14 @@ read_hdf5, takeOnlyDirs, ) -from .analysis.timestamp_correction import ( - decide_naming_and_applyCorrection_ttl, +from .analysis.standard_io import ( read_control_and_signal, read_ttl, - timestampCorrection, write_corrected_data, write_corrected_timestamps, write_corrected_ttl_timestamps, ) +from .analysis.timestamp_correction import correct_timestamps from .analysis.z_score import compute_z_score logger = logging.getLogger(__name__) @@ -233,15 +232,25 @@ def execute_timestamp_correction(folderNames, inputParameters): control_and_signal_dicts = read_control_and_signal(filepath, storesList) name_to_data, name_to_timestamps, name_to_sampling_rate, name_to_npoints = control_and_signal_dicts - name_to_corrected_timestamps, name_to_correctionIndex, name_to_corrected_data = timestampCorrection( + name_to_timestamps_ttl = read_ttl(filepath, storesList) + + timestamps_dicts = correct_timestamps( timeForLightsTurnOn, storesList, name_to_timestamps, name_to_data, name_to_sampling_rate, name_to_npoints, + name_to_timestamps_ttl, mode=mode, ) + ( + name_to_corrected_timestamps, + name_to_correctionIndex, + name_to_corrected_data, + compound_name_to_corrected_ttl_timestamps, + ) = timestamps_dicts + write_corrected_timestamps( filepath, name_to_corrected_timestamps, @@ -250,17 +259,8 @@ def execute_timestamp_correction(folderNames, inputParameters): name_to_correctionIndex, ) write_corrected_data(filepath, name_to_corrected_data) - - name_to_timestamps_ttl = read_ttl(filepath, storesList) - compound_name_to_corrected_ttl_timestamps = decide_naming_and_applyCorrection_ttl( - timeForLightsTurnOn, - storesList, - name_to_timestamps_ttl, - name_to_timestamps, - name_to_data, - mode=mode, - ) write_corrected_ttl_timestamps(filepath, compound_name_to_corrected_ttl_timestamps) + # check if isosbestic control is false and also if new control channel is added if isosbestic_control == False: create_control_channel(filepath, storesList, window=101) From b628232b16de5a59260e8caa09b75a3504a56c40 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 15 Dec 2025 10:57:18 -0800 Subject: [PATCH 081/150] Moved read and write to standard_io.py. 
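
With the read/write halves collected here, execute_timestamp_correction
reduces to read -> correct -> write. A condensed sketch of that call order:
the path, stores list and parameter values are placeholders, imports assume
the package is importable as guppy (as elsewhere in the repo), and it assumes
the per-store HDF5 files from the extraction step already exist under the
output folder; the function names and signatures are the ones introduced in
this series:

    import numpy as np

    from guppy.analysis.standard_io import (
        read_control_and_signal,
        read_ttl,
        write_corrected_data,
        write_corrected_timestamps,
        write_corrected_ttl_timestamps,
    )
    from guppy.analysis.timestamp_correction import correct_timestamps

    filepath = "/path/to/session_output_1"      # placeholder output folder
    storesList = np.array([["Dv1A", "LNnR"],    # placeholder storenames
                           ["control_dLight", "signal_dLight"]])
    timeForLightsTurnOn = 1
    mode = "csv"                                # or "tdt"

    name_to_data, name_to_timestamps, name_to_sampling_rate, name_to_npoints = (
        read_control_and_signal(filepath, storesList)
    )
    name_to_timestamps_ttl = read_ttl(filepath, storesList)

    (
        name_to_corrected_timestamps,
        name_to_correctionIndex,
        name_to_corrected_data,
        compound_name_to_corrected_ttl_timestamps,
    ) = correct_timestamps(
        timeForLightsTurnOn,
        storesList,
        name_to_timestamps,
        name_to_data,
        name_to_sampling_rate,
        name_to_npoints,
        name_to_timestamps_ttl,
        mode=mode,
    )

    write_corrected_timestamps(
        filepath,
        name_to_corrected_timestamps,
        name_to_timestamps,
        name_to_sampling_rate,
        name_to_correctionIndex,
    )
    write_corrected_data(filepath, name_to_corrected_data)
    write_corrected_ttl_timestamps(filepath, compound_name_to_corrected_ttl_timestamps)
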
--- src/guppy/analysis/standard_io.py | 100 ++++++++++++++++++++++++++++++ 1 file changed, 100 insertions(+) create mode 100644 src/guppy/analysis/standard_io.py diff --git a/src/guppy/analysis/standard_io.py b/src/guppy/analysis/standard_io.py new file mode 100644 index 0000000..2ce8189 --- /dev/null +++ b/src/guppy/analysis/standard_io.py @@ -0,0 +1,100 @@ +import logging + +import numpy as np + +from .io_utils import ( + get_control_and_signal_channel_names, + read_hdf5, + write_hdf5, +) + +logger = logging.getLogger(__name__) + + +def read_control_and_signal(filepath, storesList): + channels_arr = get_control_and_signal_channel_names(storesList) + storenames = storesList[0, :] + names_for_storenames = storesList[1, :] + + name_to_data = {} + name_to_timestamps = {} + name_to_sampling_rate = {} + name_to_npoints = {} + + for i in range(channels_arr.shape[1]): + control_name = channels_arr[0, i] + signal_name = channels_arr[1, i] + idx_c = np.where(storesList == control_name)[0] + idx_s = np.where(storesList == signal_name)[0] + control_storename = storenames[idx_c[0]] + signal_storename = storenames[idx_s[0]] + + control_data = read_hdf5(control_storename, filepath, "data") + signal_data = read_hdf5(signal_storename, filepath, "data") + control_timestamps = read_hdf5(control_storename, filepath, "timestamps") + signal_timestamps = read_hdf5(signal_storename, filepath, "timestamps") + control_sampling_rate = read_hdf5(control_storename, filepath, "sampling_rate") + signal_sampling_rate = read_hdf5(signal_storename, filepath, "sampling_rate") + try: # TODO: define npoints for csv datasets + control_npoints = read_hdf5(control_storename, filepath, "npoints") + signal_npoints = read_hdf5(signal_storename, filepath, "npoints") + except KeyError: # npoints is not defined for csv datasets + control_npoints = None + signal_npoints = None + + name_to_data[control_name] = control_data + name_to_data[signal_name] = signal_data + name_to_timestamps[control_name] = control_timestamps + name_to_timestamps[signal_name] = signal_timestamps + name_to_sampling_rate[control_name] = control_sampling_rate + name_to_sampling_rate[signal_name] = signal_sampling_rate + name_to_npoints[control_name] = control_npoints + name_to_npoints[signal_name] = signal_npoints + + return name_to_data, name_to_timestamps, name_to_sampling_rate, name_to_npoints + + +def read_ttl(filepath, storesList): + channels_arr = get_control_and_signal_channel_names(storesList) + storenames = storesList[0, :] + names_for_storenames = storesList[1, :] + + name_to_timestamps = {} + for storename, name in zip(storenames, names_for_storenames): + if name in channels_arr: + continue + timestamps = read_hdf5(storename, filepath, "timestamps") + name_to_timestamps[name] = timestamps + + return name_to_timestamps + + +def write_corrected_timestamps( + filepath, corrected_name_to_timestamps, name_to_timestamps, name_to_sampling_rate, name_to_correctionIndex +): + for name, correctionIndex in name_to_correctionIndex.items(): + timestamps = name_to_timestamps[name] + corrected_timestamps = corrected_name_to_timestamps[name] + sampling_rate = name_to_sampling_rate[name] + if sampling_rate.shape == (): # numpy scalar + sampling_rate = np.asarray([sampling_rate]) + name_1 = name.split("_")[-1] + write_hdf5(np.asarray([timestamps[0]]), "timeCorrection_" + name_1, filepath, "timeRecStart") + write_hdf5(corrected_timestamps, "timeCorrection_" + name_1, filepath, "timestampNew") + write_hdf5(correctionIndex, "timeCorrection_" + name_1, filepath, 
"correctionIndex") + write_hdf5(sampling_rate, "timeCorrection_" + name_1, filepath, "sampling_rate") + + +def write_corrected_data(filepath, name_to_corrected_data): + for name, data in name_to_corrected_data.items(): + write_hdf5(data, name, filepath, "data") + + +def write_corrected_ttl_timestamps( + filepath, + compound_name_to_corrected_ttl_timestamps, +): + logger.debug("Applying correction of timestamps to the data and event timestamps") + for compound_name, corrected_ttl_timestamps in compound_name_to_corrected_ttl_timestamps.items(): + write_hdf5(corrected_ttl_timestamps, compound_name, filepath, "ts") + logger.info("Timestamps corrections applied to the data and event timestamps.") From 90e838bccde583051ddbf52e5d8902f4c4f01c00 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 15 Dec 2025 11:06:02 -0800 Subject: [PATCH 082/150] Removed unused functions after the refactor. --- src/guppy/analysis/timestamp_correction.py | 145 +-------------------- 1 file changed, 1 insertion(+), 144 deletions(-) diff --git a/src/guppy/analysis/timestamp_correction.py b/src/guppy/analysis/timestamp_correction.py index f48a255..60cf76a 100644 --- a/src/guppy/analysis/timestamp_correction.py +++ b/src/guppy/analysis/timestamp_correction.py @@ -1,13 +1,8 @@ import logging -import os import numpy as np -from .io_utils import ( - check_TDT, - get_control_and_signal_channel_names, - write_hdf5, -) +from .io_utils import get_control_and_signal_channel_names logger = logging.getLogger(__name__) @@ -122,99 +117,6 @@ def timestampCorrection( return name_to_corrected_timestamps, name_to_correctionIndex, name_to_corrected_data -# function to check if naming convention was followed while saving storeslist file -# and apply timestamps correction using the function applyCorrection -def decide_naming_convention_and_applyCorrection( - filepath, - timeForLightsTurnOn, - event, - displayName, - storesList, - name_1_to_corrected_timestamps, - name_1_to_timestamps, - name_1_to_sampling_rate, - name_1_to_correctionIndex, - data, - ttl_timestamps, -): - - logger.debug("Applying correction of timestamps to the data and event timestamps") - arr = get_control_and_signal_channel_names(storesList) - - for i in range(arr.shape[1]): - name_1 = arr[0, i].split("_")[-1] - name_2 = arr[1, i].split("_")[-1] - if name_1 != name_2: - logger.error("Error in naming convention of files or Error in storesList file") - raise Exception("Error in naming convention of files or Error in storesList file") - else: - corrected_timestamps = name_1_to_corrected_timestamps[name_1] - timestamps = name_1_to_timestamps[name_1] - timeRecStart = timestamps[0] - sampling_rate = name_1_to_sampling_rate[name_1] - correctionIndex = name_1_to_correctionIndex[name_1] - applyCorrection( - filepath, - timeForLightsTurnOn, - event, - displayName, - name_1, - corrected_timestamps, - sampling_rate, - correctionIndex, - timeRecStart, - data, - ttl_timestamps, - ) - - logger.info("Timestamps corrections applied to the data and event timestamps.") - - -def decide_naming_and_applyCorrection_signal_and_control( - filepath, - storesList, - name_to_correctionIndex, - name_to_data, -): - logger.debug("Applying correction of timestamps to the data and event timestamps") - storenames = storesList[0, :] - names_for_storenames = storesList[1, :] - arr = get_control_and_signal_channel_names(storesList) - indices = check_cntrl_sig_length(arr, name_to_data) - - for i in range(arr.shape[1]): - name_1 = arr[0, i].split("_")[-1] - name_2 = arr[1, i].split("_")[-1] - if name_1 != 
name_2: - logger.error("Error in naming convention of files or Error in storesList file") - raise Exception("Error in naming convention of files or Error in storesList file") - - idx = np.where(names_for_storenames == indices[i])[0] - if idx.shape[0] == 0: - logger.error(f"{arr[0,i]} does not exist in the stores list file.") - raise Exception("{} does not exist in the stores list file.".format(arr[0, i])) - - name = names_for_storenames[idx][0] - correctionIndex = name_to_correctionIndex[name] - control_name = arr[0, i] - signal_name = arr[1, i] - control_data = name_to_data[control_name] - signal_data = name_to_data[signal_name] - applyCorrection_signal_and_control(filepath, control_name, correctionIndex, control_data) - applyCorrection_signal_and_control(filepath, signal_name, correctionIndex, signal_data) - - logger.info("Timestamps corrections applied to the data and event timestamps.") - - -def applyCorrection_signal_and_control(filepath, displayName, correctionIndex, data): - arr = data - if (arr == 0).all() == True: - arr = arr - else: - arr = arr[correctionIndex] - write_hdf5(arr, displayName, filepath, "data") - - def decide_naming_and_applyCorrection_ttl( timeForLightsTurnOn, storesList, @@ -278,51 +180,6 @@ def applyCorrection_ttl( return corrected_ttl_timestamps -# function to apply correction to control, signal and event timestamps -def applyCorrection( - filepath, - timeForLightsTurnOn, - event, - displayName, - naming, - corrected_timestamps, - sampling_rate, - correctionIndex, - timeRecStart, - data, - ttl_timestamps, -): - - cond = check_TDT(os.path.dirname(filepath)) - - timestampNew = corrected_timestamps - if "control" in displayName.lower() or "signal" in displayName.lower(): - # TODO: double-check that this code is not reachable - # split_name = displayName.split("_")[-1] - # if split_name == naming: - # pass - # else: - # correctionIndex = read_hdf5("timeCorrection_" + split_name, filepath, "correctionIndex") - arr = data - if (arr == 0).all() == True: - arr = arr - else: - arr = arr[correctionIndex] - write_hdf5(arr, displayName, filepath, "data") - else: - arr = ttl_timestamps - if cond == True: - res = (arr >= timeRecStart).all() - if res == True: - arr = np.subtract(arr, timeRecStart) - arr = np.subtract(arr, timeForLightsTurnOn) - else: - arr = np.subtract(arr, timeForLightsTurnOn) - else: - arr = np.subtract(arr, timeForLightsTurnOn) - write_hdf5(arr, displayName + "_" + naming, filepath, "ts") - - # function to check control and signal channel has same length # if not, take a smaller length and do pre-processing def check_cntrl_sig_length(channels_arr, name_to_data): From bf57616f1671a0c5a0ca674cceb6c36cbdbc8fe5 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 15 Dec 2025 11:47:54 -0800 Subject: [PATCH 083/150] Refactored artifact removal separate from z score --- src/guppy/preprocess.py | 62 ++++++++++++++++++++++++++++------------- 1 file changed, 43 insertions(+), 19 deletions(-) diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index aa0c761..4f72929 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -270,15 +270,11 @@ def execute_timestamp_correction(folderNames, inputParameters): logger.info(f"Timestamps corrections finished for {filepath}") -# function to compute z-score and deltaF/F using functions : compute_z_score and/or processTimestampsForArtifacts +# function to compute z-score and deltaF/F def execute_zscore(folderNames, inputParameters): - timeForLightsTurnOn = inputParameters["timeForLightsTurnOn"] - 
remove_artifacts = inputParameters["removeArtifacts"] - artifactsRemovalMethod = inputParameters["artifactsRemovalMethod"] plot_zScore_dff = inputParameters["plot_zScore_dff"] combine_data = inputParameters["combine_data"] - isosbestic_control = inputParameters["isosbestic_control"] storesListPath = [] for i in range(len(folderNames)): @@ -292,20 +288,9 @@ def execute_zscore(folderNames, inputParameters): for j in range(len(storesListPath)): filepath = storesListPath[j] - storesList = np.genfromtxt(os.path.join(filepath, "storesList.csv"), dtype="str", delimiter=",").reshape(2, -1) - if remove_artifacts == True: - logger.debug("Removing Artifacts from the data and correcting timestamps...") - compute_z_score(filepath, inputParameters) - if artifactsRemovalMethod == "concatenate": - processTimestampsForArtifacts(filepath, timeForLightsTurnOn, storesList) - else: - addingNaNtoChunksWithArtifacts(filepath, storesList) - visualizeControlAndSignal(filepath, remove_artifacts) - logger.info("Artifacts from the data are removed and timestamps are corrected.") - else: - compute_z_score(filepath, inputParameters) - visualizeControlAndSignal(filepath, remove_artifacts) + compute_z_score(filepath, inputParameters) + visualizeControlAndSignal(filepath, removeArtifacts=False) if plot_zScore_dff == "z_score": visualize_z_score(filepath) @@ -319,7 +304,42 @@ def execute_zscore(folderNames, inputParameters): inputParameters["step"] += 1 plt.show() - logger.info("Signal data and event timestamps are extracted.") + logger.info("Z-score computation completed.") + + +# function to remove artifacts from z-score data +def execute_artifact_removal(folderNames, inputParameters): + + timeForLightsTurnOn = inputParameters["timeForLightsTurnOn"] + artifactsRemovalMethod = inputParameters["artifactsRemovalMethod"] + combine_data = inputParameters["combine_data"] + + storesListPath = [] + for i in range(len(folderNames)): + if combine_data == True: + storesListPath.append([folderNames[i][0]]) + else: + filepath = folderNames[i] + storesListPath.append(takeOnlyDirs(glob.glob(os.path.join(filepath, "*_output_*")))) + + storesListPath = np.concatenate(storesListPath) + + for j in range(len(storesListPath)): + filepath = storesListPath[j] + storesList = np.genfromtxt(os.path.join(filepath, "storesList.csv"), dtype="str", delimiter=",").reshape(2, -1) + + logger.debug("Removing artifacts from the data...") + if artifactsRemovalMethod == "concatenate": + processTimestampsForArtifacts(filepath, timeForLightsTurnOn, storesList) + else: + addingNaNtoChunksWithArtifacts(filepath, storesList) + visualizeControlAndSignal(filepath, removeArtifacts=True) + logger.info("Artifacts removed and timestamps corrected.") + + writeToFile(str(10 + ((inputParameters["step"] + 1) * 10)) + "\n") + inputParameters["step"] += 1 + + logger.info("Artifact removal completed.") def extractTsAndSignal(inputParameters): @@ -350,6 +370,8 @@ def extractTsAndSignal(inputParameters): writeToFile(str((pbMaxValue + 1) * 10) + "\n" + str(10) + "\n") execute_timestamp_correction(folderNames, inputParameters) execute_zscore(folderNames, inputParameters) + if remove_artifacts == True: + execute_artifact_removal(folderNames, inputParameters) else: pbMaxValue = 1 + len(folderNames) writeToFile(str((pbMaxValue) * 10) + "\n" + str(10) + "\n") @@ -357,6 +379,8 @@ def extractTsAndSignal(inputParameters): storesList = check_storeslistfile(folderNames) op_folder = combineData(folderNames, inputParameters, storesList) execute_zscore(op_folder, inputParameters) + if 
remove_artifacts == True: + execute_artifact_removal(op_folder, inputParameters) def main(input_parameters): From a03d018fb3ee1a5cf5558a8a8afc34f8019d665a Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 15 Dec 2025 14:27:38 -0800 Subject: [PATCH 084/150] Added artifact removal parameter back to execute_zscore. --- src/guppy/preprocess.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index 4f72929..ad4507e 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -275,6 +275,7 @@ def execute_zscore(folderNames, inputParameters): plot_zScore_dff = inputParameters["plot_zScore_dff"] combine_data = inputParameters["combine_data"] + remove_artifacts = inputParameters["removeArtifacts"] storesListPath = [] for i in range(len(folderNames)): @@ -290,7 +291,8 @@ def execute_zscore(folderNames, inputParameters): filepath = storesListPath[j] compute_z_score(filepath, inputParameters) - visualizeControlAndSignal(filepath, removeArtifacts=False) + if not remove_artifacts: + visualizeControlAndSignal(filepath, removeArtifacts=remove_artifacts) if plot_zScore_dff == "z_score": visualize_z_score(filepath) @@ -334,11 +336,11 @@ def execute_artifact_removal(folderNames, inputParameters): else: addingNaNtoChunksWithArtifacts(filepath, storesList) visualizeControlAndSignal(filepath, removeArtifacts=True) - logger.info("Artifacts removed and timestamps corrected.") writeToFile(str(10 + ((inputParameters["step"] + 1) * 10)) + "\n") inputParameters["step"] += 1 + plt.show() logger.info("Artifact removal completed.") From e0a4ca80e6b470c6d9d53e2a8c3032e93246e5a9 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 15 Dec 2025 14:31:13 -0800 Subject: [PATCH 085/150] Removed idle removeArtifacts parameter from compute z-score function. 
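Both branches of the removed conditional wrote the same three datasets, so the writes are now unconditional. For reference, a downstream reader can fetch the stored results as in this minimal sketch; the output folder and channel suffix below are hypothetical:

    from guppy.analysis.io_utils import read_hdf5

    filepath = "/data/session1_output_1"  # hypothetical output folder
    name = "region1"                      # hypothetical channel suffix

    # compute_z_score stores these three datasets regardless of removeArtifacts
    z_score = read_hdf5("z_score_" + name, filepath, "data")
    dff = read_hdf5("dff_" + name, filepath, "data")
    control_fit = read_hdf5("cntrl_sig_fit_" + name, filepath, "data")
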
--- src/guppy/analysis/z_score.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/src/guppy/analysis/z_score.py b/src/guppy/analysis/z_score.py index b5032be..87bf184 100644 --- a/src/guppy/analysis/z_score.py +++ b/src/guppy/analysis/z_score.py @@ -19,7 +19,6 @@ def compute_z_score(filepath, inputParameters): logger.debug(f"Computing z-score for each of the data in {filepath}") - remove_artifacts = inputParameters["removeArtifacts"] path_1 = find_files(filepath, "control_*", ignore_case=True) # glob.glob(os.path.join(filepath, 'control*')) path_2 = find_files(filepath, "signal_*", ignore_case=True) # glob.glob(os.path.join(filepath, 'signal*')) @@ -48,14 +47,9 @@ def compute_z_score(filepath, inputParameters): # signal_smooth = ss.filtfilt(b, a, signal) # _score, dff = helper_z_score(control_smooth, signal_smooth) z_score, dff, control_fit = helper_z_score(control, signal, filepath, name, inputParameters) - if remove_artifacts == True: - write_hdf5(z_score, "z_score_" + name, filepath, "data") - write_hdf5(dff, "dff_" + name, filepath, "data") - write_hdf5(control_fit, "cntrl_sig_fit_" + name, filepath, "data") - else: - write_hdf5(z_score, "z_score_" + name, filepath, "data") - write_hdf5(dff, "dff_" + name, filepath, "data") - write_hdf5(control_fit, "cntrl_sig_fit_" + name, filepath, "data") + write_hdf5(z_score, "z_score_" + name, filepath, "data") + write_hdf5(dff, "dff_" + name, filepath, "data") + write_hdf5(control_fit, "cntrl_sig_fit_" + name, filepath, "data") else: logger.error("Error in naming convention of files or Error in storesList file") raise Exception("Error in naming convention of files or Error in storesList file") From 44292ae41c2e7cc7ff2a94c93040da30ddba739d Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 15 Dec 2025 15:55:25 -0800 Subject: [PATCH 086/150] Streamlined remove artifact branch of the helper_z_score function. 
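When artifacts are not being removed, the branch now builds a single chunk spanning the whole recording, padded by one sample interval on each side, so the per-chunk loop handles both cases with one code path. A minimal check of that default, using a synthetic evenly spaced timestamp vector:

    import numpy as np

    tsNew = np.linspace(0.0, 10.0, 1001)  # synthetic timestamps
    dt = tsNew[1] - tsNew[0]
    coords = np.array([[tsNew[0] - dt, tsNew[-1] + dt]])

    # The strict inequalities used in the chunk loop still select every sample,
    # because the bounds are padded by one sample interval on each side.
    tsNew_index = np.where((tsNew > coords[0, 0]) & (tsNew < coords[0, 1]))[0]
    assert np.array_equal(tsNew_index, np.arange(tsNew.shape[0]))
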
--- src/guppy/analysis/z_score.py | 62 +++++++++++++++-------------------- 1 file changed, 27 insertions(+), 35 deletions(-) diff --git a/src/guppy/analysis/z_score.py b/src/guppy/analysis/z_score.py index 87bf184..5f64d7f 100644 --- a/src/guppy/analysis/z_score.py +++ b/src/guppy/analysis/z_score.py @@ -80,42 +80,34 @@ def helper_z_score(control, signal, filepath, name, inputParameters): # helper_ if removeArtifacts == True: coords = fetchCoords(filepath, name, tsNew) - - # for artifacts removal, each chunk which was selected by user is being processed individually and then - # z-score is calculated - for i in range(coords.shape[0]): - tsNew_index = np.where((tsNew > coords[i, 0]) & (tsNew < coords[i, 1]))[0] - if isosbestic_control == False: - control_arr = helper_create_control_channel(signal[tsNew_index], tsNew[tsNew_index], window=101) - signal_arr = signal[tsNew_index] - norm_data, control_fit = execute_controlFit_dff( - control_arr, signal_arr, isosbestic_control, filter_window - ) - temp_control_arr[tsNew_index] = control_arr - if i < coords.shape[0] - 1: - blank_index = np.where((tsNew > coords[i, 1]) & (tsNew < coords[i + 1, 0]))[0] - temp_control_arr[blank_index] = np.full(blank_index.shape[0], np.nan) - else: - control_arr = control[tsNew_index] - signal_arr = signal[tsNew_index] - norm_data, control_fit = execute_controlFit_dff( - control_arr, signal_arr, isosbestic_control, filter_window - ) - norm_data_arr[tsNew_index] = norm_data - control_fit_arr[tsNew_index] = control_fit - - if artifactsRemovalMethod == "concatenate": - norm_data_arr = norm_data_arr[~np.isnan(norm_data_arr)] - control_fit_arr = control_fit_arr[~np.isnan(control_fit_arr)] - z_score = z_score_computation(norm_data_arr, tsNew, inputParameters) - z_score_arr = np.concatenate((z_score_arr, z_score)) else: - tsNew_index = np.arange(tsNew.shape[0]) - norm_data, control_fit = execute_controlFit_dff(control, signal, isosbestic_control, filter_window) - z_score = z_score_computation(norm_data, tsNew, inputParameters) - z_score_arr = np.concatenate((z_score_arr, z_score)) - norm_data_arr[tsNew_index] = norm_data # np.concatenate((norm_data_arr, norm_data)) - control_fit_arr[tsNew_index] = control_fit # np.concatenate((control_fit_arr, control_fit)) + dt = tsNew[1] - tsNew[0] + coords = np.array([[tsNew[0] - dt, tsNew[-1] + dt]]) + + # for artifacts removal, each chunk which was selected by user is being processed individually and then + # z-score is calculated + for i in range(coords.shape[0]): + tsNew_index = np.where((tsNew > coords[i, 0]) & (tsNew < coords[i, 1]))[0] + if isosbestic_control == False: + control_arr = helper_create_control_channel(signal[tsNew_index], tsNew[tsNew_index], window=101) + signal_arr = signal[tsNew_index] + norm_data, control_fit = execute_controlFit_dff(control_arr, signal_arr, isosbestic_control, filter_window) + temp_control_arr[tsNew_index] = control_arr + if i < coords.shape[0] - 1: + blank_index = np.where((tsNew > coords[i, 1]) & (tsNew < coords[i + 1, 0]))[0] + temp_control_arr[blank_index] = np.full(blank_index.shape[0], np.nan) + else: + control_arr = control[tsNew_index] + signal_arr = signal[tsNew_index] + norm_data, control_fit = execute_controlFit_dff(control_arr, signal_arr, isosbestic_control, filter_window) + norm_data_arr[tsNew_index] = norm_data + control_fit_arr[tsNew_index] = control_fit + + if artifactsRemovalMethod == "concatenate": + norm_data_arr = norm_data_arr[~np.isnan(norm_data_arr)] + control_fit_arr = control_fit_arr[~np.isnan(control_fit_arr)] + z_score 
= z_score_computation(norm_data_arr, tsNew, inputParameters) + z_score_arr = np.concatenate((z_score_arr, z_score)) # handle the case if there are chunks being cut in the front and the end if isosbestic_control == False and removeArtifacts == True: From 6da97c08ec9da0448b9a7ace28f31ebea463b62b Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 15 Dec 2025 16:02:49 -0800 Subject: [PATCH 087/150] Streamlined remove artifact branch of the helper_z_score function pt 2 --- src/guppy/analysis/control_channel.py | 1 + src/guppy/analysis/z_score.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/guppy/analysis/control_channel.py b/src/guppy/analysis/control_channel.py index d9f6ad8..605bd17 100644 --- a/src/guppy/analysis/control_channel.py +++ b/src/guppy/analysis/control_channel.py @@ -1,5 +1,6 @@ import logging import os +import shutil import numpy as np import pandas as pd diff --git a/src/guppy/analysis/z_score.py b/src/guppy/analysis/z_score.py index 5f64d7f..9472322 100644 --- a/src/guppy/analysis/z_score.py +++ b/src/guppy/analysis/z_score.py @@ -110,7 +110,7 @@ def helper_z_score(control, signal, filepath, name, inputParameters): # helper_ z_score_arr = np.concatenate((z_score_arr, z_score)) # handle the case if there are chunks being cut in the front and the end - if isosbestic_control == False and removeArtifacts == True: + if isosbestic_control == False: coords = coords.flatten() # front chunk idx = np.where((tsNew >= tsNew[0]) & (tsNew < coords[0]))[0] From d8bfcc0d8ba9c1e06b9c484613dd6e4c7fec3d05 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 15 Dec 2025 16:14:14 -0800 Subject: [PATCH 088/150] Pulled remove_artifact code out of helper_z_score --- src/guppy/analysis/z_score.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/guppy/analysis/z_score.py b/src/guppy/analysis/z_score.py index 9472322..60bb88a 100644 --- a/src/guppy/analysis/z_score.py +++ b/src/guppy/analysis/z_score.py @@ -46,7 +46,14 @@ def compute_z_score(filepath, inputParameters): # control_smooth = ss.filtfilt(b, a, control) # signal_smooth = ss.filtfilt(b, a, signal) # _score, dff = helper_z_score(control_smooth, signal_smooth) - z_score, dff, control_fit = helper_z_score(control, signal, filepath, name, inputParameters) + tsNew = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") + removeArtifacts = inputParameters["removeArtifacts"] + if removeArtifacts == True: + coords = fetchCoords(filepath, name, tsNew) + else: + dt = tsNew[1] - tsNew[0] + coords = np.array([[tsNew[0] - dt, tsNew[-1] + dt]]) + z_score, dff, control_fit = helper_z_score(control, signal, filepath, name, inputParameters, coords) write_hdf5(z_score, "z_score_" + name, filepath, "data") write_hdf5(dff, "dff_" + name, filepath, "data") write_hdf5(control_fit, "cntrl_sig_fit_" + name, filepath, "data") @@ -58,9 +65,10 @@ def compute_z_score(filepath, inputParameters): # helper function to compute z-score and deltaF/F -def helper_z_score(control, signal, filepath, name, inputParameters): # helper_z_score(control_smooth, signal_smooth): +def helper_z_score( + control, signal, filepath, name, inputParameters, coords +): # helper_z_score(control_smooth, signal_smooth): - removeArtifacts = inputParameters["removeArtifacts"] artifactsRemovalMethod = inputParameters["artifactsRemovalMethod"] filter_window = inputParameters["filter_window"] @@ -68,8 +76,6 @@ def helper_z_score(control, signal, filepath, name, inputParameters): # helper_ tsNew = 
read_hdf5("timeCorrection_" + name, filepath, "timestampNew") coords_path = os.path.join(filepath, "coordsForPreProcessing_" + name + ".npy") - logger.info("Remove Artifacts : ", removeArtifacts) - if (control == 0).all() == True: control = np.zeros(tsNew.shape[0]) @@ -78,12 +84,6 @@ def helper_z_score(control, signal, filepath, name, inputParameters): # helper_ control_fit_arr = np.full(tsNew.shape[0], np.nan) temp_control_arr = np.full(tsNew.shape[0], np.nan) - if removeArtifacts == True: - coords = fetchCoords(filepath, name, tsNew) - else: - dt = tsNew[1] - tsNew[0] - coords = np.array([[tsNew[0] - dt, tsNew[-1] + dt]]) - # for artifacts removal, each chunk which was selected by user is being processed individually and then # z-score is calculated for i in range(coords.shape[0]): From b33c522ed317376f771794f166003d98bc815f4c Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 15 Dec 2025 16:28:20 -0800 Subject: [PATCH 089/150] Pulled remove_artifact code into dedicated fn --- src/guppy/analysis/z_score.py | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/src/guppy/analysis/z_score.py b/src/guppy/analysis/z_score.py index 60bb88a..7537f9d 100644 --- a/src/guppy/analysis/z_score.py +++ b/src/guppy/analysis/z_score.py @@ -25,35 +25,26 @@ def compute_z_score(filepath, inputParameters): path = sorted(path_1 + path_2, key=str.casefold) - b = np.divide(np.ones((100,)), 100) - a = 1 - if len(path) % 2 != 0: logger.error("There are not equal number of Control and Signal data") raise Exception("There are not equal number of Control and Signal data") path = np.asarray(path).reshape(2, -1) + removeArtifacts = inputParameters["removeArtifacts"] for i in range(path.shape[1]): name_1 = ((os.path.basename(path[0, i])).split(".")[0]).split("_") name_2 = ((os.path.basename(path[1, i])).split(".")[0]).split("_") - # dirname = os.path.dirname(path[i]) if name_1[-1] == name_2[-1]: name = name_1[-1] control = read_hdf5("", path[0, i], "data").reshape(-1) signal = read_hdf5("", path[1, i], "data").reshape(-1) - # control_smooth = ss.filtfilt(b, a, control) - # signal_smooth = ss.filtfilt(b, a, signal) - # _score, dff = helper_z_score(control_smooth, signal_smooth) tsNew = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") - removeArtifacts = inputParameters["removeArtifacts"] - if removeArtifacts == True: - coords = fetchCoords(filepath, name, tsNew) - else: - dt = tsNew[1] - tsNew[0] - coords = np.array([[tsNew[0] - dt, tsNew[-1] + dt]]) - z_score, dff, control_fit = helper_z_score(control, signal, filepath, name, inputParameters, coords) + + coords = get_coords(filepath, name, tsNew, removeArtifacts) + z_score, dff, control_fit = helper_z_score(control, signal, tsNew, filepath, name, inputParameters, coords) + write_hdf5(z_score, "z_score_" + name, filepath, "data") write_hdf5(dff, "dff_" + name, filepath, "data") write_hdf5(control_fit, "cntrl_sig_fit_" + name, filepath, "data") @@ -64,17 +55,23 @@ def compute_z_score(filepath, inputParameters): logger.info(f"z-score for the data in {filepath} computed.") +def get_coords(filepath, name, tsNew, removeArtifacts): # TODO: Make less redundant with fetchCoords + if removeArtifacts == True: + coords = fetchCoords(filepath, name, tsNew) + else: + dt = tsNew[1] - tsNew[0] + coords = np.array([[tsNew[0] - dt, tsNew[-1] + dt]]) + return coords + + # helper function to compute z-score and deltaF/F def helper_z_score( - control, signal, filepath, name, inputParameters, coords + control, signal, tsNew, 
filepath, name, inputParameters, coords ): # helper_z_score(control_smooth, signal_smooth): artifactsRemovalMethod = inputParameters["artifactsRemovalMethod"] filter_window = inputParameters["filter_window"] - isosbestic_control = inputParameters["isosbestic_control"] - tsNew = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") - coords_path = os.path.join(filepath, "coordsForPreProcessing_" + name + ".npy") if (control == 0).all() == True: control = np.zeros(tsNew.shape[0]) From e87c80963224de6e298fab3c50514598cf6a0009 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 15 Dec 2025 16:44:41 -0800 Subject: [PATCH 090/150] Pulled write code out of helper_z_score --- src/guppy/analysis/z_score.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/guppy/analysis/z_score.py b/src/guppy/analysis/z_score.py index 7537f9d..0dd4171 100644 --- a/src/guppy/analysis/z_score.py +++ b/src/guppy/analysis/z_score.py @@ -43,11 +43,15 @@ def compute_z_score(filepath, inputParameters): tsNew = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") coords = get_coords(filepath, name, tsNew, removeArtifacts) - z_score, dff, control_fit = helper_z_score(control, signal, tsNew, filepath, name, inputParameters, coords) + z_score, dff, control_fit, temp_control_arr = helper_z_score( + control, signal, tsNew, filepath, name, inputParameters, coords + ) write_hdf5(z_score, "z_score_" + name, filepath, "data") write_hdf5(dff, "dff_" + name, filepath, "data") write_hdf5(control_fit, "cntrl_sig_fit_" + name, filepath, "data") + if temp_control_arr is not None: + write_hdf5(temp_control_arr, "control_" + name, filepath, "data") else: logger.error("Error in naming convention of files or Error in storesList file") raise Exception("Error in naming convention of files or Error in storesList file") @@ -115,9 +119,10 @@ def helper_z_score( # end chunk idx = np.where((tsNew > coords[-1]) & (tsNew <= tsNew[-1]))[0] temp_control_arr[idx] = np.full(idx.shape[0], np.nan) - write_hdf5(temp_control_arr, "control_" + name, filepath, "data") + else: + temp_control_arr = None - return z_score_arr, norm_data_arr, control_fit_arr + return z_score_arr, norm_data_arr, control_fit_arr, temp_control_arr # function to filter control and signal channel, also execute above two function : controlFit and deltaFF From cf7345888e6c42e330263ca596271348b36d57a7 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 15 Dec 2025 16:47:26 -0800 Subject: [PATCH 091/150] inverted input handling --- src/guppy/analysis/z_score.py | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/src/guppy/analysis/z_score.py b/src/guppy/analysis/z_score.py index 0dd4171..8fc598b 100644 --- a/src/guppy/analysis/z_score.py +++ b/src/guppy/analysis/z_score.py @@ -35,27 +35,26 @@ def compute_z_score(filepath, inputParameters): for i in range(path.shape[1]): name_1 = ((os.path.basename(path[0, i])).split(".")[0]).split("_") name_2 = ((os.path.basename(path[1, i])).split(".")[0]).split("_") - - if name_1[-1] == name_2[-1]: - name = name_1[-1] - control = read_hdf5("", path[0, i], "data").reshape(-1) - signal = read_hdf5("", path[1, i], "data").reshape(-1) - tsNew = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") - - coords = get_coords(filepath, name, tsNew, removeArtifacts) - z_score, dff, control_fit, temp_control_arr = helper_z_score( - control, signal, tsNew, filepath, name, inputParameters, coords - ) - - write_hdf5(z_score, "z_score_" + name, filepath, 
"data") - write_hdf5(dff, "dff_" + name, filepath, "data") - write_hdf5(control_fit, "cntrl_sig_fit_" + name, filepath, "data") - if temp_control_arr is not None: - write_hdf5(temp_control_arr, "control_" + name, filepath, "data") - else: + if name_1[-1] != name_2[-1]: logger.error("Error in naming convention of files or Error in storesList file") raise Exception("Error in naming convention of files or Error in storesList file") + name = name_1[-1] + control = read_hdf5("", path[0, i], "data").reshape(-1) + signal = read_hdf5("", path[1, i], "data").reshape(-1) + tsNew = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") + + coords = get_coords(filepath, name, tsNew, removeArtifacts) + z_score, dff, control_fit, temp_control_arr = helper_z_score( + control, signal, tsNew, filepath, name, inputParameters, coords + ) + + write_hdf5(z_score, "z_score_" + name, filepath, "data") + write_hdf5(dff, "dff_" + name, filepath, "data") + write_hdf5(control_fit, "cntrl_sig_fit_" + name, filepath, "data") + if temp_control_arr is not None: + write_hdf5(temp_control_arr, "control_" + name, filepath, "data") + logger.info(f"z-score for the data in {filepath} computed.") From 7304fae988fdf569532f6918acabf0b6b902b08e Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 15 Dec 2025 16:50:34 -0800 Subject: [PATCH 092/150] removed unnecessary parameters --- src/guppy/analysis/z_score.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/guppy/analysis/z_score.py b/src/guppy/analysis/z_score.py index 8fc598b..1afe9e5 100644 --- a/src/guppy/analysis/z_score.py +++ b/src/guppy/analysis/z_score.py @@ -45,9 +45,7 @@ def compute_z_score(filepath, inputParameters): tsNew = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") coords = get_coords(filepath, name, tsNew, removeArtifacts) - z_score, dff, control_fit, temp_control_arr = helper_z_score( - control, signal, tsNew, filepath, name, inputParameters, coords - ) + z_score, dff, control_fit, temp_control_arr = helper_z_score(control, signal, tsNew, inputParameters, coords) write_hdf5(z_score, "z_score_" + name, filepath, "data") write_hdf5(dff, "dff_" + name, filepath, "data") @@ -68,9 +66,7 @@ def get_coords(filepath, name, tsNew, removeArtifacts): # TODO: Make less redun # helper function to compute z-score and deltaF/F -def helper_z_score( - control, signal, tsNew, filepath, name, inputParameters, coords -): # helper_z_score(control_smooth, signal_smooth): +def helper_z_score(control, signal, tsNew, inputParameters, coords): artifactsRemovalMethod = inputParameters["artifactsRemovalMethod"] filter_window = inputParameters["filter_window"] From 965f62b4edc3455c6414eea6432b6325caa69580 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Tue, 16 Dec 2025 11:40:26 -0800 Subject: [PATCH 093/150] purified helper_z_score --- src/guppy/analysis/z_score.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/guppy/analysis/z_score.py b/src/guppy/analysis/z_score.py index 1afe9e5..167863a 100644 --- a/src/guppy/analysis/z_score.py +++ b/src/guppy/analysis/z_score.py @@ -18,6 +18,10 @@ # compute z-score and deltaF/F and save it to hdf5 file def compute_z_score(filepath, inputParameters): + artifactsRemovalMethod = inputParameters["artifactsRemovalMethod"] + filter_window = inputParameters["filter_window"] + isosbestic_control = inputParameters["isosbestic_control"] + logger.debug(f"Computing z-score for each of the data in {filepath}") path_1 = find_files(filepath, "control_*", 
ignore_case=True) # glob.glob(os.path.join(filepath, 'control*')) @@ -45,7 +49,9 @@ def compute_z_score(filepath, inputParameters): tsNew = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") coords = get_coords(filepath, name, tsNew, removeArtifacts) - z_score, dff, control_fit, temp_control_arr = helper_z_score(control, signal, tsNew, inputParameters, coords) + z_score, dff, control_fit, temp_control_arr = helper_z_score( + control, signal, tsNew, inputParameters, coords, artifactsRemovalMethod, filter_window, isosbestic_control + ) write_hdf5(z_score, "z_score_" + name, filepath, "data") write_hdf5(dff, "dff_" + name, filepath, "data") @@ -66,12 +72,9 @@ def get_coords(filepath, name, tsNew, removeArtifacts): # TODO: Make less redun # helper function to compute z-score and deltaF/F -def helper_z_score(control, signal, tsNew, inputParameters, coords): - - artifactsRemovalMethod = inputParameters["artifactsRemovalMethod"] - filter_window = inputParameters["filter_window"] - isosbestic_control = inputParameters["isosbestic_control"] - +def helper_z_score( + control, signal, tsNew, inputParameters, coords, artifactsRemovalMethod, filter_window, isosbestic_control +): if (control == 0).all() == True: control = np.zeros(tsNew.shape[0]) From c49d05f32bf2933abdc02bdeac73ed4ad2043607 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Tue, 16 Dec 2025 11:44:38 -0800 Subject: [PATCH 094/150] purified z_score_computation --- src/guppy/analysis/z_score.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/guppy/analysis/z_score.py b/src/guppy/analysis/z_score.py index 167863a..7dae540 100644 --- a/src/guppy/analysis/z_score.py +++ b/src/guppy/analysis/z_score.py @@ -75,6 +75,8 @@ def get_coords(filepath, name, tsNew, removeArtifacts): # TODO: Make less redun def helper_z_score( control, signal, tsNew, inputParameters, coords, artifactsRemovalMethod, filter_window, isosbestic_control ): + zscore_method = inputParameters["zscore_method"] + baseline_start, baseline_end = inputParameters["baselineWindowStart"], inputParameters["baselineWindowEnd"] if (control == 0).all() == True: control = np.zeros(tsNew.shape[0]) @@ -105,7 +107,7 @@ def helper_z_score( if artifactsRemovalMethod == "concatenate": norm_data_arr = norm_data_arr[~np.isnan(norm_data_arr)] control_fit_arr = control_fit_arr[~np.isnan(control_fit_arr)] - z_score = z_score_computation(norm_data_arr, tsNew, inputParameters) + z_score = z_score_computation(norm_data_arr, tsNew, zscore_method, baseline_start, baseline_end) z_score_arr = np.concatenate((z_score_arr, z_score)) # handle the case if there are chunks being cut in the front and the end @@ -173,11 +175,7 @@ def filterSignal(filter_window, signal): # function to compute z-score based on z-score computation method -def z_score_computation(dff, timestamps, inputParameters): - - zscore_method = inputParameters["zscore_method"] - baseline_start, baseline_end = inputParameters["baselineWindowStart"], inputParameters["baselineWindowEnd"] - +def z_score_computation(dff, timestamps, zscore_method, baseline_start, baseline_end): if zscore_method == "standard z-score": numerator = np.subtract(dff, np.nanmean(dff)) zscore = np.divide(numerator, np.nanstd(dff)) From a88c026aef77be33f3154caaed65b2d595be11d8 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Tue, 16 Dec 2025 11:46:53 -0800 Subject: [PATCH 095/150] purified helper_z_score --- src/guppy/analysis/z_score.py | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git 
a/src/guppy/analysis/z_score.py b/src/guppy/analysis/z_score.py index 7dae540..31645b5 100644 --- a/src/guppy/analysis/z_score.py +++ b/src/guppy/analysis/z_score.py @@ -21,6 +21,8 @@ def compute_z_score(filepath, inputParameters): artifactsRemovalMethod = inputParameters["artifactsRemovalMethod"] filter_window = inputParameters["filter_window"] isosbestic_control = inputParameters["isosbestic_control"] + zscore_method = inputParameters["zscore_method"] + baseline_start, baseline_end = inputParameters["baselineWindowStart"], inputParameters["baselineWindowEnd"] logger.debug(f"Computing z-score for each of the data in {filepath}") @@ -50,7 +52,16 @@ def compute_z_score(filepath, inputParameters): coords = get_coords(filepath, name, tsNew, removeArtifacts) z_score, dff, control_fit, temp_control_arr = helper_z_score( - control, signal, tsNew, inputParameters, coords, artifactsRemovalMethod, filter_window, isosbestic_control + control, + signal, + tsNew, + coords, + artifactsRemovalMethod, + filter_window, + isosbestic_control, + zscore_method, + baseline_start, + baseline_end, ) write_hdf5(z_score, "z_score_" + name, filepath, "data") @@ -73,10 +84,17 @@ def get_coords(filepath, name, tsNew, removeArtifacts): # TODO: Make less redun # helper function to compute z-score and deltaF/F def helper_z_score( - control, signal, tsNew, inputParameters, coords, artifactsRemovalMethod, filter_window, isosbestic_control + control, + signal, + tsNew, + coords, + artifactsRemovalMethod, + filter_window, + isosbestic_control, + zscore_method, + baseline_start, + baseline_end, ): - zscore_method = inputParameters["zscore_method"] - baseline_start, baseline_end = inputParameters["baselineWindowStart"], inputParameters["baselineWindowEnd"] if (control == 0).all() == True: control = np.zeros(tsNew.shape[0]) From bf268f81147a5b471d9506c63a26ab34080074f9 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Tue, 16 Dec 2025 12:12:07 -0800 Subject: [PATCH 096/150] Refactored zscore to use a single high-level compute_zscore function that is pure and moved all the impure code into execute_zscore in preprocess.py. 
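With the I/O pulled out, the per-channel flow in execute_zscore reduces to read, compute, write. A sketch of that flow for one control/signal pair follows; the folder, channel suffix, file locations, and parameter values are illustrative assumptions, not fixed defaults:

    from guppy.analysis.io_utils import get_coords
    from guppy.analysis.standard_io import read_corrected_data, write_zscore
    from guppy.analysis.z_score import compute_z_score

    filepath = "/data/session1_output_1"               # hypothetical output folder
    name = "region1"                                   # hypothetical channel suffix
    control_path = filepath + "/control_region1.hdf5"  # hypothetical file paths
    signal_path = filepath + "/signal_region1.hdf5"

    control, signal, tsNew = read_corrected_data(control_path, signal_path, filepath, name)
    coords = get_coords(filepath, name, tsNew, removeArtifacts=False)
    z_score, dff, control_fit, temp_control_arr = compute_z_score(
        control,
        signal,
        tsNew,
        coords,
        artifactsRemovalMethod="concatenate",          # assumed settings
        filter_window=100,
        isosbestic_control=True,
        zscore_method="standard z-score",
        baseline_start=0,
        baseline_end=0,
    )
    write_zscore(filepath, name, z_score, dff, control_fit, temp_control_arr)
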
--- src/guppy/analysis/io_utils.py | 9 ++++ src/guppy/analysis/standard_io.py | 16 +++++++ src/guppy/analysis/z_score.py | 78 +------------------------------ src/guppy/preprocess.py | 44 ++++++++++++++++- 4 files changed, 69 insertions(+), 78 deletions(-) diff --git a/src/guppy/analysis/io_utils.py b/src/guppy/analysis/io_utils.py index c11edba..b467c37 100644 --- a/src/guppy/analysis/io_utils.py +++ b/src/guppy/analysis/io_utils.py @@ -133,6 +133,15 @@ def fetchCoords(filepath, naming, data): return coords +def get_coords(filepath, name, tsNew, removeArtifacts): # TODO: Make less redundant with fetchCoords + if removeArtifacts == True: + coords = fetchCoords(filepath, name, tsNew) + else: + dt = tsNew[1] - tsNew[0] + coords = np.array([[tsNew[0] - dt, tsNew[-1] + dt]]) + return coords + + def get_all_stores_for_combining_data(folderNames): op = [] for i in range(100): diff --git a/src/guppy/analysis/standard_io.py b/src/guppy/analysis/standard_io.py index 2ce8189..b6fcd8a 100644 --- a/src/guppy/analysis/standard_io.py +++ b/src/guppy/analysis/standard_io.py @@ -98,3 +98,19 @@ def write_corrected_ttl_timestamps( for compound_name, corrected_ttl_timestamps in compound_name_to_corrected_ttl_timestamps.items(): write_hdf5(corrected_ttl_timestamps, compound_name, filepath, "ts") logger.info("Timestamps corrections applied to the data and event timestamps.") + + +def read_corrected_data(control_path, signal_path, filepath, name): + control = read_hdf5("", control_path, "data").reshape(-1) + signal = read_hdf5("", signal_path, "data").reshape(-1) + tsNew = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") + + return control, signal, tsNew + + +def write_zscore(filepath, name, z_score, dff, control_fit, temp_control_arr): + write_hdf5(z_score, "z_score_" + name, filepath, "data") + write_hdf5(dff, "dff_" + name, filepath, "data") + write_hdf5(control_fit, "cntrl_sig_fit_" + name, filepath, "data") + if temp_control_arr is not None: + write_hdf5(temp_control_arr, "control_" + name, filepath, "data") diff --git a/src/guppy/analysis/z_score.py b/src/guppy/analysis/z_score.py index 31645b5..34b29ee 100644 --- a/src/guppy/analysis/z_score.py +++ b/src/guppy/analysis/z_score.py @@ -1,89 +1,15 @@ import logging -import os import numpy as np from scipy import signal as ss from .control_channel import helper_create_control_channel -from .io_utils import ( - fetchCoords, - find_files, - read_hdf5, - write_hdf5, -) logger = logging.getLogger(__name__) -# compute z-score and deltaF/F and save it to hdf5 file -def compute_z_score(filepath, inputParameters): - - artifactsRemovalMethod = inputParameters["artifactsRemovalMethod"] - filter_window = inputParameters["filter_window"] - isosbestic_control = inputParameters["isosbestic_control"] - zscore_method = inputParameters["zscore_method"] - baseline_start, baseline_end = inputParameters["baselineWindowStart"], inputParameters["baselineWindowEnd"] - - logger.debug(f"Computing z-score for each of the data in {filepath}") - - path_1 = find_files(filepath, "control_*", ignore_case=True) # glob.glob(os.path.join(filepath, 'control*')) - path_2 = find_files(filepath, "signal_*", ignore_case=True) # glob.glob(os.path.join(filepath, 'signal*')) - - path = sorted(path_1 + path_2, key=str.casefold) - - if len(path) % 2 != 0: - logger.error("There are not equal number of Control and Signal data") - raise Exception("There are not equal number of Control and Signal data") - - path = np.asarray(path).reshape(2, -1) - removeArtifacts = 
inputParameters["removeArtifacts"] - - for i in range(path.shape[1]): - name_1 = ((os.path.basename(path[0, i])).split(".")[0]).split("_") - name_2 = ((os.path.basename(path[1, i])).split(".")[0]).split("_") - if name_1[-1] != name_2[-1]: - logger.error("Error in naming convention of files or Error in storesList file") - raise Exception("Error in naming convention of files or Error in storesList file") - - name = name_1[-1] - control = read_hdf5("", path[0, i], "data").reshape(-1) - signal = read_hdf5("", path[1, i], "data").reshape(-1) - tsNew = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") - - coords = get_coords(filepath, name, tsNew, removeArtifacts) - z_score, dff, control_fit, temp_control_arr = helper_z_score( - control, - signal, - tsNew, - coords, - artifactsRemovalMethod, - filter_window, - isosbestic_control, - zscore_method, - baseline_start, - baseline_end, - ) - - write_hdf5(z_score, "z_score_" + name, filepath, "data") - write_hdf5(dff, "dff_" + name, filepath, "data") - write_hdf5(control_fit, "cntrl_sig_fit_" + name, filepath, "data") - if temp_control_arr is not None: - write_hdf5(temp_control_arr, "control_" + name, filepath, "data") - - logger.info(f"z-score for the data in {filepath} computed.") - - -def get_coords(filepath, name, tsNew, removeArtifacts): # TODO: Make less redundant with fetchCoords - if removeArtifacts == True: - coords = fetchCoords(filepath, name, tsNew) - else: - dt = tsNew[1] - tsNew[0] - coords = np.array([[tsNew[0] - dt, tsNew[-1] + dt]]) - return coords - - -# helper function to compute z-score and deltaF/F -def helper_z_score( +# high-level function to compute z-score and deltaF/F +def compute_z_score( control, signal, tsNew, diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index ad4507e..5829a2d 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -18,15 +18,18 @@ check_TDT, find_files, get_all_stores_for_combining_data, # noqa: F401 -- Necessary for other modules that depend on preprocess.py + get_coords, read_hdf5, takeOnlyDirs, ) from .analysis.standard_io import ( read_control_and_signal, + read_corrected_data, read_ttl, write_corrected_data, write_corrected_timestamps, write_corrected_ttl_timestamps, + write_zscore, ) from .analysis.timestamp_correction import correct_timestamps from .analysis.z_score import compute_z_score @@ -276,6 +279,11 @@ def execute_zscore(folderNames, inputParameters): plot_zScore_dff = inputParameters["plot_zScore_dff"] combine_data = inputParameters["combine_data"] remove_artifacts = inputParameters["removeArtifacts"] + artifactsRemovalMethod = inputParameters["artifactsRemovalMethod"] + filter_window = inputParameters["filter_window"] + isosbestic_control = inputParameters["isosbestic_control"] + zscore_method = inputParameters["zscore_method"] + baseline_start, baseline_end = inputParameters["baselineWindowStart"], inputParameters["baselineWindowEnd"] storesListPath = [] for i in range(len(folderNames)): @@ -284,13 +292,45 @@ def execute_zscore(folderNames, inputParameters): else: filepath = folderNames[i] storesListPath.append(takeOnlyDirs(glob.glob(os.path.join(filepath, "*_output_*")))) - storesListPath = np.concatenate(storesListPath) for j in range(len(storesListPath)): filepath = storesListPath[j] + logger.debug(f"Computing z-score for each of the data in {filepath}") + path_1 = find_files(filepath, "control_*", ignore_case=True) # glob.glob(os.path.join(filepath, 'control*')) + path_2 = find_files(filepath, "signal_*", ignore_case=True) # 
glob.glob(os.path.join(filepath, 'signal*')) + path = sorted(path_1 + path_2, key=str.casefold) + if len(path) % 2 != 0: + logger.error("There are not equal number of Control and Signal data") + raise Exception("There are not equal number of Control and Signal data") + path = np.asarray(path).reshape(2, -1) + + for i in range(path.shape[1]): + name_1 = ((os.path.basename(path[0, i])).split(".")[0]).split("_") + name_2 = ((os.path.basename(path[1, i])).split(".")[0]).split("_") + if name_1[-1] != name_2[-1]: + logger.error("Error in naming convention of files or Error in storesList file") + raise Exception("Error in naming convention of files or Error in storesList file") + name = name_1[-1] + + control, signal, tsNew = read_corrected_data(path[0, i], path[1, i], filepath, name) + coords = get_coords(filepath, name, tsNew, remove_artifacts) + z_score, dff, control_fit, temp_control_arr = compute_z_score( + control, + signal, + tsNew, + coords, + artifactsRemovalMethod, + filter_window, + isosbestic_control, + zscore_method, + baseline_start, + baseline_end, + ) + write_zscore(filepath, name, z_score, dff, control_fit, temp_control_arr) + + logger.info(f"z-score for the data in {filepath} computed.") - compute_z_score(filepath, inputParameters) if not remove_artifacts: visualizeControlAndSignal(filepath, removeArtifacts=remove_artifacts) From 4d49fd973f34b31af1b24bd66086e056004ea076 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Tue, 16 Dec 2025 13:47:15 -0800 Subject: [PATCH 097/150] Refactored read-out of addingNaNValues --- src/guppy/analysis/artifact_removal.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/guppy/analysis/artifact_removal.py b/src/guppy/analysis/artifact_removal.py index ac483bb..0106ec6 100644 --- a/src/guppy/analysis/artifact_removal.py +++ b/src/guppy/analysis/artifact_removal.py @@ -27,12 +27,15 @@ def addingNaNtoChunksWithArtifacts(filepath, events): if name_1[-1] == name_2[-1]: name = name_1[-1] sampling_rate = read_hdf5("timeCorrection_" + name, filepath, "sampling_rate")[0] + ts = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") + coords = fetchCoords(filepath, name, ts) for i in range(len(storesList)): if ( "control_" + name.lower() in storesList[i].lower() or "signal_" + name.lower() in storesList[i].lower() ): # changes done - data = addingNaNValues(filepath, storesList[i], name) + data = read_hdf5(storesList[i], filepath, "data").reshape(-1) + data = addingNaNValues(data=data, ts=ts, coords=coords) write_hdf5(data, storesList[i], filepath, "data") else: if "control" in storesList[i].lower() or "signal" in storesList[i].lower(): @@ -151,11 +154,7 @@ def eliminateTs(filepath, timeForLightsTurnOn, event, sampling_rate, naming): # adding nan values to removed chunks # when using artifacts removal method - replace with NaN -def addingNaNValues(filepath, event, naming): - - ts = read_hdf5("timeCorrection_" + naming, filepath, "timestampNew") - data = read_hdf5(event, filepath, "data").reshape(-1) - coords = fetchCoords(filepath, naming, ts) +def addingNaNValues(*, data, ts, coords): if (data == 0).all() == True: data = np.zeros(ts.shape[0]) From a80f080e3f0b9fc69c5b3b83f42020ab599f82f9 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Tue, 16 Dec 2025 13:54:44 -0800 Subject: [PATCH 098/150] Refactored read out of removeTTLs --- src/guppy/analysis/artifact_removal.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/guppy/analysis/artifact_removal.py 
b/src/guppy/analysis/artifact_removal.py index 0106ec6..599372e 100644 --- a/src/guppy/analysis/artifact_removal.py +++ b/src/guppy/analysis/artifact_removal.py @@ -41,7 +41,8 @@ def addingNaNtoChunksWithArtifacts(filepath, events): if "control" in storesList[i].lower() or "signal" in storesList[i].lower(): continue else: - ts = removeTTLs(filepath, storesList[i], name) + ts = read_hdf5(storesList[i] + "_" + name, filepath, "ts").reshape(-1) + ts = removeTTLs(ts=ts, coords=coords) write_hdf5(ts, storesList[i] + "_" + name, filepath, "ts") else: @@ -174,11 +175,7 @@ def addingNaNValues(*, data, ts, coords): # remove event TTLs which falls in the removed chunks # when using artifacts removal method - replace with NaN -def removeTTLs(filepath, event, naming): - tsNew = read_hdf5("timeCorrection_" + naming, filepath, "timestampNew") - ts = read_hdf5(event + "_" + naming, filepath, "ts").reshape(-1) - coords = fetchCoords(filepath, naming, tsNew) - +def removeTTLs(*, ts, coords): ts_arr = np.array([]) for i in range(coords.shape[0]): ts_index = np.where((ts > coords[i, 0]) & (ts < coords[i, 1]))[0] From 1b2066d5db7d2ad364c21e7f968a02fffd490f73 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Tue, 16 Dec 2025 14:08:48 -0800 Subject: [PATCH 099/150] Refactored read out of eliminateData and eliminateTs --- src/guppy/analysis/artifact_removal.py | 32 ++++++++++++++++---------- 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/src/guppy/analysis/artifact_removal.py b/src/guppy/analysis/artifact_removal.py index 599372e..f7e95a3 100644 --- a/src/guppy/analysis/artifact_removal.py +++ b/src/guppy/analysis/artifact_removal.py @@ -73,15 +73,31 @@ def processTimestampsForArtifacts(filepath, timeForLightsTurnOn, events): "control_" + name.lower() in storesList[i].lower() or "signal_" + name.lower() in storesList[i].lower() ): # changes done + ts = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") + data = read_hdf5(storesList[i], filepath, "data").reshape(-1) + coords = fetchCoords(filepath, name, ts) data, timestampNew = eliminateData( - filepath, timeForLightsTurnOn, storesList[i], sampling_rate, name + data=data, + ts=ts, + coords=coords, + timeForLightsTurnOn=timeForLightsTurnOn, + sampling_rate=sampling_rate, ) write_hdf5(data, storesList[i], filepath, "data") else: if "control" in storesList[i].lower() or "signal" in storesList[i].lower(): continue else: - ts = eliminateTs(filepath, timeForLightsTurnOn, storesList[i], sampling_rate, name) + tsNew = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") + ts = read_hdf5(storesList[i] + "_" + name, filepath, "ts").reshape(-1) + coords = fetchCoords(filepath, name, tsNew) + ts = eliminateTs( + ts=ts, + tsNew=tsNew, + coords=coords, + timeForLightsTurnOn=timeForLightsTurnOn, + sampling_rate=sampling_rate, + ) write_hdf5(ts, storesList[i] + "_" + name, filepath, "ts") # timestamp_dict[name] = timestampNew @@ -93,11 +109,7 @@ def processTimestampsForArtifacts(filepath, timeForLightsTurnOn, events): # helper function to process control and signal timestamps -def eliminateData(filepath, timeForLightsTurnOn, event, sampling_rate, naming): - - ts = read_hdf5("timeCorrection_" + naming, filepath, "timestampNew") - data = read_hdf5(event, filepath, "data").reshape(-1) - coords = fetchCoords(filepath, naming, ts) +def eliminateData(*, data, ts, coords, timeForLightsTurnOn, sampling_rate): if (data == 0).all() == True: data = np.zeros(ts.shape[0]) @@ -126,11 +138,7 @@ def eliminateData(filepath, timeForLightsTurnOn, event, 
sampling_rate, naming): # helper function to align event timestamps with the control and signal timestamps -def eliminateTs(filepath, timeForLightsTurnOn, event, sampling_rate, naming): - - tsNew = read_hdf5("timeCorrection_" + naming, filepath, "timestampNew") - ts = read_hdf5(event + "_" + naming, filepath, "ts").reshape(-1) - coords = fetchCoords(filepath, naming, tsNew) +def eliminateTs(*, ts, tsNew, coords, timeForLightsTurnOn, sampling_rate): ts_arr = np.array([]) tsNew_arr = np.array([]) From 7275b50342300cbb73146824d7e317663506b089 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Tue, 16 Dec 2025 14:31:34 -0800 Subject: [PATCH 100/150] cleaned up addingNaNtoChunksWithArtifacts --- src/guppy/analysis/artifact_removal.py | 47 ++++++++++++-------------- 1 file changed, 22 insertions(+), 25 deletions(-) diff --git a/src/guppy/analysis/artifact_removal.py b/src/guppy/analysis/artifact_removal.py index f7e95a3..a17eb0e 100644 --- a/src/guppy/analysis/artifact_removal.py +++ b/src/guppy/analysis/artifact_removal.py @@ -13,41 +13,38 @@ logger = logging.getLogger(__name__) -def addingNaNtoChunksWithArtifacts(filepath, events): +def addingNaNtoChunksWithArtifacts(filepath, storesList): logger.debug("Replacing chunks with artifacts by NaN values.") - storesList = events[1, :] + names_for_storenames = storesList[1, :] path = decide_naming_convention(filepath) for j in range(path.shape[1]): name_1 = ((os.path.basename(path[0, j])).split(".")[0]).split("_") name_2 = ((os.path.basename(path[1, j])).split(".")[0]).split("_") - # dirname = os.path.dirname(path[i]) - if name_1[-1] == name_2[-1]: - name = name_1[-1] - sampling_rate = read_hdf5("timeCorrection_" + name, filepath, "sampling_rate")[0] - ts = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") - coords = fetchCoords(filepath, name, ts) - for i in range(len(storesList)): - if ( - "control_" + name.lower() in storesList[i].lower() - or "signal_" + name.lower() in storesList[i].lower() - ): # changes done - data = read_hdf5(storesList[i], filepath, "data").reshape(-1) - data = addingNaNValues(data=data, ts=ts, coords=coords) - write_hdf5(data, storesList[i], filepath, "data") - else: - if "control" in storesList[i].lower() or "signal" in storesList[i].lower(): - continue - else: - ts = read_hdf5(storesList[i] + "_" + name, filepath, "ts").reshape(-1) - ts = removeTTLs(ts=ts, coords=coords) - write_hdf5(ts, storesList[i] + "_" + name, filepath, "ts") - - else: + if name_1[-1] != name_2[-1]: logger.error("Error in naming convention of files or Error in storesList file") raise Exception("Error in naming convention of files or Error in storesList file") + name = name_1[-1] + + sampling_rate = read_hdf5("timeCorrection_" + name, filepath, "sampling_rate")[0] + ts = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") + coords = fetchCoords(filepath, name, ts) + for i in range(len(names_for_storenames)): + if ( + "control_" + name.lower() in names_for_storenames[i].lower() + or "signal_" + name.lower() in names_for_storenames[i].lower() + ): # changes done + data = read_hdf5(names_for_storenames[i], filepath, "data").reshape(-1) + data = addingNaNValues(data=data, ts=ts, coords=coords) + write_hdf5(data, names_for_storenames[i], filepath, "data") + else: + if "control" in names_for_storenames[i].lower() or "signal" in names_for_storenames[i].lower(): + continue + ts = read_hdf5(names_for_storenames[i] + "_" + name, filepath, "ts").reshape(-1) + ts = removeTTLs(ts=ts, coords=coords) + write_hdf5(ts, names_for_storenames[i] + 
"_" + name, filepath, "ts") logger.info("Chunks with artifacts are replaced by NaN values.") From 07dcfa80ede5a9b5ba15ab7b27da5319aa2ec709 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Tue, 16 Dec 2025 15:08:14 -0800 Subject: [PATCH 101/150] moved read to the top of addingNaNtoChunksWithArtifacts --- src/guppy/analysis/artifact_removal.py | 32 +++++++++----- src/guppy/analysis/standard_io.py | 59 ++++++++++++++++++++++++++ 2 files changed, 81 insertions(+), 10 deletions(-) diff --git a/src/guppy/analysis/artifact_removal.py b/src/guppy/analysis/artifact_removal.py index a17eb0e..3af3001 100644 --- a/src/guppy/analysis/artifact_removal.py +++ b/src/guppy/analysis/artifact_removal.py @@ -9,11 +9,21 @@ read_hdf5, write_hdf5, ) +from .standard_io import ( + read_control_and_signal, + read_coords_pairwise, + read_corrected_timestamps_pairwise, + read_corrected_ttl_timestamps, +) logger = logging.getLogger(__name__) def addingNaNtoChunksWithArtifacts(filepath, storesList): + name_to_data, _, _, _ = read_control_and_signal(filepath, storesList) + pair_name_to_tsNew = read_corrected_timestamps_pairwise(filepath) + pair_name_to_coords = read_coords_pairwise(filepath, pair_name_to_tsNew) + compound_name_to_ttl_timestamps = read_corrected_ttl_timestamps(filepath, storesList, pair_name_to_tsNew) logger.debug("Replacing chunks with artifacts by NaN values.") names_for_storenames = storesList[1, :] @@ -26,25 +36,27 @@ def addingNaNtoChunksWithArtifacts(filepath, storesList): if name_1[-1] != name_2[-1]: logger.error("Error in naming convention of files or Error in storesList file") raise Exception("Error in naming convention of files or Error in storesList file") - name = name_1[-1] + pair_name = name_1[-1] - sampling_rate = read_hdf5("timeCorrection_" + name, filepath, "sampling_rate")[0] - ts = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") - coords = fetchCoords(filepath, name, ts) + tsNew = pair_name_to_tsNew[pair_name] + coords = pair_name_to_coords[pair_name] for i in range(len(names_for_storenames)): if ( - "control_" + name.lower() in names_for_storenames[i].lower() - or "signal_" + name.lower() in names_for_storenames[i].lower() + "control_" + pair_name.lower() in names_for_storenames[i].lower() + or "signal_" + pair_name.lower() in names_for_storenames[i].lower() ): # changes done - data = read_hdf5(names_for_storenames[i], filepath, "data").reshape(-1) - data = addingNaNValues(data=data, ts=ts, coords=coords) + # data = read_hdf5(names_for_storenames[i], filepath, "data").reshape(-1) + data = name_to_data[names_for_storenames[i]].reshape(-1) + data = addingNaNValues(data=data, ts=tsNew, coords=coords) write_hdf5(data, names_for_storenames[i], filepath, "data") else: if "control" in names_for_storenames[i].lower() or "signal" in names_for_storenames[i].lower(): continue - ts = read_hdf5(names_for_storenames[i] + "_" + name, filepath, "ts").reshape(-1) + ttl_name = names_for_storenames[i] + compound_name = ttl_name + "_" + pair_name + ts = compound_name_to_ttl_timestamps[compound_name].reshape(-1) ts = removeTTLs(ts=ts, coords=coords) - write_hdf5(ts, names_for_storenames[i] + "_" + name, filepath, "ts") + write_hdf5(ts, names_for_storenames[i] + "_" + pair_name, filepath, "ts") logger.info("Chunks with artifacts are replaced by NaN values.") diff --git a/src/guppy/analysis/standard_io.py b/src/guppy/analysis/standard_io.py index b6fcd8a..9c2b7c5 100644 --- a/src/guppy/analysis/standard_io.py +++ b/src/guppy/analysis/standard_io.py @@ -1,8 +1,11 @@ import logging +import os 
import numpy as np from .io_utils import ( + decide_naming_convention, + fetchCoords, get_control_and_signal_channel_names, read_hdf5, write_hdf5, @@ -114,3 +117,59 @@ def write_zscore(filepath, name, z_score, dff, control_fit, temp_control_arr): write_hdf5(control_fit, "cntrl_sig_fit_" + name, filepath, "data") if temp_control_arr is not None: write_hdf5(temp_control_arr, "control_" + name, filepath, "data") + + +def read_corrected_timestamps_pairwise(filepath): + pair_name_to_tsNew = {} + path = decide_naming_convention(filepath) + for j in range(path.shape[1]): + name_1 = ((os.path.basename(path[0, j])).split(".")[0]).split("_") + name_2 = ((os.path.basename(path[1, j])).split(".")[0]).split("_") + if name_1[-1] != name_2[-1]: + logger.error("Error in naming convention of files or Error in storesList file") + raise Exception("Error in naming convention of files or Error in storesList file") + name = name_1[-1] + + tsNew = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") + pair_name_to_tsNew[name] = tsNew + return pair_name_to_tsNew + + +def read_coords_pairwise(filepath, pair_name_to_tsNew): + pair_name_to_coords = {} + path = decide_naming_convention(filepath) + for j in range(path.shape[1]): + name_1 = ((os.path.basename(path[0, j])).split(".")[0]).split("_") + name_2 = ((os.path.basename(path[1, j])).split(".")[0]).split("_") + if name_1[-1] != name_2[-1]: + logger.error("Error in naming convention of files or Error in storesList file") + raise Exception("Error in naming convention of files or Error in storesList file") + pair_name = name_1[-1] + + tsNew = pair_name_to_tsNew[pair_name] + coords = fetchCoords(filepath, pair_name, tsNew) + pair_name_to_coords[pair_name] = coords + return pair_name_to_coords + + +def read_corrected_ttl_timestamps(filepath, storesList): + compound_name_to_ttl_timestamps = {} + storenames = storesList[0, :] + names_for_storenames = storesList[1, :] + arr = get_control_and_signal_channel_names(storesList) + + for storename, name in zip(storenames, names_for_storenames): + if name in arr: + continue + ttl_name = name + for i in range(arr.shape[1]): + name_1 = arr[0, i].split("_")[-1] + name_2 = arr[1, i].split("_")[-1] + if name_1 != name_2: + logger.error("Error in naming convention of files or Error in storesList file") + raise Exception("Error in naming convention of files or Error in storesList file") + compound_name = ttl_name + "_" + name_1 + ts = read_hdf5(compound_name, filepath, "ts") + compound_name_to_ttl_timestamps[compound_name] = ts + + return compound_name_to_ttl_timestamps From 8e037759ed2ac405ff6e615ec7ca572156b8723c Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Tue, 16 Dec 2025 15:18:43 -0800 Subject: [PATCH 102/150] moved read out of addingNaNtoChunksWithArtifacts --- src/guppy/analysis/artifact_removal.py | 16 +++------------- src/guppy/preprocess.py | 16 +++++++++++++++- 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/src/guppy/analysis/artifact_removal.py b/src/guppy/analysis/artifact_removal.py index 3af3001..97e24f3 100644 --- a/src/guppy/analysis/artifact_removal.py +++ b/src/guppy/analysis/artifact_removal.py @@ -9,22 +9,13 @@ read_hdf5, write_hdf5, ) -from .standard_io import ( - read_control_and_signal, - read_coords_pairwise, - read_corrected_timestamps_pairwise, - read_corrected_ttl_timestamps, -) logger = logging.getLogger(__name__) -def addingNaNtoChunksWithArtifacts(filepath, storesList): - name_to_data, _, _, _ = read_control_and_signal(filepath, storesList) - pair_name_to_tsNew = 
read_corrected_timestamps_pairwise(filepath) - pair_name_to_coords = read_coords_pairwise(filepath, pair_name_to_tsNew) - compound_name_to_ttl_timestamps = read_corrected_ttl_timestamps(filepath, storesList, pair_name_to_tsNew) - +def addingNaNtoChunksWithArtifacts( + filepath, storesList, pair_name_to_tsNew, pair_name_to_coords, name_to_data, compound_name_to_ttl_timestamps +): logger.debug("Replacing chunks with artifacts by NaN values.") names_for_storenames = storesList[1, :] @@ -45,7 +36,6 @@ def addingNaNtoChunksWithArtifacts(filepath, storesList): "control_" + pair_name.lower() in names_for_storenames[i].lower() or "signal_" + pair_name.lower() in names_for_storenames[i].lower() ): # changes done - # data = read_hdf5(names_for_storenames[i], filepath, "data").reshape(-1) data = name_to_data[names_for_storenames[i]].reshape(-1) data = addingNaNValues(data=data, ts=tsNew, coords=coords) write_hdf5(data, names_for_storenames[i], filepath, "data") diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index 5829a2d..184d9fa 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -24,7 +24,10 @@ ) from .analysis.standard_io import ( read_control_and_signal, + read_coords_pairwise, read_corrected_data, + read_corrected_timestamps_pairwise, + read_corrected_ttl_timestamps, read_ttl, write_corrected_data, write_corrected_timestamps, @@ -374,7 +377,18 @@ def execute_artifact_removal(folderNames, inputParameters): if artifactsRemovalMethod == "concatenate": processTimestampsForArtifacts(filepath, timeForLightsTurnOn, storesList) else: - addingNaNtoChunksWithArtifacts(filepath, storesList) + name_to_data, _, _, _ = read_control_and_signal(filepath, storesList) + pair_name_to_tsNew = read_corrected_timestamps_pairwise(filepath) + pair_name_to_coords = read_coords_pairwise(filepath, pair_name_to_tsNew) + compound_name_to_ttl_timestamps = read_corrected_ttl_timestamps(filepath, storesList) + addingNaNtoChunksWithArtifacts( + filepath, + storesList, + pair_name_to_tsNew, + pair_name_to_coords, + name_to_data, + compound_name_to_ttl_timestamps, + ) visualizeControlAndSignal(filepath, removeArtifacts=True) writeToFile(str(10 + ((inputParameters["step"] + 1) * 10)) + "\n") From a87c507144e8ebe5968a22d68718f716dee44d67 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Tue, 16 Dec 2025 17:15:18 -0800 Subject: [PATCH 103/150] fixed data read bug --- src/guppy/analysis/standard_io.py | 15 +++++++++++++++ src/guppy/preprocess.py | 3 ++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/src/guppy/analysis/standard_io.py b/src/guppy/analysis/standard_io.py index 9c2b7c5..f8d291b 100644 --- a/src/guppy/analysis/standard_io.py +++ b/src/guppy/analysis/standard_io.py @@ -152,6 +152,21 @@ def read_coords_pairwise(filepath, pair_name_to_tsNew): return pair_name_to_coords +def read_corrected_data_dict(filepath, storesList): # TODO: coordinate with read_corrected_data + name_to_corrected_data = {} + storenames = storesList[0, :] + names_for_storenames = storesList[1, :] + control_and_signal_names = get_control_and_signal_channel_names(storesList) + + for storename, name in zip(storenames, names_for_storenames): + if name not in control_and_signal_names: + continue + data = read_hdf5(name, filepath, "data").reshape(-1) + name_to_corrected_data[name] = data + + return name_to_corrected_data + + def read_corrected_ttl_timestamps(filepath, storesList): compound_name_to_ttl_timestamps = {} storenames = storesList[0, :] diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py 
index 184d9fa..0c0e176 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -26,6 +26,7 @@ read_control_and_signal, read_coords_pairwise, read_corrected_data, + read_corrected_data_dict, read_corrected_timestamps_pairwise, read_corrected_ttl_timestamps, read_ttl, @@ -377,7 +378,7 @@ def execute_artifact_removal(folderNames, inputParameters): if artifactsRemovalMethod == "concatenate": processTimestampsForArtifacts(filepath, timeForLightsTurnOn, storesList) else: - name_to_data, _, _, _ = read_control_and_signal(filepath, storesList) + name_to_data = read_corrected_data_dict(filepath, storesList) pair_name_to_tsNew = read_corrected_timestamps_pairwise(filepath) pair_name_to_coords = read_coords_pairwise(filepath, pair_name_to_tsNew) compound_name_to_ttl_timestamps = read_corrected_ttl_timestamps(filepath, storesList) From b1cbc836971c2faee6c1b633a7ee3d7122e398c2 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Tue, 16 Dec 2025 17:29:08 -0800 Subject: [PATCH 104/150] Refactored write operations out of addingNaNtoChunksWithArtifacts --- src/guppy/analysis/artifact_removal.py | 8 ++++++-- src/guppy/analysis/standard_io.py | 13 +++++++++++++ src/guppy/preprocess.py | 7 ++++++- 3 files changed, 25 insertions(+), 3 deletions(-) diff --git a/src/guppy/analysis/artifact_removal.py b/src/guppy/analysis/artifact_removal.py index 97e24f3..db40e64 100644 --- a/src/guppy/analysis/artifact_removal.py +++ b/src/guppy/analysis/artifact_removal.py @@ -21,6 +21,8 @@ def addingNaNtoChunksWithArtifacts( path = decide_naming_convention(filepath) + name_to_corrected_data = {} + compound_name_to_corrected_ttl_timestamps = {} for j in range(path.shape[1]): name_1 = ((os.path.basename(path[0, j])).split(".")[0]).split("_") name_2 = ((os.path.basename(path[1, j])).split(".")[0]).split("_") @@ -38,7 +40,7 @@ def addingNaNtoChunksWithArtifacts( ): # changes done data = name_to_data[names_for_storenames[i]].reshape(-1) data = addingNaNValues(data=data, ts=tsNew, coords=coords) - write_hdf5(data, names_for_storenames[i], filepath, "data") + name_to_corrected_data[names_for_storenames[i]] = data else: if "control" in names_for_storenames[i].lower() or "signal" in names_for_storenames[i].lower(): continue @@ -46,9 +48,11 @@ def addingNaNtoChunksWithArtifacts( compound_name = ttl_name + "_" + pair_name ts = compound_name_to_ttl_timestamps[compound_name].reshape(-1) ts = removeTTLs(ts=ts, coords=coords) - write_hdf5(ts, names_for_storenames[i] + "_" + pair_name, filepath, "ts") + compound_name_to_corrected_ttl_timestamps[compound_name] = ts logger.info("Chunks with artifacts are replaced by NaN values.") + return name_to_corrected_data, compound_name_to_corrected_ttl_timestamps + # main function to align timestamps for control, signal and event timestamps for artifacts removal def processTimestampsForArtifacts(filepath, timeForLightsTurnOn, events): diff --git a/src/guppy/analysis/standard_io.py b/src/guppy/analysis/standard_io.py index f8d291b..ad7408e 100644 --- a/src/guppy/analysis/standard_io.py +++ b/src/guppy/analysis/standard_io.py @@ -188,3 +188,16 @@ def read_corrected_ttl_timestamps(filepath, storesList): compound_name_to_ttl_timestamps[compound_name] = ts return compound_name_to_ttl_timestamps + + +def write_nan_corrected_data(filepath, name_to_corrected_data): + for name, data in name_to_corrected_data.items(): + write_hdf5(data, name, filepath, "data") + + +def write_nan_corrected_ttl_timestamps( + filepath, + compound_name_to_corrected_ttl_timestamps, +): + for compound_name, 
corrected_ttl_timestamps in compound_name_to_corrected_ttl_timestamps.items(): + write_hdf5(corrected_ttl_timestamps, compound_name, filepath, "ts") diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index 0c0e176..a625bc9 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -33,6 +33,8 @@ write_corrected_data, write_corrected_timestamps, write_corrected_ttl_timestamps, + write_nan_corrected_data, + write_nan_corrected_ttl_timestamps, write_zscore, ) from .analysis.timestamp_correction import correct_timestamps @@ -382,7 +384,7 @@ def execute_artifact_removal(folderNames, inputParameters): pair_name_to_tsNew = read_corrected_timestamps_pairwise(filepath) pair_name_to_coords = read_coords_pairwise(filepath, pair_name_to_tsNew) compound_name_to_ttl_timestamps = read_corrected_ttl_timestamps(filepath, storesList) - addingNaNtoChunksWithArtifacts( + name_to_data, compound_name_to_ttl_timestamps = addingNaNtoChunksWithArtifacts( filepath, storesList, pair_name_to_tsNew, @@ -390,6 +392,9 @@ def execute_artifact_removal(folderNames, inputParameters): name_to_data, compound_name_to_ttl_timestamps, ) + write_nan_corrected_data(filepath, name_to_data) + write_nan_corrected_ttl_timestamps(filepath, compound_name_to_ttl_timestamps) + visualizeControlAndSignal(filepath, removeArtifacts=True) writeToFile(str(10 + ((inputParameters["step"] + 1) * 10)) + "\n") From 393d3aa79fbbccb9335d73612d2747ef131d1421 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 17 Dec 2025 08:15:06 -0800 Subject: [PATCH 105/150] Refactored filepath out of addingNaNtoChunksWithArtifacts --- src/guppy/analysis/artifact_removal.py | 14 +++----------- src/guppy/preprocess.py | 1 - 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/src/guppy/analysis/artifact_removal.py b/src/guppy/analysis/artifact_removal.py index db40e64..556a719 100644 --- a/src/guppy/analysis/artifact_removal.py +++ b/src/guppy/analysis/artifact_removal.py @@ -14,23 +14,15 @@ def addingNaNtoChunksWithArtifacts( - filepath, storesList, pair_name_to_tsNew, pair_name_to_coords, name_to_data, compound_name_to_ttl_timestamps + storesList, pair_name_to_tsNew, pair_name_to_coords, name_to_data, compound_name_to_ttl_timestamps ): logger.debug("Replacing chunks with artifacts by NaN values.") names_for_storenames = storesList[1, :] - - path = decide_naming_convention(filepath) + pair_names = pair_name_to_tsNew.keys() name_to_corrected_data = {} compound_name_to_corrected_ttl_timestamps = {} - for j in range(path.shape[1]): - name_1 = ((os.path.basename(path[0, j])).split(".")[0]).split("_") - name_2 = ((os.path.basename(path[1, j])).split(".")[0]).split("_") - if name_1[-1] != name_2[-1]: - logger.error("Error in naming convention of files or Error in storesList file") - raise Exception("Error in naming convention of files or Error in storesList file") - pair_name = name_1[-1] - + for pair_name in pair_names: tsNew = pair_name_to_tsNew[pair_name] coords = pair_name_to_coords[pair_name] for i in range(len(names_for_storenames)): diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index a625bc9..8555b55 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -385,7 +385,6 @@ def execute_artifact_removal(folderNames, inputParameters): pair_name_to_coords = read_coords_pairwise(filepath, pair_name_to_tsNew) compound_name_to_ttl_timestamps = read_corrected_ttl_timestamps(filepath, storesList) name_to_data, compound_name_to_ttl_timestamps = addingNaNtoChunksWithArtifacts( - filepath, storesList, 
pair_name_to_tsNew, pair_name_to_coords, From 22f4f182c24851a6aa2e9abb62f85dc71e96551d Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 17 Dec 2025 08:18:09 -0800 Subject: [PATCH 106/150] Renamed some variables in processTimestampsForArtifacts --- src/guppy/analysis/artifact_removal.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/guppy/analysis/artifact_removal.py b/src/guppy/analysis/artifact_removal.py index 556a719..51c2d19 100644 --- a/src/guppy/analysis/artifact_removal.py +++ b/src/guppy/analysis/artifact_removal.py @@ -47,10 +47,10 @@ def addingNaNtoChunksWithArtifacts( # main function to align timestamps for control, signal and event timestamps for artifacts removal -def processTimestampsForArtifacts(filepath, timeForLightsTurnOn, events): +def processTimestampsForArtifacts(filepath, timeForLightsTurnOn, storesList): logger.debug("Processing timestamps to get rid of artifacts using concatenate method...") - storesList = events[1, :] + names_for_storenames = storesList[1, :] path = decide_naming_convention(filepath) @@ -63,13 +63,13 @@ def processTimestampsForArtifacts(filepath, timeForLightsTurnOn, events): name = name_1[-1] sampling_rate = read_hdf5("timeCorrection_" + name, filepath, "sampling_rate")[0] - for i in range(len(storesList)): + for i in range(len(names_for_storenames)): if ( - "control_" + name.lower() in storesList[i].lower() - or "signal_" + name.lower() in storesList[i].lower() + "control_" + name.lower() in names_for_storenames[i].lower() + or "signal_" + name.lower() in names_for_storenames[i].lower() ): # changes done ts = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") - data = read_hdf5(storesList[i], filepath, "data").reshape(-1) + data = read_hdf5(names_for_storenames[i], filepath, "data").reshape(-1) coords = fetchCoords(filepath, name, ts) data, timestampNew = eliminateData( data=data, @@ -78,13 +78,13 @@ def processTimestampsForArtifacts(filepath, timeForLightsTurnOn, events): timeForLightsTurnOn=timeForLightsTurnOn, sampling_rate=sampling_rate, ) - write_hdf5(data, storesList[i], filepath, "data") + write_hdf5(data, names_for_storenames[i], filepath, "data") else: - if "control" in storesList[i].lower() or "signal" in storesList[i].lower(): + if "control" in names_for_storenames[i].lower() or "signal" in names_for_storenames[i].lower(): continue else: tsNew = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") - ts = read_hdf5(storesList[i] + "_" + name, filepath, "ts").reshape(-1) + ts = read_hdf5(names_for_storenames[i] + "_" + name, filepath, "ts").reshape(-1) coords = fetchCoords(filepath, name, tsNew) ts = eliminateTs( ts=ts, @@ -93,7 +93,7 @@ def processTimestampsForArtifacts(filepath, timeForLightsTurnOn, events): timeForLightsTurnOn=timeForLightsTurnOn, sampling_rate=sampling_rate, ) - write_hdf5(ts, storesList[i] + "_" + name, filepath, "ts") + write_hdf5(ts, names_for_storenames[i] + "_" + name, filepath, "ts") # timestamp_dict[name] = timestampNew write_hdf5(timestampNew, "timeCorrection_" + name, filepath, "timestampNew") From a4a162f2267a295ed89d3ee3aca7188f23e596fb Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 17 Dec 2025 08:23:21 -0800 Subject: [PATCH 107/150] Refactored read out of processTimestampsForArtifacts --- src/guppy/analysis/artifact_removal.py | 24 ++++++++++++++++-------- src/guppy/preprocess.py | 14 +++++++++++++- 2 files changed, 29 insertions(+), 9 deletions(-) diff --git a/src/guppy/analysis/artifact_removal.py 
b/src/guppy/analysis/artifact_removal.py index 51c2d19..8e78669 100644 --- a/src/guppy/analysis/artifact_removal.py +++ b/src/guppy/analysis/artifact_removal.py @@ -5,7 +5,6 @@ from .io_utils import ( decide_naming_convention, - fetchCoords, read_hdf5, write_hdf5, ) @@ -47,7 +46,15 @@ def addingNaNtoChunksWithArtifacts( # main function to align timestamps for control, signal and event timestamps for artifacts removal -def processTimestampsForArtifacts(filepath, timeForLightsTurnOn, storesList): +def processTimestampsForArtifacts( + filepath, + timeForLightsTurnOn, + storesList, + pair_name_to_tsNew, + pair_name_to_coords, + name_to_data, + compound_name_to_ttl_timestamps, +): logger.debug("Processing timestamps to get rid of artifacts using concatenate method...") names_for_storenames = storesList[1, :] @@ -68,9 +75,9 @@ def processTimestampsForArtifacts(filepath, timeForLightsTurnOn, storesList): "control_" + name.lower() in names_for_storenames[i].lower() or "signal_" + name.lower() in names_for_storenames[i].lower() ): # changes done - ts = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") - data = read_hdf5(names_for_storenames[i], filepath, "data").reshape(-1) - coords = fetchCoords(filepath, name, ts) + ts = pair_name_to_tsNew[name] + data = name_to_data[names_for_storenames[i]] + coords = pair_name_to_coords[name] data, timestampNew = eliminateData( data=data, ts=ts, @@ -83,9 +90,10 @@ def processTimestampsForArtifacts(filepath, timeForLightsTurnOn, storesList): if "control" in names_for_storenames[i].lower() or "signal" in names_for_storenames[i].lower(): continue else: - tsNew = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") - ts = read_hdf5(names_for_storenames[i] + "_" + name, filepath, "ts").reshape(-1) - coords = fetchCoords(filepath, name, tsNew) + compound_name = names_for_storenames[i] + "_" + name + tsNew = pair_name_to_tsNew[name] + ts = compound_name_to_ttl_timestamps[compound_name] + coords = pair_name_to_coords[name] ts = eliminateTs( ts=ts, tsNew=tsNew, diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index 8555b55..dd02bd0 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -378,7 +378,19 @@ def execute_artifact_removal(folderNames, inputParameters): logger.debug("Removing artifacts from the data...") if artifactsRemovalMethod == "concatenate": - processTimestampsForArtifacts(filepath, timeForLightsTurnOn, storesList) + name_to_data = read_corrected_data_dict(filepath, storesList) + pair_name_to_tsNew = read_corrected_timestamps_pairwise(filepath) + pair_name_to_coords = read_coords_pairwise(filepath, pair_name_to_tsNew) + compound_name_to_ttl_timestamps = read_corrected_ttl_timestamps(filepath, storesList) + processTimestampsForArtifacts( + filepath, + timeForLightsTurnOn, + storesList, + pair_name_to_tsNew, + pair_name_to_coords, + name_to_data, + compound_name_to_ttl_timestamps, + ) else: name_to_data = read_corrected_data_dict(filepath, storesList) pair_name_to_tsNew = read_corrected_timestamps_pairwise(filepath) From a25e7acaecf27e7ad1fd4667478af72509205e35 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 17 Dec 2025 08:31:16 -0800 Subject: [PATCH 108/150] Refactored read out of processTimestampsForArtifacts --- src/guppy/analysis/artifact_removal.py | 5 ++--- src/guppy/analysis/standard_io.py | 5 ++++- src/guppy/preprocess.py | 5 +++-- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/src/guppy/analysis/artifact_removal.py b/src/guppy/analysis/artifact_removal.py index 8e78669..852cfb8 
100644 --- a/src/guppy/analysis/artifact_removal.py +++ b/src/guppy/analysis/artifact_removal.py @@ -5,7 +5,6 @@ from .io_utils import ( decide_naming_convention, - read_hdf5, write_hdf5, ) @@ -51,6 +50,7 @@ def processTimestampsForArtifacts( timeForLightsTurnOn, storesList, pair_name_to_tsNew, + pair_name_to_sampling_rate, pair_name_to_coords, name_to_data, compound_name_to_ttl_timestamps, @@ -65,10 +65,9 @@ def processTimestampsForArtifacts( for j in range(path.shape[1]): name_1 = ((os.path.basename(path[0, j])).split(".")[0]).split("_") name_2 = ((os.path.basename(path[1, j])).split(".")[0]).split("_") - # dirname = os.path.dirname(path[i]) if name_1[-1] == name_2[-1]: name = name_1[-1] - sampling_rate = read_hdf5("timeCorrection_" + name, filepath, "sampling_rate")[0] + sampling_rate = pair_name_to_sampling_rate[name] for i in range(len(names_for_storenames)): if ( diff --git a/src/guppy/analysis/standard_io.py b/src/guppy/analysis/standard_io.py index ad7408e..bba3d20 100644 --- a/src/guppy/analysis/standard_io.py +++ b/src/guppy/analysis/standard_io.py @@ -121,6 +121,7 @@ def write_zscore(filepath, name, z_score, dff, control_fit, temp_control_arr): def read_corrected_timestamps_pairwise(filepath): pair_name_to_tsNew = {} + pair_name_to_sampling_rate = {} path = decide_naming_convention(filepath) for j in range(path.shape[1]): name_1 = ((os.path.basename(path[0, j])).split(".")[0]).split("_") @@ -131,8 +132,10 @@ def read_corrected_timestamps_pairwise(filepath): name = name_1[-1] tsNew = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") + sampling_rate = read_hdf5("timeCorrection_" + name, filepath, "sampling_rate")[0] pair_name_to_tsNew[name] = tsNew - return pair_name_to_tsNew + pair_name_to_sampling_rate[name] = sampling_rate + return pair_name_to_tsNew, pair_name_to_sampling_rate def read_coords_pairwise(filepath, pair_name_to_tsNew): diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index dd02bd0..b618deb 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -379,7 +379,7 @@ def execute_artifact_removal(folderNames, inputParameters): logger.debug("Removing artifacts from the data...") if artifactsRemovalMethod == "concatenate": name_to_data = read_corrected_data_dict(filepath, storesList) - pair_name_to_tsNew = read_corrected_timestamps_pairwise(filepath) + pair_name_to_tsNew, pair_name_to_sampling_rate = read_corrected_timestamps_pairwise(filepath) pair_name_to_coords = read_coords_pairwise(filepath, pair_name_to_tsNew) compound_name_to_ttl_timestamps = read_corrected_ttl_timestamps(filepath, storesList) processTimestampsForArtifacts( @@ -387,13 +387,14 @@ def execute_artifact_removal(folderNames, inputParameters): timeForLightsTurnOn, storesList, pair_name_to_tsNew, + pair_name_to_sampling_rate, pair_name_to_coords, name_to_data, compound_name_to_ttl_timestamps, ) else: name_to_data = read_corrected_data_dict(filepath, storesList) - pair_name_to_tsNew = read_corrected_timestamps_pairwise(filepath) + pair_name_to_tsNew, _ = read_corrected_timestamps_pairwise(filepath) pair_name_to_coords = read_coords_pairwise(filepath, pair_name_to_tsNew) compound_name_to_ttl_timestamps = read_corrected_ttl_timestamps(filepath, storesList) name_to_data, compound_name_to_ttl_timestamps = addingNaNtoChunksWithArtifacts( From 3c7057916867bf451f809156735288c329984b4a Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 17 Dec 2025 08:36:35 -0800 Subject: [PATCH 109/150] Reorganized processTimestampsForArtifacts --- src/guppy/analysis/artifact_removal.py | 
77 ++++++++++++-------------- 1 file changed, 36 insertions(+), 41 deletions(-) diff --git a/src/guppy/analysis/artifact_removal.py b/src/guppy/analysis/artifact_removal.py index 852cfb8..ebc1df9 100644 --- a/src/guppy/analysis/artifact_removal.py +++ b/src/guppy/analysis/artifact_removal.py @@ -61,52 +61,47 @@ def processTimestampsForArtifacts( path = decide_naming_convention(filepath) - timestamp_dict = dict() for j in range(path.shape[1]): name_1 = ((os.path.basename(path[0, j])).split(".")[0]).split("_") name_2 = ((os.path.basename(path[1, j])).split(".")[0]).split("_") - if name_1[-1] == name_2[-1]: - name = name_1[-1] - sampling_rate = pair_name_to_sampling_rate[name] - - for i in range(len(names_for_storenames)): - if ( - "control_" + name.lower() in names_for_storenames[i].lower() - or "signal_" + name.lower() in names_for_storenames[i].lower() - ): # changes done - ts = pair_name_to_tsNew[name] - data = name_to_data[names_for_storenames[i]] - coords = pair_name_to_coords[name] - data, timestampNew = eliminateData( - data=data, - ts=ts, - coords=coords, - timeForLightsTurnOn=timeForLightsTurnOn, - sampling_rate=sampling_rate, - ) - write_hdf5(data, names_for_storenames[i], filepath, "data") - else: - if "control" in names_for_storenames[i].lower() or "signal" in names_for_storenames[i].lower(): - continue - else: - compound_name = names_for_storenames[i] + "_" + name - tsNew = pair_name_to_tsNew[name] - ts = compound_name_to_ttl_timestamps[compound_name] - coords = pair_name_to_coords[name] - ts = eliminateTs( - ts=ts, - tsNew=tsNew, - coords=coords, - timeForLightsTurnOn=timeForLightsTurnOn, - sampling_rate=sampling_rate, - ) - write_hdf5(ts, names_for_storenames[i] + "_" + name, filepath, "ts") - - # timestamp_dict[name] = timestampNew - write_hdf5(timestampNew, "timeCorrection_" + name, filepath, "timestampNew") - else: + if name_1[-1] != name_2[-1]: logger.error("Error in naming convention of files or Error in storesList file") raise Exception("Error in naming convention of files or Error in storesList file") + name = name_1[-1] + + sampling_rate = pair_name_to_sampling_rate[name] + tsNew = pair_name_to_tsNew[name] + coords = pair_name_to_coords[name] + + for i in range(len(names_for_storenames)): + if ( + "control_" + name.lower() in names_for_storenames[i].lower() + or "signal_" + name.lower() in names_for_storenames[i].lower() + ): # changes done + data = name_to_data[names_for_storenames[i]] + data, timestampNew = eliminateData( + data=data, + ts=tsNew, + coords=coords, + timeForLightsTurnOn=timeForLightsTurnOn, + sampling_rate=sampling_rate, + ) + write_hdf5(data, names_for_storenames[i], filepath, "data") + else: + if "control" in names_for_storenames[i].lower() or "signal" in names_for_storenames[i].lower(): + continue + compound_name = names_for_storenames[i] + "_" + name + ts = compound_name_to_ttl_timestamps[compound_name] + ts = eliminateTs( + ts=ts, + tsNew=tsNew, + coords=coords, + timeForLightsTurnOn=timeForLightsTurnOn, + sampling_rate=sampling_rate, + ) + write_hdf5(ts, names_for_storenames[i] + "_" + name, filepath, "ts") + + write_hdf5(timestampNew, "timeCorrection_" + name, filepath, "timestampNew") logger.info("Timestamps processed, artifacts are removed and good chunks are concatenated.") From b7d054967b992e113a404139306bf53fbe5baab8 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 17 Dec 2025 08:56:43 -0800 Subject: [PATCH 110/150] Removed write from processTimestampsForArtifacts --- src/guppy/analysis/artifact_removal.py | 33 
+++++++++++++++++--------- src/guppy/analysis/standard_io.py | 5 ++++ src/guppy/preprocess.py | 6 ++++- 3 files changed, 32 insertions(+), 12 deletions(-) diff --git a/src/guppy/analysis/artifact_removal.py b/src/guppy/analysis/artifact_removal.py index ebc1df9..08ffc98 100644 --- a/src/guppy/analysis/artifact_removal.py +++ b/src/guppy/analysis/artifact_removal.py @@ -5,7 +5,6 @@ from .io_utils import ( decide_naming_convention, - write_hdf5, ) logger = logging.getLogger(__name__) @@ -61,22 +60,25 @@ def processTimestampsForArtifacts( path = decide_naming_convention(filepath) + name_to_corrected_data = {} + pair_name_to_corrected_timestamps = {} + compound_name_to_corrected_ttl_timestamps = {} for j in range(path.shape[1]): name_1 = ((os.path.basename(path[0, j])).split(".")[0]).split("_") name_2 = ((os.path.basename(path[1, j])).split(".")[0]).split("_") if name_1[-1] != name_2[-1]: logger.error("Error in naming convention of files or Error in storesList file") raise Exception("Error in naming convention of files or Error in storesList file") - name = name_1[-1] + pair_name = name_1[-1] - sampling_rate = pair_name_to_sampling_rate[name] - tsNew = pair_name_to_tsNew[name] - coords = pair_name_to_coords[name] + sampling_rate = pair_name_to_sampling_rate[pair_name] + tsNew = pair_name_to_tsNew[pair_name] + coords = pair_name_to_coords[pair_name] for i in range(len(names_for_storenames)): if ( - "control_" + name.lower() in names_for_storenames[i].lower() - or "signal_" + name.lower() in names_for_storenames[i].lower() + "control_" + pair_name.lower() in names_for_storenames[i].lower() + or "signal_" + pair_name.lower() in names_for_storenames[i].lower() ): # changes done data = name_to_data[names_for_storenames[i]] data, timestampNew = eliminateData( @@ -86,11 +88,13 @@ def processTimestampsForArtifacts( timeForLightsTurnOn=timeForLightsTurnOn, sampling_rate=sampling_rate, ) - write_hdf5(data, names_for_storenames[i], filepath, "data") + name_to_corrected_data[names_for_storenames[i]] = data + pair_name_to_corrected_timestamps[pair_name] = timestampNew + # write_hdf5(data, names_for_storenames[i], filepath, "data") else: if "control" in names_for_storenames[i].lower() or "signal" in names_for_storenames[i].lower(): continue - compound_name = names_for_storenames[i] + "_" + name + compound_name = names_for_storenames[i] + "_" + pair_name ts = compound_name_to_ttl_timestamps[compound_name] ts = eliminateTs( ts=ts, @@ -99,11 +103,18 @@ def processTimestampsForArtifacts( timeForLightsTurnOn=timeForLightsTurnOn, sampling_rate=sampling_rate, ) - write_hdf5(ts, names_for_storenames[i] + "_" + name, filepath, "ts") + compound_name_to_corrected_ttl_timestamps[compound_name] = ts + # write_hdf5(ts, names_for_storenames[i] + "_" + pair_name, filepath, "ts") - write_hdf5(timestampNew, "timeCorrection_" + name, filepath, "timestampNew") + # write_hdf5(timestampNew, "timeCorrection_" + pair_name, filepath, "timestampNew") logger.info("Timestamps processed, artifacts are removed and good chunks are concatenated.") + return ( + name_to_corrected_data, + pair_name_to_corrected_timestamps, + compound_name_to_corrected_ttl_timestamps, + ) + # helper function to process control and signal timestamps def eliminateData(*, data, ts, coords, timeForLightsTurnOn, sampling_rate): diff --git a/src/guppy/analysis/standard_io.py b/src/guppy/analysis/standard_io.py index bba3d20..3131da5 100644 --- a/src/guppy/analysis/standard_io.py +++ b/src/guppy/analysis/standard_io.py @@ -204,3 +204,8 @@ def 
write_nan_corrected_ttl_timestamps( ): for compound_name, corrected_ttl_timestamps in compound_name_to_corrected_ttl_timestamps.items(): write_hdf5(corrected_ttl_timestamps, compound_name, filepath, "ts") + + +def write_concat_corrected_timestamps(filepath, pair_name_to_corrected_timestamps): + for pair_name, timestamps in pair_name_to_corrected_timestamps.items(): + write_hdf5(timestamps, "timeCorrection_" + pair_name, filepath, "timestampNew") diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index b618deb..3f899a9 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -30,6 +30,7 @@ read_corrected_timestamps_pairwise, read_corrected_ttl_timestamps, read_ttl, + write_concat_corrected_timestamps, write_corrected_data, write_corrected_timestamps, write_corrected_ttl_timestamps, @@ -382,7 +383,7 @@ def execute_artifact_removal(folderNames, inputParameters): pair_name_to_tsNew, pair_name_to_sampling_rate = read_corrected_timestamps_pairwise(filepath) pair_name_to_coords = read_coords_pairwise(filepath, pair_name_to_tsNew) compound_name_to_ttl_timestamps = read_corrected_ttl_timestamps(filepath, storesList) - processTimestampsForArtifacts( + name_to_data, pair_name_to_timestamps, compound_name_to_ttl_timestamps = processTimestampsForArtifacts( filepath, timeForLightsTurnOn, storesList, @@ -392,6 +393,9 @@ def execute_artifact_removal(folderNames, inputParameters): name_to_data, compound_name_to_ttl_timestamps, ) + write_nan_corrected_data(filepath, name_to_data) + write_concat_corrected_timestamps(filepath, pair_name_to_timestamps) + write_nan_corrected_ttl_timestamps(filepath, compound_name_to_ttl_timestamps) else: name_to_data = read_corrected_data_dict(filepath, storesList) pair_name_to_tsNew, _ = read_corrected_timestamps_pairwise(filepath) From 61b2712d1aceb8bf894ad1d5868c66760b2b75f5 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 17 Dec 2025 08:57:15 -0800 Subject: [PATCH 111/150] Removed write from processTimestampsForArtifacts --- src/guppy/analysis/artifact_removal.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/guppy/analysis/artifact_removal.py b/src/guppy/analysis/artifact_removal.py index 08ffc98..4ac22c9 100644 --- a/src/guppy/analysis/artifact_removal.py +++ b/src/guppy/analysis/artifact_removal.py @@ -90,7 +90,6 @@ def processTimestampsForArtifacts( ) name_to_corrected_data[names_for_storenames[i]] = data pair_name_to_corrected_timestamps[pair_name] = timestampNew - # write_hdf5(data, names_for_storenames[i], filepath, "data") else: if "control" in names_for_storenames[i].lower() or "signal" in names_for_storenames[i].lower(): continue @@ -104,9 +103,7 @@ def processTimestampsForArtifacts( sampling_rate=sampling_rate, ) compound_name_to_corrected_ttl_timestamps[compound_name] = ts - # write_hdf5(ts, names_for_storenames[i] + "_" + pair_name, filepath, "ts") - # write_hdf5(timestampNew, "timeCorrection_" + pair_name, filepath, "timestampNew") logger.info("Timestamps processed, artifacts are removed and good chunks are concatenated.") return ( From 2dc18cc51a47a8efb03e6f093df4355a6c473a7f Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 17 Dec 2025 08:59:30 -0800 Subject: [PATCH 112/150] Refactored filepath out of processTimestampsForArtifacts --- src/guppy/analysis/artifact_removal.py | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/src/guppy/analysis/artifact_removal.py b/src/guppy/analysis/artifact_removal.py index 4ac22c9..c661e49 100644 --- a/src/guppy/analysis/artifact_removal.py +++ 
b/src/guppy/analysis/artifact_removal.py @@ -1,12 +1,7 @@ import logging -import os import numpy as np -from .io_utils import ( - decide_naming_convention, -) - logger = logging.getLogger(__name__) @@ -45,7 +40,6 @@ def addingNaNtoChunksWithArtifacts( # main function to align timestamps for control, signal and event timestamps for artifacts removal def processTimestampsForArtifacts( - filepath, timeForLightsTurnOn, storesList, pair_name_to_tsNew, @@ -54,23 +48,14 @@ def processTimestampsForArtifacts( name_to_data, compound_name_to_ttl_timestamps, ): - logger.debug("Processing timestamps to get rid of artifacts using concatenate method...") names_for_storenames = storesList[1, :] - - path = decide_naming_convention(filepath) + pair_names = pair_name_to_tsNew.keys() name_to_corrected_data = {} pair_name_to_corrected_timestamps = {} compound_name_to_corrected_ttl_timestamps = {} - for j in range(path.shape[1]): - name_1 = ((os.path.basename(path[0, j])).split(".")[0]).split("_") - name_2 = ((os.path.basename(path[1, j])).split(".")[0]).split("_") - if name_1[-1] != name_2[-1]: - logger.error("Error in naming convention of files or Error in storesList file") - raise Exception("Error in naming convention of files or Error in storesList file") - pair_name = name_1[-1] - + for pair_name in pair_names: sampling_rate = pair_name_to_sampling_rate[pair_name] tsNew = pair_name_to_tsNew[pair_name] coords = pair_name_to_coords[pair_name] From bfb18e058f3cae8abffde64412895d913a5a2c46 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 17 Dec 2025 09:10:11 -0800 Subject: [PATCH 113/150] Consolidated write operations --- src/guppy/analysis/standard_io.py | 23 +++++++++++------------ src/guppy/preprocess.py | 25 ++++++++----------------- 2 files changed, 19 insertions(+), 29 deletions(-) diff --git a/src/guppy/analysis/standard_io.py b/src/guppy/analysis/standard_io.py index 3131da5..89f1b40 100644 --- a/src/guppy/analysis/standard_io.py +++ b/src/guppy/analysis/standard_io.py @@ -193,19 +193,18 @@ def read_corrected_ttl_timestamps(filepath, storesList): return compound_name_to_ttl_timestamps -def write_nan_corrected_data(filepath, name_to_corrected_data): - for name, data in name_to_corrected_data.items(): - write_hdf5(data, name, filepath, "data") +def write_artifact_corrected_timestamps(filepath, pair_name_to_corrected_timestamps): + for pair_name, timestamps in pair_name_to_corrected_timestamps.items(): + write_hdf5(timestamps, "timeCorrection_" + pair_name, filepath, "timestampNew") -def write_nan_corrected_ttl_timestamps( +def write_artifact_removal( filepath, - compound_name_to_corrected_ttl_timestamps, + name_to_corrected_data, + pair_name_to_corrected_timestamps, + compound_name_to_corrected_ttl_timestamps=None, ): - for compound_name, corrected_ttl_timestamps in compound_name_to_corrected_ttl_timestamps.items(): - write_hdf5(corrected_ttl_timestamps, compound_name, filepath, "ts") - - -def write_concat_corrected_timestamps(filepath, pair_name_to_corrected_timestamps): - for pair_name, timestamps in pair_name_to_corrected_timestamps.items(): - write_hdf5(timestamps, "timeCorrection_" + pair_name, filepath, "timestampNew") + write_corrected_data(filepath, name_to_corrected_data) + write_corrected_ttl_timestamps(filepath, compound_name_to_corrected_ttl_timestamps) + if pair_name_to_corrected_timestamps is not None: + write_artifact_corrected_timestamps(filepath, pair_name_to_corrected_timestamps) diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index 3f899a9..fc90b77 100755 --- 
a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -30,12 +30,10 @@ read_corrected_timestamps_pairwise, read_corrected_ttl_timestamps, read_ttl, - write_concat_corrected_timestamps, + write_artifact_removal, write_corrected_data, write_corrected_timestamps, write_corrected_ttl_timestamps, - write_nan_corrected_data, - write_nan_corrected_ttl_timestamps, write_zscore, ) from .analysis.timestamp_correction import correct_timestamps @@ -377,14 +375,14 @@ def execute_artifact_removal(folderNames, inputParameters): filepath = storesListPath[j] storesList = np.genfromtxt(os.path.join(filepath, "storesList.csv"), dtype="str", delimiter=",").reshape(2, -1) + name_to_data = read_corrected_data_dict(filepath, storesList) + pair_name_to_tsNew, pair_name_to_sampling_rate = read_corrected_timestamps_pairwise(filepath) + pair_name_to_coords = read_coords_pairwise(filepath, pair_name_to_tsNew) + compound_name_to_ttl_timestamps = read_corrected_ttl_timestamps(filepath, storesList) + logger.debug("Removing artifacts from the data...") if artifactsRemovalMethod == "concatenate": - name_to_data = read_corrected_data_dict(filepath, storesList) - pair_name_to_tsNew, pair_name_to_sampling_rate = read_corrected_timestamps_pairwise(filepath) - pair_name_to_coords = read_coords_pairwise(filepath, pair_name_to_tsNew) - compound_name_to_ttl_timestamps = read_corrected_ttl_timestamps(filepath, storesList) name_to_data, pair_name_to_timestamps, compound_name_to_ttl_timestamps = processTimestampsForArtifacts( - filepath, timeForLightsTurnOn, storesList, pair_name_to_tsNew, @@ -393,14 +391,7 @@ def execute_artifact_removal(folderNames, inputParameters): name_to_data, compound_name_to_ttl_timestamps, ) - write_nan_corrected_data(filepath, name_to_data) - write_concat_corrected_timestamps(filepath, pair_name_to_timestamps) - write_nan_corrected_ttl_timestamps(filepath, compound_name_to_ttl_timestamps) else: - name_to_data = read_corrected_data_dict(filepath, storesList) - pair_name_to_tsNew, _ = read_corrected_timestamps_pairwise(filepath) - pair_name_to_coords = read_coords_pairwise(filepath, pair_name_to_tsNew) - compound_name_to_ttl_timestamps = read_corrected_ttl_timestamps(filepath, storesList) name_to_data, compound_name_to_ttl_timestamps = addingNaNtoChunksWithArtifacts( storesList, pair_name_to_tsNew, @@ -408,9 +399,9 @@ def execute_artifact_removal(folderNames, inputParameters): name_to_data, compound_name_to_ttl_timestamps, ) - write_nan_corrected_data(filepath, name_to_data) - write_nan_corrected_ttl_timestamps(filepath, compound_name_to_ttl_timestamps) + pair_name_to_timestamps = None + write_artifact_removal(filepath, name_to_data, pair_name_to_timestamps, compound_name_to_ttl_timestamps) visualizeControlAndSignal(filepath, removeArtifacts=True) writeToFile(str(10 + ((inputParameters["step"] + 1) * 10)) + "\n") From d4f3de43f207f84d3b7ff6ad67021f59e9263cc1 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 17 Dec 2025 09:30:02 -0800 Subject: [PATCH 114/150] Consolidated into single remove_artifacts fn --- src/guppy/analysis/artifact_removal.py | 40 ++++++++++++++++++++++++++ src/guppy/preprocess.py | 34 +++++++--------------- 2 files changed, 51 insertions(+), 23 deletions(-) diff --git a/src/guppy/analysis/artifact_removal.py b/src/guppy/analysis/artifact_removal.py index c661e49..d3da042 100644 --- a/src/guppy/analysis/artifact_removal.py +++ b/src/guppy/analysis/artifact_removal.py @@ -5,6 +5,46 @@ logger = logging.getLogger(__name__) +def remove_artifacts( + timeForLightsTurnOn, + storesList, 
+ pair_name_to_tsNew, + pair_name_to_sampling_rate, + pair_name_to_coords, + name_to_data, + compound_name_to_ttl_timestamps, + method, +): + if method == "concatenate": + name_to_corrected_data, pair_name_to_corrected_timestamps, compound_name_to_corrected_ttl_timestamps = ( + processTimestampsForArtifacts( + timeForLightsTurnOn, + storesList, + pair_name_to_tsNew, + pair_name_to_sampling_rate, + pair_name_to_coords, + name_to_data, + compound_name_to_ttl_timestamps, + ) + ) + logger.info("Artifacts removed using concatenate method.") + elif method == "replace with NaN": + name_to_corrected_data, compound_name_to_corrected_ttl_timestamps = addingNaNtoChunksWithArtifacts( + storesList, + pair_name_to_tsNew, + pair_name_to_coords, + name_to_data, + compound_name_to_ttl_timestamps, + ) + pair_name_to_corrected_timestamps = None + logger.info("Artifacts removed using NaN replacement method.") + else: + logger.error("Invalid artifact removal method specified.") + raise ValueError("Invalid artifact removal method specified.") + + return name_to_corrected_data, pair_name_to_corrected_timestamps, compound_name_to_corrected_ttl_timestamps + + def addingNaNtoChunksWithArtifacts( storesList, pair_name_to_tsNew, pair_name_to_coords, name_to_data, compound_name_to_ttl_timestamps ): diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index fc90b77..46fc7c7 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -7,10 +7,7 @@ import matplotlib.pyplot as plt import numpy as np -from .analysis.artifact_removal import ( - addingNaNtoChunksWithArtifacts, - processTimestampsForArtifacts, -) +from .analysis.artifact_removal import remove_artifacts from .analysis.combine_data import combineData from .analysis.control_channel import add_control_channel, create_control_channel from .analysis.io_utils import ( @@ -381,25 +378,16 @@ def execute_artifact_removal(folderNames, inputParameters): compound_name_to_ttl_timestamps = read_corrected_ttl_timestamps(filepath, storesList) logger.debug("Removing artifacts from the data...") - if artifactsRemovalMethod == "concatenate": - name_to_data, pair_name_to_timestamps, compound_name_to_ttl_timestamps = processTimestampsForArtifacts( - timeForLightsTurnOn, - storesList, - pair_name_to_tsNew, - pair_name_to_sampling_rate, - pair_name_to_coords, - name_to_data, - compound_name_to_ttl_timestamps, - ) - else: - name_to_data, compound_name_to_ttl_timestamps = addingNaNtoChunksWithArtifacts( - storesList, - pair_name_to_tsNew, - pair_name_to_coords, - name_to_data, - compound_name_to_ttl_timestamps, - ) - pair_name_to_timestamps = None + name_to_data, pair_name_to_timestamps, compound_name_to_ttl_timestamps = remove_artifacts( + timeForLightsTurnOn, + storesList, + pair_name_to_tsNew, + pair_name_to_sampling_rate, + pair_name_to_coords, + name_to_data, + compound_name_to_ttl_timestamps, + method=artifactsRemovalMethod, + ) write_artifact_removal(filepath, name_to_data, pair_name_to_timestamps, compound_name_to_ttl_timestamps) visualizeControlAndSignal(filepath, removeArtifacts=True) From c23aa1ddf12ba4c2525574f64e5952586db03113 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 17 Dec 2025 13:17:21 -0800 Subject: [PATCH 115/150] fixed bug with read_control_and_signal --- src/guppy/analysis/standard_io.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/guppy/analysis/standard_io.py b/src/guppy/analysis/standard_io.py index 89f1b40..e7fe8e0 100644 --- a/src/guppy/analysis/standard_io.py +++ b/src/guppy/analysis/standard_io.py 
@@ -27,8 +27,8 @@ def read_control_and_signal(filepath, storesList): for i in range(channels_arr.shape[1]): control_name = channels_arr[0, i] signal_name = channels_arr[1, i] - idx_c = np.where(storesList == control_name)[0] - idx_s = np.where(storesList == signal_name)[0] + idx_c = np.where(names_for_storenames == control_name)[0] + idx_s = np.where(names_for_storenames == signal_name)[0] control_storename = storenames[idx_c[0]] signal_storename = storenames[idx_s[0]] From 1cda972960addc599f378d699d6d8eaa2da9e12e Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 17 Dec 2025 13:18:32 -0800 Subject: [PATCH 116/150] fixed naming bug in timestampCorrection --- src/guppy/analysis/timestamp_correction.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/guppy/analysis/timestamp_correction.py b/src/guppy/analysis/timestamp_correction.py index 60cf76a..0806fb8 100644 --- a/src/guppy/analysis/timestamp_correction.py +++ b/src/guppy/analysis/timestamp_correction.py @@ -64,15 +64,15 @@ def timestampCorrection( name_to_corrected_data = {} storenames = storesList[0, :] names_for_storenames = storesList[1, :] - data = get_control_and_signal_channel_names(storesList) + channels_arr = get_control_and_signal_channel_names(storesList) - indices = check_cntrl_sig_length(data, name_to_data) + indices = check_cntrl_sig_length(channels_arr, name_to_data) - for i in range(data.shape[1]): - control_name = data[0, i] - signal_name = data[1, i] - name_1 = data[0, i].split("_")[-1] - name_2 = data[1, i].split("_")[-1] + for i in range(channels_arr.shape[1]): + control_name = channels_arr[0, i] + signal_name = channels_arr[1, i] + name_1 = channels_arr[0, i].split("_")[-1] + name_2 = channels_arr[1, i].split("_")[-1] if name_1 != name_2: logger.error("Error in naming convention of files or Error in storesList file") raise Exception("Error in naming convention of files or Error in storesList file") @@ -81,8 +81,8 @@ def timestampCorrection( idx = np.where(names_for_storenames == indices[i])[0] if idx.shape[0] == 0: - logger.error(f"{data[0,i]} does not exist in the stores list file.") - raise Exception("{} does not exist in the stores list file.".format(data[0, i])) + logger.error(f"{channels_arr[0,i]} does not exist in the stores list file.") + raise Exception("{} does not exist in the stores list file.".format(channels_arr[0, i])) name = names_for_storenames[idx][0] timestamp = name_to_timestamps[name] From 19986c81c974138d2007982badd8ae2a8dcc679a Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 19 Dec 2025 16:36:18 -0800 Subject: [PATCH 117/150] Fixed combinedata bug --- src/guppy/analysis/combine_data.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/guppy/analysis/combine_data.py b/src/guppy/analysis/combine_data.py index f89315f..cf96835 100644 --- a/src/guppy/analysis/combine_data.py +++ b/src/guppy/analysis/combine_data.py @@ -66,6 +66,8 @@ def eliminateData(filepath, timeForLightsTurnOn, event, sampling_rate, naming): for i in range(len(filepath)): ts = read_hdf5("timeCorrection_" + naming, filepath[i], "timestampNew") data = read_hdf5(event, filepath[i], "data").reshape(-1) + print(f"{ts.shape = }") + print(f"{data.shape = }") # index = np.where((ts>coords[i,0]) & (ts Date: Fri, 19 Dec 2025 16:42:28 -0800 Subject: [PATCH 118/150] Fixed combinedata bug --- src/guppy/analysis/combine_data.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/guppy/analysis/combine_data.py b/src/guppy/analysis/combine_data.py index cf96835..3da338d 100644 --- 
a/src/guppy/analysis/combine_data.py +++ b/src/guppy/analysis/combine_data.py @@ -66,8 +66,6 @@ def eliminateData(filepath, timeForLightsTurnOn, event, sampling_rate, naming): for i in range(len(filepath)): ts = read_hdf5("timeCorrection_" + naming, filepath[i], "timestampNew") data = read_hdf5(event, filepath[i], "data").reshape(-1) - print(f"{ts.shape = }") - print(f"{data.shape = }") # index = np.where((ts>coords[i,0]) & (ts Date: Fri, 19 Dec 2025 16:51:22 -0800 Subject: [PATCH 119/150] Reorganized into execute_combined_data and combine_data. --- src/guppy/analysis/combine_data.py | 49 +----------------------------- src/guppy/preprocess.py | 48 +++++++++++++++++++++++++++-- 2 files changed, 47 insertions(+), 50 deletions(-) diff --git a/src/guppy/analysis/combine_data.py b/src/guppy/analysis/combine_data.py index 3da338d..3ab73d3 100644 --- a/src/guppy/analysis/combine_data.py +++ b/src/guppy/analysis/combine_data.py @@ -1,4 +1,3 @@ -import glob import logging import os @@ -6,59 +5,13 @@ from .io_utils import ( decide_naming_convention, - get_all_stores_for_combining_data, read_hdf5, - takeOnlyDirs, write_hdf5, ) logger = logging.getLogger(__name__) -# function to combine data when there are two different data files for the same recording session -# it will combine the data, do timestamps processing and save the combined data in the first output folder. -def combineData(folderNames, inputParameters, storesList): - - logger.debug("Combining Data from different data files...") - timeForLightsTurnOn = inputParameters["timeForLightsTurnOn"] - op_folder = [] - for i in range(len(folderNames)): - filepath = folderNames[i] - op_folder.append(takeOnlyDirs(glob.glob(os.path.join(filepath, "*_output_*")))) - - op_folder = list(np.concatenate(op_folder).flatten()) - sampling_rate_fp = [] - for i in range(len(folderNames)): - filepath = folderNames[i] - storesListPath = takeOnlyDirs(glob.glob(os.path.join(filepath, "*_output_*"))) - for j in range(len(storesListPath)): - filepath = storesListPath[j] - storesList_new = np.genfromtxt( - os.path.join(filepath, "storesList.csv"), dtype="str", delimiter="," - ).reshape(2, -1) - sampling_rate_fp.append(glob.glob(os.path.join(filepath, "timeCorrection_*"))) - - # check if sampling rate is same for both data - sampling_rate_fp = np.concatenate(sampling_rate_fp) - sampling_rate = [] - for i in range(sampling_rate_fp.shape[0]): - sampling_rate.append(read_hdf5("", sampling_rate_fp[i], "sampling_rate")) - - res = all(i == sampling_rate[0] for i in sampling_rate) - if res == False: - logger.error("To combine the data, sampling rate for both the data should be same.") - raise Exception("To combine the data, sampling rate for both the data should be same.") - - # get the output folders informatinos - op = get_all_stores_for_combining_data(op_folder) - - # processing timestamps for combining the data - processTimestampsForCombiningData(op, timeForLightsTurnOn, storesList, sampling_rate[0]) - logger.info("Data is combined from different data files.") - - return op - - def eliminateData(filepath, timeForLightsTurnOn, event, sampling_rate, naming): arr = np.array([]) @@ -113,7 +66,7 @@ def eliminateTs(filepath, timeForLightsTurnOn, event, sampling_rate, naming): return ts_arr -def processTimestampsForCombiningData(filepath, timeForLightsTurnOn, events, sampling_rate): +def combine_data(filepath, timeForLightsTurnOn, events, sampling_rate): logger.debug("Processing timestamps for combining data...") diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index 
46fc7c7..17d1fb5 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -8,7 +8,7 @@ import numpy as np from .analysis.artifact_removal import remove_artifacts -from .analysis.combine_data import combineData +from .analysis.combine_data import combine_data from .analysis.control_channel import add_control_channel, create_control_channel from .analysis.io_utils import ( check_storeslistfile, @@ -399,6 +399,50 @@ def execute_artifact_removal(folderNames, inputParameters): logger.info("Artifact removal completed.") +# function to combine data when there are two different data files for the same recording session +# it will combine the data, do timestamps processing and save the combined data in the first output folder. +def execute_combine_data(folderNames, inputParameters, storesList): + + logger.debug("Combining Data from different data files...") + timeForLightsTurnOn = inputParameters["timeForLightsTurnOn"] + op_folder = [] + for i in range(len(folderNames)): + filepath = folderNames[i] + op_folder.append(takeOnlyDirs(glob.glob(os.path.join(filepath, "*_output_*")))) + + op_folder = list(np.concatenate(op_folder).flatten()) + sampling_rate_fp = [] + for i in range(len(folderNames)): + filepath = folderNames[i] + storesListPath = takeOnlyDirs(glob.glob(os.path.join(filepath, "*_output_*"))) + for j in range(len(storesListPath)): + filepath = storesListPath[j] + storesList_new = np.genfromtxt( + os.path.join(filepath, "storesList.csv"), dtype="str", delimiter="," + ).reshape(2, -1) + sampling_rate_fp.append(glob.glob(os.path.join(filepath, "timeCorrection_*"))) + + # check if sampling rate is same for both data + sampling_rate_fp = np.concatenate(sampling_rate_fp) + sampling_rate = [] + for i in range(sampling_rate_fp.shape[0]): + sampling_rate.append(read_hdf5("", sampling_rate_fp[i], "sampling_rate")) + + res = all(i == sampling_rate[0] for i in sampling_rate) + if res == False: + logger.error("To combine the data, sampling rate for both the data should be same.") + raise Exception("To combine the data, sampling rate for both the data should be same.") + + # get the output folders informatinos + op = get_all_stores_for_combining_data(op_folder) + + # processing timestamps for combining the data + combine_data(op, timeForLightsTurnOn, storesList, sampling_rate[0]) + logger.info("Data is combined from different data files.") + + return op + + def extractTsAndSignal(inputParameters): logger.debug("Extracting signal data and event timestamps...") @@ -434,7 +478,7 @@ def extractTsAndSignal(inputParameters): writeToFile(str((pbMaxValue) * 10) + "\n" + str(10) + "\n") execute_timestamp_correction(folderNames, inputParameters) storesList = check_storeslistfile(folderNames) - op_folder = combineData(folderNames, inputParameters, storesList) + op_folder = execute_combine_data(folderNames, inputParameters, storesList) execute_zscore(op_folder, inputParameters) if remove_artifacts == True: execute_artifact_removal(op_folder, inputParameters) From 042fb33c26327376cb6fe67497667e61341089c4 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 19 Dec 2025 17:14:38 -0800 Subject: [PATCH 120/150] Renamed some variables for clarity. 
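
For reference, a minimal sketch of the nested structure that combine_data iterates over, assuming two recording folders that each produced two output folders (the folder names below are illustrative only, not taken from this patch):

    # Each inner list groups the matching *_output_i folders across recordings.
    # combine_data works on one inner list at a time and, per the previous patch,
    # writes the combined arrays back into the first folder of that group.
    op = [
        ["folder1_output_0", "folder2_output_0"],
        ["folder1_output_1", "folder2_output_1"],
    ]
    for single_output_filepaths in op:
        ...  # concatenate data/timestamps across single_output_filepaths
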
--- src/guppy/analysis/combine_data.py | 28 ++++++++++++++++------------ src/guppy/preprocess.py | 1 - 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/src/guppy/analysis/combine_data.py b/src/guppy/analysis/combine_data.py index 3ab73d3..b89f9e1 100644 --- a/src/guppy/analysis/combine_data.py +++ b/src/guppy/analysis/combine_data.py @@ -12,13 +12,13 @@ logger = logging.getLogger(__name__) -def eliminateData(filepath, timeForLightsTurnOn, event, sampling_rate, naming): +def eliminateData(filepaths, timeForLightsTurnOn, event, sampling_rate, naming): arr = np.array([]) ts_arr = np.array([]) - for i in range(len(filepath)): - ts = read_hdf5("timeCorrection_" + naming, filepath[i], "timestampNew") - data = read_hdf5(event, filepath[i], "data").reshape(-1) + for i in range(len(filepaths)): + ts = read_hdf5("timeCorrection_" + naming, filepaths[i], "timestampNew") + data = read_hdf5(event, filepaths[i], "data").reshape(-1) # index = np.where((ts>coords[i,0]) & (ts Date: Fri, 19 Dec 2025 17:27:49 -0800 Subject: [PATCH 121/150] Refactored read operations out of eliminateData. --- src/guppy/analysis/combine_data.py | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/src/guppy/analysis/combine_data.py b/src/guppy/analysis/combine_data.py index b89f9e1..a63be7e 100644 --- a/src/guppy/analysis/combine_data.py +++ b/src/guppy/analysis/combine_data.py @@ -12,15 +12,16 @@ logger = logging.getLogger(__name__) -def eliminateData(filepaths, timeForLightsTurnOn, event, sampling_rate, naming): +def eliminateData(filepath_to_timestamps, filepath_to_data, timeForLightsTurnOn, event, sampling_rate, naming): arr = np.array([]) ts_arr = np.array([]) - for i in range(len(filepaths)): - ts = read_hdf5("timeCorrection_" + naming, filepaths[i], "timestampNew") - data = read_hdf5(event, filepaths[i], "data").reshape(-1) - - # index = np.where((ts>coords[i,0]) & (ts Date: Fri, 19 Dec 2025 17:36:57 -0800 Subject: [PATCH 122/150] Cleaned up some indentation in combine_data. --- src/guppy/analysis/combine_data.py | 77 +++++++++++++++--------------- 1 file changed, 39 insertions(+), 38 deletions(-) diff --git a/src/guppy/analysis/combine_data.py b/src/guppy/analysis/combine_data.py index a63be7e..e2fb719 100644 --- a/src/guppy/analysis/combine_data.py +++ b/src/guppy/analysis/combine_data.py @@ -67,12 +67,12 @@ def eliminateTs(filepath, timeForLightsTurnOn, event, sampling_rate, naming): return ts_arr -def combine_data(filepath: list[list[str]], timeForLightsTurnOn, events, sampling_rate): +def combine_data(filepath: list[list[str]], timeForLightsTurnOn, names_for_storenames, sampling_rate): # filepath = [[folder1_output_0, folder2_output_0], [folder1_output_1, folder2_output_1], ...] logger.debug("Processing timestamps for combining data...") - storesList = events[1, :] + names_for_storenames = names_for_storenames[1, :] for single_output_filepaths in filepath: # single_output_filepaths = [folder1_output_i, folder2_output_i, ...] 
@@ -81,41 +81,42 @@ def combine_data(filepath: list[list[str]], timeForLightsTurnOn, events, samplin pair_name_to_tsNew = {} for j in range(path.shape[1]): - name_1 = ((os.path.basename(path[0, j])).split(".")[0]).split("_") - name_2 = ((os.path.basename(path[1, j])).split(".")[0]).split("_") - if name_1[-1] == name_2[-1]: - name = name_1[-1] - - for i in range(len(storesList)): - if ( - "control_" + name.lower() in storesList[i].lower() - or "signal_" + name.lower() in storesList[i].lower() - ): - filepath_to_timestamps = {} - filepath_to_data = {} - for filepath in single_output_filepaths: - ts = read_hdf5("timeCorrection_" + name, filepath, "timestampNew") - data = read_hdf5(storesList[i], filepath, "data").reshape(-1) - filepath_to_timestamps[filepath] = ts - filepath_to_data[filepath] = data - - data, timestampNew = eliminateData( - filepath_to_timestamps, - filepath_to_data, - timeForLightsTurnOn, - storesList[i], - sampling_rate, - name, - ) - write_hdf5(data, storesList[i], single_output_filepaths[0], "data") - pair_name_to_tsNew[name] = timestampNew - else: - if "control" in storesList[i].lower() or "signal" in storesList[i].lower(): - continue - else: - ts = eliminateTs( - single_output_filepaths, timeForLightsTurnOn, storesList[i], sampling_rate, name - ) - write_hdf5(ts, storesList[i] + "_" + name, single_output_filepaths[0], "ts") + name_1 = ((os.path.basename(path[0, j])).split(".")[0]).split("_")[-1] + name_2 = ((os.path.basename(path[1, j])).split(".")[0]).split("_")[-1] + if name_1 != name_2: + logger.error("Error in naming convention of files or Error in storesList file") + raise Exception("Error in naming convention of files or Error in storesList file") + pair_name = name_1 + + for i in range(len(names_for_storenames)): + if ( + "control_" + pair_name.lower() in names_for_storenames[i].lower() + or "signal_" + pair_name.lower() in names_for_storenames[i].lower() + ): + filepath_to_timestamps = {} + filepath_to_data = {} + for filepath in single_output_filepaths: + ts = read_hdf5("timeCorrection_" + pair_name, filepath, "timestampNew") + data = read_hdf5(names_for_storenames[i], filepath, "data").reshape(-1) + filepath_to_timestamps[filepath] = ts + filepath_to_data[filepath] = data + + data, timestampNew = eliminateData( + filepath_to_timestamps, + filepath_to_data, + timeForLightsTurnOn, + names_for_storenames[i], + sampling_rate, + pair_name, + ) + write_hdf5(data, names_for_storenames[i], single_output_filepaths[0], "data") + pair_name_to_tsNew[pair_name] = timestampNew + else: + if "control" in names_for_storenames[i].lower() or "signal" in names_for_storenames[i].lower(): + continue + ts = eliminateTs( + single_output_filepaths, timeForLightsTurnOn, names_for_storenames[i], sampling_rate, pair_name + ) + write_hdf5(ts, names_for_storenames[i] + "_" + pair_name, single_output_filepaths[0], "ts") for pair_name, tsNew in pair_name_to_tsNew.items(): write_hdf5(tsNew, "timeCorrection_" + pair_name, single_output_filepaths[0], "timestampNew") From d3a8fbc5c302867296f8f4b2a4bb97428e56781d Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 19 Dec 2025 17:41:22 -0800 Subject: [PATCH 123/150] Refactored read operations out of eliminateTs. 
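
In rough outline, the caller now assembles the inputs before calling eliminateTs; when a per-pair TTL file does not exist, an empty array is substituted so the concatenation logic can still run. A condensed sketch of the caller side, using the read_hdf5, os and np names already imported in combine_data.py (event_name and pair_name stand in for names_for_storenames[i] and the file-pair suffix):

    filepath_to_timestamps = {}
    filepath_to_ttl_timestamps = {}
    for fp in single_output_filepaths:
        # corrected photometry timestamps for this output folder
        tsNew = read_hdf5("timeCorrection_" + pair_name, fp, "timestampNew")
        ttl_path = os.path.join(fp, event_name + "_" + pair_name + ".hdf5")
        if os.path.exists(ttl_path):
            ts = read_hdf5(event_name + "_" + pair_name, fp, "ts").reshape(-1)
        else:
            ts = np.array([])  # no TTLs recorded for this event in this folder
        filepath_to_timestamps[fp] = tsNew
        filepath_to_ttl_timestamps[fp] = ts
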
--- src/guppy/analysis/combine_data.py | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/src/guppy/analysis/combine_data.py b/src/guppy/analysis/combine_data.py index e2fb719..6c00be6 100644 --- a/src/guppy/analysis/combine_data.py +++ b/src/guppy/analysis/combine_data.py @@ -43,11 +43,11 @@ def eliminateTs(filepath, timeForLightsTurnOn, event, sampling_rate, naming): ts_arr = np.array([]) tsNew_arr = np.array([]) for i in range(len(filepath)): - tsNew = read_hdf5("timeCorrection_" + naming, filepath[i], "timestampNew") - if os.path.exists(os.path.join(filepath[i], event + "_" + naming + ".hdf5")): - ts = read_hdf5(event + "_" + naming, filepath[i], "ts").reshape(-1) - else: - ts = np.array([]) + # tsNew = read_hdf5("timeCorrection_" + naming, filepath[i], "timestampNew") + # if os.path.exists(os.path.join(filepath[i], event + "_" + naming + ".hdf5")): + # ts = read_hdf5(event + "_" + naming, filepath[i], "ts").reshape(-1) + # else: + # ts = np.array([]) # logger.info("total time : ", tsNew[-1]) if len(tsNew_arr) == 0: @@ -114,8 +114,24 @@ def combine_data(filepath: list[list[str]], timeForLightsTurnOn, names_for_store else: if "control" in names_for_storenames[i].lower() or "signal" in names_for_storenames[i].lower(): continue + filepath_to_timestamps = {} + filepath_to_ttl_timestamps = {} + for filepath in single_output_filepaths: + tsNew = read_hdf5("timeCorrection_" + pair_name, filepath, "timestampNew") + if os.path.exists(os.path.join(filepath, names_for_storenames[i] + "_" + pair_name + ".hdf5")): + ts = read_hdf5(names_for_storenames[i] + "_" + pair_name, filepath, "ts").reshape(-1) + else: + ts = np.array([]) + filepath_to_timestamps[filepath] = tsNew + filepath_to_ttl_timestamps[filepath] = ts + ts = eliminateTs( - single_output_filepaths, timeForLightsTurnOn, names_for_storenames[i], sampling_rate, pair_name + filepath_to_timestamps, + filepath_to_ttl_timestamps, + timeForLightsTurnOn, + names_for_storenames[i], + sampling_rate, + pair_name, ) write_hdf5(ts, names_for_storenames[i] + "_" + pair_name, single_output_filepaths[0], "ts") for pair_name, tsNew in pair_name_to_tsNew.items(): From c481d953aafac5919e1afb676a019a35aa338f89 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 19 Dec 2025 17:43:50 -0800 Subject: [PATCH 124/150] Refactored read operations out of eliminateTs. 
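
One practical benefit of passing dicts instead of a filepath is that the helper can be exercised with in-memory arrays, without any HDF5 files on disk. A toy, self-contained illustration of the calling pattern (concat_per_folder below is illustrative only, not eliminateTs itself):

    import numpy as np

    def concat_per_folder(filepath_to_arr):
        # toy stand-in for a dict-consuming helper: concatenate arrays folder by folder
        return np.concatenate([filepath_to_arr[fp] for fp in filepath_to_arr])

    filepath_to_ttl_timestamps = {
        "folderA_output_0": np.array([12.5, 40.2]),
        "folderB_output_0": np.array([5.0, 60.1]),
    }
    combined = concat_per_folder(filepath_to_ttl_timestamps)
    # combined -> array([12.5, 40.2,  5. , 60.1])
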
--- src/guppy/analysis/combine_data.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/guppy/analysis/combine_data.py b/src/guppy/analysis/combine_data.py index 6c00be6..4cddb32 100644 --- a/src/guppy/analysis/combine_data.py +++ b/src/guppy/analysis/combine_data.py @@ -38,11 +38,14 @@ def eliminateData(filepath_to_timestamps, filepath_to_data, timeForLightsTurnOn, return arr, ts_arr -def eliminateTs(filepath, timeForLightsTurnOn, event, sampling_rate, naming): +def eliminateTs(filepath_to_timestamps, filepath_to_ttl_timestamps, timeForLightsTurnOn, event, sampling_rate, naming): ts_arr = np.array([]) tsNew_arr = np.array([]) - for i in range(len(filepath)): + filepaths = list(filepath_to_timestamps.keys()) + for filepath in filepaths: + ts = filepath_to_timestamps[filepath] + tsNew = filepath_to_ttl_timestamps[filepath] # tsNew = read_hdf5("timeCorrection_" + naming, filepath[i], "timestampNew") # if os.path.exists(os.path.join(filepath[i], event + "_" + naming + ".hdf5")): # ts = read_hdf5(event + "_" + naming, filepath[i], "ts").reshape(-1) From ebe24b64799cc603ea719d8fcbc970edd43950ec Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 19 Dec 2025 17:47:11 -0800 Subject: [PATCH 125/150] Refactored read operations out of eliminateTs. --- src/guppy/analysis/combine_data.py | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/src/guppy/analysis/combine_data.py b/src/guppy/analysis/combine_data.py index 4cddb32..6ccddc0 100644 --- a/src/guppy/analysis/combine_data.py +++ b/src/guppy/analysis/combine_data.py @@ -12,7 +12,7 @@ logger = logging.getLogger(__name__) -def eliminateData(filepath_to_timestamps, filepath_to_data, timeForLightsTurnOn, event, sampling_rate, naming): +def eliminateData(filepath_to_timestamps, filepath_to_data, timeForLightsTurnOn, sampling_rate): arr = np.array([]) ts_arr = np.array([]) @@ -20,8 +20,6 @@ def eliminateData(filepath_to_timestamps, filepath_to_data, timeForLightsTurnOn, for filepath in filepaths: ts = filepath_to_timestamps[filepath] data = filepath_to_data[filepath] - # ts = read_hdf5("timeCorrection_" + naming, filepaths[i], "timestampNew") - # data = read_hdf5(event, filepaths[i], "data").reshape(-1) if len(arr) == 0: arr = np.concatenate((arr, data)) @@ -38,7 +36,7 @@ def eliminateData(filepath_to_timestamps, filepath_to_data, timeForLightsTurnOn, return arr, ts_arr -def eliminateTs(filepath_to_timestamps, filepath_to_ttl_timestamps, timeForLightsTurnOn, event, sampling_rate, naming): +def eliminateTs(filepath_to_timestamps, filepath_to_ttl_timestamps, timeForLightsTurnOn, sampling_rate): ts_arr = np.array([]) tsNew_arr = np.array([]) @@ -46,13 +44,6 @@ def eliminateTs(filepath_to_timestamps, filepath_to_ttl_timestamps, timeForLight for filepath in filepaths: ts = filepath_to_timestamps[filepath] tsNew = filepath_to_ttl_timestamps[filepath] - # tsNew = read_hdf5("timeCorrection_" + naming, filepath[i], "timestampNew") - # if os.path.exists(os.path.join(filepath[i], event + "_" + naming + ".hdf5")): - # ts = read_hdf5(event + "_" + naming, filepath[i], "ts").reshape(-1) - # else: - # ts = np.array([]) - - # logger.info("total time : ", tsNew[-1]) if len(tsNew_arr) == 0: sub = tsNew[0] - timeForLightsTurnOn tsNew_arr = np.concatenate((tsNew_arr, tsNew - sub)) @@ -108,9 +99,7 @@ def combine_data(filepath: list[list[str]], timeForLightsTurnOn, names_for_store filepath_to_timestamps, filepath_to_data, timeForLightsTurnOn, - names_for_storenames[i], sampling_rate, - pair_name, ) 
write_hdf5(data, names_for_storenames[i], single_output_filepaths[0], "data") pair_name_to_tsNew[pair_name] = timestampNew @@ -132,9 +121,7 @@ def combine_data(filepath: list[list[str]], timeForLightsTurnOn, names_for_store filepath_to_timestamps, filepath_to_ttl_timestamps, timeForLightsTurnOn, - names_for_storenames[i], sampling_rate, - pair_name, ) write_hdf5(ts, names_for_storenames[i] + "_" + pair_name, single_output_filepaths[0], "ts") for pair_name, tsNew in pair_name_to_tsNew.items(): From ec9623500781ab96a6a1e7b185ef304606c6a605 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 21 Jan 2026 09:20:04 -0800 Subject: [PATCH 126/150] Pulled loop out of combine_data --- src/guppy/analysis/combine_data.py | 121 ++++++++++++++--------------- src/guppy/preprocess.py | 3 +- 2 files changed, 60 insertions(+), 64 deletions(-) diff --git a/src/guppy/analysis/combine_data.py b/src/guppy/analysis/combine_data.py index 6ccddc0..277258c 100644 --- a/src/guppy/analysis/combine_data.py +++ b/src/guppy/analysis/combine_data.py @@ -61,68 +61,63 @@ def eliminateTs(filepath_to_timestamps, filepath_to_ttl_timestamps, timeForLight return ts_arr -def combine_data(filepath: list[list[str]], timeForLightsTurnOn, names_for_storenames, sampling_rate): - # filepath = [[folder1_output_0, folder2_output_0], [folder1_output_1, folder2_output_1], ...] - +def combine_data(filepaths_to_combine: list[str], timeForLightsTurnOn, storesList, sampling_rate): + # filepaths_to_combine = [folder1_output_i, folder2_output_i, ...] logger.debug("Processing timestamps for combining data...") - names_for_storenames = names_for_storenames[1, :] - - for single_output_filepaths in filepath: - # single_output_filepaths = [folder1_output_i, folder2_output_i, ...] - - path = decide_naming_convention(single_output_filepaths[0]) - - pair_name_to_tsNew = {} - for j in range(path.shape[1]): - name_1 = ((os.path.basename(path[0, j])).split(".")[0]).split("_")[-1] - name_2 = ((os.path.basename(path[1, j])).split(".")[0]).split("_")[-1] - if name_1 != name_2: - logger.error("Error in naming convention of files or Error in storesList file") - raise Exception("Error in naming convention of files or Error in storesList file") - pair_name = name_1 - - for i in range(len(names_for_storenames)): - if ( - "control_" + pair_name.lower() in names_for_storenames[i].lower() - or "signal_" + pair_name.lower() in names_for_storenames[i].lower() - ): - filepath_to_timestamps = {} - filepath_to_data = {} - for filepath in single_output_filepaths: - ts = read_hdf5("timeCorrection_" + pair_name, filepath, "timestampNew") - data = read_hdf5(names_for_storenames[i], filepath, "data").reshape(-1) - filepath_to_timestamps[filepath] = ts - filepath_to_data[filepath] = data - - data, timestampNew = eliminateData( - filepath_to_timestamps, - filepath_to_data, - timeForLightsTurnOn, - sampling_rate, - ) - write_hdf5(data, names_for_storenames[i], single_output_filepaths[0], "data") - pair_name_to_tsNew[pair_name] = timestampNew - else: - if "control" in names_for_storenames[i].lower() or "signal" in names_for_storenames[i].lower(): - continue - filepath_to_timestamps = {} - filepath_to_ttl_timestamps = {} - for filepath in single_output_filepaths: - tsNew = read_hdf5("timeCorrection_" + pair_name, filepath, "timestampNew") - if os.path.exists(os.path.join(filepath, names_for_storenames[i] + "_" + pair_name + ".hdf5")): - ts = read_hdf5(names_for_storenames[i] + "_" + pair_name, filepath, "ts").reshape(-1) - else: - ts = np.array([]) - 
filepath_to_timestamps[filepath] = tsNew - filepath_to_ttl_timestamps[filepath] = ts - - ts = eliminateTs( - filepath_to_timestamps, - filepath_to_ttl_timestamps, - timeForLightsTurnOn, - sampling_rate, - ) - write_hdf5(ts, names_for_storenames[i] + "_" + pair_name, single_output_filepaths[0], "ts") - for pair_name, tsNew in pair_name_to_tsNew.items(): - write_hdf5(tsNew, "timeCorrection_" + pair_name, single_output_filepaths[0], "timestampNew") + names_for_storenames = storesList[1, :] + path = decide_naming_convention(filepaths_to_combine[0]) + + pair_name_to_tsNew = {} + for j in range(path.shape[1]): + name_1 = ((os.path.basename(path[0, j])).split(".")[0]).split("_")[-1] + name_2 = ((os.path.basename(path[1, j])).split(".")[0]).split("_")[-1] + if name_1 != name_2: + logger.error("Error in naming convention of files or Error in storesList file") + raise Exception("Error in naming convention of files or Error in storesList file") + pair_name = name_1 + + for i in range(len(names_for_storenames)): + if ( + "control_" + pair_name.lower() in names_for_storenames[i].lower() + or "signal_" + pair_name.lower() in names_for_storenames[i].lower() + ): + filepath_to_timestamps = {} + filepath_to_data = {} + for filepath in filepaths_to_combine: + ts = read_hdf5("timeCorrection_" + pair_name, filepath, "timestampNew") + data = read_hdf5(names_for_storenames[i], filepath, "data").reshape(-1) + filepath_to_timestamps[filepath] = ts + filepath_to_data[filepath] = data + + data, timestampNew = eliminateData( + filepath_to_timestamps, + filepath_to_data, + timeForLightsTurnOn, + sampling_rate, + ) + write_hdf5(data, names_for_storenames[i], filepaths_to_combine[0], "data") + pair_name_to_tsNew[pair_name] = timestampNew + else: + if "control" in names_for_storenames[i].lower() or "signal" in names_for_storenames[i].lower(): + continue + filepath_to_timestamps = {} + filepath_to_ttl_timestamps = {} + for filepath in filepaths_to_combine: + tsNew = read_hdf5("timeCorrection_" + pair_name, filepath, "timestampNew") + if os.path.exists(os.path.join(filepath, names_for_storenames[i] + "_" + pair_name + ".hdf5")): + ts = read_hdf5(names_for_storenames[i] + "_" + pair_name, filepath, "ts").reshape(-1) + else: + ts = np.array([]) + filepath_to_timestamps[filepath] = tsNew + filepath_to_ttl_timestamps[filepath] = ts + + ts = eliminateTs( + filepath_to_timestamps, + filepath_to_ttl_timestamps, + timeForLightsTurnOn, + sampling_rate, + ) + write_hdf5(ts, names_for_storenames[i] + "_" + pair_name, filepaths_to_combine[0], "ts") + for pair_name, tsNew in pair_name_to_tsNew.items(): + write_hdf5(tsNew, "timeCorrection_" + pair_name, filepaths_to_combine[0], "timestampNew") diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index 0c41ae4..a7b6e27 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -436,7 +436,8 @@ def execute_combine_data(folderNames, inputParameters, storesList): op = get_all_stores_for_combining_data(op_folder) # processing timestamps for combining the data - combine_data(op, timeForLightsTurnOn, storesList, sampling_rate[0]) + for filepaths_to_combine in op: + combine_data(filepaths_to_combine, timeForLightsTurnOn, storesList, sampling_rate[0]) logger.info("Data is combined from different data files.") return op From fd1fd332453b37dab102e6c71b5b263f2fcef464 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 21 Jan 2026 11:07:48 -0800 Subject: [PATCH 127/150] Pulled read out of combine_data --- src/guppy/analysis/combine_data.py | 39 +++++++--------- 
src/guppy/analysis/standard_io.py | 73 ++++++++++++++++++++++++++++++ src/guppy/preprocess.py | 18 +++++++- 3 files changed, 107 insertions(+), 23 deletions(-) diff --git a/src/guppy/analysis/combine_data.py b/src/guppy/analysis/combine_data.py index 277258c..8cbeace 100644 --- a/src/guppy/analysis/combine_data.py +++ b/src/guppy/analysis/combine_data.py @@ -5,7 +5,6 @@ from .io_utils import ( decide_naming_convention, - read_hdf5, write_hdf5, ) @@ -61,7 +60,15 @@ def eliminateTs(filepath_to_timestamps, filepath_to_ttl_timestamps, timeForLight return ts_arr -def combine_data(filepaths_to_combine: list[str], timeForLightsTurnOn, storesList, sampling_rate): +def combine_data( + filepaths_to_combine: list[str], + pair_name_to_filepath_to_timestamps: dict[str, dict[str, np.ndarray]], + display_name_to_filepath_to_data: dict[str, dict[str, np.ndarray]], + compound_name_to_filepath_to_ttl_timestamps: dict[str, dict[str, np.ndarray]], + timeForLightsTurnOn, + storesList, + sampling_rate, +): # filepaths_to_combine = [folder1_output_i, folder2_output_i, ...] logger.debug("Processing timestamps for combining data...") @@ -82,35 +89,23 @@ def combine_data(filepaths_to_combine: list[str], timeForLightsTurnOn, storesLis "control_" + pair_name.lower() in names_for_storenames[i].lower() or "signal_" + pair_name.lower() in names_for_storenames[i].lower() ): - filepath_to_timestamps = {} - filepath_to_data = {} - for filepath in filepaths_to_combine: - ts = read_hdf5("timeCorrection_" + pair_name, filepath, "timestampNew") - data = read_hdf5(names_for_storenames[i], filepath, "data").reshape(-1) - filepath_to_timestamps[filepath] = ts - filepath_to_data[filepath] = data - + display_name = names_for_storenames[i] + filepath_to_timestamps = pair_name_to_filepath_to_timestamps[pair_name] + filepath_to_data = display_name_to_filepath_to_data[display_name] data, timestampNew = eliminateData( filepath_to_timestamps, filepath_to_data, timeForLightsTurnOn, sampling_rate, ) - write_hdf5(data, names_for_storenames[i], filepaths_to_combine[0], "data") + write_hdf5(data, display_name, filepaths_to_combine[0], "data") pair_name_to_tsNew[pair_name] = timestampNew else: if "control" in names_for_storenames[i].lower() or "signal" in names_for_storenames[i].lower(): continue - filepath_to_timestamps = {} - filepath_to_ttl_timestamps = {} - for filepath in filepaths_to_combine: - tsNew = read_hdf5("timeCorrection_" + pair_name, filepath, "timestampNew") - if os.path.exists(os.path.join(filepath, names_for_storenames[i] + "_" + pair_name + ".hdf5")): - ts = read_hdf5(names_for_storenames[i] + "_" + pair_name, filepath, "ts").reshape(-1) - else: - ts = np.array([]) - filepath_to_timestamps[filepath] = tsNew - filepath_to_ttl_timestamps[filepath] = ts + compound_name = names_for_storenames[i] + "_" + pair_name + filepath_to_timestamps = pair_name_to_filepath_to_timestamps[pair_name] + filepath_to_ttl_timestamps = compound_name_to_filepath_to_ttl_timestamps[compound_name] ts = eliminateTs( filepath_to_timestamps, @@ -118,6 +113,6 @@ def combine_data(filepaths_to_combine: list[str], timeForLightsTurnOn, storesLis timeForLightsTurnOn, sampling_rate, ) - write_hdf5(ts, names_for_storenames[i] + "_" + pair_name, filepaths_to_combine[0], "ts") + write_hdf5(ts, compound_name, filepaths_to_combine[0], "ts") for pair_name, tsNew in pair_name_to_tsNew.items(): write_hdf5(tsNew, "timeCorrection_" + pair_name, filepaths_to_combine[0], "timestampNew") diff --git a/src/guppy/analysis/standard_io.py b/src/guppy/analysis/standard_io.py index 
e7fe8e0..02bbe99 100644 --- a/src/guppy/analysis/standard_io.py +++ b/src/guppy/analysis/standard_io.py @@ -208,3 +208,76 @@ def write_artifact_removal( write_corrected_ttl_timestamps(filepath, compound_name_to_corrected_ttl_timestamps) if pair_name_to_corrected_timestamps is not None: write_artifact_corrected_timestamps(filepath, pair_name_to_corrected_timestamps) + + +def read_timestamps_for_combining_data(filepaths_to_combine): + path = decide_naming_convention(filepaths_to_combine[0]) + pair_name_to_filepath_to_timestamps: dict[str, dict[str, np.ndarray]] = {} + for j in range(path.shape[1]): + name_1 = ((os.path.basename(path[0, j])).split(".")[0]).split("_")[-1] + name_2 = ((os.path.basename(path[1, j])).split(".")[0]).split("_")[-1] + if name_1 != name_2: + logger.error("Error in naming convention of files or Error in storesList file") + raise Exception("Error in naming convention of files or Error in storesList file") + pair_name = name_1 + pair_name_to_filepath_to_timestamps[pair_name] = {} + for filepath in filepaths_to_combine: + tsNew = read_hdf5("timeCorrection_" + pair_name, filepath, "timestampNew") + pair_name_to_filepath_to_timestamps[pair_name][filepath] = tsNew + + return pair_name_to_filepath_to_timestamps + + +def read_data_for_combining_data(filepaths_to_combine, storesList): + names_for_storenames = storesList[1, :] + path = decide_naming_convention(filepaths_to_combine[0]) + display_name_to_filepath_to_data: dict[str, dict[str, np.ndarray]] = {} + for j in range(path.shape[1]): + name_1 = ((os.path.basename(path[0, j])).split(".")[0]).split("_")[-1] + name_2 = ((os.path.basename(path[1, j])).split(".")[0]).split("_")[-1] + if name_1 != name_2: + logger.error("Error in naming convention of files or Error in storesList file") + raise Exception("Error in naming convention of files or Error in storesList file") + pair_name = name_1 + for i in range(len(names_for_storenames)): + if not ( + "control_" + pair_name.lower() in names_for_storenames[i].lower() + or "signal_" + pair_name.lower() in names_for_storenames[i].lower() + ): + continue + display_name = names_for_storenames[i] + display_name_to_filepath_to_data[display_name] = {} + for filepath in filepaths_to_combine: + data = read_hdf5(display_name, filepath, "data").reshape(-1) + display_name_to_filepath_to_data[display_name][filepath] = data + + return display_name_to_filepath_to_data + + +def read_ttl_timestamps_for_combining_data(filepaths_to_combine, storesList): + names_for_storenames = storesList[1, :] + path = decide_naming_convention(filepaths_to_combine[0]) + compound_name_to_filepath_to_ttl_timestamps: dict[str, dict[str, np.ndarray]] = {} + for j in range(path.shape[1]): + name_1 = ((os.path.basename(path[0, j])).split(".")[0]).split("_")[-1] + name_2 = ((os.path.basename(path[1, j])).split(".")[0]).split("_")[-1] + if name_1 != name_2: + logger.error("Error in naming convention of files or Error in storesList file") + raise Exception("Error in naming convention of files or Error in storesList file") + pair_name = name_1 + for i in range(len(names_for_storenames)): + if ( + "control_" + pair_name.lower() in names_for_storenames[i].lower() + or "signal_" + pair_name.lower() in names_for_storenames[i].lower() + ): + continue + compound_name = names_for_storenames[i] + "_" + pair_name + compound_name_to_filepath_to_ttl_timestamps[compound_name] = {} + for filepath in filepaths_to_combine: + if os.path.exists(os.path.join(filepath, names_for_storenames[i] + "_" + pair_name + ".hdf5")): + ts = 
read_hdf5(names_for_storenames[i] + "_" + pair_name, filepath, "ts").reshape(-1) + else: + ts = np.array([]) + compound_name_to_filepath_to_ttl_timestamps[compound_name][filepath] = ts + + return compound_name_to_filepath_to_ttl_timestamps diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index a7b6e27..17a0cbc 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -26,7 +26,10 @@ read_corrected_data_dict, read_corrected_timestamps_pairwise, read_corrected_ttl_timestamps, + read_data_for_combining_data, + read_timestamps_for_combining_data, read_ttl, + read_ttl_timestamps_for_combining_data, write_artifact_removal, write_corrected_data, write_corrected_timestamps, @@ -437,7 +440,20 @@ def execute_combine_data(folderNames, inputParameters, storesList): # processing timestamps for combining the data for filepaths_to_combine in op: - combine_data(filepaths_to_combine, timeForLightsTurnOn, storesList, sampling_rate[0]) + pair_name_to_filepath_to_timestamps = read_timestamps_for_combining_data(filepaths_to_combine) + display_name_to_filepath_to_data = read_data_for_combining_data(filepaths_to_combine, storesList) + compound_name_to_filepath_to_ttl_timestamps = read_ttl_timestamps_for_combining_data( + filepaths_to_combine, storesList + ) + combine_data( + filepaths_to_combine, + pair_name_to_filepath_to_timestamps, + display_name_to_filepath_to_data, + compound_name_to_filepath_to_ttl_timestamps, + timeForLightsTurnOn, + storesList, + sampling_rate[0], + ) logger.info("Data is combined from different data files.") return op From a8fd7387139bd439c29c7b09a5ed27f3f971e297 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 21 Jan 2026 11:19:05 -0800 Subject: [PATCH 128/150] Pulled write out of combine_data --- src/guppy/analysis/combine_data.py | 11 ++++++----- src/guppy/analysis/standard_io.py | 9 +++++++++ src/guppy/preprocess.py | 5 ++++- 3 files changed, 19 insertions(+), 6 deletions(-) diff --git a/src/guppy/analysis/combine_data.py b/src/guppy/analysis/combine_data.py index 8cbeace..1eac5b6 100644 --- a/src/guppy/analysis/combine_data.py +++ b/src/guppy/analysis/combine_data.py @@ -5,7 +5,6 @@ from .io_utils import ( decide_naming_convention, - write_hdf5, ) logger = logging.getLogger(__name__) @@ -76,6 +75,8 @@ def combine_data( path = decide_naming_convention(filepaths_to_combine[0]) pair_name_to_tsNew = {} + display_name_to_data = {} + compound_name_to_ttl_timestamps = {} for j in range(path.shape[1]): name_1 = ((os.path.basename(path[0, j])).split(".")[0]).split("_")[-1] name_2 = ((os.path.basename(path[1, j])).split(".")[0]).split("_")[-1] @@ -98,8 +99,8 @@ def combine_data( timeForLightsTurnOn, sampling_rate, ) - write_hdf5(data, display_name, filepaths_to_combine[0], "data") pair_name_to_tsNew[pair_name] = timestampNew + display_name_to_data[display_name] = data else: if "control" in names_for_storenames[i].lower() or "signal" in names_for_storenames[i].lower(): continue @@ -113,6 +114,6 @@ def combine_data( timeForLightsTurnOn, sampling_rate, ) - write_hdf5(ts, compound_name, filepaths_to_combine[0], "ts") - for pair_name, tsNew in pair_name_to_tsNew.items(): - write_hdf5(tsNew, "timeCorrection_" + pair_name, filepaths_to_combine[0], "timestampNew") + compound_name_to_ttl_timestamps[compound_name] = ts + + return pair_name_to_tsNew, display_name_to_data, compound_name_to_ttl_timestamps diff --git a/src/guppy/analysis/standard_io.py b/src/guppy/analysis/standard_io.py index 02bbe99..2baefca 100644 --- a/src/guppy/analysis/standard_io.py +++ 
b/src/guppy/analysis/standard_io.py @@ -281,3 +281,12 @@ def read_ttl_timestamps_for_combining_data(filepaths_to_combine, storesList): compound_name_to_filepath_to_ttl_timestamps[compound_name][filepath] = ts return compound_name_to_filepath_to_ttl_timestamps + + +def write_combined_data(output_filepath, pair_name_to_tsNew, display_name_to_data, compound_name_to_ttl_timestamps): + for pair_name, tsNew in pair_name_to_tsNew.items(): + write_hdf5(tsNew, "timeCorrection_" + pair_name, output_filepath, "timestampNew") + for display_name, data in display_name_to_data.items(): + write_hdf5(data, display_name, output_filepath, "data") + for compound_name, ts in compound_name_to_ttl_timestamps.items(): + write_hdf5(ts, compound_name, output_filepath, "ts") diff --git a/src/guppy/preprocess.py b/src/guppy/preprocess.py index 17a0cbc..e4812a2 100755 --- a/src/guppy/preprocess.py +++ b/src/guppy/preprocess.py @@ -31,6 +31,7 @@ read_ttl, read_ttl_timestamps_for_combining_data, write_artifact_removal, + write_combined_data, write_corrected_data, write_corrected_timestamps, write_corrected_ttl_timestamps, @@ -445,7 +446,7 @@ def execute_combine_data(folderNames, inputParameters, storesList): compound_name_to_filepath_to_ttl_timestamps = read_ttl_timestamps_for_combining_data( filepaths_to_combine, storesList ) - combine_data( + pair_name_to_tsNew, display_name_to_data, compound_name_to_ttl_timestamps = combine_data( filepaths_to_combine, pair_name_to_filepath_to_timestamps, display_name_to_filepath_to_data, @@ -454,6 +455,8 @@ def execute_combine_data(folderNames, inputParameters, storesList): storesList, sampling_rate[0], ) + output_filepath = filepaths_to_combine[0] + write_combined_data(output_filepath, pair_name_to_tsNew, display_name_to_data, compound_name_to_ttl_timestamps) logger.info("Data is combined from different data files.") return op From dccd54a706a2dbe78167ec7abff7caadf1df66a3 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 21 Jan 2026 12:20:45 -0800 Subject: [PATCH 129/150] Added test for combined data. --- src/guppy/testing/api.py | 6 ++ tests/test_combine_data.py | 138 +++++++++++++++++++++++++++++++++++++ 2 files changed, 144 insertions(+) create mode 100644 tests/test_combine_data.py diff --git a/src/guppy/testing/api.py b/src/guppy/testing/api.py index c647907..98939cf 100644 --- a/src/guppy/testing/api.py +++ b/src/guppy/testing/api.py @@ -268,6 +268,7 @@ def step4( npm_timestamp_column_names: list[str | None] | None = None, npm_time_units: list[str] | None = None, npm_split_events: list[bool] | None = None, + combine_data: bool = False, ) -> None: """ Run pipeline Step 4 (Extract timestamps and signal) via the Panel-backed logic, headlessly. @@ -293,6 +294,8 @@ def step4( List of time units for NPM files, one per CSV file (e.g., 'seconds', 'milliseconds'). None if not applicable. npm_split_events : list[bool] | None List of booleans indicating whether to split events for NPM files, one per CSV file. None if not applicable. + combine_data : bool + Whether to enable data combining logic in Step 4. 
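When this flag is enabled, Step 4 ends up driving the read, compute, write pipeline assembled in the preceding patches, with combine_data itself free of HDF5 I/O. A condensed sketch of the caller-side flow, mirroring execute_combine_data in preprocess.py (local names shortened here for readability):

    # One group of matching output folders, e.g. [folder1_output_0, folder2_output_0]
    for filepaths_to_combine in op:
        pair_ts = read_timestamps_for_combining_data(filepaths_to_combine)
        data = read_data_for_combining_data(filepaths_to_combine, storesList)
        ttl_ts = read_ttl_timestamps_for_combining_data(filepaths_to_combine, storesList)

        # Pure computation over in-memory arrays; no file access inside combine_data.
        tsNew, combined, combined_ttl = combine_data(
            filepaths_to_combine, pair_ts, data, ttl_ts,
            timeForLightsTurnOn, storesList, sampling_rate[0],
        )

        # All results land in the first folder of the group.
        write_combined_data(filepaths_to_combine[0], tsNew, combined, combined_ttl)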
Raises ------ @@ -345,6 +348,9 @@ def step4( # Inject modality input_params["modality"] = modality + # Inject combine_data + input_params["combine_data"] = combine_data + # Call the underlying Step 4 worker directly (no subprocess) extractTsAndSignal(input_params) diff --git a/tests/test_combine_data.py b/tests/test_combine_data.py new file mode 100644 index 0000000..f7c0261 --- /dev/null +++ b/tests/test_combine_data.py @@ -0,0 +1,138 @@ +import glob +import os +import shutil +from pathlib import Path + +import h5py +import pytest + +from guppy.testing.api import step2, step3, step4, step5 + + +@pytest.mark.filterwarnings("ignore::UserWarning") +def test_combine_data(tmp_path, monkeypatch): + session_subdirs = [ + "SampleData_Clean/Photo_63_207-181030-103332", + "SampleData_with_artifacts/Photo_048_392-200728-121222", + ] + storenames_map = { + "Dv1A": "control_dms", + "Dv2A": "signal_dms", + "PrtN": "port_entries_dms", + } + expected_region = "dms" + expected_ttl = "port_entries_dms" + modality = "tdt" + + npm_timestamp_column_names = None + npm_time_units = None + npm_split_events = [True, True] + + # Use the CSV sample session + src_base_dir = str(Path(".") / "testing_data") + src_sessions = [os.path.join(src_base_dir, session_subdir) for session_subdir in session_subdirs] + for src_session in src_sessions: + if not os.path.isdir(src_session): + pytest.skip(f"Sample data not available at expected path: {src_session}") + + # Stub matplotlib.pyplot.show to avoid GUI blocking + import matplotlib.pyplot as plt # noqa: F401 + + monkeypatch.setattr("matplotlib.pyplot.show", lambda *args, **kwargs: None) + + # Stage a clean copy of the session into a temporary workspace + tmp_base = tmp_path / "data_root" + tmp_base.mkdir(parents=True, exist_ok=True) + session_copies = [] + for src_session in src_sessions: + dest_name = os.path.basename(src_session) + session_copy = tmp_base / dest_name + shutil.copytree(src_session, session_copy) + session_copies.append(session_copy) + + for session_copy in session_copies: + # Remove any copied artifacts in the temp session (match only this session's output dirs) + for d in glob.glob(os.path.join(session_copy, f"{dest_name}_output_*")): + assert os.path.isdir(d), f"Expected output directory for cleanup, got non-directory: {d}" + shutil.rmtree(d) + params_fp = session_copy / "GuPPyParamtersUsed.json" + if params_fp.exists(): + params_fp.unlink() + + selected_folders = [str(session_copy) for session_copy in session_copies] + base_dir = str(tmp_base) + + # Step 2: create storesList.csv in the temp copy + step2( + base_dir=base_dir, + selected_folders=selected_folders, + storenames_map=storenames_map, + modality=modality, + npm_timestamp_column_names=npm_timestamp_column_names, + npm_time_units=npm_time_units, + npm_split_events=npm_split_events, + ) + + # Step 3: read raw data in the temp copy + step3( + base_dir=base_dir, + selected_folders=selected_folders, + modality=modality, + npm_timestamp_column_names=npm_timestamp_column_names, + npm_time_units=npm_time_units, + npm_split_events=npm_split_events, + ) + + # Step 4: extract timestamps and signal in the temp copy + step4( + base_dir=base_dir, + selected_folders=selected_folders, + modality=modality, + npm_timestamp_column_names=npm_timestamp_column_names, + npm_time_units=npm_time_units, + npm_split_events=npm_split_events, + combine_data=True, + ) + + # Step 5: compute PSTH in the temp copy (headless) + step5( + base_dir=str(tmp_base), + selected_folders=[str(session_copy)], + modality=modality, + 
npm_timestamp_column_names=npm_timestamp_column_names, + npm_time_units=npm_time_units, + npm_split_events=npm_split_events, + ) + + # Validate outputs exist in the temp copy + session_copy = selected_folders[0] # Outputs are written to the first session folder + basename = os.path.basename(session_copy) + output_dirs = sorted(glob.glob(os.path.join(session_copy, f"{basename}_output_*"))) + assert output_dirs, f"No output directories found in {session_copy}" + out_dir = None + for d in output_dirs: + if os.path.exists(os.path.join(d, "storesList.csv")): + out_dir = d + break + assert out_dir is not None, f"No storesList.csv found in any output directory under {session_copy}" + stores_fp = os.path.join(out_dir, "storesList.csv") + assert os.path.exists(stores_fp), "Missing storesList.csv after Step 2/3/4" + + # Ensure timeCorrection_.hdf5 exists with 'timestampNew' + timecorr = os.path.join(out_dir, f"timeCorrection_{expected_region}.hdf5") + assert os.path.exists(timecorr), f"Missing {timecorr}" + with h5py.File(timecorr, "r") as f: + assert "timestampNew" in f, f"Expected 'timestampNew' dataset in {timecorr}" + + # If TTLs exist, check their per-region 'ts' outputs + if expected_ttl is None: + expected_ttls = [] + elif isinstance(expected_ttl, str): + expected_ttls = [expected_ttl] + else: + expected_ttls = expected_ttl + for expected_ttl in expected_ttls: + ttl_fp = os.path.join(out_dir, f"{expected_ttl}_{expected_region}.hdf5") + assert os.path.exists(ttl_fp), f"Missing TTL-aligned file {ttl_fp}" + with h5py.File(ttl_fp, "r") as f: + assert "ts" in f, f"Expected 'ts' dataset in {ttl_fp}" From ba6ced17f9d23042eb43acc3f4a2a8b0595f84fe Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 21 Jan 2026 14:04:30 -0800 Subject: [PATCH 130/150] Reorganized imports for computePsth.py --- src/guppy/computePsth.py | 49 +--------------------------------------- 1 file changed, 1 insertion(+), 48 deletions(-) diff --git a/src/guppy/computePsth.py b/src/guppy/computePsth.py index 671d1d3..3153f12 100755 --- a/src/guppy/computePsth.py +++ b/src/guppy/computePsth.py @@ -12,13 +12,12 @@ from collections import OrderedDict from itertools import repeat -import h5py import numpy as np import pandas as pd from scipy import signal as ss +from .analysis.io_utils import get_all_stores_for_combining_data, read_hdf5, write_hdf5 from .computeCorr import computeCrossCorrelation, getCorrCombinations, make_dir -from .preprocess import get_all_stores_for_combining_data logger = logging.getLogger(__name__) @@ -36,52 +35,6 @@ def writeToFile(value: str): file.write(value) -# function to read hdf5 file -def read_hdf5(event, filepath, key): - if event: - event = event.replace("\\", "_") - event = event.replace("/", "_") - op = os.path.join(filepath, event + ".hdf5") - else: - op = filepath - - if os.path.exists(op): - with h5py.File(op, "r") as f: - arr = np.asarray(f[key]) - else: - raise Exception("{}.hdf5 file does not exist".format(event)) - - return arr - - -# function to write hdf5 file -def write_hdf5(data, event, filepath, key): - event = event.replace("\\", "_") - event = event.replace("/", "_") - op = os.path.join(filepath, event + ".hdf5") - - # if file does not exist create a new file - if not os.path.exists(op): - with h5py.File(op, "w") as f: - if type(data) is np.ndarray: - f.create_dataset(key, data=data, maxshape=(None,), chunks=True) - else: - f.create_dataset(key, data=data) - # if file already exists, append data to it or add a new key to it - else: - with h5py.File(op, "r+") as f: - if key in 
list(f.keys()): - if type(data) is np.ndarray: - f[key].resize(data.shape) - arr = f[key] - arr[:] = data - else: - arr = f[key] - arr = data - else: - f.create_dataset(key, data=data, maxshape=(None,), chunks=True) - - def create_Df_area_peak(filepath, arr, name, index=[]): op = os.path.join(filepath, "peak_AUC_" + name + ".h5") From 62d751c7329ca7d35cfc39a0452dbc4168d694fd Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 21 Jan 2026 14:22:55 -0800 Subject: [PATCH 131/150] Refactored psthForEachStorename into 3 execute fns --- src/guppy/computePsth.py | 205 ++++++++++++++++++++------------------- 1 file changed, 106 insertions(+), 99 deletions(-) diff --git a/src/guppy/computePsth.py b/src/guppy/computePsth.py index 3153f12..d6636a9 100755 --- a/src/guppy/computePsth.py +++ b/src/guppy/computePsth.py @@ -668,8 +668,6 @@ def psthForEachStorename(inputParameters): # storesList = np.genfromtxt(inputParameters['storesListPath'], dtype='str', delimiter=',') - folderNames = inputParameters["folderNames"] - folderNamesForAvg = inputParameters["folderNamesForAvg"] average = inputParameters["averageForGroup"] combine_data = inputParameters["combine_data"] numProcesses = inputParameters["numberOfCores"] @@ -687,112 +685,121 @@ def psthForEachStorename(inputParameters): # for average following if statement will be executed if average == True: - if len(folderNamesForAvg) > 0: - storesListPath = [] - for i in range(len(folderNamesForAvg)): - filepath = folderNamesForAvg[i] - storesListPath.append(takeOnlyDirs(glob.glob(os.path.join(filepath, "*_output_*")))) - storesListPath = np.concatenate(storesListPath) - storesList = np.asarray([[], []]) - for i in range(storesListPath.shape[0]): - storesList = np.concatenate( - ( - storesList, - np.genfromtxt( - os.path.join(storesListPath[i], "storesList.csv"), dtype="str", delimiter="," - ).reshape(2, -1), - ), - axis=1, - ) - storesList = np.unique(storesList, axis=1) - op = makeAverageDir(inputParameters["abspath"]) - np.savetxt(os.path.join(op, "storesList.csv"), storesList, delimiter=",", fmt="%s") - pbMaxValue = 0 - for j in range(storesList.shape[1]): - if "control" in storesList[1, j].lower() or "signal" in storesList[1, j].lower(): - continue - else: - pbMaxValue += 1 - writeToFile(str((1 + pbMaxValue + 1) * 10) + "\n" + str(10) + "\n") - for k in range(storesList.shape[1]): - if "control" in storesList[1, k].lower() or "signal" in storesList[1, k].lower(): - continue - else: - averageForGroup(storesListPath, storesList[1, k], inputParameters) - writeToFile(str(10 + ((inputParameters["step"] + 1) * 10)) + "\n") - inputParameters["step"] += 1 - - else: - logger.error("Not a single folder name is provided in folderNamesForAvg in inputParamters File.") - raise Exception("Not a single folder name is provided in folderNamesForAvg in inputParamters File.") + execute_average_for_group(inputParameters) # for individual analysis following else statement will be executed else: if combine_data == True: - storesListPath = [] - for i in range(len(folderNames)): - storesListPath.append(takeOnlyDirs(glob.glob(os.path.join(folderNames[i], "*_output_*")))) - storesListPath = list(np.concatenate(storesListPath).flatten()) - op = get_all_stores_for_combining_data(storesListPath) - writeToFile(str((len(op) + len(op) + 1) * 10) + "\n" + str(10) + "\n") - for i in range(len(op)): - storesList = np.asarray([[], []]) - for j in range(len(op[i])): - storesList = np.concatenate( - ( - storesList, - np.genfromtxt(os.path.join(op[i][j], "storesList.csv"), dtype="str", 
delimiter=",").reshape( - 2, -1 - ), - ), - axis=1, - ) - storesList = np.unique(storesList, axis=1) - for k in range(storesList.shape[1]): - storenamePsth(op[i][0], storesList[1, k], inputParameters) - findPSTHPeakAndArea(op[i][0], storesList[1, k], inputParameters) - computeCrossCorrelation(op[i][0], storesList[1, k], inputParameters) - writeToFile(str(10 + ((inputParameters["step"] + 1) * 10)) + "\n") - inputParameters["step"] += 1 + execute_psth_combined(inputParameters) else: - storesListPath = [] - for i in range(len(folderNames)): - storesListPath.append(takeOnlyDirs(glob.glob(os.path.join(folderNames[i], "*_output_*")))) - storesListPath = np.concatenate(storesListPath) - writeToFile(str((storesListPath.shape[0] + storesListPath.shape[0] + 1) * 10) + "\n" + str(10) + "\n") - for i in range(len(folderNames)): - logger.debug(f"Computing PSTH, Peak and Area for each event in {folderNames[i]}") - storesListPath = takeOnlyDirs(glob.glob(os.path.join(folderNames[i], "*_output_*"))) - for j in range(len(storesListPath)): - filepath = storesListPath[j] - storesList = np.genfromtxt( - os.path.join(filepath, "storesList.csv"), dtype="str", delimiter="," - ).reshape(2, -1) - - with mp.Pool(numProcesses) as p: - p.starmap(storenamePsth, zip(repeat(filepath), storesList[1, :], repeat(inputParameters))) - - with mp.Pool(numProcesses) as pq: - pq.starmap( - findPSTHPeakAndArea, zip(repeat(filepath), storesList[1, :], repeat(inputParameters)) - ) - - with mp.Pool(numProcesses) as cr: - cr.starmap( - computeCrossCorrelation, zip(repeat(filepath), storesList[1, :], repeat(inputParameters)) - ) - - # for k in range(storesList.shape[1]): - # storenamePsth(filepath, storesList[1,k], inputParameters) - # findPSTHPeakAndArea(filepath, storesList[1,k], inputParameters) - - writeToFile(str(10 + ((inputParameters["step"] + 1) * 10)) + "\n") - inputParameters["step"] += 1 - logger.info(f"PSTH, Area and Peak are computed for all events in {folderNames[i]}.") + execute_psth(inputParameters) logger.info("PSTH, Area and Peak are computed for all events.") return inputParameters +def execute_psth(inputParameters): + folderNames = inputParameters["folderNames"] + numProcesses = inputParameters["numberOfCores"] + storesListPath = [] + for i in range(len(folderNames)): + storesListPath.append(takeOnlyDirs(glob.glob(os.path.join(folderNames[i], "*_output_*")))) + storesListPath = np.concatenate(storesListPath) + writeToFile(str((storesListPath.shape[0] + storesListPath.shape[0] + 1) * 10) + "\n" + str(10) + "\n") + for i in range(len(folderNames)): + logger.debug(f"Computing PSTH, Peak and Area for each event in {folderNames[i]}") + storesListPath = takeOnlyDirs(glob.glob(os.path.join(folderNames[i], "*_output_*"))) + for j in range(len(storesListPath)): + filepath = storesListPath[j] + storesList = np.genfromtxt(os.path.join(filepath, "storesList.csv"), dtype="str", delimiter=",").reshape( + 2, -1 + ) + + with mp.Pool(numProcesses) as p: + p.starmap(storenamePsth, zip(repeat(filepath), storesList[1, :], repeat(inputParameters))) + + with mp.Pool(numProcesses) as pq: + pq.starmap(findPSTHPeakAndArea, zip(repeat(filepath), storesList[1, :], repeat(inputParameters))) + + with mp.Pool(numProcesses) as cr: + cr.starmap(computeCrossCorrelation, zip(repeat(filepath), storesList[1, :], repeat(inputParameters))) + + # for k in range(storesList.shape[1]): + # storenamePsth(filepath, storesList[1,k], inputParameters) + # findPSTHPeakAndArea(filepath, storesList[1,k], inputParameters) + + writeToFile(str(10 + 
((inputParameters["step"] + 1) * 10)) + "\n") + inputParameters["step"] += 1 + logger.info(f"PSTH, Area and Peak are computed for all events in {folderNames[i]}.") + + +def execute_psth_combined(inputParameters): + folderNames = inputParameters["folderNames"] + storesListPath = [] + for i in range(len(folderNames)): + storesListPath.append(takeOnlyDirs(glob.glob(os.path.join(folderNames[i], "*_output_*")))) + storesListPath = list(np.concatenate(storesListPath).flatten()) + op = get_all_stores_for_combining_data(storesListPath) + writeToFile(str((len(op) + len(op) + 1) * 10) + "\n" + str(10) + "\n") + for i in range(len(op)): + storesList = np.asarray([[], []]) + for j in range(len(op[i])): + storesList = np.concatenate( + ( + storesList, + np.genfromtxt(os.path.join(op[i][j], "storesList.csv"), dtype="str", delimiter=",").reshape(2, -1), + ), + axis=1, + ) + storesList = np.unique(storesList, axis=1) + for k in range(storesList.shape[1]): + storenamePsth(op[i][0], storesList[1, k], inputParameters) + findPSTHPeakAndArea(op[i][0], storesList[1, k], inputParameters) + computeCrossCorrelation(op[i][0], storesList[1, k], inputParameters) + writeToFile(str(10 + ((inputParameters["step"] + 1) * 10)) + "\n") + inputParameters["step"] += 1 + + +def execute_average_for_group(inputParameters): + folderNamesForAvg = inputParameters["folderNamesForAvg"] + if len(folderNamesForAvg) == 0: + logger.error("Not a single folder name is provided in folderNamesForAvg in inputParamters File.") + raise Exception("Not a single folder name is provided in folderNamesForAvg in inputParamters File.") + + storesListPath = [] + for i in range(len(folderNamesForAvg)): + filepath = folderNamesForAvg[i] + storesListPath.append(takeOnlyDirs(glob.glob(os.path.join(filepath, "*_output_*")))) + storesListPath = np.concatenate(storesListPath) + storesList = np.asarray([[], []]) + for i in range(storesListPath.shape[0]): + storesList = np.concatenate( + ( + storesList, + np.genfromtxt(os.path.join(storesListPath[i], "storesList.csv"), dtype="str", delimiter=",").reshape( + 2, -1 + ), + ), + axis=1, + ) + storesList = np.unique(storesList, axis=1) + op = makeAverageDir(inputParameters["abspath"]) + np.savetxt(os.path.join(op, "storesList.csv"), storesList, delimiter=",", fmt="%s") + pbMaxValue = 0 + for j in range(storesList.shape[1]): + if "control" in storesList[1, j].lower() or "signal" in storesList[1, j].lower(): + continue + else: + pbMaxValue += 1 + writeToFile(str((1 + pbMaxValue + 1) * 10) + "\n" + str(10) + "\n") + for k in range(storesList.shape[1]): + if "control" in storesList[1, k].lower() or "signal" in storesList[1, k].lower(): + continue + else: + averageForGroup(storesListPath, storesList[1, k], inputParameters) + writeToFile(str(10 + ((inputParameters["step"] + 1) * 10)) + "\n") + inputParameters["step"] += 1 + + def main(input_parameters): try: inputParameters = psthForEachStorename(input_parameters) From c835aa8b1d7572f140b7d5205eed7a5d66812bb8 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 21 Jan 2026 15:49:25 -0800 Subject: [PATCH 132/150] Reorganzied execute_psth fns into separate modules --- .../cross_correlation.py} | 162 +++---- src/guppy/analysis/io_utils.py | 21 + src/guppy/analysis/psth_peak_and_area.py | 119 +++++ src/guppy/analysis/storename_psth.py | 323 +++++++++++++ src/guppy/combineDataFn.py | 341 -------------- src/guppy/computePsth.py | 442 +----------------- 6 files changed, 540 insertions(+), 868 deletions(-) rename src/guppy/{computeCorr.py => analysis/cross_correlation.py} 
(86%) create mode 100644 src/guppy/analysis/psth_peak_and_area.py create mode 100644 src/guppy/analysis/storename_psth.py delete mode 100755 src/guppy/combineDataFn.py diff --git a/src/guppy/computeCorr.py b/src/guppy/analysis/cross_correlation.py similarity index 86% rename from src/guppy/computeCorr.py rename to src/guppy/analysis/cross_correlation.py index 9070b43..43d0a10 100644 --- a/src/guppy/computeCorr.py +++ b/src/guppy/analysis/cross_correlation.py @@ -4,47 +4,85 @@ import os import re -import h5py import numpy as np import pandas as pd from scipy import signal -logger = logging.getLogger(__name__) +from .io_utils import make_dir_for_cross_correlation, read_Df, read_hdf5 +logger = logging.getLogger(__name__) -def make_dir(filepath): - op = os.path.join(filepath, "cross_correlation_output") - if not os.path.exists(op): - os.mkdir(op) - return op +def computeCrossCorrelation(filepath, event, inputParameters): + isCompute = inputParameters["computeCorr"] + removeArtifacts = inputParameters["removeArtifacts"] + artifactsRemovalMethod = inputParameters["artifactsRemovalMethod"] + if isCompute == True: + if removeArtifacts == True and artifactsRemovalMethod == "concatenate": + raise Exception( + "For cross-correlation, when removeArtifacts is True, artifacts removal method\ + should be replace with NaNs and not concatenate" + ) + corr_info, type = getCorrCombinations(filepath, inputParameters) + if "control" in event.lower() or "signal" in event.lower(): + return + else: + for i in range(1, len(corr_info)): + logger.debug(f"Computing cross-correlation for event {event}...") + for j in range(len(type)): + psth_a = read_Df(filepath, event + "_" + corr_info[i - 1], type[j] + "_" + corr_info[i - 1]) + psth_b = read_Df(filepath, event + "_" + corr_info[i], type[j] + "_" + corr_info[i]) + sample_rate = 1 / (psth_a["timestamps"][1] - psth_a["timestamps"][0]) + psth_a = psth_a.drop(columns=["timestamps", "err", "mean"]) + psth_b = psth_b.drop(columns=["timestamps", "err", "mean"]) + cols_a, cols_b = np.array(psth_a.columns), np.array(psth_b.columns) + if np.intersect1d(cols_a, cols_b).size > 0: + cols = list(np.intersect1d(cols_a, cols_b)) + else: + cols = list(cols_a) + arr_A, arr_B = np.array(psth_a).T, np.array(psth_b).T + cross_corr = helperCrossCorrelation(arr_A, arr_B, sample_rate) + cols.append("timestamps") + create_Df( + make_dir_for_cross_correlation(filepath), + "corr_" + event, + type[j] + "_" + corr_info[i - 1] + "_" + corr_info[i], + cross_corr, + cols, + ) + logger.info(f"Cross-correlation for event {event} computed.") -# function to read hdf5 file -def read_hdf5(event, filepath, key): - if event: - op = os.path.join(filepath, event + ".hdf5") - else: - op = filepath - if os.path.exists(op): - with h5py.File(op, "r") as f: - arr = np.asarray(f[key]) +def getCorrCombinations(filepath, inputParameters): + selectForComputePsth = inputParameters["selectForComputePsth"] + if selectForComputePsth == "z_score": + path = glob.glob(os.path.join(filepath, "z_score_*")) + elif selectForComputePsth == "dff": + path = glob.glob(os.path.join(filepath, "dff_*")) else: - logger.error(f"{event}.hdf5 file does not exist") - raise Exception("{}.hdf5 file does not exist".format(event)) + path = glob.glob(os.path.join(filepath, "z_score_*")) + glob.glob(os.path.join(filepath, "dff_*")) - return arr + names = list() + type = list() + for i in range(len(path)): + basename = (os.path.basename(path[i])).split(".")[0] + names.append(basename.split("_")[-1]) + 
type.append((os.path.basename(path[i])).split(".")[0].split("_" + names[-1], 1)[0]) + names = list(np.unique(np.array(names))) + type = list(np.unique(np.array(type))) -# function to read h5 file and make a dataframe from it -def read_Df(filepath, event, name): - if name: - op = os.path.join(filepath, event + "_{}.h5".format(name)) + corr_info = list() + if len(names) <= 1: + logger.info("Cross-correlation cannot be computed because only one signal is present.") + return corr_info, type + elif len(names) == 2: + corr_info = names else: - op = os.path.join(filepath, event + ".h5") - df = pd.read_hdf(op, key="df", mode="r") + corr_info = names + corr_info.append(names[0]) - return df + return corr_info, type # same function used to store PSTH in computePsth file @@ -91,38 +129,6 @@ def create_Df(filepath, event, name, psth, columns=[]): df.to_hdf(op, key="df", mode="w") -def getCorrCombinations(filepath, inputParameters): - selectForComputePsth = inputParameters["selectForComputePsth"] - if selectForComputePsth == "z_score": - path = glob.glob(os.path.join(filepath, "z_score_*")) - elif selectForComputePsth == "dff": - path = glob.glob(os.path.join(filepath, "dff_*")) - else: - path = glob.glob(os.path.join(filepath, "z_score_*")) + glob.glob(os.path.join(filepath, "dff_*")) - - names = list() - type = list() - for i in range(len(path)): - basename = (os.path.basename(path[i])).split(".")[0] - names.append(basename.split("_")[-1]) - type.append((os.path.basename(path[i])).split(".")[0].split("_" + names[-1], 1)[0]) - - names = list(np.unique(np.array(names))) - type = list(np.unique(np.array(type))) - - corr_info = list() - if len(names) <= 1: - logger.info("Cross-correlation cannot be computed because only one signal is present.") - return corr_info, type - elif len(names) == 2: - corr_info = names - else: - corr_info = names - corr_info.append(names[0]) - - return corr_info, type - - def helperCrossCorrelation(arr_A, arr_B, sample_rate): cross_corr = list() for a, b in zip(arr_A, arr_B): @@ -139,43 +145,3 @@ def helperCrossCorrelation(arr_A, arr_B, sample_rate): lag_msec = lag_msec.reshape(1, -1) cross_corr_arr = np.concatenate((cross_corr_arr, lag_msec), axis=0) return cross_corr_arr - - -def computeCrossCorrelation(filepath, event, inputParameters): - isCompute = inputParameters["computeCorr"] - removeArtifacts = inputParameters["removeArtifacts"] - artifactsRemovalMethod = inputParameters["artifactsRemovalMethod"] - if isCompute == True: - if removeArtifacts == True and artifactsRemovalMethod == "concatenate": - raise Exception( - "For cross-correlation, when removeArtifacts is True, artifacts removal method\ - should be replace with NaNs and not concatenate" - ) - corr_info, type = getCorrCombinations(filepath, inputParameters) - if "control" in event.lower() or "signal" in event.lower(): - return - else: - for i in range(1, len(corr_info)): - logger.debug(f"Computing cross-correlation for event {event}...") - for j in range(len(type)): - psth_a = read_Df(filepath, event + "_" + corr_info[i - 1], type[j] + "_" + corr_info[i - 1]) - psth_b = read_Df(filepath, event + "_" + corr_info[i], type[j] + "_" + corr_info[i]) - sample_rate = 1 / (psth_a["timestamps"][1] - psth_a["timestamps"][0]) - psth_a = psth_a.drop(columns=["timestamps", "err", "mean"]) - psth_b = psth_b.drop(columns=["timestamps", "err", "mean"]) - cols_a, cols_b = np.array(psth_a.columns), np.array(psth_b.columns) - if np.intersect1d(cols_a, cols_b).size > 0: - cols = list(np.intersect1d(cols_a, cols_b)) - else: - cols = 
list(cols_a) - arr_A, arr_B = np.array(psth_a).T, np.array(psth_b).T - cross_corr = helperCrossCorrelation(arr_A, arr_B, sample_rate) - cols.append("timestamps") - create_Df( - make_dir(filepath), - "corr_" + event, - type[j] + "_" + corr_info[i - 1] + "_" + corr_info[i], - cross_corr, - cols, - ) - logger.info(f"Cross-correlation for event {event} computed.") diff --git a/src/guppy/analysis/io_utils.py b/src/guppy/analysis/io_utils.py index b467c37..c1dd39f 100644 --- a/src/guppy/analysis/io_utils.py +++ b/src/guppy/analysis/io_utils.py @@ -6,6 +6,7 @@ import h5py import numpy as np +import pandas as pd logger = logging.getLogger(__name__) @@ -194,3 +195,23 @@ def get_control_and_signal_channel_names(storesList): raise Exception("Error in saving stores list file or spelling mistake for control or signal") return channels_arr + + +# function to read h5 file and make a dataframe from it +def read_Df(filepath, event, name): + event = event.replace("\\", "_") + event = event.replace("/", "_") + if name: + op = os.path.join(filepath, event + "_{}.h5".format(name)) + else: + op = os.path.join(filepath, event + ".h5") + df = pd.read_hdf(op, key="df", mode="r") + + return df + + +def make_dir_for_cross_correlation(filepath): + op = os.path.join(filepath, "cross_correlation_output") + if not os.path.exists(op): + os.mkdir(op) + return op diff --git a/src/guppy/analysis/psth_peak_and_area.py b/src/guppy/analysis/psth_peak_and_area.py new file mode 100644 index 0000000..849bd29 --- /dev/null +++ b/src/guppy/analysis/psth_peak_and_area.py @@ -0,0 +1,119 @@ +import glob +import logging +import os +import re +from collections import OrderedDict + +import numpy as np +import pandas as pd + +from .io_utils import read_Df, read_hdf5 + +logger = logging.getLogger(__name__) + + +# function to compute PSTH peak and area using the function helperPSTHPeakAndArea save the values to h5 and csv files. 
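Before the definition that follows, the core of the peak and area measurement is easy to state: inside each user-supplied [peak_startPoint, peak_endPoint] window on the PSTH time axis, the positive peak is the maximum of the trace, the negative peak is the minimum, and the area is a trapezoidal integral. A stripped-down, single-trace sketch with made-up numbers:

    import numpy as np

    timestamps = np.linspace(-5.0, 10.0, 1501)           # PSTH time axis (s)
    psth_mean = np.exp(-0.5 * (timestamps - 1.0) ** 2)   # made-up mean trace
    start_s, end_s = 0.0, 3.0                            # one peak window

    i0 = np.where(timestamps >= start_s)[0][0]
    i1 = np.where(timestamps >= end_s)[0][0]
    window = psth_mean[i0:i1]

    peak_pos = window.max()    # most positive excursion in the window
    peak_neg = window.min()    # most negative excursion in the window
    area = np.trapz(window)    # AUC with unit sample spacing, matching the np.trapz call below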
+def findPSTHPeakAndArea(filepath, event, inputParameters): + + event = event.replace("\\", "_") + event = event.replace("/", "_") + + # sampling_rate = read_hdf5(storesList[0,0], filepath, 'sampling_rate') + peak_startPoint = inputParameters["peak_startPoint"] + peak_endPoint = inputParameters["peak_endPoint"] + selectForComputePsth = inputParameters["selectForComputePsth"] + + if selectForComputePsth == "z_score": + path = glob.glob(os.path.join(filepath, "z_score_*")) + elif selectForComputePsth == "dff": + path = glob.glob(os.path.join(filepath, "dff_*")) + else: + path = glob.glob(os.path.join(filepath, "z_score_*")) + glob.glob(os.path.join(filepath, "dff_*")) + + if "control" in event.lower() or "signal" in event.lower(): + return 0 + else: + for i in range(len(path)): + logger.info(f"Computing peak and area for PSTH mean signal for event {event}...") + basename = (os.path.basename(path[i])).split(".")[0] + name_1 = basename.split("_")[-1] + sampling_rate = read_hdf5("timeCorrection_" + name_1, filepath, "sampling_rate")[0] + psth = read_Df(filepath, event + "_" + name_1, basename) + cols = list(psth.columns) + regex = re.compile("bin_[(]") + bin_names = [cols[i] for i in range(len(cols)) if regex.match(cols[i])] + regex_trials = re.compile("[+-]?([0-9]*[.])?[0-9]+") + trials_names = [cols[i] for i in range(len(cols)) if regex_trials.match(cols[i])] + psth_mean_bin_names = trials_names + bin_names + ["mean"] + psth_mean_bin_mean = np.asarray(psth[psth_mean_bin_names]) + timestamps = np.asarray(psth["timestamps"]).ravel() # np.asarray(read_Df(filepath, 'ts_psth', '')).ravel() + peak_area = helperPSTHPeakAndArea( + psth_mean_bin_mean, timestamps, sampling_rate, peak_startPoint, peak_endPoint + ) # peak, area = + # arr = np.array([[peak, area]]) + fileName = [os.path.basename(os.path.dirname(filepath))] + index = [fileName[0] + "_" + s for s in psth_mean_bin_names] + create_Df_area_peak( + filepath, peak_area, event + "_" + name_1 + "_" + basename, index=index + ) # columns=['peak', 'area'] + create_csv_area_peak(filepath, peak_area, event + "_" + name_1 + "_" + basename, index=index) + logger.info(f"Peak and Area for PSTH mean signal for event {event} computed.") + + +def helperPSTHPeakAndArea(psth_mean, timestamps, sampling_rate, peak_startPoint, peak_endPoint): + + peak_startPoint = np.asarray(peak_startPoint) + peak_endPoint = np.asarray(peak_endPoint) + + peak_startPoint = peak_startPoint[~np.isnan(peak_startPoint)] + peak_endPoint = peak_endPoint[~np.isnan(peak_endPoint)] + + if peak_startPoint.shape[0] != peak_endPoint.shape[0]: + logger.error("Number of Peak Start Time and Peak End Time are unequal.") + raise Exception("Number of Peak Start Time and Peak End Time are unequal.") + + if np.less_equal(peak_endPoint, peak_startPoint).any() == True: + logger.error( + "Peak End Time is lesser than or equal to Peak Start Time. Please check the Peak parameters window." + ) + raise Exception( + "Peak End Time is lesser than or equal to Peak Start Time. Please check the Peak parameters window." 
+ ) + + peak_area = OrderedDict() + + if peak_startPoint.shape[0] == 0 or peak_endPoint.shape[0] == 0: + peak_area["peak"] = np.nan + peak_area["area"] = np.nan + + for i in range(peak_startPoint.shape[0]): + startPtForPeak = np.where(timestamps >= peak_startPoint[i])[0] + endPtForPeak = np.where(timestamps >= peak_endPoint[i])[0] + if len(startPtForPeak) >= 1 and len(endPtForPeak) >= 1: + peakPoint_pos = startPtForPeak[0] + np.argmax(psth_mean[startPtForPeak[0] : endPtForPeak[0], :], axis=0) + peakPoint_neg = startPtForPeak[0] + np.argmin(psth_mean[startPtForPeak[0] : endPtForPeak[0], :], axis=0) + peak_area["peak_pos_" + str(i + 1)] = np.amax(psth_mean[peakPoint_pos], axis=0) + peak_area["peak_neg_" + str(i + 1)] = np.amin(psth_mean[peakPoint_neg], axis=0) + peak_area["area_" + str(i + 1)] = np.trapz(psth_mean[startPtForPeak[0] : endPtForPeak[0], :], axis=0) + else: + peak_area["peak_" + str(i + 1)] = np.nan + peak_area["area_" + str(i + 1)] = np.nan + + return peak_area + + +def create_Df_area_peak(filepath, arr, name, index=[]): + + op = os.path.join(filepath, "peak_AUC_" + name + ".h5") + dirname = os.path.dirname(filepath) + + df = pd.DataFrame(arr, index=index) + + df.to_hdf(op, key="df", mode="w") + + +def create_csv_area_peak(filepath, arr, name, index=[]): + op = os.path.join(filepath, "peak_AUC_" + name + ".csv") + df = pd.DataFrame(arr, index=index) + + df.to_csv(op) diff --git a/src/guppy/analysis/storename_psth.py b/src/guppy/analysis/storename_psth.py new file mode 100644 index 0000000..db99057 --- /dev/null +++ b/src/guppy/analysis/storename_psth.py @@ -0,0 +1,323 @@ +import glob +import logging +import math +import os +import re + +import numpy as np +import pandas as pd +from scipy import signal as ss + +from .io_utils import read_hdf5, write_hdf5 + +logger = logging.getLogger(__name__) + + +# function to create PSTH for each event using function helper_psth and save the PSTH to h5 file +def storenamePsth(filepath, event, inputParameters): + + event = event.replace("\\", "_") + event = event.replace("/", "_") + + selectForComputePsth = inputParameters["selectForComputePsth"] + bin_psth_trials = inputParameters["bin_psth_trials"] + use_time_or_trials = inputParameters["use_time_or_trials"] + + if selectForComputePsth == "z_score": + path = glob.glob(os.path.join(filepath, "z_score_*")) + elif selectForComputePsth == "dff": + path = glob.glob(os.path.join(filepath, "dff_*")) + else: + path = glob.glob(os.path.join(filepath, "z_score_*")) + glob.glob(os.path.join(filepath, "dff_*")) + + b = np.divide(np.ones((100,)), 100) + a = 1 + + # storesList = storesList + # sampling_rate = read_hdf5(storesList[0,0], filepath, 'sampling_rate') + nSecPrev, nSecPost = inputParameters["nSecPrev"], inputParameters["nSecPost"] + baselineStart, baselineEnd = inputParameters["baselineCorrectionStart"], inputParameters["baselineCorrectionEnd"] + timeInterval = inputParameters["timeInterval"] + + if "control" in event.lower() or "signal" in event.lower(): + return 0 + else: + for i in range(len(path)): + logger.info(f"Computing PSTH for event {event}...") + basename = (os.path.basename(path[i])).split(".")[0] + name_1 = basename.split("_")[-1] + control = read_hdf5("control_" + name_1, os.path.dirname(path[i]), "data") + if (control == 0).all() == True: + signal = read_hdf5("signal_" + name_1, os.path.dirname(path[i]), "data") + z_score = ss.filtfilt(b, a, signal) + just_use_signal = True + else: + z_score = read_hdf5("", path[i], "data") + just_use_signal = False + psth, 
psth_baselineUncorrected, cols = helper_psth( + z_score, + event, + filepath, + nSecPrev, + nSecPost, + timeInterval, + bin_psth_trials, + use_time_or_trials, + baselineStart, + baselineEnd, + name_1, + just_use_signal, + ) + + create_Df( + filepath, + event + "_" + name_1 + "_baselineUncorrected", + basename, + psth_baselineUncorrected, + columns=cols, + ) # extra + create_Df(filepath, event + "_" + name_1, basename, psth, columns=cols) + logger.info(f"PSTH for event {event} computed.") + + +# *********************************** Functions used by storenamePsth *********************************** # + + +# helper function to make PSTH for each event +def helper_psth( + z_score, + event, + filepath, + nSecPrev, + nSecPost, + timeInterval, + bin_psth_trials, + use_time_or_trials, + baselineStart, + baselineEnd, + naming, + just_use_signal, +): + + event = event.replace("\\", "_") + event = event.replace("/", "_") + + sampling_rate = read_hdf5("timeCorrection_" + naming, filepath, "sampling_rate")[0] + + # calculate time before event timestamp and time after event timestamp + nTsPrev = int(round(nSecPrev * sampling_rate)) + nTsPost = int(round(nSecPost * sampling_rate)) + + totalTs = (-1 * nTsPrev) + nTsPost + increment = ((-1 * nSecPrev) + nSecPost) / totalTs + timeAxis = np.linspace(nSecPrev, nSecPost + increment, totalTs + 1) + timeAxisNew = np.concatenate((timeAxis, timeAxis[::-1])) + + # avoid writing same data to same file in multi-processing + # if not os.path.exists(os.path.join(filepath, 'ts_psth.h5')): + # logger.info('file not exists') + # create_Df(filepath, 'ts_psth', '', timeAxis) + # time.sleep(2) + + ts = read_hdf5(event + "_" + naming, filepath, "ts") + + # reject timestamps for which baseline cannot be calculated because of nan values + new_ts = [] + for i in range(ts.shape[0]): + thisTime = ts[i] # -1 not needed anymore + if thisTime < abs(baselineStart): + continue + else: + new_ts.append(ts[i]) + + # reject burst of timestamps + ts = np.asarray(new_ts) + # skip the event if there are no TTLs + if len(ts) == 0: + new_ts = np.array([]) + logger.info(f"Warning : No TTLs present for {event}. 
This will cause an error in Visualization step") + else: + new_ts = [ts[0]] + for i in range(1, ts.shape[0]): + thisTime = ts[i] + prevTime = new_ts[-1] + diff = thisTime - prevTime + if diff < timeInterval: + continue + else: + new_ts.append(ts[i]) + + # final timestamps + ts = np.asarray(new_ts) + nTs = ts.shape[0] + + # initialize PSTH vector + psth = np.full((nTs, totalTs + 1), np.nan) + psth_baselineUncorrected = np.full((nTs, totalTs + 1), np.nan) # extra + + # for each timestamp, create trial which will be saved in a PSTH vector + for i in range(nTs): + thisTime = ts[i] # -timeForLightsTurnOn + thisIndex = int(round(thisTime * sampling_rate)) + arr = rowFormation(z_score, thisIndex, -1 * nTsPrev, nTsPost) + if just_use_signal == True: + res = np.subtract(arr, np.nanmean(arr)) + z_score_arr = np.divide(res, np.nanstd(arr)) + arr = z_score_arr + else: + arr = arr + + psth_baselineUncorrected[i, :] = arr # extra + psth[i, :] = baselineCorrection(filepath, arr, timeAxis, baselineStart, baselineEnd) + + write_hdf5(ts, event + "_" + naming, filepath, "ts") + columns = list(ts) + + if use_time_or_trials == "Time (min)" and bin_psth_trials > 0: + timestamps = read_hdf5("timeCorrection_" + naming, filepath, "timestampNew") + timestamps = np.divide(timestamps, 60) + ts_min = np.divide(ts, 60) + bin_steps = np.arange(timestamps[0], timestamps[-1] + bin_psth_trials, bin_psth_trials) + indices_each_step = dict() + for i in range(1, bin_steps.shape[0]): + indices_each_step[f"{np.around(bin_steps[i-1],0)}-{np.around(bin_steps[i],0)}"] = np.where( + (ts_min >= bin_steps[i - 1]) & (ts_min <= bin_steps[i]) + )[0] + elif use_time_or_trials == "# of trials" and bin_psth_trials > 0: + bin_steps = np.arange(0, ts.shape[0], bin_psth_trials) + if bin_steps[-1] < ts.shape[0]: + bin_steps = np.concatenate((bin_steps, [ts.shape[0]]), axis=0) + indices_each_step = dict() + for i in range(1, bin_steps.shape[0]): + indices_each_step[f"{bin_steps[i-1]}-{bin_steps[i]}"] = np.arange(bin_steps[i - 1], bin_steps[i]) + else: + indices_each_step = dict() + + psth_bin, psth_bin_baselineUncorrected = [], [] + if indices_each_step: + keys = list(indices_each_step.keys()) + for k in keys: + # no trials in a given bin window, just put all the nan values + if indices_each_step[k].shape[0] == 0: + psth_bin.append(np.full(psth.shape[1], np.nan)) + psth_bin_baselineUncorrected.append(np.full(psth_baselineUncorrected.shape[1], np.nan)) + psth_bin.append(np.full(psth.shape[1], np.nan)) + psth_bin_baselineUncorrected.append(np.full(psth_baselineUncorrected.shape[1], np.nan)) + else: + index = indices_each_step[k] + arr = psth[index, :] + # mean of bins + psth_bin.append(np.nanmean(psth[index, :], axis=0)) + psth_bin_baselineUncorrected.append(np.nanmean(psth_baselineUncorrected[index, :], axis=0)) + psth_bin.append(np.nanstd(psth[index, :], axis=0) / math.sqrt(psth[index, :].shape[0])) + # error of bins + psth_bin_baselineUncorrected.append( + np.nanstd(psth_baselineUncorrected[index, :], axis=0) + / math.sqrt(psth_baselineUncorrected[index, :].shape[0]) + ) + + # adding column names + columns.append(f"bin_({k})") + columns.append(f"bin_err_({k})") + + psth = np.concatenate((psth, psth_bin), axis=0) + psth_baselineUncorrected = np.concatenate((psth_baselineUncorrected, psth_bin_baselineUncorrected), axis=0) + + timeAxis = timeAxis.reshape(1, -1) + psth = np.concatenate((psth, timeAxis), axis=0) + psth_baselineUncorrected = np.concatenate((psth_baselineUncorrected, timeAxis), axis=0) + columns.append("timestamps") + + return psth, 
psth_baselineUncorrected, columns + + +# function to create dataframe for each event PSTH and save it to h5 file +def create_Df(filepath, event, name, psth, columns=[]): + event = event.replace("\\", "_") + event = event.replace("/", "_") + if name: + op = os.path.join(filepath, event + "_{}.h5".format(name)) + else: + op = os.path.join(filepath, event + ".h5") + + # check if file already exists + # if os.path.exists(op): + # return 0 + + # removing psth binned trials + columns = np.array(columns, dtype="str") + regex = re.compile("bin_*") + single_trials = columns[[i for i in range(len(columns)) if not regex.match(columns[i])]] + single_trials_index = [i for i in range(len(single_trials)) if single_trials[i] != "timestamps"] + + psth = psth.T + if psth.ndim > 1: + mean = np.nanmean(psth[:, single_trials_index], axis=1).reshape(-1, 1) + err = np.nanstd(psth[:, single_trials_index], axis=1) / math.sqrt(psth[:, single_trials_index].shape[1]) + err = err.reshape(-1, 1) + psth = np.hstack((psth, mean)) + psth = np.hstack((psth, err)) + # timestamps = np.asarray(read_Df(filepath, 'ts_psth', '')) + # psth = np.hstack((psth, timestamps)) + try: + ts = read_hdf5(event, filepath, "ts") + ts = np.append(ts, ["mean", "err"]) + except: + ts = None + + if len(columns) == 0: + df = pd.DataFrame(psth, index=None, columns=ts, dtype="float32") + else: + columns = np.asarray(columns) + columns = np.append(columns, ["mean", "err"]) + df = pd.DataFrame(psth, index=None, columns=list(columns), dtype="float32") + + df.to_hdf(op, key="df", mode="w") + + +# ***************************** Functions used by helper_psth ***************************** # + + +# function to create PSTH trials corresponding to each event timestamp +def rowFormation(z_score, thisIndex, nTsPrev, nTsPost): + + if nTsPrev < thisIndex and z_score.shape[0] > (thisIndex + nTsPost): + res = z_score[thisIndex - nTsPrev - 1 : thisIndex + nTsPost] + elif nTsPrev >= thisIndex and z_score.shape[0] > (thisIndex + nTsPost): + mismatch = nTsPrev - thisIndex + 1 + res = np.zeros(nTsPrev + nTsPost + 1) + res[:mismatch] = np.nan + res[mismatch:] = z_score[: thisIndex + nTsPost] + elif nTsPrev >= thisIndex and z_score.shape[0] < (thisIndex + nTsPost): + mismatch1 = nTsPrev - thisIndex + 1 + mismatch2 = (thisIndex + nTsPost) - z_score.shape[0] + res1 = np.full(mismatch1, np.nan) + res2 = z_score + res3 = np.full(mismatch2, np.nan) + res = np.concatenate((res1, np.concatenate((res2, res3)))) + else: + mismatch = (thisIndex + nTsPost) - z_score.shape[0] + res1 = np.zeros(mismatch) + res1[:] = np.nan + res2 = z_score[thisIndex - nTsPrev - 1 : z_score.shape[0]] + res = np.concatenate((res2, res1)) + + return res + + +# function to calculate baseline for each PSTH trial and do baseline correction +def baselineCorrection(filepath, arr, timeAxis, baselineStart, baselineEnd): + + # timeAxis = read_Df(filepath, 'ts_psth', '') + # timeAxis = np.asarray(timeAxis).reshape(-1) + baselineStrtPt = np.where(timeAxis >= baselineStart)[0] + baselineEndPt = np.where(timeAxis >= baselineEnd)[0] + + # logger.info(baselineStrtPt[0], baselineEndPt[0]) + if baselineStart == 0 and baselineEnd == 0: + return arr + + baseline = np.nanmean(arr[baselineStrtPt[0] : baselineEndPt[0]]) + baselineSub = np.subtract(arr, baseline) + + return baselineSub diff --git a/src/guppy/combineDataFn.py b/src/guppy/combineDataFn.py deleted file mode 100755 index 51e2bd0..0000000 --- a/src/guppy/combineDataFn.py +++ /dev/null @@ -1,341 +0,0 @@ -import fnmatch -import logging -import os -import re - 
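A minimal standalone sketch of the baseline-correction step defined just above (baselineCorrection); the trial values, time axis, and baseline window below are toy assumptions for illustration, not GuPPy defaults.

import numpy as np

# Toy trial and time axis; the window bounds are assumed example values.
timeAxis = np.linspace(-5, 5, 101)                    # seconds around the event
trial = np.random.default_rng(0).normal(1.0, 0.1, timeAxis.size)
baselineStart, baselineEnd = -5, -2

# Same logic as baselineCorrection above: subtract the mean of the baseline window.
startPt = np.where(timeAxis >= baselineStart)[0][0]
endPt = np.where(timeAxis >= baselineEnd)[0][0]
baseline = np.nanmean(trial[startPt:endPt])
corrected = trial - baseline
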
-logger = logging.getLogger(__name__) - - -def find_files(path, glob_path, ignore_case=False): - rule = ( - re.compile(fnmatch.translate(glob_path), re.IGNORECASE) - if ignore_case - else re.compile(fnmatch.translate(glob_path)) - ) - no_bytes_path = os.listdir(os.path.expanduser(path)) - str_path = [] - - # converting byte object to string - for x in no_bytes_path: - try: - str_path.append(x.decode("utf-8")) - except: - str_path.append(x) - - return [os.path.join(path, n) for n in str_path if rule.match(n)] - - -def read_hdf5(event, filepath, key): - if event: - op = os.path.join(filepath, event + ".hdf5") - else: - op = filepath - - if os.path.exists(op): - with h5py.File(op, "r") as f: - arr = np.asarray(f[key]) - else: - raise Exception("{}.hdf5 file does not exist".format(event)) - - return arr - - -def write_hdf5(data, event, filepath, key): - op = os.path.join(filepath, event + ".hdf5") - - if not os.path.exists(op): - with h5py.File(op, "w") as f: - if type(data) is np.ndarray: - f.create_dataset(key, data=data, maxshape=(None,), chunks=True) - else: - f.create_dataset(key, data=data) - else: - with h5py.File(op, "r+") as f: - if key in list(f.keys()): - if type(data) is np.ndarray: - f[key].resize(data.shape) - arr = f[key] - arr[:] = data - else: - arr = f[key] - arr = data - else: - f.create_dataset(key, data=data, maxshape=(None,), chunks=True) - - -def decide_naming_convention(filepath): - path_1 = find_files(filepath, "control*", ignore_case=True) # glob.glob(os.path.join(filepath, 'control*')) - - path_2 = find_files(filepath, "signal*", ignore_case=True) # glob.glob(os.path.join(filepath, 'signal*')) - - path = sorted(path_1 + path_2, key=str.casefold) - - if len(path) % 2 != 0: - raise Exception("There are not equal number of Control and Signal data") - - path = np.asarray(path).reshape(2, -1) - - return path - - -def eliminateData(filepath, timeForLightsTurnOn, event, sampling_rate, naming): - - arr = np.array([]) - ts_arr = np.array([]) - for i in range(len(filepath)): - ts = read_hdf5("timeCorrection_" + naming, filepath[i], "timestampNew") - data = read_hdf5(event, filepath[i], "data").reshape(-1) - - # index = np.where((ts>coords[i,0]) & (tscoords[i,0]) & (ts 1: - mean = np.nanmean(psth[:, single_trials_index], axis=1).reshape(-1, 1) - err = np.nanstd(psth[:, single_trials_index], axis=1) / math.sqrt(psth[:, single_trials_index].shape[1]) - err = err.reshape(-1, 1) - psth = np.hstack((psth, mean)) - psth = np.hstack((psth, err)) - # timestamps = np.asarray(read_Df(filepath, 'ts_psth', '')) - # psth = np.hstack((psth, timestamps)) - try: - ts = read_hdf5(event, filepath, "ts") - ts = np.append(ts, ["mean", "err"]) - except: - ts = None - - if len(columns) == 0: - df = pd.DataFrame(psth, index=None, columns=ts, dtype="float32") - else: - columns = np.asarray(columns) - columns = np.append(columns, ["mean", "err"]) - df = pd.DataFrame(psth, index=None, columns=list(columns), dtype="float32") - - df.to_hdf(op, key="df", mode="w") - - -# function to read h5 file and make a dataframe from it -def read_Df(filepath, event, name): - event = event.replace("\\", "_") - event = event.replace("/", "_") - if name: - op = os.path.join(filepath, event + "_{}.h5".format(name)) - else: - op = os.path.join(filepath, event + ".h5") - df = pd.read_hdf(op, key="df", mode="r") - - return df - - -# function to create PSTH trials corresponding to each event timestamp -def rowFormation(z_score, thisIndex, nTsPrev, nTsPost): - - if nTsPrev < thisIndex and z_score.shape[0] > (thisIndex + 
nTsPost): - res = z_score[thisIndex - nTsPrev - 1 : thisIndex + nTsPost] - elif nTsPrev >= thisIndex and z_score.shape[0] > (thisIndex + nTsPost): - mismatch = nTsPrev - thisIndex + 1 - res = np.zeros(nTsPrev + nTsPost + 1) - res[:mismatch] = np.nan - res[mismatch:] = z_score[: thisIndex + nTsPost] - elif nTsPrev >= thisIndex and z_score.shape[0] < (thisIndex + nTsPost): - mismatch1 = nTsPrev - thisIndex + 1 - mismatch2 = (thisIndex + nTsPost) - z_score.shape[0] - res1 = np.full(mismatch1, np.nan) - res2 = z_score - res3 = np.full(mismatch2, np.nan) - res = np.concatenate((res1, np.concatenate((res2, res3)))) - else: - mismatch = (thisIndex + nTsPost) - z_score.shape[0] - res1 = np.zeros(mismatch) - res1[:] = np.nan - res2 = z_score[thisIndex - nTsPrev - 1 : z_score.shape[0]] - res = np.concatenate((res2, res1)) - - return res - - -# function to calculate baseline for each PSTH trial and do baseline correction -def baselineCorrection(filepath, arr, timeAxis, baselineStart, baselineEnd): - - # timeAxis = read_Df(filepath, 'ts_psth', '') - # timeAxis = np.asarray(timeAxis).reshape(-1) - baselineStrtPt = np.where(timeAxis >= baselineStart)[0] - baselineEndPt = np.where(timeAxis >= baselineEnd)[0] - - # logger.info(baselineStrtPt[0], baselineEndPt[0]) - if baselineStart == 0 and baselineEnd == 0: - return arr - - baseline = np.nanmean(arr[baselineStrtPt[0] : baselineEndPt[0]]) - baselineSub = np.subtract(arr, baseline) - - return baselineSub - - -# helper function to make PSTH for each event -def helper_psth( - z_score, - event, - filepath, - nSecPrev, - nSecPost, - timeInterval, - bin_psth_trials, - use_time_or_trials, - baselineStart, - baselineEnd, - naming, - just_use_signal, -): - - event = event.replace("\\", "_") - event = event.replace("/", "_") - - sampling_rate = read_hdf5("timeCorrection_" + naming, filepath, "sampling_rate")[0] - - # calculate time before event timestamp and time after event timestamp - nTsPrev = int(round(nSecPrev * sampling_rate)) - nTsPost = int(round(nSecPost * sampling_rate)) - - totalTs = (-1 * nTsPrev) + nTsPost - increment = ((-1 * nSecPrev) + nSecPost) / totalTs - timeAxis = np.linspace(nSecPrev, nSecPost + increment, totalTs + 1) - timeAxisNew = np.concatenate((timeAxis, timeAxis[::-1])) - - # avoid writing same data to same file in multi-processing - # if not os.path.exists(os.path.join(filepath, 'ts_psth.h5')): - # logger.info('file not exists') - # create_Df(filepath, 'ts_psth', '', timeAxis) - # time.sleep(2) - - ts = read_hdf5(event + "_" + naming, filepath, "ts") - - # reject timestamps for which baseline cannot be calculated because of nan values - new_ts = [] - for i in range(ts.shape[0]): - thisTime = ts[i] # -1 not needed anymore - if thisTime < abs(baselineStart): - continue - else: - new_ts.append(ts[i]) - - # reject burst of timestamps - ts = np.asarray(new_ts) - # skip the event if there are no TTLs - if len(ts) == 0: - new_ts = np.array([]) - logger.info(f"Warning : No TTLs present for {event}. 
This will cause an error in Visualization step") - else: - new_ts = [ts[0]] - for i in range(1, ts.shape[0]): - thisTime = ts[i] - prevTime = new_ts[-1] - diff = thisTime - prevTime - if diff < timeInterval: - continue - else: - new_ts.append(ts[i]) - - # final timestamps - ts = np.asarray(new_ts) - nTs = ts.shape[0] - - # initialize PSTH vector - psth = np.full((nTs, totalTs + 1), np.nan) - psth_baselineUncorrected = np.full((nTs, totalTs + 1), np.nan) # extra - - # for each timestamp, create trial which will be saved in a PSTH vector - for i in range(nTs): - thisTime = ts[i] # -timeForLightsTurnOn - thisIndex = int(round(thisTime * sampling_rate)) - arr = rowFormation(z_score, thisIndex, -1 * nTsPrev, nTsPost) - if just_use_signal == True: - res = np.subtract(arr, np.nanmean(arr)) - z_score_arr = np.divide(res, np.nanstd(arr)) - arr = z_score_arr - else: - arr = arr - - psth_baselineUncorrected[i, :] = arr # extra - psth[i, :] = baselineCorrection(filepath, arr, timeAxis, baselineStart, baselineEnd) - - write_hdf5(ts, event + "_" + naming, filepath, "ts") - columns = list(ts) - - if use_time_or_trials == "Time (min)" and bin_psth_trials > 0: - timestamps = read_hdf5("timeCorrection_" + naming, filepath, "timestampNew") - timestamps = np.divide(timestamps, 60) - ts_min = np.divide(ts, 60) - bin_steps = np.arange(timestamps[0], timestamps[-1] + bin_psth_trials, bin_psth_trials) - indices_each_step = dict() - for i in range(1, bin_steps.shape[0]): - indices_each_step[f"{np.around(bin_steps[i-1],0)}-{np.around(bin_steps[i],0)}"] = np.where( - (ts_min >= bin_steps[i - 1]) & (ts_min <= bin_steps[i]) - )[0] - elif use_time_or_trials == "# of trials" and bin_psth_trials > 0: - bin_steps = np.arange(0, ts.shape[0], bin_psth_trials) - if bin_steps[-1] < ts.shape[0]: - bin_steps = np.concatenate((bin_steps, [ts.shape[0]]), axis=0) - indices_each_step = dict() - for i in range(1, bin_steps.shape[0]): - indices_each_step[f"{bin_steps[i-1]}-{bin_steps[i]}"] = np.arange(bin_steps[i - 1], bin_steps[i]) - else: - indices_each_step = dict() - - psth_bin, psth_bin_baselineUncorrected = [], [] - if indices_each_step: - keys = list(indices_each_step.keys()) - for k in keys: - # no trials in a given bin window, just put all the nan values - if indices_each_step[k].shape[0] == 0: - psth_bin.append(np.full(psth.shape[1], np.nan)) - psth_bin_baselineUncorrected.append(np.full(psth_baselineUncorrected.shape[1], np.nan)) - psth_bin.append(np.full(psth.shape[1], np.nan)) - psth_bin_baselineUncorrected.append(np.full(psth_baselineUncorrected.shape[1], np.nan)) - else: - index = indices_each_step[k] - arr = psth[index, :] - # mean of bins - psth_bin.append(np.nanmean(psth[index, :], axis=0)) - psth_bin_baselineUncorrected.append(np.nanmean(psth_baselineUncorrected[index, :], axis=0)) - psth_bin.append(np.nanstd(psth[index, :], axis=0) / math.sqrt(psth[index, :].shape[0])) - # error of bins - psth_bin_baselineUncorrected.append( - np.nanstd(psth_baselineUncorrected[index, :], axis=0) - / math.sqrt(psth_baselineUncorrected[index, :].shape[0]) - ) - - # adding column names - columns.append(f"bin_({k})") - columns.append(f"bin_err_({k})") - - psth = np.concatenate((psth, psth_bin), axis=0) - psth_baselineUncorrected = np.concatenate((psth_baselineUncorrected, psth_bin_baselineUncorrected), axis=0) - - timeAxis = timeAxis.reshape(1, -1) - psth = np.concatenate((psth, timeAxis), axis=0) - psth_baselineUncorrected = np.concatenate((psth_baselineUncorrected, timeAxis), axis=0) - columns.append("timestamps") - - return psth, 
psth_baselineUncorrected, columns - - -# function to create PSTH for each event using function helper_psth and save the PSTH to h5 file -def storenamePsth(filepath, event, inputParameters): - - event = event.replace("\\", "_") - event = event.replace("/", "_") - - selectForComputePsth = inputParameters["selectForComputePsth"] - bin_psth_trials = inputParameters["bin_psth_trials"] - use_time_or_trials = inputParameters["use_time_or_trials"] - - if selectForComputePsth == "z_score": - path = glob.glob(os.path.join(filepath, "z_score_*")) - elif selectForComputePsth == "dff": - path = glob.glob(os.path.join(filepath, "dff_*")) - else: - path = glob.glob(os.path.join(filepath, "z_score_*")) + glob.glob(os.path.join(filepath, "dff_*")) - - b = np.divide(np.ones((100,)), 100) - a = 1 - - # storesList = storesList - # sampling_rate = read_hdf5(storesList[0,0], filepath, 'sampling_rate') - nSecPrev, nSecPost = inputParameters["nSecPrev"], inputParameters["nSecPost"] - baselineStart, baselineEnd = inputParameters["baselineCorrectionStart"], inputParameters["baselineCorrectionEnd"] - timeInterval = inputParameters["timeInterval"] - - if "control" in event.lower() or "signal" in event.lower(): - return 0 - else: - for i in range(len(path)): - logger.info(f"Computing PSTH for event {event}...") - basename = (os.path.basename(path[i])).split(".")[0] - name_1 = basename.split("_")[-1] - control = read_hdf5("control_" + name_1, os.path.dirname(path[i]), "data") - if (control == 0).all() == True: - signal = read_hdf5("signal_" + name_1, os.path.dirname(path[i]), "data") - z_score = ss.filtfilt(b, a, signal) - just_use_signal = True - else: - z_score = read_hdf5("", path[i], "data") - just_use_signal = False - psth, psth_baselineUncorrected, cols = helper_psth( - z_score, - event, - filepath, - nSecPrev, - nSecPost, - timeInterval, - bin_psth_trials, - use_time_or_trials, - baselineStart, - baselineEnd, - name_1, - just_use_signal, - ) - - create_Df( - filepath, - event + "_" + name_1 + "_baselineUncorrected", - basename, - psth_baselineUncorrected, - columns=cols, - ) # extra - create_Df(filepath, event + "_" + name_1, basename, psth, columns=cols) - logger.info(f"PSTH for event {event} computed.") - - -def helperPSTHPeakAndArea(psth_mean, timestamps, sampling_rate, peak_startPoint, peak_endPoint): - - peak_startPoint = np.asarray(peak_startPoint) - peak_endPoint = np.asarray(peak_endPoint) - - peak_startPoint = peak_startPoint[~np.isnan(peak_startPoint)] - peak_endPoint = peak_endPoint[~np.isnan(peak_endPoint)] - - if peak_startPoint.shape[0] != peak_endPoint.shape[0]: - logger.error("Number of Peak Start Time and Peak End Time are unequal.") - raise Exception("Number of Peak Start Time and Peak End Time are unequal.") - - if np.less_equal(peak_endPoint, peak_startPoint).any() == True: - logger.error( - "Peak End Time is lesser than or equal to Peak Start Time. Please check the Peak parameters window." - ) - raise Exception( - "Peak End Time is lesser than or equal to Peak Start Time. Please check the Peak parameters window." 
- ) - - peak_area = OrderedDict() - - if peak_startPoint.shape[0] == 0 or peak_endPoint.shape[0] == 0: - peak_area["peak"] = np.nan - peak_area["area"] = np.nan - - for i in range(peak_startPoint.shape[0]): - startPtForPeak = np.where(timestamps >= peak_startPoint[i])[0] - endPtForPeak = np.where(timestamps >= peak_endPoint[i])[0] - if len(startPtForPeak) >= 1 and len(endPtForPeak) >= 1: - peakPoint_pos = startPtForPeak[0] + np.argmax(psth_mean[startPtForPeak[0] : endPtForPeak[0], :], axis=0) - peakPoint_neg = startPtForPeak[0] + np.argmin(psth_mean[startPtForPeak[0] : endPtForPeak[0], :], axis=0) - peak_area["peak_pos_" + str(i + 1)] = np.amax(psth_mean[peakPoint_pos], axis=0) - peak_area["peak_neg_" + str(i + 1)] = np.amin(psth_mean[peakPoint_neg], axis=0) - peak_area["area_" + str(i + 1)] = np.trapz(psth_mean[startPtForPeak[0] : endPtForPeak[0], :], axis=0) - else: - peak_area["peak_" + str(i + 1)] = np.nan - peak_area["area_" + str(i + 1)] = np.nan - - return peak_area - - -# function to compute PSTH peak and area using the function helperPSTHPeakAndArea save the values to h5 and csv files. -def findPSTHPeakAndArea(filepath, event, inputParameters): - - event = event.replace("\\", "_") - event = event.replace("/", "_") - - # sampling_rate = read_hdf5(storesList[0,0], filepath, 'sampling_rate') - peak_startPoint = inputParameters["peak_startPoint"] - peak_endPoint = inputParameters["peak_endPoint"] - selectForComputePsth = inputParameters["selectForComputePsth"] - - if selectForComputePsth == "z_score": - path = glob.glob(os.path.join(filepath, "z_score_*")) - elif selectForComputePsth == "dff": - path = glob.glob(os.path.join(filepath, "dff_*")) - else: - path = glob.glob(os.path.join(filepath, "z_score_*")) + glob.glob(os.path.join(filepath, "dff_*")) - - if "control" in event.lower() or "signal" in event.lower(): - return 0 - else: - for i in range(len(path)): - logger.info(f"Computing peak and area for PSTH mean signal for event {event}...") - basename = (os.path.basename(path[i])).split(".")[0] - name_1 = basename.split("_")[-1] - sampling_rate = read_hdf5("timeCorrection_" + name_1, filepath, "sampling_rate")[0] - psth = read_Df(filepath, event + "_" + name_1, basename) - cols = list(psth.columns) - regex = re.compile("bin_[(]") - bin_names = [cols[i] for i in range(len(cols)) if regex.match(cols[i])] - regex_trials = re.compile("[+-]?([0-9]*[.])?[0-9]+") - trials_names = [cols[i] for i in range(len(cols)) if regex_trials.match(cols[i])] - psth_mean_bin_names = trials_names + bin_names + ["mean"] - psth_mean_bin_mean = np.asarray(psth[psth_mean_bin_names]) - timestamps = np.asarray(psth["timestamps"]).ravel() # np.asarray(read_Df(filepath, 'ts_psth', '')).ravel() - peak_area = helperPSTHPeakAndArea( - psth_mean_bin_mean, timestamps, sampling_rate, peak_startPoint, peak_endPoint - ) # peak, area = - # arr = np.array([[peak, area]]) - fileName = [os.path.basename(os.path.dirname(filepath))] - index = [fileName[0] + "_" + s for s in psth_mean_bin_names] - create_Df_area_peak( - filepath, peak_area, event + "_" + name_1 + "_" + basename, index=index - ) # columns=['peak', 'area'] - create_csv_area_peak(filepath, peak_area, event + "_" + name_1 + "_" + basename, index=index) - logger.info(f"Peak and Area for PSTH mean signal for event {event} computed.") - - def makeAverageDir(filepath): op = os.path.join(filepath, "average") @@ -655,7 +235,11 @@ def averageForGroup(folderNames, event, inputParameters): corr = np.concatenate((corr, timestamps), axis=0) columns.append("timestamps") 
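For reference, a stripped-down sketch of the peak/area computation removed above (helperPSTHPeakAndArea), reduced to a single mean trace and one start/end window; the trace and window bounds are made-up example values.

import numpy as np

timestamps = np.linspace(-5, 5, 101)
psth_mean = np.exp(-((timestamps - 1.0) ** 2))        # toy PSTH mean trace
peak_start, peak_end = 0.0, 3.0                       # assumed window, cf. peak_startPoint/peak_endPoint

start = np.where(timestamps >= peak_start)[0][0]
end = np.where(timestamps >= peak_end)[0][0]
peak_pos = np.max(psth_mean[start:end])               # cf. the peak_pos_* entries
peak_neg = np.min(psth_mean[start:end])               # cf. the peak_neg_* entries
area = np.trapz(psth_mean[start:end])                 # cf. area_*; integrated over sample index, as above
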
create_Df( - make_dir(op), "corr_" + event, type[i] + "_" + corr_info[k - 1] + "_" + corr_info[k], corr, columns=columns + make_dir_for_cross_correlation(op), + "corr_" + event, + type[i] + "_" + corr_info[k - 1] + "_" + corr_info[k], + corr, + columns=columns, ) logger.info("Group of data averaged.") From d182dc206a4dae86700183410216d1a959055656 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 21 Jan 2026 16:08:12 -0800 Subject: [PATCH 133/150] Reorganzied execute_psth fns into separate modules --- src/guppy/analysis/io_utils.py | 9 ++ src/guppy/analysis/storename_psth.py | 47 +----- src/guppy/computePsth.py | 218 +-------------------------- 3 files changed, 14 insertions(+), 260 deletions(-) diff --git a/src/guppy/analysis/io_utils.py b/src/guppy/analysis/io_utils.py index c1dd39f..742ab3b 100644 --- a/src/guppy/analysis/io_utils.py +++ b/src/guppy/analysis/io_utils.py @@ -215,3 +215,12 @@ def make_dir_for_cross_correlation(filepath): if not os.path.exists(op): os.mkdir(op) return op + + +def makeAverageDir(filepath): + + op = os.path.join(filepath, "average") + if not os.path.exists(op): + os.mkdir(op) + + return op diff --git a/src/guppy/analysis/storename_psth.py b/src/guppy/analysis/storename_psth.py index db99057..d582572 100644 --- a/src/guppy/analysis/storename_psth.py +++ b/src/guppy/analysis/storename_psth.py @@ -2,13 +2,12 @@ import logging import math import os -import re import numpy as np -import pandas as pd from scipy import signal as ss from .io_utils import read_hdf5, write_hdf5 +from .psth_utils import create_Df logger = logging.getLogger(__name__) @@ -231,50 +230,6 @@ def helper_psth( return psth, psth_baselineUncorrected, columns -# function to create dataframe for each event PSTH and save it to h5 file -def create_Df(filepath, event, name, psth, columns=[]): - event = event.replace("\\", "_") - event = event.replace("/", "_") - if name: - op = os.path.join(filepath, event + "_{}.h5".format(name)) - else: - op = os.path.join(filepath, event + ".h5") - - # check if file already exists - # if os.path.exists(op): - # return 0 - - # removing psth binned trials - columns = np.array(columns, dtype="str") - regex = re.compile("bin_*") - single_trials = columns[[i for i in range(len(columns)) if not regex.match(columns[i])]] - single_trials_index = [i for i in range(len(single_trials)) if single_trials[i] != "timestamps"] - - psth = psth.T - if psth.ndim > 1: - mean = np.nanmean(psth[:, single_trials_index], axis=1).reshape(-1, 1) - err = np.nanstd(psth[:, single_trials_index], axis=1) / math.sqrt(psth[:, single_trials_index].shape[1]) - err = err.reshape(-1, 1) - psth = np.hstack((psth, mean)) - psth = np.hstack((psth, err)) - # timestamps = np.asarray(read_Df(filepath, 'ts_psth', '')) - # psth = np.hstack((psth, timestamps)) - try: - ts = read_hdf5(event, filepath, "ts") - ts = np.append(ts, ["mean", "err"]) - except: - ts = None - - if len(columns) == 0: - df = pd.DataFrame(psth, index=None, columns=ts, dtype="float32") - else: - columns = np.asarray(columns) - columns = np.append(columns, ["mean", "err"]) - df = pd.DataFrame(psth, index=None, columns=list(columns), dtype="float32") - - df.to_hdf(op, key="df", mode="w") - - # ***************************** Functions used by helper_psth ***************************** # diff --git a/src/guppy/computePsth.py b/src/guppy/computePsth.py index a9b4415..4d12240 100755 --- a/src/guppy/computePsth.py +++ b/src/guppy/computePsth.py @@ -3,25 +3,21 @@ import glob import json import logging -import math import multiprocessing as mp 
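A small usage sketch for the helper relocated by this reorganization; the import path follows the new guppy.analysis layout shown in the diff headers, and the output directory is a placeholder, not a GuPPy convention.

from guppy.analysis.io_utils import makeAverageDir

# makeAverageDir (moved into io_utils above) creates an "average" subfolder on demand
# and returns its path.
average_dir = makeAverageDir("/data/session1")
print(average_dir)   # -> /data/session1/average
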
import os -import re import subprocess import sys from itertools import repeat import numpy as np -import pandas as pd -from .analysis.cross_correlation import computeCrossCorrelation, getCorrCombinations +from .analysis.cross_correlation import computeCrossCorrelation from .analysis.io_utils import ( get_all_stores_for_combining_data, - make_dir_for_cross_correlation, - write_hdf5, + makeAverageDir, ) -from .analysis.psth_peak_and_area import findPSTHPeakAndArea, read_Df -from .analysis.storename_psth import create_Df, storenamePsth +from .analysis.psth_peak_and_area import findPSTHPeakAndArea +from .analysis.storename_psth import storenamePsth logger = logging.getLogger(__name__) @@ -39,212 +35,6 @@ def writeToFile(value: str): file.write(value) -def read_Df_area_peak(filepath, name): - op = os.path.join(filepath, "peak_AUC_" + name + ".h5") - df = pd.read_hdf(op, key="df", mode="r") - - return df - - -def makeAverageDir(filepath): - - op = os.path.join(filepath, "average") - if not os.path.exists(op): - os.mkdir(op) - - return op - - -def psth_shape_check(psth): - - each_ln = [] - for i in range(len(psth)): - each_ln.append(psth[i].shape[0]) - - each_ln = np.asarray(each_ln) - keep_ln = each_ln[-1] - - for i in range(len(psth)): - if psth[i].shape[0] > keep_ln: - psth[i] = psth[i][:keep_ln] - elif psth[i].shape[0] < keep_ln: - psth[i] = np.append(psth[i], np.full(keep_ln - len(psth[i]), np.nan)) - else: - psth[i] = psth[i] - - return psth - - -# function to compute average of group of recordings -def averageForGroup(folderNames, event, inputParameters): - - event = event.replace("\\", "_") - event = event.replace("/", "_") - - logger.debug("Averaging group of data...") - path = [] - abspath = inputParameters["abspath"] - selectForComputePsth = inputParameters["selectForComputePsth"] - path_temp_len = [] - op = makeAverageDir(abspath) - - # combining paths to all the selected folders for doing average - for i in range(len(folderNames)): - if selectForComputePsth == "z_score": - path_temp = glob.glob(os.path.join(folderNames[i], "z_score_*")) - elif selectForComputePsth == "dff": - path_temp = glob.glob(os.path.join(folderNames[i], "dff_*")) - else: - path_temp = glob.glob(os.path.join(folderNames[i], "z_score_*")) + glob.glob( - os.path.join(folderNames[i], "dff_*") - ) - - path_temp_len.append(len(path_temp)) - # path_temp = glob.glob(os.path.join(folderNames[i], 'z_score_*')) - for j in range(len(path_temp)): - basename = (os.path.basename(path_temp[j])).split(".")[0] - write_hdf5(np.array([]), basename, op, "data") - name_1 = basename.split("_")[-1] - temp = [folderNames[i], event + "_" + name_1, basename] - path.append(temp) - - # processing of all the paths - path_temp_len = np.asarray(path_temp_len) - max_len = np.argmax(path_temp_len) - - naming = [] - for i in range(len(path)): - naming.append(path[i][2]) - naming = np.unique(np.asarray(naming)) - - new_path = [[] for _ in range(path_temp_len[max_len])] - for i in range(len(path)): - idx = np.where(naming == path[i][2])[0][0] - new_path[idx].append(path[i]) - - # read PSTH for each event and make the average of it. Save the final output to an average folder. 
- for i in range(len(new_path)): - psth, psth_bins = [], [] - columns = [] - bins_cols = [] - temp_path = new_path[i] - for j in range(len(temp_path)): - # logger.info(os.path.join(temp_path[j][0], temp_path[j][1]+'_{}.h5'.format(temp_path[j][2]))) - if not os.path.exists(os.path.join(temp_path[j][0], temp_path[j][1] + "_{}.h5".format(temp_path[j][2]))): - continue - else: - df = read_Df(temp_path[j][0], temp_path[j][1], temp_path[j][2]) # filepath, event, name - cols = list(df.columns) - regex = re.compile("bin_[(]") - bins_cols = [cols[i] for i in range(len(cols)) if regex.match(cols[i])] - psth.append(np.asarray(df["mean"])) - columns.append(os.path.basename(temp_path[j][0])) - if len(bins_cols) > 0: - psth_bins.append(df[bins_cols]) - - if len(psth) == 0: - logger.warning("Something is wrong with the file search pattern.") - continue - - if len(bins_cols) > 0: - df_bins = pd.concat(psth_bins, axis=1) - df_bins_mean = df_bins.groupby(by=df_bins.columns, axis=1).mean() - df_bins_err = df_bins.groupby(by=df_bins.columns, axis=1).std() / math.sqrt(df_bins.shape[1]) - cols_err = list(df_bins_err.columns) - dict_err = {} - for i in cols_err: - split = i.split("_") - dict_err[i] = "{}_err_{}".format(split[0], split[1]) - df_bins_err = df_bins_err.rename(columns=dict_err) - columns = columns + list(df_bins_mean.columns) + list(df_bins_err.columns) - df_bins_mean_err = pd.concat([df_bins_mean, df_bins_err], axis=1).T - psth, df_bins_mean_err = np.asarray(psth), np.asarray(df_bins_mean_err) - psth = np.concatenate((psth, df_bins_mean_err), axis=0) - else: - psth = psth_shape_check(psth) - psth = np.asarray(psth) - - timestamps = np.asarray(df["timestamps"]).reshape(1, -1) - psth = np.concatenate((psth, timestamps), axis=0) - columns = columns + ["timestamps"] - create_Df(op, temp_path[j][1], temp_path[j][2], psth, columns=columns) - - # read PSTH peak and area for each event and combine them. Save the final output to an average folder - for i in range(len(new_path)): - arr = [] - index = [] - temp_path = new_path[i] - for j in range(len(temp_path)): - if not os.path.exists( - os.path.join(temp_path[j][0], "peak_AUC_" + temp_path[j][1] + "_" + temp_path[j][2] + ".h5") - ): - continue - else: - df = read_Df_area_peak(temp_path[j][0], temp_path[j][1] + "_" + temp_path[j][2]) - arr.append(df) - index.append(list(df.index)) - - if len(arr) == 0: - logger.warning("Something is wrong with the file search pattern.") - continue - index = list(np.concatenate(index)) - new_df = pd.concat(arr, axis=0) # os.path.join(filepath, 'peak_AUC_'+name+'.csv') - new_df.to_csv(os.path.join(op, "peak_AUC_{}_{}.csv".format(temp_path[j][1], temp_path[j][2])), index=index) - new_df.to_hdf( - os.path.join(op, "peak_AUC_{}_{}.h5".format(temp_path[j][1], temp_path[j][2])), - key="df", - mode="w", - index=index, - ) - - # read cross-correlation files and combine them. 
Save the final output to an average folder - type = [] - for i in range(len(folderNames)): - _, temp_type = getCorrCombinations(folderNames[i], inputParameters) - type.append(temp_type) - - type = np.unique(np.array(type)) - for i in range(len(type)): - corr = [] - columns = [] - df = None - for j in range(len(folderNames)): - corr_info, _ = getCorrCombinations(folderNames[j], inputParameters) - for k in range(1, len(corr_info)): - path = os.path.join( - folderNames[j], - "cross_correlation_output", - "corr_" + event + "_" + type[i] + "_" + corr_info[k - 1] + "_" + corr_info[k], - ) - if not os.path.exists(path + ".h5"): - continue - else: - df = read_Df( - os.path.join(folderNames[j], "cross_correlation_output"), - "corr_" + event, - type[i] + "_" + corr_info[k - 1] + "_" + corr_info[k], - ) - corr.append(df["mean"]) - columns.append(os.path.basename(folderNames[j])) - - if not isinstance(df, pd.DataFrame): - break - - corr = np.array(corr) - timestamps = np.array(df["timestamps"]).reshape(1, -1) - corr = np.concatenate((corr, timestamps), axis=0) - columns.append("timestamps") - create_Df( - make_dir_for_cross_correlation(op), - "corr_" + event, - type[i] + "_" + corr_info[k - 1] + "_" + corr_info[k], - corr, - columns=columns, - ) - - logger.info("Group of data averaged.") - - def psthForEachStorename(inputParameters): logger.info("Computing PSTH, Peak and Area for each event...") From 8dd4042a127c132c127f099c428856cb26052de9 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 21 Jan 2026 16:18:38 -0800 Subject: [PATCH 134/150] Reorganzied execute_psth fns into separate modules --- src/guppy/analysis/psth_average.py | 216 +++++++++++++++++++++++++++ src/guppy/analysis/psth_utils.py | 55 +++++++ src/guppy/analysis/storename_psth.py | 92 +++++------- 3 files changed, 312 insertions(+), 51 deletions(-) create mode 100644 src/guppy/analysis/psth_average.py create mode 100644 src/guppy/analysis/psth_utils.py diff --git a/src/guppy/analysis/psth_average.py b/src/guppy/analysis/psth_average.py new file mode 100644 index 0000000..b539419 --- /dev/null +++ b/src/guppy/analysis/psth_average.py @@ -0,0 +1,216 @@ +import glob +import logging +import math +import os +import re + +import numpy as np +import pandas as pd + +from .cross_correlation import getCorrCombinations +from .io_utils import ( + make_dir_for_cross_correlation, + makeAverageDir, + read_Df, + write_hdf5, +) +from .psth_utils import create_Df + +logger = logging.getLogger(__name__) + + +# function to compute average of group of recordings +def averageForGroup(folderNames, event, inputParameters): + + event = event.replace("\\", "_") + event = event.replace("/", "_") + + logger.debug("Averaging group of data...") + path = [] + abspath = inputParameters["abspath"] + selectForComputePsth = inputParameters["selectForComputePsth"] + path_temp_len = [] + op = makeAverageDir(abspath) + + # combining paths to all the selected folders for doing average + for i in range(len(folderNames)): + if selectForComputePsth == "z_score": + path_temp = glob.glob(os.path.join(folderNames[i], "z_score_*")) + elif selectForComputePsth == "dff": + path_temp = glob.glob(os.path.join(folderNames[i], "dff_*")) + else: + path_temp = glob.glob(os.path.join(folderNames[i], "z_score_*")) + glob.glob( + os.path.join(folderNames[i], "dff_*") + ) + + path_temp_len.append(len(path_temp)) + # path_temp = glob.glob(os.path.join(folderNames[i], 'z_score_*')) + for j in range(len(path_temp)): + basename = (os.path.basename(path_temp[j])).split(".")[0] + 
write_hdf5(np.array([]), basename, op, "data") + name_1 = basename.split("_")[-1] + temp = [folderNames[i], event + "_" + name_1, basename] + path.append(temp) + + # processing of all the paths + path_temp_len = np.asarray(path_temp_len) + max_len = np.argmax(path_temp_len) + + naming = [] + for i in range(len(path)): + naming.append(path[i][2]) + naming = np.unique(np.asarray(naming)) + + new_path = [[] for _ in range(path_temp_len[max_len])] + for i in range(len(path)): + idx = np.where(naming == path[i][2])[0][0] + new_path[idx].append(path[i]) + + # read PSTH for each event and make the average of it. Save the final output to an average folder. + for i in range(len(new_path)): + psth, psth_bins = [], [] + columns = [] + bins_cols = [] + temp_path = new_path[i] + for j in range(len(temp_path)): + # logger.info(os.path.join(temp_path[j][0], temp_path[j][1]+'_{}.h5'.format(temp_path[j][2]))) + if not os.path.exists(os.path.join(temp_path[j][0], temp_path[j][1] + "_{}.h5".format(temp_path[j][2]))): + continue + else: + df = read_Df(temp_path[j][0], temp_path[j][1], temp_path[j][2]) # filepath, event, name + cols = list(df.columns) + regex = re.compile("bin_[(]") + bins_cols = [cols[i] for i in range(len(cols)) if regex.match(cols[i])] + psth.append(np.asarray(df["mean"])) + columns.append(os.path.basename(temp_path[j][0])) + if len(bins_cols) > 0: + psth_bins.append(df[bins_cols]) + + if len(psth) == 0: + logger.warning("Something is wrong with the file search pattern.") + continue + + if len(bins_cols) > 0: + df_bins = pd.concat(psth_bins, axis=1) + df_bins_mean = df_bins.groupby(by=df_bins.columns, axis=1).mean() + df_bins_err = df_bins.groupby(by=df_bins.columns, axis=1).std() / math.sqrt(df_bins.shape[1]) + cols_err = list(df_bins_err.columns) + dict_err = {} + for i in cols_err: + split = i.split("_") + dict_err[i] = "{}_err_{}".format(split[0], split[1]) + df_bins_err = df_bins_err.rename(columns=dict_err) + columns = columns + list(df_bins_mean.columns) + list(df_bins_err.columns) + df_bins_mean_err = pd.concat([df_bins_mean, df_bins_err], axis=1).T + psth, df_bins_mean_err = np.asarray(psth), np.asarray(df_bins_mean_err) + psth = np.concatenate((psth, df_bins_mean_err), axis=0) + else: + psth = psth_shape_check(psth) + psth = np.asarray(psth) + + timestamps = np.asarray(df["timestamps"]).reshape(1, -1) + psth = np.concatenate((psth, timestamps), axis=0) + columns = columns + ["timestamps"] + create_Df(op, temp_path[j][1], temp_path[j][2], psth, columns=columns) + + # read PSTH peak and area for each event and combine them. Save the final output to an average folder + for i in range(len(new_path)): + arr = [] + index = [] + temp_path = new_path[i] + for j in range(len(temp_path)): + if not os.path.exists( + os.path.join(temp_path[j][0], "peak_AUC_" + temp_path[j][1] + "_" + temp_path[j][2] + ".h5") + ): + continue + else: + df = read_Df_area_peak(temp_path[j][0], temp_path[j][1] + "_" + temp_path[j][2]) + arr.append(df) + index.append(list(df.index)) + + if len(arr) == 0: + logger.warning("Something is wrong with the file search pattern.") + continue + index = list(np.concatenate(index)) + new_df = pd.concat(arr, axis=0) # os.path.join(filepath, 'peak_AUC_'+name+'.csv') + new_df.to_csv(os.path.join(op, "peak_AUC_{}_{}.csv".format(temp_path[j][1], temp_path[j][2])), index=index) + new_df.to_hdf( + os.path.join(op, "peak_AUC_{}_{}.h5".format(temp_path[j][1], temp_path[j][2])), + key="df", + mode="w", + index=index, + ) + + # read cross-correlation files and combine them. 
Save the final output to an average folder + type = [] + for i in range(len(folderNames)): + _, temp_type = getCorrCombinations(folderNames[i], inputParameters) + type.append(temp_type) + + type = np.unique(np.array(type)) + for i in range(len(type)): + corr = [] + columns = [] + df = None + for j in range(len(folderNames)): + corr_info, _ = getCorrCombinations(folderNames[j], inputParameters) + for k in range(1, len(corr_info)): + path = os.path.join( + folderNames[j], + "cross_correlation_output", + "corr_" + event + "_" + type[i] + "_" + corr_info[k - 1] + "_" + corr_info[k], + ) + if not os.path.exists(path + ".h5"): + continue + else: + df = read_Df( + os.path.join(folderNames[j], "cross_correlation_output"), + "corr_" + event, + type[i] + "_" + corr_info[k - 1] + "_" + corr_info[k], + ) + corr.append(df["mean"]) + columns.append(os.path.basename(folderNames[j])) + + if not isinstance(df, pd.DataFrame): + break + + corr = np.array(corr) + timestamps = np.array(df["timestamps"]).reshape(1, -1) + corr = np.concatenate((corr, timestamps), axis=0) + columns.append("timestamps") + create_Df( + make_dir_for_cross_correlation(op), + "corr_" + event, + type[i] + "_" + corr_info[k - 1] + "_" + corr_info[k], + corr, + columns=columns, + ) + + logger.info("Group of data averaged.") + + +def psth_shape_check(psth): + + each_ln = [] + for i in range(len(psth)): + each_ln.append(psth[i].shape[0]) + + each_ln = np.asarray(each_ln) + keep_ln = each_ln[-1] + + for i in range(len(psth)): + if psth[i].shape[0] > keep_ln: + psth[i] = psth[i][:keep_ln] + elif psth[i].shape[0] < keep_ln: + psth[i] = np.append(psth[i], np.full(keep_ln - len(psth[i]), np.nan)) + else: + psth[i] = psth[i] + + return psth + + +def read_Df_area_peak(filepath, name): + op = os.path.join(filepath, "peak_AUC_" + name + ".h5") + df = pd.read_hdf(op, key="df", mode="r") + + return df diff --git a/src/guppy/analysis/psth_utils.py b/src/guppy/analysis/psth_utils.py new file mode 100644 index 0000000..13b2479 --- /dev/null +++ b/src/guppy/analysis/psth_utils.py @@ -0,0 +1,55 @@ +import logging +import math +import os +import re + +import numpy as np +import pandas as pd + +from .io_utils import read_hdf5 + +logger = logging.getLogger(__name__) + + +# function to create dataframe for each event PSTH and save it to h5 file +def create_Df(filepath, event, name, psth, columns=[]): + event = event.replace("\\", "_") + event = event.replace("/", "_") + if name: + op = os.path.join(filepath, event + "_{}.h5".format(name)) + else: + op = os.path.join(filepath, event + ".h5") + + # check if file already exists + # if os.path.exists(op): + # return 0 + + # removing psth binned trials + columns = np.array(columns, dtype="str") + regex = re.compile("bin_*") + single_trials = columns[[i for i in range(len(columns)) if not regex.match(columns[i])]] + single_trials_index = [i for i in range(len(single_trials)) if single_trials[i] != "timestamps"] + + psth = psth.T + if psth.ndim > 1: + mean = np.nanmean(psth[:, single_trials_index], axis=1).reshape(-1, 1) + err = np.nanstd(psth[:, single_trials_index], axis=1) / math.sqrt(psth[:, single_trials_index].shape[1]) + err = err.reshape(-1, 1) + psth = np.hstack((psth, mean)) + psth = np.hstack((psth, err)) + # timestamps = np.asarray(read_Df(filepath, 'ts_psth', '')) + # psth = np.hstack((psth, timestamps)) + try: + ts = read_hdf5(event, filepath, "ts") + ts = np.append(ts, ["mean", "err"]) + except: + ts = None + + if len(columns) == 0: + df = pd.DataFrame(psth, index=None, columns=ts, dtype="float32") + 
else: + columns = np.asarray(columns) + columns = np.append(columns, ["mean", "err"]) + df = pd.DataFrame(psth, index=None, columns=list(columns), dtype="float32") + + df.to_hdf(op, key="df", mode="w") diff --git a/src/guppy/analysis/storename_psth.py b/src/guppy/analysis/storename_psth.py index d582572..33d5657 100644 --- a/src/guppy/analysis/storename_psth.py +++ b/src/guppy/analysis/storename_psth.py @@ -17,10 +17,15 @@ def storenamePsth(filepath, event, inputParameters): event = event.replace("\\", "_") event = event.replace("/", "_") + if "control" in event.lower() or "signal" in event.lower(): + return 0 selectForComputePsth = inputParameters["selectForComputePsth"] bin_psth_trials = inputParameters["bin_psth_trials"] use_time_or_trials = inputParameters["use_time_or_trials"] + nSecPrev, nSecPost = inputParameters["nSecPrev"], inputParameters["nSecPost"] + baselineStart, baselineEnd = inputParameters["baselineCorrectionStart"], inputParameters["baselineCorrectionEnd"] + timeInterval = inputParameters["timeInterval"] if selectForComputePsth == "z_score": path = glob.glob(os.path.join(filepath, "z_score_*")) @@ -32,54 +37,42 @@ def storenamePsth(filepath, event, inputParameters): b = np.divide(np.ones((100,)), 100) a = 1 - # storesList = storesList - # sampling_rate = read_hdf5(storesList[0,0], filepath, 'sampling_rate') - nSecPrev, nSecPost = inputParameters["nSecPrev"], inputParameters["nSecPost"] - baselineStart, baselineEnd = inputParameters["baselineCorrectionStart"], inputParameters["baselineCorrectionEnd"] - timeInterval = inputParameters["timeInterval"] - - if "control" in event.lower() or "signal" in event.lower(): - return 0 - else: - for i in range(len(path)): - logger.info(f"Computing PSTH for event {event}...") - basename = (os.path.basename(path[i])).split(".")[0] - name_1 = basename.split("_")[-1] - control = read_hdf5("control_" + name_1, os.path.dirname(path[i]), "data") - if (control == 0).all() == True: - signal = read_hdf5("signal_" + name_1, os.path.dirname(path[i]), "data") - z_score = ss.filtfilt(b, a, signal) - just_use_signal = True - else: - z_score = read_hdf5("", path[i], "data") - just_use_signal = False - psth, psth_baselineUncorrected, cols = helper_psth( - z_score, - event, - filepath, - nSecPrev, - nSecPost, - timeInterval, - bin_psth_trials, - use_time_or_trials, - baselineStart, - baselineEnd, - name_1, - just_use_signal, - ) - - create_Df( - filepath, - event + "_" + name_1 + "_baselineUncorrected", - basename, - psth_baselineUncorrected, - columns=cols, - ) # extra - create_Df(filepath, event + "_" + name_1, basename, psth, columns=cols) - logger.info(f"PSTH for event {event} computed.") - - -# *********************************** Functions used by storenamePsth *********************************** # + for i in range(len(path)): + logger.info(f"Computing PSTH for event {event}...") + basename = (os.path.basename(path[i])).split(".")[0] + name_1 = basename.split("_")[-1] + control = read_hdf5("control_" + name_1, os.path.dirname(path[i]), "data") + if (control == 0).all() == True: + signal = read_hdf5("signal_" + name_1, os.path.dirname(path[i]), "data") + z_score = ss.filtfilt(b, a, signal) + just_use_signal = True + else: + z_score = read_hdf5("", path[i], "data") + just_use_signal = False + psth, psth_baselineUncorrected, cols = helper_psth( + z_score, + event, + filepath, + nSecPrev, + nSecPost, + timeInterval, + bin_psth_trials, + use_time_or_trials, + baselineStart, + baselineEnd, + name_1, + just_use_signal, + ) + + create_Df( + filepath, + 
event + "_" + name_1 + "_baselineUncorrected", + basename, + psth_baselineUncorrected, + columns=cols, + ) # extra + create_Df(filepath, event + "_" + name_1, basename, psth, columns=cols) + logger.info(f"PSTH for event {event} computed.") # helper function to make PSTH for each event @@ -230,9 +223,6 @@ def helper_psth( return psth, psth_baselineUncorrected, columns -# ***************************** Functions used by helper_psth ***************************** # - - # function to create PSTH trials corresponding to each event timestamp def rowFormation(z_score, thisIndex, nTsPrev, nTsPost): From 2d2d4fa920ab50c92fa4bcb861b4e0314a653da4 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 21 Jan 2026 16:29:20 -0800 Subject: [PATCH 135/150] Reorganzied execute_psth fns into separate modules --- .../{storename_psth.py => compute_psth.py} | 69 +---------------- src/guppy/computePsth.py | 77 +++++++++++++++++-- 2 files changed, 73 insertions(+), 73 deletions(-) rename src/guppy/analysis/{storename_psth.py => compute_psth.py} (75%) diff --git a/src/guppy/analysis/storename_psth.py b/src/guppy/analysis/compute_psth.py similarity index 75% rename from src/guppy/analysis/storename_psth.py rename to src/guppy/analysis/compute_psth.py index 33d5657..887081d 100644 --- a/src/guppy/analysis/storename_psth.py +++ b/src/guppy/analysis/compute_psth.py @@ -1,82 +1,15 @@ -import glob import logging import math -import os import numpy as np -from scipy import signal as ss from .io_utils import read_hdf5, write_hdf5 -from .psth_utils import create_Df logger = logging.getLogger(__name__) -# function to create PSTH for each event using function helper_psth and save the PSTH to h5 file -def storenamePsth(filepath, event, inputParameters): - - event = event.replace("\\", "_") - event = event.replace("/", "_") - if "control" in event.lower() or "signal" in event.lower(): - return 0 - - selectForComputePsth = inputParameters["selectForComputePsth"] - bin_psth_trials = inputParameters["bin_psth_trials"] - use_time_or_trials = inputParameters["use_time_or_trials"] - nSecPrev, nSecPost = inputParameters["nSecPrev"], inputParameters["nSecPost"] - baselineStart, baselineEnd = inputParameters["baselineCorrectionStart"], inputParameters["baselineCorrectionEnd"] - timeInterval = inputParameters["timeInterval"] - - if selectForComputePsth == "z_score": - path = glob.glob(os.path.join(filepath, "z_score_*")) - elif selectForComputePsth == "dff": - path = glob.glob(os.path.join(filepath, "dff_*")) - else: - path = glob.glob(os.path.join(filepath, "z_score_*")) + glob.glob(os.path.join(filepath, "dff_*")) - - b = np.divide(np.ones((100,)), 100) - a = 1 - - for i in range(len(path)): - logger.info(f"Computing PSTH for event {event}...") - basename = (os.path.basename(path[i])).split(".")[0] - name_1 = basename.split("_")[-1] - control = read_hdf5("control_" + name_1, os.path.dirname(path[i]), "data") - if (control == 0).all() == True: - signal = read_hdf5("signal_" + name_1, os.path.dirname(path[i]), "data") - z_score = ss.filtfilt(b, a, signal) - just_use_signal = True - else: - z_score = read_hdf5("", path[i], "data") - just_use_signal = False - psth, psth_baselineUncorrected, cols = helper_psth( - z_score, - event, - filepath, - nSecPrev, - nSecPost, - timeInterval, - bin_psth_trials, - use_time_or_trials, - baselineStart, - baselineEnd, - name_1, - just_use_signal, - ) - - create_Df( - filepath, - event + "_" + name_1 + "_baselineUncorrected", - basename, - psth_baselineUncorrected, - columns=cols, - ) # extra - 
create_Df(filepath, event + "_" + name_1, basename, psth, columns=cols) - logger.info(f"PSTH for event {event} computed.") - - # helper function to make PSTH for each event -def helper_psth( +def compute_psth( z_score, event, filepath, diff --git a/src/guppy/computePsth.py b/src/guppy/computePsth.py index 4d12240..19f0b40 100755 --- a/src/guppy/computePsth.py +++ b/src/guppy/computePsth.py @@ -10,14 +10,18 @@ from itertools import repeat import numpy as np +from scipy import signal as ss +from .analysis.compute_psth import compute_psth from .analysis.cross_correlation import computeCrossCorrelation from .analysis.io_utils import ( get_all_stores_for_combining_data, makeAverageDir, + read_hdf5, ) +from .analysis.psth_average import averageForGroup from .analysis.psth_peak_and_area import findPSTHPeakAndArea -from .analysis.storename_psth import storenamePsth +from .analysis.psth_utils import create_Df logger = logging.getLogger(__name__) @@ -66,12 +70,75 @@ def psthForEachStorename(inputParameters): if combine_data == True: execute_psth_combined(inputParameters) else: - execute_psth(inputParameters) + orchestrate_psth(inputParameters) logger.info("PSTH, Area and Peak are computed for all events.") return inputParameters -def execute_psth(inputParameters): +# function to create PSTH for each event using function helper_psth and save the PSTH to h5 file +def execute_compute_psth(filepath, event, inputParameters): + + event = event.replace("\\", "_") + event = event.replace("/", "_") + if "control" in event.lower() or "signal" in event.lower(): + return 0 + + selectForComputePsth = inputParameters["selectForComputePsth"] + bin_psth_trials = inputParameters["bin_psth_trials"] + use_time_or_trials = inputParameters["use_time_or_trials"] + nSecPrev, nSecPost = inputParameters["nSecPrev"], inputParameters["nSecPost"] + baselineStart, baselineEnd = inputParameters["baselineCorrectionStart"], inputParameters["baselineCorrectionEnd"] + timeInterval = inputParameters["timeInterval"] + + if selectForComputePsth == "z_score": + path = glob.glob(os.path.join(filepath, "z_score_*")) + elif selectForComputePsth == "dff": + path = glob.glob(os.path.join(filepath, "dff_*")) + else: + path = glob.glob(os.path.join(filepath, "z_score_*")) + glob.glob(os.path.join(filepath, "dff_*")) + + b = np.divide(np.ones((100,)), 100) + a = 1 + + for i in range(len(path)): + logger.info(f"Computing PSTH for event {event}...") + basename = (os.path.basename(path[i])).split(".")[0] + name_1 = basename.split("_")[-1] + control = read_hdf5("control_" + name_1, os.path.dirname(path[i]), "data") + if (control == 0).all() == True: + signal = read_hdf5("signal_" + name_1, os.path.dirname(path[i]), "data") + z_score = ss.filtfilt(b, a, signal) + just_use_signal = True + else: + z_score = read_hdf5("", path[i], "data") + just_use_signal = False + psth, psth_baselineUncorrected, cols = compute_psth( + z_score, + event, + filepath, + nSecPrev, + nSecPost, + timeInterval, + bin_psth_trials, + use_time_or_trials, + baselineStart, + baselineEnd, + name_1, + just_use_signal, + ) + + create_Df( + filepath, + event + "_" + name_1 + "_baselineUncorrected", + basename, + psth_baselineUncorrected, + columns=cols, + ) # extra + create_Df(filepath, event + "_" + name_1, basename, psth, columns=cols) + logger.info(f"PSTH for event {event} computed.") + + +def orchestrate_psth(inputParameters): folderNames = inputParameters["folderNames"] numProcesses = inputParameters["numberOfCores"] storesListPath = [] @@ -89,7 +156,7 @@ def 
execute_psth(inputParameters): ) with mp.Pool(numProcesses) as p: - p.starmap(storenamePsth, zip(repeat(filepath), storesList[1, :], repeat(inputParameters))) + p.starmap(execute_compute_psth, zip(repeat(filepath), storesList[1, :], repeat(inputParameters))) with mp.Pool(numProcesses) as pq: pq.starmap(findPSTHPeakAndArea, zip(repeat(filepath), storesList[1, :], repeat(inputParameters))) @@ -126,7 +193,7 @@ def execute_psth_combined(inputParameters): ) storesList = np.unique(storesList, axis=1) for k in range(storesList.shape[1]): - storenamePsth(op[i][0], storesList[1, k], inputParameters) + execute_compute_psth(op[i][0], storesList[1, k], inputParameters) findPSTHPeakAndArea(op[i][0], storesList[1, k], inputParameters) computeCrossCorrelation(op[i][0], storesList[1, k], inputParameters) writeToFile(str(10 + ((inputParameters["step"] + 1) * 10)) + "\n") From 7929ec11f04fbe6f24aaaeebfaf51ea2806fd49e Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 21 Jan 2026 16:41:35 -0800 Subject: [PATCH 136/150] pulled read out of compute_psth --- src/guppy/analysis/compute_psth.py | 28 ++++++++-------------------- src/guppy/computePsth.py | 10 ++++++++++ 2 files changed, 18 insertions(+), 20 deletions(-) diff --git a/src/guppy/analysis/compute_psth.py b/src/guppy/analysis/compute_psth.py index 887081d..fd848ac 100644 --- a/src/guppy/analysis/compute_psth.py +++ b/src/guppy/analysis/compute_psth.py @@ -3,7 +3,7 @@ import numpy as np -from .io_utils import read_hdf5, write_hdf5 +from .io_utils import write_hdf5 logger = logging.getLogger(__name__) @@ -22,13 +22,14 @@ def compute_psth( baselineEnd, naming, just_use_signal, + sampling_rate, + ts, + corrected_timestamps, ): event = event.replace("\\", "_") event = event.replace("/", "_") - sampling_rate = read_hdf5("timeCorrection_" + naming, filepath, "sampling_rate")[0] - # calculate time before event timestamp and time after event timestamp nTsPrev = int(round(nSecPrev * sampling_rate)) nTsPost = int(round(nSecPost * sampling_rate)) @@ -38,14 +39,6 @@ def compute_psth( timeAxis = np.linspace(nSecPrev, nSecPost + increment, totalTs + 1) timeAxisNew = np.concatenate((timeAxis, timeAxis[::-1])) - # avoid writing same data to same file in multi-processing - # if not os.path.exists(os.path.join(filepath, 'ts_psth.h5')): - # logger.info('file not exists') - # create_Df(filepath, 'ts_psth', '', timeAxis) - # time.sleep(2) - - ts = read_hdf5(event + "_" + naming, filepath, "ts") - # reject timestamps for which baseline cannot be calculated because of nan values new_ts = [] for i in range(ts.shape[0]): @@ -93,16 +86,15 @@ def compute_psth( arr = arr psth_baselineUncorrected[i, :] = arr # extra - psth[i, :] = baselineCorrection(filepath, arr, timeAxis, baselineStart, baselineEnd) + psth[i, :] = baselineCorrection(arr, timeAxis, baselineStart, baselineEnd) write_hdf5(ts, event + "_" + naming, filepath, "ts") columns = list(ts) if use_time_or_trials == "Time (min)" and bin_psth_trials > 0: - timestamps = read_hdf5("timeCorrection_" + naming, filepath, "timestampNew") - timestamps = np.divide(timestamps, 60) + corrected_timestamps = np.divide(corrected_timestamps, 60) ts_min = np.divide(ts, 60) - bin_steps = np.arange(timestamps[0], timestamps[-1] + bin_psth_trials, bin_psth_trials) + bin_steps = np.arange(corrected_timestamps[0], corrected_timestamps[-1] + bin_psth_trials, bin_psth_trials) indices_each_step = dict() for i in range(1, bin_steps.shape[0]): indices_each_step[f"{np.around(bin_steps[i-1],0)}-{np.around(bin_steps[i],0)}"] = np.where( @@ -184,14 
+176,10 @@ def rowFormation(z_score, thisIndex, nTsPrev, nTsPost): # function to calculate baseline for each PSTH trial and do baseline correction -def baselineCorrection(filepath, arr, timeAxis, baselineStart, baselineEnd): - - # timeAxis = read_Df(filepath, 'ts_psth', '') - # timeAxis = np.asarray(timeAxis).reshape(-1) +def baselineCorrection(arr, timeAxis, baselineStart, baselineEnd): baselineStrtPt = np.where(timeAxis >= baselineStart)[0] baselineEndPt = np.where(timeAxis >= baselineEnd)[0] - # logger.info(baselineStrtPt[0], baselineEndPt[0]) if baselineStart == 0 and baselineEnd == 0: return arr diff --git a/src/guppy/computePsth.py b/src/guppy/computePsth.py index 19f0b40..717fdba 100755 --- a/src/guppy/computePsth.py +++ b/src/guppy/computePsth.py @@ -112,6 +112,13 @@ def execute_compute_psth(filepath, event, inputParameters): else: z_score = read_hdf5("", path[i], "data") just_use_signal = False + + sampling_rate = read_hdf5("timeCorrection_" + name_1, filepath, "sampling_rate")[0] + ts = read_hdf5(event + "_" + name_1, filepath, "ts") + if use_time_or_trials == "Time (min)" and bin_psth_trials > 0: + corrected_timestamps = read_hdf5("timeCorrection_" + name_1, filepath, "timestampNew") + else: + corrected_timestamps = None psth, psth_baselineUncorrected, cols = compute_psth( z_score, event, @@ -125,6 +132,9 @@ def execute_compute_psth(filepath, event, inputParameters): baselineEnd, name_1, just_use_signal, + sampling_rate, + ts, + corrected_timestamps, ) create_Df( From fa74a4df089e22dcd87d539f8cf7b3cb1ec83b33 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 21 Jan 2026 16:50:12 -0800 Subject: [PATCH 137/150] pulled write out of compute_psth --- src/guppy/analysis/compute_psth.py | 5 +---- src/guppy/computePsth.py | 4 +++- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/guppy/analysis/compute_psth.py b/src/guppy/analysis/compute_psth.py index fd848ac..80aa8a7 100644 --- a/src/guppy/analysis/compute_psth.py +++ b/src/guppy/analysis/compute_psth.py @@ -3,8 +3,6 @@ import numpy as np -from .io_utils import write_hdf5 - logger = logging.getLogger(__name__) @@ -88,7 +86,6 @@ def compute_psth( psth_baselineUncorrected[i, :] = arr # extra psth[i, :] = baselineCorrection(arr, timeAxis, baselineStart, baselineEnd) - write_hdf5(ts, event + "_" + naming, filepath, "ts") columns = list(ts) if use_time_or_trials == "Time (min)" and bin_psth_trials > 0: @@ -145,7 +142,7 @@ def compute_psth( psth_baselineUncorrected = np.concatenate((psth_baselineUncorrected, timeAxis), axis=0) columns.append("timestamps") - return psth, psth_baselineUncorrected, columns + return psth, psth_baselineUncorrected, columns, ts # function to create PSTH trials corresponding to each event timestamp diff --git a/src/guppy/computePsth.py b/src/guppy/computePsth.py index 717fdba..38b04df 100755 --- a/src/guppy/computePsth.py +++ b/src/guppy/computePsth.py @@ -18,6 +18,7 @@ get_all_stores_for_combining_data, makeAverageDir, read_hdf5, + write_hdf5, ) from .analysis.psth_average import averageForGroup from .analysis.psth_peak_and_area import findPSTHPeakAndArea @@ -119,7 +120,7 @@ def execute_compute_psth(filepath, event, inputParameters): corrected_timestamps = read_hdf5("timeCorrection_" + name_1, filepath, "timestampNew") else: corrected_timestamps = None - psth, psth_baselineUncorrected, cols = compute_psth( + psth, psth_baselineUncorrected, cols, ts = compute_psth( z_score, event, filepath, @@ -136,6 +137,7 @@ def execute_compute_psth(filepath, event, inputParameters): ts, corrected_timestamps, 
) + write_hdf5(ts, event + "_" + name_1, filepath, "ts") create_Df( filepath, From efab4476278104cb7489694f4aa20d0e59932912 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Thu, 22 Jan 2026 16:45:47 -0800 Subject: [PATCH 138/150] refactored findPSTHPeakAndArea --- src/guppy/analysis/psth_peak_and_area.py | 55 ++++++++++++------------ 1 file changed, 27 insertions(+), 28 deletions(-) diff --git a/src/guppy/analysis/psth_peak_and_area.py b/src/guppy/analysis/psth_peak_and_area.py index 849bd29..11b290a 100644 --- a/src/guppy/analysis/psth_peak_and_area.py +++ b/src/guppy/analysis/psth_peak_and_area.py @@ -17,6 +17,8 @@ def findPSTHPeakAndArea(filepath, event, inputParameters): event = event.replace("\\", "_") event = event.replace("/", "_") + if "control" in event.lower() or "signal" in event.lower(): + return 0 # sampling_rate = read_hdf5(storesList[0,0], filepath, 'sampling_rate') peak_startPoint = inputParameters["peak_startPoint"] @@ -30,34 +32,31 @@ def findPSTHPeakAndArea(filepath, event, inputParameters): else: path = glob.glob(os.path.join(filepath, "z_score_*")) + glob.glob(os.path.join(filepath, "dff_*")) - if "control" in event.lower() or "signal" in event.lower(): - return 0 - else: - for i in range(len(path)): - logger.info(f"Computing peak and area for PSTH mean signal for event {event}...") - basename = (os.path.basename(path[i])).split(".")[0] - name_1 = basename.split("_")[-1] - sampling_rate = read_hdf5("timeCorrection_" + name_1, filepath, "sampling_rate")[0] - psth = read_Df(filepath, event + "_" + name_1, basename) - cols = list(psth.columns) - regex = re.compile("bin_[(]") - bin_names = [cols[i] for i in range(len(cols)) if regex.match(cols[i])] - regex_trials = re.compile("[+-]?([0-9]*[.])?[0-9]+") - trials_names = [cols[i] for i in range(len(cols)) if regex_trials.match(cols[i])] - psth_mean_bin_names = trials_names + bin_names + ["mean"] - psth_mean_bin_mean = np.asarray(psth[psth_mean_bin_names]) - timestamps = np.asarray(psth["timestamps"]).ravel() # np.asarray(read_Df(filepath, 'ts_psth', '')).ravel() - peak_area = helperPSTHPeakAndArea( - psth_mean_bin_mean, timestamps, sampling_rate, peak_startPoint, peak_endPoint - ) # peak, area = - # arr = np.array([[peak, area]]) - fileName = [os.path.basename(os.path.dirname(filepath))] - index = [fileName[0] + "_" + s for s in psth_mean_bin_names] - create_Df_area_peak( - filepath, peak_area, event + "_" + name_1 + "_" + basename, index=index - ) # columns=['peak', 'area'] - create_csv_area_peak(filepath, peak_area, event + "_" + name_1 + "_" + basename, index=index) - logger.info(f"Peak and Area for PSTH mean signal for event {event} computed.") + for i in range(len(path)): + logger.info(f"Computing peak and area for PSTH mean signal for event {event}...") + basename = (os.path.basename(path[i])).split(".")[0] + name_1 = basename.split("_")[-1] + sampling_rate = read_hdf5("timeCorrection_" + name_1, filepath, "sampling_rate")[0] + psth = read_Df(filepath, event + "_" + name_1, basename) + cols = list(psth.columns) + regex = re.compile("bin_[(]") + bin_names = [cols[i] for i in range(len(cols)) if regex.match(cols[i])] + regex_trials = re.compile("[+-]?([0-9]*[.])?[0-9]+") + trials_names = [cols[i] for i in range(len(cols)) if regex_trials.match(cols[i])] + psth_mean_bin_names = trials_names + bin_names + ["mean"] + psth_mean_bin_mean = np.asarray(psth[psth_mean_bin_names]) + timestamps = np.asarray(psth["timestamps"]).ravel() # np.asarray(read_Df(filepath, 'ts_psth', '')).ravel() + peak_area = helperPSTHPeakAndArea( + 
psth_mean_bin_mean, timestamps, sampling_rate, peak_startPoint, peak_endPoint + ) # peak, area = + # arr = np.array([[peak, area]]) + fileName = [os.path.basename(os.path.dirname(filepath))] + index = [fileName[0] + "_" + s for s in psth_mean_bin_names] + create_Df_area_peak( + filepath, peak_area, event + "_" + name_1 + "_" + basename, index=index + ) # columns=['peak', 'area'] + create_csv_area_peak(filepath, peak_area, event + "_" + name_1 + "_" + basename, index=index) + logger.info(f"Peak and Area for PSTH mean signal for event {event} computed.") def helperPSTHPeakAndArea(psth_mean, timestamps, sampling_rate, peak_startPoint, peak_endPoint): From 0a004eded2d5e71038aed966a205756696c138f5 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Thu, 22 Jan 2026 16:58:27 -0800 Subject: [PATCH 139/150] reorganized findPSTHPeakAndArea --- src/guppy/analysis/psth_peak_and_area.py | 90 +++--------------------- src/guppy/analysis/standard_io.py | 18 +++++ src/guppy/computePsth.py | 61 +++++++++++++++- 3 files changed, 86 insertions(+), 83 deletions(-) diff --git a/src/guppy/analysis/psth_peak_and_area.py b/src/guppy/analysis/psth_peak_and_area.py index 11b290a..2c2c421 100644 --- a/src/guppy/analysis/psth_peak_and_area.py +++ b/src/guppy/analysis/psth_peak_and_area.py @@ -1,65 +1,12 @@ -import glob import logging -import os -import re from collections import OrderedDict import numpy as np -import pandas as pd - -from .io_utils import read_Df, read_hdf5 logger = logging.getLogger(__name__) -# function to compute PSTH peak and area using the function helperPSTHPeakAndArea save the values to h5 and csv files. -def findPSTHPeakAndArea(filepath, event, inputParameters): - - event = event.replace("\\", "_") - event = event.replace("/", "_") - if "control" in event.lower() or "signal" in event.lower(): - return 0 - - # sampling_rate = read_hdf5(storesList[0,0], filepath, 'sampling_rate') - peak_startPoint = inputParameters["peak_startPoint"] - peak_endPoint = inputParameters["peak_endPoint"] - selectForComputePsth = inputParameters["selectForComputePsth"] - - if selectForComputePsth == "z_score": - path = glob.glob(os.path.join(filepath, "z_score_*")) - elif selectForComputePsth == "dff": - path = glob.glob(os.path.join(filepath, "dff_*")) - else: - path = glob.glob(os.path.join(filepath, "z_score_*")) + glob.glob(os.path.join(filepath, "dff_*")) - - for i in range(len(path)): - logger.info(f"Computing peak and area for PSTH mean signal for event {event}...") - basename = (os.path.basename(path[i])).split(".")[0] - name_1 = basename.split("_")[-1] - sampling_rate = read_hdf5("timeCorrection_" + name_1, filepath, "sampling_rate")[0] - psth = read_Df(filepath, event + "_" + name_1, basename) - cols = list(psth.columns) - regex = re.compile("bin_[(]") - bin_names = [cols[i] for i in range(len(cols)) if regex.match(cols[i])] - regex_trials = re.compile("[+-]?([0-9]*[.])?[0-9]+") - trials_names = [cols[i] for i in range(len(cols)) if regex_trials.match(cols[i])] - psth_mean_bin_names = trials_names + bin_names + ["mean"] - psth_mean_bin_mean = np.asarray(psth[psth_mean_bin_names]) - timestamps = np.asarray(psth["timestamps"]).ravel() # np.asarray(read_Df(filepath, 'ts_psth', '')).ravel() - peak_area = helperPSTHPeakAndArea( - psth_mean_bin_mean, timestamps, sampling_rate, peak_startPoint, peak_endPoint - ) # peak, area = - # arr = np.array([[peak, area]]) - fileName = [os.path.basename(os.path.dirname(filepath))] - index = [fileName[0] + "_" + s for s in psth_mean_bin_names] - create_Df_area_peak( - filepath, 
peak_area, event + "_" + name_1 + "_" + basename, index=index - ) # columns=['peak', 'area'] - create_csv_area_peak(filepath, peak_area, event + "_" + name_1 + "_" + basename, index=index) - logger.info(f"Peak and Area for PSTH mean signal for event {event} computed.") - - -def helperPSTHPeakAndArea(psth_mean, timestamps, sampling_rate, peak_startPoint, peak_endPoint): +def compute_psth_peak_and_area(psth_mean, timestamps, sampling_rate, peak_startPoint, peak_endPoint): peak_startPoint = np.asarray(peak_startPoint) peak_endPoint = np.asarray(peak_endPoint) @@ -79,11 +26,11 @@ def helperPSTHPeakAndArea(psth_mean, timestamps, sampling_rate, peak_startPoint, "Peak End Time is lesser than or equal to Peak Start Time. Please check the Peak parameters window." ) - peak_area = OrderedDict() + peak_and_area = OrderedDict() if peak_startPoint.shape[0] == 0 or peak_endPoint.shape[0] == 0: - peak_area["peak"] = np.nan - peak_area["area"] = np.nan + peak_and_area["peak"] = np.nan + peak_and_area["area"] = np.nan for i in range(peak_startPoint.shape[0]): startPtForPeak = np.where(timestamps >= peak_startPoint[i])[0] @@ -91,28 +38,11 @@ def helperPSTHPeakAndArea(psth_mean, timestamps, sampling_rate, peak_startPoint, if len(startPtForPeak) >= 1 and len(endPtForPeak) >= 1: peakPoint_pos = startPtForPeak[0] + np.argmax(psth_mean[startPtForPeak[0] : endPtForPeak[0], :], axis=0) peakPoint_neg = startPtForPeak[0] + np.argmin(psth_mean[startPtForPeak[0] : endPtForPeak[0], :], axis=0) - peak_area["peak_pos_" + str(i + 1)] = np.amax(psth_mean[peakPoint_pos], axis=0) - peak_area["peak_neg_" + str(i + 1)] = np.amin(psth_mean[peakPoint_neg], axis=0) - peak_area["area_" + str(i + 1)] = np.trapz(psth_mean[startPtForPeak[0] : endPtForPeak[0], :], axis=0) + peak_and_area["peak_pos_" + str(i + 1)] = np.amax(psth_mean[peakPoint_pos], axis=0) + peak_and_area["peak_neg_" + str(i + 1)] = np.amin(psth_mean[peakPoint_neg], axis=0) + peak_and_area["area_" + str(i + 1)] = np.trapz(psth_mean[startPtForPeak[0] : endPtForPeak[0], :], axis=0) else: - peak_area["peak_" + str(i + 1)] = np.nan - peak_area["area_" + str(i + 1)] = np.nan - - return peak_area - - -def create_Df_area_peak(filepath, arr, name, index=[]): - - op = os.path.join(filepath, "peak_AUC_" + name + ".h5") - dirname = os.path.dirname(filepath) - - df = pd.DataFrame(arr, index=index) - - df.to_hdf(op, key="df", mode="w") - - -def create_csv_area_peak(filepath, arr, name, index=[]): - op = os.path.join(filepath, "peak_AUC_" + name + ".csv") - df = pd.DataFrame(arr, index=index) + peak_and_area["peak_" + str(i + 1)] = np.nan + peak_and_area["area_" + str(i + 1)] = np.nan - df.to_csv(op) + return peak_and_area diff --git a/src/guppy/analysis/standard_io.py b/src/guppy/analysis/standard_io.py index 2baefca..c3b323c 100644 --- a/src/guppy/analysis/standard_io.py +++ b/src/guppy/analysis/standard_io.py @@ -2,6 +2,7 @@ import os import numpy as np +import pandas as pd from .io_utils import ( decide_naming_convention, @@ -290,3 +291,20 @@ def write_combined_data(output_filepath, pair_name_to_tsNew, display_name_to_dat write_hdf5(data, display_name, output_filepath, "data") for compound_name, ts in compound_name_to_ttl_timestamps.items(): write_hdf5(ts, compound_name, output_filepath, "ts") + + +def write_peak_and_area_to_hdf5(filepath, arr, name, index=[]): + + op = os.path.join(filepath, "peak_AUC_" + name + ".h5") + dirname = os.path.dirname(filepath) + + df = pd.DataFrame(arr, index=index) + + df.to_hdf(op, key="df", mode="w") + + +def write_peak_and_area_to_csv(filepath, 
arr, name, index=[]): + op = os.path.join(filepath, "peak_AUC_" + name + ".csv") + df = pd.DataFrame(arr, index=index) + + df.to_csv(op) diff --git a/src/guppy/computePsth.py b/src/guppy/computePsth.py index 38b04df..65b5d21 100755 --- a/src/guppy/computePsth.py +++ b/src/guppy/computePsth.py @@ -5,6 +5,7 @@ import logging import multiprocessing as mp import os +import re import subprocess import sys from itertools import repeat @@ -17,12 +18,17 @@ from .analysis.io_utils import ( get_all_stores_for_combining_data, makeAverageDir, + read_Df, read_hdf5, write_hdf5, ) from .analysis.psth_average import averageForGroup -from .analysis.psth_peak_and_area import findPSTHPeakAndArea +from .analysis.psth_peak_and_area import compute_psth_peak_and_area from .analysis.psth_utils import create_Df +from .analysis.standard_io import ( + write_peak_and_area_to_csv, + write_peak_and_area_to_hdf5, +) logger = logging.getLogger(__name__) @@ -150,6 +156,53 @@ def execute_compute_psth(filepath, event, inputParameters): logger.info(f"PSTH for event {event} computed.") +# function to compute PSTH peak and area using the function helperPSTHPeakAndArea save the values to h5 and csv files. +def execute_compute_psth_peak_and_area(filepath, event, inputParameters): + + event = event.replace("\\", "_") + event = event.replace("/", "_") + if "control" in event.lower() or "signal" in event.lower(): + return 0 + + # sampling_rate = read_hdf5(storesList[0,0], filepath, 'sampling_rate') + peak_startPoint = inputParameters["peak_startPoint"] + peak_endPoint = inputParameters["peak_endPoint"] + selectForComputePsth = inputParameters["selectForComputePsth"] + + if selectForComputePsth == "z_score": + path = glob.glob(os.path.join(filepath, "z_score_*")) + elif selectForComputePsth == "dff": + path = glob.glob(os.path.join(filepath, "dff_*")) + else: + path = glob.glob(os.path.join(filepath, "z_score_*")) + glob.glob(os.path.join(filepath, "dff_*")) + + for i in range(len(path)): + logger.info(f"Computing peak and area for PSTH mean signal for event {event}...") + basename = (os.path.basename(path[i])).split(".")[0] + name_1 = basename.split("_")[-1] + sampling_rate = read_hdf5("timeCorrection_" + name_1, filepath, "sampling_rate")[0] + psth = read_Df(filepath, event + "_" + name_1, basename) + cols = list(psth.columns) + regex = re.compile("bin_[(]") + bin_names = [cols[i] for i in range(len(cols)) if regex.match(cols[i])] + regex_trials = re.compile("[+-]?([0-9]*[.])?[0-9]+") + trials_names = [cols[i] for i in range(len(cols)) if regex_trials.match(cols[i])] + psth_mean_bin_names = trials_names + bin_names + ["mean"] + psth_mean_bin_mean = np.asarray(psth[psth_mean_bin_names]) + timestamps = np.asarray(psth["timestamps"]).ravel() # np.asarray(read_Df(filepath, 'ts_psth', '')).ravel() + peak_area = compute_psth_peak_and_area( + psth_mean_bin_mean, timestamps, sampling_rate, peak_startPoint, peak_endPoint + ) # peak, area = + # arr = np.array([[peak, area]]) + fileName = [os.path.basename(os.path.dirname(filepath))] + index = [fileName[0] + "_" + s for s in psth_mean_bin_names] + write_peak_and_area_to_hdf5( + filepath, peak_area, event + "_" + name_1 + "_" + basename, index=index + ) # columns=['peak', 'area'] + write_peak_and_area_to_csv(filepath, peak_area, event + "_" + name_1 + "_" + basename, index=index) + logger.info(f"Peak and Area for PSTH mean signal for event {event} computed.") + + def orchestrate_psth(inputParameters): folderNames = inputParameters["folderNames"] numProcesses = inputParameters["numberOfCores"] 
@@ -171,7 +224,9 @@ def orchestrate_psth(inputParameters): p.starmap(execute_compute_psth, zip(repeat(filepath), storesList[1, :], repeat(inputParameters))) with mp.Pool(numProcesses) as pq: - pq.starmap(findPSTHPeakAndArea, zip(repeat(filepath), storesList[1, :], repeat(inputParameters))) + pq.starmap( + execute_compute_psth_peak_and_area, zip(repeat(filepath), storesList[1, :], repeat(inputParameters)) + ) with mp.Pool(numProcesses) as cr: cr.starmap(computeCrossCorrelation, zip(repeat(filepath), storesList[1, :], repeat(inputParameters))) @@ -206,7 +261,7 @@ def execute_psth_combined(inputParameters): storesList = np.unique(storesList, axis=1) for k in range(storesList.shape[1]): execute_compute_psth(op[i][0], storesList[1, k], inputParameters) - findPSTHPeakAndArea(op[i][0], storesList[1, k], inputParameters) + execute_compute_psth_peak_and_area(op[i][0], storesList[1, k], inputParameters) computeCrossCorrelation(op[i][0], storesList[1, k], inputParameters) writeToFile(str(10 + ((inputParameters["step"] + 1) * 10)) + "\n") inputParameters["step"] += 1 From 1d212556552c464775232b22a00b41e580c5d7b3 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 23 Jan 2026 10:04:37 -0800 Subject: [PATCH 140/150] reorganized x-corr --- src/guppy/analysis/cross_correlation.py | 91 +------------------------ src/guppy/analysis/psth_average.py | 6 +- src/guppy/analysis/psth_utils.py | 46 ++++++++++++- src/guppy/computePsth.py | 55 +++++++++++++-- 4 files changed, 98 insertions(+), 100 deletions(-) diff --git a/src/guppy/analysis/cross_correlation.py b/src/guppy/analysis/cross_correlation.py index 43d0a10..3d69136 100644 --- a/src/guppy/analysis/cross_correlation.py +++ b/src/guppy/analysis/cross_correlation.py @@ -1,58 +1,13 @@ import glob import logging -import math import os -import re import numpy as np -import pandas as pd from scipy import signal -from .io_utils import make_dir_for_cross_correlation, read_Df, read_hdf5 - logger = logging.getLogger(__name__) -def computeCrossCorrelation(filepath, event, inputParameters): - isCompute = inputParameters["computeCorr"] - removeArtifacts = inputParameters["removeArtifacts"] - artifactsRemovalMethod = inputParameters["artifactsRemovalMethod"] - if isCompute == True: - if removeArtifacts == True and artifactsRemovalMethod == "concatenate": - raise Exception( - "For cross-correlation, when removeArtifacts is True, artifacts removal method\ - should be replace with NaNs and not concatenate" - ) - corr_info, type = getCorrCombinations(filepath, inputParameters) - if "control" in event.lower() or "signal" in event.lower(): - return - else: - for i in range(1, len(corr_info)): - logger.debug(f"Computing cross-correlation for event {event}...") - for j in range(len(type)): - psth_a = read_Df(filepath, event + "_" + corr_info[i - 1], type[j] + "_" + corr_info[i - 1]) - psth_b = read_Df(filepath, event + "_" + corr_info[i], type[j] + "_" + corr_info[i]) - sample_rate = 1 / (psth_a["timestamps"][1] - psth_a["timestamps"][0]) - psth_a = psth_a.drop(columns=["timestamps", "err", "mean"]) - psth_b = psth_b.drop(columns=["timestamps", "err", "mean"]) - cols_a, cols_b = np.array(psth_a.columns), np.array(psth_b.columns) - if np.intersect1d(cols_a, cols_b).size > 0: - cols = list(np.intersect1d(cols_a, cols_b)) - else: - cols = list(cols_a) - arr_A, arr_B = np.array(psth_a).T, np.array(psth_b).T - cross_corr = helperCrossCorrelation(arr_A, arr_B, sample_rate) - cols.append("timestamps") - create_Df( - make_dir_for_cross_correlation(filepath), - "corr_" + event, - 
type[j] + "_" + corr_info[i - 1] + "_" + corr_info[i], - cross_corr, - cols, - ) - logger.info(f"Cross-correlation for event {event} computed.") - - def getCorrCombinations(filepath, inputParameters): selectForComputePsth = inputParameters["selectForComputePsth"] if selectForComputePsth == "z_score": @@ -85,51 +40,7 @@ def getCorrCombinations(filepath, inputParameters): return corr_info, type -# same function used to store PSTH in computePsth file -# Here, cross correlation dataframe is saved instead of PSTH -# cross correlation dataframe has the same structure as PSTH file -def create_Df(filepath, event, name, psth, columns=[]): - if name: - op = os.path.join(filepath, event + "_{}.h5".format(name)) - else: - op = os.path.join(filepath, event + ".h5") - - # check if file already exists - # if os.path.exists(op): - # return 0 - - # removing psth binned trials - columns = list(np.array(columns, dtype="str")) - regex = re.compile("bin_*") - single_trials_index = [i for i in range(len(columns)) if not regex.match(columns[i])] - single_trials_index = [i for i in range(len(columns)) if columns[i] != "timestamps"] - - psth = psth.T - if psth.ndim > 1: - mean = np.nanmean(psth[:, single_trials_index], axis=1).reshape(-1, 1) - err = np.nanstd(psth[:, single_trials_index], axis=1) / math.sqrt(psth[:, single_trials_index].shape[1]) - err = err.reshape(-1, 1) - psth = np.hstack((psth, mean)) - psth = np.hstack((psth, err)) - # timestamps = np.asarray(read_Df(filepath, 'ts_psth', '')) - # psth = np.hstack((psth, timestamps)) - try: - ts = read_hdf5(event, filepath, "ts") - ts = np.append(ts, ["mean", "err"]) - except: - ts = None - - if len(columns) == 0: - df = pd.DataFrame(psth, index=None, columns=ts, dtype="float32") - else: - columns = np.asarray(columns) - columns = np.append(columns, ["mean", "err"]) - df = pd.DataFrame(psth, index=None, columns=columns, dtype="float32") - - df.to_hdf(op, key="df", mode="w") - - -def helperCrossCorrelation(arr_A, arr_B, sample_rate): +def compute_cross_correlation(arr_A, arr_B, sample_rate): cross_corr = list() for a, b in zip(arr_A, arr_B): if np.isnan(a).any() or np.isnan(b).any(): diff --git a/src/guppy/analysis/psth_average.py b/src/guppy/analysis/psth_average.py index b539419..4f3c589 100644 --- a/src/guppy/analysis/psth_average.py +++ b/src/guppy/analysis/psth_average.py @@ -14,7 +14,7 @@ read_Df, write_hdf5, ) -from .psth_utils import create_Df +from .psth_utils import create_Df_for_psth logger = logging.getLogger(__name__) @@ -111,7 +111,7 @@ def averageForGroup(folderNames, event, inputParameters): timestamps = np.asarray(df["timestamps"]).reshape(1, -1) psth = np.concatenate((psth, timestamps), axis=0) columns = columns + ["timestamps"] - create_Df(op, temp_path[j][1], temp_path[j][2], psth, columns=columns) + create_Df_for_psth(op, temp_path[j][1], temp_path[j][2], psth, columns=columns) # read PSTH peak and area for each event and combine them. 
Save the final output to an average folder for i in range(len(new_path)): @@ -178,7 +178,7 @@ def averageForGroup(folderNames, event, inputParameters): timestamps = np.array(df["timestamps"]).reshape(1, -1) corr = np.concatenate((corr, timestamps), axis=0) columns.append("timestamps") - create_Df( + create_Df_for_psth( make_dir_for_cross_correlation(op), "corr_" + event, type[i] + "_" + corr_info[k - 1] + "_" + corr_info[k], diff --git a/src/guppy/analysis/psth_utils.py b/src/guppy/analysis/psth_utils.py index 13b2479..45bc2c7 100644 --- a/src/guppy/analysis/psth_utils.py +++ b/src/guppy/analysis/psth_utils.py @@ -12,7 +12,7 @@ # function to create dataframe for each event PSTH and save it to h5 file -def create_Df(filepath, event, name, psth, columns=[]): +def create_Df_for_psth(filepath, event, name, psth, columns=[]): event = event.replace("\\", "_") event = event.replace("/", "_") if name: @@ -53,3 +53,47 @@ def create_Df(filepath, event, name, psth, columns=[]): df = pd.DataFrame(psth, index=None, columns=list(columns), dtype="float32") df.to_hdf(op, key="df", mode="w") + + +# same function used to store PSTH in computePsth file +# Here, cross correlation dataframe is saved instead of PSTH +# cross correlation dataframe has the same structure as PSTH file +def create_Df_for_cross_correlation(filepath, event, name, psth, columns=[]): + if name: + op = os.path.join(filepath, event + "_{}.h5".format(name)) + else: + op = os.path.join(filepath, event + ".h5") + + # check if file already exists + # if os.path.exists(op): + # return 0 + + # removing psth binned trials + columns = list(np.array(columns, dtype="str")) + regex = re.compile("bin_*") + single_trials_index = [i for i in range(len(columns)) if not regex.match(columns[i])] + single_trials_index = [i for i in range(len(columns)) if columns[i] != "timestamps"] + + psth = psth.T + if psth.ndim > 1: + mean = np.nanmean(psth[:, single_trials_index], axis=1).reshape(-1, 1) + err = np.nanstd(psth[:, single_trials_index], axis=1) / math.sqrt(psth[:, single_trials_index].shape[1]) + err = err.reshape(-1, 1) + psth = np.hstack((psth, mean)) + psth = np.hstack((psth, err)) + # timestamps = np.asarray(read_Df(filepath, 'ts_psth', '')) + # psth = np.hstack((psth, timestamps)) + try: + ts = read_hdf5(event, filepath, "ts") + ts = np.append(ts, ["mean", "err"]) + except: + ts = None + + if len(columns) == 0: + df = pd.DataFrame(psth, index=None, columns=ts, dtype="float32") + else: + columns = np.asarray(columns) + columns = np.append(columns, ["mean", "err"]) + df = pd.DataFrame(psth, index=None, columns=columns, dtype="float32") + + df.to_hdf(op, key="df", mode="w") diff --git a/src/guppy/computePsth.py b/src/guppy/computePsth.py index 65b5d21..04a5f2d 100755 --- a/src/guppy/computePsth.py +++ b/src/guppy/computePsth.py @@ -14,9 +14,10 @@ from scipy import signal as ss from .analysis.compute_psth import compute_psth -from .analysis.cross_correlation import computeCrossCorrelation +from .analysis.cross_correlation import compute_cross_correlation, getCorrCombinations from .analysis.io_utils import ( get_all_stores_for_combining_data, + make_dir_for_cross_correlation, makeAverageDir, read_Df, read_hdf5, @@ -24,7 +25,7 @@ ) from .analysis.psth_average import averageForGroup from .analysis.psth_peak_and_area import compute_psth_peak_and_area -from .analysis.psth_utils import create_Df +from .analysis.psth_utils import create_Df_for_cross_correlation, create_Df_for_psth from .analysis.standard_io import ( write_peak_and_area_to_csv, 
write_peak_and_area_to_hdf5, @@ -145,14 +146,14 @@ def execute_compute_psth(filepath, event, inputParameters): ) write_hdf5(ts, event + "_" + name_1, filepath, "ts") - create_Df( + create_Df_for_psth( filepath, event + "_" + name_1 + "_baselineUncorrected", basename, psth_baselineUncorrected, columns=cols, ) # extra - create_Df(filepath, event + "_" + name_1, basename, psth, columns=cols) + create_Df_for_psth(filepath, event + "_" + name_1, basename, psth, columns=cols) logger.info(f"PSTH for event {event} computed.") @@ -203,6 +204,46 @@ def execute_compute_psth_peak_and_area(filepath, event, inputParameters): logger.info(f"Peak and Area for PSTH mean signal for event {event} computed.") +def execute_compute_cross_correlation(filepath, event, inputParameters): + isCompute = inputParameters["computeCorr"] + removeArtifacts = inputParameters["removeArtifacts"] + artifactsRemovalMethod = inputParameters["artifactsRemovalMethod"] + if isCompute == True: + if removeArtifacts == True and artifactsRemovalMethod == "concatenate": + raise Exception( + "For cross-correlation, when removeArtifacts is True, artifacts removal method\ + should be replace with NaNs and not concatenate" + ) + corr_info, type = getCorrCombinations(filepath, inputParameters) + if "control" in event.lower() or "signal" in event.lower(): + return + else: + for i in range(1, len(corr_info)): + logger.debug(f"Computing cross-correlation for event {event}...") + for j in range(len(type)): + psth_a = read_Df(filepath, event + "_" + corr_info[i - 1], type[j] + "_" + corr_info[i - 1]) + psth_b = read_Df(filepath, event + "_" + corr_info[i], type[j] + "_" + corr_info[i]) + sample_rate = 1 / (psth_a["timestamps"][1] - psth_a["timestamps"][0]) + psth_a = psth_a.drop(columns=["timestamps", "err", "mean"]) + psth_b = psth_b.drop(columns=["timestamps", "err", "mean"]) + cols_a, cols_b = np.array(psth_a.columns), np.array(psth_b.columns) + if np.intersect1d(cols_a, cols_b).size > 0: + cols = list(np.intersect1d(cols_a, cols_b)) + else: + cols = list(cols_a) + arr_A, arr_B = np.array(psth_a).T, np.array(psth_b).T + cross_corr = compute_cross_correlation(arr_A, arr_B, sample_rate) + cols.append("timestamps") + create_Df_for_cross_correlation( + make_dir_for_cross_correlation(filepath), + "corr_" + event, + type[j] + "_" + corr_info[i - 1] + "_" + corr_info[i], + cross_corr, + cols, + ) + logger.info(f"Cross-correlation for event {event} computed.") + + def orchestrate_psth(inputParameters): folderNames = inputParameters["folderNames"] numProcesses = inputParameters["numberOfCores"] @@ -229,7 +270,9 @@ def orchestrate_psth(inputParameters): ) with mp.Pool(numProcesses) as cr: - cr.starmap(computeCrossCorrelation, zip(repeat(filepath), storesList[1, :], repeat(inputParameters))) + cr.starmap( + execute_compute_cross_correlation, zip(repeat(filepath), storesList[1, :], repeat(inputParameters)) + ) # for k in range(storesList.shape[1]): # storenamePsth(filepath, storesList[1,k], inputParameters) @@ -262,7 +305,7 @@ def execute_psth_combined(inputParameters): for k in range(storesList.shape[1]): execute_compute_psth(op[i][0], storesList[1, k], inputParameters) execute_compute_psth_peak_and_area(op[i][0], storesList[1, k], inputParameters) - computeCrossCorrelation(op[i][0], storesList[1, k], inputParameters) + execute_compute_cross_correlation(op[i][0], storesList[1, k], inputParameters) writeToFile(str(10 + ((inputParameters["step"] + 1) * 10)) + "\n") inputParameters["step"] += 1 From 707fa186a1c4c21350748f50262c7065a2d61309 Mon Sep 17 
00:00:00 2001 From: pauladkisson Date: Fri, 23 Jan 2026 10:08:14 -0800 Subject: [PATCH 141/150] reorganized x-corr --- src/guppy/analysis/cross_correlation.py | 34 ------------------------- src/guppy/analysis/psth_average.py | 3 +-- src/guppy/analysis/psth_utils.py | 33 ++++++++++++++++++++++++ src/guppy/computePsth.py | 8 ++++-- 4 files changed, 40 insertions(+), 38 deletions(-) diff --git a/src/guppy/analysis/cross_correlation.py b/src/guppy/analysis/cross_correlation.py index 3d69136..726943d 100644 --- a/src/guppy/analysis/cross_correlation.py +++ b/src/guppy/analysis/cross_correlation.py @@ -1,6 +1,4 @@ -import glob import logging -import os import numpy as np from scipy import signal @@ -8,38 +6,6 @@ logger = logging.getLogger(__name__) -def getCorrCombinations(filepath, inputParameters): - selectForComputePsth = inputParameters["selectForComputePsth"] - if selectForComputePsth == "z_score": - path = glob.glob(os.path.join(filepath, "z_score_*")) - elif selectForComputePsth == "dff": - path = glob.glob(os.path.join(filepath, "dff_*")) - else: - path = glob.glob(os.path.join(filepath, "z_score_*")) + glob.glob(os.path.join(filepath, "dff_*")) - - names = list() - type = list() - for i in range(len(path)): - basename = (os.path.basename(path[i])).split(".")[0] - names.append(basename.split("_")[-1]) - type.append((os.path.basename(path[i])).split(".")[0].split("_" + names[-1], 1)[0]) - - names = list(np.unique(np.array(names))) - type = list(np.unique(np.array(type))) - - corr_info = list() - if len(names) <= 1: - logger.info("Cross-correlation cannot be computed because only one signal is present.") - return corr_info, type - elif len(names) == 2: - corr_info = names - else: - corr_info = names - corr_info.append(names[0]) - - return corr_info, type - - def compute_cross_correlation(arr_A, arr_B, sample_rate): cross_corr = list() for a, b in zip(arr_A, arr_B): diff --git a/src/guppy/analysis/psth_average.py b/src/guppy/analysis/psth_average.py index 4f3c589..664cc3d 100644 --- a/src/guppy/analysis/psth_average.py +++ b/src/guppy/analysis/psth_average.py @@ -7,14 +7,13 @@ import numpy as np import pandas as pd -from .cross_correlation import getCorrCombinations from .io_utils import ( make_dir_for_cross_correlation, makeAverageDir, read_Df, write_hdf5, ) -from .psth_utils import create_Df_for_psth +from .psth_utils import create_Df_for_psth, getCorrCombinations logger = logging.getLogger(__name__) diff --git a/src/guppy/analysis/psth_utils.py b/src/guppy/analysis/psth_utils.py index 45bc2c7..c351511 100644 --- a/src/guppy/analysis/psth_utils.py +++ b/src/guppy/analysis/psth_utils.py @@ -1,3 +1,4 @@ +import glob import logging import math import os @@ -97,3 +98,35 @@ def create_Df_for_cross_correlation(filepath, event, name, psth, columns=[]): df = pd.DataFrame(psth, index=None, columns=columns, dtype="float32") df.to_hdf(op, key="df", mode="w") + + +def getCorrCombinations(filepath, inputParameters): + selectForComputePsth = inputParameters["selectForComputePsth"] + if selectForComputePsth == "z_score": + path = glob.glob(os.path.join(filepath, "z_score_*")) + elif selectForComputePsth == "dff": + path = glob.glob(os.path.join(filepath, "dff_*")) + else: + path = glob.glob(os.path.join(filepath, "z_score_*")) + glob.glob(os.path.join(filepath, "dff_*")) + + names = list() + type = list() + for i in range(len(path)): + basename = (os.path.basename(path[i])).split(".")[0] + names.append(basename.split("_")[-1]) + type.append((os.path.basename(path[i])).split(".")[0].split("_" + 
names[-1], 1)[0]) + + names = list(np.unique(np.array(names))) + type = list(np.unique(np.array(type))) + + corr_info = list() + if len(names) <= 1: + logger.info("Cross-correlation cannot be computed because only one signal is present.") + return corr_info, type + elif len(names) == 2: + corr_info = names + else: + corr_info = names + corr_info.append(names[0]) + + return corr_info, type diff --git a/src/guppy/computePsth.py b/src/guppy/computePsth.py index 04a5f2d..654f299 100755 --- a/src/guppy/computePsth.py +++ b/src/guppy/computePsth.py @@ -14,7 +14,7 @@ from scipy import signal as ss from .analysis.compute_psth import compute_psth -from .analysis.cross_correlation import compute_cross_correlation, getCorrCombinations +from .analysis.cross_correlation import compute_cross_correlation from .analysis.io_utils import ( get_all_stores_for_combining_data, make_dir_for_cross_correlation, @@ -25,7 +25,11 @@ ) from .analysis.psth_average import averageForGroup from .analysis.psth_peak_and_area import compute_psth_peak_and_area -from .analysis.psth_utils import create_Df_for_cross_correlation, create_Df_for_psth +from .analysis.psth_utils import ( + create_Df_for_cross_correlation, + create_Df_for_psth, + getCorrCombinations, +) from .analysis.standard_io import ( write_peak_and_area_to_csv, write_peak_and_area_to_hdf5, From 0ae733307ecfcf0908c9196804a9388101fa2361 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 23 Jan 2026 10:24:32 -0800 Subject: [PATCH 142/150] updated imports --- src/guppy/findTransientsFreqAndAmp.py | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/src/guppy/findTransientsFreqAndAmp.py b/src/guppy/findTransientsFreqAndAmp.py index a31980a..c8cc4de 100755 --- a/src/guppy/findTransientsFreqAndAmp.py +++ b/src/guppy/findTransientsFreqAndAmp.py @@ -7,15 +7,12 @@ import sys from itertools import repeat -import h5py import matplotlib.pyplot as plt import numpy as np import pandas as pd from scipy.signal import argrelextrema -from .preprocess import get_all_stores_for_combining_data - -logger = logging.getLogger(__name__) +from .analysis.io_utils import get_all_stores_for_combining_data, read_hdf5 logger = logging.getLogger(__name__) @@ -33,22 +30,6 @@ def writeToFile(value: str): file.write(value) -def read_hdf5(event, filepath, key): - if event: - op = os.path.join(filepath, event + ".hdf5") - else: - op = filepath - - if os.path.exists(op): - with h5py.File(op, "r") as f: - arr = np.asarray(f[key]) - else: - logger.error(f"{event}.hdf5 file does not exist") - raise Exception("{}.hdf5 file does not exist".format(event)) - - return arr - - def processChunks(arrValues, arrIndexes, highAmpFilt, transientsThresh): arrValues = arrValues[~np.isnan(arrValues)] From 43f3289f47a4707f9836d2761b94b7baeacf9b7b Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 23 Jan 2026 10:32:54 -0800 Subject: [PATCH 143/150] reordered fns bottom --> up --- src/guppy/computePsth.py | 72 ++++++++++++++++++++-------------------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/src/guppy/computePsth.py b/src/guppy/computePsth.py index 654f299..32d9be1 100755 --- a/src/guppy/computePsth.py +++ b/src/guppy/computePsth.py @@ -51,42 +51,6 @@ def writeToFile(value: str): file.write(value) -def psthForEachStorename(inputParameters): - - logger.info("Computing PSTH, Peak and Area for each event...") - inputParameters = inputParameters - - # storesList = np.genfromtxt(inputParameters['storesListPath'], dtype='str', delimiter=',') - - average = 
inputParameters["averageForGroup"] - combine_data = inputParameters["combine_data"] - numProcesses = inputParameters["numberOfCores"] - inputParameters["step"] = 0 - if numProcesses == 0: - numProcesses = mp.cpu_count() - elif numProcesses > mp.cpu_count(): - logger.warning( - "Warning : # of cores parameter set is greater than the cores available \ - available in your machine" - ) - numProcesses = mp.cpu_count() - 1 - - logger.info(f"Average for group : {average}") - - # for average following if statement will be executed - if average == True: - execute_average_for_group(inputParameters) - - # for individual analysis following else statement will be executed - else: - if combine_data == True: - execute_psth_combined(inputParameters) - else: - orchestrate_psth(inputParameters) - logger.info("PSTH, Area and Peak are computed for all events.") - return inputParameters - - # function to create PSTH for each event using function helper_psth and save the PSTH to h5 file def execute_compute_psth(filepath, event, inputParameters): @@ -355,6 +319,42 @@ def execute_average_for_group(inputParameters): inputParameters["step"] += 1 +def psthForEachStorename(inputParameters): + + logger.info("Computing PSTH, Peak and Area for each event...") + inputParameters = inputParameters + + # storesList = np.genfromtxt(inputParameters['storesListPath'], dtype='str', delimiter=',') + + average = inputParameters["averageForGroup"] + combine_data = inputParameters["combine_data"] + numProcesses = inputParameters["numberOfCores"] + inputParameters["step"] = 0 + if numProcesses == 0: + numProcesses = mp.cpu_count() + elif numProcesses > mp.cpu_count(): + logger.warning( + "Warning : # of cores parameter set is greater than the cores available \ + available in your machine" + ) + numProcesses = mp.cpu_count() - 1 + + logger.info(f"Average for group : {average}") + + # for average following if statement will be executed + if average == True: + execute_average_for_group(inputParameters) + + # for individual analysis following else statement will be executed + else: + if combine_data == True: + execute_psth_combined(inputParameters) + else: + orchestrate_psth(inputParameters) + logger.info("PSTH, Area and Peak are computed for all events.") + return inputParameters + + def main(input_parameters): try: inputParameters = psthForEachStorename(input_parameters) From b3ec696c68ced7e9d943260a4465cf53875044f5 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 23 Jan 2026 16:36:46 -0800 Subject: [PATCH 144/150] reorganized findTransientsFreqAndAmp --- src/guppy/findTransientsFreqAndAmp.py | 201 +++++++------------------- 1 file changed, 54 insertions(+), 147 deletions(-) diff --git a/src/guppy/findTransientsFreqAndAmp.py b/src/guppy/findTransientsFreqAndAmp.py index c8cc4de..71b9060 100755 --- a/src/guppy/findTransientsFreqAndAmp.py +++ b/src/guppy/findTransientsFreqAndAmp.py @@ -1,18 +1,16 @@ import glob import json import logging -import math import multiprocessing as mp import os import sys -from itertools import repeat import matplotlib.pyplot as plt import numpy as np import pandas as pd -from scipy.signal import argrelextrema from .analysis.io_utils import get_all_stores_for_combining_data, read_hdf5 +from .analysis.transients import analyze_transients logger = logging.getLogger(__name__) @@ -30,91 +28,6 @@ def writeToFile(value: str): file.write(value) -def processChunks(arrValues, arrIndexes, highAmpFilt, transientsThresh): - - arrValues = arrValues[~np.isnan(arrValues)] - median = np.median(arrValues) - - mad = 
np.median(np.abs(arrValues - median)) - - firstThreshold = median + (highAmpFilt * mad) - - greaterThanMad = np.where(arrValues > firstThreshold)[0] - - arr = np.arange(arrValues.shape[0]) - lowerThanMad = np.isin(arr, greaterThanMad, invert=True) - filteredOut = arrValues[np.where(lowerThanMad == True)[0]] - - filteredOutMedian = np.median(filteredOut) - filteredOutMad = np.median(np.abs(filteredOut - np.median(filteredOut))) - secondThreshold = filteredOutMedian + (transientsThresh * filteredOutMad) - - greaterThanThreshIndex = np.where(arrValues > secondThreshold)[0] - greaterThanThreshValues = arrValues[greaterThanThreshIndex] - temp = np.zeros(arrValues.shape[0]) - temp[greaterThanThreshIndex] = greaterThanThreshValues - peaks = argrelextrema(temp, np.greater)[0] - - firstThresholdY = np.full(arrValues.shape[0], firstThreshold) - secondThresholdY = np.full(arrValues.shape[0], secondThreshold) - - newPeaks = np.full(arrValues.shape[0], np.nan) - newPeaks[peaks] = peaks + arrIndexes[0] - - # madY = np.full(arrValues.shape[0], mad) - medianY = np.full(arrValues.shape[0], median) - filteredOutMedianY = np.full(arrValues.shape[0], filteredOutMedian) - - return peaks, mad, filteredOutMad, medianY, filteredOutMedianY, firstThresholdY, secondThresholdY - - -def createChunks(z_score, sampling_rate, window): - - logger.debug("Creating chunks for multiprocessing...") - windowPoints = math.ceil(sampling_rate * window) - remainderPoints = math.ceil((sampling_rate * window) - (z_score.shape[0] % windowPoints)) - - if remainderPoints == windowPoints: - padded_z_score = z_score - z_score_index = np.arange(padded_z_score.shape[0]) - else: - padding = np.full(remainderPoints, np.nan) - padded_z_score = np.concatenate((z_score, padding)) - z_score_index = np.arange(padded_z_score.shape[0]) - - reshape = padded_z_score.shape[0] / windowPoints - - if reshape.is_integer() == True: - z_score_chunks = padded_z_score.reshape(int(reshape), -1) - z_score_chunks_index = z_score_index.reshape(int(reshape), -1) - else: - logger.error("Reshaping values should be integer.") - raise Exception("Reshaping values should be integer.") - logger.info("Chunks are created for multiprocessing.") - return z_score_chunks, z_score_chunks_index - - -def calculate_freq_amp(arr, z_score, z_score_chunks_index, timestamps): - peaks = arr[:, 0] - filteredOutMedian = arr[:, 4] - count = 0 - peaksAmp = np.array([]) - peaksInd = np.array([]) - for i in range(z_score_chunks_index.shape[0]): - count += peaks[i].shape[0] - peaksIndexes = peaks[i] + z_score_chunks_index[i][0] - peaksInd = np.concatenate((peaksInd, peaksIndexes)) - amps = z_score[peaksIndexes] - filteredOutMedian[i][0] - peaksAmp = np.concatenate((peaksAmp, amps)) - - peaksInd = peaksInd.ravel() - peaksInd = peaksInd.astype(int) - # logger.info(timestamps) - freq = peaksAmp.shape[0] / ((timestamps[-1] - timestamps[0]) / 60) - - return freq, peaksAmp, peaksInd - - def create_Df(filepath, arr, name, index=[], columns=[]): op = os.path.join(filepath, "freqAndAmp_" + name + ".h5") @@ -170,21 +83,9 @@ def findFreqAndAmp(filepath, inputParameters, window=15, numProcesses=mp.cpu_cou name_1 = basename.split("_")[-1] sampling_rate = read_hdf5("timeCorrection_" + name_1, filepath, "sampling_rate")[0] z_score = read_hdf5("", path[i], "data") - not_nan_indices = ~np.isnan(z_score) - z_score = z_score[not_nan_indices] - z_score_chunks, z_score_chunks_index = createChunks(z_score, sampling_rate, window) - - with mp.Pool(numProcesses) as p: - result = p.starmap( - processChunks, 
zip(z_score_chunks, z_score_chunks_index, repeat(highAmpFilt), repeat(transientsThresh)) - ) - - result = np.asarray(result, dtype=object) - ts = read_hdf5("timeCorrection_" + name_1, filepath, "timestampNew") - ts = ts[not_nan_indices] - freq, peaksAmp, peaksInd = calculate_freq_amp(result, z_score, z_score_chunks_index, ts) - peaks_occurrences = np.array([ts[peaksInd], peaksAmp]).T - arr = np.array([[freq, np.mean(peaksAmp)]]) + z_score, ts, peaksInd, peaks_occurrences, arr = analyze_transients( + filepath, window, numProcesses, highAmpFilt, transientsThresh, name_1, sampling_rate, z_score + ) fileName = [os.path.basename(os.path.dirname(filepath))] create_Df(filepath, arr, basename, index=fileName, columns=["freq (events/min)", "amplitude"]) create_csv( @@ -297,57 +198,63 @@ def executeFindFreqAndAmp(inputParameters): numProcesses = mp.cpu_count() - 1 if average == True: - if len(folderNamesForAvg) > 0: - storesListPath = [] - for i in range(len(folderNamesForAvg)): - filepath = folderNamesForAvg[i] - storesListPath.append(takeOnlyDirs(glob.glob(os.path.join(filepath, "*_output_*")))) - storesListPath = np.concatenate(storesListPath) - averageForGroup(storesListPath, inputParameters) - writeToFile(str(10 + ((inputParameters["step"] + 1) * 10)) + "\n") - inputParameters["step"] += 1 - else: - logger.error("Not a single folder name is provided in folderNamesForAvg in inputParamters File.") - raise Exception("Not a single folder name is provided in folderNamesForAvg in inputParamters File.") - + execute_average_for_group(inputParameters, folderNamesForAvg) else: if combine_data == True: - storesListPath = [] - for i in range(len(folderNames)): - filepath = folderNames[i] - storesListPath.append(takeOnlyDirs(glob.glob(os.path.join(filepath, "*_output_*")))) - storesListPath = list(np.concatenate(storesListPath).flatten()) - op = get_all_stores_for_combining_data(storesListPath) - for i in range(len(op)): - filepath = op[i][0] - storesList = np.genfromtxt( - os.path.join(filepath, "storesList.csv"), dtype="str", delimiter="," - ).reshape(2, -1) - findFreqAndAmp(filepath, inputParameters, window=moving_window, numProcesses=numProcesses) - writeToFile(str(10 + ((inputParameters["step"] + 1) * 10)) + "\n") - inputParameters["step"] += 1 - plt.show() + execute_find_freq_and_amp_combined(inputParameters, folderNames, moving_window, numProcesses) else: - for i in range(len(folderNames)): - logger.debug( - f"Finding transients in z-score data of {folderNames[i]} and calculating frequency and amplitude." 
- ) - filepath = folderNames[i] - storesListPath = takeOnlyDirs(glob.glob(os.path.join(filepath, "*_output_*"))) - for j in range(len(storesListPath)): - filepath = storesListPath[j] - storesList = np.genfromtxt( - os.path.join(filepath, "storesList.csv"), dtype="str", delimiter="," - ).reshape(2, -1) - findFreqAndAmp(filepath, inputParameters, window=moving_window, numProcesses=numProcesses) - writeToFile(str(10 + ((inputParameters["step"] + 1) * 10)) + "\n") - inputParameters["step"] += 1 - logger.info("Transients in z-score data found and frequency and amplitude are calculated.") - plt.show() + execute_find_freq_and_amp(inputParameters, folderNames, moving_window, numProcesses) logger.info("Transients in z-score data found and frequency and amplitude are calculated.") +def execute_find_freq_and_amp(inputParameters, folderNames, moving_window, numProcesses): + for i in range(len(folderNames)): + logger.debug(f"Finding transients in z-score data of {folderNames[i]} and calculating frequency and amplitude.") + filepath = folderNames[i] + storesListPath = takeOnlyDirs(glob.glob(os.path.join(filepath, "*_output_*"))) + for j in range(len(storesListPath)): + filepath = storesListPath[j] + storesList = np.genfromtxt(os.path.join(filepath, "storesList.csv"), dtype="str", delimiter=",").reshape( + 2, -1 + ) + findFreqAndAmp(filepath, inputParameters, window=moving_window, numProcesses=numProcesses) + writeToFile(str(10 + ((inputParameters["step"] + 1) * 10)) + "\n") + inputParameters["step"] += 1 + logger.info("Transients in z-score data found and frequency and amplitude are calculated.") + plt.show() + + +def execute_find_freq_and_amp_combined(inputParameters, folderNames, moving_window, numProcesses): + storesListPath = [] + for i in range(len(folderNames)): + filepath = folderNames[i] + storesListPath.append(takeOnlyDirs(glob.glob(os.path.join(filepath, "*_output_*")))) + storesListPath = list(np.concatenate(storesListPath).flatten()) + op = get_all_stores_for_combining_data(storesListPath) + for i in range(len(op)): + filepath = op[i][0] + storesList = np.genfromtxt(os.path.join(filepath, "storesList.csv"), dtype="str", delimiter=",").reshape(2, -1) + findFreqAndAmp(filepath, inputParameters, window=moving_window, numProcesses=numProcesses) + writeToFile(str(10 + ((inputParameters["step"] + 1) * 10)) + "\n") + inputParameters["step"] += 1 + plt.show() + + +def execute_average_for_group(inputParameters, folderNamesForAvg): + if len(folderNamesForAvg) == 0: + logger.error("Not a single folder name is provided in folderNamesForAvg in inputParamters File.") + raise Exception("Not a single folder name is provided in folderNamesForAvg in inputParamters File.") + storesListPath = [] + for i in range(len(folderNamesForAvg)): + filepath = folderNamesForAvg[i] + storesListPath.append(takeOnlyDirs(glob.glob(os.path.join(filepath, "*_output_*")))) + storesListPath = np.concatenate(storesListPath) + averageForGroup(storesListPath, inputParameters) + writeToFile(str(10 + ((inputParameters["step"] + 1) * 10)) + "\n") + inputParameters["step"] += 1 + + if __name__ == "__main__": try: executeFindFreqAndAmp(json.loads(sys.argv[1])) From 74816400519bd38dd8c1c69b45d8bea10f4ea9c3 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 23 Jan 2026 16:38:03 -0800 Subject: [PATCH 145/150] reorganized findTransientsFreqAndAmp --- src/guppy/analysis/transients.py | 115 +++++++++++++++++++++++++++++++ 1 file changed, 115 insertions(+) create mode 100644 src/guppy/analysis/transients.py diff --git 
a/src/guppy/analysis/transients.py b/src/guppy/analysis/transients.py new file mode 100644 index 0000000..2e6c189 --- /dev/null +++ b/src/guppy/analysis/transients.py @@ -0,0 +1,115 @@ +import logging +import math +import multiprocessing as mp +from itertools import repeat + +import numpy as np +from scipy.signal import argrelextrema + +from .io_utils import read_hdf5 + +logger = logging.getLogger(__name__) + + +def analyze_transients(filepath, window, numProcesses, highAmpFilt, transientsThresh, name_1, sampling_rate, z_score): + not_nan_indices = ~np.isnan(z_score) + z_score = z_score[not_nan_indices] + z_score_chunks, z_score_chunks_index = createChunks(z_score, sampling_rate, window) + + with mp.Pool(numProcesses) as p: + result = p.starmap( + processChunks, zip(z_score_chunks, z_score_chunks_index, repeat(highAmpFilt), repeat(transientsThresh)) + ) + + result = np.asarray(result, dtype=object) + ts = read_hdf5("timeCorrection_" + name_1, filepath, "timestampNew") + ts = ts[not_nan_indices] + freq, peaksAmp, peaksInd = calculate_freq_amp(result, z_score, z_score_chunks_index, ts) + peaks_occurrences = np.array([ts[peaksInd], peaksAmp]).T + arr = np.array([[freq, np.mean(peaksAmp)]]) + return z_score, ts, peaksInd, peaks_occurrences, arr + + +def processChunks(arrValues, arrIndexes, highAmpFilt, transientsThresh): + + arrValues = arrValues[~np.isnan(arrValues)] + median = np.median(arrValues) + + mad = np.median(np.abs(arrValues - median)) + + firstThreshold = median + (highAmpFilt * mad) + + greaterThanMad = np.where(arrValues > firstThreshold)[0] + + arr = np.arange(arrValues.shape[0]) + lowerThanMad = np.isin(arr, greaterThanMad, invert=True) + filteredOut = arrValues[np.where(lowerThanMad == True)[0]] + + filteredOutMedian = np.median(filteredOut) + filteredOutMad = np.median(np.abs(filteredOut - np.median(filteredOut))) + secondThreshold = filteredOutMedian + (transientsThresh * filteredOutMad) + + greaterThanThreshIndex = np.where(arrValues > secondThreshold)[0] + greaterThanThreshValues = arrValues[greaterThanThreshIndex] + temp = np.zeros(arrValues.shape[0]) + temp[greaterThanThreshIndex] = greaterThanThreshValues + peaks = argrelextrema(temp, np.greater)[0] + + firstThresholdY = np.full(arrValues.shape[0], firstThreshold) + secondThresholdY = np.full(arrValues.shape[0], secondThreshold) + + newPeaks = np.full(arrValues.shape[0], np.nan) + newPeaks[peaks] = peaks + arrIndexes[0] + + # madY = np.full(arrValues.shape[0], mad) + medianY = np.full(arrValues.shape[0], median) + filteredOutMedianY = np.full(arrValues.shape[0], filteredOutMedian) + + return peaks, mad, filteredOutMad, medianY, filteredOutMedianY, firstThresholdY, secondThresholdY + + +def createChunks(z_score, sampling_rate, window): + + logger.debug("Creating chunks for multiprocessing...") + windowPoints = math.ceil(sampling_rate * window) + remainderPoints = math.ceil((sampling_rate * window) - (z_score.shape[0] % windowPoints)) + + if remainderPoints == windowPoints: + padded_z_score = z_score + z_score_index = np.arange(padded_z_score.shape[0]) + else: + padding = np.full(remainderPoints, np.nan) + padded_z_score = np.concatenate((z_score, padding)) + z_score_index = np.arange(padded_z_score.shape[0]) + + reshape = padded_z_score.shape[0] / windowPoints + + if reshape.is_integer() == True: + z_score_chunks = padded_z_score.reshape(int(reshape), -1) + z_score_chunks_index = z_score_index.reshape(int(reshape), -1) + else: + logger.error("Reshaping values should be integer.") + raise Exception("Reshaping values 
should be integer.") + logger.info("Chunks are created for multiprocessing.") + return z_score_chunks, z_score_chunks_index + + +def calculate_freq_amp(arr, z_score, z_score_chunks_index, timestamps): + peaks = arr[:, 0] + filteredOutMedian = arr[:, 4] + count = 0 + peaksAmp = np.array([]) + peaksInd = np.array([]) + for i in range(z_score_chunks_index.shape[0]): + count += peaks[i].shape[0] + peaksIndexes = peaks[i] + z_score_chunks_index[i][0] + peaksInd = np.concatenate((peaksInd, peaksIndexes)) + amps = z_score[peaksIndexes] - filteredOutMedian[i][0] + peaksAmp = np.concatenate((peaksAmp, amps)) + + peaksInd = peaksInd.ravel() + peaksInd = peaksInd.astype(int) + # logger.info(timestamps) + freq = peaksAmp.shape[0] / ((timestamps[-1] - timestamps[0]) / 60) + + return freq, peaksAmp, peaksInd From 4a87c885c77431f81cce3c5552536b756b693105 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 23 Jan 2026 16:39:52 -0800 Subject: [PATCH 146/150] pulled read out of analyze_transients --- src/guppy/analysis/transients.py | 5 +---- src/guppy/findTransientsFreqAndAmp.py | 3 ++- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/src/guppy/analysis/transients.py b/src/guppy/analysis/transients.py index 2e6c189..5fd8645 100644 --- a/src/guppy/analysis/transients.py +++ b/src/guppy/analysis/transients.py @@ -6,12 +6,10 @@ import numpy as np from scipy.signal import argrelextrema -from .io_utils import read_hdf5 - logger = logging.getLogger(__name__) -def analyze_transients(filepath, window, numProcesses, highAmpFilt, transientsThresh, name_1, sampling_rate, z_score): +def analyze_transients(ts, window, numProcesses, highAmpFilt, transientsThresh, sampling_rate, z_score): not_nan_indices = ~np.isnan(z_score) z_score = z_score[not_nan_indices] z_score_chunks, z_score_chunks_index = createChunks(z_score, sampling_rate, window) @@ -22,7 +20,6 @@ def analyze_transients(filepath, window, numProcesses, highAmpFilt, transientsTh ) result = np.asarray(result, dtype=object) - ts = read_hdf5("timeCorrection_" + name_1, filepath, "timestampNew") ts = ts[not_nan_indices] freq, peaksAmp, peaksInd = calculate_freq_amp(result, z_score, z_score_chunks_index, ts) peaks_occurrences = np.array([ts[peaksInd], peaksAmp]).T diff --git a/src/guppy/findTransientsFreqAndAmp.py b/src/guppy/findTransientsFreqAndAmp.py index 71b9060..2111332 100755 --- a/src/guppy/findTransientsFreqAndAmp.py +++ b/src/guppy/findTransientsFreqAndAmp.py @@ -83,8 +83,9 @@ def findFreqAndAmp(filepath, inputParameters, window=15, numProcesses=mp.cpu_cou name_1 = basename.split("_")[-1] sampling_rate = read_hdf5("timeCorrection_" + name_1, filepath, "sampling_rate")[0] z_score = read_hdf5("", path[i], "data") + ts = read_hdf5("timeCorrection_" + name_1, filepath, "timestampNew") z_score, ts, peaksInd, peaks_occurrences, arr = analyze_transients( - filepath, window, numProcesses, highAmpFilt, transientsThresh, name_1, sampling_rate, z_score + ts, window, numProcesses, highAmpFilt, transientsThresh, sampling_rate, z_score ) fileName = [os.path.basename(os.path.dirname(filepath))] create_Df(filepath, arr, basename, index=fileName, columns=["freq (events/min)", "amplitude"]) From 4eb229b215171bc589d8f2962bd602dced34f0ee Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 23 Jan 2026 16:50:05 -0800 Subject: [PATCH 147/150] Moved some read-write operations to io_utils and standard_io --- src/guppy/analysis/standard_io.py | 23 ++++++++++ src/guppy/findTransientsFreqAndAmp.py | 65 +++++++-------------------- 2 files changed, 40 insertions(+), 
48 deletions(-) diff --git a/src/guppy/analysis/standard_io.py b/src/guppy/analysis/standard_io.py index c3b323c..d6dd9af 100644 --- a/src/guppy/analysis/standard_io.py +++ b/src/guppy/analysis/standard_io.py @@ -308,3 +308,26 @@ def write_peak_and_area_to_csv(filepath, arr, name, index=[]): df = pd.DataFrame(arr, index=index) df.to_csv(op) + + +def write_freq_and_amp_to_hdf5(filepath, arr, name, index=[], columns=[]): + + op = os.path.join(filepath, "freqAndAmp_" + name + ".h5") + dirname = os.path.dirname(filepath) + + df = pd.DataFrame(arr, index=index, columns=columns) + + df.to_hdf(op, key="df", mode="w") + + +def write_freq_and_amp_to_csv(filepath, arr, name, index=[], columns=[]): + op = os.path.join(filepath, name) + df = pd.DataFrame(arr, index=index, columns=columns) + df.to_csv(op) + + +def read_freq_and_amp_from_hdf5(filepath, name): + op = os.path.join(filepath, "freqAndAmp_" + name + ".h5") + df = pd.read_hdf(op, key="df", mode="r") + + return df diff --git a/src/guppy/findTransientsFreqAndAmp.py b/src/guppy/findTransientsFreqAndAmp.py index 2111332..3970c24 100755 --- a/src/guppy/findTransientsFreqAndAmp.py +++ b/src/guppy/findTransientsFreqAndAmp.py @@ -7,50 +7,28 @@ import matplotlib.pyplot as plt import numpy as np -import pandas as pd -from .analysis.io_utils import get_all_stores_for_combining_data, read_hdf5 +from .analysis.io_utils import ( + get_all_stores_for_combining_data, + makeAverageDir, + read_hdf5, + takeOnlyDirs, +) +from .analysis.standard_io import ( + read_freq_and_amp_from_hdf5, + write_freq_and_amp_to_csv, + write_freq_and_amp_to_hdf5, +) from .analysis.transients import analyze_transients logger = logging.getLogger(__name__) -def takeOnlyDirs(paths): - removePaths = [] - for p in paths: - if os.path.isfile(p): - removePaths.append(p) - return list(set(paths) - set(removePaths)) - - def writeToFile(value: str): with open(os.path.join(os.path.expanduser("~"), "pbSteps.txt"), "a") as file: file.write(value) -def create_Df(filepath, arr, name, index=[], columns=[]): - - op = os.path.join(filepath, "freqAndAmp_" + name + ".h5") - dirname = os.path.dirname(filepath) - - df = pd.DataFrame(arr, index=index, columns=columns) - - df.to_hdf(op, key="df", mode="w") - - -def create_csv(filepath, arr, name, index=[], columns=[]): - op = os.path.join(filepath, name) - df = pd.DataFrame(arr, index=index, columns=columns) - df.to_csv(op) - - -def read_Df(filepath, name): - op = os.path.join(filepath, "freqAndAmp_" + name + ".h5") - df = pd.read_hdf(op, key="df", mode="r") - - return df - - def visuzlize_peaks(filepath, z_score, timestamps, peaksIndex): dirname = os.path.dirname(filepath) @@ -88,11 +66,11 @@ def findFreqAndAmp(filepath, inputParameters, window=15, numProcesses=mp.cpu_cou ts, window, numProcesses, highAmpFilt, transientsThresh, sampling_rate, z_score ) fileName = [os.path.basename(os.path.dirname(filepath))] - create_Df(filepath, arr, basename, index=fileName, columns=["freq (events/min)", "amplitude"]) - create_csv( + write_freq_and_amp_to_hdf5(filepath, arr, basename, index=fileName, columns=["freq (events/min)", "amplitude"]) + write_freq_and_amp_to_csv( filepath, arr, "freqAndAmp_" + basename + ".csv", index=fileName, columns=["freq (events/min)", "amplitude"] ) - create_csv( + write_freq_and_amp_to_csv( filepath, peaks_occurrences, "transientsOccurrences_" + basename + ".csv", @@ -103,15 +81,6 @@ def findFreqAndAmp(filepath, inputParameters, window=15, numProcesses=mp.cpu_cou logger.info("Frequency and amplitude of transients in z_score data are 
calculated.") -def makeAverageDir(filepath): - - op = os.path.join(filepath, "average") - if not os.path.exists(op): - os.mkdir(op) - - return op - - def averageForGroup(folderNames, inputParameters): logger.debug("Combining results for frequency and amplitude of transients in z-score data...") @@ -161,13 +130,13 @@ def averageForGroup(folderNames, inputParameters): if not os.path.exists(os.path.join(temp_path[j][0], "freqAndAmp_" + temp_path[j][1] + ".h5")): continue else: - df = read_Df(temp_path[j][0], temp_path[j][1]) + df = read_freq_and_amp_from_hdf5(temp_path[j][0], temp_path[j][1]) arr.append(np.array([df["freq (events/min)"][0], df["amplitude"][0]])) fileName.append(os.path.basename(temp_path[j][0])) arr = np.asarray(arr) - create_Df(op, arr, temp_path[j][1], index=fileName, columns=["freq (events/min)", "amplitude"]) - create_csv( + write_freq_and_amp_to_hdf5(op, arr, temp_path[j][1], index=fileName, columns=["freq (events/min)", "amplitude"]) + write_freq_and_amp_to_csv( op, arr, "freqAndAmp_" + temp_path[j][1] + ".csv", From bfe7c71ad443988abcdab00e6175d6a97158f643 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 23 Jan 2026 16:56:01 -0800 Subject: [PATCH 148/150] Reorganized AverageForGroup --- src/guppy/analysis/transients_average.py | 81 ++++++++++++++++++++++++ src/guppy/findTransientsFreqAndAmp.py | 68 +------------------- 2 files changed, 82 insertions(+), 67 deletions(-) create mode 100644 src/guppy/analysis/transients_average.py diff --git a/src/guppy/analysis/transients_average.py b/src/guppy/analysis/transients_average.py new file mode 100644 index 0000000..3b8dd79 --- /dev/null +++ b/src/guppy/analysis/transients_average.py @@ -0,0 +1,81 @@ +import glob +import logging +import os + +import numpy as np + +from .io_utils import ( + makeAverageDir, +) +from .standard_io import ( + read_freq_and_amp_from_hdf5, + write_freq_and_amp_to_csv, + write_freq_and_amp_to_hdf5, +) + +logger = logging.getLogger(__name__) + + +def averageForGroup(folderNames, inputParameters): + + logger.debug("Combining results for frequency and amplitude of transients in z-score data...") + path = [] + abspath = inputParameters["abspath"] + selectForTransientsComputation = inputParameters["selectForTransientsComputation"] + path_temp_len = [] + + for i in range(len(folderNames)): + if selectForTransientsComputation == "z_score": + path_temp = glob.glob(os.path.join(folderNames[i], "z_score_*")) + elif selectForTransientsComputation == "dff": + path_temp = glob.glob(os.path.join(folderNames[i], "dff_*")) + else: + path_temp = glob.glob(os.path.join(folderNames[i], "z_score_*")) + glob.glob( + os.path.join(folderNames[i], "dff_*") + ) + + path_temp_len.append(len(path_temp)) + + for j in range(len(path_temp)): + basename = (os.path.basename(path_temp[j])).split(".")[0] + # name = name[0] + temp = [folderNames[i], basename] + path.append(temp) + + path_temp_len = np.asarray(path_temp_len) + max_len = np.argmax(path_temp_len) + + naming = [] + for i in range(len(path)): + naming.append(path[i][1]) + naming = np.unique(np.asarray(naming)) + + new_path = [[] for _ in range(path_temp_len[max_len])] + for i in range(len(path)): + idx = np.where(naming == path[i][1])[0][0] + new_path[idx].append(path[i]) + + op = makeAverageDir(abspath) + + for i in range(len(new_path)): + arr = [] # np.zeros((len(new_path[i]), 2)) + fileName = [] + temp_path = new_path[i] + for j in range(len(temp_path)): + if not os.path.exists(os.path.join(temp_path[j][0], "freqAndAmp_" + temp_path[j][1] + ".h5")): + continue + 
else: + df = read_freq_and_amp_from_hdf5(temp_path[j][0], temp_path[j][1]) + arr.append(np.array([df["freq (events/min)"][0], df["amplitude"][0]])) + fileName.append(os.path.basename(temp_path[j][0])) + + arr = np.asarray(arr) + write_freq_and_amp_to_hdf5(op, arr, temp_path[j][1], index=fileName, columns=["freq (events/min)", "amplitude"]) + write_freq_and_amp_to_csv( + op, + arr, + "freqAndAmp_" + temp_path[j][1] + ".csv", + index=fileName, + columns=["freq (events/min)", "amplitude"], + ) + logger.info("Results for frequency and amplitude of transients in z-score data are combined.") diff --git a/src/guppy/findTransientsFreqAndAmp.py b/src/guppy/findTransientsFreqAndAmp.py index 3970c24..f6c3d6e 100755 --- a/src/guppy/findTransientsFreqAndAmp.py +++ b/src/guppy/findTransientsFreqAndAmp.py @@ -10,16 +10,15 @@ from .analysis.io_utils import ( get_all_stores_for_combining_data, - makeAverageDir, read_hdf5, takeOnlyDirs, ) from .analysis.standard_io import ( - read_freq_and_amp_from_hdf5, write_freq_and_amp_to_csv, write_freq_and_amp_to_hdf5, ) from .analysis.transients import analyze_transients +from .analysis.transients_average import averageForGroup logger = logging.getLogger(__name__) @@ -81,71 +80,6 @@ def findFreqAndAmp(filepath, inputParameters, window=15, numProcesses=mp.cpu_cou logger.info("Frequency and amplitude of transients in z_score data are calculated.") -def averageForGroup(folderNames, inputParameters): - - logger.debug("Combining results for frequency and amplitude of transients in z-score data...") - path = [] - abspath = inputParameters["abspath"] - selectForTransientsComputation = inputParameters["selectForTransientsComputation"] - path_temp_len = [] - - for i in range(len(folderNames)): - if selectForTransientsComputation == "z_score": - path_temp = glob.glob(os.path.join(folderNames[i], "z_score_*")) - elif selectForTransientsComputation == "dff": - path_temp = glob.glob(os.path.join(folderNames[i], "dff_*")) - else: - path_temp = glob.glob(os.path.join(folderNames[i], "z_score_*")) + glob.glob( - os.path.join(folderNames[i], "dff_*") - ) - - path_temp_len.append(len(path_temp)) - - for j in range(len(path_temp)): - basename = (os.path.basename(path_temp[j])).split(".")[0] - # name = name[0] - temp = [folderNames[i], basename] - path.append(temp) - - path_temp_len = np.asarray(path_temp_len) - max_len = np.argmax(path_temp_len) - - naming = [] - for i in range(len(path)): - naming.append(path[i][1]) - naming = np.unique(np.asarray(naming)) - - new_path = [[] for _ in range(path_temp_len[max_len])] - for i in range(len(path)): - idx = np.where(naming == path[i][1])[0][0] - new_path[idx].append(path[i]) - - op = makeAverageDir(abspath) - - for i in range(len(new_path)): - arr = [] # np.zeros((len(new_path[i]), 2)) - fileName = [] - temp_path = new_path[i] - for j in range(len(temp_path)): - if not os.path.exists(os.path.join(temp_path[j][0], "freqAndAmp_" + temp_path[j][1] + ".h5")): - continue - else: - df = read_freq_and_amp_from_hdf5(temp_path[j][0], temp_path[j][1]) - arr.append(np.array([df["freq (events/min)"][0], df["amplitude"][0]])) - fileName.append(os.path.basename(temp_path[j][0])) - - arr = np.asarray(arr) - write_freq_and_amp_to_hdf5(op, arr, temp_path[j][1], index=fileName, columns=["freq (events/min)", "amplitude"]) - write_freq_and_amp_to_csv( - op, - arr, - "freqAndAmp_" + temp_path[j][1] + ".csv", - index=fileName, - columns=["freq (events/min)", "amplitude"], - ) - logger.info("Results for frequency and amplitude of transients in z-score data are 
combined.") - - def executeFindFreqAndAmp(inputParameters): logger.info("Finding transients in z-score data and calculating frequency and amplitude....") From 33a7054fd42f3ac44038550baded4a1bae97889c Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 26 Jan 2026 12:43:36 -0800 Subject: [PATCH 149/150] deleted temp markdowns --- step4_data_flow_analysis.md | 348 --------------- timestamp_correction_analysis.md | 723 ------------------------------- 2 files changed, 1071 deletions(-) delete mode 100644 step4_data_flow_analysis.md delete mode 100644 timestamp_correction_analysis.md diff --git a/step4_data_flow_analysis.md b/step4_data_flow_analysis.md deleted file mode 100644 index d86e938..0000000 --- a/step4_data_flow_analysis.md +++ /dev/null @@ -1,348 +0,0 @@ -# Step 4 (preprocess.py) Data Flow Analysis - -## Overview - -Step 4 processes timestamp-corrected photometry data and computes normalized signals (ΔF/F and z-scores). It handles artifact removal, data combination from multiple sessions, and generates quality control visualizations. - -## High-Level Data Flow - -```mermaid -flowchart TD - A[Entry: extractTsAndSignal] --> B{combine_data?} - - B -->|False| C[execute_timestamp_correction] - B -->|True| D[execute_timestamp_correction] - - C --> E[execute_zscore] - - D --> F[check_storeslistfile] - F --> G[combineData] - G --> H[execute_zscore] - - E --> I[Output: z_score, dff, cntrl_sig_fit HDF5 files] - H --> I - - style A fill:#e1f5ff - style I fill:#d4edda -``` - -## Main Processing Paths - -### Entry Point -**`extractTsAndSignal(inputParameters)`** (line 1178) is the main entry point called by the GUI or API. - -### Path 1: Normal Processing (combine_data = False) -1. `execute_timestamp_correction()` → Correct timestamps and align data -2. `execute_zscore()` → Compute z-scores and ΔF/F - -### Path 2: Combined Data Processing (combine_data = True) -1. `execute_timestamp_correction()` → Correct timestamps for each file -2. `check_storeslistfile()` → Merge store lists from multiple files -3. `combineData()` → Combine data from multiple recording sessions -4. `execute_zscore()` → Compute z-scores and ΔF/F on combined data - -## Detailed Processing Stages - -### Stage 1: Timestamp Correction - -```mermaid -flowchart LR - A[Raw HDF5 files] --> B[Read storesList.csv] - B --> C{isosbestic_control?} - C -->|No| D[add_control_channel] - C -->|Yes| E[timestampCorrection_tdt/csv] - D --> E - E --> F[Eliminate first N seconds] - F --> G[decide_naming_convention_and_applyCorrection] - G --> H[applyCorrection for each store] - H --> I{isosbestic_control?} - I -->|No| J[create_control_channel via curve fitting] - I -->|Yes| K[timeCorrection_*.hdf5 files] - J --> K - - style A fill:#e1f5ff - style K fill:#d4edda -``` - -#### Function: `execute_timestamp_correction(folderNames, inputParameters)` - -**Input:** -- Raw HDF5 files from extractors: `control_*.hdf5`, `signal_*.hdf5`, `event_*.hdf5` - -**Process:** -1. 
For each session folder: - - Read `storesList.csv` (mapping of raw names to semantic names) - - If no isosbestic control: `add_control_channel()` creates placeholder control files - - **`timestampCorrection_tdt()`** or **`timestampCorrection_csv()`**: - - Eliminates first N seconds (`timeForLightsTurnOn`) - - For TDT: expands timestamps from block timestamps + sampling rate - - For CSV: uses timestamps as-is - - Writes `timeCorrection_*.hdf5` with keys: `timestampNew`, `correctionIndex`, `sampling_rate` - - **`decide_naming_convention_and_applyCorrection()`**: - - For each store, calls `applyCorrection()` to crop data using `correctionIndex` - - For control/signal channels: crops data arrays - - For event channels: subtracts time offset from timestamps - - If no isosbestic control: **`create_control_channel()`** generates synthetic control via curve fitting - -**Output:** -- Timestamp-corrected HDF5 files with trimmed data -- `timeCorrection_*.hdf5` files containing corrected timestamps - -### Stage 2: Z-Score Computation - -```mermaid -flowchart TD - A[Timestamp-corrected HDF5] --> B[compute_z_score] - B --> C{removeArtifacts?} - - C -->|No| D[helper_z_score: full data] - C -->|Yes| E[helper_z_score: chunk-by-chunk] - - D --> F[filterSignal] - E --> F - - F --> G[controlFit: linear regression] - G --> H[deltaFF: compute ΔF/F] - H --> I[z_score_computation] - - I --> J{removeArtifacts?} - - J -->|No| K[Write z_score, dff, cntrl_sig_fit] - J -->|Yes| L{artifactsRemovalMethod?} - - L -->|concatenate| M[processTimestampsForArtifacts] - L -->|NaN| N[addingNaNtoChunksWithArtifacts] - - M --> K - N --> K - - K --> O[visualizeControlAndSignal] - - style A fill:#e1f5ff - style K fill:#d4edda - style O fill:#fff3cd -``` - -#### Function: `execute_zscore(folderNames, inputParameters)` - -**Input:** -- Timestamp-corrected HDF5 files - -**Process:** -1. 
For each output folder: - - **`compute_z_score(filepath, inputParameters)`**: - - For each control/signal pair: - - **`helper_z_score(control, signal, filepath, name, inputParameters)`**: - - **Without artifacts removal:** - - `execute_controlFit_dff()`: Filter signals → fit control to signal → compute ΔF/F - - `z_score_computation()`: Compute z-score from ΔF/F - - **With artifacts removal:** - - For each user-selected chunk (from `coordsForPreProcessing_*.npy`): - - If no isosbestic: `helper_create_control_channel()` creates synthetic control - - `execute_controlFit_dff()` on chunk - - Concatenate or NaN-fill between chunks - - `z_score_computation()` on processed data - - - Writes: `z_score_*.hdf5`, `dff_*.hdf5`, `cntrl_sig_fit_*.hdf5` - - **If artifacts removal with concatenate method:** - - **`processTimestampsForArtifacts()`**: - - `eliminateData()`: Concatenates good chunks, adjusts timestamps to be continuous - - `eliminateTs()`: Aligns event timestamps with new timeline - - Overwrites data files with concatenated versions - - **If artifacts removal with NaN method:** - - **`addingNaNtoChunksWithArtifacts()`**: - - `addingNaNValues()`: Replaces bad chunks with NaN - - `removeTTLs()`: Filters event timestamps to keep only valid times - - - **`visualizeControlAndSignal()`**: Plots control, signal, cntrl_sig_fit for QC - -**Output:** -- `z_score_*.hdf5` (z-scored signal) -- `dff_*.hdf5` (ΔF/F) -- `cntrl_sig_fit_*.hdf5` (fitted control channel) - -## Key Data Transformations - -### Signal Processing Pipeline - -```mermaid -flowchart LR - A[Raw Signal] --> B[filterSignal: Moving Average] - C[Raw Control] --> D[filterSignal: Moving Average] - - B --> E[controlFit: Linear Regression] - D --> E - - E --> F[control_fit = p0*control + p1] - F --> G[deltaFF] - - B --> G - - G --> H[ΔF/F = signal - control_fit / control_fit * 100] - H --> I[z_score_computation] - - I --> J{zscore_method?} - J -->|standard| K[z = ΔF/F - mean / std] - J -->|baseline| L[z = ΔF/F - baseline_mean / baseline_std] - J -->|robust| M[z = 0.6745 * ΔF/F - median / MAD] - - K --> N[Z-Score Output] - L --> N - M --> N - - style A fill:#e1f5ff - style C fill:#e1f5ff - style N fill:#d4edda -``` - -### Transformation Functions - -1. **`filterSignal(filter_window, signal)`** (line 822) - - Applies moving average filter with configurable window - - Uses `scipy.signal.filtfilt` for zero-phase filtering - -2. **`controlFit(control, signal)`** (line 815) - - Linear regression: fits control to signal - - Returns: `fitted_control = p[0] * control + p[1]` - -3. **`deltaFF(signal, control)`** (line 804) - - Formula: `((signal - control) / control) * 100` - - Computes normalized fluorescence change - -4. 
**`z_score_computation(dff, timestamps, inputParameters)`** (line 853) - - **Standard z-score:** `(ΔF/F - mean(ΔF/F)) / std(ΔF/F)` - - **Baseline z-score:** `(ΔF/F - mean(baseline)) / std(baseline)` - - **Robust z-score:** `0.6745 * (ΔF/F - median) / MAD` - -## Artifact Removal Workflow - -### Interactive Artifact Selection - -The `visualize()` function (line 469) provides an interactive matplotlib plot: -- **Space key:** Mark artifact boundary (vertical line drawn) -- **'d' key:** Delete last marked boundary -- **Close plot:** Save coordinates to `coordsForPreProcessing_*.npy` - -### Two Removal Methods - -**Concatenate Method:** -- Removes artifact chunks completely -- Concatenates good chunks end-to-end -- Adjusts timestamps to be continuous -- Event timestamps realigned to new timeline - -**NaN Method:** -- Replaces artifact chunks with NaN values -- Preserves original timeline -- Filters out event timestamps in artifact regions - -## Supporting Functions - -### Control Channel Creation - -**`helper_create_control_channel(signal, timestamps, window)`** (line 69) -- Used when no isosbestic control is available -- Applies Savitzky-Golay filter to signal -- Fits to exponential function: `f(x) = a + b * exp(-(1/c) * x)` -- Returns synthetic control channel - -### Data Combination - -**`combineData(folderNames, inputParameters, storesList)`** (line 1084) -- Merges data from multiple recording sessions -- Validates that sampling rates match across sessions -- Calls `processTimestampsForCombiningData()` to align timelines -- Saves combined data to first output folder - -### Coordinate Fetching - -**`fetchCoords(filepath, naming, data)`** (line 610) -- Reads `coordsForPreProcessing_*.npy` (artifact boundary coordinates) -- If file doesn't exist: uses `[0, data[-1]]` (entire recording) -- Validates even number of coordinates (pairs of boundaries) -- Returns reshaped array of coordinate pairs - -## File I/O Summary - -### Files Read - -| File Pattern | Content | Source | -|-------------|---------|--------| -| `control_*.hdf5` | Control channel data | Extractors (Step 3) | -| `signal_*.hdf5` | Signal channel data | Extractors (Step 3) | -| `event_*.hdf5` | Event timestamps | Extractors (Step 3) | -| `storesList.csv` | Channel name mapping | Step 2 | -| `coordsForPreProcessing_*.npy` | Artifact boundaries | User selection (optional) | - -### Files Written - -| File Pattern | Content | Keys | -|-------------|---------|------| -| `timeCorrection_*.hdf5` | Corrected timestamps | `timestampNew`, `correctionIndex`, `sampling_rate`, `timeRecStart` (TDT only) | -| `z_score_*.hdf5` | Z-scored signal | `data` | -| `dff_*.hdf5` | ΔF/F signal | `data` | -| `cntrl_sig_fit_*.hdf5` | Fitted control | `data` | -| `event_*_*.hdf5` | Corrected event timestamps | `ts` | - -## Key Parameters from inputParameters - -| Parameter | Purpose | Default/Options | -|-----------|---------|-----------------| -| `timeForLightsTurnOn` | Seconds to eliminate from start | 1 | -| `filter_window` | Moving average window size | 100 | -| `isosbestic_control` | Use isosbestic control channel? | True/False | -| `removeArtifacts` | Enable artifact removal? | True/False | -| `artifactsRemovalMethod` | How to handle artifacts | "concatenate" / "NaN" | -| `zscore_method` | Z-score computation method | "standard z-score" / "baseline z-score" / "robust z-score" | -| `baselineWindowStart` | Baseline window start (seconds) | 0 | -| `baselineWindowEnd` | Baseline window end (seconds) | 0 | -| `combine_data` | Combine multiple recordings? 
| True/False | - -## Architecture Notes for Refactoring - -### Current Coupling Issues - -1. **GUI Progress Tracking:** `writeToFile()` writes to `~/pbSteps.txt` for progress bar updates (lines 36-38, 1042, 1171, 1203, 1208, 1220) -2. **Interactive Plotting:** `visualize()` requires user interaction (matplotlib event handlers) -3. **File Path Assumptions:** Hard-coded path patterns (`*_output_*`, naming conventions) -4. **Mixed Responsibilities:** Single functions handle both computation and I/O - -### Recommended Separation Points - -**Backend Analysis Layer Should Include:** -- `filterSignal()` - pure signal processing -- `controlFit()` - pure regression -- `deltaFF()` - pure computation -- `z_score_computation()` - pure statistical computation -- `helper_create_control_channel()` - algorithmic control generation -- Core timestamp correction logic (separated from I/O) -- Core artifact removal logic (separated from I/O) - -**Data I/O Layer Should Include:** -- `read_hdf5()`, `write_hdf5()` - file operations -- Store list reading/writing -- Coordinate file handling -- HDF5 file discovery and path management - -**Frontend Visualization Layer Should Include:** -- `visualize()` - interactive artifact selection -- `visualizeControlAndSignal()` - QC plots -- `visualize_z_score()`, `visualize_dff()` - result visualization -- Progress tracking callbacks (replace `writeToFile()`) - -### Potential Refactoring Strategy - -1. **Extract pure computation functions** into a `signal_processing` module -2. **Create data models** (dataclasses) for: - - TimeCorrectionResult - - ProcessedSignal (with z_score, dff, control_fit) - - ArtifactRegions -3. **Separate I/O operations** into `io_utils` module with consistent interfaces -4. **Create processing pipelines** that accept data objects, return data objects -5. **Move visualization to separate module** with callbacks for progress/interaction -6. **Use dependency injection** for progress callbacks instead of hard-coded file writes diff --git a/timestamp_correction_analysis.md b/timestamp_correction_analysis.md deleted file mode 100644 index 121aa3f..0000000 --- a/timestamp_correction_analysis.md +++ /dev/null @@ -1,723 +0,0 @@ -# Timestamp Correction Module Analysis - -## Overview - -The `timestamp_correction.py` module handles the correction of timestamps for photometry data, including: -- Eliminating the first N seconds of recording (light stabilization period) -- Expanding TDT block timestamps into continuous timestamps -- Creating synthetic control channels when no isosbestic control is present -- Applying corrections to both data channels and event markers - -## Module Structure - -### Entry Point from preprocess.py - -```python -execute_timestamp_correction(folderNames, inputParameters) # preprocess.py:212 -``` - -This orchestrator loops through all session folders and calls functions in this module. - -## Two-Phase Control Channel Creation Pattern - -### Understanding add_control_channel vs create_control_channel - -These two functions work together in a **two-phase process** to handle synthetic control channel generation. They are **not redundant** but serve distinct purposes: - -#### Phase 1: `add_control_channel` (Called BEFORE timestamp correction) - -**Execution:** Line 229 in `execute_timestamp_correction` - -**Purpose:** Create **PLACEHOLDER** control files to satisfy workflow requirements - -**What it does:** -1. Validates that if `isosbestic_control=False`, no real control channels exist -2. 
For each signal channel without a matching control: - - Copies the raw signal HDF5 file to `cntrl{i}.hdf5` (placeholder) - - Adds entry to storesList: `[["cntrl{i}"], ["control_{region}"]]` -3. Saves updated `storesList.csv` - -**Files created:** -- `cntrl0.hdf5`, `cntrl1.hdf5`, etc. (copies of **RAW** signal data) -- Updated `storesList.csv` with placeholder entries - -**Why it's needed:** -- Timestamp correction workflow expects **paired** control/signal channels in storesList -- Without placeholders, the pairing logic in `timestampCorrection_xxx` and `check_cntrl_sig_length` would fail -- The placeholder **data is never actually used** - it just satisfies structural requirements - -#### Phase 2: `create_control_channel` (Called AFTER timestamp correction) - -**Execution:** Line 243 in `execute_timestamp_correction` - -**Purpose:** Generate **ACTUAL** synthetic control via curve fitting and overwrite placeholders - -**What it does:** -1. Looks for placeholder files (checks: `"control" in event_name.lower() and "cntrl" in event.lower()`) -2. Reads the **CORRECTED** signal data: `signal_{region}.hdf5` (after timestamp correction) -3. Calls `helper_create_control_channel()` to: - - Apply Savitzky-Golay filter to cleaned signal - - Fit to exponential function: `f(x) = a + b * exp(-(1/c) * x)` -4. **OVERWRITES** the placeholder `control_{region}.hdf5` with real synthetic control -5. Also exports to CSV format (legacy) - -**Files written:** -- `control_{region}.hdf5` → `data` (replaces placeholder with curve-fitted control) -- `{raw_name}.csv` (timestamps, data, sampling_rate columns) - -**Why it's separate:** -- Requires **timestamp-corrected** signal data (doesn't exist until after lines 232-239) -- Curve fitting algorithm needs clean timestamps (first N seconds eliminated) -- Cannot be done before timestamp correction without re-correcting the synthetic control - -#### Execution Timeline - -```python -# When isosbestic_control == False: - -# ========== PHASE 1: BEFORE TIMESTAMP CORRECTION ========== -# Line 229: Create placeholders (just file copies) -storesList = add_control_channel(filepath, storesList) -# Result: storesList now has paired structure -# [["Dv1A", "cntrl0"], ["signal_dms", "control_dms"]] -# Files: cntrl0.hdf5 (copy of raw signal, never used) - -# ========== TIMESTAMP CORRECTION PHASE ========== -# Lines 232-234: Process both signal AND placeholder control -timestampCorrection_tdt(filepath, timeForLightsTurnOn, storesList) -# Result: Creates timeCorrection_dms.hdf5 with correctionIndex - -# Lines 236-239: Apply corrections to all channels -decide_naming_convention_and_applyCorrection(...) -# Result: signal_dms.hdf5 now contains corrected signal data -# control_dms.hdf5 still contains uncorrected placeholder copy - -# ========== PHASE 2: AFTER TIMESTAMP CORRECTION ========== -# Line 243: Generate REAL synthetic controls -create_control_channel(filepath, storesList, window=101) -# Result: control_dms.hdf5 OVERWRITTEN with curve-fitted synthetic control -# Now contains valid control data derived from corrected signal -``` - -#### Why This Design Exists - -This is a **chicken-and-egg problem solved with placeholders:** - -1. **Requirement:** Timestamp correction expects paired control/signal channels -2. **Constraint:** Synthetic control generation requires timestamp-corrected signal data -3. 
**Solution:** Create dummy placeholders → correct everything → replace placeholders with real data - -#### Visual Flow - -```mermaid -flowchart TD - A[isosbestic_control = False] --> B[add_control_channel] - B --> C[Copy signal.hdf5 to cntrl0.hdf5] - C --> D[Update storesList.csv] - - D --> E[timestampCorrection_xxx] - E --> F[Creates timeCorrection_dms.hdf5] - - F --> G[decide_naming_convention_and_applyCorrection] - G --> H[Corrects signal_dms.hdf5] - G --> I[Corrects control_dms.hdf5
still contains placeholder] - - I --> J[create_control_channel] - J --> K[Read corrected signal_dms.hdf5] - K --> L[helper_create_control_channel
curve fit] - L --> M[OVERWRITE control_dms.hdf5
with synthetic control] - - style C fill:#fff3cd - style I fill:#fff3cd - style M fill:#d4edda -``` - -#### Refactoring Opportunity - -This placeholder pattern is a **code smell** indicating potential design improvements: - -**Issues:** -1. **Unnecessary I/O:** Placeholder files are written and then overwritten -2. **Confusing flow:** Hard to understand that placeholders are temporary -3. **Tight coupling:** Timestamp correction assumes paired files exist -4. **Wasted computation:** Placeholder controls get timestamp-corrected unnecessarily - -**Potential Improvements:** - -**Option 1: Lazy Control Creation** -- Modify timestamp correction to handle missing controls gracefully -- Only create synthetic controls after all corrections complete -- Remove placeholder file creation entirely - -**Option 2: Data Structure Refactoring** -- Use a data structure that doesn't require physical paired files upfront -- Track "needs synthetic control" as metadata rather than file presence -- Generate and write controls only once at the end - -**Option 3: Two-Pass Workflow** -- First pass: Correct only signal channels -- Second pass: Generate synthetic controls from corrected signals -- Would require refactoring `check_cntrl_sig_length` and pairing logic - -## Function Catalog - -### 1. add_control_channel -**Location:** `timestamp_correction.py:20` -**Purpose:** Create placeholder control channel files when no isosbestic control exists - -```python -def add_control_channel(filepath, arr) -> arr -``` - -**Input:** -- `filepath`: Path to session output folder -- `arr`: 2D array `[[storenames], [storesList]]` from storesList.csv - -**Process:** -1. Validates that control/signal pairs match (raises error if mismatched) -2. For each signal channel without a matching control: - - Copies signal HDF5 file to `cntrl{i}.hdf5` (placeholder) - - Adds entry to storesList array: `[["cntrl{i}"], ["control_{region}"]]` -3. Writes updated storesList.csv - -**Output:** -- Updated `arr` with new control channel entries -- **Files Written:** Updated `storesList.csv`, copied `cntrl*.hdf5` files - -**I/O Summary:** -- **Reads:** Signal HDF5 files (via shutil.copyfile) -- **Writes:** `storesList.csv`, placeholder `cntrl*.hdf5` files - ---- - -### 2. timestampCorrection_csv -**Location:** `timestamp_correction.py:65` -**Purpose:** Correct timestamps for CSV-format data (Doric, NPM, custom CSV) - -```python -def timestampCorrection_csv(filepath, timeForLightsTurnOn, storesList) -``` - -**Input:** -- `filepath`: Path to session output folder -- `timeForLightsTurnOn`: Seconds to eliminate from start (default: 1) -- `storesList`: 2D array `[[storenames], [storesList]]` - -**Process:** -1. Filters storesList to control/signal channels only -2. Pairs control/signal channels, validates naming matches -3. Calls `check_cntrl_sig_length()` to determine which channel to use (shorter one) -4. For each control/signal pair: - - **Reads:** `timestamps` and `sampling_rate` from raw HDF5 - - **Computes:** `correctionIndex = np.where(timestamp >= timeForLightsTurnOn)` - - **Writes:** `timeCorrection_{region}.hdf5` with keys: - - `timestampNew`: Corrected timestamps - - `correctionIndex`: Indices to keep - - `sampling_rate`: Sampling rate - -**Output:** -- **Files Written:** `timeCorrection_{region}.hdf5` for each control/signal pair - -**I/O Summary:** -- **Reads:** `{storename}.hdf5` → `timestamps`, `sampling_rate` -- **Writes:** `timeCorrection_{region}.hdf5` → `timestampNew`, `correctionIndex`, `sampling_rate` - ---- - -### 3. 
timestampCorrection_tdt -**Location:** `timestamp_correction.py:115` -**Purpose:** Correct timestamps for TDT-format data (expands block timestamps) - -```python -def timestampCorrection_tdt(filepath, timeForLightsTurnOn, storesList) -``` - -**Input:** Same as `timestampCorrection_csv` - -**Process:** -1. Filters storesList to control/signal channels only -2. Pairs control/signal channels, validates naming matches -3. Calls `check_cntrl_sig_length()` to determine which channel to use -4. For each control/signal pair: - - **Reads:** `timestamps`, `npoints`, `sampling_rate` from raw HDF5 - - **TDT-specific expansion algorithm:** - ```python - timeRecStart = timestamp[0] - timestamps = np.subtract(timestamp, timeRecStart) # Zero-base - adder = np.arange(npoints) / sampling_rate # Within-block offsets - # Expand: for each block timestamp, add within-block offsets - timestampNew = np.zeros((len(timestamps), lengthAdder)) - for i in range(lengthAdder): - timestampNew[:, i] = np.add(timestamps, adder[i]) - timestampNew = (timestampNew.T).reshape(-1, order="F") # Flatten - correctionIndex = np.where(timestampNew >= timeForLightsTurnOn) - timestampNew = timestampNew[correctionIndex] - ``` - - **Writes:** `timeCorrection_{region}.hdf5` with keys: - - `timeRecStart`: Recording start time (TDT-specific) - - `timestampNew`: Expanded, corrected timestamps - - `correctionIndex`: Indices to keep - - `sampling_rate`: Sampling rate - -**Output:** -- **Files Written:** `timeCorrection_{region}.hdf5` with TDT-specific `timeRecStart` key - -**I/O Summary:** -- **Reads:** `{storename}.hdf5` → `timestamps`, `npoints`, `sampling_rate` -- **Writes:** `timeCorrection_{region}.hdf5` → `timeRecStart`, `timestampNew`, `correctionIndex`, `sampling_rate` - ---- - -### 4. check_cntrl_sig_length -**Location:** `timestamp_correction.py:273` -**Purpose:** Determine which channel (control or signal) to use as reference based on length - -```python -def check_cntrl_sig_length(filepath, channels_arr, storenames, storesList) -> indices -``` - -**Input:** -- `filepath`: Path to session output folder -- `channels_arr`: Paired control/signal array `[["control_A", "control_B"], ["signal_A", "signal_B"]]` -- `storenames`: Raw HDF5 filenames -- `storesList`: Semantic channel names - -**Process:** -1. For each control/signal pair: - - **Reads:** `data` from both control and signal HDF5 - - Compares lengths: `control.shape[0]` vs `signal.shape[0]` - - Returns the shorter one's storename (or signal if equal) - -**Output:** -- List of storenames to use for timestamp correction (one per pair) - -**I/O Summary:** -- **Reads:** `{control_storename}.hdf5` → `data`, `{signal_storename}.hdf5` → `data` - -**Note:** This is a pure analysis function but performs I/O to determine which data to use. - ---- - -### 5. decide_naming_convention_and_applyCorrection -**Location:** `timestamp_correction.py:178` -**Purpose:** Loop through all channels and apply timestamp corrections - -```python -def decide_naming_convention_and_applyCorrection(filepath, timeForLightsTurnOn, event, displayName, storesList) -``` - -**Input:** -- `filepath`: Path to session output folder -- `timeForLightsTurnOn`: Seconds eliminated from start -- `event`: Raw storename (e.g., "Dv1A") -- `displayName`: Semantic name (e.g., "control_DMS") -- `storesList`: Full storesList array - -**Process:** -1. Filters storesList to control/signal channels -2. Pairs channels and validates naming conventions -3. 
For each pair, calls `applyCorrection(filepath, timeForLightsTurnOn, event, displayName, region)` - -**Output:** -- Delegates to `applyCorrection()` (no direct I/O) - ---- - -### 6. applyCorrection -**Location:** `timestamp_correction.py:205` -**Purpose:** Apply timestamp corrections to data channels or event markers - -```python -def applyCorrection(filepath, timeForLightsTurnOn, event, displayName, naming) -``` - -**Input:** -- `filepath`: Path to session output folder -- `timeForLightsTurnOn`: Seconds eliminated from start -- `event`: Raw storename -- `displayName`: Semantic display name -- `naming`: Region identifier (e.g., "dms") - -**Process:** - -**For Control/Signal Channels:** -1. **Reads:** `timeCorrection_{naming}.hdf5` → `correctionIndex` -2. **Reads:** `{event}.hdf5` → `data` -3. **Applies:** `arr = arr[correctionIndex]` (crops data) -4. **Writes:** `{displayName}.hdf5` → `data` (overwrites with corrected data) - -**For Event Channels:** -1. Detects TDT format: `check_TDT(os.path.dirname(filepath))` -2. **Reads:** `timeCorrection_{naming}.hdf5` → `timeRecStart` (if TDT) -3. **Reads:** `{event}.hdf5` → `timestamps` -4. **Applies corrections:** - - If TDT and timestamps >= timeRecStart: subtract both `timeRecStart` and `timeForLightsTurnOn` - - Otherwise: subtract only `timeForLightsTurnOn` -5. **Writes:** `{event}_{naming}.hdf5` → `ts` (corrected event timestamps) - -**Output:** -- **Files Written:** - - `{displayName}.hdf5` → `data` (for control/signal) - - `{event}_{naming}.hdf5` → `ts` (for events) - -**I/O Summary:** -- **Reads:** `timeCorrection_{naming}.hdf5`, `{event}.hdf5` -- **Writes:** `{displayName}.hdf5` or `{event}_{naming}.hdf5` - ---- - -### 7. create_control_channel -**Location:** `timestamp_correction.py:247` -**Purpose:** Generate synthetic control channel using curve fitting (when no isosbestic control exists) - -```python -def create_control_channel(filepath, arr, window=5001) -``` - -**Input:** -- `filepath`: Path to session output folder -- `arr`: storesList array `[[storenames], [storesList]]` -- `window`: Savitzky-Golay filter window (default: 5001) - -**Process:** -1. Loops through storesList to find placeholder control channels (`cntrl` in storename) -2. 
For each placeholder: - - **Reads:** `signal_{region}.hdf5` → `data` (corrected signal) - - **Reads:** `timeCorrection_{region}.hdf5` → `timestampNew`, `sampling_rate` - - **Calls:** `helper_create_control_channel(signal, timestampNew, window)` from `control_channel.py` - - Applies Savitzky-Golay filter - - Fits to exponential: `f(x) = a + b * exp(-(1/c) * x)` - - **Writes:** `{control_name}.hdf5` → `data` (synthetic control) - - **Writes:** `{event_name}.csv` with columns: `timestamps`, `data`, `sampling_rate` - -**Output:** -- **Files Written:** - - `control_{region}.hdf5` → `data` (replaces placeholder) - - `{raw_name}.csv` (legacy format export) - -**I/O Summary:** -- **Reads:** `signal_{region}.hdf5` → `data`, `timeCorrection_{region}.hdf5` → `timestampNew`, `sampling_rate` -- **Writes:** `control_{region}.hdf5` → `data`, `{raw_name}.csv` - ---- - -## Data Flow Diagram - -### High-Level Flow (called from execute_timestamp_correction) - -```mermaid -flowchart TD - A[execute_timestamp_correction] --> B[Read storesList.csv] - B --> C{isosbestic_control?} - - C -->|False| D[add_control_channel] - C -->|True| E{Check format} - D --> E - - E -->|TDT| F[timestampCorrection_tdt] - E -->|CSV/Doric/NPM| G[timestampCorrection_csv] - - F --> H[Loop: decide_naming_convention_and_applyCorrection] - G --> H - - H --> I[For each store: applyCorrection] - - I --> J{isosbestic_control?} - J -->|False| K[create_control_channel] - J -->|True| L[Done] - K --> L - - style A fill:#e1f5ff - style L fill:#d4edda -``` - -### Detailed Flow: timestampCorrection Functions - -```mermaid -flowchart LR - A[Raw HDF5 files] --> B[check_cntrl_sig_length] - B --> C[Read control & signal data] - C --> D[Return shorter channel name] - - D --> E{Format?} - E -->|CSV| F[timestampCorrection_csv] - E -->|TDT| G[timestampCorrection_tdt] - - F --> H[Read timestamps from selected channel] - G --> I[Read timestamps, npoints, sampling_rate] - - H --> J[correctionIndex = where >= timeForLightsTurnOn] - I --> K[Expand block timestamps] - K --> J - - J --> L[Write timeCorrection_{region}.hdf5] - - style A fill:#e1f5ff - style L fill:#d4edda -``` - -### Detailed Flow: applyCorrection - -```mermaid -flowchart TD - A[applyCorrection called] --> B{Channel type?} - - B -->|control/signal| C[Read correctionIndex] - B -->|event| D[Read event timestamps] - - C --> E[Read raw data] - E --> F[data = data correctionIndex] - F --> G[Write displayName.hdf5] - - D --> H{TDT format?} - H -->|Yes| I[Read timeRecStart] - H -->|No| J[ts -= timeForLightsTurnOn] - - I --> K[ts -= timeRecStart] - K --> J - J --> L[Write event_region.hdf5] - - style A fill:#e1f5ff - style G fill:#d4edda - style L fill:#d4edda -``` - -### Detailed Flow: Control Channel Creation - -```mermaid -flowchart LR - A[add_control_channel] --> B[For each signal without control] - B --> C[Copy signal.hdf5 to cntrl_i.hdf5] - C --> D[Update storesList.csv] - - D --> E[... timestamp correction ...] 
- - E --> F[create_control_channel] - F --> G[For each cntrl_i placeholder] - G --> H[Read signal_{region}.hdf5] - H --> I[helper_create_control_channel] - I --> J[Savitzky-Golay filter] - J --> K[Curve fit to exponential] - K --> L[Write control_{region}.hdf5] - L --> M[Export to CSV] - - style A fill:#fff3cd - style M fill:#d4edda -``` - -## Execution Order in execute_timestamp_correction - -```python -# preprocess.py:212-247 -for each session in folderNames: - for each output_folder in session: - # Step 1: Read metadata - storesList = np.genfromtxt("storesList.csv") - - # Step 2: Add placeholder controls if needed - if isosbestic_control == False: - storesList = add_control_channel(filepath, storesList) - - # Step 3: Compute correctionIndex and timestampNew - if check_TDT(folderName): - timestampCorrection_tdt(filepath, timeForLightsTurnOn, storesList) - else: - timestampCorrection_csv(filepath, timeForLightsTurnOn, storesList) - - # Step 4: Apply corrections to all channels/events - for each store in storesList: - decide_naming_convention_and_applyCorrection( - filepath, timeForLightsTurnOn, storename, displayName, storesList - ) - # ^ This calls applyCorrection for each channel - - # Step 5: Generate synthetic controls via curve fitting - if isosbestic_control == False: - create_control_channel(filepath, storesList, window=101) -``` - -## File I/O Summary - -### Files Read - -| Function | Files Read | Keys | -|----------|-----------|------| -| `add_control_channel` | `signal_*.hdf5` (for copying) | - | -| `timestampCorrection_csv` | `{storename}.hdf5` | `timestamps`, `sampling_rate` | -| `timestampCorrection_tdt` | `{storename}.hdf5` | `timestamps`, `npoints`, `sampling_rate` | -| `check_cntrl_sig_length` | `control_*.hdf5`, `signal_*.hdf5` | `data` | -| `applyCorrection` | `timeCorrection_{region}.hdf5`
`{event}.hdf5` | `correctionIndex`, `timeRecStart` (TDT)
`data` or `timestamps` | -| `create_control_channel` | `signal_{region}.hdf5`
`timeCorrection_{region}.hdf5` | `data`
`timestampNew`, `sampling_rate` | - -### Files Written - -| Function | Files Written | Keys | Notes | -|----------|--------------|------|-------| -| `add_control_channel` | `storesList.csv`
`cntrl{i}.hdf5` | -
(copy of signal) | Placeholder files | -| `timestampCorrection_csv` | `timeCorrection_{region}.hdf5` | `timestampNew`, `correctionIndex`, `sampling_rate` | One per region | -| `timestampCorrection_tdt` | `timeCorrection_{region}.hdf5` | `timeRecStart`, `timestampNew`, `correctionIndex`, `sampling_rate` | TDT-specific | -| `applyCorrection` | `{displayName}.hdf5`
`{event}_{region}.hdf5` | `data`
`ts` | Overwrites with corrected data | -| `create_control_channel` | `control_{region}.hdf5`
`{raw_name}.csv` | `data`
timestamps, data, sampling_rate | Replaces placeholder | - -## Key Transformations - -### 1. Timestamp Expansion (TDT only) - -**Input:** Block timestamps (one per acquisition block) -**Algorithm:** -```python -timeRecStart = timestamp[0] -timestamps = timestamp - timeRecStart # Zero-base -adder = np.arange(npoints) / sampling_rate # Within-block offsets [0, 1/fs, 2/fs, ...] -# Matrix multiplication to expand: -timestampNew = zeros((n_blocks, npoints)) -for i in range(npoints): - timestampNew[:, i] = timestamps + adder[i] -timestampNew = timestampNew.T.reshape(-1, order='F') # Column-major flatten -``` -**Output:** Continuous timestamps at full sampling rate - -### 2. Correction Index Computation - -**Input:** Timestamps array, `timeForLightsTurnOn` -**Algorithm:** -```python -correctionIndex = np.where(timestamp >= timeForLightsTurnOn)[0] -``` -**Output:** Indices of timestamps to keep (after eliminating first N seconds) - -### 3. Data Cropping - -**Applied to:** Control/signal data channels -**Algorithm:** -```python -data_corrected = data[correctionIndex] -``` - -### 4. Event Timestamp Adjustment - -**Applied to:** Event markers (TTL pulses) -**Algorithm:** -```python -# CSV format: -ts_corrected = ts - timeForLightsTurnOn - -# TDT format (if ts >= timeRecStart): -ts_corrected = ts - timeRecStart - timeForLightsTurnOn -``` - -### 5. Synthetic Control Generation - -**Input:** Signal channel (already corrected) -**Algorithm:** -1. Apply Savitzky-Golay filter: `filtered_signal = savgol_filter(signal, window, polyorder=3)` -2. Curve fit to exponential: `control = a + b * exp(-(1/c) * t)` -3. Return fitted curve as synthetic control - -## Analysis for I/O Separation - -### Pure Analysis Functions (Minimal I/O) -These could be extracted with I/O injected: -- ❌ None - all functions perform substantial I/O - -### Orchestration Functions (Heavy I/O, Light Analysis) -These coordinate reading/writing and delegate computation: -- `add_control_channel` - File copying and CSV writing -- `decide_naming_convention_and_applyCorrection` - Loops and delegates -- `create_control_channel` - Orchestrates read → process → write - -### Mixed Functions (I/O + Analysis) -These perform both I/O and computation inline: -- `timestampCorrection_csv` - Reads data, computes correctionIndex, writes results -- `timestampCorrection_tdt` - Reads data, expands timestamps, computes correctionIndex, writes -- `applyCorrection` - Reads multiple files, applies transformations, writes -- `check_cntrl_sig_length` - Reads data just to compare lengths - -## Refactoring Recommendations for I/O Separation - -### Option 1: Extract Pure Computation Functions - -Create new pure functions: -```python -# Pure analysis (no I/O) -def compute_correction_index(timestamps, timeForLightsTurnOn): - return np.where(timestamps >= timeForLightsTurnOn)[0] - -def expand_tdt_timestamps(block_timestamps, npoints, sampling_rate): - # TDT expansion algorithm - ... - return expanded_timestamps - -def crop_data_by_index(data, correctionIndex): - return data[correctionIndex] - -def adjust_event_timestamps(ts, timeRecStart, timeForLightsTurnOn, is_tdt): - # Event adjustment logic - ... - return adjusted_ts -``` - -Then modify existing functions to use these pure functions, keeping I/O separate. - -### Option 2: Reader/Writer Pattern - -Create dedicated I/O classes: -```python -class TimestampCorrectionReader: - def read_raw_timestamps(self, filepath, storename): - ... - - def read_correction_data(self, filepath, region): - ... 
- -class TimestampCorrectionWriter: - def write_correction_file(self, filepath, region, data): - ... - - def write_corrected_data(self, filepath, displayName, data): - ... -``` - -### Option 3: Data Class Pattern - -Return data objects instead of writing directly: -```python -@dataclass -class TimestampCorrection: - timestampNew: np.ndarray - correctionIndex: np.ndarray - sampling_rate: float - timeRecStart: Optional[float] = None # TDT only - -def timestampCorrection_tdt(...) -> TimestampCorrection: - # Compute all values - return TimestampCorrection( - timestampNew=..., - correctionIndex=..., - sampling_rate=..., - timeRecStart=... - ) - -# Separate writer function -def write_timestamp_correction(filepath, region, correction: TimestampCorrection): - write_hdf5(correction.timestampNew, f"timeCorrection_{region}", filepath, "timestampNew") - # ... etc -``` - -## Current I/O Patterns to Refactor - -1. **Inline writes in computation functions:** - - `timestampCorrection_csv` and `timestampCorrection_tdt` compute AND write - - Should separate: compute → return data → write in caller - -2. **Reading for validation only:** - - `check_cntrl_sig_length` reads full data arrays just to compare shapes - - Could be optimized to read only array metadata/shapes - -3. **Side-effect file creation:** - - `add_control_channel` creates files as side effect - - `create_control_channel` both generates data AND writes multiple formats (HDF5 + CSV) - -4. **Mixed responsibilities in applyCorrection:** - - Handles both control/signal cropping AND event timestamp adjustment - - Could be split into two separate functions From 543ddfded2023a43ac9c38601321135b76d495b3 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Thu, 5 Feb 2026 15:00:52 -0800 Subject: [PATCH 150/150] propagate iloc fix --- src/guppy/analysis/transients_average.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/guppy/analysis/transients_average.py b/src/guppy/analysis/transients_average.py index 3b8dd79..9e6d372 100644 --- a/src/guppy/analysis/transients_average.py +++ b/src/guppy/analysis/transients_average.py @@ -66,7 +66,7 @@ def averageForGroup(folderNames, inputParameters): continue else: df = read_freq_and_amp_from_hdf5(temp_path[j][0], temp_path[j][1]) - arr.append(np.array([df["freq (events/min)"][0], df["amplitude"][0]])) + arr.append(np.array([df["freq (events/min)"].iloc[0], df["amplitude"].iloc[0]])) fileName.append(os.path.basename(temp_path[j][0])) arr = np.asarray(arr)
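A note on the `.iloc` change in [PATCH 150/150] above: the frames written by `write_freq_and_amp_to_hdf5` and returned by `read_freq_and_amp_from_hdf5` are indexed by session folder name (`index=fileName`), so `df["freq (events/min)"][0]` asks pandas for the *label* 0 and only succeeds through the deprecated integer-position fallback. Switching to `.iloc[0]` makes the positional intent explicit. A minimal sketch of the difference, with an invented index value and invented numbers:

```python
import pandas as pd

# One-row frame of the shape read_freq_and_amp_from_hdf5 returns: indexed by
# the session folder name, not by integers (index value and numbers made up).
df = pd.DataFrame(
    [[3.2, 1.7]],
    index=["Animal1_output_1"],
    columns=["freq (events/min)", "amplitude"],
)

# df["amplitude"][0] would look up the *label* 0; on a string index that only
# works via pandas' deprecated positional fallback (FutureWarning, and a
# KeyError once the fallback is removed).
freq = df["freq (events/min)"].iloc[0]  # explicit positional lookup
amp = df["amplitude"].iloc[0]
print(freq, amp)  # 3.2 1.7
```

`.iloc` stays positional no matter how the frame is indexed, which is what this aggregation loop needs when it pulls the single row out of each per-session file.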