From 462d20dffcd4d7d4ddc8297890cda9899bc86416 Mon Sep 17 00:00:00 2001
From: Katie Garwood
Date: Wed, 12 Mar 2025 14:45:57 -0700
Subject: [PATCH 001/120] script to output segments detected by humans

This script takes in human labels, the wavs, and an output dir. It
parses through the human labels, identifies the segments in each audio
file where a human labeled a burrowing owl call, and creates a 3 second
segment wav file for each one. It still needs to do the same for
samples of the background noise with no bird detections, to create a
balanced dataset with two classes: burrowing owl call and no burrowing
owl call. It can also be made to create segments with specific call
types rather than any burrowing owl detection.
---
 create_dataset/segment_labeled_2017_data.py | 114 ++++++++++++++++++++
 1 file changed, 114 insertions(+)
 create mode 100644 create_dataset/segment_labeled_2017_data.py

diff --git a/create_dataset/segment_labeled_2017_data.py b/create_dataset/segment_labeled_2017_data.py
new file mode 100644
index 0000000..c871243
--- /dev/null
+++ b/create_dataset/segment_labeled_2017_data.py
@@ -0,0 +1,114 @@
+"""Create human labeled audio segments.
+
+Using a CSV with human labels across a large dataset, we can
+find the segments in the audio files that correspond to a
+burrowing owl call as labeled by a human labeler. We can then
+segment these audio chunks into a folder so that we can use
+them to easily train other models. We can also do the same
+for the rest of the data to obtain segments with no bird
+call labels, to provide another class in the same domain
+as our bird vocalizations. As there are significantly more
+negatives than positives, we can choose whether we'd like the
+same number of outputs or a higher or lower amount.
+
+Example:
+
+    $ python segment_labeled_2017_data.py /path/to/human_labels.csv \
+        /path/to/directory/of/wavs/ /path/to/directory/output/
+
+"""
+
+import argparse
+import os
+import pandas as pd
+import librosa
+from pydub import AudioSegment
+
+
+def create_bird_segments(labels, wavs, output):
+    """Create human labeled audio segments.
+
+    Main function to create 3 second wav segments of the human
+    labeled detections in each wav file of interest.
+
+    Args:
+        labels (str): The path to the human labeled csv.
+        wavs (str): The path to all audio files.
+        output (str): The path to the directory where each
+            segment will be written (several per wav).
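+
+    Note:
+        A worked example of the chunk overlap check below: with
+        3 second chunks starting at 0, 3, 6, ... seconds, a call
+        labeled at offset 4.0 s with duration 1.5 s overlaps only
+        the 3-6 s chunk, while a call spanning 5.5-6.5 s overlaps
+        both the 3-6 s and 6-9 s chunks and is exported once per
+        overlapping chunk.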
+
+    """
+    os.makedirs(output, exist_ok=True)
+
+    scored_data = pd.read_csv(labels)
+    output = output + "bird_sounds/"
+    os.makedirs(output, exist_ok=True)
+
+    for audio_file in os.listdir(wavs):
+        if audio_file.endswith('.wav'):
+            audio_path = os.path.join(wavs, audio_file)
+
+            try:
+                time_series, sample_rate = librosa.load(audio_path, sr=None)
+                audio_duration = librosa.get_duration(y=time_series,
+                                                      sr=sample_rate)
+            except Exception as err:
+                print(f"Error processing {audio_file}: {err}")
+                continue
+
+            total_chunks = int(audio_duration // 3) + 1
+            chunks_data = {
+                'Chunk Start': [i * 3 for i in range(total_chunks)],
+                'Chunk End': [(i + 1) * 3 for i in range(total_chunks)],
+                'Label': ['no'] * total_chunks
+            }
+            chunks_df = pd.DataFrame(chunks_data)
+
+            filtered_data = scored_data[scored_data['IN FILE'] == audio_file]
+            bird_sound = AudioSegment.from_wav(audio_path)
+            segment_index = 0
+            for _, row in filtered_data.iterrows():
+                if row['TOP1MATCH'] != 'null':
+                    start_time = float(row['OFFSET'])
+                    end_time = start_time + float(row['DURATION'])
+
+                    for i in range(len(chunks_df)):
+                        chunk_start = chunks_df.loc[i, 'Chunk Start']
+                        chunk_end = chunks_df.loc[i, 'Chunk End']
+                        if (start_time < chunk_end and end_time > chunk_start):
+                            chunk_start = chunk_start * 1000
+                            chunk_end = chunk_end * 1000
+                            segment = bird_sound[chunk_start:chunk_end]
+                            output_file = os.path.join(
+                                output, f'{os.path.splitext(audio_file)[0]}_segment_{segment_index}.wav'
+                            )
+                            segment.export(output_file, format='wav')
+                            segment_index += 1
+
+    print("Processing complete!")
+
+def create_no_bird_segments(labels, wavs, output):
+    """Create no bird call audio segments.
+
+    """
+
+def main(labels, wavs, output):
+    """Run main script.
+
+    """
+    create_bird_segments(labels, wavs, output)
+    create_no_bird_segments(labels, wavs, output)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description='Input Directory Path'
+    )
+    parser.add_argument('labels', type=str,
+                        help='Path to human labeled csv')
+    parser.add_argument('wavs', type=str,
+                        help='Path to all wav files that have been labeled')
+    parser.add_argument('output', type=str,
+                        help='Path to desired directory for output segments')
+    args = parser.parse_args()
+    main(args.labels, args.wavs, args.output)

From 8fff3297569a41c6f2787ca1dc610fd846da7581 Mon Sep 17 00:00:00 2001
From: Katie Garwood
Date: Mon, 17 Mar 2025 12:07:22 -0700
Subject: [PATCH 002/120] add creation of no bird label segments

Now it will create a folder of 3 second bird detection segments, as
well as a folder from the same wavs with no bird detection segments.
---
 create_dataset/segment_labeled_2017_data.py | 53 +++++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/create_dataset/segment_labeled_2017_data.py b/create_dataset/segment_labeled_2017_data.py
index c871243..68ff47f 100644
--- a/create_dataset/segment_labeled_2017_data.py
+++ b/create_dataset/segment_labeled_2017_data.py
@@ -91,6 +91,59 @@ def create_no_bird_segments(labels, wavs, output):
     """Create no bird call audio segments.
""" + os.makedirs(output, exist_ok=True) + + scored_data = pd.read_csv(labels) + output = output + "no_bird_sounds/" + os.makedirs(output, exist_ok=True) + + for audio_file in os.listdir(wavs): + if audio_file.endswith('.wav'): + audio_path = os.path.join(wavs, audio_file) + + try: + time_series, sample_rate = librosa.load(audio_path, sr=None) + audio_duration = librosa.get_duration(y=time_series, sr=sample_rate) + except Exception as err: + print(f"Error processing {audio_file}: {err}") + continue + + total_chunks = int(audio_duration // 3) + 1 + chunks_data = { + 'Chunk Start': [i * 3 for i in range(total_chunks)], + 'Chunk End': [(i + 1) * 3 for i in range(total_chunks)], + 'Label': ['no'] * total_chunks + } + chunks_df = pd.DataFrame(chunks_data) + + filtered_data = scored_data[scored_data['IN FILE'] == audio_file] + + for _, row in filtered_data.iterrows(): + if row['TOP1MATCH'] != 'null': + start_time = float(row['OFFSET']) + end_time = start_time + float(row['DURATION']) + + for i in range(len(chunks_df)): + chunk_start = chunks_df.loc[i, 'Chunk Start'] + chunk_end = chunks_df.loc[i, 'Chunk End'] + if start_time < chunk_end and end_time > chunk_start: + chunks_df.loc[i, 'Label'] = 'bird' + + bird_sound = AudioSegment.from_wav(audio_path) + segment_index = 0 + for i in range(len(chunks_df)): + if chunks_df.loc[i, 'Label'] == 'no': + chunk_start = chunks_df.loc[i, 'Chunk Start'] * 1000 + chunk_end = chunks_df.loc[i, 'Chunk End'] * 1000 + segment = bird_sound[chunk_start:chunk_end] + + output_file = os.path.join( + output, f'{os.path.splitext(audio_file)[0]}_nobird_segment_{segment_index}.wav' + ) + segment.export(output_file, format='wav') + segment_index += 1 + + print("Processing complete!") def main(labels, wavs, output): """Run main script From 1f35b93096b3e58544c6afefc7334d072592e343 Mon Sep 17 00:00:00 2001 From: Katie Garwood Date: Tue, 18 Mar 2025 11:37:48 -0700 Subject: [PATCH 003/120] add way to create dataset of segements with more control this file is based off the segment_2017_labeled_data.py file but the difference is this one creates segments that are purely the length of the labeled duration of the file, with an optional padding on the front and end as the durations seem extremely tight, making the sound files very short and slightly imperfect with sometimes cutting off the milliseconds of the beginning and end of a call. this makes a tighter dataset --- cfgs/params_segment_2017_data.yaml | 2 + create_dataset/params_segment_2017_data.py | 177 +++++++++++++++++++++ 2 files changed, 179 insertions(+) create mode 100644 cfgs/params_segment_2017_data.yaml create mode 100644 create_dataset/params_segment_2017_data.py diff --git a/cfgs/params_segment_2017_data.yaml b/cfgs/params_segment_2017_data.yaml new file mode 100644 index 0000000..f09ca5a --- /dev/null +++ b/cfgs/params_segment_2017_data.yaml @@ -0,0 +1,2 @@ +#length added to beginning and end of detection segment (ms) +padding: 100 diff --git a/create_dataset/params_segment_2017_data.py b/create_dataset/params_segment_2017_data.py new file mode 100644 index 0000000..46c3519 --- /dev/null +++ b/create_dataset/params_segment_2017_data.py @@ -0,0 +1,177 @@ +"""Create human labeled audio segments. + +Using a CSV with human labels across a large dataset, we can +find the segments in the audio files that correspond to a +burrowing owl call as labeled by a human labeler. We can then +segment these audio chunks into a folder so that we can use +them to easily train other models. 
We can also do the same +for the rest of the data to obtain segments with no bird +call labels, to provide another class in the same domain +as our bird vocalizations. As there are significantly more +negatives than positives, we can choose if we'd like to get +the same number output or select a higher or lower amount. + +Example: + + $ python segment_labeled_2017_data.py /path/to/human_labels.csv \ + /path/to/directory/of/wavs/ /path/to/directory/output/ /path/to/config.yaml + +""" + +import argparse +import yaml +import os +import pandas as pd +import librosa +from pydub import AudioSegment + + +def read_configs(config): + """reading in config file variables + + """ + with open(config, "r", encoding='utf-8') as cfg: + configs = yaml.load(cfg, Loader=yaml.SafeLoader) + + return configs + +def create_bird_segments(labels, wavs, output, config): + """Create human labeled dataframes. + + Main script to create csvs of human labeled data for each + wav file of interest. + + Args: + labels (str): The path to human labeled csv. + wavs (str): The path to all audio files. + output (str): The path to directory where each csv will + output (1 for each wav). + + """ + os.makedirs(output, exist_ok=True) + + scored_data = pd.read_csv(labels) + output = output + "bird_sounds/" + os.makedirs(output, exist_ok=True) + + configs = read_configs(config) + padding = configs['padding'] + padding = int(padding) + + for audio_file in os.listdir(wavs): + if audio_file.endswith('.wav'): + audio_path = os.path.join(wavs, audio_file) + + try: + time_series, sample_rate = librosa.load(audio_path, sr=None) + audio_duration = librosa.get_duration(y=time_series, + sr=sample_rate) + except Exception as err: + print(f"Error processing {audio_file}: {err}") + continue + + filtered_data = scored_data[scored_data['IN FILE'] == audio_file] + bird_sound = AudioSegment.from_wav(audio_path) + segment_index = 0 + for _, row in filtered_data.iterrows(): + if row['TOP1MATCH'] != 'null': + start_time = float(row['OFFSET']) + print("Start time in ms? Then end time in ms, then start time w padding, and end time w padding in ms") + print(start_time) + end_time = (start_time + float(row['DURATION'])) + print(end_time) + start_time = start_time * 1000 + end_time = end_time * 1000 + start_time = start_time - padding + print(start_time) + end_time = end_time + padding + print(end_time) + segment = bird_sound[start_time:end_time] + output_file = os.path.join( + output, f'{os.path.splitext(audio_file)[0]}_segment_{segment_index}.wav' + ) + segment.export(output_file, format='wav') + segment_index += 1 + + print("Processing complete!") + +def create_no_bird_segments(labels, wavs, output): + """Create no bird call audio segments. 
+ + """ + os.makedirs(output, exist_ok=True) + + scored_data = pd.read_csv(labels) + output = output + "no_bird_sounds/" + os.makedirs(output, exist_ok=True) + + for audio_file in os.listdir(wavs): + if audio_file.endswith('.wav'): + audio_path = os.path.join(wavs, audio_file) + + try: + time_series, sample_rate = librosa.load(audio_path, sr=None) + audio_duration = librosa.get_duration(y=time_series, sr=sample_rate) + except Exception as err: + print(f"Error processing {audio_file}: {err}") + continue + + total_chunks = int(audio_duration // 3) + 1 + chunks_data = { + 'Chunk Start': [i * 3 for i in range(total_chunks)], + 'Chunk End': [(i + 1) * 3 for i in range(total_chunks)], + 'Label': ['no'] * total_chunks + } + chunks_df = pd.DataFrame(chunks_data) + + filtered_data = scored_data[scored_data['IN FILE'] == audio_file] + + for _, row in filtered_data.iterrows(): + if row['TOP1MATCH'] != 'null': + start_time = float(row['OFFSET']) + end_time = start_time + float(row['DURATION']) + + for i in range(len(chunks_df)): + chunk_start = chunks_df.loc[i, 'Chunk Start'] + chunk_end = chunks_df.loc[i, 'Chunk End'] + if start_time < chunk_end and end_time > chunk_start: + chunks_df.loc[i, 'Label'] = 'bird' + + bird_sound = AudioSegment.from_wav(audio_path) + segment_index = 0 + for i in range(len(chunks_df)): + if chunks_df.loc[i, 'Label'] == 'no': + chunk_start = chunks_df.loc[i, 'Chunk Start'] * 1000 + chunk_end = chunks_df.loc[i, 'Chunk End'] * 1000 + segment = bird_sound[chunk_start:chunk_end] + + output_file = os.path.join( + output, f'{os.path.splitext(audio_file)[0]}_nobird_segment_{segment_index}.wav' + ) + segment.export(output_file, format='wav') + segment_index += 1 + + print("Processing complete!") + +def main(labels, wavs, output, config_file): + """Run main script + + """ + create_bird_segments(labels, wavs, output, config_file) + #create_no_bird_segments(labels, wavs, output) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description='Input Directory Path' + ) + parser.add_argument('labels', type=str, + help='Path to human labeled csv') + parser.add_argument('wavs', type=str, + help='Path to all wav files that have been labeled') + parser.add_argument('output', type=str, + help='Path to desired directory for output csvs') + parser.add_argument('config_file', type=str, + help='Path to config file') + args = parser.parse_args() + main(args.labels, args.wavs, args.output, args.config_file) From 42c3514abc5dbe9394329efe5e4798958413dcc3 Mon Sep 17 00:00:00 2001 From: Katie Garwood Date: Tue, 18 Mar 2025 11:45:40 -0700 Subject: [PATCH 004/120] remove unecessary lines removed print statements and there is no longer a need to use librosa to get the sample rate and audio file duration --- create_dataset/params_segment_2017_data.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/create_dataset/params_segment_2017_data.py b/create_dataset/params_segment_2017_data.py index 46c3519..64c282c 100644 --- a/create_dataset/params_segment_2017_data.py +++ b/create_dataset/params_segment_2017_data.py @@ -22,7 +22,6 @@ import yaml import os import pandas as pd -import librosa from pydub import AudioSegment @@ -62,30 +61,17 @@ def create_bird_segments(labels, wavs, output, config): if audio_file.endswith('.wav'): audio_path = os.path.join(wavs, audio_file) - try: - time_series, sample_rate = librosa.load(audio_path, sr=None) - audio_duration = librosa.get_duration(y=time_series, - sr=sample_rate) - except Exception as err: - print(f"Error processing {audio_file}: 
{err}") - continue - filtered_data = scored_data[scored_data['IN FILE'] == audio_file] bird_sound = AudioSegment.from_wav(audio_path) segment_index = 0 for _, row in filtered_data.iterrows(): if row['TOP1MATCH'] != 'null': start_time = float(row['OFFSET']) - print("Start time in ms? Then end time in ms, then start time w padding, and end time w padding in ms") - print(start_time) end_time = (start_time + float(row['DURATION'])) - print(end_time) start_time = start_time * 1000 end_time = end_time * 1000 start_time = start_time - padding - print(start_time) end_time = end_time + padding - print(end_time) segment = bird_sound[start_time:end_time] output_file = os.path.join( output, f'{os.path.splitext(audio_file)[0]}_segment_{segment_index}.wav' From e9c826f1e876cbd298b6f95147aee5f17b2eeabe Mon Sep 17 00:00:00 2001 From: Katie Garwood Date: Tue, 15 Apr 2025 12:20:17 -0700 Subject: [PATCH 005/120] adding pseudocode for creating dataset starting over the script to create the dataset to employ all that we've learned so far. Two scripts, one to run, and one to have all the functions needed to run. create_dataset.py would be run and would call the functions in walk_buow_labels.py (could be named better). It would create/append to 1 folder and 1 csv all the human labeled detections at varied lengths, and an equal number of fixed duration noise samples from the same audio files. Now that I'm writing this, it might be better for the noise samples to be aggregated after all the detections have been aggregated, so we can randomly get samples for noise from wavs that had NO detections as well. But it's a good start. --- create_dataset/create_dataset.py | 66 ++++++++++++++++++++++++++++++ create_dataset/walk_buow_labels.py | 51 +++++++++++++++++++++++ 2 files changed, 117 insertions(+) create mode 100644 create_dataset/create_dataset.py create mode 100644 create_dataset/walk_buow_labels.py diff --git a/create_dataset/create_dataset.py b/create_dataset/create_dataset.py new file mode 100644 index 0000000..98d545c --- /dev/null +++ b/create_dataset/create_dataset.py @@ -0,0 +1,66 @@ +"""Create dataset of burrowing owl vocalizations and noise. + +This script will parse through 2017 and 2018 human labeled +burrowing owl data. It will create a folder with segments of +labeled detections, and an equal number of noise samples from +the same wav files. It will create a CSV with metadata associated +with the segments. The metadata will include the UUID of the segment, +the label, the original filepath of the original wav the segment came +from, the path to the segment, and the start and end time of the +labeled detection relative to the original wav file. The labeled +segments will be the duration of the label, and the duration of the +noise will be fixed and consistent. The user of the dataset may choose +to pad the labeled detections if they need consistent length segments. 
+ +Usage: + + python create_dataset.py /path/to/human/labeled.csv + /path/to/parent/dir/of/wavs/ /path/to/desired/output/dir/ + +""" +import walk_buow_labels +import argparse + + +def main(): + """ + """ + # parse the inputs + parser = argparse.ArgumentParser( + description='Input Directory Path' + ) + parser.add_argument('labels', type=str, + help='Path to human labeled csv') + parser.add_argument('wav_dir', type=str, + help='Path to directory containing wav files.') + parser.add_argument('output_dir', type=str, + help='Path to desired directory for segments.') + args = parser.parse_args() + main(args.labels, args.wav_dir, args.output_dir) + + # walk dir to list paths to each original wav file + wav_file_paths = get_paths(wav_dir) + # open human label file + labels = csv.read(labels) + #iterate through each individual original wav + + for wav in wav_file_paths: + # check which label format to select parsing method + # create dataframe of only the labels that correspond to the wav + if 1st row['DATE'] in labels endswith."2017": + filtered_labels = filter_labels_2017(wav, labels) + elif 1st row['DATE'] in labels endswith"2018": + filtered_labels = filter_labels_2018(wav, labels) + + # output the labeled segments and return the dataframe of annotations + new_buow_rows = create_segments(wav, filtered_labels, output_dir) + # get the number of labeled detections for that wav + num = num rows in new_rows + # create same number of noise segments from the same wav file randomly + new_noise_rows = create_noise_segments(wav, filtered_labels, num, output_dir) + # combine the buow and noise annotations created + new_rows = new_buow_rows + new_noise_rows + # add the annotations to the csv of metadata for the dataset + create_csv(new_rows) + + logging.info(f"Added " {int(new_rows)*2} "new segments from {wav}") diff --git a/create_dataset/walk_buow_labels.py b/create_dataset/walk_buow_labels.py new file mode 100644 index 0000000..753f508 --- /dev/null +++ b/create_dataset/walk_buow_labels.py @@ -0,0 +1,51 @@ +"""Functions to create segments of detections of interest from wavs. + + +""" +from pandas as pd +import os +from pydub import AudioSegment +import logging + + +def setup_logger(level, filename=None): + """ + """ + + +def get_paths(home_dir): + """ + """ + walk folder recursively and save file path of each wav in a dataframe line + return wavs_file_paths + +def create_segments(wav, filtered_labels, out_path): + """ + """ + audio = AudioSegment.from_wav(wav) + for _, row in filtered_labels.iterrows(): + + logging.info(f"Created segment " {segment}) + return output_rows + +def filter_labels_2017(wavs_file_paths, human_labels) + """ + """ + create a sub dataframe with only the rows of the human labels that correspond to the wav + need to parse based on burrow id first, and then "in file" + return filtered_labels + +def filter_labels_2018(wavs_file_paths, human_labels): + """ + """ + return filtered_labels + +def create_noise_segments(wav, filtered_labels, num, out_path): + """ + """ + + +def create_csv(new_rows): + """ + """ + From f3d93a523c769f5f4277563c46ca720dbe832dbf Mon Sep 17 00:00:00 2001 From: Katie Garwood Date: Tue, 15 Apr 2025 16:53:51 -0700 Subject: [PATCH 006/120] make more functions work intermediary commit to save progress, this works to walk the 2017 folders and ensure there's no duplicate labels. was able to catch an error in the 2017 labels where there were some of the same wav files added into another folder. 
success so far --- create_dataset/create_dataset.py | 36 ++++++++++++++++++----------- create_dataset/walk_buow_labels.py | 37 ++++++++++++++++++++++++------ 2 files changed, 52 insertions(+), 21 deletions(-) diff --git a/create_dataset/create_dataset.py b/create_dataset/create_dataset.py index 98d545c..cec0924 100644 --- a/create_dataset/create_dataset.py +++ b/create_dataset/create_dataset.py @@ -14,19 +14,21 @@ Usage: - python create_dataset.py /path/to/human/labeled.csv + python3 create_dataset.py /path/to/human/labeled.csv /path/to/parent/dir/of/wavs/ /path/to/desired/output/dir/ """ -import walk_buow_labels +from walk_buow_labels import setup_logger, get_paths, create_segments +from walk_buow_labels import filter_labels_2017, filter_labels_2018 +from walk_buow_labels import create_noise_segments, create_csv import argparse - +import pandas as pd def main(): """ """ # parse the inputs - parser = argparse.ArgumentParser( + '''parser = argparse.ArgumentParser( description='Input Directory Path' ) parser.add_argument('labels', type=str, @@ -36,25 +38,28 @@ def main(): parser.add_argument('output_dir', type=str, help='Path to desired directory for segments.') args = parser.parse_args() - main(args.labels, args.wav_dir, args.output_dir) - + main(args.labels, args.wav_dir, args.output_dir)''' + wav_dir = "/mnt/buow/Acoustic_Recordings/2017-2018/2017/Otay/" + labels = "/mnt/buow/Acoustic_Recordings/2017-2018/Results/Otay/2017/all.csv" # walk dir to list paths to each original wav file wav_file_paths = get_paths(wav_dir) # open human label file - labels = csv.read(labels) + labels = pd.read_csv(labels) #iterate through each individual original wav - + if "2017" in labels['DATE'].iloc[0]: + use_2017 = True + elif "2018" in labels['DATE'].iloc[0]: + use_2017 = False for wav in wav_file_paths: # check which label format to select parsing method # create dataframe of only the labels that correspond to the wav - if 1st row['DATE'] in labels endswith."2017": + if use_2017 == True: filtered_labels = filter_labels_2017(wav, labels) - elif 1st row['DATE'] in labels endswith"2018": + elif use_2017 == False: filtered_labels = filter_labels_2018(wav, labels) - # output the labeled segments and return the dataframe of annotations - new_buow_rows = create_segments(wav, filtered_labels, output_dir) - # get the number of labeled detections for that wav + #new_buow_rows = create_segments(wav, filtered_labels, output_dir) + '''# get the number of labeled detections for that wav num = num rows in new_rows # create same number of noise segments from the same wav file randomly new_noise_rows = create_noise_segments(wav, filtered_labels, num, output_dir) @@ -63,4 +68,7 @@ def main(): # add the annotations to the csv of metadata for the dataset create_csv(new_rows) - logging.info(f"Added " {int(new_rows)*2} "new segments from {wav}") + logging.info(f"Added " {int(new_rows)*2} "new segments from {wav}")''' + + +main() diff --git a/create_dataset/walk_buow_labels.py b/create_dataset/walk_buow_labels.py index 753f508..c0eab57 100644 --- a/create_dataset/walk_buow_labels.py +++ b/create_dataset/walk_buow_labels.py @@ -2,11 +2,12 @@ """ -from pandas as pd +import pandas as pd import os from pydub import AudioSegment import logging - +from pathlib import Path +import ntpath def setup_logger(level, filename=None): """ @@ -16,7 +17,12 @@ def setup_logger(level, filename=None): def get_paths(home_dir): """ """ - walk folder recursively and save file path of each wav in a dataframe line + wavs_file_paths = [] + for path, 
dirs, files in os.walk(home_dir): + for file in files: + if file.endswith('.wav'): + new_file = os.path.join(path, file) + wavs_file_paths.append(new_file) return wavs_file_paths def create_segments(wav, filtered_labels, out_path): @@ -25,14 +31,31 @@ def create_segments(wav, filtered_labels, out_path): audio = AudioSegment.from_wav(wav) for _, row in filtered_labels.iterrows(): - logging.info(f"Created segment " {segment}) + logging.info(f"Created segment {segment}") return output_rows -def filter_labels_2017(wavs_file_paths, human_labels) +def filter_labels_2017(wav, labels): """ """ - create a sub dataframe with only the rows of the human labels that correspond to the wav - need to parse based on burrow id first, and then "in file" + file_name = ntpath.basename(wav) + # isolate labels that match the wav basename + filtered_labels = labels[labels['IN FILE'] == file_name] + index_drop = [] + wav = str(wav) + # ensure the labels match the site and burrow name of wav file + for index, row in filtered_labels.iterrows(): + burrow = row['Burrow'] + bur = burrow[:-1] + site = burrow[-1:] + if bur not in wav: + print(f"{bur} is not in {wav}") + index_drop.append(index) + if site not in wav: + print(f"{site} is not in {wav}") + index_drop.append(index) + for index in index_drop: + filtered_labels.drop(index) + return filtered_labels def filter_labels_2018(wavs_file_paths, human_labels): From 46f3881c57e0fe6e7f0ca01fefa08b740664247f Mon Sep 17 00:00:00 2001 From: Katie Garwood Date: Wed, 16 Apr 2025 10:33:16 -0700 Subject: [PATCH 007/120] change structure and names of some files walk_buow_labels didn't make much sense, and i also wanted to separate the functions for filtering the labels, as those are the functions that will be varied based on the label file, whereas ideally the other functions will be able to be the same across data and label files. It will make it easier in the future for us to add other parsers for different formats of label files --- create_dataset/create_dataset.py | 25 ++++++++++---- ...walk_buow_labels.py => create_segments.py} | 29 ---------------- create_dataset/filter_labels.py | 34 +++++++++++++++++++ 3 files changed, 52 insertions(+), 36 deletions(-) rename create_dataset/{walk_buow_labels.py => create_segments.py} (51%) create mode 100644 create_dataset/filter_labels.py diff --git a/create_dataset/create_dataset.py b/create_dataset/create_dataset.py index cec0924..ef14ac6 100644 --- a/create_dataset/create_dataset.py +++ b/create_dataset/create_dataset.py @@ -13,14 +13,14 @@ to pad the labeled detections if they need consistent length segments. 
Usage: - +/ python3 create_dataset.py /path/to/human/labeled.csv /path/to/parent/dir/of/wavs/ /path/to/desired/output/dir/ """ -from walk_buow_labels import setup_logger, get_paths, create_segments -from walk_buow_labels import filter_labels_2017, filter_labels_2018 -from walk_buow_labels import create_noise_segments, create_csv +from create_segments import setup_logger, get_paths, create_segments +from create_segments import create_noise_segments, create_csv +from filter_labels import filter_labels_2017, filter_labels_2018 import argparse import pandas as pd @@ -31,16 +31,27 @@ def main(): '''parser = argparse.ArgumentParser( description='Input Directory Path' ) - parser.add_argument('labels', type=str, + parser.add_argument('-labels', type=str, help='Path to human labeled csv') - parser.add_argument('wav_dir', type=str, + parser.add_argument('-wav_dir', type=str, help='Path to directory containing wav files.') - parser.add_argument('output_dir', type=str, + parser.add_argument('-output_dir', type=str, help='Path to desired directory for segments.') + parser.add_argument('-l', '--lengthen', type=int, default=0, + help='ms of padding for front and end of detection segment') + parser.add_argument('-e', '--equalize', type=int, + help='each detection segment and noise segment will be the same length, not zero padded') args = parser.parse_args() main(args.labels, args.wav_dir, args.output_dir)''' wav_dir = "/mnt/buow/Acoustic_Recordings/2017-2018/2017/Otay/" labels = "/mnt/buow/Acoustic_Recordings/2017-2018/Results/Otay/2017/all.csv" + output_dir = "/home/katiegarwood/create_dataset/" + if output dir exists + good, if not make + if labels exist, good + if not tell user + if wav dir exists + if not tell user # walk dir to list paths to each original wav file wav_file_paths = get_paths(wav_dir) # open human label file diff --git a/create_dataset/walk_buow_labels.py b/create_dataset/create_segments.py similarity index 51% rename from create_dataset/walk_buow_labels.py rename to create_dataset/create_segments.py index c0eab57..d340def 100644 --- a/create_dataset/walk_buow_labels.py +++ b/create_dataset/create_segments.py @@ -34,35 +34,6 @@ def create_segments(wav, filtered_labels, out_path): logging.info(f"Created segment {segment}") return output_rows -def filter_labels_2017(wav, labels): - """ - """ - file_name = ntpath.basename(wav) - # isolate labels that match the wav basename - filtered_labels = labels[labels['IN FILE'] == file_name] - index_drop = [] - wav = str(wav) - # ensure the labels match the site and burrow name of wav file - for index, row in filtered_labels.iterrows(): - burrow = row['Burrow'] - bur = burrow[:-1] - site = burrow[-1:] - if bur not in wav: - print(f"{bur} is not in {wav}") - index_drop.append(index) - if site not in wav: - print(f"{site} is not in {wav}") - index_drop.append(index) - for index in index_drop: - filtered_labels.drop(index) - - return filtered_labels - -def filter_labels_2018(wavs_file_paths, human_labels): - """ - """ - return filtered_labels - def create_noise_segments(wav, filtered_labels, num, out_path): """ """ diff --git a/create_dataset/filter_labels.py b/create_dataset/filter_labels.py new file mode 100644 index 0000000..939319a --- /dev/null +++ b/create_dataset/filter_labels.py @@ -0,0 +1,34 @@ +import pandas as pd +import os +import ntpath +import logging + + +def filter_labels_2017(wav, labels): + """ + """ + file_name = ntpath.basename(wav) + # isolate labels that match the wav basename + filtered_labels = labels[labels['IN FILE'] == 
file_name]
+    index_drop = []
+    wav = str(wav)
+    # ensure the labels match the site and burrow name of wav file
+    for index, row in filtered_labels.iterrows():
+        burrow = row['Burrow']
+        bur = burrow[:-1]
+        site = burrow[-1:]
+        if bur not in wav:
+            print(f"{bur} is not in {wav}")
+            index_drop.append(index)
+        if site not in wav:
+            print(f"{site} is not in {wav}")
+            index_drop.append(index)
+    for index in index_drop:
+        filtered_labels.drop(index)
+
+    return filtered_labels
+
+def filter_labels_2018(wavs_file_paths, human_labels):
+    """
+    """
+    return filtered_labels

From 76bc584771a02a697558e11c0e57f9790e0800f5 Mon Sep 17 00:00:00 2001
From: Katie Garwood
Date: Wed, 16 Apr 2025 10:41:25 -0700
Subject: [PATCH 008/120] move older files out of the main section

I didn't like my old scripts in with the new and improved functions.
params and segment can parse 2017 and 2018 labeled data: segment
creates 3 second segments based on retroactively labeling 3 second
segments, and params will just create a segment length equal or
slightly longer (depending on your params) than the duration of the
label overall.
---
 create_dataset/{ => tests}/params_segment_2017_data.py  | 8 ++++++--
 create_dataset/{ => tests}/segment_labeled_2017_data.py | 0
 2 files changed, 6 insertions(+), 2 deletions(-)
 rename create_dataset/{ => tests}/params_segment_2017_data.py (95%)
 rename create_dataset/{ => tests}/segment_labeled_2017_data.py (100%)

diff --git a/create_dataset/params_segment_2017_data.py b/create_dataset/tests/params_segment_2017_data.py
similarity index 95%
rename from create_dataset/params_segment_2017_data.py
rename to create_dataset/tests/params_segment_2017_data.py
index 64c282c..b0fc5e9 100644
--- a/create_dataset/params_segment_2017_data.py
+++ b/create_dataset/tests/params_segment_2017_data.py
@@ -22,7 +22,7 @@
 import yaml
 import os
 import pandas as pd
-from pydub import AudioSegment
+from pydub import AudioSegment, exceptions
 
 
 def read_configs(config):
@@ -62,7 +62,11 @@ def create_bird_segments(labels, wavs, output, config):
             audio_path = os.path.join(wavs, audio_file)
 
             filtered_data = scored_data[scored_data['IN FILE'] == audio_file]
-            bird_sound = AudioSegment.from_wav(audio_path)
+            try:
+                bird_sound = AudioSegment.from_wav(audio_path)
+            except exceptions.CouldntDecodeError:
+                print(f"Couldn't decode: {audio_path}, moving to next file.")
+                continue
             segment_index = 0
             for _, row in filtered_data.iterrows():
                 if row['TOP1MATCH'] != 'null':

diff --git a/create_dataset/segment_labeled_2017_data.py b/create_dataset/tests/segment_labeled_2017_data.py
similarity index 100%
rename from create_dataset/segment_labeled_2017_data.py
rename to create_dataset/tests/segment_labeled_2017_data.py

From 841a55ba9e0fb117b2aa9461162f1e8883318a27 Mon Sep 17 00:00:00 2001
From: Katie Garwood
Date: Wed, 16 Apr 2025 17:04:45 -0700
Subject: [PATCH 009/120] made progress on parsing 2018 labels

The main function has been debugged, but needs another close look
through to ensure that it's not making any grave mistakes. It doesn't
crash while running through all the audio, and I believe I found a way
to ensure that, in the event of a duplicate file name, the mismatched
label in the all labels file gets dropped out of the labels. Next is to
begin the making of the segment part.
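For context, a quick way to check how often the same wav name shows up
under more than one burrow/site (a sketch; wav_file_paths as returned
by get_paths):

    import os
    from collections import Counter

    # count each wav basename across every folder that was walked
    counts = Counter(os.path.basename(p) for p in wav_file_paths)
    dupes = [name for name, n in counts.items() if n > 1]
    print(f"{len(dupes)} wav names appear in more than one folder")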
---
 create_dataset/create_dataset.py |  8 +++---
 create_dataset/filter_labels.py  | 43 +++++++++++++++++++++++++++++---
 2 files changed, 44 insertions(+), 7 deletions(-)

diff --git a/create_dataset/create_dataset.py b/create_dataset/create_dataset.py
index ef14ac6..6d266e2 100644
--- a/create_dataset/create_dataset.py
+++ b/create_dataset/create_dataset.py
@@ -43,15 +43,15 @@ def main():
                         help='each detection segment and noise segment will be the same length, not zero padded')
     args = parser.parse_args()
     main(args.labels, args.wav_dir, args.output_dir)'''
-    wav_dir = "/mnt/buow/Acoustic_Recordings/2017-2018/2017/Otay/"
-    labels = "/mnt/buow/Acoustic_Recordings/2017-2018/Results/Otay/2017/all.csv"
+    wav_dir = "/mnt/buow/Acoustic_Recordings/2017-2018/2018/Otay/Lonestar/"
+    labels = "/mnt/buow/Acoustic_Recordings/2017-2018/Results/Otay/2018/allpks2018.csv"
     output_dir = "/home/katiegarwood/create_dataset/"
-    if output dir exists
+    '''if output dir exists
     good, if not make
     if labels exist, good
     if not tell user
     if wav dir exists
-    if not tell user
+    if not tell user'''
     # walk dir to list paths to each original wav file
     wav_file_paths = get_paths(wav_dir)
     # open human label file

diff --git a/create_dataset/filter_labels.py b/create_dataset/filter_labels.py
index 939319a..e463801 100644
--- a/create_dataset/filter_labels.py
+++ b/create_dataset/filter_labels.py
@@ -23,12 +23,49 @@ def filter_labels_2017(wav, labels):
         if site not in wav:
             print(f"{site} is not in {wav}")
             index_drop.append(index)
-    for index in index_drop:
-        filtered_labels.drop(index)
+
+    filtered_labels = filtered_labels.drop(index_drop)
 
     return filtered_labels
 
-def filter_labels_2018(wavs_file_paths, human_labels):
+def filter_labels_2018(wav, labels):
     """
+    Because we do not have full file paths, we need to ensure that there
+    are not duplicate .wav file names that are associated with different burrows/sites.
+    If we just use the all label file, it would be difficult to determine which burrow/site
+    is correct for the wav file, because the file paths are inconsistent. This function
+    chooses the label file to use based on the wav name, and then obtains the labels for
+    that site/burrow within that folder so that there's no question that it's for that
+    site/burrow. 2017 is formatted very differently and we are able to back out the burrow/site
+    from the path to the wav and other information in the all labels file.
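+
+    An illustrative layout (hypothetical file names; real paths vary):
+
+        wav:    .../2018/Otay/Lonestar/EarlyBreeding/LS128/SM10/FILE0001.wav
+        labels: .../LS128/SM10/ClassificationResults/EarBreed_2018_LS128_SM10.csv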
""" + file_name = ntpath.basename(wav) + path_name = ntpath.dirname(wav) + basepath = os.path.basename(path_name) + if basepath == "ClassificationResults" or basepath == "Classification_Results": + return None + path_labels = [] + path_labels.append(path_name + "/ClassificationResults/") + path_labels.append(path_name + "/Classification_Results/") + path_to_results = None + for path in path_labels: + exists = os.path.exists(path) + if exists == True: + path_to_results = path + else: + print(f"{path} does not exist") + continue + if path_to_results == None: + return None + filtered_labels = labels[labels['IN FILE'] == file_name] + index_to_drop = [] + for index, row in filtered_labels.iterrows(): + check_path = os.path.join(path_to_results, row['Fled_2018_LS133_SM1.csv '].strip()) + if os.path.isfile(check_path): + continue + else: + index_to_drop.append(index) + + filtered_labels = filtered_labels.drop(index_to_drop) + return filtered_labels From 80029d0575e71de52e355614cf581c6057ea634d Mon Sep 17 00:00:00 2001 From: Katie Garwood Date: Thu, 17 Apr 2025 15:54:37 -0700 Subject: [PATCH 010/120] made create segments function work with subset of data it currently will create the segments based on the filtered labels for that audio file, create segments, and add the lines of interest to a dataframe with the uuid of the file, the path to the file, the call type label, the file path to the original file, and the duration of the sample. it will return that dataframe. need to check it with all the audio and ensure if there's any more errors, they get caught, and make the functions to put the data from all the wavs together in 1 csv. also need to create the function that makes the noise samples. but currently creating uuid segments and the metadata associated seems successful. 
--- create_dataset/create_dataset.py | 13 +++++++---- create_dataset/create_segments.py | 38 ++++++++++++++++++++++++++----- create_dataset/filter_labels.py | 22 +++++++++++++++++- 3 files changed, 61 insertions(+), 12 deletions(-) diff --git a/create_dataset/create_dataset.py b/create_dataset/create_dataset.py index 6d266e2..efc0756 100644 --- a/create_dataset/create_dataset.py +++ b/create_dataset/create_dataset.py @@ -37,15 +37,18 @@ def main(): help='Path to directory containing wav files.') parser.add_argument('-output_dir', type=str, help='Path to desired directory for segments.') + parser.add_argument('-class_list', type=str, + help='Path to txt file of list of labeled classes') parser.add_argument('-l', '--lengthen', type=int, default=0, help='ms of padding for front and end of detection segment') parser.add_argument('-e', '--equalize', type=int, help='each detection segment and noise segment will be the same length, not zero padded') args = parser.parse_args() main(args.labels, args.wav_dir, args.output_dir)''' - wav_dir = "/mnt/buow/Acoustic_Recordings/2017-2018/2018/Otay/Lonestar/" + wav_dir = "/mnt/buow/Acoustic_Recordings/2017-2018/2018/Otay/Lonestar/EarlyBreeding/LS128/SM10/" labels = "/mnt/buow/Acoustic_Recordings/2017-2018/Results/Otay/2018/allpks2018.csv" - output_dir = "/home/katiegarwood/create_dataset/" + output_dir = "/home/katie/create_dataset/" + class_list = "/home/katie/class_list.txt" '''if output dir exists good, if not make if labels exist, good @@ -69,9 +72,9 @@ def main(): elif use_2017 == False: filtered_labels = filter_labels_2018(wav, labels) # output the labeled segments and return the dataframe of annotations - #new_buow_rows = create_segments(wav, filtered_labels, output_dir) - '''# get the number of labeled detections for that wav - num = num rows in new_rows + new_buow_rows = create_segments(wav, filtered_labels, output_dir, class_list) + # get the number of labeled detections for that wav + '''num = num rows in new_rows # create same number of noise segments from the same wav file randomly new_noise_rows = create_noise_segments(wav, filtered_labels, num, output_dir) # combine the buow and noise annotations created diff --git a/create_dataset/create_segments.py b/create_dataset/create_segments.py index d340def..05f6099 100644 --- a/create_dataset/create_segments.py +++ b/create_dataset/create_segments.py @@ -4,10 +4,11 @@ """ import pandas as pd import os -from pydub import AudioSegment +from pydub import AudioSegment, exceptions import logging from pathlib import Path import ntpath +import uuid def setup_logger(level, filename=None): """ @@ -25,13 +26,38 @@ def get_paths(home_dir): wavs_file_paths.append(new_file) return wavs_file_paths -def create_segments(wav, filtered_labels, out_path): +def create_segments(wav, filtered_labels, out_path, class_list): """ """ - audio = AudioSegment.from_wav(wav) - for _, row in filtered_labels.iterrows(): - - logging.info(f"Created segment {segment}") + if filtered_labels is None: + print(f"skipping segment creation for {wav} because it does not have labels or is not a file of interest") + return None + output_rows = pd.DataFrame(columns=['segment', 'label', 'segment_path', 'original_path', 'segment_duration_ms']) + with open(class_list, 'r') as file: + classes = file.read() + class_list = classes.split(',') + try: + audio = AudioSegment.from_wav(wav) + except exceptions.CouldntDecodeError: + print(f"Couldn't decode: {audio}, moving to next file") + rows_with_none = filtered_labels[filtered_labels['MANUAL 
ID*'].isnull()] + filtered_labels['MANUAL ID*'] = filtered_labels['MANUAL ID*'].str.lower() + df_row = 0 + for index, row in filtered_labels.iterrows(): + for call_type in class_list: + if row['MANUAL ID*'] == call_type: + start_time = float(row['OFFSET']) + end_time = (start_time + float(row['DURATION'])) + start_time = start_time * 1000 + end_time = end_time * 1000 + segment = audio[start_time:end_time] + id = uuid.uuid4() + id = str(id) + '.wav' + segment_path = os.path.join(out_path, id) + segment.export(segment_path, format='wav') + output_rows.loc[df_row] = [id, call_type, segment_path, wav, float(row['DURATION'])] + df_row += 1 + print(f"Created segment {segment_path}") return output_rows def create_noise_segments(wav, filtered_labels, num, out_path): diff --git a/create_dataset/filter_labels.py b/create_dataset/filter_labels.py index e463801..368b855 100644 --- a/create_dataset/filter_labels.py +++ b/create_dataset/filter_labels.py @@ -43,11 +43,15 @@ def filter_labels_2018(wav, labels): path_name = ntpath.dirname(wav) basepath = os.path.basename(path_name) if basepath == "ClassificationResults" or basepath == "Classification_Results": + print(f"skipping {wav} because it's basepath is {basepath}") + # skipping extra wav files that exist as duplicates of our wavs of interest within these sub dirs return None + # some of the folders have an underscore and some do not path_labels = [] path_labels.append(path_name + "/ClassificationResults/") path_labels.append(path_name + "/Classification_Results/") path_to_results = None + # checking if it's the one with an underscore vs not for path in path_labels: exists = os.path.exists(path) if exists == True: @@ -56,16 +60,32 @@ def filter_labels_2018(wav, labels): print(f"{path} does not exist") continue if path_to_results == None: + # skipping wav files that are an exception to this folder structure because they're + # not the wav files of interest + print(f"skipping {wav} because it's not a file of interest") return None filtered_labels = labels[labels['IN FILE'] == file_name] index_to_drop = [] + # iterating the columns in labels that match the wav file name for index, row in filtered_labels.iterrows(): check_path = os.path.join(path_to_results, row['Fled_2018_LS133_SM1.csv '].strip()) + # there's a column in the all labels file that has the file name of the subset label file that + # the all labels file was aggregated from, and if the wav file path leads us to + # the label file listed in the all labels file, then it will be apart of the filtered + # labels for that wav. this needs to be checked in case 2 wav files have the same + # file name, but are from different burrows/sites. + # it's worth noting that this could be done a different way, using the subset label files + # for each burrow/site labels, but you'd still need the all labels file to validate, so it + # just felt like more steps if os.path.isfile(check_path): continue else: index_to_drop.append(index) - + # if there were labels associated with a different wav file that happened to have the same + # name, this will drop the labels associated with a different burrow/site filtered_labels = filtered_labels.drop(index_to_drop) return filtered_labels + + +# TODO: There's one subset label file that has no column names, so that error needs to be dealt with. 
It will currently ignore that one From b673c54e437bd927d0d319fb466fd16eae83ff4f Mon Sep 17 00:00:00 2001 From: Katie Garwood Date: Mon, 21 Apr 2025 16:15:18 -0700 Subject: [PATCH 011/120] begin adding functionality to creating noise segments able to create an array for each second of the audio file, and a mask in the spots that have an owl detection + a buffer. leaving the unmasked portions as free reign for using for randomly generating noise segments. need to add now something to randomly peruse the unmasked portion, seeking the num of burrowing owl detections and generating 3s samples from it, and creating a dataframe with the same values as the create_segments function. most of the code can be borrowed from that except for the random selection, and checking --- create_dataset/create_dataset.py | 11 +++++------ create_dataset/create_segments.py | 27 ++++++++++++++++++++++++--- 2 files changed, 29 insertions(+), 9 deletions(-) diff --git a/create_dataset/create_dataset.py b/create_dataset/create_dataset.py index efc0756..127dc78 100644 --- a/create_dataset/create_dataset.py +++ b/create_dataset/create_dataset.py @@ -14,8 +14,9 @@ Usage: / - python3 create_dataset.py /path/to/human/labeled.csv - /path/to/parent/dir/of/wavs/ /path/to/desired/output/dir/ + python3 create_dataset.py -labels /path/to/human/labeled.csv + -wav_dir /path/to/parent/dir/of/wavs/ -output_dir /path/to/desired/output/dir/ + -class_list /path/to/classes.txt """ from create_segments import setup_logger, get_paths, create_segments @@ -73,11 +74,9 @@ def main(): filtered_labels = filter_labels_2018(wav, labels) # output the labeled segments and return the dataframe of annotations new_buow_rows = create_segments(wav, filtered_labels, output_dir, class_list) - # get the number of labeled detections for that wav - '''num = num rows in new_rows # create same number of noise segments from the same wav file randomly - new_noise_rows = create_noise_segments(wav, filtered_labels, num, output_dir) - # combine the buow and noise annotations created + new_noise_rows = create_noise_segments(wav, new_buow_rows, output_dir) + '''# combine the buow and noise annotations created new_rows = new_buow_rows + new_noise_rows # add the annotations to the csv of metadata for the dataset create_csv(new_rows) diff --git a/create_dataset/create_segments.py b/create_dataset/create_segments.py index 05f6099..25c6e95 100644 --- a/create_dataset/create_segments.py +++ b/create_dataset/create_segments.py @@ -9,6 +9,8 @@ from pathlib import Path import ntpath import uuid +import numpy as np +import random def setup_logger(level, filename=None): """ @@ -32,7 +34,7 @@ def create_segments(wav, filtered_labels, out_path, class_list): if filtered_labels is None: print(f"skipping segment creation for {wav} because it does not have labels or is not a file of interest") return None - output_rows = pd.DataFrame(columns=['segment', 'label', 'segment_path', 'original_path', 'segment_duration_ms']) + output_rows = pd.DataFrame(columns=['segment', 'label', 'segment_path', 'original_path', 'segment_duration_s', 'segment_rel_start_ms']) with open(class_list, 'r') as file: classes = file.read() class_list = classes.split(',') @@ -55,15 +57,34 @@ def create_segments(wav, filtered_labels, out_path, class_list): id = str(id) + '.wav' segment_path = os.path.join(out_path, id) segment.export(segment_path, format='wav') - output_rows.loc[df_row] = [id, call_type, segment_path, wav, float(row['DURATION'])] + output_rows.loc[df_row] = [id, call_type, segment_path, wav, 
float(row['DURATION']), start_time] df_row += 1 print(f"Created segment {segment_path}") return output_rows -def create_noise_segments(wav, filtered_labels, num, out_path): +# def create_birdnet_segments(wav, out_path, birdnet_class_list=None): + +def create_noise_segments(wav, new_buow_rows, out_path): """ + Randomly select an equal number of 3s noise segments to + the number of detections per audio file, a buffer length + away from all of the detections in the file. """ + try: + audio = AudioSegment.from_wav(wav) + # duration in seconds, cutting off the ms + duration = int(len(audio) / 1000) + except exceptions.CouldntDecodeError: + print(f"Couldn't decode: {audio}, moving to next file") + num = len(new_buow_rows) + seconds_array = np.zeros(duration) + for index, row in new_buow_rows.iterrows(): + start = int((row['segment_rel_start_ms'] / 1000) - 1) + end = int((row['segment_rel_start_ms'] / 1000) + row['segment_duration_s']) + mask_start = max(0, start - 30) + mask_end = min(len(seconds_array), end + 30 + 1) + seconds_array[mask_start:mask_end] = 1 def create_csv(new_rows): """ From 770cfcd73ae89a9b6b55bc35087c7b0a33c5fbaf Mon Sep 17 00:00:00 2001 From: Katie Garwood Date: Tue, 22 Apr 2025 12:47:54 -0700 Subject: [PATCH 012/120] working no_buow segment creation this is the messiest function ive ever written but it currently works and i want to save progress. i need to ensure it's choosing viable start times, and there's not some weird rounding logic that doesn't make sense. we're dealing with some values that i convert to ints but due to the buffer added between positive segments i think this is fine. currently makes the segments and adds it to the other df, makes an equal number to the detection segments in this group. --- create_dataset/create_segments.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/create_dataset/create_segments.py b/create_dataset/create_segments.py index 25c6e95..d703a42 100644 --- a/create_dataset/create_segments.py +++ b/create_dataset/create_segments.py @@ -76,8 +76,8 @@ def create_noise_segments(wav, new_buow_rows, out_path): duration = int(len(audio) / 1000) except exceptions.CouldntDecodeError: print(f"Couldn't decode: {audio}, moving to next file") - - num = len(new_buow_rows) + call_type = "no_buow" + num = len(new_buow_rows) * 2 seconds_array = np.zeros(duration) for index, row in new_buow_rows.iterrows(): start = int((row['segment_rel_start_ms'] / 1000) - 1) @@ -85,6 +85,25 @@ def create_noise_segments(wav, new_buow_rows, out_path): mask_start = max(0, start - 30) mask_end = min(len(seconds_array), end + 30 + 1) seconds_array[mask_start:mask_end] = 1 + new_sample = num / 2 + while num > new_sample: + random_index = np.random.choice(len(seconds_array)) + if seconds_array[random_index] == 0: + if seconds_array[random_index + 1] == 0: + if seconds_array[random_index + 2] == 0: + start_time = random_index + 1 * 1000 + end_time = random_index + 4 * 1000 + segment = audio[start_time:end_time] + duration_of_segment = len(segment) / 1000 + id = uuid.uuid4() + id = str(id) + '.wav' + segment_path = os.path.join(out_path, id) + segment.export(segment_path, format='wav') + new_buow_rows.loc[new_sample] = [id, call_type, segment_path, wav, duration_of_segment, start_time] + new_sample += 1 + + print(f"Dataframe with added noise samples {new_buow_rows}") + return new_buow_rows def create_csv(new_rows): """ From 41cc8293c60c3751de81dabd09b379025047d26c Mon Sep 17 00:00:00 2001 From: Katie Garwood Date: Tue, 22 Apr 2025 
16:20:59 -0700 Subject: [PATCH 013/120] more working added the beginnings of a script to deal with the creation of 5 folds of cleanly distrubuted data, stratified across all classes while not splitting up clips from the same wav file to minimize potential of leakage. also was able to get the data to all get together in one dataframe, but still need to check the relative start time column because something seems off about some of the decimals. also, need to handle a none type error in the noise segment creation because some empty dataframes are still being passed to that one, so need to check for passing None in previous steps. Also need to see how it will handle adding the 2017 and 2018 data together since they need to be run separately. Making the folded dataset is separate because of this- the fact that i need to run 2017 and 2018 separately, though i suppose it could iterate a list and do them both at the same time? seems a little excessive but thats the only way i could do the stratify step otherwise. i think it will be more useful to have it as separate as i feel we'd wanna do this in different ways and with other data. --- create_dataset/create_dataset.py | 66 ++++++++++++++++++------------- create_dataset/create_segments.py | 35 ++++++++-------- create_dataset/strat_k_folds.py | 33 ++++++++++++++++ 3 files changed, 89 insertions(+), 45 deletions(-) create mode 100644 create_dataset/strat_k_folds.py diff --git a/create_dataset/create_dataset.py b/create_dataset/create_dataset.py index 127dc78..4d5a772 100644 --- a/create_dataset/create_dataset.py +++ b/create_dataset/create_dataset.py @@ -24,38 +24,25 @@ from filter_labels import filter_labels_2017, filter_labels_2018 import argparse import pandas as pd +import ntpath +import os -def main(): +def create_dataset(labels, wav_dir, output_dir, class_list): """ """ # parse the inputs - '''parser = argparse.ArgumentParser( - description='Input Directory Path' - ) - parser.add_argument('-labels', type=str, - help='Path to human labeled csv') - parser.add_argument('-wav_dir', type=str, - help='Path to directory containing wav files.') - parser.add_argument('-output_dir', type=str, - help='Path to desired directory for segments.') - parser.add_argument('-class_list', type=str, - help='Path to txt file of list of labeled classes') - parser.add_argument('-l', '--lengthen', type=int, default=0, - help='ms of padding for front and end of detection segment') - parser.add_argument('-e', '--equalize', type=int, - help='each detection segment and noise segment will be the same length, not zero padded') - args = parser.parse_args() - main(args.labels, args.wav_dir, args.output_dir)''' - wav_dir = "/mnt/buow/Acoustic_Recordings/2017-2018/2018/Otay/Lonestar/EarlyBreeding/LS128/SM10/" - labels = "/mnt/buow/Acoustic_Recordings/2017-2018/Results/Otay/2018/allpks2018.csv" - output_dir = "/home/katie/create_dataset/" - class_list = "/home/katie/class_list.txt" '''if output dir exists good, if not make if labels exist, good if not tell user if wav dir exists if not tell user''' + out_file = ntpath.dirname(output_dir) + result_file = os.path.join(out_file, "metadata.csv") + if os.path.exists(result_file): + all_data = pd.read_csv(result_file) + else: + all_data = pd.DataFrame() # walk dir to list paths to each original wav file wav_file_paths = get_paths(wav_dir) # open human label file @@ -75,13 +62,36 @@ def main(): # output the labeled segments and return the dataframe of annotations new_buow_rows = create_segments(wav, filtered_labels, output_dir, 
class_list) # create same number of noise segments from the same wav file randomly - new_noise_rows = create_noise_segments(wav, new_buow_rows, output_dir) - '''# combine the buow and noise annotations created - new_rows = new_buow_rows + new_noise_rows + all_buow_rows = create_noise_segments(wav, new_buow_rows, output_dir) # add the annotations to the csv of metadata for the dataset - create_csv(new_rows) + + all_data = pd.concat([all_data, all_buow_rows]) + print(all_data) - logging.info(f"Added " {int(new_rows)*2} "new segments from {wav}")''' + print(f"Added {len(all_buow_rows)} new segments from {wav}") + all_data.to_csv(result_file) + print(f"Created results: {result_file}") +def main(labels, wav_dir, output_dir, class_list): + """ + """ + create_dataset(labels, wav_dir, output_dir, class_list) -main() +if __name__=="__main__": + parser = argparse.ArgumentParser( + description='Input Directory Path' + ) + parser.add_argument('-labels', type=str, + help='Path to human labeled csv') + parser.add_argument('-wav_dir', type=str, + help='Path to directory containing wav files.') + parser.add_argument('-output_dir', type=str, + help='Path to desired directory for segments.') + parser.add_argument('-class_list', type=str, + help='Path to txt file of list of labeled classes') + #parser.add_argument('-l', '--lengthen', type=int, default=0, + # help='ms of padding for front and end of detection segment') + # parser.add_argument('-e', '--equalize', type=int, + # help='each detection segment and noise segment will be the same length, not zero padded') + args = parser.parse_args() + main(args.labels, args.wav_dir, args.output_dir, args.class_list) diff --git a/create_dataset/create_segments.py b/create_dataset/create_segments.py index d703a42..a895ba0 100644 --- a/create_dataset/create_segments.py +++ b/create_dataset/create_segments.py @@ -11,6 +11,7 @@ import uuid import numpy as np import random +import ntpath def setup_logger(level, filename=None): """ @@ -45,6 +46,7 @@ def create_segments(wav, filtered_labels, out_path, class_list): rows_with_none = filtered_labels[filtered_labels['MANUAL ID*'].isnull()] filtered_labels['MANUAL ID*'] = filtered_labels['MANUAL ID*'].str.lower() df_row = 0 + path = ntpath.dirname(wav) for index, row in filtered_labels.iterrows(): for call_type in class_list: if row['MANUAL ID*'] == call_type: @@ -88,24 +90,23 @@ def create_noise_segments(wav, new_buow_rows, out_path): new_sample = num / 2 while num > new_sample: random_index = np.random.choice(len(seconds_array)) - if seconds_array[random_index] == 0: - if seconds_array[random_index + 1] == 0: - if seconds_array[random_index + 2] == 0: - start_time = random_index + 1 * 1000 - end_time = random_index + 4 * 1000 - segment = audio[start_time:end_time] - duration_of_segment = len(segment) / 1000 - id = uuid.uuid4() - id = str(id) + '.wav' - segment_path = os.path.join(out_path, id) - segment.export(segment_path, format='wav') - new_buow_rows.loc[new_sample] = [id, call_type, segment_path, wav, duration_of_segment, start_time] - new_sample += 1 + if seconds_array[random_index] == 0 and seconds_array[random_index + 3] == 0: + start_time = random_index + 1 * 1000 + end_time = random_index + 4 * 1000 + segment = audio[start_time:end_time] + duration_of_segment = len(segment) / 1000 + id = uuid.uuid4() + id = str(id) + '.wav' + segment_path = os.path.join(out_path, id) + segment.export(segment_path, format='wav') + new_buow_rows.loc[new_sample] = [id, call_type, segment_path, wav, duration_of_segment, start_time] + 
new_sample += 1
-    print(f"Dataframe with added noise samples {new_buow_rows}")
-    return new_buow_rows
+    all_buow_rows = new_buow_rows
+    return all_buow_rows
 
-def create_csv(new_rows):
+def create_csv(new_rows, output_dir):
     """
     """
-
+    if os.path.exists(output_dir):
+        pd.con
diff --git a/create_dataset/strat_k_folds.py b/create_dataset/strat_k_folds.py
new file mode 100644
index 0000000..6624148
--- /dev/null
+++ b/create_dataset/strat_k_folds.py
@@ -0,0 +1,33 @@
+"""
+"""
+import pandas as pd
+import argparse
+
+def create_strat_folds(df):
+    """
+    """
+    obtain complete class distribution for all classes
+    divide each class by 5, thats how many need to be in each fold
+    lets say theres 5000 bo buow, 1000 cluck, 500 coocoo, 500 chick begging, 2000 alarm, 1000 twitter
+    so each fold needs, 1000 no buow, 100 coocoo, 100 chick begging, 400 alarm, 200 twitter
+
+    if i create a df that has the wav path, the distribution of calls in a column. i can randomly add different ones
+    together until i get about that disribution per class. then i can go in and add a column to the df that has the fold it's in. donezo
+    what combination of these properites will give me the closest distribution to a balanced set?
+
+def main(meta):
+    """
+    """
+    df = pd.read(meta)
+    create_strat_folds(df)
+
+
+if __name__=="__main__":
+    parser = argparse.ArgumentParser(
+        description='Input Directory Path'
+    )
+    parser.add_argument('meta', type=str,
+                        help='Path to metadata csv')
+    args = parser.parse_args()
+    main(args.meta)
+
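An off-the-shelf alternative worth noting for the fold-splitting plan above:
scikit-learn ships StratifiedGroupKFold, which stratifies by class while
keeping every row that shares a group in the same fold; here the group would
be the source wav (the metadata's original_path column). A minimal sketch,
assuming the metadata.csv produced by create_dataset.py with its 'label' and
'original_path' columns:

    # Sketch only: 5-fold assignment, stratified by label, that never splits
    # clips from the same wav across folds.
    import pandas as pd
    from sklearn.model_selection import StratifiedGroupKFold

    df = pd.read_csv("metadata.csv", index_col=0)
    sgkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=0)
    df["fold"] = -1
    for fold, (_, test_idx) in enumerate(
            sgkf.split(df, df["label"], groups=df["original_path"])):
        df.loc[df.index[test_idx], "fold"] = fold
    df.to_csv("5-fold_metadata.csv")

As with any group-aware split, the per-fold class balance will only be
approximate, since whole wav files move between folds as a unit.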
From 719c432ea0eb4020debc141dab01e124a257a97 Mon Sep 17 00:00:00 2001
From: Katie Garwood
Date: Thu, 24 Apr 2025 16:35:21 -0700
Subject: [PATCH 014/120] fixed error handling for edge case

one of the subset label file names didn't match what the all-labels file said
it was, even though it was the same file. So when the script comes across
that, it searches for the existence of the actual file name of that sub-label
file
---
 create_dataset/filter_labels.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/create_dataset/filter_labels.py b/create_dataset/filter_labels.py
index 368b855..fef3c62 100644
--- a/create_dataset/filter_labels.py
+++ b/create_dataset/filter_labels.py
@@ -79,6 +79,12 @@ def filter_labels_2018(wav, labels):
         # just felt like more steps
         if os.path.isfile(check_path):
             continue
+        elif row['Fled_2018_LS133_SM1.csv '].strip() == 'EarBreed_2018_LS128_SM10A.csv':
+            check_path = os.path.join(path_to_results, 'EarBreed_LS128_SM10A.csv')
+            if os.path.isfile(check_path):
+                continue
+            else:
+                index_to_drop.append(index)
         else:
             index_to_drop.append(index)
     # if there were labels associated with a different wav file that happened to have the same

From ba6dda96f89afe5d4e03cc1021a12a3eae9fa0d9 Mon Sep 17 00:00:00 2001
From: Katie Garwood
Date: Thu, 24 Apr 2025 16:36:58 -0700
Subject: [PATCH 015/120] fix indexing on result csv

---
 create_dataset/create_dataset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/create_dataset/create_dataset.py b/create_dataset/create_dataset.py
index 4d5a772..081e116 100644
--- a/create_dataset/create_dataset.py
+++ b/create_dataset/create_dataset.py
@@ -65,7 +65,7 @@ def create_dataset(labels, wav_dir, output_dir, class_list):
         all_buow_rows = create_noise_segments(wav, new_buow_rows, output_dir)
         # add the annotations to the csv of metadata for the dataset
 
-        all_data = pd.concat([all_data, all_buow_rows])
+        all_data = pd.concat([all_data, all_buow_rows], ignore_index=True)
         print(all_data)

From 35d2e5e83a769acc9d67e17e7ab2c0aa94584b35 Mon Sep 17 00:00:00 2001
From: Katie Garwood
Date: Thu, 24 Apr 2025 16:37:43 -0700
Subject: [PATCH 016/120] fixed some edge case error handling

fixed an out-of-bounds error when creating a random no_buow segment, and
fixed an error print statement that referenced a variable that didn't exist
when the error happened. Also dealt with a None value being passed along by a
previous function, assigning it a value that ends the chain of passing None
---
 create_dataset/create_segments.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/create_dataset/create_segments.py b/create_dataset/create_segments.py
index a895ba0..da88e4f 100644
--- a/create_dataset/create_segments.py
+++ b/create_dataset/create_segments.py
@@ -42,8 +42,7 @@ def create_segments(wav, filtered_labels, out_path, class_list):
     try:
         audio = AudioSegment.from_wav(wav)
     except exceptions.CouldntDecodeError:
-        print(f"Couldn't decode: {audio}, moving to next file")
-        rows_with_none = filtered_labels[filtered_labels['MANUAL ID*'].isnull()]
+        print(f"Couldn't decode: {wav}, moving to next file")
     filtered_labels['MANUAL ID*'] = filtered_labels['MANUAL ID*'].str.lower()
     df_row = 0
     path = ntpath.dirname(wav)
@@ -72,12 +71,16 @@ def create_noise_segments(wav, new_buow_rows, out_path):
     the number of detections per audio file, a buffer length away from all of
     the detections in the file.
""" + if new_buow_rows is None: + print(f"not creating noise segments from {wav} because there were no labels or no associated labels") + all_buow_rows = pd.DataFrame() + return all_buow_rows try: audio = AudioSegment.from_wav(wav) # duration in seconds, cutting off the ms duration = int(len(audio) / 1000) except exceptions.CouldntDecodeError: - print(f"Couldn't decode: {audio}, moving to next file") + print(f"Couldn't decode: {wav}, moving to next file") call_type = "no_buow" num = len(new_buow_rows) * 2 seconds_array = np.zeros(duration) @@ -88,11 +91,12 @@ def create_noise_segments(wav, new_buow_rows, out_path): mask_end = min(len(seconds_array), end + 30 + 1) seconds_array[mask_start:mask_end] = 1 new_sample = num / 2 + print(f"length of seconds array: {len(seconds_array)}") while num > new_sample: - random_index = np.random.choice(len(seconds_array)) + random_index = np.random.choice(len(seconds_array)-3) if seconds_array[random_index] == 0 and seconds_array[random_index + 3] == 0: - start_time = random_index + 1 * 1000 - end_time = random_index + 4 * 1000 + start_time = (random_index + 1) * 1000 + end_time = (random_index + 4) * 1000 segment = audio[start_time:end_time] duration_of_segment = len(segment) / 1000 id = uuid.uuid4() From bea4cf4c31a489dfcf36fd0b5576005d6e2ad8ba Mon Sep 17 00:00:00 2001 From: Katie Garwood Date: Thu, 24 Apr 2025 16:39:27 -0700 Subject: [PATCH 017/120] starting to figure out the problem not really working code but beginning to think about how to keep the groups together while properly creating stratified datasets found a tutorial that deals with the same problem --- create_dataset/strat_k_folds.py | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/create_dataset/strat_k_folds.py b/create_dataset/strat_k_folds.py index 6624148..6790f47 100644 --- a/create_dataset/strat_k_folds.py +++ b/create_dataset/strat_k_folds.py @@ -6,19 +6,35 @@ def create_strat_folds(df): """ """ - obtain complete class distribution for all classes - divide each class by 5, thats how many need to be in each fold - lets say theres 5000 bo buow, 1000 cluck, 500 coocoo, 500 chick begging, 2000 alarm, 1000 twitter - so each fold needs, 1000 no buow, 100 coocoo, 100 chick begging, 400 alarm, 200 twitter + cluck_df = df[df['label'] == 'cluck'] + coocoo_df = df[df['label'] == 'coocoo'] + twitter_df = df[df['label'] == 'twitter'] + alarm_df = df[df['label'] == 'alarm'] + chick_beg_df = df[df['label'] == 'chick_begging'] + no_buow_df = df[df['label'] == 'no_buow'] + # amount needed per fold + print(f"{chick_beg_df}") + len_cluck = len(cluck_df) / 5 + len_coocoo = len(coocoo_df) / 5 + len_twitter = len(twitter_df) / 5 + len_alarm = len(alarm_df) / 5 + len_chick = len(chick_beg_df) / 5 + len_no_buow = len(no_buow_df) / 5 + print(f"len_cluck: {len_cluck} len_coocoo: {len_coocoo} len_twitter: {len_twitter} len_alarm: {len_alarm} len_chick: {len_chick} len_no_buow: {len_no_buow}") - if i create a df that has the wav path, the distribution of calls in a column. i can randomly add different ones + + grouped = df.groupby('original_path') + for index, group in grouped: + print(f"group {index}") + print(group) + '''if i create a df that has the wav path, the distribution of calls in a column. i can randomly add different ones together until i get about that disribution per class. then i can go in and add a column to the df that has the fold it's in. donezo - what combination of these properites will give me the closest distribution to a balanced set? 
+    what combination of these properites will give me the closest distribution to a balanced set?'''
 
 def main(meta):
     """
     """
-    df = pd.read(meta)
+    df = pd.read_csv(meta)
     create_strat_folds(df)
 
From 6fc0ecafecf4d688febc5e05c1ce45ba562b286 Mon Sep 17 00:00:00 2001
From: Katie Garwood
Date: Fri, 25 Apr 2025 15:17:53 -0700
Subject: [PATCH 018/120] optimized stratified group k-fold splitter

using someone else's optimized group-splitting implementation. Running
strat_k_folds.py uses these functions when given the metadata file that's
generated after detection and noise segment creation. It currently doesn't
append the fold number to the metadata file, but it takes the class
distributions of each 'group' (in our case, groups are at the wav file level)
and then calculates the optimal split of the groups to obtain, for each fold,
the class distribution closest to the original class distribution. It outputs
an array with the fold number for each group, as well as some other metrics
like the actual class distribution for each fold, because the split won't be
perfect given that each group is unbalanced.
---
 create_dataset/k_fold_split_copy.py | 182 ++++++++++++++++++++++
 create_dataset/strat_k_folds.py     |  51 ++++----
 2 files changed, 211 insertions(+), 22 deletions(-)
 create mode 100644 create_dataset/k_fold_split_copy.py

diff --git a/create_dataset/k_fold_split_copy.py b/create_dataset/k_fold_split_copy.py
new file mode 100644
index 0000000..2296ab6
--- /dev/null
+++ b/create_dataset/k_fold_split_copy.py
@@ -0,0 +1,182 @@
+"""Optimizing k-fold splits with groups.
+
+Downloaded and modified from https://github.com/joaofig/strat-group-split/tree/main
+"""
+
+import numpy as np
+
+from numpy.random import default_rng
+from numba import njit
+from typing import Set, Tuple
+
+
+def generate_problem(num_groups: int,
+                     num_classes: int,
+                     min_group_size: int,
+                     max_group_size: int,
+                     class_percent: np.array) -> np.ndarray:
+
+    problem = np.zeros((num_groups, num_classes), dtype=int)
+
+    rng = default_rng()
+    group_sizes = rng.integers(low=min_group_size,
+                               high=max_group_size,
+                               size=num_groups)
+
+    for i in range(num_groups):
+        # Calculate the
+        proportions = np.random.normal(class_percent, class_percent / 10)
+
+        problem[i, :] = proportions * group_sizes[i]
+    return problem
+
+
+@njit
+def calculate_cost(problem: np.ndarray,
+                   solution: np.ndarray,
+                   k: int) -> float:
+    cost = 0.0
+    total = np.sum(problem)
+    class_sums = np.sum(problem, axis=0)
+    num_classes = problem.shape[1]
+
+    for i in range(k):
+        idx = solution == i
+        fold_sum = np.sum(problem[idx, :])
+
+        # Start by calculating the fold imbalance cost
+        cost += (fold_sum / total - 1.0 / k) ** 2
+
+        # Now calculate the cost associated with the class imbalances
+        # Katie: had to add division by 0 error for if fold_sums equal 0, during testing with subset
+        # there were no chick begging calls so this row was 0
+        for j in range(num_classes):
+            if fold_sum == 0:
+                cost += (0 - class_sums[j] / total) ** 2
+            else:
+                cost += (np.sum(problem[idx, j]) / fold_sum - class_sums[j] / total) ** 2
+    return cost
+
+
+@njit
+def generate_search_space(problem: np.ndarray,
+                          solution: np.ndarray,
+                          k: int) -> np.ndarray:
+    num_groups = problem.shape[0]
+
+    space = np.zeros((num_groups, k))
+    sol = solution.copy()
+
+    for i in range(num_groups):
+        for j in range(k):
+            if solution[i] == j:
+                space[i,j] = np.inf
+            else:
+                sol[i] = j
+                space[i, j] = calculate_cost(problem, sol, k)
+                sol[i] = solution[i]
+    return space
+
+
+@njit
+def solution_to_str(solution: np.ndarray) ->
str: + return "".join([str(n) for n in solution]) + + +def generate_initial_solution(problem: np.ndarray, + k: int, + algo: str="k-bound") -> np.ndarray: + num_groups = problem.shape[0] + if algo == "k-bound": + rng = default_rng() + total = np.sum(problem) + indices = rng.permutation(problem.shape[0]) + + solution = np.zeros(num_groups, dtype=int) + c = 0 + fold_total = 0 + for i in indices: + group = np.sum(problem[i, :]) + if fold_total + group < total / k: + fold_total += group + else: + c = (c + 1) % k + fold_total = group + solution[i] = c + elif algo == "random": + rng = default_rng() + solution = rng.integers(low=0, high=k, size=num_groups) + elif algo == "zeros": + solution = np.zeros(num_groups, dtype=int) + else: + raise Exception("Invalid algorithm name") + return solution + + +def solve(problem: np.ndarray, + k=5, + min_cost=1e-5, + max_retry=100, + verbose=False) -> np.ndarray: + hist = set() + retry = 0 + + solution = generate_initial_solution(problem, k) + incumbent = solution.copy() + low_cost = calculate_cost(problem, solution, k) + cost = 1.0 + while retry < max_retry and cost > min_cost: + decision = generate_search_space(problem, solution, k=5) + grp, cls = select_move(decision, solution, hist) + + if grp != -1: + solution[grp] = cls + cost = calculate_cost(problem, solution, k=5) + if cost < low_cost: + low_cost = cost + incumbent = solution.copy() + retry = 0 + if verbose: + print(cost) + else: + retry += 1 + hist.add(solution_to_str(solution)) + return incumbent + + +def select_move(decision: np.ndarray, + solution: np.ndarray, + history: Set) -> Tuple: + candidates = np.argsort(decision, axis=None) + + for c in candidates: + p = np.unravel_index(c, decision.shape) + s = solution.copy() + s[p[0]] = p[1] + sol_str = solution_to_str(s) + + if sol_str not in history: + return p + return -1, -1 # No move found! 
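One quirk worth flagging in this vendored copy: inside its search loop,
solve() hardcodes k=5 in the calls to generate_search_space and
calculate_cost instead of using its k parameter, so this version should only
be asked for five folds (which is how strat_k_folds.py uses it). A toy run
with made-up counts might look like this:

    import numpy as np
    from k_fold_split_copy import solve

    # 6 wav-file groups x 2 classes; entry [i, j] counts the class-j segments
    # in group i. solve() returns one fold index per group.
    problem = np.array([[8, 2], [7, 3], [2, 8], [3, 7], [5, 5], [6, 4]])
    solution = solve(problem, k=5)  # keep k=5; see the note above
    print(solution)                 # e.g. [0 3 1 4 2 0]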
+ + +def main(): + problem = generate_problem(num_groups=500, + num_classes=4, + min_group_size=400, + max_group_size=2000, + class_percent=np.array([0.4, 0.3, 0.2, 0.1])) + solution = solve(problem, k=5, verbose=True) + + print(np.sum(problem, axis=0) / np.sum(problem)) + print() + + folds = [problem[solution == i] for i in range(5)] + fold_percents = np.array([np.sum(folds[i], axis=0) / np.sum(folds[i]) for i in range(5)]) + print(fold_percents) + print() + print([np.sum(folds[i]) / np.sum(problem) for i in range(5)]) + + +if __name__ == "__main__": + main() diff --git a/create_dataset/strat_k_folds.py b/create_dataset/strat_k_folds.py index 6790f47..9465b12 100644 --- a/create_dataset/strat_k_folds.py +++ b/create_dataset/strat_k_folds.py @@ -1,35 +1,42 @@ """ """ +from k_fold_split_copy import calculate_cost, generate_search_space +from k_fold_split_copy import solution_to_str, generate_initial_solution +from k_fold_split_copy import solve, select_move import pandas as pd import argparse +import numpy as np + def create_strat_folds(df): """ """ - cluck_df = df[df['label'] == 'cluck'] - coocoo_df = df[df['label'] == 'coocoo'] - twitter_df = df[df['label'] == 'twitter'] - alarm_df = df[df['label'] == 'alarm'] - chick_beg_df = df[df['label'] == 'chick_begging'] - no_buow_df = df[df['label'] == 'no_buow'] - # amount needed per fold - print(f"{chick_beg_df}") - len_cluck = len(cluck_df) / 5 - len_coocoo = len(coocoo_df) / 5 - len_twitter = len(twitter_df) / 5 - len_alarm = len(alarm_df) / 5 - len_chick = len(chick_beg_df) / 5 - len_no_buow = len(no_buow_df) / 5 - print(f"len_cluck: {len_cluck} len_coocoo: {len_coocoo} len_twitter: {len_twitter} len_alarm: {len_alarm} len_chick: {len_chick} len_no_buow: {len_no_buow}") - - + num_classes = 6 + df['label'] = df['label'].replace('cluck', 0) + df['label'] = df['label'].replace('coocoo', 1) + df['label'] = df['label'].replace('twitter', 2) + df['label'] = df['label'].replace('alarm', 3) + df['label'] = df['label'].replace('chick_begging', 4) + df['label'] = df['label'].replace('no_buow', 5) + # group is the subset of the index which is the wav file they all come from grouped = df.groupby('original_path') + group_names = [] + group_matrix = [] for index, group in grouped: - print(f"group {index}") - print(group) - '''if i create a df that has the wav path, the distribution of calls in a column. i can randomly add different ones - together until i get about that disribution per class. then i can go in and add a column to the df that has the fold it's in. donezo - what combination of these properites will give me the closest distribution to a balanced set?''' + counts = np.zeros(num_classes, dtype=int) + label_counts = group['label'].value_counts() + for label, count in label_counts.items(): + counts[int(label)] = count + group_matrix.append(counts) + group_names.append(index) + problem = np.array(group_matrix) + print(problem) + solution = solve(problem, k=5, verbose=True) + print(f"solution {solution}") + print(np.sum(problem, axis=0) / np.sum(problem)) + folds = [problem[solution == i] for i in range(5)] + fold_percents = np.array([np.sum(folds[i], axis=0) / np.sum(folds[i]) for i in range(5)]) + print(folds) def main(meta): """ From 16124e5091e13cf941e4d0f42cfdcc44ad8974a8 Mon Sep 17 00:00:00 2001 From: Katie Garwood Date: Fri, 25 Apr 2025 16:48:57 -0700 Subject: [PATCH 019/120] create new metadata csv with the fold number in a column this now can add the fold number assignment to the metadata but it does create a new csv. 
It could probably append to the old one, but this is what it does for now.
There is something funky happening with the indexing, but that is a known
issue with the creation of metadata.csv which would be dealt with in the
create_dataset pipeline, and is not the fault of this script. It also should
create a results file with the metrics of the actual class distributions per
fold, so they can go in the README for the dataset.
---
 create_dataset/strat_k_folds.py | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/create_dataset/strat_k_folds.py b/create_dataset/strat_k_folds.py
index 9465b12..ff86f2c 100644
--- a/create_dataset/strat_k_folds.py
+++ b/create_dataset/strat_k_folds.py
@@ -12,6 +12,7 @@ def create_strat_folds(df):
     """
     """
     num_classes = 6
+    original_df = df
     df['label'] = df['label'].replace('cluck', 0)
     df['label'] = df['label'].replace('coocoo', 1)
     df['label'] = df['label'].replace('twitter', 2)
@@ -29,6 +30,7 @@ def create_strat_folds(df):
             counts[int(label)] = count
         group_matrix.append(counts)
         group_names.append(index)
+    print(group_names)
     problem = np.array(group_matrix)
     print(problem)
     solution = solve(problem, k=5, verbose=True)
@@ -36,13 +38,22 @@ def create_strat_folds(df):
     print(np.sum(problem, axis=0) / np.sum(problem))
     folds = [problem[solution == i] for i in range(5)]
     fold_percents = np.array([np.sum(folds[i], axis=0) / np.sum(folds[i]) for i in range(5)])
-    print(folds)
+    print(folds)
+    grouped = original_df.groupby('original_path')
+    df_with_folds = pd.DataFrame()
+    count = 0
+    for i, group in grouped:
+        group['fold'] = solution[count]
+        df_with_folds = pd.concat([df_with_folds, group], ignore_index=True)
+        count += 1
+    return df_with_folds
 
 def main(meta):
     """
     """
     df = pd.read_csv(meta)
-    create_strat_folds(df)
+    df_with_folds = create_strat_folds(df)
+    df_with_folds.to_csv("5-fold_metadata.csv")
 
 
 if __name__=="__main__":
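With the fold column written out, downstream training code can hold out one
fold at a time. A short sketch of how the 5-fold metadata might be consumed
(file name as the script writes it at this point in the history):

    import pandas as pd

    df = pd.read_csv("5-fold_metadata.csv", index_col=0)
    for k in range(5):
        train = df[df["fold"] != k]
        val = df[df["fold"] == k]
        # fit on `train`, evaluate on `val`
        print(f"fold {k}: {len(train)} train / {len(val)} val segments")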
From a99761899c4204f1efd880efd7c335839e8aa99f Mon Sep 17 00:00:00 2001
From: Katie Garwood
Date: Mon, 28 Apr 2025 15:02:19 -0700
Subject: [PATCH 020/120] fixed indexing issue and added print statements for
 debug

there was a weird indexing issue that caused an extra row of indexes in the
final metadata file, which has now been fixed, and some helpful print
statements were added during the debug stage to help catch issues
---
 create_dataset/create_dataset.py | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/create_dataset/create_dataset.py b/create_dataset/create_dataset.py
index 081e116..b54b26d 100644
--- a/create_dataset/create_dataset.py
+++ b/create_dataset/create_dataset.py
@@ -40,7 +40,7 @@ def create_dataset(labels, wav_dir, output_dir, class_list):
     out_file = ntpath.dirname(output_dir)
     result_file = os.path.join(out_file, "metadata.csv")
     if os.path.exists(result_file):
-        all_data = pd.read_csv(result_file)
+        all_data = pd.read_csv(result_file, index_col=0)
     else:
         all_data = pd.DataFrame()
     # walk dir to list paths to each original wav file
@@ -52,6 +52,8 @@ def create_dataset(labels, wav_dir, output_dir, class_list):
         use_2017 = True
     elif "2018" in labels['DATE'].iloc[0]:
         use_2017 = False
+    wav_files = []
+    num_samples = []
     for wav in wav_file_paths:
         # check which label format to select parsing method
         # create dataframe of only the labels that correspond to the wav
@@ -64,12 +66,20 @@ def create_dataset(labels, wav_dir, output_dir, class_list):
         # create same number of noise segments from the same wav file randomly
         all_buow_rows = create_noise_segments(wav, new_buow_rows, output_dir)
         # add the annotations to the csv of metadata for the dataset
-
+        if not all_buow_rows.empty:
+            wavv = str(wav)
+            wav_files.append(wavv)
+            num_samples.append(len(all_buow_rows))
         all_data = pd.concat([all_data, all_buow_rows], ignore_index=True)
+        print("printing concated data")
         print(all_data)
-        print(f"Added {len(all_buow_rows)} new segments from {wav}")
+    all_data.index = all_data.index.astype(int)
     all_data.to_csv(result_file)
+    intt = 0
+    for wavs in wav_files:
+        print(f"{wavs} had {num_samples[intt]} including noise segments")
+        intt +=1
     print(f"Created results: {result_file}")

From 37f72b85b416255f2a90faa06b51e7a8eea16967 Mon Sep 17 00:00:00 2001
From: Katie Garwood
Date: Mon, 28 Apr 2025 15:03:57 -0700
Subject: [PATCH 021/120] added error handling and proper parsing of class
 list

the 2017 data, when passed through, was missing the chick begging detections
because the class list was being parsed wrong. There were also some wav files
that made it through the checks as legitimate but were not actually labeled,
so that had to be handled
---
 create_dataset/create_segments.py | 20 ++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/create_dataset/create_segments.py b/create_dataset/create_segments.py
index da88e4f..b08ecab 100644
--- a/create_dataset/create_segments.py
+++ b/create_dataset/create_segments.py
@@ -12,6 +12,8 @@ import numpy as np
 import random
 import ntpath
+import csv
+
 
 def setup_logger(level, filename=None):
     """
@@ -32,22 +34,28 @@ def get_paths(home_dir):
 def create_segments(wav, filtered_labels, out_path, class_list):
     """
     """
+    print(f"creating segments for {wav}")
     if filtered_labels is None:
         print(f"skipping segment creation for {wav} because it does not have labels or is not a file of interest")
         return None
+    if filtered_labels.empty:
+        print(f"filtered labels is an empty dataframe, meaning either the sound file was not labeled or has no detections")
+        return None
     output_rows = pd.DataFrame(columns=['segment', 'label', 'segment_path', 'original_path', 'segment_duration_s', 'segment_rel_start_ms'])
-    with open(class_list, 'r') as file:
-        classes = file.read()
-        class_list = classes.split(',')
+    with open(class_list, 'r', newline='') as file:
+        reader = csv.reader(file)
+        classes = next(reader)
+        print(classes)
     try:
         audio = AudioSegment.from_wav(wav)
     except exceptions.CouldntDecodeError:
         print(f"Couldn't decode: {wav}, moving to next file")
     filtered_labels['MANUAL ID*'] = filtered_labels['MANUAL ID*'].str.lower()
+    print(filtered_labels)
     df_row = 0
     path = ntpath.dirname(wav)
     for index, row in filtered_labels.iterrows():
-        for call_type in class_list:
+        for call_type in classes:
             if row['MANUAL ID*'] == call_type:
                 start_time = float(row['OFFSET'])
                 end_time = (start_time + float(row['DURATION']))
@@ -60,7 +68,8 @@ def create_segments(wav, filtered_labels, out_path, class_list):
                 segment.export(segment_path, format='wav')
                 output_rows.loc[df_row] = [id, call_type, segment_path, wav, float(row['DURATION']), start_time]
                 df_row += 1
-                print(f"Created segment {segment_path}")
+            else:
+                continue
     return output_rows
 
 # def create_birdnet_segments(wav, out_path, birdnet_class_list=None):
@@ -91,7 +100,6 @@ def create_noise_segments(wav, new_buow_rows, out_path):
         mask_end = min(len(seconds_array), end + 30 + 1)
         seconds_array[mask_start:mask_end] = 1
     new_sample = num / 2
-    print(f"length of seconds array: {len(seconds_array)}")
     while num > new_sample:
         random_index = np.random.choice(len(seconds_array)-3)
         if seconds_array[random_index] == 0 and
seconds_array[random_index + 3] == 0: From 3018c15a3ffc4e9670b37c5e5a810294a0efd15f Mon Sep 17 00:00:00 2001 From: Katie Garwood Date: Mon, 28 Apr 2025 15:06:59 -0700 Subject: [PATCH 022/120] remove extra blank line --- create_dataset/filter_labels.py | 1 - 1 file changed, 1 deletion(-) diff --git a/create_dataset/filter_labels.py b/create_dataset/filter_labels.py index fef3c62..1874132 100644 --- a/create_dataset/filter_labels.py +++ b/create_dataset/filter_labels.py @@ -90,7 +90,6 @@ def filter_labels_2018(wav, labels): # if there were labels associated with a different wav file that happened to have the same # name, this will drop the labels associated with a different burrow/site filtered_labels = filtered_labels.drop(index_to_drop) - return filtered_labels From 4d35019fb9b2162693bd5fc16b2010a270601c8e Mon Sep 17 00:00:00 2001 From: Katie Garwood Date: Mon, 28 Apr 2025 15:07:17 -0700 Subject: [PATCH 023/120] added proper parsing of indexes in metadata csv --- create_dataset/strat_k_folds.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/create_dataset/strat_k_folds.py b/create_dataset/strat_k_folds.py index ff86f2c..bbd7e8a 100644 --- a/create_dataset/strat_k_folds.py +++ b/create_dataset/strat_k_folds.py @@ -51,7 +51,7 @@ def create_strat_folds(df): def main(meta): """ """ - df = pd.read_csv(meta) + df = pd.read_csv(meta, index_col=0) df_with_folds = create_strat_folds(df) df_with_folds.to_csv("5-fold_metadata.csv") From 7397fadbfbadba167ee382e94df4bec44524087b Mon Sep 17 00:00:00 2001 From: Katie Garwood Date: Mon, 28 Apr 2025 16:07:40 -0700 Subject: [PATCH 024/120] fix error of not dropping indexes --- create_dataset/filter_labels.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/create_dataset/filter_labels.py b/create_dataset/filter_labels.py index 1874132..08891d9 100644 --- a/create_dataset/filter_labels.py +++ b/create_dataset/filter_labels.py @@ -24,8 +24,7 @@ def filter_labels_2017(wav, labels): print(f"{site} is not in {wav}") index_drop.append(index) - filtered_labels.drop(index_drop) - + filtered_labels = filtered_labels.drop(index_drop) return filtered_labels def filter_labels_2018(wav, labels): From 1be173e2c32d56bf3e72ddfe37bfb57b01f28c50 Mon Sep 17 00:00:00 2001 From: Katie Garwood Date: Mon, 28 Apr 2025 16:08:17 -0700 Subject: [PATCH 025/120] handling if a wav file for a detection is less than 3s there was one labeled wav file that was literally 1s long and 2/3 of the second was a labeled detection. it was legit and not from another sound file. i decided to have it keep this detection and generate a segment but it cannot generate a complementary no_buow segment. 
we can change this but it seemed better to just keep the extra detection, unless it starts happening all the time (it wont) --- create_dataset/create_segments.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/create_dataset/create_segments.py b/create_dataset/create_segments.py index b08ecab..889f95f 100644 --- a/create_dataset/create_segments.py +++ b/create_dataset/create_segments.py @@ -101,7 +101,11 @@ def create_noise_segments(wav, new_buow_rows, out_path): seconds_array[mask_start:mask_end] = 1 new_sample = num / 2 while num > new_sample: - random_index = np.random.choice(len(seconds_array)-3) + try: + random_index = np.random.choice(len(seconds_array)-3) + except: + print(f"{wav} is not long enough to generate no_buow sounds, keeping the detection segment but adding no no_buow") + return new_buow_rows if seconds_array[random_index] == 0 and seconds_array[random_index + 3] == 0: start_time = (random_index + 1) * 1000 end_time = (random_index + 4) * 1000 From ecd18b424570616bd29e00f64721713c601684bc Mon Sep 17 00:00:00 2001 From: Sumega Mandadi Date: Tue, 29 Apr 2025 17:26:07 -0700 Subject: [PATCH 026/120] Add files to create perch embeddings --- make_model/make_perch_embeddings.py | 122 +++++++++++++++++++++++++++ make_model/make_perch_svm_dataset.py | 109 ++++++++++++++++++++++++ 2 files changed, 231 insertions(+) create mode 100644 make_model/make_perch_embeddings.py create mode 100644 make_model/make_perch_svm_dataset.py diff --git a/make_model/make_perch_embeddings.py b/make_model/make_perch_embeddings.py new file mode 100644 index 0000000..02b634b --- /dev/null +++ b/make_model/make_perch_embeddings.py @@ -0,0 +1,122 @@ +''' +Create Perch Embeddings Script + +This script processes a directory of audio chunks (.wav files), +creates perch embeddings, and stores the results as a sqlite database + +Usage: + python make_perch_embeddings.py dataset_name path/to/directory/of/wavs + path/to/desired/output/dir + +Outputs: + hoplite.sqlite + usearch.index + +Note: + this code requires: + python, version 3.10+ + numpy, version 1.2+ + tensorflow, version 2+ +''' + +import argparse +from etils import epath + +from perch_hoplite.agile import colab_utils +from perch_hoplite.agile import embed +from perch_hoplite.agile import source_info +from perch_hoplite.db import interface + +def create_embeddings(dataset_name, wavs, output): + ''' + creates perch embeddings + + Args: + dataset_name (str): name of dataset being embedded + wavs (str): path to directory containing .wav audio segments + output (str): path to directory for output files (SQLite DB) + + Returns: + None + ''' + + dataset_base_path = wavs + dataset_fileglob = '*.wav' + db_path = output + model_choice = 'perch_8' + + use_file_sharding = True + + audio_glob = source_info.AudioSourceConfig( + dataset_name=dataset_name, + base_path=dataset_base_path, + file_glob=dataset_fileglob, + min_audio_len_s=1.0, + target_sample_rate_hz=-2, + shard_len_s=60.0 if use_file_sharding else None, + ) + + configs = colab_utils.load_configs( + source_info.AudioSources((audio_glob,)), + db_path, + model_config_key=model_choice, + db_key='sqlite_usearch') + + # Initialize DB + db = configs.db_config.load_db() + num_embeddings = db.count_embeddings() + print('Initialized DB located at ', configs.db_config.db_config.db_path) + + def drop_and_reload_db() -> interface.HopliteDBInterface: + db_path = epath.Path(configs.db_config.db_config.db_path) + for fp in db_path.glob('hoplite.sqlite*'): + fp.unlink() + (db_path / 
'usearch.index').unlink() + print('\n Deleted previous db at: ', + configs.db_config.db_config.db_path) + + if num_embeddings > 0: + print('Existing DB contains datasets: ', db.get_dataset_names()) + print('num embeddings: ', num_embeddings) + print(f'This will permanently delete all {num_embeddings} ' + 'embeddings from the existing database.\n') + drop_and_reload_db() + + # Run embedding + print(f'Embedding dataset: {audio_glob.dataset_name}') + + worker = embed.EmbedWorker( + audio_sources=configs.audio_sources_config, + db=db, + model_config=configs.model_config) + + worker.process_all(target_dataset_name=audio_glob.dataset_name) + + print('\n\nEmbedding complete! \nTotal embeddings: ', db.count_embeddings()) + print(f'Embeddings dataset saved at: \n ' + f'\t{output}/hoplite.sqlite \n ' + f'\t{output}/usearch.index') + +def main(dataset_name, wavs, output): + ''' + run main script + ''' + + create_embeddings(dataset_name, wavs, output) + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser( + description='Input Directory Paths' + ) + parser.add_argument('dataset_name', type=str, + help='Name of dataset to embed') + parser.add_argument('wavs', type=str, + help='Path to labeled audio chunks. ' + 'All .wav files will be embedded') + parser.add_argument('output', type=str, + help='Path to desired directory for output database') + args = parser.parse_args() + + main(args.dataset_name, args.wavs, args.output) diff --git a/make_model/make_perch_svm_dataset.py b/make_model/make_perch_svm_dataset.py new file mode 100644 index 0000000..27afd7e --- /dev/null +++ b/make_model/make_perch_svm_dataset.py @@ -0,0 +1,109 @@ +''' +Convert Perch Embedding Output to usabale .csvs + +This script processes the outputs of the perch embedding +scripts and converst to a usable .csv file of all embeddings +with labels to use for training a binary SVM classifier + +Usage: python make_perch_svm_dataset.py /path/to/db/dir label + +Arguments: + database_directory (str): path to directory that contains + hoplite.sqlite & usearch.index + label (str): label for all embeddings in hoplite.sqlite + +Outputs: + label_embeddings_forSVM.csv + +''' + + +import argparse +import sqlite3 +import pandas as pd +import sys +import os +from perch_hoplite.db import sqlite_usearch_impl + + +def split_base_segment(filename): + '''separates source file base and segment # + example: audio_segment_3.wav -> audio, 3 + + Args: + filename: segment file name + + Returns: + source file base name & segment # + ''' + + base, _ = os.path.splitext(filename) + return base.split('_segment_') + +def get_start_stop(seg_id): + ''' calculates start stop timestamp in s + from segment number + + Args: + seg_id (int): segment number of audio chunk + + Returns: + start s, stop s + + ''' + seg_id = int(seg_id) + return (seg_id*3, (seg_id+1)*3) + + +def main(sqlite_dir, label, output_dir): + ''' + runs main script + ''' + + # load database + db = sqlite_usearch_impl.SQLiteUsearchDB.create(sqlite_dir) + + master_data = [] + + n_embeddings = db.count_embeddings() + + for i in range(n_embeddings): + + file_name = db.get_embedding_source(i+1).source_id + base_name, segment_id = split_base_segment(file_name) + start, stop = get_start_stop(segment_id) + base_dict = {'start': start, + 'stop': stop, + 'label': label} + + embedding = db.get_embedding(i+1) + embedding_dict = {f'feature_{j}': val for j, val in enumerate(embedding)} + + full_row = {**base_dict, **embedding_dict} + + master_data.append(full_row) + + + master_df = pd.DataFrame(master_data) + 
csv_filename = f'{output_dir}/{label}_embeddings_forSVM.csv'
+    master_df.to_csv(csv_filename)
+
+    print('Complete!')
+    print(f'Saved at:\n\t{csv_filename}')
+
+
+if __name__ == '__main__':
+
+    parser = argparse.ArgumentParser(
+        description='Input SQLite Direcotry and Label')
+
+    parser.add_argument('sqlite_dir', type=str,
+                        help='Path to directory that contains '
+                             'hoplite.sqlite and usearch.index')
+    parser.add_argument('label', type=str,
+                        help='Label for all embeddings in given db')
+    parser.add_argument('output_dir', type=str,
+                        help='Directory for output file')
+
+    args = parser.parse_args()
+    main(args.sqlite_dir, args.label, args.output_dir)
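The per-label CSVs this script writes are meant to feed a binary SVM, as its
docstring says. A minimal sketch of that next step, assuming one CSV of
owl-call embeddings and one of background noise (both file names
illustrative; each file holds 'start', 'stop', 'label', and the 'feature_*'
columns written above):

    import pandas as pd
    from sklearn.model_selection import train_test_split
    from sklearn.svm import SVC

    df = pd.concat([pd.read_csv("buow_embeddings_forSVM.csv", index_col=0),
                    pd.read_csv("no_buow_embeddings_forSVM.csv", index_col=0)])
    X = df.filter(like="feature_")
    y = df["label"]
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=0)
    clf = SVC(kernel="rbf", class_weight="balanced").fit(X_tr, y_tr)
    print("held-out accuracy:", clf.score(X_te, y_te))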
From 2051600679a0ef164f39cf65b4b6021b50721638 Mon Sep 17 00:00:00 2001
From: Katie Garwood
Date: Mon, 12 May 2025 13:34:22 -0700
Subject: [PATCH 027/120] working version of code

fixed the chick begging underscore error and added some clearer variable
names
---
 create_dataset/strat_k_folds.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/create_dataset/strat_k_folds.py b/create_dataset/strat_k_folds.py
index bbd7e8a..2ec36b7 100644
--- a/create_dataset/strat_k_folds.py
+++ b/create_dataset/strat_k_folds.py
@@ -17,7 +17,7 @@ def create_strat_folds(df):
     df['label'] = df['label'].replace('coocoo', 1)
     df['label'] = df['label'].replace('twitter', 2)
     df['label'] = df['label'].replace('alarm', 3)
-    df['label'] = df['label'].replace('chick_begging', 4)
+    df['label'] = df['label'].replace('chick begging', 4)
     df['label'] = df['label'].replace('no_buow', 5)
     # group is the subset of the index which is the wav file they all come from
     grouped = df.groupby('original_path')
@@ -39,10 +39,10 @@ def create_strat_folds(df):
     folds = [problem[solution == i] for i in range(5)]
     fold_percents = np.array([np.sum(folds[i], axis=0) / np.sum(folds[i]) for i in range(5)])
     print(folds)
-    grouped = original_df.groupby('original_path')
+    grouped_original = original_df.groupby('original_path')
     df_with_folds = pd.DataFrame()
     count = 0
-    for i, group in grouped:
+    for i, group in grouped_original:
         group['fold'] = solution[count]
         df_with_folds = pd.concat([df_with_folds, group], ignore_index=True)
         count += 1
@@ -53,7 +53,7 @@ def main(meta):
     """
     df = pd.read_csv(meta, index_col=0)
     df_with_folds = create_strat_folds(df)
-    df_with_folds.to_csv("5-fold_metadata.csv")
+    df_with_folds.to_csv("5-fold_meta.csv")

From d19829d477ffd48cd7bc365338e560c225ead2cf Mon Sep 17 00:00:00 2001
From: Sumega Mandadi
Date: Thu, 15 May 2025 14:05:58 -0700
Subject: [PATCH 028/120] Change output dataframe format for standardization

---
 ...dataset.py => prepare_perch_embeddings.py} | 53 ++++---------------
 1 file changed, 10 insertions(+), 43 deletions(-)
 rename make_model/{make_perch_svm_dataset.py => prepare_perch_embeddings.py} (54%)

diff --git a/make_model/make_perch_svm_dataset.py b/make_model/prepare_perch_embeddings.py
similarity index 54%
rename from make_model/make_perch_svm_dataset.py
rename to make_model/prepare_perch_embeddings.py
index 27afd7e..fdb0ac3 100644
--- a/make_model/make_perch_svm_dataset.py
+++ b/make_model/prepare_perch_embeddings.py
@@ -26,36 +26,7 @@ from perch_hoplite.db import sqlite_usearch_impl
 
-def split_base_segment(filename):
-    '''separates source file base and segment #
-    example: audio_segment_3.wav -> audio, 3
-
-    Args:
-        filename: segment file name
-
-    Returns:
-        source file base name & segment #
-    '''
-
-    base, _ = os.path.splitext(filename)
-    return base.split('_segment_')
-
-def get_start_stop(seg_id):
-    ''' calculates start stop timestamp in s
-    from segment number
-
-    Args:
-        seg_id (int): segment number of audio chunk
-
-    Returns:
-        start s, stop s
-
-    '''
-    seg_id = int(seg_id)
-    return (seg_id*3, (seg_id+1)*3)
-
-
-def main(sqlite_dir, label, output_dir):
+def main(sqlite_dir, output_dir, embeddings_description):
     '''
     runs main script
     '''
 
-    # load database
+    # load embeddings database
     db = sqlite_usearch_impl.SQLiteUsearchDB.create(sqlite_dir)
 
     master_data = []
 
     n_embeddings = db.count_embeddings()
 
     for i in range(n_embeddings):
 
         file_name = db.get_embedding_source(i+1).source_id
-        base_name, segment_id = split_base_segment(file_name)
-        start, stop = get_start_stop(segment_id)
-        base_dict = {'start': start,
-                     'stop': stop,
-                     'label': label}
+
+        base_dict = {'filename': file_name}
 
         embedding = db.get_embedding(i+1)
-        embedding_dict = {f'feature_{j}': val for j, val in enumerate(embedding)}
+        embedding_dict = {f'{j}': val for j, val in enumerate(embedding)}
 
         full_row = {**base_dict, **embedding_dict}
 
         master_data.append(full_row)
 
 
     master_df = pd.DataFrame(master_data)
-    csv_filename = f'{output_dir}/{label}_embeddings_forSVM.csv'
-    master_df.to_csv(csv_filename)
+    csv_filename = os.path.join(output_dir, f'{embeddings_description}_perch_embeddings.csv')
+    master_df.to_csv(csv_filename, index=False)
 
     print('Complete!')
     print(f'Saved at:\n\t{csv_filename}')
@@ -95,15 +62,15 @@ def main(sqlite_dir, output_dir, embeddings_description):
 if __name__ == '__main__':
 
     parser = argparse.ArgumentParser(
-        description='Input SQLite Direcotry and Label')
+        description='Input Perch Embeddings sqlite database and output directory')
 
     parser.add_argument('sqlite_dir', type=str,
                         help='Path to directory that contains '
                              'hoplite.sqlite and usearch.index')
-    parser.add_argument('label', type=str,
-                        help='Label for all embeddings in given db')
     parser.add_argument('output_dir', type=str,
                         help='Directory for output file')
+    parser.add_argument('embeddings_description', type=str,
+                        help='Name of embeddings group')
 
     args = parser.parse_args()
-    main(args.sqlite_dir, args.label, args.output_dir)
+    main(args.sqlite_dir, args.output_dir, args.embeddings_description)

From 887df67b3a354136a7364636667f65aa8f496998 Mon Sep 17 00:00:00 2001
From: Sumega Mandadi
Date: Thu, 15 May 2025 14:11:18 -0700
Subject: [PATCH 029/120] pylint and update doc strings

---
 make_model/prepare_perch_embeddings.py | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/make_model/prepare_perch_embeddings.py b/make_model/prepare_perch_embeddings.py
index fdb0ac3..24845b2 100644
--- a/make_model/prepare_perch_embeddings.py
+++ b/make_model/prepare_perch_embeddings.py
@@ -1,28 +1,31 @@
 '''
-Convert Perch Embedding Output to usabale .csvs
+Convert Perch Embedding Output to standard embeddings .csv
+for easy training of various models
 
 This script processes the outputs of the perch embedding
-scripts and converst to a usable .csv file of all embeddings
-with labels to use for training a binary SVM classifier
+scripts and converst to a usable .csv file that stores
+filename and embedding
 
-Usage: python make_perch_svm_dataset.py /path/to/db/dir label
+Usage: python prepare_perch_embeddings \
+    /path/to/db/dir \
+    /path/to/output/dir \
+    embeddings_description
 
 Arguments:
     database_directory (str): path to directory that contains
                               hoplite.sqlite & usearch.index
-    label (str): label for all embeddings in hoplite.sqlite
+    outout_directory (str): path to directory to store output
+                            csv
 
 Outputs:
-    label_embeddings_forSVM.csv
+    _perch_embeddings.csv
 
 '''
 
+import os
 import argparse
-import sqlite3
 import pandas as pd
-import sys
-import os
 from
perch_hoplite.db import sqlite_usearch_impl @@ -62,7 +65,7 @@ def main(sqlite_dir, output_dir, embeddings_description): if __name__ == '__main__': parser = argparse.ArgumentParser( - description='Input Perch Embeddings sqlite database and output directory') + description='Input Perch Embeddings sqlite database and output directory') parser.add_argument('sqlite_dir', type=str, help='Path to directory that contains ' From 5f438f233e0915aed9c2b71733d1ddd404e80c97 Mon Sep 17 00:00:00 2001 From: Katie Garwood Date: Thu, 15 May 2025 14:32:25 -0700 Subject: [PATCH 030/120] fixing flake8 and pylint --- create_dataset/create_dataset.py | 99 +++++++++++++++++++++----------- 1 file changed, 65 insertions(+), 34 deletions(-) diff --git a/create_dataset/create_dataset.py b/create_dataset/create_dataset.py index b54b26d..25440cf 100644 --- a/create_dataset/create_dataset.py +++ b/create_dataset/create_dataset.py @@ -13,30 +13,44 @@ to pad the labeled detections if they need consistent length segments. Usage: -/ python3 create_dataset.py -labels /path/to/human/labeled.csv - -wav_dir /path/to/parent/dir/of/wavs/ -output_dir /path/to/desired/output/dir/ + -wav_dir /path/to/parent/dir/of/wavs/ + -output_dir /path/to/desired/output/dir/ -class_list /path/to/classes.txt """ -from create_segments import setup_logger, get_paths, create_segments -from create_segments import create_noise_segments, create_csv -from filter_labels import filter_labels_2017, filter_labels_2018 import argparse -import pandas as pd import ntpath import os +import pandas as pd +from create_segments import get_paths, create_segments +from create_segments import create_noise_segments +from filter_labels import filter_labels_2017, filter_labels_2018 + def create_dataset(labels, wav_dir, output_dir, class_list): - """ + """Creates labeled and non labeled segments and metadata. + + Creates segments based on human labeled data of a detection, + and then creates an equal number of randomized 'non-detection' + segments at fixed length. It cretaes a uuid for each segment + and spits out a metadata file that matches the segment to its + label, original wav file, relative start time to original wav, + and duration. + + Args: + labels (str): Path to label file. + + wav_dir (str): Path to original wav segments of audio. + + output_dir (str): Path to where the segments and metadata + will go. + + class_list (str): Path to file containing the classes + seen in the human labels file that you + want to create segments for. 
""" # parse the inputs - '''if output dir exists - good, if not make - if labels exist, good - if not tell user - if wav dir exists - if not tell user''' out_file = ntpath.dirname(output_dir) result_file = os.path.join(out_file, "metadata.csv") if os.path.exists(result_file): @@ -47,7 +61,7 @@ def create_dataset(labels, wav_dir, output_dir, class_list): wav_file_paths = get_paths(wav_dir) # open human label file labels = pd.read_csv(labels) - #iterate through each individual original wav + # iterate through each individual original wav if "2017" in labels['DATE'].iloc[0]: use_2017 = True elif "2018" in labels['DATE'].iloc[0]: @@ -57,14 +71,21 @@ def create_dataset(labels, wav_dir, output_dir, class_list): for wav in wav_file_paths: # check which label format to select parsing method # create dataframe of only the labels that correspond to the wav - if use_2017 == True: - filtered_labels = filter_labels_2017(wav, labels) - elif use_2017 == False: - filtered_labels = filter_labels_2018(wav, labels) + if use_2017: + filtered_labels = filter_labels_2017(wav, + labels) + else: + filtered_labels = filter_labels_2018(wav, + labels) # output the labeled segments and return the dataframe of annotations - new_buow_rows = create_segments(wav, filtered_labels, output_dir, class_list) + new_buow_rows = create_segments(wav, + filtered_labels, + output_dir, + class_list) # create same number of noise segments from the same wav file randomly - all_buow_rows = create_noise_segments(wav, new_buow_rows, output_dir) + all_buow_rows = create_noise_segments(wav, + new_buow_rows, + output_dir) # add the annotations to the csv of metadata for the dataset if not all_buow_rows.empty: wavv = str(wav) @@ -79,29 +100,39 @@ def create_dataset(labels, wav_dir, output_dir, class_list): intt = 0 for wavs in wav_files: print(f"{wavs} had {num_samples[intt]} including noise segments") - intt +=1 + intt += 1 print(f"Created results: {result_file}") + def main(labels, wav_dir, output_dir, class_list): - """ + """Main script to run create dataset. + + Args: + labels (str): Path to label file. + + wav_dir (str): Path to original wav segments of audio. + + output_dir (str): Path to where the segments and metadata + will go. + + class_list (str): Path to file containing the classes + seen in the human labels file that you + want to create segments for. 
""" create_dataset(labels, wav_dir, output_dir, class_list) -if __name__=="__main__": - parser = argparse.ArgumentParser( + +if __name__ == "__main__": + PARSER = argparse.ArgumentParser( description='Input Directory Path' ) - parser.add_argument('-labels', type=str, + PARSER.add_argument('-labels', type=str, help='Path to human labeled csv') - parser.add_argument('-wav_dir', type=str, + PARSER.add_argument('-wav_dir', type=str, help='Path to directory containing wav files.') - parser.add_argument('-output_dir', type=str, + PARSER.add_argument('-output_dir', type=str, help='Path to desired directory for segments.') - parser.add_argument('-class_list', type=str, + PARSER.add_argument('-class_list', type=str, help='Path to txt file of list of labeled classes') - #parser.add_argument('-l', '--lengthen', type=int, default=0, - # help='ms of padding for front and end of detection segment') - # parser.add_argument('-e', '--equalize', type=int, - # help='each detection segment and noise segment will be the same length, not zero padded') - args = parser.parse_args() - main(args.labels, args.wav_dir, args.output_dir, args.class_list) + ARGS = PARSER.parse_args() + main(ARGS.labels, ARGS.wav_dir, ARGS.output_dir, ARGS.class_list) From c67b6ced9d4735548d19b4adbef506912369512c Mon Sep 17 00:00:00 2001 From: Katie Garwood Date: Thu, 15 May 2025 15:37:57 -0700 Subject: [PATCH 031/120] adding some doctstrings --- create_dataset/create_segments.py | 126 ++++++++++++++++++++---------- 1 file changed, 85 insertions(+), 41 deletions(-) diff --git a/create_dataset/create_segments.py b/create_dataset/create_segments.py index 889f95f..62dbd14 100644 --- a/create_dataset/create_segments.py +++ b/create_dataset/create_segments.py @@ -1,27 +1,24 @@ """Functions to create segments of detections of interest from wavs. - +Functions get called in create_dataset.py. """ -import pandas as pd import os -from pydub import AudioSegment, exceptions -import logging -from pathlib import Path -import ntpath import uuid -import numpy as np -import random -import ntpath import csv +import pandas as pd +from pydub import AudioSegment, exceptions +import numpy as np -def setup_logger(level, filename=None): - """ - """ - - def get_paths(home_dir): - """ + """Obtain paths to every wav in the directory provided. + + Args: + home_dir (str): Path to directory containing original wavs. + + Returns: + wavs_file_paths (list): List of all the full paths to a wav in + the given directory. """ wavs_file_paths = [] for path, dirs, files in os.walk(home_dir): @@ -31,17 +28,44 @@ def get_paths(home_dir): wavs_file_paths.append(new_file) return wavs_file_paths + def create_segments(wav, filtered_labels, out_path, class_list): - """ + """Create the labeled segments. + + Args: + wav (str): Path to current wav file in loop. + + filtered_labels (pd.Dataframe): The human label file reduced + to only contain the rows of + detections pertinent to the + wav of interest. + out_path (str): Path to directory where segment will be saved. + + class_list (str): Path to the class list that you'd like segments + to be created for. What the manual ID's are in + the human label file- will ignore everything that + is misspelled or unknown labels. + + Returns: + output_rows (pd.Dataframe): The metadata now associated with the + created segments for a given wav file. 
""" print(f"creating segments for {wav}") if filtered_labels is None: - print(f"skipping segment creation for {wav} because it does not have labels or is not a file of interest") + print(f"skipping segment creation for {wav} because " + "it does not have labels or is not a file of interest") return None if filtered_labels.empty: - print(f"filtered labels is an empty dataframe, meaning either the sound file was not labeled or has no detections") + print(f"filtered labels is an empty dataframe, " + "meaning either the sound file was not " + "labeled or has no detections") return None - output_rows = pd.DataFrame(columns=['segment', 'label', 'segment_path', 'original_path', 'segment_duration_s', 'segment_rel_start_ms']) + output_rows = pd.DataFrame(columns=['segment', + 'label', + 'segment_path', + 'original_path', + 'segment_duration_s', + 'segment_rel_start_ms']) with open(class_list, 'r', newline='') as file: reader = csv.reader(file) classes = next(reader) @@ -53,7 +77,6 @@ def create_segments(wav, filtered_labels, out_path, class_list): filtered_labels['MANUAL ID*'] = filtered_labels['MANUAL ID*'].str.lower() print(filtered_labels) df_row = 0 - path = ntpath.dirname(wav) for index, row in filtered_labels.iterrows(): for call_type in classes: if row['MANUAL ID*'] == call_type: @@ -66,22 +89,42 @@ def create_segments(wav, filtered_labels, out_path, class_list): id = str(id) + '.wav' segment_path = os.path.join(out_path, id) segment.export(segment_path, format='wav') - output_rows.loc[df_row] = [id, call_type, segment_path, wav, float(row['DURATION']), start_time] + output_rows.loc[df_row] = [id, + call_type, + segment_path, + wav, + float(row['DURATION']), + start_time] df_row += 1 else: continue return output_rows -# def create_birdnet_segments(wav, out_path, birdnet_class_list=None): def create_noise_segments(wav, new_buow_rows, out_path): """ Randomly select an equal number of 3s noise segments to the number of detections per audio file, a buffer length away from all of the detections in the file. + + Args: + wav (str): The path to the given wav. + + new_buow_rows (pd.Dataframe): The human labeled detection + segment metadata for the given + wav. + + out_path (str): The directory where the new no_buow segments will + go to join the human labeled segments. + + Returns: + all_buow_rows (pd.Dataframe): The metadata for the detection as + well as the no_buow segments created + from the given wav. 
""" if new_buow_rows is None: - print(f"not creating noise segments from {wav} because there were no labels or no associated labels") + print(f"not creating noise segments from {wav} because " + "there were no labels or no associated labels") all_buow_rows = pd.DataFrame() return all_buow_rows try: @@ -95,7 +138,8 @@ def create_noise_segments(wav, new_buow_rows, out_path): seconds_array = np.zeros(duration) for index, row in new_buow_rows.iterrows(): start = int((row['segment_rel_start_ms'] / 1000) - 1) - end = int((row['segment_rel_start_ms'] / 1000) + row['segment_duration_s']) + end = int((row['segment_rel_start_ms'] / 1000) + + row['segment_duration_s']) mask_start = max(0, start - 30) mask_end = min(len(seconds_array), end + 30 + 1) seconds_array[mask_start:mask_end] = 1 @@ -104,25 +148,25 @@ def create_noise_segments(wav, new_buow_rows, out_path): try: random_index = np.random.choice(len(seconds_array)-3) except: - print(f"{wav} is not long enough to generate no_buow sounds, keeping the detection segment but adding no no_buow") + print(f"{wav} is not long enough to generate no_buow sounds, " + "keeping the detection segment but adding no no_buow") return new_buow_rows if seconds_array[random_index] == 0 and seconds_array[random_index + 3] == 0: - start_time = (random_index + 1) * 1000 - end_time = (random_index + 4) * 1000 - segment = audio[start_time:end_time] - duration_of_segment = len(segment) / 1000 - id = uuid.uuid4() - id = str(id) + '.wav' - segment_path = os.path.join(out_path, id) - segment.export(segment_path, format='wav') - new_buow_rows.loc[new_sample] = [id, call_type, segment_path, wav, duration_of_segment, start_time] - new_sample += 1 + start_time = (random_index + 1) * 1000 + end_time = (random_index + 4) * 1000 + segment = audio[start_time:end_time] + duration_of_segment = len(segment) / 1000 + id = uuid.uuid4() + id = str(id) + '.wav' + segment_path = os.path.join(out_path, id) + segment.export(segment_path, format='wav') + new_buow_rows.loc[new_sample] = [id, + call_type, + segment_path, + wav, + duration_of_segment, + start_time] + new_sample += 1 all_buow_rows = new_buow_rows return all_buow_rows - -def create_csv(new_rows, output_dir): - """ - """ - if os.path.exists(output_dir): - pd.con From 14e0e3f14971c36a4ad41ef0f344106020a04dbc Mon Sep 17 00:00:00 2001 From: Sumega Mandadi Date: Wed, 28 May 2025 16:07:17 -0700 Subject: [PATCH 032/120] Update output to new format --- make_model/prepare_perch_embeddings.py | 49 ++++++++++++++++---------- 1 file changed, 31 insertions(+), 18 deletions(-) diff --git a/make_model/prepare_perch_embeddings.py b/make_model/prepare_perch_embeddings.py index 24845b2..393b030 100644 --- a/make_model/prepare_perch_embeddings.py +++ b/make_model/prepare_perch_embeddings.py @@ -1,13 +1,14 @@ ''' -Convert Perch Embedding Output to standard embeddings .csv +Convert Perch Embedding Output to standard embeddings .pkl for easy training of various models This script processes the outputs of the perch embedding -scripts and converst to a usable .csv file that stores -filename and embedding +scripts and converts to a .pkl dataframe that stores +filename, embedding, and related metadata Usage: python prepare_perch_embeddings \ /path/to/db/dir \ + /path/to/metadata/file \ /path/to/output/dir \ embeddings_description @@ -18,7 +19,7 @@ csv Outputs: - _perch_embeddings.csv + _perch_embeddings.pkl ''' @@ -29,37 +30,47 @@ from perch_hoplite.db import sqlite_usearch_impl -def main(sqlite_dir, output_dir, embeddings_description): +def 
prepare_perch_embeddings(sqlite_dir,
+                             metadata_path,
+                             output_dir,
+                             embeddings_description):
     '''
     runs main script
     '''
 
-    # load database
+    # load embeddings database
     db = sqlite_usearch_impl.SQLiteUsearchDB.create(sqlite_dir)
 
+    # load dataset metadata
+    metadata = pd.read_csv(metadata_path, index_col=0)
+
+    embeddings_data = []
 
     n_embeddings = db.count_embeddings()
 
     for i in range(n_embeddings):
 
         file_name = db.get_embedding_source(i+1).source_id
+        embedding = db.get_embedding(i+1)
 
-        base_dict = {'filename': file_name}
+        base_dict = {'segment': file_name,
+                     'embedding': embedding}
 
-        embedding = db.get_embedding(i+1)
-        embedding_dict = {f'{j}': val for j, val in enumerate(embedding)}
+        #embedding_dict = {f'{j}': val for j, val in enumerate(embedding)}
+
+        #full_row = {**base_dict, **embedding_dict}
+
+        embeddings_data.append(base_dict)
 
-        full_row = {**base_dict, **embedding_dict}
+    embeddings_df = pd.DataFrame(embeddings_data)
+    merged_df = pd.merge(embeddings_df, metadata, on='segment')
 
-        master_data.append(full_row)
+    output_filename = os.path.join(output_dir, f'{embeddings_description}_perch_embeddings.pkl')
+    merged_df.to_pickle(output_filename)
 
-    master_df = pd.DataFrame(master_data)
-    csv_filename = os.path.join(output_dir, f'{embeddings_description}_perch_embeddings.csv')
-    master_df.to_csv(csv_filename, index=False)
+# merged_df.to_csv(csv_filename, index=False)
 
-    print('Complete!')
-    print(f'Saved at:\n\t{csv_filename}')
+    print(f'Embeddings saved at:\n\t{output_filename}')
 
 
 if __name__ == '__main__':
@@ -70,10 +81,12 @@ def main(sqlite_dir, output_dir, embeddings_description):
     parser.add_argument('sqlite_dir', type=str,
                         help='Path to directory that contains '
                              'hoplite.sqlite and usearch.index')
+    parser.add_argument('metadata_path', type=str,
+                        help='Path to metadata file')
     parser.add_argument('output_dir', type=str,
                         help='Directory for output file')
     parser.add_argument('embeddings_description', type=str,
                         help='Name of embeddings group')
 
     args = parser.parse_args()
-    main(args.sqlite_dir, args.output_dir, args.embeddings_description)
+    prepare_perch_embeddings(args.sqlite_dir, args.metadata_path, args.output_dir, args.embeddings_description)

From 6a7c28582e9739234fb7125e2296f328c5fc1286 Mon Sep 17 00:00:00 2001
From: Sean Perry
Date: Wed, 11 Jun 2025 14:24:51 -0700
Subject: [PATCH 033/120] Adds initial files for project

Stopping halfway through to get a package manager set up; once that is set
up, I will return to fill out these files
---
 model_training/data_augmentation.py |  0
 model_training/dataset.py           |  0
 model_training/models/model.py      | 22 ++++++++++++++++++++++
 model_training/pipeline.py          |  0
 model_training/trainer.py           |  0
 5 files changed, 22 insertions(+)
 create mode 100644 model_training/data_augmentation.py
 create mode 100644 model_training/dataset.py
 create mode 100644 model_training/models/model.py
 create mode 100644 model_training/pipeline.py
 create mode 100644 model_training/trainer.py

diff --git a/model_training/data_augmentation.py b/model_training/data_augmentation.py
new file mode 100644
index 0000000..e69de29
diff --git a/model_training/dataset.py b/model_training/dataset.py
new file mode 100644
index 0000000..e69de29
diff --git a/model_training/models/model.py b/model_training/models/model.py
new file mode 100644
index 0000000..c9874dd
--- /dev/null
+++ b/model_training/models/model.py
@@ -0,0 +1,22 @@
+from torch import nn
+from abc import ABC, abstractmethod
+import numpy as np
+import typing
+
+
+
+class Model(ABC):
+    """
+    Gets an embedding for the model
the model + + This can be the final layer of a model backbone + or a set of useful features + + Returns + embedding + """ + @abstractmethod + def get_embeddings(x) -> np.array: + + + diff --git a/model_training/pipeline.py b/model_training/pipeline.py new file mode 100644 index 0000000..e69de29 diff --git a/model_training/trainer.py b/model_training/trainer.py new file mode 100644 index 0000000..e69de29 From 6f0528dbfd4fc3a5a6e9daeaba27a088e82e085c Mon Sep 17 00:00:00 2001 From: Sean Perry Date: Wed, 11 Jun 2025 14:30:48 -0700 Subject: [PATCH 034/120] Add pyproject.toml --- pyproject.toml | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 pyproject.toml diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..8b1abe7 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,27 @@ +[project] +name = "whoot" +version = "0.0.2.dev0" +description = "Tools for capturing, analyzing, and parsing audio data" +readme = "README.md" +requires-python = ">=3.12" +dependencies = [ + "librosa>=0.11.0", + "numpy>=2.2.6", + "pandas>=2.3.0", + "pydub>=0.25.1", + "scikit-learn>=1.7.0", + "tqdm>=4.67.1", +] + +[project.optional-dependencies] +cpu = [ + "torch>=2.7.0", + "torchvision>=0.22.0", +] +cu128 = [ + "torch>=2.7.0", + "torchvision>=0.22.0", +] + +[packages.index] +cu128 = "https://download.pytorch.org/whl/cu128" \ No newline at end of file From 36e1db47e7999bf0231eaf01bf1f15686eff15c7 Mon Sep 17 00:00:00 2001 From: Sean Perry Date: Wed, 11 Jun 2025 14:39:51 -0700 Subject: [PATCH 035/120] add documentation for install --- README.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/README.md b/README.md index 3f18983..fe53d3b 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,17 @@ # whoot Tools for capturing, analyzing, and parsing audio data + +# Installation Instructions + +## Default Python Instructions +1) Install Python>=3.12 +2) Run in project root `pip install -e .` + +To install optional dependencies run `pip install -e .[extra1,extra2,...]` + +Current support optional dependency collections include + +- `cpu`: Installs torch and torchvision for CPU use only +- `cu128`: Installs torch and torchvision with Cuda 12.8 Binaries + + From fd8b1e86f0edb68d46fee0c9b3a9ed9b9d7c4c4d Mon Sep 17 00:00:00 2001 From: Sean Perry Date: Wed, 11 Jun 2025 14:43:23 -0700 Subject: [PATCH 036/120] Set python required to 3.10 and above --- README.md | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index fe53d3b..a39ac96 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ Tools for capturing, analyzing, and parsing audio data # Installation Instructions ## Default Python Instructions -1) Install Python>=3.12 +1) Install Python>=3.10 2) Run in project root `pip install -e .` To install optional dependencies run `pip install -e .[extra1,extra2,...]` diff --git a/pyproject.toml b/pyproject.toml index 8b1abe7..a0d456f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ name = "whoot" version = "0.0.2.dev0" description = "Tools for capturing, analyzing, and parsing audio data" readme = "README.md" -requires-python = ">=3.12" +requires-python = ">=3.10" dependencies = [ "librosa>=0.11.0", "numpy>=2.2.6", From 46d0f0ec339dd99c29f94c39ee43f9a889a5bdca Mon Sep 17 00:00:00 2001 From: Sean Perry Date: Wed, 11 Jun 2025 14:57:14 -0700 Subject: [PATCH 037/120] Fixes bug from folder layout Traditional PEP 518 format requires source folders be under src or whoot (repo name). 
This allows us to keep having mutliple packages/tools in the same repo --- README.md | 4 ++++ pyproject.toml | 5 ++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index a39ac96..1d20f54 100644 --- a/README.md +++ b/README.md @@ -15,3 +15,7 @@ Current support optional dependency collections include - `cu128`: Installs torch and torchvision with Cuda 12.8 Binaries + +## Developer Notes + +When adding a new package, like `assess_birdnet` to the whoot toolkit, add your package name to the `[tool.setuptools]` section of `pyproject.toml` \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index a0d456f..0cf3211 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,4 +24,7 @@ cu128 = [ ] [packages.index] -cu128 = "https://download.pytorch.org/whl/cu128" \ No newline at end of file +cu128 = "https://download.pytorch.org/whl/cu128" + +[tool.setuptools] +packages = ["make_model", "assess_birdnet"] \ No newline at end of file From 87acb4d977b2278a2bb773e489217c48924e1176 Mon Sep 17 00:00:00 2001 From: Sean Perry Date: Wed, 11 Jun 2025 14:58:12 -0700 Subject: [PATCH 038/120] Adds .gitignore for build artifacts --- .gitignore | 209 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 209 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..09183a1 --- /dev/null +++ b/.gitignore @@ -0,0 +1,209 @@ +# Created by https://www.toptal.com/developers/gitignore/api/python,venv,visualstudiocode +# Edit at https://www.toptal.com/developers/gitignore?templates=python,venv,visualstudiocode + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. 
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + +### Python Patch ### +# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration +poetry.toml + +# ruff +.ruff_cache/ + +# LSP config files +pyrightconfig.json + +### venv ### +# Virtualenv +# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/ +[Bb]in +[Ii]nclude +[Ll]ib +[Ll]ib64 +[Ll]ocal +[Ss]cripts +pyvenv.cfg +pip-selfcheck.json + +### VisualStudioCode ### +.vscode/* +!.vscode/settings.json +!.vscode/tasks.json +!.vscode/launch.json +!.vscode/extensions.json +!.vscode/*.code-snippets + +# Local History for Visual Studio Code +.history/ + +# Built Visual Studio Code Extensions +*.vsix + +### VisualStudioCode Patch ### +# Ignore all local history of files +.history +.ionide + +# End of https://www.toptal.com/developers/gitignore/api/python,venv,visualstudiocode + +uv.lock \ No newline at end of file From 174b1391d0be236dd45934a1b05cca796e15b3ad Mon Sep 17 00:00:00 2001 From: Sean Perry Date: Wed, 11 Jun 2025 15:19:37 -0700 Subject: [PATCH 039/120] Write draft of abstract model class Stopping halfway through again to address package manager --- model_training/models/model.py | 61 +++++++++++++++++++++++++++++++--- 1 file changed, 56 insertions(+), 5 deletions(-) diff --git a/model_training/models/model.py b/model_training/models/model.py index c9874dd..fa1c9c9 100644 --- a/model_training/models/model.py +++ b/model_training/models/model.py @@ -1,8 +1,41 @@ -from torch import nn from abc import ABC, abstractmethod + +from torch import nn import numpy as np -import typing +class ModelOutput(ABC): + """ModelOutput + + Object that stores the output of a model + This allows for standardizing model outputs + So upstream applications don't need to change for spefific models + + Inspired by HuggingFace Models + + Developer: Reccommend for each Model, to have an assocaited ModelOutput class + """ + + def __init__(self, logits: np.array, embeddings: np.array): + self.embeddings = embeddings + self.logits = logits + + +class ModelInput(ABC): + """ModelInput + + Spefifies Input Types + Hopefully should help standardize formatting for models + + Inspired by HuggingFace Models and Tokenizers + + Developer: Reccommend for each Model, to 
have an assocaited ModelInput class + """ + + def __init__( + self, waveform: np.array | None = None, spectrogram: np.array | None = None + ): + self.waveform = waveform + self.spectrogram = spectrogram class Model(ABC): @@ -12,11 +45,29 @@ class Model(ABC): This can be the final layer of a model backbone or a set of useful features + Args + x: Any | Either np.array or Torch.Tensor, is the input for the model + Returns - embedding + embedding: np.array, some embedding vector representing the input data """ - @abstractmethod - def get_embeddings(x) -> np.array: + def get_embeddings(self, x: ModelInput) -> np.array: + return self.forward(x).embeddings + + """ + Runs some input x through the model + In PyTorch models, this is the same forward function + We just apply the convention for non Pytorch models, + TODO: Some things to concern + - + Args: + x: Any + Returns: + ModelOutput: dict, a dictionary like object that describes + """ + @abstractmethod + def forward(self, x: ModelInput) -> ModelOutput: + pass From a04a654279ddb7d9edaf6046be3311f251be4363 Mon Sep 17 00:00:00 2001 From: Sean Perry Date: Wed, 11 Jun 2025 15:27:31 -0700 Subject: [PATCH 040/120] Updates documentation for venv --- README.md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 1d20f54..d47a6d8 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,15 @@ Tools for capturing, analyzing, and parsing audio data ## Default Python Instructions 1) Install Python>=3.10 -2) Run in project root `pip install -e .` +2) Create a virtual enviroment via `python -m venv` +3) Activate the enviroment using an activate script: + +- Windows: `.venv\Scripts\activate` +- macOS/Linux: `source .venv/bin/activate` + +If this works, you should see in your command line `(whoot)`. 
If not check https://docs.python.org/3/library/venv.html#how-venvs-work + +4) Run in project root `pip install -e .` To install optional dependencies run `pip install -e .[extra1,extra2,...]` From a75f0312882423c21ffd0e95c51479435f5d6c5c Mon Sep 17 00:00:00 2001 From: Sean Perry Date: Wed, 11 Jun 2025 16:05:45 -0700 Subject: [PATCH 041/120] Adds loss to ModelOutput --- model_training/models/model.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/model_training/models/model.py b/model_training/models/model.py index fa1c9c9..384d384 100644 --- a/model_training/models/model.py +++ b/model_training/models/model.py @@ -15,9 +15,15 @@ class ModelOutput(ABC): Developer: Reccommend for each Model, to have an assocaited ModelOutput class """ - def __init__(self, logits: np.array, embeddings: np.array): + def __init__( + self, + logits: np.array, + embeddings: np.array, + loss: np.array | None = None + ): self.embeddings = embeddings self.logits = logits + self.loss = loss class ModelInput(ABC): @@ -32,7 +38,9 @@ class ModelInput(ABC): """ def __init__( - self, waveform: np.array | None = None, spectrogram: np.array | None = None + self, + waveform: np.array | None = None, + spectrogram: np.array | None = None, ): self.waveform = waveform self.spectrogram = spectrogram @@ -71,3 +79,11 @@ def get_embeddings(self, x: ModelInput) -> np.array: @abstractmethod def forward(self, x: ModelInput) -> ModelOutput: pass + + + """ + Notes on design for the future + + - Should model implement a way to save/load model to/form disk + + """ \ No newline at end of file From f8d43fac4fca58b92e82c553246f0f7373b399f4 Mon Sep 17 00:00:00 2001 From: Sean Perry Date: Wed, 11 Jun 2025 16:10:37 -0700 Subject: [PATCH 042/120] Update timm_model to include loss function --- model_training/models/timm_model.py | 45 +++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 model_training/models/timm_model.py diff --git a/model_training/models/timm_model.py b/model_training/models/timm_model.py new file mode 100644 index 0000000..d1db531 --- /dev/null +++ b/model_training/models/timm_model.py @@ -0,0 +1,45 @@ +from .model import Model, ModelInput, ModelOutput +import timm +from torch import nn + +""" + Wrapper around the timms model zoo + + See https://timm.fast.ai/ + + Timm model zoo good for computer vision models + Like CNNs, which are useful for spectrograms + + Great repo for models, but currently using this for demoing pipeline +""" +class TimmInputs(ModelInput): + def __init__(self, waveform = None, spectrogram = None): + # Can use inputs to verify correct shape for upstream model + assert spectrogram.shape[1:] == (1, 100, 100) + super().__init__(None, spectrogram) + + +class TimmModel(nn.Module, Model): + def __init__(self, timm_model='resnet34', pretrained=True, in_chans=1, num_classes=6, loss=None): + assert num_classes > 0 + + self.backbone = timm.create_model(timm_model, pretrained=pretrained, in_chans=in_chans) + # Unsure if 1000 is default for all models. Need to check this + self.linear = nn.Linear(1000, num_classes) + + # Models might need diffrent losses during training! 
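+        # e.g. a hypothetical caller (editor's sketch, not part of this
+        # commit) could swap in a single-label criterion instead of the
+        # default multilabel BCE:
+        #     model = TimmModel(num_classes=6, loss=nn.CrossEntropyLoss())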
+        if loss is not None:
+            self.loss = loss
+        else:
+            self.loss = nn.BCEWithLogitsLoss()
+
+    def forward(self, x: TimmInputs) -> ModelOutput:
+        embedd = self.backbone(x)
+        logits = self.linear(embedd)
+        loss = self.loss(logits)
+
+        return ModelOutput(
+            logits=logits,
+            embeddings=embedd,
+            loss=loss
+        )

From 41473edb917031accfa432f9a5ae7bf957375e0b Mon Sep 17 00:00:00 2001
From: Sean Perry
Date: Wed, 11 Jun 2025 17:10:29 -0700
Subject: [PATCH 043/120] Start building dataset handling

Starts a system that enforces some standards on how models are put
together, to make training easier to work with.

Using Apache Arrow datasets from HuggingFace since I find that, for
large datasets, they are faster than pandas and allow automatic
decoding of audio data
---
 .gitignore                                  |  3 +-
 model_training/README.md                    |  3 ++
 .../data_extractor/buowset_extractor.py     | 29 ++++++++++++++++
 model_training/dataset.py                   | 33 +++++++++++++++++++
 model_training/pipeline.py                  |  7 ++++
 pyproject.toml                              |  4 +++
 6 files changed, 78 insertions(+), 1 deletion(-)
 create mode 100644 model_training/README.md
 create mode 100644 model_training/data_extractor/buowset_extractor.py

diff --git a/.gitignore b/.gitignore
index 09183a1..260249f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -206,4 +206,5 @@ pip-selfcheck.json
 
 # End of https://www.toptal.com/developers/gitignore/api/python,venv,visualstudiocode
 
-uv.lock
\ No newline at end of file
+uv.lock
+.ruff_cache
\ No newline at end of file
diff --git a/model_training/README.md b/model_training/README.md
new file mode 100644
index 0000000..d687647
--- /dev/null
+++ b/model_training/README.md
@@ -0,0 +1,3 @@
+Toolkit for training Machine Learning Classification Models over audio datasets
+
+Key inspiration is https://github.com/UCSD-E4E/pyha-analyzer-2.0/tree/main. This repo differs in that it uses a traditional training pipeline rather than the Hugging Face Trainer. The Hugging Face Trainer abstracts away the training code, which should stay explicit for this toolkit.
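[Editor's note: the README above argues for an explicit training loop in
place of the Hugging Face Trainer. A minimal sketch of what such a loop
could look like -- the function name, batch layout, and optimizer choice
are illustrative assumptions, not code from this commit:

    import torch
    from torch.utils.data import DataLoader

    def train_one_epoch(model, dataset, criterion,
                        batch_size=64, lr=1e-3, device="cpu"):
        """One explicit pass over the data; every step stays visible."""
        loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
        optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
        model.train()
        for batch in loader:
            spectrograms = batch["audio"].to(device)  # batched model inputs
            labels = batch["labels"].to(device)       # multi-hot targets
            optimizer.zero_grad()
            logits = model(spectrograms)              # forward pass
            loss = criterion(logits, labels)          # e.g. nn.BCEWithLogitsLoss()
            loss.backward()
            optimizer.step()

The cost is boilerplate the Hugging Face Trainer would otherwise hide; the
benefit is that every step above is visible to a debugger.]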
diff --git a/model_training/data_extractor/buowset_extractor.py b/model_training/data_extractor/buowset_extractor.py new file mode 100644 index 0000000..3ae793f --- /dev/null +++ b/model_training/data_extractor/buowset_extractor.py @@ -0,0 +1,29 @@ +""" + Inspired by https://github.com/UCSD-E4E/pyha-analyzer-2.0/tree/main/pyha_analyzer/extractors + Standardizes the format of the buowset dataset +""" +import argparse +import os +from datasets import load_dataset, Audio, DatasetDict + +def buowset_extractor( + metadata_csv, + parent_path, + output, + validation_fold = 4, + test_fold = 3, + sr=32_000, + filepath="segment"): + ds = load_dataset(metadata_csv) + ds["audio"] = parent_path + "/" + ds["filepath"] #TODO Better file path handling pls + ds = ds.cast_column("audio", Audio()) + + test_ds = ds.filter(lambda x: x["fold"] == validation_fold) + valid_ds = ds.filter(lambda x: x["fold"] == test_fold) + train_ds = ds.filter(lambda x: x["fold"] != test_fold & x["fold"] != validation_fold) + + return DatasetDict({ + "train": train_ds, + "valid": valid_ds, + "test_ds": test_ds + }) diff --git a/model_training/dataset.py b/model_training/dataset.py index e69de29..bf8a749 100644 --- a/model_training/dataset.py +++ b/model_training/dataset.py @@ -0,0 +1,33 @@ +""" + Pulled from https://github.com/UCSD-E4E/pyha-analyzer-2.0/blob/main/pyha_analyzer/dataset.py + Key idea is we define a generic AudioDataset with uniform features + + Using an Arrow Dataset from Hugging Face's dataset library because + - Cool audio features https://huggingface.co/docs/datasets/en/audio_process + - Faster than pandas, better at manging memory +""" +from datasets import DatasetDict, ClassLabel + +DEFAULT_COLUMNS = ["label", "audio"] + +class AudioDataset(DatasetDict): + def __init__(self, ds: DatasetDict): + self.validate_format(ds) + super().__init__(ds) + + def validate_format(self, ds: DatasetDict): + for split in ds.keys(): + dataset = ds[split] + for column in DEFAULT_COLUMNS: + assert column in dataset.features, ( + f"The column `{column}` is missing from dataset split `{split}`. 
Required by system" + ) + + def get_number_species(self): #NOTE: Assumes all labels are mutlilabel (the extra feature note) + return self["train"].features["labels"].feature.num_classes + + def get_class_labels(self): + """ + Returns a new ClassLabel Object to make mapping easier between datasets + """ + return ClassLabel(names=self["train"].features["labels"].names) \ No newline at end of file diff --git a/model_training/pipeline.py b/model_training/pipeline.py index e69de29..989135e 100644 --- a/model_training/pipeline.py +++ b/model_training/pipeline.py @@ -0,0 +1,7 @@ +# Extract the dataset + +# Create the model + +# Run training + +# Visualize results \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 0cf3211..73e4038 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,6 +23,10 @@ cu128 = [ "torchvision>=0.22.0", ] +model_training = [ + "datasets>=3.5.1", +] + [packages.index] cu128 = "https://download.pytorch.org/whl/cu128" From fd9027be5c32ae12527ad1002dd6903586c68785 Mon Sep 17 00:00:00 2001 From: Sean Perry Date: Fri, 13 Jun 2025 11:59:00 -0700 Subject: [PATCH 044/120] Adds pyha-analyzer as a subdependency --- README.md | 1 + pyproject.toml | 10 ++++++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index d47a6d8..e10d168 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,7 @@ Current support optional dependency collections include - `cpu`: Installs torch and torchvision for CPU use only - `cu128`: Installs torch and torchvision with Cuda 12.8 Binaries +- `model_training`: Required for running model_training, make sure to add either `cpu` or `cu128` diff --git a/pyproject.toml b/pyproject.toml index 73e4038..f120fee 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ description = "Tools for capturing, analyzing, and parsing audio data" readme = "README.md" requires-python = ">=3.10" dependencies = [ - "librosa>=0.11.0", + "librosa>=0.10.2.post1", "numpy>=2.2.6", "pandas>=2.3.0", "pydub>=0.25.1", @@ -25,10 +25,16 @@ cu128 = [ model_training = [ "datasets>=3.5.1", + "timm>=1.0.15", + "pyha-analyzer", ] + [packages.index] cu128 = "https://download.pytorch.org/whl/cu128" [tool.setuptools] -packages = ["make_model", "assess_birdnet"] \ No newline at end of file +packages = ["make_model", "assess_birdnet", "model_training"] + +[tool.uv.sources] +pyha-analyzer = { git = "https://github.com/UCSD-E4E/pyha-analyzer-2.0.git", branch = "support_whoot" } From 4df6591926a3032ea8c52bb3603926b07e7c2e06 Mon Sep 17 00:00:00 2001 From: Sean Perry Date: Fri, 13 Jun 2025 11:59:38 -0700 Subject: [PATCH 045/120] Adds default format for datasets --- model_training/data_extractor/__init__.py | 1 + .../data_extractor/buowset_extractor.py | 23 +++++++++++-------- model_training/dataset.py | 5 +++- 3 files changed, 18 insertions(+), 11 deletions(-) create mode 100644 model_training/data_extractor/__init__.py diff --git a/model_training/data_extractor/__init__.py b/model_training/data_extractor/__init__.py new file mode 100644 index 0000000..9486885 --- /dev/null +++ b/model_training/data_extractor/__init__.py @@ -0,0 +1 @@ +from .buowset_extractor import buowset_extractor diff --git a/model_training/data_extractor/buowset_extractor.py b/model_training/data_extractor/buowset_extractor.py index 3ae793f..f7f94ae 100644 --- a/model_training/data_extractor/buowset_extractor.py +++ b/model_training/data_extractor/buowset_extractor.py @@ -5,25 +5,28 @@ import argparse import os from datasets import load_dataset, Audio, 
DatasetDict +from ..dataset import AudioDataset + """_summary_ + """ def buowset_extractor( - metadata_csv, - parent_path, - output, - validation_fold = 4, - test_fold = 3, + metadata_csv, + parent_path, + output, #TODO what does output do? + validation_fold = 4, + test_fold = 3, sr=32_000, filepath="segment"): ds = load_dataset(metadata_csv) - ds["audio"] = parent_path + "/" + ds["filepath"] #TODO Better file path handling pls - ds = ds.cast_column("audio", Audio()) - + ds["audio"] = parent_path + "/" + ds[filepath] #TODO Better file path handling pls + ds = ds.cast_column("audio", Audio(sampling_rate=sr)) + test_ds = ds.filter(lambda x: x["fold"] == validation_fold) valid_ds = ds.filter(lambda x: x["fold"] == test_fold) train_ds = ds.filter(lambda x: x["fold"] != test_fold & x["fold"] != validation_fold) - return DatasetDict({ + return AudioDataset(DatasetDict({ "train": train_ds, "valid": valid_ds, "test_ds": test_ds - }) + })) diff --git a/model_training/dataset.py b/model_training/dataset.py index bf8a749..a65a6a7 100644 --- a/model_training/dataset.py +++ b/model_training/dataset.py @@ -5,8 +5,11 @@ Using an Arrow Dataset from Hugging Face's dataset library because - Cool audio features https://huggingface.co/docs/datasets/en/audio_process - Faster than pandas, better at manging memory + + # TODO Use the default stuff from pyha-anaylzer """ from datasets import DatasetDict, ClassLabel +from torch.utils.data import DataLoader DEFAULT_COLUMNS = ["label", "audio"] @@ -23,7 +26,7 @@ def validate_format(self, ds: DatasetDict): f"The column `{column}` is missing from dataset split `{split}`. Required by system" ) - def get_number_species(self): #NOTE: Assumes all labels are mutlilabel (the extra feature note) + def get_num_classes(self): #NOTE: Assumes all labels are mutlilabel (the extra feature note) return self["train"].features["labels"].feature.num_classes def get_class_labels(self): From aa9d5949fa6b28905533fafe22fb805628c0c5aa Mon Sep 17 00:00:00 2001 From: Sean Perry Date: Fri, 13 Jun 2025 12:00:08 -0700 Subject: [PATCH 046/120] Adds data preprocessing and model format --- model_training/models/__init__.py | 1 + model_training/models/model.py | 58 ++++++++++++++++++- model_training/models/timm_model.py | 29 ++++++---- .../preprocessors/default_preprocessor.py | 33 +++++++++++ 4 files changed, 108 insertions(+), 13 deletions(-) create mode 100644 model_training/models/__init__.py create mode 100644 model_training/preprocessors/default_preprocessor.py diff --git a/model_training/models/__init__.py b/model_training/models/__init__.py new file mode 100644 index 0000000..1b8ee1e --- /dev/null +++ b/model_training/models/__init__.py @@ -0,0 +1 @@ +from timm_model import TimmModel, TimmInputs \ No newline at end of file diff --git a/model_training/models/model.py b/model_training/models/model.py index 384d384..3c1c3a9 100644 --- a/model_training/models/model.py +++ b/model_training/models/model.py @@ -1,8 +1,28 @@ from abc import ABC, abstractmethod +from functools import wraps -from torch import nn +from pyha_analyzer.models.base_model import BaseModel +import torch +from torch import nn, Tensor import numpy as np +""" + Wrapper to check to make sure everything is setup properly + Required before using PyhaTrainer +""" +def has_required_inputs(): + def decorator(forward): + @wraps(forward) + def wrapper(self, x): + assert(isinstance(x, self.input_format)) + model_output = forward(self, x) + assert(isinstance(model_output, self.output_format)) + + return model_output + return wrapper + 
return decorator + + class ModelOutput(ABC): """ModelOutput @@ -19,11 +39,29 @@ def __init__( self, logits: np.array, embeddings: np.array, + labels: np.array | None = None, loss: np.array | None = None ): self.embeddings = embeddings self.logits = logits self.loss = loss + self.labels = labels + + def to_hugging_face(self): + return { + "predictions": self.logits, + "label_ids": [self.labels], + } + + @classmethod + def concat(list_of_outputs: list): + return ModelOutput( + logits = torch.vstack([out.logits for out in list_of_outputs]), + embeddings = torch.vstack([out.embeddings for out in list_of_outputs]), + loss = torch.vstack([out.loss for out in list_of_outputs]), + labels = torch.vstack([out.labels for out in list_of_outputs]), + ) + class ModelInput(ABC): @@ -39,14 +77,27 @@ class ModelInput(ABC): def __init__( self, + labels: np.array, waveform: np.array | None = None, spectrogram: np.array | None = None, ): self.waveform = waveform self.spectrogram = spectrogram + self.labels = labels + + def to_tensor(self, device="cpu"): + self.waveform = Tensor(self.waveform, device=device) + self.spectrogram = Tensor(self.spectrogram, device=device) + self.labels = Tensor(self.labels, device=device) +class Model(ABC, nn.Module, BaseModel): + # TODO Define required class intance variables + # Such as cirteron etc. + def __init__(self, *args, **kwargs): + self.input_format = ModelInput + self.output_format = ModelOutput + super().__init__(*args, **kwargs) -class Model(ABC): """ Gets an embedding for the model @@ -65,7 +116,7 @@ def get_embeddings(self, x: ModelInput) -> np.array: """ Runs some input x through the model - In PyTorch models, this is the same forward function + In PyTorch models, this is the same forward functionlogits We just apply the convention for non Pytorch models, TODO: Some things to concern @@ -77,6 +128,7 @@ def get_embeddings(self, x: ModelInput) -> np.array: ModelOutput: dict, a dictionary like object that describes """ @abstractmethod + @has_required_inputs def forward(self, x: ModelInput) -> ModelOutput: pass diff --git a/model_training/models/timm_model.py b/model_training/models/timm_model.py index d1db531..ba5ab55 100644 --- a/model_training/models/timm_model.py +++ b/model_training/models/timm_model.py @@ -1,6 +1,7 @@ -from .model import Model, ModelInput, ModelOutput import timm -from torch import nn +from torch import nn, Tensor + +from model import Model, ModelInput, ModelOutput, has_required_inputs """ Wrapper around the timms model zoo @@ -13,14 +14,20 @@ Great repo for models, but currently using this for demoing pipeline """ class TimmInputs(ModelInput): - def __init__(self, waveform = None, spectrogram = None): - # Can use inputs to verify correct shape for upstream model - assert spectrogram.shape[1:] == (1, 100, 100) - super().__init__(None, spectrogram) + def __init__(self, labels, waveform = None, spectrogram = None, device="cpu"): + # # Can use inputs to verify correct shape for upstream model + # assert spectrogram.shape[1:] == (1, 100, 100) + super().__init__(labels, waveform, spectrogram) + self.labels = Tensor(labels) + self.spectrogram = Tensor(spectrogram) class TimmModel(nn.Module, Model): def __init__(self, timm_model='resnet34', pretrained=True, in_chans=1, num_classes=6, loss=None): + super().__init__() + self.input_format = TimmInputs + self.output_format = ModelOutput + assert num_classes > 0 self.backbone = timm.create_model(timm_model, pretrained=pretrained, in_chans=in_chans) @@ -32,14 +39,16 @@ def __init__(self, 
timm_model='resnet34', pretrained=True, in_chans=1, num_class self.loss = loss else: self.loss = nn.BCEWithLogitsLoss() - + + @has_required_inputs def forward(self, x: TimmInputs) -> ModelOutput: - embedd = self.backbone(x) + embedd = self.backbone(x.spectrogram) logits = self.linear(embedd) - loss = self.loss(logits) + loss = self.loss(logits, x.labels) return ModelOutput( logits=logits, embeddings=embedd, - loss=loss + loss=loss, + labels = x.labels ) diff --git a/model_training/preprocessors/default_preprocessor.py b/model_training/preprocessors/default_preprocessor.py new file mode 100644 index 0000000..1a0f682 --- /dev/null +++ b/model_training/preprocessors/default_preprocessor.py @@ -0,0 +1,33 @@ +from pyha_analyzer.preprocessors import MelSpectrogramPreprocessors +from models.model import ModelInput + + +"""_summary_ + +Returns: + _type_: _description_ +""" +class SpectrogramModelInputPreprocessors(MelSpectrogramPreprocessors): + def __init__( + self, + ModelInput:ModelInput, + duration=5, + augment=None, + spectrogram_augments=None, + class_list=..., + n_fft=2048, + hop_length=256, + power=2, + n_mels=256, + dataset_ref=None, + ): + super().__init__( + duration, augment, spectrogram_augments, + class_list, n_fft, hop_length, power, n_mels, + dataset_ref + ) + self.ModelInput = ModelInput + + def __call__(self, batch): + batch = super().__call__(batch) + return self.ModelInput(labels=batch["labels"], spectrogram=batch["audio"]) \ No newline at end of file From a4865a7c84adfdb485630367e1ba39f5382d59ae Mon Sep 17 00:00:00 2001 From: Sean Perry Date: Fri, 13 Jun 2025 12:00:31 -0700 Subject: [PATCH 047/120] Adds the preprocessor for spectrograms --- model_training/preprocessors/__init__.py | 1 + 1 file changed, 1 insertion(+) create mode 100644 model_training/preprocessors/__init__.py diff --git a/model_training/preprocessors/__init__.py b/model_training/preprocessors/__init__.py new file mode 100644 index 0000000..6e03d90 --- /dev/null +++ b/model_training/preprocessors/__init__.py @@ -0,0 +1 @@ +from default_preprocessor import SpectrogramModelInputPreprocessors \ No newline at end of file From 0fe10ba6809703b8ee4c6260f9b530108361631a Mon Sep 17 00:00:00 2001 From: Sean Perry Date: Fri, 13 Jun 2025 12:00:57 -0700 Subject: [PATCH 048/120] Adds a demo train script --- model_training/pipeline.py | 7 ------- model_training/train.py | 40 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 7 deletions(-) delete mode 100644 model_training/pipeline.py create mode 100644 model_training/train.py diff --git a/model_training/pipeline.py b/model_training/pipeline.py deleted file mode 100644 index 989135e..0000000 --- a/model_training/pipeline.py +++ /dev/null @@ -1,7 +0,0 @@ -# Extract the dataset - -# Create the model - -# Run training - -# Visualize results \ No newline at end of file diff --git a/model_training/train.py b/model_training/train.py new file mode 100644 index 0000000..aacc60d --- /dev/null +++ b/model_training/train.py @@ -0,0 +1,40 @@ +from pyha_analyzer import PyhaTrainer, PyhaTrainingArguments + +from .data_extractor import buowset_extractor +from .models import TimmModel, TimmInputs +from .preprocessors import SpectrogramModelInputPreprocessors + + +# Extract the dataset +ds = buowset_extractor(metadata_csv="data.csv", parent_path="data_parent_path/data/", output=None) + +# Create the model +model = TimmModel(num_classes=ds.get_num_classes()) + +# Preprocessors (No augmentation)! 
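+# (editor's note: the preprocessor built below accepts `augment` and
+#  `spectrogram_augments` hooks; both are left at their None defaults here)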
+# We define here what the model reads +preprocessor = SpectrogramModelInputPreprocessors( + TimmInputs, + duration=5, + class_list=ds["train"].features["labels"].feature.names +) +ds["train"].set_transform(preprocessor) +ds["valid"].set_transform(preprocessor) +ds["test"].set_transform(preprocessor) + +# Run training +args = PyhaTrainingArguments( + working_dir="working_dir" +) +args.num_train_epochs = 20 +args.eval_steps = 20 +args.run_name = "testing" + +trainer = PyhaTrainer( + model=model, + dataset=ds, + training_args=args, + logger=None, +) +trainer.train() +trainer.evaluate(eval_dataset=ds["test"], metric_key_prefix="Soundscape") From 6e925ccc0e4132820308080c4249fbd85bcbd55b Mon Sep 17 00:00:00 2001 From: Sean Perry Date: Fri, 13 Jun 2025 12:05:02 -0700 Subject: [PATCH 049/120] Lint --- .../data_extractor/buowset_extractor.py | 40 ++++++----- model_training/dataset.py | 24 ++++--- model_training/models/__init__.py | 2 +- model_training/models/model.py | 49 ++++++++------ model_training/models/timm_model.py | 26 +++++--- model_training/preprocessors/__init__.py | 2 +- .../preprocessors/default_preprocessor.py | 40 ++++++----- model_training/train.py | 12 ++-- model_training/trainer.py | 66 +++++++++++++++++++ 9 files changed, 177 insertions(+), 84 deletions(-) diff --git a/model_training/data_extractor/buowset_extractor.py b/model_training/data_extractor/buowset_extractor.py index f7f94ae..fc023ec 100644 --- a/model_training/data_extractor/buowset_extractor.py +++ b/model_training/data_extractor/buowset_extractor.py @@ -1,32 +1,36 @@ """ - Inspired by https://github.com/UCSD-E4E/pyha-analyzer-2.0/tree/main/pyha_analyzer/extractors - Standardizes the format of the buowset dataset +Inspired by https://github.com/UCSD-E4E/pyha-analyzer-2.0/tree/main/pyha_analyzer/extractors +Standardizes the format of the buowset dataset """ + import argparse import os from datasets import load_dataset, Audio, DatasetDict from ..dataset import AudioDataset - """_summary_ - """ +"""_summary_ +""" + + def buowset_extractor( - metadata_csv, - parent_path, - output, #TODO what does output do? - validation_fold = 4, - test_fold = 3, - sr=32_000, - filepath="segment"): + metadata_csv, + parent_path, + output, # TODO what does output do? 
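+    # (editor's note: nothing in this function uses `output` yet; PATCH 051
+    # below renames it to `output_path` and passes it to `save_to_disk`,
+    # which should answer this TODO)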
+ validation_fold=4, + test_fold=3, + sr=32_000, + filepath="segment", +): ds = load_dataset(metadata_csv) - ds["audio"] = parent_path + "/" + ds[filepath] #TODO Better file path handling pls + ds["audio"] = parent_path + "/" + ds[filepath] # TODO Better file path handling pls ds = ds.cast_column("audio", Audio(sampling_rate=sr)) test_ds = ds.filter(lambda x: x["fold"] == validation_fold) valid_ds = ds.filter(lambda x: x["fold"] == test_fold) - train_ds = ds.filter(lambda x: x["fold"] != test_fold & x["fold"] != validation_fold) + train_ds = ds.filter( + lambda x: x["fold"] != test_fold & x["fold"] != validation_fold + ) - return AudioDataset(DatasetDict({ - "train": train_ds, - "valid": valid_ds, - "test_ds": test_ds - })) + return AudioDataset( + DatasetDict({"train": train_ds, "valid": valid_ds, "test_ds": test_ds}) + ) diff --git a/model_training/dataset.py b/model_training/dataset.py index a65a6a7..878ef36 100644 --- a/model_training/dataset.py +++ b/model_training/dataset.py @@ -1,18 +1,20 @@ """ - Pulled from https://github.com/UCSD-E4E/pyha-analyzer-2.0/blob/main/pyha_analyzer/dataset.py - Key idea is we define a generic AudioDataset with uniform features - - Using an Arrow Dataset from Hugging Face's dataset library because - - Cool audio features https://huggingface.co/docs/datasets/en/audio_process - - Faster than pandas, better at manging memory - - # TODO Use the default stuff from pyha-anaylzer +Pulled from https://github.com/UCSD-E4E/pyha-analyzer-2.0/blob/main/pyha_analyzer/dataset.py +Key idea is we define a generic AudioDataset with uniform features + +Using an Arrow Dataset from Hugging Face's dataset library because +- Cool audio features https://huggingface.co/docs/datasets/en/audio_process +- Faster than pandas, better at manging memory + +# TODO Use the default stuff from pyha-anaylzer """ + from datasets import DatasetDict, ClassLabel from torch.utils.data import DataLoader DEFAULT_COLUMNS = ["label", "audio"] + class AudioDataset(DatasetDict): def __init__(self, ds: DatasetDict): self.validate_format(ds) @@ -26,11 +28,13 @@ def validate_format(self, ds: DatasetDict): f"The column `{column}` is missing from dataset split `{split}`. 
Required by system" ) - def get_num_classes(self): #NOTE: Assumes all labels are mutlilabel (the extra feature note) + def get_num_classes( + self, + ): # NOTE: Assumes all labels are mutlilabel (the extra feature note) return self["train"].features["labels"].feature.num_classes def get_class_labels(self): """ Returns a new ClassLabel Object to make mapping easier between datasets """ - return ClassLabel(names=self["train"].features["labels"].names) \ No newline at end of file + return ClassLabel(names=self["train"].features["labels"].names) diff --git a/model_training/models/__init__.py b/model_training/models/__init__.py index 1b8ee1e..c54833c 100644 --- a/model_training/models/__init__.py +++ b/model_training/models/__init__.py @@ -1 +1 @@ -from timm_model import TimmModel, TimmInputs \ No newline at end of file +from timm_model import TimmModel, TimmInputs diff --git a/model_training/models/model.py b/model_training/models/model.py index 3c1c3a9..844fe6c 100644 --- a/model_training/models/model.py +++ b/model_training/models/model.py @@ -10,16 +10,20 @@ Wrapper to check to make sure everything is setup properly Required before using PyhaTrainer """ + + def has_required_inputs(): def decorator(forward): @wraps(forward) def wrapper(self, x): - assert(isinstance(x, self.input_format)) + assert isinstance(x, self.input_format) model_output = forward(self, x) - assert(isinstance(model_output, self.output_format)) + assert isinstance(model_output, self.output_format) return model_output + return wrapper + return decorator @@ -36,12 +40,12 @@ class ModelOutput(ABC): """ def __init__( - self, - logits: np.array, - embeddings: np.array, - labels: np.array | None = None, - loss: np.array | None = None - ): + self, + logits: np.array, + embeddings: np.array, + labels: np.array | None = None, + loss: np.array | None = None, + ): self.embeddings = embeddings self.logits = logits self.loss = loss @@ -52,16 +56,15 @@ def to_hugging_face(self): "predictions": self.logits, "label_ids": [self.labels], } - + @classmethod def concat(list_of_outputs: list): - return ModelOutput( - logits = torch.vstack([out.logits for out in list_of_outputs]), - embeddings = torch.vstack([out.embeddings for out in list_of_outputs]), - loss = torch.vstack([out.loss for out in list_of_outputs]), - labels = torch.vstack([out.labels for out in list_of_outputs]), + return ModelOutput( + logits=torch.vstack([out.logits for out in list_of_outputs]), + embeddings=torch.vstack([out.embeddings for out in list_of_outputs]), + loss=torch.vstack([out.loss for out in list_of_outputs]), + labels=torch.vstack([out.labels for out in list_of_outputs]), ) - class ModelInput(ABC): @@ -76,9 +79,9 @@ class ModelInput(ABC): """ def __init__( - self, + self, labels: np.array, - waveform: np.array | None = None, + waveform: np.array | None = None, spectrogram: np.array | None = None, ): self.waveform = waveform @@ -90,9 +93,12 @@ def to_tensor(self, device="cpu"): self.spectrogram = Tensor(self.spectrogram, device=device) self.labels = Tensor(self.labels, device=device) -class Model(ABC, nn.Module, BaseModel): +""" +BaseModel Class for Whoot +""" +class Model(ABC, BaseModel): # TODO Define required class intance variables - # Such as cirteron etc. + # Such as cirteron etc. 
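     # (editor's sketch: the intended contract is presumably something like
     #  `self.criterion = nn.BCEWithLogitsLoss()` set alongside the
     #  input/output formats below -- an assumption, not committed code)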
def __init__(self, *args, **kwargs): self.input_format = ModelInput self.output_format = ModelOutput @@ -110,6 +116,7 @@ def __init__(self, *args, **kwargs): Returns embedding: np.array, some embedding vector representing the input data """ + def get_embeddings(self, x: ModelInput) -> np.array: return self.forward(x).embeddings @@ -127,15 +134,15 @@ def get_embeddings(self, x: ModelInput) -> np.array: Returns: ModelOutput: dict, a dictionary like object that describes """ + @abstractmethod @has_required_inputs def forward(self, x: ModelInput) -> ModelOutput: pass - """ Notes on design for the future - Should model implement a way to save/load model to/form disk - """ \ No newline at end of file + """ diff --git a/model_training/models/timm_model.py b/model_training/models/timm_model.py index ba5ab55..41baf80 100644 --- a/model_training/models/timm_model.py +++ b/model_training/models/timm_model.py @@ -1,4 +1,4 @@ -import timm +import timm from torch import nn, Tensor from model import Model, ModelInput, ModelOutput, has_required_inputs @@ -13,8 +13,10 @@ Great repo for models, but currently using this for demoing pipeline """ + + class TimmInputs(ModelInput): - def __init__(self, labels, waveform = None, spectrogram = None, device="cpu"): + def __init__(self, labels, waveform=None, spectrogram=None, device="cpu"): # # Can use inputs to verify correct shape for upstream model # assert spectrogram.shape[1:] == (1, 100, 100) super().__init__(labels, waveform, spectrogram) @@ -23,14 +25,23 @@ def __init__(self, labels, waveform = None, spectrogram = None, device="cpu"): class TimmModel(nn.Module, Model): - def __init__(self, timm_model='resnet34', pretrained=True, in_chans=1, num_classes=6, loss=None): + def __init__( + self, + timm_model="resnet34", + pretrained=True, + in_chans=1, + num_classes=6, + loss=None, + ): super().__init__() self.input_format = TimmInputs self.output_format = ModelOutput assert num_classes > 0 - self.backbone = timm.create_model(timm_model, pretrained=pretrained, in_chans=in_chans) + self.backbone = timm.create_model( + timm_model, pretrained=pretrained, in_chans=in_chans + ) # Unsure if 1000 is default for all models. 
Need to check this self.linear = nn.Linear(1000, num_classes) @@ -46,9 +57,4 @@ def forward(self, x: TimmInputs) -> ModelOutput: logits = self.linear(embedd) loss = self.loss(logits, x.labels) - return ModelOutput( - logits=logits, - embeddings=embedd, - loss=loss, - labels = x.labels - ) + return ModelOutput(logits=logits, embeddings=embedd, loss=loss, labels=x.labels) diff --git a/model_training/preprocessors/__init__.py b/model_training/preprocessors/__init__.py index 6e03d90..dd09b10 100644 --- a/model_training/preprocessors/__init__.py +++ b/model_training/preprocessors/__init__.py @@ -1 +1 @@ -from default_preprocessor import SpectrogramModelInputPreprocessors \ No newline at end of file +from default_preprocessor import SpectrogramModelInputPreprocessors diff --git a/model_training/preprocessors/default_preprocessor.py b/model_training/preprocessors/default_preprocessor.py index 1a0f682..58691c0 100644 --- a/model_training/preprocessors/default_preprocessor.py +++ b/model_training/preprocessors/default_preprocessor.py @@ -7,27 +7,35 @@ Returns: _type_: _description_ """ + + class SpectrogramModelInputPreprocessors(MelSpectrogramPreprocessors): def __init__( - self, - ModelInput:ModelInput, - duration=5, - augment=None, - spectrogram_augments=None, - class_list=..., - n_fft=2048, - hop_length=256, - power=2, - n_mels=256, - dataset_ref=None, - ): + self, + ModelInput: ModelInput, + duration=5, + augment=None, + spectrogram_augments=None, + class_list=..., + n_fft=2048, + hop_length=256, + power=2, + n_mels=256, + dataset_ref=None, + ): super().__init__( - duration, augment, spectrogram_augments, - class_list, n_fft, hop_length, power, n_mels, - dataset_ref + duration, + augment, + spectrogram_augments, + class_list, + n_fft, + hop_length, + power, + n_mels, + dataset_ref, ) self.ModelInput = ModelInput def __call__(self, batch): batch = super().__call__(batch) - return self.ModelInput(labels=batch["labels"], spectrogram=batch["audio"]) \ No newline at end of file + return self.ModelInput(labels=batch["labels"], spectrogram=batch["audio"]) diff --git a/model_training/train.py b/model_training/train.py index aacc60d..c91d7cb 100644 --- a/model_training/train.py +++ b/model_training/train.py @@ -6,7 +6,9 @@ # Extract the dataset -ds = buowset_extractor(metadata_csv="data.csv", parent_path="data_parent_path/data/", output=None) +ds = buowset_extractor( + metadata_csv="data.csv", parent_path="data_parent_path/data/", output=None +) # Create the model model = TimmModel(num_classes=ds.get_num_classes()) @@ -14,18 +16,14 @@ # Preprocessors (No augmentation)! 
# We define here what the model reads preprocessor = SpectrogramModelInputPreprocessors( - TimmInputs, - duration=5, - class_list=ds["train"].features["labels"].feature.names + TimmInputs, duration=5, class_list=ds["train"].features["labels"].feature.names ) ds["train"].set_transform(preprocessor) ds["valid"].set_transform(preprocessor) ds["test"].set_transform(preprocessor) # Run training -args = PyhaTrainingArguments( - working_dir="working_dir" -) +args = PyhaTrainingArguments(working_dir="working_dir") args.num_train_epochs = 20 args.eval_steps = 20 args.run_name = "testing" diff --git a/model_training/trainer.py b/model_training/trainer.py index e69de29..ae8932c 100644 --- a/model_training/trainer.py +++ b/model_training/trainer.py @@ -0,0 +1,66 @@ +# """ +# The Trainer holds the main training loop, validation loop, and can run evaluation + +# There are some off the shelf options, such as the hugging face Trainer +# Which is in use by https://github.com/UCSD-E4E/pyha-analyzer-2.0/ + +# However, It can be difficult to fit input to perfectly +# match what the hugging face trainer expects +# And we are unlikely to use all the bells and whistles offered by hugging face. + +# So this SimpleTrainer can get the job spefifically for whoot done +# With fewer bells and whistles +# This should hopefully make debugging easier in the future and +# keep the repo focused on whoot applications +# """ + +# import torch +# from transformers import TrainingArguments + +# from .models.model import Model, ModelOutput +# from .dataset import AudioDataset +# from pyha_analyzer.metrics.classification_metrics import AudioClassificationMetrics + + +# class WhootTrainingArguments(TrainingArguments): +# def __init__(self, working_dir): +# super().__init__(working_dir) +# self.logging_steps = 10 +# self.eval_steps = 100 +# self.per_device_train_batch_size = 64 +# self.per_device_eval_batch_size = 64 +# self.dataloader_num_workers = 4 +# self.eval_accumulation_steps = 10 + +# class WhootTrainer(): +# def __init__( +# self, +# model: Model, +# dataset: AudioDataset, +# metrics: AudioClassificationMetrics = None, +# training_args: WhootTrainingArguments = None, +# data_collator=None, +# preprocessor=None, +# ): + +# self.model = model +# self.dataset = dataset +# self.dataloaders = self._get_dataloaders(dataset) +# self.metrics = metrics + +# def run_metrics(self, output_batches:list[ModelOutput]): +# out = ModelOutput.concat(output_batches) +# metrics = self.metrics(out.to_hugging_face()) +# print(metrics) + +# def run_step(self, batch, training=True): + + +# def run_loop(self, split): +# for i in range(): + +# def train(self): + +# def evaluate(self): + + From 8f8fb1836cc5fed2c7207a85cc8a96ed4f877df9 Mon Sep 17 00:00:00 2001 From: Sean1572 Date: Fri, 13 Jun 2025 13:05:50 -0700 Subject: [PATCH 050/120] feat: handle numpy conflict --- .python-version | 1 + pyproject.toml | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) create mode 100644 .python-version diff --git a/.python-version b/.python-version new file mode 100644 index 0000000..e4fba21 --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.12 diff --git a/pyproject.toml b/pyproject.toml index f120fee..4853d3a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,10 +3,10 @@ name = "whoot" version = "0.0.2.dev0" description = "Tools for capturing, analyzing, and parsing audio data" readme = "README.md" -requires-python = ">=3.10" +requires-python = ">= 3.10.0, < 3.13.0" dependencies = [ "librosa>=0.10.2.post1", - "numpy>=2.2.6", + "numba==0.61.0", 
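+    # (editor's note: pinning numba is presumably how the numpy conflict in
+    # this commit's subject is handled; numba releases constrain the numpy
+    # versions they support)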
"pandas>=2.3.0", "pydub>=0.25.1", "scikit-learn>=1.7.0", From f831e02b34d1062caf0b0e9f4adfb0bdedbc337a Mon Sep 17 00:00:00 2001 From: Sean1572 Date: Fri, 13 Jun 2025 16:57:23 -0700 Subject: [PATCH 051/120] Moves files to src and fix bugs Starting to test system, there were some import statement bugs + running the pipeline so I had to move the files around. Hence the large commit --- .gitignore | 5 +- README.md | 5 +- .../data_extractor/buowset_extractor.py | 36 --------- model_training/models/__init__.py | 1 - model_training/preprocessors/__init__.py | 1 - model_training/train.py | 38 --------- pyproject.toml | 7 +- .../README.md | 0 whoot_model_training/config.yml | 3 + whoot_model_training/train.py | 65 ++++++++++++++++ .../data_augmentation.py | 0 .../data_extractor/__init__.py | 0 .../data_extractor/buowset_extractor.py | 77 ++++++++++++++++++ .../whoot_model_training}/dataset.py | 10 ++- .../whoot_model_training/models/__init__.py | 1 + .../whoot_model_training}/models/model.py | 18 ++--- .../models/timm_model.py | 12 +-- .../preprocessors/__init__.py | 2 + .../preprocessors/default_preprocessor.py | 8 +- .../spectrogram_preprocessors.py | 78 +++++++++++++++++++ .../whoot_model_training}/trainer.py | 0 21 files changed, 267 insertions(+), 100 deletions(-) delete mode 100644 model_training/data_extractor/buowset_extractor.py delete mode 100644 model_training/models/__init__.py delete mode 100644 model_training/preprocessors/__init__.py delete mode 100644 model_training/train.py rename {model_training => whoot_model_training}/README.md (100%) create mode 100644 whoot_model_training/config.yml create mode 100644 whoot_model_training/train.py rename {model_training => whoot_model_training/whoot_model_training}/data_augmentation.py (100%) rename {model_training => whoot_model_training/whoot_model_training}/data_extractor/__init__.py (100%) create mode 100644 whoot_model_training/whoot_model_training/data_extractor/buowset_extractor.py rename {model_training => whoot_model_training/whoot_model_training}/dataset.py (86%) create mode 100644 whoot_model_training/whoot_model_training/models/__init__.py rename {model_training => whoot_model_training/whoot_model_training}/models/model.py (91%) rename {model_training => whoot_model_training/whoot_model_training}/models/timm_model.py (84%) create mode 100644 whoot_model_training/whoot_model_training/preprocessors/__init__.py rename {model_training => whoot_model_training/whoot_model_training}/preprocessors/default_preprocessor.py (71%) create mode 100644 whoot_model_training/whoot_model_training/preprocessors/spectrogram_preprocessors.py rename {model_training => whoot_model_training/whoot_model_training}/trainer.py (100%) diff --git a/.gitignore b/.gitignore index 260249f..877de52 100644 --- a/.gitignore +++ b/.gitignore @@ -207,4 +207,7 @@ pip-selfcheck.json # End of https://www.toptal.com/developers/gitignore/api/python,venv,visualstudiocode uv.lock -.ruff_cache \ No newline at end of file +.ruff_cache + +# Data Folders +data \ No newline at end of file diff --git a/README.md b/README.md index e10d168..8a729fc 100644 --- a/README.md +++ b/README.md @@ -21,8 +21,11 @@ Current support optional dependency collections include - `cpu`: Installs torch and torchvision for CPU use only - `cu128`: Installs torch and torchvision with Cuda 12.8 Binaries -- `model_training`: Required for running model_training, make sure to add either `cpu` or `cu128` +- `model_training`: Required for running scripts in `whoot/model_training`, make sure to add either `cpu` 
or `cu128`
+
+## Usage
+
+Once the environment is activated, you should be able to run `python path/to/script.py` for any of the whoot scripts. If a script reports that a package is missing, you might not be using the virtual environment.
 
 ## Developer Notes
 
diff --git a/model_training/data_extractor/buowset_extractor.py b/model_training/data_extractor/buowset_extractor.py
deleted file mode 100644
index fc023ec..0000000
--- a/model_training/data_extractor/buowset_extractor.py
+++ /dev/null
@@ -1,36 +0,0 @@
-"""
-Inspired by https://github.com/UCSD-E4E/pyha-analyzer-2.0/tree/main/pyha_analyzer/extractors
-Standardizes the format of the buowset dataset
-"""
-
-import argparse
-import os
-from datasets import load_dataset, Audio, DatasetDict
-from ..dataset import AudioDataset
-
-"""_summary_
-"""
-
-
-def buowset_extractor(
-    metadata_csv,
-    parent_path,
-    output,  # TODO what does output do?
-    validation_fold=4,
-    test_fold=3,
-    sr=32_000,
-    filepath="segment",
-):
-    ds = load_dataset(metadata_csv)
-    ds["audio"] = parent_path + "/" + ds[filepath]  # TODO Better file path handling pls
-    ds = ds.cast_column("audio", Audio(sampling_rate=sr))
-
-    test_ds = ds.filter(lambda x: x["fold"] == validation_fold)
-    valid_ds = ds.filter(lambda x: x["fold"] == test_fold)
-    train_ds = ds.filter(
-        lambda x: x["fold"] != test_fold & x["fold"] != validation_fold
-    )
-
-    return AudioDataset(
-        DatasetDict({"train": train_ds, "valid": valid_ds, "test_ds": test_ds})
-    )
diff --git a/model_training/models/__init__.py b/model_training/models/__init__.py
deleted file mode 100644
index c54833c..0000000
--- a/model_training/models/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from timm_model import TimmModel, TimmInputs
diff --git a/model_training/preprocessors/__init__.py b/model_training/preprocessors/__init__.py
deleted file mode 100644
index dd09b10..0000000
--- a/model_training/preprocessors/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from default_preprocessor import SpectrogramModelInputPreprocessors
diff --git a/model_training/train.py b/model_training/train.py
deleted file mode 100644
index c91d7cb..0000000
--- a/model_training/train.py
+++ /dev/null
@@ -1,38 +0,0 @@
-from pyha_analyzer import PyhaTrainer, PyhaTrainingArguments
-
-from .data_extractor import buowset_extractor
-from .models import TimmModel, TimmInputs
-from .preprocessors import SpectrogramModelInputPreprocessors
-
-
-# Extract the dataset
-ds = buowset_extractor(
-    metadata_csv="data.csv", parent_path="data_parent_path/data/", output=None
-)
-
-# Create the model
-model = TimmModel(num_classes=ds.get_num_classes())
-
-# Preprocessors (No augmentation)!
-# We define here what the model reads -preprocessor = SpectrogramModelInputPreprocessors( - TimmInputs, duration=5, class_list=ds["train"].features["labels"].feature.names -) -ds["train"].set_transform(preprocessor) -ds["valid"].set_transform(preprocessor) -ds["test"].set_transform(preprocessor) - -# Run training -args = PyhaTrainingArguments(working_dir="working_dir") -args.num_train_epochs = 20 -args.eval_steps = 20 -args.run_name = "testing" - -trainer = PyhaTrainer( - model=model, - dataset=ds, - training_args=args, - logger=None, -) -trainer.train() -trainer.evaluate(eval_dataset=ds["test"], metric_key_prefix="Soundscape") diff --git a/pyproject.toml b/pyproject.toml index 4853d3a..2cfc805 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,12 +29,17 @@ model_training = [ "pyha-analyzer", ] +notebooks = [ + "ipykernel>=6.29.5", + "ipywidgets>=8.1.6", +] + [packages.index] cu128 = "https://download.pytorch.org/whl/cu128" [tool.setuptools] -packages = ["make_model", "assess_birdnet", "model_training"] +packages = ["make_model", "assess_birdnet", "whoot_model_training"] [tool.uv.sources] pyha-analyzer = { git = "https://github.com/UCSD-E4E/pyha-analyzer-2.0.git", branch = "support_whoot" } diff --git a/model_training/README.md b/whoot_model_training/README.md similarity index 100% rename from model_training/README.md rename to whoot_model_training/README.md diff --git a/whoot_model_training/config.yml b/whoot_model_training/config.yml new file mode 100644 index 0000000..4fba81e --- /dev/null +++ b/whoot_model_training/config.yml @@ -0,0 +1,3 @@ +metadata_csv: data/burrowing_owl_dataset/metadata.csv +data_path: data/burrowing_owl_dataset/audio +hf_cache_path: data/burrowing_owl_dataset/cache/metadata.hf \ No newline at end of file diff --git a/whoot_model_training/train.py b/whoot_model_training/train.py new file mode 100644 index 0000000..5d3fdda --- /dev/null +++ b/whoot_model_training/train.py @@ -0,0 +1,65 @@ + + +import argparse +import yaml + +from pyha_analyzer import PyhaTrainer, PyhaTrainingArguments + +from whoot_model_training.data_extractor import buowset_extractor +from whoot_model_training.models import TimmModel, TimmInputs +from whoot_model_training.preprocessors import SpectrogramModelInputPreprocessors + +def parse_config(config_path): + with open(config_path, 'r') as f: + config = yaml.safe_load(f) + return config + +def train(config_path): + config = parse_config(config_path) + + # Extract the dataset + ds = buowset_extractor( + metadata_csv=config["metadata_csv"], + parent_path=config["data_path"], + output_path=config["hf_cache_path"] + ) + + # Create the model + model = TimmModel(num_classes=ds.get_num_classes()) + + # Preprocessors (No augmentation)! 
+ # We define here what the model reads + preprocessor = SpectrogramModelInputPreprocessors( + TimmInputs, duration=3, class_list=ds.get_class_labels() + ) + + ds["train"].set_transform(preprocessor) + ds["valid"].set_transform(preprocessor) + ds["test"].set_transform(preprocessor) + + # Run training + args = PyhaTrainingArguments(working_dir="working_dir") + args.num_train_epochs = 2 + args.eval_steps = 20 + args.run_name = "testing" + args.report_to="none" #Blocks wandb + + trainer = PyhaTrainer( + model=model, + dataset=ds, + training_args=args, + logger=None, + data_collator=lambda x: x + ) + trainer.train() + trainer.evaluate(eval_dataset=ds["test"], metric_key_prefix="Soundscape") + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description='Input config path' + ) + parser.add_argument('config', type=str, + help='Path to config.yml') + args = parser.parse_args() + train(args.config) \ No newline at end of file diff --git a/model_training/data_augmentation.py b/whoot_model_training/whoot_model_training/data_augmentation.py similarity index 100% rename from model_training/data_augmentation.py rename to whoot_model_training/whoot_model_training/data_augmentation.py diff --git a/model_training/data_extractor/__init__.py b/whoot_model_training/whoot_model_training/data_extractor/__init__.py similarity index 100% rename from model_training/data_extractor/__init__.py rename to whoot_model_training/whoot_model_training/data_extractor/__init__.py diff --git a/whoot_model_training/whoot_model_training/data_extractor/buowset_extractor.py b/whoot_model_training/whoot_model_training/data_extractor/buowset_extractor.py new file mode 100644 index 0000000..c11be07 --- /dev/null +++ b/whoot_model_training/whoot_model_training/data_extractor/buowset_extractor.py @@ -0,0 +1,77 @@ +""" +Inspired by https://github.com/UCSD-E4E/pyha-analyzer-2.0/tree/main/pyha_analyzer/extractors +Standardizes the format of the buowset dataset +""" + +import argparse +import os + +import numpy as np +from datasets import ( + load_dataset, + Audio, + DatasetDict, + load_from_disk, + ClassLabel, + Sequence +) +from ..dataset import AudioDataset +from pyha_analyzer.extractors.birdset import one_hot_encode_ds_wrapper + + +def one_hot_encode(row, classes): + one_hot = np.zeros(len(classes)) + one_hot[row["labels"]] = 1 + row["labels"] = np.array(one_hot, dtype=float) + return row + +"""_summary_ +""" +def buowset_extractor( + metadata_csv, + parent_path, + output_path, # TODO what does output do? 
+ validation_fold=4, + test_fold=3, + sr=32_000, + filepath="segment", +): + + # if os.path.exists(output_path): + # ds = load_from_disk(output_path) + # return AudioDataset(ds) + + # Hugging face by default defines a train split + ds = load_dataset("csv", data_files=metadata_csv)["train"] + ds = ds.rename_column("label", "labels") #Convention here is labels + + # Convert to a uniform one_hot encoding for classes + ds = ds.class_encode_column("labels") + class_list = ds.features["labels"].names + mutlilabel_class_label = Sequence(ClassLabel(names=class_list)) + ds = ds.map( + lambda row: one_hot_encode(row, class_list) + ).cast_column("labels", mutlilabel_class_label) + + # Get audio into uniform format + + ds = ds.add_column( + "audio", + [os.path.join(parent_path, file) for file in ds[filepath]] + ) + + ds = ds.cast_column("audio", Audio(sampling_rate=sr)) + + # Create splits of the data + test_ds = ds.filter(lambda x: x["fold"] == validation_fold) + valid_ds = ds.filter(lambda x: x["fold"] == test_fold) + train_ds = ds.filter( + lambda x: x["fold"] != test_fold & x["fold"] != validation_fold + ) + ds = AudioDataset( + DatasetDict({"train": train_ds, "valid": valid_ds, "test": test_ds}) + ) + + ds.save_to_disk(output_path) + + return ds diff --git a/model_training/dataset.py b/whoot_model_training/whoot_model_training/dataset.py similarity index 86% rename from model_training/dataset.py rename to whoot_model_training/whoot_model_training/dataset.py index 878ef36..f2e4290 100644 --- a/model_training/dataset.py +++ b/whoot_model_training/whoot_model_training/dataset.py @@ -12,7 +12,7 @@ from datasets import DatasetDict, ClassLabel from torch.utils.data import DataLoader -DEFAULT_COLUMNS = ["label", "audio"] +DEFAULT_COLUMNS = ["labels", "audio"] class AudioDataset(DatasetDict): @@ -32,9 +32,15 @@ def get_num_classes( self, ): # NOTE: Assumes all labels are mutlilabel (the extra feature note) return self["train"].features["labels"].feature.num_classes + + """ + Legacy code had the method name `get_number_species` + """ + def get_number_species(self): + return self.get_num_classes() def get_class_labels(self): """ Returns a new ClassLabel Object to make mapping easier between datasets """ - return ClassLabel(names=self["train"].features["labels"].names) + return ClassLabel(names=self["train"].features["labels"].feature.names) diff --git a/whoot_model_training/whoot_model_training/models/__init__.py b/whoot_model_training/whoot_model_training/models/__init__.py new file mode 100644 index 0000000..c91d3c0 --- /dev/null +++ b/whoot_model_training/whoot_model_training/models/__init__.py @@ -0,0 +1 @@ +from .timm_model import TimmModel, TimmInputs diff --git a/model_training/models/model.py b/whoot_model_training/whoot_model_training/models/model.py similarity index 91% rename from model_training/models/model.py rename to whoot_model_training/whoot_model_training/models/model.py index 844fe6c..8221074 100644 --- a/model_training/models/model.py +++ b/whoot_model_training/whoot_model_training/models/model.py @@ -41,10 +41,10 @@ class ModelOutput(ABC): def __init__( self, - logits: np.array, - embeddings: np.array, - labels: np.array | None = None, - loss: np.array | None = None, + logits: np.ndarray, + embeddings: np.ndarray, + labels: np.ndarray | None = None, + loss: np.ndarray | None = None, ): self.embeddings = embeddings self.logits = logits @@ -80,9 +80,9 @@ class ModelInput(ABC): def __init__( self, - labels: np.array, - waveform: np.array | None = None, - spectrogram: np.array | None = 
None, + labels: np.ndarray, + waveform: np.ndarray|None = None, + spectrogram: np.ndarray |None = None, ): self.waveform = waveform self.spectrogram = spectrogram @@ -96,7 +96,7 @@ def to_tensor(self, device="cpu"): """ BaseModel Class for Whoot """ -class Model(ABC, BaseModel): +class Model(BaseModel): # TODO Define required class intance variables # Such as cirteron etc. def __init__(self, *args, **kwargs): @@ -136,7 +136,7 @@ def get_embeddings(self, x: ModelInput) -> np.array: """ @abstractmethod - @has_required_inputs + @has_required_inputs() def forward(self, x: ModelInput) -> ModelOutput: pass diff --git a/model_training/models/timm_model.py b/whoot_model_training/whoot_model_training/models/timm_model.py similarity index 84% rename from model_training/models/timm_model.py rename to whoot_model_training/whoot_model_training/models/timm_model.py index 41baf80..38fccad 100644 --- a/model_training/models/timm_model.py +++ b/whoot_model_training/whoot_model_training/models/timm_model.py @@ -1,7 +1,7 @@ import timm from torch import nn, Tensor -from model import Model, ModelInput, ModelOutput, has_required_inputs +from .model import Model, ModelInput, ModelOutput, has_required_inputs """ Wrapper around the timms model zoo @@ -51,10 +51,10 @@ def __init__( else: self.loss = nn.BCEWithLogitsLoss() - @has_required_inputs - def forward(self, x: TimmInputs) -> ModelOutput: - embedd = self.backbone(x.spectrogram) + @has_required_inputs() + def forward(self, data: TimmInputs) -> ModelOutput: + embedd = self.backbone(data.spectrogram) logits = self.linear(embedd) - loss = self.loss(logits, x.labels) + loss = self.loss(logits, data.labels) - return ModelOutput(logits=logits, embeddings=embedd, loss=loss, labels=x.labels) + return ModelOutput(logits=logits, embeddings=embedd, loss=loss, labels=data.labels) diff --git a/whoot_model_training/whoot_model_training/preprocessors/__init__.py b/whoot_model_training/whoot_model_training/preprocessors/__init__.py new file mode 100644 index 0000000..66aa562 --- /dev/null +++ b/whoot_model_training/whoot_model_training/preprocessors/__init__.py @@ -0,0 +1,2 @@ +from .default_preprocessor import SpectrogramModelInputPreprocessors +from .spectrogram_preprocessors import BuowMelSpectrogramPreprocessors diff --git a/model_training/preprocessors/default_preprocessor.py b/whoot_model_training/whoot_model_training/preprocessors/default_preprocessor.py similarity index 71% rename from model_training/preprocessors/default_preprocessor.py rename to whoot_model_training/whoot_model_training/preprocessors/default_preprocessor.py index 58691c0..c1efd12 100644 --- a/model_training/preprocessors/default_preprocessor.py +++ b/whoot_model_training/whoot_model_training/preprocessors/default_preprocessor.py @@ -1,5 +1,5 @@ -from pyha_analyzer.preprocessors import MelSpectrogramPreprocessors -from models.model import ModelInput +from .spectrogram_preprocessors import BuowMelSpectrogramPreprocessors +from ..models.model import ModelInput """_summary_ @@ -9,7 +9,7 @@ """ -class SpectrogramModelInputPreprocessors(MelSpectrogramPreprocessors): +class SpectrogramModelInputPreprocessors(BuowMelSpectrogramPreprocessors): def __init__( self, ModelInput: ModelInput, @@ -38,4 +38,4 @@ def __init__( def __call__(self, batch): batch = super().__call__(batch) - return self.ModelInput(labels=batch["labels"], spectrogram=batch["audio"]) + return {"data": [self.ModelInput(labels=batch["labels"], spectrogram=batch["audio"])]} diff --git 
a/whoot_model_training/whoot_model_training/preprocessors/spectrogram_preprocessors.py b/whoot_model_training/whoot_model_training/preprocessors/spectrogram_preprocessors.py
new file mode 100644
index 0000000..d3c4356
--- /dev/null
+++ b/whoot_model_training/whoot_model_training/preprocessors/spectrogram_preprocessors.py
@@ -0,0 +1,78 @@
+"""
+Pulled from pyha_analyzer/preprocessors/spectogram_preprocessors.py
+"""
+
+import librosa
+import numpy as np
+import torchvision.transforms as transforms
+
+from pyha_analyzer.preprocessors import PreProcessorBase
+
+class BuowMelSpectrogramPreprocessors(PreProcessorBase):
+    def __init__(
+        self,
+        duration=5,
+        augment=None,
+        spectrogram_augments=None,
+        class_list=[],
+        n_fft=2048,
+        hop_length=256,
+        power=2.0,
+        n_mels=256,
+        dataset_ref=None,
+    ):
+        self.duration = duration
+        self.augment = augment
+        self.spectrogram_augments = spectrogram_augments
+
+        # Below parameter defaults from https://arxiv.org/pdf/2403.10380 pg 25
+        self.n_fft=n_fft
+        self.hop_length=hop_length
+        self.power=power
+        self.n_mels=n_mels
+
+        super().__init__(name="MelSpectrogramPreprocessor")
+
+    def __call__(self, batch):
+        new_audio = []
+        new_labels = []
+        for item_idx in range(len(batch["audio"])):
+            label = batch["labels"][item_idx]
+            y, sr = librosa.load(path=batch["audio"][item_idx]["path"])
+            start = 0
+
+            # Handle out of bound issues
+            end_sr = int(start * sr) + int(sr * self.duration)
+            if y.shape[-1] <= end_sr:
+                y = np.pad(y, end_sr - y.shape[-1])
+
+            # Audio Based Augmentations
+            if self.augment != None:
+                y, label = self.augment(y, sr, label)
+
+
+            pillow_transforms = transforms.ToPILImage()
+
+            mels = np.array(
+                pillow_transforms(
+                    librosa.feature.melspectrogram(
+                        y=y[int(start * sr) : end_sr], sr=sr,
+                        n_fft=self.n_fft,
+                        hop_length=self.hop_length,
+                        power=self.power,
+                        n_mels=self.n_mels,
+                    )
+                ),
+                np.float32)[np.newaxis, ::] / 255
+
+            if self.spectrogram_augments is not None:
+                mels = self.spectrogram_augments(mels)
+
+            # print(mels.shape, int(start * sr), y.shape)
+            new_audio.append(mels)
+            new_labels.append(label)
+
+        batch["audio"] = new_audio
+        batch["labels"] = np.array(new_labels, dtype=np.float32)
+
+        return batch
\ No newline at end of file
diff --git a/model_training/trainer.py b/whoot_model_training/whoot_model_training/trainer.py
similarity index 100%
rename from model_training/trainer.py
rename to whoot_model_training/whoot_model_training/trainer.py

From 2639ee5bcc37df83f647813ece5e012bc68c4b75 Mon Sep 17 00:00:00 2001
From: Katie Garwood
Date: Mon, 16 Jun 2025 11:51:49 -0700
Subject: [PATCH 052/120] condense older versions of creating buowset segments

There were two versions I worked on. One chunked the 2017 wavs
into 3s chunks and then saved them out if there was a human buow
label for that chunk; it could also create random 3s chunks from
the rest for the 'no_buow' class. The other script was a bit
better because it chunked based on the timestamps of the label,
optionally adding some extra buffer before and after in case the
labeled duration was a bit shorter than the actual sound. Now
it's just the second script.
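For reference, the timestamp-based approach works roughly like
this (a minimal sketch, assuming pydub; the function name and the
0.5 s default buffer are illustrative, not the script's actual
values):

    from pydub import AudioSegment

    def cut_labeled_segment(wav_path, offset_s, duration_s, buffer_s=0.5):
        # Load the whole recording, then slice around the human label,
        # padded by the buffer on both sides (pydub slices are in ms).
        audio = AudioSegment.from_wav(wav_path)
        start_ms = max(0, int((offset_s - buffer_s) * 1000))
        end_ms = min(len(audio), int((offset_s + duration_s + buffer_s) * 1000))
        return audio[start_ms:end_ms]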
--- ...egment_2017_data.py => label_2017_wavs.py} | 0 .../tests/segment_labeled_2017_data.py | 167 ------------------ 2 files changed, 167 deletions(-) rename create_dataset/tests/{params_segment_2017_data.py => label_2017_wavs.py} (100%) delete mode 100644 create_dataset/tests/segment_labeled_2017_data.py diff --git a/create_dataset/tests/params_segment_2017_data.py b/create_dataset/tests/label_2017_wavs.py similarity index 100% rename from create_dataset/tests/params_segment_2017_data.py rename to create_dataset/tests/label_2017_wavs.py diff --git a/create_dataset/tests/segment_labeled_2017_data.py b/create_dataset/tests/segment_labeled_2017_data.py deleted file mode 100644 index 68ff47f..0000000 --- a/create_dataset/tests/segment_labeled_2017_data.py +++ /dev/null @@ -1,167 +0,0 @@ -"""Create human labeled audio segments. - -Using a CSV with human labels across a large dataset, we can -find the segments in the audio files that correspond to a -burrowing owl call as labeled by a human labeler. We can then -segment these audio chunks into a folder so that we can use -them to easily train other models. We can also do the same -for the rest of the data to obtain segments with no bird -call labels, to provide another class in the same domain -as our bird vocalizations. As there are significantly more -negatives than positives, we can choose if we'd like to get -the same number output or select a higher or lower amount. - -Example: - - $ python segment_labeled_2017_data.py /path/to/human_labels.csv \ - /path/to/directory/of/wavs/ /path/to/directory/output/ - -""" - -import argparse -import os -import pandas as pd -import librosa -from pydub import AudioSegment - - -def create_bird_segments(labels, wavs, output): - """Create human labeled dataframes. - - Main script to create csvs of human labeled data for each - wav file of interest. - - Args: - labels (str): The path to human labeled csv. - wavs (str): The path to all audio files. - output (str): The path to directory where each csv will - output (1 for each wav). 
- - """ - os.makedirs(output, exist_ok=True) - - scored_data = pd.read_csv(labels) - output = output + "bird_sounds/" - os.makedirs(output, exist_ok=True) - - for audio_file in os.listdir(wavs): - if audio_file.endswith('.wav'): - audio_path = os.path.join(wavs, audio_file) - - try: - time_series, sample_rate = librosa.load(audio_path, sr=None) - audio_duration = librosa.get_duration(y=time_series, - sr=sample_rate) - except Exception as err: - print(f"Error processing {audio_file}: {err}") - continue - - total_chunks = int(audio_duration // 3) + 1 - chunks_data = { - 'Chunk Start': [i * 3 for i in range(total_chunks)], - 'Chunk End': [(i + 1) * 3 for i in range(total_chunks)], - 'Label': ['no'] * total_chunks - } - chunks_df = pd.DataFrame(chunks_data) - - filtered_data = scored_data[scored_data['IN FILE'] == audio_file] - bird_sound = AudioSegment.from_wav(audio_path) - segment_index = 0 - for _, row in filtered_data.iterrows(): - if row['TOP1MATCH'] != 'null': - start_time = float(row['OFFSET']) - end_time = start_time + float(row['DURATION']) - - for i in range(len(chunks_df)): - chunk_start = chunks_df.loc[i, 'Chunk Start'] - chunk_end = chunks_df.loc[i, 'Chunk End'] - if (start_time < chunk_end and end_time > chunk_start): - chunk_start = chunk_start * 1000 - chunk_end = chunk_end * 1000 - segment = bird_sound[chunk_start:chunk_end] - output_file = os.path.join( - output, f'{os.path.splitext(audio_file)[0]}_segment_{segment_index}.wav' - ) - segment.export(output_file, format='wav') - segment_index += 1 - - print("Processing complete!") - -def create_no_bird_segments(labels, wavs, output): - """Create no bird call audio segments. - - """ - os.makedirs(output, exist_ok=True) - - scored_data = pd.read_csv(labels) - output = output + "no_bird_sounds/" - os.makedirs(output, exist_ok=True) - - for audio_file in os.listdir(wavs): - if audio_file.endswith('.wav'): - audio_path = os.path.join(wavs, audio_file) - - try: - time_series, sample_rate = librosa.load(audio_path, sr=None) - audio_duration = librosa.get_duration(y=time_series, sr=sample_rate) - except Exception as err: - print(f"Error processing {audio_file}: {err}") - continue - - total_chunks = int(audio_duration // 3) + 1 - chunks_data = { - 'Chunk Start': [i * 3 for i in range(total_chunks)], - 'Chunk End': [(i + 1) * 3 for i in range(total_chunks)], - 'Label': ['no'] * total_chunks - } - chunks_df = pd.DataFrame(chunks_data) - - filtered_data = scored_data[scored_data['IN FILE'] == audio_file] - - for _, row in filtered_data.iterrows(): - if row['TOP1MATCH'] != 'null': - start_time = float(row['OFFSET']) - end_time = start_time + float(row['DURATION']) - - for i in range(len(chunks_df)): - chunk_start = chunks_df.loc[i, 'Chunk Start'] - chunk_end = chunks_df.loc[i, 'Chunk End'] - if start_time < chunk_end and end_time > chunk_start: - chunks_df.loc[i, 'Label'] = 'bird' - - bird_sound = AudioSegment.from_wav(audio_path) - segment_index = 0 - for i in range(len(chunks_df)): - if chunks_df.loc[i, 'Label'] == 'no': - chunk_start = chunks_df.loc[i, 'Chunk Start'] * 1000 - chunk_end = chunks_df.loc[i, 'Chunk End'] * 1000 - segment = bird_sound[chunk_start:chunk_end] - - output_file = os.path.join( - output, f'{os.path.splitext(audio_file)[0]}_nobird_segment_{segment_index}.wav' - ) - segment.export(output_file, format='wav') - segment_index += 1 - - print("Processing complete!") - -def main(labels, wavs, output): - """Run main script - - """ - create_bird_segments(labels, wavs, output) - create_no_bird_segments(labels, wavs, output) - 
- -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='Input Directory Path' - ) - parser.add_argument('labels', type=str, - help='Path to human labeled csv') - parser.add_argument('wavs', type=str, - help='Path to all wav files that have been labeled') - parser.add_argument('output', type=str, - help='Path to desired directory for output csvs') - args = parser.parse_args() - main(args.labels, args.wavs, args.output) From af0bb87d41e8e9f8e2b9ffbc8bdf988511d2995f Mon Sep 17 00:00:00 2001 From: Katie Garwood Date: Mon, 16 Jun 2025 11:56:37 -0700 Subject: [PATCH 053/120] fixed pylint error --- create_dataset/create_dataset.py | 1 + 1 file changed, 1 insertion(+) diff --git a/create_dataset/create_dataset.py b/create_dataset/create_dataset.py index 25440cf..4d21602 100644 --- a/create_dataset/create_dataset.py +++ b/create_dataset/create_dataset.py @@ -61,6 +61,7 @@ def create_dataset(labels, wav_dir, output_dir, class_list): wav_file_paths = get_paths(wav_dir) # open human label file labels = pd.read_csv(labels) + use_2017 = None # iterate through each individual original wav if "2017" in labels['DATE'].iloc[0]: use_2017 = True From 1539b7a747375f515670db731030c165b56c0b5a Mon Sep 17 00:00:00 2001 From: Katie Garwood Date: Mon, 16 Jun 2025 12:11:57 -0700 Subject: [PATCH 054/120] fix flake8 and pylint and docstrings fixed some pylint and flake8 errors (ignored too many variable warnings from pylint) and made the docstrings more google style compliant --- create_dataset/create_segments.py | 69 ++++++++++++++----------------- 1 file changed, 32 insertions(+), 37 deletions(-) diff --git a/create_dataset/create_segments.py b/create_dataset/create_segments.py index 62dbd14..c80eaf5 100644 --- a/create_dataset/create_segments.py +++ b/create_dataset/create_segments.py @@ -17,11 +17,11 @@ def get_paths(home_dir): home_dir (str): Path to directory containing original wavs. Returns: - wavs_file_paths (list): List of all the full paths to a wav in - the given directory. + list: List of all the full paths to a wav in + the given directory. """ wavs_file_paths = [] - for path, dirs, files in os.walk(home_dir): + for path, _, files in os.walk(home_dir): for file in files: if file.endswith('.wav'): new_file = os.path.join(path, file) @@ -36,19 +36,17 @@ def create_segments(wav, filtered_labels, out_path, class_list): wav (str): Path to current wav file in loop. filtered_labels (pd.Dataframe): The human label file reduced - to only contain the rows of - detections pertinent to the - wav of interest. + to only contain the rows of detections pertinent to the + wav of interest. out_path (str): Path to directory where segment will be saved. - class_list (str): Path to the class list that you'd like segments - to be created for. What the manual ID's are in - the human label file- will ignore everything that - is misspelled or unknown labels. + to be created for. What the manual ID's are in the human + label file- will ignore everything that is misspelled or + unknown labels. Returns: - output_rows (pd.Dataframe): The metadata now associated with the - created segments for a given wav file. + pd.Dataframe: The metadata now associated with the + created segments for a given wav file. 
""" print(f"creating segments for {wav}") if filtered_labels is None: @@ -56,7 +54,7 @@ def create_segments(wav, filtered_labels, out_path, class_list): "it does not have labels or is not a file of interest") return None if filtered_labels.empty: - print(f"filtered labels is an empty dataframe, " + print("filtered labels is an empty dataframe, " "meaning either the sound file was not " "labeled or has no detections") return None @@ -66,7 +64,7 @@ def create_segments(wav, filtered_labels, out_path, class_list): 'original_path', 'segment_duration_s', 'segment_rel_start_ms']) - with open(class_list, 'r', newline='') as file: + with open(class_list, 'r', newline='', encoding='utf-8') as file: reader = csv.reader(file) classes = next(reader) print(classes) @@ -77,19 +75,19 @@ def create_segments(wav, filtered_labels, out_path, class_list): filtered_labels['MANUAL ID*'] = filtered_labels['MANUAL ID*'].str.lower() print(filtered_labels) df_row = 0 - for index, row in filtered_labels.iterrows(): + for _, row in filtered_labels.iterrows(): for call_type in classes: if row['MANUAL ID*'] == call_type: start_time = float(row['OFFSET']) - end_time = (start_time + float(row['DURATION'])) + end_time = start_time + float(row['DURATION']) start_time = start_time * 1000 end_time = end_time * 1000 segment = audio[start_time:end_time] - id = uuid.uuid4() - id = str(id) + '.wav' - segment_path = os.path.join(out_path, id) + segment_id = uuid.uuid4() + segment_id = str(segment_id) + '.wav' + segment_path = os.path.join(out_path, segment_id) segment.export(segment_path, format='wav') - output_rows.loc[df_row] = [id, + output_rows.loc[df_row] = [segment_id, call_type, segment_path, wav, @@ -102,25 +100,21 @@ def create_segments(wav, filtered_labels, out_path, class_list): def create_noise_segments(wav, new_buow_rows, out_path): - """ + """Create 'no_buow' segments. Randomly select an equal number of 3s noise segments to the number of detections per audio file, a buffer length away from all of the detections in the file. Args: wav (str): The path to the given wav. - new_buow_rows (pd.Dataframe): The human labeled detection - segment metadata for the given - wav. - + segment metadata for the given wav. out_path (str): The directory where the new no_buow segments will - go to join the human labeled segments. + go to join the human labeled segments. Returns: - all_buow_rows (pd.Dataframe): The metadata for the detection as - well as the no_buow segments created - from the given wav. + pd.Dataframe: The metadata for the detection as well as + the no_buow segments created from the given wav. 
""" if new_buow_rows is None: print(f"not creating noise segments from {wav} because " @@ -136,10 +130,10 @@ def create_noise_segments(wav, new_buow_rows, out_path): call_type = "no_buow" num = len(new_buow_rows) * 2 seconds_array = np.zeros(duration) - for index, row in new_buow_rows.iterrows(): + for _, row in new_buow_rows.iterrows(): start = int((row['segment_rel_start_ms'] / 1000) - 1) end = int((row['segment_rel_start_ms'] / 1000) - + row['segment_duration_s']) + + row['segment_duration_s']) mask_start = max(0, start - 30) mask_end = min(len(seconds_array), end + 30 + 1) seconds_array[mask_start:mask_end] = 1 @@ -147,20 +141,21 @@ def create_noise_segments(wav, new_buow_rows, out_path): while num > new_sample: try: random_index = np.random.choice(len(seconds_array)-3) - except: + except ValueError: print(f"{wav} is not long enough to generate no_buow sounds, " "keeping the detection segment but adding no no_buow") return new_buow_rows - if seconds_array[random_index] == 0 and seconds_array[random_index + 3] == 0: + if (seconds_array[random_index] == 0 and + seconds_array[random_index + 3] == 0): start_time = (random_index + 1) * 1000 end_time = (random_index + 4) * 1000 segment = audio[start_time:end_time] duration_of_segment = len(segment) / 1000 - id = uuid.uuid4() - id = str(id) + '.wav' - segment_path = os.path.join(out_path, id) + segment_id = uuid.uuid4() + segment_id = str(segment_id) + '.wav' + segment_path = os.path.join(out_path, segment_id) segment.export(segment_path, format='wav') - new_buow_rows.loc[new_sample] = [id, + new_buow_rows.loc[new_sample] = [segment_id, call_type, segment_path, wav, From 6e53425bf40335f69fcb5f580c0d8bd860222524 Mon Sep 17 00:00:00 2001 From: Katie Garwood Date: Mon, 16 Jun 2025 13:57:23 -0700 Subject: [PATCH 055/120] flake9, pylint and docstrings --- create_dataset/filter_labels.py | 75 +++++++++++++++++---------------- 1 file changed, 38 insertions(+), 37 deletions(-) diff --git a/create_dataset/filter_labels.py b/create_dataset/filter_labels.py index 08891d9..d6d9373 100644 --- a/create_dataset/filter_labels.py +++ b/create_dataset/filter_labels.py @@ -1,11 +1,26 @@ -import pandas as pd +"""Correlating the wav paths with the labels for 2017 and 2018. + +The label file format is different for the 2018 and 2017 label +files. This means we use different information in those files to +ensure the wav file we found in the folder corresponds to the +label in the label file. Depending on the label file, one +of these two functions gets called to ensure we're dealing +with the proper wav file and only the labels that correspond +to that wav file. +""" import os import ntpath -import logging def filter_labels_2017(wav, labels): - """ + """Filter labels from 2017 data. + + Args: + wav (str): The current wav file. + labels (pd.DataFrame): All of the labels. + + Returns: + pd.DataFrame: The labels associated with the wav of interest. 
""" file_name = ntpath.basename(wav) # isolate labels that match the wav basename @@ -13,6 +28,7 @@ def filter_labels_2017(wav, labels): index_drop = [] wav = str(wav) # ensure the labels match the site and burrow name of wav file + # this step is crucial, it catches accidential duplicates of wav files for index, row in filtered_labels.iterrows(): burrow = row['Burrow'] bur = burrow[:-1] @@ -27,23 +43,23 @@ def filter_labels_2017(wav, labels): filtered_labels = filtered_labels.drop(index_drop) return filtered_labels + def filter_labels_2018(wav, labels): - """ - Because we do not have full file paths, we need to ensure that there - are not duplicate .wav file names that are associated with different burrows/sites. - If we just use the all label file, it would be difficult to determine which burrow/site - is correct for the wav file, because the file paths are inconsistent. This function - chooses the label file to use based on the wav name, and then obtains the labels for - that site/burrow within that folder so that there's no question that it's for that - site/burrow. 2017 is formatted very differently and we are able to back out the burrow/site - from the path to the wav and other information in the all labels file. + """Filter labels from 2018 data. + + Args: + wav (str): The current wav file. + labels (pd.DataFrame): All of the labels. + + Returns: + pd.DataFrame: The labels associated with the wav of interest. """ file_name = ntpath.basename(wav) path_name = ntpath.dirname(wav) basepath = os.path.basename(path_name) - if basepath == "ClassificationResults" or basepath == "Classification_Results": + if basepath in ('ClassificationResults', 'Classification_Results'): print(f"skipping {wav} because it's basepath is {basepath}") - # skipping extra wav files that exist as duplicates of our wavs of interest within these sub dirs + # skipping extra wav files that exist as duplicates in these sub dirs return None # some of the folders have an underscore and some do not path_labels = [] @@ -53,43 +69,28 @@ def filter_labels_2018(wav, labels): # checking if it's the one with an underscore vs not for path in path_labels: exists = os.path.exists(path) - if exists == True: + if exists is True: path_to_results = path else: print(f"{path} does not exist") continue - if path_to_results == None: - # skipping wav files that are an exception to this folder structure because they're - # not the wav files of interest + if path_to_results is None: print(f"skipping {wav} because it's not a file of interest") return None filtered_labels = labels[labels['IN FILE'] == file_name] index_to_drop = [] # iterating the columns in labels that match the wav file name for index, row in filtered_labels.iterrows(): - check_path = os.path.join(path_to_results, row['Fled_2018_LS133_SM1.csv '].strip()) - # there's a column in the all labels file that has the file name of the subset label file that - # the all labels file was aggregated from, and if the wav file path leads us to - # the label file listed in the all labels file, then it will be apart of the filtered - # labels for that wav. this needs to be checked in case 2 wav files have the same - # file name, but are from different burrows/sites. 
- # it's worth noting that this could be done a different way, using the subset label files - # for each burrow/site labels, but you'd still need the all labels file to validate, so it - # just felt like more steps + stripped = row['Fled_2018_LS133_SM1.csv '].strip() + check_path = os.path.join(path_to_results, stripped) if os.path.isfile(check_path): continue - elif row['Fled_2018_LS133_SM1.csv '].strip() == 'EarBreed_2018_LS128_SM10A.csv': - check_path = os.path.join(path_to_results, 'EarBreed_LS128_SM10A.csv') - if os.path.isfile(check_path): - continue - else: + if stripped == 'EarBreed_2018_LS128_SM10A.csv': + check_path = os.path.join(path_to_results, + 'EarBreed_LS128_SM10A.csv') + if not os.path.isfile(check_path): index_to_drop.append(index) else: index_to_drop.append(index) - # if there were labels associated with a different wav file that happened to have the same - # name, this will drop the labels associated with a different burrow/site filtered_labels = filtered_labels.drop(index_to_drop) return filtered_labels - - -# TODO: There's one subset label file that has no column names, so that error needs to be dealt with. It will currently ignore that one From e21482cfd622f7906ae85e0971a0b1cde076a46f Mon Sep 17 00:00:00 2001 From: Katie Garwood Date: Mon, 16 Jun 2025 14:40:02 -0700 Subject: [PATCH 056/120] fixed pylint and flake8 and docstrings --- create_dataset/strat_k_folds.py | 47 ++++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 12 deletions(-) diff --git a/create_dataset/strat_k_folds.py b/create_dataset/strat_k_folds.py index 2ec36b7..0de2eb0 100644 --- a/create_dataset/strat_k_folds.py +++ b/create_dataset/strat_k_folds.py @@ -1,15 +1,32 @@ +"""Split buowset into stratified k-folds. + +Groups detections from the same wav file into 'groups' +and then determines the overall class distribution and +the class distribution for each 'group'. It allocates +all the groups to a 'fold' in a way where the folds +are roughly the same class distribution as the overall +dataset. + +Usage: + python3 strat_k_folds.py /path/to/metadata.csv """ -""" -from k_fold_split_copy import calculate_cost, generate_search_space -from k_fold_split_copy import solution_to_str, generate_initial_solution -from k_fold_split_copy import solve, select_move -import pandas as pd import argparse +import pandas as pd import numpy as np +from k_fold_split_copy import solve + + def create_strat_folds(df): - """ + """Create grouped stratified k-folds. + + Args: + df (pd.Dataframe): The metadata csv from when the dataset was created. + + Returns: + pd.DataFrame: The same metadata but with labels as ints and a new fold + column to denote the fold that segment is apart of. 
""" num_classes = 6 original_df = df @@ -30,14 +47,17 @@ def create_strat_folds(df): counts[int(label)] = count group_matrix.append(counts) group_names.append(index) - print(group_names) problem = np.array(group_matrix) - print(problem) solution = solve(problem, k=5, verbose=True) + # the fold allocation for each 'group' print(f"solution {solution}") print(np.sum(problem, axis=0) / np.sum(problem)) folds = [problem[solution == i] for i in range(5)] - fold_percents = np.array([np.sum(folds[i], axis=0) / np.sum(folds[i]) for i in range(5)]) + fold_percents = np.array( + [np.sum(folds[i], axis=0) / np.sum(folds[i]) for i in range(5)] + ) + # the % of each class in each fold + print(f"Fold percents: {fold_percents}") print(folds) grouped_original = original_df.groupby('original_path') df_with_folds = pd.DataFrame() @@ -48,15 +68,19 @@ def create_strat_folds(df): count += 1 return df_with_folds + def main(meta): - """ + """Execute main script. + + Args: + meta (str): Path to metadata csv from creating the dataset. """ df = pd.read_csv(meta, index_col=0) df_with_folds = create_strat_folds(df) df_with_folds.to_csv("5-fold_meta.csv") -if __name__=="__main__": +if __name__ == "__main__": parser = argparse.ArgumentParser( description='Input Directory Path' ) @@ -64,4 +88,3 @@ def main(meta): help='Path to metadata csv') args = parser.parse_args() main(args.meta) - From 90c3bf7180468932e8227cad7417640b519049b5 Mon Sep 17 00:00:00 2001 From: Katie Garwood Date: Mon, 16 Jun 2025 14:41:25 -0700 Subject: [PATCH 057/120] fixed line too long for my comment leaving this script basically as-is with no linting because it was borrowed from an other repo. I modified one line (denoted with a comment) to account for intances where a class is defined, but there are actually no instances of the class in the set. its a rare error that shouldn't really happen, but during testing it came up so it was a minor change. --- create_dataset/k_fold_split_copy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/create_dataset/k_fold_split_copy.py b/create_dataset/k_fold_split_copy.py index 2296ab6..7453503 100644 --- a/create_dataset/k_fold_split_copy.py +++ b/create_dataset/k_fold_split_copy.py @@ -48,8 +48,8 @@ def calculate_cost(problem: np.ndarray, cost += (fold_sum / total - 1.0 / k) ** 2 # Now calculate the cost associated with the class imbalances - # Katie: had to add division by 0 error for if fold_sums equal 0, during testing with subset - # there were no chick begging calls so this row was 0 + # Katie: had to add division by 0 error for if fold_sums equal 0 + # there were no chick begging calls during test so this row was 0 for j in range(num_classes): if fold_sum == 0: cost += (0 - class_sums[j] / total) ** 2 From cec5f4626ab772a2f820262eb05445ab6f93b848 Mon Sep 17 00:00:00 2001 From: Katie Garwood Date: Mon, 16 Jun 2025 16:55:45 -0700 Subject: [PATCH 058/120] fix some doctring stuff --- create_dataset/create_dataset.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/create_dataset/create_dataset.py b/create_dataset/create_dataset.py index 4d21602..0a3b714 100644 --- a/create_dataset/create_dataset.py +++ b/create_dataset/create_dataset.py @@ -40,15 +40,12 @@ def create_dataset(labels, wav_dir, output_dir, class_list): Args: labels (str): Path to label file. - wav_dir (str): Path to original wav segments of audio. - output_dir (str): Path to where the segments and metadata - will go. - + will go. 
class_list (str): Path to file containing the classes - seen in the human labels file that you - want to create segments for. + seen in the human labels file that you want to create + segments for. """ # parse the inputs out_file = ntpath.dirname(output_dir) @@ -110,15 +107,12 @@ def main(labels, wav_dir, output_dir, class_list): Args: labels (str): Path to label file. - wav_dir (str): Path to original wav segments of audio. - output_dir (str): Path to where the segments and metadata - will go. - + will go. class_list (str): Path to file containing the classes - seen in the human labels file that you - want to create segments for. + seen in the human labels file that you want to + create segments for. """ create_dataset(labels, wav_dir, output_dir, class_list) From b7ffbc47b4cd6554b0e1a8a43746a7d1f9b51e26 Mon Sep 17 00:00:00 2001 From: Katie Garwood Date: Tue, 17 Jun 2025 12:49:40 -0700 Subject: [PATCH 059/120] add description for including the class list --- create_dataset/create_dataset.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/create_dataset/create_dataset.py b/create_dataset/create_dataset.py index 0a3b714..a0d036d 100644 --- a/create_dataset/create_dataset.py +++ b/create_dataset/create_dataset.py @@ -45,7 +45,8 @@ def create_dataset(labels, wav_dir, output_dir, class_list): will go. class_list (str): Path to file containing the classes seen in the human labels file that you want to create - segments for. + segments for. Current format is ',' delimited list + in a .txt file. """ # parse the inputs out_file = ntpath.dirname(output_dir) From 4f3133df6c26937049dcb70943ad57e5364b1fbb Mon Sep 17 00:00:00 2001 From: Sean1572 Date: Wed, 18 Jun 2025 13:46:58 -0700 Subject: [PATCH 060/120] Got training working! Fixed bugs blocking model training, going to test it to make sure it finshes Next TODOS - Clean up the codebase - Connect get the cometML hook working - Better config options - Maybe pull some of the codebase into pyha-analyzer-2.0, might be better there... - Get a model on the data! 
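One of the fixes below makes ModelOutput behave like a dict, since
the hugging face Trainer indexes model outputs by key (e.g.
outputs["loss"]) during its train and eval loops. A self-contained
toy analogue of the idea (not the actual class in the diff below):

    import torch

    class DictOutput(dict):
        # Keep only the fields that were actually produced, so the
        # trainer can iterate keys without hitting None values.
        def __init__(self, **fields):
            super().__init__(
                {k: v for k, v in fields.items() if v is not None}
            )

    out = DictOutput(loss=torch.tensor(0.5), logits=torch.zeros(4, 6),
                     labels=None)
    assert "loss" in out and "labels" not in out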
--- whoot_model_training/train.py | 6 +- .../whoot_model_training/models/model.py | 105 ++++++++++++++++-- .../whoot_model_training/models/timm_model.py | 17 +-- .../preprocessors/default_preprocessor.py | 2 +- .../spectrogram_preprocessors.py | 1 - .../whoot_model_training/trainer.py | 66 ----------- 6 files changed, 107 insertions(+), 90 deletions(-) delete mode 100644 whoot_model_training/whoot_model_training/trainer.py diff --git a/whoot_model_training/train.py b/whoot_model_training/train.py index 5d3fdda..a81122c 100644 --- a/whoot_model_training/train.py +++ b/whoot_model_training/train.py @@ -41,6 +41,9 @@ def train(config_path): args = PyhaTrainingArguments(working_dir="working_dir") args.num_train_epochs = 2 args.eval_steps = 20 + args.dataloader_num_workers = 36 + args.per_device_train_batch_size = 32 + args.per_device_eval_batch_size = 32 args.run_name = "testing" args.report_to="none" #Blocks wandb @@ -48,8 +51,7 @@ def train(config_path): model=model, dataset=ds, training_args=args, - logger=None, - data_collator=lambda x: x + logger=None, ) trainer.train() trainer.evaluate(eval_dataset=ds["test"], metric_key_prefix="Soundscape") diff --git a/whoot_model_training/whoot_model_training/models/model.py b/whoot_model_training/whoot_model_training/models/model.py index 8221074..1c3330d 100644 --- a/whoot_model_training/whoot_model_training/models/model.py +++ b/whoot_model_training/whoot_model_training/models/model.py @@ -1,5 +1,6 @@ from abc import ABC, abstractmethod from functools import wraps +from collections import UserDict from pyha_analyzer.models.base_model import BaseModel import torch @@ -15,10 +16,10 @@ def has_required_inputs(): def decorator(forward): @wraps(forward) - def wrapper(self, x): - assert isinstance(x, self.input_format) - model_output = forward(self, x) - assert isinstance(model_output, self.output_format) + def wrapper(self, *args, **kwarg): + #assert isinstance(x, self.input_format) #TODO FIX + model_output = forward(self, *args, **kwarg) + #assert isinstance(model_output, self.output_format) return model_output @@ -26,8 +27,8 @@ def wrapper(self, x): return decorator - -class ModelOutput(ABC): +# TODO: Simplify, most of this should have been done by UserDict... 
+class ModelOutput(dict, UserDict): """ModelOutput Object that stores the output of a model @@ -41,15 +42,30 @@ class ModelOutput(ABC): def __init__( self, - logits: np.ndarray, - embeddings: np.ndarray, + _map: dict | None = None, + logits: np.ndarray | None = None, + embeddings: np.ndarray | None = None, labels: np.ndarray | None = None, loss: np.ndarray | None = None, - ): - self.embeddings = embeddings + ): + super(UserDict).__init__() + self._main_keys = ["logits", "embeddings", "labels", "loss"] + self.logits = logits - self.loss = loss + self.embeddings = embeddings self.labels = labels + self.loss = loss + self.data = { + "logits": self.logits, + "embeddings": self.embeddings, + "labels": self.labels, + "loss": self.loss + } + if _map is not None: + for key, value in _map: + self[key] = value + + assert isinstance(self, dict) def to_hugging_face(self): return { @@ -65,9 +81,43 @@ def concat(list_of_outputs: list): loss=torch.vstack([out.loss for out in list_of_outputs]), labels=torch.vstack([out.labels for out in list_of_outputs]), ) + + def __len__(self) -> int: + """ + Count the number of batches in this system + + returns batch_size int + """ + return len(self.labels) + + def __setitem__(self, key, value): + if key in self._main_keys: + self.__setattr__(key, value) + self.data[key] = value + + def __getitem__(self, key): + return self.__getattribute__(key) + + def __repr__(self): + return str(self.data) + + def items(self): + data = self.data.items() + return ((col, value) for col, value in data if value is not None ) + + def keys(self): + return [key for key, _ in self.items()] + + def __iter__(self): + return iter(self.keys()) + + def __contains__(self, key): + return key in self.data + + -class ModelInput(ABC): +class ModelInput(ABC, UserDict): """ModelInput Spefifies Input Types @@ -87,12 +137,43 @@ def __init__( self.waveform = waveform self.spectrogram = spectrogram self.labels = labels + self.data = { + "labels": self.labels, + "waveform": self.waveform, + "spectrogram": self.spectrogram + } + self._main_keys = ["labels", "spectrogram", "waveform"] + def to_tensor(self, device="cpu"): self.waveform = Tensor(self.waveform, device=device) self.spectrogram = Tensor(self.spectrogram, device=device) self.labels = Tensor(self.labels, device=device) + def __len__(self) -> int: + """ + Count the number of batches in this system + + returns batch_size int + """ + return len(self.labels) + + def __setitem__(self, key, value): + if key in self._main_keys: + self.__setattr__(key, value) + self.data[key] = value + + # TODO: There might be a smarter way to do something like this... 
+ def __getitem__(self, key): + return self.__getattribute__(key) + + def __repr__(self): + return str(self.data) + + def items(self): + data = super().items() + return ((col, value) for col, value in data if value is not None ) + """ BaseModel Class for Whoot """ diff --git a/whoot_model_training/whoot_model_training/models/timm_model.py b/whoot_model_training/whoot_model_training/models/timm_model.py index 38fccad..435e083 100644 --- a/whoot_model_training/whoot_model_training/models/timm_model.py +++ b/whoot_model_training/whoot_model_training/models/timm_model.py @@ -1,5 +1,6 @@ import timm from torch import nn, Tensor +import numpy as np from .model import Model, ModelInput, ModelOutput, has_required_inputs @@ -20,8 +21,8 @@ def __init__(self, labels, waveform=None, spectrogram=None, device="cpu"): # # Can use inputs to verify correct shape for upstream model # assert spectrogram.shape[1:] == (1, 100, 100) super().__init__(labels, waveform, spectrogram) - self.labels = Tensor(labels) - self.spectrogram = Tensor(spectrogram) + self.labels = Tensor(np.array(labels)) + self.spectrogram = Tensor(np.array(spectrogram)) class TimmModel(nn.Module, Model): @@ -51,10 +52,10 @@ def __init__( else: self.loss = nn.BCEWithLogitsLoss() - @has_required_inputs() - def forward(self, data: TimmInputs) -> ModelOutput: - embedd = self.backbone(data.spectrogram) + @has_required_inputs() #data: TimmInputs + def forward(self, spectrogram, labels=None) -> ModelOutput: + embedd = self.backbone(spectrogram) logits = self.linear(embedd) - loss = self.loss(logits, data.labels) - - return ModelOutput(logits=logits, embeddings=embedd, loss=loss, labels=data.labels) + loss = self.loss(logits, labels) + out = ModelOutput(logits=logits, embeddings=embedd, loss=loss, labels=labels) + return out diff --git a/whoot_model_training/whoot_model_training/preprocessors/default_preprocessor.py b/whoot_model_training/whoot_model_training/preprocessors/default_preprocessor.py index c1efd12..74a6750 100644 --- a/whoot_model_training/whoot_model_training/preprocessors/default_preprocessor.py +++ b/whoot_model_training/whoot_model_training/preprocessors/default_preprocessor.py @@ -38,4 +38,4 @@ def __init__( def __call__(self, batch): batch = super().__call__(batch) - return {"data": [self.ModelInput(labels=batch["labels"], spectrogram=batch["audio"])]} + return self.ModelInput(labels=batch["labels"], spectrogram=batch["audio"]) diff --git a/whoot_model_training/whoot_model_training/preprocessors/spectrogram_preprocessors.py b/whoot_model_training/whoot_model_training/preprocessors/spectrogram_preprocessors.py index d3c4356..dde93b3 100644 --- a/whoot_model_training/whoot_model_training/preprocessors/spectrogram_preprocessors.py +++ b/whoot_model_training/whoot_model_training/preprocessors/spectrogram_preprocessors.py @@ -68,7 +68,6 @@ def __call__(self, batch): if self.spectrogram_augments is not None: mels = self.spectrogram_augments(mels) - # print(mels.shape, int(start * sr), y.shape) new_audio.append(mels) new_labels.append(label) diff --git a/whoot_model_training/whoot_model_training/trainer.py b/whoot_model_training/whoot_model_training/trainer.py deleted file mode 100644 index ae8932c..0000000 --- a/whoot_model_training/whoot_model_training/trainer.py +++ /dev/null @@ -1,66 +0,0 @@ -# """ -# The Trainer holds the main training loop, validation loop, and can run evaluation - -# There are some off the shelf options, such as the hugging face Trainer -# Which is in use by https://github.com/UCSD-E4E/pyha-analyzer-2.0/ - -# 
However, It can be difficult to fit input to perfectly -# match what the hugging face trainer expects -# And we are unlikely to use all the bells and whistles offered by hugging face. - -# So this SimpleTrainer can get the job spefifically for whoot done -# With fewer bells and whistles -# This should hopefully make debugging easier in the future and -# keep the repo focused on whoot applications -# """ - -# import torch -# from transformers import TrainingArguments - -# from .models.model import Model, ModelOutput -# from .dataset import AudioDataset -# from pyha_analyzer.metrics.classification_metrics import AudioClassificationMetrics - - -# class WhootTrainingArguments(TrainingArguments): -# def __init__(self, working_dir): -# super().__init__(working_dir) -# self.logging_steps = 10 -# self.eval_steps = 100 -# self.per_device_train_batch_size = 64 -# self.per_device_eval_batch_size = 64 -# self.dataloader_num_workers = 4 -# self.eval_accumulation_steps = 10 - -# class WhootTrainer(): -# def __init__( -# self, -# model: Model, -# dataset: AudioDataset, -# metrics: AudioClassificationMetrics = None, -# training_args: WhootTrainingArguments = None, -# data_collator=None, -# preprocessor=None, -# ): - -# self.model = model -# self.dataset = dataset -# self.dataloaders = self._get_dataloaders(dataset) -# self.metrics = metrics - -# def run_metrics(self, output_batches:list[ModelOutput]): -# out = ModelOutput.concat(output_batches) -# metrics = self.metrics(out.to_hugging_face()) -# print(metrics) - -# def run_step(self, batch, training=True): - - -# def run_loop(self, split): -# for i in range(): - -# def train(self): - -# def evaluate(self): - - From 3f0b20476ed7bf9b3bcc4240ecd1cbc750687b2a Mon Sep 17 00:00:00 2001 From: Katie Garwood Date: Wed, 18 Jun 2025 16:30:40 -0700 Subject: [PATCH 061/120] add docstrings, lint, and remove unused functions --- create_dataset/k_fold_split_copy.py | 154 +++++++++++++++++----------- 1 file changed, 95 insertions(+), 59 deletions(-) diff --git a/create_dataset/k_fold_split_copy.py b/create_dataset/k_fold_split_copy.py index 7453503..7c03c98 100644 --- a/create_dataset/k_fold_split_copy.py +++ b/create_dataset/k_fold_split_copy.py @@ -1,40 +1,38 @@ """Optimizing k-fold splits with groups. -Downloaded and modified from https://github.com/joaofig/strat-group-split/tree/main +These functions aid strat_k_folds.py in calculating the +optimal fold allocation for all the groups in the dataset +ensuring the folds are as equal in size as they can be, +while also being as close to the actual class distribution +as possible. + +Downloaded and modified from: +https://github.com/joaofig/strat-group-split/tree/main """ - +from typing import Set, Tuple import numpy as np - from numpy.random import default_rng from numba import njit -from typing import Set, Tuple - - -def generate_problem(num_groups: int, - num_classes: int, - min_group_size: int, - max_group_size: int, - class_percent: np.array) -> np.ndarray: - - problem = np.zeros((num_groups, num_classes), dtype=int) - - rng = default_rng() - group_sizes = rng.integers(low=min_group_size, - high=max_group_size, - size=num_groups) - - for i in range(num_groups): - # Calculate the - proportions = np.random.normal(class_percent, class_percent / 10) - - problem[i, :] = proportions * group_sizes[i] - return problem @njit def calculate_cost(problem: np.ndarray, solution: np.ndarray, k: int) -> float: + """Calculate difference of current solution to optimal solution. 
+ + Args: + problem (np.array): A matrix with a column per class and the + class counts for each group as the values. + solution (np.ndarray): A 1D array where each value is the current + fold allocation for the corresponding group. + k (int): Number of folds. + + Returns: + float: The summation of the differences between the folds' + class distributions from the optimal class distribution, and the + size of the folds to the size the folds should be. + """ cost = 0.0 total = np.sum(problem) class_sums = np.sum(problem, axis=0) @@ -54,7 +52,8 @@ def calculate_cost(problem: np.ndarray, if fold_sum == 0: cost += (0 - class_sums[j] / total) ** 2 else: - cost += (np.sum(problem[idx, j]) / fold_sum - class_sums[j] / total) ** 2 + sum_problem = np.sum(problem[idx, j]) / fold_sum + cost += (sum_problem - class_sums[j] / total) ** 2 return cost @@ -62,6 +61,19 @@ def calculate_cost(problem: np.ndarray, def generate_search_space(problem: np.ndarray, solution: np.ndarray, k: int) -> np.ndarray: + """Generate the search space. + + Args: + problem (np.ndarray): A matrix with a column per class and the + class counts for each group as the values. + solution (np.ndarray): The last known solution. + k (int): Number of folds. + + Returns: + np.ndarray: The search space. Folds as columns and cost values + for each group with a placeholder in one fold each to allow for + a cost calculation relative to the placeholder. + """ num_groups = problem.shape[0] space = np.zeros((num_groups, k)) @@ -70,7 +82,7 @@ def generate_search_space(problem: np.ndarray, for i in range(num_groups): for j in range(k): if solution[i] == j: - space[i,j] = np.inf + space[i, j] = np.inf else: sol[i] = j space[i, j] = calculate_cost(problem, sol, k) @@ -80,12 +92,32 @@ def generate_search_space(problem: np.ndarray, @njit def solution_to_str(solution: np.ndarray) -> str: + """Convert the solution to a string. + + Args: + solution (np.ndarray): The current solution. + Returns: + str: The current solution as a string. + """ return "".join([str(n) for n in solution]) def generate_initial_solution(problem: np.ndarray, k: int, - algo: str="k-bound") -> np.ndarray: + algo: str = "k-bound") -> np.ndarray: + """Generate the first solution. + + Args: + problem (np.array): A matrix with a column per class and the + class counts for each group as the values. + k (int): The number of folds. + algo (str): Method for creating initial solution. Defaults to a + greedy algorithm to satisfy fold proportion requirements only. + + Returns: + np.ndarray: A 1D array where each value is the current fold + allocation for the corresponding group. + """ num_groups = problem.shape[0] if algo == "k-bound": rng = default_rng() @@ -93,16 +125,16 @@ def generate_initial_solution(problem: np.ndarray, indices = rng.permutation(problem.shape[0]) solution = np.zeros(num_groups, dtype=int) - c = 0 + current_fold = 0 fold_total = 0 for i in indices: group = np.sum(problem[i, :]) if fold_total + group < total / k: fold_total += group else: - c = (c + 1) % k + current_fold = (current_fold + 1) % k fold_total = group - solution[i] = c + solution[i] = current_fold elif algo == "random": rng = default_rng() solution = rng.integers(low=0, high=k, size=num_groups) @@ -118,6 +150,22 @@ def solve(problem: np.ndarray, min_cost=1e-5, max_retry=100, verbose=False) -> np.ndarray: + """Solve the problem. + + Args: + problem (np.ndarray): + k (int): Number of folds, default 5. + min_cost (float): The largest the cost can be for an + acceptable solution. Default 1e-5. 
+ max_retry (int): The max amount of times the program will + attempt to alter the current solution for a more + optimal one. + verbose (bool): True for more debug prints, defaults to False. + + Returns: + np.ndarray: Optimized solution as a 1D array where each + value is the fold allocation for each group. + """ hist = set() retry = 0 @@ -147,36 +195,24 @@ def solve(problem: np.ndarray, def select_move(decision: np.ndarray, solution: np.ndarray, history: Set) -> Tuple: + """Select the change to make to the current solution. + + Args: + decision (np.ndarray): The current search space matrix. + solution (np.ndarray): The current solution. + history (Set): Previous solutions. + Returns: + Tuple: Position in the solution matrix to move a group + into a different fold. + """ candidates = np.argsort(decision, axis=None) - for c in candidates: - p = np.unravel_index(c, decision.shape) - s = solution.copy() - s[p[0]] = p[1] - sol_str = solution_to_str(s) + for candidate in candidates: + position = np.unravel_index(candidate, decision.shape) + sol = solution.copy() + sol[position[0]] = position[1] + sol_str = solution_to_str(sol) if sol_str not in history: - return p + return position[0], position[1] return -1, -1 # No move found! - - -def main(): - problem = generate_problem(num_groups=500, - num_classes=4, - min_group_size=400, - max_group_size=2000, - class_percent=np.array([0.4, 0.3, 0.2, 0.1])) - solution = solve(problem, k=5, verbose=True) - - print(np.sum(problem, axis=0) / np.sum(problem)) - print() - - folds = [problem[solution == i] for i in range(5)] - fold_percents = np.array([np.sum(folds[i], axis=0) / np.sum(folds[i]) for i in range(5)]) - print(fold_percents) - print() - print([np.sum(folds[i]) / np.sum(problem) for i in range(5)]) - - -if __name__ == "__main__": - main() From 941eb1d3b47d3aab3aa30006273f6e46dce12ba5 Mon Sep 17 00:00:00 2001 From: Katie Garwood Date: Wed, 18 Jun 2025 16:36:02 -0700 Subject: [PATCH 062/120] forgot one docstring line --- create_dataset/k_fold_split_copy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/create_dataset/k_fold_split_copy.py b/create_dataset/k_fold_split_copy.py index 7c03c98..be7e47b 100644 --- a/create_dataset/k_fold_split_copy.py +++ b/create_dataset/k_fold_split_copy.py @@ -153,7 +153,7 @@ def solve(problem: np.ndarray, """Solve the problem. Args: - problem (np.ndarray): + problem (np.ndarray): The problem matrix. k (int): Number of folds, default 5. min_cost (float): The largest the cost can be for an acceptable solution. Default 1e-5. 
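As an aside, the `problem` matrix that solve() consumes can be built
from the segment metadata roughly like this (a self-contained sketch
with made-up data; strat_k_folds.py does the equivalent with its own
bookkeeping, grouping by the metadata's 'original_path' column):

    import pandas as pd

    # Hypothetical metadata: one row per segment; segments cut from
    # the same original wav form a group.
    meta = pd.DataFrame({
        "original_path": ["a.wav", "a.wav", "b.wav", "c.wav", "c.wav"],
        "label": [0, 1, 1, 2, 0],
    })

    # One row per group, one column per class, values are counts:
    # the (num_groups, num_classes) matrix passed to solve(problem, k).
    problem = (
        meta.groupby("original_path")["label"]
        .value_counts()
        .unstack(fill_value=0)
        .reindex(columns=range(3), fill_value=0)
        .to_numpy()
    )
    print(problem.shape)  # (3, 3)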
From f0ccdb02d73439ce6d67ddcfff74c8df031a0487 Mon Sep 17 00:00:00 2001
From: Sean1572
Date: Fri, 20 Jun 2025 09:50:54 -0700
Subject: [PATCH 063/120] Clean up code

Some linting, documentation, and bug fix for model outputs

---
 .gitignore                                    |  13 ++-
 pyproject.toml                                |   1 +
 whoot_model_training/train.py                 |  82 ++++++++++---
 .../whoot_model_training/data_augmentation.py |   0
 .../data_extractor/__init__.py                |   2 +-
 .../data_extractor/buowset_extractor.py       |  52 ++++-----
 .../whoot_model_training/dataset.py           |  57 ++++++---
 .../whoot_model_training/models/__init__.py   |   2 +-
 .../whoot_model_training/models/model.py      | 110 +++++++++++------
 .../whoot_model_training/models/timm_model.py |  35 ++++--
 .../preprocessors/__init__.py                 |   8 +-
 .../preprocessors/default_preprocessor.py     |  20 +++-
 .../spectrogram_preprocessors.py              |  55 +++++----
 13 files changed, 288 insertions(+), 149 deletions(-)
 delete mode 100644 whoot_model_training/whoot_model_training/data_augmentation.py

diff --git a/.gitignore b/.gitignore
index 877de52..3ba9a5f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -210,4 +210,15 @@ uv.lock
 .ruff_cache
 
 # Data Folders
-data
\ No newline at end of file
+data
+
+# Model Storage
+working_dir
+
+
+# testing/debugging notebooks
+test.ipynb
+buowset.ipynb
+
+# Question: do we want to commit vscode setting.json files?
+settings.json
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index 2cfc805..4b5ae8c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -27,6 +27,7 @@ model_training = [
     "datasets>=3.5.1",
     "timm>=1.0.15",
     "pyha-analyzer",
+    "comet-ml>=3.43.2",
 ]
 
 notebooks = [
diff --git a/whoot_model_training/train.py b/whoot_model_training/train.py
index a81122c..466a7f6 100644
--- a/whoot_model_training/train.py
+++ b/whoot_model_training/train.py
@@ -1,4 +1,16 @@
+"""Trains a Multiclass Model with Pytorch and Huggingface
 
+This script can be used to run experiments with different
+models and datasets to create any model for bioacoustic classification
+
+This script is intended to be heavily modified with each experiment
+(say one wants to use a different dataset, one should copy this and change the extractor!)
+
+Usage:
+    $ python train.py /path/to/config.yml
+
+config.yml should contain frequently changed hyperparameters
+"""
 import argparse
 import yaml
 
@@ -9,19 +21,46 @@ from pyha_analyzer import PyhaTrainer, PyhaTrainingArguments
 from whoot_model_training.models import TimmModel, TimmInputs
 from whoot_model_training.preprocessors import SpectrogramModelInputPreprocessors
 
-def parse_config(config_path):
-    with open(config_path, 'r') as f:
+## TODO ALLOW USER TO SELECT THIS
+## TODO MAKE DISTRIBUTED TRAINING POSSIBLE
+import os
+os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+
+def parse_config(config_path: str) -> dict:
+    """wrapper to parse config
+
+    Args:
+        config_path (str): path to config file for training!
+
+    Returns:
+        (dict): the parsed hyperparameters
+    """
+    with open(config_path, "r") as f:
         config = yaml.safe_load(f)
     return config
 
+
 def train(config_path):
+    """Highest level logic for training
+
+    Does the following:
+    - Formats the dataset into an AudioDataset
+    - Prepares preprocessing for each audio clip
+    - Builds the model
+    - Configures and runs the trainer
+    - Runs evaluation
+
+    Args:
+        config_path (str): path to config file for training!
+ """ + config = parse_config(config_path) # Extract the dataset ds = buowset_extractor( metadata_csv=config["metadata_csv"], parent_path=config["data_path"], - output_path=config["hf_cache_path"] + output_path=config["hf_cache_path"], ) # Create the model @@ -39,29 +78,38 @@ def train(config_path): # Run training args = PyhaTrainingArguments(working_dir="working_dir") + + # REQUIRED ARGS (DO NOT CHANGE VALUES TODO ADD TO TRAINER DIRECTLY) + args.label_names = ["labels"] + args.remove_unused_columns = False + + # OPTIONAL ARGS args.num_train_epochs = 2 - args.eval_steps = 20 + args.eval_steps = 10 args.dataloader_num_workers = 36 - args.per_device_train_batch_size = 32 - args.per_device_eval_batch_size = 32 + args.per_device_train_batch_size = 16 + args.per_device_eval_batch_size = 16 args.run_name = "testing" - args.report_to="none" #Blocks wandb + args.report_to = "comet_ml" # Blocks wandb + + + print(args.accelerator_config.even_batches) + trainer = PyhaTrainer( model=model, dataset=ds, training_args=args, - logger=None, + logger=None, ) - trainer.train() - trainer.evaluate(eval_dataset=ds["test"], metric_key_prefix="Soundscape") + + print(trainer.evaluate(eval_dataset=ds["valid"], metric_key_prefix="TEST FOR METRICS")) + # trainer.train() + -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='Input config path' - ) - parser.add_argument('config', type=str, - help='Path to config.yml') +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Input config path") + parser.add_argument("config", type=str, help="Path to config.yml") args = parser.parse_args() - train(args.config) \ No newline at end of file + train(args.config) diff --git a/whoot_model_training/whoot_model_training/data_augmentation.py b/whoot_model_training/whoot_model_training/data_augmentation.py deleted file mode 100644 index e69de29..0000000 diff --git a/whoot_model_training/whoot_model_training/data_extractor/__init__.py b/whoot_model_training/whoot_model_training/data_extractor/__init__.py index 9486885..365cd63 100644 --- a/whoot_model_training/whoot_model_training/data_extractor/__init__.py +++ b/whoot_model_training/whoot_model_training/data_extractor/__init__.py @@ -1 +1 @@ -from .buowset_extractor import buowset_extractor +from .buowset_extractor import buowset_extractor as buowset_extractor diff --git a/whoot_model_training/whoot_model_training/data_extractor/buowset_extractor.py b/whoot_model_training/whoot_model_training/data_extractor/buowset_extractor.py index c11be07..458d4fa 100644 --- a/whoot_model_training/whoot_model_training/data_extractor/buowset_extractor.py +++ b/whoot_model_training/whoot_model_training/data_extractor/buowset_extractor.py @@ -1,32 +1,32 @@ -""" +"""Standardizes the format of the buowset dataset + Inspired by https://github.com/UCSD-E4E/pyha-analyzer-2.0/tree/main/pyha_analyzer/extractors -Standardizes the format of the buowset dataset + +The idea being extractors is that they take raw data, and +format it into a uniform dataset format, AudioDataset + +This way, it should be easier to define what a common audio dataset format is between +parts of the codebase for training """ -import argparse import os import numpy as np -from datasets import ( - load_dataset, - Audio, - DatasetDict, - load_from_disk, - ClassLabel, - Sequence -) +from datasets import load_dataset, Audio, DatasetDict, ClassLabel, Sequence, load_from_disk from ..dataset import AudioDataset -from pyha_analyzer.extractors.birdset import one_hot_encode_ds_wrapper -def 
one_hot_encode(row, classes): +def one_hot_encode(row: dict, classes: list): + """One hot Encodes a list of labels + Args: + row (dict): row of data in a dataset containing a labels column + classes: a list of classes + """ one_hot = np.zeros(len(classes)) one_hot[row["labels"]] = 1 row["labels"] = np.array(one_hot, dtype=float) return row -"""_summary_ -""" def buowset_extractor( metadata_csv, parent_path, @@ -35,29 +35,27 @@ def buowset_extractor( test_fold=3, sr=32_000, filepath="segment", -): - +): # if os.path.exists(output_path): # ds = load_from_disk(output_path) # return AudioDataset(ds) # Hugging face by default defines a train split ds = load_dataset("csv", data_files=metadata_csv)["train"] - ds = ds.rename_column("label", "labels") #Convention here is labels + ds = ds.rename_column("label", "labels") # Convention here is labels # Convert to a uniform one_hot encoding for classes ds = ds.class_encode_column("labels") class_list = ds.features["labels"].names - mutlilabel_class_label = Sequence(ClassLabel(names=class_list)) - ds = ds.map( - lambda row: one_hot_encode(row, class_list) - ).cast_column("labels", mutlilabel_class_label) - + mutlilabel_class_label = Sequence(ClassLabel(names=class_list)) + ds = ds.map(lambda row: one_hot_encode(row, class_list)).cast_column( + "labels", mutlilabel_class_label + ) + # Get audio into uniform format - + ds = ds.add_column( - "audio", - [os.path.join(parent_path, file) for file in ds[filepath]] + "audio", [os.path.join(parent_path, file) for file in ds[filepath]] ) ds = ds.cast_column("audio", Audio(sampling_rate=sr)) @@ -71,7 +69,7 @@ def buowset_extractor( ds = AudioDataset( DatasetDict({"train": train_ds, "valid": valid_ds, "test": test_ds}) ) - + ds.save_to_disk(output_path) return ds diff --git a/whoot_model_training/whoot_model_training/dataset.py b/whoot_model_training/whoot_model_training/dataset.py index f2e4290..1f5db06 100644 --- a/whoot_model_training/whoot_model_training/dataset.py +++ b/whoot_model_training/whoot_model_training/dataset.py @@ -4,23 +4,38 @@ Using an Arrow Dataset from Hugging Face's dataset library because - Cool audio features https://huggingface.co/docs/datasets/en/audio_process -- Faster than pandas, better at manging memory +- Faster than pandas, better at managing memory -# TODO Use the default stuff from pyha-anaylzer +# TODO Use the default stuff from pyha-analyzer """ from datasets import DatasetDict, ClassLabel -from torch.utils.data import DataLoader DEFAULT_COLUMNS = ["labels", "audio"] class AudioDataset(DatasetDict): + """ + AudioDataset Class + + If your dataset is an AudioDataset, it can be read by the rest of the system + + Behind the scenes, this is a Apache Arrow Dataset Dict (via hf library) where + each key is a split of the data (test/train/valid) and the value is an arrow dataset + with at a minimum 2 columns: + - labels (Sequence of class labels, such as [0,10]) + - audio (Audio Column type from hugging face) + """ def __init__(self, ds: DatasetDict): self.validate_format(ds) super().__init__(ds) def validate_format(self, ds: DatasetDict): + """Validates dataset is correctly formatted and ready to be used for training + + Raises: + AssertionError if dataset is not correctly formatted. + """ for split in ds.keys(): dataset = ds[split] for column in DEFAULT_COLUMNS: @@ -28,19 +43,33 @@ def validate_format(self, ds: DatasetDict): f"The column `{column}` is missing from dataset split `{split}`. 
Required by system" ) - def get_num_classes( - self, - ): # NOTE: Assumes all labels are mutlilabel (the extra feature note) + def get_num_classes(self): + """ + Returns: + (int): the number of classes in this dataset + """ return self["train"].features["labels"].feature.num_classes - - """ - Legacy code had the method name `get_number_species` - """ - def get_number_species(self): - return self.get_num_classes() - def get_class_labels(self): + def get_number_species(self) -> int: + """ + PyhaAnalyzer uses `get_number_species` for getting class count + This... isn't always the case that the dataset is species only (could have calls!) + To support legacy PyhaAnalyzer, we therefore have this function. + + This should be deprecated in future versions of PyhaAnalyzer + + return + (int): number of classes """ - Returns a new ClassLabel Object to make mapping easier between datasets + return self.get_num_classes() + + def get_class_labels(self) -> ClassLabel: + """Class mapping for this dataset + + A common problem is when moving between datasets creating mappings between classes + This aims to help standardize that by being able to get the classLabels for this dataset + + Returns: + (ClassLabel): Mapping of all the names of the labels to their index. """ return ClassLabel(names=self["train"].features["labels"].feature.names) diff --git a/whoot_model_training/whoot_model_training/models/__init__.py b/whoot_model_training/whoot_model_training/models/__init__.py index c91d3c0..313bd2c 100644 --- a/whoot_model_training/whoot_model_training/models/__init__.py +++ b/whoot_model_training/whoot_model_training/models/__init__.py @@ -1 +1 @@ -from .timm_model import TimmModel, TimmInputs +from .timm_model import TimmModel as TimmModel, TimmInputs as TimmInputs diff --git a/whoot_model_training/whoot_model_training/models/model.py b/whoot_model_training/whoot_model_training/models/model.py index 1c3330d..113d261 100644 --- a/whoot_model_training/whoot_model_training/models/model.py +++ b/whoot_model_training/whoot_model_training/models/model.py @@ -1,25 +1,35 @@ +"""Abstract Model Class for training + +Any model trained with this repo SHOULD inherit from these classes found here + +There are 3 main classes +- ModelInput: dict-like class that define required input params to function +- ModelOutput: dict-like class that defines the output from the model +- Model: A PyTorch nn.Module class + +See timm_model.py for example about how these classes can be implemented. +""" + from abc import ABC, abstractmethod from functools import wraps from collections import UserDict from pyha_analyzer.models.base_model import BaseModel import torch -from torch import nn, Tensor +from torch import Tensor import numpy as np -""" - Wrapper to check to make sure everything is setup properly - Required before using PyhaTrainer -""" - - def has_required_inputs(): + """ + Wrapper to check to make sure everything is setup properly + Required before using PyhaTrainer + """ def decorator(forward): @wraps(forward) def wrapper(self, *args, **kwarg): - #assert isinstance(x, self.input_format) #TODO FIX + # assert isinstance(x, self.input_format) #TODO FIX model_output = forward(self, *args, **kwarg) - #assert isinstance(model_output, self.output_format) + # assert isinstance(model_output, self.output_format) return model_output @@ -27,6 +37,7 @@ def wrapper(self, *args, **kwarg): return decorator + # TODO: Simplify, most of this should have been done by UserDict... 
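Because `get_class_labels` above hands back a Hugging Face `ClassLabel`, mappings between class names and indices come for free. A small sketch — the two class names here are invented for illustration:

```python
import numpy as np
from datasets import ClassLabel

classes = ClassLabel(names=["buow_call", "no_buow_call"])  # illustrative names
print(classes.num_classes)              # 2
print(classes.str2int("no_buow_call"))  # 1
print(classes.int2str(0))               # "buow_call"

# The extractor's one-hot convention: label indices [0] become [1., 0.]
one_hot = np.zeros(classes.num_classes)
one_hot[[0]] = 1
print(one_hot)                          # [1. 0.]
```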
class ModelOutput(dict, UserDict): """ModelOutput @@ -47,7 +58,7 @@ def __init__( embeddings: np.ndarray | None = None, labels: np.ndarray | None = None, loss: np.ndarray | None = None, - ): + ): super(UserDict).__init__() self._main_keys = ["logits", "embeddings", "labels", "loss"] @@ -56,10 +67,10 @@ def __init__( self.labels = labels self.loss = loss self.data = { - "logits": self.logits, - "embeddings": self.embeddings, + "logits": self.logits, + "embeddings": self.embeddings, "labels": self.labels, - "loss": self.loss + "loss": self.loss, } if _map is not None: for key, value in _map: @@ -81,43 +92,41 @@ def concat(list_of_outputs: list): loss=torch.vstack([out.loss for out in list_of_outputs]), labels=torch.vstack([out.labels for out in list_of_outputs]), ) - + def __len__(self) -> int: """ - Count the number of batches in this system + Count the number of batches in this system - returns batch_size int + returns batch_size int """ return len(self.labels) - + def __setitem__(self, key, value): if key in self._main_keys: self.__setattr__(key, value) self.data[key] = value - + def __getitem__(self, key): return self.__getattribute__(key) - + def __repr__(self): return str(self.data) - + def items(self): data = self.data.items() - return ((col, value) for col, value in data if value is not None ) - + return ((col, value) for col, value in data if value is not None) + def keys(self): return [key for key, _ in self.items()] - + def __iter__(self): return iter(self.keys()) - + def __contains__(self, key): return key in self.data - - -class ModelInput(ABC, UserDict): +class ModelInput(UserDict): """ModelInput Spefifies Input Types @@ -126,25 +135,25 @@ class ModelInput(ABC, UserDict): Inspired by HuggingFace Models and Tokenizers Developer: Reccommend for each Model, to have an assocaited ModelInput class + ALWAYS HAS A LABEL CATEGORY """ def __init__( self, labels: np.ndarray, - waveform: np.ndarray|None = None, - spectrogram: np.ndarray |None = None, + waveform: np.ndarray | None = None, + spectrogram: np.ndarray | None = None, ): self.waveform = waveform self.spectrogram = spectrogram self.labels = labels self.data = { - "labels": self.labels, - "waveform": self.waveform, - "spectrogram": self.spectrogram + "labels": self.labels, + "waveform": self.waveform, + "spectrogram": self.spectrogram, } self._main_keys = ["labels", "spectrogram", "waveform"] - def to_tensor(self, device="cpu"): self.waveform = Tensor(self.waveform, device=device) self.spectrogram = Tensor(self.spectrogram, device=device) @@ -152,32 +161,43 @@ def to_tensor(self, device="cpu"): def __len__(self) -> int: """ - Count the number of batches in this system + Count the number of batches in this system - returns batch_size int + returns batch_size int """ return len(self.labels) - + def __setitem__(self, key, value): if key in self._main_keys: self.__setattr__(key, value) self.data[key] = value - - # TODO: There might be a smarter way to do something like this... 
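The hand-rolled `__setitem__`/`items`/`keys` plumbing in these classes — and the TODO above conceding that `UserDict` should have covered most of it — boils down to one pattern: act like a dict but hide `None`-valued keys. A toy version of just that pattern, not the repo's class:

```python
from collections import UserDict

class NoneHidingDict(UserDict):
    """Dict-like container that skips keys whose value is None."""
    def items(self):
        return [(k, v) for k, v in self.data.items() if v is not None]
    def keys(self):
        return [k for k, _ in self.items()]

out = NoneHidingDict({"logits": [0.2, 0.8], "embeddings": None, "loss": 0.31})
print(out.keys())  # ['logits', 'loss'] -- the None-valued key is hidden
```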
+ def __getitem__(self, key): return self.__getattribute__(key) - + def __repr__(self): return str(self.data) - + def items(self): - data = super().items() - return ((col, value) for col, value in data if value is not None ) + data = self.data.items() + return ((col, value) for col, value in data if value is not None) + + def keys(self): + return [key for key, _ in self.items()] + + def __iter__(self): + return iter(self.keys()) + + def __contains__(self, key): + return key in self.data + + def get(self, key): + return self.__getattribute__(key) -""" -BaseModel Class for Whoot -""" class Model(BaseModel): + """ + BaseModel Class for Whoot + """ # TODO Define required class intance variables # Such as cirteron etc. def __init__(self, *args, **kwargs): diff --git a/whoot_model_training/whoot_model_training/models/timm_model.py b/whoot_model_training/whoot_model_training/models/timm_model.py index 435e083..de73a8e 100644 --- a/whoot_model_training/whoot_model_training/models/timm_model.py +++ b/whoot_model_training/whoot_model_training/models/timm_model.py @@ -1,11 +1,4 @@ -import timm -from torch import nn, Tensor -import numpy as np - -from .model import Model, ModelInput, ModelOutput, has_required_inputs - -""" - Wrapper around the timms model zoo +"""Wrapper around the timms model zoo See https://timm.fast.ai/ @@ -15,8 +8,18 @@ Great repo for models, but currently using this for demoing pipeline """ +import timm +from torch import nn, Tensor +import numpy as np + +from .model import Model, ModelInput, ModelOutput, has_required_inputs + class TimmInputs(ModelInput): + """Input for TimmModel's + + Spefifies TimmModels needs labels and spectrograms that are Tensors + """ def __init__(self, labels, waveform=None, spectrogram=None, device="cpu"): # # Can use inputs to verify correct shape for upstream model # assert spectrogram.shape[1:] == (1, 100, 100) @@ -26,6 +29,9 @@ def __init__(self, labels, waveform=None, spectrogram=None, device="cpu"): class TimmModel(nn.Module, Model): + """Model that uses a timm's model as its backbone with a linear layer for classification + """ + def __init__( self, timm_model="resnet34", @@ -34,6 +40,14 @@ def __init__( num_classes=6, loss=None, ): + """ + kwargs: + timm_model (str): name of model backbone from timms to use, Default: "resnet34" + pretrained (bool): use a pretrained model from timms, Default: True + in_chans (int): number of channels of audio: Default: 1 + num_classes (int): number of classes in the dataset: Default 6 + loss (any): custom loss function Default: BCEWithLogitsLoss + """ super().__init__() self.input_format = TimmInputs self.output_format = ModelOutput @@ -46,14 +60,15 @@ def __init__( # Unsure if 1000 is default for all models. Need to check this self.linear = nn.Linear(1000, num_classes) - # Models might need diffrent losses during training! + # Models might need different losses during training! 
if loss is not None: self.loss = loss else: self.loss = nn.BCEWithLogitsLoss() - @has_required_inputs() #data: TimmInputs + @has_required_inputs() # data: TimmInputs def forward(self, spectrogram, labels=None) -> ModelOutput: + # print(len(spectrogram)) # batch size embedd = self.backbone(spectrogram) logits = self.linear(embedd) loss = self.loss(logits, labels) diff --git a/whoot_model_training/whoot_model_training/preprocessors/__init__.py b/whoot_model_training/whoot_model_training/preprocessors/__init__.py index 66aa562..f15458e 100644 --- a/whoot_model_training/whoot_model_training/preprocessors/__init__.py +++ b/whoot_model_training/whoot_model_training/preprocessors/__init__.py @@ -1,2 +1,6 @@ -from .default_preprocessor import SpectrogramModelInputPreprocessors -from .spectrogram_preprocessors import BuowMelSpectrogramPreprocessors +from .default_preprocessor import ( + SpectrogramModelInputPreprocessors as SpectrogramModelInputPreprocessors, +) +from .spectrogram_preprocessors import ( + BuowMelSpectrogramPreprocessors as BuowMelSpectrogramPreprocessors, +) diff --git a/whoot_model_training/whoot_model_training/preprocessors/default_preprocessor.py b/whoot_model_training/whoot_model_training/preprocessors/default_preprocessor.py index 74a6750..62a7d6f 100644 --- a/whoot_model_training/whoot_model_training/preprocessors/default_preprocessor.py +++ b/whoot_model_training/whoot_model_training/preprocessors/default_preprocessor.py @@ -1,15 +1,23 @@ -from .spectrogram_preprocessors import BuowMelSpectrogramPreprocessors -from ..models.model import ModelInput +"""Default Class for Preprocessing the data + +The dataset is one thing, what we feed into the models is another +Models may require spectrograms, waveforms, etc +Not to mention any online augmentation we want to do -"""_summary_ +The preprocessor class defines a function to preprocess our data during training -Returns: - _type_: _description_ +The default preprocessor allows for many types of preprocessors to run, but it forces the output to fit +the ModelInput class structure. see `whoot_model_training\models\model.py` for more info. """ +from .spectrogram_preprocessors import BuowMelSpectrogramPreprocessors +from ..models.model import ModelInput class SpectrogramModelInputPreprocessors(BuowMelSpectrogramPreprocessors): + """ Defines a preprocessed that after formatting the audio passes a spectrogram + into a ModelInput object. 
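On the "[u]nsure if 1000 is default" comment above: timm can sidestep the hard-coded `nn.Linear(1000, ...)` head, because `num_classes=0` makes `create_model` return pooled backbone features whose width is exposed as `num_features`. A sketch under those assumptions, sized for the 256-mel, 3-second spectrograms used elsewhere in this patch:

```python
import timm
import torch
from torch import nn

# num_classes=0 -> the model returns pooled features instead of 1000 logits
backbone = timm.create_model("resnet34", pretrained=False, in_chans=1, num_classes=0)
head = nn.Linear(backbone.num_features, 6)  # 512 -> 6 classes for resnet34

spectrogram = torch.randn(2, 1, 256, 376)   # (batch, channels, n_mels, frames)
logits = head(backbone(spectrogram))
print(logits.shape)                          # torch.Size([2, 6])
```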
+ """ def __init__( self, ModelInput: ModelInput, @@ -36,6 +44,6 @@ def __init__( ) self.ModelInput = ModelInput - def __call__(self, batch): + def __call__(self, batch: dict) -> ModelInput: batch = super().__call__(batch) return self.ModelInput(labels=batch["labels"], spectrogram=batch["audio"]) diff --git a/whoot_model_training/whoot_model_training/preprocessors/spectrogram_preprocessors.py b/whoot_model_training/whoot_model_training/preprocessors/spectrogram_preprocessors.py index dde93b3..8ed2d68 100644 --- a/whoot_model_training/whoot_model_training/preprocessors/spectrogram_preprocessors.py +++ b/whoot_model_training/whoot_model_training/preprocessors/spectrogram_preprocessors.py @@ -8,6 +8,7 @@ from pyha_analyzer.preprocessors import PreProcessorBase + class BuowMelSpectrogramPreprocessors(PreProcessorBase): def __init__( self, @@ -15,21 +16,21 @@ def __init__( augment=None, spectrogram_augments=None, class_list=[], - n_fft=2048, - hop_length=256, - power=2.0, + n_fft=2048, + hop_length=256, + power=2.0, n_mels=256, dataset_ref=None, - ): + ): self.duration = duration self.augment = augment self.spectrogram_augments = spectrogram_augments # Below parameter defaults from https://arxiv.org/pdf/2403.10380 pg 25 - self.n_fft=n_fft - self.hop_length=hop_length - self.power=power - self.n_mels=n_mels + self.n_fft = n_fft + self.hop_length = hop_length + self.power = power + self.n_mels = n_mels super().__init__(name="MelSpectrogramPreprocessor") @@ -47,31 +48,35 @@ def __call__(self, batch): y = np.pad(y, end_sr - y.shape[-1]) # Audio Based Augmentations - if self.augment != None: - y, label = self.augment(y, sr, label) - + if self.augment is not None: + y, label = self.augment(y, sr, label) pillow_transforms = transforms.ToPILImage() - - mels = np.array( - pillow_transforms( - librosa.feature.melspectrogram( - y=y[int(start * sr) : end_sr], sr=sr, - n_fft=self.n_fft, - hop_length=self.hop_length, - power=self.power, - n_mels=self.n_mels, - ) - ), - np.float32)[np.newaxis, ::] / 255 + + mels = ( + np.array( + pillow_transforms( + librosa.feature.melspectrogram( + y=y[int(start * sr) : end_sr], + sr=sr, + n_fft=self.n_fft, + hop_length=self.hop_length, + power=self.power, + n_mels=self.n_mels, + ) + ), + np.float32, + )[np.newaxis, ::] + / 255 + ) if self.spectrogram_augments is not None: mels = self.spectrogram_augments(mels) new_audio.append(mels) new_labels.append(label) - + batch["audio"] = new_audio batch["labels"] = np.array(new_labels, dtype=np.float32) - return batch \ No newline at end of file + return batch From e25e508d19dc69f6763cff9cf744c775d131df1b Mon Sep 17 00:00:00 2001 From: Sean Perry Date: Fri, 20 Jun 2025 09:55:17 -0700 Subject: [PATCH 064/120] Apply better fix for model input and output * Adds metrics and successful training! Was not fully happy with the code, so I decided to rewrite my latest changes at home from scratch, and spend friday morning merging the changes together. This fixes model input / outputs to work with Hugging face more cleanly and allows for metrics to work! * Add training, Hide Training Artifacts --- .gitignore | 3 +- whoot_model_training/train.py | 10 +- .../whoot_model_training/models/model.py | 126 +++--------------- .../whoot_model_training/models/timm_model.py | 10 +- 4 files changed, 28 insertions(+), 121 deletions(-) diff --git a/.gitignore b/.gitignore index 3ba9a5f..103554e 100644 --- a/.gitignore +++ b/.gitignore @@ -221,4 +221,5 @@ test.ipynb buowset.ipynb # Question: do we want to commit vscode setting.json files? 
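To make the preprocessor's spectrogram settings concrete (n_fft=2048, hop_length=256, power=2.0, n_mels=256, per the paper cited in the code), here is the same librosa call on a silent 3-second clip at the dataset's 32 kHz rate — a standalone sketch, not the repo's code path:

```python
import librosa
import numpy as np

sr = 32_000
y = np.zeros(3 * sr, dtype=np.float32)  # 3 seconds of silence as a stand-in

mels = librosa.feature.melspectrogram(
    y=y, sr=sr, n_fft=2048, hop_length=256, power=2.0, n_mels=256
)
print(mels.shape)  # (256, 376): n_mels x frames for a 3 s clip
```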
-settings.json \ No newline at end of file +settings.json + diff --git a/whoot_model_training/train.py b/whoot_model_training/train.py index 466a7f6..a89be1e 100644 --- a/whoot_model_training/train.py +++ b/whoot_model_training/train.py @@ -85,10 +85,10 @@ def train(config_path): # OPTIONAL ARGS args.num_train_epochs = 2 - args.eval_steps = 10 + args.eval_steps = 20 + args.per_device_train_batch_size = 1 + args.per_device_eval_batch_size = 1 args.dataloader_num_workers = 36 - args.per_device_train_batch_size = 16 - args.per_device_eval_batch_size = 16 args.run_name = "testing" args.report_to = "comet_ml" # Blocks wandb @@ -101,10 +101,10 @@ def train(config_path): dataset=ds, training_args=args, logger=None, + ignore_keys=["predictions", "labels", "embeddings", "loss"] ) - + #trainer.train() print(trainer.evaluate(eval_dataset=ds["valid"], metric_key_prefix="TEST FOR METRICS")) - # trainer.train() diff --git a/whoot_model_training/whoot_model_training/models/model.py b/whoot_model_training/whoot_model_training/models/model.py index 113d261..a8e05b5 100644 --- a/whoot_model_training/whoot_model_training/models/model.py +++ b/whoot_model_training/whoot_model_training/models/model.py @@ -38,7 +38,6 @@ def wrapper(self, *args, **kwarg): return decorator -# TODO: Simplify, most of this should have been done by UserDict... class ModelOutput(dict, UserDict): """ModelOutput @@ -59,74 +58,21 @@ def __init__( labels: np.ndarray | None = None, loss: np.ndarray | None = None, ): - super(UserDict).__init__() - self._main_keys = ["logits", "embeddings", "labels", "loss"] - - self.logits = logits - self.embeddings = embeddings - self.labels = labels - self.loss = loss - self.data = { - "logits": self.logits, - "embeddings": self.embeddings, - "labels": self.labels, - "loss": self.loss, - } - if _map is not None: - for key, value in _map: - self[key] = value - - assert isinstance(self, dict) - - def to_hugging_face(self): - return { - "predictions": self.logits, - "label_ids": [self.labels], - } - - @classmethod - def concat(list_of_outputs: list): - return ModelOutput( - logits=torch.vstack([out.logits for out in list_of_outputs]), - embeddings=torch.vstack([out.embeddings for out in list_of_outputs]), - loss=torch.vstack([out.loss for out in list_of_outputs]), - labels=torch.vstack([out.labels for out in list_of_outputs]), - ) - - def __len__(self) -> int: - """ - Count the number of batches in this system - - returns batch_size int - """ - return len(self.labels) - - def __setitem__(self, key, value): - if key in self._main_keys: - self.__setattr__(key, value) - self.data[key] = value - - def __getitem__(self, key): - return self.__getattribute__(key) - - def __repr__(self): - return str(self.data) + super().__init__({ + "predictions": logits, + "logits": logits, + "labels": [labels], + # "label_ids": [labels], + "embeddings": embeddings, + "loss": loss + }) def items(self): - data = self.data.items() - return ((col, value) for col, value in data if value is not None) + return [(key, value) for (key, value) in super().items() if value is not None] - def keys(self): - return [key for key, _ in self.items()] - def __iter__(self): - return iter(self.keys()) +class ModelInput(UserDict, dict): - def __contains__(self, key): - return key in self.data - - -class ModelInput(UserDict): """ModelInput Spefifies Input Types @@ -144,55 +90,15 @@ def __init__( waveform: np.ndarray | None = None, spectrogram: np.ndarray | None = None, ): - self.waveform = waveform - self.spectrogram = spectrogram - self.labels = 
labels - self.data = { - "labels": self.labels, - "waveform": self.waveform, - "spectrogram": self.spectrogram, - } - self._main_keys = ["labels", "spectrogram", "waveform"] - - def to_tensor(self, device="cpu"): - self.waveform = Tensor(self.waveform, device=device) - self.spectrogram = Tensor(self.spectrogram, device=device) - self.labels = Tensor(self.labels, device=device) - - def __len__(self) -> int: - """ - Count the number of batches in this system - - returns batch_size int - """ - return len(self.labels) - - def __setitem__(self, key, value): - if key in self._main_keys: - self.__setattr__(key, value) - self.data[key] = value - - def __getitem__(self, key): - return self.__getattribute__(key) - - def __repr__(self): - return str(self.data) + super().__init__({ + "labels": labels, + "waveform": waveform, + "spectrogram": spectrogram + }) def items(self): - data = self.data.items() - return ((col, value) for col, value in data if value is not None) - - def keys(self): - return [key for key, _ in self.items()] + return [(key, value) for (key, value) in super().items() if value is not None] - def __iter__(self): - return iter(self.keys()) - - def __contains__(self, key): - return key in self.data - - def get(self, key): - return self.__getattribute__(key) class Model(BaseModel): """ diff --git a/whoot_model_training/whoot_model_training/models/timm_model.py b/whoot_model_training/whoot_model_training/models/timm_model.py index de73a8e..72f223b 100644 --- a/whoot_model_training/whoot_model_training/models/timm_model.py +++ b/whoot_model_training/whoot_model_training/models/timm_model.py @@ -66,11 +66,11 @@ def __init__( else: self.loss = nn.BCEWithLogitsLoss() - @has_required_inputs() # data: TimmInputs - def forward(self, spectrogram, labels=None) -> ModelOutput: - # print(len(spectrogram)) # batch size + + @has_required_inputs() #data: TimmInputs TODO FIX + def forward(self, labels=None, spectrogram=None) -> ModelOutput: embedd = self.backbone(spectrogram) logits = self.linear(embedd) loss = self.loss(logits, labels) - out = ModelOutput(logits=logits, embeddings=embedd, loss=loss, labels=labels) - return out + + return ModelOutput(logits=logits, embeddings=embedd, loss=loss, labels=labels) From 4ac3ed0613610bc496bfff1106b80b098c4ba8d6 Mon Sep 17 00:00:00 2001 From: Sean1572 Date: Fri, 20 Jun 2025 10:00:30 -0700 Subject: [PATCH 065/120] Make config easier to manage --- .gitignore | 3 +++ whoot_model_training/{ => configs}/config.yml | 0 2 files changed, 3 insertions(+) rename whoot_model_training/{ => configs}/config.yml (100%) diff --git a/.gitignore b/.gitignore index 103554e..e7924ec 100644 --- a/.gitignore +++ b/.gitignore @@ -223,3 +223,6 @@ buowset.ipynb # Question: do we want to commit vscode setting.json files? 
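The payoff of making ModelInput genuinely dict-like, as above, is that a Hugging Face-style trainer can unpack a batch straight into the keyword-only forward. A toy model showing that calling convention — `TinyModel` is illustrative, not part of the repo:

```python
import torch

class TinyModel(torch.nn.Module):
    # Keyword args match the ModelInput keys, so the trainer can call model(**inputs)
    def forward(self, labels=None, spectrogram=None):
        logits = spectrogram.mean(dim=(-1, -2))  # stand-in for a real backbone
        loss = torch.nn.functional.binary_cross_entropy_with_logits(logits, labels)
        return {"logits": logits, "loss": loss, "labels": labels}

inputs = {"labels": torch.ones(2, 1), "spectrogram": torch.randn(2, 1, 8, 8)}
out = TinyModel()(**inputs)
print(out["loss"].item())
```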
settings.json +# Block all configs besides the example config +whoot_model_training/configs +!whoot_model_training/configs/config.yml \ No newline at end of file diff --git a/whoot_model_training/config.yml b/whoot_model_training/configs/config.yml similarity index 100% rename from whoot_model_training/config.yml rename to whoot_model_training/configs/config.yml From 3a48a8714c1148ca42c19659b23debd9d42ce99f Mon Sep 17 00:00:00 2001 From: Sean1572 Date: Fri, 20 Jun 2025 10:35:38 -0700 Subject: [PATCH 066/120] Lint, Spell Check, and Documentation while I wait for the models to train --- whoot_model_training/README.md | 17 +++++++++++++- whoot_model_training/train.py | 6 ++--- .../data_extractor/buowset_extractor.py | 16 ++++++++++++- .../whoot_model_training/models/model.py | 18 +++++++-------- .../whoot_model_training/models/timm_model.py | 23 +++++++++++++++---- .../preprocessors/default_preprocessor.py | 21 +++++++++++++++++ 6 files changed, 82 insertions(+), 19 deletions(-) diff --git a/whoot_model_training/README.md b/whoot_model_training/README.md index d687647..c1d2176 100644 --- a/whoot_model_training/README.md +++ b/whoot_model_training/README.md @@ -1,3 +1,18 @@ Toolkit for training Machine Learning Classification Models over audio dataset -Key inspiration is https://github.com/UCSD-E4E/pyha-analyzer-2.0/tree/main. This repo differs in that it uses a traditional training pipeline rather than the Hugging Face Trainer. Hugging face trainer abstracts the training code, which should be explict for this toolkit. +Key inspiration is https://github.com/UCSD-E4E/pyha-analyzer-2.0/tree/main. This repo differs in that it uses a traditional training pipeline rather than the Hugging Face Trainer. Hugging face trainer abstracts the training code, which should be explicit for this toolkit. + + +# Install + +To set up environment for model training: + +1) run steps 1 - 3 of the installation instructions in `whoot/README.md` +2) For step 4, specifically run `pip install -e .[model_training, cu128/cpu]` + +# Running + +0) Add your Comet-ML API to your local environment. See +1) Create a copy of the config found in `configs/config.yml` and fill it out with your dataset +2) Edit train.py to set up training for your dataset. If you are using a new dataset which an extractor does not exist for, contact code authors. 
+3) run `python train.py path/to/your/config/file.yml`
diff --git a/whoot_model_training/train.py b/whoot_model_training/train.py
index a89be1e..b1017d4 100644
--- a/whoot_model_training/train.py
+++ b/whoot_model_training/train.py
@@ -86,8 +86,8 @@ def train(config):
     # OPTIONAL ARGS
     args.num_train_epochs = 2
     args.eval_steps = 20
-    args.per_device_train_batch_size = 1
-    args.per_device_eval_batch_size = 1
+    args.per_device_train_batch_size = 32
+    args.per_device_eval_batch_size = 32
     args.dataloader_num_workers = 36
     args.run_name = "testing"
     args.report_to = "comet_ml"  # Blocks wandb
@@ -103,7 +103,7 @@ def train(config):
         logger=None,
         ignore_keys=["predictions", "labels", "embeddings", "loss"]
     )
-    #trainer.train()
+    trainer.train()
 
     print(trainer.evaluate(eval_dataset=ds["valid"], metric_key_prefix="TEST FOR METRICS"))
 
 
diff --git a/whoot_model_training/whoot_model_training/data_extractor/buowset_extractor.py b/whoot_model_training/whoot_model_training/data_extractor/buowset_extractor.py
index 458d4fa..4ec5635 100644
--- a/whoot_model_training/whoot_model_training/data_extractor/buowset_extractor.py
+++ b/whoot_model_training/whoot_model_training/data_extractor/buowset_extractor.py
@@ -36,6 +36,21 @@ def buowset_extractor(
     sr=32_000,
     filepath="segment",
 ):
+    """Extracts raw data in the buowset format into an AudioDataset
+
+    Args:
+        metadata_csv (str): Path to the csv containing buowset metadata
+        parent_path (str): Path to the parent folder for all audio data.
+            Note it's assumed the audio filepath in the csv is relative to parent_path
+        output_path (str): Path to where the HF cache for this dataset should live
+        validation_fold (int): Which fold is considered the validation set. Default: 4
+        test_fold (int): Which fold is considered the test set. Default: 3
+        sr (int): Sample rate of the audio files. Default: 32_000
+        filepath (str): Name of the column in the dataset containing the filepaths. Default: "segment"
+
+    Returns:
+        (AudioDataset): See dataset.py; AudioDatasets are considered the universal dataset format for the training pipeline.
+    """
     # if os.path.exists(output_path):
     #     ds = load_from_disk(output_path)
     #     return AudioDataset(ds)
@@ -53,7 +68,6 @@ def buowset_extractor(
     )
 
     # Get audio into uniform format
-
     ds = ds.add_column(
         "audio", [os.path.join(parent_path, file) for file in ds[filepath]]
     )
diff --git a/whoot_model_training/whoot_model_training/models/model.py b/whoot_model_training/whoot_model_training/models/model.py
index a8e05b5..d0035be 100644
--- a/whoot_model_training/whoot_model_training/models/model.py
+++ b/whoot_model_training/whoot_model_training/models/model.py
@@ -43,11 +43,11 @@ class ModelOutput(dict, UserDict):
     Object that stores the output of a model
 
     This allows for standardizing model outputs
-    So upstream applications don't need to change for spefific models
+    So upstream applications don't need to change for specific models
 
     Inspired by HuggingFace Models
 
-    Developer: Reccommend for each Model, to have an assocaited ModelOutput class
+    Developer: recommended for each Model, to have an associated ModelOutput class
     """
 
     def __init__(
@@ -75,12 +75,12 @@ class ModelInput(UserDict, dict):
 
     """ModelInput
 
-    Spefifies Input Types
+    Specifies Input Types
     Hopefully should help standardize formatting for models
 
     Inspired by HuggingFace Models and Tokenizers
 
-    Developer: Reccommend for each Model, to have an assocaited ModelInput class
+    Developer: recommended for each Model, to have an associated ModelInput class
     ALWAYS HAS A LABEL CATEGORY
     """
 
@@ -101,18 +101,16 @@ def items(self):
 
 
 class Model(BaseModel):
+    """BaseModel Class for Whoot
     """
-    BaseModel Class for Whoot
-    """
-    # TODO Define required class intance variables
-    # Such as cirteron etc.
+    # TODO Define required class instance variables
+    # Such as criterion etc.
     def __init__(self, *args, **kwargs):
         self.input_format = ModelInput
         self.output_format = ModelOutput
         super().__init__(*args, **kwargs)
 
-    """
-    Gets an embedding for the model
+    """Gets an embedding for the model
 
     This can be the final layer of a model backbone
     or a set of useful features
diff --git a/whoot_model_training/whoot_model_training/models/timm_model.py b/whoot_model_training/whoot_model_training/models/timm_model.py
index 72f223b..1145074 100644
--- a/whoot_model_training/whoot_model_training/models/timm_model.py
+++ b/whoot_model_training/whoot_model_training/models/timm_model.py
@@ -18,7 +18,7 @@ class TimmInputs(ModelInput):
 
     """Input for TimmModel's
 
-    Spefifies TimmModels needs labels and spectrograms that are Tensors
+    Specifies that TimmModels need labels and spectrograms that are Tensors
     """
     def __init__(self, labels, waveform=None, spectrogram=None, device="cpu"):
         # # Can use inputs to verify correct shape for upstream model
@@ -40,7 +40,8 @@ def __init__(
         num_classes=6,
         loss=None,
     ):
-        """
+        """Init for TimmModel
+
         kwargs:
             timm_model (str): name of model backbone from timms to use, Default: "resnet34"
             pretrained (bool): use a pretrained model from timms, Default: True
@@ -54,13 +55,18 @@ def __init__(
 
         assert num_classes > 0
 
+        # Deep learning CNN backbone
         self.backbone = timm.create_model(
             timm_model, pretrained=pretrained, in_chans=in_chans
         )
-        # Unsure if 1000 is default for all models. Need to check this
+
+        # Unsure if 1000 is default for all timm models. Need to check this
         self.linear = nn.Linear(1000, num_classes)
 
-        # Models might need different losses during training!
+        ## different losses if you want to train for different problems
+        ## BCEWithLogitsLoss is the default, as in bioacoustics the problem tends to be multilabel:
+        ## the probability of class A occurring doesn't change the probability of class B —
+        ## many individuals can make calls at the same time!
         if loss is not None:
             self.loss = loss
         else:
             self.loss = nn.BCEWithLogitsLoss()
@@ -69,6 +75,15 @@ def __init__(
 
     @has_required_inputs() #data: TimmInputs TODO FIX
     def forward(self, labels=None, spectrogram=None) -> ModelOutput:
+        """Model forward function
+
+        Args:
+            labels (torch.Tensor): the ground truth labels for computing loss
+            spectrogram (torch.Tensor): spectrogram inputs to the model
+
+        Returns:
+            (ModelOutput): The model output (logits), latent-space representations (embeddings), loss, and labels.
+        """
         embedd = self.backbone(spectrogram)
         logits = self.linear(embedd)
         loss = self.loss(logits, labels)
diff --git a/whoot_model_training/whoot_model_training/preprocessors/default_preprocessor.py b/whoot_model_training/whoot_model_training/preprocessors/default_preprocessor.py
index 62a7d6f..03e23f9 100644
--- a/whoot_model_training/whoot_model_training/preprocessors/default_preprocessor.py
+++ b/whoot_model_training/whoot_model_training/preprocessors/default_preprocessor.py
@@ -31,6 +31,22 @@ def __init__(
         n_mels=256,
         dataset_ref=None,
     ):
+        """Creates an online preprocessor for mel-spectrogram-based models
+
+        Formats input into a specific ModelInput format.
+
+        Args:
+            ModelInput (ModelInput): How the model likes its input data formatted
+            duration (int): Length in seconds of input
+            augment (None): See TODO WORK ON AUGMENTATIONS
+            spectrogram_augments (None): TODO WORK ON AUGMENTATIONS
+            class_list (list): the classes we are working with (used for one-hot encoding)
+            n_fft (int): FFT window length
+            hop_length (int): hop length between frames
+            power (int): power, as defined by librosa
+            n_mels (int): number of mels for a mel spectrogram
+            dataset_ref (AudioDataset): an external ref to the rest of the dataset
+        """
         super().__init__(
             duration,
             augment,
@@ -45,5 +61,10 @@ def __init__(
         self.ModelInput = ModelInput
 
     def __call__(self, batch: dict) -> ModelInput:
+        """Processes a batch of AudioDataset rows
+
+        For this specific preprocessor, it creates a spectrogram, then
+        formats the data as a ModelInput
+        """
         batch = super().__call__(batch)
         return self.ModelInput(labels=batch["labels"], spectrogram=batch["audio"])

From a39a944cd960536c14e55139552e2d0a77909f8d Mon Sep 17 00:00:00 2001
From: Sean1572
Date: Fri, 20 Jun 2025 11:15:03 -0700
Subject: [PATCH 067/120] Add environment setup for config

Some Comet ML setup, plus settings for changing how many GPUs can be
used. Right now just one, since distributed training is not working.

Probably not putting API key here...
I don't want to have people push their APIs to github at all --- whoot_model_training/configs/config.yml | 7 ++++++- whoot_model_training/train.py | 22 ++++++++++++++-------- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/whoot_model_training/configs/config.yml b/whoot_model_training/configs/config.yml index 4fba81e..67599b1 100644 --- a/whoot_model_training/configs/config.yml +++ b/whoot_model_training/configs/config.yml @@ -1,3 +1,8 @@ +# Data paths metadata_csv: data/burrowing_owl_dataset/metadata.csv data_path: data/burrowing_owl_dataset/audio -hf_cache_path: data/burrowing_owl_dataset/cache/metadata.hf \ No newline at end of file +hf_cache_path: data/burrowing_owl_dataset/cache/metadata.hf + +# Required Variables +COMET_PROJECT_NAME: "whoot" +CUDA_VISIBLE_DEVICES: "0" #"0,1" \ No newline at end of file diff --git a/whoot_model_training/train.py b/whoot_model_training/train.py index b1017d4..aa26e03 100644 --- a/whoot_model_training/train.py +++ b/whoot_model_training/train.py @@ -24,7 +24,7 @@ ## TODO ALLOW USER TO SELECT THIS ## TODO MAKE DISTRIBUTED TRAINING POSSIBLE import os -os.environ["CUDA_VISIBLE_DEVICES"] = "0" + def parse_config(config_path: str) -> dict: """wrapper to parse config @@ -40,7 +40,7 @@ def parse_config(config_path: str) -> dict: return config -def train(config_path): +def train(config): """Highest level logic for training Does the following: @@ -51,10 +51,8 @@ def train(config_path): - Runs evaluation Args: - config_path (str): path to config file for training! + config (dict): the config used for training. Defined in yaml file """ - - config = parse_config(config_path) # Extract the dataset ds = buowset_extractor( @@ -64,7 +62,7 @@ def train(config_path): ) # Create the model - model = TimmModel(num_classes=ds.get_num_classes()) + model = TimmModel(timm_model="efficientnet_b0", num_classes=ds.get_num_classes()) # Preprocessors (No augmentation)! # We define here what the model reads @@ -89,7 +87,7 @@ def train(config_path): args.per_device_train_batch_size = 32 args.per_device_eval_batch_size = 32 args.dataloader_num_workers = 36 - args.run_name = "testing" + args.run_name = "efficientnet_b0" args.report_to = "comet_ml" # Blocks wandb @@ -107,9 +105,17 @@ def train(config_path): print(trainer.evaluate(eval_dataset=ds["valid"], metric_key_prefix="TEST FOR METRICS")) +def init_env(config: dict): + print(config) + os.environ["COMET_PROJECT_NAME"] = config["COMET_PROJECT_NAME"] + os.environ["CUDA_VISIBLE_DEVICES"] = config["CUDA_VISIBLE_DEVICES"] + if __name__ == "__main__": parser = argparse.ArgumentParser(description="Input config path") parser.add_argument("config", type=str, help="Path to config.yml") args = parser.parse_args() - train(args.config) + config = parse_config(args.config) + + init_env(config) + train(config) From 492495b646d7f4f580669ab385d96faee8b65093 Mon Sep 17 00:00:00 2001 From: Sean1572 Date: Fri, 20 Jun 2025 11:58:25 -0700 Subject: [PATCH 068/120] Add high level overview of repo --- whoot_model_training/README.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/whoot_model_training/README.md b/whoot_model_training/README.md index c1d2176..1da75e1 100644 --- a/whoot_model_training/README.md +++ b/whoot_model_training/README.md @@ -16,3 +16,18 @@ To set up environment for model training: 1) Create a copy of the config found in `configs/config.yml` and fill it out with your dataset 2) Edit train.py to set up training for your dataset. 
If you are using a new dataset which an extractor does not exist for, contact code authors.
 3) run `python train.py path/to/your/config/file.yml`
+
+# Repo Philosophy
+
+The most challenging issue in machine learning is the dataset. This training repo intends to make it easy to modularize parts of the training pipeline and integrate them together, ideally regardless of the dataset.
+
+The pipeline works in 5 parts (a condensed sketch follows this list):
+- Extractors: Extractors take in raw data and reformat it into `AudioDatasets`, apache-arrow data structures implemented via HuggingFace with common columns between any dataset. Every label is one_hot_encoded and treated as multilabel regardless of the problem. Audio filepaths are cast into [Audio columns](https://huggingface.co/docs/datasets/v3.6.0/en/package_reference/main_classes#datasets.Audio). Extractors are *unique for each dataset* but *uniform in the AudioDataset* they produce.
+
+- Preprocessors: Online preprocessors take rows in `AudioDatasets` and output `ModelInputs`, formatted data specific to a given model. Preprocessors read AudioDatasets and translate them so the Model can read them.
+
+- Models: Models have defined `ModelInput` and `ModelOutput` formats. All ModelInputs and ModelOutputs carry required common data so that the `PyhaTrainer` understands how to feed information to the Model and how to read information back from it. All models implement their own loss functions and return a loss given labels.
+
+- Augmentations: TODO
+
+- PyhaTrainer: With few exceptions unrelated to bioacoustic classification, all PyTorch training code is the same. The HuggingFace Trainer and its extension PyhaTrainer handle most training scripts you will ever write, so why not use them and focus on model design, dataset preprocessing, and cleaning? As long as the trainer knows how to feed data into a model (`AudioDatasets` and `Preprocessors`) and how to read it (`ModelOutputs`), it will have no issues.
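A condensed sketch of that five-part flow, stitched together from the names this repo already defines (it mirrors train.py; the paths are placeholders):

```python
from whoot_model_training.data_extractor import buowset_extractor
from whoot_model_training.models import TimmModel, TimmInputs
from whoot_model_training.preprocessors import SpectrogramModelInputPreprocessors

# Extractor: raw csv + wavs -> AudioDataset (placeholder paths)
ds = buowset_extractor(
    metadata_csv="metadata.csv", parent_path="audio/", output_path="cache/ds.hf"
)

# Preprocessor: AudioDataset rows -> TimmInputs (labels + spectrogram)
prep = SpectrogramModelInputPreprocessors(
    TimmInputs, duration=3, class_list=ds.get_class_labels()
)
for split in ("train", "valid", "test"):
    ds[split].set_transform(prep)

# Model: owns its loss and returns a ModelOutput the trainer can read
model = TimmModel(num_classes=ds.get_num_classes())
# PyhaTrainer(model=model, dataset=ds, training_args=...) then drives training.
```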
\ No newline at end of file From be03237ce15c63ae270e31ded63e5457bd9bbff4 Mon Sep 17 00:00:00 2001 From: Sean Perry Date: Fri, 20 Jun 2025 12:08:52 -0700 Subject: [PATCH 069/120] Add keep version in whoot/ __init__.py --- pyproject.toml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 0cf3211..3c5f28c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "whoot" -version = "0.0.2.dev0" +dynamic = ["version"] description = "Tools for capturing, analyzing, and parsing audio data" readme = "README.md" requires-python = ">=3.10" @@ -13,6 +13,9 @@ dependencies = [ "tqdm>=4.67.1", ] +[tool.setuptools.dynamic] +version = {attr = "whoot.__version__"} + [project.optional-dependencies] cpu = [ "torch>=2.7.0", From 1cfc2000614f0d30c4a0d9e322be038eea1fd6ca Mon Sep 17 00:00:00 2001 From: Sean Perry Date: Fri, 20 Jun 2025 15:19:36 -0700 Subject: [PATCH 070/120] Add leaderboard panel --- comet_ml_panels/leaderboard.py | 53 ++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 comet_ml_panels/leaderboard.py diff --git a/comet_ml_panels/leaderboard.py b/comet_ml_panels/leaderboard.py new file mode 100644 index 0000000..7e54f13 --- /dev/null +++ b/comet_ml_panels/leaderboard.py @@ -0,0 +1,53 @@ +# Comet Python Panels BETA, full documentation available at: +# https://www.comet.com/docs/v2/guides/comet-ui/experiment-management/visualizations/python-panel/ +# Code from original python template +# Modified by Sean Perry, 6/202/2025 +# TODO: FIGURE OUT HOW TO VERSION CONTROL THIS... + + +from comet_ml import API, APIExperiment, ui +import pandas as pd +# import plotly.express as px + +def get_max_metric(df, metric_col="metric"): + # Doing a simple groupby max removes extra useful metadata + # For example + # We may want to know the exact step we had the best score + # But a max groupby will only show the last step at the end + index = df[metric_col].argmax() + return df.iloc[index] + + +# Initialize Comet API +api = API() + +# Get available metrics and select one +available_metrics = ["train/valid_cMAP", "train/valid_ROCAUC"] +selected_metric = ui.dropdown("Select a metric:", available_metrics) + +# Fetch experiment data +experiment_keys = api.get_panel_experiment_keys() +if experiment_keys and selected_metric: + # Fetch the selected metric data for all experiments + metrics_df = api.get_metrics_df(experiment_keys, [selected_metric]) + + # Create Leaderboard View + leaderboard_df = metrics_df.groupby("experiment_key").apply( + lambda df: get_max_metric(df, selected_metric) + ).sort_values(by=selected_metric, ascending=False).reset_index(drop=True) + + leaderboard_df["users"] = leaderboard_df["experiment_key"].apply( + lambda key: APIExperiment(previous_experiment=key).get_user() + ) + + col_order = ["experiment_name", selected_metric, "experiment_key", "step", "users"] + + + + #api_experiment = comet_ml.APIExperiment(previous_experiment='EXPERIMENT-KEY') + #print(api_experiment.get_user()) + # + + ui.display(leaderboard_df[col_order]) +else: + ui.display("No data to plot. 
Make sure your metric data is logged by step.") From 9174cf6af776eaf93f973e416edd3e14379569cf Mon Sep 17 00:00:00 2001 From: Sean1572 Date: Fri, 20 Jun 2025 16:30:51 -0700 Subject: [PATCH 071/120] Add demo of the supplement to the comet ml logging --- .../whoot_model_training/logger.py | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 whoot_model_training/whoot_model_training/logger.py diff --git a/whoot_model_training/whoot_model_training/logger.py b/whoot_model_training/whoot_model_training/logger.py new file mode 100644 index 0000000..4417223 --- /dev/null +++ b/whoot_model_training/whoot_model_training/logger.py @@ -0,0 +1,22 @@ +import comet_ml + +class CometMLLoggerSupplement(): + """Note, that is working with the Trainer! + + The Trainer class implements their own CometML Callback during training + See https://github.com/huggingface/transformers/blob/2166b6b4ff09f6dd3867ab982f262f66482aa968/src/transformers/integrations/integration_utils.py#L1031 + This handles a lot but NOT ALL of the logging we want + + This class handles the last 10% of the logging we want such as + - Better dataset hashing + - git hash saving + - etc + """ + + def __init__(self, dataset_info, githash, ): + comet_ml.login() + self.experiment = comet_ml.start() + print("experiment key", self.experiment.id) + + #TODO add these logs to comet_ml + #TODO Check to make sure training doesn't create a new experiment \ No newline at end of file From b9711e978775199f9e95d6e4df87dce08aa11081 Mon Sep 17 00:00:00 2001 From: Sean1572 Date: Fri, 27 Jun 2025 10:30:41 -0700 Subject: [PATCH 072/120] Add better model checkpointing --- whoot_model_training/train.py | 17 ++++++++++------- .../whoot_model_training/trainer.py | 14 ++++++++++++++ 2 files changed, 24 insertions(+), 7 deletions(-) create mode 100644 whoot_model_training/whoot_model_training/trainer.py diff --git a/whoot_model_training/train.py b/whoot_model_training/train.py index aa26e03..383cb19 100644 --- a/whoot_model_training/train.py +++ b/whoot_model_training/train.py @@ -15,12 +15,13 @@ import argparse import yaml -from pyha_analyzer import PyhaTrainer, PyhaTrainingArguments - +from whoot_model_training.trainer import WhootTrainer, WhootTrainingArguments from whoot_model_training.data_extractor import buowset_extractor from whoot_model_training.models import TimmModel, TimmInputs from whoot_model_training.preprocessors import SpectrogramModelInputPreprocessors +import comet_ml + ## TODO ALLOW USER TO SELECT THIS ## TODO MAKE DISTRIBUTED TRAINING POSSIBLE import os @@ -62,7 +63,8 @@ def train(config): ) # Create the model - model = TimmModel(timm_model="efficientnet_b0", num_classes=ds.get_num_classes()) + run_name = "efficientnet_b1_redo_for_weights" + model = TimmModel(timm_model="efficientnet_b1", num_classes=ds.get_num_classes()) # Preprocessors (No augmentation)! 
# We define here what the model reads @@ -75,7 +77,7 @@ def train(config): ds["test"].set_transform(preprocessor) # Run training - args = PyhaTrainingArguments(working_dir="working_dir") + args = WhootTrainingArguments(run_name=run_name) # REQUIRED ARGS (DO NOT CHANGE VALUES TODO ADD TO TRAINER DIRECTLY) args.label_names = ["labels"] @@ -87,22 +89,23 @@ def train(config): args.per_device_train_batch_size = 32 args.per_device_eval_batch_size = 32 args.dataloader_num_workers = 36 - args.run_name = "efficientnet_b0" + args.run_name = run_name args.report_to = "comet_ml" # Blocks wandb print(args.accelerator_config.even_batches) - trainer = PyhaTrainer( + trainer = WhootTrainer( model=model, dataset=ds, training_args=args, logger=None, ignore_keys=["predictions", "labels", "embeddings", "loss"] ) + trainer.train() - print(trainer.evaluate(eval_dataset=ds["valid"], metric_key_prefix="TEST FOR METRICS")) + # print(trainer.evaluate(eval_dataset=ds["valid"], metric_key_prefix="TEST FOR METRICS")) def init_env(config: dict): diff --git a/whoot_model_training/whoot_model_training/trainer.py b/whoot_model_training/whoot_model_training/trainer.py new file mode 100644 index 0000000..4e6626d --- /dev/null +++ b/whoot_model_training/whoot_model_training/trainer.py @@ -0,0 +1,14 @@ +from datetime import datetime +import os + +from pyha_analyzer import PyhaTrainingArguments +from pyha_analyzer import PyhaTrainer as WhootTrainer +# In case we want to extend the current Trainer, lets temporarily create WhootTrainer! + +class WhootTrainingArguments(PyhaTrainingArguments): + def __init__(self, run_name): + DEFAULT_MODEL_CHECKPOINTS = "model_checkpoints" + checkpoint_created_at = datetime.now().strftime("%m_%d_%Y_%H:%M:%S") + super().__init__(os.path.join(f"{DEFAULT_MODEL_CHECKPOINTS}", + f"{run_name}_{checkpoint_created_at}")) + \ No newline at end of file From bde9b1200a3f24c86199729c6548bfbbe9b73308 Mon Sep 17 00:00:00 2001 From: Sean1572 Date: Fri, 27 Jun 2025 10:30:58 -0700 Subject: [PATCH 073/120] Update gitignore to hide model_checkpoints --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index e7924ec..a9a3190 100644 --- a/.gitignore +++ b/.gitignore @@ -213,7 +213,7 @@ uv.lock data # Model Storage -working_dir +model_checkpoints/* # testing/debugging notebooks From da30e615025b817276c527b4e123610bf937a75b Mon Sep 17 00:00:00 2001 From: Sean1572 Date: Fri, 27 Jun 2025 14:26:38 -0700 Subject: [PATCH 074/120] Added pipeline for augmentations --- whoot_model_training/train.py | 43 +++++++++++++++++-- .../whoot_model_training/__init__.py | 1 + .../data_extractor/buowset_extractor.py | 2 + .../whoot_model_training/logger.py | 5 ++- 4 files changed, 45 insertions(+), 6 deletions(-) create mode 100644 whoot_model_training/whoot_model_training/__init__.py diff --git a/whoot_model_training/train.py b/whoot_model_training/train.py index 383cb19..1ed8b38 100644 --- a/whoot_model_training/train.py +++ b/whoot_model_training/train.py @@ -18,7 +18,11 @@ from whoot_model_training.trainer import WhootTrainer, WhootTrainingArguments from whoot_model_training.data_extractor import buowset_extractor from whoot_model_training.models import TimmModel, TimmInputs +from whoot_model_training import CometMLLoggerSupplement + from whoot_model_training.preprocessors import SpectrogramModelInputPreprocessors +from pyha_analyzer.preprocessors import MixItUp, ComposeAudioLabel +from audiomentations import Compose, AddColorNoise, AddBackgroundNoise, PolarityInversion, Gain import 
comet_ml @@ -63,16 +67,45 @@ def train(config): ) # Create the model - run_name = "efficientnet_b1_redo_for_weights" + run_name = "efficientnet_b1_augmented_mixitup_gain" model = TimmModel(timm_model="efficientnet_b1", num_classes=ds.get_num_classes()) - # Preprocessors (No augmentation)! + # Preprocessors + + # Augmentations + # TODO: Design better system for saving and reproducing augmentation parameters + wav_augs = ComposeAudioLabel([ + # AddBackgroundNoise( #We don't have background noise yet... + # sounds_path="data_birdset/background_noise", + # min_snr_db=10, + # max_snr_db=30, + # noise_transform=PolarityInversion(), + # p=0.8 + # ), + Gain( + min_gain_db = -12, + max_gain_db = 12, + p = 0.8 + ), + MixItUp( + dataset_ref=ds["train"], + min_snr_db=10, + max_snr_db=30, + noise_transform=PolarityInversion(), + p=0.8 + ) + ]) + # We define here what the model reads + train_preprocessor = SpectrogramModelInputPreprocessors( + TimmInputs, duration=3, class_list=ds.get_class_labels(), augment=wav_augs + ) + preprocessor = SpectrogramModelInputPreprocessors( TimmInputs, duration=3, class_list=ds.get_class_labels() ) - ds["train"].set_transform(preprocessor) + ds["train"].set_transform(train_preprocessor) ds["valid"].set_transform(preprocessor) ds["test"].set_transform(preprocessor) @@ -100,7 +133,9 @@ def train(config): model=model, dataset=ds, training_args=args, - logger=None, + logger=CometMLLoggerSupplement( + augmentations = wav_augs + ), ignore_keys=["predictions", "labels", "embeddings", "loss"] ) diff --git a/whoot_model_training/whoot_model_training/__init__.py b/whoot_model_training/whoot_model_training/__init__.py new file mode 100644 index 0000000..2c99f3c --- /dev/null +++ b/whoot_model_training/whoot_model_training/__init__.py @@ -0,0 +1 @@ +from .logger import CometMLLoggerSupplement as CometMLLoggerSupplement \ No newline at end of file diff --git a/whoot_model_training/whoot_model_training/data_extractor/buowset_extractor.py b/whoot_model_training/whoot_model_training/data_extractor/buowset_extractor.py index 4ec5635..e75ce54 100644 --- a/whoot_model_training/whoot_model_training/data_extractor/buowset_extractor.py +++ b/whoot_model_training/whoot_model_training/data_extractor/buowset_extractor.py @@ -72,6 +72,8 @@ def buowset_extractor( "audio", [os.path.join(parent_path, file) for file in ds[filepath]] ) + ds = ds.add_column("filepath", ds["audio"]) + ds = ds.cast_column("audio", Audio(sampling_rate=sr)) # Create splits of the data diff --git a/whoot_model_training/whoot_model_training/logger.py b/whoot_model_training/whoot_model_training/logger.py index 4417223..aa0eb34 100644 --- a/whoot_model_training/whoot_model_training/logger.py +++ b/whoot_model_training/whoot_model_training/logger.py @@ -13,10 +13,11 @@ class CometMLLoggerSupplement(): - etc """ - def __init__(self, dataset_info, githash, ): + def __init__(self, augmentations): comet_ml.login() self.experiment = comet_ml.start() - print("experiment key", self.experiment.id) + + self.experiment.log_parameter("augmentations", augmentations) #TODO add these logs to comet_ml #TODO Check to make sure training doesn't create a new experiment \ No newline at end of file From ef0886e911432c118da4753afcca85d834363ed7 Mon Sep 17 00:00:00 2001 From: Sean1572 Date: Fri, 27 Jun 2025 16:18:29 -0700 Subject: [PATCH 075/120] Fixed missing run_name after adding custom logging system --- whoot_model_training/train.py | 3 ++- whoot_model_training/whoot_model_training/logger.py | 3 ++- 2 files changed, 4 insertions(+), 2 
deletions(-) diff --git a/whoot_model_training/train.py b/whoot_model_training/train.py index 1ed8b38..a822dc4 100644 --- a/whoot_model_training/train.py +++ b/whoot_model_training/train.py @@ -134,7 +134,8 @@ def train(config): dataset=ds, training_args=args, logger=CometMLLoggerSupplement( - augmentations = wav_augs + augmentations = wav_augs, + name = args.run_name ), ignore_keys=["predictions", "labels", "embeddings", "loss"] ) diff --git a/whoot_model_training/whoot_model_training/logger.py b/whoot_model_training/whoot_model_training/logger.py index aa0eb34..6ee621a 100644 --- a/whoot_model_training/whoot_model_training/logger.py +++ b/whoot_model_training/whoot_model_training/logger.py @@ -13,11 +13,12 @@ class CometMLLoggerSupplement(): - etc """ - def __init__(self, augmentations): + def __init__(self, augmentations, name): comet_ml.login() self.experiment = comet_ml.start() self.experiment.log_parameter("augmentations", augmentations) + self.experiment.set_name(name) #TODO add these logs to comet_ml #TODO Check to make sure training doesn't create a new experiment \ No newline at end of file From 0c2009008d11f2e66af2e7c200abbc95e7067eef Mon Sep 17 00:00:00 2001 From: Sumega Mandadi Date: Mon, 30 Jun 2025 11:52:26 -0700 Subject: [PATCH 076/120] Update dataframe format to store embeddings as list --- make_model/buowset/embed_to_df_birdnet.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/make_model/buowset/embed_to_df_birdnet.py b/make_model/buowset/embed_to_df_birdnet.py index 29f400e..58ca7dd 100644 --- a/make_model/buowset/embed_to_df_birdnet.py +++ b/make_model/buowset/embed_to_df_birdnet.py @@ -35,9 +35,10 @@ def obtain_birdnet_embeddings(embeds): filename = ntpath.basename(embed) filename = filename.replace(".birdnet.embeddings.txt", ".wav") dfb = pd.read_csv(embed, - delimiter="[,\t]", + delimiter="[\t]", engine='python', header=None) + dfb[2] = dfb[2].apply(lambda x: [float(i) for i in x.split(',') if i]) dfb_stripped = dfb.drop(dfb.columns[:2], axis=1) flattened = dfb_stripped.values.flatten() if len(flattened) > 1024: @@ -62,7 +63,7 @@ def merge_dfs(metadata, embed_dict): embed_df.index.name = 'segment' df_merged = metadata.merge(embed_df, on='segment') df_merged = df_merged.drop(columns=['segment_duration_s']) - + df_merged = df_merged.rename(columns={0: 'embedding'}) return df_merged From 1eb30c7de2eba83da9d40d486a5ca9d8c1ad81dd Mon Sep 17 00:00:00 2001 From: Sumega Mandadi Date: Mon, 30 Jun 2025 13:43:49 -0700 Subject: [PATCH 077/120] Update dataframe format to match birdnet --- make_model/prepare_perch_embeddings.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/make_model/prepare_perch_embeddings.py b/make_model/prepare_perch_embeddings.py index 393b030..20cfac6 100644 --- a/make_model/prepare_perch_embeddings.py +++ b/make_model/prepare_perch_embeddings.py @@ -64,6 +64,8 @@ def prepare_perch_embeddings(sqlite_dir, embeddings_df = pd.DataFrame(embeddings_data) merged_df = pd.merge(embeddings_df, metadata, on='segment') + merged_df = merged_df.drop('segment_duration_s', axis=1) + merged_df = merged_df[['segment', 'label', 'fold', 'embedding']] output_filename = os.path.join(output_dir, f'{embeddings_description}_perch_embeddings.pkl') merged_df.to_pickle(output_filename) From 019ef96cbd27531948098b5614f4de859d944ad2 Mon Sep 17 00:00:00 2001 From: Sumega Mandadi Date: Mon, 30 Jun 2025 15:45:17 -0700 Subject: [PATCH 078/120] Update to work with new standard embeddings format --- make_model/buowset/make_svm.py | 6 ++---- 1 file changed, 2 
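The two patches above converge on a single `embedding` column holding one Python list per segment. Judging from the `read_csv`/`apply` calls, each BirdNET embeddings row is tab-separated with the whole vector packed into one comma-joined field; a small sketch of that unpacking, with a made-up row:

    # Made-up BirdNET-style row: start time, end time, packed vector
    row = "0.0\t3.0\t0.12,0.03,0.98"

    start_s, end_s, packed = row.split("\t")
    embedding = [float(v) for v in packed.split(",") if v]
    print(embedding)  # [0.12, 0.03, 0.98]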
insertions(+), 4 deletions(-)

diff --git a/make_model/buowset/make_svm.py b/make_model/buowset/make_svm.py
index ed3024a..0d8c298 100644
--- a/make_model/buowset/make_svm.py
+++ b/make_model/buowset/make_svm.py
@@ -61,11 +61,9 @@ def make_x_and_y(embed_df):
     train_df = embed_df[embed_df['fold'].isin(TRAINING_FOLDS)]
     test_df = embed_df[embed_df['fold'].isin(TESTING_FOLDS)]
 
-    embedding_cols = embed_df.select_dtypes(include='float64').columns.tolist()
-
-    x_train = train_df[embedding_cols].values
+    x_train = list(train_df['embedding'].values)
     y_train = train_df['binary_label'].values
-    x_test = test_df[embedding_cols].values
+    x_test = list(test_df['embedding'].values)
     y_test = test_df['binary_label'].values
 
     return x_train, y_train, x_test, y_test

From 962da2642f70d4c6a4d32e159296f72b3f5f3cfb Mon Sep 17 00:00:00 2001
From: Katie Garwood
Date: Tue, 1 Jul 2025 13:50:43 -0700
Subject: [PATCH 079/120] fix pylint error of no columns

---
 make_model/buowset/embed_to_df_birdnet.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/make_model/buowset/embed_to_df_birdnet.py b/make_model/buowset/embed_to_df_birdnet.py
index 58ca7dd..e74d04f 100644
--- a/make_model/buowset/embed_to_df_birdnet.py
+++ b/make_model/buowset/embed_to_df_birdnet.py
@@ -39,7 +39,7 @@ def obtain_birdnet_embeddings(embeds):
                           engine='python',
                           header=None)
         dfb[2] = dfb[2].apply(lambda x: [float(i) for i in x.split(',') if i])
-        dfb_stripped = dfb.drop(dfb.columns[:2], axis=1)
+        dfb_stripped = dfb.iloc[:, 2:]
         flattened = dfb_stripped.values.flatten()
         if len(flattened) > 1024:
             print(f"filename {filename} has extra lines. Truncating")

From 52db6103d3aced57d972eba74f9963509bbc284e Mon Sep 17 00:00:00 2001
From: Sean1572
Date: Wed, 2 Jul 2025 11:31:23 -0700
Subject: [PATCH 080/120] Lint leaderboard.py

---
 comet_ml_panels/leaderboard.py | 44 +++++++++++++++++++++-------------
 1 file changed, 28 insertions(+), 16 deletions(-)

diff --git a/comet_ml_panels/leaderboard.py b/comet_ml_panels/leaderboard.py
index 7e54f13..8e3574b 100644
--- a/comet_ml_panels/leaderboard.py
+++ b/comet_ml_panels/leaderboard.py
@@ -1,13 +1,23 @@
-# Comet Python Panels BETA, full documentation available at:
-# https://www.comet.com/docs/v2/guides/comet-ui/experiment-management/visualizations/python-panel/
-# Code from original python template
-# Modified by Sean Perry, 6/202/2025
-# TODO: FIGURE OUT HOW TO VERSION CONTROL THIS...
+"""Creates the Leaderboard for Comet ML Panels
+
+This script queries from a given Comet ML project a DataFrame of
+model metrics at each step for each model in the project,
+then displays the top models.
+
+Note that updating this file does not update comet-ml. Please
+go into the project to update after pushing to GitHub.
+
+Example:
+    This is not intended to be run locally. Please test on Comet-ML.
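On the `make_x_and_y` change above: scikit-learn accepts a list of equal-length lists directly, though stacking into an array makes the 2-D shape explicit. A toy sketch with invented folds, labels, and two-dimensional embeddings:

    import numpy as np
    import pandas as pd
    from sklearn.svm import SVC

    df = pd.DataFrame({
        "embedding": [[0.1, 0.9], [0.8, 0.2], [0.2, 0.7], [0.9, 0.1]],
        "binary_label": [1, 0, 1, 0],
        "fold": [1, 1, 3, 3],
    })

    train, test = df[df["fold"] == 1], df[df["fold"] == 3]

    # np.stack turns the column of lists back into an (n, d) matrix
    clf = SVC().fit(np.stack(train["embedding"]), train["binary_label"])
    print(clf.score(np.stack(test["embedding"]), test["binary_label"]))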
+ +For Developers: + For more on adding to this see docs at + https://www.comet.com/docs/v2/guides/comet-ui/experiment-management/visualizations/python-panel/ + +""" from comet_ml import API, APIExperiment, ui -import pandas as pd -# import plotly.express as px + def get_max_metric(df, metric_col="metric"): # Doing a simple groupby max removes extra useful metadata @@ -25,7 +35,7 @@ def get_max_metric(df, metric_col="metric"): available_metrics = ["train/valid_cMAP", "train/valid_ROCAUC"] selected_metric = ui.dropdown("Select a metric:", available_metrics) -# Fetch experiment data +# Fetch experiment data experiment_keys = api.get_panel_experiment_keys() if experiment_keys and selected_metric: # Fetch the selected metric data for all experiments @@ -40,14 +50,16 @@ def get_max_metric(df, metric_col="metric"): lambda key: APIExperiment(previous_experiment=key).get_user() ) - col_order = ["experiment_name", selected_metric, "experiment_key", "step", "users"] - + col_order = [ + "experiment_name", + selected_metric, + "experiment_key", + "step", + "users" + ] - - #api_experiment = comet_ml.APIExperiment(previous_experiment='EXPERIMENT-KEY') - #print(api_experiment.get_user()) - # - ui.display(leaderboard_df[col_order]) else: - ui.display("No data to plot. Make sure your metric data is logged by step.") + ui.display( + "No data to plot. Make sure your metric data is logged by step." + ) From cb2531fa37a5fac8d2a86d0830d6f28cb663bcc1 Mon Sep 17 00:00:00 2001 From: Sean1572 Date: Wed, 2 Jul 2025 11:34:53 -0700 Subject: [PATCH 081/120] Add linter dev dependencies --- .flake8 | 2 + pylintrc | 415 +++++++++++++++++++++++++++++++++++++++++++++++++ pyproject.toml | 6 +- 3 files changed, 422 insertions(+), 1 deletion(-) create mode 100644 .flake8 create mode 100644 pylintrc diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..5d1d750 --- /dev/null +++ b/.flake8 @@ -0,0 +1,2 @@ +[flake8] +exclude = .venv/* diff --git a/pylintrc b/pylintrc new file mode 100644 index 0000000..70c1cc1 --- /dev/null +++ b/pylintrc @@ -0,0 +1,415 @@ +# This Pylint rcfile contains a best-effort configuration to uphold the +# best-practices and style described in the Google Python style guide: +# https://google.github.io/styleguide/pyguide.html +# +# Its canonical open-source location is: +# https://google.github.io/styleguide/pylintrc + +# Copyright 2018 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +[MAIN] + +# Files or directories to be skipped. They should be base names, not paths. +ignore=third_party + +# Files or directories matching the regex patterns are skipped. The regex +# matches against base names, not paths. +ignore-patterns= + +# Pickle collected data for later comparisons. +persistent=no + +# List of plugins (as comma separated values of python modules names) to load, +# usually to register additional checkers. +load-plugins= + +# Use multiple processes to speed up Pylint. +jobs=4 + +# Allow loading of arbitrary C extensions. 
Extensions are imported into the +# active Python interpreter and may run arbitrary code. +unsafe-load-any-extension=no + + +[MESSAGES CONTROL] + +# Only show warnings with the listed confidence levels. Leave empty to show +# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED +confidence= + +# Enable the message, report, category or checker with the given id(s). You can +# either give multiple identifier separated by comma (,) or put this option +# multiple time (only on the command line, not in the configuration file where +# it should appear only once). See also the "--disable" option for examples. +#enable= + +# Disable the message, report, category or checker with the given id(s). You +# can either give multiple identifiers separated by comma (,) or put this +# option multiple times (only on the command line, not in the configuration +# file where it should appear only once).You can also use "--disable=all" to +# disable everything first and then reenable specific checks. For example, if +# you want to run only the similarities checker, you can use "--disable=all +# --enable=similarities". If you want to run only the classes checker, but have +# no Warning level messages displayed, use"--disable=all --enable=classes +# --disable=W" +disable=R, + abstract-method, + apply-builtin, + arguments-differ, + attribute-defined-outside-init, + backtick, + bad-option-value, + basestring-builtin, + buffer-builtin, + c-extension-no-member, + consider-using-enumerate, + cmp-builtin, + cmp-method, + coerce-builtin, + coerce-method, + delslice-method, + div-method, + eq-without-hash, + execfile-builtin, + file-builtin, + filter-builtin-not-iterating, + fixme, + getslice-method, + global-statement, + hex-method, + idiv-method, + implicit-str-concat, + import-error, + import-self, + import-star-module-level, + input-builtin, + intern-builtin, + invalid-str-codec, + locally-disabled, + long-builtin, + long-suffix, + map-builtin-not-iterating, + misplaced-comparison-constant, + missing-function-docstring, + metaclass-assignment, + next-method-called, + next-method-defined, + no-absolute-import, + no-init, # added + no-member, + no-name-in-module, + no-self-use, + nonzero-method, + oct-method, + old-division, + old-ne-operator, + old-octal-literal, + old-raise-syntax, + parameter-unpacking, + print-statement, + raising-string, + range-builtin-not-iterating, + raw_input-builtin, + rdiv-method, + reduce-builtin, + relative-import, + reload-builtin, + round-builtin, + setslice-method, + signature-differs, + standarderror-builtin, + suppressed-message, + sys-max-int, + trailing-newlines, + unichr-builtin, + unicode-builtin, + unnecessary-pass, + unpacking-in-except, + useless-else-on-loop, + useless-suppression, + using-cmp-argument, + wrong-import-order, + xrange-builtin, + zip-builtin-not-iterating, + + +[REPORTS] + +# Set the output format. Available formats are text, parseable, colorized, msvs +# (visual studio) and html. You can also give a reporter class, eg +# mypackage.mymodule.MyReporterClass. +output-format=text + +# Tells whether to display a full report or only the messages +reports=no + +# Activate the evaluation score. +score=no + +# Python expression which should return a note less than 10 (10 is the highest +# note). You have access to the variables errors warning, statement which +# respectively contain the number of errors / warnings messages and the total +# number of statements analyzed. This is used by the global evaluation report +# (RP0004). 
+evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) + +# Template used to display messages. This is a python new-style format string +# used to format the message information. See doc for all details +#msg-template= + + +[BASIC] + +# Good variable names which should always be accepted, separated by a comma +good-names=main,_ + +# Bad variable names which should always be refused, separated by a comma +bad-names= + +# Colon-delimited sets of names that determine each other's naming style when +# the name regexes allow several styles. +name-group= + +# Include a hint for the correct naming format with invalid-name +include-naming-hint=no + +# List of decorators that produce properties, such as abc.abstractproperty. Add +# to this list to register other decorators that produce valid properties. +property-classes=abc.abstractproperty,cached_property.cached_property,cached_property.threaded_cached_property,cached_property.cached_property_with_ttl,cached_property.threaded_cached_property_with_ttl + +# Regular expression matching correct function names +function-rgx=^(?:(?PsetUp|tearDown|setUpModule|tearDownModule)|(?P_?[A-Z][a-zA-Z0-9]*)|(?P_?[a-z][a-z0-9_]*))$ + +# Regular expression matching correct variable names +variable-rgx=^[a-z][a-z0-9_]*$ + +# Regular expression matching correct constant names +const-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$ + +# Regular expression matching correct attribute names +attr-rgx=^_{0,2}[a-z][a-z0-9_]*$ + +# Regular expression matching correct argument names +argument-rgx=^[a-z][a-z0-9_]*$ + +# Regular expression matching correct class attribute names +class-attribute-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$ + +# Regular expression matching correct inline iteration names +inlinevar-rgx=^[a-z][a-z0-9_]*$ + +# Regular expression matching correct class names +class-rgx=^_?[A-Z][a-zA-Z0-9]*$ + +# Regular expression matching correct module names +module-rgx=^(_?[a-z][a-z0-9_]*|__init__)$ + +# Regular expression matching correct method names +method-rgx=(?x)^(?:(?P_[a-z0-9_]+__|runTest|setUp|tearDown|setUpTestCase|tearDownTestCase|setupSelf|tearDownClass|setUpClass|(test|assert)_*[A-Z0-9][a-zA-Z0-9_]*|next)|(?P_{0,2}[A-Z][a-zA-Z0-9_]*)|(?P_{0,2}[a-z][a-z0-9_]*))$ + +# Regular expression which should only match function or class names that do +# not require a docstring. +no-docstring-rgx=(__.*__|main|test.*|.*test|.*Test)$ + +# Minimum line length for functions/classes that require docstrings, shorter +# ones are exempt. +docstring-min-length=12 + + +[TYPECHECK] + +# List of decorators that produce context managers, such as +# contextlib.contextmanager. Add to this list to register other decorators that +# produce valid context managers. +contextmanager-decorators=contextlib.contextmanager,contextlib2.contextmanager + +# List of module names for which member attributes should not be checked +# (useful for modules/projects where namespaces are manipulated during runtime +# and thus existing member attributes cannot be deduced by static analysis. It +# supports qualified module names, as well as Unix pattern matching. +ignored-modules= + +# List of class names for which member attributes should not be checked (useful +# for classes with dynamically set attributes). This supports the use of +# qualified names. 
+ignored-classes=optparse.Values,thread._local,_thread._local + +# List of members which are set dynamically and missed by pylint inference +# system, and so shouldn't trigger E1101 when accessed. Python regular +# expressions are accepted. +generated-members= + + +[FORMAT] + +# Maximum number of characters on a single line. +max-line-length=80 + +# TODO(https://github.com/pylint-dev/pylint/issues/3352): Direct pylint to exempt +# lines made too long by directives to pytype. + +# Regexp for a line that is allowed to be longer than the limit. +ignore-long-lines=(?x)( + ^\s*(\#\ )??$| + ^\s*(from\s+\S+\s+)?import\s+.+$) + +# Allow the body of an if to be on the same line as the test if there is no +# else. +single-line-if-stmt=yes + +# Maximum number of lines in a module +max-module-lines=99999 + +# String used as indentation unit. The internal Google style guide mandates 2 +# spaces. Google's externaly-published style guide says 4, consistent with +# PEP 8. +indent-string=' ' + +# Number of spaces of indent required inside a hanging or continued line. +indent-after-paren=4 + +# Expected format of line ending, e.g. empty (any line ending), LF or CRLF. +expected-line-ending-format= + + +[MISCELLANEOUS] + +# List of note tags to take in consideration, separated by a comma. +notes=TODO + + +[STRING] + +# This flag controls whether inconsistent-quotes generates a warning when the +# character used as a quote delimiter is used inconsistently within a module. +check-quote-consistency=yes + + +[VARIABLES] + +# Tells whether we should check for unused import in __init__ files. +init-import=no + +# A regular expression matching the name of dummy variables (i.e. expectedly +# not used). +dummy-variables-rgx=^\*{0,2}(_$|unused_|dummy_) + +# List of additional names supposed to be defined in builtins. Remember that +# you should avoid to define new builtins when possible. +additional-builtins= + +# List of strings which can identify a callback function by name. A callback +# name must start or end with one of those strings. +callbacks=cb_,_cb + +# List of qualified module names which can have objects that can redefine +# builtins. +redefining-builtins-modules=six,six.moves,past.builtins,future.builtins,functools + + +[LOGGING] + +# Logging modules to check that the string format arguments are in logging +# function parameter format +logging-modules=logging,absl.logging,tensorflow.io.logging + + +[SIMILARITIES] + +# Minimum lines number of a similarity. +min-similarity-lines=4 + +# Ignore comments when computing similarities. +ignore-comments=yes + +# Ignore docstrings when computing similarities. +ignore-docstrings=yes + +# Ignore imports when computing similarities. +ignore-imports=no + + +[SPELLING] + +# Spelling dictionary name. Available dictionaries: none. To make it working +# install python-enchant package. +spelling-dict= + +# List of comma separated words that should not be checked. +spelling-ignore-words= + +# A path to a file that contains private dictionary; one word per line. +spelling-private-dict-file= + +# Tells whether to store unknown words to indicated private dictionary in +# --spelling-private-dict-file option instead of raising a message. +spelling-store-unknown-words=no + + +[IMPORTS] + +# Deprecated modules which should not be used, separated by a comma +deprecated-modules=regsub, + TERMIOS, + Bastion, + rexec, + sets + +# Create a graph of every (i.e. 
internal and external) dependencies in the
+# given file (report RP0402 must not be disabled)
+import-graph=
+
+# Create a graph of external dependencies in the given file (report RP0402 must
+# not be disabled)
+ext-import-graph=
+
+# Create a graph of internal dependencies in the given file (report RP0402 must
+# not be disabled)
+int-import-graph=
+
+# Force import order to recognize a module as part of the standard
+# compatibility libraries.
+known-standard-library=
+
+# Force import order to recognize a module as part of a third party library.
+known-third-party=enchant, absl
+
+# Analyse import fallback blocks. This can be used to support both Python 2 and
+# 3 compatible code, which means that the block might have code that exists
+# only in one or another interpreter, leading to false positives when analysed.
+analyse-fallback-blocks=no
+
+
+[CLASSES]
+
+# List of method names used to declare (i.e. assign) instance attributes.
+defining-attr-methods=__init__,
+                      __new__,
+                      setUp
+
+# List of member names, which should be excluded from the protected access
+# warning.
+exclude-protected=_asdict,
+                  _fields,
+                  _replace,
+                  _source,
+                  _make
+
+# List of valid names for the first argument in a class method.
+valid-classmethod-first-arg=cls,
+                            class_
+
+# List of valid names for the first argument in a metaclass class method.
+valid-metaclass-classmethod-first-arg=mcs
diff --git a/pyproject.toml b/pyproject.toml
index 3c5f28c..968d6ee 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -25,9 +25,13 @@ cu128 = [
     "torch>=2.7.0",
     "torchvision>=0.22.0",
 ]
+dev = [
+    "flake8>=7.3.0",
+    "pylint>=3.3.7",
+]
 
 [packages.index]
 cu128 = "https://download.pytorch.org/whl/cu128"
 
 [tool.setuptools]
-packages = ["make_model", "assess_birdnet"]
\ No newline at end of file
+packages = ["make_model", "assess_birdnet"]

From c110816b74e3a55dd4c23f862239b666482aa968 Mon Sep 17 00:00:00 2001
From: Sean1572
Date: Wed, 2 Jul 2025 11:43:07 -0700
Subject: [PATCH 082/120] Add linting docs

---
 README.md | 16 ++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index d47a6d8..cc04d65 100644
--- a/README.md
+++ b/README.md
@@ -21,9 +21,21 @@ Current support optional dependency collections include
 
 - `cpu`: Installs torch and torchvision for CPU use only
 - `cu128`: Installs torch and torchvision with Cuda 12.8 Binaries
+- `dev`: Installs linters pylint and flake8. MUST be used by developers of whoot
 
+# Developer Notes
 
-## Developer Notes
+## Creating a new Project
 
-When adding a new package, like `assess_birdnet` to the whoot toolkit, add your package name to the `[tool.setuptools]` section of `pyproject.toml`
\ No newline at end of file
+When adding a new package, like `assess_birdnet` to the whoot toolkit, add your package name to the `[tool.setuptools]` section of `pyproject.toml`
+
+### Linting
+
+Style guidelines are listed in `.flake8` and `pylintrc`. To use these tools, do the following:
+
+1) Follow the Installation Instructions; at the pip install step, run `pip install -e .[dev,extra1,extra2,...]`.
+2) Activate the environment.
+
+To run the linters, run `python -m flake8` and `python -m pylint --recursive=y PATH/TO/FILES.py`.
+In order to contribute to whoot, both linters must pass.
\ No newline at end of file From d881195f8daa05638a3e1fc99496069224fd0a01 Mon Sep 17 00:00:00 2001 From: Sean1572 Date: Wed, 2 Jul 2025 14:17:35 -0700 Subject: [PATCH 083/120] Added confusion matrix --- .../whoot_model_training/metrics.py | 32 ++++++++++++++++ .../whoot_model_training/trainer.py | 38 +++++++++++++++++-- 2 files changed, 66 insertions(+), 4 deletions(-) create mode 100644 whoot_model_training/whoot_model_training/metrics.py diff --git a/whoot_model_training/whoot_model_training/metrics.py b/whoot_model_training/whoot_model_training/metrics.py new file mode 100644 index 0000000..efc96fe --- /dev/null +++ b/whoot_model_training/whoot_model_training/metrics.py @@ -0,0 +1,32 @@ +import comet_ml +import torch +from sklearn.metrics import confusion_matrix + +from pyha_analyzer.metrics.classification_metrics import AudioClassificationMetrics + +class WhootMutliClassMetrics(AudioClassificationMetrics): + def __init__(self, classes:list): + self.classes = classes + super().__init__([], len(classes), mutlilabel=True) + + def __call__(self, eval_pred) -> dict[str, float]: + # CMAP / ROCAUC + initial_metrics = super().__call__(eval_pred=eval_pred) + + # For metrics that are not loggable to console + # We can only have comet_ml for these metrics + experiment = comet_ml.get_running_experiment() #TODO CLEAN THIS UP WITH SAVING EXPERIMENT KEY + if experiment is None: + return initial_metrics + logits = torch.Tensor(eval_pred.predictions) + target = torch.Tensor(eval_pred.label_ids).to(torch.long) + + # Confusion Matrix WARNING, ONLY USE IF DATA IS MOSTLY MUTLICLASS + cm = confusion_matrix(torch.argmax(target, dim=1), torch.argmax(logits, dim=1)) + experiment.log_confusion_matrix(matrix=cm.tolist(), labels=self.classes) + + # Return the metrics that can be logged to console AND comet-ml + return initial_metrics + + + \ No newline at end of file diff --git a/whoot_model_training/whoot_model_training/trainer.py b/whoot_model_training/whoot_model_training/trainer.py index 4e6626d..9b18049 100644 --- a/whoot_model_training/whoot_model_training/trainer.py +++ b/whoot_model_training/whoot_model_training/trainer.py @@ -2,13 +2,43 @@ import os from pyha_analyzer import PyhaTrainingArguments -from pyha_analyzer import PyhaTrainer as WhootTrainer -# In case we want to extend the current Trainer, lets temporarily create WhootTrainer! +from pyha_analyzer import PyhaTrainer + +from .metrics import WhootMutliClassMetrics +from .dataset import AudioDataset + class WhootTrainingArguments(PyhaTrainingArguments): def __init__(self, run_name): DEFAULT_MODEL_CHECKPOINTS = "model_checkpoints" checkpoint_created_at = datetime.now().strftime("%m_%d_%Y_%H:%M:%S") - super().__init__(os.path.join(f"{DEFAULT_MODEL_CHECKPOINTS}", + super().__init__(os.path.join(f"{DEFAULT_MODEL_CHECKPOINTS}", f"{run_name}_{checkpoint_created_at}")) - \ No newline at end of file + + +class WhootTrainer(PyhaTrainer): + def __init__( + self, + model, + dataset: AudioDataset, + training_args=None, + logger=None, + data_collator=None, + preprocessor=None, + ignore_keys=... + ): + + metrics = WhootMutliClassMetrics(dataset.get_class_labels().names) + + print("LOGGING NEW METRICS... 
HOPEFULLY") + + super().__init__( + model, + dataset, + metrics, + training_args, + logger, + data_collator, + preprocessor, + ignore_keys + ) From 67982cd52cdca40fa2dd61008c3556c7898d9724 Mon Sep 17 00:00:00 2001 From: Sean1572 Date: Wed, 2 Jul 2025 16:16:58 -0700 Subject: [PATCH 084/120] Linted --- whoot_model_training/train.py | 136 ++++++++++-------- .../whoot_model_training/__init__.py | 7 +- .../data_extractor/__init__.py | 2 + .../data_extractor/buowset_extractor.py | 31 ++-- .../whoot_model_training/dataset.py | 39 +++-- .../whoot_model_training/logger.py | 6 +- .../whoot_model_training/metrics.py | 27 ++-- .../whoot_model_training/models/__init__.py | 10 +- .../whoot_model_training/models/model.py | 74 +++++----- .../whoot_model_training/models/timm_model.py | 48 ++++--- .../preprocessors/__init__.py | 9 +- .../preprocessors/default_preprocessor.py | 43 +++--- .../spectrogram_preprocessors.py | 30 ++-- .../whoot_model_training/trainer.py | 7 +- 14 files changed, 279 insertions(+), 190 deletions(-) diff --git a/whoot_model_training/train.py b/whoot_model_training/train.py index a822dc4..143aee1 100644 --- a/whoot_model_training/train.py +++ b/whoot_model_training/train.py @@ -4,14 +4,15 @@ models and datasets to create any model for bioacoustic classification It is intended this script to be heavily modified with each experiment -(say one wants to use a different dataset, one should copy this and change the extractor!) +(say one wants to use a different dataset, one should copy this and change the +extractor!) Usage: $ python train.py /path/to/config.yml config.yml should contain frequently changed hyperparameters """ - +import os import argparse import yaml @@ -20,27 +21,33 @@ from whoot_model_training.models import TimmModel, TimmInputs from whoot_model_training import CometMLLoggerSupplement -from whoot_model_training.preprocessors import SpectrogramModelInputPreprocessors -from pyha_analyzer.preprocessors import MixItUp, ComposeAudioLabel -from audiomentations import Compose, AddColorNoise, AddBackgroundNoise, PolarityInversion, Gain +from whoot_model_training.preprocessors import ( + SpectrogramModelInputPreprocessors +) +# from pyha_analyzer.preprocessors import MixItUp, ComposeAudioLabel +# from audiomentations import ( +# Compose, AddColorNoise, +# AddBackgroundNoise, PolarityInversion, Gain +# ) + +# import comet_ml -import comet_ml +# TODO ALLOW USER TO SELECT THIS +# TODO MAKE DISTRIBUTED TRAINING POSSIBLE -## TODO ALLOW USER TO SELECT THIS -## TODO MAKE DISTRIBUTED TRAINING POSSIBLE -import os def parse_config(config_path: str) -> dict: """wrapper to parse config - Args: + Args: config_path (str): path to config file for training! - - returns: - (dict): hyperparameters parameters + + returns: + (dict): hyperparameters parameters """ - with open(config_path, "r") as f: + config = {} + with open(config_path, "r", encoding="UTF-8") as f: config = yaml.safe_load(f) return config @@ -53,9 +60,9 @@ def train(config): - Prepares preprocessing for each audio clip - Builds the model - Configures and runs the trainer - - Runs evaluation + - Runs evaluation - Args: + Args: config (dict): the config used for training. 
Defined in yaml file """ @@ -67,38 +74,39 @@ def train(config): ) # Create the model - run_name = "efficientnet_b1_augmented_mixitup_gain" - model = TimmModel(timm_model="efficientnet_b1", num_classes=ds.get_num_classes()) + run_name = "efficientnet_b1_testing_confusion_matrix_no_data_aug" + model = TimmModel(timm_model="efficientnet_b1", + num_classes=ds.get_num_classes()) # Preprocessors # Augmentations - # TODO: Design better system for saving and reproducing augmentation parameters - wav_augs = ComposeAudioLabel([ - # AddBackgroundNoise( #We don't have background noise yet... - # sounds_path="data_birdset/background_noise", - # min_snr_db=10, - # max_snr_db=30, - # noise_transform=PolarityInversion(), - # p=0.8 - # ), - Gain( - min_gain_db = -12, - max_gain_db = 12, - p = 0.8 - ), - MixItUp( - dataset_ref=ds["train"], - min_snr_db=10, - max_snr_db=30, - noise_transform=PolarityInversion(), - p=0.8 - ) - ]) + # TODO: Design better system for saving and reproducing augmentation + # wav_augs = ComposeAudioLabel([ + # # AddBackgroundNoise( #We don't have background noise yet... + # # sounds_path="data_birdset/background_noise", + # # min_snr_db=10, + # # max_snr_db=30, + # # noise_transform=PolarityInversion(), + # # p=0.8 + # # ), + # Gain( + # min_gain_db = -12, + # max_gain_db = 12, + # p = 0.8 + # ), + # MixItUp( + # dataset_ref=ds["train"], + # min_snr_db=10, + # max_snr_db=30, + # noise_transform=PolarityInversion(), + # p=0.8 + # ) + # ]) # We define here what the model reads train_preprocessor = SpectrogramModelInputPreprocessors( - TimmInputs, duration=3, class_list=ds.get_class_labels(), augment=wav_augs + TimmInputs, duration=3, class_list=ds.get_class_labels() ) preprocessor = SpectrogramModelInputPreprocessors( @@ -110,41 +118,43 @@ def train(config): ds["test"].set_transform(preprocessor) # Run training - args = WhootTrainingArguments(run_name=run_name) - + training_args = WhootTrainingArguments(run_name=run_name) + # REQUIRED ARGS (DO NOT CHANGE VALUES TODO ADD TO TRAINER DIRECTLY) - args.label_names = ["labels"] - args.remove_unused_columns = False + training_args.label_names = ["labels"] + training_args.remove_unused_columns = False # OPTIONAL ARGS - args.num_train_epochs = 2 - args.eval_steps = 20 - args.per_device_train_batch_size = 32 - args.per_device_eval_batch_size = 32 - args.dataloader_num_workers = 36 - args.run_name = run_name - args.report_to = "comet_ml" # Blocks wandb - + training_args.num_train_epochs = 2 + training_args.eval_steps = 20 + training_args.per_device_train_batch_size = 32 + training_args.per_device_eval_batch_size = 32 + training_args.dataloader_num_workers = 36 + training_args.run_name = run_name + training_args.report_to = "comet_ml" - print(args.accelerator_config.even_batches) - + print(training_args.accelerator_config.even_batches) trainer = WhootTrainer( model=model, dataset=ds, - training_args=args, + training_args=training_args, logger=CometMLLoggerSupplement( - augmentations = wav_augs, - name = args.run_name + augmentations=None, + name=training_args.run_name ), ignore_keys=["predictions", "labels", "embeddings", "loss"] ) trainer.train() - # print(trainer.evaluate(eval_dataset=ds["valid"], metric_key_prefix="TEST FOR METRICS")) - + def init_env(config: dict): + """Sets up local environment for COMET-ML training logging + + Args: config (dict): at a minimum this has the project name + and CUDA devices that are allowed to be used. 
+ """ print(config) os.environ["COMET_PROJECT_NAME"] = config["COMET_PROJECT_NAME"] os.environ["CUDA_VISIBLE_DEVICES"] = config["CUDA_VISIBLE_DEVICES"] @@ -154,7 +164,7 @@ def init_env(config: dict): parser = argparse.ArgumentParser(description="Input config path") parser.add_argument("config", type=str, help="Path to config.yml") args = parser.parse_args() - config = parse_config(args.config) + _config = parse_config(args.config) - init_env(config) - train(config) + init_env(_config) + train(_config) diff --git a/whoot_model_training/whoot_model_training/__init__.py b/whoot_model_training/whoot_model_training/__init__.py index 2c99f3c..f8922cc 100644 --- a/whoot_model_training/whoot_model_training/__init__.py +++ b/whoot_model_training/whoot_model_training/__init__.py @@ -1 +1,6 @@ -from .logger import CometMLLoggerSupplement as CometMLLoggerSupplement \ No newline at end of file +"""Logging Toolkit for different MLops platforms +""" + +from .logger import CometMLLoggerSupplement as CometMLLoggerSupplement + +__all__ = ["CometMLLoggerSupplement"] diff --git a/whoot_model_training/whoot_model_training/data_extractor/__init__.py b/whoot_model_training/whoot_model_training/data_extractor/__init__.py index 365cd63..1418f67 100644 --- a/whoot_model_training/whoot_model_training/data_extractor/__init__.py +++ b/whoot_model_training/whoot_model_training/data_extractor/__init__.py @@ -1 +1,3 @@ from .buowset_extractor import buowset_extractor as buowset_extractor + +__all__ = ["buowset_extractor"] diff --git a/whoot_model_training/whoot_model_training/data_extractor/buowset_extractor.py b/whoot_model_training/whoot_model_training/data_extractor/buowset_extractor.py index e75ce54..355267a 100644 --- a/whoot_model_training/whoot_model_training/data_extractor/buowset_extractor.py +++ b/whoot_model_training/whoot_model_training/data_extractor/buowset_extractor.py @@ -1,18 +1,26 @@ """Standardizes the format of the buowset dataset -Inspired by https://github.com/UCSD-E4E/pyha-analyzer-2.0/tree/main/pyha_analyzer/extractors +Inspired by https://github.com/UCSD-E4E/pyha-analyzer-2.0/ + tree/main/pyha_analyzer/extractors -The idea being extractors is that they take raw data, and +The idea being extractors is that they take raw data, and format it into a uniform dataset format, AudioDataset -This way, it should be easier to define what a common audio dataset format is between +This way, it should be easier to define what a +common audio dataset format is between parts of the codebase for training """ import os import numpy as np -from datasets import load_dataset, Audio, DatasetDict, ClassLabel, Sequence, load_from_disk +from datasets import ( + load_dataset, + Audio, + DatasetDict, + ClassLabel, + Sequence, +) from ..dataset import AudioDataset @@ -27,6 +35,7 @@ def one_hot_encode(row: dict, classes: list): row["labels"] = np.array(one_hot, dtype=float) return row + def buowset_extractor( metadata_csv, parent_path, @@ -40,16 +49,20 @@ def buowset_extractor( Args: Metdata_csv (str): Path to csv containing buowset metadata - parent_path (str): Path to the parent folder for all audio data. - Note its assumed the audio filepath in the csv is relative to parent_path + parent_path (str): Path to the parent folder for all audio data. 
+ Note its assumed the audio filepath + in the csv is relative to parent_path output_path (str): Path to where HF cache for this dataset should live - validation_fold (int): which fold is considered the validation set Default 4 + validation_fold (int): which fold is considered the validation set + Default 4 test_fold (int): Which fold is considered the test set Default 3 sr (int): Sample Rate of the audio files Default: 32_000 - filepath (str): Name of the column in the dataset containing the filepaths Default: segment + filepath (str): Name of the column in the dataset containing + the filepaths Default: segment Returns: - (AudioDataset): See dataset.py, AudioDatasets are consider the universal dataset for the training pipeline. + (AudioDataset): See dataset.py, AudioDatasets are consider + the universal dataset for the training pipeline. """ # if os.path.exists(output_path): # ds = load_from_disk(output_path) diff --git a/whoot_model_training/whoot_model_training/dataset.py b/whoot_model_training/whoot_model_training/dataset.py index 1f5db06..3d17f6b 100644 --- a/whoot_model_training/whoot_model_training/dataset.py +++ b/whoot_model_training/whoot_model_training/dataset.py @@ -1,5 +1,6 @@ """ -Pulled from https://github.com/UCSD-E4E/pyha-analyzer-2.0/blob/main/pyha_analyzer/dataset.py +Pulled from: +https://github.com/UCSD-E4E/pyha-analyzer-2.0/blob/main/pyha_analyzer/dataset.py Key idea is we define a generic AudioDataset with uniform features Using an Arrow Dataset from Hugging Face's dataset library because @@ -18,10 +19,13 @@ class AudioDataset(DatasetDict): """ AudioDataset Class - If your dataset is an AudioDataset, it can be read by the rest of the system + If your dataset is an AudioDataset, it can be read by + the rest of the system - Behind the scenes, this is a Apache Arrow Dataset Dict (via hf library) where - each key is a split of the data (test/train/valid) and the value is an arrow dataset + Behind the scenes, this is a Apache Arrow Dataset Dict + (via hf library) where + each key is a split of the data (test/train/valid) + and the value is an arrow dataset with at a minimum 2 columns: - labels (Sequence of class labels, such as [0,10]) - audio (Audio Column type from hugging face) @@ -31,17 +35,20 @@ def __init__(self, ds: DatasetDict): super().__init__(ds) def validate_format(self, ds: DatasetDict): - """Validates dataset is correctly formatted and ready to be used for training + """Validates dataset is correctly formatted and ready to be used for + training Raises: - AssertionError if dataset is not correctly formatted. + AssertionError if dataset is not correctly formatted. """ for split in ds.keys(): dataset = ds[split] for column in DEFAULT_COLUMNS: - assert column in dataset.features, ( - f"The column `{column}` is missing from dataset split `{split}`. Required by system" + state = ( + f"The column `{column}` is missing from dataset split `{ + split}`. Required by system" ) + assert column in dataset.features, state def get_num_classes(self): """ @@ -53,12 +60,13 @@ def get_num_classes(self): def get_number_species(self) -> int: """ PyhaAnalyzer uses `get_number_species` for getting class count - This... isn't always the case that the dataset is species only (could have calls!) + This... isn't always the case that the dataset is species only + (could have calls!) To support legacy PyhaAnalyzer, we therefore have this function. 
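`get_class_labels` below returns a Hugging Face `ClassLabel`, which is what makes mappings between datasets cheap to build. A quick sketch with invented class names:

    from datasets import ClassLabel

    labels = ClassLabel(names=["buow_call", "coyote", "background"])  # made-up names

    print(labels.num_classes)           # 3
    print(labels.str2int("buow_call"))  # 0
    print(labels.int2str(2))            # 'background'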
This should be deprecated in future versions of PyhaAnalyzer - - return + + return (int): number of classes """ return self.get_num_classes() @@ -66,10 +74,13 @@ def get_number_species(self) -> int: def get_class_labels(self) -> ClassLabel: """Class mapping for this dataset - A common problem is when moving between datasets creating mappings between classes - This aims to help standardize that by being able to get the classLabels for this dataset + A common problem is when moving between datasets + creating mappings between classes + This aims to help standardize that by being + able to get the classLabels for this dataset Returns: - (ClassLabel): Mapping of all the names of the labels to their index. + (ClassLabel): Mapping of all the names of + the labels to their index. """ return ClassLabel(names=self["train"].features["labels"].feature.names) diff --git a/whoot_model_training/whoot_model_training/logger.py b/whoot_model_training/whoot_model_training/logger.py index 6ee621a..e051b68 100644 --- a/whoot_model_training/whoot_model_training/logger.py +++ b/whoot_model_training/whoot_model_training/logger.py @@ -1,10 +1,10 @@ import comet_ml + class CometMLLoggerSupplement(): """Note, that is working with the Trainer! The Trainer class implements their own CometML Callback during training - See https://github.com/huggingface/transformers/blob/2166b6b4ff09f6dd3867ab982f262f66482aa968/src/transformers/integrations/integration_utils.py#L1031 This handles a lot but NOT ALL of the logging we want This class handles the last 10% of the logging we want such as @@ -20,5 +20,5 @@ def __init__(self, augmentations, name): self.experiment.log_parameter("augmentations", augmentations) self.experiment.set_name(name) - #TODO add these logs to comet_ml - #TODO Check to make sure training doesn't create a new experiment \ No newline at end of file + # TODO add these logs to comet_ml + # TODO Check to make sure training doesn't create a new experiment diff --git a/whoot_model_training/whoot_model_training/metrics.py b/whoot_model_training/whoot_model_training/metrics.py index efc96fe..755fc87 100644 --- a/whoot_model_training/whoot_model_training/metrics.py +++ b/whoot_model_training/whoot_model_training/metrics.py @@ -2,10 +2,17 @@ import torch from sklearn.metrics import confusion_matrix -from pyha_analyzer.metrics.classification_metrics import AudioClassificationMetrics +# from torchmetrics.classification import ( +# MultilabelAveragePrecision, +# MultilabelAUROC, +# ) + +from pyha_analyzer.metrics.classification_metrics \ + import AudioClassificationMetrics + class WhootMutliClassMetrics(AudioClassificationMetrics): - def __init__(self, classes:list): + def __init__(self, classes: list): self.classes = classes super().__init__([], len(classes), mutlilabel=True) @@ -15,18 +22,22 @@ def __call__(self, eval_pred) -> dict[str, float]: # For metrics that are not loggable to console # We can only have comet_ml for these metrics - experiment = comet_ml.get_running_experiment() #TODO CLEAN THIS UP WITH SAVING EXPERIMENT KEY + # TODO CLEAN THIS UP WITH SAVING EXPERIMENT KEY + experiment = comet_ml.get_running_experiment() if experiment is None: return initial_metrics logits = torch.Tensor(eval_pred.predictions) target = torch.Tensor(eval_pred.label_ids).to(torch.long) # Confusion Matrix WARNING, ONLY USE IF DATA IS MOSTLY MUTLICLASS - cm = confusion_matrix(torch.argmax(target, dim=1), torch.argmax(logits, dim=1)) - experiment.log_confusion_matrix(matrix=cm.tolist(), labels=self.classes) + cm = confusion_matrix( + 
torch.argmax(target, dim=1), + torch.argmax(logits, dim=1) + ) + experiment.log_confusion_matrix( + matrix=cm.tolist(), labels=self.classes) + + # Classwise Metrics (graph based) # Return the metrics that can be logged to console AND comet-ml return initial_metrics - - - \ No newline at end of file diff --git a/whoot_model_training/whoot_model_training/models/__init__.py b/whoot_model_training/whoot_model_training/models/__init__.py index 313bd2c..2f6c63b 100644 --- a/whoot_model_training/whoot_model_training/models/__init__.py +++ b/whoot_model_training/whoot_model_training/models/__init__.py @@ -1 +1,9 @@ -from .timm_model import TimmModel as TimmModel, TimmInputs as TimmInputs +"""a Bioacoustic Model Zoo + +Example: + `from whoot_model_training.models import TimmModel +""" + +from .timm_model import TimmModel, TimmInputs + +__all__ = ["TimmModel", "TimmInputs"] diff --git a/whoot_model_training/whoot_model_training/models/model.py b/whoot_model_training/whoot_model_training/models/model.py index d0035be..9b251a5 100644 --- a/whoot_model_training/whoot_model_training/models/model.py +++ b/whoot_model_training/whoot_model_training/models/model.py @@ -7,18 +7,17 @@ - ModelOutput: dict-like class that defines the output from the model - Model: A PyTorch nn.Module class -See timm_model.py for example about how these classes can be implemented. +See timm_model.py for example about how these classes can be implemented. """ -from abc import ABC, abstractmethod +from abc import abstractmethod from functools import wraps from collections import UserDict from pyha_analyzer.models.base_model import BaseModel -import torch -from torch import Tensor import numpy as np + def has_required_inputs(): """ Wrapper to check to make sure everything is setup properly @@ -47,7 +46,8 @@ class ModelOutput(dict, UserDict): Inspired by HuggingFace Models - Developer: recommended for each Model, to have an associated ModelOutput class + Developer: recommended for each Model, to have an associated + ModelOutput class """ def __init__( @@ -68,7 +68,10 @@ def __init__( }) def items(self): - return [(key, value) for (key, value) in super().items() if value is not None] + return [ + (key, value) for ( + key, value + ) in super().items() if value is not None] class ModelInput(UserDict, dict): @@ -80,7 +83,8 @@ class ModelInput(UserDict, dict): Inspired by HuggingFace Models and Tokenizers - Developer: recommended for each Model, to have an assocaited ModelInput class + Developer: recommended for each Model, to have an + associated ModelInput class ALWAYS HAS A LABEL CATEGORY """ @@ -97,57 +101,47 @@ def __init__( }) def items(self): - return [(key, value) for (key, value) in super().items() if value is not None] + return [ + (key, value) for ( + key, value + ) in super( + ).items() if value is not None] class Model(BaseModel): """BaseModel Class for Whoot """ # TODO Define required class instance variables - # Such as criterion etc. 
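Back on the metrics hunk above: `log_confusion_matrix` wants one integer class per clip, hence the `argmax` over both the one-hot targets and the logits, and hence the warning that this only makes sense when the data is mostly multiclass. A self-contained sketch of that reduction:

    import torch
    from sklearn.metrics import confusion_matrix

    # Toy batch: one-hot targets and raw logits over 3 classes
    logits = torch.tensor([[2.0, 0.1, -1.0],
                           [0.2, 1.5, 0.3]])
    target = torch.tensor([[1, 0, 0],
                           [0, 0, 1]])

    cm = confusion_matrix(
        torch.argmax(target, dim=1),  # true class per clip
        torch.argmax(logits, dim=1),  # predicted class per clip
        labels=[0, 1, 2],             # pin the label set so the shape is stable
    )
    print(cm)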
def __init__(self, *args, **kwargs): self.input_format = ModelInput self.output_format = ModelOutput super().__init__(*args, **kwargs) - """Gets an embedding for the model - - This can be the final layer of a model backbone - or a set of useful features - - Args - x: Any | Either np.array or Torch.Tensor, is the input for the model - - Returns - embedding: np.array, some embedding vector representing the input data - """ - def get_embeddings(self, x: ModelInput) -> np.array: - return self.forward(x).embeddings + """Gets an embedding for the model - """ - Runs some input x through the model - - In PyTorch models, this is the same forward functionlogits - We just apply the convention for non Pytorch models, + This can be the final layer of a model backbone + or a set of useful features - TODO: Some things to concern - - - Args: - x: Any + Args + x: Any | Either np.array or Torch.Tensor, is the input for the model - Returns: - ModelOutput: dict, a dictionary like object that describes - """ + Returns + embedding: np.array, some embedding vector representing the input data + """ + return self.forward(x).embeddings @abstractmethod @has_required_inputs() def forward(self, x: ModelInput) -> ModelOutput: - pass + """ + Runs some input x through the model - """ - Notes on design for the future + In PyTorch models, this is the same forward function logits + We just apply the convention for non Pytorch models, + Args: + x: Any - - Should model implement a way to save/load model to/form disk - - """ + Returns: + ModelOutput: dict, a dictionary like object that describes + """ diff --git a/whoot_model_training/whoot_model_training/models/timm_model.py b/whoot_model_training/whoot_model_training/models/timm_model.py index 1145074..39d39c6 100644 --- a/whoot_model_training/whoot_model_training/models/timm_model.py +++ b/whoot_model_training/whoot_model_training/models/timm_model.py @@ -1,9 +1,9 @@ -"""Wrapper around the timms model zoo +"""Wrapper around the timms model zoo See https://timm.fast.ai/ Timm model zoo good for computer vision models - Like CNNs, which are useful for spectrograms + Like CNNs, which are useful for spectrograms Great repo for models, but currently using this for demoing pipeline """ @@ -19,8 +19,13 @@ class TimmInputs(ModelInput): """Input for TimmModel's Specifies TimmModels needs labels and spectrograms that are Tensors + + Args: + Labels: the data's label for this batch + spectrogram: audio's spectrogram + waveform: Optional, audio waveform """ - def __init__(self, labels, waveform=None, spectrogram=None, device="cpu"): + def __init__(self, labels, waveform=None, spectrogram=None): # # Can use inputs to verify correct shape for upstream model # assert spectrogram.shape[1:] == (1, 100, 100) super().__init__(labels, waveform, spectrogram) @@ -29,7 +34,8 @@ def __init__(self, labels, waveform=None, spectrogram=None, device="cpu"): class TimmModel(nn.Module, Model): - """Model that uses a timm's model as its backbone with a linear layer for classification + """Model that uses a timm's model as its backbone with a + linear layer for classification """ def __init__( @@ -41,9 +47,10 @@ def __init__( loss=None, ): """Init for TimmModel - + kwargs: - timm_model (str): name of model backbone from timms to use, Default: "resnet34" + timm_model (str): name of model backbone from timms to use, + Default: "resnet34" pretrained (bool): use a pretrained model from timms, Default: True in_chans (int): number of channels of audio: Default: 1 num_classes (int): number of classes in the 
dataset: Default 6 @@ -59,33 +66,42 @@ def __init__( self.backbone = timm.create_model( timm_model, pretrained=pretrained, in_chans=in_chans ) - + # Unsure if 1000 is default for all timm models. Need to check this self.linear = nn.Linear(1000, num_classes) - ## different losses if you want to train for different problems - ## BCEWithLogitsLoss is default as for Bioacoustics, the problem tends mutlilabel! - ## the probability of class A occurring doesn't change the probability of Class B - ## Many individuals can make calls at the same time! + # different losses if you want to train for different problems + # BCEWithLogitsLoss is default as for Bioacoustics, the problem tends + # mutlilabel! + # the probability of class A occurring doesn't + # change the probability of Class B + # Many individuals can make calls at the same time! if loss is not None: self.loss = loss else: self.loss = nn.BCEWithLogitsLoss() - - @has_required_inputs() #data: TimmInputs TODO FIX + # TODO Fix this so it actually can take in a input object + @has_required_inputs() def forward(self, labels=None, spectrogram=None) -> ModelOutput: """Model forward function Args: - labels=None (Torch.Tensor): the ground truth labels for computing loss + labels=None (Torch.Tensor): the ground truth labels for computing + loss spectrogram=None (Torch.Tensor): spectrograms inputs into model Returns - (ModelOutput): The model output (logits), latent space representations (embeddings), loss and labels. + (ModelOutput): The model output (logits), + latent space representations (embeddings), loss and labels. """ embedd = self.backbone(spectrogram) logits = self.linear(embedd) loss = self.loss(logits, labels) - return ModelOutput(logits=logits, embeddings=embedd, loss=loss, labels=labels) + return ModelOutput( + logits=logits, + embeddings=embedd, + loss=loss, + labels=labels + ) diff --git a/whoot_model_training/whoot_model_training/preprocessors/__init__.py b/whoot_model_training/whoot_model_training/preprocessors/__init__.py index f15458e..13b6db5 100644 --- a/whoot_model_training/whoot_model_training/preprocessors/__init__.py +++ b/whoot_model_training/whoot_model_training/preprocessors/__init__.py @@ -1,6 +1,11 @@ from .default_preprocessor import ( - SpectrogramModelInputPreprocessors as SpectrogramModelInputPreprocessors, + SpectrogramModelInputPreprocessors ) from .spectrogram_preprocessors import ( - BuowMelSpectrogramPreprocessors as BuowMelSpectrogramPreprocessors, + BuowMelSpectrogramPreprocessors ) + +__all__ = [ + "SpectrogramModelInputPreprocessors", + "BuowMelSpectrogramPreprocessors" +] diff --git a/whoot_model_training/whoot_model_training/preprocessors/default_preprocessor.py b/whoot_model_training/whoot_model_training/preprocessors/default_preprocessor.py index 03e23f9..7a9ba5e 100644 --- a/whoot_model_training/whoot_model_training/preprocessors/default_preprocessor.py +++ b/whoot_model_training/whoot_model_training/preprocessors/default_preprocessor.py @@ -5,60 +5,58 @@ Not to mention any online augmentation we want to do -The preprocessor class defines a function to preprocess our data during training +The preprocessor class defines a function to preprocess our data during +training -The default preprocessor allows for many types of preprocessors to run, but it forces the output to fit -the ModelInput class structure. see `whoot_model_training\models\model.py` for more info. +The default preprocessor allows for many types of preprocessors to run, +but it forces the output to fit the ModelInput class structure. 
+see `whoot_model_training/models/model.py` for more info. """ -from .spectrogram_preprocessors import BuowMelSpectrogramPreprocessors + +from .spectrogram_preprocessors import BuowMelSpectrogramPreprocessors, SpectrogramParams from ..models.model import ModelInput + class SpectrogramModelInputPreprocessors(BuowMelSpectrogramPreprocessors): - """ Defines a preprocessed that after formatting the audio passes a spectrogram - into a ModelInput object. + """ Defines a preprocessed that after formatting the audio + passes a spectrogram into a ModelInput object. """ def __init__( self, - ModelInput: ModelInput, + model_input: ModelInput, duration=5, augment=None, spectrogram_augments=None, class_list=..., - n_fft=2048, - hop_length=256, - power=2, - n_mels=256, + spectrogram_params:SpectrogramParams = SpectrogramParams(), dataset_ref=None, ): """ Creates a Online preprocessor for MelSpectrograms Based Models - - Formats input into spefific ModelInput format. + + Formats input into spefific ModelInput format. Args: ModelInput (ModelInput): How the model like input data formatted Duration (int): Length in seconds of input augment (none): See TODO WORK ON AUGMENTATIONS spectrogram_augments (none): TODO WORK ON AUGMENTATIONS - class_list (list): the classes we are working with (used for one hot encoding) + class_list (list): the classes we are working with one-hot-encoding n_fft (int): number of ffts hop_length (int): hop length power (int): power, defined by librosa n_mels (int): number of mels for a melspectrogram - dataset_ref (AudioDataset): a external ref to the rest of the dataset + dataset_ref (AudioDataset): a external ref to an AudioDataset """ super().__init__( duration, augment, spectrogram_augments, class_list, - n_fft, - hop_length, - power, - n_mels, dataset_ref, + spectrogram_params ) - self.ModelInput = ModelInput + self.model_input = model_input def __call__(self, batch: dict) -> ModelInput: """Processes a batch of AudioDataset rows @@ -67,4 +65,7 @@ def __call__(self, batch: dict) -> ModelInput: Formats the data as a ModelInput """ batch = super().__call__(batch) - return self.ModelInput(labels=batch["labels"], spectrogram=batch["audio"]) + return self.model_input( + labels=batch["labels"], + spectrogram=batch["audio"] + ) diff --git a/whoot_model_training/whoot_model_training/preprocessors/spectrogram_preprocessors.py b/whoot_model_training/whoot_model_training/preprocessors/spectrogram_preprocessors.py index 8ed2d68..e4f869b 100644 --- a/whoot_model_training/whoot_model_training/preprocessors/spectrogram_preprocessors.py +++ b/whoot_model_training/whoot_model_training/preprocessors/spectrogram_preprocessors.py @@ -1,36 +1,46 @@ """ Pulled from pyha_analyzer/preprocessors/spectogram_preprocessors.py """ +from dataclasses import dataclass import librosa import numpy as np -import torchvision.transforms as transforms +from torchvision import transforms from pyha_analyzer.preprocessors import PreProcessorBase +@dataclass +class SpectrogramParams: + n_fft: int = 2048 + hop_length: int = 256 + power: float = 2.0 + n_mels: int = 256 + +# TODO add mixitup augmentation support class BuowMelSpectrogramPreprocessors(PreProcessorBase): + """Preprocessor for processing audio into spectrograms + Particularly for the buow dataset + """ + def __init__( self, duration=5, augment=None, spectrogram_augments=None, class_list=[], - n_fft=2048, - hop_length=256, - power=2.0, - n_mels=256, dataset_ref=None, + spectrogram_params:SpectrogramParams = SpectrogramParams() ): self.duration = duration 
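The `SpectrogramParams` defaults above go straight into librosa (the code cites arXiv 2403.10380 p. 25 for them, per the comment just below). A standalone sketch of the conversion the preprocessor performs, on a placeholder clip:

    import librosa
    import numpy as np

    # Defaults mirrored from the SpectrogramParams dataclass above
    n_fft, hop_length, power, n_mels = 2048, 256, 2.0, 256

    sr = 32_000
    y = np.zeros(sr * 3, dtype=np.float32)  # placeholder 3 second clip

    mel = librosa.feature.melspectrogram(
        y=y, sr=sr, n_fft=n_fft, hop_length=hop_length,
        power=power, n_mels=n_mels,
    )
    print(mel.shape)  # (256, n_frames)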
self.augment = augment self.spectrogram_augments = spectrogram_augments # Below parameter defaults from https://arxiv.org/pdf/2403.10380 pg 25 - self.n_fft = n_fft - self.hop_length = hop_length - self.power = power - self.n_mels = n_mels + self.n_fft = spectrogram_params.n_fft + self.hop_length = spectrogram_params.hop_length + self.power = spectrogram_params.power + self.n_mels = spectrogram_params.n_mels super().__init__(name="MelSpectrogramPreprocessor") @@ -57,7 +67,7 @@ def __call__(self, batch): np.array( pillow_transforms( librosa.feature.melspectrogram( - y=y[int(start * sr) : end_sr], + y=y[int(start * sr):end_sr], sr=sr, n_fft=self.n_fft, hop_length=self.hop_length, diff --git a/whoot_model_training/whoot_model_training/trainer.py b/whoot_model_training/whoot_model_training/trainer.py index 9b18049..a8b1515 100644 --- a/whoot_model_training/whoot_model_training/trainer.py +++ b/whoot_model_training/whoot_model_training/trainer.py @@ -9,6 +9,8 @@ class WhootTrainingArguments(PyhaTrainingArguments): + """Holds arguments use for training + """ def __init__(self, run_name): DEFAULT_MODEL_CHECKPOINTS = "model_checkpoints" checkpoint_created_at = datetime.now().strftime("%m_%d_%Y_%H:%M:%S") @@ -17,6 +19,9 @@ def __init__(self, run_name): class WhootTrainer(PyhaTrainer): + """The training class + #TODO Improve these docstrings + """ def __init__( self, model, @@ -30,8 +35,6 @@ def __init__( metrics = WhootMutliClassMetrics(dataset.get_class_labels().names) - print("LOGGING NEW METRICS... HOPEFULLY") - super().__init__( model, dataset, From 44dbb78520dadf4a716551678c9736ab9e4a5a3b Mon Sep 17 00:00:00 2001 From: Sean1572 Date: Wed, 2 Jul 2025 16:55:29 -0700 Subject: [PATCH 085/120] Linted (yes again, this is going to be a process...) --- whoot_model_training/train.py | 6 ----- .../whoot_model_training/__init__.py | 2 +- .../data_extractor/__init__.py | 8 +++++- .../data_extractor/buowset_extractor.py | 27 ++++++++++++------- .../whoot_model_training/logger.py | 11 +++++--- .../whoot_model_training/metrics.py | 27 ++++++++++++++----- .../whoot_model_training/models/model.py | 5 ++-- .../preprocessors/__init__.py | 9 +++++++ .../preprocessors/default_preprocessor.py | 6 +++-- .../spectrogram_preprocessors.py | 3 ++- .../whoot_model_training/trainer.py | 16 ++++++++--- 11 files changed, 84 insertions(+), 36 deletions(-) diff --git a/whoot_model_training/train.py b/whoot_model_training/train.py index 143aee1..a92efef 100644 --- a/whoot_model_training/train.py +++ b/whoot_model_training/train.py @@ -30,12 +30,6 @@ # AddBackgroundNoise, PolarityInversion, Gain # ) -# import comet_ml - -# TODO ALLOW USER TO SELECT THIS -# TODO MAKE DISTRIBUTED TRAINING POSSIBLE - - def parse_config(config_path: str) -> dict: """wrapper to parse config diff --git a/whoot_model_training/whoot_model_training/__init__.py b/whoot_model_training/whoot_model_training/__init__.py index f8922cc..feda7bb 100644 --- a/whoot_model_training/whoot_model_training/__init__.py +++ b/whoot_model_training/whoot_model_training/__init__.py @@ -1,6 +1,6 @@ """Logging Toolkit for different MLops platforms """ -from .logger import CometMLLoggerSupplement as CometMLLoggerSupplement +from .logger import CometMLLoggerSupplement __all__ = ["CometMLLoggerSupplement"] diff --git a/whoot_model_training/whoot_model_training/data_extractor/__init__.py b/whoot_model_training/whoot_model_training/data_extractor/__init__.py index 1418f67..96c821c 100644 --- a/whoot_model_training/whoot_model_training/data_extractor/__init__.py +++ 
b/whoot_model_training/whoot_model_training/data_extractor/__init__.py @@ -1,3 +1,9 @@ -from .buowset_extractor import buowset_extractor as buowset_extractor +"""A zoo for extractors + +Extractors convert raw data into AudioDatasets +Ideally you make a new Extractor for each new raw dataset +""" + +from .buowset_extractor import buowset_extractor __all__ = ["buowset_extractor"] diff --git a/whoot_model_training/whoot_model_training/data_extractor/buowset_extractor.py b/whoot_model_training/whoot_model_training/data_extractor/buowset_extractor.py index 355267a..df8fda9 100644 --- a/whoot_model_training/whoot_model_training/data_extractor/buowset_extractor.py +++ b/whoot_model_training/whoot_model_training/data_extractor/buowset_extractor.py @@ -12,6 +12,7 @@ """ import os +from dataclasses import dataclass import numpy as np from datasets import ( @@ -36,14 +37,20 @@ def one_hot_encode(row: dict, classes: list): return row +@dataclass +class BuowsetParams(): + """Parameters that describe the Buowset + """ + validation_fold = 4 + test_fold = 3 + sr = 32_000 + filepath = "segment" + def buowset_extractor( metadata_csv, parent_path, - output_path, # TODO what does output do? - validation_fold=4, - test_fold=3, - sr=32_000, - filepath="segment", + output_path, + params: BuowsetParams = BuowsetParams() ): """Extracts raw data in the buowset format into an AudioDataset @@ -82,18 +89,18 @@ def buowset_extractor( # Get audio into uniform format ds = ds.add_column( - "audio", [os.path.join(parent_path, file) for file in ds[filepath]] + "audio", [os.path.join(parent_path, file) for file in ds[params.filepath]] ) ds = ds.add_column("filepath", ds["audio"]) - ds = ds.cast_column("audio", Audio(sampling_rate=sr)) + ds = ds.cast_column("audio", Audio(sampling_rate=params.sr)) # Create splits of the data - test_ds = ds.filter(lambda x: x["fold"] == validation_fold) - valid_ds = ds.filter(lambda x: x["fold"] == test_fold) + test_ds = ds.filter(lambda x: x["fold"] == params.validation_fold) + valid_ds = ds.filter(lambda x: x["fold"] == params.test_fold) train_ds = ds.filter( - lambda x: x["fold"] != test_fold & x["fold"] != validation_fold + lambda x: x["fold"] != params.test_fold & x["fold"] != params.validation_fold ) ds = AudioDataset( DatasetDict({"train": train_ds, "valid": valid_ds, "test": test_ds}) diff --git a/whoot_model_training/whoot_model_training/logger.py b/whoot_model_training/whoot_model_training/logger.py index e051b68..dce3484 100644 --- a/whoot_model_training/whoot_model_training/logger.py +++ b/whoot_model_training/whoot_model_training/logger.py @@ -1,3 +1,11 @@ +""" Contains useful tools for additional logging + +For example, CometMLLoggerSupplement adds additional +logging for data augmentations used compared +to the base logging done by the HF trainer +integration +""" + import comet_ml @@ -19,6 +27,3 @@ def __init__(self, augmentations, name): self.experiment.log_parameter("augmentations", augmentations) self.experiment.set_name(name) - - # TODO add these logs to comet_ml - # TODO Check to make sure training doesn't create a new experiment diff --git a/whoot_model_training/whoot_model_training/metrics.py b/whoot_model_training/whoot_model_training/metrics.py index 755fc87..c87bac1 100644 --- a/whoot_model_training/whoot_model_training/metrics.py +++ b/whoot_model_training/whoot_model_training/metrics.py @@ -1,3 +1,11 @@ +""" Metrics for Bioacoustic Mutlilabel Models + +Helps us evaluate which models do well + +These the metrics with HF Trainer and are called +as part of a callback 
during training +""" + import comet_ml import torch from sklearn.metrics import confusion_matrix @@ -12,20 +20,30 @@ class WhootMutliClassMetrics(AudioClassificationMetrics): + """Gets CMAP, ROCAUC, and confusion matrices and reports them to + Comet-ML dashboards + """ def __init__(self, classes: list): self.classes = classes + self.training = True super().__init__([], len(classes), mutlilabel=True) def __call__(self, eval_pred) -> dict[str, float]: # CMAP / ROCAUC initial_metrics = super().__call__(eval_pred=eval_pred) + # Confusion Matrix + self.log_comet_ml_only(self, eval_pred) + + # Return the metrics that can be logged to console AND comet-ml + return initial_metrics + + def log_comet_ml_only(self, eval_pred): # For metrics that are not loggable to console # We can only have comet_ml for these metrics - # TODO CLEAN THIS UP WITH SAVING EXPERIMENT KEY experiment = comet_ml.get_running_experiment() if experiment is None: - return initial_metrics + return logits = torch.Tensor(eval_pred.predictions) target = torch.Tensor(eval_pred.label_ids).to(torch.long) @@ -36,8 +54,3 @@ def __call__(self, eval_pred) -> dict[str, float]: ) experiment.log_confusion_matrix( matrix=cm.tolist(), labels=self.classes) - - # Classwise Metrics (graph based) - - # Return the metrics that can be logged to console AND comet-ml - return initial_metrics diff --git a/whoot_model_training/whoot_model_training/models/model.py b/whoot_model_training/whoot_model_training/models/model.py index 9b251a5..91ef654 100644 --- a/whoot_model_training/whoot_model_training/models/model.py +++ b/whoot_model_training/whoot_model_training/models/model.py @@ -124,10 +124,11 @@ def get_embeddings(self, x: ModelInput) -> np.array: or a set of useful features Args - x: Any | Either np.array or Torch.Tensor, is the input for the model + x: Any | Either np.array or Torch.Tensor, the input for the model Returns - embedding: np.array, some embedding vector representing the input data + embedding: np.array, + some embedding vector representing the input data """ return self.forward(x).embeddings diff --git a/whoot_model_training/whoot_model_training/preprocessors/__init__.py b/whoot_model_training/whoot_model_training/preprocessors/__init__.py index 13b6db5..29e114f 100644 --- a/whoot_model_training/whoot_model_training/preprocessors/__init__.py +++ b/whoot_model_training/whoot_model_training/preprocessors/__init__.py @@ -1,3 +1,12 @@ +""" A collection of online preprocessors + +During training online preprocessors convert data +into data ready to be given to a model + +In traditional pytorch world, this would be like +the __get_item__ function of a dataset +""" + from .default_preprocessor import ( SpectrogramModelInputPreprocessors ) diff --git a/whoot_model_training/whoot_model_training/preprocessors/default_preprocessor.py b/whoot_model_training/whoot_model_training/preprocessors/default_preprocessor.py index 7a9ba5e..23e3168 100644 --- a/whoot_model_training/whoot_model_training/preprocessors/default_preprocessor.py +++ b/whoot_model_training/whoot_model_training/preprocessors/default_preprocessor.py @@ -14,7 +14,9 @@ """ -from .spectrogram_preprocessors import BuowMelSpectrogramPreprocessors, SpectrogramParams +from .spectrogram_preprocessors import ( + BuowMelSpectrogramPreprocessors, SpectrogramParams +) from ..models.model import ModelInput @@ -29,7 +31,7 @@ def __init__( augment=None, spectrogram_augments=None, class_list=..., - spectrogram_params:SpectrogramParams = SpectrogramParams(), + spectrogram_params: SpectrogramParams = 
SpectrogramParams(), dataset_ref=None, ): """ Creates a Online preprocessor for MelSpectrograms Based Models diff --git a/whoot_model_training/whoot_model_training/preprocessors/spectrogram_preprocessors.py b/whoot_model_training/whoot_model_training/preprocessors/spectrogram_preprocessors.py index e4f869b..ecfea50 100644 --- a/whoot_model_training/whoot_model_training/preprocessors/spectrogram_preprocessors.py +++ b/whoot_model_training/whoot_model_training/preprocessors/spectrogram_preprocessors.py @@ -17,6 +17,7 @@ class SpectrogramParams: power: float = 2.0 n_mels: int = 256 + # TODO add mixitup augmentation support class BuowMelSpectrogramPreprocessors(PreProcessorBase): """Preprocessor for processing audio into spectrograms @@ -30,7 +31,7 @@ def __init__( spectrogram_augments=None, class_list=[], dataset_ref=None, - spectrogram_params:SpectrogramParams = SpectrogramParams() + spectrogram_params: SpectrogramParams = SpectrogramParams() ): self.duration = duration self.augment = augment diff --git a/whoot_model_training/whoot_model_training/trainer.py b/whoot_model_training/whoot_model_training/trainer.py index a8b1515..9529319 100644 --- a/whoot_model_training/whoot_model_training/trainer.py +++ b/whoot_model_training/whoot_model_training/trainer.py @@ -1,3 +1,13 @@ +""" Everything needed to train +given a model and a dataset + +WhootTrainingArguments: A container for the + many many args for WhootTrainer + +WhootTrainer: The class that is going to run training + +""" + from datetime import datetime import os @@ -9,12 +19,12 @@ class WhootTrainingArguments(PyhaTrainingArguments): - """Holds arguments use for training + """Holds arguments use for training """ def __init__(self, run_name): - DEFAULT_MODEL_CHECKPOINTS = "model_checkpoints" + default_checkpoint_path = "model_checkpoints" checkpoint_created_at = datetime.now().strftime("%m_%d_%Y_%H:%M:%S") - super().__init__(os.path.join(f"{DEFAULT_MODEL_CHECKPOINTS}", + super().__init__(os.path.join(f"{default_checkpoint_path}", f"{run_name}_{checkpoint_created_at}")) From 3e1249803319689c54482407d0e4304d920fee51 Mon Sep 17 00:00:00 2001 From: Sean1572 Date: Fri, 4 Jul 2025 10:21:29 -0700 Subject: [PATCH 086/120] Added last of the less destructive linting Next step, time to try and improve how data gets fed into the models --- whoot_model_training/train.py | 4 +- .../data_extractor/buowset_extractor.py | 9 +++- .../whoot_model_training/logger.py | 13 ++++++ .../whoot_model_training/metrics.py | 6 ++- .../whoot_model_training/models/model.py | 2 +- .../whoot_model_training/models/timm_model.py | 12 +++--- .../preprocessors/default_preprocessor.py | 14 ++----- .../spectrogram_preprocessors.py | 41 ++++++++++++++----- .../whoot_model_training/trainer.py | 6 ++- 9 files changed, 73 insertions(+), 34 deletions(-) diff --git a/whoot_model_training/train.py b/whoot_model_training/train.py index a92efef..552c5f9 100644 --- a/whoot_model_training/train.py +++ b/whoot_model_training/train.py @@ -100,11 +100,11 @@ def train(config): # We define here what the model reads train_preprocessor = SpectrogramModelInputPreprocessors( - TimmInputs, duration=3, class_list=ds.get_class_labels() + TimmInputs, duration=3 ) preprocessor = SpectrogramModelInputPreprocessors( - TimmInputs, duration=3, class_list=ds.get_class_labels() + TimmInputs, duration=3 ) ds["train"].set_transform(train_preprocessor) diff --git a/whoot_model_training/whoot_model_training/data_extractor/buowset_extractor.py 
b/whoot_model_training/whoot_model_training/data_extractor/buowset_extractor.py index df8fda9..8adadb2 100644 --- a/whoot_model_training/whoot_model_training/data_extractor/buowset_extractor.py +++ b/whoot_model_training/whoot_model_training/data_extractor/buowset_extractor.py @@ -46,6 +46,7 @@ class BuowsetParams(): sr = 32_000 filepath = "segment" + def buowset_extractor( metadata_csv, parent_path, @@ -89,7 +90,9 @@ def buowset_extractor( # Get audio into uniform format ds = ds.add_column( - "audio", [os.path.join(parent_path, file) for file in ds[params.filepath]] + "audio", [ + os.path.join(parent_path, file) for file in ds[params.filepath] + ] ) ds = ds.add_column("filepath", ds["audio"]) @@ -100,7 +103,9 @@ def buowset_extractor( test_ds = ds.filter(lambda x: x["fold"] == params.validation_fold) valid_ds = ds.filter(lambda x: x["fold"] == params.test_fold) train_ds = ds.filter( - lambda x: x["fold"] != params.test_fold & x["fold"] != params.validation_fold + lambda x: x[ + "fold" + ] != params.test_fold & x["fold"] != params.validation_fold ) ds = AudioDataset( DatasetDict({"train": train_ds, "valid": valid_ds, "test": test_ds}) diff --git a/whoot_model_training/whoot_model_training/logger.py b/whoot_model_training/whoot_model_training/logger.py index dce3484..783c45c 100644 --- a/whoot_model_training/whoot_model_training/logger.py +++ b/whoot_model_training/whoot_model_training/logger.py @@ -9,6 +9,7 @@ import comet_ml +# pylint disable-next=R0903 class CometMLLoggerSupplement(): """Note, that is working with the Trainer! @@ -23,7 +24,19 @@ class CometMLLoggerSupplement(): def __init__(self, augmentations, name): comet_ml.login() + self.start(augmentations, name) + + def start(self, augmentations, name): + """Begins a new set of experiments + + Helpful for cases where a new run has begun + """ self.experiment = comet_ml.start() self.experiment.log_parameter("augmentations", augmentations) self.experiment.set_name(name) + + def end(self): + """Fully ends experiment if still running + """ + return self.experiment.end() diff --git a/whoot_model_training/whoot_model_training/metrics.py b/whoot_model_training/whoot_model_training/metrics.py index c87bac1..fe4be0b 100644 --- a/whoot_model_training/whoot_model_training/metrics.py +++ b/whoot_model_training/whoot_model_training/metrics.py @@ -33,12 +33,14 @@ def __call__(self, eval_pred) -> dict[str, float]: initial_metrics = super().__call__(eval_pred=eval_pred) # Confusion Matrix - self.log_comet_ml_only(self, eval_pred) + self.log_comet_ml_only(eval_pred) # Return the metrics that can be logged to console AND comet-ml return initial_metrics - + def log_comet_ml_only(self, eval_pred): + """Logs confusion matrix each eval step + """ # For metrics that are not loggable to console # We can only have comet_ml for these metrics experiment = comet_ml.get_running_experiment() diff --git a/whoot_model_training/whoot_model_training/models/model.py b/whoot_model_training/whoot_model_training/models/model.py index 91ef654..89b80d5 100644 --- a/whoot_model_training/whoot_model_training/models/model.py +++ b/whoot_model_training/whoot_model_training/models/model.py @@ -130,7 +130,7 @@ def get_embeddings(self, x: ModelInput) -> np.array: embedding: np.array, some embedding vector representing the input data """ - return self.forward(x).embeddings + return self.forward(**x).embeddings @abstractmethod @has_required_inputs() diff --git a/whoot_model_training/whoot_model_training/models/timm_model.py 
b/whoot_model_training/whoot_model_training/models/timm_model.py index 39d39c6..7fb1997 100644 --- a/whoot_model_training/whoot_model_training/models/timm_model.py +++ b/whoot_model_training/whoot_model_training/models/timm_model.py @@ -44,7 +44,6 @@ def __init__( pretrained=True, in_chans=1, num_classes=6, - loss=None, ): """Init for TimmModel @@ -76,10 +75,13 @@ def __init__( # the probability of class A occurring doesn't # change the probability of Class B # Many individuals can make calls at the same time! - if loss is not None: - self.loss = loss - else: - self.loss = nn.BCEWithLogitsLoss() + self.loss = nn.BCEWithLogitsLoss() + + def set_custom_loss(self, loss_fn): + """Set a different loss function + For cases where we don't want BCEWithLogitsLoss + """ + self.loss = loss_fn # TODO Fix this so it actually can take in a input object @has_required_inputs() diff --git a/whoot_model_training/whoot_model_training/preprocessors/default_preprocessor.py b/whoot_model_training/whoot_model_training/preprocessors/default_preprocessor.py index 23e3168..b2ed726 100644 --- a/whoot_model_training/whoot_model_training/preprocessors/default_preprocessor.py +++ b/whoot_model_training/whoot_model_training/preprocessors/default_preprocessor.py @@ -28,11 +28,8 @@ def __init__( self, model_input: ModelInput, duration=5, - augment=None, - spectrogram_augments=None, - class_list=..., + augments: dict = {"audio":None, "spectrogram":None}, spectrogram_params: SpectrogramParams = SpectrogramParams(), - dataset_ref=None, ): """ Creates a Online preprocessor for MelSpectrograms Based Models @@ -41,8 +38,8 @@ def __init__( Args: ModelInput (ModelInput): How the model like input data formatted Duration (int): Length in seconds of input - augment (none): See TODO WORK ON AUGMENTATIONS - spectrogram_augments (none): TODO WORK ON AUGMENTATIONS + augment (dict): contains two keys: audio, spectrogram each defining + a dict of augmentation names and augmentations to run class_list (list): the classes we are working with one-hot-encoding n_fft (int): number of ffts hop_length (int): hop length @@ -52,10 +49,7 @@ def __init__( """ super().__init__( duration, - augment, - spectrogram_augments, - class_list, - dataset_ref, + augments, spectrogram_params ) self.model_input = model_input diff --git a/whoot_model_training/whoot_model_training/preprocessors/spectrogram_preprocessors.py b/whoot_model_training/whoot_model_training/preprocessors/spectrogram_preprocessors.py index ecfea50..d097a90 100644 --- a/whoot_model_training/whoot_model_training/preprocessors/spectrogram_preprocessors.py +++ b/whoot_model_training/whoot_model_training/preprocessors/spectrogram_preprocessors.py @@ -12,13 +12,19 @@ @dataclass class SpectrogramParams: + """ Dataclass for spectrogram Parameters + + n_fft: (int) number of fft bins + hop_length (int) skip count + power: (float) usually 2 + n_mels: (int) number of mel bins + """ n_fft: int = 2048 hop_length: int = 256 power: float = 2.0 n_mels: int = 256 -# TODO add mixitup augmentation support class BuowMelSpectrogramPreprocessors(PreProcessorBase): """Preprocessor for processing audio into spectrograms Particularly for the buow dataset @@ -27,21 +33,20 @@ class BuowMelSpectrogramPreprocessors(PreProcessorBase): def __init__( self, duration=5, - augment=None, - spectrogram_augments=None, - class_list=[], - dataset_ref=None, + augments={"audio":None, "spectrogram":None}, spectrogram_params: SpectrogramParams = SpectrogramParams() ): + + assert "audio" in augments.keys() and "spectrogram" in 
augments.keys() self.duration = duration - self.augment = augment - self.spectrogram_augments = spectrogram_augments + self.augments = augments # Below parameter defaults from https://arxiv.org/pdf/2403.10380 pg 25 self.n_fft = spectrogram_params.n_fft self.hop_length = spectrogram_params.hop_length self.power = spectrogram_params.power self.n_mels = spectrogram_params.n_mels + self.spectrogram_params = spectrogram_params super().__init__(name="MelSpectrogramPreprocessor") @@ -59,8 +64,8 @@ def __call__(self, batch): y = np.pad(y, end_sr - y.shape[-1]) # Audio Based Augmentations - if self.augment is not None: - y, label = self.augment(y, sr, label) + if self.augments["audio"] is not None: + y, label = self.augments.audio(y, sr, label) pillow_transforms = transforms.ToPILImage() @@ -81,8 +86,8 @@ def __call__(self, batch): / 255 ) - if self.spectrogram_augments is not None: - mels = self.spectrogram_augments(mels) + if self.augments["spectrogram"] is not None: + mels = self.augments.spectrogram(mels) new_audio.append(mels) new_labels.append(label) @@ -91,3 +96,17 @@ def __call__(self, batch): batch["labels"] = np.array(new_labels, dtype=np.float32) return batch + + def get_augmentations(self): + """Returns a list of augmentations + Perhaps for logging purposes + """ + return self.augments + + def __repr__(self): + return ( + f"""{self.name} + Augmentations: {self.augments} + MelSpectrogram: {self.spectrogram_params} + """ + ) diff --git a/whoot_model_training/whoot_model_training/trainer.py b/whoot_model_training/whoot_model_training/trainer.py index 9529319..8bb8910 100644 --- a/whoot_model_training/whoot_model_training/trainer.py +++ b/whoot_model_training/whoot_model_training/trainer.py @@ -1,7 +1,7 @@ """ Everything needed to train given a model and a dataset -WhootTrainingArguments: A container for the +WhootTrainingArguments: A container for the many many args for WhootTrainer WhootTrainer: The class that is going to run training @@ -32,6 +32,10 @@ class WhootTrainer(PyhaTrainer): """The training class #TODO Improve these docstrings """ + # WhootTrainer is ment to mimic the huggingface trainer + # Including number of arguments + # Aside, we really should consider how useful R0913,R0917 is... + # pylint: disable-next=R0913,R0917 def __init__( self, model, From 515e61cf45642ac47c5d6f722abadc777312a463 Mon Sep 17 00:00:00 2001 From: Sean1572 Date: Fri, 4 Jul 2025 11:03:24 -0700 Subject: [PATCH 087/120] Fixed issue formatting model input --- .../whoot_model_training/models/model.py | 42 +++++++++++++++---- .../whoot_model_training/models/timm_model.py | 23 +++++----- 2 files changed, 45 insertions(+), 20 deletions(-) diff --git a/whoot_model_training/whoot_model_training/models/model.py b/whoot_model_training/whoot_model_training/models/model.py index 89b80d5..00a5fa4 100644 --- a/whoot_model_training/whoot_model_training/models/model.py +++ b/whoot_model_training/whoot_model_training/models/model.py @@ -19,16 +19,31 @@ def has_required_inputs(): - """ - Wrapper to check to make sure everything is setup properly - Required before using PyhaTrainer + """Wrapper for formatting input for a given Model! + + Checks to make sure a model is passed in the correct input + format, and returns the correct output format. 
+ + Usually this is defined by `model.input_format` and + `model.output_format` + + MUST ALWAYS WRAP FORWARD FUNCTION OF MODEL """ def decorator(forward): @wraps(forward) - def wrapper(self, *args, **kwarg): - # assert isinstance(x, self.input_format) #TODO FIX - model_output = forward(self, *args, **kwarg) - # assert isinstance(model_output, self.output_format) + def wrapper(self, x=None, **kwarg): + # During training, data is passed in as kwargs, (**ModelInput) + # due to how hugging face is designed + # this can be confusing if you are making custom models + # During inference, data is passed in as x, (ModelInput) + if x is None: + # ... but during training we just have the model + # pretend like it was passed in a ModelInput + x = self.input_format.from_dict(kwarg) + + assert isinstance(x, self.input_format) + model_output = forward(self, x) + assert isinstance(model_output, self.output_format) return model_output @@ -107,6 +122,19 @@ def items(self): ) in super( ).items() if value is not None] + @classmethod + def from_dict(cls, some_input): + spectrogram, waveform = None, None + labels = some_input["labels"] + if "spectrogram" in some_input: + spectrogram = some_input["spectrogram"] + if "waveform" in some_input: + waveform = some_input["waveform"] + + assert spectrogram is not None or waveform is not None + + return cls(labels, spectrogram=spectrogram, waveform=waveform) + class Model(BaseModel): """BaseModel Class for Whoot diff --git a/whoot_model_training/whoot_model_training/models/timm_model.py b/whoot_model_training/whoot_model_training/models/timm_model.py index 7fb1997..87042df 100644 --- a/whoot_model_training/whoot_model_training/models/timm_model.py +++ b/whoot_model_training/whoot_model_training/models/timm_model.py @@ -9,8 +9,7 @@ """ import timm -from torch import nn, Tensor -import numpy as np +from torch import nn from .model import Model, ModelInput, ModelOutput, has_required_inputs @@ -29,8 +28,8 @@ def __init__(self, labels, waveform=None, spectrogram=None): # # Can use inputs to verify correct shape for upstream model # assert spectrogram.shape[1:] == (1, 100, 100) super().__init__(labels, waveform, spectrogram) - self.labels = Tensor(np.array(labels)) - self.spectrogram = Tensor(np.array(spectrogram)) + self.labels = labels + self.spectrogram = spectrogram class TimmModel(nn.Module, Model): @@ -85,25 +84,23 @@ def set_custom_loss(self, loss_fn): # TODO Fix this so it actually can take in a input object @has_required_inputs() - def forward(self, labels=None, spectrogram=None) -> ModelOutput: + def forward(self, x: TimmInputs) -> ModelOutput: """Model forward function Args: - labels=None (Torch.Tensor): the ground truth labels for computing - loss - spectrogram=None (Torch.Tensor): spectrograms inputs into model + x: (TimmInputs): The specific input format for Timm Models Returns (ModelOutput): The model output (logits), latent space representations (embeddings), loss and labels. 
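A minimal sketch, not part of the patch itself, of the two calling styles this wrapper supports; `model`, `spec`, and `labels` are illustrative names for an instantiated TimmModel and two tensors:

    from whoot_model_training.models import TimmInputs

    batch = {"labels": labels, "spectrogram": spec}
    model(**batch)                               # HF Trainer path: kwargs routed through from_dict
    model(TimmInputs(labels, spectrogram=spec))  # inference path: a ModelInput passed directly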
""" - embedd = self.backbone(spectrogram) - logits = self.linear(embedd) - loss = self.loss(logits, labels) + embed = self.backbone(x.spectrogram) + logits = self.linear(embed) + loss = self.loss(logits, x.labels) return ModelOutput( logits=logits, - embeddings=embedd, + embeddings=embed, loss=loss, - labels=labels + labels=x.labels ) From 4e189620d694b6d57f398ff68d5fa82b60c724b7 Mon Sep 17 00:00:00 2001 From: Sean1572 Date: Fri, 4 Jul 2025 11:10:58 -0700 Subject: [PATCH 088/120] Finalized rounds of linting, mvp for training done --- .../whoot_model_training/models/model.py | 7 +++++-- .../whoot_model_training/models/timm_model.py | 1 - .../preprocessors/default_preprocessor.py | 7 ++++--- .../spectrogram_preprocessors.py | 21 ++++++++++++++----- 4 files changed, 25 insertions(+), 11 deletions(-) diff --git a/whoot_model_training/whoot_model_training/models/model.py b/whoot_model_training/whoot_model_training/models/model.py index 00a5fa4..e93b79e 100644 --- a/whoot_model_training/whoot_model_training/models/model.py +++ b/whoot_model_training/whoot_model_training/models/model.py @@ -123,7 +123,11 @@ def items(self): ).items() if value is not None] @classmethod - def from_dict(cls, some_input): + def from_dict(cls, some_input: dict): + """Sometimes inputs are given as kwargs + So lets recreate correct inputs for model + via building from a dictionary! + """ spectrogram, waveform = None, None labels = some_input["labels"] if "spectrogram" in some_input: @@ -139,7 +143,6 @@ def from_dict(cls, some_input): class Model(BaseModel): """BaseModel Class for Whoot """ - # TODO Define required class instance variables def __init__(self, *args, **kwargs): self.input_format = ModelInput self.output_format = ModelOutput diff --git a/whoot_model_training/whoot_model_training/models/timm_model.py b/whoot_model_training/whoot_model_training/models/timm_model.py index 87042df..07bbd79 100644 --- a/whoot_model_training/whoot_model_training/models/timm_model.py +++ b/whoot_model_training/whoot_model_training/models/timm_model.py @@ -82,7 +82,6 @@ def set_custom_loss(self, loss_fn): """ self.loss = loss_fn - # TODO Fix this so it actually can take in a input object @has_required_inputs() def forward(self, x: TimmInputs) -> ModelOutput: """Model forward function diff --git a/whoot_model_training/whoot_model_training/preprocessors/default_preprocessor.py b/whoot_model_training/whoot_model_training/preprocessors/default_preprocessor.py index b2ed726..667fd53 100644 --- a/whoot_model_training/whoot_model_training/preprocessors/default_preprocessor.py +++ b/whoot_model_training/whoot_model_training/preprocessors/default_preprocessor.py @@ -13,9 +13,10 @@ see `whoot_model_training/models/model.py` for more info. 
""" - from .spectrogram_preprocessors import ( - BuowMelSpectrogramPreprocessors, SpectrogramParams + BuowMelSpectrogramPreprocessors, + SpectrogramParams, + Augmentations ) from ..models.model import ModelInput @@ -28,7 +29,7 @@ def __init__( self, model_input: ModelInput, duration=5, - augments: dict = {"audio":None, "spectrogram":None}, + augments: Augmentations = Augmentations(), spectrogram_params: SpectrogramParams = SpectrogramParams(), ): """ Creates a Online preprocessor for MelSpectrograms Based Models diff --git a/whoot_model_training/whoot_model_training/preprocessors/spectrogram_preprocessors.py b/whoot_model_training/whoot_model_training/preprocessors/spectrogram_preprocessors.py index d097a90..b3fb6df 100644 --- a/whoot_model_training/whoot_model_training/preprocessors/spectrogram_preprocessors.py +++ b/whoot_model_training/whoot_model_training/preprocessors/spectrogram_preprocessors.py @@ -25,6 +25,19 @@ class SpectrogramParams: n_mels: int = 256 +@dataclass +class Augmentations(): + """Dataclass for the augmentations of the model + + audio (list[dict]): per item key name of augmentation, + value is the augmentation + spectrogram (list[dict]): same idea but augmentations + applied onto spectrograms + """ + audio = None + spectrogram = None + + class BuowMelSpectrogramPreprocessors(PreProcessorBase): """Preprocessor for processing audio into spectrograms Particularly for the buow dataset @@ -33,11 +46,9 @@ class BuowMelSpectrogramPreprocessors(PreProcessorBase): def __init__( self, duration=5, - augments={"audio":None, "spectrogram":None}, + augments: Augmentations = Augmentations(), spectrogram_params: SpectrogramParams = SpectrogramParams() ): - - assert "audio" in augments.keys() and "spectrogram" in augments.keys() self.duration = duration self.augments = augments @@ -64,7 +75,7 @@ def __call__(self, batch): y = np.pad(y, end_sr - y.shape[-1]) # Audio Based Augmentations - if self.augments["audio"] is not None: + if self.augments.audio is not None: y, label = self.augments.audio(y, sr, label) pillow_transforms = transforms.ToPILImage() @@ -86,7 +97,7 @@ def __call__(self, batch): / 255 ) - if self.augments["spectrogram"] is not None: + if self.augments.spectrogram is not None: mels = self.augments.spectrogram(mels) new_audio.append(mels) From 22acd072fb0bcdf2a5cabd2bebd6a796e37621f8 Mon Sep 17 00:00:00 2001 From: Sean1572 Date: Fri, 4 Jul 2025 12:21:02 -0700 Subject: [PATCH 089/120] Add task logging --- whoot_model_training/configs/config.yml | 4 +++- whoot_model_training/train.py | 6 +++++- .../whoot_model_training/logger.py | 6 ++++++ .../whoot_model_training/trainer.py | 20 ++++++++++++++++++- 4 files changed, 33 insertions(+), 3 deletions(-) diff --git a/whoot_model_training/configs/config.yml b/whoot_model_training/configs/config.yml index 67599b1..c7b35f2 100644 --- a/whoot_model_training/configs/config.yml +++ b/whoot_model_training/configs/config.yml @@ -5,4 +5,6 @@ hf_cache_path: data/burrowing_owl_dataset/cache/metadata.hf # Required Variables COMET_PROJECT_NAME: "whoot" -CUDA_VISIBLE_DEVICES: "0" #"0,1" \ No newline at end of file +CUDA_VISIBLE_DEVICES: "0" #"0,1" +SUBPROJECT_NAME: +DATASET_NAME: \ No newline at end of file diff --git a/whoot_model_training/train.py b/whoot_model_training/train.py index 552c5f9..7caf899 100644 --- a/whoot_model_training/train.py +++ b/whoot_model_training/train.py @@ -112,7 +112,11 @@ def train(config): ds["test"].set_transform(preprocessor) # Run training - training_args = WhootTrainingArguments(run_name=run_name) + 
training_args = WhootTrainingArguments( + run_name=run_name, + subproject_name=config["SUBPROJECT_NAME"], + dataset_name=config["DATASET_NAME"], + ) # REQUIRED ARGS (DO NOT CHANGE VALUES TODO ADD TO TRAINER DIRECTLY) training_args.label_names = ["labels"] diff --git a/whoot_model_training/whoot_model_training/logger.py b/whoot_model_training/whoot_model_training/logger.py index 783c45c..23c3e03 100644 --- a/whoot_model_training/whoot_model_training/logger.py +++ b/whoot_model_training/whoot_model_training/logger.py @@ -40,3 +40,9 @@ def end(self): """Fully ends experiment if still running """ return self.experiment.end() + + def log_task(self, task_name): + """Log what task this model should be listed under + """ + self.experiment.log_parameter("task", task_name) + diff --git a/whoot_model_training/whoot_model_training/trainer.py b/whoot_model_training/whoot_model_training/trainer.py index 8bb8910..af8cbd9 100644 --- a/whoot_model_training/whoot_model_training/trainer.py +++ b/whoot_model_training/whoot_model_training/trainer.py @@ -21,9 +21,24 @@ class WhootTrainingArguments(PyhaTrainingArguments): """Holds arguments use for training """ - def __init__(self, run_name): + def __init__(self, + run_name, + subproject_name: str="TESTING", + dataset_name: str="DS_404" + ): + + assert subproject_name is not None + assert dataset_name is not None default_checkpoint_path = "model_checkpoints" checkpoint_created_at = datetime.now().strftime("%m_%d_%Y_%H:%M:%S") + + self.run_name = f"{subproject_name}_{dataset_name}_{run_name}" + self.task_name = f"{subproject_name}_{dataset_name}" + + print( + f"Starting training on {dataset_name} for {subproject_name}" + ) + super().__init__(os.path.join(f"{default_checkpoint_path}", f"{run_name}_{checkpoint_created_at}")) @@ -48,6 +63,9 @@ def __init__( ): metrics = WhootMutliClassMetrics(dataset.get_class_labels().names) + print(logger, type(logger)) + if logger is not None: + logger.log_task(training_args.task_name) super().__init__( model, From e4bf6b5bc49e63336de512810b66260ba822d3c8 Mon Sep 17 00:00:00 2001 From: Sean1572 Date: Fri, 4 Jul 2025 12:26:55 -0700 Subject: [PATCH 090/120] Added task filtering --- comet_ml_panels/leaderboard.py | 97 +++++++++++++++++++--------------- 1 file changed, 55 insertions(+), 42 deletions(-) diff --git a/comet_ml_panels/leaderboard.py b/comet_ml_panels/leaderboard.py index 8e3574b..80771cc 100644 --- a/comet_ml_panels/leaderboard.py +++ b/comet_ml_panels/leaderboard.py @@ -4,9 +4,6 @@ model metrics at each step for each model in the project Then displays the top models. -Note that updating this file does not update comet-ml. Please -go into the project to update after pushing to GitHub. - Example: This is not intended to be run locally. Please test on Comet-ML. @@ -14,52 +11,68 @@ For more on adding to this see docs at https://www.comet.com/docs/v2/guides/comet-ui/experiment-management/visualizations/python-panel/ -""" + Note that updating this file does not update comet-ml. Please + go into the project to update after pushing to GitHub. + Do not include Doc string in comet-ml... 
for some reason this + is displayed in the comet-ml panel if copied directly +""" from comet_ml import API, APIExperiment, ui - - -def get_max_metric(df, metric_col="metric"): - # Doing a simple groupby max removes extra useful metadata - # For example - # We may want to know the exact step we had the best score - # But a max groupby will only show the last step at the end - index = df[metric_col].argmax() - return df.iloc[index] - +import pandas as pd +import numpy as np # Initialize Comet API api = API() -# Get available metrics and select one +# Select the experiments and metrics to compare available_metrics = ["train/valid_cMAP", "train/valid_ROCAUC"] selected_metric = ui.dropdown("Select a metric:", available_metrics) -# Fetch experiment data +available_tasks = [None, "mutlilabelClass_buowset0"] +selected_task = ui.dropdown("Select a Task:", available_tasks) + experiment_keys = api.get_panel_experiment_keys() -if experiment_keys and selected_metric: - # Fetch the selected metric data for all experiments - metrics_df = api.get_metrics_df(experiment_keys, [selected_metric]) - - # Create Leaderboard View - leaderboard_df = metrics_df.groupby("experiment_key").apply( - lambda df: get_max_metric(df, selected_metric) - ).sort_values(by=selected_metric, ascending=False).reset_index(drop=True) - - leaderboard_df["users"] = leaderboard_df["experiment_key"].apply( - lambda key: APIExperiment(previous_experiment=key).get_user() - ) - - col_order = [ - "experiment_name", - selected_metric, - "experiment_key", - "step", - "users" - ] - - ui.display(leaderboard_df[col_order]) -else: - ui.display( - "No data to plot. Make sure your metric data is logged by step." - ) +data = api.get_metrics_for_chart( + experiment_keys, metrics=[selected_metric], parameters=["task"]) +processed_data = [] + +for key in data: + # Note, some of the early runs have no value for the task + # The following code handles those cases + TASK = None + if "task" in data[key]["params"]: + TASK = data[key]["params"]["task"] + + # Only display the leaderboard for tasks we want + # This CAN include runs with no task + if TASK is not selected_task and TASK != selected_task: + continue + + # Failed runs may not have metrics + if len(data[key]["metrics"]) == 0: + continue + + metric_values = data[key]["metrics"][0]["values"] + max_index = np.argmax(metric_values) + + processed_data.append({ + "experiment_name": data[key]["experimentName"], + "experiment_key": key, + selected_metric: max(metric_values), + "step": data[key]["metrics"][0]["steps"][max_index], + }) + +leaderboard_df = pd.DataFrame(processed_data) + +leaderboard_df["users"] = leaderboard_df["experiment_key"].apply( + lambda key: APIExperiment(previous_experiment=key).get_user() +) + +col_order = [ + "experiment_name", + selected_metric, + "experiment_key", + "step", + "users" +] +ui.display(leaderboard_df[col_order]) From 7024de5b3520aa5f7eb1ad89794f9a2ea45bf11a Mon Sep 17 00:00:00 2001 From: Sean1572 Date: Fri, 4 Jul 2025 12:29:39 -0700 Subject: [PATCH 091/120] Fixed sorting --- comet_ml_panels/leaderboard.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/comet_ml_panels/leaderboard.py b/comet_ml_panels/leaderboard.py index 80771cc..a190093 100644 --- a/comet_ml_panels/leaderboard.py +++ b/comet_ml_panels/leaderboard.py @@ -62,7 +62,8 @@ "step": data[key]["metrics"][0]["steps"][max_index], }) -leaderboard_df = pd.DataFrame(processed_data) +leaderboard_df = pd.DataFrame(processed_data).sort_values( + selected_metric, ascending=False) 
leaderboard_df["users"] = leaderboard_df["experiment_key"].apply( lambda key: APIExperiment(previous_experiment=key).get_user() From caad26538823c78eb460f877628d18372f19fd8c Mon Sep 17 00:00:00 2001 From: Sean1572 Date: Fri, 4 Jul 2025 12:32:53 -0700 Subject: [PATCH 092/120] Linted after task update --- whoot_model_training/whoot_model_training/logger.py | 3 +-- whoot_model_training/whoot_model_training/trainer.py | 8 ++++---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/whoot_model_training/whoot_model_training/logger.py b/whoot_model_training/whoot_model_training/logger.py index 23c3e03..7e0e6ab 100644 --- a/whoot_model_training/whoot_model_training/logger.py +++ b/whoot_model_training/whoot_model_training/logger.py @@ -40,9 +40,8 @@ def end(self): """Fully ends experiment if still running """ return self.experiment.end() - + def log_task(self, task_name): """Log what task this model should be listed under """ self.experiment.log_parameter("task", task_name) - diff --git a/whoot_model_training/whoot_model_training/trainer.py b/whoot_model_training/whoot_model_training/trainer.py index af8cbd9..c68631a 100644 --- a/whoot_model_training/whoot_model_training/trainer.py +++ b/whoot_model_training/whoot_model_training/trainer.py @@ -23,10 +23,9 @@ class WhootTrainingArguments(PyhaTrainingArguments): """ def __init__(self, run_name, - subproject_name: str="TESTING", - dataset_name: str="DS_404" - ): - + subproject_name: str = "TESTING", + dataset_name: str = "DS_404"): + assert subproject_name is not None assert dataset_name is not None default_checkpoint_path = "model_checkpoints" @@ -50,6 +49,7 @@ class WhootTrainer(PyhaTrainer): # WhootTrainer is ment to mimic the huggingface trainer # Including number of arguments # Aside, we really should consider how useful R0913,R0917 is... + # pylint: disable-next=R0913,R0917 def __init__( self, From 351852e99695e644bda102c9354399defdf8c0d8 Mon Sep 17 00:00:00 2001 From: Sean1572 Date: Fri, 4 Jul 2025 12:33:33 -0700 Subject: [PATCH 093/120] Linted --- whoot_model_training/whoot_model_training/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/whoot_model_training/whoot_model_training/trainer.py b/whoot_model_training/whoot_model_training/trainer.py index c68631a..8898ecd 100644 --- a/whoot_model_training/whoot_model_training/trainer.py +++ b/whoot_model_training/whoot_model_training/trainer.py @@ -49,7 +49,7 @@ class WhootTrainer(PyhaTrainer): # WhootTrainer is ment to mimic the huggingface trainer # Including number of arguments # Aside, we really should consider how useful R0913,R0917 is... 
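A hedged sketch of how the run/task naming introduced in the previous patch composes; the run label here is invented, while `mutlilabelClass_buowset0` matches the task string listed in the leaderboard panel:

    from whoot_model_training.trainer import WhootTrainingArguments

    args = WhootTrainingArguments(
        "resnet18_baseline",                # hypothetical run label
        subproject_name="mutlilabelClass",
        dataset_name="buowset0",
    )
    # args.run_name  -> "mutlilabelClass_buowset0_resnet18_baseline"
    # args.task_name -> "mutlilabelClass_buowset0"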
- + # pylint: disable-next=R0913,R0917 def __init__( self, From 0efd9c5442449a8f016865daebf9ce52e40c3f54 Mon Sep 17 00:00:00 2001 From: Sean1572 Date: Fri, 4 Jul 2025 13:25:36 -0700 Subject: [PATCH 094/120] Clean code --- whoot_model_training/train.py | 4 ---- whoot_model_training/whoot_model_training/dataset.py | 2 -- whoot_model_training/whoot_model_training/trainer.py | 3 +++ 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/whoot_model_training/train.py b/whoot_model_training/train.py index 7caf899..1c01b8d 100644 --- a/whoot_model_training/train.py +++ b/whoot_model_training/train.py @@ -118,10 +118,6 @@ def train(config): dataset_name=config["DATASET_NAME"], ) - # REQUIRED ARGS (DO NOT CHANGE VALUES TODO ADD TO TRAINER DIRECTLY) - training_args.label_names = ["labels"] - training_args.remove_unused_columns = False - # OPTIONAL ARGS training_args.num_train_epochs = 2 training_args.eval_steps = 20 diff --git a/whoot_model_training/whoot_model_training/dataset.py b/whoot_model_training/whoot_model_training/dataset.py index 3d17f6b..a86f1ae 100644 --- a/whoot_model_training/whoot_model_training/dataset.py +++ b/whoot_model_training/whoot_model_training/dataset.py @@ -6,8 +6,6 @@ Using an Arrow Dataset from Hugging Face's dataset library because - Cool audio features https://huggingface.co/docs/datasets/en/audio_process - Faster than pandas, better at managing memory - -# TODO Use the default stuff from pyha-analyzer """ from datasets import DatasetDict, ClassLabel diff --git a/whoot_model_training/whoot_model_training/trainer.py b/whoot_model_training/whoot_model_training/trainer.py index 8898ecd..409cce1 100644 --- a/whoot_model_training/whoot_model_training/trainer.py +++ b/whoot_model_training/whoot_model_training/trainer.py @@ -41,6 +41,9 @@ def __init__(self, super().__init__(os.path.join(f"{default_checkpoint_path}", f"{run_name}_{checkpoint_created_at}")) + #Override defaults + self.label_names = ["labels"] + self.remove_unused_columns = False class WhootTrainer(PyhaTrainer): """The training class From bbbfde1d472eef763913c488ffbd3f276230e1ba Mon Sep 17 00:00:00 2001 From: Sean1572 Date: Fri, 4 Jul 2025 14:07:58 -0700 Subject: [PATCH 095/120] Cleaned code --- whoot_model_training/train.py | 14 ++++---- .../data_extractor/buowset_extractor.py | 10 +++--- .../whoot_model_training/logger.py | 4 +++ .../whoot_model_training/metrics.py | 31 +++++++++++----- .../whoot_model_training/models/__init__.py | 9 ++++- .../whoot_model_training/models/model.py | 4 +++ .../whoot_model_training/trainer.py | 35 +++++++++++++------ 7 files changed, 75 insertions(+), 32 deletions(-) diff --git a/whoot_model_training/train.py b/whoot_model_training/train.py index 1c01b8d..6a40f00 100644 --- a/whoot_model_training/train.py +++ b/whoot_model_training/train.py @@ -24,6 +24,8 @@ from whoot_model_training.preprocessors import ( SpectrogramModelInputPreprocessors ) + +# Uncomment for use with data augmentation # from pyha_analyzer.preprocessors import MixItUp, ComposeAudioLabel # from audiomentations import ( # Compose, AddColorNoise, @@ -74,8 +76,8 @@ def train(config): # Preprocessors - # Augmentations - # TODO: Design better system for saving and reproducing augmentation + # Uncomment if doing work with data augmentation + # # Augmentations # wav_augs = ComposeAudioLabel([ # # AddBackgroundNoise( #We don't have background noise yet... 
# # sounds_path="data_birdset/background_noise", @@ -98,7 +100,7 @@ def train(config): # ) # ]) - # We define here what the model reads + # Offline preprocessors prepare data for training train_preprocessor = SpectrogramModelInputPreprocessors( TimmInputs, duration=3 ) @@ -118,16 +120,13 @@ def train(config): dataset_name=config["DATASET_NAME"], ) - # OPTIONAL ARGS + # COMMON OPTIONAL ARGS training_args.num_train_epochs = 2 training_args.eval_steps = 20 training_args.per_device_train_batch_size = 32 training_args.per_device_eval_batch_size = 32 training_args.dataloader_num_workers = 36 training_args.run_name = run_name - training_args.report_to = "comet_ml" - - print(training_args.accelerator_config.even_batches) trainer = WhootTrainer( model=model, @@ -137,7 +136,6 @@ def train(config): augmentations=None, name=training_args.run_name ), - ignore_keys=["predictions", "labels", "embeddings", "loss"] ) trainer.train() diff --git a/whoot_model_training/whoot_model_training/data_extractor/buowset_extractor.py b/whoot_model_training/whoot_model_training/data_extractor/buowset_extractor.py index 8adadb2..085a78c 100644 --- a/whoot_model_training/whoot_model_training/data_extractor/buowset_extractor.py +++ b/whoot_model_training/whoot_model_training/data_extractor/buowset_extractor.py @@ -40,6 +40,11 @@ def one_hot_encode(row: dict, classes: list): @dataclass class BuowsetParams(): """Parameters that describe the Buowset + + validation_fold (int): label for valid split + test_fold (int): label for valid split + sample_rate (int): sample rate of the data + filepath (int): name of column in csv for filepaths """ validation_fold = 4 test_fold = 3 @@ -72,10 +77,6 @@ def buowset_extractor( (AudioDataset): See dataset.py, AudioDatasets are consider the universal dataset for the training pipeline. 
""" - # if os.path.exists(output_path): - # ds = load_from_disk(output_path) - # return AudioDataset(ds) - # Hugging face by default defines a train split ds = load_dataset("csv", data_files=metadata_csv)["train"] ds = ds.rename_column("label", "labels") # Convention here is labels @@ -96,7 +97,6 @@ def buowset_extractor( ) ds = ds.add_column("filepath", ds["audio"]) - ds = ds.cast_column("audio", Audio(sampling_rate=params.sr)) # Create splits of the data diff --git a/whoot_model_training/whoot_model_training/logger.py b/whoot_model_training/whoot_model_training/logger.py index 7e0e6ab..a7ca5c1 100644 --- a/whoot_model_training/whoot_model_training/logger.py +++ b/whoot_model_training/whoot_model_training/logger.py @@ -43,5 +43,9 @@ def end(self): def log_task(self, task_name): """Log what task this model should be listed under + + Args: + task_name: usually what task the model is doing + and the dataset being used for training """ self.experiment.log_parameter("task", task_name) diff --git a/whoot_model_training/whoot_model_training/metrics.py b/whoot_model_training/whoot_model_training/metrics.py index fe4be0b..3b4033b 100644 --- a/whoot_model_training/whoot_model_training/metrics.py +++ b/whoot_model_training/whoot_model_training/metrics.py @@ -4,17 +4,16 @@ These the metrics with HF Trainer and are called as part of a callback during training + +WhootMutliClassMetrics: Computes CMAP, ROCAUC and + confusion matrices each evaluation step of + the trainer """ import comet_ml import torch from sklearn.metrics import confusion_matrix -# from torchmetrics.classification import ( -# MultilabelAveragePrecision, -# MultilabelAUROC, -# ) - from pyha_analyzer.metrics.classification_metrics \ import AudioClassificationMetrics @@ -29,7 +28,17 @@ def __init__(self, classes: list): super().__init__([], len(classes), mutlilabel=True) def __call__(self, eval_pred) -> dict[str, float]: - # CMAP / ROCAUC + """Log all metrics + + eval_pred: package of data provided by trainer + contains + - predictions: np.array of model outputs + - label_ids: np.array of ground truth targets + + returns: + (dict) key name of metric, float metric score + """ + # CMAP / ROCAUC, done by AudioClassificationMetrics initial_metrics = super().__call__(eval_pred=eval_pred) # Confusion Matrix @@ -39,7 +48,12 @@ def __call__(self, eval_pred) -> dict[str, float]: return initial_metrics def log_comet_ml_only(self, eval_pred): - """Logs confusion matrix each eval step + """Logs confusion matrix + + eval_pred: package of data provided by trainer + contains + - predictions: np.array of model outputs + - label_ids: np.array of ground truth targets """ # For metrics that are not loggable to console # We can only have comet_ml for these metrics @@ -49,7 +63,8 @@ def log_comet_ml_only(self, eval_pred): logits = torch.Tensor(eval_pred.predictions) target = torch.Tensor(eval_pred.label_ids).to(torch.long) - # Confusion Matrix WARNING, ONLY USE IF DATA IS MOSTLY MUTLICLASS + # Confusion Matrix WARNING, ONLY MAKES SENSE + # IF DATA IS MOSTLY MUTLICLASS cm = confusion_matrix( torch.argmax(target, dim=1), torch.argmax(logits, dim=1) diff --git a/whoot_model_training/whoot_model_training/models/__init__.py b/whoot_model_training/whoot_model_training/models/__init__.py index 2f6c63b..589aa49 100644 --- a/whoot_model_training/whoot_model_training/models/__init__.py +++ b/whoot_model_training/whoot_model_training/models/__init__.py @@ -5,5 +5,12 @@ """ from .timm_model import TimmModel, TimmInputs +from .model import Model, ModelInput, ModelOutput 
-__all__ = ["TimmModel", "TimmInputs"]
+__all__ = [
+    "TimmModel",
+    "TimmInputs",
+    "Model",
+    "ModelInput",
+    "ModelOutput"
+]
diff --git a/whoot_model_training/whoot_model_training/models/model.py b/whoot_model_training/whoot_model_training/models/model.py
index e93b79e..06e3593 100644
--- a/whoot_model_training/whoot_model_training/models/model.py
+++ b/whoot_model_training/whoot_model_training/models/model.py
@@ -65,6 +65,10 @@ class ModelOutput(dict, UserDict):
     ModelOutput class
     """
 
+    # ignore some of the outputs when computing metrics
+    # When overwriting DON"T FORGET TO INCLUDE THIS
+    ignore_keys=["predictions", "labels", "embeddings", "loss"]
+
     def __init__(
         self,
         _map: dict | None = None,
diff --git a/whoot_model_training/whoot_model_training/trainer.py b/whoot_model_training/whoot_model_training/trainer.py
index 409cce1..abc6d36 100644
--- a/whoot_model_training/whoot_model_training/trainer.py
+++ b/whoot_model_training/whoot_model_training/trainer.py
@@ -5,7 +5,6 @@
 many many args for WhootTrainer
 
 WhootTrainer: The class that is going to run training
-
 """
 
 from datetime import datetime
@@ -16,6 +15,7 @@
 
 from .metrics import WhootMutliClassMetrics
 from .dataset import AudioDataset
+from .models import Model
 
 
 class WhootTrainingArguments(PyhaTrainingArguments):
@@ -28,9 +28,12 @@ def __init__(self,
 
         assert subproject_name is not None
         assert dataset_name is not None
+
         default_checkpoint_path = "model_checkpoints"
         checkpoint_created_at = datetime.now().strftime("%m_%d_%Y_%H:%M:%S")
 
+        # run_name is name of the model
+        # task_name is name of the model task and dataset trained
         self.run_name = f"{subproject_name}_{dataset_name}_{run_name}"
         self.task_name = f"{subproject_name}_{dataset_name}"
 
@@ -41,13 +44,27 @@
         super().__init__(os.path.join(f"{default_checkpoint_path}",
                          f"{run_name}_{checkpoint_created_at}"))
 
-        #Override defaults
+        #Required for whoot: override defaults in PyhaTrainingArguments
         self.label_names = ["labels"]
         self.remove_unused_columns = False
+        self.report_to = "comet_ml"
+
 
 class WhootTrainer(PyhaTrainer):
-    """The training class
-    #TODO Improve these docstrings
+    """Trainers run the training of a model
+
+    Model (Model): a pytorch model for training
+        should inherit from BaseModel
+        see `models/model.py`
+    Dataset (AudioDataset): A canonical audio dataset
+        Ideally has a preprocessor attached and returns ModelInputs
+    training_args (WhootTrainingArguments):
+        All the parameters that define training
+    Logger (CometMLLoggerSupplement):
+        Class that adds additional logging
+        On top of logging done by PyhaTrainer
+    preprocessor (PreProcessorBase):
+        Preprocessor used for formatting the data
     """
     # WhootTrainer is ment to mimic the huggingface trainer
     # Including number of arguments
     # Aside, we really should consider how useful R0913,R0917 is...
 
     # pylint: disable-next=R0913,R0917
     def __init__(
         self,
-        model,
+        model: Model,
         dataset: AudioDataset,
-        training_args=None,
+        training_args: WhootTrainingArguments = None,
         logger=None,
-        data_collator=None,
         preprocessor=None,
-        ignore_keys=...
): metrics = WhootMutliClassMetrics(dataset.get_class_labels().names) @@ -76,7 +91,7 @@ def __init__( metrics, training_args, logger, - data_collator, + None, # Data Collator, about to be deprecated preprocessor, - ignore_keys + model.output_format.ignore_keys ) From 32164151a4a8f63138ef1aebc2531dbee39b0931 Mon Sep 17 00:00:00 2001 From: Sean1572 Date: Fri, 4 Jul 2025 14:10:52 -0700 Subject: [PATCH 096/120] Linted --- whoot_model_training/whoot_model_training/models/model.py | 2 +- whoot_model_training/whoot_model_training/trainer.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/whoot_model_training/whoot_model_training/models/model.py b/whoot_model_training/whoot_model_training/models/model.py index 06e3593..d3ee241 100644 --- a/whoot_model_training/whoot_model_training/models/model.py +++ b/whoot_model_training/whoot_model_training/models/model.py @@ -67,7 +67,7 @@ class ModelOutput(dict, UserDict): # ignore some of the outputs when computing metrics # When overwriting DON"T FORGET TO INCLUDE THIS - ignore_keys=["predictions", "labels", "embeddings", "loss"] + ignore_keys = ["predictions", "labels", "embeddings", "loss"] def __init__( self, diff --git a/whoot_model_training/whoot_model_training/trainer.py b/whoot_model_training/whoot_model_training/trainer.py index abc6d36..d57641b 100644 --- a/whoot_model_training/whoot_model_training/trainer.py +++ b/whoot_model_training/whoot_model_training/trainer.py @@ -44,7 +44,7 @@ def __init__(self, super().__init__(os.path.join(f"{default_checkpoint_path}", f"{run_name}_{checkpoint_created_at}")) - #Required for whoot: override defaults in PyhaTrainingArguments + # Required for whoot: override defaults in PyhaTrainingArguments self.label_names = ["labels"] self.remove_unused_columns = False self.report_to = "comet_ml" From 1186e93baa34cdce3a2178544c94cf1f0e44843c Mon Sep 17 00:00:00 2001 From: Sean Perry Date: Fri, 4 Jul 2025 14:13:39 -0700 Subject: [PATCH 097/120] Add binary extractor for buowset (#39) * Add binary extractor for buowset * Adds binary extractor --- whoot_model_training/train.py | 2 +- .../data_extractor/__init__.py | 8 ++-- .../data_extractor/buowset_extractor.py | 41 ++++++++++++++++++- 3 files changed, 46 insertions(+), 5 deletions(-) diff --git a/whoot_model_training/train.py b/whoot_model_training/train.py index 6a40f00..59df78b 100644 --- a/whoot_model_training/train.py +++ b/whoot_model_training/train.py @@ -17,7 +17,7 @@ import yaml from whoot_model_training.trainer import WhootTrainer, WhootTrainingArguments -from whoot_model_training.data_extractor import buowset_extractor +from whoot_model_training.data_extractor import buowset_extractor, buowset_binary_extractor from whoot_model_training.models import TimmModel, TimmInputs from whoot_model_training import CometMLLoggerSupplement diff --git a/whoot_model_training/whoot_model_training/data_extractor/__init__.py b/whoot_model_training/whoot_model_training/data_extractor/__init__.py index 96c821c..2648cb0 100644 --- a/whoot_model_training/whoot_model_training/data_extractor/__init__.py +++ b/whoot_model_training/whoot_model_training/data_extractor/__init__.py @@ -3,7 +3,9 @@ Extractors convert raw data into AudioDatasets Ideally you make a new Extractor for each new raw dataset """ +from .buowset_extractor import ( + buowset_extractor, + buowset_binary_extractor, +) -from .buowset_extractor import buowset_extractor - -__all__ = ["buowset_extractor"] +__all__ = ["buowset_extractor", "buowset_binary_extractor"] diff --git 
a/whoot_model_training/whoot_model_training/data_extractor/buowset_extractor.py b/whoot_model_training/whoot_model_training/data_extractor/buowset_extractor.py index 085a78c..a764a17 100644 --- a/whoot_model_training/whoot_model_training/data_extractor/buowset_extractor.py +++ b/whoot_model_training/whoot_model_training/data_extractor/buowset_extractor.py @@ -24,7 +24,7 @@ ) from ..dataset import AudioDataset - +# MAKE LOG OF DATASET USED def one_hot_encode(row: dict, classes: list): """One hot Encodes a list of labels Args: @@ -80,6 +80,7 @@ def buowset_extractor( # Hugging face by default defines a train split ds = load_dataset("csv", data_files=metadata_csv)["train"] ds = ds.rename_column("label", "labels") # Convention here is labels + # Convert to a uniform one_hot encoding for classes ds = ds.class_encode_column("labels") @@ -114,3 +115,41 @@ def buowset_extractor( ds.save_to_disk(output_path) return ds + + + +def binarize_data(row, target_col=0): + row["labels"] = [row["labels"][target_col], 1-row["labels"][target_col]] + return row + +def buowset_binary_extractor( + metadata_csv, + parent_path, + output_path, # TODO what does output do? + validation_fold=4, + test_fold=3, + sr=32_000, + filepath="segment", + target_col = 0 + ): + + + ads = buowset_extractor(metadata_csv, + parent_path, + output_path, + validation_fold=validation_fold, + test_fold=test_fold, + sr=sr, + filepath=filepath + ) + + binary_class_label = Sequence(ClassLabel(names=["no_buow", "buow"])) + print(binary_class_label.feature.num_classes) + for split in ads: + ads[split] = ads[split].map(lambda row: binarize_data(row, target_col=target_col)).cast_column( + "labels", binary_class_label + ) + + print(ads.get_num_classes()) + + return ads \ No newline at end of file From 7f6d3e3d28dc9589fa3052e50ca4bd6a79f832d0 Mon Sep 17 00:00:00 2001 From: Sean1572 Date: Fri, 4 Jul 2025 14:29:43 -0700 Subject: [PATCH 098/120] Lint the binary model and retest --- .../data_extractor/buowset_extractor.py | 65 +++++++++++++------ 1 file changed, 45 insertions(+), 20 deletions(-) diff --git a/whoot_model_training/whoot_model_training/data_extractor/buowset_extractor.py b/whoot_model_training/whoot_model_training/data_extractor/buowset_extractor.py index a764a17..0f78d70 100644 --- a/whoot_model_training/whoot_model_training/data_extractor/buowset_extractor.py +++ b/whoot_model_training/whoot_model_training/data_extractor/buowset_extractor.py @@ -9,6 +9,8 @@ This way, it should be easier to define what a common audio dataset format is between parts of the codebase for training + +Supports both mutlilabel and binary labels """ import os @@ -24,7 +26,7 @@ ) from ..dataset import AudioDataset -# MAKE LOG OF DATASET USED + def one_hot_encode(row: dict, classes: list): """One hot Encodes a list of labels Args: @@ -51,6 +53,7 @@ class BuowsetParams(): sr = 32_000 filepath = "segment" +## Multilabel Extractor def buowset_extractor( metadata_csv, @@ -80,7 +83,6 @@ def buowset_extractor( # Hugging face by default defines a train split ds = load_dataset("csv", data_files=metadata_csv)["train"] ds = ds.rename_column("label", "labels") # Convention here is labels - # Convert to a uniform one_hot encoding for classes ds = ds.class_encode_column("labels") @@ -116,40 +118,63 @@ def buowset_extractor( return ds - +## Binary Extractor def binarize_data(row, target_col=0): + """ Convert a mutlilabel label into a binary one + + Args: + row (dict): an example of data + target_col (int): which index is the label for no_buow + + returns + row (dict): now 
with a binary label instead
+    """
     row["labels"] = [row["labels"][target_col], 1-row["labels"][target_col]]
     return row

+
 def buowset_binary_extractor(
         metadata_csv,
         parent_path,
-        output_path, # TODO what does output do?
-        validation_fold=4,
-        test_fold=3,
-        sr=32_000,
-        filepath="segment",
-        target_col = 0
-    ):
+        output_path,
+        target_col=0):
+    """Extracts raw data in the buowset format into an AudioDataset
+    BUT only allows for two classes: no_buow, yes_buow
+
+    Args:
+        metadata_csv (str): Path to csv containing buowset metadata
+        parent_path (str): Path to the parent folder for all audio data.
+            Note it's assumed the audio filepath
+            in the csv is relative to parent_path
+        output_path (str): Path to where HF cache for this dataset should live
+        validation_fold (int): which fold is considered the validation set
+            Default 4
+        test_fold (int): Which fold is considered the test set Default 3
+        sr (int): Sample Rate of the audio files Default: 32_000
+        target_col (int): label for no_buow
+
+    Returns:
+        (AudioDataset): See dataset.py, AudioDatasets are considered
+            the universal dataset for the training pipeline.
+    """

-    ads = buowset_extractor(metadata_csv,
+    # Use the original extractor to create a mutlilabeled dataset
+    ads = buowset_extractor(
+        metadata_csv,
         parent_path,
         output_path,
-        validation_fold=validation_fold,
-        test_fold=test_fold,
-        sr=sr,
-        filepath=filepath
     )

+    # Now we just need to convert labels from mutlilabel to
+    # 0 or 1
     binary_class_label = Sequence(ClassLabel(names=["no_buow", "buow"]))
     print(binary_class_label.feature.num_classes)
     for split in ads:
-        ads[split] = ads[split].map(lambda row: binarize_data(row, target_col=target_col)).cast_column(
-            "labels", binary_class_label
-        )
-
+        ads[split] = ads[split].map(
+            lambda row: binarize_data(row, target_col=target_col)
+        ).cast_column("labels", binary_class_label)
+
     print(ads.get_num_classes())

-    return ads
\ No newline at end of file
+    return ads

From 9e3007dbcbdf700d1739efe4cab6064aaeed18de Mon Sep 17 00:00:00 2001
From: Sean1572
Date: Fri, 4 Jul 2025 14:30:13 -0700
Subject: [PATCH 099/120] Cleans up train script demos

---
 whoot_model_training/train.py        |   2 +-
 whoot_model_training/train_binary.py | 162 +++++++++++++++++++++++++++
 2 files changed, 163 insertions(+), 1 deletion(-)
 create mode 100644 whoot_model_training/train_binary.py

diff --git a/whoot_model_training/train.py b/whoot_model_training/train.py
index 59df78b..6a40f00 100644
--- a/whoot_model_training/train.py
+++ b/whoot_model_training/train.py
@@ -17,7 +17,7 @@
 import yaml

 from whoot_model_training.trainer import WhootTrainer, WhootTrainingArguments
-from whoot_model_training.data_extractor import buowset_extractor, buowset_binary_extractor
+from whoot_model_training.data_extractor import buowset_extractor
 from whoot_model_training.models import TimmModel, TimmInputs
 from whoot_model_training import CometMLLoggerSupplement

diff --git a/whoot_model_training/train_binary.py b/whoot_model_training/train_binary.py
new file mode 100644
index 0000000..6613fa4
--- /dev/null
+++ b/whoot_model_training/train_binary.py
@@ -0,0 +1,162 @@
+"""Trains a Binary Model with Pytorch and Huggingface
+
+This script can be used to run experiments with different
+models and datasets to create any model for bioacoustic classification
+
+This script is intended to be heavily modified with each experiment
+(say one wants to use a different dataset, one should copy this and change the
+extractor!)
+
+Usage:
+    $ python train_binary.py /path/to/config.yml
+
+config.yml should contain frequently changed hyperparameters
+"""
+import os
+import argparse
+import yaml
+
+from whoot_model_training.trainer import WhootTrainer, WhootTrainingArguments
+from whoot_model_training.data_extractor import buowset_binary_extractor
+from whoot_model_training.models import TimmModel, TimmInputs
+from whoot_model_training import CometMLLoggerSupplement
+
+from whoot_model_training.preprocessors import (
+    SpectrogramModelInputPreprocessors
+)
+
+# Uncomment for use with data augmentation
+# from pyha_analyzer.preprocessors import MixItUp, ComposeAudioLabel
+# from audiomentations import (
+#     Compose, AddColorNoise,
+#     AddBackgroundNoise, PolarityInversion, Gain
+# )
+
+
+def parse_config(config_path: str) -> dict:
+    """wrapper to parse config
+
+    Args:
+        config_path (str): path to config file for training!
+
+    returns:
+        (dict): hyperparameters
+    """
+    config = {}
+    with open(config_path, "r", encoding="UTF-8") as f:
+        config = yaml.safe_load(f)
+    return config
+
+
+def train(config):
+    """Highest level logic for training
+
+    Does the following:
+    - Formats the dataset into an AudioDataset
+    - Prepares preprocessing for each audio clip
+    - Builds the model
+    - Configures and runs the trainer
+    - Runs evaluation
+
+    Args:
+        config (dict): the config used for training. Defined in yaml file
+    """
+
+    # Extract the dataset
+    ds = buowset_binary_extractor(
+        metadata_csv=config["metadata_csv"],
+        parent_path=config["data_path"],
+        output_path=config["hf_cache_path"],
+    )
+
+    # Create the model
+    run_name = "efficientnet_b1_testing_confusion_matrix_no_data_aug"
+    model = TimmModel(timm_model="efficientnet_b1",
+                      num_classes=ds.get_num_classes())
+
+    # Preprocessors
+
+    # Uncomment if doing work with data augmentation
+    # # Augmentations
+    # wav_augs = ComposeAudioLabel([
+    #     # AddBackgroundNoise( #We don't have background noise yet...
+    #     #     sounds_path="data_birdset/background_noise",
+    #     #     min_snr_db=10,
+    #     #     max_snr_db=30,
+    #     #     noise_transform=PolarityInversion(),
+    #     #     p=0.8
+    #     # ),
+    #     Gain(
+    #         min_gain_db = -12,
+    #         max_gain_db = 12,
+    #         p = 0.8
+    #     ),
+    #     MixItUp(
+    #         dataset_ref=ds["train"],
+    #         min_snr_db=10,
+    #         max_snr_db=30,
+    #         noise_transform=PolarityInversion(),
+    #         p=0.8
+    #     )
+    # ])
+
+    # Offline preprocessors prepare data for training
+    train_preprocessor = SpectrogramModelInputPreprocessors(
+        TimmInputs, duration=3
+    )
+
+    preprocessor = SpectrogramModelInputPreprocessors(
+        TimmInputs, duration=3
+    )
+
+    ds["train"].set_transform(train_preprocessor)
+    ds["valid"].set_transform(preprocessor)
+    ds["test"].set_transform(preprocessor)
+
+    # Run training
+    training_args = WhootTrainingArguments(
+        run_name=run_name,
+        subproject_name=config["SUBPROJECT_NAME"],
+        dataset_name=config["DATASET_NAME"],
+    )
+
+    # COMMON OPTIONAL ARGS
+    training_args.num_train_epochs = 2
+    training_args.eval_steps = 20
+    training_args.per_device_train_batch_size = 32
+    training_args.per_device_eval_batch_size = 32
+    training_args.dataloader_num_workers = 36
+    training_args.run_name = run_name
+
+    trainer = WhootTrainer(
+        model=model,
+        dataset=ds,
+        training_args=training_args,
+        logger=CometMLLoggerSupplement(
+            augmentations=None,
+            name=training_args.run_name
+        ),
+    )
+
+    trainer.train()
+
+
+def init_env(config: dict):
+    """Sets up local environment for COMET-ML training logging
+
+    Args: config (dict): at a minimum this has the project name
+        and CUDA devices that are allowed to be used.
+ """ + print(config) + os.environ["COMET_PROJECT_NAME"] = config["COMET_PROJECT_NAME"] + os.environ["CUDA_VISIBLE_DEVICES"] = config["CUDA_VISIBLE_DEVICES"] + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Input config path") + parser.add_argument("config", type=str, help="Path to config.yml") + args = parser.parse_args() + _config = parse_config(args.config) + + init_env(_config) + train(_config) From e7115c9ad87911e8c0f499619200001dbfea908b Mon Sep 17 00:00:00 2001 From: Sean Perry Date: Fri, 4 Jul 2025 14:36:20 -0700 Subject: [PATCH 100/120] Make it easier to add tasks to leaderboard --- comet_ml_panels/leaderboard.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/comet_ml_panels/leaderboard.py b/comet_ml_panels/leaderboard.py index a190093..ff597d5 100644 --- a/comet_ml_panels/leaderboard.py +++ b/comet_ml_panels/leaderboard.py @@ -28,12 +28,18 @@ available_metrics = ["train/valid_cMAP", "train/valid_ROCAUC"] selected_metric = ui.dropdown("Select a metric:", available_metrics) -available_tasks = [None, "mutlilabelClass_buowset0"] -selected_task = ui.dropdown("Select a Task:", available_tasks) - experiment_keys = api.get_panel_experiment_keys() data = api.get_metrics_for_chart( experiment_keys, metrics=[selected_metric], parameters=["task"]) + +# Given all experiments, find all possible tasks to measure! +available_tasks = list( + set(data[key]["params"]["task"] + for key in data if "task" in data[key]["params"]) +) +available_tasks.append(None) +selected_task = ui.dropdown("Select a Task:", available_tasks) + processed_data = [] for key in data: From 8a09acaeb01892457f0e11502e4f29fc6b61c5b2 Mon Sep 17 00:00:00 2001 From: Sean1572 Date: Fri, 4 Jul 2025 14:38:21 -0700 Subject: [PATCH 101/120] Lint --- .../whoot_model_training/data_extractor/buowset_extractor.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/whoot_model_training/whoot_model_training/data_extractor/buowset_extractor.py b/whoot_model_training/whoot_model_training/data_extractor/buowset_extractor.py index 0f78d70..965a49f 100644 --- a/whoot_model_training/whoot_model_training/data_extractor/buowset_extractor.py +++ b/whoot_model_training/whoot_model_training/data_extractor/buowset_extractor.py @@ -53,7 +53,6 @@ class BuowsetParams(): sr = 32_000 filepath = "segment" -## Multilabel Extractor def buowset_extractor( metadata_csv, @@ -118,7 +117,6 @@ def buowset_extractor( return ds -## Binary Extractor def binarize_data(row, target_col=0): """ Convert a mutlilabel label into a binary one From bab8398ec2f3fe25e8f1ff6bc60e0839616c193e Mon Sep 17 00:00:00 2001 From: Sean1572 Date: Wed, 9 Jul 2025 10:07:29 -0700 Subject: [PATCH 102/120] Fixed install Based on review feedback --- pyproject.toml | 2 +- whoot_model_training/README.md | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 4b5ae8c..62bfe4d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ cu128 = [ model_training = [ "datasets>=3.5.1", "timm>=1.0.15", - "pyha-analyzer", + "pyha-analyzer@git+https://github.com/UCSD-E4E/pyha-analyzer-2.0.git#egg=support_whoot", "comet-ml>=3.43.2", ] diff --git a/whoot_model_training/README.md b/whoot_model_training/README.md index 1da75e1..65ef4cf 100644 --- a/whoot_model_training/README.md +++ b/whoot_model_training/README.md @@ -8,7 +8,9 @@ Key inspiration is https://github.com/UCSD-E4E/pyha-analyzer-2.0/tree/main. 
This

 To set up environment for model training:
 1) run steps 1 - 3 of the installation instructions in `whoot/README.md`
-2) For step 4, specifically run `pip install -e .[model_training, cu128/cpu]`
+2) For step 4, specifically run `pip install -e .[model_training, cpu]` for cpu training, `pip install -e .[model_training, cu128]` for training on Nvidia GPUs
+
+Note that you should check what is supported by CUDA on your machine. See developers if you need a different CUDA version

 # Running

From bab8398ec2f3fe25e8f1ff6bc60e0839616c193e Mon Sep 17 00:00:00 2001
From: Sean1572
Date: Wed, 9 Jul 2025 10:12:06 -0700
Subject: [PATCH 103/120] Fixed error handling

Tbh I have no idea if this is the best way to do this. Might look it up
later but decided to just do a quick fix

---
 whoot_model_training/whoot_model_training/dataset.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/whoot_model_training/whoot_model_training/dataset.py b/whoot_model_training/whoot_model_training/dataset.py
index a86f1ae..ec11f85 100644
--- a/whoot_model_training/whoot_model_training/dataset.py
+++ b/whoot_model_training/whoot_model_training/dataset.py
@@ -42,9 +42,11 @@ def validate_format(self, ds: DatasetDict):
         for split in ds.keys():
             dataset = ds[split]
             for column in DEFAULT_COLUMNS:
+                phrase_one = "The column `"
+                phrase_two = "` is missing from dataset split `"
+                phrase_three = "`. Required by system"
                 state = (
-                    f"The column `{column}` is missing from dataset split `{
-                        split}`. Required by system"
+                    f"{phrase_one}{column}{phrase_two}{split}{phrase_three}"
                 )
                 assert column in dataset.features, state

From b2d58df2c31f7195902c855fd93a35b7091ebfb4 Mon Sep 17 00:00:00 2001
From: Sean1572
Date: Wed, 9 Jul 2025 10:23:45 -0700
Subject: [PATCH 104/120] Update documentation for config.yml

---
 whoot_model_training/README.md | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/whoot_model_training/README.md b/whoot_model_training/README.md
index 65ef4cf..a539c01 100644
--- a/whoot_model_training/README.md
+++ b/whoot_model_training/README.md
@@ -15,10 +15,32 @@

 # Running

 0) Add your Comet-ML API to your local environment. See
-1) Create a copy of the config found in `configs/config.yml` and fill it out with your dataset
+1) Create a copy of the config found in `configs/config.yml` and fill it out for your dataset. See the [config](#config) section
 2) Edit train.py to set up training for your dataset. If you are using a new dataset which an extractor does not exist for, contact code authors.
 3) run `python train.py path/to/your/config/file.yml`

+# Config
+
+## Default Config Properties
+The properties of `config.yml` are as follows:
+### Data paths
+`metadata_csv`: the path to the metadata file for your dataset.
+`data_path`: Path to the highest level parent folder containing audio. Audio can be in a different path than the metadata!
+`hf_cache_path`: cache for Hugging Face. This path is created automatically when the script runs; it is where the extracted dataset cache is written.
+
+### Required Variables
+`COMET_PROJECT_NAME`: "whoot", this is the comet-ml project that training runs will log to.
+`CUDA_VISIBLE_DEVICES`: "0" or "0,1", this controls how many GPUs the training uses.
+`SUBPROJECT_NAME`: Some description to help filter which training this is used for, can be the task being done (multi_label_classification) or something else (fun_training_test) +`DATASET_NAME`: Name of the dataset being trained on, will be embedded on comet_ml to make searching easier + +## Project Specific config information +### Buowset +The filenames in metadata_csv are the audio files found in `data_path`. + +`SUBPROJECT_NAME` is either "binary" or "mutlilabelClass" +`DATASET_NAME` is buowset0 + # Repo Philosophy The most challenging issue with machine learning is the dataset. This training repo intends to make it easy to modularize parts of the training pipeline, and integrate them together, ideally regardless of the dataset. From fcaa6bbe1c277649071f52e0938e5ab26c392c3b Mon Sep 17 00:00:00 2001 From: sean1572 Date: Wed, 9 Jul 2025 15:46:25 -0700 Subject: [PATCH 105/120] Renamed extra deps from model_training to model-training pip does not like underscores... --- README.md | 2 +- pyproject.toml | 6 +++--- whoot_model_training/README.md | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 8a729fc..414b82a 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ Current support optional dependency collections include - `cpu`: Installs torch and torchvision for CPU use only - `cu128`: Installs torch and torchvision with Cuda 12.8 Binaries -- `model_training`: Required for running scripts in `whoot/model_training`, make sure to add either `cpu` or `cu128` +- `model-training`: Required for running scripts in `whoot/model_training`, make sure to add either `cpu` or `cu128` ## Usage diff --git a/pyproject.toml b/pyproject.toml index 62bfe4d..0330799 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,6 +9,7 @@ dependencies = [ "numba==0.61.0", "pandas>=2.3.0", "pydub>=0.25.1", + "pyyaml>=6.0.2", "scikit-learn>=1.7.0", "tqdm>=4.67.1", ] @@ -22,9 +23,8 @@ cu128 = [ "torch>=2.7.0", "torchvision>=0.22.0", ] - -model_training = [ - "datasets>=3.5.1", +model-training = [ + "datasets>=3.5.1,<4.0.0", "timm>=1.0.15", "pyha-analyzer@git+https://github.com/UCSD-E4E/pyha-analyzer-2.0.git#egg=support_whoot", "comet-ml>=3.43.2", diff --git a/whoot_model_training/README.md b/whoot_model_training/README.md index a539c01..65ca90f 100644 --- a/whoot_model_training/README.md +++ b/whoot_model_training/README.md @@ -8,7 +8,7 @@ Key inspiration is https://github.com/UCSD-E4E/pyha-analyzer-2.0/tree/main. This To set up environment for model training: 1) run steps 1 - 3 of the installation instructions in `whoot/README.md` -2) For step 4, specifically run `pip install -e .[model_training, cpu]` for cpu training, `pip install -e .[model_training, cu128]` for training on Nvidia GPUs +2) For step 4, specifically run `pip install -e .[model-training, cpu]` for cpu training, `pip install -e .[model-training, cu128]` for training on Nvidia GPUs Note that you should check what is supported by CUDA on your machine. 
See developers if you need a different CUDA version

 # Running

From c11eb1cf877e82c8766df02501e4d917b1885cfc Mon Sep 17 00:00:00 2001
From: sean1572
Date: Wed, 9 Jul 2025 16:09:36 -0700
Subject: [PATCH 106/120] Fixed install for pyha

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 0330799..8b1b906 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -26,7 +26,7 @@ cu128 = [
 model-training = [
     "datasets>=3.5.1,<4.0.0",
     "timm>=1.0.15",
-    "pyha-analyzer@git+https://github.com/UCSD-E4E/pyha-analyzer-2.0.git#egg=support_whoot",
+    "pyha-analyzer@git+https://github.com/UCSD-E4E/pyha-analyzer-2.0.git@support_whoot",
     "comet-ml>=3.43.2",
 ]

From 2fa8207836e597bf38fa2a2e160dbf0ec00f8c90 Mon Sep 17 00:00:00 2001
From: Sumega
Date: Wed, 9 Jul 2025 19:03:30 -0700
Subject: [PATCH 107/120] Fix docstrings

---
 make_model/prepare_perch_embeddings.py | 39 +++++++++++++++++---------
 1 file changed, 25 insertions(+), 14 deletions(-)

diff --git a/make_model/prepare_perch_embeddings.py b/make_model/prepare_perch_embeddings.py
index 20cfac6..c006b8f 100644
--- a/make_model/prepare_perch_embeddings.py
+++ b/make_model/prepare_perch_embeddings.py
@@ -7,19 +7,23 @@
 filename, embedding, and related metadata

 Usage: python prepare_perch_embeddings \
-        /path/to/db/dir \
-        /path/to/metadata/file \
-        /path/to/output/dir \
+        /path/to/sqlite_dir \
+        /path/to/metadata_file \
+        /path/to/output_dir \
         embeddings_description

 Arguments:
-    database_directory (str): path to directory that contains
+    sqlite_dir (str): path to directory that contains
         hoplite.sqlite & usearch.index
-    outout_directory (str): path to directory to store output
-    csv
+    metadata_path (str): path to metadata file with labels
+        and fold information
+    output_dir (str): path to directory to store output
+        pkl
+    embeddings_description (str): description of set of embeddings
+        for file naming purposes

 Outputs:
-    _perch_embeddings.pkl
+    <embeddings_description>_perch_embeddings.pkl

 '''
@@ -35,7 +39,20 @@ def prepare_perch_embeddings(sqlite_dir,
                              output_dir,
                              embeddings_description):
     '''
-    runs main script
+    converts raw perch embeddings (from sqlite database) into standard
+    dataframe format for SVM.
+
+    Args:
+        sqlite_dir (str): path to directory that contains
+            hoplite.sqlite & usearch.index
+        metadata_path (str): path to metadata file
+        output_dir (str): path to directory to store
+            output .pkl file
+        embeddings_description (str): description of set of embeddings
+            for file naming purposes
+
+    Returns:
+        None
     '''

     # load embeddings database
@@ -56,10 +73,6 @@ def prepare_perch_embeddings(sqlite_dir,
             base_dict = {'segment': file_name,
                          'embedding': embedding}

-            #embedding_dict = {f'{j}': val for j, val in enumerate(embedding)}
-
-            #full_row = {**base_dict, **embedding_dict}
-
             embeddings_data.append(base_dict)

     embeddings_df = pd.DataFrame(embeddings_data)
@@ -70,8 +83,6 @@ def prepare_perch_embeddings(sqlite_dir,
     output_filename = os.path.join(output_dir,
                                    f'{embeddings_description}_perch_embeddings.pkl')
     merged_df.to_pickle(output_filename)

-#    merged_df.to_csv(csv_filename, index=False)
-
     print(f'Embeddings saved at:\n\t{output_filename}')

From 82f13ccaad0a0364dc899a45b09b9a7f502c2980 Mon Sep 17 00:00:00 2001
From: Sean Perry
Date: Fri, 11 Jul 2025 09:37:24 -0700
Subject: [PATCH 108/120] Add COMET_WORKSPACE config

---
 whoot_model_training/configs/config.yml | 1 +
 whoot_model_training/train.py           | 3 +++
 2 files changed, 4 insertions(+)

diff --git a/whoot_model_training/configs/config.yml b/whoot_model_training/configs/config.yml
index c7b35f2..bc79c85 100644
--- a/whoot_model_training/configs/config.yml
+++ b/whoot_model_training/configs/config.yml
@@ -6,5 +6,6 @@ hf_cache_path: data/burrowing_owl_dataset/cache/metadata.hf
 # Required Variables
 COMET_PROJECT_NAME: "whoot"
 CUDA_VISIBLE_DEVICES: "0" #"0,1"
+COMET_WORKSPACE:
 SUBPROJECT_NAME:
 DATASET_NAME:
\ No newline at end of file

diff --git a/whoot_model_training/train.py b/whoot_model_training/train.py
index 6a40f00..6057446 100644
--- a/whoot_model_training/train.py
+++ b/whoot_model_training/train.py
@@ -150,6 +150,9 @@ def init_env(config: dict):
     print(config)
     os.environ["COMET_PROJECT_NAME"] = config["COMET_PROJECT_NAME"]
     os.environ["CUDA_VISIBLE_DEVICES"] = config["CUDA_VISIBLE_DEVICES"]
+    check_for_comet = config["COMET_WORKSPACE"] is not None
+    assert check_for_comet, "Make sure to add a COMET_WORKSPACE to config"
+    os.environ["COMET_WORKSPACE"] = config["COMET_WORKSPACE"]

From 64342b8fbb5d303ebc98364dcd3c9302d08bff27 Mon Sep 17 00:00:00 2001
From: Sean Perry
Date: Fri, 11 Jul 2025 09:38:29 -0700
Subject: [PATCH 109/120] Fix missing comet link in readme.md

---
 whoot_model_training/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/whoot_model_training/README.md b/whoot_model_training/README.md
index a539c01..639d457 100644
--- a/whoot_model_training/README.md
+++ b/whoot_model_training/README.md
@@ -14,7 +14,7 @@

 # Running

-0) Add your Comet-ML API to your local environment. See
+0) Add your Comet-ML API key to your local environment. See https://www.comet.com/docs/v2/guides/experiment-management/configure-sdk/
 1) Create a copy of the config found in `configs/config.yml` and fill it out for your dataset. See the [config](#config) section
 2) Edit train.py to set up training for your dataset. If you are using a new dataset which an extractor does not exist for, contact code authors.
3) run `python train.py path/to/your/config/file.yml`

From 0578805cdce6e451cc6008865daaa77227f89dce Mon Sep 17 00:00:00 2001
From: Sean1572
Date: Fri, 11 Jul 2025 09:50:39 -0700
Subject: [PATCH 110/120] Cleaned config

---
 whoot_model_training/configs/config.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/whoot_model_training/configs/config.yml b/whoot_model_training/configs/config.yml
index bc79c85..bed5982 100644
--- a/whoot_model_training/configs/config.yml
+++ b/whoot_model_training/configs/config.yml
@@ -6,6 +6,6 @@ hf_cache_path: data/burrowing_owl_dataset/cache/metadata.hf
 # Required Variables
 COMET_PROJECT_NAME: "whoot"
 CUDA_VISIBLE_DEVICES: "0" #"0,1"
-COMET_WORKSPACE:
-SUBPROJECT_NAME:
-DATASET_NAME:
\ No newline at end of file
+COMET_WORKSPACE:
+SUBPROJECT_NAME:
+DATASET_NAME:
\ No newline at end of file

From d7f30a3651a89469fd158cdb0f39686e66b11ec4 Mon Sep 17 00:00:00 2001
From: Sean1572
Date: Fri, 11 Jul 2025 10:50:52 -0700
Subject: [PATCH 111/120] Renamed comment from offline to online

---
 whoot_model_training/train.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/whoot_model_training/train.py b/whoot_model_training/train.py
index 6057446..f295426 100644
--- a/whoot_model_training/train.py
+++ b/whoot_model_training/train.py
@@ -100,7 +100,7 @@ def train(config):
 #     )
 # ])

-    # Offline preprocessors prepare data for training
+    # Online preprocessors prepare data for training
     train_preprocessor = SpectrogramModelInputPreprocessors(
         TimmInputs, duration=3
     )

From 3a5a4bd0d55b1e601419c05baf3ca40b67df3cc9 Mon Sep 17 00:00:00 2001
From: Sean1572
Date: Fri, 11 Jul 2025 11:30:26 -0700
Subject: [PATCH 112/120] Reworked preprocessors to clarify inheritance

Old code had weird naming conventions caused by iterating during the
development process.
Current code generalizes the base_preprocessor to allow for other preprocessors and model_inputs --- whoot_model_training/train.py | 6 +- whoot_model_training/train_binary.py | 6 +- .../preprocessors/__init__.py | 6 +- ...t_preprocessor.py => base_preprocessor.py} | 69 ++++++++++++++----- 4 files changed, 59 insertions(+), 28 deletions(-) rename whoot_model_training/whoot_model_training/preprocessors/{default_preprocessor.py => base_preprocessor.py} (57%) diff --git a/whoot_model_training/train.py b/whoot_model_training/train.py index f295426..2b82679 100644 --- a/whoot_model_training/train.py +++ b/whoot_model_training/train.py @@ -22,7 +22,7 @@ from whoot_model_training import CometMLLoggerSupplement from whoot_model_training.preprocessors import ( - SpectrogramModelInputPreprocessors + MelModelInputPreprocessor ) # Uncomment for use with data augmentation @@ -101,11 +101,11 @@ def train(config): # ]) # Online preprocessors prepare data for training - train_preprocessor = SpectrogramModelInputPreprocessors( + train_preprocessor = MelModelInputPreprocessor( TimmInputs, duration=3 ) - preprocessor = SpectrogramModelInputPreprocessors( + preprocessor = MelModelInputPreprocessor( TimmInputs, duration=3 ) diff --git a/whoot_model_training/train_binary.py b/whoot_model_training/train_binary.py index 6613fa4..a86c867 100644 --- a/whoot_model_training/train_binary.py +++ b/whoot_model_training/train_binary.py @@ -22,7 +22,7 @@ from whoot_model_training import CometMLLoggerSupplement from whoot_model_training.preprocessors import ( - SpectrogramModelInputPreprocessors + MelModelInputPreprocessor ) # Uncomment for use with data augmentation @@ -101,11 +101,11 @@ def train(config): # ]) # Offline preprocessors prepare data for training - train_preprocessor = SpectrogramModelInputPreprocessors( + train_preprocessor = MelModelInputPreprocessor( TimmInputs, duration=3 ) - preprocessor = SpectrogramModelInputPreprocessors( + preprocessor = MelModelInputPreprocessor( TimmInputs, duration=3 ) diff --git a/whoot_model_training/whoot_model_training/preprocessors/__init__.py b/whoot_model_training/whoot_model_training/preprocessors/__init__.py index 29e114f..43cfa11 100644 --- a/whoot_model_training/whoot_model_training/preprocessors/__init__.py +++ b/whoot_model_training/whoot_model_training/preprocessors/__init__.py @@ -7,14 +7,14 @@ the __get_item__ function of a dataset """ -from .default_preprocessor import ( - SpectrogramModelInputPreprocessors +from .base_preprocessor import ( + MelModelInputPreprocessor ) from .spectrogram_preprocessors import ( BuowMelSpectrogramPreprocessors ) __all__ = [ - "SpectrogramModelInputPreprocessors", + "MelModelInputPreprocessor", "BuowMelSpectrogramPreprocessors" ] diff --git a/whoot_model_training/whoot_model_training/preprocessors/default_preprocessor.py b/whoot_model_training/whoot_model_training/preprocessors/base_preprocessor.py similarity index 57% rename from whoot_model_training/whoot_model_training/preprocessors/default_preprocessor.py rename to whoot_model_training/whoot_model_training/preprocessors/base_preprocessor.py index 667fd53..821b157 100644 --- a/whoot_model_training/whoot_model_training/preprocessors/default_preprocessor.py +++ b/whoot_model_training/whoot_model_training/preprocessors/base_preprocessor.py @@ -12,6 +12,9 @@ but it forces the output to fit the ModelInput class structure. see `whoot_model_training/models/model.py` for more info. 
""" +# pylint: disable=too-few-public-methods + +from pyha_analyzer.preprocessors import PreProcessorBase from .spectrogram_preprocessors import ( BuowMelSpectrogramPreprocessors, @@ -21,10 +24,50 @@ from ..models.model import ModelInput -class SpectrogramModelInputPreprocessors(BuowMelSpectrogramPreprocessors): - """ Defines a preprocessed that after formatting the audio +class SpectrogramModelInPreprocessors(PreProcessorBase): + """ Defines a preprocessor that after formatting the audio passes a spectrogram into a ModelInput object. """ + def __init__( + self, + spec_preprocessor: PreProcessorBase, + model_input: ModelInput, + ): + """ Wrapper to get the raw spectrogram output of spec_preprocessor + and format it neatly into a model_input + + Args: + spec_preprocessor (PreProcessorBase): a preprocessor that + creates spectrograms + ModelInput (ModelInput): How the model like input data formatted + """ + self.spec_preprocessor = spec_preprocessor + self.model_input = model_input + super().__init__(name="SpectrogramModelInPreprocessors") + + def __call__(self, batch: dict) -> ModelInput: + """Processes a batch of AudioDataset rows + + For this specific preprocessor, it creates a spectrogram then + Formats the data as a ModelInput + """ + batch = self.spec_preprocessor(batch) + return self.model_input( + labels=batch["labels"], + spectrogram=batch["audio"] + ) + + +class MelModelInputPreprocessor(SpectrogramModelInPreprocessors): + """Demo of how SpectrogramModelInPreprocessors can work by using a specific + Kind of Spectrogram Preprocessor, BuowMelSpectrogramPreprocessors + + This was created in part because legacy implementation of + SpectrogramModelInputPreprocessors had these parameters and subclassed + BuowMelSpectrogramPreprocessors. This class replicates the + format of the old SpectrogramModelInputPreprocessors + class with the new functionality + """ def __init__( self, model_input: ModelInput, @@ -48,21 +91,9 @@ def __init__( n_mels (int): number of mels for a melspectrogram dataset_ref (AudioDataset): a external ref to an AudioDataset """ - super().__init__( - duration, - augments, - spectrogram_params - ) - self.model_input = model_input - - def __call__(self, batch: dict) -> ModelInput: - """Processes a batch of AudioDataset rows - - For this specific preprocessor, it creates a spectrogram then - Formats the data as a ModelInput - """ - batch = super().__call__(batch) - return self.model_input( - labels=batch["labels"], - spectrogram=batch["audio"] + spec_preprocessor = BuowMelSpectrogramPreprocessors( + duration=duration, + augments=augments, + spectrogram_params=spectrogram_params ) + super().__init__(spec_preprocessor, model_input) From 6211a33c973144667e80a7ca5f6c271dbb14adac Mon Sep 17 00:00:00 2001 From: Sean1572 Date: Fri, 1 Aug 2025 14:32:47 -0700 Subject: [PATCH 113/120] Improved model saving with PretrainedModel --- whoot_model_training/configs/config.yml | 3 +- whoot_model_training/train.py | 11 ++++--- .../whoot_model_training/models/__init__.py | 3 +- .../whoot_model_training/models/model.py | 7 ++-- .../whoot_model_training/models/timm_model.py | 33 +++++++++++++------ 5 files changed, 39 insertions(+), 18 deletions(-) diff --git a/whoot_model_training/configs/config.yml b/whoot_model_training/configs/config.yml index bed5982..c0563f5 100644 --- a/whoot_model_training/configs/config.yml +++ b/whoot_model_training/configs/config.yml @@ -8,4 +8,5 @@ COMET_PROJECT_NAME: "whoot" CUDA_VISIBLE_DEVICES: "0" #"0,1" COMET_WORKSPACE: SUBPROJECT_NAME: -DATASET_NAME: \ No 
newline at end of file +DATASET_NAME: +COMET_WORKSPACE: \ No newline at end of file diff --git a/whoot_model_training/train.py b/whoot_model_training/train.py index 2b82679..aa31161 100644 --- a/whoot_model_training/train.py +++ b/whoot_model_training/train.py @@ -18,7 +18,7 @@ from whoot_model_training.trainer import WhootTrainer, WhootTrainingArguments from whoot_model_training.data_extractor import buowset_extractor -from whoot_model_training.models import TimmModel, TimmInputs +from whoot_model_training.models import TimmModel, TimmInputs, TimmModelConfig from whoot_model_training import CometMLLoggerSupplement from whoot_model_training.preprocessors import ( @@ -71,8 +71,10 @@ def train(config): # Create the model run_name = "efficientnet_b1_testing_confusion_matrix_no_data_aug" - model = TimmModel(timm_model="efficientnet_b1", - num_classes=ds.get_num_classes()) + model_config = TimmModelConfig( + timm_model="efficientnet_b1", + num_classes=ds.get_num_classes()) + model = TimmModel(model_config) # Preprocessors @@ -122,7 +124,7 @@ def train(config): # COMMON OPTIONAL ARGS training_args.num_train_epochs = 2 - training_args.eval_steps = 20 + training_args.eval_steps = 100 training_args.per_device_train_batch_size = 32 training_args.per_device_eval_batch_size = 32 training_args.dataloader_num_workers = 36 @@ -139,6 +141,7 @@ def train(config): ) trainer.train() + model.save_pretrained("model_checkpoints/test") def init_env(config: dict): diff --git a/whoot_model_training/whoot_model_training/models/__init__.py b/whoot_model_training/whoot_model_training/models/__init__.py index 589aa49..ce57c26 100644 --- a/whoot_model_training/whoot_model_training/models/__init__.py +++ b/whoot_model_training/whoot_model_training/models/__init__.py @@ -4,12 +4,13 @@ `from whoot_model_training.models import TimmModel """ -from .timm_model import TimmModel, TimmInputs +from .timm_model import TimmModel, TimmInputs, TimmModelConfig from .model import Model, ModelInput, ModelOutput __all__ = [ "TimmModel", "TimmInputs", + "TimmModelConfig", "Model", "ModelInput", "ModelOutput" diff --git a/whoot_model_training/whoot_model_training/models/model.py b/whoot_model_training/whoot_model_training/models/model.py index d3ee241..c36286c 100644 --- a/whoot_model_training/whoot_model_training/models/model.py +++ b/whoot_model_training/whoot_model_training/models/model.py @@ -15,6 +15,7 @@ from collections import UserDict from pyha_analyzer.models.base_model import BaseModel +from transformers import PreTrainedModel, PretrainedConfig import numpy as np @@ -144,13 +145,15 @@ def from_dict(cls, some_input: dict): return cls(labels, spectrogram=spectrogram, waveform=waveform) -class Model(BaseModel): +class Model(PreTrainedModel, BaseModel): """BaseModel Class for Whoot """ def __init__(self, *args, **kwargs): self.input_format = ModelInput self.output_format = ModelOutput - super().__init__(*args, **kwargs) + super().__init__(PretrainedConfig()) + super(BaseModel).__init__(*args, **kwargs) + def get_embeddings(self, x: ModelInput) -> np.array: """Gets an embedding for the model diff --git a/whoot_model_training/whoot_model_training/models/timm_model.py b/whoot_model_training/whoot_model_training/models/timm_model.py index 07bbd79..966f775 100644 --- a/whoot_model_training/whoot_model_training/models/timm_model.py +++ b/whoot_model_training/whoot_model_training/models/timm_model.py @@ -12,7 +12,7 @@ from torch import nn from .model import Model, ModelInput, ModelOutput, has_required_inputs - +from transformers import 
PretrainedConfig class TimmInputs(ModelInput): """Input for TimmModel's @@ -31,18 +31,30 @@ def __init__(self, labels, waveform=None, spectrogram=None): self.labels = labels self.spectrogram = spectrogram +class TimmModelConfig(PretrainedConfig): + def __init__( + self, + timm_model="resnet34", + pretrained=True, + in_chans=1, + num_classes=6, + **kwargs + ): + self.timm_model= timm_model + self.pretrained = pretrained + self.in_chans = in_chans + self.num_classes = num_classes + super().__init__(**kwargs) -class TimmModel(nn.Module, Model): +class TimmModel(Model, nn.Module): """Model that uses a timm's model as its backbone with a linear layer for classification """ + config_class = TimmModelConfig def __init__( self, - timm_model="resnet34", - pretrained=True, - in_chans=1, - num_classes=6, + config: TimmModelConfig ): """Init for TimmModel @@ -57,16 +69,17 @@ def __init__( super().__init__() self.input_format = TimmInputs self.output_format = ModelOutput - - assert num_classes > 0 + self.config = config + + assert config.num_classes > 0 # Deep learning CNN backbone self.backbone = timm.create_model( - timm_model, pretrained=pretrained, in_chans=in_chans + config.timm_model, pretrained=config.pretrained, in_chans=config.in_chans ) # Unsure if 1000 is default for all timm models. Need to check this - self.linear = nn.Linear(1000, num_classes) + self.linear = nn.Linear(1000, config.num_classes) # different losses if you want to train for different problems # BCEWithLogitsLoss is default as for Bioacoustics, the problem tends From 0318e630f1331a0aff845febbbc67d609a075f82 Mon Sep 17 00:00:00 2001 From: Sean1572 Date: Wed, 6 Aug 2025 16:38:23 -0700 Subject: [PATCH 114/120] Add google style docstrings, Simplify pylintrc Originally thought pylintrc controlled docstrings, but its actually flake8. Reverted the changes in pylintrc that attempted to control docstring style --- .flake8 | 1 + .gitignore | 4 +- pylintrc | 417 +------------------------------------------------ pyproject.toml | 11 +- 4 files changed, 13 insertions(+), 420 deletions(-) diff --git a/.flake8 b/.flake8 index 5d1d750..190b04a 100644 --- a/.flake8 +++ b/.flake8 @@ -1,2 +1,3 @@ [flake8] exclude = .venv/* +docstring-convention=google \ No newline at end of file diff --git a/.gitignore b/.gitignore index 09183a1..08ed59a 100644 --- a/.gitignore +++ b/.gitignore @@ -206,4 +206,6 @@ pip-selfcheck.json # End of https://www.toptal.com/developers/gitignore/api/python,venv,visualstudiocode -uv.lock \ No newline at end of file +uv.lock +data +model_checkpoints \ No newline at end of file diff --git a/pylintrc b/pylintrc index 70c1cc1..4c4c4de 100644 --- a/pylintrc +++ b/pylintrc @@ -1,415 +1,2 @@ -# This Pylint rcfile contains a best-effort configuration to uphold the -# best-practices and style described in the Google Python style guide: -# https://google.github.io/styleguide/pyguide.html -# -# Its canonical open-source location is: -# https://google.github.io/styleguide/pylintrc - -# Copyright 2018 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -[MAIN] - -# Files or directories to be skipped. They should be base names, not paths. -ignore=third_party - -# Files or directories matching the regex patterns are skipped. The regex -# matches against base names, not paths. -ignore-patterns= - -# Pickle collected data for later comparisons. -persistent=no - -# List of plugins (as comma separated values of python modules names) to load, -# usually to register additional checkers. -load-plugins= - -# Use multiple processes to speed up Pylint. -jobs=4 - -# Allow loading of arbitrary C extensions. Extensions are imported into the -# active Python interpreter and may run arbitrary code. -unsafe-load-any-extension=no - - -[MESSAGES CONTROL] - -# Only show warnings with the listed confidence levels. Leave empty to show -# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED -confidence= - -# Enable the message, report, category or checker with the given id(s). You can -# either give multiple identifier separated by comma (,) or put this option -# multiple time (only on the command line, not in the configuration file where -# it should appear only once). See also the "--disable" option for examples. -#enable= - -# Disable the message, report, category or checker with the given id(s). You -# can either give multiple identifiers separated by comma (,) or put this -# option multiple times (only on the command line, not in the configuration -# file where it should appear only once).You can also use "--disable=all" to -# disable everything first and then reenable specific checks. For example, if -# you want to run only the similarities checker, you can use "--disable=all -# --enable=similarities". If you want to run only the classes checker, but have -# no Warning level messages displayed, use"--disable=all --enable=classes -# --disable=W" -disable=R, - abstract-method, - apply-builtin, - arguments-differ, - attribute-defined-outside-init, - backtick, - bad-option-value, - basestring-builtin, - buffer-builtin, - c-extension-no-member, - consider-using-enumerate, - cmp-builtin, - cmp-method, - coerce-builtin, - coerce-method, - delslice-method, - div-method, - eq-without-hash, - execfile-builtin, - file-builtin, - filter-builtin-not-iterating, - fixme, - getslice-method, - global-statement, - hex-method, - idiv-method, - implicit-str-concat, - import-error, - import-self, - import-star-module-level, - input-builtin, - intern-builtin, - invalid-str-codec, - locally-disabled, - long-builtin, - long-suffix, - map-builtin-not-iterating, - misplaced-comparison-constant, - missing-function-docstring, - metaclass-assignment, - next-method-called, - next-method-defined, - no-absolute-import, - no-init, # added - no-member, - no-name-in-module, - no-self-use, - nonzero-method, - oct-method, - old-division, - old-ne-operator, - old-octal-literal, - old-raise-syntax, - parameter-unpacking, - print-statement, - raising-string, - range-builtin-not-iterating, - raw_input-builtin, - rdiv-method, - reduce-builtin, - relative-import, - reload-builtin, - round-builtin, - setslice-method, - signature-differs, - standarderror-builtin, - suppressed-message, - sys-max-int, - trailing-newlines, - unichr-builtin, - unicode-builtin, - unnecessary-pass, - unpacking-in-except, - useless-else-on-loop, - useless-suppression, - using-cmp-argument, - wrong-import-order, - xrange-builtin, - zip-builtin-not-iterating, - - -[REPORTS] - -# Set the output format. 
Available formats are text, parseable, colorized, msvs -# (visual studio) and html. You can also give a reporter class, eg -# mypackage.mymodule.MyReporterClass. -output-format=text - -# Tells whether to display a full report or only the messages -reports=no - -# Activate the evaluation score. -score=no - -# Python expression which should return a note less than 10 (10 is the highest -# note). You have access to the variables errors warning, statement which -# respectively contain the number of errors / warnings messages and the total -# number of statements analyzed. This is used by the global evaluation report -# (RP0004). -evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) - -# Template used to display messages. This is a python new-style format string -# used to format the message information. See doc for all details -#msg-template= - - -[BASIC] - -# Good variable names which should always be accepted, separated by a comma -good-names=main,_ - -# Bad variable names which should always be refused, separated by a comma -bad-names= - -# Colon-delimited sets of names that determine each other's naming style when -# the name regexes allow several styles. -name-group= - -# Include a hint for the correct naming format with invalid-name -include-naming-hint=no - -# List of decorators that produce properties, such as abc.abstractproperty. Add -# to this list to register other decorators that produce valid properties. -property-classes=abc.abstractproperty,cached_property.cached_property,cached_property.threaded_cached_property,cached_property.cached_property_with_ttl,cached_property.threaded_cached_property_with_ttl - -# Regular expression matching correct function names -function-rgx=^(?:(?PsetUp|tearDown|setUpModule|tearDownModule)|(?P_?[A-Z][a-zA-Z0-9]*)|(?P_?[a-z][a-z0-9_]*))$ - -# Regular expression matching correct variable names -variable-rgx=^[a-z][a-z0-9_]*$ - -# Regular expression matching correct constant names -const-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$ - -# Regular expression matching correct attribute names -attr-rgx=^_{0,2}[a-z][a-z0-9_]*$ - -# Regular expression matching correct argument names -argument-rgx=^[a-z][a-z0-9_]*$ - -# Regular expression matching correct class attribute names -class-attribute-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$ - -# Regular expression matching correct inline iteration names -inlinevar-rgx=^[a-z][a-z0-9_]*$ - -# Regular expression matching correct class names -class-rgx=^_?[A-Z][a-zA-Z0-9]*$ - -# Regular expression matching correct module names -module-rgx=^(_?[a-z][a-z0-9_]*|__init__)$ - -# Regular expression matching correct method names -method-rgx=(?x)^(?:(?P_[a-z0-9_]+__|runTest|setUp|tearDown|setUpTestCase|tearDownTestCase|setupSelf|tearDownClass|setUpClass|(test|assert)_*[A-Z0-9][a-zA-Z0-9_]*|next)|(?P_{0,2}[A-Z][a-zA-Z0-9_]*)|(?P_{0,2}[a-z][a-z0-9_]*))$ - -# Regular expression which should only match function or class names that do -# not require a docstring. -no-docstring-rgx=(__.*__|main|test.*|.*test|.*Test)$ - -# Minimum line length for functions/classes that require docstrings, shorter -# ones are exempt. -docstring-min-length=12 - - -[TYPECHECK] - -# List of decorators that produce context managers, such as -# contextlib.contextmanager. Add to this list to register other decorators that -# produce valid context managers. 
-contextmanager-decorators=contextlib.contextmanager,contextlib2.contextmanager - -# List of module names for which member attributes should not be checked -# (useful for modules/projects where namespaces are manipulated during runtime -# and thus existing member attributes cannot be deduced by static analysis. It -# supports qualified module names, as well as Unix pattern matching. -ignored-modules= - -# List of class names for which member attributes should not be checked (useful -# for classes with dynamically set attributes). This supports the use of -# qualified names. -ignored-classes=optparse.Values,thread._local,_thread._local - -# List of members which are set dynamically and missed by pylint inference -# system, and so shouldn't trigger E1101 when accessed. Python regular -# expressions are accepted. -generated-members= - - -[FORMAT] - -# Maximum number of characters on a single line. -max-line-length=80 - -# TODO(https://github.com/pylint-dev/pylint/issues/3352): Direct pylint to exempt -# lines made too long by directives to pytype. - -# Regexp for a line that is allowed to be longer than the limit. -ignore-long-lines=(?x)( - ^\s*(\#\ )??$| - ^\s*(from\s+\S+\s+)?import\s+.+$) - -# Allow the body of an if to be on the same line as the test if there is no -# else. -single-line-if-stmt=yes - -# Maximum number of lines in a module -max-module-lines=99999 - -# String used as indentation unit. The internal Google style guide mandates 2 -# spaces. Google's externaly-published style guide says 4, consistent with -# PEP 8. -indent-string=' ' - -# Number of spaces of indent required inside a hanging or continued line. -indent-after-paren=4 - -# Expected format of line ending, e.g. empty (any line ending), LF or CRLF. -expected-line-ending-format= - - -[MISCELLANEOUS] - -# List of note tags to take in consideration, separated by a comma. -notes=TODO - - -[STRING] - -# This flag controls whether inconsistent-quotes generates a warning when the -# character used as a quote delimiter is used inconsistently within a module. -check-quote-consistency=yes - - -[VARIABLES] - -# Tells whether we should check for unused import in __init__ files. -init-import=no - -# A regular expression matching the name of dummy variables (i.e. expectedly -# not used). -dummy-variables-rgx=^\*{0,2}(_$|unused_|dummy_) - -# List of additional names supposed to be defined in builtins. Remember that -# you should avoid to define new builtins when possible. -additional-builtins= - -# List of strings which can identify a callback function by name. A callback -# name must start or end with one of those strings. -callbacks=cb_,_cb - -# List of qualified module names which can have objects that can redefine -# builtins. -redefining-builtins-modules=six,six.moves,past.builtins,future.builtins,functools - - -[LOGGING] - -# Logging modules to check that the string format arguments are in logging -# function parameter format -logging-modules=logging,absl.logging,tensorflow.io.logging - - -[SIMILARITIES] - -# Minimum lines number of a similarity. -min-similarity-lines=4 - -# Ignore comments when computing similarities. -ignore-comments=yes - -# Ignore docstrings when computing similarities. -ignore-docstrings=yes - -# Ignore imports when computing similarities. -ignore-imports=no - - -[SPELLING] - -# Spelling dictionary name. Available dictionaries: none. To make it working -# install python-enchant package. -spelling-dict= - -# List of comma separated words that should not be checked. 
-spelling-ignore-words= - -# A path to a file that contains private dictionary; one word per line. -spelling-private-dict-file= - -# Tells whether to store unknown words to indicated private dictionary in -# --spelling-private-dict-file option instead of raising a message. -spelling-store-unknown-words=no - - -[IMPORTS] - -# Deprecated modules which should not be used, separated by a comma -deprecated-modules=regsub, - TERMIOS, - Bastion, - rexec, - sets - -# Create a graph of every (i.e. internal and external) dependencies in the -# given file (report RP0402 must not be disabled) -import-graph= - -# Create a graph of external dependencies in the given file (report RP0402 must -# not be disabled) -ext-import-graph= - -# Create a graph of internal dependencies in the given file (report RP0402 must -# not be disabled) -int-import-graph= - -# Force import order to recognize a module as part of the standard -# compatibility libraries. -known-standard-library= - -# Force import order to recognize a module as part of a third party library. -known-third-party=enchant, absl - -# Analyse import fallback blocks. This can be used to support both Python 2 and -# 3 compatible code, which means that the block might have code that exists -# only in one or another interpreter, leading to false positives when analysed. -analyse-fallback-blocks=no - - -[CLASSES] - -# List of method names used to declare (i.e. assign) instance attributes. -defining-attr-methods=__init__, - __new__, - setUp - -# List of member names, which should be excluded from the protected access -# warning. -exclude-protected=_asdict, - _fields, - _replace, - _source, - _make - -# List of valid names for the first argument in a class method. -valid-classmethod-first-arg=cls, - class_ - -# List of valid names for the first argument in a metaclass class method. -valid-metaclass-classmethod-first-arg=mcs +max-args=10 +max-positional-arguments=10 \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 968d6ee..ab9f3ee 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,6 +13,13 @@ dependencies = [ "tqdm>=4.67.1", ] +[dependency-groups] +dev = [ + "flake8>=7.3.0", + "pylint>=3.3.7", + "flake8-docstrings>=1.7.0", +] + [tool.setuptools.dynamic] version = {attr = "whoot.__version__"} @@ -25,10 +32,6 @@ cu128 = [ "torch>=2.7.0", "torchvision>=0.22.0", ] -dev = [ - "flake8>=7.3.0", - "pylint>=3.3.7", -] [packages.index] cu128 = "https://download.pytorch.org/whl/cu128" From 4eb928e7fb63b5998da37c8ccb96744ceb86cb17 Mon Sep 17 00:00:00 2001 From: Sean Perry Date: Wed, 6 Aug 2025 16:41:43 -0700 Subject: [PATCH 115/120] Remove docstring CometML has weird behavior with strings, where it displays docstrings, going to email their support line and see what that is about. The current solution is to turn it into a comment --- comet_ml_panels/leaderboard.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/comet_ml_panels/leaderboard.py b/comet_ml_panels/leaderboard.py index ff597d5..7f73c6d 100644 --- a/comet_ml_panels/leaderboard.py +++ b/comet_ml_panels/leaderboard.py @@ -1,22 +1,22 @@ -"""Creates the Leaderboard for Comet ML Panels +# """Creates the Leaderboard for Comet ML Panels -This script queries from a given Comet ML project a DataFrame of -model metrics at each step for each model in the project -Then displays the top models. 
+# This script queries from a given Comet ML project a DataFrame of
+# model metrics at each step for each model in the project
+# Then displays the top models.

-Example:
-    This is not intended to be run locally. Please test on Comet-ML.
+# Example:
+#     This is not intended to be run locally. Please test on Comet-ML.

-For Developers:
-    For more on adding to this see docs at
-    https://www.comet.com/docs/v2/guides/comet-ui/experiment-management/visualizations/python-panel/
+# For Developers:
+#     For more on adding to this see docs at
+#     https://www.comet.com/docs/v2/guides/comet-ui/experiment-management/visualizations/python-panel/

-    Note that updating this file does not update comet-ml. Please
-    go into the project to update after pushing to GitHub.
+#     Note that updating this file does not update comet-ml. Please
+#     go into the project to update after pushing to GitHub.

-    Do not include Doc string in comet-ml... for some reason this
-    is displayed in the comet-ml panel if copied directly
-"""
+#     Do not include Doc string in comet-ml... for some reason this
+#     is displayed in the comet-ml panel if copied directly
+# """
 from comet_ml import API, APIExperiment, ui
 import pandas as pd
 import numpy as np

From c4e8b36bfe0c1195149ecf110bd7034670ef87c9 Mon Sep 17 00:00:00 2001
From: Sean Perry
Date: Wed, 13 Aug 2025 10:13:15 -0700
Subject: [PATCH 116/120] Fix misspelling of multilabel

---
 whoot_model_training/README.md                      |  4 ++--
 .../data_extractor/buowset_extractor.py             | 12 ++++++------
 whoot_model_training/whoot_model_training/metrics.py |  4 ++--
 .../whoot_model_training/models/timm_model.py       |  2 +-
 4 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/whoot_model_training/README.md b/whoot_model_training/README.md
index 639d457..4138f86 100644
--- a/whoot_model_training/README.md
+++ b/whoot_model_training/README.md
@@ -38,7 +38,7 @@ The properties of `config.yml` are as follows:
 ### Buowset
 The filenames in metadata_csv are the audio files found in `data_path`.

-`SUBPROJECT_NAME` is either "binary" or "mutlilabelClass"
+`SUBPROJECT_NAME` is either "binary" or "multilabelClass"
 `DATASET_NAME` is buowset0

 # Repo Philosophy

 The most challenging issue with machine learning is the dataset. This training repo intends to make it easy to modularize parts of the training pipeline, and integrate them together, ideally regardless of the dataset.

 The pipeline works in 5 parts:
-- Extractors: Extractors take in raw data and reformats it into `AudioDatasets`, apache-arrow data structures implemented via HuggingFace with common columns between any dataset. Every label is one_hot_encoded and treated as mutlilabel regardless of the problem. Audio filepaths as casted into [Audio columns](https://huggingface.co/docs/datasets/v3.6.0/en/package_reference/main_classes#datasets.Audio). Extractors are *unique for each dataset* but *uniform in the AudioDataset*.
+- Extractors: Extractors take in raw data and reformat it into `AudioDatasets`, apache-arrow data structures implemented via HuggingFace with common columns between any dataset. Every label is one_hot_encoded and treated as multilabel regardless of the problem. Audio filepaths are cast into [Audio columns](https://huggingface.co/docs/datasets/v3.6.0/en/package_reference/main_classes#datasets.Audio). Extractors are *unique for each dataset* but *uniform in the AudioDataset*.
- Preprocessors: Online preprocessors take rows in `AudioDatasets` and output `ModelInputs`, formatted data specific to a given model. Preprocessors read AudioDatasets and translate it so the Model can read it diff --git a/whoot_model_training/whoot_model_training/data_extractor/buowset_extractor.py b/whoot_model_training/whoot_model_training/data_extractor/buowset_extractor.py index 965a49f..55b96fd 100644 --- a/whoot_model_training/whoot_model_training/data_extractor/buowset_extractor.py +++ b/whoot_model_training/whoot_model_training/data_extractor/buowset_extractor.py @@ -10,7 +10,7 @@ common audio dataset format is between parts of the codebase for training -Supports both mutlilabel and binary labels +Supports both multilabel and binary labels """ import os @@ -86,9 +86,9 @@ def buowset_extractor( # Convert to a uniform one_hot encoding for classes ds = ds.class_encode_column("labels") class_list = ds.features["labels"].names - mutlilabel_class_label = Sequence(ClassLabel(names=class_list)) + multilabel_class_label = Sequence(ClassLabel(names=class_list)) ds = ds.map(lambda row: one_hot_encode(row, class_list)).cast_column( - "labels", mutlilabel_class_label + "labels", multilabel_class_label ) # Get audio into uniform format @@ -119,7 +119,7 @@ def buowset_extractor( def binarize_data(row, target_col=0): - """ Convert a mutlilabel label into a binary one + """ Convert a multilabel label into a binary one Args: row (dict): an example of data @@ -157,14 +157,14 @@ def buowset_binary_extractor( the universal dataset for the training pipeline. """ - # Use the original extractor to create a mutlilabeled dataset + # Use the original extractor to create a multilabeled dataset ads = buowset_extractor( metadata_csv, parent_path, output_path, ) - # Now we just need to convert labels from mutlilabel to + # Now we just need to convert labels from multilabel to # 0 or 1 binary_class_label = Sequence(ClassLabel(names=["no_buow", "buow"])) print(binary_class_label.feature.num_classes) diff --git a/whoot_model_training/whoot_model_training/metrics.py b/whoot_model_training/whoot_model_training/metrics.py index 3b4033b..f0ae9b1 100644 --- a/whoot_model_training/whoot_model_training/metrics.py +++ b/whoot_model_training/whoot_model_training/metrics.py @@ -1,4 +1,4 @@ -""" Metrics for Bioacoustic Mutlilabel Models +""" Metrics for Bioacoustic multilabel Models Helps us evaluate which models do well @@ -25,7 +25,7 @@ class WhootMutliClassMetrics(AudioClassificationMetrics): def __init__(self, classes: list): self.classes = classes self.training = True - super().__init__([], len(classes), mutlilabel=True) + super().__init__([], len(classes), multilabel=True) def __call__(self, eval_pred) -> dict[str, float]: """Log all metrics diff --git a/whoot_model_training/whoot_model_training/models/timm_model.py b/whoot_model_training/whoot_model_training/models/timm_model.py index 966f775..c0cff1e 100644 --- a/whoot_model_training/whoot_model_training/models/timm_model.py +++ b/whoot_model_training/whoot_model_training/models/timm_model.py @@ -83,7 +83,7 @@ def __init__( # different losses if you want to train for different problems # BCEWithLogitsLoss is default as for Bioacoustics, the problem tends - # mutlilabel! + # multilabel! # the probability of class A occurring doesn't # change the probability of Class B # Many individuals can make calls at the same time! 
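Before the next patch, a note on the one-hot contract the hunks above rely on: every extractor runs its rows through a `one_hot_encode` helper so that a class-encoded integer label becomes a float vector, which is why binary and multilabel datasets look identical to the rest of the pipeline. A minimal sketch of that behavior — the three class names here are invented for illustration, and the helper simply mirrors the one defined in the extractors:

    import numpy as np

    def one_hot_encode(row: dict, classes: list) -> dict:
        # One slot per class, all zero to start
        one_hot = np.zeros(len(classes))
        # Flip on the slot named by the integer label
        one_hot[row["labels"]] = 1
        row["labels"] = np.array(one_hot, dtype=float)
        return row

    row = {"labels": 2}
    print(one_hot_encode(row, ["no_buow", "coocoo", "alarm"])["labels"])
    # -> [0. 0. 1.]

Applied over a whole split with `datasets.map` and then cast to a `Sequence(ClassLabel(...))` feature, as in the hunk above, every labels column ends up in this shape.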
From f6fcbe496fe74599639f127066b4be2676e46b6c Mon Sep 17 00:00:00 2001
From: Katie Garwood <115747639+kgarwoodsdzwa@users.noreply.github.com>
Date: Wed, 13 Aug 2025 19:00:38 +0100
Subject: [PATCH 117/120] 60 esc extractor (#68)

* just making this into a branch

Copies the buowset extractor into a pandas one and updates the init.

* adding current scripts for esc50 instead of pandas

I was working on this on my laptop, but I need to commit so I can train
on a GPU or a more powerful CPU, since my local process keeps getting
killed, seemingly from running out of RAM.

* lowered steps and increased epochs

The model was never evaluating because the default buowset eval
interval (100 steps) was being used, while esc50 is much smaller and
yields only 38 steps per epoch (two of ESC-50's five 400-clip folds are
held out, leaving 1,200 training clips). So training appeared to run
without crashing, but it never reached an evaluation. The model also
was not learning fast enough with only 2 epochs (more classes than
buowset?), so epochs were increased to 10, which let the model actually
start learning.

* fix some pylint, still snake_case stuff to fix

* fixed pylint and removed git tracking from the train.py

The train.py is always meant to be copied and modified by each user, so
it did not need to be tracked for esc50; the only differences were the
data extractor and the step and epoch counts. The pylint cleanup was
done on the extractor.
---
 .../data_extractor/__init__.py                |   3 +-
 .../data_extractor/esc50_extractor.py         | 127 ++++++++++++++++++
 2 files changed, 129 insertions(+), 1 deletion(-)
 create mode 100644 whoot_model_training/whoot_model_training/data_extractor/esc50_extractor.py

diff --git a/whoot_model_training/whoot_model_training/data_extractor/__init__.py b/whoot_model_training/whoot_model_training/data_extractor/__init__.py
index 2648cb0..32b6689 100644
--- a/whoot_model_training/whoot_model_training/data_extractor/__init__.py
+++ b/whoot_model_training/whoot_model_training/data_extractor/__init__.py
@@ -7,5 +7,6 @@
     buowset_extractor,
     buowset_binary_extractor,
 )
+from .esc50_extractor import esc50_extractor
 
-__all__ = ["buowset_extractor", "buowset_binary_extractor"]
+__all__ = ["buowset_extractor", "buowset_binary_extractor", "esc50_extractor"]
diff --git a/whoot_model_training/whoot_model_training/data_extractor/esc50_extractor.py b/whoot_model_training/whoot_model_training/data_extractor/esc50_extractor.py
new file mode 100644
index 0000000..e5fdab7
--- /dev/null
+++ b/whoot_model_training/whoot_model_training/data_extractor/esc50_extractor.py
@@ -0,0 +1,127 @@
+"""Standardizes the format of the ESC-50 dataset
+
+Inspired by https://github.com/UCSD-E4E/pyha-analyzer-2.0/
+    tree/main/pyha_analyzer/extractors
+
+The idea behind extractors is that they take raw data, and
+format it into a uniform dataset format, AudioDataset
+
+This way, it should be easier to define what a
+common audio dataset format is between
+parts of the codebase for training
+
+Supports multilabel.
+
+Dataset: https://github.com/karolpiczak/ESC-50#
+"""
+
+import os
+from dataclasses import dataclass
+
+import numpy as np
+from datasets import (
+    load_dataset,
+    Audio,
+    DatasetDict,
+    ClassLabel,
+    Sequence,
+)
+from ..dataset import AudioDataset
+
+
+def one_hot_encode(row: dict, classes: list):
+    """One hot Encodes a list of labels
+    Args:
+        row (dict): row of data in a dataset containing a labels column
+        classes: a list of classes
+    """
+    one_hot = np.zeros(len(classes))
+    one_hot[row["labels"]] = 1
+    row["labels"] = np.array(one_hot, dtype=float)
+    return row
+
+
+@dataclass
+class ESC50Params():
+    """Parameters that describe ESC-50
+
+    validation_fold (int): label for valid split
+    test_fold (int): label for valid split
+    sample_rate (int): sample rate of the data
+    filepath (string): name of column in csv for filepaths
+    """
+    validation_fold = 4
+    test_fold = 5
+    sample_rate = 44_100
+    filepath = "filename"
+
+
+def esc50_extractor(
+        metadata_csv,
+        parent_path,
+        output_path,
+        params: ESC50Params = ESC50Params()
+):
+    """Extracts raw data in the ESC-50 format into an AudioDataset
+
+    Args:
+        Metdata_csv (str): Path to csv containing ESC-50 metadata
+        parent_path (str): Path to the parent folder for all audio data.
+                           Note it's assumed the audio filepath
+                           in the csv is relative to parent_path
+        output_path (str): Path to where HF cache for this dataset should live
+        validation_fold (int): which fold is considered the validation set
+                               Default 4
+        test_fold (int): Which fold is considered the test set Default 5
+        sample_rate (int): Sample Rate of the audio files Default: 44_100
+        filepath (str): Name of the column in the dataset containing
+                        the filepaths Default: filename
+
+    Returns:
+        (AudioDataset): See dataset.py, AudioDatasets are considered
+                        the universal dataset for the training pipeline.
+ """ + # Hugging face by default defines a train split + dataset = load_dataset("csv", data_files=metadata_csv)["train"] + dataset = dataset.rename_column("category", "labels") + + dataset = dataset.class_encode_column("labels") + + class_list = dataset.features["labels"].names + + multilabel_class_label = Sequence(ClassLabel(names=class_list)) + + dataset = dataset.map( + lambda row: one_hot_encode(row, class_list) + ).cast_column( + "labels", + multilabel_class_label + ) + + dataset = dataset.add_column( + "audio", [ + os.path.join(parent_path, + file) for file in dataset[params.filepath] + ] + ) + dataset = dataset.add_column("filepath", dataset["audio"]) + dataset = dataset.cast_column("audio", + Audio(sampling_rate=params.sample_rate)) + + # Create splits of the data + test_ds = dataset.filter(lambda x: x["fold"] == params.test_fold) + valid_ds = dataset.filter(lambda x: x["fold"] == params.validation_fold) + train_ds = dataset.filter( + lambda x: ( + x["fold"] != params.test_fold + and x["fold"] != params.validation_fold + ) + ) + + dataset = AudioDataset( + DatasetDict({"train": train_ds, "valid": valid_ds, "test": test_ds}) + ) + + dataset.save_to_disk(output_path) + + return dataset From 8dc307beb309ebb154e87d8513fd9856b8ec3493 Mon Sep 17 00:00:00 2001 From: Sean Perry Date: Wed, 13 Aug 2025 11:06:35 -0700 Subject: [PATCH 118/120] Swap test_fold and vaild_fold in buowset_extractor.py --- .../whoot_model_training/data_extractor/buowset_extractor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/whoot_model_training/whoot_model_training/data_extractor/buowset_extractor.py b/whoot_model_training/whoot_model_training/data_extractor/buowset_extractor.py index 55b96fd..564558d 100644 --- a/whoot_model_training/whoot_model_training/data_extractor/buowset_extractor.py +++ b/whoot_model_training/whoot_model_training/data_extractor/buowset_extractor.py @@ -102,8 +102,8 @@ def buowset_extractor( ds = ds.cast_column("audio", Audio(sampling_rate=params.sr)) # Create splits of the data - test_ds = ds.filter(lambda x: x["fold"] == params.validation_fold) - valid_ds = ds.filter(lambda x: x["fold"] == params.test_fold) + test_ds = ds.filter(lambda x: x["fold"] == params.test_fold) + valid_ds = ds.filter(lambda x: x["fold"] == params.validation_fold) train_ds = ds.filter( lambda x: x[ "fold" From 99db502766062ff9d2f1452cfb306e280223cf6f Mon Sep 17 00:00:00 2001 From: Sean Perry Date: Wed, 13 Aug 2025 11:22:36 -0700 Subject: [PATCH 119/120] Removed unneeded change for this PR --- .gitignore | 2 -- 1 file changed, 2 deletions(-) diff --git a/.gitignore b/.gitignore index 08ed59a..3d62566 100644 --- a/.gitignore +++ b/.gitignore @@ -207,5 +207,3 @@ pip-selfcheck.json # End of https://www.toptal.com/developers/gitignore/api/python,venv,visualstudiocode uv.lock -data -model_checkpoints \ No newline at end of file From 618da32386153dd61aa160799a05d600965e31b5 Mon Sep 17 00:00:00 2001 From: Sean1572 Date: Wed, 13 Aug 2025 16:31:48 -0700 Subject: [PATCH 120/120] Linted and added more documentation --- pylintrc | 1 + whoot_model_training/train.py | 13 ++-- whoot_model_training/train_binary.py | 19 +++--- .../whoot_model_training/__init__.py | 3 +- .../data_extractor/__init__.py | 2 +- .../data_extractor/buowset_extractor.py | 33 +++++---- .../data_extractor/esc50_extractor.py | 9 +-- .../whoot_model_training/dataset.py | 40 ++++++----- .../whoot_model_training/logger.py | 21 ++++-- .../whoot_model_training/metrics.py | 27 +++++--- .../whoot_model_training/models/__init__.py | 2 +- 
 .../whoot_model_training/models/model.py      | 67 +++++++++++++++----
 .../whoot_model_training/models/timm_model.py | 59 ++++++++++------
 .../preprocessors/__init__.py                 |  2 +-
 .../preprocessors/base_preprocessor.py        | 44 +++++++-----
 .../spectrogram_preprocessors.py              | 30 +++++++--
 .../whoot_model_training/trainer.py           | 52 ++++++++------
 17 files changed, 270 insertions(+), 154 deletions(-)

diff --git a/pylintrc b/pylintrc
index 4c4c4de..4b1c907 100644
--- a/pylintrc
+++ b/pylintrc
@@ -1,2 +1,3 @@
+[disable]
 max-args=10
 max-positional-arguments=10
\ No newline at end of file
diff --git a/whoot_model_training/train.py b/whoot_model_training/train.py
index aa31161..316f860 100644
--- a/whoot_model_training/train.py
+++ b/whoot_model_training/train.py
@@ -1,4 +1,4 @@
-"""Trains a Mutliclass Model with Pytorch and Huggingface
+"""Trains a Multiclass Model with Pytorch and Huggingface.
 
 This script can be used to run experiments with different models
 and datasets to create any model for bioacoustic classification
@@ -34,12 +34,12 @@
 
 def parse_config(config_path: str) -> dict:
-    """wrapper to parse config
+    """Wrapper to parse config.
 
     Args:
         config_path (str): path to config file for training!
 
-    returns:
+    Returns:
         (dict): hyperparameters parameters
     """
     config = {}
@@ -49,7 +49,7 @@
 
 def train(config):
-    """Highest level logic for training
+    """Highest level logic for training.
 
     Does the following:
@@ -61,7 +61,6 @@
     Args:
         config (dict): the config used for training. Defined in yaml file
     """
-
     # Extract the dataset
     ds = buowset_extractor(
         metadata_csv=config["metadata_csv"],
@@ -70,7 +69,7 @@
     )
 
     # Create the model
-    run_name = "efficientnet_b1_testing_confusion_matrix_no_data_aug"
+    run_name = "flac_pylint_test_efficientnet_b1_buowset"
     model_config = TimmModelConfig(
         timm_model="efficientnet_b1",
         num_classes=ds.get_num_classes())
@@ -145,7 +144,7 @@
 
 def init_env(config: dict):
-    """Sets up local environment for COMET-ML training logging
+    """Sets up local environment for COMET-ML training logging.
 
     Args:
         config (dict): at a minimum this has the project name and
         CUDA devices that are allowed to be used.
diff --git a/whoot_model_training/train_binary.py b/whoot_model_training/train_binary.py
index a86c867..4dc7707 100644
--- a/whoot_model_training/train_binary.py
+++ b/whoot_model_training/train_binary.py
@@ -1,4 +1,4 @@
-"""Trains a Mutliclass Model with Pytorch and Huggingface
+"""Trains a Multiclass Model with Pytorch and Huggingface.
 
 This script can be used to run experiments with different models
 and datasets to create any model for bioacoustic classification
@@ -18,7 +18,7 @@
 
 from whoot_model_training.trainer import WhootTrainer, WhootTrainingArguments
 from whoot_model_training.data_extractor import buowset_binary_extractor
-from whoot_model_training.models import TimmModel, TimmInputs
+from whoot_model_training.models import TimmModel, TimmInputs, TimmModelConfig
 from whoot_model_training import CometMLLoggerSupplement
 
 from whoot_model_training.preprocessors import (
@@ -34,12 +34,12 @@
 
 def parse_config(config_path: str) -> dict:
-    """wrapper to parse config
+    """Wrapper to parse config.
 
     Args:
         config_path (str): path to config file for training!
- returns: + Returns: (dict): hyperparameters parameters """ config = {} @@ -49,7 +49,7 @@ def parse_config(config_path: str) -> dict: def train(config): - """Highest level logic for training + """Highest level logic for training! Does the following: - Formats the dataset into an AudioDataset @@ -61,7 +61,6 @@ def train(config): Args: config (dict): the config used for training. Defined in yaml file """ - # Extract the dataset ds = buowset_binary_extractor( metadata_csv=config["metadata_csv"], @@ -71,8 +70,10 @@ def train(config): # Create the model run_name = "efficientnet_b1_testing_confusion_matrix_no_data_aug" - model = TimmModel(timm_model="efficientnet_b1", - num_classes=ds.get_num_classes()) + model_config = TimmModelConfig( + timm_model="efficientnet_b1", + num_classes=ds.get_num_classes()) + model = TimmModel(model_config) # Preprocessors @@ -142,7 +143,7 @@ def train(config): def init_env(config: dict): - """Sets up local environment for COMET-ML training logging + """Sets up local environment for COMET-ML training logging. Args: config (dict): at a minimum this has the project name and CUDA devices that are allowed to be used. diff --git a/whoot_model_training/whoot_model_training/__init__.py b/whoot_model_training/whoot_model_training/__init__.py index feda7bb..638ac1f 100644 --- a/whoot_model_training/whoot_model_training/__init__.py +++ b/whoot_model_training/whoot_model_training/__init__.py @@ -1,5 +1,4 @@ -"""Logging Toolkit for different MLops platforms -""" +"""Logging Toolkit for different MLops platforms.""" from .logger import CometMLLoggerSupplement diff --git a/whoot_model_training/whoot_model_training/data_extractor/__init__.py b/whoot_model_training/whoot_model_training/data_extractor/__init__.py index 32b6689..5e0ffe7 100644 --- a/whoot_model_training/whoot_model_training/data_extractor/__init__.py +++ b/whoot_model_training/whoot_model_training/data_extractor/__init__.py @@ -1,4 +1,4 @@ -"""A zoo for extractors +"""A zoo for extractors. Extractors convert raw data into AudioDatasets Ideally you make a new Extractor for each new raw dataset diff --git a/whoot_model_training/whoot_model_training/data_extractor/buowset_extractor.py b/whoot_model_training/whoot_model_training/data_extractor/buowset_extractor.py index 564558d..10e9e83 100644 --- a/whoot_model_training/whoot_model_training/data_extractor/buowset_extractor.py +++ b/whoot_model_training/whoot_model_training/data_extractor/buowset_extractor.py @@ -1,4 +1,4 @@ -"""Standardizes the format of the buowset dataset +"""Standardizes the format of the buowset dataset. Inspired by https://github.com/UCSD-E4E/pyha-analyzer-2.0/ tree/main/pyha_analyzer/extractors @@ -28,7 +28,8 @@ def one_hot_encode(row: dict, classes: list): - """One hot Encodes a list of labels + """One hot Encodes a list of labels. + Args: row (dict): row of data in a dataset containing a labels column classes: a list of classes @@ -41,12 +42,13 @@ def one_hot_encode(row: dict, classes: list): @dataclass class BuowsetParams(): - """Parameters that describe the Buowset + """Parameters that describe the Buowset. 

-    validation_fold (int): label for valid split
-    test_fold (int): label for valid split
-    sample_rate (int): sample rate of the data
-    filepath (int): name of column in csv for filepaths
+    Args:
+        validation_fold (int): label for valid split
+        test_fold (int): label for test split
+        sample_rate (int): sample rate of the data
+        filepath (str): name of column in csv for filepaths
     """
     validation_fold = 4
     test_fold = 3
@@ -60,7 +62,7 @@
         output_path,
         params: BuowsetParams = BuowsetParams()
 ):
-    """Extracts raw data in the buowset format into an AudioDataset
+    """Extracts raw data in the buowset format into an AudioDataset.
 
     Args:
         Metdata_csv (str): Path to csv containing buowset metadata
@@ -119,13 +121,13 @@
 
 def binarize_data(row, target_col=0):
-    """ Convert a multilabel label into a binary one
+    """Convert a multilabel label into a binary one.
 
     Args:
         row (dict): an example of data
         target_col (int): which index is the label for no_buow
 
-    returns
+    Returns:
         row (dict): now with a binary label instead
     """
     row["labels"] = [row["labels"][target_col], 1-row["labels"][target_col]]
@@ -137,10 +139,11 @@
         parent_path,
         output_path,
         target_col=0):
-    """Extracts raw data in the buowset format into an AudioDataset
-    BUT only allows for two classes: no_buow, yes_buow
+    """Extracts raw data in the buowset format into an AudioDataset.
+
+    BUT only allows for two classes: no_buow, yes_buow
 
-    Args:
+    Args:
validation_fold (int): label for valid split test_fold (int): label for valid split @@ -62,7 +63,7 @@ def esc50_extractor( output_path, params: ESC50Params = ESC50Params() ): - """Extracts raw data in the ESC-50 format into an AudioDataset + """Extracts raw data in the ESC-50 format into an AudioDataset. Args: Metdata_csv (str): Path to csv containing ESC-50 metadata diff --git a/whoot_model_training/whoot_model_training/dataset.py b/whoot_model_training/whoot_model_training/dataset.py index ec11f85..70e86c4 100644 --- a/whoot_model_training/whoot_model_training/dataset.py +++ b/whoot_model_training/whoot_model_training/dataset.py @@ -1,4 +1,5 @@ -""" +"""The Canonical Dataset used for any and all bioacoustic training. + Pulled from: https://github.com/UCSD-E4E/pyha-analyzer-2.0/blob/main/pyha_analyzer/dataset.py Key idea is we define a generic AudioDataset with uniform features @@ -14,8 +15,7 @@ class AudioDataset(DatasetDict): - """ - AudioDataset Class + """AudioDataset Class. If your dataset is an AudioDataset, it can be read by the rest of the system @@ -29,12 +29,16 @@ class AudioDataset(DatasetDict): - audio (Audio Column type from hugging face) """ def __init__(self, ds: DatasetDict): + """Creates the Audio Datasets. + + ds should be in the AudioDataset format after + being extracted by extractors + """ self.validate_format(ds) super().__init__(ds) def validate_format(self, ds: DatasetDict): - """Validates dataset is correctly formatted and ready to be used for - training + """Validates dataset is correctly formatted. Raises: AssertionError if dataset is not correctly formatted. @@ -51,28 +55,30 @@ def validate_format(self, ds: DatasetDict): assert column in dataset.features, state def get_num_classes(self): - """ - Returns: - (int): the number of classes in this dataset + """Gets the number of classes in the dataset. + + Returns: + (int): the number of classes in this dataset """ return self["train"].features["labels"].feature.num_classes def get_number_species(self) -> int: - """ - PyhaAnalyzer uses `get_number_species` for getting class count - This... isn't always the case that the dataset is species only - (could have calls!) - To support legacy PyhaAnalyzer, we therefore have this function. + """Get the number of classes in the dataset! - This should be deprecated in future versions of PyhaAnalyzer + PyhaAnalyzer uses `get_number_species` for getting class count + This... isn't always the case that the dataset is species only + (could have calls!) + To support legacy PyhaAnalyzer, we therefore have this function. - return - (int): number of classes + This should be deprecated in future versions of PyhaAnalyzer + + Returns: + (int): number of classes """ return self.get_num_classes() def get_class_labels(self) -> ClassLabel: - """Class mapping for this dataset + """Class mapping for this dataset. A common problem is when moving between datasets creating mappings between classes diff --git a/whoot_model_training/whoot_model_training/logger.py b/whoot_model_training/whoot_model_training/logger.py index a7ca5c1..5dad594 100644 --- a/whoot_model_training/whoot_model_training/logger.py +++ b/whoot_model_training/whoot_model_training/logger.py @@ -1,4 +1,4 @@ -""" Contains useful tools for additional logging +"""Contains useful tools for additional logging. 

 For example, CometMLLoggerSupplement
 adds additional logging
 for data augmentations used compared
@@ -23,13 +23,25 @@
 class CometMLLoggerSupplement():
     """
     def __init__(self, augmentations, name):
+        """Log in and start new experiment.
+
+        Args:
+            augmentations: list of augmentations
+                To record what was used during run
+            name (str): run name
+        """
         comet_ml.login()
         self.start(augmentations, name)
 
     def start(self, augmentations, name):
-        """Begins a new set of experiments
+        """Begins a new set of experiments.
 
         Helpful for cases where a new run has begun
+
+        Args:
+            augmentations: list of augmentations
+                To record what was used during run
+            name (str): run name
         """
         self.experiment = comet_ml.start()
 
@@ -37,12 +49,11 @@
         self.experiment.set_name(name)
 
     def end(self):
-        """Fully ends experiment if still running
-        """
+        """Fully ends experiment if still running."""
        return self.experiment.end()
 
     def log_task(self, task_name):
-        """Log what task this model should be listed under
+        """Log what task this model should be listed under.
 
         Args:
             task_name: usually what task the model is doing
diff --git a/whoot_model_training/whoot_model_training/metrics.py b/whoot_model_training/whoot_model_training/metrics.py
index 3b4033b..6206f43 100644
--- a/whoot_model_training/whoot_model_training/metrics.py
+++ b/whoot_model_training/whoot_model_training/metrics.py
@@ -1,4 +1,4 @@
-""" Metrics for Bioacoustic multilabel Models
+"""Metrics for Bioacoustic multilabel Models.
 
 Helps us evaluate which models do well
 
@@ -19,23 +19,30 @@
 
 class WhootMutliClassMetrics(AudioClassificationMetrics):
-    """Gets CMAP, ROCAUC, and confusion matrices and reports them to
-    Comet-ML dashboards
+    """Report metrics to logging.
+
+    Supports CMAP, ROCAUC, and confusion matrices,
+    and reports them to Comet-ML dashboards.
     """
     def __init__(self, classes: list):
+        """Initializes metric reporting.
+
+        classes (list): all classes used by model
+        """
         self.classes = classes
         self.training = True
         super().__init__([], len(classes), multilabel=True)
 
     def __call__(self, eval_pred) -> dict[str, float]:
-        """Log all metrics
+        """Log all metrics.
 
-        eval_pred: package of data provided by trainer
-        contains
-        - predictions: np.array of model outputs
-        - label_ids: np.array of ground truth targets
+        Args:
+            eval_pred: package of data provided by trainer
+                contains
+                - predictions: np.array of model outputs
+                - label_ids: np.array of ground truth targets
 
-        returns:
+        Returns:
             (dict) key name of metric, float metric score
         """
         # CMAP / ROCAUC, done by AudioClassificationMetrics
@@ -48,7 +55,7 @@
         return initial_metrics
 
     def log_comet_ml_only(self, eval_pred):
-        """Logs confusion matrix
+        """Logs confusion matrix.
 
         eval_pred: package of data provided by trainer
         contains
diff --git a/whoot_model_training/whoot_model_training/models/__init__.py b/whoot_model_training/whoot_model_training/models/__init__.py
index ce57c26..3c539ff 100644
--- a/whoot_model_training/whoot_model_training/models/__init__.py
+++ b/whoot_model_training/whoot_model_training/models/__init__.py
@@ -1,4 +1,4 @@
-"""a Bioacoustic Model Zoo
+"""A Bioacoustic Model Zoo!
Example: `from whoot_model_training.models import TimmModel diff --git a/whoot_model_training/whoot_model_training/models/model.py b/whoot_model_training/whoot_model_training/models/model.py index c36286c..aedc56f 100644 --- a/whoot_model_training/whoot_model_training/models/model.py +++ b/whoot_model_training/whoot_model_training/models/model.py @@ -1,4 +1,4 @@ -"""Abstract Model Class for training +"""Abstract Model Class for training. Any model trained with this repo SHOULD inherit from these classes found here @@ -54,7 +54,7 @@ def wrapper(self, x=None, **kwarg): class ModelOutput(dict, UserDict): - """ModelOutput + """ModelOutput. Object that stores the output of a model This allows for standardizing model outputs @@ -78,6 +78,15 @@ def __init__( labels: np.ndarray | None = None, loss: np.ndarray | None = None, ): + """Create a new output to a model! + + Args: + logits: raw output from model + embeddings: some latent space encoding of data + Useful for transfer learning! + labels: labels for computing metrics + loss: loss as computed by the model + """ super().__init__({ "predictions": logits, "logits": logits, @@ -88,6 +97,10 @@ def __init__( }) def items(self): + """Get all items in dict. + + But only if they are defined (not null)! + """ return [ (key, value) for ( key, value @@ -95,8 +108,7 @@ def items(self): class ModelInput(UserDict, dict): - - """ModelInput + """ModelInput. Specifies Input Types Hopefully should help standardize formatting for models @@ -114,6 +126,13 @@ def __init__( waveform: np.ndarray | None = None, spectrogram: np.ndarray | None = None, ): + """Create a new input to a model! + + Args: + labels: one_hot encoded labels + waveform: raw audio signal + spectrogram: 2d matrix to represent the waveform + """ super().__init__({ "labels": labels, "waveform": waveform, @@ -121,6 +140,10 @@ def __init__( }) def items(self): + """Get all items in dict. + + But only if they are defined (not null)! + """ return [ (key, value) for ( key, value @@ -129,7 +152,9 @@ def items(self): @classmethod def from_dict(cls, some_input: dict): - """Sometimes inputs are given as kwargs + """Recreates input for models! + + Sometimes inputs are given as kwargs So lets recreate correct inputs for model via building from a dictionary! """ @@ -146,17 +171,24 @@ def from_dict(cls, some_input: dict): class Model(PreTrainedModel, BaseModel): - """BaseModel Class for Whoot - """ - def __init__(self, *args, **kwargs): + """BaseModel Class for Whoot.""" + def __init__(self): + """Creates a basic model format. + + Anytime you create a new model, check if you need + to specify an input and output format for this model! + """ self.input_format = ModelInput self.output_format = ModelOutput - super().__init__(PretrainedConfig()) - super(BaseModel).__init__(*args, **kwargs) - + PreTrainedModel.__init__(self, PretrainedConfig()) + + assert hasattr(self.forward, "__wrapped__"), ( + "Please put `@has_required_inputs()", + "on the forward function of the model" + ) def get_embeddings(self, x: ModelInput) -> np.array: - """Gets an embedding for the model + """Gets an embedding for the model. This can be the final layer of a model backbone or a set of useful features @@ -173,8 +205,7 @@ def get_embeddings(self, x: ModelInput) -> np.array: @abstractmethod @has_required_inputs() def forward(self, x: ModelInput) -> ModelOutput: - """ - Runs some input x through the model + """Runs some input x through the model. 
In PyTorch models, this is the same forward function logits We just apply the convention for non Pytorch models, @@ -184,3 +215,11 @@ def forward(self, x: ModelInput) -> ModelOutput: Returns: ModelOutput: dict, a dictionary like object that describes """ + + def get_position_embeddings(self): + """Required by PretrainedModel, not needed for our work yet!""" + print("this model doesn't support position_embeddings") + + def resize_position_embeddings(self, new_num_position_embeddings: int): + """Required by PretrainedModel, not needed for our work yet!""" + print("this model doesn't support position_embeddings") diff --git a/whoot_model_training/whoot_model_training/models/timm_model.py b/whoot_model_training/whoot_model_training/models/timm_model.py index c0cff1e..aac2094 100644 --- a/whoot_model_training/whoot_model_training/models/timm_model.py +++ b/whoot_model_training/whoot_model_training/models/timm_model.py @@ -1,37 +1,42 @@ -"""Wrapper around the timms model zoo +"""Wrapper around the timms model zoo! - See https://timm.fast.ai/ +See https://timm.fast.ai/ - Timm model zoo good for computer vision models - Like CNNs, which are useful for spectrograms +Timm model zoo good for computer vision models +Like CNNs, which are useful for spectrograms - Great repo for models, but currently using this for demoing pipeline +Great repo for models, but currently using this for demoing pipeline """ import timm from torch import nn +from transformers import PretrainedConfig from .model import Model, ModelInput, ModelOutput, has_required_inputs -from transformers import PretrainedConfig + class TimmInputs(ModelInput): - """Input for TimmModel's + """Input for TimmModels. Specifies TimmModels needs labels and spectrograms that are Tensors - - Args: - Labels: the data's label for this batch - spectrogram: audio's spectrogram - waveform: Optional, audio waveform """ def __init__(self, labels, waveform=None, spectrogram=None): + """Creates TimmInputs. + + Args: + labels: the data's label for this batch + spectrogram: audio's spectrogram + waveform: Optional, audio waveform + """ # # Can use inputs to verify correct shape for upstream model # assert spectrogram.shape[1:] == (1, 100, 100) super().__init__(labels, waveform, spectrogram) self.labels = labels self.spectrogram = spectrogram + class TimmModelConfig(PretrainedConfig): + """Config for Timm Model Zoo Models!""" def __init__( self, timm_model="resnet34", @@ -40,23 +45,30 @@ def __init__( num_classes=6, **kwargs ): - self.timm_model= timm_model + """Creates Config. + + Args: + timm_model (str): name of a model in timm model zoo + pretrained (bool): use pretrain weights from timms + in_chans (int): channels in audio, mono is 1 + num_classes (int): number of classes in dataset, for cls + """ + self.timm_model = timm_model self.pretrained = pretrained self.in_chans = in_chans self.num_classes = num_classes super().__init__(**kwargs) + class TimmModel(Model, nn.Module): - """Model that uses a timm's model as its backbone with a - linear layer for classification - """ + """Model that uses a timm's model.""" config_class = TimmModelConfig def __init__( self, config: TimmModelConfig ): - """Init for TimmModel + """Init for TimmModel. 
kwargs: timm_model (str): name of model backbone from timms to use, @@ -70,12 +82,13 @@ def __init__( self.input_format = TimmInputs self.output_format = ModelOutput self.config = config - assert config.num_classes > 0 # Deep learning CNN backbone self.backbone = timm.create_model( - config.timm_model, pretrained=config.pretrained, in_chans=config.in_chans + config.timm_model, + pretrained=config.pretrained, + in_chans=config.in_chans ) # Unsure if 1000 is default for all timm models. Need to check this @@ -90,14 +103,18 @@ def __init__( self.loss = nn.BCEWithLogitsLoss() def set_custom_loss(self, loss_fn): - """Set a different loss function + """Set a different loss function. + For cases where we don't want BCEWithLogitsLoss + + Args: + loss_fn: Function to compute loss, ideally in pytorch """ self.loss = loss_fn @has_required_inputs() def forward(self, x: TimmInputs) -> ModelOutput: - """Model forward function + """Model forward function. Args: x: (TimmInputs): The specific input format for Timm Models diff --git a/whoot_model_training/whoot_model_training/preprocessors/__init__.py b/whoot_model_training/whoot_model_training/preprocessors/__init__.py index 43cfa11..8efacfb 100644 --- a/whoot_model_training/whoot_model_training/preprocessors/__init__.py +++ b/whoot_model_training/whoot_model_training/preprocessors/__init__.py @@ -1,4 +1,4 @@ -""" A collection of online preprocessors +"""A collection of online preprocessors. During training online preprocessors convert data into data ready to be given to a model diff --git a/whoot_model_training/whoot_model_training/preprocessors/base_preprocessor.py b/whoot_model_training/whoot_model_training/preprocessors/base_preprocessor.py index 821b157..a7ad953 100644 --- a/whoot_model_training/whoot_model_training/preprocessors/base_preprocessor.py +++ b/whoot_model_training/whoot_model_training/preprocessors/base_preprocessor.py @@ -1,4 +1,4 @@ -"""Default Class for Preprocessing the data +"""Default Class for Preprocessing the data. The dataset is one thing, what we feed into the models is another Models may require spectrograms, waveforms, etc @@ -25,28 +25,30 @@ class SpectrogramModelInPreprocessors(PreProcessorBase): - """ Defines a preprocessor that after formatting the audio - passes a spectrogram into a ModelInput object. + """Defines a preprocessor that after formatting the audio. + + Passes a spectrogram into a ModelInput object. """ def __init__( self, spec_preprocessor: PreProcessorBase, model_input: ModelInput, ): - """ Wrapper to get the raw spectrogram output of spec_preprocessor + """Wrapper to get the raw spectrogram output of spec_preprocessor. + and format it neatly into a model_input Args: spec_preprocessor (PreProcessorBase): a preprocessor that creates spectrograms - ModelInput (ModelInput): How the model like input data formatted + model_input (ModelInput): How the model like input data formatted """ self.spec_preprocessor = spec_preprocessor self.model_input = model_input super().__init__(name="SpectrogramModelInPreprocessors") def __call__(self, batch: dict) -> ModelInput: - """Processes a batch of AudioDataset rows + """Processes a batch of AudioDataset rows. 

         kwargs:
             timm_model (str): name of model backbone from timms to use,
@@ -70,12 +82,13 @@
         self.input_format = TimmInputs
         self.output_format = ModelOutput
         self.config = config
-
         assert config.num_classes > 0
 
         # Deep learning CNN backbone
         self.backbone = timm.create_model(
-            config.timm_model, pretrained=config.pretrained, in_chans=config.in_chans
+            config.timm_model,
+            pretrained=config.pretrained,
+            in_chans=config.in_chans
         )
 
         # Unsure if 1000 is default for all timm models. Need to check this
@@ -90,14 +103,18 @@
         self.loss = nn.BCEWithLogitsLoss()
 
     def set_custom_loss(self, loss_fn):
-        """Set a different loss function
+        """Set a different loss function.
+
         For cases where we don't want BCEWithLogitsLoss
+
+        Args:
+            loss_fn: Function to compute loss, ideally in pytorch
         """
         self.loss = loss_fn
 
     @has_required_inputs()
     def forward(self, x: TimmInputs) -> ModelOutput:
-        """Model forward function
+        """Model forward function.
 
         Args:
             x: (TimmInputs): The specific input format for Timm Models
diff --git a/whoot_model_training/whoot_model_training/preprocessors/__init__.py b/whoot_model_training/whoot_model_training/preprocessors/__init__.py
index 43cfa11..8efacfb 100644
--- a/whoot_model_training/whoot_model_training/preprocessors/__init__.py
+++ b/whoot_model_training/whoot_model_training/preprocessors/__init__.py
@@ -1,4 +1,4 @@
-""" A collection of online preprocessors
+"""A collection of online preprocessors.
 
 During training online preprocessors convert data
 into data ready to be given to a model
diff --git a/whoot_model_training/whoot_model_training/preprocessors/base_preprocessor.py b/whoot_model_training/whoot_model_training/preprocessors/base_preprocessor.py
index 821b157..a7ad953 100644
--- a/whoot_model_training/whoot_model_training/preprocessors/base_preprocessor.py
+++ b/whoot_model_training/whoot_model_training/preprocessors/base_preprocessor.py
@@ -1,4 +1,4 @@
-"""Default Class for Preprocessing the data
+"""Default Class for Preprocessing the data.
 
 The dataset is one thing, what we feed into the models is another
 Models may require spectrograms, waveforms, etc
@@ -25,28 +25,30 @@
 
 class SpectrogramModelInPreprocessors(PreProcessorBase):
-    """ Defines a preprocessor that after formatting the audio
-    passes a spectrogram into a ModelInput object.
+    """Defines a preprocessor that wraps spectrogram creation.
+
+    After formatting the audio, passes a spectrogram into a ModelInput.
     """
     def __init__(
         self,
         spec_preprocessor: PreProcessorBase,
         model_input: ModelInput,
     ):
-        """ Wrapper to get the raw spectrogram output of spec_preprocessor
+        """Wrapper to get the raw spectrogram output of spec_preprocessor.
+
         and format it neatly into a model_input
 
         Args:
             spec_preprocessor (PreProcessorBase): a preprocessor that
                 creates spectrograms
-            ModelInput (ModelInput): How the model like input data formatted
+            model_input (ModelInput): How the model likes input data formatted
         """
         self.spec_preprocessor = spec_preprocessor
         self.model_input = model_input
         super().__init__(name="SpectrogramModelInPreprocessors")
 
     def __call__(self, batch: dict) -> ModelInput:
-        """Processes a batch of AudioDataset rows
+        """Processes a batch of AudioDataset rows.

         For this specific preprocessor, it creates a spectrogram then
         Formats the data as a ModelInput
@@ -59,8 +61,9 @@
 
 
 class MelModelInputPreprocessor(SpectrogramModelInPreprocessors):
-    """Demo of how SpectrogramModelInPreprocessors can work by using a specific
-    Kind of Spectrogram Preprocessor, BuowMelSpectrogramPreprocessors
+    """Demo of how SpectrogramModelInPreprocessors works.
+
+    Uses a kind of Spectrogram Preprocessor, BuowMelSpectrogramPreprocessors
 
     This was created in part because legacy implementation of
@@ -75,21 +78,26 @@
         augments: Augmentations = Augmentations(),
         spectrogram_params: SpectrogramParams = SpectrogramParams(),
     ):
-        """ Creates a Online preprocessor for MelSpectrograms Based Models
+        """Creates an online preprocessor for MelSpectrogram-based models.
 
         Formats input into spefific ModelInput format.
 
         Args:
-            ModelInput (ModelInput): How the model like input data formatted
-            Duration (int): Length in seconds of input
-            augment (dict): contains two keys: audio, spectrogram each defining
+            model_input (ModelInput): How the model likes input data formatted
+            duration (int): Length in seconds of input
+            augments (dict): contains two keys: audio,
+                spectrogram each defining
                 a dict of augmentation names and augmentations to run
-            class_list (list): the classes we are working with one-hot-encoding
-            n_fft (int): number of ffts
-            hop_length (int): hop length
-            power (int): power, defined by librosa
-            n_mels (int): number of mels for a melspectrogram
-            dataset_ref (AudioDataset): a external ref to an AudioDataset
+            spectrogram_params (SpectrogramParams):
+                has the following parameters:
+                class_list (list): the classes we are
+                    working with one-hot-encoding
+                n_fft (int): number of ffts
+                hop_length (int): hop length
+                power (int): power, defined by librosa
+                n_mels (int): number of mels for a melspectrogram
+                dataset_ref (AudioDataset): an
+                    external ref to an AudioDataset
         """
         spec_preprocessor = BuowMelSpectrogramPreprocessors(
             duration=duration,
diff --git a/whoot_model_training/whoot_model_training/preprocessors/spectrogram_preprocessors.py b/whoot_model_training/whoot_model_training/preprocessors/spectrogram_preprocessors.py
index b3fb6df..555a801 100644
--- a/whoot_model_training/whoot_model_training/preprocessors/spectrogram_preprocessors.py
+++ b/whoot_model_training/whoot_model_training/preprocessors/spectrogram_preprocessors.py
@@ -1,4 +1,5 @@
-"""
+"""Defines preprocessors for creating spectrograms.
+
 Pulled from pyha_analyzer/preprocessors/spectogram_preprocessors.py
 """
 from dataclasses import dataclass
@@ -12,7 +13,7 @@
 
 @dataclass
 class SpectrogramParams:
-    """ Dataclass for spectrogram Parameters
+    """Dataclass for spectrogram Parameters.
 
     n_fft: (int) number of fft bins
     hop_length (int) skip count
@@ -27,7 +28,7 @@
 
 @dataclass
 class Augmentations():
-    """Dataclass for the augmentations of the model
+    """Dataclass for the augmentations of the model.
 
     audio (list[dict]): per item key name of augmentation,
         value is the augmentation
@@ -39,7 +40,8 @@
 
 class BuowMelSpectrogramPreprocessors(PreProcessorBase):
-    """Preprocessor for processing audio into spectrograms
+    """Preprocessor for processing audio into spectrograms.
+
     Particularly for the buow dataset
     """
@@ -49,6 +51,14 @@
         augments: Augmentations = Augmentations(),
         spectrogram_params: SpectrogramParams = SpectrogramParams()
     ):
+        """Defines a BuowMelSpectrogramPreprocessors.
+
+        Args:
+            duration (float): length of chunk of data to train on
+            augments (Augmentations): An augmentation to apply to waveforms
+            spectrogram_params (SpectrogramParams):
+                config for spectrogram generation
+        """
         self.duration = duration
 
         self.augments = augments
@@ -62,6 +72,7 @@
         super().__init__(name="MelSpectrogramPreprocessor")
 
     def __call__(self, batch):
+        """Process a batch of data from an AudioDataset."""
         new_audio = []
         new_labels = []
         for item_idx in range(len(batch["audio"])):
@@ -109,12 +120,21 @@
         return batch
 
     def get_augmentations(self):
-        """Returns a list of augmentations
+        """Returns a list of augmentations.
+
         Perhaps for logging purposes
+
+        Returns:
+            (list) all the augmentations
         """
         return self.augments
 
     def __repr__(self):
+        """Use representation to describe the augmentations.
+
+        Returns:
+            (str) all information about this preprocessor
+        """
         return (
             f"""{self.name} Augmentations: {self.augments}
diff --git a/whoot_model_training/whoot_model_training/trainer.py b/whoot_model_training/whoot_model_training/trainer.py
index d57641b..93f8702 100644
--- a/whoot_model_training/whoot_model_training/trainer.py
+++ b/whoot_model_training/whoot_model_training/trainer.py
@@ -1,4 +1,5 @@
-""" Everything needed to train
+"""Everything needed to train!
+
 given a model and a dataset
 
 WhootTrainingArguments: A container for the
@@ -19,13 +20,20 @@
 
 class WhootTrainingArguments(PyhaTrainingArguments):
-    """Holds arguments use for training
-    """
+    """Holds arguments used for training."""
     def __init__(self,
-                 run_name,
+                 run_name: str,
                  subproject_name: str = "TESTING",
                  dataset_name: str = "DS_404"):
-
+        """Create Arguments.
+
+        Args:
+            run_name (str): name of the current run
+            subproject_name (str): name of subproject
+                These experiments are a part of
+            dataset_name (str): name of dataset
+                used for model experiments
+        """
         assert subproject_name is not None
         assert dataset_name is not None
 
@@ -51,21 +59,7 @@
 
 class WhootTrainer(PyhaTrainer):
-    """Trainers run the training of a model
-
-    Model (Model): a pytorch model for training
-    should inherit from BaseModel
-    see `models/model.py`
-    Dataset (AudioDataset): A canonical audio dataset
-    Ideally attached some a preprocessor and returns ModelInputs
-    training_args (WhootTrainingArugments):
-    All the parameters that define training
-    Logger (CometMLLoggerSupplement):
-    Class that adds additional logging
-    On top of logging done by PyhaTrainer
-    preprocessor (PreProcessorBase):
-    Preprocessor used for formatting the data
-    """
+    """Trainers run the training of a model."""
     # WhootTrainer is ment to mimic the huggingface trainer
     # Including number of arguments
     # Aside, we really should consider how useful R0913,R0917 is...
@@ -79,9 +73,23 @@
     def __init__(
         self,
         model=None,
         dataset=None,
         training_args=None,
         metrics=None,
         logger=None,
         preprocessor=None,
     ):
-
+        """Creates a trainer to hold training setup.
+
+        Args:
+            model (Model): a pytorch model for training
+                should inherit from BaseModel see `models/model.py`
+            dataset (AudioDataset): A canonical audio dataset
+                Ideally has a preprocessor attached so it returns ModelInputs
+            training_args (WhootTrainingArguments):
+                All the parameters that define training
+            logger (CometMLLoggerSupplement):
+                Class that adds additional logging
+                On top of logging done by PyhaTrainer
+            preprocessor (PreProcessorBase):
+                Preprocessor used for formatting the data
+        """
         metrics = WhootMutliClassMetrics(dataset.get_class_labels().names)
-        print(logger, type(logger))
+
         if logger is not None:
             logger.log_task(training_args.task_name)
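Taken together, the pieces patch 120 documents assemble the same way train.py does: extractor, then model config, then training arguments, then trainer. A minimal, hypothetical wiring sketch — the paths and run name are placeholders, the final `trainer.train()` call is assumed from the HuggingFace-style API that PyhaTrainer mimics, and a real run would also pass a preprocessor as train.py does:

    from whoot_model_training.data_extractor import buowset_extractor
    from whoot_model_training.models import TimmModel, TimmModelConfig
    from whoot_model_training.trainer import WhootTrainer, WhootTrainingArguments

    # Extract raw data into the uniform AudioDataset format
    ds = buowset_extractor(
        metadata_csv="labels.csv",   # placeholder path
        parent_path="audio/",        # placeholder path
        output_path="hf_cache/",     # placeholder path
    )

    # Build a timm-backed classifier sized to the dataset
    model = TimmModel(TimmModelConfig(
        timm_model="efficientnet_b1",
        num_classes=ds.get_num_classes(),
    ))

    # File the run under a subproject and dataset for Comet-ML
    args = WhootTrainingArguments(
        run_name="demo_run",         # placeholder name
        subproject_name="binary",
        dataset_name="buowset0",
    )

    # WhootTrainer builds WhootMutliClassMetrics from the dataset's labels
    trainer = WhootTrainer(model=model, dataset=ds, training_args=args)
    trainer.train()                  # assumed HF-style entry point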