From 3c978c8b17c78c41072d415f58e1e77bf7945d48 Mon Sep 17 00:00:00 2001 From: Katie Garwood Date: Mon, 12 Jan 2026 14:38:58 -0800 Subject: [PATCH] group by sites rather than by wav file to reduce leakage potential to reduce leakage, buowsets were grouped by their original wav file so that detections from the same file were not split up into different folds to reduce leakage. for further leakage mitigation, this change groups by site so that not only are the same original wav files kept in the same folds, but files from the same site also stay in the same fold. there are not many sites in buowset 1.0-1.2, so some of the classes are not represented in all the folds, like no matter how you split it, for some semblance of keeping distribution the same, one fold will never have chick begging if we stratify this way until we gain more data or more sites. --- create_dataset/strat_k_folds.py | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/create_dataset/strat_k_folds.py b/create_dataset/strat_k_folds.py index 0de2eb0..fbb799d 100644 --- a/create_dataset/strat_k_folds.py +++ b/create_dataset/strat_k_folds.py @@ -1,6 +1,6 @@ """Split buowset into stratified k-folds. -Groups detections from the same wav file into 'groups' +Groups detections from the same site into 'groups' and then determines the overall class distribution and the class distribution for each 'group'. It allocates all the groups to a 'fold' in a way where the folds @@ -8,7 +8,7 @@ dataset. Usage: - python3 strat_k_folds.py /path/to/metadata.csv + python3 strat_k_folds.py /path/to/metadata.csv /path/to/site_list.txt """ import argparse import pandas as pd @@ -18,17 +18,23 @@ from k_fold_split_copy import solve -def create_strat_folds(df): +def create_strat_folds(df, site_ids): """Create grouped stratified k-folds. Args: df (pd.Dataframe): The metadata csv from when the dataset was created. + site_ids (list): Site names (found in original file path) to group by. Returns: pd.DataFrame: The same metadata but with labels as ints and a new fold column to denote the fold that segment is apart of. """ num_classes = 6 + for index, row in df.iterrows(): + for site_id in site_ids: + if site_id in row['original_path']: + df.loc[index, 'site_id'] = site_id + print(df.head) original_df = df df['label'] = df['label'].replace('cluck', 0) df['label'] = df['label'].replace('coocoo', 1) @@ -37,7 +43,7 @@ def create_strat_folds(df): df['label'] = df['label'].replace('chick begging', 4) df['label'] = df['label'].replace('no_buow', 5) # group is the subset of the index which is the wav file they all come from - grouped = df.groupby('original_path') + grouped = df.groupby('site_id') group_names = [] group_matrix = [] for index, group in grouped: @@ -59,7 +65,7 @@ def create_strat_folds(df): # the % of each class in each fold print(f"Fold percents: {fold_percents}") print(folds) - grouped_original = original_df.groupby('original_path') + grouped_original = original_df.groupby('site_id') df_with_folds = pd.DataFrame() count = 0 for i, group in grouped_original: @@ -69,14 +75,17 @@ def create_strat_folds(df): return df_with_folds -def main(meta): +def main(meta, sites): """Execute main script. Args: meta (str): Path to metadata csv from creating the dataset. + sites (str): Path to sites to group by. """ df = pd.read_csv(meta, index_col=0) - df_with_folds = create_strat_folds(df) + with open(sites, 'r', encoding='utf-8') as file: + site_ids = [line.strip() for line in file.readlines()] + df_with_folds = create_strat_folds(df, site_ids) df_with_folds.to_csv("5-fold_meta.csv") @@ -86,5 +95,7 @@ def main(meta): ) parser.add_argument('meta', type=str, help='Path to metadata csv') + parser.add_argument('sites', type=str, + help='Path to site list') args = parser.parse_args() - main(args.meta) + main(args.meta, args.sites)