From ece3ed27fe42a286b23312a2dcf3152718aa7b06 Mon Sep 17 00:00:00 2001
From: Eric Bezzam
Date: Sun, 18 Aug 2024 20:12:04 +0200
Subject: [PATCH 1/5] Add hugging face dataset examples.

---
 .gitignore                             |   1 +
 examples/configs/hf_dataset.yaml       |  29 ++++
 examples/create_huggingface_dataset.py | 229 +++++++++++++++++++++++++
 examples/use_huggingface_dataset.py    |  50 ++++++
 4 files changed, 309 insertions(+)
 create mode 100644 examples/configs/hf_dataset.yaml
 create mode 100644 examples/create_huggingface_dataset.py
 create mode 100644 examples/use_huggingface_dataset.py

diff --git a/.gitignore b/.gitignore
index 959bbd5..5b82096 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
 project_env/
 .DS_Store
+dataset/*
 
 # from hydra
 outputs/
diff --git a/examples/configs/hf_dataset.yaml b/examples/configs/hf_dataset.yaml
new file mode 100644
index 0000000..99f5361
--- /dev/null
+++ b/examples/configs/hf_dataset.yaml
@@ -0,0 +1,29 @@
+# python examples/create_huggingface_dataset.py hf_token=YOUR_TOKEN
+hydra:
+  job:
+    chdir: True  # change to output folder
+  job_logging:
+    formatters:
+      simple:
+        format: '[%(levelname)s] - %(message)s'
+
+repo_id: bezzam/dummy-dataset
+seed: 0
+test_size: 0.15
+hf_token:
+
+data_dir:
+  audio:
+    dir: dataset/data_audio
+    type: wav
+  image:
+    dir: dataset/data_images
+    type: png
+  text:
+    dir: dataset/data_text
+    type: txt
+  label:
+    file: dataset/data_labels.csv
+    label: True
+
+stratify_by_column: label
diff --git a/examples/create_huggingface_dataset.py b/examples/create_huggingface_dataset.py
new file mode 100644
index 0000000..8fc4187
--- /dev/null
+++ b/examples/create_huggingface_dataset.py
@@ -0,0 +1,229 @@
+"""
+We will create a dataset with images, audio, and text data
+so that you can see how various data types can be pushed to
+Hugging Face!
+
+The default configuration is in `examples/configs/hf_dataset.yaml`.
+
+```bash
+# install
+pip install datasets huggingface_hub soundfile
+
+# make a WRITE token on Hugging Face: https://huggingface.co/settings/tokens
+
+# run
+python examples/create_huggingface_dataset.py \
+    hf_token=...
+```
+"""
+
+import hydra
+from hydra.utils import to_absolute_path
+import os
+import time
+import glob
+import numpy as np
+import soundfile as sf
+from PIL import Image as PILImage
+from datasets import Dataset, Image, Audio, ClassLabel
+from omegaconf import open_dict
+from huggingface_hub import upload_file
+import re
+import pandas as pd
+
+
+# -- helper functions for natural sorting, e.g. so that "2" comes before "10"
+def convert(text):
+    return int(text) if text.isdigit() else text.lower()
+
+
+def alphanum_key(key):
+    return [convert(c) for c in re.split("([0-9]+)", key)]
+
+
+def natural_sort(arr):
+    return sorted(arr, key=alphanum_key)
+
+
+@hydra.main(version_base=None, config_path="configs", config_name="hf_dataset")
+def main(config):
+
+    start_time = time.time()
+
+    # extract and check parameters
+    repo_id = config.repo_id
+    hf_token = config.hf_token
+    test_size = config.test_size
+
+    assert repo_id is not None, "Please provide a Hugging Face repo_id."
+    assert hf_token is not None, "Please provide a Hugging Face token."
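+
+    # each entry of `config.data_dir` describes one dataset column: either a
+    # folder of files (`dir` + `type`) or a CSV file of labels (`file`),
+    # as laid out in `examples/configs/hf_dataset.yaml`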
+
+    # to absolute path, as needed by Hugging Face upload
+    for data in config.data_dir:
+        if "dir" in config.data_dir[data]:
+            config.data_dir[data]["dir"] = to_absolute_path(config.data_dir[data]["dir"])
+        elif "file" in config.data_dir[data]:
+            config.data_dir[data]["file"] = to_absolute_path(config.data_dir[data]["file"])
+
+    # Step 1: Check data (create dummy data if not present)
+    n_files = 100  # number of dummy files to create
+    for data in config.data_dir:
+
+        # for directory of data
+        if "dir" in config.data_dir[data]:
+            input_dir = config.data_dir[data]["dir"]
+            data_type = config.data_dir[data]["type"]
+
+            if not os.path.exists(input_dir):
+                # create dummy data
+                print(f"-- Creating {n_files} dummy {data_type} files in {input_dir}")
+                os.makedirs(input_dir, exist_ok=True)
+                for i in range(n_files):
+                    if data_type == "png":
+                        img = np.random.randint(0, 255, (100, 100, 3), dtype=np.uint8)
+                        img_path = os.path.join(input_dir, f"{i}.png")
+                        PILImage.fromarray(img).save(img_path)
+                    elif data_type == "wav":
+                        audio = np.random.randn(16000)
+                        audio_path = os.path.join(input_dir, f"{i}.wav")
+                        sf.write(audio_path, audio, samplerate=16000)
+                    elif data_type == "txt":
+                        text = f"Hello, this is file {i}"
+                        text_path = os.path.join(input_dir, f"{i}.txt")
+                        with open(text_path, "w") as f:
+                            f.write(text)
+
+            # check number of files
+            files = glob.glob(os.path.join(input_dir, "*." + data_type))
+            n_files = len(files)
+            print(f"Found {n_files} {data_type} files in {input_dir}")
+
+        # for CSV file where each line is a data point
+        elif "file" in config.data_dir[data]:
+            input_file = config.data_dir[data]["file"]
+
+            if not os.path.exists(input_file):
+                # create dummy labels
+                labels = ["good", "ok", "bad"]
+                file_labels = np.random.choice(labels, n_files)
+                with open(input_file, "w") as f:
+                    for i in range(n_files):
+                        f.write(f"{i},{file_labels[i]}\n")
+                print(f"-- Created dummy labels file at {input_file}")
+
+            # check number of unique labels (open with Pandas)
+            df = pd.read_csv(input_file, header=None)
+            n_files = len(df)
+            labels = df[1].unique()
+            n_labels = len(labels)
+            print(f"Found {n_files} lines with {n_labels} unique labels ({labels}) in {input_file}")
+
+        else:
+            raise ValueError("Please provide either `dir` or `file` in data_dir")
+
+    # -- only keep common files across all datasets
+    bn = [os.path.basename(f).split(".")[0] for f in files]
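+    # (`files` still holds the listing of the last modality from Step 1;
+    # the loop below intersects its basenames with every folder modality)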
+    for data in config.data_dir:
+        if "dir" in config.data_dir[data]:
+            input_dir = config.data_dir[data]["dir"]
+            data_type = config.data_dir[data]["type"]
+            files = glob.glob(os.path.join(input_dir, "*." + data_type))
+            bn_data = [os.path.basename(f).split(".")[0] for f in files]
+            bn = list(set(bn).intersection(bn_data))  # running intersection
+    common_files = natural_sort(bn)
+    print(f"Number of common files: {len(common_files)}")
+
+    # -- add common files into dictionary
+    for data in config.data_dir:
+        if "dir" in config.data_dir[data]:
+            with open_dict(config):
+                config.data_dir[data]["data"] = common_files
+        if "file" in config.data_dir[data]:
+            # take rows according to common_files
+            df = pd.read_csv(config.data_dir[data]["file"], header=None)
+            # -- make first column string
+            df[0] = df[0].astype(str)
+            df = df.set_index(0).loc[common_files]  # align rows with common_files order
+            with open_dict(config):
+                config.data_dir[data]["data"] = df[1].tolist()
+
+    # Step 2: Create train and test data
+    dataset_dict = {}
+
+    # -- create dictionary of content
+    for data in config.data_dir:
+        if "dir" in config.data_dir[data]:
+            files = config.data_dir[data]["data"]
+            data_type = config.data_dir[data]["type"]
+            data_files = [
+                os.path.join(config.data_dir[data]["dir"], f"{f}.{data_type}") for f in files
+            ]
+
+            if data_type in ["txt"]:
+                # open file content for text files
+                data_files = [open(f).read() for f in data_files]
+            dataset_dict[data] = data_files
+        elif "file" in config.data_dir[data]:
+            dataset_dict[data] = config.data_dir[data]["data"]
+
+    # -- create dataset
+    dataset = Dataset.from_dict(dataset_dict)
+    for data in config.data_dir:
+        if "dir" in config.data_dir[data]:
+            if config.data_dir[data]["type"] in ["png", "jpg", "jpeg", "tiff"]:
+                dataset = dataset.cast_column(data, Image())
+            elif config.data_dir[data]["type"] in ["wav", "mp3", "flac", "ogg"]:
+                dataset = dataset.cast_column(data, Audio())
+        elif "file" in config.data_dir[data]:
+            if config.data_dir[data]["label"]:
+                labels = sorted(set(config.data_dir[data]["data"]))  # deterministic mapping
+                dataset = dataset.cast_column(data, ClassLabel(names=labels))
+
+    # -- split into train and test
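+    # (`stratify_by_column` requires that column to be a `ClassLabel`,
+    # hence the cast above)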
+    dataset = dataset.train_test_split(
+        test_size=test_size,
+        seed=config.seed,
+        shuffle=True,
+        stratify_by_column=config.stratify_by_column,  # shuffle must be True
+    )
+    print(dataset)
+
+    """
+    DatasetDict({
+        train: Dataset({
+            features: ['audio', 'image', 'text', 'label'],
+            num_rows: 85
+        })
+        test: Dataset({
+            features: ['audio', 'image', 'text', 'label'],
+            num_rows: 15
+        })
+    })
+    """
+
+    # Step 3: Push to Hugging Face
+    dataset.push_to_hub(repo_id, token=hf_token)
+
+    # -- push individual files
+    for data in config.data_dir:
+        if "dir" in config.data_dir[data]:
+            # push first file
+            local_fp = os.path.join(
+                config.data_dir[data]["dir"],
+                config.data_dir[data]["data"][0] + "." + config.data_dir[data]["type"],
+            )
+            remote_fn = "example." + config.data_dir[data]["type"]
+            upload_file(
+                path_or_fileobj=local_fp,
+                path_in_repo=remote_fn,
+                repo_id=repo_id,
+                repo_type="dataset",
+                token=hf_token,
+            )
+
+    # total time in minutes
+    print(f"Total time: {(time.time() - start_time) / 60:.2f} minutes")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/use_huggingface_dataset.py b/examples/use_huggingface_dataset.py
new file mode 100644
index 0000000..73e0a12
--- /dev/null
+++ b/examples/use_huggingface_dataset.py
@@ -0,0 +1,50 @@
+"""
+In this script, we use the Hugging Face dataset made
+from the script examples/create_huggingface_dataset.py
+
+The dataset is available at:
+https://huggingface.co/bezzam/dummy-dataset
+
+```bash
+# install
+pip install datasets librosa soundfile
+
+# run
+python examples/use_huggingface_dataset.py
+```
+
+During the first run, the dataset will be downloaded and cached.
+Subsequent runs will use the cached dataset.
+
+"""
+
+from datasets import load_dataset
+import numpy as np
+
+
+# load train and test splits
+ds_train = load_dataset("bezzam/dummy-dataset", split="train")
+ds_test = load_dataset("bezzam/dummy-dataset", split="test")
+print(f"Number of training samples: {len(ds_train)}")
+print(f"Number of test samples: {len(ds_test)}")
+
+# load first example
+print("\n---- First example:")
+example = ds_train[0]
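+
+# -- audio and image columns are decoded on access: `example["audio"]` is a
+# dict with "array" and "sampling_rate", and `example["image"]` is a PIL image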
+
+# -- audio duration
+duration = len(example["audio"]["array"]) / example["audio"]["sampling_rate"]
+print(f"Duration of audio: {duration:.2f} seconds")
+
+# -- image size
+image = np.array(example["image"])
+print(f"Size of image: {image.shape}")
+
+# -- text
+text = example["text"]
+print(f"Text: {text}")
+
+# -- label
+label = example["label"]
+label_str = ds_train.features["label"].int2str(label)
+print(f"Label: {label_str}")

From 51e5bbf3f81b1dca669515618a9790f03da2f714 Mon Sep 17 00:00:00 2001
From: Eric Bezzam
Date: Sun, 18 Aug 2024 20:18:17 +0200
Subject: [PATCH 2/5] Update README.

---
 README.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.rst b/README.rst
index 0fcad78..413139a 100644
--- a/README.rst
+++ b/README.rst
@@ -54,6 +54,7 @@ choose the features that you like. This flexibility is one of the best
 * Unit tests and continuous integration.
 * Packaging and distribution.
 * Remove development.
+* Creating and sharing datasets with Hugging Face.
 
 The accompanying
 `slides `__

From aecb90782cc1d0a0b03dc5dad77f49b746706bdf Mon Sep 17 00:00:00 2001
From: Eric Bezzam
Date: Sun, 18 Aug 2024 21:53:11 +0200
Subject: [PATCH 3/5] Fix typo.

---
 README.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.rst b/README.rst
index f1495ad..f0f06e9 100644
--- a/README.rst
+++ b/README.rst
@@ -61,7 +61,7 @@ choose the features that you like. This flexibility is one of the best
 * Code formatting.
 * Unit tests and continuous integration.
 * Packaging and distribution.
-* Remove development.
+* Remote development.
 * Creating and sharing datasets with Hugging Face.
 
 The accompanying

From 9b28e3fda32b47bf9a72b51cf1468fa1461e69e1 Mon Sep 17 00:00:00 2001
From: Eric Bezzam
Date: Mon, 19 Aug 2024 08:27:50 +0200
Subject: [PATCH 4/5] Add randomness in image sizes.

---
 examples/create_huggingface_dataset.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/examples/create_huggingface_dataset.py b/examples/create_huggingface_dataset.py
index 8fc4187..df1bc28 100644
--- a/examples/create_huggingface_dataset.py
+++ b/examples/create_huggingface_dataset.py
@@ -80,13 +80,16 @@ def main(config):
                 os.makedirs(input_dir, exist_ok=True)
                 for i in range(n_files):
                     if data_type == "png":
-                        img = np.random.randint(0, 255, (100, 100, 3), dtype=np.uint8)
+                        dim = np.random.randint(100, 200)
+                        img = np.random.randint(0, 255, (dim, dim, 3), dtype=np.uint8)
                         img_path = os.path.join(input_dir, f"{i}.png")
                         PILImage.fromarray(img).save(img_path)
                     elif data_type == "wav":
-                        audio = np.random.randn(16000)
+                        duration = np.random.randint(1, 4)
+                        sample_rate = 16000
+                        audio = np.random.randn(duration * sample_rate)
                         audio_path = os.path.join(input_dir, f"{i}.wav")
-                        sf.write(audio_path, audio, samplerate=16000)
+                        sf.write(audio_path, audio, samplerate=sample_rate)
                     elif data_type == "txt":
                         text = f"Hello, this is file {i}"
                         text_path = os.path.join(input_dir, f"{i}.txt")

From 159b4908fa0c38218ed4b28db90be612daa69599 Mon Sep 17 00:00:00 2001
From: Eric Bezzam
Date: Thu, 3 Apr 2025 08:47:55 +0200
Subject: [PATCH 5/5] Update use_huggingface_dataset.py

---
 examples/use_huggingface_dataset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/use_huggingface_dataset.py b/examples/use_huggingface_dataset.py
index 73e0a12..da78de2 100644
--- a/examples/use_huggingface_dataset.py
+++ b/examples/use_huggingface_dataset.py
@@ -3,7 +3,7 @@
 from the script examples/create_huggingface_dataset.py
 
 The dataset is available at:
-https://huggingface.co/bezzam/dummy-dataset
+https://huggingface.co/datasets/bezzam/dummy-dataset
 
 ```bash
 # install