From ece3ed27fe42a286b23312a2dcf3152718aa7b06 Mon Sep 17 00:00:00 2001
From: Eric Bezzam
Date: Sun, 18 Aug 2024 20:12:04 +0200
Subject: [PATCH 1/5] Add hugging face dataset examples.

---
 .gitignore                             |   1 +
 examples/configs/hf_dataset.yaml       |  29 ++++
 examples/create_huggingface_dataset.py | 229 +++++++++++++++++++++++++
 examples/use_huggingface_dataset.py    |  50 ++++++
 4 files changed, 309 insertions(+)
 create mode 100644 examples/configs/hf_dataset.yaml
 create mode 100644 examples/create_huggingface_dataset.py
 create mode 100644 examples/use_huggingface_dataset.py

diff --git a/.gitignore b/.gitignore
index 959bbd5..5b82096 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
 project_env/
 .DS_Store
+dataset/*
 
 # from hydra
 outputs/
diff --git a/examples/configs/hf_dataset.yaml b/examples/configs/hf_dataset.yaml
new file mode 100644
index 0000000..99f5361
--- /dev/null
+++ b/examples/configs/hf_dataset.yaml
@@ -0,0 +1,29 @@
+# python examples/create_huggingface_dataset.py hf_token=YOUR_TOKEN
+hydra:
+  job:
+    chdir: True  # change to output folder
+  job_logging:
+    formatters:
+      simple:
+        format: '[%(levelname)s] - %(message)s'
+
+repo_id: bezzam/dummy-dataset
+seed: 0
+test_size: 0.15
+hf_token:
+
+data_dir:
+  audio:
+    dir: dataset/data_audio
+    type: wav
+  image:
+    dir: dataset/data_images
+    type: png
+  text:
+    dir: dataset/data_text
+    type: txt
+  label:
+    file: dataset/data_labels.csv
+    label: True
+
+stratify_by_column: label
diff --git a/examples/create_huggingface_dataset.py b/examples/create_huggingface_dataset.py
new file mode 100644
index 0000000..8fc4187
--- /dev/null
+++ b/examples/create_huggingface_dataset.py
@@ -0,0 +1,229 @@
+"""
+We will create a dataset with images, audio, and text data
+so that you can see how various data types can be pushed to
+Hugging Face!
+
+The default configuration is in `examples/configs/hf_dataset.yaml`.
+
+```bash
+# install
+pip install datasets huggingface_hub soundfile
+
+# make a WRITE token on Hugging Face: https://huggingface.co/settings/tokens
+
+# run
+python examples/create_huggingface_dataset.py \
+    hf_token=...
+```
+"""
+
+import hydra
+from hydra.utils import to_absolute_path
+import os
+import time
+import glob
+import numpy as np
+import soundfile as sf
+from PIL import Image as PILImage
+from datasets import Dataset, Image, Audio, ClassLabel
+from omegaconf import open_dict
+from huggingface_hub import upload_file
+import re
+import pandas as pd
+
+
+# -- helper functions for natural sorting, e.g. so that "2" comes before "10"
+def convert(text):
+    return int(text) if text.isdigit() else text.lower()
+
+
+def alphanum_key(key):
+    return [convert(c) for c in re.split("([0-9]+)", key)]
+
+
+def natural_sort(arr):
+    return sorted(arr, key=alphanum_key)
+
+
+@hydra.main(version_base=None, config_path="configs", config_name="hf_dataset")
+def main(config):
+
+    start_time = time.time()
+
+    # extract and check parameters
+    repo_id = config.repo_id
+    hf_token = config.hf_token
+    test_size = config.test_size
+
+    assert repo_id is not None, "Please provide a Hugging Face repo_id."
+    assert hf_token is not None, "Please provide a Hugging Face token."
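+
+    # each entry of `config.data_dir` describes one dataset column: either a
+    # folder of files (`dir` + `type`) or a CSV file of labels (`file`),
+    # as laid out in `examples/configs/hf_dataset.yaml`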
+
+    # to absolute path, as needed by Hugging Face upload
+    for data in config.data_dir:
+        if "dir" in config.data_dir[data]:
+            config.data_dir[data]["dir"] = to_absolute_path(config.data_dir[data]["dir"])
+        elif "file" in config.data_dir[data]:
+            config.data_dir[data]["file"] = to_absolute_path(config.data_dir[data]["file"])
+
+    # Step 1: Check data (create dummy data if not present)
+    n_files = 100  # number of dummy files to create
+    for data in config.data_dir:
+
+        # for directory of data
+        if "dir" in config.data_dir[data]:
+            input_dir = config.data_dir[data]["dir"]
+            data_type = config.data_dir[data]["type"]
+
+            if not os.path.exists(input_dir):
+                # create dummy data
+                print(f"-- Creating {n_files} dummy {data_type} files in {input_dir}")
+                os.makedirs(input_dir, exist_ok=True)
+                for i in range(n_files):
+                    if data_type == "png":
+                        img = np.random.randint(0, 255, (100, 100, 3), dtype=np.uint8)
+                        img_path = os.path.join(input_dir, f"{i}.png")
+                        PILImage.fromarray(img).save(img_path)
+                    elif data_type == "wav":
+                        audio = np.random.randn(16000)
+                        audio_path = os.path.join(input_dir, f"{i}.wav")
+                        sf.write(audio_path, audio, samplerate=16000)
+                    elif data_type == "txt":
+                        text = f"Hello, this is file {i}"
+                        text_path = os.path.join(input_dir, f"{i}.txt")
+                        with open(text_path, "w") as f:
+                            f.write(text)
+
+            # check number of files
+            files = glob.glob(os.path.join(input_dir, "*." + data_type))
+            n_files = len(files)
+            print(f"Found {n_files} {data_type} files in {input_dir}")
+
+        # for CSV file where each line is a data point
+        elif "file" in config.data_dir[data]:
+            input_file = config.data_dir[data]["file"]
+
+            if not os.path.exists(input_file):
+                # create dummy labels
+                labels = ["good", "ok", "bad"]
+                file_labels = np.random.choice(labels, n_files)
+                with open(input_file, "w") as f:
+                    for i in range(n_files):
+                        f.write(f"{i},{file_labels[i]}\n")
+                print(f"-- Created dummy labels file at {input_file}")
+
+            # check number of unique labels (open with Pandas)
+            df = pd.read_csv(input_file, header=None)
+            n_files = len(df)
+            labels = df[1].unique()
+            n_labels = len(labels)
+            print(f"Found {n_files} lines with {n_labels} unique labels ({labels}) in {input_file}")
+
+        else:
+            raise ValueError("Please provide either `dir` or `file` in data_dir")
+
+    # -- only keep common files across all datasets
+    bn = [os.path.basename(f).split(".")[0] for f in files]
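+    # (`files` still holds the listing of the last modality from Step 1;
+    # the loop below intersects its basenames with every folder modality)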
+    for data in config.data_dir:
+        if "dir" in config.data_dir[data]:
+            input_dir = config.data_dir[data]["dir"]
+            data_type = config.data_dir[data]["type"]
+            files = glob.glob(os.path.join(input_dir, "*." + data_type))
+            bn_data = [os.path.basename(f).split(".")[0] for f in files]
+            bn = list(set(bn).intersection(bn_data))  # running intersection
+    common_files = natural_sort(bn)
+    print(f"Number of common files: {len(common_files)}")
+
+    # -- add common files into dictionary
+    for data in config.data_dir:
+        if "dir" in config.data_dir[data]:
+            with open_dict(config):
+                config.data_dir[data]["data"] = common_files
+        if "file" in config.data_dir[data]:
+            # take rows according to common_files
+            df = pd.read_csv(config.data_dir[data]["file"], header=None)
+            # -- make first column string
+            df[0] = df[0].astype(str)
+            df = df.set_index(0).loc[common_files]  # align rows with common_files order
+            with open_dict(config):
+                config.data_dir[data]["data"] = df[1].tolist()
+
+    # Step 2: Create train and test data
+    dataset_dict = {}
+
+    # -- create dictionary of content
+    for data in config.data_dir:
+        if "dir" in config.data_dir[data]:
+            files = config.data_dir[data]["data"]
+            data_type = config.data_dir[data]["type"]
+            data_files = [
+                os.path.join(config.data_dir[data]["dir"], f"{f}.{data_type}") for f in files
+            ]
+
+            if data_type in ["txt"]:
+                # open file content for text files
+                data_files = [open(f).read() for f in data_files]
+            dataset_dict[data] = data_files
+        elif "file" in config.data_dir[data]:
+            dataset_dict[data] = config.data_dir[data]["data"]
+
+    # -- create dataset
+    dataset = Dataset.from_dict(dataset_dict)
+    for data in config.data_dir:
+        if "dir" in config.data_dir[data]:
+            if config.data_dir[data]["type"] in ["png", "jpg", "jpeg", "tiff"]:
+                dataset = dataset.cast_column(data, Image())
+            elif config.data_dir[data]["type"] in ["wav", "mp3", "flac", "ogg"]:
+                dataset = dataset.cast_column(data, Audio())
+        elif "file" in config.data_dir[data]:
+            if config.data_dir[data]["label"]:
+                labels = sorted(set(config.data_dir[data]["data"]))  # deterministic mapping
+                dataset = dataset.cast_column(data, ClassLabel(names=labels))
+
+    # -- split into train and test
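+    # (`stratify_by_column` requires that column to be a `ClassLabel`,
+    # hence the cast above)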
+    dataset = dataset.train_test_split(
+        test_size=test_size,
+        seed=config.seed,
+        shuffle=True,
+        stratify_by_column=config.stratify_by_column,  # shuffle must be True
+    )
+    print(dataset)
+
+    """
+    DatasetDict({
+        train: Dataset({
+            features: ['audio', 'image', 'text', 'label'],
+            num_rows: 85
+        })
+        test: Dataset({
+            features: ['audio', 'image', 'text', 'label'],
+            num_rows: 15
+        })
+    })
+    """
+
+    # Step 3: Push to Hugging Face
+    dataset.push_to_hub(repo_id, token=hf_token)
+
+    # -- push individual files
+    for data in config.data_dir:
+        if "dir" in config.data_dir[data]:
+            # push first file
+            local_fp = os.path.join(
+                config.data_dir[data]["dir"],
+                config.data_dir[data]["data"][0] + "." + config.data_dir[data]["type"],
+            )
+            remote_fn = "example." + config.data_dir[data]["type"]
+            upload_file(
+                path_or_fileobj=local_fp,
+                path_in_repo=remote_fn,
+                repo_id=repo_id,
+                repo_type="dataset",
+                token=hf_token,
+            )
+
+    # total time in minutes
+    print(f"Total time: {(time.time() - start_time) / 60:.2f} minutes")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/use_huggingface_dataset.py b/examples/use_huggingface_dataset.py
new file mode 100644
index 0000000..73e0a12
--- /dev/null
+++ b/examples/use_huggingface_dataset.py
@@ -0,0 +1,50 @@
+"""
+In this script, we use the Hugging Face dataset made
+from the script examples/create_huggingface_dataset.py
+
+The dataset is available at:
+https://huggingface.co/bezzam/dummy-dataset
+
+```bash
+# install
+pip install datasets librosa soundfile
+
+# run
+python examples/use_huggingface_dataset.py
+```
+
+During the first run, the dataset will be downloaded and cached.
+Subsequent runs will use the cached dataset.
+
+"""
+
+from datasets import load_dataset
+import numpy as np
+
+
+# load train and test splits
+ds_train = load_dataset("bezzam/dummy-dataset", split="train")
+ds_test = load_dataset("bezzam/dummy-dataset", split="test")
+print(f"Number of training samples: {len(ds_train)}")
+print(f"Number of test samples: {len(ds_test)}")
+
+# load first example
+print("\n---- First example:")
+example = ds_train[0]
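+
+# -- audio and image columns are decoded on access: `example["audio"]` is a
+# dict with "array" and "sampling_rate", and `example["image"]` is a PIL image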
+
+# -- audio duration
+duration = len(example["audio"]["array"]) / example["audio"]["sampling_rate"]
+print(f"Duration of audio: {duration:.2f} seconds")
+
+# -- image size
+image = np.array(example["image"])
+print(f"Size of image: {image.shape}")
+
+# -- text
+text = example["text"]
+print(f"Text: {text}")
+
+# -- label
+label = example["label"]
+label_str = ds_train.features["label"].int2str(label)
+print(f"Label: {label_str}")

From 51e5bbf3f81b1dca669515618a9790f03da2f714 Mon Sep 17 00:00:00 2001
From: Eric Bezzam
Date: Sun, 18 Aug 2024 20:18:17 +0200
Subject: [PATCH 2/5] Update README.

---
 README.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.rst b/README.rst
index 0fcad78..413139a 100644
--- a/README.rst
+++ b/README.rst
@@ -54,6 +54,7 @@ choose the features that you like. This flexibility is one of the best
 * Unit tests and continuous integration.
 * Packaging and distribution.
 * Remove development.
+* Creating and sharing datasets with Hugging Face.
 
 The accompanying
 `slides `__

From aecb90782cc1d0a0b03dc5dad77f49b746706bdf Mon Sep 17 00:00:00 2001
From: Eric Bezzam
Date: Sun, 18 Aug 2024 21:53:11 +0200
Subject: [PATCH 3/5] Fix typo.

---
 README.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.rst b/README.rst
index f1495ad..f0f06e9 100644
--- a/README.rst
+++ b/README.rst
@@ -61,7 +61,7 @@ choose the features that you like. This flexibility is one of the best
 * Code formatting.
 * Unit tests and continuous integration.
 * Packaging and distribution.
-* Remove development.
+* Remote development.
 * Creating and sharing datasets with Hugging Face.
 
 The accompanying

From 9b28e3fda32b47bf9a72b51cf1468fa1461e69e1 Mon Sep 17 00:00:00 2001
From: Eric Bezzam
Date: Mon, 19 Aug 2024 08:27:50 +0200
Subject: [PATCH 4/5] Add randomness in image sizes.

---
 examples/create_huggingface_dataset.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/examples/create_huggingface_dataset.py b/examples/create_huggingface_dataset.py
index 8fc4187..df1bc28 100644
--- a/examples/create_huggingface_dataset.py
+++ b/examples/create_huggingface_dataset.py
@@ -80,13 +80,16 @@ def main(config):
                 os.makedirs(input_dir, exist_ok=True)
                 for i in range(n_files):
                     if data_type == "png":
-                        img = np.random.randint(0, 255, (100, 100, 3), dtype=np.uint8)
+                        dim = np.random.randint(100, 200)
+                        img = np.random.randint(0, 255, (dim, dim, 3), dtype=np.uint8)
                         img_path = os.path.join(input_dir, f"{i}.png")
                         PILImage.fromarray(img).save(img_path)
                     elif data_type == "wav":
-                        audio = np.random.randn(16000)
+                        duration = np.random.randint(1, 4)
+                        sample_rate = 16000
+                        audio = np.random.randn(duration * sample_rate)
                         audio_path = os.path.join(input_dir, f"{i}.wav")
-                        sf.write(audio_path, audio, samplerate=16000)
+                        sf.write(audio_path, audio, samplerate=sample_rate)
                     elif data_type == "txt":
                         text = f"Hello, this is file {i}"
                         text_path = os.path.join(input_dir, f"{i}.txt")

From 159b4908fa0c38218ed4b28db90be612daa69599 Mon Sep 17 00:00:00 2001
From: Eric Bezzam
Date: Thu, 3 Apr 2025 08:47:55 +0200
Subject: [PATCH 5/5] Update use_huggingface_dataset.py

---
 examples/use_huggingface_dataset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/use_huggingface_dataset.py b/examples/use_huggingface_dataset.py
index 73e0a12..da78de2 100644
--- a/examples/use_huggingface_dataset.py
+++ b/examples/use_huggingface_dataset.py
@@ -3,7 +3,7 @@
 from the script examples/create_huggingface_dataset.py
 
 The dataset is available at:
-https://huggingface.co/bezzam/dummy-dataset
+https://huggingface.co/datasets/bezzam/dummy-dataset
 
 ```bash
 # install