From da2b7d912b82a80c1d78c99e9a4a7ed0145096d5 Mon Sep 17 00:00:00 2001
From: knoriy <knoriy@ip-172-31-95-146.ec2.internal>
Date: Fri, 24 Jun 2022 15:20:20 +0000
Subject: [PATCH 01/40] added new dataset

---
 .gitignore | 3 ++-
 README.md  | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)
diff --git a/.gitignore b/.gitignore
index b323e78..7369905 100644
--- a/.gitignore
+++ b/.gitignore
@@ -133,4 +133,5 @@ dmypy.json
 utils/__pycache__/make_tar_utils.cpython-37.pyc
 /data_preprocess/process_audioset/
 
-*.out
\ No newline at end of file
+*.out
+test.py
\ No newline at end of file
diff --git a/README.md b/README.md
index 9e5bc5a..9a97161 100644
--- a/README.md
+++ b/README.md
@@ -202,6 +202,7 @@ If you contribute to process a new dataset, please move the final webdataset to
 | Fine-grained Vocal Imitation Set                 | This dataset includes 763 crowd-sourced vocal imitations of 108 sound events.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            |[Click here](https://zenodo.org/record/3538534)                                          | audio / sound effects |   |
 | VimSketch Dataset                                | VimSketch Dataset combines two publicly available datasets, created by the Interactive Audio Lab for the task of Query by Vocal Imitation (QBV). VimSketch contains 542 reference sounds (including a variety of animal sounds, musical snippets, and environmental noise samples) and 12,543 vocal imitations of those reference sounds with a minimum of 13 and a maximum of 37 vocal imitations per reference.                                                                                                                                                                                                        |[Click here](https://zenodo.org/record/2596911)                                          | audio / sound effects |   |
 | OtoMobile Dataset                                | OtoMobile dataset is a collection of recordings of failing car components, created by the Interactive Audio Lab at Northwestern University. OtoMobile consists of 65 recordings of vehicles with failing components, along with annotations.                                                                                                                                                                                                                                                                                                                                                                             |[Click here](https://zenodo.org/record/3382945) <br>(restricted access)                                |                       |   |
+| Canadian  French  Emotional (CaFE)  speech  dataset                                | This paper introduces the newly released Canadian French Emotional (CaFE) speech dataset and gives details about its design and content. This dataset contains six different sentences, pronounced by six male and six female actors, in six basic emotions plus one neutral emotion. The six basic emotions are acted in two different intensities. The audio is digitally recorded at high-resolution (192 kHz sampling rate, 24 bits per sample). This new dataset is freely available under a Creative Commons license (CC BY-NC-SA 4.0)                      |[Paper](https://dl.acm.org/doi/10.1145/3204949.3208121) [Download](https://zenodo.org/record/1478765)      | audio / transcription                       |   |
 
 ### Music Dataset List
 | Name                            | Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  | URL                                                                                                                                        | Text Type   | Status(location)  |

From b7f5044220b74ff65952f07b7c8123671468f75f Mon Sep 17 00:00:00 2001
From: knoriy <knoriy@ip-172-31-95-146.ec2.internal>
Date: Mon, 27 Jun 2022 09:30:53 +0000
Subject: [PATCH 02/40] added CoVoST

---
 .gitignore | 2 +-
 README.md  | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 7369905..938f3a2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -134,4 +134,4 @@ utils/__pycache__/make_tar_utils.cpython-37.pyc
 /data_preprocess/process_audioset/
 
 *.out
-test.py
\ No newline at end of file
+test.*py*
\ No newline at end of file
diff --git a/README.md b/README.md
index 9a97161..1a6fdbf 100644
--- a/README.md
+++ b/README.md
@@ -203,8 +203,10 @@ If you contribute to process a new dataset, please move the final webdataset to
 | VimSketch Dataset                                | VimSketch Dataset combines two publicly available datasets, created by the Interactive Audio Lab for the task of Query by Vocal Imitation (QBV). VimSketch contains 542 reference sounds (including a variety of animal sounds, musical snippets, and environmental noise samples) and 12,543 vocal imitations of those reference sounds with a minimum of 13 and a maximum of 37 vocal imitations per reference.                                                                                                                                                                                                        |[Click here](https://zenodo.org/record/2596911)                                          | audio / sound effects |   |
 | OtoMobile Dataset                                | OtoMobile dataset is a collection of recordings of failing car components, created by the Interactive Audio Lab at Northwestern University. OtoMobile consists of 65 recordings of vehicles with failing components, along with annotations.                                                                                                                                                                                                                                                                                                                                                                             |[Click here](https://zenodo.org/record/3382945) <br>(restricted access)                                |                       |   |
 | Canadian  French  Emotional (CaFE)  speech  dataset                                | This paper introduces the newly released Canadian French Emotional (CaFE) speech dataset and gives details about its design and content. This dataset contains six different sentences, pronounced by six male and six female actors, in six basic emotions plus one neutral emotion. The six basic emotions are acted in two different intensities. The audio is digitally recorded at high-resolution (192 kHz sampling rate, 24 bits per sample). This new dataset is freely available under a Creative Commons license (CC BY-NC-SA 4.0)                      |[Paper](https://dl.acm.org/doi/10.1145/3204949.3208121) [Download](https://zenodo.org/record/1478765)      | audio / transcription                       |   |
+|CoVoST: A Large-Scale Multilingual Speech-To-Text Translation Corpus                                | we created CoVoST, a large-scale multilingual ST corpus based on Common Voice, to foster ST research with the largest ever open dataset. Its latest version covers translations from English into 15 languages---Arabic, Catalan, Welsh, German, Estonian, Persian, Indonesian, Japanese, Latvian, Mongolian, Slovenian, Swedish, Tamil, Turkish, Chinese---and from 21 languages into English, including the 15 target languages as well as Spanish, French, Italian, Dutch, Portuguese, Russian. It has total 2,880 hours of speech and is diversified with 78K speakers.                      |[Paper](https://arxiv.org/pdf/2007.10310.pdf) [Download](https://github.com/facebookresearch/covost)      | audio / transcription                       |   |
 
 ### Music Dataset List
+
 | Name                            | Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  | URL                                                                                                                                        | Text Type   | Status(location)  |
 |---------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------                                                                                                                                                                                                                                                                                                                                                                                                                    |--------------------------------------------------------------------------                                                                  |-------------|-------------------|
 |        Free Music Archive       | We introduce the Free Music Archive (FMA), an open and easily accessible dataset suitable for evaluating several tasks in MIR, a field concerned with browsing, searching, and organizing large music collections. The community's growing interest in feature and end-to-end learning is however restrained by the limited availability of large audio datasets. The FMA aims to overcome this hurdle by providing 917 GiB and 343 days of Creative Commons-licensed audio from 106,574 tracks from 16,341 artists and 14,854 albums, arranged in a hierarchical taxonomy of 161 genres. It provides full-length and high-quality audio, pre-computed features, together with track- and user-level metadata, tags, and free-form text such as biographies. We here describe the dataset and how it was created, propose a train/validation/test split and three subsets, discuss some suitable MIR tasks, and evaluate some baselines for genre recognition. Code, data, and usage examples are available at https://github.com/mdeff/fma. | [Click here](https://academictorrents.com/details/dba20c45d4d6fa6453a4e99d2f8a4817893cfb94)                                                | Music |                         |

From 014f519062d729354be3174ec84308e64eebebfa Mon Sep 17 00:00:00 2001
From: knoriy <knoriy@ip-172-31-95-146.ec2.internal>
Date: Mon, 27 Jun 2022 09:33:20 +0000
Subject: [PATCH 03/40] removed duplicate dataset

---
 README.md | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 1a6fdbf..6b95874 100644
--- a/README.md
+++ b/README.md
@@ -193,7 +193,7 @@ If you contribute to process a new dataset, please move the final webdataset to
 | surfing ai                                       | 30k+ - proprietary                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       |[click here](http://www.surfing.ai/speech-data/)                                                      |   |   |
 | LibriSpeech                                      | a collection of approximately 1,000 hours of audiobooks that are a part of the LibriVox project.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         |[click here](https://paperswithcode.com/dataset/librispeech)                                          |   |   |
 | Europarl-ST                                      | a Multilingual Speech Translation Corpus, that contains paired audio-text samples for Speech Translation, constructed using the debates carried out in the European Parliament in the period between 2008 and 2012.                                                                                                                                                                                                                                                                                                                                                                                                      |[click here](https://www.mllp.upv.es/europarl-st/)                                                    |   |   |
-| CoVoST                                           | a large-scale multilingual ST corpus based on Common Voice, to foster ST research with the largest ever open dataset. Its latest version covers translations from English into 15 languages---Arabic, Catalan, Welsh, German, Estonian, Persian, Indonesian, Japanese, Latvian, Mongolian, Slovenian, Swedish, Tamil, Turkish, Chinese---and from 21 languages into English, including the 15 target languages as well as Spanish, French, Italian, Dutch, Portuguese, Russian. It has total 2,880 hours of speech and is diversified with 78K speakers.                                                                 |[Click here](https://github.com/facebookresearch/covost)                                      |                       |   |
+| CoVoST                                           | a large-scale multilingual ST corpus based on Common Voice, to foster ST research with the largest ever open dataset. Its latest version covers translations from English into 15 languages---Arabic, Catalan, Welsh, German, Estonian, Persian, Indonesian, Japanese, Latvian, Mongolian, Slovenian, Swedish, Tamil, Turkish, Chinese---and from 21 languages into English, including the 15 target languages as well as Spanish, French, Italian, Dutch, Portuguese, Russian. It has total 2,880 hours of speech and is diversified with 78K speakers.                                                                 |[Paper](https://arxiv.org/pdf/2007.10310.pdf) [Download](https://github.com/facebookresearch/covost)                                      |                       |   |
 | GigaSpeech                                       | an evolving, multi-domain English speech recognition corpus with 10,000 hours of high quality labeled audio suitable for supervised training, and 40,000 hours of total audio suitable for semi-supervised and unsupervised training.                                                                                                                                                                                                                                                                                                                                                                                    |[Click here](https://github.com/speechcolab/gigaspeech)                                      |                       |   |
 | LJSpeech Dataset                                 | This is a public domain speech dataset consisting of 13,100 short audio clips of a single speaker reading passages from 7 non-fiction books. A transcription is provided for each clip. Clips vary in length from 1 to 10 seconds and have a total length of approximately 24 hours.                                                                                                                                                                                                                                                                                                                                     |[Click here](https://keithito.com/lj-speech-dataset/)  <br> Or <br> [download](https://data.keithito.com/data/speech/ljspeech-1.1.tar.bz2)  |                       |   |
 | Spotify English-Language Podcast Dataset         | This dataset consists of 100,000 episodes from different podcast shows on Spotify. The dataset is available for research purposes. We are releasing this dataset more widely to facilitate research on podcasts through the lens of speech and audio technology, natural language processing, information retrieval, and linguistics. The dataset contains about 50,000 hours of audio, and over 600 million transcribed words. The episodes span a variety of lengths, topics, styles, and qualities. Only non-commercial research is permitted on this dataset                                                         |[Click here](https://podcastsdataset.byspotify.com/)                                        |                       |   |
@@ -203,7 +203,6 @@ If you contribute to process a new dataset, please move the final webdataset to
 | VimSketch Dataset                                | VimSketch Dataset combines two publicly available datasets, created by the Interactive Audio Lab for the task of Query by Vocal Imitation (QBV). VimSketch contains 542 reference sounds (including a variety of animal sounds, musical snippets, and environmental noise samples) and 12,543 vocal imitations of those reference sounds with a minimum of 13 and a maximum of 37 vocal imitations per reference.                                                                                                                                                                                                        |[Click here](https://zenodo.org/record/2596911)                                          | audio / sound effects |   |
 | OtoMobile Dataset                                | OtoMobile dataset is a collection of recordings of failing car components, created by the Interactive Audio Lab at Northwestern University. OtoMobile consists of 65 recordings of vehicles with failing components, along with annotations.                                                                                                                                                                                                                                                                                                                                                                             |[Click here](https://zenodo.org/record/3382945) <br>(restricted access)                                |                       |   |
 | Canadian  French  Emotional (CaFE)  speech  dataset                                | This paper introduces the newly released Canadian French Emotional (CaFE) speech dataset and gives details about its design and content. This dataset contains six different sentences, pronounced by six male and six female actors, in six basic emotions plus one neutral emotion. The six basic emotions are acted in two different intensities. The audio is digitally recorded at high-resolution (192 kHz sampling rate, 24 bits per sample). This new dataset is freely available under a Creative Commons license (CC BY-NC-SA 4.0)                      |[Paper](https://dl.acm.org/doi/10.1145/3204949.3208121) [Download](https://zenodo.org/record/1478765)      | audio / transcription                       |   |
-|CoVoST: A Large-Scale Multilingual Speech-To-Text Translation Corpus                                | we created CoVoST, a large-scale multilingual ST corpus based on Common Voice, to foster ST research with the largest ever open dataset. Its latest version covers translations from English into 15 languages---Arabic, Catalan, Welsh, German, Estonian, Persian, Indonesian, Japanese, Latvian, Mongolian, Slovenian, Swedish, Tamil, Turkish, Chinese---and from 21 languages into English, including the 15 target languages as well as Spanish, French, Italian, Dutch, Portuguese, Russian. It has total 2,880 hours of speech and is diversified with 78K speakers.                      |[Paper](https://arxiv.org/pdf/2007.10310.pdf) [Download](https://github.com/facebookresearch/covost)      | audio / transcription                       |   |
 
 ### Music Dataset List
 

From 50305e45b6ca0bc7efef56aad5f0ffd1156ccaec Mon Sep 17 00:00:00 2001
From: knoriy <knoriy@ip-172-31-95-146.ec2.internal>
Date: Wed, 29 Jun 2022 08:24:19 +0000
Subject: [PATCH 04/40] added to do list

---
 current_dataset/ToDO.md | 6 ++++++
 1 file changed, 6 insertions(+)
 create mode 100644 current_dataset/ToDO.md

diff --git a/current_dataset/ToDO.md b/current_dataset/ToDO.md
new file mode 100644
index 0000000..2cf9838
--- /dev/null
+++ b/current_dataset/ToDO.md
@@ -0,0 +1,6 @@
+# ToDo
+
+- [X] LJSpeech
+- [X] MSWC
+- [ ] GigaSpeech
+- [ ] CoVoST

From 0d911c5adde4f344898a3535f255df06c03e48f5 Mon Sep 17 00:00:00 2001
From: knoriy <knoriy@ip-172-31-95-146.ec2.internal>
Date: Wed, 29 Jun 2022 10:58:33 +0000
Subject: [PATCH 05/40] First commit

---
 current_dataset/preprocess_CoVoST.py | 79 ++++++++++++++++++++++++++++
 1 file changed, 79 insertions(+)
 create mode 100644 current_dataset/preprocess_CoVoST.py

diff --git a/current_dataset/preprocess_CoVoST.py b/current_dataset/preprocess_CoVoST.py
new file mode 100644
index 0000000..84d3a57
--- /dev/null
+++ b/current_dataset/preprocess_CoVoST.py
@@ -0,0 +1,79 @@
+"""
+Code for preprocess LJSpeech Corpus:
+https://keithito.com/LJ-Speech-Dataset/
+"""
+
+import glob
+import tqdm
+import os
+import glob
+import pandas as pd
+import sys
+import tarfile
+import json
+import shutil
+import fsspec
+
+from sklearn.model_selection import train_test_split
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
+from utils.audio_utils import audio_to_flac
+from utils.make_tar_utils import tardir
+
+def convert_and_json_dump(file:str, dest:str, df):
+    audio_to_flac(file, dest)
+    with open(dest.replace('.flac', '.json'), 'w') as f:
+        json.dump({'filename': os.path.join(*dest.split('/')[4:]), 'text':df['norm_text'], 'tag':{'raw_text':df['raw_text']}}, f)
+
+
+def split_all_audio_files(df, dest_root_path, max_workers=96):
+    if not os.path.exists(dest_root_path):
+        raise FileNotFoundError(f'Please Check {dest_root_path} exists')
+
+    l = len(df)
+    with tqdm.tqdm(total=l, desc=f'Processing {dest_root_path}') as pbar:
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            threads = [executor.submit(convert_and_json_dump, row[0], os.path.join(dest_root_path, f'{i}.flac'), row) for i, row in enumerate(df.iloc())]
+            for _ in as_completed(threads):
+                pbar.update(1)
+
+if __name__ == '__main__':
+    import multiprocessing
+
+    max_workers = multiprocessing.cpu_count()
+    chunk = 512
+    generate_subset_tsv = True
+
+    root_path = '/home/knoriy/datasets/raw_datasets/ljspeech/'
+    tar_dir = "/home/knoriy/datasets/raw_datasets/ljspeech/LJSpeech-1.1.tar.bz2"
+    metadata_dir = "/home/knoriy/datasets/raw_datasets/ljspeech/metadata.csv"
+
+    dataset_name = 'ljspeech'
+
+    s3 = fsspec.filesystem('s3')
+    s3_dest = f's-laion/knoriy/LJSpeech/{dataset_name}_tars/'
+
+    # load metadata and configure audio paths
+    df = pd.read_csv(metadata_dir, header=None, names=['path', 'raw_text', 'norm_text'], sep='|')
+    for i in df.iloc():
+        i[0] = f'{os.path.join(root_path, "wavs", i[0])+".wav"}'
+
+    # create train, test, valid splits
+    train, test = train_test_split(df, test_size=0.2)
+    valid, test = train_test_split(test, test_size=0.2)
+    train_test_val = {'train/':train, 'test/':test, 'valid/':valid}
+
+    
+    for key in tqdm.tqdm(train_test_val, desc=f'processing:'):
+        df = train_test_val[key]
+        
+        dest_path = os.path.join(root_path.replace('raw_datasets', 'processed_datasets'),key )
+        os.makedirs(dest_path, exist_ok=True)
+
+        split_all_audio_files(df, dest_path)
+        tardir(dest_path, dest_path, chunk, delete_file=True)
+
+        # upload to s3 and delete local
+        s3.put(dest_path, os.path.join(s3_dest, key), recursive=True)
+        shutil.rmtree(dest_path)
\ No newline at end of file

From ef4dfcb58286e6e2d5fea62d690d80e4764b9e37 Mon Sep 17 00:00:00 2001
From: knoriy <knoriy@ip-172-31-95-146.ec2.internal>
Date: Wed, 29 Jun 2022 13:17:38 +0000
Subject: [PATCH 06/40] _

---
 current_dataset/preprocess_CoVoST.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/current_dataset/preprocess_CoVoST.py b/current_dataset/preprocess_CoVoST.py
index 84d3a57..8165bd6 100644
--- a/current_dataset/preprocess_CoVoST.py
+++ b/current_dataset/preprocess_CoVoST.py
@@ -39,6 +39,9 @@ def split_all_audio_files(df, dest_root_path, max_workers=96):
                 pbar.update(1)
 
 if __name__ == '__main__':
+    raise NotImplementedError("This processing CoVoST is nto implemented yet")
+
+
     import multiprocessing
 
     max_workers = multiprocessing.cpu_count()
@@ -64,7 +67,7 @@ def split_all_audio_files(df, dest_root_path, max_workers=96):
     valid, test = train_test_split(test, test_size=0.2)
     train_test_val = {'train/':train, 'test/':test, 'valid/':valid}
 
-    
+
     for key in tqdm.tqdm(train_test_val, desc=f'processing:'):
         df = train_test_val[key]
         

From 59c0f49a4e2d118b7310d717858b76093f3756c3 Mon Sep 17 00:00:00 2001
From: knoriy <knoriy@ip-172-31-36-230.ec2.internal>
Date: Thu, 7 Jul 2022 09:54:12 +0000
Subject: [PATCH 07/40] Updated yaml to work with slurm

---
 current_dataset/preprocess_mswc.py |  6 ++---
 environment.yml                    | 42 +++++++++++++++++++++++++-----
 2 files changed, 39 insertions(+), 9 deletions(-)

diff --git a/current_dataset/preprocess_mswc.py b/current_dataset/preprocess_mswc.py
index 70cf3f0..8aa5ae3 100644
--- a/current_dataset/preprocess_mswc.py
+++ b/current_dataset/preprocess_mswc.py
@@ -44,12 +44,12 @@ def split_all_audio_files(df, src_root_path, dest_root_path, max_workers=96):
     chunk = 512
     generate_subset_tsv = True
 
-    root_path = '/home/knoriy/datasets/raw_datasets/mswc/'
-    tar_dir = "/home/knoriy/datasets/raw_datasets/mswc/mswc.tar.gz"
+    root_path = '/home/knoriy/fsx/raw_datasets/mswc/'
+    tar_dir = "/home/knoriy/fsx/raw_datasets/mswc/mswc.tar.gz"
     dataset_name = 'mswc'
 
     s3 = fsspec.filesystem('s3')
-    s3_dest = f's-laion/multilingual_spoken_words/{dataset_name}_tars/'
+    s3_dest = f's-laion/knoriy/mswc/{dataset_name}_tars/'
 
     language_tars_dirs = sorted(glob.glob(os.path.join(root_path, "audio/**.tar.gz")))
     if not language_tars_dirs:
diff --git a/environment.yml b/environment.yml
index eb59b44..e036fab 100644
--- a/environment.yml
+++ b/environment.yml
@@ -1,10 +1,12 @@
 name: audio_dataset
 channels:
   - pytorch
+  - conda-forge
   - defaults
 dependencies:
   - _libgcc_mutex=0.1=main
   - _openmp_mutex=5.1=1_gnu
+  - absl-py=1.1.0=pyhd8ed1ab_0
   - aiobotocore=2.1.0=pyhd3eb1b0_0
   - aiohttp=3.8.1=py39h7f8727e_1
   - aioitertools=0.7.1=pyhd3eb1b0_0
@@ -14,28 +16,37 @@ dependencies:
   - attrs=21.4.0=pyhd3eb1b0_0
   - backcall=0.2.0=pyhd3eb1b0_0
   - blas=1.0=mkl
+  - blinker=1.4=py_1
   - botocore=1.23.24=pyhd3eb1b0_0
   - bottleneck=1.3.4=py39hce1f21e_0
   - brotlipy=0.7.0=py39h27cfd23_1003
   - bzip2=1.0.8=h7b6447c_0
-  - ca-certificates=2022.4.26=h06a4308_0
-  - certifi=2022.5.18.1=py39h06a4308_0
+  - c-ares=1.18.1=h7f98852_0
+  - ca-certificates=2022.6.15=ha878542_0
+  - cachetools=5.0.0=pyhd8ed1ab_0
+  - certifi=2022.6.15=py39hf3d152e_0
   - cffi=1.15.0=py39hd667e15_1
   - charset-normalizer=2.0.4=pyhd3eb1b0_0
+  - click=8.1.3=py39hf3d152e_0
   - cryptography=37.0.1=py39h9ce1e76_0
   - cudatoolkit=11.3.1=h2bc3f7f_2
   - debugpy=1.5.1=py39h295c915_0
   - decorator=5.1.1=pyhd3eb1b0_0
   - entrypoints=0.4=py39h06a4308_0
   - executing=0.8.3=pyhd3eb1b0_0
-  - ffmpeg=4.3=hf484d3e_0
+  - ffmpeg=4.3.2=hca11adc_0
   - freetype=2.11.0=h70c0345_0
   - frozenlist=1.2.0=py39h7f8727e_0
   - fsspec=2022.1.0=pyhd3eb1b0_0
+  - future=0.18.2=py39hf3d152e_5
   - giflib=5.2.1=h7b6447c_0
   - gmp=6.2.1=h295c915_3
   - gnutls=3.6.15=he1e5248_0
+  - google-auth=2.9.0=pyh6c4a22f_0
+  - google-auth-oauthlib=0.4.6=pyhd8ed1ab_0
+  - grpcio=1.42.0=py39hce63b2e_0
   - idna=3.3=pyhd3eb1b0_0
+  - importlib-metadata=4.11.4=py39hf3d152e_0
   - intel-openmp=2021.4.0=h06a4308_3561
   - ipykernel=6.9.1=py39h06a4308_0
   - ipython=8.3.0=py39h06a4308_0
@@ -53,6 +64,7 @@ dependencies:
   - libiconv=1.16=h7f8727e_2
   - libidn2=2.3.2=h7f8727e_0
   - libpng=1.6.37=hbc83047_0
+  - libprotobuf=3.15.8=h780b84a_1
   - libsodium=1.0.18=h7b6447c_0
   - libstdcxx-ng=11.2.0=h1234567_1
   - libtasn1=4.16.0=h27cfd23_0
@@ -62,6 +74,7 @@ dependencies:
   - libwebp=1.2.2=h55f646e_0
   - libwebp-base=1.2.2=h7f8727e_0
   - lz4-c=1.9.3=h295c915_1
+  - markdown=3.3.7=pyhd8ed1ab_0
   - matplotlib-inline=0.1.2=pyhd3eb1b0_2
   - mkl=2021.4.0=h06a4308_640
   - mkl-service=2.4.0=py39h7f8727e_0
@@ -74,8 +87,9 @@ dependencies:
   - numexpr=2.8.1=py39h6abb31d_0
   - numpy=1.22.3=py39he7a7128_0
   - numpy-base=1.22.3=py39hf524024_0
+  - oauthlib=3.2.0=pyhd8ed1ab_0
   - openh264=2.1.1=h4ff587b_0
-  - openssl=1.1.1o=h7f8727e_0
+  - openssl=1.1.1p=h5eee18b_0
   - packaging=21.3=pyhd3eb1b0_0
   - pandas=1.4.2=py39h295c915_0
   - parso=0.8.3=pyhd3eb1b0_0
@@ -84,26 +98,39 @@ dependencies:
   - pillow=9.0.1=py39h22f2fdc_0
   - pip=21.2.4=py39h06a4308_0
   - prompt-toolkit=3.0.20=pyhd3eb1b0_0
+  - protobuf=3.15.8=py39he80948d_0
   - ptyprocess=0.7.0=pyhd3eb1b0_2
   - pure_eval=0.2.2=pyhd3eb1b0_0
+  - pyasn1=0.4.8=py_0
+  - pyasn1-modules=0.2.7=py_0
   - pycparser=2.21=pyhd3eb1b0_0
   - pygments=2.11.2=pyhd3eb1b0_0
+  - pyjwt=2.4.0=pyhd8ed1ab_0
   - pyopenssl=22.0.0=pyhd3eb1b0_0
   - pyparsing=3.0.4=pyhd3eb1b0_0
   - pysocks=1.7.1=py39h06a4308_0
   - python=3.9.12=h12debd9_1
   - python-dateutil=2.8.2=pyhd3eb1b0_0
+  - python_abi=3.9=2_cp39
   - pytorch=1.11.0=py3.9_cuda11.3_cudnn8.2.0_0
+  - pytorch-lightning=0.8.5=py_0
   - pytorch-mutex=1.0=cuda
   - pytz=2022.1=py39h06a4308_0
+  - pyu2f=0.1.5=pyhd8ed1ab_0
+  - pyyaml=6.0=py39hb9d737c_4
   - pyzmq=22.3.0=py39h295c915_2
   - readline=8.1.2=h7f8727e_1
   - requests=2.27.1=pyhd3eb1b0_0
+  - requests-oauthlib=1.3.1=pyhd8ed1ab_0
+  - rsa=4.8=pyhd8ed1ab_0
   - s3fs=2022.1.0=pyhd3eb1b0_0
   - setuptools=61.2.0=py39h06a4308_0
   - six=1.16.0=pyhd3eb1b0_1
   - sqlite=3.38.3=hc218d9a_0
   - stack_data=0.2.0=pyhd3eb1b0_0
+  - tensorboard=2.9.1=pyhd8ed1ab_0
+  - tensorboard-data-server=0.6.0=py39hd97740a_2
+  - tensorboard-plugin-wit=1.8.1=pyhd8ed1ab_0
   - tk=8.6.12=h1ccaba5_0
   - torchaudio=0.11.0=py39_cu113
   - torchvision=0.12.0=py39_cu113
@@ -115,11 +142,15 @@ dependencies:
   - tzdata=2022a=hda174b7_0
   - urllib3=1.26.9=py39h06a4308_0
   - wcwidth=0.2.5=pyhd3eb1b0_0
+  - werkzeug=2.1.2=pyhd8ed1ab_1
   - wheel=0.37.1=pyhd3eb1b0_0
   - wrapt=1.13.3=py39h7f8727e_2
+  - x264=1!161.3030=h7f98852_1
   - xz=5.2.5=h7f8727e_1
+  - yaml=0.2.5=h7f98852_2
   - yarl=1.6.3=py39h27cfd23_0
   - zeromq=4.3.4=h2531618_0
+  - zipp=3.8.0=pyhd8ed1ab_0
   - zlib=1.2.12=h7f8727e_2
   - zstd=1.5.2=ha4553b6_0
   - pip:
@@ -131,11 +162,10 @@ dependencies:
     - llvmlite==0.38.1
     - numba==0.55.2
     - pooch==1.6.0
-    - pyyaml==6.0
     - resampy==0.2.2
     - scikit-learn==1.1.1
     - scipy==1.8.1
     - soundfile==0.10.3.post1
     - threadpoolctl==3.1.0
     - webdataset==0.2.5
-prefix: /home/knoriy/miniconda3/envs/audio_dataset
+prefix: /home/knoriy/fsx/miniconda3/envs/audio_dataset

From fb95086c852ac3d85f960e64bbdce73d8e611197 Mon Sep 17 00:00:00 2001
From: knoriy <knoriy@ip-172-31-36-230.ec2.internal>
Date: Thu, 7 Jul 2022 10:08:05 +0000
Subject: [PATCH 08/40] move yml file back to root as it is being used by other
 scripts outside of data_preprocess

---
 data_preprocess/environment.yml => environment.yml | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename data_preprocess/environment.yml => environment.yml (100%)

diff --git a/data_preprocess/environment.yml b/environment.yml
similarity index 100%
rename from data_preprocess/environment.yml
rename to environment.yml

From 0ac7933dea7e8b09f67fb99664c7b9356054ac10 Mon Sep 17 00:00:00 2001
From: knoriy <knoriy@ip-172-31-36-230.ec2.internal>
Date: Thu, 7 Jul 2022 10:45:53 +0000
Subject: [PATCH 09/40] _

---
 current_dataset/preprocess_mswc.py | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/current_dataset/preprocess_mswc.py b/current_dataset/preprocess_mswc.py
index 8aa5ae3..7a318b8 100644
--- a/current_dataset/preprocess_mswc.py
+++ b/current_dataset/preprocess_mswc.py
@@ -56,15 +56,22 @@ def split_all_audio_files(df, src_root_path, dest_root_path, max_workers=96):
         raise FileNotFoundError(f"Please check that the file have been extracted: {root_path}")
 
     for dir in tqdm.tqdm(language_tars_dirs, desc=f'processing: '):
-        audio_path = dir
-        with tarfile.open(audio_path, mode='r:gz') as mswc_audio:
+        if dir == '/home/knoriy/fsx/raw_datasets/mswc/audio/en.tar.gz':
+            audio_path = dir
             audio_path = os.path.split(audio_path)[0]
-            mswc_audio.extractall(audio_path)
 
-        splits_path = dir.replace('audio', 'splits')
-        with tarfile.open(splits_path, mode='r:gz') as mswc_split:
+            splits_path = dir.replace('audio', 'splits')
             splits_path = splits_path.replace('.tar.gz', '/')
-            mswc_split.extractall(splits_path)
+        else:
+            audio_path = dir
+            with tarfile.open(audio_path, mode='r:gz') as mswc_audio:
+                audio_path = os.path.split(audio_path)[0]
+                mswc_audio.extractall(audio_path)
+
+            splits_path = dir.replace('audio', 'splits')
+            with tarfile.open(splits_path, mode='r:gz') as mswc_split:
+                splits_path = splits_path.replace('.tar.gz', '/')
+                mswc_split.extractall(splits_path)
 
         tmp = glob.glob(os.path.join(splits_path, '**.csv'), recursive=True)
         csv_paths = []

From 6d6f929f815b15d7b5f0fed65430b16bf0d919fe Mon Sep 17 00:00:00 2001
From: knoriy
 <knoriy@compute-od-cpu-dy-r6i-32xlarge-1.hpc-1click-production.pcluster>
Date: Fri, 8 Jul 2022 10:58:24 +0000
Subject: [PATCH 10/40] renamed file

---
 .../{process_GigaSpeech.py => preprocess_GigaSpeech.py}     | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)
 rename current_dataset/{process_GigaSpeech.py => preprocess_GigaSpeech.py} (94%)

diff --git a/current_dataset/process_GigaSpeech.py b/current_dataset/preprocess_GigaSpeech.py
similarity index 94%
rename from current_dataset/process_GigaSpeech.py
rename to current_dataset/preprocess_GigaSpeech.py
index 947b2a5..5ffd2f0 100644
--- a/current_dataset/process_GigaSpeech.py
+++ b/current_dataset/preprocess_GigaSpeech.py
@@ -45,8 +45,8 @@ def split_all_audio_files(df, dest_root_path, max_workers=96):
     max_workers = 2
     chunk = 512
 
-    root_path = '/mnt/knoriy/raw_datasets/gigaspeech/'
-    metadata_dir = "/mnt/knoriy/raw_datasets/gigaspeech/GigaSpeech.json"
+    root_path = '/home/knoriy/fsx/raw_datasets/GigaSpeech/gigaspeech/'
+    metadata_dir = "/home/knoriy/fsx/raw_datasets/GigaSpeech/gigaspeech/GigaSpeech.json"
 
     dataset_name = 'gigaspeech'
 
@@ -54,7 +54,7 @@ def split_all_audio_files(df, dest_root_path, max_workers=96):
     s3_dest = f's-laion/knoriy/GigaSpeech/{dataset_name}_tars/'
 
     # load metadata and configure audio paths
-    raw_df = pd.read_json(metadata_dir)[:2]
+    raw_df = pd.read_json(metadata_dir)
 
     new_df_data = []
     for row in tqdm.tqdm(raw_df.iloc(), total=len(raw_df), desc='Generating dataframe: '):

From eacade6db51c465b358c5f19bd944e59c70790e1 Mon Sep 17 00:00:00 2001
From: knoriy <knoriy@ip-172-31-36-230.ec2.internal>
Date: Fri, 8 Jul 2022 11:47:43 +0000
Subject: [PATCH 11/40] removed unwanted text

---
 current_dataset/preprocess_GigaSpeech.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/current_dataset/preprocess_GigaSpeech.py b/current_dataset/preprocess_GigaSpeech.py
index 5ffd2f0..00237dc 100644
--- a/current_dataset/preprocess_GigaSpeech.py
+++ b/current_dataset/preprocess_GigaSpeech.py
@@ -64,7 +64,7 @@ def split_all_audio_files(df, dest_root_path, max_workers=96):
             except:
                 catagory = 'N/A'
             
-            if seg['text_tn'] == '<SIL>':
+            if seg['text_tn'] == '<SIL>' or seg['text_tn'] == '<NOISE>':
                 continue
 
             new_df_data.append(

From 43af9a7606fcccdd99288b7a00befcbe3127d837 Mon Sep 17 00:00:00 2001
From: knoriy <knoriy@ip-172-31-36-230.ec2.internal>
Date: Mon, 11 Jul 2022 14:30:18 +0000
Subject: [PATCH 12/40] _

---
 current_dataset/preprocess_GigaSpeech.py | 2 +-
 current_dataset/preprocess_LJSpeech.py   | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/current_dataset/preprocess_GigaSpeech.py b/current_dataset/preprocess_GigaSpeech.py
index 00237dc..629faaf 100644
--- a/current_dataset/preprocess_GigaSpeech.py
+++ b/current_dataset/preprocess_GigaSpeech.py
@@ -42,7 +42,7 @@ def split_all_audio_files(df, dest_root_path, max_workers=96):
     import multiprocessing
 
     max_workers = multiprocessing.cpu_count()
-    max_workers = 2
+    print("Num workers: ", max_workers)
     chunk = 512
 
     root_path = '/home/knoriy/fsx/raw_datasets/GigaSpeech/gigaspeech/'
diff --git a/current_dataset/preprocess_LJSpeech.py b/current_dataset/preprocess_LJSpeech.py
index 84d3a57..922e274 100644
--- a/current_dataset/preprocess_LJSpeech.py
+++ b/current_dataset/preprocess_LJSpeech.py
@@ -42,6 +42,7 @@ def split_all_audio_files(df, dest_root_path, max_workers=96):
     import multiprocessing
 
     max_workers = multiprocessing.cpu_count()
+    print("Num workers: ", max_workers)
     chunk = 512
     generate_subset_tsv = True
 

From a1576fe3dd7b1f489982af780fd28d660da50a8f Mon Sep 17 00:00:00 2001
From: knoriy <knoriy@ip-172-31-34-60.ec2.internal>
Date: Mon, 11 Jul 2022 15:36:10 +0000
Subject: [PATCH 13/40] First commit

---
 current_dataset/preprocess_RAVDESS.py | 82 +++++++++++++++++++++++++++
 1 file changed, 82 insertions(+)
 create mode 100644 current_dataset/preprocess_RAVDESS.py

diff --git a/current_dataset/preprocess_RAVDESS.py b/current_dataset/preprocess_RAVDESS.py
new file mode 100644
index 0000000..ffa2a33
--- /dev/null
+++ b/current_dataset/preprocess_RAVDESS.py
@@ -0,0 +1,82 @@
+"""
+Code for preprocess GigaSpeech Corpus:
+https://github.com/SpeechColab/GigaSpeech
+"""
+
+import glob
+import tqdm
+import os
+import glob
+import pandas as pd
+import sys
+import tarfile
+import json
+import shutil
+import fsspec
+
+from sklearn.model_selection import train_test_split
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
+from utils.audio_utils import audio_to_flac
+from utils.make_tar_utils import tardir
+
+def convert_and_json_dump(file:str, dest:str, df):
+    audio_to_flac(file, dest, segment_start=df['begin_time'], segment_end=df['end_time'])
+    with open(dest.replace('.flac', '.json'), 'w') as f:
+        json.dump({'filename': os.path.join(*dest.split('/')[5:]), 'text':df['text'], 'tag':df['tag']}, f)
+
+
+def split_all_audio_files(df, dest_root_path, max_workers=96):
+    if not os.path.exists(dest_root_path):
+        raise FileNotFoundError(f'Please Check {dest_root_path} exists')
+
+    l = len(df)
+    with tqdm.tqdm(total=l, desc=f'Processing {dest_root_path}') as pbar:
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            threads = [executor.submit(convert_and_json_dump, row[0], os.path.join(dest_root_path, f'{i}.flac'), row) for i, row in enumerate(df.iloc())]
+            for _ in as_completed(threads):
+                pbar.update(1)
+
+def create_df(root_path:str):
+    wavs = glob.glob(root_path)
+    return wavs
+
+
+if __name__ == '__main__':
+    import multiprocessing
+
+    max_workers = multiprocessing.cpu_count()
+    print("Num workers: ", max_workers)
+    chunk = 512
+
+    root_path = '/home/knoriy/fsx/raw_datasets/RAVDESS/ravdess/'
+    metadata_dir = "/home/knoriy/fsx/raw_datasets/RAVDESS/ravdess/GigaSpeech.json"
+
+    dataset_name = 'ravdess'
+
+    s3 = fsspec.filesystem('s3')
+    s3_dest = f's-laion/knoriy/RAVDESS/{dataset_name}_tars/'
+
+    # load metadata and configure audio paths
+    df = create_df(root_path)
+    print(df)
+
+    # # create train, test, valid splits
+    # train, test = train_test_split(df, test_size=0.2)
+    # valid, test = train_test_split(test, test_size=0.2)
+    # train_test_val = {'train/':train, 'test/':test, 'valid/':valid}
+
+    
+    # for key in tqdm.tqdm(train_test_val, desc=f'processing:'):
+    #     df = train_test_val[key]
+        
+    #     dest_path = os.path.join(root_path.replace('raw_datasets', 'processed_datasets'), key)
+    #     os.makedirs(dest_path, exist_ok=True)
+
+    #     split_all_audio_files(df, dest_path)
+    #     tardir(dest_path, dest_path, chunk, delete_file=True)
+
+    #     # upload to s3 and delete local
+    #     s3.put(dest_path, os.path.join(s3_dest, key), recursive=True)
+    #     shutil.rmtree(dest_path)
\ No newline at end of file

From 4e34bd3296f62ae5428c431ac9b24511d4ce546a Mon Sep 17 00:00:00 2001
From: knoriy <knoriy@ip-172-31-36-230.ec2.internal>
Date: Mon, 11 Jul 2022 17:22:32 +0000
Subject: [PATCH 14/40] added clause to avoid duplicate calculation

---
 current_dataset/preprocess_CoVoST.py     |  6 ++-
 current_dataset/preprocess_GigaSpeech.py |  6 ++-
 current_dataset/preprocess_LJSpeech.py   |  6 ++-
 current_dataset/preprocess_RAVDESS.py    | 68 ++++++++++++++++--------
 current_dataset/preprocess_mswc.py       |  6 ++-
 5 files changed, 65 insertions(+), 27 deletions(-)

diff --git a/current_dataset/preprocess_CoVoST.py b/current_dataset/preprocess_CoVoST.py
index 8165bd6..e18a771 100644
--- a/current_dataset/preprocess_CoVoST.py
+++ b/current_dataset/preprocess_CoVoST.py
@@ -21,7 +21,11 @@
 from utils.audio_utils import audio_to_flac
 from utils.make_tar_utils import tardir
 
-def convert_and_json_dump(file:str, dest:str, df):
+def convert_and_json_dump(file:str, dest:str, df, overwrite:bool=False):
+    if os.path.isfile(dest) and overwrite==False:
+        print(f'{dest} already exists, skiping')
+        return
+        
     audio_to_flac(file, dest)
     with open(dest.replace('.flac', '.json'), 'w') as f:
         json.dump({'filename': os.path.join(*dest.split('/')[4:]), 'text':df['norm_text'], 'tag':{'raw_text':df['raw_text']}}, f)
diff --git a/current_dataset/preprocess_GigaSpeech.py b/current_dataset/preprocess_GigaSpeech.py
index 629faaf..11ab473 100644
--- a/current_dataset/preprocess_GigaSpeech.py
+++ b/current_dataset/preprocess_GigaSpeech.py
@@ -21,7 +21,11 @@
 from utils.audio_utils import audio_to_flac
 from utils.make_tar_utils import tardir
 
-def convert_and_json_dump(file:str, dest:str, df):
+def convert_and_json_dump(file:str, dest:str, df, overwrite:bool=False):
+    if os.path.isfile(dest) and overwrite==False:
+        print(f'{dest} already exists, skiping')
+        return
+    
     audio_to_flac(file, dest, segment_start=df['begin_time'], segment_end=df['end_time'])
     with open(dest.replace('.flac', '.json'), 'w') as f:
         json.dump({'filename': os.path.join(*dest.split('/')[5:]), 'text':df['text'], 'tag':df['tag']}, f)
diff --git a/current_dataset/preprocess_LJSpeech.py b/current_dataset/preprocess_LJSpeech.py
index 922e274..849ac9c 100644
--- a/current_dataset/preprocess_LJSpeech.py
+++ b/current_dataset/preprocess_LJSpeech.py
@@ -21,7 +21,11 @@
 from utils.audio_utils import audio_to_flac
 from utils.make_tar_utils import tardir
 
-def convert_and_json_dump(file:str, dest:str, df):
+def convert_and_json_dump(file:str, dest:str, df, overwrite:bool=False):
+    if os.path.isfile(dest) and overwrite==False:
+        print(f'{dest} already exists, skiping')
+        return
+        
     audio_to_flac(file, dest)
     with open(dest.replace('.flac', '.json'), 'w') as f:
         json.dump({'filename': os.path.join(*dest.split('/')[4:]), 'text':df['norm_text'], 'tag':{'raw_text':df['raw_text']}}, f)
diff --git a/current_dataset/preprocess_RAVDESS.py b/current_dataset/preprocess_RAVDESS.py
index ffa2a33..60a7be7 100644
--- a/current_dataset/preprocess_RAVDESS.py
+++ b/current_dataset/preprocess_RAVDESS.py
@@ -21,10 +21,15 @@
 from utils.audio_utils import audio_to_flac
 from utils.make_tar_utils import tardir
 
-def convert_and_json_dump(file:str, dest:str, df):
-    audio_to_flac(file, dest, segment_start=df['begin_time'], segment_end=df['end_time'])
+def convert_and_json_dump(file:str, dest:str, df, overwrite:bool=False, verbose=False):
+    if os.path.isfile(dest) and overwrite==False:
+        if verbose==True:
+            print(f'{dest} already exists, skiping')
+        return
+
+    audio_to_flac(file, dest)
     with open(dest.replace('.flac', '.json'), 'w') as f:
-        json.dump({'filename': os.path.join(*dest.split('/')[5:]), 'text':df['text'], 'tag':df['tag']}, f)
+        json.dump({'filename': os.path.join(*dest.split('/')[5:]), 'text':df['text']}, f)
 
 
 def split_all_audio_files(df, dest_root_path, max_workers=96):
@@ -38,21 +43,39 @@ def split_all_audio_files(df, dest_root_path, max_workers=96):
             for _ in as_completed(threads):
                 pbar.update(1)
 
-def create_df(root_path:str):
-    wavs = glob.glob(root_path)
-    return wavs
+def create_df(root_path:str, dataset_name:str=None):
+    wavs = glob.glob(os.path.join(root_path, '**/*.wav'), recursive=True)
+    codes = {   'modality':{'01':'full-AV', '02':'video-only', '03':'audio-only'},
+                'Vocal channel':{'01':'speech', '02':'song'},
+                'Emotion':{'01':'neutral', '02':'calm', '03':'happy', '04':'sad', '05':'angry', '06':'fearful', '07':'disgust', '08':'surprised'},
+                'Emotional intensity':{'01':'normal', '02':'strong'},
+                'Statement':{'01':"Kids are talking by the door", '02':"Dogs are sitting by the door"},
+                'Repetition':{'01':1, '02':2},
+                }
+    df_data = []
+    for wav in tqdm.tqdm(wavs):
+        file_name = os.path.basename(wav).split('.')[0]
+        wav_codes = file_name.split('-')
+
+        text = []
+        for i, code in enumerate(codes.values()):
+            text.append(code[wav_codes[i]])
+
+        song_or_speech = 'says' if text[1] == 'speech' else 'sings'
+        text = f'A person {song_or_speech}, "{text[4]}" in a {text[2]} and {text[3]} voice.'
+        df_data.append({ 'path':wav, 'text':text})
+
+    return pd.DataFrame(df_data)
 
 
 if __name__ == '__main__':
     import multiprocessing
 
     max_workers = multiprocessing.cpu_count()
-    print("Num workers: ", max_workers)
+    # print("Num workers: ", max_workers)
     chunk = 512
 
     root_path = '/home/knoriy/fsx/raw_datasets/RAVDESS/ravdess/'
-    metadata_dir = "/home/knoriy/fsx/raw_datasets/RAVDESS/ravdess/GigaSpeech.json"
-
     dataset_name = 'ravdess'
 
     s3 = fsspec.filesystem('s3')
@@ -60,23 +83,22 @@ def create_df(root_path:str):
 
     # load metadata and configure audio paths
     df = create_df(root_path)
-    print(df)
 
-    # # create train, test, valid splits
-    # train, test = train_test_split(df, test_size=0.2)
-    # valid, test = train_test_split(test, test_size=0.2)
-    # train_test_val = {'train/':train, 'test/':test, 'valid/':valid}
+    # create train, test, valid splits
+    train, test = train_test_split(df, test_size=0.2)
+    valid, test = train_test_split(test, test_size=0.2)
+    train_test_val = {'train/':train, 'test/':test, 'valid/':valid}
 
     
-    # for key in tqdm.tqdm(train_test_val, desc=f'processing:'):
-    #     df = train_test_val[key]
+    for key in tqdm.tqdm(train_test_val, desc=f'processing:'):
+        df = train_test_val[key]
         
-    #     dest_path = os.path.join(root_path.replace('raw_datasets', 'processed_datasets'), key)
-    #     os.makedirs(dest_path, exist_ok=True)
+        dest_path = os.path.join(root_path.replace('raw_datasets', 'processed_datasets'), key)
+        os.makedirs(dest_path, exist_ok=True)
 
-    #     split_all_audio_files(df, dest_path)
-    #     tardir(dest_path, dest_path, chunk, delete_file=True)
+        split_all_audio_files(df, dest_path)
+        tardir(dest_path, dest_path, chunk, delete_file=True)
 
-    #     # upload to s3 and delete local
-    #     s3.put(dest_path, os.path.join(s3_dest, key), recursive=True)
-    #     shutil.rmtree(dest_path)
\ No newline at end of file
+        # upload to s3 and delete local
+        s3.put(dest_path, os.path.join(s3_dest, key), recursive=True)
+        shutil.rmtree(dest_path)
\ No newline at end of file
diff --git a/current_dataset/preprocess_mswc.py b/current_dataset/preprocess_mswc.py
index 7a318b8..ded30ee 100644
--- a/current_dataset/preprocess_mswc.py
+++ b/current_dataset/preprocess_mswc.py
@@ -20,7 +20,11 @@
 from utils.audio_utils import audio_to_flac
 from utils.make_tar_utils import tardir
 
-def convert_and_json_dump(file:str, dest:str, df):
+def convert_and_json_dump(file:str, dest:str, df, overwrite:bool=False):
+    if os.path.isfile(dest) and overwrite==False:
+        print(f'{dest} already exists, skiping')
+        return
+    
     audio_to_flac(file, dest)
     with open(dest.replace('.flac', '.json'), 'w') as f:
         json.dump({'filename': os.path.join(*dest.split('/')[4:]), 'text':df['WORD'], 'tag':{'gender':df['GENDER'], 'language':dest.split('/')[-2]}}, f)

From 68ca8ffad0e3221159d63738732c676592932cec Mon Sep 17 00:00:00 2001
From: knoriy <knoriy@ip-172-31-36-230.ec2.internal>
Date: Tue, 12 Jul 2022 13:52:59 +0000
Subject: [PATCH 15/40] Script to process CREMA-D

---
 current_dataset/preprocess_CREMA-D.py | 124 ++++++++++++++++++++++++++
 1 file changed, 124 insertions(+)
 create mode 100644 current_dataset/preprocess_CREMA-D.py

diff --git a/current_dataset/preprocess_CREMA-D.py b/current_dataset/preprocess_CREMA-D.py
new file mode 100644
index 0000000..de381cc
--- /dev/null
+++ b/current_dataset/preprocess_CREMA-D.py
@@ -0,0 +1,124 @@
+"""
+Code for preprocess GigaSpeech Corpus:
+https://github.com/SpeechColab/GigaSpeech
+"""
+
+import glob
+import tqdm
+import os
+import glob
+import pandas as pd
+import sys
+import tarfile
+import json
+import shutil
+import fsspec
+
+from sklearn.model_selection import train_test_split
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
+from utils.audio_utils import audio_to_flac
+from utils.make_tar_utils import tardir
+
+def convert_and_json_dump(file:str, dest:str, df, overwrite:bool=False, verbose=False):
+    if os.path.isfile(dest) and overwrite==False:
+        if verbose==True:
+            print(f'{dest} already exists, skiping')
+        return
+    audio_to_flac(file, dest)
+    with open(dest.replace('.flac', '.json'), 'w') as f:
+        json.dump({'filename': os.path.join(*dest.split('/')[5:]), 'text':df['text']}, f)
+
+
+def split_all_audio_files(df, dest_root_path, max_workers=96):
+    if not os.path.exists(dest_root_path):
+        raise FileNotFoundError(f'Please Check {dest_root_path} exists')
+
+    l = len(df)
+    with tqdm.tqdm(total=l, desc=f'Processing {dest_root_path}') as pbar:
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            threads = [executor.submit(convert_and_json_dump, row[0], os.path.join(dest_root_path, f'{i}.flac'), row) for i, row in enumerate(df.iloc())]
+            for _ in as_completed(threads):
+                pbar.update(1)
+
+def create_df(root_path:str, dataset_name:str=None):
+    wavs = glob.glob(os.path.join(root_path, '**/*.wav'), recursive=True)
+    codes = {   'Statement':{   'IEO':"It's eleven o'clock", 
+                                'TIE':"That is exactly what happened",
+                                'IOM':"I'm on my way to the meeting",
+                                'IWW':"I wonder what this is about",
+                                'TAI':"The airplane is almost full",
+                                'MTI':"Maybe tomorrow it will be cold",
+                                'IWL':"I would like a new alarm clock",
+                                'ITH':"I think I have a doctor's appointment",
+                                'DFA':"Don't forget a jacket",
+                                'ITS':"I think I've seen this before",
+                                'TSI':"The surface is slick",
+                                'WSI':"We'll stop in a couple of minutes",
+                                },
+                'Emotion':{     'ANG':'angery',
+                                'DIS':'disgusted',
+                                'FEA':'fearful',
+                                'HAP':'happy',
+                                'NEU':'neutral',
+                                'SAD':'sad',
+                        },
+                'Emotional intensity':{ 'LO':'Low', 
+                                        'MD':'Medium',
+                                        'HI':'High',
+                                        'XX':'Unspecified',
+                                        },
+                }
+    demographics = pd.read_csv('/home/knoriy/fsx/raw_datasets/CREMA-D/VideoDemographics.csv', names=["ActorID","Age","Sex","Race","Ethnicity"])
+    df_data = []
+    for wav in tqdm.tqdm(wavs):
+        file_name = os.path.basename(wav).split('.')[0]
+        wav_codes = file_name.split('_')
+        text_meta = [codes['Statement'][wav_codes[1]], codes['Emotion'][wav_codes[2]], codes['Emotional intensity'][wav_codes[3]]]
+        demograpthics_meta = demographics.loc[demographics['ActorID'] == wav_codes[0]]
+
+        male_or_female = 'woman' if demograpthics_meta["Sex"].values[0] == 'Female' else 'man'
+        intensity = '' if text_meta[2] == 'Unspecified' else f'and {text_meta[2]} '
+        text = f'A {male_or_female} saying "{text_meta[0]}" in a {text_meta[1]} {intensity}voice.'
+        df_data.append({ 'path':wav, 'text':text})
+
+    return pd.DataFrame(df_data)
+
+
+if __name__ == '__main__':
+    import multiprocessing
+
+    max_workers = multiprocessing.cpu_count()
+    print("Num workers: ", max_workers)
+    chunk = 512
+
+    root_path = '/home/knoriy/fsx/raw_datasets/CREMA-D/AudioWAV/'
+    dataset_name = 'crema-d'
+
+    s3 = fsspec.filesystem('s3')
+    s3_dest = f's-laion/knoriy/RAVDESS/{dataset_name}_tars/'
+
+    # load metadata and configure audio paths
+    df = create_df(root_path)
+    print(df.iloc()[1]['text'])
+
+    # create train, test, valid splits
+    train, test = train_test_split(df, test_size=0.2)
+    valid, test = train_test_split(test, test_size=0.2)
+    train_test_val = {'valid/':valid, 'train/':train, 'test/':test}
+
+    
+    for key in tqdm.tqdm(train_test_val, desc=f'processing:'):
+        df = train_test_val[key]
+        
+        dest_path = os.path.join(root_path.replace('raw_datasets', 'processed_datasets').replace('AudioWAV/', ''), key)
+        os.makedirs(dest_path, exist_ok=True)
+
+        split_all_audio_files(df, dest_path)
+        # tardir(dest_path, dest_path, chunk, delete_file=True)
+
+        # # upload to s3 and delete local
+        # s3.put(dest_path, os.path.join(s3_dest, key), recursive=True)
+        # shutil.rmtree(dest_path)
+        break
\ No newline at end of file

From caf739a58b37c178c9babdf41bece5ad61760ee7 Mon Sep 17 00:00:00 2001
From: knoriy <knoriy@ip-172-31-36-230.ec2.internal>
Date: Tue, 12 Jul 2022 16:12:34 +0000
Subject: [PATCH 16/40] fixed issue where files were saved in the wrong area

---
 current_dataset/preprocess_CREMA-D.py | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/current_dataset/preprocess_CREMA-D.py b/current_dataset/preprocess_CREMA-D.py
index de381cc..165d838 100644
--- a/current_dataset/preprocess_CREMA-D.py
+++ b/current_dataset/preprocess_CREMA-D.py
@@ -81,7 +81,7 @@ def create_df(root_path:str, dataset_name:str=None):
         male_or_female = 'woman' if demograpthics_meta["Sex"].values[0] == 'Female' else 'man'
         intensity = '' if text_meta[2] == 'Unspecified' else f'and {text_meta[2]} '
         text = f'A {male_or_female} saying "{text_meta[0]}" in a {text_meta[1]} {intensity}voice.'
-        df_data.append({ 'path':wav, 'text':text})
+        df_data.append({ 'path':wav, 'text':text, 'tag':{'transcript':text_meta[0], 'emotion':text_meta[1], 'gender':demograpthics_meta["Sex"].values[0], 'age':demograpthics_meta["Age"].values[0] }})
 
     return pd.DataFrame(df_data)
 
@@ -94,14 +94,13 @@ def create_df(root_path:str, dataset_name:str=None):
     chunk = 512
 
     root_path = '/home/knoriy/fsx/raw_datasets/CREMA-D/AudioWAV/'
-    dataset_name = 'crema-d'
+    dataset_name = 'CREMA-D'
 
     s3 = fsspec.filesystem('s3')
-    s3_dest = f's-laion/knoriy/RAVDESS/{dataset_name}_tars/'
+    s3_dest = f's-laion/knoriy/{dataset_name}/{dataset_name}_tars/'
 
     # load metadata and configure audio paths
     df = create_df(root_path)
-    print(df.iloc()[1]['text'])
 
     # create train, test, valid splits
     train, test = train_test_split(df, test_size=0.2)
@@ -116,9 +115,8 @@ def create_df(root_path:str, dataset_name:str=None):
         os.makedirs(dest_path, exist_ok=True)
 
         split_all_audio_files(df, dest_path)
-        # tardir(dest_path, dest_path, chunk, delete_file=True)
+        tardir(dest_path, dest_path, chunk, delete_file=True)
 
-        # # upload to s3 and delete local
-        # s3.put(dest_path, os.path.join(s3_dest, key), recursive=True)
-        # shutil.rmtree(dest_path)
-        break
\ No newline at end of file
+        # upload to s3 and delete local
+        s3.put(dest_path, os.path.join(s3_dest, key), recursive=True)
+        shutil.rmtree(dest_path)
\ No newline at end of file

From 890d63f67891809a25c3eedaeab7fdb55f9964e0 Mon Sep 17 00:00:00 2001
From: knoriy <knoriy@ip-172-31-36-230.ec2.internal>
Date: Wed, 13 Jul 2022 09:28:50 +0000
Subject: [PATCH 17/40] script to download appropriate tsv

---
 current_dataset/preprocess_CoVoST.py | 105 ++++++++++++++++++++-------
 1 file changed, 78 insertions(+), 27 deletions(-)

diff --git a/current_dataset/preprocess_CoVoST.py b/current_dataset/preprocess_CoVoST.py
index e18a771..7a3daed 100644
--- a/current_dataset/preprocess_CoVoST.py
+++ b/current_dataset/preprocess_CoVoST.py
@@ -4,6 +4,7 @@
 """
 
 import glob
+from tokenize import Name
 import tqdm
 import os
 import glob
@@ -21,11 +22,11 @@
 from utils.audio_utils import audio_to_flac
 from utils.make_tar_utils import tardir
 
-def convert_and_json_dump(file:str, dest:str, df, overwrite:bool=False):
+def convert_and_json_dump(file:str, dest:str, df, overwrite:bool=False, verbose=False):
     if os.path.isfile(dest) and overwrite==False:
-        print(f'{dest} already exists, skiping')
+        if verbose==True:
+            print(f'{dest} already exists, skiping')
         return
-        
     audio_to_flac(file, dest)
     with open(dest.replace('.flac', '.json'), 'w') as f:
         json.dump({'filename': os.path.join(*dest.split('/')[4:]), 'text':df['norm_text'], 'tag':{'raw_text':df['raw_text']}}, f)
@@ -42,45 +43,95 @@ def split_all_audio_files(df, dest_root_path, max_workers=96):
             for _ in as_completed(threads):
                 pbar.update(1)
 
-if __name__ == '__main__':
-    raise NotImplementedError("This processing CoVoST is nto implemented yet")
-
+def download_tsvs(urls:list, output_dir:str):
+    os.makedirs(output_dir)
+    for url in urls:
+        cmd = f'curl {url} --output {os.path.join(output_dir, url.split("/")[-1])}'
+        os.system(cmd)
 
+if __name__ == '__main__':
+    x_2_eng = [
+        "https://dl.fbaipublicfiles.com/covost/covost_v2.fr_en.tsv.tar.gz",
+        # "https://dl.fbaipublicfiles.com/covost/covost_v2.de_en.tsv.tar.gz",
+        # "https://dl.fbaipublicfiles.com/covost/covost_v2.es_en.tsv.tar.gz",
+        # "https://dl.fbaipublicfiles.com/covost/covost_v2.ca_en.tsv.tar.gz",
+        # "https://dl.fbaipublicfiles.com/covost/covost_v2.it_en.tsv.tar.gz",
+        # "https://dl.fbaipublicfiles.com/covost/covost_v2.ru_en.tsv.tar.gz",
+        # "https://dl.fbaipublicfiles.com/covost/covost_v2.zh-CN_en.tsv.tar.gz",
+        # "https://dl.fbaipublicfiles.com/covost/covost_v2.pt_en.tsv.tar.gz",
+        # "https://dl.fbaipublicfiles.com/covost/covost_v2.fa_en.tsv.tar.gz",
+        # "https://dl.fbaipublicfiles.com/covost/covost_v2.et_en.tsv.tar.gz",
+        # "https://dl.fbaipublicfiles.com/covost/covost_v2.mn_en.tsv.tar.gz",
+        # "https://dl.fbaipublicfiles.com/covost/covost_v2.nl_en.tsv.tar.gz",
+        # "https://dl.fbaipublicfiles.com/covost/covost_v2.tr_en.tsv.tar.gz",
+        # "https://dl.fbaipublicfiles.com/covost/covost_v2.ar_en.tsv.tar.gz",
+        # "https://dl.fbaipublicfiles.com/covost/covost_v2.sv-SE_en.tsv.tar.gz",
+        # "https://dl.fbaipublicfiles.com/covost/covost_v2.lv_en.tsv.tar.gz",
+        # "https://dl.fbaipublicfiles.com/covost/covost_v2.sl_en.tsv.tar.gz",
+        # "https://dl.fbaipublicfiles.com/covost/covost_v2.ta_en.tsv.tar.gz",
+        # "https://dl.fbaipublicfiles.com/covost/covost_v2.ja_en.tsv.tar.gz",
+        # "https://dl.fbaipublicfiles.com/covost/covost_v2.id_en.tsv.tar.gz",
+        # "https://dl.fbaipublicfiles.com/covost/covost_v2.cy_en.tsv.tar.gz",
+    ]
+    eng_2_x = [
+        'https://dl.fbaipublicfiles.com/covost/covost_v2.en_de.tsv.tar.gz',
+        # 'https://dl.fbaipublicfiles.com/covost/covost_v2.en_ca.tsv.tar.gz',
+        # 'https://dl.fbaipublicfiles.com/covost/covost_v2.en_zh-CN.tsv.tar.gz',
+        # 'https://dl.fbaipublicfiles.com/covost/covost_v2.en_fa.tsv.tar.gz',
+        # 'https://dl.fbaipublicfiles.com/covost/covost_v2.en_et.tsv.tar.gz',
+        # 'https://dl.fbaipublicfiles.com/covost/covost_v2.en_mn.tsv.tar.gz',
+        # 'https://dl.fbaipublicfiles.com/covost/covost_v2.en_tr.tsv.tar.gz',
+        # 'https://dl.fbaipublicfiles.com/covost/covost_v2.en_ar.tsv.tar.gz',
+        # 'https://dl.fbaipublicfiles.com/covost/covost_v2.en_sv-SE.tsv.tar.gz',
+        # 'https://dl.fbaipublicfiles.com/covost/covost_v2.en_lv.tsv.tar.gz',
+        # 'https://dl.fbaipublicfiles.com/covost/covost_v2.en_sl.tsv.tar.gz',
+        # 'https://dl.fbaipublicfiles.com/covost/covost_v2.en_ta.tsv.tar.gz',
+        # 'https://dl.fbaipublicfiles.com/covost/covost_v2.en_ja.tsv.tar.gz',
+        # 'https://dl.fbaipublicfiles.com/covost/covost_v2.en_id.tsv.tar.gz',
+        # 'https://dl.fbaipublicfiles.com/covost/covost_v2.en_cy.tsv.tar.gz',
+    ]
+    # download_tsvs(eng_2_x, "/home/knoriy/fsx/raw_datasets/CoVoST_2/tsvs/eng_2_x")
     import multiprocessing
 
     max_workers = multiprocessing.cpu_count()
     chunk = 512
     generate_subset_tsv = True
 
-    root_path = '/home/knoriy/datasets/raw_datasets/ljspeech/'
-    tar_dir = "/home/knoriy/datasets/raw_datasets/ljspeech/LJSpeech-1.1.tar.bz2"
-    metadata_dir = "/home/knoriy/datasets/raw_datasets/ljspeech/metadata.csv"
+    root_path = '/home/knoriy/datasets/raw_datasets/CoVoST_2/'
+    metadata_dir = "/home/knoriy/datasets/raw_datasets/CoVoST_2/"
 
-    dataset_name = 'ljspeech'
+    dataset_name = 'CoVoST_2'
 
     s3 = fsspec.filesystem('s3')
-    s3_dest = f's-laion/knoriy/LJSpeech/{dataset_name}_tars/'
+    s3_dest = f's-laion/knoriy/{dataset_name}/{dataset_name}_tars/'
 
     # load metadata and configure audio paths
-    df = pd.read_csv(metadata_dir, header=None, names=['path', 'raw_text', 'norm_text'], sep='|')
-    for i in df.iloc():
-        i[0] = f'{os.path.join(root_path, "wavs", i[0])+".wav"}'
+    df = pd.read_csv('/home/knoriy/fsx/raw_datasets/CoVoST_2/tsvs/eng_2_x/covost_v2.en_de.tsv', sep='\t')
+    print(df.head())
 
-    # create train, test, valid splits
-    train, test = train_test_split(df, test_size=0.2)
-    valid, test = train_test_split(test, test_size=0.2)
-    train_test_val = {'train/':train, 'test/':test, 'valid/':valid}
+    # # create train, test, valid splits
+    # train, test = train_test_split(df, test_size=0.2)
+    # valid, test = train_test_split(test, test_size=0.2)
+    # train_test_val = {'train/':train, 'test/':test, 'valid/':valid}
 
 
-    for key in tqdm.tqdm(train_test_val, desc=f'processing:'):
-        df = train_test_val[key]
+    # for key in tqdm.tqdm(train_test_val, desc=f'processing:'):
+    #     df = train_test_val[key]
         
-        dest_path = os.path.join(root_path.replace('raw_datasets', 'processed_datasets'),key )
-        os.makedirs(dest_path, exist_ok=True)
+    #     dest_path = os.path.join(root_path.replace('raw_datasets', 'processed_datasets'),key )
+    #     os.makedirs(dest_path, exist_ok=True)
+
+    #     split_all_audio_files(df, dest_path)
+    #     tardir(dest_path, dest_path, chunk, delete_file=True)
+
+    #     # upload to s3 and delete local
+    #     s3.put(dest_path, os.path.join(s3_dest, key), recursive=True)
+    #     shutil.rmtree(dest_path)
 
-        split_all_audio_files(df, dest_path)
-        tardir(dest_path, dest_path, chunk, delete_file=True)
 
-        # upload to s3 and delete local
-        s3.put(dest_path, os.path.join(s3_dest, key), recursive=True)
-        shutil.rmtree(dest_path)
\ No newline at end of file
+    '''
+        python get_covost_splits.py \
+        --version 2 --src-lang en_de --tgt-lang <tgt_lang_code> \
+        --root <root path to the translation TSV and output TSVs> \
+        --cv-tsv <path to validated.tsv>
+    '''
\ No newline at end of file

From 1e8f87b3d9e85b693002f1718818054822229619 Mon Sep 17 00:00:00 2001
From: knoriy <knoriy@ip-172-31-34-60.ec2.internal>
Date: Wed, 13 Jul 2022 18:48:09 +0000
Subject: [PATCH 18/40] Added caching of dfs to avoid recomputing the same df
 multiple times if running on multiple nodes

---
 current_dataset/preprocess_CoVoST.py     | 88 +++++++++++++-----------
 current_dataset/preprocess_GigaSpeech.py | 54 ++++++++-------
 2 files changed, 77 insertions(+), 65 deletions(-)

diff --git a/current_dataset/preprocess_CoVoST.py b/current_dataset/preprocess_CoVoST.py
index 7a3daed..384b141 100644
--- a/current_dataset/preprocess_CoVoST.py
+++ b/current_dataset/preprocess_CoVoST.py
@@ -43,54 +43,60 @@ def split_all_audio_files(df, dest_root_path, max_workers=96):
             for _ in as_completed(threads):
                 pbar.update(1)
 
-def download_tsvs(urls:list, output_dir:str):
-    os.makedirs(output_dir)
+def download_tsvs(urls:list, output_dir:str, extract:bool=False):
+    os.makedirs(output_dir, exist_ok=True)
     for url in urls:
-        cmd = f'curl {url} --output {os.path.join(output_dir, url.split("/")[-1])}'
-        os.system(cmd)
+        dest_path = os.path.join(output_dir, url.split("/")[-1])
+        if os.path.isfile(dest_path):
+            continue
+        os.system(f'curl {url} --output {dest_path}')
+
+        if extract:
+            os.system(f'tar -xf {dest_path}')
 
 if __name__ == '__main__':
     x_2_eng = [
         "https://dl.fbaipublicfiles.com/covost/covost_v2.fr_en.tsv.tar.gz",
-        # "https://dl.fbaipublicfiles.com/covost/covost_v2.de_en.tsv.tar.gz",
-        # "https://dl.fbaipublicfiles.com/covost/covost_v2.es_en.tsv.tar.gz",
-        # "https://dl.fbaipublicfiles.com/covost/covost_v2.ca_en.tsv.tar.gz",
-        # "https://dl.fbaipublicfiles.com/covost/covost_v2.it_en.tsv.tar.gz",
-        # "https://dl.fbaipublicfiles.com/covost/covost_v2.ru_en.tsv.tar.gz",
-        # "https://dl.fbaipublicfiles.com/covost/covost_v2.zh-CN_en.tsv.tar.gz",
-        # "https://dl.fbaipublicfiles.com/covost/covost_v2.pt_en.tsv.tar.gz",
-        # "https://dl.fbaipublicfiles.com/covost/covost_v2.fa_en.tsv.tar.gz",
-        # "https://dl.fbaipublicfiles.com/covost/covost_v2.et_en.tsv.tar.gz",
-        # "https://dl.fbaipublicfiles.com/covost/covost_v2.mn_en.tsv.tar.gz",
-        # "https://dl.fbaipublicfiles.com/covost/covost_v2.nl_en.tsv.tar.gz",
-        # "https://dl.fbaipublicfiles.com/covost/covost_v2.tr_en.tsv.tar.gz",
-        # "https://dl.fbaipublicfiles.com/covost/covost_v2.ar_en.tsv.tar.gz",
-        # "https://dl.fbaipublicfiles.com/covost/covost_v2.sv-SE_en.tsv.tar.gz",
-        # "https://dl.fbaipublicfiles.com/covost/covost_v2.lv_en.tsv.tar.gz",
-        # "https://dl.fbaipublicfiles.com/covost/covost_v2.sl_en.tsv.tar.gz",
-        # "https://dl.fbaipublicfiles.com/covost/covost_v2.ta_en.tsv.tar.gz",
-        # "https://dl.fbaipublicfiles.com/covost/covost_v2.ja_en.tsv.tar.gz",
-        # "https://dl.fbaipublicfiles.com/covost/covost_v2.id_en.tsv.tar.gz",
-        # "https://dl.fbaipublicfiles.com/covost/covost_v2.cy_en.tsv.tar.gz",
+        "https://dl.fbaipublicfiles.com/covost/covost_v2.de_en.tsv.tar.gz",
+        "https://dl.fbaipublicfiles.com/covost/covost_v2.es_en.tsv.tar.gz",
+        "https://dl.fbaipublicfiles.com/covost/covost_v2.ca_en.tsv.tar.gz",
+        "https://dl.fbaipublicfiles.com/covost/covost_v2.it_en.tsv.tar.gz",
+        "https://dl.fbaipublicfiles.com/covost/covost_v2.ru_en.tsv.tar.gz",
+        "https://dl.fbaipublicfiles.com/covost/covost_v2.zh-CN_en.tsv.tar.gz",
+        "https://dl.fbaipublicfiles.com/covost/covost_v2.pt_en.tsv.tar.gz",
+        "https://dl.fbaipublicfiles.com/covost/covost_v2.fa_en.tsv.tar.gz",
+        "https://dl.fbaipublicfiles.com/covost/covost_v2.et_en.tsv.tar.gz",
+        "https://dl.fbaipublicfiles.com/covost/covost_v2.mn_en.tsv.tar.gz",
+        "https://dl.fbaipublicfiles.com/covost/covost_v2.nl_en.tsv.tar.gz",
+        "https://dl.fbaipublicfiles.com/covost/covost_v2.tr_en.tsv.tar.gz",
+        "https://dl.fbaipublicfiles.com/covost/covost_v2.ar_en.tsv.tar.gz",
+        "https://dl.fbaipublicfiles.com/covost/covost_v2.sv-SE_en.tsv.tar.gz",
+        "https://dl.fbaipublicfiles.com/covost/covost_v2.lv_en.tsv.tar.gz",
+        "https://dl.fbaipublicfiles.com/covost/covost_v2.sl_en.tsv.tar.gz",
+        "https://dl.fbaipublicfiles.com/covost/covost_v2.ta_en.tsv.tar.gz",
+        "https://dl.fbaipublicfiles.com/covost/covost_v2.ja_en.tsv.tar.gz",
+        "https://dl.fbaipublicfiles.com/covost/covost_v2.id_en.tsv.tar.gz",
+        "https://dl.fbaipublicfiles.com/covost/covost_v2.cy_en.tsv.tar.gz",
     ]
     eng_2_x = [
         'https://dl.fbaipublicfiles.com/covost/covost_v2.en_de.tsv.tar.gz',
-        # 'https://dl.fbaipublicfiles.com/covost/covost_v2.en_ca.tsv.tar.gz',
-        # 'https://dl.fbaipublicfiles.com/covost/covost_v2.en_zh-CN.tsv.tar.gz',
-        # 'https://dl.fbaipublicfiles.com/covost/covost_v2.en_fa.tsv.tar.gz',
-        # 'https://dl.fbaipublicfiles.com/covost/covost_v2.en_et.tsv.tar.gz',
-        # 'https://dl.fbaipublicfiles.com/covost/covost_v2.en_mn.tsv.tar.gz',
-        # 'https://dl.fbaipublicfiles.com/covost/covost_v2.en_tr.tsv.tar.gz',
-        # 'https://dl.fbaipublicfiles.com/covost/covost_v2.en_ar.tsv.tar.gz',
-        # 'https://dl.fbaipublicfiles.com/covost/covost_v2.en_sv-SE.tsv.tar.gz',
-        # 'https://dl.fbaipublicfiles.com/covost/covost_v2.en_lv.tsv.tar.gz',
-        # 'https://dl.fbaipublicfiles.com/covost/covost_v2.en_sl.tsv.tar.gz',
-        # 'https://dl.fbaipublicfiles.com/covost/covost_v2.en_ta.tsv.tar.gz',
-        # 'https://dl.fbaipublicfiles.com/covost/covost_v2.en_ja.tsv.tar.gz',
-        # 'https://dl.fbaipublicfiles.com/covost/covost_v2.en_id.tsv.tar.gz',
-        # 'https://dl.fbaipublicfiles.com/covost/covost_v2.en_cy.tsv.tar.gz',
+        'https://dl.fbaipublicfiles.com/covost/covost_v2.en_ca.tsv.tar.gz',
+        'https://dl.fbaipublicfiles.com/covost/covost_v2.en_zh-CN.tsv.tar.gz',
+        'https://dl.fbaipublicfiles.com/covost/covost_v2.en_fa.tsv.tar.gz',
+        'https://dl.fbaipublicfiles.com/covost/covost_v2.en_et.tsv.tar.gz',
+        'https://dl.fbaipublicfiles.com/covost/covost_v2.en_mn.tsv.tar.gz',
+        'https://dl.fbaipublicfiles.com/covost/covost_v2.en_tr.tsv.tar.gz',
+        'https://dl.fbaipublicfiles.com/covost/covost_v2.en_ar.tsv.tar.gz',
+        'https://dl.fbaipublicfiles.com/covost/covost_v2.en_sv-SE.tsv.tar.gz',
+        'https://dl.fbaipublicfiles.com/covost/covost_v2.en_lv.tsv.tar.gz',
+        'https://dl.fbaipublicfiles.com/covost/covost_v2.en_sl.tsv.tar.gz',
+        'https://dl.fbaipublicfiles.com/covost/covost_v2.en_ta.tsv.tar.gz',
+        'https://dl.fbaipublicfiles.com/covost/covost_v2.en_ja.tsv.tar.gz',
+        'https://dl.fbaipublicfiles.com/covost/covost_v2.en_id.tsv.tar.gz',
+        'https://dl.fbaipublicfiles.com/covost/covost_v2.en_cy.tsv.tar.gz',
     ]
-    # download_tsvs(eng_2_x, "/home/knoriy/fsx/raw_datasets/CoVoST_2/tsvs/eng_2_x")
+    download_tsvs(eng_2_x, "/home/knoriy/fsx/raw_datasets/CoVoST_2/tsvs/")
+    download_tsvs(x_2_eng, "/home/knoriy/fsx/raw_datasets/CoVoST_2/tsvs")
     import multiprocessing
 
     max_workers = multiprocessing.cpu_count()
@@ -106,10 +112,10 @@ def download_tsvs(urls:list, output_dir:str):
     s3_dest = f's-laion/knoriy/{dataset_name}/{dataset_name}_tars/'
 
     # load metadata and configure audio paths
-    df = pd.read_csv('/home/knoriy/fsx/raw_datasets/CoVoST_2/tsvs/eng_2_x/covost_v2.en_de.tsv', sep='\t')
+    df = pd.read_csv('/home/knoriy/fsx/raw_datasets/CoVoST_2/tsvs/covost_v2.en_de.dev.tsv', sep='\t')
     print(df.head())
 
-    # # create train, test, valid splits
+    # create train, test, valid splits
     # train, test = train_test_split(df, test_size=0.2)
     # valid, test = train_test_split(test, test_size=0.2)
     # train_test_val = {'train/':train, 'test/':test, 'valid/':valid}
diff --git a/current_dataset/preprocess_GigaSpeech.py b/current_dataset/preprocess_GigaSpeech.py
index 11ab473..09b394f 100644
--- a/current_dataset/preprocess_GigaSpeech.py
+++ b/current_dataset/preprocess_GigaSpeech.py
@@ -58,30 +58,36 @@ def split_all_audio_files(df, dest_root_path, max_workers=96):
     s3_dest = f's-laion/knoriy/GigaSpeech/{dataset_name}_tars/'
 
     # load metadata and configure audio paths
-    raw_df = pd.read_json(metadata_dir)
-
-    new_df_data = []
-    for row in tqdm.tqdm(raw_df.iloc(), total=len(raw_df), desc='Generating dataframe: '):
-        for seg in row['audios']['segments']:
-            try:
-                catagory = row['audios']['category']
-            except:
-                catagory = 'N/A'
-            
-            if seg['text_tn'] == '<SIL>' or seg['text_tn'] == '<NOISE>':
-                continue
-
-            new_df_data.append(
-                {'path':f'{os.path.join(root_path, row["audios"]["path"])}', 
-                'begin_time': seg['begin_time'], 
-                'end_time': seg['end_time'], 
-                'text': seg['text_tn'],
-                'tag':{ 'language':row['language'], 
-                        'url':row['audios']['url'], 
-                        'category':catagory,
-                        'speaker':row['audios']['speaker']}
-                })
-    df = pd.DataFrame(new_df_data)
+    cache_df_path = os.path.join(root_path, 'temp_df.csv')
+    if os.path.isfile(cache_df_path):
+        df = pd.read_csv(cache_df_path, sep='\t')
+    else:
+        raw_df = pd.read_json(metadata_dir)
+
+        new_df_data = []
+        for row in tqdm.tqdm(raw_df.iloc(), total=len(raw_df), desc='Generating dataframe: '):
+            for seg in row['audios']['segments']:
+                try:
+                    catagory = row['audios']['category']
+                except:
+                    catagory = 'N/A'
+                
+                if seg['text_tn'] == '<SIL>' or seg['text_tn'] == '<NOISE>':
+                    continue
+
+                new_df_data.append(
+                    {'path':f'{os.path.join(root_path, row["audios"]["path"])}', 
+                    'begin_time': seg['begin_time'], 
+                    'end_time': seg['end_time'], 
+                    'text': seg['text_tn'],
+                    'tag':{ 'language':row['language'], 
+                            'url':row['audios']['url'], 
+                            'category':catagory,
+                            'speaker':row['audios']['speaker']}
+                    })
+        df = pd.DataFrame(new_df_data)
+        df.to_csv(cache_df_path, sep='\t', index=False)
+        
     print(df.head())
 
     # create train, test, valid splits

From 7bf8ede8623613bf27695ea6d032831f7395554e Mon Sep 17 00:00:00 2001
From: knoriy
 <knoriy@compute-od-cpu-dy-m5zn-12xlarge-3.hpc-1click-prod-320.pcluster>
Date: Tue, 2 Aug 2022 10:11:09 +0000
Subject: [PATCH 19/40] fixed error where .json did not conform with
 audio-dataset

---
 current_dataset/preprocess_CREMA-D.py    |  2 +-
 current_dataset/preprocess_CoVoST.py     | 76 ++++++++++++++++++------
 current_dataset/preprocess_GigaSpeech.py |  6 +-
 current_dataset/preprocess_LJSpeech.py   |  9 ++-
 current_dataset/preprocess_RAVDESS.py    |  2 +-
 current_dataset/preprocess_mswc.py       |  5 +-
 6 files changed, 72 insertions(+), 28 deletions(-)

diff --git a/current_dataset/preprocess_CREMA-D.py b/current_dataset/preprocess_CREMA-D.py
index 165d838..98e2af2 100644
--- a/current_dataset/preprocess_CREMA-D.py
+++ b/current_dataset/preprocess_CREMA-D.py
@@ -28,7 +28,7 @@ def convert_and_json_dump(file:str, dest:str, df, overwrite:bool=False, verbose=
         return
     audio_to_flac(file, dest)
     with open(dest.replace('.flac', '.json'), 'w') as f:
-        json.dump({'filename': os.path.join(*dest.split('/')[5:]), 'text':df['text']}, f)
+        json.dump({'filename': os.path.join(*dest.split('/')[5:]), 'text':[df['text']]}, f)
 
 
 def split_all_audio_files(df, dest_root_path, max_workers=96):
diff --git a/current_dataset/preprocess_CoVoST.py b/current_dataset/preprocess_CoVoST.py
index 384b141..0d6b0f3 100644
--- a/current_dataset/preprocess_CoVoST.py
+++ b/current_dataset/preprocess_CoVoST.py
@@ -4,6 +4,7 @@
 """
 
 import glob
+from tabnanny import verbose
 from tokenize import Name
 import tqdm
 import os
@@ -29,7 +30,8 @@ def convert_and_json_dump(file:str, dest:str, df, overwrite:bool=False, verbose=
         return
     audio_to_flac(file, dest)
     with open(dest.replace('.flac', '.json'), 'w') as f:
-        json.dump({'filename': os.path.join(*dest.split('/')[4:]), 'text':df['norm_text'], 'tag':{'raw_text':df['raw_text']}}, f)
+        print(dest)
+        json.dump({'filename': os.path.join(*dest.split('/')[3:]), 'text':[df['text']], 'original_data':df['original_data']}, f)
 
 
 def split_all_audio_files(df, dest_root_path, max_workers=96):
@@ -39,7 +41,7 @@ def split_all_audio_files(df, dest_root_path, max_workers=96):
     l = len(df)
     with tqdm.tqdm(total=l, desc=f'Processing {dest_root_path}') as pbar:
         with ThreadPoolExecutor(max_workers=max_workers) as executor:
-            threads = [executor.submit(convert_and_json_dump, row[0], os.path.join(dest_root_path, f'{i}.flac'), row) for i, row in enumerate(df.iloc())]
+            threads = [executor.submit(convert_and_json_dump, row[0], os.path.join(dest_root_path, f'{i}.flac'), row, overwrite=False, verbose=False) for i, row in enumerate(df.iloc())]
             for _ in as_completed(threads):
                 pbar.update(1)
 
@@ -95,49 +97,89 @@ def download_tsvs(urls:list, output_dir:str, extract:bool=False):
         'https://dl.fbaipublicfiles.com/covost/covost_v2.en_id.tsv.tar.gz',
         'https://dl.fbaipublicfiles.com/covost/covost_v2.en_cy.tsv.tar.gz',
     ]
+    get_language_from_key = {
+        'en':'english',
+        'de':'german', 
+        'fr':'french', 
+        'nl':'dutch', 
+        'ru':'russian', 
+        'es':'spanish', 
+        'it':'italian', 
+        'tr':'turkish', 
+        'fa':'persian',
+        'ca':'catalan', 
+        'zh-cn':'chinese', 
+        'pt':'portuguese',
+        'et':'estonian',
+        'mn':'mongolian',
+        'ar':'arabic',
+        'sv-se':'swedish',
+        'lv':'latvian',
+        'sl':'slovenian',
+        'ta':'tamil',
+        'ja':'japanese',
+        'id':'indonesian',
+        'cy':'welsh',
+        } 
     download_tsvs(eng_2_x, "/home/knoriy/fsx/raw_datasets/CoVoST_2/tsvs/")
     download_tsvs(x_2_eng, "/home/knoriy/fsx/raw_datasets/CoVoST_2/tsvs")
+    
     import multiprocessing
 
     max_workers = multiprocessing.cpu_count()
     chunk = 512
     generate_subset_tsv = True
 
-    root_path = '/home/knoriy/datasets/raw_datasets/CoVoST_2/'
-    metadata_dir = "/home/knoriy/datasets/raw_datasets/CoVoST_2/"
+    root_path = '/home/knoriy/fsx/raw_datasets/CoVoST_2/'
+    metadata_dir = "/home/knoriy/fsx/raw_datasets/CoVoST_2/"
 
     dataset_name = 'CoVoST_2'
+    COMMON_VOICE_VERSION = 'cv-corpus-10.0-2022-07-04'
 
     s3 = fsspec.filesystem('s3')
     s3_dest = f's-laion/knoriy/{dataset_name}/{dataset_name}_tars/'
 
     # load metadata and configure audio paths
-    df = pd.read_csv('/home/knoriy/fsx/raw_datasets/CoVoST_2/tsvs/covost_v2.en_de.dev.tsv', sep='\t')
-    print(df.head())
+    tsvs = glob.glob(os.path.join(root_path, 'tsvs/**/*.tsv'), recursive=True)[2:]
 
-    # create train, test, valid splits
-    # train, test = train_test_split(df, test_size=0.2)
-    # valid, test = train_test_split(test, test_size=0.2)
-    # train_test_val = {'train/':train, 'test/':test, 'valid/':valid}
 
+    for tsv in tqdm.tqdm(tsvs, desc=f'processing:'):
+        raw_df = pd.read_csv(tsv, sep='\t')
+        train_val_or_test, language = tsv.split('.')[-2], tsv.split('.')[-3]
 
-    # for key in tqdm.tqdm(train_test_val, desc=f'processing:'):
-    #     df = train_test_val[key]
+        data = {}
+        for row in raw_df.iloc():
+
+            data.setdefault('paths', []).append(os.path.join(root_path, COMMON_VOICE_VERSION, language.split('_')[0], "clips", row['path']))
+            data.setdefault('text', []).append(f"{row['translation']} translated to {get_language_from_key[language.split('_')[0]]}")
+            data.setdefault('original_data', []).append(
+                {
+                    "sentence":row['sentence'],
+                    "translation":row['translation'],
+                    "client_id":row['client_id'],
+                }
+            )
+
+        df = pd.DataFrame(data)[:2]
+        print(df)
         
-    #     dest_path = os.path.join(root_path.replace('raw_datasets', 'processed_datasets'),key )
-    #     os.makedirs(dest_path, exist_ok=True)
+        dest_path = os.path.join(root_path.replace('raw_datasets', 'processed_datasets'), language, train_val_or_test)
+        print(dest_path)
+        os.makedirs(dest_path, exist_ok=True)
 
-    #     split_all_audio_files(df, dest_path)
+        split_all_audio_files(df, dest_path)
     #     tardir(dest_path, dest_path, chunk, delete_file=True)
 
     #     # upload to s3 and delete local
     #     s3.put(dest_path, os.path.join(s3_dest, key), recursive=True)
     #     shutil.rmtree(dest_path)
 
+        break
+
 
     '''
-        python get_covost_splits.py \
+        python /home/knoriy/fsx/raw_datasets/CoVoST_2/covost/get_covost_splits.py \
         --version 2 --src-lang en_de --tgt-lang <tgt_lang_code> \
         --root <root path to the translation TSV and output TSVs> \
-        --cv-tsv <path to validated.tsv>
+        --cv-tsv /home/knoriy/fsx/raw_datasets/CoVoST_2/cv-corpus-10.0-2022-07-04/en/validated.tsv
     '''
\ No newline at end of file
diff --git a/current_dataset/preprocess_GigaSpeech.py b/current_dataset/preprocess_GigaSpeech.py
index 09b394f..245dbd6 100644
--- a/current_dataset/preprocess_GigaSpeech.py
+++ b/current_dataset/preprocess_GigaSpeech.py
@@ -24,11 +24,13 @@
 def convert_and_json_dump(file:str, dest:str, df, overwrite:bool=False):
     if os.path.isfile(dest) and overwrite==False:
         print(f'{dest} already exists, skiping')
+        with open(dest.replace('.flac', '.json'), 'w') as f:
+            json.dump({'filename': os.path.join(*dest.split('/')[5:]), 'text':[df['text']], 'original_data':df['original_data']}, f)
         return
     
     audio_to_flac(file, dest, segment_start=df['begin_time'], segment_end=df['end_time'])
     with open(dest.replace('.flac', '.json'), 'w') as f:
-        json.dump({'filename': os.path.join(*dest.split('/')[5:]), 'text':df['text'], 'tag':df['tag']}, f)
+        json.dump({'filename': os.path.join(*dest.split('/')[5:]), 'text':[df['text']], 'original_data':df['original_data']}, f)
 
 
 def split_all_audio_files(df, dest_root_path, max_workers=96):
@@ -80,7 +82,7 @@ def split_all_audio_files(df, dest_root_path, max_workers=96):
                     'begin_time': seg['begin_time'], 
                     'end_time': seg['end_time'], 
                     'text': seg['text_tn'],
-                    'tag':{ 'language':row['language'], 
+                    'original_data':{ 'language':row['language'], 
                             'url':row['audios']['url'], 
                             'category':catagory,
                             'speaker':row['audios']['speaker']}
diff --git a/current_dataset/preprocess_LJSpeech.py b/current_dataset/preprocess_LJSpeech.py
index 849ac9c..9c345b9 100644
--- a/current_dataset/preprocess_LJSpeech.py
+++ b/current_dataset/preprocess_LJSpeech.py
@@ -25,10 +25,10 @@ def convert_and_json_dump(file:str, dest:str, df, overwrite:bool=False):
     if os.path.isfile(dest) and overwrite==False:
         print(f'{dest} already exists, skiping')
         return
-        
+
     audio_to_flac(file, dest)
     with open(dest.replace('.flac', '.json'), 'w') as f:
-        json.dump({'filename': os.path.join(*dest.split('/')[4:]), 'text':df['norm_text'], 'tag':{'raw_text':df['raw_text']}}, f)
+        json.dump({'filename': os.path.join(*dest.split('/')[4:]), 'text':[df['norm_text']], 'original_data':{'raw_text':df['raw_text']}}, f)
 
 
 def split_all_audio_files(df, dest_root_path, max_workers=96):
@@ -50,9 +50,8 @@ def split_all_audio_files(df, dest_root_path, max_workers=96):
     chunk = 512
     generate_subset_tsv = True
 
-    root_path = '/home/knoriy/datasets/raw_datasets/ljspeech/'
-    tar_dir = "/home/knoriy/datasets/raw_datasets/ljspeech/LJSpeech-1.1.tar.bz2"
-    metadata_dir = "/home/knoriy/datasets/raw_datasets/ljspeech/metadata.csv"
+    root_path = '/home/knoriy/fsx/raw_datasets/ljspeech/'
+    metadata_dir = "/home/knoriy/fsx/raw_datasets/ljspeech/metadata.csv"
 
     dataset_name = 'ljspeech'
 
diff --git a/current_dataset/preprocess_RAVDESS.py b/current_dataset/preprocess_RAVDESS.py
index 60a7be7..dd56cb4 100644
--- a/current_dataset/preprocess_RAVDESS.py
+++ b/current_dataset/preprocess_RAVDESS.py
@@ -29,7 +29,7 @@ def convert_and_json_dump(file:str, dest:str, df, overwrite:bool=False, verbose=
 
     audio_to_flac(file, dest)
     with open(dest.replace('.flac', '.json'), 'w') as f:
-        json.dump({'filename': os.path.join(*dest.split('/')[5:]), 'text':df['text']}, f)
+        json.dump({'filename': os.path.join(*dest.split('/')[5:]), 'text':[df['text']]}, f)
 
 
 def split_all_audio_files(df, dest_root_path, max_workers=96):
diff --git a/current_dataset/preprocess_mswc.py b/current_dataset/preprocess_mswc.py
index ded30ee..84191a8 100644
--- a/current_dataset/preprocess_mswc.py
+++ b/current_dataset/preprocess_mswc.py
@@ -21,13 +21,14 @@
 from utils.make_tar_utils import tardir
 
 def convert_and_json_dump(file:str, dest:str, df, overwrite:bool=False):
+    with open(dest.replace('.flac', '.json'), 'w') as f:
+        json.dump({'filename': os.path.join(*dest.split('/')[4:]), 'text':[df['WORD']], 'original_data':{'gender':df['GENDER'], 'language':dest.split('/')[-2]}}, f)
     if os.path.isfile(dest) and overwrite==False:
         print(f'{dest} already exists, skiping')
         return
-    
     audio_to_flac(file, dest)
     with open(dest.replace('.flac', '.json'), 'w') as f:
-        json.dump({'filename': os.path.join(*dest.split('/')[4:]), 'text':df['WORD'], 'tag':{'gender':df['GENDER'], 'language':dest.split('/')[-2]}}, f)
+        json.dump({'filename': os.path.join(*dest.split('/')[4:]), 'text':[df['WORD']], 'original_data':{'gender':df['GENDER'], 'language':dest.split('/')[-2]}}, f)
 
 
 def split_all_audio_files(df, src_root_path, dest_root_path, max_workers=96):

From a5941ba587ea0f961ee7d3afa62afe574594650f Mon Sep 17 00:00:00 2001
From: knoriy
 <knoriy@compute-od-cpu-dy-m5zn-12xlarge-3.hpc-1click-prod-320.pcluster>
Date: Tue, 2 Aug 2022 12:12:33 +0000
Subject: [PATCH 20/40] processing setup for en-X and x_en

---
 current_dataset/preprocess_CoVoST.py | 53 ++++++++++++++++------------
 1 file changed, 30 insertions(+), 23 deletions(-)

diff --git a/current_dataset/preprocess_CoVoST.py b/current_dataset/preprocess_CoVoST.py
index 0d6b0f3..f6d063d 100644
--- a/current_dataset/preprocess_CoVoST.py
+++ b/current_dataset/preprocess_CoVoST.py
@@ -33,7 +33,6 @@ def convert_and_json_dump(file:str, dest:str, df, overwrite:bool=False, verbose=
         print(dest)
         json.dump({'filename': os.path.join(*dest.split('/')[3:]), 'text':[df['text']], 'original_data':df['original_data']}, f)
 
-
 def split_all_audio_files(df, dest_root_path, max_workers=96):
     if not os.path.exists(dest_root_path):
         raise FileNotFoundError(f'Please Check {dest_root_path} exists')
@@ -56,6 +55,20 @@ def download_tsvs(urls:list, output_dir:str, extract:bool=False):
         if extract:
             os.system(f'tar -xf {dest_path}')
 
+def extract_covost_2_tsvs(tsv_tar_dir:str, dest:str, cv_tsv:str, version=2):
+    extract_covost_2_tar_cmd = f'tar -xf {tsv_tar_dir} -C {dest}'
+    os.system(extract_covost_2_tar_cmd)
+
+    src_lang, tgt_lang = os.path.basename(tsv_tar_dir).split('.')[1].split('_')
+    get_covost_splits_cmd = f'python /home/knoriy/fsx/raw_datasets/CoVoST_2/covost/get_covost_splits.py \
+        --version {version} \
+        --src-lang {src_lang} \
+        --tgt-lang {tgt_lang} \
+        --root {dest} \
+        --cv-tsv {cv_tsv} \
+        '
+    os.system(get_covost_splits_cmd)
+
 if __name__ == '__main__':
     x_2_eng = [
         "https://dl.fbaipublicfiles.com/covost/covost_v2.fr_en.tsv.tar.gz",
@@ -139,19 +152,24 @@ def download_tsvs(urls:list, output_dir:str, extract:bool=False):
     s3 = fsspec.filesystem('s3')
     s3_dest = f's-laion/knoriy/{dataset_name}/{dataset_name}_tars/'
 
-    # load metadata and configure audio paths
-    tsvs = glob.glob(os.path.join(root_path, 'tsvs/**/*.tsv'), recursive=True)[2:]
+    # for tar in tqdm.tqdm(glob.glob(os.path.join(root_path, 'tsvs/**/*.tar.gz'), recursive=True)):
+    #     extract_covost_2_tsvs(tar, os.path.join(root_path, 'tsvs/'), '/home/knoriy/fsx/raw_datasets/CoVoST_2/cv-corpus-10.0-2022-07-04/en/validated.tsv')
 
+    # load metadata and configure audio paths
+    tsvs = []
+    for tsv in glob.glob(os.path.join(root_path, 'tsvs/**/*.tsv'), recursive=True):
+        if any(word in os.path.basename(tsv) for word in ['test', 'train', 'dev']):
+            tsvs.append(tsv)
 
     for tsv in tqdm.tqdm(tsvs, desc=f'processing:'):
-        raw_df = pd.read_csv(tsv, sep='\t')
-        train_val_or_test, language = tsv.split('.')[-2], tsv.split('.')[-3]
+        raw_df = pd.read_csv(tsv, sep='\t', on_bad_lines='skip')
+        IS_TRAIN_VAL_OR_TEST, LANGUAGE = tsv.split('.')[-2], tsv.split('.')[-3]
 
         data = {}
         for row in raw_df.iloc():
 
-            data.setdefault('paths', []).append(os.path.join(root_path, COMMON_VOICE_VERSION, language.split('_')[0], "clips", row['path']))
-            data.setdefault('text', []).append(f"{row['translation']} translated to {get_language_from_key[language.split('_')[0]]}")
+            data.setdefault('paths', []).append(os.path.join(root_path, COMMON_VOICE_VERSION, LANGUAGE.split('_')[0], "clips", row['path']))
+            data.setdefault('text', []).append(f"{row['translation']} translated to {get_language_from_key[LANGUAGE.split('_')[0]]}")
             data.setdefault('original_data', []).append(
                 {
                     "sentence":row['sentence'],
@@ -161,25 +179,14 @@ def download_tsvs(urls:list, output_dir:str, extract:bool=False):
             )
 
         df = pd.DataFrame(data)[:2]
-        print(df)
         
-        dest_path = os.path.join(root_path.replace('raw_datasets', 'processed_datasets'), language, train_val_or_test)
-        print(dest_path)
+        dest_path = os.path.join(root_path.replace('raw_datasets', 'processed_datasets'), LANGUAGE, IS_TRAIN_VAL_OR_TEST)
         os.makedirs(dest_path, exist_ok=True)
 
         split_all_audio_files(df, dest_path)
-    #     tardir(dest_path, dest_path, chunk, delete_file=True)
-
-    #     # upload to s3 and delete local
-    #     s3.put(dest_path, os.path.join(s3_dest, key), recursive=True)
-    #     shutil.rmtree(dest_path)
-
-        break
+        # tardir(dest_path, dest_path, chunk, delete_file=True)
 
+        # # upload to s3 and delete local
+        # s3.put(dest_path, os.path.join(s3_dest, key), recursive=True)
+        # shutil.rmtree(dest_path)
 
-    '''
-        python /home/knoriy/fsx/raw_datasets/CoVoST_2/covost/get_covost_splits.py \
-        --version 2 --src-lang en_de --tgt-lang <tgt_lang_code> \
-        --root <root path to the translation TSV and output TSVs> \
-        --cv-tsv /home/knoriy/fsx/raw_datasets/CoVoST_2/cv-corpus-10.0-2022-07-04/en/validated.tsv
-    '''
\ No newline at end of file

From 2f9951eea5d1bd1ecce49c5fa7c6339e89bf0388 Mon Sep 17 00:00:00 2001
From: knoriy
 <knoriy@compute-od-cpu-dy-m5zn-12xlarge-3.hpc-1click-prod-320.pcluster>
Date: Tue, 2 Aug 2022 13:49:52 +0000
Subject: [PATCH 21/40] processing setup for en-X and x_en

---
 current_dataset/preprocess_CoVoST.py | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/current_dataset/preprocess_CoVoST.py b/current_dataset/preprocess_CoVoST.py
index f6d063d..3a08487 100644
--- a/current_dataset/preprocess_CoVoST.py
+++ b/current_dataset/preprocess_CoVoST.py
@@ -46,7 +46,7 @@ def split_all_audio_files(df, dest_root_path, max_workers=96):
 
 def download_tsvs(urls:list, output_dir:str, extract:bool=False):
     os.makedirs(output_dir, exist_ok=True)
-    for url in urls:
+    for url in tqdm.tqdm(urls, desc="Downloading tsvs"):
         dest_path = os.path.join(output_dir, url.split("/")[-1])
         if os.path.isfile(dest_path):
             continue
@@ -70,6 +70,8 @@ def extract_covost_2_tsvs(tsv_tar_dir:str, dest:str, cv_tsv:str, version=2):
     os.system(get_covost_splits_cmd)
 
 if __name__ == '__main__':
+    import multiprocessing
+
     x_2_eng = [
         "https://dl.fbaipublicfiles.com/covost/covost_v2.fr_en.tsv.tar.gz",
         "https://dl.fbaipublicfiles.com/covost/covost_v2.de_en.tsv.tar.gz",
@@ -134,10 +136,6 @@ def extract_covost_2_tsvs(tsv_tar_dir:str, dest:str, cv_tsv:str, version=2):
         'id':'indonesian',
         'cy':'welsh',
         } 
-    download_tsvs(eng_2_x, "/home/knoriy/fsx/raw_datasets/CoVoST_2/tsvs/")
-    download_tsvs(x_2_eng, "/home/knoriy/fsx/raw_datasets/CoVoST_2/tsvs")
-    
-    import multiprocessing
 
     max_workers = multiprocessing.cpu_count()
     chunk = 512
@@ -152,8 +150,12 @@ def extract_covost_2_tsvs(tsv_tar_dir:str, dest:str, cv_tsv:str, version=2):
     s3 = fsspec.filesystem('s3')
     s3_dest = f's-laion/knoriy/{dataset_name}/{dataset_name}_tars/'
 
-    # for tar in tqdm.tqdm(glob.glob(os.path.join(root_path, 'tsvs/**/*.tar.gz'), recursive=True)):
-    #     extract_covost_2_tsvs(tar, os.path.join(root_path, 'tsvs/'), '/home/knoriy/fsx/raw_datasets/CoVoST_2/cv-corpus-10.0-2022-07-04/en/validated.tsv')
+    download_tsvs(eng_2_x, os.path.join(root_path, 'tsvs/'))
+    # download_tsvs(x_2_eng, "/home/knoriy/fsx/raw_datasets/CoVoST_2/tsvs")
+
+    for tar in tqdm.tqdm(glob.glob(os.path.join(root_path, 'tsvs/**/*.tar.gz'), recursive=True), desc='Extracting tsvs'):
+        if os.path.isd(tar)
+        extract_covost_2_tsvs(tar, os.path.join(root_path, 'tsvs/'), '/home/knoriy/fsx/raw_datasets/CoVoST_2/cv-corpus-10.0-2022-07-04/en/validated.tsv')
 
     # load metadata and configure audio paths
     tsvs = []
@@ -180,13 +182,13 @@ def extract_covost_2_tsvs(tsv_tar_dir:str, dest:str, cv_tsv:str, version=2):
 
         df = pd.DataFrame(data)[:2]
         
-        dest_path = os.path.join(root_path.replace('raw_datasets', 'processed_datasets'), LANGUAGE, IS_TRAIN_VAL_OR_TEST)
+        dest_path = os.path.join(root_path.replace('raw_datasets', 'processed_datasets'), LANGUAGE, IS_TRAIN_VAL_OR_TEST, "/")
         os.makedirs(dest_path, exist_ok=True)
 
         split_all_audio_files(df, dest_path)
-        # tardir(dest_path, dest_path, chunk, delete_file=True)
+        tardir(dest_path, dest_path, chunk, delete_file=True)
 
-        # # upload to s3 and delete local
-        # s3.put(dest_path, os.path.join(s3_dest, key), recursive=True)
+        # upload to s3 and delete local
+        # s3.put(dest_path, os.path.join(s3_dest, LANGUAGE, IS_TRAIN_VAL_OR_TEST)+'/', recursive=True)
         # shutil.rmtree(dest_path)
-
+        print(dest_path)

From 2c3036d7b56a661ed9c04dbeb1c44722914c455c Mon Sep 17 00:00:00 2001
From: knoriy <knoriy@ip-172-31-37-81.ec2.internal>
Date: Tue, 2 Aug 2022 14:46:55 +0000
Subject: [PATCH 22/40] setup for eng to X

---
 current_dataset/preprocess_CoVoST.py | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/current_dataset/preprocess_CoVoST.py b/current_dataset/preprocess_CoVoST.py
index 3a08487..35f463c 100644
--- a/current_dataset/preprocess_CoVoST.py
+++ b/current_dataset/preprocess_CoVoST.py
@@ -30,7 +30,6 @@ def convert_and_json_dump(file:str, dest:str, df, overwrite:bool=False, verbose=
         return
     audio_to_flac(file, dest)
     with open(dest.replace('.flac', '.json'), 'w') as f:
-        print(dest)
         json.dump({'filename': os.path.join(*dest.split('/')[3:]), 'text':[df['text']], 'original_data':df['original_data']}, f)
 
 def split_all_audio_files(df, dest_root_path, max_workers=96):
@@ -153,9 +152,9 @@ def extract_covost_2_tsvs(tsv_tar_dir:str, dest:str, cv_tsv:str, version=2):
     download_tsvs(eng_2_x, os.path.join(root_path, 'tsvs/'))
     # download_tsvs(x_2_eng, "/home/knoriy/fsx/raw_datasets/CoVoST_2/tsvs")
 
-    for tar in tqdm.tqdm(glob.glob(os.path.join(root_path, 'tsvs/**/*.tar.gz'), recursive=True), desc='Extracting tsvs'):
-        if os.path.isd(tar)
-        extract_covost_2_tsvs(tar, os.path.join(root_path, 'tsvs/'), '/home/knoriy/fsx/raw_datasets/CoVoST_2/cv-corpus-10.0-2022-07-04/en/validated.tsv')
+    # uncomment to extract and create CoVoST tsvs
+    # for tar in tqdm.tqdm(glob.glob(os.path.join(root_path, 'tsvs/**/*.tar.gz'), recursive=True), desc='Extracting tsvs'):
+    #     extract_covost_2_tsvs(tar, os.path.join(root_path, 'tsvs/'), '/home/knoriy/fsx/raw_datasets/CoVoST_2/cv-corpus-10.0-2022-07-04/en/validated.tsv')
 
     # load metadata and configure audio paths
     tsvs = []
@@ -180,15 +179,14 @@ def extract_covost_2_tsvs(tsv_tar_dir:str, dest:str, cv_tsv:str, version=2):
                 }
             )
 
-        df = pd.DataFrame(data)[:2]
+        df = pd.DataFrame(data)
         
-        dest_path = os.path.join(root_path.replace('raw_datasets', 'processed_datasets'), LANGUAGE, IS_TRAIN_VAL_OR_TEST, "/")
+        dest_path = os.path.join(root_path.replace('raw_datasets', 'processed_datasets'), LANGUAGE, IS_TRAIN_VAL_OR_TEST, '')
         os.makedirs(dest_path, exist_ok=True)
 
         split_all_audio_files(df, dest_path)
         tardir(dest_path, dest_path, chunk, delete_file=True)
 
         # upload to s3 and delete local
-        # s3.put(dest_path, os.path.join(s3_dest, LANGUAGE, IS_TRAIN_VAL_OR_TEST)+'/', recursive=True)
-        # shutil.rmtree(dest_path)
-        print(dest_path)
+        s3.put(dest_path, os.path.join(s3_dest, LANGUAGE, IS_TRAIN_VAL_OR_TEST)+'/', recursive=True)
+        shutil.rmtree(dest_path)

From fb5d9b7f6d83e367c48554d1aaddaaa005d8d535 Mon Sep 17 00:00:00 2001
From: knoriy <knoriy@ip-172-31-37-81.ec2.internal>
Date: Thu, 11 Aug 2022 09:16:45 +0000
Subject: [PATCH 23/40] Fixed json

---
 current_dataset/preprocess_GigaSpeech.py | 2 --
 current_dataset/preprocess_mswc.py       | 2 --
 2 files changed, 4 deletions(-)

diff --git a/current_dataset/preprocess_GigaSpeech.py b/current_dataset/preprocess_GigaSpeech.py
index 245dbd6..1a7a94e 100644
--- a/current_dataset/preprocess_GigaSpeech.py
+++ b/current_dataset/preprocess_GigaSpeech.py
@@ -24,8 +24,6 @@
 def convert_and_json_dump(file:str, dest:str, df, overwrite:bool=False):
     if os.path.isfile(dest) and overwrite==False:
         print(f'{dest} already exists, skiping')
-        with open(dest.replace('.flac', '.json'), 'w') as f:
-            json.dump({'filename': os.path.join(*dest.split('/')[5:]), 'text':[df['text']], 'original_data':df['original_data']}, f)
         return
     
     audio_to_flac(file, dest, segment_start=df['begin_time'], segment_end=df['end_time'])
diff --git a/current_dataset/preprocess_mswc.py b/current_dataset/preprocess_mswc.py
index 84191a8..31a43eb 100644
--- a/current_dataset/preprocess_mswc.py
+++ b/current_dataset/preprocess_mswc.py
@@ -21,8 +21,6 @@
 from utils.make_tar_utils import tardir
 
 def convert_and_json_dump(file:str, dest:str, df, overwrite:bool=False):
-    with open(dest.replace('.flac', '.json'), 'w') as f:
-        json.dump({'filename': os.path.join(*dest.split('/')[4:]), 'text':[df['WORD']], 'original_data':{'gender':df['GENDER'], 'language':dest.split('/')[-2]}}, f)
     if os.path.isfile(dest) and overwrite==False:
         print(f'{dest} already exists, skiping')
         return

From 68e37c09b4fcd2288807de308e875e0ed72b9a92 Mon Sep 17 00:00:00 2001
From: knoriy <knoriy@ip-172-31-36-138.ec2.internal>
Date: Mon, 15 Aug 2022 11:58:47 +0000
Subject: [PATCH 24/40] modified to process given language

---
 current_dataset/preprocess_mswc.py | 107 +++++++++++++++--------------
 1 file changed, 56 insertions(+), 51 deletions(-)

diff --git a/current_dataset/preprocess_mswc.py b/current_dataset/preprocess_mswc.py
index 31a43eb..d639fb9 100644
--- a/current_dataset/preprocess_mswc.py
+++ b/current_dataset/preprocess_mswc.py
@@ -42,6 +42,10 @@ def split_all_audio_files(df, src_root_path, dest_root_path, max_workers=96):
 
 if __name__ == '__main__':
     import multiprocessing
+    import argparse
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--job", required=True)
+    args = parser.parse_args()
     
     max_workers = multiprocessing.cpu_count()
     chunk = 512
@@ -58,58 +62,59 @@ def split_all_audio_files(df, src_root_path, dest_root_path, max_workers=96):
     if not language_tars_dirs:
         raise FileNotFoundError(f"Please check that the file have been extracted: {root_path}")
 
-    for dir in tqdm.tqdm(language_tars_dirs, desc=f'processing: '):
-        if dir == '/home/knoriy/fsx/raw_datasets/mswc/audio/en.tar.gz':
-            audio_path = dir
+    dir = args.job
+
+    if dir == '/home/knoriy/fsx/raw_datasets/mswc/audio/en.tar.gz':
+        audio_path = dir
+        audio_path = os.path.split(audio_path)[0]
+
+        splits_path = dir.replace('audio', 'splits')
+        splits_path = splits_path.replace('.tar.gz', '/')
+    else:
+        audio_path = dir
+        with tarfile.open(audio_path, mode='r:gz') as mswc_audio:
             audio_path = os.path.split(audio_path)[0]
+            mswc_audio.extractall(audio_path)
 
-            splits_path = dir.replace('audio', 'splits')
+        splits_path = dir.replace('audio', 'splits')
+        with tarfile.open(splits_path, mode='r:gz') as mswc_split:
             splits_path = splits_path.replace('.tar.gz', '/')
+            mswc_split.extractall(splits_path)
+
+    tmp = glob.glob(os.path.join(splits_path, '**.csv'), recursive=True)
+    csv_paths = []
+    for csv_path in tmp:
+        if '_splits.csv' not in csv_path:
+            csv_paths.append(csv_path)
+
+    for csv_path in csv_paths:
+        if 'train' in csv_path:
+            train_test_dev = 'train/'
+        elif 'test' in csv_path:
+            train_test_dev = 'test/'
+        elif 'dev' in csv_path:
+            train_test_dev = 'valid/'
         else:
-            audio_path = dir
-            with tarfile.open(audio_path, mode='r:gz') as mswc_audio:
-                audio_path = os.path.split(audio_path)[0]
-                mswc_audio.extractall(audio_path)
-
-            splits_path = dir.replace('audio', 'splits')
-            with tarfile.open(splits_path, mode='r:gz') as mswc_split:
-                splits_path = splits_path.replace('.tar.gz', '/')
-                mswc_split.extractall(splits_path)
-
-        tmp = glob.glob(os.path.join(splits_path, '**.csv'), recursive=True)
-        csv_paths = []
-        for csv_path in tmp:
-            if '_splits.csv' not in csv_path:
-                csv_paths.append(csv_path)
-
-        for csv_path in csv_paths:
-            if 'train' in csv_path:
-                train_test_dev = 'train/'
-            elif 'test' in csv_path:
-                train_test_dev = 'test/'
-            elif 'dev' in csv_path:
-                train_test_dev = 'valid/'
-            else:
-                train_test_dev = 'other/'
-            df = pd.read_csv(csv_path)
-
-            # Convert to .flac
-            dest_path  = splits_path.replace('.tar.gz', '/').replace('/raw_datasets/', '/processed_datasets/').replace('splits/', '')
-            dest_path  = os.path.join(dest_path, train_test_dev)
-
-            src_path = os.path.join(splits_path.replace('.tar.gz', '/').replace('splits/', 'audio/'), 'clips')
-            os.makedirs(dest_path, exist_ok=True)
-            os.makedirs(src_path, exist_ok=True)
-
-            split_all_audio_files(df, src_path, dest_path, max_workers)
-
-            tardir(dest_path, dest_path, chunk, delete_file=True)
-
-            # upload to s3 and delete local
-            s3.put(dest_path, os.path.join(s3_dest, os.path.basename(dir.split('.')[0]), train_test_dev), recursive=True)
-            print('File Uploaded to: ', os.path.join(s3_dest, os.path.basename(dir.split('.')[0]), train_test_dev))
-            shutil.rmtree(dest_path)
-
-        # clean extracted files
-        shutil.rmtree(splits_path.replace('splits/', 'audio/'))
-        shutil.rmtree(splits_path)
\ No newline at end of file
+            train_test_dev = 'other/'
+        df = pd.read_csv(csv_path)
+
+        # Convert to .flac
+        dest_path  = splits_path.replace('.tar.gz', '/').replace('/raw_datasets/', '/processed_datasets/').replace('splits/', '')
+        dest_path  = os.path.join(dest_path, train_test_dev)
+
+        src_path = os.path.join(splits_path.replace('.tar.gz', '/').replace('splits/', 'audio/'), 'clips')
+        os.makedirs(dest_path, exist_ok=True)
+        os.makedirs(src_path, exist_ok=True)
+
+        split_all_audio_files(df, src_path, dest_path, max_workers)
+
+        tardir(dest_path, dest_path, chunk, delete_file=True)
+
+        # upload to s3 and delete local
+        s3.put(dest_path, os.path.join(s3_dest, os.path.basename(dir.split('.')[0]), train_test_dev), recursive=True)
+        print('File Uploaded to: ', os.path.join(s3_dest, os.path.basename(dir.split('.')[0]), train_test_dev))
+        shutil.rmtree(dest_path)
+
+    # clean extracted files
+    shutil.rmtree(splits_path.replace('splits/', 'audio/'))
+    shutil.rmtree(splits_path)
\ No newline at end of file

From 8ca65097824812ec299c796acd8083bcdf11274e Mon Sep 17 00:00:00 2001
From: knoriy <knoriy@ip-172-31-36-138.ec2.internal>
Date: Tue, 16 Aug 2022 17:07:59 +0000
Subject: [PATCH 25/40] Fixed bug when a trailing / is needed

---
 utils/make_tar_utils.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/utils/make_tar_utils.py b/utils/make_tar_utils.py
index 91d7bc4..a55c553 100644
--- a/utils/make_tar_utils.py
+++ b/utils/make_tar_utils.py
@@ -36,15 +36,15 @@ def tardir(
     if n_split * n_entry_each != len(filelist):
         n_split += 1
     size_dict = {
-        os.path.basename(tar_name) + str(i) + ".tar": n_entry_each
+        os.path.join(os.path.basename(tar_name), str(i) + ".tar"): n_entry_each
         for i in range(n_split)
     }
     if n_split * n_entry_each != len(filelist):
-        size_dict[os.path.basename(tar_name) + str(n_split - 1) + ".tar"] = (
+        size_dict[os.path.join(os.path.basename(tar_name), str(n_split - 1) + ".tar")] = (
             len(filelist) - (n_split - 1) * n_entry_each
         )
     for i in tqdm(range(start_idx, n_split + start_idx), desc='Creating .tar file:'):
-        with tarfile.open(tar_name + str(i) + ".tar", "w") as tar_handle:
+        with tarfile.open(os.path.join(tar_name, str(i) + ".tar"), "w") as tar_handle:
             for j in range(count, len(filelist)):
                 audio = filelist[j]
                 basename = ".".join(audio.split(".")[:-1])

From a664c0db12da782f1c032a1e3887c351dc1e6025 Mon Sep 17 00:00:00 2001
From: knoriy <knoriy@ip-172-31-36-138.ec2.internal>
Date: Tue, 16 Aug 2022 17:08:43 +0000
Subject: [PATCH 26/40] processing code for EmoV-DB

---
 current_dataset/preprocess_EmoV_DB.py | 109 ++++++++++++++++++++++++++
 1 file changed, 109 insertions(+)
 create mode 100644 current_dataset/preprocess_EmoV_DB.py

diff --git a/current_dataset/preprocess_EmoV_DB.py b/current_dataset/preprocess_EmoV_DB.py
new file mode 100644
index 0000000..5e4eaf8
--- /dev/null
+++ b/current_dataset/preprocess_EmoV_DB.py
@@ -0,0 +1,109 @@
+import os
+from sre_parse import Verbose
+import sys
+import json
+import tqdm
+import pandas as pd
+import pathlib
+import fsspec
+import shutil
+
+
+from multiprocessing import Pool
+from itertools import repeat
+from sklearn.model_selection import train_test_split
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
+from utils.audio_utils import audio_to_flac
+from utils.make_tar_utils import tardir
+
+
+def convert_and_json_dump(df:pd.DataFrame, overwrite:bool=False, verbose:bool=False):
+    dest = df['dest']
+    file = df['path']
+
+    os.makedirs(pathlib.Path(dest).parent, exist_ok=True)
+
+    if os.path.isfile(dest) and overwrite==False:
+        if verbose==True:
+            print(f'{dest} already exists, skiping')
+        return
+    audio_to_flac(file, dest)
+    with open(dest.replace('.flac', '.json'), 'w') as f:
+        json.dump({'filename': os.path.join(*dest.split('/')[3:]), 'text':[df['text']], 'original_data':None}, f)
+    return dest.replace('.flac', '.json')
+
+def extract_tars(dir:pathlib.Path, dest:pathlib.Path):
+    """Unpack every *.tar.gz found under `dir` into dest/<archive name split on '_'>/."""
+    for path in dir.glob("**/*.tar.gz"):
+        # e.g. "bea_amused.tar.gz" -> dest/bea/amused (stem keeps ".tar"; with_suffix('') drops it)
+        tmp_dest = dest.joinpath(*(path.stem.split('_'))).with_suffix('')
+        tmp_dest.mkdir(parents=True, exist_ok=True)
+        # shutil instead of an unquoted os.system('tar ...') string: paths containing spaces or
+        # shell metacharacters are handled, and a failed extraction raises instead of being ignored.
+        shutil.unpack_archive(str(path), str(tmp_dest))
+
+
+def run_tasks(extract:bool=False, overwrite:bool=False, verbose:bool=False, chunksize:int=1):
+    """Optionally extract EmoV-DB, convert each split to flac+json, tar it, upload to S3, then delete the local copies."""
+    dataset_name = 'EmoV_DB'
+    chunk = 512
+
+    s3 = fsspec.filesystem('s3')
+    s3_dest = pathlib.Path(f's-laion/knoriy/{dataset_name}/{dataset_name}_tars/')
+
+    root_data_dir = pathlib.Path('/home/knoriy/fsx/raw_datasets/EmoV_db/')
+    extracted_data_dir = pathlib.Path('/home/knoriy/fsx/raw_datasets/EmoV_db/raw/')
+    if extract:
+        extract_tars(root_data_dir, extracted_data_dir)
+    
+    raw_df = pd.read_csv(root_data_dir.joinpath('cmuarctic.csv'), sep="\t", header=None)
+    # 70/30 train/test split; the 30% is then split again 70/30 into test/valid (unseeded, so not reproducible).
+    glob = extracted_data_dir.glob('**/**/*.wav')
+    train, test = train_test_split(list(glob), test_size=0.3)
+    test, valid = train_test_split(list(test), test_size=0.3)
+    train_test_valid = {'train':train, 'test':test, 'valid':valid}
+
+    EmoV_DB_gender = {'sam':'male', 'jenie':'female', 'josh':'male', 'bea':'female'}
+
+    for key in train_test_valid:
+        dest_path = None
+        df_data = []
+        for i, path in enumerate(train_test_valid[key]):
+            root_path = path.parents[0]
+            file_name = path.name
+            emotion = root_path.name
+            actor = root_path.parents[0].name
+            dest_path = str(path.parents[3].joinpath('EmoV_DB_tars', key)).replace('raw_datasets', 'processed_datasets')
+            # The trailing number of the file name, minus one, selects the transcript row of cmuarctic.csv.
+            current_file = raw_df.loc[int(file_name.split('.')[0].split('_')[-1])-1]
+
+            data = {}
+
+            data['gender'] = EmoV_DB_gender[actor]
+            data['emotion'] = emotion
+            data['path'] = path
+            data['dest'] = str(pathlib.Path(dest_path).joinpath(f'{i}.flac'))
+            data['text'] = current_file[1]
+
+            df_data.append(data)
+
+        df = pd.DataFrame(df_data)
+
+        print(f'starting pool for {key}')
+        with Pool() as pool:
+            for result in tqdm.tqdm(pool.starmap(convert_and_json_dump, zip(df.iloc, repeat(overwrite), repeat(verbose)), chunksize=chunksize), total=len(df_data)):
+                pass
+
+        tardir(dest_path, dest_path, chunk, delete_file=True)
+
+        # upload to s3 and delete local
+        s3.put(dest_path, s3_dest.joinpath(key), recursive=True)
+        print('File Uploaded to: ', s3_dest.joinpath(key))
+        shutil.rmtree(dest_path)
+    
+    # clean Extracted Files
+    shutil.rmtree(extracted_data_dir)
+
+if __name__ == '__main__':
+    run_tasks(extract=True, chunksize=10)

From 5323d86fa14e1d92cf865b5bea8180f353c2cb6c Mon Sep 17 00:00:00 2001
From: knoriy <knoriy@ip-172-31-36-138.ec2.internal>
Date: Tue, 16 Aug 2022 17:32:10 +0000
Subject: [PATCH 27/40] processing and cleaning old files

---
 current_dataset/preprocess_EmoV_DB.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/current_dataset/preprocess_EmoV_DB.py b/current_dataset/preprocess_EmoV_DB.py
index 5e4eaf8..3d2aab5 100644
--- a/current_dataset/preprocess_EmoV_DB.py
+++ b/current_dataset/preprocess_EmoV_DB.py
@@ -30,7 +30,7 @@ def convert_and_json_dump(df:pd.DataFrame, overwrite:bool=False, verbose:bool=Fa
         return
     audio_to_flac(file, dest)
     with open(dest.replace('.flac', '.json'), 'w') as f:
-        json.dump({'filename': os.path.join(*dest.split('/')[3:]), 'text':[df['text']], 'original_data':None}, f)
+        json.dump({'filename': os.path.join(*dest.split('/')[3:]), 'text':[df['text']], 'original_data':df['original_data']}, f)
     return dest.replace('.flac', '.json')
 
 def extract_tars(dir:pathlib.Path, dest:pathlib.Path):
@@ -50,7 +50,7 @@ def run_tasks(extract:bool=False, overwrite:bool=False, verbose:bool=False, chun
     chunk = 512
 
     s3 = fsspec.filesystem('s3')
-    s3_dest = pathlib.Path(f's-laion/knoriy/{dataset_name}/{dataset_name}_tars/')
+    s3_dest = pathlib.Path(f's-laion-audio/webdataset_tar/{dataset_name}/')
 
     root_data_dir = pathlib.Path('/home/knoriy/fsx/raw_datasets/EmoV_db/')
     extracted_data_dir = pathlib.Path('/home/knoriy/fsx/raw_datasets/EmoV_db/raw/')
@@ -84,7 +84,9 @@ def run_tasks(extract:bool=False, overwrite:bool=False, verbose:bool=False, chun
             data['emotion'] = emotion
             data['path'] = path
             data['dest'] = str(pathlib.Path(dest_path).joinpath(f'{i}.flac'))
-            data['text'] = current_file[1]
+            data['text'] = f'A {EmoV_DB_gender[actor]} saying "{current_file[1]}" in a {emotion} voice'
+            data['original_data'] = {'gender':EmoV_DB_gender[actor], 'emotion':emotion, 'raw_text':current_file[1]}
+
 
             df_data.append(data)
 

From bc1fbad569cb9526e605b7f2bb1d0326f002462f Mon Sep 17 00:00:00 2001
From: knoriy <knoriy@ip-172-31-36-138.ec2.internal>
Date: Mon, 22 Aug 2022 11:21:03 +0000
Subject: [PATCH 28/40] Improved multiprocessing workload, processing improved
 from 3 it/s to 30000+ it/s

---
 current_dataset/preprocess_mswc.py | 71 ++++++++++++++----------------
 1 file changed, 32 insertions(+), 39 deletions(-)

diff --git a/current_dataset/preprocess_mswc.py b/current_dataset/preprocess_mswc.py
index d639fb9..d7b4c9f 100644
--- a/current_dataset/preprocess_mswc.py
+++ b/current_dataset/preprocess_mswc.py
@@ -14,13 +14,19 @@
 import shutil
 import fsspec
 
-from concurrent.futures import ThreadPoolExecutor, as_completed
+import multiprocessing
+from multiprocessing import Pool
+from itertools import repeat
+
 
 sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
 from utils.audio_utils import audio_to_flac
 from utils.make_tar_utils import tardir
 
-def convert_and_json_dump(file:str, dest:str, df, overwrite:bool=False):
+def convert_and_json_dump(df:pd.DataFrame, overwrite:bool=False, verbose:bool=False):
+    dest = df['dest_path']
+    file = df['src_path']
+
     if os.path.isfile(dest) and overwrite==False:
         print(f'{dest} already exists, skiping')
         return
@@ -28,23 +34,16 @@ def convert_and_json_dump(file:str, dest:str, df, overwrite:bool=False):
     with open(dest.replace('.flac', '.json'), 'w') as f:
         json.dump({'filename': os.path.join(*dest.split('/')[4:]), 'text':[df['WORD']], 'original_data':{'gender':df['GENDER'], 'language':dest.split('/')[-2]}}, f)
 
-
-def split_all_audio_files(df, src_root_path, dest_root_path, max_workers=96):
-    if not os.path.exists(dest_root_path):
-        raise FileNotFoundError(f'Please Check {dest_root_path} exists')
-
-    l = len(df)
-    with tqdm.tqdm(total=l, desc=f'Processing {dest_root_path}') as pbar:
-        with ThreadPoolExecutor(max_workers=max_workers) as executor:
-            threads = [executor.submit(convert_and_json_dump, os.path.join(src_root_path, row['LINK']), os.path.join(dest_root_path, f'{i}.flac'), row) for i, row in enumerate(df.iloc())]
-            for _ in as_completed(threads):
-                pbar.update(1)
+def split_all_audio_files(df, overwrite:bool=False, verbose:bool=False, chunksize:int=1):
+    print(f'starting pool')
+    with Pool() as pool:
+        for result in tqdm.tqdm(pool.starmap(convert_and_json_dump, zip(df.iloc, repeat(overwrite), repeat(verbose)), chunksize=chunksize), total=len(df)):
+            pass
 
 if __name__ == '__main__':
-    import multiprocessing
     import argparse
     parser = argparse.ArgumentParser()
-    parser.add_argument("--job", required=True)
+    parser.add_argument("--job", help='Directory to the files to process, e.g. "/home/knoriy/fsx/raw_datasets/mswc/audio/fr.tar.gz" ', required=True)
     args = parser.parse_args()
     
     max_workers = multiprocessing.cpu_count()
@@ -64,22 +63,14 @@ def split_all_audio_files(df, src_root_path, dest_root_path, max_workers=96):
 
     dir = args.job
 
-    if dir == '/home/knoriy/fsx/raw_datasets/mswc/audio/en.tar.gz':
-        audio_path = dir
-        audio_path = os.path.split(audio_path)[0]
+    with tarfile.open(dir, mode='r:gz') as mswc_audio:
+        audio_path = os.path.split(dir)[0]
+        mswc_audio.extractall(audio_path)
 
-        splits_path = dir.replace('audio', 'splits')
+    splits_path = dir.replace('audio', 'splits')
+    with tarfile.open(splits_path, mode='r:gz') as mswc_split:
         splits_path = splits_path.replace('.tar.gz', '/')
-    else:
-        audio_path = dir
-        with tarfile.open(audio_path, mode='r:gz') as mswc_audio:
-            audio_path = os.path.split(audio_path)[0]
-            mswc_audio.extractall(audio_path)
-
-        splits_path = dir.replace('audio', 'splits')
-        with tarfile.open(splits_path, mode='r:gz') as mswc_split:
-            splits_path = splits_path.replace('.tar.gz', '/')
-            mswc_split.extractall(splits_path)
+        mswc_split.extractall(splits_path)
 
     tmp = glob.glob(os.path.join(splits_path, '**.csv'), recursive=True)
     csv_paths = []
@@ -96,8 +87,6 @@ def split_all_audio_files(df, src_root_path, dest_root_path, max_workers=96):
             train_test_dev = 'valid/'
         else:
             train_test_dev = 'other/'
-        df = pd.read_csv(csv_path)
-
         # Convert to .flac
         dest_path  = splits_path.replace('.tar.gz', '/').replace('/raw_datasets/', '/processed_datasets/').replace('splits/', '')
         dest_path  = os.path.join(dest_path, train_test_dev)
@@ -106,15 +95,19 @@ def split_all_audio_files(df, src_root_path, dest_root_path, max_workers=96):
         os.makedirs(dest_path, exist_ok=True)
         os.makedirs(src_path, exist_ok=True)
 
+        df = pd.read_csv(csv_path)
+        df['dest_path'] = [os.path.join(dest_path, f'{i}.flac') for i, _ in enumerate(df.iloc())]
+        df['src_path'] = [os.path.join(src_path, row['LINK']) for i, row in enumerate(df.iloc())]
+
         split_all_audio_files(df, src_path, dest_path, max_workers)
 
-        tardir(dest_path, dest_path, chunk, delete_file=True)
+    #     tardir(dest_path, dest_path, chunk, delete_file=True)
 
-        # upload to s3 and delete local
-        s3.put(dest_path, os.path.join(s3_dest, os.path.basename(dir.split('.')[0]), train_test_dev), recursive=True)
-        print('File Uploaded to: ', os.path.join(s3_dest, os.path.basename(dir.split('.')[0]), train_test_dev))
-        shutil.rmtree(dest_path)
+    #     # upload to s3 and delete local
+    #     s3.put(dest_path, os.path.join(s3_dest, os.path.basename(dir.split('.')[0]), train_test_dev), recursive=True)
+    #     print('File Uploaded to: ', os.path.join(s3_dest, os.path.basename(dir.split('.')[0]), train_test_dev))
+    #     shutil.rmtree(dest_path)
 
-    # clean extracted files
-    shutil.rmtree(splits_path.replace('splits/', 'audio/'))
-    shutil.rmtree(splits_path)
\ No newline at end of file
+    # # clean extracted files
+    # shutil.rmtree(splits_path.replace('splits/', 'audio/'))
+    # shutil.rmtree(splits_path)
\ No newline at end of file

From c0b7bec86934163e6e0f5297395d3cfbba459d8f Mon Sep 17 00:00:00 2001
From: knoriy <knoriy@cpu16-dy-m6i-4xlarge-1.hpc-1click-cpu.pcluster>
Date: Mon, 5 Dec 2022 16:47:42 +0000
Subject: [PATCH 29/40] add: processor for cv11.0

---
 current_dataset/preprocess_common_voice.py | 68 ++++++++++++++++++++++
 1 file changed, 68 insertions(+)
 create mode 100644 current_dataset/preprocess_common_voice.py

diff --git a/current_dataset/preprocess_common_voice.py b/current_dataset/preprocess_common_voice.py
new file mode 100644
index 0000000..1b14407
--- /dev/null
+++ b/current_dataset/preprocess_common_voice.py
@@ -0,0 +1,68 @@
+import os
+import sys
+import tqdm
+import json
+import pathlib
+import fsspec
+import shutil
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import soundfile as sf
+from datasets import load_dataset
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
+
+
+from utils.audio_utils import audio_to_flac
+from utils.make_tar_utils import tardir
+
+
+def convert_and_json_dump(file:str, dest:str, df, overwrite:bool=False):
+    if os.path.isfile(dest) and overwrite==False:
+        print(f'{dest} already exists, skiping')
+        return
+
+    sf.write(dest, df['audio']['array'], df['audio']['sampling_rate'])
+    with open(dest.replace('.flac', '.json'), 'w') as f:
+        json.dump({'filename': os.path.join(*dest.split('/')[5:]), 'text':[df['sentence']], 'original_data':{'up_votes':df['up_votes'], 'down_votes':df['down_votes'], 'age':df['age'], 'gender':df['gender'], 'accent':df['accent'], 'language':df['locale']}}, f)
+
+
+def split_all_audio_files(data, dest_root_path, max_workers=96):
+    if not os.path.exists(dest_root_path):
+        raise FileNotFoundError(f'Please Check {dest_root_path} exists')
+
+    l = len(data)
+    with tqdm.tqdm(total=l, desc=f'Processing {dest_root_path}') as pbar:
+        with ThreadPoolExecutor() as executor:
+            threads = [executor.submit(convert_and_json_dump, row["audio"]["path"], os.path.join(dest_root_path, f'{i}.flac'), row) for i, row in enumerate(data)]
+            for _ in as_completed(threads):
+                pbar.update(1)
+
+
+
+def main():
+    import multiprocessing
+    max_workers = multiprocessing.cpu_count()
+
+    langs = ['ab', 'ar', 'en', 'fa', 'fr', 'es', 'sl', 'kab', 'cy', 'ca', 'de', 'tt', 'ta', 'ru', 'nl', 'it', 'eu', 'tr', 'zh-TW', 'br', 'pt', 'eo', 'zh-CN', 'id', 'ia', 'lv', 'ja', 'rw', 'sv-SE', 'cnh', 'et', 'ky', 'ro', 'hsb', 'el', 'cs', 'pl', 'rm-sursilv', 'rm-vallader', 'mn', 'zh-HK', 'cv', 'uk', 'mt', 'as', 'ka', 'fy-NL', 'dv', 'pa-IN', 'vi', 'or', 'ga-IE', 'fi', 'hu', 'th', 'lt', 'lg', 'hi', 'bas', 'sk', 'kmr', 'bg', 'kk', 'ba', 'gl', 'ug', 'hy-AM', 'be', 'ur', 'gn', 'sr', 'uz', 'mr', 'da', 'myv', 'nn-NO', 'ha', 'ckb', 'ml', 'mdf', 'sw', 'sat', 'tig', 'ig', 'nan-tw', 'mhr', 'bn', 'tok', 'yue', 'sah', 'mk', 'sc', 'skr', 'ti', 'mrj', 'tw', 'vot', 'az', 'ast', 'ne-NP']
+    dataset_name = "common_voice_11_0"
+    s3 = fsspec.filesystem('s3')
+
+    for lang in langs[1:]:
+        for split in ["train", "test", "validation"]:
+            wikipedia_dataset = load_dataset(f"mozilla-foundation/{dataset_name}", lang, split=split)
+
+            if split == "validation": split = "valid"
+            root_dest_path = pathlib.Path(f"/fsx/knoriy/processed_datasets/common_voice/{lang}/{split}/")
+            root_dest_path.mkdir(parents=True, exist_ok=True)
+            s3_dest = f's-laion-audio/webdataset_tar/{dataset_name}/{lang}/{split}/'
+
+            split_all_audio_files(wikipedia_dataset, root_dest_path, max_workers)
+            tardir(str(root_dest_path), str(root_dest_path), 512, delete_file=True)
+
+            # upload to s3 and delete local
+            s3.put(str(root_dest_path), s3_dest, recursive=True)
+            shutil.rmtree(root_dest_path)
+        # break
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file

From 812e399ef79c7f836c22d9e538a71ef1248909f3 Mon Sep 17 00:00:00 2001
From: knoriy <knoriy@cpu16-dy-m6i-4xlarge-1.hpc-1click-cpu.pcluster>
Date: Mon, 5 Dec 2022 17:02:19 +0000
Subject: [PATCH 30/40] add: script to process common voice

---
 .../download_and_preprocess_common_voice.sh   | 26 +++++++++++++++++++
 1 file changed, 26 insertions(+)
 create mode 100644 download_script/download_and_preprocess_common_voice.sh

diff --git a/download_script/download_and_preprocess_common_voice.sh b/download_script/download_and_preprocess_common_voice.sh
new file mode 100644
index 0000000..9b23336
--- /dev/null
+++ b/download_script/download_and_preprocess_common_voice.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+#SBATCH --partition=cpu16
+#SBATCH --job-name=audio-dataset
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=8
+#SBATCH --comment clap
+#SBATCH --output=%x_%j.out
+#SBATCH --exclusive
+
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/nccl/build/lib:/opt/aws-ofi-nccl-install/lib
+export NCCL_PROTO=simple
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/aws-ofi-nccl/lib
+export PATH=$PATH:/opt/amazon/efa/bin:/opt/amazon/openmpi/bin
+export FI_EFA_FORK_SAFE=1
+export FI_LOG_LEVEL=1
+export FI_EFA_USE_DEVICE_RDMA=1 # use for p4dn
+export NCCL_DEBUG=info
+export OMPI_MCA_mtl_base_verbose=1
+export FI_EFA_ENABLE_SHM_TRANSFER=0
+export FI_PROVIDER=efa
+export FI_EFA_TX_MIN_CREDITS=64
+export NCCL_TREE_THRESHOLD=0
+
+echo Running job on $SLURM_JOB_NUM_NODES, 
+
+srun --comment clap /fsx/home-knoriy/miniconda3/envs/audio_dataset/bin/python /fsx/knoriy/code/audio-dataset/current_dataset/preprocess_common_voice.py
\ No newline at end of file

From 1c07775ce131aa4b8caf0852bb11f13bc5b95426 Mon Sep 17 00:00:00 2001
From: knoriy <knoriy@cpu16-dy-r6i-4xlarge-45.hpc-1click-cpu.pcluster>
Date: Fri, 13 Jan 2023 11:05:43 +0000
Subject: [PATCH 31/40] updated cmd

---
 current_dataset/preprocess_CREMA-D.py      | 16 ++++++--
 current_dataset/preprocess_common_voice.py | 48 +++++++++++++---------
 current_dataset/preprocess_mswc.py         | 20 +++++----
 current_dataset/start_slurm_jobs.py        |  7 ++++
 4 files changed, 59 insertions(+), 32 deletions(-)
 create mode 100644 current_dataset/start_slurm_jobs.py

diff --git a/current_dataset/preprocess_CREMA-D.py b/current_dataset/preprocess_CREMA-D.py
index 98e2af2..1d2a59d 100644
--- a/current_dataset/preprocess_CREMA-D.py
+++ b/current_dataset/preprocess_CREMA-D.py
@@ -16,6 +16,7 @@
 
 from sklearn.model_selection import train_test_split
 from concurrent.futures import ThreadPoolExecutor, as_completed
+from multiprocessing import Pool
 
 sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
 from utils.audio_utils import audio_to_flac
@@ -70,7 +71,7 @@ def create_df(root_path:str, dataset_name:str=None):
                                         'XX':'Unspecified',
                                         },
                 }
-    demographics = pd.read_csv('/home/knoriy/fsx/raw_datasets/CREMA-D/VideoDemographics.csv', names=["ActorID","Age","Sex","Race","Ethnicity"])
+    demographics = pd.read_csv('/fsx/knoriy/raw_datasets/CREMA-D/VideoDemographics.csv', names=["ActorID","Age","Sex","Race","Ethnicity"])
     df_data = []
     for wav in tqdm.tqdm(wavs):
         file_name = os.path.basename(wav).split('.')[0]
@@ -80,8 +81,8 @@ def create_df(root_path:str, dataset_name:str=None):
 
         male_or_female = 'woman' if demograpthics_meta["Sex"].values[0] == 'Female' else 'man'
         intensity = '' if text_meta[2] == 'Unspecified' else f'and {text_meta[2]} '
-        text = f'A {male_or_female} saying "{text_meta[0]}" in a {text_meta[1]} {intensity}voice.'
-        df_data.append({ 'path':wav, 'text':text, 'tag':{'transcript':text_meta[0], 'emotion':text_meta[1], 'gender':demograpthics_meta["Sex"].values[0], 'age':demograpthics_meta["Age"].values[0] }})
+        text = f'A {male_or_female} saying "{text_meta[0]}" in a {text_meta[1]} {intensity} voice.'
+        df_data.append({ 'path':wav, 'text':[text], 'tag':{'transcript':text_meta[0], 'language':'english', 'emotion':text_meta[1], 'gender':demograpthics_meta["Sex"].values[0], 'age':demograpthics_meta["Age"].values[0] }})
 
     return pd.DataFrame(df_data)
 
@@ -93,12 +94,19 @@ def create_df(root_path:str, dataset_name:str=None):
     print("Num workers: ", max_workers)
     chunk = 512
 
-    root_path = '/home/knoriy/fsx/raw_datasets/CREMA-D/AudioWAV/'
+    root_path = '/fsx/knoriy/raw_datasets/CREMA-D/AudioWAV/'
     dataset_name = 'CREMA-D'
 
     s3 = fsspec.filesystem('s3')
     s3_dest = f's-laion/knoriy/{dataset_name}/{dataset_name}_tars/'
 
+    original_tar_dir = '/fsx/knoriy/raw_datasets/CREMA-D/crema-d.tar.gz'
+
+    print('Extracting tar')
+    with tarfile.open(original_tar_dir, mode='r:gz') as file:
+        audio_path = os.path.split(original_tar_dir)[0]
+        file.extractall(audio_path)
+
     # load metadata and configure audio paths
     df = create_df(root_path)
 
diff --git a/current_dataset/preprocess_common_voice.py b/current_dataset/preprocess_common_voice.py
index 1b14407..0f3f75d 100644
--- a/current_dataset/preprocess_common_voice.py
+++ b/current_dataset/preprocess_common_voice.py
@@ -17,10 +17,10 @@
 
 
 def convert_and_json_dump(file:str, dest:str, df, overwrite:bool=False):
-    if os.path.isfile(dest) and overwrite==False:
+    if os.path.isfile(dest) and os.path.isfile(dest.replace('.flac', '.json')) and not overwrite:
         print(f'{dest} already exists, skiping')
         return
-
+    
     sf.write(dest, df['audio']['array'], df['audio']['sampling_rate'])
     with open(dest.replace('.flac', '.json'), 'w') as f:
         json.dump({'filename': os.path.join(*dest.split('/')[5:]), 'text':[df['sentence']], 'original_data':{'up_votes':df['up_votes'], 'down_votes':df['down_votes'], 'age':df['age'], 'gender':df['gender'], 'accent':df['accent'], 'language':df['locale']}}, f)
@@ -33,7 +33,7 @@ def split_all_audio_files(data, dest_root_path, max_workers=96):
     l = len(data)
     with tqdm.tqdm(total=l, desc=f'Processing {dest_root_path}') as pbar:
         with ThreadPoolExecutor() as executor:
-            threads = [executor.submit(convert_and_json_dump, row["audio"]["path"], os.path.join(dest_root_path, f'{i}.flac'), row) for i, row in enumerate(data)]
+            threads = [executor.submit(convert_and_json_dump, row["audio"]["path"], os.path.join(dest_root_path, f'{i}.flac'), row, False) for i, row in enumerate(data)]
             for _ in as_completed(threads):
                 pbar.update(1)
 
@@ -47,22 +47,30 @@ def main():
     dataset_name = "common_voice_11_0"
     s3 = fsspec.filesystem('s3')
 
-    for lang in langs[1:]:
-        for split in ["train", "test", "validation"]:
-            wikipedia_dataset = load_dataset(f"mozilla-foundation/{dataset_name}", lang, split=split)
-
-            if split == "validation": split = "valid"
-            root_dest_path = pathlib.Path(f"/fsx/knoriy/processed_datasets/common_voice/{lang}/{split}/")
-            root_dest_path.mkdir(parents=True, exist_ok=True)
-            s3_dest = f's-laion-audio/webdataset_tar/{dataset_name}/{lang}/{split}/'
-
-            split_all_audio_files(wikipedia_dataset, root_dest_path, max_workers)
-            tardir(str(root_dest_path), str(root_dest_path), 512, delete_file=True)
-
-            # upload to s3 and delete local
-            s3.put(str(root_dest_path), s3_dest, recursive=True)
-            shutil.rmtree(root_dest_path)
-        # break
+    with tqdm.tqdm(total=len(langs)) as pbar:
+        for lang in langs:
+            pbar.set_description(f'Prcessing {lang}')
+            for split in ["train", "test", "validation"]:
+                wikipedia_dataset = load_dataset(f"mozilla-foundation/{dataset_name}", lang, split=split)
+
+                if split == "validation": split = "valid"
+                root_dest_path = pathlib.Path(f"/fsx/knoriy/processed_datasets/{dataset_name}/{lang}/{split}/")
+                root_dest_path.mkdir(parents=True, exist_ok=True)
+
+                split_all_audio_files(wikipedia_dataset, root_dest_path, max_workers)
+                tardir(str(root_dest_path), str(root_dest_path), 512, delete_file=False)
+
+                # Upload only tar files to s3
+                tar_files = (root_dest_path.glob('*.tar'))
+                for tar in tar_files:
+                    # upload to s3 and delete local
+                    pbar.set_description(f'Prcessing {lang}: uploading {str(tar)} to s3')
+                    s3_dest = f's-laion-audio/webdataset_tar/{dataset_name}/{lang}/{split}/{tar.name}'
+                    s3.put(str(tar), s3_dest)
+                # shutil.rmtree(root_dest_path)
+                # break
+            pbar.update(1)
+            # break
 
 if __name__ == '__main__':
-    main()
\ No newline at end of file
+    main()
diff --git a/current_dataset/preprocess_mswc.py b/current_dataset/preprocess_mswc.py
index d7b4c9f..af268e8 100644
--- a/current_dataset/preprocess_mswc.py
+++ b/current_dataset/preprocess_mswc.py
@@ -50,8 +50,8 @@ def split_all_audio_files(df, overwrite:bool=False, verbose:bool=False, chunksiz
     chunk = 512
     generate_subset_tsv = True
 
-    root_path = '/home/knoriy/fsx/raw_datasets/mswc/'
-    tar_dir = "/home/knoriy/fsx/raw_datasets/mswc/mswc.tar.gz"
+    root_path = '/fsx/knoriy/raw_datasets/mswc/'
+    tar_dir = "/fsx/knoriy/raw_datasets/mswc/mswc.tar.gz"
     dataset_name = 'mswc'
 
     s3 = fsspec.filesystem('s3')
@@ -98,15 +98,19 @@ def split_all_audio_files(df, overwrite:bool=False, verbose:bool=False, chunksiz
         df = pd.read_csv(csv_path)
         df['dest_path'] = [os.path.join(dest_path, f'{i}.flac') for i, _ in enumerate(df.iloc())]
         df['src_path'] = [os.path.join(src_path, row['LINK']) for i, row in enumerate(df.iloc())]
+        
+        print("nan found", len(df[df.isna().any(axis=1)]))
+        df = df.dropna()
+        print("nan after drop:", len(df[df.isna().any(axis=1)]))
 
-        split_all_audio_files(df, src_path, dest_path, max_workers)
+        split_all_audio_files(df, overwrite=True, chunksize=max_workers)
 
-    #     tardir(dest_path, dest_path, chunk, delete_file=True)
+        tardir(dest_path, dest_path, chunk, delete_file=True)
 
-    #     # upload to s3 and delete local
-    #     s3.put(dest_path, os.path.join(s3_dest, os.path.basename(dir.split('.')[0]), train_test_dev), recursive=True)
-    #     print('File Uploaded to: ', os.path.join(s3_dest, os.path.basename(dir.split('.')[0]), train_test_dev))
-    #     shutil.rmtree(dest_path)
+        # upload to s3 and delete local
+        s3.put(dest_path, os.path.join(s3_dest, os.path.basename(dir.split('.')[0]), train_test_dev), recursive=True)
+        print('File Uploaded to: s3://', os.path.join(s3_dest, os.path.basename(dir.split('.')[0]), train_test_dev))
+        shutil.rmtree(dest_path)
 
     # # clean extracted files
     # shutil.rmtree(splits_path.replace('splits/', 'audio/'))
diff --git a/current_dataset/start_slurm_jobs.py b/current_dataset/start_slurm_jobs.py
new file mode 100644
index 0000000..a9a8b0d
--- /dev/null
+++ b/current_dataset/start_slurm_jobs.py
@@ -0,0 +1,7 @@
+import os
+import glob
+
+paths = glob.glob('/fsx/knoriy/raw_datasets/mswc/audio/*.tar.gz')
+
+for path in paths:
+    os.system( f"srun --comment clap --output=outs/%j.out --exclusive /fsx/home-knoriy/miniconda3/envs/audio_dataset/bin/python /fsx/knoriy/code/audio-dataset/current_dataset/preprocess_mswc.py --job {path} &")
\ No newline at end of file

From 850b12094dd6c6fc5a902cd29d53f651cb2bc2b9 Mon Sep 17 00:00:00 2001
From: knoriy <knoriy@cpu16-dy-r6i-4xlarge-45.hpc-1click-cpu.pcluster>
Date: Fri, 13 Jan 2023 13:22:35 +0000
Subject: [PATCH 32/40] automated extractors and updated text

---
 current_dataset/preprocess_CREMA-D.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/current_dataset/preprocess_CREMA-D.py b/current_dataset/preprocess_CREMA-D.py
index 1d2a59d..647737d 100644
--- a/current_dataset/preprocess_CREMA-D.py
+++ b/current_dataset/preprocess_CREMA-D.py
@@ -22,7 +22,7 @@
 from utils.audio_utils import audio_to_flac
 from utils.make_tar_utils import tardir
 
-def convert_and_json_dump(file:str, dest:str, df, overwrite:bool=False, verbose=False):
+def convert_and_json_dump(file:str, dest:str, df, overwrite:bool=True, verbose=False):
     if os.path.isfile(dest) and overwrite==False:
         if verbose==True:
             print(f'{dest} already exists, skiping')
@@ -81,8 +81,8 @@ def create_df(root_path:str, dataset_name:str=None):
 
         male_or_female = 'woman' if demograpthics_meta["Sex"].values[0] == 'Female' else 'man'
         intensity = '' if text_meta[2] == 'Unspecified' else f'and {text_meta[2]} '
-        text = f'A {male_or_female} saying "{text_meta[0]}" in a {text_meta[1]} {intensity} voice.'
-        df_data.append({ 'path':wav, 'text':[text], 'tag':{'transcript':text_meta[0], 'language':'english', 'emotion':text_meta[1], 'gender':demograpthics_meta["Sex"].values[0], 'age':demograpthics_meta["Age"].values[0] }})
+        text = f'A {male_or_female} saying "{text_meta[0]}" in a {text_meta[1]} {intensity} voice. {text_meta[2]}'
+        df_data.append({ 'path':wav, 'text':text, 'tag':{'transcript':text_meta[0], 'language':'english', 'emotion':text_meta[1], 'gender':demograpthics_meta["Sex"].values[0], 'age':demograpthics_meta["Age"].values[0] }})
 
     return pd.DataFrame(df_data)
 
@@ -98,14 +98,14 @@ def create_df(root_path:str, dataset_name:str=None):
     dataset_name = 'CREMA-D'
 
     s3 = fsspec.filesystem('s3')
-    s3_dest = f's-laion/knoriy/{dataset_name}/{dataset_name}_tars/'
+    s3_dest = f's-laion-audio/webdataset_tar/{dataset_name}/'
 
     original_tar_dir = '/fsx/knoriy/raw_datasets/CREMA-D/crema-d.tar.gz'
 
-    print('Extracting tar')
-    with tarfile.open(original_tar_dir, mode='r:gz') as file:
-        audio_path = os.path.split(original_tar_dir)[0]
-        file.extractall(audio_path)
+    # print('Extracting tar')
+    # with tarfile.open(original_tar_dir, mode='r:gz') as file:
+    #     audio_path = os.path.split(original_tar_dir)[0]
+    #     file.extractall(audio_path)
 
     # load metadata and configure audio paths
     df = create_df(root_path)

From a7265cf37797a6a6768eab690bbc957613a0b647 Mon Sep 17 00:00:00 2001
From: knoriy <knoriy@ip-172-31-38-94.ec2.internal>
Date: Mon, 16 Jan 2023 15:08:48 +0000
Subject: [PATCH 33/40] fix: fixed bug where language was not set correctly,
 being set as train, test or valid

---
 current_dataset/preprocess_mswc.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/current_dataset/preprocess_mswc.py b/current_dataset/preprocess_mswc.py
index af268e8..5d36731 100644
--- a/current_dataset/preprocess_mswc.py
+++ b/current_dataset/preprocess_mswc.py
@@ -32,7 +32,7 @@ def convert_and_json_dump(df:pd.DataFrame, overwrite:bool=False, verbose:bool=Fa
         return
     audio_to_flac(file, dest)
     with open(dest.replace('.flac', '.json'), 'w') as f:
-        json.dump({'filename': os.path.join(*dest.split('/')[4:]), 'text':[df['WORD']], 'original_data':{'gender':df['GENDER'], 'language':dest.split('/')[-2]}}, f)
+        json.dump({'filename': os.path.join(*dest.split('/')[4:]), 'text':[df['WORD']], 'original_data':{'gender':df['GENDER'], 'language':dest.split('/')[-3]}}, f)
 
 def split_all_audio_files(df, overwrite:bool=False, verbose:bool=False, chunksize:int=1):
     print(f'starting pool')
@@ -55,7 +55,7 @@ def split_all_audio_files(df, overwrite:bool=False, verbose:bool=False, chunksiz
     dataset_name = 'mswc'
 
     s3 = fsspec.filesystem('s3')
-    s3_dest = f's-laion/knoriy/mswc/{dataset_name}_tars/'
+    s3_dest = f's-laion-audio/webdataset_tar/{dataset_name}/'
 
     language_tars_dirs = sorted(glob.glob(os.path.join(root_path, "audio/**.tar.gz")))
     if not language_tars_dirs:

From 3eb72e7f87d2d7552894c159fa7bc66e6c15deca Mon Sep 17 00:00:00 2001
From: knoriy <knoriy@cpu16-dy-c6i-4xlarge-1.hpc-1click-cpu.pcluster>
Date: Thu, 4 May 2023 17:14:38 +0000
Subject: [PATCH 34/40] updated to include audioset descriptions

---
 data_preprocess/preprocess_audioset.py | 21 ++++----
 data_preprocess/preprocess_audioset.sh | 66 +++++++++++++-------------
 2 files changed, 44 insertions(+), 43 deletions(-)

diff --git a/data_preprocess/preprocess_audioset.py b/data_preprocess/preprocess_audioset.py
index fd5ae05..6f13fdd 100644
--- a/data_preprocess/preprocess_audioset.py
+++ b/data_preprocess/preprocess_audioset.py
@@ -4,6 +4,7 @@
 import glob
 from tqdm import tqdm
 import sys
+import json
 
 sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
 from utils.audio_utils import audio_to_flac
@@ -56,24 +57,24 @@ def process_single_audio(file_path, json_data, output_dir):
 
     # Load metadata
     unbalanced_csv_path = os.path.join(args.metadata_dir, f'{args.metadata_name}.csv')
-    with open(unbalanced_csv_path, 'r') as f:
+    with open(unbalanced_csv_path) as f:
         lines = f.readlines()
+        lines = lines[3:]
+        header_list = ['YTID', 'start_seconds', 'end_seconds', 'positive_labels']
+        class_metadata = [l.strip().split(', ') for l in lines]
+        class_metadata = pd.DataFrame(class_metadata, columns=header_list)
+        class_metadata = dict(zip(class_metadata.YTID, class_metadata.positive_labels))
 
-    lines = lines[3:]
-    header_list = ['YTID', 'start_seconds', 'end_seconds', 'positive_labels']
-    class_metadata = [l.strip().split(', ') for l in lines]
-    class_metadata = pd.DataFrame(class_metadata, columns=header_list)
+    with open(os.path.join(args.metadata_dir,'/ontology.json')) as f:
+        ontology = json.load(f)
+        ontology_dict = {i['id']: (i['name'], i['description']) for i in ontology}
 
-    class_to_name_map = pd.read_csv(os.path.join(args.metadata_dir, 'class_labels_indices.csv'))
-
-    class_metadata = dict(zip(class_metadata.YTID, class_metadata.positive_labels))
-    class_to_name_map = dict(zip(class_to_name_map.mid, class_to_name_map.display_name))
 
     wav_all = glob.glob(f'{args.wav_dir}/*.wav')
     futures = []
     for file in tqdm(wav_all):
         # process_single_audio(file, class_metadata, class_to_name_map, args.output_dir)
-        json_data = get_json(file, class_metadata, class_to_name_map)
+        json_data = get_json(file, class_metadata, ontology_dict)
         futures.append(
             executor.submit(partial(process_single_audio, file, json_data, args.output_dir)))
 
diff --git a/data_preprocess/preprocess_audioset.sh b/data_preprocess/preprocess_audioset.sh
index 83965e9..d9e6fa1 100644
--- a/data_preprocess/preprocess_audioset.sh
+++ b/data_preprocess/preprocess_audioset.sh
@@ -1,79 +1,79 @@
 #!/bin/bash
 
-# preliminary: create /mnt/audio_clip/audioset, clone code from audio-dataset
+# preliminary: create /tmp/audioset, clone code from audio-dataset
 
-cd /mnt/audio_clip/audioset
+cd /tmp/audioset
 mkdir metadata
 cd metadata
 
 wget http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/eval_segments.csv
 wget http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/balanced_train_segments.csv
 wget http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/unbalanced_train_segments.csv
-wget https://raw.githubusercontent.com/qiuqiangkong/audioset_tagging_cnn/master/metadata/class_labels_indices.csv
+wget https://raw.githubusercontent.com/audioset/ontology/master/ontology.json
 
 cd ~/audio-dataset
 
 for i in $(seq -w 00 40)
 do
-  aws s3 --region us-east-1 cp s3://laion-audio/raw_dataset/audioset/unbalanced_train_segments/unbalanced_train_segments_part"${i}"_partial.zip /mnt/audio_clip/audioset/zip/
-  aws s3 --region us-east-1 cp s3://laion-audio/raw_dataset/audioset/unbalanced_train_segments/unbalanced_train_segments_part"${i}"_partial.z01 /mnt/audio_clip/audioset/zip/
-  aws s3 --region us-east-1 cp s3://laion-audio/raw_dataset/audioset/unbalanced_train_segments/unbalanced_train_segments_part"${i}"_partial.z02 /mnt/audio_clip/audioset/zip/
+  aws s3 --region us-east-1 cp s3://laion-audio/raw_dataset/audioset/unbalanced_train_segments/unbalanced_train_segments_part"${i}"_partial.zip /tmp/audioset/zip/
+  aws s3 --region us-east-1 cp s3://laion-audio/raw_dataset/audioset/unbalanced_train_segments/unbalanced_train_segments_part"${i}"_partial.z01 /tmp/audioset/zip/
+  aws s3 --region us-east-1 cp s3://laion-audio/raw_dataset/audioset/unbalanced_train_segments/unbalanced_train_segments_part"${i}"_partial.z02 /tmp/audioset/zip/
 
-  7z e /mnt/audio_clip/audioset/zip/unbalanced_train_segments_part"${i}"_partial.zip -o/mnt/audio_clip/audioset/audios
+  7z e /tmp/audioset/zip/unbalanced_train_segments_part"${i}"_partial.zip -o/tmp/audioset/audios
 
   python data_preprocess/preprocess_audioset.py \
-  --metadata_dir /mnt/audio_clip/audioset/metadata \
+  --metadata_dir /tmp/audioset/metadata \
   --metadata_name unbalanced_train_segments \
-  --wav_dir /mnt/audio_clip/audioset/audios \
-  --output_dir /mnt/audio_clip/audioset/processed_data
+  --wav_dir /tmp/audioset/audios \
+  --output_dir /tmp/audioset/processed_data
 
-  rm /mnt/audio_clip/audioset/zip/unbalanced_train_segments_part"${i}"_partial*
-  rm -rf /mnt/audio_clip/audioset/audios
+  rm /tmp/audioset/zip/unbalanced_train_segments_part"${i}"_partial*
+  rm -rf /tmp/audioset/audios
 done
 
-aws s3 --region us-east-1 cp s3://laion-audio/raw_dataset/audioset/balanced_train_segments.zip /mnt/audio_clip/audioset/
-aws s3 --region us-east-1 cp s3://laion-audio/raw_dataset/audioset/eval_segments.zip /mnt/audio_clip/audioset/
+aws s3 --region us-east-1 cp s3://laion-audio/raw_dataset/audioset/balanced_train_segments.zip /tmp/audioset/
+aws s3 --region us-east-1 cp s3://laion-audio/raw_dataset/audioset/eval_segments.zip /tmp/audioset/
 
-cd /mnt/audio_clip/audioset/
+cd /tmp/audioset/
 unzip balanced_train_segments.zip
 unzip eval_segments.zip
 
 cd ~/audio-dataset
 
 python data_preprocess/preprocess_audioset.py \
---metadata_dir /mnt/audio_clip/audioset/metadata \
+--metadata_dir /tmp/audioset/metadata \
 --metadata_name balanced_train_segments \
---wav_dir /mnt/audio_clip/audioset/balanced_train_segments \
---output_dir /mnt/audio_clip/audioset/processed_data_balanced_train_segments
+--wav_dir /tmp/audioset/balanced_train_segments \
+--output_dir /tmp/audioset/processed_data_balanced_train_segments
 
 python data_preprocess/preprocess_audioset.py \
---metadata_dir /mnt/audio_clip/audioset/metadata \
+--metadata_dir /tmp/audioset/metadata \
 --metadata_name eval_segments \
---wav_dir /mnt/audio_clip/audioset/eval_segments \
---output_dir /mnt/audio_clip/audioset/processed_data_eval_segments
+--wav_dir /tmp/audioset/eval_segments \
+--output_dir /tmp/audioset/processed_data_eval_segments
 
-python data_check/remove_bad_flac.py --dir /mnt/audio_clip/audioset/processed_data_eval_segments
-python data_check/remove_bad_flac.py --dir /mnt/audio_clip/audioset/processed_data_balanced_train_segments
-python data_check/remove_bad_flac.py --dir /mnt/audio_clip/audioset/processed_data
+python data_check/remove_bad_flac.py --dir /tmp/audioset/processed_data_eval_segments
+python data_check/remove_bad_flac.py --dir /tmp/audioset/processed_data_balanced_train_segments
+python data_check/remove_bad_flac.py --dir /tmp/audioset/processed_data
 
 python ./utils/make_tar.py \
---input /mnt/audio_clip/audioset/processed_data \
---output /mnt/audio_clip/audioset/webdataset_tar/unbalanced_train/ \
+--input /tmp/audioset/processed_data \
+--output /tmp/audioset/webdataset_tar/unbalanced_train/ \
 --dataclass none \
 --delete_file
 
 python ./utils/make_tar.py \
---input /mnt/audio_clip/audioset/processed_data_balanced_train_segments \
---output /mnt/audio_clip/audioset/webdataset_tar/balanced_train/ \
+--input /tmp/audioset/processed_data_balanced_train_segments \
+--output /tmp/audioset/webdataset_tar/balanced_train/ \
 --dataclass none \
 --delete_file
 
 python ./utils/make_tar.py \
---input /mnt/audio_clip/audioset/processed_data_eval_segments \
---output /mnt/audio_clip/audioset/webdataset_tar/eval/ \
+--input /tmp/audioset/processed_data_eval_segments \
+--output /tmp/audioset/webdataset_tar/eval/ \
 --dataclass none \
 --delete_file
 
-aws s3 --region us-east-1 cp /mnt/audio_clip/audioset/webdataset_tar/balanced_train s3://laion-audio/webdataset_tar/audioset/balanced_train --recursive
-aws s3 --region us-east-1 cp /mnt/audio_clip/audioset/webdataset_tar/eval s3://laion-audio/webdataset_tar/audioset/eval --recursive
-aws s3 --region us-east-1 cp /mnt/audio_clip/audioset/webdataset_tar/unbalanced_train s3://laion-audio/webdataset_tar/audioset/unbalanced_train --recursive
\ No newline at end of file
+aws s3 --region us-east-1 cp /tmp/audioset/webdataset_tar/balanced_train s3://laion-audio/webdataset_tar/audioset_description/balanced_train --recursive
+aws s3 --region us-east-1 cp /tmp/audioset/webdataset_tar/eval s3://laion-audio/webdataset_tar/audioset_description/eval --recursive
+aws s3 --region us-east-1 cp /tmp/audioset/webdataset_tar/unbalanced_train s3://laion-audio/webdataset_tar/audioset_description/unbalanced_train --recursive
\ No newline at end of file

From bb85a09f6119bd1421b91b12c10247e98ece58d3 Mon Sep 17 00:00:00 2001
From: knoriy <knoriy@ip-172-31-44-155.ec2.internal>
Date: Mon, 19 Jun 2023 00:42:51 +0000
Subject: [PATCH 35/40] backup

---
 data_preprocess/preprocess_audioset.py |  2 +-
 data_preprocess/preprocess_audioset.sh | 26 +++++++++++++-------------
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/data_preprocess/preprocess_audioset.py b/data_preprocess/preprocess_audioset.py
index 6f13fdd..8308192 100644
--- a/data_preprocess/preprocess_audioset.py
+++ b/data_preprocess/preprocess_audioset.py
@@ -65,7 +65,7 @@ def process_single_audio(file_path, json_data, output_dir):
         class_metadata = pd.DataFrame(class_metadata, columns=header_list)
         class_metadata = dict(zip(class_metadata.YTID, class_metadata.positive_labels))
 
-    with open(os.path.join(args.metadata_dir,'/ontology.json')) as f:
+    with open(os.path.join(args.metadata_dir,'ontology.json')) as f:
         ontology = json.load(f)
         ontology_dict = {i['id']: (i['name'], i['description']) for i in ontology}
 
diff --git a/data_preprocess/preprocess_audioset.sh b/data_preprocess/preprocess_audioset.sh
index d9e6fa1..38cdaf4 100644
--- a/data_preprocess/preprocess_audioset.sh
+++ b/data_preprocess/preprocess_audioset.sh
@@ -11,15 +11,15 @@ wget http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/balanced_tr
 wget http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/unbalanced_train_segments.csv
 wget https://raw.githubusercontent.com/audioset/ontology/master/ontology.json
 
-cd ~/audio-dataset
+cd /fsx/knoriy/code/audio-dataset
 
 for i in $(seq -w 00 40)
 do
-  aws s3 --region us-east-1 cp s3://laion-audio/raw_dataset/audioset/unbalanced_train_segments/unbalanced_train_segments_part"${i}"_partial.zip /tmp/audioset/zip/
-  aws s3 --region us-east-1 cp s3://laion-audio/raw_dataset/audioset/unbalanced_train_segments/unbalanced_train_segments_part"${i}"_partial.z01 /tmp/audioset/zip/
-  aws s3 --region us-east-1 cp s3://laion-audio/raw_dataset/audioset/unbalanced_train_segments/unbalanced_train_segments_part"${i}"_partial.z02 /tmp/audioset/zip/
+  aws s3 cp s3://s-laion-audio/raw_dataset/audioset/unbalanced_train_segments/unbalanced_train_segments_part"${i}"_partial.zip /tmp/audioset/zip/
+  aws s3 cp s3://s-laion-audio/raw_dataset/audioset/unbalanced_train_segments/unbalanced_train_segments_part"${i}"_partial.z01 /tmp/audioset/zip/
+  aws s3 cp s3://s-laion-audio/raw_dataset/audioset/unbalanced_train_segments/unbalanced_train_segments_part"${i}"_partial.z02 /tmp/audioset/zip/
 
-  7z e /tmp/audioset/zip/unbalanced_train_segments_part"${i}"_partial.zip -o/tmp/audioset/audios
+  7z e /mnt/audio_clip/audioset/zip/unbalanced_train_segments_part"${i}"_partial.zip -o/mnt/audio_clip/audioset/audios
 
   python data_preprocess/preprocess_audioset.py \
   --metadata_dir /tmp/audioset/metadata \
@@ -27,18 +27,18 @@ do
   --wav_dir /tmp/audioset/audios \
   --output_dir /tmp/audioset/processed_data
 
-  rm /tmp/audioset/zip/unbalanced_train_segments_part"${i}"_partial*
-  rm -rf /tmp/audioset/audios
+  # rm /tmp/audioset/zip/unbalanced_train_segments_part"${i}"_partial*
+  # rm -rf /tmp/audioset/audios
 done
 
-aws s3 --region us-east-1 cp s3://laion-audio/raw_dataset/audioset/balanced_train_segments.zip /tmp/audioset/
-aws s3 --region us-east-1 cp s3://laion-audio/raw_dataset/audioset/eval_segments.zip /tmp/audioset/
+aws s3 cp s3://s-laion-audio/raw_dataset/audioset/balanced_train_segments.zip /tmp/audioset/
+aws s3 cp s3://s-laion-audio/raw_dataset/audioset/eval_segments.zip /tmp/audioset/
 
 cd /tmp/audioset/
 unzip balanced_train_segments.zip
 unzip eval_segments.zip
 
-cd ~/audio-dataset
+cd /fsx/knoriy/code/audio-dataset
 
 python data_preprocess/preprocess_audioset.py \
 --metadata_dir /tmp/audioset/metadata \
@@ -74,6 +74,6 @@ python ./utils/make_tar.py \
 --dataclass none \
 --delete_file
 
-aws s3 --region us-east-1 cp /tmp/audioset/webdataset_tar/balanced_train s3://laion-audio/webdataset_tar/audioset_description/balanced_train --recursive
-aws s3 --region us-east-1 cp /tmp/audioset/webdataset_tar/eval s3://laion-audio/webdataset_tar/audioset_description/eval --recursive
-aws s3 --region us-east-1 cp /tmp/audioset/webdataset_tar/unbalanced_train s3://laion-audio/webdataset_tar/audioset_description/unbalanced_train --recursive
\ No newline at end of file
+aws s3 cp /tmp/audioset/webdataset_tar/balanced_train s3://s-laion-audio/webdataset_tar/audioset_description/balanced_train --recursive
+aws s3 cp /tmp/audioset/webdataset_tar/eval s3://s-laion-audio/webdataset_tar/audioset_description/eval --recursive
+aws s3 cp /tmp/audioset/webdataset_tar/unbalanced_train s3://s-laion-audio/webdataset_tar/audioset_description/unbalanced_train --recursive
\ No newline at end of file

From 054461765fb653f474952ab27e239e816aa99488 Mon Sep 17 00:00:00 2001
From: knoriy <knoriy@ip-172-31-44-155.ec2.internal>
Date: Mon, 19 Jun 2023 00:43:09 +0000
Subject: [PATCH 36/40] backup

---
 current_dataset/preprocess_audioset.py | 124 +++++++++++++++++++++++++
 1 file changed, 124 insertions(+)
 create mode 100644 current_dataset/preprocess_audioset.py

diff --git a/current_dataset/preprocess_audioset.py b/current_dataset/preprocess_audioset.py
new file mode 100644
index 0000000..d569e47
--- /dev/null
+++ b/current_dataset/preprocess_audioset.py
@@ -0,0 +1,124 @@
+import os
+import json
+import sys
+import tqdm
+import json
+import pathlib
+import fsspec
+import pandas as pd
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from datasets import load_dataset
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
+
+from utils.audio_utils import audio_to_flac
+from utils.make_tar_utils import tardir
+
+
+def get_json(file_path, class_metadata, ontology_dict, class_only=False):
+    """Build the JSON metadata (caption plus raw labels) for one audio clip.
+
+    file_path: path of a ``.wav`` file whose basename is the clip id.
+    class_metadata: maps clip id -> comma-separated ontology label ids.
+    ontology_dict: maps label id -> ``(name, description)``.
+    class_only: caption uses bare names instead of "name (description)".
+    Raises ValueError when the clip has no labels, KeyError on an unknown clip or label id.
+    """
+    audio_id = os.path.basename(file_path).replace('.wav', '')
+    raw_labels = class_metadata[audio_id].replace('"', '').split(',')
+    class_labels = [c.strip() for c in raw_labels if c.strip()]
+    if not class_labels:
+        raise ValueError("No class label found for audio id: {}".format(audio_id))
+    names = [ontology_dict[c][0] for c in class_labels]
+    descriptions = [ontology_dict[c][1] for c in class_labels]
+    class_names = names if class_only else [f"{n} ({d})" for n, d in zip(names, descriptions)]
+    if len(class_names) > 1:
+        text = "The sounds of " + ", ".join(class_names[:-1]) + " and " + class_names[-1]
+    else:
+        # A single label is used whole; indexing it again would pick out characters.
+        text = "The sound of " + class_names[0]
+    return {'text': text, 'original_data': {'class_labels': class_labels, 'class_names': names, 'class_descriptions': descriptions}}
+
+def convert_and_json_dump(file:str, dest:str, df, class_metadata, ontology_dict, class_only=False, overwrite:bool=False):
+    """Convert ``file`` to FLAC at ``dest`` and write a ``.json`` sidecar built by ``get_json``."""
+    json_dest = dest.replace('.flac', '.json')
+    if os.path.isfile(dest) and os.path.isfile(json_dest) and not overwrite:
+        print(f'{dest} already exists, skiping')
+        return
+    audio_to_flac(file, dest)
+    # Build the metadata once, honouring class_only, before the sidecar is opened for writing.
+    m_dump = get_json(file, class_metadata, ontology_dict, class_only=class_only)
+    m_dump['filename'] = os.path.join(*dest.split('/')[5:])  # drops the first five path components of dest
+    with open(json_dest, 'w') as f:
+        json.dump(m_dump, f)
+
+
+def split_all_audio_files(data, dest_root_path, ontology_dict, class_metadata, max_workers=96):
+    """Convert every row of ``data`` to ``<index>.flac`` plus a JSON sidecar under ``dest_root_path``, using ``max_workers`` threads."""
+    if not os.path.exists(dest_root_path):
+        raise FileNotFoundError(f'Please Check {dest_root_path} exists')
+    with tqdm.tqdm(total=len(data), desc=f'Processing {dest_root_path}') as pbar, ThreadPoolExecutor(max_workers=max_workers) as executor:
+        futures = [executor.submit(convert_and_json_dump, row["audio"]["path"], os.path.join(dest_root_path, f'{i}.flac'), row, class_metadata, ontology_dict, False, False) for i, row in enumerate(data)]
+        for future in as_completed(futures):
+            if future.exception() is not None:  # report failed rows instead of dropping the error silently
+                tqdm.tqdm.write(f'Failed to process a file for {dest_root_path}: {future.exception()!r}')
+            pbar.update(1)
+
+
+def main():
+    import multiprocessing
+    max_workers = multiprocessing.cpu_count()
+
+    # Pipeline: load the AudioSet ontology and segment labels from /tmp, then for
+    # every Common Voice language/split convert the audio to FLAC + JSON, run
+    # tardir over the output directory and upload the resulting *.tar files to S3.
+    # NOTE(review): labels are keyed by AudioSet YTID while the audio rows come from Common Voice - confirm the ids can match.
+    # Load the ontology as {id: (name, description)}; fetch the file first with:
+    # !wget -O /tmp/ontology.json https://raw.githubusercontent.com/audioset/ontology/master/ontology.json
+
+    with open('/tmp/ontology.json') as f:
+        ontology = json.load(f)
+        ontology_dict = {i['id']: (i['name'], i['description']) for i in ontology}
+
+    # Load the segment CSV as {YTID: positive_labels}; fetch the file first with:
+    # !wget -O /tmp/eval_segments.csv http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/eval_segments.csv
+
+    with open('/tmp/eval_segments.csv') as f:
+        lines = f.readlines()
+        lines = lines[3:]
+        header_list = ['YTID', 'start_seconds', 'end_seconds', 'positive_labels']
+        class_metadata = [l.strip().split(', ') for l in lines]
+        class_metadata = pd.DataFrame(class_metadata, columns=header_list)
+        class_metadata = dict(zip(class_metadata.YTID, class_metadata.positive_labels))
+
+    langs = ['ab', 'ar', 'en', 'fa', 'fr', 'es', 'sl', 'kab', 'cy', 'ca', 'de', 'tt', 'ta', 'ru', 'nl', 'it', 'eu', 'tr', 'zh-TW', 'br', 'pt', 'eo', 'zh-CN', 'id', 'ia', 'lv', 'ja', 'rw', 'sv-SE', 'cnh', 'et', 'ky', 'ro', 'hsb', 'el', 'cs', 'pl', 'rm-sursilv', 'rm-vallader', 'mn', 'zh-HK', 'cv', 'uk', 'mt', 'as', 'ka', 'fy-NL', 'dv', 'pa-IN', 'vi', 'or', 'ga-IE', 'fi', 'hu', 'th', 'lt', 'lg', 'hi', 'bas', 'sk', 'kmr', 'bg', 'kk', 'ba', 'gl', 'ug', 'hy-AM', 'be', 'ur', 'gn', 'sr', 'uz', 'mr', 'da', 'myv', 'nn-NO', 'ha', 'ckb', 'ml', 'mdf', 'sw', 'sat', 'tig', 'ig', 'nan-tw', 'mhr', 'bn', 'tok', 'yue', 'sah', 'mk', 'sc', 'skr', 'ti', 'mrj', 'tw', 'vot', 'az', 'ast', 'ne-NP']
+    dataset_name = "common_voice_11_0"
+    s3 = fsspec.filesystem('s3')
+
+    with tqdm.tqdm(total=len(langs)) as pbar:
+        for lang in langs:
+            pbar.set_description(f'Prcessing {lang}')
+            for split in ["train", "test", "validation"]:
+                wikipedia_dataset = load_dataset(f"mozilla-foundation/{dataset_name}", lang, split=split)
+
+                if split == "validation": split = "valid"
+                root_dest_path = pathlib.Path(f"/fsx/knoriy/processed_datasets/{dataset_name}/{lang}/{split}/")
+                root_dest_path.mkdir(parents=True, exist_ok=True)
+
+                split_all_audio_files(wikipedia_dataset, root_dest_path, ontology_dict, class_metadata, max_workers)
+                tardir(str(root_dest_path), str(root_dest_path), 512, delete_file=False)
+
+                # Upload only tar files to s3
+                tar_files = (root_dest_path.glob('*.tar'))
+                for tar in tar_files:
+                    # upload to s3; the local copy is kept (clean-up below is disabled)
+                    pbar.set_description(f'Prcessing {lang}: uploading {str(tar)} to s3')
+                    s3_dest = f's-laion-audio/webdataset_tar/{dataset_name}/{lang}/{split}/{tar.name}'
+                    s3.put(str(tar), s3_dest)
+                # shutil.rmtree(root_dest_path)
+                # break
+            pbar.update(1)
+            # break
+
+if __name__ == '__main__':
+    main()

From e1e3643fd9d10da4856139ebf746397dc45fdb8b Mon Sep 17 00:00:00 2001
From: knoriy <knoriy@ip-172-64-56-121.us-west-2.compute.internal>
Date: Wed, 5 Jul 2023 15:16:53 +0000
Subject: [PATCH 37/40] updated json file text to include metadata

---
 current_dataset/preprocess_CREMA-D.py | 35 ++++++++++++++-------------
 1 file changed, 18 insertions(+), 17 deletions(-)

diff --git a/current_dataset/preprocess_CREMA-D.py b/current_dataset/preprocess_CREMA-D.py
index 647737d..cdb08dc 100644
--- a/current_dataset/preprocess_CREMA-D.py
+++ b/current_dataset/preprocess_CREMA-D.py
@@ -29,7 +29,7 @@ def convert_and_json_dump(file:str, dest:str, df, overwrite:bool=True, verbose=F
         return
     audio_to_flac(file, dest)
     with open(dest.replace('.flac', '.json'), 'w') as f:
-        json.dump({'filename': os.path.join(*dest.split('/')[5:]), 'text':[df['text']]}, f)
+        json.dump({'filename': os.path.join(*dest.split('/')[5:]), 'text':[df['text']], 'original_data':df['tag']}, f)
 
 
 def split_all_audio_files(df, dest_root_path, max_workers=96):
@@ -44,7 +44,7 @@ def split_all_audio_files(df, dest_root_path, max_workers=96):
                 pbar.update(1)
 
 def create_df(root_path:str, dataset_name:str=None):
-    wavs = glob.glob(os.path.join(root_path, '**/*.wav'), recursive=True)
+    wavs = glob.glob(os.path.join(root_path, 'AudioWAV',  '**/*.wav'), recursive=True)
     codes = {   'Statement':{   'IEO':"It's eleven o'clock", 
                                 'TIE':"That is exactly what happened",
                                 'IOM':"I'm on my way to the meeting",
@@ -65,13 +65,13 @@ def create_df(root_path:str, dataset_name:str=None):
                                 'NEU':'neutral',
                                 'SAD':'sad',
                         },
-                'Emotional intensity':{ 'LO':'Low', 
-                                        'MD':'Medium',
-                                        'HI':'High',
-                                        'XX':'Unspecified',
+                'Emotional intensity':{ 'LO':'low', 
+                                        'MD':'medium',
+                                        'HI':'high',
+                                        'XX':'unspecified',
                                         },
                 }
-    demographics = pd.read_csv('/fsx/knoriy/raw_datasets/CREMA-D/VideoDemographics.csv', names=["ActorID","Age","Sex","Race","Ethnicity"])
+    demographics = pd.read_csv(os.path.join(root_path, 'VideoDemographics.csv'), names=["ActorID","Age","Sex","Race","Ethnicity"])
     df_data = []
     for wav in tqdm.tqdm(wavs):
         file_name = os.path.basename(wav).split('.')[0]
@@ -80,9 +80,9 @@ def create_df(root_path:str, dataset_name:str=None):
         demograpthics_meta = demographics.loc[demographics['ActorID'] == wav_codes[0]]
 
         male_or_female = 'woman' if demograpthics_meta["Sex"].values[0] == 'Female' else 'man'
-        intensity = '' if text_meta[2] == 'Unspecified' else f'and {text_meta[2]} '
-        text = f'A {male_or_female} saying "{text_meta[0]}" in a {text_meta[1]} {intensity} voice. {text_meta[2]}'
-        df_data.append({ 'path':wav, 'text':text, 'tag':{'transcript':text_meta[0], 'language':'english', 'emotion':text_meta[1], 'gender':demograpthics_meta["Sex"].values[0], 'age':demograpthics_meta["Age"].values[0] }})
+        intensity = '' if text_meta[2] == 'unspecified' else f' with {text_meta[2]} emotional intensity'
+        text = f'A {demograpthics_meta["Age"].values[0]} year-old {male_or_female}, saying "{text_meta[0]}" in a {text_meta[1]} voice{intensity}.'
+        df_data.append({ 'path':wav, 'text':text, 'tag':{'transcript':text_meta[0], 'language':'english', 'emotion':text_meta[1], 'emotion_intensity':text_meta[2], 'gender':demograpthics_meta["Sex"].values[0], 'age':demograpthics_meta["Age"].values[0] }})
 
     return pd.DataFrame(df_data)
 
@@ -94,11 +94,11 @@ def create_df(root_path:str, dataset_name:str=None):
     print("Num workers: ", max_workers)
     chunk = 512
 
-    root_path = '/fsx/knoriy/raw_datasets/CREMA-D/AudioWAV/'
+    root_path = '/admin/home-knoriy/DELETEME/CREMA-D/'
     dataset_name = 'CREMA-D'
 
     s3 = fsspec.filesystem('s3')
-    s3_dest = f's-laion-audio/webdataset_tar/{dataset_name}/'
+    s3_dest = f'laion-west-audio/webdataset_tar/{dataset_name}/'
 
     original_tar_dir = '/fsx/knoriy/raw_datasets/CREMA-D/crema-d.tar.gz'
 
@@ -119,12 +119,13 @@ def create_df(root_path:str, dataset_name:str=None):
     for key in tqdm.tqdm(train_test_val, desc=f'processing:'):
         df = train_test_val[key]
         
-        dest_path = os.path.join(root_path.replace('raw_datasets', 'processed_datasets').replace('AudioWAV/', ''), key)
+        dest_path = os.path.join(root_path.replace('CREMA-D', 'CREMA-D_processed').replace('AudioWAV/', ''), key)
         os.makedirs(dest_path, exist_ok=True)
 
+
         split_all_audio_files(df, dest_path)
-        tardir(dest_path, dest_path, chunk, delete_file=True)
+        tardir(dest_path, dest_path, chunk, delete_file=False)
 
-        # upload to s3 and delete local
-        s3.put(dest_path, os.path.join(s3_dest, key), recursive=True)
-        shutil.rmtree(dest_path)
\ No newline at end of file
+    #     # upload to s3 and delete local
+    #     s3.put(dest_path, os.path.join(s3_dest, key), recursive=True)
+    #     shutil.rmtree(dest_path)
\ No newline at end of file

From 6045d5d1997ecc055fbe27177598b6da47fe81a9 Mon Sep 17 00:00:00 2001
From: knoriy <knoriy@ip-172-64-56-121.us-west-2.compute.internal>
Date: Wed, 5 Jul 2023 15:31:54 +0000
Subject: [PATCH 38/40] Added meta

---
 current_dataset/preprocess_CREMA-D.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/current_dataset/preprocess_CREMA-D.py b/current_dataset/preprocess_CREMA-D.py
index cdb08dc..a894e5b 100644
--- a/current_dataset/preprocess_CREMA-D.py
+++ b/current_dataset/preprocess_CREMA-D.py
@@ -122,10 +122,9 @@ def create_df(root_path:str, dataset_name:str=None):
         dest_path = os.path.join(root_path.replace('CREMA-D', 'CREMA-D_processed').replace('AudioWAV/', ''), key)
         os.makedirs(dest_path, exist_ok=True)
 
-
         split_all_audio_files(df, dest_path)
-        tardir(dest_path, dest_path, chunk, delete_file=False)
+        tardir(dest_path, dest_path, chunk, delete_file=True)
 
-    #     # upload to s3 and delete local
-    #     s3.put(dest_path, os.path.join(s3_dest, key), recursive=True)
-    #     shutil.rmtree(dest_path)
\ No newline at end of file
+        # upload to s3 and delete local
+        s3.put(dest_path, os.path.join(s3_dest, key), recursive=True)
+        shutil.rmtree(dest_path)
\ No newline at end of file

From a919011f2859e3ba2603fe4e4a995e7fc7a003ac Mon Sep 17 00:00:00 2001
From: knoriy <knoriy@ip-172-64-56-121.us-west-2.compute.internal>
Date: Wed, 5 Jul 2023 15:51:48 +0000
Subject: [PATCH 39/40] upload to s3

---
 current_dataset/preprocess_CREMA-D.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/current_dataset/preprocess_CREMA-D.py b/current_dataset/preprocess_CREMA-D.py
index a894e5b..8b936c7 100644
--- a/current_dataset/preprocess_CREMA-D.py
+++ b/current_dataset/preprocess_CREMA-D.py
@@ -58,7 +58,7 @@ def create_df(root_path:str, dataset_name:str=None):
                                 'TSI':"The surface is slick",
                                 'WSI':"We'll stop in a couple of minutes",
                                 },
-                'Emotion':{     'ANG':'angery',
+                'Emotion':{     'ANG':'angry',
                                 'DIS':'disgusted',
                                 'FEA':'fearful',
                                 'HAP':'happy',

From 3fc2359f5e36bbb17d34b596626f0b1a827b1e3f Mon Sep 17 00:00:00 2001
From: knoriy <knoriy72@gmail.com>
Date: Fri, 7 Jul 2023 11:42:00 +0000
Subject: [PATCH 40/40] added emns dataset

---
 current_dataset/preprocess_EMNS.py | 87 ++++++++++++++++++++++++++++++
 1 file changed, 87 insertions(+)
 create mode 100644 current_dataset/preprocess_EMNS.py

diff --git a/current_dataset/preprocess_EMNS.py b/current_dataset/preprocess_EMNS.py
new file mode 100644
index 0000000..e9348f4
--- /dev/null
+++ b/current_dataset/preprocess_EMNS.py
@@ -0,0 +1,87 @@
+import glob
+import tqdm
+import os
+import glob
+import pandas as pd
+import sys
+import tarfile
+import json
+import shutil
+import fsspec
+
+from sklearn.model_selection import train_test_split
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from multiprocessing import Pool
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
+from utils.audio_utils import audio_to_flac
+from utils.make_tar_utils import tardir
+
+def convert_and_json_dump(file:str, dest:str, df, overwrite:bool=True, verbose=False):
+    """Convert ``file`` to FLAC at ``dest`` and dump the row's text and metadata to a sibling ``.json`` file."""
+    if os.path.isfile(dest) and not overwrite:
+        if verbose: print(f'{dest} already exists, skiping')
+        return
+    audio_to_flac(file, dest)
+    with open(dest.replace('.flac', '.json'), 'w') as f:
+        json.dump({'filename': os.path.join(*dest.split('/')[5:]), 'text':[df['text']], 'original_data':df['original_data']}, f)
+
+
+def split_all_audio_files(df, dest_root_path, max_workers=96):
+    """Convert each row's audio to ``<position>.flac`` plus a JSON sidecar; the source path is read positionally from the first column."""
+    if not os.path.exists(dest_root_path):
+        raise FileNotFoundError(f'Please Check {dest_root_path} exists')
+    with tqdm.tqdm(total=len(df), desc=f'Processing {dest_root_path}') as pbar, ThreadPoolExecutor(max_workers=max_workers) as executor:
+        futures = [executor.submit(convert_and_json_dump, row.iloc[0], os.path.join(dest_root_path, f'{i}.flac'), row) for i, (_, row) in enumerate(df.iterrows())]
+        for future in as_completed(futures):
+            if future.exception() is not None:  # report failed rows instead of dropping the error silently
+                tqdm.tqdm.write(f'Failed to process a file for {dest_root_path}: {future.exception()!r}')
+            pbar.update(1)
+
+def create_df(root_path:str, dataset_name:str=None):
+    """Read ``metadata.csv`` (``|``-separated) under ``root_path`` into a frame of audio path, caption text and metadata."""
+    df = pd.read_csv(os.path.join(root_path, 'metadata.csv'), sep='|')
+    df_data = []
+    for _, row in df.iterrows():
+        path = os.path.join(root_path, row['audio_recording'].replace('wavs/', 'cleaned_webm/'))
+        text = row['description'].format(user_id=f"A {row['gender']} in their {row['age']}", transcription=row['utterance'], emotion=row['emotion']) + f" Emotion intensity: {row['level']}."
+        # transcript and level also go inside original_data: only that dict is written to the JSON sidecar.
+        df_data.append({'path':path, 'text':text, 'original_data':{'age': row['age'], 'gender':row['gender'], 'emotion':row['emotion'], 'transcript':row['utterance'], 'level':row['level']}, 'transcript':row['utterance'], "level":row['level']})
+    return pd.DataFrame(df_data)
+
+
+if __name__ == '__main__':
+    import multiprocessing
+
+    max_workers = multiprocessing.cpu_count()
+    print("Num workers: ", max_workers)
+    chunk = 512  # third argument handed to tardir below
+
+    root_path = '/admin/home-knoriy/DELETEME/EMNS/'
+    dataset_name = 'EMNS'
+
+    s3 = fsspec.filesystem('s3')
+    s3_dest = f'laion-west-audio/webdataset_tar/{dataset_name}/'
+
+    # load metadata and configure audio paths
+    df = create_df(root_path)
+
+    # create train, test, valid splits: 80% train, then the held-out 20% is
+    # split again 80/20, i.e. 16% valid and 4% test of the whole dataset
+    train, test = train_test_split(df, test_size=0.2)
+    valid, test = train_test_split(test, test_size=0.2)
+    train_test_val = {'valid/':valid, 'train/':train, 'test/':test}
+
+    # convert each split into <root>_processed/<split>/ and hand the directory to tardir
+    for key in tqdm.tqdm(train_test_val, desc=f'processing:'):
+        df = train_test_val[key]
+        dest_path = os.path.join(root_path.replace(dataset_name, f'{dataset_name}_processed'), key)
+        os.makedirs(dest_path, exist_ok=True)
+
+        # use the worker count reported above rather than the function's default of 96
+        split_all_audio_files(df, dest_path, max_workers=max_workers)
+        tardir(dest_path, dest_path, chunk, delete_file=True)
+
+        # upload to s3 and local clean-up are currently disabled
+        # s3.put(dest_path, os.path.join(s3_dest, key), recursive=True)
+        # shutil.rmtree(dest_path)
\ No newline at end of file