diff --git a/.gitignore b/.gitignore
index b323e78..938f3a2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -133,4 +133,5 @@ dmypy.json
 utils/__pycache__/make_tar_utils.cpython-37.pyc
 /data_preprocess/process_audioset/
 
-*.out
\ No newline at end of file
+*.out
+test.*py*
\ No newline at end of file
diff --git a/current_dataset/ToDO.md b/current_dataset/ToDO.md
new file mode 100644
index 0000000..2cf9838
--- /dev/null
+++ b/current_dataset/ToDO.md
@@ -0,0 +1,6 @@
+# ToDo
+
+- [X] LJSpeech
+- [X] MSWC
+- [ ] GigaSpeech
+- [ ] CoVoST
diff --git a/current_dataset/preprocess_CREMA-D.py b/current_dataset/preprocess_CREMA-D.py
new file mode 100644
index 0000000..8b936c7
--- /dev/null
+++ b/current_dataset/preprocess_CREMA-D.py
@@ -0,0 +1,130 @@
+"""
+Code for preprocessing the CREMA-D corpus:
+https://github.com/CheyneyComputerScience/CREMA-D
+"""
+
+import glob
+import tqdm
+import os
+import glob
+import pandas as pd
+import sys
+import tarfile
+import json
+import shutil
+import fsspec
+
+from sklearn.model_selection import train_test_split
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from multiprocessing import Pool
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
+from utils.audio_utils import audio_to_flac
+from utils.make_tar_utils import tardir
+
+def convert_and_json_dump(file:str, dest:str, df, overwrite:bool=True, verbose=False):  # convert `file` to FLAC at `dest` and write a sidecar JSON beside it
+    if os.path.isfile(dest) and overwrite==False:  # skip only when the FLAC already exists and overwriting is disabled (default here: overwrite)
+        if verbose==True:
+            print(f'{dest} already exists, skiping')
+        return
+    audio_to_flac(file, dest)  # project helper from utils.audio_utils; presumably writes the FLAC to `dest` -- TODO confirm
+    with open(dest.replace('.flac', '.json'), 'w') as f:  # sidecar path: same name with '.json' in place of '.flac'
+        json.dump({'filename': os.path.join(*dest.split('/')[5:]), 'text':[df['text']], 'original_data':df['tag']}, f)  # 'filename' drops the first five '/'-separated parts of dest; `df` is one row providing 'text' and 'tag'
+
+
+def split_all_audio_files(df, dest_root_path, max_workers=96):  # run convert_and_json_dump for every row of `df` on a thread pool
+    if not os.path.exists(dest_root_path):  # the destination directory must be created by the caller
+        raise FileNotFoundError(f'Please Check {dest_root_path} exists')
+
+    l = len(df)
+    with tqdm.tqdm(total=l, desc=f'Processing {dest_root_path}') as pbar:
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            threads = [executor.submit(convert_and_json_dump, row[0], os.path.join(dest_root_path, f'{i}.flac'), row) for i, row in enumerate(df.iloc())]  # row[0] is the row's first column (source path); outputs are named by row position
+            for _ in as_completed(threads):  # NOTE(review): .result() is never called, so an exception raised in a worker is not surfaced here
+                pbar.update(1)
+
+def create_df(root_path:str, dataset_name:str=None):  # build one row (path, caption text, tag dict) per wav under root_path/AudioWAV; dataset_name is not used
+    wavs = glob.glob(os.path.join(root_path, 'AudioWAV',  '**/*.wav'), recursive=True)
+    codes = {   'Statement':{   'IEO':"It's eleven o'clock", 
+                                'TIE':"That is exactly what happened",
+                                'IOM':"I'm on my way to the meeting",
+                                'IWW':"I wonder what this is about",
+                                'TAI':"The airplane is almost full",
+                                'MTI':"Maybe tomorrow it will be cold",
+                                'IWL':"I would like a new alarm clock",
+                                'ITH':"I think I have a doctor's appointment",
+                                'DFA':"Don't forget a jacket",
+                                'ITS':"I think I've seen this before",
+                                'TSI':"The surface is slick",
+                                'WSI':"We'll stop in a couple of minutes",
+                                },
+                'Emotion':{     'ANG':'angry',
+                                'DIS':'disgusted',
+                                'FEA':'fearful',
+                                'HAP':'happy',
+                                'NEU':'neutral',
+                                'SAD':'sad',
+                        },
+                'Emotional intensity':{ 'LO':'low', 
+                                        'MD':'medium',
+                                        'HI':'high',
+                                        'XX':'unspecified',
+                                        },
+                }
+    demographics = pd.read_csv(os.path.join(root_path, 'VideoDemographics.csv'), names=["ActorID","Age","Sex","Race","Ethnicity"])  # NOTE(review): names= without header=, so a header row in the CSV (if any) is read as data -- confirm file layout
+    df_data = []
+    for wav in tqdm.tqdm(wavs):
+        file_name = os.path.basename(wav).split('.')[0]
+        wav_codes = file_name.split('_')  # fields used below: [0] actor id, [1] statement, [2] emotion, [3] intensity
+        text_meta = [codes['Statement'][wav_codes[1]], codes['Emotion'][wav_codes[2]], codes['Emotional intensity'][wav_codes[3]]]  # an unknown code raises KeyError
+        demograpthics_meta = demographics.loc[demographics['ActorID'] == wav_codes[0]]  # compared against the string id taken from the file name
+
+        male_or_female = 'woman' if demograpthics_meta["Sex"].values[0] == 'Female' else 'man'  # every value other than 'Female' maps to 'man'
+        intensity = '' if text_meta[2] == 'unspecified' else f' with {text_meta[2]} emotional intensity'  # the intensity clause is omitted for code 'XX'
+        text = f'A {demograpthics_meta["Age"].values[0]} year-old {male_or_female}, saying "{text_meta[0]}" in a {text_meta[1]} voice{intensity}.'
+        df_data.append({ 'path':wav, 'text':text, 'tag':{'transcript':text_meta[0], 'language':'english', 'emotion':text_meta[1], 'emotion_intensity':text_meta[2], 'gender':demograpthics_meta["Sex"].values[0], 'age':demograpthics_meta["Age"].values[0] }})
+
+    return pd.DataFrame(df_data)
+
+
+if __name__ == '__main__':  # script entry: build metadata, split, convert, tar, upload to S3, delete local output
+    import multiprocessing
+
+    max_workers = multiprocessing.cpu_count()  # only printed; split_all_audio_files below is called without it
+    print("Num workers: ", max_workers)
+    chunk = 512  # passed as the third positional argument to tardir below
+
+    root_path = '/admin/home-knoriy/DELETEME/CREMA-D/'
+    dataset_name = 'CREMA-D'
+
+    s3 = fsspec.filesystem('s3')
+    s3_dest = f'laion-west-audio/webdataset_tar/{dataset_name}/'
+
+    original_tar_dir = '/fsx/knoriy/raw_datasets/CREMA-D/crema-d.tar.gz'  # referenced only by the commented-out extraction below
+
+    # print('Extracting tar')
+    # with tarfile.open(original_tar_dir, mode='r:gz') as file:
+    #     audio_path = os.path.split(original_tar_dir)[0]
+    #     file.extractall(audio_path)
+
+    # load metadata and configure audio paths
+    df = create_df(root_path)
+
+    # create train, test, valid splits
+    train, test = train_test_split(df, test_size=0.2)  # 80% train, 20% held out
+    valid, test = train_test_split(test, test_size=0.2)  # held-out part is split again: 80% valid, 20% test
+    train_test_val = {'valid/':valid, 'train/':train, 'test/':test}
+
+    
+    for key in tqdm.tqdm(train_test_val, desc=f'processing:'):
+        df = train_test_val[key]
+        
+        dest_path = os.path.join(root_path.replace('CREMA-D', 'CREMA-D_processed').replace('AudioWAV/', ''), key)  # the second replace has no effect for the root_path set above
+        os.makedirs(dest_path, exist_ok=True)
+
+        split_all_audio_files(df, dest_path)
+        tardir(dest_path, dest_path, chunk, delete_file=True)  # project helper from utils.make_tar_utils; presumably packs dest_path into tar shards -- TODO confirm
+
+        # upload to s3 and delete local
+        s3.put(dest_path, os.path.join(s3_dest, key), recursive=True)
+        shutil.rmtree(dest_path)
\ No newline at end of file
diff --git a/current_dataset/preprocess_CoVoST.py b/current_dataset/preprocess_CoVoST.py
new file mode 100644
index 0000000..35f463c
--- /dev/null
+++ b/current_dataset/preprocess_CoVoST.py
@@ -0,0 +1,192 @@
+"""
+Code for preprocessing the CoVoST 2 corpus:
+https://github.com/facebookresearch/covost
+"""
+
+import glob
+from tabnanny import verbose
+from tokenize import Name
+import tqdm
+import os
+import glob
+import pandas as pd
+import sys
+import tarfile
+import json
+import shutil
+import fsspec
+
+from sklearn.model_selection import train_test_split
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
+from utils.audio_utils import audio_to_flac
+from utils.make_tar_utils import tardir
+
+def convert_and_json_dump(file:str, dest:str, df, overwrite:bool=False, verbose=False):  # convert `file` to FLAC at `dest` and write a sidecar JSON beside it
+    if os.path.isfile(dest) and overwrite==False:  # existing FLAC outputs are skipped unless overwrite is requested
+        if verbose==True:
+            print(f'{dest} already exists, skiping')
+        return
+    audio_to_flac(file, dest)  # project helper from utils.audio_utils; presumably writes the FLAC to `dest` -- TODO confirm
+    with open(dest.replace('.flac', '.json'), 'w') as f:  # sidecar path: same name with '.json' in place of '.flac'
+        json.dump({'filename': os.path.join(*dest.split('/')[3:]), 'text':[df['text']], 'original_data':df['original_data']}, f)  # 'filename' drops the first three '/'-separated parts of dest; `df` is one row providing 'text' and 'original_data'
+
+def split_all_audio_files(df, dest_root_path, max_workers=96):  # run convert_and_json_dump for every row of `df` on a thread pool
+    if not os.path.exists(dest_root_path):  # the destination directory must be created by the caller
+        raise FileNotFoundError(f'Please Check {dest_root_path} exists')
+
+    l = len(df)
+    with tqdm.tqdm(total=l, desc=f'Processing {dest_root_path}') as pbar:
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            threads = [executor.submit(convert_and_json_dump, row[0], os.path.join(dest_root_path, f'{i}.flac'), row, overwrite=False, verbose=False) for i, row in enumerate(df.iloc())]  # row[0] is the row's first column (source path); outputs are named by row position
+            for _ in as_completed(threads):  # NOTE(review): .result() is never called, so an exception raised in a worker is not surfaced here
+                pbar.update(1)
+
+def download_tsvs(urls:list, output_dir:str, extract:bool=False):
+    """Download each url into output_dir with curl (existing files are skipped); optionally untar it there."""
+    os.makedirs(output_dir, exist_ok=True)
+    for url in tqdm.tqdm(urls, desc="Downloading tsvs"):
+        dest_path = os.path.join(output_dir, url.split("/")[-1])  # local file name = last path component of the url
+        if os.path.isfile(dest_path):
+            continue
+        os.system(f'curl {url} --output {dest_path}')
+        if extract:
+            os.system(f'tar -xf {dest_path} -C {output_dir}')  # extract beside the archive instead of into the current working directory
+
+def extract_covost_2_tsvs(tsv_tar_dir:str, dest:str, cv_tsv:str, version=2):  # untar one CoVoST tsv archive into `dest`, then run the get_covost_splits.py script on it
+    extract_covost_2_tar_cmd = f'tar -xf {tsv_tar_dir} -C {dest}'
+    os.system(extract_covost_2_tar_cmd)  # the exit status is not checked
+
+    src_lang, tgt_lang = os.path.basename(tsv_tar_dir).split('.')[1].split('_')  # e.g. 'covost_v2.en_de.tsv.tar.gz' -> ('en', 'de')
+    get_covost_splits_cmd = f'python /home/knoriy/fsx/raw_datasets/CoVoST_2/covost/get_covost_splits.py \
+        --version {version} \
+        --src-lang {src_lang} \
+        --tgt-lang {tgt_lang} \
+        --root {dest} \
+        --cv-tsv {cv_tsv} \
+        '
+    os.system(get_covost_splits_cmd)  # the script path is hard-coded in the command above; exit status is not checked
+
+if __name__ == '__main__':
+    import multiprocessing
+    # Script entry: download CoVoST tsvs, then convert, tar and upload every train/dev/test split found.
+    x_2_eng = [
+        "https://dl.fbaipublicfiles.com/covost/covost_v2.fr_en.tsv.tar.gz",
+        "https://dl.fbaipublicfiles.com/covost/covost_v2.de_en.tsv.tar.gz",
+        "https://dl.fbaipublicfiles.com/covost/covost_v2.es_en.tsv.tar.gz",
+        "https://dl.fbaipublicfiles.com/covost/covost_v2.ca_en.tsv.tar.gz",
+        "https://dl.fbaipublicfiles.com/covost/covost_v2.it_en.tsv.tar.gz",
+        "https://dl.fbaipublicfiles.com/covost/covost_v2.ru_en.tsv.tar.gz",
+        "https://dl.fbaipublicfiles.com/covost/covost_v2.zh-CN_en.tsv.tar.gz",
+        "https://dl.fbaipublicfiles.com/covost/covost_v2.pt_en.tsv.tar.gz",
+        "https://dl.fbaipublicfiles.com/covost/covost_v2.fa_en.tsv.tar.gz",
+        "https://dl.fbaipublicfiles.com/covost/covost_v2.et_en.tsv.tar.gz",
+        "https://dl.fbaipublicfiles.com/covost/covost_v2.mn_en.tsv.tar.gz",
+        "https://dl.fbaipublicfiles.com/covost/covost_v2.nl_en.tsv.tar.gz",
+        "https://dl.fbaipublicfiles.com/covost/covost_v2.tr_en.tsv.tar.gz",
+        "https://dl.fbaipublicfiles.com/covost/covost_v2.ar_en.tsv.tar.gz",
+        "https://dl.fbaipublicfiles.com/covost/covost_v2.sv-SE_en.tsv.tar.gz",
+        "https://dl.fbaipublicfiles.com/covost/covost_v2.lv_en.tsv.tar.gz",
+        "https://dl.fbaipublicfiles.com/covost/covost_v2.sl_en.tsv.tar.gz",
+        "https://dl.fbaipublicfiles.com/covost/covost_v2.ta_en.tsv.tar.gz",
+        "https://dl.fbaipublicfiles.com/covost/covost_v2.ja_en.tsv.tar.gz",
+        "https://dl.fbaipublicfiles.com/covost/covost_v2.id_en.tsv.tar.gz",
+        "https://dl.fbaipublicfiles.com/covost/covost_v2.cy_en.tsv.tar.gz",
+    ]
+    eng_2_x = [
+        'https://dl.fbaipublicfiles.com/covost/covost_v2.en_de.tsv.tar.gz',
+        'https://dl.fbaipublicfiles.com/covost/covost_v2.en_ca.tsv.tar.gz',
+        'https://dl.fbaipublicfiles.com/covost/covost_v2.en_zh-CN.tsv.tar.gz',
+        'https://dl.fbaipublicfiles.com/covost/covost_v2.en_fa.tsv.tar.gz',
+        'https://dl.fbaipublicfiles.com/covost/covost_v2.en_et.tsv.tar.gz',
+        'https://dl.fbaipublicfiles.com/covost/covost_v2.en_mn.tsv.tar.gz',
+        'https://dl.fbaipublicfiles.com/covost/covost_v2.en_tr.tsv.tar.gz',
+        'https://dl.fbaipublicfiles.com/covost/covost_v2.en_ar.tsv.tar.gz',
+        'https://dl.fbaipublicfiles.com/covost/covost_v2.en_sv-SE.tsv.tar.gz',
+        'https://dl.fbaipublicfiles.com/covost/covost_v2.en_lv.tsv.tar.gz',
+        'https://dl.fbaipublicfiles.com/covost/covost_v2.en_sl.tsv.tar.gz',
+        'https://dl.fbaipublicfiles.com/covost/covost_v2.en_ta.tsv.tar.gz',
+        'https://dl.fbaipublicfiles.com/covost/covost_v2.en_ja.tsv.tar.gz',
+        'https://dl.fbaipublicfiles.com/covost/covost_v2.en_id.tsv.tar.gz',
+        'https://dl.fbaipublicfiles.com/covost/covost_v2.en_cy.tsv.tar.gz',
+    ]
+    get_language_from_key = {  # keys are lower-case language codes; look them up with .lower()
+        'en':'english',
+        'de':'german',
+        'fr':'french',
+        'nl':'dutch',
+        'ru':'russian',
+        'es':'spanish',
+        'it':'italian',
+        'tr':'turkish',
+        'fa':'persian',
+        'ca':'catalan',
+        'zh-cn':'chinese',
+        'pt':'portuguese',
+        'et':'estonian',
+        'mn':'mongolian',
+        'ar':'arabic',
+        'sv-se':'swedish',
+        'lv':'latvian',
+        'sl':'slovenian',
+        'ta':'tamil',
+        'ja':'japanese',
+        'id':'indonesian',
+        'cy':'welsh',
+        }
+
+    max_workers = multiprocessing.cpu_count()  # not passed on; split_all_audio_files uses its own default
+    chunk = 512  # passed as the third positional argument to tardir below
+    generate_subset_tsv = True
+
+    root_path = '/home/knoriy/fsx/raw_datasets/CoVoST_2/'
+    metadata_dir = "/home/knoriy/fsx/raw_datasets/CoVoST_2/"
+
+    dataset_name = 'CoVoST_2'
+    COMMON_VOICE_VERSION = 'cv-corpus-10.0-2022-07-04'
+
+    s3 = fsspec.filesystem('s3')
+    s3_dest = f's-laion/knoriy/{dataset_name}/{dataset_name}_tars/'
+
+    download_tsvs(eng_2_x, os.path.join(root_path, 'tsvs/'))
+    # download_tsvs(x_2_eng, "/home/knoriy/fsx/raw_datasets/CoVoST_2/tsvs")
+
+    # uncomment to extract and create CoVoST tsvs
+    # for tar in tqdm.tqdm(glob.glob(os.path.join(root_path, 'tsvs/**/*.tar.gz'), recursive=True), desc='Extracting tsvs'):
+    #     extract_covost_2_tsvs(tar, os.path.join(root_path, 'tsvs/'), '/home/knoriy/fsx/raw_datasets/CoVoST_2/cv-corpus-10.0-2022-07-04/en/validated.tsv')
+
+    # load metadata and configure audio paths
+    tsvs = []
+    for tsv in glob.glob(os.path.join(root_path, 'tsvs/**/*.tsv'), recursive=True):
+        if any(word in os.path.basename(tsv) for word in ['test', 'train', 'dev']):
+            tsvs.append(tsv)
+
+    for tsv in tqdm.tqdm(tsvs, desc=f'processing:'):
+        raw_df = pd.read_csv(tsv, sep='\t', on_bad_lines='skip')
+        IS_TRAIN_VAL_OR_TEST, LANGUAGE = tsv.split('.')[-2], tsv.split('.')[-3]  # '<...>.en_de.train.tsv' -> ('train', 'en_de')
+        src_lang = LANGUAGE.split('_')[0]  # source-language code, e.g. 'en' or 'zh-CN'
+        data = {}
+        for row in raw_df.iloc():
+            # The clips directory keeps the code's original case; only the name lookup is lower-cased ('zh-CN', 'sv-SE').
+            data.setdefault('paths', []).append(os.path.join(root_path, COMMON_VOICE_VERSION, src_lang, "clips", row['path']))
+            data.setdefault('text', []).append(f"{row['translation']} translated to {get_language_from_key[src_lang.lower()]}")
+            data.setdefault('original_data', []).append(
+                {
+                    "sentence":row['sentence'],
+                    "translation":row['translation'],
+                    "client_id":row['client_id'],
+                }
+            )
+
+        df = pd.DataFrame(data)
+
+        dest_path = os.path.join(root_path.replace('raw_datasets', 'processed_datasets'), LANGUAGE, IS_TRAIN_VAL_OR_TEST, '')
+        os.makedirs(dest_path, exist_ok=True)
+
+        split_all_audio_files(df, dest_path)
+        tardir(dest_path, dest_path, chunk, delete_file=True)
+
+        # upload to s3 and delete local
+        s3.put(dest_path, os.path.join(s3_dest, LANGUAGE, IS_TRAIN_VAL_OR_TEST)+'/', recursive=True)
+        shutil.rmtree(dest_path)
diff --git a/current_dataset/preprocess_EMNS.py b/current_dataset/preprocess_EMNS.py
new file mode 100644
index 0000000..e9348f4
--- /dev/null
+++ b/current_dataset/preprocess_EMNS.py
@@ -0,0 +1,87 @@
+import glob
+import tqdm
+import os
+import glob
+import pandas as pd
+import sys
+import tarfile
+import json
+import shutil
+import fsspec
+
+from sklearn.model_selection import train_test_split
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from multiprocessing import Pool
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
+from utils.audio_utils import audio_to_flac
+from utils.make_tar_utils import tardir
+
+def convert_and_json_dump(file:str, dest:str, df, overwrite:bool=True, verbose=False):  # convert `file` to FLAC at `dest` and write a sidecar JSON beside it
+    if os.path.isfile(dest) and overwrite==False:  # skip only when the FLAC already exists and overwriting is disabled (default here: overwrite)
+        if verbose==True:
+            print(f'{dest} already exists, skiping')
+        return
+    audio_to_flac(file, dest)  # project helper from utils.audio_utils; presumably writes the FLAC to `dest` -- TODO confirm
+    with open(dest.replace('.flac', '.json'), 'w') as f:  # sidecar path: same name with '.json' in place of '.flac'
+        json.dump({'filename': os.path.join(*dest.split('/')[5:]), 'text':[df['text']], 'original_data':df['original_data']}, f)  # 'filename' drops the first five '/'-separated parts of dest; `df` is one row providing 'text' and 'original_data'
+
+
+def split_all_audio_files(df, dest_root_path, max_workers=96):  # run convert_and_json_dump for every row of `df` on a thread pool
+    if not os.path.exists(dest_root_path):  # the destination directory must be created by the caller
+        raise FileNotFoundError(f'Please Check {dest_root_path} exists')
+
+    l = len(df)
+    with tqdm.tqdm(total=l, desc=f'Processing {dest_root_path}') as pbar:
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            threads = [executor.submit(convert_and_json_dump, row[0], os.path.join(dest_root_path, f'{i}.flac'), row) for i, row in enumerate(df.iloc())]  # row[0] is the row's first column (source path); outputs are named by row position
+            for _ in as_completed(threads):  # NOTE(review): .result() is never called, so an exception raised in a worker is not surfaced here
+                pbar.update(1)
+
+def create_df(root_path:str, dataset_name:str=None):  # build one row (path, caption text, metadata) per line of root_path/metadata.csv; dataset_name is not used
+    df = pd.read_csv(os.path.join(root_path, 'metadata.csv'), sep='|')  # pipe-separated metadata file
+
+    df_data = []
+    for row in df.iloc:  # yields the rows one at a time by position
+        path = os.path.join(root_path, row['audio_recording'].replace('wavs/', 'cleaned_webm/'))  # audio is read from 'cleaned_webm/' in place of the listed 'wavs/'
+        text = row['description'].format(user_id=f"A {row['gender']} in their {row['age']}", transcription=row['utterance'], emotion=row['emotion']) + f" Emotion intensity: {row['level']}."  # 'description' is filled via str.format with user_id, transcription and emotion
+        df_data.append({'path':path, 'text':text, 'original_data':{'age': row['age'], 'gender':row['gender'], 'emotion':row['emotion']}, 'transcript':row['utterance'], "level":row['level']})  # 'path' stays the first key: split_all_audio_files reads it as row[0]
+
+    return pd.DataFrame(df_data)
+
+
+if __name__ == '__main__':  # script entry: build metadata, split, convert and tar; the S3 upload at the end is commented out
+    import multiprocessing
+
+    max_workers = multiprocessing.cpu_count()  # only printed; split_all_audio_files below is called without it
+    print("Num workers: ", max_workers)
+    chunk = 512  # passed as the third positional argument to tardir below
+
+    root_path = '/admin/home-knoriy/DELETEME/EMNS/'
+    dataset_name = 'EMNS'
+
+    s3 = fsspec.filesystem('s3')  # only referenced by the commented-out upload at the end
+    s3_dest = f'laion-west-audio/webdataset_tar/{dataset_name}/'
+
+    # load metadata and configure audio paths
+    df = create_df(root_path)
+
+    # create train, test, valid splits
+    train, test = train_test_split(df, test_size=0.2)  # 80% train, 20% held out
+    valid, test = train_test_split(test, test_size=0.2)  # held-out part is split again: 80% valid, 20% test
+    train_test_val = {'valid/':valid, 'train/':train, 'test/':test}
+
+
+    
+    for key in tqdm.tqdm(train_test_val, desc=f'processing:'):
+        df = train_test_val[key]
+        
+        dest_path = os.path.join(root_path.replace(dataset_name, f'{dataset_name}_processed'), key)
+        os.makedirs(dest_path, exist_ok=True)
+
+        split_all_audio_files(df, dest_path)
+        tardir(dest_path, dest_path, chunk, delete_file=True)  # project helper from utils.make_tar_utils; presumably packs dest_path into tar shards -- TODO confirm
+
+        # upload to s3 and delete local
+        # s3.put(dest_path, os.path.join(s3_dest, key), recursive=True)
+        # shutil.rmtree(dest_path)
\ No newline at end of file
diff --git a/current_dataset/preprocess_EmoV_DB.py b/current_dataset/preprocess_EmoV_DB.py
new file mode 100644
index 0000000..3d2aab5
--- /dev/null
+++ b/current_dataset/preprocess_EmoV_DB.py
@@ -0,0 +1,111 @@
+import os
+from sre_parse import Verbose
+import sys
+import json
+import tqdm
+import pandas as pd
+import pathlib
+import fsspec
+import shutil
+
+
+from multiprocessing import Pool
+from itertools import repeat
+from sklearn.model_selection import train_test_split
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
+from utils.audio_utils import audio_to_flac
+from utils.make_tar_utils import tardir
+
+
+def convert_and_json_dump(df:pd.DataFrame, overwrite:bool=False, verbose:bool=False):  # convert one item to FLAC and write its sidecar JSON; `df` is read by key like a single row despite the annotation
+    dest = df['dest']  # output FLAC path
+    file = df['path']  # source audio path
+
+    os.makedirs(pathlib.Path(dest).parent, exist_ok=True)  # create the output directory before the existence check
+
+    if os.path.isfile(dest) and overwrite==False:  # existing FLAC outputs are skipped unless overwrite is requested
+        if verbose==True:
+            print(f'{dest} already exists, skiping')
+        return
+    audio_to_flac(file, dest)  # project helper from utils.audio_utils; presumably writes the FLAC to `dest` -- TODO confirm
+    with open(dest.replace('.flac', '.json'), 'w') as f:  # sidecar path: same name with '.json' in place of '.flac'
+        json.dump({'filename': os.path.join(*dest.split('/')[3:]), 'text':[df['text']], 'original_data':df['original_data']}, f)  # 'filename' drops the first three '/'-separated parts of dest
+    return dest.replace('.flac', '.json')  # path of the JSON just written; the skip branch above returns None
+
+def extract_tars(dir:pathlib.Path, dest:pathlib.Path):  # untar every *.tar.gz found under `dir` into a sub-directory of `dest` derived from the archive name
+    glob = dir.glob("**/*.tar.gz")  # recursive search; the local name `glob` is a generator here
+
+    for path in glob:
+        path = pathlib.Path(path)
+        tmp_dest = dest.joinpath(*(path.stem.split('_'))).with_suffix('')  # 'a_b.tar.gz' -> dest/a/b (stem 'a_b.tar' split on '_', trailing '.tar' removed)
+        tmp_dest.mkdir(parents=True, exist_ok=True)
+        cmd = f'tar -xf {path} -C {tmp_dest}'
+        os.system(cmd)  # the exit status is not checked
+
+
+def run_tasks(extract:bool=False, overwrite:bool=False, verbose:bool=False, chunksize:int=1):
+    """Convert EmoV_DB wavs to FLAC+JSON per split, tar each split, upload it to S3, then delete local copies."""
+    dataset_name = 'EmoV_DB'
+    chunk = 512
+
+    s3 = fsspec.filesystem('s3')
+    s3_dest = pathlib.Path(f's-laion-audio/webdataset_tar/{dataset_name}/')
+
+    root_data_dir = pathlib.Path('/home/knoriy/fsx/raw_datasets/EmoV_db/')
+    extracted_data_dir = pathlib.Path('/home/knoriy/fsx/raw_datasets/EmoV_db/raw/')
+    if extract:
+        extract_tars(root_data_dir, extracted_data_dir)
+
+    raw_df = pd.read_csv(root_data_dir.joinpath('cmuarctic.csv'), sep="\t", header=None)
+
+    glob = extracted_data_dir.glob('**/**/*.wav')
+    train, test = train_test_split(list(glob), test_size=0.3)  # 70% train, 30% held out
+    test, valid = train_test_split(list(test), test_size=0.3)  # held-out part: 70% test, 30% valid
+    train_test_valid = {'train':train, 'test':test, 'valid':valid}
+
+    EmoV_DB_gender = {'sam':'male', 'jenie':'female', 'josh':'male', 'bea':'female'}  # was 'females' for 'bea' (typo leaked into captions)
+
+    for key in train_test_valid:
+        dest_path = None
+        df_data = []
+        for i, path in enumerate(train_test_valid[key]):
+            root_path = path.parents[0]
+            file_name = path.name
+            emotion = root_path.name  # the wav's parent directory names the emotion
+            actor = root_path.parents[0].name  # the directory above it names the actor
+            dest_path = str(path.parents[3].joinpath('EmoV_DB_tars', key)).replace('raw_datasets', 'processed_datasets')
+
+            current_file = raw_df.loc[int(file_name.split('.')[0].split('_')[-1])-1]  # trailing number of the file name, 1-based, selects the csv row
+
+            data = {}
+
+            data['gender'] = EmoV_DB_gender[actor]
+            data['emotion'] = emotion
+            data['path'] = path
+            data['dest'] = str(pathlib.Path(dest_path).joinpath(f'{i}.flac'))
+            data['text'] = f'A {EmoV_DB_gender[actor]} saying "{current_file[1]}" in a {emotion} voice'
+            data['original_data'] = {'gender':EmoV_DB_gender[actor], 'emotion':emotion, 'raw_text':current_file[1]}
+
+
+            df_data.append(data)
+
+        df = pd.DataFrame(df_data)
+
+        print(f'starting pool for {key}')
+        with Pool() as pool:
+            for result in tqdm.tqdm(pool.starmap(convert_and_json_dump, zip(df.iloc, repeat(overwrite), repeat(verbose)), chunksize=chunksize), total=len(df_data)):
+                pass
+
+        tardir(dest_path, dest_path, chunk, delete_file=True)
+
+        # upload to s3 and delete local
+        s3.put(dest_path, s3_dest.joinpath(key), recursive=True)
+        print('File Uploaded to: ', s3_dest.joinpath(key))
+        shutil.rmtree(dest_path)
+
+    # clean Extracted Files
+    shutil.rmtree(extracted_data_dir)
+
+if __name__ == '__main__':  # script entry point
+    run_tasks(extract=True, chunksize=10)  # extract the archives first; chunksize is forwarded to pool.starmap inside run_tasks
diff --git a/current_dataset/process_GigaSpeech.py b/current_dataset/preprocess_GigaSpeech.py
similarity index 57%
rename from current_dataset/process_GigaSpeech.py
rename to current_dataset/preprocess_GigaSpeech.py
index 947b2a5..1a7a94e 100644
--- a/current_dataset/process_GigaSpeech.py
+++ b/current_dataset/preprocess_GigaSpeech.py
@@ -21,10 +21,14 @@
 from utils.audio_utils import audio_to_flac
 from utils.make_tar_utils import tardir
 
-def convert_and_json_dump(file:str, dest:str, df):
+def convert_and_json_dump(file:str, dest:str, df, overwrite:bool=False):
+    if os.path.isfile(dest) and overwrite==False:
+        print(f'{dest} already exists, skiping')
+        return
+    
     audio_to_flac(file, dest, segment_start=df['begin_time'], segment_end=df['end_time'])
     with open(dest.replace('.flac', '.json'), 'w') as f:
-        json.dump({'filename': os.path.join(*dest.split('/')[5:]), 'text':df['text'], 'tag':df['tag']}, f)
+        json.dump({'filename': os.path.join(*dest.split('/')[5:]), 'text':[df['text']], 'original_data':df['original_data']}, f)
 
 
 def split_all_audio_files(df, dest_root_path, max_workers=96):
@@ -42,11 +46,11 @@ def split_all_audio_files(df, dest_root_path, max_workers=96):
     import multiprocessing
 
     max_workers = multiprocessing.cpu_count()
-    max_workers = 2
+    print("Num workers: ", max_workers)
     chunk = 512
 
-    root_path = '/mnt/knoriy/raw_datasets/gigaspeech/'
-    metadata_dir = "/mnt/knoriy/raw_datasets/gigaspeech/GigaSpeech.json"
+    root_path = '/home/knoriy/fsx/raw_datasets/GigaSpeech/gigaspeech/'
+    metadata_dir = "/home/knoriy/fsx/raw_datasets/GigaSpeech/gigaspeech/GigaSpeech.json"
 
     dataset_name = 'gigaspeech'
 
@@ -54,30 +58,36 @@ def split_all_audio_files(df, dest_root_path, max_workers=96):
     s3_dest = f's-laion/knoriy/GigaSpeech/{dataset_name}_tars/'
 
     # load metadata and configure audio paths
-    raw_df = pd.read_json(metadata_dir)[:2]
-
-    new_df_data = []
-    for row in tqdm.tqdm(raw_df.iloc(), total=len(raw_df), desc='Generating dataframe: '):
-        for seg in row['audios']['segments']:
-            try:
-                catagory = row['audios']['category']
-            except:
-                catagory = 'N/A'
-            
-            if seg['text_tn'] == '<SIL>':
-                continue
-
-            new_df_data.append(
-                {'path':f'{os.path.join(root_path, row["audios"]["path"])}', 
-                'begin_time': seg['begin_time'], 
-                'end_time': seg['end_time'], 
-                'text': seg['text_tn'],
-                'tag':{ 'language':row['language'], 
-                        'url':row['audios']['url'], 
-                        'category':catagory,
-                        'speaker':row['audios']['speaker']}
-                })
-    df = pd.DataFrame(new_df_data)
+    cache_df_path = os.path.join(root_path, 'temp_df.csv')
+    if os.path.isfile(cache_df_path):
+        df = pd.read_csv(cache_df_path, sep='\t')
+    else:
+        raw_df = pd.read_json(metadata_dir)
+
+        new_df_data = []
+        for row in tqdm.tqdm(raw_df.iloc(), total=len(raw_df), desc='Generating dataframe: '):
+            for seg in row['audios']['segments']:
+                try:
+                    catagory = row['audios']['category']
+                except:
+                    catagory = 'N/A'
+                
+                if seg['text_tn'] == '<SIL>' or seg['text_tn'] == '<NOISE>':
+                    continue
+
+                new_df_data.append(
+                    {'path':f'{os.path.join(root_path, row["audios"]["path"])}', 
+                    'begin_time': seg['begin_time'], 
+                    'end_time': seg['end_time'], 
+                    'text': seg['text_tn'],
+                    'original_data':{ 'language':row['language'], 
+                            'url':row['audios']['url'], 
+                            'category':catagory,
+                            'speaker':row['audios']['speaker']}
+                    })
+        df = pd.DataFrame(new_df_data)
+        df.to_csv(cache_df_path, sep='\t', index=False)
+        
     print(df.head())
 
     # create train, test, valid splits
diff --git a/current_dataset/preprocess_LJSpeech.py b/current_dataset/preprocess_LJSpeech.py
index 84d3a57..9c345b9 100644
--- a/current_dataset/preprocess_LJSpeech.py
+++ b/current_dataset/preprocess_LJSpeech.py
@@ -21,10 +21,14 @@
 from utils.audio_utils import audio_to_flac
 from utils.make_tar_utils import tardir
 
-def convert_and_json_dump(file:str, dest:str, df):
+def convert_and_json_dump(file:str, dest:str, df, overwrite:bool=False):
+    if os.path.isfile(dest) and overwrite==False:
+        print(f'{dest} already exists, skiping')
+        return
+
     audio_to_flac(file, dest)
     with open(dest.replace('.flac', '.json'), 'w') as f:
-        json.dump({'filename': os.path.join(*dest.split('/')[4:]), 'text':df['norm_text'], 'tag':{'raw_text':df['raw_text']}}, f)
+        json.dump({'filename': os.path.join(*dest.split('/')[4:]), 'text':[df['norm_text']], 'original_data':{'raw_text':df['raw_text']}}, f)
 
 
 def split_all_audio_files(df, dest_root_path, max_workers=96):
@@ -42,12 +46,12 @@ def split_all_audio_files(df, dest_root_path, max_workers=96):
     import multiprocessing
 
     max_workers = multiprocessing.cpu_count()
+    print("Num workers: ", max_workers)
     chunk = 512
     generate_subset_tsv = True
 
-    root_path = '/home/knoriy/datasets/raw_datasets/ljspeech/'
-    tar_dir = "/home/knoriy/datasets/raw_datasets/ljspeech/LJSpeech-1.1.tar.bz2"
-    metadata_dir = "/home/knoriy/datasets/raw_datasets/ljspeech/metadata.csv"
+    root_path = '/home/knoriy/fsx/raw_datasets/ljspeech/'
+    metadata_dir = "/home/knoriy/fsx/raw_datasets/ljspeech/metadata.csv"
 
     dataset_name = 'ljspeech'
 
diff --git a/current_dataset/preprocess_RAVDESS.py b/current_dataset/preprocess_RAVDESS.py
new file mode 100644
index 0000000..dd56cb4
--- /dev/null
+++ b/current_dataset/preprocess_RAVDESS.py
@@ -0,0 +1,104 @@
+"""
+Code for preprocessing the RAVDESS corpus:
+https://zenodo.org/record/1188976
+"""
+
+import glob
+import tqdm
+import os
+import glob
+import pandas as pd
+import sys
+import tarfile
+import json
+import shutil
+import fsspec
+
+from sklearn.model_selection import train_test_split
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
+from utils.audio_utils import audio_to_flac
+from utils.make_tar_utils import tardir
+
+def convert_and_json_dump(file:str, dest:str, df, overwrite:bool=False, verbose=False):  # convert `file` to FLAC at `dest` and write a sidecar JSON beside it
+    if os.path.isfile(dest) and overwrite==False:  # existing FLAC outputs are skipped unless overwrite is requested
+        if verbose==True:
+            print(f'{dest} already exists, skiping')
+        return
+
+    audio_to_flac(file, dest)  # project helper from utils.audio_utils; presumably writes the FLAC to `dest` -- TODO confirm
+    with open(dest.replace('.flac', '.json'), 'w') as f:  # sidecar path: same name with '.json' in place of '.flac'
+        json.dump({'filename': os.path.join(*dest.split('/')[5:]), 'text':[df['text']]}, f)  # 'filename' drops the first five '/'-separated parts of dest; no 'original_data' key is written here
+
+
+def split_all_audio_files(df, dest_root_path, max_workers=96):  # run convert_and_json_dump for every row of `df` on a thread pool
+    if not os.path.exists(dest_root_path):  # the destination directory must be created by the caller
+        raise FileNotFoundError(f'Please Check {dest_root_path} exists')
+
+    l = len(df)
+    with tqdm.tqdm(total=l, desc=f'Processing {dest_root_path}') as pbar:
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            threads = [executor.submit(convert_and_json_dump, row[0], os.path.join(dest_root_path, f'{i}.flac'), row) for i, row in enumerate(df.iloc())]  # row[0] is the row's first column (source path); outputs are named by row position
+            for _ in as_completed(threads):  # NOTE(review): .result() is never called, so an exception raised in a worker is not surfaced here
+                pbar.update(1)
+
+def create_df(root_path:str, dataset_name:str=None):  # build one row (path, caption text) per wav found under root_path; dataset_name is not used
+    wavs = glob.glob(os.path.join(root_path, '**/*.wav'), recursive=True)
+    codes = {   'modality':{'01':'full-AV', '02':'video-only', '03':'audio-only'},  # the order of these tables must match the order of the '-'-separated fields in the file name
+                'Vocal channel':{'01':'speech', '02':'song'},
+                'Emotion':{'01':'neutral', '02':'calm', '03':'happy', '04':'sad', '05':'angry', '06':'fearful', '07':'disgust', '08':'surprised'},
+                'Emotional intensity':{'01':'normal', '02':'strong'},
+                'Statement':{'01':"Kids are talking by the door", '02':"Dogs are sitting by the door"},
+                'Repetition':{'01':1, '02':2},
+                }
+    df_data = []
+    for wav in tqdm.tqdm(wavs):
+        file_name = os.path.basename(wav).split('.')[0]
+        wav_codes = file_name.split('-')  # only the first six fields are decoded below; later fields are not read
+
+        text = []
+        for i, code in enumerate(codes.values()):  # decode field i with table i (relies on dict insertion order)
+            text.append(code[wav_codes[i]])
+
+        song_or_speech = 'says' if text[1] == 'speech' else 'sings'  # text[1] is the decoded vocal channel
+        text = f'A person {song_or_speech}, "{text[4]}" in a {text[2]} and {text[3]} voice.'  # text[4] statement, text[2] emotion, text[3] intensity; `text` is rebound from list to str
+        df_data.append({ 'path':wav, 'text':text})
+
+    return pd.DataFrame(df_data)
+
+
+if __name__ == '__main__':  # script entry: build metadata, split, convert, tar, upload to S3, delete local output
+    import multiprocessing
+
+    max_workers = multiprocessing.cpu_count()  # not used below; split_all_audio_files is called without it
+    # print("Num workers: ", max_workers)
+    chunk = 512  # passed as the third positional argument to tardir below
+
+    root_path = '/home/knoriy/fsx/raw_datasets/RAVDESS/ravdess/'
+    dataset_name = 'ravdess'
+
+    s3 = fsspec.filesystem('s3')
+    s3_dest = f's-laion/knoriy/RAVDESS/{dataset_name}_tars/'
+
+    # load metadata and configure audio paths
+    df = create_df(root_path)
+
+    # create train, test, valid splits
+    train, test = train_test_split(df, test_size=0.2)  # 80% train, 20% held out
+    valid, test = train_test_split(test, test_size=0.2)  # held-out part is split again: 80% valid, 20% test
+    train_test_val = {'train/':train, 'test/':test, 'valid/':valid}
+
+    
+    for key in tqdm.tqdm(train_test_val, desc=f'processing:'):
+        df = train_test_val[key]
+        
+        dest_path = os.path.join(root_path.replace('raw_datasets', 'processed_datasets'), key)
+        os.makedirs(dest_path, exist_ok=True)
+
+        split_all_audio_files(df, dest_path)
+        tardir(dest_path, dest_path, chunk, delete_file=True)  # project helper from utils.make_tar_utils; presumably packs dest_path into tar shards -- TODO confirm
+
+        # upload to s3 and delete local
+        s3.put(dest_path, os.path.join(s3_dest, key), recursive=True)
+        shutil.rmtree(dest_path)
\ No newline at end of file
diff --git a/current_dataset/preprocess_audioset.py b/current_dataset/preprocess_audioset.py
new file mode 100644
index 0000000..d569e47
--- /dev/null
+++ b/current_dataset/preprocess_audioset.py
@@ -0,0 +1,124 @@
+import os
+import json
+import sys
+import tqdm
+import json
+import pathlib
+import fsspec
+import pandas as pd
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from datasets import load_dataset
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
+
+from utils.audio_utils import audio_to_flac
+from utils.make_tar_utils import tardir
+
+
+def get_json(file_path, class_metadata, ontology_dict, class_only=False):
+    """Build the caption JSON for one AudioSet clip.
+
+    file_path: clip path; its basename minus '.wav' is the key into class_metadata.
+    class_metadata: maps audio id -> comma-separated (optionally quoted) label ids.
+    ontology_dict: maps label id -> (name, description).
+    class_only: caption uses bare names when True, else 'name (description)'.
+    Returns {'text': ..., 'original_data': {...}}; raises ValueError if no names result.
+    """
+    audio_id = os.path.basename(file_path).replace('.wav', '')
+    class_labels = class_metadata[audio_id].replace('"', '').split(',')
+    names = [ontology_dict[c][0] for c in class_labels]
+    descriptions = [ontology_dict[c][1] for c in class_labels]
+    class_names = names if class_only else [f"{n} ({d})" for n, d in zip(names, descriptions)]
+    if not class_names:
+        raise ValueError("No class label found for audio id: {}".format(audio_id))
+    if len(class_names) > 1:
+        text = "The sounds of " + ", ".join(class_names[:-1]) + " and " + class_names[-1]
+    else:
+        # Single label: use the whole name (indexing it yielded characters and a tuple).
+        text = "The sound of " + class_names[0]
+    original_data = {'class_labels': class_labels, 'class_names': names, 'class_descriptions': descriptions}
+    return {'text': text, 'original_data': original_data}
+
+def convert_and_json_dump(file:str, dest:str, df, class_metadata, ontology_dict, class_only=False, overwrite:bool=False):
+    """Convert `file` to FLAC at `dest` and write the clip's caption JSON beside it (`df` is accepted but unused)."""
+    json_path = dest.replace('.flac', '.json')
+    if os.path.isfile(dest) and os.path.isfile(json_path) and not overwrite:
+        print(f'{dest} already exists, skiping')
+        return
+    audio_to_flac(file, dest)
+    # Build the metadata once, honouring class_only, and before opening the file so a failed lookup leaves no empty .json.
+    m_dump = get_json(file, class_metadata, ontology_dict, class_only=class_only)
+    m_dump['filename'] = os.path.join(*dest.split('/')[5:])
+    with open(json_path, 'w') as f:
+        json.dump(m_dump, f)
+
+
+def split_all_audio_files(data, dest_root_path, ontology_dict, class_metadata, max_workers=96):
+    """Submit one convert_and_json_dump job per row of `data` (output `<index>.flac`); raises FileNotFoundError if `dest_root_path` is missing."""
+    if not os.path.exists(dest_root_path):
+        raise FileNotFoundError(f'Please Check {dest_root_path} exists')
+    with tqdm.tqdm(total=len(data), desc=f'Processing {dest_root_path}') as pbar:
+        # Size the pool from max_workers; the argument used to be accepted and then ignored.
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            threads = [executor.submit(convert_and_json_dump, row["audio"]["path"], os.path.join(dest_root_path, f'{i}.flac'), row, class_metadata, ontology_dict, False, False) for i, row in enumerate(data)]
+            for _ in as_completed(threads):
+                pbar.update(1)
+
+
+def main():
+    """Load the AudioSet label metadata, then convert, tar and upload every language/split.
+
+    Reads /tmp/ontology.json and /tmp/eval_segments.csv, which must already exist.
+    NOTE(review): the metadata is AudioSet's, yet the audio loaded below is Mozilla Common Voice;
+    this looks carried over from preprocess_common_voice.py - confirm before running.
+    """
+
+    import multiprocessing
+    max_workers = multiprocessing.cpu_count()
+
+    # Ontology: label id -> (name, description). Fetch it first with
+    # !wget -O /tmp/ontology.json https://raw.githubusercontent.com/audioset/ontology/master/ontology.json
+    with open('/tmp/ontology.json') as f:
+        ontology = json.load(f)
+    ontology_dict = {entry['id']: (entry['name'], entry['description']) for entry in ontology}
+
+    # Segment list: YTID -> positive_labels. Fetch it first with
+    # !wget -O /tmp/eval_segments.csv http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/eval_segments.csv
+    with open('/tmp/eval_segments.csv') as f:
+        lines = f.readlines()
+    header_list = ['YTID', 'start_seconds', 'end_seconds', 'positive_labels']
+    # the first three lines of the file are dropped before splitting the fields
+    rows = [line.strip().split(', ') for line in lines[3:]]
+    segments = pd.DataFrame(rows, columns=header_list)
+    class_metadata = dict(zip(segments.YTID, segments.positive_labels))
+
+    langs = ['ab', 'ar', 'en', 'fa', 'fr', 'es', 'sl', 'kab', 'cy', 'ca', 'de', 'tt', 'ta', 'ru', 'nl', 'it', 'eu', 'tr', 'zh-TW', 'br', 'pt', 'eo', 'zh-CN', 'id', 'ia', 'lv', 'ja', 'rw', 'sv-SE', 'cnh', 'et', 'ky', 'ro', 'hsb', 'el', 'cs', 'pl', 'rm-sursilv', 'rm-vallader', 'mn', 'zh-HK', 'cv', 'uk', 'mt', 'as', 'ka', 'fy-NL', 'dv', 'pa-IN', 'vi', 'or', 'ga-IE', 'fi', 'hu', 'th', 'lt', 'lg', 'hi', 'bas', 'sk', 'kmr', 'bg', 'kk', 'ba', 'gl', 'ug', 'hy-AM', 'be', 'ur', 'gn', 'sr', 'uz', 'mr', 'da', 'myv', 'nn-NO', 'ha', 'ckb', 'ml', 'mdf', 'sw', 'sat', 'tig', 'ig', 'nan-tw', 'mhr', 'bn', 'tok', 'yue', 'sah', 'mk', 'sc', 'skr', 'ti', 'mrj', 'tw', 'vot', 'az', 'ast', 'ne-NP']
+    dataset_name = "common_voice_11_0"
+    s3 = fsspec.filesystem('s3')
+
+    with tqdm.tqdm(total=len(langs)) as pbar:
+        for lang in langs:
+            pbar.set_description(f'Prcessing {lang}')
+            for hf_split in ("train", "test", "validation"):
+                dataset = load_dataset(f"mozilla-foundation/{dataset_name}", lang, split=hf_split)
+
+                # output folders and S3 keys say "valid" where the hub split is "validation"
+                split = "valid" if hf_split == "validation" else hf_split
+                root_dest_path = pathlib.Path(f"/fsx/knoriy/processed_datasets/{dataset_name}/{lang}/{split}/")
+                root_dest_path.mkdir(parents=True, exist_ok=True)
+
+                split_all_audio_files(dataset, root_dest_path, ontology_dict, class_metadata, max_workers)
+                tardir(str(root_dest_path), str(root_dest_path), 512, delete_file=False)
+
+                # Upload only tar files to s3
+                tar_files = root_dest_path.glob('*.tar')
+                for tar in tar_files:
+                    pbar.set_description(f'Prcessing {lang}: uploading {str(tar)} to s3')
+                    s3_dest = f's-laion-audio/webdataset_tar/{dataset_name}/{lang}/{split}/{tar.name}'
+                    s3.put(str(tar), s3_dest)
+
+            # one progress tick per language
+            pbar.update(1)
+
+if __name__ == '__main__':
+    main()
diff --git a/current_dataset/preprocess_common_voice.py b/current_dataset/preprocess_common_voice.py
new file mode 100644
index 0000000..0f3f75d
--- /dev/null
+++ b/current_dataset/preprocess_common_voice.py
@@ -0,0 +1,76 @@
+import os
+import sys
+import tqdm
+import json
+import pathlib
+import fsspec
+import shutil
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import soundfile as sf
+from datasets import load_dataset
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
+
+
+from utils.audio_utils import audio_to_flac
+from utils.make_tar_utils import tardir
+
+
+def convert_and_json_dump(file:str, dest:str, df, overwrite:bool=False):
+    """Write the decoded audio held in `df` to `dest` and its metadata JSON beside it; `file` is not read."""
+    if not overwrite and os.path.isfile(dest) and os.path.isfile(dest.replace('.flac', '.json')):
+        print(f'{dest} already exists, skiping')
+        return
+    sf.write(dest, df['audio']['array'], df['audio']['sampling_rate'])
+    with open(dest.replace('.flac', '.json'), 'w') as f:
+        json.dump({'filename': os.path.join(*dest.split('/')[5:]), 'text':[df['sentence']], 'original_data':{**{key: df[key] for key in ('up_votes', 'down_votes', 'age', 'gender', 'accent')}, 'language':df['locale']}}, f)
+
+
+def split_all_audio_files(data, dest_root_path, max_workers=96):
+    """Submit one convert_and_json_dump job per row of `data` (output `<index>.flac`); raises FileNotFoundError if `dest_root_path` is missing."""
+    if not os.path.exists(dest_root_path):
+        raise FileNotFoundError(f'Please Check {dest_root_path} exists')
+    with tqdm.tqdm(total=len(data), desc=f'Processing {dest_root_path}') as pbar:
+        # Size the pool from max_workers; the argument used to be accepted and then ignored.
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            threads = [executor.submit(convert_and_json_dump, row["audio"]["path"], os.path.join(dest_root_path, f'{i}.flac'), row, False) for i, row in enumerate(data)]
+            for _ in as_completed(threads):
+                pbar.update(1)
+
+
+
+def main():
+    """Convert every Common Voice language/split to FLAC + JSON, tar it and upload the tars to S3.
+
+    Files are staged under /fsx/knoriy/processed_datasets/<dataset>/<lang>/<split>/.
+    """
+    import multiprocessing
+    max_workers = multiprocessing.cpu_count()
+
+    langs = ['ab', 'ar', 'en', 'fa', 'fr', 'es', 'sl', 'kab', 'cy', 'ca', 'de', 'tt', 'ta', 'ru', 'nl', 'it', 'eu', 'tr', 'zh-TW', 'br', 'pt', 'eo', 'zh-CN', 'id', 'ia', 'lv', 'ja', 'rw', 'sv-SE', 'cnh', 'et', 'ky', 'ro', 'hsb', 'el', 'cs', 'pl', 'rm-sursilv', 'rm-vallader', 'mn', 'zh-HK', 'cv', 'uk', 'mt', 'as', 'ka', 'fy-NL', 'dv', 'pa-IN', 'vi', 'or', 'ga-IE', 'fi', 'hu', 'th', 'lt', 'lg', 'hi', 'bas', 'sk', 'kmr', 'bg', 'kk', 'ba', 'gl', 'ug', 'hy-AM', 'be', 'ur', 'gn', 'sr', 'uz', 'mr', 'da', 'myv', 'nn-NO', 'ha', 'ckb', 'ml', 'mdf', 'sw', 'sat', 'tig', 'ig', 'nan-tw', 'mhr', 'bn', 'tok', 'yue', 'sah', 'mk', 'sc', 'skr', 'ti', 'mrj', 'tw', 'vot', 'az', 'ast', 'ne-NP']
+    dataset_name = "common_voice_11_0"
+    s3 = fsspec.filesystem('s3')
+
+    with tqdm.tqdm(total=len(langs)) as pbar:
+        for lang in langs:
+            pbar.set_description(f'Prcessing {lang}')
+            for hf_split in ("train", "test", "validation"):
+                dataset = load_dataset(f"mozilla-foundation/{dataset_name}", lang, split=hf_split)
+
+                # output folders and S3 keys say "valid" where the hub split is "validation"
+                split = "valid" if hf_split == "validation" else hf_split
+                root_dest_path = pathlib.Path(f"/fsx/knoriy/processed_datasets/{dataset_name}/{lang}/{split}/")
+                root_dest_path.mkdir(parents=True, exist_ok=True)
+
+                split_all_audio_files(dataset, root_dest_path, max_workers)
+                tardir(str(root_dest_path), str(root_dest_path), 512, delete_file=False)
+
+                # Upload only tar files to s3
+                for tar in root_dest_path.glob('*.tar'):
+                    pbar.set_description(f'Prcessing {lang}: uploading {str(tar)} to s3')
+                    s3_dest = f's-laion-audio/webdataset_tar/{dataset_name}/{lang}/{split}/{tar.name}'
+                    s3.put(str(tar), s3_dest)
+            pbar.update(1)
+
+if __name__ == '__main__':
+    main()
diff --git a/current_dataset/preprocess_mswc.py b/current_dataset/preprocess_mswc.py
index 70cf3f0..5d36731 100644
--- a/current_dataset/preprocess_mswc.py
+++ b/current_dataset/preprocess_mswc.py
@@ -14,92 +14,104 @@
 import shutil
 import fsspec
 
-from concurrent.futures import ThreadPoolExecutor, as_completed
+import multiprocessing
+from multiprocessing import Pool
+from itertools import repeat
+
 
 sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
 from utils.audio_utils import audio_to_flac
 from utils.make_tar_utils import tardir
 
-def convert_and_json_dump(file:str, dest:str, df):
+def convert_and_json_dump(df:pd.DataFrame, overwrite:bool=False, verbose:bool=False):
+    """Convert the clip at df['src_path'] to FLAC at df['dest_path'] and write its JSON sidecar; `df` is one metadata row."""
+    dest = df['dest_path']
+    file = df['src_path']
+    if os.path.isfile(dest) and not overwrite:
+        if verbose: print(f'{dest} already exists, skiping')  # `verbose` used to be ignored
+        return
     audio_to_flac(file, dest)
     with open(dest.replace('.flac', '.json'), 'w') as f:
-        json.dump({'filename': os.path.join(*dest.split('/')[4:]), 'text':df['WORD'], 'tag':{'gender':df['GENDER'], 'language':dest.split('/')[-2]}}, f)
-
-
-def split_all_audio_files(df, src_root_path, dest_root_path, max_workers=96):
-    if not os.path.exists(dest_root_path):
-        raise FileNotFoundError(f'Please Check {dest_root_path} exists')
+        json.dump({'filename': os.path.join(*dest.split('/')[4:]), 'text':[df['WORD']], 'original_data':{'gender':df['GENDER'], 'language':dest.split('/')[-3]}}, f)
 
-    l = len(df)
-    with tqdm.tqdm(total=l, desc=f'Processing {dest_root_path}') as pbar:
-        with ThreadPoolExecutor(max_workers=max_workers) as executor:
-            threads = [executor.submit(convert_and_json_dump, os.path.join(src_root_path, row['LINK']), os.path.join(dest_root_path, f'{i}.flac'), row) for i, row in enumerate(df.iloc())]
-            for _ in as_completed(threads):
-                pbar.update(1)
+def split_all_audio_files(df, overwrite:bool=False, verbose:bool=False, chunksize:int=1):
+    """Run convert_and_json_dump on every row of `df` in a process pool; starmap blocks until all rows finish."""
+    print('starting pool')
+    with Pool() as pool:
+        for _ in tqdm.tqdm(pool.starmap(convert_and_json_dump, zip(df.iloc, repeat(overwrite), repeat(verbose)), chunksize=chunksize), total=len(df)): pass
 
 if __name__ == '__main__':
-    import multiprocessing
+    import argparse
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--job", help='Directory to the files to process, e.g. "/home/knoriy/fsx/raw_datasets/mswc/audio/fr.tar.gz" ', required=True)
+    args = parser.parse_args()
     
     max_workers = multiprocessing.cpu_count()
     chunk = 512
     generate_subset_tsv = True
 
-    root_path = '/home/knoriy/datasets/raw_datasets/mswc/'
-    tar_dir = "/home/knoriy/datasets/raw_datasets/mswc/mswc.tar.gz"
+    root_path = '/fsx/knoriy/raw_datasets/mswc/'
+    tar_dir = "/fsx/knoriy/raw_datasets/mswc/mswc.tar.gz"
     dataset_name = 'mswc'
 
     s3 = fsspec.filesystem('s3')
-    s3_dest = f's-laion/multilingual_spoken_words/{dataset_name}_tars/'
+    s3_dest = f's-laion-audio/webdataset_tar/{dataset_name}/'
 
     language_tars_dirs = sorted(glob.glob(os.path.join(root_path, "audio/**.tar.gz")))
     if not language_tars_dirs:
         raise FileNotFoundError(f"Please check that the file have been extracted: {root_path}")
 
-    for dir in tqdm.tqdm(language_tars_dirs, desc=f'processing: '):
-        audio_path = dir
-        with tarfile.open(audio_path, mode='r:gz') as mswc_audio:
-            audio_path = os.path.split(audio_path)[0]
-            mswc_audio.extractall(audio_path)
-
-        splits_path = dir.replace('audio', 'splits')
-        with tarfile.open(splits_path, mode='r:gz') as mswc_split:
-            splits_path = splits_path.replace('.tar.gz', '/')
-            mswc_split.extractall(splits_path)
-
-        tmp = glob.glob(os.path.join(splits_path, '**.csv'), recursive=True)
-        csv_paths = []
-        for csv_path in tmp:
-            if '_splits.csv' not in csv_path:
-                csv_paths.append(csv_path)
-
-        for csv_path in csv_paths:
-            if 'train' in csv_path:
-                train_test_dev = 'train/'
-            elif 'test' in csv_path:
-                train_test_dev = 'test/'
-            elif 'dev' in csv_path:
-                train_test_dev = 'valid/'
-            else:
-                train_test_dev = 'other/'
-            df = pd.read_csv(csv_path)
-
-            # Convert to .flac
-            dest_path  = splits_path.replace('.tar.gz', '/').replace('/raw_datasets/', '/processed_datasets/').replace('splits/', '')
-            dest_path  = os.path.join(dest_path, train_test_dev)
-
-            src_path = os.path.join(splits_path.replace('.tar.gz', '/').replace('splits/', 'audio/'), 'clips')
-            os.makedirs(dest_path, exist_ok=True)
-            os.makedirs(src_path, exist_ok=True)
-
-            split_all_audio_files(df, src_path, dest_path, max_workers)
-
-            tardir(dest_path, dest_path, chunk, delete_file=True)
-
-            # upload to s3 and delete local
-            s3.put(dest_path, os.path.join(s3_dest, os.path.basename(dir.split('.')[0]), train_test_dev), recursive=True)
-            print('File Uploaded to: ', os.path.join(s3_dest, os.path.basename(dir.split('.')[0]), train_test_dev))
-            shutil.rmtree(dest_path)
-
-        # clean extracted files
-        shutil.rmtree(splits_path.replace('splits/', 'audio/'))
-        shutil.rmtree(splits_path)
\ No newline at end of file
+    dir = args.job
+
+    with tarfile.open(dir, mode='r:gz') as mswc_audio:
+        audio_path = os.path.split(dir)[0]
+        mswc_audio.extractall(audio_path)
+
+    splits_path = dir.replace('audio', 'splits')
+    with tarfile.open(splits_path, mode='r:gz') as mswc_split:
+        splits_path = splits_path.replace('.tar.gz', '/')
+        mswc_split.extractall(splits_path)
+
+    tmp = glob.glob(os.path.join(splits_path, '**.csv'), recursive=True)
+    csv_paths = []
+    for csv_path in tmp:
+        if '_splits.csv' not in csv_path:
+            csv_paths.append(csv_path)
+
+    for csv_path in csv_paths:
+        if 'train' in csv_path:
+            train_test_dev = 'train/'
+        elif 'test' in csv_path:
+            train_test_dev = 'test/'
+        elif 'dev' in csv_path:
+            train_test_dev = 'valid/'
+        else:
+            train_test_dev = 'other/'
+        # Convert to .flac
+        dest_path  = splits_path.replace('.tar.gz', '/').replace('/raw_datasets/', '/processed_datasets/').replace('splits/', '')
+        dest_path  = os.path.join(dest_path, train_test_dev)
+
+        src_path = os.path.join(splits_path.replace('.tar.gz', '/').replace('splits/', 'audio/'), 'clips')
+        os.makedirs(dest_path, exist_ok=True)
+        os.makedirs(src_path, exist_ok=True)
+
+        df = pd.read_csv(csv_path)
+        df['dest_path'] = [os.path.join(dest_path, f'{i}.flac') for i, _ in enumerate(df.iloc())]
+        df['src_path'] = [os.path.join(src_path, row['LINK']) for i, row in enumerate(df.iloc())]
+        
+        print("nan found", len(df[df.isna().any(axis=1)]))
+        df = df.dropna()
+        print("nan after drop:", len(df[df.isna().any(axis=1)]))
+
+        split_all_audio_files(df, overwrite=True, chunksize=max_workers)
+
+        tardir(dest_path, dest_path, chunk, delete_file=True)
+
+        # upload to s3 and delete local
+        s3.put(dest_path, os.path.join(s3_dest, os.path.basename(dir.split('.')[0]), train_test_dev), recursive=True)
+        print('File Uploaded to: s3://', os.path.join(s3_dest, os.path.basename(dir.split('.')[0]), train_test_dev))
+        shutil.rmtree(dest_path)
+
+    # # clean extracted files
+    # shutil.rmtree(splits_path.replace('splits/', 'audio/'))
+    # shutil.rmtree(splits_path)
\ No newline at end of file
diff --git a/current_dataset/start_slurm_jobs.py b/current_dataset/start_slurm_jobs.py
new file mode 100644
index 0000000..a9a8b0d
--- /dev/null
+++ b/current_dataset/start_slurm_jobs.py
@@ -0,0 +1,7 @@
+import os
+import glob
+
+# Start one background srun job per MSWC language archive; logs go to outs/<job id>.out.
+for path in glob.glob('/fsx/knoriy/raw_datasets/mswc/audio/*.tar.gz'):
+    command = f"srun --comment clap --output=outs/%j.out --exclusive /fsx/home-knoriy/miniconda3/envs/audio_dataset/bin/python /fsx/knoriy/code/audio-dataset/current_dataset/preprocess_mswc.py --job {path} &"
+    os.system(command)
\ No newline at end of file
diff --git a/data_preprocess/preprocess_audioset.py b/data_preprocess/preprocess_audioset.py
index fd5ae05..8308192 100644
--- a/data_preprocess/preprocess_audioset.py
+++ b/data_preprocess/preprocess_audioset.py
@@ -4,6 +4,7 @@
 import glob
 from tqdm import tqdm
 import sys
+import json
 
 sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
 from utils.audio_utils import audio_to_flac
@@ -56,24 +57,24 @@ def process_single_audio(file_path, json_data, output_dir):
 
     # Load metadata
     unbalanced_csv_path = os.path.join(args.metadata_dir, f'{args.metadata_name}.csv')
-    with open(unbalanced_csv_path, 'r') as f:
+    with open(unbalanced_csv_path) as f:
         lines = f.readlines()
+        lines = lines[3:]
+        header_list = ['YTID', 'start_seconds', 'end_seconds', 'positive_labels']
+        class_metadata = [l.strip().split(', ') for l in lines]
+        class_metadata = pd.DataFrame(class_metadata, columns=header_list)
+        class_metadata = dict(zip(class_metadata.YTID, class_metadata.positive_labels))
 
-    lines = lines[3:]
-    header_list = ['YTID', 'start_seconds', 'end_seconds', 'positive_labels']
-    class_metadata = [l.strip().split(', ') for l in lines]
-    class_metadata = pd.DataFrame(class_metadata, columns=header_list)
+    with open(os.path.join(args.metadata_dir,'ontology.json')) as f:
+        ontology = json.load(f)
+        ontology_dict = {i['id']: (i['name'], i['description']) for i in ontology}
 
-    class_to_name_map = pd.read_csv(os.path.join(args.metadata_dir, 'class_labels_indices.csv'))
-
-    class_metadata = dict(zip(class_metadata.YTID, class_metadata.positive_labels))
-    class_to_name_map = dict(zip(class_to_name_map.mid, class_to_name_map.display_name))
 
     wav_all = glob.glob(f'{args.wav_dir}/*.wav')
     futures = []
     for file in tqdm(wav_all):
         # process_single_audio(file, class_metadata, class_to_name_map, args.output_dir)
-        json_data = get_json(file, class_metadata, class_to_name_map)
+        json_data = get_json(file, class_metadata, ontology_dict)
         futures.append(
             executor.submit(partial(process_single_audio, file, json_data, args.output_dir)))
 
diff --git a/data_preprocess/preprocess_audioset.sh b/data_preprocess/preprocess_audioset.sh
index 83965e9..38cdaf4 100644
--- a/data_preprocess/preprocess_audioset.sh
+++ b/data_preprocess/preprocess_audioset.sh
@@ -1,79 +1,79 @@
 #!/bin/bash
 
-# preliminary: create /mnt/audio_clip/audioset, clone code from audio-dataset
+# preliminary: create /tmp/audioset, clone code from audio-dataset
 
-cd /mnt/audio_clip/audioset
+cd /tmp/audioset
 mkdir metadata
 cd metadata
 
 wget http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/eval_segments.csv
 wget http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/balanced_train_segments.csv
 wget http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/unbalanced_train_segments.csv
-wget https://raw.githubusercontent.com/qiuqiangkong/audioset_tagging_cnn/master/metadata/class_labels_indices.csv
+wget https://raw.githubusercontent.com/audioset/ontology/master/ontology.json
 
-cd ~/audio-dataset
+cd /fsx/knoriy/code/audio-dataset
 
 for i in $(seq -w 00 40)
 do
-  aws s3 --region us-east-1 cp s3://laion-audio/raw_dataset/audioset/unbalanced_train_segments/unbalanced_train_segments_part"${i}"_partial.zip /mnt/audio_clip/audioset/zip/
-  aws s3 --region us-east-1 cp s3://laion-audio/raw_dataset/audioset/unbalanced_train_segments/unbalanced_train_segments_part"${i}"_partial.z01 /mnt/audio_clip/audioset/zip/
-  aws s3 --region us-east-1 cp s3://laion-audio/raw_dataset/audioset/unbalanced_train_segments/unbalanced_train_segments_part"${i}"_partial.z02 /mnt/audio_clip/audioset/zip/
+  aws s3 cp s3://s-laion-audio/raw_dataset/audioset/unbalanced_train_segments/unbalanced_train_segments_part"${i}"_partial.zip /tmp/audioset/zip/
+  aws s3 cp s3://s-laion-audio/raw_dataset/audioset/unbalanced_train_segments/unbalanced_train_segments_part"${i}"_partial.z01 /tmp/audioset/zip/
+  aws s3 cp s3://s-laion-audio/raw_dataset/audioset/unbalanced_train_segments/unbalanced_train_segments_part"${i}"_partial.z02 /tmp/audioset/zip/
 
   7z e /mnt/audio_clip/audioset/zip/unbalanced_train_segments_part"${i}"_partial.zip -o/mnt/audio_clip/audioset/audios
 
   python data_preprocess/preprocess_audioset.py \
-  --metadata_dir /mnt/audio_clip/audioset/metadata \
+  --metadata_dir /tmp/audioset/metadata \
   --metadata_name unbalanced_train_segments \
-  --wav_dir /mnt/audio_clip/audioset/audios \
-  --output_dir /mnt/audio_clip/audioset/processed_data
+  --wav_dir /tmp/audioset/audios \
+  --output_dir /tmp/audioset/processed_data
 
-  rm /mnt/audio_clip/audioset/zip/unbalanced_train_segments_part"${i}"_partial*
-  rm -rf /mnt/audio_clip/audioset/audios
+  # rm /tmp/audioset/zip/unbalanced_train_segments_part"${i}"_partial*
+  # rm -rf /tmp/audioset/audios
 done
 
-aws s3 --region us-east-1 cp s3://laion-audio/raw_dataset/audioset/balanced_train_segments.zip /mnt/audio_clip/audioset/
-aws s3 --region us-east-1 cp s3://laion-audio/raw_dataset/audioset/eval_segments.zip /mnt/audio_clip/audioset/
+aws s3 cp s3://s-laion-audio/raw_dataset/audioset/balanced_train_segments.zip /tmp/audioset/
+aws s3 cp s3://s-laion-audio/raw_dataset/audioset/eval_segments.zip /tmp/audioset/
 
-cd /mnt/audio_clip/audioset/
+cd /tmp/audioset/
 unzip balanced_train_segments.zip
 unzip eval_segments.zip
 
-cd ~/audio-dataset
+cd /fsx/knoriy/code/audio-dataset
 
 python data_preprocess/preprocess_audioset.py \
---metadata_dir /mnt/audio_clip/audioset/metadata \
+--metadata_dir /tmp/audioset/metadata \
 --metadata_name balanced_train_segments \
---wav_dir /mnt/audio_clip/audioset/balanced_train_segments \
---output_dir /mnt/audio_clip/audioset/processed_data_balanced_train_segments
+--wav_dir /tmp/audioset/balanced_train_segments \
+--output_dir /tmp/audioset/processed_data_balanced_train_segments
 
 python data_preprocess/preprocess_audioset.py \
---metadata_dir /mnt/audio_clip/audioset/metadata \
+--metadata_dir /tmp/audioset/metadata \
 --metadata_name eval_segments \
---wav_dir /mnt/audio_clip/audioset/eval_segments \
---output_dir /mnt/audio_clip/audioset/processed_data_eval_segments
+--wav_dir /tmp/audioset/eval_segments \
+--output_dir /tmp/audioset/processed_data_eval_segments
 
-python data_check/remove_bad_flac.py --dir /mnt/audio_clip/audioset/processed_data_eval_segments
-python data_check/remove_bad_flac.py --dir /mnt/audio_clip/audioset/processed_data_balanced_train_segments
-python data_check/remove_bad_flac.py --dir /mnt/audio_clip/audioset/processed_data
+python data_check/remove_bad_flac.py --dir /tmp/audioset/processed_data_eval_segments
+python data_check/remove_bad_flac.py --dir /tmp/audioset/processed_data_balanced_train_segments
+python data_check/remove_bad_flac.py --dir /tmp/audioset/processed_data
 
 python ./utils/make_tar.py \
---input /mnt/audio_clip/audioset/processed_data \
---output /mnt/audio_clip/audioset/webdataset_tar/unbalanced_train/ \
+--input /tmp/audioset/processed_data \
+--output /tmp/audioset/webdataset_tar/unbalanced_train/ \
 --dataclass none \
 --delete_file
 
 python ./utils/make_tar.py \
---input /mnt/audio_clip/audioset/processed_data_balanced_train_segments \
---output /mnt/audio_clip/audioset/webdataset_tar/balanced_train/ \
+--input /tmp/audioset/processed_data_balanced_train_segments \
+--output /tmp/audioset/webdataset_tar/balanced_train/ \
 --dataclass none \
 --delete_file
 
 python ./utils/make_tar.py \
---input /mnt/audio_clip/audioset/processed_data_eval_segments \
---output /mnt/audio_clip/audioset/webdataset_tar/eval/ \
+--input /tmp/audioset/processed_data_eval_segments \
+--output /tmp/audioset/webdataset_tar/eval/ \
 --dataclass none \
 --delete_file
 
-aws s3 --region us-east-1 cp /mnt/audio_clip/audioset/webdataset_tar/balanced_train s3://laion-audio/webdataset_tar/audioset/balanced_train --recursive
-aws s3 --region us-east-1 cp /mnt/audio_clip/audioset/webdataset_tar/eval s3://laion-audio/webdataset_tar/audioset/eval --recursive
-aws s3 --region us-east-1 cp /mnt/audio_clip/audioset/webdataset_tar/unbalanced_train s3://laion-audio/webdataset_tar/audioset/unbalanced_train --recursive
\ No newline at end of file
+aws s3 cp /tmp/audioset/webdataset_tar/balanced_train s3://s-laion-audio/webdataset_tar/audioset_description/balanced_train --recursive
+aws s3 cp /tmp/audioset/webdataset_tar/eval s3://s-laion-audio/webdataset_tar/audioset_description/eval --recursive
+aws s3 cp /tmp/audioset/webdataset_tar/unbalanced_train s3://s-laion-audio/webdataset_tar/audioset_description/unbalanced_train --recursive
\ No newline at end of file
diff --git a/download_script/download_and_preprocess_common_voice.sh b/download_script/download_and_preprocess_common_voice.sh
new file mode 100644
index 0000000..9b23336
--- /dev/null
+++ b/download_script/download_and_preprocess_common_voice.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+#SBATCH --partition=cpu16
+#SBATCH --job-name=audio-dataset
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=8
+#SBATCH --comment clap
+#SBATCH --output=%x_%j.out
+#SBATCH --exclusive
+
+# Search paths first, then the NCCL / libfabric (EFA) / OpenMPI settings.
+export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/nccl/build/lib:/opt/aws-ofi-nccl-install/lib:/opt/aws-ofi-nccl/lib"
+export PATH="$PATH:/opt/amazon/efa/bin:/opt/amazon/openmpi/bin"
+export NCCL_PROTO=simple
+export NCCL_DEBUG=info
+export NCCL_TREE_THRESHOLD=0
+export FI_PROVIDER=efa
+export FI_EFA_FORK_SAFE=1
+export FI_LOG_LEVEL=1
+export FI_EFA_USE_DEVICE_RDMA=1 # use for p4dn
+export FI_EFA_ENABLE_SHM_TRANSFER=0
+export FI_EFA_TX_MIN_CREDITS=64
+export OMPI_MCA_mtl_base_verbose=1
+
+echo "Running job on $SLURM_JOB_NUM_NODES,"
+
+srun --comment clap /fsx/home-knoriy/miniconda3/envs/audio_dataset/bin/python /fsx/knoriy/code/audio-dataset/current_dataset/preprocess_common_voice.py
\ No newline at end of file
diff --git a/data_preprocess/environment.yml b/environment.yml
similarity index 77%
rename from data_preprocess/environment.yml
rename to environment.yml
index eb59b44..e036fab 100644
--- a/data_preprocess/environment.yml
+++ b/environment.yml
@@ -1,10 +1,12 @@
 name: audio_dataset
 channels:
   - pytorch
+  - conda-forge
   - defaults
 dependencies:
   - _libgcc_mutex=0.1=main
   - _openmp_mutex=5.1=1_gnu
+  - absl-py=1.1.0=pyhd8ed1ab_0
   - aiobotocore=2.1.0=pyhd3eb1b0_0
   - aiohttp=3.8.1=py39h7f8727e_1
   - aioitertools=0.7.1=pyhd3eb1b0_0
@@ -14,28 +16,37 @@ dependencies:
   - attrs=21.4.0=pyhd3eb1b0_0
   - backcall=0.2.0=pyhd3eb1b0_0
   - blas=1.0=mkl
+  - blinker=1.4=py_1
   - botocore=1.23.24=pyhd3eb1b0_0
   - bottleneck=1.3.4=py39hce1f21e_0
   - brotlipy=0.7.0=py39h27cfd23_1003
   - bzip2=1.0.8=h7b6447c_0
-  - ca-certificates=2022.4.26=h06a4308_0
-  - certifi=2022.5.18.1=py39h06a4308_0
+  - c-ares=1.18.1=h7f98852_0
+  - ca-certificates=2022.6.15=ha878542_0
+  - cachetools=5.0.0=pyhd8ed1ab_0
+  - certifi=2022.6.15=py39hf3d152e_0
   - cffi=1.15.0=py39hd667e15_1
   - charset-normalizer=2.0.4=pyhd3eb1b0_0
+  - click=8.1.3=py39hf3d152e_0
   - cryptography=37.0.1=py39h9ce1e76_0
   - cudatoolkit=11.3.1=h2bc3f7f_2
   - debugpy=1.5.1=py39h295c915_0
   - decorator=5.1.1=pyhd3eb1b0_0
   - entrypoints=0.4=py39h06a4308_0
   - executing=0.8.3=pyhd3eb1b0_0
-  - ffmpeg=4.3=hf484d3e_0
+  - ffmpeg=4.3.2=hca11adc_0
   - freetype=2.11.0=h70c0345_0
   - frozenlist=1.2.0=py39h7f8727e_0
   - fsspec=2022.1.0=pyhd3eb1b0_0
+  - future=0.18.2=py39hf3d152e_5
   - giflib=5.2.1=h7b6447c_0
   - gmp=6.2.1=h295c915_3
   - gnutls=3.6.15=he1e5248_0
+  - google-auth=2.9.0=pyh6c4a22f_0
+  - google-auth-oauthlib=0.4.6=pyhd8ed1ab_0
+  - grpcio=1.42.0=py39hce63b2e_0
   - idna=3.3=pyhd3eb1b0_0
+  - importlib-metadata=4.11.4=py39hf3d152e_0
   - intel-openmp=2021.4.0=h06a4308_3561
   - ipykernel=6.9.1=py39h06a4308_0
   - ipython=8.3.0=py39h06a4308_0
@@ -53,6 +64,7 @@ dependencies:
   - libiconv=1.16=h7f8727e_2
   - libidn2=2.3.2=h7f8727e_0
   - libpng=1.6.37=hbc83047_0
+  - libprotobuf=3.15.8=h780b84a_1
   - libsodium=1.0.18=h7b6447c_0
   - libstdcxx-ng=11.2.0=h1234567_1
   - libtasn1=4.16.0=h27cfd23_0
@@ -62,6 +74,7 @@ dependencies:
   - libwebp=1.2.2=h55f646e_0
   - libwebp-base=1.2.2=h7f8727e_0
   - lz4-c=1.9.3=h295c915_1
+  - markdown=3.3.7=pyhd8ed1ab_0
   - matplotlib-inline=0.1.2=pyhd3eb1b0_2
   - mkl=2021.4.0=h06a4308_640
   - mkl-service=2.4.0=py39h7f8727e_0
@@ -74,8 +87,9 @@ dependencies:
   - numexpr=2.8.1=py39h6abb31d_0
   - numpy=1.22.3=py39he7a7128_0
   - numpy-base=1.22.3=py39hf524024_0
+  - oauthlib=3.2.0=pyhd8ed1ab_0
   - openh264=2.1.1=h4ff587b_0
-  - openssl=1.1.1o=h7f8727e_0
+  - openssl=1.1.1p=h5eee18b_0
   - packaging=21.3=pyhd3eb1b0_0
   - pandas=1.4.2=py39h295c915_0
   - parso=0.8.3=pyhd3eb1b0_0
@@ -84,26 +98,39 @@ dependencies:
   - pillow=9.0.1=py39h22f2fdc_0
   - pip=21.2.4=py39h06a4308_0
   - prompt-toolkit=3.0.20=pyhd3eb1b0_0
+  - protobuf=3.15.8=py39he80948d_0
   - ptyprocess=0.7.0=pyhd3eb1b0_2
   - pure_eval=0.2.2=pyhd3eb1b0_0
+  - pyasn1=0.4.8=py_0
+  - pyasn1-modules=0.2.7=py_0
   - pycparser=2.21=pyhd3eb1b0_0
   - pygments=2.11.2=pyhd3eb1b0_0
+  - pyjwt=2.4.0=pyhd8ed1ab_0
   - pyopenssl=22.0.0=pyhd3eb1b0_0
   - pyparsing=3.0.4=pyhd3eb1b0_0
   - pysocks=1.7.1=py39h06a4308_0
   - python=3.9.12=h12debd9_1
   - python-dateutil=2.8.2=pyhd3eb1b0_0
+  - python_abi=3.9=2_cp39
   - pytorch=1.11.0=py3.9_cuda11.3_cudnn8.2.0_0
+  - pytorch-lightning=0.8.5=py_0
   - pytorch-mutex=1.0=cuda
   - pytz=2022.1=py39h06a4308_0
+  - pyu2f=0.1.5=pyhd8ed1ab_0
+  - pyyaml=6.0=py39hb9d737c_4
   - pyzmq=22.3.0=py39h295c915_2
   - readline=8.1.2=h7f8727e_1
   - requests=2.27.1=pyhd3eb1b0_0
+  - requests-oauthlib=1.3.1=pyhd8ed1ab_0
+  - rsa=4.8=pyhd8ed1ab_0
   - s3fs=2022.1.0=pyhd3eb1b0_0
   - setuptools=61.2.0=py39h06a4308_0
   - six=1.16.0=pyhd3eb1b0_1
   - sqlite=3.38.3=hc218d9a_0
   - stack_data=0.2.0=pyhd3eb1b0_0
+  - tensorboard=2.9.1=pyhd8ed1ab_0
+  - tensorboard-data-server=0.6.0=py39hd97740a_2
+  - tensorboard-plugin-wit=1.8.1=pyhd8ed1ab_0
   - tk=8.6.12=h1ccaba5_0
   - torchaudio=0.11.0=py39_cu113
   - torchvision=0.12.0=py39_cu113
@@ -115,11 +142,15 @@ dependencies:
   - tzdata=2022a=hda174b7_0
   - urllib3=1.26.9=py39h06a4308_0
   - wcwidth=0.2.5=pyhd3eb1b0_0
+  - werkzeug=2.1.2=pyhd8ed1ab_1
   - wheel=0.37.1=pyhd3eb1b0_0
   - wrapt=1.13.3=py39h7f8727e_2
+  - x264=1!161.3030=h7f98852_1
   - xz=5.2.5=h7f8727e_1
+  - yaml=0.2.5=h7f98852_2
   - yarl=1.6.3=py39h27cfd23_0
   - zeromq=4.3.4=h2531618_0
+  - zipp=3.8.0=pyhd8ed1ab_0
   - zlib=1.2.12=h7f8727e_2
   - zstd=1.5.2=ha4553b6_0
   - pip:
@@ -131,11 +162,10 @@ dependencies:
     - llvmlite==0.38.1
     - numba==0.55.2
     - pooch==1.6.0
-    - pyyaml==6.0
     - resampy==0.2.2
     - scikit-learn==1.1.1
     - scipy==1.8.1
     - soundfile==0.10.3.post1
     - threadpoolctl==3.1.0
     - webdataset==0.2.5
-prefix: /home/knoriy/miniconda3/envs/audio_dataset
+prefix: /home/knoriy/fsx/miniconda3/envs/audio_dataset
diff --git a/utils/make_tar_utils.py b/utils/make_tar_utils.py
index 91d7bc4..a55c553 100644
--- a/utils/make_tar_utils.py
+++ b/utils/make_tar_utils.py
@@ -36,15 +36,15 @@ def tardir(
     if n_split * n_entry_each != len(filelist):
         n_split += 1
     size_dict = {
-        os.path.basename(tar_name) + str(i) + ".tar": n_entry_each
+        os.path.join(os.path.basename(tar_name), str(i) + ".tar"): n_entry_each
         for i in range(n_split)
     }
     if n_split * n_entry_each != len(filelist):
-        size_dict[os.path.basename(tar_name) + str(n_split - 1) + ".tar"] = (
+        size_dict[os.path.join(os.path.basename(tar_name), str(n_split - 1) + ".tar")] = (
             len(filelist) - (n_split - 1) * n_entry_each
         )
     for i in tqdm(range(start_idx, n_split + start_idx), desc='Creating .tar file:'):
-        with tarfile.open(tar_name + str(i) + ".tar", "w") as tar_handle:
+        with tarfile.open(os.path.join(tar_name, str(i) + ".tar"), "w") as tar_handle:
             for j in range(count, len(filelist)):
                 audio = filelist[j]
                 basename = ".".join(audio.split(".")[:-1])