diff --git a/.gitignore b/.gitignore index b323e78..938f3a2 100644 --- a/.gitignore +++ b/.gitignore @@ -133,4 +133,5 @@ dmypy.json utils/__pycache__/make_tar_utils.cpython-37.pyc /data_preprocess/process_audioset/ -*.out \ No newline at end of file +*.out +test.*py* \ No newline at end of file diff --git a/current_dataset/ToDO.md b/current_dataset/ToDO.md new file mode 100644 index 0000000..2cf9838 --- /dev/null +++ b/current_dataset/ToDO.md @@ -0,0 +1,6 @@ +# ToDo + +- [X] LJSpeech +- [X] MSWC +- [ ] GigaSpeech +- [ ] CoVoST diff --git a/current_dataset/preprocess_CREMA-D.py b/current_dataset/preprocess_CREMA-D.py new file mode 100644 index 0000000..8b936c7 --- /dev/null +++ b/current_dataset/preprocess_CREMA-D.py @@ -0,0 +1,130 @@ +""" +Code for preprocess GigaSpeech Corpus: +https://github.com/SpeechColab/GigaSpeech +""" + +import glob +import tqdm +import os +import glob +import pandas as pd +import sys +import tarfile +import json +import shutil +import fsspec + +from sklearn.model_selection import train_test_split +from concurrent.futures import ThreadPoolExecutor, as_completed +from multiprocessing import Pool + +sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) +from utils.audio_utils import audio_to_flac +from utils.make_tar_utils import tardir + +def convert_and_json_dump(file:str, dest:str, df, overwrite:bool=True, verbose=False): + if os.path.isfile(dest) and overwrite==False: + if verbose==True: + print(f'{dest} already exists, skiping') + return + audio_to_flac(file, dest) + with open(dest.replace('.flac', '.json'), 'w') as f: + json.dump({'filename': os.path.join(*dest.split('/')[5:]), 'text':[df['text']], 'original_data':df['tag']}, f) + + +def split_all_audio_files(df, dest_root_path, max_workers=96): + if not os.path.exists(dest_root_path): + raise FileNotFoundError(f'Please Check {dest_root_path} exists') + + l = len(df) + with tqdm.tqdm(total=l, desc=f'Processing {dest_root_path}') as pbar: + with ThreadPoolExecutor(max_workers=max_workers) as executor: + threads = [executor.submit(convert_and_json_dump, row[0], os.path.join(dest_root_path, f'{i}.flac'), row) for i, row in enumerate(df.iloc())] + for _ in as_completed(threads): + pbar.update(1) + +def create_df(root_path:str, dataset_name:str=None): + wavs = glob.glob(os.path.join(root_path, 'AudioWAV', '**/*.wav'), recursive=True) + codes = { 'Statement':{ 'IEO':"It's eleven o'clock", + 'TIE':"That is exactly what happened", + 'IOM':"I'm on my way to the meeting", + 'IWW':"I wonder what this is about", + 'TAI':"The airplane is almost full", + 'MTI':"Maybe tomorrow it will be cold", + 'IWL':"I would like a new alarm clock", + 'ITH':"I think I have a doctor's appointment", + 'DFA':"Don't forget a jacket", + 'ITS':"I think I've seen this before", + 'TSI':"The surface is slick", + 'WSI':"We'll stop in a couple of minutes", + }, + 'Emotion':{ 'ANG':'angry', + 'DIS':'disgusted', + 'FEA':'fearful', + 'HAP':'happy', + 'NEU':'neutral', + 'SAD':'sad', + }, + 'Emotional intensity':{ 'LO':'low', + 'MD':'medium', + 'HI':'high', + 'XX':'unspecified', + }, + } + demographics = pd.read_csv(os.path.join(root_path, 'VideoDemographics.csv'), names=["ActorID","Age","Sex","Race","Ethnicity"]) + df_data = [] + for wav in tqdm.tqdm(wavs): + file_name = os.path.basename(wav).split('.')[0] + wav_codes = file_name.split('_') + text_meta = [codes['Statement'][wav_codes[1]], codes['Emotion'][wav_codes[2]], codes['Emotional intensity'][wav_codes[3]]] + demograpthics_meta = demographics.loc[demographics['ActorID'] == wav_codes[0]] + + male_or_female = 'woman' if demograpthics_meta["Sex"].values[0] == 'Female' else 'man' + intensity = '' if text_meta[2] == 'unspecified' else f' with {text_meta[2]} emotional intensity' + text = f'A {demograpthics_meta["Age"].values[0]} year-old {male_or_female}, saying "{text_meta[0]}" in a {text_meta[1]} voice{intensity}.' + df_data.append({ 'path':wav, 'text':text, 'tag':{'transcript':text_meta[0], 'language':'english', 'emotion':text_meta[1], 'emotion_intensity':text_meta[2], 'gender':demograpthics_meta["Sex"].values[0], 'age':demograpthics_meta["Age"].values[0] }}) + + return pd.DataFrame(df_data) + + +if __name__ == '__main__': + import multiprocessing + + max_workers = multiprocessing.cpu_count() + print("Num workers: ", max_workers) + chunk = 512 + + root_path = '/admin/home-knoriy/DELETEME/CREMA-D/' + dataset_name = 'CREMA-D' + + s3 = fsspec.filesystem('s3') + s3_dest = f'laion-west-audio/webdataset_tar/{dataset_name}/' + + original_tar_dir = '/fsx/knoriy/raw_datasets/CREMA-D/crema-d.tar.gz' + + # print('Extracting tar') + # with tarfile.open(original_tar_dir, mode='r:gz') as file: + # audio_path = os.path.split(original_tar_dir)[0] + # file.extractall(audio_path) + + # load metadata and configure audio paths + df = create_df(root_path) + + # create train, test, valid splits + train, test = train_test_split(df, test_size=0.2) + valid, test = train_test_split(test, test_size=0.2) + train_test_val = {'valid/':valid, 'train/':train, 'test/':test} + + + for key in tqdm.tqdm(train_test_val, desc=f'processing:'): + df = train_test_val[key] + + dest_path = os.path.join(root_path.replace('CREMA-D', 'CREMA-D_processed').replace('AudioWAV/', ''), key) + os.makedirs(dest_path, exist_ok=True) + + split_all_audio_files(df, dest_path) + tardir(dest_path, dest_path, chunk, delete_file=True) + + # upload to s3 and delete local + s3.put(dest_path, os.path.join(s3_dest, key), recursive=True) + shutil.rmtree(dest_path) \ No newline at end of file diff --git a/current_dataset/preprocess_CoVoST.py b/current_dataset/preprocess_CoVoST.py new file mode 100644 index 0000000..35f463c --- /dev/null +++ b/current_dataset/preprocess_CoVoST.py @@ -0,0 +1,192 @@ +""" +Code for preprocess LJSpeech Corpus: +https://keithito.com/LJ-Speech-Dataset/ +""" + +import glob +from tabnanny import verbose +from tokenize import Name +import tqdm +import os +import glob +import pandas as pd +import sys +import tarfile +import json +import shutil +import fsspec + +from sklearn.model_selection import train_test_split +from concurrent.futures import ThreadPoolExecutor, as_completed + +sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) +from utils.audio_utils import audio_to_flac +from utils.make_tar_utils import tardir + +def convert_and_json_dump(file:str, dest:str, df, overwrite:bool=False, verbose=False): + if os.path.isfile(dest) and overwrite==False: + if verbose==True: + print(f'{dest} already exists, skiping') + return + audio_to_flac(file, dest) + with open(dest.replace('.flac', '.json'), 'w') as f: + json.dump({'filename': os.path.join(*dest.split('/')[3:]), 'text':[df['text']], 'original_data':df['original_data']}, f) + +def split_all_audio_files(df, dest_root_path, max_workers=96): + if not os.path.exists(dest_root_path): + raise FileNotFoundError(f'Please Check {dest_root_path} exists') + + l = len(df) + with tqdm.tqdm(total=l, desc=f'Processing {dest_root_path}') as pbar: + with ThreadPoolExecutor(max_workers=max_workers) as executor: + threads = [executor.submit(convert_and_json_dump, row[0], os.path.join(dest_root_path, f'{i}.flac'), row, overwrite=False, verbose=False) for i, row in enumerate(df.iloc())] + for _ in as_completed(threads): + pbar.update(1) + +def download_tsvs(urls:list, output_dir:str, extract:bool=False): + os.makedirs(output_dir, exist_ok=True) + for url in tqdm.tqdm(urls, desc="Downloading tsvs"): + dest_path = os.path.join(output_dir, url.split("/")[-1]) + if os.path.isfile(dest_path): + continue + os.system(f'curl {url} --output {dest_path}') + + if extract: + os.system(f'tar -xf {dest_path}') + +def extract_covost_2_tsvs(tsv_tar_dir:str, dest:str, cv_tsv:str, version=2): + extract_covost_2_tar_cmd = f'tar -xf {tsv_tar_dir} -C {dest}' + os.system(extract_covost_2_tar_cmd) + + src_lang, tgt_lang = os.path.basename(tsv_tar_dir).split('.')[1].split('_') + get_covost_splits_cmd = f'python /home/knoriy/fsx/raw_datasets/CoVoST_2/covost/get_covost_splits.py \ + --version {version} \ + --src-lang {src_lang} \ + --tgt-lang {tgt_lang} \ + --root {dest} \ + --cv-tsv {cv_tsv} \ + ' + os.system(get_covost_splits_cmd) + +if __name__ == '__main__': + import multiprocessing + + x_2_eng = [ + "https://dl.fbaipublicfiles.com/covost/covost_v2.fr_en.tsv.tar.gz", + "https://dl.fbaipublicfiles.com/covost/covost_v2.de_en.tsv.tar.gz", + "https://dl.fbaipublicfiles.com/covost/covost_v2.es_en.tsv.tar.gz", + "https://dl.fbaipublicfiles.com/covost/covost_v2.ca_en.tsv.tar.gz", + "https://dl.fbaipublicfiles.com/covost/covost_v2.it_en.tsv.tar.gz", + "https://dl.fbaipublicfiles.com/covost/covost_v2.ru_en.tsv.tar.gz", + "https://dl.fbaipublicfiles.com/covost/covost_v2.zh-CN_en.tsv.tar.gz", + "https://dl.fbaipublicfiles.com/covost/covost_v2.pt_en.tsv.tar.gz", + "https://dl.fbaipublicfiles.com/covost/covost_v2.fa_en.tsv.tar.gz", + "https://dl.fbaipublicfiles.com/covost/covost_v2.et_en.tsv.tar.gz", + "https://dl.fbaipublicfiles.com/covost/covost_v2.mn_en.tsv.tar.gz", + "https://dl.fbaipublicfiles.com/covost/covost_v2.nl_en.tsv.tar.gz", + "https://dl.fbaipublicfiles.com/covost/covost_v2.tr_en.tsv.tar.gz", + "https://dl.fbaipublicfiles.com/covost/covost_v2.ar_en.tsv.tar.gz", + "https://dl.fbaipublicfiles.com/covost/covost_v2.sv-SE_en.tsv.tar.gz", + "https://dl.fbaipublicfiles.com/covost/covost_v2.lv_en.tsv.tar.gz", + "https://dl.fbaipublicfiles.com/covost/covost_v2.sl_en.tsv.tar.gz", + "https://dl.fbaipublicfiles.com/covost/covost_v2.ta_en.tsv.tar.gz", + "https://dl.fbaipublicfiles.com/covost/covost_v2.ja_en.tsv.tar.gz", + "https://dl.fbaipublicfiles.com/covost/covost_v2.id_en.tsv.tar.gz", + "https://dl.fbaipublicfiles.com/covost/covost_v2.cy_en.tsv.tar.gz", + ] + eng_2_x = [ + 'https://dl.fbaipublicfiles.com/covost/covost_v2.en_de.tsv.tar.gz', + 'https://dl.fbaipublicfiles.com/covost/covost_v2.en_ca.tsv.tar.gz', + 'https://dl.fbaipublicfiles.com/covost/covost_v2.en_zh-CN.tsv.tar.gz', + 'https://dl.fbaipublicfiles.com/covost/covost_v2.en_fa.tsv.tar.gz', + 'https://dl.fbaipublicfiles.com/covost/covost_v2.en_et.tsv.tar.gz', + 'https://dl.fbaipublicfiles.com/covost/covost_v2.en_mn.tsv.tar.gz', + 'https://dl.fbaipublicfiles.com/covost/covost_v2.en_tr.tsv.tar.gz', + 'https://dl.fbaipublicfiles.com/covost/covost_v2.en_ar.tsv.tar.gz', + 'https://dl.fbaipublicfiles.com/covost/covost_v2.en_sv-SE.tsv.tar.gz', + 'https://dl.fbaipublicfiles.com/covost/covost_v2.en_lv.tsv.tar.gz', + 'https://dl.fbaipublicfiles.com/covost/covost_v2.en_sl.tsv.tar.gz', + 'https://dl.fbaipublicfiles.com/covost/covost_v2.en_ta.tsv.tar.gz', + 'https://dl.fbaipublicfiles.com/covost/covost_v2.en_ja.tsv.tar.gz', + 'https://dl.fbaipublicfiles.com/covost/covost_v2.en_id.tsv.tar.gz', + 'https://dl.fbaipublicfiles.com/covost/covost_v2.en_cy.tsv.tar.gz', + ] + get_language_from_key = { + 'en':'english', + 'de':'german', + 'fr':'french', + 'nl':'dutch', + 'ru':'russian', + 'es':'spanish', + 'it':'italian', + 'tr':'turkish', + 'fa':'persian', + 'ca':'catalan', + 'zh-cn':'chinese', + 'pt':'portuguese', + 'et':'estonian', + 'mn':'mongolian', + 'ar':'arabic', + 'sv-se':'swedish', + 'lv':'latvian', + 'sl':'slovenian', + 'ta':'tamil', + 'ja':'japanese', + 'id':'indonesian', + 'cy':'welsh', + } + + max_workers = multiprocessing.cpu_count() + chunk = 512 + generate_subset_tsv = True + + root_path = '/home/knoriy/fsx/raw_datasets/CoVoST_2/' + metadata_dir = "/home/knoriy/fsx/raw_datasets/CoVoST_2/" + + dataset_name = 'CoVoST_2' + COMMON_VOICE_VERSION = 'cv-corpus-10.0-2022-07-04' + + s3 = fsspec.filesystem('s3') + s3_dest = f's-laion/knoriy/{dataset_name}/{dataset_name}_tars/' + + download_tsvs(eng_2_x, os.path.join(root_path, 'tsvs/')) + # download_tsvs(x_2_eng, "/home/knoriy/fsx/raw_datasets/CoVoST_2/tsvs") + + # uncomment to extract and create CoVoST tsvs + # for tar in tqdm.tqdm(glob.glob(os.path.join(root_path, 'tsvs/**/*.tar.gz'), recursive=True), desc='Extracting tsvs'): + # extract_covost_2_tsvs(tar, os.path.join(root_path, 'tsvs/'), '/home/knoriy/fsx/raw_datasets/CoVoST_2/cv-corpus-10.0-2022-07-04/en/validated.tsv') + + # load metadata and configure audio paths + tsvs = [] + for tsv in glob.glob(os.path.join(root_path, 'tsvs/**/*.tsv'), recursive=True): + if any(word in os.path.basename(tsv) for word in ['test', 'train', 'dev']): + tsvs.append(tsv) + + for tsv in tqdm.tqdm(tsvs, desc=f'processing:'): + raw_df = pd.read_csv(tsv, sep='\t', on_bad_lines='skip') + IS_TRAIN_VAL_OR_TEST, LANGUAGE = tsv.split('.')[-2], tsv.split('.')[-3] + + data = {} + for row in raw_df.iloc(): + + data.setdefault('paths', []).append(os.path.join(root_path, COMMON_VOICE_VERSION, LANGUAGE.split('_')[0], "clips", row['path'])) + data.setdefault('text', []).append(f"{row['translation']} translated to {get_language_from_key[LANGUAGE.split('_')[0]]}") + data.setdefault('original_data', []).append( + { + "sentence":row['sentence'], + "translation":row['translation'], + "client_id":row['client_id'], + } + ) + + df = pd.DataFrame(data) + + dest_path = os.path.join(root_path.replace('raw_datasets', 'processed_datasets'), LANGUAGE, IS_TRAIN_VAL_OR_TEST, '') + os.makedirs(dest_path, exist_ok=True) + + split_all_audio_files(df, dest_path) + tardir(dest_path, dest_path, chunk, delete_file=True) + + # upload to s3 and delete local + s3.put(dest_path, os.path.join(s3_dest, LANGUAGE, IS_TRAIN_VAL_OR_TEST)+'/', recursive=True) + shutil.rmtree(dest_path) diff --git a/current_dataset/preprocess_EMNS.py b/current_dataset/preprocess_EMNS.py new file mode 100644 index 0000000..e9348f4 --- /dev/null +++ b/current_dataset/preprocess_EMNS.py @@ -0,0 +1,87 @@ +import glob +import tqdm +import os +import glob +import pandas as pd +import sys +import tarfile +import json +import shutil +import fsspec + +from sklearn.model_selection import train_test_split +from concurrent.futures import ThreadPoolExecutor, as_completed +from multiprocessing import Pool + +sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) +from utils.audio_utils import audio_to_flac +from utils.make_tar_utils import tardir + +def convert_and_json_dump(file:str, dest:str, df, overwrite:bool=True, verbose=False): + if os.path.isfile(dest) and overwrite==False: + if verbose==True: + print(f'{dest} already exists, skiping') + return + audio_to_flac(file, dest) + with open(dest.replace('.flac', '.json'), 'w') as f: + json.dump({'filename': os.path.join(*dest.split('/')[5:]), 'text':[df['text']], 'original_data':df['original_data']}, f) + + +def split_all_audio_files(df, dest_root_path, max_workers=96): + if not os.path.exists(dest_root_path): + raise FileNotFoundError(f'Please Check {dest_root_path} exists') + + l = len(df) + with tqdm.tqdm(total=l, desc=f'Processing {dest_root_path}') as pbar: + with ThreadPoolExecutor(max_workers=max_workers) as executor: + threads = [executor.submit(convert_and_json_dump, row[0], os.path.join(dest_root_path, f'{i}.flac'), row) for i, row in enumerate(df.iloc())] + for _ in as_completed(threads): + pbar.update(1) + +def create_df(root_path:str, dataset_name:str=None): + df = pd.read_csv(os.path.join(root_path, 'metadata.csv'), sep='|') + + df_data = [] + for row in df.iloc: + path = os.path.join(root_path, row['audio_recording'].replace('wavs/', 'cleaned_webm/')) + text = row['description'].format(user_id=f"A {row['gender']} in their {row['age']}", transcription=row['utterance'], emotion=row['emotion']) + f" Emotion intensity: {row['level']}." + df_data.append({'path':path, 'text':text, 'original_data':{'age': row['age'], 'gender':row['gender'], 'emotion':row['emotion']}, 'transcript':row['utterance'], "level":row['level']}) + + return pd.DataFrame(df_data) + + +if __name__ == '__main__': + import multiprocessing + + max_workers = multiprocessing.cpu_count() + print("Num workers: ", max_workers) + chunk = 512 + + root_path = '/admin/home-knoriy/DELETEME/EMNS/' + dataset_name = 'EMNS' + + s3 = fsspec.filesystem('s3') + s3_dest = f'laion-west-audio/webdataset_tar/{dataset_name}/' + + # load metadata and configure audio paths + df = create_df(root_path) + + # create train, test, valid splits + train, test = train_test_split(df, test_size=0.2) + valid, test = train_test_split(test, test_size=0.2) + train_test_val = {'valid/':valid, 'train/':train, 'test/':test} + + + + for key in tqdm.tqdm(train_test_val, desc=f'processing:'): + df = train_test_val[key] + + dest_path = os.path.join(root_path.replace(dataset_name, f'{dataset_name}_processed'), key) + os.makedirs(dest_path, exist_ok=True) + + split_all_audio_files(df, dest_path) + tardir(dest_path, dest_path, chunk, delete_file=True) + + # upload to s3 and delete local + # s3.put(dest_path, os.path.join(s3_dest, key), recursive=True) + # shutil.rmtree(dest_path) \ No newline at end of file diff --git a/current_dataset/preprocess_EmoV_DB.py b/current_dataset/preprocess_EmoV_DB.py new file mode 100644 index 0000000..3d2aab5 --- /dev/null +++ b/current_dataset/preprocess_EmoV_DB.py @@ -0,0 +1,111 @@ +import os +from sre_parse import Verbose +import sys +import json +import tqdm +import pandas as pd +import pathlib +import fsspec +import shutil + + +from multiprocessing import Pool +from itertools import repeat +from sklearn.model_selection import train_test_split + +sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) +from utils.audio_utils import audio_to_flac +from utils.make_tar_utils import tardir + + +def convert_and_json_dump(df:pd.DataFrame, overwrite:bool=False, verbose:bool=False): + dest = df['dest'] + file = df['path'] + + os.makedirs(pathlib.Path(dest).parent, exist_ok=True) + + if os.path.isfile(dest) and overwrite==False: + if verbose==True: + print(f'{dest} already exists, skiping') + return + audio_to_flac(file, dest) + with open(dest.replace('.flac', '.json'), 'w') as f: + json.dump({'filename': os.path.join(*dest.split('/')[3:]), 'text':[df['text']], 'original_data':df['original_data']}, f) + return dest.replace('.flac', '.json') + +def extract_tars(dir:pathlib.Path, dest:pathlib.Path): + glob = dir.glob("**/*.tar.gz") + + for path in glob: + path = pathlib.Path(path) + tmp_dest = dest.joinpath(*(path.stem.split('_'))).with_suffix('') + tmp_dest.mkdir(parents=True, exist_ok=True) + cmd = f'tar -xf {path} -C {tmp_dest}' + os.system(cmd) + + +def run_tasks(extract:bool=False, overwrite:bool=False, verbose:bool=False, chunksize:int=1): + + dataset_name = 'EmoV_DB' + chunk = 512 + + s3 = fsspec.filesystem('s3') + s3_dest = pathlib.Path(f's-laion-audio/webdataset_tar/{dataset_name}/') + + root_data_dir = pathlib.Path('/home/knoriy/fsx/raw_datasets/EmoV_db/') + extracted_data_dir = pathlib.Path('/home/knoriy/fsx/raw_datasets/EmoV_db/raw/') + if extract: + extract_tars(root_data_dir, extracted_data_dir) + + raw_df = pd.read_csv(root_data_dir.joinpath('cmuarctic.csv'), sep="\t", header=None) + + glob = extracted_data_dir.glob('**/**/*.wav') + train, test = train_test_split(list(glob), test_size=0.3) + test, valid = train_test_split(list(test), test_size=0.3) + train_test_valid = {'train':train, 'test':test, 'valid':valid} + + EmoV_DB_gender = {'sam':'male', 'jenie':'female', 'josh':'male', 'bea':'females'} + + for key in train_test_valid: + dest_path = None + df_data = [] + for i, path in enumerate(train_test_valid[key]): + root_path = path.parents[0] + file_name = path.name + emotion = root_path.name + actor = root_path.parents[0].name + dest_path = str(path.parents[3].joinpath('EmoV_DB_tars', key)).replace('raw_datasets', 'processed_datasets') + + current_file = raw_df.loc[int(file_name.split('.')[0].split('_')[-1])-1] + + data = {} + + data['gender'] = EmoV_DB_gender[actor] + data['emotion'] = emotion + data['path'] = path + data['dest'] = str(pathlib.Path(dest_path).joinpath(f'{i}.flac')) + data['text'] = f'A {EmoV_DB_gender[actor]} saying "{current_file[1]}" in a {emotion} voice' + data['original_data'] = {'gender':EmoV_DB_gender[actor], 'emotion':emotion, 'raw_text':current_file[1]} + + + df_data.append(data) + + df = pd.DataFrame(df_data) + + print(f'starting pool for {key}') + with Pool() as pool: + for result in tqdm.tqdm(pool.starmap(convert_and_json_dump, zip(df.iloc, repeat(overwrite), repeat(verbose)), chunksize=chunksize), total=len(df_data)): + pass + + tardir(dest_path, dest_path, chunk, delete_file=True) + + # upload to s3 and delete local + s3.put(dest_path, s3_dest.joinpath(key), recursive=True) + print('File Uploaded to: ', s3_dest.joinpath(key)) + shutil.rmtree(dest_path) + + # clean Extracted Files + shutil.rmtree(extracted_data_dir) + +if __name__ == '__main__': + run_tasks(extract=True, chunksize=10) diff --git a/current_dataset/process_GigaSpeech.py b/current_dataset/preprocess_GigaSpeech.py similarity index 57% rename from current_dataset/process_GigaSpeech.py rename to current_dataset/preprocess_GigaSpeech.py index 947b2a5..1a7a94e 100644 --- a/current_dataset/process_GigaSpeech.py +++ b/current_dataset/preprocess_GigaSpeech.py @@ -21,10 +21,14 @@ from utils.audio_utils import audio_to_flac from utils.make_tar_utils import tardir -def convert_and_json_dump(file:str, dest:str, df): +def convert_and_json_dump(file:str, dest:str, df, overwrite:bool=False): + if os.path.isfile(dest) and overwrite==False: + print(f'{dest} already exists, skiping') + return + audio_to_flac(file, dest, segment_start=df['begin_time'], segment_end=df['end_time']) with open(dest.replace('.flac', '.json'), 'w') as f: - json.dump({'filename': os.path.join(*dest.split('/')[5:]), 'text':df['text'], 'tag':df['tag']}, f) + json.dump({'filename': os.path.join(*dest.split('/')[5:]), 'text':[df['text']], 'original_data':df['original_data']}, f) def split_all_audio_files(df, dest_root_path, max_workers=96): @@ -42,11 +46,11 @@ def split_all_audio_files(df, dest_root_path, max_workers=96): import multiprocessing max_workers = multiprocessing.cpu_count() - max_workers = 2 + print("Num workers: ", max_workers) chunk = 512 - root_path = '/mnt/knoriy/raw_datasets/gigaspeech/' - metadata_dir = "/mnt/knoriy/raw_datasets/gigaspeech/GigaSpeech.json" + root_path = '/home/knoriy/fsx/raw_datasets/GigaSpeech/gigaspeech/' + metadata_dir = "/home/knoriy/fsx/raw_datasets/GigaSpeech/gigaspeech/GigaSpeech.json" dataset_name = 'gigaspeech' @@ -54,30 +58,36 @@ def split_all_audio_files(df, dest_root_path, max_workers=96): s3_dest = f's-laion/knoriy/GigaSpeech/{dataset_name}_tars/' # load metadata and configure audio paths - raw_df = pd.read_json(metadata_dir)[:2] - - new_df_data = [] - for row in tqdm.tqdm(raw_df.iloc(), total=len(raw_df), desc='Generating dataframe: '): - for seg in row['audios']['segments']: - try: - catagory = row['audios']['category'] - except: - catagory = 'N/A' - - if seg['text_tn'] == '': - continue - - new_df_data.append( - {'path':f'{os.path.join(root_path, row["audios"]["path"])}', - 'begin_time': seg['begin_time'], - 'end_time': seg['end_time'], - 'text': seg['text_tn'], - 'tag':{ 'language':row['language'], - 'url':row['audios']['url'], - 'category':catagory, - 'speaker':row['audios']['speaker']} - }) - df = pd.DataFrame(new_df_data) + cache_df_path = os.path.join(root_path, 'temp_df.csv') + if os.path.isfile(cache_df_path): + df = pd.read_csv(cache_df_path, sep='\t') + else: + raw_df = pd.read_json(metadata_dir) + + new_df_data = [] + for row in tqdm.tqdm(raw_df.iloc(), total=len(raw_df), desc='Generating dataframe: '): + for seg in row['audios']['segments']: + try: + catagory = row['audios']['category'] + except: + catagory = 'N/A' + + if seg['text_tn'] == '' or seg['text_tn'] == '': + continue + + new_df_data.append( + {'path':f'{os.path.join(root_path, row["audios"]["path"])}', + 'begin_time': seg['begin_time'], + 'end_time': seg['end_time'], + 'text': seg['text_tn'], + 'original_data':{ 'language':row['language'], + 'url':row['audios']['url'], + 'category':catagory, + 'speaker':row['audios']['speaker']} + }) + df = pd.DataFrame(new_df_data) + df.to_csv(cache_df_path, sep='\t', index=False) + print(df.head()) # create train, test, valid splits diff --git a/current_dataset/preprocess_LJSpeech.py b/current_dataset/preprocess_LJSpeech.py index 84d3a57..9c345b9 100644 --- a/current_dataset/preprocess_LJSpeech.py +++ b/current_dataset/preprocess_LJSpeech.py @@ -21,10 +21,14 @@ from utils.audio_utils import audio_to_flac from utils.make_tar_utils import tardir -def convert_and_json_dump(file:str, dest:str, df): +def convert_and_json_dump(file:str, dest:str, df, overwrite:bool=False): + if os.path.isfile(dest) and overwrite==False: + print(f'{dest} already exists, skiping') + return + audio_to_flac(file, dest) with open(dest.replace('.flac', '.json'), 'w') as f: - json.dump({'filename': os.path.join(*dest.split('/')[4:]), 'text':df['norm_text'], 'tag':{'raw_text':df['raw_text']}}, f) + json.dump({'filename': os.path.join(*dest.split('/')[4:]), 'text':[df['norm_text']], 'original_data':{'raw_text':df['raw_text']}}, f) def split_all_audio_files(df, dest_root_path, max_workers=96): @@ -42,12 +46,12 @@ def split_all_audio_files(df, dest_root_path, max_workers=96): import multiprocessing max_workers = multiprocessing.cpu_count() + print("Num workers: ", max_workers) chunk = 512 generate_subset_tsv = True - root_path = '/home/knoriy/datasets/raw_datasets/ljspeech/' - tar_dir = "/home/knoriy/datasets/raw_datasets/ljspeech/LJSpeech-1.1.tar.bz2" - metadata_dir = "/home/knoriy/datasets/raw_datasets/ljspeech/metadata.csv" + root_path = '/home/knoriy/fsx/raw_datasets/ljspeech/' + metadata_dir = "/home/knoriy/fsx/raw_datasets/ljspeech/metadata.csv" dataset_name = 'ljspeech' diff --git a/current_dataset/preprocess_RAVDESS.py b/current_dataset/preprocess_RAVDESS.py new file mode 100644 index 0000000..dd56cb4 --- /dev/null +++ b/current_dataset/preprocess_RAVDESS.py @@ -0,0 +1,104 @@ +""" +Code for preprocess GigaSpeech Corpus: +https://github.com/SpeechColab/GigaSpeech +""" + +import glob +import tqdm +import os +import glob +import pandas as pd +import sys +import tarfile +import json +import shutil +import fsspec + +from sklearn.model_selection import train_test_split +from concurrent.futures import ThreadPoolExecutor, as_completed + +sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) +from utils.audio_utils import audio_to_flac +from utils.make_tar_utils import tardir + +def convert_and_json_dump(file:str, dest:str, df, overwrite:bool=False, verbose=False): + if os.path.isfile(dest) and overwrite==False: + if verbose==True: + print(f'{dest} already exists, skiping') + return + + audio_to_flac(file, dest) + with open(dest.replace('.flac', '.json'), 'w') as f: + json.dump({'filename': os.path.join(*dest.split('/')[5:]), 'text':[df['text']]}, f) + + +def split_all_audio_files(df, dest_root_path, max_workers=96): + if not os.path.exists(dest_root_path): + raise FileNotFoundError(f'Please Check {dest_root_path} exists') + + l = len(df) + with tqdm.tqdm(total=l, desc=f'Processing {dest_root_path}') as pbar: + with ThreadPoolExecutor(max_workers=max_workers) as executor: + threads = [executor.submit(convert_and_json_dump, row[0], os.path.join(dest_root_path, f'{i}.flac'), row) for i, row in enumerate(df.iloc())] + for _ in as_completed(threads): + pbar.update(1) + +def create_df(root_path:str, dataset_name:str=None): + wavs = glob.glob(os.path.join(root_path, '**/*.wav'), recursive=True) + codes = { 'modality':{'01':'full-AV', '02':'video-only', '03':'audio-only'}, + 'Vocal channel':{'01':'speech', '02':'song'}, + 'Emotion':{'01':'neutral', '02':'calm', '03':'happy', '04':'sad', '05':'angry', '06':'fearful', '07':'disgust', '08':'surprised'}, + 'Emotional intensity':{'01':'normal', '02':'strong'}, + 'Statement':{'01':"Kids are talking by the door", '02':"Dogs are sitting by the door"}, + 'Repetition':{'01':1, '02':2}, + } + df_data = [] + for wav in tqdm.tqdm(wavs): + file_name = os.path.basename(wav).split('.')[0] + wav_codes = file_name.split('-') + + text = [] + for i, code in enumerate(codes.values()): + text.append(code[wav_codes[i]]) + + song_or_speech = 'says' if text[1] == 'speech' else 'sings' + text = f'A person {song_or_speech}, "{text[4]}" in a {text[2]} and {text[3]} voice.' + df_data.append({ 'path':wav, 'text':text}) + + return pd.DataFrame(df_data) + + +if __name__ == '__main__': + import multiprocessing + + max_workers = multiprocessing.cpu_count() + # print("Num workers: ", max_workers) + chunk = 512 + + root_path = '/home/knoriy/fsx/raw_datasets/RAVDESS/ravdess/' + dataset_name = 'ravdess' + + s3 = fsspec.filesystem('s3') + s3_dest = f's-laion/knoriy/RAVDESS/{dataset_name}_tars/' + + # load metadata and configure audio paths + df = create_df(root_path) + + # create train, test, valid splits + train, test = train_test_split(df, test_size=0.2) + valid, test = train_test_split(test, test_size=0.2) + train_test_val = {'train/':train, 'test/':test, 'valid/':valid} + + + for key in tqdm.tqdm(train_test_val, desc=f'processing:'): + df = train_test_val[key] + + dest_path = os.path.join(root_path.replace('raw_datasets', 'processed_datasets'), key) + os.makedirs(dest_path, exist_ok=True) + + split_all_audio_files(df, dest_path) + tardir(dest_path, dest_path, chunk, delete_file=True) + + # upload to s3 and delete local + s3.put(dest_path, os.path.join(s3_dest, key), recursive=True) + shutil.rmtree(dest_path) \ No newline at end of file diff --git a/current_dataset/preprocess_audioset.py b/current_dataset/preprocess_audioset.py new file mode 100644 index 0000000..d569e47 --- /dev/null +++ b/current_dataset/preprocess_audioset.py @@ -0,0 +1,124 @@ +import os +import json +import sys +import tqdm +import json +import pathlib +import fsspec +import pandas as pd +from concurrent.futures import ThreadPoolExecutor, as_completed +from datasets import load_dataset + +sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) + +from utils.audio_utils import audio_to_flac +from utils.make_tar_utils import tardir + + +def get_json(file_path, class_metadata, ontology_dict, class_only=False): + audio_id = os.path.basename(file_path).replace('.wav', '') + class_labels = class_metadata[audio_id].replace('"', '').split(',') + + if class_only: + class_names = [ontology_dict[c][0] for c in class_labels] + else: + class_names = [f"{ontology_dict[c][0]} ({ontology_dict[c][1]})" for c in class_labels] + + if len(class_names) > 1: + text = "The sounds of " + ", ".join(class_names[:-1]) + " and " + class_names[-1] + elif len(class_names) == 1: + text = "The sound of " + class_names[0][0], class_names[0][1] + else: + raise ValueError("No class label found for audio id: {}".format(audio_id)) + + json_data = {'text': text, + 'original_data': {'class_labels': class_labels, + 'class_names': [ontology_dict[c][0] for c in class_labels], + 'class_descriptions': [ontology_dict[c][1] for c in class_labels], + } + } + return json_data + +def convert_and_json_dump(file:str, dest:str, df, class_metadata, ontology_dict, class_only=False, overwrite:bool=False): + if os.path.isfile(dest) and os.path.isfile(dest.replace('.flac', '.json')) and not overwrite: + print(f'{dest} already exists, skiping') + return + audio_to_flac(file, dest) + + + get_json(file, class_metadata, ontology_dict, class_only=False) + with open(dest.replace('.flac', '.json'), 'w') as f: + m_dump = get_json(file, class_metadata, ontology_dict, class_only=False) + m_dump['filename'] = os.path.join(*dest.split('/')[5:]) + json.dump(m_dump, f) + + +def split_all_audio_files(data, dest_root_path, ontology_dict, class_metadata, max_workers=96): + if not os.path.exists(dest_root_path): + raise FileNotFoundError(f'Please Check {dest_root_path} exists') + + l = len(data) + with tqdm.tqdm(total=l, desc=f'Processing {dest_root_path}') as pbar: + with ThreadPoolExecutor() as executor: + threads = [executor.submit(convert_and_json_dump, row["audio"]["path"], os.path.join(dest_root_path, f'{i}.flac'), row, class_metadata, ontology_dict, False, False) for i, row in enumerate(data)] + for _ in as_completed(threads): + pbar.update(1) + + +def main(): + import multiprocessing + max_workers = multiprocessing.cpu_count() + + ############### + # Get metadata + ############### + + #load ontology + # !wget -O /tmp/ontology.json https://raw.githubusercontent.com/audioset/ontology/master/ontology.json + + with open('/tmp/ontology.json') as f: + ontology = json.load(f) + ontology_dict = {i['id']: (i['name'], i['description']) for i in ontology} + + #get and load CSV + # !wget -O /tmp/eval_segments.csv http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/eval_segments.csv + + with open('/tmp/eval_segments.csv') as f: + lines = f.readlines() + lines = lines[3:] + header_list = ['YTID', 'start_seconds', 'end_seconds', 'positive_labels'] + class_metadata = [l.strip().split(', ') for l in lines] + class_metadata = pd.DataFrame(class_metadata, columns=header_list) + class_metadata = dict(zip(class_metadata.YTID, class_metadata.positive_labels)) + + langs = ['ab', 'ar', 'en', 'fa', 'fr', 'es', 'sl', 'kab', 'cy', 'ca', 'de', 'tt', 'ta', 'ru', 'nl', 'it', 'eu', 'tr', 'zh-TW', 'br', 'pt', 'eo', 'zh-CN', 'id', 'ia', 'lv', 'ja', 'rw', 'sv-SE', 'cnh', 'et', 'ky', 'ro', 'hsb', 'el', 'cs', 'pl', 'rm-sursilv', 'rm-vallader', 'mn', 'zh-HK', 'cv', 'uk', 'mt', 'as', 'ka', 'fy-NL', 'dv', 'pa-IN', 'vi', 'or', 'ga-IE', 'fi', 'hu', 'th', 'lt', 'lg', 'hi', 'bas', 'sk', 'kmr', 'bg', 'kk', 'ba', 'gl', 'ug', 'hy-AM', 'be', 'ur', 'gn', 'sr', 'uz', 'mr', 'da', 'myv', 'nn-NO', 'ha', 'ckb', 'ml', 'mdf', 'sw', 'sat', 'tig', 'ig', 'nan-tw', 'mhr', 'bn', 'tok', 'yue', 'sah', 'mk', 'sc', 'skr', 'ti', 'mrj', 'tw', 'vot', 'az', 'ast', 'ne-NP'] + dataset_name = "common_voice_11_0" + s3 = fsspec.filesystem('s3') + + with tqdm.tqdm(total=len(langs)) as pbar: + for lang in langs: + pbar.set_description(f'Prcessing {lang}') + for split in ["train", "test", "validation"]: + wikipedia_dataset = load_dataset(f"mozilla-foundation/{dataset_name}", lang, split=split) + + if split == "validation": split = "valid" + root_dest_path = pathlib.Path(f"/fsx/knoriy/processed_datasets/{dataset_name}/{lang}/{split}/") + root_dest_path.mkdir(parents=True, exist_ok=True) + + split_all_audio_files(wikipedia_dataset, root_dest_path, ontology_dict, class_metadata, max_workers) + tardir(str(root_dest_path), str(root_dest_path), 512, delete_file=False) + + # Upload only tar files to s3 + tar_files = (root_dest_path.glob('*.tar')) + for tar in tar_files: + # upload to s3 and delete local + pbar.set_description(f'Prcessing {lang}: uploading {str(tar)} to s3') + s3_dest = f's-laion-audio/webdataset_tar/{dataset_name}/{lang}/{split}/{tar.name}' + s3.put(str(tar), s3_dest) + # shutil.rmtree(root_dest_path) + # break + pbar.update(1) + # break + +if __name__ == '__main__': + main() diff --git a/current_dataset/preprocess_common_voice.py b/current_dataset/preprocess_common_voice.py new file mode 100644 index 0000000..0f3f75d --- /dev/null +++ b/current_dataset/preprocess_common_voice.py @@ -0,0 +1,76 @@ +import os +import sys +import tqdm +import json +import pathlib +import fsspec +import shutil +from concurrent.futures import ThreadPoolExecutor, as_completed +import soundfile as sf +from datasets import load_dataset + +sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) + + +from utils.audio_utils import audio_to_flac +from utils.make_tar_utils import tardir + + +def convert_and_json_dump(file:str, dest:str, df, overwrite:bool=False): + if os.path.isfile(dest) and os.path.isfile(dest.replace('.flac', '.json')) and not overwrite: + print(f'{dest} already exists, skiping') + return + + sf.write(dest, df['audio']['array'], df['audio']['sampling_rate']) + with open(dest.replace('.flac', '.json'), 'w') as f: + json.dump({'filename': os.path.join(*dest.split('/')[5:]), 'text':[df['sentence']], 'original_data':{'up_votes':df['up_votes'], 'down_votes':df['down_votes'], 'age':df['age'], 'gender':df['gender'], 'accent':df['accent'], 'language':df['locale']}}, f) + + +def split_all_audio_files(data, dest_root_path, max_workers=96): + if not os.path.exists(dest_root_path): + raise FileNotFoundError(f'Please Check {dest_root_path} exists') + + l = len(data) + with tqdm.tqdm(total=l, desc=f'Processing {dest_root_path}') as pbar: + with ThreadPoolExecutor() as executor: + threads = [executor.submit(convert_and_json_dump, row["audio"]["path"], os.path.join(dest_root_path, f'{i}.flac'), row, False) for i, row in enumerate(data)] + for _ in as_completed(threads): + pbar.update(1) + + + +def main(): + import multiprocessing + max_workers = multiprocessing.cpu_count() + + langs = ['ab', 'ar', 'en', 'fa', 'fr', 'es', 'sl', 'kab', 'cy', 'ca', 'de', 'tt', 'ta', 'ru', 'nl', 'it', 'eu', 'tr', 'zh-TW', 'br', 'pt', 'eo', 'zh-CN', 'id', 'ia', 'lv', 'ja', 'rw', 'sv-SE', 'cnh', 'et', 'ky', 'ro', 'hsb', 'el', 'cs', 'pl', 'rm-sursilv', 'rm-vallader', 'mn', 'zh-HK', 'cv', 'uk', 'mt', 'as', 'ka', 'fy-NL', 'dv', 'pa-IN', 'vi', 'or', 'ga-IE', 'fi', 'hu', 'th', 'lt', 'lg', 'hi', 'bas', 'sk', 'kmr', 'bg', 'kk', 'ba', 'gl', 'ug', 'hy-AM', 'be', 'ur', 'gn', 'sr', 'uz', 'mr', 'da', 'myv', 'nn-NO', 'ha', 'ckb', 'ml', 'mdf', 'sw', 'sat', 'tig', 'ig', 'nan-tw', 'mhr', 'bn', 'tok', 'yue', 'sah', 'mk', 'sc', 'skr', 'ti', 'mrj', 'tw', 'vot', 'az', 'ast', 'ne-NP'] + dataset_name = "common_voice_11_0" + s3 = fsspec.filesystem('s3') + + with tqdm.tqdm(total=len(langs)) as pbar: + for lang in langs: + pbar.set_description(f'Prcessing {lang}') + for split in ["train", "test", "validation"]: + wikipedia_dataset = load_dataset(f"mozilla-foundation/{dataset_name}", lang, split=split) + + if split == "validation": split = "valid" + root_dest_path = pathlib.Path(f"/fsx/knoriy/processed_datasets/{dataset_name}/{lang}/{split}/") + root_dest_path.mkdir(parents=True, exist_ok=True) + + split_all_audio_files(wikipedia_dataset, root_dest_path, max_workers) + tardir(str(root_dest_path), str(root_dest_path), 512, delete_file=False) + + # Upload only tar files to s3 + tar_files = (root_dest_path.glob('*.tar')) + for tar in tar_files: + # upload to s3 and delete local + pbar.set_description(f'Prcessing {lang}: uploading {str(tar)} to s3') + s3_dest = f's-laion-audio/webdataset_tar/{dataset_name}/{lang}/{split}/{tar.name}' + s3.put(str(tar), s3_dest) + # shutil.rmtree(root_dest_path) + # break + pbar.update(1) + # break + +if __name__ == '__main__': + main() diff --git a/current_dataset/preprocess_mswc.py b/current_dataset/preprocess_mswc.py index 70cf3f0..5d36731 100644 --- a/current_dataset/preprocess_mswc.py +++ b/current_dataset/preprocess_mswc.py @@ -14,92 +14,104 @@ import shutil import fsspec -from concurrent.futures import ThreadPoolExecutor, as_completed +import multiprocessing +from multiprocessing import Pool +from itertools import repeat + sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) from utils.audio_utils import audio_to_flac from utils.make_tar_utils import tardir -def convert_and_json_dump(file:str, dest:str, df): +def convert_and_json_dump(df:pd.DataFrame, overwrite:bool=False, verbose:bool=False): + dest = df['dest_path'] + file = df['src_path'] + + if os.path.isfile(dest) and overwrite==False: + print(f'{dest} already exists, skiping') + return audio_to_flac(file, dest) with open(dest.replace('.flac', '.json'), 'w') as f: - json.dump({'filename': os.path.join(*dest.split('/')[4:]), 'text':df['WORD'], 'tag':{'gender':df['GENDER'], 'language':dest.split('/')[-2]}}, f) - - -def split_all_audio_files(df, src_root_path, dest_root_path, max_workers=96): - if not os.path.exists(dest_root_path): - raise FileNotFoundError(f'Please Check {dest_root_path} exists') + json.dump({'filename': os.path.join(*dest.split('/')[4:]), 'text':[df['WORD']], 'original_data':{'gender':df['GENDER'], 'language':dest.split('/')[-3]}}, f) - l = len(df) - with tqdm.tqdm(total=l, desc=f'Processing {dest_root_path}') as pbar: - with ThreadPoolExecutor(max_workers=max_workers) as executor: - threads = [executor.submit(convert_and_json_dump, os.path.join(src_root_path, row['LINK']), os.path.join(dest_root_path, f'{i}.flac'), row) for i, row in enumerate(df.iloc())] - for _ in as_completed(threads): - pbar.update(1) +def split_all_audio_files(df, overwrite:bool=False, verbose:bool=False, chunksize:int=1): + print(f'starting pool') + with Pool() as pool: + for result in tqdm.tqdm(pool.starmap(convert_and_json_dump, zip(df.iloc, repeat(overwrite), repeat(verbose)), chunksize=chunksize), total=len(df)): + pass if __name__ == '__main__': - import multiprocessing + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--job", help='Directory to the files to process, e.g. "/home/knoriy/fsx/raw_datasets/mswc/audio/fr.tar.gz" ', required=True) + args = parser.parse_args() max_workers = multiprocessing.cpu_count() chunk = 512 generate_subset_tsv = True - root_path = '/home/knoriy/datasets/raw_datasets/mswc/' - tar_dir = "/home/knoriy/datasets/raw_datasets/mswc/mswc.tar.gz" + root_path = '/fsx/knoriy/raw_datasets/mswc/' + tar_dir = "/fsx/knoriy/raw_datasets/mswc/mswc.tar.gz" dataset_name = 'mswc' s3 = fsspec.filesystem('s3') - s3_dest = f's-laion/multilingual_spoken_words/{dataset_name}_tars/' + s3_dest = f's-laion-audio/webdataset_tar/{dataset_name}/' language_tars_dirs = sorted(glob.glob(os.path.join(root_path, "audio/**.tar.gz"))) if not language_tars_dirs: raise FileNotFoundError(f"Please check that the file have been extracted: {root_path}") - for dir in tqdm.tqdm(language_tars_dirs, desc=f'processing: '): - audio_path = dir - with tarfile.open(audio_path, mode='r:gz') as mswc_audio: - audio_path = os.path.split(audio_path)[0] - mswc_audio.extractall(audio_path) - - splits_path = dir.replace('audio', 'splits') - with tarfile.open(splits_path, mode='r:gz') as mswc_split: - splits_path = splits_path.replace('.tar.gz', '/') - mswc_split.extractall(splits_path) - - tmp = glob.glob(os.path.join(splits_path, '**.csv'), recursive=True) - csv_paths = [] - for csv_path in tmp: - if '_splits.csv' not in csv_path: - csv_paths.append(csv_path) - - for csv_path in csv_paths: - if 'train' in csv_path: - train_test_dev = 'train/' - elif 'test' in csv_path: - train_test_dev = 'test/' - elif 'dev' in csv_path: - train_test_dev = 'valid/' - else: - train_test_dev = 'other/' - df = pd.read_csv(csv_path) - - # Convert to .flac - dest_path = splits_path.replace('.tar.gz', '/').replace('/raw_datasets/', '/processed_datasets/').replace('splits/', '') - dest_path = os.path.join(dest_path, train_test_dev) - - src_path = os.path.join(splits_path.replace('.tar.gz', '/').replace('splits/', 'audio/'), 'clips') - os.makedirs(dest_path, exist_ok=True) - os.makedirs(src_path, exist_ok=True) - - split_all_audio_files(df, src_path, dest_path, max_workers) - - tardir(dest_path, dest_path, chunk, delete_file=True) - - # upload to s3 and delete local - s3.put(dest_path, os.path.join(s3_dest, os.path.basename(dir.split('.')[0]), train_test_dev), recursive=True) - print('File Uploaded to: ', os.path.join(s3_dest, os.path.basename(dir.split('.')[0]), train_test_dev)) - shutil.rmtree(dest_path) - - # clean extracted files - shutil.rmtree(splits_path.replace('splits/', 'audio/')) - shutil.rmtree(splits_path) \ No newline at end of file + dir = args.job + + with tarfile.open(dir, mode='r:gz') as mswc_audio: + audio_path = os.path.split(dir)[0] + mswc_audio.extractall(audio_path) + + splits_path = dir.replace('audio', 'splits') + with tarfile.open(splits_path, mode='r:gz') as mswc_split: + splits_path = splits_path.replace('.tar.gz', '/') + mswc_split.extractall(splits_path) + + tmp = glob.glob(os.path.join(splits_path, '**.csv'), recursive=True) + csv_paths = [] + for csv_path in tmp: + if '_splits.csv' not in csv_path: + csv_paths.append(csv_path) + + for csv_path in csv_paths: + if 'train' in csv_path: + train_test_dev = 'train/' + elif 'test' in csv_path: + train_test_dev = 'test/' + elif 'dev' in csv_path: + train_test_dev = 'valid/' + else: + train_test_dev = 'other/' + # Convert to .flac + dest_path = splits_path.replace('.tar.gz', '/').replace('/raw_datasets/', '/processed_datasets/').replace('splits/', '') + dest_path = os.path.join(dest_path, train_test_dev) + + src_path = os.path.join(splits_path.replace('.tar.gz', '/').replace('splits/', 'audio/'), 'clips') + os.makedirs(dest_path, exist_ok=True) + os.makedirs(src_path, exist_ok=True) + + df = pd.read_csv(csv_path) + df['dest_path'] = [os.path.join(dest_path, f'{i}.flac') for i, _ in enumerate(df.iloc())] + df['src_path'] = [os.path.join(src_path, row['LINK']) for i, row in enumerate(df.iloc())] + + print("nan found", len(df[df.isna().any(axis=1)])) + df = df.dropna() + print("nan after drop:", len(df[df.isna().any(axis=1)])) + + split_all_audio_files(df, overwrite=True, chunksize=max_workers) + + tardir(dest_path, dest_path, chunk, delete_file=True) + + # upload to s3 and delete local + s3.put(dest_path, os.path.join(s3_dest, os.path.basename(dir.split('.')[0]), train_test_dev), recursive=True) + print('File Uploaded to: s3://', os.path.join(s3_dest, os.path.basename(dir.split('.')[0]), train_test_dev)) + shutil.rmtree(dest_path) + + # # clean extracted files + # shutil.rmtree(splits_path.replace('splits/', 'audio/')) + # shutil.rmtree(splits_path) \ No newline at end of file diff --git a/current_dataset/start_slurm_jobs.py b/current_dataset/start_slurm_jobs.py new file mode 100644 index 0000000..a9a8b0d --- /dev/null +++ b/current_dataset/start_slurm_jobs.py @@ -0,0 +1,7 @@ +import os +import glob + +paths = glob.glob('/fsx/knoriy/raw_datasets/mswc/audio/*.tar.gz') + +for path in paths: + os.system( f"srun --comment clap --output=outs/%j.out --exclusive /fsx/home-knoriy/miniconda3/envs/audio_dataset/bin/python /fsx/knoriy/code/audio-dataset/current_dataset/preprocess_mswc.py --job {path} &") \ No newline at end of file diff --git a/data_preprocess/preprocess_audioset.py b/data_preprocess/preprocess_audioset.py index fd5ae05..8308192 100644 --- a/data_preprocess/preprocess_audioset.py +++ b/data_preprocess/preprocess_audioset.py @@ -4,6 +4,7 @@ import glob from tqdm import tqdm import sys +import json sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) from utils.audio_utils import audio_to_flac @@ -56,24 +57,24 @@ def process_single_audio(file_path, json_data, output_dir): # Load metadata unbalanced_csv_path = os.path.join(args.metadata_dir, f'{args.metadata_name}.csv') - with open(unbalanced_csv_path, 'r') as f: + with open(unbalanced_csv_path) as f: lines = f.readlines() + lines = lines[3:] + header_list = ['YTID', 'start_seconds', 'end_seconds', 'positive_labels'] + class_metadata = [l.strip().split(', ') for l in lines] + class_metadata = pd.DataFrame(class_metadata, columns=header_list) + class_metadata = dict(zip(class_metadata.YTID, class_metadata.positive_labels)) - lines = lines[3:] - header_list = ['YTID', 'start_seconds', 'end_seconds', 'positive_labels'] - class_metadata = [l.strip().split(', ') for l in lines] - class_metadata = pd.DataFrame(class_metadata, columns=header_list) + with open(os.path.join(args.metadata_dir,'ontology.json')) as f: + ontology = json.load(f) + ontology_dict = {i['id']: (i['name'], i['description']) for i in ontology} - class_to_name_map = pd.read_csv(os.path.join(args.metadata_dir, 'class_labels_indices.csv')) - - class_metadata = dict(zip(class_metadata.YTID, class_metadata.positive_labels)) - class_to_name_map = dict(zip(class_to_name_map.mid, class_to_name_map.display_name)) wav_all = glob.glob(f'{args.wav_dir}/*.wav') futures = [] for file in tqdm(wav_all): # process_single_audio(file, class_metadata, class_to_name_map, args.output_dir) - json_data = get_json(file, class_metadata, class_to_name_map) + json_data = get_json(file, class_metadata, ontology_dict) futures.append( executor.submit(partial(process_single_audio, file, json_data, args.output_dir))) diff --git a/data_preprocess/preprocess_audioset.sh b/data_preprocess/preprocess_audioset.sh index 83965e9..38cdaf4 100644 --- a/data_preprocess/preprocess_audioset.sh +++ b/data_preprocess/preprocess_audioset.sh @@ -1,79 +1,79 @@ #!/bin/bash -# preliminary: create /mnt/audio_clip/audioset, clone code from audio-dataset +# preliminary: create /tmp/audioset, clone code from audio-dataset -cd /mnt/audio_clip/audioset +cd /tmp/audioset mkdir metadata cd metadata wget http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/eval_segments.csv wget http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/balanced_train_segments.csv wget http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/unbalanced_train_segments.csv -wget https://raw.githubusercontent.com/qiuqiangkong/audioset_tagging_cnn/master/metadata/class_labels_indices.csv +wget https://raw.githubusercontent.com/audioset/ontology/master/ontology.json -cd ~/audio-dataset +cd /fsx/knoriy/code/audio-dataset for i in $(seq -w 00 40) do - aws s3 --region us-east-1 cp s3://laion-audio/raw_dataset/audioset/unbalanced_train_segments/unbalanced_train_segments_part"${i}"_partial.zip /mnt/audio_clip/audioset/zip/ - aws s3 --region us-east-1 cp s3://laion-audio/raw_dataset/audioset/unbalanced_train_segments/unbalanced_train_segments_part"${i}"_partial.z01 /mnt/audio_clip/audioset/zip/ - aws s3 --region us-east-1 cp s3://laion-audio/raw_dataset/audioset/unbalanced_train_segments/unbalanced_train_segments_part"${i}"_partial.z02 /mnt/audio_clip/audioset/zip/ + aws s3 cp s3://s-laion-audio/raw_dataset/audioset/unbalanced_train_segments/unbalanced_train_segments_part"${i}"_partial.zip /tmp/audioset/zip/ + aws s3 cp s3://s-laion-audio/raw_dataset/audioset/unbalanced_train_segments/unbalanced_train_segments_part"${i}"_partial.z01 /tmp/audioset/zip/ + aws s3 cp s3://s-laion-audio/raw_dataset/audioset/unbalanced_train_segments/unbalanced_train_segments_part"${i}"_partial.z02 /tmp/audioset/zip/ 7z e /mnt/audio_clip/audioset/zip/unbalanced_train_segments_part"${i}"_partial.zip -o/mnt/audio_clip/audioset/audios python data_preprocess/preprocess_audioset.py \ - --metadata_dir /mnt/audio_clip/audioset/metadata \ + --metadata_dir /tmp/audioset/metadata \ --metadata_name unbalanced_train_segments \ - --wav_dir /mnt/audio_clip/audioset/audios \ - --output_dir /mnt/audio_clip/audioset/processed_data + --wav_dir /tmp/audioset/audios \ + --output_dir /tmp/audioset/processed_data - rm /mnt/audio_clip/audioset/zip/unbalanced_train_segments_part"${i}"_partial* - rm -rf /mnt/audio_clip/audioset/audios + # rm /tmp/audioset/zip/unbalanced_train_segments_part"${i}"_partial* + # rm -rf /tmp/audioset/audios done -aws s3 --region us-east-1 cp s3://laion-audio/raw_dataset/audioset/balanced_train_segments.zip /mnt/audio_clip/audioset/ -aws s3 --region us-east-1 cp s3://laion-audio/raw_dataset/audioset/eval_segments.zip /mnt/audio_clip/audioset/ +aws s3 cp s3://s-laion-audio/raw_dataset/audioset/balanced_train_segments.zip /tmp/audioset/ +aws s3 cp s3://s-laion-audio/raw_dataset/audioset/eval_segments.zip /tmp/audioset/ -cd /mnt/audio_clip/audioset/ +cd /tmp/audioset/ unzip balanced_train_segments.zip unzip eval_segments.zip -cd ~/audio-dataset +cd /fsx/knoriy/code/audio-dataset python data_preprocess/preprocess_audioset.py \ ---metadata_dir /mnt/audio_clip/audioset/metadata \ +--metadata_dir /tmp/audioset/metadata \ --metadata_name balanced_train_segments \ ---wav_dir /mnt/audio_clip/audioset/balanced_train_segments \ ---output_dir /mnt/audio_clip/audioset/processed_data_balanced_train_segments +--wav_dir /tmp/audioset/balanced_train_segments \ +--output_dir /tmp/audioset/processed_data_balanced_train_segments python data_preprocess/preprocess_audioset.py \ ---metadata_dir /mnt/audio_clip/audioset/metadata \ +--metadata_dir /tmp/audioset/metadata \ --metadata_name eval_segments \ ---wav_dir /mnt/audio_clip/audioset/eval_segments \ ---output_dir /mnt/audio_clip/audioset/processed_data_eval_segments +--wav_dir /tmp/audioset/eval_segments \ +--output_dir /tmp/audioset/processed_data_eval_segments -python data_check/remove_bad_flac.py --dir /mnt/audio_clip/audioset/processed_data_eval_segments -python data_check/remove_bad_flac.py --dir /mnt/audio_clip/audioset/processed_data_balanced_train_segments -python data_check/remove_bad_flac.py --dir /mnt/audio_clip/audioset/processed_data +python data_check/remove_bad_flac.py --dir /tmp/audioset/processed_data_eval_segments +python data_check/remove_bad_flac.py --dir /tmp/audioset/processed_data_balanced_train_segments +python data_check/remove_bad_flac.py --dir /tmp/audioset/processed_data python ./utils/make_tar.py \ ---input /mnt/audio_clip/audioset/processed_data \ ---output /mnt/audio_clip/audioset/webdataset_tar/unbalanced_train/ \ +--input /tmp/audioset/processed_data \ +--output /tmp/audioset/webdataset_tar/unbalanced_train/ \ --dataclass none \ --delete_file python ./utils/make_tar.py \ ---input /mnt/audio_clip/audioset/processed_data_balanced_train_segments \ ---output /mnt/audio_clip/audioset/webdataset_tar/balanced_train/ \ +--input /tmp/audioset/processed_data_balanced_train_segments \ +--output /tmp/audioset/webdataset_tar/balanced_train/ \ --dataclass none \ --delete_file python ./utils/make_tar.py \ ---input /mnt/audio_clip/audioset/processed_data_eval_segments \ ---output /mnt/audio_clip/audioset/webdataset_tar/eval/ \ +--input /tmp/audioset/processed_data_eval_segments \ +--output /tmp/audioset/webdataset_tar/eval/ \ --dataclass none \ --delete_file -aws s3 --region us-east-1 cp /mnt/audio_clip/audioset/webdataset_tar/balanced_train s3://laion-audio/webdataset_tar/audioset/balanced_train --recursive -aws s3 --region us-east-1 cp /mnt/audio_clip/audioset/webdataset_tar/eval s3://laion-audio/webdataset_tar/audioset/eval --recursive -aws s3 --region us-east-1 cp /mnt/audio_clip/audioset/webdataset_tar/unbalanced_train s3://laion-audio/webdataset_tar/audioset/unbalanced_train --recursive \ No newline at end of file +aws s3 cp /tmp/audioset/webdataset_tar/balanced_train s3://s-laion-audio/webdataset_tar/audioset_description/balanced_train --recursive +aws s3 cp /tmp/audioset/webdataset_tar/eval s3://s-laion-audio/webdataset_tar/audioset_description/eval --recursive +aws s3 cp /tmp/audioset/webdataset_tar/unbalanced_train s3://s-laion-audio/webdataset_tar/audioset_description/unbalanced_train --recursive \ No newline at end of file diff --git a/download_script/download_and_preprocess_common_voice.sh b/download_script/download_and_preprocess_common_voice.sh new file mode 100644 index 0000000..9b23336 --- /dev/null +++ b/download_script/download_and_preprocess_common_voice.sh @@ -0,0 +1,26 @@ +#!/bin/bash +#SBATCH --partition=cpu16 +#SBATCH --job-name=audio-dataset +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=8 +#SBATCH --comment clap +#SBATCH --output=%x_%j.out +#SBATCH --exclusive + +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/nccl/build/lib:/opt/aws-ofi-nccl-install/lib +export NCCL_PROTO=simple +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/aws-ofi-nccl/lib +export PATH=$PATH:/opt/amazon/efa/bin:/opt/amazon/openmpi/bin +export FI_EFA_FORK_SAFE=1 +export FI_LOG_LEVEL=1 +export FI_EFA_USE_DEVICE_RDMA=1 # use for p4dn +export NCCL_DEBUG=info +export OMPI_MCA_mtl_base_verbose=1 +export FI_EFA_ENABLE_SHM_TRANSFER=0 +export FI_PROVIDER=efa +export FI_EFA_TX_MIN_CREDITS=64 +export NCCL_TREE_THRESHOLD=0 + +echo Running job on $SLURM_JOB_NUM_NODES, + +srun --comment clap /fsx/home-knoriy/miniconda3/envs/audio_dataset/bin/python /fsx/knoriy/code/audio-dataset/current_dataset/preprocess_common_voice.py \ No newline at end of file diff --git a/data_preprocess/environment.yml b/environment.yml similarity index 77% rename from data_preprocess/environment.yml rename to environment.yml index eb59b44..e036fab 100644 --- a/data_preprocess/environment.yml +++ b/environment.yml @@ -1,10 +1,12 @@ name: audio_dataset channels: - pytorch + - conda-forge - defaults dependencies: - _libgcc_mutex=0.1=main - _openmp_mutex=5.1=1_gnu + - absl-py=1.1.0=pyhd8ed1ab_0 - aiobotocore=2.1.0=pyhd3eb1b0_0 - aiohttp=3.8.1=py39h7f8727e_1 - aioitertools=0.7.1=pyhd3eb1b0_0 @@ -14,28 +16,37 @@ dependencies: - attrs=21.4.0=pyhd3eb1b0_0 - backcall=0.2.0=pyhd3eb1b0_0 - blas=1.0=mkl + - blinker=1.4=py_1 - botocore=1.23.24=pyhd3eb1b0_0 - bottleneck=1.3.4=py39hce1f21e_0 - brotlipy=0.7.0=py39h27cfd23_1003 - bzip2=1.0.8=h7b6447c_0 - - ca-certificates=2022.4.26=h06a4308_0 - - certifi=2022.5.18.1=py39h06a4308_0 + - c-ares=1.18.1=h7f98852_0 + - ca-certificates=2022.6.15=ha878542_0 + - cachetools=5.0.0=pyhd8ed1ab_0 + - certifi=2022.6.15=py39hf3d152e_0 - cffi=1.15.0=py39hd667e15_1 - charset-normalizer=2.0.4=pyhd3eb1b0_0 + - click=8.1.3=py39hf3d152e_0 - cryptography=37.0.1=py39h9ce1e76_0 - cudatoolkit=11.3.1=h2bc3f7f_2 - debugpy=1.5.1=py39h295c915_0 - decorator=5.1.1=pyhd3eb1b0_0 - entrypoints=0.4=py39h06a4308_0 - executing=0.8.3=pyhd3eb1b0_0 - - ffmpeg=4.3=hf484d3e_0 + - ffmpeg=4.3.2=hca11adc_0 - freetype=2.11.0=h70c0345_0 - frozenlist=1.2.0=py39h7f8727e_0 - fsspec=2022.1.0=pyhd3eb1b0_0 + - future=0.18.2=py39hf3d152e_5 - giflib=5.2.1=h7b6447c_0 - gmp=6.2.1=h295c915_3 - gnutls=3.6.15=he1e5248_0 + - google-auth=2.9.0=pyh6c4a22f_0 + - google-auth-oauthlib=0.4.6=pyhd8ed1ab_0 + - grpcio=1.42.0=py39hce63b2e_0 - idna=3.3=pyhd3eb1b0_0 + - importlib-metadata=4.11.4=py39hf3d152e_0 - intel-openmp=2021.4.0=h06a4308_3561 - ipykernel=6.9.1=py39h06a4308_0 - ipython=8.3.0=py39h06a4308_0 @@ -53,6 +64,7 @@ dependencies: - libiconv=1.16=h7f8727e_2 - libidn2=2.3.2=h7f8727e_0 - libpng=1.6.37=hbc83047_0 + - libprotobuf=3.15.8=h780b84a_1 - libsodium=1.0.18=h7b6447c_0 - libstdcxx-ng=11.2.0=h1234567_1 - libtasn1=4.16.0=h27cfd23_0 @@ -62,6 +74,7 @@ dependencies: - libwebp=1.2.2=h55f646e_0 - libwebp-base=1.2.2=h7f8727e_0 - lz4-c=1.9.3=h295c915_1 + - markdown=3.3.7=pyhd8ed1ab_0 - matplotlib-inline=0.1.2=pyhd3eb1b0_2 - mkl=2021.4.0=h06a4308_640 - mkl-service=2.4.0=py39h7f8727e_0 @@ -74,8 +87,9 @@ dependencies: - numexpr=2.8.1=py39h6abb31d_0 - numpy=1.22.3=py39he7a7128_0 - numpy-base=1.22.3=py39hf524024_0 + - oauthlib=3.2.0=pyhd8ed1ab_0 - openh264=2.1.1=h4ff587b_0 - - openssl=1.1.1o=h7f8727e_0 + - openssl=1.1.1p=h5eee18b_0 - packaging=21.3=pyhd3eb1b0_0 - pandas=1.4.2=py39h295c915_0 - parso=0.8.3=pyhd3eb1b0_0 @@ -84,26 +98,39 @@ dependencies: - pillow=9.0.1=py39h22f2fdc_0 - pip=21.2.4=py39h06a4308_0 - prompt-toolkit=3.0.20=pyhd3eb1b0_0 + - protobuf=3.15.8=py39he80948d_0 - ptyprocess=0.7.0=pyhd3eb1b0_2 - pure_eval=0.2.2=pyhd3eb1b0_0 + - pyasn1=0.4.8=py_0 + - pyasn1-modules=0.2.7=py_0 - pycparser=2.21=pyhd3eb1b0_0 - pygments=2.11.2=pyhd3eb1b0_0 + - pyjwt=2.4.0=pyhd8ed1ab_0 - pyopenssl=22.0.0=pyhd3eb1b0_0 - pyparsing=3.0.4=pyhd3eb1b0_0 - pysocks=1.7.1=py39h06a4308_0 - python=3.9.12=h12debd9_1 - python-dateutil=2.8.2=pyhd3eb1b0_0 + - python_abi=3.9=2_cp39 - pytorch=1.11.0=py3.9_cuda11.3_cudnn8.2.0_0 + - pytorch-lightning=0.8.5=py_0 - pytorch-mutex=1.0=cuda - pytz=2022.1=py39h06a4308_0 + - pyu2f=0.1.5=pyhd8ed1ab_0 + - pyyaml=6.0=py39hb9d737c_4 - pyzmq=22.3.0=py39h295c915_2 - readline=8.1.2=h7f8727e_1 - requests=2.27.1=pyhd3eb1b0_0 + - requests-oauthlib=1.3.1=pyhd8ed1ab_0 + - rsa=4.8=pyhd8ed1ab_0 - s3fs=2022.1.0=pyhd3eb1b0_0 - setuptools=61.2.0=py39h06a4308_0 - six=1.16.0=pyhd3eb1b0_1 - sqlite=3.38.3=hc218d9a_0 - stack_data=0.2.0=pyhd3eb1b0_0 + - tensorboard=2.9.1=pyhd8ed1ab_0 + - tensorboard-data-server=0.6.0=py39hd97740a_2 + - tensorboard-plugin-wit=1.8.1=pyhd8ed1ab_0 - tk=8.6.12=h1ccaba5_0 - torchaudio=0.11.0=py39_cu113 - torchvision=0.12.0=py39_cu113 @@ -115,11 +142,15 @@ dependencies: - tzdata=2022a=hda174b7_0 - urllib3=1.26.9=py39h06a4308_0 - wcwidth=0.2.5=pyhd3eb1b0_0 + - werkzeug=2.1.2=pyhd8ed1ab_1 - wheel=0.37.1=pyhd3eb1b0_0 - wrapt=1.13.3=py39h7f8727e_2 + - x264=1!161.3030=h7f98852_1 - xz=5.2.5=h7f8727e_1 + - yaml=0.2.5=h7f98852_2 - yarl=1.6.3=py39h27cfd23_0 - zeromq=4.3.4=h2531618_0 + - zipp=3.8.0=pyhd8ed1ab_0 - zlib=1.2.12=h7f8727e_2 - zstd=1.5.2=ha4553b6_0 - pip: @@ -131,11 +162,10 @@ dependencies: - llvmlite==0.38.1 - numba==0.55.2 - pooch==1.6.0 - - pyyaml==6.0 - resampy==0.2.2 - scikit-learn==1.1.1 - scipy==1.8.1 - soundfile==0.10.3.post1 - threadpoolctl==3.1.0 - webdataset==0.2.5 -prefix: /home/knoriy/miniconda3/envs/audio_dataset +prefix: /home/knoriy/fsx/miniconda3/envs/audio_dataset diff --git a/utils/make_tar_utils.py b/utils/make_tar_utils.py index 91d7bc4..a55c553 100644 --- a/utils/make_tar_utils.py +++ b/utils/make_tar_utils.py @@ -36,15 +36,15 @@ def tardir( if n_split * n_entry_each != len(filelist): n_split += 1 size_dict = { - os.path.basename(tar_name) + str(i) + ".tar": n_entry_each + os.path.join(os.path.basename(tar_name), str(i) + ".tar"): n_entry_each for i in range(n_split) } if n_split * n_entry_each != len(filelist): - size_dict[os.path.basename(tar_name) + str(n_split - 1) + ".tar"] = ( + size_dict[os.path.join(os.path.basename(tar_name), str(n_split - 1) + ".tar")] = ( len(filelist) - (n_split - 1) * n_entry_each ) for i in tqdm(range(start_idx, n_split + start_idx), desc='Creating .tar file:'): - with tarfile.open(tar_name + str(i) + ".tar", "w") as tar_handle: + with tarfile.open(os.path.join(tar_name, str(i) + ".tar"), "w") as tar_handle: for j in range(count, len(filelist)): audio = filelist[j] basename = ".".join(audio.split(".")[:-1])