diff --git a/tests/test_nsf_hifigan.py b/tests/test_nsf_hifigan.py
index f1baaa2a..81df2444 100644
--- a/tests/test_nsf_hifigan.py
+++ b/tests/test_nsf_hifigan.py
@@ -1,4 +1,5 @@
 import soundfile as sf
+import torch
 import torchaudio
 
 from fish_diffusion.modules.pitch_extractors import ParselMouthPitchExtractor
@@ -10,6 +11,10 @@
 
 audio, sr = torchaudio.load(source)
 
+# Change the multichannel audio to single channel
+if audio.shape[0] > 1:
+    audio = torch.mean(audio, dim=0, keepdim=True)
+
 mel = gan.wav2spec(audio)
 f0 = ParselMouthPitchExtractor(f0_min=40.0, f0_max=2000.0, keep_zeros=False)(
     audio, sr, pad_to=mel.shape[-1]
diff --git a/tools/nsf_hifigan/config_v1_openvpi.json b/tools/nsf_hifigan/config_v1_openvpi.json
new file mode 100644
index 00000000..6bfdb545
--- /dev/null
+++ b/tools/nsf_hifigan/config_v1_openvpi.json
@@ -0,0 +1,61 @@
+{
+    "resblock": "1",
+    "learning_rate": 0.0002,
+    "adam_b1": 0.8,
+    "adam_b2": 0.99,
+    "lr_decay": 0.999,
+    "upsample_rates": [
+        8,
+        8,
+        2,
+        2,
+        2
+    ],
+    "upsample_kernel_sizes": [
+        16,
+        16,
+        4,
+        4,
+        4
+    ],
+    "upsample_initial_channel": 512,
+    "resblock_kernel_sizes": [
+        3,
+        7,
+        11
+    ],
+    "resblock_dilation_sizes": [
+        [
+            1,
+            3,
+            5
+        ],
+        [
+            1,
+            3,
+            5
+        ],
+        [
+            1,
+            3,
+            5
+        ]
+    ],
+    "discriminator_periods": [
+        3,
+        5,
+        7,
+        11,
+        17,
+        23,
+        37
+    ],
+    "segment_size": 16384,
+    "num_mels": 128,
+    "n_fft": 2048,
+    "hop_size": 512,
+    "win_size": 2048,
+    "sampling_rate": 44100,
+    "fmin": 40,
+    "fmax": 16000
+}