fishaudio · colstone · May 1, 2024 · May 1, 2024 · Jun 29, 2024 · Jun 29, 2024
diff --git a/tests/test_nsf_hifigan.py b/tests/test_nsf_hifigan.py
@@ -1,4 +1,5 @@
 import soundfile as sf
+import torch
 import torchaudio
 
 from fish_diffusion.modules.pitch_extractors import ParselMouthPitchExtractor
@@ -10,6 +11,10 @@
 
 audio, sr = torchaudio.load(source)
 
+# Downmix multichannel audio to a single channel by averaging over dim 0
+if audio.shape[0] > 1:
+    audio = torch.mean(audio, dim=0, keepdim=True)
+
 mel = gan.wav2spec(audio)
 f0 = ParselMouthPitchExtractor(f0_min=40.0, f0_max=2000.0, keep_zeros=False)(
     audio, sr, pad_to=mel.shape[-1]

diff --git a/tools/nsf_hifigan/config_v1_openvpi.json b/tools/nsf_hifigan/config_v1_openvpi.json
@@ -0,0 +1,61 @@
+{
+    "resblock": "1",
+    "learning_rate": 0.0002,
+    "adam_b1": 0.8,
+    "adam_b2": 0.99,
+    "lr_decay": 0.999,
+    "upsample_rates": [
+        8,
+        8,
+        2,
+        2,
+        2
+    ],
+    "upsample_kernel_sizes": [
+        16,
+        16,
+        4,
+        4,
+        4
+    ],
+    "upsample_initial_channel": 512,
+    "resblock_kernel_sizes": [
+        3,
+        7,
+        11
+    ],
+    "resblock_dilation_sizes": [
+        [
+            1,
+            3,
+            5
+        ],
+        [
+            1,
+            3,
+            5
+        ],
+        [
+            1,
+            3,
+            5
+        ]
+    ],
+    "discriminator_periods": [
+        3,
+        5,
+        7,
+        11,
+        17,
+        23,
+        37
+    ],
+    "segment_size": 16384,
+    "num_mels": 128,
+    "n_fft": 2048,
+    "hop_size": 512,
+    "win_size": 2048,
+    "sampling_rate": 44100,
+    "fmin": 40,
+    "fmax": 16000
+}