-
Notifications
You must be signed in to change notification settings - Fork 1
Open
Description
I found your repo from this issue: jik876/hifi-gan#63
I am still confused about the mismatch between repos in mel-spectrogram generation. I collected the methods used by several TTS repos; they differ in several ways, such as:
- STFT from torch vs librosa
- Log mel with base e vs base 10
- Difference in padding
- Use center or not
def get_mel_librosa1(wave):
    """Return a mel spectrogram computed by librosa straight from the waveform.

    ``wave`` is divided by the module-level ``max_wav_value`` and cast to
    float32 before being handed to ``librosa.feature.melspectrogram``. No log
    compression is applied to the result.
    """
    normalized = (wave / max_wav_value).astype('float32')
    # center / pad_mode / power are deliberately left at librosa's defaults
    # (the original author noted them as center=True, pad_mode='constant',
    # power=2.0 -- verify against the installed librosa version).
    return librosa.feature.melspectrogram(
        y=normalized,
        sr=sampling_rate,
        n_mels=num_mels,
        n_fft=fft_size,
        hop_length=hop_size,
        win_length=win_length,
        window=window_librosa,
    )
def get_mel_librosa2(wave):
    """Return a dB-scaled mel spectrogram built from an explicit librosa STFT.

    Steps: normalise by ``max_wav_value``, take the STFT, keep its magnitude,
    project it onto the mel scale, then convert to decibels relative to the
    smallest mel value (``ref=np.min``).
    """
    # The same analysis settings are passed to both the STFT and the mel
    # projection, exactly as the two separate calls originally did.
    stft_kwargs = dict(
        n_fft=fft_size,
        hop_length=hop_size,
        win_length=win_length,
        window=window_librosa,
    )
    samples = (wave / max_wav_value).astype('float32')
    magnitude, _phase = librosa.magphase(librosa.stft(samples, **stft_kwargs))
    mel_magnitude = librosa.feature.melspectrogram(
        S=magnitude, sr=sampling_rate, n_mels=num_mels, **stft_kwargs
    )
    return librosa.amplitude_to_db(mel_magnitude, ref=np.min)
def get_mel_parallelwavegan(wave):
    """Return a log10 mel spectrogram in the ParallelWaveGAN style.

    The waveform is normalised by ``max_wav_value``, transformed with a
    centred, reflect-padded librosa STFT, projected through the module-level
    ``melbasis`` filterbank, floored at ``eps`` and compressed with log10.
    The result is transposed back so mel bins are on the first axis.
    """
    samples = (wave / max_wav_value).astype('float32')
    stft_matrix = librosa.stft(
        samples,
        n_fft=fft_size,
        hop_length=hop_size,
        win_length=win_length,
        window=window_librosa,
        center=True,
        pad_mode="reflect",
    )
    magnitude_frames = np.abs(stft_matrix).T  # (#frames, #bins)
    mel_frames = np.dot(magnitude_frames, melbasis.T)
    # Floor before the log so silent bins do not produce -inf.
    floored = np.maximum(eps, mel_frames)
    return np.log10(floored).T
def get_mel_tacotron2(wave):
    """Return a mel spectrogram computed with ``TacotronSTFT``.

    ``wave`` is converted to a float tensor, divided by the module-level
    ``max_wav_value``, given a leading batch axis, and passed to
    ``TacotronSTFT.mel_spectrogram``. The batch axis is removed again and the
    result is returned as a NumPy array.
    """
    # Plain tensors already carry autograd state; the deprecated
    # torch.autograd.Variable wrapper (a no-op since PyTorch 0.4) is not needed.
    audio_norm = (torch.FloatTensor(wave) / max_wav_value).unsqueeze(0)
    # NOTE(review): the third positional argument is fft_size, whereas the
    # sibling extractors pass win_length for the window size -- presumably this
    # slot is TacotronSTFT's win_length; confirm the mismatch is intended.
    stft = TacotronSTFT(
        fft_size, hop_size, fft_size, num_mels, sampling_rate, fmin, fmax
    )
    melspec = torch.squeeze(stft.mel_spectrogram(audio_norm), 0)
    return melspec.cpu().detach().numpy()
def get_mel_hifigan_origin(y):
    """Return a natural-log mel spectrogram using the original HiFi-GAN padding.

    The waveform is normalised by ``max_wav_value``, reflect-padded by
    ``(fft_size - hop_size) / 2`` samples on each side, and transformed with an
    un-centred ``torch.stft``. The magnitude is projected through ``melbasis``,
    clamped at 1e-5 and log-compressed.

    Because the manual padding is shorter than the ``fft_size / 2`` that a
    centred STFT applies, this variant produces ``num_samples // hop_size``
    frames (for an even ``fft_size - hop_size``), i.e. one frame fewer than
    the centred variants.

    Returns a NumPy array with the batch axis removed.
    """
    # Build the (1, num_samples) float32 batch directly from the array; wrapping
    # it in a Python list first forces PyTorch onto its slow list-of-ndarray path.
    y = torch.as_tensor(y / max_wav_value, dtype=torch.float32).unsqueeze(0).to(device)

    pad = int((fft_size - hop_size) / 2)
    # F.pad's reflect mode needs a channel axis, hence the unsqueeze/squeeze pair.
    y = torch.nn.functional.pad(y.unsqueeze(1), (pad, pad), mode='reflect').squeeze(1)

    # pad_mode is only consulted when center=True, so it is omitted here.
    # return_complex=False is deprecated; view_as_real yields the same
    # (..., 2) real/imag layout from the complex result.
    spec = torch.stft(
        y,
        fft_size,
        hop_length=hop_size,
        win_length=win_length,
        window=window_torch,
        center=False,
        normalized=False,
        onesided=True,
        return_complex=True,
    )
    spec = torch.view_as_real(spec)
    # Magnitude with a small epsilon so the sqrt stays finite at zero.
    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-9)

    mel_basis = torch.from_numpy(melbasis).float().to(device)
    spec = torch.matmul(mel_basis, spec)
    # Clamp before the log so silent bins do not produce -inf.
    spec = torch.log(torch.clamp(spec, min=1e-5))
    return spec.cpu().detach().numpy()[0]
def get_mel_hifigan_center(y):
    """Return a natural-log mel spectrogram using HiFi-GAN's pipeline with a centred STFT.

    Unlike the original HiFi-GAN recipe there is no manual padding: the
    waveform is normalised by ``max_wav_value`` and ``torch.stft`` is called
    with ``center=True`` and reflect padding. The magnitude is projected
    through ``melbasis``, clamped at 1e-5 and log-compressed.

    Returns a NumPy array with the batch axis removed.
    """
    # Build the (1, num_samples) float32 batch directly from the array; wrapping
    # it in a Python list first forces PyTorch onto its slow list-of-ndarray path.
    y = torch.as_tensor(y / max_wav_value, dtype=torch.float32).unsqueeze(0).to(device)

    # return_complex=False is deprecated; view_as_real yields the same
    # (..., 2) real/imag layout from the complex result.
    spec = torch.stft(
        y,
        fft_size,
        hop_length=hop_size,
        win_length=win_length,
        window=window_torch,
        center=True,
        pad_mode='reflect',
        normalized=False,
        onesided=True,
        return_complex=True,
    )
    spec = torch.view_as_real(spec)
    # Magnitude with a small epsilon so the sqrt stays finite at zero.
    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-9)

    mel_basis = torch.from_numpy(melbasis).float().to(device)
    spec = torch.matmul(mel_basis, spec)
    # Clamp before the log so silent bins do not produce -inf.
    spec = torch.log(torch.clamp(spec, min=1e-5))
    return spec.cpu().detach().numpy()[0]
def get_mel_hifigan_change_pad(y):
    """Return a natural-log mel spectrogram using HiFi-GAN's pipeline with ``fft_size / 2`` padding.

    Variant discussed in https://github.com/jik876/hifi-gan/issues/63: the
    waveform is normalised by ``max_wav_value`` and reflect-padded by
    ``fft_size / 2`` samples on each side before an un-centred ``torch.stft``,
    instead of the ``(fft_size - hop_size) / 2`` used by the original recipe.
    The magnitude is projected through ``melbasis``, clamped at 1e-5 and
    log-compressed.

    Returns a NumPy array with the batch axis removed.
    """
    # Build the (1, num_samples) float32 batch directly from the array; wrapping
    # it in a Python list first forces PyTorch onto its slow list-of-ndarray path.
    y = torch.as_tensor(y / max_wav_value, dtype=torch.float32).unsqueeze(0).to(device)

    pad = int(fft_size / 2)
    # F.pad's reflect mode needs a channel axis, hence the unsqueeze/squeeze pair.
    y = torch.nn.functional.pad(y.unsqueeze(1), (pad, pad), mode='reflect').squeeze(1)

    # pad_mode is only consulted when center=True, so it is omitted here.
    # return_complex=False is deprecated; view_as_real yields the same
    # (..., 2) real/imag layout from the complex result.
    spec = torch.stft(
        y,
        fft_size,
        hop_length=hop_size,
        win_length=win_length,
        window=window_torch,
        center=False,
        normalized=False,
        onesided=True,
        return_complex=True,
    )
    spec = torch.view_as_real(spec)
    # Magnitude with a small epsilon so the sqrt stays finite at zero.
    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-9)

    mel_basis = torch.from_numpy(melbasis).float().to(device)
    spec = torch.matmul(mel_basis, spec)
    # Clamp before the log so silent bins do not produce -inf.
    spec = torch.log(torch.clamp(spec, min=1e-5))
    return spec.cpu().detach().numpy()[0]
# Run every extractor on the same waveform, in the same order as before, so
# the resulting spectrograms can be compared side by side.
mel0, mel1, mel2, mel3, mel4, mel5, mel6 = (
    extract(wave)
    for extract in (
        get_mel_librosa1,
        get_mel_librosa2,
        get_mel_parallelwavegan,
        get_mel_tacotron2,
        get_mel_hifigan_origin,
        get_mel_hifigan_center,
        get_mel_hifigan_change_pad,
    )
)
(80, 487)
(80, 487)
(80, 487)
(80, 487)
(80, 486)
(80, 487)
(80, 487)
Only the original method from the HiFi-GAN repo gives a different shape: get_mel_hifigan_origin
Do you have any comments on this? When I compare element values, these methods do not fully match each other.
One more question: is there any benchmark for these vocoders?
Metadata
Metadata
Assignees
Labels
No labels