Skip to content

The right way to generate mel-spectrogram #3

@v-nhandt21

Description

@v-nhandt21

I found your repo from this issue: jik876/hifi-gan#63

I am still confused about the mismatch between repos in mel-spectrogram generation. I collected several methods from different TTS repos, and they differ in a number of ways, such as:

  • STFT from torch vs librosa
  • Log mel with base e vs base 10
  • Difference in padding
  • Use center or not

def get_mel_librosa1(wave):
    """Compute a mel spectrogram with one ``librosa.feature.melspectrogram`` call.

    The waveform is divided by ``max_wav_value`` and cast to float32 before
    being handed to librosa. ``center``, ``pad_mode`` and ``power`` are not
    passed, so librosa's own defaults apply, and no log compression is
    performed on the result.
    """
    scaled = (wave / max_wav_value).astype('float32')
    return librosa.feature.melspectrogram(
        y=scaled,
        sr=sampling_rate,
        n_mels=num_mels,
        n_fft=fft_size,
        hop_length=hop_size,
        win_length=win_length,
        window=window_librosa,
    )

def get_mel_librosa2(wave):
    """Compute a dB-scaled mel spectrogram from an explicit librosa STFT.

    Steps: scale the waveform by ``max_wav_value`` and cast to float32, take
    the STFT, keep only its magnitude, project the magnitude onto the mel
    scale, then convert to decibels.
    """
    samples = (wave / max_wav_value).astype('float32')
    stft_matrix = librosa.stft(
        samples,
        n_fft=fft_size,
        hop_length=hop_size,
        win_length=win_length,
        window=window_librosa,
    )
    # magphase returns (magnitude, phase); only the magnitude is needed.
    magnitude = librosa.magphase(stft_matrix)[0]
    mel_magnitude = librosa.feature.melspectrogram(
        S=magnitude,
        sr=sampling_rate,
        n_mels=num_mels,
        n_fft=fft_size,
        hop_length=hop_size,
        win_length=win_length,
        window=window_librosa,
    )
    # ref=np.min makes the dB values relative to the smallest mel magnitude.
    return librosa.amplitude_to_db(mel_magnitude, ref=np.min)

def get_mel_parallelwavegan(wave):
    """Compute a log10 mel spectrogram from a centered, reflect-padded STFT.

    The waveform is scaled by ``max_wav_value`` and cast to float32. The STFT
    magnitude is projected through ``melbasis``, floored at ``eps`` so the
    logarithm stays finite, and returned with mel bins on the first axis.
    """
    signal = (wave / max_wav_value).astype('float32')
    stft_matrix = librosa.stft(
        signal,
        n_fft=fft_size,
        hop_length=hop_size,
        win_length=win_length,
        window=window_librosa,
        center=True,
        pad_mode="reflect",
    )
    magnitude_frames = np.abs(stft_matrix).T  # (#frames, #bins)
    mel_frames = np.dot(magnitude_frames, melbasis.T)
    floored = np.maximum(eps, mel_frames)
    # Transpose back so the result is (#mels, #frames).
    return np.log10(floored).T

def get_mel_tacotron2(wave):
    """Compute a mel spectrogram with the Tacotron 2 ``TacotronSTFT`` front end.

    The waveform is converted to a float tensor, divided by ``max_wav_value``
    and given a leading batch axis before being passed to
    ``TacotronSTFT.mel_spectrogram``. The batch axis is removed again and the
    result is returned as a numpy array.
    """
    # A plain tensor already has requires_grad=False, so the deprecated
    # torch.autograd.Variable wrapper is not needed.
    audio_norm = (torch.FloatTensor(wave) / max_wav_value).unsqueeze(0)

    # The third argument is the analysis window length: pass win_length (as
    # every other extractor here does) instead of repeating fft_size.
    stft = TacotronSTFT(fft_size, hop_size, win_length, num_mels, sampling_rate, fmin, fmax)

    melspec = stft.mel_spectrogram(audio_norm).squeeze(0)
    return melspec.cpu().detach().numpy()

def get_mel_hifigan_origin(y):
    """Compute a natural-log mel spectrogram the way the HiFi-GAN repo does.

    The waveform is divided by ``max_wav_value``, reflect-padded by
    ``(fft_size - hop_size) / 2`` samples on each side, and transformed with
    ``torch.stft(center=False)``. The STFT magnitude is projected through
    ``melbasis``, clamped to at least 1e-5 and log-compressed.

    With this padding the STFT yields ``floor(T / hop_size)`` frames when
    ``fft_size - hop_size`` is even, i.e. one frame fewer than a centered STFT.

    Returns the spectrogram of the single batch item as a numpy array.
    """
    y = y / max_wav_value
    # Build the (1, T) float32 batch straight from the array; wrapping an
    # ndarray in a Python list before tensor construction is very slow.
    y = torch.as_tensor(y, dtype=torch.float32).unsqueeze(0).to(device)

    pad = int((fft_size - hop_size) / 2)
    y = torch.nn.functional.pad(y.unsqueeze(1), (pad, pad), mode='reflect').squeeze(1)

    # return_complex=False is deprecated in torch.stft. Request the complex
    # result and view it as (..., 2) real/imag pairs so the magnitude formula
    # below is numerically unchanged.
    spec = torch.stft(y, fft_size, hop_length=hop_size, win_length=win_length, window=window_torch, center=False, pad_mode='reflect', normalized=False, onesided=True, return_complex=True)
    spec = torch.view_as_real(spec)
    # The small constant keeps sqrt well-behaved at exactly zero energy.
    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-9)

    mel_basis = torch.from_numpy(melbasis).float().to(device)
    spec = torch.matmul(mel_basis, spec)
    # Clamp before the log so silent bins do not produce -inf.
    spec = torch.log(torch.clamp(spec, min=1e-5))
    return spec.cpu().detach().numpy()[0]

def get_mel_hifigan_center(y):
    """Compute a HiFi-GAN-style log-mel spectrogram using a centered STFT.

    The waveform is divided by ``max_wav_value`` and transformed with
    ``torch.stft(center=True, pad_mode='reflect')``. The STFT magnitude is
    projected through ``melbasis``, clamped to at least 1e-5 and
    log-compressed (natural log).

    Returns the spectrogram of the single batch item as a numpy array.
    """
    y = y / max_wav_value
    # Build the (1, T) float32 batch straight from the array; wrapping an
    # ndarray in a Python list before tensor construction is very slow.
    y = torch.as_tensor(y, dtype=torch.float32).unsqueeze(0).to(device)

    # No manual padding here: with center=True torch.stft itself reflect-pads
    # the signal on both sides so that frames are centered.
    # return_complex=False is deprecated in torch.stft. Request the complex
    # result and view it as (..., 2) real/imag pairs so the magnitude formula
    # below is numerically unchanged.
    spec = torch.stft(y, fft_size, hop_length=hop_size, win_length=win_length, window=window_torch, center=True, pad_mode='reflect', normalized=False, onesided=True, return_complex=True)
    spec = torch.view_as_real(spec)
    # The small constant keeps sqrt well-behaved at exactly zero energy.
    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-9)

    mel_basis = torch.from_numpy(melbasis).float().to(device)
    spec = torch.matmul(mel_basis, spec)
    # Clamp before the log so silent bins do not produce -inf.
    spec = torch.log(torch.clamp(spec, min=1e-5))
    return spec.cpu().detach().numpy()[0]

def get_mel_hifigan_change_pad(y):
    """Compute a HiFi-GAN-style log-mel spectrogram with ``fft_size / 2`` padding.

    Variant discussed in https://github.com/jik876/hifi-gan/issues/63: the
    waveform is reflect-padded by ``fft_size / 2`` samples on each side
    (instead of ``(fft_size - hop_size) / 2``) before ``torch.stft`` is called
    with ``center=False``. The STFT magnitude is projected through
    ``melbasis``, clamped to at least 1e-5 and log-compressed (natural log).

    Returns the spectrogram of the single batch item as a numpy array.
    """
    y = y / max_wav_value
    # Build the (1, T) float32 batch straight from the array; wrapping an
    # ndarray in a Python list before tensor construction is very slow.
    y = torch.as_tensor(y, dtype=torch.float32).unsqueeze(0).to(device)

    pad = int(fft_size / 2)
    y = torch.nn.functional.pad(y.unsqueeze(1), (pad, pad), mode='reflect').squeeze(1)

    # return_complex=False is deprecated in torch.stft. Request the complex
    # result and view it as (..., 2) real/imag pairs so the magnitude formula
    # below is numerically unchanged.
    spec = torch.stft(y, fft_size, hop_length=hop_size, win_length=win_length, window=window_torch, center=False, pad_mode='reflect', normalized=False, onesided=True, return_complex=True)
    spec = torch.view_as_real(spec)
    # The small constant keeps sqrt well-behaved at exactly zero energy.
    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-9)

    mel_basis = torch.from_numpy(melbasis).float().to(device)
    spec = torch.matmul(mel_basis, spec)
    # Clamp before the log so silent bins do not produce -inf.
    spec = torch.log(torch.clamp(spec, min=1e-5))

    return spec.cpu().detach().numpy()[0]
 # Run every extractor on the same waveform so the outputs can be compared.
 # librosa-based variants
 mel0 = get_mel_librosa1(wave)
 mel1 = get_mel_librosa2(wave)
 mel2 = get_mel_parallelwavegan(wave)
 # torch-based variants
 mel3 = get_mel_tacotron2(wave)
 mel4 = get_mel_hifigan_origin(wave)
 mel5 = get_mel_hifigan_center(wave)
 mel6 = get_mel_hifigan_change_pad(wave)
(80, 487)
(80, 487)
(80, 487)
(80, 487)
(80, 486)
(80, 487)
(80, 487)

Only the original method from the HiFi-GAN repo gives a different shape: get_mel_hifigan_origin

Do you have any comments on this? When I compare element values, the outputs of these methods do not fully match one another.

One more question: is there any benchmark for these vocoders?

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions