I found your repo from this issue:
I am still confused about the mismatch between repos in mel spectrogram generation. I collected the extraction methods from several TTS repos, and there are some differences between them, such as:
def get_mel_librosa1(wave):
    """Return a mel spectrogram computed by librosa from the raw waveform.

    The waveform is divided by ``max_wav_value`` and cast to float32, then
    passed as a time-domain signal to ``librosa.feature.melspectrogram``.
    No log compression is applied in this variant.
    """
    wave = (wave / max_wav_value).astype('float32')
    # Remaining options are left at librosa's defaults; the author noted them
    # as: center=True, pad_mode='constant', power=2.0
    return librosa.feature.melspectrogram(
        y=wave,
        sr=sampling_rate,
        n_mels=num_mels,
        n_fft=fft_size,
        hop_length=hop_size,
        win_length=win_length,
        window=window_librosa,
    )


def get_mel_librosa2(wave):
    """Return a dB-scaled mel spectrogram built from an explicit librosa STFT.

    The STFT magnitude (not power) is handed to
    ``librosa.feature.melspectrogram`` through ``S=``, and the result is
    converted with ``librosa.amplitude_to_db`` using ``ref=np.min``.
    """
    wave = (wave / max_wav_value).astype('float32')
    sgram = librosa.stft(
        wave,
        n_fft=fft_size,
        hop_length=hop_size,
        win_length=win_length,
        window=window_librosa,
    )
    sgram_mag, _ = librosa.magphase(sgram)
    mel_scale_sgram = librosa.feature.melspectrogram(
        S=sgram_mag,
        sr=sampling_rate,
        n_mels=num_mels,
        n_fft=fft_size,
        hop_length=hop_size,
        win_length=win_length,
        window=window_librosa,
    )
    return librosa.amplitude_to_db(mel_scale_sgram, ref=np.min)


def get_mel_parallelwavegan(wave):
    """Return a log10 mel spectrogram following the ParallelWaveGAN recipe.

    The magnitude STFT is projected onto the module-level ``melbasis``,
    floored at ``eps`` and compressed with ``np.log10``. The result is
    transposed back so that mel bins come first.
    """
    # get amplitude spectrogram
    wave = (wave / max_wav_value).astype('float32')
    x_stft = librosa.stft(
        wave,
        n_fft=fft_size,
        hop_length=hop_size,
        win_length=win_length,
        window=window_librosa,
        center=True,
        pad_mode="reflect",
    )
    spc = np.abs(x_stft).T  # (#frames, #bins)
    # The pasted snippet read `np.maximum(eps,, melbasis.T))`, which is not
    # valid Python; the mel projection `np.dot(spc, ...)` had been dropped.
    mel = np.maximum(eps, np.dot(spc, melbasis.T))
    return np.log10(mel).T


def get_mel_tacotron2(wave):
    """Return the mel spectrogram produced by Tacotron 2's ``TacotronSTFT``.

    The waveform is converted to a float tensor, divided by
    ``max_wav_value`` and given a leading batch axis before being passed to
    ``TacotronSTFT.mel_spectrogram``. The batch axis is removed again and the
    result is returned as a NumPy array.
    """
    wave = torch.FloatTensor(wave)
    audio_norm = wave / max_wav_value
    audio_norm = audio_norm.unsqueeze(0)
    audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
    # NOTE(review): the third positional argument is `fft_size`, whereas the
    # other variants pass `win_length` to their STFT. Presumably this slot is
    # TacotronSTFT's window length; confirm the two values agree in the
    # config, otherwise this variant is not comparable with the rest.
    _stft = TacotronSTFT(
        fft_size, hop_size, fft_size, num_mels, sampling_rate, fmin, fmax
    )
    melspec = _stft.mel_spectrogram(audio_norm)
    melspec = torch.squeeze(melspec, 0)
    return melspec.cpu().detach().numpy()


def _hifigan_log_mel(y, center):
    """Shared HiFi-GAN tail: STFT magnitude -> mel projection -> natural log.

    Args:
        y: batched waveform tensor already placed on ``device``.
        center: forwarded to ``torch.stft``; the callers differ only in how
            the signal is padded before this point.

    Returns:
        The first batch item of the log-mel spectrogram as a NumPy array.
    """
    # `return_complex=False` is deprecated in torch.stft; requesting a complex
    # result and calling view_as_real() recovers the same trailing
    # (real, imag) layout the original snippet operated on.
    spec = torch.stft(
        y,
        fft_size,
        hop_length=hop_size,
        win_length=win_length,
        window=window_torch,
        center=center,
        pad_mode='reflect',
        normalized=False,
        onesided=True,
        return_complex=True,
    )
    spec = torch.view_as_real(spec)
    # Magnitude, with a small epsilon inside the square root.
    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-9)
    mel_basis = torch.from_numpy(melbasis).float().to(device)
    spec = torch.matmul(mel_basis, spec)
    # Floor at 1e-5 before the natural log.
    spec = torch.log(torch.clamp(spec, min=1e-5))
    return spec.cpu().detach().numpy()[0]


def get_mel_hifigan_origin(y):
    """Return the log-mel spectrogram using HiFi-GAN's original padding.

    The normalised signal is reflect-padded by ``(fft_size - hop_size) / 2``
    samples on each side and the STFT is taken with ``center=False``.
    """
    y = y / max_wav_value
    y = torch.FloatTensor([y]).to(device)
    pad = int((fft_size - hop_size) / 2)
    y = torch.nn.functional.pad(
        y.unsqueeze(1), (pad, pad), mode='reflect'
    ).squeeze(1)
    return _hifigan_log_mel(y, center=False)


def get_mel_hifigan_center(y):
    """Return the HiFi-GAN log-mel spectrogram using ``center=True``.

    No manual padding is applied; ``torch.stft`` is asked to centre the
    frames itself with reflect padding.
    """
    y = y / max_wav_value
    y = torch.FloatTensor([y]).to(device)
    return _hifigan_log_mel(y, center=True)


def get_mel_hifigan_change_pad(y):
    """Return the HiFi-GAN log-mel spectrogram with ``fft_size / 2`` padding.

    The signal is reflect-padded by ``fft_size / 2`` samples on each side and
    the STFT is taken with ``center=False``.
    """
    # NOTE(review): unlike the other variants, the division by max_wav_value
    # is disabled here, so the input is used at whatever scale the caller
    # passes in. Confirm this is intended before comparing element values.
    y = torch.FloatTensor([y]).to(device)
    pad = int(fft_size / 2)
    y = torch.nn.functional.pad(
        y.unsqueeze(1), (pad, pad), mode='reflect'
    ).squeeze(1)
    return _hifigan_log_mel(y, center=False)
# Run every extraction variant on the same waveform, in a fixed order, so the
# resulting spectrograms can be compared against each other.
mel0, mel1, mel2, mel3, mel4, mel5, mel6 = [
    extract(wave)
    for extract in (
        get_mel_librosa1,
        get_mel_librosa2,
        get_mel_parallelwavegan,
        get_mel_tacotron2,
        get_mel_hifigan_origin,
        get_mel_hifigan_center,
        get_mel_hifigan_change_pad,
    )
]
(80, 487) (80, 487) (80, 487) (80, 487) (80, 486) (80, 487) (80, 487)
Only the original method from the HiFi-GAN repo gives a different shape: get_mel_hifigan_origin
Do you have any comments on this? When I compare the element values, no two of these methods match exactly.
One more question: is there any benchmark for these vocoders?
Which method should I use when I want to fine-tune HiFi-GAN on Tacotron 2 outputs?
