CookiePPP / VocoderComparisons

Train/test a variety of open source vocoders using the same input features and dataset. Then infer together for easy side-by-side comparisons.

The right way to generate mel-spectrogram #3

Open v-nhandt21 opened 2 years ago

v-nhandt21 commented 2 years ago

I found your repo from this issue: https://github.com/jik876/hifi-gan/issues/63

I am still confused about the mismatch in mel-spectrogram generation between repos. I collected methods from several TTS repos, and they differ in a few ways, as shown in the functions below.
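All of the snippets share a common setup roughly like the following. The concrete values here are placeholders I chose for illustration, not taken from any particular repo:

import numpy as np
import torch
import librosa
from scipy.signal import get_window

# Placeholder config for illustration only; real values come from each repo's config file.
sampling_rate = 22050
num_mels = 80
fft_size = 1024
hop_size = 256
win_length = 1024
fmin, fmax = 0, 8000
max_wav_value = 32768.0
eps = 1e-10
device = 'cpu'

# Shared mel filterbank, shape (num_mels, fft_size // 2 + 1)
melbasis = librosa.filters.mel(sr=sampling_rate, n_fft=fft_size, n_mels=num_mels, fmin=fmin, fmax=fmax)

# Analysis windows for the librosa and torch paths (both periodic Hann)
window_librosa = get_window('hann', win_length, fftbins=True)
window_torch = torch.hann_window(win_length).to(device)

# TacotronSTFT is assumed to come from the NVIDIA tacotron2 repo (layers.py)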


def get_mel_librosa1(wave):
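     # librosa mel straight from the waveform: default power=2.0 (power spectrogram),
     # center=True padding, and no log compression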
     wave = wave / max_wav_value
     wave = wave.astype('float32')
     mel = librosa.feature.melspectrogram(y=wave, sr=sampling_rate, n_mels=num_mels, n_fft=fft_size, hop_length=hop_size, win_length=win_length, window=window_librosa) #, center=True, pad_mode='constant', power=2.0)
     return mel

def get_mel_librosa2(wave):
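     # magnitude STFT -> mel filterbank -> dB scale (relative to the minimum value)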
     wave = wave / max_wav_value
     wave = wave.astype('float32')
     sgram = librosa.stft(wave, n_fft=fft_size, hop_length=hop_size, win_length=win_length, window=window_librosa)
     sgram_mag, _ = librosa.magphase(sgram)
     mel_scale_sgram = librosa.feature.melspectrogram(S=sgram_mag, sr=sampling_rate, n_mels=num_mels, n_fft=fft_size, hop_length=hop_size, win_length=win_length, window=window_librosa)
     mel_sgram = librosa.amplitude_to_db(mel_scale_sgram, ref=np.min)
     return mel_sgram

def get_mel_parallelwavegan(wave):
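     # ParallelWaveGAN-style: center=True reflect padding, magnitude STFT, mel filterbank, then log10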
     # get amplitude spectrogram
     wave = wave / max_wav_value
     wave = wave.astype('float32')
     x_stft = librosa.stft(wave, n_fft=fft_size, hop_length=hop_size, win_length=win_length, window=window_librosa, center=True, pad_mode="reflect")
     spc = np.abs(x_stft).T  # (#frames, #bins)
     mel = np.maximum(eps, np.dot(spc, melbasis.T))
     return np.log10(mel).T

def get_mel_tacotron2(wave):
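     # NVIDIA Tacotron 2 path: TacotronSTFT applies the mel filterbank to the magnitude
     # spectrogram and uses natural-log (dynamic range) compression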
     wave = torch.FloatTensor(wave)
     audio_norm = wave / max_wav_value
     audio_norm = audio_norm.unsqueeze(0)
     audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)

     _stft = TacotronSTFT(fft_size, hop_size, fft_size, num_mels, sampling_rate, fmin, fmax)

     melspec = _stft.mel_spectrogram(audio_norm)
     melspec = torch.squeeze(melspec, 0)
     return melspec.cpu().detach().numpy()

def get_mel_hifigan_origin(y):
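     # original HiFi-GAN meldataset.py: manual reflect padding of (fft_size - hop_size)/2
     # on each side, center=False, mel filterbank, natural log clamped at 1e-5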
     y = y/max_wav_value
     y = torch.FloatTensor([y]).to(device)
     y = torch.nn.functional.pad(y.unsqueeze(1), (int((fft_size-hop_size)/2), int((fft_size-hop_size)/2)), mode='reflect').squeeze(1)
     spec = torch.stft(y, fft_size, hop_length=hop_size, win_length=win_length, window=window_torch, center=False, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)
     spec = torch.sqrt(spec.pow(2).sum(-1)+(1e-9))
     mel_basis = torch.from_numpy( melbasis ).float().to(device)
     spec = torch.matmul(mel_basis, spec)
     spec = torch.log(torch.clamp(spec, min=1e-5) * 1)
     return spec.cpu().detach().numpy()[0]

def get_mel_hifigan_center(y):
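     # variant: no manual padding, torch.stft with center=True instead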
     y = y/max_wav_value
     y = torch.FloatTensor([y]).to(device)
     # y = torch.nn.functional.pad(y.unsqueeze(1), (int((fft_size-hop_size)/2), int((fft_size-hop_size)/2)), mode='reflect').squeeze(1)
     spec = torch.stft(y, fft_size, hop_length=hop_size, win_length=win_length, window=window_torch, center=True, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)
     spec = torch.sqrt(spec.pow(2).sum(-1)+(1e-9))
     mel_basis = torch.from_numpy( melbasis ).float().to(device)
     spec = torch.matmul(mel_basis, spec)
     spec = torch.log(torch.clamp(spec, min=1e-5) * 1)
     return spec.cpu().detach().numpy()[0]

def get_mel_hifigan_change_pad(y):
     # https://github.com/jik876/hifi-gan/issues/63
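     # variant: manual padding widened to fft_size/2 per the linked issue, center=False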
     y = y/max_wav_value
     y = torch.FloatTensor([y]).to(device)
     y = torch.nn.functional.pad(y.unsqueeze(1), (int((fft_size)/2), int((fft_size)/2)), mode='reflect').squeeze(1)
     spec = torch.stft(y, fft_size, hop_length=hop_size, win_length=win_length, window=window_torch, center=False, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)
     spec = torch.sqrt(spec.pow(2).sum(-1)+(1e-9))
     mel_basis = torch.from_numpy( melbasis ).float().to(device)
     spec = torch.matmul(mel_basis, spec)
     spec = torch.log(torch.clamp(spec, min=1e-5) * 1)

     return spec.cpu().detach().numpy()[0]

mel0 = get_mel_librosa1(wave)
mel1 = get_mel_librosa2(wave)
mel2 = get_mel_parallelwavegan(wave)
mel3 = get_mel_tacotron2(wave)
mel4 = get_mel_hifigan_origin(wave)
mel5 = get_mel_hifigan_center(wave)
mel6 = get_mel_hifigan_change_pad(wave)

The resulting shapes, in the same order:

(80, 487)
(80, 487)
(80, 487)
(80, 487)
(80, 486)
(80, 487)
(80, 487)

Only the original padding from the HiFi-GAN repo gives a different shape: get_mel_hifigan_origin.
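I think this comes from the framing arithmetic: with center=False, torch.stft produces 1 + (padded_length - fft_size) // hop_size frames, and HiFi-GAN's padding of (fft_size - hop_size)/2 per side gives exactly one frame fewer than the fft_size/2 per side used by center=True (and by the change_pad variant). A minimal sketch, reusing the placeholder fft_size and hop_size from above:

def frames_center_true(n_samples):
    # center=True (and the change_pad variant) pads fft_size // 2 on each side
    return 1 + n_samples // hop_size

def frames_hifigan_origin(n_samples):
    # original HiFi-GAN pads (fft_size - hop_size) // 2 on each side with center=False
    padded = n_samples + (fft_size - hop_size)
    return 1 + (padded - fft_size) // hop_size   # simplifies to n_samples // hop_size

n_samples = 486 * hop_size + 123   # hypothetical wave length consistent with the shapes above
print(frames_center_true(n_samples), frames_hifigan_origin(n_samples))   # 487 486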

Do you have any comments on this? When I compare element values, none of these methods match exactly.
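As far as I can tell, even with identical framing the functions end up on different scales, so I would not expect an exact element-wise match: get_mel_librosa1 returns a power-scale mel with no log, get_mel_librosa2 returns dB, get_mel_parallelwavegan returns log10 of a magnitude mel, and the Tacotron 2 / HiFi-GAN paths return a natural log with clamping. A minimal check of the compression scales (using the placeholder config above):

# ParallelWaveGAN stores log10(magnitude mel) while HiFi-GAN stores ln(magnitude mel),
# so away from the silence floors they should differ only by a factor of ln(10).
log10_mel = get_mel_parallelwavegan(wave)   # (num_mels, frames), log10 scale, floor eps=1e-10
ln_mel = get_mel_hifigan_center(wave)       # (num_mels, frames), natural-log scale, floor 1e-5
rescaled = log10_mel * np.log(10.0)
mask = ln_mel > np.log(1e-5) + 1e-3         # ignore bins sitting on the clamp floor
print(np.abs(rescaled - ln_mel)[mask].max())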

One more question: is there any benchmark comparing these vocoders?

v-nhandt21 commented 2 years ago

Which one should I use when I want to fine-tune HiFi-GAN on Tacotron 2 outputs?