Wendison / VQMIVC

Official implementation of VQMIVC: One-shot (any-to-any) Voice Conversion @ Interspeech 2021 + Online playing demo!
MIT License

Other vocoder #21

Closed 2Bye closed 2 years ago

2Bye commented 2 years ago

Hello, I tried to use another vocoder, HiFi-GAN, with your model, but I ran into a problem: the output audio is either noisy or silent.

I transposed the logmel output to the regular HiFi-GAN input shape [1, 80, X]; with int16 I get very noisy audio, and with int32 I get silence.

My inference code:

import torch
import numpy as np
from scipy.io.wavfile import write

from hifi_gan.env import AttrDict
from hifi_gan.meldataset import mel_spectrogram, MAX_WAV_VALUE, load_wav
from hifi_gan.models_hifi import Generator
import soundfile as sf
from model_encoder import Encoder, Encoder_lf0
from model_decoder import Decoder_ac
from model_encoder import SpeakerEncoder as Encoder_spk
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
import subprocess
from spectrogram import logmelspectrogram
import resampy
import pyworld as pw
import argparse

def extract_logmel(wav_path, mean, std, sr=16000):
    # wav, fs = librosa.load(wav_path, sr=sr)
    wav, fs = sf.read(wav_path)
    if fs != sr:
        wav = resampy.resample(wav, fs, sr, axis=0)
        fs = sr
    #wav, _ = librosa.effects.trim(wav, top_db=15)
    # duration = len(wav)/fs
    assert fs == 16000
    peak = np.abs(wav).max()
    if peak > 1.0:
        wav /= peak
    mel = logmelspectrogram(
                x=wav,
                fs=fs,
                n_mels=80,
                n_fft=400,
                n_shift=160,
                win_length=400,
                window='hann',
                fmin=80,
                fmax=7600,
            )

    mel = (mel - mean) / (std + 1e-8)
    tlen = mel.shape[0]
    frame_period = 160/fs*1000
    f0, timeaxis = pw.dio(wav.astype('float64'), fs, frame_period=frame_period)
    f0 = pw.stonemask(wav.astype('float64'), f0, timeaxis, fs)
    f0 = f0[:tlen].reshape(-1).astype('float32')
    nonzeros_indices = np.nonzero(f0)
    lf0 = f0.copy()
    lf0[nonzeros_indices] = np.log(f0[nonzeros_indices]) # for f0(Hz), lf0 > 0 when f0 != 0
    mean, std = np.mean(lf0[nonzeros_indices]), np.std(lf0[nonzeros_indices])
    lf0[nonzeros_indices] = (lf0[nonzeros_indices] - mean) / (std + 1e-8)
    return mel, lf0

### Load Vocoder
import json

def load_checkpoint(filepath, device):
    # same helper as in HiFi-GAN's inference.py; its definition/import was missing here
    assert os.path.isfile(filepath)
    return torch.load(filepath, map_location=device)

config_file = '../FastPitch/hifi/config.json'
hifi = '../FastPitch/hifi/g_02500000'

with open(config_file) as f:
    json_config = json.loads(f.read())
h = AttrDict(json_config)

generator_hifi = Generator(h).to('cuda')
state_dict_g = load_checkpoint(hifi, 'cuda')
generator_hifi.load_state_dict(state_dict_g['generator'])
generator_hifi.eval()
generator_hifi.remove_weight_norm()

checkpoint_path = 'All_model.ckpt-350.pt'
### load_model
encoder = Encoder(in_channels=80, channels=512, n_embeddings=512, z_dim=64, c_dim=256)
encoder_lf0 = Encoder_lf0()
encoder_spk = Encoder_spk()
decoder = Decoder_ac(dim_neck=64)
encoder.to('cuda')
encoder_lf0.to('cuda')
encoder_spk.to('cuda')
decoder.to('cuda')

checkpoint = torch.load(checkpoint_path, map_location=lambda storage, loc: storage)
encoder.load_state_dict(checkpoint["encoder"])
encoder_lf0.load_state_dict(checkpoint["encoder_lf0"])  # was missing, leaving encoder_lf0 randomly initialized
encoder_spk.load_state_dict(checkpoint["encoder_spk"])
decoder.load_state_dict(checkpoint["decoder"])

encoder.eval()
encoder_lf0.eval()
encoder_spk.eval()
decoder.eval()

def convert(src_wav_path, ref_wav_path, generator_hifi):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    mel_stats = np.load('./mel_stats/stats.npy')
    mean = mel_stats[0]
    std = mel_stats[1]
    src_mel, src_lf0 = extract_logmel(src_wav_path, mean, std)
    ref_mel, _ = extract_logmel(ref_wav_path, mean, std)
    src_mel = torch.FloatTensor(src_mel.T).unsqueeze(0).to(device)
    src_lf0 = torch.FloatTensor(src_lf0).unsqueeze(0).to(device)
    ref_mel = torch.FloatTensor(ref_mel.T).unsqueeze(0).to(device)
    out_filename = os.path.basename(src_wav_path).split('.')[0] 
    with torch.no_grad():
        z, _, _, _ = encoder.encode(src_mel)
        lf0_embs = encoder_lf0(src_lf0)
        spk_emb = encoder_spk(ref_mel)
        output = decoder(z, lf0_embs, spk_emb)
        output = output.transpose(1,2)
        # output[0] = 2.718281**output[0]

    print('synthesize waveform...')

    with torch.no_grad():
        #mel = torch.FloatTensor(mel.cpu()).to(device)
        y_g_hat = generator_hifi(output)
        audio = y_g_hat.squeeze()
        audio = audio * MAX_WAV_VALUE
        audio = audio.cpu().numpy().astype('int16')

    return audio

audio = convert('wav1.wav', 'wav2.wav', generator_hifi)
write('test.wav', 16000, audio)
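
One possible cause worth checking (an assumption, not confirmed in this thread): the decoder output above is still in the normalized log-mel domain (mean/std from stats.npy), while a HiFi-GAN generator expects mels in the scale it was trained on. Also, MAX_WAV_VALUE (32768) targets int16 full scale, so writing those same samples as int32 leaves them roughly 90 dB below full scale, which would sound like silence. A minimal sketch of undoing the normalization before vocoding, reusing the stats loaded in convert():

import numpy as np
import torch

mel_stats = np.load('./mel_stats/stats.npy')
mean = torch.FloatTensor(mel_stats[0]).view(1, 80, 1).to('cuda')  # per-bin mean
std = torch.FloatTensor(mel_stats[1]).view(1, 80, 1).to('cuda')   # per-bin std

with torch.no_grad():
    mel_denorm = output * std + mean        # output: [1, 80, T] from the decoder
    y_g_hat = generator_hifi(mel_denorm)
    audio = y_g_hat.squeeze() * MAX_WAV_VALUE
    # clamp before the int16 cast to avoid wrap-around distortion
    audio = audio.clamp(-MAX_WAV_VALUE, MAX_WAV_VALUE - 1)
    audio = audio.cpu().numpy().astype('int16')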
Wendison commented 2 years ago

Hi, I didn't test using HiFi-GAN to convert the mel-spectrograms generated by the released VQMIVC models. Maybe you need to check whether the way mel-spectrograms were extracted to train your HiFi-GAN is the same as mine; if not, the waveforms generated by your HiFi-GAN will be problematic.
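
For anyone debugging this, one way to check that is to extract mels from the same waveform with both pipelines and compare. A sketch; the HiFi-GAN parameter values shown are assumptions and should be taken from your own config.json, and note that the two extractors may also use different log bases (e.g. log10 vs. natural log with a 1e-5 floor), which per-bin denormalization alone will not fix:

import numpy as np
import soundfile as sf
import torch

from spectrogram import logmelspectrogram        # VQMIVC's extractor
from hifi_gan.meldataset import mel_spectrogram  # HiFi-GAN's extractor

wav, fs = sf.read('wav1.wav')  # any 16 kHz mono test file

# VQMIVC-style log-mel, shape (T, 80), same parameters as extract_logmel above
mel_vqmivc = logmelspectrogram(x=wav, fs=fs, n_mels=80, n_fft=400, n_shift=160,
                               win_length=400, window='hann', fmin=80, fmax=7600)

# HiFi-GAN-style log-mel, shape (1, 80, T); fill in the values from config.json
y = torch.FloatTensor(wav).unsqueeze(0)
mel_hifi = mel_spectrogram(y, n_fft=400, num_mels=80, sampling_rate=fs,
                           hop_size=160, win_size=400, fmin=80, fmax=7600)

T = min(mel_vqmivc.shape[0], mel_hifi.shape[-1])
diff = np.abs(mel_vqmivc[:T].T - mel_hifi[0, :, :T].numpy())
print('max |difference|:', diff.max())  # large values mean the pipelines disagree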

wj-gxy commented 8 months ago

@2Bye Hello, I ran into this problem too. Did you end up using HiFi-GAN? I trained on a Chinese dataset.