sh-lee-prml / BigVGAN

Unofficial pytorch implementation of BigVGAN: A Universal Neural Vocoder with Large-Scale Training
MIT License
130 stars 16 forks source link

inference not working #11

Open eschmidbauer opened 2 years ago

eschmidbauer commented 2 years ago

Hi, I've trained a model for 23,000 steps. Using TensorBoard, I can hear the eval samples improve with each checkpoint, and they sound great! The problem is that I am not able to generate any usable wav files at inference time. The script I've created generates wav files, but they are all static and noise. It would be great if there were an inference script in the repo :)

Here is my config

{
  "train": {
    "log_interval": 10,
    "eval_interval": 100,
    "seed": 1234,
    "epochs": 2900,
    "learning_rate": 2e-4,
    "betas": [0.8, 0.99],
    "eps": 1e-9,
    "batch_size": 16,
    "fp16_run": true,
    "lr_decay": 0.999875,
    "segment_size": 8192,
    "init_lr_ratio": 1,
    "warmup_epochs": 0,
    "c_mel": 45

  },
  "data": {
    "training_files": "./dataset/custom/preprocessed_npz",
    "validation_files":"./dataset/custom/preprocessed_npz",
    "text_cleaners":["english_cleaners2"],
    "max_wav_value": 32768.0,
    "sampling_rate": 22050,
    "filter_length": 1024,
    "hop_length": 256,
    "win_length": 1024,
    "n_mel_channels": 80,
    "mel_fmin": 0.0,
    "mel_fmax": null,
    "add_blank": true,
    "n_speakers": 1,
    "cleaned_text": true,
    "aug_rate": 1.0,
    "top_db": 20
  },
  "model": {
    "p_dropout": 0.1,
    "resblock_kernel_sizes": [3,7,11],
    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
    "upsample_rates": [8,8,2,2],
    "upsample_initial_channel": 512,
    "upsample_kernel_sizes": [16,16,4,4],
    "use_spectral_norm": false

  }
}

Here is a custom script I'm trying to put together.

import os
import torch

from glob import glob
import tqdm
import numpy as np
from scipy.io.wavfile import write
import utils

from mel_processing import mel_spectrogram_torch
from models_bigvgan import Generator

def main():
    """Resynthesize every test utterance with the latest generator checkpoint.

    Reads hyperparameters from ``logs/bigvgan``, builds the generator on the
    CUDA device, loads the newest ``G_*.pth`` checkpoint, and for each
    ``*.npz`` file under ``dataset/custom/preprocessed_npz/p100/test``
    computes a mel spectrogram from the stored ``audio`` array, runs the
    generator on it, peak-normalizes the result and writes it as a 16-bit wav
    file named ``generated_<utterance>.wav`` in ``inference/``.
    """
    print('Initializing Inference Process..')
    h = utils.get_hparams_from_dir('logs/bigvgan')
    # Take seed, sample rate and sample scale from the loaded config so that
    # inference cannot drift from the values the model was trained with.
    torch.cuda.manual_seed(h.train.seed)
    device = torch.device('cuda')
    mel_channels = h.data.n_mel_channels
    sampling_rate = h.data.sampling_rate
    max_wav_value = h.data.max_wav_value

    # NOTE(review): the input feature must match what the generator saw during
    # training. If training fed a linear spectrogram (filter_length // 2 + 1
    # channels) rather than an n_mel_channels mel spectrogram, both the channel
    # count here and the feature extraction below have to change accordingly;
    # a mismatch produces noise. Confirm against the training script.
    generator = Generator(
        mel_channels,
        h.model.resblock_kernel_sizes,
        h.model.resblock_dilation_sizes,
        h.model.upsample_rates,
        h.model.upsample_initial_channel,
        h.model.upsample_kernel_sizes,
    ).to(device)
    utils.load_checkpoint(utils.latest_checkpoint_path("logs/bigvgan/", "G_*.pth"), generator)
    generator.eval()
    generator.remove_weight_norm()

    # Sorted so that runs process utterances in a reproducible order.
    npz_path = sorted(glob(os.path.join("dataset/custom/preprocessed_npz/p100/", "test", "*.npz")))
    print("data len: ", len(npz_path))
    # Report the parameter count; printing generator.parameters() directly only
    # shows the repr of a Python generator object.
    print('Parameters:', sum(p.numel() for p in generator.parameters()))
    print(f'mel_channels {mel_channels}')

    # scipy's write() does not create missing directories.
    output_dir = "inference/"
    os.makedirs(output_dir, exist_ok=True)

    for path in tqdm.tqdm(npz_path, desc="synthesizing each utterance"):
        utterance = os.path.splitext(os.path.basename(path))[0]
        # Context manager closes the underlying .npz archive after reading.
        with np.load(path) as files:
            source = torch.FloatTensor(files['audio'])

        with torch.no_grad():
            # assumes files['audio'] holds samples in the int16 range, so this
            # rescales them to [-1, 1] -- TODO confirm against preprocessing.
            source = source.to(device) / max_wav_value

            mel = mel_spectrogram_torch(
                source.unsqueeze(0),
                h.data.filter_length,
                mel_channels,
                sampling_rate,
                h.data.hop_length,
                h.data.win_length,
                h.data.mel_fmin,
                h.data.mel_fmax,
            )
            audio = generator(mel).squeeze()
            # Clamp the peak so an all-zero output stays silent instead of
            # turning into NaN through a division by zero.
            peak = torch.clamp(torch.abs(audio).max(), min=1e-8)
            audio = audio / peak * 0.999 * max_wav_value
            audio = audio.cpu().numpy().astype('int16')

        output_file = os.path.join(output_dir, "generated_{}.wav".format(utterance))
        write(output_file, sampling_rate, audio)


if __name__ == '__main__':
    main()
Liu-Ruolan commented 2 years ago

A linear spectrogram is used as the input~