I will update the inference code soon.
import os
import torch
import argparse
from glob import glob
import tqdm
import numpy as np
from scipy.io.wavfile import write

import utils
from mel_processing import mel_spectrogram_torch
from models_bigvgan import Generator

h = None
device = None


def get_mel(x):
    return mel_spectrogram_torch(x, h.n_fft, h.num_mels, h.sampling_rate, h.hop_size, h.win_size, h.fmin, h.fmax)


def load_checkpoint(filepath, device):
    assert os.path.isfile(filepath)
    print("Loading '{}'".format(filepath))
    checkpoint_dict = torch.load(filepath, map_location=device)
    print("Complete.")
    return checkpoint_dict


def scan_checkpoint(cp_dir, prefix):
    pattern = os.path.join(cp_dir, prefix + '*')
    cp_list = glob(pattern)  # `glob` is imported as a function, not the module
    if len(cp_list) == 0:
        return ''
    return sorted(cp_list)[-1]


def inference(a):
    os.makedirs(a.output_dir, exist_ok=True)
    generator = Generator(h).to(device)
    state_dict_g = load_checkpoint(a.checkpoint_file, device)
    generator.load_state_dict(state_dict_g['generator'])
    generator.eval()
    generator.remove_weight_norm()

    npz_paths = glob(os.path.join(a.input_dir, "test", "*.npz"))
    print("data len: ", len(npz_paths))
    for path in tqdm.tqdm(npz_paths, desc="synthesizing each utterance"):
        files = np.load(path)
        file_name = os.path.splitext(os.path.basename(path))[0]
        with torch.no_grad():
            audio = torch.FloatTensor(files['audio']).to(device)
            audio = audio / 32768  # int16 -> [-1, 1]
            mel = get_mel(audio.unsqueeze(0))
            audio = generator(mel)
            audio = audio.squeeze()
            audio = audio / torch.abs(audio).max() * 0.999 * 32768.0  # peak-normalize back to int16 range
            audio = audio.cpu().numpy().astype('int16')

        output_file = os.path.join(a.output_dir, "generated_{}.wav".format(file_name))
        write(output_file, 22050, audio)  # note: sample rate hard-coded to 22050 here


def main():
    print('Initializing Inference Process..')
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_dir', default='')
    parser.add_argument('--output_dir', default='inference/')
    parser.add_argument('--checkpoint_file', default='logs/checkpoint')
    a = parser.parse_args()

    config_file = os.path.join(os.path.split(a.checkpoint_file)[0], 'config.json')
    global h
    h = utils.get_hparams_from_file(config_file)

    torch.manual_seed(1234)
    global device
    if torch.cuda.is_available():
        torch.cuda.manual_seed(1234)
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    inference(a)


if __name__ == '__main__':
    main()
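For reference, a run of this script might look like the following (the inference.py filename and the checkpoint path are placeholders based on paths mentioned later in this thread; --input_dir is expected to contain a test/ subfolder of .npz files with an 'audio' array, and config.json must sit next to the checkpoint):

python inference.py --input_dir ./dataset/VCTK-Corpus/preprocessed_npz --output_dir inference/ --checkpoint_file logs/bigvgan_22k/G_140000.pth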
What does "a substitute for HiFi-GAN for the VC task" mean? Did you mean the VC task of VITS? In that case, you can use the same VITS code for the VC task.
I'm experimenting with StarGANv2-VC, where we can use one of the ParallelWaveGAN-trained vocoders.
# load vocoder
from parallel_wavegan.utils import load_model
vocoder = load_model("Vocoder/checkpoint-400000steps.pkl").to('cuda').eval()
vocoder.remove_weight_norm()
_ = vocoder.eval()

with torch.no_grad():
    f0_feat = F0_model.get_feature_GAN(source.unsqueeze(1))
    out = starganv2.generator(source.unsqueeze(1), ref, F0=f0_feat)
    c = out.transpose(-1, -2).squeeze().to('cuda')
    y_out = vocoder.inference(c)
    y_out = y_out.view(-1).cpu()
That's where I'm wondering whether vocoder.inference can be replaced in place with the BigVGAN vocoder, assuming it's trained on the same dataset.
If you use the same Mel-spectrogram for vocoder training, I think it will work!
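For concreteness, a rough sketch of the swap in the StarGANv2-VC snippet above, assuming a BigVGAN generator has already been loaded the same way as in the inference script (load_state_dict, eval(), remove_weight_norm()); the shape handling of the StarGANv2-VC output is an assumption, not this repo's official API:

with torch.no_grad():
    f0_feat = F0_model.get_feature_GAN(source.unsqueeze(1))
    out = starganv2.generator(source.unsqueeze(1), ref, F0=f0_feat)  # assumed [B, 1, n_mels, T]
    c = out.squeeze(1)                    # [B, n_mels, T]; BigVGAN expects channels-first input
    y_out = generator(c)                  # BigVGAN generator in place of vocoder.inference(c)
    y_out = y_out.view(-1).cpu()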
As I understand it, these params must match:
"sampling_rate": 22050,
"filter_length": 1024,
"hop_length": 256,
"win_length": 1024,
In StarGANv2-VC:
sr: 24000
spect_params:
n_fft: 2048
win_length: 1200
hop_length: 300
Yes. Additionally,
"n_mel_channels": 80,
"mel_fmin": 0.0,
"mel_fmax":
spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) # some implementations use a minimum value of 1e-9 for the spectrogram
Also, the above params must match.
Is filter_length the same param as n_fft in StarGANv2?
Yes, they are the same parameter...!
Also, the above params must match. spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
What is this function used for? In StarGANv2-VC there's log normalisation via
mel_tensor = (torch.log(1e-5 + mel_tensor) - self.mean) / self.std
Also, when using these params
"max_wav_value": 32768.0,
"sampling_rate": 24000,
"filter_length": 2048,
"hop_length": 300,
"win_length": 1200,
"n_mel_channels": 80,
I get the error
BigVGAN/train_bigvgan_vocoder.py:194: UserWarning: Using a target size (torch.Size([16, 80, 23])) that is different to the input size (torch.Size([16, 80, 27])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.
loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel
I understand this has to do with the changed hop_size and "segment_size": 8192?
Regarding spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6): see https://github.com/sh-lee-prml/BigVGAN/blob/main/mel_processing.py#L107
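For context, the linked line is just the step that turns the STFT output into a (linear) magnitude spectrogram, with a small epsilon for numerical stability; it is not the log normalisation. Roughly (an illustration, not the exact repo code):

import torch

n_fft, hop_size, win_size = 1024, 256, 1024      # 22.05 kHz values from this thread
x = torch.randn(1, 22050)                        # stand-in for a 1-second waveform

spec = torch.stft(x, n_fft, hop_length=hop_size, win_length=win_size,
                  window=torch.hann_window(win_size), center=False,
                  return_complex=True)
spec = torch.view_as_real(spec)                  # [B, n_fft // 2 + 1, frames, 2] (real/imag parts)
spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)    # linear magnitude spectrogram
# A StarGANv2-VC style normalisation like (log(1e-5 + mel) - mean) / std is a separate
# step applied after the mel projection, not part of this line.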
Regarding the size-mismatch warning with the 24 kHz parameters: yes~ change segment_size to 32*300 = 9600, but you have to check your GPU memory first. Modify it for your dataset~!
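The constraint is simply that the training segment length has to be a whole number of hops, so the mel frames and the waveform segment stay aligned. A quick sanity check (sketch):

hop_length = 300        # 24 kHz setting discussed here
segment_size = 9600     # 32 hops * 300 samples

assert segment_size % hop_length == 0
print(segment_size // hop_length)   # 32 mel frames per training segment
# The default 22.05 kHz config uses hop 256 with segment_size 8192 = 32 * 256.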
We have an RTX 3090 (24 GB) and set batch_size to 16, because with 32 we get CUDA out-of-memory errors.
After changing "segment_size" to 9600, still the same error albeit with different target and input sizes: Using a target size (torch.Size([16, 80, 27])) that is different to the input size (torch.Size([16, 80, 32]))
Oh... I missed some points about the hop length and upsampling rates of your data... I think you have to change your upsampling rates (e.g., 6x5x5x2 = 300, or the upsampling rates of the vocoder you used before). In the original setting, the model upsamples the Mel-spectrogram to a waveform by 256x (8x8x2x2 = 256).
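Put differently, the product of the generator's upsample_rates has to equal the hop length used for the mel/spectrogram extraction. A sketch of the check, using the factorisation suggested later in this thread:

import math

hop_length = 300
upsample_rates = [5, 5, 4, 3]   # one factorisation of 300; the original config uses [8, 8, 2, 2] = 256

assert math.prod(upsample_rates) == hop_length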
Our dataset is a VCTK-like dataset; the only difference is that its sampling rate is 24000.
Could you check the shape of spec.pt? I suspect that you may be using old spec.pt files preprocessed by our previous version.
In that case, you have to remove your .pt files first. After that, train again.
torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
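A small sketch for spotting stale cached spectrograms (the cache location and *.pt naming are assumptions; adjust to wherever your data loader writes them):

import glob
import torch

# Hypothetical cache location; adjust to your dataset layout.
for path in glob.glob("./dataset/VCTK-Corpus/preprocessed_npz/**/*.pt", recursive=True):
    spec = torch.load(path)
    print(path, tuple(spec.shape))
# The frequency dimension should match filter_length // 2 + 1 for the current config
# (1025 for n_fft = 2048, 513 for n_fft = 1024); if it does not, delete the cached
# files and let preprocessing regenerate them.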
I haven't changed them. This is what I've changed, and I still cannot train on the 24 kHz dataset:
{
  "train": {
    "log_interval": 200,
    "eval_interval": 5000,
    "seed": 1234,
    "epochs": 20000,
    "learning_rate": 2e-4,
    "betas": [0.8, 0.99],
    "eps": 1e-9,
    "batch_size": 16,
    "fp16_run": true,
    "lr_decay": 0.999875,
    "segment_size": 9600,
    "init_lr_ratio": 1,
    "warmup_epochs": 0,
    "c_mel": 45
  },
  "data": {
    "training_files": "./dataset/VCTK-Corpus/preprocessed_npz",
    "validation_files": "./dataset/VCTK-Corpus/preprocessed_npz",
    "text_cleaners": ["english_cleaners2"],
    "max_wav_value": 32768.0,
    "sampling_rate": 24000,
    "filter_length": 2048,
    "hop_length": 300,
    "win_length": 1200,
    "n_mel_channels": 80,
    "mel_fmin": 0.0,
    "mel_fmax": null,
    "add_blank": true,
    "n_speakers": 125,
    "cleaned_text": true,
    "aug_rate": 1.0,
    "top_db": 20
  },
  "model": {
    "p_dropout": 0.1,
    "resblock_kernel_sizes": [3, 7, 11],
    "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
    "upsample_rates": [5, 5, 4, 3],
    "upsample_initial_channel": 512,
    "upsample_kernel_sizes": [10, 10, 8, 6],
    "use_spectral_norm": false
  }
}
spec shape is torch.Size([16, 80, 43])
RuntimeError: Given groups=1, weight of size [512, 513, 7], expected input[16, 1025, 32] to have 513 channels, but got 1025 channels instead
Should this line be changed, though?
initial_freq = [690, 5513, 11025, 22050]
I've prepared the dataset with the above params. Now training terminates with an error:
BigVGAN/losses.py", line 15, in feature_loss
loss += torch.mean(torch.abs(rl - gl))
exception! The size of tensor a (24) must match the size of tensor b (25) at non-singleton dimension 3
exception! The size of tensor a (24) must match the size of tensor b (25) at non-singleton dimension 3
exception! The size of tensor a (1600) must match the size of tensor b (1613) at non-singleton dimension 2
exception! The size of tensor a (534) must match the size of tensor b (538) at non-singleton dimension 2
exception! The size of tensor a (178) must match the size of tensor b (180) at non-singleton dimension 2
exception! The size of tensor a (1067) must match the size of tensor b (1075) at non-singleton dimension 2
rl and gl tensor sizes are indeed different
Actually, I have not tried this with a sampling rate of 24,000 and a hop size of 300.
I think you also have to change the kernel sizes (for the padding) and change initial_freq for your sampling rate:
"upsample_rates": [5,5,4,3],
"upsample_kernel_sizes": [11,11,8,7],
initial_freq = [400, 2000, 8000, 24000]
When using 32 frames of Mel-spectrogram as input, the output must be 9600 samples.
Check it first plz~~!
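As a sketch of that check (the positional Generator arguments follow the snippet used later in this thread; the posted inference script instead passes the whole hparams object, so adjust to whichever constructor your version exposes):

import torch
from models_bigvgan import Generator

# 32 input frames should come out as 32 * 300 = 9600 waveform samples.
gen = Generator(1025,                              # filter_length // 2 + 1 for n_fft = 2048 (linear-spec input)
                [3, 7, 11],
                [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
                [5, 5, 4, 3],
                512,
                [11, 11, 8, 7])
with torch.no_grad():
    y = gen(torch.randn(1, 1025, 32))
print(y.shape)                                     # expecting roughly [1, 1, 9600]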
Thank you!
Something is wrong with the inference code:
mel_outputs = model.inference(audio_A, mel_A, mel_len_A, audio_B, mel_B, mel_len_B)
...
generator = Generator(h.data.filter_length // 2 + 1,
                      h.model.resblock_kernel_sizes, h.model.resblock_dilation_sizes,
                      h.model.upsample_rates, h.model.upsample_initial_channel,
                      h.model.upsample_kernel_sizes).to(device)
state_dict_g = load_checkpoint("BigVGAN/logs/bigvgan_22k/G_140000.pth", device)
generator.load_state_dict(state_dict_g)
generator.eval()
generator.remove_weight_norm()

with torch.no_grad():
    # audio = torch.FloatTensor(converted_mel)
    # audio = audio.to(device)
    # audio = audio / 32768
    # mel = get_mel(audio.unsqueeze(0))
    audio = generator(mel_outputs)
results in
RuntimeError: Given groups=1, weight of size [512, 513, 7], expected input[1, 477, 80] to have 513 channels, but got 477 channels instead
The model that generates mel_outputs works fine with its own vocoder, which uses the same code to generate the mel spectrogram as your repo: https://github.com/intory89/StyleVC/blob/eb0bcf54bb06812e0f13788253f61f38a24585b6/vocoder/meldataset_custom.py#L50
Change the input size of the filter which you choose (linear: 513, Mel: 80) and transpose your input shape. Then, it will work.
Generator(h.data.filter_length // 2 + 1,...)
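In other words (a sketch, not the repo's exact code): the first Generator argument is the number of input channels, and the forward pass expects [batch, channels, frames], so an output shaped [1, 477, 80] needs transposing, and the channel count has to match what the checkpoint was trained on:

# mel_outputs from the VC model: [1, T, 80] (e.g. [1, 477, 80])
mel = mel_outputs.transpose(1, 2)    # -> [1, 80, T], channels first

# The Generator's first constructor argument must equal that channel count:
# 80 for a mel-spectrogram model, filter_length // 2 + 1 (513 or 1025) for a linear one.
# A checkpoint trained on linear spectrograms cannot be driven with 80-bin mels.
audio = generator(mel)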
Also, minor issues:
- prefix "dec." for each weight and bias
- there's no key 'generator' in the dictionary, but 'model'.
Thank you. I will change it soon
Thanks to your suggestions, I'm now able to train on the 24k dataset, but your comment on changing the filter length isn't clear:
Change the input size of the filter which you choose (linear: 513, Mel: 80) and transpose your input shape. Then, it will work.
Generator(h.data.filter_length // 2 + 1,...)
A BigVGAN model trained at a sampling rate of 24000 with a filter length of 2048 will have the following weight sizes: conv_pre.weight torch.Size([512, 1025, 7])
So how do we change the mel outputs from the VC model so that they are acceptable to BigVGAN's generator?
f0_feat = F0_model.get_feature_GAN(source.unsqueeze(1))
out = starganv2.generator(source.unsqueeze(1), ref, F0=f0_feat)
c = out.transpose(-1, -2).squeeze(1).to('cuda')
print(c.shape)
y_out = vocoder(c)  # fails here
y_out = y_out.view(-1).cpu()
torch.Size([1, 572, 80])
RuntimeError: Given groups=1, weight of size [512, 1025, 7], expected input[1, 572, 80] to have 1025 channels, but got 572 channels instead
I've changed this part of get_audio() in data_utils_vocoder.py from your code to:
MEL_PARAMS = {
    "sample_rate": sample_rate,
    "n_mels": 80,
    "n_fft": 2048,
    "win_length": 1200,
    "hop_length": 300
}
...
spec = torchaudio.transforms.MelSpectrogram(**MEL_PARAMS)
But when trying to train, the error is the same as during inference:
File "/home/sk/work/BigVGAN/models_bigvgan.py", line 121, in forward
x = self.conv_pre(x)
File "/home/sk/anaconda3/envs/vc/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/home/sk/anaconda3/envs/vc/lib/python3.8/site-packages/torch/nn/modules/conv.py", line 301, in forward
return self._conv_forward(input, self.weight, self.bias)
File "/home/sk/anaconda3/envs/vc/lib/python3.8/site-packages/torch/nn/modules/conv.py", line 297, in _conv_forward
return F.conv1d(input, weight, bias, self.stride,
RuntimeError: Given groups=1, weight of size [512, 1025, 7], expected input[16, 80, 32] to have 1025 channels, but got 80 channels instead
As you mentioned, the hyperparameters of the spectrogram for your VC model and the vocoder must be the same.
In this repository, I use the linear spectrogram as input, so the input size of the network is "h.data.filter_length // 2 + 1".
In your case, using a Mel-spectrogram with 80 bins, you should change the hyperparameter for your model's input size...
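For example, if both sides are meant to consume 80-bin mels at 24 kHz, one way to keep them consistent is to compute the features with the same function and parameters everywhere (a sketch using this repo's mel_spectrogram_torch, with the argument order taken from the get_mel wrapper earlier in this thread):

import torch
from mel_processing import mel_spectrogram_torch

# 24 kHz settings from the config above
n_fft, n_mels, sr, hop, win = 2048, 80, 24000, 300, 1200
fmin, fmax = 0.0, None

audio = torch.randn(1, 24000)   # stand-in for a 1-second waveform in [-1, 1]
mel = mel_spectrogram_torch(audio, n_fft, n_mels, sr, hop, win, fmin, fmax)
print(mel.shape)                # [1, 80, frames]
# Feed exactly these features to both the VC model and a BigVGAN generator whose
# first constructor argument is 80 (instead of filter_length // 2 + 1).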
Well, I think just changing the hyperparams isn't enough -- the VC model must be changed to use the linear spectrogram.
Yes... that's what I said.
All hyperparameters, including whether the model uses a Mel or linear spectrogram, must be the same.
Reading the paper, they based their model on HiFi-GAN, which uses Mel spectrograms, correct?
AFAIK, most vocoders use mel spectrograms, so it's easy to switch between different vocoders to test them out.
You are right. Most vocoders use mel-spectrograms.
Refer to this issue for the Mel-spectrogram case (https://github.com/sh-lee-prml/BigVGAN/issues/1).
There are many options for speech processing, and I usually use the linear spectrogram for end-to-end text-to-speech.
Ty for pointing me to the right issue. That's actually what I needed)
Could you provide an inference example?