kdrkdrkdr opened 3 months ago
I have written the code as shown below, but it is not working correctly, so I am reaching out for help.
```python
import os
import numpy as np
import librosa
import soundfile as sf  # librosa.output was removed in librosa 0.8; write audio with soundfile
import torch
import torchaudio
import yaml
from munch import Munch
from parallel_wavegan.utils import load_model
from Models.models import Generator, MappingNetwork, StyleEncoder
from Utils.JDC.model import JDCNet

SR = 24000
MEAN, STD = -4, 4  # log-mel normalisation constants from the StarGANv2-VC recipe
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

F0_model = None
vocoder = None
stargan = None
def load_wav(file_path, sr=SR):
    wav, _ = librosa.load(file_path, sr=sr)
    return wav / np.max(np.abs(wav))  # peak-normalise to [-1, 1]


def preprocess(wave):
    # 80-band log-mel spectrogram matching the training front-end
    to_mel = torchaudio.transforms.MelSpectrogram(
        n_mels=80, n_fft=2048, win_length=1200, hop_length=300)
    wave_tensor = torch.from_numpy(wave).float()
    mel_tensor = to_mel(wave_tensor)
    mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - MEAN) / STD
    return mel_tensor
def build_stargan_model(model_params):
    args = Munch(model_params)
    generator = Generator(args.dim_in, args.style_dim, args.max_conv_dim,
                          w_hpf=args.w_hpf, F0_channel=args.F0_channel)
    mapping_network = MappingNetwork(args.latent_dim, args.style_dim,
                                     args.num_domains, hidden_dim=args.max_conv_dim)
    style_encoder = StyleEncoder(args.dim_in, args.style_dim,
                                 args.num_domains, args.max_conv_dim)
    return Munch(generator=generator,
                 mapping_network=mapping_network,
                 style_encoder=style_encoder)
def load_models(config_path, model_path, f0_path, vocoder_path):
    global F0_model, vocoder, stargan

    # Load F0 model
    F0_model = JDCNet(num_class=1, seq_len=192)
    F0_model.load_state_dict(torch.load(f0_path, map_location=DEVICE)['net'])
    F0_model.eval().to(DEVICE)

    # Load vocoder
    vocoder = load_model(vocoder_path).to(DEVICE).eval()
    vocoder.remove_weight_norm()

    # Load StarGAN model
    with open(config_path) as f:
        stargan_config = yaml.safe_load(f)
    stargan = build_stargan_model(stargan_config["model_params"])
    params = torch.load(model_path, map_location=DEVICE)['model_ema']
    _ = [stargan[key].load_state_dict(params[key], strict=False) for key in stargan]
    _ = [stargan[key].eval().to(DEVICE) for key in stargan]
def voice_conversion(src_path, tgt_path, tgt_speaker=0):
    # Load and preprocess audio
    src_wav = load_wav(src_path)
    tgt_wav = load_wav(tgt_path)
    src_mel = preprocess(src_wav).to(DEVICE)
    tgt_mel = preprocess(tgt_wav).to(DEVICE)

    # Perform conversion
    with torch.no_grad():
        f0_feat = F0_model.get_feature_GAN(src_mel.unsqueeze(1))
        # The style encoder expects a domain (speaker) label as a second
        # argument, as in the original StarGANv2-VC demo; passing only the mel
        # raises a missing-argument error. tgt_speaker must be a valid domain
        # index from the config.
        tgt_label = torch.LongTensor([tgt_speaker]).to(DEVICE)
        tgt_style = stargan.style_encoder(tgt_mel.unsqueeze(1), tgt_label)
        converted_mel = stargan.generator(src_mel.unsqueeze(1), tgt_style, F0=f0_feat)
        converted_audio = vocoder.inference(converted_mel.squeeze().transpose(-1, -2))
        converted_audio = converted_audio.squeeze().cpu().numpy()
    return converted_audio, SR
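
# Hypothetical helper (not in the repo): list the valid speaker/domain indices
# that can be passed as tgt_speaker, assuming num_domains in model_params
# matches the label range the style encoder was trained with.
def list_domains(config_path):
    with open(config_path) as f:
        cfg = yaml.safe_load(f)
    return list(range(cfg["model_params"]["num_domains"]))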
if __name__ == "__main__":
    config_path = '/home/kdr/Desktop/project-elnino/emo-stargan/Configs/speaker_domain_config.yml'
    model_path = '/home/kdr/Desktop/project-elnino/emo-stargan/stargan_emo.pth'
    f0_path = '/home/kdr/Desktop/project-elnino/emo-stargan/Utils/JDC/bst.t7'
    vocoder_path = '/home/kdr/Desktop/project-elnino/emo-stargan/Utils/vocoder/checkpoint-2500000steps.pkl'

    load_models(config_path, model_path, f0_path, vocoder_path)
    print("Models loaded")

    src_path = '/home/kdr/Desktop/project-elnino/emo-stargan/Samples/1_Sad/baseline.wav'
    tgt_path = '/home/kdr/Desktop/project-elnino/emo-stargan/Samples/1_Sad/source.wav'

    converted_audio, sample_rate = voice_conversion(src_path, tgt_path)
    # librosa.output.write_wav was removed in librosa 0.8; use soundfile instead
    sf.write('converted_audio.wav', converted_audio, sample_rate)
```
This is the traceback I get (it is cut off here):

```
Traceback (most recent call last):
  File "/home/kdr/Desktop/project-elnino/emo-stargan/test.py", line 96, in <module>
```
Can you provide me with code that converts one wav file into a converted wav file, even if it has to be done an indirect way?
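For reference, this is the end-to-end usage I am aiming for. It is only a minimal sketch on top of the script above: it assumes load_models(...) has already been called with the paths from `__main__`, and `tgt_speaker=0` is a placeholder that has to be a valid domain index from speaker_domain_config.yml.

```python
# Minimal wav-to-wav usage sketch, assuming the functions above are defined
# and load_models(...) has already been called.
# tgt_speaker=0 is a placeholder; it must match a domain index the style
# encoder was trained with (see speaker_domain_config.yml).
audio, sr = voice_conversion(
    '/home/kdr/Desktop/project-elnino/emo-stargan/Samples/1_Sad/baseline.wav',  # source (content)
    '/home/kdr/Desktop/project-elnino/emo-stargan/Samples/1_Sad/source.wav',    # reference (target style)
    tgt_speaker=0,
)
sf.write('converted_audio.wav', audio, sr)
```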