suhitaghosh10 / emo-stargan

Implementation of Emo-StarGAN
MIT License
45 stars 4 forks source link

Indirect conversion? #29

Open kdrkdrkdr opened 4 weeks ago

kdrkdrkdr commented 4 weeks ago

Can you provide me with the code to convert wav to wav to do it the indirect way?

kdrkdrkdr commented 4 weeks ago

I have written the code as shown below, but it is not working correctly, so I am reaching out for help.

import os
import numpy as np
import librosa
import torch
import torchaudio
import yaml
from munch import Munch
from parallel_wavegan.utils import load_model
from Models.models import Generator, MappingNetwork, StyleEncoder
from Utils.JDC.model import JDCNet

SR = 24000
MEAN, STD = -4, 4
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

F0_model = None
vocoder = None
stargan = None

def load_wav(file_path, sr=SR):
    wav, _ = librosa.load(file_path, sr=sr)
    return wav / np.max(np.abs(wav))

def preprocess(wave):
    to_mel = torchaudio.transforms.MelSpectrogram(
        n_mels=80, n_fft=2048, win_length=1200, hop_length=300)
    wave_tensor = torch.from_numpy(wave).float()
    mel_tensor = to_mel(wave_tensor)
    mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - MEAN) / STD
    return mel_tensor

def build_stargan_model(model_params):
    args = Munch(model_params)
    generator = Generator(args.dim_in, args.style_dim, args.max_conv_dim, w_hpf=args.w_hpf, F0_channel=args.F0_channel)
    mapping_network = MappingNetwork(args.latent_dim, args.style_dim, args.num_domains, hidden_dim=args.max_conv_dim)
    style_encoder = StyleEncoder(args.dim_in, args.style_dim, args.num_domains, args.max_conv_dim)

    return Munch(generator=generator, mapping_network=mapping_network, style_encoder=style_encoder)

def load_models(config_path, model_path, f0_path, vocoder_path):
    global F0_model, vocoder, stargan

    # Load F0 model
    F0_model = JDCNet(num_class=1, seq_len=192)
    F0_model.load_state_dict(torch.load(f0_path, map_location=DEVICE)['net'])
    F0_model.eval().to(DEVICE)

    # Load vocoder
    vocoder = load_model(vocoder_path).to(DEVICE).eval()
    vocoder.remove_weight_norm()

    # Load StarGAN model
    with open(config_path) as f:
        stargan_config = yaml.safe_load(f)

    stargan = build_stargan_model(stargan_config["model_params"])
    params = torch.load(model_path, map_location=DEVICE)['model_ema']
    _ = [stargan[key].load_state_dict(params[key], strict=False) for key in stargan]
    _ = [stargan[key].eval().to(DEVICE) for key in stargan]

def voice_conversion(src_path, tgt_path):
    # Load and preprocess audio
    src_wav = load_wav(src_path)
    tgt_wav = load_wav(tgt_path)

    src_mel = preprocess(src_wav).to(DEVICE)
    tgt_mel = preprocess(tgt_wav).to(DEVICE)

    # Perform conversion
    with torch.no_grad():
        f0_feat = F0_model.get_feature_GAN(src_mel.unsqueeze(1))
        tgt_style = stargan.style_encoder(tgt_mel.unsqueeze(1))

        converted_mel = stargan.generator(src_mel.unsqueeze(1), tgt_style, F0=f0_feat)

        converted_audio = vocoder.inference(converted_mel.squeeze().transpose(-1, -2))
        converted_audio = converted_audio.squeeze().cpu().numpy()

    return converted_audio, SR

if __name__ == "__main__":
    config_path = '/home/kdr/Desktop/project-elnino/emo-stargan/Configs/speaker_domain_config.yml'
    model_path = '/home/kdr/Desktop/project-elnino/emo-stargan/stargan_emo.pth'
    f0_path = '/home/kdr/Desktop/project-elnino/emo-stargan/Utils/JDC/bst.t7'
    vocoder_path = '/home/kdr/Desktop/project-elnino/emo-stargan/Utils/vocoder/checkpoint-2500000steps.pkl'

    load_models(config_path, model_path, f0_path, vocoder_path)

    print("Hello")

    src_path = '/home/kdr/Desktop/project-elnino/emo-stargan/Samples/1_Sad/baseline.wav'
    tgt_path = '/home/kdr/Desktop/project-elnino/emo-stargan/Samples/1_Sad/source.wav'

    converted_audio, sample_rate = voice_conversion(src_path, tgt_path)
    librosa.output.write_wav('converted_audio.wav', converted_audio, sample_rate)

Traceback (most recent call last): File "/home/kdr/Desktop/project-elnino/emo-stargan/test.py", line 96, in converted_audio, sample_rate = voice_conversion(src_path, tgt_path) File "/home/kdr/Desktop/project-elnino/emo-stargan/test.py", line 73, in voice_conversion tgt_style = stargan.style_encoder(tgt_mel.unsqueeze(1)) File "/home/kdr/anaconda3/envs/elnino/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl return self._call_impl(*args, *kwargs) File "/home/kdr/anaconda3/envs/elnino/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl return forward_call(args, **kwargs) TypeError: forward() missing 1 required positional argument: 'y'