kan-bayashi / ParallelWaveGAN

Unofficial Parallel WaveGAN (+ MelGAN & Multi-band MelGAN & HiFi-GAN & StyleMelGAN) with Pytorch
https://kan-bayashi.github.io/ParallelWaveGAN/
MIT License

A simple script for anyone who needs to run inference / simulate spoofed data from bonafide audio with all of the models in this repo #429

Closed v-nhandt21 closed 3 months ago

v-nhandt21 commented 3 months ago
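The script below downloads every pretrained model in PRETRAINED_MODEL_LIST, extracts log-mel features the same way as parallel_wavegan.bin.preprocess, normalizes them with each model's training statistics, and re-synthesizes the bonafide wavs into a per-model output folder: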

```python
import glob
import os

import librosa
import numpy as np
import torch
import torchaudio
import yaml
from sklearn.preprocessing import StandardScaler

from parallel_wavegan.bin.preprocess import logmelfilterbank
from parallel_wavegan.utils import load_model, read_hdf5

def get_feature(audio_file, config):
    # Load at the native sampling rate (librosa resamples to 22.05 kHz by default).
    audio, fs = librosa.load(audio_file, sr=None)

    assert len(audio.shape) == 1, f"{audio_file} seems to be a multi-channel signal."
    assert np.abs(audio).max() <= 1.0, f"{audio_file} seems to be different from 16 bit PCM."

    if config["trim_silence"]:
        audio, _ = librosa.effects.trim(
            audio,
            top_db=config["trim_threshold_in_db"],
            frame_length=config["trim_frame_size"],
            hop_length=config["trim_hop_size"],
        )

    # Resample only when the file's rate differs from the model's rate
    # (the original assert made the unconditional resample a no-op).
    if fs != config["sampling_rate"]:
        audio = librosa.resample(audio, orig_sr=fs, target_sr=config["sampling_rate"])

    # Extract log-mel features exactly as done during training preprocessing.
    mel = logmelfilterbank(
        audio,
        sampling_rate=config["sampling_rate"],
        hop_size=config["hop_size"],
        fft_size=config["fft_size"],
        win_length=config["win_length"],
        window=config["window"],
        num_mels=config["num_mels"],
        fmin=config["fmin"],
        fmax=config["fmax"],
    )
    return mel

def generate(groundtruth_path, output_path, checkpoint):
    os.makedirs(output_path, exist_ok=True)

    # The config is distributed next to the downloaded checkpoint.
    dirname = os.path.dirname(checkpoint)
    with open(os.path.join(dirname, "config.yml")) as f:
        config = yaml.load(f, Loader=yaml.Loader)

    # Restore the feature normalization statistics used at training time.
    scaler = StandardScaler()
    if config["format"] == "hdf5":
        stats_file = os.path.join(dirname, "stats.h5")
        scaler.mean_ = read_hdf5(stats_file, "mean")
        scaler.scale_ = read_hdf5(stats_file, "scale")
    elif config["format"] == "npy":
        # npy-format stats are stored as stats.npy, not stats.h5.
        stats_file = os.path.join(dirname, "stats.npy")
        scaler.mean_ = np.load(stats_file)[0].reshape(-1)
        scaler.scale_ = np.load(stats_file)[1].reshape(-1)
    else:
        raise ValueError(f"Unsupported format: {config['format']}")
    # Needed by recent scikit-learn versions when setting stats manually.
    scaler.n_features_in_ = scaler.mean_.shape[0]

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = load_model(checkpoint)
    model.remove_weight_norm()
    model = model.eval().to(device)

    for file_path in sorted(glob.glob(os.path.join(groundtruth_path, "*.wav"))):
        utt_id = os.path.splitext(os.path.basename(file_path))[0]

        mel = get_feature(file_path, config)
        mel = scaler.transform(mel)
        mel = torch.from_numpy(mel).float().to(device)

        # model.inference takes (frames, num_mels) and returns the waveform.
        with torch.no_grad():
            y = model.inference(mel).view(-1)

        # torchaudio.save expects a (channels, samples) tensor on CPU.
        torchaudio.save(
            os.path.join(output_path, utt_id + ".wav"),
            y.unsqueeze(0).cpu(),
            sample_rate=config["sampling_rate"],
        )

if __name__ == "__main__":
    from parallel_wavegan.utils import PRETRAINED_MODEL_LIST, download_pretrained_model

    for tag in PRETRAINED_MODEL_LIST:
        print("==================================> ", tag)
        # download_pretrained_model returns the local checkpoint path.
        download_path = download_pretrained_model(tag)
        print(download_path)
        generate(
            groundtruth_path="../DATA/groundtruth",
            output_path="../DATA/" + tag,
            checkpoint=download_path,
        )
```
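
As a quick sanity check, here is a minimal single-model sketch following the pretrained-model usage shown in this repo's README; the tag is just one example entry from PRETRAINED_MODEL_LIST, and the random mel tensor is a stand-in for real normalized features from get_feature + scaler.transform above:

```python
import torch
from parallel_wavegan.utils import download_pretrained_model, load_model

# Example tag; any key of PRETRAINED_MODEL_LIST works.
checkpoint = download_pretrained_model("ljspeech_parallel_wavegan.v1")

device = "cuda" if torch.cuda.is_available() else "cpu"
model = load_model(checkpoint)
model.remove_weight_norm()
model = model.eval().to(device)

# Dummy normalized log-mel input of shape (frames, num_mels=80 for this tag);
# in practice pass real normalized features instead.
mel = torch.randn(200, 80).to(device)
with torch.no_grad():
    wav = model.inference(mel).view(-1)  # (frames * hop_size,) waveform
print(wav.shape)
```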