auspicious3000 / SpeechSplit

Unsupervised Speech Decomposition Via Triple Information Bottleneck
http://arxiv.org/abs/2004.11284
MIT License
636 stars 92 forks source link

I wrote the following code, but it produces the wrong pkl! Who can help me? #20

Closed c1a1o1 closed 3 years ago

c1a1o1 commented 3 years ago

Extract spectrogram and f0: python make_spect_f0.py

Generate training metadata: python make_metadata.py

My code is based on the above steps! Who can help me?

import os
import sys
import pickle
from collections import OrderedDict

import numpy as np
import soundfile as sf
import torch
from librosa.filters import mel
from numpy.random import RandomState
from pysptk import sptk
from scipy import signal

from autovc.model_bl import D_VECTOR
from utils import butter_highpass
from utils import pySTFT
from utils import speaker_normalization

# Mel filterbank, transposed so a spectrogram can be right-multiplied by it
# (see np.dot(D, mel_basis) further down).
mel_basis = mel(16000, 1024, fmin=90, fmax=7600, n_mels=80).T
# Amplitude floor: exp(-100/20 * ln 10) == 10**-5, i.e. -100 dB.
min_level = np.exp(-100 / 20 * np.log(10))
# High-pass filter coefficients applied to every waveform before analysis.
# NOTE(review): presumably a 30 Hz cutoff at a 16 kHz sampling rate -- confirm
# against utils.butter_highpass.
b, a = butter_highpass(30, 16000, order=5)

# Speaker encoder used to embed a cropped mel-spectrogram; inference only.
C = D_VECTOR(dim_input=80, dim_cell=768, dim_emb=256).eval().cuda()
c_checkpoint = torch.load('assets/3000000-BL.ckpt')
# Drop the first 7 characters of every parameter name before loading.
# NOTE(review): presumably strips a "module." prefix left by DataParallel --
# confirm against the checkpoint.
new_state_dict = OrderedDict(
    (key[7:], val) for key, val in c_checkpoint['model_b'].items()
)
C.load_state_dict(new_state_dict)

# Number of utterances sampled per speaker, and crop length (in frames) fed
# to the speaker encoder.
num_uttrs = 1
len_crop = 128

# Speaker -> gender ('M' / 'F') lookup used to pick the F0 search range.
# pickle can execute arbitrary code on load: only use a trusted file here.
with open('assets/spk2gen.pkl', "rb") as spk2gen_file:
    spk2gen = pickle.load(spk2gen_file)

# Modify as needed
rootDir = 'assets/wavs'
targetDir_f0 = 'assets/raptf0'
targetDir = 'assets/spmel'

def _load_spmel(speaker, wav_name):
    """Load the .npy array stored under targetDir for the given wav file name.

    Only the file extension is swapped, so a name that merely contains the
    substring "wav" elsewhere (e.g. "wave_01.wav") is handled correctly.
    """
    stem = os.path.splitext(wav_name)[0]
    return np.load(os.path.join(targetDir, speaker, stem + '.npy'))


# Each immediate sub-directory of rootDir is treated as one speaker.
dirName, subdirList, _ = next(os.walk(rootDir))
print('Found directory: %s' % dirName)

speakers = []
for subdir in sorted(subdirList):
    print(subdir)

    os.makedirs(os.path.join(targetDir, subdir), exist_ok=True)
    os.makedirs(os.path.join(targetDir_f0, subdir), exist_ok=True)

    # F0 search range (Hz) depends on the speaker's gender.
    if spk2gen[subdir] == 'M':
        lo, hi = 50, 250
    elif spk2gen[subdir] == 'F':
        lo, hi = 100, 600
    else:
        raise ValueError(
            'Unknown gender %r for speaker %s' % (spk2gen[subdir], subdir)
        )

    utterances = []
    utterances.append(subdir)
    _, _, fileList = next(os.walk(os.path.join(dirName, subdir)))

    # Layout described by the original author:
    #   [Speaker_Name , One-hot , [Mel, normed-F0, length, utterance_name] ]
    # NOTE(review): the values actually stored below do not match that layout:
    # the sampled utterance indices sit where "One-hot" is listed, the
    # D_VECTOR embedding sits where "Mel" is listed, the speaker directory
    # name sits where "utterance_name" is listed, and the spectrogram S that
    # is computed further down is never stored. This is a likely cause of a
    # "wrong pkl"; left unchanged here -- confirm the format expected by
    # whatever consumes train.pkl.
    assert len(fileList) >= num_uttrs
    idx_uttrs = np.random.choice(len(fileList), size=num_uttrs, replace=False)
    # Appended by reference: replacements made below (for too-short clips)
    # are reflected in the stored indices.
    utterances.append(idx_uttrs)

    # Per-speaker RNG seeded from the directory name.
    # Assumes names are one prefix character followed by an integer.
    prng = RandomState(int(subdir[1:]))

    # Utterances not initially sampled; used as replacements for short clips.
    candidates = np.delete(np.arange(len(fileList)), idx_uttrs)

    for i in range(num_uttrs):
        wav_name = fileList[idx_uttrs[i]]
        tmp = _load_spmel(subdir, wav_name)

        # choose another utterance if the current one is too short
        while tmp.shape[0] < len_crop:
            if candidates.size == 0:
                raise ValueError(
                    'Speaker %s has no utterance with at least %d frames'
                    % (subdir, len_crop)
                )
            idx_alt = np.random.choice(candidates)
            candidates = candidates[candidates != idx_alt]
            idx_uttrs[i] = idx_alt
            wav_name = fileList[idx_alt]
            tmp = _load_spmel(subdir, wav_name)

        # Valid crop offsets are 0 .. T - len_crop inclusive; randint's upper
        # bound is exclusive, hence the + 1 (also covers T == len_crop).
        left = np.random.randint(0, tmp.shape[0] - len_crop + 1)
        melsp = torch.from_numpy(tmp[np.newaxis, left:left+len_crop, :]).cuda()
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            emb = C(melsp)
        embs = [emb.squeeze().cpu().numpy()]

        # read audio file
        x, fs = sf.read(os.path.join(dirName, subdir, wav_name))
        assert fs == 16000
        # Pad one tiny sample when the length is an exact multiple of 256
        # (the hop size passed to RAPT below).
        # NOTE(review): presumably keeps frame counts consistent -- confirm.
        if x.shape[0] % 256 == 0:
            x = np.concatenate((x, np.array([1e-06])), axis=0)
        y = signal.filtfilt(b, a, x)
        # Scale down and add a tiny amount of reproducible noise.
        wav = y * 0.96 + (prng.rand(y.shape[0]) - 0.5) * 1e-06

        # compute spectrogram
        # NOTE(review): S is computed but never stored or saved; see the
        # layout note above.
        D = pySTFT(wav).T
        D_mel = np.dot(D, mel_basis)
        D_db = 20 * np.log10(np.maximum(min_level, D_mel)) - 16
        S = (D_db + 100) / 100

        # extract f0
        f0_rapt = sptk.rapt(wav.astype(np.float32) * 32768, fs, 256, min=lo, max=hi, otype=2)
        # Frames equal to -1e10 are excluded from the statistics.
        index_nonzero = (f0_rapt != -1e10)
        mean_f0, std_f0 = np.mean(f0_rapt[index_nonzero]), np.std(f0_rapt[index_nonzero])
        f0_norm = speaker_normalization(f0_rapt, index_nonzero, mean_f0, std_f0)

        embs.append(f0_norm)
        embs.append(tmp.shape[0])
        embs.append(subdir)

        # One record per sampled utterance. The original appended after the
        # loop, which kept only the last record when num_uttrs > 1; the
        # output is identical for num_uttrs == 1.
        utterances.append(tuple(embs))

    speakers.append(utterances)

# Serialize the collected per-speaker metadata next to the input wavs.
out_path = os.path.join(rootDir, 'train.pkl')
with open(out_path, 'wb') as handle:
    pickle.dump(speakers, handle)