Open SandyPanda-MLDL opened 2 months ago
Corrected MCD code
import os import math import glob import librosa import pyworld import pysptk import numpy as np import matplotlib.pyplot as plot
# Analysis settings for the MCEP extraction / MCD computation in this script.
alpha = 0.65  # all-pass constant; same value as MCEP()'s own default
fft_size = 512  # forwarded by main() to MCEP() as the WORLD analysis FFT size
mcep_size = 24  # forwarded by main() to MCEP() as the mel-cepstrum order
SAMPLING_RATE = 24000  # rate every wav is resampled to before analysis
FRAME_PERIOD = 5.0  # frame shift handed to pyworld (milliseconds per pyworld docs)
def load_wav(wavfile, sr):
    """Load an audio file as a mono waveform resampled to ``sr``.

    Args:
        wavfile: Path of the audio file to read.
        sr: Target sampling rate handed to ``librosa.load``.

    Returns:
        The waveform returned by ``librosa.load``; the sampling rate that
        ``librosa.load`` returns alongside it is discarded.
    """
    # librosa.load returns (waveform, sampling_rate); only the waveform is used.
    wav, _ = librosa.load(wavfile, sr=sr, mono=True)
    return wav
def MCD(x, y):
    """Return the mel-cepstral distortion (in dB) between two frames.

    Evaluates ``(10 / ln 10) * sqrt(2) * ||x - y||_2``.

    Args:
        x: 1-D array of cepstral coefficients for one frame.
        y: 1-D array of the same length as ``x``.

    Returns:
        The distortion as a float; ``0.0`` when both frames are equal.
    """
    log_spec_dB_const = 10.0 / math.log(10.0) * math.sqrt(2.0)
    diff = x - y
    # np.inner(diff, diff) is the squared Euclidean norm of the difference.
    return log_spec_dB_const * math.sqrt(np.inner(diff, diff))
def MCEP(wavfile, mcep_target_directory, alpha=0.65, fft_size=512, mcep_size=24):
    """Extract the mel-cepstrum of one audio file and save it as ``.npy``.

    The file is loaded at ``SAMPLING_RATE``, analysed with
    ``pyworld.wav2world`` and the resulting spectral envelope is converted
    with ``pysptk.sptk.mcep``. The result is written to
    ``<mcep_target_directory>/<name>.npy`` where ``<name>`` is the part of
    the file's base name before its first ``.``.

    Args:
        wavfile: Path of the audio file to analyse.
        mcep_target_directory: Output directory; created when missing.
        alpha: Forwarded to ``pysptk.sptk.mcep`` as ``alpha``.
        fft_size: Forwarded to ``pyworld.wav2world`` as ``fft_size``.
        mcep_size: Forwarded to ``pysptk.sptk.mcep`` as ``order``.
    """
    os.makedirs(mcep_target_directory, exist_ok=True)

    loaded_wav_file = load_wav(wavfile, sr=SAMPLING_RATE)

    # wav2world returns three values; only the middle one (the spectral
    # envelope) is needed here.
    _, spectral_envelop, _ = pyworld.wav2world(
        loaded_wav_file.astype(np.double),
        fs=SAMPLING_RATE,
        frame_period=FRAME_PERIOD,
        fft_size=fft_size,
    )

    mcep = pysptk.sptk.mcep(
        spectral_envelop,
        order=mcep_size,
        alpha=alpha,
        maxiter=0,
        etype=1,
        eps=1.0E-8,
        min_det=0.0,
        itype=3,
    )

    fname = os.path.basename(wavfile).split('.')[0]
    np.save(os.path.join(mcep_target_directory, fname + '.npy'), mcep, allow_pickle=False)
def mcd_cal(mcep_org_files, mcep_synth_files, MCD):
    """Accumulate DTW-based MCD statistics over matching MCEP files.

    An original file and a synthesized file are paired when the first and
    the last ``_``-separated fields of their base names are equal. For each
    pair the MCEP arrays are loaded, their first column is dropped, and
    ``librosa.sequence.dtw`` is run with ``MCD`` as the frame metric.

    Args:
        mcep_org_files: Paths of the ``.npy`` MCEP files of the original speech.
        mcep_synth_files: Paths of the ``.npy`` MCEP files of the synthesized speech.
        MCD: Callable ``(x, y) -> float`` used as the DTW metric.

    Returns:
        ``(mcd_mean, mcd_std, total_frames)`` where ``total_frames`` is the
        summed frame count of the paired original files, and the other two
        are the summed per-pair ``np.mean`` / ``np.std`` of the DTW cost
        matrix divided by ``total_frames``.

    Raises:
        ValueError: If no original/synthesized pair matched, so there are no
            frames to average over.
    """
    def _pair_key(path):
        # (first field, last field) of the "_"-separated base name.
        fields = os.path.basename(path).split('_')
        return fields[0], fields[-1]

    min_cost_tot = 0.0
    mcd_std_tot = 0.0
    total_frames = 0

    # Parse the synthesized names once instead of once per original file.
    synth_entries = [(_pair_key(synth_path), synth_path) for synth_path in mcep_synth_files]

    for org_path in mcep_org_files:
        org_key = _pair_key(org_path)
        for synth_key, synth_path in synth_entries:
            if synth_key != org_key:
                continue

            org_mcep_npy = np.load(org_path)
            synth_mcep_npy = np.load(synth_path)

            # Column 0 is excluded from the distance (presumably the energy
            # coefficient c0 -- confirm against the MCEP extraction).
            min_cost, _ = librosa.sequence.dtw(
                org_mcep_npy[:, 1:].T,
                synth_mcep_npy[:, 1:].T,
                metric=MCD,
            )

            # NOTE(review): these are statistics of the whole accumulated
            # cost matrix, later normalised by the total frame count; confirm
            # this is the intended MCD definition.
            min_cost_tot += np.mean(min_cost)
            mcd_std_tot += np.std(min_cost)
            total_frames += len(org_mcep_npy)

    if total_frames == 0:
        raise ValueError(
            "No matching original/synthesized MCEP files were found; "
            "cannot compute MCD."
        )

    # Calculate mean and standard deviation
    mcd_mean = min_cost_tot / total_frames
    mcd_std = mcd_std_tot / total_frames
    return mcd_mean, mcd_std, total_frames
def main():
    """Extract MCEPs for original and synthesized speech and print MCD statistics.

    Every ``*.wav`` directly under the two hard-coded directories is
    converted with ``MCEP`` into a ``.npy`` file in a sub-directory
    (``mceps_trg`` / ``mceps_conv``); the resulting files are then compared
    with ``mcd_cal`` and the mean, standard deviation and frame count are
    printed.
    """
    ORIGINAL_PATH = '/hdd2/Sandipan/SDhar-Projects/StyleTTS2-new/ESD_Objective_Evaluation/Sad/'
    SYNTHESIZED_PATH = '/hdd2/Sandipan/SDhar-Projects/StyleTTS2-new/ESD_Objective_Evaluation/Sad_Synthesized_Baseline/'
    # Alternative synthesized set:
    # /hdd2/Sandipan/SDhar-Projects/StyleTTS2-new/Glow_Styletts2_synthesized

    # Directories for MCEP files
    dir_org_speech_mcep = os.path.join(ORIGINAL_PATH, 'mceps_trg')
    dir_converted_speech_mcep = os.path.join(SYNTHESIZED_PATH, 'mceps_conv')

    # Create MCEP directories if they do not exist
    os.makedirs(dir_org_speech_mcep, exist_ok=True)
    os.makedirs(dir_converted_speech_mcep, exist_ok=True)

    # List of WAV files
    dir_org_speech_wav = glob.glob(os.path.join(ORIGINAL_PATH, '*.wav'))
    dir_converted_speech_wav = glob.glob(os.path.join(SYNTHESIZED_PATH, '*.wav'))

    # Compute MCEP for original and synthesized speeches
    for wav in dir_org_speech_wav:
        MCEP(wav, dir_org_speech_mcep, fft_size=fft_size, mcep_size=mcep_size)
    for wav in dir_converted_speech_wav:
        MCEP(wav, dir_converted_speech_mcep, fft_size=fft_size, mcep_size=mcep_size)

    # List of MCEP files (everything found in the two MCEP directories)
    org_file = glob.glob(os.path.join(dir_org_speech_mcep, '*'))
    synth_file = glob.glob(os.path.join(dir_converted_speech_mcep, '*'))

    # Frame-level cost used by the DTW alignment inside mcd_cal
    cost_function = MCD

    # Calculate MCD
    mcd_mean, mcd_std, frames_used = mcd_cal(org_file, synth_file, cost_function)

    # Print results
    print(f'MCD Mean = {mcd_mean:.2f} dB')
    print(f'MCD Standard Deviation = {mcd_std:.2f} dB')
    print(f'Total number of frames = {frames_used}')
# Run the MCD evaluation only when this file is executed as a script.
if __name__ == "__main__":
    main()
# Corrected logf0rmse
import os
import math
import glob
import librosa
import pyworld
import pysptk
import numpy as np
import matplotlib.pyplot as plot

# Analysis settings for the log-F0 RMSE evaluation.
sampling_rate = 16000
num_mcep = 24
frame_period = 5.0
n_frames = 128

# Waveforms of the paired original / synthesized utterances, filled in below.
wavs_org = []
wavs_synth = []

ORIGINAL_PATH = '/hdd2/Sandipan/SDhar-Projects/StyleTTS2-new/ESD_Objective_Evaluation/Sad'
SYNTHESIZED_PATH = '/hdd2/Sandipan/SDhar-Projects/StyleTTS2-new/ESD_Objective_Evaluation/Sad_Synthesized_Baseline'

# Sorted so that the same index refers to the same utterance in both lists.
Org_paths = sorted(glob.glob(ORIGINAL_PATH + '/*.wav'))
Synth_paths = sorted(glob.glob(SYNTHESIZED_PATH + '/*.wav'))
def load_wav(wav_file, sr):
    """Load an audio file as a mono waveform resampled to ``sr``.

    Args:
        wav_file: Path of the audio file to read.
        sr: Target sampling rate handed to ``librosa.load``.

    Returns:
        The waveform returned by ``librosa.load``; the sampling rate that
        ``librosa.load`` returns alongside it is discarded.
    """
    wav, _ = librosa.load(wav_file, sr=sr, mono=True)
    return wav
# Every original utterance must have a synthesized counterpart with the same
# file name at the same position of the sorted lists. Validate all pairs
# before loading any audio.
if len(Synth_paths) < len(Org_paths):
    raise ValueError(
        f"Found {len(Org_paths)} original but only {len(Synth_paths)} synthesized wav files"
    )
for org_path, synth_path in zip(Org_paths, Synth_paths):
    if os.path.basename(org_path) != os.path.basename(synth_path):
        raise ValueError(
            f"File name mismatch between {org_path} and {synth_path}"
        )

# Load each validated pair at the common sampling rate.
for org_path, synth_path in zip(Org_paths, Synth_paths):
    wavs_org.append(load_wav(wav_file=org_path, sr=sampling_rate))
    wavs_synth.append(load_wav(wav_file=synth_path, sr=sampling_rate))
def world_encode_data(wavs, fs, frame_period=5.0, coded_dim=24):
    """Estimate F0 with ``pyworld.harvest`` and return one log-F0 track per waveform.

    Args:
        wavs: Sequence of waveforms (NumPy arrays); each is cast to float64
            before analysis.
        fs: Sampling rate handed to ``pyworld.harvest``.
        frame_period: Frame shift handed to ``pyworld.harvest``.
        coded_dim: Unused; kept so existing calls keep working.

    Returns:
        A list with one ``np.ma.log(f0)`` masked array per input waveform,
        in input order. ``np.ma.log`` masks entries whose logarithm is
        undefined (e.g. ``f0 == 0``).
    """
    log_f0s = []
    for wav in wavs:
        f0, _ = pyworld.harvest(
            wav.astype(np.float64),
            fs,
            frame_period=frame_period,
            f0_floor=71.0,
            f0_ceil=800.0,
        )
        log_f0s.append(np.ma.log(f0))
    return log_f0s
f0s_org = world_encode_data(wavs=wavs_org, fs=sampling_rate, frame_period=frame_period, coded_dim=num_mcep)
f0s_synth = world_encode_data(wavs=wavs_synth, fs=sampling_rate, frame_period=frame_period, coded_dim=num_mcep)


def _make_logf0_rmse(frame_count):
    """Build the DTW cost function for one utterance pair.

    Args:
        frame_count: Number of frames used to scale the distance.

    Returns:
        A callable ``(x, y) -> float`` computing ``||x - y||_2 / frame_count``.
    """
    scale = 1 / frame_count

    def logf0_rmse(x, y):
        diff = x - y
        return scale * math.sqrt(np.inner(diff, diff))

    return logf0_rmse


min_cost_tot = []
for log_f0_org, log_f0_synth in zip(f0s_org, f0s_synth):
    # The distance is scaled by the length of the shorter of the two tracks.
    cost_function = _make_logf0_rmse(min(len(log_f0_org), len(log_f0_synth)))
    min_cost, _ = librosa.sequence.dtw(log_f0_org.T, log_f0_synth.T, metric=cost_function)
    min_cost_tot.append(np.mean(min_cost))

if not min_cost_tot:
    raise ValueError("No utterance pairs were loaded; cannot compute logF0 RMSE.")

logF0RMSE = sum(min_cost_tot) / len(min_cost_tot)
std = np.std(min_cost_tot)
print(f"logF0_RMSE = {logF0RMSE} and std {std}")
Corrected MSD
from scipy.io import wavfile import pysptk from pysptk.synthesis import Synthesizer, MLSADF import pyworld from os.path import join, basename import math
from nnmnkwii import preprocessing as P
from nnmnkwii.paramgen import unit_variance_mlpg_matrix
import gantts
from hparams import vc as hp
import librosa import librosa.display import IPython from IPython.display import Audio import os import numpy as np import glob from tqdm import tnrange
# Analysis settings for the modulation-spectrum (MSD) evaluation.
fs = 16000
frame_period = 5.0
# frame_period * 0.001 converts the frame shift to seconds; times fs gives samples.
hop_length = int(fs * (frame_period * 0.001))
ms_fftlen = 4096
# Frame rate of the feature sequence (frames per second).
modfs = fs / hop_length
# Frequency of each of the ms_fftlen // 2 + 1 bins, from 0 up to modfs / 2.
ms_freq = np.arange(ms_fftlen // 2 + 1) / ms_fftlen * modfs
def dynamic_time_warping(a, b):
    """Align two sequences with ``librosa.sequence.dtw`` using the Euclidean metric.

    Args:
        a: First sequence, passed to ``librosa.sequence.dtw`` unchanged.
        b: Second sequence, passed to ``librosa.sequence.dtw`` unchanged.

    Returns:
        The ``(d, wp)`` pair returned by ``librosa.sequence.dtw``.
    """
    d, wp = librosa.sequence.dtw(a, b, metric='euclidean')
    return d, wp
def compute_static_features(path):
    """Return the mel-cepstrum of a wav file without its first coefficient.

    The file is read with ``scipy.io.wavfile.read``, analysed with
    ``pyworld`` (``dio`` followed by ``stonemask`` for F0, then
    ``cheaptrick`` for the spectrogram) and converted with ``pysptk.sp2mc``
    (order 24, ``alpha`` chosen by ``pysptk.util.mcepalpha`` for the file's
    sampling rate).

    Args:
        path: Path of the wav file to analyse.

    Returns:
        The ``sp2mc`` output with column 0 removed.
    """
    # The sampling rate comes from the file itself, not from the module constant.
    fs, x = wavfile.read(path)
    x = x.astype(np.float64)

    f0, timeaxis = pyworld.dio(x, fs, frame_period=5.0)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)

    alpha = pysptk.util.mcepalpha(fs)
    mc = pysptk.sp2mc(spectrogram, order=24, alpha=alpha)
    return mc[:, 1:]
# NOTE(review): only the signature of modspec survives in this paste; its body
# is missing and has to be restored from the original script.
def modspec(x, n=4096, norm=None, return_phase=False):
# NOTE(review): mean_modspec appears truncated as well: `mss` is never filled
# and nothing is returned, although the script below indexes the result as a
# 2-D array. The statements have also lost their line breaks and indentation.
def mean_modspec(path): mss = [] for wav in path: mgc = compute_static_features(wav)
print(mgc)
# Directories holding the reference and the synthesized utterances.
orig_path = '/hdd2/Sandipan/SDhar-Projects/StyleTTS2-new/Objective_Evaluation/Original'
synth_path = '/hdd2/Sandipan/SDhar-Projects/StyleTTS2-new/Objective_Evaluation/Proposed'

orig_wav_files = glob.glob(orig_path + '/*.wav')
synth_wav_files = glob.glob(synth_path + '/*.wav')

# Result of mean_modspec for each set of files.
ms_into2out_orig = mean_modspec(orig_wav_files)
ms_into2out_synth = mean_modspec(synth_wav_files)
new = 0
msd_list = []
for i in range(24):
    a = ms_into2out_orig[i, :].T
    b = ms_into2out_synth[i, :].T
    d, wp = dynamic_time_warping(a, b)
    # NOTE(review): nothing in the visible code adds to `new` or appends to
    # `msd_list`, so MSD below is always 0 and the std is taken over an empty
    # list. The accumulation step seems to be missing from this paste.

# Calculate mean squared difference (MSD) based on DTW-aligned sequences
# Total number of frames are 24
# Reuse the result computed earlier instead of running mean_modspec over all
# original files a second time.
MSD = math.sqrt(1 / len(ms_into2out_orig.T)) * math.sqrt(new / 24)
MSD_std = np.std(msd_list)
print(f'MSD is {MSD} and std is {MSD_std}')