# Corrected MSD

import glob
import math
import os
from os.path import basename, join

import IPython
import librosa
import librosa.display
import numpy as np
import pysptk
import pyworld
from IPython.display import Audio
from nnmnkwii import preprocessing as P
from nnmnkwii.paramgen import unit_variance_mlpg_matrix
from pysptk.synthesis import MLSADF, Synthesizer
from scipy.io import wavfile
from tqdm import tnrange

import gantts
from hparams import vc as hp

# Analysis settings for the modulation-spectrum (MSD) evaluation.
fs = 16000
frame_period = 5.0  # milliseconds; converted to seconds by the 0.001 factor below
hop_length = int(fs * (frame_period * 0.001))  # samples per frame shift
ms_fftlen = 4096  # FFT length used when computing modulation spectra
modfs = fs / hop_length  # number of frames per second
# Frequency of each of the ms_fftlen // 2 + 1 rfft bins, from 0 up to modfs / 2.
ms_freq = np.arange(ms_fftlen // 2 + 1) / ms_fftlen * modfs

def dynamic_time_warping(a, b):
    """Align ``a`` and ``b`` with librosa's DTW using the Euclidean metric.

    Returns:
        The ``(d, wp)`` pair produced by ``librosa.sequence.dtw``: the
        accumulated cost matrix and the warping path.
    """
    d, wp = librosa.sequence.dtw(a, b, metric='euclidean')
    return d, wp

def compute_static_features(path):
    """Extract mel-cepstral coefficients (without the 0th one) from a WAV file.

    The file is read with ``scipy.io.wavfile``, analysed with WORLD
    (DIO + StoneMask for F0, CheapTrick for the spectral envelope) using a
    5 ms frame period, and the envelope is converted to an order-24
    mel-cepstrum with the all-pass constant chosen by
    ``pysptk.util.mcepalpha`` for the file's sampling rate.

    Args:
        path: Path of the WAV file to analyse.

    Returns:
        2-D array holding columns ``1:`` of the mel-cepstrum, i.e. every
        coefficient except the 0th.
    """
    # The sampling rate comes from the file itself; this local `fs`
    # intentionally shadows any module-level value.
    fs, x = wavfile.read(path)
    x = x.astype(np.float64)

    f0, timeaxis = pyworld.dio(x, fs, frame_period=5.0)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    # Aperiodicity is not needed for the returned features, so it is not
    # computed here.

    alpha = pysptk.util.mcepalpha(fs)
    mc = pysptk.sp2mc(spectrogram, order=24, alpha=alpha)
    return mc[:, 1:]

def modspec(x, n=4096, norm=None, return_phase=False):
    """Compute the power spectrum of ``x`` along its first axis.

    Args:
        x: Array whose axis 0 is transformed with a real FFT.
        n: FFT length; the result has ``n // 2 + 1`` rows.
        norm: Normalisation mode forwarded to ``numpy.fft.rfft``.
        return_phase: When true, also return the unit-magnitude complex
            phase factor ``exp(1j * angle(S))``.

    Returns:
        ``ms`` (squared magnitude of the spectrum), or the tuple
        ``(ms, phase)`` when ``return_phase`` is true.
    """
    # DFT against time axis
    s_complex = np.fft.rfft(x, n=n, axis=0, norm=norm)
    assert s_complex.shape[0] == n // 2 + 1
    R, im = s_complex.real, s_complex.imag
    ms = R * R + im * im

    if return_phase:
        return ms, np.exp(1.0j * np.angle(s_complex))
    return ms

def mean_modspec(path):
    """Average the log modulation spectrum over a collection of WAV files.

    For every file the static features are extracted with
    ``compute_static_features`` and their modulation spectrum is computed
    with ``modspec(..., n=ms_fftlen)``; the natural log of each spectrum is
    then averaged element-wise across files.

    Args:
        path: Iterable of WAV file paths.

    Returns:
        Array with the same shape as a single log modulation spectrum.

    Raises:
        ValueError: If ``path`` contains no files.
    """
    wav_paths = list(path)
    if not wav_paths:
        raise ValueError("mean_modspec() needs at least one WAV file")

    mss = [
        np.log(modspec(compute_static_features(wav), n=ms_fftlen))
        for wav in wav_paths
    ]
    # The average is taken once, after every file has been processed, so
    # all files contribute to the result.
    return np.mean(np.array(mss), axis=0)

# Directories holding the reference and the synthesized utterances.
orig_path = '/hdd2/Sandipan/SDhar-Projects/StyleTTS2-new/Objective_Evaluation/Original'
synth_path = '/hdd2/Sandipan/SDhar-Projects/StyleTTS2-new/Objective_Evaluation/Proposed'

# Every WAV file directly inside each directory.
orig_wav_files = glob.glob(orig_path + '/*.wav')
synth_wav_files = glob.glob(synth_path + '/*.wav')

# Mean log modulation spectrum of each set.
ms_into2out_orig = mean_modspec(orig_wav_files)
ms_into2out_synth = mean_modspec(synth_wav_files)

# Scale factor sqrt(1 / T), with T = len(ms_into2out_orig.T).  It is read
# from the spectrum that has already been computed instead of re-running
# mean_modspec() over every WAV file on each loop iteration.
msd_scale = math.sqrt(1 / len(ms_into2out_orig.T))

new = 0
msd_list = []
# NOTE(review): axis 0 of the mean modulation spectrum is the rfft bin axis,
# so row i is the i-th modulation-frequency bin -- confirm that comparing the
# first 24 rows (rather than 24 columns) is what is intended.
for i in range(24):
    a = ms_into2out_orig[i, :].T
    b = ms_into2out_synth[i, :].T
    d, wp = dynamic_time_warping(a, b)

    # Calculate mean squared difference (MSD) based on DTW-aligned sequences.
    # The warping path has shape (N, 2): column 0 indexes `a`, column 1
    # indexes `b`, so the columns (not the first row) select aligned samples.
    aligned_a = a[wp[:, 0]]
    aligned_b = b[wp[:, 1]]
    diff_act = np.mean(np.square(aligned_a - aligned_b))

    # diff_act is a scalar, so the inner product is simply its square.
    diff = np.inner(diff_act, diff_act)
    new = new + diff
    msd_list.append(msd_scale * math.sqrt(diff_act))

# Average over the 24 compared rows.
MSD = msd_scale * math.sqrt(new / 24)
MSD_std = np.std(msd_list)
print(f'MSD is {MSD} and std is {MSD_std}')

# MCD Corrected code

import glob
import math
import os

import librosa
import matplotlib.pyplot as plot
import numpy as np
import pysptk
import pyworld

# Parameters
alpha = 0.65
fft_size = 512
mcep_size = 24
SAMPLING_RATE = 24000  # rate every file is loaded at before analysis
FRAME_PERIOD = 5.0  # frame period handed to pyworld.wav2world

def load_wav(wavfile, sr):
    """Load an audio file as a mono waveform at sampling rate ``sr``.

    Args:
        wavfile: Path of the audio file.
        sr: Target sampling rate passed to ``librosa.load``.

    Returns:
        The waveform array; the sampling rate returned by librosa is dropped.
    """
    wav, _ = librosa.load(wavfile, sr=sr, mono=True)
    return wav

def MCD(x, y):
    """Return the mel-cepstral distortion between two coefficient vectors.

    Computes ``(10 / ln 10) * sqrt(2) * ||x - y||``, i.e. the Euclidean
    distance between ``x`` and ``y`` scaled to decibels.

    Args:
        x: 1-D array of cepstral coefficients.
        y: 1-D array of the same length as ``x``.

    Returns:
        The distortion as a Python float.
    """
    log_spec_dB_const = 10.0 / math.log(10.0) * math.sqrt(2.0)
    diff = x - y
    return log_spec_dB_const * math.sqrt(np.inner(diff, diff))

def MCEP(wavfile, mcep_target_directory, alpha=0.65, fft_size=512, mcep_size=24):
    """Compute the mel-cepstrum of one audio file and save it as ``.npy``.

    The file is loaded at ``SAMPLING_RATE``, analysed with
    ``pyworld.wav2world`` (frame period ``FRAME_PERIOD``) and the resulting
    spectral envelope is converted with ``pysptk.sptk.mcep``.  The output is
    written to ``<mcep_target_directory>/<name>.npy`` where ``<name>`` is the
    file's base name up to its first dot.

    Args:
        wavfile: Path of the audio file to analyse.
        mcep_target_directory: Output directory; created if missing.
        alpha: All-pass constant forwarded to ``pysptk.sptk.mcep``.
        fft_size: FFT size forwarded to ``pyworld.wav2world``.
        mcep_size: Mel-cepstrum order forwarded to ``pysptk.sptk.mcep``.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(mcep_target_directory, exist_ok=True)

    loaded_wav_file = load_wav(wavfile, sr=SAMPLING_RATE)

    # wav2world yields three arrays; only the middle one (spectral envelope)
    # is used here.
    _, spectral_envelop, _ = pyworld.wav2world(
        loaded_wav_file.astype(np.double),
        fs=SAMPLING_RATE,
        frame_period=FRAME_PERIOD,
        fft_size=fft_size,
    )

    mcep = pysptk.sptk.mcep(
        spectral_envelop,
        order=mcep_size,
        alpha=alpha,
        maxiter=0,
        etype=1,
        eps=1.0E-8,
        min_det=0.0,
        itype=3,
    )

    fname = os.path.basename(wavfile).split('.')[0]
    np.save(os.path.join(mcep_target_directory, fname + '.npy'), mcep, allow_pickle=False)

def mcd_cal(mcep_org_files, mcep_synth_files, MCD):
    """Accumulate DTW-based MCD statistics over matching MCEP file pairs.

    Two files form a pair when the first and the last ``_``-separated tokens
    of their base names are equal.  For every pair the MCEP arrays are
    loaded, the 0th coefficient is dropped, and ``librosa.sequence.dtw`` is
    run with ``MCD`` as the frame-to-frame metric.  The mean and standard
    deviation of the resulting accumulated cost matrix are summed over all
    pairs and finally divided by the total number of reference frames.

    Args:
        mcep_org_files: Paths of the reference ``.npy`` MCEP files.
        mcep_synth_files: Paths of the synthesized ``.npy`` MCEP files.
        MCD: Callable ``(x, y) -> float`` used as the DTW metric.

    Returns:
        Tuple ``(mcd_mean, mcd_std, total_frames)``.

    Raises:
        ValueError: If no file pair matches, so there is nothing to average.
    """
    min_cost_tot = 0.0
    mcd_std_tot = 0.0
    total_frames = 0

    for i in mcep_org_files:
        # The reference name only depends on the outer loop; split it once.
        split_org_file = os.path.basename(i).split('_')
        org_speaker, org_speaker_id = split_org_file[0], split_org_file[-1]

        for j in mcep_synth_files:
            split_synth_file = os.path.basename(j).split('_')
            synth_speaker, synth_speaker_id = split_synth_file[0], split_synth_file[-1]
            if org_speaker != synth_speaker or org_speaker_id != synth_speaker_id:
                continue

            org_mcep_npy = np.load(i)
            synth_mcep_npy = np.load(j)
            frame_no = len(org_mcep_npy)

            min_cost, _ = librosa.sequence.dtw(
                org_mcep_npy[:, 1:].T,
                synth_mcep_npy[:, 1:].T,
                metric=MCD,
            )
            min_cost_tot += np.mean(min_cost)
            mcd_std_tot += np.std(min_cost)
            # Count each pair's reference frames exactly once.
            total_frames += frame_no

    if total_frames == 0:
        raise ValueError("mcd_cal() found no matching original/synthesized MCEP files")

    # Calculate mean and standard deviation
    mcd_mean = min_cost_tot / total_frames
    mcd_std = mcd_std_tot / total_frames

    return mcd_mean, mcd_std, total_frames

def main():
    """Extract MCEPs for both corpora and print the MCD statistics.

    WAV files are read from ``ORIGINAL_PATH`` and ``SYNTHESIZED_PATH``; their
    MCEPs are written to the ``mceps_trg`` and ``mceps_conv`` sub-directories
    and then compared with ``mcd_cal``.
    """
    # Define paths
    ORIGINAL_PATH = '/hdd2/Sandipan/SDhar-Projects/StyleTTS2-new/ESD_Objective_Evaluation/Sad/'
    SYNTHESIZED_PATH = '/hdd2/Sandipan/SDhar-Projects/StyleTTS2-new/ESD_Objective_Evaluation/Sad_Synthesized_Baseline/'
    # Alternative synthesized set:
    # /hdd2/Sandipan/SDhar-Projects/StyleTTS2-new/Glow_Styletts2_synthesized

    # Directories for MCEP files
    dir_org_speech_mcep = os.path.join(ORIGINAL_PATH, 'mceps_trg')
    dir_converted_speech_mcep = os.path.join(SYNTHESIZED_PATH, 'mceps_conv')

    # Create MCEP directories if they do not exist
    os.makedirs(dir_org_speech_mcep, exist_ok=True)
    os.makedirs(dir_converted_speech_mcep, exist_ok=True)

    # List of WAV files
    dir_org_speech_wav = glob.glob(os.path.join(ORIGINAL_PATH, '*.wav'))
    dir_converted_speech_wav = glob.glob(os.path.join(SYNTHESIZED_PATH, '*.wav'))

    # Compute MCEP for original and synthesized speeches
    for wav in dir_org_speech_wav:
        MCEP(wav, dir_org_speech_mcep, fft_size=fft_size, mcep_size=mcep_size)
    for wav in dir_converted_speech_wav:
        MCEP(wav, dir_converted_speech_mcep, fft_size=fft_size, mcep_size=mcep_size)

    # List of MCEP files
    org_file = glob.glob(os.path.join(dir_org_speech_mcep, '*'))
    synth_file = glob.glob(os.path.join(dir_converted_speech_mcep, '*'))

    # Define cost function
    cost_function = MCD  # Make sure this function is imported or defined elsewhere

    # Calculate MCD
    mcd_mean, mcd_std, frames_used = mcd_cal(org_file, synth_file, cost_function)

    # Print results
    print(f'MCD Mean = {mcd_mean:.2f} dB')
    print(f'MCD Standard Deviation = {mcd_std:.2f} dB')
    print(f'Total number of frames = {frames_used}')


if __name__ == "__main__":
    main()

# Corrected logf0rmse
import glob
import math
import os

import librosa
import matplotlib.pyplot as plot
import numpy as np
import pysptk
import pyworld

sampling_rate = 16000
num_mcep = 24
frame_period = 5.0
n_frames = 128
wavs_org = []
wavs_synth = []

ORIGINAL_PATH = '/hdd2/Sandipan/SDhar-Projects/StyleTTS2-new/ESD_Objective_Evaluation/Sad'
SYNTHESIZED_PATH = '/hdd2/Sandipan/SDhar-Projects/StyleTTS2-new/ESD_Objective_Evaluation/Sad_Synthesized_Baseline'

# Sorted so that files at the same position in both lists can be compared
# by base name.
Org_paths = glob.glob(ORIGINAL_PATH + '/*.wav')
Synth_paths = glob.glob(SYNTHESIZED_PATH + '/*.wav')
Org_paths.sort()
Synth_paths.sort()

def load_wav(wav_file, sr):
    """Load an audio file as a mono waveform at sampling rate ``sr``.

    Args:
        wav_file: Path of the audio file.
        sr: Target sampling rate passed to ``librosa.load``.

    Returns:
        The waveform array; the sampling rate returned by librosa is dropped.
    """
    wav, _ = librosa.load(wav_file, sr=sr, mono=True)
    return wav

# Every original file needs a synthesized counterpart at the same sorted
# position.  Explicit exceptions are used so the check survives `python -O`.
if len(Synth_paths) < len(Org_paths):
    raise ValueError(
        f"Found {len(Org_paths)} original files but only "
        f"{len(Synth_paths)} synthesized files"
    )
for org_wav_path, synth_wav_path in zip(Org_paths, Synth_paths):
    if os.path.basename(org_wav_path) != os.path.basename(synth_wav_path):
        raise ValueError(
            f"File name mismatch: {org_wav_path!r} vs {synth_wav_path!r}"
        )

# All pairs are known to match at this point; load both waveforms of each.
for org_wav_path, synth_wav_path in zip(Org_paths, Synth_paths):
    wavs_org.append(load_wav(wav_file=org_wav_path, sr=sampling_rate))
    wavs_synth.append(load_wav(wav_file=synth_wav_path, sr=sampling_rate))

print(len(wavs_org))

def world_encode_data(wavs, fs, frame_period = 5.0, coded_dim = 24):
    """Return the masked log-F0 contour of every waveform.

    F0 is estimated with ``pyworld.harvest`` (floor 71 Hz, ceiling 800 Hz)
    and passed through ``numpy.ma.log``, which masks entries where the log is
    undefined instead of producing ``-inf``.

    Args:
        wavs: Sequence of 1-D waveform arrays.
        fs: Sampling rate of the waveforms.
        frame_period: Frame period forwarded to ``pyworld.harvest``.
        coded_dim: Not used by this function; kept so existing calls that
            pass it keep working.

    Returns:
        List with one masked log-F0 array per input waveform.
    """
    log_f0s = []
    for wav in wavs:
        f0, _ = pyworld.harvest(
            wav.astype(np.float64),
            fs,
            frame_period=frame_period,
            f0_floor=71.0,
            f0_ceil=800.0,
        )
        log_f0s.append(np.ma.log(f0))
    return log_f0s

f0s_org = world_encode_data(wavs=wavs_org, fs=sampling_rate, frame_period=frame_period, coded_dim=num_mcep)
f0s_synth = world_encode_data(wavs=wavs_synth, fs=sampling_rate, frame_period=frame_period, coded_dim=num_mcep)


def logf0_rmse(x, y, frame_count=1):
    """DTW local cost: Euclidean distance of ``x`` and ``y`` over ``frame_count``.

    Args:
        x: Feature vector of one frame of the first sequence.
        y: Feature vector of one frame of the second sequence; the DTW
            metric is always called with vectors of equal length, so no
            padding is applied.
        frame_count: Divisor applied to the distance.

    Returns:
        ``(1 / frame_count) * sqrt(<x - y, x - y>)`` as a float.
    """
    diff = x - y
    return (1 / frame_count) * math.sqrt(np.inner(diff, diff))


min_cost_tot = []
for org_log_f0, synth_log_f0 in zip(f0s_org, f0s_synth):
    # Normalise by the length of the shorter of the two contours.
    frame_len = min(len(org_log_f0), len(synth_log_f0))

    # Bind the current length as a default argument so the metric does not
    # depend on a global that changes between iterations.
    min_cost, _ = librosa.sequence.dtw(
        org_log_f0.T,
        synth_log_f0.T,
        metric=lambda x, y, n=frame_len: logf0_rmse(x, y, n),
    )

    print(len(min_cost))

    # Per-utterance score: mean of the accumulated cost matrix from DTW.
    min_cost_tot.append(np.mean(min_cost))

if not min_cost_tot:
    raise ValueError("No utterance pairs were available to evaluate log-F0 RMSE")

logF0RMSE = sum(min_cost_tot) / len(min_cost_tot)
std = np.std(min_cost_tot)
print(f"logF0_RMSE = {logF0RMSE} and std {std}")

SandyPanda-MLDL / -Evaluation-Metrics-Used-For-The-Performance-Evaluation-of-Voice-Conversion-VC-Models

Corrected MSD code shared here #1