HarryVolek / PyTorch_Speaker_Verification

PyTorch implementation of "Generalized End-to-End Loss for Speaker Verification" by Wan, Li et al.
BSD 3-Clause "New" or "Revised" License

Single inference #75

Open fabianoluzbr opened 3 years ago

fabianoluzbr commented 3 years ago

Hello guys, I'm new to this tool. I would like to know the steps to calculate the similarity between two audio files. I started writing some code here, but it still has a lot of errors. Can someone help me? Thanks...

```python
import os
import random
import time

import torch
import librosa
import numpy as np
from torch.utils.data import DataLoader

from hparam import hparam as hp
from speech_embedder_net import SpeechEmbedder, GE2ELoss, get_centroids, get_cossim
from scipy.io.wavfile import read


def get_utterance(utter_path):
    utter_min_len = (hp.data.tisv_frame * hp.data.hop + hp.data.window) * hp.data.sr  # lower bound of utterance length

    utter, sr = librosa.core.load(utter_path, hp.data.sr)   # load utterance audio
    intervals = librosa.effects.split(utter, top_db=30)     # voice activity detection
    # this works fine for TIMIT, but if you get an array of shape 0 for any other audio, change the value of top_db
    # for the VCTK dataset use top_db=100
    utterances_spec = []

    for interval in intervals:
        if (interval[1] - interval[0]) > utter_min_len:      # if the partial utterance is sufficiently long,
            utter_part = utter[interval[0]:interval[1]]      # save the first and last 180 frames of the spectrogram
            S = librosa.core.stft(y=utter_part, n_fft=hp.data.nfft,
                                  win_length=int(hp.data.window * sr), hop_length=int(hp.data.hop * sr))
            S = np.abs(S) ** 2
            mel_basis = librosa.filters.mel(sr=hp.data.sr, n_fft=hp.data.nfft, n_mels=hp.data.nmels)
            S = np.log10(np.dot(mel_basis, S) + 1e-6)            # log mel spectrogram of utterance
            utterances_spec.append(S[:, :hp.data.tisv_frame])    # first 180 frames of partial utterance
            utterances_spec.append(S[:, -hp.data.tisv_frame:])   # last 180 frames of partial utterance

    utterances_spec = np.array(utterances_spec)
    return utterances_spec


def get_data(sample_path):
    utterance = get_utterance(sample_path)
    utterance = utterance[:, :, :160]                                  # TODO: implement variable length batch size
    utterance = torch.tensor(np.transpose(utterance, axes=(0, 2, 1)))  # transpose to [batch, frames, n_mels]
    return utterance


def get_similarity(model_path, sample_a, sample_b):
    data_a = get_data(sample_a)
    data_b = get_data(sample_b)

    embedder_net = SpeechEmbedder()
    embedder_net.load_state_dict(torch.load(model_path))
    embedder_net.eval()

    enrollment_embeddings = embedder_net(data_a)
    verification_embeddings = embedder_net(data_b)

    # enrollment_embeddings = torch.reshape(enrollment_embeddings, (1, 1//2, enrollment_embeddings.size(1)))
    enrollment_centroids = get_centroids(enrollment_embeddings)

    return get_cossim(verification_embeddings, enrollment_centroids)


if __name__ == "__main__":
    file1 = "test/audio_1.wav"
    file2 = "test/audio_2.wav"
    get_similarity(hp.model.model_path, file1, file2)
```
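For reference, here is a minimal sketch of what the single-inference path could look like. It reuses `get_data` from the code above, but instead of `get_centroids`/`get_cossim` (which, as far as I can tell, expect batched 3D tensors shaped `[speakers, utterances, dim]`) it simply averages the per-segment embeddings of each file into one d-vector and compares them with `torch.nn.functional.cosine_similarity`. The helper names `embed_file` and `similarity` are made up for illustration, and it assumes the checkpoint at `hp.model.model_path` matches the `SpeechEmbedder` hyperparameters in `hparam`:

```python
import torch
import torch.nn.functional as F

from hparam import hparam as hp
from speech_embedder_net import SpeechEmbedder


def embed_file(embedder_net, wav_path):
    """Embed every qualifying segment of a file and average into one d-vector."""
    mels = get_data(wav_path)                    # [n_segments, frames, n_mels], from the code above
    with torch.no_grad():
        embeddings = embedder_net(mels.float())  # [n_segments, proj]
    return embeddings.mean(dim=0)                # single speaker embedding for the whole file


def similarity(model_path, wav_a, wav_b):
    embedder_net = SpeechEmbedder()
    embedder_net.load_state_dict(torch.load(model_path))
    embedder_net.eval()

    e_a = embed_file(embedder_net, wav_a)
    e_b = embed_file(embedder_net, wav_b)
    # cosine similarity between the two file-level embeddings
    return F.cosine_similarity(e_a.unsqueeze(0), e_b.unsqueeze(0)).item()


score = similarity(hp.model.model_path, "test/audio_1.wav", "test/audio_2.wav")
print("cosine similarity:", score)  # closer to 1.0 suggests the same speaker
```

The threshold for deciding "same speaker" would still need to be tuned (e.g., from an EER sweep on held-out data), since the raw cosine score by itself is not calibrated.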