Picovoice / eagle

On-device speaker recognition engine powered by deep learning
Apache License 2.0
29 stars 3 forks source link

Question: Eagle returns only 0 #79

Closed dedaleDev closed 2 months ago

dedaleDev commented 2 months ago

I have written code that tries to identify a speaker from a .wav audio file using the Python SDK. But when I run it, Eagle returns only a list of 0.0 scores, even though the enrolled voices are very different (a woman and a man) and the recording was made at 16 kHz with a professional microphone. For example, I get this output:

Checking for: Woman.eagle
User score for Woman.eagle: 0.0
Checking for: Man.eagle
User score for Man.eagle: 0.0
Recognized as: Woman.eagle

This is my code. Can you help me?


import pveagle
from pvrecorder import PvRecorder
import os 
import wave
import numpy as np

# Picovoice AccessKey handed to the pveagle profiler and recognizer factories.
# NOTE(review): this credential is committed in source; it should be rotated
# and loaded from configuration instead of being hard-coded.
access_key = "NA0NuP+5Orn3NUuj8UHB6Sj1VaolSuM2qvYlQeeWbYs6epGzsACtYA=="

def analyseVoiceRepertories():
    """Return the file names of the saved speaker profiles.

    Scans ``src/caches/users`` (relative to the current working directory)
    and keeps the entries whose name ends in ``.eagle``.

    Returns:
        A list of profile file names (not full paths), sorted alphabetically
        so the result does not depend on the directory listing order. The
        list is empty when the directory does not exist.
    """
    try:
        entries = os.listdir("src/caches/users")
    except FileNotFoundError:
        # No profile has been saved yet: report "no users" instead of crashing.
        return []
    return sorted(entry for entry in entries if entry.endswith(".eagle"))
def trainVoiceRecognition():
    """Enroll a new speaker from the microphone and save the profile to disk.

    Prompts for a first name, records from audio device 0 and feeds each
    recorded frame to an Eagle profiler until enrollment reaches 100 %. The
    exported profile is then written to ``src/caches/users/<name>.eagle``.

    Returns:
        None. If the profiler cannot be created, the error is printed and
        the function returns without recording anything.
    """
    name = input("Entrez votre prénom : ").strip().replace(" ", "")
    try:
        eagle_profiler = pveagle.create_profiler(access_key=access_key)
    except pveagle.EagleError as e:
        print(f"Erreur lors de la création du profiler: {e}")
        return

    try:
        recorder = PvRecorder(device_index=0, frame_length=eagle_profiler.min_enroll_samples)
        try:
            recorder.start()
            print("Recording started... Speak please")

            enroll_percentage = 0.0
            while enroll_percentage < 100.0:
                audio_frame = recorder.read()
                enroll_percentage, feedback = eagle_profiler.enroll(audio_frame)
                print(f"Progression de l'enregistrement : {enroll_percentage:.2f}% - {feedback.name}")
                if feedback == pveagle.EagleProfilerEnrollFeedback.QUALITY_ISSUE:
                    print("La qualité de l'enregistrement est insuffisante.")

            recorder.stop()
        finally:
            # Release the audio device even when recording or enrollment fails.
            recorder.delete()

        speaker_profile = eagle_profiler.export()

        # The profile is written before the profiler that produced it is freed.
        with open(f"src/caches/users/{name}.eagle", 'wb') as f:
            f.write(speaker_profile.to_bytes())
    finally:
        # Free the native profiler on every exit path, not only on success.
        eagle_profiler.delete()

    print("Training finished and profile saved")

def wavFileToPCM(file_path):
    """Load a 16 kHz WAV file as mono, signed 16-bit PCM samples.

    Args:
        file_path: Path of the WAV file to read.

    Returns:
        A one-dimensional ``numpy`` array of ``int16`` samples. 8-bit input is
        re-centred around zero and scaled to the 16-bit range; multi-channel
        input is downmixed by averaging the channels of each frame.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ValueError: If the sample rate is not 16000 Hz or the sample width is
            neither 1 nor 2 bytes.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Le fichier {file_path} est introuvable.")
    with wave.open(file_path, 'rb') as wav_file:
        # Read the stream parameters.
        n_channels = wav_file.getnchannels()
        sample_width = wav_file.getsampwidth()
        frame_rate = wav_file.getframerate()
        if frame_rate != 16000:
            raise ValueError(f"Taux d'échantillonnage non supporté: {frame_rate}, 16kHz attendu.")

        n_frames = wav_file.getnframes()
        frames = wav_file.readframes(n_frames)

    if sample_width == 2:
        # WAV stores 16-bit samples little-endian regardless of the host.
        pcm_data = np.frombuffer(frames, dtype="<i2").astype(np.int16)
    elif sample_width == 1:
        # 8-bit WAV is unsigned: widen BEFORE subtracting the 128 offset so the
        # subtraction cannot wrap around, then scale up to the 16-bit range.
        pcm_data = (np.frombuffer(frames, dtype=np.uint8).astype(np.int16) - 128) * 256
    else:
        raise ValueError(f"Largeur d'échantillon non supportée: {sample_width}")

    if n_channels > 1:
        # Samples are interleaved per frame; average the channels so that the
        # caller receives one mono sample per frame.
        per_frame = pcm_data.reshape(-1, n_channels)
        pcm_data = np.rint(per_frame.mean(axis=1)).astype(np.int16)

    return pcm_data

def VoiceRecognition():
    """Find the saved speaker profile that best matches ``src/caches/request.wav``.

    Each profile listed by ``analyseVoiceRepertories`` is loaded into its own
    recognizer. The request audio is processed in chunks of the recognizer's
    frame length and the per-frame scores are averaged into one score per
    profile.

    Returns:
        The file name of the profile with the highest average score (the
        first one on ties), or ``None`` when the request file is missing or
        no profile could be scored.
    """
    Users = analyseVoiceRepertories()
    if not os.path.exists("src/caches/request.wav"):
        print("Le fichier request.wav est introuvable.")
        return

    pcm_data = wavFileToPCM("src/caches/request.wav")
    print(pcm_data)

    # Keep each score paired with its user: a profile whose recognizer cannot
    # be created is skipped, so positions in a bare score list would no longer
    # line up with positions in Users.
    userScore = []
    for user in Users:
        print(f"Checking for: {user}")
        with open(f"src/caches/users/{user}", 'rb') as f:
            speakerProfile = pveagle.EagleProfile.from_bytes(f.read())

        try:
            eagle = pveagle.create_recognizer(access_key, speakerProfile)
        except pveagle.EagleError as e:
            print(f"Erreur lors de la reconnaissance de la voix : {e}")
            continue

        score = []
        try:
            frame_length = eagle.frame_length
            for i in range(0, len(pcm_data), frame_length):
                frame = pcm_data[i:i + frame_length]
                if len(frame) != frame_length:
                    # Only the trailing chunk can be short; it is not processed.
                    print(f"Frame incomplete at index {i}, expected {frame_length}, got {len(frame)}")
                    continue
                scores = eagle.process(frame)
                # One profile per recognizer, so its score is the first entry.
                score.append(scores[0])
        finally:
            # Free the native recognizer even if processing raised.
            eagle.delete()

        average_score = sum(score) / len(score) if score else 0
        print(f"User score for {user}: {average_score}")
        userScore.append((user, average_score))

    if userScore:
        # max() returns the first maximal entry, so ties go to the earliest user.
        recognized_user, _ = max(userScore, key=lambda entry: entry[1])
        print(f"Recognized as: {recognized_user}")
        return recognized_user
    else:
        print("Aucun utilisateur reconnu.")
        return None

# Script entry: speaker recognition runs as soon as this file is executed.
# To enroll a new speaker first, uncomment the call below.
# trainVoiceRecognition()
VoiceRecognition()