rhasspy / wyoming

Peer-to-peer protocol for voice assistants
MIT License
138 stars 20 forks source link

Speech-to-Text with Huggingface Whisper V3 #26

Open Bensonheimer992 opened 1 month ago

Bensonheimer992 commented 1 month ago

Hey, I'm trying to write a small Python script that transcribes voice to text with the Hugging Face Inference API, but nothing happens. Can someone maybe help me?

import argparse
import asyncio
from functools import partial

from huggingface_hub import model_info
from wyoming.flycheck_server import AsyncServer
from wyoming.info import AsrModel, AsrProgram, Attribution, Info
import logging
from handler import HuggingfaceWhisper

LOGGER = logging.getLogger(__name__)

async def main() -> None:
    """Parse the command line and serve the Hugging Face Whisper handler.

    Command-line arguments:
        --key:   Hugging Face API key, forwarded to the handler via ``args``.
        --uri:   Address to listen on (``unix://`` or ``tcp://``).
        --debug: Optional flag; also emit DEBUG log messages.

    Runs until the server stops.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--key", required=True, help="Your Huggingface API Key")
    parser.add_argument("--uri", required=True, help="unix:// or tcp://")
    parser.add_argument("--debug", action="store_true", help="Log DEBUG messages")

    args = parser.parse_args()

    # Without an explicit configuration the root logger only lets WARNING and
    # above through, so "Ready!" and every info/debug message would be
    # silently dropped.
    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)

    # Service description that the handler sends back for Describe events.
    wyoming_info = Info(
        asr=[
            AsrProgram(
                name="Huggingface Whisper",
                description="Faster Whisper transcription with Whisper Large V3",
                attribution=Attribution(
                    name="Bensonheimer992",
                    url="https://github.com/Bensonheimer992",
                ),
                installed=True,
                version="1.0",
                models=[
                    AsrModel(
                        name="Whisper Large V3",
                        description="The Large Whisper Model",
                        attribution=Attribution(
                            name="OpenAI",
                            url="https://huggingface.co/openai",
                        ),
                        installed=True,
                        languages=["de", "en"],
                        version="3.0",
                    )
                ],
            )
        ],
    )

    server = AsyncServer.from_uri(args.uri)
    LOGGER.info("Ready!")

    # One lock shared by every handler instance so that only a single
    # transcription request is in flight at a time.
    lock = asyncio.Lock()
    handler_factory = partial(HuggingfaceWhisper, wyoming_info, args, lock)
    await server.run(handler_factory)

# Script entry point: run the async main() to completion when executed directly.
if __name__ == "__main__":
    asyncio.run(main())
import argparse
import asyncio
import logging
import os.path
import tempfile
import wave
from typing import Optional

import aiohttp
from wyoming.asr import Transcript, Transcribe
from wyoming.audio import AudioChunk, AudioStop
from wyoming.event import Event
from wyoming.info import Info, Describe
from wyoming.server import AsyncEventHandler

LOGGER = logging.getLogger(__name__)

# Endpoint that recorded audio is POSTed to for transcription
# (the openai/whisper-large-v3 model).
API_URL = "https://api-inference.huggingface.co/models/openai/whisper-large-v3"

class HuggingfaceWhisper(AsyncEventHandler):
    """Event handler that transcribes audio through the Hugging Face API.

    Audio chunks are appended to a WAV file in a per-handler temporary
    directory. When the audio stops, the file is POSTed to ``API_URL`` and
    the returned text is written back as a ``Transcript`` event.
    """

    def __init__(
        self,
        wyoming_info: Info,
        cliargs: argparse.Namespace,
        lock: asyncio.Lock,
        *args,
        **kwargs,
    ) -> None:
        """Set up per-connection state.

        Args:
            wyoming_info: Service description sent in reply to Describe.
            cliargs: Parsed command-line arguments; ``cliargs.key`` is used
                as the API bearer token.
            lock: Lock held while a transcription request is in flight.
            *args: Forwarded to ``AsyncEventHandler``.
            **kwargs: Forwarded to ``AsyncEventHandler``.
        """
        super().__init__(*args, **kwargs)

        self.cliargs = cliargs
        self.lock = lock
        self.wyoming_info_event = wyoming_info.event()
        self.wavdir = tempfile.TemporaryDirectory()
        self.wavpath = os.path.join(self.wavdir.name, "speech.wav")
        # Must start out as None: handle_event opens the WAV file lazily on
        # the first chunk. Assigning the type itself (Optional[...]) here
        # made the "is None" check fail, so no file was ever opened.
        self.wavfile: Optional[wave.Wave_write] = None

    async def handle_event(self, event: Event) -> bool:
        """Process one incoming event.

        Args:
            event: The event received from the client.

        Returns:
            True for audio chunks, Transcribe and Describe events; False for
            an audio stop (after the transcription attempt) and for any
            unrecognized event type.
        """
        if AudioChunk.is_type(event.type):
            chunk = AudioChunk.from_event(event)
            if self.wavfile is None:
                # First chunk: create the file using this chunk's format.
                self.wavfile = wave.open(self.wavpath, "wb")
                self.wavfile.setframerate(chunk.rate)
                self.wavfile.setsampwidth(chunk.width)
                self.wavfile.setnchannels(chunk.channels)

            self.wavfile.writeframes(chunk.audio)
            return True

        if AudioStop.is_type(event.type):
            LOGGER.debug("Audio Stopped. Transcribing ...")
            if self.wavfile is None:
                # Explicit check instead of assert, which is stripped by -O.
                LOGGER.warning("Audio stopped before any audio chunk was received")
                return False

            # Closing finalizes the WAV header; reset so that a later
            # recording starts a fresh file.
            self.wavfile.close()
            self.wavfile = None

            await self._transcribe()
            return False

        if Transcribe.is_type(event.type):
            return True

        if Describe.is_type(event.type):
            await self.write_event(self.wyoming_info_event)
            LOGGER.debug("Sent Info")
            return True

        return False

    async def _transcribe(self) -> None:
        """Upload the recorded WAV file and send the transcript to the client.

        Failures are logged and no transcript is sent.
        """
        headers = {"Authorization": f"Bearer {self.cliargs.key}"}

        async with self.lock:
            try:
                with open(self.wavpath, "rb") as wav:
                    data = wav.read()

                async with aiohttp.ClientSession() as session, session.post(
                    API_URL, headers=headers, data=data
                ) as response:
                    if response.status != 200:
                        LOGGER.error(
                            "Error from Huggingface API: %s", response.status
                        )
                        return
                    result = await response.json()

                text = result.get("text", "")
                LOGGER.info("Transcription Recieved")
                await self.write_event(Transcript(text=text).event())
            except Exception as e:
                # Boundary of the handler: log with traceback and keep the
                # server alive rather than propagating.
                LOGGER.exception("Error during Transcription: %s", e)