Aider-AI / aider

aider is AI pair programming in your terminal
https://aider.chat/
Apache License 2.0
20.43k stars 1.88k forks source link

Contribution: upgrade aider/voice.py to groq/whisper-large-v3-turbo #2039

Open MeDott29 opened 3 days ago

MeDott29 commented 3 days ago

upgrade to fast and free whisper

import math
import os
import queue
import tempfile
import time
import warnings

from prompt_toolkit.shortcuts import prompt
from litellm import transcription  # NOTE(review): imported but never used below — confirm before removing
from groq import Groq
# Module-level client shared by Voice.raw_record_and_transcribe.
# NOTE(review): constructed at import time; presumably reads GROQ_API_KEY
# from the environment — confirm, since a missing key would fail on import.
client = Groq()

# from aider.llm import litellm

from .dump import dump  # noqa: F401

# pydub emits this warning when ffmpeg/avconv is not on PATH; silence it.
warnings.filterwarnings(
    "ignore", message="Couldn't find ffmpeg or avconv - defaulting to ffmpeg, but may not work"
)

from pydub import AudioSegment  # noqa

# soundfile is optional: it can fail with OSError when its native
# libsndfile dependency is missing, so fall back to None and let
# Voice.__init__ raise a friendly error instead.
try:
    import soundfile as sf
except (OSError, ModuleNotFoundError):
    sf = None

# NOTE(review): sounddevice can also raise OSError at import time when
# PortAudio is missing; unlike soundfile, that case is not guarded here.
import sounddevice as sd  # Moved import to top for consistency

class SoundDeviceError(Exception):
    """Raised when audio capture is unavailable (missing soundfile or
    sounddevice support) or when the input device cannot be accessed."""
    pass

class Voice:
    """Capture microphone audio and transcribe it with Groq's
    whisper-large-v3-turbo model.

    A live level meter is rendered while recording; its state lives in the
    attributes below.
    """

    # Running extremes of the per-block RMS level, used to normalize the
    # live level meter into a 0..1 fraction.
    max_rms = 0
    min_rms = 1e5
    pct = 0

    # Fraction of the observed range below which the bar renders as silent.
    threshold = 0.15

    def __init__(self, audio_format="wav", device_id=None):
        """Prepare to record.

        Args:
            audio_format: output container, one of "wav", "mp3", "webm".
            device_id: sounddevice input-device index, or None (default) to
                use the system default input device.  (The previous version
                hard-coded device 8, which does not exist on most machines.)

        Raises:
            SoundDeviceError: if the soundfile library is unavailable.
            ValueError: for an unknown device_id or audio_format.
        """
        if sf is None:
            raise SoundDeviceError("soundfile library is not available.")
        self.sd = sd

        # None means "let sounddevice pick the default input device".
        if device_id is not None:
            available_devices = self.sd.query_devices()
            if not 0 <= device_id < len(available_devices):
                # List the devices *before* raising; the original printed
                # them after the raise, so the loop was unreachable.
                print(f"Device ID {device_id} does not exist. Available devices are:")
                for idx, device in enumerate(available_devices):
                    print(f"{idx}: {device['name']}")
                raise ValueError(f"Device ID {device_id} does not exist.")

        self.device_id = device_id

        if audio_format not in ("wav", "mp3", "webm"):
            raise ValueError(f"Unsupported audio format: {audio_format}")
        self.audio_format = audio_format

    def callback(self, indata, frames, time_info, status):
        """sounddevice callback, invoked on the audio thread per block.

        Updates the running level meter and queues the raw samples for the
        writer in raw_record_and_transcribe.
        """
        import numpy as np

        if status:
            print(f"Status: {status}", flush=True)

        # RMS of this block, normalized against the extremes seen so far.
        rms = np.sqrt(np.mean(indata**2))
        self.max_rms = max(self.max_rms, rms)
        self.min_rms = min(self.min_rms, rms)

        rng = self.max_rms - self.min_rms
        # Until the observed range is meaningful, park the meter mid-scale.
        self.pct = (rms - self.min_rms) / rng if rng > 0.001 else 0.5

        self.q.put(indata.copy())

    def get_prompt(self):
        """Render the live "Recording..." prompt line with a level bar."""
        num = 10
        if math.isnan(self.pct) or self.pct < self.threshold:
            cnt = 0
        else:
            cnt = int(self.pct * 10)

        bar = ("░" * cnt + "█" * (num - cnt))[:num]

        dur = time.time() - self.start_time
        return f"Recording, press ENTER when done... {dur:.1f}sec {bar}"

    def record_and_transcribe(self, history=None, language=None):
        """Record from the microphone and return the transcribed text.

        Returns None on Ctrl-C or when the audio device is unusable.
        """
        try:
            return self.raw_record_and_transcribe(history, language)
        except KeyboardInterrupt:
            return
        except SoundDeviceError as e:
            print(f"Error: {e}")
            print("Please ensure you have a working audio input device connected and try again.")
            return

    def raw_record_and_transcribe(self, history, language):
        """Record until ENTER is pressed, then transcribe via Groq.

        Returns the transcription text, or None if transcription failed.

        Raises:
            SoundDeviceError: when the input device cannot be opened.
        """
        self.q = queue.Queue()

        # mkstemp instead of the insecure/deprecated tempfile.mktemp; close
        # the fd right away since soundfile reopens the path itself.
        fd, temp_wav = tempfile.mkstemp(suffix=".wav")
        os.close(fd)

        try:
            device_info = self.sd.query_devices(self.device_id, "input")
            sample_rate = int(device_info["default_samplerate"])
        except (TypeError, ValueError):
            sample_rate = 16000  # fall back to 16 kHz if the query fails
        except self.sd.PortAudioError:
            raise SoundDeviceError(
                "No audio input device detected. Please check your audio settings and try again."
            )

        self.start_time = time.time()

        try:
            with self.sd.InputStream(
                samplerate=sample_rate,
                channels=1,
                callback=self.callback,
                device=self.device_id,
            ):
                # Blocks until the user presses ENTER; the prompt callable
                # refreshes the level bar every 0.1s.
                prompt(self.get_prompt, refresh_interval=0.1)
        except self.sd.PortAudioError as err:
            raise SoundDeviceError(f"Error accessing audio input device: {err}")

        # Drain everything the callback queued into the wav file.
        # mode="w" (not "x") because mkstemp already created the file.
        with sf.SoundFile(temp_wav, mode="w", samplerate=sample_rate, channels=1) as file:
            while not self.q.empty():
                file.write(self.q.get())

        if self.audio_format != "wav":
            fd, filename = tempfile.mkstemp(suffix=f".{self.audio_format}")
            os.close(fd)
            AudioSegment.from_wav(temp_wav).export(filename, format=self.audio_format)
            os.remove(temp_wav)
        else:
            filename = temp_wav

        try:
            with open(filename, "rb") as fh:
                try:
                    transcript = client.audio.transcriptions.create(
                        model="whisper-large-v3-turbo", file=fh, prompt=history, language=language
                    )
                except Exception as err:
                    # Best-effort: report and bail rather than crash the chat.
                    print(f"Unable to transcribe (unknown): {err}")
                    return
        finally:
            # Always clean up the temp audio file; the original leaked the
            # wav on every run and the converted file on transcribe errors.
            if os.path.exists(filename):
                os.remove(filename)

        return transcript.text

if __name__ == "__main__":
    import sys

    # Fail fast with a clear message; the Groq client reads the key from
    # the environment when the transcription call is made.
    if not os.getenv("GROQ_API_KEY"):
        raise ValueError("Please set the GROQ_API_KEY environment variable.")

    # Show the device list instead of hard-coding a device index (the
    # original forced device_id=8, which does not exist on most machines).
    # An explicit index may be supplied as the first CLI argument.
    print("Available audio devices:")
    print(sd.query_devices())

    kwargs = {}
    if len(sys.argv) > 1:
        kwargs["device_id"] = int(sys.argv[1])

    print(Voice(**kwargs).record_and_transcribe())

This is hard-coded to `device_id=8`. Notes: did I miss a config option for setting a default audio device? It would be nice to be presented with a list of available audio devices so we can select the one we want.

Version and model info

No response

atljoseph commented 16 hours ago

Please add this so transcription can be local/free. What about the tts text to speech part, though? It’d be cool to have a convo inside aider back and forth. It’d be great to be able to configure the endpoint called independently there too. For instance, use any model/provider for coding LLM, but allow an independent endpoint for the TTS. Example: openedai-speech is a pretty good tts server, and free. Or third party libs