huggingface / distil-whisper

Distilled variant of Whisper for speech recognition. 6x faster, 50% smaller, within 1% word error rate.
MIT License

perceptually faster inference through pre-completion inference of audio #114

Open sujitvasanth opened 2 months ago

sujitvasanth commented 2 months ago

I was experimenting with processing the microphone audio while the microphone is still recording the latter part of the speech, and this does produce lower latency, since you have a "head start" compared to waiting for the utterance to complete.

The main problem I encountered is that the transcription of the latest word sometimes changes while it is being processed. Here are 2 examples of evolving output:

What was those?
What was Lord Nelson?
What was Lord Nelson?
What was Lord Nelson's
What was Lord Nelson's flagship?
What was Lord Nelson's flagship
What was Lord Nelson's flagship called?

What is
What is 10
What is 10 plus
What is 10 plus 2
What is 10 plus 230?
What is 10 plus 235?

How can I know where to chop the audio so I can have just a 1-word lag, rather than iterating through the whole audio? Can I get timestamps? Is there any way to prime the model with the preceding words for context, as if it were iterating through the entire audio in order, to save processing time? Here's my current code, followed by two sketches exploring the timestamp and prompting questions:

import pyaudio
import torch
import numpy as np
from collections import deque
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
model_id = "distil-whisper/distil-large-v3"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, use_safetensors=True
).to(device)
processor = AutoProcessor.from_pretrained(model_id)
transcriber = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    torch_dtype=torch_dtype,
    device=device,
)

p = pyaudio.PyAudio()
stream = p.open(
    format=pyaudio.paFloat32,
    channels=1,
    rate=processor.feature_extractor.sampling_rate,
    input=True,
    frames_per_buffer=8192,
)

def transcribe():
    recorded_frames = deque()          # audio chunks for the current utterance
    recorded_transcription = deque()   # evolving partial transcriptions
    low_volume_iterations_threshold = 7  # consecutive quiet chunks that end an utterance
    pre_recording_iterations = 1         # chunks of pre-roll kept before the trigger
    silent_chunks = 0
    is_recording = False
    try:
        while True:
            audio_data = np.frombuffer(
                stream.read(4096, exception_on_overflow=False), dtype=np.float32
            )
            if is_recording:
                recorded_frames.append(audio_data)
                # Re-transcribe everything captured so far: this is the "head start"
                # that produces the evolving partial outputs shown above.
                recorded_audio = np.concatenate(list(recorded_frames))
                transcription = transcriber(recorded_audio, generate_kwargs={"max_new_tokens": 40})
                new_t = transcription["text"].lstrip()  # Whisper output begins with a space
                print(new_t)
                recorded_transcription.append(new_t)
                # Count consecutive quiet chunks; enough in a row ends the utterance.
                if np.abs(audio_data).mean() < 0.1:
                    silent_chunks += 1
                else:
                    silent_chunks = 0
                if silent_chunks > low_volume_iterations_threshold:
                    is_recording = False
                    silent_chunks = 0
                    print(new_t)                   # final transcription
                    print(recorded_transcription)  # full history of partials
                    recorded_frames.clear()
                    recorded_transcription.clear()
            else:
                # Rolling pre-roll buffer, so the onset of speech isn't clipped
                # when recording is triggered.
                recorded_frames.append(audio_data)
                while len(recorded_frames) > pre_recording_iterations:
                    recorded_frames.popleft()
                if np.abs(audio_data).mean() > 0.05:
                    is_recording = True
                    silent_chunks = 0
    finally:
        stream.stop_stream()
        stream.close()
        p.terminate()

transcribe()
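
On the timestamp question: the transformers ASR pipeline can return word-level timestamps via return_timestamps="word", which would tell you where to chop the audio so only the trailing word or two needs re-decoding. A minimal sketch, assuming the distil-large-v3 generation config carries the alignment heads that word-level timestamps need (recorded_audio is the float32 array from the loop above):

result = transcriber(recorded_audio, return_timestamps="word")
# result["chunks"] is a list like:
#   [{"text": " What", "timestamp": (0.0, 0.3)}, {"text": " was", "timestamp": (0.3, 0.5)}, ...]
for chunk in result["chunks"]:
    word, (start, end) = chunk["text"], chunk["timestamp"]
    print(f"{start:5.2f}-{end:5.2f}s {word}")

# The end time of the second-to-last word gives a chop point: audio before it
# can be treated as stable, and only the tail re-decoded on the next pass.
sr = processor.feature_extractor.sampling_rate
if len(result["chunks"]) >= 2:
    chop_time = result["chunks"][-2]["timestamp"][1]
    stable_audio = recorded_audio[: int(chop_time * sr)]
    tail_audio = recorded_audio[int(chop_time * sr):]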
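
On priming with preceding words: Whisper-family models can condition the decoder on previous text via prompt_ids. A sketch, assuming WhisperProcessor.get_prompt_ids and that the pipeline forwards prompt_ids through generate_kwargs; note that prompting conditions the decoder to stay consistent with the earlier words, but the audio you pass is still fully re-encoded, so it helps output stability more than compute:

# stable_text is hypothetical: the already-settled transcription of earlier audio,
# e.g. everything before the chop point from the timestamp sketch above.
stable_text = "What was Lord Nelson's"
prompt_ids = processor.get_prompt_ids(stable_text, return_tensors="pt").to(device)
result = transcriber(tail_audio, generate_kwargs={"prompt_ids": prompt_ids})
print(stable_text + result["text"])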