Uberi / speech_recognition

Speech recognition module for Python, supporting several engines and APIs, online and offline.
https://pypi.python.org/pypi/SpeechRecognition/
BSD 3-Clause "New" or "Revised" License
8.44k stars 2.4k forks source link

fasterwhisper and distilwhisper implementations #730

Open sujitvasanth opened 10 months ago

sujitvasanth commented 10 months ago

Hi @Uberi, I wrote some extensions to your API for faster-whisper and distil-whisper. They just need to be added to the `__init__.py` file to work, and they load the models automatically.

def recognize_whisper(self, audio_data, model="base.en", show_dict=False, load_options=None, language=None, translate=False, **transcribe_options):
...
      return result["text"]
def recognize_fasterwhisper(self, audio_data, model="small", show_dict=False, load_options=None, language=None, translate=False, **transcribe_options):
    """
    Performs speech recognition on ``audio_data`` (an ``AudioData`` instance) using faster-whisper.

    ``model`` is the model size or path handed to ``faster_whisper.WhisperModel`` (for example ``"tiny"``, ``"small"``, ``"large-v3"``).

    ``load_options`` is an optional dict of keyword arguments for the ``WhisperModel`` constructor (for example ``{"device": "cpu", "compute_type": "int8"}``). When omitted, ``device="auto"`` and ``compute_type="auto"`` are used. Passing ``load_options`` forces the model to be reloaded.

    ``language`` is forwarded to ``transcribe`` (``None`` lets the model detect it). If ``translate`` is true the ``"translate"`` task is requested. Any other keyword arguments are forwarded to ``WhisperModel.transcribe``; ``beam_size`` defaults to 5.

    Returns the lower-cased transcription, or, if ``show_dict`` is true, a dict with the keys ``"text"`` (transcription with original casing), ``"segments"`` (list of segment objects) and ``"language"``.
    """
    assert isinstance(audio_data, AudioData), "Data must be audio data"
    import io

    import numpy as np
    import soundfile as sf
    from faster_whisper import WhisperModel

    # Loaded models are cached per model name on the recognizer. The cache is kept
    # separate from ``self.whisper_model`` so a model object of a different type
    # stored there under the same name can never be picked up by this method.
    cache = getattr(self, "fasterwhisper_model", None)
    if cache is None:
        cache = self.fasterwhisper_model = {}
    if load_options or cache.get(model) is None:
        model_options = {"device": "auto", "compute_type": "auto"}
        model_options.update(load_options or {})
        cache[model] = WhisperModel(model, **model_options)

    # Decode the 16 kHz WAV rendering of the audio into float32 samples.
    wav_stream = io.BytesIO(audio_data.get_wav_data(convert_rate=16000))
    audio_array, _sampling_rate = sf.read(wav_stream)
    audio_array = audio_array.astype(np.float32)

    transcribe_options.setdefault("beam_size", 5)
    if translate:
        transcribe_options["task"] = "translate"
    segments, info = cache[model].transcribe(audio_array, language=language, **transcribe_options)

    # ``segments`` is produced lazily; materialise it so it can be both joined
    # into text and handed back to the caller.
    segments = list(segments)
    text = "".join(segment.text + " " for segment in segments)

    if show_dict:
        return {"text": text, "segments": segments, "language": info.language}
    return text.lower()

def recognize_distilwhisper(self, audio_data, model="distil-whisper/distil-small.en", show_dict=False, load_options=None, language=None, translate=False, **transcribe_options):
    """
    Performs speech recognition on ``audio_data`` (an ``AudioData`` instance) using a distil-whisper model through the Hugging Face ``transformers`` speech recognition pipeline.

    ``model`` is the model identifier or path passed to ``from_pretrained``.

    ``load_options`` is an optional dict. Its ``"device"`` and ``"torch_dtype"`` entries select where and how the model runs (default: ``"cuda:0"`` with float16 when CUDA is available, otherwise ``"cpu"`` with float32); remaining entries are forwarded to ``AutoModelForSpeechSeq2Seq.from_pretrained``. Passing ``load_options`` forces the pipeline to be rebuilt.

    ``language`` and ``translate`` are forwarded to generation only when set, so that the default English-only model is not handed options it does not accept. Any other keyword arguments are forwarded to the pipeline call; ``chunk_length_s``, ``stride_length_s`` and ``batch_size`` default to 50, 10 and 8.

    Returns the transcription text, or the full pipeline output dict if ``show_dict`` is true.
    """
    assert isinstance(audio_data, AudioData), "Data must be audio data"
    import io

    import numpy as np
    import soundfile as sf
    import torch
    from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

    # Building the model and pipeline is expensive, so keep one pipeline per
    # model name on the recognizer instead of rebuilding it on every call.
    cache = getattr(self, "distilwhisper_pipeline", None)
    if cache is None:
        cache = self.distilwhisper_pipeline = {}

    if load_options or cache.get(model) is None:
        options = dict(load_options or {})
        device = options.pop("device", "cuda:0" if torch.cuda.is_available() else "cpu")
        # Half precision is only the default when the chosen device is a GPU.
        default_dtype = torch.float16 if str(device).startswith("cuda") else torch.float32
        torch_dtype = options.pop("torch_dtype", default_dtype)
        load_kwargs = {"low_cpu_mem_usage": True, "use_safetensors": True}
        load_kwargs.update(options)

        speech_model = AutoModelForSpeechSeq2Seq.from_pretrained(model, torch_dtype=torch_dtype, **load_kwargs)
        speech_model.to(device)
        processor = AutoProcessor.from_pretrained(model)
        cache[model] = pipeline(
            "automatic-speech-recognition",
            model=speech_model,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            max_new_tokens=128,
            torch_dtype=torch_dtype,
            device=device,
        )

    # Decode the 16 kHz WAV rendering of the audio. Samples are kept as float32:
    # casting the raw waveform to float16 would only discard precision before
    # feature extraction.
    wav_stream = io.BytesIO(audio_data.get_wav_data(convert_rate=16000))
    audio_array, _sampling_rate = sf.read(wav_stream)
    audio_array = audio_array.astype(np.float32)

    # NOTE(review): chunk_length_s=50 is kept from the original snippet; confirm
    # it is the intended chunk size for the chosen model.
    call_options = {"chunk_length_s": 50, "stride_length_s": 10, "batch_size": 8}
    call_options.update(transcribe_options)
    generate_kwargs = dict(call_options.pop("generate_kwargs", None) or {})
    if language is not None:
        generate_kwargs["language"] = language
    if translate:
        generate_kwargs["task"] = "translate"
    if generate_kwargs:
        call_options["generate_kwargs"] = generate_kwargs

    result = cache[model](audio_array, **call_options)

    if show_dict:
        return result
    return result["text"]

It would be great if you would consider adding these to your API. They just need some minor alterations to allow users to choose the model and whether to run on GPU or CPU.

new dependencies required

pip3 install faster-whisper
pip3 install transformers optimum accelerate
MiskaWasTaken commented 9 months ago

great idea nice work

sujitvasanth commented 8 months ago

@MiskaWasTaken please test the code out and let me know what you think

Masame commented 5 months ago

I might have to copy you and add support for whisperx

Genesis1231 commented 5 months ago

this is good work, i am using faster-whisper to transcribe anyway. trying it out tomo