Open sujitvasanth opened 10 months ago
Hi @Uberi, I wrote some extensions to your API for faster-whisper and Distil-Whisper that just need to be added to the `__init__.py` file to work; they will load the models automatically.
For comparison, the existing recognize_whisper has this signature:

```python
def recognize_whisper(self, audio_data, model="base.en", show_dict=False, load_options=None, language=None, translate=False, **transcribe_options):
    ...
    return result["text"]
```
```python
def recognize_fasterwhisper(self, audio_data, model="small", show_dict=False, load_options=None, language=None, translate=False, **transcribe_options):
    # Custom recognizer for faster-whisper
    assert isinstance(audio_data, AudioData), "Data must be audio data"
    import io
    import numpy as np
    import soundfile as sf
    from faster_whisper import WhisperModel

    # Load and cache the requested model; swap in device="cpu", compute_type="int8"
    # on machines without a GPU
    if load_options or not hasattr(self, "whisper_model") or self.whisper_model.get(model) is None:
        self.whisper_model = getattr(self, "whisper_model", {})
        self.whisper_model[model] = WhisperModel(model, device="cuda", compute_type="auto")

    # Convert the captured audio to a 16 kHz float32 array
    wav_bytes = audio_data.get_wav_data(convert_rate=16000)
    wav_stream = io.BytesIO(wav_bytes)
    audio_array, sampling_rate = sf.read(wav_stream)
    audio_array = audio_array.astype(np.float32)

    segments, info = self.whisper_model[model].transcribe(audio_array, beam_size=5)
    segments = list(segments)  # transcribe() returns a generator; materialize it once
    text = " ".join(segment.text.strip() for segment in segments)

    if show_dict:
        # the original draft returned an undefined `result` here; return the pieces instead
        return {"text": text, "segments": segments, "language": info.language}
    return text.lower()

def recognize_distilwhisper(self, audio_data, model="distil-whisper/distil-small.en", show_dict=False, load_options=None, language=None, translate=False, **transcribe_options):
    # Custom recognizer for Distil-Whisper via Hugging Face transformers
    assert isinstance(audio_data, AudioData), "Data must be audio data"
    import io
    import numpy as np
    import soundfile as sf
    import torch
    from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

    model_id = model  # use the caller's model choice rather than a hard-coded id
    asr_model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True)
    asr_model.to(device)
    processor = AutoProcessor.from_pretrained(model_id)
    whisper = pipeline(
        "automatic-speech-recognition",
        model=asr_model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        max_new_tokens=128,
        torch_dtype=torch_dtype,
        device=device,
    )

    wav_bytes = audio_data.get_wav_data(convert_rate=16000)
    wav_stream = io.BytesIO(wav_bytes)
    audio_array, sampling_rate = sf.read(wav_stream)
    audio_array = audio_array.astype(np.float32)  # the feature extractor expects float32

    result = whisper(audio_array, chunk_length_s=50, stride_length_s=10, batch_size=8)

    if show_dict:
        return result
    return result["text"]
```
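Here is a minimal sketch of how the new recognizers would be called once merged, assuming the two methods above have been attached to speech_recognition's Recognizer class:

```python
import speech_recognition as sr

r = sr.Recognizer()
with sr.Microphone() as source:  # any AudioData source works, e.g. sr.AudioFile
    print("Say something...")
    audio = r.listen(source)

# both calls assume the methods above were added to Recognizer in __init__.py
print(r.recognize_fasterwhisper(audio, model="small"))
print(r.recognize_distilwhisper(audio, model="distil-whisper/distil-small.en"))
```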
It would be great if you would consider adding these to your API. They just need some minor alterations to let users choose which model and GPU/CPU to use; see the sketch below.
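A hedged sketch of what that alteration could look like, using hypothetical `device` and `compute_type` keyword arguments (not part of the code above) that are passed straight through to faster-whisper:

```python
from faster_whisper import WhisperModel

# Hypothetical loader: let the caller pick the model, device, and precision instead
# of hard-coding them; device="auto" lets faster-whisper fall back to CPU without CUDA
def load_faster_whisper(model_name="small", device="auto", compute_type="auto"):
    return WhisperModel(model_name, device=device, compute_type=compute_type)

cpu_model = load_faster_whisper("small", device="cpu", compute_type="int8")
```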
New dependencies required:

```
pip3 install faster-whisper
pip3 install transformers optimum accelerate
```
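A quick sanity check after installing (assuming faster-whisper pulls in ctranslate2, whose Python API includes get_cuda_device_count):

```python
import ctranslate2
import transformers

print("CUDA devices visible to CTranslate2:", ctranslate2.get_cuda_device_count())
print("transformers version:", transformers.__version__)
```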
Great idea, nice work!
@MiskaWasTaken, please test the code out and let me know what you think.
I might have to copy you and add support for WhisperX.
This is good work; I am using faster-whisper to transcribe anyway. Trying it out tomorrow.