A Fundamental End-to-End Speech Recognition Toolkit and Open Source SOTA Pretrained Models, Supporting Speech Recognition, Voice Activity Detection, Text Post-processing etc.
Traceback (most recent call last): | 0/3 [00:00<?, ?it/s]
File "/UD-AI-TextToSpeech/text_to_speech/server_gpu.py", line 183, in audio2text
text = sense_voice_model(params)
File "/UD-AI-TextToSpeech/text_to_speech/audio_to_text/sense_voice_model.py", line 20, in __call__
res = self.sense_voice_model.generate(
File "/usr/local/lib/python3.10/dist-packages/funasr/auto/auto_model.py", line 263, in generate
return self.inference_with_vad(input, input_len=input_len, **cfg)
File "/usr/local/lib/python3.10/dist-packages/funasr/auto/auto_model.py", line 417, in inference_with_vad
results = self.inference(
File "/usr/local/lib/python3.10/dist-packages/funasr/auto/auto_model.py", line 302, in inference
res = model.inference(**batch, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/funasr/models/sense_voice/model.py", line 832, in inference
speech, speech_lengths = extract_fbank(
File "/usr/local/lib/python3.10/dist-packages/funasr/utils/load_utils.py", line 173, in extract_fbank
data, data_len = frontend(data, data_len, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1541, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/funasr/frontends/wav_frontend.py", line 134, in forward
mat = kaldi.fbank(
File "/usr/local/lib/python3.10/dist-packages/torchaudio/compliance/kaldi.py", line 591, in fbank
waveform, window_shift, window_size, padded_window_size = _get_waveform_and_window_properties(
File "/usr/local/lib/python3.10/dist-packages/torchaudio/compliance/kaldi.py", line 142, in _get_waveform_and_window_properties
assert 2 <= window_size <= len(waveform), "choose a window size {} that is [2, {}]".format(
AssertionError: choose a window size 400 that is [2, 160]
The same video and the same code run without errors on Windows, but raise this error on Linux.
Traceback (most recent call last): | 0/3 [00:00<?, ?it/s]
File "/UD-AI-TextToSpeech/text_to_speech/server_gpu.py", line 183, in audio2text
text = sense_voice_model(params)
File "/UD-AI-TextToSpeech/text_to_speech/audio_to_text/sense_voice_model.py", line 20, in __call__
res = self.sense_voice_model.generate(
File "/usr/local/lib/python3.10/dist-packages/funasr/auto/auto_model.py", line 263, in generate
return self.inference_with_vad(input, input_len=input_len, **cfg)
File "/usr/local/lib/python3.10/dist-packages/funasr/auto/auto_model.py", line 417, in inference_with_vad
results = self.inference(
File "/usr/local/lib/python3.10/dist-packages/funasr/auto/auto_model.py", line 302, in inference
res = model.inference(**batch, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/funasr/models/sense_voice/model.py", line 832, in inference
speech, speech_lengths = extract_fbank(
File "/usr/local/lib/python3.10/dist-packages/funasr/utils/load_utils.py", line 173, in extract_fbank
data, data_len = frontend(data, data_len, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1541, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/funasr/frontends/wav_frontend.py", line 134, in forward
mat = kaldi.fbank(
File "/usr/local/lib/python3.10/dist-packages/torchaudio/compliance/kaldi.py", line 591, in fbank
waveform, window_shift, window_size, padded_window_size = _get_waveform_and_window_properties(
File "/usr/local/lib/python3.10/dist-packages/torchaudio/compliance/kaldi.py", line 142, in _get_waveform_and_window_properties
assert 2 <= window_size <= len(waveform), "choose a window size {} that is [2, {}]".format(
AssertionError: choose a window size 400 that is [2, 160]
The same video and the same code run without errors on Windows, but raise this error on Linux.