Notice: In order to resolve issues more efficiently, please raise issue following the template.
🐛 Bug
chunk_size: [0, 8, 4], chunk_stride: 7680, total_chunk_num: 8, len: 57344
cut part: 0:7680, is_end: False
rtf_avg: 0.172: 100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 10.49it/s]
streaming res: [{'key': 'rand_key_PS6YwfuNhFLOv', 'text': ''}]
each chunk time: 0.10932087898254395
cut part: 7680:15360, is_end: False
rtf_avg: 0.223: 100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 8.44it/s]
streaming res: [{'key': 'rand_key_2mpbUrhToxYkv', 'text': '我'}]
each chunk time: 0.1370253562927246
cut part: 15360:23040, is_end: False
rtf_avg: 0.249: 100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 7.70it/s]
streaming res: [{'key': 'rand_key_B0dgYj2Soc0KO', 'text': '准'}]
each chunk time: 0.1448209285736084
cut part: 23040:30720, is_end: False
rtf_avg: 0.237: 100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 7.87it/s]
streaming res: [{'key': 'rand_key_67IMRebhOmM1K', 'text': '备'}]
each chunk time: 0.14313936233520508
cut part: 30720:38400, is_end: False
rtf_avg: 0.249: 100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 7.64it/s]
streaming res: [{'key': 'rand_key_hzfx1hcMbm9lT', 'text': '吃'}]
each chunk time: 0.14990878105163574
cut part: 38400:46080, is_end: False
rtf_avg: 0.227: 100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 8.22it/s]
streaming res: [{'key': 'rand_key_hEnUZd7RbIBNg', 'text': '点不一'}]
each chunk time: 0.13858675956726074
cut part: 46080:53760, is_end: False
rtf_avg: 0.233: 100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 8.30it/s]
streaming res: [{'key': 'rand_key_1qeoePtwBldGD', 'text': '样的'}]
each chunk time: 0.14059233665466309
cut part: 53760:61440, is_end: True
rtf_avg: 0.445: 100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 8.64it/s]
streaming res: [{'key': 'rand_key_cMgSzmqw5UE15', 'text': '东西'}]
each chunk time: 0.12980437278747559
Recording started for hardware trigger.
Waiting for hardware trigger.
Recording stopped.
Recording started for hardware trigger.
Waiting for hardware trigger.
Recording stopped.
audio_data: (0,), audio_cache: (4096,)
Recording started for hardware trigger.
Waiting for hardware trigger.
Recording stopped.
audio_data: (0,), audio_cache: (4096,)
Recording started for hardware trigger.
Waiting for hardware trigger.
Recording stopped.
audio_data: (0,), audio_cache: (4096,)
Recording started for hardware trigger.
Waiting for hardware trigger.
Recording stopped.
audio_data: (0,), audio_cache: (4096,)
Recording started for hardware trigger.
Waiting for hardware trigger.
Recording stopped.
audio_data: (4096,), audio_cache: (4096,)
chunk_size: [0, 8, 4], chunk_stride: 7680, total_chunk_num: 2, len: 8192
cut part: 0:7680, is_end: False
rtf_avg: 0.184: 100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 9.85it/s]
streaming res: [{'key': 'rand_key_6KkSRn9XdYRk2', 'text': ''}]
each chunk time: 0.11647748947143555
**cut part: 7680:15360, is_end: True**
0%| | 0/1 [00:00<?, ?it/s]Traceback (most recent call last):
File "d:\CodeSoftWare\Anaconda\anaconda3\envs\vits\lib\runpy.py", line 194, in _run_module_as_main
return _run_code(code, main_globals, None,
File "d:\CodeSoftWare\Anaconda\anaconda3\envs\vits\lib\runpy.py", line 87, in _run_code
exec(code, run_globals)
File "c:\Users\Irving Gao\.vscode\extensions\ms-python.debugpy-2024.3.10721008-win32-x64\bundled\libs\debugpy\adapter/../..\debugpy\launcher/../..\debugpy\__main__.py", line 39, in <module>
cli.main()
File "c:\Users\Irving Gao\.vscode\extensions\ms-python.debugpy-2024.3.10721008-win32-x64\bundled\libs\debugpy\adapter/../..\debugpy\launcher/../..\debugpy/..\debugpy\server\cli.py", line 430, in main
run()
File "c:\Users\Irving Gao\.vscode\extensions\ms-python.debugpy-2024.3.10721008-win32-x64\bundled\libs\debugpy\adapter/../..\debugpy\launcher/../..\debugpy/..\debugpy\server\cli.py", line 284, in run_file
runpy.run_path(target, run_name="__main__")
File "c:\Users\Irving Gao\.vscode\extensions\ms-python.debugpy-2024.3.10721008-win32-x64\bundled\libs\debugpy\_vendored\pydevd\_pydevd_bundle\pydevd_runpy.py", line 321, in run_path
return _run_module_code(code, init_globals, run_name,
File "c:\Users\Irving Gao\.vscode\extensions\ms-python.debugpy-2024.3.10721008-win32-x64\bundled\libs\debugpy\_vendored\pydevd\_pydevd_bundle\pydevd_runpy.py", line 135, in _run_module_code
_run_code(code, mod_globals, init_globals,
File "c:\Users\Irving Gao\.vscode\extensions\ms-python.debugpy-2024.3.10721008-win32-x64\bundled\libs\debugpy\_vendored\pydevd\_pydevd_bundle\pydevd_runpy.py", line 124, in _run_code
exec(code, run_globals)
File "g:\WorkSpace\CodeWorkspace\GPT_projects\vits_project\vits-uma-genshin-honkai\tools\tests\test_funasr.py", line 5, in <module>
test_asr_microphone()
File "g:\workspace\codeworkspace\gpt_projects\vits_project\vits-uma-genshin-honkai\takway\tests\funasr_audio.py", line 31, in test_asr_microphone
text_dict = asr.streaming_recognize(data, auto_det_end=True)
File "g:\workspace\codeworkspace\gpt_projects\vits_project\vits-uma-genshin-honkai\takway\stt\funasr_utils.py", line 114, in streaming_recognize
res = self.asr_model.generate(input=speech_chunk, cache=self.asr_cache, is_final=is_end, chunk_size=self.chunk_size, encoder_chunk_look_back=self.encoder_chunk_look_back, decoder_chunk_look_back=self.decoder_chunk_look_back)
File "d:\CodeSoftWare\Anaconda\anaconda3\envs\vits\lib\site-packages\funasr\auto\auto_model.py", line 202, in generate
return self.inference(input, input_len=input_len, **cfg)
File "d:\CodeSoftWare\Anaconda\anaconda3\envs\vits\lib\site-packages\funasr\auto\auto_model.py", line 237, in inference
results, meta_data = model.inference(**batch, **kwargs)
File "d:\CodeSoftWare\Anaconda\anaconda3\envs\vits\lib\site-packages\funasr\models\paraformer_streaming\model.py", line 545, in inference
File "d:\CodeSoftWare\Anaconda\anaconda3\envs\vits\lib\site-packages\funasr\models\scama\encoder.py", line 437, in forward_chunk
xs_pad = self.embed(xs_pad, cache)
File "d:\CodeSoftWare\Anaconda\anaconda3\envs\vits\lib\site-packages\torch\nn\modules\module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "d:\CodeSoftWare\Anaconda\anaconda3\envs\vits\lib\site-packages\torch\nn\modules\module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "d:\CodeSoftWare\Anaconda\anaconda3\envs\vits\lib\site-packages\funasr\models\transformer\embedding.py", line 430, in forward
batch_size, timesteps, input_dim = x.size()
ValueError: not enough values to unpack (expected 3, got 1)
To Reproduce
Steps to reproduce the behavior (always include the command you ran):
1. Run cmd '....'
2. See error
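Concretely, the error happens when running the microphone test (`tools/tests/test_funasr.py`, which calls `test_asr_microphone()` in `takway/tests/funasr_audio.py`). The sketch below is only an approximation of that driver: the real recorder is started and stopped by a hardware trigger, while here a plain `pyaudio` loop with a hypothetical `CHUNK = 4096` (the shape seen in the log) stands in for it. The `streaming_recognize(data, auto_det_end=True)` call is the same one that appears in the traceback.

```python
# Simplified stand-in for test_asr_microphone(); the real recorder is driven by a
# hardware trigger, this sketch just reads fixed-size chunks from a pyaudio stream.
import pyaudio

from takway.stt.funasr_utils import FunAutoSpeechRecognizer

RATE = 16000
CHUNK = 4096  # samples per read; matches the (4096,) shapes in the log (assumption)

asr = FunAutoSpeechRecognizer(model_path="paraformer-zh-streaming", chunk_ms=480)

pa = pyaudio.PyAudio()
stream = pa.open(format=pyaudio.paInt16, channels=1, rate=RATE,
                 input=True, frames_per_buffer=CHUNK)
try:
    while True:
        data = stream.read(CHUNK)  # raw int16 bytes
        # auto_det_end=True is what produces the extra, very short final chunk
        text_dict = asr.streaming_recognize(data, auto_det_end=True)
        print(text_dict)
finally:
    stream.stop_stream()
    stream.close()
    pa.terminate()
```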
Code sample
```python
import io
import time
import wave

import numpy as np
from funasr import AutoModel

from takway.stt.base_stt import STTBase
# NOTE: decode_str2bytes (used in check_audio_type) comes from the takway utils
# and is assumed to be importable in this module.


class FunAutoSpeechRecognizer(STTBase):
    def __init__(self,
                 model_path="paraformer-zh-streaming",
                 device="cuda",
                 RATE=16000,
                 cfg_path=None,
                 debug=False,
                 chunk_ms=480,
                 encoder_chunk_look_back=4,
                 decoder_chunk_look_back=1,
                 **kwargs):
        super().__init__(RATE=RATE, cfg_path=cfg_path, debug=debug)
        self.asr_model = AutoModel(model=model_path, device=device, **kwargs)

        self.encoder_chunk_look_back = encoder_chunk_look_back  # number of chunks to look back for encoder self-attention
        self.decoder_chunk_look_back = decoder_chunk_look_back  # number of encoder chunks to look back for decoder cross-attention

        # [0, 8, 4] -> 480 ms, [0, 10, 5] -> 600 ms
        if chunk_ms == 480:
            self.chunk_size = [0, 8, 4]
        elif chunk_ms == 600:
            self.chunk_size = [0, 10, 5]
        else:
            raise ValueError("`chunk_ms` should be 480 or 600, and type is int.")
        self.chunk_partial_size = self.chunk_size[1] * 960

        self.audio_cache = None
        self.asr_cache = {}

    def check_audio_type(self, audio_data):
        """Check the audio data type and convert it to a numpy int16 array if necessary."""
        if isinstance(audio_data, bytes):
            pass
        elif isinstance(audio_data, list):
            audio_data = b''.join(audio_data)
        elif isinstance(audio_data, str):
            audio_data = decode_str2bytes(audio_data)
        elif isinstance(audio_data, io.BytesIO):
            wf = wave.open(audio_data, 'rb')
            audio_data = wf.readframes(wf.getnframes())

        if isinstance(audio_data, bytes):
            audio_data = np.frombuffer(audio_data, dtype=np.int16)
        else:
            raise TypeError(f"audio_data must be bytes, str or io.BytesIO, but got {type(audio_data)}")
        return audio_data

    def recognize(self, audio_data):
        """Recognize audio data to text (offline, non-streaming)."""
        audio_data = self.check_audio_type(audio_data)
        result = self.asr_model.generate(input=audio_data,
                                         batch_size_s=300,
                                         hotword=self.keywords)
        print(result)
        return result

    def streaming_recognize(self,
                            audio_data,
                            is_end=False,
                            auto_det_end=False):
        """Recognize a partial (streaming) result.

        Args:
            audio_data: bytes or numpy array, partial audio data
            is_end: bool, whether the audio data is the end of a sentence
            auto_det_end: bool, whether to automatically detect the end of the audio data
        """
        text_dict = dict(text=[], is_end=is_end)

        audio_data = self.check_audio_type(audio_data)
        if self.audio_cache is None:
            self.audio_cache = audio_data
        else:
            print(f"audio_data: {audio_data.shape}, audio_cache: {self.audio_cache.shape}")
            if self.audio_cache.shape[0] > 0:
                self.audio_cache = np.concatenate([self.audio_cache, audio_data], axis=0)

        if not is_end and self.audio_cache.shape[0] < self.chunk_partial_size:
            return text_dict

        total_chunk_num = int((len(self.audio_cache) - 1) / self.chunk_partial_size)

        if is_end:
            # If the audio data is the end of a sentence, add one more chunk
            # so that the end of the sentence is recognized correctly.
            auto_det_end = True

        if auto_det_end:
            total_chunk_num += 1

        print(f"chunk_size: {self.chunk_size}, chunk_stride: {self.chunk_partial_size}, total_chunk_num: {total_chunk_num}, len: {len(self.audio_cache)}")
        for i in range(total_chunk_num):
            if auto_det_end:
                is_end = i == total_chunk_num - 1
            start_idx = i * self.chunk_partial_size
            if auto_det_end:
                end_idx = (i + 1) * self.chunk_partial_size if i < total_chunk_num - 1 else -1
            else:
                end_idx = (i + 1) * self.chunk_partial_size if i < total_chunk_num else -1
            print(f"cut part: {start_idx}:{end_idx}, is_end: {is_end}, i: {i}, total_chunk_num: {total_chunk_num}")
            t_stamp = time.time()

            speech_chunk = self.audio_cache[start_idx:end_idx]
            # speech_chunk = self.audio_cache[i*self.chunk_partial_size:(i+1)*self.chunk_partial_size].copy()  # make a writable copy
            # assert speech_chunk.flags.writeable == True, "speech_chunk should be a writable array"
            res = self.asr_model.generate(input=speech_chunk,
                                          cache=self.asr_cache,
                                          is_final=is_end,
                                          chunk_size=self.chunk_size,
                                          encoder_chunk_look_back=self.encoder_chunk_look_back,
                                          decoder_chunk_look_back=self.decoder_chunk_look_back)
            # print(f"asr_cache: {self.asr_cache.flags.writeable}")
            # self.asr_cache['encoder']['feats'].flags.writeable
            # self.asr_cache['encoder']
            print(f"streaming res: {res}")
            text_dict['text'].append(self.text_postprecess(res[0], data_id='text'))
            print(f"each chunk time: {time.time() - t_stamp}")

        if is_end:
            self.audio_cache = None
            self.asr_cache = {}
        else:
            self.audio_cache = self.audio_cache[end_idx:]  # cut the processed part from audio_cache
        # print(f"text_dict: {text_dict}")
        return text_dict
```
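For reference, the chunk arithmetic behind the numbers in the log (this just restates what `chunk_partial_size = self.chunk_size[1] * 960` computes; the 512-sample figure comes from the `len: 8192` line of the failing call):

```python
# chunk_size = [0, 8, 4]: 8 encoder frames per chunk, 960 samples (60 ms at 16 kHz) per frame
chunk_stride = 8 * 960                   # 7680 samples -> "chunk_stride: 7680" in the log
chunk_ms = chunk_stride / 16000 * 1000   # 480.0 ms per chunk
# Failing call: the cache holds 8192 samples ("len: 8192"), so the extra end chunk
# added by auto_det_end covers only 8192 - 7680 = 512 samples (~32 ms). That very
# short final chunk is what goes into asr_model.generate() right before the crash.
```

I am not sure whether such a short final chunk is supposed to be supported by the streaming Paraformer model, or whether I should pad or skip it on my side.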
Expected behavior
The final chunk (when `is_end=True` / `auto_det_end=True`) should be recognized and returned like the earlier chunks, instead of raising `ValueError: not enough values to unpack`.
Environment
- How you installed funasr (pip, source): pip