modelscope / FunASR

A Fundamental End-to-End Speech Recognition Toolkit and Open Source SOTA Pretrained Models, Supporting Speech Recognition, Voice Activity Detection, Text Post-processing etc.
https://www.funasr.com

Error when transcribing an audio file #2101

Closed · cpken closed this issue 3 weeks ago

cpken commented 3 weeks ago

An error is raised when transcribing an audio file.

However, the program still runs to completion and the transcription result is correct.

🐛 Bug

ERROR:    Exception in ASGI application
Traceback (most recent call last):
  File "conda_env/FunASR/lib/python3.10/site-packages/uvicorn/protocols/http/h11_impl.py", line 406, in run_asgi
    result = await app(  # type: ignore[func-returns-value]
  File "conda_env/FunASR/lib/python3.10/site-packages/uvicorn/middleware/proxy_headers.py", line 70, in __call__
    return await self.app(scope, receive, send)
  File "conda_env/FunASR/lib/python3.10/site-packages/fastapi/applications.py", line 1054, in __call__
    await super().__call__(scope, receive, send)
  File "conda_env/FunASR/lib/python3.10/site-packages/starlette/applications.py", line 123, in __call__
    await self.middleware_stack(scope, receive, send)
  File "conda_env/FunASR/lib/python3.10/site-packages/starlette/middleware/errors.py", line 186, in __call__
    raise exc
  File "conda_env/FunASR/lib/python3.10/site-packages/starlette/middleware/errors.py", line 164, in __call__
    await self.app(scope, receive, _send)
  File "conda_env/FunASR/lib/python3.10/site-packages/gradio/route_utils.py", line 760, in __call__
    await self.app(scope, receive, send)
  File "conda_env/FunASR/lib/python3.10/site-packages/starlette/middleware/exceptions.py", line 65, in __call__
    await wrap_app_handling_exceptions(self.app, conn)(scope, receive, send)
  File "conda_env/FunASR/lib/python3.10/site-packages/starlette/_exception_handler.py", line 64, in wrapped_app
    raise exc
  File "conda_env/FunASR/lib/python3.10/site-packages/starlette/_exception_handler.py", line 53, in wrapped_app
    await app(scope, receive, sender)
  File "conda_env/FunASR/lib/python3.10/site-packages/starlette/routing.py", line 754, in __call__
    await self.middleware_stack(scope, receive, send)
  File "conda_env/FunASR/lib/python3.10/site-packages/starlette/routing.py", line 774, in app
    await route.handle(scope, receive, send)
  File "conda_env/FunASR/lib/python3.10/site-packages/starlette/routing.py", line 295, in handle
    await self.app(scope, receive, send)
  File "conda_env/FunASR/lib/python3.10/site-packages/starlette/routing.py", line 77, in app
    await wrap_app_handling_exceptions(app, request)(scope, receive, send)
  File "conda_env/FunASR/lib/python3.10/site-packages/starlette/_exception_handler.py", line 64, in wrapped_app
    raise exc
  File "conda_env/FunASR/lib/python3.10/site-packages/starlette/_exception_handler.py", line 53, in wrapped_app
    await app(scope, receive, sender)
  File "conda_env/FunASR/lib/python3.10/site-packages/starlette/routing.py", line 75, in app
    await response(scope, receive, send)
  File "conda_env/FunASR/lib/python3.10/site-packages/starlette/responses.py", line 348, in __call__
    await send(
  File "conda_env/FunASR/lib/python3.10/site-packages/starlette/_exception_handler.py", line 50, in sender
    await send(message)
  File "conda_env/FunASR/lib/python3.10/site-packages/starlette/_exception_handler.py", line 50, in sender
    await send(message)
  File "conda_env/FunASR/lib/python3.10/site-packages/starlette/middleware/errors.py", line 161, in _send
    await send(message)
  File "conda_env/FunASR/lib/python3.10/site-packages/uvicorn/protocols/http/h11_impl.py", line 510, in send
    output = self.conn.send(event=h11.EndOfMessage())
  File "conda_env/FunASR/lib/python3.10/site-packages/h11/_connection.py", line 512, in send
    data_list = self.send_with_data_passthrough(event)
  File "conda_env/FunASR/lib/python3.10/site-packages/h11/_connection.py", line 545, in send_with_data_passthrough
    writer(event, data_list.append)
  File "conda_env/FunASR/lib/python3.10/site-packages/h11/_writers.py", line 67, in __call__
    self.send_eom(event.headers, write)
  File "conda_env/FunASR/lib/python3.10/site-packages/h11/_writers.py", line 96, in send_eom
    raise LocalProtocolError("Too little data for declared Content-Length")
h11._util.LocalProtocolError: Too little data for declared Content-Length
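The final h11 error means the ASGI app ended the HTTP response after writing fewer body bytes than its Content-Length header declared. A minimal sketch (not from this issue; plain h11, no FunASR or Gradio involved) that triggers the same LocalProtocolError:

import h11

# Server-side connection; feed it one complete GET request.
conn = h11.Connection(our_role=h11.SERVER)
conn.receive_data(b"GET / HTTP/1.1\r\nHost: example\r\n\r\n")
assert isinstance(conn.next_event(), h11.Request)

# Declare a 10-byte body but write only 3 bytes before ending the message.
conn.send(h11.Response(status_code=200, headers=[("content-length", "10")]))
conn.send(h11.Data(data=b"abc"))
conn.send(h11.EndOfMessage())  # LocalProtocolError: Too little data for declared Content-Length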

Environment

Code

# coding=utf-8
import os
import sys
import gradio as gr
import numpy as np
import torch
import torchaudio
from funasr import AutoModel

# Paths to the local ASR, VAD, and punctuation models
model_path = "./models/iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
print('model_path:', model_path)
vad_model = "./models/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch" # fsmn-vad
punc_model = "./models/iic/punc_ct-transformer_cn-en-common-vocab471067-large" # ct-punc-c

model = AutoModel(
    model=model_path,
    vad_model=vad_model,
    punc_model=punc_model,
    ngpu=1,  # 0 for cpu, 1 for gpu
    ncpu=4,
    device="cuda",  # cuda, cpu
    disable_pbar=True,
    disable_log=True,
    disable_update=True,  # disable automatic model updates
)

emo_dict = {
    "<|HAPPY|>": "😊",
    "<|SAD|>": "😔",
    "<|ANGRY|>": "😡",
    "<|NEUTRAL|>": "",
    "<|FEARFUL|>": "😰",
    "<|DISGUSTED|>": "🤢",
    "<|SURPRISED|>": "😮",
}

event_dict = {
    "<|BGM|>": "🎼",
    "<|Speech|>": "",
    "<|Applause|>": "👏",
    "<|Laughter|>": "😀",
    "<|Cry|>": "😭",
    "<|Sneeze|>": "🤧",
    "<|Breath|>": "",
    "<|Cough|>": "🤧",
}

emoji_dict = {
    "<|nospeech|><|Event_UNK|>": "❓",
    "<|zh|>": "",
    "<|en|>": "",
    "<|yue|>": "",
    "<|ja|>": "",
    "<|ko|>": "",
    "<|nospeech|>": "",
    "<|HAPPY|>": "😊",
    "<|SAD|>": "😔",
    "<|ANGRY|>": "😡",
    "<|NEUTRAL|>": "",
    "<|BGM|>": "🎼",
    "<|Speech|>": "",
    "<|Applause|>": "👏",
    "<|Laughter|>": "😀",
    "<|FEARFUL|>": "😰",
    "<|DISGUSTED|>": "🤢",
    "<|SURPRISED|>": "😮",
    "<|Cry|>": "😭",
    "<|EMO_UNKNOWN|>": "",
    "<|Sneeze|>": "🤧",
    "<|Breath|>": "",
    "<|Cough|>": "😷",
    "<|Sing|>": "",
    "<|Speech_Noise|>": "",
    "<|withitn|>": "",
    "<|woitn|>": "",
    "<|GBG|>": "",
    "<|Event_UNK|>": "",
}

lang_dict = {
    "<|zh|>": "<|lang|>",
    "<|en|>": "<|lang|>",
    "<|yue|>": "<|lang|>",
    "<|ja|>": "<|lang|>",
    "<|ko|>": "<|lang|>",
    "<|nospeech|>": "<|lang|>",
}

emo_set = {"😊", "😔", "😡", "😰", "🤢", "😮"}
event_set = {"🎼", "👏", "😀", "😭", "🤧", "😷", }

def format_str(s):
    for sptk in emoji_dict:
        s = s.replace(sptk, emoji_dict[sptk])
    return s

def format_str_v2(s):
    sptk_dict = {}
    for sptk in emoji_dict:
        sptk_dict[sptk] = s.count(sptk)
        s = s.replace(sptk, "")
    emo = "<|NEUTRAL|>"
    for e in emo_dict:
        if sptk_dict[e] > sptk_dict[emo]:
            emo = e
    for e in event_dict:
        if sptk_dict[e] > 0:
            s = event_dict[e] + s
    s = s + emo_dict[emo]

    for emoji in emo_set.union(event_set):
        s = s.replace(" " + emoji, emoji)
        s = s.replace(emoji + " ", emoji)
    return s.strip()

def format_str_v3(s):
    def get_emo(s):
        return s[-1] if s[-1] in emo_set else None

    def get_event(s):
        return s[0] if s[0] in event_set else None

    s = s.replace("<|nospeech|><|Event_UNK|>", "❓")
    for lang in lang_dict:
        s = s.replace(lang, "<|lang|>")
    s_list = [format_str_v2(s_i).strip(" ") for s_i in s.split("<|lang|>")]
    new_s = " " + s_list[0]
    cur_ent_event = get_event(new_s)
    for i in range(1, len(s_list)):
        if len(s_list[i]) == 0:
            continue
        if get_event(s_list[i]) == cur_ent_event and get_event(s_list[i]) is not None:
            s_list[i] = s_list[i][1:]
        cur_ent_event = get_event(s_list[i])
        if get_emo(s_list[i]) is not None and get_emo(s_list[i]) == get_emo(new_s):
            new_s = new_s[:-1]
        new_s += s_list[i].strip()
    new_s = new_s.replace("The.", " ")
    return new_s.strip()
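
# Example trace for format_str_v3 (hypothetical tags, not real model output):
#   s = "<|zh|><|HAPPY|><|Speech|>你好 <|en|><|HAPPY|><|Speech|>hello"
# Both segments carry the same emotion, so the merged result keeps a single
# trailing emoji: "你好hello😊".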

def truncate_audio(audio, sample_rate: int = 16000, max_length: int = 30):
    """Truncate audio to at most max_length seconds."""
    # Actual duration of the audio in seconds
    actual_length = audio.shape[0] / sample_rate
    print('audio length (s):', actual_length)
    # If the audio exceeds max_length, keep only the first max_length seconds
    if actual_length > max_length:
        audio = audio[:int(max_length * sample_rate)]
    return audio
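
# Quick sanity check for truncate_audio (hypothetical values): a 45 s
# mono clip at 16 kHz is cut to the first 30 s, i.e. 480000 samples:
#   wav = np.zeros(45 * 16000, dtype=np.int16)
#   truncate_audio(wav, 16000, 30).shape  # -> (480000,)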

def model_inference(input_wav, wav_upload, language: str = "auto", fs: int = 16000):
    if isinstance(input_wav, tuple):
        # Microphone input arrives as (sample_rate, samples), e.g.
        # (44100, array([ 0,  0,  0, ..., 73, 80, 74], dtype=int16))
        fs, input_wav = input_wav
        # Truncate to at most 30 seconds
        input_wav = truncate_audio(input_wav, fs, 30)
        input_wav = input_wav.astype(np.float32) / np.iinfo(np.int16).max
        if len(input_wav.shape) > 1:
            input_wav = input_wav.mean(-1)
        if fs != 16000:
            print(f"audio_fs: {fs}")
            resampler = torchaudio.transforms.Resample(fs, 16000)
            input_wav_t = torch.from_numpy(input_wav).to(torch.float32)
            input_wav = resampler(input_wav_t[None, :])[0, :].numpy()
    elif wav_upload and os.path.exists(wav_upload):
        input_wav = wav_upload
    else:
        # gr.Error must be raised (not just constructed) to surface in the UI
        raise gr.Error("Invalid audio")
    print('input_wav:')
    print(input_wav)
    merge_vad = True
    print(f"language: {language}, merge_vad: {merge_vad}")
    text = model.generate(input=input_wav,
                          cache={},
                          language=language,
                          use_itn=True,
                          batch_size_s=60,
                          merge_vad=merge_vad,
                          ban_emo_unk=False,
                          merge_length_s=15)
    print(text)
    text = text[0]["text"]
    text = format_str_v3(text)
    print(text)
    return text

def launch():
    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown("## FunASR Demo")
        with gr.Group():
            with gr.Row():
                audio_inputs = gr.Microphone(
                    label="Speak (30 seconds max)",
                    format="wav",                # audio format
                    interactive=True,            # allow editing
                    autoplay=False,              # no autoplay
                    show_download_button=False,  # hide the download button
                    max_length=30,               # maximum recording length in seconds
                    loop=False,                  # no looped playback
                    scale=2
                )
                wav_upload = gr.Audio(
                    sources="upload",
                    type="filepath",
                    label="Choose an audio file (sample rate of at least 16 kHz)",
                    scale=2
                )
            with gr.Row():
                fn_button = gr.Button("Start", variant="primary")
            with gr.Row():
                text_outputs = gr.Textbox(label="Result")
        fn_button.click(fn=model_inference, inputs=[audio_inputs, wav_upload], outputs=text_outputs)
    demo.title = 'FunASR Demo'
    demo.launch(
        server_name="0.0.0.0",
        server_port=9527,
        inbrowser=False,
        share=False,
        ssl_verify=False
    )

if __name__ == "__main__":
    launch()
LauraGPT commented 3 weeks ago

Can you reproduce the issue using the example from the documentation?
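
For reference, the quick-start from the FunASR README looks roughly like the sketch below (a sketch, assuming the README's model aliases "paraformer-zh", "fsmn-vad", and "ct-punc", which resolve to the same paraformer/VAD/punctuation models loaded above; "asr_example.wav" is a placeholder path):

from funasr import AutoModel

model = AutoModel(model="paraformer-zh", vad_model="fsmn-vad", punc_model="ct-punc")
res = model.generate(input="asr_example.wav", batch_size_s=60)
print(res)  # a list of result dicts; the transcript is in res[0]["text"]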

cpken commented 3 weeks ago

After updating to the latest code today, the error no longer occurs.