HG-ha / SenseVoice-Api

阿里 SenseVoice 的 FastAPI 封装,采用 ONNX 发布,附带量化模型,支持 GPU。支持从 URL 文件进行语音识别。
42 stars 3 forks source link

语音格式问题 #4

Open Upcreat opened 2 months ago

Upcreat commented 2 months ago

您好,请问一下,下面这个示例,如果需要输入的文件是 mp3 格式的要怎么改呀?直接使用会出现错误:“{"detail":"buffer size must be a multiple of element size"}”。

curl --request POST \
  --url http://127.0.0.1:8000/upload-file/ \
  --header 'content-type: multipart/form-data' \
  --form 'files=@asr_example_zh.wav'

HG-ha commented 2 months ago

这个问题来自于funasr没有对其他声道音频支持

在参考此方法解决前确保安装了依赖:pydub

参考方案: 修改funasr>auto>auto_model.py文件

修改方法 prepare_data_iterator为以下内容

def prepare_data_iterator(data_in, input_len=None, data_type=None, key=None):
    """Normalize an inference input into parallel lists of keys and data items.

    The following shapes of ``data_in`` are handled:

    * ``str`` starting with ``http://`` / ``https://``: first passed through
      ``download_from_url`` and then treated like any other string.
    * ``str`` naming an existing path whose extension is one of
      ``.scp/.txt/.json/.jsonl/.text``: read line by line as a list file.
      ``.jsonl`` lines are JSON objects with a ``"source"`` entry (and an
      optional ``"key"``); other list files hold ``id data`` or just ``data``.
    * ``str`` naming any other existing path: a single item keyed by the
      file name without extension (unless ``key`` is given).
    * ``list`` / ``tuple`` with a ``list`` / ``tuple`` ``data_type``: each
      input is normalized recursively and the results are zipped together.
    * ``list`` / ``tuple`` otherwise: items are used as-is, one key per item.
    * ``bytes``: decoded with pydub, down-mixed to mono 16-bit PCM and handed
      to ``load_bytes``.
    * anything else (raw text, sample arrays, features, ...): a single item.

    Args:
        data_in: The input to normalize (see above).
        input_len: Not used by this function; kept for interface
            compatibility.
        data_type: Optional per-input type list for the multi-input case.
        key: Optional caller-supplied key used where no better key is
            available. Randomly generated ``rand_key_*`` keys are used when
            it is ``None``.

    Returns:
        A ``(key_list, data_list)`` tuple.

    Raises:
        ValueError: If ``data_in`` is ``bytes`` that pydub cannot decode.
    """
    data_list = []
    key_list = []
    filelist = (".scp", ".txt", ".json", ".jsonl", ".text")
    chars = string.ascii_letters + string.digits

    def _rand_key():
        # Fallback identifier for items that carry no natural key.
        return "rand_key_" + "".join(random.choice(chars) for _ in range(13))

    if isinstance(data_in, str) and data_in.startswith(("http://", "https://")):
        data_in = download_from_url(data_in)

    if isinstance(data_in, str) and os.path.exists(data_in):
        # wav_path, or a list file: wav.scp, file.jsonl, text.txt
        _, file_extension = os.path.splitext(data_in)
        if file_extension.lower() in filelist:
            is_jsonl = data_in.endswith(".jsonl")
            with open(data_in, encoding="utf-8") as fin:
                for line in fin:
                    stripped = line.strip()
                    if not stripped:
                        # Blank (e.g. trailing) lines carry no item; skipping
                        # them avoids an IndexError / JSONDecodeError.
                        continue
                    if is_jsonl:  # file.jsonl: json.dumps({"source": data})
                        record = json.loads(stripped)
                        data = record["source"]
                        # The key belongs to the record; looking it up on a
                        # string source would do a substring test and then
                        # fail on indexing. A dict-valued source with its own
                        # "key" keeps taking precedence, as before.
                        if isinstance(data, dict) and "key" in data:
                            line_key = data["key"]
                        elif "key" in record:
                            line_key = record["key"]
                        else:
                            line_key = _rand_key()
                    else:  # wav.scp / text.txt: "id data" or just "data"
                        fields = stripped.split(maxsplit=1)
                        if len(fields) > 1:
                            line_key, data = fields
                        else:
                            line_key, data = _rand_key(), fields[0]

                    data_list.append(data)
                    key_list.append(line_key)
        else:
            if key is None:
                key = misc.extract_filename_without_extension(data_in)
            data_list = [data_in]
            key_list = [key]
    elif isinstance(data_in, (list, tuple)):
        if data_type is not None and isinstance(data_type, (list, tuple)):
            # Multiple inputs: normalize each one, then regroup item-wise.
            # key_list ends up being the keys of the last input.
            data_list_tmp = []
            for data_in_i, data_type_i in zip(data_in, data_type):
                key_list, data_list_i = prepare_data_iterator(
                    data_in=data_in_i, data_type=data_type_i
                )
                data_list_tmp.append(data_list_i)
            data_list = list(zip(*data_list_tmp))
        else:
            # [audio sample point, fbank, text]
            data_list = data_in
            key_list = []
            for data_i in data_in:
                # Choose the key per item so that one item's key never leaks
                # into the next one.
                if isinstance(data_i, str) and os.path.exists(data_i):
                    item_key = misc.extract_filename_without_extension(data_i)
                elif key is not None:
                    item_key = key
                else:
                    item_key = _rand_key()
                key_list.append(item_key)
    else:  # raw text; audio sample point, fbank; bytes
        if isinstance(data_in, bytes):  # encoded audio bytes
            # Imported lazily so pydub is only required for byte input.
            import io

            from pydub import AudioSegment

            try:
                audio = AudioSegment.from_file(io.BytesIO(data_in))
            except Exception as e:
                raise ValueError(f"无法加载音频文件: {e}") from e

            # Rescale to 16-bit samples first; casting 8/24/32-bit sample
            # values straight to int16 would wrap or truncate them.
            if audio.sample_width != 2:
                audio = audio.set_sample_width(2)

            samples = np.array(audio.get_array_of_samples())

            # Samples are interleaved per frame; average all channels to mono.
            if audio.channels > 1:
                samples = samples.reshape(-1, audio.channels).mean(axis=1)

            samples = samples.astype(np.int16)
            # NOTE(review): the sample rate is left untouched here — confirm
            # that load_bytes / the downstream frontend handles it.
            data_in = load_bytes(samples.tobytes())
        if key is None:
            key = _rand_key()
        data_list = [data_in]
        key_list = [key]

    return key_list, data_list
Upcreat commented 2 months ago

感谢您的回复。我在官方的huggingface看到相关推理介绍,是支持多语音格式推理解析的吧,还是说我的理解有问题。 这是相关链接:https://huggingface.co/FunAudioLLM/SenseVoiceSmall image

HG-ha commented 1 month ago

这个麻烦参考官方示例自己测试下呢,后续有时间了我测试后会更新此issue