Open Upcreat opened 3 months ago
这个问题的原因是 funasr 不支持单声道以外(如立体声)的音频。
在参考此方法解决前确保安装了依赖:pydub
参考方案:修改 funasr/auto/auto_model.py 文件,
将其中的 prepare_data_iterator 函数修改为以下内容:
def _audio_bytes_to_mono_pcm16(audio_bytes):
    """Decode encoded audio bytes with pydub and return mono 16-bit PCM bytes.

    Raises:
        ValueError: If pydub cannot decode ``audio_bytes``.
    """
    # Imported lazily so pydub is only required when bytes input is used.
    import io

    from pydub import AudioSegment

    try:
        audio = AudioSegment.from_file(io.BytesIO(audio_bytes))
    except Exception as e:
        raise ValueError(f"无法加载音频文件: {e}") from e

    # Convert the sample width first: casting wider samples straight to
    # int16 would wrap around instead of scaling.
    if audio.sample_width != 2:
        audio = audio.set_sample_width(2)

    samples = np.array(audio.get_array_of_samples())
    # Samples are interleaved per frame; average every channel (not just
    # stereo) down to a single one.
    if audio.channels > 1:
        samples = samples.reshape(-1, audio.channels).mean(axis=1).round()

    # NOTE(review): the sample rate is left as decoded — confirm whether
    # load_bytes / the model expect a fixed rate and resample here if so.
    return samples.astype(np.int16).tobytes()


def prepare_data_iterator(data_in, input_len=None, data_type=None, key=None):
    """Normalise one inference input into parallel lists of keys and data.

    Args:
        data_in: One of
            * an ``http://`` / ``https://`` URL (passed through
              ``download_from_url`` first);
            * a path to an existing list file (``.scp``, ``.txt``, ``.json``,
              ``.jsonl``, ``.text``), read line by line. ``.jsonl`` lines are
              JSON objects with a ``"source"`` entry and an optional
              ``"key"``; other list files hold ``id data`` or just ``data``;
            * a path to any other existing file, passed through unchanged;
            * a list/tuple of inputs;
            * encoded audio ``bytes``, decoded with pydub and then handed to
              ``load_bytes``;
            * anything else, passed through unchanged.
        input_len: Not used by this function; kept for interface
            compatibility.
        data_type: When this is a list/tuple and ``data_in`` is a list/tuple,
            each ``data_in`` element is prepared separately and the results
            are zipped into tuples.
        key: Optional key to use where the input itself does not provide one.

    Returns:
        A ``(key_list, data_list)`` tuple.

    Raises:
        ValueError: If audio bytes cannot be decoded.
    """
    list_file_extensions = (".scp", ".txt", ".json", ".jsonl", ".text")
    chars = string.ascii_letters + string.digits

    def _random_key():
        return "rand_key_" + "".join(random.choice(chars) for _ in range(13))

    data_list = []
    key_list = []

    if isinstance(data_in, str) and data_in.startswith(("http://", "https://")):
        data_in = download_from_url(data_in)

    if isinstance(data_in, str) and os.path.exists(data_in):
        file_extension = os.path.splitext(data_in)[1].lower()
        if file_extension in list_file_extensions:
            # Use the lowercased extension so detection agrees with the
            # case-insensitive list-file test above.
            is_jsonl = file_extension == ".jsonl"
            with open(data_in, encoding="utf-8") as fin:
                for line in fin:
                    line = line.strip()
                    if not line:
                        # Blank lines carry no entry; skipping them avoids
                        # an IndexError / JSON decode error.
                        continue
                    if is_jsonl:
                        record = json.loads(line)
                        data = record["source"]
                        # Look the key up on the record itself. A dict-valued
                        # source carrying its own "key" is still honoured.
                        if "key" in record:
                            line_key = record["key"]
                        elif isinstance(data, dict) and "key" in data:
                            line_key = data["key"]
                        else:
                            line_key = _random_key()
                    else:
                        # "id data" or just "data".
                        fields = line.split(maxsplit=1)
                        if len(fields) > 1:
                            line_key, data = fields
                        else:
                            line_key, data = _random_key(), fields[0]
                    data_list.append(data)
                    key_list.append(line_key)
        else:
            if key is None:
                key = misc.extract_filename_without_extension(data_in)
            data_list = [data_in]
            key_list = [key]
    elif isinstance(data_in, (list, tuple)):
        if data_type is not None and isinstance(data_type, (list, tuple)):
            # Multiple parallel inputs: prepare each one, then zip them so
            # every data item is a tuple with one element per input.
            per_input = []
            for data_in_i, data_type_i in zip(data_in, data_type):
                key_list, data_list_i = prepare_data_iterator(
                    data_in=data_in_i, data_type=data_type_i
                )
                per_input.append(data_list_i)
            data_list = list(zip(*per_input))
        else:
            data_list = data_in
            for data_i in data_in:
                # Choose each item's key independently so a key picked for
                # one item never leaks into the following ones.
                if isinstance(data_i, str) and os.path.exists(data_i):
                    item_key = misc.extract_filename_without_extension(data_i)
                elif key is not None:
                    item_key = key
                else:
                    item_key = _random_key()
                key_list.append(item_key)
    else:
        if isinstance(data_in, bytes):
            data_in = load_bytes(_audio_bytes_to_mono_pcm16(data_in))
        if key is None:
            key = _random_key()
        data_list = [data_in]
        key_list = [key]

    return key_list, data_list
感谢您的回复。我在官方的huggingface看到相关推理介绍,是支持多语音格式推理解析的吧,还是说我的理解有问题。 这是相关链接:https://huggingface.co/FunAudioLLM/SenseVoiceSmall
这个麻烦参考官方示例自己测试下呢,后续有时间了我测试后会更新此issue
您好,请问一下,下面这个示例,如果需要输入的文件是 mp3 格式的,要怎么改呀?直接使用会出现错误:“{"detail":"buffer size must be a multiple of element size"}”。
curl --request POST \
  --url http://127.0.0.1:8000/upload-file/ \
  --header 'content-type: multipart/form-data' \
  --form 'files=@asr_example_zh.wav'