Closed lonngxiang closed 1 year ago
import os
import subprocess

import numpy as np
import sherpa_ncnn
import sounddevice as sd
from sklearn.preprocessing import MinMaxScaler
def create_recognizer():
    """Create a streaming sherpa-ncnn recognizer from local model files.

    Please replace the model files if needed. See
    https://k2-fsa.github.io/sherpa/ncnn/pretrained_models/index.html
    for download links.

    Returns:
        A configured ``sherpa_ncnn.Recognizer`` instance.
    """
    # base_file = "sherpa-ncnn-conv-emformer-transducer-2022-12-06"
    # base_file = "sherpa-ncnn-lstm-transducer-small-2023-02-13"
    base_file = r"D:\*****s\sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13"
    # base_file = "sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16"
    # base_file = "sherpa-ncnn-streaming-zipformer-20M-2023-02-17"
    #
    # Build paths with os.path.join instead of "{}\name".format(...):
    # bare backslashes in non-raw strings are fragile ("\t" is a tab --
    # which is why tokens.txt needed "\\" -- and "\e", "\d", "\j" are
    # invalid escape sequences).
    recognizer = sherpa_ncnn.Recognizer(
        tokens=os.path.join(base_file, "tokens.txt"),
        encoder_param=os.path.join(base_file, "encoder_jit_trace-pnnx.ncnn.param"),
        encoder_bin=os.path.join(base_file, "encoder_jit_trace-pnnx.ncnn.bin"),
        decoder_param=os.path.join(base_file, "decoder_jit_trace-pnnx.ncnn.param"),
        decoder_bin=os.path.join(base_file, "decoder_jit_trace-pnnx.ncnn.bin"),
        joiner_param=os.path.join(base_file, "joiner_jit_trace-pnnx.ncnn.param"),
        joiner_bin=os.path.join(base_file, "joiner_jit_trace-pnnx.ncnn.bin"),
        num_threads=4,
    )
    return recognizer
# ---- Main script ----------------------------------------------------------
print("Started! Please speak")
recognizer = create_recognizer()

# Audio source: a local file here, but any ffmpeg-readable URL works
# (e.g. a remote RTSP stream: url = "rtsp://...").
url = r'D:\sound\222.mp4'

# Decode the source to raw 16-bit little-endian PCM on stdout.
# NOTE: "-ac 1" is required -- sherpa-ncnn supports only a single
# channel. Without it, a stereo source's interleaved samples are fed to
# the recognizer and the transcription comes out wrong.
ffmpeg_cmd = [
    "ffmpeg",
    "-i", url,
    "-f", "s16le",
    "-acodec", "pcm_s16le",
    "-ac", "1",
    "-ar", "16000",
    "-",
]
process = subprocess.Popen(
    ffmpeg_cmd,
    stdout=subprocess.PIPE,
    stderr=subprocess.DEVNULL,
    bufsize=1600,
)

# Stream parameters: 16 kHz mono, 1600 samples (100 ms) per read.
sample_rate = 16000
channels = 1
frames_per_read = 1600

# The sounddevice input stream is only used to pace the loop roughly in
# real time -- the captured microphone data itself is discarded.
stream = sd.InputStream(channels=channels, samplerate=sample_rate)
stream.start()

last_result = ""
first_print_done = False
while True:
    # Each sample is 2 bytes (int16), hence the * 2.
    data = process.stdout.read(frames_per_read * channels * 2)
    if not data:
        break

    # Convert raw PCM bytes to float32 samples normalized to [-1, 1],
    # the format accept_waveform expects.
    samples = np.frombuffer(data, dtype=np.int16)
    samples = samples.astype(np.float32)
    samples /= 32768.0

    recognizer.accept_waveform(sample_rate, samples)
    result = recognizer.text
    if last_result != result:
        if not first_print_done:
            # First hypothesis: print it whole.
            print("{}".format(result), end='')
            last_result = result
            first_print_done = True
        else:
            # Later hypotheses: print only the newly appended suffix so
            # the transcript grows incrementally on one line.
            new_word = result[len(last_result):]
            print("{}".format(new_word), end='', flush=True)
            last_result = result

    # Read (and discard) one chunk from the microphone to pace the loop.
    # sounddevice's InputStream.read returns (data, overflowed).
    input_data, overflowed = stream.read(frames_per_read)

# Cleanup: stop the pacing stream and shut down the ffmpeg child.
stream.stop()
stream.close()
process.stdout.close()
process.terminate()
Does sherpa_ncnn support remote network rtsp microphones?
You may find https://github.com/ossrs/srs-k2 helpful.
The transcription results are incorrect
Could you give more details?
Thank you. I'll check it out first
Does sherpa_ncnn support remote network rtsp microphones?
You may find https://github.com/ossrs/srs-k2 helpful.
The transcription results are incorrect
Could you give more details?
Thank you. I'll check it out first
With the code above, the transcription produced by the model does not match the actual audio content of the video.
Please change
ffmpeg_cmd = [
"ffmpeg",
"-i", url,
"-f", "s16le",
"-acodec", "pcm_s16le",
"-ar", "16000",
"-"
]
to
ffmpeg_cmd = [
"ffmpeg",
"-i", url,
"-f", "s16le",
"-acodec", "pcm_s16le",
"-ac", "1",
"-ar", "16000",
"-"
]
It supports only a single channel.
After the fix, everything should work as expected.
Please change
ffmpeg_cmd = [ "ffmpeg", "-i", url, "-f", "s16le", "-acodec", "pcm_s16le", "-ar", "16000", "-" ]
to
ffmpeg_cmd = [ "ffmpeg", "-i", url, "-f", "s16le", "-acodec", "pcm_s16le", "-ac", "1", "-ar", "16000", "-" ]
It supports only a single channel.
After the fix, everything should work as expected.
OK, thanks!
You may find https://github.com/ossrs/srs-k2 helpful.
Could you give more details?