Open wxzz opened 3 weeks ago
前面文章参考:克隆人像和声音,metahuman与GPT-SoVITS接口打通,过程汇总(https://github.com/lipku/metahuman-stream/issues/210)
1.流程概要:前台语音录制并上传到后台->asr语音识别->LLM对话模型->metahuman呈现 2.增加asr语音识别库 下载模型:https://openaipublic.azureedge.net/main/whisper/models/9ecf779972d90ba49c06d968637d720dd632c55bbf19d441fb42bf17a411e794/small.pt 把模型名称改为:whisper_small.pt,在metahuman工程根目录新建目录“sound”,并把模型文件复制到“sound”目录下。
在“sound”目录下新建:speech2text.py 程序文件,代码实现如下: import os import whisper
# Environment configuration — must run before the model is loaded.
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"  # HuggingFace mirror for restricted networks
os.environ["CUDA_VISIBLE_DEVICES"] = "2"  # pin inference to GPU index 2 — adjust per machine
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"  # suppress TensorFlow oneDNN notice output

# Load the local Whisper checkpoint once at import time; it is shared by every call.
model = whisper.load_model("sound/whisper_small.pt") #['tiny.en', 'tiny', 'base.en', 'base', 'small.en', 'small', 'medium.en', 'medium', 'large-v1', 'large-v2', 'large-v3', 'large']
def speech2text(speech_file):
    """Transcribe an audio file to Chinese text using the preloaded Whisper model.

    :param speech_file: path to the audio file to transcribe
    :return: the transcribed text
    """
    transcription = model.transcribe(
        speech_file,
        fp16=False,       # force fp32 decoding (safe on CPU)
        verbose=True,
        language="Chinese",
    )
    return transcription["text"]
3.增加LLM对话模型,我们使用百度的文心一言 在llm目录增加:yiyan_sdk.py 程序文件,代码实现如下: import os import qianfan
# Baidu Qianfan credentials. The values here are truncated placeholders —
# supply your own AK/SK, and never commit real keys to version control.
os.environ["QIANFAN_AK"] = "fLdNo"
os.environ["QIANFAN_SK"] = "mXjXq"
def yiyan_chat(user, msg):
    """Send a single-turn chat message to Baidu ERNIE (completions_pro).

    :param user: role string for the message (e.g. "user")
    :param msg: message content
    :return: the model's reply text
    """
    chat_messages = [{"role": "" + user + "", "content": "" + msg + ""}]
    resp = qianfan.ChatCompletion().do(
        endpoint="completions_pro",
        messages=chat_messages,
        temperature=0.95,
        top_p=0.8,
        penalty_score=1,
        disable_search=False,
        enable_citation=False,
    )
    return resp.body["result"]
4.修改前端界面:webrtcapi.html,增加代码。
<!-- Upload button (sends the recorded audio to /uploadsound via upload_sound())
     and a playback widget so the user can review the recording first. -->
<button id="upload_sound_btn">对话</button> <audio class="mt-1" style="width: 300px;height: 40px;" id="audioPlayer" controls></audio>
日志:
5.修改js操作代码:client.js,增加代码如下:

// Recording state shared by the start/stop/upload handlers.
let mediaRecorder;
let audioChunks = [];

// Hold-to-record wiring: mousedown starts capture, mouseup stops it.
const chatButton = document.getElementById('chat_btn');
const audioPlayer = document.getElementById('audioPlayer');
chatButton.addEventListener('mousedown', start_sound);
chatButton.addEventListener('mouseup', stop_sound);

// Click sends the finished recording; replies are appended to the chat log.
const uploadRecordButton = document.getElementById('upload_sound_btn');
uploadRecordButton.addEventListener('click', upload_sound);
const chatlogText = document.getElementById('chatlog');
function start_sound() {
  // Begin capturing microphone audio; chunks accumulate until stop_sound().
  audioChunks = [];
  navigator.mediaDevices.getUserMedia({ audio: true })
    .then(stream => {
      mediaRecorder = new MediaRecorder(stream);
      mediaRecorder.ondataavailable = event => {
        if (event.data.size > 0) {
          audioChunks.push(event.data);
        }
      };
      mediaRecorder.onstop = () => {
        // Release the microphone. Without stopping the tracks the mic stays
        // live (and the browser's recording indicator stays on) forever.
        stream.getTracks().forEach(track => track.stop());
        const audioBlob = new Blob(audioChunks, { type: 'audio/wav' });
        audioPlayer.src = URL.createObjectURL(audioBlob);
      };
      mediaRecorder.start();
      chatButton.textContent = '录制中...';
      // Clear any previous recording. Assigning null coerces to the string
      // "null" on a media element; removing the attribute is the correct way.
      audioPlayer.removeAttribute('src');
      audioPlayer.load();
    })
    .catch(error => {
      console.error('getUserMedia error:', error);
    });
}
function stop_sound() {
  // Guard: mouseup can fire before getUserMedia resolves (or after a
  // permission denial), in which case mediaRecorder is undefined or
  // already inactive and calling stop() would throw.
  if (mediaRecorder && mediaRecorder.state !== 'inactive') {
    mediaRecorder.stop();
  }
  chatButton.textContent = '录制对话';
}
function upload_sound() { // Upload the recorded question audio to the backend.
  // HTMLMediaElement.src is always a string, so the original `src == null`
  // guard could never be true — check the recorded data itself instead.
  if (audioChunks.length === 0) return;
  const audioBlob = new Blob(audioChunks, { type: 'audio/wav' });
  const formData = new FormData();
  formData.append('audio', audioBlob, 'recorded_audio.wav');
  // `const` here — the original bare assignment created an implicit global.
  const sessionid = document.getElementById('sessionid').value;
  formData.append('sessionid', parseInt(sessionid, 10));
  let index = layer.load();
  fetch('/uploadsound', { method: 'POST', body: formData })
    .then(response => {
      // Surface HTTP errors instead of trying to parse an error page as JSON.
      if (!response.ok) {
        throw new Error(`upload failed: HTTP ${response.status}`);
      }
      return response.json();
    })
    .then(data => {
      layer.close(index);
      chatlogText.value += data['data'] + '\n';
    })
    .catch(error => {
      layer.close(index);
      console.error('Upload error:', error);
    });
}
6.修改后端代码 在 if __name__ == '__main__' 代码块中增加: appasync.router.add_route(method='POST',path='/uploadsound',handler=uploadsound)
增加uploadsound函数,代码实现如下: async def uploadsound(request):
try: data= await request.post() sessionid = int(data['sessionid']) fileobj = data["audio"] filename=fileobj.filename dir=os.path.join("sound") if(not os.path.exists(dir)): os.mkdir(dir) save_dir=os.path.join(dir,filename) if(os.path.exists(save_dir)): os.remove(save_dir) filebytes=fileobj.file.read() with open(save_dir,'wb') as file: file.write(filebytes) text=speech2text(save_dir) text_respose=yiyan_chat("user",text) nerfreals[sessionid].pause_talk() nerfreals[sessionid].put_msg_txt(text_respose) return web.Response( content_type="application/json", text=json.dumps( {"code": 0, "msg":"ok","data": ""+"【我】:"+text+"\n"+"【数字我】:"+text_respose+""} ), ) except Exception as e: return web.Response( content_type="application/json", text=json.dumps( {"code": -1, "msg":"err","data": ""+e.args[0]+""} ), )
7.实现效果,如下图
大赞,可以分享下github代码吗?
大佬有错误,有报错。分享一下
完整代码就在上面。
有错误,分享啥。
前面文章参考:克隆人像和声音,metahuman与GPT-SoVITS接口打通,过程汇总(https://github.com/lipku/metahuman-stream/issues/210)
1.流程概要:前台语音录制并上传到后台->asr语音识别->LLM对话模型->metahuman呈现 2.增加asr语音识别库 下载模型:https://openaipublic.azureedge.net/main/whisper/models/9ecf779972d90ba49c06d968637d720dd632c55bbf19d441fb42bf17a411e794/small.pt 把模型名称改为:whisper_small.pt,在metahuman工程根目录新建目录“sound”,并把模型文件复制到“sound”目录下。
# Environment configuration — mirror endpoint, GPU selection, TF notice suppression.
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"

# Load the local Whisper "small" checkpoint once at module import.
model = whisper.load_model("sound/whisper_small.pt") #['tiny.en', 'tiny', 'base.en', 'base', 'small.en', 'small', 'medium.en', 'medium', 'large-v1', 'large-v2', 'large-v3', 'large']

def speech2text(speech_file):
    # Transcribe to Chinese text; fp16=False forces fp32 decoding (CPU-safe).
    result = model.transcribe(speech_file,fp16=False,verbose = True,language="Chinese")
    return result['text']
3.增加LLM对话模型,我们使用百度的文心一言 在llm目录增加:yiyan_sdk.py 程序文件,代码实现如下: import os import qianfan
# Baidu Qianfan credentials — truncated placeholders; supply your own AK/SK
# and never commit real keys to version control.
os.environ["QIANFAN_AK"] = "fLdNo"
os.environ["QIANFAN_SK"] = "mXjXq"

def yiyan_chat(user,msg):
    # Single-turn call to ERNIE completions_pro; returns the reply text.
    resp = qianfan.ChatCompletion().do(endpoint="completions_pro", messages=[{"role":""+user+"","content":""+msg+""}], temperature=0.95, top_p=0.8, penalty_score=1, disable_search=False, enable_citation=False)
    return resp.body["result"]
4.修改前端界面:webrtcapi.html,增加代码。
日志:
5.修改js操作代码:client.js,增加代码如下:

// Recording state shared by the start/stop handlers.
let mediaRecorder;
let audioChunks = [];
// Hold-to-record wiring: mousedown starts capture, mouseup stops it.
const chatButton = document.getElementById('chat_btn');
const audioPlayer = document.getElementById('audioPlayer');
chatButton.addEventListener('mousedown', start_sound);
chatButton.addEventListener('mouseup', stop_sound);
function start_sound() {
  // Begin capturing microphone audio; chunks accumulate until stop_sound().
  audioChunks = [];
  navigator.mediaDevices.getUserMedia({ audio: true })
    .then(stream => {
      mediaRecorder = new MediaRecorder(stream);
      mediaRecorder.ondataavailable = event => {
        if (event.data.size > 0) {
          audioChunks.push(event.data);
        }
      };
      mediaRecorder.onstop = () => {
        // Release the microphone. Without stopping the tracks the mic stays
        // live (and the browser's recording indicator stays on) forever.
        stream.getTracks().forEach(track => track.stop());
        const audioBlob = new Blob(audioChunks, { type: 'audio/wav' });
        audioPlayer.src = URL.createObjectURL(audioBlob);
      };
      mediaRecorder.start();
      chatButton.textContent = '录制中...';
      // Clear any previous recording. Assigning null coerces to the string
      // "null" on a media element; removing the attribute is the correct way.
      audioPlayer.removeAttribute('src');
      audioPlayer.load();
    })
    .catch(error => {
      console.error('getUserMedia error:', error);
    });
}
function stop_sound() {
  // Guard: mouseup can fire before getUserMedia resolves (or after a
  // permission denial), in which case mediaRecorder is undefined or
  // already inactive and calling stop() would throw.
  if (mediaRecorder && mediaRecorder.state !== 'inactive') {
    mediaRecorder.stop();
  }
  chatButton.textContent = '录制对话';
}
// NOTE(review): this quoted listing of upload_sound() is truncated — the
// fetch promise chain is cut off mid-`.then` and never closed, so the
// fragment below is not valid JavaScript as pasted.
function upload_sound() { //上传对话音频 if(audioPlayer.src==null) return ; const audioBlob = new Blob(audioChunks, {type: 'audio/wav'}); const formData = new FormData(); formData.append('audio', audioBlob,'recorded_audio.wav'); sessionid=document.getElementById('sessionid').value; formData.append('sessionid',parseInt(sessionid)); let index = layer.load(); fetch('/uploadsound', { method: 'POST', body: formData }) .then(response => response.json()) .then(data => { layer.close(index);
}
6.修改后端代码 在 if __name__ == '__main__' 代码块中增加: appasync.router.add_route(method='POST',path='/uploadsound',handler=uploadsound)
增加uploadsound函数,代码实现如下: async def uploadsound(request):
7.实现效果,如下图