[Open] anstonjie opened this issue 2 weeks ago
we have provided grpc example in runtime, or you can wrap your own api
Try this example:
Install fastapi first: pip install fastapi
, then fastapi dev --port 3001
import io
import time

from fastapi import FastAPI, Response
from cosyvoice.cli.cosyvoice import CosyVoice
import torchaudio

# Load the SFT model once at startup; every request reuses this instance.
cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-SFT')
# sft usage: list the speakers accepted by inference_sft()
# (method name spelling follows the CosyVoice API).
print(cosyvoice.list_avaliable_spks())

app = FastAPI()


@app.get("/api/voice/tts")
def tts(query: str):
    """Synthesize `query` with the built-in '中文女' SFT speaker; return WAV bytes.

    Declared as a plain `def` (not `async def`) so FastAPI runs the blocking
    model inference in its threadpool instead of stalling the event loop.
    """
    start = time.perf_counter()  # wall clock; process_time() would miss GPU/IO wait
    output = cosyvoice.inference_sft(query, '中文女')
    end = time.perf_counter()
    print("infer time:", end - start)
    buffer = io.BytesIO()
    # 22050 Hz assumed to be the model's output rate -- TODO confirm against model config.
    torchaudio.save(buffer, output['tts_speech'], 22050, format="wav")
    return Response(content=buffer.getvalue(), media_type="audio/wav")


@app.get("/")
async def root():
    """Health-check / hello endpoint."""
    return {"message": "Hello World"}
Try this example:
Install fastapi first:
pip install fastapi
, then run `fastapi dev --port 3001`
import io
import time

from fastapi import FastAPI, Response
from cosyvoice.cli.cosyvoice import CosyVoice
from cosyvoice.utils.file_utils import load_wav
import torchaudio

# Load the SFT model once at import time.
cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-SFT')
# sft usage (method name spelling follows the CosyVoice API)
print(cosyvoice.list_avaliable_spks())

app = FastAPI()


@app.get("/api/voice/tts")
async def tts(query: str):
    """Synthesize `query` with the '中文女' SFT speaker and return WAV bytes."""
    start = time.process_time()
    output = cosyvoice.inference_sft(query, '中文女')
    end = time.process_time()
    print("infer time:", end - start)
    buffer = io.BytesIO()
    torchaudio.save(buffer, output['tts_speech'], 22050, format="wav")
    buffer.seek(0)
    return Response(content=buffer.read(-1), media_type="audio/wav")


@app.get("/")
async def root():
    return {"message": "Hello World"}
we will be happy if you can make a fastapi pr
Try this example: Install fastapi first:
pip install fastapi
, then run `fastapi dev --port 3001`
import io
import time

from fastapi import FastAPI, Response
from cosyvoice.cli.cosyvoice import CosyVoice
from cosyvoice.utils.file_utils import load_wav
import torchaudio

# Load the SFT model once at import time.
cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-SFT')
# sft usage (method name spelling follows the CosyVoice API)
print(cosyvoice.list_avaliable_spks())

app = FastAPI()


@app.get("/api/voice/tts")
async def tts(query: str):
    """Synthesize `query` with the '中文女' SFT speaker and return WAV bytes."""
    start = time.process_time()
    output = cosyvoice.inference_sft(query, '中文女')
    end = time.process_time()
    print("infer time:", end - start)
    buffer = io.BytesIO()
    torchaudio.save(buffer, output['tts_speech'], 22050, format="wav")
    buffer.seek(0)
    return Response(content=buffer.read(-1), media_type="audio/wav")


@app.get("/")
async def root():
    return {"message": "Hello World"}
we will be happy if you can make a fastapi pr
OK, let me create a PR.
thank you
@iflamed why not gradio_client ...
能制作个API吗
同求,看官方提供的grpc api使用有些复杂
Try this example:
Install fastapi first:
pip install fastapi
, then run `fastapi dev --port 3001`
import io
import time

from fastapi import FastAPI, Response
from cosyvoice.cli.cosyvoice import CosyVoice
from cosyvoice.utils.file_utils import load_wav
import torchaudio

# Load the SFT model once at import time.
cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-SFT')
# sft usage (method name spelling follows the CosyVoice API)
print(cosyvoice.list_avaliable_spks())

app = FastAPI()


@app.get("/api/voice/tts")
async def tts(query: str):
    """Synthesize `query` with the '中文女' SFT speaker and return WAV bytes."""
    start = time.process_time()
    output = cosyvoice.inference_sft(query, '中文女')
    end = time.process_time()
    print("infer time:", end - start)
    buffer = io.BytesIO()
    torchaudio.save(buffer, output['tts_speech'], 22050, format="wav")
    buffer.seek(0)
    return Response(content=buffer.read(-1), media_type="audio/wav")


@app.get("/")
async def root():
    return {"message": "Hello World"}
如果是inference_zero_shot需要上传音频的需要怎么封装呢
我也不会呀,你会python不?Sent from newbing’s iPhoneOn Jul 10, 2024, at 11:56, dragon10 @.***> wrote:
Try this example: Install fastapi first: pip install fastapi, then fastapi dev --port 3001 import io,time from fastapi import FastAPI, Response from cosyvoice.cli.cosyvoice import CosyVoice from cosyvoice.utils.file_utils import load_wav import torchaudio
cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-SFT')
print(cosyvoice.list_avaliable_spks()) app = FastAPI()
@app.get("/api/voice/tts") async def tts(query: str): start = time.process_time() output = cosyvoice.inference_sft(query, '中文女') end = time.process_time() print("infer time:", end-start) buffer = io.BytesIO() torchaudio.save(buffer, output['tts_speech'], 22050, format="wav") buffer.seek(0) return Response(content=buffer.read(-1), media_type="audio/wav")
@app.get("/") async def root(): return {"message": "Hello World"}
如果是inference_zero_shot需要上传音频的需要怎么封装呢
—Reply to this email directly, view it on GitHub, or unsubscribe.You are receiving this because you were mentioned.Message ID: @.***>
已封装,供参考(代码说明可以参考 https://blog.csdn.net/weixin_42357472/article/details/140321056):
api.py
import time
import io, os, sys
# Make the repo root and the bundled Matcha-TTS importable when run as a script.
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.append('{}'.format(ROOT_DIR))
sys.path.append('{}/third_party/Matcha-TTS'.format(ROOT_DIR))
import numpy as np
from flask import Flask, request, Response
import torch
import torchaudio
from cosyvoice.cli.cosyvoice import CosyVoice
from cosyvoice.utils.file_utils import load_wav
## Model path. For zero-shot / cross-lingual inference use the CosyVoice-300M
## model; for SFT inference use CosyVoice-300M-SFT; for instruct inference use
## CosyVoice-300M-Instruct.
cosyvoice = CosyVoice('/data/pretrained_models/CosyVoice-300M')
print(cosyvoice.list_avaliable_spks())
app = Flask(__name__)
@app.route("/inference/sft", methods=['POST'])
def sft():
    """SFT inference endpoint.

    Expects JSON {"query": <text>, "speaker": <speaker name>} and returns
    the synthesized audio as a WAV response.
    """
    question_data = request.get_json()
    query = question_data.get('query')
    speaker = question_data.get('speaker')
    if not query:
        return {"error": "Query parameter 'query' is required"}, 400
    if not speaker:
        # Fail fast instead of passing None into the model.
        return {"error": "Query parameter 'speaker' is required"}, 400
    start = time.perf_counter()  # wall clock; process_time() would miss GPU/IO wait
    output = cosyvoice.inference_sft(query, speaker)
    end = time.perf_counter()
    print("infer time:", end - start)
    buffer = io.BytesIO()
    # 22050 Hz assumed to be the model's output rate -- TODO confirm.
    torchaudio.save(buffer, output['tts_speech'], 22050, format="wav")
    return Response(buffer.getvalue(), mimetype="audio/wav")
@app.route("/inference/zero_shot", methods=['POST'])
def zero_shot():
    """Zero-shot inference endpoint.

    Expects JSON {"query": <text>, "prompt_text": <text>,
    "prompt_speech": <audio path>} and returns WAV audio.
    """
    payload = request.get_json()
    tts_text = payload.get('query')
    prompt_text = payload.get('prompt_text')
    prompt_speech = load_wav(payload.get('prompt_speech'), 16000)
    # NOTE(review): this float -> int16 -> float round-trip quantizes the
    # prompt audio and flattens it before the unsqueeze; presumably it is
    # normalizing the tensor to shape (1, T). Confirm whether load_wav
    # already returns that shape -- if so this block could be dropped.
    raw_pcm = (prompt_speech.numpy() * (2**15)).astype(np.int16).tobytes()
    prompt_speech_16k = torch.from_numpy(
        np.array(np.frombuffer(raw_pcm, dtype=np.int16))
    ).unsqueeze(dim=0)
    prompt_speech_16k = prompt_speech_16k.float() / (2**15)
    t0 = time.process_time()
    output = cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k)
    print("infer time:", time.process_time() - t0)
    wav_buf = io.BytesIO()
    torchaudio.save(wav_buf, output['tts_speech'], 22050, format="wav")
    wav_buf.seek(0)
    return Response(wav_buf.read(), mimetype="audio/wav")
@app.route("/inference/cross_lingual", methods=['POST'])
def cross_lingual():
    """Cross-lingual inference endpoint.

    Expects JSON {"query": <text>, "prompt_speech": <audio path>} and
    returns WAV audio.
    """
    payload = request.get_json()
    tts_text = payload.get('query')
    prompt_speech = load_wav(payload.get('prompt_speech'), 16000)
    # NOTE(review): same float -> int16 -> float round-trip as zero_shot();
    # it quantizes and reshapes the prompt to (1, T) -- confirm whether
    # load_wav already returns that shape before simplifying.
    raw_pcm = (prompt_speech.numpy() * (2**15)).astype(np.int16).tobytes()
    prompt_speech_16k = torch.from_numpy(
        np.array(np.frombuffer(raw_pcm, dtype=np.int16))
    ).unsqueeze(dim=0)
    prompt_speech_16k = prompt_speech_16k.float() / (2**15)
    t0 = time.process_time()
    output = cosyvoice.inference_cross_lingual(tts_text, prompt_speech_16k)
    print("infer time:", time.process_time() - t0)
    wav_buf = io.BytesIO()
    torchaudio.save(wav_buf, output['tts_speech'], 22050, format="wav")
    wav_buf.seek(0)
    return Response(wav_buf.read(), mimetype="audio/wav")
@app.route("/inference/instruct", methods=['POST'])
def instruct():
    """Instruct inference endpoint.

    Expects JSON {"query": <text>, "speaker": <speaker name>,
    "instruct_text": <style instruction>} and returns WAV audio.
    """
    payload = request.get_json()
    tts_text = payload.get('query')
    speaker = payload.get('speaker')
    instruct_text = payload.get('instruct_text')
    t0 = time.process_time()
    output = cosyvoice.inference_instruct(tts_text, speaker, instruct_text)
    print("infer time:", time.process_time() - t0)
    wav_buf = io.BytesIO()
    torchaudio.save(wav_buf, output['tts_speech'], 22050, format="wav")
    wav_buf.seek(0)
    return Response(wav_buf.read(), mimetype="audio/wav")
if __name__ == "__main__":
    # Listen on all interfaces; note that 0.0.0.0 exposes the API to the network.
    app.run(host='0.0.0.0', port=50000)
已封装,供参考,代码说明可以参考 https://blog.csdn.net/weixin_42357472/article/details/140321056
api.py
import time
import io, os, sys

# Make the repo root and the bundled Matcha-TTS importable when run as a script.
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.append('{}'.format(ROOT_DIR))
sys.path.append('{}/third_party/Matcha-TTS'.format(ROOT_DIR))

import numpy as np
from flask import Flask, request, Response
import torch
import torchaudio
from cosyvoice.cli.cosyvoice import CosyVoice
from cosyvoice.utils.file_utils import load_wav

## Model path. For zero-shot / cross-lingual inference use CosyVoice-300M;
## for SFT inference use CosyVoice-300M-SFT; for instruct inference use
## CosyVoice-300M-Instruct.
cosyvoice = CosyVoice('/data/pretrained_models/CosyVoice-300M')
print(cosyvoice.list_avaliable_spks())

app = Flask(__name__)


@app.route("/inference/sft", methods=['POST'])
def sft():
    """SFT inference: JSON {query, speaker} -> WAV audio."""
    question_data = request.get_json()
    query = question_data.get('query')
    speaker = question_data.get('speaker')
    if not query:
        return {"error": "Query parameter 'query' is required"}, 400
    start = time.process_time()
    output = cosyvoice.inference_sft(query, speaker)
    end = time.process_time()
    print("infer time:", end - start)
    buffer = io.BytesIO()
    torchaudio.save(buffer, output['tts_speech'], 22050, format="wav")
    buffer.seek(0)
    return Response(buffer.read(), mimetype="audio/wav")


@app.route("/inference/zero_shot", methods=['POST'])
def zero_shot():
    """Zero-shot inference: JSON {query, prompt_text, prompt_speech} -> WAV audio."""
    question_data = request.get_json()
    tts_text = question_data.get('query')
    prompt_text = question_data.get('prompt_text')
    prompt_speech = load_wav(question_data.get('prompt_speech'), 16000)
    prompt_audio = (prompt_speech.numpy() * (2**15)).astype(np.int16).tobytes()
    prompt_speech_16k = torch.from_numpy(np.array(np.frombuffer(prompt_audio, dtype=np.int16))).unsqueeze(dim=0)
    prompt_speech_16k = prompt_speech_16k.float() / (2**15)
    start = time.process_time()
    output = cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k)
    end = time.process_time()
    print("infer time:", end - start)
    buffer = io.BytesIO()
    torchaudio.save(buffer, output['tts_speech'], 22050, format="wav")
    buffer.seek(0)
    return Response(buffer.read(), mimetype="audio/wav")


@app.route("/inference/cross_lingual", methods=['POST'])
def cross_lingual():
    """Cross-lingual inference: JSON {query, prompt_speech} -> WAV audio."""
    question_data = request.get_json()
    tts_text = question_data.get('query')
    prompt_speech = load_wav(question_data.get('prompt_speech'), 16000)
    prompt_audio = (prompt_speech.numpy() * (2**15)).astype(np.int16).tobytes()
    prompt_speech_16k = torch.from_numpy(np.array(np.frombuffer(prompt_audio, dtype=np.int16))).unsqueeze(dim=0)
    prompt_speech_16k = prompt_speech_16k.float() / (2**15)
    start = time.process_time()
    output = cosyvoice.inference_cross_lingual(tts_text, prompt_speech_16k)
    end = time.process_time()
    print("infer time:", end - start)
    buffer = io.BytesIO()
    torchaudio.save(buffer, output['tts_speech'], 22050, format="wav")
    buffer.seek(0)
    return Response(buffer.read(), mimetype="audio/wav")


@app.route("/inference/instruct", methods=['POST'])
def instruct():
    """Instruct inference: JSON {query, speaker, instruct_text} -> WAV audio."""
    question_data = request.get_json()
    tts_text = question_data.get('query')
    speaker = question_data.get('speaker')
    instruct_text = question_data.get('instruct_text')
    start = time.process_time()
    output = cosyvoice.inference_instruct(tts_text, speaker, instruct_text)
    end = time.process_time()
    print("infer time:", end - start)
    buffer = io.BytesIO()
    torchaudio.save(buffer, output['tts_speech'], 22050, format="wav")
    buffer.seek(0)
    return Response(buffer.read(), mimetype="audio/wav")


if __name__ == "__main__":
    app.run(host='0.0.0.0', port=50000,)
这个api怎么调用?还是要自己封装