Open scalar27 opened 3 months ago
Thank you, we will do it later.
Hello, here's my contribution
webui.py
# Step-by-step usage hints shown in the UI, keyed by inference mode name.
# The insertion order of this mapping is also the display order of the modes.
instruct_dict = {
    'Pre-trained Voice': '1. Select a pre-trained voice\n2. Click the Generate Audio button',
    '3s Fast Reproduction': '1. Select a prompt audio file, or record a prompt audio (no longer than 30s). If both are provided, the prompt audio file will be prioritized\n2. Enter the prompt text\n3. Click the Generate Audio button',
    'Cross-lingual Reproduction': '1. Select a prompt audio file, or record a prompt audio (no longer than 30s). If both are provided, the prompt audio file will be prioritized\n2. Click the Generate Audio button',
    'Natural Language Control': '1. Select a pre-trained voice\n2. Enter the instruction text\n3. Click the Generate Audio button',
}

# Selectable inference modes, in the same order as the keys of instruct_dict.
inference_mode_list = list(instruct_dict)
def change_instruction(mode_checkbox_group):
    """Return the usage instructions for the selected inference mode.

    Args:
        mode_checkbox_group: Name of the selected inference mode; it must be
            a key of ``instruct_dict``.

    Returns:
        The instruction text stored in ``instruct_dict`` for that mode.

    Raises:
        KeyError: If the mode name is not a key of ``instruct_dict``.
    """
    return instruct_dict[mode_checkbox_group]
def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text, seed):
    """Validate the inputs for the selected mode and synthesize the audio.

    Validation problems are reported through ``gr.Warning`` and abort the
    request by returning ``(target_sr, default_data)``. Inputs that the
    selected mode does not use are only reported through ``gr.Info``.

    Args:
        tts_text: Text to synthesize.
        mode_checkbox_group: Name of the selected inference mode.
        sft_dropdown: Selected pre-trained voice, used by the
            'Pre-trained Voice' and 'Natural Language Control' modes.
        prompt_text: Prompt text, used by the '3s Fast Reproduction' mode.
        prompt_wav_upload: Uploaded prompt audio, or None.
        prompt_wav_record: Recorded prompt audio, or None.
        instruct_text: Instruction text, used by the
            'Natural Language Control' mode.
        seed: Value passed to ``set_all_random_seed`` right before inference.

    Returns:
        ``(target_sr, audio_data)`` where ``audio_data`` is the flattened
        ``output['tts_speech']`` array, or ``(target_sr, default_data)`` when
        validation fails.
    """
    # The uploaded file takes priority over the recording when both are given.
    prompt_wav = prompt_wav_upload if prompt_wav_upload is not None else prompt_wav_record

    # if instruct mode, please make sure that model is iic/CosyVoice-300M-Instruct and not cross_lingual mode
    if mode_checkbox_group == 'Natural Language Control':
        if cosyvoice.frontend.instruct is False:
            gr.Warning('You are using the Natural Language Control mode, the {} model does not support this mode, please use the iic/CosyVoice-300M-Instruct model'.format(args.model_dir))
            return (target_sr, default_data)
        if instruct_text == '':
            gr.Warning('You are using the Natural Language Control mode, please enter instruction text')
            return (target_sr, default_data)
        if prompt_wav is not None or prompt_text != '':
            gr.Info('You are using the Natural Language Control mode, the prompt audio/prompt text will be ignored')

    # if cross_lingual mode, please make sure that model is iic/CosyVoice-300M and tts_text prompt_text are different language
    if mode_checkbox_group == 'Cross-lingual Reproduction':
        if cosyvoice.frontend.instruct is True:
            gr.Warning('You are using the Cross-lingual Reproduction mode, the {} model does not support this mode, please use the iic/CosyVoice-300M model'.format(args.model_dir))
            return (target_sr, default_data)
        if instruct_text != '':
            gr.Info('You are using the Cross-lingual Reproduction mode, the instruction text will be ignored')
        if prompt_wav is None:
            gr.Warning('You are using the Cross-lingual Reproduction mode, please provide a prompt audio')
            return (target_sr, default_data)
        gr.Info('You are using the Cross-lingual Reproduction mode, please ensure that the synthesized text and prompt text are in different languages')

    # if in zero_shot cross_lingual, please make sure that prompt_text and prompt_wav meets requirements
    if mode_checkbox_group in ('3s Fast Reproduction', 'Cross-lingual Reproduction'):
        if prompt_wav is None:
            gr.Warning('The prompt audio is empty, did you forget to input the prompt audio?')
            return (target_sr, default_data)
        # Query the audio metadata once and reuse it for both the check and the message.
        prompt_sample_rate = torchaudio.info(prompt_wav).sample_rate
        if prompt_sample_rate < prompt_sr:
            gr.Warning('The prompt audio sample rate {} is lower than {}'.format(prompt_sample_rate, prompt_sr))
            return (target_sr, default_data)

    # sft mode only use sft_dropdown
    if mode_checkbox_group == 'Pre-trained Voice':
        if instruct_text != '' or prompt_wav is not None or prompt_text != '':
            gr.Info('You are using the Pre-trained Voice mode, the prompt text/prompt audio/instruction text will be ignored!')

    # zero_shot mode only use prompt_wav prompt text
    if mode_checkbox_group == '3s Fast Reproduction':
        if prompt_text == '':
            gr.Warning('The prompt text is empty, did you forget to input the prompt text?')
            return (target_sr, default_data)
        if instruct_text != '':
            gr.Info('You are using the 3s Fast Reproduction mode, the pre-trained voice/instruction text will be ignored!')

    # Dispatch to the inference routine for the selected mode. The seed is set
    # immediately before each inference call.
    if mode_checkbox_group == 'Pre-trained Voice':
        logging.info('get sft inference request')
        set_all_random_seed(seed)
        output = cosyvoice.inference_sft(tts_text, sft_dropdown)
    elif mode_checkbox_group == '3s Fast Reproduction':
        logging.info('get zero_shot inference request')
        prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
        set_all_random_seed(seed)
        output = cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k)
    elif mode_checkbox_group == 'Cross-lingual Reproduction':
        logging.info('get cross_lingual inference request')
        prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
        set_all_random_seed(seed)
        output = cosyvoice.inference_cross_lingual(tts_text, prompt_speech_16k)
    else:
        # Any mode not matched above (i.e. 'Natural Language Control') is
        # handled as an instruct request.
        logging.info('get instruct inference request')
        set_all_random_seed(seed)
        output = cosyvoice.inference_instruct(tts_text, sft_dropdown, instruct_text)

    audio_data = output['tts_speech'].numpy().flatten()
    return (target_sr, audio_data)
and
# UI layout fragment: heading, text input, mode/voice/seed controls, prompt
# inputs, and the generate/output widgets.
# NOTE(review): the pasted snippet lost its indentation. The nesting below is
# reconstructed; confirm which widgets belong inside each gr.Row() against the
# real webui.py. Presumably this fragment lives inside a gr.Blocks() context.
gr.Markdown("#### Please enter the text to be synthesized, select the inference mode, and follow the steps provided")
tts_text = gr.Textbox(label="Enter Text to Synthesize", lines=1, value="I am a newly released generative speech model by the Tongyi Speech Team, providing a comfortable and natural speech synthesis capability.")
# First row: mode selector, the instructions for that mode, voice and seed.
with gr.Row():
    mode_checkbox_group = gr.Radio(choices=inference_mode_list, label='Select Inference Mode', value=inference_mode_list[0])
    instruction_text = gr.Text(label="Instructions", value=instruct_dict[inference_mode_list[0]], scale=0.5)
    sft_dropdown = gr.Dropdown(choices=sft_spk, label='Select Pre-trained Voice [1.CN fem, 2.CN male, 3.JP male, 4.CANT fem, 5.EN fem, 6.EN male, 7.KO fem]', value=sft_spk[0], scale=0.25)
    seed_button = gr.Button(value="\U0001F3B2")
    seed = gr.Number(value=0, label="Random Inference Seed")
# Second row: the two alternative sources of prompt audio, side by side.
with gr.Row():
    prompt_wav_upload = gr.Audio(sources='upload', type='filepath', label='Upload Prompt Audio File (Sampling rate no less than 16kHz)')
    prompt_wav_record = gr.Audio(sources='microphone', type='filepath', label='Record Prompt Audio File')
# NOTE(review): the text boxes are assumed to sit below the row, not inside it.
prompt_text = gr.Textbox(label="Enter Prompt Text", lines=1, placeholder="Please enter prompt text that matches the content of the prompt audio, automatic recognition is currently not supported...", value='')
instruct_text = gr.Textbox(label="Enter Instruct Text", lines=1, placeholder="Please enter instruct text.", value='')
generate_button = gr.Button("Generate Audio")
audio_output = gr.Audio(label="Synthesized Audio")
But I don't know what to do about the pre-trained voices.
Is your feature request related to a problem? Please describe. As I only speak English, I cannot really use the cosyvoice gradio interface.
Describe the solution you'd like Please consider adding an English version.
Describe alternatives you've considered A clear and concise description of any alternative solutions or features you've considered.
Additional context Add any other context or screenshots about the feature request here.