FunAudioLLM / CosyVoice

Multi-lingual large voice generation model, providing full-stack ability for inference, training and deployment.
https://funaudiollm.github.io/
Apache License 2.0

Please consider including an English version of the gradio interface #264

Open scalar27 opened 3 months ago

scalar27 commented 3 months ago

Is your feature request related to a problem? Please describe.
As I only speak English, I cannot really use the CosyVoice Gradio interface.

Describe the solution you'd like
Please consider adding an English version.


aluminumbox commented 3 months ago

Thank you, we will do it later.

j2l commented 2 months ago

Hello, here's my contribution

webui.py

inference_mode_list = ['Pre-trained Voice', '3s Fast Reproduction', 'Cross-lingual Reproduction', 'Natural Language Control']
instruct_dict = {'Pre-trained Voice': '1. Select a pre-trained voice\n2. Click the Generate Audio button',
                 '3s Fast Reproduction': '1. Select a prompt audio file, or record a prompt audio (no longer than 30s). If both are provided, the prompt audio file will be prioritized\n2. Enter the prompt text\n3. Click the Generate Audio button',
                 'Cross-lingual Reproduction': '1. Select a prompt audio file, or record a prompt audio (no longer than 30s). If both are provided, the prompt audio file will be prioritized\n2. Click the Generate Audio button',
                 'Natural Language Control': '1. Select a pre-trained voice\n2. Enter the instruction text\n3. Click the Generate Audio button'}

def change_instruction(mode_checkbox_group):
    return instruct_dict[mode_checkbox_group]

def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text, seed):
    if prompt_wav_upload is not None:
        prompt_wav = prompt_wav_upload
    elif prompt_wav_record is not None:
        prompt_wav = prompt_wav_record
    else:
        prompt_wav = None
    # Natural Language Control (instruct) mode requires the iic/CosyVoice-300M-Instruct model and cannot be combined with cross-lingual mode
    if mode_checkbox_group in ['Natural Language Control']:
        if cosyvoice.frontend.instruct is False:
            gr.Warning('You are using the Natural Language Control mode, the {} model does not support this mode, please use the iic/CosyVoice-300M-Instruct model'.format(args.model_dir))
            return (target_sr, default_data)
        if instruct_text == '':
            gr.Warning('You are using the Natural Language Control mode, please enter instruction text')
            return (target_sr, default_data)
        if prompt_wav is not None or prompt_text != '':
            gr.Info('You are using the Natural Language Control mode, the prompt audio/prompt text will be ignored')
    # Cross-lingual mode requires the iic/CosyVoice-300M model; tts_text and prompt_text should be in different languages
    if mode_checkbox_group in ['Cross-lingual Reproduction']:
        if cosyvoice.frontend.instruct is True:
            gr.Warning('You are using the Cross-lingual Reproduction mode, the {} model does not support this mode, please use the iic/CosyVoice-300M model'.format(args.model_dir))
            return (target_sr, default_data)
        if instruct_text != '':
            gr.Info('You are using the Cross-lingual Reproduction mode, the instruction text will be ignored')
        if prompt_wav is None:
            gr.Warning('You are using the Cross-lingual Reproduction mode, please provide a prompt audio')
            return (target_sr, default_data)
        gr.Info('You are using the Cross-lingual Reproduction mode, please ensure that the synthesized text and prompt text are in different languages')
    # In zero_shot and cross_lingual modes, make sure prompt_text and prompt_wav meet the requirements
    if mode_checkbox_group in ['3s Fast Reproduction', 'Cross-lingual Reproduction']:
        if prompt_wav is None:
            gr.Warning('The prompt audio is empty, did you forget to input the prompt audio?')
            return (target_sr, default_data)
        if torchaudio.info(prompt_wav).sample_rate < prompt_sr:
            gr.Warning('The prompt audio sample rate {} is lower than {}'.format(torchaudio.info(prompt_wav).sample_rate, prompt_sr))
            return (target_sr, default_data)
    # SFT (Pre-trained Voice) mode only uses sft_dropdown
    if mode_checkbox_group in ['Pre-trained Voice']:
        if instruct_text != '' or prompt_wav is not None or prompt_text != '':
            gr.Info('You are using the Pre-trained Voice mode, the prompt text/prompt audio/instruction text will be ignored!')
    # zero_shot (3s Fast Reproduction) mode only uses prompt_wav and prompt_text
    if mode_checkbox_group in ['3s Fast Reproduction']:
        if prompt_text == '':
            gr.Warning('The prompt text is empty, did you forget to input the prompt text?')
            return (target_sr, default_data)
        if instruct_text != '':
            gr.Info('You are using the 3s Fast Reproduction mode, the pre-trained voice/instruction text will be ignored!')

    if mode_checkbox_group == 'Pre-trained Voice':
        logging.info('get sft inference request')
        set_all_random_seed(seed)
        output = cosyvoice.inference_sft(tts_text, sft_dropdown)
    elif mode_checkbox_group == '3s Fast Reproduction':
        logging.info('get zero_shot inference request')
        prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
        set_all_random_seed(seed)
        output = cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k)
    elif mode_checkbox_group == 'Cross-lingual Reproduction':
        logging.info('get cross_lingual inference request')
        prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
        set_all_random_seed(seed)
        output = cosyvoice.inference_cross_lingual(tts_text, prompt_speech_16k)
    else:
        logging.info('get instruct inference request')
        set_all_random_seed(seed)
        output = cosyvoice.inference_instruct(tts_text, sft_dropdown, instruct_text)
    audio_data = output['tts_speech'].numpy().flatten()
    return (target_sr, audio_data)

and, further down in the same file:

gr.Markdown("#### Please enter the text to be synthesized, select the inference mode, and follow the steps provided")

        tts_text = gr.Textbox(label="Enter Text to Synthesize", lines=1, value="I am a newly released generative speech model by the Tongyi Speech Team, providing a comfortable and natural speech synthesis capability.")

        with gr.Row():
            mode_checkbox_group = gr.Radio(choices=inference_mode_list, label='Select Inference Mode', value=inference_mode_list[0])
            instruction_text = gr.Text(label="Instructions", value=instruct_dict[inference_mode_list[0]], scale=0.5)
            sft_dropdown = gr.Dropdown(choices=sft_spk, label='Select Pre-trained Voice  [1.CN fem, 2.CN male, 3.JP male, 4.CANT fem, 5.EN fem, 6.EN male, 7.KO fem]', value=sft_spk[0], scale=0.25)
            with gr.Column(scale=0.25):
                seed_button = gr.Button(value="\U0001F3B2")
                seed = gr.Number(value=0, label="Random Inference Seed")

        with gr.Row():
            prompt_wav_upload = gr.Audio(sources='upload', type='filepath', label='Upload Prompt Audio File (Sampling rate no less than 16kHz)')
            prompt_wav_record = gr.Audio(sources='microphone', type='filepath', label='Record Prompt Audio File')
        prompt_text = gr.Textbox(label="Enter Prompt Text", lines=1, placeholder="Please enter prompt text that matches the content of the prompt audio, automatic recognition is currently not supported...", value='')
        instruct_text = gr.Textbox(label="Enter Instruct Text", lines=1, placeholder="Please enter instruct text.", value='')

        generate_button = gr.Button("Generate Audio")

        audio_output = gr.Audio(label="Synthesized Audio")
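
For completeness, here is a minimal sketch (my own, not copied from the upstream webui.py) of how these components could be wired to generate_audio; it assumes random is imported at the top of the file:

        # Re-roll the random inference seed when the dice button is clicked.
        seed_button.click(lambda: random.randint(1, 100000000), inputs=[], outputs=seed)
        # Run generate_audio with every UI input and play the result in audio_output.
        generate_button.click(generate_audio,
                              inputs=[tts_text, mode_checkbox_group, sft_dropdown, prompt_text,
                                      prompt_wav_upload, prompt_wav_record, instruct_text, seed],
                              outputs=[audio_output])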

But I don't know how to translate the pre-trained voice names themselves (the sft_spk entries shown in the dropdown).
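
One possible workaround, sketched below and untested: map the model's speaker names to English display labels for the dropdown and translate them back before inference. The Chinese keys are an assumption about what the sft_spk list contains (inferred from the "[1.CN fem, 2.CN male, ...]" note in the dropdown label above); adjust them to whatever your checkpoint actually reports.

# Hypothetical display-name mapping for the pre-trained voices.
SPK_DISPLAY_NAMES = {
    '中文女': 'Chinese Female',
    '中文男': 'Chinese Male',
    '日语男': 'Japanese Male',
    '粤语女': 'Cantonese Female',
    '英文女': 'English Female',
    '英文男': 'English Male',
    '韩语女': 'Korean Female',
}
# Reverse lookup so an English label can be mapped back to the model's own name.
SPK_INTERNAL_NAMES = {v: k for k, v in SPK_DISPLAY_NAMES.items()}

# In the UI, show the English labels (falling back to the raw name if unknown):
sft_spk_display = [SPK_DISPLAY_NAMES.get(spk, spk) for spk in sft_spk]
sft_dropdown = gr.Dropdown(choices=sft_spk_display, label='Select Pre-trained Voice',
                           value=sft_spk_display[0], scale=0.25)

# And at the top of generate_audio, translate the selection back:
#     sft_dropdown = SPK_INTERNAL_NAMES.get(sft_dropdown, sft_dropdown)

That way the values passed to inference_sft / inference_instruct stay exactly what the model expects, while the UI shows English labels.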