oobabooga / text-generation-webui

A Gradio web UI for Large Language Models.
GNU Affero General Public License v3.0
40.28k stars 5.28k forks source link

Fix for silero_tts not supporting long texts (more than 1000 tokens) :+1: :1st_place_medal: #6492

Open Netmees opened 17 hours ago

Netmees commented 17 hours ago

Describe the bug

silero_tts does not support long texts of more than 1000 tokens.

Is there an existing issue for this?

Reproduction

Ask the model something that produces a long reply.

Screenshot

No response

Logs

model.save_wav(ssml_text=silero_input, speaker=params['speaker'], sample_rate=int(params['sample_rate']), audio_path=str(output_file))
  File "<torch_package_0>.multi_acc_v3_package.py", line 366, in save_wav
    audio = self.apply_tts(text=text,
            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "<torch_package_0>.multi_acc_v3_package.py", line 340, in apply_tts
    raise Exception("Model couldn't generate your text, probably it's too long")
Exception: Model couldn't generate your text, probably it's too long

System Info

xeon
Netmees commented 17 hours ago

Solution: I improved the code with the help of several AI tools.

import html import json import random import time from pathlib import Path

import gradio as gr import torch import numpy as np import soundfile as sf

from extensions.silero_tts import tts_preprocessor from modules import chat, shared, ui_chat from modules.utils import gradio

# Turn off the TorchScript JIT profiling mode.
torch._C._jit_set_profiling_mode(False)

# Extension settings. The UI callbacks in `ui()` update this dict in place.
params = {
    'activate': True,
    'speaker': 'en_56',
    'language': 'English',
    'model_id': 'v3_en',
    'sample_rate': 48000,
    'device': 'cpu',
    'show_text': False,
    'autoplay': True,
    'voice_pitch': 'medium',
    'voice_speed': 'medium',
    # User can override the default cache path to something other via settings.json
    'local_cache_path': '',
}

# Snapshot of `params`; compared against the live dict to decide when the
# model has to be reloaded.
current_params = params.copy()

with open(Path("extensions/silero_tts/languages.json"), encoding='utf8') as f:
    languages = json.load(f)

# Values offered by the pitch and speed dropdowns and passed to the SSML
# <prosody> tag.
voice_pitches = ['x-low', 'low', 'medium', 'high', 'x-high']
voice_speeds = ['x-slow', 'slow', 'medium', 'fast', 'x-fast']

# Used for making text xml compatible, needed for voice pitch and speed control
table = str.maketrans({
    "<": "&lt;",
    ">": "&gt;",
    "&": "&amp;",
    "'": "&apos;",
    '"': "&quot;",
})


def xmlesc(txt):
    """Return *txt* with the five XML special characters replaced by entities.

    Args:
        txt: Plain text that will be embedded inside an SSML document.

    Returns:
        The text with ``<``, ``>``, ``&``, ``'`` and ``"`` escaped.
    """
    return txt.translate(table)

def load_model():
    """Load the Silero TTS model selected by the current ``params``.

    A checkpoint already present in the torch hub cache directory (or in
    ``params['local_cache_path']`` when that is set) is loaded from disk;
    otherwise ``torch.hub`` is asked to load it from ``snakers4/silero-models``.

    Returns:
        The loaded model, moved to ``params['device']``.
    """
    torch_cache_path = torch.hub.get_dir() if params['local_cache_path'] == '' else params['local_cache_path']
    model_path = torch_cache_path + "/snakers4_silero-models_master/src/silero/model/" + params['model_id'] + ".pt"
    lang_id = languages[params['language']]["lang_id"]

    if Path(model_path).is_file():
        print(f'\nUsing Silero TTS cached checkpoint found at {torch_cache_path}')
        model, _example_text = torch.hub.load(
            repo_or_dir=torch_cache_path + '/snakers4_silero-models_master/',
            model='silero_tts',
            language=lang_id,
            speaker=params['model_id'],
            source='local',
            path=model_path,
            force_reload=True,
        )
    else:
        print(f'\nSilero TTS cache not found at {torch_cache_path}. Attempting to download...')
        model, _example_text = torch.hub.load(
            repo_or_dir='snakers4/silero-models',
            model='silero_tts',
            language=lang_id,
            speaker=params['model_id'],
        )

    model.to(params['device'])
    return model

def remove_tts_from_history(history):
    """Replace every visible reply with its internal (plain text) reply.

    Args:
        history: Dict with parallel ``'internal'`` and ``'visible'`` lists of
            ``[user_message, reply]`` pairs. Modified in place.

    Returns:
        The same ``history`` dict.
    """
    for i, entry in enumerate(history['internal']):
        history['visible'][i] = [history['visible'][i][0], entry[1]]

    return history

def toggle_text_in_history(history):
    """Show or hide the reply text under each audio player in the history.

    Only visible replies that start with ``<audio`` are touched. The audio
    element is kept; when ``params['show_text']`` is true the internal reply
    text is appended below it, otherwise any text after the element is dropped.

    Args:
        history: Dict with parallel ``'internal'`` and ``'visible'`` lists of
            ``[user_message, reply]`` pairs. Modified in place.

    Returns:
        The same ``history`` dict.
    """
    for i, entry in enumerate(history['visible']):
        visible_reply = entry[1]
        if not visible_reply.startswith('<audio'):
            continue

        # Keep everything up to the closing tag; split() removes the
        # separator, so the tag is added back explicitly.
        audio_tag = f"{visible_reply.split('</audio>')[0]}</audio>"
        if params['show_text']:
            reply = history['internal'][i][1]
            history['visible'][i] = [history['visible'][i][0], f"{audio_tag}\n\n{reply}"]
        else:
            history['visible'][i] = [history['visible'][i][0], audio_tag]

    return history

def state_modifier(state):
    """Turn off streaming in *state* while the extension is active.

    Args:
        state: Generation state dict. Modified in place when TTS is active.

    Returns:
        The same ``state`` dict.
    """
    if not params['activate']:
        return state

    state['stream'] = False
    return state

def input_modifier(string, state):
    """Set the "recording" progress message and pass the input through.

    Args:
        string: The user's input text.
        state: Generation state dict (unused).

    Returns:
        ``string`` unchanged.
    """
    if not params['activate']:
        return string

    shared.processing_message = "*Is recording a voice message...*"
    return string

def history_modifier(history):
    """Strip the ``autoplay`` attribute from the most recent visible reply.

    Args:
        history: Dict with ``'internal'`` and ``'visible'`` lists of
            ``[user_message, reply]`` pairs. Modified in place.

    Returns:
        The same ``history`` dict.
    """
    # Remove autoplay from the last reply
    if history['internal']:
        last_user, last_reply = history['visible'][-1][0], history['visible'][-1][1]
        history['visible'][-1] = [
            last_user,
            last_reply.replace('controls autoplay>', 'controls>'),
        ]

    return history

def output_modifier(string, state):
# NOTE(review): every line below this header is commented out, so this `def`
# has no executable body. It is the earlier implementation that synthesizes
# the whole reply with a single `model.save_wav` call, apparently kept for
# reference; another `output_modifier` definition appears later in this file.

#"""
#Modifies the output string based on various parameters and state.

#Args:
    #string: The input string to be modified.
    #state: A dictionary containing the current state of the system.

#Returns:
    #The modified string.
#"""

#global model, current_params, streaming_state

## Check if parameters have changed and load the model if necessary
#for i in params:
    #if params[i] != current_params[i]:
        #model = load_model()
        #current_params = params.copy()
        #break

## If activation is disabled, return the original string
#if not params['activate']:
    #return string

## Preprocess the string and handle empty replies
#original_string = string
#string = tts_preprocessor.preprocess(html.unescape(string))

#if string == '':
    #string = '*Empty reply, try regenerating*'
#else:
    ## Generate the audio file and create the HTML audio element
    #output_file = Path(f'extensions/silero_tts/outputs/{state["character_menu"]}_{int(time.time())}.wav')
    #prosody = '<prosody rate="{}" pitch="{}">'.format(params['voice_speed'], params['voice_pitch'])
    #silero_input = f'<speak>{prosody}{xmlesc(string)}</prosody></speak>'
    #model.save_wav(ssml_text=silero_input, speaker=params['speaker'], sample_rate=int(params['sample_rate']), audio_path=str(output_file))

    #autoplay = 'autoplay' if params['autoplay'] else ''
    #string = f'<audio src="file/{output_file.as_posix()}" controls {autoplay}></audio>'
    #if params['show_text']:
        #string += f'\n\n{original_string}'

#shared.processing_message = "*Is typing...*"
#return string

def chunk_text(text, max_length=1000):
    """Split *text* into pieces of at most *max_length* characters.

    Each piece ends, in order of preference, at the last sentence terminator
    (``.``, ``!`` or ``?``) inside the window, at the last whitespace
    character, or, when neither exists, at exactly *max_length* characters.
    Surrounding whitespace is stripped from every piece and empty pieces are
    dropped.

    Args:
        text: The input text to be chunked.
        max_length: The maximum length of each chunk; must be at least 1.

    Returns:
        A list of non-empty chunks (empty when *text* is blank).

    Raises:
        ValueError: If *max_length* is smaller than 1, which would otherwise
            prevent the loop from making progress.
    """
    if max_length < 1:
        raise ValueError("max_length must be a positive integer")

    chunks = []
    remaining = text.strip()
    while len(remaining) > max_length:
        window = remaining[:max_length]

        # Cut just after the last sentence terminator, if there is one.
        cut = max(window.rfind(mark) for mark in '.!?') + 1
        if cut == 0:
            # No sentence end in the window: avoid splitting a word by
            # cutting at the last whitespace, else fall back to a hard cut.
            last_space = max(window.rfind(ws) for ws in ' \n\t')
            cut = last_space if last_space > 0 else max_length

        piece = remaining[:cut].strip()
        if piece:
            chunks.append(piece)
        remaining = remaining[cut:].lstrip()

    if remaining:
        chunks.append(remaining)
    return chunks

def apply_tts(text, **kwargs):
    """Synthesize speech for *text* with the module-level TTS ``model``.

    The previous body called a ``my_tts`` module that is neither imported nor
    defined in this file; this version delegates to the loaded model in the
    same way ``process_long_text`` does.

    Args:
        text: The input text to be converted to speech.
        **kwargs: Additional keyword arguments forwarded to
            ``model.apply_tts`` (for example ``speaker`` and ``sample_rate``).

    Returns:
        The generated audio data, as returned by ``model.apply_tts``.
    """
    return model.apply_tts(text=text, **kwargs)

def process_long_text(model, text, **kwargs):
    """Generate audio for a long text by synthesizing it chunk by chunk.

    Args:
        model: The TTS model to use; must provide ``apply_tts(text=..., **kwargs)``.
        text: The input text to be processed. It is split with ``chunk_text``.
        **kwargs: Additional keyword arguments to pass to the TTS model.

    Returns:
        A NumPy array with the concatenated audio for the entire text. The
        array is empty when no chunk contains anything to synthesize.
    """
    audio_chunks = []
    for chunk in chunk_text(text):
        # Blank chunks have nothing to synthesize.
        if not chunk.strip():
            continue

        audio_chunk = model.apply_tts(text=chunk, **kwargs)
        # Tensor-like results are moved to host memory and converted
        # explicitly so the concatenation does not depend on implicit
        # array conversion.
        if hasattr(audio_chunk, 'detach'):
            audio_chunk = audio_chunk.detach().cpu().numpy()
        audio_chunks.append(np.asarray(audio_chunk))

    if not audio_chunks:
        return np.zeros(0, dtype=np.float32)
    return np.concatenate(audio_chunks)

def output_modifier(string, state):
    """Replace the model's reply with an HTML audio player for its speech.

    The reply is preprocessed, split into chunks, and every chunk is wrapped
    in its own SSML document before synthesis. Chunking the plain text (rather
    than the already wrapped SSML string) keeps the ``<speak>``/``<prosody>``
    tags well-formed for every call. The audio of all chunks is concatenated
    and written to a single WAV file.

    Args:
        string: The reply text produced by the model.
        state: Generation state dict; ``state["character_menu"]`` is used in
            the output file name.

    Returns:
        ``string`` unchanged when the extension is inactive; otherwise an
        ``<audio>`` element (followed by the original text when
        ``params['show_text']`` is set), or a placeholder message when there
        is nothing to speak.
    """
    global model, current_params

    # Check if parameters have changed and load the model if necessary
    if any(params[key] != current_params[key] for key in params):
        model = load_model()
        current_params = params.copy()

    # If activation is disabled, return the original string
    if not params['activate']:
        return string

    # Preprocess the string and handle empty replies
    original_string = string
    string = tts_preprocessor.preprocess(html.unescape(string))

    if string.strip() == '':
        string = '*Empty reply, try regenerating*'
    else:
        # Generate the audio file and create the HTML audio element
        output_file = Path(f'extensions/silero_tts/outputs/{state["character_menu"]}_{int(time.time())}.wav')
        prosody = '<prosody rate="{}" pitch="{}">'.format(params['voice_speed'], params['voice_pitch'])
        sample_rate = int(params['sample_rate'])

        # Stay below chunk_text's 1000-character default so the SSML wrapper
        # and the entity escaping added to each chunk have some headroom.
        max_chunk_chars = 800

        audio_chunks = []
        for chunk in chunk_text(string, max_length=max_chunk_chars):
            if not chunk.strip():
                continue
            silero_input = f'<speak>{prosody}{xmlesc(chunk)}</prosody></speak>'
            audio = model.apply_tts(ssml_text=silero_input, speaker=params['speaker'], sample_rate=sample_rate)
            # Tensor-like results are converted explicitly before they are
            # concatenated and written.
            if hasattr(audio, 'detach'):
                audio = audio.detach().cpu().numpy()
            audio_chunks.append(np.asarray(audio))

        # Save the concatenated audio to a file
        sf.write(str(output_file), np.concatenate(audio_chunks), sample_rate)

        autoplay = 'autoplay' if params['autoplay'] else ''
        string = f'<audio src="file/{output_file.as_posix()}" controls {autoplay}></audio>'
        if params['show_text']:
            string += f'\n\n{original_string}'

    shared.processing_message = "*Is typing...*"
    return string

def setup():
    """Load the TTS model into the module-level ``model`` global."""
    global model
    model = load_model()

def random_sentence():
    """Return one randomly chosen line from the bundled sentences file.

    The line is returned as read from the file, including its line ending.
    """
    # Explicit encoding, consistent with how languages.json is read.
    with open(Path("extensions/silero_tts/harvard_sentences.txt"), encoding='utf8') as f:
        return random.choice(list(f))

def voice_preview(string):
    """Synthesize a short preview with the current voice settings.

    Args:
        string: Text to speak. When empty, a random sentence is used instead.

    Returns:
        An autoplaying ``<audio>`` element pointing at the preview WAV file.
        The current timestamp is appended as a query string, presumably so the
        browser does not reuse a cached copy of the fixed file name.
    """
    global model, current_params

    # Reload the model when any setting changed since it was loaded.
    if any(params[key] != current_params[key] for key in params):
        model = load_model()
        current_params = params.copy()

    string = tts_preprocessor.preprocess(string or random_sentence())

    output_file = Path('extensions/silero_tts/outputs/voice_preview.wav')
    prosody = f"<prosody rate=\"{params['voice_speed']}\" pitch=\"{params['voice_pitch']}\">"
    silero_input = f'<speak>{prosody}{xmlesc(string)}</prosody></speak>'
    model.save_wav(ssml_text=silero_input, speaker=params['speaker'], sample_rate=int(params['sample_rate']), audio_path=str(output_file))

    return f'<audio src="file/{output_file.as_posix()}?{int(time.time())}" controls autoplay></audio>'

def language_change(lang):
    """Switch the active language and reset the voice to that language's default.

    Args:
        lang: A key of the ``languages`` mapping.

    Returns:
        A Gradio update that repopulates the voice dropdown with the voices of
        *lang* and selects its default voice.
    """
    entry = languages[lang]
    # `params` is mutated in place, so no `global` declaration is needed.
    params.update({"language": lang, "speaker": entry["default_voice"], "model_id": entry["model_id"]})
    return gr.update(choices=entry["voices"], value=entry["default_voice"])

def custom_css():
    """Return the contents of ``style.css`` located next to this script."""
    path_to_css = Path(__file__).parent.resolve() / 'style.css'
    # read_text() opens in text mode with the default encoding, like the
    # original open(..., 'r'), and also closes the file.
    return path_to_css.read_text()

def ui():
    """Build the extension's Gradio controls and wire up their event handlers."""
    # Gradio elements
    with gr.Accordion("Silero TTS"):
        with gr.Row():
            activate = gr.Checkbox(value=params['activate'], label='Activate TTS')
            autoplay = gr.Checkbox(value=params['autoplay'], label='Play TTS automatically')

        show_text = gr.Checkbox(value=params['show_text'], label='Show message text under audio player')

        with gr.Row():
            language = gr.Dropdown(value=params['language'], choices=sorted(languages.keys()), label='Language')
            voice = gr.Dropdown(value=params['speaker'], choices=languages[params['language']]["voices"], label='TTS voice')
        with gr.Row():
            v_pitch = gr.Dropdown(value=params['voice_pitch'], choices=voice_pitches, label='Voice pitch')
            v_speed = gr.Dropdown(value=params['voice_speed'], choices=voice_speeds, label='Voice speed')

        with gr.Row():
            preview_text = gr.Text(show_label=False, placeholder="Preview text", elem_id="silero_preview_text")
            preview_play = gr.Button("Preview")
            preview_audio = gr.HTML(visible=False)

        with gr.Row():
            convert = gr.Button('Permanently replace audios with the message texts')
            convert_cancel = gr.Button('Cancel', visible=False)
            convert_confirm = gr.Button('Confirm (cannot be undone)', variant="stop", visible=False)

    # Convert history with confirmation
    convert_arr = [convert_confirm, convert, convert_cancel]
    convert.click(lambda: [gr.update(visible=True), gr.update(visible=False), gr.update(visible=True)], None, convert_arr)
    convert_confirm.click(
        lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, convert_arr).then(
        remove_tts_from_history, gradio('history'), gradio('history')).then(
        chat.save_history, gradio('history', 'unique_id', 'character_menu', 'mode'), None).then(
        chat.redraw_html, gradio(ui_chat.reload_arr), gradio('display'))

    convert_cancel.click(lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, convert_arr)

    # Toggle message text in history
    show_text.change(
        lambda x: params.update({"show_text": x}), show_text, None).then(
        toggle_text_in_history, gradio('history'), gradio('history')).then(
        chat.save_history, gradio('history', 'unique_id', 'character_menu', 'mode'), None).then(
        chat.redraw_html, gradio(ui_chat.reload_arr), gradio('display'))

    # Event functions to update the parameters in the backend
    activate.change(lambda x: params.update({"activate": x}), activate, None)
    autoplay.change(lambda x: params.update({"autoplay": x}), autoplay, None)
    language.change(language_change, language, voice, show_progress=False)
    voice.change(lambda x: params.update({"speaker": x}), voice, None)
    v_pitch.change(lambda x: params.update({"voice_pitch": x}), v_pitch, None)
    v_speed.change(lambda x: params.update({"voice_speed": x}), v_speed, None)

    # Play preview
    preview_text.submit(voice_preview, preview_text, preview_audio)
    preview_play.click(voice_preview, preview_text, preview_audio)
Netmees commented 17 hours ago

Main modifications. Add these imports: `import numpy as np` and `import soundfile as sf`.

Add the following functions, and substitute `output_modifier` with the version below:

def chunk_text(text, max_length=1000):
    """Split *text* into pieces of at most *max_length* characters.

    Each piece ends, in order of preference, at the last sentence terminator
    (``.``, ``!`` or ``?``) inside the window, at the last whitespace
    character, or, when neither exists, at exactly *max_length* characters.
    Surrounding whitespace is stripped from every piece and empty pieces are
    dropped.

    Args:
        text: The input text to be chunked.
        max_length: The maximum length of each chunk; must be at least 1.

    Returns:
        A list of non-empty chunks (empty when *text* is blank).

    Raises:
        ValueError: If *max_length* is smaller than 1, which would otherwise
            prevent the loop from making progress.
    """
    if max_length < 1:
        raise ValueError("max_length must be a positive integer")

    chunks = []
    remaining = text.strip()
    while len(remaining) > max_length:
        window = remaining[:max_length]

        # Cut just after the last sentence terminator, if there is one.
        cut = max(window.rfind(mark) for mark in '.!?') + 1
        if cut == 0:
            # No sentence end in the window: avoid splitting a word by
            # cutting at the last whitespace, else fall back to a hard cut.
            last_space = max(window.rfind(ws) for ws in ' \n\t')
            cut = last_space if last_space > 0 else max_length

        piece = remaining[:cut].strip()
        if piece:
            chunks.append(piece)
        remaining = remaining[cut:].lstrip()

    if remaining:
        chunks.append(remaining)
    return chunks

def apply_tts(text, **kwargs):
    """Synthesize speech for *text* with the module-level TTS ``model``.

    The previous body called a ``my_tts`` module that is neither imported nor
    defined in this file; this version delegates to the loaded model in the
    same way ``process_long_text`` does.

    Args:
        text: The input text to be converted to speech.
        **kwargs: Additional keyword arguments forwarded to
            ``model.apply_tts`` (for example ``speaker`` and ``sample_rate``).

    Returns:
        The generated audio data, as returned by ``model.apply_tts``.
    """
    return model.apply_tts(text=text, **kwargs)

def process_long_text(model, text, **kwargs):
    """Generate audio for a long text by synthesizing it chunk by chunk.

    Args:
        model: The TTS model to use; must provide ``apply_tts(text=..., **kwargs)``.
        text: The input text to be processed. It is split with ``chunk_text``.
        **kwargs: Additional keyword arguments to pass to the TTS model.

    Returns:
        A NumPy array with the concatenated audio for the entire text. The
        array is empty when no chunk contains anything to synthesize.
    """
    audio_chunks = []
    for chunk in chunk_text(text):
        # Blank chunks have nothing to synthesize.
        if not chunk.strip():
            continue

        audio_chunk = model.apply_tts(text=chunk, **kwargs)
        # Tensor-like results are moved to host memory and converted
        # explicitly so the concatenation does not depend on implicit
        # array conversion.
        if hasattr(audio_chunk, 'detach'):
            audio_chunk = audio_chunk.detach().cpu().numpy()
        audio_chunks.append(np.asarray(audio_chunk))

    if not audio_chunks:
        return np.zeros(0, dtype=np.float32)
    return np.concatenate(audio_chunks)

def output_modifier(string, state):
    """Replace the model's reply with an HTML audio player for its speech.

    The reply is preprocessed, split into chunks, and every chunk is wrapped
    in its own SSML document before synthesis. Chunking the plain text (rather
    than the already wrapped SSML string) keeps the ``<speak>``/``<prosody>``
    tags well-formed for every call. The audio of all chunks is concatenated
    and written to a single WAV file.

    Args:
        string: The reply text produced by the model.
        state: Generation state dict; ``state["character_menu"]`` is used in
            the output file name.

    Returns:
        ``string`` unchanged when the extension is inactive; otherwise an
        ``<audio>`` element (followed by the original text when
        ``params['show_text']`` is set), or a placeholder message when there
        is nothing to speak.
    """
    global model, current_params

    # Check if parameters have changed and load the model if necessary
    if any(params[key] != current_params[key] for key in params):
        model = load_model()
        current_params = params.copy()

    # If activation is disabled, return the original string
    if not params['activate']:
        return string

    # Preprocess the string and handle empty replies
    original_string = string
    string = tts_preprocessor.preprocess(html.unescape(string))

    if string.strip() == '':
        string = '*Empty reply, try regenerating*'
    else:
        # Generate the audio file and create the HTML audio element
        output_file = Path(f'extensions/silero_tts/outputs/{state["character_menu"]}_{int(time.time())}.wav')
        prosody = '<prosody rate="{}" pitch="{}">'.format(params['voice_speed'], params['voice_pitch'])
        sample_rate = int(params['sample_rate'])

        # Stay below chunk_text's 1000-character default so the SSML wrapper
        # and the entity escaping added to each chunk have some headroom.
        max_chunk_chars = 800

        audio_chunks = []
        for chunk in chunk_text(string, max_length=max_chunk_chars):
            if not chunk.strip():
                continue
            silero_input = f'<speak>{prosody}{xmlesc(chunk)}</prosody></speak>'
            audio = model.apply_tts(ssml_text=silero_input, speaker=params['speaker'], sample_rate=sample_rate)
            # Tensor-like results are converted explicitly before they are
            # concatenated and written.
            if hasattr(audio, 'detach'):
                audio = audio.detach().cpu().numpy()
            audio_chunks.append(np.asarray(audio))

        # Save the concatenated audio to a file
        sf.write(str(output_file), np.concatenate(audio_chunks), sample_rate)

        autoplay = 'autoplay' if params['autoplay'] else ''
        string = f'<audio src="file/{output_file.as_posix()}" controls {autoplay}></audio>'
        if params['show_text']:
            string += f'\n\n{original_string}'

    shared.processing_message = "*Is typing...*"
    return string