[Feature] Edge Text to speech

FerLuisxd commented 1 year ago

Microsoft Edge uses some natural-sounding voices that would increase the quality of the TTS. There is a pip package that could work: https://pypi.org/project/edge-tts/

Extraltodeus commented 1 year ago

I actually opened the issues tab to request that too. These voices are excellent, and the service is super reliable without using any local resources.

I'm using this simple script that I made. It can be run with: python script_name.py --message "message content" --voice-id 7 --rate-change -10

import warnings
warnings.filterwarnings("ignore")
import sounddevice as sd
import librosa
import edge_tts
import asyncio
import os
import sys
import argparse
import re
from pathlib import Path

# Command-line interface.
# NOTE: the arguments are parsed at module level, so importing this file also
# consumes sys.argv.
parser = argparse.ArgumentParser(
    description="Speak a message aloud using a Microsoft Edge neural voice.",
)
parser.add_argument(
    '--message', type=str, default='Bonjour',
    help="text to synthesize and play",
)
# type=int lets argparse reject non-numeric values with a usage error instead
# of failing later with a ValueError traceback.
parser.add_argument(
    '--voice-id', type=int, default=7,
    help="index of the voice to use in the `voix` list",
)
parser.add_argument(
    '--rate-change', type=int, default=-7,
    help="speaking-rate change in percent (0 keeps the default rate)",
)
args = parser.parse_args()

# Edge neural voice names for the fr-BE, fr-CA, fr-CH and fr-FR locales.
# The position of each entry (shown in the trailing comment) is the value
# accepted by --voice-id and by the VOICE argument of _main / tts_voice.
voix = [
    'fr-BE-CharlineNeural', # 0
    'fr-BE-GerardNeural',   # 1
    'fr-CA-AntoineNeural',  # 2
    'fr-CA-JeanNeural',     # 3
    'fr-CA-SylvieNeural',   # 4
    'fr-CH-ArianeNeural',   # 5
    'fr-CH-FabriceNeural',  # 6
    'fr-FR-DeniseNeural',   # 7
    'fr-FR-EloiseNeural',   # 8
    'fr-FR-HenriNeural'     # 9
]

# NOTE(review): this module-level value is not read by the functions in this
# script: _main receives OUTPUT_FILE as a parameter and tts_voice assigns its
# own local variable of the same name.
OUTPUT_FILE = "audio.mp3"

# Compiled once at import time; the pattern never changes between calls.
_EMOJI_PATTERN = re.compile(
    "["
    # Every supplementary-plane code point: emoticons, pictographs, transport
    # and map symbols, regional-indicator flags, enclosed ideographs, ...
    "\U00010000-\U0010FFFF"
    "\u2500-\u2BEF"  # box drawing through misc symbols and arrows (incl. dingbats)
    "\u24C2"         # circled latin capital letter M
    "\u200D"         # zero-width joiner used to glue emoji sequences
    "\uFE0F"         # variation selector-16 (emoji presentation)
    "\u231A"         # watch
    "\u23CF"         # eject symbol
    "\u23E9"         # fast-forward symbol
    "\u3030\u303D"   # wavy dash, part alternation mark
    "\u3297\u3299"   # circled ideographs "congratulation" / "secret"
    "]+"
)


def remove_emojis_and_non_text_chars(text):
    """Return *text* with emojis and pictographic symbols removed.

    Only the symbol ranges listed in ``_EMOJI_PATTERN`` are stripped. Other
    Basic Multilingual Plane characters (accented Latin letters, CJK
    ideographs, kana, Hangul, ...) are kept, so non-Latin text reaches the
    speech synthesizer intact.

    Args:
        text: The string to clean.

    Returns:
        A new string without the matched characters; an empty string stays
        empty.
    """
    return _EMOJI_PATTERN.sub("", text)

async def _main(OUTPUT_FILE,TEXT,VOICE=7,RATE=0) -> None:
    """Synthesize TEXT with an Edge voice and write the audio to OUTPUT_FILE.

    Standard output is redirected to the null device for the duration of the
    call and then put back to whatever stream was installed before.

    Args:
        OUTPUT_FILE: Path of the file to create; "audio" chunks from the
            stream are written to it as bytes.
        TEXT: Text to synthesize.
        VOICE: Index into the module-level ``voix`` list.
        RATE: Speaking-rate change in percent. 0 means no ``rate`` argument
            is passed to ``edge_tts.Communicate``.

    Raises:
        IndexError: If VOICE is not a valid index into ``voix``.
    """
    previous_stdout = sys.stdout
    # Both files are closed by the context manager, even if synthesis fails.
    with open(os.devnull, 'w') as devnull, open(OUTPUT_FILE, "wb") as file:
        sys.stdout = devnull
        try:
            if RATE != 0:
                # Build a signed percentage string such as "+10%" or "-7%".
                communicate = edge_tts.Communicate(
                    TEXT, voice=voix[VOICE], rate=f"{RATE:+}%"
                )
            else:
                communicate = edge_tts.Communicate(TEXT, voice=voix[VOICE])
            async for chunk in communicate.stream():
                if chunk["type"] == "audio":
                    file.write(chunk["data"])
                elif chunk["type"] == "WordBoundary":
                    # Printed while stdout points at the null device, so this
                    # output is discarded.
                    print(f"WordBoundary: {chunk}")
        finally:
            # Always restore stdout, including on errors, and restore the
            # stream that was active before rather than sys.__stdout__.
            sys.stdout = previous_stdout

def tts_voice(TEXT,VOICE=7,RATE=0):
    """Synthesize TEXT to an MP3 under ``voice_outputs/`` and play it.

    Args:
        TEXT: Text to speak.
        VOICE: Index into the module-level ``voix`` list, forwarded to _main.
        RATE: Speaking-rate change in percent, forwarded to _main.
    """
    FOLDER_PATH = "voice_outputs/"
    Path(FOLDER_PATH).mkdir(exist_ok=True)
    # The file number is the count of entries already in the folder.
    # NOTE(review): if files are deleted from the folder, a later call can
    # reuse an existing name and overwrite that file.
    OUTPUT_FILE = os.path.join(
        FOLDER_PATH, f"{len(os.listdir(FOLDER_PATH)):04d}_audio.mp3"
    )
    # asyncio.run creates and closes its own event loop; calling
    # asyncio.get_event_loop() with no running loop is deprecated.
    asyncio.run(_main(OUTPUT_FILE, TEXT, VOICE, RATE))
    audio_data, sample_rate = librosa.load(OUTPUT_FILE)
    sd.play(audio_data, sample_rate)
    sd.wait()

if __name__ == "__main__":
    # Script entry point: clean the message, validate the numeric options,
    # then synthesize and play the result.
    message = remove_emojis_and_non_text_chars(args.message)
    try:
        voice_id = int(args.voice_id)
        voice_rate = int(args.rate_change)
    except ValueError:
        # parser.error prints the usage message and exits with status 2.
        parser.error("--voice-id and --rate-change must be integers")
    # Reject out-of-range ids up front: a negative id would otherwise wrap
    # around to the end of `voix`, and a large one would raise IndexError.
    if not 0 <= voice_id < len(voix):
        parser.error(
            f"--voice-id must be between 0 and {len(voix) - 1}, got {voice_id}"
        )
    tts_voice(message, voice_id, voice_rate)

Cohee1207 commented 1 year ago

Did that last night in the main SillyTavern repo, but it didn't work out well on Termux. Just pushed new API endpoints here, moving to UI plugin support.

Extraltodeus commented 1 year ago

Wow that was a fast answer! :)

Can't wait to try it! (I already copied the files but can't see the option in the UI yet. I also failed at reusing the files you removed in the last commit on the main repo, lol.)

Cohee1207 commented 1 year ago

Added to dev branch of ST.

Extraltodeus commented 1 year ago

Oh thank you! It works :)

SillyTavern / SillyTavern-Extras

[Feature] Edge Text to speech #34