Closed FerLuisxd closed 1 year ago
I actually opened the issues tab to request that too. These voices are excellent, and the service is super reliable while not using any local resources.
I'm using this simple script that I wrote; it can be run with `python script_name.py --message "message content" --voice-id 7 --rate-change -10`:
import warnings
warnings.filterwarnings("ignore")
import sounddevice as sd
import librosa
import edge_tts
import asyncio
import os
import sys
import argparse
import re
from pathlib import Path
# Command-line interface.
# NOTE: arguments are parsed at module level, so importing this file also
# parses sys.argv.
parser = argparse.ArgumentParser()
parser.add_argument(
    '--message',
    type=str,
    default='Bonjour',
    help='text to speak (default: Bonjour)',
)
# Numeric options are converted by argparse itself so that a non-numeric value
# is reported as a usage error instead of a ValueError traceback later on.
parser.add_argument(
    '--voice-id',
    type=int,
    default=7,
    help='index of the voice in the voice list (default: 7)',
)
parser.add_argument(
    '--rate-change',
    type=int,
    default=-7,
    help='signed speaking-rate change in percent (default: -7)',
)
args = parser.parse_args()
# French-language neural voices. The voice id given on the command line (and
# the VOICE argument of the functions below) is an index into this list.
voix = [
    # Belgium
    "fr-BE-CharlineNeural",  # 0
    "fr-BE-GerardNeural",    # 1
    # Canada
    "fr-CA-AntoineNeural",   # 2
    "fr-CA-JeanNeural",      # 3
    "fr-CA-SylvieNeural",    # 4
    # Switzerland
    "fr-CH-ArianeNeural",    # 5
    "fr-CH-FabriceNeural",   # 6
    # France
    "fr-FR-DeniseNeural",    # 7
    "fr-FR-EloiseNeural",    # 8
    "fr-FR-HenriNeural",     # 9
]

# Module-level default output name. The functions below use their own
# OUTPUT_FILE parameter / local variable rather than reading this constant.
OUTPUT_FILE = "audio.mp3"
def remove_emojis_and_non_text_chars(text):
    """Return *text* with emojis and other symbol characters deleted.

    Every run of characters belonging to the character class assembled below
    is replaced by the empty string; nothing is inserted in its place.

    NOTE: the ranges U+24C2-U+1F251 and U+10000-U+10FFFF together span
    U+24C2 through U+10FFFF, so in effect every code point from U+24C2 upward
    is removed (this includes CJK text), plus the single code points U+200D,
    U+231A, U+23CF and U+23E9. Latin text, accented letters and common
    punctuation all lie below that range and are kept.
    """
    class_members = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # regional indicators (flags)
        "\U00002500-\U00002BEF",  # box drawing through misc. symbols and arrows
        "\U00002702-\U000027B0",  # dingbats
        "\U00002702-\U000027B0",  # same range listed a second time (no effect)
        "\U000024C2-\U0001F251",  # very wide range, see NOTE in the docstring
        "\U0001f926-\U0001f937",  # face palm ... shrug
        "\U00010000-\U0010ffff",  # all supplementary planes
        "\u2640-\u2642",          # female / male signs
        "\u2600-\u2B55",          # misc. symbols through heavy large circle
        "\u200d",                 # zero width joiner
        "\u23cf",                 # eject symbol
        "\u23e9",                 # fast-forward symbol
        "\u231a",                 # watch
        "\ufe0f",                 # variation selector-16 (emoji presentation)
        "\u3030",                 # wavy dash
    )
    stripper = re.compile("[" + "".join(class_members) + "]+", re.UNICODE)
    return stripper.sub("", text)
async def _main(OUTPUT_FILE, TEXT, VOICE=7, RATE=0) -> None:
    """Synthesize TEXT with edge-tts and write the audio stream to OUTPUT_FILE.

    Standard output is silenced for the duration of the synthesis and is
    restored afterwards, even if the synthesis raises.

    Args:
        OUTPUT_FILE: Path of the file to write. It is opened in binary write
            mode, so an existing file at that path is replaced.
        TEXT: Text to synthesize.
        VOICE: Index into the module-level ``voix`` list.
        RATE: Signed speaking-rate change in percent. When it is 0 no ``rate``
            argument is passed and edge-tts uses its own default.

    Raises:
        IndexError: If VOICE is not a valid index into ``voix``.
    """
    # Imported locally so the module's import block does not have to change.
    import contextlib

    options = {"voice": voix[VOICE]}
    if RATE != 0:
        # Positive values get an explicit "+"; negative numbers already carry
        # their "-" sign, giving strings such as "+10%" or "-10%".
        sign = "+" if RATE > 0 else ""
        options["rate"] = f"{sign}{RATE}%"

    # redirect_stdout restores whatever stdout was active before (not
    # unconditionally sys.__stdout__) and the "with" closes the devnull handle,
    # so neither the stream nor the file descriptor leaks on an exception.
    with open(os.devnull, "w") as devnull, contextlib.redirect_stdout(devnull):
        communicate = edge_tts.Communicate(TEXT, **options)
        with open(OUTPUT_FILE, "wb") as file:
            async for chunk in communicate.stream():
                if chunk["type"] == "audio":
                    file.write(chunk["data"])
                elif chunk["type"] == "WordBoundary":
                    # Goes to the silenced stdout; kept for parity with the
                    # original behaviour.
                    print(f"WordBoundary: {chunk}")
def tts_voice(TEXT, VOICE=7, RATE=0):
    """Synthesize TEXT to a new MP3 under ``voice_outputs/`` and play it.

    The file is named ``NNNN_audio.mp3`` where NNNN starts at the number of
    entries already in the folder and is increased until the name is unused,
    so an earlier recording is never overwritten. The call blocks until
    playback has finished.

    Args:
        TEXT: Text to speak.
        VOICE: Index into the module-level ``voix`` list.
        RATE: Signed speaking-rate change in percent (0 = library default).
    """
    FOLDER_PATH = "voice_outputs/"
    Path(FOLDER_PATH).mkdir(exist_ok=True)

    # Counting entries alone can yield a name that already exists once files
    # have been deleted from the folder, so probe forward to a free name.
    index = len(os.listdir(FOLDER_PATH))
    OUTPUT_FILE = os.path.join(FOLDER_PATH, f"{index:04d}_audio.mp3")
    while os.path.exists(OUTPUT_FILE):
        index += 1
        OUTPUT_FILE = os.path.join(FOLDER_PATH, f"{index:04d}_audio.mp3")

    # asyncio.run creates and closes its own event loop; calling
    # get_event_loop() with no running loop is deprecated in recent Pythons.
    asyncio.run(_main(OUTPUT_FILE, TEXT, VOICE, RATE))

    audio_data, sample_rate = librosa.load(OUTPUT_FILE)
    sd.play(audio_data, sample_rate)
    sd.wait()  # block until playback is done
if __name__ == "__main__":
    # Script entry point: clean the message, convert the numeric options and
    # speak the result. Arguments are evaluated left to right, so the message
    # is cleaned before the numeric conversions happen.
    tts_voice(
        remove_emojis_and_non_text_chars(args.message),
        int(args.voice_id),
        int(args.rate_change),
    )
Did that last night in the main SillyTavern repo, but it didn't work out well on Termux. Just pushed new API endpoints here, moving to UI plugin support.
Wow that was a fast answer! :)
Can't wait to try it! (I already copied the files but can't see the option in the UI yet, I failed at reusing your removed files from the last commit on the main repo too lol)
Added to dev branch of ST.
Oh thank you! It works :)
Microsoft Edge uses some natural-sounding voices, which would increase the quality of the TTS. There is a pip package that could work: https://pypi.org/project/edge-tts/