import platform
import argparse
import torch
import random
import json
from pathlib import Path
from TTS.api import TTS
import uuid
import html
import soundfile as sf
def is_mac_os():
    """Report whether the interpreter is running on macOS.

    Returns:
        bool: True when ``platform.system()`` reports ``'Darwin'``,
        False otherwise.
    """
    system_name = platform.system()
    return system_name == 'Darwin'
def get_available_voices():
    """List the reference-voice WAV files stored next to this script.

    Returns:
        list[str]: Sorted file names (extension included) of every ``*.wav``
        file directly inside the ``targets`` directory beside this file.
    """
    targets_dir = Path(__file__).parent.resolve() / "targets"
    wav_names = [wav_path.name for wav_path in targets_dir.glob("*.wav")]
    wav_names.sort()
    return wav_names
def random_sentence():
    """Pick one random sentence from ``harvard_sentences.txt``.

    The file is looked up relative to the current working directory and is
    read as UTF-8, one sentence per line.

    Returns:
        str: A randomly chosen non-blank line with surrounding whitespace
        (including the trailing newline) removed.

    Raises:
        FileNotFoundError: If ``harvard_sentences.txt`` does not exist.
        IndexError: If the file contains no non-blank lines.
    """
    with open(Path("harvard_sentences.txt"), encoding="utf-8") as f:
        # Strip each line and drop blank ones so that an empty line or a
        # dangling newline is never returned as the "sentence".
        sentences = [stripped for stripped in map(str.strip, f) if stripped]
    return random.choice(sentences)
def gen_voice(string, spk, speed, english, model_name, file_name="output.wav"):
    """Synthesize ``string`` to an audio file in the voice of a reference speaker.

    Relies on the module-level globals ``device`` and ``languages``, which
    the ``__main__`` section of this file assigns before calling this function.

    Args:
        string: Text to speak; HTML entities are unescaped before synthesis.
        spk: File name of the reference recording inside the ``targets``
            directory that sits next to this file.
        speed: Speed value forwarded to ``tts_to_file``.
        english: Key into ``languages``; the mapped value is passed to the
            model as its ``language`` argument.
        model_name: Model identifier passed to ``TTS``.
        file_name: Path of the audio file to write.

    Returns:
        Path: ``file_name`` wrapped in a ``Path``.

    Raises:
        KeyError: If ``english`` is not a key of ``languages``.
    """
    text = html.unescape(string)
    output_file = Path(file_name)
    speaker_wav = Path(__file__).parent.resolve() / "targets" / spk
    # Resolve the language before building the model so that an unknown
    # language fails immediately instead of after the model has been loaded.
    language = languages[english]
    tts = TTS(model_name=model_name).to(device)
    tts.tts_to_file(
        text=text,
        speed=speed,
        file_path=output_file,
        speaker_wav=[str(speaker_wav)],
        language=language,
    )
    return output_file
def update_speakers():
    """Return the speaker names currently present in the ``targets`` directory.

    Every entry of ``targets`` (relative to the current working directory)
    contributes its stem. A stem shared by several entries is listed once,
    at the position of its first occurrence.

    Returns:
        list[str]: The distinct stems, in directory-iteration order.
    """
    stems = dict.fromkeys(entry.stem for entry in Path('targets').iterdir())
    return list(stems)
def handle_recorded_audio(audio_data, speaker_dropdown, filename="user_entered"):
    """Save a recorded clip as a target voice and select it in the dropdown.

    Args:
        audio_data: Falsy when nothing was recorded; otherwise a
            ``(sample_rate, audio_content)`` pair.
        speaker_dropdown: Current dropdown value, handed back untouched when
            there is nothing to save.
        filename: Stem of the WAV file written under ``targets/``.

    Returns:
        ``speaker_dropdown`` unchanged when ``audio_data`` is falsy; otherwise
        a new ``gr.Dropdown`` whose choices come from ``update_speakers()``
        and whose selected value is ``filename``.
    """
    # NOTE(review): `gr` is not imported in the visible part of this file;
    # presumably gradio. Confirm it is in scope before relying on this path.
    if audio_data:
        sample_rate, audio_content = audio_data
        sf.write(f"targets/{filename}.wav", audio_content, sample_rate)
        return gr.Dropdown(choices=update_speakers(), value=filename)
    return speaker_dropdown
if __name__ == "__main__":
    # Command-line entry point: parse options, load the language table,
    # choose a torch device, then synthesize the requested text.
    parser = argparse.ArgumentParser(description="Text-to-Speech CLI Application")
    parser.add_argument("--text", help="Text to convert to speech", required=True)
    # The choices are file names that include the ".wav" extension, and
    # argparse does not check a default against `choices`, so the default
    # must itself be a real file name for gen_voice() to find the recording.
    parser.add_argument("--speaker", help="Select speaker", choices=get_available_voices(), default="Rogger.wav")
    parser.add_argument("--speed", type=float, help="Speed of speech", default=0.8)
    parser.add_argument("--language", help="Language of text", default="English")
    parser.add_argument("--model_name", help="TTS model name", default="tts_models/multilingual/multi-dataset/xtts_v2")
    parser.add_argument("--filename", help="Filename for the generated audio", default="output.wav")
    args = parser.parse_args()

    # Load language data. `languages` is a module-level global read by gen_voice().
    with open(Path('languages.json'), encoding='utf8') as f:
        languages = json.load(f)
    # Reject an unknown language here with a usage error instead of letting
    # gen_voice() fail on the lookup later.
    if args.language not in languages:
        parser.error(
            f"unknown language {args.language!r}; "
            f"choose from: {', '.join(map(str, languages))}"
        )

    # Set device. `device` is a module-level global read by gen_voice().
    # macOS stays on the CPU as before; other hosts use the first CUDA GPU
    # only when one is actually available, falling back to the CPU otherwise.
    if is_mac_os() or not torch.cuda.is_available():
        device = torch.device('cpu')
    else:
        device = torch.device('cuda:0')

    # Generate voice
    output_file = gen_voice(args.text, args.speaker, args.speed, args.language, args.model_name, args.filename)
    print(f"Voice generated and saved to: {output_file}")
run-cli.bat
@echo off
rem Launcher for cli.py: ensure the virtual environment exists, activate it,
rem run one sample synthesis, and report failure through the exit code.

rem Create the virtual environment only when it is missing, and stop if that fails.
if not exist venv\Scripts\activate.bat (
    python -m venv venv
    if errorlevel 1 goto :venv_error
)

call venv\Scripts\activate
if errorlevel 1 goto :error

python cli.py --text "Testing testing" --speaker user_entered.wav --language English --filename "output.wav"
rem Propagate a cli.py failure instead of always exiting with 0.
if errorlevel 1 goto :run_error
pause
exit /b 0

:venv_error
echo Failed to create the virtual environment.
pause
exit /b 1

:error
echo Failed to activate the virtual environment.
pause
exit /b 1

:run_error
echo cli.py exited with an error.
pause
exit /b 1
To run the cli.py script with command-line arguments, you can use the following examples:
Basic example with required arguments:
python cli.py --text "Hello, this is a test."
Example with additional optional arguments:
python cli.py --text "Hello, this is a test." --speaker Rogger.wav --speed 1.0 --language English --model_name "tts_models/multilingual/multi-dataset/xtts_v2" --filename "custom_output.wav"
python cli.py --text "Hello, this is a test." --speaker user_entered.wav --speed 1.0 --language English --model_name "tts_models/multilingual/multi-dataset/xtts_v2" --filename "custom_output.wav"
For context, user_entered.wav and Rogger.wav are located in the targets folder.
Make sure to replace the values with your specific requirements. Note that the --speaker argument should be one of the available voices returned by the get_available_voices() function.
cli.py
run-cli.bat
To run the cli.py script with command-line arguments, you can use the following examples:
Basic example with required arguments:
Example with additional optional arguments:
For context, user_entered.wav and Rogger.wav are located in the targets folder.
Make sure to replace the values with your specific requirements. Note that the --speaker argument should be one of the available voices returned by the get_available_voices() function.