BoltzmannEntropy / xtts2-ui

A User Interface for XTTS-2 Text-Based Voice Cloning using only 10 seconds of speech
MIT License
259 stars 42 forks source link

For CLI use case. In case someone wants to use it without gradio #5

Closed ProjCRys closed 9 months ago

ProjCRys commented 11 months ago

cli.py

import platform
import argparse
import torch
import random
import json
from pathlib import Path
from TTS.api import TTS
import uuid
import html
import soundfile as sf

def is_mac_os():
    """Return True when the script is running on macOS (platform 'Darwin')."""
    system_name = platform.system()
    return system_name == 'Darwin'

def get_available_voices():
    """Return the names of all .wav files in the targets/ folder next to
    this script, sorted alphabetically."""
    targets_dir = Path(__file__).parent.resolve() / "targets"
    return sorted(wav_file.name for wav_file in targets_dir.glob("*.wav"))

def random_sentence():
    """Return one random line (trailing newline included) from
    harvard_sentences.txt.

    The file is resolved relative to this script's directory, consistent
    with how get_available_voices() and gen_voice() locate their data
    (previously it was resolved against the current working directory,
    which broke when the script was launched from elsewhere).

    Raises:
        FileNotFoundError: if harvard_sentences.txt is missing.
        IndexError: if the file is empty.
    """
    sentences_path = Path(__file__).parent.resolve() / "harvard_sentences.txt"
    with open(sentences_path, encoding="utf8") as f:
        lines = f.readlines()
    return random.choice(lines)

def gen_voice(string, spk, speed, english, model_name, file_name="output.wav"):
    """Synthesize `string` to a wav file using an XTTS model and a reference
    speaker wav from the targets/ folder.

    Args:
        string: Text to speak; HTML entities are unescaped first.
        spk: Filename of the reference wav inside targets/ (e.g. "Rogger.wav").
        speed: Speech speed multiplier forwarded to the model.
        english: Key into the module-level `languages` mapping (e.g. "English").
        model_name: TTS model identifier to load.
        file_name: Path of the output wav file.

    Returns:
        Path: the output file path that was written.

    NOTE(review): relies on the module-level `device` and `languages` being
    set by the __main__ block before this is called.
    """
    string = html.unescape(string)
    # (removed an unused `short_uuid` local left over from a previous version)
    output_file = Path(file_name)
    this_dir = str(Path(__file__).parent.resolve())
    tts = TTS(model_name=model_name).to(device)
    tts.tts_to_file(
        text=string,
        speed=speed,
        file_path=output_file,
        speaker_wav=[f"{this_dir}/targets/" + spk],
        language=languages[english]
    )
    return output_file

def update_speakers():
    """Return the unique file stems found in ./targets, in directory order
    (duplicate stems from different extensions appear once)."""
    stems = (entry.stem for entry in Path('targets').iterdir())
    return list(dict.fromkeys(stems))

def handle_recorded_audio(audio_data, speaker_dropdown, filename="user_entered"):
    """Save recorded audio as targets/<filename>.wav and return a refreshed
    speaker dropdown.

    Args:
        audio_data: A (sample_rate, audio_content) tuple, or a falsy value
            when nothing was recorded.
        speaker_dropdown: The current dropdown; returned unchanged when
            `audio_data` is falsy.
        filename: Stem of the wav file to write under targets/.

    Returns:
        The unchanged `speaker_dropdown` when no audio was provided,
        otherwise a new dropdown listing the updated speakers.
    """
    if not audio_data:
        return speaker_dropdown

    sample_rate, audio_content = audio_data

    # BUG FIX: the save path previously ignored the `filename` parameter
    # (it was a hard-coded placeholder), so every recording overwrote the
    # same file regardless of the name passed in.
    save_path = f"targets/{filename}.wav"

    sf.write(save_path, audio_content, sample_rate)

    new_speakers = update_speakers()

    # NOTE(review): `gr` (gradio) is never imported in this CLI script, so
    # this line raises NameError at runtime — confirm whether this helper
    # belongs in the CLI at all or should import gradio explicitly.
    updated_dropdown = gr.Dropdown(choices=new_speakers, value=filename)
    return updated_dropdown

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Text-to-Speech CLI Application")
    parser.add_argument("--text", help="Text to convert to speech", required=True)
    parser.add_argument("--speaker", help="Select speaker", choices=get_available_voices(), default="Rogger")
    parser.add_argument("--speed", type=float, help="Speed of speech", default=0.8)
    parser.add_argument("--language", help="Language of text", default="English")
    parser.add_argument("--model_name", help="TTS model name", default="tts_models/multilingual/multi-dataset/xtts_v2")
    parser.add_argument("--filename", help="Filename for recorded audio", default="output.wav")

    args = parser.parse_args()

    # Load the language-name -> language-code mapping used by gen_voice().
    with open(Path('languages.json'), encoding='utf8') as f:
        languages = json.load(f)

    # BUG FIX: CUDA was previously assumed on every non-mac machine, which
    # crashed on CPU-only Linux/Windows boxes. Fall back to CPU whenever
    # CUDA is unavailable.
    if not is_mac_os() and torch.cuda.is_available():
        device = torch.device('cuda:0')
    else:
        device = torch.device('cpu')

    # Generate voice
    output_file = gen_voice(args.text, args.speaker, args.speed, args.language, args.model_name, args.filename)

    print(f"Voice generated and saved to: {output_file}")

run-cli.bat

@echo off
rem Create the virtual environment (no-op if it already exists), then
rem activate it. Both steps are now guarded: previously only activation
rem was checked, so a failed venv creation fell through silently.
python -m venv venv
if errorlevel 1 goto :error
call venv\Scripts\activate
if errorlevel 1 goto :error

python cli.py --text "Testing testing" --speaker user_entered.wav --language English --filename "output.wav"
pause
exit /b 0

:error
echo Failed to activate the virtual environment.
pause
exit /b 1

To run the cli.py script with command-line arguments, you can use the following examples:

  1. Basic example with required arguments:

    python cli.py --text "Hello, this is a test."
  2. Example with additional optional arguments:

    python cli.py --text "Hello, this is a test." --speaker Rogger.wav --speed 1.0 --language English --model_name "tts_models/multilingual/multi-dataset/xtts_v2" --filename "custom_output.wav"
python cli.py --text "Hello, this is a test." --speaker user_entered.wav --speed 1.0 --language English --model_name "tts_models/multilingual/multi-dataset/xtts_v2" --filename "custom_output.wav"

For context, user_entered.wav and Rogger.wav are in the targets folder.

Make sure to replace the values with your specific requirements. Note that the --speaker argument should be one of the available voices returned by the get_available_voices() function.

BoltzmannEntropy commented 9 months ago

Thanks.