wannaphong / ttsmms

TTS with The Massively Multilingual Speech (MMS) project
MIT License
223 stars 37 forks source link

allow for 16bit PCM - which is more common and return bytes in array #14

Open willwade opened 3 months ago

willwade commented 3 months ago
willwade commented 3 months ago

I'm hoping this won't break anything.

willwade commented 3 months ago

I thought I'd share this somewhere - this probably isn't the best place for it, but since it uses my pcm16 code, here it is.

The problem: it's really slow for repeated synthesis calls. The answer: wrap it in a long-lived worker thread that keeps a single TTS instance around.

import pyaudio
from ttsmms import TTS
import threading
from queue import Queue, Empty

def play_audio(audio_bytes, sample_rate=16000):
    """Play raw 16-bit mono PCM audio on the default output device.

    Playback is best-effort: any error is printed rather than raised, so a
    failed playback does not abort the caller.

    Args:
        audio_bytes: Raw PCM samples, written to the output stream as-is.
        sample_rate: Playback rate in Hz passed to the output stream.
    """
    try:
        p = pyaudio.PyAudio()
        try:
            stream = p.open(format=pyaudio.paInt16,  # Ensure the format matches 16-bit PCM
                            channels=1,
                            rate=sample_rate,
                            output=True)
            try:
                stream.write(audio_bytes)
                stream.stop_stream()
            finally:
                # Close the stream even when the write fails part-way.
                stream.close()
        finally:
            # Release the PyAudio instance even when opening or writing
            # the stream raised.
            p.terminate()
    except Exception as e:
        print(f"Error playing audio: {e}")

class TTSWorker(threading.Thread):
    """Background thread that owns one TTS instance and serves requests.

    Texts are read from ``queue``; for every text exactly one item is put on
    ``response_queue``. Normally that item is whatever ``synthesis`` returned.
    If synthesis raises, the exception object itself is put on the response
    queue instead, so a consumer blocked on that queue is released and the
    worker stays available for later requests.

    A ``None`` item on ``queue`` is the shutdown sentinel.
    """

    def __init__(self, model_path, queue, response_queue):
        """Create the worker and construct its TTS instance.

        Args:
            model_path: Passed straight through to ``TTS(...)``.
            queue: Queue of input texts; ``None`` asks the worker to exit.
            response_queue: Queue that receives one item per input text.
        """
        super().__init__()
        self.tts = TTS(model_path)
        self.queue = queue
        self.response_queue = response_queue
        self.running = True

    def run(self):
        """Serve requests until ``running`` is cleared or ``None`` arrives."""
        while self.running:
            try:
                text = self.queue.get(timeout=1)
            except Empty:
                # The timeout only exists so the loop re-checks self.running.
                continue

            if text is None:
                self.running = False
                continue

            try:
                result = self.tts.synthesis(text, convert_to_pcm16=True)
            except Exception as exc:
                # Do not let one bad request kill the thread: hand the error
                # to the waiting consumer, which would otherwise block
                # forever on the response queue.
                result = exc
            self.response_queue.put(result)

    def stop(self):
        """Ask the worker to exit and wait for the thread to finish."""
        self.running = False
        # Wake the worker immediately instead of waiting out its get() timeout.
        self.queue.put(None)
        self.join()

# Channels between the main thread and the worker: texts go out on
# tts_queue, synthesis results come back on response_queue.
response_queue = Queue()
tts_queue = Queue()

# Build the worker (its TTS instance is constructed here, once) and start
# its thread.
tts_worker = TTSWorker(
    model_path='/Users/willwade/mms_models/eng',
    queue=tts_queue,
    response_queue=response_queue,
)
tts_worker.start()

def synthesize_speech(text):
    """Send ``text`` to the TTS worker and block until its result arrives.

    Args:
        text: The text to synthesize.

    Returns:
        The item the worker put on ``response_queue`` for this request.

    Raises:
        RuntimeError: If the worker thread is not running and no result is
            available, instead of blocking forever.
        Exception: If the item received from the worker is itself an
            exception, it is re-raised here.
    """
    tts_queue.put(text)
    while True:
        # Sample liveness BEFORE waiting: if the worker was already dead and
        # the queue is still empty after the wait, nothing will ever arrive.
        worker_alive = tts_worker.is_alive()
        try:
            result = response_queue.get(timeout=1)
            break
        except Empty:
            if not worker_alive:
                raise RuntimeError(
                    "TTS worker is not running; no result was produced"
                ) from None

    # Defensive: surface a failure reported as an exception object rather
    # than returning it as if it were a result.
    if isinstance(result, Exception):
        raise result
    return result

# Example usage: synthesize and play two phrases through the same worker.
try:
    for text in ("Hello world", "And all my friends"):
        result = synthesize_speech(text)
        audio_bytes = result["audio_bytes"]
        sample_rate = result["sampling_rate"]
        # Play the audio bytes
        play_audio(audio_bytes, sample_rate)
finally:
    # Stop the TTS worker when done, even if a step above raised: the worker
    # is a non-daemon thread, so leaving it running would keep the process
    # alive after an error.
    tts_worker.stop()

It's really quick the second, third, etc. time around.