Open Chafid opened 1 week ago
This is probably not right so please don't just expect this to work.
class GoogleTTS(AbstractTTS):
    # Other methods remain unchanged

    def synth_to_bytestream(
        self, text: Any, format: Optional[str] = "wav"
    ) -> Generator[bytes, None, None]:
        """
        Yield synthesized audio for ``text`` one word at a time.

        Word timings are estimated with ``self.estimate_word_timings`` and
        stored on ``self.timings`` before any audio is produced. Each timing
        entry is then synthesized individually with ``self.synth``.

        :param text: The text to synthesize.
        :param format: The desired audio format (e.g., 'wav', 'mp3', 'flac').
            NOTE(review): this argument is not read anywhere in the body.
        :return: A generator yielding the result of ``self.synth`` for each
            word (presumably bytes; confirm against ``synth``).
        """
        # Timings are kept on the instance so that other code can read them.
        self.timings = self.estimate_word_timings(text)

        for entry in self.timings:
            # The word is the third field of every timing entry.
            spoken_word = entry[2]
            # Placeholder: per-word synthesis stands in for real streaming.
            yield self.synth(spoken_word)
And Sherpa onnx
class SherpaOnnxTTS(AbstractTTS):
    def synth_to_bytestream(
        self, text: Any, format: Optional[str] = "wav"
    ) -> Generator[bytes, None, None]:
        """
        Stream synthesized audio for ``text`` one generated chunk at a time.

        Estimated word timings are stored on ``self.timings`` before any
        audio is produced. Every ``(progress, samples)`` pair coming from
        ``self.generate_audio_chunks`` is converted with
        ``self._convert_audio`` and yielded immediately, so nothing is
        buffered across iterations.

        :param text: The text to synthesize.
        :param format: The desired audio format (e.g., 'wav', 'mp3', 'flac').
        :return: A generator yielding one converted audio payload per
            generated chunk. For ``format == "wav"`` a leading RIFF header is
            removed from each payload via ``self._strip_wav_header``.
        :raises Exception: Any error raised during synthesis or conversion is
            logged and re-raised unchanged.
        """
        try:
            logging.info(
                f"[SherpaOnnxTTS.synth_to_bytestream] Synthesizing text: {text}"
            )

            # Generate estimated word timings using the abstract method
            self.timings = self.estimate_word_timings(text)

            for chunk_idx, (progress, samples) in enumerate(
                self.generate_audio_chunks(text)
            ):
                logging.info(
                    f"Processing audio chunk {chunk_idx} with progress {progress}"
                )

                # The previous version appended to a list that was emptied
                # after every yield, so it only ever held this one chunk and
                # the post-loop flush could never run. The single-element
                # concatenate is kept so _convert_audio still receives a
                # fresh ndarray copy, exactly as before.
                pcm = np.concatenate([samples], axis=0)

                # Convert PCM data to the desired audio format
                converted_audio = self._convert_audio(
                    pcm, format, self.audio_rate
                )

                # A per-chunk RIFF header would corrupt a continuous stream.
                if format == "wav" and converted_audio[:4] == b"RIFF":
                    logging.info("Stripping wav header from bytestream")
                    converted_audio = self._strip_wav_header(converted_audio)

                yield converted_audio
        except Exception as e:
            logging.error(f"Error in synth_to_bytestream: {e}")
            raise
Nb. Would also need
def start_playback_with_callbacks(self, text: str, callback=None):
    """Schedule one word callback per entry in ``self.timings``.

    For every ``(start, end, word)`` entry a ``threading.Timer`` is started
    that calls ``callback(word, start, end)``, and the timer is appended to
    ``self.timers``.

    NOTE(review): ``start`` is treated as an offset in seconds from the
    beginning of playback -- confirm against ``estimate_word_timings``.
    NOTE(review): ``text`` is not read here and no audio playback is started
    in this function; ``self.timings`` must already be populated.

    :param text: The text being spoken (currently unused in the body).
    :param callback: Callable invoked as ``callback(word, start, end)``.
        Defaults to ``self.on_word_callback`` when ``None``.
    """
    if callback is None:
        callback = self.on_word_callback

    # Reference point for the word offsets. Subtracting the wall-clock epoch
    # from an offset (as before) is always negative, which clamped every
    # delay to 0 and fired all callbacks at once.
    playback_start = time.monotonic()

    for start, end, word in self.timings:
        # Account for the time already spent scheduling earlier timers.
        elapsed = time.monotonic() - playback_start
        delay = max(0, start - elapsed)
        timer = threading.Timer(delay, callback, args=(word, start, end))
        timer.start()
        self.timers.append(timer)
Prerequisites
For more information, see the contributing guide.
Description
Word timing does not work with Google TTS because Google TTS uses synth_to_bytestream, which does not implement the timing.
Steps to Reproduce
run tts.start_playback_with_callbacks(text, callback=my_callback) using google tts.
Expected behavior: [What you expected to happen] word timing won't be printed
Actual behavior: [What actually happened] word timing is implemented in synth_to_bytes instead of synth_to_bytestream
Notes: The same happens with Sherpa-ONNX, with a similar root cause.
This also happens with SAPI, but there the issue is that it uses synth_to_bytes, and on SAPI that function only serves as a wrapper for the client's synth function, so it does not implement word timing either.