Subtitle to Audio

Subtitle to Audio generates audio/speech from any subtitle file using Coqui-ai TTS and synchronizes the audio timing with the subtitle timing.

Demo :

Dependencies

ffmpeg, pydub, librosa, coqui-ai TTS, ffmpeg-python

Installation

pip install TTS
pip install git+https://github.com/bnsantoso/sub-to-audio

pip install TTS
pip install subtoaudio

ffmpeg on linux

apt-get install ffmpeg

Example usage

Basic usage is very similar to Coqui-ai TTS; you can check their documentation for details.

!Note: For best results, use non-overlapping subtitles with an optimal characters-per-second (CPS) rate.

!Note: Use software like Aegisub to edit your subtitles.

from subtoaudio import SubToAudio

# list all model
SubToAudio().coqui_model()

# get model index
model = SubToAudio().coqui_model()[1]

# The code will output 'yoursubtitle.wav' in the current directory.
sub = SubToAudio(model_name=model)
subtitle = sub.subtitle("yoursubtitle.srt")
sub.convert_to_audio(sub_data=subtitle)

# you can choose from around 1100 different languages using the fairseq models
sub = SubToAudio(fairseq_language='<lang-iso_code>')
subtitle = sub.subtitle("yoursubtitle.ass")
sub.convert_to_audio(sub_data=subtitle) 

# specify model name
sub = SubToAudio(model_name="tts_models/multilingual/multi-dataset/your_tts")
subtitle = sub.subtitle("yoursubtitle.srt")
sub.convert_to_audio(sub_data=subtitle, output_path="subtitle.wav")

# specify model and config path
sub = SubToAudio(model_path="path/to/your/model.pth", config_path="config/path.json")
subtitle = sub.subtitle("yoursubtitle.srt")
sub.convert_to_audio(sub_data=subtitle)

# speaker=tts.speakers[0] or None if the model doesn't have multiple speakers
# language=tts.languages[0] or None if the model doesn't have multiple languages

# list speaker
sub.speakers()
speaker1 = sub.speakers()[1]

# list languages
sub.languages()
langu = sub.languages()[0]

sub = SubToAudio(model_name="tts_models/multilingual/multi-dataset/your_tts")
subtitle = sub.subtitle("yoursubtitle.srt")
sub.convert_to_audio(sub_data=subtitle, language=langu, speaker=speaker1, output_path="subtitle.wav")

# Save temporary audio to current folder
sub = SubToAudio(model_name="tts_models/multilingual/multi-dataset/your_tts")
subtitle = sub.subtitle("yoursubtitle.srt")
sub.convert_to_audio(sub_data=subtitle, output_path="subtitle.wav", save_temp=True)

Voice Conversion

To use the voice conversion method, you must pass the voice_conversion:bool and speaker_wav:str parameters to convert_to_audio. Voice conversion cannot run if your model has multiple speakers.

from subtoaudio import SubToAudio

sub = SubToAudio(fairseq_language="eng")
subtitle = sub.subtitle("yoursubtitle.srt")
sub.convert_to_audio(sub_data=subtitle, voice_conversion=True, speaker_wav="voice.wav", language="en")

Coqui Studio Api

To use Coqui Studio Api you'll need to configure the COQUI_STUDIO_TOKEN environment variable.

import os

os.environ['COQUI_STUDIO_TOKEN'] = "your-api-token"  # replace with your Coqui Studio API token

Once your token is set, you can use Coqui Studio models by following this naming convention: coqui_studio/en/<studio_speaker_name>/coqui_studio

from subtoaudio import SubToAudio

sub = SubToAudio(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio", progress_bar=False)
subtitle = sub.subtitle("yoursubtitle.srt")
sub.convert_to_audio(sub_data=subtitle, output_path="subtitle.wav", save_temp=True)

# use the emotion and speed parameters
sub.convert_to_audio(sub_data=subtitle, output_path="subtitle.wav", emotion="Happy", speed=1.5)

Tempo Mode

Use the tempo_mode parameter to speed up the audio. There are three tempo modes:

tempo_mode="all" : This accelerates all audio. Use tempo_speed=float to specify the speed.
tempo_mode="overflow" : This accelerates the audio to match the total subtitle duration plus the blank duration before the next subtitle appears. 'tempo_limit' will limit the speed increase during overflow.
tempo_mode="precise" : This accelerates the audio to match the duration the subtitle appears.

from subtoaudio import SubToAudio

# Speed up tempo or speech rate
sub = SubToAudio(model_name="tts_models/de/thorsten/tacotron2-DDC")
subtitle = sub.subtitle("yoursubtitle.srt")
sub.convert_to_audio(sub_data=subtitle, tempo_mode="all", tempo_speed=1.3)

# Change the tempo or speech rate of all audio files, default is 1.2
sub = SubToAudio("tts_models/multilingual/multi-dataset/xtts_v1")
subtitle = sub.subtitle("yoursubtitle.srt")
sub.convert_to_audio(sub_data=subtitle, tempo_mode="all", tempo_speed=1.3)

# Change tempo or speech rate to audio that doesn't match the subtitle duration
sub = SubToAudio(fairseq_language="ind")
subtitle = sub.subtitle("yoursubtitle.srt")
sub.convert_to_audio(sub_data=subtitle, tempo_mode="overflow")

# Limit tempo speed on the overflow mode 
sub = SubToAudio(fairseq_language="ind")
subtitle = sub.subtitle("yoursubtitle.srt")
sub.convert_to_audio(sub_data=subtitle, tempo_mode="overflow", tempo_limit=1.2)

# Match audio length to subtitle duration
sub = SubToAudio(fairseq_language="ind")
subtitle = sub.subtitle("yoursubtitle.srt")
sub.convert_to_audio(sub_data=subtitle, tempo_mode="precise")

Shift Mode

The shift_mode parameter will shift audio that doesn't match the subtitle duration.

shift_mode="right" : Shift audio time to the right and prevent audio overlapping.
shift_mode="left" : Shift audio to the left and prevent audio overlap, but be cautious of limited space on the left side, as some audio may disappear.
shift_mode="interpose" : Shift audio to the middle position and prevent it from overlapping the audio on its right and left. (Note: This mode can be clunky, so use it cautiously.)
shift_mode="left-overlap" : Shift audio time to the left, allowing overlap.
shift_mode="interpose-overlap" : Shift audio to mid position, allowing overlap.
shift_limit=int or "str" : Limit the audio shift; use an integer for milliseconds or a string like "2.5s" for seconds.

from subtoaudio import SubToAudio

# shift mode with limit of 2 second to the right.

sub = SubToAudio(fairseq_language="vie")
subtitle = sub.subtitle("yoursubtitle.srt")
sub.convert_to_audio(sub_data=subtitle, tempo_mode="overflow", shift_mode="right", shift_limit="2s")

# shift audio to left position or, time before next subtitle appear

sub = SubToAudio(fairseq_language="fra")
subtitle = sub.subtitle("yoursubtitle.srt")
sub.convert_to_audio(sub_data=subtitle, shift_mode="left-overlap")

# shift to left, and limit shift only 1 sec.
sub = SubToAudio(fairseq_language="ind")
subtitle = sub.subtitle("yoursubtitle.srt")
sub.convert_to_audio(sub_data=subtitle, shift_mode="left", shift_limit=1000) # 1000 = 1s

Bark and Tortoise example

from subtoaudio import SubToAudio

#  A random speaker will give you weird results when using the Bark model with SubToAudio

# Bark random
sub = SubToAudio("tts_models/multilingual/multi-dataset/bark")
subtitle = sub.subtitle("yoursubtitle.srt")
sub.convert_to_audio(sub_data=subtitle, tempo_mode="overflow")

# Tortoise random
sub = SubToAudio("tts_models/en/multi-dataset/tortoise-v2")
subtitle = sub.subtitle("yoursubtitle.srt")
sub.convert_to_audio(sub_data=subtitle, tempo_mode="overflow", preset="fast")

#  To use voice cloning you need the voice_dir and speaker parameters
#  Voice cloning expects a .wav or .npz file inside the folder speaker_1
#  voice/speaker_1/hana.wav or voice/speaker_1/hana.npz
#  If your speaker folder only has a .wav file, a .npz file will be generated after you run it.

sub = SubToAudio("tts_models/multilingual/multi-dataset/bark")
subtitle = sub.subtitle("yoursubtitle.srt")
sub.convert_to_audio(sub_data=subtitle, tempo_mode="overflow", voice_dir="voice/",speaker="speaker_1")

# same with bark, the folder structure like this 'voice/speaker2/ron.wav'
sub = SubToAudio("tts_models/en/multi-dataset/tortoise-v2")
subtitle = sub.subtitle("yoursubtitle.ass")
sub.convert_to_audio(sub_data=subtitle, tempo_mode="overflow", voice_dir="voice/", speaker="speaker2")

Citation

Eren, G., & The Coqui TTS Team. (2021). Coqui TTS (Version 1.4) [Computer software]. https://doi.org/10.5281/zenodo.6334862

bnsantoso / sub-to-audio

readme