The original SubMaker code doesn't know where the last position in a sentence is; it just splits subtitles every words_in_cue words (default value is 10). But how do we know which position is the last one in a sentence? Very simple: count the number of times the last word (or character) appears in the sentence. Before that, you need to split the original text into sentences in some way, so that for each sentence you get the sentence itself, its last word, and the number of times that last word appears.
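To make the idea concrete, here is a minimal sketch of the counting trick (the sentence and loop are purely illustrative; the real implementation is in the two steps below):

# Illustrative only: detect the end of a sentence by counting its last character.
sentence = "the cat sat on the mat"
last_word = sentence[-1]                   # 't'
last_word_num = sentence.count(last_word)  # 't' occurs 5 times in this sentence

accumulated = ""
for word in ["the", "cat", "sat", "on", "the", "mat"]:
    accumulated += word  # WordBoundary text contains no whitespace
    if accumulated.count(last_word) == last_word_num:
        print("sentence ends at word:", word)  # fires on "mat"
        break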
We need to do this in two steps.
Step 1: replace the existing 'submaker.py' in 'venv\Lib\site-packages\edge_tts' with the new 'submaker.py' below (a short snippet after these steps shows how to locate that directory).
Step 2: split the text into sentences in some way before passing them to SubMaker.
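If you are not sure where edge_tts is installed (the 'venv\Lib\site-packages\edge_tts' path above assumes a Windows virtual environment), you can find the directory like this:

import os
import edge_tts
print(os.path.dirname(edge_tts.__file__))  # prints the folder that contains submaker.py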
STEP 1: the following is the new submaker.py.
####################################### dividing line ##########################################
"""
SubMaker package for the Edge TTS project.
SubMaker is a package that makes the process of creating subtitles with
information provided by the service easier.
"""
import math
from typing import List, Tuple
from xml.sax.saxutils import escape, unescape
def formatter(sub_line_count: int, start_time: float, end_time: float, subdata: str) -> str:
"""
formatter returns the timecode and the text of the subtitle.
"""
return (
f"{sub_line_count}\n"
f"{mktimestamp(start_time)} --> {mktimestamp(end_time)}\n"
f"{escape(subdata)}\n\n"
)
def mktimestamp(time_unit: float) -> str:
"""
mktimestamp returns the timecode of the subtitle.
The timecode is in the format of 00:00:00.000.
Returns:
str: The timecode of the subtitle.
"""
hour = math.floor(time_unit / 10**7 / 3600)
minute = math.floor((time_unit / 10**7 / 60) % 60)
seconds = (time_unit / 10**7) % 60
# return f"{hour:02d}:{minute:02d}:{seconds:06.3f}"
return f"{hour:02d}:{minute:02d}:{seconds:06.3f}".replace(".", ",")
class SubMaker:
"""
SubMaker class
"""
def __init__(self) -> None:
"""
SubMaker constructor.
"""
self.offset: List[Tuple[float, float]] = []
self.subs: List[str] = []
def create_sub(self, timestamp: Tuple[float, float], text: str) -> None:
"""
create_sub creates a subtitle with the given timestamp and text
and adds it to the list of subtitles
Args:
timestamp (tuple): The offset and duration of the subtitle.
text (str): The text of the subtitle.
Returns:
None
"""
self.offset.append((timestamp[0], timestamp[0] + timestamp[1]))
self.subs.append(text)
def generate_subs(self, three_dimensional_list, words_in_cue: int = 10) -> str:
"""
generate_subs generates the complete subtitle file.
Args:
words_in_cue (int): defines the number of words in a given cue
Returns:
str: The complete subtitle file.
three_dimensional_list:
[(sentence, last_word, last_word_num), (sentence, last_word, last_word_num)]
"""
if len(self.subs) != len(self.offset):
raise ValueError("subs and offset are not of the same length")
if words_in_cue <= 0:
raise ValueError("words_in_cue must be greater than 0")
# data = "WEBVTT\r\n\r\n"
data = ''
sub_state_count = 0
sub_state_start = -1.0
sub_state_subs = ""
sub_line_count = 0 # new variable used to indicate which line of subtitle this is
for idx, (offset, subs) in enumerate(zip(self.offset, self.subs)):
start_time, end_time = offset
subs = unescape(subs)
# wordboundary is guaranteed not to contain whitespace
# if len(sub_state_subs) > 0:
# sub_state_subs += " "
sub_state_subs += subs
if sub_state_start == -1.0:
sub_state_start = start_time
sub_state_count += 1
sentence, last_word, last_word_num = three_dimensional_list[sub_line_count]
if sub_state_subs.count(last_word) == last_word_num or idx == len(self.offset) - 1:
sub_line_count += 1
# subs = sub_state_subs
subs = sentence
split_subs: List[str] = [
subs[i : i + 79] for i in range(0, len(subs), 79)
]
for i in range(len(split_subs) - 1):
sub = split_subs[i]
split_at_word = True
if sub[-1] == " ":
split_subs[i] = sub[:-1]
split_at_word = False
if sub[0] == " ":
split_subs[i] = sub[1:]
split_at_word = False
if split_at_word:
split_subs[i] += "-"
data += formatter(
sub_line_count=sub_line_count,
start_time=sub_state_start,
end_time=end_time,
subdata="\r\n".join(split_subs),
)
sub_state_count = 0
sub_state_start = -1
sub_state_subs = ""
return data
####################################### dividing line ##########################################
STEP 2: split the text into sentences using the following code:
####################################### dividing line ##########################################
import asyncio
import re

import edge_tts


def spinoff_sentence(sentence):
    # The "last word" is simply the last character of the sentence;
    # count how many times it occurs within that sentence.
    last_word = sentence[-1]
    last_word_num = sentence.count(last_word)
    return (sentence, last_word, last_word_num)


async def tts_subtitle(text, three_dimensional_list, voice, audio_path, subtitle_path):
    communicate = edge_tts.Communicate(text, voice)
    submaker = edge_tts.SubMaker()
    with open(audio_path, "wb") as file:
        async for chunk in communicate.stream():
            if chunk["type"] == "audio":
                file.write(chunk["data"])
            elif chunk["type"] == "WordBoundary":
                submaker.create_sub((chunk["offset"], chunk["duration"]), chunk["text"])

    with open(subtitle_path, "w", encoding="utf-8") as file:
        file.write(submaker.generate_subs(three_dimensional_list=three_dimensional_list))


async def main():
    file_path = r'C:\1.txt'
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # Split the text into sentences; pick the pattern that matches your language.
    # pattern_eng = r"[,.!<>;:?\{}]"
    # sentences = re.split(pattern_eng, text)
    pattern_chi = r"[:“”‘’──{}【】·《》〈〉,、;。?!]"
    sentences = re.split(pattern_chi, text)
    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]

    three_dimensional_list = []
    for sentence in sentences:
        three_dimensional_list.append(spinoff_sentence(sentence))

    # voice = 'en-US-ChristopherNeural'
    voice = 'zh-CN-XiaoyiNeural'
    audio_path = r'C:\audio.mp3'
    subtitle_path = r'C:\subtitle.srt'
    await tts_subtitle(text, three_dimensional_list, voice, audio_path, subtitle_path)


if __name__ == "__main__":
    loop = asyncio.get_event_loop_policy().get_event_loop()
    try:
        loop.run_until_complete(main())
    finally:
        loop.close()
####################################### dividing line ##########################################
The code above works for both English and Chinese, and indeed for any other language. The only difference is the pattern you choose to split the text into sentences; you can modify the pattern to meet your own needs.
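For example, assuming the commented-out English pattern and the spinoff_sentence function from STEP 2, a hypothetical input produces tuples like these:

import re
pattern_eng = r"[,.!<>;:?\{}]"
text = "Hello world. How are you today?"
sentences = [s.strip() for s in re.split(pattern_eng, text) if s.strip()]
print([spinoff_sentence(s) for s in sentences])
# [('Hello world', 'd', 1), ('How are you today', 'y', 2)]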
Note: the original submaker.py exports subtitles in VTT format, while the new submaker.py exports SRT.
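For reference, a single cue written by the new formatter looks like this (index and timestamps are illustrative):

1
00:00:01,250 --> 00:00:03,700
This is the first sentence of the text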
Thanks again to the author of edge-tts for their efforts.