Closed giandre closed 1 month ago
Hello,
Thank you :-)
I lack time currently, but I keep your suggestion in mind.
The most time-consuming part would be the per-chapter splitting phase, since neither plain text nor PDF has a clear definition of a chapter. It would rely on a set of heuristic rules and thus would not be perfect. I already have several strategies in mind.
If we are lucky, the PDF file will have a set of metadata, but if you have already worked with PDFs, you know that a PDF can be surprising ;-)
Some background: Actually, I'm working on another project that I've not published yet. The difference is that it is a set of command line tools using the power of the pipe '|' operator in Linux. You can chain actions using it.
I've already developed:
tp
: a text processing tool that can load numerous file types and extract text from them (currently audio and text). I'm adding support for the most common picture, audio, text, and office document formats. So there is an overlap with your request.
Fyi: tp
does more: transcription, punctuation restoration, translation, and summarization.
tts
: converts text to speech that you can either listen to or record to an MP3 file.
In your case, converting a text file to an MP3 would be: tp book.txt | tts --o book.mp3
Hi, I ended up using Claude 3 to help me with the code. I created a new PY file and left everything else the same. Now I can generate audio from a full text. Later I will try to build a file upload feature (PDF or DOCX), but for now this is doing the trick for me. Below is the code "I" used :). Thanks again for your answer.
import os
import tempfile
import tkinter as tk
from tkinter import filedialog, font, scrolledtext, ttk

import customtkinter as ctk

import my_edge_tts
class TextToSpeechUI(ctk.CTk):
    """Main window of a text-to-speech converter backed by ``my_edge_tts``.

    The window holds a text box, a filterable voice combobox, three sliders
    (playback speed, volume, pitch) and a button that writes the entered
    text to an MP3 file chosen through a save dialog.
    """

    def __init__(self):
        """Build the window, its Tk variables and all widgets."""
        super().__init__()
        self.title("Text to Speech Converter")
        self.geometry("900x700")  # Initial window size
        ctk.set_appearance_mode("dark")
        ctk.set_default_color_theme("green")

        # Variables
        self.voice_var = tk.StringVar()
        self.output_file_path = tk.StringVar()
        self.playback_speed_percentage = tk.IntVar(value=100)
        self.volume_percentage = tk.IntVar(value=100)
        self.pitch_hz = tk.IntVar(value=0)

        # Main Frame
        self.main_frame = ctk.CTkFrame(self)
        self.main_frame.pack(fill=tk.BOTH, expand=True, padx=20, pady=20)

        # Rich Text Input
        self.text_label = ctk.CTkLabel(self.main_frame, text="Enter text:")
        self.text_label.pack(pady=(0, 5))
        self.text_input = scrolledtext.ScrolledText(
            self.main_frame, height=10, width=80, wrap=tk.WORD
        )
        self.text_input.pack(fill=tk.BOTH, expand=True, pady=(0, 10))

        # Voice Dropdown
        self.voice_label = ctk.CTkLabel(self.main_frame, text="Select voice:")
        self.voice_label.pack(pady=(10, 5))
        self.voice_options = sorted(my_edge_tts.load_voices())
        self.voice_combobox = ttk.Combobox(
            self.main_frame,
            textvariable=self.voice_var,
            values=self.voice_options,
        )
        self.voice_combobox.pack(pady=(0, 10), fill=tk.X)
        # Re-filter the dropdown entries after every keystroke.
        self.voice_combobox.bind('<KeyRelease>', self.update_voice_list)

        # Style the Combobox to match the dark theme
        self.style = ttk.Style()
        self.style.theme_use('clam')
        self.style.configure(
            'TCombobox',
            fieldbackground='#2b2b2b',
            background='#2b2b2b',
            foreground='white',
        )
        self.style.map('TCombobox', fieldbackground=[('readonly', '#2b2b2b')])
        self.style.map('TCombobox', selectbackground=[('readonly', '#2b2b2b')])
        self.style.map('TCombobox', selectforeground=[('readonly', 'white')])

        # Playback Speed Slider
        (
            self.playback_speed_label,
            self.playback_speed_slider,
            self.playback_speed_value_label,
        ) = self._add_slider(
            "Playback Speed Percentage:", self.playback_speed_percentage, 0, 200
        )

        # Volume Slider
        (
            self.volume_label,
            self.volume_slider,
            self.volume_value_label,
        ) = self._add_slider("Volume Percentage:", self.volume_percentage, 50, 200)

        # Pitch Slider
        (
            self.pitch_label,
            self.pitch_slider,
            self.pitch_value_label,
        ) = self._add_slider("Pitch Hz:", self.pitch_hz, -200, 200)

        # Generate Button
        self.generate_button = ctk.CTkButton(
            self.main_frame, text="Generate Audio", command=self._generate_audio
        )
        self.generate_button.pack(pady=10)

        # Log Label
        self.log_label = ctk.CTkLabel(self.main_frame, text="")
        self.log_label.pack(pady=10)

    def _add_slider(self, title, variable, lower, upper):
        """Pack a caption, a horizontal scale and a live value label.

        Args:
            title: Caption shown above the scale.
            variable: ``tk.IntVar`` the scale reads and writes.
            lower: Minimum value of the scale.
            upper: Maximum value of the scale.

        Returns:
            The ``(caption_label, scale, value_label)`` widgets, in that order.
        """
        caption = ctk.CTkLabel(self.main_frame, text=title)
        caption.pack(pady=(10, 5))
        scale = ttk.Scale(
            self.main_frame,
            from_=lower,
            to=upper,
            orient=tk.HORIZONTAL,
            variable=variable,
            length=300,
            # ttk.Scale stores floats in its variable; snap to whole numbers
            # so the value label does not show long decimal fractions.
            command=lambda raw: variable.set(round(float(raw))),
        )
        scale.pack(fill=tk.X, padx=20)
        value_label = ctk.CTkLabel(self.main_frame, textvariable=variable)
        value_label.pack(pady=(0, 10))
        return caption, scale, value_label

    def create_toolbar(self):
        """Pack a toolbar with bold / italic / underline buttons.

        NOTE(review): nothing in this class calls this method, so the
        toolbar only appears if a caller invokes it explicitly.
        """
        toolbar = ctk.CTkFrame(self.main_frame)
        toolbar.pack(fill=tk.X, pady=(0, 5))
        buttons = (
            ("B", self.toggle_bold),
            ("I", self.toggle_italic),
            ("U", self.toggle_underline),
        )
        for caption, handler in buttons:
            button = ctk.CTkButton(toolbar, text=caption, width=30, command=handler)
            button.pack(side=tk.LEFT, padx=2)

    def _toggle_tag(self, tag, **tag_options):
        """Add or remove ``tag`` on the current selection of the text box.

        The tag is removed when the first selected character already carries
        it; otherwise it is added and configured with ``tag_options``.
        Does nothing when no text is selected, because the ``sel.first``
        index would otherwise raise ``tk.TclError``.
        """
        if not self.text_input.tag_ranges("sel"):
            return
        if tag in self.text_input.tag_names("sel.first"):
            self.text_input.tag_remove(tag, "sel.first", "sel.last")
        else:
            self.text_input.tag_add(tag, "sel.first", "sel.last")
            self.text_input.tag_configure(tag, **tag_options)

    def toggle_bold(self):
        """Toggle the ``bold`` tag on the selected text."""
        self._toggle_tag("bold", font=font.Font(weight="bold"))

    def toggle_italic(self):
        """Toggle the ``italic`` tag on the selected text."""
        self._toggle_tag("italic", font=font.Font(slant="italic"))

    def toggle_underline(self):
        """Toggle the ``underline`` tag on the selected text."""
        self._toggle_tag("underline", underline=True)

    def update_voice_list(self, event):
        """Filter the voice dropdown by the text typed into the combobox.

        Matching is a case-insensitive substring test; an empty entry
        restores the full list of voices.

        Args:
            event: Tk event whose ``widget`` is the combobox being edited.
        """
        typed = event.widget.get()
        if typed == '':
            self.voice_combobox['values'] = self.voice_options
            return
        needle = typed.lower()
        self.voice_combobox['values'] = [
            voice for voice in self.voice_options if needle in voice.lower()
        ]

    def _log(self, text):
        """Show ``text`` in the status label and refresh the window now."""
        self.log_label.configure(text=text)
        # Force a redraw so the message is visible before blocking work runs.
        self.update()

    def _generate_audio(self):
        """Convert the entered text to an MP3 file picked by the user.

        Validates that text and a voice are present, asks for the output
        path, then hands a temporary text file to
        ``my_edge_tts.generate_mp3_file`` together with the slider values.
        Success or failure is reported through the status label.
        """
        text = self.text_input.get("1.0", tk.END).strip()
        if not text:
            self._log("Please enter some text.")
            return

        voice = self.voice_var.get()
        if not voice:
            self._log("Please select a voice.")
            return

        output_file = filedialog.asksaveasfilename(
            defaultextension=".mp3", filetypes=[("MP3 files", "*.mp3")]
        )
        if not output_file:
            # Dialog was cancelled.
            return

        self._log("Generating audio...")
        temp_file_path = None
        try:
            # A unique temp file avoids clobbering a "temp_text.txt" in the
            # working directory and works when that directory is read-only.
            with tempfile.NamedTemporaryFile(
                "w", encoding="utf-8", suffix=".txt", delete=False
            ) as temp_file:
                temp_file_path = temp_file.name
                temp_file.write(text)

            my_edge_tts.generate_mp3_file(
                temp_file_path,
                output_file,
                voice,
                self.playback_speed_percentage.get(),
                self.volume_percentage.get(),
                self.pitch_hz.get(),
            )
        except Exception as e:
            # UI boundary: report any failure to the user instead of crashing.
            self._log(f"Error generating audio: {e}")
        else:
            self._log(f"Audio generated successfully: {output_file}")
        finally:
            # Remove the temp file on failure too; the original leaked it
            # whenever generation raised.
            if temp_file_path is not None:
                try:
                    os.remove(temp_file_path)
                except OSError:
                    pass
# Launch the GUI only when the file is run as a script, not when imported.
if __name__ == "__main__":
    app = TextToSpeechUI()
    app.mainloop()
This tool is a little gem. The output quality is simply awesome, and it works great with EPUBs. I am not a developer, but I tried using ChatGPT to help me add more functionality to your code and failed... Have you considered adding text and PDF support as well? This would make the tool even better!