Open sensboston opened 9 months ago
import argparse
import os

import speech_recognition as sr
from pydub import AudioSegment
from pydub.silence import split_on_silence
from pydub.silence import detect_nonsilent
def main():
    """Parse the command-line options of the audio-to-SRT converter.

    Reads one positional argument (the input audio path) and the optional
    output path, recognition language and three integer silence settings,
    then binds each parsed value to a local name.
    """
    # NOTE(review): the pasted script lost its indentation; the statements
    # that consume these locals further down presumably belong to this
    # function body as well -- confirm against the original file.
    cli = argparse.ArgumentParser(
        description="Convert audio to text and generate SRT subtitles"
    )
    cli.add_argument("input_audio", help="Input audio file path")
    cli.add_argument(
        "--output", "-o", default="output.srt", help="Output SRT file path"
    )
    cli.add_argument(
        "--language", "-l", default="tr", help="Language for speech recognition"
    )

    # The three silence-related options share the same shape (integer value
    # with a default), so they are registered from a small table.
    integer_options = (
        ("--min-silence-len", 450, "Minimum silence length in milliseconds"),
        ("--silence-thresh", -14, "Silence threshold in dBFS"),
        ("--keep-silence", 450, "Keep silence duration in milliseconds"),
    )
    for flag, default_value, description in integer_options:
        cli.add_argument(flag, type=int, default=default_value, help=description)

    args = cli.parse_args()

    input_audio_path, output_srt_path, lang = (
        args.input_audio,
        args.output,
        args.language,
    )
    min_silence_len, silence_thresh, keep_silence = (
        args.min_silence_len,
        args.silence_thresh,
        args.keep_silence,
    )
# Shared recognizer instance: get_large_audio_transcription uses it to read
# each exported chunk file (r.record) and to request the transcription
# (r.recognize_google).
r = sr.Recognizer()
def make_three_number(number):
    """Return *number* as a string left-padded with zeros to three characters.

    Used for the millisecond field of an SRT timestamp, so 5 must become
    "005" (5 ms). The previous implementation appended the zeros on the
    right and produced "500", which shifted every short millisecond value.

    Args:
        number: Value to format; it is converted with ``str()``.

    Returns:
        The string form of *number*, at least three characters long. Values
        that are already three or more characters long are returned unchanged.
    """
    return str(number).rjust(3, "0")
def make_two_number(number):
    """Return *number* as a string left-padded with zeros to two characters.

    Values whose string form is already two or more characters long are
    returned unchanged.
    """
    digits = str(number)
    missing = 2 - len(digits)
    return "0" * missing + digits if missing > 0 else digits
def nonsilent_object_to_srt_time_string(array):
    """Format a ``[start, finish]`` pair of millisecond offsets as an SRT time line.

    The result has the form ``HH:MM:SS,mmm --> HH:MM:SS,mmm``. Every field is
    zero-padded on the left; in particular the millisecond field is always
    three digits, so 1005 ms is rendered as ``00:00:01,005`` (the previous
    version rendered it as ``00:00:01,500``).

    Args:
        array: Sequence whose first two items are the start and finish
            offsets in milliseconds.

    Returns:
        The SRT time-range string for the pair.
    """

    def _format_timestamp(total_ms):
        # Integer arithmetic only: avoids the float division the old code
        # used, which can lose precision for very large offsets.
        seconds, millis = divmod(int(total_ms), 1000)
        minutes, seconds = divmod(seconds, 60)
        hours, minutes = divmod(minutes, 60)
        return f"{hours:02d}:{minutes:02d}:{seconds:02d},{millis:03d}"

    start, finish = array[0], array[1]
    return f"{_format_timestamp(start)} --> {_format_timestamp(finish)}"
def get_large_audio_transcription(path, lang, min_silence_len=450,
                                  silence_thresh=-14, keep_silence=450):
    """Transcribe a WAV file chunk by chunk and return the lines of an SRT file.

    The audio is split on silence, each chunk is exported to a temporary WAV
    file in the ``audio-chunks`` folder and sent to the recognizer ``r``.
    Every recognised chunk contributes four entries to the result: its
    running index, its time range, its text and an empty separator line.
    Chunks the recognizer cannot understand are skipped.

    Args:
        path: Path of the input audio. ``.wav`` is appended only when the
            path does not already end with it, so both ``"talk"`` and
            ``"talk.wav"`` open ``talk.wav``.
        lang: Language code passed to the recognizer.
        min_silence_len: Minimum silence length in milliseconds used to
            split the audio.
        silence_thresh: Offset added to the clip's average loudness
            (``sound.dBFS``) to obtain the silence threshold.
        keep_silence: Milliseconds of silence kept around each chunk.

    Returns:
        A list of strings; joining it with newlines gives the SRT content.
        The list is empty when no chunk could be recognised.
    """
    print("Audio Translating Started")
    # Appending unconditionally turned "1.wav" into "1.wav.wav".
    if not path.lower().endswith(".wav"):
        path += ".wav"
    sound = AudioSegment.from_wav(path)

    threshold = sound.dBFS + silence_thresh
    chunks = split_on_silence(
        sound,
        min_silence_len=min_silence_len,
        silence_thresh=threshold,
        keep_silence=keep_silence,
    )
    # Same detection settings as the split above, so that entry k describes
    # the time range of chunk k.
    chunks_details = detect_nonsilent(
        sound,
        min_silence_len=min_silence_len,
        silence_thresh=threshold,
    )

    srt_file_content = []
    print_counter = 1
    folder_name = "audio-chunks"
    if not os.path.isdir(folder_name):
        os.mkdir(folder_name)

    exported_files = []
    try:
        for i, (audio_chunk, time_range) in enumerate(
                zip(chunks, chunks_details), start=1):
            chunk_filename = os.path.join(folder_name, f"chunk{i}.wav")
            exported_files.append(chunk_filename)
            audio_chunk.export(chunk_filename, format="wav")
            with sr.AudioFile(chunk_filename) as source:
                audio_listened = r.record(source)
            try:
                text = r.recognize_google(audio_listened, language=lang)
            except sr.UnknownValueError:
                # Nothing intelligible in this chunk: leave it out of the SRT.
                continue
            srt_file_content.append(f"{print_counter}")
            srt_file_content.append(
                nonsilent_object_to_srt_time_string(time_range))
            srt_file_content.append(f"{text.capitalize()}. ")
            srt_file_content.append("")
            print(f"{i}/{len(chunks)}")
            print_counter += 1
    finally:
        # Clean up even when recognition raises (e.g. a request error), so
        # no chunk files are left behind.
        for file_path in exported_files:
            if os.path.exists(file_path):
                os.remove(file_path)
        try:
            os.rmdir(folder_name)
        except OSError:
            # The folder holds files this function did not create; keep it.
            pass

    # Guarded: the list is empty when no chunk was recognised, and indexing
    # it unconditionally raised IndexError.
    if srt_file_content and srt_file_content[-1] == "\n":
        del srt_file_content[-1]
    print("Audio Translating Finished")
    return srt_file_content
# Transcribe the input audio and write the resulting SRT lines to disk.
# NOTE(review): input_audio_path, lang and output_srt_path are bound from the
# parsed command-line arguments; these statements presumably sit in the same
# scope as those bindings -- confirm when restoring the lost indentation.
srt_file_content = get_large_audio_transcription(input_audio_path, lang)
# Explicit UTF-8: transcripts routinely contain non-ASCII characters (the
# default language is "tr"), which the platform default encoding may not be
# able to represent.
with open(output_srt_path, "w", encoding="utf-8") as srt_file:
    srt_file.write("\n".join(srt_file_content))
# Run the converter only when the file is executed as a script. The pasted
# text had lost the double underscores ("name" / "main"), which raises
# NameError because no variable called ``name`` exists.
if __name__ == "__main__":
    main()
Sorry, I'm not a Python expert, now it gives me:
input_audio_path = args.input_audio
NameError: name 'args' is not defined
By the way, you may close the issue: I've found https://github.com/openai/whisper - it works great right "out of the box" 😉
I downloaded your Python program and installed all dependencies, but running `python main.py 1.wav` gives me nothing :(