Implementing a different version of youtube_transcriber.py using yt-dlp and Whisper

240db commented 5 days ago

Inspiration

There is a Gradio Space, https://huggingface.co/spaces/hf-audio/whisper-large-v3, that runs Whisper through the Hugging Face Transformers pipeline API:

import spaces
import torch

import gradio as gr
import yt_dlp as youtube_dl
from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_read

import tempfile
import os

# Hugging Face checkpoint loaded into the speech-recognition pipeline below.
MODEL_NAME = "openai/whisper-large-v3"
# Batch size forwarded to the pipeline on every transcription call.
BATCH_SIZE = 8
# NOTE(review): not referenced anywhere in the code shown here.
FILE_LIMIT_MB = 1000
# Longest YouTube video accepted, in seconds (1 hour).
YT_LENGTH_LIMIT_S = 60 * 60

# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = 0
else:
    device = "cpu"

# Shared ASR pipeline used by every transcription entry point.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)

@spaces.GPU
def transcribe(inputs, task):
    """Run the shared Whisper pipeline on an uploaded or recorded audio input.

    Args:
        inputs: Audio input handed over by the Gradio component; ``None`` when
            the user submitted nothing.
        task: Forwarded to the model as ``generate_kwargs["task"]``.

    Returns:
        The ``"text"`` field of the pipeline result.

    Raises:
        gr.Error: If ``inputs`` is ``None``.
    """
    if inputs is not None:
        result = pipe(
            inputs,
            batch_size=BATCH_SIZE,
            generate_kwargs={"task": task},
            return_timestamps=True,
        )
        return result["text"]

    raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

def _return_yt_html_embed(yt_url):
    video_id = yt_url.split("?v=")[-1]
    HTML_str = (
        f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
        " </center>"
    )
    return HTML_str

def download_yt_audio(yt_url, filename):
    """Download a YouTube video to ``filename`` after checking its length.

    The video metadata is fetched first (without downloading) and the
    duration is compared against ``YT_LENGTH_LIMIT_S``; only videos within
    the limit are downloaded.

    Args:
        yt_url: URL of the video to download.
        filename: Output path handed to yt-dlp as ``outtmpl``.

    Raises:
        gr.Error: If the metadata lookup or the download fails, or if the
            video is longer than ``YT_LENGTH_LIMIT_S`` seconds.
    """
    # Imported locally: the surrounding script uses ``time`` without importing it.
    import time

    with youtube_dl.YoutubeDL() as info_loader:
        try:
            info = info_loader.extract_info(yt_url, download=False)
        except youtube_dl.utils.DownloadError as err:
            raise gr.Error(str(err)) from err

    # "SS", "MM:SS" or "HH:MM:SS" -> seconds, folding from the left.
    file_length_s = 0
    for sub_length in info["duration_string"].split(":"):
        file_length_s = file_length_s * 60 + int(sub_length)

    if file_length_s > YT_LENGTH_LIMIT_S:
        yt_length_limit_hms = time.strftime("%HH:%MM:%SS", time.gmtime(YT_LENGTH_LIMIT_S))
        file_length_hms = time.strftime("%HH:%MM:%SS", time.gmtime(file_length_s))
        raise gr.Error(f"Maximum YouTube length is {yt_length_limit_hms}, got {file_length_hms} YouTube video.")

    ydl_opts = {"outtmpl": filename, "format": "worstvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best"}

    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        try:
            ydl.download([yt_url])
        except (youtube_dl.utils.DownloadError, youtube_dl.utils.ExtractorError) as err:
            # Also catch DownloadError (already handled for the metadata lookup above), not only ExtractorError.
            raise gr.Error(str(err)) from err

@spaces.GPU
def yt_transcribe(yt_url, task, max_filesize=75.0):
    """Download a YouTube video and transcribe it with the shared pipeline.

    Args:
        yt_url: URL of the YouTube video.
        task: Forwarded to the model as ``generate_kwargs["task"]``.
        max_filesize: Accepted for interface compatibility; not used in the body.

    Returns:
        A tuple ``(html_embed, text)``: the iframe snippet for the video and
        the ``"text"`` field of the pipeline result.
    """
    embed_html = _return_yt_html_embed(yt_url)

    # The download lives only as long as the temporary directory, so the
    # bytes are read into memory before the directory is removed.
    with tempfile.TemporaryDirectory() as workdir:
        video_path = os.path.join(workdir, "video.mp4")
        download_yt_audio(yt_url, video_path)
        with open(video_path, "rb") as video_file:
            raw_bytes = video_file.read()

    waveform = ffmpeg_read(raw_bytes, pipe.feature_extractor.sampling_rate)
    model_input = {"array": waveform, "sampling_rate": pipe.feature_extractor.sampling_rate}

    result = pipe(
        model_input,
        batch_size=BATCH_SIZE,
        generate_kwargs={"task": task},
        return_timestamps=True,
    )
    return embed_html, result["text"]

demo = gr.Blocks()

# Markdown link to the checkpoint, reused in every tab description.
_checkpoint_link = f"[{MODEL_NAME}](https://huggingface.co/{MODEL_NAME})"

# The microphone and file-upload tabs share the same title and description.
_audio_title = "Whisper Large V3: Transcribe Audio"
_audio_description = (
    "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the checkpoint "
    f"{_checkpoint_link} and 🤗 Transformers to transcribe audio files of arbitrary length."
)
_youtube_description = (
    "Transcribe long-form YouTube videos with the click of a button! Demo uses the checkpoint "
    f"{_checkpoint_link} and 🤗 Transformers to transcribe video files of arbitrary length."
)

# Tab 1: record from the microphone.
mf_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources="microphone", type="filepath"),
        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
    ],
    outputs="text",
    title=_audio_title,
    description=_audio_description,
    allow_flagging="never",
)

# Tab 2: upload an audio file.
file_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources="upload", type="filepath", label="Audio file"),
        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
    ],
    outputs="text",
    title=_audio_title,
    description=_audio_description,
    allow_flagging="never",
)

# Tab 3: paste a YouTube URL. Note that this rebinds the module-level name
# ``yt_transcribe`` from the function to the Interface wrapping it.
yt_transcribe = gr.Interface(
    fn=yt_transcribe,
    inputs=[
        gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
    ],
    outputs=["html", "text"],
    title="Whisper Large V3: Transcribe YouTube",
    description=_youtube_description,
    allow_flagging="never",
)

with demo:
    gr.TabbedInterface(
        [mf_transcribe, file_transcribe, yt_transcribe],
        ["Microphone", "Audio file", "YouTube"],
    )

demo.queue().launch()

But you can also run Whisper locally; in that case you only need to add Whisper and yt-dlp as dependencies, which lets you do this with open-source alternatives. Here is a

sketch for replacing the current youtube_transcriber.py

with one that uses Whisper and yt-dlp instead:

import os
import tempfile
import yt_dlp as youtube_dl
import whisper

class YouTubeTranscriber:
    """Download a video's audio with yt-dlp and transcribe it with a local Whisper model."""

    def __init__(self, model_name: str = "base"):
        """
        Loads the Whisper model used for transcription.

        Args:
            model_name (str): Name passed to ``whisper.load_model``. Defaults to
                'base'; other models such as 'tiny' or 'small' can be used instead.
        """
        self.model = whisper.load_model(model_name)

    def download_yt_audio(self, url: str, output_dir: "str | None" = None) -> str:
        """
        Downloads the audio from a YouTube video using yt-dlp and saves it as an mp3 file.

        Args:
            url (str): The YouTube video URL.
            output_dir (str | None): Directory to write the audio into. When
                omitted, a fresh directory is created with ``tempfile.mkdtemp``;
                it is NOT removed automatically, so the caller owns its cleanup.

        Returns:
            str: The file path to the downloaded audio file.
        """
        if output_dir is None:
            # A TemporaryDirectory context would delete the file before this
            # method returns; mkdtemp keeps it alive for the caller.
            output_dir = tempfile.mkdtemp()

        ydl_opts = {
            'format': 'bestaudio/best',
            # Let yt-dlp choose the extension of the raw download; the
            # FFmpegExtractAudio post-processor then produces audio.mp3.
            'outtmpl': os.path.join(output_dir, "audio.%(ext)s"),
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'mp3',
                'preferredquality': '192',
            }],
        }

        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])

        return os.path.join(output_dir, "audio.mp3")

    def transcribe_audio(self, audio_path: str) -> str:
        """
        Transcribes the downloaded audio file using Whisper.

        Args:
            audio_path (str): The path to the audio file.

        Returns:
            str: The transcribed text.
        """
        result = self.model.transcribe(audio_path)
        return result['text']

    def transcribe_from_youtube(self, url: str) -> str:
        """
        Downloads a YouTube video's audio and transcribes it using Whisper.

        Args:
            url (str): The YouTube video URL.

        Returns:
            str: The transcribed text.

        Raises:
            RuntimeError: If the download or the transcription fails; the
                original exception is attached as ``__cause__``.
        """
        try:
            # The temporary directory must outlive the transcription step, so
            # both the download and the transcription happen inside the context.
            with tempfile.TemporaryDirectory() as tmpdirname:
                audio_path = self.download_yt_audio(url, output_dir=tmpdirname)
                return self.transcribe_audio(audio_path)
        except Exception as e:
            raise RuntimeError(f"Error during transcription: {e}") from e

def main():
    """Transcribe an example YouTube video and print a preview of the transcript."""
    yt_url = "https://www.youtube.com/watch?v=nFbJCoTK0_g"  # Replace with the desired YouTube URL
    transcriber = YouTubeTranscriber()

    try:
        transcript = transcriber.transcribe_from_youtube(yt_url)
        print("Transcript extracted successfully.")
        print("First 500 characters of the transcript:")
        # Truncate long transcripts and mark the cut with an ellipsis.
        if len(transcript) > 500:
            preview = transcript[:500] + "..."
        else:
            preview = transcript
        print(preview)
    except Exception as e:
        # Example script: report the failure instead of crashing.
        print(f"An error occurred: {str(e)}")


# Run the example only when executed as a script.
if __name__ == "__main__":
    main()

I think it is still calling ffmpeg to convert to audio, which is not necessary, as you can do it from yt-dlp to almost any audio format. I will edit this further.

souzatharsis commented 5 days ago

That's intriguing, thanks for sharing.

What would be the benefit of using Whisper + yt-dlp versus simply taking the pre-generated transcript from YouTube? I am concerned the proposed solution would add too much latency and require additional API keys / cost money.

Would appreciate if you could elaborate on your thoughts.

Thanks!

240db commented 5 days ago

It's no match for videos where subtitles are already available, but that is usually the case for older videos; not every video has subtitles, especially the newer ones. For longer videos, say a Fed or central bank speech, you can transcribe with faster tools, but Whisper is a bit more robust, even more robust than the auto-generated YouTube subtitles. So you would be able to generate podcasts about more recent content: the content could be transcribed as it's released (audio/video) even if no subtitles were provided.

The subtitles from YouTube are great for cutting the overhead, and they also come in multilingual versions, which is great; the Whisper solution misses the translation part, but their transcription might not be as accurate as Whisper large.

Anyway, this is only for videos that do not have subtitles, or for when you want to try a higher-quality transcript. Not that it will matter too much for Gemini, but it can help to get more immediate content parsed into the pipeline.

souzatharsis commented 5 days ago

Awesome, this makes a lot of sense and sounds like an interesting enhancement!

240db commented 5 days ago

Cool! Also, yt-dlp and Whisper together let you download and transcribe video from almost any website; yt-dlp supports downloading videos from Instagram Reels and other third-party sites.

As for Whisper, in case users have an mp4 source or any external media, it could be processed with ffmpeg if needed and then transcribed with Whisper to make a transcript.txt.

souzatharsis commented 1 hour ago

v0.2.1 makes podcastfy multimodal: images + text for now, but with a pathway to any modality. Having said that, in case a YouTube video does not have captions, we should simply download the video and pass it to the LLM, as I'm passing images and text today. In addition to implementing the download-video feature, I'd need to add support for video in the LLM.

souzatharsis / podcastfy

Implementing a different version of youtube_transcriber.py using yt-dlp and Whisper #26

Inspiration

sketch for replacing the current youtube_transcriber.py