How can I select which captions should be downloaded in get_transcript()?

RadoslavL commented 1 year ago

If I have a caption name string (example: "English (auto-generated)"), how can I select this specific caption to be downloaded? How can I get the transcript's name from the API?

Thank you in advance!

tombulled commented 11 months ago

I'm more than happy to create an example for this - will look into this now for you

tombulled commented 11 months ago

Ok so this is the first approach I've got working (this lists captions for all available languages):

from innertube import InnerTube
from pprint import pprint

# YouTube web client
web = InnerTube("WEB", "2.20230920.00.00")

# Linus Tech Tips - I couldn't do my job without this. - PiKVM
player = web.player("232opnNPGNo")

# Every caption track listed in the player response for this video.
tracks = player["captions"]["playerCaptionsTracklistRenderer"]["captionTracks"]

for track in tracks:
    track_url: str = track["baseUrl"]
    track_name: str = track["name"]["simpleText"]
    track_language: str = track["languageCode"]

    print(f"Captions for {track_name!r} [{track_language}]:")

    # The timed-text endpoint returns XML by default; fmt=json3 asks for JSON.
    response = web.adaptor.session.get(track_url, params={"fmt": "json3"})
    track_captions: dict = response.json()

    pprint(track_captions)
    print()

A few important points about this approach:

- This uses YouTube's /api/timedtext endpoint, which is not a true part of the InnerTube API
- This makes use of the httpx.Client session the InnerTube client uses under the hood, which is a bit hacky. If you want to take this approach, it's probably worth instantiating your own InnerTubeAdaptor.
- The /api/timedtext endpoint returns XML by default, hence the addition of an fmt=json3 query parameter to get it to return JSON

I'll do some more digging to see if there's an easier/better way to do this using the InnerTube API

tombulled commented 11 months ago

Here's a lashed-together example that pulls the data from the InnerTube API:

from innertube import InnerTube

# panelIdentifier value that marks the transcript engagement panel.
PANEL_IDENTIFIER_TRANSCRIPT = "engagement-panel-searchable-transcript"

def extract_transcript_params(next_data):
    """Return the transcript endpoint params found in a ``next`` response.

    Scans ``next_data["engagementPanels"]`` in order and, for the first panel
    whose ``panelIdentifier`` equals ``PANEL_IDENTIFIER_TRANSCRIPT``, returns
    the ``params`` value of its ``getTranscriptEndpoint``.

    Returns ``None`` when no panel carries that identifier.
    """
    for panel in next_data["engagementPanels"]:
        section = panel["engagementPanelSectionListRenderer"]

        # Panels without an identifier simply do not match (``.get`` -> None).
        if section.get("panelIdentifier") == PANEL_IDENTIFIER_TRANSCRIPT:
            continuation_item = section["content"]["continuationItemRenderer"]
            endpoint = continuation_item["continuationEndpoint"]
            return endpoint["getTranscriptEndpoint"]["params"]

    return None

def extract_transcript_languages(transcript_data):
    """List the languages offered by a transcript response.

    Walks from the first action's ``updateEngagementPanelAction`` down to the
    language menu in the transcript panel footer and returns one dict per
    sub-menu item, in menu order. Each dict has ``"language"`` (the item's
    ``title``) and ``"params"`` (the item's reload continuation value).
    """
    panel_action = transcript_data["actions"][0]["updateEngagementPanelAction"]
    search_panel = panel_action["content"]["transcriptRenderer"]["content"][
        "transcriptSearchPanelRenderer"
    ]
    language_menu = search_panel["footer"]["transcriptFooterRenderer"]["languageMenu"]
    menu_items = language_menu["sortFilterSubMenuRenderer"]["subMenuItems"]

    return [
        {
            "language": item["title"],
            "params": item["continuation"]["reloadContinuationData"]["continuation"],
        }
        for item in menu_items
    ]

def extract_transcript_segments(transcript_data):
    """Return the ``initialSegments`` list of a transcript response.

    The list is read from the body of the transcript search panel inside the
    first action's ``updateEngagementPanelAction`` and returned as-is.
    """
    panel_action = transcript_data["actions"][0]["updateEngagementPanelAction"]
    search_panel = panel_action["content"]["transcriptRenderer"]["content"][
        "transcriptSearchPanelRenderer"
    ]
    segment_list = search_panel["body"]["transcriptSegmentListRenderer"]

    return segment_list["initialSegments"]

# YouTube web client
client = InnerTube("WEB", "2.20230920.00.00")

# Linus Tech Tips - I couldn't do my job without this. - PiKVM
data = client.next("232opnNPGNo")

transcript_params = extract_transcript_params(data)

# This first transcript request is only used to read the language menu.
transcript = client.get_transcript(transcript_params)

languages = extract_transcript_languages(transcript)

for language in languages:
    print(f"Transcript for {language['language']!r}:")

    # Each language has its own params value, so fetch its transcript separately.
    language_transcript = client.get_transcript(language["params"])

    for entry in extract_transcript_segments(language_transcript):
        # Only the first key of a segment is inspected; it names the renderer.
        renderer_name, renderer = next(iter(entry.items()))

        if renderer_name == "transcriptSectionHeaderRenderer":
            print(f"# {renderer['snippet']['simpleText']}")
        elif renderer_name == "transcriptSegmentRenderer":
            start_time = renderer["startTimeText"]["simpleText"]
            text = renderer["snippet"]["runs"][0]["text"]

            print(f"[{start_time}] {text}")
        else:
            raise Exception(f"Unknown segment renderer {renderer_name!r}")

    print()

tombulled commented 11 months ago

Here's a more concrete example that uses the timed-text API and parses the data using pydantic (highly recommend pydantic):

from typing import Final, Optional, Sequence

import httpx
from pydantic import AliasPath, BaseModel, ConfigDict, Field
from pydantic.alias_generators import to_camel

import innertube.api
from innertube import InnerTube
from innertube.config import config
from innertube.models import ClientContext

def get_client_context(client_name: str, /) -> ClientContext:
    """Return the client context that ``innertube.api`` has for *client_name*.

    Raises:
        Exception: if ``innertube.api.get_context`` returns ``None`` for the
            given client name.
    """
    context: Optional[ClientContext] = innertube.api.get_context(client_name)

    if context is not None:
        return context

    raise Exception(f"No context available for client {client_name!r}")

# InnerTube client used for the player request in get_caption_tracks().
WEB: Final[InnerTube] = InnerTube("WEB", "2.20230920.00.00")
# Context looked up for the "WEB" client; it supplies the headers below.
WEB_CONTEXT: Final[ClientContext] = get_client_context("WEB")
# Separate httpx client used for the timed-text requests in get_timed_text(),
# built with the configured base URL and the WEB context's headers.
WEB_SESSION: Final[httpx.Client] = httpx.Client(
    base_url=config.base_url,
    headers=WEB_CONTEXT.headers(),
)

class BaseCamelModel(BaseModel):
    """Base model whose field aliases are generated with ``to_camel``."""

    # snake_case field names get camelCase aliases (e.g. base_url -> baseUrl).
    model_config = ConfigDict(alias_generator=to_camel)

class CaptionTrack(BaseCamelModel):
    """One caption track, validated from an item of ``captionTracks``."""

    # URL the timed-text for this track is fetched from.
    base_url: str
    # Display name, read from the nested ``name`` -> ``simpleText`` path.
    name: str = Field(validation_alias=AliasPath("name", "simpleText"))
    # NOTE(review): meaning of vssId is not visible here - presumably a track identifier; confirm.
    vss_id: str
    language_code: str
    is_translatable: bool

class TimedTextEventSegment(BaseCamelModel):
    """One entry of a timed-text event's ``segs`` list."""

    # Text of the segment; required.
    utf8: str
    # NOTE(review): presumably an offset in milliseconds relative to the event start - confirm.
    t_offset_ms: Optional[int] = None
    # NOTE(review): presumably a speech-recognition confidence value - confirm.
    ac_asr_conf: Optional[int] = None

class TimedTextEvent(BaseCamelModel):
    """One entry of the timed-text ``events`` list.

    Only ``t_start_ms`` is required; every other field defaults to ``None``.
    """

    # Start of the event; presumably in milliseconds - TODO confirm.
    t_start_ms: int
    # NOTE(review): presumably the event duration in milliseconds - confirm.
    d_duration_ms: Optional[int] = None
    id: Optional[int] = None
    # NOTE(review): the three fields below look like window position / style / id references - confirm.
    wp_win_pos_id: Optional[int] = None
    ws_win_style_id: Optional[int] = None
    w_win_id: Optional[int] = None
    a_append: Optional[int] = None
    # Text segments of the event; may be missing entirely.
    segs: Optional[Sequence[TimedTextEventSegment]] = None

class TimedText(BaseCamelModel):
    """Top-level timed-text document, validated from the JSON response body."""

    wire_magic: str
    # Pens, window styles and window positions are kept as raw dicts;
    # their contents are not modelled here.
    pens: Sequence[dict]
    ws_win_styles: Sequence[dict]
    wp_win_positions: Sequence[dict]
    # The caption events themselves.
    events: Sequence[TimedTextEvent]

def get_caption_tracks(video_id: str, /) -> Sequence[CaptionTrack]:
    """Fetch the caption tracks listed in the player response for *video_id*.

    Requests the player data through ``WEB`` and validates every item of
    ``captions.playerCaptionsTracklistRenderer.captionTracks`` as a
    ``CaptionTrack``. The result is a list in response order.
    """
    player_response: dict = WEB.player(video_id)
    tracklist: dict = player_response["captions"]["playerCaptionsTracklistRenderer"]
    raw_tracks: Sequence[dict] = tracklist["captionTracks"]

    return list(map(CaptionTrack.model_validate, raw_tracks))

def get_timed_text(url: str, /) -> TimedText:
    """Download and validate the timed-text document at *url*.

    The request is sent through ``WEB_SESSION`` with ``fmt=json3`` so that the
    response body is JSON, which is then validated as a ``TimedText``.

    Raises:
        httpx.HTTPStatusError: if the response does not have a success
            status code.
    """
    response: httpx.Response = WEB_SESSION.get(url, params={"fmt": "json3"})

    # Fail with a clear HTTP error instead of a confusing JSON-decoding or
    # validation error when the endpoint answers with an error response.
    response.raise_for_status()

    response_data: dict = response.json()

    return TimedText.model_validate(response_data)

def print_timed_text(timed_text: TimedText, /) -> None:
    """Print each event of *timed_text* as ``[<t_start_ms>] <text>``.

    The text of an event is the concatenation of the ``utf8`` values of its
    segments, in order.
    """
    for event in timed_text.events:
        # Events whose ``segs`` is None or empty print with an empty text.
        text: str = "".join(segment.utf8 for segment in event.segs or ())

        print(f"[{event.t_start_ms}] {text}")

# Linus Tech Tips - I couldn't do my job without this. - PiKVM
video_id: str = "232opnNPGNo"

# Fetch and print the timed-text of every caption track, one track at a time.
for track in get_caption_tracks(video_id):
    print(f"Fetching timed-text for {track.name!r}...")

    print_timed_text(get_timed_text(track.base_url))
    print()

RadoslavL commented 11 months ago

Thank you! I knew this solution existed, but didn't want to use 3 API calls. I guess there isn't a way around that.

tombulled / innertube

How can I select which captions should be downloaded in get_transcript()? #57