
[youtube] auto-generated subtitles from livestream-vods #4130

Closed · shirishag75 closed this 2 years ago

shirishag75 commented 2 years ago

Description

Unable to get subtitles from specific YT videos. Sharing an example; the warning comes at the very end and doesn't say what the issue is :(

Verbose log

$ yt-dlp -vvv -c -f 137 kBnTxqqadfo --write-auto-sub
[debug] Command-line config: ['-vvv', '-c', '-f', '137', 'kBnTxqqadfo', '--write-auto-sub']
[debug] Encodings: locale UTF-8, fs utf-8, pref UTF-8, out utf-8, error utf-8, screen utf-8
[debug] yt-dlp version 2022.05.18 [b14d52355]
[debug] Python version 3.10.5 (CPython 64bit) - Linux-5.18.0-1-amd64-x86_64-with-glibc2.33
[debug] Checking exe version: ffprobe -bsfs
[debug] Checking exe version: ffmpeg -bsfs
[debug] exe versions: ffmpeg 4.4.2-1 (setts), ffprobe 4.4.2-1, phantomjs 2.1.1, rtmpdump 2.4
[debug] Optional libraries: Cryptodome-3.11.0, brotli-1.0.9, certifi-2020.06.20, mutagen-1.45.1, secretstorage-3.3.2, sqlite3-2.6.0, websockets-10.2, xattr-0.9.9
[debug] Proxy map: {}
[debug] [youtube] Extracting URL: kBnTxqqadfo
[youtube] kBnTxqqadfo: Downloading webpage
[youtube] kBnTxqqadfo: Downloading android player API JSON
[youtube] kBnTxqqadfo: Downloading m3u8 information
WARNING: [youtube] Ignoring subtitle tracks found in the HLS manifest; if any subtitle tracks are missing, please report this issue on  https://github.com/yt-dlp/yt-dlp/issues?q= , filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U
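
For reference, the subtitle and automatic-caption tracks yt-dlp detects for a video can be listed with the --list-subs flag:

$ yt-dlp --list-subs kBnTxqqadfo
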
pukkandan commented 2 years ago

Related: https://github.com/yt-dlp/yt-dlp/issues/2039

pukkandan commented 2 years ago
diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py
index ccb41cb2e..453ff28f3 100644
--- a/yt_dlp/extractor/youtube.py
+++ b/yt_dlp/extractor/youtube.py
@@ -2297,7 +2297,7 @@ def refetch_manifest(format_id, delay):
             microformats = traverse_obj(
                 prs, (..., 'microformat', 'playerMicroformatRenderer'),
                 expected_type=dict, default=[])
-            _, is_live, _, formats = self._list_formats(video_id, microformats, video_details, prs, player_url)
+            _, is_live, _, formats, _ = self._list_formats(video_id, microformats, video_details, prs, player_url)
             start_time = time.time()

         def mpd_feed(format_id, delay):
@@ -3121,7 +3121,7 @@ def append_client(*client_names):
             self.report_warning(last_error)
         return prs, player_url

-    def _extract_formats(self, streaming_data, video_id, player_url, is_live, duration):
+    def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, is_live):
         itags, stream_ids = {}, []
         itag_qualities, res_qualities = {}, {}
         q = qualities([
@@ -3278,17 +3278,22 @@ def process_manifest_format(f, proto, itag):
                 if val in qdict), -1)
             return True

+        subtitles = {}
         for sd in streaming_data:
             hls_manifest_url = get_hls and sd.get('hlsManifestUrl')
             if hls_manifest_url:
-                for f in self._extract_m3u8_formats(hls_manifest_url, video_id, 'mp4', fatal=False):
+                fmts, subs = self._extract_m3u8_formats_and_subtitles(hls_manifest_url, video_id, 'mp4', fatal=False, live=is_live)
+                subtitles = self._merge_subtitles(subs, subtitles)
+                for f in fmts:
                     if process_manifest_format(f, 'hls', self._search_regex(
                             r'/itag/(\d+)', f['url'], 'itag', default=None)):
                         yield f

             dash_manifest_url = get_dash and sd.get('dashManifestUrl')
             if dash_manifest_url:
-                for f in self._extract_mpd_formats(dash_manifest_url, video_id, fatal=False):
+                formats, subs = self._extract_mpd_formats_and_subtitles(dash_manifest_url, video_id, fatal=False)
+                subtitles = self._merge_subtitles(subs, subtitles)  # Prioritize HLS subs over DASH
+                for f in formats:
                     if process_manifest_format(f, 'dash', f['format_id']):
                         f['filesize'] = int_or_none(self._search_regex(
                             r'/clen/(\d+)', f.get('fragment_base_url') or f['url'], 'file size', default=None))
@@ -3296,6 +3301,7 @@ def process_manifest_format(f, proto, itag):
                             f['is_from_start'] = True

                         yield f
+        yield subtitles

     def _extract_storyboard(self, player_responses, duration):
         spec = get_first(
@@ -3353,9 +3359,9 @@ def _list_formats(self, video_id, microformats, video_details, player_responses,
             is_live = get_first(live_broadcast_details, 'isLiveNow')

         streaming_data = traverse_obj(player_responses, (..., 'streamingData'), default=[])
-        formats = list(self._extract_formats(streaming_data, video_id, player_url, is_live, duration))
+        *formats, subtitles = self._extract_formats_and_subtitles(streaming_data, video_id, player_url, is_live)

-        return live_broadcast_details, is_live, streaming_data, formats
+        return live_broadcast_details, is_live, streaming_data, formats, subtitles

     def _real_extract(self, url):
         url, smuggled_data = unsmuggle_url(url, {})
@@ -3446,8 +3452,8 @@ def feed_entry(name):
                     'The livestream has not finished processing. Only 4 hours of the video can be currently downloaded. '
                     'This is a known issue and patches are welcome')

-        live_broadcast_details, is_live, streaming_data, formats = self._list_formats(
-            video_id, microformats, video_details, player_responses, player_url, duration)
+        live_broadcast_details, is_live, streaming_data, formats, automatic_captions = \
+            self._list_formats(video_id, microformats, video_details, player_responses, player_url)

         if not formats:
             if not self.get_param('allow_unplayable_formats') and traverse_obj(streaming_data, (..., 'licenseInfos')):
@@ -3577,6 +3583,7 @@ def feed_entry(name):
             'release_timestamp': live_start_time,
         }

+        subtitles = {}
         pctr = traverse_obj(player_responses, (..., 'captions', 'playerCaptionsTracklistRenderer'), expected_type=dict)
         if pctr:
             def get_lang_code(track):
@@ -3603,7 +3610,6 @@ def process_language(container, base_url, lang_code, sub_name, query):
                         'name': sub_name,
                     })

-            subtitles, automatic_captions = {}, {}
             for lang_code, caption_track in captions.items():
                 base_url = caption_track.get('baseUrl')
                 orig_lang = parse_qs(base_url).get('lang', [None])[-1]
@@ -3634,8 +3640,9 @@ def process_language(container, base_url, lang_code, sub_name, query):
                     # Setting tlang=lang returns damaged subtitles.
                     process_language(automatic_captions, base_url, trans_code, trans_name,
                                      {} if orig_lang == orig_trans_code else {'tlang': trans_code})
-            info['automatic_captions'] = automatic_captions
-            info['subtitles'] = subtitles
+
+        info['automatic_captions'] = automatic_captions
+        info['subtitles'] = subtitles

         parsed_url = compat_urllib_parse_urlparse(url)
         for component in [parsed_url.fragment, parsed_url.query]:

This extracts the subtitles, but our webvtt parser is unable to process the subtitle fragments.
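
A side note on the unpacking idiom in the last hunk of _list_formats: _extract_formats_and_subtitles is a generator that yields each format dict and then yields the subtitles dict last, so starred assignment splits them apart. A toy sketch, not code from the patch:

def gen():
    yield {'format_id': '137'}      # formats, one per yield
    yield {'en': [{'ext': 'vtt'}]}  # subtitles dict, yielded last

*formats, subtitles = gen()
assert formats == [{'format_id': '137'}]
assert subtitles == {'en': [{'ext': 'vtt'}]}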

cc @fstirlitz

shirishag75 commented 2 years ago

The bit of code that you shared, would that be in the new release, and if yes, when will you be releasing it? As long as the subtitle is extracted into a proper .en.vtt on the same path as the .webm media file, I don't think I need to worry; MPV will read it. Looking forward to knowing when you are releasing the new release. ytdl 22.06.22 perhaps ???

pukkandan commented 2 years ago

No, this code is useless without fixing the parser

shirishag75 commented 2 years ago

> No, this code is useless without fixing the parser

Oh, so that's why you brought it to the other gentleman's attention; perhaps he might be able to fix the parser end of things. Thank you for responding so quickly to the issue. And I did see you made a new release. Just filed a wishlist bug, so hopefully by the weekend the new release will be in Debian sid and then testing -

https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1013332

fstirlitz commented 2 years ago

YouTube uses obsolete metadata headers in its WebVTT files that were removed from the standard back in 2017 (Because following standards is for the hoi polloi and Google knows better, right? Yet another case of https://nitter.42l.fr/Rich_Harris/status/1220412711768666114.) This isn’t a problem for non-segmented subtitles, because we don’t parse them at all.

I guess we can pass them through as-is or strip them. There is a Kind header we may be interested in, which can probably differentiate between (transcribed) captions and (translated) subtitles.
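
For illustration, the header of such a fragment looks roughly like this (values and ordering invented for this example); the Kind: and Language: lines use the pre-2017 metadata-header syntax that the parser rejects:

WEBVTT
Kind: captions
Language: en
X-TIMESTAMP-MAP=LOCAL:00:00:00.000,MPEGTS:900000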

shirishag75 commented 2 years ago

> YouTube uses obsolete metadata headers in its WebVTT files that were removed from the standard back in 2017 (Because following standards is for the hoi polloi and Google knows better, right? Yet another case of https://nitter.42l.fr/Rich_Harris/status/1220412711768666114.) This isn’t a problem for non-segmented subtitles, because we don’t parse them at all.
>
> I guess we can pass them through as-is or strip them. There is a Kind header we may be interested in, which can probably differentiate between (transcribed) captions and (translated) subtitles.

Is there a bug report or something that says what Google's plans are? Looking forward to the header handling or whatever can be implemented.

shirishag75 commented 2 years ago

@pukkandan any update on this ???

fstirlitz commented 2 years ago

I actually have a ready patch for this in my drafts:

--- a/yt_dlp/webvtt.py
+++ b/yt_dlp/webvtt.py
@@ -161,6 +161,15 @@ class Magic(HeaderBlock):
     _REGEX_TSMAP_MPEGTS = re.compile(r'MPEGTS:([0-9]+)')
     _REGEX_TSMAP_SEP = re.compile(r'[ \t]*,[ \t]*')

+    # This was removed from the spec in the 2017 revision;
+    # the last spec draft to describe this syntax element is
+    # <https://www.w3.org/TR/2015/WD-webvtt1-20151208/#webvtt-metadata-header>.
+    # Nevertheless, YouTube keeps serving those, because Google
+    # knows better than those pesky standards bodies, right?
+
+    _REGEX_META = re.compile(
+        r'(?:(?!-->)[^\r\n])+:(?:(?!-->)[^\r\n])+(?:\r\n|[\r\n])')
+
     @classmethod
     def __parse_tsmap(cls, parser):
         parser = parser.child()
@@ -200,13 +209,19 @@ def parse(cls, parser):
             raise ParseError(parser)

         extra = m.group(1)
+        meta = ''
         local, mpegts = None, None
-        if parser.consume(cls._REGEX_TSMAP):
-            local, mpegts = cls.__parse_tsmap(parser)
-        if not parser.consume(_REGEX_NL):
+        while not parser.consume(_REGEX_NL):
+            if parser.consume(cls._REGEX_TSMAP):
+                local, mpegts = cls.__parse_tsmap(parser)
+                continue
+            m = parser.consume(cls._REGEX_META)
+            if m:
+                meta += m.group(0)
+                continue
             raise ParseError(parser)
         parser.commit()
-        return cls(extra=extra, mpegts=mpegts, local=local)
+        return cls(extra=extra, mpegts=mpegts, local=local, meta=meta)

     def write_into(self, stream):
         stream.write('WEBVTT')
@@ -219,6 +234,8 @@ def write_into(self, stream):
             stream.write(',MPEGTS:')
             stream.write(str(self.mpegts if self.mpegts is not None else 0))
             stream.write('\n')
+        if self.meta:
+            stream.write(self.meta)
         stream.write('\n')

Perhaps the headers could be parsed more meaningfully (e.g. into an ordered multi-map), but this is enough for them not to generate errors and to be preserved in the final file.
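
A minimal sketch of that idea, parsing the raw meta string into an ordered list of key/value pairs (illustrative only, not part of the patch above):

def parse_meta_headers(meta):
    headers = []
    for line in meta.splitlines():
        key, sep, value = line.partition(':')
        if sep:  # keep only well-formed "Name: value" lines
            headers.append((key.strip(), value.strip()))
    return headers

parse_meta_headers('Kind: captions\nLanguage: en\n')
# => [('Kind', 'captions'), ('Language', 'en')]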

pukkandan commented 2 years ago

Do you want me to merge the patch directly, or do you want to make a PR?

fstirlitz commented 2 years ago

Too busy currently to make a PR. Also, I don’t remember the last time I actually exercised this code, so consider it untested.

pukkandan commented 2 years ago

No worries. I'll test and merge this directly

shirishag75 commented 2 years ago

nice, looking forward to seeing this :)

fstirlitz commented 2 years ago

Eh, of course I got it wrong. The way I wrote it at first, X-TIMESTAMP-MAP would be stuffed into .meta and never captured by __parse_tsmap. See the corrected version.
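
A standalone snippet showing the failure mode (the regex is copied from the patch above; everything else is illustrative):

import re

_REGEX_META = re.compile(
    r'(?:(?!-->)[^\r\n])+:(?:(?!-->)[^\r\n])+(?:\r\n|[\r\n])')

# The generic metadata regex also matches the timestamp map (it contains
# a colon), so it must only be tried after _REGEX_TSMAP has failed.
print(bool(_REGEX_META.match('X-TIMESTAMP-MAP=LOCAL:00:00:00.000,MPEGTS:900000\n')))  # True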