Closed shirishag75 closed 2 years ago
diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py
index ccb41cb2e..453ff28f3 100644
--- a/yt_dlp/extractor/youtube.py
+++ b/yt_dlp/extractor/youtube.py
@@ -2297,7 +2297,7 @@ def refetch_manifest(format_id, delay):
microformats = traverse_obj(
prs, (..., 'microformat', 'playerMicroformatRenderer'),
expected_type=dict, default=[])
- _, is_live, _, formats = self._list_formats(video_id, microformats, video_details, prs, player_url)
+ _, is_live, _, formats, _ = self._list_formats(video_id, microformats, video_details, prs, player_url)
start_time = time.time()
def mpd_feed(format_id, delay):
@@ -3121,7 +3121,7 @@ def append_client(*client_names):
self.report_warning(last_error)
return prs, player_url
- def _extract_formats(self, streaming_data, video_id, player_url, is_live, duration):
+ def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, is_live):
itags, stream_ids = {}, []
itag_qualities, res_qualities = {}, {}
q = qualities([
@@ -3278,17 +3278,22 @@ def process_manifest_format(f, proto, itag):
if val in qdict), -1)
return True
+ subtitles = {}
for sd in streaming_data:
hls_manifest_url = get_hls and sd.get('hlsManifestUrl')
if hls_manifest_url:
- for f in self._extract_m3u8_formats(hls_manifest_url, video_id, 'mp4', fatal=False):
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(hls_manifest_url, video_id, 'mp4', fatal=False, live=is_live)
+ subtitles = self._merge_subtitles(subs, subtitles)
+ for f in fmts:
if process_manifest_format(f, 'hls', self._search_regex(
r'/itag/(\d+)', f['url'], 'itag', default=None)):
yield f
dash_manifest_url = get_dash and sd.get('dashManifestUrl')
if dash_manifest_url:
- for f in self._extract_mpd_formats(dash_manifest_url, video_id, fatal=False):
+ formats, subs = self._extract_mpd_formats_and_subtitles(dash_manifest_url, video_id, fatal=False)
+ subtitles = self._merge_subtitles(subs, subtitles) # Prioritize HLS subs over DASH
+ for f in formats:
if process_manifest_format(f, 'dash', f['format_id']):
f['filesize'] = int_or_none(self._search_regex(
r'/clen/(\d+)', f.get('fragment_base_url') or f['url'], 'file size', default=None))
@@ -3296,6 +3301,7 @@ def process_manifest_format(f, proto, itag):
f['is_from_start'] = True
yield f
+ yield subtitles
def _extract_storyboard(self, player_responses, duration):
spec = get_first(
@@ -3353,9 +3359,9 @@ def _list_formats(self, video_id, microformats, video_details, player_responses,
is_live = get_first(live_broadcast_details, 'isLiveNow')
streaming_data = traverse_obj(player_responses, (..., 'streamingData'), default=[])
- formats = list(self._extract_formats(streaming_data, video_id, player_url, is_live, duration))
+ *formats, subtitles = self._extract_formats_and_subtitles(streaming_data, video_id, player_url, is_live)
- return live_broadcast_details, is_live, streaming_data, formats
+ return live_broadcast_details, is_live, streaming_data, formats, subtitles
def _real_extract(self, url):
url, smuggled_data = unsmuggle_url(url, {})
@@ -3446,8 +3452,8 @@ def feed_entry(name):
'The livestream has not finished processing. Only 4 hours of the video can be currently downloaded. '
'This is a known issue and patches are welcome')
- live_broadcast_details, is_live, streaming_data, formats = self._list_formats(
- video_id, microformats, video_details, player_responses, player_url, duration)
+ live_broadcast_details, is_live, streaming_data, formats, automatic_captions = \
+ self._list_formats(video_id, microformats, video_details, player_responses, player_url)
if not formats:
if not self.get_param('allow_unplayable_formats') and traverse_obj(streaming_data, (..., 'licenseInfos')):
@@ -3577,6 +3583,7 @@ def feed_entry(name):
'release_timestamp': live_start_time,
}
+ subtitles = {}
pctr = traverse_obj(player_responses, (..., 'captions', 'playerCaptionsTracklistRenderer'), expected_type=dict)
if pctr:
def get_lang_code(track):
@@ -3603,7 +3610,6 @@ def process_language(container, base_url, lang_code, sub_name, query):
'name': sub_name,
})
- subtitles, automatic_captions = {}, {}
for lang_code, caption_track in captions.items():
base_url = caption_track.get('baseUrl')
orig_lang = parse_qs(base_url).get('lang', [None])[-1]
@@ -3634,8 +3640,9 @@ def process_language(container, base_url, lang_code, sub_name, query):
# Setting tlang=lang returns damaged subtitles.
process_language(automatic_captions, base_url, trans_code, trans_name,
{} if orig_lang == orig_trans_code else {'tlang': trans_code})
- info['automatic_captions'] = automatic_captions
- info['subtitles'] = subtitles
+
+ info['automatic_captions'] = automatic_captions
+ info['subtitles'] = subtitles
parsed_url = compat_urllib_parse_urlparse(url)
for component in [parsed_url.fragment, parsed_url.query]:
This extracts the subtitles, but our webvtt parser is unable to process the subtitle fragments.
cc @fstirlitz
The bit of code that you shared — would that be in the new release, and if so, when will you be releasing it? As long as the subtitles are extracted properly —
No, this code is useless without fixing the parser
No, this code is useless without fixing the parser
Oh, so that's why you called the attention to the other gentleman, perhaps he might be able to fix the parser end of things. Thank you for responding so quickly to the issue. And I did see you did a new release. Just filed a wishlist bug, so hopefully by the week-end the new release will be in Debian sid and then testing -
YouTube uses obsolete metadata headers in its WebVTT files that were removed from the standard back in 2017 (Because following standards is for the hoi polloi and Google knows better, right? Yet another case of https://nitter.42l.fr/Rich_Harris/status/1220412711768666114.) This isn’t a problem for non-segmented subtitles, because we don’t parse them at all.
I guess we can pass them through as-is or strip them. There is a Kind
header we may be interested in, which can probably differentiate between (transcribed) captions and (translated) subtitles.
YouTube uses obsolete metadata headers in its WebVTT files that were removed from the standard back in 2017 (Because following standards is for the hoi polloi and Google knows better, right? Yet another case of https://nitter.42l.fr/Rich_Harris/status/1220412711768666114.) This isn’t a problem for non-segmented subtitles, because we don’t parse them at all.
I guess we can pass them through as-is or strip them. There is a
Kind
header we may be interested in, which can probably differentiate between (transcribed) captions and (translated) subtitles.
Is there a bug report or something that tells what Google's plans are? Looking forward to the header (or whatever can be implemented).
@pukkandan any update on this ???
I actually have a ready patch for this in my drafts:
--- a/yt_dlp/webvtt.py
+++ b/yt_dlp/webvtt.py
@@ -161,6 +161,15 @@ class Magic(HeaderBlock):
_REGEX_TSMAP_MPEGTS = re.compile(r'MPEGTS:([0-9]+)')
_REGEX_TSMAP_SEP = re.compile(r'[ \t]*,[ \t]*')
+ # This was removed from the spec in the 2017 revision;
+ # the last spec draft to describe this syntax element is
+ # <https://www.w3.org/TR/2015/WD-webvtt1-20151208/#webvtt-metadata-header>.
+ # Nevertheless, YouTube keeps serving those, because Google
+ # knows better than those pesky standards bodies, right?
+
+ _REGEX_META = re.compile(
+ r'(?:(?!-->)[^\r\n])+:(?:(?!-->)[^\r\n])+(?:\r\n|[\r\n])')
+
@classmethod
def __parse_tsmap(cls, parser):
parser = parser.child()
@@ -200,13 +209,19 @@ def parse(cls, parser):
raise ParseError(parser)
extra = m.group(1)
+ meta = ''
local, mpegts = None, None
- if parser.consume(cls._REGEX_TSMAP):
- local, mpegts = cls.__parse_tsmap(parser)
- if not parser.consume(_REGEX_NL):
+ while not parser.consume(_REGEX_NL):
+ if parser.consume(cls._REGEX_TSMAP):
+ local, mpegts = cls.__parse_tsmap(parser)
+ continue
+ m = parser.consume(cls._REGEX_META)
+ if m:
+ meta += m.group(0)
+ continue
raise ParseError(parser)
parser.commit()
- return cls(extra=extra, mpegts=mpegts, local=local)
+ return cls(extra=extra, mpegts=mpegts, local=local, meta=meta)
def write_into(self, stream):
stream.write('WEBVTT')
@@ -219,6 +234,8 @@ def write_into(self, stream):
stream.write(',MPEGTS:')
stream.write(str(self.mpegts if self.mpegts is not None else 0))
stream.write('\n')
+ if self.meta:
+ stream.write(self.meta)
stream.write('\n')
Perhaps the headers could be parsed more meaningfully (e.g. into an ordered multi-map), but this is enough for them not to generate errors and to be preserved in the final file.
Do you want me to merge the patch directly, or do you want to make a PR?
Too busy currently to make a PR. Also, I don’t remember the last time I actually exercised this code, so consider it untested.
No worries. I'll test and merge this directly
nice, looking forward to seeing this :)
Eh, of course I got it wrong. The way I wrote it at first, `X-TIMESTAMP-MAP` would be stuffed into `.meta` and never captured by `__parse_tsmap`. See the corrected version.
Checklist
Description
Unable to get subtitles from specific YT videos. Sharing an example: the warning only comes at the very end, and it doesn't tell what the issue is :(
Verbose log