Open brunoVanDame opened 2 years ago
The video metadata is no longer where the extractor expected.
This patch (to the git master or the release) bypasses the crash and allows the media to be fetched without the missing metadata, while we go looking for it:
--- old/youtube_dl/extractor/imdb.py
+++ new/youtube_dl/extractor/imdb.py
@@ -1,3 +1,4 @@
+# coding: utf-8
from __future__ import unicode_literals
import base64
@@ -90,7 +91,7 @@
'https://www.imdb.com/video/vi' + video_id, video_id)
video_metadata = self._parse_json(self._search_regex(
r'args\.push\(\s*({.+?})\s*\)\s*;', webpage,
- 'video metadata'), video_id)
+ 'video metadata', fatal=False) or '{}', video_id)
video_info = video_metadata.get('VIDEO_INFO')
if video_info and isinstance(video_info, dict):
@@ -106,12 +107,13 @@
return {
'id': video_id,
+
'title': title,
'alt_title': info.get('videoSubTitle'),
'formats': formats,
'description': info.get('videoDescription'),
'thumbnail': url_or_none(try_get(
- video_metadata, lambda x: x['videoSlate']['source'])),
+ info, lambda x: x['videoSlate']['source'])),
'duration': parse_duration(info.get('videoRuntime')),
}
$ python -m youtube_dl -v -F --ignore-config 'https://www.imdb.com/video/vi1705771289'
[debug] System config: []
[debug] User config: []
[debug] Custom config: []
[debug] Command-line args: [u'-v', u'-F', u'--ignore-config', u'https://www.imdb.com/video/vi1705771289']
[debug] Encodings: locale UTF-8, fs UTF-8, out UTF-8, pref UTF-8
[debug] youtube-dl version 2021.12.17
[debug] Git HEAD: 871645a4a
[debug] Python version 2.7.17 (CPython) - Linux-4.4.0-210-generic-i686-with-Ubuntu-16.04-xenial
[debug] exe versions: avconv 4.3, avprobe 4.3, ffmpeg 4.3, ffprobe 4.3
[debug] Proxy map: {}
[imdb] 1705771289: Downloading JSON metadata
[imdb] 1705771289: Downloading m3u8 information
[imdb] 1705771289: Downloading webpage
WARNING: unable to extract video metadata; please report this issue on https://yt-dl.org/bug . Make sure you are using the latest version; see https://yt-dl.org/update on how to update. Be sure to call youtube-dl with the --verbose flag and include its complete output.
[info] Available formats for 1705771289:
format code extension resolution note
SD mp4 unknown
480p mp4 unknown
hls-342 mp4 240x180 342k , avc1.4d001e, mp4a.40.29
hls-522 mp4 360x270 522k , avc1.4d001e, mp4a.40.5
hls-763 mp4 384x288 763k , avc1.4d001e, mp4a.40.5
hls-1090 mp4 480x360 1090k , avc1.4d001e, mp4a.40.5
hls-1589 mp4 480x360 1589k , avc1.4d001e, mp4a.40.5
hls-2313 mp4 528x396 2313k , avc1.64001e, mp4a.40.5
hls-3492 mp4 640x480 3492k , avc1.64001f, mp4a.40.2
hls-5656 mp4 640x480 5656k , avc1.64001f, mp4a.40.2
hls-9077 mp4 640x480 9077k , avc1.640028, mp4a.40.2 (best)
$
And this actually gets the metadata from the Next.js-ified hydration JSON:
--- old/youtube_dl/extractor/imdb.py
+++ new/youtube_dl/extractor/imdb.py
@@ -1,3 +1,4 @@
+# coding: utf-8
from __future__ import unicode_literals
import base64
@@ -5,11 +6,14 @@
import re
from .common import InfoExtractor
+from ..compat import compat_str
from ..utils import (
determine_ext,
+ get_element_by_id,
mimetype2ext,
parse_duration,
qualities,
+ str_or_none,
try_get,
url_or_none,
)
@@ -25,7 +29,7 @@
'info_dict': {
'id': '2524815897',
'ext': 'mp4',
- 'title': 'No. 2',
+ 'title': 'Ice Age 4: Continental Drift',
'description': 'md5:87bd0bdc61e351f21f20d2d7441cb4e7',
'duration': 152,
}
@@ -49,6 +53,34 @@
'only_matching': True,
}]
+ def _extract_formats(self, fmt_list, video_id):
+ if not isinstance(fmt_list, (list, tuple)):
+ return []
+ quality = qualities(('SD', '480p', '720p', '1080p'))
+ formats = []
+ for encoding in fmt_list:
+ video_url = url_or_none(try_get(encoding, lambda x: x['url']))
+ if not video_url:
+ continue
+ ext = mimetype2ext(encoding.get(
+ 'mimeType')) or determine_ext(video_url)
+ if ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ video_url, video_id, 'mp4', entry_protocol='m3u8_native',
+ preference=1, m3u8_id='hls', fatal=False))
+ continue
+ format_id = (
+ try_get(encoding, lambda x: x['displayName']['value'], compat_str)
+ or encoding.get('definition'))
+ formats.append({
+ 'format_id': format_id,
+ 'url': video_url,
+ 'ext': ext,
+ 'quality': quality(format_id),
+ 'language': encoding.get('language'),
+ })
+ return formats
+
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -62,57 +94,70 @@
}).encode()).decode(),
})[0]
- quality = qualities(('SD', '480p', '720p', '1080p'))
- formats = []
- for encoding in data['videoLegacyEncodings']:
- if not encoding or not isinstance(encoding, dict):
- continue
- video_url = url_or_none(encoding.get('url'))
- if not video_url:
- continue
- ext = mimetype2ext(encoding.get(
- 'mimeType')) or determine_ext(video_url)
- if ext == 'm3u8':
- formats.extend(self._extract_m3u8_formats(
- video_url, video_id, 'mp4', entry_protocol='m3u8_native',
- preference=1, m3u8_id='hls', fatal=False))
- continue
- format_id = encoding.get('definition')
- formats.append({
- 'format_id': format_id,
- 'url': video_url,
- 'ext': ext,
- 'quality': quality(format_id),
- })
- self._sort_formats(formats)
+ formats = self._extract_formats(try_get(data, lambda x: x['videoLegacyEncodings'], list), video_id)
webpage = self._download_webpage(
'https://www.imdb.com/video/vi' + video_id, video_id)
- video_metadata = self._parse_json(self._search_regex(
- r'args\.push\(\s*({.+?})\s*\)\s*;', webpage,
- 'video metadata'), video_id)
+ video_metadata = try_get(
+ self._parse_json(get_element_by_id('__NEXT_DATA__', webpage), video_id, fatal=False),
+ lambda x: x['props']['pageProps']['videoPlaybackData']['video'],
+ dict)
+ if video_metadata:
- video_info = video_metadata.get('VIDEO_INFO')
- if video_info and isinstance(video_info, dict):
+ def get_value(data, prop, value_name='value', expected_type=compat_str):
+ return try_get(data, lambda x: x[prop][value_name], expected_type)
+
+ title = try_get(video_metadata,
+ lambda x: x['primaryTitle']['titleText']['text'],
+ compat_str)
+ alt_title = get_value(video_metadata, 'name')
+ if not title:
+ title = alt_title
+ if title == alt_title:
+ alt_title = None
+ if not alt_title:
+ alt_title = try_get(video_metadata,
+ lambda x: x['primaryTitle']['originalTitleText']['text'],
+ compat_str)
+ if title == alt_title:
+ alt_title = None
+ description = get_value(video_metadata, 'description')
+
+ formats.extend(self._extract_formats(video_metadata.get('playbackURLs'), video_id))
+
+ thumbnail = url_or_none(get_value(video_metadata, 'thumbnail', value_name='url'))
+ duration = get_value(video_metadata, 'runtime', expected_type=int)
+
+ else:
+ video_metadata = self._parse_json(self._search_regex(
+ r'args\.push\(\s*({.+?})\s*\)\s*;', webpage,
+ 'video metadata', fatal=False) or '{}', video_id)
+
+ video_info = try_get(video_metadata, lambda x: x['VIDEO_INFO'], dict) or {}
info = try_get(
- video_info, lambda x: x[list(video_info.keys())[0]][0], dict)
- else:
- info = {}
+ video_info, lambda x: x[list(video_info.keys())[0]][0], dict) or {}
+ title = info.get('videoTitle')
+ alt_title = info.get('videoSubTitle')
+ description = info.get('videoDescription')
+ thumbnail = url_or_none(try_get(
+ info, lambda x: x['videoSlate']['source']))
+ duration = parse_duration(info.get('videoRuntime'))
- title = self._html_search_meta(
- ['og:title', 'twitter:title'], webpage) or self._html_search_regex(
- r'<title>(.+?)</title>', webpage, 'title',
- default=None) or info['videoTitle']
+ if not title:
+ title = (
+ self._html_search_meta(('og:title', 'twitter:title'), webpage)
+ or self._html_search_regex(r'(?s)<title\b[^>]*>(.+?)</title>', webpage, 'title'))
+
+ self._sort_formats(formats)
return {
'id': video_id,
'title': title,
- 'alt_title': info.get('videoSubTitle'),
+ 'alt_title': alt_title,
'formats': formats,
- 'description': info.get('videoDescription'),
- 'thumbnail': url_or_none(try_get(
- video_metadata, lambda x: x['videoSlate']['source'])),
- 'duration': parse_duration(info.get('videoRuntime')),
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
}
Hello,
Thank you very much for the patch. However, I do not know how to apply it; my plan was to do it manually.
In the imdb.py file I have, however, I cannot find the lines that need to be changed. It seems to be a different version.
I have attached my imdb.py file. Could you please adapt it?
Is that possible?
Thanks,
Bruno
On 4/6/22 13:56, dirkf wrote:
The video metadata is no longer where the extractor expected.
This patch (to the git master or the release) bypasses the crash and allows the media to be fetched without the missing metadata, while we go looking for it:
--- old/youtube_dl/extractor/imdb.py +++ new/youtube_dl/extractor/imdb.py @@-1,3 +1,4 @@ +# coding: utf-8 from future import unicode_literals
import base64 @@ -90,7 +91,7 @@ 'https://www.imdb.com/video/vi' + video_id, video_id) video_metadata = self._parse_json(self._search_regex( r'args.push(\s({.+?})\s)\s*;', webpage,
- 'video metadata'), video_id)
'video metadata',fatal=False)or '{}',video_id)
video_info = video_metadata.get('VIDEO_INFO') if video_info and isinstance(video_info,dict):
@@ -106,12 +107,13 @@
return { 'id': video_id, + 'title': title, 'alt_title': info.get('videoSubTitle'), 'formats': formats, 'description': info.get('videoDescription'), 'thumbnail': url_or_none(try_get(
- video_metadata, lambda x: x['videoSlate']['source'])),
- info, lambda x: x['videoSlate']['source'])), 'duration': parse_duration(info.get('videoRuntime')), }
$python -m youtube_dl -v -F --ignore-config 'https://www.imdb.com/video/vi1705771289' [debug] System config: [] [debug] User config: [] [debug] Custom config: [] [debug] Command-line args: [u'-v', u'-F', u'--ignore-config', u'https://www.imdb.com/video/vi1705771289'] [debug] Encodings: locale UTF-8, fs UTF-8, out UTF-8, pref UTF-8 [debug] youtube-dl version 2021.12.17 [debug] Git HEAD: 871645a4a [debug] Python version 2.7.17 (CPython) - Linux-4.4.0-210-generic-i686-with-Ubuntu-16.04-xenial [debug] exe versions: avconv 4.3, avprobe 4.3, ffmpeg 4.3, ffprobe 4.3 [debug] Proxy map: {} [imdb] 1705771289: Downloading JSON metadata [imdb] 1705771289: Downloading m3u8 information [imdb] 1705771289: Downloading webpage WARNING: unable to extract video metadata; please report this issue on https://yt-dl.org/bug . Make sure you are using the latest version; see https://yt-dl.org/update on how to update. Be sure to call youtube-dl with the --verbose flag and include its complete output. [info] Available formats for 1705771289: format code extension resolution note SD mp4 unknown 480p mp4 unknown hls-342 mp4 240x180 342k , avc1.4d001e, mp4a.40.29 hls-522 mp4 360x270 522k , avc1.4d001e, mp4a.40.5 hls-763 mp4 384x288 763k , avc1.4d001e, mp4a.40.5 hls-1090 mp4 480x360 1090k , avc1.4d001e, mp4a.40.5 hls-1589 mp4 480x360 1589k , avc1.4d001e, mp4a.40.5 hls-2313 mp4 528x396 2313k , avc1.64001e, mp4a.40.5 hls-3492 mp4 640x480 3492k , avc1.64001f, mp4a.40.2 hls-5656 mp4 640x480 5656k , avc1.64001f, mp4a.40.2 hls-9077 mp4 640x480 9077k , avc1.640028, mp4a.40.2 (best) $
— Reply to this email directly, view it on GitHub https://github.com/ytdl-org/youtube-dl/issues/30824#issuecomment-1090182847, or unsubscribe https://github.com/notifications/unsubscribe-auth/ASC5G4R5STKDHZDN3UOJXJ3VDV3WRANCNFSM5SU2ROMQ. You are receiving this because you authored the thread.Message ID: @.***>
Here's my imdb.py :
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    mimetype2ext,
    qualities,
    remove_end,
)
class ImdbIE(InfoExtractor):
IE_NAME = 'imdb'
IE_DESC = 'Internet Movie Database trailers'
_VALID_URL = r'https?://(?:www|m).imdb.com/(?:video|title).+?[/-]vi(?P
_TESTS = [{
'url': 'http://www.imdb.com/video/imdb/vi2524815897',
'info_dict': {
'id': '2524815897',
'ext': 'mp4',
'title': 'Ice Age: Continental Drift Trailer (No. 2)',
'description': 'md5:9061c2219254e5d14e03c25c98e96a81',
}
}, {
'url': 'http://www.imdb.com/video/_/vi2524815897',
'only_matching': True,
}, {
'url': 'http://www.imdb.com/title/tt1667889/?ref_=ext_shr_eml_vi#lb-vi2524815897',
'only_matching': True,
}, {
'url': 'http://www.imdb.com/title/tt1667889/#lb-vi2524815897',
'only_matching': True,
}, {
'url': 'http://www.imdb.com/videoplayer/vi1562949145',
'only_matching': True,
}, {
'url': 'http://www.imdb.com/title/tt4218696/videoplayer/vi2608641561',
'only_matching': True,
}]
def _real_extract(self, url):
    """Extract the formats and metadata of a single IMDb video.

    Downloads the video page (used for the title and description), the
    player page, and every additional per-format page linked from the
    player page. One format entry is collected from each page that
    carries a usable video URL.

    Returns an info dict with the keys ``id``, ``title``, ``formats``,
    ``description`` and ``thumbnail``.
    """
    video_id = self._match_id(url)
    webpage = self._download_webpage('http://www.imdb.com/video/imdb/vi%s' % video_id, video_id)
    descr = self._html_search_regex(
        r'(?s)<span itemprop="description">(.*?)</span>',
        webpage, 'description', fatal=False)
    player_url = 'http://www.imdb.com/video/imdb/vi%s/imdb/single' % video_id
    player_page = self._download_webpage(
        player_url, video_id, 'Downloading player page')
    # the player page contains the info for the default format, we have to
    # fetch other pages for the rest of the formats
    extra_formats = re.findall(r'href="(?P<url>%s.*?)".*?>(?P<name>.*?)<' % re.escape(player_url), player_page)
    format_pages = [
        self._download_webpage(
            f_url, video_id, 'Downloading info for %s format' % f_name)
        for f_url, f_name in extra_formats]
    format_pages.append(player_page)

    quality = qualities(('SD', '480p', '720p', '1080p'))
    formats = []
    # Tracked explicitly and bound before the loop: the loop may skip every
    # page before any per-page video data is read, and the result dict must
    # still be buildable in that case.
    thumbnail = None
    for format_page in format_pages:
        json_data = self._search_regex(
            r'<script[^>]+class="imdb-player-data"[^>]*?>(.*?)</script>',
            format_page, 'json data', flags=re.DOTALL)
        info = self._parse_json(json_data, video_id, fatal=False)
        if not info:
            continue
        format_info = info.get('videoPlayerObject', {}).get('video', {})
        if not format_info:
            continue
        # Keep the most recent slate seen; a page without one does not
        # discard a thumbnail found on an earlier page.
        thumbnail = format_info.get('slate') or thumbnail
        video_info_list = format_info.get('videoInfoList')
        if not video_info_list or not isinstance(video_info_list, list):
            continue
        video_info = video_info_list[0]
        if not video_info or not isinstance(video_info, dict):
            continue
        video_url = video_info.get('videoUrl')
        if not video_url:
            continue
        format_id = format_info.get('ffname')
        formats.append({
            'format_id': format_id,
            'url': video_url,
            'ext': mimetype2ext(video_info.get('videoMimeType')),
            'quality': quality(format_id),
        })
    self._sort_formats(formats)

    return {
        'id': video_id,
        'title': remove_end(self._og_search_title(webpage), ' - IMDb'),
        'formats': formats,
        'description': descr,
        'thumbnail': thumbnail,
    }
class ImdbListIE(InfoExtractor):
IE_NAME = 'imdb:list'
IE_DESC = 'Internet Movie Database lists'
_VALIDURL = r'https?://(?:www.)?imdb.com/list/(?P
def _real_extract(self, url):
    """Build a playlist from the video links found on an IMDb list page."""
    playlist_id = self._match_id(url)
    page = self._download_webpage(url, playlist_id)

    # Playlist items are anchors to /video/imdb/vi... followed by a
    # data-type="playlist" attribute.
    video_paths = re.findall(
        r'href="(/video/imdb/vi[^"]+)"\s+data-type="playlist"', page)
    results = []
    for path in video_paths:
        results.append(self.url_result('http://www.imdb.com' + path, 'Imdb'))

    title = self._html_search_regex(
        r'<h1 class="header">(.*?)</h1>', page, 'list title')
    return self.playlist_result(results, playlist_id, title)
I'll post a PR from which you can pull the entire extractor file in due course.
Damn, my stupid mistake. I have little experience with Python. I was looking at the wrong imdb.py file; I had two versions on my machine. So, for now, it's OK.
Just writing for others with similar problems: I solved it by updating. With pip, the command is `pip install --upgrade yt-dlp`.
Checklist
Verbose log
Description
My command line is this : /usr/local/bin/youtube-dl -v https://www.imdb.com/video/vi1705771289