Open nmarty195 opened 2 years ago
I have the same issue with Facebook videos on the "fb.watch" domain.
@MehdiMJ1, if your problem isn't covered by an open issue, please open a separate issue (following the template provided): the Jove extractor isn't relevant to Facebook videos.
The JoVE URL redirects to https://www.jove.com/t/61339/... but that page doesn't contain what the extractor is expecting to find.
If the extractor looks at https://www.jove.com/v/61339/... instead, it finds a video URL but crashes because there's no comment count. In addition, the video URL is just a UUID that has to be appended to https://ljsp.lwcdn.com/web/public/native/config/media/ to fetch the JSON containing the video links. When the extractor is tweaked to handle the page, it just gets DRM-protected playlist URLs :-(((. E.g., the M3U8 URL uses SAMPLE-AES.
The patch below lets the extractor get as far as this unhelpful result. Perhaps other pages have videos that aren't DRM-protected.
--- a/youtube_dl/extractor/jove.py
+++ b/youtube_dl/extractor/jove.py
@@ -5,13 +5,23 @@
from .common import InfoExtractor
from ..utils import (
ExtractorError,
+ determine_ext,
+ int_or_none,
+ mimetype2ext,
+ parse_duration,
+ try_get,
- unified_strdate
+ unified_strdate,
+ unified_timestamp,
+ url_or_none,
+ urljoin,
)
+from ..compat import compat_str
class JoveIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?jove\.com/video/(?P<id>[0-9]+)'
+ _VALID_URL = r'https?://(?:www\.)?jove\.com/(?:(?P<video>video|t)|v)/(?P<id>[0-9]+)'
_CHAPTERS_URL = 'http://www.jove.com/video-chapters?videoid={video_id:}'
+ _VIDEO_URL_BASE ='https://ljsp.lwcdn.com/web/public/native/config/media/'
_TESTS = [
{
'url': 'http://www.jove.com/video/2744/electrode-positioning-montage-transcranial-direct-current',
@@ -40,10 +50,36 @@
]
+ def _parse_formats(self, sources_data, video_id=None,
+ m3u8_id=None, mpd_id=None, base_url=None):
+ urls = []
+ formats = []
+ for source in sources_data:
+ if not isinstance(source, dict):
+ continue
+ source_url = urljoin(
+ base_url, self._proto_relative_url(source.get('src')))
+ if not source_url or source_url in urls:
+ continue
+ urls.append(source_url)
+ source_type = source.get('type') or ''
+ ext = mimetype2ext(source_type) or determine_ext(source_url)
+ if source_type == 'hls' or ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ source_url, video_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id=m3u8_id, fatal=False))
+ elif source_type == 'dash' or ext == 'mpd':
+ formats.extend(self._extract_mpd_formats(
+ source_url, video_id, mpd_id=mpd_id, fatal=False))
+ return formats
+
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
+ url_type = mobj.group('video')
+ if url_type:
+ url = url.replace('/%s/' % url_type, '/v/')
webpage = self._download_webpage(url, video_id)
chapters_id = self._html_search_regex(
@@ -51,7 +87,7 @@
chapters_xml = self._download_xml(
self._CHAPTERS_URL.format(video_id=chapters_id),
- video_id, note='Downloading chapters XML',
+ chapters_id, note='Downloading chapters XML',
errnote='Failed to download chapters XML')
video_url = chapters_xml.attrib.get('video')
@@ -60,21 +96,40 @@
title = self._html_search_meta('citation_title', webpage, 'title')
thumbnail = self._og_search_thumbnail(webpage)
- description = self._html_search_regex(
- r'<div id="section_body_summary"><p class="jove_content">(.+?)</p>',
- webpage, 'description', fatal=False)
+ description = (
+ self._html_search_meta('citation_abstract', webpage, 'description', default=None)
+ or self._html_search_regex(
+ r'<div id="section_body_summary"><p class="jove_content">(.+?)</p>',
+ webpage, 'description', fatal=False))
publish_date = unified_strdate(self._html_search_meta(
'citation_publication_date', webpage, 'publish date', fatal=False))
- comment_count = int(self._html_search_regex(
+ comment_count = int_or_none(self._html_search_regex(
r'<meta name="num_comments" content="(\d+) Comments?"',
- webpage, 'comment count', fatal=False))
-
- return {
- 'id': video_id,
+ webpage, 'comment count', default=None))
+ vidobj = self._search_json_ld(webpage, chapters_id, 'VideoObject', default={})
+ info = {
+ 'id': chapters_id,
+ 'display_id': video_id,
'title': title,
- 'url': video_url,
'thumbnail': thumbnail,
'description': description,
'upload_date': publish_date,
'comment_count': comment_count,
+ 'timestamp': unified_timestamp(vidobj.get('uploadDate', publish_date)),
+ 'duration': parse_duration(vidobj.get('duration')),
+ 'view_count': int_or_none(vidobj.get('interactionCount')),
}
+ formats = []
+ if not url_or_none(video_url):
+ video_url = urljoin(self._VIDEO_URL_BASE, video_url)
+ sources = try_get(self._download_json(video_url, chapters_id,
+ note='Downloading JSON source',
+ # errnote=False, # would silence warnings
+ fatal=False),
+ lambda x: x['src'], list) or []
+ formats = self._parse_formats(sources, chapters_id, base_url=video_url)
+ if formats:
+ info['formats'] = formats
+ else:
+ info['url'] = video_url
+ return info
Thanks for looking this up :). It's too bad it can't be fixed, but there are still screen cams!
Jove won't download. I tried with different URL structures, but it told me to report here.
[debug] System config: [] [debug] User config: [] [debug] Custom config: [] [debug] Command-line args: ['--verbose', 'https://www.jove.com/video/61339'] [debug] Encodings: locale cp1252, fs utf-8, out utf-8, pref cp1252 [debug] youtube-dl version 2021.06.06 [debug] Python version 3.9.7 (CPython) - Windows-10-10.0.19043-SP0 [debug] exe versions: none [debug] Proxy map: {} [Jove] 61339: Downloading webpage ERROR: Unable to extract chapters id; please report this issue on https://yt-dl.org/bug . Make sure you are using the latest version; see https://yt-dl.org/update on how to update. Be sure to call youtube-dl with the --verbose flag and include its complete output. Traceback (most recent call last): File "C:\Users\nicod\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\youtube_dl\YoutubeDL.py", line 815, in wrapper return func(self, *args, **kwargs) File "C:\Users\nicod\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\youtube_dl\YoutubeDL.py", line 836, in __extract_info ie_result = ie.extract(url) File "C:\Users\nicod\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\youtube_dl\extractor\common.py", line 534, in extract ie_result = self._real_extract(url) File "C:\Users\nicod\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\youtube_dl\extractor\jove.py", line 49, in _real_extract chapters_id = self._html_search_regex( File "C:\Users\nicod\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\youtube_dl\extractor\common.py", line 1021, in _html_search_regex res = self._search_regex(pattern, string, name, default, fatal, flags, group) File 
"C:\Users\nicod\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\youtube_dl\extractor\common.py", line 1012, in _search_regex raise RegexNotFoundError('Unable to extract %s' % _name) youtube_dl.utils.RegexNotFoundError: Unable to extract chapters id; please report this issue on https://yt-dl.org/bug . Make sure you are using the latest version; see https://yt-dl.org/update on how to update. Be sure to call youtube-dl with the --verbose flag and include its complete output.