ytdl-org / youtube-dl

Command-line program to download videos from YouTube.com and other video sites
http://ytdl-org.github.io/youtube-dl/
The Unlicense
131.38k stars 9.96k forks source link

Jove download #30067

Open nmarty195 opened 2 years ago

nmarty195 commented 2 years ago

JoVE videos won't download. I tried several different URL structures, but youtube-dl told me to report the problem here.

[debug] System config: []
[debug] User config: []
[debug] Custom config: []
[debug] Command-line args: ['--verbose', 'https://www.jove.com/video/61339']
[debug] Encodings: locale cp1252, fs utf-8, out utf-8, pref cp1252
[debug] youtube-dl version 2021.06.06
[debug] Python version 3.9.7 (CPython) - Windows-10-10.0.19043-SP0
[debug] exe versions: none
[debug] Proxy map: {}
[Jove] 61339: Downloading webpage
ERROR: Unable to extract chapters id; please report this issue on https://yt-dl.org/bug . Make sure you are using the latest version; see https://yt-dl.org/update on how to update. Be sure to call youtube-dl with the --verbose flag and include its complete output.
Traceback (most recent call last):
  File "C:\Users\nicod\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\youtube_dl\YoutubeDL.py", line 815, in wrapper
    return func(self, *args, **kwargs)
  File "C:\Users\nicod\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\youtube_dl\YoutubeDL.py", line 836, in __extract_info
    ie_result = ie.extract(url)
  File "C:\Users\nicod\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\youtube_dl\extractor\common.py", line 534, in extract
    ie_result = self._real_extract(url)
  File "C:\Users\nicod\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\youtube_dl\extractor\jove.py", line 49, in _real_extract
    chapters_id = self._html_search_regex(
  File "C:\Users\nicod\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\youtube_dl\extractor\common.py", line 1021, in _html_search_regex
    res = self._search_regex(pattern, string, name, default, fatal, flags, group)
  File "C:\Users\nicod\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\youtube_dl\extractor\common.py", line 1012, in _search_regex
    raise RegexNotFoundError('Unable to extract %s' % _name)
youtube_dl.utils.RegexNotFoundError: Unable to extract chapters id; please report this issue on https://yt-dl.org/bug . Make sure you are using the latest version; see https://yt-dl.org/update on how to update. Be sure to call youtube-dl with the --verbose flag and include its complete output.

MehdiMJ1 commented 2 years ago

I have the same issue with Facebook videos on the "fb.watch" domain.

dirkf commented 2 years ago

@MehdiMJ1, if your problem isn't covered by an open issue, please open a separate issue (following the template provided): the Jove extractor isn't relevant to Facebook videos.

dirkf commented 2 years ago

The JoVE URL redirects to https://www.jove.com/t/61339/... but that page doesn't contain what the extractor is expecting to find.

If the extractor looks at https://www.jove.com/v/61339/... instead, it finds a video URL but then crashes because the page has no comment count. In addition, the "video URL" is just a UUID that has to be appended to https://ljsp.lwcdn.com/web/public/native/config/media/ to fetch the JSON containing the actual video links. When the extractor is tweaked to handle the page, it just gets DRM playlist URLs :-(((. E.g., the M3U8 URL uses Sample-AES.

The patch below allows this unhelpful result. Perhaps other pages have videos that aren't DRM protected.

--- a/youtube_dl/extractor/jove.py
+++ b/youtube_dl/extractor/jove.py
@@ -5,13 +5,23 @@
 from .common import InfoExtractor
 from ..utils import (
     ExtractorError,
+    determine_ext,
+    int_or_none,
+    mimetype2ext,
+    parse_duration,
+    try_get,
-    unified_strdate
+    unified_strdate,
+    unified_timestamp,
+    url_or_none,
+    urljoin,
 )
+from ..compat import compat_str

 class JoveIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?jove\.com/video/(?P<id>[0-9]+)'
+    _VALID_URL = r'https?://(?:www\.)?jove\.com/(?:(?P<video>video|t)|v)/(?P<id>[0-9]+)'
     _CHAPTERS_URL = 'http://www.jove.com/video-chapters?videoid={video_id:}'
+    _VIDEO_URL_BASE ='https://ljsp.lwcdn.com/web/public/native/config/media/'
     _TESTS = [
         {
             'url': 'http://www.jove.com/video/2744/electrode-positioning-montage-transcranial-direct-current',
@@ -40,10 +50,36 @@

     ]

+    def _parse_formats(self, sources_data, video_id=None,
+                       m3u8_id=None, mpd_id=None, base_url=None):
+        urls = []
+        formats = []
+        for source in sources_data:
+            if not isinstance(source, dict):
+                continue
+            source_url = urljoin(
+                base_url, self._proto_relative_url(source.get('src')))
+            if not source_url or source_url in urls:
+                continue
+            urls.append(source_url)
+            source_type = source.get('type') or ''
+            ext = mimetype2ext(source_type) or determine_ext(source_url)
+            if source_type == 'hls' or ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    source_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                    m3u8_id=m3u8_id, fatal=False))
+            elif source_type == 'dash' or ext == 'mpd':
+                formats.extend(self._extract_mpd_formats(
+                    source_url, video_id, mpd_id=mpd_id, fatal=False))
+        return formats
+
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group('id')

+        url_type = mobj.group('video')
+        if url_type:
+            url = url.replace('/%s/' % url_type, '/v/')
         webpage = self._download_webpage(url, video_id)

         chapters_id = self._html_search_regex(
@@ -51,7 +87,7 @@

         chapters_xml = self._download_xml(
             self._CHAPTERS_URL.format(video_id=chapters_id),
-            video_id, note='Downloading chapters XML',
+            chapters_id, note='Downloading chapters XML',
             errnote='Failed to download chapters XML')

         video_url = chapters_xml.attrib.get('video')
@@ -60,21 +96,40 @@

         title = self._html_search_meta('citation_title', webpage, 'title')
         thumbnail = self._og_search_thumbnail(webpage)
-        description = self._html_search_regex(
-            r'<div id="section_body_summary"><p class="jove_content">(.+?)</p>',
-            webpage, 'description', fatal=False)
+        description = (
+            self._html_search_meta('citation_abstract', webpage, 'description', default=None)
+            or self._html_search_regex(
+                r'<div id="section_body_summary"><p class="jove_content">(.+?)</p>',
+                webpage, 'description', fatal=False))
         publish_date = unified_strdate(self._html_search_meta(
             'citation_publication_date', webpage, 'publish date', fatal=False))
-        comment_count = int(self._html_search_regex(
+        comment_count = int_or_none(self._html_search_regex(
             r'<meta name="num_comments" content="(\d+) Comments?"',
-            webpage, 'comment count', fatal=False))
-
-        return {
-            'id': video_id,
+            webpage, 'comment count', default=None))
+        vidobj = self._search_json_ld(webpage, chapters_id, 'VideoObject', default={})
+        info = {
+            'id': chapters_id,
+            'display_id': video_id,
             'title': title,
-            'url': video_url,
             'thumbnail': thumbnail,
             'description': description,
             'upload_date': publish_date,
             'comment_count': comment_count,
+            'timestamp': unified_timestamp(vidobj.get('uploadDate', publish_date)),
+            'duration': parse_duration(vidobj.get('duration')),
+            'view_count': int_or_none(vidobj.get('interactionCount')),
         }
+        formats = []
+        if not url_or_none(video_url):
+            video_url = urljoin(self._VIDEO_URL_BASE, video_url)
+            sources = try_get(self._download_json(video_url, chapters_id,
+                                                  note='Downloading JSON source',
+                                                  # errnote=False,  # would silence warnings
+                                                  fatal=False),
+                              lambda x: x['src'], list) or []                  
+            formats = self._parse_formats(sources, chapters_id, base_url=video_url)
+        if formats:
+            info['formats'] = formats
+        else:    
+            info['url'] = video_url
+        return info
nmarty195 commented 2 years ago

Thanks for looking into this :). It's too bad it can't be fixed, but there are still screen cams!