Open philszalay opened 1 year ago
It would be, but apparently (I have no knowledge of the site beyond what is in the extractor code) the site has been reworked with Next.js, and the targets sought by the extractor, such as the JS variable `Playables` reported above, no longer exist.
This rewrite against the master branch passes the tests.
--- old/youtube_dl/extractor/beatport.py
+++ new/youtube_dl/extractor/beatport.py
@@ -1,23 +1,35 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
from ..compat import compat_str
-from ..utils import int_or_none
+from ..utils import (
+ determine_ext,
+ int_or_none,
+ join_nonempty,
+ merge_dicts,
+ parse_iso8601,
+ T,
+ traverse_obj,
+ txt_or_none,
+ unified_strdate,
+ url_or_none,
+ variadic,
+)
class BeatportIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.|pro\.)?beatport\.com/track/(?P<display_id>[^/]+)/(?P<id>[0-9]+)'
_TESTS = [{
'url': 'https://beatport.com/track/synesthesia-original-mix/5379371',
- 'md5': 'b3c34d8639a2f6a7f734382358478887',
+ 'md5': 'cfcc245aafcad52a837b2c5a60a472c9',
'info_dict': {
'id': '5379371',
'display_id': 'synesthesia-original-mix',
- 'ext': 'mp4',
+ 'ext': 'mp3',
'title': 'Froxic - Synesthesia (Original Mix)',
+ 'timestamp': 1397854513,
+ 'upload_date': '20140428',
},
}, {
'url': 'https://beatport.com/track/love-and-war-original-mix/3756896',
@@ -27,20 +39,86 @@
'display_id': 'love-and-war-original-mix',
'ext': 'mp3',
'title': 'Wolfgang Gartner - Love & War (Original Mix)',
+ 'timestamp': 1346195831,
+ 'upload_date': '20120917',
},
}, {
'url': 'https://beatport.com/track/birds-original-mix/4991738',
- 'md5': 'a1fd8e8046de3950fd039304c186c05f',
+ 'md5': '2dff00955b13c182931a708d979801b6',
'info_dict': {
'id': '4991738',
'display_id': 'birds-original-mix',
- 'ext': 'mp4',
+ 'ext': 'mp3',
'title': "Tos, Middle Milk, Mumblin' Johnsson - Birds (Original Mix)",
+ 'timestamp': 1386121876,
+ 'upload_date': '20131209',
}
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
+ track_id, display_id = mobj.group('id', 'display_id')
+
+ webpage = self._download_webpage(url, display_id)
+
+ next_data = self._search_nextjs_data(webpage, display_id, fatal=False)
+ if not next_data:
+ return self._old_real_extract(url)
+
+ track = traverse_obj(
+ next_data,
+ ('props', 'pageProps', lambda k, v: k == 'track' and v['id'] == int(track_id)),
+ get_all=False)
+
+ title = track['name']
+ artists = ', '.join(traverse_obj(track, ('artists', Ellipsis, 'name', T(txt_or_none)))) or None
+ title = join_nonempty(artists, title, delim=' - ')
+ title = join_nonempty(
+ title, traverse_obj(track, ('mix_name', T(lambda s: '(' + s + ')'))),
+ delim=' ')
+
+ formats = []
+ # next.js page has <= 1 sample URL
+ f_url = traverse_obj(track, ('sample_url', T(url_or_none)))
+ if f_url:
+ ext = determine_ext(f_url)
+ fmt = {
+ 'url': f_url,
+ 'ext': ext,
+ 'format_id': ext,
+ 'vcodec': 'none',
+ }
+ if ext == 'mp3':
+ fmt['preference'] = 0
+ fmt['acodec'] = 'mp3'
+ fmt['abr'] = 96
+ fmt['asr'] = 44100
+ elif ext == 'mp4':
+ fmt['preference'] = 1
+ fmt['acodec'] = 'aac'
+ fmt['abr'] = 96
+ fmt['asr'] = 44100
+ formats.append(fmt)
+ self._sort_formats(formats)
+
+ return merge_dicts({
+ 'id': track_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'formats': formats,
+ 'artists': artists,
+ }, traverse_obj(track, {
+ 'disc_number': ('catalog_number', T(int_or_none)),
+ 'timestamp': ('encoded_date', T(parse_iso8601)),
+ 'categories': ('genre', 'name', T(txt_or_none), T(variadic)),
+ 'thumbnail': ('image', 'uri', T(url_or_none)),
+ 'upload_date': (('new_release_date', 'publish_date'), T(unified_strdate)),
+ 'track_number': ('number', T(int_or_none)),
+ 'album': ('release', 'name', T(txt_or_none)),
+ }, get_all=False))
+
+ def _old_real_extract(self, url):
+ mobj = self._match_valid_url(url)
track_id = mobj.group('id')
display_id = mobj.group('display_id')
@@ -48,8 +126,8 @@
playables = self._parse_json(
self._search_regex(
- r'window\.Playables\s*=\s*({.+?});', webpage,
- 'playables info', flags=re.DOTALL),
+ r'(?s)window\.Playables\s*=\s*({.+?});', webpage,
+ 'playables info'),
track_id)
track = next(t for t in playables['tracks'] if t['id'] == int(track_id))
The page offers sample audio extracted from the full track available for purchase. Does the site offer full downloads with login or otherwise? Also, the old site offered AAC but the test URL that did so now only has the MP3 sample.
@dirkf thank you! I can confirm that it works now. When logged in and with a subscription it is possible to listen to the full tracks. Do you know if a download with a login is possible at the moment? If I provide `USERNAME` and `PASSWORD`, I still get the sample audio.
The extractor doesn't know how to log in using `--password ...`/`-p ...` etc. You could try passing `--cookies ...` from your logged-in browser session, but the patch above is clearly asking for the `sample_url`. Similarly, the old code only fetched the `preview` tracks.
If full tracks are available when logged in, it should be possible to extract them. A user with a login would have to analyse the data, or share the login details, or provide the output of `--write-pages` when logged-in cookies are supplied.
You need to install the master, or nightly, code. `join_nonempty()` is one of quite a few new utility functions added since 2021-12.
Checklist
Verbose log
Description
When trying to download a track from Beatport with the following command,
`youtube-dl --verbose https://www.beatport.com/track/dont-care/16624764`,
I get the error shown above. In my opinion this should be working without any problems.