diff options
Diffstat (limited to 'youtube_dl/extractor/jamendo.py')
| -rw-r--r-- | youtube_dl/extractor/jamendo.py | 170 | 
1 files changed, 103 insertions, 67 deletions
| diff --git a/youtube_dl/extractor/jamendo.py b/youtube_dl/extractor/jamendo.py index c21827618..12e21eb6f 100644 --- a/youtube_dl/extractor/jamendo.py +++ b/youtube_dl/extractor/jamendo.py @@ -1,38 +1,26 @@  # coding: utf-8  from __future__ import unicode_literals -import re +import hashlib +import random -from ..compat import compat_urlparse +from ..compat import compat_str  from .common import InfoExtractor -from ..utils import parse_duration - - -class JamendoBaseIE(InfoExtractor): -    def _extract_meta(self, webpage, fatal=True): -        title = self._og_search_title( -            webpage, default=None) or self._search_regex( -            r'<title>([^<]+)', webpage, -            'title', default=None) -        if title: -            title = self._search_regex( -                r'(.+?)\s*\|\s*Jamendo Music', title, 'title', default=None) -        if not title: -            title = self._html_search_meta( -                'name', webpage, 'title', fatal=fatal) -        mobj = re.search(r'(.+) - (.+)', title or '') -        artist, second = mobj.groups() if mobj else [None] * 2 -        return title, artist, second - - -class JamendoIE(JamendoBaseIE): +from ..utils import ( +    clean_html, +    int_or_none, +    try_get, +) + + +class JamendoIE(InfoExtractor):      _VALID_URL = r'''(?x)                      https?://                          (?:                              licensing\.jamendo\.com/[^/]+|                              (?:www\.)?jamendo\.com                          ) -                        /track/(?P<id>[0-9]+)/(?P<display_id>[^/?#&]+) +                        /track/(?P<id>[0-9]+)(?:/(?P<display_id>[^/?#&]+))?                      '''      _TESTS = [{          'url': 'https://www.jamendo.com/track/196219/stories-from-emona-i', @@ -45,7 +33,9 @@ class JamendoIE(JamendoBaseIE):              'artist': 'Maya Filipič',              'track': 'Stories from Emona I',              'duration': 210, -            'thumbnail': r're:^https?://.*\.jpg' +            'thumbnail': r're:^https?://.*\.jpg', +            'timestamp': 1217438117, +            'upload_date': '20080730',          }      }, {          'url': 'https://licensing.jamendo.com/en/track/1496667/energetic-rock', @@ -53,15 +43,19 @@ class JamendoIE(JamendoBaseIE):      }]      def _real_extract(self, url): -        mobj = self._VALID_URL_RE.match(url) -        track_id = mobj.group('id') -        display_id = mobj.group('display_id') - -        webpage = self._download_webpage( -            'https://www.jamendo.com/track/%s/%s' % (track_id, display_id), -            display_id) - -        title, artist, track = self._extract_meta(webpage) +        track_id, display_id = self._VALID_URL_RE.match(url).groups() +        webpage = self._download_webpage(url, track_id) +        models = self._parse_json(self._html_search_regex( +            r"data-bundled-models='([^']+)", +            webpage, 'bundled models'), track_id) +        track = models['track']['models'][0] +        title = track_name = track['name'] +        get_model = lambda x: try_get(models, lambda y: y[x]['models'][0], dict) or {} +        artist = get_model('artist') +        artist_name = artist.get('name') +        if artist_name: +            title = '%s - %s' % (artist_name, title) +        album = get_model('album')          formats = [{              'url': 'https://%s.jamendo.com/?trackid=%s&format=%s&from=app-97dab294' @@ -77,31 +71,58 @@ class JamendoIE(JamendoBaseIE):          ))]          self._sort_formats(formats) -        thumbnail = self._html_search_meta( -            'image', webpage, 'thumbnail', fatal=False) -        duration = parse_duration(self._search_regex( -            r'<span[^>]+itemprop=["\']duration["\'][^>]+content=["\'](.+?)["\']', -            webpage, 'duration', fatal=False)) +        urls = [] +        thumbnails = [] +        for _, covers in track.get('cover', {}).items(): +            for cover_id, cover_url in covers.items(): +                if not cover_url or cover_url in urls: +                    continue +                urls.append(cover_url) +                size = int_or_none(cover_id.lstrip('size')) +                thumbnails.append({ +                    'id': cover_id, +                    'url': cover_url, +                    'width': size, +                    'height': size, +                }) + +        tags = [] +        for tag in track.get('tags', []): +            tag_name = tag.get('name') +            if not tag_name: +                continue +            tags.append(tag_name) + +        stats = track.get('stats') or {}          return {              'id': track_id,              'display_id': display_id, -            'thumbnail': thumbnail, +            'thumbnails': thumbnails,              'title': title, -            'duration': duration, -            'artist': artist, -            'track': track, -            'formats': formats +            'description': track.get('description'), +            'duration': int_or_none(track.get('duration')), +            'artist': artist_name, +            'track': track_name, +            'album': album.get('name'), +            'formats': formats, +            'license': '-'.join(track.get('licenseCC', [])) or None, +            'timestamp': int_or_none(track.get('dateCreated')), +            'view_count': int_or_none(stats.get('listenedAll')), +            'like_count': int_or_none(stats.get('favorited')), +            'average_rating': int_or_none(stats.get('averageNote')), +            'tags': tags,          } -class JamendoAlbumIE(JamendoBaseIE): -    _VALID_URL = r'https?://(?:www\.)?jamendo\.com/album/(?P<id>[0-9]+)/(?P<display_id>[\w-]+)' +class JamendoAlbumIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?jamendo\.com/album/(?P<id>[0-9]+)'      _TEST = {          'url': 'https://www.jamendo.com/album/121486/duck-on-cover',          'info_dict': {              'id': '121486', -            'title': 'Shearer - Duck On Cover' +            'title': 'Duck On Cover', +            'description': 'md5:c2920eaeef07d7af5b96d7c64daf1239',          },          'playlist': [{              'md5': 'e1a2fcb42bda30dfac990212924149a8', @@ -111,6 +132,8 @@ class JamendoAlbumIE(JamendoBaseIE):                  'title': 'Shearer - Warmachine',                  'artist': 'Shearer',                  'track': 'Warmachine', +                'timestamp': 1368089771, +                'upload_date': '20130509',              }          }, {              'md5': '1f358d7b2f98edfe90fd55dac0799d50', @@ -120,6 +143,8 @@ class JamendoAlbumIE(JamendoBaseIE):                  'title': 'Shearer - Without Your Ghost',                  'artist': 'Shearer',                  'track': 'Without Your Ghost', +                'timestamp': 1368089771, +                'upload_date': '20130509',              }          }],          'params': { @@ -127,24 +152,35 @@ class JamendoAlbumIE(JamendoBaseIE):          }      } +    def _call_api(self, resource, resource_id): +        path = '/api/%ss' % resource +        rand = compat_str(random.random()) +        return self._download_json( +            'https://www.jamendo.com' + path, resource_id, query={ +                'id[]': resource_id, +            }, headers={ +                'X-Jam-Call': '$%s*%s~' % (hashlib.sha1((path + rand).encode()).hexdigest(), rand) +            })[0] +      def _real_extract(self, url): -        mobj = self._VALID_URL_RE.match(url) -        album_id = mobj.group('id') - -        webpage = self._download_webpage(url, mobj.group('display_id')) - -        title, artist, album = self._extract_meta(webpage, fatal=False) - -        entries = [{ -            '_type': 'url_transparent', -            'url': compat_urlparse.urljoin(url, m.group('path')), -            'ie_key': JamendoIE.ie_key(), -            'id': self._search_regex( -                r'/track/(\d+)', m.group('path'), 'track id', default=None), -            'artist': artist, -            'album': album, -        } for m in re.finditer( -            r'<a[^>]+href=(["\'])(?P<path>(?:(?!\1).)+)\1[^>]+class=["\'][^>]*js-trackrow-albumpage-link', -            webpage)] - -        return self.playlist_result(entries, album_id, title) +        album_id = self._match_id(url) +        album = self._call_api('album', album_id) +        album_name = album.get('name') + +        entries = [] +        for track in album.get('tracks', []): +            track_id = track.get('id') +            if not track_id: +                continue +            track_id = compat_str(track_id) +            entries.append({ +                '_type': 'url_transparent', +                'url': 'https://www.jamendo.com/track/' + track_id, +                'ie_key': JamendoIE.ie_key(), +                'id': track_id, +                'album': album_name, +            }) + +        return self.playlist_result( +            entries, album_id, album_name, +            clean_html(try_get(album, lambda x: x['description']['en'], compat_str))) | 
