diff options
| author | Remita Amine <remitamine@gmail.com> | 2020-11-16 17:26:09 +0100 | 
|---|---|---|
| committer | Remita Amine <remitamine@gmail.com> | 2020-11-16 17:26:09 +0100 | 
| commit | 3f1748b9445e9d9367d29221c4b7bf9b88895e4e (patch) | |
| tree | 6ff7390faa34751be45c21294a0ca00554adff41 | |
| parent | fe13087cd152c7eb98ea418bad0419a15c767c08 (diff) | |
[bandcamp] fix extraction
| -rw-r--r-- | youtube_dl/extractor/bandcamp.py | 151 | 
1 files changed, 59 insertions, 92 deletions
diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index f14b407dc..731c7c25c 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -1,3 +1,4 @@ +# coding: utf-8  from __future__ import unicode_literals  import random @@ -5,10 +6,7 @@ import re  import time  from .common import InfoExtractor -from ..compat import ( -    compat_str, -    compat_urlparse, -) +from ..compat import compat_str  from ..utils import (      ExtractorError,      float_or_none, @@ -17,30 +15,32 @@ from ..utils import (      parse_filesize,      str_or_none,      try_get, -    unescapeHTML,      update_url_query,      unified_strdate,      unified_timestamp,      url_or_none, +    urljoin,  )  class BandcampIE(InfoExtractor): -    _VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P<title>[^/?#&]+)' +    _VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P<id>[^/?#&]+)'      _TESTS = [{          'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song',          'md5': 'c557841d5e50261777a6585648adf439',          'info_dict': {              'id': '1812978515',              'ext': 'mp3', -            'title': "youtube-dl  \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad", +            'title': "youtube-dl  \"'/\\ä↭ - youtube-dl  \"'/\\ä↭ - youtube-dl test song \"'/\\ä↭",              'duration': 9.8485, +            'uploader': 'youtube-dl  "\'/\\ä↭', +            'upload_date': '20121129', +            'timestamp': 1354224127,          },          '_skip': 'There is a limit of 200 free downloads / month for the test song'      }, {          # free download          'url': 'http://benprunty.bandcamp.com/track/lanius-battle', -        'md5': '853e35bf34aa1d6fe2615ae612564b36',          'info_dict': {              'id': '2650410135',              'ext': 'aiff', @@ -79,11 +79,16 @@ class BandcampIE(InfoExtractor):          },      }] +    def _extract_data_attr(self, webpage, video_id, attr='tralbum', fatal=True): +        return self._parse_json(self._html_search_regex( +            r'data-%s=(["\'])({.+?})\1' % attr, webpage, +            attr + ' data', group=2), video_id, fatal=fatal) +      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        title = mobj.group('title') +        title = self._match_id(url)          webpage = self._download_webpage(url, title) -        thumbnail = self._html_search_meta('og:image', webpage, default=None) +        tralbum = self._extract_data_attr(webpage, title) +        thumbnail = self._og_search_thumbnail(webpage)          track_id = None          track = None @@ -91,10 +96,7 @@ class BandcampIE(InfoExtractor):          duration = None          formats = [] -        track_info = self._parse_json( -            self._search_regex( -                r'trackinfo\s*:\s*\[\s*({.+?})\s*\]\s*,\s*?\n', -                webpage, 'track info', default='{}'), title) +        track_info = try_get(tralbum, lambda x: x['trackinfo'][0], dict)          if track_info:              file_ = track_info.get('file')              if isinstance(file_, dict): @@ -111,37 +113,25 @@ class BandcampIE(InfoExtractor):                          'abr': int_or_none(abr_str),                      })              track = track_info.get('title') -            track_id = str_or_none(track_info.get('track_id') or track_info.get('id')) +            track_id = str_or_none( +                track_info.get('track_id') or track_info.get('id'))              track_number = int_or_none(track_info.get('track_num'))              duration = float_or_none(track_info.get('duration')) -        def extract(key): -            return self._search_regex( -                r'\b%s\s*["\']?\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1' % key, -                webpage, key, default=None, group='value') - -        artist = extract('artist') -        album = extract('album_title') +        embed = self._extract_data_attr(webpage, title, 'embed', False) +        current = tralbum.get('current') or {} +        artist = embed.get('artist') or current.get('artist') or tralbum.get('artist')          timestamp = unified_timestamp( -            extract('publish_date') or extract('album_publish_date')) -        release_date = unified_strdate(extract('album_release_date')) +            current.get('publish_date') or tralbum.get('album_publish_date')) -        download_link = self._search_regex( -            r'freeDownloadPage\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, -            'download link', default=None, group='url') +        download_link = tralbum.get('freeDownloadPage')          if download_link: -            track_id = self._search_regex( -                r'(?ms)var TralbumData = .*?[{,]\s*id: (?P<id>\d+),?$', -                webpage, 'track id') +            track_id = compat_str(tralbum['id'])              download_webpage = self._download_webpage(                  download_link, track_id, 'Downloading free downloads page') -            blob = self._parse_json( -                self._search_regex( -                    r'data-blob=(["\'])(?P<blob>{.+?})\1', download_webpage, -                    'blob', group='blob'), -                track_id, transform_source=unescapeHTML) +            blob = self._extract_data_attr(download_webpage, track_id, 'blob')              info = try_get(                  blob, (lambda x: x['digital_items'][0], @@ -207,20 +197,20 @@ class BandcampIE(InfoExtractor):              'thumbnail': thumbnail,              'uploader': artist,              'timestamp': timestamp, -            'release_date': release_date, +            'release_date': unified_strdate(tralbum.get('album_release_date')),              'duration': duration,              'track': track,              'track_number': track_number,              'track_id': track_id,              'artist': artist, -            'album': album, +            'album': embed.get('album_title'),              'formats': formats,          } -class BandcampAlbumIE(InfoExtractor): +class BandcampAlbumIE(BandcampIE):      IE_NAME = 'Bandcamp:album' -    _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<album_id>[^/?#&]+))?' +    _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<id>[^/?#&]+))?'      _TESTS = [{          'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1', @@ -230,7 +220,10 @@ class BandcampAlbumIE(InfoExtractor):                  'info_dict': {                      'id': '1353101989',                      'ext': 'mp3', -                    'title': 'Intro', +                    'title': 'Blazo - Intro', +                    'timestamp': 1311756226, +                    'upload_date': '20110727', +                    'uploader': 'Blazo',                  }              },              { @@ -238,7 +231,10 @@ class BandcampAlbumIE(InfoExtractor):                  'info_dict': {                      'id': '38097443',                      'ext': 'mp3', -                    'title': 'Kero One - Keep It Alive (Blazo remix)', +                    'title': 'Blazo - Kero One - Keep It Alive (Blazo remix)', +                    'timestamp': 1311757238, +                    'upload_date': '20110727', +                    'uploader': 'Blazo',                  }              },          ], @@ -294,41 +290,31 @@ class BandcampAlbumIE(InfoExtractor):                  else super(BandcampAlbumIE, cls).suitable(url))      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        uploader_id = mobj.group('subdomain') -        album_id = mobj.group('album_id') +        uploader_id, album_id = re.match(self._VALID_URL, url).groups()          playlist_id = album_id or uploader_id          webpage = self._download_webpage(url, playlist_id) -        track_elements = re.findall( -            r'(?s)<div[^>]*>(.*?<a[^>]+href="([^"]+?)"[^>]+itemprop="url"[^>]*>.*?)</div>', webpage) -        if not track_elements: +        tralbum = self._extract_data_attr(webpage, playlist_id) +        track_info = tralbum.get('trackinfo') +        if not track_info:              raise ExtractorError('The page doesn\'t contain any tracks')          # Only tracks with duration info have songs          entries = [              self.url_result( -                compat_urlparse.urljoin(url, t_path), -                ie=BandcampIE.ie_key(), -                video_title=self._search_regex( -                    r'<span\b[^>]+\bitemprop=["\']name["\'][^>]*>([^<]+)', -                    elem_content, 'track title', fatal=False)) -            for elem_content, t_path in track_elements -            if self._html_search_meta('duration', elem_content, default=None)] - -        title = self._html_search_regex( -            r'album_title\s*:\s*"((?:\\.|[^"\\])+?)"', -            webpage, 'title', fatal=False) -        if title: -            title = title.replace(r'\"', '"') +                urljoin(url, t['title_link']), BandcampIE.ie_key(), +                str_or_none(t.get('track_id') or t.get('id')), t.get('title')) +            for t in track_info +            if t.get('duration')] +          return {              '_type': 'playlist',              'uploader_id': uploader_id,              'id': playlist_id, -            'title': title, +            'title': try_get(tralbum, lambda x: x['current']['title'], compat_str),              'entries': entries,          } -class BandcampWeeklyIE(InfoExtractor): +class BandcampWeeklyIE(BandcampIE):      IE_NAME = 'Bandcamp:weekly'      _VALID_URL = r'https?://(?:www\.)?bandcamp\.com/?\?(?:.*?&)?show=(?P<id>\d+)'      _TESTS = [{ @@ -343,29 +329,23 @@ class BandcampWeeklyIE(InfoExtractor):              'release_date': '20170404',              'series': 'Bandcamp Weekly',              'episode': 'Magic Moments', -            'episode_number': 208,              'episode_id': '224', -        } +        }, +        'params': { +            'format': 'opus-lo', +        },      }, {          'url': 'https://bandcamp.com/?blah/blah@&show=228',          'only_matching': True      }]      def _real_extract(self, url): -        video_id = self._match_id(url) -        webpage = self._download_webpage(url, video_id) +        show_id = self._match_id(url) +        webpage = self._download_webpage(url, show_id) -        blob = self._parse_json( -            self._search_regex( -                r'data-blob=(["\'])(?P<blob>{.+?})\1', webpage, -                'blob', group='blob'), -            video_id, transform_source=unescapeHTML) +        blob = self._extract_data_attr(webpage, show_id, 'blob') -        show = blob['bcw_show'] - -        # This is desired because any invalid show id redirects to `bandcamp.com` -        # which happens to expose the latest Bandcamp Weekly episode. -        show_id = int_or_none(show.get('show_id')) or int_or_none(video_id) +        show = blob['bcw_data'][show_id]          formats = []          for format_id, format_url in show['audio_stream'].items(): @@ -390,20 +370,8 @@ class BandcampWeeklyIE(InfoExtractor):          if subtitle:              title += ' - %s' % subtitle -        episode_number = None -        seq = blob.get('bcw_seq') - -        if seq and isinstance(seq, list): -            try: -                episode_number = next( -                    int_or_none(e.get('episode_number')) -                    for e in seq -                    if isinstance(e, dict) and int_or_none(e.get('id')) == show_id) -            except StopIteration: -                pass -          return { -            'id': video_id, +            'id': show_id,              'title': title,              'description': show.get('desc') or show.get('short_desc'),              'duration': float_or_none(show.get('audio_duration')), @@ -411,7 +379,6 @@ class BandcampWeeklyIE(InfoExtractor):              'release_date': unified_strdate(show.get('published_date')),              'series': 'Bandcamp Weekly',              'episode': show.get('subtitle'), -            'episode_number': episode_number, -            'episode_id': compat_str(video_id), +            'episode_id': show_id,              'formats': formats          }  | 
