diff options
| author | Yen Chi Hsuan <yan12125@gmail.com> | 2017-02-05 22:47:04 +0800 | 
|---|---|---|
| committer | Yen Chi Hsuan <yan12125@gmail.com> | 2017-02-05 22:47:04 +0800 | 
| commit | 019f4c03717bfd2b887309e5a4c96ea82cbedf34 (patch) | |
| tree | ed91a55edbb67810c73d8a0359818c2ac96baf82 | |
| parent | 2ab2c0d1f53f66614eda4fefb042e851e78097f0 (diff) | |
[bandcamp] Fix extraction for incomplete albums
Closes #11727
| -rw-r--r-- | ChangeLog | 1 |
| -rw-r--r-- | youtube_dl/extractor/bandcamp.py | 19 |

2 files changed, 17 insertions, 3 deletions
```diff
diff --git a/ChangeLog b/ChangeLog
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,6 +1,7 @@
 version <unreleased>
 
 Extractors
+* [bandcamp] Fix extraction for incomplete albums (#11727)
 * [iwara] Fix extraction (#11781)
 * [googledrive] Fix extraction on Python 3.6
 
diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py
index 88c590e98..056e06376 100644
--- a/youtube_dl/extractor/bandcamp.py
+++ b/youtube_dl/extractor/bandcamp.py
@@ -209,6 +209,15 @@ class BandcampAlbumIE(InfoExtractor):
             'id': 'entropy-ep',
         },
         'playlist_mincount': 3,
+    }, {
+        # not all tracks have songs
+        'url': 'https://insulters.bandcamp.com/album/we-are-the-plague',
+        'info_dict': {
+            'id': 'we-are-the-plague',
+            'title': 'WE ARE THE PLAGUE',
+            'uploader_id': 'insulters',
+        },
+        'playlist_count': 2,
     }]
 
     def _real_extract(self, url):
@@ -217,12 +226,16 @@ class BandcampAlbumIE(InfoExtractor):
         album_id = mobj.group('album_id')
         playlist_id = album_id or uploader_id
         webpage = self._download_webpage(url, playlist_id)
-        tracks_paths = re.findall(r'<a href="(.*?)" itemprop="url">', webpage)
-        if not tracks_paths:
+        track_elements = re.findall(
+            r'(?s)<div[^>]*>(.*?<a[^>]+href="([^"]+?)"[^>]+itemprop="url"[^>]*>.*?)</div>', webpage)
+        if not track_elements:
             raise ExtractorError('The page doesn\'t contain any tracks')
+        # Only tracks with duration info have songs
         entries = [
             self.url_result(compat_urlparse.urljoin(url, t_path), ie=BandcampIE.ie_key())
-            for t_path in tracks_paths]
+            for elem_content, t_path in track_elements
+            if self._html_search_meta('duration', elem_content, default=None)]
+
         title = self._html_search_regex(
             r'album_title\s*:\s*"((?:\\.|[^"\\])+?)"',
             webpage, 'title', fatal=False)
```
