diff options
author | Yen Chi Hsuan <yan12125@gmail.com> | 2017-02-05 22:47:04 +0800 |
---|---|---|
committer | Yen Chi Hsuan <yan12125@gmail.com> | 2017-02-05 22:47:04 +0800 |
commit | 019f4c03717bfd2b887309e5a4c96ea82cbedf34 (patch) | |
tree | ed91a55edbb67810c73d8a0359818c2ac96baf82 | |
parent | 2ab2c0d1f53f66614eda4fefb042e851e78097f0 (diff) |
[bandcamp] Fix extraction for incomplete albums
Closes #11727
-rw-r--r-- | ChangeLog | 1 | ||||
-rw-r--r-- | youtube_dl/extractor/bandcamp.py | 19 |
2 files changed, 17 insertions, 3 deletions
@@ -1,6 +1,7 @@
 version <unreleased>
 
 Extractors
+* [bandcamp] Fix extraction for incomplete albums (#11727)
 * [iwara] Fix extraction (#11781)
 * [googledrive] Fix extraction on Python 3.6
 
diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py
index 88c590e98..056e06376 100644
--- a/youtube_dl/extractor/bandcamp.py
+++ b/youtube_dl/extractor/bandcamp.py
@@ -209,6 +209,15 @@ class BandcampAlbumIE(InfoExtractor):
             'id': 'entropy-ep',
         },
         'playlist_mincount': 3,
+    }, {
+        # not all tracks have songs
+        'url': 'https://insulters.bandcamp.com/album/we-are-the-plague',
+        'info_dict': {
+            'id': 'we-are-the-plague',
+            'title': 'WE ARE THE PLAGUE',
+            'uploader_id': 'insulters',
+        },
+        'playlist_count': 2,
     }]
 
     def _real_extract(self, url):
@@ -217,12 +226,16 @@ class BandcampAlbumIE(InfoExtractor):
         album_id = mobj.group('album_id')
         playlist_id = album_id or uploader_id
         webpage = self._download_webpage(url, playlist_id)
-        tracks_paths = re.findall(r'<a href="(.*?)" itemprop="url">', webpage)
-        if not tracks_paths:
+        track_elements = re.findall(
+            r'(?s)<div[^>]*>(.*?<a[^>]+href="([^"]+?)"[^>]+itemprop="url"[^>]*>.*?)</div>', webpage)
+        if not track_elements:
             raise ExtractorError('The page doesn\'t contain any tracks')
+        # Only tracks with duration info have songs
         entries = [
             self.url_result(compat_urlparse.urljoin(url, t_path), ie=BandcampIE.ie_key())
-            for t_path in tracks_paths]
+            for elem_content, t_path in track_elements
+            if self._html_search_meta('duration', elem_content, default=None)]
+
         title = self._html_search_regex(
             r'album_title\s*:\s*"((?:\\.|[^"\\])+?)"',
             webpage, 'title', fatal=False)