about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Yen Chi Hsuan <yan12125@gmail.com> 2017-02-05 22:47:04 +0800
committer Yen Chi Hsuan <yan12125@gmail.com> 2017-02-05 22:47:04 +0800
commit 019f4c03717bfd2b887309e5a4c96ea82cbedf34 (patch)
tree ed91a55edbb67810c73d8a0359818c2ac96baf82
parent 2ab2c0d1f53f66614eda4fefb042e851e78097f0 (diff)
[bandcamp] Fix extraction for incomplete albums
Closes #11727
-rw-r--r-- ChangeLog 1
-rw-r--r-- youtube_dl/extractor/bandcamp.py 19
2 files changed, 17 insertions, 3 deletions
diff --git a/ChangeLog b/ChangeLog
index 77286dbef..984191925 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,6 +1,7 @@
version <unreleased>
Extractors
+* [bandcamp] Fix extraction for incomplete albums (#11727)
* [iwara] Fix extraction (#11781)
* [googledrive] Fix extraction on Python 3.6
diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py
index 88c590e98..056e06376 100644
--- a/youtube_dl/extractor/bandcamp.py
+++ b/youtube_dl/extractor/bandcamp.py
@@ -209,6 +209,15 @@ class BandcampAlbumIE(InfoExtractor):
'id': 'entropy-ep',
},
'playlist_mincount': 3,
+ }, {
+ # not all tracks have songs
+ 'url': 'https://insulters.bandcamp.com/album/we-are-the-plague',
+ 'info_dict': {
+ 'id': 'we-are-the-plague',
+ 'title': 'WE ARE THE PLAGUE',
+ 'uploader_id': 'insulters',
+ },
+ 'playlist_count': 2,
}]
def _real_extract(self, url):
@@ -217,12 +226,16 @@ class BandcampAlbumIE(InfoExtractor):
album_id = mobj.group('album_id')
playlist_id = album_id or uploader_id
webpage = self._download_webpage(url, playlist_id)
- tracks_paths = re.findall(r'<a href="(.*?)" itemprop="url">', webpage)
- if not tracks_paths:
+ track_elements = re.findall(
+ r'(?s)<div[^>]*>(.*?<a[^>]+href="([^"]+?)"[^>]+itemprop="url"[^>]*>.*?)</div>', webpage)
+ if not track_elements:
raise ExtractorError('The page doesn\'t contain any tracks')
+ # Only tracks with duration info have songs
entries = [
self.url_result(compat_urlparse.urljoin(url, t_path), ie=BandcampIE.ie_key())
- for t_path in tracks_paths]
+ for elem_content, t_path in track_elements
+ if self._html_search_meta('duration', elem_content, default=None)]
+
title = self._html_search_regex(
r'album_title\s*:\s*"((?:\\.|[^"\\])+?)"',
webpage, 'title', fatal=False)