aboutsummaryrefslogtreecommitdiff
path: root/youtube_dl
diff options
context:
space:
mode:
authorSergey M․ <dstftw@gmail.com>2018-08-31 03:35:55 +0700
committerSergey M․ <dstftw@gmail.com>2018-08-31 03:35:55 +0700
commit4991e16c2a62ecd1a81cbb9ff6563885c1bf6210 (patch)
tree1f4e918432deefb213d97458116f5a36b3f89600 /youtube_dl
parent14b7a24c19d70ec172eaedf799bcf81445ac9403 (diff)
[bandcamp] Extract more metadata (closes #13197)
Diffstat (limited to 'youtube_dl')
-rw-r--r--youtube_dl/extractor/bandcamp.py234
1 files changed, 142 insertions, 92 deletions
diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py
index bc4c5165a..f14b407dc 100644
--- a/youtube_dl/extractor/bandcamp.py
+++ b/youtube_dl/extractor/bandcamp.py
@@ -1,6 +1,5 @@
from __future__ import unicode_literals
-import json
import random
import re
import time
@@ -16,15 +15,18 @@ from ..utils import (
int_or_none,
KNOWN_EXTENSIONS,
parse_filesize,
+ str_or_none,
+ try_get,
unescapeHTML,
update_url_query,
unified_strdate,
+ unified_timestamp,
url_or_none,
)
class BandcampIE(InfoExtractor):
- _VALID_URL = r'https?://.*?\.bandcamp\.com/track/(?P<title>[^/?#&]+)'
+ _VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P<title>[^/?#&]+)'
_TESTS = [{
'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song',
'md5': 'c557841d5e50261777a6585648adf439',
@@ -36,24 +38,44 @@ class BandcampIE(InfoExtractor):
},
'_skip': 'There is a limit of 200 free downloads / month for the test song'
}, {
+ # free download
'url': 'http://benprunty.bandcamp.com/track/lanius-battle',
- 'md5': '0369ace6b939f0927e62c67a1a8d9fa7',
+ 'md5': '853e35bf34aa1d6fe2615ae612564b36',
'info_dict': {
'id': '2650410135',
'ext': 'aiff',
'title': 'Ben Prunty - Lanius (Battle)',
+ 'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'Ben Prunty',
+ 'timestamp': 1396508491,
+ 'upload_date': '20140403',
+ 'release_date': '20140403',
+ 'duration': 260.877,
+ 'track': 'Lanius (Battle)',
+ 'track_number': 1,
+ 'track_id': '2650410135',
+ 'artist': 'Ben Prunty',
+ 'album': 'FTL: Advanced Edition Soundtrack',
},
}, {
+ # no free download, mp3 128
'url': 'https://relapsealumni.bandcamp.com/track/hail-to-fire',
+ 'md5': 'fec12ff55e804bb7f7ebeb77a800c8b7',
'info_dict': {
'id': '2584466013',
'ext': 'mp3',
- 'title': 'Hail to Fire',
+ 'title': 'Mastodon - Hail to Fire',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'uploader': 'Mastodon',
+ 'timestamp': 1322005399,
+ 'upload_date': '20111122',
+ 'release_date': '20040207',
+ 'duration': 120.79,
+ 'track': 'Hail to Fire',
'track_number': 5,
- },
- 'params': {
- 'skip_download': True,
+ 'track_id': '2584466013',
+ 'artist': 'Mastodon',
+ 'album': 'Call of the Mastodon',
},
}]
@@ -62,19 +84,23 @@ class BandcampIE(InfoExtractor):
title = mobj.group('title')
webpage = self._download_webpage(url, title)
thumbnail = self._html_search_meta('og:image', webpage, default=None)
- m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
- if not m_download:
- m_trackinfo = re.search(r'trackinfo: (.+),\s*?\n', webpage)
- if m_trackinfo:
- json_code = m_trackinfo.group(1)
- data = json.loads(json_code)[0]
- track_id = compat_str(data['id'])
-
- if not data.get('file'):
- raise ExtractorError('Not streamable', video_id=track_id, expected=True)
-
- formats = []
- for format_id, format_url in data['file'].items():
+
+ track_id = None
+ track = None
+ track_number = None
+ duration = None
+
+ formats = []
+ track_info = self._parse_json(
+ self._search_regex(
+ r'trackinfo\s*:\s*\[\s*({.+?})\s*\]\s*,\s*?\n',
+ webpage, 'track info', default='{}'), title)
+ if track_info:
+ file_ = track_info.get('file')
+ if isinstance(file_, dict):
+ for format_id, format_url in file_.items():
+ if not url_or_none(format_url):
+ continue
ext, abr_str = format_id.split('-', 1)
formats.append({
'format_id': format_id,
@@ -84,86 +110,110 @@ class BandcampIE(InfoExtractor):
'acodec': ext,
'abr': int_or_none(abr_str),
})
+ track = track_info.get('title')
+ track_id = str_or_none(track_info.get('track_id') or track_info.get('id'))
+ track_number = int_or_none(track_info.get('track_num'))
+ duration = float_or_none(track_info.get('duration'))
+
+ def extract(key):
+ return self._search_regex(
+ r'\b%s\s*["\']?\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1' % key,
+ webpage, key, default=None, group='value')
+
+ artist = extract('artist')
+ album = extract('album_title')
+ timestamp = unified_timestamp(
+ extract('publish_date') or extract('album_publish_date'))
+ release_date = unified_strdate(extract('album_release_date'))
+
+ download_link = self._search_regex(
+ r'freeDownloadPage\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage,
+ 'download link', default=None, group='url')
+ if download_link:
+ track_id = self._search_regex(
+ r'(?ms)var TralbumData = .*?[{,]\s*id: (?P<id>\d+),?$',
+ webpage, 'track id')
+
+ download_webpage = self._download_webpage(
+ download_link, track_id, 'Downloading free downloads page')
+
+ blob = self._parse_json(
+ self._search_regex(
+ r'data-blob=(["\'])(?P<blob>{.+?})\1', download_webpage,
+ 'blob', group='blob'),
+ track_id, transform_source=unescapeHTML)
+
+ info = try_get(
+ blob, (lambda x: x['digital_items'][0],
+ lambda x: x['download_items'][0]), dict)
+ if info:
+ downloads = info.get('downloads')
+ if isinstance(downloads, dict):
+ if not track:
+ track = info.get('title')
+ if not artist:
+ artist = info.get('artist')
+ if not thumbnail:
+ thumbnail = info.get('thumb_url')
+
+ download_formats = {}
+ download_formats_list = blob.get('download_formats')
+ if isinstance(download_formats_list, list):
+ for f in blob['download_formats']:
+ name, ext = f.get('name'), f.get('file_extension')
+ if all(isinstance(x, compat_str) for x in (name, ext)):
+ download_formats[name] = ext.strip('.')
+
+ for format_id, f in downloads.items():
+ format_url = f.get('url')
+ if not format_url:
+ continue
+ # Stat URL generation algorithm is reverse engineered from
+ # download_*_bundle_*.js
+ stat_url = update_url_query(
+ format_url.replace('/download/', '/statdownload/'), {
+ '.rand': int(time.time() * 1000 * random.random()),
+ })
+ format_id = f.get('encoding_name') or format_id
+ stat = self._download_json(
+ stat_url, track_id, 'Downloading %s JSON' % format_id,
+ transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1],
+ fatal=False)
+ if not stat:
+ continue
+ retry_url = url_or_none(stat.get('retry_url'))
+ if not retry_url:
+ continue
+ formats.append({
+ 'url': self._proto_relative_url(retry_url, 'http:'),
+ 'ext': download_formats.get(format_id),
+ 'format_id': format_id,
+ 'format_note': f.get('description'),
+ 'filesize': parse_filesize(f.get('size_mb')),
+ 'vcodec': 'none',
+ })
- self._sort_formats(formats)
-
- return {
- 'id': track_id,
- 'title': data['title'],
- 'thumbnail': thumbnail,
- 'formats': formats,
- 'duration': float_or_none(data.get('duration')),
- 'track_number': int_or_none(data.get('track_num')),
- }
- else:
- raise ExtractorError('No free songs found')
-
- download_link = m_download.group(1)
- video_id = self._search_regex(
- r'(?ms)var TralbumData = .*?[{,]\s*id: (?P<id>\d+),?$',
- webpage, 'video id')
-
- download_webpage = self._download_webpage(
- download_link, video_id, 'Downloading free downloads page')
-
- blob = self._parse_json(
- self._search_regex(
- r'data-blob=(["\'])(?P<blob>{.+?})\1', download_webpage,
- 'blob', group='blob'),
- video_id, transform_source=unescapeHTML)
-
- info = blob['digital_items'][0]
-
- downloads = info['downloads']
- track = info['title']
+ self._sort_formats(formats)
- artist = info.get('artist')
title = '%s - %s' % (artist, track) if artist else track
- download_formats = {}
- for f in blob['download_formats']:
- name, ext = f.get('name'), f.get('file_extension')
- if all(isinstance(x, compat_str) for x in (name, ext)):
- download_formats[name] = ext.strip('.')
-
- formats = []
- for format_id, f in downloads.items():
- format_url = f.get('url')
- if not format_url:
- continue
- # Stat URL generation algorithm is reverse engineered from
- # download_*_bundle_*.js
- stat_url = update_url_query(
- format_url.replace('/download/', '/statdownload/'), {
- '.rand': int(time.time() * 1000 * random.random()),
- })
- format_id = f.get('encoding_name') or format_id
- stat = self._download_json(
- stat_url, video_id, 'Downloading %s JSON' % format_id,
- transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1],
- fatal=False)
- if not stat:
- continue
- retry_url = url_or_none(stat.get('retry_url'))
- if not retry_url:
- continue
- formats.append({
- 'url': self._proto_relative_url(retry_url, 'http:'),
- 'ext': download_formats.get(format_id),
- 'format_id': format_id,
- 'format_note': f.get('description'),
- 'filesize': parse_filesize(f.get('size_mb')),
- 'vcodec': 'none',
- })
- self._sort_formats(formats)
+ if not duration:
+ duration = float_or_none(self._html_search_meta(
+ 'duration', webpage, default=None))
return {
- 'id': video_id,
+ 'id': track_id,
'title': title,
- 'thumbnail': info.get('thumb_url') or thumbnail,
- 'uploader': info.get('artist'),
- 'artist': artist,
+ 'thumbnail': thumbnail,
+ 'uploader': artist,
+ 'timestamp': timestamp,
+ 'release_date': release_date,
+ 'duration': duration,
'track': track,
+ 'track_number': track_number,
+ 'track_id': track_id,
+ 'artist': artist,
+ 'album': album,
'formats': formats,
}