about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Sergey M․ <dstftw@gmail.com> 2016-04-30 21:50:23 +0600
committer: Sergey M․ <dstftw@gmail.com> 2016-04-30 21:50:23 +0600
commit: 4e0c0c1508810eb494cd32ef00fb75d03d03ce6f (patch)
tree: 3ff69bac8ade3e5c14c7204f3b01592969bceb00
parent: 89c0dc9a5fadc3927f7c03f5829e4f2ef8555888 (diff)
[xiami] Improve extraction (Closes #9079)
* Switch to JSON source
* Add abstract IE for playlists
* Extract more track related metadata
-rw-r--r-- youtube_dl/extractor/extractors.py | 2
-rw-r--r-- youtube_dl/extractor/xiami.py | 199
2 files changed, 99 insertions, 102 deletions
diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index 14ca9eaee..737960a01 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -942,7 +942,7 @@ from .xhamster import (
XHamsterEmbedIE,
)
from .xiami import (
- XiamiIE,
+ XiamiSongIE,
XiamiAlbumIE,
XiamiArtistIE,
XiamiCollectionIE
diff --git a/youtube_dl/extractor/xiami.py b/youtube_dl/extractor/xiami.py
index a28d63c48..e4ed306b4 100644
--- a/youtube_dl/extractor/xiami.py
+++ b/youtube_dl/extractor/xiami.py
@@ -1,50 +1,42 @@
-# -*- coding: utf-8 -*-
-
+# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..utils import (
- xpath_element,
- xpath_text,
- xpath_with_ns,
- int_or_none,
- ExtractorError
-)
from ..compat import compat_urllib_parse_unquote
+from ..utils import int_or_none
class XiamiBaseIE(InfoExtractor):
-
- _XML_BASE_URL = 'http://www.xiami.com/song/playlist/id'
- _NS_MAP = {'xm': 'http://xspf.org/ns/0/'}
-
- def _extract_track(self, track):
- artist = xpath_text(track, xpath_with_ns('xm:artist', self._NS_MAP), default='')
- artist = artist.split(';')
-
- ret = {
- 'id': xpath_text(track, xpath_with_ns('xm:song_id', self._NS_MAP)),
- 'title': xpath_text(track, xpath_with_ns('xm:title', self._NS_MAP)),
- 'album': xpath_text(track, xpath_with_ns('xm:album_name', self._NS_MAP)),
- 'artist': ';'.join(artist) if artist else None,
- 'creator': artist[0] if artist else None,
- 'url': self._decrypt(xpath_text(track, xpath_with_ns('xm:location', self._NS_MAP))),
- 'thumbnail': xpath_text(track, xpath_with_ns('xm:pic', self._NS_MAP), default=None),
- 'duration': int_or_none(xpath_text(track, xpath_with_ns('xm:length', self._NS_MAP))),
+ _API_BASE_URL = 'http://www.xiami.com/song/playlist/cat/json/id'
+
+ def _extract_track(self, track, track_id=None):
+ title = track['title']
+ track_url = self._decrypt(track['location'])
+
+ subtitles = {}
+ lyrics_url = track.get('lyric_url') or track.get('lyric')
+ if lyrics_url and lyrics_url.startswith('http'):
+ subtitles['origin'] = [{'url': lyrics_url}]
+
+ return {
+ 'id': track.get('song_id') or track_id,
+ 'url': track_url,
+ 'title': title,
+ 'thumbnail': track.get('pic') or track.get('album_pic'),
+ 'duration': int_or_none(track.get('length')),
+ 'creator': track.get('artist', '').split(';')[0],
+ 'track': title,
+ 'album': track.get('album_name'),
+ 'artist': track.get('artist'),
+ 'subtitles': subtitles,
}
- lyrics_url = xpath_text(track, xpath_with_ns('xm:lyric', self._NS_MAP))
- if lyrics_url and lyrics_url.endswith('.lrc'):
- ret['description'] = self._download_webpage(lyrics_url, ret['id'])
- return ret
-
- def _extract_xml(self, _id, typ=''):
- playlist = self._download_xml('%s/%s%s' % (self._XML_BASE_URL, _id, typ), _id)
- tracklist = xpath_element(playlist, xpath_with_ns('./xm:trackList', self._NS_MAP))
-
- if not len(tracklist):
- raise ExtractorError('No track found')
- return [self._extract_track(track) for track in tracklist]
+ def _extract_tracks(self, item_id, typ=None):
+ playlist = self._download_json(
+ '%s/%s%s' % (self._API_BASE_URL, item_id, '/type/%s' % typ if typ else ''), item_id)
+ return [
+ self._extract_track(track, item_id)
+ for track in playlist['data']['trackList']]
@staticmethod
def _decrypt(origin):
@@ -62,75 +54,87 @@ class XiamiBaseIE(InfoExtractor):
ans = ''
for i in range(0, short_lenth + 1):
for j in range(0, n):
- if len(l[j])>i:
+ if len(l[j]) > i:
ans += l[j][i]
return compat_urllib_parse_unquote(ans).replace('^', '0')
-class XiamiIE(XiamiBaseIE):
+class XiamiSongIE(XiamiBaseIE):
IE_NAME = 'xiami:song'
IE_DESC = '虾米音乐'
- _VALID_URL = r'http://www\.xiami\.com/song/(?P<id>[0-9]+)'
- _TESTS = [
- {
- 'url': 'http://www.xiami.com/song/1775610518',
- 'md5': '521dd6bea40fd5c9c69f913c232cb57e',
- 'info_dict': {
- 'id': '1775610518',
- 'ext': 'mp3',
- 'title': 'Woman',
- 'creator': 'HONNE',
- 'album': 'Woman',
- 'thumbnail': r're:http://img\.xiami\.net/images/album/.*\.jpg',
- 'description': 'md5:052ec7de41ca19f67e7fd70a1bfc4e0b',
- }
- },
- {
- 'url': 'http://www.xiami.com/song/1775256504',
- 'md5': '932a3abd45c6aa2b1fdbe028fcb4c4fc',
- 'info_dict': {
- 'id': '1775256504',
- 'ext': 'mp3',
- 'title': '悟空',
- 'creator': '戴荃',
- 'album': '悟空',
- 'description': 'md5:206e67e84f9bed1d473d04196a00b990',
- }
- },
- ]
+ _VALID_URL = r'https?://(?:www\.)?xiami\.com/song/(?P<id>[0-9]+)'
+ _TESTS = [{
+ 'url': 'http://www.xiami.com/song/1775610518',
+ 'md5': '521dd6bea40fd5c9c69f913c232cb57e',
+ 'info_dict': {
+ 'id': '1775610518',
+ 'ext': 'mp3',
+ 'title': 'Woman',
+ 'thumbnail': r're:http://img\.xiami\.net/images/album/.*\.jpg',
+ 'duration': 265,
+ 'creator': 'HONNE',
+ 'track': 'Woman',
+ 'album': 'Woman',
+ 'artist': 'HONNE',
+ 'subtitles': {
+ 'origin': [{
+ 'ext': 'lrc',
+ }],
+ },
+ }
+ }, {
+ 'url': 'http://www.xiami.com/song/1775256504',
+ 'md5': '932a3abd45c6aa2b1fdbe028fcb4c4fc',
+ 'info_dict': {
+ 'id': '1775256504',
+ 'ext': 'mp3',
+ 'title': '悟空',
+ 'thumbnail': r're:http://img\.xiami\.net/images/album/.*\.jpg',
+ 'duration': 200,
+ 'creator': '戴荃',
+ 'track': '悟空',
+ 'album': '悟空',
+ 'artist': '戴荃',
+ 'subtitles': {
+ 'origin': [{
+ 'ext': 'lrc',
+ }],
+ },
+ }
+ }]
def _real_extract(self, url):
- _id = self._match_id(url)
- return self._extract_xml(_id)[0]
+ return self._extract_tracks(self._match_id(url))[0]
-class XiamiAlbumIE(XiamiBaseIE):
+class XiamiPlaylistBaseIE(XiamiBaseIE):
+ def _real_extract(self, url):
+ item_id = self._match_id(url)
+ return self.playlist_result(self._extract_tracks(item_id, self._TYPE), item_id)
+
+
+class XiamiAlbumIE(XiamiPlaylistBaseIE):
IE_NAME = 'xiami:album'
IE_DESC = '虾米音乐 - 专辑'
- _VALID_URL = r'http://www\.xiami\.com/album/(?P<id>[0-9]+)'
- _TESTS = [
- {
- 'url': 'http://www.xiami.com/album/2100300444',
- 'info_dict': {
- 'id': '2100300444',
- },
- 'playlist_count': 10,
+ _VALID_URL = r'https?://(?:www\.)?xiami\.com/album/(?P<id>[0-9]+)'
+ _TYPE = '1'
+ _TESTS = [{
+ 'url': 'http://www.xiami.com/album/2100300444',
+ 'info_dict': {
+ 'id': '2100300444',
},
- {
- 'url': 'http://www.xiami.com/album/512288?spm=a1z1s.6843761.1110925389.6.hhE9p9',
- 'only_matching': True,
- }
- ]
-
- def _real_extract(self, url):
- _id = self._match_id(url)
- return self.playlist_result(self._extract_xml(_id, '/type/1'), _id)
+ 'playlist_count': 10,
+ }, {
+ 'url': 'http://www.xiami.com/album/512288?spm=a1z1s.6843761.1110925389.6.hhE9p9',
+ 'only_matching': True,
+ }]
-class XiamiArtistIE(XiamiBaseIE):
+class XiamiArtistIE(XiamiPlaylistBaseIE):
IE_NAME = 'xiami:artist'
IE_DESC = '虾米音乐 - 歌手'
- _VALID_URL = r'http://www\.xiami\.com/artist/(?P<id>[0-9]+)'
+ _VALID_URL = r'https?://(?:www\.)?xiami\.com/artist/(?P<id>[0-9]+)'
+ _TYPE = '2'
_TEST = {
'url': 'http://www.xiami.com/artist/2132?spm=0.0.0.0.dKaScp',
'info_dict': {
@@ -139,23 +143,16 @@ class XiamiArtistIE(XiamiBaseIE):
'playlist_count': 20,
}
- def _real_extract(self, url):
- _id = self._match_id(url)
- return self.playlist_result(self._extract_xml(_id, '/type/2'), _id)
-
-class XiamiCollectionIE(XiamiBaseIE):
+class XiamiCollectionIE(XiamiPlaylistBaseIE):
IE_NAME = 'xiami:collection'
IE_DESC = '虾米音乐 - 精选集'
- _VALID_URL = r'http://www\.xiami\.com/collect/(?P<id>[0-9]+)'
+ _VALID_URL = r'https?://(?:www\.)?xiami\.com/collect/(?P<id>[0-9]+)'
+ _TYPE = '3'
_TEST = {
'url': 'http://www.xiami.com/collect/156527391?spm=a1z1s.2943601.6856193.12.4jpBnr',
'info_dict': {
'id': '156527391',
},
- 'playlist_count': 26,
+ 'playlist_mincount': 29,
}
-
- def _real_extract(self, url):
- _id = self._match_id(url)
- return self.playlist_result(self._extract_xml(_id, '/type/3'), _id)