diff options
author | Yen Chi Hsuan <yan12125@gmail.com> | 2017-07-20 22:57:51 +0800 |
---|---|---|
committer | Yen Chi Hsuan <yan12125@gmail.com> | 2017-07-20 22:57:51 +0800 |
commit | fa63cf6c2301972b7d0ae76fb7a11c7d1a2786a9 (patch) | |
tree | 6f290e729b83afd9574bbf045b1e8fd9432aabc9 | |
parent | 85f5a74b6cf44b0c8b612c264c36eaabd958f501 (diff) |
[youku:show] Fix playlist extraction (closes #13248)
-rw-r--r-- | ChangeLog | 1 | ||||
-rw-r--r-- | youtube_dl/extractor/youku.py | 66 |
2 files changed, 39 insertions, 28 deletions
@@ -1,6 +1,7 @@ version <unreleased> Extractors +* [youku:show] Fix playlist extraction (#13248) + [dispeak] Recognize sevt subdomain (#13276) diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index dcce15d77..4ae9adb51 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -1,7 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import itertools import random import re import string @@ -222,50 +221,61 @@ class YoukuShowIE(InfoExtractor): _VALID_URL = r'https?://list\.youku\.com/show/id_(?P<id>[0-9a-z]+)\.html' IE_NAME = 'youku:show' - _TEST = { + _TESTS = [{ 'url': 'http://list.youku.com/show/id_zc7c670be07ff11e48b3f.html', 'info_dict': { 'id': 'zc7c670be07ff11e48b3f', - 'title': '花千骨 未删减版', + 'title': '花千骨 DVD版', 'description': 'md5:a1ae6f5618571bbeb5c9821f9c81b558', }, 'playlist_count': 50, - } + }, { + # Episode number not starting from 1 + 'url': 'http://list.youku.com/show/id_zefbfbd70efbfbd780bef.html', + 'info_dict': { + 'id': 'zefbfbd70efbfbd780bef', + 'title': '超级飞侠3', + 'description': 'md5:275715156abebe5ccc2a1992e9d56b98', + }, + 'playlist_count': 24, + }] - _PAGE_SIZE = 40 + def _extract_entries(self, playlist_data_url, show_id, idx, query, url): + query['callback'] = 'cb' + playlist_data = self._download_json( + playlist_data_url, show_id, query=query, + note='Downloading playlist data page %d' % (idx + 1), + transform_source=lambda s: js_to_json(strip_jsonp(s)))['html'] + drama_list = (get_element_by_class('p-drama-grid', playlist_data) or + get_element_by_class('p-drama-half-row', playlist_data)) + if drama_list is None: + raise ExtractorError('No episodes found') + video_urls = re.findall(r'<a[^>]+href="([^"]+)"', drama_list) + return playlist_data, [ + self.url_result(urljoin(url, video_url), YoukuIE.ie_key()) + for video_url in video_urls] def _real_extract(self, url): show_id = self._match_id(url) webpage = self._download_webpage(url, show_id) - entries = [] page_config = self._parse_json(self._search_regex( r'var\s+PageConfig\s*=\s*({.+});', webpage, 'page config'), show_id, transform_source=js_to_json) - for idx in itertools.count(0): - if idx == 0: - playlist_data_url = 'http://list.youku.com/show/module' - query = {'id': page_config['showid'], 'tab': 'point'} - else: - playlist_data_url = 'http://list.youku.com/show/point' - query = { + first_page, entries = self._extract_entries( + 'http://list.youku.com/show/module', show_id, 0, { + 'id': page_config['showid'], + 'tab': 'showInfo', + }, url) + # The first reload_id has the same items as first_page + reload_ids = re.findall('<li[^>]+data-id="([^"]+)">', first_page)[1:] + for idx, reload_id in enumerate(reload_ids): + _, new_entries = self._extract_entries( + 'http://list.youku.com/show/episode', show_id, idx + 1, { 'id': page_config['showid'], - 'stage': 'reload_%d' % (self._PAGE_SIZE * idx + 1), - } - query['callback'] = 'cb' - playlist_data = self._download_json( - playlist_data_url, show_id, query=query, - note='Downloading playlist data page %d' % (idx + 1), - transform_source=lambda s: js_to_json(strip_jsonp(s)))['html'] - video_urls = re.findall( - r'<div[^>]+class="p-thumb"[^<]+<a[^>]+href="([^"]+)"', - playlist_data) - new_entries = [ - self.url_result(urljoin(url, video_url), YoukuIE.ie_key()) - for video_url in video_urls] + 'stage': reload_id, + }, url) entries.extend(new_entries) - if len(new_entries) < self._PAGE_SIZE: - break desc = self._html_search_meta('description', webpage, fatal=False) playlist_title = desc.split(',')[0] if desc else None |