author     Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>  2015-06-09 23:49:11 +0200
committer  Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>  2015-06-09 23:49:11 +0200
commit     70219b0f4371fe54cc72d025ce06fc4691ba12fa
tree       76cd48b86b3890dc692839c67c923ea52e9d1b44  /youtube_dl/extractor/youtube.py
parent     bd5bc0cd5af257abf7a1a4c14a9dd39c4f97e622
[youtube:playlist] Use an iterator for the entries (closes #5935)
So that '--playlist-end' downloads only the required pages.
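
The change replaces the eager page-by-page accumulation of video ids with a generator, so entries are produced lazily and pagination stops as soon as the caller stops asking for more. Below is a minimal, self-contained sketch of that idea (not the youtube-dl code itself; fetch_page() and the five-page limit are made up for illustration): slicing the generator with itertools.islice, the way a '--playlist-end'-style limit would, means later pages are never requested.

    import itertools


    def fetch_page(page_num):
        # Hypothetical stand-in for the real page download.
        print('downloading page #%d' % page_num)
        return ['video-%d-%d' % (page_num, i) for i in range(10)]


    def entries():
        # Yield ids one page at a time; the next page is only fetched
        # when the consumer asks for more items.
        for page_num in itertools.count(1):
            for video_id in fetch_page(page_num):
                yield video_id
            if page_num >= 5:  # pretend the playlist has only five pages
                return


    # Only page #1 is downloaded, because islice stops pulling from the
    # generator after the first three entries.
    print(list(itertools.islice(entries(), 3)))

Returning a list, as before, would have forced every "Load more" page to be fetched up front regardless of any playlist limit.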
Diffstat (limited to 'youtube_dl/extractor/youtube.py')
-rw-r--r--  youtube_dl/extractor/youtube.py  49
1 file changed, 24 insertions(+), 25 deletions(-)
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 083da777d..3448bec4f 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -1290,7 +1290,6 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
     def _extract_playlist(self, playlist_id):
         url = self._TEMPLATE_URL % playlist_id
         page = self._download_webpage(url, playlist_id)
-        more_widget_html = content_html = page
 
         for match in re.findall(r'<div class="yt-alert-message">([^<]+)</div>', page):
             match = match.strip()
@@ -1310,36 +1309,36 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
             self.report_warning('Youtube gives an alert message: ' + match)
 
         # Extract the video ids from the playlist pages
-        ids = []
-
-        for page_num in itertools.count(1):
-            matches = re.finditer(self._VIDEO_RE, content_html)
-            # We remove the duplicates and the link with index 0
-            # (it's not the first video of the playlist)
-            new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
-            ids.extend(new_ids)
-
-            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
-            if not mobj:
-                break
+        def _entries():
+            more_widget_html = content_html = page
+            for page_num in itertools.count(1):
+                matches = re.finditer(self._VIDEO_RE, content_html)
+                # We remove the duplicates and the link with index 0
+                # (it's not the first video of the playlist)
+                new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
+                for vid_id in new_ids:
+                    yield self.url_result(vid_id, 'Youtube', video_id=vid_id)
+
+                mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
+                if not mobj:
+                    break
 
-            more = self._download_json(
-                'https://youtube.com/%s' % mobj.group('more'), playlist_id,
-                'Downloading page #%s' % page_num,
-                transform_source=uppercase_escape)
-            content_html = more['content_html']
-            if not content_html.strip():
-                # Some webpages show a "Load more" button but they don't
-                # have more videos
-                break
-            more_widget_html = more['load_more_widget_html']
+                more = self._download_json(
+                    'https://youtube.com/%s' % mobj.group('more'), playlist_id,
+                    'Downloading page #%s' % page_num,
+                    transform_source=uppercase_escape)
+                content_html = more['content_html']
+                if not content_html.strip():
+                    # Some webpages show a "Load more" button but they don't
+                    # have more videos
+                    break
+                more_widget_html = more['load_more_widget_html']
 
         playlist_title = self._html_search_regex(
             r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
             page, 'title')
 
-        url_results = self._ids_to_results(ids)
-        return self.playlist_result(url_results, playlist_id, playlist_title)
+        return self.playlist_result(_entries(), playlist_id, playlist_title)
 
     def _real_extract(self, url):
         # Extract playlist id