| field | value | date |
|---|---|---|
| author | Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com> | 2015-06-09 23:49:11 +0200 |
| committer | Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com> | 2015-06-09 23:49:11 +0200 |
| commit | 70219b0f4371fe54cc72d025ce06fc4691ba12fa | |
| tree | 76cd48b86b3890dc692839c67c923ea52e9d1b44 | |
| parent | bd5bc0cd5af257abf7a1a4c14a9dd39c4f97e622 | |
[youtube:playlist] Use an iterator for the entries (closes #5935)
So that '--playlist-end' downloads only the required pages.
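
As a minimal illustration of the pattern this commit adopts (standalone Python, not youtube-dl code; `fetch_page` and the three-page playlist are hypothetical stand-ins for the extractor's paginated `_download_json` calls): because the entries come from a generator, a consumer that stops early never triggers the download of later pages.

```python
import itertools


def fetch_page(page_num):
    # Hypothetical network call standing in for the extractor's _download_json.
    print('downloading page #%d' % page_num)
    if page_num > 3:
        return []  # pretend the playlist has only three pages
    return ['video-%d-%d' % (page_num, i) for i in range(5)]


def entries():
    # Yield ids page by page; the next page is fetched only when the consumer
    # actually asks for more entries.
    for page_num in itertools.count(1):
        page = fetch_page(page_num)
        if not page:
            break
        for video_id in page:
            yield video_id


# Taking only the first 7 entries fetches pages 1 and 2; page 3 is never requested.
print(list(itertools.islice(entries(), 7)))
```

Building the full `ids` list up front, as the old code did, downloads every page before `--playlist-end` can discard the extra entries.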
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | youtube_dl/extractor/youtube.py | 49 |

1 file changed, 24 insertions, 25 deletions
```diff
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 083da777d..3448bec4f 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -1290,7 +1290,6 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
     def _extract_playlist(self, playlist_id):
         url = self._TEMPLATE_URL % playlist_id
         page = self._download_webpage(url, playlist_id)
-        more_widget_html = content_html = page
 
         for match in re.findall(r'<div class="yt-alert-message">([^<]+)</div>', page):
             match = match.strip()
@@ -1310,36 +1309,36 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
                 self.report_warning('Youtube gives an alert message: ' + match)
 
         # Extract the video ids from the playlist pages
-        ids = []
-
-        for page_num in itertools.count(1):
-            matches = re.finditer(self._VIDEO_RE, content_html)
-            # We remove the duplicates and the link with index 0
-            # (it's not the first video of the playlist)
-            new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
-            ids.extend(new_ids)
-
-            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
-            if not mobj:
-                break
+        def _entries():
+            more_widget_html = content_html = page
+            for page_num in itertools.count(1):
+                matches = re.finditer(self._VIDEO_RE, content_html)
+                # We remove the duplicates and the link with index 0
+                # (it's not the first video of the playlist)
+                new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
+                for vid_id in new_ids:
+                    yield self.url_result(vid_id, 'Youtube', video_id=vid_id)
+
+                mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
+                if not mobj:
+                    break
 
-            more = self._download_json(
-                'https://youtube.com/%s' % mobj.group('more'), playlist_id,
-                'Downloading page #%s' % page_num,
-                transform_source=uppercase_escape)
-            content_html = more['content_html']
-            if not content_html.strip():
-                # Some webpages show a "Load more" button but they don't
-                # have more videos
-                break
-            more_widget_html = more['load_more_widget_html']
+                more = self._download_json(
+                    'https://youtube.com/%s' % mobj.group('more'), playlist_id,
+                    'Downloading page #%s' % page_num,
+                    transform_source=uppercase_escape)
+                content_html = more['content_html']
+                if not content_html.strip():
+                    # Some webpages show a "Load more" button but they don't
+                    # have more videos
+                    break
+                more_widget_html = more['load_more_widget_html']
 
         playlist_title = self._html_search_regex(
             r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
             page, 'title')
 
-        url_results = self._ids_to_results(ids)
-        return self.playlist_result(url_results, playlist_id, playlist_title)
+        return self.playlist_result(_entries(), playlist_id, playlist_title)
 
     def _real_extract(self, url):
         # Extract playlist id
```
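
A rough usage sketch of the effect, assuming a youtube-dl version that contains this commit (the playlist id below is a hypothetical placeholder): with `playlistend` set, the downloader consumes only as many generator entries as requested, so only the corresponding "Load more" pages are fetched.

```python
import youtube_dl

ydl_opts = {
    'playlistend': 5,      # only the first 5 playlist entries are needed
    'extract_flat': True,  # keep each entry as a flat url result, don't resolve it
    'quiet': True,
}
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
    info = ydl.extract_info(
        'https://www.youtube.com/playlist?list=PLxxxxxxxxxxxxxxxx',  # hypothetical id
        download=False)
    # Only enough playlist pages are requested to produce these 5 entries.
    for entry in info['entries']:
        print(entry.get('id') or entry.get('url'))
```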
