diff options
author | Philipp Hagemeister <phihag@phihag.de> | 2014-01-20 11:36:47 +0100 |
---|---|---|
committer | Philipp Hagemeister <phihag@phihag.de> | 2014-01-20 11:36:47 +0100 |
commit | b7ab05908440915c6c5faa541abe00c62a88bc27 (patch) | |
tree | 3b7e87361b7dce60ff7bdbe13bd33844fcb7d18e /youtube_dl/extractor/youtube.py | |
parent | c91778f8c0ba120378cb806f694fdc3f94a5634c (diff) |
Add infrastructure for paged lists
This commit allows to download pages in playlists as needed instead of all at once.
Before this commit,
youtube-dl http://www.youtube.com/user/ANNnewsCH/videos --playlist-end 2 --skip-download
took quite some time - now it's almost instantaneous.
As an example, the youtube:user extractor has been converted.
Fixes #2175
Diffstat (limited to 'youtube_dl/extractor/youtube.py')
-rw-r--r-- | youtube_dl/extractor/youtube.py | 28 |
1 files changed, 10 insertions, 18 deletions
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 248b30ffb..dd1a58f3f 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -27,6 +27,7 @@ from ..utils import ( get_element_by_id, get_element_by_attribute, ExtractorError, + PagedList, RegexNotFoundError, unescapeHTML, unified_strdate, @@ -1580,44 +1581,35 @@ class YoutubeUserIE(InfoExtractor): # page by page until there are no video ids - it means we got # all of them. - url_results = [] - - for pagenum in itertools.count(0): + def download_page(pagenum): start_index = pagenum * self._GDATA_PAGE_SIZE + 1 gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index) - page = self._download_webpage(gdata_url, username, - u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE)) + page = self._download_webpage( + gdata_url, username, + u'Downloading video ids from %d to %d' % ( + start_index, start_index + self._GDATA_PAGE_SIZE)) try: response = json.loads(page) except ValueError as err: raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err)) if 'entry' not in response['feed']: - # Number of videos is a multiple of self._MAX_RESULTS - break + return # Extract video identifiers entries = response['feed']['entry'] for entry in entries: title = entry['title']['$t'] video_id = entry['id']['$t'].split('/')[-1] - url_results.append({ + yield { '_type': 'url', 'url': video_id, 'ie_key': 'Youtube', 'id': 'video_id', 'title': title, - }) - - # A little optimization - if current page is not - # "full", ie. does not contain PAGE_SIZE video ids then - # we can assume that this page is the last one - there - # are no more ids on further pages - no need to query - # again. - - if len(entries) < self._GDATA_PAGE_SIZE: - break + } + url_results = PagedList(download_page, self._GDATA_PAGE_SIZE) return self.playlist_result(url_results, playlist_title=username) |