author    Philipp Hagemeister <phihag@phihag.de>  2014-01-20 11:36:47 +0100
committer Philipp Hagemeister <phihag@phihag.de>  2014-01-20 11:36:47 +0100
commit    b7ab05908440915c6c5faa541abe00c62a88bc27 (patch)
tree      3b7e87361b7dce60ff7bdbe13bd33844fcb7d18e
parent    c91778f8c0ba120378cb806f694fdc3f94a5634c (diff)
Add infrastructure for paged lists
This commit makes it possible to download playlist pages as needed instead of all at once. Before this commit,

    youtube-dl http://www.youtube.com/user/ANNnewsCH/videos --playlist-end 2 --skip-download

took quite some time - now it's almost instantaneous. As an example, the youtube:user extractor has been converted.

Fixes #2175
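To make the intent concrete, here is a minimal usage sketch of the new PagedList class added in youtube_dl/utils.py below; the fetch counter is illustrative and not part of the commit:

    from youtube_dl.utils import PagedList

    fetched = []

    def get_page(pagenum):
        fetched.append(pagenum)  # record which pages were actually downloaded
        return range(pagenum * 50, pagenum * 50 + 50)

    pl = PagedList(get_page, 50)
    print(pl.getslice(0, 2))  # [0, 1] - e.g. --playlist-end 2
    print(fetched)            # [0] - only the first page was fetched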
-rw-r--r--  test/test_utils.py               22
-rw-r--r--  youtube_dl/YoutubeDL.py          23
-rw-r--r--  youtube_dl/extractor/youtube.py  28
-rw-r--r--  youtube_dl/utils.py              44
4 files changed, 92 insertions, 25 deletions
diff --git a/test/test_utils.py b/test/test_utils.py
index bee355ee0..349c1107f 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -18,6 +18,7 @@ from youtube_dl.utils import (
find_xpath_attr,
get_meta_content,
orderedSet,
+ PagedList,
parse_duration,
sanitize_filename,
shell_quote,
@@ -200,5 +201,26 @@ class TestUtil(unittest.TestCase):
self.assertEqual(parse_duration('9:12:43'), 33163)
self.assertEqual(parse_duration('x:y'), None)
+ def test_paged_list(self):
+ def testPL(size, pagesize, sliceargs, expected):
+ def get_page(pagenum):
+ firstid = pagenum * pagesize
+ upto = min(size, pagenum * pagesize + pagesize)
+ for i in range(firstid, upto):
+ yield i
+
+ pl = PagedList(get_page, pagesize)
+ got = pl.getslice(*sliceargs)
+ self.assertEqual(got, expected)
+
+ testPL(5, 2, (), [0, 1, 2, 3, 4])
+ testPL(5, 2, (1,), [1, 2, 3, 4])
+ testPL(5, 2, (2,), [2, 3, 4])
+ testPL(5, 2, (4,), [4])
+ testPL(5, 2, (0, 3), [0, 1, 2])
+ testPL(5, 2, (1, 4), [1, 2, 3])
+ testPL(5, 2, (2, 99), [2, 3, 4])
+ testPL(5, 2, (20, 99), [])
+
if __name__ == '__main__':
unittest.main()
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py
index a0ab89b3d..2ad6f1028 100644
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -39,6 +39,7 @@ from .utils import (
locked_file,
make_HTTPS_handler,
MaxDownloadsReached,
+ PagedList,
PostProcessingError,
platform_name,
preferredencoding,
@@ -575,19 +576,27 @@ class YoutubeDL(object):
playlist_results = []
- n_all_entries = len(ie_result['entries'])
playliststart = self.params.get('playliststart', 1) - 1
playlistend = self.params.get('playlistend', None)
# For backwards compatibility, interpret -1 as whole list
if playlistend == -1:
playlistend = None
- entries = ie_result['entries'][playliststart:playlistend]
- n_entries = len(entries)
-
- self.to_screen(
- "[%s] playlist '%s': Collected %d video ids (downloading %d of them)" %
- (ie_result['extractor'], playlist, n_all_entries, n_entries))
+ if isinstance(ie_result['entries'], list):
+ n_all_entries = len(ie_result['entries'])
+ entries = ie_result['entries'][playliststart:playlistend]
+ n_entries = len(entries)
+ self.to_screen(
+ "[%s] playlist %s: Collected %d video ids (downloading %d of them)" %
+ (ie_result['extractor'], playlist, n_all_entries, n_entries))
+ else:
+ assert isinstance(ie_result['entries'], PagedList)
+ entries = ie_result['entries'].getslice(
+ playliststart, playlistend)
+ n_entries = len(entries)
+ self.to_screen(
+ "[%s] playlist %s: Downloading %d videos" %
+ (ie_result['extractor'], playlist, n_entries))
for i, entry in enumerate(entries, 1):
self.to_screen('[download] Downloading video #%s of %s' % (i, n_entries))
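Condensed, the new logic in this hunk amounts to the following dispatch (select_entries is a hypothetical helper written for illustration, not a real youtube-dl function):

    from youtube_dl.utils import PagedList

    def select_entries(entries, playliststart, playlistend):
        # Plain lists keep the old eager behaviour; a PagedList fetches
        # only the pages the requested slice actually touches.
        if isinstance(entries, list):
            return entries[playliststart:playlistend]
        assert isinstance(entries, PagedList)
        return entries.getslice(playliststart, playlistend)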
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 248b30ffb..dd1a58f3f 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -27,6 +27,7 @@ from ..utils import (
get_element_by_id,
get_element_by_attribute,
ExtractorError,
+ PagedList,
RegexNotFoundError,
unescapeHTML,
unified_strdate,
@@ -1580,44 +1581,35 @@ class YoutubeUserIE(InfoExtractor):
# page by page until there are no video ids - it means we got
# all of them.
- url_results = []
-
- for pagenum in itertools.count(0):
+ def download_page(pagenum):
start_index = pagenum * self._GDATA_PAGE_SIZE + 1
gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
- page = self._download_webpage(gdata_url, username,
- u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
+ page = self._download_webpage(
+ gdata_url, username,
+ u'Downloading video ids from %d to %d' % (
+ start_index, start_index + self._GDATA_PAGE_SIZE))
try:
response = json.loads(page)
except ValueError as err:
raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
if 'entry' not in response['feed']:
- # Number of videos is a multiple of self._MAX_RESULTS
- break
+ return
# Extract video identifiers
entries = response['feed']['entry']
for entry in entries:
title = entry['title']['$t']
video_id = entry['id']['$t'].split('/')[-1]
- url_results.append({
+ yield {
'_type': 'url',
'url': video_id,
'ie_key': 'Youtube',
'id': 'video_id',
'title': title,
- })
-
- # A little optimization - if current page is not
- # "full", ie. does not contain PAGE_SIZE video ids then
- # we can assume that this page is the last one - there
- # are no more ids on further pages - no need to query
- # again.
-
- if len(entries) < self._GDATA_PAGE_SIZE:
- break
+ }
+ url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)
return self.playlist_result(url_results, playlist_title=username)
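The conversion above suggests a pattern other extractors could follow: replace the accumulate-everything loop with a function that yields one page worth of entries and wrap it in a PagedList. A self-contained schematic, where download_entries and PAGE_SIZE are placeholders rather than real youtube-dl names:

    from youtube_dl.utils import PagedList

    PAGE_SIZE = 50  # hypothetical page size

    def download_entries(pagenum):
        # Placeholder for a network fetch; a real extractor would
        # download and parse one page of API results here.
        return []

    def page_func(pagenum):
        # Yield the entries of a single page; yielding nothing
        # simply produces an empty page.
        for entry in download_entries(pagenum):
            yield entry

    url_results = PagedList(page_func, PAGE_SIZE)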
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 73fe1ad0a..ff124d9e8 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -6,6 +6,7 @@ import datetime
import email.utils
import errno
import gzip
+import itertools
import io
import json
import locale
@@ -1161,3 +1162,46 @@ def check_executable(exe, args=[]):
except OSError:
return False
return exe
+
+
+class PagedList(object):
+ def __init__(self, pagefunc, pagesize):
+ self._pagefunc = pagefunc
+ self._pagesize = pagesize
+
+ def getslice(self, start=0, end=None):
+ res = []
+ for pagenum in itertools.count(start // self._pagesize):
+ firstid = pagenum * self._pagesize
+ nextfirstid = pagenum * self._pagesize + self._pagesize
+ if start >= nextfirstid:
+ continue
+
+ page_results = list(self._pagefunc(pagenum))
+
+ startv = (
+ start % self._pagesize
+ if firstid <= start < nextfirstid
+ else 0)
+
+ endv = (
+ ((end - 1) % self._pagesize) + 1
+ if (end is not None and firstid <= end <= nextfirstid)
+ else None)
+
+ if startv != 0 or endv is not None:
+ page_results = page_results[startv:endv]
+ res.extend(page_results)
+
+ # A little optimization - if the current page is not "full",
+ # i.e. does not contain page_size videos, then we can assume that
+ # this page is the last one - there are no more ids on further
+ # pages - no need to query again.
+ if len(page_results) + startv < self._pagesize:
+ break
+
+ # If we got the whole page, but the next page is not interesting,
+ # break out early as well
+ if end == nextfirstid:
+ break
+ return res
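To see the slice arithmetic at work, trace getslice(1, 4) with 5 items in pages of 2 - the same case the new test covers:

    from youtube_dl.utils import PagedList

    def get_page(pagenum):  # pages: [0, 1], [2, 3], [4]
        return range(pagenum * 2, min(5, pagenum * 2 + 2))

    pl = PagedList(get_page, 2)
    assert pl.getslice(1, 4) == [1, 2, 3]
    # page 0: firstid=0, nextfirstid=2 -> startv=1, endv=None -> keeps [1]
    # page 1: firstid=2, nextfirstid=4 -> startv=0, endv=2    -> keeps [2, 3]
    # end == nextfirstid == 4, so the loop breaks and page 2 is never fetched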