diff options
| -rw-r--r-- | test/test_utils.py | 22 | ||||
| -rw-r--r-- | youtube_dl/YoutubeDL.py | 23 | ||||
| -rw-r--r-- | youtube_dl/extractor/youtube.py | 28 | ||||
| -rw-r--r-- | youtube_dl/utils.py | 44 | 
4 files changed, 92 insertions, 25 deletions
| diff --git a/test/test_utils.py b/test/test_utils.py index bee355ee0..349c1107f 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -18,6 +18,7 @@ from youtube_dl.utils import (      find_xpath_attr,      get_meta_content,      orderedSet, +    PagedList,      parse_duration,      sanitize_filename,      shell_quote, @@ -200,5 +201,26 @@ class TestUtil(unittest.TestCase):          self.assertEqual(parse_duration('9:12:43'), 33163)          self.assertEqual(parse_duration('x:y'), None) +    def test_paged_list(self): +        def testPL(size, pagesize, sliceargs, expected): +            def get_page(pagenum): +                firstid = pagenum * pagesize +                upto = min(size, pagenum * pagesize + pagesize) +                for i in range(firstid, upto): +                    yield i + +            pl = PagedList(get_page, pagesize) +            got = pl.getslice(*sliceargs) +            self.assertEqual(got, expected) + +        testPL(5, 2, (), [0, 1, 2, 3, 4]) +        testPL(5, 2, (1,), [1, 2, 3, 4]) +        testPL(5, 2, (2,), [2, 3, 4]) +        testPL(5, 2, (4,), [4]) +        testPL(5, 2, (0, 3), [0, 1, 2]) +        testPL(5, 2, (1, 4), [1, 2, 3]) +        testPL(5, 2, (2, 99), [2, 3, 4]) +        testPL(5, 2, (20, 99), []) +  if __name__ == '__main__':      unittest.main() diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index a0ab89b3d..2ad6f1028 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -39,6 +39,7 @@ from .utils import (      locked_file,      make_HTTPS_handler,      MaxDownloadsReached, +    PagedList,      PostProcessingError,      platform_name,      preferredencoding, @@ -575,19 +576,27 @@ class YoutubeDL(object):              playlist_results = [] -            n_all_entries = len(ie_result['entries'])              playliststart = self.params.get('playliststart', 1) - 1              playlistend = self.params.get('playlistend', None)              # For backwards compatibility, interpret -1 as whole list              if playlistend == -1:                  playlistend = None -            entries = ie_result['entries'][playliststart:playlistend] -            n_entries = len(entries) - -            self.to_screen( -                "[%s] playlist '%s': Collected %d video ids (downloading %d of them)" % -                (ie_result['extractor'], playlist, n_all_entries, n_entries)) +            if isinstance(ie_result['entries'], list): +                n_all_entries = len(ie_result['entries']) +                entries = ie_result['entries'][playliststart:playlistend] +                n_entries = len(entries) +                self.to_screen( +                    "[%s] playlist %s: Collected %d video ids (downloading %d of them)" % +                    (ie_result['extractor'], playlist, n_all_entries, n_entries)) +            else: +                assert isinstance(ie_result['entries'], PagedList) +                entries = ie_result['entries'].getslice( +                    playliststart, playlistend) +                n_entries = len(entries) +                self.to_screen( +                    "[%s] playlist %s: Downloading %d videos" % +                    (ie_result['extractor'], playlist, n_entries))              for i, entry in enumerate(entries, 1):                  self.to_screen('[download] Downloading video #%s of %s' % (i, n_entries)) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 248b30ffb..dd1a58f3f 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -27,6 +27,7 @@ from ..utils import (      get_element_by_id,      get_element_by_attribute,      ExtractorError, +    PagedList,      RegexNotFoundError,      unescapeHTML,      unified_strdate, @@ -1580,44 +1581,35 @@ class YoutubeUserIE(InfoExtractor):          # page by page until there are no video ids - it means we got          # all of them. -        url_results = [] - -        for pagenum in itertools.count(0): +        def download_page(pagenum):              start_index = pagenum * self._GDATA_PAGE_SIZE + 1              gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index) -            page = self._download_webpage(gdata_url, username, -                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE)) +            page = self._download_webpage( +                gdata_url, username, +                u'Downloading video ids from %d to %d' % ( +                    start_index, start_index + self._GDATA_PAGE_SIZE))              try:                  response = json.loads(page)              except ValueError as err:                  raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))              if 'entry' not in response['feed']: -                # Number of videos is a multiple of self._MAX_RESULTS -                break +                return              # Extract video identifiers              entries = response['feed']['entry']              for entry in entries:                  title = entry['title']['$t']                  video_id = entry['id']['$t'].split('/')[-1] -                url_results.append({ +                yield {                      '_type': 'url',                      'url': video_id,                      'ie_key': 'Youtube',                      'id': 'video_id',                      'title': title, -                }) - -            # A little optimization - if current page is not -            # "full", ie. does not contain PAGE_SIZE video ids then -            # we can assume that this page is the last one - there -            # are no more ids on further pages - no need to query -            # again. - -            if len(entries) < self._GDATA_PAGE_SIZE: -                break +                } +        url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)          return self.playlist_result(url_results, playlist_title=username) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 73fe1ad0a..ff124d9e8 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -6,6 +6,7 @@ import datetime  import email.utils  import errno  import gzip +import itertools  import io  import json  import locale @@ -1161,3 +1162,46 @@ def check_executable(exe, args=[]):      except OSError:          return False      return exe + + +class PagedList(object): +    def __init__(self, pagefunc, pagesize): +        self._pagefunc = pagefunc +        self._pagesize = pagesize + +    def getslice(self, start=0, end=None): +        res = [] +        for pagenum in itertools.count(start // self._pagesize): +            firstid = pagenum * self._pagesize +            nextfirstid = pagenum * self._pagesize + self._pagesize +            if start >= nextfirstid: +                continue + +            page_results = list(self._pagefunc(pagenum)) + +            startv = ( +                start % self._pagesize +                if firstid <= start < nextfirstid +                else 0) + +            endv = ( +                ((end - 1) % self._pagesize) + 1 +                if (end is not None and firstid <= end <= nextfirstid) +                else None) + +            if startv != 0 or endv is not None: +                page_results = page_results[startv:endv] +            res.extend(page_results) + +            # A little optimization - if current page is not "full", ie. does +            # not contain page_size videos then we can assume that this page +            # is the last one - there are no more ids on further pages - +            # i.e. no need to query again. +            if len(page_results) + startv < self._pagesize: +                break + +            # If we got the whole page, but the next page is not interesting, +            # break out early as well +            if end == nextfirstid: +                break +        return res | 
