diff options
| field | value | date |
|---|---|---|
| author | Philipp Hagemeister <phihag@phihag.de> | 2014-09-29 00:36:06 +0200 |
| committer | Philipp Hagemeister <phihag@phihag.de> | 2014-09-29 00:36:06 +0200 |
| commit | 9c44d2429b90dece734df778c63b04c15e91c1ca (patch) | |
| tree | 8ab92c7d2e6e9ca62d261a42385332462ba08949 | |
| parent | d2e32f7df56ab497175437bffdcdfedbd71ca8d9 (diff) | |
[vimeo:likes] Support large like lists (Fixes #3847)
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | test/test_utils.py | 9 |
| -rw-r--r-- | youtube_dl/extractor/vimeo.py | 66 |
| -rw-r--r-- | youtube_dl/extractor/youtube.py | 4 |
| -rw-r--r-- | youtube_dl/utils.py | 39 |
4 files changed, 89 insertions, 29 deletions
diff --git a/test/test_utils.py b/test/test_utils.py index 3efbed29d..6419b3ca9 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -22,7 +22,8 @@ from youtube_dl.utils import (      fix_xml_ampersands,      get_meta_content,      orderedSet, -    PagedList, +    OnDemandPagedList, +    InAdvancePagedList,      parse_duration,      read_batch_urls,      sanitize_filename, @@ -246,10 +247,14 @@ class TestUtil(unittest.TestCase):                  for i in range(firstid, upto):                      yield i -            pl = PagedList(get_page, pagesize) +            pl = OnDemandPagedList(get_page, pagesize)              got = pl.getslice(*sliceargs)              self.assertEqual(got, expected) +            iapl = InAdvancePagedList(get_page, size // pagesize + 1, pagesize) +            got = iapl.getslice(*sliceargs) +            self.assertEqual(got, expected) +          testPL(5, 2, (), [0, 1, 2, 3, 4])          testPL(5, 2, (1,), [1, 2, 3, 4])          testPL(5, 2, (2,), [2, 3, 4]) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 4be1b8785..403d0bb28 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -8,18 +8,19 @@ import itertools  from .common import InfoExtractor  from .subtitles import SubtitlesInfoExtractor  from ..utils import ( +    clean_html,      compat_HTTPError,      compat_urllib_parse,      compat_urllib_request, -    clean_html, -    get_element_by_attribute, +    compat_urlparse,      ExtractorError, +    get_element_by_attribute, +    InAdvancePagedList, +    int_or_none,      RegexNotFoundError, -    smuggle_url,      std_headers,      unsmuggle_url,      urlencode_postdata, -    int_or_none,  ) @@ -533,32 +534,55 @@ class VimeoWatchLaterIE(VimeoBaseInfoExtractor, VimeoChannelIE):  class VimeoLikesIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?vimeo\.com/user(?P<id>[0-9]+)/likes(?:$|[?#])' +    _VALID_URL = 
r'https?://(?:www\.)?vimeo\.com/user(?P<id>[0-9]+)/likes/?(?:$|[?#]|sort:)'      IE_NAME = 'vimeo:likes'      IE_DESC = 'Vimeo user likes'      _TEST = { -        'url': 'https://vimeo.com/user20132939/likes', -        'playlist_mincount': 4, -        'add_ies': ['Generic'], +        'url': 'https://vimeo.com/user755559/likes/', +        'playlist_mincount': 293,          "info_dict": { -            "description": "Videos Philipp Hagemeister likes on Vimeo.", -            "title": "Vimeo / Philipp Hagemeister's likes", -        }, -        'params': { -            'extract_flat': False, +            "description": "See all the videos urza likes", +            "title": 'Videos urza likes',          },      }      def _real_extract(self, url):          user_id = self._match_id(url) -        rss_url = '%s//vimeo.com/user%s/likes/rss' % ( -            self.http_scheme(), user_id) -        surl = smuggle_url(rss_url, { -            'force_videoid': '%s_likes' % user_id, -            'to_generic': True, -        }) +        webpage = self._download_webpage(url, user_id) +        page_count = self._int( +            self._search_regex( +                r'''(?x)<li><a\s+href="[^"]+"\s+data-page="([0-9]+)"> +                    .*?</a></li>\s*<li\s+class="pagination_next"> +                ''', webpage, 'page count'), +            'page count', fatal=True) +        PAGE_SIZE = 12 +        title = self._html_search_regex( +            r'(?s)<h1>(.+?)</h1>', webpage, 'title', fatal=False) +        description = self._html_search_meta('description', webpage) + +        def _get_page(idx): +            page_url = '%s//vimeo.com/user%s/likes/page:%d/sort:date' % ( +                self.http_scheme(), user_id, idx + 1) +            webpage = self._download_webpage( +                page_url, user_id, +                note='Downloading page %d/%d' % (idx + 1, page_count)) +            video_list = self._search_regex( +                r'(?s)<ol 
class="js-browse_list[^"]+"[^>]*>(.*?)</ol>', +                webpage, 'video content') +            paths = re.findall( +                r'<li[^>]*>\s*<a\s+href="([^"]+)"', video_list) +            for path in paths: +                yield { +                    '_type': 'url', +                    'url': compat_urlparse.urljoin(page_url, path), +                } + +        pl = InAdvancePagedList(_get_page, page_count, PAGE_SIZE)          return { -            '_type': 'url', -            'url': surl, +            '_type': 'playlist', +            'id': 'user%s_likes' % user_id, +            'title': title, +            'description': description, +            'entries': pl,          } diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 99198e380..045507bc7 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -26,7 +26,7 @@ from ..utils import (      get_element_by_attribute,      ExtractorError,      int_or_none, -    PagedList, +    OnDemandPagedList,      unescapeHTML,      unified_strdate,      orderedSet, @@ -1341,7 +1341,7 @@ class YoutubeUserIE(InfoExtractor):                      'id': video_id,                      'title': title,                  } -        url_results = PagedList(download_page, self._GDATA_PAGE_SIZE) +        url_results = OnDemandPagedList(download_page, self._GDATA_PAGE_SIZE)          return self.playlist_result(url_results, playlist_title=username) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index b644f4e92..9f49507c1 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1384,14 +1384,16 @@ def check_executable(exe, args=[]):  class PagedList(object): -    def __init__(self, pagefunc, pagesize): -        self._pagefunc = pagefunc -        self._pagesize = pagesize -      def __len__(self):          # This is only useful for tests          return len(self.getslice()) + +class OnDemandPagedList(PagedList): +    def __init__(self, pagefunc, 
pagesize): +        self._pagefunc = pagefunc +        self._pagesize = pagesize +      def getslice(self, start=0, end=None):          res = []          for pagenum in itertools.count(start // self._pagesize): @@ -1430,6 +1432,35 @@ class PagedList(object):          return res +class InAdvancePagedList(PagedList): +    def __init__(self, pagefunc, pagecount, pagesize): +        self._pagefunc = pagefunc +        self._pagecount = pagecount +        self._pagesize = pagesize + +    def getslice(self, start=0, end=None): +        res = [] +        start_page = start // self._pagesize +        end_page = ( +            self._pagecount if end is None else (end // self._pagesize + 1)) +        skip_elems = start - start_page * self._pagesize +        only_more = None if end is None else end - start +        for pagenum in range(start_page, end_page): +            page = list(self._pagefunc(pagenum)) +            if skip_elems: +                page = page[skip_elems:] +                skip_elems = None +            if only_more is not None: +                if len(page) < only_more: +                    only_more -= len(page) +                else: +                    page = page[:only_more] +                    res.extend(page) +                    break +            res.extend(page) +        return res + +  def uppercase_escape(s):      unicode_escape = codecs.getdecoder('unicode_escape')      return re.sub(  | 
