diff options
Diffstat (limited to 'youtube_dl/extractor/vimeo.py')
| -rw-r--r-- | youtube_dl/extractor/vimeo.py | 96 | 
1 files changed, 81 insertions, 15 deletions
| diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index bc01d7fbf..d2c36b58a 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -8,17 +8,19 @@ import itertools  from .common import InfoExtractor  from .subtitles import SubtitlesInfoExtractor  from ..utils import ( +    clean_html,      compat_HTTPError,      compat_urllib_parse,      compat_urllib_request, -    clean_html, -    get_element_by_attribute, +    compat_urlparse,      ExtractorError, +    get_element_by_attribute, +    InAdvancePagedList, +    int_or_none,      RegexNotFoundError,      std_headers,      unsmuggle_url,      urlencode_postdata, -    int_or_none,  ) @@ -89,6 +91,7 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor):                  'uploader_id': 'openstreetmapus',                  'uploader': 'OpenStreetMap US',                  'title': 'Andy Allan - Putting the Carto into OpenStreetMap Cartography', +                'description': 'md5:380943ec71b89736ff4bf27183233d09',                  'duration': 1595,              },          }, @@ -103,6 +106,7 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor):                  'uploader': 'The BLN & Business of Software',                  'uploader_id': 'theblnbusinessofsoftware',                  'duration': 3610, +                'description': None,              },          },          { @@ -117,6 +121,7 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor):                  'uploader_id': 'user18948128',                  'uploader': 'Jaime Marquínez Ferrándiz',                  'duration': 10, +                'description': 'This is "youtube-dl password protected test video" by Jaime Marquínez Ferrándiz on Vimeo, the home for high quality videos and the people who love them.',              },              'params': {                  'videopassword': 'youtube-dl', @@ -203,6 +208,7 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor):          # Extract ID from URL          mobj = re.match(self._VALID_URL, url)          video_id = mobj.group('id') +        orig_url = url          if mobj.group('pro') or mobj.group('player'):              url = 'http://player.vimeo.com/video/' + video_id @@ -273,18 +279,23 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor):                  _, video_thumbnail = sorted((int(width if width.isdigit() else 0), t_url) for (width, t_url) in video_thumbs.items())[-1]          # Extract video description -        video_description = None -        try: -            video_description = get_element_by_attribute("class", "description_wrapper", webpage) -            if video_description: -                video_description = clean_html(video_description) -        except AssertionError as err: -            # On some pages like (http://player.vimeo.com/video/54469442) the -            # html tags are not closed, python 2.6 cannot handle it -            if err.args[0] == 'we should not get here!': -                pass -            else: -                raise + +        video_description = self._html_search_regex( +            r'(?s)<div\s+class="[^"]*description[^"]*"[^>]*>(.*?)</div>', +            webpage, 'description', default=None) +        if not video_description: +            video_description = self._html_search_meta( +                'description', webpage, default=None) +        if not video_description and mobj.group('pro'): +            orig_webpage = self._download_webpage( +                orig_url, video_id, +                note='Downloading webpage for description', +                fatal=False) +            if orig_webpage: +                video_description = self._html_search_meta( +                    'description', orig_webpage, default=None) +        if not video_description and not mobj.group('player'): +            self._downloader.report_warning('Cannot find video description')          # Extract video duration          video_duration = int_or_none(config["video"].get("duration")) @@ -529,3 +540,58 @@ class VimeoWatchLaterIE(VimeoBaseInfoExtractor, VimeoChannelIE):      def _real_extract(self, url):          return self._extract_videos('watchlater', 'https://vimeo.com/home/watchlater') + + +class VimeoLikesIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?vimeo\.com/user(?P<id>[0-9]+)/likes/?(?:$|[?#]|sort:)' +    IE_NAME = 'vimeo:likes' +    IE_DESC = 'Vimeo user likes' +    _TEST = { +        'url': 'https://vimeo.com/user755559/likes/', +        'playlist_mincount': 293, +        "info_dict": { +            "description": "See all the videos urza likes", +            "title": 'Videos urza likes', +        }, +    } + +    def _real_extract(self, url): +        user_id = self._match_id(url) +        webpage = self._download_webpage(url, user_id) +        page_count = self._int( +            self._search_regex( +                r'''(?x)<li><a\s+href="[^"]+"\s+data-page="([0-9]+)"> +                    .*?</a></li>\s*<li\s+class="pagination_next"> +                ''', webpage, 'page count'), +            'page count', fatal=True) +        PAGE_SIZE = 12 +        title = self._html_search_regex( +            r'(?s)<h1>(.+?)</h1>', webpage, 'title', fatal=False) +        description = self._html_search_meta('description', webpage) + +        def _get_page(idx): +            page_url = '%s//vimeo.com/user%s/likes/page:%d/sort:date' % ( +                self.http_scheme(), user_id, idx + 1) +            webpage = self._download_webpage( +                page_url, user_id, +                note='Downloading page %d/%d' % (idx + 1, page_count)) +            video_list = self._search_regex( +                r'(?s)<ol class="js-browse_list[^"]+"[^>]*>(.*?)</ol>', +                webpage, 'video content') +            paths = re.findall( +                r'<li[^>]*>\s*<a\s+href="([^"]+)"', video_list) +            for path in paths: +                yield { +                    '_type': 'url', +                    'url': compat_urlparse.urljoin(page_url, path), +                } + +        pl = InAdvancePagedList(_get_page, page_count, PAGE_SIZE) + +        return { +            '_type': 'playlist', +            'id': 'user%s_likes' % user_id, +            'title': title, +            'description': description, +            'entries': pl, +        } | 
