diff options
Diffstat (limited to 'youtube_dl/extractor/vimeo.py')
-rw-r--r-- | youtube_dl/extractor/vimeo.py | 99 |
1 files changed, 66 insertions, 33 deletions
diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 4be1b8785..d2c36b58a 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -8,18 +8,19 @@ import itertools from .common import InfoExtractor from .subtitles import SubtitlesInfoExtractor from ..utils import ( + clean_html, compat_HTTPError, compat_urllib_parse, compat_urllib_request, - clean_html, - get_element_by_attribute, + compat_urlparse, ExtractorError, + get_element_by_attribute, + InAdvancePagedList, + int_or_none, RegexNotFoundError, - smuggle_url, std_headers, unsmuggle_url, urlencode_postdata, - int_or_none, ) @@ -90,6 +91,7 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor): 'uploader_id': 'openstreetmapus', 'uploader': 'OpenStreetMap US', 'title': 'Andy Allan - Putting the Carto into OpenStreetMap Cartography', + 'description': 'md5:380943ec71b89736ff4bf27183233d09', 'duration': 1595, }, }, @@ -104,6 +106,7 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor): 'uploader': 'The BLN & Business of Software', 'uploader_id': 'theblnbusinessofsoftware', 'duration': 3610, + 'description': None, }, }, { @@ -118,6 +121,7 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor): 'uploader_id': 'user18948128', 'uploader': 'Jaime Marquínez Ferrándiz', 'duration': 10, + 'description': 'This is "youtube-dl password protected test video" by Jaime Marquínez Ferrándiz on Vimeo, the home for high quality videos and the people who love them.', }, 'params': { 'videopassword': 'youtube-dl', @@ -204,6 +208,7 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor): # Extract ID from URL mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') + orig_url = url if mobj.group('pro') or mobj.group('player'): url = 'http://player.vimeo.com/video/' + video_id @@ -274,18 +279,23 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor): _, video_thumbnail = sorted((int(width if width.isdigit() else 0), t_url) for (width, t_url) in video_thumbs.items())[-1] # Extract video description - video_description = None - try: - video_description = get_element_by_attribute("class", "description_wrapper", webpage) - if video_description: - video_description = clean_html(video_description) - except AssertionError as err: - # On some pages like (http://player.vimeo.com/video/54469442) the - # html tags are not closed, python 2.6 cannot handle it - if err.args[0] == 'we should not get here!': - pass - else: - raise + + video_description = self._html_search_regex( + r'(?s)<div\s+class="[^"]*description[^"]*"[^>]*>(.*?)</div>', + webpage, 'description', default=None) + if not video_description: + video_description = self._html_search_meta( + 'description', webpage, default=None) + if not video_description and mobj.group('pro'): + orig_webpage = self._download_webpage( + orig_url, video_id, + note='Downloading webpage for description', + fatal=False) + if orig_webpage: + video_description = self._html_search_meta( + 'description', orig_webpage, default=None) + if not video_description and not mobj.group('player'): + self._downloader.report_warning('Cannot find video description') # Extract video duration video_duration = int_or_none(config["video"].get("duration")) @@ -533,32 +543,55 @@ class VimeoWatchLaterIE(VimeoBaseInfoExtractor, VimeoChannelIE): class VimeoLikesIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?vimeo\.com/user(?P<id>[0-9]+)/likes(?:$|[?#])' + _VALID_URL = r'https?://(?:www\.)?vimeo\.com/user(?P<id>[0-9]+)/likes/?(?:$|[?#]|sort:)' IE_NAME = 'vimeo:likes' IE_DESC = 'Vimeo user likes' _TEST = { - 'url': 'https://vimeo.com/user20132939/likes', - 'playlist_mincount': 4, - 'add_ies': ['Generic'], + 'url': 'https://vimeo.com/user755559/likes/', + 'playlist_mincount': 293, "info_dict": { - "description": "Videos Philipp Hagemeister likes on Vimeo.", - "title": "Vimeo / Philipp Hagemeister's likes", - }, - 'params': { - 'extract_flat': False, + "description": "See all the videos urza likes", + "title": 'Videos urza likes', }, } def _real_extract(self, url): user_id = self._match_id(url) - rss_url = '%s//vimeo.com/user%s/likes/rss' % ( - self.http_scheme(), user_id) - surl = smuggle_url(rss_url, { - 'force_videoid': '%s_likes' % user_id, - 'to_generic': True, - }) + webpage = self._download_webpage(url, user_id) + page_count = self._int( + self._search_regex( + r'''(?x)<li><a\s+href="[^"]+"\s+data-page="([0-9]+)"> + .*?</a></li>\s*<li\s+class="pagination_next"> + ''', webpage, 'page count'), + 'page count', fatal=True) + PAGE_SIZE = 12 + title = self._html_search_regex( + r'(?s)<h1>(.+?)</h1>', webpage, 'title', fatal=False) + description = self._html_search_meta('description', webpage) + + def _get_page(idx): + page_url = '%s//vimeo.com/user%s/likes/page:%d/sort:date' % ( + self.http_scheme(), user_id, idx + 1) + webpage = self._download_webpage( + page_url, user_id, + note='Downloading page %d/%d' % (idx + 1, page_count)) + video_list = self._search_regex( + r'(?s)<ol class="js-browse_list[^"]+"[^>]*>(.*?)</ol>', + webpage, 'video content') + paths = re.findall( + r'<li[^>]*>\s*<a\s+href="([^"]+)"', video_list) + for path in paths: + yield { + '_type': 'url', + 'url': compat_urlparse.urljoin(page_url, path), + } + + pl = InAdvancePagedList(_get_page, page_count, PAGE_SIZE) return { - '_type': 'url', - 'url': surl, + '_type': 'playlist', + 'id': 'user%s_likes' % user_id, + 'title': title, + 'description': description, + 'entries': pl, } |