diff options
Diffstat (limited to 'youtube_dl/extractor/vimeo.py')
-rw-r--r-- | youtube_dl/extractor/vimeo.py | 381 |
1 files changed, 273 insertions, 108 deletions
diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 707a5735a..51c69a80c 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals import json @@ -8,6 +8,7 @@ import itertools from .common import InfoExtractor from ..compat import ( compat_HTTPError, + compat_str, compat_urlparse, ) from ..utils import ( @@ -15,6 +16,7 @@ from ..utils import ( ExtractorError, InAdvancePagedList, int_or_none, + NO_DEFAULT, RegexNotFoundError, sanitized_Request, smuggle_url, @@ -24,6 +26,7 @@ from ..utils import ( urlencode_postdata, unescapeHTML, parse_filesize, + try_get, ) @@ -54,6 +57,26 @@ class VimeoBaseInfoExtractor(InfoExtractor): self._set_vimeo_cookie('vuid', vuid) self._download_webpage(login_request, None, False, 'Wrong login info') + def _verify_video_password(self, url, video_id, webpage): + password = self._downloader.params.get('videopassword') + if password is None: + raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True) + token, vuid = self._extract_xsrft_and_vuid(webpage) + data = urlencode_postdata({ + 'password': password, + 'token': token, + }) + if url.startswith('http://'): + # vimeo only supports https now, but the user can give an http url + url = url.replace('http://', 'https://') + password_request = sanitized_Request(url + '/password', data) + password_request.add_header('Content-Type', 'application/x-www-form-urlencoded') + password_request.add_header('Referer', url) + self._set_vimeo_cookie('vuid', vuid) + return self._download_webpage( + password_request, video_id, + 'Verifying the password', 'Wrong password') + def _extract_xsrft_and_vuid(self, webpage): xsrft = self._search_regex( r'(?:(?P<q1>["\'])xsrft(?P=q1)\s*:|xsrft\s*[=:])\s*(?P<q>["\'])(?P<xsrft>.+?)(?P=q)', @@ -66,6 +89,69 @@ class VimeoBaseInfoExtractor(InfoExtractor): def _set_vimeo_cookie(self, name, value): self._set_cookie('vimeo.com', name, value) + def _vimeo_sort_formats(self, formats): + # Bitrates are completely broken. Single m3u8 may contain entries in kbps and bps + # at the same time without actual units specified. This lead to wrong sorting. + self._sort_formats(formats, field_preference=('preference', 'height', 'width', 'fps', 'format_id')) + + def _parse_config(self, config, video_id): + # Extract title + video_title = config['video']['title'] + + # Extract uploader, uploader_url and uploader_id + video_uploader = config['video'].get('owner', {}).get('name') + video_uploader_url = config['video'].get('owner', {}).get('url') + video_uploader_id = video_uploader_url.split('/')[-1] if video_uploader_url else None + + # Extract video thumbnail + video_thumbnail = config['video'].get('thumbnail') + if video_thumbnail is None: + video_thumbs = config['video'].get('thumbs') + if video_thumbs and isinstance(video_thumbs, dict): + _, video_thumbnail = sorted((int(width if width.isdigit() else 0), t_url) for (width, t_url) in video_thumbs.items())[-1] + + # Extract video duration + video_duration = int_or_none(config['video'].get('duration')) + + formats = [] + config_files = config['video'].get('files') or config['request'].get('files', {}) + for f in config_files.get('progressive', []): + video_url = f.get('url') + if not video_url: + continue + formats.append({ + 'url': video_url, + 'format_id': 'http-%s' % f.get('quality'), + 'width': int_or_none(f.get('width')), + 'height': int_or_none(f.get('height')), + 'fps': int_or_none(f.get('fps')), + 'tbr': int_or_none(f.get('bitrate')), + }) + m3u8_url = config_files.get('hls', {}).get('url') + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + + subtitles = {} + text_tracks = config['request'].get('text_tracks') + if text_tracks: + for tt in text_tracks: + subtitles[tt['lang']] = [{ + 'ext': 'vtt', + 'url': 'https://vimeo.com' + tt['url'], + }] + + return { + 'title': video_title, + 'uploader': video_uploader, + 'uploader_id': video_uploader_id, + 'uploader_url': video_uploader_url, + 'thumbnail': video_thumbnail, + 'duration': video_duration, + 'formats': formats, + 'subtitles': subtitles, + } + class VimeoIE(VimeoBaseInfoExtractor): """Information extractor for vimeo.com.""" @@ -81,7 +167,7 @@ class VimeoIE(VimeoBaseInfoExtractor): \. )? vimeo(?P<pro>pro)?\.com/ - (?!channels/[^/?#]+/?(?:$|[?#])|(?:album|ondemand)/) + (?!(?:channels|album)/[^/?#]+/?(?:$|[?#])|[^/]+/review/|ondemand/) (?:.*?/)? (?: (?: @@ -90,6 +176,7 @@ class VimeoIE(VimeoBaseInfoExtractor): )? (?:videos?/)? (?P<id>[0-9]+) + (?:/[\da-f]+)? /?(?:[?&].*)?(?:[#].*)?$ ''' IE_NAME = 'vimeo' @@ -152,7 +239,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'uploader_id': 'user18948128', 'uploader': 'Jaime Marquínez Ferrándiz', 'duration': 10, - 'description': 'This is "youtube-dl password protected test video" by Jaime Marquínez Ferrándiz on Vimeo, the home for high quality videos and the people\u2026', + 'description': 'This is "youtube-dl password protected test video" by on Vimeo, the home for high quality videos and the people who love them.', }, 'params': { 'videopassword': 'youtube-dl', @@ -161,8 +248,6 @@ class VimeoIE(VimeoBaseInfoExtractor): { 'url': 'http://vimeo.com/channels/keypeele/75629013', 'md5': '2f86a05afe9d7abc0b9126d229bbe15d', - 'note': 'Video is freely available via original URL ' - 'and protected with password when accessed via http://vimeo.com/75629013', 'info_dict': { 'id': '75629013', 'ext': 'mp4', @@ -206,7 +291,7 @@ class VimeoIE(VimeoBaseInfoExtractor): { # contains original format 'url': 'https://vimeo.com/33951933', - 'md5': '53c688fa95a55bf4b7293d37a89c5c53', + 'md5': '2d9f5475e0537f013d0073e812ab89e6', 'info_dict': { 'id': '33951933', 'ext': 'mp4', @@ -219,6 +304,45 @@ class VimeoIE(VimeoBaseInfoExtractor): }, }, { + # only available via https://vimeo.com/channels/tributes/6213729 and + # not via https://vimeo.com/6213729 + 'url': 'https://vimeo.com/channels/tributes/6213729', + 'info_dict': { + 'id': '6213729', + 'ext': 'mp4', + 'title': 'Vimeo Tribute: The Shining', + 'uploader': 'Casey Donahue', + 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/caseydonahue', + 'uploader_id': 'caseydonahue', + 'upload_date': '20090821', + 'description': 'md5:bdbf314014e58713e6e5b66eb252f4a6', + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['Unable to download JSON metadata'], + }, + { + # redirects to ondemand extractor and should be passed throught it + # for successful extraction + 'url': 'https://vimeo.com/73445910', + 'info_dict': { + 'id': '73445910', + 'ext': 'mp4', + 'title': 'The Reluctant Revolutionary', + 'uploader': '10Ft Films', + 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/tenfootfilms', + 'uploader_id': 'tenfootfilms', + }, + 'params': { + 'skip_download': True, + }, + }, + { + 'url': 'http://vimeo.com/moogaloop.swf?clip_id=2539741', + 'only_matching': True, + }, + { 'url': 'https://vimeo.com/109815029', 'note': 'Video not completely processed, "failed" seed status', 'only_matching': True, @@ -228,46 +352,47 @@ class VimeoIE(VimeoBaseInfoExtractor): 'only_matching': True, }, { + 'url': 'https://vimeo.com/album/2632481/video/79010983', + 'only_matching': True, + }, + { # source file returns 403: Forbidden 'url': 'https://vimeo.com/7809605', 'only_matching': True, }, + { + 'url': 'https://vimeo.com/160743502/abd0e13fb4', + 'only_matching': True, + } ] @staticmethod - def _extract_vimeo_url(url, webpage): + def _smuggle_referrer(url, referrer_url): + return smuggle_url(url, {'http_headers': {'Referer': referrer_url}}) + + @staticmethod + def _extract_urls(url, webpage): + urls = [] # Look for embedded (iframe) Vimeo player - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage) - if mobj: - player_url = unescapeHTML(mobj.group('url')) - surl = smuggle_url(player_url, {'http_headers': {'Referer': url}}) - return surl - # Look for embedded (swf embed) Vimeo player - mobj = re.search( - r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage) - if mobj: - return mobj.group(1) + for mobj in re.finditer( + r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', + webpage): + urls.append(VimeoIE._smuggle_referrer(unescapeHTML(mobj.group('url')), url)) + PLAIN_EMBED_RE = ( + # Look for embedded (swf embed) Vimeo player + r'<embed[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)\1', + # Look more for non-standard embedded Vimeo player + r'<video[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?vimeo\.com/[0-9]+)\1', + ) + for embed_re in PLAIN_EMBED_RE: + for mobj in re.finditer(embed_re, webpage): + urls.append(mobj.group('url')) + return urls - def _verify_video_password(self, url, video_id, webpage): - password = self._downloader.params.get('videopassword') - if password is None: - raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True) - token, vuid = self._extract_xsrft_and_vuid(webpage) - data = urlencode_postdata({ - 'password': password, - 'token': token, - }) - if url.startswith('http://'): - # vimeo only supports https now, but the user can give an http url - url = url.replace('http://', 'https://') - password_request = sanitized_Request(url + '/password', data) - password_request.add_header('Content-Type', 'application/x-www-form-urlencoded') - password_request.add_header('Referer', url) - self._set_vimeo_cookie('vuid', vuid) - return self._download_webpage( - password_request, video_id, - 'Verifying the password', 'Wrong password') + @staticmethod + def _extract_url(url, webpage): + urls = VimeoIE._extract_urls(url, webpage) + return urls[0] if urls else None def _verify_player_video_password(self, url, video_id): password = self._downloader.params.get('videopassword') @@ -277,10 +402,10 @@ class VimeoIE(VimeoBaseInfoExtractor): pass_url = url + '/check-password' password_request = sanitized_Request(pass_url, data) password_request.add_header('Content-Type', 'application/x-www-form-urlencoded') + password_request.add_header('Referer', url) return self._download_json( password_request, video_id, - 'Verifying the password', - 'Wrong password') + 'Verifying the password', 'Wrong password') def _real_initialize(self): self._login() @@ -299,13 +424,18 @@ class VimeoIE(VimeoBaseInfoExtractor): orig_url = url if mobj.group('pro') or mobj.group('player'): url = 'https://player.vimeo.com/video/' + video_id - else: + elif any(p in url for p in ('play_redirect_hls', 'moogaloop.swf')): url = 'https://vimeo.com/' + video_id # Retrieve video webpage to extract further information request = sanitized_Request(url, headers=headers) try: - webpage = self._download_webpage(request, video_id) + webpage, urlh = self._download_webpage_handle(request, video_id) + # Some URLs redirect to ondemand can't be extracted with + # this extractor right away thus should be passed through + # ondemand extractor (e.g. https://vimeo.com/73445910) + if VimeoOndemandIE.suitable(urlh.geturl()): + return self.url_result(urlh.geturl(), VimeoOndemandIE.ie_key()) except ExtractorError as ee: if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: errmsg = ee.cause.read() @@ -377,28 +507,24 @@ class VimeoIE(VimeoBaseInfoExtractor): if config.get('view') == 4: config = self._verify_player_video_password(url, video_id) - if '>You rented this title.<' in webpage: + def is_rented(): + if '>You rented this title.<' in webpage: + return True + if config.get('user', {}).get('purchased'): + return True + label = try_get( + config, lambda x: x['video']['vod']['purchase_options'][0]['label_string'], compat_str) + if label and label.startswith('You rented this'): + return True + return False + + if is_rented(): feature_id = config.get('video', {}).get('vod', {}).get('feature_id') if feature_id and not data.get('force_feature_id', False): return self.url_result(smuggle_url( 'https://player.vimeo.com/player/%s' % feature_id, {'force_feature_id': True}), 'Vimeo') - # Extract title - video_title = config['video']['title'] - - # Extract uploader, uploader_url and uploader_id - video_uploader = config['video'].get('owner', {}).get('name') - video_uploader_url = config['video'].get('owner', {}).get('url') - video_uploader_id = video_uploader_url.split('/')[-1] if video_uploader_url else None - - # Extract video thumbnail - video_thumbnail = config['video'].get('thumbnail') - if video_thumbnail is None: - video_thumbs = config['video'].get('thumbs') - if video_thumbs and isinstance(video_thumbs, dict): - _, video_thumbnail = sorted((int(width if width.isdigit() else 0), t_url) for (width, t_url) in video_thumbs.items())[-1] - # Extract video description video_description = self._html_search_regex( @@ -418,9 +544,6 @@ class VimeoIE(VimeoBaseInfoExtractor): if not video_description and not mobj.group('player'): self._downloader.report_warning('Cannot find video description') - # Extract video duration - video_duration = int_or_none(config['video'].get('duration')) - # Extract upload date video_upload_date = None mobj = re.search(r'<time[^>]+datetime="([^"]+)"', webpage) @@ -458,53 +581,22 @@ class VimeoIE(VimeoBaseInfoExtractor): 'format_id': source_name, 'preference': 1, }) - config_files = config['video'].get('files') or config['request'].get('files', {}) - for f in config_files.get('progressive', []): - video_url = f.get('url') - if not video_url: - continue - formats.append({ - 'url': video_url, - 'format_id': 'http-%s' % f.get('quality'), - 'width': int_or_none(f.get('width')), - 'height': int_or_none(f.get('height')), - 'fps': int_or_none(f.get('fps')), - 'tbr': int_or_none(f.get('bitrate')), - }) - m3u8_url = config_files.get('hls', {}).get('url') - if m3u8_url: - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) - # Bitrates are completely broken. Single m3u8 may contain entries in kbps and bps - # at the same time without actual units specified. This lead to wrong sorting. - self._sort_formats(formats, field_preference=('preference', 'height', 'width', 'fps', 'format_id')) - subtitles = {} - text_tracks = config['request'].get('text_tracks') - if text_tracks: - for tt in text_tracks: - subtitles[tt['lang']] = [{ - 'ext': 'vtt', - 'url': 'https://vimeo.com' + tt['url'], - }] - - return { + info_dict = self._parse_config(config, video_id) + formats.extend(info_dict['formats']) + self._vimeo_sort_formats(formats) + info_dict.update({ 'id': video_id, - 'uploader': video_uploader, - 'uploader_url': video_uploader_url, - 'uploader_id': video_uploader_id, + 'formats': formats, 'upload_date': video_upload_date, - 'title': video_title, - 'thumbnail': video_thumbnail, 'description': video_description, - 'duration': video_duration, - 'formats': formats, 'webpage_url': url, 'view_count': view_count, 'like_count': like_count, 'comment_count': comment_count, - 'subtitles': subtitles, - } + }) + + return info_dict class VimeoOndemandIE(VimeoBaseInfoExtractor): @@ -523,6 +615,20 @@ class VimeoOndemandIE(VimeoBaseInfoExtractor): 'uploader_id': 'gumfilms', }, }, { + # requires Referer to be passed along with og:video:url + 'url': 'https://vimeo.com/ondemand/36938/126682985', + 'info_dict': { + 'id': '126682985', + 'ext': 'mp4', + 'title': 'Rävlock, rätt läte på rätt plats', + 'uploader': 'Lindroth & Norin', + 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/user14430847', + 'uploader_id': 'user14430847', + }, + 'params': { + 'skip_download': True, + }, + }, { 'url': 'https://vimeo.com/ondemand/nazmaalik', 'only_matching': True, }, { @@ -536,7 +642,12 @@ class VimeoOndemandIE(VimeoBaseInfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - return self.url_result(self._og_search_video_url(webpage), VimeoIE.ie_key()) + return self.url_result( + # Some videos require Referer to be passed along with og:video:url + # similarly to generic vimeo embeds (e.g. + # https://vimeo.com/ondemand/36938/126682985). + VimeoIE._smuggle_referrer(self._og_search_video_url(webpage), url), + VimeoIE.ie_key()) class VimeoChannelIE(VimeoBaseInfoExtractor): @@ -598,8 +709,21 @@ class VimeoChannelIE(VimeoBaseInfoExtractor): webpage = self._login_list_password(page_url, list_id, webpage) yield self._extract_list_title(webpage) - for video_id in re.findall(r'id="clip_(\d+?)"', webpage): - yield self.url_result('https://vimeo.com/%s' % video_id, 'Vimeo') + # Try extracting href first since not all videos are available via + # short https://vimeo.com/id URL (e.g. https://vimeo.com/channels/tributes/6213729) + clips = re.findall( + r'id="clip_(\d+)"[^>]*>\s*<a[^>]+href="(/(?:[^/]+/)*\1)', webpage) + if clips: + for video_id, video_url in clips: + yield self.url_result( + compat_urlparse.urljoin(base_url, video_url), + VimeoIE.ie_key(), video_id=video_id) + # More relaxed fallback + else: + for video_id in re.findall(r'id=["\']clip_(\d+)', webpage): + yield self.url_result( + 'https://vimeo.com/%s' % video_id, + VimeoIE.ie_key(), video_id=video_id) if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None: break @@ -636,7 +760,7 @@ class VimeoUserIE(VimeoChannelIE): class VimeoAlbumIE(VimeoChannelIE): IE_NAME = 'vimeo:album' - _VALID_URL = r'https://vimeo\.com/album/(?P<id>\d+)' + _VALID_URL = r'https://vimeo\.com/album/(?P<id>\d+)(?:$|[?#]|/(?!video))' _TITLE_RE = r'<header id="page_header">\n\s*<h1>(.*?)</h1>' _TESTS = [{ 'url': 'https://vimeo.com/album/2632481', @@ -656,6 +780,13 @@ class VimeoAlbumIE(VimeoChannelIE): 'params': { 'videopassword': 'youtube-dl', } + }, { + 'url': 'https://vimeo.com/album/2632481/sort:plays/format:thumbnail', + 'only_matching': True, + }, { + # TODO: respect page number + 'url': 'https://vimeo.com/album/2632481/page:2/sort:plays/format:thumbnail', + 'only_matching': True, }] def _page_url(self, base_url, pagenum): @@ -687,7 +818,7 @@ class VimeoGroupsIE(VimeoAlbumIE): return self._extract_videos(name, 'https://vimeo.com/groups/%s' % name) -class VimeoReviewIE(InfoExtractor): +class VimeoReviewIE(VimeoBaseInfoExtractor): IE_NAME = 'vimeo:review' IE_DESC = 'Review pages on vimeo' _VALID_URL = r'https://vimeo\.com/[^/]+/review/(?P<id>[^/]+)' @@ -699,6 +830,7 @@ class VimeoReviewIE(InfoExtractor): 'ext': 'mp4', 'title': "DICK HARDWICK 'Comedian'", 'uploader': 'Richard Hardwick', + 'uploader_id': 'user21297594', } }, { 'note': 'video player needs Referer', @@ -711,14 +843,47 @@ class VimeoReviewIE(InfoExtractor): 'uploader': 'DevWeek Events', 'duration': 2773, 'thumbnail': 're:^https?://.*\.jpg$', + 'uploader_id': 'user22258446', } + }, { + 'note': 'Password protected', + 'url': 'https://vimeo.com/user37284429/review/138823582/c4d865efde', + 'info_dict': { + 'id': '138823582', + 'ext': 'mp4', + 'title': 'EFFICIENT PICKUP MASTERCLASS MODULE 1', + 'uploader': 'TMB', + 'uploader_id': 'user37284429', + }, + 'params': { + 'videopassword': 'holygrail', + }, + 'skip': 'video gone', }] + def _real_initialize(self): + self._login() + + def _get_config_url(self, webpage_url, video_id, video_password_verified=False): + webpage = self._download_webpage(webpage_url, video_id) + data = self._parse_json(self._search_regex( + r'window\s*=\s*_extend\(window,\s*({.+?})\);', webpage, 'data', + default=NO_DEFAULT if video_password_verified else '{}'), video_id) + config_url = data.get('vimeo_esi', {}).get('config', {}).get('configUrl') + if config_url is None: + self._verify_video_password(webpage_url, video_id, webpage) + config_url = self._get_config_url( + webpage_url, video_id, video_password_verified=True) + return config_url + def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - player_url = 'https://player.vimeo.com/player/' + video_id - return self.url_result(player_url, 'Vimeo', video_id) + video_id = self._match_id(url) + config_url = self._get_config_url(url, video_id) + config = self._download_json(config_url, video_id) + info_dict = self._parse_config(config, video_id) + self._vimeo_sort_formats(info_dict['formats']) + info_dict['id'] = video_id + return info_dict class VimeoWatchLaterIE(VimeoChannelIE): |