diff options
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r-- | youtube_dl/extractor/bilibili.py | 11 | ||||
-rw-r--r-- | youtube_dl/extractor/bliptv.py | 1 | ||||
-rw-r--r-- | youtube_dl/extractor/common.py | 4 | ||||
-rw-r--r-- | youtube_dl/extractor/francetv.py | 47 | ||||
-rw-r--r-- | youtube_dl/extractor/generic.py | 27 | ||||
-rw-r--r-- | youtube_dl/extractor/rtlnl.py | 25 | ||||
-rw-r--r-- | youtube_dl/extractor/vk.py | 45 |
7 files changed, 130 insertions, 30 deletions
diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index bf60450c2..ecc17ebeb 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -41,8 +41,15 @@ class BiliBiliIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - if self._search_regex(r'(此视频不存在或被删除)', webpage, 'error message', default=None): - raise ExtractorError('The video does not exist or was deleted', expected=True) + if '(此视频不存在或被删除)' in webpage: + raise ExtractorError( + 'The video does not exist or was deleted', expected=True) + + if '>你没有权限浏览! 由于版权相关问题 我们不对您所在的地区提供服务<' in webpage: + raise ExtractorError( + 'The video is not available in your region due to copyright reasons', + expected=True) + video_code = self._search_regex( r'(?s)<div itemprop="video".*?>(.*?)</div>', webpage, 'video code') diff --git a/youtube_dl/extractor/bliptv.py b/youtube_dl/extractor/bliptv.py index a69ee482b..c3296283d 100644 --- a/youtube_dl/extractor/bliptv.py +++ b/youtube_dl/extractor/bliptv.py @@ -5,7 +5,6 @@ import re from .common import InfoExtractor from ..compat import ( - compat_str, compat_urllib_request, compat_urlparse, ) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 5a2d0d995..b9014fc23 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -996,7 +996,7 @@ class InfoExtractor(object): def _parse_smil_video(self, video, video_id, base, rtmp_count): src = video.get('src') if not src: - return ([], rtmp_count) + return [], rtmp_count bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) width = int_or_none(video.get('width')) height = int_or_none(video.get('height')) @@ -1009,7 +1009,7 @@ class InfoExtractor(object): proto = 'http' ext = video.get('ext') if proto == 'm3u8': - return (self._extract_m3u8_formats(src, video_id, ext), rtmp_count) + return self._extract_m3u8_formats(src, video_id, ext), rtmp_count elif proto == 'rtmp': rtmp_count += 1 streamer = video.get('streamer') or base diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index b2c984bf2..d4f98ca16 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -87,7 +87,7 @@ class FranceTVBaseInfoExtractor(InfoExtractor): 'title': info['titre'], 'description': clean_html(info['synopsis']), 'thumbnail': compat_urlparse.urljoin('http://pluzz.francetv.fr', info['image']), - 'duration': float_or_none(info.get('real_duration'), 1000) or parse_duration(info['duree']), + 'duration': int_or_none(info.get('real_duration')) or parse_duration(info['duree']), 'timestamp': int_or_none(info['diffusion']['timestamp']), 'formats': formats, } @@ -160,11 +160,19 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor): class FranceTVIE(FranceTVBaseInfoExtractor): IE_NAME = 'francetv' IE_DESC = 'France 2, 3, 4, 5 and Ô' - _VALID_URL = r'''(?x)https?://www\.france[2345o]\.fr/ - (?: - emissions/.*?/(videos|emissions)/(?P<id>[^/?]+) - | (emissions?|jt)/(?P<key>[^/?]+) - )''' + _VALID_URL = r'''(?x) + https?:// + (?: + (?:www\.)?france[2345o]\.fr/ + (?: + emissions/[^/]+/(?:videos|diffusions)?| + videos + ) + /| + embed\.francetv\.fr/\?ue= + ) + (?P<id>[^/?]+) + ''' _TESTS = [ # france2 @@ -232,13 +240,34 @@ class FranceTVIE(FranceTVBaseInfoExtractor): 'timestamp': 1410822000, }, }, + { + # francetv embed + 'url': 'http://embed.francetv.fr/?ue=8d7d3da1e3047c42ade5a5d7dfd3fc87', + 'info_dict': { + 'id': 'EV_30231', + 'ext': 'flv', + 'title': 'Alcaline, le concert avec Calogero', + 'description': 'md5:', + 'upload_date': '20150226', + 'timestamp': 1424989860, + 'duration': 5400, + }, + }, + { + 'url': 'http://www.france4.fr/emission/highlander/diffusion-du-17-07-2015-04h05', + 'only_matching': True, + }, + { + 'url': 'http://www.franceo.fr/videos/125377617', + 'only_matching': True, + } ] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - webpage = self._download_webpage(url, mobj.group('key') or mobj.group('id')) + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) video_id, catalogue = self._html_search_regex( - r'href="http://videos\.francetv\.fr/video/([^@]+@[^"]+)"', + r'href="http://videos?\.francetv\.fr/video/([^@]+@[^"]+)"', webpage, 'video ID').split('@') return self._extract_video(video_id, catalogue) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index a62287e50..dc24a8a8b 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -407,6 +407,26 @@ class GenericIE(InfoExtractor): 'skip_download': 'Requires rtmpdump' } }, + # francetv embed + { + 'url': 'http://www.tsprod.com/replay-du-concert-alcaline-de-calogero', + 'info_dict': { + 'id': 'EV_30231', + 'ext': 'mp4', + 'title': 'Alcaline, le concert avec Calogero', + 'description': 'md5:61f08036dcc8f47e9cfc33aed08ffaff', + 'upload_date': '20150226', + 'timestamp': 1424989860, + 'duration': 5400, + }, + 'params': { + # m3u8 downloads + 'skip_download': True, + }, + 'expected_warnings': [ + 'Forbidden' + ] + }, # Condé Nast embed { 'url': 'http://www.wired.com/2014/04/honda-asimo/', @@ -1431,6 +1451,13 @@ class GenericIE(InfoExtractor): if mobj is not None: return self.url_result(mobj.group('url'), 'ArteTVEmbed') + # Look for embedded francetv player + mobj = re.search( + r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?://)?embed\.francetv\.fr/\?ue=.+?)\1', + webpage) + if mobj is not None: + return self.url_result(mobj.group('url')) + # Look for embedded smotri.com player smotri_url = SmotriIE._extract_url(webpage) if smotri_url: diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py index a4d3d73ff..e0c530d64 100644 --- a/youtube_dl/extractor/rtlnl.py +++ b/youtube_dl/extractor/rtlnl.py @@ -44,6 +44,21 @@ class RtlNlIE(InfoExtractor): 'description': 'Er zijn nieuwe beelden vrijgegeven die vlak na de aanslag in Kopenhagen zijn gemaakt. Op de video is goed te zien hoe omstanders zich bekommeren om één van de slachtoffers, terwijl de eerste agenten ter plaatse komen.', } }, { + # empty synopsis and missing episodes (see https://github.com/rg3/youtube-dl/issues/6275) + 'url': 'http://www.rtl.nl/system/videoplayer/derden/rtlnieuws/video_embed.html#uuid=f536aac0-1dc3-4314-920e-3bd1c5b3811a/autoplay=false', + 'info_dict': { + 'id': 'f536aac0-1dc3-4314-920e-3bd1c5b3811a', + 'ext': 'mp4', + 'title': 'RTL Nieuws - Meer beelden van overval juwelier', + 'thumbnail': 're:^https?://screenshots\.rtl\.nl/system/thumb/sz=[0-9]+x[0-9]+/uuid=f536aac0-1dc3-4314-920e-3bd1c5b3811a$', + 'timestamp': 1437233400, + 'upload_date': '20150718', + 'duration': 30.474, + }, + 'params': { + 'skip_download': True, + }, + }, { # encrypted m3u8 streams, georestricted 'url': 'http://www.rtlxl.nl/#!/afl-2-257632/52a74543-c504-4cde-8aa8-ec66fe8d68a7', 'only_matching': True, @@ -59,9 +74,11 @@ class RtlNlIE(InfoExtractor): uuid) material = info['material'][0] - progname = info['abstracts'][0]['name'] - subtitle = material['title'] or info['episodes'][0]['name'] - description = material.get('synopsis') or info['episodes'][0]['synopsis'] + title = info['abstracts'][0]['name'] + subtitle = material.get('title') + if subtitle: + title += ' - %s' % subtitle + description = material.get('synopsis') meta = info.get('meta', {}) @@ -107,7 +124,7 @@ class RtlNlIE(InfoExtractor): return { 'id': uuid, - 'title': '%s - %s' % (progname, subtitle), + 'title': title, 'formats': formats, 'timestamp': material['original_date'], 'description': description, diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 8f677cae3..c30c5a8e5 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -20,7 +20,8 @@ from ..utils import ( class VKIE(InfoExtractor): - IE_NAME = 'vk.com' + IE_NAME = 'vk' + IE_DESC = 'VK' _VALID_URL = r'''(?x) https?:// (?: @@ -154,6 +155,11 @@ class VKIE(InfoExtractor): 'only_matching': True, }, { + # age restricted video, requires vk account credentials + 'url': 'https://vk.com/video205387401_164765225', + 'only_matching': True, + }, + { # vk wrapper 'url': 'http://www.biqle.ru/watch/847655_160197695', 'only_matching': True, @@ -204,6 +210,12 @@ class VKIE(InfoExtractor): info_page = self._download_webpage(info_url, video_id) + error_message = self._html_search_regex( + r'(?s)<!><div[^>]+class="video_layer_message"[^>]*>(.+?)</div>', + info_page, 'error message', default=None) + if error_message: + raise ExtractorError(error_message, expected=True) + if re.search(r'<!>/login\.php\?.*\bact=security_check', info_page): raise ExtractorError( 'You are trying to log in from an unusual location. You should confirm ownership at vk.com to log in with this IP.', @@ -289,25 +301,34 @@ class VKIE(InfoExtractor): class VKUserVideosIE(InfoExtractor): - IE_NAME = 'vk.com:user-videos' - IE_DESC = 'vk.com:All of a user\'s videos' - _VALID_URL = r'https?://vk\.com/videos(?P<id>[0-9]+)(?:m\?.*)?' + IE_NAME = 'vk:uservideos' + IE_DESC = "VK - User's Videos" + _VALID_URL = r'https?://vk\.com/videos(?P<id>-?[0-9]+)$' _TEMPLATE_URL = 'https://vk.com/videos' - _TEST = { + _TESTS = [{ 'url': 'http://vk.com/videos205387401', 'info_dict': { 'id': '205387401', + 'title': "Tom Cruise's Videos", }, 'playlist_mincount': 4, - } + }, { + 'url': 'http://vk.com/videos-77521', + 'only_matching': True, + }] def _real_extract(self, url): page_id = self._match_id(url) - page = self._download_webpage(url, page_id) - video_ids = orderedSet( - m.group(1) for m in re.finditer(r'href="/video([0-9_]+)"', page)) - url_entries = [ + + webpage = self._download_webpage(url, page_id) + + entries = [ self.url_result( 'http://vk.com/video' + video_id, 'VK', video_id=video_id) - for video_id in video_ids] - return self.playlist_result(url_entries, page_id) + for video_id in orderedSet(re.findall(r'href="/video(-?[0-9_]+)"', webpage))] + + title = unescapeHTML(self._search_regex( + r'<title>\s*([^<]+?)\s+\|\s+\d+\s+videos', + webpage, 'title', default=page_id)) + + return self.playlist_result(entries, page_id, title) |