diff options
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r-- | youtube_dl/extractor/__init__.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/fourtube.py | 95 | ||||
-rw-r--r-- | youtube_dl/extractor/francetv.py | 1 | ||||
-rw-r--r-- | youtube_dl/extractor/ndr.py | 12 | ||||
-rw-r--r-- | youtube_dl/extractor/nfb.py | 3 | ||||
-rw-r--r-- | youtube_dl/extractor/smotri.py | 1 | ||||
-rw-r--r-- | youtube_dl/extractor/sohu.py | 1 | ||||
-rw-r--r-- | youtube_dl/extractor/syfy.py | 27 | ||||
-rw-r--r-- | youtube_dl/extractor/theplatform.py | 66 | ||||
-rw-r--r-- | youtube_dl/extractor/vesti.py | 3 | ||||
-rw-r--r-- | youtube_dl/extractor/vk.py | 85 | ||||
-rw-r--r-- | youtube_dl/extractor/youtube.py | 3 |
12 files changed, 249 insertions, 50 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 9490df0d8..725371883 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -73,6 +73,7 @@ from .fktv import ( FKTVPosteckeIE, ) from .flickr import FlickrIE +from .fourtube import FourTubeIE from .franceinter import FranceInterIE from .francetv import ( PluzzIE, @@ -210,6 +211,7 @@ from .statigram import StatigramIE from .steam import SteamIE from .streamcloud import StreamcloudIE from .streamcz import StreamCZIE +from .syfy import SyfyIE from .sztvhu import SztvHuIE from .teamcoco import TeamcocoIE from .techtalks import TechTalksIE diff --git a/youtube_dl/extractor/fourtube.py b/youtube_dl/extractor/fourtube.py new file mode 100644 index 000000000..8db7fc6cb --- /dev/null +++ b/youtube_dl/extractor/fourtube.py @@ -0,0 +1,95 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + compat_urllib_request, + unified_strdate, + str_to_int, + parse_duration, +) +from youtube_dl.utils import clean_html + + +class FourTubeIE(InfoExtractor): + IE_NAME = '4tube' + _VALID_URL = r'https?://(?:www\.)?4tube\.com/videos/(?P<id>\d+)' + + _TEST = { + 'url': 'http://www.4tube.com/videos/209733/hot-babe-holly-michaels-gets-her-ass-stuffed-by-black', + 'md5': '6516c8ac63b03de06bc8eac14362db4f', + 'info_dict': { + 'id': '209733', + 'ext': 'mp4', + 'title': 'Hot Babe Holly Michaels gets her ass stuffed by black', + 'uploader': 'WCP Club', + 'uploader_id': 'wcp-club', + 'upload_date': '20131031', + 'duration': 583, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + + video_id = mobj.group('id') + webpage_url = 'http://www.4tube.com/videos/' + video_id + webpage = self._download_webpage(webpage_url, video_id) + + self.report_extraction(video_id) + + playlist_json = self._html_search_regex(r'var playerConfigPlaylist\s+=\s+([^;]+)', webpage, 'Playlist') + media_id = self._search_regex(r'idMedia:\s*(\d+)', playlist_json, 'Media Id') + sources = self._search_regex(r'sources:\s*\[([^\]]*)\]', playlist_json, 'Sources').split(',') + title = self._search_regex(r'title:\s*"([^"]*)', playlist_json, 'Title') + thumbnail_url = self._search_regex(r'image:\s*"([^"]*)', playlist_json, 'Thumbnail', fatal=False) + + uploader_str = self._search_regex(r'<span>Uploaded by</span>(.*?)<span>', webpage, 'uploader', fatal=False) + mobj = re.search(r'<a href="/sites/(?P<id>[^"]+)"><strong>(?P<name>[^<]+)</strong></a>', uploader_str) + (uploader, uploader_id) = (mobj.group('name'), mobj.group('id')) if mobj else (clean_html(uploader_str), None) + + upload_date = None + view_count = None + duration = None + description = self._html_search_meta('description', webpage, 'description') + if description: + upload_date = self._search_regex(r'Published Date: (\d{2} [a-zA-Z]{3} \d{4})', description, 'upload date', + fatal=False) + if upload_date: + upload_date = unified_strdate(upload_date) + view_count = self._search_regex(r'Views: ([\d,\.]+)', description, 'view count', fatal=False) + if view_count: + view_count = str_to_int(view_count) + duration = parse_duration(self._search_regex(r'Length: (\d+m\d+s)', description, 'duration', fatal=False)) + + token_url = "http://tkn.4tube.com/{0}/desktop/{1}".format(media_id, "+".join(sources)) + headers = { + b'Content-Type': b'application/x-www-form-urlencoded', + b'Origin': b'http://www.4tube.com', + } + token_req = compat_urllib_request.Request(token_url, b'{}', headers) + tokens = self._download_json(token_req, video_id) + + formats = [{ + 'url': tokens[format]['token'], + 'format_id': format + 'p', + 'resolution': format + 'p', + 'quality': int(format), + } for format in sources] + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': thumbnail_url, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'upload_date': upload_date, + 'view_count': view_count, + 'duration': duration, + 'age_limit': 18, + 'webpage_url': webpage_url, + }
\ No newline at end of file diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index ae342341c..51eb97b2f 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -184,6 +184,7 @@ class GenerationQuoiIE(InfoExtractor): # It uses Dailymotion 'skip_download': True, }, + 'skip': 'Only available from France', } def _real_extract(self, url): diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index 811ef5201..0650f9564 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -26,15 +26,15 @@ class NDRIE(InfoExtractor): }, }, { - 'url': 'http://www.ndr.de/903/audio191719.html', - 'md5': '41ed601768534dd18a9ae34d84798129', + 'url': 'http://www.ndr.de/info/audio51535.html', + 'md5': 'bb3cd38e24fbcc866d13b50ca59307b8', 'note': 'Audio file', 'info_dict': { - 'id': '191719', + 'id': '51535', 'ext': 'mp3', - 'title': '"Es war schockierend"', - 'description': 'md5:ed7ff8364793545021a6355b97e95f10', - 'duration': 112, + 'title': 'La Valette entgeht der Hinrichtung', + 'description': 'md5:22f9541913a40fe50091d5cdd7c9f536', + 'duration': 884, } } ] diff --git a/youtube_dl/extractor/nfb.py b/youtube_dl/extractor/nfb.py index a8c514f53..e88566c69 100644 --- a/youtube_dl/extractor/nfb.py +++ b/youtube_dl/extractor/nfb.py @@ -74,7 +74,8 @@ class NFBIE(InfoExtractor): description = media.find('description').text # It seems assets always go from lower to better quality, so no need to sort formats = [{ - 'url': x.find('default/streamerURI').text + '/', + 'url': x.find('default/streamerURI').text, + 'app': x.find('default/streamerURI').text.split('/', 3)[3], 'play_path': x.find('default/url').text, 'rtmp_live': False, 'ext': 'mp4', diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py index f249f013c..540c55703 100644 --- a/youtube_dl/extractor/smotri.py +++ b/youtube_dl/extractor/smotri.py @@ -20,6 +20,7 @@ class SmotriIE(InfoExtractor): IE_DESC = 'Smotri.com' IE_NAME = 'smotri' _VALID_URL = r'^https?://(?:www\.)?(?P<url>smotri\.com/video/view/\?id=(?P<videoid>v(?P<realvideoid>[0-9]+)[a-z0-9]{4}))' + _NETRC_MACHINE = 'smotri' _TESTS = [ # real video id 2610366 diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py index 2b9bf0cb7..bebcafb62 100644 --- a/youtube_dl/extractor/sohu.py +++ b/youtube_dl/extractor/sohu.py @@ -17,6 +17,7 @@ class SohuIE(InfoExtractor): u'info_dict': { u'title': u'MV:Far East Movement《The Illest》', }, + u'skip': u'Only available from China', } def _real_extract(self, url): diff --git a/youtube_dl/extractor/syfy.py b/youtube_dl/extractor/syfy.py new file mode 100644 index 000000000..8809a57fe --- /dev/null +++ b/youtube_dl/extractor/syfy.py @@ -0,0 +1,27 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class SyfyIE(InfoExtractor): + _VALID_URL = r'https?://www\.syfy\.com/videos/.+?vid:(?P<id>\d+)' + + _TEST = { + 'url': 'http://www.syfy.com/videos/Robot%20Combat%20League/Behind%20the%20Scenes/vid:2631458', + 'md5': 'e07de1d52c7278adbb9b9b1c93a66849', + 'info_dict': { + 'id': 'NmqMrGnXvmO1', + 'ext': 'flv', + 'title': 'George Lucas has Advice for his Daughter', + 'description': 'Listen to what insights George Lucas give his daughter Amanda.', + }, + 'add_ie': ['ThePlatform'], + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + webpage = self._download_webpage(url, video_id) + return self.url_result(self._og_search_video_url(webpage)) diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index 23172143e..d60702325 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -11,7 +11,10 @@ _x = lambda p: xpath_with_ns(p, {'smil': 'http://www.w3.org/2005/SMIL21/Language class ThePlatformIE(InfoExtractor): - _VALID_URL = r'(?:https?://link\.theplatform\.com/s/[^/]+/|theplatform:)(?P<id>[^/\?]+)' + _VALID_URL = r'''(?x) + (?:https?://(?:link|player)\.theplatform\.com/[sp]/[^/]+/ + (?P<config>[^/\?]+/(?:swf|config)/select/)? + |theplatform:)(?P<id>[^/\?&]+)''' _TEST = { # from http://www.metacafe.com/watch/cb-e9I_cZgTgIPd/blackberrys_big_bold_z30/ @@ -29,9 +32,7 @@ class ThePlatformIE(InfoExtractor): }, } - def _get_info(self, video_id): - smil_url = ('http://link.theplatform.com/s/dJ5BDC/{0}/meta.smil?' - 'format=smil&mbr=true'.format(video_id)) + def _get_info(self, video_id, smil_url): meta = self._download_xml(smil_url, video_id) try: @@ -50,26 +51,34 @@ class ThePlatformIE(InfoExtractor): head = meta.find(_x('smil:head')) body = meta.find(_x('smil:body')) - base_url = head.find(_x('smil:meta')).attrib['base'] - switch = body.find(_x('smil:switch')) - formats = [] - for f in switch.findall(_x('smil:video')): - attr = f.attrib - width = int(attr['width']) - height = int(attr['height']) - vbr = int(attr['system-bitrate']) // 1000 - format_id = '%dx%d_%dk' % (width, height, vbr) - formats.append({ - 'format_id': format_id, - 'url': base_url, - 'play_path': 'mp4:' + attr['src'], - 'ext': 'flv', - 'width': width, - 'height': height, - 'vbr': vbr, - }) - self._sort_formats(formats) + f4m_node = body.find(_x('smil:seq/smil:video')) + if f4m_node is not None: + formats = [{ + 'ext': 'flv', + # the parameters are from syfy.com, other sites may use others + 'url': f4m_node.attrib['src'] + '?g=UXWGVKRWHFSP&hdcore=3.0.3', + }] + else: + base_url = head.find(_x('smil:meta')).attrib['base'] + switch = body.find(_x('smil:switch')) + formats = [] + for f in switch.findall(_x('smil:video')): + attr = f.attrib + width = int(attr['width']) + height = int(attr['height']) + vbr = int(attr['system-bitrate']) // 1000 + format_id = '%dx%d_%dk' % (width, height, vbr) + formats.append({ + 'format_id': format_id, + 'url': base_url, + 'play_path': 'mp4:' + attr['src'], + 'ext': 'flv', + 'width': width, + 'height': height, + 'vbr': vbr, + }) + self._sort_formats(formats) return { 'id': video_id, @@ -83,4 +92,13 @@ class ThePlatformIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - return self._get_info(video_id) + if mobj.group('config'): + config_url = url+ '&form=json' + config_url = config_url.replace('swf/', 'config/') + config_json = self._download_webpage(config_url, video_id, u'Downloading config') + config = json.loads(config_json) + smil_url = config['releaseUrl'] + '&format=SMIL&formats=MPEG4' + else: + smil_url = ('http://link.theplatform.com/s/dJ5BDC/{0}/meta.smil?' + 'format=smil&mbr=true'.format(video_id)) + return self._get_info(video_id, smil_url) diff --git a/youtube_dl/extractor/vesti.py b/youtube_dl/extractor/vesti.py index 7773cec14..f51d4dcfa 100644 --- a/youtube_dl/extractor/vesti.py +++ b/youtube_dl/extractor/vesti.py @@ -87,8 +87,7 @@ class VestiIE(InfoExtractor): video_id = mobj.group('id') else: mobj = re.search( - r'<div.+?id="current-video-holder".*?>\s*<iframe src="http://player\.rutv\.ru/iframe/(?P<type>[^/]+)/id/(?P<id>\d+)[^"]*"', - page) + r'<iframe.+?src="http://player\.rutv\.ru/iframe/(?P<type>[^/]+)/id/(?P<id>\d+)[^"]*".*?></iframe>', page) if not mobj: raise ExtractorError('No media found') diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index f13ba1c8e..a293b8875 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -6,6 +6,9 @@ import json from .common import InfoExtractor from ..utils import ( + ExtractorError, + compat_urllib_request, + compat_urllib_parse, compat_str, unescapeHTML, ) @@ -14,31 +17,80 @@ from ..utils import ( class VKIE(InfoExtractor): IE_NAME = 'vk.com' _VALID_URL = r'https?://vk\.com/(?:videos.*?\?.*?z=)?video(?P<id>.*?)(?:\?|%2F|$)' + _NETRC_MACHINE = 'vk' - _TESTS = [{ - 'url': 'http://vk.com/videos-77521?z=video-77521_162222515%2Fclub77521', - 'file': '162222515.flv', - 'md5': '0deae91935c54e00003c2a00646315f0', - 'info_dict': { - 'title': 'ProtivoGunz - Хуёвая песня', - 'uploader': 'Noize MC', + _TESTS = [ + { + 'url': 'http://vk.com/videos-77521?z=video-77521_162222515%2Fclub77521', + 'md5': '0deae91935c54e00003c2a00646315f0', + 'info_dict': { + 'id': '162222515', + 'ext': 'flv', + 'title': 'ProtivoGunz - Хуёвая песня', + 'uploader': 'Noize MC', + 'duration': 195, + }, }, - }, - { - 'url': 'http://vk.com/video4643923_163339118', - 'file': '163339118.mp4', - 'md5': 'f79bccb5cd182b1f43502ca5685b2b36', - 'info_dict': { - 'uploader': 'Elvira Dzhonik', - 'title': 'Dream Theater - Hollow Years Live at Budokan 720*', + { + 'url': 'http://vk.com/video4643923_163339118', + 'md5': 'f79bccb5cd182b1f43502ca5685b2b36', + 'info_dict': { + 'id': '163339118', + 'ext': 'mp4', + 'uploader': 'Elvira Dzhonik', + 'title': 'Dream Theater - Hollow Years Live at Budokan 720*', + 'duration': 558, + } + }, + { + 'url': 'http://vk.com/video-8871596_164049491', + 'md5': 'a590bcaf3d543576c9bd162812387666', + 'note': 'Only available for registered users', + 'info_dict': { + 'id': '164049491', + 'ext': 'mp4', + 'uploader': 'Триллеры', + 'title': '► Бойцовский клуб / Fight Club 1999 [HD 720]\u00a0', + 'duration': 8352, + }, + 'skip': 'Requires vk account credentials', } - }] + ] + + def _login(self): + (username, password) = self._get_login_info() + if username is None: + return + + login_form = { + 'act': 'login', + 'role': 'al_frame', + 'expire': '1', + 'email': username, + 'pass': password, + } + + request = compat_urllib_request.Request('https://login.vk.com/?act=login', + compat_urllib_parse.urlencode(login_form).encode('utf-8')) + login_page = self._download_webpage(request, None, note='Logging in as %s' % username) + + if re.search(r'onLoginFailed', login_page): + raise ExtractorError('Unable to login, incorrect username and/or password', expected=True) + + def _real_initialize(self): + self._login() def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') + info_url = 'http://vk.com/al_video.php?act=show&al=1&video=%s' % video_id info_page = self._download_webpage(info_url, video_id) + + if re.search(r'<!>Please log in or <', info_page): + raise ExtractorError('This video is only available for registered users, ' + 'use --username and --password options to provide account credentials.', expected=True) + m_yt = re.search(r'src="(http://www.youtube.com/.*?)"', info_page) if m_yt is not None: self.to_screen(u'Youtube video detected') @@ -60,4 +112,5 @@ class VKIE(InfoExtractor): 'title': unescapeHTML(data['md_title']), 'thumbnail': data.get('jpg'), 'uploader': data.get('md_author'), + 'duration': data.get('duration') } diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 8c2c4dfa2..a81036843 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1694,7 +1694,8 @@ class YoutubeSearchIE(SearchInfoExtractor): api_response = data['data'] if 'items' not in api_response: - raise ExtractorError(u'[youtube] No video results') + raise ExtractorError( + u'[youtube] No video results', expected=True) new_ids = list(video['id'] for video in api_response['items']) video_ids += new_ids |