Diffstat (limited to 'youtube_dl'): 25 files changed, 460 insertions, 357 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 663c8bd7f..4419b21f6 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -110,6 +110,7 @@ from .khanacademy import KhanAcademyIE from .kickstarter import KickStarterIE from .keek import KeekIE from .la7 import LA7IE +from .lifenews import LifeNewsIE from .liveleak import LiveLeakIE from .livestream import LivestreamIE, LivestreamOriginalIE from .lynda import ( diff --git a/youtube_dl/extractor/bloomberg.py b/youtube_dl/extractor/bloomberg.py index d18bc7e0c..df2cff81c 100644 --- a/youtube_dl/extractor/bloomberg.py +++ b/youtube_dl/extractor/bloomberg.py @@ -24,5 +24,5 @@ class BloombergIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) name = mobj.group('name') webpage = self._download_webpage(url, name) - ooyala_code = self._search_regex(r'<source src="http://player.ooyala.com/player/[^/]+/([^".]+)', webpage, u'ooyala url') - return OoyalaIE._build_url_result(ooyala_code) + ooyala_url = self._twitter_search_player(webpage) + return self.url_result(ooyala_url, OoyalaIE.ie_key()) diff --git a/youtube_dl/extractor/collegehumor.py b/youtube_dl/extractor/collegehumor.py index d10b7bd0c..1db27026a 100644 --- a/youtube_dl/extractor/collegehumor.py +++ b/youtube_dl/extractor/collegehumor.py @@ -28,7 +28,25 @@ class CollegeHumorIE(InfoExtractor): 'description': 'This video wasn\'t long enough, so we made it double-spaced.', 'age_limit': 10, }, - }] + }, + # embedded youtube video + { + 'url': 'http://www.collegehumor.com/embed/6950457', + 'info_dict': { + 'id': 'W5gMp3ZjYg4', + 'ext': 'mp4', + 'title': 'Funny Dogs Protecting Babies Compilation 2014 [NEW HD]', + 'uploader': 'Funnyplox TV', + 'uploader_id': 'funnyploxtv', + 'description': 'md5:7e8899d3f749db50fa089eb243cba17f', + 'upload_date': '20140128', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': ['Youtube'], + }, + ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -38,6 +56,12 @@ class CollegeHumorIE(InfoExtractor): data = json.loads(self._download_webpage( jsonUrl, video_id, 'Downloading info JSON')) vdata = data['video'] + if vdata.get('youtubeId') is not None: + return { + '_type': 'url', + 'url': vdata['youtubeId'], + 'ie_key': 'Youtube', + } AGE_LIMITS = {'nc17': 18, 'r': 18, 'pg13': 13, 'pg': 10, 'g': 0} rating = vdata.get('rating') @@ -49,7 +73,7 @@ class CollegeHumorIE(InfoExtractor): PREFS = {'high_quality': 2, 'low_quality': 0} formats = [] for format_key in ('mp4', 'webm'): - for qname, qurl in vdata[format_key].items(): + for qname, qurl in vdata.get(format_key, {}).items(): formats.append({ 'format_id': format_key + '_' + qname, 'url': qurl, diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index 3333d433b..ed3986f31 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -14,7 +14,7 @@ from ..utils import ( class ComedyCentralIE(MTVServicesInfoExtractor): - _VALID_URL = r'''(?x)https?://(?:www.)?comedycentral.com/ + _VALID_URL = r'''(?x)https?://(?:www\.)?comedycentral\.com/ (video-clips|episodes|cc-studios|video-collections) /(?P<title>.*)''' _FEED_URL = 'http://comedycentral.com/feeds/mrss/' @@ -86,7 +86,7 @@ class ComedyCentralShowsIE(InfoExtractor): @staticmethod def _transform_rtmp_url(rtmp_video_url): - m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url) + m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp\.comedystor/.*)$', rtmp_video_url) if not m: raise 
ExtractorError('Cannot transform RTMP url') base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/' diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index f7478d459..2c0c75604 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -399,7 +399,7 @@ class InfoExtractor(object): # Helper functions for extracting OpenGraph info @staticmethod def _og_regexes(prop): - content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')' + content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')' property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop) template = r'<meta[^>]+?%s[^>]+?%s' return [ @@ -465,6 +465,10 @@ class InfoExtractor(object): } return RATING_TABLE.get(rating.lower(), None) + def _twitter_search_player(self, html): + return self._html_search_meta('twitter:player', html, + 'twitter card player') + def _sort_formats(self, formats): if not formats: raise ExtractorError(u'No video formats found') diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 2b66bddbb..920728e01 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -1,4 +1,6 @@ # encoding: utf-8 +from __future__ import unicode_literals + import re, base64, zlib from hashlib import sha1 from math import pow, sqrt, floor @@ -18,29 +20,29 @@ from ..aes import ( ) class CrunchyrollIE(InfoExtractor): - _VALID_URL = r'(?:https?://)?(?:www\.)?(?P<url>crunchyroll\.com/[^/]*/[^/?&]*?(?P<video_id>[0-9]+))(?:[/?&]|$)' + _VALID_URL = r'(?:https?://)?(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?:[^/]*/[^/?&]*?|media/\?id=)(?P<video_id>[0-9]+))(?:[/?&]|$)' _TESTS = [{ - u'url': u'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513', - u'file': u'645513.flv', - #u'md5': u'b1639fd6ddfaa43788c85f6d1dddd412', - u'info_dict': { - u'title': u'Wanna be the Strongest in the World Episode 1 – An Idol-Wrestler is Born!', - u'description': u'md5:2d17137920c64f2f49981a7797d275ef', - u'thumbnail': u'http://img1.ak.crunchyroll.com/i/spire1-tmb/20c6b5e10f1a47b10516877d3c039cae1380951166_full.jpg', - u'uploader': u'Yomiuri Telecasting Corporation (YTV)', - u'upload_date': u'20131013', + 'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513', + 'file': '645513.flv', + #'md5': 'b1639fd6ddfaa43788c85f6d1dddd412', + 'info_dict': { + 'title': 'Wanna be the Strongest in the World Episode 1 – An Idol-Wrestler is Born!', + 'description': 'md5:2d17137920c64f2f49981a7797d275ef', + 'thumbnail': 'http://img1.ak.crunchyroll.com/i/spire1-tmb/20c6b5e10f1a47b10516877d3c039cae1380951166_full.jpg', + 'uploader': 'Yomiuri Telecasting Corporation (YTV)', + 'upload_date': '20131013', }, - u'params': { + 'params': { # rtmp - u'skip_download': True, + 'skip_download': True, }, }] _FORMAT_IDS = { - u'360': (u'60', u'106'), - u'480': (u'61', u'106'), - u'720': (u'62', u'106'), - u'1080': (u'80', u'108'), + '360': ('60', '106'), + '480': ('61', '106'), + '720': ('62', '106'), + '1080': ('80', '108'), } def _decrypt_subtitles(self, data, iv, id): @@ -63,7 +65,7 @@ class CrunchyrollIE(InfoExtractor): num3 = key ^ num1 num4 = num3 ^ (num3 >> 3) ^ num2 prefix = intlist_to_bytes(obfuscate_key_aux(20, 97, (1, 2))) - shaHash = bytes_to_intlist(sha1(prefix + str(num4).encode(u'ascii')).digest()) + shaHash = bytes_to_intlist(sha1(prefix + str(num4).encode('ascii')).digest()) # 
Extend 160 Bit hash to 256 Bit return shaHash + [0] * 12 @@ -79,93 +81,98 @@ class CrunchyrollIE(InfoExtractor): def _convert_subtitles_to_srt(self, subtitles): i=1 - output = u'' + output = '' for start, end, text in re.findall(r'<event [^>]*?start="([^"]+)" [^>]*?end="([^"]+)" [^>]*?text="([^"]+)"[^>]*?>', subtitles): - start = start.replace(u'.', u',') - end = end.replace(u'.', u',') + start = start.replace('.', ',') + end = end.replace('.', ',') text = clean_html(text) - text = text.replace(u'\\N', u'\n') + text = text.replace('\\N', '\n') if not text: continue - output += u'%d\n%s --> %s\n%s\n\n' % (i, start, end, text) + output += '%d\n%s --> %s\n%s\n\n' % (i, start, end, text) i+=1 return output def _real_extract(self,url): mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('video_id') + + if mobj.group('prefix') == 'm': + mobile_webpage = self._download_webpage(url, video_id, 'Downloading mobile webpage') + webpage_url = self._search_regex(r'<link rel="canonical" href="([^"]+)" />', mobile_webpage, 'webpage_url') + else: + webpage_url = 'http://www.' + mobj.group('url') - webpage_url = u'http://www.' + mobj.group('url') - video_id = mobj.group(u'video_id') - webpage = self._download_webpage(webpage_url, video_id) - note_m = self._html_search_regex(r'<div class="showmedia-trailer-notice">(.+?)</div>', webpage, u'trailer-notice', default=u'') + webpage = self._download_webpage(webpage_url, video_id, 'Downloading webpage') + note_m = self._html_search_regex(r'<div class="showmedia-trailer-notice">(.+?)</div>', webpage, 'trailer-notice', default='') if note_m: raise ExtractorError(note_m) - video_title = self._html_search_regex(r'<h1[^>]*>(.+?)</h1>', webpage, u'video_title', flags=re.DOTALL) - video_title = re.sub(r' {2,}', u' ', video_title) - video_description = self._html_search_regex(r'"description":"([^"]+)', webpage, u'video_description', default=u'') + video_title = self._html_search_regex(r'<h1[^>]*>(.+?)</h1>', webpage, 'video_title', flags=re.DOTALL) + video_title = re.sub(r' {2,}', ' ', video_title) + video_description = self._html_search_regex(r'"description":"([^"]+)', webpage, 'video_description', default='') if not video_description: video_description = None - video_upload_date = self._html_search_regex(r'<div>Availability for free users:(.+?)</div>', webpage, u'video_upload_date', fatal=False, flags=re.DOTALL) + video_upload_date = self._html_search_regex(r'<div>Availability for free users:(.+?)</div>', webpage, 'video_upload_date', fatal=False, flags=re.DOTALL) if video_upload_date: video_upload_date = unified_strdate(video_upload_date) - video_uploader = self._html_search_regex(r'<div>\s*Publisher:(.+?)</div>', webpage, u'video_uploader', fatal=False, flags=re.DOTALL) + video_uploader = self._html_search_regex(r'<div>\s*Publisher:(.+?)</div>', webpage, 'video_uploader', fatal=False, flags=re.DOTALL) - playerdata_url = compat_urllib_parse.unquote(self._html_search_regex(r'"config_url":"([^"]+)', webpage, u'playerdata_url')) + playerdata_url = compat_urllib_parse.unquote(self._html_search_regex(r'"config_url":"([^"]+)', webpage, 'playerdata_url')) playerdata_req = compat_urllib_request.Request(playerdata_url) - playerdata_req.data = compat_urllib_parse.urlencode({u'current_page': webpage_url}) - playerdata_req.add_header(u'Content-Type', u'application/x-www-form-urlencoded') - playerdata = self._download_webpage(playerdata_req, video_id, note=u'Downloading media info') + playerdata_req.data = compat_urllib_parse.urlencode({'current_page': webpage_url}) + 
playerdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded') + playerdata = self._download_webpage(playerdata_req, video_id, note='Downloading media info') - stream_id = self._search_regex(r'<media_id>([^<]+)', playerdata, u'stream_id') - video_thumbnail = self._search_regex(r'<episode_image_url>([^<]+)', playerdata, u'thumbnail', fatal=False) + stream_id = self._search_regex(r'<media_id>([^<]+)', playerdata, 'stream_id') + video_thumbnail = self._search_regex(r'<episode_image_url>([^<]+)', playerdata, 'thumbnail', fatal=False) formats = [] for fmt in re.findall(r'\?p([0-9]{3,4})=1', webpage): stream_quality, stream_format = self._FORMAT_IDS[fmt] - video_format = fmt+u'p' - streamdata_req = compat_urllib_request.Request(u'http://www.crunchyroll.com/xml/') + video_format = fmt+'p' + streamdata_req = compat_urllib_request.Request('http://www.crunchyroll.com/xml/') # urlencode doesn't work! - streamdata_req.data = u'req=RpcApiVideoEncode%5FGetStreamInfo&video%5Fencode%5Fquality='+stream_quality+u'&media%5Fid='+stream_id+u'&video%5Fformat='+stream_format - streamdata_req.add_header(u'Content-Type', u'application/x-www-form-urlencoded') - streamdata_req.add_header(u'Content-Length', str(len(streamdata_req.data))) - streamdata = self._download_webpage(streamdata_req, video_id, note=u'Downloading media info for '+video_format) - video_url = self._search_regex(r'<host>([^<]+)', streamdata, u'video_url') - video_play_path = self._search_regex(r'<file>([^<]+)', streamdata, u'video_play_path') + streamdata_req.data = 'req=RpcApiVideoEncode%5FGetStreamInfo&video%5Fencode%5Fquality='+stream_quality+'&media%5Fid='+stream_id+'&video%5Fformat='+stream_format + streamdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded') + streamdata_req.add_header('Content-Length', str(len(streamdata_req.data))) + streamdata = self._download_webpage(streamdata_req, video_id, note='Downloading media info for '+video_format) + video_url = self._search_regex(r'<host>([^<]+)', streamdata, 'video_url') + video_play_path = self._search_regex(r'<file>([^<]+)', streamdata, 'video_play_path') formats.append({ - u'url': video_url, - u'play_path': video_play_path, - u'ext': 'flv', - u'format': video_format, - u'format_id': video_format, + 'url': video_url, + 'play_path': video_play_path, + 'ext': 'flv', + 'format': video_format, + 'format_id': video_format, }) subtitles = {} for sub_id, sub_name in re.findall(r'\?ssid=([0-9]+)" title="([^"]+)', webpage): - sub_page = self._download_webpage(u'http://www.crunchyroll.com/xml/?req=RpcApiSubtitle_GetXml&subtitle_script_id='+sub_id,\ - video_id, note=u'Downloading subtitles for '+sub_name) - id = self._search_regex(r'id=\'([0-9]+)', sub_page, u'subtitle_id', fatal=False) - iv = self._search_regex(r'<iv>([^<]+)', sub_page, u'subtitle_iv', fatal=False) - data = self._search_regex(r'<data>([^<]+)', sub_page, u'subtitle_data', fatal=False) + sub_page = self._download_webpage('http://www.crunchyroll.com/xml/?req=RpcApiSubtitle_GetXml&subtitle_script_id='+sub_id,\ + video_id, note='Downloading subtitles for '+sub_name) + id = self._search_regex(r'id=\'([0-9]+)', sub_page, 'subtitle_id', fatal=False) + iv = self._search_regex(r'<iv>([^<]+)', sub_page, 'subtitle_iv', fatal=False) + data = self._search_regex(r'<data>([^<]+)', sub_page, 'subtitle_data', fatal=False) if not id or not iv or not data: continue id = int(id) iv = base64.b64decode(iv) data = base64.b64decode(data) - subtitle = self._decrypt_subtitles(data, iv, id).decode(u'utf-8') - lang_code = 
self._search_regex(r'lang_code=\'([^\']+)', subtitle, u'subtitle_lang_code', fatal=False) + subtitle = self._decrypt_subtitles(data, iv, id).decode('utf-8') + lang_code = self._search_regex(r'lang_code=\'([^\']+)', subtitle, 'subtitle_lang_code', fatal=False) if not lang_code: continue subtitles[lang_code] = self._convert_subtitles_to_srt(subtitle) return { - u'id': video_id, - u'title': video_title, - u'description': video_description, - u'thumbnail': video_thumbnail, - u'uploader': video_uploader, - u'upload_date': video_upload_date, - u'subtitles': subtitles, - u'formats': formats, + 'id': video_id, + 'title': video_title, + 'description': video_description, + 'thumbnail': video_thumbnail, + 'uploader': video_uploader, + 'upload_date': video_upload_date, + 'subtitles': subtitles, + 'formats': formats, } diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index b32ff9f86..ae342341c 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -1,4 +1,7 @@ # encoding: utf-8 + +from __future__ import unicode_literals + import re import json @@ -30,7 +33,7 @@ class FranceTVBaseInfoExtractor(InfoExtractor): class PluzzIE(FranceTVBaseInfoExtractor): - IE_NAME = u'pluzz.francetv.fr' + IE_NAME = 'pluzz.francetv.fr' _VALID_URL = r'https?://pluzz\.francetv\.fr/videos/(.*?)\.html' # Can't use tests, videos expire in 7 days @@ -44,17 +47,17 @@ class PluzzIE(FranceTVBaseInfoExtractor): class FranceTvInfoIE(FranceTVBaseInfoExtractor): - IE_NAME = u'francetvinfo.fr' + IE_NAME = 'francetvinfo.fr' _VALID_URL = r'https?://www\.francetvinfo\.fr/replay.*/(?P<title>.+)\.html' _TEST = { - u'url': u'http://www.francetvinfo.fr/replay-jt/france-3/soir-3/jt-grand-soir-3-lundi-26-aout-2013_393427.html', - u'file': u'84981923.mp4', - u'info_dict': { - u'title': u'Soir 3', + 'url': 'http://www.francetvinfo.fr/replay-jt/france-3/soir-3/jt-grand-soir-3-lundi-26-aout-2013_393427.html', + 'file': '84981923.mp4', + 'info_dict': { + 'title': 'Soir 3', }, - u'params': { - u'skip_download': True, + 'params': { + 'skip_download': True, }, } @@ -62,13 +65,13 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor): mobj = re.match(self._VALID_URL, url) page_title = mobj.group('title') webpage = self._download_webpage(url, page_title) - video_id = self._search_regex(r'id-video=(\d+?)"', webpage, u'video id') + video_id = self._search_regex(r'id-video=(\d+?)[@"]', webpage, 'video id') return self._extract_video(video_id) class FranceTVIE(FranceTVBaseInfoExtractor): - IE_NAME = u'francetv' - IE_DESC = u'France 2, 3, 4, 5 and Ô' + IE_NAME = 'francetv' + IE_DESC = 'France 2, 3, 4, 5 and Ô' _VALID_URL = r'''(?x)https?://www\.france[2345o]\.fr/ (?: emissions/.*?/(videos|emissions)/(?P<id>[^/?]+) @@ -78,73 +81,73 @@ class FranceTVIE(FranceTVBaseInfoExtractor): _TESTS = [ # france2 { - u'url': u'http://www.france2.fr/emissions/13h15-le-samedi-le-dimanche/videos/75540104', - u'file': u'75540104.mp4', - u'info_dict': { - u'title': u'13h15, le samedi...', - u'description': u'md5:2e5b58ba7a2d3692b35c792be081a03d', + 'url': 'http://www.france2.fr/emissions/13h15-le-samedi-le-dimanche/videos/75540104', + 'file': '75540104.mp4', + 'info_dict': { + 'title': '13h15, le samedi...', + 'description': 'md5:2e5b58ba7a2d3692b35c792be081a03d', }, - u'params': { + 'params': { # m3u8 download - u'skip_download': True, + 'skip_download': True, }, }, # france3 { - u'url': u'http://www.france3.fr/emissions/pieces-a-conviction/diffusions/13-11-2013_145575', - u'info_dict': { - u'id': 
u'000702326_CAPP_PicesconvictionExtrait313022013_120220131722_Au', - u'ext': u'flv', - u'title': u'Le scandale du prix des médicaments', - u'description': u'md5:1384089fbee2f04fc6c9de025ee2e9ce', + 'url': 'http://www.france3.fr/emissions/pieces-a-conviction/diffusions/13-11-2013_145575', + 'info_dict': { + 'id': '000702326_CAPP_PicesconvictionExtrait313022013_120220131722_Au', + 'ext': 'flv', + 'title': 'Le scandale du prix des médicaments', + 'description': 'md5:1384089fbee2f04fc6c9de025ee2e9ce', }, - u'params': { + 'params': { # rtmp download - u'skip_download': True, + 'skip_download': True, }, }, # france4 { - u'url': u'http://www.france4.fr/emissions/hero-corp/videos/rhozet_herocorp_bonus_1_20131106_1923_06112013172108_F4', - u'info_dict': { - u'id': u'rhozet_herocorp_bonus_1_20131106_1923_06112013172108_F4', - u'ext': u'flv', - u'title': u'Hero Corp Making of - Extrait 1', - u'description': u'md5:c87d54871b1790679aec1197e73d650a', + 'url': 'http://www.france4.fr/emissions/hero-corp/videos/rhozet_herocorp_bonus_1_20131106_1923_06112013172108_F4', + 'info_dict': { + 'id': 'rhozet_herocorp_bonus_1_20131106_1923_06112013172108_F4', + 'ext': 'flv', + 'title': 'Hero Corp Making of - Extrait 1', + 'description': 'md5:c87d54871b1790679aec1197e73d650a', }, - u'params': { + 'params': { # rtmp download - u'skip_download': True, + 'skip_download': True, }, }, # france5 { - u'url': u'http://www.france5.fr/emissions/c-a-dire/videos/92837968', - u'info_dict': { - u'id': u'92837968', - u'ext': u'mp4', - u'title': u'C à dire ?!', - u'description': u'md5:fb1db1cbad784dcce7c7a7bd177c8e2f', + 'url': 'http://www.france5.fr/emissions/c-a-dire/videos/92837968', + 'info_dict': { + 'id': '92837968', + 'ext': 'mp4', + 'title': 'C à dire ?!', + 'description': 'md5:fb1db1cbad784dcce7c7a7bd177c8e2f', }, - u'params': { + 'params': { # m3u8 download - u'skip_download': True, + 'skip_download': True, }, }, # franceo { - u'url': u'http://www.franceo.fr/jt/info-afrique/04-12-2013', - u'info_dict': { - u'id': u'92327925', - u'ext': u'mp4', - u'title': u'Infô-Afrique', - u'description': u'md5:ebf346da789428841bee0fd2a935ea55', + 'url': 'http://www.franceo.fr/jt/info-afrique/04-12-2013', + 'info_dict': { + 'id': '92327925', + 'ext': 'mp4', + 'title': 'Infô-Afrique', + 'description': 'md5:ebf346da789428841bee0fd2a935ea55', }, - u'params': { + 'params': { # m3u8 download - u'skip_download': True, + 'skip_download': True, }, - u'skip': u'The id changes frequently', + 'skip': 'The id changes frequently', }, ] @@ -160,26 +163,26 @@ class FranceTVIE(FranceTVBaseInfoExtractor): '\.fr/\?id-video=([^"/&]+)'), (r'<a class="video" id="ftv_player_(.+?)"'), ] - video_id = self._html_search_regex(id_res, webpage, u'video ID') + video_id = self._html_search_regex(id_res, webpage, 'video ID') else: video_id = mobj.group('id') return self._extract_video(video_id) class GenerationQuoiIE(InfoExtractor): - IE_NAME = u'france2.fr:generation-quoi' + IE_NAME = 'france2.fr:generation-quoi' _VALID_URL = r'https?://generation-quoi\.france2\.fr/portrait/(?P<name>.*)(\?|$)' _TEST = { - u'url': u'http://generation-quoi.france2.fr/portrait/garde-a-vous', - u'file': u'k7FJX8VBcvvLmX4wA5Q.mp4', - u'info_dict': { - u'title': u'Génération Quoi - Garde à Vous', - u'uploader': u'Génération Quoi', + 'url': 'http://generation-quoi.france2.fr/portrait/garde-a-vous', + 'file': 'k7FJX8VBcvvLmX4wA5Q.mp4', + 'info_dict': { + 'title': 'Génération Quoi - Garde à Vous', + 'uploader': 'Génération Quoi', }, - u'params': { + 'params': { # It uses Dailymotion - 
u'skip_download': True, + 'skip_download': True, }, } @@ -194,20 +197,20 @@ class GenerationQuoiIE(InfoExtractor): class CultureboxIE(FranceTVBaseInfoExtractor): - IE_NAME = u'culturebox.francetvinfo.fr' + IE_NAME = 'culturebox.francetvinfo.fr' _VALID_URL = r'https?://culturebox\.francetvinfo\.fr/(?P<name>.*?)(\?|$)' _TEST = { - u'url': u'http://culturebox.francetvinfo.fr/einstein-on-the-beach-au-theatre-du-chatelet-146813', - u'info_dict': { - u'id': u'EV_6785', - u'ext': u'mp4', - u'title': u'Einstein on the beach au Théâtre du Châtelet', - u'description': u'md5:9ce2888b1efefc617b5e58b3f6200eeb', + 'url': 'http://culturebox.francetvinfo.fr/einstein-on-the-beach-au-theatre-du-chatelet-146813', + 'info_dict': { + 'id': 'EV_6785', + 'ext': 'mp4', + 'title': 'Einstein on the beach au Théâtre du Châtelet', + 'description': 'md5:9ce2888b1efefc617b5e58b3f6200eeb', }, - u'params': { + 'params': { # m3u8 download - u'skip_download': True, + 'skip_download': True, }, } @@ -215,5 +218,5 @@ class CultureboxIE(FranceTVBaseInfoExtractor): mobj = re.match(self._VALID_URL, url) name = mobj.group('name') webpage = self._download_webpage(url, name) - video_id = self._search_regex(r'"http://videos\.francetv\.fr/video/(.*?)"', webpage, u'video id') + video_id = self._search_regex(r'"http://videos\.francetv\.fr/video/(.*?)"', webpage, 'video id') return self._extract_video(video_id) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 48de379b7..082da9c77 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -38,18 +38,6 @@ class GenericIE(InfoExtractor): 'title': 'R\u00e9gis plante sa Jeep', } }, - # embedded vimeo video - { - 'add_ie': ['Vimeo'], - 'url': 'http://skillsmatter.com/podcast/home/move-semanticsperfect-forwarding-and-rvalue-references', - 'file': '22444065.mp4', - 'md5': '2903896e23df39722c33f015af0666e2', - 'info_dict': { - 'title': 'ACCU 2011: Move Semantics,Perfect Forwarding, and Rvalue references- Scott Meyers- 13/04/2011', - 'uploader_id': 'skillsmatter', - 'uploader': 'Skills Matter', - } - }, # bandcamp page with custom domain { 'add_ie': ['Bandcamp'], @@ -254,7 +242,7 @@ class GenericIE(InfoExtractor): # Look for embedded (iframe) Vimeo player mobj = re.search( - r'<iframe[^>]+?src="((?:https?:)?//player.vimeo.com/video/.+?)"', webpage) + r'<iframe[^>]+?src="((?:https?:)?//player\.vimeo\.com/video/.+?)"', webpage) if mobj: player_url = unescapeHTML(mobj.group(1)) surl = smuggle_url(player_url, {'Referer': url}) @@ -262,7 +250,7 @@ class GenericIE(InfoExtractor): # Look for embedded (swf embed) Vimeo player mobj = re.search( - r'<embed[^>]+?src="(https?://(?:www\.)?vimeo.com/moogaloop.swf.+?)"', webpage) + r'<embed[^>]+?src="(https?://(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage) if mobj: return self.url_result(mobj.group(1), 'Vimeo') @@ -332,7 +320,7 @@ class GenericIE(InfoExtractor): return self.url_result(mobj.group(1), 'Aparat') # Look for MPORA videos - mobj = re.search(r'<iframe .*?src="(http://mpora\.com/videos/[^"]+)"', webpage) + mobj = re.search(r'<iframe .*?src="(http://mpora\.(?:com|de)/videos/[^"]+)"', webpage) if mobj is not None: return self.url_result(mobj.group(1), 'Mpora') @@ -350,7 +338,7 @@ class GenericIE(InfoExtractor): # Look for embedded Huffington Post player mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live.huffingtonpost\.com/.+?)\1', webpage) + r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage) if mobj is not None: return 
self.url_result(mobj.group('url'), 'HuffPost') @@ -358,7 +346,7 @@ class GenericIE(InfoExtractor): mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage) if mobj is None: # Look for gorilla-vid style embedding - mobj = re.search(r'(?s)jw_plugins.*?file:\s*["\'](.*?)["\']', webpage) + mobj = re.search(r'(?s)(?:jw_plugins|JWPlayerOptions).*?file\s*:\s*["\'](.*?)["\']', webpage) if mobj is None: # Broaden the search a little bit mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage) diff --git a/youtube_dl/extractor/infoq.py b/youtube_dl/extractor/infoq.py index c79c589c7..7c208b85d 100644 --- a/youtube_dl/extractor/infoq.py +++ b/youtube_dl/extractor/infoq.py @@ -1,27 +1,27 @@ +from __future__ import unicode_literals + import base64 import re from .common import InfoExtractor from ..utils import ( compat_urllib_parse, - - ExtractorError, ) class InfoQIE(InfoExtractor): _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$' _TEST = { - u"name": u"InfoQ", - u"url": u"http://www.infoq.com/presentations/A-Few-of-My-Favorite-Python-Things", - u"file": u"12-jan-pythonthings.mp4", - u"info_dict": { - u"description": u"Mike Pirnat presents some tips and tricks, standard libraries and third party packages that make programming in Python a richer experience.", - u"title": u"A Few of My Favorite [Python] Things" + "name": "InfoQ", + "url": "http://www.infoq.com/presentations/A-Few-of-My-Favorite-Python-Things", + "file": "12-jan-pythonthings.mp4", + "info_dict": { + "description": "Mike Pirnat presents some tips and tricks, standard libraries and third party packages that make programming in Python a richer experience.", + "title": "A Few of My Favorite [Python] Things", + }, + "params": { + "skip_download": True, }, - u"params": { - u"skip_download": True - } } def _real_extract(self, url): @@ -31,32 +31,25 @@ class InfoQIE(InfoExtractor): self.report_extraction(url) # Extract video URL - mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract video url') - real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8')) + encoded_id = self._search_regex(r"jsclassref ?= ?'([^']*)'", webpage, 'encoded id') + real_id = compat_urllib_parse.unquote(base64.b64decode(encoded_id.encode('ascii')).decode('utf-8')) video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id # Extract title video_title = self._search_regex(r'contentTitle = "(.*?)";', - webpage, u'title') + webpage, 'title') # Extract description video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>', - webpage, u'description', fatal=False) + webpage, 'description', fatal=False) video_filename = video_url.split('/')[-1] video_id, extension = video_filename.split('.') - info = { + return { 'id': video_id, 'url': video_url, - 'uploader': None, - 'upload_date': None, 'title': video_title, 'ext': extension, # Extension is always(?) mp4, but seems to be flv - 'thumbnail': None, 'description': video_description, } - - return [info]
\ No newline at end of file diff --git a/youtube_dl/extractor/lifenews.py b/youtube_dl/extractor/lifenews.py new file mode 100644 index 000000000..4e4035b76 --- /dev/null +++ b/youtube_dl/extractor/lifenews.py @@ -0,0 +1,63 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import unified_strdate + + +class LifeNewsIE(InfoExtractor): + IE_NAME = 'lifenews' + IE_DESC = 'LIFE | NEWS' + _VALID_URL = r'http://lifenews\.ru/(?:mobile/)?news/(?P<id>\d+)' + + _TEST = { + 'url': 'http://lifenews.ru/news/126342', + 'file': '126342.mp4', + 'md5': 'e1b50a5c5fb98a6a544250f2e0db570a', + 'info_dict': { + 'title': 'МВД разыскивает мужчин, оставивших в IKEA сумку с автоматом', + 'description': 'Камеры наблюдения гипермаркета зафиксировали троих мужчин, спрятавших оружейный арсенал в камере хранения.', + 'thumbnail': 'http://lifenews.ru/static/posts/2014/1/126342/.video.jpg', + 'upload_date': '20140130', + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + webpage = self._download_webpage('http://lifenews.ru/mobile/news/%s' % video_id, video_id, 'Downloading page') + + video_url = self._html_search_regex( + r'<video.*?src="([^"]+)"></video>', webpage, 'video URL') + + thumbnail = self._html_search_regex( + r'<video.*?poster="([^"]+)".*?"></video>', webpage, 'video thumbnail') + + title = self._og_search_title(webpage) + TITLE_SUFFIX = ' - Первый по срочным новостям — LIFE | NEWS' + if title.endswith(TITLE_SUFFIX): + title = title[:-len(TITLE_SUFFIX)] + + description = self._og_search_description(webpage) + + view_count = self._html_search_regex( + r'<div class=\'views\'>(\d+)</div>', webpage, 'view count') + comment_count = self._html_search_regex( + r'<div class=\'comments\'>(\d+)</div>', webpage, 'comment count') + + upload_date = self._html_search_regex( + r'<time datetime=\'([^\']+)\'>', webpage, 'upload date') + + return { + 'id': video_id, + 'url': video_url, + 'thumbnail': thumbnail, + 'title': title, + 'description': description, + 'view_count': view_count, + 'comment_count': comment_count, + 'upload_date': unified_strdate(upload_date), + }
\ No newline at end of file diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py index d01fd01e3..4e76c1f4a 100644 --- a/youtube_dl/extractor/liveleak.py +++ b/youtube_dl/extractor/liveleak.py @@ -1,5 +1,6 @@ from __future__ import unicode_literals +import json import re from .common import InfoExtractor @@ -10,7 +11,7 @@ from ..utils import ( class LiveLeakIE(InfoExtractor): _VALID_URL = r'^(?:http://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)' - _TEST = { + _TESTS = [{ 'url': 'http://www.liveleak.com/view?i=757_1364311680', 'file': '757_1364311680.mp4', 'md5': '0813c2430bea7a46bf13acf3406992f4', @@ -19,15 +20,37 @@ class LiveLeakIE(InfoExtractor): 'uploader': 'ljfriel2', 'title': 'Most unlucky car accident' } - } + }, + { + 'url': 'http://www.liveleak.com/view?i=f93_1390833151', + 'file': 'f93_1390833151.mp4', + 'md5': 'd3f1367d14cc3c15bf24fbfbe04b9abf', + 'info_dict': { + 'description': 'German Television Channel NDR does an exclusive interview with Edward Snowden.\r\nUploaded on LiveLeak cause German Television thinks the rest of the world isn\'t intereseted in Edward Snowden.', + 'uploader': 'ARD_Stinkt', + 'title': 'German Television does first Edward Snowden Interview (ENGLISH)', + } + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('video_id') webpage = self._download_webpage(url, video_id) - video_url = self._search_regex( - r'file: "(.*?)",', webpage, 'video URL') + sources_raw = self._search_regex( + r'(?s)sources:\s*(\[.*?\]),', webpage, 'video URLs', default=None) + if sources_raw is None: + sources_raw = '[{ %s}]' % ( + self._search_regex(r'(file: ".*?"),', webpage, 'video URL')) + + sources_json = re.sub(r'\s([a-z]+):\s', r'"\1": ', sources_raw) + sources = json.loads(sources_json) + + formats = [{ + 'format_note': s.get('label'), + 'url': s['file'], + } for s in sources] + self._sort_formats(formats) video_title = self._og_search_title(webpage).replace('LiveLeak.com -', '').strip() video_description = self._og_search_description(webpage) @@ -36,9 +59,8 @@ class LiveLeakIE(InfoExtractor): return { 'id': video_id, - 'url': video_url, - 'ext': 'mp4', 'title': video_title, 'description': video_description, - 'uploader': video_uploader + 'uploader': video_uploader, + 'formats': formats, } diff --git a/youtube_dl/extractor/malemotion.py b/youtube_dl/extractor/malemotion.py index 62e99091d..8c1966ab2 100644 --- a/youtube_dl/extractor/malemotion.py +++ b/youtube_dl/extractor/malemotion.py @@ -16,7 +16,8 @@ class MalemotionIE(InfoExtractor): 'info_dict': { "title": "Bien dur", "age_limit": 18, - } + }, + 'skip': 'This video has been deleted.' 
} def _real_extract(self, url): diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index f6f31bfdc..4521451ac 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -119,7 +119,9 @@ class MTVServicesInfoExtractor(InfoExtractor): if mgid.endswith('.swf'): mgid = mgid[:-4] except RegexNotFoundError: - mgid = self._search_regex(r'data-mgid="(.*?)"', webpage, u'mgid') + mgid = self._search_regex( + [r'data-mgid="(.*?)"', r'swfobject.embedSWF\(".*?(mgid:.*?)"'], + webpage, u'mgid') return self._get_videos_info(mgid) diff --git a/youtube_dl/extractor/myspass.py b/youtube_dl/extractor/myspass.py index 4becddee6..4fa0575f8 100644 --- a/youtube_dl/extractor/myspass.py +++ b/youtube_dl/extractor/myspass.py @@ -1,3 +1,4 @@ +from __future__ import unicode_literals import os.path from .common import InfoExtractor @@ -11,13 +12,13 @@ from ..utils import ( class MySpassIE(InfoExtractor): _VALID_URL = r'http://www\.myspass\.de/.*' _TEST = { - u'url': u'http://www.myspass.de/myspass/shows/tvshows/absolute-mehrheit/Absolute-Mehrheit-vom-17022013-Die-Highlights-Teil-2--/11741/', - u'file': u'11741.mp4', - u'md5': u'0b49f4844a068f8b33f4b7c88405862b', - u'info_dict': { - u"description": u"Wer kann in die Fu\u00dfstapfen von Wolfgang Kubicki treten und die Mehrheit der Zuschauer hinter sich versammeln? Wird vielleicht sogar die Absolute Mehrheit geknackt und der Jackpot von 200.000 Euro mit nach Hause genommen?", - u"title": u"Absolute Mehrheit vom 17.02.2013 - Die Highlights, Teil 2" - } + 'url': 'http://www.myspass.de/myspass/shows/tvshows/absolute-mehrheit/Absolute-Mehrheit-vom-17022013-Die-Highlights-Teil-2--/11741/', + 'file': '11741.mp4', + 'md5': '0b49f4844a068f8b33f4b7c88405862b', + 'info_dict': { + "description": "Wer kann in die Fu\u00dfstapfen von Wolfgang Kubicki treten und die Mehrheit der Zuschauer hinter sich versammeln? 
Wird vielleicht sogar die Absolute Mehrheit geknackt und der Jackpot von 200.000 Euro mit nach Hause genommen?", + "title": "Absolute Mehrheit vom 17.02.2013 - Die Highlights, Teil 2", + }, } def _real_extract(self, url): @@ -37,12 +38,11 @@ class MySpassIE(InfoExtractor): # extract values from metadata url_flv_el = metadata.find('url_flv') if url_flv_el is None: - raise ExtractorError(u'Unable to extract download url') + raise ExtractorError('Unable to extract download url') video_url = url_flv_el.text - extension = os.path.splitext(video_url)[1][1:] title_el = metadata.find('title') if title_el is None: - raise ExtractorError(u'Unable to extract title') + raise ExtractorError('Unable to extract title') title = title_el.text format_id_el = metadata.find('format_id') if format_id_el is None: @@ -59,13 +59,12 @@ class MySpassIE(InfoExtractor): thumbnail = imagePreview_el.text else: thumbnail = None - info = { + + return { 'id': video_id, 'url': video_url, 'title': title, - 'ext': extension, 'format': format, 'thumbnail': thumbnail, - 'description': description + 'description': description, } - return [info] diff --git a/youtube_dl/extractor/nba.py b/youtube_dl/extractor/nba.py index 0f178905b..7e421610e 100644 --- a/youtube_dl/extractor/nba.py +++ b/youtube_dl/extractor/nba.py @@ -1,48 +1,39 @@ +from __future__ import unicode_literals + import re from .common import InfoExtractor -from ..utils import ( - ExtractorError, -) class NBAIE(InfoExtractor): _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$' _TEST = { - u'url': u'http://www.nba.com/video/games/nets/2012/12/04/0021200253-okc-bkn-recap.nba/index.html', - u'file': u'0021200253-okc-bkn-recap.nba.mp4', - u'md5': u'c0edcfc37607344e2ff8f13c378c88a4', - u'info_dict': { - u"description": u"Kevin Durant scores 32 points and dishes out six assists as the Thunder beat the Nets in Brooklyn.", - u"title": u"Thunder vs. Nets" - } + 'url': 'http://www.nba.com/video/games/nets/2012/12/04/0021200253-okc-bkn-recap.nba/index.html', + 'file': u'0021200253-okc-bkn-recap.nba.mp4', + 'md5': u'c0edcfc37607344e2ff8f13c378c88a4', + 'info_dict': { + 'description': 'Kevin Durant scores 32 points and dishes out six assists as the Thunder beat the Nets in Brooklyn.', + 'title': 'Thunder vs. 
Nets', + }, } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - video_id = mobj.group(1) webpage = self._download_webpage(url, video_id) - video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4' + video_url = 'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4' shortened_video_id = video_id.rpartition('/')[2] title = self._og_search_title(webpage, default=shortened_video_id).replace('NBA.com: ', '') - # It isn't there in the HTML it returns to us - # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False) - description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False) - info = { + return { 'id': shortened_video_id, 'url': video_url, 'ext': 'mp4', 'title': title, - # 'uploader_date': uploader_date, 'description': description, } - return [info] diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py index d08e47734..44312ba4e 100644 --- a/youtube_dl/extractor/ooyala.py +++ b/youtube_dl/extractor/ooyala.py @@ -5,7 +5,7 @@ from .common import InfoExtractor from ..utils import unescapeHTML class OoyalaIE(InfoExtractor): - _VALID_URL = r'https?://.+?\.ooyala\.com/.*?embedCode=(?P<id>.+?)(&|$)' + _VALID_URL = r'https?://.+?\.ooyala\.com/.*?(?:embedCode|ec)=(?P<id>.+?)(&|$)' _TEST = { # From http://it.slashdot.org/story/13/04/25/178216/recovering-data-from-broken-hard-drives-and-ssds-video diff --git a/youtube_dl/extractor/rbmaradio.py b/youtube_dl/extractor/rbmaradio.py index 4b6147a73..b9cb7abd1 100644 --- a/youtube_dl/extractor/rbmaradio.py +++ b/youtube_dl/extractor/rbmaradio.py @@ -1,3 +1,6 @@ +# encoding: utf-8 +from __future__ import unicode_literals + import json import re @@ -12,16 +15,16 @@ from ..utils import ( class RBMARadioIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$' _TEST = { - u'url': u'http://www.rbmaradio.com/shows/ford-lopatin-live-at-primavera-sound-2011', - u'file': u'ford-lopatin-live-at-primavera-sound-2011.mp3', - u'md5': u'6bc6f9bcb18994b4c983bc3bf4384d95', - u'info_dict': { - u"uploader_id": u"ford-lopatin", - u"location": u"Spain", - u"description": u"Joel Ford and Daniel \u2019Oneohtrix Point Never\u2019 Lopatin fly their midified pop extravaganza to Spain. Live at Primavera Sound 2011.", - u"uploader": u"Ford & Lopatin", - u"title": u"Live at Primavera Sound 2011" - } + 'url': 'http://www.rbmaradio.com/shows/ford-lopatin-live-at-primavera-sound-2011', + 'file': 'ford-lopatin-live-at-primavera-sound-2011.mp3', + 'md5': '6bc6f9bcb18994b4c983bc3bf4384d95', + 'info_dict': { + "uploader_id": "ford-lopatin", + "location": "Spain", + "description": "Joel Ford and Daniel ’Oneohtrix Point Never’ Lopatin fly their midified pop extravaganza to Spain. 
Live at Primavera Sound 2011.", + "uploader": "Ford & Lopatin", + "title": "Live at Primavera Sound 2011", + }, } def _real_extract(self, url): @@ -31,26 +34,24 @@ class RBMARadioIE(InfoExtractor): webpage = self._download_webpage(url, video_id) json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$', - webpage, u'json data', flags=re.MULTILINE) + webpage, 'json data', flags=re.MULTILINE) try: data = json.loads(json_data) except ValueError as e: - raise ExtractorError(u'Invalid JSON: ' + str(e)) + raise ExtractorError('Invalid JSON: ' + str(e)) video_url = data['akamai_url'] + '&cbr=256' url_parts = compat_urllib_parse_urlparse(video_url) - video_ext = url_parts.path.rpartition('.')[2] - info = { - 'id': video_id, - 'url': video_url, - 'ext': video_ext, - 'title': data['title'], - 'description': data.get('teaser_text'), - 'location': data.get('country_of_origin'), - 'uploader': data.get('host', {}).get('name'), - 'uploader_id': data.get('host', {}).get('slug'), - 'thumbnail': data.get('image', {}).get('large_url_2x'), - 'duration': data.get('duration'), + + return { + 'id': video_id, + 'url': video_url, + 'title': data['title'], + 'description': data.get('teaser_text'), + 'location': data.get('country_of_origin'), + 'uploader': data.get('host', {}).get('name'), + 'uploader_id': data.get('host', {}).get('slug'), + 'thumbnail': data.get('image', {}).get('large_url_2x'), + 'duration': data.get('duration'), } - return [info] diff --git a/youtube_dl/extractor/ro220.py b/youtube_dl/extractor/ro220.py index c32f64d99..4678f62df 100644 --- a/youtube_dl/extractor/ro220.py +++ b/youtube_dl/extractor/ro220.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import re from .common import InfoExtractor @@ -11,12 +13,12 @@ class Ro220IE(InfoExtractor): IE_NAME = '220.ro' _VALID_URL = r'(?x)(?:https?://)?(?:www\.)?220\.ro/(?P<category>[^/]+)/(?P<shorttitle>[^/]+)/(?P<video_id>[^/]+)' _TEST = { - u"url": u"http://www.220.ro/sport/Luati-Le-Banii-Sez-4-Ep-1/LYV6doKo7f/", - u'file': u'LYV6doKo7f.mp4', - u'md5': u'03af18b73a07b4088753930db7a34add', - u'info_dict': { - u"title": u"Luati-le Banii sez 4 ep 1", - u"description": u"Iata-ne reveniti dupa o binemeritata vacanta. Va astept si pe Facebook cu pareri si comentarii.", + "url": "http://www.220.ro/sport/Luati-Le-Banii-Sez-4-Ep-1/LYV6doKo7f/", + 'file': 'LYV6doKo7f.mp4', + 'md5': '03af18b73a07b4088753930db7a34add', + 'info_dict': { + "title": "Luati-le Banii sez 4 ep 1", + "description": "Iata-ne reveniti dupa o binemeritata vacanta. 
Va astept si pe Facebook cu pareri si comentarii.", } } @@ -27,10 +29,10 @@ class Ro220IE(InfoExtractor): webpage = self._download_webpage(url, video_id) flashVars_str = self._search_regex( r'<param name="flashVars" value="([^"]+)"', - webpage, u'flashVars') + webpage, 'flashVars') flashVars = compat_parse_qs(flashVars_str) - info = { + return { '_type': 'video', 'id': video_id, 'ext': 'mp4', @@ -39,4 +41,3 @@ class Ro220IE(InfoExtractor): 'description': clean_html(flashVars['desc'][0]), 'thumbnail': flashVars['preview'][0], } - return info diff --git a/youtube_dl/extractor/spiegel.py b/youtube_dl/extractor/spiegel.py index 051a34d5b..9156d7faf 100644 --- a/youtube_dl/extractor/spiegel.py +++ b/youtube_dl/extractor/spiegel.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import re from .common import InfoExtractor @@ -6,20 +8,20 @@ from .common import InfoExtractor class SpiegelIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$' _TESTS = [{ - u'url': u'http://www.spiegel.de/video/vulkan-tungurahua-in-ecuador-ist-wieder-aktiv-video-1259285.html', - u'file': u'1259285.mp4', - u'md5': u'2c2754212136f35fb4b19767d242f66e', - u'info_dict': { - u"title": u"Vulkanausbruch in Ecuador: Der \"Feuerschlund\" ist wieder aktiv" - } + 'url': 'http://www.spiegel.de/video/vulkan-tungurahua-in-ecuador-ist-wieder-aktiv-video-1259285.html', + 'file': '1259285.mp4', + 'md5': '2c2754212136f35fb4b19767d242f66e', + 'info_dict': { + 'title': 'Vulkanausbruch in Ecuador: Der "Feuerschlund" ist wieder aktiv', + }, }, { - u'url': u'http://www.spiegel.de/video/schach-wm-videoanalyse-des-fuenften-spiels-video-1309159.html', - u'file': u'1309159.mp4', - u'md5': u'f2cdf638d7aa47654e251e1aee360af1', - u'info_dict': { - u'title': u'Schach-WM in der Videoanalyse: Carlsen nutzt die Fehlgriffe des Titelverteidigers' - } + 'url': 'http://www.spiegel.de/video/schach-wm-videoanalyse-des-fuenften-spiels-video-1309159.html', + 'file': '1309159.mp4', + 'md5': 'f2cdf638d7aa47654e251e1aee360af1', + 'info_dict': { + 'title': 'Schach-WM in der Videoanalyse: Carlsen nutzt die Fehlgriffe des Titelverteidigers', + }, }] def _real_extract(self, url): @@ -29,17 +31,17 @@ class SpiegelIE(InfoExtractor): webpage = self._download_webpage(url, video_id) video_title = self._html_search_regex( - r'<div class="module-title">(.*?)</div>', webpage, u'title') + r'<div class="module-title">(.*?)</div>', webpage, 'title') - xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml' + xml_url = 'http://video2.spiegel.de/flash/' + video_id + '.xml' idoc = self._download_xml( xml_url, video_id, - note=u'Downloading XML', errnote=u'Failed to download XML') + note='Downloading XML', errnote='Failed to download XML') formats = [ { 'format_id': n.tag.rpartition('type')[2], - 'url': u'http://video2.spiegel.de/flash/' + n.find('./filename').text, + 'url': 'http://video2.spiegel.de/flash/' + n.find('./filename').text, 'width': int(n.find('./width').text), 'height': int(n.find('./height').text), 'abr': int(n.find('./audiobitrate').text), @@ -55,10 +57,9 @@ class SpiegelIE(InfoExtractor): self._sort_formats(formats) - info = { + return { 'id': video_id, 'title': video_title, 'duration': duration, 'formats': formats, } - return info diff --git a/youtube_dl/extractor/tutv.py b/youtube_dl/extractor/tutv.py index 4e404fbf5..c980153ec 100644 --- a/youtube_dl/extractor/tutv.py +++ b/youtube_dl/extractor/tutv.py @@ -1,3 +1,4 @@ +from __future__ import unicode_literals import base64 
import re @@ -6,15 +7,16 @@ from ..utils import ( compat_parse_qs, ) + class TutvIE(InfoExtractor): - _VALID_URL=r'https?://(?:www\.)?tu\.tv/videos/(?P<id>[^/?]+)' + _VALID_URL = r'https?://(?:www\.)?tu\.tv/videos/(?P<id>[^/?]+)' _TEST = { - u'url': u'http://tu.tv/videos/noah-en-pabellon-cuahutemoc', - u'file': u'2742556.flv', - u'md5': u'5eb766671f69b82e528dc1e7769c5cb2', - u'info_dict': { - u"title": u"Noah en pabellon cuahutemoc" - } + 'url': 'http://tu.tv/videos/noah-en-pabellon-cuahutemoc', + 'file': '2742556.flv', + 'md5': '5eb766671f69b82e528dc1e7769c5cb2', + 'info_dict': { + 'title': 'Noah en pabellon cuahutemoc', + }, } def _real_extract(self, url): @@ -22,18 +24,15 @@ class TutvIE(InfoExtractor): video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - internal_id = self._search_regex(r'codVideo=([0-9]+)', webpage, u'internal video ID') + internal_id = self._search_regex(r'codVideo=([0-9]+)', webpage, 'internal video ID') - data_url = u'http://tu.tv/flvurl.php?codVideo=' + str(internal_id) - data_content = self._download_webpage(data_url, video_id, note=u'Downloading video info') + data_url = 'http://tu.tv/flvurl.php?codVideo=' + str(internal_id) + data_content = self._download_webpage(data_url, video_id, note='Downloading video info') data = compat_parse_qs(data_content) video_url = base64.b64decode(data['kpt'][0]).decode('utf-8') - ext = video_url.partition(u'?')[0].rpartition(u'.')[2] - info = { + return { 'id': internal_id, 'url': video_url, - 'ext': ext, 'title': self._og_search_title(webpage), } - return [info] diff --git a/youtube_dl/extractor/ustream.py b/youtube_dl/extractor/ustream.py index 74c82587f..7fa2b9e15 100644 --- a/youtube_dl/extractor/ustream.py +++ b/youtube_dl/extractor/ustream.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import json import re @@ -10,48 +12,48 @@ from ..utils import ( class UstreamIE(InfoExtractor): _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)' - IE_NAME = u'ustream' + IE_NAME = 'ustream' _TEST = { - u'url': u'http://www.ustream.tv/recorded/20274954', - u'file': u'20274954.flv', - u'md5': u'088f151799e8f572f84eb62f17d73e5c', - u'info_dict': { - u"uploader": u"Young Americans for Liberty", - u"title": u"Young Americans for Liberty February 7, 2012 2:28 AM" - } + 'url': 'http://www.ustream.tv/recorded/20274954', + 'file': '20274954.flv', + 'md5': '088f151799e8f572f84eb62f17d73e5c', + 'info_dict': { + "uploader": "Young Americans for Liberty", + "title": "Young Americans for Liberty February 7, 2012 2:28 AM", + }, } def _real_extract(self, url): m = re.match(self._VALID_URL, url) video_id = m.group('videoID') - video_url = u'http://tcdn.ustream.tv/video/%s' % video_id + video_url = 'http://tcdn.ustream.tv/video/%s' % video_id webpage = self._download_webpage(url, video_id) self.report_extraction(video_id) video_title = self._html_search_regex(r'data-title="(?P<title>.+)"', - webpage, u'title') + webpage, 'title') uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>', - webpage, u'uploader', fatal=False, flags=re.DOTALL) + webpage, 'uploader', fatal=False, flags=re.DOTALL) thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"', - webpage, u'thumbnail', fatal=False) - - info = { - 'id': video_id, - 'url': video_url, - 'ext': 'flv', - 'title': video_title, - 'uploader': uploader, - 'thumbnail': thumbnail, - } - return info + webpage, 'thumbnail', fatal=False) + + return { + 'id': video_id, + 'url': video_url, + 'ext': 'flv', 
+ 'title': video_title, + 'uploader': uploader, + 'thumbnail': thumbnail, + } + class UstreamChannelIE(InfoExtractor): _VALID_URL = r'https?://www\.ustream\.tv/channel/(?P<slug>.+)' - IE_NAME = u'ustream:channel' + IE_NAME = 'ustream:channel' def _real_extract(self, url): m = re.match(self._VALID_URL, url) diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index a4b26a26f..f0673972c 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import re import json import xml.etree.ElementTree @@ -22,16 +24,16 @@ class VevoIE(InfoExtractor): vevo:) (?P<id>[^&?#]+)''' _TESTS = [{ - u'url': u'http://www.vevo.com/watch/hurts/somebody-to-die-for/GB1101300280', - u'file': u'GB1101300280.mp4', - u"md5": u"06bea460acb744eab74a9d7dcb4bfd61", - u'info_dict': { - u"upload_date": u"20130624", - u"uploader": u"Hurts", - u"title": u"Somebody to Die For", - u"duration": 230.12, - u"width": 1920, - u"height": 1080, + 'url': 'http://www.vevo.com/watch/hurts/somebody-to-die-for/GB1101300280', + 'file': 'GB1101300280.mp4', + "md5": "06bea460acb744eab74a9d7dcb4bfd61", + 'info_dict': { + "upload_date": "20130624", + "uploader": "Hurts", + "title": "Somebody to Die For", + "duration": 230.12, + "width": 1920, + "height": 1080, } }] _SMIL_BASE_URL = 'http://smil.lvl3.vevo.com/' @@ -44,7 +46,7 @@ class VevoIE(InfoExtractor): if version['version'] > last_version['version']: last_version = version if last_version['version'] == -1: - raise ExtractorError(u'Unable to extract last version of the video') + raise ExtractorError('Unable to extract last version of the video') renditions = xml.etree.ElementTree.fromstring(last_version['data']) formats = [] @@ -85,7 +87,7 @@ class VevoIE(InfoExtractor): format_url = self._SMIL_BASE_URL + m.group('path') formats.append({ 'url': format_url, - 'format_id': u'SMIL_' + m.group('cbr'), + 'format_id': 'SMIL_' + m.group('cbr'), 'vcodec': m.group('vcodec'), 'acodec': m.group('acodec'), 'vbr': int(m.group('vbr')), @@ -101,26 +103,25 @@ class VevoIE(InfoExtractor): video_id = mobj.group('id') json_url = 'http://videoplayer.vevo.com/VideoService/AuthenticateVideo?isrc=%s' % video_id - info_json = self._download_webpage(json_url, video_id, u'Downloading json info') - video_info = json.loads(info_json)['video'] + video_info = self._download_json(json_url, video_id)['video'] formats = self._formats_from_json(video_info) try: smil_url = '%s/Video/V2/VFILE/%s/%sr.smil' % ( self._SMIL_BASE_URL, video_id, video_id.lower()) smil_xml = self._download_webpage(smil_url, video_id, - u'Downloading SMIL info') + 'Downloading SMIL info') formats.extend(self._formats_from_smil(smil_xml)) except ExtractorError as ee: if not isinstance(ee.cause, compat_HTTPError): raise self._downloader.report_warning( - u'Cannot download SMIL information, falling back to JSON ..') + 'Cannot download SMIL information, falling back to JSON ..') timestamp_ms = int(self._search_regex( - r'/Date\((\d+)\)/', video_info['launchDate'], u'launch date')) + r'/Date\((\d+)\)/', video_info['launchDate'], 'launch date')) upload_date = datetime.datetime.fromtimestamp(timestamp_ms // 1000) - info = { + return { 'id': video_id, 'title': video_info['title'], 'formats': formats, @@ -129,5 +130,3 @@ class VevoIE(InfoExtractor): 'uploader': video_info['mainArtists'][0]['artistName'], 'duration': video_info['duration'], } - - return info diff --git a/youtube_dl/extractor/youjizz.py b/youtube_dl/extractor/youjizz.py index e971b5b4b..fcb5ff758 
100644 --- a/youtube_dl/extractor/youjizz.py +++ b/youtube_dl/extractor/youjizz.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import re from .common import InfoExtractor @@ -9,12 +11,12 @@ from ..utils import ( class YouJizzIE(InfoExtractor): _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+)\.html$' _TEST = { - u'url': u'http://www.youjizz.com/videos/zeichentrick-1-2189178.html', - u'file': u'2189178.flv', - u'md5': u'07e15fa469ba384c7693fd246905547c', - u'info_dict': { - u"title": u"Zeichentrick 1", - u"age_limit": 18, + 'url': 'http://www.youjizz.com/videos/zeichentrick-1-2189178.html', + 'file': '2189178.flv', + 'md5': '07e15fa469ba384c7693fd246905547c', + 'info_dict': { + "title": "Zeichentrick 1", + "age_limit": 18, } } @@ -30,12 +32,12 @@ class YouJizzIE(InfoExtractor): # Get the video title video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>', - webpage, u'title').strip() + webpage, 'title').strip() # Get the embed page result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage) if result is None: - raise ExtractorError(u'ERROR: unable to extract embed page') + raise ExtractorError('ERROR: unable to extract embed page') embed_page_url = result.group(0).strip() video_id = result.group('videoid') @@ -47,23 +49,23 @@ class YouJizzIE(InfoExtractor): if m_playlist is not None: playlist_url = m_playlist.group('playlist') playlist_page = self._download_webpage(playlist_url, video_id, - u'Downloading playlist page') + 'Downloading playlist page') m_levels = list(re.finditer(r'<level bitrate="(\d+?)" file="(.*?)"', playlist_page)) if len(m_levels) == 0: - raise ExtractorError(u'Unable to extract video url') + raise ExtractorError('Unable to extract video url') videos = [(int(m.group(1)), m.group(2)) for m in m_levels] (_, video_url) = sorted(videos)[0] video_url = video_url.replace('%252F', '%2F') else: video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', - webpage, u'video URL') - - info = {'id': video_id, - 'url': video_url, - 'title': video_title, - 'ext': 'flv', - 'format': 'flv', - 'player_url': embed_page_url, - 'age_limit': age_limit} + webpage, 'video URL') - return [info] + return { + 'id': video_id, + 'url': video_url, + 'title': video_title, + 'ext': 'flv', + 'format': 'flv', + 'player_url': embed_page_url, + 'age_limit': age_limit, + } diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 87a5a452e..54592d174 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1662,7 +1662,7 @@ class YoutubeUserIE(InfoExtractor): '_type': 'url', 'url': video_id, 'ie_key': 'Youtube', - 'id': 'video_id', + 'id': video_id, 'title': title, } url_results = PagedList(download_page, self._GDATA_PAGE_SIZE) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index aab85706a..b5748c14e 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.01.28.1' +__version__ = '2014.01.30.2' |