diff options
Diffstat (limited to 'youtube_dl')
37 files changed, 626 insertions, 275 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 86bff185b..e51ea701f 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -190,6 +190,7 @@ from .livestream import ( LivestreamOriginalIE, LivestreamShortenerIE, ) +from .lrt import LRTIE from .lynda import ( LyndaIE, LyndaCourseIE @@ -354,6 +355,7 @@ from .swrmediathek import SWRMediathekIE from .syfy import SyfyIE from .sztvhu import SztvHuIE from .tagesschau import TagesschauIE +from .tapely import TapelyIE from .teachertube import ( TeacherTubeIE, TeacherTubeUserIE, @@ -371,7 +373,10 @@ from .thisav import ThisAVIE from .tinypic import TinyPicIE from .tlc import TlcIE, TlcDeIE from .tnaflix import TNAFlixIE -from .thvideo import THVideoIE +from .thvideo import ( + THVideoIE, + THVideoPlaylistIE +) from .toutv import TouTvIE from .toypics import ToypicsUserIE, ToypicsIE from .traileraddict import TrailerAddictIE diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 957d35979..c3d02f85e 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -86,11 +86,15 @@ class ArteTVPlus7IE(InfoExtractor): info = self._download_json(json_url, video_id) player_info = info['videoJsonPlayer'] + upload_date_str = player_info.get('shootingDate') + if not upload_date_str: + upload_date_str = player_info.get('VDA', '').split(' ')[0] + info_dict = { 'id': player_info['VID'], 'title': player_info['VTI'], 'description': player_info.get('VDE'), - 'upload_date': unified_strdate(player_info.get('VDA', '').split(' ')[0]), + 'upload_date': unified_strdate(upload_date_str), 'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'), } diff --git a/youtube_dl/extractor/br.py b/youtube_dl/extractor/br.py index 4e2960c62..2e277c8c3 100644 --- a/youtube_dl/extractor/br.py +++ b/youtube_dl/extractor/br.py @@ -26,6 +26,8 @@ class BRIE(InfoExtractor): 'title': 'Wenn das Traditions-Theater wackelt', 'description': 'Heimatsound-Festival 2014: Wenn das Traditions-Theater wackelt', 'duration': 34, + 'uploader': 'BR', + 'upload_date': '20140802', } }, { @@ -66,8 +68,7 @@ class BRIE(InfoExtractor): ] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('id') + display_id = self._match_id(url) page = self._download_webpage(url, display_id) xml_url = self._search_regex( r"return BRavFramework\.register\(BRavFramework\('avPlayer_(?:[a-f0-9-]{36})'\)\.setup\({dataURL:'(/(?:[a-z0-9\-]+/)+[a-z0-9/~_.-]+)'}\)\);", page, 'XMLURL') diff --git a/youtube_dl/extractor/cliphunter.py b/youtube_dl/extractor/cliphunter.py index 65c12136a..d4227e6eb 100644 --- a/youtube_dl/extractor/cliphunter.py +++ b/youtube_dl/extractor/cliphunter.py @@ -35,7 +35,6 @@ class CliphunterIE(InfoExtractor): 'title': 'Fun Jynx Maze solo', 'thumbnail': 're:^https?://.*\.jpg$', 'age_limit': 18, - 'duration': 1317, } } @@ -86,14 +85,11 @@ class CliphunterIE(InfoExtractor): thumbnail = self._search_regex( r"var\s+mov_thumb\s*=\s*'([^']+)';", webpage, 'thumbnail', fatal=False) - duration = int_or_none(self._search_regex( - r'pl_dur\s*=\s*([0-9]+)', webpage, 'duration', fatal=False)) return { 'id': video_id, 'title': video_title, 'formats': formats, - 'duration': duration, 'age_limit': self._rta_search(webpage), 'thumbnail': thumbnail, } diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index f43a0a569..611cf95f1 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -334,7 +334,11 @@ class InfoExtractor(object): try: return json.loads(json_string) except ValueError as ve: - raise ExtractorError('Failed to download JSON', cause=ve) + errmsg = '%s: Failed to parse JSON ' % video_id + if fatal: + raise ExtractorError(errmsg, cause=ve) + else: + self.report_warning(errmsg + str(ve)) def report_warning(self, msg, video_id=None): idstr = '' if video_id is None else '%s: ' % video_id diff --git a/youtube_dl/extractor/dropbox.py b/youtube_dl/extractor/dropbox.py index 817a9bd61..5f24ac721 100644 --- a/youtube_dl/extractor/dropbox.py +++ b/youtube_dl/extractor/dropbox.py @@ -29,9 +29,8 @@ class DropboxIE(InfoExtractor): video_id = mobj.group('id') fn = compat_urllib_parse_unquote(url_basename(url)) title = os.path.splitext(fn)[0] - video_url = ( - re.sub(r'[?&]dl=0', '', url) + - ('?' if '?' in url else '&') + 'dl=1') + video_url = re.sub(r'[?&]dl=0', '', url) + video_url += ('?' if '?' not in video_url else '&') + 'dl=1' return { 'id': video_id, diff --git a/youtube_dl/extractor/eporner.py b/youtube_dl/extractor/eporner.py index 522aa3d63..bb231ecb1 100644 --- a/youtube_dl/extractor/eporner.py +++ b/youtube_dl/extractor/eporner.py @@ -14,11 +14,11 @@ class EpornerIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?eporner\.com/hd-porn/(?P<id>\d+)/(?P<display_id>[\w-]+)' _TEST = { 'url': 'http://www.eporner.com/hd-porn/95008/Infamous-Tiffany-Teen-Strip-Tease-Video/', - 'md5': '3b427ae4b9d60619106de3185c2987cd', + 'md5': '39d486f046212d8e1b911c52ab4691f8', 'info_dict': { 'id': '95008', 'display_id': 'Infamous-Tiffany-Teen-Strip-Tease-Video', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Infamous Tiffany Teen Strip Tease Video', 'duration': 194, 'view_count': int, diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 60e68d98a..3ad993751 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -35,7 +35,7 @@ class FacebookIE(InfoExtractor): 'id': '637842556329505', 'ext': 'mp4', 'duration': 38, - 'title': 'Did you know Kei Nishikori is the first Asian man to ever reach a Grand Slam fin...', + 'title': 're:Did you know Kei Nishikori is the first Asian man to ever reach a Grand Slam', } }, { 'note': 'Video without discernible title', diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py index 721e5fce0..d966e8403 100644 --- a/youtube_dl/extractor/funnyordie.py +++ b/youtube_dl/extractor/funnyordie.py @@ -21,7 +21,7 @@ class FunnyOrDieIE(InfoExtractor): }, }, { 'url': 'http://www.funnyordie.com/embed/e402820827', - 'md5': 'ff4d83318f89776ed0250634cfaa8d36', + 'md5': '29f4c5e5a61ca39dfd7e8348a75d0aad', 'info_dict': { 'id': 'e402820827', 'ext': 'mp4', diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 0dfa4853d..14c024e48 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -155,7 +155,6 @@ class GenericIE(InfoExtractor): # funnyordie embed { 'url': 'http://www.theguardian.com/world/2014/mar/11/obama-zach-galifianakis-between-two-ferns', - 'md5': '7cf780be104d40fea7bae52eed4a470e', 'info_dict': { 'id': '18e820ec3f', 'ext': 'mp4', @@ -180,13 +179,13 @@ class GenericIE(InfoExtractor): # Embedded TED video { 'url': 'http://en.support.wordpress.com/videos/ted-talks/', - 'md5': 'deeeabcc1085eb2ba205474e7235a3d5', + 'md5': '65fdff94098e4a607385a60c5177c638', 'info_dict': { - 'id': '981', + 'id': '1969', 'ext': 'mp4', - 'title': 'My web playroom', - 'uploader': 'Ze Frank', - 'description': 'md5:ddb2a40ecd6b6a147e400e535874947b', + 'title': 'Hidden miracles of the natural world', + 'uploader': 'Louie Schwartzberg', + 'description': 'md5:8145d19d320ff3e52f28401f4c4283b9', } }, # Embeded Ustream video @@ -226,21 +225,6 @@ class GenericIE(InfoExtractor): 'skip_download': 'Requires rtmpdump' } }, - # smotri embed - { - 'url': 'http://rbctv.rbc.ru/archive/news/562949990879132.shtml', - 'md5': 'ec40048448e9284c9a1de77bb188108b', - 'info_dict': { - 'id': 'v27008541fad', - 'ext': 'mp4', - 'title': 'Крым и Севастополь вошли в состав России', - 'description': 'md5:fae01b61f68984c7bd2fa741e11c3175', - 'duration': 900, - 'upload_date': '20140318', - 'uploader': 'rbctv_2012_4', - 'uploader_id': 'rbctv_2012_4', - }, - }, # Condé Nast embed { 'url': 'http://www.wired.com/2014/04/honda-asimo/', @@ -295,13 +279,13 @@ class GenericIE(InfoExtractor): { 'url': 'https://play.google.com/store/apps/details?id=com.gameloft.android.ANMP.GloftA8HM', 'info_dict': { - 'id': 'jpSGZsgga_I', + 'id': '4vAffPZIT44', 'ext': 'mp4', - 'title': 'Asphalt 8: Airborne - Launch Trailer', + 'title': 'Asphalt 8: Airborne - Update - Welcome to Dubai!', 'uploader': 'Gameloft', 'uploader_id': 'gameloft', - 'upload_date': '20130821', - 'description': 'md5:87bd95f13d8be3e7da87a5f2c443106a', + 'upload_date': '20140828', + 'description': 'md5:c80da9ed3d83ae6d1876c834de03e1c4', }, 'params': { 'skip_download': True, diff --git a/youtube_dl/extractor/godtube.py b/youtube_dl/extractor/godtube.py index 73bd6d890..363dc6608 100644 --- a/youtube_dl/extractor/godtube.py +++ b/youtube_dl/extractor/godtube.py @@ -36,16 +36,16 @@ class GodTubeIE(InfoExtractor): 'http://www.godtube.com/resource/mediaplayer/%s.xml' % video_id.lower(), video_id, 'Downloading player config XML') - video_url = config.find('.//file').text - uploader = config.find('.//author').text - timestamp = parse_iso8601(config.find('.//date').text) - duration = parse_duration(config.find('.//duration').text) - thumbnail = config.find('.//image').text + video_url = config.find('file').text + uploader = config.find('author').text + timestamp = parse_iso8601(config.find('date').text) + duration = parse_duration(config.find('duration').text) + thumbnail = config.find('image').text media = self._download_xml( 'http://www.godtube.com/media/xml/?v=%s' % video_id, video_id, 'Downloading media XML') - title = media.find('.//title').text + title = media.find('title').text return { 'id': video_id, diff --git a/youtube_dl/extractor/golem.py b/youtube_dl/extractor/golem.py index bebfe8568..53714f47f 100644 --- a/youtube_dl/extractor/golem.py +++ b/youtube_dl/extractor/golem.py @@ -38,11 +38,9 @@ class GolemIE(InfoExtractor): } formats = [] - for e in config.findall('./*[url]'): + for e in config: url = e.findtext('./url') if not url: - self._downloader.report_warning( - "{0}: url: empty, skipping".format(e.tag)) continue formats.append({ @@ -57,7 +55,7 @@ class GolemIE(InfoExtractor): info['formats'] = formats thumbnails = [] - for e in config.findall('.//teaser[url]'): + for e in config.findall('.//teaser'): url = e.findtext('./url') if not url: continue diff --git a/youtube_dl/extractor/ign.py b/youtube_dl/extractor/ign.py index 12e9e61c4..c80185b53 100644 --- a/youtube_dl/extractor/ign.py +++ b/youtube_dl/extractor/ign.py @@ -89,7 +89,12 @@ class IGNIE(InfoExtractor): '<param name="flashvars"[^>]*value="[^"]*?url=(https?://www\.ign\.com/videos/.*?)["&]', webpage) if multiple_urls: - return [self.url_result(u, ie='IGN') for u in multiple_urls] + entries = [self.url_result(u, ie='IGN') for u in multiple_urls] + return { + '_type': 'playlist', + 'id': name_or_id, + 'entries': entries, + } video_id = self._find_video_id(webpage) result = self._get_video_info(video_id) diff --git a/youtube_dl/extractor/internetvideoarchive.py b/youtube_dl/extractor/internetvideoarchive.py index 4ddda2f1b..53f9a5f75 100644 --- a/youtube_dl/extractor/internetvideoarchive.py +++ b/youtube_dl/extractor/internetvideoarchive.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import re from .common import InfoExtractor @@ -12,12 +14,13 @@ class InternetVideoArchiveIE(InfoExtractor): _VALID_URL = r'https?://video\.internetvideoarchive\.net/flash/players/.*?\?.*?publishedid.*?' _TEST = { - u'url': u'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?customerid=69249&publishedid=452693&playerid=247', - u'file': u'452693.mp4', - u'info_dict': { - u'title': u'SKYFALL', - u'description': u'In SKYFALL, Bond\'s loyalty to M is tested as her past comes back to haunt her. As MI6 comes under attack, 007 must track down and destroy the threat, no matter how personal the cost.', - u'duration': 153, + 'url': 'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?customerid=69249&publishedid=452693&playerid=247', + 'info_dict': { + 'id': '452693', + 'ext': 'mp4', + 'title': 'SKYFALL', + 'description': 'In SKYFALL, Bond\'s loyalty to M is tested as her past comes back to haunt her. As MI6 comes under attack, 007 must track down and destroy the threat, no matter how personal the cost.', + 'duration': 149, }, } @@ -42,7 +45,7 @@ class InternetVideoArchiveIE(InfoExtractor): url = self._build_url(query) flashconfiguration = self._download_xml(url, video_id, - u'Downloading flash configuration') + 'Downloading flash configuration') file_url = flashconfiguration.find('file').text file_url = file_url.replace('/playlist.aspx', '/mrssplaylist.aspx') # Replace some of the parameters in the query to get the best quality @@ -51,7 +54,7 @@ class InternetVideoArchiveIE(InfoExtractor): lambda m: self._clean_query(m.group()), file_url) info = self._download_xml(file_url, video_id, - u'Downloading video info') + 'Downloading video info') item = info.find('channel/item') def _bp(p): diff --git a/youtube_dl/extractor/izlesene.py b/youtube_dl/extractor/izlesene.py index a83dd249f..07ef682ee 100644 --- a/youtube_dl/extractor/izlesene.py +++ b/youtube_dl/extractor/izlesene.py @@ -63,7 +63,8 @@ class IzleseneIE(InfoExtractor): title = self._og_search_title(webpage) description = self._og_search_description(webpage) - thumbnail = self._og_search_thumbnail(webpage) + thumbnail = self._proto_relative_url( + self._og_search_thumbnail(webpage), scheme='http:') uploader = self._html_search_regex( r"adduserUsername\s*=\s*'([^']+)';", diff --git a/youtube_dl/extractor/jukebox.py b/youtube_dl/extractor/jukebox.py index 9b553b9fa..5aa32bf09 100644 --- a/youtube_dl/extractor/jukebox.py +++ b/youtube_dl/extractor/jukebox.py @@ -11,10 +11,9 @@ from ..utils import ( class JukeboxIE(InfoExtractor): - _VALID_URL = r'^http://www\.jukebox?\..+?\/.+[,](?P<video_id>[a-z0-9\-]+)\.html' + _VALID_URL = r'^http://www\.jukebox?\..+?\/.+[,](?P<id>[a-z0-9\-]+)\.html' _TEST = { 'url': 'http://www.jukebox.es/kosheen/videoclip,pride,r303r.html', - 'md5': '1574e9b4d6438446d5b7dbcdf2786276', 'info_dict': { 'id': 'r303r', 'ext': 'flv', @@ -24,8 +23,7 @@ class JukeboxIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('video_id') + video_id = self._match_id(url) html = self._download_webpage(url, video_id) iframe_url = unescapeHTML(self._search_regex(r'<iframe .*src="([^"]*)"', html, 'iframe url')) diff --git a/youtube_dl/extractor/lrt.py b/youtube_dl/extractor/lrt.py new file mode 100644 index 000000000..fca0bfef0 --- /dev/null +++ b/youtube_dl/extractor/lrt.py @@ -0,0 +1,69 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import json + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + js_to_json, + parse_duration, + remove_end, +) + + +class LRTIE(InfoExtractor): + IE_NAME = 'lrt.lt' + _VALID_URL = r'https?://(?:www\.)?lrt\.lt/mediateka/irasas/(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://www.lrt.lt/mediateka/irasas/54391/', + 'info_dict': { + 'id': '54391', + 'ext': 'mp4', + 'title': 'Septynios Kauno dienos', + 'description': 'Kauno miesto ir apskrities naujienos', + 'duration': 1783, + }, + 'params': { + 'skip_download': True, # HLS download + }, + + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + + title = remove_end(self._og_search_title(webpage), ' - LRT') + thumbnail = self._og_search_thumbnail(webpage) + description = self._og_search_description(webpage) + duration = parse_duration(self._search_regex( + r"'duration':\s*'([^']+)',", webpage, + 'duration', fatal=False, default=None)) + + formats = [] + for js in re.findall(r'(?s)config:\s*(\{.*?\})', webpage): + data = json.loads(js_to_json(js)) + if data['provider'] == 'rtmp': + formats.append({ + 'format_id': 'rtmp', + 'ext': determine_ext(data['file']), + 'url': data['streamer'], + 'play_path': 'mp4:%s' % data['file'], + 'preference': -1, + }) + else: + formats.extend( + self._extract_m3u8_formats(data['file'], video_id, 'mp4')) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': thumbnail, + 'description': description, + 'duration': duration, + } diff --git a/youtube_dl/extractor/nfl.py b/youtube_dl/extractor/nfl.py index 963c4587c..cc7c921c3 100644 --- a/youtube_dl/extractor/nfl.py +++ b/youtube_dl/extractor/nfl.py @@ -6,6 +6,7 @@ import re from .common import InfoExtractor from ..utils import ( ExtractorError, + compat_urllib_parse_urlparse, int_or_none, remove_end, ) @@ -13,76 +14,116 @@ from ..utils import ( class NFLIE(InfoExtractor): IE_NAME = 'nfl.com' - _VALID_URL = r'(?x)https?://(?:www\.)?nfl\.com/(?:videos/(?:.+)/|.*?\#video=)(?P<id>\d..[0-9]+)' - _PLAYER_CONFIG_URL = 'http://www.nfl.com/static/content/static/config/video/config.json' - _TEST = { - 'url': 'http://www.nfl.com/videos/nfl-game-highlights/0ap3000000398478/Week-3-Redskins-vs-Eagles-highlights', - # 'md5': '5eb8c40a727dda106d510e5d6ffa79e5', # md5 checksum fluctuates - 'info_dict': { - 'id': '0ap3000000398478', - 'ext': 'mp4', - 'title': 'Week 3: Washington Redskins vs. Philadelphia Eagles highlights', - 'description': 'md5:56323bfb0ac4ee5ab24bd05fdf3bf478', - 'upload_date': '20140921', - 'timestamp': 1411337580, - 'thumbnail': 're:^https?://.*\.jpg$', + _VALID_URL = r'''(?x)https?:// + (?P<host>(?:www\.)?(?:nfl\.com|.*?\.clubs\.nfl\.com))/ + (?:.+?/)* + (?P<id>(?:\d[a-z]{2}\d{13}|\w{8}\-(?:\w{4}\-){3}\w{12}))''' + _TESTS = [ + { + 'url': 'http://www.nfl.com/videos/nfl-game-highlights/0ap3000000398478/Week-3-Redskins-vs-Eagles-highlights', + 'md5': '394ef771ddcd1354f665b471d78ec4c6', + 'info_dict': { + 'id': '0ap3000000398478', + 'ext': 'mp4', + 'title': 'Week 3: Redskins vs. Eagles highlights', + 'description': 'md5:56323bfb0ac4ee5ab24bd05fdf3bf478', + 'upload_date': '20140921', + 'timestamp': 1411337580, + 'thumbnail': 're:^https?://.*\.jpg$', + } + }, + { + 'url': 'http://prod.www.steelers.clubs.nfl.com/video-and-audio/videos/LIVE_Post_Game_vs_Browns/9d72f26a-9e2b-4718-84d3-09fb4046c266', + 'md5': 'cf85bdb4bc49f6e9d3816d130c78279c', + 'info_dict': { + 'id': '9d72f26a-9e2b-4718-84d3-09fb4046c266', + 'ext': 'mp4', + 'title': 'LIVE: Post Game vs. Browns', + 'description': 'md5:6a97f7e5ebeb4c0e69a418a89e0636e8', + 'upload_date': '20131229', + 'timestamp': 1388354455, + 'thumbnail': 're:^https?://.*\.jpg$', + } + } + ] + + @staticmethod + def prepend_host(host, url): + if not url.startswith('http'): + if not url.startswith('/'): + url = '/%s' % url + url = 'http://{0:}{1:}'.format(host, url) + return url + + @staticmethod + def format_from_stream(stream, protocol, host, path_prefix='', + preference=0, note=None): + url = '{protocol:}://{host:}/{prefix:}{path:}'.format( + protocol=protocol, + host=host, + prefix=path_prefix, + path=stream.get('path'), + ) + return { + 'url': url, + 'vbr': int_or_none(stream.get('rate', 0), 1000), + 'preference': preference, + 'format_note': note, } - } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id, host = mobj.group('id'), mobj.group('host') - config = self._download_json(self._PLAYER_CONFIG_URL, video_id, - note='Downloading player config') - url_template = 'http://nfl.com{contentURLTemplate:s}'.format(**config) - video_data = self._download_json(url_template.format(id=video_id), video_id) + webpage = self._download_webpage(url, video_id) - cdns = config.get('cdns') - if not cdns: - raise ExtractorError('Failed to get CDN data', expected=True) + config_url = NFLIE.prepend_host(host, self._search_regex( + r'(?:config|configURL)\s*:\s*"([^"]+)"', webpage, 'config URL')) + config = self._download_json(config_url, video_id, + note='Downloading player config') + url_template = NFLIE.prepend_host( + host, '{contentURLTemplate:}'.format(**config)) + video_data = self._download_json( + url_template.format(id=video_id), video_id) formats = [] - streams = video_data.get('cdnData', {}).get('bitrateInfo', []) - for name, cdn in cdns.items(): - # LimeLight streams don't seem to work - if cdn.get('name') == 'LIMELIGHT': - continue - - protocol = cdn.get('protocol') - host = remove_end(cdn.get('host', ''), '/') - if not (protocol and host): - continue - - path_prefix = cdn.get('pathprefix', '') - if path_prefix and not path_prefix.endswith('/'): - path_prefix = '%s/' % path_prefix - - get_url = lambda p: '{protocol:s}://{host:s}/{prefix:s}{path:}'.format( - protocol=protocol, - host=host, - prefix=path_prefix, - path=p, - ) - - if protocol == 'rtmp': - preference = -2 - elif 'prog' in name.lower(): - preference = -1 - else: - preference = 0 - + cdn_data = video_data.get('cdnData', {}) + streams = cdn_data.get('bitrateInfo', []) + if cdn_data.get('format') == 'EXTERNAL_HTTP_STREAM': + parts = compat_urllib_parse_urlparse(cdn_data.get('uri')) + protocol, host = parts.scheme, parts.netloc for stream in streams: - path = stream.get('path') - if not path: + formats.append( + NFLIE.format_from_stream(stream, protocol, host)) + else: + cdns = config.get('cdns') + if not cdns: + raise ExtractorError('Failed to get CDN data', expected=True) + + for name, cdn in cdns.items(): + # LimeLight streams don't seem to work + if cdn.get('name') == 'LIMELIGHT': continue - formats.append({ - 'url': get_url(path), - 'vbr': int_or_none(stream.get('rate', 0), 1000), - 'preference': preference, - 'format_note': name, - }) + protocol = cdn.get('protocol') + host = remove_end(cdn.get('host', ''), '/') + if not (protocol and host): + continue + + prefix = cdn.get('pathprefix', '') + if prefix and not prefix.endswith('/'): + prefix = '%s/' % prefix + + preference = 0 + if protocol == 'rtmp': + preference = -2 + elif 'prog' in name.lower(): + preference = 1 + + for stream in streams: + formats.append( + NFLIE.format_from_stream(stream, protocol, host, + prefix, preference, name)) self._sort_formats(formats) @@ -94,7 +135,7 @@ class NFLIE(InfoExtractor): return { 'id': video_id, - 'title': video_data.get('storyHeadline'), + 'title': video_data.get('headline'), 'formats': formats, 'description': video_data.get('caption'), 'duration': video_data.get('duration'), diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 2adfde909..8f140d626 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -4,6 +4,7 @@ import re from .common import InfoExtractor from ..utils import ( + unified_strdate, US_RATINGS, ) @@ -11,10 +12,10 @@ from ..utils import ( class PBSIE(InfoExtractor): _VALID_URL = r'''(?x)https?:// (?: - # Direct video URL - video\.pbs\.org/(?:viralplayer|video)/(?P<id>[0-9]+)/? | - # Article with embedded player - (?:www\.)?pbs\.org/(?:[^/]+/){2,5}(?P<presumptive_id>[^/]+)/?(?:$|[?\#]) | + # Direct video URL + video\.pbs\.org/(?:viralplayer|video)/(?P<id>[0-9]+)/? | + # Article with embedded player (or direct video) + (?:www\.)?pbs\.org/(?:[^/]+/){2,5}(?P<presumptive_id>[^/]+?)(?:\.html)?/?(?:$|[?\#]) | # Player video\.pbs\.org/(?:widget/)?partnerplayer/(?P<player_id>[^/]+)/ ) @@ -65,10 +66,25 @@ class PBSIE(InfoExtractor): 'duration': 6559, 'thumbnail': 're:^https?://.*\.jpg$', } + }, + { + 'url': 'http://www.pbs.org/wgbh/nova/earth/killer-typhoon.html', + 'md5': '908f3e5473a693b266b84e25e1cf9703', + 'info_dict': { + 'id': '2365160389', + 'display_id': 'killer-typhoon', + 'ext': 'mp4', + 'description': 'md5:c741d14e979fc53228c575894094f157', + 'title': 'Killer Typhoon', + 'duration': 3172, + 'thumbnail': 're:^https?://.*\.jpg$', + 'upload_date': '20140122', + } } + ] - def _extract_ids(self, url): + def _extract_webpage(self, url): mobj = re.match(self._VALID_URL, url) presumptive_id = mobj.group('presumptive_id') @@ -76,15 +92,20 @@ class PBSIE(InfoExtractor): if presumptive_id: webpage = self._download_webpage(url, display_id) + upload_date = unified_strdate(self._search_regex( + r'<input type="hidden" id="air_date_[0-9]+" value="([^"]+)"', + webpage, 'upload date', default=None)) + MEDIA_ID_REGEXES = [ r"div\s*:\s*'videoembed'\s*,\s*mediaid\s*:\s*'(\d+)'", # frontline video embed r'class="coveplayerid">([^<]+)<', # coveplayer + r'<input type="hidden" id="pbs_video_id_[0-9]+" value="([0-9]+)"/>', # jwplayer ] media_id = self._search_regex( MEDIA_ID_REGEXES, webpage, 'media ID', fatal=False, default=None) if media_id: - return media_id, presumptive_id + return media_id, presumptive_id, upload_date url = self._search_regex( r'<iframe\s+(?:class|id)=["\']partnerPlayer["\'].*?\s+src=["\'](.*?)["\']>', @@ -104,10 +125,10 @@ class PBSIE(InfoExtractor): video_id = mobj.group('id') display_id = video_id - return video_id, display_id + return video_id, display_id, None def _real_extract(self, url): - video_id, display_id = self._extract_ids(url) + video_id, display_id, upload_date = self._extract_webpage(url) info_url = 'http://video.pbs.org/videoInfo/%s?format=json' % video_id info = self._download_json(info_url, display_id) @@ -119,6 +140,7 @@ class PBSIE(InfoExtractor): return { 'id': video_id, + 'display_id': display_id, 'title': info['title'], 'url': info['alternate_encoding']['url'], 'ext': 'mp4', @@ -126,4 +148,5 @@ class PBSIE(InfoExtractor): 'thumbnail': info.get('image_url'), 'duration': info.get('duration'), 'age_limit': age_limit, + 'upload_date': upload_date, } diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py index 5b2a723c1..619496de7 100644 --- a/youtube_dl/extractor/prosiebensat1.py +++ b/youtube_dl/extractor/prosiebensat1.py @@ -144,7 +144,7 @@ class ProSiebenSat1IE(InfoExtractor): 'id': '2156342', 'ext': 'mp4', 'title': 'Kurztrips zum Valentinstag', - 'description': 'md5:8ba6301e70351ae0bedf8da00f7ba528', + 'description': 'Romantischer Kurztrip zum Valentinstag? Wir verraten, was sich hier wirklich lohnt.', 'duration': 307.24, }, 'params': { @@ -180,12 +180,10 @@ class ProSiebenSat1IE(InfoExtractor): ] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) - page = self._download_webpage(url, video_id, 'Downloading page') - - clip_id = self._html_search_regex(self._CLIPID_REGEXES, page, 'clip id') + clip_id = self._html_search_regex(self._CLIPID_REGEXES, webpage, 'clip id') access_token = 'testclient' client_name = 'kolibri-1.2.5' @@ -234,12 +232,12 @@ class ProSiebenSat1IE(InfoExtractor): urls = self._download_json(url_api_url, clip_id, 'Downloading urls JSON') - title = self._html_search_regex(self._TITLE_REGEXES, page, 'title') - description = self._html_search_regex(self._DESCRIPTION_REGEXES, page, 'description', fatal=False) - thumbnail = self._og_search_thumbnail(page) + title = self._html_search_regex(self._TITLE_REGEXES, webpage, 'title') + description = self._html_search_regex(self._DESCRIPTION_REGEXES, webpage, 'description', fatal=False) + thumbnail = self._og_search_thumbnail(webpage) upload_date = unified_strdate(self._html_search_regex( - self._UPLOAD_DATE_REGEXES, page, 'upload date', default=None)) + self._UPLOAD_DATE_REGEXES, webpage, 'upload date', default=None)) formats = [] diff --git a/youtube_dl/extractor/spankwire.py b/youtube_dl/extractor/spankwire.py index 2007a0013..94602e89e 100644 --- a/youtube_dl/extractor/spankwire.py +++ b/youtube_dl/extractor/spankwire.py @@ -9,7 +9,6 @@ from ..utils import ( compat_urllib_parse, unified_strdate, str_to_int, - int_or_none, ) from ..aes import aes_decrypt_text @@ -40,31 +39,42 @@ class SpankwireIE(InfoExtractor): req.add_header('Cookie', 'age_verified=1') webpage = self._download_webpage(req, video_id) - title = self._html_search_regex(r'<h1>([^<]+)', webpage, 'title') + title = self._html_search_regex( + r'<h1>([^<]+)', webpage, 'title') description = self._html_search_regex( - r'<div\s+id="descriptionContent">([^<]+)<', webpage, 'description', fatal=False) + r'<div\s+id="descriptionContent">([^<]+)<', + webpage, 'description', fatal=False) thumbnail = self._html_search_regex( - r'flashvars\.image_url = "([^"]+)', webpage, 'thumbnail', fatal=False) + r'playerData\.screenShot\s*=\s*["\']([^"\']+)["\']', + webpage, 'thumbnail', fatal=False) uploader = self._html_search_regex( - r'by:\s*<a [^>]*>(.+?)</a>', webpage, 'uploader', fatal=False) + r'by:\s*<a [^>]*>(.+?)</a>', + webpage, 'uploader', fatal=False) uploader_id = self._html_search_regex( - r'by:\s*<a href="/Profile\.aspx\?.*?UserId=(\d+).*?"', webpage, 'uploader id', fatal=False) - upload_date = self._html_search_regex(r'</a> on (.+?) at \d+:\d+', webpage, 'upload date', fatal=False) - if upload_date: - upload_date = unified_strdate(upload_date) - - view_count = self._html_search_regex( - r'<div id="viewsCounter"><span>([^<]+)</span> views</div>', webpage, 'view count', fatal=False) - if view_count: - view_count = str_to_int(view_count) - comment_count = int_or_none(self._html_search_regex( - r'<span id="spCommentCount">\s*(\d+)</span> Comments</div>', webpage, 'comment count', fatal=False)) + r'by:\s*<a href="/Profile\.aspx\?.*?UserId=(\d+).*?"', + webpage, 'uploader id', fatal=False) + upload_date = unified_strdate(self._html_search_regex( + r'</a> on (.+?) at \d+:\d+', + webpage, 'upload date', fatal=False)) - video_urls = list(map(compat_urllib_parse.unquote , re.findall(r'flashvars\.quality_[0-9]{3}p = "([^"]+)', webpage))) + view_count = str_to_int(self._html_search_regex( + r'<div id="viewsCounter"><span>([\d,\.]+)</span> views</div>', + webpage, 'view count', fatal=False)) + comment_count = str_to_int(self._html_search_regex( + r'Comments<span[^>]+>\s*\(([\d,\.]+)\)</span>', + webpage, 'comment count', fatal=False)) + + video_urls = list(map( + compat_urllib_parse.unquote, + re.findall(r'playerData\.cdnPath[0-9]{3,}\s*=\s*["\']([^"\']+)["\']', webpage))) if webpage.find('flashvars\.encrypted = "true"') != -1: - password = self._html_search_regex(r'flashvars\.video_title = "([^"]+)', webpage, 'password').replace('+', ' ') - video_urls = list(map(lambda s: aes_decrypt_text(s, password, 32).decode('utf-8'), video_urls)) + password = self._html_search_regex( + r'flashvars\.video_title = "([^"]+)', + webpage, 'password').replace('+', ' ') + video_urls = list(map( + lambda s: aes_decrypt_text(s, password, 32).decode('utf-8'), + video_urls)) formats = [] for video_url in video_urls: diff --git a/youtube_dl/extractor/sportdeutschland.py b/youtube_dl/extractor/sportdeutschland.py index 185353bef..abb827783 100644 --- a/youtube_dl/extractor/sportdeutschland.py +++ b/youtube_dl/extractor/sportdeutschland.py @@ -17,11 +17,11 @@ class SportDeutschlandIE(InfoExtractor): 'info_dict': { 'id': 'live-li-ning-badminton-weltmeisterschaft-2014-kopenhagen', 'ext': 'mp4', - 'title': 'LIVE: Li-Ning Badminton Weltmeisterschaft 2014 Kopenhagen', + 'title': 're:Li-Ning Badminton Weltmeisterschaft 2014 Kopenhagen', 'categories': ['Badminton'], 'view_count': int, 'thumbnail': 're:^https?://.*\.jpg$', - 'description': 're:^Die Badminton-WM 2014 aus Kopenhagen LIVE', + 'description': 're:Die Badminton-WM 2014 aus Kopenhagen bei Sportdeutschland\.TV', 'timestamp': int, 'upload_date': 're:^201408[23][0-9]$', }, diff --git a/youtube_dl/extractor/sunporno.py b/youtube_dl/extractor/sunporno.py index 7de3c9dd5..263f09b46 100644 --- a/youtube_dl/extractor/sunporno.py +++ b/youtube_dl/extractor/sunporno.py @@ -39,10 +39,10 @@ class SunPornoIE(InfoExtractor): r'poster="([^"]+)"', webpage, 'thumbnail', fatal=False) duration = parse_duration(self._search_regex( - r'<span>Duration: (\d+:\d+)</span>', webpage, 'duration', fatal=False)) + r'Duration:\s*(\d+:\d+)\s*<', webpage, 'duration', fatal=False)) view_count = int_or_none(self._html_search_regex( - r'<span class="views">(\d+)</span>', webpage, 'view count', fatal=False)) + r'class="views">\s*(\d+)\s*<', webpage, 'view count', fatal=False)) comment_count = int_or_none(self._html_search_regex( r'(\d+)</b> Comments?', webpage, 'comment count', fatal=False)) diff --git a/youtube_dl/extractor/tapely.py b/youtube_dl/extractor/tapely.py new file mode 100644 index 000000000..77e056242 --- /dev/null +++ b/youtube_dl/extractor/tapely.py @@ -0,0 +1,104 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + clean_html, + compat_urllib_request, + float_or_none, + parse_iso8601, +) + + +class TapelyIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?tape\.ly/(?P<id>[A-Za-z0-9\-_]+)(?:/(?P<songnr>\d+))?' + _API_URL = 'http://tape.ly/showtape?id={0:}' + _S3_SONG_URL = 'http://mytape.s3.amazonaws.com/{0:}' + _SOUNDCLOUD_SONG_URL = 'http://api.soundcloud.com{0:}' + _TESTS = [ + { + 'url': 'http://tape.ly/my-grief-as-told-by-water', + 'info_dict': { + 'id': 23952, + 'title': 'my grief as told by water', + 'thumbnail': 're:^https?://.*\.png$', + 'uploader_id': 16484, + 'timestamp': 1411848286, + 'description': 'For Robin and Ponkers, whom the tides of life have taken out to sea.', + }, + 'playlist_count': 13, + }, + { + 'url': 'http://tape.ly/my-grief-as-told-by-water/1', + 'md5': '79031f459fdec6530663b854cbc5715c', + 'info_dict': { + 'id': 258464, + 'title': 'Dreaming Awake (My Brightest Diamond)', + 'ext': 'm4a', + }, + }, + ] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + display_id = mobj.group('id') + + playlist_url = self._API_URL.format(display_id) + request = compat_urllib_request.Request(playlist_url) + request.add_header('X-Requested-With', 'XMLHttpRequest') + request.add_header('Accept', 'application/json') + + playlist = self._download_json(request, display_id) + + tape = playlist['tape'] + + entries = [] + for s in tape['songs']: + song = s['song'] + entry = { + 'id': song['id'], + 'duration': float_or_none(song.get('songduration'), 1000), + 'title': song['title'], + } + if song['source'] == 'S3': + entry.update({ + 'url': self._S3_SONG_URL.format(song['filename']), + }) + entries.append(entry) + elif song['source'] == 'YT': + self.to_screen('YouTube video detected') + yt_id = song['filename'].replace('/youtube/', '') + entry.update(self.url_result(yt_id, 'Youtube', video_id=yt_id)) + entries.append(entry) + elif song['source'] == 'SC': + self.to_screen('SoundCloud song detected') + sc_url = self._SOUNDCLOUD_SONG_URL.format(song['filename']) + entry.update(self.url_result(sc_url, 'Soundcloud')) + entries.append(entry) + else: + self.report_warning('Unknown song source: %s' % song['source']) + + if mobj.group('songnr'): + songnr = int(mobj.group('songnr')) - 1 + try: + return entries[songnr] + except IndexError: + raise ExtractorError( + 'No song with index: %s' % mobj.group('songnr'), + expected=True) + + return { + '_type': 'playlist', + 'id': tape['id'], + 'display_id': display_id, + 'title': tape['name'], + 'entries': entries, + 'thumbnail': tape.get('image_url'), + 'description': clean_html(tape.get('subtext')), + 'like_count': tape.get('likescount'), + 'uploader_id': tape.get('user_id'), + 'timestamp': parse_iso8601(tape.get('published_at')), + } diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index 1cca47771..d5e28efad 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -149,7 +149,7 @@ class TEDIE(SubtitlesInfoExtractor): thumbnail = 'http://' + thumbnail return { 'id': video_id, - 'title': talk_info['title'], + 'title': talk_info['title'].strip(), 'uploader': talk_info['speaker'], 'thumbnail': thumbnail, 'description': self._og_search_description(webpage), diff --git a/youtube_dl/extractor/thvideo.py b/youtube_dl/extractor/thvideo.py index 607e947bb..496f15d80 100644 --- a/youtube_dl/extractor/thvideo.py +++ b/youtube_dl/extractor/thvideo.py @@ -26,8 +26,7 @@ class THVideoIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) # extract download link from mobile player page webpage_player = self._download_webpage( @@ -57,3 +56,29 @@ class THVideoIE(InfoExtractor): 'description': description, 'upload_date': upload_date } + + +class THVideoPlaylistIE(InfoExtractor): + _VALID_URL = r'http?://(?:www\.)?thvideo\.tv/mylist(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://thvideo.tv/mylist2', + 'info_dict': { + 'id': '2', + 'title': '幻想万華鏡', + }, + 'playlist_mincount': 23, + } + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + list_title = self._html_search_regex( + r'<h1 class="show_title">(.*?)<b id', webpage, 'playlist title', + fatal=False) + + entries = [ + self.url_result('http://thvideo.tv/v/th' + id, 'THVideo') + for id in re.findall(r'<dd><a href="http://thvideo.tv/v/th(\d+)/" target=', webpage)] + + return self.playlist_result(entries, playlist_id, list_title) diff --git a/youtube_dl/extractor/tvigle.py b/youtube_dl/extractor/tvigle.py index dc8697850..27962b5fe 100644 --- a/youtube_dl/extractor/tvigle.py +++ b/youtube_dl/extractor/tvigle.py @@ -17,16 +17,16 @@ class TvigleIE(InfoExtractor): _TESTS = [ { - 'url': 'http://www.tvigle.ru/video/brat-2/', - 'md5': '72cb7eab33e54314e1790da402d3c9c3', + 'url': 'http://www.tvigle.ru/video/brat/', + 'md5': 'ff4344a4894b0524441fb6f8218dc716', 'info_dict': { - 'id': '5119390', - 'display_id': 'brat-2', + 'id': '5118490', + 'display_id': 'brat', 'ext': 'mp4', - 'title': 'Брат 2 ', - 'description': 'md5:5751f4fe345a58e1692585c361294bd8', - 'duration': 7356.369, - 'age_limit': 0, + 'title': 'Брат', + 'description': 'md5:d16ac7c0b47052ea51fddb92c4e413eb', + 'duration': 5722.6, + 'age_limit': 16, }, }, { @@ -71,6 +71,7 @@ class TvigleIE(InfoExtractor): 'format_id': '%s-%s' % (vcodec, quality), 'vcodec': vcodec, 'height': int(quality[:-1]), + 'filesize': item['video_files_size'][vcodec][quality], }) self._sort_formats(formats) diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index 7d27d6c57..964470070 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -31,7 +31,7 @@ class VGTVIE(InfoExtractor): 'url': 'http://www.vgtv.no/#!/live/100764/opptak-vgtv-foelger-em-kvalifiseringen', 'info_dict': { 'id': '100764', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'OPPTAK: VGTV følger EM-kvalifiseringen', 'description': 'md5:3772d9c0dc2dff92a886b60039a7d4d3', 'thumbnail': 're:^https?://.*\.jpg', @@ -50,7 +50,7 @@ class VGTVIE(InfoExtractor): 'url': 'http://www.vgtv.no/#!/live/100015/direkte-her-kan-du-se-laksen-live-fra-suldalslaagen', 'info_dict': { 'id': '100015', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'DIREKTE: Her kan du se laksen live fra Suldalslågen!', 'description': 'md5:9a60cc23fa349f761628924e56eeec2d', 'thumbnail': 're:^https?://.*\.jpg', diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 4be1b8785..d2c36b58a 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -8,18 +8,19 @@ import itertools from .common import InfoExtractor from .subtitles import SubtitlesInfoExtractor from ..utils import ( + clean_html, compat_HTTPError, compat_urllib_parse, compat_urllib_request, - clean_html, - get_element_by_attribute, + compat_urlparse, ExtractorError, + get_element_by_attribute, + InAdvancePagedList, + int_or_none, RegexNotFoundError, - smuggle_url, std_headers, unsmuggle_url, urlencode_postdata, - int_or_none, ) @@ -90,6 +91,7 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor): 'uploader_id': 'openstreetmapus', 'uploader': 'OpenStreetMap US', 'title': 'Andy Allan - Putting the Carto into OpenStreetMap Cartography', + 'description': 'md5:380943ec71b89736ff4bf27183233d09', 'duration': 1595, }, }, @@ -104,6 +106,7 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor): 'uploader': 'The BLN & Business of Software', 'uploader_id': 'theblnbusinessofsoftware', 'duration': 3610, + 'description': None, }, }, { @@ -118,6 +121,7 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor): 'uploader_id': 'user18948128', 'uploader': 'Jaime Marquínez Ferrándiz', 'duration': 10, + 'description': 'This is "youtube-dl password protected test video" by Jaime Marquínez Ferrándiz on Vimeo, the home for high quality videos and the people who love them.', }, 'params': { 'videopassword': 'youtube-dl', @@ -204,6 +208,7 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor): # Extract ID from URL mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') + orig_url = url if mobj.group('pro') or mobj.group('player'): url = 'http://player.vimeo.com/video/' + video_id @@ -274,18 +279,23 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor): _, video_thumbnail = sorted((int(width if width.isdigit() else 0), t_url) for (width, t_url) in video_thumbs.items())[-1] # Extract video description - video_description = None - try: - video_description = get_element_by_attribute("class", "description_wrapper", webpage) - if video_description: - video_description = clean_html(video_description) - except AssertionError as err: - # On some pages like (http://player.vimeo.com/video/54469442) the - # html tags are not closed, python 2.6 cannot handle it - if err.args[0] == 'we should not get here!': - pass - else: - raise + + video_description = self._html_search_regex( + r'(?s)<div\s+class="[^"]*description[^"]*"[^>]*>(.*?)</div>', + webpage, 'description', default=None) + if not video_description: + video_description = self._html_search_meta( + 'description', webpage, default=None) + if not video_description and mobj.group('pro'): + orig_webpage = self._download_webpage( + orig_url, video_id, + note='Downloading webpage for description', + fatal=False) + if orig_webpage: + video_description = self._html_search_meta( + 'description', orig_webpage, default=None) + if not video_description and not mobj.group('player'): + self._downloader.report_warning('Cannot find video description') # Extract video duration video_duration = int_or_none(config["video"].get("duration")) @@ -533,32 +543,55 @@ class VimeoWatchLaterIE(VimeoBaseInfoExtractor, VimeoChannelIE): class VimeoLikesIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?vimeo\.com/user(?P<id>[0-9]+)/likes(?:$|[?#])' + _VALID_URL = r'https?://(?:www\.)?vimeo\.com/user(?P<id>[0-9]+)/likes/?(?:$|[?#]|sort:)' IE_NAME = 'vimeo:likes' IE_DESC = 'Vimeo user likes' _TEST = { - 'url': 'https://vimeo.com/user20132939/likes', - 'playlist_mincount': 4, - 'add_ies': ['Generic'], + 'url': 'https://vimeo.com/user755559/likes/', + 'playlist_mincount': 293, "info_dict": { - "description": "Videos Philipp Hagemeister likes on Vimeo.", - "title": "Vimeo / Philipp Hagemeister's likes", - }, - 'params': { - 'extract_flat': False, + "description": "See all the videos urza likes", + "title": 'Videos urza likes', }, } def _real_extract(self, url): user_id = self._match_id(url) - rss_url = '%s//vimeo.com/user%s/likes/rss' % ( - self.http_scheme(), user_id) - surl = smuggle_url(rss_url, { - 'force_videoid': '%s_likes' % user_id, - 'to_generic': True, - }) + webpage = self._download_webpage(url, user_id) + page_count = self._int( + self._search_regex( + r'''(?x)<li><a\s+href="[^"]+"\s+data-page="([0-9]+)"> + .*?</a></li>\s*<li\s+class="pagination_next"> + ''', webpage, 'page count'), + 'page count', fatal=True) + PAGE_SIZE = 12 + title = self._html_search_regex( + r'(?s)<h1>(.+?)</h1>', webpage, 'title', fatal=False) + description = self._html_search_meta('description', webpage) + + def _get_page(idx): + page_url = '%s//vimeo.com/user%s/likes/page:%d/sort:date' % ( + self.http_scheme(), user_id, idx + 1) + webpage = self._download_webpage( + page_url, user_id, + note='Downloading page %d/%d' % (idx + 1, page_count)) + video_list = self._search_regex( + r'(?s)<ol class="js-browse_list[^"]+"[^>]*>(.*?)</ol>', + webpage, 'video content') + paths = re.findall( + r'<li[^>]*>\s*<a\s+href="([^"]+)"', video_list) + for path in paths: + yield { + '_type': 'url', + 'url': compat_urlparse.urljoin(page_url, path), + } + + pl = InAdvancePagedList(_get_page, page_count, PAGE_SIZE) return { - '_type': 'url', - 'url': surl, + '_type': 'playlist', + 'id': 'user%s_likes' % user_id, + 'title': title, + 'description': description, + 'entries': pl, } diff --git a/youtube_dl/extractor/vuclip.py b/youtube_dl/extractor/vuclip.py index fb0600f1a..ec3c010ad 100644 --- a/youtube_dl/extractor/vuclip.py +++ b/youtube_dl/extractor/vuclip.py @@ -5,6 +5,7 @@ import re from .common import InfoExtractor from ..utils import ( compat_urllib_parse_urlparse, + ExtractorError, parse_duration, qualities, ) @@ -14,13 +15,12 @@ class VuClipIE(InfoExtractor): _VALID_URL = r'http://(?:m\.)?vuclip\.com/w\?.*?cid=(?P<id>[0-9]+)' _TEST = { - 'url': 'http://m.vuclip.com/w?cid=843902317&fid=63532&z=1007&nvar&frm=index.html&bu=4757321434', - 'md5': '92ac9d1ccefec4f0bb474661ab144fcf', + 'url': 'http://m.vuclip.com/w?cid=922692425&fid=70295&z=1010&nvar&frm=index.html', 'info_dict': { - 'id': '843902317', + 'id': '922692425', 'ext': '3gp', - 'title': 'Movie Trailer: Noah', - 'duration': 139, + 'title': 'The Toy Soldiers - Hollywood Movie Trailer', + 'duration': 180, } } @@ -37,16 +37,32 @@ class VuClipIE(InfoExtractor): webpage = self._download_webpage( adfree_url, video_id, note='Download post-ad page') + error_msg = self._html_search_regex( + r'<p class="message">(.*?)</p>', webpage, 'error message', + default=None) + if error_msg: + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, error_msg), expected=True) + + # These clowns alternate between two page types links_code = self._search_regex( - r'(?s)<div class="social align_c".*?>(.*?)<hr\s*/?>', webpage, - 'links') + r'''(?xs) + (?: + <img\s+src="/im/play.gif".*?>| + <!--\ player\ end\ -->\s*</div><!--\ thumb\ end--> + ) + (.*?) + (?: + <a\s+href="fblike|<div\s+class="social"> + ) + ''', webpage, 'links') title = self._html_search_regex( r'<title>(.*?)-\s*Vuclip</title>', webpage, 'title').strip() quality_order = qualities(['Reg', 'Hi']) formats = [] for url, q in re.findall( - r'<a href="(?P<url>[^"]+)".*?>(?P<q>[^<]+)</a>', links_code): + r'<a\s+href="(?P<url>[^"]+)".*?>(?:<button[^>]*>)?(?P<q>[^<]+)(?:</button>)?</a>', links_code): format_id = compat_urllib_parse_urlparse(url).scheme + '-' + q formats.append({ 'format_id': format_id, @@ -56,7 +72,7 @@ class VuClipIE(InfoExtractor): self._sort_formats(formats) duration = parse_duration(self._search_regex( - r'\(([0-9:]+)\)</span></h1>', webpage, 'duration', fatal=False)) + r'\(([0-9:]+)\)</span>', webpage, 'duration', fatal=False)) return { 'id': video_id, diff --git a/youtube_dl/extractor/worldstarhiphop.py b/youtube_dl/extractor/worldstarhiphop.py index 4e89acd81..bda3870db 100644 --- a/youtube_dl/extractor/worldstarhiphop.py +++ b/youtube_dl/extractor/worldstarhiphop.py @@ -13,37 +13,35 @@ class WorldStarHipHopIE(InfoExtractor): "info_dict": { "id": "wshh6a7q1ny0G34ZwuIO", "ext": "mp4", - "title": "Video: KO Of The Week: MMA Fighter Gets Knocked Out By Swift Head Kick!" + "title": "KO Of The Week: MMA Fighter Gets Knocked Out By Swift Head Kick!" } } def _real_extract(self, url): - m = re.match(self._VALID_URL, url) - video_id = m.group('id') + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) - webpage_src = self._download_webpage(url, video_id) - - m_vevo_id = re.search(r'videoId=(.*?)&?', - webpage_src) + m_vevo_id = re.search(r'videoId=(.*?)&?', webpage) if m_vevo_id is not None: return self.url_result('vevo:%s' % m_vevo_id.group(1), ie='Vevo') video_url = self._search_regex( - r'so\.addVariable\("file","(.*?)"\)', webpage_src, 'video URL') + r'so\.addVariable\("file","(.*?)"\)', webpage, 'video URL') if 'youtube' in video_url: return self.url_result(video_url, ie='Youtube') video_title = self._html_search_regex( - r"<title>(.*)</title>", webpage_src, 'title') + r'(?s)<div class="content-heading">\s*<h1>(.*?)</h1>', + webpage, 'title') # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video. thumbnail = self._html_search_regex( - r'rel="image_src" href="(.*)" />', webpage_src, 'thumbnail', + r'rel="image_src" href="(.*)" />', webpage, 'thumbnail', fatal=False) if not thumbnail: - _title = r"""candytitles.*>(.*)</span>""" - mobj = re.search(_title, webpage_src) + _title = r'candytitles.*>(.*)</span>' + mobj = re.search(_title, webpage) if mobj is not None: video_title = mobj.group(1) diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index 3ab6017cd..221341c13 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -38,16 +38,6 @@ class YahooIE(InfoExtractor): }, }, { - 'url': 'https://movies.yahoo.com/video/world-loves-spider-man-190819223.html', - 'md5': '410b7104aa9893b765bc22787a22f3d9', - 'info_dict': { - 'id': '516ed8e2-2c4f-339f-a211-7a8b49d30845', - 'ext': 'mp4', - 'title': 'The World Loves Spider-Man', - 'description': '''People all over the world are celebrating the release of \"The Amazing Spider-Man 2.\" We're taking a look at the enthusiastic response Spider-Man has received from viewers all over the world.''', - } - }, - { 'url': 'https://screen.yahoo.com/community/community-sizzle-reel-203225340.html?format=embed', 'md5': '60e8ac193d8fb71997caa8fce54c6460', 'info_dict': { diff --git a/youtube_dl/extractor/ynet.py b/youtube_dl/extractor/ynet.py index 24872861a..944d7da38 100644 --- a/youtube_dl/extractor/ynet.py +++ b/youtube_dl/extractor/ynet.py @@ -13,7 +13,7 @@ class YnetIE(InfoExtractor): _TESTS = [ { 'url': 'http://hot.ynet.co.il/home/0,7340,L-11659-99244,00.html', - 'md5': '002b44ee2f33d50363a1c153bed524cf', + 'md5': '4b29cb57c3dddd57642b3f051f535b07', 'info_dict': { 'id': 'L-11659-99244', 'ext': 'flv', @@ -22,7 +22,7 @@ class YnetIE(InfoExtractor): } }, { 'url': 'http://hot.ynet.co.il/home/0,7340,L-8859-84418,00.html', - 'md5': '6455046ae1b48cf7e2b7cae285e53a16', + 'md5': '8194c2ea221e9a639cac96b6b0753dc5', 'info_dict': { 'id': 'L-8859-84418', 'ext': 'flv', @@ -33,9 +33,7 @@ class YnetIE(InfoExtractor): ] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) content = compat_urllib_parse.unquote_plus(self._og_search_video_url(webpage)) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 99198e380..9041cfa87 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -26,7 +26,7 @@ from ..utils import ( get_element_by_attribute, ExtractorError, int_or_none, - PagedList, + OnDemandPagedList, unescapeHTML, unified_strdate, orderedSet, @@ -655,6 +655,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): # Get video webpage url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id + pref_cookies = [ + c for c in self._downloader.cookiejar + if c.domain == '.youtube.com' and c.name == 'PREF'] + for pc in pref_cookies: + if 'hl=' in pc.value: + pc.value = re.sub(r'hl=[^&]+', 'hl=en', pc.value) + else: + if pc.value: + pc.value += '&' + pc.value += 'hl=en' video_webpage = self._download_webpage(url, video_id) # Attempt to extract SWF player URL @@ -1341,7 +1351,7 @@ class YoutubeUserIE(InfoExtractor): 'id': video_id, 'title': title, } - url_results = PagedList(download_page, self._GDATA_PAGE_SIZE) + url_results = OnDemandPagedList(download_page, self._GDATA_PAGE_SIZE) return self.playlist_result(url_results, playlist_title=username) diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 44dcb1e34..f651337ad 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -87,7 +87,7 @@ def parseOpts(overrideArguments=None): for private_opt in ['-p', '--password', '-u', '--username', '--video-password']: try: i = opts.index(private_opt) - opts[i+1] = '<PRIVATE>' + opts[i+1] = 'PRIVATE' except ValueError: pass return opts diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index b644f4e92..f8dd9c72d 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -673,6 +673,8 @@ class ExtractorError(Exception): expected = True if video_id is not None: msg = video_id + ': ' + msg + if cause: + msg += u' (caused by %r)' % cause if not expected: msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.' super(ExtractorError, self).__init__(msg) @@ -799,6 +801,12 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): del req.headers['User-agent'] req.headers['User-agent'] = req.headers['Youtubedl-user-agent'] del req.headers['Youtubedl-user-agent'] + + if sys.version_info < (2, 7) and '#' in req.get_full_url(): + # Python 2.6 is brain-dead when it comes to fragments + req._Request__original = req._Request__original.partition('#')[0] + req._Request__r_type = req._Request__r_type.partition('#')[0] + return req def http_response(self, req, resp): @@ -884,6 +892,7 @@ def unified_strdate(date_str): '%d/%m/%Y', '%d/%m/%y', '%Y/%m/%d %H:%M:%S', + '%d/%m/%Y %H:%M:%S', '%Y-%m-%d %H:%M:%S', '%d.%m.%Y %H:%M', '%d.%m.%Y %H.%M', @@ -1384,14 +1393,16 @@ def check_executable(exe, args=[]): class PagedList(object): - def __init__(self, pagefunc, pagesize): - self._pagefunc = pagefunc - self._pagesize = pagesize - def __len__(self): # This is only useful for tests return len(self.getslice()) + +class OnDemandPagedList(PagedList): + def __init__(self, pagefunc, pagesize): + self._pagefunc = pagefunc + self._pagesize = pagesize + def getslice(self, start=0, end=None): res = [] for pagenum in itertools.count(start // self._pagesize): @@ -1430,6 +1441,35 @@ class PagedList(object): return res +class InAdvancePagedList(PagedList): + def __init__(self, pagefunc, pagecount, pagesize): + self._pagefunc = pagefunc + self._pagecount = pagecount + self._pagesize = pagesize + + def getslice(self, start=0, end=None): + res = [] + start_page = start // self._pagesize + end_page = ( + self._pagecount if end is None else (end // self._pagesize + 1)) + skip_elems = start - start_page * self._pagesize + only_more = None if end is None else end - start + for pagenum in range(start_page, end_page): + page = list(self._pagefunc(pagenum)) + if skip_elems: + page = page[skip_elems:] + skip_elems = None + if only_more is not None: + if len(page) < only_more: + only_more -= len(page) + else: + page = page[:only_more] + res.extend(page) + break + res.extend(page) + return res + + def uppercase_escape(s): unicode_escape = codecs.getdecoder('unicode_escape') return re.sub( @@ -1540,27 +1580,24 @@ def strip_jsonp(code): def js_to_json(code): def fix_kv(m): - key = m.group(2) - if key.startswith("'"): - assert key.endswith("'") - assert '"' not in key - key = '"%s"' % key[1:-1] - elif not key.startswith('"'): - key = '"%s"' % key - - value = m.group(4) - if value.startswith("'"): - assert value.endswith("'") - assert '"' not in value - value = '"%s"' % value[1:-1] - - return m.group(1) + key + m.group(3) + value + v = m.group(0) + if v in ('true', 'false', 'null'): + return v + if v.startswith('"'): + return v + if v.startswith("'"): + v = v[1:-1] + v = re.sub(r"\\\\|\\'|\"", lambda m: { + '\\\\': '\\\\', + "\\'": "'", + '"': '\\"', + }[m.group(0)], v) + return '"%s"' % v res = re.sub(r'''(?x) - ([{,]\s*) - ("[^"]*"|\'[^\']*\'|[a-z0-9A-Z]+) - (:\s*) - ([0-9.]+|true|false|"[^"]*"|\'[^\']*\'|\[|\{) + "(?:[^"\\]*(?:\\\\|\\")?)*"| + '(?:[^'\\]*(?:\\\\|\\')?)*'| + [a-zA-Z_][a-zA-Z_0-9]* ''', fix_kv, code) res = re.sub(r',(\s*\])', lambda m: m.group(1), res) return res diff --git a/youtube_dl/version.py b/youtube_dl/version.py index eb4356811..1384b496b 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.09.28.1' +__version__ = '2014.09.29.2' |