diff options
Diffstat (limited to 'youtube_dl/extractor/generic.py')
-rw-r--r-- | youtube_dl/extractor/generic.py | 109 |
1 files changed, 79 insertions, 30 deletions
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 285c0ff66..4d38b0c9d 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -9,8 +9,8 @@ import sys from .common import InfoExtractor from .youtube import YoutubeIE from ..compat import ( + compat_etree_fromstring, compat_urllib_parse_unquote, - compat_urllib_request, compat_urlparse, compat_xml_parse_error, ) @@ -21,7 +21,7 @@ from ..utils import ( HEADRequest, is_html, orderedSet, - parse_xml, + sanitized_Request, smuggle_url, unescapeHTML, unified_strdate, @@ -30,7 +30,10 @@ from ..utils import ( url_basename, xpath_text, ) -from .brightcove import BrightcoveIE +from .brightcove import ( + BrightcoveLegacyIE, + BrightcoveNewIE, +) from .nbc import NBCSportsVPlayerIE from .ooyala import OoyalaIE from .rutv import RUTVIE @@ -140,6 +143,7 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'title': 'Automatics, robotics and biocybernetics', 'description': 'md5:815fc1deb6b3a2bff99de2d5325be482', + 'upload_date': '20130627', 'formats': 'mincount:16', 'subtitles': 'mincount:1', }, @@ -273,7 +277,7 @@ class GenericIE(InfoExtractor): # it also tests brightcove videos that need to set the 'Referer' in the # http requests { - 'add_ie': ['Brightcove'], + 'add_ie': ['BrightcoveLegacy'], 'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/', 'info_dict': { 'id': '2765128793001', @@ -297,7 +301,7 @@ class GenericIE(InfoExtractor): 'uploader': 'thestar.com', 'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.', }, - 'add_ie': ['Brightcove'], + 'add_ie': ['BrightcoveLegacy'], }, { 'url': 'http://www.championat.com/video/football/v/87/87499.html', @@ -312,7 +316,7 @@ class GenericIE(InfoExtractor): }, { # https://github.com/rg3/youtube-dl/issues/3541 - 'add_ie': ['Brightcove'], + 'add_ie': ['BrightcoveLegacy'], 'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1', 'info_dict': { 'id': '3866516442001', @@ -818,6 +822,19 @@ class GenericIE(InfoExtractor): 'title': 'Os Guinness // Is It Fools Talk? // Unbelievable? Conference 2014', }, }, + # Kaltura embed protected with referrer + { + 'url': 'http://www.disney.nl/disney-channel/filmpjes/achter-de-schermen#/videoId/violetta-achter-de-schermen-ruggero', + 'info_dict': { + 'id': '1_g4fbemnq', + 'ext': 'mp4', + 'title': 'Violetta - Achter De Schermen - Ruggero', + 'description': 'Achter de schermen met Ruggero', + 'timestamp': 1435133761, + 'upload_date': '20150624', + 'uploader_id': 'echojecka', + }, + }, # Eagle.Platform embed (generic URL) { 'url': 'http://lenta.ru/news/2015/03/06/navalny/', @@ -1029,6 +1046,31 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'title': 'cinemasnob', }, + }, + # BrightcoveInPageEmbed embed + { + 'url': 'http://www.geekandsundry.com/tabletop-bonus-wils-final-thoughts-on-dread/', + 'info_dict': { + 'id': '4238694884001', + 'ext': 'flv', + 'title': 'Tabletop: Dread, Last Thoughts', + 'description': 'Tabletop: Dread, Last Thoughts', + 'duration': 51690, + }, + }, + # JWPlayer with M3U8 + { + 'url': 'http://ren.tv/novosti/2015-09-25/sluchaynyy-prohozhiy-poymal-avtougonshchika-v-murmanske-video', + 'info_dict': { + 'id': 'playlist', + 'ext': 'mp4', + 'title': 'Случайный прохожий поймал автоугонщика в Мурманске. ВИДЕО | РЕН ТВ', + 'uploader': 'ren.tv', + }, + 'params': { + # m3u8 downloads + 'skip_download': True, + } } ] @@ -1172,7 +1214,7 @@ class GenericIE(InfoExtractor): full_response = None if head_response is False: - request = compat_urllib_request.Request(url) + request = sanitized_Request(url) request.add_header('Accept-Encoding', '*') full_response = self._request_webpage(request, video_id) head_response = full_response @@ -1201,7 +1243,7 @@ class GenericIE(InfoExtractor): '%s on generic information extractor.' % ('Forcing' if force else 'Falling back')) if not full_response: - request = compat_urllib_request.Request(url) + request = sanitized_Request(url) # Some webservers may serve compressed content of rather big size (e.g. gzipped flac) # making it impossible to download only chunk of the file (yet we need only 512kB to # test whether it's HTML or not). According to youtube-dl default Accept-Encoding @@ -1236,7 +1278,7 @@ class GenericIE(InfoExtractor): # Is it an RSS feed, a SMIL file or a XSPF playlist? try: - doc = parse_xml(webpage) + doc = compat_etree_fromstring(webpage.encode('utf-8')) if doc.tag == 'rss': return self._extract_rss(url, video_id, doc) elif re.match(r'^(?:{[^}]+})?smil$', doc.tag): @@ -1288,14 +1330,14 @@ class GenericIE(InfoExtractor): return self.playlist_result( urlrs, playlist_id=video_id, playlist_title=video_title) - # Look for BrightCove: - bc_urls = BrightcoveIE._extract_brightcove_urls(webpage) + # Look for Brightcove Legacy Studio embeds + bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage) if bc_urls: self.to_screen('Brightcove video detected.') entries = [{ '_type': 'url', 'url': smuggle_url(bc_url, {'Referer': url}), - 'ie_key': 'Brightcove' + 'ie_key': 'BrightcoveLegacy' } for bc_url in bc_urls] return { @@ -1305,6 +1347,11 @@ class GenericIE(InfoExtractor): 'entries': entries, } + # Look for Brightcove New Studio embeds + bc_urls = BrightcoveNewIE._extract_urls(webpage) + if bc_urls: + return _playlist_from_matches(bc_urls, ie='BrightcoveNew') + # Look for embedded rtl.nl player matches = re.findall( r'<iframe[^>]+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"', @@ -1665,10 +1712,12 @@ class GenericIE(InfoExtractor): return self.url_result(mobj.group('url'), 'Zapiks') # Look for Kaltura embeds - mobj = (re.search(r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_id'\s*:\s*'(?P<id>[^']+)',", webpage) or - re.search(r'(?s)(["\'])(?:https?:)?//cdnapisec\.kaltura\.com/.*?(?:p|partner_id)/(?P<partner_id>\d+).*?\1.*?entry_id\s*:\s*(["\'])(?P<id>[^\2]+?)\2', webpage)) + mobj = (re.search(r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_?[Ii]d'\s*:\s*'(?P<id>[^']+)',", webpage) or + re.search(r'(?s)(?P<q1>["\'])(?:https?:)?//cdnapi(?:sec)?\.kaltura\.com/.*?(?:p|partner_id)/(?P<partner_id>\d+).*?(?P=q1).*?entry_?[Ii]d\s*:\s*(?P<q2>["\'])(?P<id>.+?)(?P=q2)', webpage)) if mobj is not None: - return self.url_result('kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(), 'Kaltura') + return self.url_result(smuggle_url( + 'kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(), + {'source_url': url}), 'Kaltura') # Look for Eagle.Platform embeds mobj = re.search( @@ -1713,7 +1762,7 @@ class GenericIE(InfoExtractor): # Look for UDN embeds mobj = re.search( - r'<iframe[^>]+src="(?P<url>%s)"' % UDNEmbedIE._VALID_URL, webpage) + r'<iframe[^>]+src="(?P<url>%s)"' % UDNEmbedIE._PROTOCOL_RELATIVE_VALID_URL, webpage) if mobj is not None: return self.url_result( compat_urlparse.urljoin(url, mobj.group('url')), 'UDNEmbed') @@ -1833,6 +1882,7 @@ class GenericIE(InfoExtractor): entries = [] for video_url in found: + video_url = video_url.replace('\\/', '/') video_url = compat_urlparse.urljoin(url, video_url) video_id = compat_urllib_parse_unquote(os.path.basename(video_url)) @@ -1844,25 +1894,24 @@ class GenericIE(InfoExtractor): # here's a fun little line of code for you: video_id = os.path.splitext(video_id)[0] + entry_info_dict = { + 'id': video_id, + 'uploader': video_uploader, + 'title': video_title, + 'age_limit': age_limit, + } + ext = determine_ext(video_url) if ext == 'smil': - entries.append({ - 'id': video_id, - 'formats': self._extract_smil_formats(video_url, video_id), - 'uploader': video_uploader, - 'title': video_title, - 'age_limit': age_limit, - }) + entry_info_dict['formats'] = self._extract_smil_formats(video_url, video_id) elif ext == 'xspf': return self.playlist_result(self._extract_xspf_playlist(video_url, video_id), video_id) + elif ext == 'm3u8': + entry_info_dict['formats'] = self._extract_m3u8_formats(video_url, video_id, ext='mp4') else: - entries.append({ - 'id': video_id, - 'url': video_url, - 'uploader': video_uploader, - 'title': video_title, - 'age_limit': age_limit, - }) + entry_info_dict['url'] = video_url + + entries.append(entry_info_dict) if len(entries) == 1: return entries[0] |