diff options
Diffstat (limited to 'youtube_dl/extractor/generic.py')
| -rw-r--r-- | youtube_dl/extractor/generic.py | 158 | 
1 files changed, 114 insertions, 44 deletions
| diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index ec748ed9f..c2e8f9b62 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -4,12 +4,13 @@ from __future__ import unicode_literals  import os  import re +import sys  from .common import InfoExtractor  from .youtube import YoutubeIE  from ..compat import ( +    compat_etree_fromstring,      compat_urllib_parse_unquote, -    compat_urllib_request,      compat_urlparse,      compat_xml_parse_error,  ) @@ -20,7 +21,7 @@ from ..utils import (      HEADRequest,      is_html,      orderedSet, -    parse_xml, +    sanitized_Request,      smuggle_url,      unescapeHTML,      unified_strdate, @@ -29,7 +30,10 @@ from ..utils import (      url_basename,      xpath_text,  ) -from .brightcove import BrightcoveIE +from .brightcove import ( +    BrightcoveLegacyIE, +    BrightcoveNewIE, +)  from .nbc import NBCSportsVPlayerIE  from .ooyala import OoyalaIE  from .rutv import RUTVIE @@ -49,6 +53,8 @@ from .dailymotion import DailymotionCloudIE  from .onionstudios import OnionStudiosIE  from .snagfilms import SnagFilmsEmbedIE  from .screenwavemedia import ScreenwaveMediaIE +from .mtv import MTVServicesEmbeddedIE +from .pladform import PladformIE  class GenericIE(InfoExtractor): @@ -139,6 +145,7 @@ class GenericIE(InfoExtractor):                  'ext': 'mp4',                  'title': 'Automatics, robotics and biocybernetics',                  'description': 'md5:815fc1deb6b3a2bff99de2d5325be482', +                'upload_date': '20130627',                  'formats': 'mincount:16',                  'subtitles': 'mincount:1',              }, @@ -231,6 +238,22 @@ class GenericIE(InfoExtractor):              }          },          { +            # redirect in Refresh HTTP header +            'url': 'https://www.facebook.com/l.php?u=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DpO8h3EaFRdo&h=TAQHsoToz&enc=AZN16h-b6o4Zq9pZkCCdOLNKMN96BbGMNtcFwHSaazus4JHT_MFYkAA-WARTX2kvsCIdlAIyHZjl6d33ILIJU7Jzwk_K3mcenAXoAzBNoZDI_Q7EXGDJnIhrGkLXo_LJ_pAa2Jzbx17UHMd3jAs--6j2zaeto5w9RTn8T_1kKg3fdC5WPX9Dbb18vzH7YFX0eSJmoa6SP114rvlkw6pkS1-T&s=1', +            'info_dict': { +                'id': 'pO8h3EaFRdo', +                'ext': 'mp4', +                'title': 'Tripeo Boiler Room x Dekmantel Festival DJ Set', +                'description': 'md5:6294cc1af09c4049e0652b51a2df10d5', +                'upload_date': '20150917', +                'uploader_id': 'brtvofficial', +                'uploader': 'Boiler Room', +            }, +            'params': { +                'skip_download': False, +            }, +        }, +        {              'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',              'md5': '85b90ccc9d73b4acd9138d3af4c27f89',              'info_dict': { @@ -256,7 +279,7 @@ class GenericIE(InfoExtractor):          # it also tests brightcove videos that need to set the 'Referer' in the          # http requests          { -            'add_ie': ['Brightcove'], +            'add_ie': ['BrightcoveLegacy'],              'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/',              'info_dict': {                  'id': '2765128793001', @@ -280,7 +303,7 @@ class GenericIE(InfoExtractor):                  'uploader': 'thestar.com',                  'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.',              }, -            'add_ie': ['Brightcove'], +            'add_ie': ['BrightcoveLegacy'],          },          {              'url': 'http://www.championat.com/video/football/v/87/87499.html', @@ -295,7 +318,7 @@ class GenericIE(InfoExtractor):          },          {              # https://github.com/rg3/youtube-dl/issues/3541 -            'add_ie': ['Brightcove'], +            'add_ie': ['BrightcoveLegacy'],              'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1',              'info_dict': {                  'id': '3866516442001', @@ -317,6 +340,7 @@ class GenericIE(InfoExtractor):                  'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',                  'ext': 'mp4',                  'title': '2cc213299525360.mov',  # that's what we get +                'duration': 238.231,              },              'add_ie': ['Ooyala'],          }, @@ -328,6 +352,7 @@ class GenericIE(InfoExtractor):                  'ext': 'mp4',                  'title': '"Steve Jobs: Man in the Machine" trailer',                  'description': 'The first trailer for the Alex Gibney documentary "Steve Jobs: Man in the Machine."', +                'duration': 135.427,              },              'params': {                  'skip_download': True, @@ -801,6 +826,19 @@ class GenericIE(InfoExtractor):                  'title': 'Os Guinness // Is It Fools Talk? // Unbelievable? Conference 2014',              },          }, +        # Kaltura embed protected with referrer +        { +            'url': 'http://www.disney.nl/disney-channel/filmpjes/achter-de-schermen#/videoId/violetta-achter-de-schermen-ruggero', +            'info_dict': { +                'id': '1_g4fbemnq', +                'ext': 'mp4', +                'title': 'Violetta - Achter De Schermen - Ruggero', +                'description': 'Achter de schermen met Ruggero', +                'timestamp': 1435133761, +                'upload_date': '20150624', +                'uploader_id': 'echojecka', +            }, +        },          # Eagle.Platform embed (generic URL)          {              'url': 'http://lenta.ru/news/2015/03/06/navalny/', @@ -925,8 +963,9 @@ class GenericIE(InfoExtractor):              'info_dict': {                  'id': '50YnY4czr4ms1vJ7yz3xzq0excz_pUMs',                  'ext': 'mp4', -                'description': 'VIDEO: Index/Match versus VLOOKUP.', +                'description': 'VIDEO: INDEX/MATCH versus VLOOKUP.',                  'title': 'This is what separates the Excel masters from the wannabes', +                'duration': 191.933,              },              'params': {                  # m3u8 downloads @@ -1012,6 +1051,31 @@ class GenericIE(InfoExtractor):                  'ext': 'mp4',                  'title': 'cinemasnob',              }, +        }, +        # BrightcoveInPageEmbed embed +        { +            'url': 'http://www.geekandsundry.com/tabletop-bonus-wils-final-thoughts-on-dread/', +            'info_dict': { +                'id': '4238694884001', +                'ext': 'flv', +                'title': 'Tabletop: Dread, Last Thoughts', +                'description': 'Tabletop: Dread, Last Thoughts', +                'duration': 51690, +            }, +        }, +        # JWPlayer with M3U8 +        { +            'url': 'http://ren.tv/novosti/2015-09-25/sluchaynyy-prohozhiy-poymal-avtougonshchika-v-murmanske-video', +            'info_dict': { +                'id': 'playlist', +                'ext': 'mp4', +                'title': 'Случайный прохожий поймал автоугонщика в Мурманске. ВИДЕО | РЕН ТВ', +                'uploader': 'ren.tv', +            }, +            'params': { +                # m3u8 downloads +                'skip_download': True, +            }          }      ] @@ -1155,7 +1219,7 @@ class GenericIE(InfoExtractor):          full_response = None          if head_response is False: -            request = compat_urllib_request.Request(url) +            request = sanitized_Request(url)              request.add_header('Accept-Encoding', '*')              full_response = self._request_webpage(request, video_id)              head_response = full_response @@ -1184,7 +1248,7 @@ class GenericIE(InfoExtractor):                  '%s on generic information extractor.' % ('Forcing' if force else 'Falling back'))          if not full_response: -            request = compat_urllib_request.Request(url) +            request = sanitized_Request(url)              # Some webservers may serve compressed content of rather big size (e.g. gzipped flac)              # making it impossible to download only chunk of the file (yet we need only 512kB to              # test whether it's HTML or not). According to youtube-dl default Accept-Encoding @@ -1219,7 +1283,7 @@ class GenericIE(InfoExtractor):          # Is it an RSS feed, a SMIL file or a XSPF playlist?          try: -            doc = parse_xml(webpage) +            doc = compat_etree_fromstring(webpage.encode('utf-8'))              if doc.tag == 'rss':                  return self._extract_rss(url, video_id, doc)              elif re.match(r'^(?:{[^}]+})?smil$', doc.tag): @@ -1271,14 +1335,14 @@ class GenericIE(InfoExtractor):              return self.playlist_result(                  urlrs, playlist_id=video_id, playlist_title=video_title) -        # Look for BrightCove: -        bc_urls = BrightcoveIE._extract_brightcove_urls(webpage) +        # Look for Brightcove Legacy Studio embeds +        bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage)          if bc_urls:              self.to_screen('Brightcove video detected.')              entries = [{                  '_type': 'url',                  'url': smuggle_url(bc_url, {'Referer': url}), -                'ie_key': 'Brightcove' +                'ie_key': 'BrightcoveLegacy'              } for bc_url in bc_urls]              return { @@ -1288,6 +1352,11 @@ class GenericIE(InfoExtractor):                  'entries': entries,              } +        # Look for Brightcove New Studio embeds +        bc_urls = BrightcoveNewIE._extract_urls(webpage) +        if bc_urls: +            return _playlist_from_matches(bc_urls, ie='BrightcoveNew') +          # Look for embedded rtl.nl player          matches = re.findall(              r'<iframe[^>]+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"', @@ -1436,7 +1505,7 @@ class GenericIE(InfoExtractor):                  re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage) or                  re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage))          if mobj is not None: -            return OoyalaIE._build_url_result(mobj.group('ec')) +            return OoyalaIE._build_url_result(smuggle_url(mobj.group('ec'), {'domain': url}))          # Look for multiple Ooyala embeds on SBN network websites          mobj = re.search(r'SBN\.VideoLinkset\.entryGroup\((\[.*?\])', webpage) @@ -1444,7 +1513,7 @@ class GenericIE(InfoExtractor):              embeds = self._parse_json(mobj.group(1), video_id, fatal=False)              if embeds:                  return _playlist_from_matches( -                    embeds, getter=lambda v: OoyalaIE._url_for_embed_code(v['provider_video_id']), ie='Ooyala') +                    embeds, getter=lambda v: OoyalaIE._url_for_embed_code(smuggle_url(v['provider_video_id'], {'domain': url})), ie='Ooyala')          # Look for Aparat videos          mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage) @@ -1594,12 +1663,9 @@ class GenericIE(InfoExtractor):              return self.url_result(url, ie='Vulture')          # Look for embedded mtvservices player -        mobj = re.search( -            r'<iframe src="(?P<url>https?://media\.mtvnservices\.com/embed/[^"]+)"', -            webpage) -        if mobj is not None: -            url = unescapeHTML(mobj.group('url')) -            return self.url_result(url, ie='MTVServicesEmbedded') +        mtvservices_url = MTVServicesEmbeddedIE._extract_url(webpage) +        if mtvservices_url: +            return self.url_result(mtvservices_url, ie='MTVServicesEmbedded')          # Look for embedded yahoo player          mobj = re.search( @@ -1638,7 +1704,7 @@ class GenericIE(InfoExtractor):              return self.url_result(mobj.group('url'), 'MLB')          mobj = re.search( -            r'<iframe[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL, +            r'<(?:iframe|script)[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL,              webpage)          if mobj is not None:              return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast') @@ -1656,10 +1722,12 @@ class GenericIE(InfoExtractor):              return self.url_result(mobj.group('url'), 'Zapiks')          # Look for Kaltura embeds -        mobj = (re.search(r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_id'\s*:\s*'(?P<id>[^']+)',", webpage) or -                re.search(r'(?s)(["\'])(?:https?:)?//cdnapisec\.kaltura\.com/.*?(?:p|partner_id)/(?P<partner_id>\d+).*?\1.*?entry_id\s*:\s*(["\'])(?P<id>[^\2]+?)\2', webpage)) +        mobj = (re.search(r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_?[Ii]d'\s*:\s*'(?P<id>[^']+)',", webpage) or +                re.search(r'(?s)(?P<q1>["\'])(?:https?:)?//cdnapi(?:sec)?\.kaltura\.com/.*?(?:p|partner_id)/(?P<partner_id>\d+).*?(?P=q1).*?entry_?[Ii]d\s*:\s*(?P<q2>["\'])(?P<id>.+?)(?P=q2)', webpage))          if mobj is not None: -            return self.url_result('kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(), 'Kaltura') +            return self.url_result(smuggle_url( +                'kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(), +                {'source_url': url}), 'Kaltura')          # Look for Eagle.Platform embeds          mobj = re.search( @@ -1674,10 +1742,9 @@ class GenericIE(InfoExtractor):              return self.url_result('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), 'EaglePlatform')          # Look for Pladform embeds -        mobj = re.search( -            r'<iframe[^>]+src="(?P<url>https?://out\.pladform\.ru/player\?.+?)"', webpage) -        if mobj is not None: -            return self.url_result(mobj.group('url'), 'Pladform') +        pladform_url = PladformIE._extract_url(webpage) +        if pladform_url: +            return self.url_result(pladform_url)          # Look for Playwire embeds          mobj = re.search( @@ -1704,7 +1771,7 @@ class GenericIE(InfoExtractor):          # Look for UDN embeds          mobj = re.search( -            r'<iframe[^>]+src="(?P<url>%s)"' % UDNEmbedIE._VALID_URL, webpage) +            r'<iframe[^>]+src="(?P<url>%s)"' % UDNEmbedIE._PROTOCOL_RELATIVE_VALID_URL, webpage)          if mobj is not None:              return self.url_result(                  compat_urlparse.urljoin(url, mobj.group('url')), 'UDNEmbed') @@ -1808,6 +1875,9 @@ class GenericIE(InfoExtractor):                  # Look also in Refresh HTTP header                  refresh_header = head_response.headers.get('Refresh')                  if refresh_header: +                    # In python 2 response HTTP headers are bytestrings +                    if sys.version_info < (3, 0) and isinstance(refresh_header, str): +                        refresh_header = refresh_header.decode('iso-8859-1')                      found = re.search(REDIRECT_REGEX, refresh_header)              if found:                  new_url = compat_urlparse.urljoin(url, unescapeHTML(found.group(1))) @@ -1821,6 +1891,7 @@ class GenericIE(InfoExtractor):          entries = []          for video_url in found: +            video_url = video_url.replace('\\/', '/')              video_url = compat_urlparse.urljoin(url, video_url)              video_id = compat_urllib_parse_unquote(os.path.basename(video_url)) @@ -1832,25 +1903,24 @@ class GenericIE(InfoExtractor):              # here's a fun little line of code for you:              video_id = os.path.splitext(video_id)[0] +            entry_info_dict = { +                'id': video_id, +                'uploader': video_uploader, +                'title': video_title, +                'age_limit': age_limit, +            } +              ext = determine_ext(video_url)              if ext == 'smil': -                entries.append({ -                    'id': video_id, -                    'formats': self._extract_smil_formats(video_url, video_id), -                    'uploader': video_uploader, -                    'title': video_title, -                    'age_limit': age_limit, -                }) +                entry_info_dict['formats'] = self._extract_smil_formats(video_url, video_id)              elif ext == 'xspf':                  return self.playlist_result(self._extract_xspf_playlist(video_url, video_id), video_id) +            elif ext == 'm3u8': +                entry_info_dict['formats'] = self._extract_m3u8_formats(video_url, video_id, ext='mp4')              else: -                entries.append({ -                    'id': video_id, -                    'url': video_url, -                    'uploader': video_uploader, -                    'title': video_title, -                    'age_limit': age_limit, -                }) +                entry_info_dict['url'] = video_url + +            entries.append(entry_info_dict)          if len(entries) == 1:              return entries[0] | 
