diff options
Diffstat (limited to 'youtube_dl/extractor/generic.py')
| -rw-r--r-- | youtube_dl/extractor/generic.py | 41 | 
1 files changed, 30 insertions, 11 deletions
| diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 7a3a7f66b..34d930a2d 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -4,10 +4,12 @@ from __future__ import unicode_literals  import os  import re +import sys  from .common import InfoExtractor  from .youtube import YoutubeIE  from ..compat import ( +    compat_etree_fromstring,      compat_urllib_parse_unquote,      compat_urllib_request,      compat_urlparse, @@ -20,7 +22,6 @@ from ..utils import (      HEADRequest,      is_html,      orderedSet, -    parse_xml,      smuggle_url,      unescapeHTML,      unified_strdate, @@ -52,6 +53,7 @@ from .dailymotion import DailymotionCloudIE  from .onionstudios import OnionStudiosIE  from .snagfilms import SnagFilmsEmbedIE  from .screenwavemedia import ScreenwaveMediaIE +from .mtv import MTVServicesEmbeddedIE  class GenericIE(InfoExtractor): @@ -142,6 +144,7 @@ class GenericIE(InfoExtractor):                  'ext': 'mp4',                  'title': 'Automatics, robotics and biocybernetics',                  'description': 'md5:815fc1deb6b3a2bff99de2d5325be482', +                'upload_date': '20130627',                  'formats': 'mincount:16',                  'subtitles': 'mincount:1',              }, @@ -234,6 +237,22 @@ class GenericIE(InfoExtractor):              }          },          { +            # redirect in Refresh HTTP header +            'url': 'https://www.facebook.com/l.php?u=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DpO8h3EaFRdo&h=TAQHsoToz&enc=AZN16h-b6o4Zq9pZkCCdOLNKMN96BbGMNtcFwHSaazus4JHT_MFYkAA-WARTX2kvsCIdlAIyHZjl6d33ILIJU7Jzwk_K3mcenAXoAzBNoZDI_Q7EXGDJnIhrGkLXo_LJ_pAa2Jzbx17UHMd3jAs--6j2zaeto5w9RTn8T_1kKg3fdC5WPX9Dbb18vzH7YFX0eSJmoa6SP114rvlkw6pkS1-T&s=1', +            'info_dict': { +                'id': 'pO8h3EaFRdo', +                'ext': 'mp4', +                'title': 'Tripeo Boiler Room x Dekmantel Festival DJ Set', +                'description': 'md5:6294cc1af09c4049e0652b51a2df10d5', +                'upload_date': '20150917', +                'uploader_id': 'brtvofficial', +                'uploader': 'Boiler Room', +            }, +            'params': { +                'skip_download': False, +            }, +        }, +        {              'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',              'md5': '85b90ccc9d73b4acd9138d3af4c27f89',              'info_dict': { @@ -1233,7 +1252,7 @@ class GenericIE(InfoExtractor):          # Is it an RSS feed, a SMIL file or a XSPF playlist?          try: -            doc = parse_xml(webpage) +            doc = compat_etree_fromstring(webpage.encode('utf-8'))              if doc.tag == 'rss':                  return self._extract_rss(url, video_id, doc)              elif re.match(r'^(?:{[^}]+})?smil$', doc.tag): @@ -1613,12 +1632,9 @@ class GenericIE(InfoExtractor):              return self.url_result(url, ie='Vulture')          # Look for embedded mtvservices player -        mobj = re.search( -            r'<iframe src="(?P<url>https?://media\.mtvnservices\.com/embed/[^"]+)"', -            webpage) -        if mobj is not None: -            url = unescapeHTML(mobj.group('url')) -            return self.url_result(url, ie='MTVServicesEmbedded') +        mtvservices_url = MTVServicesEmbeddedIE._extract_url(webpage) +        if mtvservices_url: +            return self.url_result(mtvservices_url, ie='MTVServicesEmbedded')          # Look for embedded yahoo player          mobj = re.search( @@ -1657,7 +1673,7 @@ class GenericIE(InfoExtractor):              return self.url_result(mobj.group('url'), 'MLB')          mobj = re.search( -            r'<iframe[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL, +            r'<(?:iframe|script)[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL,              webpage)          if mobj is not None:              return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast') @@ -1675,8 +1691,8 @@ class GenericIE(InfoExtractor):              return self.url_result(mobj.group('url'), 'Zapiks')          # Look for Kaltura embeds -        mobj = (re.search(r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_id'\s*:\s*'(?P<id>[^']+)',", webpage) or -                re.search(r'(?s)(["\'])(?:https?:)?//cdnapisec\.kaltura\.com/.*?(?:p|partner_id)/(?P<partner_id>\d+).*?\1.*?entry_id\s*:\s*(["\'])(?P<id>[^\2]+?)\2', webpage)) +        mobj = (re.search(r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_?[Ii]d'\s*:\s*'(?P<id>[^']+)',", webpage) or +                re.search(r'(?s)(?P<q1>["\'])(?:https?:)?//cdnapi(?:sec)?\.kaltura\.com/.*?(?:p|partner_id)/(?P<partner_id>\d+).*?(?P=q1).*?entry_?[Ii]d\s*:\s*(?P<q2>["\'])(?P<id>.+?)(?P=q2)', webpage))          if mobj is not None:              return self.url_result('kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(), 'Kaltura') @@ -1827,6 +1843,9 @@ class GenericIE(InfoExtractor):                  # Look also in Refresh HTTP header                  refresh_header = head_response.headers.get('Refresh')                  if refresh_header: +                    # In python 2 response HTTP headers are bytestrings +                    if sys.version_info < (3, 0) and isinstance(refresh_header, str): +                        refresh_header = refresh_header.decode('iso-8859-1')                      found = re.search(REDIRECT_REGEX, refresh_header)              if found:                  new_url = compat_urlparse.urljoin(url, unescapeHTML(found.group(1))) | 
