diff options
Diffstat (limited to 'youtube_dl/extractor/generic.py')
| -rw-r--r-- | youtube_dl/extractor/generic.py | 240 | 
1 files changed, 194 insertions, 46 deletions
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index dfc2ef4e7..2b4d8c62f 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -7,11 +7,12 @@ import re  from .common import InfoExtractor  from .youtube import YoutubeIE -from ..utils import ( +from ..compat import (      compat_urllib_parse,      compat_urlparse,      compat_xml_parse_error, - +) +from ..utils import (      determine_ext,      ExtractorError,      float_or_none, @@ -28,6 +29,7 @@ from .brightcove import BrightcoveIE  from .ooyala import OoyalaIE  from .rutv import RUTVIE  from .smotri import SmotriIE +from .condenast import CondeNastIE  class GenericIE(InfoExtractor): @@ -98,6 +100,22 @@ class GenericIE(InfoExtractor):                  'uploader': 'Championat',              },          }, +        { +            # https://github.com/rg3/youtube-dl/issues/3541 +            'add_ie': ['Brightcove'], +            'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1', +            'info_dict': { +                'id': '3866516442001', +                'ext': 'mp4', +                'title': 'Leer mij vrouwen kennen: Aflevering 1', +                'description': 'Leer mij vrouwen kennen: Aflevering 1', +                'uploader': 'SBS Broadcasting', +            }, +            'skip': 'Restricted to Netherlands', +            'params': { +                'skip_download': True,  # m3u8 download +            }, +        },          # Direct link to a video          {              'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4', @@ -324,7 +342,7 @@ class GenericIE(InfoExtractor):                  'ext': 'mp4',                  'age_limit': 18,                  'uploader': 'www.handjobhub.com', -                'title': 'Busty Blonde Siri Tit Fuck While Wank at Handjob Hub', +                'title': 'Busty Blonde Siri Tit Fuck While Wank at HandjobHub.com',              }          },          # RSS feed @@ -379,6 +397,87 @@ class GenericIE(InfoExtractor):                  'uploader': 'education-portal.com',              },          }, +        { +            'url': 'http://thoughtworks.wistia.com/medias/uxjb0lwrcz', +            'md5': 'baf49c2baa8a7de5f3fc145a8506dcd4', +            'info_dict': { +                'id': 'uxjb0lwrcz', +                'ext': 'mp4', +                'title': 'Conversation about Hexagonal Rails Part 1 - ThoughtWorks', +                'duration': 1715.0, +                'uploader': 'thoughtworks.wistia.com', +            }, +        }, +        # Direct download with broken HEAD +        { +            'url': 'http://ai-radio.org:8000/radio.opus', +            'info_dict': { +                'id': 'radio', +                'ext': 'opus', +                'title': 'radio', +            }, +            'params': { +                'skip_download': True,  # infinite live stream +            }, +            'expected_warnings': [ +                r'501.*Not Implemented' +            ], +        }, +        # Soundcloud embed +        { +            'url': 'http://nakedsecurity.sophos.com/2014/10/29/sscc-171-are-you-sure-that-1234-is-a-bad-password-podcast/', +            'info_dict': { +                'id': '174391317', +                'ext': 'mp3', +                'description': 'md5:ff867d6b555488ad3c52572bb33d432c', +                'uploader': 'Sophos Security', +                'title': 'Chet Chat 171 - Oct 29, 2014', +                'upload_date': '20141029', +            } +        }, +        # Livestream embed +        { +            'url': 'http://www.esa.int/Our_Activities/Space_Science/Rosetta/Philae_comet_touch-down_webcast', +            'info_dict': { +                'id': '67864563', +                'ext': 'flv', +                'upload_date': '20141112', +                'title': 'Rosetta #CometLanding webcast HL 10', +            } +        }, +        # LazyYT +        { +            'url': 'http://discourse.ubuntu.com/t/unity-8-desktop-mode-windows-on-mir/1986', +            'info_dict': { +                'title': 'Unity 8 desktop-mode windows on Mir! - Ubuntu Discourse', +            }, +            'playlist_mincount': 2, +        }, +        # Direct link with incorrect MIME type +        { +            'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm', +            'md5': '4ccbebe5f36706d85221f204d7eb5913', +            'info_dict': { +                'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm', +                'id': '5_Lennart_Poettering_-_Systemd', +                'ext': 'webm', +                'title': '5_Lennart_Poettering_-_Systemd', +                'upload_date': '20141120', +            }, +            'expected_warnings': [ +                'URL could be a direct video link, returning it as such.' +            ] +        }, +        # Cinchcast embed +        { +            'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/', +            'info_dict': { +                'id': '7141703', +                'ext': 'mp3', +                'upload_date': '20141126', +                'title': 'Jack Tips: 5 Steps to Permanent Gut Healing', +            } +        },      ]      def report_following_redirect(self, new_url): @@ -471,11 +570,12 @@ class GenericIE(InfoExtractor):              if default_search in ('error', 'fixup_error'):                  raise ExtractorError( -                    ('%r is not a valid URL. ' -                     'Set --default-search "ytsearch" (or run  youtube-dl "ytsearch:%s" ) to search YouTube' -                    ) % (url, url), expected=True) +                    '%r is not a valid URL. ' +                    'Set --default-search "ytsearch" (or run  youtube-dl "ytsearch:%s" ) to search YouTube' +                    % (url, url), expected=True)              else: -                assert ':' in default_search +                if ':' not in default_search: +                    default_search += ':'                  return self.url_result(default_search + url)          url, smuggled_data = unsmuggle_url(url) @@ -490,14 +590,14 @@ class GenericIE(InfoExtractor):          self.to_screen('%s: Requesting header' % video_id)          head_req = HEADRequest(url) -        response = self._request_webpage( +        head_response = self._request_webpage(              head_req, video_id,              note=False, errnote='Could not send HEAD request to %s' % url,              fatal=False) -        if response is not False: +        if head_response is not False:              # Check for redirect -            new_url = response.geturl() +            new_url = head_response.geturl()              if url != new_url:                  self.report_following_redirect(new_url)                  if force_videoid: @@ -505,33 +605,53 @@ class GenericIE(InfoExtractor):                          new_url, {'force_videoid': force_videoid})                  return self.url_result(new_url) -            # Check for direct link to a video -            content_type = response.headers.get('Content-Type', '') -            m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type) -            if m: -                upload_date = response.headers.get('Last-Modified') -                if upload_date: -                    upload_date = unified_strdate(upload_date) -                return { -                    'id': video_id, -                    'title': os.path.splitext(url_basename(url))[0], -                    'formats': [{ -                        'format_id': m.group('format_id'), -                        'url': url, -                        'vcodec': 'none' if m.group('type') == 'audio' else None -                    }], -                    'upload_date': upload_date, -                } +        full_response = None +        if head_response is False: +            full_response = self._request_webpage(url, video_id) +            head_response = full_response + +        # Check for direct link to a video +        content_type = head_response.headers.get('Content-Type', '') +        m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type) +        if m: +            upload_date = unified_strdate( +                head_response.headers.get('Last-Modified')) +            return { +                'id': video_id, +                'title': os.path.splitext(url_basename(url))[0], +                'direct': True, +                'formats': [{ +                    'format_id': m.group('format_id'), +                    'url': url, +                    'vcodec': 'none' if m.group('type') == 'audio' else None +                }], +                'upload_date': upload_date, +            }          if not self._downloader.params.get('test', False) and not is_intentional:              self._downloader.report_warning('Falling back on generic information extractor.') -        try: -            webpage = self._download_webpage(url, video_id) -        except ValueError: -            # since this is the last-resort InfoExtractor, if -            # this error is thrown, it'll be thrown here -            raise ExtractorError('Failed to download URL: %s' % url) +        if not full_response: +            full_response = self._request_webpage(url, video_id) + +        # Maybe it's a direct link to a video? +        # Be careful not to download the whole thing! +        first_bytes = full_response.read(512) +        if not re.match(r'^\s*<', first_bytes.decode('utf-8', 'replace')): +            self._downloader.report_warning( +                'URL could be a direct video link, returning it as such.') +            upload_date = unified_strdate( +                head_response.headers.get('Last-Modified')) +            return { +                'id': video_id, +                'title': os.path.splitext(url_basename(url))[0], +                'direct': True, +                'url': url, +                'upload_date': upload_date, +            } + +        webpage = self._webpage_read_content( +            full_response, url, video_id, prefix=first_bytes)          self.report_extraction(video_id) @@ -608,13 +728,13 @@ class GenericIE(InfoExtractor):          if mobj:              player_url = unescapeHTML(mobj.group('url'))              surl = smuggle_url(player_url, {'Referer': url}) -            return self.url_result(surl, 'Vimeo') +            return self.url_result(surl)          # Look for embedded (swf embed) Vimeo player          mobj = re.search( -            r'<embed[^>]+?src="(https?://(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage) +            r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage)          if mobj: -            return self.url_result(mobj.group(1), 'Vimeo') +            return self.url_result(mobj.group(1))          # Look for embedded YouTube player          matches = re.findall(r'''(?x) @@ -622,7 +742,8 @@ class GenericIE(InfoExtractor):                  <iframe[^>]+?src=|                  data-video-url=|                  <embed[^>]+?src=| -                embedSWF\(?:\s* +                embedSWF\(?:\s*| +                new\s+SWFObject\(              )              (["\'])                  (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/ @@ -632,6 +753,12 @@ class GenericIE(InfoExtractor):              return _playlist_from_matches(                  matches, lambda m: unescapeHTML(m[1])) +        # Look for lazyYT YouTube embed +        matches = re.findall( +            r'class="lazyYT" data-youtube-id="([^"]+)"', webpage) +        if matches: +            return _playlist_from_matches(matches, lambda m: unescapeHTML(m)) +          # Look for embedded Dailymotion player          matches = re.findall(              r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage) @@ -651,17 +778,20 @@ class GenericIE(InfoExtractor):          # Look for embedded Wistia player          match = re.search( -            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage) +            r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)          if match: +            embed_url = self._proto_relative_url( +                unescapeHTML(match.group('url')))              return {                  '_type': 'url_transparent', -                'url': unescapeHTML(match.group('url')), +                'url': embed_url,                  'ie_key': 'Wistia',                  'uploader': video_uploader,                  'title': video_title,                  'id': video_id,              } -        match = re.search(r'(?:id=["\']wistia_|data-wistiaid=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage) + +        match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)          if match:              return {                  '_type': 'url_transparent', @@ -675,7 +805,7 @@ class GenericIE(InfoExtractor):          # Look for embedded blip.tv player          mobj = re.search(r'<meta\s[^>]*https?://api\.blip\.tv/\w+/redirect/\w+/(\d+)', webpage)          if mobj: -            return self.url_result('http://blip.tv/a/a-'+mobj.group(1), 'BlipTV') +            return self.url_result('http://blip.tv/a/a-' + mobj.group(1), 'BlipTV')          mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*(https?://(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)[a-zA-Z0-9_]+)', webpage)          if mobj:              return self.url_result(mobj.group(1), 'BlipTV') @@ -711,7 +841,7 @@ class GenericIE(InfoExtractor):          # Look for Ooyala videos          mobj = (re.search(r'player.ooyala.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or -             re.search(r'OO.Player.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage)) +                re.search(r'OO.Player.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage))          if mobj is not None:              return OoyalaIE._build_url_result(mobj.group('ec')) @@ -805,7 +935,7 @@ class GenericIE(InfoExtractor):          # Look for embeded soundcloud player          mobj = re.search( -            r'<iframe src="(?P<url>https?://(?:w\.)?soundcloud\.com/player[^"]+)"', +            r'<iframe\s+(?:[a-zA-Z0-9_-]+="[^"]+"\s+)*src="(?P<url>https?://(?:w\.)?soundcloud\.com/player[^"]+)"',              webpage)          if mobj is not None:              url = unescapeHTML(mobj.group('url')) @@ -841,12 +971,31 @@ class GenericIE(InfoExtractor):          if mobj is not None:              return self.url_result(mobj.group('url'), 'SBS') +        # Look for embedded Cinchcast player +        mobj = re.search( +            r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1', +            webpage) +        if mobj is not None: +            return self.url_result(mobj.group('url'), 'Cinchcast') +          mobj = re.search( -            r'<iframe[^>]+?src=(["\'])(?P<url>https?://m\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1', +            r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1',              webpage)          if mobj is not None:              return self.url_result(mobj.group('url'), 'MLB') +        mobj = re.search( +            r'<iframe[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL, +            webpage) +        if mobj is not None: +            return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast') + +        mobj = re.search( +            r'<iframe[^>]+src="(?P<url>https?://new\.livestream\.com/[^"]+/player[^"]+)"', +            webpage) +        if mobj is not None: +            return self.url_result(mobj.group('url'), 'Livestream') +          def check_video(vurl):              vpath = compat_urlparse.urlparse(vurl).path              vext = determine_ext(vpath) @@ -894,7 +1043,7 @@ class GenericIE(InfoExtractor):                  found = filter_video(re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage))          if not found:              # HTML5 video -            found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]+)? src="([^"]+)"', webpage) +            found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']', webpage)          if not found:              found = re.search(                  r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")' @@ -940,4 +1089,3 @@ class GenericIE(InfoExtractor):                  '_type': 'playlist',                  'entries': entries,              } -  | 
