diff options
Diffstat (limited to 'youtube_dl/extractor/generic.py')
| -rw-r--r-- | youtube_dl/extractor/generic.py | 114 | 
1 files changed, 96 insertions, 18 deletions
| diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index af769ab61..7a5bf9392 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -23,6 +23,7 @@ from ..utils import (      unescapeHTML,      unified_strdate,      unsmuggle_url, +    UnsupportedError,      url_basename,  )  from .brightcove import BrightcoveIE @@ -130,12 +131,13 @@ class GenericIE(InfoExtractor):          # ooyala video          {              'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219', -            'md5': '5644c6ca5d5782c1d0d350dad9bd840c', +            'md5': '166dd577b433b4d4ebfee10b0824d8ff',              'info_dict': {                  'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',                  'ext': 'mp4',                  'title': '2cc213299525360.mov',  # that's what we get              }, +            'add_ie': ['Ooyala'],          },          # google redirect          { @@ -145,7 +147,7 @@ class GenericIE(InfoExtractor):                  'ext': 'mp4',                  'upload_date': '20130224',                  'uploader_id': 'TheVerge', -                'description': 'Chris Ziegler takes a look at the Alcatel OneTouch Fire and the ZTE Open; two of the first Firefox OS handsets to be officially announced.', +                'description': 're:^Chris Ziegler takes a look at the\.*',                  'uploader': 'The Verge',                  'title': 'First Firefox OS phones side-by-side',              }, @@ -180,6 +182,14 @@ class GenericIE(InfoExtractor):                  'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.',              },          }, +        # BBC iPlayer embeds +        { +            'url': 'http://www.bbc.co.uk/blogs/adamcurtis/posts/BUGGER', +            'info_dict': { +                'title': 'BBC - Blogs -  Adam Curtis - BUGGER', +            }, +            'playlist_mincount': 18, +        },          # RUTV embed          {              'url': 'http://www.rg.ru/2014/03/15/reg-dfo/anklav-anons.html', @@ -445,6 +455,39 @@ class GenericIE(InfoExtractor):                  'title': 'Rosetta #CometLanding webcast HL 10',              }          }, +        # LazyYT +        { +            'url': 'http://discourse.ubuntu.com/t/unity-8-desktop-mode-windows-on-mir/1986', +            'info_dict': { +                'title': 'Unity 8 desktop-mode windows on Mir! - Ubuntu Discourse', +            }, +            'playlist_mincount': 2, +        }, +        # Direct link with incorrect MIME type +        { +            'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm', +            'md5': '4ccbebe5f36706d85221f204d7eb5913', +            'info_dict': { +                'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm', +                'id': '5_Lennart_Poettering_-_Systemd', +                'ext': 'webm', +                'title': '5_Lennart_Poettering_-_Systemd', +                'upload_date': '20141120', +            }, +            'expected_warnings': [ +                'URL could be a direct video link, returning it as such.' +            ] +        }, +        # Cinchcast embed +        { +            'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/', +            'info_dict': { +                'id': '7141703', +                'ext': 'mp3', +                'upload_date': '20141126', +                'title': 'Jack Tips: 5 Steps to Permanent Gut Healing', +            } +        },      ]      def report_following_redirect(self, new_url): @@ -537,9 +580,9 @@ class GenericIE(InfoExtractor):              if default_search in ('error', 'fixup_error'):                  raise ExtractorError( -                    ('%r is not a valid URL. ' -                     'Set --default-search "ytsearch" (or run  youtube-dl "ytsearch:%s" ) to search YouTube' -                    ) % (url, url), expected=True) +                    '%r is not a valid URL. ' +                    'Set --default-search "ytsearch" (or run  youtube-dl "ytsearch:%s" ) to search YouTube' +                    % (url, url), expected=True)              else:                  if ':' not in default_search:                      default_search += ':' @@ -598,10 +641,28 @@ class GenericIE(InfoExtractor):          if not self._downloader.params.get('test', False) and not is_intentional:              self._downloader.report_warning('Falling back on generic information extractor.') -        if full_response: -            webpage = self._webpage_read_content(full_response, url, video_id) -        else: -            webpage = self._download_webpage(url, video_id) +        if not full_response: +            full_response = self._request_webpage(url, video_id) + +        # Maybe it's a direct link to a video? +        # Be careful not to download the whole thing! +        first_bytes = full_response.read(512) +        if not re.match(r'^\s*<', first_bytes.decode('utf-8', 'replace')): +            self._downloader.report_warning( +                'URL could be a direct video link, returning it as such.') +            upload_date = unified_strdate( +                head_response.headers.get('Last-Modified')) +            return { +                'id': video_id, +                'title': os.path.splitext(url_basename(url))[0], +                'direct': True, +                'url': url, +                'upload_date': upload_date, +            } + +        webpage = self._webpage_read_content( +            full_response, url, video_id, prefix=first_bytes) +          self.report_extraction(video_id)          # Is it an RSS feed? @@ -647,9 +708,9 @@ class GenericIE(InfoExtractor):              r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')          # Helper method -        def _playlist_from_matches(matches, getter, ie=None): +        def _playlist_from_matches(matches, getter=None, ie=None):              urlrs = orderedSet( -                self.url_result(self._proto_relative_url(getter(m)), ie) +                self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)                  for m in matches)              return self.playlist_result(                  urlrs, playlist_id=video_id, playlist_title=video_title) @@ -702,6 +763,12 @@ class GenericIE(InfoExtractor):              return _playlist_from_matches(                  matches, lambda m: unescapeHTML(m[1])) +        # Look for lazyYT YouTube embed +        matches = re.findall( +            r'class="lazyYT" data-youtube-id="([^"]+)"', webpage) +        if matches: +            return _playlist_from_matches(matches, lambda m: unescapeHTML(m)) +          # Look for embedded Dailymotion player          matches = re.findall(              r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage) @@ -733,7 +800,7 @@ class GenericIE(InfoExtractor):                  'title': video_title,                  'id': video_id,              } -             +          match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)          if match:              return { @@ -748,7 +815,7 @@ class GenericIE(InfoExtractor):          # Look for embedded blip.tv player          mobj = re.search(r'<meta\s[^>]*https?://api\.blip\.tv/\w+/redirect/\w+/(\d+)', webpage)          if mobj: -            return self.url_result('http://blip.tv/a/a-'+mobj.group(1), 'BlipTV') +            return self.url_result('http://blip.tv/a/a-' + mobj.group(1), 'BlipTV')          mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*(https?://(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)[a-zA-Z0-9_]+)', webpage)          if mobj:              return self.url_result(mobj.group(1), 'BlipTV') @@ -784,7 +851,7 @@ class GenericIE(InfoExtractor):          # Look for Ooyala videos          mobj = (re.search(r'player.ooyala.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or -             re.search(r'OO.Player.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage)) +                re.search(r'OO.Player.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage))          if mobj is not None:              return OoyalaIE._build_url_result(mobj.group('ec')) @@ -847,6 +914,11 @@ class GenericIE(InfoExtractor):              return _playlist_from_matches(                  matches, getter=unescapeHTML, ie='FunnyOrDie') +        # Look for BBC iPlayer embed +        matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage) +        if matches: +            return _playlist_from_matches(matches, ie='BBCCoUk') +          # Look for embedded RUTV player          rutv_url = RUTVIE._extract_url(webpage)          if rutv_url: @@ -854,7 +926,7 @@ class GenericIE(InfoExtractor):          # Look for embedded TED player          mobj = re.search( -            r'<iframe[^>]+?src=(["\'])(?P<url>http://embed\.ted\.com/.+?)\1', webpage) +            r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage)          if mobj is not None:              return self.url_result(mobj.group('url'), 'TED') @@ -914,6 +986,13 @@ class GenericIE(InfoExtractor):          if mobj is not None:              return self.url_result(mobj.group('url'), 'SBS') +        # Look for embedded Cinchcast player +        mobj = re.search( +            r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1', +            webpage) +        if mobj is not None: +            return self.url_result(mobj.group('url'), 'Cinchcast') +          mobj = re.search(              r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1',              webpage) @@ -979,7 +1058,7 @@ class GenericIE(InfoExtractor):                  found = filter_video(re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage))          if not found:              # HTML5 video -            found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]*)?\s+src="([^"]+)"', webpage) +            found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']', webpage)          if not found:              found = re.search(                  r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")' @@ -993,7 +1072,7 @@ class GenericIE(InfoExtractor):                      'url': new_url,                  }          if not found: -            raise ExtractorError('Unsupported URL: %s' % url) +            raise UnsupportedError(url)          entries = []          for video_url in found: @@ -1025,4 +1104,3 @@ class GenericIE(InfoExtractor):                  '_type': 'playlist',                  'entries': entries,              } - | 
