diff options
Diffstat (limited to 'youtube_dl/extractor/generic.py')
| -rw-r--r-- | youtube_dl/extractor/generic.py | 95 | 
1 files changed, 58 insertions, 37 deletions
| diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 742bc2856..9b6498894 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -28,6 +28,7 @@ from .brightcove import BrightcoveIE  from .ooyala import OoyalaIE  from .rutv import RUTVIE  from .smotri import SmotriIE +from .condenast import CondeNastIE  class GenericIE(InfoExtractor): @@ -225,21 +226,6 @@ class GenericIE(InfoExtractor):                  'skip_download': 'Requires rtmpdump'              }          }, -        # smotri embed -        { -            'url': 'http://rbctv.rbc.ru/archive/news/562949990879132.shtml', -            'md5': 'ec40048448e9284c9a1de77bb188108b', -            'info_dict': { -                'id': 'v27008541fad', -                'ext': 'mp4', -                'title': 'Крым и Севастополь вошли в состав России', -                'description': 'md5:fae01b61f68984c7bd2fa741e11c3175', -                'duration': 900, -                'upload_date': '20140318', -                'uploader': 'rbctv_2012_4', -                'uploader_id': 'rbctv_2012_4', -            }, -        },          # Condé Nast embed          {              'url': 'http://www.wired.com/2014/04/honda-asimo/', @@ -394,6 +380,17 @@ class GenericIE(InfoExtractor):                  'uploader': 'education-portal.com',              },          }, +        { +            'url': 'http://thoughtworks.wistia.com/medias/uxjb0lwrcz', +            'md5': 'baf49c2baa8a7de5f3fc145a8506dcd4', +            'info_dict': { +                'id': 'uxjb0lwrcz', +                'ext': 'mp4', +                'title': 'Conversation about Hexagonal Rails Part 1 - ThoughtWorks', +                'duration': 1715.0, +                'uploader': 'thoughtworks.wistia.com', +            },    +        },      ]      def report_following_redirect(self, new_url): @@ -490,7 +487,8 @@ class GenericIE(InfoExtractor):                       'Set --default-search "ytsearch" (or run  youtube-dl "ytsearch:%s" ) to search YouTube'                      ) % (url, url), expected=True)              else: -                assert ':' in default_search +                if ':' not in default_search: +                    default_search += ':'                  return self.url_result(default_search + url)          url, smuggled_data = unsmuggle_url(url) @@ -623,13 +621,13 @@ class GenericIE(InfoExtractor):          if mobj:              player_url = unescapeHTML(mobj.group('url'))              surl = smuggle_url(player_url, {'Referer': url}) -            return self.url_result(surl, 'Vimeo') +            return self.url_result(surl)          # Look for embedded (swf embed) Vimeo player          mobj = re.search( -            r'<embed[^>]+?src="(https?://(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage) +            r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage)          if mobj: -            return self.url_result(mobj.group(1), 'Vimeo') +            return self.url_result(mobj.group(1))          # Look for embedded YouTube player          matches = re.findall(r'''(?x) @@ -654,19 +652,32 @@ class GenericIE(InfoExtractor):              return _playlist_from_matches(                  matches, lambda m: unescapeHTML(m[1])) +        # Look for embedded Dailymotion playlist player (#3822) +        m = re.search( +            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', webpage) +        if m: +            playlists = re.findall( +                r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url'))) +            if playlists: +                return _playlist_from_matches( +                    playlists, lambda p: '//dailymotion.com/playlist/%s' % p) +          # Look for embedded Wistia player          match = re.search( -            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage) +            r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)          if match: +            embed_url = self._proto_relative_url( +                unescapeHTML(match.group('url')))              return {                  '_type': 'url_transparent', -                'url': unescapeHTML(match.group('url')), +                'url': embed_url,                  'ie_key': 'Wistia',                  'uploader': video_uploader,                  'title': video_title,                  'id': video_id,              } -        match = re.search(r'(?:id=["\']wistia_|data-wistiaid=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage) +             +        match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)          if match:              return {                  '_type': 'url_transparent', @@ -852,47 +863,57 @@ class GenericIE(InfoExtractor):          if mobj is not None:              return self.url_result(mobj.group('url'), 'MLB') +        mobj = re.search( +            r'<iframe[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL, +            webpage) +        if mobj is not None: +            return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast') + +        def check_video(vurl): +            vpath = compat_urlparse.urlparse(vurl).path +            vext = determine_ext(vpath) +            return '.' in vpath and vext not in ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml') + +        def filter_video(urls): +            return list(filter(check_video, urls)) +          # Start with something easy: JW Player in SWFObject -        found = re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage) +        found = filter_video(re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage))          if not found:              # Look for gorilla-vid style embedding -            found = re.findall(r'''(?sx) +            found = filter_video(re.findall(r'''(?sx)                  (?:                      jw_plugins|                      JWPlayerOptions|                      jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup                  ) -                .*?file\s*:\s*["\'](.*?)["\']''', webpage) +                .*?file\s*:\s*["\'](.*?)["\']''', webpage))          if not found:              # Broaden the search a little bit -            found = re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage) +            found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage))          if not found:              # Broaden the findall a little bit: JWPlayer JS loader -            found = re.findall(r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage) +            found = filter_video(re.findall( +                r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage))          if not found:              # Flow player -            found = re.findall(r'''(?xs) +            found = filter_video(re.findall(r'''(?xs)                  flowplayer\("[^"]+",\s*                      \{[^}]+?\}\s*,                      \s*{[^}]+? ["']?clip["']?\s*:\s*\{\s*                          ["']?url["']?\s*:\s*["']([^"']+)["'] -            ''', webpage) +            ''', webpage))          if not found:              # Try to find twitter cards info -            found = re.findall(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage) +            found = filter_video(re.findall( +                r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage))          if not found:              # We look for Open Graph info:              # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)              m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)              # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:              if m_video_type is not None: -                def check_video(vurl): -                    vpath = compat_urlparse.urlparse(vurl).path -                    vext = determine_ext(vpath) -                    return '.' in vpath and vext not in ('swf', 'png', 'jpg') -                found = list(filter( -                    check_video, -                    re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage))) +                found = filter_video(re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage))          if not found:              # HTML5 video              found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]+)? src="([^"]+)"', webpage) | 
