diff options
| -rw-r--r-- | youtube_dl/extractor/common.py | 24 | ||||
| -rw-r--r-- | youtube_dl/extractor/cspan.py | 4 | ||||
| -rw-r--r-- | youtube_dl/extractor/dailymotion.py | 5 | ||||
| -rw-r--r-- | youtube_dl/extractor/ehow.py | 11 | ||||
| -rw-r--r-- | youtube_dl/extractor/escapist.py | 8 | ||||
| -rw-r--r-- | youtube_dl/extractor/flickr.py | 15 | ||||
| -rw-r--r-- | youtube_dl/extractor/funnyordie.py | 5 | ||||
| -rw-r--r-- | youtube_dl/extractor/hotnewhiphop.py | 8 | ||||
| -rw-r--r-- | youtube_dl/extractor/instagram.py | 10 | ||||
| -rw-r--r-- | youtube_dl/extractor/keek.py | 3 | ||||
| -rw-r--r-- | youtube_dl/extractor/liveleak.py | 6 | ||||
| -rw-r--r-- | youtube_dl/extractor/nba.py | 3 | ||||
| -rw-r--r-- | youtube_dl/extractor/statigram.py | 10 | ||||
| -rw-r--r-- | youtube_dl/extractor/teamcoco.py | 15 | ||||
| -rw-r--r-- | youtube_dl/extractor/traileraddict.py | 9 | ||||
| -rw-r--r-- | youtube_dl/extractor/tutv.py | 4 | ||||
| -rw-r--r-- | youtube_dl/extractor/vine.py | 10 | 
17 files changed, 54 insertions, 96 deletions
| diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 1bd5538ca..05b243871 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -257,6 +257,30 @@ class InfoExtractor(object):          return (username, password) +    # Helper functions for extracting OpenGraph info +    @staticmethod +    def _og_regex(property): +        return r'<meta.+?property=[\'"]og:%s[\'"].+?content=(?:"(.+?)"|\'(.+?)\')' % property + +    def _og_search_property(self, property, html, name=None, **kargs): +        if name is None: +            name = 'OpenGraph %s' % property +        return self._html_search_regex(self._og_regex(property), html, name, flags=re.DOTALL, **kargs) + +    def _og_search_thumbnail(self, html, **kargs): +        return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs) + +    def _og_search_description(self, html, **kargs): +        return self._og_search_property('description', html, fatal=False, **kargs) + +    def _og_search_title(self, html, **kargs): +        return self._og_search_property('title', html, **kargs) + +    def _og_search_video_url(self, html, name='video url', **kargs): +        return self._html_search_regex([self._og_regex('video:secure_url'), +                                        self._og_regex('video')], +                                       html, name, **kargs) +  class SearchInfoExtractor(InfoExtractor):      """      Base class for paged search queries extractors. diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py index a4853279b..7bf03c584 100644 --- a/youtube_dl/extractor/cspan.py +++ b/youtube_dl/extractor/cspan.py @@ -34,8 +34,6 @@ class CSpanIE(InfoExtractor):          description = self._html_search_regex(r'<meta (?:property="og:|name=")description" content="(.*?)"',                                                webpage, 'description',                                                flags=re.MULTILINE|re.DOTALL) -        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.*?)"', -                                            webpage, 'thumbnail')          url = self._search_regex(r'<string name="URL">(.*?)</string>',                                   video_info, 'video url') @@ -49,5 +47,5 @@ class CSpanIE(InfoExtractor):                  'url': url,                  'play_path': path,                  'description': description, -                'thumbnail': thumbnail, +                'thumbnail': self._og_search_thumbnail(webpage),                  } diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 5fd2221a7..9bf7a28ca 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -39,9 +39,6 @@ class DailymotionIE(InfoExtractor):          # Extract URL, uploader and title from webpage          self.report_extraction(video_id) -        video_title = self._html_search_regex(r'<meta property="og:title" content="(.*?)" />', -                                              webpage, 'title') -          video_uploader = self._search_regex([r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>',                                               # Looking for official user                                               r'<(?:span|a) .*?rel="author".*?>([^<]+?)</'], @@ -76,7 +73,7 @@ class DailymotionIE(InfoExtractor):              'url':      video_url,              'uploader': video_uploader,              'upload_date':  video_upload_date, -            'title':    video_title, +            'title':    self._og_search_title(webpage),              'ext':      video_extension,              'thumbnail': info['thumbnail_url']          }] diff --git a/youtube_dl/extractor/ehow.py b/youtube_dl/extractor/ehow.py index 1f0b3888e..2bb77aec6 100644 --- a/youtube_dl/extractor/ehow.py +++ b/youtube_dl/extractor/ehow.py @@ -28,14 +28,9 @@ class EHowIE(InfoExtractor):          video_url = self._search_regex(r'(?:file|source)=(http[^\'"&]*)',              webpage, u'video URL')          final_url = compat_urllib_parse.unquote(video_url)         -        thumbnail_url = self._search_regex(r'<meta property="og:image" content="(.+?)" />', -            webpage, u'thumbnail URL')          uploader = self._search_regex(r'<meta name="uploader" content="(.+?)" />',              webpage, u'uploader') -        title = self._search_regex(r'<meta property="og:title" content="(.+?)" />', -            webpage, u'Video title').replace(' | eHow', '') -        description = self._search_regex(r'<meta property="og:description" content="(.+?)" />', -            webpage, u'video description') +        title = self._og_search_title(webpage).replace(' | eHow', '')          ext = determine_ext(final_url)          return { @@ -44,8 +39,8 @@ class EHowIE(InfoExtractor):              'url':         final_url,              'ext':         ext,              'title':       title, -            'thumbnail':   thumbnail_url, -            'description': description, +            'thumbnail':   self._og_search_thumbnail(webpage), +            'description': self._og_search_description(webpage),              'uploader':    uploader,          } diff --git a/youtube_dl/extractor/escapist.py b/youtube_dl/extractor/escapist.py index 794460e84..3aa2da52c 100644 --- a/youtube_dl/extractor/escapist.py +++ b/youtube_dl/extractor/escapist.py @@ -36,11 +36,7 @@ class EscapistIE(InfoExtractor):          videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',              webpage, u'description', fatal=False) -        imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"', -            webpage, u'thumbnail', fatal=False) - -        playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"', -            webpage, u'player url') +        playerUrl = self._og_search_video_url(webpage, name='player url')          title = self._html_search_regex('<meta name="title" content="([^"]*)"',              webpage, u'player url').split(' : ')[-1] @@ -70,7 +66,7 @@ class EscapistIE(InfoExtractor):              'upload_date': None,              'title': title,              'ext': 'mp4', -            'thumbnail': imgUrl, +            'thumbnail': self._og_search_thumbnail(webpage),              'description': videoDesc,              'player_url': playerUrl,          } diff --git a/youtube_dl/extractor/flickr.py b/youtube_dl/extractor/flickr.py index bd97bff9a..80d96baf7 100644 --- a/youtube_dl/extractor/flickr.py +++ b/youtube_dl/extractor/flickr.py @@ -47,21 +47,12 @@ class FlickrIE(InfoExtractor):              raise ExtractorError(u'Unable to extract video url')          video_url = mobj.group(1) + unescapeHTML(mobj.group(2)) -        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')', -            webpage, u'video title') - -        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')', -            webpage, u'description', fatal=False) - -        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')', -            webpage, u'thumbnail', fatal=False) -          return [{              'id':          video_id,              'url':         video_url,              'ext':         'mp4', -            'title':       video_title, -            'description': video_description, -            'thumbnail':   thumbnail, +            'title':       self._og_search_title(webpage), +            'description': self._og_search_description(webpage), +            'thumbnail':   self._og_search_thumbnail(webpage),              'uploader_id': video_uploader_id,          }] diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py index 388aacf2f..67a7e5f76 100644 --- a/youtube_dl/extractor/funnyordie.py +++ b/youtube_dl/extractor/funnyordie.py @@ -27,14 +27,11 @@ class FunnyOrDieIE(InfoExtractor):          title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",              r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL) -        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"', -            webpage, u'description', fatal=False, flags=re.DOTALL) -          info = {              'id': video_id,              'url': video_url,              'ext': 'mp4',              'title': title, -            'description': video_description, +            'description': self._og_search_description(webpage),          }          return [info] diff --git a/youtube_dl/extractor/hotnewhiphop.py b/youtube_dl/extractor/hotnewhiphop.py index ca3abb7d7..ccca1d7e0 100644 --- a/youtube_dl/extractor/hotnewhiphop.py +++ b/youtube_dl/extractor/hotnewhiphop.py @@ -33,16 +33,12 @@ class HotNewHipHopIE(InfoExtractor):          video_title = self._html_search_regex(r"<title>(.*)</title>",              webpage_src, u'title') -         -        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video. -        thumbnail = self._html_search_regex(r'"og:image" content="(.*)"', -            webpage_src, u'thumbnail', fatal=False)          results = [{                      'id': video_id,                      'url' : video_url,                      'title' : video_title, -                    'thumbnail' : thumbnail, +                    'thumbnail' : self._og_search_thumbnail(webpage_src),                      'ext' : 'mp3',                      }] -        return results
\ No newline at end of file +        return results diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index 6ae704efd..1ffadf67f 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -18,12 +18,6 @@ class InstagramIE(InfoExtractor):          mobj = re.match(self._VALID_URL, url)          video_id = mobj.group(1)          webpage = self._download_webpage(url, video_id) -        video_url = self._html_search_regex( -            r'<meta property="og:video" content="(.+?)"', -            webpage, u'video URL') -        thumbnail_url = self._html_search_regex( -            r'<meta property="og:image" content="(.+?)" />', -            webpage, u'thumbnail URL', fatal=False)          html_title = self._html_search_regex(              r'<title>(.+?)</title>',              webpage, u'title', flags=re.DOTALL) @@ -34,9 +28,9 @@ class InstagramIE(InfoExtractor):          return [{              'id':        video_id, -            'url':       video_url, +            'url':       self._og_search_video_url(webpage),              'ext':       ext,              'title':     title, -            'thumbnail': thumbnail_url, +            'thumbnail': self._og_search_thumbnail(webpage),              'uploader_id' : uploader_id          }] diff --git a/youtube_dl/extractor/keek.py b/youtube_dl/extractor/keek.py index 72ad6a3d0..dda78743d 100644 --- a/youtube_dl/extractor/keek.py +++ b/youtube_dl/extractor/keek.py @@ -24,8 +24,7 @@ class KeekIE(InfoExtractor):          thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id          webpage = self._download_webpage(url, video_id) -        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"', -            webpage, u'title') +        video_title = self._og_search_title(webpage)          uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',              webpage, u'uploader', fatal=False) diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py index cf8a2c931..dd062a14e 100644 --- a/youtube_dl/extractor/liveleak.py +++ b/youtube_dl/extractor/liveleak.py @@ -33,11 +33,9 @@ class LiveLeakIE(InfoExtractor):          video_url = self._search_regex(r'file: "(.*?)",',              webpage, u'video URL') -        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"', -            webpage, u'title').replace('LiveLeak.com -', '').strip() +        video_title = self._og_search_title(webpage).replace('LiveLeak.com -', '').strip() -        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"', -            webpage, u'description', fatal=False) +        video_description = self._og_search_description(webpage)          video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',              webpage, u'uploader', fatal=False) diff --git a/youtube_dl/extractor/nba.py b/youtube_dl/extractor/nba.py index 122b7dd26..0f178905b 100644 --- a/youtube_dl/extractor/nba.py +++ b/youtube_dl/extractor/nba.py @@ -30,8 +30,7 @@ class NBAIE(InfoExtractor):          video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'          shortened_video_id = video_id.rpartition('/')[2] -        title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"', -            webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '') +        title = self._og_search_title(webpage, default=shortened_video_id).replace('NBA.com: ', '')          # It isn't there in the HTML it returns to us          # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False) diff --git a/youtube_dl/extractor/statigram.py b/youtube_dl/extractor/statigram.py index ae9a63e8b..b8e6b3bf9 100644 --- a/youtube_dl/extractor/statigram.py +++ b/youtube_dl/extractor/statigram.py @@ -18,12 +18,6 @@ class StatigramIE(InfoExtractor):          mobj = re.match(self._VALID_URL, url)          video_id = mobj.group(1)          webpage = self._download_webpage(url, video_id) -        video_url = self._html_search_regex( -            r'<meta property="og:video:secure_url" content="(.+?)">', -            webpage, u'video URL') -        thumbnail_url = self._html_search_regex( -            r'<meta property="og:image" content="(.+?)" />', -            webpage, u'thumbnail URL', fatal=False)          html_title = self._html_search_regex(              r'<title>(.+?)</title>',              webpage, u'title') @@ -34,9 +28,9 @@ class StatigramIE(InfoExtractor):          return [{              'id':        video_id, -            'url':       video_url, +            'url':       self._og_search_video_url(webpage),              'ext':       ext,              'title':     title, -            'thumbnail': thumbnail_url, +            'thumbnail': self._og_search_thumbnail(webpage),              'uploader_id' : uploader_id          }] diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index 1dd5e1b68..ec92e589a 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -30,15 +30,6 @@ class TeamcocoIE(InfoExtractor):          self.report_extraction(video_id) -        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"', -            webpage, u'title') - -        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"', -            webpage, u'thumbnail', fatal=False) - -        video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"', -            webpage, u'description', fatal=False) -          data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id          data = self._download_webpage(data_url, video_id, 'Downloading data webpage') @@ -49,7 +40,7 @@ class TeamcocoIE(InfoExtractor):              'id':          video_id,              'url':         video_url,              'ext':         'mp4', -            'title':       video_title, -            'thumbnail':   thumbnail, -            'description': video_description, +            'title':       self._og_search_title(webpage), +            'thumbnail':   self._og_search_thumbnail(webpage), +            'description': self._og_search_description(webpage),          }] diff --git a/youtube_dl/extractor/traileraddict.py b/youtube_dl/extractor/traileraddict.py index 9dd26c163..324bb6231 100644 --- a/youtube_dl/extractor/traileraddict.py +++ b/youtube_dl/extractor/traileraddict.py @@ -24,11 +24,8 @@ class TrailerAddictIE(InfoExtractor):                  webpage, 'video title').replace(' - Trailer Addict','')          view_count = self._search_regex(r'Views: (.+?)<br />',                  webpage, 'Views Count') -        description = self._search_regex(r'<meta property="og:description" content="(.+?)" />', -                webpage, 'video description') -        video_id = self._search_regex(r'<meta property="og:video" content="(.+?)" />', -                webpage, 'Video id').split('=')[1] -         +        video_id = self._og_search_property('video', webpage, 'Video id').split('=')[1] +          info_url = "http://www.traileraddict.com/fvar.php?tid=%s" %(str(video_id))          info_webpage = self._download_webpage(info_url, video_id , "Downloading the info webpage") @@ -44,6 +41,6 @@ class TrailerAddictIE(InfoExtractor):              'ext'         : ext,              'title'       : title,              'thumbnail'   : thumbnail_url, -            'description' : description, +            'description' : self._og_search_description(webpage),              'view_count'  : view_count,          }] diff --git a/youtube_dl/extractor/tutv.py b/youtube_dl/extractor/tutv.py index fcaa6ac01..4e404fbf5 100644 --- a/youtube_dl/extractor/tutv.py +++ b/youtube_dl/extractor/tutv.py @@ -22,8 +22,6 @@ class TutvIE(InfoExtractor):          video_id = mobj.group('id')          webpage = self._download_webpage(url, video_id) -        title = self._html_search_regex( -            r'<meta property="og:title" content="(.*?)">', webpage, u'title')          internal_id = self._search_regex(r'codVideo=([0-9]+)', webpage, u'internal video ID')          data_url = u'http://tu.tv/flvurl.php?codVideo=' + str(internal_id) @@ -36,6 +34,6 @@ class TutvIE(InfoExtractor):              'id': internal_id,              'url': video_url,              'ext': ext, -            'title': title, +            'title': self._og_search_title(webpage),          }          return [info] diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py index bdd3522eb..c4ec1f06f 100644 --- a/youtube_dl/extractor/vine.py +++ b/youtube_dl/extractor/vine.py @@ -27,12 +27,6 @@ class VineIE(InfoExtractor):          video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',              webpage, u'video URL') -        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"', -            webpage, u'title') - -        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"', -            webpage, u'thumbnail', fatal=False) -          uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',              webpage, u'uploader', fatal=False, flags=re.DOTALL) @@ -40,7 +34,7 @@ class VineIE(InfoExtractor):              'id':        video_id,              'url':       video_url,              'ext':       'mp4', -            'title':     video_title, -            'thumbnail': thumbnail, +            'title':     self._og_search_title(webpage), +            'thumbnail': self._og_search_thumbnail(webpage),              'uploader':  uploader,          }] | 
