diff options
| author | Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com> | 2013-11-15 12:24:54 +0100 | 
|---|---|---|
| committer | Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com> | 2013-11-15 12:24:54 +0100 | 
| commit | ab2d524780736249c8988313db021e83642c24d1 (patch) | |
| tree | 162e3fb13da0769922631e9c4a9bbce656d784b8 | |
| parent | 85d61685f15bdc62709c699e849af512db78089f (diff) | |
Improve the OpenGraph regex
* Do not accept '>' between the property and content attributes.
* Use two regexes so that properties are also recognized when the content attribute comes before the property attribute (fixes the extraction of the description for SlideshareIE).
| -rw-r--r-- | youtube_dl/extractor/common.py | 14 | 
1 file changed, 9 insertions, 5 deletions
```diff
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 9c20d30b4..e02176852 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -315,13 +315,17 @@ class InfoExtractor(object):
 
     # Helper functions for extracting OpenGraph info
     @staticmethod
-    def _og_regex(prop):
-        return r'<meta.+?property=[\'"]og:%s[\'"].+?content=(?:"(.+?)"|\'(.+?)\')' % re.escape(prop)
+    def _og_regexes(prop):
+        esc_prop = re.escape(prop)
+        return [
+            r'<meta[^>]+?property=[\'"]og:%s[\'"][^>]+?content=(?:"(.+?)"|\'(.+?)\')' % esc_prop,
+            r'<meta[^>]+?content=(?:"(.+?)"|\'(.+?)\')[^>]+?property=[\'"]og:%s[\'"]' % esc_prop,
+        ]
 
     def _og_search_property(self, prop, html, name=None, **kargs):
         if name is None:
             name = 'OpenGraph %s' % prop
-        escaped = self._search_regex(self._og_regex(prop), html, name, flags=re.DOTALL, **kargs)
+        escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
         if escaped is None:
             return None
         return unescapeHTML(escaped)
@@ -336,8 +340,8 @@ class InfoExtractor(object):
         return self._og_search_property('title', html, **kargs)
 
     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
-        regexes = [self._og_regex('video')]
-        if secure: regexes.insert(0, self._og_regex('video:secure_url'))
+        regexes = self._og_regexes('video')
+        if secure: regexes = self._og_regexes('video:secure_url') + regexes
         return self._html_search_regex(regexes, html, name, **kargs)
 
     def _rta_search(self, html):
```
