diff options
author | Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com> | 2014-02-09 14:22:56 +0100 |
---|---|---|
committer | Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com> | 2014-02-09 14:23:19 +0100 |
commit | 1afe753462f0293122dc7a9b534b4f5cdb1e5c4e (patch) | |
tree | 133b44c05d94d1114428c3a98dedea4563343eee /youtube_dl/extractor | |
parent | 524c2c716a55d16b3cf4f2ed344e3a063bb63ce8 (diff) |
[slideshare] Fix description extraction and modernize
The ‘og:description’ property doesn’t contain the full description, so the description is now extracted from the page’s `<p class="description…">` element instead.
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r-- | youtube_dl/extractor/slideshare.py | 23 |
1 files changed, 14 insertions, 9 deletions
diff --git a/youtube_dl/extractor/slideshare.py b/youtube_dl/extractor/slideshare.py index afc3001b5..9c62825cc 100644 --- a/youtube_dl/extractor/slideshare.py +++ b/youtube_dl/extractor/slideshare.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import re import json @@ -12,11 +14,12 @@ class SlideshareIE(InfoExtractor): _VALID_URL = r'https?://www\.slideshare\.net/[^/]+?/(?P<title>.+?)($|\?)' _TEST = { - u'url': u'http://www.slideshare.net/Dataversity/keynote-presentation-managing-scale-and-complexity', - u'file': u'25665706.mp4', - u'info_dict': { - u'title': u'Managing Scale and Complexity', - u'description': u'This was a keynote presentation at the NoSQL Now! 2013 Conference & Expo (http://www.nosqlnow.com). This presentation was given by Adrian Cockcroft from Netflix', + 'url': 'http://www.slideshare.net/Dataversity/keynote-presentation-managing-scale-and-complexity', + 'info_dict': { + 'id': '25665706', + 'ext': 'mp4', + 'title': 'Managing Scale and Complexity', + 'description': 'This was a keynote presentation at the NoSQL Now! 2013 Conference & Expo (http://www.nosqlnow.com). 
This presentation was given by Adrian Cockcroft from Netflix.', }, } @@ -26,15 +29,17 @@ class SlideshareIE(InfoExtractor): webpage = self._download_webpage(url, page_title) slideshare_obj = self._search_regex( r'var slideshare_object = ({.*?}); var user_info =', - webpage, u'slideshare object') + webpage, 'slideshare object') info = json.loads(slideshare_obj) - if info['slideshow']['type'] != u'video': - raise ExtractorError(u'Webpage type is "%s": only video extraction is supported for Slideshare' % info['slideshow']['type'], expected=True) + if info['slideshow']['type'] != 'video': + raise ExtractorError('Webpage type is "%s": only video extraction is supported for Slideshare' % info['slideshow']['type'], expected=True) doc = info['doc'] bucket = info['jsplayer']['video_bucket'] ext = info['jsplayer']['video_extension'] video_url = compat_urlparse.urljoin(bucket, doc + '-SD.' + ext) + description = self._html_search_regex( + r'<p class="description.*?"[^>]*>(.*?)</p>', webpage, 'description') return { '_type': 'video', @@ -43,5 +48,5 @@ class SlideshareIE(InfoExtractor): 'ext': ext, 'url': video_url, 'thumbnail': info['slideshow']['pin_image_url'], - 'description': self._og_search_description(webpage), + 'description': description, } |