diff options
| author | Philipp Hagemeister <phihag@phihag.de> | 2013-12-06 09:15:04 +0100 | 
|---|---|---|
| committer | Philipp Hagemeister <phihag@phihag.de> | 2013-12-06 09:15:04 +0100 | 
| commit | ef4fd848573b601502ba9142d5ce521294024356 (patch) | |
| tree | 8dbccaff38949257f646fc5371dd9c72f2595b68 | |
| parent | 72135030d1235f608a2b5e0ec007ca8e6e19e3b4 (diff) | |
[wistia] Add extractor
| -rw-r--r-- | youtube_dl/YoutubeDL.py | 3 | ||||
| -rw-r--r-- | youtube_dl/extractor/__init__.py | 1 | ||||
| -rw-r--r-- | youtube_dl/extractor/generic.py | 31 | ||||
| -rw-r--r-- | youtube_dl/extractor/wistia.py | 55 | 
4 files changed, 80 insertions, 10 deletions
| diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 8ad7bd1da..07b36a98e 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -488,7 +488,8 @@ class YoutubeDL(object):                  new_result = ie_result.copy()                  for f in ('_type', 'url', 'ext', 'player_url', 'formats',                            'entries', 'urlhandle', 'ie_key', 'duration', -                          'subtitles', 'annotations', 'format'): +                          'subtitles', 'annotations', 'format', +                          'thumbnail', 'thumbnails'):                      if f in new_result:                          del new_result[f]                      if f in embedded_info: diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index a78dcad7f..a7d37d48b 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -178,6 +178,7 @@ from .wat import WatIE  from .websurg import WeBSurgIE  from .weibo import WeiboIE  from .wimp import WimpIE +from .wistia import WistiaIE  from .worldstarhiphop import WorldStarHipHopIE  from .xhamster import XHamsterIE  from .xnxx import XNXXIE diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 10ae06263..216e03218 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -169,8 +169,13 @@ class GenericIE(InfoExtractor):          #   Site Name | Video Title          #   Video Title - Tagline | Site Name          # and so on and so forth; it's just not practical -        video_title = self._html_search_regex(r'<title>(.*)</title>', -            webpage, u'video title', default=u'video', flags=re.DOTALL) +        video_title = self._html_search_regex( +            r'(?s)<title>(.*?)</title>', webpage, u'video title', +            default=u'video') + +        # video uploader is domain name +        video_uploader = self._search_regex( +            r'^(?:https?://)?([^/]*)/.*', url, u'video uploader')          # Look for BrightCove:          bc_url = BrightcoveIE._extract_brightcove_url(webpage) @@ -188,7 +193,7 @@ class GenericIE(InfoExtractor):          # Look for embedded YouTube player          matches = re.findall( -            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?youtube.com/embed/.+?)\1', webpage) +            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?youtube\.com/embed/.+?)\1', webpage)          if matches:              urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Youtube')                       for tuppl in matches] @@ -197,13 +202,26 @@ class GenericIE(InfoExtractor):          # Look for embedded Dailymotion player          matches = re.findall( -            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion.com/embed/video/.+?)\1', webpage) +            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage)          if matches:              urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Dailymotion')                       for tuppl in matches]              return self.playlist_result(                  urlrs, playlist_id=video_id, playlist_title=video_title) +        # Look for embedded Wistia player +        match = re.search( +            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage) +        if match: +            return { +                '_type': 'url_transparent', +                'url': unescapeHTML(match.group('url')), +                'ie_key': 'Wistia', +                'uploader': video_uploader, +                'title': video_title, +                'id': video_id, +            } +          # Look for Bandcamp pages with custom domain          mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)          if mobj is not None: @@ -247,14 +265,9 @@ class GenericIE(InfoExtractor):          # here's a fun little line of code for you:          video_id = os.path.splitext(video_id)[0] -        # video uploader is domain name -        video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*', -            url, u'video uploader') -          return {              'id':       video_id,              'url':      video_url,              'uploader': video_uploader, -            'upload_date':  None,              'title':    video_title,          } diff --git a/youtube_dl/extractor/wistia.py b/youtube_dl/extractor/wistia.py new file mode 100644 index 000000000..e1748c261 --- /dev/null +++ b/youtube_dl/extractor/wistia.py @@ -0,0 +1,55 @@ +import json +import re + +from .common import InfoExtractor + + +class WistiaIE(InfoExtractor): +    _VALID_URL = r'^https?://(?:fast\.)?wistia\.net/embed/iframe/(?P<id>[a-z0-9]+)' + +    _TEST = { +        u"url": u"http://fast.wistia.net/embed/iframe/sh7fpupwlt", +        u"file": u"sh7fpupwlt.mov", +        u"md5": u"cafeb56ec0c53c18c97405eecb3133df", +        u"info_dict": { +            u"title": u"cfh_resourceful_zdkh_final_1" +        }, +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') + +        webpage = self._download_webpage(url, video_id) +        data_json = self._html_search_regex( +            r'Wistia.iframeInit\((.*?), {}\);', webpage, u'video data') + +        data = json.loads(data_json) + +        formats = [] +        thumbnails = [] +        for atype, a in data['assets'].items(): +            if atype == 'still': +                thumbnails.append({ +                    'url': a['url'], +                    'resolution': '%dx%d' % (a['width'], a['height']), +                }) +                continue +            if atype == 'preview': +                continue +            formats.append({ +                'format_id': atype, +                'url': a['url'], +                'width': a['width'], +                'height': a['height'], +                'filesize': a['size'], +                'ext': a['ext'], +            }) +        formats.sort(key=lambda a: a['filesize']) + +        return { +            'id': video_id, +            'title': data['name'], +            'formats': formats, +            'thumbnails': thumbnails, +        } | 
