about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Philipp Hagemeister <phihag@phihag.de> 2013-12-06 09:15:04 +0100
committer Philipp Hagemeister <phihag@phihag.de> 2013-12-06 09:15:04 +0100
commit ef4fd848573b601502ba9142d5ce521294024356 (patch)
tree 8dbccaff38949257f646fc5371dd9c72f2595b68
parent 72135030d1235f608a2b5e0ec007ca8e6e19e3b4 (diff)
download youtube-dl-ef4fd848573b601502ba9142d5ce521294024356.tar.xz
[wistia] Add extractor
-rw-r--r-- youtube_dl/YoutubeDL.py 3
-rw-r--r-- youtube_dl/extractor/__init__.py 1
-rw-r--r-- youtube_dl/extractor/generic.py 31
-rw-r--r-- youtube_dl/extractor/wistia.py 55
4 files changed, 80 insertions, 10 deletions
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py
index 8ad7bd1da..07b36a98e 100644
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -488,7 +488,8 @@ class YoutubeDL(object):
new_result = ie_result.copy()
for f in ('_type', 'url', 'ext', 'player_url', 'formats',
'entries', 'urlhandle', 'ie_key', 'duration',
- 'subtitles', 'annotations', 'format'):
+ 'subtitles', 'annotations', 'format',
+ 'thumbnail', 'thumbnails'):
if f in new_result:
del new_result[f]
if f in embedded_info:
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index a78dcad7f..a7d37d48b 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -178,6 +178,7 @@ from .wat import WatIE
from .websurg import WeBSurgIE
from .weibo import WeiboIE
from .wimp import WimpIE
+from .wistia import WistiaIE
from .worldstarhiphop import WorldStarHipHopIE
from .xhamster import XHamsterIE
from .xnxx import XNXXIE
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 10ae06263..216e03218 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -169,8 +169,13 @@ class GenericIE(InfoExtractor):
# Site Name | Video Title
# Video Title - Tagline | Site Name
# and so on and so forth; it's just not practical
- video_title = self._html_search_regex(r'<title>(.*)</title>',
- webpage, u'video title', default=u'video', flags=re.DOTALL)
+ video_title = self._html_search_regex(
+ r'(?s)<title>(.*?)</title>', webpage, u'video title',
+ default=u'video')
+
+ # video uploader is domain name
+ video_uploader = self._search_regex(
+ r'^(?:https?://)?([^/]*)/.*', url, u'video uploader')
# Look for BrightCove:
bc_url = BrightcoveIE._extract_brightcove_url(webpage)
@@ -188,7 +193,7 @@ class GenericIE(InfoExtractor):
# Look for embedded YouTube player
matches = re.findall(
- r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?youtube.com/embed/.+?)\1', webpage)
+ r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?youtube\.com/embed/.+?)\1', webpage)
if matches:
urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Youtube')
for tuppl in matches]
@@ -197,13 +202,26 @@ class GenericIE(InfoExtractor):
# Look for embedded Dailymotion player
matches = re.findall(
- r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion.com/embed/video/.+?)\1', webpage)
+ r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage)
if matches:
urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Dailymotion')
for tuppl in matches]
return self.playlist_result(
urlrs, playlist_id=video_id, playlist_title=video_title)
+ # Look for embedded Wistia player
+ match = re.search(
+ r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
+ if match:
+ return {
+ '_type': 'url_transparent',
+ 'url': unescapeHTML(match.group('url')),
+ 'ie_key': 'Wistia',
+ 'uploader': video_uploader,
+ 'title': video_title,
+ 'id': video_id,
+ }
+
# Look for Bandcamp pages with custom domain
mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
if mobj is not None:
@@ -247,14 +265,9 @@ class GenericIE(InfoExtractor):
# here's a fun little line of code for you:
video_id = os.path.splitext(video_id)[0]
- # video uploader is domain name
- video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
- url, u'video uploader')
-
return {
'id': video_id,
'url': video_url,
'uploader': video_uploader,
- 'upload_date': None,
'title': video_title,
}
diff --git a/youtube_dl/extractor/wistia.py b/youtube_dl/extractor/wistia.py
new file mode 100644
index 000000000..e1748c261
--- /dev/null
+++ b/youtube_dl/extractor/wistia.py
@@ -0,0 +1,55 @@
+import json
+import re
+
+from .common import InfoExtractor
+
+
+class WistiaIE(InfoExtractor):
+ _VALID_URL = r'^https?://(?:fast\.)?wistia\.net/embed/iframe/(?P<id>[a-z0-9]+)'
+
+ _TEST = {
+ u"url": u"http://fast.wistia.net/embed/iframe/sh7fpupwlt",
+ u"file": u"sh7fpupwlt.mov",
+ u"md5": u"cafeb56ec0c53c18c97405eecb3133df",
+ u"info_dict": {
+ u"title": u"cfh_resourceful_zdkh_final_1"
+ },
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ webpage = self._download_webpage(url, video_id)
+ data_json = self._html_search_regex(
+ r'Wistia.iframeInit\((.*?), {}\);', webpage, u'video data')
+
+ data = json.loads(data_json)
+
+ formats = []
+ thumbnails = []
+ for atype, a in data['assets'].items():
+ if atype == 'still':
+ thumbnails.append({
+ 'url': a['url'],
+ 'resolution': '%dx%d' % (a['width'], a['height']),
+ })
+ continue
+ if atype == 'preview':
+ continue
+ formats.append({
+ 'format_id': atype,
+ 'url': a['url'],
+ 'width': a['width'],
+ 'height': a['height'],
+ 'filesize': a['size'],
+ 'ext': a['ext'],
+ })
+ formats.sort(key=lambda a: a['filesize'])
+
+ return {
+ 'id': video_id,
+ 'title': data['name'],
+ 'formats': formats,
+ 'thumbnails': thumbnails,
+ }