aboutsummaryrefslogtreecommitdiff
path: root/youtube_dl/extractor
diff options
context:
space:
mode:
authorPhilipp Hagemeister <phihag@phihag.de>2014-01-27 05:47:30 +0100
committerPhilipp Hagemeister <phihag@phihag.de>2014-01-27 05:47:38 +0100
commitdb1f388878db8ce2ae6473a5447a5aa6c9ea86f1 (patch)
tree1ecd54e2f4209ad7ed6ddf07617a1d54b535b4b8 /youtube_dl/extractor
parent0f2999fe2b352795d54e6fcc4027e6a64ce5bc1d (diff)
[huffpost] Add support
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r--youtube_dl/extractor/__init__.py1
-rw-r--r--youtube_dl/extractor/common.py2
-rw-r--r--youtube_dl/extractor/generic.py8
-rw-r--r--youtube_dl/extractor/huffpost.py70
4 files changed, 79 insertions, 2 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 8daf995b9..5de90d6d9 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -83,6 +83,7 @@ from .googlesearch import GoogleSearchIE
from .hark import HarkIE
from .hotnewhiphop import HotNewHipHopIE
from .howcast import HowcastIE
+from .huffpost import HuffPostIE
from .hypem import HypemIE
from .ign import IGNIE, OneUPIE
from .imdb import (
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 3cf742a3b..db1ca9edb 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -71,7 +71,7 @@ class InfoExtractor(object):
* player_url SWF Player URL (used for rtmpdump).
* protocol The protocol that will be used for the actual
download, lower-case.
- "http", "https", "rtsp", "rtmp" or so.
+ "http", "https", "rtsp", "rtmp", "m3u8" or so.
* preference Order number of this format. If this field is
present and not None, the formats get sorted
by this field.
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index e1933837d..829e5894f 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -332,10 +332,16 @@ class GenericIE(InfoExtractor):
# Look for embedded Facebook player
mobj = re.search(
- r'<iframe[^>]+?src=(["\'])(?P<url>https://www.facebook.com/video/embed.+?)\1', webpage)
+ r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage)
if mobj is not None:
return self.url_result(mobj.group('url'), 'Facebook')
+ # Look for embedded Huffington Post player
+ mobj = re.search(
+ r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live.huffingtonpost\.com/.+?)\1', webpage)
+ if mobj is not None:
+ return self.url_result(mobj.group('url'), 'HuffPost')
+
# Start with something easy: JW Player in SWFObject
mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
if mobj is None:
diff --git a/youtube_dl/extractor/huffpost.py b/youtube_dl/extractor/huffpost.py
new file mode 100644
index 000000000..b47114ab4
--- /dev/null
+++ b/youtube_dl/extractor/huffpost.py
@@ -0,0 +1,70 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ parse_duration,
+ unified_strdate,
+)
+
+
+class HuffPostIE(InfoExtractor):
+ IE_DESC = 'Huffington Post'
+ _VALID_URL = r'''(?x)
+ https?://(embed\.)?live\.huffingtonpost\.com/
+ (?:
+ r/segment/[^/]+/|
+ HPLEmbedPlayer/\?segmentId=
+ )
+ (?P<id>[0-9a-f]+)'''
+
+ _TEST = {
+ 'url': 'http://live.huffingtonpost.com/r/segment/legalese-it/52dd3e4b02a7602131000677',
+ 'file': '52dd3e4b02a7602131000677.mp4',
+ 'md5': 'TODO',
+ 'info_dict': {
+ 'title': 'TODO',
+ 'description': 'TODO',
+ 'duration': 1549,
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ api_url = 'http://embed.live.huffingtonpost.com/api/segments/%s.json' % video_id
+ data = self._download_json(api_url, video_id)['data']
+
+ video_title = data['title']
+ duration = parse_duration(data['running_time'])
+ upload_date = unified_strdate(data['schedule']['started_at'])
+
+ thumbnails = []
+ for url in data['images'].values():
+ m = re.match('.*-([0-9]+x[0-9]+)\.', url)
+ if not m:
+ continue
+ thumbnails.append({
+ 'url': url,
+ 'resolution': m.group(1),
+ })
+
+ formats = [{
+ 'format': key,
+ 'format_id': key.replace('/', '.'),
+ 'ext': 'mp4',
+ 'url': url,
+ 'vcodec': 'none' if key.startswith('audio/') else None,
+ } for key, url in data['sources']['live'].items()]
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': video_title,
+ 'formats': formats,
+ 'duration': duration,
+ 'upload_date': upload_date,
+ 'thumbnails': thumbnails,
+ }