diff options
| -rw-r--r-- | youtube_dl/downloader/__init__.py | 5 |
| -rw-r--r-- | youtube_dl/extractor/__init__.py | 1 |
| -rw-r--r-- | youtube_dl/extractor/common.py | 2 |
| -rw-r--r-- | youtube_dl/extractor/generic.py | 8 |
| -rw-r--r-- | youtube_dl/extractor/huffpost.py | 70 |
5 files changed, 83 insertions, 3 deletions
diff --git a/youtube_dl/downloader/__init__.py b/youtube_dl/downloader/__init__.py index 0d9eb0001..aaa92bc75 100644 --- a/youtube_dl/downloader/__init__.py +++ b/youtube_dl/downloader/__init__.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals +  from .common import FileDownloader  from .hls import HlsFD  from .http import HttpFD @@ -12,10 +14,11 @@ from ..utils import (  def get_suitable_downloader(info_dict):      """Get the downloader class that can handle the info dict."""      url = info_dict['url'] +    protocol = info_dict.get('protocol')      if url.startswith('rtmp'):          return RtmpFD -    if determine_ext(url) == u'm3u8': +    if (protocol == 'm3u8') or (protocol is None and determine_ext(url) == 'm3u8'):          return HlsFD      if url.startswith('mms') or url.startswith('rtsp'):          return MplayerFD diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 8daf995b9..5de90d6d9 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -83,6 +83,7 @@ from .googlesearch import GoogleSearchIE  from .hark import HarkIE  from .hotnewhiphop import HotNewHipHopIE  from .howcast import HowcastIE +from .huffpost import HuffPostIE  from .hypem import HypemIE  from .ign import IGNIE, OneUPIE  from .imdb import ( diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 3cf742a3b..db1ca9edb 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -71,7 +71,7 @@ class InfoExtractor(object):                      * player_url SWF Player URL (used for rtmpdump).                      * protocol   The protocol that will be used for the actual                                   download, lower-case. -                                 "http", "https", "rtsp", "rtmp" or so. +                                 "http", "https", "rtsp", "rtmp", "m3u8" or so.                      * preference Order number of this format. 
If this field is                                   present and not None, the formats get sorted                                   by this field. diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index e1933837d..829e5894f 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -332,10 +332,16 @@ class GenericIE(InfoExtractor):          # Look for embedded Facebook player          mobj = re.search( -            r'<iframe[^>]+?src=(["\'])(?P<url>https://www.facebook.com/video/embed.+?)\1', webpage) +            r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage)          if mobj is not None:              return self.url_result(mobj.group('url'), 'Facebook') +        # Look for embedded Huffington Post player +        mobj = re.search( +            r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live.huffingtonpost\.com/.+?)\1', webpage) +        if mobj is not None: +            return self.url_result(mobj.group('url'), 'HuffPost') +          # Start with something easy: JW Player in SWFObject          mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)          if mobj is None: diff --git a/youtube_dl/extractor/huffpost.py b/youtube_dl/extractor/huffpost.py new file mode 100644 index 000000000..b47114ab4 --- /dev/null +++ b/youtube_dl/extractor/huffpost.py @@ -0,0 +1,70 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( +    parse_duration, +    unified_strdate, +) + + +class HuffPostIE(InfoExtractor): +    IE_DESC = 'Huffington Post' +    _VALID_URL = r'''(?x) +        https?://(embed\.)?live\.huffingtonpost\.com/ +        (?: +            r/segment/[^/]+/| +            HPLEmbedPlayer/\?segmentId= +        ) +        (?P<id>[0-9a-f]+)''' + +    _TEST = { +        'url': 'http://live.huffingtonpost.com/r/segment/legalese-it/52dd3e4b02a7602131000677', +        'file': 
'52dd3e4b02a7602131000677.mp4', +        'md5': 'TODO', +        'info_dict': { +            'title': 'TODO', +            'description': 'TODO', +            'duration': 1549, +        } +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') + +        api_url = 'http://embed.live.huffingtonpost.com/api/segments/%s.json' % video_id +        data = self._download_json(api_url, video_id)['data'] + +        video_title = data['title'] +        duration = parse_duration(data['running_time']) +        upload_date = unified_strdate(data['schedule']['started_at']) + +        thumbnails = [] +        for url in data['images'].values(): +            m = re.match('.*-([0-9]+x[0-9]+)\.', url) +            if not m: +                continue +            thumbnails.append({ +                'url': url, +                'resolution': m.group(1), +            }) + +        formats = [{ +            'format': key, +            'format_id': key.replace('/', '.'), +            'ext': 'mp4', +            'url': url, +            'vcodec': 'none' if key.startswith('audio/') else None, +        } for key, url in data['sources']['live'].items()] +        self._sort_formats(formats) + +        return { +            'id': video_id, +            'title': video_title, +            'formats': formats, +            'duration': duration, +            'upload_date': upload_date, +            'thumbnails': thumbnails, +        }  | 
