diff options
| -rw-r--r-- | AUTHORS | 1 | ||||
| -rw-r--r-- | README.md | 2 | ||||
| -rw-r--r-- | test/test_utils.py | 1 | ||||
| -rw-r--r-- | youtube_dl/extractor/__init__.py | 6 | ||||
| -rw-r--r-- | youtube_dl/extractor/airmozilla.py | 74 | ||||
| -rw-r--r-- | youtube_dl/extractor/escapist.py | 45 | ||||
| -rw-r--r-- | youtube_dl/extractor/generic.py | 4 | ||||
| -rw-r--r-- | youtube_dl/extractor/letv.py | 190 | ||||
| -rw-r--r-- | youtube_dl/extractor/mitele.py | 4 | ||||
| -rw-r--r-- | youtube_dl/extractor/rtlnow.py | 2 | ||||
| -rw-r--r-- | youtube_dl/extractor/telecinco.py | 9 | ||||
| -rw-r--r-- | youtube_dl/utils.py | 7 | ||||
| -rw-r--r-- | youtube_dl/version.py | 2 | 
13 files changed, 326 insertions, 21 deletions
| @@ -112,3 +112,4 @@ Frans de Jonge  Robin de Rooij  Ryan Schmidt  Leslie P. Polzer +Duncan Keall @@ -139,6 +139,8 @@ which means you can modify it, redistribute it or use it however you like.                                       dislike_count <? 50 & description" .      --no-playlist                    If the URL refers to a video and a                                       playlist, download only the video. +    --yes-playlist                   If the URL refers to a video and a +                                     playlist, download the playlist.      --age-limit YEARS                download only videos suitable for the given                                       age      --download-archive FILE          Download only videos not listed in the diff --git a/test/test_utils.py b/test/test_utils.py index 2f8996d7b..3fba8ae11 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -246,6 +246,7 @@ class TestUtil(unittest.TestCase):          self.assertEqual(parse_duration('2.5 hours'), 9000)          self.assertEqual(parse_duration('02:03:04'), 7384)          self.assertEqual(parse_duration('01:02:03:04'), 93784) +        self.assertEqual(parse_duration('1 hour 3 minutes'), 3780)      def test_fix_xml_ampersands(self):          self.assertEqual( diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 40fc92cf7..ddb9d6670 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -8,6 +8,7 @@ from .adobetv import AdobeTVIE  from .adultswim import AdultSwimIE  from .aftenposten import AftenpostenIE  from .aftonbladet import AftonbladetIE +from .airmozilla import AirMozillaIE  from .aljazeera import AlJazeeraIE  from .alphaporno import AlphaPornoIE  from .anitube import AnitubeIE @@ -237,6 +238,11 @@ from .krasview import KrasViewIE  from .ku6 import Ku6IE  from .la7 import LA7IE  from .laola1tv import Laola1TvIE +from .letv import ( +    LetvIE, +    LetvTvIE, +    LetvPlaylistIE +)  from .lifenews import LifeNewsIE  from .liveleak import LiveLeakIE  from .livestream import ( diff --git a/youtube_dl/extractor/airmozilla.py b/youtube_dl/extractor/airmozilla.py new file mode 100644 index 000000000..611ad1e9d --- /dev/null +++ b/youtube_dl/extractor/airmozilla.py @@ -0,0 +1,74 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( +    int_or_none, +    parse_duration, +    parse_iso8601, +) + + +class AirMozillaIE(InfoExtractor): +    _VALID_URL = r'https?://air\.mozilla\.org/(?P<id>[0-9a-z-]+)/?' +    _TEST = { +        'url': 'https://air.mozilla.org/privacy-lab-a-meetup-for-privacy-minded-people-in-san-francisco/', +        'md5': '2e3e7486ba5d180e829d453875b9b8bf', +        'info_dict': { +            'id': '6x4q2w', +            'ext': 'mp4', +            'title': 'Privacy Lab - a meetup for privacy minded people in San Francisco', +            'thumbnail': 're:https://\w+\.cloudfront\.net/6x4q2w/poster\.jpg\?t=\d+', +            'description': 'Brings together privacy professionals and others interested in privacy at for-profits, non-profits, and NGOs in an effort to contribute to the state of the ecosystem...', +            'timestamp': 1422487800, +            'upload_date': '20150128', +            'location': 'SFO Commons', +            'duration': 3780, +            'view_count': int, +            'categories': ['Main'], +        } +    } + +    def _real_extract(self, url): +        display_id = self._match_id(url) +        webpage = self._download_webpage(url, display_id) +        video_id = self._html_search_regex(r'//vid.ly/(.*?)/embed', webpage, 'id') + +        embed_script = self._download_webpage('https://vid.ly/{0}/embed'.format(video_id), video_id) +        jwconfig = self._search_regex(r'\svar jwconfig = (\{.*?\});\s', embed_script, 'metadata') +        metadata = self._parse_json(jwconfig, video_id) + +        formats = [{ +            'url': source['file'], +            'ext': source['type'], +            'format_id': self._search_regex(r'&format=(.*)$', source['file'], 'video format'), +            'format': source['label'], +            'height': int(source['label'].rstrip('p')), +        } for source in metadata['playlist'][0]['sources']] +        self._sort_formats(formats) + +        view_count = int_or_none(self._html_search_regex( +            r'Views since archived: ([0-9]+)', +            webpage, 'view count', fatal=False)) +        timestamp = parse_iso8601(self._html_search_regex( +            r'<time datetime="(.*?)"', webpage, 'timestamp', fatal=False)) +        duration = parse_duration(self._search_regex( +            r'Duration:\s*(\d+\s*hours?\s*\d+\s*minutes?)', +            webpage, 'duration', fatal=False)) + +        return { +            'id': video_id, +            'title': self._og_search_title(webpage), +            'formats': formats, +            'url': self._og_search_url(webpage), +            'display_id': display_id, +            'thumbnail': metadata['playlist'][0].get('image'), +            'description': self._og_search_description(webpage), +            'timestamp': timestamp, +            'location': self._html_search_regex(r'Location: (.*)', webpage, 'location', default=None), +            'duration': duration, +            'view_count': view_count, +            'categories': re.findall(r'<a href=".*?" class="channel">(.*?)</a>', webpage), +        } diff --git a/youtube_dl/extractor/escapist.py b/youtube_dl/extractor/escapist.py index 51ffec7ee..b45c1dbd0 100644 --- a/youtube_dl/extractor/escapist.py +++ b/youtube_dl/extractor/escapist.py @@ -44,14 +44,15 @@ class EscapistIE(InfoExtractor):          config_url = compat_urllib_parse.unquote(self._html_search_regex(              r'''(?x)              (?: -                <param\s+name="flashvars"\s+value="config=| +                <param\s+name="flashvars".*?\s+value="config=|                  flashvars="config=              ) -            ([^"&]+) +            (https?://[^"&]+)              ''',              webpage, 'config URL'))          formats = [] +        ad_formats = []          def _add_format(name, cfgurl, quality):              config = self._download_json( @@ -61,14 +62,19 @@ class EscapistIE(InfoExtractor):                  transform_source=js_to_json)              playlist = config['playlist'] -            video_url = next( -                p['url'] for p in playlist -                if p.get('eventCategory') == 'Video') -            formats.append({ -                'url': video_url, -                'format_id': name, -                'quality': quality, -            }) +            for p in playlist: +                if p.get('eventCategory') == 'Video': +                    ar = formats +                elif p.get('eventCategory') == 'Video Postroll': +                    ar = ad_formats +                else: +                    continue + +                ar.append({ +                    'url': p['url'], +                    'format_id': name, +                    'quality': quality, +                })          _add_format('normal', config_url, quality=0)          hq_url = (config_url + @@ -77,10 +83,9 @@ class EscapistIE(InfoExtractor):              _add_format('hq', hq_url, quality=1)          except ExtractorError:              pass  # That's fine, we'll just use normal quality -          self._sort_formats(formats) -        return { +        res = {              'id': video_id,              'formats': formats,              'uploader': uploader, @@ -89,3 +94,19 @@ class EscapistIE(InfoExtractor):              'thumbnail': self._og_search_thumbnail(webpage),              'description': description,          } + +        if self._downloader.params.get('include_ads') and ad_formats: +            self._sort_formats(ad_formats) +            ad_res = { +                'id': '%s-ad' % video_id, +                'title': '%s (Postroll)' % title, +                'formats': ad_formats, +            } +            return { +                '_type': 'playlist', +                'entries': [res, ad_res], +                'title': title, +                'id': video_id, +            } + +        return res diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 875e1bf05..3aff57e30 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1208,7 +1208,9 @@ class GenericIE(InfoExtractor):              return entries[0]          else:              for num, e in enumerate(entries, start=1): -                e['title'] = '%s (%d)' % (e['title'], num) +                # 'url' results don't have a title +                if e.get('title') is not None: +                    e['title'] = '%s (%d)' % (e['title'], num)              return {                  '_type': 'playlist',                  'entries': entries, diff --git a/youtube_dl/extractor/letv.py b/youtube_dl/extractor/letv.py new file mode 100644 index 000000000..583ce35b9 --- /dev/null +++ b/youtube_dl/extractor/letv.py @@ -0,0 +1,190 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import datetime +import re +import time + +from .common import InfoExtractor +from ..compat import ( +    compat_urlparse, +    compat_urllib_parse, +) +from ..utils import ( +    determine_ext, +    ExtractorError, +    parse_iso8601, +) + + +class LetvIE(InfoExtractor): +    _VALID_URL = r'http://www\.letv\.com/ptv/vplay/(?P<id>\d+).html' + +    _TESTS = [{ +        'url': 'http://www.letv.com/ptv/vplay/22005890.html', +        'md5': 'cab23bd68d5a8db9be31c9a222c1e8df', +        'info_dict': { +            'id': '22005890', +            'ext': 'mp4', +            'title': '第87届奥斯卡颁奖礼完美落幕 《鸟人》成最大赢家', +            'timestamp': 1424747397, +            'upload_date': '20150224', +            'description': 'md5:a9cb175fd753e2962176b7beca21a47c', +        } +    }, { +        'url': 'http://www.letv.com/ptv/vplay/1415246.html', +        'info_dict': { +            'id': '1415246', +            'ext': 'mp4', +            'title': '美人天下01', +            'description': 'md5:f88573d9d7225ada1359eaf0dbf8bcda', +        }, +        'expected_warnings': [ +            'publish time' +        ] +    }] +    # http://www.letv.com/ptv/vplay/1118082.html +    # This video is available only in Mainland China + +    @staticmethod +    def urshift(val, n): +        return val >> n if val >= 0 else (val + 0x100000000) >> n + +    # ror() and calc_time_key() are reversed from a embedded swf file in KLetvPlayer.swf +    def ror(self, param1, param2): +        _loc3_ = 0 +        while _loc3_ < param2: +            param1 = self.urshift(param1, 1) + ((param1 & 1) << 31) +            _loc3_ += 1 +        return param1 + +    def calc_time_key(self, param1): +        _loc2_ = 773625421 +        _loc3_ = self.ror(param1, _loc2_ % 13) +        _loc3_ = _loc3_ ^ _loc2_ +        _loc3_ = self.ror(_loc3_, _loc2_ % 17) +        return _loc3_ + +    def _real_extract(self, url): +        media_id = self._match_id(url) +        page = self._download_webpage(url, media_id) +        params = { +            'id': media_id, +            'platid': 1, +            'splatid': 101, +            'format': 1, +            'tkey': self.calc_time_key(int(time.time())), +            'domain': 'www.letv.com' +        } +        play_json = self._download_json( +            'http://api.letv.com/mms/out/video/playJson?' + compat_urllib_parse.urlencode(params), +            media_id, 'playJson data') + +        # Check for errors +        playstatus = play_json['playstatus'] +        if playstatus['status'] == 0: +            flag = playstatus['flag'] +            if flag == 1: +                msg = 'Country %s auth error' % playstatus['country'] +            else: +                msg = 'Generic error. flag = %d' % flag +            raise ExtractorError(msg, expected=True) + +        playurl = play_json['playurl'] + +        formats = ['350', '1000', '1300', '720p', '1080p'] +        dispatch = playurl['dispatch'] + +        urls = [] +        for format_id in formats: +            if format_id in dispatch: +                media_url = playurl['domain'][0] + dispatch[format_id][0] + +                # Mimic what flvxz.com do +                url_parts = list(compat_urlparse.urlparse(media_url)) +                qs = dict(compat_urlparse.parse_qs(url_parts[4])) +                qs.update({ +                    'platid': '14', +                    'splatid': '1401', +                    'tss': 'no', +                    'retry': 1 +                }) +                url_parts[4] = compat_urllib_parse.urlencode(qs) +                media_url = compat_urlparse.urlunparse(url_parts) + +                url_info_dict = { +                    'url': media_url, +                    'ext': determine_ext(dispatch[format_id][1]) +                } + +                if format_id[-1:] == 'p': +                    url_info_dict['height'] = format_id[:-1] + +                urls.append(url_info_dict) + +        publish_time = parse_iso8601(self._html_search_regex( +            r'发布时间 ([^<>]+) ', page, 'publish time', fatal=False), +            delimiter=' ', timezone=datetime.timedelta(hours=8)) +        description = self._html_search_meta('description', page, fatal=False) + +        return { +            'id': media_id, +            'formats': urls, +            'title': playurl['title'], +            'thumbnail': playurl['pic'], +            'description': description, +            'timestamp': publish_time, +        } + + +class LetvTvIE(InfoExtractor): +    _VALID_URL = r'http://www.letv.com/tv/(?P<id>\d+).html' +    _TESTS = [{ +        'url': 'http://www.letv.com/tv/46177.html', +        'info_dict': { +            'id': '46177', +            'title': '美人天下', +            'description': 'md5:395666ff41b44080396e59570dbac01c' +        }, +        'playlist_count': 35 +    }] + +    def _real_extract(self, url): +        playlist_id = self._match_id(url) +        page = self._download_webpage(url, playlist_id) + +        media_urls = list(set(re.findall( +            r'http://www.letv.com/ptv/vplay/\d+.html', page))) +        entries = [self.url_result(media_url, ie='Letv') +                   for media_url in media_urls] + +        title = self._html_search_meta('keywords', page, +                                       fatal=False).split(',')[0] +        description = self._html_search_meta('description', page, fatal=False) + +        return self.playlist_result(entries, playlist_id, playlist_title=title, +                                    playlist_description=description) + + +class LetvPlaylistIE(LetvTvIE): +    _VALID_URL = r'http://tv.letv.com/[a-z]+/(?P<id>[a-z]+)/index.s?html' +    _TESTS = [{ +        'url': 'http://tv.letv.com/izt/wuzetian/index.html', +        'info_dict': { +            'id': 'wuzetian', +            'title': '武媚娘传奇', +            'description': 'md5:e12499475ab3d50219e5bba00b3cb248' +        }, +        # This playlist contains some extra videos other than the drama itself +        'playlist_mincount': 96 +    }, { +        'url': 'http://tv.letv.com/pzt/lswjzzjc/index.shtml', +        'info_dict': { +            'id': 'lswjzzjc', +            # The title should be "劲舞青春", but I can't find a simple way to +            # determine the playlist title +            'title': '乐视午间自制剧场', +            'description': 'md5:b1eef244f45589a7b5b1af9ff25a4489' +        }, +        'playlist_mincount': 7 +    }] diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py index 256758323..d8897eb90 100644 --- a/youtube_dl/extractor/mitele.py +++ b/youtube_dl/extractor/mitele.py @@ -18,7 +18,7 @@ class MiTeleIE(InfoExtractor):      IE_NAME = 'mitele.es'      _VALID_URL = r'http://www\.mitele\.es/[^/]+/[^/]+/[^/]+/(?P<id>[^/]+)/' -    _TEST = { +    _TESTS = [{          'url': 'http://www.mitele.es/programas-tv/diario-de/la-redaccion/programa-144/',          'md5': '6a75fe9d0d3275bead0cb683c616fddb',          'info_dict': { @@ -29,7 +29,7 @@ class MiTeleIE(InfoExtractor):              'display_id': 'programa-144',              'duration': 2913,          }, -    } +    }]      def _real_extract(self, url):          episode = self._match_id(url) diff --git a/youtube_dl/extractor/rtlnow.py b/youtube_dl/extractor/rtlnow.py index fd93cc66f..785a8045e 100644 --- a/youtube_dl/extractor/rtlnow.py +++ b/youtube_dl/extractor/rtlnow.py @@ -146,7 +146,7 @@ class RTLnowIE(InfoExtractor):                  mobj = re.search(r'.*/(?P<hoster>[^/]+)/videos/(?P<play_path>.+)\.f4m', filename.text)                  if mobj:                      fmt = { -                        'url': 'rtmpe://fmspay-fra2.rtl.de/' + mobj.group('hoster'), +                        'url': 'rtmpe://fms.rtl.de/' + mobj.group('hoster'),                          'play_path': 'mp4:' + mobj.group('play_path'),                          'page_url': url,                          'player_url': video_page_url + 'includes/vodplayer.swf', diff --git a/youtube_dl/extractor/telecinco.py b/youtube_dl/extractor/telecinco.py index be3f72df7..251a68680 100644 --- a/youtube_dl/extractor/telecinco.py +++ b/youtube_dl/extractor/telecinco.py @@ -6,9 +6,9 @@ from .mitele import MiTeleIE  class TelecincoIE(MiTeleIE):      IE_NAME = 'telecinco.es' -    _VALID_URL = r'https?://www\.telecinco\.es/[^/]+/[^/]+/[^/]+/(?P<id>.*?)\.html' +    _VALID_URL = r'https?://www\.telecinco\.es/[^/]+/[^/]+/(?:[^/]+/)?(?P<id>.*?)\.html' -    _TEST = { +    _TESTS = [{          'url': 'http://www.telecinco.es/robinfood/temporada-01/t01xp14/Bacalao-cocochas-pil-pil_0_1876350223.html',          'info_dict': {              'id': 'MDSVID20141015_0058', @@ -16,4 +16,7 @@ class TelecincoIE(MiTeleIE):              'title': 'Con Martín Berasategui, hacer un bacalao al ...',              'duration': 662,          }, -    } +    }, { +        'url': 'http://www.telecinco.es/informativos/nacional/Pablo_Iglesias-Informativos_Telecinco-entrevista-Pedro_Piqueras_2_1945155182.html', +        'only_matching': True, +    }] diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index e2631dccd..1f3bfef7d 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -54,7 +54,7 @@ from .compat import (  compiled_regex_type = type(re.compile(''))  std_headers = { -    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)', +    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',      'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',      'Accept-Encoding': 'gzip, deflate', @@ -1290,6 +1290,7 @@ def parse_duration(s):              (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|              (?P<only_hours>[0-9.]+)\s*(?:hours?)| +            \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*|              (?:                  (?:                      (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)? @@ -1308,10 +1309,14 @@ def parse_duration(s):          return float_or_none(m.group('only_hours'), invscale=60 * 60)      if m.group('secs'):          res += int(m.group('secs')) +    if m.group('mins_reversed'): +        res += int(m.group('mins_reversed')) * 60      if m.group('mins'):          res += int(m.group('mins')) * 60      if m.group('hours'):          res += int(m.group('hours')) * 60 * 60 +    if m.group('hours_reversed'): +        res += int(m.group('hours_reversed')) * 60 * 60      if m.group('days'):          res += int(m.group('days')) * 24 * 60 * 60      if m.group('ms'): diff --git a/youtube_dl/version.py b/youtube_dl/version.py index d23c6ae3d..0cbf66ed1 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@  from __future__ import unicode_literals -__version__ = '2015.02.24.2' +__version__ = '2015.02.26' | 
