diff options
| author | Sergey M․ <dstftw@gmail.com> | 2015-02-28 22:25:57 +0600 | 
|---|---|---|
| committer | Sergey M․ <dstftw@gmail.com> | 2015-02-28 22:25:57 +0600 | 
| commit | 6c87c2eea8b7d14c4178aaae3d74559347a772e6 (patch) | |
| tree | 737ed9507ce8ab98cc16ebb83ec34d39b4fd8669 | |
| parent | 58c2ec6ab32279c311e16842c8229f6f56104778 (diff) | |
[puls4] Improve and extract more metadata
| -rw-r--r-- | youtube_dl/extractor/puls4.py | 97 | 
1 files changed, 62 insertions, 35 deletions
```diff
diff --git a/youtube_dl/extractor/puls4.py b/youtube_dl/extractor/puls4.py
index 70dedbff3..cce84b9e4 100644
--- a/youtube_dl/extractor/puls4.py
+++ b/youtube_dl/extractor/puls4.py
@@ -1,61 +1,88 @@
 # -*- coding: utf-8 -*-
-
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
-
-import re
+from ..utils import (
+    ExtractorError,
+    unified_strdate,
+    int_or_none,
+)
 
 
 class Puls4IE(InfoExtractor):
-
-    _VALID_URL = r'https?://www.puls4.com/video/.+?/play/(?P<id>[0-9]+)'
+    _VALID_URL = r'https?://(?:www\.)?puls4\.com/video/[^/]+/play/(?P<id>[0-9]+)'
     _TESTS = [{
         'url': 'http://www.puls4.com/video/pro-und-contra/play/2716816',
         'md5': '49f6a6629747eeec43cef6a46b5df81d',
         'info_dict': {
             'id': '2716816',
             'ext': 'mp4',
-            'title': 'Pro und Contra vom 23.02.2015'}},
-        {
+            'title': 'Pro und Contra vom 23.02.2015',
+            'description': 'md5:293e44634d9477a67122489994675db6',
+            'duration': 2989,
+            'upload_date': '20150224',
+            'uploader': 'PULS_4',
+        },
+        'skip': 'Only works from Germany',
+    }, {
         'url': 'http://www.puls4.com/video/kult-spielfilme/play/1298106',
         'md5': '6a48316c8903ece8dab9b9a7bf7a59ec',
         'info_dict': {
             'id': '1298106',
             'ext': 'mp4',
-            'title': 'Lucky Fritz'}}
-    ]
+            'title': 'Lucky Fritz',
+        },
+        'skip': 'Only works from Germany',
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
 
-        # if fsk-button
-        real_url = self._html_search_regex(r'\"fsk-button\".+?href=\"([^"]+)',
-                                           webpage, 'fsk_button', default=None)
+        error_message = self._html_search_regex(
+            r'<div class="message-error">(.+?)</div>',
+            webpage, 'error message', default=None)
+        if error_message:
+            raise ExtractorError(
+                '%s returned error: %s' % (self.IE_NAME, error_message), expected=True)
+
+        real_url = self._html_search_regex(
+            r'\"fsk-button\".+?href=\"([^"]+)',
+            webpage, 'fsk_button', default=None)
         if real_url:
             webpage = self._download_webpage(real_url, video_id)
 
-        title = self._html_search_regex(
-            r'<div id="bg_brandableContent">.+?<h1>(.+?)</h1>',
-            webpage, 'title', flags=re.DOTALL)
-
-        sd_url = self._html_search_regex(
-            r'{\"url\":\"([^"]+?)\",\"hd\":false',
-            webpage, 'sd_url').replace('\\', '')
-
-        formats = [{'format_id': 'sd', 'url': sd_url, 'quality': -2}]
-
-        hd_url = self._html_search_regex(
-            r'{\"url\":\"([^"]+?)\",\"hd\":true',
-            webpage, 'hd_url', default=None)
-        if hd_url:
-            hd_url = hd_url.replace('\\', '')
-            formats.append({'format_id': 'hd', 'url': hd_url, 'quality': -1})
-
-        return {
-            'id': video_id,
-            'title': title,
-            'formats': formats,
-            'ext': 'mp4'
-        }
+        player = self._search_regex(
+            r'p4_video_player(?:_iframe)?\("video_\d+_container"\s*,(.+?)\);\s*\}',
+            webpage, 'player')
+
+        player_json = self._parse_json(
+            '[%s]' % player, video_id,
+            transform_source=lambda s: s.replace('undefined,', ''))
+
+        formats = None
+        result = None
+
+        for v in player_json:
+            if isinstance(v, list) and not formats:
+                formats = [{
+                    'url': f['url'],
+                    'format': 'hd' if f.get('hd') else 'sd',
+                    'width': int_or_none(f.get('size_x')),
+                    'height': int_or_none(f.get('size_y')),
+                    'tbr': int_or_none(f.get('bitrate')),
+                } for f in v]
+                self._sort_formats(formats)
+            elif isinstance(v, dict) and not result:
+                result = {
+                    'id': video_id,
+                    'title': v['videopartname'].strip(),
+                    'description': v.get('videotitle'),
+                    'duration': int_or_none(v.get('videoduration') or v.get('episodeduration')),
+                    'upload_date': unified_strdate(v.get('clipreleasetime')),
+                    'uploader': v.get('channel'),
+                }
+
+        result['formats'] = formats
+
+        return result
```
