diff options
| author | Sergey M․ <dstftw@gmail.com> | 2017-10-02 04:14:36 +0700 | 
|---|---|---|
| committer | Sergey M․ <dstftw@gmail.com> | 2017-10-02 04:14:36 +0700 | 
| commit | 117589dfa2ac32566dc705ee7e712139105e6dd8 (patch) | |
| tree | a0d90fea8f5354f6aee1bc056492c164cb24b7ff /youtube_dl/extractor/canvas.py | |
| parent | 839728f5bfad3a6166be3839009b13963f00dfac (diff) | |
[canvas] Generalize mediazone.vrt.be extractor and rework canvas and een
Diffstat (limited to 'youtube_dl/extractor/canvas.py')
| -rw-r--r-- | youtube_dl/extractor/canvas.py | 142 | 
1 file changed, 95 insertions, 47 deletions
```diff
diff --git a/youtube_dl/extractor/canvas.py b/youtube_dl/extractor/canvas.py
index aada02917..6899f8431 100644
--- a/youtube_dl/extractor/canvas.py
+++ b/youtube_dl/extractor/canvas.py
@@ -3,24 +3,104 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
-from ..utils import float_or_none
+from ..utils import (
+    float_or_none,
+    strip_or_none,
+)
 
 
 class CanvasIE(InfoExtractor):
+    _VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?P<site_id>canvas|een|ketnet)/assets/(?P<id>m[dz]-ast-[^/?#&]+)'
+    _TESTS = [{
+        'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
+        'md5': '90139b746a0a9bd7bb631283f6e2a64e',
+        'info_dict': {
+            'id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
+            'display_id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
+            'ext': 'flv',
+            'title': 'Nachtwacht: De Greystook',
+            'description': 'md5:1db3f5dc4c7109c821261e7512975be7',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 1468.03,
+        },
+        'expected_warnings': ['is not a supported codec', 'Unknown MIME type'],
+    }, {
+        'url': 'https://mediazone.vrt.be/api/v1/canvas/assets/mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        site_id, video_id = mobj.group('site_id'), mobj.group('id')
+
+        data = self._download_json(
+            'https://mediazone.vrt.be/api/v1/%s/assets/%s'
+            % (site_id, video_id), video_id)
+
+        title = data['title']
+        description = data.get('description')
+
+        formats = []
+        for target in data['targetUrls']:
+            format_url, format_type = target.get('url'), target.get('type')
+            if not format_url or not format_type:
+                continue
+            if format_type == 'HLS':
+                formats.extend(self._extract_m3u8_formats(
+                    format_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                    m3u8_id=format_type, fatal=False))
+            elif format_type == 'HDS':
+                formats.extend(self._extract_f4m_formats(
+                    format_url, video_id, f4m_id=format_type, fatal=False))
+            elif format_type == 'MPEG_DASH':
+                formats.extend(self._extract_mpd_formats(
+                    format_url, video_id, mpd_id=format_type, fatal=False))
+            elif format_type == 'HSS':
+                formats.extend(self._extract_ism_formats(
+                    format_url, video_id, ism_id='mss', fatal=False))
+            else:
+                formats.append({
+                    'format_id': format_type,
+                    'url': format_url,
+                })
+        self._sort_formats(formats)
+
+        subtitles = {}
+        subtitle_urls = data.get('subtitleUrls')
+        if isinstance(subtitle_urls, list):
+            for subtitle in subtitle_urls:
+                subtitle_url = subtitle.get('url')
+                if subtitle_url and subtitle.get('type') == 'CLOSED':
+                    subtitles.setdefault('nl', []).append({'url': subtitle_url})
+
+        return {
+            'id': video_id,
+            'display_id': video_id,
+            'title': title,
+            'description': description,
+            'formats': formats,
+            'duration': float_or_none(data.get('duration'), 1000),
+            'thumbnail': data.get('posterImageUrl'),
+            'subtitles': subtitles,
+        }
+
+
+class CanvasEenIE(InfoExtractor):
     IE_DESC = 'canvas.be and een.be'
     _VALID_URL = r'https?://(?:www\.)?(?P<site_id>canvas|een)\.be/(?:[^/]+/)*(?P<id>[^/?#&]+)'
     _TESTS = [{
         'url': 'http://www.canvas.be/video/de-afspraak/najaar-2015/de-afspraak-veilt-voor-de-warmste-week',
-        'md5': 'ea838375a547ac787d4064d8c7860a6c',
+        'md5': 'ed66976748d12350b118455979cca293',
         'info_dict': {
             'id': 'mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e',
             'display_id': 'de-afspraak-veilt-voor-de-warmste-week',
-            'ext': 'mp4',
+            'ext': 'flv',
             'title': 'De afspraak veilt voor de Warmste Week',
             'description': 'md5:24cb860c320dc2be7358e0e5aa317ba6',
             'thumbnail': r're:^https?://.*\.jpg$',
             'duration': 49.02,
-        }
+        },
+        'expected_warnings': ['is not a supported codec'],
     }, {
         # with subtitles
         'url': 'http://www.canvas.be/video/panorama/2016/pieter-0167',
@@ -40,7 +120,8 @@ class CanvasIE(InfoExtractor):
         },
         'params': {
             'skip_download': True,
-        }
+        },
+        'skip': 'Pagina niet gevonden',
     }, {
         'url': 'https://www.een.be/sorry-voor-alles/herbekijk-sorry-voor-alles',
         'info_dict': {
@@ -54,7 +135,8 @@ class CanvasIE(InfoExtractor):
         },
         'params': {
             'skip_download': True,
-        }
+        },
+        'skip': 'Episode no longer available',
     }, {
         'url': 'https://www.canvas.be/check-point/najaar-2016/de-politie-uw-vriend',
         'only_matching': True,
@@ -66,55 +148,21 @@ class CanvasIE(InfoExtractor):
 
         webpage = self._download_webpage(url, display_id)
 
-        title = (self._search_regex(
+        title = strip_or_none(self._search_regex(
             r'<h1[^>]+class="video__body__header__title"[^>]*>(.+?)</h1>',
             webpage, 'title', default=None) or self._og_search_title(
-            webpage)).strip()
+            webpage, default=None))
 
         video_id = self._html_search_regex(
-            r'data-video=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage, 'video id', group='id')
-
-        data = self._download_json(
-            'https://mediazone.vrt.be/api/v1/%s/assets/%s'
-            % (site_id, video_id), display_id)
-
-        formats = []
-        for target in data['targetUrls']:
-            format_url, format_type = target.get('url'), target.get('type')
-            if not format_url or not format_type:
-                continue
-            if format_type == 'HLS':
-                formats.extend(self._extract_m3u8_formats(
-                    format_url, display_id, entry_protocol='m3u8_native',
-                    ext='mp4', preference=0, fatal=False, m3u8_id=format_type))
-            elif format_type == 'HDS':
-                formats.extend(self._extract_f4m_formats(
-                    format_url, display_id, f4m_id=format_type, fatal=False))
-            elif format_type == 'MPEG_DASH':
-                formats.extend(self._extract_mpd_formats(
-                    format_url, display_id, mpd_id=format_type, fatal=False))
-            else:
-                formats.append({
-                    'format_id': format_type,
-                    'url': format_url,
-                })
-        self._sort_formats(formats)
-
-        subtitles = {}
-        subtitle_urls = data.get('subtitleUrls')
-        if isinstance(subtitle_urls, list):
-            for subtitle in subtitle_urls:
-                subtitle_url = subtitle.get('url')
-                if subtitle_url and subtitle.get('type') == 'CLOSED':
-                    subtitles.setdefault('nl', []).append({'url': subtitle_url})
+            r'data-video=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage, 'video id',
+            group='id')
 
         return {
+            '_type': 'url_transparent',
+            'url': 'https://mediazone.vrt.be/api/v1/%s/assets/%s' % (site_id, video_id),
+            'ie_key': CanvasIE.ie_key(),
             'id': video_id,
             'display_id': display_id,
             'title': title,
             'description': self._og_search_description(webpage),
-            'formats': formats,
-            'duration': float_or_none(data.get('duration'), 1000),
-            'thumbnail': data.get('posterImageUrl'),
-            'subtitles': subtitles,
         }
```
