diff options
| author | Remita Amine <remitamine@gmail.com> | 2018-06-17 02:43:24 +0100 | 
|---|---|---|
| committer | Remita Amine <remitamine@gmail.com> | 2018-06-17 03:13:41 +0100 | 
| commit | 764cd4e6f3450997eb0499b68b17b580a5e074f3 (patch) | |
| tree | b68d963a03659b8ce79b252761cdcaa9bf3e83d1 | |
| parent | 734d461ca04a9f271dd463aa75d44ac82377057e (diff) | |
[rtbf] improve extraction
- add support for audio and live streams (closes #11923) (closes #9638)
- extract HLS, DASH and all HTTP formats
- extract subtitles
- fix up specific HTTP URLs (fixes #16101)
| -rw-r--r-- | youtube_dl/extractor/rtbf.py | 127 | 
1 file changed, 95 insertions, 32 deletions
| diff --git a/youtube_dl/extractor/rtbf.py b/youtube_dl/extractor/rtbf.py index 28cc5522d..acff9766a 100644 --- a/youtube_dl/extractor/rtbf.py +++ b/youtube_dl/extractor/rtbf.py @@ -1,10 +1,14 @@  # coding: utf-8  from __future__ import unicode_literals +import re +  from .common import InfoExtractor  from ..utils import ( -    int_or_none,      ExtractorError, +    float_or_none, +    int_or_none, +    strip_or_none,  ) @@ -14,20 +18,19 @@ class RTBFIE(InfoExtractor):          (?:              video/[^?]+\?.*\bid=|              ouftivi/(?:[^/]+/)*[^?]+\?.*\bvideoId=| -            auvio/[^/]+\?.*id= +            auvio/[^/]+\?.*\b(?P<live>l)?id=          )(?P<id>\d+)'''      _TESTS = [{          'url': 'https://www.rtbf.be/video/detail_les-diables-au-coeur-episode-2?id=1921274', -        'md5': '799f334ddf2c0a582ba80c44655be570', +        'md5': '8c876a1cceeb6cf31b476461ade72384',          'info_dict': {              'id': '1921274',              'ext': 'mp4',              'title': 'Les Diables au coeur (épisode 2)', -            'description': 'Football - Diables Rouges', -            'duration': 3099, +            'description': '(du 25/04/2014)', +            'duration': 3099.54,              'upload_date': '20140425', -            'timestamp': 1398456336, -            'uploader': 'rtbfsport', +            'timestamp': 1398456300,          }      }, {          # geo restricted @@ -39,6 +42,18 @@ class RTBFIE(InfoExtractor):      }, {          'url': 'http://www.rtbf.be/auvio/detail_jeudi-en-prime-siegfried-bracke?id=2102996',          'only_matching': True, +    }, { +        # Live +        'url': 'https://www.rtbf.be/auvio/direct_pure-fm?lid=134775', +        'only_matching': True, +    }, { +        # Audio +        'url': 'https://www.rtbf.be/auvio/detail_cinq-heures-cinema?id=2360811', +        'only_matching': True, +    }, { +        # With Subtitle +        'url': 'https://www.rtbf.be/auvio/detail_les-carnets-du-bourlingueur?id=2361588', +        
'only_matching': True,      }]      _IMAGE_HOST = 'http://ds1.ds.static.rtbf.be'      _PROVIDERS = { @@ -53,46 +68,94 @@ class RTBFIE(InfoExtractor):      ]      def _real_extract(self, url): -        video_id = self._match_id(url) -        data = self._download_json( -            'http://www.rtbf.be/api/media/video?method=getVideoDetail&args[]=%s' % video_id, video_id) +        live, media_id = re.match(self._VALID_URL, url).groups() +        embed_page = self._download_webpage( +            'https://www.rtbf.be/auvio/embed/' + ('direct' if live else 'media'), +            media_id, query={'id': media_id}) +        data = self._parse_json(self._html_search_regex( +            r'data-media="([^"]+)"', embed_page, 'media data'), media_id)          error = data.get('error')          if error:              raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) -        data = data['data'] -          provider = data.get('provider')          if provider in self._PROVIDERS:              return self.url_result(data['url'], self._PROVIDERS[provider]) +        title = data['title'] +        is_live = data.get('isLive') +        if is_live: +            title = self._live_title(title) +        height_re = r'-(\d+)p\.'          
formats = [] -        for key, format_id in self._QUALITIES: -            format_url = data.get(key + 'Url') -            if format_url: + +        m3u8_url = data.get('urlHlsAes128') or data.get('urlHls') +        if m3u8_url: +            formats.extend(self._extract_m3u8_formats( +                m3u8_url, media_id, 'mp4', m3u8_id='hls', fatal=False)) + +        fix_url = lambda x: x.replace('//rtbf-vod.', '//rtbf.') if '/geo/drm/' in x else x +        http_url = data.get('url') +        if formats and http_url and re.search(height_re, http_url): +            http_url = fix_url(http_url) +            for m3u8_f in formats.copy(): +                height = m3u8_f.get('height') +                if not height: +                    continue +                f = m3u8_f.copy() +                del f['protocol'] +                f.update({ +                    'format_id': m3u8_f['format_id'].replace('hls-', 'http-'), +                    'url': re.sub(height_re, '-%dp.' % height, http_url), +                }) +                formats.append(f) +        else: +            sources = data.get('sources') or {} +            for key, format_id in self._QUALITIES: +                format_url = sources.get(key) +                if not format_url: +                    continue +                height = int_or_none(self._search_regex( +                    height_re, format_url, 'height', default=None))                  formats.append({                      'format_id': format_id, -                    'url': format_url, +                    'url': fix_url(format_url), +                    'height': height,                  }) -        thumbnails = [] -        for thumbnail_id, thumbnail_url in data.get('thumbnail', {}).items(): -            if thumbnail_id != 'default': -                thumbnails.append({ -                    'url': self._IMAGE_HOST + thumbnail_url, -                    'id': thumbnail_id, -                }) +        mpd_url = data.get('urlDash') +        if 
not data.get('drm') and mpd_url: +            formats.extend(self._extract_mpd_formats( +                mpd_url, media_id, mpd_id='dash', fatal=False)) + +        audio_url = data.get('urlAudio') +        if audio_url: +            formats.append({ +                'format_id': 'audio', +                'url': audio_url, +                'vcodec': 'none', +            }) +        self._sort_formats(formats) + +        subtitles = {} +        for track in (data.get('tracks') or {}).values(): +            sub_url = track.get('url') +            if not sub_url: +                continue +            subtitles.setdefault(track.get('lang') or 'fr', []).append({ +                'url': sub_url, +            })          return { -            'id': video_id, +            'id': media_id,              'formats': formats, -            'title': data['title'], -            'description': data.get('description') or data.get('subtitle'), -            'thumbnails': thumbnails, -            'duration': data.get('duration') or data.get('realDuration'), -            'timestamp': int_or_none(data.get('created')), -            'view_count': int_or_none(data.get('viewCount')), -            'uploader': data.get('channel'), -            'tags': data.get('tags'), +            'title': title, +            'description': strip_or_none(data.get('description')), +            'thumbnail': data.get('thumbnail'), +            'duration': float_or_none(data.get('realDuration')), +            'timestamp': int_or_none(data.get('liveFrom')), +            'series': data.get('programLabel'), +            'subtitles': subtitles, +            'is_live': is_live,          } | 
