diff options
| author | Alexander Seiler <seileralex@gmail.com> | 2017-11-11 19:30:10 +0100 | 
|---|---|---|
| committer | Remita Amine <remitamine@gmail.com> | 2021-02-25 15:50:49 +0100 | 
| commit | 3c58f9e0b9d8471212406e012727374db084932b (patch) | |
| tree | c738a7898fa18115f314b6fcbcb08f67b806e3b4 /youtube_dl/extractor/srgssr.py | |
| parent | ef28e33249f650b3f8d40c3e62b9df2c6103b360 (diff) | |
[srgssr] improve extraction
- extract subtitles
- fix extraction for new videos
- update srf download domains
closes #14717
closes #14725
closes #27231
closes #28238
Diffstat (limited to 'youtube_dl/extractor/srgssr.py')
| -rw-r--r-- | youtube_dl/extractor/srgssr.py | 208 | 
1 files changed, 134 insertions, 74 deletions
| diff --git a/youtube_dl/extractor/srgssr.py b/youtube_dl/extractor/srgssr.py index f63a1359a..ac018e740 100644 --- a/youtube_dl/extractor/srgssr.py +++ b/youtube_dl/extractor/srgssr.py @@ -4,16 +4,32 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor -from ..compat import compat_urllib_parse_urlparse  from ..utils import (      ExtractorError, +    float_or_none, +    int_or_none,      parse_iso8601,      qualities, +    try_get,  )  class SRGSSRIE(InfoExtractor): -    _VALID_URL = r'(?:https?://tp\.srgssr\.ch/p(?:/[^/]+)+\?urn=urn|srgssr):(?P<bu>srf|rts|rsi|rtr|swi):(?:[^:]+:)?(?P<type>video|audio):(?P<id>[0-9a-f\-]{36}|\d+)' +    _VALID_URL = r'''(?x) +                    (?: +                        https?://tp\.srgssr\.ch/p(?:/[^/]+)+\?urn=urn| +                        srgssr +                    ): +                    (?P<bu> +                        srf|rts|rsi|rtr|swi +                    ):(?:[^:]+:)? +                    (?P<type> +                        video|audio +                    ): +                    (?P<id> +                        [0-9a-f\-]{36}|\d+ +                    ) +                    '''      _GEO_BYPASS = False      _GEO_COUNTRIES = ['CH'] @@ -25,25 +41,39 @@ class SRGSSRIE(InfoExtractor):          'LEGAL': 'The video cannot be transmitted for legal reasons.',          'STARTDATE': 'This video is not yet available. 
Please try again later.',      } +    _DEFAULT_LANGUAGE_CODES = { +        'srf': 'de', +        'rts': 'fr', +        'rsi': 'it', +        'rtr': 'rm', +        'swi': 'en', +    }      def _get_tokenized_src(self, url, video_id, format_id): -        sp = compat_urllib_parse_urlparse(url).path.split('/')          token = self._download_json( -            'http://tp.srgssr.ch/akahd/token?acl=/%s/%s/*' % (sp[1], sp[2]), +            'http://tp.srgssr.ch/akahd/token?acl=*',              video_id, 'Downloading %s token' % format_id, fatal=False) or {} -        auth_params = token.get('token', {}).get('authparams') +        auth_params = try_get(token, lambda x: x['token']['authparams'])          if auth_params: -            url += '?' + auth_params +            url += ('?' if '?' not in url else '&') + auth_params          return url -    def get_media_data(self, bu, media_type, media_id): -        media_data = self._download_json( -            'http://il.srgssr.ch/integrationlayer/1.0/ue/%s/%s/play/%s.json' % (bu, media_type, media_id), -            media_id)[media_type.capitalize()] - -        if media_data.get('block') and media_data['block'] in self._ERRORS: -            message = self._ERRORS[media_data['block']] -            if media_data['block'] == 'GEOBLOCK': +    def _get_media_data(self, bu, media_type, media_id): +        query = {'onlyChapters': True} if media_type == 'video' else {} +        full_media_data = self._download_json( +            'https://il.srgssr.ch/integrationlayer/2.0/%s/mediaComposition/%s/%s.json' +            % (bu, media_type, media_id), +            media_id, query=query)['chapterList'] +        try: +            media_data = next( +                x for x in full_media_data if x.get('id') == media_id) +        except StopIteration: +            raise ExtractorError('No media information found') + +        block_reason = media_data.get('blockReason') +        if block_reason and block_reason in self._ERRORS: +            message = 
self._ERRORS[block_reason] +            if block_reason == 'GEOBLOCK':                  self.raise_geo_restricted(                      msg=message, countries=self._GEO_COUNTRIES)              raise ExtractorError( @@ -53,53 +83,75 @@ class SRGSSRIE(InfoExtractor):      def _real_extract(self, url):          bu, media_type, media_id = re.match(self._VALID_URL, url).groups() +        media_data = self._get_media_data(bu, media_type, media_id) +        title = media_data['title'] -        media_data = self.get_media_data(bu, media_type, media_id) - -        metadata = media_data['AssetMetadatas']['AssetMetadata'][0] -        title = metadata['title'] -        description = metadata.get('description') -        created_date = media_data.get('createdDate') or metadata.get('createdDate') -        timestamp = parse_iso8601(created_date) - -        thumbnails = [{ -            'id': image.get('id'), -            'url': image['url'], -        } for image in media_data.get('Image', {}).get('ImageRepresentations', {}).get('ImageRepresentation', [])] - -        preference = qualities(['LQ', 'MQ', 'SD', 'HQ', 'HD'])          formats = [] -        for source in media_data.get('Playlists', {}).get('Playlist', []) + media_data.get('Downloads', {}).get('Download', []): -            protocol = source.get('@protocol') -            for asset in source['url']: -                asset_url = asset['text'] -                quality = asset['@quality'] -                format_id = '%s-%s' % (protocol, quality) -                if protocol.startswith('HTTP-HDS') or protocol.startswith('HTTP-HLS'): -                    asset_url = self._get_tokenized_src(asset_url, media_id, format_id) -                    if protocol.startswith('HTTP-HDS'): -                        formats.extend(self._extract_f4m_formats( -                            asset_url + ('?' if '?' 
not in asset_url else '&') + 'hdcore=3.4.0', -                            media_id, f4m_id=format_id, fatal=False)) -                    elif protocol.startswith('HTTP-HLS'): -                        formats.extend(self._extract_m3u8_formats( -                            asset_url, media_id, 'mp4', 'm3u8_native', -                            m3u8_id=format_id, fatal=False)) -                else: -                    formats.append({ -                        'format_id': format_id, -                        'url': asset_url, -                        'preference': preference(quality), -                        'ext': 'flv' if protocol == 'RTMP' else None, -                    }) +        q = qualities(['SD', 'HD']) +        for source in (media_data.get('resourceList') or []): +            format_url = source.get('url') +            if not format_url: +                continue +            protocol = source.get('protocol') +            quality = source.get('quality') +            format_id = [] +            for e in (protocol, source.get('encoding'), quality): +                if e: +                    format_id.append(e) +            format_id = '-'.join(format_id) + +            if protocol in ('HDS', 'HLS'): +                if source.get('tokenType') == 'AKAMAI': +                    format_url = self._get_tokenized_src( +                        format_url, media_id, format_id) +                    formats.extend(self._extract_akamai_formats( +                        format_url, media_id)) +                elif protocol == 'HLS': +                    formats.extend(self._extract_m3u8_formats( +                        format_url, media_id, 'mp4', 'm3u8_native', +                        m3u8_id=format_id, fatal=False)) +            elif protocol in ('HTTP', 'HTTPS'): +                formats.append({ +                    'format_id': format_id, +                    'url': format_url, +                    'quality': q(quality), +                }) + +        # This 
is needed because for audio medias the podcast url is usually +        # always included, even if is only an audio segment and not the +        # whole episode. +        if int_or_none(media_data.get('position')) == 0: +            for p in ('S', 'H'): +                podcast_url = media_data.get('podcast%sdUrl' % p) +                if not podcast_url: +                    continue +                quality = p + 'D' +                formats.append({ +                    'format_id': 'PODCAST-' + quality, +                    'url': podcast_url, +                    'quality': q(quality), +                })          self._sort_formats(formats) +        subtitles = {} +        if media_type == 'video': +            for sub in (media_data.get('subtitleList') or []): +                sub_url = sub.get('url') +                if not sub_url: +                    continue +                lang = sub.get('locale') or self._DEFAULT_LANGUAGE_CODES[bu] +                subtitles.setdefault(lang, []).append({ +                    'url': sub_url, +                }) +          return {              'id': media_id,              'title': title, -            'description': description, -            'timestamp': timestamp, -            'thumbnails': thumbnails, +            'description': media_data.get('description'), +            'timestamp': parse_iso8601(media_data.get('date')), +            'thumbnail': media_data.get('imageUrl'), +            'duration': float_or_none(media_data.get('duration'), 1000), +            'subtitles': subtitles,              'formats': formats,          } @@ -119,26 +171,17 @@ class SRGSSRPlayIE(InfoExtractor):      _TESTS = [{          'url': 'http://www.srf.ch/play/tv/10vor10/video/snowden-beantragt-asyl-in-russland?id=28e1a57d-5b76-4399-8ab3-9097f071e6c5', -        'md5': 'da6b5b3ac9fa4761a942331cef20fcb3', +        'md5': '6db2226ba97f62ad42ce09783680046c',          'info_dict': {              'id': '28e1a57d-5b76-4399-8ab3-9097f071e6c5',    
          'ext': 'mp4',              'upload_date': '20130701',              'title': 'Snowden beantragt Asyl in Russland', -            'timestamp': 1372713995, -        } -    }, { -        # No Speichern (Save) button -        'url': 'http://www.srf.ch/play/tv/top-gear/video/jaguar-xk120-shadow-und-tornado-dampflokomotive?id=677f5829-e473-4823-ac83-a1087fe97faa', -        'md5': '0a274ce38fda48c53c01890651985bc6', -        'info_dict': { -            'id': '677f5829-e473-4823-ac83-a1087fe97faa', -            'ext': 'flv', -            'upload_date': '20130710', -            'title': 'Jaguar XK120, Shadow und Tornado-Dampflokomotive', -            'description': 'md5:88604432b60d5a38787f152dec89cd56', -            'timestamp': 1373493600, +            'timestamp': 1372708215, +            'duration': 113.827, +            'thumbnail': r're:^https?://.*1383719781\.png$',          }, +        'expected_warnings': ['Unable to download f4m manifest'],      }, {          'url': 'http://www.rtr.ch/play/radio/actualitad/audio/saira-tujetsch-tuttina-cuntinuar-cun-sedrun-muster-turissem?id=63cb0778-27f8-49af-9284-8c7a8c6d15fc',          'info_dict': { @@ -146,7 +189,8 @@ class SRGSSRPlayIE(InfoExtractor):              'ext': 'mp3',              'upload_date': '20151013',              'title': 'Saira: Tujetsch - tuttina cuntinuar cun Sedrun Mustér Turissem', -            'timestamp': 1444750398, +            'timestamp': 1444709160, +            'duration': 336.816,          },          'params': {              # rtmp download @@ -159,20 +203,33 @@ class SRGSSRPlayIE(InfoExtractor):              'id': '6348260',              'display_id': '6348260',              'ext': 'mp4', -            'duration': 1796, +            'duration': 1796.76,              'title': 'Le 19h30', -            'description': '', -            'uploader': '19h30',              'upload_date': '20141201',              'timestamp': 1417458600,              'thumbnail': r're:^https?://.*\.image', -      
      'view_count': int,          },          'params': {              # m3u8 download              'skip_download': True,          }      }, { +        'url': 'http://play.swissinfo.ch/play/tv/business/video/why-people-were-against-tax-reforms?id=42960270', +        'info_dict': { +            'id': '42960270', +            'ext': 'mp4', +            'title': 'Why people were against tax reforms', +            'description': 'md5:7ac442c558e9630e947427469c4b824d', +            'duration': 94.0, +            'upload_date': '20170215', +            'timestamp': 1487173560, +            'thumbnail': r're:https?://www\.swissinfo\.ch/srgscalableimage/42961964', +            'subtitles': 'count:9', +        }, +        'params': { +            'skip_download': True, +        } +    }, {          'url': 'https://www.srf.ch/play/tv/popupvideoplayer?id=c4dba0ca-e75b-43b2-a34f-f708a4932e01',          'only_matching': True,      }, { @@ -181,6 +238,10 @@ class SRGSSRPlayIE(InfoExtractor):      }, {          'url': 'https://www.rts.ch/play/tv/19h30/video/le-19h30?urn=urn:rts:video:6348260',          'only_matching': True, +    }, { +        # audio segment, has podcastSdUrl of the full episode +        'url': 'https://www.srf.ch/play/radio/popupaudioplayer?id=50b20dc8-f05b-4972-bf03-e438ff2833eb', +        'only_matching': True,      }]      def _real_extract(self, url): @@ -188,5 +249,4 @@ class SRGSSRPlayIE(InfoExtractor):          bu = mobj.group('bu')          media_type = mobj.group('type') or mobj.group('type_2')          media_id = mobj.group('id') -        # other info can be extracted from url + '&layout=json'          return self.url_result('srgssr:%s:%s:%s' % (bu[:3], media_type, media_id), 'SRGSSR') | 
