diff options
| author | Remita Amine <remitamine@gmail.com> | 2019-04-28 12:03:39 +0100 | 
|---|---|---|
| committer | Remita Amine <remitamine@gmail.com> | 2019-04-28 12:03:39 +0100 | 
| commit | 280913800dff225d7171ccdbe09d7ce01fdf5d3f (patch) | |
| tree | 7e4d8e4acfd20fb0ca06c13daf698b318cda29ec | |
| parent | 7ff8ad80f1442fc213a6463fa824a70d397b0745 (diff) | |
[sverigesradio] improve extraction (closes #18635)
| -rw-r--r-- | youtube_dl/extractor/sverigesradio.py | 108 | 
1 file changed, 59 insertions, 49 deletions
| diff --git a/youtube_dl/extractor/sverigesradio.py b/youtube_dl/extractor/sverigesradio.py index 05de31a79..aa0691f0d 100644 --- a/youtube_dl/extractor/sverigesradio.py +++ b/youtube_dl/extractor/sverigesradio.py @@ -2,58 +2,70 @@  from __future__ import unicode_literals  from .common import InfoExtractor -from ..utils import int_or_none +from ..utils import ( +    determine_ext, +    int_or_none, +    str_or_none, +)  class SverigesRadioBaseIE(InfoExtractor): -    _BASE_URL = 'https://sverigesradio.se/sida/playerajax' -    _QUALITIES = ['high', 'medium', 'low'] -    _CODING_FORMATS = { -        5: {'acodec': 'mp3', 'abr': 128}, -        11: {'acodec': 'aac', 'abr': 192}, -        12: {'acodec': 'aac', 'abr': 32}, -        13: {'acodec': 'aac', 'abr': 96}, +    _BASE_URL = 'https://sverigesradio.se/sida/playerajax/' +    _QUALITIES = ['low', 'medium', 'high'] +    _EXT_TO_CODEC_MAP = { +        'mp3': 'mp3', +        'm4a': 'aac', +    } +    _CODING_FORMAT_TO_ABR_MAP = { +        5: 128, +        11: 192, +        12: 32, +        13: 96,      } -    def _extract_formats(self, query, audio_id, audio_type): -        audiourls = {} -        for quality in self._QUALITIES: -            audiourl = self._download_json( -                self._BASE_URL + '/getaudiourl', audio_id, -                fatal=True, -                query=dict(query, type=audio_type, quality=quality, format='iis')) -            if audiourl is None: -                continue - -            # for some reason url can be empty, skip if so -            # also skip if url has already been seen (quality parameter is ignored?) 
-            url = audiourl.get('audioUrl') -            if url is None or url == "" or url in audiourls: -                continue - -            audioformat = {'vcodec': 'none', 'url': url} -            # add codec and bitrate if known coding format -            codingformat = audiourl.get('codingFormat') -            if codingformat: -                audioformat.update(self._CODING_FORMATS.get(codingformat, {})) - -            audiourls[url] = audioformat - -        return audiourls.values() - -    def _extract_audio(self, audio_type, url): +    def _real_extract(self, url):          audio_id = self._match_id(url) -        query = {'id': audio_id, 'type': audio_type} +        query = { +            'id': audio_id, +            'type': self._AUDIO_TYPE, +        } -        metadata = self._download_json(self._BASE_URL + '/audiometadata', audio_id, query=query) -        item = metadata['items'][0] +        item = self._download_json( +            self._BASE_URL + 'audiometadata', audio_id, +            'Downloading audio JSON metadata', query=query)['items'][0] +        title = item['subtitle'] -        formats = self._extract_formats(query, audio_id, audio_type) +        query['format'] = 'iis' +        urls = [] +        formats = [] +        for quality in self._QUALITIES: +            query['quality'] = quality +            audio_url_data = self._download_json( +                self._BASE_URL + 'getaudiourl', audio_id, +                'Downloading %s format JSON metadata' % quality, +                fatal=False, query=query) or {} +            audio_url = audio_url_data.get('audioUrl') +            if not audio_url or audio_url in urls: +                continue +            urls.append(audio_url) +            ext = determine_ext(audio_url) +            coding_format = audio_url_data.get('codingFormat') +            abr = int_or_none(self._search_regex( +                r'_a(\d+)\.m4a', audio_url, 'audio bitrate', +                default=None)) or 
self._CODING_FORMAT_TO_ABR_MAP.get(coding_format) +            formats.append({ +                'abr': abr, +                'acodec': self._EXT_TO_CODEC_MAP.get(ext), +                'ext': ext, +                'format_id': str_or_none(coding_format), +                'vcodec': 'none', +                'url': audio_url, +            })          self._sort_formats(formats)          return {              'id': audio_id, -            'title': item['subtitle'], +            'title': title,              'formats': formats,              'series': item.get('title'),              'duration': int_or_none(item.get('duration')), @@ -63,7 +75,8 @@ class SverigesRadioBaseIE(InfoExtractor):  class SverigesRadioPublicationIE(SverigesRadioBaseIE): -    _VALID_URL = r'https?://(?:www\.)?sverigesradio\.se/sida/(?:artikel|gruppsida)\.aspx\?.*artikel=(?P<id>[0-9]+)' +    IE_NAME = 'sverigesradio:publication' +    _VALID_URL = r'https?://(?:www\.)?sverigesradio\.se/sida/(?:artikel|gruppsida)\.aspx\?.*?\bartikel=(?P<id>[0-9]+)'      _TESTS = [{          'url': 'https://sverigesradio.se/sida/artikel.aspx?programid=83&artikel=7038546',          'md5': '6a4917e1923fccb080e5a206a5afa542', @@ -74,18 +87,17 @@ class SverigesRadioPublicationIE(SverigesRadioBaseIE):              'series': 'Nyheter (Ekot)',              'title': 'Esa Teittinen: Sanningen har inte kommit fram',              'description': 'md5:daf7ce66a8f0a53d5465a5984d3839df', -            'thumbnail': 're:^https://static-cdn.sr.se/sida/images/', +            'thumbnail': r're:^https?://.*\.jpg',          },      }, {          'url': 'https://sverigesradio.se/sida/gruppsida.aspx?programid=3304&grupp=6247&artikel=7146887',          'only_matching': True,      }] - -    def _real_extract(self, url): -        return self._extract_audio('publication', url) +    _AUDIO_TYPE = 'publication'  class SverigesRadioEpisodeIE(SverigesRadioBaseIE): +    IE_NAME = 'sverigesradio:episode'      _VALID_URL = 
r'https?://(?:www\.)?sverigesradio\.se/(?:sida/)?avsnitt/(?P<id>[0-9]+)'      _TEST = {          'url': 'https://sverigesradio.se/avsnitt/1140922?programid=1300', @@ -97,9 +109,7 @@ class SverigesRadioEpisodeIE(SverigesRadioBaseIE):              'series': 'Konflikt',              'title': 'Metoo och valen',              'description': 'md5:fcb5c1f667f00badcc702b196f10a27e', -            'thumbnail': 're:^https://static-cdn.sr.se/sida/images/' +            'thumbnail': r're:^https?://.*\.jpg',          }      } - -    def _real_extract(self, url): -        return self._extract_audio('episode', url) +    _AUDIO_TYPE = 'episode' | 
