diff options
Diffstat (limited to 'youtube_dl/extractor/ard.py')
| -rw-r--r-- | youtube_dl/extractor/ard.py | 257 | 
1 files changed, 131 insertions, 126 deletions
| diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 8adae4644..09d3ab4f9 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -1,6 +1,7 @@  # coding: utf-8  from __future__ import unicode_literals +import json  import re  from .common import InfoExtractor @@ -22,66 +23,28 @@ from ..utils import (  from ..compat import compat_etree_fromstring -class ARDMediathekIE(InfoExtractor): -    IE_NAME = 'ARD:mediathek' -    _VALID_URL = r'^https?://(?:(?:(?:www|classic)\.)?ardmediathek\.de|mediathek\.(?:daserste|rbb-online)\.de|one\.ard\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?' - -    _TESTS = [{ -        # available till 26.07.2022 -        'url': 'http://www.ardmediathek.de/tv/S%C3%9CDLICHT/Was-ist-die-Kunst-der-Zukunft-liebe-Ann/BR-Fernsehen/Video?bcastId=34633636&documentId=44726822', -        'info_dict': { -            'id': '44726822', -            'ext': 'mp4', -            'title': 'Was ist die Kunst der Zukunft, liebe Anna McCarthy?', -            'description': 'md5:4ada28b3e3b5df01647310e41f3a62f5', -            'duration': 1740, -        }, -        'params': { -            # m3u8 download -            'skip_download': True, -        } -    }, { -        'url': 'https://one.ard.de/tv/Mord-mit-Aussicht/Mord-mit-Aussicht-6-39-T%C3%B6dliche-Nach/ONE/Video?bcastId=46384294&documentId=55586872', -        'only_matching': True, -    }, { -        # audio -        'url': 'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086', -        'only_matching': True, -    }, { -        'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht', -        'only_matching': True, -    }, { -        # audio -        'url': 'http://mediathek.rbb-online.de/radio/Hörspiel/Vor-dem-Fest/kulturradio/Audio?documentId=30796318&topRessort=radio&bcastId=9839158', -        'only_matching': True, -    }, { -        'url': 'https://classic.ardmediathek.de/tv/Panda-Gorilla-Co/Panda-Gorilla-Co-Folge-274/Das-Erste/Video?bcastId=16355486&documentId=58234698', -        'only_matching': True, -    }] - -    @classmethod -    def suitable(cls, url): -        return False if ARDBetaMediathekIE.suitable(url) else super(ARDMediathekIE, cls).suitable(url) +class ARDMediathekBaseIE(InfoExtractor): +    _GEO_COUNTRIES = ['DE']      def _extract_media_info(self, media_info_url, webpage, video_id):          media_info = self._download_json(              media_info_url, video_id, 'Downloading media JSON') +        return self._parse_media_info(media_info, video_id, '"fsk"' in webpage) +    def _parse_media_info(self, media_info, video_id, fsk):          formats = self._extract_formats(media_info, video_id)          if not formats: -            if '"fsk"' in webpage: +            if fsk:                  raise ExtractorError(                      'This video is only available after 20:00', expected=True)              elif media_info.get('_geoblocked'): -                raise ExtractorError('This video is not available due to geo restriction', expected=True) +                self.raise_geo_restricted( +                    'This video is not available due to geoblocking', +                    countries=self._GEO_COUNTRIES)          self._sort_formats(formats) -        duration = int_or_none(media_info.get('_duration')) -        thumbnail = media_info.get('_previewImage') -        is_live = media_info.get('_isLive') is True -          subtitles = {}          subtitle_url = media_info.get('_subtitleUrl')          if subtitle_url: @@ -92,9 +55,9 @@ class ARDMediathekIE(InfoExtractor):          return {              'id': video_id, -            'duration': duration, -            'thumbnail': thumbnail, -            'is_live': is_live, +            'duration': int_or_none(media_info.get('_duration')), +            'thumbnail': media_info.get('_previewImage'), +            'is_live': media_info.get('_isLive') is True,              'formats': formats,              'subtitles': subtitles,          } @@ -123,11 +86,11 @@ class ARDMediathekIE(InfoExtractor):                              update_url_query(stream_url, {                                  'hdcore': '3.1.1',                                  'plugin': 'aasp-3.1.1.69.124' -                            }), -                            video_id, f4m_id='hds', fatal=False)) +                            }), video_id, f4m_id='hds', fatal=False))                      elif ext == 'm3u8':                          formats.extend(self._extract_m3u8_formats( -                            stream_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) +                            stream_url, video_id, 'mp4', 'm3u8_native', +                            m3u8_id='hls', fatal=False))                      else:                          if server and server.startswith('rtmp'):                              f = { @@ -140,7 +103,9 @@ class ARDMediathekIE(InfoExtractor):                                  'url': stream_url,                                  'format_id': 'a%s-%s-%s' % (num, ext, quality)                              } -                        m = re.search(r'_(?P<width>\d+)x(?P<height>\d+)\.mp4$', stream_url) +                        m = re.search( +                            r'_(?P<width>\d+)x(?P<height>\d+)\.mp4$', +                            stream_url)                          if m:                              f.update({                                  'width': int(m.group('width')), @@ -151,6 +116,48 @@ class ARDMediathekIE(InfoExtractor):                          formats.append(f)          return formats + +class ARDMediathekIE(ARDMediathekBaseIE): +    IE_NAME = 'ARD:mediathek' +    _VALID_URL = r'^https?://(?:(?:(?:www|classic)\.)?ardmediathek\.de|mediathek\.(?:daserste|rbb-online)\.de|one\.ard\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?' + +    _TESTS = [{ +        # available till 26.07.2022 +        'url': 'http://www.ardmediathek.de/tv/S%C3%9CDLICHT/Was-ist-die-Kunst-der-Zukunft-liebe-Ann/BR-Fernsehen/Video?bcastId=34633636&documentId=44726822', +        'info_dict': { +            'id': '44726822', +            'ext': 'mp4', +            'title': 'Was ist die Kunst der Zukunft, liebe Anna McCarthy?', +            'description': 'md5:4ada28b3e3b5df01647310e41f3a62f5', +            'duration': 1740, +        }, +        'params': { +            # m3u8 download +            'skip_download': True, +        } +    }, { +        'url': 'https://one.ard.de/tv/Mord-mit-Aussicht/Mord-mit-Aussicht-6-39-T%C3%B6dliche-Nach/ONE/Video?bcastId=46384294&documentId=55586872', +        'only_matching': True, +    }, { +        # audio +        'url': 'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086', +        'only_matching': True, +    }, { +        'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht', +        'only_matching': True, +    }, { +        # audio +        'url': 'http://mediathek.rbb-online.de/radio/Hörspiel/Vor-dem-Fest/kulturradio/Audio?documentId=30796318&topRessort=radio&bcastId=9839158', +        'only_matching': True, +    }, { +        'url': 'https://classic.ardmediathek.de/tv/Panda-Gorilla-Co/Panda-Gorilla-Co-Folge-274/Das-Erste/Video?bcastId=16355486&documentId=58234698', +        'only_matching': True, +    }] + +    @classmethod +    def suitable(cls, url): +        return False if ARDBetaMediathekIE.suitable(url) else super(ARDMediathekIE, cls).suitable(url) +      def _real_extract(self, url):          # determine video id from url          m = re.match(self._VALID_URL, url) @@ -302,19 +309,20 @@ class ARDIE(InfoExtractor):          } -class ARDBetaMediathekIE(InfoExtractor): -    _VALID_URL = r'https://(?:beta|www)\.ardmediathek\.de/[^/]+/(?:player|live)/(?P<video_id>[a-zA-Z0-9]+)(?:/(?P<display_id>[^/?#]+))?' +class ARDBetaMediathekIE(ARDMediathekBaseIE): +    _VALID_URL = r'https://(?:beta|www)\.ardmediathek\.de/(?P<client>[^/]+)/(?:player|live)/(?P<video_id>[a-zA-Z0-9]+)(?:/(?P<display_id>[^/?#]+))?'      _TESTS = [{          'url': 'https://beta.ardmediathek.de/ard/player/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE/die-robuste-roswita', -        'md5': '2d02d996156ea3c397cfc5036b5d7f8f', +        'md5': 'dfdc87d2e7e09d073d5a80770a9ce88f',          'info_dict': {              'display_id': 'die-robuste-roswita', -            'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE', -            'title': 'Tatort: Die robuste Roswita', +            'id': '70153354', +            'title': 'Die robuste Roswita',              'description': r're:^Der Mord.*trüber ist als die Ilm.',              'duration': 5316, -            'thumbnail': 'https://img.ardmediathek.de/standard/00/55/43/59/34/-1774185891/16x9/960?mandant=ard', -            'upload_date': '20180826', +            'thumbnail': 'https://img.ardmediathek.de/standard/00/70/15/33/90/-1852531467/16x9/960?mandant=ard', +            'timestamp': 1577047500, +            'upload_date': '20191222',              'ext': 'mp4',          },      }, { @@ -330,71 +338,68 @@ class ARDBetaMediathekIE(InfoExtractor):          video_id = mobj.group('video_id')          display_id = mobj.group('display_id') or video_id -        webpage = self._download_webpage(url, display_id) -        data_json = self._search_regex(r'window\.__APOLLO_STATE__\s*=\s*(\{.*);\n', webpage, 'json') -        data = self._parse_json(data_json, display_id) - -        res = { -            'id': video_id, -            'display_id': display_id, +        player_page = self._download_json( +            'https://api.ardmediathek.de/public-gateway', +            display_id, data=json.dumps({ +                'query': '''{ +  playerPage(client:"%s", clipId: "%s") { +    blockedByFsk +    broadcastedOn +    maturityContentRating +    mediaCollection { +      _duration +      _geoblocked +      _isLive +      _mediaArray { +        _mediaStreamArray { +          _quality +          _server +          _stream          } -        formats = [] -        subtitles = {} -        geoblocked = False -        for widget in data.values(): -            if widget.get('_geoblocked') is True: -                geoblocked = True -            if '_duration' in widget: -                res['duration'] = int_or_none(widget['_duration']) -            if 'clipTitle' in widget: -                res['title'] = widget['clipTitle'] -            if '_previewImage' in widget: -                res['thumbnail'] = widget['_previewImage'] -            if 'broadcastedOn' in widget: -                res['timestamp'] = unified_timestamp(widget['broadcastedOn']) -            if 'synopsis' in widget: -                res['description'] = widget['synopsis'] -            subtitle_url = url_or_none(widget.get('_subtitleUrl')) -            if subtitle_url: -                subtitles.setdefault('de', []).append({ -                    'ext': 'ttml', -                    'url': subtitle_url, -                }) -            if '_quality' in widget: -                format_url = url_or_none(try_get( -                    widget, lambda x: x['_stream']['json'][0])) -                if not format_url: -                    continue -                ext = determine_ext(format_url) -                if ext == 'f4m': -                    formats.extend(self._extract_f4m_formats( -                        format_url + '?hdcore=3.11.0', -                        video_id, f4m_id='hds', fatal=False)) -                elif ext == 'm3u8': -                    formats.extend(self._extract_m3u8_formats( -                        format_url, video_id, 'mp4', m3u8_id='hls', -                        fatal=False)) -                else: -                    # HTTP formats are not available when geoblocked is True, -                    # other formats are fine though -                    if geoblocked: -                        continue -                    quality = str_or_none(widget.get('_quality')) -                    formats.append({ -                        'format_id': ('http-' + quality) if quality else 'http', -                        'url': format_url, -                        'preference': 10,  # Plain HTTP, that's nice -                    }) - -        if not formats and geoblocked: -            self.raise_geo_restricted( -                msg='This video is not available due to geoblocking', -                countries=['DE']) - -        self._sort_formats(formats) -        res.update({ -            'subtitles': subtitles, -            'formats': formats, +      } +      _previewImage +      _subtitleUrl +      _type +    } +    show { +      title +    } +    synopsis +    title +    tracking { +      atiCustomVars { +        contentId +      } +    } +  } +}''' % (mobj.group('client'), video_id), +            }).encode(), headers={ +                'Content-Type': 'application/json' +            })['data']['playerPage'] +        title = player_page['title'] +        content_id = str_or_none(try_get( +            player_page, lambda x: x['tracking']['atiCustomVars']['contentId'])) +        media_collection = player_page.get('mediaCollection') or {} +        if not media_collection and content_id: +            media_collection = self._download_json( +                'https://www.ardmediathek.de/play/media/' + content_id, +                content_id, fatal=False) or {} +        info = self._parse_media_info( +            media_collection, content_id or video_id, +            player_page.get('blockedByFsk')) +        age_limit = None +        description = player_page.get('synopsis') +        maturity_content_rating = player_page.get('maturityContentRating') +        if maturity_content_rating: +            age_limit = int_or_none(maturity_content_rating.lstrip('FSK')) +        if not age_limit: +            age_limit = int_or_none(self._search_regex(r'\(FSK\s*(\d+)\)\s*$', description, 'age limit', default=None)) +        info.update({ +            'age_limit': age_limit, +            'display_id': display_id, +            'title': title, +            'description': description, +            'timestamp': unified_timestamp(player_page.get('broadcastedOn')), +            'series': try_get(player_page, lambda x: x['show']['title']),          }) - -        return res +        return info | 
