diff options
| -rw-r--r-- | youtube_dl/extractor/ceskatelevize.py | 170 | ||||
| -rw-r--r-- | youtube_dl/extractor/extractors.py | 5 | 
2 files changed, 92 insertions, 83 deletions
diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py index 7cb4efb74..fe677d8e8 100644 --- a/youtube_dl/extractor/ceskatelevize.py +++ b/youtube_dl/extractor/ceskatelevize.py @@ -12,70 +12,136 @@ from ..utils import (      ExtractorError,      float_or_none,      sanitized_Request, -    unescapeHTML, -    update_url_query, +    str_or_none, +    traverse_obj,      urlencode_postdata,      USER_AGENTS,  )  class CeskaTelevizeIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/ivysilani/(?:[^/?#&]+/)*(?P<id>[^/#?]+)' +    _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/(?:ivysilani|porady|zive)/(?:[^/?#&]+/)*(?P<id>[^/#?]+)'      _TESTS = [{ -        'url': 'http://www.ceskatelevize.cz/ivysilani/ivysilani/10441294653-hyde-park-civilizace/214411058091220', +        'url': 'http://www.ceskatelevize.cz/ivysilani/10441294653-hyde-park-civilizace/215411058090502/bonus/20641-bonus-01-en',          'info_dict': { -            'id': '61924494877246241', +            'id': '61924494877028507',              'ext': 'mp4', -            'title': 'Hyde Park Civilizace: Život v Grónsku', -            'description': 'md5:3fec8f6bb497be5cdb0c9e8781076626', +            'title': 'Bonus 01 - En - Hyde Park Civilizace', +            'description': 'English Subtittles',              'thumbnail': r're:^https?://.*\.jpg', -            'duration': 3350, +            'duration': 81.3,          },          'params': {              # m3u8 download              'skip_download': True,          },      }, { -        'url': 'http://www.ceskatelevize.cz/ivysilani/10441294653-hyde-park-civilizace/215411058090502/bonus/20641-bonus-01-en', +        # live stream +        'url': 'http://www.ceskatelevize.cz/zive/ct1/',          'info_dict': { -            'id': '61924494877028507', +            'id': '102',              'ext': 'mp4', -            'title': 'Hyde Park Civilizace: Bonus 01 - En', -            'description': 'English Subtittles', -            'thumbnail': r're:^https?://.*\.jpg', -            'duration': 81.3, +            'title': r'ČT1 - živé vysílání online', +            'description': 'Sledujte živé vysílání kanálu ČT1 online. Vybírat si můžete i z dalších kanálů České televize na kterémkoli z vašich zařízení.', +            'is_live': True,          },          'params': {              # m3u8 download              'skip_download': True,          },      }, { -        # live stream +        # another          'url': 'http://www.ceskatelevize.cz/ivysilani/zive/ct4/', +        'only_matching': True,          'info_dict': {              'id': 402,              'ext': 'mp4',              'title': r're:^ČT Sport \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',              'is_live': True,          }, +        # 'skip': 'Georestricted to Czech Republic', +    }, { +        'url': 'http://www.ceskatelevize.cz/ivysilani/embed/iFramePlayer.php?hash=d6a3e1370d2e4fa76296b90bad4dfc19673b641e&IDEC=217 562 22150/0004&channelID=1&width=100%25', +        'only_matching': True, +    }, { +        # video with 18+ caution trailer +        'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/', +        'info_dict': { +            'id': '215562210900007-bogotart', +            'title': 'Bogotart - Queer', +            'description': 'Hlavní město Kolumbie v doprovodu queer umělců. Vroucí svět plný vášně, sebevědomí, ale i násilí a bolesti', +        }, +        'playlist': [{ +            'info_dict': { +                'id': '61924494877311053', +                'ext': 'mp4', +                'title': 'Bogotart - Queer (Varování 18+)', +                'duration': 11.9, +            }, +        }, { +            'info_dict': { +                'id': '61924494877068022', +                'ext': 'mp4', +                'title': 'Bogotart - Queer (Queer)', +                'thumbnail': r're:^https?://.*\.jpg', +                'duration': 1558.3, +            }, +        }],          'params': {              # m3u8 download              'skip_download': True,          }, -        'skip': 'Georestricted to Czech Republic',      }, { -        'url': 'http://www.ceskatelevize.cz/ivysilani/embed/iFramePlayer.php?hash=d6a3e1370d2e4fa76296b90bad4dfc19673b641e&IDEC=217 562 22150/0004&channelID=1&width=100%25', +        # iframe embed +        'url': 'http://www.ceskatelevize.cz/porady/10614999031-neviditelni/21251212048/',          'only_matching': True,      }] +    def _search_nextjs_data(self, webpage, video_id, **kw): +        return self._parse_json( +            self._search_regex( +                r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>', +                webpage, 'next.js data', **kw), +            video_id, **kw) +      def _real_extract(self, url):          playlist_id = self._match_id(url) - -        webpage = self._download_webpage(url, playlist_id) +        webpage, urlh = self._download_webpage_handle(url, playlist_id) +        parsed_url = compat_urllib_parse_urlparse(urlh.geturl()) +        site_name = self._og_search_property('site_name', webpage, fatal=False, default='Česká televize') +        playlist_title = self._og_search_title(webpage, default=None) +        if site_name and playlist_title: +            playlist_title = re.split(r'\s*[—|]\s*%s' % (site_name, ), playlist_title, 1)[0] +        playlist_description = self._og_search_description(webpage, default=None) +        if playlist_description: +            playlist_description = playlist_description.replace('\xa0', ' ') + +        type_ = 'IDEC' +        if re.search(r'(^/porady|/zive)/', parsed_url.path): +            next_data = self._search_nextjs_data(webpage, playlist_id) +            if '/zive/' in parsed_url.path: +                idec = traverse_obj(next_data, ('props', 'pageProps', 'data', 'liveBroadcast', 'current', 'idec'), get_all=False) +            else: +                idec = traverse_obj(next_data, ('props', 'pageProps', 'data', ('show', 'mediaMeta'), 'idec'), get_all=False) +                if not idec: +                    idec = traverse_obj(next_data, ('props', 'pageProps', 'data', 'videobonusDetail', 'bonusId'), get_all=False) +                    if idec: +                        type_ = 'bonus' +            if not idec: +                raise ExtractorError('Failed to find IDEC id') +            iframe_hash = self._download_webpage( +                'https://www.ceskatelevize.cz/v-api/iframe-hash/', +                playlist_id, note='Getting IFRAME hash') +            query = {'hash': iframe_hash, 'origin': 'iVysilani', 'autoStart': 'true', type_: idec, } +            webpage = self._download_webpage( +                'https://www.ceskatelevize.cz/ivysilani/embed/iFramePlayer.php', +                playlist_id, note='Downloading player', query=query)          NOT_AVAILABLE_STRING = 'This content is not available at your territory due to limited copyright.'          if '%s</p>' % NOT_AVAILABLE_STRING in webpage: -            raise ExtractorError(NOT_AVAILABLE_STRING, expected=True) +            self.raise_geo_restricted(NOT_AVAILABLE_STRING) +        if any(not_found in webpage for not_found in ('Neplatný parametr pro videopřehrávač', 'IDEC nebyl nalezen', )): +            raise ExtractorError('no video with IDEC available', video_id=idec, expected=True)          type_ = None          episode_id = None @@ -100,7 +166,7 @@ class CeskaTelevizeIE(InfoExtractor):          data = {              'playlist[0][type]': type_,              'playlist[0][id]': episode_id, -            'requestUrl': compat_urllib_parse_urlparse(url).path, +            'requestUrl': parsed_url.path,              'requestSource': 'iVysilani',          } @@ -108,7 +174,7 @@ class CeskaTelevizeIE(InfoExtractor):          for user_agent in (None, USER_AGENTS['Safari']):              req = sanitized_Request( -                'https://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist', +                'https://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist/',                  data=urlencode_postdata(data))              req.add_header('Content-type', 'application/x-www-form-urlencoded') @@ -130,9 +196,6 @@ class CeskaTelevizeIE(InfoExtractor):              req = sanitized_Request(compat_urllib_parse_unquote(playlist_url))              req.add_header('Referer', url) -            playlist_title = self._og_search_title(webpage, default=None) -            playlist_description = self._og_search_description(webpage, default=None) -              playlist = self._download_json(req, playlist_id, fatal=False)              if not playlist:                  continue @@ -167,7 +230,7 @@ class CeskaTelevizeIE(InfoExtractor):                      entries[num]['formats'].extend(formats)                      continue -                item_id = item.get('id') or item['assetId'] +                item_id = str_or_none(item.get('id') or item['assetId'])                  title = item['title']                  duration = float_or_none(item.get('duration')) @@ -181,8 +244,6 @@ class CeskaTelevizeIE(InfoExtractor):                  if playlist_len == 1:                      final_title = playlist_title or title -                    if is_live: -                        final_title = self._live_title(final_title)                  else:                      final_title = '%s (%s)' % (playlist_title, title) @@ -200,6 +261,8 @@ class CeskaTelevizeIE(InfoExtractor):          for e in entries:              self._sort_formats(e['formats']) +        if len(entries) == 1: +            return entries[0]          return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)      def _get_subtitles(self, episode_id, subs): @@ -236,54 +299,3 @@ class CeskaTelevizeIE(InfoExtractor):                      yield line          return '\r\n'.join(_fix_subtitle(subtitles)) - - -class CeskaTelevizePoradyIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/porady/(?:[^/?#&]+/)*(?P<id>[^/#?]+)' -    _TESTS = [{ -        # video with 18+ caution trailer -        'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/', -        'info_dict': { -            'id': '215562210900007-bogotart', -            'title': 'Queer: Bogotart', -            'description': 'Alternativní průvodce současným queer světem', -        }, -        'playlist': [{ -            'info_dict': { -                'id': '61924494876844842', -                'ext': 'mp4', -                'title': 'Queer: Bogotart (Varování 18+)', -                'duration': 10.2, -            }, -        }, { -            'info_dict': { -                'id': '61924494877068022', -                'ext': 'mp4', -                'title': 'Queer: Bogotart (Queer)', -                'thumbnail': r're:^https?://.*\.jpg', -                'duration': 1558.3, -            }, -        }], -        'params': { -            # m3u8 download -            'skip_download': True, -        }, -    }, { -        # iframe embed -        'url': 'http://www.ceskatelevize.cz/porady/10614999031-neviditelni/21251212048/', -        'only_matching': True, -    }] - -    def _real_extract(self, url): -        video_id = self._match_id(url) - -        webpage = self._download_webpage(url, video_id) - -        data_url = update_url_query(unescapeHTML(self._search_regex( -            (r'<span[^>]*\bdata-url=(["\'])(?P<url>(?:(?!\1).)+)\1', -             r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?ceskatelevize\.cz/ivysilani/embed/iFramePlayer\.php.*?)\1'), -            webpage, 'iframe player url', group='url')), query={ -                'autoStart': 'true', -        }) - -        return self.url_result(data_url, ie=CeskaTelevizeIE.ie_key()) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 751fc38b6..e36f86be4 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -208,10 +208,7 @@ from .ccc import (  from .ccma import CCMAIE  from .cctv import CCTVIE  from .cda import CDAIE -from .ceskatelevize import ( -    CeskaTelevizeIE, -    CeskaTelevizePoradyIE, -) +from .ceskatelevize import CeskaTelevizeIE  from .channel9 import Channel9IE  from .charlierose import CharlieRoseIE  from .chaturbate import ChaturbateIE  | 
