diff options
| author | Sergey M․ <dstftw@gmail.com> | 2020-02-15 02:16:26 +0700 | 
|---|---|---|
| committer | Sergey M․ <dstftw@gmail.com> | 2020-02-15 02:16:26 +0700 | 
| commit | 06f1de2daff8351f572974dafccebefd378b9f99 (patch) | |
| tree | 93fdbc63789cb34cfc3a4b9df4fc5fa498aef375 | |
| parent | b68a6e32fb01717b24fd2201c3e1a5611fd1c963 (diff) | |
[nova] Improve extraction (refs #23690)
| -rw-r--r-- | youtube_dl/extractor/nova.py | 37 | 
1 files changed, 22 insertions, 15 deletions
```diff
diff --git a/youtube_dl/extractor/nova.py b/youtube_dl/extractor/nova.py
index 62d0552e9..2850af5db 100644
--- a/youtube_dl/extractor/nova.py
+++ b/youtube_dl/extractor/nova.py
@@ -97,7 +97,7 @@ class NovaIE(InfoExtractor):
     _VALID_URL = r'https?://(?:[^.]+\.)?(?P<site>tv(?:noviny)?|tn|novaplus|vymena|fanda|krasna|doma|prask)\.nova\.cz/(?:[^/]+/)+(?P<id>[^/]+?)(?:\.html|/|$)'
     _TESTS = [{
         'url': 'http://tn.nova.cz/clanek/tajemstvi-ukryte-v-podzemi-specialni-nemocnice-v-prazske-krci.html#player_13260',
-        'md5': '1dd7b9d5ea27bc361f110cd855a19bd3',
+        'md5': '249baab7d0104e186e78b0899c7d5f28',
         'info_dict': {
             'id': '1757139',
             'display_id': 'tajemstvi-ukryte-v-podzemi-specialni-nemocnice-v-prazske-krci',
@@ -119,7 +119,8 @@ class NovaIE(InfoExtractor):
         'params': {
             # rtmp download
             'skip_download': True,
-        }
+        },
+        'skip': 'gone',
     }, {
         # media.cms.nova.cz embed
         'url': 'https://novaplus.nova.cz/porad/ulice/epizoda/18760-2180-dil',
@@ -134,6 +135,7 @@ class NovaIE(InfoExtractor):
             'skip_download': True,
         },
         'add_ie': [NovaEmbedIE.ie_key()],
+        'skip': 'CHYBA 404: STRÁNKA NENALEZENA',
     }, {
         'url': 'http://sport.tn.nova.cz/clanek/sport/hokej/nhl/zivot-jde-dal-hodnotil-po-vyrazeni-z-playoff-jiri-sekac.html',
         'only_matching': True,
@@ -158,14 +160,29 @@ class NovaIE(InfoExtractor):
 
         webpage = self._download_webpage(url, display_id)
 
+        description = clean_html(self._og_search_description(webpage, default=None))
+        if site == 'novaplus':
+            upload_date = unified_strdate(self._search_regex(
+                r'(\d{1,2}-\d{1,2}-\d{4})$', display_id, 'upload date', default=None))
+        elif site == 'fanda':
+            upload_date = unified_strdate(self._search_regex(
+                r'<span class="date_time">(\d{1,2}\.\d{1,2}\.\d{4})', webpage, 'upload date', default=None))
+        else:
+            upload_date = None
+
         # novaplus
         embed_id = self._search_regex(
             r'<iframe[^>]+\bsrc=["\'](?:https?:)?//media\.cms\.nova\.cz/embed/([^/?#&]+)',
             webpage, 'embed url', default=None)
         if embed_id:
-            return self.url_result(
-                'https://media.cms.nova.cz/embed/%s' % embed_id,
-                ie=NovaEmbedIE.ie_key(), video_id=embed_id)
+            return {
+                '_type': 'url_transparent',
+                'url': 'https://media.cms.nova.cz/embed/%s' % embed_id,
+                'ie_key': NovaEmbedIE.ie_key(),
+                'id': embed_id,
+                'description': description,
+                'upload_date': upload_date
+            }
 
         video_id = self._search_regex(
             [r"(?:media|video_id)\s*:\s*'(\d+)'",
@@ -239,18 +256,8 @@ class NovaIE(InfoExtractor):
         self._sort_formats(formats)
 
         title = mediafile.get('meta', {}).get('title') or self._og_search_title(webpage)
-        description = clean_html(self._og_search_description(webpage, default=None))
         thumbnail = config.get('poster')
 
-        if site == 'novaplus':
-            upload_date = unified_strdate(self._search_regex(
-                r'(\d{1,2}-\d{1,2}-\d{4})$', display_id, 'upload date', default=None))
-        elif site == 'fanda':
-            upload_date = unified_strdate(self._search_regex(
-                r'<span class="date_time">(\d{1,2}\.\d{1,2}\.\d{4})', webpage, 'upload date', default=None))
-        else:
-            upload_date = None
-
         return {
             'id': video_id,
             'display_id': display_id,
```
