diff options
| author | Sergey M․ <dstftw@gmail.com> | 2018-07-10 00:21:14 +0700 | 
|---|---|---|
| committer | Sergey M․ <dstftw@gmail.com> | 2018-07-10 00:27:24 +0700 | 
| commit | 4b3ee09886d1f2a096004013e6a8a13a1f564ba8 (patch) | |
| tree | 18a59e31e4b4a0aa8eafb7b547b403a7837f5999 | |
| parent | 79fd7320e24596b39d81c2a364fb5b41c2f57b41 (diff) | |
[nrktv] Add support for new season and serie URL schema
| -rw-r--r-- | youtube_dl/extractor/extractors.py | 1 | ||||
| -rw-r--r-- | youtube_dl/extractor/nrk.py | 208 | 
2 files changed, 149 insertions, 60 deletions
```diff
diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index a20712d34..c6f8a785a 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -770,6 +770,7 @@ from .nrk import (
     NRKTVDirekteIE,
     NRKTVEpisodeIE,
     NRKTVEpisodesIE,
+    NRKTVSeasonIE,
     NRKTVSeriesIE,
 )
 from .ntvde import NTVDeIE
diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py
index 50dd07d11..a231735fb 100644
--- a/youtube_dl/extractor/nrk.py
+++ b/youtube_dl/extractor/nrk.py
@@ -4,13 +4,18 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
-from ..compat import compat_urllib_parse_unquote
+from ..compat import (
+    compat_str,
+    compat_urllib_parse_unquote,
+)
 from ..utils import (
     ExtractorError,
     int_or_none,
     JSON_LD_RE,
+    NO_DEFAULT,
     parse_age_limit,
     parse_duration,
+    try_get,
 )
 
 
@@ -394,6 +399,148 @@ class NRKTVEpisodeIE(InfoExtractor):
             'nrk:%s' % nrk_id, ie=NRKIE.ie_key(), video_id=nrk_id)
 
 
+class NRKTVSerieBaseIE(InfoExtractor):
+    def _extract_series(self, webpage, display_id, fatal=True):
+        config = self._parse_json(
+            self._search_regex(
+                r'({.+?})\s*,\s*"[^"]+"\s*\)\s*</script>', webpage, 'config',
+                default='{}' if not fatal else NO_DEFAULT),
+            display_id, fatal=False)
+        if not config:
+            return
+        return try_get(config, lambda x: x['series'], dict)
+
+    def _extract_episodes(self, season):
+        entries = []
+        if not isinstance(season, dict):
+            return entries
+        episodes = season.get('episodes')
+        if not isinstance(episodes, list):
+            return entries
+        for episode in episodes:
+            nrk_id = episode.get('prfId')
+            if not nrk_id or not isinstance(nrk_id, compat_str):
+                continue
+            entries.append(self.url_result(
+                'nrk:%s' % nrk_id, ie=NRKIE.ie_key(), video_id=nrk_id))
+        return entries
+
+
+class NRKTVSeasonIE(NRKTVSerieBaseIE):
+    _VALID_URL = r'https?://tv\.nrk\.no/serie/[^/]+/sesong/(?P<id>\d+)'
+    _TEST = {
+        'url': 'https://tv.nrk.no/serie/backstage/sesong/1',
+        'info_dict': {
+            'id': '1',
+            'title': 'Sesong 1',
+        },
+        'playlist_mincount': 30,
+    }
+
+    @classmethod
+    def suitable(cls, url):
+        return (False if NRKTVIE.suitable(url) or NRKTVEpisodeIE.suitable(url)
+                else super(NRKTVSeasonIE, cls).suitable(url))
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        series = self._extract_series(webpage, display_id)
+
+        season = next(
+            s for s in series['seasons']
+            if int(display_id) == s.get('seasonNumber'))
+
+        title = try_get(season, lambda x: x['titles']['title'], compat_str)
+        return self.playlist_result(
+            self._extract_episodes(season), display_id, title)
+
+
+class NRKTVSeriesIE(NRKTVSerieBaseIE):
+    _VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/serie/(?P<id>[^/]+)'
+    _ITEM_RE = r'(?:data-season=["\']|id=["\']season-)(?P<id>\d+)'
+    _TESTS = [{
+        # new layout
+        'url': 'https://tv.nrk.no/serie/backstage',
+        'info_dict': {
+            'id': 'backstage',
+            'title': 'Backstage',
+            'description': 'md5:c3ec3a35736fca0f9e1207b5511143d3',
+        },
+        'playlist_mincount': 60,
+    }, {
+        # old layout
+        'url': 'https://tv.nrk.no/serie/groenn-glede',
+        'info_dict': {
+            'id': 'groenn-glede',
+            'title': 'Grønn glede',
+            'description': 'md5:7576e92ae7f65da6993cf90ee29e4608',
+        },
+        'playlist_mincount': 9,
+    }, {
+        'url': 'http://tv.nrksuper.no/serie/labyrint',
+        'info_dict': {
+            'id': 'labyrint',
+            'title': 'Labyrint',
+            'description': 'md5:58afd450974c89e27d5a19212eee7115',
+        },
+        'playlist_mincount': 3,
+    }, {
+        'url': 'https://tv.nrk.no/serie/broedrene-dal-og-spektralsteinene',
+        'only_matching': True,
+    }, {
+        'url': 'https://tv.nrk.no/serie/saving-the-human-race',
+        'only_matching': True,
+    }, {
+        'url': 'https://tv.nrk.no/serie/postmann-pat',
+        'only_matching': True,
+    }]
+
+    @classmethod
+    def suitable(cls, url):
+        return (
+            False if any(ie.suitable(url)
+                         for ie in (NRKTVIE, NRKTVEpisodeIE, NRKTVSeasonIE))
+            else super(NRKTVSeriesIE, cls).suitable(url))
+
+    def _real_extract(self, url):
+        series_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, series_id)
+
+        # New layout (e.g. https://tv.nrk.no/serie/backstage)
+        series = self._extract_series(webpage, series_id, fatal=False)
+        if series:
+            title = try_get(series, lambda x: x['titles']['title'], compat_str)
+            description = try_get(
+                series, lambda x: x['titles']['subtitle'], compat_str)
+            entries = []
+            for season in series['seasons']:
+                entries.extend(self._extract_episodes(season))
+            return self.playlist_result(entries, series_id, title, description)
+
+        # Old layout (e.g. https://tv.nrk.no/serie/groenn-glede)
+        entries = [
+            self.url_result(
+                'https://tv.nrk.no/program/Episodes/{series}/{season}'.format(
+                    series=series_id, season=season_id))
+            for season_id in re.findall(self._ITEM_RE, webpage)
+        ]
+
+        title = self._html_search_meta(
+            'seriestitle', webpage,
+            'title', default=None) or self._og_search_title(
+            webpage, fatal=False)
+
+        description = self._html_search_meta(
+            'series_description', webpage,
+            'description', default=None) or self._og_search_description(webpage)
+
+        return self.playlist_result(entries, series_id, title, description)
+
+
 class NRKTVDirekteIE(NRKTVIE):
     IE_DESC = 'NRK TV Direkte and NRK Radio Direkte'
     _VALID_URL = r'https?://(?:tv|radio)\.nrk\.no/direkte/(?P<id>[^/?#&]+)'
@@ -473,65 +620,6 @@ class NRKTVEpisodesIE(NRKPlaylistBaseIE):
             r'<h1>([^<]+)</h1>', webpage, 'title', fatal=False)
 
 
-class NRKTVSeriesIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/serie/(?P<id>[^/]+)'
-    _ITEM_RE = r'(?:data-season=["\']|id=["\']season-)(?P<id>\d+)'
-    _TESTS = [{
-        'url': 'https://tv.nrk.no/serie/groenn-glede',
-        'info_dict': {
-            'id': 'groenn-glede',
-            'title': 'Grønn glede',
-            'description': 'md5:7576e92ae7f65da6993cf90ee29e4608',
-        },
-        'playlist_mincount': 9,
-    }, {
-        'url': 'http://tv.nrksuper.no/serie/labyrint',
-        'info_dict': {
-            'id': 'labyrint',
-            'title': 'Labyrint',
-            'description': 'md5:58afd450974c89e27d5a19212eee7115',
-        },
-        'playlist_mincount': 3,
-    }, {
-        'url': 'https://tv.nrk.no/serie/broedrene-dal-og-spektralsteinene',
-        'only_matching': True,
-    }, {
-        'url': 'https://tv.nrk.no/serie/saving-the-human-race',
-        'only_matching': True,
-    }, {
-        'url': 'https://tv.nrk.no/serie/postmann-pat',
-        'only_matching': True,
-    }]
-
-    @classmethod
-    def suitable(cls, url):
-        return (False if NRKTVIE.suitable(url) or NRKTVEpisodeIE.suitable(url)
-                else super(NRKTVSeriesIE, cls).suitable(url))
-
-    def _real_extract(self, url):
-        series_id = self._match_id(url)
-
-        webpage = self._download_webpage(url, series_id)
-
-        entries = [
-            self.url_result(
-                'https://tv.nrk.no/program/Episodes/{series}/{season}'.format(
-                    series=series_id, season=season_id))
-            for season_id in re.findall(self._ITEM_RE, webpage)
-        ]
-
-        title = self._html_search_meta(
-            'seriestitle', webpage,
-            'title', default=None) or self._og_search_title(
-            webpage, fatal=False)
-
-        description = self._html_search_meta(
-            'series_description', webpage,
-            'description', default=None) or self._og_search_description(webpage)
-
-        return self.playlist_result(entries, series_id, title, description)
-
-
 class NRKSkoleIE(InfoExtractor):
     IE_DESC = 'NRK Skole'
     _VALID_URL = r'https?://(?:www\.)?nrk\.no/skole/?\?.*\bmediaId=(?P<id>\d+)'
```
