author     Sergey M․ <dstftw@gmail.com>   2018-12-07 00:49:24 +0700
committer  Sergey M․ <dstftw@gmail.com>   2018-12-07 00:49:24 +0700
commit     15699ec8b0a9cb8d519b721a5cb8199afe62fc3c (patch)
tree       535da247b8fba92598d2f13a030fafb311f8e6df
parent     33cc1ea586480ccc60fd25f2d42cfb44eec605c6 (diff)
[nrktv:season,series] Fix extraction and update tests (closes #17159, closes #17258)
-rw-r--r--  youtube_dl/extractor/nrk.py  68
1 file changed, 43 insertions(+), 25 deletions(-)
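
The functional part of the fix sits in the NRKTVSerieBaseIE hunk of the diff below: the series config is now located via an INITIAL_DATA assignment (with the old '</script>' pattern kept as a fallback), and the series dict is looked up under either initialState.series or series. The following is a rough standalone sketch of that fallback logic, using plain re/json in place of the extractor's _search_regex/_parse_json/try_get helpers; the page snippet and function name are invented for illustration only:

import json
import re


def extract_series_config(webpage):
    # Try the new layout first (an "...INITIAL_DATA... = {...};" assignment),
    # then fall back to the old '({...}, "...")</script>' pattern.
    patterns = (
        r'INITIAL_DATA_*\s*=\s*({.+?})\s*;',
        r'({.+?})\s*,\s*"[^"]+"\s*\)\s*</script>',
    )
    config = None
    for pattern in patterns:
        mobj = re.search(pattern, webpage)
        if mobj:
            config = json.loads(mobj.group(1))
            break
    if not config:
        return None
    # The series dict may live under initialState.series (new layout)
    # or directly under series (old layout).
    for path in (('initialState', 'series'), ('series',)):
        value = config
        for key in path:
            value = value.get(key) if isinstance(value, dict) else None
        if isinstance(value, dict):
            return value
    return None


# Invented new-layout snippet for illustration:
page = 'window.__NRK_TV_INITIAL_DATA__ = {"initialState": {"series": {"id": "backstage"}}};'
print(extract_series_config(page))  # {'id': 'backstage'}
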
diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py
index c5001ef48..48bc6fd7a 100644
--- a/youtube_dl/extractor/nrk.py
+++ b/youtube_dl/extractor/nrk.py
@@ -211,13 +211,13 @@ class NRKIE(NRKBaseIE):
_TESTS = [{
# video
'url': 'http://www.nrk.no/video/PS*150533',
- 'md5': '2f7f6eeb2aacdd99885f355428715cfa',
+ 'md5': '706f34cdf1322577589e369e522b50ef',
'info_dict': {
'id': '150533',
'ext': 'mp4',
'title': 'Dompap og andre fugler i Piip-Show',
'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f',
- 'duration': 263,
+ 'duration': 262,
}
}, {
# audio
@@ -256,14 +256,14 @@ class NRKTVIE(NRKBaseIE):
_API_HOSTS = ('psapi-ne.nrk.no', 'psapi-we.nrk.no')
_TESTS = [{
'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014',
- 'md5': '4e9ca6629f09e588ed240fb11619922a',
+ 'md5': '9a167e54d04671eb6317a37b7bc8a280',
'info_dict': {
'id': 'MUHH48000314AA',
'ext': 'mp4',
'title': '20 spørsmål 23.05.2014',
'description': 'md5:bdea103bc35494c143c6a9acdd84887a',
'duration': 1741,
- 'series': '20 spørsmål - TV',
+ 'series': '20 spørsmål',
'episode': '23.05.2014',
},
}, {
@@ -301,7 +301,7 @@ class NRKTVIE(NRKBaseIE):
'id': 'MSPO40010515AH',
'ext': 'mp4',
'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015 (Part 1)',
- 'description': 'md5:c03aba1e917561eface5214020551b7a',
+ 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d',
'duration': 772,
'series': 'Tour de Ski',
'episode': '06.01.2015',
@@ -314,7 +314,7 @@ class NRKTVIE(NRKBaseIE):
'id': 'MSPO40010515BH',
'ext': 'mp4',
'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015 (Part 2)',
- 'description': 'md5:c03aba1e917561eface5214020551b7a',
+ 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d',
'duration': 6175,
'series': 'Tour de Ski',
'episode': '06.01.2015',
@@ -326,7 +326,7 @@ class NRKTVIE(NRKBaseIE):
'info_dict': {
'id': 'MSPO40010515',
'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015',
- 'description': 'md5:c03aba1e917561eface5214020551b7a',
+ 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d',
},
'expected_warnings': ['Video is geo restricted'],
}, {
@@ -406,21 +406,35 @@ class NRKTVSerieBaseIE(InfoExtractor):
def _extract_series(self, webpage, display_id, fatal=True):
config = self._parse_json(
self._search_regex(
- r'({.+?})\s*,\s*"[^"]+"\s*\)\s*</script>', webpage, 'config',
- default='{}' if not fatal else NO_DEFAULT),
+ (r'INITIAL_DATA_*\s*=\s*({.+?})\s*;',
+ r'({.+?})\s*,\s*"[^"]+"\s*\)\s*</script>'),
+ webpage, 'config', default='{}' if not fatal else NO_DEFAULT),
display_id, fatal=False)
if not config:
return
- return try_get(config, lambda x: x['series'], dict)
+ return try_get(
+ config,
+ (lambda x: x['initialState']['series'], lambda x: x['series']),
+ dict)
+
+ def _extract_seasons(self, seasons):
+ if not isinstance(seasons, list):
+ return []
+ entries = []
+ for season in seasons:
+ entries.extend(self._extract_episodes(season))
+ return entries

    def _extract_episodes(self, season):
- entries = []
if not isinstance(season, dict):
- return entries
- episodes = season.get('episodes')
- if not isinstance(episodes, list):
- return entries
- for episode in episodes:
+ return []
+ return self._extract_entries(season.get('episodes'))
+
+ def _extract_entries(self, entry_list):
+ if not isinstance(entry_list, list):
+ return []
+ entries = []
+ for episode in entry_list:
nrk_id = episode.get('prfId')
if not nrk_id or not isinstance(nrk_id, compat_str):
continue
@@ -465,7 +479,7 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE):
_VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/serie/(?P<id>[^/]+)'
_ITEM_RE = r'(?:data-season=["\']|id=["\']season-)(?P<id>\d+)'
_TESTS = [{
- # new layout
+ # new layout, seasons
'url': 'https://tv.nrk.no/serie/backstage',
'info_dict': {
'id': 'backstage',
@@ -474,20 +488,21 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE):
},
'playlist_mincount': 60,
}, {
- # old layout
+ # new layout, instalments
'url': 'https://tv.nrk.no/serie/groenn-glede',
'info_dict': {
'id': 'groenn-glede',
'title': 'Grønn glede',
'description': 'md5:7576e92ae7f65da6993cf90ee29e4608',
},
- 'playlist_mincount': 9,
+ 'playlist_mincount': 10,
}, {
- 'url': 'http://tv.nrksuper.no/serie/labyrint',
+ # old layout
+ 'url': 'https://tv.nrksuper.no/serie/labyrint',
'info_dict': {
'id': 'labyrint',
'title': 'Labyrint',
- 'description': 'md5:58afd450974c89e27d5a19212eee7115',
+ 'description': 'md5:318b597330fdac5959247c9b69fdb1ec',
},
'playlist_mincount': 3,
}, {
@@ -520,11 +535,11 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE):
description = try_get(
series, lambda x: x['titles']['subtitle'], compat_str)
entries = []
- for season in series['seasons']:
- entries.extend(self._extract_episodes(season))
+ entries.extend(self._extract_seasons(series.get('seasons')))
+ entries.extend(self._extract_entries(series.get('instalments')))
return self.playlist_result(entries, series_id, title, description)
- # Old layout (e.g. https://tv.nrk.no/serie/groenn-glede)
+ # Old layout (e.g. https://tv.nrksuper.no/serie/labyrint)
entries = [
self.url_result(
'https://tv.nrk.no/program/Episodes/{series}/{season}'.format(
@@ -536,6 +551,9 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE):
'seriestitle', webpage,
'title', default=None) or self._og_search_title(
webpage, fatal=False)
+ if title:
+ title = self._search_regex(
+ r'NRK (?:Super )?TV\s*[-–]\s*(.+)', title, 'title', default=title)
description = self._html_search_meta(
'series_description', webpage,
@@ -596,7 +614,7 @@ class NRKPlaylistIE(NRKPlaylistBaseIE):
'title': 'Rivertonprisen til Karin Fossum',
'description': 'Første kvinne på 15 år til å vinne krimlitteraturprisen.',
},
- 'playlist_count': 5,
+ 'playlist_count': 2,
}]

    def _extract_title(self, webpage):
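
For reference, the refactored NRKTVSerieBaseIE helpers above reduce to: flatten each season's episodes, append any standalone instalments, and keep only entries with a usable prfId. A condensed, self-contained sketch of that flow, with simplified stand-ins for the extractor's methods and invented sample data:

def extract_entries(entry_list):
    # Turn every well-formed episode dict into an nrk:<prfId> reference.
    if not isinstance(entry_list, list):
        return []
    entries = []
    for episode in entry_list:
        if not isinstance(episode, dict):
            continue
        nrk_id = episode.get('prfId')
        if not nrk_id or not isinstance(nrk_id, str):
            continue
        entries.append('nrk:%s' % nrk_id)
    return entries


def extract_seasons(seasons):
    # A season is a dict carrying an 'episodes' list; skip anything malformed.
    if not isinstance(seasons, list):
        return []
    entries = []
    for season in seasons:
        if isinstance(season, dict):
            entries.extend(extract_entries(season.get('episodes')))
    return entries


# Invented sample data shaped like the new-layout series dict:
series = {
    'seasons': [{'episodes': [{'prfId': 'MUHH48000314AA'}]}],
    'instalments': [{'prfId': 'MSPO40010515AH'}],
}
entries = extract_seasons(series.get('seasons')) + extract_entries(series.get('instalments'))
print(entries)  # ['nrk:MUHH48000314AA', 'nrk:MSPO40010515AH']

Splitting _extract_entries out of _extract_episodes is what lets the same code path serve both the 'episodes' list inside a season and the top-level 'instalments' list.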