diff options
| author | Sergey M․ <dstftw@gmail.com> | 2016-09-25 21:58:17 +0700 | 
|---|---|---|
| committer | Sergey M․ <dstftw@gmail.com> | 2016-09-25 22:26:00 +0700 | 
| commit | a3d8b3816802c76beffa48789eac5181e02db3dc (patch) | |
| tree | af1fb7588eba662120d0023cf177d215dabfe90a | |
| parent | e590b7ff9e8e408bb9ec4da58ab6847686d29dbc (diff) | |
[npo] Generalize playlist extractors
| -rw-r--r-- | youtube_dl/extractor/npo.py | 63 | 
1 files changed, 26 insertions, 37 deletions
```diff
diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py
index 3293bdb17..f95867d58 100644
--- a/youtube_dl/extractor/npo.py
+++ b/youtube_dl/extractor/npo.py
@@ -438,9 +438,29 @@ class SchoolTVIE(InfoExtractor):
         }
 
 
-class VPROIE(NPOIE):
+class NPOPlaylistBaseIE(NPOIE):
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, playlist_id)
+
+        entries = [
+            self.url_result('npo:%s' % video_id if not video_id.startswith('http') else video_id)
+            for video_id in re.findall(self._PLAYLIST_ENTRY_RE, webpage)
+        ]
+
+        playlist_title = self._html_search_regex(
+            self._PLAYLIST_TITLE_RE, webpage, 'playlist title',
+            default=None) or self._og_search_title(webpage)
+
+        return self.playlist_result(entries, playlist_id, playlist_title)
+
+
+class VPROIE(NPOPlaylistBaseIE):
     IE_NAME = 'vpro'
     _VALID_URL = r'https?://(?:www\.)?(?:tegenlicht\.)?vpro\.nl/(?:[^/]+/){2,}(?P<id>[^/]+)\.html'
+    _PLAYLIST_TITLE_RE = r'<title>\s*([^>]+?)\s*-\s*Teledoc\s*-\s*VPRO\s*</title>'
+    _PLAYLIST_ENTRY_RE = r'data-media-id="([^"]+)"'
 
     _TESTS = [
         {
@@ -473,48 +493,17 @@ class VPROIE(NPOIE):
         }
     ]
 
-    def _real_extract(self, url):
-        playlist_id = self._match_id(url)
-
-        webpage = self._download_webpage(url, playlist_id)
-
-        entries = [
-            self.url_result('npo:%s' % video_id if not video_id.startswith('http') else video_id)
-            for video_id in re.findall(r'data-media-id="([^"]+)"', webpage)
-        ]
-
-        playlist_title = self._search_regex(
-            r'<title>\s*([^>]+?)\s*-\s*Teledoc\s*-\s*VPRO\s*</title>',
-            webpage, 'playlist title', default=None) or self._og_search_title(webpage)
-
-        return self.playlist_result(entries, playlist_id, playlist_title)
-
 
-class WNLIE(InfoExtractor):
+class WNLIE(NPOPlaylistBaseIE):
     _VALID_URL = r'https?://(?:www\.)?omroepwnl\.nl/video/detail/(?P<id>[^/]+)__\d+'
+    _PLAYLIST_TITLE_RE = r'(?s)<h1[^>]+class="subject"[^>]*>(.+?)</h1>'
+    _PLAYLIST_ENTRY_RE = r'<a[^>]+href="([^"]+)"[^>]+class="js-mid"[^>]*>Deel \d+'
 
-    _TEST = {
+    _TESTS = [{
         'url': 'http://www.omroepwnl.nl/video/detail/vandaag-de-dag-6-mei__060515',
         'info_dict': {
             'id': 'vandaag-de-dag-6-mei',
             'title': 'Vandaag de Dag 6 mei',
         },
         'playlist_count': 4,
-    }
-
-    def _real_extract(self, url):
-        playlist_id = self._match_id(url)
-
-        webpage = self._download_webpage(url, playlist_id)
-
-        entries = [
-            self.url_result('npo:%s' % video_id, 'NPO')
-            for video_id, part in re.findall(
-                r'<a[^>]+href="([^"]+)"[^>]+class="js-mid"[^>]*>(Deel \d+)', webpage)
-        ]
-
-        playlist_title = self._html_search_regex(
-            r'(?s)<h1[^>]+class="subject"[^>]*>(.+?)</h1>',
-            webpage, 'playlist title')
-
-        return self.playlist_result(entries, playlist_id, playlist_title)
+    }]
```
