diff options
| author | Sergey M․ <dstftw@gmail.com> | 2017-12-09 02:16:23 +0700 | 
|---|---|---|
| committer | Sergey M․ <dstftw@gmail.com> | 2017-12-09 02:16:54 +0700 | 
| commit | 2a57b62b8007973b5b4974a1d9f5ab06ae78c86e (patch) | |
| tree | b9c092552a2de88af45fbb8e42fda8299083dcf0 | |
| parent | e2707a832cd53e2cfa68b99db997890a6a5bd685 (diff) | |
[ellentube] Fix issues, improve and simplify (closes #14570)
| -rw-r--r-- | youtube_dl/extractor/ellentube.py | 165 |
| -rw-r--r-- | youtube_dl/extractor/extractors.py | 3 |
2 files changed, 80 insertions, 88 deletions
diff --git a/youtube_dl/extractor/ellentube.py b/youtube_dl/extractor/ellentube.py index 68fe17273..544473274 100644 --- a/youtube_dl/extractor/ellentube.py +++ b/youtube_dl/extractor/ellentube.py @@ -4,137 +4,130 @@ from __future__ import unicode_literals  from .common import InfoExtractor  from ..utils import (      clean_html, +    extract_attributes, +    float_or_none,      int_or_none, +    try_get,  ) -class EllenTubeIE(InfoExtractor): -    _VALID_URL = r'''(?x) -                    (?: -                        https://api-prod\.ellentube\.com/ellenapi/api/item/ -                        |ellentube: -                    ) -                    (?P<id> -                        [\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12} -                    )''' - -    _TESTS = [{ -        'url': 'https://api-prod.ellentube.com/ellenapi/api/item/75c64c16-aefd-4558-b4f5-3de09b22e6fc', -        'match_only': True, -    }, { -        'url': 'ellentube:734a3353-f697-4e79-9ca9-bfc3002dc1e0', -        'match_only': True, -    }] +class EllenTubeBaseIE(InfoExtractor): +    def _extract_data_config(self, webpage, video_id): +        details = self._search_regex( +            r'(<[^>]+\bdata-component=(["\'])[Dd]etails.+?></div>)', webpage, +            'details') +        return self._parse_json( +            extract_attributes(details)['data-config'], video_id) -    def _real_extract(self, url): -        video_id = self._match_id(url) -        data = self._download_json( -            'https://api-prod.ellentube.com/ellenapi/api/item/%s' % video_id, video_id) +    def _extract_video(self, data, video_id):          title = data['title'] -        description = data.get('description') -        publish_time = int_or_none(data.get('publishTime')) -        thumbnail = data.get('thumbnail')          formats = []          duration = None          for entry in data.get('media'):              if entry.get('id') == 'm3u8':                  formats = self._extract_m3u8_formats( -       
             entry.get('url'), video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') +                    entry['url'], video_id, 'mp4', +                    entry_protocol='m3u8_native', m3u8_id='hls')                  duration = int_or_none(entry.get('duration'))                  break          self._sort_formats(formats) + +        def get_insight(kind): +            return int_or_none(try_get( +                data, lambda x: x['insight']['%ss' % kind])) +          return { +            'extractor_key': EllenTubeIE.ie_key(),              'id': video_id,              'title': title, -            'description': description, +            'description': data.get('description'),              'duration': duration, -            'thumbnail': thumbnail, -            'timestamp': publish_time, +            'thumbnail': data.get('thumbnail'), +            'timestamp': float_or_none(data.get('publishTime'), scale=1000), +            'view_count': get_insight('view'), +            'like_count': get_insight('like'),              'formats': formats,          } -class EllenTubeVideoIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?ellentube\.com/video/(?P<id>.+)\.html' - -    _TEST = { -        'url': 'https://www.ellentube.com/video/ellen-meets-las-vegas-survivors-jesus-campos-and-stephen-schuck.html', +class EllenTubeIE(EllenTubeBaseIE): +    _VALID_URL = r'''(?x) +                        (?: +                            ellentube:| +                            https://api-prod\.ellentube\.com/ellenapi/api/item/ +                        ) +                        (?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}) +                    ''' +    _TESTS = [{ +        'url': 'https://api-prod.ellentube.com/ellenapi/api/item/0822171c-3829-43bf-b99f-d77358ae75e3',          'md5': '2fabc277131bddafdd120e0fc0f974c9',          'info_dict': {              'id': '0822171c-3829-43bf-b99f-d77358ae75e3',              'ext': 'mp4',              'title': 'Ellen 
Meets Las Vegas Survivors Jesus Campos and Stephen Schuck',              'description': 'md5:76e3355e2242a78ad9e3858e5616923f', +            'thumbnail': r're:^https?://.+?',              'duration': 514, -            'timestamp': 1508505120000, -            'thumbnail': 'https://warnerbros-h.assetsadobe.com/is/image/content/dam/ellen/videos/episodes/season15/32/video--2728751654987218111', +            'timestamp': 1508505120, +            'upload_date': '20171020', +            'view_count': int, +            'like_count': int,          } -    } +    }, { +        'url': 'ellentube:734a3353-f697-4e79-9ca9-bfc3002dc1e0', +        'only_matching': True, +    }]      def _real_extract(self, url): -        display_id = self._match_id(url) -        webpage = self._download_webpage(url, display_id) -        video_id = self._html_search_regex( -            r'(?s)<!--\s*CONTENT\s*-->.*data-config.+([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})', -            webpage, 'video id') -        return self.url_result('ellentube:%s' % video_id, 'EllenTube') +        video_id = self._match_id(url) +        data = self._download_json( +            'https://api-prod.ellentube.com/ellenapi/api/item/%s' % video_id, +            video_id) +        return self._extract_video(data, video_id) -class EllenTubePlaylistIE(InfoExtractor): -    def _extract_videos_from_json(self, data, display_id): -        return [self.url_result('ellentube:%s' % elem['id'], 'EllenTube') -                for elem in data if elem.get('type') == 'VIDEO'] +class EllenTubeVideoIE(EllenTubeBaseIE): +    _VALID_URL = r'https?://(?:www\.)?ellentube\.com/video/(?P<id>.+?)\.html' +    _TEST = { +        'url': 'https://www.ellentube.com/video/ellen-meets-las-vegas-survivors-jesus-campos-and-stephen-schuck.html', +        'only_matching': True, +    } -    def _extract_playlist(self, url, display_id, extract_description=True): +    def _real_extract(self, url): +        display_id = self._match_id(url)      
    webpage = self._download_webpage(url, display_id) -        playlist_data = self._html_search_regex( -            r'<div\s+data-component\s*=\s*"Details"(.+)</div>', webpage, 'playlist data') -        playlist_title = self._search_regex( -            r'"title"\s*:\s*"(.+?)"', playlist_data, 'playlist title') -        playlist_description = clean_html(self._search_regex( -            r'"description"\s*:\s*"(.+?)"', playlist_data, 'playlist description', -            fatal=False)) if extract_description else None -        api_search = self._search_regex( -            r'"filter"\s*:\s*"(.+?)"', playlist_data, 'playlist api request') -        api_data = self._download_json( -            'https://api-prod.ellentube.com/ellenapi/api/feed/?%s' % api_search, -            display_id) -        return self.playlist_result( -            self._extract_videos_from_json(api_data, display_id), -            display_id, playlist_title, playlist_description) +        video_id = self._extract_data_config(webpage, display_id)['id'] +        return self.url_result( +            'ellentube:%s' % video_id, ie=EllenTubeIE.ie_key(), +            video_id=video_id) -class EllenTubeEpisodeIE(EllenTubePlaylistIE): -    _VALID_URL = r'https?://(?:www\.)?ellentube\.com/episode/(?P<id>.+)\.html' - -    _TEST = { +class EllenTubePlaylistIE(EllenTubeBaseIE): +    _VALID_URL = r'https?://(?:www\.)?ellentube\.com/(?:episode|studios)/(?P<id>.+?)\.html' +    _TESTS = [{          'url': 'https://www.ellentube.com/episode/dax-shepard-jordan-fisher-haim.html',          'info_dict': {              'id': 'dax-shepard-jordan-fisher-haim', -            'title': 'Dax Shepard, \'DWTS\' Team Jordan Fisher & Lindsay Arnold, HAIM', -            'description': 'md5:aed85d42892f6126e71ec5ed2aea2a0d' +            'title': "Dax Shepard, 'DWTS' Team Jordan Fisher & Lindsay Arnold, HAIM", +            'description': 'md5:bfc982194dabb3f4e325e43aa6b2e21c',          },          'playlist_count': 6, -    } - -    def 
_real_extract(self, url): -        display_id = self._match_id(url) -        return self._extract_playlist(url, display_id) - - -class EllenTubeStudioIE(EllenTubePlaylistIE): -    _VALID_URL = r'https?://(?:www\.)?ellentube\.com/studios/(?P<id>.+)\.html' - -    _TEST = { +    }, {          'url': 'https://www.ellentube.com/studios/macey-goes-rving0.html', -        'info_dict': { -            'id': 'macey-goes-rving0', -            'title': 'Macey Goes RVing', -        }, -        'playlist_mincount': 3, -    } +        'only_matching': True, +    }]      def _real_extract(self, url):          display_id = self._match_id(url) -        return self._extract_playlist(url, display_id, False) +        webpage = self._download_webpage(url, display_id) +        data = self._extract_data_config(webpage, display_id)['data'] +        feed = self._download_json( +            'https://api-prod.ellentube.com/ellenapi/api/feed/?%s' +            % data['filter'], display_id) +        entries = [ +            self._extract_video(elem, elem['id']) +            for elem in feed if elem.get('type') == 'VIDEO' and elem.get('id')] +        return self.playlist_result( +            entries, display_id, data.get('title'), +            clean_html(data.get('description'))) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 55d2dd1fe..0177a2cff 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -313,9 +313,8 @@ from .einthusan import EinthusanIE  from .eitb import EitbIE  from .ellentube import (      EllenTubeIE, -    EllenTubeEpisodeIE, -    EllenTubeStudioIE,      EllenTubeVideoIE, +    EllenTubePlaylistIE,  )  from .elpais import ElPaisIE  from .embedly import EmbedlyIE  | 
