diff options
| author | Remita Amine <remitamine@gmail.com> | 2017-12-26 19:41:08 +0100 | 
|---|---|---|
| committer | Remita Amine <remitamine@gmail.com> | 2017-12-26 19:41:08 +0100 | 
| commit | be069839b4acb645799f7b216d14c046fb4a3400 (patch) | |
| tree | b67241ec284f644a1e76b53a08b188ac6062a2cd | |
| parent | a14001a5a13b1639dc98b75b0775d251487aad1d (diff) | |
[filmweb] improve extraction
| -rw-r--r-- | youtube_dl/extractor/filmweb.py | 53 | ||||
| -rw-r--r-- | youtube_dl/extractor/twentythreevideo.py | 90 | 
2 files changed, 86 insertions, 57 deletions
| diff --git a/youtube_dl/extractor/filmweb.py b/youtube_dl/extractor/filmweb.py index a3d9f872e..56000bc5b 100644 --- a/youtube_dl/extractor/filmweb.py +++ b/youtube_dl/extractor/filmweb.py @@ -1,45 +1,42 @@  from __future__ import unicode_literals -from .twentythreevideo import TwentyThreeVideoIE +import re +from .common import InfoExtractor -class FilmwebIE(TwentyThreeVideoIE): -    IE_NAME = 'Filmweb' -    _VALID_URL = r'https?://(?:www\.)?filmweb\.no/trailere/article(?P<id>\d+).ece' + +class FilmwebIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?filmweb\.no/(?P<type>trailere|filmnytt)/article(?P<id>\d+)\.ece'      _TEST = {          'url': 'http://www.filmweb.no/trailere/article1264921.ece',          'md5': 'e353f47df98e557d67edaceda9dece89',          'info_dict': { -            'id': '1264921', -            'title': 'Det som en gang var', +            'id': '13033574',              'ext': 'mp4', -            'description': 'Trailer: Scener fra et vennskap', +            'title': 'Det som en gang var', +            'upload_date': '20160316', +            'timestamp': 1458140101, +            'uploader_id': '12639966', +            'uploader': 'Live Roaldset',          }      } -    _CLIENT_NAME = 'filmweb' -    _CLIENT_ID = '12732917' -    _EMBED_BASE_URL = 'http://www.filmweb.no/template/ajax/json_trailerEmbed.jsp?articleId=%s&autoplay=true' -      def _real_extract(self, url): -        article_id = self._match_id(url) -        webpage = self._download_webpage(url, article_id) - -        title = self._search_regex(r'var\s+jsTitle\s*=\s*escape\("([^"]+)"\);', -            webpage, 'title', fatal=True) - -        format_url = self._proto_relative_url( -            self._html_search_regex(r'"(//filmweb\.23video\.com/[^"]+)"', -                self._download_json(self._EMBED_BASE_URL % article_id, -                    article_id)['embedCode'], 'format url')) - -        formats = self._extract_formats(format_url, self._CLIENT_ID) -        self._sort_formats(formats) +        article_type, article_id = re.match(self._VALID_URL, url).groups() +        if article_type == 'filmnytt': +            webpage = self._download_webpage(url, article_id) +            article_id = self._search_regex(r'data-videoid="(\d+)"', webpage, 'article id') +        embed_code = self._download_json( +            'https://www.filmweb.no/template_v2/ajax/json_trailerEmbed.jsp', +            article_id, query={ +                'articleId': article_id, +            })['embedCode'] +        iframe_url = self._proto_relative_url(self._search_regex( +            r'<iframe[^>]+src="([^"]+)', embed_code, 'iframe url'))          return { +            '_type': 'url_transparent',              'id': article_id, -            'title': title, -            'alt_title': self._og_search_title(webpage), -            'formats': formats, -            'description': self._og_search_description(webpage), +            'url': iframe_url, +            'ie_key': 'TwentyThreeVideo',          } diff --git a/youtube_dl/extractor/twentythreevideo.py b/youtube_dl/extractor/twentythreevideo.py index 2bad2dbd6..aa0c6e90f 100644 --- a/youtube_dl/extractor/twentythreevideo.py +++ b/youtube_dl/extractor/twentythreevideo.py @@ -1,45 +1,77 @@  from __future__ import unicode_literals +import re +  from .common import InfoExtractor +from ..utils import int_or_none  class TwentyThreeVideoIE(InfoExtractor):      IE_NAME = '23video' -    _VALID_URL = r'https?://(?:www\.)?(?P<client>[\w-]+)\.23video\.com/v.ihtml/player.html.*photo_id=(?P<id>\d+)' -    _TEST = {} - -    _URL_TEMPLATE = 'https://%s.23video.com/%s/%s/%s/%s/download-video.mp4' -    _FORMATS = { -        'video_hd': { -            'width': 1280, -            'height': 720, -        }, -        'video_medium': { -            'width': 640, -            'height': 360, -        }, -        'video_mobile_high': { -            'width': 320, -            'height': 180, +    _VALID_URL = r'https?://video\.(?P<domain>twentythree\.net|23video\.com|filmweb\.no)/v\.ihtml/player\.html\?(?P<query>.*?\bphoto(?:_|%5f)id=(?P<id>\d+).*)' +    _TEST = { +        'url': 'https://video.twentythree.net/v.ihtml/player.html?showDescriptions=0&source=site&photo%5fid=20448876&autoPlay=1', +        'md5': '75fcf216303eb1dae9920d651f85ced4', +        'info_dict': { +            'id': '20448876', +            'ext': 'mp4', +            'title': 'Video Marketing Minute: Personalized Video', +            'timestamp': 1513855354, +            'upload_date': '20171221', +            'uploader_id': '12258964', +            'uploader': 'Rasmus Bysted',          }      } -    def _extract_formats(self, url, client_id): -        client_name = self._search_regex(r'([a-z]+)\.23video\.com', url, 'client name') -        video_id = self._search_regex(r'photo%5fid=([^?&]+)', url, 'video id') -        token = self._search_regex(r'token=([^?&]+)', url, 'token') +    def _real_extract(self, url): +        domain, query, photo_id = re.match(self._VALID_URL, url).groups() +        base_url = 'https://video.%s' % domain +        photo_data = self._download_json( +            base_url + '/api/photo/list?' + query, photo_id, query={ +                'format': 'json', +            }, transform_source=lambda s: self._search_regex(r'(?s)({.+})', s, 'photo data'))['photo'] +        title = photo_data['title']          formats = [] -        for format_key in self._FORMATS.keys(): + +        audio_path = photo_data.get('audio_download') +        if audio_path:              formats.append({ -                'url': self._URL_TEMPLATE % (client_name, client_id, video_id, -                    token, format_key), -                'width': self._FORMATS.get(format_key, {}).get('width'), -                'height': self._FORMATS.get(format_key, {}).get('height'), +                'format_id': 'audio', +                'url': base_url + audio_path, +                'filesize': int_or_none(photo_data.get('audio_size')), +                'vcodec': 'none', +            }) + +        def add_common_info_to_list(l, template, id_field, id_value): +            f_base = template % id_value +            f_path = photo_data.get(f_base + 'download') +            if not f_path: +                return +            l.append({ +                id_field: id_value, +                'url': base_url + f_path, +                'width': int_or_none(photo_data.get(f_base + 'width')), +                'height': int_or_none(photo_data.get(f_base + 'height')), +                'filesize': int_or_none(photo_data.get(f_base + 'size')),              }) -        return formats +        for f in ('mobile_high', 'medium', 'hd', '1080p', '4k'): +            add_common_info_to_list(formats, 'video_%s_', 'format_id', f) -    def _real_extract(self, url): -        # TODO: Find out how to extract client_id -        raise NotImplementedError('Not able to extract the `client_id`') +        thumbnails = [] +        for t in ('quad16', 'quad50', 'quad75', 'quad100', 'small', 'portrait', 'standard', 'medium', 'large', 'original'): +            add_common_info_to_list(thumbnails, '%s_', 'id', t) + +        return { +            'id': photo_id, +            'title': title, +            'timestamp': int_or_none(photo_data.get('creation_date_epoch')), +            'duration': int_or_none(photo_data.get('video_length')), +            'view_count': int_or_none(photo_data.get('view_count')), +            'comment_count': int_or_none(photo_data.get('number_of_comments')), +            'uploader_id': photo_data.get('user_id'), +            'uploader': photo_data.get('display_name'), +            'thumbnails': thumbnails, +            'formats': formats, +        } | 
