diff options
| author | Rob van Bekkum <rob_van_bekkum@hotmail.com> | 2016-02-27 18:21:42 +0100 | 
|---|---|---|
| committer | Sergey M․ <dstftw@gmail.com> | 2016-07-23 17:01:09 +0700 | 
| commit | 4671dd41b27e39eb4682189fca44d0f4272a4751 (patch) | |
| tree | 7d990d41b8110e87640ad8f081197b5aa4e4ec66 | |
| parent | f164b97123d5b61a7dd055c888212a0dc670f04f (diff) | |
[arkena:lcp] Add extractors
| -rw-r--r-- | youtube_dl/extractor/arkenaplay.py | 151 | ||||
| -rw-r--r-- | youtube_dl/extractor/lcp.py | 39 | 
2 files changed, 190 insertions, 0 deletions
| diff --git a/youtube_dl/extractor/arkenaplay.py b/youtube_dl/extractor/arkenaplay.py new file mode 100644 index 000000000..0061ea196 --- /dev/null +++ b/youtube_dl/extractor/arkenaplay.py @@ -0,0 +1,151 @@ +# coding: utf-8 +from __future__ import unicode_literals +from .common import InfoExtractor +from ..utils import ( +    int_or_none, +    parse_iso8601 +) +import re + + +class ArkenaPlayIE(InfoExtractor): +    IE_NAME = 'ArkenaPlay' +    _VALID_URL = r'(?P<shortcut>arkena:(?P<version>[0-9]+):(?P<mediatype>[A-Za-z0-9]+):(?P<mediaId>[^:]+):(?P<widgetsettingId>[A-Za-z0-9]+):(?P<accountId>[A-Za-z0-9]+))|(?:(?P<host>https?://(?:www\.)?play\..*\..*)/embed/(?:avp/v[0-9]+/player/[A-Za-z0-9]+/)?(?P<id>.*)?)' + +    _TESTS = [{ +        'url': 'http://play.lcp.fr/embed/327336/131064/darkmatter/0', +        'md5': '6cea4f7d13810464ef8485a924fc3333', +        'info_dict': { +            'id': '327336', +            'url': 're:http://httpod.scdn.arkena.com/11970/327336.*', +            'ext': 'mp4', +            'title': '327336', +            'upload_date': '20160225', +            'timestamp': 1456391602 +        } +    }, { +        # Shortcut for: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411 +        'url': 'arkena:2:media:b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe:1:129411', +        'md5': 'b96f2f71b359a8ecd05ce4e1daa72365', +        'info_dict': { +            'id': 'b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe', +            'url': 'http://88e04ec095b07cd1aa3ea588be47e870.httpcache0.90034-httpcache0.dna.qbrick.com/90034-httpcache0/4bf759a1-00090034/bbb_sunflower_2160p_60fps_normal_720p.mp4', +            'ext': 'mp4', +            'title': 'Big Buck Bunny', +            'description': 'Royalty free test video', +            'upload_date': '20150528', +            'timestamp': 1432816365 +        } +    }] + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        if mobj.group('shortcut'): +    
        version = mobj.group('version') +            mediatype = mobj.group('mediatype') +            mediaid = mobj.group('mediaId') +            widgetsettingid = mobj.group('widgetsettingId') +            accountid = mobj.group('accountId') +            display_id = '{0}:{1}:{2}:{3}'.format(mediatype, mediaid, widgetsettingid, accountid) +            media_url = 'https://play.arkena.com/config/avp/v{0}/player/{1}/{2}/{3}/{4}/?callbackMethod=?'.format( +                version, mediatype, mediaid, widgetsettingid, accountid) +        else: +            display_id = self._search_regex(self._VALID_URL, url, 'host_name', group='id') +            webpage = self._download_webpage(url, display_id) + +            media_url_regex = '"(?P<mediainfo>(?P<host>.*)/(c|C)onfig/.*\?callbackMethod=\?)"' +            media_url = self._html_search_regex(media_url_regex, webpage, 'arkena_media_info_url') +            hostname = self._html_search_regex(media_url_regex, webpage, 'arkena_media_host', group='host') +            if not hostname: +                hostname = self._search_regex(self._VALID_URL, url, 'host_name', group='host') +                media_url = hostname + media_url + +        # Extract the required info of the media files gathered in a dictionary +        arkena_info = self._download_webpage(media_url, 'arkena_info_') +        arkena_info_regex = r'\?\((?P<json>.*)\);' +        media_dict = self._parse_json(self._search_regex(arkena_info_regex, arkena_info, 'json', group='json'), +                                      display_id) + +        # All videos are part of a playlist, a single video is also put in a playlist +        playlist_items = media_dict.get('Playlist', []) +        if len(playlist_items) == 0: +            return self.url_result(url, 'Generic') +        elif len(playlist_items) == 1: +            arkena_media_info = playlist_items[0] +            return self.__extract_from_playlistentry(arkena_media_info) +        else: +            entries_info 
= [] +            for arkena_playlist_item in playlist_items: +                entries_info.append(self.__extract_from_playlistentry(arkena_playlist_item)) +            return { +                'id': display_id, +                'entries': entries_info +            } + +    def __extract_from_playlistentry(self, arkena_playlistentry_info): +        media_info = arkena_playlistentry_info.get('MediaInfo', {}) +        thumbnails = self.__get_thumbnails(media_info) +        title = media_info.get('Title') +        description = media_info.get('Description') +        video_id = media_info.get('VideoId') +        timestamp = parse_iso8601(media_info.get('PublishDate')) +        formats = self.__get_video_formats(arkena_playlistentry_info, video_id) +        return { +            'id': video_id, +            'title': title, +            'formats': formats, +            'thumbnails': thumbnails, +            'description': description, +            'timestamp': timestamp +        } + +    def __get_thumbnails(self, arkena_mediainfo): +        thumbnails = [] +        thumbnails_info = arkena_mediainfo.get('Poster') +        if not thumbnails_info: +            return None +        for thumbnail in thumbnails_info: +            thumbnail_url = thumbnail.get('Url') +            if not thumbnail_url: +                continue +            thumbnails.append({ +                'url': thumbnail_url, +                'width': int_or_none(thumbnail.get('Size')) +            }) +        return thumbnails + +    def __get_video_formats(self, media_files_info, video_id): +        formats = [] +        media_files = media_files_info.get('MediaFiles') +        if not media_files: +            return None + +        for type_name, video_files_json in media_files.iteritems(): +            for video_info in video_files_json: +                video_url = video_info.get('Url') +                if not video_url: +                    continue +                type = video_info.get('Type') + 
               if type_name in ['Mp4', 'WebM', 'Flash']: +                    bitrate = int_or_none(video_info.get('Bitrate'), scale=1000) +                    ext = None +                    if type == 'video/mp4': +                        ext = 'mp4' +                    elif type == 'video/webm': +                        ext = 'webm' +                    elif type == 'video/x-flv': +                        ext = 'flv' +                    formats.append({ +                        'url': video_url, +                        'ext': ext, +                        'tbr': bitrate +                    }) +                elif type_name == 'M3u8' and type == 'application/x-mpegURL': +                    formats.extend(self._extract_m3u8_formats(video_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) +                elif type_name == 'Flash' and type == 'application/hds+xml': +                    formats.extend(self._extract_f4m_formats(video_url, video_id, f4m_id='hds', fatal=False)) +                elif type_name == 'Dash' and type == 'application/dash+xml': +                    formats.extend(self._extract_mpd_formats(video_url, video_id, mpd_id='dash', fatal=False)) + +        self._sort_formats(formats) +        return formats
# coding: utf-8
# Module: youtube_dl/extractor/lcp.py
from __future__ import unicode_literals

from .common import InfoExtractor


class LcpIE(InfoExtractor):
    """Extractor for lcp.fr pages that embed a play.lcp.fr (Arkena) player.

    The page is searched for a quoted ``play.lcp.fr/embed/...`` URL, which is
    delegated to the ``ArkenaPlay`` extractor.  Pages without such an embed
    are handed to the generic extractor.
    """

    IE_NAME = 'LCP'
    _VALID_URL = r'https?://(?:www\.)?lcp\.fr/(?:[^\/]+/)*(?P<id>[^/]+)'

    # Quoted player embed URL; the scheme is optional in the page markup.
    # Uses A-Za-z, since an A-z range would also admit punctuation
    # ([ \ ] ^ _ and backtick).
    _EMBED_URL_RE = r'"(?P<url>(?:https?://(?:www\.)?)?play\.lcp\.fr/embed/[A-Za-z0-9]+/[A-Za-z0-9]+/[A-Za-z0-9]+/[A-Za-z0-9]+)"'

    _TESTS = [{
        'url': 'http://www.lcp.fr/la-politique-en-video/schwartzenberg-prg-preconise-francois-hollande-de-participer-une-primaire',
        'md5': 'ab96c4dae94322ece1e98d97c8dc7807',
        'info_dict': {
            'id': 'd56d03e9',
            'url': 're:http://httpod.scdn.arkena.com/11970/d56d03e9_.*',
            'ext': 'mp4',
            'title': 'd56d03e9',
            'upload_date': '20160226',
            'timestamp': 1456488895
        }
    }, {
        'url': 'http://www.lcp.fr/le-direct',
        'info_dict': {
            'title': 'Le direct | LCP Assembl\xe9e nationale',
            'id': 'le-direct',
        },
        'playlist_mincount': 1
    }]

    def _real_extract(self, url):
        """Return a url_result pointing at the embedded player, if any."""
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)

        # default=None already makes the search non-fatal.
        embed_url = self._html_search_regex(
            self._EMBED_URL_RE, webpage, 'player_url', default=None)
        if not embed_url:
            return self.url_result(url, 'Generic')

        # The regex accepts a scheme-less URL, but the ArkenaPlay extractor
        # only matches http(s) URLs, so add a scheme when it is missing.
        if not embed_url.startswith(('http://', 'https://')):
            embed_url = 'http://' + embed_url

        title = self._og_search_title(webpage, default=None)
        return self.url_result(
            embed_url, 'ArkenaPlay', video_id=display_id, video_title=title)
