diff options
Diffstat (limited to 'youtube_dl/extractor/nfb.py')
| -rw-r--r-- | youtube_dl/extractor/nfb.py | 109 | 
1 files changed, 63 insertions, 46 deletions
| diff --git a/youtube_dl/extractor/nfb.py b/youtube_dl/extractor/nfb.py index 51e4a34f7..adcc636bc 100644 --- a/youtube_dl/extractor/nfb.py +++ b/youtube_dl/extractor/nfb.py @@ -2,8 +2,12 @@ from __future__ import unicode_literals  from .common import InfoExtractor  from ..utils import ( -    sanitized_Request, +    clean_html, +    determine_ext, +    int_or_none, +    qualities,      urlencode_postdata, +    xpath_text,  ) @@ -16,12 +20,12 @@ class NFBIE(InfoExtractor):          'url': 'https://www.nfb.ca/film/qallunaat_why_white_people_are_funny',          'info_dict': {              'id': 'qallunaat_why_white_people_are_funny', -            'ext': 'mp4', +            'ext': 'flv',              'title': 'Qallunaat! Why White People Are Funny ', -            'description': 'md5:836d8aff55e087d04d9f6df554d4e038', +            'description': 'md5:6b8e32dde3abf91e58857b174916620c',              'duration': 3128, +            'creator': 'Mark Sandiford',              'uploader': 'Mark Sandiford', -            'uploader_id': 'mark-sandiford',          },          'params': {              # rtmp download @@ -31,65 +35,78 @@ class NFBIE(InfoExtractor):      def _real_extract(self, url):          video_id = self._match_id(url) -        page = self._download_webpage( -            'https://www.nfb.ca/film/%s' % video_id, video_id, -            'Downloading film page') -        uploader_id = self._html_search_regex(r'<a class="director-link" href="/explore-all-directors/([^/]+)/"', -                                              page, 'director id', fatal=False) -        uploader = self._html_search_regex(r'<em class="director-name" itemprop="name">([^<]+)</em>', -                                           page, 'director name', fatal=False) - -        request = sanitized_Request( +        config = self._download_xml(              'https://www.nfb.ca/film/%s/player_config' % video_id, -            urlencode_postdata({'getConfig': 'true'})) -        request.add_header('Content-Type', 'application/x-www-form-urlencoded') -        request.add_header('X-NFB-Referer', 'http://www.nfb.ca/medias/flash/NFBVideoPlayer.swf') - -        config = self._download_xml(request, video_id, 'Downloading player config XML') +            video_id, 'Downloading player config XML', +            data=urlencode_postdata({'getConfig': 'true'}), +            headers={ +                'Content-Type': 'application/x-www-form-urlencoded', +                'X-NFB-Referer': 'http://www.nfb.ca/medias/flash/NFBVideoPlayer.swf' +            }) -        title = None -        description = None -        thumbnail = None -        duration = None -        formats = [] - -        def extract_thumbnail(media): -            thumbnails = {} -            for asset in media.findall('assets/asset'): -                thumbnails[asset.get('quality')] = asset.find('default/url').text -            if not thumbnails: -                return None -            if 'high' in thumbnails: -                return thumbnails['high'] -            return list(thumbnails.values())[0] +        title, description, thumbnail, duration, uploader, author = [None] * 6 +        thumbnails, formats = [[]] * 2 +        subtitles = {}          for media in config.findall('./player/stream/media'):              if media.get('type') == 'posterImage': -                thumbnail = extract_thumbnail(media) +                quality_key = qualities(('low', 'high')) +                thumbnails = [] +                for asset in media.findall('assets/asset'): +                    asset_url = xpath_text(asset, 'default/url', default=None) +                    if not asset_url: +                        continue +                    quality = asset.get('quality') +                    thumbnails.append({ +                        'url': asset_url, +                        'id': quality, +                        'preference': quality_key(quality), +                    })              elif media.get('type') == 'video': -                duration = int(media.get('duration')) -                title = media.find('title').text -                description = media.find('description').text -                # It seems assets always go from lower to better quality, so no need to sort +                title = xpath_text(media, 'title', fatal=True)                  for asset in media.findall('assets/asset'): -                    for x in asset: +                    quality = asset.get('quality') +                    height = int_or_none(self._search_regex( +                        r'^(\d+)[pP]$', quality or '', 'height', default=None)) +                    for node in asset: +                        streamer = xpath_text(node, 'streamerURI', default=None) +                        if not streamer: +                            continue +                        play_path = xpath_text(node, 'url', default=None) +                        if not play_path: +                            continue                          formats.append({ -                            'url': x.find('streamerURI').text, -                            'app': x.find('streamerURI').text.split('/', 3)[3], -                            'play_path': x.find('url').text, +                            'url': streamer, +                            'app': streamer.split('/', 3)[3], +                            'play_path': play_path,                              'rtmp_live': False, -                            'ext': 'mp4', -                            'format_id': '%s-%s' % (x.tag, asset.get('quality')), +                            'ext': 'flv', +                            'format_id': '%s-%s' % (node.tag, quality) if quality else node.tag, +                            'height': height,                          }) +                self._sort_formats(formats) +                description = clean_html(xpath_text(media, 'description')) +                uploader = xpath_text(media, 'author') +                duration = int_or_none(media.get('duration')) +                for subtitle in media.findall('./subtitles/subtitle'): +                    subtitle_url = xpath_text(subtitle, 'url', default=None) +                    if not subtitle_url: +                        continue +                    lang = xpath_text(subtitle, 'lang', default='en') +                    subtitles.setdefault(lang, []).append({ +                        'url': subtitle_url, +                        'ext': (subtitle.get('format') or determine_ext(subtitle_url)).lower(), +                    })          return {              'id': video_id,              'title': title,              'description': description, -            'thumbnail': thumbnail, +            'thumbnails': thumbnails,              'duration': duration, +            'creator': uploader,              'uploader': uploader, -            'uploader_id': uploader_id,              'formats': formats, +            'subtitles': subtitles,          } | 
