diff options
author | Sergey M․ <dstftw@gmail.com> | 2016-05-18 00:25:15 +0600 |
---|---|---|
committer | Sergey M․ <dstftw@gmail.com> | 2016-05-18 00:25:15 +0600 |
commit | 11e6a0b64130f9b4aea1a6115a3ebaad73f2f5e0 (patch) | |
tree | 4499da065ec3eabdf5d6d3b98fe59ed44df3524a /youtube_dl/extractor/nfb.py | |
parent | 15cda1ef774e9dbc538765f59dff5b10a492eca5 (diff) |
[nfb] Modernize and extract subtitles
Diffstat (limited to 'youtube_dl/extractor/nfb.py')
-rw-r--r-- | youtube_dl/extractor/nfb.py | 108 |
1 files changed, 63 insertions, 45 deletions
diff --git a/youtube_dl/extractor/nfb.py b/youtube_dl/extractor/nfb.py index 234e49047..adcc636bc 100644 --- a/youtube_dl/extractor/nfb.py +++ b/youtube_dl/extractor/nfb.py @@ -2,8 +2,12 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( - sanitized_Request, + clean_html, + determine_ext, + int_or_none, + qualities, urlencode_postdata, + xpath_text, ) @@ -16,12 +20,12 @@ class NFBIE(InfoExtractor): 'url': 'https://www.nfb.ca/film/qallunaat_why_white_people_are_funny', 'info_dict': { 'id': 'qallunaat_why_white_people_are_funny', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Qallunaat! Why White People Are Funny ', - 'description': 'md5:836d8aff55e087d04d9f6df554d4e038', + 'description': 'md5:6b8e32dde3abf91e58857b174916620c', 'duration': 3128, + 'creator': 'Mark Sandiford', 'uploader': 'Mark Sandiford', - 'uploader_id': 'mark-sandiford', }, 'params': { # rtmp download @@ -31,64 +35,78 @@ class NFBIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - page = self._download_webpage( - 'https://www.nfb.ca/film/%s' % video_id, video_id, - 'Downloading film page') - uploader_id = self._html_search_regex(r'<a class="director-link" href="/explore-all-directors/([^/]+)/"', - page, 'director id', fatal=False) - uploader = self._og_search_property('video:director', page, 'director name') - - request = sanitized_Request( + config = self._download_xml( 'https://www.nfb.ca/film/%s/player_config' % video_id, - urlencode_postdata({'getConfig': 'true'})) - request.add_header('Content-Type', 'application/x-www-form-urlencoded') - request.add_header('X-NFB-Referer', 'http://www.nfb.ca/medias/flash/NFBVideoPlayer.swf') - - config = self._download_xml(request, video_id, 'Downloading player config XML') + video_id, 'Downloading player config XML', + data=urlencode_postdata({'getConfig': 'true'}), + headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + 'X-NFB-Referer': 'http://www.nfb.ca/medias/flash/NFBVideoPlayer.swf' + }) - title = None - description = None - thumbnail = None - duration = None - formats = [] - - def extract_thumbnail(media): - thumbnails = {} - for asset in media.findall('assets/asset'): - thumbnails[asset.get('quality')] = asset.find('default/url').text - if not thumbnails: - return None - if 'high' in thumbnails: - return thumbnails['high'] - return list(thumbnails.values())[0] + title, description, thumbnail, duration, uploader, author = [None] * 6 + thumbnails, formats = [[]] * 2 + subtitles = {} for media in config.findall('./player/stream/media'): if media.get('type') == 'posterImage': - thumbnail = extract_thumbnail(media) + quality_key = qualities(('low', 'high')) + thumbnails = [] + for asset in media.findall('assets/asset'): + asset_url = xpath_text(asset, 'default/url', default=None) + if not asset_url: + continue + quality = asset.get('quality') + thumbnails.append({ + 'url': asset_url, + 'id': quality, + 'preference': quality_key(quality), + }) elif media.get('type') == 'video': - duration = int(media.get('duration')) - title = media.find('title').text - description = media.find('description').text - # It seems assets always go from lower to better quality, so no need to sort + title = xpath_text(media, 'title', fatal=True) for asset in media.findall('assets/asset'): - for x in asset: + quality = asset.get('quality') + height = int_or_none(self._search_regex( + r'^(\d+)[pP]$', quality or '', 'height', default=None)) + for node in asset: + streamer = xpath_text(node, 'streamerURI', default=None) + if not streamer: + continue + play_path = xpath_text(node, 'url', default=None) + if not play_path: + continue formats.append({ - 'url': x.find('streamerURI').text, - 'app': x.find('streamerURI').text.split('/', 3)[3], - 'play_path': x.find('url').text, + 'url': streamer, + 'app': streamer.split('/', 3)[3], + 'play_path': play_path, 'rtmp_live': False, - 'ext': 'mp4', - 'format_id': '%s-%s' % (x.tag, asset.get('quality')), + 'ext': 'flv', + 'format_id': '%s-%s' % (node.tag, quality) if quality else node.tag, + 'height': height, }) + self._sort_formats(formats) + description = clean_html(xpath_text(media, 'description')) + uploader = xpath_text(media, 'author') + duration = int_or_none(media.get('duration')) + for subtitle in media.findall('./subtitles/subtitle'): + subtitle_url = xpath_text(subtitle, 'url', default=None) + if not subtitle_url: + continue + lang = xpath_text(subtitle, 'lang', default='en') + subtitles.setdefault(lang, []).append({ + 'url': subtitle_url, + 'ext': (subtitle.get('format') or determine_ext(subtitle_url)).lower(), + }) return { 'id': video_id, 'title': title, 'description': description, - 'thumbnail': thumbnail, + 'thumbnails': thumbnails, 'duration': duration, + 'creator': uploader, 'uploader': uploader, - 'uploader_id': uploader_id, 'formats': formats, + 'subtitles': subtitles, } |