diff options
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r-- | youtube_dl/extractor/ceskatelevize.py | 45 | ||||
-rw-r--r-- | youtube_dl/extractor/nrk.py | 132 |
2 files changed, 152 insertions, 25 deletions
diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py
index ba8376338..f70e090bb 100644
--- a/youtube_dl/extractor/ceskatelevize.py
+++ b/youtube_dl/extractor/ceskatelevize.py
@@ -3,7 +3,7 @@ from __future__ import unicode_literals
 
 import re
 
-from .common import InfoExtractor
+from .subtitles import SubtitlesInfoExtractor
 from ..compat import (
     compat_urllib_request,
     compat_urllib_parse,
@@ -15,7 +15,7 @@ from ..utils import (
 )
 
 
-class CeskaTelevizeIE(InfoExtractor):
+class CeskaTelevizeIE(SubtitlesInfoExtractor):
     _VALID_URL = r'https?://www\.ceskatelevize\.cz/(porady|ivysilani)/(.+/)?(?P<id>[^?#]+)'
 
     _TESTS = [
@@ -104,6 +104,17 @@ class CeskaTelevizeIE(InfoExtractor):
         duration = float_or_none(item.get('duration'))
         thumbnail = item.get('previewImageUrl')
 
+        subtitles = {}
+        subs = item.get('subtitles')
+        if subs:
+            subtitles['cs'] = subs[0]['url']
+
+        if self._downloader.params.get('listsubtitles', False):
+            self._list_available_subtitles(video_id, subtitles)
+            return
+
+        subtitles = self._fix_subtitles(self.extract_subtitles(video_id, subtitles))
+
         return {
             'id': episode_id,
             'title': title,
@@ -111,4 +122,34 @@ class CeskaTelevizeIE(InfoExtractor):
             'thumbnail': thumbnail,
             'duration': duration,
             'formats': formats,
+            'subtitles': subtitles,
         }
+
+    @staticmethod
+    def _fix_subtitles(subtitles):
+        """ Convert millisecond-based subtitles to SRT """
+        if subtitles is None:
+            return subtitles  # subtitles not requested
+
+        def _msectotimecode(msec):
+            """ Helper utility to convert milliseconds to timecode """
+            components = []
+            for divider in [1000, 60, 60, 100]:
+                components.append(msec % divider)
+                msec //= divider
+            return "{3:02}:{2:02}:{1:02},{0:03}".format(*components)
+
+        def _fix_subtitle(subtitle):
+            for line in subtitle.splitlines():
+                m = re.match(r"^\s*([0-9]+);\s*([0-9]+)\s+([0-9]+)\s*$", line)
+                if m:
+                    yield m.group(1)
+                    start, stop = (_msectotimecode(int(t)) for t in m.groups()[1:])
+                    yield "{0} --> {1}".format(start, stop)
+                else:
+                    yield line
+
+        fixed_subtitles = {}
+        for k, v in subtitles.items():
+            fixed_subtitles[k] = "\r\n".join(_fix_subtitle(v))
+        return fixed_subtitles
diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py
index 43e8e619f..e950c76dd 100644
--- a/youtube_dl/extractor/nrk.py
+++ b/youtube_dl/extractor/nrk.py
@@ -72,7 +72,7 @@ class NRKIE(InfoExtractor):
 
 
 class NRKTVIE(InfoExtractor):
-    _VALID_URL = r'http://tv\.nrk(?:super)?\.no/(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})'
+    _VALID_URL = r'http://tv\.nrk(?:super)?\.no/(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})(?:/\d{2}-\d{2}-\d{4})?(?:#del=(?P<part_id>\d+))?'
 
     _TESTS = [
         {
@@ -85,7 +85,7 @@ class NRKTVIE(InfoExtractor):
                 'description': 'md5:bdea103bc35494c143c6a9acdd84887a',
                 'upload_date': '20140523',
                 'duration': 1741.52,
-            }
+            },
         },
         {
             'url': 'http://tv.nrk.no/program/mdfp15000514',
@@ -97,39 +97,125 @@ class NRKTVIE(InfoExtractor):
                 'description': 'md5:654c12511f035aed1e42bdf5db3b206a',
                 'upload_date': '20140524',
                 'duration': 4605.0,
-            }
+            },
         },
+        {
+            # single playlist video
+            'url': 'http://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2',
+            'md5': 'adbd1dbd813edaf532b0a253780719c2',
+            'info_dict': {
+                'id': 'MSPO40010515-part2',
+                'ext': 'flv',
+                'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)',
+                'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26',
+                'upload_date': '20150106',
+            },
+            'skip': 'Only works from Norway',
+            'params': {
+                'proxy': '127.0.0.1:8118',
+            },
+        },
+        {
+            'url': 'http://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015',
+            'playlist': [
+                {
+                    'md5': '9480285eff92d64f06e02a5367970a7a',
+                    'info_dict': {
+                        'id': 'MSPO40010515-part1',
+                        'ext': 'flv',
+                        'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 1:2)',
+                        'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26',
+                        'upload_date': '20150106',
+                    },
+                },
+                {
+                    'md5': 'adbd1dbd813edaf532b0a253780719c2',
+                    'info_dict': {
+                        'id': 'MSPO40010515-part2',
+                        'ext': 'flv',
+                        'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)',
+                        'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26',
+                        'upload_date': '20150106',
+                    },
+                },
+            ],
+            'info_dict': {
+                'id': 'MSPO40010515',
+                'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn',
+                'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26',
+                'upload_date': '20150106',
+                'duration': 6947.5199999999995,
+            },
+            'skip': 'Only works from Norway',
+            'params': {
+                'proxy': '127.0.0.1:8118',
+            },
+        }
     ]
 
+    def _extract_f4m(self, manifest_url, video_id):
+        return self._extract_f4m_formats(manifest_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124', video_id)
+
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group('id')
-
-        page = self._download_webpage(url, video_id)
-
-        title = self._html_search_meta('title', page, 'title')
-        description = self._html_search_meta('description', page, 'description')
-        thumbnail = self._html_search_regex(r'data-posterimage="([^"]+)"', page, 'thumbnail', fatal=False)
-        upload_date = unified_strdate(self._html_search_meta('rightsfrom', page, 'upload date', fatal=False))
-        duration = float_or_none(
-            self._html_search_regex(r'data-duration="([^"]+)"', page, 'duration', fatal=False))
+        part_id = mobj.group('part_id')
+
+        webpage = self._download_webpage(url, video_id)
+
+        title = self._html_search_meta(
+            'title', webpage, 'title')
+        description = self._html_search_meta(
+            'description', webpage, 'description')
+
+        thumbnail = self._html_search_regex(
+            r'data-posterimage="([^"]+)"',
+            webpage, 'thumbnail', fatal=False)
+        upload_date = unified_strdate(self._html_search_meta(
+            'rightsfrom', webpage, 'upload date', fatal=False))
+        duration = float_or_none(self._html_search_regex(
+            r'data-duration="([^"]+)"',
+            webpage, 'duration', fatal=False))
+
+        # playlist
+        parts = re.findall(
+            r'<a href="#del=(\d+)"[^>]+data-argument="([^"]+)">([^<]+)</a>', webpage)
+        if parts:
+            entries = []
+            for current_part_id, stream_url, part_title in parts:
+                if part_id and current_part_id != part_id:
+                    continue
+                video_part_id = '%s-part%s' % (video_id, current_part_id)
+                formats = self._extract_f4m(stream_url, video_part_id)
+                entries.append({
+                    'id': video_part_id,
+                    'title': part_title,
+                    'description': description,
+                    'thumbnail': thumbnail,
+                    'upload_date': upload_date,
+                    'formats': formats,
+                })
+            if part_id:
+                if entries:
+                    return entries[0]
+            else:
+                playlist = self.playlist_result(entries, video_id, title, description)
+                playlist.update({
+                    'thumbnail': thumbnail,
+                    'upload_date': upload_date,
+                    'duration': duration,
+                })
+                return playlist
 
         formats = []
-        f4m_url = re.search(r'data-media="([^"]+)"', page)
+        f4m_url = re.search(r'data-media="([^"]+)"', webpage)
         if f4m_url:
-            formats.append({
-                'url': f4m_url.group(1) + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124',
-                'format_id': 'f4m',
-                'ext': 'flv',
-            })
+            formats.extend(self._extract_f4m(f4m_url.group(1), video_id))
 
-        m3u8_url = re.search(r'data-hls-media="([^"]+)"', page)
+        m3u8_url = re.search(r'data-hls-media="([^"]+)"', webpage)
         if m3u8_url:
-            formats.append({
-                'url': m3u8_url.group(1),
-                'format_id': 'm3u8',
-            })
+            formats.extend(self._extract_m3u8_formats(m3u8_url.group(1), video_id, 'mp4'))
 
         self._sort_formats(formats)
 