diff options
-rw-r--r-- | AUTHORS | 1 | ||||
-rw-r--r-- | test/test_subtitles.py | 28 | ||||
-rw-r--r-- | youtube_dl/extractor/ceskatelevize.py | 45 | ||||
-rw-r--r-- | youtube_dl/extractor/nrk.py | 132 | ||||
-rw-r--r-- | youtube_dl/version.py | 2 |
5 files changed, 182 insertions, 26 deletions
@@ -100,3 +100,4 @@ Cédric Luthi Thijs Vermeir Joel Leclerc Christopher Krooss +Ondřej Caletka diff --git a/test/test_subtitles.py b/test/test_subtitles.py index d34565191..6336dd317 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -17,6 +17,7 @@ from youtube_dl.extractor import ( TEDIE, VimeoIE, WallaIE, + CeskaTelevizeIE, ) @@ -317,5 +318,32 @@ class TestWallaSubtitles(BaseTestSubtitles): self.assertEqual(len(subtitles), 0) +class TestCeskaTelevizeSubtitles(BaseTestSubtitles): + url = 'http://www.ceskatelevize.cz/ivysilani/10600540290-u6-uzasny-svet-techniky' + IE = CeskaTelevizeIE + + def test_list_subtitles(self): + self.DL.expect_warning('Automatic Captions not supported by this server') + self.DL.params['listsubtitles'] = True + info_dict = self.getInfoDict() + self.assertEqual(info_dict, None) + + def test_allsubtitles(self): + self.DL.expect_warning('Automatic Captions not supported by this server') + self.DL.params['writesubtitles'] = True + self.DL.params['allsubtitles'] = True + subtitles = self.getSubtitles() + self.assertEqual(set(subtitles.keys()), set(['cs'])) + self.assertEqual(md5(subtitles['cs']), '9bf52d9549533c32c427e264bf0847d4') + + def test_nosubtitles(self): + self.DL.expect_warning('video doesn\'t have subtitles') + self.url = 'http://www.ceskatelevize.cz/ivysilani/ivysilani/10441294653-hyde-park-civilizace/214411058091220' + self.DL.params['writesubtitles'] = True + self.DL.params['allsubtitles'] = True + subtitles = self.getSubtitles() + self.assertEqual(len(subtitles), 0) + + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py index ba8376338..f70e090bb 100644 --- a/youtube_dl/extractor/ceskatelevize.py +++ b/youtube_dl/extractor/ceskatelevize.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import re -from .common import InfoExtractor +from .subtitles import SubtitlesInfoExtractor from ..compat import ( compat_urllib_request, compat_urllib_parse, @@ -15,7 +15,7 @@ from ..utils import ( ) -class CeskaTelevizeIE(InfoExtractor): +class CeskaTelevizeIE(SubtitlesInfoExtractor): _VALID_URL = r'https?://www\.ceskatelevize\.cz/(porady|ivysilani)/(.+/)?(?P<id>[^?#]+)' _TESTS = [ @@ -104,6 +104,17 @@ class CeskaTelevizeIE(InfoExtractor): duration = float_or_none(item.get('duration')) thumbnail = item.get('previewImageUrl') + subtitles = {} + subs = item.get('subtitles') + if subs: + subtitles['cs'] = subs[0]['url'] + + if self._downloader.params.get('listsubtitles', False): + self._list_available_subtitles(video_id, subtitles) + return + + subtitles = self._fix_subtitles(self.extract_subtitles(video_id, subtitles)) + return { 'id': episode_id, 'title': title, @@ -111,4 +122,34 @@ class CeskaTelevizeIE(InfoExtractor): 'thumbnail': thumbnail, 'duration': duration, 'formats': formats, + 'subtitles': subtitles, } + + @staticmethod + def _fix_subtitles(subtitles): + """ Convert millisecond-based subtitles to SRT """ + if subtitles is None: + return subtitles # subtitles not requested + + def _msectotimecode(msec): + """ Helper utility to convert milliseconds to timecode """ + components = [] + for divider in [1000, 60, 60, 100]: + components.append(msec % divider) + msec //= divider + return "{3:02}:{2:02}:{1:02},{0:03}".format(*components) + + def _fix_subtitle(subtitle): + for line in subtitle.splitlines(): + m = re.match(r"^\s*([0-9]+);\s*([0-9]+)\s+([0-9]+)\s*$", line) + if m: + yield m.group(1) + start, stop = (_msectotimecode(int(t)) for t in m.groups()[1:]) + yield "{0} --> {1}".format(start, stop) + else: + yield line + + fixed_subtitles = {} + for k, v in subtitles.items(): + fixed_subtitles[k] = "\r\n".join(_fix_subtitle(v)) + return fixed_subtitles diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 43e8e619f..e950c76dd 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -72,7 +72,7 @@ class NRKIE(InfoExtractor): class NRKTVIE(InfoExtractor): - _VALID_URL = r'http://tv\.nrk(?:super)?\.no/(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})' + _VALID_URL = r'http://tv\.nrk(?:super)?\.no/(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})(?:/\d{2}-\d{2}-\d{4})?(?:#del=(?P<part_id>\d+))?' _TESTS = [ { @@ -85,7 +85,7 @@ class NRKTVIE(InfoExtractor): 'description': 'md5:bdea103bc35494c143c6a9acdd84887a', 'upload_date': '20140523', 'duration': 1741.52, - } + }, }, { 'url': 'http://tv.nrk.no/program/mdfp15000514', @@ -97,39 +97,125 @@ class NRKTVIE(InfoExtractor): 'description': 'md5:654c12511f035aed1e42bdf5db3b206a', 'upload_date': '20140524', 'duration': 4605.0, - } + }, }, + { + # single playlist video + 'url': 'http://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2', + 'md5': 'adbd1dbd813edaf532b0a253780719c2', + 'info_dict': { + 'id': 'MSPO40010515-part2', + 'ext': 'flv', + 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)', + 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', + 'upload_date': '20150106', + }, + 'skip': 'Only works from Norway', + 'params': { + 'proxy': '127.0.0.1:8118', + }, + }, + { + 'url': 'http://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015', + 'playlist': [ + { + 'md5': '9480285eff92d64f06e02a5367970a7a', + 'info_dict': { + 'id': 'MSPO40010515-part1', + 'ext': 'flv', + 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 1:2)', + 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', + 'upload_date': '20150106', + }, + }, + { + 'md5': 'adbd1dbd813edaf532b0a253780719c2', + 'info_dict': { + 'id': 'MSPO40010515-part2', + 'ext': 'flv', + 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)', + 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', + 'upload_date': '20150106', + }, + }, + ], + 'info_dict': { + 'id': 'MSPO40010515', + 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn', + 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', + 'upload_date': '20150106', + 'duration': 6947.5199999999995, + }, + 'skip': 'Only works from Norway', + 'params': { + 'proxy': '127.0.0.1:8118', + }, + } ] + def _extract_f4m(self, manifest_url, video_id): + return self._extract_f4m_formats(manifest_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124', video_id) + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - - page = self._download_webpage(url, video_id) - - title = self._html_search_meta('title', page, 'title') - description = self._html_search_meta('description', page, 'description') - thumbnail = self._html_search_regex(r'data-posterimage="([^"]+)"', page, 'thumbnail', fatal=False) - upload_date = unified_strdate(self._html_search_meta('rightsfrom', page, 'upload date', fatal=False)) - duration = float_or_none( - self._html_search_regex(r'data-duration="([^"]+)"', page, 'duration', fatal=False)) + part_id = mobj.group('part_id') + + webpage = self._download_webpage(url, video_id) + + title = self._html_search_meta( + 'title', webpage, 'title') + description = self._html_search_meta( + 'description', webpage, 'description') + + thumbnail = self._html_search_regex( + r'data-posterimage="([^"]+)"', + webpage, 'thumbnail', fatal=False) + upload_date = unified_strdate(self._html_search_meta( + 'rightsfrom', webpage, 'upload date', fatal=False)) + duration = float_or_none(self._html_search_regex( + r'data-duration="([^"]+)"', + webpage, 'duration', fatal=False)) + + # playlist + parts = re.findall( + r'<a href="#del=(\d+)"[^>]+data-argument="([^"]+)">([^<]+)</a>', webpage) + if parts: + entries = [] + for current_part_id, stream_url, part_title in parts: + if part_id and current_part_id != part_id: + continue + video_part_id = '%s-part%s' % (video_id, current_part_id) + formats = self._extract_f4m(stream_url, video_part_id) + entries.append({ + 'id': video_part_id, + 'title': part_title, + 'description': description, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'formats': formats, + }) + if part_id: + if entries: + return entries[0] + else: + playlist = self.playlist_result(entries, video_id, title, description) + playlist.update({ + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'duration': duration, + }) + return playlist formats = [] - f4m_url = re.search(r'data-media="([^"]+)"', page) + f4m_url = re.search(r'data-media="([^"]+)"', webpage) if f4m_url: - formats.append({ - 'url': f4m_url.group(1) + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124', - 'format_id': 'f4m', - 'ext': 'flv', - }) + formats.extend(self._extract_f4m(f4m_url.group(1), video_id)) - m3u8_url = re.search(r'data-hls-media="([^"]+)"', page) + m3u8_url = re.search(r'data-hls-media="([^"]+)"', webpage) if m3u8_url: - formats.append({ - 'url': m3u8_url.group(1), - 'format_id': 'm3u8', - }) + formats.extend(self._extract_m3u8_formats(m3u8_url.group(1), video_id, 'mp4')) self._sort_formats(formats) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 2124e954f..f9c934351 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.01.05.1' +__version__ = '2015.01.07' |