From a504ced097e703a9bc6c18b6e31bcafb4783ed80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 15 Feb 2015 18:03:41 +0100 Subject: Improve subtitles support For each language the extractor builds a list with the available formats sorted (like for video formats), then YoutubeDL selects one of them using the '--sub-format' option which now allows giving the format preferences (for example 'ass/srt/best'). For each format the 'url' field can be set so that we only download the contents if needed, or if the contents needs to be processed (like in crunchyroll) the 'data' field can be used. The reasons for this change are: * We weren't checking that the format given with '--sub-format' was available, checking it in each extractor would be repetitive. * It allows to easily support giving a format preference. * The subtitles were automatically downloaded in the extractor, but I think that if you use for example the '--dump-json' option you want to finish as fast as possible. Currently only the ted extractor has been updated, but the old system still works. --- youtube_dl/YoutubeDL.py | 85 +++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 78 insertions(+), 7 deletions(-) (limited to 'youtube_dl/YoutubeDL.py') diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 13d18e25e..e665e3d53 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -154,7 +154,7 @@ class YoutubeDL(object): allsubtitles: Downloads all the subtitles of the video (requires writesubtitles or writeautomaticsub) listsubtitles: Lists all available subtitles for the video - subtitlesformat: Subtitle format [srt/sbv/vtt] (default=srt) + subtitlesformat: The format code for subtitles subtitleslangs: List of languages of the subtitles to download keepvideo: Keep the video file after post-processing daterange: A DateRange object, download only if the upload_date is in the range. @@ -1019,6 +1019,11 @@ class YoutubeDL(object): info_dict['timestamp']) info_dict['upload_date'] = upload_date.strftime('%Y%m%d') + if self.params.get('listsubtitles', False): + self.list_subtitles(info_dict['id'], info_dict.get('subtitles')) + return + info_dict['subtitles'] = self.process_subtitles(info_dict['id'], info_dict.get('subtitles')) + # This extractors handle format selection themselves if info_dict['extractor'] in ['Youku']: if download: @@ -1147,6 +1152,53 @@ class YoutubeDL(object): info_dict.update(formats_to_download[-1]) return info_dict + def process_subtitles(self, video_id, available_subs): + """Select the requested subtitles and their format""" + if not available_subs: + return available_subs + + if self.params.get('allsubtitles', False): + requested_langs = available_subs.keys() + else: + if self.params.get('subtitleslangs', False): + requested_langs = self.params.get('subtitleslangs') + elif 'en' in available_subs: + requested_langs = ['en'] + else: + requested_langs = [list(available_subs.keys())[0]] + + formats_query = self.params.get('subtitlesformat', 'best') + formats_preference = formats_query.split('/') if formats_query else [] + subs = {} + for lang in requested_langs: + formats = available_subs.get(lang) + if formats is None: + self.report_warning('%s subtitles not available for %s' % (lang, video_id)) + continue + if isinstance(formats, compat_str): + # TODO: convert all IE with subtitles support to the new format + # and remove this + subs[lang] = { + 'ext': formats_preference[0], + 'data': formats, + } + continue + for ext in formats_preference: + if ext == 'best': + f = formats[-1] + break + matches = list(filter(lambda f: f['ext'] == ext, formats)) + if matches: + f = matches[-1] + break + else: + f = formats[-1] + self.report_warning( + 'No subtitle format found matching "%s" for language %s, ' + 'using %s' % (formats_query, lang, f['ext'])) + subs[lang] = f + return subs + def process_info(self, info_dict): """Process a single resolved IE result.""" @@ -1253,11 +1305,18 @@ class YoutubeDL(object): # subtitles download errors are already managed as troubles in relevant IE # that way it will silently go on when used with unsupporting IE subtitles = info_dict['subtitles'] - sub_format = self.params.get('subtitlesformat', 'srt') - for sub_lang in subtitles.keys(): - sub = subtitles[sub_lang] - if sub is None: - continue + for sub_lang, sub_info in subtitles.items(): + sub_format = sub_info['ext'] + if sub_info.get('data') is not None: + sub_data = sub_info['data'] + else: + try: + uf = self.urlopen(sub_info['url']) + sub_data = uf.read().decode('utf-8') + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self.report_warning('Unable to download subtitle for "%s": %s' % + (sub_lang, compat_str(err))) + continue try: sub_filename = subtitles_filename(filename, sub_lang, sub_format) if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)): @@ -1265,7 +1324,7 @@ class YoutubeDL(object): else: self.to_screen('[info] Writing video subtitles to: ' + sub_filename) with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile: - subfile.write(sub) + subfile.write(sub_data) except (OSError, IOError): self.report_error('Cannot write subtitles file ' + sub_filename) return @@ -1586,6 +1645,18 @@ class YoutubeDL(object): ['ID', 'width', 'height', 'URL'], [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails])) + def list_subtitles(self, video_id, subtitles): + if not subtitles: + self.to_screen('%s has no subtitles' % video_id) + return + header_line = 'Language formats' + sub_lines = [ + '%-12s%s' % (lang, ', '.join(f['ext'] for f in reversed(formats))) + for lang, formats in subtitles.items()] + self.to_screen( + 'Available subtitles for %s:\n%s\n%s' % + (video_id, header_line, '\n'.join(sub_lines))) + def urlopen(self, req): """ Start an HTTP download """ -- cgit v1.2.3 From c84dd8a90dcc75547b343449b921b644a2119c4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Mon, 16 Feb 2015 21:12:31 +0100 Subject: [YoutubeDL] store the subtitles to download in the 'requested_subtitles' field We need to keep the orginal subtitles information, so that the '--load-info' option can be used to list or select the subtitles again. We'll also be able to have a separate field for storing the automatic captions info. --- youtube_dl/YoutubeDL.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'youtube_dl/YoutubeDL.py') diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index e665e3d53..8545dc9e9 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1022,7 +1022,7 @@ class YoutubeDL(object): if self.params.get('listsubtitles', False): self.list_subtitles(info_dict['id'], info_dict.get('subtitles')) return - info_dict['subtitles'] = self.process_subtitles(info_dict['id'], info_dict.get('subtitles')) + info_dict['requested_subtitles'] = self.process_subtitles(info_dict['id'], info_dict.get('subtitles')) # This extractors handle format selection themselves if info_dict['extractor'] in ['Youku']: @@ -1301,10 +1301,10 @@ class YoutubeDL(object): subtitles_are_requested = any([self.params.get('writesubtitles', False), self.params.get('writeautomaticsub')]) - if subtitles_are_requested and 'subtitles' in info_dict and info_dict['subtitles']: + if subtitles_are_requested and info_dict.get('requested_subtitles'): # subtitles download errors are already managed as troubles in relevant IE # that way it will silently go on when used with unsupporting IE - subtitles = info_dict['subtitles'] + subtitles = info_dict['requested_subtitles'] for sub_lang, sub_info in subtitles.items(): sub_format = sub_info['ext'] if sub_info.get('data') is not None: -- cgit v1.2.3 From 360e1ca5ccabcb5d48228d9472b09f1bce68bbc4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Mon, 16 Feb 2015 21:44:17 +0100 Subject: [youtube] Convert to new subtitles system The automatic captions are stored in the 'automactic_captions' field, which is used if no normal subtitles are found for an specific language. --- youtube_dl/YoutubeDL.py | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) (limited to 'youtube_dl/YoutubeDL.py') diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 8545dc9e9..a47f8f5de 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1020,9 +1020,13 @@ class YoutubeDL(object): info_dict['upload_date'] = upload_date.strftime('%Y%m%d') if self.params.get('listsubtitles', False): - self.list_subtitles(info_dict['id'], info_dict.get('subtitles')) + if 'automatic_captions' in info_dict: + self.list_subtitles(info_dict['id'], info_dict.get('automatic_captions'), 'automatic captions') + self.list_subtitles(info_dict['id'], info_dict.get('subtitles'), 'subtitles') return - info_dict['requested_subtitles'] = self.process_subtitles(info_dict['id'], info_dict.get('subtitles')) + info_dict['requested_subtitles'] = self.process_subtitles( + info_dict['id'], info_dict.get('subtitles'), + info_dict.get('automatic_captions')) # This extractors handle format selection themselves if info_dict['extractor'] in ['Youku']: @@ -1152,8 +1156,14 @@ class YoutubeDL(object): info_dict.update(formats_to_download[-1]) return info_dict - def process_subtitles(self, video_id, available_subs): + def process_subtitles(self, video_id, available_subs, available_autocaps): """Select the requested subtitles and their format""" + if available_autocaps and self.params.get('writeautomaticsub'): + available_subs = available_subs.copy() + for lang, cap_info in available_autocaps.items(): + if lang not in available_subs: + available_subs[lang] = cap_info + if not available_subs: return available_subs @@ -1645,17 +1655,17 @@ class YoutubeDL(object): ['ID', 'width', 'height', 'URL'], [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails])) - def list_subtitles(self, video_id, subtitles): + def list_subtitles(self, video_id, subtitles, name='subtitles'): if not subtitles: - self.to_screen('%s has no subtitles' % video_id) + self.to_screen('%s has no %s' % (video_id, name)) return header_line = 'Language formats' sub_lines = [ '%-12s%s' % (lang, ', '.join(f['ext'] for f in reversed(formats))) for lang, formats in subtitles.items()] self.to_screen( - 'Available subtitles for %s:\n%s\n%s' % - (video_id, header_line, '\n'.join(sub_lines))) + 'Available %s for %s:\n%s\n%s' % + (name, video_id, header_line, '\n'.join(sub_lines))) def urlopen(self, req): """ Start an HTTP download """ -- cgit v1.2.3 From edab9dbf4d00a7f76fbfd2df9ef4b205c88e47a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Tue, 17 Feb 2015 22:59:19 +0100 Subject: [YoutubeDL] use the 'render_table' function for listing the subtitles --- youtube_dl/YoutubeDL.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'youtube_dl/YoutubeDL.py') diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index a47f8f5de..f8b8fb0c1 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1659,13 +1659,12 @@ class YoutubeDL(object): if not subtitles: self.to_screen('%s has no %s' % (video_id, name)) return - header_line = 'Language formats' - sub_lines = [ - '%-12s%s' % (lang, ', '.join(f['ext'] for f in reversed(formats))) - for lang, formats in subtitles.items()] self.to_screen( - 'Available %s for %s:\n%s\n%s' % - (name, video_id, header_line, '\n'.join(sub_lines))) + 'Available %s for %s:' % (name, video_id)) + self.to_screen(render_table( + ['Language', 'formats'], + [[lang, ', '.join(f['ext'] for f in reversed(formats))] + for lang, formats in subtitles.items()])) def urlopen(self, req): """ Start an HTTP download """ -- cgit v1.2.3 From 4d1718481755dde078678b6e55d457fc6351fcdd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 21 Feb 2015 22:31:53 +0100 Subject: [YoutubeDL] don't set the 'requested_subtitles' without writesubtitles or writeautomaticsub --- youtube_dl/YoutubeDL.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'youtube_dl/YoutubeDL.py') diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index f8b8fb0c1..088b111eb 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1164,8 +1164,10 @@ class YoutubeDL(object): if lang not in available_subs: available_subs[lang] = cap_info - if not available_subs: - return available_subs + if (not self.params.get('writesubtitles') and not + self.params.get('writeautomaticsub') or not + available_subs): + return None if self.params.get('allsubtitles', False): requested_langs = available_subs.keys() -- cgit v1.2.3 From 98c70d6fc7006c8cbbd76fb1b8661d758fc4f5d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 22 Feb 2015 11:37:27 +0100 Subject: [YoutubeDL] only add normal subtitles to the 'requested_subtitles' field if 'writesubtitles' is True --- youtube_dl/YoutubeDL.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'youtube_dl/YoutubeDL.py') diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 088b111eb..7319323e5 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1156,11 +1156,13 @@ class YoutubeDL(object): info_dict.update(formats_to_download[-1]) return info_dict - def process_subtitles(self, video_id, available_subs, available_autocaps): + def process_subtitles(self, video_id, normal_subtitles, automatic_captions): """Select the requested subtitles and their format""" - if available_autocaps and self.params.get('writeautomaticsub'): - available_subs = available_subs.copy() - for lang, cap_info in available_autocaps.items(): + available_subs = {} + if normal_subtitles and self.params.get('writesubtitles'): + available_subs.update(normal_subtitles) + if automatic_captions and self.params.get('writeautomaticsub'): + for lang, cap_info in automatic_captions.items(): if lang not in available_subs: available_subs[lang] = cap_info -- cgit v1.2.3 From b531cfc019576b682f930bd269f68eb87cfd5abf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Mon, 23 Feb 2015 16:12:35 +0100 Subject: [YoutubeDL] remove compatiblity with the old subtitles system --- youtube_dl/YoutubeDL.py | 8 -------- 1 file changed, 8 deletions(-) (limited to 'youtube_dl/YoutubeDL.py') diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 7319323e5..70b364c9b 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1189,14 +1189,6 @@ class YoutubeDL(object): if formats is None: self.report_warning('%s subtitles not available for %s' % (lang, video_id)) continue - if isinstance(formats, compat_str): - # TODO: convert all IE with subtitles support to the new format - # and remove this - subs[lang] = { - 'ext': formats_preference[0], - 'data': formats, - } - continue for ext in formats_preference: if ext == 'best': f = formats[-1] -- cgit v1.2.3