about | summary | refs | log | tree | commit | diff
path: root/youtube_dl/extractor
diff options
context:
space:
mode:
author: Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com> 2015-02-15 18:03:41 +0100
committer: Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com> 2015-02-16 21:51:03 +0100
commit: a504ced097e703a9bc6c18b6e31bcafb4783ed80 (patch)
tree: 1d520371df47be5f2c62aaee78dbced5d6b05d08 /youtube_dl/extractor
parent: 8fb474fb17a64ff2aa9f6315ebbc99ae7938c4e1 (diff)
download: youtube-dl-a504ced097e703a9bc6c18b6e31bcafb4783ed80.tar.xz
Improve subtitles support
For each language, the extractor builds a sorted list of the available formats (as is done for video formats); YoutubeDL then selects one of them using the '--sub-format' option, which now accepts a list of format preferences (for example 'ass/srt/best'). For each format, the 'url' field can be set so that the contents are only downloaded if needed; if the contents need to be processed (as in crunchyroll), the 'data' field can be used instead.

The reasons for this change are:
* We weren't checking that the format given with '--sub-format' was available, and checking it in each extractor would be repetitive.
* It makes it easy to support giving a format preference.
* The subtitles were automatically downloaded in the extractor, but I think that if you use, for example, the '--dump-json' option, you want to finish as fast as possible.

Currently only the ted extractor has been updated, but the old system still works.
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r-- youtube_dl/extractor/common.py | 20
-rw-r--r-- youtube_dl/extractor/ted.py | 18
2 files changed, 28 insertions, 10 deletions
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index c784eedb9..161c623eb 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -151,8 +151,14 @@ class InfoExtractor(object):
If not explicitly set, calculated from timestamp.
uploader_id: Nickname or id of the video uploader.
location: Physical location where the video was filmed.
- subtitles: The subtitle file contents as a dictionary in the format
- {language: subtitles}.
+ subtitles: The available subtitles as a dictionary in the format
+ {language: subformats}. "subformats" is a list sorted from
+ lower to higher preference, each element is a dictionary
+ with the "ext" entry and one of:
+ * "data": The subtitles file contents
+ * "url": A url pointing to the subtitles file
+ Note: YoutubeDL.extract_info will get the requested
+ format and replace the "subformats" list with it.
duration: Length of the video in seconds, as an integer.
view_count: How many users have watched the video on the platform.
like_count: Number of positive ratings of the video
@@ -993,6 +999,16 @@ class InfoExtractor(object):
any_restricted = any_restricted or is_restricted
return not any_restricted
+ def extract_subtitles(self, *args, **kwargs):
+ subtitles = {}
+ list_subtitles = self._downloader.params.get('listsubtitles')
+ if self._downloader.params.get('writesubtitles', False) or list_subtitles:
+ subtitles.update(self._get_subtitles(*args, **kwargs))
+ return subtitles
+
+ def _get_subtitles(self, *args, **kwargs):
+ raise NotImplementedError("This method must be implemented by subclasses")
+
class SearchInfoExtractor(InfoExtractor):
"""
diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py
index 10b3b706a..1809eaae4 100644
--- a/youtube_dl/extractor/ted.py
+++ b/youtube_dl/extractor/ted.py
@@ -3,14 +3,14 @@ from __future__ import unicode_literals
import json
import re
-from .subtitles import SubtitlesInfoExtractor
+from .common import InfoExtractor
from ..compat import (
compat_str,
)
-class TEDIE(SubtitlesInfoExtractor):
+class TEDIE(InfoExtractor):
_VALID_URL = r'''(?x)
(?P<proto>https?://)
(?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/
@@ -165,9 +165,6 @@ class TEDIE(SubtitlesInfoExtractor):
video_id = compat_str(talk_info['id'])
# subtitles
video_subtitles = self.extract_subtitles(video_id, talk_info)
- if self._downloader.params.get('listsubtitles', False):
- self._list_available_subtitles(video_id, talk_info)
- return
thumbnail = talk_info['thumb']
if not thumbnail.startswith('http'):
@@ -183,13 +180,18 @@ class TEDIE(SubtitlesInfoExtractor):
'duration': talk_info.get('duration'),
}
- def _get_available_subtitles(self, video_id, talk_info):
+ def _get_subtitles(self, video_id, talk_info):
languages = [lang['languageCode'] for lang in talk_info.get('languages', [])]
if languages:
sub_lang_list = {}
for l in languages:
- url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, l)
- sub_lang_list[l] = url
+ sub_lang_list[l] = [
+ {
+ 'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, l, ext),
+ 'ext': ext,
+ }
+ for ext in ['ted', 'srt']
+ ]
return sub_lang_list
else:
self._downloader.report_warning('video doesn\'t have subtitles')