diff options
| -rw-r--r-- | test/test_subtitles.py | 24 | ||||
| -rwxr-xr-x | youtube_dl/YoutubeDL.py | 85 | ||||
| -rw-r--r-- | youtube_dl/__init__.py | 1 | ||||
| -rw-r--r-- | youtube_dl/extractor/common.py | 20 | ||||
| -rw-r--r-- | youtube_dl/extractor/ted.py | 18 | ||||
| -rw-r--r-- | youtube_dl/options.py | 4 | ||||
| -rw-r--r-- | youtube_dl/postprocessor/ffmpeg.py | 6 | 
7 files changed, 121 insertions, 37 deletions
| diff --git a/test/test_subtitles.py b/test/test_subtitles.py index bcc69a778..fbc9eaf4d 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -27,15 +27,23 @@ class BaseTestSubtitles(unittest.TestCase):      def setUp(self):          self.DL = FakeYDL() -        self.ie = self.IE(self.DL) +        self.ie = self.IE() +        self.DL.add_info_extractor(self.ie)      def getInfoDict(self): -        info_dict = self.ie.extract(self.url) +        info_dict = self.DL.extract_info(self.url, download=False)          return info_dict      def getSubtitles(self):          info_dict = self.getInfoDict() -        return info_dict['subtitles'] +        subtitles = info_dict['subtitles'] +        if not subtitles: +            return subtitles +        for sub_info in subtitles.values(): +            if sub_info.get('data') is None: +                uf = self.DL.urlopen(sub_info['url']) +                sub_info['data'] = uf.read().decode('utf-8') +        return dict((l, sub_info['data']) for l, sub_info in subtitles.items())  class TestYoutubeSubtitles(BaseTestSubtitles): @@ -176,7 +184,7 @@ class TestTedSubtitles(BaseTestSubtitles):      def test_no_writesubtitles(self):          subtitles = self.getSubtitles() -        self.assertEqual(subtitles, None) +        self.assertFalse(subtitles)      def test_subtitles(self):          self.DL.params['writesubtitles'] = True @@ -196,18 +204,10 @@ class TestTedSubtitles(BaseTestSubtitles):          self.assertTrue(len(subtitles.keys()) >= 28)      def test_list_subtitles(self): -        self.DL.expect_warning('Automatic Captions not supported by this server')          self.DL.params['listsubtitles'] = True          info_dict = self.getInfoDict()          self.assertEqual(info_dict, None) -    def test_automatic_captions(self): -        self.DL.expect_warning('Automatic Captions not supported by this server') -        self.DL.params['writeautomaticsub'] = True -        self.DL.params['subtitleslang'] = ['en'] -        subtitles = self.getSubtitles() -        self.assertTrue(len(subtitles.keys()) == 0) -      def test_multiple_langs(self):          self.DL.params['writesubtitles'] = True          langs = ['es', 'fr', 'de'] diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 13d18e25e..e665e3d53 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -154,7 +154,7 @@ class YoutubeDL(object):      allsubtitles:      Downloads all the subtitles of the video                         (requires writesubtitles or writeautomaticsub)      listsubtitles:     Lists all available subtitles for the video -    subtitlesformat:   Subtitle format [srt/sbv/vtt] (default=srt) +    subtitlesformat:   The format code for subtitles      subtitleslangs:    List of languages of the subtitles to download      keepvideo:         Keep the video file after post-processing      daterange:         A DateRange object, download only if the upload_date is in the range. @@ -1019,6 +1019,11 @@ class YoutubeDL(object):                  info_dict['timestamp'])              info_dict['upload_date'] = upload_date.strftime('%Y%m%d') +        if self.params.get('listsubtitles', False): +            self.list_subtitles(info_dict['id'], info_dict.get('subtitles')) +            return +        info_dict['subtitles'] = self.process_subtitles(info_dict['id'], info_dict.get('subtitles')) +          # This extractors handle format selection themselves          if info_dict['extractor'] in ['Youku']:              if download: @@ -1147,6 +1152,53 @@ class YoutubeDL(object):          info_dict.update(formats_to_download[-1])          return info_dict +    def process_subtitles(self, video_id, available_subs): +        """Select the requested subtitles and their format""" +        if not available_subs: +            return available_subs + +        if self.params.get('allsubtitles', False): +            requested_langs = available_subs.keys() +        else: +            if self.params.get('subtitleslangs', False): +                requested_langs = self.params.get('subtitleslangs') +            elif 'en' in available_subs: +                requested_langs = ['en'] +            else: +                requested_langs = [list(available_subs.keys())[0]] + +        formats_query = self.params.get('subtitlesformat', 'best') +        formats_preference = formats_query.split('/') if formats_query else [] +        subs = {} +        for lang in requested_langs: +            formats = available_subs.get(lang) +            if formats is None: +                self.report_warning('%s subtitles not available for %s' % (lang, video_id)) +                continue +            if isinstance(formats, compat_str): +                # TODO: convert all IE with subtitles support to the new format +                # and remove this +                subs[lang] = { +                    'ext': formats_preference[0], +                    'data': formats, +                } +                continue +            for ext in formats_preference: +                if ext == 'best': +                    f = formats[-1] +                    break +                matches = list(filter(lambda f: f['ext'] == ext, formats)) +                if matches: +                    f = matches[-1] +                    break +            else: +                f = formats[-1] +                self.report_warning( +                    'No subtitle format found matching "%s" for language %s, ' +                    'using %s' % (formats_query, lang, f['ext'])) +            subs[lang] = f +        return subs +      def process_info(self, info_dict):          """Process a single resolved IE result.""" @@ -1253,11 +1305,18 @@ class YoutubeDL(object):              # subtitles download errors are already managed as troubles in relevant IE              # that way it will silently go on when used with unsupporting IE              subtitles = info_dict['subtitles'] -            sub_format = self.params.get('subtitlesformat', 'srt') -            for sub_lang in subtitles.keys(): -                sub = subtitles[sub_lang] -                if sub is None: -                    continue +            for sub_lang, sub_info in subtitles.items(): +                sub_format = sub_info['ext'] +                if sub_info.get('data') is not None: +                    sub_data = sub_info['data'] +                else: +                    try: +                        uf = self.urlopen(sub_info['url']) +                        sub_data = uf.read().decode('utf-8') +                    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: +                        self.report_warning('Unable to download subtitle for "%s": %s' % +                                            (sub_lang, compat_str(err))) +                        continue                  try:                      sub_filename = subtitles_filename(filename, sub_lang, sub_format)                      if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)): @@ -1265,7 +1324,7 @@ class YoutubeDL(object):                      else:                          self.to_screen('[info] Writing video subtitles to: ' + sub_filename)                          with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile: -                            subfile.write(sub) +                            subfile.write(sub_data)                  except (OSError, IOError):                      self.report_error('Cannot write subtitles file ' + sub_filename)                      return @@ -1586,6 +1645,18 @@ class YoutubeDL(object):              ['ID', 'width', 'height', 'URL'],              [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails])) +    def list_subtitles(self, video_id, subtitles): +        if not subtitles: +            self.to_screen('%s has no subtitles' % video_id) +            return +        header_line = 'Language    formats' +        sub_lines = [ +            '%-12s%s' % (lang, ', '.join(f['ext'] for f in reversed(formats))) +            for lang, formats in subtitles.items()] +        self.to_screen( +            'Available subtitles for %s:\n%s\n%s' % +            (video_id, header_line, '\n'.join(sub_lines))) +      def urlopen(self, req):          """ Start an HTTP download """ diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index ed22f169f..5f2585003 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -226,7 +226,6 @@ def _real_main(argv=None):      if opts.embedsubtitles:          postprocessors.append({              'key': 'FFmpegEmbedSubtitle', -            'subtitlesformat': opts.subtitlesformat,          })      if opts.xattrs:          postprocessors.append({'key': 'XAttrMetadata'}) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index c784eedb9..161c623eb 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -151,8 +151,14 @@ class InfoExtractor(object):                      If not explicitly set, calculated from timestamp.      uploader_id:    Nickname or id of the video uploader.      location:       Physical location where the video was filmed. -    subtitles:      The subtitle file contents as a dictionary in the format -                    {language: subtitles}. +    subtitles:      The available subtitles as a dictionary in the format +                    {language: subformats}. "subformats" is a list sorted from +                    lower to higher preference, each element is a dictionary +                    with the "ext" entry and one of: +                        * "data": The subtitles file contents +                        * "url": A url pointing to the subtitles file +                    Note: YoutubeDL.extract_info will get the requested +                    format and replace the "subformats" list with it.      duration:       Length of the video in seconds, as an integer.      view_count:     How many users have watched the video on the platform.      like_count:     Number of positive ratings of the video @@ -993,6 +999,16 @@ class InfoExtractor(object):              any_restricted = any_restricted or is_restricted          return not any_restricted +    def extract_subtitles(self, *args, **kwargs): +        subtitles = {} +        list_subtitles = self._downloader.params.get('listsubtitles') +        if self._downloader.params.get('writesubtitles', False) or list_subtitles: +            subtitles.update(self._get_subtitles(*args, **kwargs)) +        return subtitles + +    def _get_subtitles(self, *args, **kwargs): +        raise NotImplementedError("This method must be implemented by subclasses") +  class SearchInfoExtractor(InfoExtractor):      """ diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index 10b3b706a..1809eaae4 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -3,14 +3,14 @@ from __future__ import unicode_literals  import json  import re -from .subtitles import SubtitlesInfoExtractor +from .common import InfoExtractor  from ..compat import (      compat_str,  ) -class TEDIE(SubtitlesInfoExtractor): +class TEDIE(InfoExtractor):      _VALID_URL = r'''(?x)          (?P<proto>https?://)          (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/ @@ -165,9 +165,6 @@ class TEDIE(SubtitlesInfoExtractor):          video_id = compat_str(talk_info['id'])          # subtitles          video_subtitles = self.extract_subtitles(video_id, talk_info) -        if self._downloader.params.get('listsubtitles', False): -            self._list_available_subtitles(video_id, talk_info) -            return          thumbnail = talk_info['thumb']          if not thumbnail.startswith('http'): @@ -183,13 +180,18 @@ class TEDIE(SubtitlesInfoExtractor):              'duration': talk_info.get('duration'),          } -    def _get_available_subtitles(self, video_id, talk_info): +    def _get_subtitles(self, video_id, talk_info):          languages = [lang['languageCode'] for lang in talk_info.get('languages', [])]          if languages:              sub_lang_list = {}              for l in languages: -                url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, l) -                sub_lang_list[l] = url +                sub_lang_list[l] = [ +                    { +                        'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, l, ext), +                        'ext': ext, +                    } +                    for ext in ['ted', 'srt'] +                ]              return sub_lang_list          else:              self._downloader.report_warning('video doesn\'t have subtitles') diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 873432bee..4fcf8c83d 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -387,8 +387,8 @@ def parseOpts(overrideArguments=None):          help='lists all available subtitles for the video')      subtitles.add_option(          '--sub-format', -        action='store', dest='subtitlesformat', metavar='FORMAT', default='srt', -        help='subtitle format (default=srt) ([sbv/vtt] youtube only)') +        action='store', dest='subtitlesformat', metavar='FORMAT', default='best', +        help='subtitle format, accepts formats preference, for example: "ass/srt/best"')      subtitles.add_option(          '--sub-lang', '--sub-langs', '--srt-lang',          action='callback', dest='subtitleslangs', metavar='LANGS', type='str', diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 5238ce534..d1bbfbfe3 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -453,10 +453,6 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):          'zu': 'zul',      } -    def __init__(self, downloader=None, subtitlesformat='srt'): -        super(FFmpegEmbedSubtitlePP, self).__init__(downloader) -        self._subformat = subtitlesformat -      @classmethod      def _conver_lang_code(cls, code):          """Convert language code from ISO 639-1 to ISO 639-2/T""" @@ -472,7 +468,7 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):          sub_langs = [key for key in information['subtitles']]          filename = information['filepath'] -        input_files = [filename] + [subtitles_filename(filename, lang, self._subformat) for lang in sub_langs] +        input_files = [filename] + [subtitles_filename(filename, lang, sub_info['ext']) for lang, sub_info in information['subtitles'].items()]          opts = [              '-map', '0', | 
