diff options
29 files changed, 533 insertions, 555 deletions
| @@ -43,7 +43,7 @@ test:  ot: offlinetest  offlinetest: codetest -	nosetests --verbose test --exclude test_download --exclude test_age_restriction --exclude test_subtitles --exclude test_write_annotations --exclude test_youtube_lists +	nosetests --verbose test --exclude test_download.py --exclude test_age_restriction.py --exclude test_subtitles.py --exclude test_write_annotations.py --exclude test_youtube_lists.py  tar: youtube-dl.tar.gz diff --git a/test/parameters.json b/test/parameters.json index af77b89b4..cbff9bd16 100644 --- a/test/parameters.json +++ b/test/parameters.json @@ -28,7 +28,7 @@      "retries": 10,       "simulate": false,       "subtitleslang": null,  -    "subtitlesformat": "srt", +    "subtitlesformat": "best",      "test": true,       "updatetime": true,       "usenetrc": false,  diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index b1cd6a69f..055e42555 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -337,6 +337,65 @@ class TestFormatSelection(unittest.TestCase):          downloaded = ydl.downloaded_info_dicts[0]          self.assertEqual(downloaded['format_id'], 'G') +    def test_subtitles(self): +        def s_formats(lang, autocaption=False): +            return [{ +                'ext': ext, +                'url': 'http://localhost/video.%s.%s' % (lang, ext), +                '_auto': autocaption, +            } for ext in ['vtt', 'srt', 'ass']] +        subtitles = dict((l, s_formats(l)) for l in ['en', 'fr', 'es']) +        auto_captions = dict((l, s_formats(l, True)) for l in ['it', 'pt', 'es']) +        info_dict = { +            'id': 'test', +            'title': 'Test', +            'url': 'http://localhost/video.mp4', +            'subtitles': subtitles, +            'automatic_captions': auto_captions, +            'extractor': 'TEST', +        } + +        def get_info(params={}): +            params.setdefault('simulate', True) +            ydl = YDL(params) +            ydl.report_warning = lambda *args, **kargs: None +            return ydl.process_video_result(info_dict, download=False) + +        result = get_info() +        self.assertFalse(result.get('requested_subtitles')) +        self.assertEqual(result['subtitles'], subtitles) +        self.assertEqual(result['automatic_captions'], auto_captions) + +        result = get_info({'writesubtitles': True}) +        subs = result['requested_subtitles'] +        self.assertTrue(subs) +        self.assertEqual(set(subs.keys()), set(['en'])) +        self.assertTrue(subs['en'].get('data') is None) +        self.assertEqual(subs['en']['ext'], 'ass') + +        result = get_info({'writesubtitles': True, 'subtitlesformat': 'foo/srt'}) +        subs = result['requested_subtitles'] +        self.assertEqual(subs['en']['ext'], 'srt') + +        result = get_info({'writesubtitles': True, 'subtitleslangs': ['es', 'fr', 'it']}) +        subs = result['requested_subtitles'] +        self.assertTrue(subs) +        self.assertEqual(set(subs.keys()), set(['es', 'fr'])) + +        result = get_info({'writesubtitles': True, 'writeautomaticsub': True, 'subtitleslangs': ['es', 'pt']}) +        subs = result['requested_subtitles'] +        self.assertTrue(subs) +        self.assertEqual(set(subs.keys()), set(['es', 'pt'])) +        self.assertFalse(subs['es']['_auto']) +        self.assertTrue(subs['pt']['_auto']) + +        result = get_info({'writeautomaticsub': True, 'subtitleslangs': ['es', 'pt']}) +        subs = result['requested_subtitles'] +        self.assertTrue(subs) +        self.assertEqual(set(subs.keys()), set(['es', 'pt'])) +        self.assertTrue(subs['es']['_auto']) +        self.assertTrue(subs['pt']['_auto']) +      def test_add_extra_info(self):          test_dict = {              'extractor': 'Foo', diff --git a/test/test_subtitles.py b/test/test_subtitles.py index bcc69a778..7f93f0a75 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -18,6 +18,13 @@ from youtube_dl.extractor import (      VimeoIE,      WallaIE,      CeskaTelevizeIE, +    LyndaIE, +    NPOIE, +    ComedyCentralIE, +    NRKTVIE, +    RaiIE, +    VikiIE, +    ThePlatformIE,  ) @@ -27,42 +34,38 @@ class BaseTestSubtitles(unittest.TestCase):      def setUp(self):          self.DL = FakeYDL() -        self.ie = self.IE(self.DL) +        self.ie = self.IE() +        self.DL.add_info_extractor(self.ie)      def getInfoDict(self): -        info_dict = self.ie.extract(self.url) +        info_dict = self.DL.extract_info(self.url, download=False)          return info_dict      def getSubtitles(self):          info_dict = self.getInfoDict() -        return info_dict['subtitles'] +        subtitles = info_dict['requested_subtitles'] +        if not subtitles: +            return subtitles +        for sub_info in subtitles.values(): +            if sub_info.get('data') is None: +                uf = self.DL.urlopen(sub_info['url']) +                sub_info['data'] = uf.read().decode('utf-8') +        return dict((l, sub_info['data']) for l, sub_info in subtitles.items())  class TestYoutubeSubtitles(BaseTestSubtitles):      url = 'QRS8MkLhQmM'      IE = YoutubeIE -    def test_youtube_no_writesubtitles(self): -        self.DL.params['writesubtitles'] = False -        subtitles = self.getSubtitles() -        self.assertEqual(subtitles, None) - -    def test_youtube_subtitles(self): -        self.DL.params['writesubtitles'] = True -        subtitles = self.getSubtitles() -        self.assertEqual(md5(subtitles['en']), '4cd9278a35ba2305f47354ee13472260') - -    def test_youtube_subtitles_lang(self): -        self.DL.params['writesubtitles'] = True -        self.DL.params['subtitleslangs'] = ['it'] -        subtitles = self.getSubtitles() -        self.assertEqual(md5(subtitles['it']), '164a51f16f260476a05b50fe4c2f161d') -      def test_youtube_allsubtitles(self):          self.DL.params['writesubtitles'] = True          self.DL.params['allsubtitles'] = True          subtitles = self.getSubtitles()          self.assertEqual(len(subtitles.keys()), 13) +        self.assertEqual(md5(subtitles['en']), '4cd9278a35ba2305f47354ee13472260') +        self.assertEqual(md5(subtitles['it']), '164a51f16f260476a05b50fe4c2f161d') +        for lang in ['it', 'fr', 'de']: +            self.assertTrue(subtitles.get(lang) is not None, 'Subtitles for \'%s\' not extracted' % lang)      def test_youtube_subtitles_sbv_format(self):          self.DL.params['writesubtitles'] = True @@ -76,12 +79,6 @@ class TestYoutubeSubtitles(BaseTestSubtitles):          subtitles = self.getSubtitles()          self.assertEqual(md5(subtitles['en']), '3cb210999d3e021bd6c7f0ea751eab06') -    def test_youtube_list_subtitles(self): -        self.DL.expect_warning('Video doesn\'t have automatic captions') -        self.DL.params['listsubtitles'] = True -        info_dict = self.getInfoDict() -        self.assertEqual(info_dict, None) -      def test_youtube_automatic_captions(self):          self.url = '8YoUxe5ncPo'          self.DL.params['writeautomaticsub'] = True @@ -103,55 +100,22 @@ class TestYoutubeSubtitles(BaseTestSubtitles):          self.DL.params['writesubtitles'] = True          self.DL.params['allsubtitles'] = True          subtitles = self.getSubtitles() -        self.assertEqual(len(subtitles), 0) - -    def test_youtube_multiple_langs(self): -        self.url = 'QRS8MkLhQmM' -        self.DL.params['writesubtitles'] = True -        langs = ['it', 'fr', 'de'] -        self.DL.params['subtitleslangs'] = langs -        subtitles = self.getSubtitles() -        for lang in langs: -            self.assertTrue(subtitles.get(lang) is not None, 'Subtitles for \'%s\' not extracted' % lang) +        self.assertFalse(subtitles)  class TestDailymotionSubtitles(BaseTestSubtitles):      url = 'http://www.dailymotion.com/video/xczg00'      IE = DailymotionIE -    def test_no_writesubtitles(self): -        subtitles = self.getSubtitles() -        self.assertEqual(subtitles, None) - -    def test_subtitles(self): -        self.DL.params['writesubtitles'] = True -        subtitles = self.getSubtitles() -        self.assertEqual(md5(subtitles['en']), '976553874490cba125086bbfea3ff76f') - -    def test_subtitles_lang(self): -        self.DL.params['writesubtitles'] = True -        self.DL.params['subtitleslangs'] = ['fr'] -        subtitles = self.getSubtitles() -        self.assertEqual(md5(subtitles['fr']), '594564ec7d588942e384e920e5341792') -      def test_allsubtitles(self):          self.DL.params['writesubtitles'] = True          self.DL.params['allsubtitles'] = True          subtitles = self.getSubtitles()          self.assertTrue(len(subtitles.keys()) >= 6) - -    def test_list_subtitles(self): -        self.DL.expect_warning('Automatic Captions not supported by this server') -        self.DL.params['listsubtitles'] = True -        info_dict = self.getInfoDict() -        self.assertEqual(info_dict, None) - -    def test_automatic_captions(self): -        self.DL.expect_warning('Automatic Captions not supported by this server') -        self.DL.params['writeautomaticsub'] = True -        self.DL.params['subtitleslang'] = ['en'] -        subtitles = self.getSubtitles() -        self.assertTrue(len(subtitles.keys()) == 0) +        self.assertEqual(md5(subtitles['en']), '976553874490cba125086bbfea3ff76f') +        self.assertEqual(md5(subtitles['fr']), '594564ec7d588942e384e920e5341792') +        for lang in ['es', 'fr', 'de']: +            self.assertTrue(subtitles.get(lang) is not None, 'Subtitles for \'%s\' not extracted' % lang)      def test_nosubtitles(self):          self.DL.expect_warning('video doesn\'t have subtitles') @@ -159,61 +123,21 @@ class TestDailymotionSubtitles(BaseTestSubtitles):          self.DL.params['writesubtitles'] = True          self.DL.params['allsubtitles'] = True          subtitles = self.getSubtitles() -        self.assertEqual(len(subtitles), 0) - -    def test_multiple_langs(self): -        self.DL.params['writesubtitles'] = True -        langs = ['es', 'fr', 'de'] -        self.DL.params['subtitleslangs'] = langs -        subtitles = self.getSubtitles() -        for lang in langs: -            self.assertTrue(subtitles.get(lang) is not None, 'Subtitles for \'%s\' not extracted' % lang) +        self.assertFalse(subtitles)  class TestTedSubtitles(BaseTestSubtitles):      url = 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html'      IE = TEDIE -    def test_no_writesubtitles(self): -        subtitles = self.getSubtitles() -        self.assertEqual(subtitles, None) - -    def test_subtitles(self): -        self.DL.params['writesubtitles'] = True -        subtitles = self.getSubtitles() -        self.assertEqual(md5(subtitles['en']), '4262c1665ff928a2dada178f62cb8d14') - -    def test_subtitles_lang(self): -        self.DL.params['writesubtitles'] = True -        self.DL.params['subtitleslangs'] = ['fr'] -        subtitles = self.getSubtitles() -        self.assertEqual(md5(subtitles['fr']), '66a63f7f42c97a50f8c0e90bc7797bb5') -      def test_allsubtitles(self):          self.DL.params['writesubtitles'] = True          self.DL.params['allsubtitles'] = True          subtitles = self.getSubtitles()          self.assertTrue(len(subtitles.keys()) >= 28) - -    def test_list_subtitles(self): -        self.DL.expect_warning('Automatic Captions not supported by this server') -        self.DL.params['listsubtitles'] = True -        info_dict = self.getInfoDict() -        self.assertEqual(info_dict, None) - -    def test_automatic_captions(self): -        self.DL.expect_warning('Automatic Captions not supported by this server') -        self.DL.params['writeautomaticsub'] = True -        self.DL.params['subtitleslang'] = ['en'] -        subtitles = self.getSubtitles() -        self.assertTrue(len(subtitles.keys()) == 0) - -    def test_multiple_langs(self): -        self.DL.params['writesubtitles'] = True -        langs = ['es', 'fr', 'de'] -        self.DL.params['subtitleslangs'] = langs -        subtitles = self.getSubtitles() -        for lang in langs: +        self.assertEqual(md5(subtitles['en']), '4262c1665ff928a2dada178f62cb8d14') +        self.assertEqual(md5(subtitles['fr']), '66a63f7f42c97a50f8c0e90bc7797bb5') +        for lang in ['es', 'fr', 'de']:              self.assertTrue(subtitles.get(lang) is not None, 'Subtitles for \'%s\' not extracted' % lang) @@ -221,14 +145,7 @@ class TestBlipTVSubtitles(BaseTestSubtitles):      url = 'http://blip.tv/a/a-6603250'      IE = BlipTVIE -    def test_list_subtitles(self): -        self.DL.expect_warning('Automatic Captions not supported by this server') -        self.DL.params['listsubtitles'] = True -        info_dict = self.getInfoDict() -        self.assertEqual(info_dict, None) -      def test_allsubtitles(self): -        self.DL.expect_warning('Automatic Captions not supported by this server')          self.DL.params['writesubtitles'] = True          self.DL.params['allsubtitles'] = True          subtitles = self.getSubtitles() @@ -240,39 +157,13 @@ class TestVimeoSubtitles(BaseTestSubtitles):      url = 'http://vimeo.com/76979871'      IE = VimeoIE -    def test_no_writesubtitles(self): -        subtitles = self.getSubtitles() -        self.assertEqual(subtitles, None) - -    def test_subtitles(self): -        self.DL.params['writesubtitles'] = True -        subtitles = self.getSubtitles() -        self.assertEqual(md5(subtitles['en']), '8062383cf4dec168fc40a088aa6d5888') - -    def test_subtitles_lang(self): -        self.DL.params['writesubtitles'] = True -        self.DL.params['subtitleslangs'] = ['fr'] -        subtitles = self.getSubtitles() -        self.assertEqual(md5(subtitles['fr']), 'b6191146a6c5d3a452244d853fde6dc8') -      def test_allsubtitles(self):          self.DL.params['writesubtitles'] = True          self.DL.params['allsubtitles'] = True          subtitles = self.getSubtitles()          self.assertEqual(set(subtitles.keys()), set(['de', 'en', 'es', 'fr'])) - -    def test_list_subtitles(self): -        self.DL.expect_warning('Automatic Captions not supported by this server') -        self.DL.params['listsubtitles'] = True -        info_dict = self.getInfoDict() -        self.assertEqual(info_dict, None) - -    def test_automatic_captions(self): -        self.DL.expect_warning('Automatic Captions not supported by this server') -        self.DL.params['writeautomaticsub'] = True -        self.DL.params['subtitleslang'] = ['en'] -        subtitles = self.getSubtitles() -        self.assertTrue(len(subtitles.keys()) == 0) +        self.assertEqual(md5(subtitles['en']), '8062383cf4dec168fc40a088aa6d5888') +        self.assertEqual(md5(subtitles['fr']), 'b6191146a6c5d3a452244d853fde6dc8')      def test_nosubtitles(self):          self.DL.expect_warning('video doesn\'t have subtitles') @@ -280,27 +171,13 @@ class TestVimeoSubtitles(BaseTestSubtitles):          self.DL.params['writesubtitles'] = True          self.DL.params['allsubtitles'] = True          subtitles = self.getSubtitles() -        self.assertEqual(len(subtitles), 0) - -    def test_multiple_langs(self): -        self.DL.params['writesubtitles'] = True -        langs = ['es', 'fr', 'de'] -        self.DL.params['subtitleslangs'] = langs -        subtitles = self.getSubtitles() -        for lang in langs: -            self.assertTrue(subtitles.get(lang) is not None, 'Subtitles for \'%s\' not extracted' % lang) +        self.assertFalse(subtitles)  class TestWallaSubtitles(BaseTestSubtitles):      url = 'http://vod.walla.co.il/movie/2705958/the-yes-men'      IE = WallaIE -    def test_list_subtitles(self): -        self.DL.expect_warning('Automatic Captions not supported by this server') -        self.DL.params['listsubtitles'] = True -        info_dict = self.getInfoDict() -        self.assertEqual(info_dict, None) -      def test_allsubtitles(self):          self.DL.expect_warning('Automatic Captions not supported by this server')          self.DL.params['writesubtitles'] = True @@ -315,19 +192,13 @@ class TestWallaSubtitles(BaseTestSubtitles):          self.DL.params['writesubtitles'] = True          self.DL.params['allsubtitles'] = True          subtitles = self.getSubtitles() -        self.assertEqual(len(subtitles), 0) +        self.assertFalse(subtitles)  class TestCeskaTelevizeSubtitles(BaseTestSubtitles):      url = 'http://www.ceskatelevize.cz/ivysilani/10600540290-u6-uzasny-svet-techniky'      IE = CeskaTelevizeIE -    def test_list_subtitles(self): -        self.DL.expect_warning('Automatic Captions not supported by this server') -        self.DL.params['listsubtitles'] = True -        info_dict = self.getInfoDict() -        self.assertEqual(info_dict, None) -      def test_allsubtitles(self):          self.DL.expect_warning('Automatic Captions not supported by this server')          self.DL.params['writesubtitles'] = True @@ -342,7 +213,96 @@ class TestCeskaTelevizeSubtitles(BaseTestSubtitles):          self.DL.params['writesubtitles'] = True          self.DL.params['allsubtitles'] = True          subtitles = self.getSubtitles() -        self.assertEqual(len(subtitles), 0) +        self.assertFalse(subtitles) + + +class TestLyndaSubtitles(BaseTestSubtitles): +    url = 'http://www.lynda.com/Bootstrap-tutorials/Using-exercise-files/110885/114408-4.html' +    IE = LyndaIE + +    def test_allsubtitles(self): +        self.DL.params['writesubtitles'] = True +        self.DL.params['allsubtitles'] = True +        subtitles = self.getSubtitles() +        self.assertEqual(set(subtitles.keys()), set(['en'])) +        self.assertEqual(md5(subtitles['en']), '09bbe67222259bed60deaa26997d73a7') + + +class TestNPOSubtitles(BaseTestSubtitles): +    url = 'http://www.npo.nl/nos-journaal/28-08-2014/POW_00722860' +    IE = NPOIE + +    def test_allsubtitles(self): +        self.DL.params['writesubtitles'] = True +        self.DL.params['allsubtitles'] = True +        subtitles = self.getSubtitles() +        self.assertEqual(set(subtitles.keys()), set(['nl'])) +        self.assertEqual(md5(subtitles['nl']), 'fc6435027572b63fb4ab143abd5ad3f4') + + +class TestMTVSubtitles(BaseTestSubtitles): +    url = 'http://www.cc.com/video-clips/kllhuv/stand-up-greg-fitzsimmons--uncensored---too-good-of-a-mother' +    IE = ComedyCentralIE + +    def getInfoDict(self): +        return super(TestMTVSubtitles, self).getInfoDict()['entries'][0] + +    def test_allsubtitles(self): +        self.DL.params['writesubtitles'] = True +        self.DL.params['allsubtitles'] = True +        subtitles = self.getSubtitles() +        self.assertEqual(set(subtitles.keys()), set(['en'])) +        self.assertEqual(md5(subtitles['en']), 'b9f6ca22a6acf597ec76f61749765e65') + + +class TestNRKSubtitles(BaseTestSubtitles): +    url = 'http://tv.nrk.no/serie/ikke-gjoer-dette-hjemme/DMPV73000411/sesong-2/episode-1' +    IE = NRKTVIE + +    def test_allsubtitles(self): +        self.DL.params['writesubtitles'] = True +        self.DL.params['allsubtitles'] = True +        subtitles = self.getSubtitles() +        self.assertEqual(set(subtitles.keys()), set(['no'])) +        self.assertEqual(md5(subtitles['no']), '1d221e6458c95c5494dcd38e6a1f129a') + + +class TestRaiSubtitles(BaseTestSubtitles): +    url = 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-cb27157f-9dd0-4aee-b788-b1f67643a391.html' +    IE = RaiIE + +    def test_allsubtitles(self): +        self.DL.params['writesubtitles'] = True +        self.DL.params['allsubtitles'] = True +        subtitles = self.getSubtitles() +        self.assertEqual(set(subtitles.keys()), set(['it'])) +        self.assertEqual(md5(subtitles['it']), 'b1d90a98755126b61e667567a1f6680a') + + +class TestVikiSubtitles(BaseTestSubtitles): +    url = 'http://www.viki.com/videos/1060846v-punch-episode-18' +    IE = VikiIE + +    def test_allsubtitles(self): +        self.DL.params['writesubtitles'] = True +        self.DL.params['allsubtitles'] = True +        subtitles = self.getSubtitles() +        self.assertEqual(set(subtitles.keys()), set(['en'])) +        self.assertEqual(md5(subtitles['en']), '53cb083a5914b2d84ef1ab67b880d18a') + + +class TestThePlatformSubtitles(BaseTestSubtitles): +    # from http://www.3playmedia.com/services-features/tools/integrations/theplatform/ +    # (see http://theplatform.com/about/partners/type/subtitles-closed-captioning/) +    url = 'theplatform:JFUjUE1_ehvq' +    IE = ThePlatformIE + +    def test_allsubtitles(self): +        self.DL.params['writesubtitles'] = True +        self.DL.params['allsubtitles'] = True +        subtitles = self.getSubtitles() +        self.assertEqual(set(subtitles.keys()), set(['en'])) +        self.assertEqual(md5(subtitles['en']), '97e7670cbae3c4d26ae8bcc7fdd78d4b')  if __name__ == '__main__': diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index ca7c3f5c6..76fc394bc 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -154,7 +154,7 @@ class YoutubeDL(object):      allsubtitles:      Downloads all the subtitles of the video                         (requires writesubtitles or writeautomaticsub)      listsubtitles:     Lists all available subtitles for the video -    subtitlesformat:   Subtitle format [srt/sbv/vtt] (default=srt) +    subtitlesformat:   The format code for subtitles      subtitleslangs:    List of languages of the subtitles to download      keepvideo:         Keep the video file after post-processing      daterange:         A DateRange object, download only if the upload_date is in the range. @@ -1008,6 +1008,15 @@ class YoutubeDL(object):                  info_dict['timestamp'])              info_dict['upload_date'] = upload_date.strftime('%Y%m%d') +        if self.params.get('listsubtitles', False): +            if 'automatic_captions' in info_dict: +                self.list_subtitles(info_dict['id'], info_dict.get('automatic_captions'), 'automatic captions') +            self.list_subtitles(info_dict['id'], info_dict.get('subtitles'), 'subtitles') +            return +        info_dict['requested_subtitles'] = self.process_subtitles( +            info_dict['id'], info_dict.get('subtitles'), +            info_dict.get('automatic_captions')) +          # This extractors handle format selection themselves          if info_dict['extractor'] in ['Youku']:              if download: @@ -1136,6 +1145,55 @@ class YoutubeDL(object):          info_dict.update(formats_to_download[-1])          return info_dict +    def process_subtitles(self, video_id, normal_subtitles, automatic_captions): +        """Select the requested subtitles and their format""" +        available_subs = {} +        if normal_subtitles and self.params.get('writesubtitles'): +            available_subs.update(normal_subtitles) +        if automatic_captions and self.params.get('writeautomaticsub'): +            for lang, cap_info in automatic_captions.items(): +                if lang not in available_subs: +                    available_subs[lang] = cap_info + +        if (not self.params.get('writesubtitles') and not +                self.params.get('writeautomaticsub') or not +                available_subs): +            return None + +        if self.params.get('allsubtitles', False): +            requested_langs = available_subs.keys() +        else: +            if self.params.get('subtitleslangs', False): +                requested_langs = self.params.get('subtitleslangs') +            elif 'en' in available_subs: +                requested_langs = ['en'] +            else: +                requested_langs = [list(available_subs.keys())[0]] + +        formats_query = self.params.get('subtitlesformat', 'best') +        formats_preference = formats_query.split('/') if formats_query else [] +        subs = {} +        for lang in requested_langs: +            formats = available_subs.get(lang) +            if formats is None: +                self.report_warning('%s subtitles not available for %s' % (lang, video_id)) +                continue +            for ext in formats_preference: +                if ext == 'best': +                    f = formats[-1] +                    break +                matches = list(filter(lambda f: f['ext'] == ext, formats)) +                if matches: +                    f = matches[-1] +                    break +            else: +                f = formats[-1] +                self.report_warning( +                    'No subtitle format found matching "%s" for language %s, ' +                    'using %s' % (formats_query, lang, f['ext'])) +            subs[lang] = f +        return subs +      def process_info(self, info_dict):          """Process a single resolved IE result.""" @@ -1238,15 +1296,22 @@ class YoutubeDL(object):          subtitles_are_requested = any([self.params.get('writesubtitles', False),                                         self.params.get('writeautomaticsub')]) -        if subtitles_are_requested and 'subtitles' in info_dict and info_dict['subtitles']: +        if subtitles_are_requested and info_dict.get('requested_subtitles'):              # subtitles download errors are already managed as troubles in relevant IE              # that way it will silently go on when used with unsupporting IE -            subtitles = info_dict['subtitles'] -            sub_format = self.params.get('subtitlesformat', 'srt') -            for sub_lang in subtitles.keys(): -                sub = subtitles[sub_lang] -                if sub is None: -                    continue +            subtitles = info_dict['requested_subtitles'] +            for sub_lang, sub_info in subtitles.items(): +                sub_format = sub_info['ext'] +                if sub_info.get('data') is not None: +                    sub_data = sub_info['data'] +                else: +                    try: +                        uf = self.urlopen(sub_info['url']) +                        sub_data = uf.read().decode('utf-8') +                    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: +                        self.report_warning('Unable to download subtitle for "%s": %s' % +                                            (sub_lang, compat_str(err))) +                        continue                  try:                      sub_filename = subtitles_filename(filename, sub_lang, sub_format)                      if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)): @@ -1254,7 +1319,7 @@ class YoutubeDL(object):                      else:                          self.to_screen('[info] Writing video subtitles to: ' + sub_filename)                          with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile: -                            subfile.write(sub) +                            subfile.write(sub_data)                  except (OSError, IOError):                      self.report_error('Cannot write subtitles file ' + sub_filename)                      return @@ -1564,6 +1629,17 @@ class YoutubeDL(object):              ['ID', 'width', 'height', 'URL'],              [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails])) +    def list_subtitles(self, video_id, subtitles, name='subtitles'): +        if not subtitles: +            self.to_screen('%s has no %s' % (video_id, name)) +            return +        self.to_screen( +            'Available %s for %s:' % (name, video_id)) +        self.to_screen(render_table( +            ['Language', 'formats'], +            [[lang, ', '.join(f['ext'] for f in reversed(formats))] +                for lang, formats in subtitles.items()])) +      def urlopen(self, req):          """ Start an HTTP download """ diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 25ab3fdfe..5ce201800 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -226,7 +226,6 @@ def _real_main(argv=None):      if opts.embedsubtitles:          postprocessors.append({              'key': 'FFmpegEmbedSubtitle', -            'subtitlesformat': opts.subtitlesformat,          })      if opts.xattrs:          postprocessors.append({'key': 'XAttrMetadata'}) diff --git a/youtube_dl/extractor/atresplayer.py b/youtube_dl/extractor/atresplayer.py index f016368fa..7669e0e3d 100644 --- a/youtube_dl/extractor/atresplayer.py +++ b/youtube_dl/extractor/atresplayer.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals  import time  import hmac -from .subtitles import SubtitlesInfoExtractor +from .common import InfoExtractor  from ..compat import (      compat_str,      compat_urllib_parse, @@ -17,7 +17,7 @@ from ..utils import (  ) -class AtresPlayerIE(SubtitlesInfoExtractor): +class AtresPlayerIE(InfoExtractor):      _VALID_URL = r'https?://(?:www\.)?atresplayer\.com/television/[^/]+/[^/]+/[^/]+/(?P<id>.+?)_\d+\.html'      _TESTS = [          { @@ -144,13 +144,12 @@ class AtresPlayerIE(SubtitlesInfoExtractor):          thumbnail = xpath_text(episode, './media/asset/files/background', 'thumbnail')          subtitles = {} -        subtitle = xpath_text(episode, './media/asset/files/subtitle', 'subtitle') -        if subtitle: -            subtitles['es'] = subtitle - -        if self._downloader.params.get('listsubtitles', False): -            self._list_available_subtitles(video_id, subtitles) -            return +        subtitle_url = xpath_text(episode, './media/asset/files/subtitle', 'subtitle') +        if subtitle_url: +            subtitles['es'] = [{ +                'ext': 'srt', +                'url': subtitle_url, +            }]          return {              'id': video_id, @@ -159,5 +158,5 @@ class AtresPlayerIE(SubtitlesInfoExtractor):              'thumbnail': thumbnail,              'duration': duration,              'formats': formats, -            'subtitles': self.extract_subtitles(video_id, subtitles), +            'subtitles': subtitles,          } diff --git a/youtube_dl/extractor/bbccouk.py b/youtube_dl/extractor/bbccouk.py index f23e39545..abc34a576 100644 --- a/youtube_dl/extractor/bbccouk.py +++ b/youtube_dl/extractor/bbccouk.py @@ -2,12 +2,12 @@ from __future__ import unicode_literals  import xml.etree.ElementTree -from .subtitles import SubtitlesInfoExtractor +from .common import InfoExtractor  from ..utils import ExtractorError  from ..compat import compat_HTTPError -class BBCCoUkIE(SubtitlesInfoExtractor): +class BBCCoUkIE(InfoExtractor):      IE_NAME = 'bbc.co.uk'      IE_DESC = 'BBC iPlayer'      _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer(?:/[^/]+)?/(?:episode|playlist))/)|music/clips[/#])(?P<id>[\da-z]{8})' @@ -215,17 +215,32 @@ class BBCCoUkIE(SubtitlesInfoExtractor):              formats.extend(conn_formats)          return formats -    def _extract_captions(self, media, programme_id): +    def _get_subtitles(self, media, programme_id):          subtitles = {}          for connection in self._extract_connections(media):              captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions')              lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en')              ps = captions.findall('./{0}body/{0}div/{0}p'.format('{http://www.w3.org/2006/10/ttaf1}'))              srt = '' + +            def _extract_text(p): +                if p.text is not None: +                    stripped_text = p.text.strip() +                    if stripped_text: +                        return stripped_text +                return ' '.join(span.text.strip() for span in p.findall('{http://www.w3.org/2006/10/ttaf1}span'))              for pos, p in enumerate(ps): -                srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (str(pos), p.get('begin'), p.get('end'), -                                                          p.text.strip() if p.text is not None else '') -            subtitles[lang] = srt +                srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (str(pos), p.get('begin'), p.get('end'), _extract_text(p)) +            subtitles[lang] = [ +                { +                    'url': connection.get('href'), +                    'ext': 'ttml', +                }, +                { +                    'data': srt, +                    'ext': 'srt', +                }, +            ]          return subtitles      def _download_media_selector(self, programme_id): @@ -249,7 +264,7 @@ class BBCCoUkIE(SubtitlesInfoExtractor):              elif kind == 'video':                  formats.extend(self._extract_video(media, programme_id))              elif kind == 'captions': -                subtitles = self._extract_captions(media, programme_id) +                subtitles = self.extract_subtitles(media, programme_id)          return formats, subtitles @@ -324,10 +339,6 @@ class BBCCoUkIE(SubtitlesInfoExtractor):          else:              programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id) -        if self._downloader.params.get('listsubtitles', False): -            self._list_available_subtitles(programme_id, subtitles) -            return -          self._sort_formats(formats)          return { diff --git a/youtube_dl/extractor/bliptv.py b/youtube_dl/extractor/bliptv.py index 436cc5155..8c7ba4b91 100644 --- a/youtube_dl/extractor/bliptv.py +++ b/youtube_dl/extractor/bliptv.py @@ -3,7 +3,6 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor -from .subtitles import SubtitlesInfoExtractor  from ..compat import (      compat_str, @@ -18,7 +17,7 @@ from ..utils import (  ) -class BlipTVIE(SubtitlesInfoExtractor): +class BlipTVIE(InfoExtractor):      _VALID_URL = r'https?://(?:\w+\.)?blip\.tv/(?:(?:.+-|rss/flash/)(?P<id>\d+)|((?:play/|api\.swf#)(?P<lookup_id>[\da-zA-Z+_]+)))'      _TESTS = [ @@ -143,7 +142,7 @@ class BlipTVIE(SubtitlesInfoExtractor):          categories = [category.text for category in item.findall('category')]          formats = [] -        subtitles = {} +        subtitles_urls = {}          media_group = item.find(media('group'))          for media_content in media_group.findall(media('content')): @@ -161,7 +160,7 @@ class BlipTVIE(SubtitlesInfoExtractor):                  }                  lang = role.rpartition('-')[-1].strip().lower()                  langcode = LANGS.get(lang, lang) -                subtitles[langcode] = url +                subtitles_urls[langcode] = url              elif media_type.startswith('video/'):                  formats.append({                      'url': real_url, @@ -175,11 +174,7 @@ class BlipTVIE(SubtitlesInfoExtractor):                  })          self._sort_formats(formats) -        # subtitles -        video_subtitles = self.extract_subtitles(video_id, subtitles) -        if self._downloader.params.get('listsubtitles', False): -            self._list_available_subtitles(video_id, subtitles) -            return +        subtitles = self.extract_subtitles(video_id, subtitles_urls)          return {              'id': video_id, @@ -192,15 +187,22 @@ class BlipTVIE(SubtitlesInfoExtractor):              'thumbnail': thumbnail,              'categories': categories,              'formats': formats, -            'subtitles': video_subtitles, +            'subtitles': subtitles,          } -    def _download_subtitle_url(self, sub_lang, url): -        # For some weird reason, blip.tv serves a video instead of subtitles -        # when we request with a common UA -        req = compat_urllib_request.Request(url) -        req.add_header('User-Agent', 'youtube-dl') -        return self._download_webpage(req, None, note=False) +    def _get_subtitles(self, video_id, subtitles_urls): +        subtitles = {} +        for lang, url in subtitles_urls.items(): +            # For some weird reason, blip.tv serves a video instead of subtitles +            # when we request with a common UA +            req = compat_urllib_request.Request(url) +            req.add_header('User-Agent', 'youtube-dl') +            subtitles[lang] = [{ +                # The extension is 'srt' but it's actually an 'ass' file +                'ext': 'ass', +                'data': self._download_webpage(req, None, note=False), +            }] +        return subtitles  class BlipTVUserIE(InfoExtractor): diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py index f70e090bb..65f6be623 100644 --- a/youtube_dl/extractor/ceskatelevize.py +++ b/youtube_dl/extractor/ceskatelevize.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals  import re -from .subtitles import SubtitlesInfoExtractor +from .common import InfoExtractor  from ..compat import (      compat_urllib_request,      compat_urllib_parse, @@ -15,7 +15,7 @@ from ..utils import (  ) -class CeskaTelevizeIE(SubtitlesInfoExtractor): +class CeskaTelevizeIE(InfoExtractor):      _VALID_URL = r'https?://www\.ceskatelevize\.cz/(porady|ivysilani)/(.+/)?(?P<id>[^?#]+)'      _TESTS = [ @@ -107,13 +107,7 @@ class CeskaTelevizeIE(SubtitlesInfoExtractor):          subtitles = {}          subs = item.get('subtitles')          if subs: -            subtitles['cs'] = subs[0]['url'] - -        if self._downloader.params.get('listsubtitles', False): -            self._list_available_subtitles(video_id, subtitles) -            return - -        subtitles = self._fix_subtitles(self.extract_subtitles(video_id, subtitles)) +            subtitles = self.extract_subtitles(episode_id, subs)          return {              'id': episode_id, @@ -125,11 +119,20 @@ class CeskaTelevizeIE(SubtitlesInfoExtractor):              'subtitles': subtitles,          } +    def _get_subtitles(self, episode_id, subs): +        original_subtitles = self._download_webpage( +            subs[0]['url'], episode_id, 'Downloading subtitles') +        srt_subs = self._fix_subtitles(original_subtitles) +        return { +            'cs': [{ +                'ext': 'srt', +                'data': srt_subs, +            }] +        } +      @staticmethod      def _fix_subtitles(subtitles):          """ Convert millisecond-based subtitles to SRT """ -        if subtitles is None: -            return subtitles  # subtitles not requested          def _msectotimecode(msec):              """ Helper utility to convert milliseconds to timecode """ @@ -149,7 +152,4 @@ class CeskaTelevizeIE(SubtitlesInfoExtractor):                  else:                      yield line -        fixed_subtitles = {} -        for k, v in subtitles.items(): -            fixed_subtitles[k] = "\r\n".join(_fix_subtitle(v)) -        return fixed_subtitles +        return "\r\n".join(_fix_subtitle(subtitles)) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 79f6d199b..87fce9cd8 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -150,8 +150,14 @@ class InfoExtractor(object):                      If not explicitly set, calculated from timestamp.      uploader_id:    Nickname or id of the video uploader.      location:       Physical location where the video was filmed. -    subtitles:      The subtitle file contents as a dictionary in the format -                    {language: subtitles}. +    subtitles:      The available subtitles as a dictionary in the format +                    {language: subformats}. "subformats" is a list sorted from +                    lower to higher preference, each element is a dictionary +                    with the "ext" entry and one of: +                        * "data": The subtitles file contents +                        * "url": A url pointing to the subtitles file +    automatic_captions: Like 'subtitles', used by the YoutubeIE for +                    automatically generated captions      duration:       Length of the video in seconds, as an integer.      view_count:     How many users have watched the video on the platform.      like_count:     Number of positive ratings of the video @@ -1011,6 +1017,24 @@ class InfoExtractor(object):              any_restricted = any_restricted or is_restricted          return not any_restricted +    def extract_subtitles(self, *args, **kwargs): +        if (self._downloader.params.get('writesubtitles', False) or +                self._downloader.params.get('listsubtitles')): +            return self._get_subtitles(*args, **kwargs) +        return {} + +    def _get_subtitles(self, *args, **kwargs): +        raise NotImplementedError("This method must be implemented by subclasses") + +    def extract_automatic_captions(self, *args, **kwargs): +        if (self._downloader.params.get('writeautomaticsub', False) or +                self._downloader.params.get('listsubtitles')): +            return self._get_automatic_captions(*args, **kwargs) +        return {} + +    def _get_automatic_captions(self, *args, **kwargs): +        raise NotImplementedError("This method must be implemented by subclasses") +  class SearchInfoExtractor(InfoExtractor):      """ diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 1680f532f..f1da7d09b 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -9,7 +9,7 @@ import xml.etree.ElementTree  from hashlib import sha1  from math import pow, sqrt, floor -from .subtitles import SubtitlesInfoExtractor +from .common import InfoExtractor  from ..compat import (      compat_urllib_parse,      compat_urllib_request, @@ -25,10 +25,9 @@ from ..aes import (      aes_cbc_decrypt,      inc,  ) -from .common import InfoExtractor -class CrunchyrollIE(SubtitlesInfoExtractor): +class CrunchyrollIE(InfoExtractor):      _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:[^/]*/[^/?&]*?|media/\?id=)(?P<video_id>[0-9]+))(?:[/?&]|$)'      _TESTS = [{          'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513', @@ -187,6 +186,38 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text          return output +    def _get_subtitles(self, video_id, webpage): +        subtitles = {} +        for sub_id, sub_name in re.findall(r'\?ssid=([0-9]+)" title="([^"]+)', webpage): +            sub_page = self._download_webpage( +                'http://www.crunchyroll.com/xml/?req=RpcApiSubtitle_GetXml&subtitle_script_id=' + sub_id, +                video_id, note='Downloading subtitles for ' + sub_name) +            id = self._search_regex(r'id=\'([0-9]+)', sub_page, 'subtitle_id', fatal=False) +            iv = self._search_regex(r'<iv>([^<]+)', sub_page, 'subtitle_iv', fatal=False) +            data = self._search_regex(r'<data>([^<]+)', sub_page, 'subtitle_data', fatal=False) +            if not id or not iv or not data: +                continue +            id = int(id) +            iv = base64.b64decode(iv) +            data = base64.b64decode(data) + +            subtitle = self._decrypt_subtitles(data, iv, id).decode('utf-8') +            lang_code = self._search_regex(r'lang_code=["\']([^"\']+)', subtitle, 'subtitle_lang_code', fatal=False) +            if not lang_code: +                continue +            sub_root = xml.etree.ElementTree.fromstring(subtitle) +            subtitles[lang_code] = [ +                { +                    'ext': 'srt', +                    'data': self._convert_subtitles_to_srt(sub_root), +                }, +                { +                    'ext': 'ass', +                    'data': self._convert_subtitles_to_ass(sub_root), +                }, +            ] +        return subtitles +      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url)          video_id = mobj.group('video_id') @@ -249,34 +280,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text                  'format_id': video_format,              }) -        subtitles = {} -        sub_format = self._downloader.params.get('subtitlesformat', 'srt') -        for sub_id, sub_name in re.findall(r'\?ssid=([0-9]+)" title="([^"]+)', webpage): -            sub_page = self._download_webpage( -                'http://www.crunchyroll.com/xml/?req=RpcApiSubtitle_GetXml&subtitle_script_id=' + sub_id, -                video_id, note='Downloading subtitles for ' + sub_name) -            id = self._search_regex(r'id=\'([0-9]+)', sub_page, 'subtitle_id', fatal=False) -            iv = self._search_regex(r'<iv>([^<]+)', sub_page, 'subtitle_iv', fatal=False) -            data = self._search_regex(r'<data>([^<]+)', sub_page, 'subtitle_data', fatal=False) -            if not id or not iv or not data: -                continue -            id = int(id) -            iv = base64.b64decode(iv) -            data = base64.b64decode(data) - -            subtitle = self._decrypt_subtitles(data, iv, id).decode('utf-8') -            lang_code = self._search_regex(r'lang_code=["\']([^"\']+)', subtitle, 'subtitle_lang_code', fatal=False) -            if not lang_code: -                continue -            sub_root = xml.etree.ElementTree.fromstring(subtitle) -            if sub_format == 'ass': -                subtitles[lang_code] = self._convert_subtitles_to_ass(sub_root) -            else: -                subtitles[lang_code] = self._convert_subtitles_to_srt(sub_root) - -        if self._downloader.params.get('listsubtitles', False): -            self._list_available_subtitles(video_id, subtitles) -            return +        subtitles = self.extract_subtitles(video_id, webpage)          return {              'id': video_id, diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index b2dbf4a92..42b20a46d 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -6,7 +6,6 @@ import json  import itertools  from .common import InfoExtractor -from .subtitles import SubtitlesInfoExtractor  from ..compat import (      compat_str, @@ -31,7 +30,7 @@ class DailymotionBaseInfoExtractor(InfoExtractor):          return request -class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): +class DailymotionIE(DailymotionBaseInfoExtractor):      """Information Extractor for Dailymotion"""      _VALID_URL = r'(?i)(?:https?://)?(?:(www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(embed|#)/)?video/(?P<id>[^/?_]+)' @@ -143,9 +142,6 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):          # subtitles          video_subtitles = self.extract_subtitles(video_id, webpage) -        if self._downloader.params.get('listsubtitles', False): -            self._list_available_subtitles(video_id, webpage) -            return          view_count = str_to_int(self._search_regex(              r'video_views_count[^>]+>\s+([\d\.,]+)', @@ -169,7 +165,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):              'view_count': view_count,          } -    def _get_available_subtitles(self, video_id, webpage): +    def _get_subtitles(self, video_id, webpage):          try:              sub_list = self._download_webpage(                  'https://api.dailymotion.com/video/%s/subtitles?fields=id,language,url' % video_id, @@ -179,7 +175,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):              return {}          info = json.loads(sub_list)          if (info['total'] > 0): -            sub_lang_list = dict((l['language'], l['url']) for l in info['list']) +            sub_lang_list = dict((l['language'], [{'url': l['url'], 'ext': 'srt'}]) for l in info['list'])              return sub_lang_list          self._downloader.report_warning('video doesn\'t have subtitles')          return {} diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py index d5df18d7c..8257e35a4 100644 --- a/youtube_dl/extractor/drtv.py +++ b/youtube_dl/extractor/drtv.py @@ -1,11 +1,10 @@  from __future__ import unicode_literals -from .subtitles import SubtitlesInfoExtractor -from .common import ExtractorError +from .common import InfoExtractor, ExtractorError  from ..utils import parse_iso8601 -class DRTVIE(SubtitlesInfoExtractor): +class DRTVIE(InfoExtractor):      _VALID_URL = r'https?://(?:www\.)?dr\.dk/tv/se/(?:[^/]+/)*(?P<id>[\da-z-]+)(?:[/#?]|$)'      _TEST = { @@ -76,7 +75,7 @@ class DRTVIE(SubtitlesInfoExtractor):                      }                      for subs in subtitles_list:                          lang = subs['Language'] -                        subtitles[LANGS.get(lang, lang)] = subs['Uri'] +                        subtitles[LANGS.get(lang, lang)] = [{'url': subs['Uri'], 'ext': 'vtt'}]          if not formats and restricted_to_denmark:              raise ExtractorError( @@ -84,10 +83,6 @@ class DRTVIE(SubtitlesInfoExtractor):          self._sort_formats(formats) -        if self._downloader.params.get('listsubtitles', False): -            self._list_available_subtitles(video_id, subtitles) -            return -          return {              'id': video_id,              'title': title, @@ -96,5 +91,5 @@ class DRTVIE(SubtitlesInfoExtractor):              'timestamp': timestamp,              'duration': duration,              'formats': formats, -            'subtitles': self.extract_subtitles(video_id, subtitles), +            'subtitles': subtitles,          } diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index 762cefa34..109055e72 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -3,7 +3,6 @@ from __future__ import unicode_literals  import re  import json -from .subtitles import SubtitlesInfoExtractor  from .common import InfoExtractor  from ..compat import (      compat_str, @@ -16,7 +15,7 @@ from ..utils import (  ) -class LyndaIE(SubtitlesInfoExtractor): +class LyndaIE(InfoExtractor):      IE_NAME = 'lynda'      IE_DESC = 'lynda.com videos'      _VALID_URL = r'https?://www\.lynda\.com/[^/]+/[^/]+/\d+/(\d+)-\d\.html' @@ -88,11 +87,7 @@ class LyndaIE(SubtitlesInfoExtractor):          self._check_formats(formats, video_id)          self._sort_formats(formats) -        if self._downloader.params.get('listsubtitles', False): -            self._list_available_subtitles(video_id, page) -            return - -        subtitles = self._fix_subtitles(self.extract_subtitles(video_id, page)) +        subtitles = self.extract_subtitles(video_id, page)          return {              'id': video_id, @@ -144,38 +139,31 @@ class LyndaIE(SubtitlesInfoExtractor):          if re.search(self._SUCCESSFUL_LOGIN_REGEX, login_page) is None:              raise ExtractorError('Unable to log in') -    def _fix_subtitles(self, subtitles): -        if subtitles is None: -            return subtitles  # subtitles not requested - -        fixed_subtitles = {} -        for k, v in subtitles.items(): -            subs = json.loads(v) -            if len(subs) == 0: +    def _fix_subtitles(self, subs): +        srt = '' +        for pos in range(0, len(subs) - 1): +            seq_current = subs[pos] +            m_current = re.match(self._TIMECODE_REGEX, seq_current['Timecode']) +            if m_current is None:                  continue -            srt = '' -            for pos in range(0, len(subs) - 1): -                seq_current = subs[pos] -                m_current = re.match(self._TIMECODE_REGEX, seq_current['Timecode']) -                if m_current is None: -                    continue -                seq_next = subs[pos + 1] -                m_next = re.match(self._TIMECODE_REGEX, seq_next['Timecode']) -                if m_next is None: -                    continue -                appear_time = m_current.group('timecode') -                disappear_time = m_next.group('timecode') -                text = seq_current['Caption'] -                srt += '%s\r\n%s --> %s\r\n%s' % (str(pos), appear_time, disappear_time, text) -            if srt: -                fixed_subtitles[k] = srt -        return fixed_subtitles - -    def _get_available_subtitles(self, video_id, webpage): +            seq_next = subs[pos + 1] +            m_next = re.match(self._TIMECODE_REGEX, seq_next['Timecode']) +            if m_next is None: +                continue +            appear_time = m_current.group('timecode') +            disappear_time = m_next.group('timecode') +            text = seq_current['Caption'] +            srt += '%s\r\n%s --> %s\r\n%s' % (str(pos), appear_time, disappear_time, text) +        if srt: +            return srt + +    def _get_subtitles(self, video_id, webpage):          url = 'http://www.lynda.com/ajax/player?videoId=%s&type=transcript' % video_id -        sub = self._download_webpage(url, None, False) -        sub_json = json.loads(sub) -        return {'en': url} if len(sub_json) > 0 else {} +        subs = self._download_json(url, None, False) +        if subs: +            return {'en': [{'ext': 'srt', 'data': self._fix_subtitles(subs)}]} +        else: +            return {}  class LyndaCourseIE(InfoExtractor): diff --git a/youtube_dl/extractor/mit.py b/youtube_dl/extractor/mit.py index 3c61a850f..d7ab6a9ae 100644 --- a/youtube_dl/extractor/mit.py +++ b/youtube_dl/extractor/mit.py @@ -5,9 +5,6 @@ import json  from .common import InfoExtractor  from .youtube import YoutubeIE -from ..compat import ( -    compat_urlparse, -)  from ..utils import (      clean_html,      ExtractorError, @@ -108,7 +105,6 @@ class OCWMITIE(InfoExtractor):                  'upload_date': '20121109',                  'uploader_id': 'MIT',                  'uploader': 'MIT OpenCourseWare', -                # 'subtitles': 'http://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-041-probabilistic-systems-analysis-and-applied-probability-fall-2010/video-lectures/lecture-7-multiple-variables-expectations-independence/MIT6_041F11_lec07_300k.mp4.srt'              }          },          { @@ -121,7 +117,6 @@ class OCWMITIE(InfoExtractor):                  'uploader_id': 'MIT',                  'uploader': 'MIT OpenCourseWare',                  'description': 'This section contains lecture video excerpts, lecture notes, an interactive mathlet with supporting documents, and problem solving videos.', -                # 'subtitles': 'http://ocw.mit.edu//courses/mathematics/18-01sc-single-variable-calculus-fall-2010/ocw-18.01-f07-lec01_300k.SRT'              }          }      ] @@ -140,7 +135,6 @@ class OCWMITIE(InfoExtractor):              metadata = re.sub(r'[\'"]', '', embed_chapter_media.group(1))              metadata = re.split(r', ?', metadata)              yt = metadata[1] -            subs = compat_urlparse.urljoin(self._BASE_URL, metadata[7])          else:              # search for call to ocw_embed_chapter_media(container_id, media_url, provider, page_url, image_url, captions_file)              embed_media = re.search(r'ocw_embed_media\((.+?)\)', webpage) @@ -148,7 +142,6 @@ class OCWMITIE(InfoExtractor):                  metadata = re.sub(r'[\'"]', '', embed_media.group(1))                  metadata = re.split(r', ?', metadata)                  yt = metadata[1] -                subs = compat_urlparse.urljoin(self._BASE_URL, metadata[5])              else:                  raise ExtractorError('Unable to find embedded YouTube video.')          video_id = YoutubeIE.extract_id(yt) @@ -159,7 +152,5 @@ class OCWMITIE(InfoExtractor):              'title': title,              'description': description,              'url': yt, -            'url_transparent' -            'subtitles': subs,              'ie_key': 'Youtube',          } diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index bc7f49ebb..c11de1cb6 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals  import re -from .subtitles import SubtitlesInfoExtractor +from .common import InfoExtractor  from ..compat import (      compat_urllib_parse,      compat_urllib_request, @@ -23,7 +23,7 @@ def _media_xml_tag(tag):      return '{http://search.yahoo.com/mrss/}%s' % tag -class MTVServicesInfoExtractor(SubtitlesInfoExtractor): +class MTVServicesInfoExtractor(InfoExtractor):      _MOBILE_TEMPLATE = None      @staticmethod @@ -95,25 +95,15 @@ class MTVServicesInfoExtractor(SubtitlesInfoExtractor):      def _extract_subtitles(self, mdoc, mtvn_id):          subtitles = {} -        FORMATS = { -            'scc': 'cea-608', -            'eia-608': 'cea-608', -            'xml': 'ttml', -        } -        subtitles_format = FORMATS.get( -            self._downloader.params.get('subtitlesformat'), 'ttml')          for transcript in mdoc.findall('.//transcript'):              if transcript.get('kind') != 'captions':                  continue              lang = transcript.get('srclang') -            for typographic in transcript.findall('./typographic'): -                captions_format = typographic.get('format') -                if captions_format == subtitles_format: -                    subtitles[lang] = compat_str(typographic.get('src')) -                    break -        if self._downloader.params.get('listsubtitles', False): -            self._list_available_subtitles(mtvn_id, subtitles) -        return self.extract_subtitles(mtvn_id, subtitles) +            subtitles[lang] = [{ +                'url': compat_str(typographic.get('src')), +                'ext': typographic.get('format') +            } for typographic in transcript.findall('./typographic')] +        return subtitles      def _get_video_info(self, itemdoc):          uri = itemdoc.find('guid').text @@ -196,8 +186,6 @@ class MTVServicesInfoExtractor(SubtitlesInfoExtractor):                  webpage, 'mgid')          videos_info = self._get_videos_info(mgid) -        if self._downloader.params.get('listsubtitles', False): -            return          return videos_info diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index c075618e8..9c01eb0af 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -1,6 +1,5 @@  from __future__ import unicode_literals -from .subtitles import SubtitlesInfoExtractor  from .common import InfoExtractor  from ..utils import (      fix_xml_ampersands, @@ -12,7 +11,7 @@ from ..utils import (  ) -class NPOBaseIE(SubtitlesInfoExtractor): +class NPOBaseIE(InfoExtractor):      def _get_token(self, video_id):          token_page = self._download_webpage(              'http://ida.omroep.nl/npoplayer/i.js', @@ -164,13 +163,10 @@ class NPOIE(NPOBaseIE):          subtitles = {}          if metadata.get('tt888') == 'ja': -            subtitles['nl'] = 'http://e.omroep.nl/tt888/%s' % video_id - -        if self._downloader.params.get('listsubtitles', False): -            self._list_available_subtitles(video_id, subtitles) -            return - -        subtitles = self.extract_subtitles(video_id, subtitles) +            subtitles['nl'] = [{ +                'ext': 'vtt', +                'url': 'http://e.omroep.nl/tt888/%s' % video_id, +            }]          return {              'id': video_id, diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index f6de26022..46f493cfc 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -10,7 +10,6 @@ from ..utils import (      parse_duration,      unified_strdate,  ) -from .subtitles import SubtitlesInfoExtractor  class NRKIE(InfoExtractor): @@ -73,7 +72,7 @@ class NRKIE(InfoExtractor):          } -class NRKTVIE(SubtitlesInfoExtractor): +class NRKTVIE(InfoExtractor):      _VALID_URL = r'(?P<baseurl>http://tv\.nrk(?:super)?\.no/)(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})(?:/\d{2}-\d{2}-\d{4})?(?:#del=(?P<part_id>\d+))?'      _TESTS = [ @@ -156,7 +155,7 @@ class NRKTVIE(SubtitlesInfoExtractor):          if self._downloader.params.get('verbose', False):              self.to_screen('[debug] %s' % txt) -    def _extract_captions(self, subtitlesurl, video_id, baseurl): +    def _get_subtitles(self, subtitlesurl, video_id, baseurl):          url = "%s%s" % (baseurl, subtitlesurl)          self._debug_print('%s: Subtitle url: %s' % (video_id, url))          captions = self._download_xml(url, video_id, 'Downloading subtitles') @@ -170,7 +169,10 @@ class NRKTVIE(SubtitlesInfoExtractor):              endtime = self._seconds2str(begin + duration)              text = '\n'.join(p.itertext())              srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (str(pos), starttime, endtime, text) -        return {lang: srt} +        return {lang: [ +            {'ext': 'ttml', 'url': url}, +            {'ext': 'srt', 'data': srt}, +        ]}      def _extract_f4m(self, manifest_url, video_id):          return self._extract_f4m_formats(manifest_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124', video_id) @@ -243,10 +245,7 @@ class NRKTVIE(SubtitlesInfoExtractor):              webpage, 'subtitle URL', default=None)          subtitles = None          if subtitles_url: -            subtitles = self._extract_captions(subtitles_url, video_id, baseurl) -        if self._downloader.params.get('listsubtitles', False): -            self._list_available_subtitles(video_id, subtitles) -            return +            subtitles = self.extract_subtitles(subtitles_url, video_id, baseurl)          return {              'id': video_id, diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index aa26b7e0b..144e33982 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals  import re -from .subtitles import SubtitlesInfoExtractor +from .common import InfoExtractor  from ..compat import (      compat_urllib_parse,  ) @@ -12,7 +12,7 @@ from ..utils import (  ) -class RaiIE(SubtitlesInfoExtractor): +class RaiIE(InfoExtractor):      _VALID_URL = r'(?P<url>http://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html)'      _TESTS = [          { @@ -89,15 +89,7 @@ class RaiIE(SubtitlesInfoExtractor):                  'ext': 'mp4',              }) -        if self._downloader.params.get('listsubtitles', False): -            page = self._download_webpage(url, video_id) -            self._list_available_subtitles(video_id, page) -            return - -        subtitles = {} -        if self._have_to_download_any_subtitles: -            page = self._download_webpage(url, video_id) -            subtitles = self.extract_subtitles(video_id, page) +        subtitles = self.extract_subtitles(video_id, url)          return {              'id': video_id, @@ -111,7 +103,8 @@ class RaiIE(SubtitlesInfoExtractor):              'subtitles': subtitles,          } -    def _get_available_subtitles(self, video_id, webpage): +    def _get_subtitles(self, video_id, url): +        webpage = self._download_webpage(url, video_id)          subtitles = {}          m = re.search(r'<meta name="closedcaption" content="(?P<captions>[^"]+)"', webpage)          if m: @@ -120,5 +113,8 @@ class RaiIE(SubtitlesInfoExtractor):              SRT_EXT = '.srt'              if captions.endswith(STL_EXT):                  captions = captions[:-len(STL_EXT)] + SRT_EXT -            subtitles['it'] = 'http://www.rai.tv%s' % compat_urllib_parse.quote(captions) +            subtitles['it'] = [{ +                'ext': 'srt', +                'url': 'http://www.rai.tv%s' % compat_urllib_parse.quote(captions), +            }]          return subtitles diff --git a/youtube_dl/extractor/subtitles.py b/youtube_dl/extractor/subtitles.py deleted file mode 100644 index 59a51268d..000000000 --- a/youtube_dl/extractor/subtitles.py +++ /dev/null @@ -1,99 +0,0 @@ -from __future__ import unicode_literals -from .common import InfoExtractor - -from ..compat import compat_str -from ..utils import ( -    ExtractorError, -) - - -class SubtitlesInfoExtractor(InfoExtractor): -    @property -    def _have_to_download_any_subtitles(self): -        return any([self._downloader.params.get('writesubtitles', False), -                    self._downloader.params.get('writeautomaticsub')]) - -    def _list_available_subtitles(self, video_id, webpage): -        """ outputs the available subtitles for the video """ -        sub_lang_list = self._get_available_subtitles(video_id, webpage) -        auto_captions_list = self._get_available_automatic_caption(video_id, webpage) -        sub_lang = ",".join(list(sub_lang_list.keys())) -        self.to_screen('%s: Available subtitles for video: %s' % -                       (video_id, sub_lang)) -        auto_lang = ",".join(auto_captions_list.keys()) -        self.to_screen('%s: Available automatic captions for video: %s' % -                       (video_id, auto_lang)) - -    def extract_subtitles(self, video_id, webpage): -        """ -        returns {sub_lang: sub} ,{} if subtitles not found or None if the -        subtitles aren't requested. -        """ -        if not self._have_to_download_any_subtitles: -            return None -        available_subs_list = {} -        if self._downloader.params.get('writeautomaticsub', False): -            available_subs_list.update(self._get_available_automatic_caption(video_id, webpage)) -        if self._downloader.params.get('writesubtitles', False): -            available_subs_list.update(self._get_available_subtitles(video_id, webpage)) - -        if not available_subs_list:  # error, it didn't get the available subtitles -            return {} -        if self._downloader.params.get('allsubtitles', False): -            sub_lang_list = available_subs_list -        else: -            if self._downloader.params.get('subtitleslangs', False): -                requested_langs = self._downloader.params.get('subtitleslangs') -            elif 'en' in available_subs_list: -                requested_langs = ['en'] -            else: -                requested_langs = [list(available_subs_list.keys())[0]] - -            sub_lang_list = {} -            for sub_lang in requested_langs: -                if sub_lang not in available_subs_list: -                    self._downloader.report_warning('no closed captions found in the specified language "%s"' % sub_lang) -                    continue -                sub_lang_list[sub_lang] = available_subs_list[sub_lang] - -        subtitles = {} -        for sub_lang, url in sub_lang_list.items(): -            subtitle = self._request_subtitle_url(sub_lang, url) -            if subtitle: -                subtitles[sub_lang] = subtitle -        return subtitles - -    def _download_subtitle_url(self, sub_lang, url): -        return self._download_webpage(url, None, note=False) - -    def _request_subtitle_url(self, sub_lang, url): -        """ makes the http request for the subtitle """ -        try: -            sub = self._download_subtitle_url(sub_lang, url) -        except ExtractorError as err: -            self._downloader.report_warning('unable to download video subtitles for %s: %s' % (sub_lang, compat_str(err))) -            return -        if not sub: -            self._downloader.report_warning('Did not fetch video subtitles') -            return -        return sub - -    def _get_available_subtitles(self, video_id, webpage): -        """ -        returns {sub_lang: url} or {} if not available -        Must be redefined by the subclasses -        """ - -        # By default, allow implementations to simply pass in the result -        assert isinstance(webpage, dict), \ -            '_get_available_subtitles not implemented' -        return webpage - -    def _get_available_automatic_caption(self, video_id, webpage): -        """ -        returns {sub_lang: url} or {} if not available -        Must be redefined by the subclasses that support automatic captions, -        otherwise it will return {} -        """ -        self._downloader.report_warning('Automatic Captions not supported by this server') -        return {} diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index 59678399d..4cec06f8b 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -3,14 +3,14 @@ from __future__ import unicode_literals  import json  import re -from .subtitles import SubtitlesInfoExtractor +from .common import InfoExtractor  from ..compat import (      compat_str,  ) -class TEDIE(SubtitlesInfoExtractor): +class TEDIE(InfoExtractor):      _VALID_URL = r'''(?x)          (?P<proto>https?://)          (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/ @@ -184,11 +184,6 @@ class TEDIE(SubtitlesInfoExtractor):          self._sort_formats(formats)          video_id = compat_str(talk_info['id']) -        # subtitles -        video_subtitles = self.extract_subtitles(video_id, talk_info) -        if self._downloader.params.get('listsubtitles', False): -            self._list_available_subtitles(video_id, talk_info) -            return          thumbnail = talk_info['thumb']          if not thumbnail.startswith('http'): @@ -199,21 +194,25 @@ class TEDIE(SubtitlesInfoExtractor):              'uploader': talk_info['speaker'],              'thumbnail': thumbnail,              'description': self._og_search_description(webpage), -            'subtitles': video_subtitles, +            'subtitles': self._get_subtitles(video_id, talk_info),              'formats': formats,              'duration': talk_info.get('duration'),          } -    def _get_available_subtitles(self, video_id, talk_info): +    def _get_subtitles(self, video_id, talk_info):          languages = [lang['languageCode'] for lang in talk_info.get('languages', [])]          if languages:              sub_lang_list = {}              for l in languages: -                url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, l) -                sub_lang_list[l] = url +                sub_lang_list[l] = [ +                    { +                        'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, l, ext), +                        'ext': ext, +                    } +                    for ext in ['ted', 'srt'] +                ]              return sub_lang_list          else: -            self._downloader.report_warning('video doesn\'t have subtitles')              return {}      def _watch_info(self, url, name): diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index f7b34bd26..feac666f7 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -8,7 +8,7 @@ import binascii  import hashlib -from .subtitles import SubtitlesInfoExtractor +from .common import InfoExtractor  from ..compat import (      compat_str,  ) @@ -22,7 +22,7 @@ from ..utils import (  _x = lambda p: xpath_with_ns(p, {'smil': 'http://www.w3.org/2005/SMIL21/Language'}) -class ThePlatformIE(SubtitlesInfoExtractor): +class ThePlatformIE(InfoExtractor):      _VALID_URL = r'''(?x)          (?:https?://(?:link|player)\.theplatform\.com/[sp]/(?P<provider_id>[^/]+)/             (?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/)? @@ -106,15 +106,11 @@ class ThePlatformIE(SubtitlesInfoExtractor):          captions = info.get('captions')          if isinstance(captions, list):              for caption in captions: -                lang, src = caption.get('lang'), caption.get('src') -                if lang and src: -                    subtitles[lang] = src - -        if self._downloader.params.get('listsubtitles', False): -            self._list_available_subtitles(video_id, subtitles) -            return - -        subtitles = self.extract_subtitles(video_id, subtitles) +                lang, src, mime = caption.get('lang', 'en'), caption.get('src'), caption.get('type') +                subtitles[lang] = [{ +                    'ext': 'srt' if mime == 'text/srt' else 'ttml', +                    'url': src, +                }]          head = meta.find(_x('smil:head'))          body = meta.find(_x('smil:body')) diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index 944901e14..6816dacb6 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -2,16 +2,17 @@ from __future__ import unicode_literals  import re +from ..compat import compat_urlparse  from ..utils import (      ExtractorError,      unescapeHTML,      unified_strdate,      US_RATINGS,  ) -from .subtitles import SubtitlesInfoExtractor +from .common import InfoExtractor -class VikiIE(SubtitlesInfoExtractor): +class VikiIE(InfoExtractor):      IE_NAME = 'viki'      _VALID_URL = r'^https?://(?:www\.)?viki\.com/videos/(?P<id>[0-9]+v)' @@ -69,9 +70,6 @@ class VikiIE(SubtitlesInfoExtractor):          # subtitles          video_subtitles = self.extract_subtitles(video_id, info_webpage) -        if self._downloader.params.get('listsubtitles', False): -            self._list_available_subtitles(video_id, info_webpage) -            return          return {              'id': video_id, @@ -85,12 +83,15 @@ class VikiIE(SubtitlesInfoExtractor):              'upload_date': upload_date,          } -    def _get_available_subtitles(self, video_id, info_webpage): +    def _get_subtitles(self, video_id, info_webpage):          res = {} -        for sturl_html in re.findall(r'<track src="([^"]+)"/>', info_webpage): +        for sturl_html in re.findall(r'<track src="([^"]+)"', info_webpage):              sturl = unescapeHTML(sturl_html)              m = re.search(r'/(?P<lang>[a-z]+)\.vtt', sturl)              if not m:                  continue -            res[m.group('lang')] = sturl +            res[m.group('lang')] = [{ +                'url': compat_urlparse.urljoin('http://www.viki.com', sturl), +                'ext': 'vtt', +            }]          return res diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 4cd2f73d9..8f540f578 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -7,7 +7,6 @@ import itertools  import hashlib  from .common import InfoExtractor -from .subtitles import SubtitlesInfoExtractor  from ..compat import (      compat_HTTPError,      compat_urllib_parse, @@ -53,7 +52,7 @@ class VimeoBaseInfoExtractor(InfoExtractor):          self._download_webpage(login_request, None, False, 'Wrong login info') -class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor): +class VimeoIE(VimeoBaseInfoExtractor):      """Information extractor for vimeo.com."""      # _VALID_URL matches Vimeo URLs @@ -378,12 +377,10 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor):          text_tracks = config['request'].get('text_tracks')          if text_tracks:              for tt in text_tracks: -                subtitles[tt['lang']] = 'http://vimeo.com' + tt['url'] - -        video_subtitles = self.extract_subtitles(video_id, subtitles) -        if self._downloader.params.get('listsubtitles', False): -            self._list_available_subtitles(video_id, subtitles) -            return +                subtitles[tt['lang']] = [{ +                    'ext': 'vtt', +                    'url': 'http://vimeo.com' + tt['url'], +                }]          return {              'id': video_id, @@ -399,7 +396,7 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor):              'view_count': view_count,              'like_count': like_count,              'comment_count': comment_count, -            'subtitles': video_subtitles, +            'subtitles': subtitles,          } diff --git a/youtube_dl/extractor/walla.py b/youtube_dl/extractor/walla.py index 672bda7a7..24efbd6e6 100644 --- a/youtube_dl/extractor/walla.py +++ b/youtube_dl/extractor/walla.py @@ -3,14 +3,14 @@ from __future__ import unicode_literals  import re -from .subtitles import SubtitlesInfoExtractor +from .common import InfoExtractor  from ..utils import (      xpath_text,      int_or_none,  ) -class WallaIE(SubtitlesInfoExtractor): +class WallaIE(InfoExtractor):      _VALID_URL = r'http://vod\.walla\.co\.il/[^/]+/(?P<id>\d+)/(?P<display_id>.+)'      _TEST = {          'url': 'http://vod.walla.co.il/movie/2642630/one-direction-all-for-one', @@ -52,13 +52,10 @@ class WallaIE(SubtitlesInfoExtractor):          subtitles = {}          for subtitle in item.findall('./subtitles/subtitle'):              lang = xpath_text(subtitle, './title') -            subtitles[self._SUBTITLE_LANGS.get(lang, lang)] = xpath_text(subtitle, './src') - -        if self._downloader.params.get('listsubtitles', False): -            self._list_available_subtitles(video_id, subtitles) -            return - -        subtitles = self.extract_subtitles(video_id, subtitles) +            subtitles[self._SUBTITLE_LANGS.get(lang, lang)] = [{ +                'ext': 'srt', +                'url': xpath_text(subtitle, './src'), +            }]          formats = []          for quality in item.findall('./qualities/quality'): diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 3d3d43491..22db896b1 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -11,7 +11,6 @@ import time  import traceback  from .common import InfoExtractor, SearchInfoExtractor -from .subtitles import SubtitlesInfoExtractor  from ..jsinterp import JSInterpreter  from ..swfinterp import SWFInterpreter  from ..compat import ( @@ -185,7 +184,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):              return -class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): +class YoutubeIE(YoutubeBaseInfoExtractor):      IE_DESC = 'YouTube.com'      _VALID_URL = r"""(?x)^                       ( @@ -648,7 +647,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):              raise ExtractorError(                  'Signature extraction failed: ' + tb, cause=e) -    def _get_available_subtitles(self, video_id, webpage): +    def _get_subtitles(self, video_id, webpage):          try:              subs_doc = self._download_xml(                  'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id, @@ -662,23 +661,27 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):              lang = track.attrib['lang_code']              if lang in sub_lang_list:                  continue -            params = compat_urllib_parse.urlencode({ -                'lang': lang, -                'v': video_id, -                'fmt': self._downloader.params.get('subtitlesformat', 'srt'), -                'name': track.attrib['name'].encode('utf-8'), -            }) -            url = 'https://www.youtube.com/api/timedtext?' + params -            sub_lang_list[lang] = url +            sub_formats = [] +            for ext in ['sbv', 'vtt', 'srt']: +                params = compat_urllib_parse.urlencode({ +                    'lang': lang, +                    'v': video_id, +                    'fmt': ext, +                    'name': track.attrib['name'].encode('utf-8'), +                }) +                sub_formats.append({ +                    'url': 'https://www.youtube.com/api/timedtext?' + params, +                    'ext': ext, +                }) +            sub_lang_list[lang] = sub_formats          if not sub_lang_list:              self._downloader.report_warning('video doesn\'t have subtitles')              return {}          return sub_lang_list -    def _get_available_automatic_caption(self, video_id, webpage): +    def _get_automatic_captions(self, video_id, webpage):          """We need the webpage for getting the captions url, pass it as an             argument to speed up the process.""" -        sub_format = self._downloader.params.get('subtitlesformat', 'srt')          self.to_screen('%s: Looking for automatic captions' % video_id)          mobj = re.search(r';ytplayer.config = ({.*?});', webpage)          err_msg = 'Couldn\'t find automatic captions for %s' % video_id @@ -708,14 +711,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):              sub_lang_list = {}              for lang_node in caption_list.findall('target'):                  sub_lang = lang_node.attrib['lang_code'] -                params = compat_urllib_parse.urlencode({ -                    'lang': original_lang, -                    'tlang': sub_lang, -                    'fmt': sub_format, -                    'ts': timestamp, -                    'kind': caption_kind, -                }) -                sub_lang_list[sub_lang] = caption_url + '&' + params +                sub_formats = [] +                for ext in ['sbv', 'vtt', 'srt']: +                    params = compat_urllib_parse.urlencode({ +                        'lang': original_lang, +                        'tlang': sub_lang, +                        'fmt': ext, +                        'ts': timestamp, +                        'kind': caption_kind, +                    }) +                    sub_formats.append({ +                        'url': caption_url + '&' + params, +                        'ext': ext, +                    }) +                sub_lang_list[sub_lang] = sub_formats              return sub_lang_list          # An extractor error can be raise by the download process if there are          # no automatic captions but there are subtitles @@ -970,10 +979,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):          # subtitles          video_subtitles = self.extract_subtitles(video_id, video_webpage) - -        if self._downloader.params.get('listsubtitles', False): -            self._list_available_subtitles(video_id, video_webpage) -            return +        automatic_captions = self.extract_automatic_captions(video_id, video_webpage)          if 'length_seconds' not in video_info:              self._downloader.report_warning('unable to extract video duration') @@ -1122,6 +1128,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):              'description': video_description,              'categories': video_categories,              'subtitles': video_subtitles, +            'automatic_captions': automatic_captions,              'duration': video_duration,              'age_limit': 18 if age_gate else 0,              'annotations': video_annotations, diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 5f678f76b..5c2d153b1 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -387,8 +387,8 @@ def parseOpts(overrideArguments=None):          help='lists all available subtitles for the video')      subtitles.add_option(          '--sub-format', -        action='store', dest='subtitlesformat', metavar='FORMAT', default='srt', -        help='subtitle format (default=srt) ([sbv/vtt] youtube only)') +        action='store', dest='subtitlesformat', metavar='FORMAT', default='best', +        help='subtitle format, accepts formats preference, for example: "ass/srt/best"')      subtitles.add_option(          '--sub-lang', '--sub-langs', '--srt-lang',          action='callback', dest='subtitleslangs', metavar='LANGS', type='str', diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 3f2e6cf1d..398fe050e 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -496,10 +496,6 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):          'zu': 'zul',      } -    def __init__(self, downloader=None, subtitlesformat='srt'): -        super(FFmpegEmbedSubtitlePP, self).__init__(downloader) -        self._subformat = subtitlesformat -      @classmethod      def _conver_lang_code(cls, code):          """Convert language code from ISO 639-1 to ISO 639-2/T""" @@ -509,13 +505,14 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):          if information['ext'] != 'mp4':              self._downloader.to_screen('[ffmpeg] Subtitles can only be embedded in mp4 files')              return True, information -        if not information.get('subtitles'): +        subtitles = information.get('requested_subtitles') +        if not subtitles:              self._downloader.to_screen('[ffmpeg] There aren\'t any subtitles to embed')              return True, information -        sub_langs = [key for key in information['subtitles']] +        sub_langs = list(subtitles.keys())          filename = information['filepath'] -        input_files = [filename] + [subtitles_filename(filename, lang, self._subformat) for lang in sub_langs] +        input_files = [filename] + [subtitles_filename(filename, lang, sub_info['ext']) for lang, sub_info in subtitles.items()]          opts = [              '-map', '0', | 
