diff options
| author | Sergey M․ <dstftw@gmail.com> | 2016-02-26 22:21:47 +0600 | 
|---|---|---|
| committer | Sergey M․ <dstftw@gmail.com> | 2016-02-26 22:21:47 +0600 | 
| commit | b78b292f0c51323edaf3e18ae4f45927a55e9198 (patch) | |
| tree | 3b57ea2aa7944b99d384bd7b030f2c391fc7b8fa | |
| parent | efbd6fb8bb86c07e6f924a7ec2c4bd486face3a4 (diff) | |
[youtube] Add alternative automatic captions extraction approach (Closes #8667)
| -rw-r--r-- | youtube_dl/extractor/youtube.py | 83 | 
1 file changed, 55 insertions, 28 deletions
| diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index e24dd3e5b..ec90c2111 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -975,40 +975,67 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              return {}          try:              args = player_config['args'] -            caption_url = args['ttsurl'] -            if not caption_url: -                self._downloader.report_warning(err_msg) -                return {} -            timestamp = args['timestamp'] -            # We get the available subtitles -            list_params = compat_urllib_parse.urlencode({ -                'type': 'list', -                'tlangs': 1, -                'asrs': 1, -            }) -            list_url = caption_url + '&' + list_params -            caption_list = self._download_xml(list_url, video_id) -            original_lang_node = caption_list.find('track') -            if original_lang_node is None: -                self._downloader.report_warning('Video doesn\'t have automatic captions') -                return {} -            original_lang = original_lang_node.attrib['lang_code'] -            caption_kind = original_lang_node.attrib.get('kind', '') +            caption_url = args.get('ttsurl') +            if caption_url: +                timestamp = args['timestamp'] +                # We get the available subtitles +                list_params = compat_urllib_parse.urlencode({ +                    'type': 'list', +                    'tlangs': 1, +                    'asrs': 1, +                }) +                list_url = caption_url + '&' + list_params +                caption_list = self._download_xml(list_url, video_id) +                original_lang_node = caption_list.find('track') +                if original_lang_node is None: +                    self._downloader.report_warning('Video doesn\'t have automatic captions') +                    return {} +                original_lang = 
original_lang_node.attrib['lang_code'] +                caption_kind = original_lang_node.attrib.get('kind', '') + +                sub_lang_list = {} +                for lang_node in caption_list.findall('target'): +                    sub_lang = lang_node.attrib['lang_code'] +                    sub_formats = [] +                    for ext in self._SUBTITLE_FORMATS: +                        params = compat_urllib_parse.urlencode({ +                            'lang': original_lang, +                            'tlang': sub_lang, +                            'fmt': ext, +                            'ts': timestamp, +                            'kind': caption_kind, +                        }) +                        sub_formats.append({ +                            'url': caption_url + '&' + params, +                            'ext': ext, +                        }) +                    sub_lang_list[sub_lang] = sub_formats +                return sub_lang_list + +            # Some videos don't provide ttsurl but rather caption_tracks and +            # caption_translation_languages (e.g. 
20LmZk1hakA) +            caption_tracks = args['caption_tracks'] +            caption_translation_languages = args['caption_translation_languages'] +            caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0] +            parsed_caption_url = compat_urlparse.urlparse(caption_url) +            caption_qs = compat_parse_qs(parsed_caption_url.query)              sub_lang_list = {} -            for lang_node in caption_list.findall('target'): -                sub_lang = lang_node.attrib['lang_code'] +            for lang in caption_translation_languages.split(','): +                lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang)) +                sub_lang = lang_qs.get('lc', [None])[0] +                if not sub_lang: +                    continue                  sub_formats = []                  for ext in self._SUBTITLE_FORMATS: -                    params = compat_urllib_parse.urlencode({ -                        'lang': original_lang, -                        'tlang': sub_lang, -                        'fmt': ext, -                        'ts': timestamp, -                        'kind': caption_kind, +                    caption_qs.update({ +                        'tlang': [sub_lang], +                        'fmt': [ext],                      }) +                    sub_url = compat_urlparse.urlunparse(parsed_caption_url._replace( +                        query=compat_urllib_parse.urlencode(caption_qs, True)))                      sub_formats.append({ -                        'url': caption_url + '&' + params, +                        'url': sub_url,                          'ext': ext,                      })                  sub_lang_list[sub_lang] = sub_formats | 
