diff options
| author | Ismael Mejia <iemejia@gmail.com> | 2013-08-08 08:54:10 +0200 | 
|---|---|---|
| committer | Ismael Mejia <iemejia@gmail.com> | 2013-08-08 08:54:10 +0200 | 
| commit | 8377574c9cb8740e24d45e9b3d30921fd6ec846c (patch) | |
| tree | 3cbe6f014c1322ae85f17daa919de7e86ba3d307 | |
| parent | 372297e713c92489c113bf8649ec4aa1d23511f9 (diff) | |
[internal] Improved subtitle architecture + (update in
youtube/dailymotion)
The structure of subtitles was refined: you only need to implement one
method that returns a dictionary of the available subtitles (lang, url) to
support all the subtitle options for a website. I updated the subtitle
downloaders for youtube/dailymotion to show how it works.
| -rw-r--r-- | youtube_dl/extractor/dailymotion.py | 15 | ||||
| -rw-r--r-- | youtube_dl/extractor/subtitles.py | 27 | ||||
| -rw-r--r-- | youtube_dl/extractor/youtube.py | 175 | 
3 files changed, 73 insertions, 144 deletions
| diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index eb2322d54..97003ee35 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -1,6 +1,5 @@  import re  import json -import itertools  import socket  from .common import InfoExtractor @@ -34,16 +33,12 @@ class DailyMotionSubtitlesIE(SubtitlesIE):          self._downloader.report_warning(u'video doesn\'t have subtitles')          return {} -    def _get_subtitle_url(self, sub_lang, sub_name, video_id, format): -        sub_lang_list = self._get_available_subtitles(video_id) -        return sub_lang_list[sub_lang] -      def _request_automatic_caption(self, video_id, webpage): -        self._downloader.report_warning(u'Automatic Captions not supported by dailymotion') +        self._downloader.report_warning(u'Automatic Captions not supported by this server')          return {} -class DailymotionIE(DailyMotionSubtitlesIE): #,InfoExtractor): +class DailymotionIE(DailyMotionSubtitlesIE):      """Information Extractor for Dailymotion"""      _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)' @@ -116,12 +111,6 @@ class DailymotionIE(DailyMotionSubtitlesIE): #,InfoExtractor):              self._list_available_subtitles(video_id)              return -        if 'length_seconds' not in info: -            self._downloader.report_warning(u'unable to extract video duration') -            video_duration = '' -        else: -            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]) -          return [{              'id':       video_id,              'url':      video_url, diff --git a/youtube_dl/extractor/subtitles.py b/youtube_dl/extractor/subtitles.py index 89864e5d7..8843e0220 100644 --- a/youtube_dl/extractor/subtitles.py +++ b/youtube_dl/extractor/subtitles.py @@ -15,7 +15,8 @@ class SubtitlesIE(InfoExtractor):      def report_video_subtitles_available(self, video_id, 
sub_lang_list):          """Report available subtitles."""          sub_lang = ",".join(list(sub_lang_list.keys())) -        self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang)) +        self.to_screen(u'%s: Available subtitles for video: %s' % +                       (video_id, sub_lang))      def _list_available_subtitles(self, video_id):          sub_lang_list = self._get_available_subtitles(video_id) @@ -27,9 +28,9 @@ class SubtitlesIE(InfoExtractor):          couldn't be found          """          sub_lang_list = self._get_available_subtitles(video_id) -        sub_format = self._downloader.params.get('subtitlesformat') -        if  not sub_lang_list: #There was some error, it didn't get the available subtitles +        if not sub_lang_list:  # error, it didn't get the available subtitles              return {} +          if self._downloader.params.get('writesubtitles', False):              if self._downloader.params.get('subtitleslang', False):                  sub_lang = self._downloader.params.get('subtitleslang') @@ -41,18 +42,15 @@ class SubtitlesIE(InfoExtractor):                  self._downloader.report_warning(u'no closed captions found in the specified language "%s"' % sub_lang)                  return {}              sub_lang_list = {sub_lang: sub_lang_list[sub_lang]} +          subtitles = {} -        for sub_lang in sub_lang_list: -            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format) +        for sub_lang, url in sub_lang_list.iteritems(): +            subtitle = self._request_subtitle_url(sub_lang, url)              if subtitle:                  subtitles[sub_lang] = subtitle          return subtitles -    def _request_subtitle(self, sub_lang, sub_name, video_id, format): -        """ Return the subtitle as a string or None if they are not found """ -        # return (u'Did not fetch video subtitles for %s' % sub_lang, None, None) -        
self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format)) -        url = self._get_subtitle_url(sub_lang, sub_name, video_id, format) +    def _request_subtitle_url(self, sub_lang, url):          try:              sub = compat_urllib_request.urlopen(url).read().decode('utf-8')          except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: @@ -64,13 +62,8 @@ class SubtitlesIE(InfoExtractor):          return sub      def _get_available_subtitles(self, video_id): -        """Get available subtitles. Redefine in subclasses.""" -        """returns {(lang, url)} """ -        # return {} -        pass - -    def _get_subtitle_url(self, sub_lang, sub_name, video_id, format): -        """returns the url for the given subtitle. Redefine in subclasses.""" +        """returns the list of available subtitles like this {lang: url} """ +        """or {} if not available. Must be redefined by the subclasses."""          pass      def _request_automatic_caption(self, video_id, webpage): diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 2b03226f6..414e33b49 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -7,6 +7,7 @@ import socket  import itertools  from .common import InfoExtractor, SearchInfoExtractor +from .subtitles import SubtitlesIE  from ..utils import (      compat_http_client,      compat_parse_qs, @@ -24,7 +25,66 @@ from ..utils import (  ) -class YoutubeIE(InfoExtractor): +class YoutubeSubtitlesIE(SubtitlesIE): + +    def _get_available_subtitles(self, video_id): +        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id) +        try: +            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8') +        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: +            
self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err)) +            return {} +        lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list) + +        sub_lang_list = {} +        for l in lang_list: +            lang = l[1] +            params = compat_urllib_parse.urlencode({ +                'lang': lang, +                'v': video_id, +                'fmt': self._downloader.params.get('subtitlesformat'), +            }) +            url = u'http://www.youtube.com/api/timedtext?' + params +            sub_lang_list[lang] = url +        if not sub_lang_list: +            self._downloader.report_warning(u'video doesn\'t have subtitles') +            return {} +        return sub_lang_list + +    def _request_automatic_caption(self, video_id, webpage): +        """We need the webpage for getting the captions url, pass it as an +           argument to speed up the process.""" +        sub_lang = self._downloader.params.get('subtitleslang') or 'en' +        sub_format = self._downloader.params.get('subtitlesformat') +        self.to_screen(u'%s: Looking for automatic captions' % video_id) +        mobj = re.search(r';ytplayer.config = ({.*?});', webpage) +        err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang +        if mobj is None: +            self._downloader.report_warning(err_msg) +            return {} +        player_config = json.loads(mobj.group(1)) +        try: +            args = player_config[u'args'] +            caption_url = args[u'ttsurl'] +            timestamp = args[u'timestamp'] +            params = compat_urllib_parse.urlencode({ +                'lang': 'en', +                'tlang': sub_lang, +                'fmt': sub_format, +                'ts': timestamp, +                'kind': 'asr', +            }) +            subtitles_url = caption_url + '&' + params +            sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic 
captions') +            return {sub_lang: sub} +        # An extractor error can be raise by the download process if there are +        # no automatic captions but there are subtitles +        except (KeyError, ExtractorError): +            self._downloader.report_warning(err_msg) +            return {} + + +class YoutubeIE(YoutubeSubtitlesIE):      IE_DESC = u'YouTube.com'      _VALID_URL = r"""^                       ( @@ -151,19 +211,6 @@ class YoutubeIE(InfoExtractor):          """Report attempt to download video info webpage."""          self.to_screen(u'%s: Downloading video info webpage' % video_id) -    def report_video_subtitles_download(self, video_id): -        """Report attempt to download video info webpage.""" -        self.to_screen(u'%s: Checking available subtitles' % video_id) - -    def report_video_subtitles_request(self, video_id, sub_lang, format): -        """Report attempt to download video info webpage.""" -        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format)) - -    def report_video_subtitles_available(self, video_id, sub_lang_list): -        """Report available subtitles.""" -        sub_lang = ",".join(list(sub_lang_list.keys())) -        self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang)) -      def report_information_extraction(self, video_id):          """Report attempt to extract video information."""          self.to_screen(u'%s: Extracting video information' % video_id) @@ -203,106 +250,6 @@ class YoutubeIE(InfoExtractor):          else:              raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s))) -    def _get_available_subtitles(self, video_id): -        self.report_video_subtitles_download(video_id) -        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id) -        try: -            sub_list = 
compat_urllib_request.urlopen(request).read().decode('utf-8') -        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: -            self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err)) -            return {} -        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list) -        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list) -        if not sub_lang_list: -            self._downloader.report_warning(u'video doesn\'t have subtitles') -            return {} -        return sub_lang_list - -    def _list_available_subtitles(self, video_id): -        sub_lang_list = self._get_available_subtitles(video_id) -        self.report_video_subtitles_available(video_id, sub_lang_list) - -    def _request_subtitle(self, sub_lang, sub_name, video_id, format): -        """ -        Return the subtitle as a string or None if they are not found -        """ -        self.report_video_subtitles_request(video_id, sub_lang, format) -        params = compat_urllib_parse.urlencode({ -            'lang': sub_lang, -            'name': sub_name, -            'v': video_id, -            'fmt': format, -        }) -        url = 'http://www.youtube.com/api/timedtext?' 
+ params -        try: -            sub = compat_urllib_request.urlopen(url).read().decode('utf-8') -        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: -            self._downloader.report_warning(u'unable to download video subtitles for %s: %s' % (sub_lang, compat_str(err))) -            return -        if not sub: -            self._downloader.report_warning(u'Did not fetch video subtitles') -            return -        return sub - -    def _request_automatic_caption(self, video_id, webpage): -        """We need the webpage for getting the captions url, pass it as an -           argument to speed up the process.""" -        sub_lang = self._downloader.params.get('subtitleslang') or 'en' -        sub_format = self._downloader.params.get('subtitlesformat') -        self.to_screen(u'%s: Looking for automatic captions' % video_id) -        mobj = re.search(r';ytplayer.config = ({.*?});', webpage) -        err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang -        if mobj is None: -            self._downloader.report_warning(err_msg) -            return {} -        player_config = json.loads(mobj.group(1)) -        try: -            args = player_config[u'args'] -            caption_url = args[u'ttsurl'] -            timestamp = args[u'timestamp'] -            params = compat_urllib_parse.urlencode({ -                'lang': 'en', -                'tlang': sub_lang, -                'fmt': sub_format, -                'ts': timestamp, -                'kind': 'asr', -            }) -            subtitles_url = caption_url + '&' + params -            sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions') -            return {sub_lang: sub} -        # An extractor error can be raise by the download process if there are -        # no automatic captions but there are subtitles -        except (KeyError, ExtractorError): -            
self._downloader.report_warning(err_msg) -            return {} -     -    def _extract_subtitles(self, video_id): -        """ -        Return a dictionary: {language: subtitles} or {} if the subtitles -        couldn't be found -        """ -        sub_lang_list = self._get_available_subtitles(video_id) -        sub_format = self._downloader.params.get('subtitlesformat') -        if  not sub_lang_list: #There was some error, it didn't get the available subtitles -            return {} -        if self._downloader.params.get('writesubtitles', False): -            if self._downloader.params.get('subtitleslang', False): -                sub_lang = self._downloader.params.get('subtitleslang') -            elif 'en' in sub_lang_list: -                sub_lang = 'en' -            else: -                sub_lang = list(sub_lang_list.keys())[0] -            if not sub_lang in sub_lang_list: -                self._downloader.report_warning(u'no closed captions found in the specified language "%s"' % sub_lang) -                return {} -            sub_lang_list = {sub_lang: sub_lang_list[sub_lang]} -        subtitles = {} -        for sub_lang in sub_lang_list: -            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format) -            if subtitle: -                subtitles[sub_lang] = subtitle -        return subtitles -      def _print_formats(self, formats):          print('Available formats:')          for x in formats: | 
