diff options
Diffstat (limited to 'youtube_dl/extractor')
| -rw-r--r-- | youtube_dl/extractor/dailymotion.py | 39 | ||||
| -rw-r--r-- | youtube_dl/extractor/subtitles.py | 80 | ||||
| -rw-r--r-- | youtube_dl/extractor/youtube.py | 180 | 
3 files changed, 178 insertions, 121 deletions
| diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 3c616e089..f7dffd4cc 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -1,17 +1,40 @@  import re  import json  import itertools +import socket  from .common import InfoExtractor +from .subtitles import NoAutoSubtitlesIE +  from ..utils import ( +    compat_http_client, +    compat_urllib_error,      compat_urllib_request, +    compat_str,      get_element_by_attribute,      get_element_by_id,      ExtractorError,  ) -class DailymotionIE(InfoExtractor): + +class DailyMotionSubtitlesIE(NoAutoSubtitlesIE): + +    def _get_available_subtitles(self, video_id): +        request = compat_urllib_request.Request('https://api.dailymotion.com/video/%s/subtitles?fields=id,language,url' % video_id) +        try: +            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8') +        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: +            self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err)) +            return {} +        info = json.loads(sub_list) +        if (info['total'] > 0): +            sub_lang_list = dict((l['language'], l['url']) for l in info['list']) +            return sub_lang_list +        self._downloader.report_warning(u'video doesn\'t have subtitles') +        return {} + +class DailymotionIE(DailyMotionSubtitlesIE, InfoExtractor):      """Information Extractor for Dailymotion"""      _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/(?:embed/)?video/([^/]+)' @@ -73,6 +96,19 @@ class DailymotionIE(InfoExtractor):              raise ExtractorError(u'Unable to extract video URL')          video_url = info[max_quality] +        # subtitles +        video_subtitles = None +        video_webpage = None + +        if self._downloader.params.get('writesubtitles', False) or self._downloader.params.get('allsubtitles', False): +            video_subtitles = self._extract_subtitles(video_id) +        elif self._downloader.params.get('writeautomaticsub', False): +            video_subtitles = self._request_automatic_caption(video_id, video_webpage) + +        if self._downloader.params.get('listsubtitles', False): +            self._list_available_subtitles(video_id) +            return +          return [{              'id':       video_id,              'url':      video_url, @@ -80,6 +116,7 @@ class DailymotionIE(InfoExtractor):              'upload_date':  video_upload_date,              'title':    self._og_search_title(webpage),              'ext':      video_extension, +            'subtitles':    video_subtitles,              'thumbnail': info['thumbnail_url']          }] diff --git a/youtube_dl/extractor/subtitles.py b/youtube_dl/extractor/subtitles.py new file mode 100644 index 000000000..c10cdf266 --- /dev/null +++ b/youtube_dl/extractor/subtitles.py @@ -0,0 +1,80 @@ +import socket + +from .common import InfoExtractor + +from ..utils import ( +    compat_http_client, +    compat_urllib_error, +    compat_urllib_request, +    compat_str, +) + + +class SubtitlesIE(InfoExtractor): + +    def _list_available_subtitles(self, video_id): +        """ outputs the available subtitles for the video """ +        sub_lang_list = self._get_available_subtitles(video_id) +        sub_lang = ",".join(list(sub_lang_list.keys())) +        self.to_screen(u'%s: Available subtitles for video: %s' % +                       (video_id, sub_lang)) + +    def _extract_subtitles(self, video_id): +        """ returns {sub_lang: sub} or {} if subtitles not found """ +        available_subs_list = self._get_available_subtitles(video_id) +        if not available_subs_list:  # error, it didn't get the available subtitles +            return {} +        if self._downloader.params.get('allsubtitles', False): +            sub_lang_list = available_subs_list +        else: +            if self._downloader.params.get('writesubtitles', False): +                if self._downloader.params.get('subtitleslangs', False): +                    requested_langs = self._downloader.params.get('subtitleslangs') +                elif 'en' in available_subs_list: +                    requested_langs = ['en'] +                else: +                    requested_langs = [list(available_subs_list.keys())[0]] + +                sub_lang_list = {} +                for sub_lang in requested_langs: +                    if not sub_lang in available_subs_list: +                        self._downloader.report_warning(u'no closed captions found in the specified language "%s"' % sub_lang) +                        continue +                    sub_lang_list[sub_lang] = available_subs_list[sub_lang] + +        subtitles = {} +        for sub_lang, url in sub_lang_list.items(): +            subtitle = self._request_subtitle_url(sub_lang, url) +            if subtitle: +                subtitles[sub_lang] = subtitle +        return subtitles + +    def _request_subtitle_url(self, sub_lang, url): +        """ makes the http request for the subtitle """ +        try: +            sub = compat_urllib_request.urlopen(url).read().decode('utf-8') +        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: +            self._downloader.report_warning(u'unable to download video subtitles for %s: %s' % (sub_lang, compat_str(err))) +            return +        if not sub: +            self._downloader.report_warning(u'Did not fetch video subtitles') +            return +        return sub + +    def _get_available_subtitles(self, video_id): +        """ returns {sub_lang: url} or {} if not available """ +        """ Must be redefined by the subclasses """ +        pass + +    def _request_automatic_caption(self, video_id, webpage): +        """ returns {sub_lang: sub} or {} if not available """ +        """ Must be redefined by the subclasses """ +        pass + + +class NoAutoSubtitlesIE(SubtitlesIE): +    """ A subtitle class for the servers that don't support auto-captions""" + +    def _request_automatic_caption(self, video_id, webpage): +        self._downloader.report_warning(u'Automatic Captions not supported by this server') +        return {} diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 0e828263c..e71cd62ec 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -7,6 +7,7 @@ import socket  import itertools  from .common import InfoExtractor, SearchInfoExtractor +from .subtitles import SubtitlesIE  from ..utils import (      compat_http_client,      compat_parse_qs, @@ -130,7 +131,65 @@ class YoutubeBaseInfoExtractor(InfoExtractor):              return          self._confirm_age() -class YoutubeIE(YoutubeBaseInfoExtractor): +class YoutubeSubtitlesIE(SubtitlesIE): + +    def _get_available_subtitles(self, video_id): +        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id) +        try: +            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8') +        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: +            self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err)) +            return {} +        lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list) + +        sub_lang_list = {} +        for l in lang_list: +            lang = l[1] +            params = compat_urllib_parse.urlencode({ +                'lang': lang, +                'v': video_id, +                'fmt': self._downloader.params.get('subtitlesformat'), +            }) +            url = u'http://www.youtube.com/api/timedtext?' + params +            sub_lang_list[lang] = url +        if not sub_lang_list: +            self._downloader.report_warning(u'video doesn\'t have subtitles') +            return {} +        return sub_lang_list + +    def _request_automatic_caption(self, video_id, webpage): +        """We need the webpage for getting the captions url, pass it as an +           argument to speed up the process.""" +        sub_lang = (self._downloader.params.get('subtitleslangs') or ['en'])[0] +        sub_format = self._downloader.params.get('subtitlesformat') +        self.to_screen(u'%s: Looking for automatic captions' % video_id) +        mobj = re.search(r';ytplayer.config = ({.*?});', webpage) +        err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang +        if mobj is None: +            self._downloader.report_warning(err_msg) +            return {} +        player_config = json.loads(mobj.group(1)) +        try: +            args = player_config[u'args'] +            caption_url = args[u'ttsurl'] +            timestamp = args[u'timestamp'] +            params = compat_urllib_parse.urlencode({ +                'lang': 'en', +                'tlang': sub_lang, +                'fmt': sub_format, +                'ts': timestamp, +                'kind': 'asr', +            }) +            subtitles_url = caption_url + '&' + params +            sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions') +            return {sub_lang: sub} +        # An extractor error can be raise by the download process if there are +        # no automatic captions but there are subtitles +        except (KeyError, ExtractorError): +            self._downloader.report_warning(err_msg) +            return {} + +class YoutubeIE(YoutubeSubtitlesIE, YoutubeBaseInfoExtractor):      IE_DESC = u'YouTube.com'      _VALID_URL = r"""^                       ( @@ -397,19 +456,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):          """Report attempt to download video info webpage."""          self.to_screen(u'%s: Downloading video info webpage' % video_id) -    def report_video_subtitles_download(self, video_id): -        """Report attempt to download video info webpage.""" -        self.to_screen(u'%s: Checking available subtitles' % video_id) - -    def report_video_subtitles_request(self, video_id, sub_lang, format): -        """Report attempt to download video info webpage.""" -        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format)) - -    def report_video_subtitles_available(self, video_id, sub_lang_list): -        """Report available subtitles.""" -        sub_lang = ",".join(list(sub_lang_list.keys())) -        self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang)) -      def report_information_extraction(self, video_id):          """Report attempt to extract video information."""          self.to_screen(u'%s: Extracting video information' % video_id) @@ -464,112 +510,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              # Fallback to the other algortihms              return self._decrypt_signature(s) - -    def _get_available_subtitles(self, video_id): -        self.report_video_subtitles_download(video_id) -        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id) -        try: -            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8') -        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: -            self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err)) -            return {} -        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list) -        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list) -        if not sub_lang_list: -            self._downloader.report_warning(u'video doesn\'t have subtitles') -            return {} -        return sub_lang_list - -    def _list_available_subtitles(self, video_id): -        sub_lang_list = self._get_available_subtitles(video_id) -        self.report_video_subtitles_available(video_id, sub_lang_list) - -    def _request_subtitle(self, sub_lang, sub_name, video_id, format): -        """ -        Return the subtitle as a string or None if they are not found -        """ -        self.report_video_subtitles_request(video_id, sub_lang, format) -        params = compat_urllib_parse.urlencode({ -            'lang': sub_lang, -            'name': sub_name, -            'v': video_id, -            'fmt': format, -        }) -        url = 'http://www.youtube.com/api/timedtext?' + params -        try: -            sub = compat_urllib_request.urlopen(url).read().decode('utf-8') -        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: -            self._downloader.report_warning(u'unable to download video subtitles for %s: %s' % (sub_lang, compat_str(err))) -            return -        if not sub: -            self._downloader.report_warning(u'Did not fetch video subtitles') -            return -        return sub - -    def _request_automatic_caption(self, video_id, webpage): -        """We need the webpage for getting the captions url, pass it as an -           argument to speed up the process.""" -        sub_lang = (self._downloader.params.get('subtitleslangs') or ['en'])[0] -        sub_format = self._downloader.params.get('subtitlesformat') -        self.to_screen(u'%s: Looking for automatic captions' % video_id) -        mobj = re.search(r';ytplayer.config = ({.*?});', webpage) -        err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang -        if mobj is None: -            self._downloader.report_warning(err_msg) -            return {} -        player_config = json.loads(mobj.group(1)) -        try: -            args = player_config[u'args'] -            caption_url = args[u'ttsurl'] -            timestamp = args[u'timestamp'] -            params = compat_urllib_parse.urlencode({ -                'lang': 'en', -                'tlang': sub_lang, -                'fmt': sub_format, -                'ts': timestamp, -                'kind': 'asr', -            }) -            subtitles_url = caption_url + '&' + params -            sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions') -            return {sub_lang: sub} -        # An extractor error can be raise by the download process if there are -        # no automatic captions but there are subtitles -        except (KeyError, ExtractorError): -            self._downloader.report_warning(err_msg) -            return {} -     -    def _extract_subtitles(self, video_id): -        """ -        Return a dictionary: {language: subtitles} or {} if the subtitles -        couldn't be found -        """ -        available_subs_list = self._get_available_subtitles(video_id) -        sub_format = self._downloader.params.get('subtitlesformat') -        if  not available_subs_list: #There was some error, it didn't get the available subtitles -            return {} -        if self._downloader.params.get('allsubtitles', False): -            sub_lang_list = available_subs_list -        else: -            if self._downloader.params.get('subtitleslangs', False): -                reqested_langs = self._downloader.params.get('subtitleslangs') -            elif 'en' in available_subs_list: -                reqested_langs = ['en'] -            else: -                reqested_langs = [list(available_subs_list.keys())[0]] - -            sub_lang_list = {} -            for sub_lang in reqested_langs: -                if not sub_lang in available_subs_list: -                    self._downloader.report_warning(u'no closed captions found in the specified language "%s"' % sub_lang) -                    continue -                sub_lang_list[sub_lang] = available_subs_list[sub_lang] -        subtitles = {} -        for sub_lang in sub_lang_list: -            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format) -            if subtitle: -                subtitles[sub_lang] = subtitle -        return subtitles -      def _print_formats(self, formats):          print('Available formats:')          for x in formats: | 
