diff options
Diffstat (limited to 'youtube_dl/extractor/youtube.py')
| -rw-r--r-- | youtube_dl/extractor/youtube.py | 235 | 
1 files changed, 112 insertions, 123 deletions
| diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 9e2373bd5..f49665925 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -5,8 +5,10 @@ import netrc  import re  import socket  import itertools +import xml.etree.ElementTree  from .common import InfoExtractor, SearchInfoExtractor +from .subtitles import SubtitlesInfoExtractor  from ..utils import (      compat_http_client,      compat_parse_qs, @@ -130,12 +132,13 @@ class YoutubeBaseInfoExtractor(InfoExtractor):              return          self._confirm_age() -class YoutubeIE(YoutubeBaseInfoExtractor): + +class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):      IE_DESC = u'YouTube.com'      _VALID_URL = r"""^                       (                           (?:https?://)?                                       # http(s):// (optional) -                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/| +                         (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|                              tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains                           (?:.*?\#/)?                                          # handle anchor (#/) redirect urls                           (?:                                                  # the various things that can precede the ID: @@ -146,15 +149,18 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                                   (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)                                   v=                               ) -                         )?                                                   # optional -> youtube.com/xxxx is OK +                         )) +                         |youtu\.be/                                          # just youtu.be/xxxx +                         )                       )?                                                       # all until now is optional -> you can pass the naked ID -                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID +                     ([0-9A-Za-z_-]{11})                                      # here is it! the YouTube video ID                       (?(1).+)?                                                # if we found the ID, everything can follow                       $"""      _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'      # Listed in order of quality -    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13', -                          '95', '94', '93', '92', '132', '151', +    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13', +                          # Apple HTTP Live Streaming +                          '96', '95', '94', '93', '92', '132', '151',                            # 3D                            '85', '84', '102', '83', '101', '82', '100',                            # Dash video @@ -163,8 +169,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                            # Dash audio                            '141', '172', '140', '171', '139',                            ] -    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13', -                                      '95', '94', '93', '92', '132', '151', +    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13', +                                      # Apple HTTP Live Streaming +                                      '96', '95', '94', '93', '92', '132', '151', +                                      # 3D                                        '85', '102', '84', '101', '83', '100', '82',                                        # Dash video                                        '138', '248', '137', '247', '136', '246', '245', @@ -172,11 +180,18 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                                        # Dash audio                                        '172', '141', '171', '140', '139',                                        ] +    _video_formats_map = { +        'flv': ['35', '34', '6', '5'], +        '3gp': ['36', '17', '13'], +        'mp4': ['38', '37', '22', '18'], +        'webm': ['46', '45', '44', '43'], +    }      _video_extensions = {          '13': '3gp', -        '17': 'mp4', +        '17': '3gp',          '18': 'mp4',          '22': 'mp4', +        '36': '3gp',          '37': 'mp4',          '38': 'mp4',          '43': 'webm', @@ -193,7 +208,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):          '101': 'webm',          '102': 'webm', -        # videos that use m3u8 +        # Apple HTTP Live Streaming          '92': 'mp4',          '93': 'mp4',          '94': 'mp4', @@ -234,6 +249,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):          '22': '720x1280',          '34': '360x640',          '35': '480x854', +        '36': '240x320',          '37': '1080x1920',          '38': '3072x4096',          '43': '360x640', @@ -373,7 +389,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):      @classmethod      def suitable(cls, url):          """Receives a URL and returns True if suitable for this IE.""" -        if YoutubePlaylistIE.suitable(url) or YoutubeSubscriptionsIE.suitable(url): return False +        if YoutubePlaylistIE.suitable(url): return False          return re.match(cls._VALID_URL, url, re.VERBOSE) is not None      def report_video_webpage_download(self, video_id): @@ -384,19 +400,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):          """Report attempt to download video info webpage."""          self.to_screen(u'%s: Downloading video info webpage' % video_id) -    def report_video_subtitles_download(self, video_id): -        """Report attempt to download video info webpage.""" -        self.to_screen(u'%s: Checking available subtitles' % video_id) - -    def report_video_subtitles_request(self, video_id, sub_lang, format): -        """Report attempt to download video info webpage.""" -        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format)) - -    def report_video_subtitles_available(self, video_id, sub_lang_list): -        """Report available subtitles.""" -        sub_lang = ",".join(list(sub_lang_list.keys())) -        self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang)) -      def report_information_extraction(self, video_id):          """Report attempt to extract video information."""          self.to_screen(u'%s: Extracting video information' % video_id) @@ -423,15 +426,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):          elif len(s) == 87:              return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]          elif len(s) == 86: -            return s[83:36:-1] + s[0] + s[35:2:-1] +            return s[5:34] + s[0] + s[35:38] + s[3] + s[39:45] + s[38] + s[46:53] + s[73] + s[54:73] + s[85] + s[74:85] + s[53]          elif len(s) == 85: -            return s[83:34:-1] + s[0] + s[33:27:-1] + s[3] + s[26:19:-1] + s[34] + s[18:3:-1] + s[27] +            return s[40] + s[82:43:-1] + s[22] + s[42:40:-1] + s[83] + s[39:22:-1] + s[0] + s[21:2:-1]          elif len(s) == 84:              return s[81:36:-1] + s[0] + s[35:2:-1]          elif len(s) == 83:              return s[81:64:-1] + s[82] + s[63:52:-1] + s[45] + s[51:45:-1] + s[1] + s[44:1:-1] + s[0]          elif len(s) == 82: -            return s[1:19] + s[0] + s[20:68] + s[19] + s[69:82] +            return s[80:73:-1] + s[81] + s[72:54:-1] + s[2] + s[53:43:-1] + s[0] + s[42:2:-1] + s[43] + s[1] + s[54]          elif len(s) == 81:              return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]          elif len(s) == 80: @@ -451,56 +454,38 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              # Fallback to the other algortihms              return self._decrypt_signature(s) -      def _get_available_subtitles(self, video_id): -        self.report_video_subtitles_download(video_id) -        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)          try: -            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8') -        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: +            sub_list = self._download_webpage( +                'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id, +                video_id, note=False) +        except ExtractorError as err:              self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))              return {} -        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list) -        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list) +        lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list) + +        sub_lang_list = {} +        for l in lang_list: +            lang = l[1] +            params = compat_urllib_parse.urlencode({ +                'lang': lang, +                'v': video_id, +                'fmt': self._downloader.params.get('subtitlesformat'), +            }) +            url = u'http://www.youtube.com/api/timedtext?' + params +            sub_lang_list[lang] = url          if not sub_lang_list:              self._downloader.report_warning(u'video doesn\'t have subtitles')              return {}          return sub_lang_list -    def _list_available_subtitles(self, video_id): -        sub_lang_list = self._get_available_subtitles(video_id) -        self.report_video_subtitles_available(video_id, sub_lang_list) - -    def _request_subtitle(self, sub_lang, sub_name, video_id, format): -        """ -        Return the subtitle as a string or None if they are not found -        """ -        self.report_video_subtitles_request(video_id, sub_lang, format) -        params = compat_urllib_parse.urlencode({ -            'lang': sub_lang, -            'name': sub_name, -            'v': video_id, -            'fmt': format, -        }) -        url = 'http://www.youtube.com/api/timedtext?' + params -        try: -            sub = compat_urllib_request.urlopen(url).read().decode('utf-8') -        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: -            self._downloader.report_warning(u'unable to download video subtitles for %s: %s' % (sub_lang, compat_str(err))) -            return -        if not sub: -            self._downloader.report_warning(u'Did not fetch video subtitles') -            return -        return sub - -    def _request_automatic_caption(self, video_id, webpage): +    def _get_available_automatic_caption(self, video_id, webpage):          """We need the webpage for getting the captions url, pass it as an             argument to speed up the process.""" -        sub_lang = (self._downloader.params.get('subtitleslangs') or ['en'])[0]          sub_format = self._downloader.params.get('subtitlesformat')          self.to_screen(u'%s: Looking for automatic captions' % video_id)          mobj = re.search(r';ytplayer.config = ({.*?});', webpage) -        err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang +        err_msg = u'Couldn\'t find automatic captions for %s' % video_id          if mobj is None:              self._downloader.report_warning(err_msg)              return {} @@ -509,53 +494,38 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              args = player_config[u'args']              caption_url = args[u'ttsurl']              timestamp = args[u'timestamp'] -            params = compat_urllib_parse.urlencode({ -                'lang': 'en', -                'tlang': sub_lang, -                'fmt': sub_format, -                'ts': timestamp, -                'kind': 'asr', +            # We get the available subtitles +            list_params = compat_urllib_parse.urlencode({ +                'type': 'list', +                'tlangs': 1, +                'asrs': 1,              }) -            subtitles_url = caption_url + '&' + params -            sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions') -            return {sub_lang: sub} +            list_url = caption_url + '&' + list_params +            list_page = self._download_webpage(list_url, video_id) +            caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8')) +            original_lang_node = caption_list.find('track') +            if original_lang_node.attrib.get('kind') != 'asr' : +                self._downloader.report_warning(u'Video doesn\'t have automatic captions') +                return {} +            original_lang = original_lang_node.attrib['lang_code'] + +            sub_lang_list = {} +            for lang_node in caption_list.findall('target'): +                sub_lang = lang_node.attrib['lang_code'] +                params = compat_urllib_parse.urlencode({ +                    'lang': original_lang, +                    'tlang': sub_lang, +                    'fmt': sub_format, +                    'ts': timestamp, +                    'kind': 'asr', +                }) +                sub_lang_list[sub_lang] = caption_url + '&' + params +            return sub_lang_list          # An extractor error can be raise by the download process if there are          # no automatic captions but there are subtitles          except (KeyError, ExtractorError):              self._downloader.report_warning(err_msg)              return {} -     -    def _extract_subtitles(self, video_id): -        """ -        Return a dictionary: {language: subtitles} or {} if the subtitles -        couldn't be found -        """ -        available_subs_list = self._get_available_subtitles(video_id) -        sub_format = self._downloader.params.get('subtitlesformat') -        if  not available_subs_list: #There was some error, it didn't get the available subtitles -            return {} -        if self._downloader.params.get('allsubtitles', False): -            sub_lang_list = available_subs_list -        else: -            if self._downloader.params.get('subtitleslangs', False): -                reqested_langs = self._downloader.params.get('subtitleslangs') -            elif 'en' in available_subs_list: -                reqested_langs = ['en'] -            else: -                reqested_langs = [list(available_subs_list.keys())[0]] - -            sub_lang_list = {} -            for sub_lang in reqested_langs: -                if not sub_lang in available_subs_list: -                    self._downloader.report_warning(u'no closed captions found in the specified language "%s"' % sub_lang) -                    continue -                sub_lang_list[sub_lang] = available_subs_list[sub_lang] -        subtitles = {} -        for sub_lang in sub_lang_list: -            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format) -            if subtitle: -                subtitles[sub_lang] = subtitle -        return subtitles      def _print_formats(self, formats):          print('Available formats:') @@ -597,13 +567,25 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats          else:              # Specific formats. We pick the first in a slash-delimeted sequence. -            # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'. +            # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality +            # available in the specified format. For example, +            # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'. +            # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'. +            # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.              req_formats = req_format.split('/')              video_url_list = None              for rf in req_formats:                  if rf in url_map:                      video_url_list = [(rf, url_map[rf])]                      break +                if rf in self._video_formats_map: +                    for srf in self._video_formats_map[rf]: +                        if srf in url_map: +                            video_url_list = [(srf, url_map[srf])] +                            break +                    else: +                        continue +                    break              if video_url_list is None:                  raise ExtractorError(u'requested format not available')          return video_url_list @@ -743,15 +725,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  video_description = u''          # subtitles -        video_subtitles = None - -        if self._downloader.params.get('writesubtitles', False) or self._downloader.params.get('allsubtitles', False): -            video_subtitles = self._extract_subtitles(video_id) -        elif self._downloader.params.get('writeautomaticsub', False): -            video_subtitles = self._request_automatic_caption(video_id, video_webpage) +        video_subtitles = self.extract_subtitles(video_id, video_webpage)          if self._downloader.params.get('listsubtitles', False): -            self._list_available_subtitles(video_id) +            self._list_available_subtitles(video_id, video_webpage)              return          if 'length_seconds' not in video_info: @@ -920,8 +897,11 @@ class YoutubePlaylistIE(InfoExtractor):              for entry in response['feed']['entry']:                  index = entry['yt$position']['$t'] -                if 'media$group' in entry and 'media$player' in entry['media$group']: -                    videos.append((index, entry['media$group']['media$player']['url'])) +                if 'media$group' in entry and 'yt$videoid' in entry['media$group']: +                    videos.append(( +                        index, +                        'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t'] +                    ))          videos = [v[1] for v in sorted(videos)] @@ -987,13 +967,20 @@ class YoutubeChannelIE(InfoExtractor):  class YoutubeUserIE(InfoExtractor):      IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)' -    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)' +    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?)|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'      _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'      _GDATA_PAGE_SIZE = 50 -    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d' -    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]' +    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'      IE_NAME = u'youtube:user' +    @classmethod +    def suitable(cls, url): +        # Don't return True if the url can be extracted with other youtube +        # extractor, the regex would is too permissive and it would match. +        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls) +        if any(ie.suitable(url) for ie in other_ies): return False +        else: return super(YoutubeUserIE, cls).suitable(url) +      def _real_extract(self, url):          # Extract username          mobj = re.match(self._VALID_URL, url) @@ -1016,13 +1003,15 @@ class YoutubeUserIE(InfoExtractor):              page = self._download_webpage(gdata_url, username,                                            u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE)) +            try: +                response = json.loads(page) +            except ValueError as err: +                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err)) +              # Extract video identifiers              ids_in_page = [] - -            for mobj in re.finditer(self._VIDEO_INDICATOR, page): -                if mobj.group(1) not in ids_in_page: -                    ids_in_page.append(mobj.group(1)) - +            for entry in response['feed']['entry']: +                ids_in_page.append(entry['id']['$t'].split('/')[-1])              video_ids.extend(ids_in_page)              # A little optimization - if current page is not @@ -1161,7 +1150,7 @@ class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):  class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):      IE_NAME = u'youtube:favorites'      IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)' -    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:o?rites)?' +    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'      _LOGIN_REQUIRED = True      def _real_extract(self, url): | 
