diff options
Diffstat (limited to 'youtube_dl/extractor/youtube.py')
| -rw-r--r-- | youtube_dl/extractor/youtube.py | 396 | 
1 files changed, 229 insertions, 167 deletions
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index dc601de52..c860eedda 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -7,20 +7,16 @@ import itertools  import json  import os.path  import re -import socket  import string  import struct  import traceback -import xml.etree.ElementTree  import zlib  from .common import InfoExtractor, SearchInfoExtractor  from .subtitles import SubtitlesInfoExtractor  from ..utils import (      compat_chr, -    compat_http_client,      compat_parse_qs, -    compat_urllib_error,      compat_urllib_parse,      compat_urllib_request,      compat_urlparse, @@ -29,6 +25,7 @@ from ..utils import (      clean_html,      get_cachedir,      get_element_by_id, +    get_element_by_attribute,      ExtractorError,      unescapeHTML,      unified_strdate, @@ -45,19 +42,11 @@ class YoutubeBaseInfoExtractor(InfoExtractor):      # If True it will raise an error if no login info is provided      _LOGIN_REQUIRED = False -    def report_lang(self): -        """Report attempt to set language.""" -        self.to_screen(u'Setting language') -      def _set_language(self): -        request = compat_urllib_request.Request(self._LANG_URL) -        try: -            self.report_lang() -            compat_urllib_request.urlopen(request).read() -        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: -            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err)) -            return False -        return True +        return bool(self._download_webpage( +            self._LANG_URL, None, +            note=u'Setting language', errnote='unable to set language', +            fatal=False))      def _login(self):          (username, password) = self._get_login_info() @@ -67,12 +56,12 @@ class YoutubeBaseInfoExtractor(InfoExtractor):                  raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)              return False -        request = compat_urllib_request.Request(self._LOGIN_URL) -        try: -            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8') -        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: -            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err)) -            return False +        login_page = self._download_webpage( +            self._LOGIN_URL, None, +            note=u'Downloading login page', +            errnote=u'unable to fetch login page', fatal=False) +        if login_page is False: +            return          galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',                                    login_page, u'Login GALX parameter') @@ -102,29 +91,28 @@ class YoutubeBaseInfoExtractor(InfoExtractor):          # chokes on unicode          login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())          login_data = compat_urllib_parse.urlencode(login_form).encode('ascii') -        request = compat_urllib_request.Request(self._LOGIN_URL, login_data) -        try: -            self.report_login() -            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8') -            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None: -                self._downloader.report_warning(u'unable to log in: bad username or password') -                return False -        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: -            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err)) + +        req = compat_urllib_request.Request(self._LOGIN_URL, login_data) +        login_results = self._download_webpage( +            req, None, +            note=u'Logging in', errnote=u'unable to log in', fatal=False) +        if login_results is False: +            return False +        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None: +            self._downloader.report_warning(u'unable to log in: bad username or password')              return False          return True      def _confirm_age(self):          age_form = { -                'next_url':     '/', -                'action_confirm':   'Confirm', -                } -        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form)) -        try: -            self.report_age_confirmation() -            compat_urllib_request.urlopen(request).read().decode('utf-8') -        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: -            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err)) +            'next_url': '/', +            'action_confirm': 'Confirm', +        } +        req = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form)) + +        self._download_webpage( +            req, None, +            note=u'Confirming age', errnote=u'Unable to confirm age')          return True      def _real_initialize(self): @@ -139,10 +127,10 @@ class YoutubeBaseInfoExtractor(InfoExtractor):  class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):      IE_DESC = u'YouTube.com' -    _VALID_URL = r"""^ +    _VALID_URL = r"""(?x)^                       ( -                         (?:https?://)?                                       # http(s):// (optional) -                         (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/| +                         (?:https?://|//)?                                    # http(s):// or protocol-independent URL (optional) +                         (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|                              tube\.majestyc\.net/|                              youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains                           (?:.*?\#/)?                                          # handle anchor (#/) redirect urls @@ -248,21 +236,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):          '248': 'webm',      }      _video_dimensions = { -        '5': '240x400', +        '5': '400x240',          '6': '???',          '13': '???', -        '17': '144x176', -        '18': '360x640', -        '22': '720x1280', -        '34': '360x640', -        '35': '480x854', -        '36': '240x320', -        '37': '1080x1920', -        '38': '3072x4096', -        '43': '360x640', -        '44': '480x854', -        '45': '720x1280', -        '46': '1080x1920', +        '17': '176x144', +        '18': '640x360', +        '22': '1280x720', +        '34': '640x360', +        '35': '854x480', +        '36': '320x240', +        '37': '1920x1080', +        '38': '4096x3072', +        '43': '640x360', +        '44': '854x480', +        '45': '1280x720', +        '46': '1920x1080',          '82': '360p',          '83': '480p',          '84': '720p', @@ -336,19 +324,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):                  u"uploader": u"Philipp Hagemeister",                  u"uploader_id": u"phihag",                  u"upload_date": u"20121002", -                u"description": u"test chars:  \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ." -            } -        }, -        { -            u"url":  u"http://www.youtube.com/watch?v=1ltcDfZMA3U", -            u"file":  u"1ltcDfZMA3U.mp4", -            u"note": u"Test VEVO video (#897)", -            u"info_dict": { -                u"upload_date": u"20070518", -                u"title": u"Maps - It Will Find You", -                u"description": u"Music video by Maps performing It Will Find You.", -                u"uploader": u"MuteUSA", -                u"uploader_id": u"MuteUSA" +                u"description": u"test chars:  \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."              }          },          { @@ -375,6 +351,18 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):                  u"uploader_id": u"justintimberlakeVEVO"              }          }, +        { +            u"url":  u"//www.YouTube.com/watch?v=yZIXLfi8CZQ", +            u"file":  u"yZIXLfi8CZQ.mp4", +            u"note": u"Embed-only video (#1746)", +            u"info_dict": { +                u"upload_date": u"20120608", +                u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012", +                u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7", +                u"uploader": u"SET India", +                u"uploader_id": u"setindia" +            } +        },      ] @@ -382,16 +370,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):      def suitable(cls, url):          """Receives a URL and returns True if suitable for this IE."""          if YoutubePlaylistIE.suitable(url): return False -        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None +        return re.match(cls._VALID_URL, url) is not None      def __init__(self, *args, **kwargs):          super(YoutubeIE, self).__init__(*args, **kwargs)          self._player_cache = {} -    def report_video_webpage_download(self, video_id): -        """Report attempt to download video webpage.""" -        self.to_screen(u'%s: Downloading video webpage' % video_id) -      def report_video_info_webpage_download(self, video_id):          """Report attempt to download video info webpage."""          self.to_screen(u'%s: Downloading video info webpage' % video_id) @@ -1031,6 +1015,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):          """Turn the encrypted s field into a working signature"""          if player_url is not None: +            if player_url.startswith(u'//'): +                player_url = u'https:' + player_url              try:                  player_id = (player_url, len(s))                  if player_id not in self._player_cache: @@ -1094,7 +1080,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):          else:              raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s))) -    def _get_available_subtitles(self, video_id): +    def _get_available_subtitles(self, video_id, webpage):          try:              sub_list = self._download_webpage(                  'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id, @@ -1110,7 +1096,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):              params = compat_urllib_parse.urlencode({                  'lang': lang,                  'v': video_id, -                'fmt': self._downloader.params.get('subtitlesformat'), +                'fmt': self._downloader.params.get('subtitlesformat', 'srt'),                  'name': l[0].encode('utf-8'),              })              url = u'http://www.youtube.com/api/timedtext?' + params @@ -1123,7 +1109,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):      def _get_available_automatic_caption(self, video_id, webpage):          """We need the webpage for getting the captions url, pass it as an             argument to speed up the process.""" -        sub_format = self._downloader.params.get('subtitlesformat') +        sub_format = self._downloader.params.get('subtitlesformat', 'srt')          self.to_screen(u'%s: Looking for automatic captions' % video_id)          mobj = re.search(r';ytplayer.config = ({.*?});', webpage)          err_msg = u'Couldn\'t find automatic captions for %s' % video_id @@ -1142,8 +1128,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):                  'asrs': 1,              })              list_url = caption_url + '&' + list_params -            list_page = self._download_webpage(list_url, video_id) -            caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8')) +            caption_list = self._download_xml(list_url, video_id)              original_lang_node = caption_list.find('track')              if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :                  self._downloader.report_warning(u'Video doesn\'t have automatic captions') @@ -1257,15 +1242,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):          video_id = self._extract_id(url)          # Get video webpage -        self.report_video_webpage_download(video_id)          url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id -        request = compat_urllib_request.Request(url) -        try: -            video_webpage_bytes = compat_urllib_request.urlopen(request).read() -        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: -            raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err)) - -        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore') +        video_webpage = self._download_webpage(url, video_id)          # Attempt to extract SWF player URL          mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage) @@ -1282,7 +1260,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):              # We simulate the access to the video from www.youtube.com/v/{video_id}              # this can be viewed without login into Youtube              data = compat_urllib_parse.urlencode({'video_id': video_id, -                                                  'el': 'embedded', +                                                  'el': 'player_embedded',                                                    'gl': 'US',                                                    'hl': 'en',                                                    'eurl': 'https://youtube.googleapis.com/v/' + video_id, @@ -1311,6 +1289,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):              else:                  raise ExtractorError(u'"token" parameter not in video info for unknown reason') +        if 'view_count' in video_info: +            view_count = int(video_info['view_count'][0]) +        else: +            view_count = None +          # Check for "rental" videos          if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:              raise ExtractorError(u'"rental" videos not supported') @@ -1360,6 +1343,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):          # description          video_description = get_element_by_id("eow-description", video_webpage)          if video_description: +            video_description = re.sub(r'''(?x) +                <a\s+ +                    (?:[a-zA-Z-]+="[^"]+"\s+)*? +                    title="([^"]+)"\s+ +                    (?:[a-zA-Z-]+="[^"]+"\s+)*? +                    class="yt-uix-redirect-link"\s*> +                [^<]+ +                </a> +            ''', r'\1', video_description)              video_description = clean_html(video_description)          else:              fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage) @@ -1368,6 +1360,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):              else:                  video_description = u'' +        def _extract_count(klass): +            count = self._search_regex( +                r'class="%s">([\d,]+)</span>' % re.escape(klass), +                video_webpage, klass, default=None) +            if count is not None: +                return int(count.replace(',', '')) +            return None +        like_count = _extract_count(u'likes-count') +        dislike_count = _extract_count(u'dislikes-count') +          # subtitles          video_subtitles = self.extract_subtitles(video_id, video_webpage) @@ -1377,9 +1379,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):          if 'length_seconds' not in video_info:              self._downloader.report_warning(u'unable to extract video duration') -            video_duration = '' +            video_duration = None          else: -            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]) +            video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))          # annotations          video_annotations = None @@ -1497,11 +1499,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):                  'subtitles':    video_subtitles,                  'duration':     video_duration,                  'age_limit':    18 if age_gate else 0, -                'annotations':  video_annotations +                'annotations':  video_annotations, +                'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id, +                'view_count': view_count, +                'like_count': like_count, +                'dislike_count': dislike_count,              })          return results -class YoutubePlaylistIE(InfoExtractor): +class YoutubePlaylistIE(YoutubeBaseInfoExtractor):      IE_DESC = u'YouTube.com playlists'      _VALID_URL = r"""(?:                          (?:https?://)? @@ -1512,13 +1518,14 @@ class YoutubePlaylistIE(InfoExtractor):                             \? (?:.*?&)*? (?:p|a|list)=                          |  p/                          ) -                        ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,}) +                        ((?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,})                          .*                       | -                        ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,}) +                        ((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})                       )""" -    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none' -    _MAX_RESULTS = 50 +    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&page=%s' +    _MORE_PAGES_INDICATOR = r'data-link-type="next"' +    _VIDEO_RE = r'href="/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&[^"]*?index=(?P<index>\d+)'      IE_NAME = u'youtube:playlist'      @classmethod @@ -1526,6 +1533,27 @@ class YoutubePlaylistIE(InfoExtractor):          """Receives a URL and returns True if suitable for this IE."""          return re.match(cls._VALID_URL, url, re.VERBOSE) is not None +    def _real_initialize(self): +        self._login() + +    def _ids_to_results(self, ids): +        return [self.url_result(vid_id, 'Youtube', video_id=vid_id) +                       for vid_id in ids] + +    def _extract_mix(self, playlist_id): +        # The mixes are generated from a a single video +        # the id of the playlist is just 'RD' + video_id +        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id) +        webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix') +        title_span = (get_element_by_attribute('class', 'title long-title', webpage) or +            get_element_by_attribute('class', 'title ', webpage)) +        title = clean_html(title_span) +        video_re = r'data-index="\d+".*?href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s' % re.escape(playlist_id) +        ids = orderedSet(re.findall(video_re, webpage)) +        url_results = self._ids_to_results(ids) + +        return self.playlist_result(url_results, playlist_id, title) +      def _real_extract(self, url):          # Extract playlist id          mobj = re.match(self._VALID_URL, url, re.VERBOSE) @@ -1539,51 +1567,73 @@ class YoutubePlaylistIE(InfoExtractor):              video_id = query_dict['v'][0]              if self._downloader.params.get('noplaylist'):                  self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id) -                return self.url_result('https://www.youtube.com/watch?v=' + video_id, 'Youtube') +                return self.url_result(video_id, 'Youtube', video_id=video_id)              else:                  self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id)) -        # Download playlist videos from API -        videos = [] +        if playlist_id.startswith('RD'): +            # Mixes require a custom extraction process +            return self._extract_mix(playlist_id) +        if playlist_id.startswith('TL'): +            raise ExtractorError(u'For downloading YouTube.com top lists, use ' +                u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True) + +        # Extract the video ids from the playlist pages +        ids = []          for page_num in itertools.count(1): -            start_index = self._MAX_RESULTS * (page_num - 1) + 1 -            if start_index >= 1000: -                self._downloader.report_warning(u'Max number of results reached') -                break -            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index) +            url = self._TEMPLATE_URL % (playlist_id, page_num)              page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num) +            matches = re.finditer(self._VIDEO_RE, page) +            # We remove the duplicates and the link with index 0 +            # (it's not the first video of the playlist) +            new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0') +            ids.extend(new_ids) -            try: -                response = json.loads(page) -            except ValueError as err: -                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err)) - -            if 'feed' not in response: -                raise ExtractorError(u'Got a malformed response from YouTube API') -            playlist_title = response['feed']['title']['$t'] -            if 'entry' not in response['feed']: -                # Number of videos is a multiple of self._MAX_RESULTS +            if re.search(self._MORE_PAGES_INDICATOR, page) is None:                  break -            for entry in response['feed']['entry']: -                index = entry['yt$position']['$t'] -                if 'media$group' in entry and 'yt$videoid' in entry['media$group']: -                    videos.append(( -                        index, -                        'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t'] -                    )) +        playlist_title = self._og_search_title(page) -        videos = [v[1] for v in sorted(videos)] +        url_results = self._ids_to_results(ids) +        return self.playlist_result(url_results, playlist_id, playlist_title) -        url_results = [self.url_result(vurl, 'Youtube') for vurl in videos] -        return [self.playlist_result(url_results, playlist_id, playlist_title)] + +class YoutubeTopListIE(YoutubePlaylistIE): +    IE_NAME = u'youtube:toplist' +    IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"' +        u' (Example: "yttoplist:music:Top Tracks")') +    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$' + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        channel = mobj.group('chann') +        title = mobj.group('title') +        query = compat_urllib_parse.urlencode({'title': title}) +        playlist_re = 'href="([^"]+?%s[^"]+?)"' % re.escape(query) +        channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title) +        link = self._html_search_regex(playlist_re, channel_page, u'list') +        url = compat_urlparse.urljoin('https://www.youtube.com/', link) +         +        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"' +        ids = [] +        # sometimes the webpage doesn't contain the videos +        # retry until we get them +        for i in itertools.count(0): +            msg = u'Downloading Youtube mix' +            if i > 0: +                msg += ', retry #%d' % i +            webpage = self._download_webpage(url, title, msg) +            ids = orderedSet(re.findall(video_re, webpage)) +            if ids: +                break +        url_results = self._ids_to_results(ids) +        return self.playlist_result(url_results, playlist_title=title)  class YoutubeChannelIE(InfoExtractor):      IE_DESC = u'YouTube.com channels'      _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)" -    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'      _MORE_PAGES_INDICATOR = 'yt-uix-load-more'      _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'      IE_NAME = u'youtube:channel' @@ -1604,36 +1654,38 @@ class YoutubeChannelIE(InfoExtractor):          # Download channel page          channel_id = mobj.group(1)          video_ids = [] -        pagenum = 1 - -        url = self._TEMPLATE_URL % (channel_id, pagenum) -        page = self._download_webpage(url, channel_id, -                                      u'Downloading page #%s' % pagenum) - -        # Extract video identifiers -        ids_in_page = self.extract_videos_from_page(page) -        video_ids.extend(ids_in_page) - -        # Download any subsequent channel pages using the json-based channel_ajax query -        if self._MORE_PAGES_INDICATOR in page: +        url = 'https://www.youtube.com/channel/%s/videos' % channel_id +        channel_page = self._download_webpage(url, channel_id) +        autogenerated = re.search(r'''(?x) +                class="[^"]*?(?: +                    channel-header-autogenerated-label| +                    yt-channel-title-autogenerated +                )[^"]*"''', channel_page) is not None + +        if autogenerated: +            # The videos are contained in a single page +            # the ajax pages can't be used, they are empty +            video_ids = self.extract_videos_from_page(channel_page) +        else: +            # Download all channel pages using the json-based channel_ajax query              for pagenum in itertools.count(1):                  url = self._MORE_PAGES_URL % (pagenum, channel_id)                  page = self._download_webpage(url, channel_id,                                                u'Downloading page #%s' % pagenum) - +                      page = json.loads(page) - +                      ids_in_page = self.extract_videos_from_page(page['content_html'])                  video_ids.extend(ids_in_page) - -                if self._MORE_PAGES_INDICATOR  not in page['load_more_widget_html']: +     +                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:                      break          self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids))) -        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids] -        url_entries = [self.url_result(eurl, 'Youtube') for eurl in urls] -        return [self.playlist_result(url_entries, channel_id)] +        url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id) +                       for video_id in video_ids] +        return self.playlist_result(url_entries, channel_id)  class YoutubeUserIE(InfoExtractor): @@ -1697,9 +1749,11 @@ class YoutubeUserIE(InfoExtractor):              if len(ids_in_page) < self._GDATA_PAGE_SIZE:                  break -        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids] -        url_results = [self.url_result(rurl, 'Youtube') for rurl in urls] -        return [self.playlist_result(url_results, playlist_title = username)] +        url_results = [ +            self.url_result(video_id, 'Youtube', video_id=video_id) +            for video_id in video_ids] +        return self.playlist_result(url_results, playlist_title=username) +  class YoutubeSearchIE(SearchInfoExtractor):      IE_DESC = u'YouTube.com searches' @@ -1708,10 +1762,6 @@ class YoutubeSearchIE(SearchInfoExtractor):      IE_NAME = u'youtube:search'      _SEARCH_KEY = 'ytsearch' -    def report_download_page(self, query, pagenum): -        """Report attempt to download search page with given number.""" -        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum)) -      def _get_n_results(self, query, n):          """Get a specified number of results for a query""" @@ -1720,16 +1770,15 @@ class YoutubeSearchIE(SearchInfoExtractor):          limit = n          while (50 * pagenum) < limit: -            self.report_download_page(query, pagenum+1)              result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1) -            request = compat_urllib_request.Request(result_url) -            try: -                data = compat_urllib_request.urlopen(request).read().decode('utf-8') -            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: -                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err)) -            api_response = json.loads(data)['data'] - -            if not 'items' in api_response: +            data_json = self._download_webpage( +                result_url, video_id=u'query "%s"' % query, +                note=u'Downloading page %s' % (pagenum + 1), +                errnote=u'Unable to download API page') +            data = json.loads(data_json) +            api_response = data['data'] + +            if 'items' not in api_response:                  raise ExtractorError(u'[youtube] No video results')              new_ids = list(video['id'] for video in api_response['items']) @@ -1740,9 +1789,15 @@ class YoutubeSearchIE(SearchInfoExtractor):          if len(video_ids) > n:              video_ids = video_ids[:n] -        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids] +        videos = [self.url_result(video_id, 'Youtube', video_id=video_id) +                  for video_id in video_ids]          return self.playlist_result(videos, query) +class YoutubeSearchDateIE(YoutubeSearchIE): +    IE_NAME = YoutubeSearchIE.IE_NAME + ':date' +    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published' +    _SEARCH_KEY = 'ytsearchdate' +    IE_DESC = u'YouTube.com searches, newest videos first'  class YoutubeShowIE(InfoExtractor):      IE_DESC = u'YouTube.com (multi-season) shows' @@ -1766,7 +1821,6 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):      Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.      """      _LOGIN_REQUIRED = True -    _PAGING_STEP = 30      # use action_load_personal_feed instead of action_load_system_feed      _PERSONAL_FEED = False @@ -1786,9 +1840,8 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):      def _real_extract(self, url):          feed_entries = [] -        # The step argument is available only in 2.7 or higher -        for i in itertools.count(0): -            paging = i*self._PAGING_STEP +        paging = 0 +        for i in itertools.count(1):              info = self._download_webpage(self._FEED_TEMPLATE % paging,                                            u'%s feed' % self._FEED_NAME,                                            u'Downloading page %s' % i) @@ -1796,9 +1849,12 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):              feed_html = info['feed_html']              m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)              ids = orderedSet(m.group(1) for m in m_ids) -            feed_entries.extend(self.url_result(id, 'Youtube') for id in ids) +            feed_entries.extend( +                self.url_result(video_id, 'Youtube', video_id=video_id) +                for video_id in ids)              if info['paging'] is None:                  break +            paging = info['paging']          return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)  class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor): @@ -1818,9 +1874,15 @@ class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):      _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'      _FEED_NAME = 'watch_later'      _PLAYLIST_TITLE = u'Youtube Watch Later' -    _PAGING_STEP = 100      _PERSONAL_FEED = True +class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): +    IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)' +    _VALID_URL = u'https?://www\.youtube\.com/feed/history|:ythistory' +    _FEED_NAME = 'history' +    _PERSONAL_FEED = True +    _PLAYLIST_TITLE = u'Youtube Watch History' +  class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):      IE_NAME = u'youtube:favorites'      IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'  | 
