diff options
Diffstat (limited to 'youtube_dl/extractor/youtube.py')
| -rw-r--r-- | youtube_dl/extractor/youtube.py | 167 | 
1 files changed, 89 insertions, 78 deletions
| diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 97ce36550..687e0b4db 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -178,6 +178,52 @@ class YoutubeBaseInfoExtractor(InfoExtractor):              return +class YoutubePlaylistBaseInfoExtractor(InfoExtractor): +    # Extract the video ids from the playlist pages +    def _entries(self, page, playlist_id): +        more_widget_html = content_html = page +        for page_num in itertools.count(1): +            for video_id, video_title in self.extract_videos_from_page(content_html): +                yield self.url_result( +                    video_id, 'Youtube', video_id=video_id, +                    video_title=video_title) + +            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html) +            if not mobj: +                break + +            more = self._download_json( +                'https://youtube.com/%s' % mobj.group('more'), playlist_id, +                'Downloading page #%s' % page_num, +                transform_source=uppercase_escape) +            content_html = more['content_html'] +            if not content_html.strip(): +                # Some webpages show a "Load more" button but they don't +                # have more videos +                break +            more_widget_html = more['load_more_widget_html'] + +    def extract_videos_from_page(self, page): +        ids_in_page = [] +        titles_in_page = [] +        for mobj in re.finditer(self._VIDEO_RE, page): +            # The link with index 0 is not the first video of the playlist (not sure if still actual) +            if 'index' in mobj.groupdict() and mobj.group('id') == '0': +                continue +            video_id = mobj.group('id') +            video_title = unescapeHTML(mobj.group('title')) +            if video_title: +                video_title = video_title.strip() +            try: +                idx = ids_in_page.index(video_id) +                if video_title and not titles_in_page[idx]: +                    titles_in_page[idx] = video_title +            except ValueError: +                ids_in_page.append(video_id) +                titles_in_page.append(video_title) +        return zip(ids_in_page, titles_in_page) + +  class YoutubeIE(YoutubeBaseInfoExtractor):      IE_DESC = 'YouTube.com'      _VALID_URL = r"""(?x)^ @@ -657,7 +703,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):      def _extract_signature_function(self, video_id, player_url, example_sig):          id_m = re.match( -            r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?)?\.(?P<ext>[a-z]+)$', +            r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|/base)?\.(?P<ext>[a-z]+)$',              player_url)          if not id_m:              raise ExtractorError('Cannot identify player %r' % player_url) @@ -1061,6 +1107,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                      if not video_info:                          video_info = get_video_info                      if 'token' in get_video_info: +                        # Different get_video_info requests may report different results, e.g. +                        # some may report video unavailability, but some may serve it without +                        # any complaint (see https://github.com/rg3/youtube-dl/issues/7362, +                        # the original webpage as well as el=info and el=embedded get_video_info +                        # requests report video unavailability due to geo restriction while +                        # el=detailpage succeeds and returns valid data). This is probably +                        # due to YouTube measures against IP ranges of hosting providers. +                        # Working around by preferring the first succeeded video_info containing +                        # the token if no such video_info yet was found. +                        if 'token' not in video_info: +                            video_info = get_video_info                          break          if 'token' not in video_info:              if 'reason' in video_info: @@ -1286,7 +1343,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                                  player_desc = 'flash player %s' % player_version                              else:                                  player_version = self._search_regex( -                                    r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', +                                    [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', r'(?:www|player)-([^/]+)/base\.js'],                                      player_url,                                      'html5 player', fatal=False)                                  player_desc = 'html5 player %s' % player_version @@ -1419,7 +1476,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):          } -class YoutubePlaylistIE(YoutubeBaseInfoExtractor): +class YoutubePlaylistIE(YoutubeBaseInfoExtractor, YoutubePlaylistBaseInfoExtractor):      IE_DESC = 'YouTube.com playlists'      _VALID_URL = r"""(?x)(?:                          (?:https?://)? @@ -1440,7 +1497,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):                          ((?:PL|LL|EC|UU|FL|RD|UL)[0-9A-Za-z-_]{10,})                       )"""      _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s' -    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&[^"]*?index=(?P<index>\d+)' +    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&[^"]*?index=(?P<index>\d+)(?:[^>]+>(?P<title>[^<]+))?'      IE_NAME = 'youtube:playlist'      _TESTS = [{          'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re', @@ -1557,37 +1614,11 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):              else:                  self.report_warning('Youtube gives an alert message: ' + match) -        # Extract the video ids from the playlist pages -        def _entries(): -            more_widget_html = content_html = page -            for page_num in itertools.count(1): -                matches = re.finditer(self._VIDEO_RE, content_html) -                # We remove the duplicates and the link with index 0 -                # (it's not the first video of the playlist) -                new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0') -                for vid_id in new_ids: -                    yield self.url_result(vid_id, 'Youtube', video_id=vid_id) - -                mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html) -                if not mobj: -                    break - -                more = self._download_json( -                    'https://youtube.com/%s' % mobj.group('more'), playlist_id, -                    'Downloading page #%s' % page_num, -                    transform_source=uppercase_escape) -                content_html = more['content_html'] -                if not content_html.strip(): -                    # Some webpages show a "Load more" button but they don't -                    # have more videos -                    break -                more_widget_html = more['load_more_widget_html'] -          playlist_title = self._html_search_regex(              r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',              page, 'title') -        return self.playlist_result(_entries(), playlist_id, playlist_title) +        return self.playlist_result(self._entries(page, playlist_id), playlist_id, playlist_title)      def _real_extract(self, url):          # Extract playlist id @@ -1613,36 +1644,31 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):          return self._extract_playlist(playlist_id) -class YoutubeChannelIE(InfoExtractor): +class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):      IE_DESC = 'YouTube.com channels'      _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'      _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos' +    _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?'      IE_NAME = 'youtube:channel'      _TESTS = [{          'note': 'paginated channel',          'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',          'playlist_mincount': 91,          'info_dict': { -            'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', +            'id': 'UUKfVa3S1e4PHvxWcwyMMg8w', +            'title': 'Uploads from lex will',          } +    }, { +        'note': 'Age restricted channel', +        # from https://www.youtube.com/user/DeusExOfficial +        'url': 'https://www.youtube.com/channel/UCs0ifCMCm1icqRbqhUINa0w', +        'playlist_mincount': 64, +        'info_dict': { +            'id': 'UUs0ifCMCm1icqRbqhUINa0w', +            'title': 'Uploads from Deus Ex', +        },      }] -    @staticmethod -    def extract_videos_from_page(page): -        ids_in_page = [] -        titles_in_page = [] -        for mobj in re.finditer(r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?', page): -            video_id = mobj.group('id') -            video_title = unescapeHTML(mobj.group('title')) -            try: -                idx = ids_in_page.index(video_id) -                if video_title and not titles_in_page[idx]: -                    titles_in_page[idx] = video_title -            except ValueError: -                ids_in_page.append(video_id) -                titles_in_page.append(video_title) -        return zip(ids_in_page, titles_in_page) -      def _real_extract(self, url):          channel_id = self._match_id(url) @@ -1654,12 +1680,15 @@ class YoutubeChannelIE(InfoExtractor):          channel_page = self._download_webpage(              url + '?view=57', channel_id,              'Downloading channel page', fatal=False) -        channel_playlist_id = self._html_search_meta( -            'channelId', channel_page, 'channel id', default=None) -        if not channel_playlist_id: -            channel_playlist_id = self._search_regex( -                r'data-channel-external-id="([^"]+)"', -                channel_page, 'channel id', default=None) +        if channel_page is False: +            channel_playlist_id = False +        else: +            channel_playlist_id = self._html_search_meta( +                'channelId', channel_page, 'channel id', default=None) +            if not channel_playlist_id: +                channel_playlist_id = self._search_regex( +                    r'data-(?:channel-external-|yt)id="([^"]+)"', +                    channel_page, 'channel id', default=None)          if channel_playlist_id and channel_playlist_id.startswith('UC'):              playlist_id = 'UU' + channel_playlist_id[2:]              return self.url_result( @@ -1682,29 +1711,7 @@ class YoutubeChannelIE(InfoExtractor):                  for video_id, video_title in self.extract_videos_from_page(channel_page)]              return self.playlist_result(entries, channel_id) -        def _entries(): -            more_widget_html = content_html = channel_page -            for pagenum in itertools.count(1): - -                for video_id, video_title in self.extract_videos_from_page(content_html): -                    yield self.url_result( -                        video_id, 'Youtube', video_id=video_id, -                        video_title=video_title) - -                mobj = re.search( -                    r'data-uix-load-more-href="/?(?P<more>[^"]+)"', -                    more_widget_html) -                if not mobj: -                    break - -                more = self._download_json( -                    'https://youtube.com/%s' % mobj.group('more'), channel_id, -                    'Downloading page #%s' % (pagenum + 1), -                    transform_source=uppercase_escape) -                content_html = more['content_html'] -                more_widget_html = more['load_more_widget_html'] - -        return self.playlist_result(_entries(), channel_id) +        return self.playlist_result(self._entries(channel_page, channel_id), channel_id)  class YoutubeUserIE(YoutubeChannelIE): @@ -1970,6 +1977,7 @@ class YoutubeTruncatedURLIE(InfoExtractor):              annotation_id=annotation_[^&]+|              x-yt-cl=[0-9]+|              hl=[^&]*| +            t=[0-9]+          )?          |              attribution_link\?a=[^&]+ @@ -1992,6 +2000,9 @@ class YoutubeTruncatedURLIE(InfoExtractor):      }, {          'url': 'https://www.youtube.com/watch?hl=en-GB',          'only_matching': True, +    }, { +        'url': 'https://www.youtube.com/watch?t=2372', +        'only_matching': True,      }]      def _real_extract(self, url): | 
