diff options
| -rw-r--r-- | youtube_dl/extractor/youtube.py | 121 | 
1 files changed, 52 insertions, 69 deletions
| diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index b252e36e1..08e821362 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -178,6 +178,52 @@ class YoutubeBaseInfoExtractor(InfoExtractor):              return +class YoutubePlaylistBaseInfoExtractor(InfoExtractor): +    # Extract the video ids from the playlist pages +    def _entries(self, page, playlist_id): +        more_widget_html = content_html = page +        for page_num in itertools.count(1): +            for video_id, video_title in self.extract_videos_from_page(content_html): +                yield self.url_result( +                    video_id, 'Youtube', video_id=video_id, +                    video_title=video_title) + +            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html) +            if not mobj: +                break + +            more = self._download_json( +                'https://youtube.com/%s' % mobj.group('more'), playlist_id, +                'Downloading page #%s' % page_num, +                transform_source=uppercase_escape) +            content_html = more['content_html'] +            if not content_html.strip(): +                # Some webpages show a "Load more" button but they don't +                # have more videos +                break +            more_widget_html = more['load_more_widget_html'] + +    def extract_videos_from_page(self, page): +        ids_in_page = [] +        titles_in_page = [] +        for mobj in re.finditer(self._VIDEO_RE, page): +            # The link with index 0 is not the first video of the playlist (not sure if still actual) +            if 'index' in mobj.groupdict() and mobj.group('id') == '0': +                continue +            video_id = mobj.group('id') +            video_title = unescapeHTML(mobj.group('title')) +            if video_title: +                video_title = video_title.strip() +            try: +                idx = ids_in_page.index(video_id) +                if video_title and not titles_in_page[idx]: +                    titles_in_page[idx] = video_title +            except ValueError: +                ids_in_page.append(video_id) +                titles_in_page.append(video_title) +        return zip(ids_in_page, titles_in_page) + +  class YoutubeIE(YoutubeBaseInfoExtractor):      IE_DESC = 'YouTube.com'      _VALID_URL = r"""(?x)^ @@ -1419,7 +1465,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):          } -class YoutubePlaylistIE(YoutubeBaseInfoExtractor): +class YoutubePlaylistIE(YoutubeBaseInfoExtractor, YoutubePlaylistBaseInfoExtractor):      IE_DESC = 'YouTube.com playlists'      _VALID_URL = r"""(?x)(?:                          (?:https?://)? @@ -1440,7 +1486,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):                          ((?:PL|LL|EC|UU|FL|RD|UL)[0-9A-Za-z-_]{10,})                       )"""      _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s' -    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&[^"]*?index=(?P<index>\d+)' +    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&[^"]*?index=(?P<index>\d+)(?:[^>]+>(?P<title>[^<]+))?'      IE_NAME = 'youtube:playlist'      _TESTS = [{          'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re', @@ -1557,37 +1603,11 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):              else:                  self.report_warning('Youtube gives an alert message: ' + match) -        # Extract the video ids from the playlist pages -        def _entries(): -            more_widget_html = content_html = page -            for page_num in itertools.count(1): -                matches = re.finditer(self._VIDEO_RE, content_html) -                # We remove the duplicates and the link with index 0 -                # (it's not the first video of the playlist) -                new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0') -                for vid_id in new_ids: -                    yield self.url_result(vid_id, 'Youtube', video_id=vid_id) - -                mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html) -                if not mobj: -                    break - -                more = self._download_json( -                    'https://youtube.com/%s' % mobj.group('more'), playlist_id, -                    'Downloading page #%s' % page_num, -                    transform_source=uppercase_escape) -                content_html = more['content_html'] -                if not content_html.strip(): -                    # Some webpages show a "Load more" button but they don't -                    # have more videos -                    break -                more_widget_html = more['load_more_widget_html'] -          playlist_title = self._html_search_regex(              r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',              page, 'title') -        return self.playlist_result(_entries(), playlist_id, playlist_title) +        return self.playlist_result(self._entries(page, playlist_id), playlist_id, playlist_title)      def _real_extract(self, url):          # Extract playlist id @@ -1613,10 +1633,11 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):          return self._extract_playlist(playlist_id) -class YoutubeChannelIE(InfoExtractor): +class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):      IE_DESC = 'YouTube.com channels'      _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'      _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos' +    _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?'      IE_NAME = 'youtube:channel'      _TESTS = [{          'note': 'paginated channel', @@ -1627,22 +1648,6 @@ class YoutubeChannelIE(InfoExtractor):          }      }] -    @staticmethod -    def extract_videos_from_page(page): -        ids_in_page = [] -        titles_in_page = [] -        for mobj in re.finditer(r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?', page): -            video_id = mobj.group('id') -            video_title = unescapeHTML(mobj.group('title')) -            try: -                idx = ids_in_page.index(video_id) -                if video_title and not titles_in_page[idx]: -                    titles_in_page[idx] = video_title -            except ValueError: -                ids_in_page.append(video_id) -                titles_in_page.append(video_title) -        return zip(ids_in_page, titles_in_page) -      def _real_extract(self, url):          channel_id = self._match_id(url) @@ -1685,29 +1690,7 @@ class YoutubeChannelIE(InfoExtractor):                  for video_id, video_title in self.extract_videos_from_page(channel_page)]              return self.playlist_result(entries, channel_id) -        def _entries(): -            more_widget_html = content_html = channel_page -            for pagenum in itertools.count(1): - -                for video_id, video_title in self.extract_videos_from_page(content_html): -                    yield self.url_result( -                        video_id, 'Youtube', video_id=video_id, -                        video_title=video_title) - -                mobj = re.search( -                    r'data-uix-load-more-href="/?(?P<more>[^"]+)"', -                    more_widget_html) -                if not mobj: -                    break - -                more = self._download_json( -                    'https://youtube.com/%s' % mobj.group('more'), channel_id, -                    'Downloading page #%s' % (pagenum + 1), -                    transform_source=uppercase_escape) -                content_html = more['content_html'] -                more_widget_html = more['load_more_widget_html'] - -        return self.playlist_result(_entries(), channel_id) +        return self.playlist_result(self._entries(channel_page, channel_id), channel_id)  class YoutubeUserIE(YoutubeChannelIE): | 
