diff options
| -rw-r--r-- | test/test_all_urls.py | 7 | ||||
| -rw-r--r-- | youtube_dl/extractor/youtube.py | 161 | 
2 files changed, 97 insertions, 71 deletions
| diff --git a/test/test_all_urls.py b/test/test_all_urls.py index 56a08bed8..50c3466fa 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -61,9 +61,10 @@ class TestAllURLsMatching(unittest.TestCase):      #     self.assertMatch('http://www.youtube.com/NASAgovVideo/videos', ['youtube:tab'])      def test_youtube_feeds(self): -        self.assertMatch('https://www.youtube.com/feed/watch_later', ['youtube:watchlater']) -        self.assertMatch('https://www.youtube.com/feed/subscriptions', ['youtube:subscriptions']) -        self.assertMatch('https://www.youtube.com/feed/recommended', ['youtube:recommended']) +        self.assertMatch('https://www.youtube.com/feed/library', ['youtube:tab']) +        self.assertMatch('https://www.youtube.com/feed/history', ['youtube:tab']) +        self.assertMatch('https://www.youtube.com/feed/watch_later', ['youtube:tab']) +        self.assertMatch('https://www.youtube.com/feed/subscriptions', ['youtube:tab'])      # def test_youtube_search_matching(self):      #     self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url']) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index abb1cad74..7324d8080 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -33,7 +33,6 @@ from ..utils import (      get_element_by_id,      int_or_none,      mimetype2ext, -    orderedSet,      parse_codecs,      parse_duration,      remove_quotes, @@ -2381,7 +2380,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor):  class YoutubeTabIE(YoutubeBaseInfoExtractor):      IE_DESC = 'YouTube.com tab' -    _VALID_URL = r'https?://(?:\w+\.)?(?:youtube(?:kids)?\.com|invidio\.us)/(?:(?:channel|c|user)/|(?:playlist|watch)\?.*?\blist=)(?P<id>[^/?#&]+)' +    _VALID_URL = r'''(?x) +                    https?:// +                        (?:\w+\.)? +                        (?: +                            youtube(?:kids)?\.com| +                            invidio\.us +                        )/ +                        (?: +                            (?:channel|c|user|feed)/| +                            (?:playlist|watch)\?.*?\blist= +                        ) +                        (?P<id>[^/?\#&]+) +                    '''      IE_NAME = 'youtube:tab'      _TESTS = [{ @@ -2620,7 +2631,30 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):      }, {          'url': 'https://www.youtube.com/c/CommanderVideoHq/live',          'only_matching': True, -    }, +    }, { +        'url': 'https://www.youtube.com/feed/trending', +        'only_matching': True, +    }, { +        # needs auth +        'url': 'https://www.youtube.com/feed/library', +        'only_matching': True, +    }, { +        # needs auth +        'url': 'https://www.youtube.com/feed/history', +        'only_matching': True, +    }, { +        # needs auth +        'url': 'https://www.youtube.com/feed/subscriptions', +        'only_matching': True, +    }, { +        # needs auth +        'url': 'https://www.youtube.com/feed/watch_later', +        'only_matching': True, +    }, { +        # no longer available? +        'url': 'https://www.youtube.com/feed/recommended', +        'only_matching': True, +    }          # TODO          # {          #     'url': 'https://www.youtube.com/TheYoungTurks/live', @@ -2707,27 +2741,34 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):                      'https://www.youtube.com/channel/%s' % channel_id,                      ie=YoutubeTabIE.ie_key(), video_title=title) -    def _shelf_entries_trimmed(self, shelf_renderer): -        renderer = try_get( -            shelf_renderer, lambda x: x['content']['horizontalListRenderer'], dict) -        if not renderer: +    def _shelf_entries_from_content(self, shelf_renderer): +        content = shelf_renderer.get('content') +        if not isinstance(content, dict):              return -        # TODO: add support for nested playlists so each shelf is processed -        # as separate playlist -        # TODO: this includes only first N items -        for entry in self._grid_entries(renderer): -            yield entry +        renderer = content.get('gridRenderer') +        if renderer: +            # TODO: add support for nested playlists so each shelf is processed +            # as separate playlist +            # TODO: this includes only first N items +            for entry in self._grid_entries(renderer): +                yield entry +        renderer = content.get('horizontalListRenderer') +        if renderer: +            # TODO +            pass      def _shelf_entries(self, shelf_renderer):          ep = try_get(              shelf_renderer, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'],              compat_str)          shelf_url = urljoin('https://www.youtube.com', ep) -        if not shelf_url: -            return -        title = try_get( -            shelf_renderer, lambda x: x['title']['runs'][0]['text'], compat_str) -        yield self.url_result(shelf_url, video_title=title) +        if shelf_url: +            title = try_get( +                shelf_renderer, lambda x: x['title']['runs'][0]['text'], compat_str) +            yield self.url_result(shelf_url, video_title=title) +        # Shelf may not contain shelf URL, fallback to extraction from content +        for entry in self._shelf_entries_from_content(shelf_renderer): +            yield entry      def _playlist_entries(self, video_list_renderer):          for content in video_list_renderer['contents']: @@ -2832,8 +2873,11 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):              }      def _entries(self, tab, identity_token): +        slr_renderer = try_get(tab, lambda x: x['sectionListRenderer'], dict) +        if not slr_renderer: +            return          continuation = None -        slr_contents = try_get(tab, lambda x: x['sectionListRenderer']['contents'], list) or [] +        slr_contents = try_get(slr_renderer, lambda x: x['contents'], list) or []          for slr_content in slr_contents:              if not isinstance(slr_content, dict):                  continue @@ -2876,6 +2920,9 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):              if not continuation:                  continuation = self._extract_continuation(is_renderer) +        if not continuation: +            continuation = self._extract_continuation(slr_renderer) +          headers = {              'x-youtube-client-name': '1',              'x-youtube-client-version': '2.20201112.04.01', @@ -2924,7 +2971,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):                  continuation_item = continuation_items[0]                  if not isinstance(continuation_item, dict):                      continue -                renderer = continuation_item.get('playlistVideoRenderer') +                renderer = continuation_item.get('playlistVideoRenderer') or continuation_item.get('itemSectionRenderer')                  if renderer:                      video_list_renderer = {'contents': continuation_items}                      for entry in self._playlist_entries(video_list_renderer): @@ -2969,6 +3016,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):          selected_tab = self._extract_selected_tab(tabs)          renderer = try_get(              data, lambda x: x['metadata']['channelMetadataRenderer'], dict) +        playlist_id = title = description = None          if renderer:              channel_title = renderer.get('title') or item_id              tab_title = selected_tab.get('title') @@ -3289,10 +3337,10 @@ class YoutubeSearchURLIE(YoutubeSearchIE):  """ -class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): +class YoutubeFeedsInfoExtractor(YoutubeTabIE):      """      Base class for feed extractors -    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties. +    Subclasses must define the _FEED_NAME property.      """      _LOGIN_REQUIRED = True @@ -3303,55 +3351,17 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):      def _real_initialize(self):          self._login() -    def _entries(self, page): -        # The extraction process is the same as for playlists, but the regex -        # for the video ids doesn't contain an index -        ids = [] -        more_widget_html = content_html = page -        for page_num in itertools.count(1): -            matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html) - -            # 'recommended' feed has infinite 'load more' and each new portion spins -            # the same videos in (sometimes) slightly different order, so we'll check -            # for unicity and break when portion has no new videos -            new_ids = list(filter(lambda video_id: video_id not in ids, orderedSet(matches))) -            if not new_ids: -                break - -            ids.extend(new_ids) - -            for entry in self._ids_to_results(new_ids): -                yield entry - -            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html) -            if not mobj: -                break - -            more = self._download_json( -                'https://www.youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE, -                'Downloading page #%s' % page_num, -                transform_source=uppercase_escape, -                headers=self._YOUTUBE_CLIENT_HEADERS) -            content_html = more['content_html'] -            more_widget_html = more['load_more_widget_html'] -      def _real_extract(self, url): -        page = self._download_webpage( +        return self.url_result(              'https://www.youtube.com/feed/%s' % self._FEED_NAME, -            self._PLAYLIST_TITLE) -        return self.playlist_result( -            self._entries(page), playlist_title=self._PLAYLIST_TITLE) +            ie=YoutubeTabIE.ie_key())  class YoutubeWatchLaterIE(InfoExtractor):      IE_NAME = 'youtube:watchlater'      IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)' -    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/watch_later|:ytwatchlater' - +    _VALID_URL = r':ytwatchlater'      _TESTS = [{ -        'url': 'https://www.youtube.com/feed/watch_later', -        'only_matching': True, -    }, {          'url': ':ytwatchlater',          'only_matching': True,      }] @@ -3363,23 +3373,38 @@ class YoutubeWatchLaterIE(InfoExtractor):  class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):      IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)' -    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/recommended|:ytrec(?:ommended)?' +    _VALID_URL = r':ytrec(?:ommended)?'      _FEED_NAME = 'recommended' -    _PLAYLIST_TITLE = 'Youtube Recommended videos' +    _TESTS = [{ +        'url': ':ytrec', +        'only_matching': True, +    }, { +        'url': ':ytrecommended', +        'only_matching': True, +    }]  class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):      IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)' -    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?' +    _VALID_URL = r':ytsubs(?:criptions)?'      _FEED_NAME = 'subscriptions' -    _PLAYLIST_TITLE = 'Youtube Subscriptions' +    _TESTS = [{ +        'url': ':ytsubs', +        'only_matching': True, +    }, { +        'url': ':ytsubscriptions', +        'only_matching': True, +    }]  class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):      IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)' -    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory' +    _VALID_URL = r':ythistory'      _FEED_NAME = 'history' -    _PLAYLIST_TITLE = 'Youtube History' +    _TESTS = [{ +        'url': ':ythistory', +        'only_matching': True, +    }]  class YoutubeTruncatedURLIE(InfoExtractor): | 
