diff options
author | Sergey M․ <dstftw@gmail.com> | 2020-11-24 00:10:25 +0700 |
---|---|---|
committer | Sergey M․ <dstftw@gmail.com> | 2020-11-24 00:10:25 +0700 |
commit | 191286265d8ad4fff834249d44e3d334a969b668 (patch) | |
tree | c131ab631c62ed0914b4fca59361416f7288e67f | |
parent | 323427281899cf3527e504ac7058f5b10aad5d61 (diff) |
[youtube:tab] Fix feeds extraction (closes #25695, closes #26452)
-rw-r--r-- | test/test_all_urls.py | 7 | ||||
-rw-r--r-- | youtube_dl/extractor/youtube.py | 161 |
2 files changed, 97 insertions, 71 deletions
diff --git a/test/test_all_urls.py b/test/test_all_urls.py index 56a08bed8..50c3466fa 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -61,9 +61,10 @@ class TestAllURLsMatching(unittest.TestCase): # self.assertMatch('http://www.youtube.com/NASAgovVideo/videos', ['youtube:tab']) def test_youtube_feeds(self): - self.assertMatch('https://www.youtube.com/feed/watch_later', ['youtube:watchlater']) - self.assertMatch('https://www.youtube.com/feed/subscriptions', ['youtube:subscriptions']) - self.assertMatch('https://www.youtube.com/feed/recommended', ['youtube:recommended']) + self.assertMatch('https://www.youtube.com/feed/library', ['youtube:tab']) + self.assertMatch('https://www.youtube.com/feed/history', ['youtube:tab']) + self.assertMatch('https://www.youtube.com/feed/watch_later', ['youtube:tab']) + self.assertMatch('https://www.youtube.com/feed/subscriptions', ['youtube:tab']) # def test_youtube_search_matching(self): # self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url']) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index abb1cad74..7324d8080 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -33,7 +33,6 @@ from ..utils import ( get_element_by_id, int_or_none, mimetype2ext, - orderedSet, parse_codecs, parse_duration, remove_quotes, @@ -2381,7 +2380,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor): class YoutubeTabIE(YoutubeBaseInfoExtractor): IE_DESC = 'YouTube.com tab' - _VALID_URL = r'https?://(?:\w+\.)?(?:youtube(?:kids)?\.com|invidio\.us)/(?:(?:channel|c|user)/|(?:playlist|watch)\?.*?\blist=)(?P<id>[^/?#&]+)' + _VALID_URL = r'''(?x) + https?:// + (?:\w+\.)? + (?: + youtube(?:kids)?\.com| + invidio\.us + )/ + (?: + (?:channel|c|user|feed)/| + (?:playlist|watch)\?.*?\blist= + ) + (?P<id>[^/?\#&]+) + ''' IE_NAME = 'youtube:tab' _TESTS = [{ @@ -2620,7 +2631,30 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): }, { 'url': 'https://www.youtube.com/c/CommanderVideoHq/live', 'only_matching': True, - }, + }, { + 'url': 'https://www.youtube.com/feed/trending', + 'only_matching': True, + }, { + # needs auth + 'url': 'https://www.youtube.com/feed/library', + 'only_matching': True, + }, { + # needs auth + 'url': 'https://www.youtube.com/feed/history', + 'only_matching': True, + }, { + # needs auth + 'url': 'https://www.youtube.com/feed/subscriptions', + 'only_matching': True, + }, { + # needs auth + 'url': 'https://www.youtube.com/feed/watch_later', + 'only_matching': True, + }, { + # no longer available? + 'url': 'https://www.youtube.com/feed/recommended', + 'only_matching': True, + } # TODO # { # 'url': 'https://www.youtube.com/TheYoungTurks/live', @@ -2707,27 +2741,34 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'https://www.youtube.com/channel/%s' % channel_id, ie=YoutubeTabIE.ie_key(), video_title=title) - def _shelf_entries_trimmed(self, shelf_renderer): - renderer = try_get( - shelf_renderer, lambda x: x['content']['horizontalListRenderer'], dict) - if not renderer: + def _shelf_entries_from_content(self, shelf_renderer): + content = shelf_renderer.get('content') + if not isinstance(content, dict): return - # TODO: add support for nested playlists so each shelf is processed - # as separate playlist - # TODO: this includes only first N items - for entry in self._grid_entries(renderer): - yield entry + renderer = content.get('gridRenderer') + if renderer: + # TODO: add support for nested playlists so each shelf is processed + # as separate playlist + # TODO: this includes only first N items + for entry in self._grid_entries(renderer): + yield entry + renderer = content.get('horizontalListRenderer') + if renderer: + # TODO + pass def _shelf_entries(self, shelf_renderer): ep = try_get( shelf_renderer, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'], compat_str) shelf_url = urljoin('https://www.youtube.com', ep) - if not shelf_url: - return - title = try_get( - shelf_renderer, lambda x: x['title']['runs'][0]['text'], compat_str) - yield self.url_result(shelf_url, video_title=title) + if shelf_url: + title = try_get( + shelf_renderer, lambda x: x['title']['runs'][0]['text'], compat_str) + yield self.url_result(shelf_url, video_title=title) + # Shelf may not contain shelf URL, fallback to extraction from content + for entry in self._shelf_entries_from_content(shelf_renderer): + yield entry def _playlist_entries(self, video_list_renderer): for content in video_list_renderer['contents']: @@ -2832,8 +2873,11 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): } def _entries(self, tab, identity_token): + slr_renderer = try_get(tab, lambda x: x['sectionListRenderer'], dict) + if not slr_renderer: + return continuation = None - slr_contents = try_get(tab, lambda x: x['sectionListRenderer']['contents'], list) or [] + slr_contents = try_get(slr_renderer, lambda x: x['contents'], list) or [] for slr_content in slr_contents: if not isinstance(slr_content, dict): continue @@ -2876,6 +2920,9 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): if not continuation: continuation = self._extract_continuation(is_renderer) + if not continuation: + continuation = self._extract_continuation(slr_renderer) + headers = { 'x-youtube-client-name': '1', 'x-youtube-client-version': '2.20201112.04.01', @@ -2924,7 +2971,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): continuation_item = continuation_items[0] if not isinstance(continuation_item, dict): continue - renderer = continuation_item.get('playlistVideoRenderer') + renderer = continuation_item.get('playlistVideoRenderer') or continuation_item.get('itemSectionRenderer') if renderer: video_list_renderer = {'contents': continuation_items} for entry in self._playlist_entries(video_list_renderer): @@ -2969,6 +3016,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): selected_tab = self._extract_selected_tab(tabs) renderer = try_get( data, lambda x: x['metadata']['channelMetadataRenderer'], dict) + playlist_id = title = description = None if renderer: channel_title = renderer.get('title') or item_id tab_title = selected_tab.get('title') @@ -3289,10 +3337,10 @@ class YoutubeSearchURLIE(YoutubeSearchIE): """ -class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): +class YoutubeFeedsInfoExtractor(YoutubeTabIE): """ Base class for feed extractors - Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties. + Subclasses must define the _FEED_NAME property. """ _LOGIN_REQUIRED = True @@ -3303,55 +3351,17 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): def _real_initialize(self): self._login() - def _entries(self, page): - # The extraction process is the same as for playlists, but the regex - # for the video ids doesn't contain an index - ids = [] - more_widget_html = content_html = page - for page_num in itertools.count(1): - matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html) - - # 'recommended' feed has infinite 'load more' and each new portion spins - # the same videos in (sometimes) slightly different order, so we'll check - # for unicity and break when portion has no new videos - new_ids = list(filter(lambda video_id: video_id not in ids, orderedSet(matches))) - if not new_ids: - break - - ids.extend(new_ids) - - for entry in self._ids_to_results(new_ids): - yield entry - - mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html) - if not mobj: - break - - more = self._download_json( - 'https://www.youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE, - 'Downloading page #%s' % page_num, - transform_source=uppercase_escape, - headers=self._YOUTUBE_CLIENT_HEADERS) - content_html = more['content_html'] - more_widget_html = more['load_more_widget_html'] - def _real_extract(self, url): - page = self._download_webpage( + return self.url_result( 'https://www.youtube.com/feed/%s' % self._FEED_NAME, - self._PLAYLIST_TITLE) - return self.playlist_result( - self._entries(page), playlist_title=self._PLAYLIST_TITLE) + ie=YoutubeTabIE.ie_key()) class YoutubeWatchLaterIE(InfoExtractor): IE_NAME = 'youtube:watchlater' IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/watch_later|:ytwatchlater' - + _VALID_URL = r':ytwatchlater' _TESTS = [{ - 'url': 'https://www.youtube.com/feed/watch_later', - 'only_matching': True, - }, { 'url': ':ytwatchlater', 'only_matching': True, }] @@ -3363,23 +3373,38 @@ class YoutubeWatchLaterIE(InfoExtractor): class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor): IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/recommended|:ytrec(?:ommended)?' + _VALID_URL = r':ytrec(?:ommended)?' _FEED_NAME = 'recommended' - _PLAYLIST_TITLE = 'Youtube Recommended videos' + _TESTS = [{ + 'url': ':ytrec', + 'only_matching': True, + }, { + 'url': ':ytrecommended', + 'only_matching': True, + }] class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor): IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?' + _VALID_URL = r':ytsubs(?:criptions)?' _FEED_NAME = 'subscriptions' - _PLAYLIST_TITLE = 'Youtube Subscriptions' + _TESTS = [{ + 'url': ':ytsubs', + 'only_matching': True, + }, { + 'url': ':ytsubscriptions', + 'only_matching': True, + }] class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory' + _VALID_URL = r':ythistory' _FEED_NAME = 'history' - _PLAYLIST_TITLE = 'Youtube History' + _TESTS = [{ + 'url': ':ythistory', + 'only_matching': True, + }] class YoutubeTruncatedURLIE(InfoExtractor): |