[youtube] Improve extraction

+ Add support for --no-playlist (closes #27009) * Improve playlist and mix extraction (closes #26390, closes #26509, closes #26534, closes #27011) + Extract playlist uploader data * Update tests
author: Sergey M․ <dstftw@gmail.com> 2020-11-18 03:32:42 +0700
committer: Sergey M․ <dstftw@gmail.com> 2020-11-18 03:34:08 +0700
commit: 2864179293ba16189544b28356647886960a48fe (patch)
tree: 1327aa1ea264304796784626dee33b4ae33cc171 /youtube_dl/extractor/youtube.py
parent: 284f8306dffb3dbeeca3f99ef4c32ed4fcd571c3 (diff)
1 files changed, 137 insertions, 92 deletions
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 4089e2aba..79f87aa85 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -45,6 +45,7 @@ from ..utils import (
     unescapeHTML,
     unified_strdate,
     unsmuggle_url,
+    update_url_query,
     uppercase_escape,
     url_or_none,
     urlencode_postdata,
@@ -65,7 +66,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
     # If True it will raise an error if no login info is provided
     _LOGIN_REQUIRED = False
 
-    _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}'
+    _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM)'
 
     _YOUTUBE_CLIENT_HEADERS = {
         'x-youtube-client-name': '1',
@@ -975,10 +976,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             'only_matching': True,
         },
         {
-            'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
-            'only_matching': True,
-        },
-        {
             'url': 'https://invidio.us/watch?v=BaW_jenozKc',
             'only_matching': True,
         },
@@ -2351,7 +2348,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
 
 class YoutubeTabIE(YoutubeBaseInfoExtractor):
     IE_DESC = 'YouTube.com tab'
-    _VALID_URL = r'https?://(?:\w+\.)?(?:youtube(?:kids)?\.com|invidio\.us)/(?:(?:channel|c|user)/|playlist\?.*?\blist=)(?P<id>[^/?#&]+)'
+    _VALID_URL = r'https?://(?:\w+\.)?(?:youtube(?:kids)?\.com|invidio\.us)/(?:(?:channel|c|user)/|(?:playlist|watch)\?.*?\blist=)(?P<id>[^/?#&]+)'
     IE_NAME = 'youtube:tab'
 
     _TESTS = [{
@@ -2361,6 +2358,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
         'info_dict': {
             'id': 'UCqj7Cz7revf5maW9g5pgNcg',
             'title': 'Игорь Клейнер - Playlists',
+            'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
         },
     }, {
         # playlists, multipage, different order
@@ -2369,14 +2367,16 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
         'info_dict': {
             'id': 'UCqj7Cz7revf5maW9g5pgNcg',
             'title': 'Игорь Клейнер - Playlists',
+            'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
         },
     }, {
         # playlists, singlepage
         'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
         'playlist_mincount': 4,
         'info_dict': {
-            'id': 'ThirstForScience',
-            'title': 'ThirstForScience',
+            'id': 'UCAEtajcuhQ6an9WEzY9LEMQ',
+            'title': 'ThirstForScience - Playlists',
+            'description': 'md5:609399d937ea957b0f53cbffb747a14c',
         }
     }, {
         'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
@@ -2407,6 +2407,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
         'info_dict': {
             'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
             'title': 'lex will - Home',
+            'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
         },
         'playlist_mincount': 2,
     }, {
@@ -2415,6 +2416,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
         'info_dict': {
             'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
             'title': 'lex will - Videos',
+            'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
         },
         'playlist_mincount': 975,
     }, {
@@ -2423,6 +2425,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
         'info_dict': {
             'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
             'title': 'lex will - Videos',
+            'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
         },
         'playlist_mincount': 199,
     }, {
@@ -2431,6 +2434,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
         'info_dict': {
             'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
             'title': 'lex will - Playlists',
+            'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
         },
         'playlist_mincount': 17,
     }, {
@@ -2439,6 +2443,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
         'info_dict': {
             'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
             'title': 'lex will - Community',
+            'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
         },
         'playlist_mincount': 18,
     }, {
@@ -2447,6 +2452,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
         'info_dict': {
             'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
             'title': 'lex will - Channels',
+            'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
         },
         'playlist_mincount': 138,
     }, {
@@ -2465,7 +2471,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
             'title': '29C3: Not my department',
             'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
             'uploader': 'Christiaan008',
-            'uploader_id': 'ChRiStIaAn008',
+            'uploader_id': 'UCEPzS1rYsrkqzSLNp76nrcg',
         },
         'playlist_count': 96,
     }, {
@@ -2475,7 +2481,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
             'title': 'Uploads from Cauchemar',
             'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
             'uploader': 'Cauchemar',
-            'uploader_id': 'Cauchemar89',
+            'uploader_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q',
         },
         'playlist_mincount': 1123,
     }, {
@@ -2489,7 +2495,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
             'title': 'Uploads from Interstellar Movie',
             'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
             'uploader': 'Interstellar Movie',
-            'uploader_id': 'InterstellarMovie1',
+            'uploader_id': 'UCXw-G3eDE9trcvY2sBMM_aA',
         },
         'playlist_mincount': 21,
     }, {
@@ -2498,13 +2504,43 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
         'info_dict': {
             'title': 'Data Analysis with Dr Mike Pound',
             'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
-            'uploader_id': 'Computerphile',
+            'uploader_id': 'UC9-y-6csu5WGm29I7JiwpnA',
             'uploader': 'Computerphile',
         },
         'playlist_mincount': 11,
     }, {
         'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU',
         'only_matching': True,
+    }, {
+        # Playlist URL that does not actually serve a playlist
+        'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
+        'info_dict': {
+            'id': 'FqZTN594JQw',
+            'ext': 'webm',
+            'title': "Smiley's People 01 detective, Adventure Series, Action",
+            'uploader': 'STREEM',
+            'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
+            'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
+            'upload_date': '20150526',
+            'license': 'Standard YouTube License',
+            'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
+            'categories': ['People & Blogs'],
+            'tags': list,
+            'view_count': int,
+            'like_count': int,
+            'dislike_count': int,
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'skip': 'This video is not available.',
+        'add_ie': [YoutubeIE.ie_key()],
+    }, {
+        'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
+        'only_matching': True,
     }]
 
     @classmethod
@@ -2535,7 +2571,9 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
     def _extract_video(self, renderer):
         video_id = renderer.get('videoId')
         title = try_get(
-            renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
+            renderer,
+            (lambda x: x['title']['runs'][0]['text'],
+             lambda x: x['title']['simpleText']), compat_str)
         description = try_get(
             renderer, lambda x: x['descriptionSnippet']['runs'][0]['text'],
             compat_str)
@@ -2615,7 +2653,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
         for content in video_list_renderer['contents']:
             if not isinstance(content, dict):
                 continue
-            renderer = content.get('playlistVideoRenderer')
+            renderer = content.get('playlistVideoRenderer') or content.get('playlistPanelVideoRenderer')
             if not isinstance(renderer, dict):
                 continue
             video_id = renderer.get('videoId')
@@ -2715,7 +2753,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
 
     def _entries(self, tab, identity_token):
         continuation = None
-        slr_contents = tab['sectionListRenderer']['contents']
+        slr_contents = try_get(tab, lambda x: x['sectionListRenderer']['contents'], list) or []
         for slr_content in slr_contents:
             if not isinstance(slr_content, dict):
                 continue
@@ -2824,13 +2862,30 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
         else:
             raise ExtractorError('Unable to find selected tab')
 
-    def _real_extract(self, url):
-        item_id = self._match_id(url)
-        url = compat_urlparse.urlunparse(
-            compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com'))
-        webpage = self._download_webpage(url, item_id)
-        data = self._extract_yt_initial_data(item_id, webpage)
-        tabs = data['contents']['twoColumnBrowseResultsRenderer']['tabs']
+    @staticmethod
+    def _extract_uploader(data):
+        uploader = {}
+        sidebar_renderer = try_get(
+            data, lambda x: x['sidebar']['playlistSidebarRenderer']['items'], list)
+        if sidebar_renderer:
+            for item in sidebar_renderer:
+                if not isinstance(item, dict):
+                    continue
+                renderer = item.get('playlistSidebarSecondaryInfoRenderer')
+                if not isinstance(renderer, dict):
+                    continue
+                owner = try_get(
+                    renderer, lambda x: x['videoOwner']['videoOwnerRenderer']['title']['runs'][0], dict)
+                if owner:
+                    uploader['uploader'] = owner.get('text')
+                    uploader['uploader_id'] = try_get(
+                        owner, lambda x: x['navigationEndpoint']['browseEndpoint']['browseId'], compat_str)
+                    uploader['uploader_url'] = urljoin(
+                        'https://www.youtube.com/',
+                        try_get(owner, lambda x: x['navigationEndpoint']['browseEndpoint']['canonicalBaseUrl'], compat_str))
+        return uploader
+
+    def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token):
         selected_tab = self._extract_selected_tab(tabs)
         renderer = try_get(
             data, lambda x: x['metadata']['channelMetadataRenderer'], dict)
@@ -2848,42 +2903,69 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
             title = renderer.get('title')
             description = None
             playlist_id = item_id
-        identity_token = self._search_regex(
-            r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage,
-            'identity token', default=None)
-        return self.playlist_result(
+        playlist = self.playlist_result(
             self._entries(selected_tab['content'], identity_token),
             playlist_id=playlist_id, playlist_title=title,
             playlist_description=description)
+        playlist.update(self._extract_uploader(data))
+        return playlist
+
+    def _extract_from_playlist(self, item_id, data, playlist):
+        title = playlist.get('title') or try_get(
+            data, lambda x: x['titleText']['simpleText'], compat_str)
+        playlist_id = playlist.get('playlistId') or item_id
+        return self.playlist_result(
+            self._playlist_entries(playlist), playlist_id=playlist_id,
+            playlist_title=title)
+
+    def _real_extract(self, url):
+        item_id = self._match_id(url)
+        url = compat_urlparse.urlunparse(
+            compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com'))
+        # Handle both video/playlist URLs
+        qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
+        video_id = qs.get('v', [None])[0]
+        playlist_id = qs.get('list', [None])[0]
+        if video_id and playlist_id:
+            if self._downloader.params.get('noplaylist'):
+                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
+                return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
+            self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
+        webpage = self._download_webpage(url, item_id)
+        identity_token = self._search_regex(
+            r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage,
+            'identity token', default=None)
+        data = self._extract_yt_initial_data(item_id, webpage)
+        tabs = try_get(
+            data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)
+        if tabs:
+            return self._extract_from_tabs(item_id, webpage, data, tabs, identity_token)
+        playlist = try_get(
+            data, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict)
+        if playlist:
+            return self._extract_from_playlist(item_id, data, playlist)
+        # Fallback to video extraction if no playlist alike page is recognized
+        if video_id:
+            return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
+        # Failed to recognize
+        raise ExtractorError('Unable to recognize tab page')
 
 
 class YoutubePlaylistIE(InfoExtractor):
     IE_DESC = 'YouTube.com playlists'
-    _VALID_URL = r"""(?x)(?:
+    _VALID_URL = r'''(?x)(?:
                         (?:https?://)?
                         (?:\w+\.)?
                         (?:
                             (?:
                                 youtube(?:kids)?\.com|
-                                invidio\.us
+                                invidio\.us|
+                                youtu\.be
                             )
-                            /
-                            (?:
-                               (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/(?:videoseries|[0-9A-Za-z_-]{11}))
-                               \? (?:.*?[&;])*? (?:p|a|list)=
-                            |  p/
-                            )|
-                            youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist=
-                        )
-                        (
-                            (?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)?[0-9A-Za-z-_]{10,}
-                            # Top tracks, they can also include dots
-                            |(?:MC)[\w\.]*
-                        )
-                        .*
-                     |
-                        (%(playlist_id)s)
-                     )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
+                            /.*?\?.*?\blist=
+                        )?
+                        (?P<id>%(playlist_id)s)
+                     )''' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
     IE_NAME = 'youtube:playlist'
     _TESTS = [{
         'note': 'issue #673',
@@ -2892,7 +2974,7 @@ class YoutubePlaylistIE(InfoExtractor):
             'title': '[OLD]Team Fortress 2 (Class-based LP)',
             'id': 'PLBB231211A4F62143',
             'uploader': 'Wickydoo',
-            'uploader_id': 'Wickydoo',
+            'uploader_id': 'UCKSpbfbl5kRQpTdL7kMc-1Q',
         },
         'playlist_mincount': 29,
     }, {
@@ -2920,42 +3002,9 @@ class YoutubePlaylistIE(InfoExtractor):
             'title': '2018 Chinese New Singles (11/6 updated)',
             'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
             'uploader': 'LBK',
-            'uploader_id': 'sdragonfang',
+            'uploader_id': 'UC21nz3_MesPLqtDqwdvnoxA',
         }
     }, {
-        'note': 'Embedded SWF player',
-        'url': 'https://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
-        'playlist_count': 4,
-        'info_dict': {
-            'title': 'JODA7',
-            'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
-        },
-        'skip': 'This playlist does not exist',
-    }, {
-        # Playlist URL that does not actually serve a playlist
-        'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
-        'info_dict': {
-            'id': 'FqZTN594JQw',
-            'ext': 'webm',
-            'title': "Smiley's People 01 detective, Adventure Series, Action",
-            'uploader': 'STREEM',
-            'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
-            'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
-            'upload_date': '20150526',
-            'license': 'Standard YouTube License',
-            'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
-            'categories': ['People & Blogs'],
-            'tags': list,
-            'view_count': int,
-            'like_count': int,
-            'dislike_count': int,
-        },
-        'params': {
-            'skip_download': True,
-        },
-        'skip': 'This video is not available.',
-        'add_ie': [YoutubeIE.ie_key()],
-    }, {
         'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
         'info_dict': {
             'id': 'yeWKywCrFtk',
@@ -2985,9 +3034,6 @@ class YoutubePlaylistIE(InfoExtractor):
         # music album playlist
         'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
         'only_matching': True,
-    }, {
-        'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g',
-        'only_matching': True,
     }]
 
     @classmethod
@@ -2996,13 +3042,12 @@ class YoutubePlaylistIE(InfoExtractor):
             YoutubePlaylistIE, cls).suitable(url)
 
     def _real_extract(self, url):
-        # Extract playlist id
-        mobj = re.match(self._VALID_URL, url)
-        if mobj is None:
-            raise ExtractorError('Invalid URL: %s' % url)
-        playlist_id = mobj.group(1) or mobj.group(2)
+        playlist_id = self._match_id(url)
+        qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
+        if not qs:
+            qs = {'list': playlist_id}
         return self.url_result(
-            'https://www.youtube.com/playlist?list=%s' % playlist_id,
+            update_url_query('https://www.youtube.com/playlist', qs),
             ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
 
 
@@ -3250,13 +3295,13 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
 class YoutubeWatchLaterIE(InfoExtractor):
     IE_NAME = 'youtube:watchlater'
     IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
-    _VALID_URL = r'https?://(?:www\.)?youtube\.com/(?:feed/watch_later|(?:playlist|watch)\?(?:.+&)?list=WL)|:ytwatchlater'
+    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/watch_later|:ytwatchlater'
 
     _TESTS = [{
-        'url': 'https://www.youtube.com/watch?v=bCNU9TrbiRk&index=1&list=WL',
+        'url': 'https://www.youtube.com/feed/watch_later',
         'only_matching': True,
     }, {
-        'url': 'https://www.youtube.com/feed/watch_later',
+        'url': ':ytwatchlater',
         'only_matching': True,
     }]
author	Sergey M․ <dstftw@gmail.com>	2020-11-18 03:32:42 +0700
committer	Sergey M․ <dstftw@gmail.com>	2020-11-18 03:34:08 +0700
commit	2864179293ba16189544b28356647886960a48fe (patch)
tree	1327aa1ea264304796784626dee33b4ae33cc171 /youtube_dl/extractor/youtube.py
parent	284f8306dffb3dbeeca3f99ef4c32ed4fcd571c3 (diff)