diff options
| -rw-r--r-- | test/test_all_urls.py | 6 | ||||
| -rw-r--r-- | youtube_dl/extractor/extractors.py | 2 | ||||
| -rw-r--r-- | youtube_dl/extractor/youtube.py | 177 | 
3 files changed, 93 insertions, 92 deletions
diff --git a/test/test_all_urls.py b/test/test_all_urls.py
index df6d81b5d..0e1328ede 100644
--- a/test/test_all_urls.py
+++ b/test/test_all_urls.py
@@ -66,9 +66,9 @@ class TestAllURLsMatching(unittest.TestCase):
         self.assertMatch('https://www.youtube.com/feed/watch_later', ['youtube:tab'])
         self.assertMatch('https://www.youtube.com/feed/subscriptions', ['youtube:tab'])
 
-    # def test_youtube_search_matching(self):
-    #     self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url'])
-    #     self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url'])
+    def test_youtube_search_matching(self):
+        self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url'])
+        self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url'])
 
     def test_youtube_extract(self):
         assertExtractId = lambda url, id: self.assertEqual(YoutubeIE.extract_id(url), id)
diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index 9b449937d..d403a2dbe 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -1556,7 +1556,7 @@ from .youtube import (
     YoutubeRecommendedIE,
     YoutubeSearchDateIE,
     YoutubeSearchIE,
-    #YoutubeSearchURLIE,
+    YoutubeSearchURLIE,
     YoutubeSubscriptionsIE,
     YoutubeTruncatedIDIE,
     YoutubeTruncatedURLIE,
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 87bdc1677..578cfcf90 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -308,6 +308,77 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
                 r'ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;', webpage, 'ytcfg',
                 default='{}'), video_id, fatal=False)
 
+    def _search_results(self, query, params):
+        data = {
+            'context': {
+                'client': {
+                    'clientName': 'WEB',
+                    'clientVersion': '2.20201021.03.00',
+                }
+            },
+            'query': query,
+        }
+        if params:
+            data['params'] = params
+        for page_num in itertools.count(1):
+            search = self._download_json(
+                'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
+                video_id='query "%s"' % query,
+                note='Downloading page %s' % page_num,
+                errnote='Unable to download API page', fatal=False,
+                data=json.dumps(data).encode('utf8'),
+                headers={'content-type': 'application/json'})
+            if not search:
+                break
+            slr_contents = try_get(
+                search,
+                (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'],
+                 lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']),
+                list)
+            if not slr_contents:
+                break
+            isr_contents = try_get(
+                slr_contents,
+                lambda x: x[0]['itemSectionRenderer']['contents'],
+                list)
+            if not isr_contents:
+                break
+            for content in isr_contents:
+                if not isinstance(content, dict):
+                    continue
+                video = content.get('videoRenderer')
+                if not isinstance(video, dict):
+                    continue
+                video_id = video.get('videoId')
+                if not video_id:
+                    continue
+                title = try_get(video, lambda x: x['title']['runs'][0]['text'], compat_str)
+                description = try_get(video, lambda x: x['descriptionSnippet']['runs'][0]['text'], compat_str)
+                duration = parse_duration(try_get(video, lambda x: x['lengthText']['simpleText'], compat_str))
+                view_count_text = try_get(video, lambda x: x['viewCountText']['simpleText'], compat_str) or ''
+                view_count = int_or_none(self._search_regex(
+                    r'^(\d+)', re.sub(r'\s', '', view_count_text),
+                    'view count', default=None))
+                uploader = try_get(video, lambda x: x['ownerText']['runs'][0]['text'], compat_str)
+                yield {
+                    '_type': 'url_transparent',
+                    'ie_key': YoutubeIE.ie_key(),
+                    'id': video_id,
+                    'url': video_id,
+                    'title': title,
+                    'description': description,
+                    'duration': duration,
+                    'view_count': view_count,
+                    'uploader': uploader,
+                }
+            token = try_get(
+                slr_contents,
+                lambda x: x[1]['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'],
+                compat_str)
+            if not token:
+                break
+            data['continuation'] = token
+
 
 class YoutubeIE(YoutubeBaseInfoExtractor):
     IE_DESC = 'YouTube.com'
@@ -2454,7 +2525,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
                         (?:
                             (?:channel|c|user|feed)/|
                             (?:playlist|watch)\?.*?\blist=|
-                            (?!(?:watch|embed|v|e)\b)
+                            (?!(?:watch|embed|v|e|results)\b)
                         )
                         (?P<id>[^/?\#&]+)
                     '''
@@ -3379,88 +3450,18 @@ class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
 
 class YoutubeSearchIE(SearchInfoExtractor, YoutubeBaseInfoExtractor):
     IE_DESC = 'YouTube.com searches'
-    # there doesn't appear to be a real limit, for example if you search for
-    # 'python' you get more than 8.000.000 results
-    _MAX_RESULTS = float('inf')
     IE_NAME = 'youtube:search'
     _SEARCH_KEY = 'ytsearch'
-    _SEARCH_PARAMS = None
+    _SEARCH_PARAMS = 'EgIQAQ%3D%3D'  # Videos only
     _TESTS = []
 
     def _entries(self, query, n):
-        data = {
-            'context': {
-                'client': {
-                    'clientName': 'WEB',
-                    'clientVersion': '2.20201021.03.00',
-                }
-            },
-            'query': query,
-        }
-        if self._SEARCH_PARAMS:
-            data['params'] = self._SEARCH_PARAMS
         total = 0
-        for page_num in itertools.count(1):
-            search = self._download_json(
-                'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
-                video_id='query "%s"' % query,
-                note='Downloading page %s' % page_num,
-                errnote='Unable to download API page', fatal=False,
-                data=json.dumps(data).encode('utf8'),
-                headers={'content-type': 'application/json'})
-            if not search:
-                break
-            slr_contents = try_get(
-                search,
-                (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'],
-                 lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']),
-                list)
-            if not slr_contents:
-                break
-            isr_contents = try_get(
-                slr_contents,
-                lambda x: x[0]['itemSectionRenderer']['contents'],
-                list)
-            if not isr_contents:
-                break
-            for content in isr_contents:
-                if not isinstance(content, dict):
-                    continue
-                video = content.get('videoRenderer')
-                if not isinstance(video, dict):
-                    continue
-                video_id = video.get('videoId')
-                if not video_id:
-                    continue
-                title = try_get(video, lambda x: x['title']['runs'][0]['text'], compat_str)
-                description = try_get(video, lambda x: x['descriptionSnippet']['runs'][0]['text'], compat_str)
-                duration = parse_duration(try_get(video, lambda x: x['lengthText']['simpleText'], compat_str))
-                view_count_text = try_get(video, lambda x: x['viewCountText']['simpleText'], compat_str) or ''
-                view_count = int_or_none(self._search_regex(
-                    r'^(\d+)', re.sub(r'\s', '', view_count_text),
-                    'view count', default=None))
-                uploader = try_get(video, lambda x: x['ownerText']['runs'][0]['text'], compat_str)
-                total += 1
-                yield {
-                    '_type': 'url_transparent',
-                    'ie_key': YoutubeIE.ie_key(),
-                    'id': video_id,
-                    'url': video_id,
-                    'title': title,
-                    'description': description,
-                    'duration': duration,
-                    'view_count': view_count,
-                    'uploader': uploader,
-                }
-                if total == n:
-                    return
-            token = try_get(
-                slr_contents,
-                lambda x: x[1]['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'],
-                compat_str)
-            if not token:
-                break
-            data['continuation'] = token
+        for entry in self._search_results(query, self._SEARCH_PARAMS):
+            yield entry
+            total += 1
+            if total >= n:
+                return
 
     def _get_n_results(self, query, n):
         """Get a specified number of results for a query"""
@@ -3471,18 +3472,19 @@ class YoutubeSearchDateIE(YoutubeSearchIE):
     IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
     _SEARCH_KEY = 'ytsearchdate'
     IE_DESC = 'YouTube.com searches, newest videos first'
-    _SEARCH_PARAMS = 'CAI%3D'
+    _SEARCH_PARAMS = 'CAISAhAB'  # Videos only, sorted by date
+    _TESTS = []
 
 
-r"""
-class YoutubeSearchURLIE(YoutubeSearchIE):
-    IE_DESC = 'YouTube.com search URLs'
-    IE_NAME = 'youtube:search_url'
-    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)'
+class YoutubeSearchURLIE(YoutubeBaseInfoExtractor):
+    IE_DESC = 'YouTube search URLs with sorting and filter support'
+    IE_NAME = YoutubeSearchIE.IE_NAME + '_url'
+    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?:[^&]+)(?:[&]|$)'
     _TESTS = [{
         'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
         'playlist_mincount': 5,
         'info_dict': {
+            'id': 'youtube-dl test video',
             'title': 'youtube-dl test video',
         }
     }, {
@@ -3491,11 +3493,10 @@ class YoutubeSearchURLIE(YoutubeSearchIE):
     }]
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        query = compat_urllib_parse_unquote_plus(mobj.group('query'))
-        webpage = self._download_webpage(url, query)
-        return self.playlist_result(self._process_page(webpage), playlist_title=query)
-"""
+        qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
+        query = (qs.get('search_query') or qs.get('q'))[0]
+        params = qs.get('sp', ('',))[0]
+        return self.playlist_result(self._search_results(query, params), query, query)
 
 
 class YoutubeFeedsInfoExtractor(YoutubeTabIE):
