diff options
| -rw-r--r-- | youtube_dl/extractor/youtube.py | 202 | ||||
| -rw-r--r-- | youtube_dl/utils.py | 11 | 
2 files changed, 168 insertions, 45 deletions
| diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 28fdb086a..65428528d 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -42,6 +42,7 @@ from ..utils import (      unescapeHTML,      unified_strdate,      unsmuggle_url, +    update_url,      update_url_query,      url_or_none,      urlencode_postdata, @@ -286,15 +287,18 @@ class YoutubeBaseInfoExtractor(InfoExtractor):      _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;'      _YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta|</script|\n)' -    def _call_api(self, ep, query, video_id, fatal=True): +    def _call_api(self, ep, query, video_id, fatal=True, headers=None):          data = self._DEFAULT_API_DATA.copy()          data.update(query) +        real_headers = {'content-type': 'application/json'} +        if headers: +            real_headers.update(headers)          return self._download_json(              'https://www.youtube.com/youtubei/v1/%s' % ep, video_id=video_id,              note='Downloading API JSON', errnote='Unable to download API page',              data=json.dumps(data).encode('utf8'), fatal=fatal, -            headers={'content-type': 'application/json'}, +            headers=real_headers,              query={'key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'})      def _extract_yt_initial_data(self, video_id, webpage): @@ -515,6 +519,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  'uploader': 'Philipp Hagemeister',                  'uploader_id': 'phihag',                  'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag', +                'channel': 'Philipp Hagemeister',                  'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',                  'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',                  'upload_date': '20121002', @@ -524,10 +529,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  'duration': 10,                  'view_count': int,                  'like_count': int, -                'dislike_count': int, +                'thumbnail': 'https://i.ytimg.com/vi/BaW_jenozKc/maxresdefault.jpg',                  'start_time': 1,                  'end_time': 9, -            } +            },          },          {              'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ', @@ -562,7 +567,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  'duration': 10,                  'view_count': int,                  'like_count': int, -                'dislike_count': int,              },              'params': {                  'skip_download': True, @@ -621,8 +625,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms \r\n\r\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',              }          }, -        # Normal age-gate video (No vevo, embed allowed), available via embed page +        # Age-gated videos          { +            'note': 'Age-gated video (No vevo, embed allowed)',              'url': 'https://youtube.com/watch?v=HtVdAasjOgU',              'info_dict': {                  'id': 'HtVdAasjOgU', @@ -631,17 +636,97 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',                  'duration': 142,                  'uploader': 'The Witcher', -                'uploader_id': 'WitcherGame', -                'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',                  'upload_date': '20140605', +                'thumbnail': 'https://i.ytimg.com/vi/HtVdAasjOgU/maxresdefault.jpg', +                'age_limit': 18, +                'categories': ['Gaming'], +                'tags': 'count:17', +                'channel': 'The Witcher', +                'channel_url': 'https://www.youtube.com/channel/UCzybXLxv08IApdjdN0mJhEg', +                'channel_id': 'UCzybXLxv08IApdjdN0mJhEg', +                'view_count': int, +                'like_count': int, +            }, +        }, +        { +            'note': 'Age-gated video with embed allowed in public site', +            'url': 'https://youtube.com/watch?v=HsUATh_Nc2U', +            'info_dict': { +                'id': 'HsUATh_Nc2U', +                'ext': 'mp4', +                'title': 'Godzilla 2 (Official Video)', +                'description': 'md5:bf77e03fcae5529475e500129b05668a', +                'duration': 177, +                'uploader': 'FlyingKitty', +                'upload_date': '20200408', +                'thumbnail': 'https://i.ytimg.com/vi/HsUATh_Nc2U/maxresdefault.jpg',                  'age_limit': 18, +                'categories': ['Entertainment'], +                'tags': ['Flyingkitty', 'godzilla 2'], +                'channel': 'FlyingKitty', +                'channel_url': 'https://www.youtube.com/channel/UCYQT13AtrJC0gsM1far_zJg', +                'channel_id': 'UCYQT13AtrJC0gsM1far_zJg', +                'view_count': int, +                'like_count': int,              },          },          { -            # Age-gated video only available with authentication (unavailable -            # via embed page workaround) +            'note': 'Age-gated video embedable only with clientScreen=EMBED', +            'url': 'https://youtube.com/watch?v=Tq92D6wQ1mg', +            'info_dict': { +                'id': 'Tq92D6wQ1mg', +                'ext': 'mp4', +                'title': '[MMD] Adios - EVERGLOW [+Motion DL]', +                'description': 'md5:17eccca93a786d51bc67646756894066', +                'duration': 106, +                'uploader': 'Projekt Melody', +                'upload_date': '20191227', +                'age_limit': 18, +                'thumbnail': 'https://i.ytimg.com/vi/Tq92D6wQ1mg/sddefault.jpg', +                'tags': ['mmd', 'dance', 'mikumikudance', 'kpop', 'vtuber'], +                'categories': ['Entertainment'], +                'channel': 'Projekt Melody', +                'channel_url': 'https://www.youtube.com/channel/UC1yoRdFoFJaCY-AGfD9W0wQ', +                'channel_id': 'UC1yoRdFoFJaCY-AGfD9W0wQ', +                'view_count': int, +                'like_count': int, +            }, +        }, +        { +            'note': 'Non-Age-gated non-embeddable video', +            'url': 'https://youtube.com/watch?v=MeJVWBSsPAY', +            'info_dict': { +                'id': 'MeJVWBSsPAY', +                'ext': 'mp4', +                'title': 'OOMPH! - Such Mich Find Mich (Lyrics)', +                'description': 'Fan Video. Music & Lyrics by OOMPH!.', +                'duration': 210, +                'uploader': 'Herr Lurik', +                'uploader_id': 'st3in234', +                'upload_date': '20130730', +                'uploader_url': 'http://www.youtube.com/user/st3in234', +                'age_limit': 0, +                'thumbnail': 'https://i.ytimg.com/vi/MeJVWBSsPAY/hqdefault.jpg', +                'tags': ['oomph', 'such mich find mich', 'lyrics', 'german industrial', 'musica industrial'], +                'categories': ['Music'], +                'channel': 'Herr Lurik', +                'channel_url': 'https://www.youtube.com/channel/UCdR3RSDPqub28LjZx0v9-aA', +                'channel_id': 'UCdR3RSDPqub28LjZx0v9-aA', +                'artist': 'OOMPH!', +                'view_count': int, +                'like_count': int, +            }, +        }, +        { +            'note': 'Non-bypassable age-gated video', +            'url': 'https://youtube.com/watch?v=Cr381pDsSsA', +            'only_matching': True, +        }, +        { +            'note': 'Age-gated video only available with authentication (not via embed workaround)',              'url': 'XgnwCQzjau8',              'only_matching': True, +            'skip': '''This video has been removed for violating YouTube's Community Guidelines''',          },          # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421)          # YouTube Red ad is not captured for creator @@ -670,17 +755,23 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              'info_dict': {                  'id': 'lqQg6PlCWgI',                  'ext': 'mp4', +                'title': 'Hockey - Women -  GER-AUS - London 2012 Olympic Games', +                'description': r're:(?s)(?:.+\s)?HO09  - Women -  GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games\s*',                  'duration': 6085,                  'upload_date': '20150827',                  'uploader_id': 'olympic',                  'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic', -                'description': 'HO09  - Women -  GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games', -                'uploader': 'Olympic', -                'title': 'Hockey - Women -  GER-AUS - London 2012 Olympic Games', +                'uploader': r're:Olympics?', +                'age_limit': 0, +                'thumbnail': 'https://i.ytimg.com/vi/lqQg6PlCWgI/maxresdefault.jpg', +                'categories': ['Sports'], +                'tags': ['Hockey', '2012-07-31', '31 July 2012', 'Riverbank Arena', 'Session', 'Olympics', 'Olympic Games', 'London 2012', '2012 Summer Olympics', 'Summer Games'], +                'channel': 'Olympics', +                'channel_url': 'https://www.youtube.com/channel/UCTl3QQTvqHFjurroKxexy2Q', +                'channel_id': 'UCTl3QQTvqHFjurroKxexy2Q', +                'view_count': int, +                'like_count': int,              }, -            'params': { -                'skip_download': 'requires avconv', -            }          },          # Non-square pixels          { @@ -1683,27 +1774,52 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              player_response = self._call_api(                  'player', {'videoId': video_id}, video_id) -        playability_status = player_response.get('playabilityStatus') or {} -        if playability_status.get('reason') == 'Sign in to confirm your age': -            video_info = self._download_webpage( -                base_url + 'get_video_info', video_id, -                'Refetching age-gated info webpage', -                'unable to download video info webpage', query={ -                    'video_id': video_id, -                    'eurl': 'https://youtube.googleapis.com/v/' + video_id, -                    'html5': 1, -                    # See https://github.com/ytdl-org/youtube-dl/issues/29333#issuecomment-864049544 -                    'c': 'TVHTML5', -                    'cver': '6.20180913', -                }, fatal=False) -            if video_info: -                pr = self._parse_json( -                    try_get( -                        compat_parse_qs(video_info), -                        lambda x: x['player_response'][0], compat_str) or '{}', -                    video_id, fatal=False) -                if pr and isinstance(pr, dict): -                    player_response = pr +        def is_agegated(playability): +            if not isinstance(playability, dict): +                return + +            if playability.get('desktopLegacyAgeGateReason'): +                return True + +            reasons = filter(None, (playability.get(r) for r in ('status', 'reason'))) +            AGE_GATE_REASONS = ( +                'confirm your age', 'age-restricted', 'inappropriate',  # reason +                'age_verification_required', 'age_check_required',  # status +            ) +            return any(expected in reason for expected in AGE_GATE_REASONS for reason in reasons) + +        def get_playability_status(response): +            return try_get(response, lambda x: x['playabilityStatus'], dict) or {} + +        playability_status = get_playability_status(player_response) +        if (is_agegated(playability_status) +                and int_or_none(self._downloader.params.get('age_limit'), default=18) >= 18): + +            self.report_age_confirmation() + +            # Thanks: https://github.com/yt-dlp/yt-dlp/pull/3233 +            pb_context = {'html5Preference': 'HTML5_PREF_WANTS'} +            query = { +                'playbackContext': {'contentPlaybackContext': {'html5Preference': 'HTML5_PREF_WANTS'}}, +                'contentCheckOk': True, +                'racyCheckOk': True, +                'context': { +                    'client': {'clientName': 'TVHTML5_SIMPLY_EMBEDDED_PLAYER', 'clientVersion': '2.0', 'hl': 'en', 'clientScreen': 'EMBED'}, +                    'thirdParty': {'embedUrl': 'https://google.com'}, +                }, +                'videoId': video_id, +            } +            headers = { +                'X-YouTube-Client-Name': '85', +                'X-YouTube-Client-Version': '2.0', +                'Origin': 'https://www.youtube.com' +            } + +            video_info = self._call_api('player', query, video_id, fatal=False, headers=headers) +            age_gate_status = get_playability_status(video_info) +            if age_gate_status.get('status') == 'OK': +                player_response = video_info +                playability_status = age_gate_status          trailer_video_id = try_get(              playability_status, @@ -1932,12 +2048,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              for thumbnail in (try_get(                      container,                      lambda x: x['thumbnail']['thumbnails'], list) or []): -                thumbnail_url = thumbnail.get('url') +                thumbnail_url = url_or_none(thumbnail.get('url'))                  if not thumbnail_url:                      continue                  thumbnails.append({                      'height': int_or_none(thumbnail.get('height')), -                    'url': thumbnail_url, +                    'url': update_url(thumbnail_url, query=None, fragment=None),                      'width': int_or_none(thumbnail.get('width')),                  })              if thumbnails: @@ -2142,6 +2258,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                      sbr_tooltip = try_get(                          vpir, lambda x: x['sentimentBar']['sentimentBarRenderer']['tooltip'])                      if sbr_tooltip: +                        # however dislike_count was hidden by YT, as if there could ever be dislikable content on YT                          like_count, dislike_count = sbr_tooltip.split(' / ')                          info.update({                              'like_count': str_to_int(like_count), @@ -2411,7 +2528,6 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):              'tags': list,              'view_count': int,              'like_count': int, -            'dislike_count': int,          },          'params': {              'skip_download': True, @@ -2438,7 +2554,6 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):              'categories': ['News & Politics'],              'tags': list,              'like_count': int, -            'dislike_count': int,          },          'params': {              'skip_download': True, @@ -2458,7 +2573,6 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):              'categories': ['News & Politics'],              'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],              'like_count': int, -            'dislike_count': int,          },          'params': {              'skip_download': True, @@ -3043,8 +3157,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):      def _real_extract(self, url):          item_id = self._match_id(url) -        url = compat_urlparse.urlunparse( -            compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com')) +        url = update_url(url, netloc='www.youtube.com')          # Handle both video/playlist URLs          qs = parse_qs(url)          video_id = qs.get('v', [None])[0] @@ -3178,7 +3291,6 @@ class YoutubeYtBeIE(InfoExtractor):              'categories': ['Nonprofits & Activism'],              'tags': list,              'like_count': int, -            'dislike_count': int,          },          'params': {              'noplaylist': True, diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index e3c3ccff9..d5cc6386d 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -4121,6 +4121,17 @@ def update_url_query(url, query):          query=compat_urllib_parse_urlencode(qs, True))) +def update_url(url, **kwargs): +    """Replace URL components specified by kwargs +       url: compat_str or parsed URL tuple +       returns: compat_str""" +    if not kwargs: +        return compat_urlparse.urlunparse(url) if isinstance(url, tuple) else url +    if not isinstance(url, tuple): +        url = compat_urlparse.urlparse(url) +    return compat_urlparse.urlunparse(url._replace(**kwargs)) + +  def update_Request(req, url=None, data=None, headers={}, query={}):      req_headers = req.headers.copy()      req_headers.update(headers) | 
