diff options
Diffstat (limited to 'youtube_dl/extractor')
| -rw-r--r-- | youtube_dl/extractor/youtube.py | 474 | 
1 files changed, 289 insertions, 185 deletions
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 3bf483c1c..cd4b3ef60 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2,6 +2,7 @@  from __future__ import unicode_literals +import collections  import itertools  import json  import os.path @@ -23,10 +24,10 @@ from ..compat import (  )  from ..jsinterp import JSInterpreter  from ..utils import ( -    ExtractorError,      clean_html,      dict_get,      error_to_compat_str, +    ExtractorError,      float_or_none,      extract_attributes,      get_element_by_attribute, @@ -36,6 +37,7 @@ from ..utils import (      LazyList,      merge_dicts,      mimetype2ext, +    NO_DEFAULT,      parse_codecs,      parse_duration,      parse_qs, @@ -45,6 +47,7 @@ from ..utils import (      str_or_none,      str_to_int,      traverse_obj, +    try_call,      try_get,      txt_or_none,      unescapeHTML, @@ -1460,6 +1463,30 @@ class YoutubeIE(YoutubeBaseInfoExtractor):          self._code_cache = {}          self._player_cache = {} +    # *ytcfgs, webpage=None +    def _extract_player_url(self, *ytcfgs, **kw_webpage): +        if ytcfgs and not isinstance(ytcfgs[0], dict): +            webpage = kw_webpage.get('webpage') or ytcfgs[0] +        if webpage: +            player_url = self._search_regex( +                r'"(?:PLAYER_JS_URL|jsUrl)"\s*:\s*"([^"]+)"', +                webpage or '', 'player URL', fatal=False) +            if player_url: +                ytcfgs = ytcfgs + ({'PLAYER_JS_URL': player_url},) +        return traverse_obj( +            ytcfgs, (Ellipsis, 'PLAYER_JS_URL'), (Ellipsis, 'WEB_PLAYER_CONTEXT_CONFIGS', Ellipsis, 'jsUrl'), +            get_all=False, expected_type=lambda u: urljoin('https://www.youtube.com', u)) + +    def _download_player_url(self, video_id, fatal=False): +        res = self._download_webpage( +            'https://www.youtube.com/iframe_api', +            note='Downloading iframe API JS', video_id=video_id, fatal=fatal) +        player_version = self._search_regex( +            r'player\\?/([0-9a-fA-F]{8})\\?/', res or '', 'player version', fatal=fatal, +            default=NO_DEFAULT if res else None) +        if player_version: +            return 'https://www.youtube.com/s/player/{0}/player_ias.vflset/en_US/base.js'.format(player_version) +      def _signature_cache_id(self, example_sig):          """ Return a string representation of a signature """          return '.'.join(compat_str(len(part)) for part in example_sig.split('.')) @@ -1474,46 +1501,49 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              raise ExtractorError('Cannot identify player %r' % player_url)          return id_m.group('id') -    def _get_player_code(self, video_id, player_url, player_id=None): +    def _load_player(self, video_id, player_url, fatal=True, player_id=None):          if not player_id:              player_id = self._extract_player_info(player_url) -          if player_id not in self._code_cache: -            self._code_cache[player_id] = self._download_webpage( -                player_url, video_id, +            code = self._download_webpage( +                player_url, video_id, fatal=fatal,                  note='Downloading player ' + player_id,                  errnote='Download of %s failed' % player_url) -        return self._code_cache[player_id] +            if code: +                self._code_cache[player_id] = code +        return self._code_cache[player_id] if fatal else self._code_cache.get(player_id)      def _extract_signature_function(self, video_id, player_url, example_sig):          player_id = self._extract_player_info(player_url)          # Read from filesystem cache -        func_id = 'js_%s_%s' % ( +        func_id = 'js_{0}_{1}'.format(              player_id, self._signature_cache_id(example_sig))          assert os.path.basename(func_id) == func_id -        cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id) -        if cache_spec is not None: -            return lambda s: ''.join(s[i] for i in cache_spec) +        self.write_debug('Extracting signature function {0}'.format(func_id)) +        cache_spec, code = self.cache.load('youtube-sigfuncs', func_id), None -        code = self._get_player_code(video_id, player_url, player_id) -        res = self._parse_sig_js(code) +        if not cache_spec: +            code = self._load_player(video_id, player_url, player_id) +        if code: +            res = self._parse_sig_js(code) +            test_string = ''.join(map(compat_chr, range(len(example_sig)))) +            cache_spec = [ord(c) for c in res(test_string)] +            self.cache.store('youtube-sigfuncs', func_id, cache_spec) -        test_string = ''.join(map(compat_chr, range(len(example_sig)))) -        cache_res = res(test_string) -        cache_spec = [ord(c) for c in cache_res] - -        self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec) -        return res +        return lambda s: ''.join(s[i] for i in cache_spec)      def _print_sig_code(self, func, example_sig): +        if not self.get_param('youtube_print_sig_code'): +            return +          def gen_sig_code(idxs):              def _genslice(start, end, step):                  starts = '' if start == 0 else str(start)                  ends = (':%d' % (end + step)) if end + step >= 0 else ':'                  steps = '' if step == 1 else (':%d' % step) -                return 's[%s%s%s]' % (starts, ends, steps) +                return 's[{0}{1}{2}]'.format(starts, ends, steps)              step = None              # Quelch pyflakes warnings - start will be set when step is set @@ -1564,143 +1594,137 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              jscode, 'Initial JS player signature function name', group='sig')          jsi = JSInterpreter(jscode) -          initial_function = jsi.extract_function(funcname) -          return lambda s: initial_function([s]) -    def _decrypt_signature(self, s, video_id, player_url): -        """Turn the encrypted s field into a working signature""" +    def _cached(self, func, *cache_id): +        def inner(*args, **kwargs): +            if cache_id not in self._player_cache: +                try: +                    self._player_cache[cache_id] = func(*args, **kwargs) +                except ExtractorError as e: +                    self._player_cache[cache_id] = e +                except Exception as e: +                    self._player_cache[cache_id] = ExtractorError(traceback.format_exc(), cause=e) -        if player_url is None: -            raise ExtractorError('Cannot decrypt signature without player_url') +            ret = self._player_cache[cache_id] +            if isinstance(ret, Exception): +                raise ret +            return ret +        return inner -        try: -            player_id = (player_url, self._signature_cache_id(s)) -            if player_id not in self._player_cache: -                func = self._extract_signature_function( -                    video_id, player_url, s -                ) -                self._player_cache[player_id] = func -            func = self._player_cache[player_id] -            if self._downloader.params.get('youtube_print_sig_code'): -                self._print_sig_code(func, s) -            return func(s) -        except Exception as e: -            tb = traceback.format_exc() -            raise ExtractorError( -                'Signature extraction failed: ' + tb, cause=e) - -    def _extract_player_url(self, webpage): -        player_url = self._search_regex( -            r'"(?:PLAYER_JS_URL|jsUrl)"\s*:\s*"([^"]+)"', -            webpage or '', 'player URL', fatal=False) -        if not player_url: -            return -        if player_url.startswith('//'): -            player_url = 'https:' + player_url -        elif not re.match(r'https?://', player_url): -            player_url = compat_urllib_parse.urljoin( -                'https://www.youtube.com', player_url) -        return player_url +    def _decrypt_signature(self, s, video_id, player_url): +        """Turn the encrypted s field into a working signature""" +        extract_sig = self._cached( +            self._extract_signature_function, 'sig', player_url, self._signature_cache_id(s)) +        func = extract_sig(video_id, player_url, s) +        self._print_sig_code(func, s) +        return func(s)      # from yt-dlp      # See also:      # 1. https://github.com/ytdl-org/youtube-dl/issues/29326#issuecomment-894619419      # 2. https://code.videolan.org/videolan/vlc/-/blob/4fb284e5af69aa9ac2100ccbdd3b88debec9987f/share/lua/playlist/youtube.lua#L116      # 3. https://github.com/ytdl-org/youtube-dl/issues/30097#issuecomment-950157377 +    def _decrypt_nsig(self, n, video_id, player_url): +        """Turn the encrypted n field into a working signature""" +        if player_url is None: +            raise ExtractorError('Cannot decrypt nsig without player_url') + +        try: +            jsi, player_id, func_code = self._extract_n_function_code(video_id, player_url) +        except ExtractorError as e: +            raise ExtractorError('Unable to extract nsig jsi, player_id, func_codefunction code', cause=e) +        if self.get_param('youtube_print_sig_code'): +            self.to_screen('Extracted nsig function from {0}:\n{1}\n'.format( +                player_id, func_code[1])) + +        try: +            extract_nsig = self._cached(self._extract_n_function_from_code, 'nsig func', player_url) +            ret = extract_nsig(jsi, func_code)(n) +        except JSInterpreter.Exception as e: +            self.report_warning( +                '%s (%s %s)' % ( +                    self.__ie_msg( +                        'Unable to decode n-parameter: download likely to be throttled'), +                    error_to_compat_str(e), +                    traceback.format_exc())) +            return + +        self.write_debug('Decrypted nsig {0} => {1}'.format(n, ret)) +        return ret +      def _extract_n_function_name(self, jscode): -        target = r'(?P<nfunc>[a-zA-Z_$][\w$]*)(?:\[(?P<idx>\d+)\])?' -        nfunc_and_idx = self._search_regex( -            r'\.get\("n"\)\)&&\(b=(%s)\([\w$]+\)' % (target, ), -            jscode, 'Initial JS player n function name') -        nfunc, idx = re.match(target, nfunc_and_idx).group('nfunc', 'idx') +        func_name, idx = self._search_regex( +            r'\.get\("n"\)\)&&\(b=(?P<nfunc>[a-zA-Z_$][\w$]*)(?:\[(?P<idx>\d+)\])?\([\w$]+\)', +            jscode, 'Initial JS player n function name', group=('nfunc', 'idx'))          if not idx: -            return nfunc +            return func_name -        VAR_RE_TMPL = r'var\s+%s\s*=\s*(?P<name>\[(?P<alias>%s)\])[;,]' -        note = 'Initial JS player n function {0} (%s[%s])' % (nfunc, idx) +        return self._parse_json(self._search_regex( +            r'var {0}\s*=\s*(\[.+?\])\s*[,;]'.format(re.escape(func_name)), jscode, +            'Initial JS player n function list ({0}.{1})'.format(func_name, idx)), +            func_name, transform_source=js_to_json)[int(idx)] -        def search_function_code(needle, group): -            return self._search_regex( -                VAR_RE_TMPL % (re.escape(nfunc), needle), jscode, -                note.format(group), group=group) +    def _extract_n_function_code(self, video_id, player_url): +        player_id = self._extract_player_info(player_url) +        func_code = self.cache.load('youtube-nsig', player_id) +        jscode = func_code or self._load_player(video_id, player_url) +        jsi = JSInterpreter(jscode) -        if int_or_none(idx) == 0: -            real_nfunc = search_function_code(r'[a-zA-Z_$][\w$]*', group='alias') -            if real_nfunc: -                return real_nfunc -        return self._parse_json( -            search_function_code('.+?', group='name'), -            nfunc, transform_source=js_to_json)[int(idx)] +        if func_code: +            return jsi, player_id, func_code -    def _extract_n_function(self, video_id, player_url): -        player_id = self._extract_player_info(player_url) -        func_code = self._downloader.cache.load('youtube-nsig', player_id) +        func_name = self._extract_n_function_name(jscode) +        # For redundancy +        func_code = self._search_regex( +            r'''(?xs)%s\s*=\s*function\s*\((?P<var>[\w$]+)\)\s* +                     # NB: The end of the regex is intentionally kept strict +                     {(?P<code>.+?}\s*return\ [\w$]+.join\(""\))};''' % func_name, +            jscode, 'nsig function', group=('var', 'code'), default=None)          if func_code: -            jsi = JSInterpreter(func_code) +            func_code = ([func_code[0]], func_code[1])          else: -            jscode = self._get_player_code(video_id, player_url, player_id) -            funcname = self._extract_n_function_name(jscode) -            jsi = JSInterpreter(jscode) -            func_code = jsi.extract_function_code(funcname) -            self._downloader.cache.store('youtube-nsig', player_id, func_code) - -        if self._downloader.params.get('youtube_print_sig_code'): -            self.to_screen('Extracted nsig function from {0}:\n{1}\n'.format(player_id, func_code[1])) - -        return lambda s: jsi.extract_function_from_code(*func_code)([s]) - -    def _n_descramble(self, n_param, player_url, video_id): -        """Compute the response to YT's "n" parameter challenge, -           or None - -        Args: -        n_param     -- challenge string that is the value of the -                       URL's "n" query parameter -        player_url  -- URL of YT player JS -        video_id -        """ +            self.write_debug('Extracting nsig function with jsinterp') +            func_code = jsi.extract_function_code(func_name) -        sig_id = ('nsig_value', n_param) -        if sig_id in self._player_cache: -            return self._player_cache[sig_id] +        self.cache.store('youtube-nsig', player_id, func_code) +        return jsi, player_id, func_code + +    def _extract_n_function_from_code(self, jsi, func_code): +        func = jsi.extract_function_from_code(*func_code) + +        def extract_nsig(s): +            try: +                ret = func([s]) +            except JSInterpreter.Exception: +                raise +            except Exception as e: +                raise JSInterpreter.Exception(traceback.format_exc(), cause=e) -        try: -            player_id = ('nsig', player_url) -            if player_id not in self._player_cache: -                self._player_cache[player_id] = self._extract_n_function(video_id, player_url) -            func = self._player_cache[player_id] -            ret = func(n_param)              if ret.startswith('enhanced_except_'): -                raise ExtractorError('Unhandled exception in decode') -            self._player_cache[sig_id] = ret -            if self._downloader.params.get('verbose', False): -                self._downloader.to_screen('[debug] [%s] %s' % (self.IE_NAME, 'Decrypted nsig {0} => {1}'.format(n_param, self._player_cache[sig_id]))) -            return self._player_cache[sig_id] -        except Exception as e: -            self._downloader.report_warning( -                '[%s] %s (%s %s)' % ( -                    self.IE_NAME, -                    'Unable to decode n-parameter: download likely to be throttled', -                    error_to_compat_str(e), -                    traceback.format_exc())) +                raise JSInterpreter.Exception('Signature function returned an exception') +            return ret + +        return extract_nsig + +    def _unthrottle_format_urls(self, video_id, player_url, *formats): + +        def decrypt_nsig(n): +            return self._cached(self._decrypt_nsig, 'nsig', n, player_url) -    def _unthrottle_format_urls(self, video_id, player_url, formats):          for fmt in formats:              parsed_fmt_url = compat_urllib_parse.urlparse(fmt['url'])              n_param = compat_parse_qs(parsed_fmt_url.query).get('n')              if not n_param:                  continue              n_param = n_param[-1] -            n_response = self._n_descramble(n_param, player_url, video_id) +            n_response = decrypt_nsig(n_param)(n_param, video_id, player_url)              if n_response is None:                  # give up if descrambling failed                  break -            for fmt_dct in traverse_obj(fmt, (None, (None, ('fragments', Ellipsis))), expected_type=dict): -                fmt_dct['url'] = update_url( -                    fmt_dct['url'], query_update={'n': [n_response]}) +            fmt['url'] = update_url_query(fmt['url'], {'n': n_response})      # from yt-dlp, with tweaks      def _extract_signature_timestamp(self, video_id, player_url, ytcfg=None, fatal=False): @@ -1708,16 +1732,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor):          Extract signatureTimestamp (sts)          Required to tell API what sig/player version is in use.          """ -        sts = int_or_none(ytcfg.get('STS')) if isinstance(ytcfg, dict) else None +        sts = traverse_obj(ytcfg, 'STS', expected_type=int)          if not sts:              # Attempt to extract from player              if player_url is None:                  error_msg = 'Cannot extract signature timestamp without player_url.'                  if fatal:                      raise ExtractorError(error_msg) -                self._downloader.report_warning(error_msg) +                self.report_warning(error_msg)                  return -            code = self._get_player_code(video_id, player_url) +            code = self._load_player(video_id, player_url, fatal=fatal)              sts = int_or_none(self._search_regex(                  r'(?:signatureTimestamp|sts)\s*:\s*(?P<sts>[0-9]{5})', code or '',                  'JS player signature timestamp', group='sts', fatal=fatal)) @@ -1733,12 +1757,18 @@ class YoutubeIE(YoutubeBaseInfoExtractor):          # cpn generation algorithm is reverse engineered from base.js.          # In fact it works even with dummy cpn.          CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_' -        cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16))) - -        playback_url = update_url( -            playback_url, query_update={ -                'ver': ['2'], -                'cpn': [cpn], +        cpn = ''.join(CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16)) + +        # more consistent results setting it to right before the end +        qs = parse_qs(playback_url) +        video_length = '{0}'.format(float((qs.get('len') or ['1.5'])[0]) - 1) + +        playback_url = update_url_query( +            playback_url, { +                'ver': '2', +                'cpn': cpn, +                'cmt': video_length, +                'el': 'detailpage',  # otherwise defaults to "shorts"              })          self._download_webpage( @@ -1986,8 +2016,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              else:                  self.to_screen('Downloading just video %s because of --no-playlist' % video_id) +        if not player_url: +            player_url = self._extract_player_url(webpage) +          formats = [] -        itags = [] +        itags = collections.defaultdict(set)          itag_qualities = {}          q = qualities(['tiny', 'small', 'medium', 'large', 'hd720', 'hd1080', 'hd1440', 'hd2160', 'hd2880', 'highres'])          CHUNK_SIZE = 10 << 20 @@ -2003,58 +2036,92 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  })              } for range_start in range(0, f['filesize'], CHUNK_SIZE)) +        lower = lambda s: s.lower() +          for fmt in streaming_formats: -            if fmt.get('targetDurationSec') or fmt.get('drmFamilies'): +            if fmt.get('targetDurationSec'):                  continue              itag = str_or_none(fmt.get('itag')) -            quality = fmt.get('quality') -            if itag and quality: +            audio_track = traverse_obj(fmt, ('audioTrack', T(dict))) or {} + +            quality = traverse_obj(fmt, (( +                # The 3gp format (17) in android client has a quality of "small", +                # but is actually worse than other formats +                T(lambda _: 'tiny' if itag == 17 else None), +                ('quality', T(lambda q: q if q and q != 'tiny' else None)), +                ('audioQuality', T(lower)), +                'quality'), T(txt_or_none)), get_all=False) +            if quality and itag:                  itag_qualities[itag] = quality              # FORMAT_STREAM_TYPE_OTF(otf=1) requires downloading the init fragment              # (adding `&sq=0` to the URL) and parsing emsg box to determine the -            # number of fragment that would subsequently requested with (`&sq=N`) +            # number of fragments that would subsequently be requested with (`&sq=N`)              if fmt.get('type') == 'FORMAT_STREAM_TYPE_OTF':                  continue              fmt_url = fmt.get('url')              if not fmt_url:                  sc = compat_parse_qs(fmt.get('signatureCipher')) -                fmt_url = url_or_none(try_get(sc, lambda x: x['url'][0])) -                encrypted_sig = try_get(sc, lambda x: x['s'][0]) -                if not (sc and fmt_url and encrypted_sig): +                fmt_url = traverse_obj(sc, ('url', -1, T(url_or_none))) +                encrypted_sig = traverse_obj(sc, ('s', -1)) +                if not (fmt_url and encrypted_sig):                      continue +                player_url = player_url or self._extract_player_url(webpage)                  if not player_url: -                    player_url = self._extract_player_url(webpage) -                if not player_url:                      continue -                signature = self._decrypt_signature(sc['s'][0], video_id, player_url) -                sp = try_get(sc, lambda x: x['sp'][0]) or 'signature' -                fmt_url += '&' + sp + '=' + signature +                try: +                    fmt_url = update_url_query(fmt_url, { +                        traverse_obj(sc, ('sp', -1)) or 'signature': +                            [self._decrypt_signature(encrypted_sig, video_id, player_url)], +                    }) +                except ExtractorError as e: +                    self.report_warning('Signature extraction failed: Some formats may be missing', +                                        video_id=video_id, only_once=True) +                    self.write_debug(error_to_compat_str(e), only_once=True) +                    continue -            if itag: -                itags.append(itag) -            tbr = float_or_none( -                fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) +            language_preference = ( +                10 if audio_track.get('audioIsDefault') +                else -10 if 'descriptive' in (traverse_obj(audio_track, ('displayName', T(lower))) or '') +                else -1) +            name = ( +                traverse_obj(fmt, ('qualityLabel', T(txt_or_none))) +                or quality.replace('audio_quality_', ''))              dct = { -                'asr': int_or_none(fmt.get('audioSampleRate')), -                'filesize': int_or_none(fmt.get('contentLength')), -                'format_id': itag, -                'format_note': fmt.get('qualityLabel') or quality, -                'fps': int_or_none(fmt.get('fps')), -                'height': int_or_none(fmt.get('height')), -                'quality': q(quality), -                'tbr': tbr, +                'format_id': join_nonempty(itag, fmt.get('isDrc') and 'drc'),                  'url': fmt_url, -                'width': fmt.get('width'), +                # Format 22 is likely to be damaged: see https://github.com/yt-dlp/yt-dlp/issues/3372 +                'source_preference': ((-5 if itag == '22' else -1) +                                      + (100 if 'Premium' in name else 0)), +                'quality': q(quality), +                'language': join_nonempty(audio_track.get('id', '').split('.')[0], +                                          'desc' if language_preference < -1 else '') or None, +                'language_preference': language_preference, +                # Strictly de-prioritize 3gp formats +                'preference': -2 if itag == '17' else None,              } -            mimetype = fmt.get('mimeType') -            if mimetype: -                mobj = re.match( -                    r'((?:[^/]+)/(?:[^;]+))(?:;\s*codecs="([^"]+)")?', mimetype) -                if mobj: -                    dct['ext'] = mimetype2ext(mobj.group(1)) -                    dct.update(parse_codecs(mobj.group(2))) +            if itag: +                itags[itag].add(('https', dct.get('language'))) +            self._unthrottle_format_urls(video_id, player_url, dct) +            dct.update(traverse_obj(fmt, { +                'asr': ('audioSampleRate', T(int_or_none)), +                'filesize': ('contentLength', T(int_or_none)), +                'format_note': ('qualityLabel', T(lambda x: x or quality)), +                # for some formats, fps is wrongly returned as 1 +                'fps': ('fps', T(int_or_none), T(lambda f: f if f > 1 else None)), +                'audio_channels': ('audioChannels', T(int_or_none)), +                'height': ('height', T(int_or_none)), +                'has_drm': ('drmFamilies', T(bool)), +                'tbr': (('averageBitrate', 'bitrate'), T(lambda t: float_or_none(t, 1000))), +                'width': ('width', T(int_or_none)), +                '_duration_ms': ('approxDurationMs', T(int_or_none)), +            }, get_all=False)) +            mime_mobj = re.match( +                r'((?:[^/]+)/(?:[^;]+))(?:;\s*codecs="([^"]+)")?', fmt.get('mimeType') or '') +            if mime_mobj: +                dct['ext'] = mimetype2ext(mime_mobj.group(1)) +                dct.update(parse_codecs(mime_mobj.group(2)))              single_stream = 'none' in (dct.get(c) for c in ('acodec', 'vcodec'))              if single_stream and dct.get('ext'):                  dct['container'] = dct['ext'] + '_dash' @@ -2069,32 +2136,62 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              formats.append(dct) +        def process_manifest_format(f, proto, client_name, itag, all_formats=False): +            key = (proto, f.get('language')) +            if not all_formats and key in itags[itag]: +                return False +            itags[itag].add(key) + +            if itag: +                f['format_id'] = ( +                    '{0}-{1}'.format(itag, proto) +                    if all_formats or any(p != proto for p, _ in itags[itag]) +                    else itag) + +            if f.get('source_preference') is None: +                f['source_preference'] = -1 + +            if itag in ('616', '235'): +                f['format_note'] = join_nonempty(f.get('format_note'), 'Premium', delim=' ') +                f['source_preference'] += 100 + +            f['quality'] = q(traverse_obj(f, ( +                'format_id', T(lambda s: itag_qualities[s.split('-')[0]])), default=-1)) +            if try_call(lambda: f['fps'] <= 1): +                del f['fps'] + +            if proto == 'hls' and f.get('has_drm'): +                f['has_drm'] = 'maybe' +                f['source_preference'] -= 5 +            return True +          hls_manifest_url = streaming_data.get('hlsManifestUrl')          if hls_manifest_url:              for f in self._extract_m3u8_formats(                      hls_manifest_url, video_id, 'mp4', fatal=False): -                itag = self._search_regex( -                    r'/itag/(\d+)', f['url'], 'itag', default=None) -                if itag: -                    f['format_id'] = itag -                formats.append(f) +                if process_manifest_format( +                        f, 'hls', None, self._search_regex( +                            r'/itag/(\d+)', f['url'], 'itag', default=None)): +                    formats.append(f)          if self._downloader.params.get('youtube_include_dash_manifest', True):              dash_manifest_url = streaming_data.get('dashManifestUrl')              if dash_manifest_url:                  for f in self._extract_mpd_formats(                          dash_manifest_url, video_id, fatal=False): -                    itag = f['format_id'] -                    if itag in itags: -                        continue -                    if itag in itag_qualities: -                        f['quality'] = q(itag_qualities[itag]) -                    filesize = int_or_none(self._search_regex( -                        r'/clen/(\d+)', f.get('fragment_base_url') -                        or f['url'], 'file size', default=None)) -                    if filesize: -                        f['filesize'] = filesize -                    formats.append(f) +                    if process_manifest_format( +                            f, 'dash', None, f['format_id']): +                        f['filesize'] = traverse_obj(f, ( +                            ('fragment_base_url', 'url'), T(lambda u: self._search_regex( +                                r'/clen/(\d+)', u, 'file size', default=None)), +                            T(int_or_none)), get_all=False) +                        formats.append(f) + +        playable_formats = [f for f in formats if not f.get('has_drm')] +        if formats and not playable_formats: +            # If there are no formats that definitely don't have DRM, all have DRM +            self.report_drm(video_id) +        formats[:] = playable_formats          if not formats:              if streaming_data.get('licenseInfos'): @@ -2166,6 +2263,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              video_details.get('lengthSeconds')              or microformat.get('lengthSeconds')) \              or parse_duration(search_meta('duration')) + +        for f in formats: +            # Some formats may have much smaller duration than others (possibly damaged during encoding) +            # but avoid false positives with small duration differences. +            # Ref: https://github.com/yt-dlp/yt-dlp/issues/2823 +            if try_call(lambda x: float(x.pop('_duration_ms')) / duration < 500, args=(f,)): +                self.report_warning( +                    '{0}: Some possibly damaged formats will be deprioritized'.format(video_id), only_once=True) +                # Strictly de-prioritize damaged formats +                f['preference'] = -10 +          is_live = video_details.get('isLive')          owner_profile_url = self._yt_urljoin(self._extract_author_var( @@ -2174,10 +2282,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):          uploader = self._extract_author_var(              webpage, 'name', videodetails=video_details, metadata=microformat) -        if not player_url: -            player_url = self._extract_player_url(webpage) -        self._unthrottle_format_urls(video_id, player_url, formats) -          info = {              'id': video_id,              'title': self._live_title(video_title) if is_live else video_title,  | 
