diff options
| -rw-r--r-- | .github/workflows/ci.yml | 19 | ||||
| -rw-r--r-- | test/test_cache.py | 14 | ||||
| -rw-r--r-- | test/test_jsinterp.py | 80 | ||||
| -rw-r--r-- | test/test_youtube_signature.py | 163 | ||||
| -rwxr-xr-x | youtube_dl/YoutubeDL.py | 37 | ||||
| -rw-r--r-- | youtube_dl/__init__.py | 6 | ||||
| -rw-r--r-- | youtube_dl/cache.py | 95 | ||||
| -rw-r--r-- | youtube_dl/casefold.py | 12 | ||||
| -rw-r--r-- | youtube_dl/compat.py | 277 | ||||
| -rw-r--r-- | youtube_dl/downloader/common.py | 28 | ||||
| -rw-r--r-- | youtube_dl/extractor/bokecc.py | 2 | ||||
| -rw-r--r-- | youtube_dl/extractor/cloudy.py | 2 | ||||
| -rw-r--r-- | youtube_dl/extractor/common.py | 92 | ||||
| -rw-r--r-- | youtube_dl/extractor/itv.py | 17 | ||||
| -rw-r--r-- | youtube_dl/extractor/senateisvp.py | 2 | ||||
| -rw-r--r-- | youtube_dl/extractor/youtube.py | 944 | ||||
| -rw-r--r-- | youtube_dl/jsinterp.py | 414 | ||||
| -rw-r--r-- | youtube_dl/options.py | 15 | ||||
| -rw-r--r-- | youtube_dl/utils.py | 8 | ||||
| -rw-r--r-- | youtube_dl/version.py | 2 | 
20 files changed, 1647 insertions, 582 deletions
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d3b9ae016..073c4458c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -116,29 +116,29 @@ jobs:      strategy:        fail-fast: true        matrix: -        os: [ubuntu-20.04] +        os: [ubuntu-22.04]          python-version: ${{ fromJSON(needs.select.outputs.cpython-versions) }}          python-impl: [cpython]          ytdl-test-set: ${{ fromJSON(needs.select.outputs.test-set) }}          run-tests-ext: [sh]          include: -        - os: windows-2019 +        - os: windows-2022            python-version: 3.4            python-impl: cpython            ytdl-test-set: ${{ contains(needs.select.outputs.test-set, 'core') && 'core' || 'nocore' }}            run-tests-ext: bat -        - os: windows-2019 +        - os: windows-2022            python-version: 3.4            python-impl: cpython            ytdl-test-set: ${{ contains(needs.select.outputs.test-set, 'download') && 'download'  || 'nodownload' }}            run-tests-ext: bat          # jython -        - os: ubuntu-20.04 +        - os: ubuntu-22.04            python-version: 2.7            python-impl: jython            ytdl-test-set: ${{ contains(needs.select.outputs.test-set, 'core') && 'core' || 'nocore' }}            run-tests-ext: sh -        - os: ubuntu-20.04 +        - os: ubuntu-22.04            python-version: 2.7            python-impl: jython            ytdl-test-set: ${{ contains(needs.select.outputs.test-set, 'download') && 'download'  || 'nodownload' }} @@ -160,7 +160,7 @@ jobs:        # NB may run apt-get install in Linux        uses: ytdl-org/setup-python@v1        env: -        # Temporary workaround for Python 3.5 failures - May 2024 +        # Temporary (?) 
workaround for Python 3.5 failures - May 2024          PIP_TRUSTED_HOST: "pypi.python.org pypi.org files.pythonhosted.org"        with:          python-version: ${{ matrix.python-version }} @@ -240,7 +240,10 @@ jobs:        # install 2.7        shell: bash        run: | -        sudo apt-get install -y python2 python-is-python2 +        # Ubuntu 22.04 no longer has python-is-python2: fetch it +        curl -L "http://launchpadlibrarian.net/474693132/python-is-python2_2.7.17-4_all.deb" -o python-is-python2.deb +        sudo apt-get install -y python2 +        sudo dpkg --force-breaks -i python-is-python2.deb          echo "PYTHONHOME=/usr" >> "$GITHUB_ENV"      #-------- Python 2.6 --      - name: Set up Python 2.6 environment @@ -362,7 +365,7 @@ jobs:          python -m ensurepip || python -m pip --version || { \            get_pip="${{ contains(needs.select.outputs.own-pip-versions, matrix.python-version) && format('{0}/', matrix.python-version) || '' }}"; \            curl -L -O "https://bootstrap.pypa.io/pip/${get_pip}get-pip.py"; \ -          python get-pip.py; } +          python get-pip.py --no-setuptools --no-wheel; }      - name: Set up Python 2.6 pip        if: ${{ matrix.python-version == '2.6' }}        shell: bash diff --git a/test/test_cache.py b/test/test_cache.py index 931074aa1..0431f4f15 100644 --- a/test/test_cache.py +++ b/test/test_cache.py @@ -63,9 +63,21 @@ class TestCache(unittest.TestCase):          obj = {'x': 1, 'y': ['รค', '\\a', True]}          c.store('test_cache', 'k.', obj)          self.assertEqual(c.load('test_cache', 'k.', min_ver='1970.01.01'), obj) -        new_version = '.'.join(('%d' % ((v + 1) if i == 0 else v, )) for i, v in enumerate(version_tuple(__version__))) +        new_version = '.'.join(('%0.2d' % ((v + 1) if i == 0 else v, )) for i, v in enumerate(version_tuple(__version__)))          self.assertIs(c.load('test_cache', 'k.', min_ver=new_version), None) +    def test_cache_clear(self): +        ydl = FakeYDL({ +       
     'cachedir': self.test_dir, +        }) +        c = Cache(ydl) +        c.store('test_cache', 'k.', 'kay') +        c.store('test_cache', 'l.', 'ell') +        self.assertEqual(c.load('test_cache', 'k.'), 'kay') +        c.clear('test_cache', 'k.') +        self.assertEqual(c.load('test_cache', 'k.'), None) +        self.assertEqual(c.load('test_cache', 'l.'), 'ell') +  if __name__ == '__main__':      unittest.main() diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index 12e7b9b94..479cb43a0 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -7,10 +7,12 @@ from __future__ import unicode_literals  import os  import sys  import unittest +  sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))  import math  import re +import time  from youtube_dl.compat import compat_str as str  from youtube_dl.jsinterp import JS_Undefined, JSInterpreter @@ -145,6 +147,25 @@ class TestJSInterpreter(unittest.TestCase):          # https://github.com/ytdl-org/youtube-dl/issues/32815          self._test('function f(){return 0  - 7 * - 6;}', 42) +    def test_bitwise_operators_typecast(self): +        # madness +        self._test('function f(){return null << 5}', 0) +        self._test('function f(){return undefined >> 5}', 0) +        self._test('function f(){return 42 << NaN}', 42) +        self._test('function f(){return 42 << Infinity}', 42) +        self._test('function f(){return 0.0 << null}', 0) +        self._test('function f(){return NaN << 42}', 0) +        self._test('function f(){return "21.9" << 1}', 42) +        self._test('function f(){return true << "5";}', 32) +        self._test('function f(){return true << true;}', 2) +        self._test('function f(){return "19" & "21.9";}', 17) +        self._test('function f(){return "19" & false;}', 0) +        self._test('function f(){return "11.0" >> "2.1";}', 2) +        self._test('function f(){return 5 ^ 9;}', 12) +        self._test('function f(){return 0.0 << NaN}', 0) 
+        self._test('function f(){return null << undefined}', 0) +        self._test('function f(){return 21 << 4294967297}', 42) +      def test_array_access(self):          self._test('function f(){var x = [1,2,3]; x[0] = 4; x[0] = 5; x[2.0] = 7; return x;}', [5, 2, 7]) @@ -159,6 +180,7 @@ class TestJSInterpreter(unittest.TestCase):          self._test('function f(){var x = 20; x = 30 + 1; return x;}', 31)          self._test('function f(){var x = 20; x += 30 + 1; return x;}', 51)          self._test('function f(){var x = 20; x -= 30 + 1; return x;}', -11) +        self._test('function f(){var x = 2; var y = ["a", "b"]; y[x%y["length"]]="z"; return y}', ['z', 'b'])      def test_comments(self):          self._test(''' @@ -208,6 +230,34 @@ class TestJSInterpreter(unittest.TestCase):          self._test(jsi, 86000, args=['12/31/1969 18:01:26 MDT'])          # epoch 0          self._test(jsi, 0, args=['1 January 1970 00:00:00 UTC']) +        # undefined +        self._test(jsi, NaN, args=[JS_Undefined]) +        # y,m,d, ... 
- may fail with older dates lacking DST data +        jsi = JSInterpreter( +            'function f() { return new Date(%s); }' +            % ('2024, 5, 29, 2, 52, 12, 42',)) +        self._test(jsi, ( +            1719625932042                           # UK value +            + ( +                + 3600                              # back to GMT +                + (time.altzone if time.daylight    # host's DST +                   else time.timezone) +            ) * 1000)) +        # no arg +        self.assertAlmostEqual(JSInterpreter( +            'function f() { return new Date() - 0; }').call_function('f'), +            time.time() * 1000, delta=100) +        # Date.now() +        self.assertAlmostEqual(JSInterpreter( +            'function f() { return Date.now(); }').call_function('f'), +            time.time() * 1000, delta=100) +        # Date.parse() +        jsi = JSInterpreter('function f(dt) { return Date.parse(dt); }') +        self._test(jsi, 0, args=['1 January 1970 00:00:00 UTC']) +        # Date.UTC() +        jsi = JSInterpreter('function f() { return Date.UTC(%s); }' +                            % ('1970, 0, 1, 0, 0, 0, 0',)) +        self._test(jsi, 0)      def test_call(self):          jsi = JSInterpreter(''' @@ -322,6 +372,13 @@ class TestJSInterpreter(unittest.TestCase):          self._test('function f() { a=5; return (a -= 1, a+=3, a); }', 7)          self._test('function f() { return (l=[0,1,2,3], function(a, b){return a+b})((l[1], l[2]), l[3]) }', 5) +    def test_not(self): +        self._test('function f() { return ! 
undefined; }', True) +        self._test('function f() { return !0; }', True) +        self._test('function f() { return !!0; }', False) +        self._test('function f() { return ![]; }', False) +        self._test('function f() { return !0 !== false; }', True) +      def test_void(self):          self._test('function f() { return void 42; }', JS_Undefined) @@ -406,6 +463,7 @@ class TestJSInterpreter(unittest.TestCase):      def test_regex(self):          self._test('function f() { let a=/,,[/,913,/](,)}/; }', None) +        self._test('function f() { let a=/,,[/,913,/](,)}/; return a.source;  }', ',,[/,913,/](,)}')          jsi = JSInterpreter('''              function x() { let a=/,,[/,913,/](,)}/; "".replace(a, ""); return a; } @@ -453,13 +511,6 @@ class TestJSInterpreter(unittest.TestCase):          self._test('function f(){return -524999584 << 5}', 379882496)          self._test('function f(){return 1236566549 << 5}', 915423904) -    def test_bitwise_operators_typecast(self): -        # madness -        self._test('function f(){return null << 5}', 0) -        self._test('function f(){return undefined >> 5}', 0) -        self._test('function f(){return 42 << NaN}', 42) -        self._test('function f(){return 42 << Infinity}', 42) -      def test_negative(self):          self._test('function f(){return 2    *    -2.0    ;}', -4)          self._test('function f(){return 2    -    - -2    ;}', 0) @@ -502,6 +553,8 @@ class TestJSInterpreter(unittest.TestCase):          test_result = list('test')          tests = [              'function f(a, b){return a.split(b)}', +            'function f(a, b){return a["split"](b)}', +            'function f(a, b){let x = ["split"]; return a[x[0]](b)}',              'function f(a, b){return String.prototype.split.call(a, b)}',              'function f(a, b){return String.prototype.split.apply(a, [b])}',          ] @@ -552,6 +605,9 @@ class TestJSInterpreter(unittest.TestCase):          self._test('function f(){return 
"012345678".slice(-1, 1)}', '')          self._test('function f(){return "012345678".slice(-3, -1)}', '67') +    def test_splice(self): +        self._test('function f(){var T = ["0", "1", "2"]; T["splice"](2, 1, "0")[0]; return T }', ['0', '1', '0']) +      def test_pop(self):          # pop          self._test('function f(){var a = [0, 1, 2, 3, 4, 5, 6, 7, 8]; return [a.pop(), a]}', @@ -586,6 +642,16 @@ class TestJSInterpreter(unittest.TestCase):                     'return [ret.length, ret[0][0], ret[1][1], ret[0][2]]}',                     [2, 4, 1, [4, 2]]) +    def test_extract_function(self): +        jsi = JSInterpreter('function a(b) { return b + 1; }') +        func = jsi.extract_function('a') +        self.assertEqual(func([2]), 3) + +    def test_extract_function_with_global_stack(self): +        jsi = JSInterpreter('function c(d) { return d + e + f + g; }') +        func = jsi.extract_function('c', {'e': 10}, {'f': 100, 'g': 1000}) +        self.assertEqual(func([1]), 1111) +  if __name__ == '__main__':      unittest.main() diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index fcbc9d7a8..98221b9c2 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -84,6 +84,61 @@ _SIG_TESTS = [          '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA',          '0QJ8wRAIgXmPlOPSBkkUs1bYFYlJCfe29xxAj7v1pDL0QwbdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJ2OySqa0q',      ), +    ( +        'https://www.youtube.com/s/player/643afba4/tv-player-ias.vflset/tv-player-ias.js', +        '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', +        'AAOAOq0QJ8wRAIgXmPlOPSBkkUs1bYFYlJCfe29xx8j7vgpDL0QwbdV06sCIEzpWqMGkFR20CFOS21Tp-7vj_EMu-m37KtXJoOy1', +    ), +    ( +        'https://www.youtube.com/s/player/363db69b/player_ias.vflset/en_US/base.js', +        
'2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', +        '0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpz2ICs6EVdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', +    ), +    ( +        'https://www.youtube.com/s/player/363db69b/player_ias_tce.vflset/en_US/base.js', +        '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', +        '0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpz2ICs6EVdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', +    ), +    ( +        'https://www.youtube.com/s/player/4fcd6e4a/player_ias.vflset/en_US/base.js', +        '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', +        'wAOAOq0QJ8ARAIgXmPlOPSBkkUs1bYFYlJCfe29xx8q7v1pDL0QwbdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0', +    ), +    ( +        'https://www.youtube.com/s/player/4fcd6e4a/player_ias_tce.vflset/en_US/base.js', +        '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', +        'wAOAOq0QJ8ARAIgXmPlOPSBkkUs1bYFYlJCfe29xx8q7v1pDL0QwbdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0', +    ), +    ( +        'https://www.youtube.com/s/player/20830619/player_ias.vflset/en_US/base.js', +        '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', +        '7AOq0QJ8wRAIgXmPlOPSBkkAs1bYFYlJCfe29xx8jOv1pDL0Q2bdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0qaw', +    ), +    ( +        'https://www.youtube.com/s/player/20830619/player_ias_tce.vflset/en_US/base.js', +        '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', +        '7AOq0QJ8wRAIgXmPlOPSBkkAs1bYFYlJCfe29xx8jOv1pDL0Q2bdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0qaw', +    ), +    ( +        
'https://www.youtube.com/s/player/20830619/player-plasma-ias-phone-en_US.vflset/base.js', +        '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', +        '7AOq0QJ8wRAIgXmPlOPSBkkAs1bYFYlJCfe29xx8jOv1pDL0Q2bdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0qaw', +    ), +    ( +        'https://www.youtube.com/s/player/20830619/player-plasma-ias-tablet-en_US.vflset/base.js', +        '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', +        '7AOq0QJ8wRAIgXmPlOPSBkkAs1bYFYlJCfe29xx8jOv1pDL0Q2bdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0qaw', +    ), +    ( +        'https://www.youtube.com/s/player/8a8ac953/player_ias_tce.vflset/en_US/base.js', +        '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', +        'IAOAOq0QJ8wRAAgXmPlOPSBkkUs1bYFYlJCfe29xx8j7v1pDL0QwbdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_E2u-m37KtXJoOySqa0', +    ), +    ( +        'https://www.youtube.com/s/player/8a8ac953/tv-player-es6.vflset/tv-player-es6.js', +        '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', +        'IAOAOq0QJ8wRAAgXmPlOPSBkkUs1bYFYlJCfe29xx8j7v1pDL0QwbdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_E2u-m37KtXJoOySqa0', +    ),  ]  _NSIG_TESTS = [ @@ -153,7 +208,7 @@ _NSIG_TESTS = [      ),      (          'https://www.youtube.com/s/player/c57c113c/player_ias.vflset/en_US/base.js', -        '-Txvy6bT5R6LqgnQNx', 'dcklJCnRUHbgSg', +        'M92UUMHa8PdvPd3wyM', '3hPqLJsiNZx7yA',      ),      (          'https://www.youtube.com/s/player/5a3b6271/player_ias.vflset/en_US/base.js', @@ -173,7 +228,7 @@ _NSIG_TESTS = [      ),      (          'https://www.youtube.com/s/player/cfa9e7cb/player_ias.vflset/en_US/base.js', -        'qO0NiMtYQ7TeJnfFG2', 'k9cuJDHNS5O7kQ', +        'aCi3iElgd2kq0bxVbQ', 'QX1y8jGb2IbZ0w',      ),      (          
'https://www.youtube.com/s/player/8c7583ff/player_ias.vflset/en_US/base.js', @@ -219,6 +274,82 @@ _NSIG_TESTS = [          'https://www.youtube.com/s/player/2f1832d2/player_ias.vflset/en_US/base.js',          'YWt1qdbe8SAfkoPHW5d', 'RrRjWQOJmBiP',      ), +    ( +        'https://www.youtube.com/s/player/9c6dfc4a/player_ias.vflset/en_US/base.js', +        'jbu7ylIosQHyJyJV', 'uwI0ESiynAmhNg', +    ), +    ( +        'https://www.youtube.com/s/player/f6e09c70/player_ias.vflset/en_US/base.js', +        'W9HJZKktxuYoDTqW', 'jHbbkcaxm54', +    ), +    ( +        'https://www.youtube.com/s/player/f6e09c70/player_ias_tce.vflset/en_US/base.js', +        'W9HJZKktxuYoDTqW', 'jHbbkcaxm54', +    ), +    ( +        'https://www.youtube.com/s/player/e7567ecf/player_ias_tce.vflset/en_US/base.js', +        'Sy4aDGc0VpYRR9ew_', '5UPOT1VhoZxNLQ', +    ), +    ( +        'https://www.youtube.com/s/player/d50f54ef/player_ias_tce.vflset/en_US/base.js', +        'Ha7507LzRmH3Utygtj', 'XFTb2HoeOE5MHg', +    ), +    ( +        'https://www.youtube.com/s/player/074a8365/player_ias_tce.vflset/en_US/base.js', +        'Ha7507LzRmH3Utygtj', 'ufTsrE0IVYrkl8v', +    ), +    ( +        'https://www.youtube.com/s/player/643afba4/player_ias.vflset/en_US/base.js', +        'N5uAlLqm0eg1GyHO', 'dCBQOejdq5s-ww', +    ), +    ( +        'https://www.youtube.com/s/player/69f581a5/tv-player-ias.vflset/tv-player-ias.js', +        '-qIP447rVlTTwaZjY', 'KNcGOksBAvwqQg', +    ), +    ( +        'https://www.youtube.com/s/player/643afba4/tv-player-ias.vflset/tv-player-ias.js', +        'ir9-V6cdbCiyKxhr', '2PL7ZDYAALMfmA', +    ), +    ( +        'https://www.youtube.com/s/player/643afba4/player_ias.vflset/en_US/base.js', +        'ir9-V6cdbCiyKxhr', '2PL7ZDYAALMfmA', +    ), +    ( +        'https://www.youtube.com/s/player/363db69b/player_ias.vflset/en_US/base.js', +        'eWYu5d5YeY_4LyEDc', 'XJQqf-N7Xra3gg', +    ), +    ( +        
'https://www.youtube.com/s/player/4fcd6e4a/player_ias.vflset/en_US/base.js', +        'o_L251jm8yhZkWtBW', 'lXoxI3XvToqn6A', +    ), +    ( +        'https://www.youtube.com/s/player/4fcd6e4a/tv-player-ias.vflset/tv-player-ias.js', +        'o_L251jm8yhZkWtBW', 'lXoxI3XvToqn6A', +    ), +    ( +        'https://www.youtube.com/s/player/20830619/tv-player-ias.vflset/tv-player-ias.js', +        'ir9-V6cdbCiyKxhr', '9YE85kNjZiS4', +    ), +    ( +        'https://www.youtube.com/s/player/20830619/player-plasma-ias-phone-en_US.vflset/base.js', +        'ir9-V6cdbCiyKxhr', '9YE85kNjZiS4', +    ), +    ( +        'https://www.youtube.com/s/player/20830619/player-plasma-ias-tablet-en_US.vflset/base.js', +        'ir9-V6cdbCiyKxhr', '9YE85kNjZiS4', +    ), +    ( +        'https://www.youtube.com/s/player/8a8ac953/player_ias_tce.vflset/en_US/base.js', +        'MiBYeXx_vRREbiCCmh', 'RtZYMVvmkE0JE', +    ), +    ( +        'https://www.youtube.com/s/player/8a8ac953/tv-player-es6.vflset/tv-player-es6.js', +        'MiBYeXx_vRREbiCCmh', 'RtZYMVvmkE0JE', +    ), +    ( +        'https://www.youtube.com/s/player/aa3fc80b/player_ias.vflset/en_US/base.js', +        '0qY9dal2uzOnOGwa-48hha', 'VSh1KDfQMk-eag', +    ),  ] @@ -231,6 +362,8 @@ class TestPlayerInfo(unittest.TestCase):              ('https://www.youtube.com/s/player/64dddad9/player-plasma-ias-phone-en_US.vflset/base.js', '64dddad9'),              ('https://www.youtube.com/s/player/64dddad9/player-plasma-ias-phone-de_DE.vflset/base.js', '64dddad9'),              ('https://www.youtube.com/s/player/64dddad9/player-plasma-ias-tablet-en_US.vflset/base.js', '64dddad9'), +            ('https://www.youtube.com/s/player/e7567ecf/player_ias_tce.vflset/en_US/base.js', 'e7567ecf'), +            ('https://www.youtube.com/s/player/643afba4/tv-player-ias.vflset/tv-player-ias.js', '643afba4'),              # obsolete              ('https://www.youtube.com/yts/jsbin/player_ias-vfle4-e03/en_US/base.js', 'vfle4-e03'),              
('https://www.youtube.com/yts/jsbin/player_ias-vfl49f_g4/en_US/base.js', 'vfl49f_g4'), @@ -240,8 +373,9 @@ class TestPlayerInfo(unittest.TestCase):              ('https://s.ytimg.com/yts/jsbin/html5player-en_US-vflXGBaUN.js', 'vflXGBaUN'),              ('https://s.ytimg.com/yts/jsbin/html5player-en_US-vflKjOTVq/html5player.js', 'vflKjOTVq'),          ) +        ie = YoutubeIE(FakeYDL({'cachedir': False}))          for player_url, expected_player_id in PLAYER_URLS: -            player_id = YoutubeIE._extract_player_info(player_url) +            player_id = ie._extract_player_info(player_url)              self.assertEqual(player_id, expected_player_id) @@ -261,11 +395,11 @@ class TestSignature(unittest.TestCase):  def t_factory(name, sig_func, url_pattern):      def make_tfunc(url, sig_input, expected_sig):          m = url_pattern.match(url) -        assert m, '%r should follow URL format' % url -        test_id = m.group('id') +        assert m, '{0!r} should follow URL format'.format(url) +        test_id = re.sub(r'[/.-]', '_', m.group('id') or m.group('compat_id'))          def test_func(self): -            basename = 'player-{0}-{1}.js'.format(name, test_id) +            basename = 'player-{0}.js'.format(test_id)              fn = os.path.join(self.TESTDATA_DIR, basename)              if not os.path.exists(fn): @@ -280,7 +414,7 @@ def t_factory(name, sig_func, url_pattern):  def signature(jscode, sig_input): -    func = YoutubeIE(FakeYDL())._parse_sig_js(jscode) +    func = YoutubeIE(FakeYDL({'cachedir': False}))._parse_sig_js(jscode)      src_sig = (          compat_str(string.printable[:sig_input])          if isinstance(sig_input, int) else sig_input) @@ -288,18 +422,23 @@ def signature(jscode, sig_input):  def n_sig(jscode, sig_input): -    funcname = YoutubeIE(FakeYDL())._extract_n_function_name(jscode) -    return JSInterpreter(jscode).call_function( -        funcname, sig_input, _ytdl_do_not_return=sig_input) +    ie = YoutubeIE(FakeYDL({'cachedir': 
False})) +    jsi = JSInterpreter(jscode) +    jsi, _, func_code = ie._extract_n_function_code_jsi(sig_input, jsi) +    return ie._extract_n_function_from_code(jsi, func_code)(sig_input)  make_sig_test = t_factory( -    'signature', signature, re.compile(r'.*(?:-|/player/)(?P<id>[a-zA-Z0-9_-]+)(?:/.+\.js|(?:/watch_as3|/html5player)?\.[a-z]+)$')) +    'signature', signature, +    re.compile(r'''(?x) +        .+/(?P<h5>html5)?player(?(h5)(?:-en_US)?-|/)(?P<id>[a-zA-Z0-9/._-]+) +        (?(h5)/(?:watch_as3|html5player))?\.js$ +    '''))  for test_spec in _SIG_TESTS:      make_sig_test(*test_spec)  make_nsig_test = t_factory( -    'nsig', n_sig, re.compile(r'.+/player/(?P<id>[a-zA-Z0-9_-]+)/.+.js$')) +    'nsig', n_sig, re.compile(r'.+/player/(?P<id>[a-zA-Z0-9_/.-]+)\.js$'))  for test_spec in _NSIG_TESTS:      make_nsig_test(*test_spec) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 9e5620eef..8367b6e53 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -540,10 +540,14 @@ class YoutubeDL(object):          """Print message to stdout if not in quiet mode."""          return self.to_stdout(message, skip_eol, check_quiet=True) -    def _write_string(self, s, out=None): +    def _write_string(self, s, out=None, only_once=False, _cache=set()): +        if only_once and s in _cache: +            return          write_string(s, out=out, encoding=self.params.get('encoding')) +        if only_once: +            _cache.add(s) -    def to_stdout(self, message, skip_eol=False, check_quiet=False): +    def to_stdout(self, message, skip_eol=False, check_quiet=False, only_once=False):          """Print message to stdout if not in quiet mode."""          if self.params.get('logger'):              self.params['logger'].debug(message) @@ -552,9 +556,9 @@ class YoutubeDL(object):              terminator = ['\n', ''][skip_eol]              output = message + terminator -            self._write_string(output, self._screen_file) +            
self._write_string(output, self._screen_file, only_once=only_once) -    def to_stderr(self, message): +    def to_stderr(self, message, only_once=False):          """Print message to stderr."""          assert isinstance(message, compat_str)          if self.params.get('logger'): @@ -562,7 +566,7 @@ class YoutubeDL(object):          else:              message = self._bidi_workaround(message)              output = message + '\n' -            self._write_string(output, self._err_file) +            self._write_string(output, self._err_file, only_once=only_once)      def to_console_title(self, message):          if not self.params.get('consoletitle', False): @@ -641,18 +645,11 @@ class YoutubeDL(object):              raise DownloadError(message, exc_info)          self._download_retcode = 1 -    def report_warning(self, message, only_once=False, _cache={}): +    def report_warning(self, message, only_once=False):          '''          Print the message to stderr, it will be prefixed with 'WARNING:'          If stderr is a tty file the 'WARNING:' will be colored          ''' -        if only_once: -            m_hash = hash((self, message)) -            m_cnt = _cache.setdefault(m_hash, 0) -            _cache[m_hash] = m_cnt + 1 -            if m_cnt > 0: -                return -          if self.params.get('logger') is not None:              self.params['logger'].warning(message)          else: @@ -663,7 +660,7 @@ class YoutubeDL(object):              else:                  _msg_header = 'WARNING:'              warning_message = '%s %s' % (_msg_header, message) -            self.to_stderr(warning_message) +            self.to_stderr(warning_message, only_once=only_once)      def report_error(self, message, *args, **kwargs):          ''' @@ -677,6 +674,16 @@ class YoutubeDL(object):          kwargs['message'] = '%s %s' % (_msg_header, message)          self.trouble(*args, **kwargs) +    def write_debug(self, message, only_once=False): +        '''Log debug message or 
Print message to stderr''' +        if not self.params.get('verbose', False): +            return +        message = '[debug] {0}'.format(message) +        if self.params.get('logger'): +            self.params['logger'].debug(message) +        else: +            self.to_stderr(message, only_once) +      def report_unscoped_cookies(self, *args, **kwargs):          # message=None, tb=False, is_error=False          if len(args) <= 2: @@ -2514,7 +2521,7 @@ class YoutubeDL(object):                  self.get_encoding()))          write_string(encoding_str, encoding=None) -        writeln_debug = lambda *s: self._write_string('[debug] %s\n' % (''.join(s), )) +        writeln_debug = lambda *s: self.write_debug(''.join(s))          writeln_debug('youtube-dl version ', __version__)          if _LAZY_LOADER:              writeln_debug('Lazy loading extractors enabled') diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 06bdfb689..202f2c9b9 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -18,7 +18,7 @@ from .compat import (      compat_getpass,      compat_register_utf8,      compat_shlex_split, -    workaround_optparse_bug9161, +    _workaround_optparse_bug9161,  )  from .utils import (      _UnsafeExtensionError, @@ -50,7 +50,7 @@ def _real_main(argv=None):      # Compatibility fix for Windows      compat_register_utf8() -    workaround_optparse_bug9161() +    _workaround_optparse_bug9161()      setproctitle('youtube-dl') @@ -409,6 +409,8 @@ def _real_main(argv=None):          'include_ads': opts.include_ads,          'default_search': opts.default_search,          'youtube_include_dash_manifest': opts.youtube_include_dash_manifest, +        'youtube_player_js_version': opts.youtube_player_js_version, +        'youtube_player_js_variant': opts.youtube_player_js_variant,          'encoding': opts.encoding,          'extract_flat': opts.extract_flat,          'mark_watched': opts.mark_watched, diff --git a/youtube_dl/cache.py 
b/youtube_dl/cache.py index 54123da0e..eb0a729c2 100644 --- a/youtube_dl/cache.py +++ b/youtube_dl/cache.py @@ -1,3 +1,4 @@ +# coding: utf-8  from __future__ import unicode_literals  import errno @@ -10,12 +11,14 @@ import traceback  from .compat import (      compat_getenv,      compat_open as open, +    compat_os_makedirs,  )  from .utils import (      error_to_compat_str, +    escape_rfc3986,      expand_path,      is_outdated_version, -    try_get, +    traverse_obj,      write_json_file,  )  from .version import __version__ @@ -30,23 +33,35 @@ class Cache(object):      def __init__(self, ydl):          self._ydl = ydl +    def _write_debug(self, *args, **kwargs): +        self._ydl.write_debug(*args, **kwargs) + +    def _report_warning(self, *args, **kwargs): +        self._ydl.report_warning(*args, **kwargs) + +    def _to_screen(self, *args, **kwargs): +        self._ydl.to_screen(*args, **kwargs) + +    def _get_param(self, k, default=None): +        return self._ydl.params.get(k, default) +      def _get_root_dir(self): -        res = self._ydl.params.get('cachedir') +        res = self._get_param('cachedir')          if res is None:              cache_root = compat_getenv('XDG_CACHE_HOME', '~/.cache')              res = os.path.join(cache_root, self._YTDL_DIR)          return expand_path(res)      def _get_cache_fn(self, section, key, dtype): -        assert re.match(r'^[a-zA-Z0-9_.-]+$', section), \ +        assert re.match(r'^[\w.-]+$', section), \              'invalid section %r' % section -        assert re.match(r'^[a-zA-Z0-9_.-]+$', key), 'invalid key %r' % key +        key = escape_rfc3986(key, safe='').replace('%', ',')  # encode non-ascii characters          return os.path.join(              self._get_root_dir(), section, '%s.%s' % (key, dtype))      @property      def enabled(self): -        return self._ydl.params.get('cachedir') is not False +        return self._get_param('cachedir') is not False      def store(self, section, key, data, 
dtype='json'):          assert dtype in ('json',) @@ -56,61 +71,75 @@ class Cache(object):          fn = self._get_cache_fn(section, key, dtype)          try: -            try: -                os.makedirs(os.path.dirname(fn)) -            except OSError as ose: -                if ose.errno != errno.EEXIST: -                    raise +            compat_os_makedirs(os.path.dirname(fn), exist_ok=True) +            self._write_debug('Saving {section}.{key} to cache'.format(section=section, key=key))              write_json_file({self._VERSION_KEY: __version__, 'data': data}, fn)          except Exception:              tb = traceback.format_exc() -            self._ydl.report_warning( -                'Writing cache to %r failed: %s' % (fn, tb)) +            self._report_warning('Writing cache to {fn!r} failed: {tb}'.format(fn=fn, tb=tb)) + +    def clear(self, section, key, dtype='json'): + +        if not self.enabled: +            return + +        fn = self._get_cache_fn(section, key, dtype) +        self._write_debug('Clearing {section}.{key} from cache'.format(section=section, key=key)) +        try: +            os.remove(fn) +        except Exception as e: +            if getattr(e, 'errno') == errno.ENOENT: +                # file not found +                return +            tb = traceback.format_exc() +            self._report_warning('Clearing cache from {fn!r} failed: {tb}'.format(fn=fn, tb=tb))      def _validate(self, data, min_ver): -        version = try_get(data, lambda x: x[self._VERSION_KEY]) +        version = traverse_obj(data, self._VERSION_KEY)          if not version:  # Backward compatibility              data, version = {'data': data}, self._DEFAULT_VERSION          if not is_outdated_version(version, min_ver or '0', assume_new=False):              return data['data'] -        self._ydl.to_screen( -            'Discarding old cache from version {version} (needs {min_ver})'.format(**locals())) +        self._write_debug('Discarding old 
cache from version {version} (needs {min_ver})'.format(version=version, min_ver=min_ver)) -    def load(self, section, key, dtype='json', default=None, min_ver=None): +    def load(self, section, key, dtype='json', default=None, **kw_min_ver):          assert dtype in ('json',) +        min_ver = kw_min_ver.get('min_ver')          if not self.enabled:              return default          cache_fn = self._get_cache_fn(section, key, dtype)          try: +            with open(cache_fn, encoding='utf-8') as cachef: +                self._write_debug('Loading {section}.{key} from cache'.format(section=section, key=key), only_once=True) +                return self._validate(json.load(cachef), min_ver) +        except (ValueError, KeyError):              try: -                with open(cache_fn, 'r', encoding='utf-8') as cachef: -                    return self._validate(json.load(cachef), min_ver) -            except ValueError: -                try: -                    file_size = os.path.getsize(cache_fn) -                except (OSError, IOError) as oe: -                    file_size = error_to_compat_str(oe) -                self._ydl.report_warning( -                    'Cache retrieval from %s failed (%s)' % (cache_fn, file_size)) -        except IOError: -            pass  # No cache available +                file_size = 'size: %d' % os.path.getsize(cache_fn) +            except (OSError, IOError) as oe: +                file_size = error_to_compat_str(oe) +            self._report_warning('Cache retrieval from %s failed (%s)' % (cache_fn, file_size)) +        except Exception as e: +            if getattr(e, 'errno') == errno.ENOENT: +                # no cache available +                return +            self._report_warning('Cache retrieval from %s failed' % (cache_fn,))          return default      def remove(self):          if not self.enabled: -            self._ydl.to_screen('Cache is disabled (Did you combine --no-cache-dir and --rm-cache-dir?)') +   
         self._to_screen('Cache is disabled (Did you combine --no-cache-dir and --rm-cache-dir?)')              return          cachedir = self._get_root_dir()          if not any((term in cachedir) for term in ('cache', 'tmp')): -            raise Exception('Not removing directory %s - this does not look like a cache dir' % cachedir) +            raise Exception('Not removing directory %s - this does not look like a cache dir' % (cachedir,)) -        self._ydl.to_screen( -            'Removing cache dir %s .' % cachedir, skip_eol=True) +        self._to_screen( +            'Removing cache dir %s .' % (cachedir,), skip_eol=True, ),          if os.path.exists(cachedir): -            self._ydl.to_screen('.', skip_eol=True) +            self._to_screen('.', skip_eol=True)              shutil.rmtree(cachedir) -        self._ydl.to_screen('.') +        self._to_screen('.') diff --git a/youtube_dl/casefold.py b/youtube_dl/casefold.py index ad9c66f8e..712b2e7fa 100644 --- a/youtube_dl/casefold.py +++ b/youtube_dl/casefold.py @@ -10,9 +10,10 @@ from .compat import (  # https://github.com/unicode-org/icu/blob/main/icu4c/source/data/unidata/CaseFolding.txt  # In case newly foldable Unicode characters are defined, paste the new version  # of the text inside the ''' marks. -# The text is expected to have only blank lines andlines with 1st character #, +# The text is expected to have only blank lines and lines with 1st character #,  # all ignored, and fold definitions like this: -# `from_hex_code; space_separated_to_hex_code_list; comment` +# `from_hex_code; status; space_separated_to_hex_code_list; comment` +# Only `status` C/F are used.  
_map_str = '''  # CaseFolding-15.0.0.txt @@ -1657,11 +1658,6 @@ _map = dict(  del _map_str -def casefold(s): +def _casefold(s):      assert isinstance(s, compat_str)      return ''.join((_map.get(c, c) for c in s)) - - -__all__ = [ -    'casefold', -] diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index ed1a33cf2..ebe22bdf9 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -16,7 +16,6 @@ import os  import platform  import re  import shlex -import shutil  import socket  import struct  import subprocess @@ -24,11 +23,15 @@ import sys  import types  import xml.etree.ElementTree +_IDENTITY = lambda x: x +  # naming convention  # 'compat_' + Python3_name.replace('.', '_')  # other aliases exist for convenience and/or legacy +# wrap disposable test values in type() to reclaim storage -# deal with critical unicode/str things first +# deal with critical unicode/str things first: +# compat_str, compat_basestring, compat_chr  try:      # Python 2      compat_str, compat_basestring, compat_chr = ( @@ -39,18 +42,23 @@ except NameError:          str, (str, bytes), chr      ) -# casefold + +# compat_casefold  try:      compat_str.casefold      compat_casefold = lambda s: s.casefold()  except AttributeError: -    from .casefold import casefold as compat_casefold +    from .casefold import _casefold as compat_casefold + +# compat_collections_abc  try:      import collections.abc as compat_collections_abc  except ImportError:      import collections as compat_collections_abc + +# compat_urllib_request  try:      import urllib.request as compat_urllib_request  except ImportError:  # Python 2 @@ -79,11 +87,15 @@ except TypeError:      _add_init_method_arg(compat_urllib_request.Request)      del _add_init_method_arg + +# compat_urllib_error  try:      import urllib.error as compat_urllib_error  except ImportError:  # Python 2      import urllib2 as compat_urllib_error + +# compat_urllib_parse  try:      import urllib.parse as compat_urllib_parse  except 
ImportError:  # Python 2 @@ -98,17 +110,23 @@ except ImportError:  # Python 2  compat_urlparse = compat_urllib_parse  compat_urllib_parse_urlparse = compat_urllib_parse.urlparse + +# compat_urllib_response  try:      import urllib.response as compat_urllib_response  except ImportError:  # Python 2      import urllib as compat_urllib_response + +# compat_urllib_response.addinfourl  try:      compat_urllib_response.addinfourl.status  except AttributeError:      # .getcode() is deprecated in Py 3.      compat_urllib_response.addinfourl.status = property(lambda self: self.getcode()) + +# compat_http_cookiejar  try:      import http.cookiejar as compat_cookiejar  except ImportError:  # Python 2 @@ -127,12 +145,16 @@ else:      compat_cookiejar_Cookie = compat_cookiejar.Cookie  compat_http_cookiejar_Cookie = compat_cookiejar_Cookie + +# compat_http_cookies  try:      import http.cookies as compat_cookies  except ImportError:  # Python 2      import Cookie as compat_cookies  compat_http_cookies = compat_cookies + +# compat_http_cookies_SimpleCookie  if sys.version_info[0] == 2 or sys.version_info < (3, 3):      class compat_cookies_SimpleCookie(compat_cookies.SimpleCookie):          def load(self, rawdata): @@ -155,11 +177,15 @@ else:      compat_cookies_SimpleCookie = compat_cookies.SimpleCookie  compat_http_cookies_SimpleCookie = compat_cookies_SimpleCookie + +# compat_html_entities, probably useless now  try:      import html.entities as compat_html_entities  except ImportError:  # Python 2      import htmlentitydefs as compat_html_entities + +# compat_html_entities_html5  try:  # Python >= 3.3      compat_html_entities_html5 = compat_html_entities.html5  except AttributeError: @@ -2408,18 +2434,24 @@ except AttributeError:      # Py < 3.1      compat_http_client.HTTPResponse.getcode = lambda self: self.status + +# compat_urllib_HTTPError  try:      from urllib.error import HTTPError as compat_HTTPError  except ImportError:  # Python 2      from urllib2 import 
HTTPError as compat_HTTPError  compat_urllib_HTTPError = compat_HTTPError + +# compat_urllib_request_urlretrieve  try:      from urllib.request import urlretrieve as compat_urlretrieve  except ImportError:  # Python 2      from urllib import urlretrieve as compat_urlretrieve  compat_urllib_request_urlretrieve = compat_urlretrieve + +# compat_html_parser_HTMLParser, compat_html_parser_HTMLParseError  try:      from HTMLParser import (          HTMLParser as compat_HTMLParser, @@ -2432,22 +2464,33 @@ except ImportError:  # Python 3          # HTMLParseError was deprecated in Python 3.3 and removed in          # Python 3.5. Introducing dummy exception for Python >3.5 for compatible          # and uniform cross-version exception handling +          class compat_HTMLParseError(Exception):              pass +  compat_html_parser_HTMLParser = compat_HTMLParser  compat_html_parser_HTMLParseError = compat_HTMLParseError + +# compat_subprocess_get_DEVNULL  try:      _DEVNULL = subprocess.DEVNULL      compat_subprocess_get_DEVNULL = lambda: _DEVNULL  except AttributeError:      compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w') + +# compat_http_server  try:      import http.server as compat_http_server  except ImportError:      import BaseHTTPServer as compat_http_server + +# compat_urllib_parse_unquote_to_bytes, +# compat_urllib_parse_unquote, compat_urllib_parse_unquote_plus, +# compat_urllib_parse_urlencode, +# compat_urllib_parse_parse_qs  try:      from urllib.parse import unquote_to_bytes as compat_urllib_parse_unquote_to_bytes      from urllib.parse import unquote as compat_urllib_parse_unquote @@ -2455,8 +2498,7 @@ try:      from urllib.parse import urlencode as compat_urllib_parse_urlencode      from urllib.parse import parse_qs as compat_parse_qs  except ImportError:  # Python 2 -    _asciire = (compat_urllib_parse._asciire if hasattr(compat_urllib_parse, '_asciire') -                else re.compile(r'([\x00-\x7f]+)')) +    _asciire = 
getattr(compat_urllib_parse, '_asciire', None) or re.compile(r'([\x00-\x7f]+)')      # HACK: The following are the correct unquote_to_bytes, unquote and unquote_plus      # implementations from cpython 3.4.3's stdlib. Python 2's version @@ -2524,24 +2566,21 @@ except ImportError:  # Python 2      # Possible solutions are to either port it from python 3 with all      # the friends or manually ensure input query contains only byte strings.      # We will stick with latter thus recursively encoding the whole query. -    def compat_urllib_parse_urlencode(query, doseq=0, encoding='utf-8'): +    def compat_urllib_parse_urlencode(query, doseq=0, safe='', encoding='utf-8', errors='strict'): +          def encode_elem(e):              if isinstance(e, dict):                  e = encode_dict(e)              elif isinstance(e, (list, tuple,)): -                list_e = encode_list(e) -                e = tuple(list_e) if isinstance(e, tuple) else list_e +                e = type(e)(encode_elem(el) for el in e)              elif isinstance(e, compat_str): -                e = e.encode(encoding) +                e = e.encode(encoding, errors)              return e          def encode_dict(d): -            return dict((encode_elem(k), encode_elem(v)) for k, v in d.items()) +            return tuple((encode_elem(k), encode_elem(v)) for k, v in d.items()) -        def encode_list(l): -            return [encode_elem(e) for e in l] - -        return compat_urllib_parse._urlencode(encode_elem(query), doseq=doseq) +        return compat_urllib_parse._urlencode(encode_elem(query), doseq=doseq).decode('ascii')      # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.      
# Python 2's version is apparently totally broken @@ -2596,8 +2635,61 @@ except ImportError:  # Python 2              ('parse_qs', compat_parse_qs)):          setattr(compat_urllib_parse, name, fix) +    try: +        all(chr(i) in b'' for i in range(256)) +    except TypeError: +        # not all chr(i) are str: patch Python2 quote + +        _safemaps = getattr(compat_urllib_parse, '_safemaps', {}) +        _always_safe = frozenset(compat_urllib_parse.always_safe) + +        def _quote(s, safe='/'): +            """quote('abc def') -> 'abc%20def'""" + +            if not s and s is not None:  # fast path +                return s +            safe = frozenset(safe) +            cachekey = (safe, _always_safe) +            try: +                safe_map = _safemaps[cachekey] +            except KeyError: +                safe = _always_safe | safe +                safe_map = {} +                for i in range(256): +                    c = chr(i) +                    safe_map[c] = ( +                        c if (i < 128 and c in safe) +                        else b'%{0:02X}'.format(i)) +                _safemaps[cachekey] = safe_map + +            if safe.issuperset(s): +                return s +            return ''.join(safe_map[c] for c in s) + +        # linked code +        def _quote_plus(s, safe=''): +            return ( +                _quote(s, safe + b' ').replace(b' ', b'+') if b' ' in s +                else _quote(s, safe)) + +        # linked code +        def _urlcleanup(): +            if compat_urllib_parse._urlopener: +                compat_urllib_parse._urlopener.cleanup() +            _safemaps.clear() +            compat_urllib_parse.ftpcache.clear() + +        for name, fix in ( +                ('quote', _quote), +                ('quote_plus', _quote_plus), +                ('urlcleanup', _urlcleanup)): +            setattr(compat_urllib_parse, '_' + name, getattr(compat_urllib_parse, name)) +            setattr(compat_urllib_parse, 
name, fix) +  compat_urllib_parse_parse_qs = compat_parse_qs + +# compat_urllib_request_DataHandler  try:      from urllib.request import DataHandler as compat_urllib_request_DataHandler  except ImportError:  # Python < 3.4 @@ -2632,16 +2724,20 @@ except ImportError:  # Python < 3.4              return compat_urllib_response.addinfourl(io.BytesIO(data), headers, url) + +# compat_xml_etree_ElementTree_ParseError  try:      from xml.etree.ElementTree import ParseError as compat_xml_parse_error  except ImportError:  # Python 2.6      from xml.parsers.expat import ExpatError as compat_xml_parse_error  compat_xml_etree_ElementTree_ParseError = compat_xml_parse_error -etree = xml.etree.ElementTree +# compat_xml_etree_ElementTree_Element +_etree = xml.etree.ElementTree -class _TreeBuilder(etree.TreeBuilder): + +class _TreeBuilder(_etree.TreeBuilder):      def doctype(self, name, pubid, system):          pass @@ -2650,7 +2746,7 @@ try:      # xml.etree.ElementTree.Element is a method in Python <=2.6 and      # the following will crash with:      #  TypeError: isinstance() arg 2 must be a class, type, or tuple of classes and types -    isinstance(None, etree.Element) +    isinstance(None, _etree.Element)      from xml.etree.ElementTree import Element as compat_etree_Element  except TypeError:  # Python <=2.6      from xml.etree.ElementTree import _ElementInterface as compat_etree_Element @@ -2658,12 +2754,12 @@ compat_xml_etree_ElementTree_Element = compat_etree_Element  if sys.version_info[0] >= 3:      def compat_etree_fromstring(text): -        return etree.XML(text, parser=etree.XMLParser(target=_TreeBuilder())) +        return _etree.XML(text, parser=_etree.XMLParser(target=_TreeBuilder()))  else:      # python 2.x tries to encode unicode strings with ascii (see the      # XMLParser._fixtext method)      try: -        _etree_iter = etree.Element.iter +        _etree_iter = _etree.Element.iter      except AttributeError:  # Python <=2.6          def _etree_iter(root):   
           for el in root.findall('*'): @@ -2675,27 +2771,29 @@ else:      # 2.7 source      def _XML(text, parser=None):          if not parser: -            parser = etree.XMLParser(target=_TreeBuilder()) +            parser = _etree.XMLParser(target=_TreeBuilder())          parser.feed(text)          return parser.close()      def _element_factory(*args, **kwargs): -        el = etree.Element(*args, **kwargs) +        el = _etree.Element(*args, **kwargs)          for k, v in el.items():              if isinstance(v, bytes):                  el.set(k, v.decode('utf-8'))          return el      def compat_etree_fromstring(text): -        doc = _XML(text, parser=etree.XMLParser(target=_TreeBuilder(element_factory=_element_factory))) +        doc = _XML(text, parser=_etree.XMLParser(target=_TreeBuilder(element_factory=_element_factory)))          for el in _etree_iter(doc):              if el.text is not None and isinstance(el.text, bytes):                  el.text = el.text.decode('utf-8')          return doc -if hasattr(etree, 'register_namespace'): -    compat_etree_register_namespace = etree.register_namespace -else: + +# compat_xml_etree_register_namespace +try: +    compat_etree_register_namespace = _etree.register_namespace +except AttributeError:      def compat_etree_register_namespace(prefix, uri):          """Register a namespace prefix.          The registry is global, and any existing mapping for either the @@ -2704,14 +2802,16 @@ else:          attributes in this namespace will be serialized with prefix if possible.          ValueError is raised if prefix is reserved or is invalid.          
""" -        if re.match(r"ns\d+$", prefix): -            raise ValueError("Prefix format reserved for internal use") -        for k, v in list(etree._namespace_map.items()): +        if re.match(r'ns\d+$', prefix): +            raise ValueError('Prefix format reserved for internal use') +        for k, v in list(_etree._namespace_map.items()):              if k == uri or v == prefix: -                del etree._namespace_map[k] -        etree._namespace_map[uri] = prefix +                del _etree._namespace_map[k] +        _etree._namespace_map[uri] = prefix  compat_xml_etree_register_namespace = compat_etree_register_namespace + +# compat_xpath, compat_etree_iterfind  if sys.version_info < (2, 7):      # Here comes the crazy part: In 2.6, if the xpath is a unicode,      # .//node does not match if a node is a direct child of . ! @@ -2898,7 +2998,6 @@ if sys.version_info < (2, 7):          def __init__(self, root):              self.root = root -    ##      # Generate all matching objects.      
def compat_etree_iterfind(elem, path, namespaces=None): @@ -2933,13 +3032,15 @@ if sys.version_info < (2, 7):  else: -    compat_xpath = lambda xpath: xpath      compat_etree_iterfind = lambda element, match: element.iterfind(match) +    compat_xpath = _IDENTITY +# compat_os_name  compat_os_name = os._name if os.name == 'java' else os.name +# compat_shlex_quote  if compat_os_name == 'nt':      def compat_shlex_quote(s):          return s if re.match(r'^[-_\w./]+$', s) else '"%s"' % s.replace('"', '\\"') @@ -2954,6 +3055,7 @@ else:                  return "'" + s.replace("'", "'\"'\"'") + "'" +# compat_shlex.split  try:      args = shlex.split('中文')      assert (isinstance(args, list) @@ -2969,6 +3071,7 @@ except (AssertionError, UnicodeEncodeError):          return list(map(lambda s: s.decode('utf-8'), shlex.split(s, comments, posix))) +# compat_ord  def compat_ord(c):      if isinstance(c, int):          return c @@ -2976,6 +3079,7 @@ def compat_ord(c):          return ord(c) +# compat_getenv, compat_os_path_expanduser, compat_setenv  if sys.version_info >= (3, 0):      compat_getenv = os.getenv      compat_expanduser = os.path.expanduser @@ -3063,6 +3167,22 @@ else:  compat_os_path_expanduser = compat_expanduser +# compat_os_makedirs +try: +    os.makedirs('.', exist_ok=True) +    compat_os_makedirs = os.makedirs +except TypeError:  # < Py3.2 +    from errno import EEXIST as _errno_EEXIST + +    def compat_os_makedirs(name, mode=0o777, exist_ok=False): +        try: +            return os.makedirs(name, mode=mode) +        except OSError as ose: +            if not (exist_ok and ose.errno == _errno_EEXIST): +                raise + + +# compat_os_path_realpath  if compat_os_name == 'nt' and sys.version_info < (3, 8):      # os.path.realpath on Windows does not follow symbolic links      # prior to Python 3.8 (see https://bugs.python.org/issue9949) @@ -3076,6 +3196,7 @@ else:  compat_os_path_realpath = compat_realpath +# compat_print  if sys.version_info < (3, 
0):      def compat_print(s):          from .utils import preferredencoding @@ -3086,6 +3207,7 @@ else:          print(s) +# compat_getpass_getpass  if sys.version_info < (3, 0) and sys.platform == 'win32':      def compat_getpass(prompt, *args, **kwargs):          if isinstance(prompt, compat_str): @@ -3098,36 +3220,42 @@ else:  compat_getpass_getpass = compat_getpass +# compat_input  try:      compat_input = raw_input  except NameError:  # Python 3      compat_input = input +# compat_kwargs  # Python < 2.6.5 require kwargs to be bytes  try: -    def _testfunc(x): -        pass -    _testfunc(**{'x': 0}) +    (lambda x: x)(**{'x': 0})  except TypeError:      def compat_kwargs(kwargs):          return dict((bytes(k), v) for k, v in kwargs.items())  else: -    compat_kwargs = lambda kwargs: kwargs +    compat_kwargs = _IDENTITY +# compat_numeric_types  try:      compat_numeric_types = (int, float, long, complex)  except NameError:  # Python 3      compat_numeric_types = (int, float, complex) +# compat_integer_types  try:      compat_integer_types = (int, long)  except NameError:  # Python 3      compat_integer_types = (int, ) +# compat_int +compat_int = compat_integer_types[-1] + +# compat_socket_create_connection  if sys.version_info < (2, 7):      def compat_socket_create_connection(address, timeout, source_address=None):          host, port = address @@ -3154,6 +3282,7 @@ else:      compat_socket_create_connection = socket.create_connection +# compat_contextlib_suppress  try:      from contextlib import suppress as compat_contextlib_suppress  except ImportError: @@ -3196,12 +3325,12 @@ except AttributeError:                          # repeated .close() is OK, but just in case                          with compat_contextlib_suppress(EnvironmentError):                              f.close() -                popen.wait() +            popen.wait()  # Fix https://github.com/ytdl-org/youtube-dl/issues/4223  # See http://bugs.python.org/issue9161 for what is broken -def 
workaround_optparse_bug9161(): +def _workaround_optparse_bug9161():      op = optparse.OptionParser()      og = optparse.OptionGroup(op, 'foo')      try: @@ -3220,9 +3349,10 @@ def workaround_optparse_bug9161():          optparse.OptionGroup.add_option = _compat_add_option -if hasattr(shutil, 'get_terminal_size'):  # Python >= 3.3 -    compat_get_terminal_size = shutil.get_terminal_size -else: +# compat_shutil_get_terminal_size +try: +    from shutil import get_terminal_size as compat_get_terminal_size  # Python >= 3.3 +except ImportError:      _terminal_size = collections.namedtuple('terminal_size', ['columns', 'lines'])      def compat_get_terminal_size(fallback=(80, 24)): @@ -3252,27 +3382,33 @@ else:                  columns = _columns              if lines is None or lines <= 0:                  lines = _lines +          return _terminal_size(columns, lines) +compat_shutil_get_terminal_size = compat_get_terminal_size + +# compat_itertools_count  try: -    itertools.count(start=0, step=1) +    type(itertools.count(start=0, step=1))      compat_itertools_count = itertools.count -except TypeError:  # Python 2.6 +except TypeError:  # Python 2.6 lacks step      def compat_itertools_count(start=0, step=1):          while True:              yield start              start += step +# compat_tokenize_tokenize  if sys.version_info >= (3, 0):      from tokenize import tokenize as compat_tokenize_tokenize  else:      from tokenize import generate_tokens as compat_tokenize_tokenize +# compat_struct_pack, compat_struct_unpack, compat_Struct  try: -    struct.pack('!I', 0) +    type(struct.pack('!I', 0))  except TypeError:      # In Python 2.6 and 2.7.x < 2.7.7, struct requires a bytes argument      # See https://bugs.python.org/issue19099 @@ -3304,8 +3440,10 @@ else:          compat_Struct = struct.Struct -# compat_map/filter() returning an iterator, supposedly the -# same versioning as for zip below +# builtins returning an iterator + +# compat_map, compat_filter +# 
supposedly the same versioning as for zip below  try:      from future_builtins import map as compat_map  except ImportError: @@ -3322,6 +3460,7 @@ except ImportError:      except ImportError:          compat_filter = filter +# compat_zip  try:      from future_builtins import zip as compat_zip  except ImportError:  # not 2.6+ or is 3.x @@ -3331,6 +3470,7 @@ except ImportError:  # not 2.6+ or is 3.x          compat_zip = zip +# compat_itertools_zip_longest  # method renamed between Py2/3  try:      from itertools import zip_longest as compat_itertools_zip_longest @@ -3338,7 +3478,8 @@ except ImportError:      from itertools import izip_longest as compat_itertools_zip_longest -# new class in collections +# compat_collections_chain_map +# collections.ChainMap: new class  try:      from collections import ChainMap as compat_collections_chain_map      # Py3.3's ChainMap is deficient @@ -3394,19 +3535,22 @@ except ImportError:          def new_child(self, m=None, **kwargs):              m = m or {}              m.update(kwargs) -            return compat_collections_chain_map(m, *self.maps) +            # support inheritance ! +            return type(self)(m, *self.maps)          @property          def parents(self): -            return compat_collections_chain_map(*(self.maps[1:])) +            return type(self)(*(self.maps[1:])) +# compat_re_Pattern, compat_re_Match  # Pythons disagree on the type of a pattern (RegexObject, _sre.SRE_Pattern, Pattern, ...?)  
compat_re_Pattern = type(re.compile(''))  # and on the type of a match  compat_re_Match = type(re.match('a', 'a')) +# compat_base64_b64decode  if sys.version_info < (3, 3):      def compat_b64decode(s, *args, **kwargs):          if isinstance(s, compat_str): @@ -3418,6 +3562,7 @@ else:  compat_base64_b64decode = compat_b64decode +# compat_ctypes_WINFUNCTYPE  if platform.python_implementation() == 'PyPy' and sys.pypy_version_info < (5, 4, 0):      # PyPy2 prior to version 5.4.0 expects byte strings as Windows function      # names, see the original PyPy issue [1] and the youtube-dl one [2]. @@ -3436,6 +3581,7 @@ else:          return ctypes.WINFUNCTYPE(*args, **kwargs) +# compat_open  if sys.version_info < (3, 0):      # open(file, mode='r', buffering=- 1, encoding=None, errors=None, newline=None, closefd=True) not: opener=None      def compat_open(file_, *args, **kwargs): @@ -3463,18 +3609,28 @@ except AttributeError:      def compat_datetime_timedelta_total_seconds(td):          return (td.microseconds + (td.seconds + td.days * 24 * 3600) * 10**6) / 10**6 +  # optional decompression packages +# compat_brotli  # PyPi brotli package implements 'br' Content-Encoding  try:      import brotli as compat_brotli  except ImportError:      compat_brotli = None +# compat_ncompress  # PyPi ncompress package implements 'compress' Content-Encoding  try:      import ncompress as compat_ncompress  except ImportError:      compat_ncompress = None +# compat_zstandard +# PyPi zstandard package implements 'zstd' Content-Encoding (RFC 8878 7.2) +try: +    import zstandard as compat_zstandard +except ImportError: +    compat_zstandard = None +  legacy = [      'compat_HTMLParseError', @@ -3491,6 +3647,7 @@ legacy = [      'compat_getpass',      'compat_parse_qs',      'compat_realpath', +    'compat_shlex_split',      'compat_urllib_parse_parse_qs',      'compat_urllib_parse_unquote',      'compat_urllib_parse_unquote_plus', @@ -3504,8 +3661,6 @@ legacy = [  __all__ = [ -    
'compat_html_parser_HTMLParseError', -    'compat_html_parser_HTMLParser',      'compat_Struct',      'compat_base64_b64decode',      'compat_basestring', @@ -3514,13 +3669,9 @@ __all__ = [      'compat_chr',      'compat_collections_abc',      'compat_collections_chain_map', -    'compat_datetime_timedelta_total_seconds', -    'compat_http_cookiejar', -    'compat_http_cookiejar_Cookie', -    'compat_http_cookies', -    'compat_http_cookies_SimpleCookie',      'compat_contextlib_suppress',      'compat_ctypes_WINFUNCTYPE', +    'compat_datetime_timedelta_total_seconds',      'compat_etree_fromstring',      'compat_etree_iterfind',      'compat_filter', @@ -3529,9 +3680,16 @@ __all__ = [      'compat_getpass_getpass',      'compat_html_entities',      'compat_html_entities_html5', +    'compat_html_parser_HTMLParseError', +    'compat_html_parser_HTMLParser', +    'compat_http_cookiejar', +    'compat_http_cookiejar_Cookie', +    'compat_http_cookies', +    'compat_http_cookies_SimpleCookie',      'compat_http_client',      'compat_http_server',      'compat_input', +    'compat_int',      'compat_integer_types',      'compat_itertools_count',      'compat_itertools_zip_longest', @@ -3541,6 +3699,7 @@ __all__ = [      'compat_numeric_types',      'compat_open',      'compat_ord', +    'compat_os_makedirs',      'compat_os_name',      'compat_os_path_expanduser',      'compat_os_path_realpath', @@ -3550,7 +3709,7 @@ __all__ = [      'compat_register_utf8',      'compat_setenv',      'compat_shlex_quote', -    'compat_shlex_split', +    'compat_shutil_get_terminal_size',      'compat_socket_create_connection',      'compat_str',      'compat_struct_pack', @@ -3570,5 +3729,5 @@ __all__ = [      'compat_xml_etree_register_namespace',      'compat_xpath',      'compat_zip', -    'workaround_optparse_bug9161', +    'compat_zstandard',  ] diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py index 91e691776..8354030a9 100644 --- 
a/youtube_dl/downloader/common.py +++ b/youtube_dl/downloader/common.py @@ -11,6 +11,7 @@ from ..utils import (      decodeArgument,      encodeFilename,      error_to_compat_str, +    float_or_none,      format_bytes,      shell_quote,      timeconvert, @@ -367,14 +368,27 @@ class FileDownloader(object):                  })                  return True -        min_sleep_interval = self.params.get('sleep_interval') -        if min_sleep_interval: -            max_sleep_interval = self.params.get('max_sleep_interval', min_sleep_interval) -            sleep_interval = random.uniform(min_sleep_interval, max_sleep_interval) +        min_sleep_interval, max_sleep_interval = ( +            float_or_none(self.params.get(interval), default=0) +            for interval in ('sleep_interval', 'max_sleep_interval')) + +        sleep_note = '' +        available_at = info_dict.get('available_at') +        if available_at: +            forced_sleep_interval = available_at - int(time.time()) +            if forced_sleep_interval > min_sleep_interval: +                sleep_note = 'as required by the site' +                min_sleep_interval = forced_sleep_interval +            if forced_sleep_interval > max_sleep_interval: +                max_sleep_interval = forced_sleep_interval + +        sleep_interval = random.uniform( +            min_sleep_interval, max_sleep_interval or min_sleep_interval) + +        if sleep_interval > 0:              self.to_screen( -                '[download] Sleeping %s seconds...' % ( -                    int(sleep_interval) if sleep_interval.is_integer() -                    else '%.2f' % sleep_interval)) +                '[download] Sleeping %.2f seconds %s...' 
% ( +                    sleep_interval, sleep_note))              time.sleep(sleep_interval)          return self.real_download(filename, info_dict) diff --git a/youtube_dl/extractor/bokecc.py b/youtube_dl/extractor/bokecc.py index 6017e8344..4b8bef391 100644 --- a/youtube_dl/extractor/bokecc.py +++ b/youtube_dl/extractor/bokecc.py @@ -32,7 +32,7 @@ class BokeCCBaseIE(InfoExtractor):  class BokeCCIE(BokeCCBaseIE): -    _IE_DESC = 'CC่ง้ข' +    IE_DESC = 'CC่ง้ข'      _VALID_URL = r'https?://union\.bokecc\.com/playvideo\.bo\?(?P<query>.*)'      _TESTS = [{ diff --git a/youtube_dl/extractor/cloudy.py b/youtube_dl/extractor/cloudy.py index 85ca20ecc..d39a9a5c2 100644 --- a/youtube_dl/extractor/cloudy.py +++ b/youtube_dl/extractor/cloudy.py @@ -9,7 +9,7 @@ from ..utils import (  class CloudyIE(InfoExtractor): -    _IE_DESC = 'cloudy.ec' +    IE_DESC = 'cloudy.ec'      _VALID_URL = r'https?://(?:www\.)?cloudy\.ec/(?:v/|embed\.php\?.*?\bid=)(?P<id>[A-Za-z0-9]+)'      _TESTS = [{          'url': 'https://www.cloudy.ec/v/af511e2527aac', diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 78704b557..a64fcfccc 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -422,6 +422,8 @@ class InfoExtractor(object):      _GEO_COUNTRIES = None      _GEO_IP_BLOCKS = None      _WORKING = True +    # supply this in public subclasses: used in supported sites list, etc +    # IE_DESC = 'short description of IE'      def __init__(self, downloader=None):          """Constructor. 
Receives an optional downloader.""" @@ -503,7 +505,7 @@ class InfoExtractor(object):          if not self._x_forwarded_for_ip:              # Geo bypass mechanism is explicitly disabled by user -            if not self._downloader.params.get('geo_bypass', True): +            if not self.get_param('geo_bypass', True):                  return              if not geo_bypass_context: @@ -525,7 +527,7 @@ class InfoExtractor(object):              # Explicit IP block specified by user, use it right away              # regardless of whether extractor is geo bypassable or not -            ip_block = self._downloader.params.get('geo_bypass_ip_block', None) +            ip_block = self.get_param('geo_bypass_ip_block', None)              # Otherwise use random IP block from geo bypass context but only              # if extractor is known as geo bypassable @@ -536,8 +538,8 @@ class InfoExtractor(object):              if ip_block:                  self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block) -                if self._downloader.params.get('verbose', False): -                    self._downloader.to_screen( +                if self.get_param('verbose', False): +                    self.to_screen(                          '[debug] Using fake IP %s as X-Forwarded-For.'                          
% self._x_forwarded_for_ip)                  return @@ -546,7 +548,7 @@ class InfoExtractor(object):              # Explicit country code specified by user, use it right away              # regardless of whether extractor is geo bypassable or not -            country = self._downloader.params.get('geo_bypass_country', None) +            country = self.get_param('geo_bypass_country', None)              # Otherwise use random country code from geo bypass context but              # only if extractor is known as geo bypassable @@ -557,8 +559,8 @@ class InfoExtractor(object):              if country:                  self._x_forwarded_for_ip = GeoUtils.random_ipv4(country) -                if self._downloader.params.get('verbose', False): -                    self._downloader.to_screen( +                if self.get_param('verbose', False): +                    self.to_screen(                          '[debug] Using fake IP %s (%s) as X-Forwarded-For.'                          % (self._x_forwarded_for_ip, country.upper())) @@ -584,9 +586,9 @@ class InfoExtractor(object):              raise ExtractorError('An extractor error has occurred.', cause=e)      def __maybe_fake_ip_and_retry(self, countries): -        if (not self._downloader.params.get('geo_bypass_country', None) +        if (not self.get_param('geo_bypass_country', None)                  and self._GEO_BYPASS -                and self._downloader.params.get('geo_bypass', True) +                and self.get_param('geo_bypass', True)                  and not self._x_forwarded_for_ip                  and countries):              country_code = random.choice(countries) @@ -696,7 +698,7 @@ class InfoExtractor(object):              if fatal:                  raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)              else: -                self._downloader.report_warning(errmsg) +                self.report_warning(errmsg)                  return False      def _download_webpage_handle(self, 
url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None): @@ -768,11 +770,11 @@ class InfoExtractor(object):              webpage_bytes = prefix + webpage_bytes          if not encoding:              encoding = self._guess_encoding_from_content(content_type, webpage_bytes) -        if self._downloader.params.get('dump_intermediate_pages', False): +        if self.get_param('dump_intermediate_pages', False):              self.to_screen('Dumping request to ' + urlh.geturl())              dump = base64.b64encode(webpage_bytes).decode('ascii') -            self._downloader.to_screen(dump) -        if self._downloader.params.get('write_pages', False): +            self.to_screen(dump) +        if self.get_param('write_pages', False):              basen = '%s_%s' % (video_id, urlh.geturl())              if len(basen) > 240:                  h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest() @@ -974,19 +976,9 @@ class InfoExtractor(object):          """Print msg to screen, prefixing it with '[ie_name]'"""          self._downloader.to_screen(self.__ie_msg(msg)) -    def write_debug(self, msg, only_once=False, _cache=[]): +    def write_debug(self, msg, only_once=False):          '''Log debug message or Print message to stderr''' -        if not self.get_param('verbose', False): -            return -        message = '[debug] ' + self.__ie_msg(msg) -        logger = self.get_param('logger') -        if logger: -            logger.debug(message) -        else: -            if only_once and hash(message) in _cache: -                return -            self._downloader.to_stderr(message) -            _cache.append(hash(message)) +        self._downloader.write_debug(self.__ie_msg(msg), only_once=only_once)      # name, default=None, *args, **kwargs      def get_param(self, name, *args, **kwargs): @@ -1082,7 +1074,7 @@ class InfoExtractor(object):                  if mobj:                      
break -        if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty(): +        if not self.get_param('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():              _name = '\033[0;34m%s\033[0m' % name          else:              _name = name @@ -1100,7 +1092,7 @@ class InfoExtractor(object):          elif fatal:              raise RegexNotFoundError('Unable to extract %s' % _name)          else: -            self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message()) +            self.report_warning('unable to extract %s' % _name + bug_reports_message())              return None      def _search_json(self, start_pattern, string, name, video_id, **kwargs): @@ -1170,7 +1162,7 @@ class InfoExtractor(object):          username = None          password = None -        if self._downloader.params.get('usenetrc', False): +        if self.get_param('usenetrc', False):              try:                  netrc_machine = netrc_machine or self._NETRC_MACHINE                  info = netrc.netrc().authenticators(netrc_machine) @@ -1181,7 +1173,7 @@ class InfoExtractor(object):                      raise netrc.NetrcParseError(                          'No authenticators for %s' % netrc_machine)              except (AttributeError, IOError, netrc.NetrcParseError) as err: -                self._downloader.report_warning( +                self.report_warning(                      'parsing .netrc: %s' % error_to_compat_str(err))          return username, password @@ -1218,10 +1210,10 @@ class InfoExtractor(object):          """          if self._downloader is None:              return None -        downloader_params = self._downloader.params -        if downloader_params.get('twofactor') is not None: -            return downloader_params['twofactor'] +        twofactor = self.get_param('twofactor') +        if twofactor is not None: +            return twofactor          return compat_getpass('Type 
%s and press [Return]: ' % note) @@ -1356,7 +1348,7 @@ class InfoExtractor(object):          elif fatal:              raise RegexNotFoundError('Unable to extract JSON-LD')          else: -            self._downloader.report_warning('unable to extract JSON-LD %s' % bug_reports_message()) +            self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())              return {}      def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None): @@ -1587,7 +1579,7 @@ class InfoExtractor(object):              if f.get('vcodec') == 'none':  # audio only                  preference -= 50 -                if self._downloader.params.get('prefer_free_formats'): +                if self.get_param('prefer_free_formats'):                      ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']                  else:                      ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a'] @@ -1599,7 +1591,7 @@ class InfoExtractor(object):              else:                  if f.get('acodec') == 'none':  # video only                      preference -= 40 -                if self._downloader.params.get('prefer_free_formats'): +                if self.get_param('prefer_free_formats'):                      ORDER = ['flv', 'mp4', 'webm']                  else:                      ORDER = ['webm', 'flv', 'mp4'] @@ -1665,7 +1657,7 @@ class InfoExtractor(object):          """ Either "http:" or "https:", depending on the user's preferences """          return (              'http:' -            if self._downloader.params.get('prefer_insecure', False) +            if self.get_param('prefer_insecure', False)              else 'https:')      def _proto_relative_url(self, url, scheme=None): @@ -3197,7 +3189,7 @@ class InfoExtractor(object):              if fatal:                  raise ExtractorError(msg)              else: -                self._downloader.report_warning(msg) +                self.report_warning(msg)          return res      def 
_float(self, v, name, fatal=False, **kwargs): @@ -3207,7 +3199,7 @@ class InfoExtractor(object):              if fatal:                  raise ExtractorError(msg)              else: -                self._downloader.report_warning(msg) +                self.report_warning(msg)          return res      def _set_cookie(self, domain, name, value, expire_time=None, port=None, @@ -3216,12 +3208,12 @@ class InfoExtractor(object):              0, name, value, port, port is not None, domain, True,              domain.startswith('.'), path, True, secure, expire_time,              discard, None, None, rest) -        self._downloader.cookiejar.set_cookie(cookie) +        self.cookiejar.set_cookie(cookie)      def _get_cookies(self, url):          """ Return a compat_cookies_SimpleCookie with the cookies for the url """          req = sanitized_Request(url) -        self._downloader.cookiejar.add_cookie_header(req) +        self.cookiejar.add_cookie_header(req)          return compat_cookies_SimpleCookie(req.get_header('Cookie'))      def _apply_first_set_cookie_header(self, url_handle, cookie): @@ -3281,8 +3273,8 @@ class InfoExtractor(object):          return not any_restricted      def extract_subtitles(self, *args, **kwargs): -        if (self._downloader.params.get('writesubtitles', False) -                or self._downloader.params.get('listsubtitles')): +        if (self.get_param('writesubtitles', False) +                or self.get_param('listsubtitles')):              return self._get_subtitles(*args, **kwargs)          return {} @@ -3303,7 +3295,11 @@ class InfoExtractor(object):          """ Merge subtitle dictionaries, language by language. 
"""          # ..., * , target=None -        target = kwargs.get('target') or dict(subtitle_dict1) +        target = kwargs.get('target') +        if target is None: +            target = dict(subtitle_dict1) +        else: +            subtitle_dicts = (subtitle_dict1,) + subtitle_dicts          for subtitle_dict in subtitle_dicts:              for lang in subtitle_dict: @@ -3311,8 +3307,8 @@ class InfoExtractor(object):          return target      def extract_automatic_captions(self, *args, **kwargs): -        if (self._downloader.params.get('writeautomaticsub', False) -                or self._downloader.params.get('listsubtitles')): +        if (self.get_param('writeautomaticsub', False) +                or self.get_param('listsubtitles')):              return self._get_automatic_captions(*args, **kwargs)          return {} @@ -3320,9 +3316,9 @@ class InfoExtractor(object):          raise NotImplementedError('This method must be implemented by subclasses')      def mark_watched(self, *args, **kwargs): -        if (self._downloader.params.get('mark_watched', False) +        if (self.get_param('mark_watched', False)                  and (self._get_login_info()[0] is not None -                     or self._downloader.params.get('cookiefile') is not None)): +                     or self.get_param('cookiefile') is not None)):              self._mark_watched(*args, **kwargs)      def _mark_watched(self, *args, **kwargs): @@ -3330,7 +3326,7 @@ class InfoExtractor(object):      def geo_verification_headers(self):          headers = {} -        geo_verification_proxy = self._downloader.params.get('geo_verification_proxy') +        geo_verification_proxy = self.get_param('geo_verification_proxy')          if geo_verification_proxy:              headers['Ytdl-request-proxy'] = geo_verification_proxy          return headers diff --git a/youtube_dl/extractor/itv.py b/youtube_dl/extractor/itv.py index c64af3be6..2510ad887 100644 --- a/youtube_dl/extractor/itv.py +++ 
b/youtube_dl/extractor/itv.py @@ -35,15 +35,6 @@ from ..utils import (  class ITVBaseIE(InfoExtractor): -    def _search_nextjs_data(self, webpage, video_id, **kw): -        transform_source = kw.pop('transform_source', None) -        fatal = kw.pop('fatal', True) -        return self._parse_json( -            self._search_regex( -                r'''<script\b[^>]+\bid=('|")__NEXT_DATA__\1[^>]*>(?P<js>[^<]+)</script>''', -                webpage, 'next.js data', group='js', fatal=fatal, **kw), -            video_id, transform_source=transform_source, fatal=fatal) -      def __handle_request_webpage_error(self, err, video_id=None, errnote=None, fatal=True):          if errnote is False:              return False @@ -109,7 +100,9 @@ class ITVBaseIE(InfoExtractor):  class ITVIE(ITVBaseIE):      _VALID_URL = r'https?://(?:www\.)?itv\.com/(?:(?P<w>watch)|hub)/[^/]+/(?(w)[\w-]+/)(?P<id>\w+)' -    _IE_DESC = 'ITVX' +    IE_DESC = 'ITVX' +    _WORKING = False +      _TESTS = [{          'note': 'Hub URLs redirect to ITVX',          'url': 'https://www.itv.com/hub/liar/2a4547a0012', @@ -270,7 +263,7 @@ class ITVIE(ITVBaseIE):                  'ext': determine_ext(href, 'vtt'),              }) -        next_data = self._search_nextjs_data(webpage, video_id, fatal=False, default='{}') +        next_data = self._search_nextjs_data(webpage, video_id, fatal=False, default={})          video_data.update(traverse_obj(next_data, ('props', 'pageProps', ('title', 'episode')), expected_type=dict)[0] or {})          title = traverse_obj(video_data, 'headerTitle', 'episodeTitle')          info = self._og_extract(webpage, require_title=not title) @@ -323,7 +316,7 @@ class ITVIE(ITVBaseIE):  class ITVBTCCIE(ITVBaseIE):      _VALID_URL = r'https?://(?:www\.)?itv\.com/(?!(?:watch|hub)/)(?:[^/]+/)+(?P<id>[^/?#&]+)' -    _IE_DESC = 'ITV articles: News, British Touring Car Championship' +    IE_DESC = 'ITV articles: News, British Touring Car Championship'      _TESTS = [{          'note': 
'British Touring Car Championship',          'url': 'https://www.itv.com/btcc/articles/btcc-2018-all-the-action-from-brands-hatch', diff --git a/youtube_dl/extractor/senateisvp.py b/youtube_dl/extractor/senateisvp.py index db5ef8b57..b8ac58713 100644 --- a/youtube_dl/extractor/senateisvp.py +++ b/youtube_dl/extractor/senateisvp.py @@ -47,7 +47,7 @@ class SenateISVPIE(InfoExtractor):          ['vetaff', '76462', 'http://vetaff-f.akamaihd.net'],          ['arch', '', 'http://ussenate-f.akamaihd.net/']      ] -    _IE_NAME = 'senate.gov' +    IE_NAME = 'senate.gov'      _VALID_URL = r'https?://(?:www\.)?senate\.gov/isvp/?\?(?P<qs>.+)'      _TESTS = [{          'url': 'http://www.senate.gov/isvp/?comm=judiciary&type=live&stt=&filename=judiciary031715&auto_play=false&wmode=transparent&poster=http%3A%2F%2Fwww.judiciary.senate.gov%2Fthemes%2Fjudiciary%2Fimages%2Fvideo-poster-flash-fit.png', diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 56957a661..c045bc8bc 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1,5 +1,4 @@  # coding: utf-8 -  from __future__ import unicode_literals  import collections @@ -27,11 +26,14 @@ from ..compat import (  )  from ..jsinterp import JSInterpreter  from ..utils import ( +    bug_reports_message,      clean_html,      dict_get,      error_to_compat_str,      ExtractorError, +    filter_dict,      float_or_none, +    get_first,      extract_attributes,      get_element_by_attribute,      int_or_none, @@ -46,6 +48,7 @@ from ..utils import (      parse_duration,      parse_qs,      qualities, +    remove_end,      remove_start,      smuggle_url,      str_or_none, @@ -63,6 +66,7 @@ from ..utils import (      url_or_none,      urlencode_postdata,      urljoin, +    variadic,  ) @@ -82,9 +86,79 @@ class YoutubeBaseInfoExtractor(InfoExtractor):      _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM)' +    _INNERTUBE_CLIENTS = { +        
'ios': { +            'INNERTUBE_CONTEXT': { +                'client': { +                    'clientName': 'IOS', +                    'clientVersion': '20.10.4', +                    'deviceMake': 'Apple', +                    'deviceModel': 'iPhone16,2', +                    'userAgent': 'com.google.ios.youtube/20.10.4 (iPhone16,2; U; CPU iOS 18_3_2 like Mac OS X;)', +                    'osName': 'iPhone', +                    'osVersion': '18.3.2.22D82', +                }, +            }, +            'INNERTUBE_CONTEXT_CLIENT_NAME': 5, +            'REQUIRE_PO_TOKEN': False, +            'REQUIRE_JS_PLAYER': False, +        }, +        # mweb has 'ultralow' formats +        # See: https://github.com/yt-dlp/yt-dlp/pull/557 +        'mweb': { +            'INNERTUBE_CONTEXT': { +                'client': { +                    'clientName': 'MWEB', +                    'clientVersion': '2.2.20250925.01.00', +                    # mweb previously did not require PO Token with this UA +                    'userAgent': 'Mozilla/5.0 (iPad; CPU OS 16_7_10 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1,gzip(gfe)', +                }, +            }, +            'INNERTUBE_CONTEXT_CLIENT_NAME': 2, +            'REQUIRE_PO_TOKEN': True, +            'SUPPORTS_COOKIES': True, +        }, +        'tv': { +            'INNERTUBE_CONTEXT': { +                'client': { +                    'clientName': 'TVHTML5', +                    'clientVersion': '7.20250312.16.00', +                    # See: https://github.com/youtube/cobalt/blob/main/cobalt/browser/user_agent/user_agent_platform_info.cc#L506 +                    'userAgent': 'Mozilla/5.0 (ChromiumStylePlatform) Cobalt/25.lts.30.1034943-gold (unlike Gecko), Unknown_TV_Unknown_0/Unknown (Unknown, Unknown)', +                }, +            }, +            'INNERTUBE_CONTEXT_CLIENT_NAME': 7, +            'SUPPORTS_COOKIES': True, +        }, + +        'web': { +   
         'INNERTUBE_CONTEXT': { +                'client': { +                    'clientName': 'WEB', +                    'clientVersion': '2.20250925.01.00', +                    'userAgent': 'Mozilla/5.0', +                }, +            }, +            'INNERTUBE_CONTEXT_CLIENT_NAME': 1, +            'REQUIRE_PO_TOKEN': True, +            'SUPPORTS_COOKIES': True, +        }, +        # Safari UA returns pre-merged video+audio 144p/240p/360p/720p/1080p HLS formats +        'web_safari': { +            'INNERTUBE_CONTEXT': { +                'client': { +                    'clientName': 'WEB', +                    'clientVersion': '2.20250925.01.00', +                    'userAgent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.5 Safari/605.1.15,gzip(gfe)', +                }, +            }, +        }, +    } +      def _login(self):          """          Attempt to log in to YouTube. +          True is returned if successful or skipped.          False is returned if login failed. 
@@ -281,14 +355,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):          if not self._login():              return -    _DEFAULT_API_DATA = { -        'context': { -            'client': { -                'clientName': 'WEB', -                'clientVersion': '2.20201021.03.00', -            }, -        }, -    } +    _DEFAULT_API_DATA = {'context': _INNERTUBE_CLIENTS['web']['INNERTUBE_CONTEXT']}      _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;'      _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;' @@ -321,19 +388,24 @@ class YoutubeBaseInfoExtractor(InfoExtractor):              '{0} {1} {2}'.format(time_now, self._SAPISID, origin).encode('utf-8')).hexdigest()          return 'SAPISIDHASH {0}_{1}'.format(time_now, sapisidhash) -    def _call_api(self, ep, query, video_id, fatal=True, headers=None): +    def _call_api(self, ep, query, video_id, fatal=True, headers=None, +                  note='Downloading API JSON'):          data = self._DEFAULT_API_DATA.copy()          data.update(query)          real_headers = {'content-type': 'application/json'}          if headers:              real_headers.update(headers) +        # was: 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8' +        api_key = self.get_param('youtube_innertube_key')          return self._download_json(              'https://www.youtube.com/youtubei/v1/%s' % ep, video_id=video_id, -            note='Downloading API JSON', errnote='Unable to download API page', +            note=note, errnote='Unable to download API page',              data=json.dumps(data).encode('utf8'), fatal=fatal, -            headers=real_headers, -            query={'key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'}) +            headers=real_headers, query=filter_dict({ +                'key': api_key, +                'prettyPrint': 'false', +            }))      def _extract_yt_initial_data(self, video_id, webpage):          return 
self._parse_json( @@ -342,11 +414,32 @@ class YoutubeBaseInfoExtractor(InfoExtractor):                   self._YT_INITIAL_DATA_RE), webpage, 'yt initial data'),              video_id) +    def _extract_visitor_data(self, *args): +        """ +        Extract visitorData from an API response or ytcfg + +        Appears to be used to track session state +        """ +        visitor_data = self.get_param('youtube_visitor_data') +        if visitor_data: +            return visitor_data + +        return get_first( +            args, (('VISITOR_DATA', +                    ('INNERTUBE_CONTEXT', 'client', 'visitorData'), +                    ('responseContext', 'visitorData')), +                   T(compat_str))) +      def _extract_ytcfg(self, video_id, webpage): -        return self._parse_json( -            self._search_regex( -                r'ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;', webpage, 'ytcfg', -                default='{}'), video_id, fatal=False) or {} +        ytcfg = self._search_json( +            r'ytcfg\.set\s*\(', webpage, 'ytcfg', video_id, +            end_pattern=r'\)\s*;', default={}) + +        traverse_obj(ytcfg, ( +            'INNERTUBE_CONTEXT', 'client', 'configInfo', +            T(lambda x: x.pop('appInstallData', None)))) + +        return ytcfg      def _extract_video(self, renderer):          video_id = renderer['videoId'] @@ -381,6 +474,27 @@ class YoutubeBaseInfoExtractor(InfoExtractor):              'uploader': uploader,          } +    @staticmethod +    def _extract_thumbnails(data, *path_list, **kw_final_key): +        """ +        Extract thumbnails from thumbnails dict + +        @param path_list: path list to level that contains 'thumbnails' key +        """ +        final_key = kw_final_key.get('final_key', 'thumbnails') + +        return traverse_obj(data, (( +            tuple(variadic(path) + (final_key, Ellipsis) +                  for path in path_list or [()])), { +            'url': ('url', T(url_or_none), +                
    # Sometimes youtube gives a wrong thumbnail URL. See: +                    # https://github.com/yt-dlp/yt-dlp/issues/233 +                    # https://github.com/ytdl-org/youtube-dl/issues/28023 +                    T(lambda u: update_url(u, query=None) if u and 'maxresdefault' in u else u)), +            'height': ('height', T(int_or_none)), +            'width': ('width', T(int_or_none)), +        }, T(lambda t: t if t.get('url') else None))) +      def _search_results(self, query, params):          data = {              'context': { @@ -395,42 +509,38 @@ class YoutubeBaseInfoExtractor(InfoExtractor):              data['params'] = params          for page_num in itertools.count(1):              search = self._download_json( -                'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', +                'https://www.youtube.com/youtubei/v1/search',                  video_id='query "%s"' % query,                  note='Downloading page %s' % page_num,                  errnote='Unable to download API page', fatal=False,                  data=json.dumps(data).encode('utf8'), +                query={ +                    # 'key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', +                    'prettyPrint': 'false', +                },                  headers={'content-type': 'application/json'})              if not search:                  break -            slr_contents = try_get( +            slr_contents = traverse_obj(                  search, -                (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'], -                 lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']), -                list) +                ('contents', 'twoColumnSearchResultsRenderer', 'primaryContents', +                 'sectionListRenderer', 'contents'), +                ('onResponseReceivedCommands', 0, 
'appendContinuationItemsAction', +                 'continuationItems'), +                expected_type=list)              if not slr_contents:                  break -            for slr_content in slr_contents: -                isr_contents = try_get( -                    slr_content, -                    lambda x: x['itemSectionRenderer']['contents'], -                    list) -                if not isr_contents: -                    continue -                for content in isr_contents: -                    if not isinstance(content, dict): -                        continue -                    video = content.get('videoRenderer') -                    if not isinstance(video, dict): -                        continue -                    video_id = video.get('videoId') -                    if not video_id: -                        continue -                    yield self._extract_video(video) -            token = try_get( +            for video in traverse_obj( +                    slr_contents, +                    (Ellipsis, 'itemSectionRenderer', 'contents', +                     Ellipsis, 'videoRenderer', +                     T(lambda v: v if v.get('videoId') else None))): +                yield self._extract_video(video) + +            token = traverse_obj(                  slr_contents, -                lambda x: x[-1]['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'], -                compat_str) +                (-1, 'continuationItemRenderer', 'continuationEndpoint', +                 'continuationCommand', 'token', T(compat_str)))              if not token:                  break              data['continuation'] = token @@ -590,11 +700,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):          'invidious': '|'.join(_INVIDIOUS_SITES),      }      _PLAYER_INFO_RE = ( -        r'/s/player/(?P<id>[a-zA-Z0-9_-]{8,})/player', -        
r'/(?P<id>[a-zA-Z0-9_-]{8,})/player(?:_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?|-plasma-ias-(?:phone|tablet)-[a-z]{2}_[A-Z]{2}\.vflset)/base\.js$', -        r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.js$', +        r'/s/player/(?P<id>[a-zA-Z0-9_-]{8,})/(?:tv-)?player', +        r'/(?P<id>[a-zA-Z0-9_-]{8,})/player(?:_ias(?:_tce)?\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?|-plasma-ias-(?:phone|tablet)-[a-z]{2}_[A-Z]{2}\.vflset)/base\.js$', +        r'\b(?P<id>vfl[a-zA-Z0-9_-]{6,})\b.*?\.js$',      ) -    _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt') +    _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'srt', 'vtt')      _GEO_BYPASS = False @@ -1485,6 +1595,18 @@ class YoutubeIE(YoutubeBaseInfoExtractor):          '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},      } +    _PLAYER_JS_VARIANT_MAP = ( +        ('main', 'player_ias.vflset/en_US/base.js'), +        ('tcc', 'player_ias_tcc.vflset/en_US/base.js'), +        ('tce', 'player_ias_tce.vflset/en_US/base.js'), +        ('es5', 'player_es5.vflset/en_US/base.js'), +        ('es6', 'player_es6.vflset/en_US/base.js'), +        ('tv', 'tv-player-ias.vflset/tv-player-ias.js'), +        ('tv_es6', 'tv-player-es6.vflset/tv-player-es6.js'), +        ('phone', 'player-plasma-ias-phone-en_US.vflset/base.js'), +        ('tablet', 'player-plasma-ias-tablet-en_US.vflset/base.js'), +    ) +      @classmethod      def suitable(cls, url):          if parse_qs(url).get('list', [None])[0]: @@ -1496,6 +1618,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor):          self._code_cache = {}          self._player_cache = {} +    def _get_player_js_version(self): +        player_js_version = self.get_param('youtube_player_js_version') or '20348@0004de42' +        sts_hash = self._search_regex( +            ('^actual$(^)?(^)?', r'^([0-9]{5,})@([0-9a-f]{8,})$'), +            player_js_version, 'player_js_version', group=(1, 2), default=None) +        if sts_hash: +            return sts_hash +        
self.report_warning( +            'Invalid player JS version "{0}" specified. ' +            'It should be "{1}" or in the format of {2}'.format( +                player_js_version, 'actual', 'SignatureTimeStamp@Hash'), only_once=True) +        return None, None +      # *ytcfgs, webpage=None      def _extract_player_url(self, *ytcfgs, **kw_webpage):          if ytcfgs and not isinstance(ytcfgs[0], dict): @@ -1506,64 +1641,139 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  webpage or '', 'player URL', fatal=False)              if player_url:                  ytcfgs = ytcfgs + ({'PLAYER_JS_URL': player_url},) -        return traverse_obj( +        player_url = traverse_obj(              ytcfgs, (Ellipsis, 'PLAYER_JS_URL'), (Ellipsis, 'WEB_PLAYER_CONTEXT_CONFIGS', Ellipsis, 'jsUrl'),              get_all=False, expected_type=lambda u: urljoin('https://www.youtube.com', u)) +        player_id_override = self._get_player_js_version()[1] + +        requested_js_variant = self.get_param('youtube_player_js_variant') or 'main' +        variant_js = next( +            (v for k, v in self._PLAYER_JS_VARIANT_MAP if k == requested_js_variant), +            None) +        if variant_js: +            player_id = player_id_override or self._extract_player_info(player_url) +            original_url = player_url +            player_url = '/s/player/{0}/{1}'.format(player_id, variant_js) +            if original_url != player_url: +                self.write_debug( +                    'Forcing "{0}" player JS variant for player {1}\n' +                    '        original url = {2}'.format( +                        requested_js_variant, player_id, original_url), +                    only_once=True) +        elif requested_js_variant != 'actual': +            self.report_warning( +                'Invalid player JS variant name "{0}" requested. 
' +                'Valid choices are: {1}'.format( +                    requested_js_variant, ','.join(k for k, _ in self._PLAYER_JS_VARIANT_MAP)), +                only_once=True) + +        return urljoin('https://www.youtube.com', player_url) +      def _download_player_url(self, video_id, fatal=False):          res = self._download_webpage(              'https://www.youtube.com/iframe_api',              note='Downloading iframe API JS', video_id=video_id, fatal=fatal)          player_version = self._search_regex(              r'player\\?/([0-9a-fA-F]{8})\\?/', res or '', 'player version', fatal=fatal, -            default=NO_DEFAULT if res else None) -        if player_version: -            return 'https://www.youtube.com/s/player/{0}/player_ias.vflset/en_US/base.js'.format(player_version) +            default=NO_DEFAULT if res else None) or None +        return player_version and 'https://www.youtube.com/s/player/{0}/player_ias.vflset/en_US/base.js'.format(player_version)      def _signature_cache_id(self, example_sig):          """ Return a string representation of a signature """          return '.'.join(compat_str(len(part)) for part in example_sig.split('.')) -    @classmethod -    def _extract_player_info(cls, player_url): -        for player_re in cls._PLAYER_INFO_RE: -            id_m = re.search(player_re, player_url) -            if id_m: -                break -        else: -            raise ExtractorError('Cannot identify player %r' % player_url) -        return id_m.group('id') +    def _extract_player_info(self, player_url): +        try: +            return self._search_regex( +                self._PLAYER_INFO_RE, player_url, 'player info', group='id') +        except ExtractorError as e: +            raise ExtractorError( +                'Cannot identify player %r' % (player_url,), cause=e) -    def _load_player(self, video_id, player_url, fatal=True, player_id=None): -        if not player_id: +    def _player_js_cache_key(self, 
player_url, extra_id=None, _cache={}): +        if player_url not in _cache:              player_id = self._extract_player_info(player_url) -        if player_id not in self._code_cache: +            player_path = remove_start( +                compat_urllib_parse.urlparse(player_url).path, +                '/s/player/{0}/'.format(player_id)) +            variant = next((k for k, v in self._PLAYER_JS_VARIANT_MAP +                           if v == player_path), None) +            if not variant: +                variant = next( +                    (k for k, v in self._PLAYER_JS_VARIANT_MAP +                     if re.match(re.escape(v).replace('en_US', r'\w+') + '$', player_path)), +                    None) +            if not variant: +                self.write_debug( +                    'Unable to determine player JS variant\n' +                    '        player = {0}'.format(player_url), only_once=True) +                variant = re.sub(r'[^a-zA-Z0-9]', '_', remove_end(player_path, '.js')) +            _cache[player_url] = join_nonempty(player_id, variant) + +        if extra_id: +            extra_id = '-'.join((_cache[player_url], extra_id)) +            assert os.path.basename(extra_id) == extra_id +            return extra_id +        return _cache[player_url] + +    def _load_player(self, video_id, player_url, fatal=True): +        player_js_key = self._player_js_cache_key(player_url) +        if player_js_key not in self._code_cache:              code = self._download_webpage(                  player_url, video_id, fatal=fatal, -                note='Downloading player ' + player_id, -                errnote='Download of %s failed' % player_url) +                note='Downloading player {0}'.format(player_js_key), +                errnote='Download of {0} failed'.format(player_url))              if code: -                self._code_cache[player_id] = code -        return self._code_cache[player_id] if fatal else self._code_cache.get(player_id) +      
          self._code_cache[player_js_key] = code +        return self._code_cache.get(player_js_key) + +    def _load_player_data_from_cache(self, name, player_url, extra_id=None): +        cache_id = ('youtube-{0}'.format(name), self._player_js_cache_key(player_url, extra_id)) +        data = self._player_cache.get(cache_id) +        if data: +            return data + +        data = self.cache.load(*cache_id, min_ver='2025.04.07') +        if data: +            self._player_cache[cache_id] = data +        return data + +    def _store_player_data_to_cache(self, name, player_url, data, extra_id=None): +        cache_id = ('youtube-{0}'.format(name), self._player_js_cache_key(player_url, extra_id)) + +        if cache_id not in self._player_cache: +            self.cache.store(cache_id[0], cache_id[1], data) +            self._player_cache[cache_id] = data + +    def _remove_player_data_from_cache(self, name, player_url, extra_id=None): +        cache_id = ('youtube-{0}'.format(name), self._player_js_cache_key(player_url, extra_id)) + +        if cache_id in self._player_cache: +            self.cache.clear(*cache_id) +            self._player_cache.pop(cache_id, None)      def _extract_signature_function(self, video_id, player_url, example_sig): -        player_id = self._extract_player_info(player_url) +        # player_id = self._extract_player_info(player_url)          # Read from filesystem cache -        func_id = 'js_{0}_{1}'.format( -            player_id, self._signature_cache_id(example_sig)) -        assert os.path.basename(func_id) == func_id - -        self.write_debug('Extracting signature function {0}'.format(func_id)) -        cache_spec, code = self.cache.load('youtube-sigfuncs', func_id), None +        extra_id = self._signature_cache_id(example_sig) +        self.write_debug('Extracting signature function {0}-{1}'.format(player_url, extra_id)) +        cache_spec, code = self._load_player_data_from_cache( +            'sigfuncs', player_url, 
extra_id=extra_id), None          if not cache_spec: -            code = self._load_player(video_id, player_url, player_id) -        if code: -            res = self._parse_sig_js(code) -            test_string = ''.join(map(compat_chr, range(len(example_sig)))) -            cache_spec = [ord(c) for c in res(test_string)] -            self.cache.store('youtube-sigfuncs', func_id, cache_spec) +            code = self._load_player(video_id, player_url) +            if code: +                res = self._parse_sig_js(code) +                test_string = ''.join(map(compat_chr, range(len(example_sig)))) +                cache_spec = [ord(c) for c in res(test_string)] +                self._store_player_data_to_cache( +                    'sigfuncs', player_url, cache_spec, extra_id=extra_id) +            else: +                self.report_warning( +                    'Failed to compute signature function {0}-{1}'.format( +                        player_url, extra_id))          return lambda s: ''.join(s[i] for i in cache_spec) @@ -1609,6 +1819,23 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  '    return %s\n') % (signature_id_tuple, expr_code)          self.to_screen('Extracted signature function:\n' + code) +    def _extract_sig_fn(self, jsi, funcname): +        var_ay = self._search_regex( +            r'''(?x) +                (?:\*/|\{|\n|^)\s*(?:'[^']+'\s*;\s*) +                    (var\s*[\w$]+\s*=\s*(?: +                        ('|")(?:\\\2|(?!\2).)+\2\s*\.\s*split\(\s*('|")\W+\3\s*\)| +                        \[\s*(?:('|")(?:\\\4|(?!\4).)*\4\s*(?:(?=\])|,\s*))+\] +                    ))(?=\s*[,;]) +            ''', jsi.code, 'useful values', default='') + +        sig_fn = jsi.extract_function_code(funcname) + +        if var_ay: +            sig_fn = (sig_fn[0], ';\n'.join((var_ay, sig_fn[1]))) + +        return sig_fn +      def _parse_sig_js(self, jscode):          # Examples where `sig` is funcname:          # sig=function(a){a=a.split(""); 
... ;return a.join("")}; @@ -1634,8 +1861,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              jscode, 'Initial JS player signature function name', group='sig')          jsi = JSInterpreter(jscode) -        initial_function = jsi.extract_function(funcname) -        return lambda s: initial_function([s]) + +        initial_function = self._extract_sig_fn(jsi, funcname) + +        func = jsi.extract_function_from_code(*initial_function) + +        return lambda s: func([s])      def _cached(self, func, *cache_id):          def inner(*args, **kwargs): @@ -1695,6 +1926,24 @@ class YoutubeIE(YoutubeBaseInfoExtractor):          return ret      def _extract_n_function_name(self, jscode): +        func_name, idx = None, None + +        def generic_n_function_search(func_name=None): +            return self._search_regex( +                r'''(?xs) +                    (?:(?<=[^\w$])|^)       # instead of \b, which ignores $ +                    (?P<name>%s)\s*=\s*function\((?!\d)[a-zA-Z\d_$]+\) +                    \s*\{(?:(?!};).)+?(?: +                        ["']enhanced_except_ | +                        return\s*(?P<q>"|')[a-zA-Z\d-]+_w8_(?P=q)\s*\+\s*[\w$]+ +                    ) +                ''' % (func_name or r'(?!\d)[a-zA-Z\d_$]+',), jscode, +                'Initial JS player n function name', group='name', +                default=None if func_name else NO_DEFAULT) + +        # these special cases are redundant and probably obsolete (2025-04): +        # they make the tests run ~10% faster without fallback warnings +        r"""          func_name, idx = self._search_regex(              # (y=NuD(),Mw(k),q=k.Z[y]||null)&&(q=narray[idx](q),k.set(y,q),k.V||NuD(''))}};              # (R="nn"[+J.Z],mW(J),N=J.K[R]||null)&&(N=narray[idx](N),J.set(R,N))}}; @@ -1721,41 +1970,59 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                      \(\s*[\w$]+\s*\)              ''', jscode, 'Initial JS player n function name', group=('nfunc', 'idx'),             
 default=(None, None)) +        """ + +        if not func_name: +            # nfunc=function(x){...}|function nfunc(x); ... +            # ... var y=[nfunc]|y[idx]=nfunc); +            # obvious REs hang, so use a two-stage tactic +            for m in re.finditer(r'''(?x) +                    [\n;]var\s(?:(?:(?!,).)+,|\s)*?(?!\d)[\w$]+(?:\[(?P<idx>\d+)\])?\s*=\s* +                        (?(idx)|\[\s*)(?P<nfunc>(?!\d)[\w$]+)(?(idx)|\s*\]) +                    \s*?[;\n] +                    ''', jscode): +                fn = self._search_regex( +                    r'[;,]\s*(function\s+)?({0})(?(1)|\s*=\s*function)\s*\((?!\d)[\w$]+\)\s*\{1}(?!\s*return\s)'.format( +                        re.escape(m.group('nfunc')), '{'), +                    jscode, 'Initial JS player n function name (2)', group=2, default=None) +                if fn: +                    func_name = fn +                    idx = m.group('idx') +                    if generic_n_function_search(func_name): +                        # don't look any further +                        break +          # thx bashonly: yt-dlp/yt-dlp/pull/10611          if not func_name: -            self.report_warning('Falling back to generic n function search') -            return self._search_regex( -                r'''(?xs) -                    (?:(?<=[^\w$])|^)       # instead of \b, which ignores $ -                    (?P<name>(?!\d)[a-zA-Z\d_$]+)\s*=\s*function\((?!\d)[a-zA-Z\d_$]+\) -                    \s*\{(?:(?!};).)+?(?: -                        ["']enhanced_except_ | -                        return\s*(?P<q>"|')[a-zA-Z\d-]+_w8_(?P=q)\s*\+\s*[\w$]+ -                    ) -                ''', jscode, 'Initial JS player n function name', group='name') +            self.report_warning('Falling back to generic n function search', only_once=True) +            return generic_n_function_search() +          if not idx:              return func_name          return self._search_json( -            
r'var\s+{0}\s*='.format(re.escape(func_name)), jscode, +            r'(?<![\w-])var\s(?:(?:(?!,).)+,|\s)*?{0}\s*='.format(re.escape(func_name)), jscode,              'Initial JS player n function list ({0}.{1})'.format(func_name, idx), -            func_name, contains_pattern=r'\[[\s\S]+\]', end_pattern='[,;]', +            func_name, contains_pattern=r'\[.+\]', end_pattern='[,;]',              transform_source=js_to_json)[int(idx)]      def _extract_n_function_code(self, video_id, player_url):          player_id = self._extract_player_info(player_url) -        func_code = self.cache.load('youtube-nsig', player_id) +        func_code = self._load_player_data_from_cache('nsig', player_url)          jscode = func_code or self._load_player(video_id, player_url)          jsi = JSInterpreter(jscode)          if func_code:              return jsi, player_id, func_code -        func_name = self._extract_n_function_name(jscode) +        return self._extract_n_function_code_jsi(video_id, jsi, player_id, player_url) -        func_code = jsi.extract_function_code(func_name) +    def _extract_n_function_code_jsi(self, video_id, jsi, player_id=None, player_url=None): +        func_name = self._extract_n_function_name(jsi.code) -        self.cache.store('youtube-nsig', player_id, func_code) +        func_code = self._extract_sig_fn(jsi, func_name) +        if player_url: +            self._store_player_data_to_cache('nsig', player_url, func_code)          return jsi, player_id, func_code      def _extract_n_function_from_code(self, jsi, func_code): @@ -1788,7 +2055,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              n_param = n_param[-1]              n_response = decrypt_nsig(n_param)(n_param, video_id, player_url)              if n_response is None: -                # give up if descrambling failed +                # give up and forget cached data if descrambling failed +                self._remove_player_data_from_cache('nsig', player_url)                  break      
        fmt['url'] = update_url_query(fmt['url'], {'n': n_response}) @@ -1796,21 +2064,37 @@ class YoutubeIE(YoutubeBaseInfoExtractor):      def _extract_signature_timestamp(self, video_id, player_url, ytcfg=None, fatal=False):          """          Extract signatureTimestamp (sts) +          Required to tell API what sig/player version is in use.          """ -        sts = traverse_obj(ytcfg, 'STS', expected_type=int) -        if not sts: -            # Attempt to extract from player -            if player_url is None: -                error_msg = 'Cannot extract signature timestamp without player_url.' -                if fatal: -                    raise ExtractorError(error_msg) -                self.report_warning(error_msg) -                return -            code = self._load_player(video_id, player_url, fatal=fatal) -            sts = int_or_none(self._search_regex( -                r'(?:signatureTimestamp|sts)\s*:\s*(?P<sts>[0-9]{5})', code or '', -                'JS player signature timestamp', group='sts', fatal=fatal)) +        sts = traverse_obj( +            (self._get_player_js_version(), ytcfg), +            (0, 0), +            (1, 'STS'), +            expected_type=int_or_none) + +        if sts: +            return sts + +        if not player_url: +            error_msg = 'Cannot extract signature timestamp without player url' +            if fatal: +                raise ExtractorError(error_msg) +            self.report_warning(error_msg) +            return None + +        sts = self._load_player_data_from_cache('sts', player_url) +        if sts: +            return sts + +        # Attempt to extract from player +        code = self._load_player(video_id, player_url, fatal=fatal) +        sts = int_or_none(self._search_regex( +            r'(?:signatureTimestamp|sts)\s*:\s*(?P<sts>[0-9]{5})', code or '', +            'JS player signature timestamp', group='sts', fatal=fatal)) +        if sts: +            
self._store_player_data_to_cache('sts', player_url, sts) +          return sts      def _mark_watched(self, video_id, player_response): @@ -1885,7 +2169,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              raise ExtractorError('Invalid URL: %s' % url)          return mobj.group(2) -    def _extract_chapters_from_json(self, data, video_id, duration): +    @staticmethod +    def _extract_chapters_from_json(data, video_id, duration):          chapters_list = try_get(              data,              lambda x: x['playerOverlays'] @@ -1935,8 +2220,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):          video_id = self._match_id(url)          base_url = self.http_scheme() + '//www.youtube.com/'          webpage_url = base_url + 'watch?v=' + video_id +        ua = traverse_obj(self._INNERTUBE_CLIENTS, ( +            'web', 'INNERTUBE_CONTEXT', 'client', 'userAgent')) +        headers = {'User-Agent': ua} if ua else None          webpage = self._download_webpage( -            webpage_url + '&bpctr=9999999999&has_verified=1', video_id, fatal=False) +            webpage_url + '&bpctr=9999999999&has_verified=1', video_id, +            headers=headers, fatal=False)          player_response = None          player_url = None @@ -1944,12 +2233,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              player_response = self._extract_yt_initial_variable(                  webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE,                  video_id, 'initial player response') +        is_live = traverse_obj(player_response, ('videoDetails', 'isLive')) + +        fetched_timestamp = None          if False and not player_response:              player_response = self._call_api(                  'player', {'videoId': video_id}, video_id)          if True or not player_response:              origin = 'https://www.youtube.com'              pb_context = {'html5Preference': 'HTML5_PREF_WANTS'} +            fetched_timestamp = int(time.time())              player_url = 
self._extract_player_url(webpage)              ytcfg = self._extract_ytcfg(video_id, webpage or '') @@ -1957,46 +2250,85 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              if sts:                  pb_context['signatureTimestamp'] = sts -            query = { -                'playbackContext': { -                    'contentPlaybackContext': pb_context, -                    'contentCheckOk': True, -                    'racyCheckOk': True, -                }, -                'context': { -                    'client': { -                        'clientName': 'MWEB', -                        'clientVersion': '2.20241202.07.00', -                        'hl': 'en', -                        'userAgent': 'Mozilla/5.0 (iPad; CPU OS 16_7_10 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1,gzip(gfe)', -                        'timeZone': 'UTC', -                        'utcOffsetMinutes': 0, -                    }, -                }, -                'videoId': video_id, -            } +            client_names = traverse_obj(self._INNERTUBE_CLIENTS, ( +                T(dict.items), lambda _, k_v: not k_v[1].get('REQUIRE_PO_TOKEN'), +                0))[:1] +            if 'web' not in client_names: +                # webpage links won't download: ignore links and playability +                player_response = filter_dict( +                    player_response or {}, +                    lambda k, _: k not in ('streamingData', 'playabilityStatus')) + +            if is_live and 'ios' not in client_names: +                client_names.append('ios') +              headers = { -                'X-YouTube-Client-Name': '2', -                'X-YouTube-Client-Version': '2.20241202.07.00', -                'Origin': origin,                  'Sec-Fetch-Mode': 'navigate', -                'User-Agent': query['context']['client']['userAgent'], +                'Origin': origin, +                'X-Goog-Visitor-Id': 
self._extract_visitor_data(ytcfg) or '',              }              auth = self._generate_sapisidhash_header(origin)              if auth is not None:                  headers['Authorization'] = auth                  headers['X-Origin'] = origin -            player_response = self._call_api('player', query, video_id, fatal=False, headers=headers) +            for client in traverse_obj(self._INNERTUBE_CLIENTS, (client_names, T(dict))): + +                query = { +                    'playbackContext': { +                        'contentPlaybackContext': pb_context, +                    }, +                    'contentCheckOk': True, +                    'racyCheckOk': True, +                    'context': { +                        'client': merge_dicts( +                            traverse_obj(client, ('INNERTUBE_CONTEXT', 'client')), { +                                'hl': 'en', +                                'timeZone': 'UTC', +                                'utcOffsetMinutes': 0, +                            }), +                    }, +                    'videoId': video_id, +                } + +                api_headers = merge_dicts(headers, traverse_obj(client, { +                    'X-YouTube-Client-Name': 'INNERTUBE_CONTEXT_CLIENT_NAME', +                    'X-YouTube-Client-Version': ( +                        'INNERTUBE_CONTEXT', 'client', 'clientVersion'), +                    'User-Agent': ( +                        'INNERTUBE_CONTEXT', 'client', 'userAgent'), +                })) + +                api_player_response = self._call_api( +                    'player', query, video_id, fatal=False, headers=api_headers, +                    note=join_nonempty( +                        'Downloading', traverse_obj(query, ( +                            'context', 'client', 'clientName')), +                        'API JSON', delim=' ')) + +                hls = traverse_obj( +                    (player_response, api_player_response), +        
            (Ellipsis, 'streamingData', 'hlsManifestUrl', T(url_or_none))) +                fetched_timestamp = int(time.time()) +                if len(hls) == 2 and not hls[0] and hls[1]: +                    player_response['streamingData']['hlsManifestUrl'] = hls[1] +                else: +                    video_details = merge_dicts(*traverse_obj( +                        (player_response, api_player_response), +                        (Ellipsis, 'videoDetails', T(dict)))) +                    player_response.update(filter_dict( +                        api_player_response or {}, cndn=lambda k, _: k != 'captions')) +                    player_response['videoDetails'] = video_details          def is_agegated(playability): -            if not isinstance(playability, dict): -                return +            # playability: dict +            if not playability: +                return False              if playability.get('desktopLegacyAgeGateReason'):                  return True -            reasons = filter(None, (playability.get(r) for r in ('status', 'reason'))) +            reasons = traverse_obj(playability, (('status', 'reason'),))              AGE_GATE_REASONS = (                  'confirm your age', 'age-restricted', 'inappropriate',  # reason                  'age_verification_required', 'age_check_required',  # status @@ -2054,15 +2386,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  trailer_video_id, self.ie_key(), trailer_video_id)          def get_text(x): -            if not x: -                return -            text = x.get('simpleText') -            if text and isinstance(text, compat_str): -                return text -            runs = x.get('runs') -            if not isinstance(runs, list): -                return -            return ''.join([r['text'] for r in runs if isinstance(r.get('text'), compat_str)]) +            return ''.join(traverse_obj( +                x, (('simpleText',),), ('runs', Ellipsis, 'text'), +  
              expected_type=compat_str))          search_meta = (              (lambda x: self._html_search_meta(x, webpage, default=None)) @@ -2130,6 +2456,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):          itag_qualities = {}          q = qualities(['tiny', 'small', 'medium', 'large', 'hd720', 'hd1080', 'hd1440', 'hd2160', 'hd2880', 'highres'])          CHUNK_SIZE = 10 << 20 +        is_live = video_details.get('isLive')          streaming_data = player_response.get('streamingData') or {}          streaming_formats = streaming_data.get('formats') or [] @@ -2139,11 +2466,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              return LazyList({                  'url': update_url_query(f['url'], {                      'range': '{0}-{1}'.format(range_start, min(range_start + CHUNK_SIZE - 1, f['filesize'])), -                }) +                }),              } for range_start in range(0, f['filesize'], CHUNK_SIZE))          lower = lambda s: s.lower() +        if is_live: +            fetched_timestamp = None +        elif fetched_timestamp is not None: +            # Handle preroll waiting period +            preroll_sleep = self.get_param('youtube_preroll_sleep') +            preroll_sleep = int_or_none(preroll_sleep, default=6) +            fetched_timestamp += preroll_sleep +          for fmt in streaming_formats:              if fmt.get('targetDurationSec'):                  continue @@ -2240,6 +2575,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                      'downloader_options': {'http_chunk_size': CHUNK_SIZE},  # No longer useful?                  
}) +            if fetched_timestamp: +                dct['available_at'] = fetched_timestamp +              formats.append(dct)          def process_manifest_format(f, proto, client_name, itag, all_formats=False): @@ -2257,6 +2595,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              if f.get('source_preference') is None:                  f['source_preference'] = -1 +            # Deprioritize since its pre-merged m3u8 formats may have lower quality audio streams +            if client_name == 'web_safari' and proto == 'hls' and not is_live: +                f['source_preference'] -= 1 +              if itag in ('616', '235'):                  f['format_note'] = join_nonempty(f.get('format_note'), 'Premium', delim=' ')                  f['source_preference'] += 100 @@ -2273,14 +2615,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):          hls_manifest_url = streaming_data.get('hlsManifestUrl')          if hls_manifest_url: -            for f in self._extract_m3u8_formats( -                    hls_manifest_url, video_id, 'mp4', fatal=False): +            formats.extend( +                f for f in self._extract_m3u8_formats( +                    hls_manifest_url, video_id, 'mp4', +                    entry_protocol='m3u8_native', live=is_live, fatal=False)                  if process_manifest_format( -                        f, 'hls', None, self._search_regex( -                            r'/itag/(\d+)', f['url'], 'itag', default=None)): -                    formats.append(f) +                    f, 'hls', None, self._search_regex( +                        r'/itag/(\d+)', f['url'], 'itag', default=None))) -        if self._downloader.params.get('youtube_include_dash_manifest', True): +        if self.get_param('youtube_include_dash_manifest', True):              dash_manifest_url = streaming_data.get('dashManifestUrl')              if dash_manifest_url:                  for f in self._extract_mpd_formats( @@ -2307,7 +2650,7 @@ class 
YoutubeIE(YoutubeBaseInfoExtractor):                  playability_status,                  lambda x: x['errorScreen']['playerErrorMessageRenderer'],                  dict) or {} -            reason = get_text(pemr.get('reason')) or playability_status.get('reason') +            reason = get_text(pemr.get('reason')) or playability_status.get('reason') or ''              subreason = pemr.get('subreason')              if subreason:                  subreason = clean_html(get_text(subreason)) @@ -2319,7 +2662,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                      self.raise_geo_restricted(                          subreason, countries)                  reason += '\n' + subreason +              if reason: +                if 'sign in' in reason.lower(): +                    self.raise_login_required(remove_end(reason, 'This helps protect our community. Learn more')) +                elif traverse_obj(playability_status, ('errorScreen', 'playerCaptchaViewModel', T(dict))): +                    reason += '. 
YouTube is requiring a captcha challenge before playback'                  raise ExtractorError(reason, expected=True)          self._sort_formats(formats) @@ -2380,8 +2728,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  # Strictly de-prioritize damaged formats                  f['preference'] = -10 -        is_live = video_details.get('isLive') -          owner_profile_url = self._yt_urljoin(self._extract_author_var(              webpage, 'url', videodetails=video_details, metadata=microformat)) @@ -2416,14 +2762,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor):          }          pctr = traverse_obj( -            player_response, -            ('captions', 'playerCaptionsTracklistRenderer', T(dict))) +            (player_response, api_player_response), +            (Ellipsis, 'captions', 'playerCaptionsTracklistRenderer', T(dict)))          if pctr:              def process_language(container, base_url, lang_code, query):                  lang_subs = []                  for fmt in self._SUBTITLE_FORMATS:                      query.update({                          'fmt': fmt, +                        # xosf=1 causes undesirable text position data for vtt, json3 & srv* subtitles +                        # See: https://github.com/yt-dlp/yt-dlp/issues/13654 +                        'xosf': [],                      })                      lang_subs.append({                          'ext': fmt, @@ -2434,19 +2783,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              def process_subtitles():                  subtitles = {}                  for caption_track in traverse_obj(pctr, ( -                        'captionTracks', lambda _, v: v.get('baseUrl'))): +                        Ellipsis, 'captionTracks', lambda _, v: ( +                            v.get('baseUrl') and v.get('languageCode')))): +                    base_url = self._yt_urljoin(caption_track['baseUrl'])                      if not base_url:                          continue +       
             lang_code = caption_track['languageCode']                      if caption_track.get('kind') != 'asr': -                        lang_code = caption_track.get('languageCode') -                        if not lang_code: -                            continue                          process_language(                              subtitles, base_url, lang_code, {})                          continue                      automatic_captions = {} +                    process_language( +                        automatic_captions, base_url, lang_code, {})                      for translation_language in traverse_obj(pctr, ( -                            'translationLanguages', lambda _, v: v.get('languageCode'))): +                            Ellipsis, 'translationLanguages', lambda _, v: v.get('languageCode'))):                          translation_language_code = translation_language['languageCode']                          process_language(                              automatic_captions, base_url, translation_language_code, @@ -2463,7 +2814,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  for d_k, s_ks in [('start', ('start', 't')), ('end', ('end',))]:                      d_k += '_time'                      if d_k not in info and k in s_ks: -                        info[d_k] = parse_duration(query[k][0]) +                        info[d_k] = parse_duration(v[0])          if video_description:              # Youtube Music Auto-generated description @@ -2492,6 +2843,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              initial_data = self._call_api(                  'next', {'videoId': video_id}, video_id, fatal=False) +        initial_sdcr = None          if initial_data:              chapters = self._extract_chapters_from_json(                  initial_data, video_id, duration) @@ -2511,9 +2863,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                      for next_num, content in enumerate(contents, start=1):                        
  mmlir = content.get('macroMarkersListItemRenderer') or {}                          start_time = chapter_time(mmlir) -                        end_time = chapter_time(try_get( -                            contents, lambda x: x[next_num]['macroMarkersListItemRenderer'])) \ -                            if next_num < len(contents) else duration +                        end_time = (traverse_obj( +                            contents, (next_num, 'macroMarkersListItemRenderer', T(chapter_time))) +                            if next_num < len(contents) else duration)                          if start_time is None or end_time is None:                              continue                          chapters.append({ @@ -2619,12 +2971,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                                  info['track'] = mrr_contents_text              # this is not extraction but spelunking! -            carousel_lockups = traverse_obj( -                initial_data, -                ('engagementPanels', Ellipsis, 'engagementPanelSectionListRenderer', -                 'content', 'structuredDescriptionContentRenderer', 'items', Ellipsis, -                 'videoDescriptionMusicSectionRenderer', 'carouselLockups', Ellipsis), -                expected_type=dict) or [] +            initial_sdcr = traverse_obj(initial_data, ( +                'engagementPanels', Ellipsis, 'engagementPanelSectionListRenderer', +                'content', 'structuredDescriptionContentRenderer', T(dict)), +                get_all=False) +            carousel_lockups = traverse_obj(initial_sdcr, ( +                'items', Ellipsis, 'videoDescriptionMusicSectionRenderer', +                'carouselLockups', Ellipsis, T(dict))) or []              # try to reproduce logic from metadataRowContainerRenderer above (if it still is)              fields = (('ALBUM', 'album'), ('ARTIST', 'artist'), ('SONG', 'track'), ('LICENSES', 'license'))              # multiple_songs ? 
@@ -2649,6 +3002,23 @@ class YoutubeIE(YoutubeBaseInfoExtractor):          self.mark_watched(video_id, player_response) +        # Fallbacks for missing metadata +        if initial_sdcr: +            if info.get('description') is None: +                info['description'] = traverse_obj(initial_sdcr, ( +                    'items', Ellipsis, 'expandableVideoDescriptionBodyRenderer', +                    'attributedDescriptionBodyText', 'content', T(compat_str)), +                    get_all=False) +            # videoDescriptionHeaderRenderer also has publishDate/channel/handle/ucid, but not needed +            if info.get('title') is None: +                info['title'] = traverse_obj( +                    (initial_sdcr, initial_data), +                    (0, 'items', Ellipsis, 'videoDescriptionHeaderRenderer', T(dict)), +                    (1, 'playerOverlays', 'playerOverlayRenderer', 'videoDetails', +                     'playerOverlayVideoDetailsRenderer', T(dict)), +                    expected_type=lambda x: self._get_text(x, 'title'), +                    get_all=False) +          return merge_dicts(              info, {                  'uploader_id': self._extract_uploader_id(owner_profile_url), @@ -3050,13 +3420,9 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):      @staticmethod      def _extract_grid_item_renderer(item): -        assert isinstance(item, dict) -        for key, renderer in item.items(): -            if not key.startswith('grid') or not key.endswith('Renderer'): -                continue -            if not isinstance(renderer, dict): -                continue -            return renderer +        return traverse_obj(item, ( +            T(dict.items), lambda _, k_v: k_v[0].startswith('grid') and k_v[0].endswith('Renderer'), +            1, T(dict)), get_all=False)      @staticmethod      def _get_text(r, k): @@ -3065,8 +3431,12 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):              expected_type=txt_or_none)      def 
_grid_entries(self, grid_renderer): -        for item in grid_renderer['items']: -            if not isinstance(item, dict): +        for item in traverse_obj(grid_renderer, ('items', Ellipsis, T(dict))): +            lockup_view_model = traverse_obj(item, ('lockupViewModel', T(dict))) +            if lockup_view_model: +                entry = self._extract_lockup_view_model(lockup_view_model) +                if entry: +                    yield entry                  continue              renderer = self._extract_grid_item_renderer(item)              if not isinstance(renderer, dict): @@ -3135,8 +3505,8 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):                  shelf_renderer, lambda x: x['title']['runs'][0]['text'], compat_str)              yield self.url_result(shelf_url, video_title=title)          # Shelf may not contain shelf URL, fallback to extraction from content -        for entry in self._shelf_entries_from_content(shelf_renderer): -            yield entry +        for from_ in self._shelf_entries_from_content(shelf_renderer): +            yield from_      def _playlist_entries(self, video_list_renderer):          for content in video_list_renderer['contents']: @@ -3150,10 +3520,51 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):                  continue              yield self._extract_video(renderer) +    def _extract_lockup_view_model(self, view_model): +        content_id = view_model.get('contentId') +        if not content_id: +            return +        content_type = view_model.get('contentType') +        if content_type == 'LOCKUP_CONTENT_TYPE_VIDEO': +            ie = YoutubeIE +            url = update_url_query( +                'https://www.youtube.com/watch', {'v': content_id}) +            thumb_keys = (None,) +        elif content_type in ('LOCKUP_CONTENT_TYPE_PLAYLIST', 'LOCKUP_CONTENT_TYPE_PODCAST'): +            ie = YoutubeTabIE +            url = update_url_query( +                'https://www.youtube.com/playlist', 
{'list': content_id}) +            thumb_keys = ('collectionThumbnailViewModel', 'primaryThumbnail') +        else: +            self.report_warning( +                'Unsupported lockup view model content type "{0}"{1}'.format(content_type, bug_reports_message()), +                only_once=True) +            return +        thumb_keys = ('contentImage',) + thumb_keys + ('thumbnailViewModel', 'image') +        return merge_dicts(self.url_result( +            url, ie=ie.ie_key(), video_id=content_id), { +                'title': traverse_obj(view_model, ( +                    'metadata', 'lockupMetadataViewModel', 'title', +                    'content', T(compat_str))), +                'thumbnails': self._extract_thumbnails( +                    view_model, thumb_keys, final_key='sources'), +        }) + +    def _extract_shorts_lockup_view_model(self, view_model): +        content_id = traverse_obj(view_model, ( +            'onTap', 'innertubeCommand', 'reelWatchEndpoint', 'videoId', +            T(lambda v: v if YoutubeIE.suitable(v) else None))) +        return merge_dicts(self.url_result( +            content_id, ie=YoutubeIE.ie_key(), video_id=content_id), { +                'title': traverse_obj(view_model, ( +                    'overlayMetadata', 'primaryText', 'content', T(compat_str))), +                'thumbnails': self._extract_thumbnails( +                    view_model, 'thumbnail', final_key='sources'), +        }) if content_id else None +      def _video_entry(self, video_renderer):          video_id = video_renderer.get('videoId') -        if video_id: -            return self._extract_video(video_renderer) +        return self._extract_video(video_renderer) if video_id else None      def _post_thread_entries(self, post_thread_renderer):          post_renderer = try_get( @@ -3185,21 +3596,15 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):              yield self.url_result(ep_url, ie=YoutubeIE.ie_key(), video_id=video_id)      def 
_post_thread_continuation_entries(self, post_thread_continuation): -        contents = post_thread_continuation.get('contents') -        if not isinstance(contents, list): -            return -        for content in contents: -            renderer = content.get('backstagePostThreadRenderer') -            if not isinstance(renderer, dict): -                continue -            for entry in self._post_thread_entries(renderer): -                yield entry +        for renderer in traverse_obj(post_thread_continuation, ( +                'contents', Ellipsis, 'backstagePostThreadRenderer', T(dict))): +            for from_ in self._post_thread_entries(renderer): +                yield from_      def _rich_grid_entries(self, contents): -        for content in contents: -            content = traverse_obj( -                content, ('richItemRenderer', 'content'), -                expected_type=dict) or {} +        for content in traverse_obj( +                contents, (Ellipsis, 'richItemRenderer', 'content'), +                expected_type=dict):              video_renderer = traverse_obj(                  content, 'videoRenderer', 'reelItemRenderer',                  expected_type=dict) @@ -3207,6 +3612,12 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):                  entry = self._video_entry(video_renderer)                  if entry:                      yield entry +            # shorts item +            shorts_lockup_view_model = content.get('shortsLockupViewModel') +            if shorts_lockup_view_model: +                entry = self._extract_shorts_lockup_view_model(shorts_lockup_view_model) +                if entry: +                    yield entry              # playlist              renderer = traverse_obj(                  content, 'playlistRenderer', expected_type=dict) or {} @@ -3245,23 +3656,15 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):          next_continuation = cls._extract_next_continuation_data(renderer)          if next_continuation: 
             return next_continuation -        contents = [] -        for key in ('contents', 'items'): -            contents.extend(try_get(renderer, lambda x: x[key], list) or []) -        for content in contents: -            if not isinstance(content, dict): -                continue -            continuation_ep = try_get( -                content, lambda x: x['continuationItemRenderer']['continuationEndpoint'], -                dict) -            if not continuation_ep: -                continue -            continuation = try_get( -                continuation_ep, lambda x: x['continuationCommand']['token'], compat_str) +        for command in traverse_obj(renderer, ( +                ('contents', 'items', 'rows'), Ellipsis, 'continuationItemRenderer', +                ('continuationEndpoint', ('button', 'buttonRenderer', 'command')), +                (('commandExecutorCommand', 'commands', Ellipsis), None), T(dict))): +            continuation = traverse_obj(command, ('continuationCommand', 'token', T(compat_str)))              if not continuation:                  continue -            ctp = continuation_ep.get('clickTrackingParams') -            return YoutubeTabIE._build_continuation_query(continuation, ctp) +            ctp = command.get('clickTrackingParams') +            return cls._build_continuation_query(continuation, ctp)      def _entries(self, tab, item_id, webpage):          tab_content = try_get(tab, lambda x: x['content'], dict) @@ -3271,17 +3674,10 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):          if slr_renderer:              is_channels_tab = tab.get('title') == 'Channels'              continuation = None -            slr_contents = try_get(slr_renderer, lambda x: x['contents'], list) or [] -            for slr_content in slr_contents: -                if not isinstance(slr_content, dict): -                    continue -                is_renderer = try_get(slr_content, lambda x: x['itemSectionRenderer'], dict) -                if 
not is_renderer: -                    continue -                isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or [] -                for isr_content in isr_contents: -                    if not isinstance(isr_content, dict): -                        continue +            for is_renderer in traverse_obj(slr_renderer, ( +                    'contents', Ellipsis, 'itemSectionRenderer', T(dict))): +                for isr_content in traverse_obj(slr_renderer, ( +                        'contents', Ellipsis, T(dict))):                      renderer = isr_content.get('playlistVideoListRenderer')                      if renderer:                          for entry in self._playlist_entries(renderer): @@ -3310,6 +3706,13 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):                          entry = self._video_entry(renderer)                          if entry:                              yield entry +                    renderer = isr_content.get('richGridRenderer') +                    if renderer: +                        for from_ in self._rich_grid_entries( +                                traverse_obj(renderer, ('contents', Ellipsis, T(dict)))): +                            yield from_ +                        continuation = self._extract_continuation(renderer) +                        continue                  if not continuation:                      continuation = self._extract_continuation(is_renderer) @@ -3319,8 +3722,9 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):              rich_grid_renderer = tab_content.get('richGridRenderer')              if not rich_grid_renderer:                  return -            for entry in self._rich_grid_entries(rich_grid_renderer.get('contents') or []): -                yield entry +            for from_ in self._rich_grid_entries( +                    traverse_obj(rich_grid_renderer, ('contents', Ellipsis, T(dict)))): +                yield from_              continuation = 
self._extract_continuation(rich_grid_renderer) @@ -3354,7 +3758,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):              if not continuation:                  break              if visitor_data: -                headers['x-goog-visitor-id'] = visitor_data +                headers['X-Goog-Visitor-Id'] = visitor_data              data['continuation'] = continuation['continuation']              data['clickTracking'] = {                  'clickTrackingParams': continuation['itct'], @@ -3366,8 +3770,12 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):                      # Downloading page may result in intermittent 5xx HTTP error                      # that is usually worked around with a retry                      response = self._download_json( -                        'https://www.youtube.com/youtubei/v1/browse?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', +                        'https://www.youtube.com/youtubei/v1/browse',                          None, 'Downloading page %d%s' % (page_num, ' (retry #%d)' % count if count else ''), +                        query={ +                            # 'key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', +                            'prettyPrint': 'false', +                        },                          headers=headers, data=json.dumps(data).encode('utf8'))                      break                  except ExtractorError as e: @@ -3464,18 +3872,34 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):          uploader['channel'] = uploader['uploader']          return uploader -    @classmethod -    def _extract_alert(cls, data): -        alerts = [] -        for alert in traverse_obj(data, ('alerts', Ellipsis), expected_type=dict): -            alert_text = traverse_obj( -                alert, (None, lambda x: x['alertRenderer']['text']), get_all=False) -            if not alert_text: -                continue -            text = cls._get_text(alert_text, 'text') -            if text: -                
alerts.append(text) -        return '\n'.join(alerts) +    def _extract_and_report_alerts(self, data, expected=True, fatal=True, only_once=False): + +        def alerts(): +            for alert in traverse_obj(data, ('alerts', Ellipsis), expected_type=dict): +                alert_dict = traverse_obj( +                    alert, 'alertRenderer', None, expected_type=dict, get_all=False) +                alert_type = traverse_obj(alert_dict, 'type') +                if not alert_type: +                    continue +                message = self._get_text(alert_dict, 'text') +                if message: +                    yield alert_type, message + +        errors, warnings = [], [] +        _IGNORED_WARNINGS = T('Unavailable videos will be hidden during playback') +        for alert_type, alert_message in alerts(): +            if alert_type.lower() == 'error' and fatal: +                errors.append([alert_type, alert_message]) +            elif alert_message not in _IGNORED_WARNINGS: +                warnings.append([alert_type, alert_message]) + +        for alert_type, alert_message in itertools.chain(warnings, errors[:-1]): +            self.report_warning( +                'YouTube said: %s - %s' % (alert_type, alert_message), +                only_once=only_once) +        if errors: +            raise ExtractorError( +                'YouTube said: %s' % (errors[-1][1],), expected=expected)      def _extract_from_tabs(self, item_id, webpage, data, tabs):          selected_tab = self._extract_selected_tab(tabs) @@ -3536,10 +3960,23 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):      def _real_extract(self, url):          item_id = self._match_id(url)          url = update_url(url, netloc='www.youtube.com') -        # Handle both video/playlist URLs          qs = parse_qs(url) -        video_id = qs.get('v', [None])[0] -        playlist_id = qs.get('list', [None])[0] + +        def qs_get(key, default=None): +            return qs.get(key, [default])[-1] 
+ +        # Go around for /feeds/videos.xml?playlist_id={pl_id} +        if item_id == 'feeds' and '/feeds/videos.xml?' in url: +            playlist_id = qs_get('playlist_id') +            if playlist_id: +                return self.url_result( +                    update_url_query('https://www.youtube.com/playlist', { +                        'list': playlist_id, +                    }), ie=self.ie_key(), video_id=playlist_id) + +        # Handle both video/playlist URLs +        video_id = qs_get('v') +        playlist_id = qs_get('list')          if video_id and playlist_id:              if self._downloader.params.get('noplaylist'):                  self.to_screen('Downloading just video %s because of --no-playlist' % video_id) @@ -3562,10 +3999,10 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):              compat_str) or video_id          if video_id:              return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id) +          # Capture and output alerts -        alert = self._extract_alert(data) -        if alert: -            raise ExtractorError(alert, expected=True) +        self._extract_and_report_alerts(data) +          # Failed to recognize          raise ExtractorError('Unable to recognize tab page') @@ -3719,7 +4156,7 @@ class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):          'only_matching': True,      }] -    def _real_extract(self, url): +    def _real_extract(self, _):          return self.url_result(              'https://www.youtube.com/playlist?list=LL',              ie=YoutubeTabIE.ie_key()) @@ -3791,6 +4228,7 @@ class YoutubeFeedsInfoExtractor(YoutubeTabIE):      Subclasses must define the _FEED_NAME property.      
""" +      _LOGIN_REQUIRED = True      @property @@ -3800,7 +4238,7 @@ class YoutubeFeedsInfoExtractor(YoutubeTabIE):      def _real_initialize(self):          self._login() -    def _real_extract(self, url): +    def _real_extract(self, _):          return self.url_result(              'https://www.youtube.com/feed/%s' % self._FEED_NAME,              ie=YoutubeTabIE.ie_key()) @@ -3815,7 +4253,7 @@ class YoutubeWatchLaterIE(InfoExtractor):          'only_matching': True,      }] -    def _real_extract(self, url): +    def _real_extract(self, _):          return self.url_result(              'https://www.youtube.com/playlist?list=WL', ie=YoutubeTabIE.ie_key()) @@ -3895,7 +4333,7 @@ class YoutubeTruncatedURLIE(InfoExtractor):          'only_matching': True,      }] -    def _real_extract(self, url): +    def _real_extract(self, _):          raise ExtractorError(              'Did you forget to quote the URL? Remember that & is a meta '              'character in most shells, so you want to put the URL in quotes, ' diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index 7835187f5..7630e2099 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -1,10 +1,12 @@  # coding: utf-8  from __future__ import unicode_literals +import calendar  import itertools  import json  import operator  import re +import time  from functools import update_wrapper, wraps @@ -12,8 +14,10 @@ from .utils import (      error_to_compat_str,      ExtractorError,      float_or_none, +    int_or_none,      js_to_json,      remove_quotes, +    str_or_none,      unified_timestamp,      variadic,      write_string, @@ -24,6 +28,8 @@ from .compat import (      compat_collections_chain_map as ChainMap,      compat_contextlib_suppress,      compat_filter as filter, +    compat_int, +    compat_integer_types,      compat_itertools_zip_longest as zip_longest,      compat_map as map,      compat_numeric_types, @@ -70,14 +76,27 @@ class JS_Undefined(object):      pass -def 
_js_bit_op(op): +def _js_bit_op(op, is_shift=False): -    def zeroise(x): -        return 0 if x in (None, JS_Undefined, _NaN, _Infinity) else x +    def zeroise(x, is_shift_arg=False): +        if isinstance(x, compat_integer_types): +            return (x % 32) if is_shift_arg else (x & 0xffffffff) +        try: +            x = float(x) +            if is_shift_arg: +                x = int(x % 32) +            elif x < 0: +                x = -compat_int(-x % 0xffffffff) +            else: +                x = compat_int(x % 0xffffffff) +        except (ValueError, TypeError): +            # also here for int(NaN), including float('inf') % 32 +            x = 0 +        return x      @wraps_op(op)      def wrapped(a, b): -        return op(zeroise(a), zeroise(b)) & 0xffffffff +        return op(zeroise(a), zeroise(b, is_shift)) & 0xffffffff      return wrapped @@ -135,6 +154,7 @@ def _js_to_primitive(v):      ) +# more exact: yt-dlp/yt-dlp#12110  def _js_toString(v):      return (          'undefined' if v is JS_Undefined @@ -143,7 +163,7 @@ def _js_toString(v):          else 'null' if v is None          # bool <= int: do this first          else ('false', 'true')[v] if isinstance(v, bool) -        else '{0:.7f}'.format(v).rstrip('.0') if isinstance(v, compat_numeric_types) +        else re.sub(r'(?<=\d)\.?0*$', '', '{0:.7f}'.format(v)) if isinstance(v, compat_numeric_types)          else _js_to_primitive(v)) @@ -220,7 +240,7 @@ def _js_ternary(cndn, if_true=True, if_false=False):  def _js_unary_op(op):      @wraps_op(op) -    def wrapped(_, a): +    def wrapped(a, _):          return op(a)      return wrapped @@ -253,8 +273,8 @@ def _js_typeof(expr):  # avoid dict to maintain order  # definition None => Defined in JSInterpreter._operator  _OPERATORS = ( -    ('>>', _js_bit_op(operator.rshift)), -    ('<<', _js_bit_op(operator.lshift)), +    ('>>', _js_bit_op(operator.rshift, True)), +    ('<<', _js_bit_op(operator.lshift, True)),      ('+', _js_add),      
('-', _js_arith_op(operator.sub)),      ('*', _js_arith_op(operator.mul)), @@ -263,17 +283,6 @@ _OPERATORS = (      ('**', _js_exp),  ) -_COMP_OPERATORS = ( -    ('===', _js_id_op(operator.is_)), -    ('!==', _js_id_op(operator.is_not)), -    ('==', _js_eq), -    ('!=', _js_neq), -    ('<=', _js_comp_op(operator.le)), -    ('>=', _js_comp_op(operator.ge)), -    ('<', _js_comp_op(operator.lt)), -    ('>', _js_comp_op(operator.gt)), -) -  _LOG_OPERATORS = (      ('|', _js_bit_op(operator.or_)),      ('^', _js_bit_op(operator.xor)), @@ -290,13 +299,27 @@ _SC_OPERATORS = (  _UNARY_OPERATORS_X = (      ('void', _js_unary_op(lambda _: JS_Undefined)),      ('typeof', _js_unary_op(_js_typeof)), +    # avoid functools.partial here since Py2 update_wrapper(partial) -> no __module__ +    ('!', _js_unary_op(lambda x: _js_ternary(x, if_true=False, if_false=True))),  ) -_OPERATOR_RE = '|'.join(map(lambda x: re.escape(x[0]), _OPERATORS + _LOG_OPERATORS)) +_COMP_OPERATORS = ( +    ('===', _js_id_op(operator.is_)), +    ('!==', _js_id_op(operator.is_not)), +    ('==', _js_eq), +    ('!=', _js_neq), +    ('<=', _js_comp_op(operator.le)), +    ('>=', _js_comp_op(operator.ge)), +    ('<', _js_comp_op(operator.lt)), +    ('>', _js_comp_op(operator.gt)), +) + +_OPERATOR_RE = '|'.join(map(lambda x: re.escape(x[0]), _OPERATORS + _LOG_OPERATORS + _SC_OPERATORS))  _NAME_RE = r'[a-zA-Z_$][\w$]*'  _MATCHING_PARENS = dict(zip(*zip('()', '{}', '[]')))  _QUOTES = '\'"/' +_NESTED_BRACKETS = r'[^[\]]+(?:\[[^[\]]+(?:\[[^\]]+\])?\])?'  
class JS_Break(ExtractorError): @@ -333,7 +356,7 @@ class LocalNameSpace(ChainMap):          raise NotImplementedError('Deleting is not supported')      def __repr__(self): -        return 'LocalNameSpace%s' % (self.maps, ) +        return 'LocalNameSpace({0!r})'.format(self.maps)  class Debugger(object): @@ -354,6 +377,9 @@ class Debugger(object):      @classmethod      def wrap_interpreter(cls, f): +        if not cls.ENABLED: +            return f +          @wraps(f)          def interpret_statement(self, stmt, local_vars, allow_recursion, *args, **kwargs):              if cls.ENABLED and stmt.strip(): @@ -389,11 +415,22 @@ class JSInterpreter(object):      class Exception(ExtractorError):          def __init__(self, msg, *args, **kwargs):              expr = kwargs.pop('expr', None) +            msg = str_or_none(msg, default='"None"')              if expr is not None:                  msg = '{0} in: {1!r:.100}'.format(msg.rstrip(), expr)              super(JSInterpreter.Exception, self).__init__(msg, *args, **kwargs) -    class JS_RegExp(object): +    class JS_Object(object): +        def __getitem__(self, key): +            if hasattr(self, key): +                return getattr(self, key) +            raise KeyError(key) + +        def dump(self): +            """Serialise the instance""" +            raise NotImplementedError + +    class JS_RegExp(JS_Object):          RE_FLAGS = {              # special knowledge: Python's re flags are bitmask values, current max 128              # invent new bitmask values well above that for literal parsing @@ -414,15 +451,24 @@ class JSInterpreter(object):          def __init__(self, pattern_txt, flags=0):              if isinstance(flags, compat_str):                  flags, _ = self.regex_flags(flags) -            # First, avoid https://github.com/python/cpython/issues/74534              self.__self = None -            self.__pattern_txt = pattern_txt.replace('[[', r'[\[') +            pattern_txt = 
str_or_none(pattern_txt) or '(?:)' +            # escape unintended embedded flags +            pattern_txt = re.sub( +                r'(\(\?)([aiLmsux]*)(-[imsx]+:|(?<!\?)\))', +                lambda m: ''.join( +                    (re.escape(m.group(1)), m.group(2), re.escape(m.group(3))) +                    if m.group(3) == ')' +                    else ('(?:', m.group(2), m.group(3))), +                pattern_txt) +            # Avoid https://github.com/python/cpython/issues/74534 +            self.source = pattern_txt.replace('[[', r'[\[')              self.__flags = flags          def __instantiate(self):              if self.__self:                  return -            self.__self = re.compile(self.__pattern_txt, self.__flags) +            self.__self = re.compile(self.source, self.__flags)              # Thx: https://stackoverflow.com/questions/44773522/setattr-on-python2-sre-sre-pattern              for name in dir(self.__self):                  # Only these? Obviously __class__, __init__. @@ -430,16 +476,15 @@ class JSInterpreter(object):                  # that can't be setattr'd but also can't need to be copied.                  
if name in ('__class__', '__init__', '__weakref__'):                      continue -                setattr(self, name, getattr(self.__self, name)) +                if name == 'flags': +                    setattr(self, name, getattr(self.__self, name, self.__flags)) +                else: +                    setattr(self, name, getattr(self.__self, name))          def __getattr__(self, name):              self.__instantiate() -            # make Py 2.6 conform to its lying documentation -            if name == 'flags': -                self.flags = self.__flags -                return self.flags -            elif name == 'pattern': -                self.pattern = self.__pattern_txt +            if name == 'pattern': +                self.pattern = self.source                  return self.pattern              elif hasattr(self.__self, name):                  v = getattr(self.__self, name) @@ -447,6 +492,26 @@ class JSInterpreter(object):                  return v              elif name in ('groupindex', 'groups'):                  return 0 if name == 'groupindex' else {} +            else: +                flag_attrs = (  # order by 2nd elt +                    ('hasIndices', 'd'), +                    ('global', 'g'), +                    ('ignoreCase', 'i'), +                    ('multiline', 'm'), +                    ('dotAll', 's'), +                    ('unicode', 'u'), +                    ('unicodeSets', 'v'), +                    ('sticky', 'y'), +                ) +                for k, c in flag_attrs: +                    if name == k: +                        return bool(self.RE_FLAGS[c] & self.__flags) +                else: +                    if name == 'flags': +                        return ''.join( +                            (c if self.RE_FLAGS[c] & self.__flags else '') +                            for _, c in flag_attrs) +              raise AttributeError('{0} has no attribute named {1}'.format(self, name))          @classmethod @@ 
-460,6 +525,85 @@ class JSInterpreter(object):                  flags |= cls.RE_FLAGS[ch]              return flags, expr[idx + 1:] +        def dump(self): +            return '(/{0}/{1})'.format( +                re.sub(r'(?<!\\)/', r'\/', self.source), +                self.flags) + +        @staticmethod +        def escape(string_): +            return re.escape(string_) + +    class JS_Date(JS_Object): +        _t = None + +        @staticmethod +        def __ymd_etc(*args, **kw_is_utc): +            # args: year, monthIndex, day, hours, minutes, seconds, milliseconds +            is_utc = kw_is_utc.get('is_utc', False) + +            args = list(args[:7]) +            args += [0] * (9 - len(args)) +            args[1] += 1  # month 0..11 -> 1..12 +            ms = args[6] +            for i in range(6, 9): +                args[i] = -1  # don't know +            if is_utc: +                args[-1] = 1 +            # TODO: [MDN] When a segment overflows or underflows its expected +            # range, it usually "carries over to" or "borrows from" the higher segment. 
+            try: +                mktime = calendar.timegm if is_utc else time.mktime +                return mktime(time.struct_time(args)) * 1000 + ms +            except (OverflowError, ValueError): +                return None + +        @classmethod +        def UTC(cls, *args): +            t = cls.__ymd_etc(*args, is_utc=True) +            return _NaN if t is None else t + +        @staticmethod +        def parse(date_str, **kw_is_raw): +            is_raw = kw_is_raw.get('is_raw', False) + +            t = unified_timestamp(str_or_none(date_str), False) +            return int(t * 1000) if t is not None else t if is_raw else _NaN + +        @staticmethod +        def now(**kw_is_raw): +            is_raw = kw_is_raw.get('is_raw', False) + +            t = time.time() +            return int(t * 1000) if t is not None else t if is_raw else _NaN + +        def __init__(self, *args): +            if not args: +                args = [self.now(is_raw=True)] +            if len(args) == 1: +                if isinstance(args[0], JSInterpreter.JS_Date): +                    self._t = int_or_none(args[0].valueOf(), default=None) +                else: +                    arg_type = _js_typeof(args[0]) +                    if arg_type == 'string': +                        self._t = self.parse(args[0], is_raw=True) +                    elif arg_type == 'number': +                        self._t = int(args[0]) +            else: +                self._t = self.__ymd_etc(*args) + +        def toString(self): +            try: +                return time.strftime('%a %b %0d %Y %H:%M:%S %Z%z', self._t).rstrip() +            except TypeError: +                return "Invalid Date" + +        def valueOf(self): +            return _NaN if self._t is None else self._t + +        def dump(self): +            return '(new Date({0}))'.format(self.toString()) +      @classmethod      def __op_chars(cls):          op_chars = set(';,[') @@ -563,6 +707,68 @@ class 
JSInterpreter(object):                  _SC_OPERATORS, _LOG_OPERATORS, _COMP_OPERATORS, _OPERATORS, _UNARY_OPERATORS_X))          return _cached +    def _separate_at_op(self, expr, max_split=None): + +        for op, _ in self._all_operators(): +            # hackety: </> have higher priority than <</>>, but don't confuse them +            skip_delim = (op + op) if op in '<>*?' else None +            if op == '?': +                skip_delim = (skip_delim, '?.') +            separated = list(self._separate(expr, op, skip_delims=skip_delim)) +            if len(separated) < 2: +                continue + +            right_expr = separated.pop() +            # handle operators that are both unary and binary, minimal BODMAS +            if op in ('+', '-'): +                # simplify/adjust consecutive instances of these operators +                undone = 0 +                separated = [s.strip() for s in separated] +                while len(separated) > 1 and not separated[-1]: +                    undone += 1 +                    separated.pop() +                if op == '-' and undone % 2 != 0: +                    right_expr = op + right_expr +                elif op == '+': +                    while len(separated) > 1 and set(separated[-1]) <= self.OP_CHARS: +                        right_expr = separated.pop() + right_expr +                    if separated[-1][-1:] in self.OP_CHARS: +                        right_expr = separated.pop() + right_expr +                # hanging op at end of left => unary + (strip) or - (push right) +                separated.append(right_expr) +                dm_ops = ('*', '%', '/', '**') +                dm_chars = set(''.join(dm_ops)) + +                def yield_terms(s): +                    skip = False +                    for i, term in enumerate(s[:-1]): +                        if skip: +                            skip = False +                            continue +                        if not (dm_chars & 
set(term)): +                            yield term +                            continue +                        for dm_op in dm_ops: +                            bodmas = list(self._separate(term, dm_op, skip_delims=skip_delim)) +                            if len(bodmas) > 1 and not bodmas[-1].strip(): +                                bodmas[-1] = (op if op == '-' else '') + s[i + 1] +                                yield dm_op.join(bodmas) +                                skip = True +                                break +                        else: +                            if term: +                                yield term + +                    if not skip and s[-1]: +                        yield s[-1] + +                separated = list(yield_terms(separated)) +                right_expr = separated.pop() if len(separated) > 1 else None +                expr = op.join(separated) +            if right_expr is None: +                continue +            return op, separated, right_expr +      def _operator(self, op, left_val, right_expr, expr, local_vars, allow_recursion):          if op in ('||', '&&'):              if (op == '&&') ^ _js_ternary(left_val): @@ -573,7 +779,7 @@ class JSInterpreter(object):          elif op == '?':              right_expr = _js_ternary(left_val, *self._separate(right_expr, ':', 1)) -        right_val = self.interpret_expression(right_expr, local_vars, allow_recursion) +        right_val = self.interpret_expression(right_expr, local_vars, allow_recursion) if right_expr else left_val          opfunc = op and next((v for k, v in self._all_operators() if k == op), None)          if not opfunc:              return right_val @@ -584,18 +790,21 @@ class JSInterpreter(object):          except Exception as e:              raise self.Exception('Failed to evaluate {left_val!r:.50} {op} {right_val!r:.50}'.format(**locals()), expr, cause=e) -    def _index(self, obj, idx, allow_undefined=True): +    def _index(self, obj, idx, 
allow_undefined=None):          if idx == 'length' and isinstance(obj, list):              return len(obj)          try:              return obj[int(idx)] if isinstance(obj, list) else obj[compat_str(idx)] -        except (TypeError, KeyError, IndexError) as e: -            if allow_undefined: -                # when is not allowed? +        except (TypeError, KeyError, IndexError, ValueError) as e: +            # allow_undefined is None gives correct behaviour +            if allow_undefined or ( +                    allow_undefined is None and not isinstance(e, TypeError)):                  return JS_Undefined              raise self.Exception('Cannot get index {idx!r:.100}'.format(**locals()), expr=repr(obj), cause=e)      def _dump(self, obj, namespace): +        if obj is JS_Undefined: +            return 'undefined'          try:              return json.dumps(obj)          except TypeError: @@ -615,51 +824,9 @@ class JSInterpreter(object):      _FINALLY_RE = re.compile(r'finally\s*\{')      _SWITCH_RE = re.compile(r'switch\s*\(') -    def handle_operators(self, expr, local_vars, allow_recursion): - -        for op, _ in self._all_operators(): -            # hackety: </> have higher priority than <</>>, but don't confuse them -            skip_delim = (op + op) if op in '<>*?' 
else None -            if op == '?': -                skip_delim = (skip_delim, '?.') -            separated = list(self._separate(expr, op, skip_delims=skip_delim)) -            if len(separated) < 2: -                continue - -            right_expr = separated.pop() -            # handle operators that are both unary and binary, minimal BODMAS -            if op in ('+', '-'): -                # simplify/adjust consecutive instances of these operators -                undone = 0 -                separated = [s.strip() for s in separated] -                while len(separated) > 1 and not separated[-1]: -                    undone += 1 -                    separated.pop() -                if op == '-' and undone % 2 != 0: -                    right_expr = op + right_expr -                elif op == '+': -                    while len(separated) > 1 and set(separated[-1]) <= self.OP_CHARS: -                        right_expr = separated.pop() + right_expr -                    if separated[-1][-1:] in self.OP_CHARS: -                        right_expr = separated.pop() + right_expr -                # hanging op at end of left => unary + (strip) or - (push right) -                left_val = separated[-1] if separated else '' -                for dm_op in ('*', '%', '/', '**'): -                    bodmas = tuple(self._separate(left_val, dm_op, skip_delims=skip_delim)) -                    if len(bodmas) > 1 and not bodmas[-1].strip(): -                        expr = op.join(separated) + op + right_expr -                        if len(separated) > 1: -                            separated.pop() -                            right_expr = op.join((left_val, right_expr)) -                        else: -                            separated = [op.join((left_val, right_expr))] -                            right_expr = None -                        break -                if right_expr is None: -                    continue - -            left_val = 
self.interpret_expression(op.join(separated), local_vars, allow_recursion) -            return self._operator(op, left_val, right_expr, expr, local_vars, allow_recursion), True +    def _eval_operator(self, op, left_expr, right_expr, expr, local_vars, allow_recursion): +        left_val = self.interpret_expression(left_expr, local_vars, allow_recursion) +        return self._operator(op, left_val, right_expr, expr, local_vars, allow_recursion)      @Debugger.wrap_interpreter      def interpret_statement(self, stmt, local_vars, allow_recursion=100): @@ -700,7 +867,7 @@ class JSInterpreter(object):          new_kw, _, obj = expr.partition('new ')          if not new_kw: -            for klass, konstr in (('Date', lambda x: int(unified_timestamp(x, False) * 1000)), +            for klass, konstr in (('Date', lambda *x: self.JS_Date(*x).valueOf()),                                    ('RegExp', self.JS_RegExp),                                    ('Error', self.Exception)):                  if not obj.startswith(klass + '('): @@ -715,15 +882,19 @@ class JSInterpreter(object):              else:                  raise self.Exception('Unsupported object {obj:.100}'.format(**locals()), expr=expr) +        # apply unary operators (see new above)          for op, _ in _UNARY_OPERATORS_X:              if not expr.startswith(op):                  continue              operand = expr[len(op):] -            if not operand or operand[0] != ' ': +            if not operand or (op.isalpha() and operand[0] != ' '):                  continue -            op_result = self.handle_operators(expr, local_vars, allow_recursion) -            if op_result: -                return op_result[0], should_return +            separated = self._separate_at_op(operand, max_split=1) +            if separated: +                next_op, separated, right_expr = separated +                separated.append(right_expr) +                operand = next_op.join(separated) +            return 
self._eval_operator(op, operand, '', expr, local_vars, allow_recursion), should_return          if expr.startswith('{'):              inner, outer = self._separate_at_paren(expr) @@ -918,15 +1089,18 @@ class JSInterpreter(object):          m = re.match(r'''(?x)              (?P<assign> -                (?P<out>{_NAME_RE})(?:\[(?P<out_idx>(?:.+?\]\s*\[)*.+?)\])?\s* +                (?P<out>{_NAME_RE})(?P<out_idx>(?:\[{_NESTED_BRACKETS}\])+)?\s*                  (?P<op>{_OPERATOR_RE})?                  =(?!=)(?P<expr>.*)$              )|(?P<return>                  (?!if|return|true|false|null|undefined|NaN|Infinity)(?P<name>{_NAME_RE})$ -            )|(?P<indexing> -                (?P<in>{_NAME_RE})\[(?P<in_idx>(?:.+?\]\s*\[)*.+?)\]$              )|(?P<attribute> -                (?P<var>{_NAME_RE})(?:(?P<nullish>\?)?\.(?P<member>[^(]+)|\[(?P<member2>[^\]]+)\])\s* +                (?P<var>{_NAME_RE})(?: +                    (?P<nullish>\?)?\.(?P<member>[^(]+)| +                    \[(?P<member2>{_NESTED_BRACKETS})\] +                )\s* +            )|(?P<indexing> +                (?P<in>{_NAME_RE})(?P<in_idx>\[.+\])$              )|(?P<function>                  (?P<fname>{_NAME_RE})\((?P<args>.*)\)$              )'''.format(**globals()), expr) @@ -941,13 +1115,18 @@ class JSInterpreter(object):              elif left_val in (None, JS_Undefined):                  raise self.Exception('Cannot index undefined variable ' + m.group('out'), expr=expr) -            indexes = re.split(r'\]\s*\[', m.group('out_idx')) -            for i, idx in enumerate(indexes, 1): +            indexes = md['out_idx'] +            while indexes: +                idx, indexes = self._separate_at_paren(indexes)                  idx = self.interpret_expression(idx, local_vars, allow_recursion) -                if i < len(indexes): +                if indexes:                      left_val = self._index(left_val, idx)              if isinstance(idx, float):                  idx = int(idx) 
+            if isinstance(left_val, list) and len(left_val) <= int_or_none(idx, default=-1): +                # JS Array is a sparsely assignable list +                # TODO: handle extreme sparsity without memory bloat, eg using auxiliary dict +                left_val.extend((idx - len(left_val) + 1) * [JS_Undefined])              left_val[idx] = self._operator(                  m.group('op'), self._index(left_val, idx) if m.group('op') else None,                  m.group('expr'), expr, local_vars, allow_recursion) @@ -985,14 +1164,17 @@ class JSInterpreter(object):          if md.get('indexing'):              val = local_vars[m.group('in')] -            for idx in re.split(r'\]\s*\[', m.group('in_idx')): +            indexes = m.group('in_idx') +            while indexes: +                idx, indexes = self._separate_at_paren(indexes)                  idx = self.interpret_expression(idx, local_vars, allow_recursion)                  val = self._index(val, idx)              return val, should_return -        op_result = self.handle_operators(expr, local_vars, allow_recursion) -        if op_result: -            return op_result[0], should_return +        separated = self._separate_at_op(expr) +        if separated: +            op, separated, right_expr = separated +            return self._eval_operator(op, op.join(separated), right_expr, expr, local_vars, allow_recursion), should_return          if md.get('attribute'):              variable, member, nullish = m.group('var', 'member', 'nullish') @@ -1013,12 +1195,15 @@ class JSInterpreter(object):              def eval_method(variable, member):                  if (variable, member) == ('console', 'debug'):                      if Debugger.ENABLED: -                        Debugger.write(self.interpret_expression('[{}]'.format(arg_str), local_vars, allow_recursion)) +                        Debugger.write(self.interpret_expression('[{0}]'.format(arg_str), local_vars, allow_recursion))                      
return                  types = {                      'String': compat_str,                      'Math': float,                      'Array': list, +                    'Date': self.JS_Date, +                    'RegExp': self.JS_RegExp, +                    # 'Error': self.Exception,  # has no std static methods                  }                  obj = local_vars.get(variable)                  if obj in (JS_Undefined, None): @@ -1026,7 +1211,7 @@ class JSInterpreter(object):                  if obj is JS_Undefined:                      try:                          if variable not in self._objects: -                            self._objects[variable] = self.extract_object(variable) +                            self._objects[variable] = self.extract_object(variable, local_vars)                          obj = self._objects[variable]                      except self.Exception:                          if not nullish: @@ -1037,7 +1222,7 @@ class JSInterpreter(object):                  # Member access                  if arg_str is None: -                    return self._index(obj, member) +                    return self._index(obj, member, nullish)                  # Function call                  argvals = [ @@ -1071,6 +1256,8 @@ class JSInterpreter(object):                          assertion(len(argvals) == 2, 'takes two arguments')                          return argvals[0] ** argvals[1]                      raise self.Exception('Unsupported Math method ' + member, expr=expr) +                elif obj is self.JS_Date: +                    return getattr(obj, member)(*argvals)                  if member == 'split':                      assertion(len(argvals) <= 2, 'takes at most two arguments') @@ -1111,9 +1298,10 @@ class JSInterpreter(object):                  elif member == 'join':                      assertion(isinstance(obj, list), 'must be applied on a list')                      assertion(len(argvals) <= 1, 'takes at most one argument') -                  
  return (',' if len(argvals) == 0 else argvals[0]).join( -                        ('' if x in (None, JS_Undefined) else _js_toString(x)) -                        for x in obj) +                    return (',' if len(argvals) == 0 or argvals[0] in (None, JS_Undefined) +                            else argvals[0]).join( +                                ('' if x in (None, JS_Undefined) else _js_toString(x)) +                                for x in obj)                  elif member == 'reverse':                      assertion(not argvals, 'does not take any arguments')                      obj.reverse() @@ -1177,7 +1365,8 @@ class JSInterpreter(object):                      assertion(len(argvals) == 2, 'takes exactly two arguments')                      # TODO: argvals[1] callable, other Py vs JS edge cases                      if isinstance(argvals[0], self.JS_RegExp): -                        count = 0 if argvals[0].flags & self.JS_RegExp.RE_FLAGS['g'] else 1 +                        # access JS member with Py reserved name +                        count = 0 if self._index(argvals[0], 'global') else 1                          assertion(member != 'replaceAll' or count == 0,                                    'replaceAll must be called with a global RegExp')                          return argvals[0].sub(argvals[1], obj, count=count) @@ -1218,7 +1407,7 @@ class JSInterpreter(object):          for v in self._separate(list_txt):              yield self.interpret_expression(v, local_vars, allow_recursion) -    def extract_object(self, objname): +    def extract_object(self, objname, *global_stack):          _FUNC_NAME_RE = r'''(?:{n}|"{n}"|'{n}')'''.format(n=_NAME_RE)          obj = {}          fields = next(filter(None, ( @@ -1239,7 +1428,8 @@ class JSInterpreter(object):                  fields):              argnames = self.build_arglist(f.group('args'))              name = remove_quotes(f.group('key')) -            obj[name] = 
function_with_repr(self.build_function(argnames, f.group('code')), 'F<{0}>'.format(name)) +            obj[name] = function_with_repr( +                self.build_function(argnames, f.group('code'), *global_stack), 'F<{0}>'.format(name))          return obj @@ -1271,19 +1461,21 @@ class JSInterpreter(object):          code, _ = self._separate_at_paren(func_m.group('code'))  # refine the match          return self.build_arglist(func_m.group('args')), code -    def extract_function(self, funcname): +    def extract_function(self, funcname, *global_stack):          return function_with_repr( -            self.extract_function_from_code(*self.extract_function_code(funcname)), +            self.extract_function_from_code(*itertools.chain( +                self.extract_function_code(funcname), global_stack)),              'F<%s>' % (funcname,))      def extract_function_from_code(self, argnames, code, *global_stack):          local_vars = {} +        start = None          while True: -            mobj = re.search(r'function\((?P<args>[^)]*)\)\s*{', code) +            mobj = re.search(r'function\((?P<args>[^)]*)\)\s*{', code[start:])              if mobj is None:                  break -            start, body_start = mobj.span() +            start, body_start = ((start or 0) + x for x in mobj.span())              body, remaining = self._separate_at_paren(code[body_start - 1:])              name = self._named_object(local_vars, self.extract_function_from_code(                  [x.strip() for x in mobj.group('args').split(',')], diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 61705d1f0..ce3633c41 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -405,6 +405,10 @@ def parseOpts(overrideArguments=None):          action='store_true', dest='listformats',          help='List all available formats of requested videos')      video_format.add_option( +        '--no-list-formats', +        action='store_false', dest='listformats', +        help='Do 
not list available formats of requested videos (default)') +    video_format.add_option(          '--youtube-include-dash-manifest',          action='store_true', dest='youtube_include_dash_manifest', default=True,          help=optparse.SUPPRESS_HELP) @@ -413,6 +417,17 @@ def parseOpts(overrideArguments=None):          action='store_false', dest='youtube_include_dash_manifest',          help='Do not download the DASH manifests and related data on YouTube videos')      video_format.add_option( +        '--youtube-player-js-variant', +        action='store', dest='youtube_player_js_variant', +        help='For YouTube, the player javascript variant to use for n/sig deciphering; `actual` to follow the site; default `%default`.', +        choices=('actual', 'main', 'tcc', 'tce', 'es5', 'es6', 'tv', 'tv_es6', 'phone', 'tablet'), +        default='main', metavar='VARIANT') +    video_format.add_option( +        '--youtube-player-js-version', +        action='store', dest='youtube_player_js_version', +        help='For YouTube, the player javascript version to use for n/sig deciphering, specified as `signature_timestamp@hash`, or `actual` to follow the site; default `%default`', +        default='20348@0004de42', metavar='STS@HASH') +    video_format.add_option(          '--merge-output-format',          action='store', dest='merge_output_format', metavar='FORMAT', default=None,          help=( diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index ac1e78002..c4262936e 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -4204,12 +4204,16 @@ def lowercase_escape(s):          s) -def escape_rfc3986(s): +def escape_rfc3986(s, safe=None):      """Escape non-ASCII characters as suggested by RFC 3986"""      if sys.version_info < (3, 0):          s = _encode_compat_str(s, 'utf-8') +        if safe is not None: +            safe = _encode_compat_str(safe, 'utf-8') +    if safe is None: +        safe = b"%/;:@&=+$,!~*'()?#[]"      # ensure unicode: after quoting, 
it can always be converted -    return compat_str(compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")) +    return compat_str(compat_urllib_parse.quote(s, safe))  def escape_url(url): diff --git a/youtube_dl/version.py b/youtube_dl/version.py index b82fbc702..c70d9d2af 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@  from __future__ import unicode_literals -__version__ = '2021.12.17' +__version__ = '2025.04.07'  | 
