diff options
author | coletdjnz <coletdjnz@protonmail.com> | 2023-10-14 12:33:00 +1300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-10-13 23:33:00 +0000 |
commit | 8a8b54523addf46dfd50ef599761a81bc22362e6 (patch) | |
tree | f6f7d31d0ee51dfe732d26e26c294829b6c5ed5b | |
parent | 700444c23ddb65f618c2abd942acdc0c58c650b1 (diff) |
[rh:requests] Add handler for `requests` HTTP library (#3668)
Adds support for HTTPS proxies and persistent connections (keep-alive)
Closes https://github.com/yt-dlp/yt-dlp/issues/1890
Resolves https://github.com/yt-dlp/yt-dlp/issues/4070
Resolves https://github.com/ytdl-org/youtube-dl/issues/32549
Resolves https://github.com/ytdl-org/youtube-dl/issues/14523
Resolves https://github.com/ytdl-org/youtube-dl/issues/13734
Authored by: coletdjnz, Grub4K, bashonly
-rw-r--r-- | .github/workflows/core.yml | 2 | ||||
-rw-r--r-- | README.md | 4 | ||||
-rw-r--r-- | requirements.txt | 2 | ||||
-rw-r--r-- | setup.py | 9 | ||||
-rw-r--r-- | test/test_networking.py | 168 | ||||
-rw-r--r-- | test/test_socks.py | 36 | ||||
-rw-r--r-- | yt_dlp/YoutubeDL.py | 7 | ||||
-rw-r--r-- | yt_dlp/__pyinstaller/hook-yt_dlp.py | 4 | ||||
-rw-r--r-- | yt_dlp/dependencies/__init__.py | 9 | ||||
-rw-r--r-- | yt_dlp/networking/__init__.py | 10 | ||||
-rw-r--r-- | yt_dlp/networking/_helper.py | 20 | ||||
-rw-r--r-- | yt_dlp/networking/_requests.py | 398 | ||||
-rw-r--r-- | yt_dlp/networking/_urllib.py | 26 | ||||
-rw-r--r-- | yt_dlp/options.py | 3 |
14 files changed, 619 insertions, 79 deletions
diff --git a/.github/workflows/core.yml b/.github/workflows/core.yml index 7acaee1e8..049faf373 100644 --- a/.github/workflows/core.yml +++ b/.github/workflows/core.yml @@ -32,7 +32,7 @@ jobs: uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - - name: Install pytest + - name: Install dependencies run: pip install pytest -r requirements.txt - name: Run tests continue-on-error: False @@ -157,6 +157,7 @@ Some of yt-dlp's default options are different from that of youtube-dl and youtu * yt-dlp's sanitization of invalid characters in filenames is different/smarter than in youtube-dl. You can use `--compat-options filename-sanitization` to revert to youtube-dl's behavior * yt-dlp tries to parse the external downloader outputs into the standard progress output if possible (Currently implemented: [~~aria2c~~](https://github.com/yt-dlp/yt-dlp/issues/5931)). You can use `--compat-options no-external-downloader-progress` to get the downloader output as-is * yt-dlp versions between 2021.09.01 and 2023.01.02 applies `--match-filter` to nested playlists. This was an unintentional side-effect of [8f18ac](https://github.com/yt-dlp/yt-dlp/commit/8f18aca8717bb0dd49054555af8d386e5eda3a88) and is fixed in [d7b460](https://github.com/yt-dlp/yt-dlp/commit/d7b460d0e5fc710950582baed2e3fc616ed98a80). Use `--compat-options playlist-match-filter` to revert this +* yt-dlp uses modern http client backends such as `requests`. Use `--compat-options prefer-legacy-http-handler` to prefer the legacy http handler (`urllib`) to be used for standard http requests. For ease of use, a few more compat options are available: @@ -164,7 +165,7 @@ For ease of use, a few more compat options are available: * `--compat-options youtube-dl`: Same as `--compat-options all,-multistreams,-playlist-match-filter` * `--compat-options youtube-dlc`: Same as `--compat-options all,-no-live-chat,-no-youtube-channel-redirect,-playlist-match-filter` * `--compat-options 2021`: Same as `--compat-options 2022,no-certifi,filename-sanitization,no-youtube-prefer-utc-upload-date` -* `--compat-options 2022`: Same as `--compat-options playlist-match-filter,no-external-downloader-progress`. Use this to enable all future compat options +* `--compat-options 2022`: Same as `--compat-options playlist-match-filter,no-external-downloader-progress,prefer-legacy-http-handler`. Use this to enable all future compat options # INSTALLATION @@ -274,6 +275,7 @@ While all the other dependencies are optional, `ffmpeg` and `ffprobe` are highly * [**certifi**](https://github.com/certifi/python-certifi)\* - Provides Mozilla's root certificate bundle. Licensed under [MPLv2](https://github.com/certifi/python-certifi/blob/master/LICENSE) * [**brotli**](https://github.com/google/brotli)\* or [**brotlicffi**](https://github.com/python-hyper/brotlicffi) - [Brotli](https://en.wikipedia.org/wiki/Brotli) content encoding support. Both licensed under MIT <sup>[1](https://github.com/google/brotli/blob/master/LICENSE) [2](https://github.com/python-hyper/brotlicffi/blob/master/LICENSE) </sup> * [**websockets**](https://github.com/aaugustin/websockets)\* - For downloading over websocket. Licensed under [BSD-3-Clause](https://github.com/aaugustin/websockets/blob/main/LICENSE) +* [**requests**](https://github.com/psf/requests)\* - HTTP library. For HTTPS proxy and persistent connections support. Licensed under [Apache-2.0](https://github.com/psf/requests/blob/main/LICENSE) ### Metadata diff --git a/requirements.txt b/requirements.txt index dde37120f..112c30aeb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,5 @@ websockets brotli; platform_python_implementation=='CPython' brotlicffi; platform_python_implementation!='CPython' certifi +requests>=2.31.0,<3 +urllib3>=1.26.17,<3
\ No newline at end of file @@ -62,7 +62,14 @@ def py2exe_params(): 'compressed': 1, 'optimize': 2, 'dist_dir': './dist', - 'excludes': ['Crypto', 'Cryptodome'], # py2exe cannot import Crypto + 'excludes': [ + # py2exe cannot import Crypto + 'Crypto', + 'Cryptodome', + # py2exe appears to confuse this with our socks library. + # We don't use pysocks and urllib3.contrib.socks would fail to import if tried. + 'urllib3.contrib.socks' + ], 'dll_excludes': ['w9xpopen.exe', 'crypt32.dll'], # Modules that are only imported dynamically must be added here 'includes': ['yt_dlp.compat._legacy', 'yt_dlp.compat._deprecated', diff --git a/test/test_networking.py b/test/test_networking.py index 5308c8d6f..2b45deac7 100644 --- a/test/test_networking.py +++ b/test/test_networking.py @@ -28,7 +28,7 @@ from http.cookiejar import CookieJar from test.helper import FakeYDL, http_server_port from yt_dlp.cookies import YoutubeDLCookieJar -from yt_dlp.dependencies import brotli +from yt_dlp.dependencies import brotli, requests, urllib3 from yt_dlp.networking import ( HEADRequest, PUTRequest, @@ -43,6 +43,7 @@ from yt_dlp.networking.exceptions import ( HTTPError, IncompleteRead, NoSupportingHandlers, + ProxyError, RequestError, SSLError, TransportError, @@ -305,7 +306,7 @@ class TestRequestHandlerBase: class TestHTTPRequestHandler(TestRequestHandlerBase): - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_verify_cert(self, handler): with handler() as rh: with pytest.raises(CertificateVerifyError): @@ -316,7 +317,7 @@ class TestHTTPRequestHandler(TestRequestHandlerBase): assert r.status == 200 r.close() - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_ssl_error(self, handler): # HTTPS server with too old TLS version # XXX: is there a better way to test this than to create a new server? @@ -334,7 +335,7 @@ class TestHTTPRequestHandler(TestRequestHandlerBase): validate_and_send(rh, Request(f'https://127.0.0.1:{https_port}/headers')) assert not issubclass(exc_info.type, CertificateVerifyError) - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_percent_encode(self, handler): with handler() as rh: # Unicode characters should be encoded with uppercase percent-encoding @@ -346,7 +347,7 @@ class TestHTTPRequestHandler(TestRequestHandlerBase): assert res.status == 200 res.close() - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_remove_dot_segments(self, handler): with handler() as rh: # This isn't a comprehensive test, @@ -361,14 +362,14 @@ class TestHTTPRequestHandler(TestRequestHandlerBase): assert res.url == f'http://127.0.0.1:{self.http_port}/headers' res.close() - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_unicode_path_redirection(self, handler): with handler() as rh: r = validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}/302-non-ascii-redirect')) assert r.url == f'http://127.0.0.1:{self.http_port}/%E4%B8%AD%E6%96%87.html' r.close() - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_raise_http_error(self, handler): with handler() as rh: for bad_status in (400, 500, 599, 302): @@ -378,7 +379,7 @@ class TestHTTPRequestHandler(TestRequestHandlerBase): # Should not raise an error validate_and_send(rh, Request('http://127.0.0.1:%d/gen_200' % self.http_port)).close() - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_response_url(self, handler): with handler() as rh: # Response url should be that of the last url in redirect chain @@ -389,7 +390,7 @@ class TestHTTPRequestHandler(TestRequestHandlerBase): assert res2.url == f'http://127.0.0.1:{self.http_port}/gen_200' res2.close() - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_redirect(self, handler): with handler() as rh: def do_req(redirect_status, method, assert_no_content=False): @@ -444,7 +445,7 @@ class TestHTTPRequestHandler(TestRequestHandlerBase): with pytest.raises(HTTPError): do_req(code, 'GET') - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_request_cookie_header(self, handler): # We should accept a Cookie header being passed as in normal headers and handle it appropriately. with handler() as rh: @@ -476,19 +477,19 @@ class TestHTTPRequestHandler(TestRequestHandlerBase): assert b'Cookie: test=ytdlp' not in data assert b'Cookie: test=test' in data - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_redirect_loop(self, handler): with handler() as rh: with pytest.raises(HTTPError, match='redirect loop'): validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}/redirect_loop')) - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_incompleteread(self, handler): with handler(timeout=2) as rh: with pytest.raises(IncompleteRead): validate_and_send(rh, Request('http://127.0.0.1:%d/incompleteread' % self.http_port)).read() - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_cookies(self, handler): cookiejar = YoutubeDLCookieJar() cookiejar.set_cookie(http.cookiejar.Cookie( @@ -505,7 +506,7 @@ class TestHTTPRequestHandler(TestRequestHandlerBase): rh, Request(f'http://127.0.0.1:{self.http_port}/headers', extensions={'cookiejar': cookiejar})).read() assert b'Cookie: test=ytdlp' in data - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_headers(self, handler): with handler(headers=HTTPHeaderDict({'test1': 'test', 'test2': 'test2'})) as rh: @@ -521,7 +522,7 @@ class TestHTTPRequestHandler(TestRequestHandlerBase): assert b'Test2: test2' not in data assert b'Test3: test3' in data - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_timeout(self, handler): with handler() as rh: # Default timeout is 20 seconds, so this should go through @@ -537,7 +538,7 @@ class TestHTTPRequestHandler(TestRequestHandlerBase): validate_and_send( rh, Request(f'http://127.0.0.1:{self.http_port}/timeout_1', extensions={'timeout': 4})) - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_source_address(self, handler): source_address = f'127.0.0.{random.randint(5, 255)}' with handler(source_address=source_address) as rh: @@ -545,13 +546,13 @@ class TestHTTPRequestHandler(TestRequestHandlerBase): rh, Request(f'http://127.0.0.1:{self.http_port}/source_address')).read().decode() assert source_address == data - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_gzip_trailing_garbage(self, handler): with handler() as rh: data = validate_and_send(rh, Request(f'http://localhost:{self.http_port}/trailing_garbage')).read().decode() assert data == '<html><video src="/vid.mp4" /></html>' - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) @pytest.mark.skipif(not brotli, reason='brotli support is not installed') def test_brotli(self, handler): with handler() as rh: @@ -562,7 +563,7 @@ class TestHTTPRequestHandler(TestRequestHandlerBase): assert res.headers.get('Content-Encoding') == 'br' assert res.read() == b'<html><video src="/vid.mp4" /></html>' - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_deflate(self, handler): with handler() as rh: res = validate_and_send( @@ -572,7 +573,7 @@ class TestHTTPRequestHandler(TestRequestHandlerBase): assert res.headers.get('Content-Encoding') == 'deflate' assert res.read() == b'<html><video src="/vid.mp4" /></html>' - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_gzip(self, handler): with handler() as rh: res = validate_and_send( @@ -582,7 +583,7 @@ class TestHTTPRequestHandler(TestRequestHandlerBase): assert res.headers.get('Content-Encoding') == 'gzip' assert res.read() == b'<html><video src="/vid.mp4" /></html>' - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_multiple_encodings(self, handler): with handler() as rh: for pair in ('gzip,deflate', 'deflate, gzip', 'gzip, gzip', 'deflate, deflate'): @@ -593,7 +594,7 @@ class TestHTTPRequestHandler(TestRequestHandlerBase): assert res.headers.get('Content-Encoding') == pair assert res.read() == b'<html><video src="/vid.mp4" /></html>' - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_unsupported_encoding(self, handler): with handler() as rh: res = validate_and_send( @@ -603,7 +604,7 @@ class TestHTTPRequestHandler(TestRequestHandlerBase): assert res.headers.get('Content-Encoding') == 'unsupported' assert res.read() == b'raw' - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_read(self, handler): with handler() as rh: res = validate_and_send( @@ -633,7 +634,7 @@ class TestHTTPProxy(TestRequestHandlerBase): cls.geo_proxy_thread.daemon = True cls.geo_proxy_thread.start() - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_http_proxy(self, handler): http_proxy = f'http://127.0.0.1:{self.proxy_port}' geo_proxy = f'http://127.0.0.1:{self.geo_port}' @@ -659,7 +660,7 @@ class TestHTTPProxy(TestRequestHandlerBase): assert res != f'normal: {real_url}' assert 'Accept' in res - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_noproxy(self, handler): with handler(proxies={'proxy': f'http://127.0.0.1:{self.proxy_port}'}) as rh: # NO_PROXY @@ -669,7 +670,7 @@ class TestHTTPProxy(TestRequestHandlerBase): 'utf-8') assert 'Accept' in nop_response - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_allproxy(self, handler): url = 'http://foo.com/bar' with handler() as rh: @@ -677,7 +678,7 @@ class TestHTTPProxy(TestRequestHandlerBase): 'utf-8') assert response == f'normal: {url}' - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_http_proxy_with_idn(self, handler): with handler(proxies={ 'http': f'http://127.0.0.1:{self.proxy_port}', @@ -715,27 +716,27 @@ class TestClientCertificate: ) as rh: validate_and_send(rh, Request(f'https://127.0.0.1:{self.port}/video.html')).read().decode() - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_certificate_combined_nopass(self, handler): self._run_test(handler, client_cert={ 'client_certificate': os.path.join(self.certdir, 'clientwithkey.crt'), }) - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_certificate_nocombined_nopass(self, handler): self._run_test(handler, client_cert={ 'client_certificate': os.path.join(self.certdir, 'client.crt'), 'client_certificate_key': os.path.join(self.certdir, 'client.key'), }) - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_certificate_combined_pass(self, handler): self._run_test(handler, client_cert={ 'client_certificate': os.path.join(self.certdir, 'clientwithencryptedkey.crt'), 'client_certificate_password': 'foobar', }) - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_certificate_nocombined_pass(self, handler): self._run_test(handler, client_cert={ 'client_certificate': os.path.join(self.certdir, 'client.crt'), @@ -819,6 +820,75 @@ class TestUrllibRequestHandler(TestRequestHandlerBase): assert not isinstance(exc_info.value, TransportError) +class TestRequestsRequestHandler(TestRequestHandlerBase): + @pytest.mark.parametrize('raised,expected', [ + (lambda: requests.exceptions.ConnectTimeout(), TransportError), + (lambda: requests.exceptions.ReadTimeout(), TransportError), + (lambda: requests.exceptions.Timeout(), TransportError), + (lambda: requests.exceptions.ConnectionError(), TransportError), + (lambda: requests.exceptions.ProxyError(), ProxyError), + (lambda: requests.exceptions.SSLError('12[CERTIFICATE_VERIFY_FAILED]34'), CertificateVerifyError), + (lambda: requests.exceptions.SSLError(), SSLError), + (lambda: requests.exceptions.InvalidURL(), RequestError), + (lambda: requests.exceptions.InvalidHeader(), RequestError), + # catch-all: https://github.com/psf/requests/blob/main/src/requests/adapters.py#L535 + (lambda: urllib3.exceptions.HTTPError(), TransportError), + (lambda: requests.exceptions.RequestException(), RequestError) + # (lambda: requests.exceptions.TooManyRedirects(), HTTPError) - Needs a response object + ]) + @pytest.mark.parametrize('handler', ['Requests'], indirect=True) + def test_request_error_mapping(self, handler, monkeypatch, raised, expected): + with handler() as rh: + def mock_get_instance(*args, **kwargs): + class MockSession: + def request(self, *args, **kwargs): + raise raised() + return MockSession() + + monkeypatch.setattr(rh, '_get_instance', mock_get_instance) + + with pytest.raises(expected) as exc_info: + rh.send(Request('http://fake')) + + assert exc_info.type is expected + + @pytest.mark.parametrize('raised,expected,match', [ + (lambda: urllib3.exceptions.SSLError(), SSLError, None), + (lambda: urllib3.exceptions.TimeoutError(), TransportError, None), + (lambda: urllib3.exceptions.ReadTimeoutError(None, None, None), TransportError, None), + (lambda: urllib3.exceptions.ProtocolError(), TransportError, None), + (lambda: urllib3.exceptions.DecodeError(), TransportError, None), + (lambda: urllib3.exceptions.HTTPError(), TransportError, None), # catch-all + ( + lambda: urllib3.exceptions.ProtocolError('error', http.client.IncompleteRead(partial=b'abc', expected=4)), + IncompleteRead, + '3 bytes read, 4 more expected' + ), + ( + lambda: urllib3.exceptions.IncompleteRead(partial=3, expected=5), + IncompleteRead, + '3 bytes read, 5 more expected' + ), + ]) + @pytest.mark.parametrize('handler', ['Requests'], indirect=True) + def test_response_error_mapping(self, handler, monkeypatch, raised, expected, match): + from urllib3.response import HTTPResponse as Urllib3Response + from requests.models import Response as RequestsResponse + from yt_dlp.networking._requests import RequestsResponseAdapter + requests_res = RequestsResponse() + requests_res.raw = Urllib3Response(body=b'', status=200) + res = RequestsResponseAdapter(requests_res) + + def mock_read(*args, **kwargs): + raise raised() + monkeypatch.setattr(res.fp, 'read', mock_read) + + with pytest.raises(expected, match=match) as exc_info: + res.read() + + assert exc_info.type is expected + + def run_validation(handler, error, req, **handler_kwargs): with handler(**handler_kwargs) as rh: if error: @@ -855,6 +925,10 @@ class TestRequestHandlerValidation: ('file', UnsupportedRequest, {}), ('file', False, {'enable_file_urls': True}), ]), + ('Requests', [ + ('http', False, {}), + ('https', False, {}), + ]), (NoCheckRH, [('http', False, {})]), (ValidationRH, [('http', UnsupportedRequest, {})]) ] @@ -870,6 +944,14 @@ class TestRequestHandlerValidation: ('socks5h', False), ('socks', UnsupportedRequest), ]), + ('Requests', [ + ('http', False), + ('https', False), + ('socks4', False), + ('socks4a', False), + ('socks5', False), + ('socks5h', False), + ]), (NoCheckRH, [('http', False)]), (HTTPSupportedRH, [('http', UnsupportedRequest)]), ] @@ -880,6 +962,10 @@ class TestRequestHandlerValidation: ('all', False), ('unrelated', False), ]), + ('Requests', [ + ('all', False), + ('unrelated', False), + ]), (NoCheckRH, [('all', False)]), (HTTPSupportedRH, [('all', UnsupportedRequest)]), (HTTPSupportedRH, [('no', UnsupportedRequest)]), @@ -894,6 +980,13 @@ class TestRequestHandlerValidation: ({'timeout': 'notatimeout'}, AssertionError), ({'unsupported': 'value'}, UnsupportedRequest), ]), + ('Requests', [ + ({'cookiejar': 'notacookiejar'}, AssertionError), + ({'cookiejar': YoutubeDLCookieJar()}, False), + ({'timeout': 1}, False), + ({'timeout': 'notatimeout'}, AssertionError), + ({'unsupported': 'value'}, UnsupportedRequest), + ]), (NoCheckRH, [ ({'cookiejar': 'notacookiejar'}, False), ({'somerandom': 'test'}, False), # but any extension is allowed through @@ -909,7 +1002,7 @@ class TestRequestHandlerValidation: def test_url_scheme(self, handler, scheme, fail, handler_kwargs): run_validation(handler, fail, Request(f'{scheme}://'), **(handler_kwargs or {})) - @pytest.mark.parametrize('handler,fail', [('Urllib', False)], indirect=['handler']) + @pytest.mark.parametrize('handler,fail', [('Urllib', False), ('Requests', False)], indirect=['handler']) def test_no_proxy(self, handler, fail): run_validation(handler, fail, Request('http://', proxies={'no': '127.0.0.1,github.com'})) run_validation(handler, fail, Request('http://'), proxies={'no': '127.0.0.1,github.com'}) @@ -932,13 +1025,13 @@ class TestRequestHandlerValidation: run_validation(handler, fail, Request('http://', proxies={'http': f'{scheme}://example.com'})) run_validation(handler, fail, Request('http://'), proxies={'http': f'{scheme}://example.com'}) - @pytest.mark.parametrize('handler', ['Urllib', HTTPSupportedRH], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', HTTPSupportedRH, 'Requests'], indirect=True) def test_empty_proxy(self, handler): run_validation(handler, False, Request('http://', proxies={'http': None})) run_validation(handler, False, Request('http://'), proxies={'http': None}) @pytest.mark.parametrize('proxy_url', ['//example.com', 'example.com', '127.0.0.1', '/a/b/c']) - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_invalid_proxy_url(self, handler, proxy_url): run_validation(handler, UnsupportedRequest, Request('http://', proxies={'http': proxy_url})) @@ -1242,6 +1335,13 @@ class TestYoutubeDLNetworking: rh = self.build_handler(ydl, UrllibRH) assert rh.enable_file_urls is True + def test_compat_opt_prefer_urllib(self): + # This assumes urllib only has a preference when this compat opt is given + with FakeYDL({'compat_opts': ['prefer-legacy-http-handler']}) as ydl: + director = ydl.build_request_director([UrllibRH]) + assert len(director.preferences) == 1 + assert director.preferences.pop()(UrllibRH, None) + class TestRequest: diff --git a/test/test_socks.py b/test/test_socks.py index 211ee814d..d8ac88dad 100644 --- a/test/test_socks.py +++ b/test/test_socks.py @@ -263,7 +263,7 @@ def ctx(request): class TestSocks4Proxy: - @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True) + @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http')], indirect=True) def test_socks4_no_auth(self, handler, ctx): with handler() as rh: with ctx.socks_server(Socks4ProxyHandler) as server_address: @@ -271,7 +271,7 @@ class TestSocks4Proxy: rh, proxies={'all': f'socks4://{server_address}'}) assert response['version'] == 4 - @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True) + @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http')], indirect=True) def test_socks4_auth(self, handler, ctx): with handler() as rh: with ctx.socks_server(Socks4ProxyHandler, user_id='user') as server_address: @@ -281,7 +281,7 @@ class TestSocks4Proxy: rh, proxies={'all': f'socks4://user:@{server_address}'}) assert response['version'] == 4 - @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True) + @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http')], indirect=True) def test_socks4a_ipv4_target(self, handler, ctx): with ctx.socks_server(Socks4ProxyHandler) as server_address: with handler(proxies={'all': f'socks4a://{server_address}'}) as rh: @@ -289,7 +289,7 @@ class TestSocks4Proxy: assert response['version'] == 4 assert (response['ipv4_address'] == '127.0.0.1') != (response['domain_address'] == '127.0.0.1') - @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True) + @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http')], indirect=True) def test_socks4a_domain_target(self, handler, ctx): with ctx.socks_server(Socks4ProxyHandler) as server_address: with handler(proxies={'all': f'socks4a://{server_address}'}) as rh: @@ -298,7 +298,7 @@ class TestSocks4Proxy: assert response['ipv4_address'] is None assert response['domain_address'] == 'localhost' - @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True) + @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http')], indirect=True) def test_ipv4_client_source_address(self, handler, ctx): with ctx.socks_server(Socks4ProxyHandler) as server_address: source_address = f'127.0.0.{random.randint(5, 255)}' @@ -308,7 +308,7 @@ class TestSocks4Proxy: assert response['client_address'][0] == source_address assert response['version'] == 4 - @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True) + @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http')], indirect=True) @pytest.mark.parametrize('reply_code', [ Socks4CD.REQUEST_REJECTED_OR_FAILED, Socks4CD.REQUEST_REJECTED_CANNOT_CONNECT_TO_IDENTD, @@ -320,7 +320,7 @@ class TestSocks4Proxy: with pytest.raises(ProxyError): ctx.socks_info_request(rh) - @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True) + @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http')], indirect=True) def test_ipv6_socks4_proxy(self, handler, ctx): with ctx.socks_server(Socks4ProxyHandler, bind_ip='::1') as server_address: with handler(proxies={'all': f'socks4://{server_address}'}) as rh: @@ -329,7 +329,7 @@ class TestSocks4Proxy: assert response['ipv4_address'] == '127.0.0.1' assert response['version'] == 4 - @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True) + @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http')], indirect=True) def test_timeout(self, handler, ctx): with ctx.socks_server(Socks4ProxyHandler, sleep=2) as server_address: with handler(proxies={'all': f'socks4://{server_address}'}, timeout=0.5) as rh: @@ -339,7 +339,7 @@ class TestSocks4Proxy: class TestSocks5Proxy: - @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True) + @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http')], indirect=True) def test_socks5_no_auth(self, handler, ctx): with ctx.socks_server(Socks5ProxyHandler) as server_address: with handler(proxies={'all': f'socks5://{server_address}'}) as rh: @@ -347,7 +347,7 @@ class TestSocks5Proxy: assert response['auth_methods'] == [0x0] assert response['version'] == 5 - @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True) + @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http')], indirect=True) def test_socks5_user_pass(self, handler, ctx): with ctx.socks_server(Socks5ProxyHandler, auth=('test', 'testpass')) as server_address: with handler() as rh: @@ -360,7 +360,7 @@ class TestSocks5Proxy: assert response['auth_methods'] == [Socks5Auth.AUTH_NONE, Socks5Auth.AUTH_USER_PASS] assert response['version'] == 5 - @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True) + @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http')], indirect=True) def test_socks5_ipv4_target(self, handler, ctx): with ctx.socks_server(Socks5ProxyHandler) as server_address: with handler(proxies={'all': f'socks5://{server_address}'}) as rh: @@ -368,7 +368,7 @@ class TestSocks5Proxy: assert response['ipv4_address'] == '127.0.0.1' assert response['version'] == 5 - @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True) + @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http')], indirect=True) def test_socks5_domain_target(self, handler, ctx): with ctx.socks_server(Socks5ProxyHandler) as server_address: with handler(proxies={'all': f'socks5://{server_address}'}) as rh: @@ -376,7 +376,7 @@ class TestSocks5Proxy: assert (response['ipv4_address'] == '127.0.0.1') != (response['ipv6_address'] == '::1') assert response['version'] == 5 - @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True) + @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http')], indirect=True) def test_socks5h_domain_target(self, handler, ctx): with ctx.socks_server(Socks5ProxyHandler) as server_address: with handler(proxies={'all': f'socks5h://{server_address}'}) as rh: @@ -385,7 +385,7 @@ class TestSocks5Proxy: assert response['domain_address'] == 'localhost' assert response['version'] == 5 - @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True) + @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http')], indirect=True) def test_socks5h_ip_target(self, handler, ctx): with ctx.socks_server(Socks5ProxyHandler) as server_address: with handler(proxies={'all': f'socks5h://{server_address}'}) as rh: @@ -394,7 +394,7 @@ class TestSocks5Proxy: assert response['domain_address'] is None assert response['version'] == 5 - @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True) + @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http')], indirect=True) def test_socks5_ipv6_destination(self, handler, ctx): with ctx.socks_server(Socks5ProxyHandler) as server_address: with handler(proxies={'all': f'socks5://{server_address}'}) as rh: @@ -402,7 +402,7 @@ class TestSocks5Proxy: assert response['ipv6_address'] == '::1' assert response['version'] == 5 - @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True) + @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http')], indirect=True) def test_ipv6_socks5_proxy(self, handler, ctx): with ctx.socks_server(Socks5ProxyHandler, bind_ip='::1') as server_address: with handler(proxies={'all': f'socks5://{server_address}'}) as rh: @@ -413,7 +413,7 @@ class TestSocks5Proxy: # XXX: is there any feasible way of testing IPv6 source addresses? # Same would go for non-proxy source_address test... - @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True) + @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http')], indirect=True) def test_ipv4_client_source_address(self, handler, ctx): with ctx.socks_server(Socks5ProxyHandler) as server_address: source_address = f'127.0.0.{random.randint(5, 255)}' @@ -422,7 +422,7 @@ class TestSocks5Proxy: assert response['client_address'][0] == source_address assert response['version'] == 5 - @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True) + @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http')], indirect=True) @pytest.mark.parametrize('reply_code', [ Socks5Reply.GENERAL_FAILURE, Socks5Reply.CONNECTION_NOT_ALLOWED, diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 71d17ac01..8e11646d3 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -3968,7 +3968,7 @@ class YoutubeDL: })) or 'none')) write_debug(f'Proxy map: {self.proxies}') - # write_debug(f'Request Handlers: {", ".join(rh.RH_NAME for rh in self._request_director.handlers.values())}') + write_debug(f'Request Handlers: {", ".join(rh.RH_NAME for rh in self._request_director.handlers.values())}') for plugin_type, plugins in {'Extractor': plugin_ies, 'Post-Processor': plugin_pps}.items(): display_list = ['%s%s' % ( klass.__name__, '' if klass.__name__ == name else f' as {name}') @@ -4057,6 +4057,9 @@ class YoutubeDL: raise RequestError( 'file:// URLs are disabled by default in yt-dlp for security reasons. ' 'Use --enable-file-urls to enable at your own risk.', cause=ue) from ue + if 'unsupported proxy type: "https"' in ue.msg.lower(): + raise RequestError( + 'To use an HTTPS proxy for this request, one of the following dependencies needs to be installed: requests') raise except SSLError as e: if 'UNSAFE_LEGACY_RENEGOTIATION_DISABLED' in str(e): @@ -4099,6 +4102,8 @@ class YoutubeDL: }), )) director.preferences.update(preferences or []) + if 'prefer-legacy-http-handler' in self.params['compat_opts']: + director.preferences.add(lambda rh, _: 500 if rh.RH_KEY == 'Urllib' else 0) return director def encode(self, s): diff --git a/yt_dlp/__pyinstaller/hook-yt_dlp.py b/yt_dlp/__pyinstaller/hook-yt_dlp.py index 88c2b8b28..c7f2c0ceb 100644 --- a/yt_dlp/__pyinstaller/hook-yt_dlp.py +++ b/yt_dlp/__pyinstaller/hook-yt_dlp.py @@ -21,7 +21,9 @@ def get_hidden_imports(): yield from ('yt_dlp.compat._legacy', 'yt_dlp.compat._deprecated') yield from ('yt_dlp.utils._legacy', 'yt_dlp.utils._deprecated') yield pycryptodome_module() - yield from collect_submodules('websockets') + # Only `websockets` is required, others are collected just in case + for module in ('websockets', 'requests', 'urllib3'): + yield from collect_submodules(module) # These are auto-detected, but explicitly add them just in case yield from ('mutagen', 'brotli', 'certifi') diff --git a/yt_dlp/dependencies/__init__.py b/yt_dlp/dependencies/__init__.py index b56e4f5cc..ef83739a3 100644 --- a/yt_dlp/dependencies/__init__.py +++ b/yt_dlp/dependencies/__init__.py @@ -58,6 +58,15 @@ except (ImportError, SyntaxError): # See https://github.com/yt-dlp/yt-dlp/issues/2633 websockets = None +try: + import urllib3 +except ImportError: + urllib3 = None + +try: + import requests +except ImportError: + requests = None try: import xattr # xattr or pyxattr diff --git a/yt_dlp/networking/__init__.py b/yt_dlp/networking/__init__.py index 5b1599a6d..aa8d0eabe 100644 --- a/yt_dlp/networking/__init__.py +++ b/yt_dlp/networking/__init__.py @@ -1,4 +1,6 @@ # flake8: noqa: F401 +import warnings + from .common import ( HEADRequest, PUTRequest, @@ -11,3 +13,11 @@ from .common import ( # isort: split # TODO: all request handlers should be safely imported from . import _urllib +from ..utils import bug_reports_message + +try: + from . import _requests +except ImportError: + pass +except Exception as e: + warnings.warn(f'Failed to import "requests" request handler: {e}' + bug_reports_message()) diff --git a/yt_dlp/networking/_helper.py b/yt_dlp/networking/_helper.py index 4c9dbf25d..a6fa3550b 100644 --- a/yt_dlp/networking/_helper.py +++ b/yt_dlp/networking/_helper.py @@ -11,7 +11,7 @@ import urllib.request from .exceptions import RequestError, UnsupportedRequest from ..dependencies import certifi -from ..socks import ProxyType +from ..socks import ProxyType, sockssocket from ..utils import format_field, traverse_obj if typing.TYPE_CHECKING: @@ -224,6 +224,24 @@ def _socket_connect(ip_addr, timeout, source_address): raise +def create_socks_proxy_socket(dest_addr, proxy_args, proxy_ip_addr, timeout, source_address): + af, socktype, proto, canonname, sa = proxy_ip_addr + sock = sockssocket(af, socktype, proto) + try: + connect_proxy_args = proxy_args.copy() + connect_proxy_args.update({'addr': sa[0], 'port': sa[1]}) + sock.setproxy(**connect_proxy_args) + if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT: # noqa: E721 + sock.settimeout(timeout) + if source_address: + sock.bind(source_address) + sock.connect(dest_addr) + return sock + except socket.error: + sock.close() + raise + + def create_connection( address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, diff --git a/yt_dlp/networking/_requests.py b/yt_dlp/networking/_requests.py new file mode 100644 index 000000000..27974357a --- /dev/null +++ b/yt_dlp/networking/_requests.py @@ -0,0 +1,398 @@ +import contextlib +import functools +import http.client +import logging +import re +import socket +import warnings + +from ..dependencies import brotli, requests, urllib3 +from ..utils import bug_reports_message, int_or_none, variadic + +if requests is None: + raise ImportError('requests module is not installed') + +if urllib3 is None: + raise ImportError('urllib3 module is not installed') + +urllib3_version = tuple(int_or_none(x, default=0) for x in urllib3.__version__.split('.')) + +if urllib3_version < (1, 26, 17): + raise ImportError('Only urllib3 >= 1.26.17 is supported') + +if requests.__build__ < 0x023100: + raise ImportError('Only requests >= 2.31.0 is supported') + +import requests.adapters +import requests.utils +import urllib3.connection +import urllib3.exceptions + +from ._helper import ( + InstanceStoreMixin, + add_accept_encoding_header, + create_connection, + create_socks_proxy_socket, + get_redirect_method, + make_socks_proxy_opts, + select_proxy, +) +from .common import ( + Features, + RequestHandler, + Response, + register_preference, + register_rh, +) +from .exceptions import ( + CertificateVerifyError, + HTTPError, + IncompleteRead, + ProxyError, + RequestError, + SSLError, + TransportError, +) +from ..socks import ProxyError as SocksProxyError + +SUPPORTED_ENCODINGS = [ + 'gzip', 'deflate' +] + +if brotli is not None: + SUPPORTED_ENCODINGS.append('br') + +""" +Override urllib3's behavior to not convert lower-case percent-encoded characters +to upper-case during url normalization process. + +RFC3986 defines that the lower or upper case percent-encoded hexidecimal characters are equivalent +and normalizers should convert them to uppercase for consistency [1]. + +However, some sites may have an incorrect implementation where they provide +a percent-encoded url that is then compared case-sensitively.[2] + +While this is a very rare case, since urllib does not do this normalization step, it +is best to avoid it in requests too for compatability reasons. + +1: https://tools.ietf.org/html/rfc3986#section-2.1 +2: https://github.com/streamlink/streamlink/pull/4003 +""" + + +class Urllib3PercentREOverride: + def __init__(self, r: re.Pattern): + self.re = r + + # pass through all other attribute calls to the original re + def __getattr__(self, item): + return self.re.__getattribute__(item) + + def subn(self, repl, string, *args, **kwargs): + return string, self.re.subn(repl, string, *args, **kwargs)[1] + + +# urllib3 >= 1.25.8 uses subn: +# https://github.com/urllib3/urllib3/commit/a2697e7c6b275f05879b60f593c5854a816489f0 +import urllib3.util.url # noqa: E305 + +if hasattr(urllib3.util.url, 'PERCENT_RE'): + urllib3.util.url.PERCENT_RE = Urllib3PercentREOverride(urllib3.util.url.PERCENT_RE) +elif hasattr(urllib3.util.url, '_PERCENT_RE'): # urllib3 >= 2.0.0 + urllib3.util.url._PERCENT_RE = Urllib3PercentREOverride(urllib3.util.url._PERCENT_RE) +else: + warnings.warn('Failed to patch PERCENT_RE in urllib3 (does the attribute exist?)' + bug_reports_message()) + +""" +Workaround for issue in urllib.util.ssl_.py: ssl_wrap_context does not pass +server_hostname to SSLContext.wrap_socket if server_hostname is an IP, +however this is an issue because we set check_hostname to True in our SSLContext. + +Monkey-patching IS_SECURETRANSPORT forces ssl_wrap_context to pass server_hostname regardless. + +This has been fixed in urllib3 2.0+. +See: https://github.com/urllib3/urllib3/issues/517 +""" + +if urllib3_version < (2, 0, 0): + with contextlib.suppress(): + urllib3.util.IS_SECURETRANSPORT = urllib3.util.ssl_.IS_SECURETRANSPORT = True + + +# Requests will not automatically handle no_proxy by default +# due to buggy no_proxy handling with proxy dict [1]. +# 1. https://github.com/psf/requests/issues/5000 +requests.adapters.select_proxy = select_proxy + + +class RequestsResponseAdapter(Response): + def __init__(self, res: requests.models.Response): + super().__init__( + fp=res.raw, headers=res.headers, url=res.url, + status=res.status_code, reason=res.reason) + + self._requests_response = res + + def read(self, amt: int = None): + try: + # Interact with urllib3 response directly. + return self.fp.read(amt, decode_content=True) + + # See urllib3.response.HTTPResponse.read() for exceptions raised on read + except urllib3.exceptions.SSLError as e: + raise SSLError(cause=e) from e + + except urllib3.exceptions.IncompleteRead as e: + # urllib3 IncompleteRead.partial is always an integer + raise IncompleteRead(partial=e.partial, expected=e.expected) from e + + except urllib3.exceptions.ProtocolError as e: + # http.client.IncompleteRead may be contained within ProtocolError + # See urllib3.response.HTTPResponse._error_catcher() + ir_err = next( + (err for err in (e.__context__, e.__cause__, *variadic(e.args)) + if isinstance(err, http.client.IncompleteRead)), None) + if ir_err is not None: + raise IncompleteRead(partial=len(ir_err.partial), expected=ir_err.expected) from e + raise TransportError(cause=e) from e + + except urllib3.exceptions.HTTPError as e: + # catch-all for any other urllib3 response exceptions + raise TransportError(cause=e) from e + + +class RequestsHTTPAdapter(requests.adapters.HTTPAdapter): + def __init__(self, ssl_context=None, proxy_ssl_context=None, source_address=None, **kwargs): + self._pm_args = {} + if ssl_context: + self._pm_args['ssl_context'] = ssl_context + if source_address: + self._pm_args['source_address'] = (source_address, 0) + self._proxy_ssl_context = proxy_ssl_context or ssl_context + super().__init__(**kwargs) + + def init_poolmanager(self, *args, **kwargs): + return super().init_poolmanager(*args, **kwargs, **self._pm_args) + + def proxy_manager_for(self, proxy, **proxy_kwargs): + extra_kwargs = {} + if not proxy.lower().startswith('socks') and self._proxy_ssl_context: + extra_kwargs['proxy_ssl_context'] = self._proxy_ssl_context + return super().proxy_manager_for(proxy, **proxy_kwargs, **self._pm_args, **extra_kwargs) + + def cert_verify(*args, **kwargs): + # lean on SSLContext for cert verification + pass + + +class RequestsSession(requests.sessions.Session): + """ + Ensure unified redirect method handling with our urllib redirect handler. + """ + def rebuild_method(self, prepared_request, response): + new_method = get_redirect_method(prepared_request.method, response.status_code) + + # HACK: requests removes headers/body on redirect unless code was a 307/308. + if new_method == prepared_request.method: + response._real_status_code = response.status_code + response.status_code = 308 + + prepared_request.method = new_method + + def rebuild_auth(self, prepared_request, response): + # HACK: undo status code change from rebuild_method, if applicable. + # rebuild_auth runs after requests would remove headers/body based on status code + if hasattr(response, '_real_status_code'): + response.status_code = response._real_status_code + del response._real_status_code + return super().rebuild_auth(prepared_request, response) + + +class Urllib3LoggingFilter(logging.Filter): + + def filter(self, record): + # Ignore HTTP request messages since HTTPConnection prints those + if record.msg == '%s://%s:%s "%s %s %s" %s %s': + return False + return True + + +class Urllib3LoggingHandler(logging.Handler): + """Redirect urllib3 logs to our logger""" + def __init__(self, logger, *args, **kwargs): + super().__init__(*args, **kwargs) + self._logger = logger + + def emit(self, record): + try: + msg = self.format(record) + if record.levelno >= logging.ERROR: + self._logger.error(msg) + else: + self._logger.stdout(msg) + + except Exception: + self.handleError(record) + + +@register_rh +class RequestsRH(RequestHandler, InstanceStoreMixin): + + """Requests RequestHandler + https://github.com/psf/requests + """ + _SUPPORTED_URL_SCHEMES = ('http', 'https') + _SUPPORTED_ENCODINGS = tuple(SUPPORTED_ENCODINGS) + _SUPPORTED_PROXY_SCHEMES = ('http', 'https', 'socks4', 'socks4a', 'socks5', 'socks5h') + _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY) + RH_NAME = 'requests' + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # Forward urllib3 debug messages to our logger + logger = logging.getLogger('urllib3') + handler = Urllib3LoggingHandler(logger=self._logger) + handler.setFormatter(logging.Formatter('requests: %(message)s')) + handler.addFilter(Urllib3LoggingFilter()) + logger.addHandler(handler) + logger.setLevel(logging.WARNING) + + if self.verbose: + # Setting this globally is not ideal, but is easier than hacking with urllib3. + # It could technically be problematic for scripts embedding yt-dlp. + # However, it is unlikely debug traffic is used in that context in a way this will cause problems. + urllib3.connection.HTTPConnection.debuglevel = 1 + logger.setLevel(logging.DEBUG) + # this is expected if we are using --no-check-certificate + urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + + def close(self): + self._clear_instances() + + def _check_extensions(self, extensions): + super()._check_extensions(extensions) + extensions.pop('cookiejar', None) + extensions.pop('timeout', None) + + def _create_instance(self, cookiejar): + session = RequestsSession() + http_adapter = RequestsHTTPAdapter( + ssl_context=self._make_sslcontext(), + source_address=self.source_address, + max_retries=urllib3.util.retry.Retry(False), + ) + session.adapters.clear() + session.headers = requests.models.CaseInsensitiveDict({'Connection': 'keep-alive'}) + session.mount('https://', http_adapter) + session.mount('http://', http_adapter) + session.cookies = cookiejar + session.trust_env = False # no need, we already load proxies from env + return session + + def _send(self, request): + + headers = self._merge_headers(request.headers) + add_accept_encoding_header(headers, SUPPORTED_ENCODINGS) + + max_redirects_exceeded = False + + session = self._get_instance( + cookiejar=request.extensions.get('cookiejar') or self.cookiejar) + + try: + requests_res = session.request( + method=request.method, + url=request.url, + data=request.data, + headers=headers, + timeout=float(request.extensions.get('timeout') or self.timeout), + proxies=request.proxies or self.proxies, + allow_redirects=True, + stream=True + ) + + except requests.exceptions.TooManyRedirects as e: + max_redirects_exceeded = True + requests_res = e.response + + except requests.exceptions.SSLError as e: + if 'CERTIFICATE_VERIFY_FAILED' in str(e): + raise CertificateVerifyError(cause=e) from e + raise SSLError(cause=e) from e + + except requests.exceptions.ProxyError as e: + raise ProxyError(cause=e) from e + + except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e: + raise TransportError(cause=e) from e + + except urllib3.exceptions.HTTPError as e: + # Catch any urllib3 exceptions that may leak through + raise TransportError(cause=e) from e + + except requests.exceptions.RequestException as e: + # Miscellaneous Requests exceptions. May not necessary be network related e.g. InvalidURL + raise RequestError(cause=e) from e + + res = RequestsResponseAdapter(requests_res) + + if not 200 <= res.status < 300: + raise HTTPError(res, redirect_loop=max_redirects_exceeded) + + return res + + +@register_preference(RequestsRH) +def requests_preference(rh, request): + return 100 + + +# Use our socks proxy implementation with requests to avoid an extra dependency. +class SocksHTTPConnection(urllib3.connection.HTTPConnection): + def __init__(self, _socks_options, *args, **kwargs): # must use _socks_options to pass PoolKey checks + self._proxy_args = _socks_options + super().__init__(*args, **kwargs) + + def _new_conn(self): + try: + return create_connection( + address=(self._proxy_args['addr'], self._proxy_args['port']), + timeout=self.timeout, + source_address=self.source_address, + _create_socket_func=functools.partial( + create_socks_proxy_socket, (self.host, self.port), self._proxy_args)) + except (socket.timeout, TimeoutError) as e: + raise urllib3.exceptions.ConnectTimeoutError( + self, f'Connection to {self.host} timed out. (connect timeout={self.timeout})') from e + except SocksProxyError as e: + raise urllib3.exceptions.ProxyError(str(e), e) from e + except (OSError, socket.error) as e: + raise urllib3.exceptions.NewConnectionError( + self, f'Failed to establish a new connection: {e}') from e + + +class SocksHTTPSConnection(SocksHTTPConnection, urllib3.connection.HTTPSConnection): + pass + + +class SocksHTTPConnectionPool(urllib3.HTTPConnectionPool): + ConnectionCls = SocksHTTPConnection + + +class SocksHTTPSConnectionPool(urllib3.HTTPSConnectionPool): + ConnectionCls = SocksHTTPSConnection + + +class SocksProxyManager(urllib3.PoolManager): + + def __init__(self, socks_proxy, username=None, password=None, num_pools=10, headers=None, **connection_pool_kw): + connection_pool_kw['_socks_options'] = make_socks_proxy_opts(socks_proxy) + super().__init__(num_pools, headers, **connection_pool_kw) + self.pool_classes_by_scheme = { + 'http': SocksHTTPConnectionPool, + 'https': SocksHTTPSConnectionPool + } + + +requests.adapters.SOCKSProxyManager = SocksProxyManager diff --git a/yt_dlp/networking/_urllib.py b/yt_dlp/networking/_urllib.py index 9e2bf33e4..68bab2b08 100644 --- a/yt_dlp/networking/_urllib.py +++ b/yt_dlp/networking/_urllib.py @@ -3,7 +3,6 @@ from __future__ import annotations import functools import http.client import io -import socket import ssl import urllib.error import urllib.parse @@ -24,6 +23,7 @@ from ._helper import ( InstanceStoreMixin, add_accept_encoding_header, create_connection, + create_socks_proxy_socket, get_redirect_method, make_socks_proxy_opts, select_proxy, @@ -40,7 +40,6 @@ from .exceptions import ( ) from ..dependencies import brotli from ..socks import ProxyError as SocksProxyError -from ..socks import sockssocket from ..utils import update_url_query from ..utils.networking import normalize_url @@ -190,25 +189,12 @@ def make_socks_conn_class(base_class, socks_proxy): _create_connection = create_connection def connect(self): - def sock_socket_connect(ip_addr, timeout, source_address): - af, socktype, proto, canonname, sa = ip_addr - sock = sockssocket(af, socktype, proto) - try: - connect_proxy_args = proxy_args.copy() - connect_proxy_args.update({'addr': sa[0], 'port': sa[1]}) - sock.setproxy(**connect_proxy_args) - if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT: # noqa: E721 - sock.settimeout(timeout) - if source_address: - sock.bind(source_address) - sock.connect((self.host, self.port)) - return sock - except socket.error: - sock.close() - raise self.sock = create_connection( - (proxy_args['addr'], proxy_args['port']), timeout=self.timeout, - source_address=self.source_address, _create_socket_func=sock_socket_connect) + (proxy_args['addr'], proxy_args['port']), + timeout=self.timeout, + source_address=self.source_address, + _create_socket_func=functools.partial( + create_socks_proxy_socket, (self.host, self.port), proxy_args)) if isinstance(self, http.client.HTTPSConnection): self.sock = self._context.wrap_socket(self.sock, server_hostname=self.host) diff --git a/yt_dlp/options.py b/yt_dlp/options.py index 85a6402a6..4254974fc 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -471,11 +471,12 @@ def create_parser(): 'no-attach-info-json', 'embed-thumbnail-atomicparsley', 'no-external-downloader-progress', 'embed-metadata', 'seperate-video-versions', 'no-clean-infojson', 'no-keep-subs', 'no-certifi', 'no-youtube-channel-redirect', 'no-youtube-unavailable-videos', 'no-youtube-prefer-utc-upload-date', + 'prefer-legacy-http-handler' }, 'aliases': { 'youtube-dl': ['all', '-multistreams', '-playlist-match-filter'], 'youtube-dlc': ['all', '-no-youtube-channel-redirect', '-no-live-chat', '-playlist-match-filter'], '2021': ['2022', 'no-certifi', 'filename-sanitization', 'no-youtube-prefer-utc-upload-date'], - '2022': ['no-external-downloader-progress', 'playlist-match-filter'], + '2022': ['no-external-downloader-progress', 'playlist-match-filter', 'prefer-legacy-http-handler'], } }, help=( 'Options that can help keep compatibility with youtube-dl or youtube-dlc ' |