From abef53466da1f7d2e79f5644718a2cf7524abc49 Mon Sep 17 00:00:00 2001 From: dirkf Date: Fri, 28 Jul 2023 06:19:15 +0100 Subject: [utils] Rework URL path munging for ., .. components * move processing to YoutubeDLHandler * also process `Location` header for redirect * use tests from https://github.com/yt-dlp/yt-dlp/pull/7662 --- youtube_dl/YoutubeDL.py | 23 ---------------- youtube_dl/utils.py | 70 +++++++++++++++++++++++++++++++++++++------------ 2 files changed, 53 insertions(+), 40 deletions(-) (limited to 'youtube_dl') diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 6a12f91e4..13a41928f 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -71,7 +71,6 @@ from .utils import ( format_bytes, formatSeconds, GeoRestrictedError, - HEADRequest, int_or_none, ISO3166Utils, join_nonempty, @@ -88,7 +87,6 @@ from .utils import ( preferredencoding, prepend_extension, process_communicate_or_kill, - PUTRequest, register_socks_protocols, render_table, replace_extension, @@ -2460,27 +2458,6 @@ class YoutubeDL(object): """ Start an HTTP download """ if isinstance(req, compat_basestring): req = sanitized_Request(req) - # an embedded /../ sequence is not automatically handled by urllib2 - # see https://github.com/yt-dlp/yt-dlp/issues/3355 - url = req.get_full_url() - parts = url.partition('/../') - if parts[1]: - url = compat_urllib_parse.urljoin(parts[0] + parts[1][:1], parts[1][1:] + parts[2]) - if url: - # worse, URL path may have initial /../ against RFCs: work-around - # by stripping such prefixes, like eg Firefox - parts = compat_urllib_parse.urlsplit(url) - path = parts.path - while path.startswith('/../'): - path = path[3:] - url = parts._replace(path=path).geturl() - # get a new Request with the munged URL - if url != req.get_full_url(): - req_type = {'HEAD': HEADRequest, 'PUT': PUTRequest}.get( - req.get_method(), compat_urllib_request.Request) - req = req_type( - url, data=req.data, headers=dict(req.header_items()), - origin_req_host=req.origin_req_host, unverifiable=req.unverifiable) return self._opener.open(req, timeout=self._socket_timeout) def print_debug_header(self): diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index e73291107..36204c8fa 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2678,17 +2678,52 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): def compress(data): return data and ncompress.decompress(data) + @staticmethod + def _fix_path(url): + # an embedded /../ or /./ sequence is not automatically handled by urllib2 + # see https://github.com/yt-dlp/yt-dlp/issues/3355 + parsed_url = compat_urllib_parse.urlsplit(url) + path = parsed_url.path + if not path.endswith('/'): + path += '/' + parts = path.partition('/./') + if not parts[1]: + parts = path.partition('/../') + if parts[1]: + path = compat_urllib_parse.urljoin( + parts[0] + parts[1][:1], + parts[1][1:] + (parts[2] if parsed_url.path.endswith('/') else parts[2][:-1])) + url = parsed_url._replace(path=path).geturl() + if '/.' in url: + # worse, URL path may have initial /../ against RFCs: work-around + # by stripping such prefixes, like eg Firefox + path = parsed_url.path + '/' + while path.startswith('/.'): + if path.startswith('/../'): + path = path[3:] + elif path.startswith('/./'): + path = path[2:] + else: + break + path = path[:-1] + if not path.startswith('/') and parsed_url.path.startswith('/'): + path = '/' + path + url = parsed_url._replace(path=path).geturl() + return url + def http_request(self, req): - # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not - # always respected by websites, some tend to give out URLs with non percent-encoded + url = req.get_full_url() + # resolve embedded . and .. + url_fixed = self._fix_path(url) + # According to RFC 3986, URLs can not contain non-ASCII characters; however this is not + # always respected by websites: some tend to give out URLs with non percent-encoded # non-ASCII characters (see telemb.py, ard.py [#3412]) # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991) # To work around aforementioned issue we will replace request's original URL with # percent-encoded one # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09) # the code of this workaround has been moved here from YoutubeDL.urlopen() - url = req.get_full_url() - url_escaped = escape_url(url) + url_escaped = escape_url(url_fixed) # Substitute URL if any change after escaping if url != url_escaped: @@ -2702,10 +2737,13 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): req.headers = handle_youtubedl_headers(req.headers) - if sys.version_info < (2, 7) and '#' in req.get_full_url(): - # Python 2.6 is brain-dead when it comes to fragments - req._Request__original = req._Request__original.partition('#')[0] - req._Request__r_type = req._Request__r_type.partition('#')[0] + if sys.version_info < (2, 7): + # avoid possible race where __r_type may be unset + req.get_type() + if '#' in req.get_full_url(): + # Python 2.6 is brain-dead when it comes to fragments + req._Request__original = req._Request__original.partition('#')[0] + req._Request__r_type = req._Request__r_type.partition('#')[0] # Use the totally undocumented AbstractHTTPHandler per # https://github.com/yt-dlp/yt-dlp/pull/4158 @@ -2775,10 +2813,13 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): if sys.version_info >= (3, 0): location = location.encode('iso-8859-1') location = location.decode('utf-8') - location_escaped = escape_url(location) + # resolve embedded . and .. + location_fixed = self._fix_path(location) + location_escaped = escape_url(location_fixed) if location != location_escaped: del resp.headers['Location'] - if sys.version_info < (3, 0): + # if sys.version_info < (3, 0): + if not isinstance(location_escaped, str): location_escaped = location_escaped.encode('utf-8') resp.headers['Location'] = location_escaped return resp @@ -4248,13 +4289,8 @@ def update_Request(req, url=None, data=None, headers={}, query={}): req_headers.update(headers) req_data = data if data is not None else req.data req_url = update_url_query(url or req.get_full_url(), query) - req_get_method = req.get_method() - if req_get_method == 'HEAD': - req_type = HEADRequest - elif req_get_method == 'PUT': - req_type = PUTRequest - else: - req_type = compat_urllib_request.Request + req_type = {'HEAD': HEADRequest, 'PUT': PUTRequest}.get( + req.get_method(), compat_urllib_request.Request) new_req = req_type( req_url, data=req_data, headers=req_headers, origin_req_host=req.origin_req_host, unverifiable=req.unverifiable) -- cgit v1.2.3