diff options
Diffstat (limited to 'youtube_dl/utils.py')
| -rw-r--r-- | youtube_dl/utils.py | 48 | 
1 files changed, 40 insertions, 8 deletions
| diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index ae813099d..e265c7574 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -139,21 +139,24 @@ def write_json_file(obj, fn):  if sys.version_info >= (2, 7): -    def find_xpath_attr(node, xpath, key, val): +    def find_xpath_attr(node, xpath, key, val=None):          """ Find the xpath xpath[@key=val] """          assert re.match(r'^[a-zA-Z-]+$', key) -        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val) -        expr = xpath + "[@%s='%s']" % (key, val) +        if val: +            assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val) +        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))          return node.find(expr)  else: -    def find_xpath_attr(node, xpath, key, val): +    def find_xpath_attr(node, xpath, key, val=None):          # Here comes the crazy part: In 2.6, if the xpath is a unicode,          # .//node does not match if a node is a direct child of . !          if isinstance(xpath, compat_str):              xpath = xpath.encode('ascii')          for f in node.findall(xpath): -            if f.attrib.get(key) == val: +            if key not in f.attrib: +                continue +            if val is None or f.attrib.get(key) == val:                  return f          return None @@ -576,11 +579,9 @@ class ContentTooShortError(Exception):      download is too small for what the server announced first, indicating      the connection was probably interrupted.      """ -    # Both in bytes -    downloaded = None -    expected = None      def __init__(self, downloaded, expected): +        # Both in bytes          self.downloaded = downloaded          self.expected = expected @@ -650,6 +651,26 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):          return ret      def http_request(self, req): +        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not +        # always respected by websites, some tend to give out URLs with non percent-encoded +        # non-ASCII characters (see telemb.py, ard.py [#3412]) +        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991) +        # To work around aforementioned issue we will replace request's original URL with +        # percent-encoded one +        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09) +        # the code of this workaround has been moved here from YoutubeDL.urlopen() +        url = req.get_full_url() +        url_escaped = escape_url(url) + +        # Substitute URL if any change after escaping +        if url != url_escaped: +            req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request +            new_req = req_type( +                url_escaped, data=req.data, headers=req.headers, +                origin_req_host=req.origin_req_host, unverifiable=req.unverifiable) +            new_req.timeout = req.timeout +            req = new_req +          for h, v in std_headers.items():              # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275              # The dict keys are capitalized because of this bug by urllib @@ -694,6 +715,17 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):              gz = io.BytesIO(self.deflate(resp.read()))              resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)              resp.msg = old_resp.msg +        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 +        if 300 <= resp.code < 400: +            location = resp.headers.get('Location') +            if location: +                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3 +                if sys.version_info >= (3, 0): +                    location = location.encode('iso-8859-1').decode('utf-8') +                location_escaped = escape_url(location) +                if location != location_escaped: +                    del resp.headers['Location'] +                    resp.headers['Location'] = location_escaped          return resp      https_request = http_request | 
