aboutsummaryrefslogtreecommitdiff
path: root/youtube_dl
diff options
context:
space:
mode:
author Philipp Hagemeister <phihag@phihag.de> 2014-09-15 15:40:10 +0200
committer Philipp Hagemeister <phihag@phihag.de> 2014-09-15 15:40:10 +0200
commit 0003a5c4163df6b5a7fd90ec256ea7497f639dda (patch)
tree da39efa0f5db04075cb91999bb901199f2725bb7 /youtube_dl
parent 21f2927f707dc1dfe9182a290571da1714f1ed63 (diff)
parent 984e8e14ea266d406c253098f953e727ca8c19c7 (diff)
download youtube-dl-0003a5c4163df6b5a7fd90ec256ea7497f639dda.tar.xz
Merge remote-tracking branch 'dstftw/escape-non-ascii-in-urls'
Conflicts: test/test_utils.py
Diffstat (limited to 'youtube_dl')
-rwxr-xr-x youtube_dl/YoutubeDL.py 20
-rw-r--r-- youtube_dl/utils.py 18
2 files changed, 38 insertions, 0 deletions
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py
index 553bf559b..9519594c9 100755
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -28,6 +28,7 @@ from .utils import (
compat_str,
compat_urllib_error,
compat_urllib_request,
+ escape_url,
ContentTooShortError,
date_from_str,
DateRange,
@@ -1241,6 +1242,25 @@ class YoutubeDL(object):
def urlopen(self, req):
""" Start an HTTP download """
+
+ # According to RFC 3986, URLs cannot contain non-ASCII characters; however, this is not
+ # always respected by websites: some give out URLs with non-ASCII characters that are
+ # not percent-encoded (see telemb.py, ard.py [#3412]).
+ # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991).
+ # To work around the aforementioned issue, we replace the request's original URL with
+ # a percent-encoded one.
+ url = req if isinstance(req, compat_str) else req.get_full_url()
+ url_escaped = escape_url(url)
+
+ # Substitute the URL only if escaping actually changed it
+ if url != url_escaped:
+ if isinstance(req, compat_str):
+ req = url_escaped
+ else:
+ req = compat_urllib_request.Request(
+ url_escaped, data=req.data, headers=req.headers,
+ origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
+
return self._opener.open(req, timeout=self._socket_timeout)
def print_debug_header(self):
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 3ac0f1f54..b644f4e92 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -1437,6 +1437,24 @@ def uppercase_escape(s):
lambda m: unicode_escape(m.group(0))[0],
s)
+
+def escape_rfc3986(s):
+ """Escape non-ASCII characters as suggested by RFC 3986"""
+ if sys.version_info < (3, 0) and isinstance(s, unicode):
+ s = s.encode('utf-8')
+ return compat_urllib_parse.quote(s, "%/;:@&=+$,!~*'()?#[]")
+
+
+def escape_url(url):
+ """Escape URL as suggested by RFC 3986"""
+ url_parsed = compat_urllib_parse_urlparse(url)
+ return url_parsed._replace(
+ path=escape_rfc3986(url_parsed.path),
+ params=escape_rfc3986(url_parsed.params),
+ query=escape_rfc3986(url_parsed.query),
+ fragment=escape_rfc3986(url_parsed.fragment)
+ ).geturl()
+
try:
struct.pack(u'!I', 0)
except TypeError: