about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Yen Chi Hsuan <yan12125@gmail.com> 2016-06-10 15:11:55 +0800
committer Yen Chi Hsuan <yan12125@gmail.com> 2016-06-10 15:11:55 +0800
commit 55b2f099c0c820d6c4b46609b175a44a6d7f97bf (patch)
tree 4297d785c0d5d392b2063d583936d93cddebcc4b
parent 9631a94fb5e5ee9b92135f938df00866535fc6c6 (diff)
download youtube-dl-55b2f099c0c820d6c4b46609b175a44a6d7f97bf.tar.xz
[utils] Decode HTML5 entities
Used in test_Vporn_1. Also related to #9270
-rw-r--r-- test/test_utils.py 2
-rw-r--r-- youtube_dl/utils.py 12
2 files changed, 12 insertions, 2 deletions
diff --git a/test/test_utils.py b/test/test_utils.py
index feef80465..0e25de6b7 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -249,6 +249,8 @@ class TestUtil(unittest.TestCase):
self.assertEqual(unescapeHTML('&#47;'), '/')
self.assertEqual(unescapeHTML('&eacute;'), 'é')
self.assertEqual(unescapeHTML('&#2013266066;'), '&#2013266066;')
+ # HTML5 entities
+ self.assertEqual(unescapeHTML('&period;&apos;'), '.\'')
def test_date_from_str(self):
self.assertEqual(date_from_str('yesterday'), date_from_str('now-1day'))
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 229de4b39..f77ab8650 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -39,6 +39,7 @@ from .compat import (
compat_chr,
compat_etree_fromstring,
compat_html_entities,
+ compat_html_entities_html5,
compat_http_client,
compat_kwargs,
compat_parse_qs,
@@ -456,12 +457,19 @@ def orderedSet(iterable):
return res
-def _htmlentity_transform(entity):
+def _htmlentity_transform(entity_with_semicolon):
"""Transforms an HTML entity to a character."""
+ entity = entity_with_semicolon[:-1]
+
# Known non-numeric HTML entity
if entity in compat_html_entities.name2codepoint:
return compat_chr(compat_html_entities.name2codepoint[entity])
+ # TODO: HTML5 allows entities without a semicolon. For example,
+ # '&Eacuteric' should be decoded as 'Éric'.
+ if entity_with_semicolon in compat_html_entities_html5:
+ return compat_html_entities_html5[entity_with_semicolon]
+
mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
if mobj is not None:
numstr = mobj.group(1)
@@ -486,7 +494,7 @@ def unescapeHTML(s):
assert type(s) == compat_str
return re.sub(
- r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
+ r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
def get_subprocess_encoding():