aboutsummaryrefslogtreecommitdiff
path: root/test
diff options
context:
space:
mode:
authorBrian Foley <bpfoley@gmail.com>2016-01-02 19:49:59 +0000
committerBrian Foley <bpfoley@gmail.com>2016-03-03 10:11:37 +0000
commit8bb56eeeea8154f811076c0a9093203fab224003 (patch)
tree4bfe429dc5b46b75a6ca4b6b19f411891de0686f /test
parent03879ff0547b6d1b96c530075cd99f99b8c74a2b (diff)
[utils] Add extract_attributes for extracting html tag attributes
This is much more robust than just using regexps, and handles all the common scenarios, such as empty/no values, repeated attributes, entity decoding, mixed case names, and the different possible value quoting schemes.
Diffstat (limited to 'test')
-rw-r--r--test/test_utils.py40
1 files changed, 40 insertions, 0 deletions
diff --git a/test/test_utils.py b/test/test_utils.py
index 97587ad2f..cb85e18f0 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -28,6 +28,7 @@ from youtube_dl.utils import (
encodeFilename,
escape_rfc3986,
escape_url,
+ extract_attributes,
ExtractorError,
find_xpath_attr,
fix_xml_ampersands,
@@ -75,6 +76,7 @@ from youtube_dl.utils import (
cli_bool_option,
)
from youtube_dl.compat import (
+ compat_chr,
compat_etree_fromstring,
)
@@ -591,6 +593,44 @@ class TestUtil(unittest.TestCase):
on = js_to_json('{"abc": "def",}')
self.assertEqual(json.loads(on), {'abc': 'def'})
+ def test_extract_attributes(self):
+ self.assertEqual(extract_attributes('<e x="y">'), {'x': 'y'})
+ self.assertEqual(extract_attributes("<e x='y'>"), {'x': 'y'})
+ self.assertEqual(extract_attributes('<e x=y>'), {'x': 'y'})
+ self.assertEqual(extract_attributes('<e x="a \'b\' c">'), {'x': "a 'b' c"})
+ self.assertEqual(extract_attributes('<e x=\'a "b" c\'>'), {'x': 'a "b" c'})
+ self.assertEqual(extract_attributes('<e x="&#121;">'), {'x': 'y'})
+ self.assertEqual(extract_attributes('<e x="&#x79;">'), {'x': 'y'})
+ self.assertEqual(extract_attributes('<e x="&amp;">'), {'x': '&'}) # XML
+ self.assertEqual(extract_attributes('<e x="&quot;">'), {'x': '"'})
+ self.assertEqual(extract_attributes('<e x="&pound;">'), {'x': '£'}) # HTML 3.2
+ self.assertEqual(extract_attributes('<e x="&lambda;">'), {'x': 'λ'}) # HTML 4.0
+ self.assertEqual(extract_attributes('<e x="&foo">'), {'x': '&foo'})
+ self.assertEqual(extract_attributes('<e x="\'">'), {'x': "'"})
+ self.assertEqual(extract_attributes('<e x=\'"\'>'), {'x': '"'})
+ self.assertEqual(extract_attributes('<e x >'), {'x': None})
+ self.assertEqual(extract_attributes('<e x=y a>'), {'x': 'y', 'a': None})
+ self.assertEqual(extract_attributes('<e x= y>'), {'x': 'y'})
+ self.assertEqual(extract_attributes('<e x=1 y=2 x=3>'), {'y': '2', 'x': '3'})
+ self.assertEqual(extract_attributes('<e \nx=\ny\n>'), {'x': 'y'})
+ self.assertEqual(extract_attributes('<e \nx=\n"y"\n>'), {'x': 'y'})
+ self.assertEqual(extract_attributes("<e \nx=\n'y'\n>"), {'x': 'y'})
+ self.assertEqual(extract_attributes('<e \nx="\ny\n">'), {'x': '\ny\n'})
+ self.assertEqual(extract_attributes('<e CAPS=x>'), {'caps': 'x'}) # Names lowercased
+ self.assertEqual(extract_attributes('<e x=1 X=2>'), {'x': '2'})
+ self.assertEqual(extract_attributes('<e X=1 x=2>'), {'x': '2'})
+ self.assertEqual(extract_attributes('<e _:funny-name1=1>'), {'_:funny-name1': '1'})
+ self.assertEqual(extract_attributes('<e x="Fáilte 世界 \U0001f600">'), {'x': 'Fáilte 世界 \U0001f600'})
+ self.assertEqual(extract_attributes('<e x="décompose&#769;">'), {'x': 'décompose\u0301'})
+ # "Narrow" Python builds don't support unicode code points outside BMP.
+ try:
+ compat_chr(0x10000)
+ supports_outside_bmp = True
+ except ValueError:
+ supports_outside_bmp = False
+ if supports_outside_bmp:
+ self.assertEqual(extract_attributes('<e x="Smile &#128512;!">'), {'x': 'Smile \U0001f600!'})
+
def test_clean_html(self):
self.assertEqual(clean_html('a:\nb'), 'a: b')
self.assertEqual(clean_html('a:\n "b"'), 'a: "b"')