diff options
| author | Brian Foley <bpfoley@gmail.com> | 2016-01-02 19:49:59 +0000 | 
|---|---|---|
| committer | Brian Foley <bpfoley@gmail.com> | 2016-03-03 10:11:37 +0000 | 
| commit | 8bb56eeeea8154f811076c0a9093203fab224003 (patch) | |
| tree | 4bfe429dc5b46b75a6ca4b6b19f411891de0686f | |
| parent | 03879ff0547b6d1b96c530075cd99f99b8c74a2b (diff) | |
[utils] Add extract_attributes for extracting HTML tag attributes
This is much more robust than just using regexps, and handles all
the common scenarios, such as empty/no values, repeated attributes,
entity decoding, mixed case names, and the different possible value
quoting schemes.
| -rw-r--r-- | test/test_utils.py | 40 | ||||
| -rw-r--r-- | youtube_dl/compat.py | 6 | ||||
| -rw-r--r-- | youtube_dl/utils.py | 30 | 
3 files changed, 76 insertions, 0 deletions
| diff --git a/test/test_utils.py b/test/test_utils.py index 97587ad2f..cb85e18f0 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -28,6 +28,7 @@ from youtube_dl.utils import (      encodeFilename,      escape_rfc3986,      escape_url, +    extract_attributes,      ExtractorError,      find_xpath_attr,      fix_xml_ampersands, @@ -75,6 +76,7 @@ from youtube_dl.utils import (      cli_bool_option,  )  from youtube_dl.compat import ( +    compat_chr,      compat_etree_fromstring,  ) @@ -591,6 +593,44 @@ class TestUtil(unittest.TestCase):          on = js_to_json('{"abc": "def",}')          self.assertEqual(json.loads(on), {'abc': 'def'}) +    def test_extract_attributes(self): +        self.assertEqual(extract_attributes('<e x="y">'), {'x': 'y'}) +        self.assertEqual(extract_attributes("<e x='y'>"), {'x': 'y'}) +        self.assertEqual(extract_attributes('<e x=y>'), {'x': 'y'}) +        self.assertEqual(extract_attributes('<e x="a \'b\' c">'), {'x': "a 'b' c"}) +        self.assertEqual(extract_attributes('<e x=\'a "b" c\'>'), {'x': 'a "b" c'}) +        self.assertEqual(extract_attributes('<e x="&#121;">'), {'x': 'y'}) +        self.assertEqual(extract_attributes('<e x="&#x79;">'), {'x': 'y'}) +        self.assertEqual(extract_attributes('<e x="&amp;">'), {'x': '&'})  # XML +        self.assertEqual(extract_attributes('<e x="&quot;">'), {'x': '"'}) +        self.assertEqual(extract_attributes('<e x="&pound;">'), {'x': '£'}) # HTML 3.2 +        self.assertEqual(extract_attributes('<e x="&lambda;">'), {'x': 'λ'}) # HTML 4.0 +        self.assertEqual(extract_attributes('<e x="&foo">'), {'x': '&foo'}) +        self.assertEqual(extract_attributes('<e x="\'">'), {'x': "'"}) +        self.assertEqual(extract_attributes('<e x=\'"\'>'), {'x': '"'}) +        self.assertEqual(extract_attributes('<e x >'), {'x': None}) +        self.assertEqual(extract_attributes('<e x=y a>'), {'x': 'y', 'a': None}) +        self.assertEqual(extract_attributes('<e x= y>'), {'x': 'y'}) +        
self.assertEqual(extract_attributes('<e x=1 y=2 x=3>'), {'y': '2', 'x': '3'}) +        self.assertEqual(extract_attributes('<e \nx=\ny\n>'), {'x': 'y'}) +        self.assertEqual(extract_attributes('<e \nx=\n"y"\n>'), {'x': 'y'}) +        self.assertEqual(extract_attributes("<e \nx=\n'y'\n>"), {'x': 'y'}) +        self.assertEqual(extract_attributes('<e \nx="\ny\n">'), {'x': '\ny\n'}) +        self.assertEqual(extract_attributes('<e CAPS=x>'), {'caps': 'x'}) # Names lowercased +        self.assertEqual(extract_attributes('<e x=1 X=2>'), {'x': '2'}) +        self.assertEqual(extract_attributes('<e X=1 x=2>'), {'x': '2'}) +        self.assertEqual(extract_attributes('<e _:funny-name1=1>'), {'_:funny-name1': '1'}) +        self.assertEqual(extract_attributes('<e x="Fáilte 世界 \U0001f600">'), {'x': 'Fáilte 世界 \U0001f600'}) +        self.assertEqual(extract_attributes('<e x="décompose&#769;">'), {'x': 'décompose\u0301'}) +        # "Narrow" Python builds don't support unicode code points outside BMP. 
+        try: +            compat_chr(0x10000) +            supports_outside_bmp = True +        except ValueError: +            supports_outside_bmp = False +        if supports_outside_bmp: +            self.assertEqual(extract_attributes('<e x="Smile &#128512;!">'), {'x': 'Smile \U0001f600!'}) +      def test_clean_html(self):          self.assertEqual(clean_html('a:\nb'), 'a: b')          self.assertEqual(clean_html('a:\n   "b"'), 'a:    "b"') diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index b497da696..7b9afc36d 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -77,6 +77,11 @@ try:  except ImportError:  # Python 2      from urllib import urlretrieve as compat_urlretrieve +try: +    from html.parser import HTMLParser as compat_HTMLParser +except ImportError:  # Python 2 +    from HTMLParser import HTMLParser as compat_HTMLParser +  try:      from subprocess import DEVNULL @@ -540,6 +545,7 @@ else:      from tokenize import generate_tokens as compat_tokenize_tokenize  __all__ = [ +    'compat_HTMLParser',      'compat_HTTPError',      'compat_basestring',      'compat_chr', diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 210c47fce..a0234a3a8 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -35,6 +35,7 @@ import xml.etree.ElementTree  import zlib  from .compat import ( +    compat_HTMLParser,      compat_basestring,      compat_chr,      compat_etree_fromstring, @@ -272,6 +273,35 @@ def get_element_by_attribute(attribute, value, html):      return unescapeHTML(res) +class HTMLAttributeParser(compat_HTMLParser): +    """Trivial HTML parser to gather the attributes for a single element""" +    def __init__(self): +        self.attrs = { } +        compat_HTMLParser.__init__(self) + +    def handle_starttag(self, tag, attrs): +        self.attrs = dict(attrs) + +def extract_attributes(html_element): +    """Given a string for an HTML element such as +    <el +         a="foo" B="bar" c="&98;az" d=boz +         empty= 
noval entity="&amp;" +         sq='"' dq="'" +    > +    Decode and return a dictionary of attributes. +    { +        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz', +        'empty': '', 'noval': None, 'entity': '&', +        'sq': '"', 'dq': '\'' +    }. +    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions, +    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5. +    """ +    parser = HTMLAttributeParser() +    parser.feed(html_element) +    parser.close() +    return parser.attrs  def clean_html(html):      """Clean an HTML snippet into a readable string""" | 
