aboutsummaryrefslogtreecommitdiff
path: root/youtube_dl/utils.py
diff options
context:
space:
mode:
author: Brian Foley <bpfoley@gmail.com>, 2016-01-02 19:49:59 +0000
committer: Brian Foley <bpfoley@gmail.com>, 2016-03-03 10:11:37 +0000
commit: 8bb56eeeea8154f811076c0a9093203fab224003 (patch)
tree: 4bfe429dc5b46b75a6ca4b6b19f411891de0686f /youtube_dl/utils.py
parent: 03879ff0547b6d1b96c530075cd99f99b8c74a2b (diff)
download: youtube-dl-8bb56eeeea8154f811076c0a9093203fab224003.tar.xz
[utils] Add extract_attributes for extracting html tag attributes
This is much more robust than just using regexps, and handles all the common scenarios, such as empty/no values, repeated attributes, entity decoding, mixed case names, and the different possible value quoting schemes.
Diffstat (limited to 'youtube_dl/utils.py')
-rw-r--r-- youtube_dl/utils.py 30
1 files changed, 30 insertions, 0 deletions
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 210c47fce..a0234a3a8 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -35,6 +35,7 @@ import xml.etree.ElementTree
import zlib
from .compat import (
+ compat_HTMLParser,
compat_basestring,
compat_chr,
compat_etree_fromstring,
@@ -272,6 +273,35 @@ def get_element_by_attribute(attribute, value, html):
return unescapeHTML(res)
+class HTMLAttributeParser(compat_HTMLParser):
+    """Trivial HTML parser to gather the attributes for a single element.
+
+    After markup has been fed to the parser, ``attrs`` holds a dict built
+    from the attribute list of the most recently handled start tag; it
+    stays an empty dict if no start tag was handled.
+    """
+
+    def __init__(self):
+        # Give ``attrs`` its default before the base initialiser runs, so
+        # the attribute exists even when no start tag is ever parsed.
+        self.attrs = {}
+        # Explicit base-class call, kept as in the original
+        # (NOTE(review): presumably because HTMLParser is an old-style
+        # class on Python 2, where super() cannot be used -- confirm).
+        compat_HTMLParser.__init__(self)
+
+    def handle_starttag(self, tag, attrs):
+        """Record the attributes of the start tag that was just parsed.
+
+        ``attrs`` is converted with ``dict()``, so it must be an iterable
+        of (name, value) pairs and a repeated attribute name keeps its
+        last value.  ``tag`` is ignored.  Every call replaces the result
+        of the previous one.
+        """
+        self.attrs = dict(attrs)
+
+def extract_attributes(html_element):
+    """Decode the attributes of a single HTML element into a dictionary.
+
+    Given a string for an HTML element such as
+    <el
+         a="foo" B="bar" c="&#98;az" d=boz
+         empty= noval entity="&amp;"
+         sq='"' dq="'"
+    >
+    decode and return a dictionary of attributes:
+    {
+        'a': 'foo', 'b': 'bar', 'c': 'baz', 'd': 'boz',
+        'empty': '', 'noval': None, 'entity': '&',
+        'sq': '"', 'dq': "'"
+    }.
+    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
+    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
+    """
+    parser = HTMLAttributeParser()
+    parser.feed(html_element)
+    # close() makes the parser process any input it is still buffering,
+    # so the start tag is handled before ``attrs`` is read.
+    parser.close()
+    return parser.attrs
def clean_html(html):
"""Clean an HTML snippet into a readable string"""