diff options
author | remitamine <remitamine@gmail.com> | 2016-03-16 13:16:27 +0100 |
---|---|---|
committer | remitamine <remitamine@gmail.com> | 2016-03-16 13:16:27 +0100 |
commit | 83548824c29ccdf53a4659260aa3898939833882 (patch) | |
tree | ec72d6a8943e467ca0eda8c81f17e06de6740b88 /youtube_dl | |
parent | 354dbbd8808dc5e835c7042f84c175eb56e0bcfc (diff) | |
parent | 8bb56eeeea8154f811076c0a9093203fab224003 (diff) |
Merge pull request #8092 from bpfoley/twitter-thumbnail
[utils] Add extract_attributes for extracting html tag attributes
Diffstat (limited to 'youtube_dl')
-rw-r--r-- | youtube_dl/compat.py | 6 | ||||
-rw-r--r-- | youtube_dl/utils.py | 30 |
2 files changed, 36 insertions, 0 deletions
diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py
index 2771fb5fa..74702786a 100644
--- a/youtube_dl/compat.py
+++ b/youtube_dl/compat.py
@@ -77,6 +77,11 @@ try:
 except ImportError:  # Python 2
     from urllib import urlretrieve as compat_urlretrieve
 
+try:
+    from html.parser import HTMLParser as compat_HTMLParser
+except ImportError:  # Python 2
+    from HTMLParser import HTMLParser as compat_HTMLParser
+
 
 try:
     from subprocess import DEVNULL
@@ -543,6 +548,7 @@ else:
     from tokenize import generate_tokens as compat_tokenize_tokenize
 
 __all__ = [
+    'compat_HTMLParser',
     'compat_HTTPError',
     'compat_basestring',
     'compat_chr',
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 9fd0ec8d5..ec186918c 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -35,6 +35,7 @@ import xml.etree.ElementTree
 import zlib
 
 from .compat import (
+    compat_HTMLParser,
     compat_basestring,
     compat_chr,
     compat_etree_fromstring,
@@ -272,6 +273,35 @@ def get_element_by_attribute(attribute, value, html):
 
     return unescapeHTML(res)
 
+class HTMLAttributeParser(compat_HTMLParser):
+    """Trivial HTML parser to gather the attributes for a single element"""
+    def __init__(self):
+        self.attrs = { }
+        compat_HTMLParser.__init__(self)
+
+    def handle_starttag(self, tag, attrs):
+        self.attrs = dict(attrs)
+
+def extract_attributes(html_element):
+    """Given a string for an HTML element such as
+    <el
+         a="foo" B="bar" c="&98;az" d=boz
+         empty= noval entity="&amp;"
+         sq='"' dq="'"
+    >
+    Decode and return a dictionary of attributes.
+    {
+        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
+        'empty': '', 'noval': None, 'entity': '&',
+        'sq': '"', 'dq': '\''
+    }.
+    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
+    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
+    """
+    parser = HTMLAttributeParser()
+    parser.feed(html_element)
+    parser.close()
+    return parser.attrs
 
 def clean_html(html):
     """Clean an HTML snippet into a readable string"""