Merge pull request #8092 from bpfoley/twitter-thumbnail

[utils] Add extract_attributes for extracting html tag attributes
author: remitamine <remitamine@gmail.com> 2016-03-16 13:16:27 +0100
committer: remitamine <remitamine@gmail.com> 2016-03-16 13:16:27 +0100
commit: 83548824c29ccdf53a4659260aa3898939833882 (patch)
tree: ec72d6a8943e467ca0eda8c81f17e06de6740b88 /youtube_dl
parent: 354dbbd8808dc5e835c7042f84c175eb56e0bcfc (diff)
parent: 8bb56eeeea8154f811076c0a9093203fab224003 (diff)
2 files changed, 36 insertions, 0 deletions
diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py
index 2771fb5fa..74702786a 100644
--- a/youtube_dl/compat.py
+++ b/youtube_dl/compat.py
@@ -77,6 +77,11 @@ try:
 except ImportError:  # Python 2
     from urllib import urlretrieve as compat_urlretrieve
 
+try:
+    from html.parser import HTMLParser as compat_HTMLParser
+except ImportError:  # Python 2
+    from HTMLParser import HTMLParser as compat_HTMLParser
+
 
 try:
     from subprocess import DEVNULL
@@ -543,6 +548,7 @@ else:
     from tokenize import generate_tokens as compat_tokenize_tokenize
 
 __all__ = [
+    'compat_HTMLParser',
     'compat_HTTPError',
     'compat_basestring',
     'compat_chr',
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 9fd0ec8d5..ec186918c 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -35,6 +35,7 @@ import xml.etree.ElementTree
 import zlib
 
 from .compat import (
+    compat_HTMLParser,
     compat_basestring,
     compat_chr,
     compat_etree_fromstring,
@@ -272,6 +273,35 @@ def get_element_by_attribute(attribute, value, html):
 
     return unescapeHTML(res)
 
+class HTMLAttributeParser(compat_HTMLParser):
+    """Trivial HTML parser to gather the attributes for a single element"""
+    def __init__(self):
+        self.attrs = { }
+        compat_HTMLParser.__init__(self)
+
+    def handle_starttag(self, tag, attrs):
+        self.attrs = dict(attrs)
+
+def extract_attributes(html_element):
+    """Given a string for an HTML element such as
+    <el
+         a="foo" B="bar" c="&#98;az" d=boz
+         empty= noval entity="&amp;"
+         sq='"' dq="'"
+    >
+    Decode and return a dictionary of attributes.
+    {
+        'a': 'foo', 'b': 'bar', 'c': 'baz', 'd': 'boz',
+        'empty': '', 'noval': None, 'entity': '&',
+        'sq': '"', 'dq': '\''
+    }.
+    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
+    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
+    """
+    parser = HTMLAttributeParser()
+    parser.feed(html_element)
+    parser.close()
+    return parser.attrs
 
 def clean_html(html):
     """Clean an HTML snippet into a readable string"""
author	remitamine <remitamine@gmail.com>	2016-03-16 13:16:27 +0100
committer	remitamine <remitamine@gmail.com>	2016-03-16 13:16:27 +0100
commit	83548824c29ccdf53a4659260aa3898939833882 (patch)
tree	ec72d6a8943e467ca0eda8c81f17e06de6740b88 /youtube_dl
parent	354dbbd8808dc5e835c7042f84c175eb56e0bcfc (diff)
parent	8bb56eeeea8154f811076c0a9093203fab224003 (diff)