From 2af12ad9d2c3e41d0e91fe6e2f35827469d587bf Mon Sep 17 00:00:00 2001 From: Thomas Christlieb Date: Sat, 11 Feb 2017 10:16:54 +0100 Subject: Introduce get_elements_by_class and get_elements_by_attribute utility functions --- youtube_dl/utils.py | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) (limited to 'youtube_dl') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 67a847eba..a81fe7d30 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -337,17 +337,30 @@ def get_element_by_id(id, html): def get_element_by_class(class_name, html): - return get_element_by_attribute( + """Return the content of the first tag with the specified class in the passed HTML document""" + retval = get_elements_by_class(class_name, html) + return retval[0] if retval else None + + +def get_element_by_attribute(attribute, value, html, escape_value=True): + retval = get_elements_by_attribute(attribute, value, html, escape_value) + return retval[0] if retval else None + + +def get_elements_by_class(class_name, html): + """Return the content of all tags with the specified class in the passed HTML document as a list""" + return get_elements_by_attribute( 'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name), html, escape_value=False) -def get_element_by_attribute(attribute, value, html, escape_value=True): +def get_elements_by_attribute(attribute, value, html, escape_value=True): """Return the content of the tag with the specified attribute in the passed HTML document""" value = re.escape(value) if escape_value else value - m = re.search(r'''(?xs) + retlist = [] + for m in re.finditer(r'''(?xs) <([a-zA-Z0-9:._-]+) (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*? \s+%s=['"]?%s['"]? @@ -355,16 +368,15 @@ def get_element_by_attribute(attribute, value, html, escape_value=True): \s*> (?P<content>.*?) </\1>
- ''' % (re.escape(attribute), value), html) + ''' % (re.escape(attribute), value), html): + res = m.group('content') - if not m: - return None - res = m.group('content') + if res.startswith('"') or res.startswith("'"): + res = res[1:-1] - if res.startswith('"') or res.startswith("'"): - res = res[1:-1] + retlist.append(unescapeHTML(res)) - return unescapeHTML(res) + return retlist class HTMLAttributeParser(compat_HTMLParser): -- cgit v1.2.3