diff options
author | Nick Daniels <nick.daniels@forward.co.uk> | 2012-12-19 14:21:14 +0000 |
---|---|---|
committer | Philipp Hagemeister <phihag@phihag.de> | 2012-12-20 13:27:38 +0100 |
commit | 43e8fafd49f94ebf4776c84697e4b815750ec701 (patch) | |
tree | e6e2e2ac9f7769d7f177ac59c7ca77f486351f1c /youtube_dl | |
parent | 314d506b96d87a212e7e57eaa4d86514579c1c12 (diff) |
Refactor IDParser to search for elements by any attribute, not just ID
Diffstat (limited to 'youtube_dl')
-rw-r--r-- | youtube_dl/utils.py | 19 |
1 file changed, 12 insertions, 7 deletions
```diff
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 4e64f327a..a5196b0ae 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -214,10 +214,11 @@ def htmlentity_transform(matchobj):
     return (u'&%s;' % entity)
 
 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
-class IDParser(compat_html_parser.HTMLParser):
-    """Modified HTMLParser that isolates a tag with the specified id"""
-    def __init__(self, id):
-        self.id = id
+class AttrParser(compat_html_parser.HTMLParser):
+    """Modified HTMLParser that isolates a tag with the specified attribute"""
+    def __init__(self, attribute, value):
+        self.attribute = attribute
+        self.value = value
         self.result = None
         self.started = False
         self.depth = {}
@@ -242,7 +243,7 @@ class IDParser(compat_html_parser.HTMLParser):
         attrs = dict(attrs)
         if self.started:
             self.find_startpos(None)
-        if 'id' in attrs and attrs['id'] == self.id:
+        if self.attribute in attrs and attrs[self.attribute] == self.value:
             self.result = [tag]
             self.started = True
             self.watch_startpos = True
@@ -280,8 +281,12 @@ class IDParser(compat_html_parser.HTMLParser):
         return '\n'.join(lines).strip()
 
 def get_element_by_id(id, html):
-    """Return the content of the tag with the specified id in the passed HTML document"""
-    parser = IDParser(id)
+    """Return the content of the tag with the specified ID in the passed HTML document"""
+    return get_element_by_attribute("id", id, html)
+
+def get_element_by_attribute(attribute, value, html):
+    """Return the content of the tag with the specified attribute in the passed HTML document"""
+    parser = AttrParser(attribute, value)
     try:
         parser.loads(html)
     except compat_html_parser.HTMLParseError:
```