diff options
| author | Philipp Hagemeister <phihag@phihag.de> | 2014-11-04 23:33:43 +0100 | 
|---|---|---|
| committer | Philipp Hagemeister <phihag@phihag.de> | 2014-11-04 23:33:43 +0100 | 
| commit | 3828505646a58d5e19f0ffe77821650e25d72b3a (patch) | |
| tree | c3fdf3c08981dd94c5b32acee72f9780513a40e6 | |
| parent | 11fba1751d5a43491254a3f54fcd0903a40d1a35 (diff) | |
[utils] Use a regexp instead of HTMLParser for get_element_by_attribute
| -rw-r--r-- | youtube_dl/utils.py | 121 | 
1 file changed, 16 insertions, 105 deletions
| diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 16651bf11..7c0fb1592 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -152,86 +152,6 @@ def xpath_text(node, xpath, name=None, fatal=False):      return n.text -if sys.version_info < (2, 7): -    compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix - -class BaseHTMLParser(compat_html_parser.HTMLParser): -    def __init(self): -        compat_html_parser.HTMLParser.__init__(self) -        self.html = None - -    def loads(self, html): -        self.html = html -        self.feed(html) -        self.close() - -class AttrParser(BaseHTMLParser): -    """Modified HTMLParser that isolates a tag with the specified attribute""" -    def __init__(self, attribute, value): -        self.attribute = attribute -        self.value = value -        self.result = None -        self.started = False -        self.depth = {} -        self.watch_startpos = False -        self.error_count = 0 -        BaseHTMLParser.__init__(self) - -    def error(self, message): -        if self.error_count > 10 or self.started: -            raise compat_html_parser.HTMLParseError(message, self.getpos()) -        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line -        self.error_count += 1 -        self.goahead(1) - -    def handle_starttag(self, tag, attrs): -        attrs = dict(attrs) -        if self.started: -            self.find_startpos(None) -        if self.attribute in attrs and attrs[self.attribute] == self.value: -            self.result = [tag] -            self.started = True -            self.watch_startpos = True -        if self.started: -            if not tag in self.depth: self.depth[tag] = 0 -            self.depth[tag] += 1 - -    def handle_endtag(self, tag): -        if self.started: -            if tag in 
self.depth: self.depth[tag] -= 1 -            if self.depth[self.result[0]] == 0: -                self.started = False -                self.result.append(self.getpos()) - -    def find_startpos(self, x): -        """Needed to put the start position of the result (self.result[1]) -        after the opening tag with the requested id""" -        if self.watch_startpos: -            self.watch_startpos = False -            self.result.append(self.getpos()) -    handle_entityref = handle_charref = handle_data = handle_comment = \ -    handle_decl = handle_pi = unknown_decl = find_startpos - -    def get_result(self): -        if self.result is None: -            return None -        if len(self.result) != 3: -            return None -        lines = self.html.split('\n') -        lines = lines[self.result[1][0]-1:self.result[2][0]] -        lines[0] = lines[0][self.result[1][1]:] -        if len(lines) == 1: -            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]] -        lines[-1] = lines[-1][:self.result[2][1]] -        return '\n'.join(lines).strip() -# Hack for https://github.com/rg3/youtube-dl/issues/662 -if sys.version_info < (2, 7, 3): -    AttrParser.parse_endtag = (lambda self, i: -        i + len("</scr'+'ipt>") -        if self.rawdata[i:].startswith("</scr'+'ipt>") -        else compat_html_parser.HTMLParser.parse_endtag(self, i)) - -  def get_element_by_id(id, html):      """Return the content of the tag with the specified ID in the passed HTML document"""      return get_element_by_attribute("id", id, html) @@ -239,34 +159,25 @@ def get_element_by_id(id, html):  def get_element_by_attribute(attribute, value, html):      """Return the content of the tag with the specified attribute in the passed HTML document""" -    parser = AttrParser(attribute, value) -    try: -        parser.loads(html) -    except compat_html_parser.HTMLParseError: -        pass -    return parser.get_result() -class MetaParser(BaseHTMLParser): -    """ -    
Modified HTMLParser that isolates a meta tag with the specified name  -    attribute. -    """ -    def __init__(self, name): -        BaseHTMLParser.__init__(self) -        self.name = name -        self.content = None -        self.result = None - -    def handle_starttag(self, tag, attrs): -        if tag != 'meta': -            return -        attrs = dict(attrs) -        if attrs.get('name') == self.name: -            self.result = attrs.get('content') +    m = re.search(r'''(?xs) +        <([a-zA-Z0-9:._-]+) +         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*? +         \s+%s=['"]?%s['"]? +         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*? +        \s*> +        (?P<content>.*?) +        </\1> +    ''' % (re.escape(attribute), re.escape(value)), html) + +    if not m: +        return None +    res = m.group('content') -    def get_result(self): -        return self.result +    if res.startswith('"') or res.startswith("'"): +        res = res[1:-1] +    return unescapeHTML(res)  def clean_html(html): | 
