From 0254f1627487c137abd201dea230247de6cb7f87 Mon Sep 17 00:00:00 2001 From: Zenon Mousmoulas Date: Sun, 9 Jan 2022 20:14:56 +0200 Subject: [utils] Improve `get_elements_text_and_html_by_attribute` regex (#2280) Authored by: zmousm, pukkandan --- yt_dlp/utils.py | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) (limited to 'yt_dlp/utils.py') diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 826ab5d29..9a66de9f5 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -473,24 +473,23 @@ def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value attribute in the passed HTML document """ + value_quote_optional = '' if re.match(r'''[\s"'`=<>]''', value) else '?' + value = re.escape(value) if escape_value else value - retlist = [] - for m in re.finditer(r'''(?xs) + partial_element_re = r'''(?x) <(?P<tag>[a-zA-Z0-9:._-]+) - (?:\s+[a-zA-Z0-9_:.-]+(?:=\S*?|\s*=\s*(?:"[^"]*"|'[^']*')|))*? - \s+%(attribute)s(?:=%(value)s|\s*=\s*(?P<_q>['"]?)%(value)s(?P=_q)) - (?:\s+[a-zA-Z0-9_:.-]+(?:=\S*?|\s*=\s*(?:"[^"]*"|'[^']*')|))*? - \s*> - ''' % {'attribute': re.escape(attribute), 'value': value}, html): - content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():]) + (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)? + \s%(attribute)s\s*=\s*(?P<_q>['"]%(vqo)s)(?-x:%(value)s)(?P=_q) + ''' % {'attribute': re.escape(attribute), 'value': value, 'vqo': value_quote_optional} - retlist.append(( - unescapeHTML(re.sub(r'(?s)^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content)), - whole, - )) + for m in re.finditer(partial_element_re, html): + content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():]) - return retlist + yield ( + unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)), + whole + ) class HTMLBreakOnClosingTagParser(compat_HTMLParser): -- cgit v1.2.3