From d9d07a95815a992bf5f876a62f25c831eb3f32ac Mon Sep 17 00:00:00 2001 From: dirkf Date: Wed, 3 May 2023 12:06:34 +0100 Subject: [utils] Improve js_to_json, align with yt-dlp * support variable substitution, from https://github.com/yt-dlp/yt-dlp/pull/#521 etc, thanks ChillingPepper, Grub4k, pukkandan * improve escape handling, from https://github.com/yt-dlp/yt-dlp/pull/#521 thanks Grub4k * support template strings from https://github.com/yt-dlp/yt-dlp/pull/6623 thanks Grub4k * add limited `!` evaluation (eg, !!0 -> false, see tests) --- youtube_dl/utils.py | 114 ++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 88 insertions(+), 26 deletions(-) (limited to 'youtube_dl') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index b77a7fb0e..b05f65283 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -4365,46 +4365,108 @@ def strip_jsonp(code): r'\g<callback_data>', code) -def js_to_json(code): - COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*' +def js_to_json(code, *args, **kwargs): + + # vars is a dict of (var, val) pairs to substitute + vars = args[0] if len(args) > 0 else kwargs.get('vars', {}) + strict = kwargs.get('strict', False) + + STRING_QUOTES = '\'"`' + STRING_RE = '|'.join(r'{0}(?:\\.|[^\\{0}])*{0}'.format(q) for q in STRING_QUOTES) + COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n' SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE) INTEGER_TABLE = ( (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16), (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8), + (r'(?s)^(\d+){skip}:?$'.format(skip=SKIP_RE), 10), ) + # compat candidate + JSONDecodeError = json.JSONDecodeError if 'JSONDecodeError' in dir(json) else ValueError + + def process_escape(match): + JSON_PASSTHROUGH_ESCAPES = r'"\bfnrtu' + escape = match.group(1) or match.group(2) + + return ('\\' + escape if escape in JSON_PASSTHROUGH_ESCAPES + else '\\u00' if escape == 'x' + else '' if escape == '\n' + else escape) + + def template_substitute(match): + evaluated = 
js_to_json(match.group(1), vars, strict=strict) + if evaluated[0] == '"': + return json.loads(evaluated) + return evaluated def fix_kv(m): v = m.group(0) if v in ('true', 'false', 'null'): return v - elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',': - return "" - - if v[0] in ("'", '"'): - v = re.sub(r'(?s)\\.|"', lambda m: { - '"': '\\"', - "\\'": "'", - '\\\n': '', - '\\x': '\\u00', - }.get(m.group(0), m.group(0)), v[1:-1]) - else: - for regex, base in INTEGER_TABLE: - im = re.match(regex, v) - if im: - i = int(im.group(1), base) - return '"%d":' % i if v.endswith(':') else '%d' % i + elif v in ('undefined', 'void 0'): + return 'null' + elif v.startswith('/*') or v.startswith('//') or v == ',': + return '' + + if v[0] in STRING_QUOTES: + v = re.sub(r'(?s)\${([^}]+)}', template_substitute, v[1:-1]) if v[0] == '`' else v[1:-1] + escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v) + return '"{0}"'.format(escaped) + + inv = IDENTITY + im = re.split(r'^!+', v) + if len(im) > 1 and not im[-1].endswith(':'): + if (len(v) - len(im[1])) % 2 == 1: + inv = lambda x: 'true' if x == 0 else 'false' + else: + inv = lambda x: 'false' if x == 0 else 'true' + if not any(x for x in im): + return + v = im[-1] + + for regex, base in INTEGER_TABLE: + im = re.match(regex, v) + if im: + i = int(im.group(1), base) + return ('"%s":' if v.endswith(':') else '%s') % inv(i) + + if v in vars: + try: + if not strict: + json.loads(vars[v]) + except JSONDecodeError: + return inv(json.dumps(vars[v])) + else: + return inv(vars[v]) + + if not strict: + v = try_call(inv, args=(v,), default=v) + if v in ('true', 'false'): + return v + return '"{0}"'.format(v) + + raise ValueError('Unknown value: ' + v) + + def create_map(mobj): + return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars)))) - return '"%s"' % v + code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code) + if not strict: + code = re.sub(r'new Date\((".+")\)', r'\g<1>', code) + code = 
re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code) + code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code) + code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code) return re.sub(r'''(?sx) - "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"| - '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'| - {comment}|,(?={skip}[\]}}])| - (?:(?