diff options
Diffstat (limited to 'youtube_dl/jsinterp.py')
-rw-r--r-- | youtube_dl/jsinterp.py | 140 |
1 files changed, 121 insertions, 19 deletions
diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index 7835187f5..69c8f77ca 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -1,10 +1,12 @@ # coding: utf-8 from __future__ import unicode_literals +import calendar import itertools import json import operator import re +import time from functools import update_wrapper, wraps @@ -12,8 +14,10 @@ from .utils import ( error_to_compat_str, ExtractorError, float_or_none, + int_or_none, js_to_json, remove_quotes, + str_or_none, unified_timestamp, variadic, write_string, @@ -24,6 +28,8 @@ from .compat import ( compat_collections_chain_map as ChainMap, compat_contextlib_suppress, compat_filter as filter, + compat_int, + compat_integer_types, compat_itertools_zip_longest as zip_longest, compat_map as map, compat_numeric_types, @@ -70,14 +76,27 @@ class JS_Undefined(object): pass -def _js_bit_op(op): +def _js_bit_op(op, is_shift=False): - def zeroise(x): - return 0 if x in (None, JS_Undefined, _NaN, _Infinity) else x + def zeroise(x, is_shift_arg=False): + if isinstance(x, compat_integer_types): + return (x % 32) if is_shift_arg else (x & 0xffffffff) + try: + x = float(x) + if is_shift_arg: + x = int(x % 32) + elif x < 0: + x = -compat_int(-x % 0xffffffff) + else: + x = compat_int(x % 0xffffffff) + except (ValueError, TypeError): + # also here for int(NaN), including float('inf') % 32 + x = 0 + return x @wraps_op(op) def wrapped(a, b): - return op(zeroise(a), zeroise(b)) & 0xffffffff + return op(zeroise(a), zeroise(b, is_shift)) & 0xffffffff return wrapped @@ -135,6 +154,7 @@ def _js_to_primitive(v): ) +# more exact: yt-dlp/yt-dlp#12110 def _js_toString(v): return ( 'undefined' if v is JS_Undefined @@ -143,7 +163,7 @@ def _js_toString(v): else 'null' if v is None # bool <= int: do this first else ('false', 'true')[v] if isinstance(v, bool) - else '{0:.7f}'.format(v).rstrip('.0') if isinstance(v, compat_numeric_types) + else re.sub(r'(?<=\d)\.?0*$', '', '{0:.7f}'.format(v)) if isinstance(v, compat_numeric_types) else _js_to_primitive(v)) @@ -253,8 +273,8 @@ def _js_typeof(expr): # avoid dict to maintain order # definition None => Defined in JSInterpreter._operator _OPERATORS = ( - ('>>', _js_bit_op(operator.rshift)), - ('<<', _js_bit_op(operator.lshift)), + ('>>', _js_bit_op(operator.rshift, True)), + ('<<', _js_bit_op(operator.lshift, True)), ('+', _js_add), ('-', _js_arith_op(operator.sub)), ('*', _js_arith_op(operator.mul)), @@ -389,6 +409,7 @@ class JSInterpreter(object): class Exception(ExtractorError): def __init__(self, msg, *args, **kwargs): expr = kwargs.pop('expr', None) + msg = str_or_none(msg, default='"None"') if expr is not None: msg = '{0} in: {1!r:.100}'.format(msg.rstrip(), expr) super(JSInterpreter.Exception, self).__init__(msg, *args, **kwargs) @@ -416,6 +437,7 @@ class JSInterpreter(object): flags, _ = self.regex_flags(flags) # First, avoid https://github.com/python/cpython/issues/74534 self.__self = None + pattern_txt = str_or_none(pattern_txt) or '(?:)' self.__pattern_txt = pattern_txt.replace('[[', r'[\[') self.__flags = flags @@ -460,6 +482,73 @@ class JSInterpreter(object): flags |= cls.RE_FLAGS[ch] return flags, expr[idx + 1:] + class JS_Date(object): + _t = None + + @staticmethod + def __ymd_etc(*args, **kw_is_utc): + # args: year, monthIndex, day, hours, minutes, seconds, milliseconds + is_utc = kw_is_utc.get('is_utc', False) + + args = list(args[:7]) + args += [0] * (9 - len(args)) + args[1] += 1 # month 0..11 -> 1..12 + ms = args[6] + for i in range(6, 9): + args[i] = -1 # don't know + if is_utc: + args[-1] = 1 + # TODO: [MDN] When a segment overflows or underflows its expected + # range, it usually "carries over to" or "borrows from" the higher segment. + try: + mktime = calendar.timegm if is_utc else time.mktime + return mktime(time.struct_time(args)) * 1000 + ms + except (OverflowError, ValueError): + return None + + @classmethod + def UTC(cls, *args): + t = cls.__ymd_etc(*args, is_utc=True) + return _NaN if t is None else t + + @staticmethod + def parse(date_str, **kw_is_raw): + is_raw = kw_is_raw.get('is_raw', False) + + t = unified_timestamp(str_or_none(date_str), False) + return int(t * 1000) if t is not None else t if is_raw else _NaN + + @staticmethod + def now(**kw_is_raw): + is_raw = kw_is_raw.get('is_raw', False) + + t = time.time() + return int(t * 1000) if t is not None else t if is_raw else _NaN + + def __init__(self, *args): + if not args: + args = [self.now(is_raw=True)] + if len(args) == 1: + if isinstance(args[0], JSInterpreter.JS_Date): + self._t = int_or_none(args[0].valueOf(), default=None) + else: + arg_type = _js_typeof(args[0]) + if arg_type == 'string': + self._t = self.parse(args[0], is_raw=True) + elif arg_type == 'number': + self._t = int(args[0]) + else: + self._t = self.__ymd_etc(*args) + + def toString(self): + try: + return time.strftime('%a %b %0d %Y %H:%M:%S %Z%z', self._t).rstrip() + except TypeError: + return "Invalid Date" + + def valueOf(self): + return _NaN if self._t is None else self._t + @classmethod def __op_chars(cls): op_chars = set(';,[') @@ -584,18 +673,21 @@ class JSInterpreter(object): except Exception as e: raise self.Exception('Failed to evaluate {left_val!r:.50} {op} {right_val!r:.50}'.format(**locals()), expr, cause=e) - def _index(self, obj, idx, allow_undefined=True): + def _index(self, obj, idx, allow_undefined=None): if idx == 'length' and isinstance(obj, list): return len(obj) try: return obj[int(idx)] if isinstance(obj, list) else obj[compat_str(idx)] - except (TypeError, KeyError, IndexError) as e: - if allow_undefined: - # when is not allowed? + except (TypeError, KeyError, IndexError, ValueError) as e: + # allow_undefined is None gives correct behaviour + if allow_undefined or ( + allow_undefined is None and not isinstance(e, TypeError)): return JS_Undefined raise self.Exception('Cannot get index {idx!r:.100}'.format(**locals()), expr=repr(obj), cause=e) def _dump(self, obj, namespace): + if obj is JS_Undefined: + return 'undefined' try: return json.dumps(obj) except TypeError: @@ -700,7 +792,7 @@ class JSInterpreter(object): new_kw, _, obj = expr.partition('new ') if not new_kw: - for klass, konstr in (('Date', lambda x: int(unified_timestamp(x, False) * 1000)), + for klass, konstr in (('Date', lambda *x: self.JS_Date(*x).valueOf()), ('RegExp', self.JS_RegExp), ('Error', self.Exception)): if not obj.startswith(klass + '('): @@ -948,6 +1040,10 @@ class JSInterpreter(object): left_val = self._index(left_val, idx) if isinstance(idx, float): idx = int(idx) + if isinstance(left_val, list) and len(left_val) <= int_or_none(idx, default=-1): + # JS Array is a sparsely assignable list + # TODO: handle extreme sparsity without memory bloat, eg using auxiliary dict + left_val.extend((idx - len(left_val) + 1) * [JS_Undefined]) left_val[idx] = self._operator( m.group('op'), self._index(left_val, idx) if m.group('op') else None, m.group('expr'), expr, local_vars, allow_recursion) @@ -1019,6 +1115,7 @@ class JSInterpreter(object): 'String': compat_str, 'Math': float, 'Array': list, + 'Date': self.JS_Date, } obj = local_vars.get(variable) if obj in (JS_Undefined, None): @@ -1071,6 +1168,8 @@ class JSInterpreter(object): assertion(len(argvals) == 2, 'takes two arguments') return argvals[0] ** argvals[1] raise self.Exception('Unsupported Math method ' + member, expr=expr) + elif obj is self.JS_Date: + return getattr(obj, member)(*argvals) if member == 'split': assertion(len(argvals) <= 2, 'takes at most two arguments') @@ -1111,9 +1210,10 @@ class JSInterpreter(object): elif member == 'join': assertion(isinstance(obj, list), 'must be applied on a list') assertion(len(argvals) <= 1, 'takes at most one argument') - return (',' if len(argvals) == 0 else argvals[0]).join( - ('' if x in (None, JS_Undefined) else _js_toString(x)) - for x in obj) + return (',' if len(argvals) == 0 or argvals[0] in (None, JS_Undefined) + else argvals[0]).join( + ('' if x in (None, JS_Undefined) else _js_toString(x)) + for x in obj) elif member == 'reverse': assertion(not argvals, 'does not take any arguments') obj.reverse() @@ -1271,19 +1371,21 @@ class JSInterpreter(object): code, _ = self._separate_at_paren(func_m.group('code')) # refine the match return self.build_arglist(func_m.group('args')), code - def extract_function(self, funcname): + def extract_function(self, funcname, *global_stack): return function_with_repr( - self.extract_function_from_code(*self.extract_function_code(funcname)), + self.extract_function_from_code(*itertools.chain( + self.extract_function_code(funcname), global_stack)), 'F<%s>' % (funcname,)) def extract_function_from_code(self, argnames, code, *global_stack): local_vars = {} + start = None while True: - mobj = re.search(r'function\((?P<args>[^)]*)\)\s*{', code) + mobj = re.search(r'function\((?P<args>[^)]*)\)\s*{', code[start:]) if mobj is None: break - start, body_start = mobj.span() + start, body_start = ((start or 0) + x for x in mobj.span()) body, remaining = self._separate_at_paren(code[body_start - 1:]) name = self._named_object(local_vars, self.extract_function_from_code( [x.strip() for x in mobj.group('args').split(',')], |