diff options
| field | value | date |
|---|---|---|
| author | dirkf <fieldhouse@gmx.net> | 2024-12-12 04:13:06 +0000 |
| committer | dirkf <fieldhouse@gmx.net> | 2024-12-16 12:38:51 +0000 |
| commit | 94fd7746084d87a43e34b094c5db1325f91ce053 (patch) | |
| tree | 4538e3e7186d9b2295c7337b19a0ef81991b9c44 | |
| parent | 5dee6213edddb5ea00775db6b3b73f8355144485 (diff) | |
[jsinterp] Fix and improve split/join
* improve split/join edge cases
* correctly implement regex split (not like re.split)
| -rw-r--r-- | test/test_jsinterp.py | 19 | ||||
| -rw-r--r-- | youtube_dl/jsinterp.py | 48 | 
2 files changed, 62 insertions, 5 deletions
| diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index 52eaf1ed8..b6e87e9f1 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -483,6 +483,13 @@ class TestJSInterpreter(unittest.TestCase):              self._test(jsi, 't-e-s-t', args=[test_input, '-'])              self._test(jsi, '', args=[[], '-']) +        self._test('function f(){return ' +                   '[1, 1.0, "abc", {a: 1}, null, undefined, Infinity, NaN].join()}', +                   '1,1,abc,[object Object],,,Infinity,NaN') +        self._test('function f(){return ' +                   '[1, 1.0, "abc", {a: 1}, null, undefined, Infinity, NaN].join("~")}', +                   '1~1~abc~[object Object]~~~Infinity~NaN') +      def test_split(self):          test_result = list('test')          tests = [ @@ -496,6 +503,18 @@ class TestJSInterpreter(unittest.TestCase):              self._test(jsi, test_result, args=['t-e-s-t', '-'])              self._test(jsi, [''], args=['', '-'])              self._test(jsi, [], args=['', '']) +        # RegExp split +        self._test('function f(){return "test".split(/(?:)/)}', +                   ['t', 'e', 's', 't']) +        self._test('function f(){return "t-e-s-t".split(/[es-]+/)}', +                   ['t', 't']) +        # from MDN: surrogate pairs aren't handled: case 1 fails +        # self._test('function f(){return "😄😄".split(/(?:)/)}', +        #            ['\ud83d', '\ude04', '\ud83d', '\ude04']) +        # case 2 beats Py3.2: it gets the case 1 result +        if sys.version_info >= (2, 6) and not ((3, 0) <= sys.version_info < (3, 3)): +            self._test('function f(){return "😄😄".split(/(?:)/u)}', +                       ['😄', '😄'])      def test_slice(self):          self._test('function f(){return [0, 1, 2, 3, 4, 5, 6, 7, 8].slice()}', [0, 1, 2, 3, 4, 5, 6, 7, 8]) diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index 73934a6af..bec959946 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ 
-397,6 +397,9 @@ class JSInterpreter(object):          RE_FLAGS = {              # special knowledge: Python's re flags are bitmask values, current max 128              # invent new bitmask values well above that for literal parsing +            # JS 'u' flag is effectively always set (surrogate pairs aren't seen), +            # but \u{...} and \p{...} escapes aren't handled); no additional JS 'v' +            # features are supported              # TODO: execute matches with these flags (remaining: d, y)              'd': 1024,  # Generate indices for substring matches              'g': 2048,  # Global search @@ -404,6 +407,7 @@ class JSInterpreter(object):              'm': re.M,  # Multi-line search              's': re.S,  # Allows . to match newline characters              'u': re.U,  # Treat a pattern as a sequence of unicode code points +            'v': re.U,  # Like 'u' with extended character class and \p{} syntax              'y': 4096,  # Perform a "sticky" search that matches starting at the current position in the target string          } @@ -1047,13 +1051,47 @@ class JSInterpreter(object):                      raise self.Exception('Unsupported Math method ' + member, expr=expr)                  if member == 'split': -                    assertion(argvals, 'takes one or more arguments') -                    assertion(len(argvals) == 1, 'with limit argument is not implemented') -                    return obj.split(argvals[0]) if argvals[0] else list(obj) +                    assertion(len(argvals) <= 2, 'takes at most two arguments') +                    if len(argvals) > 1: +                        limit = argvals[1] +                        assertion(isinstance(limit, int) and limit >= 0, 'integer limit >= 0') +                        if limit == 0: +                            return [] +                    else: +                        limit = 0 +                    if len(argvals) == 0: +                        argvals = [JS_Undefined] +        
            elif isinstance(argvals[0], self.JS_RegExp): +                        # avoid re.split(), similar but not enough + +                        def where(): +                            for m in argvals[0].finditer(obj): +                                yield m.span(0) +                            yield (None, None) + +                        def splits(limit=limit): +                            i = 0 +                            for j, jj in where(): +                                if j == jj == 0: +                                    continue +                                if j is None and i >= len(obj): +                                    break +                                yield obj[i:j] +                                if jj is None or limit == 1: +                                    break +                                limit -= 1 +                                i = jj + +                        return list(splits()) +                    return ( +                        obj.split(argvals[0], limit - 1) if argvals[0] and argvals[0] != JS_Undefined +                        else list(obj)[:limit or None])                  elif member == 'join':                      assertion(isinstance(obj, list), 'must be applied on a list') -                    assertion(len(argvals) == 1, 'takes exactly one argument') -                    return argvals[0].join(obj) +                    assertion(len(argvals) <= 1, 'takes at most one argument') +                    return (',' if len(argvals) == 0 else argvals[0]).join( +                        ('' if x in (None, JS_Undefined) else _js_toString(x)) +                        for x in obj)                  elif member == 'reverse':                      assertion(not argvals, 'does not take any arguments')                      obj.reverse() | 
