diff options
| author | Philipp Hagemeister <phihag@phihag.de> | 2014-03-30 07:02:58 +0200 | 
|---|---|---|
| committer | Philipp Hagemeister <phihag@phihag.de> | 2014-03-30 07:02:58 +0200 | 
| commit | 2b25cb5d7693b62736d4cdfa656289cc429c4c81 (patch) | |
| tree | 3604fbd6cf32550b33fc826c03d5d9af753bc5c0 | |
| parent | 62fec3b2fffd12949da6fe057ce08d5bab2b7db5 (diff) | |
[youtube] Move JavaScript interpreter into its own module
| -rw-r--r-- | youtube_dl/extractor/youtube.py | 112 | ||||
| -rw-r--r-- | youtube_dl/jsinterp.py | 113 | 
2 files changed, 118 insertions, 107 deletions
| diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 3a3a5a39e..2d1a19123 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -14,6 +14,7 @@ import zlib  from .common import InfoExtractor, SearchInfoExtractor  from .subtitles import SubtitlesInfoExtractor +from ..jsinterp import JSInterpreter  from ..utils import (      compat_chr,      compat_parse_qs, @@ -438,113 +439,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):      def _parse_sig_js(self, jscode):          funcname = self._search_regex(              r'signature=([a-zA-Z]+)', jscode, -            u'Initial JS player signature function name') - -        functions = {} - -        def argidx(varname): -            return string.lowercase.index(varname) - -        def interpret_statement(stmt, local_vars, allow_recursion=20): -            if allow_recursion < 0: -                raise ExtractorError(u'Recursion limit reached') - -            if stmt.startswith(u'var '): -                stmt = stmt[len(u'var '):] -            ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' + -                             r'=(?P<expr>.*)$', stmt) -            if ass_m: -                if ass_m.groupdict().get('index'): -                    def assign(val): -                        lvar = local_vars[ass_m.group('out')] -                        idx = interpret_expression(ass_m.group('index'), -                                                   local_vars, allow_recursion) -                        assert isinstance(idx, int) -                        lvar[idx] = val -                        return val -                    expr = ass_m.group('expr') -                else: -                    def assign(val): -                        local_vars[ass_m.group('out')] = val -                        return val -                    expr = ass_m.group('expr') -            elif stmt.startswith(u'return '): -                assign = lambda v: v -                expr = stmt[len(u'return '):] -            else: -                raise ExtractorError( -                    u'Cannot determine left side of statement in %r' % stmt) - -            v = interpret_expression(expr, local_vars, allow_recursion) -            return assign(v) - -        def interpret_expression(expr, local_vars, allow_recursion): -            if expr.isdigit(): -                return int(expr) - -            if expr.isalpha(): -                return local_vars[expr] - -            m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr) -            if m: -                member = m.group('member') -                val = local_vars[m.group('in')] -                if member == 'split("")': -                    return list(val) -                if member == 'join("")': -                    return u''.join(val) -                if member == 'length': -                    return len(val) -                if member == 'reverse()': -                    return val[::-1] -                slice_m = re.match(r'slice\((?P<idx>.*)\)', member) -                if slice_m: -                    idx = interpret_expression( -                        slice_m.group('idx'), local_vars, allow_recursion-1) -                    return val[idx:] - -            m = re.match( -                r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr) -            if m: -                val = local_vars[m.group('in')] -                idx = interpret_expression(m.group('idx'), local_vars, -                                           allow_recursion-1) -                return val[idx] - -            m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr) -            if m: -                a = interpret_expression(m.group('a'), -                                         local_vars, allow_recursion) -                b = interpret_expression(m.group('b'), -                                         local_vars, allow_recursion) -                return a % b - -            m = re.match( -                r'^(?P<func>[a-zA-Z$]+)\((?P<args>[a-z0-9,]+)\)$', expr) -            if m: -                fname = m.group('func') -                if fname not in functions: -                    functions[fname] = extract_function(fname) -                argvals = [int(v) if v.isdigit() else local_vars[v] -                           for v in m.group('args').split(',')] -                return functions[fname](argvals) -            raise ExtractorError(u'Unsupported JS expression %r' % expr) - -        def extract_function(funcname): -            func_m = re.search( -                r'function ' + re.escape(funcname) + -                r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}', -                jscode) -            argnames = func_m.group('args').split(',') - -            def resf(args): -                local_vars = dict(zip(argnames, args)) -                for stmt in func_m.group('code').split(';'): -                    res = interpret_statement(stmt, local_vars) -                return res -            return resf - -        initial_function = extract_function(funcname) +             u'Initial JS player signature function name') + +        jsi = JSInterpreter(jscode) +        initial_function = jsi.extract_function(funcname)          return lambda s: initial_function([s])      def _parse_sig_swf(self, file_contents): diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py new file mode 100644 index 000000000..129a4027b --- /dev/null +++ b/youtube_dl/jsinterp.py @@ -0,0 +1,113 @@ +from __future__ import unicode_literals + +import re + +from .utils import ( +    ExtractorError, +) + + +class JSInterpreter(object): +    def __init__(self, code): +        self.code = code +        self._functions = {} + +    def interpret_statement(self, stmt, local_vars, allow_recursion=20): +        if allow_recursion < 0: +            raise ExtractorError('Recursion limit reached') + +        if stmt.startswith('var '): +            stmt = stmt[len('var '):] +        ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' + +                         r'=(?P<expr>.*)$', stmt) +        if ass_m: +            if ass_m.groupdict().get('index'): +                def assign(val): +                    lvar = local_vars[ass_m.group('out')] +                    idx = self.interpret_expression( +                        ass_m.group('index'), local_vars, allow_recursion) +                    assert isinstance(idx, int) +                    lvar[idx] = val +                    return val +                expr = ass_m.group('expr') +            else: +                def assign(val): +                    local_vars[ass_m.group('out')] = val +                    return val +                expr = ass_m.group('expr') +        elif stmt.startswith('return '): +            assign = lambda v: v +            expr = stmt[len('return '):] +        else: +            raise ExtractorError( +                'Cannot determine left side of statement in %r' % stmt) + +        v = self.interpret_expression(expr, local_vars, allow_recursion) +        return assign(v) + +    def interpret_expression(self, expr, local_vars, allow_recursion): +        if expr.isdigit(): +            return int(expr) + +        if expr.isalpha(): +            return local_vars[expr] + +        m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr) +        if m: +            member = m.group('member') +            val = local_vars[m.group('in')] +            if member == 'split("")': +                return list(val) +            if member == 'join("")': +                return u''.join(val) +            if member == 'length': +                return len(val) +            if member == 'reverse()': +                return val[::-1] +            slice_m = re.match(r'slice\((?P<idx>.*)\)', member) +            if slice_m: +                idx = self.interpret_expression( +                    slice_m.group('idx'), local_vars, allow_recursion - 1) +                return val[idx:] + +        m = re.match( +            r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr) +        if m: +            val = local_vars[m.group('in')] +            idx = self.interpret_expression( +                m.group('idx'), local_vars, allow_recursion - 1) +            return val[idx] + +        m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr) +        if m: +            a = self.interpret_expression( +                m.group('a'), local_vars, allow_recursion) +            b = self.interpret_expression( +                m.group('b'), local_vars, allow_recursion) +            return a % b + +        m = re.match( +            r'^(?P<func>[a-zA-Z$]+)\((?P<args>[a-z0-9,]+)\)$', expr) +        if m: +            fname = m.group('func') +            if fname not in self._functions: +                self._functions[fname] = self.extract_function(fname) +            argvals = [int(v) if v.isdigit() else local_vars[v] +                       for v in m.group('args').split(',')] +            return self._functions[fname](argvals) +        raise ExtractorError('Unsupported JS expression %r' % expr) + +    def extract_function(self, funcname): +        func_m = re.search( +            r'function ' + re.escape(funcname) + +            r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}', +            self.code) +        argnames = func_m.group('args').split(',') + +        def resf(args): +            local_vars = dict(zip(argnames, args)) +            for stmt in func_m.group('code').split(';'): +                res = self.interpret_statement(stmt, local_vars) +            return res +        return resf + | 
