aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPhilipp Hagemeister <phihag@phihag.de>2014-03-30 07:02:58 +0200
committerPhilipp Hagemeister <phihag@phihag.de>2014-03-30 07:02:58 +0200
commit2b25cb5d7693b62736d4cdfa656289cc429c4c81 (patch)
tree3604fbd6cf32550b33fc826c03d5d9af753bc5c0
parent62fec3b2fffd12949da6fe057ce08d5bab2b7db5 (diff)
downloadyoutube-dl-2b25cb5d7693b62736d4cdfa656289cc429c4c81.tar.xz
[youtube] Move JavaScript interpreter into its own module
-rw-r--r--youtube_dl/extractor/youtube.py112
-rw-r--r--youtube_dl/jsinterp.py113
2 files changed, 118 insertions, 107 deletions
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 3a3a5a39e..2d1a19123 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -14,6 +14,7 @@ import zlib
from .common import InfoExtractor, SearchInfoExtractor
from .subtitles import SubtitlesInfoExtractor
+from ..jsinterp import JSInterpreter
from ..utils import (
compat_chr,
compat_parse_qs,
@@ -438,113 +439,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
def _parse_sig_js(self, jscode):
funcname = self._search_regex(
r'signature=([a-zA-Z]+)', jscode,
- u'Initial JS player signature function name')
-
- functions = {}
-
- def argidx(varname):
- return string.lowercase.index(varname)
-
- def interpret_statement(stmt, local_vars, allow_recursion=20):
- if allow_recursion < 0:
- raise ExtractorError(u'Recursion limit reached')
-
- if stmt.startswith(u'var '):
- stmt = stmt[len(u'var '):]
- ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
- r'=(?P<expr>.*)$', stmt)
- if ass_m:
- if ass_m.groupdict().get('index'):
- def assign(val):
- lvar = local_vars[ass_m.group('out')]
- idx = interpret_expression(ass_m.group('index'),
- local_vars, allow_recursion)
- assert isinstance(idx, int)
- lvar[idx] = val
- return val
- expr = ass_m.group('expr')
- else:
- def assign(val):
- local_vars[ass_m.group('out')] = val
- return val
- expr = ass_m.group('expr')
- elif stmt.startswith(u'return '):
- assign = lambda v: v
- expr = stmt[len(u'return '):]
- else:
- raise ExtractorError(
- u'Cannot determine left side of statement in %r' % stmt)
-
- v = interpret_expression(expr, local_vars, allow_recursion)
- return assign(v)
-
- def interpret_expression(expr, local_vars, allow_recursion):
- if expr.isdigit():
- return int(expr)
-
- if expr.isalpha():
- return local_vars[expr]
-
- m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
- if m:
- member = m.group('member')
- val = local_vars[m.group('in')]
- if member == 'split("")':
- return list(val)
- if member == 'join("")':
- return u''.join(val)
- if member == 'length':
- return len(val)
- if member == 'reverse()':
- return val[::-1]
- slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
- if slice_m:
- idx = interpret_expression(
- slice_m.group('idx'), local_vars, allow_recursion-1)
- return val[idx:]
-
- m = re.match(
- r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
- if m:
- val = local_vars[m.group('in')]
- idx = interpret_expression(m.group('idx'), local_vars,
- allow_recursion-1)
- return val[idx]
-
- m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
- if m:
- a = interpret_expression(m.group('a'),
- local_vars, allow_recursion)
- b = interpret_expression(m.group('b'),
- local_vars, allow_recursion)
- return a % b
-
- m = re.match(
- r'^(?P<func>[a-zA-Z$]+)\((?P<args>[a-z0-9,]+)\)$', expr)
- if m:
- fname = m.group('func')
- if fname not in functions:
- functions[fname] = extract_function(fname)
- argvals = [int(v) if v.isdigit() else local_vars[v]
- for v in m.group('args').split(',')]
- return functions[fname](argvals)
- raise ExtractorError(u'Unsupported JS expression %r' % expr)
-
- def extract_function(funcname):
- func_m = re.search(
- r'function ' + re.escape(funcname) +
- r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
- jscode)
- argnames = func_m.group('args').split(',')
-
- def resf(args):
- local_vars = dict(zip(argnames, args))
- for stmt in func_m.group('code').split(';'):
- res = interpret_statement(stmt, local_vars)
- return res
- return resf
-
- initial_function = extract_function(funcname)
+ u'Initial JS player signature function name')
+
+ jsi = JSInterpreter(jscode)
+ initial_function = jsi.extract_function(funcname)
return lambda s: initial_function([s])
def _parse_sig_swf(self, file_contents):
diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py
new file mode 100644
index 000000000..129a4027b
--- /dev/null
+++ b/youtube_dl/jsinterp.py
@@ -0,0 +1,113 @@
+from __future__ import unicode_literals
+
+import re
+
+from .utils import (
+ ExtractorError,
+)
+
+
class JSInterpreter(object):
    """Interpreter for a very small subset of JavaScript.

    Only the constructs handled explicitly below are supported: ``var``
    declarations, plain and indexed assignments, ``return``, integer
    literals, variable lookups, ``split("")``, ``join("")``, ``length``,
    ``reverse()``, ``slice(n)``, indexing, the ``%`` operator and calls to
    other functions defined in the same code string.
    """

    def __init__(self, code):
        """Store the JavaScript source *code* that functions are read from."""
        self.code = code
        # Cache of already extracted functions, keyed by JS function name.
        self._functions = {}

    def interpret_statement(self, stmt, local_vars, allow_recursion=20):
        """Execute a single statement and return the value it produced.

        stmt is one statement without its terminating ``;``.  local_vars is
        the dict of local variables; assignments mutate it in place.
        Raises ExtractorError if the recursion budget is exhausted, if the
        statement is neither an assignment nor a ``return``, or if an
        indexed assignment uses a non-integer index.
        """
        if allow_recursion < 0:
            raise ExtractorError('Recursion limit reached')

        if stmt.startswith('var '):
            stmt = stmt[len('var '):]
        ass_m = re.match(
            r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
            r'=(?P<expr>.*)$', stmt)
        if ass_m:
            expr = ass_m.group('expr')
            if ass_m.group('index'):
                def assign(val):
                    lvar = local_vars[ass_m.group('out')]
                    idx = self.interpret_expression(
                        ass_m.group('index'), local_vars, allow_recursion)
                    # Explicit check instead of assert: asserts are
                    # stripped when Python runs with -O.
                    if not isinstance(idx, int):
                        raise ExtractorError(
                            'Non-integer index %r in statement %r'
                            % (idx, stmt))
                    lvar[idx] = val
                    return val
            else:
                def assign(val):
                    local_vars[ass_m.group('out')] = val
                    return val
        elif stmt.startswith('return '):
            assign = lambda v: v
            expr = stmt[len('return '):]
        else:
            raise ExtractorError(
                'Cannot determine left side of statement in %r' % stmt)

        # The right-hand side is evaluated before the assignment target.
        v = self.interpret_expression(expr, local_vars, allow_recursion)
        return assign(v)

    def interpret_expression(self, expr, local_vars, allow_recursion):
        """Evaluate the expression *expr* and return its value.

        Raises ExtractorError for any expression form that is not
        supported.
        """
        if expr.isdigit():
            return int(expr)

        if expr.isalpha():
            return local_vars[expr]

        m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
        if m:
            member = m.group('member')
            val = local_vars[m.group('in')]
            if member == 'split("")':
                return list(val)
            if member == 'join("")':
                return ''.join(val)
            if member == 'length':
                return len(val)
            if member == 'reverse()':
                return val[::-1]
            slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
            if slice_m:
                idx = self.interpret_expression(
                    slice_m.group('idx'), local_vars, allow_recursion - 1)
                return val[idx:]
            # Unknown members fall through to the remaining patterns.

        m = re.match(
            r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
        if m:
            val = local_vars[m.group('in')]
            idx = self.interpret_expression(
                m.group('idx'), local_vars, allow_recursion - 1)
            return val[idx]

        m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
        if m:
            a = self.interpret_expression(
                m.group('a'), local_vars, allow_recursion)
            b = self.interpret_expression(
                m.group('b'), local_vars, allow_recursion)
            return a % b

        m = re.match(
            r'^(?P<func>[a-zA-Z$]+)\((?P<args>[a-z0-9,]+)\)$', expr)
        if m:
            fname = m.group('func')
            if fname not in self._functions:
                self._functions[fname] = self.extract_function(fname)
            argvals = [int(v) if v.isdigit() else local_vars[v]
                       for v in m.group('args').split(',')]
            return self._functions[fname](argvals)
        raise ExtractorError('Unsupported JS expression %r' % expr)

    def extract_function(self, funcname):
        """Return a Python callable emulating the JS function *funcname*.

        The callable takes one list of positional argument values.  It
        returns the value of the first ``return`` statement reached, or the
        value of the last executed statement when there is no ``return``.
        Raises ExtractorError if the function is not found in the code.
        """
        func_m = re.search(
            r'function ' + re.escape(funcname) +
            r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
            self.code)
        if func_m is None:
            raise ExtractorError(
                'Could not find JS function %r' % funcname)
        argnames = func_m.group('args').split(',')
        # Split the body once here rather than on every call.
        statements = func_m.group('code').split(';')

        def resf(args):
            local_vars = dict(zip(argnames, args))
            res = None
            for stmt in statements:
                stmt = stmt.strip()
                if not stmt:
                    # Empty statement, e.g. after a trailing ';'.
                    continue
                res = self.interpret_statement(stmt, local_vars)
                if stmt.startswith('return '):
                    # A return ends the function; later statements must
                    # not run or overwrite the result.
                    break
            return res
        return resf
+