diff options
author | Philipp Hagemeister <phihag@phihag.de> | 2014-07-18 10:24:28 +0200 |
---|---|---|
committer | Philipp Hagemeister <phihag@phihag.de> | 2014-07-18 10:24:28 +0200 |
commit | 5425626790a46f9b5bdecf4e33bb254c4c2423ea (patch) | |
tree | fffe33aebe1cd1655ed32230e239cd74b4f3c6f4 /youtube_dl/swfinterp.py | |
parent | 5dc3552d85ac2b3723d0548bbe44996d50891cf2 (diff) |
[youtube] Move swfinterp into its own file
Diffstat (limited to 'youtube_dl/swfinterp.py')
-rw-r--r-- | youtube_dl/swfinterp.py | 503 |
1 files changed, 503 insertions, 0 deletions
diff --git a/youtube_dl/swfinterp.py b/youtube_dl/swfinterp.py new file mode 100644 index 000000000..1cd292138 --- /dev/null +++ b/youtube_dl/swfinterp.py @@ -0,0 +1,503 @@ +from __future__ import unicode_literals + +import collections +import io +import struct +import zlib + +from .utils import ExtractorError + + +def _extract_tags(content): + pos = 0 + while pos < len(content): + header16 = struct.unpack('<H', content[pos:pos + 2])[0] + pos += 2 + tag_code = header16 >> 6 + tag_len = header16 & 0x3f + if tag_len == 0x3f: + tag_len = struct.unpack('<I', content[pos:pos + 4])[0] + pos += 4 + assert pos + tag_len <= len(content) + yield (tag_code, content[pos:pos + tag_len]) + pos += tag_len + + +class _AVMClass_Object(object): + def __init__(self, avm_class): + self.avm_class = avm_class + + def __repr__(self): + return '%s#%x' % (self.avm_class.name, id(self)) + + +class _AVMClass(object): + def __init__(self, name_idx, name): + self.name_idx = name_idx + self.name = name + self.method_names = {} + self.method_idxs = {} + self.methods = {} + self.method_pyfunctions = {} + self.variables = {} + + def make_object(self): + return _AVMClass_Object(self) + + +def _read_int(reader): + res = 0 + shift = 0 + for _ in range(5): + buf = reader.read(1) + assert len(buf) == 1 + b = struct.unpack('<B', buf)[0] + res = res | ((b & 0x7f) << shift) + if b & 0x80 == 0: + break + shift += 7 + return res + + +def _u30(reader): + res = _read_int(reader) + assert res & 0xf0000000 == 0 + return res +u32 = _read_int + + +def _s32(reader): + v = _read_int(reader) + if v & 0x80000000 != 0: + v = - ((v ^ 0xffffffff) + 1) + return v + + +def _s24(reader): + bs = reader.read(3) + assert len(bs) == 3 + first_byte = b'\xff' if (ord(bs[0:1]) >= 0x80) else b'\x00' + return struct.unpack('!i', first_byte + bs) + + +def _read_string(reader): + slen = _u30(reader) + resb = reader.read(slen) + assert len(resb) == slen + return resb.decode('utf-8') + + +def _read_bytes(count, reader): + if reader is None: + reader = code_reader + resb = reader.read(count) + assert len(resb) == count + return resb + + +def _read_byte(reader): + resb = _read_bytes(1, reader=reader) + res = struct.unpack('<B', resb)[0] + return res + + +class SWFInterpreter(object): + def __init__(self, file_contents): + if file_contents[1:3] != b'WS': + raise ExtractorError( + 'Not an SWF file; header is %r' % file_contents[:3]) + if file_contents[:1] == b'C': + content = zlib.decompress(file_contents[8:]) + else: + raise NotImplementedError( + 'Unsupported compression format %r' % + file_contents[:1]) + + code_tag = next(tag + for tag_code, tag in _extract_tags(content) + if tag_code == 82) + p = code_tag.index(b'\0', 4) + 1 + code_reader = io.BytesIO(code_tag[p:]) + + # Parse ABC (AVM2 ByteCode) + + # Define a couple convenience methods + u30 = lambda *args: _u30(*args, reader=code_reader) + s32 = lambda *args: _s32(*args, reader=code_reader) + u32 = lambda *args: _u32(*args, reader=code_reader) + read_bytes = lambda *args: _read_bytes(*args, reader=code_reader) + read_byte = lambda *args: _read_byte(*args, reader=code_reader) + + # minor_version + major_version + read_bytes(2 + 2) + + # Constant pool + int_count = u30() + for _c in range(1, int_count): + s32() + uint_count = u30() + for _c in range(1, uint_count): + u32() + double_count = u30() + read_bytes((double_count - 1) * 8) + string_count = u30() + constant_strings = [''] + for _c in range(1, string_count): + s = _read_string(code_reader) + constant_strings.append(s) + namespace_count = u30() + for _c in range(1, namespace_count): + read_bytes(1) # kind + u30() # name + ns_set_count = u30() + for _c in range(1, ns_set_count): + count = u30() + for _c2 in range(count): + u30() + multiname_count = u30() + MULTINAME_SIZES = { + 0x07: 2, # QName + 0x0d: 2, # QNameA + 0x0f: 1, # RTQName + 0x10: 1, # RTQNameA + 0x11: 0, # RTQNameL + 0x12: 0, # RTQNameLA + 0x09: 2, # Multiname + 0x0e: 2, # MultinameA + 0x1b: 1, # MultinameL + 0x1c: 1, # MultinameLA + } + self.multinames = [''] + for _c in range(1, multiname_count): + kind = u30() + assert kind in MULTINAME_SIZES, 'Invalid multiname kind %r' % kind + if kind == 0x07: + u30() # namespace_idx + name_idx = u30() + self.multinames.append(constant_strings[name_idx]) + else: + self.multinames.append('[MULTINAME kind: %d]' % kind) + for _c2 in range(MULTINAME_SIZES[kind]): + u30() + + # Methods + method_count = u30() + MethodInfo = collections.namedtuple( + 'MethodInfo', + ['NEED_ARGUMENTS', 'NEED_REST']) + method_infos = [] + for method_id in range(method_count): + param_count = u30() + u30() # return type + for _ in range(param_count): + u30() # param type + u30() # name index (always 0 for youtube) + flags = read_byte() + if flags & 0x08 != 0: + # Options present + option_count = u30() + for c in range(option_count): + u30() # val + read_bytes(1) # kind + if flags & 0x80 != 0: + # Param names present + for _ in range(param_count): + u30() # param name + mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0) + method_infos.append(mi) + + # Metadata + metadata_count = u30() + for _c in range(metadata_count): + u30() # name + item_count = u30() + for _c2 in range(item_count): + u30() # key + u30() # value + + def parse_traits_info(): + trait_name_idx = u30() + kind_full = read_byte() + kind = kind_full & 0x0f + attrs = kind_full >> 4 + methods = {} + if kind in [0x00, 0x06]: # Slot or Const + u30() # Slot id + u30() # type_name_idx + vindex = u30() + if vindex != 0: + read_byte() # vkind + elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter + u30() # disp_id + method_idx = u30() + methods[self.multinames[trait_name_idx]] = method_idx + elif kind == 0x04: # Class + u30() # slot_id + u30() # classi + elif kind == 0x05: # Function + u30() # slot_id + function_idx = u30() + methods[function_idx] = self.multinames[trait_name_idx] + else: + raise ExtractorError('Unsupported trait kind %d' % kind) + + if attrs & 0x4 != 0: # Metadata present + metadata_count = u30() + for _c3 in range(metadata_count): + u30() # metadata index + + return methods + + # Classes + class_count = u30() + classes = [] + for class_id in range(class_count): + name_idx = u30() + classes.append(_AVMClass(name_idx, self.multinames[name_idx])) + u30() # super_name idx + flags = read_byte() + if flags & 0x08 != 0: # Protected namespace is present + u30() # protected_ns_idx + intrf_count = u30() + for _c2 in range(intrf_count): + u30() + u30() # iinit + trait_count = u30() + for _c2 in range(trait_count): + parse_traits_info() + assert len(classes) == class_count + self._classes_by_name = dict((c.name, c) for c in classes) + + for avm_class in classes: + u30() # cinit + trait_count = u30() + for _c2 in range(trait_count): + trait_methods = parse_traits_info() + avm_class.method_names.update(trait_methods.items()) + avm_class.method_idxs.update(dict( + (idx, name) + for name, idx in trait_methods.items())) + + # Scripts + script_count = u30() + for _c in range(script_count): + u30() # init + trait_count = u30() + for _c2 in range(trait_count): + parse_traits_info() + + # Method bodies + method_body_count = u30() + Method = collections.namedtuple('Method', ['code', 'local_count']) + for _c in range(method_body_count): + method_idx = u30() + u30() # max_stack + local_count = u30() + u30() # init_scope_depth + u30() # max_scope_depth + code_length = u30() + code = read_bytes(code_length) + for avm_class in classes: + if method_idx in avm_class.method_idxs: + m = Method(code, local_count) + avm_class.methods[avm_class.method_idxs[method_idx]] = m + exception_count = u30() + for _c2 in range(exception_count): + u30() # from + u30() # to + u30() # target + u30() # exc_type + u30() # var_name + trait_count = u30() + for _c2 in range(trait_count): + parse_traits_info() + + assert p + code_reader.tell() == len(code_tag) + + def extract_class(self, class_name): + try: + return self._classes_by_name[class_name] + except KeyError: + raise ExtractorError('Class %r not found' % class_name) + + def extract_function(self, avm_class, func_name): + if func_name in avm_class.method_pyfunctions: + return avm_class.method_pyfunctions[func_name] + if func_name in self._classes_by_name: + return self._classes_by_name[func_name].make_object() + if func_name not in avm_class.methods: + raise ExtractorError('Cannot find function %r' % func_name) + m = avm_class.methods[func_name] + + def resfunc(args): + # Helper functions + coder = io.BytesIO(m.code) + s24 = lambda: _s24(coder) + u30 = lambda: _u30(coder) + + print('Invoking %s.%s(%r)' % (avm_class.name, func_name, tuple(args))) + registers = ['(this)'] + list(args) + [None] * m.local_count + stack = [] + while True: + opcode = _read_byte(coder) + print('opcode: %r, stack(%d): %r' % (opcode, len(stack), stack)) + if opcode == 17: # iftrue + offset = s24() + value = stack.pop() + if value: + coder.seek(coder.tell() + offset) + elif opcode == 36: # pushbyte + v = _read_byte(coder) + stack.append(v) + elif opcode == 44: # pushstring + idx = u30() + stack.append(constant_strings[idx]) + elif opcode == 48: # pushscope + # We don't implement the scope register, so we'll just + # ignore the popped value + new_scope = stack.pop() + elif opcode == 70: # callproperty + index = u30() + mname = self.multinames[index] + arg_count = u30() + args = list(reversed( + [stack.pop() for _ in range(arg_count)])) + obj = stack.pop() + if mname == 'split': + assert len(args) == 1 + assert isinstance(args[0], compat_str) + assert isinstance(obj, compat_str) + if args[0] == '': + res = list(obj) + else: + res = obj.split(args[0]) + stack.append(res) + elif mname == 'slice': + assert len(args) == 1 + assert isinstance(args[0], int) + assert isinstance(obj, list) + res = obj[args[0]:] + stack.append(res) + elif mname == 'join': + assert len(args) == 1 + assert isinstance(args[0], compat_str) + assert isinstance(obj, list) + res = args[0].join(obj) + stack.append(res) + elif mname in avm_class.method_pyfunctions: + stack.append(avm_class.method_pyfunctions[mname](args)) + else: + raise NotImplementedError( + 'Unsupported property %r on %r' + % (mname, obj)) + elif opcode == 72: # returnvalue + res = stack.pop() + return res + elif opcode == 74: # constructproperty + index = u30() + arg_count = u30() + args = list(reversed( + [stack.pop() for _ in range(arg_count)])) + obj = stack.pop() + + mname = self.multinames[index] + construct_method = self.extract_function( + obj.avm_class, mname) + # We do not actually call the constructor for now; + # we just pretend it does nothing + stack.append(obj) + elif opcode == 79: # callpropvoid + index = u30() + mname = self.multinames[index] + arg_count = u30() + args = list(reversed( + [stack.pop() for _ in range(arg_count)])) + obj = stack.pop() + if mname == 'reverse': + assert isinstance(obj, list) + obj.reverse() + else: + raise NotImplementedError( + 'Unsupported (void) property %r on %r' + % (mname, obj)) + elif opcode == 86: # newarray + arg_count = u30() + arr = [] + for i in range(arg_count): + arr.append(stack.pop()) + arr = arr[::-1] + stack.append(arr) + elif opcode == 93: # findpropstrict + index = u30() + mname = self.multinames[index] + res = self.extract_function(avm_class, mname) + stack.append(res) + elif opcode == 94: # findproperty + index = u30() + mname = self.multinames[index] + res = avm_class.variables.get(mname) + stack.append(res) + elif opcode == 96: # getlex + index = u30() + mname = self.multinames[index] + res = avm_class.variables.get(mname, None) + stack.append(res) + elif opcode == 97: # setproperty + index = u30() + value = stack.pop() + idx = self.multinames[index] + obj = stack.pop() + obj[idx] = value + elif opcode == 98: # getlocal + index = u30() + stack.append(registers[index]) + elif opcode == 99: # setlocal + index = u30() + value = stack.pop() + registers[index] = value + elif opcode == 102: # getproperty + index = u30() + pname = self.multinames[index] + if pname == 'length': + obj = stack.pop() + assert isinstance(obj, list) + stack.append(len(obj)) + else: # Assume attribute access + idx = stack.pop() + assert isinstance(idx, int) + obj = stack.pop() + assert isinstance(obj, list) + stack.append(obj[idx]) + elif opcode == 128: # coerce + u30() + elif opcode == 133: # coerce_s + assert isinstance(stack[-1], (type(None), compat_str)) + elif opcode == 164: # modulo + value2 = stack.pop() + value1 = stack.pop() + res = value1 % value2 + stack.append(res) + elif opcode == 175: # greaterequals + value2 = stack.pop() + value1 = stack.pop() + result = value1 >= value2 + stack.append(result) + elif opcode == 208: # getlocal_0 + stack.append(registers[0]) + elif opcode == 209: # getlocal_1 + stack.append(registers[1]) + elif opcode == 210: # getlocal_2 + stack.append(registers[2]) + elif opcode == 211: # getlocal_3 + stack.append(registers[3]) + elif opcode == 214: # setlocal_2 + registers[2] = stack.pop() + elif opcode == 215: # setlocal_3 + registers[3] = stack.pop() + else: + raise NotImplementedError( + 'Unsupported opcode %d' % opcode) + + avm_class.method_pyfunctions[func_name] = resfunc + return resfunc + |