diff options
Diffstat (limited to 'youtube_dl/extractor/youtube.py')
-rw-r--r-- | youtube_dl/extractor/youtube.py | 454 |
1 files changed, 4 insertions, 450 deletions
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 16f4a047d..623056bd9 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -14,6 +14,7 @@ import zlib from .common import InfoExtractor, SearchInfoExtractor from .subtitles import SubtitlesInfoExtractor from ..jsinterp import JSInterpreter +from ..swfinterp import SWFInterpreter from ..utils import ( compat_chr, compat_parse_qs, @@ -450,457 +451,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): return lambda s: initial_function([s]) def _parse_sig_swf(self, file_contents): - if file_contents[1:3] != b'WS': - raise ExtractorError( - u'Not an SWF file; header is %r' % file_contents[:3]) - if file_contents[:1] == b'C': - content = zlib.decompress(file_contents[8:]) - else: - raise NotImplementedError(u'Unsupported compression format %r' % - file_contents[:1]) - - def extract_tags(content): - pos = 0 - while pos < len(content): - header16 = struct.unpack('<H', content[pos:pos+2])[0] - pos += 2 - tag_code = header16 >> 6 - tag_len = header16 & 0x3f - if tag_len == 0x3f: - tag_len = struct.unpack('<I', content[pos:pos+4])[0] - pos += 4 - assert pos+tag_len <= len(content) - yield (tag_code, content[pos:pos+tag_len]) - pos += tag_len - - code_tag = next(tag - for tag_code, tag in extract_tags(content) - if tag_code == 82) - p = code_tag.index(b'\0', 4) + 1 - code_reader = io.BytesIO(code_tag[p:]) - - # Parse ABC (AVM2 ByteCode) - def read_int(reader=None): - if reader is None: - reader = code_reader - res = 0 - shift = 0 - for _ in range(5): - buf = reader.read(1) - assert len(buf) == 1 - b = struct.unpack('<B', buf)[0] - res = res | ((b & 0x7f) << shift) - if b & 0x80 == 0: - break - shift += 7 - return res - - def u30(reader=None): - res = read_int(reader) - assert res & 0xf0000000 == 0 - return res - u32 = read_int - - def s32(reader=None): - v = read_int(reader) - if v & 0x80000000 != 0: - v = - ((v ^ 0xffffffff) + 1) - return v - - def s24(reader): - bs = reader.read(3) - assert len(bs) == 3 - first_byte = b'\xff' if (ord(bs[0:1]) >= 0x80) else b'\x00' - return struct.unpack('!i', first_byte + bs) - - def read_string(reader=None): - if reader is None: - reader = code_reader - slen = u30(reader) - resb = reader.read(slen) - assert len(resb) == slen - return resb.decode('utf-8') - - def read_bytes(count, reader=None): - if reader is None: - reader = code_reader - resb = reader.read(count) - assert len(resb) == count - return resb - - def read_byte(reader=None): - resb = read_bytes(1, reader=reader) - res = struct.unpack('<B', resb)[0] - return res - - # minor_version + major_version - read_bytes(2 + 2) - - # Constant pool - int_count = u30() - for _c in range(1, int_count): - s32() - uint_count = u30() - for _c in range(1, uint_count): - u32() - double_count = u30() - read_bytes((double_count-1) * 8) - string_count = u30() - constant_strings = [u''] - for _c in range(1, string_count): - s = read_string() - constant_strings.append(s) - namespace_count = u30() - for _c in range(1, namespace_count): - read_bytes(1) # kind - u30() # name - ns_set_count = u30() - for _c in range(1, ns_set_count): - count = u30() - for _c2 in range(count): - u30() - multiname_count = u30() - MULTINAME_SIZES = { - 0x07: 2, # QName - 0x0d: 2, # QNameA - 0x0f: 1, # RTQName - 0x10: 1, # RTQNameA - 0x11: 0, # RTQNameL - 0x12: 0, # RTQNameLA - 0x09: 2, # Multiname - 0x0e: 2, # MultinameA - 0x1b: 1, # MultinameL - 0x1c: 1, # MultinameLA - } - multinames = [u''] - for _c in range(1, multiname_count): - kind = u30() - assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind - if kind == 0x07: - u30() # namespace_idx - name_idx = u30() - multinames.append(constant_strings[name_idx]) - else: - multinames.append('[MULTINAME kind: %d]' % kind) - for _c2 in range(MULTINAME_SIZES[kind]): - u30() - - # Methods - method_count = u30() - MethodInfo = collections.namedtuple( - 'MethodInfo', - ['NEED_ARGUMENTS', 'NEED_REST']) - method_infos = [] - for method_id in range(method_count): - param_count = u30() - u30() # return type - for _ in range(param_count): - u30() # param type - u30() # name index (always 0 for youtube) - flags = read_byte() - if flags & 0x08 != 0: - # Options present - option_count = u30() - for c in range(option_count): - u30() # val - read_bytes(1) # kind - if flags & 0x80 != 0: - # Param names present - for _ in range(param_count): - u30() # param name - mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0) - method_infos.append(mi) - - # Metadata - metadata_count = u30() - for _c in range(metadata_count): - u30() # name - item_count = u30() - for _c2 in range(item_count): - u30() # key - u30() # value - - def parse_traits_info(): - trait_name_idx = u30() - kind_full = read_byte() - kind = kind_full & 0x0f - attrs = kind_full >> 4 - methods = {} - if kind in [0x00, 0x06]: # Slot or Const - u30() # Slot id - u30() # type_name_idx - vindex = u30() - if vindex != 0: - read_byte() # vkind - elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter - u30() # disp_id - method_idx = u30() - methods[multinames[trait_name_idx]] = method_idx - elif kind == 0x04: # Class - u30() # slot_id - u30() # classi - elif kind == 0x05: # Function - u30() # slot_id - function_idx = u30() - methods[function_idx] = multinames[trait_name_idx] - else: - raise ExtractorError(u'Unsupported trait kind %d' % kind) - - if attrs & 0x4 != 0: # Metadata present - metadata_count = u30() - for _c3 in range(metadata_count): - u30() # metadata index - - return methods - - class AVMClass(object): - def __init__(self, name_idx): - self.name_idx = name_idx - self.method_names = {} - self.method_idxs = {} - self.methods = {} - self.method_pyfunctions = {} - self.variables = {} - - @property - def name(self): - return multinames[self.name_idx] - - # Classes - class_count = u30() - classes = [] - for class_id in range(class_count): - name_idx = u30() - classes.append(AVMClass(name_idx)) - u30() # super_name idx - flags = read_byte() - if flags & 0x08 != 0: # Protected namespace is present - u30() # protected_ns_idx - intrf_count = u30() - for _c2 in range(intrf_count): - u30() - u30() # iinit - trait_count = u30() - for _c2 in range(trait_count): - parse_traits_info() - assert len(classes) == class_count - + swfi = SWFInterpreter(file_contents) TARGET_CLASSNAME = u'SignatureDecipher' - searched_class = next( - c for c in classes if c.name == TARGET_CLASSNAME) - if searched_class is None: - raise ExtractorError(u'Target class %r not found' % - TARGET_CLASSNAME) - - for avm_class in classes: - u30() # cinit - trait_count = u30() - for _c2 in range(trait_count): - trait_methods = parse_traits_info() - avm_class.method_names.update(trait_methods.items()) - avm_class.method_idxs.update(dict( - (idx, name) - for name, idx in trait_methods.items())) - - # Scripts - script_count = u30() - for _c in range(script_count): - u30() # init - trait_count = u30() - for _c2 in range(trait_count): - parse_traits_info() - - # Method bodies - method_body_count = u30() - Method = collections.namedtuple('Method', ['code', 'local_count']) - for _c in range(method_body_count): - method_idx = u30() - u30() # max_stack - local_count = u30() - u30() # init_scope_depth - u30() # max_scope_depth - code_length = u30() - code = read_bytes(code_length) - for avm_class in classes: - if method_idx in avm_class.method_idxs: - m = Method(code, local_count) - avm_class.methods[avm_class.method_idxs[method_idx]] = m - exception_count = u30() - for _c2 in range(exception_count): - u30() # from - u30() # to - u30() # target - u30() # exc_type - u30() # var_name - trait_count = u30() - for _c2 in range(trait_count): - parse_traits_info() - - assert p + code_reader.tell() == len(code_tag) - - def extract_function(avm_class, func_name): - if func_name in avm_class.method_pyfunctions: - return avm_class.method_pyfunctions[func_name] - if func_name not in avm_class.methods: - raise ExtractorError(u'Cannot find function %r' % func_name) - m = avm_class.methods[func_name] - - def resfunc(args): - registers = ['(this)'] + list(args) + [None] * m.local_count - stack = [] - coder = io.BytesIO(m.code) - while True: - opcode = struct.unpack('!B', coder.read(1))[0] - if opcode == 17: # iftrue - offset = s24(coder) - value = stack.pop() - if value: - coder.seek(coder.tell() + offset) - elif opcode == 36: # pushbyte - v = struct.unpack('!B', coder.read(1))[0] - stack.append(v) - elif opcode == 44: # pushstring - idx = u30(coder) - stack.append(constant_strings[idx]) - elif opcode == 48: # pushscope - # We don't implement the scope register, so we'll just - # ignore the popped value - stack.pop() - elif opcode == 70: # callproperty - index = u30(coder) - mname = multinames[index] - arg_count = u30(coder) - args = list(reversed( - [stack.pop() for _ in range(arg_count)])) - obj = stack.pop() - if mname == u'split': - assert len(args) == 1 - assert isinstance(args[0], compat_str) - assert isinstance(obj, compat_str) - if args[0] == u'': - res = list(obj) - else: - res = obj.split(args[0]) - stack.append(res) - elif mname == u'slice': - assert len(args) == 1 - assert isinstance(args[0], int) - assert isinstance(obj, list) - res = obj[args[0]:] - stack.append(res) - elif mname == u'join': - assert len(args) == 1 - assert isinstance(args[0], compat_str) - assert isinstance(obj, list) - res = args[0].join(obj) - stack.append(res) - elif mname in avm_class.method_pyfunctions: - stack.append(avm_class.method_pyfunctions[mname](args)) - else: - raise NotImplementedError( - u'Unsupported property %r on %r' - % (mname, obj)) - elif opcode == 72: # returnvalue - res = stack.pop() - return res - elif opcode == 79: # callpropvoid - index = u30(coder) - mname = multinames[index] - arg_count = u30(coder) - args = list(reversed( - [stack.pop() for _ in range(arg_count)])) - obj = stack.pop() - if mname == u'reverse': - assert isinstance(obj, list) - obj.reverse() - else: - raise NotImplementedError( - u'Unsupported (void) property %r on %r' - % (mname, obj)) - elif opcode == 86: # newarray - arg_count = u30(coder) - arr = [] - for i in range(arg_count): - arr.append(stack.pop()) - arr = arr[::-1] - stack.append(arr) - elif opcode == 93: # findpropstrict - index = u30(coder) - mname = multinames[index] - res = extract_function(avm_class, mname) - stack.append(res) - elif opcode == 94: # findproperty - index = u30(coder) - mname = multinames[index] - res = avm_class.variables.get(mname) - stack.append(res) - elif opcode == 96: # getlex - index = u30(coder) - mname = multinames[index] - res = avm_class.variables.get(mname) - stack.append(res) - elif opcode == 97: # setproperty - index = u30(coder) - value = stack.pop() - idx = stack.pop() - obj = stack.pop() - assert isinstance(obj, list) - assert isinstance(idx, int) - obj[idx] = value - elif opcode == 98: # getlocal - index = u30(coder) - stack.append(registers[index]) - elif opcode == 99: # setlocal - index = u30(coder) - value = stack.pop() - registers[index] = value - elif opcode == 102: # getproperty - index = u30(coder) - pname = multinames[index] - if pname == u'length': - obj = stack.pop() - assert isinstance(obj, list) - stack.append(len(obj)) - else: # Assume attribute access - idx = stack.pop() - assert isinstance(idx, int) - obj = stack.pop() - assert isinstance(obj, list) - stack.append(obj[idx]) - elif opcode == 128: # coerce - u30(coder) - elif opcode == 133: # coerce_s - assert isinstance(stack[-1], (type(None), compat_str)) - elif opcode == 164: # modulo - value2 = stack.pop() - value1 = stack.pop() - res = value1 % value2 - stack.append(res) - elif opcode == 175: # greaterequals - value2 = stack.pop() - value1 = stack.pop() - result = value1 >= value2 - stack.append(result) - elif opcode == 208: # getlocal_0 - stack.append(registers[0]) - elif opcode == 209: # getlocal_1 - stack.append(registers[1]) - elif opcode == 210: # getlocal_2 - stack.append(registers[2]) - elif opcode == 211: # getlocal_3 - stack.append(registers[3]) - elif opcode == 214: # setlocal_2 - registers[2] = stack.pop() - elif opcode == 215: # setlocal_3 - registers[3] = stack.pop() - else: - raise NotImplementedError( - u'Unsupported opcode %d' % opcode) - - avm_class.method_pyfunctions[func_name] = resfunc - return resfunc - - initial_function = extract_function(searched_class, u'decipher') + searched_class = swfi.extract_class(TARGET_CLASSNAME) + initial_function = swfi.extract_function(searched_class, u'decipher') return lambda s: initial_function([s]) def _decrypt_signature(self, s, video_id, player_url, age_gate=False): |