diff options
author | Philipp Hagemeister <phihag@phihag.de> | 2013-09-21 14:19:30 +0200 |
---|---|---|
committer | Philipp Hagemeister <phihag@phihag.de> | 2013-09-21 14:40:35 +0200 |
commit | e0df6211cc9364f62406b2907fa830847324db53 (patch) | |
tree | e35ce2c9d52ae9eec7cb0cdb5a138ae2ef6c00ce /youtube_dl/extractor/youtube.py | |
parent | b00ca882a4c1069de1ec2d04ffd50905c0f8b97f (diff) |
Restore accidentally deleted commits
That's what happens if you let Windows machines write :(
Diffstat (limited to 'youtube_dl/extractor/youtube.py')
-rw-r--r-- | youtube_dl/extractor/youtube.py | 603 |
1 files changed, 586 insertions, 17 deletions
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 47d5cb7ff..456d3cb0f 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1,11 +1,16 @@ # coding: utf-8 +import collections +import itertools +import io import json import netrc import re import socket -import itertools -import xml.etree.ElementTree +import string +import struct +import traceback +import zlib from .common import InfoExtractor, SearchInfoExtractor from .subtitles import SubtitlesInfoExtractor @@ -393,6 +398,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): if YoutubePlaylistIE.suitable(url): return False return re.match(cls._VALID_URL, url, re.VERBOSE) is not None + def __init__(self, *args, **kwargs): + super(YoutubeIE, self).__init__(*args, **kwargs) + self._jsplayer_cache = {} + def report_video_webpage_download(self, video_id): """Report attempt to download video webpage.""" self.to_screen(u'%s: Downloading video webpage' % video_id) @@ -413,15 +422,565 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): """Indicate the download will use the RTMP protocol.""" self.to_screen(u'RTMP download detected') - def _decrypt_signature(self, s): + def _extract_signature_function(self, video_id, player_url): + id_m = re.match(r'.*-(?P<id>[^.]+)\.(?P<ext>[^.]+)$', player_url) + player_type = id_m.group('ext') + player_id = id_m.group('id') + + if player_type == 'js': + code = self._download_webpage( + player_url, video_id, + note=u'Downloading %s player %s' % (player_type, jsplayer_id), + errnote=u'Download of %s failed' % player_url) + return self._parse_sig_js(code) + elif player_tpye == 'swf': + urlh = self._request_webpage( + player_url, video_id, + note=u'Downloading %s player %s' % (player_type, jsplayer_id), + errnote=u'Download of %s failed' % player_url) + code = urlh.read() + return self._parse_sig_swf(code) + else: + assert False, 'Invalid player type %r' % player_type + + def _parse_sig_js(self, jscode): + funcname = self._search_regex( + r'signature=([a-zA-Z]+)', jscode, + u'Initial JS player signature function name') + + functions = {} + + def argidx(varname): + return string.lowercase.index(varname) + + def interpret_statement(stmt, local_vars, allow_recursion=20): + if allow_recursion < 0: + raise ExctractorError(u'Recursion limit reached') + + if stmt.startswith(u'var '): + stmt = stmt[len(u'var '):] + ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' + + r'=(?P<expr>.*)$', stmt) + if ass_m: + if ass_m.groupdict().get('index'): + def assign(val): + lvar = local_vars[ass_m.group('out')] + idx = interpret_expression(ass_m.group('index'), + local_vars, allow_recursion) + assert isinstance(idx, int) + lvar[idx] = val + return val + expr = ass_m.group('expr') + else: + def assign(val): + local_vars[ass_m.group('out')] = val + return val + expr = ass_m.group('expr') + elif stmt.startswith(u'return '): + assign = lambda v: v + expr = stmt[len(u'return '):] + else: + raise ExtractorError( + u'Cannot determine left side of statement in %r' % stmt) + + v = interpret_expression(expr, local_vars, allow_recursion) + return assign(v) + + def interpret_expression(expr, local_vars, allow_recursion): + if expr.isdigit(): + return int(expr) + + if expr.isalpha(): + return local_vars[expr] + + m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr) + if m: + member = m.group('member') + val = local_vars[m.group('in')] + if member == 'split("")': + return list(val) + if member == 'join("")': + return u''.join(val) + if member == 'length': + return len(val) + if member == 'reverse()': + return val[::-1] + slice_m = re.match(r'slice\((?P<idx>.*)\)', member) + if slice_m: + idx = interpret_expression( + slice_m.group('idx'), local_vars, allow_recursion-1) + return val[idx:] + + m = re.match( + r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr) + if m: + val = local_vars[m.group('in')] + idx = interpret_expression(m.group('idx'), local_vars, + allow_recursion-1) + return val[idx] + + m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr) + if m: + a = interpret_expression(m.group('a'), + local_vars, allow_recursion) + b = interpret_expression(m.group('b'), + local_vars, allow_recursion) + return a % b + + m = re.match( + r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr) + if m: + fname = m.group('func') + if fname not in functions: + functions[fname] = extract_function(fname) + argvals = [int(v) if v.isdigit() else local_vars[v] + for v in m.group('args').split(',')] + return functions[fname](argvals) + raise ExtractorError(u'Unsupported JS expression %r' % expr) + + def extract_function(funcname): + func_m = re.search( + r'function ' + re.escape(funcname) + + r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}', + jscode) + argnames = func_m.group('args').split(',') + + def resf(args): + local_vars = dict(zip(argnames, args)) + for stmt in func_m.group('code').split(';'): + res = interpret_statement(stmt, local_vars) + return res + return resf + + initial_function = extract_function(funcname) + return lambda s: initial_function([s]) + + def _parse_sig_swf(self, file_contents): + if file_contents[1:3] != b'WS': + raise ExtractorError( + u'Not an SWF file; header is %r' % file_contents[:3]) + if file_contents[:1] == b'C': + content = zlib.decompress(file_contents[8:]) + else: + raise NotImplementedError(u'Unsupported compression format %r' % + file_contents[:1]) + + def extract_tags(content): + pos = 0 + while pos < len(content): + header16 = struct.unpack('<H', content[pos:pos+2])[0] + pos += 2 + tag_code = header16 >> 6 + tag_len = header16 & 0x3f + if tag_len == 0x3f: + tag_len = struct.unpack('<I', content[pos:pos+4])[0] + pos += 4 + assert pos+tag_len <= len(content) + yield (tag_code, content[pos:pos+tag_len]) + pos += tag_len + + code_tag = next(tag + for tag_code, tag in extract_tags(content) + if tag_code == 82) + p = code_tag.index(b'\0', 4) + 1 + + # Parse ABC (AVM2 ByteCode) + def read_int(data=None, pos=None): + if hasattr(data, 'read'): + assert pos is None + + res = 0 + shift = 0 + for _ in range(5): + buf = data.read(1) + assert len(buf) == 1 + b = struct.unpack('<B', buf)[0] + res = res | ((b & 0x7f) << shift) + if b & 0x80 == 0: + break + shift += 7 + return res + + if data is None: + data = code_tag + if pos is None: + pos = p + res = 0 + shift = 0 + for _ in range(5): + b = struct.unpack('<B', data[pos:pos+1])[0] + pos += 1 + res = res | ((b & 0x7f) << shift) + if b & 0x80 == 0: + break + shift += 7 + return (res, pos) + assert read_int(b'\x00', 0) == (0, 1) + assert read_int(b'\x10', 0) == (16, 1) + assert read_int(b'\x34', 0) == (0x34, 1) + assert read_int(b'\xb4\x12', 0) == (0x12 * 0x80 + 0x34, 2) + assert read_int(b'\xff\xff\xff\x00', 0) == (0x1fffff, 4) + + def u30(*args, **kwargs): + res = read_int(*args, **kwargs) + if isinstance(res, tuple): + assert res[0] & 0xf0000000 == 0 + else: + assert res & 0xf0000000 == 0 + return res + u32 = read_int + + def s32(data=None, pos=None): + v, pos = read_int(data, pos) + if v & 0x80000000 != 0: + v = - ((v ^ 0xffffffff) + 1) + return (v, pos) + assert s32(b'\xff\xff\xff\xff\x0f', 0) == (-1, 5) + + def string(): + slen, p = u30() + return (code_tag[p:p+slen].decode('utf-8'), p + slen) + + def read_byte(data=None, pos=None): + if data is None: + data = code_tag + if pos is None: + pos = p + res = struct.unpack('<B', data[pos:pos+1])[0] + return (res, pos + 1) + + # minor_version + major_version + p += 2 + 2 + + # Constant pool + int_count, p = u30() + for _c in range(1, int_count): + _, p = s32() + uint_count, p = u30() + for _c in range(1, uint_count): + _, p = u32() + double_count, p = u30() + p += (double_count-1) * 8 + string_count, p = u30() + constant_strings = [u''] + for _c in range(1, string_count): + s, p = string() + constant_strings.append(s) + namespace_count, p = u30() + for _c in range(1, namespace_count): + p += 1 # kind + _, p = u30() # name + ns_set_count, p = u30() + for _c in range(1, ns_set_count): + count, p = u30() + for _c2 in range(count): + _, p = u30() + multiname_count, p = u30() + MULTINAME_SIZES = { + 0x07: 2, # QName + 0x0d: 2, # QNameA + 0x0f: 1, # RTQName + 0x10: 1, # RTQNameA + 0x11: 0, # RTQNameL + 0x12: 0, # RTQNameLA + 0x09: 2, # Multiname + 0x0e: 2, # MultinameA + 0x1b: 1, # MultinameL + 0x1c: 1, # MultinameLA + } + multinames = [u''] + for _c in range(1, multiname_count): + kind, p = u30() + assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind + if kind == 0x07: + namespace_idx, p = u30() + name_idx, p = u30() + multinames.append(constant_strings[name_idx]) + else: + multinames.append('[MULTINAME kind: %d]' % kind) + for _c2 in range(MULTINAME_SIZES[kind]): + _, p = u30() + + # Methods + method_count, p = u30() + MethodInfo = collections.namedtuple( + 'MethodInfo', + ['NEED_ARGUMENTS', 'NEED_REST']) + method_infos = [] + for method_id in range(method_count): + param_count, p = u30() + _, p = u30() # return type + for _ in range(param_count): + _, p = u30() # param type + _, p = u30() # name index (always 0 for youtube) + flags, p = read_byte() + if flags & 0x08 != 0: + # Options present + option_count, p = u30() + for c in range(option_count): + _, p = u30() # val + p += 1 # kind + if flags & 0x80 != 0: + # Param names present + for _ in range(param_count): + _, p = u30() # param name + mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0) + method_infos.append(mi) + + # Metadata + metadata_count, p = u30() + for _c in range(metadata_count): + _, p = u30() # name + item_count, p = u30() + for _c2 in range(item_count): + _, p = u30() # key + _, p = u30() # value + + def parse_traits_info(pos=None): + if pos is None: + pos = p + trait_name_idx, pos = u30(pos=pos) + kind_full, pos = read_byte(pos=pos) + kind = kind_full & 0x0f + attrs = kind_full >> 4 + methods = {} + if kind in [0x00, 0x06]: # Slot or Const + _, pos = u30(pos=pos) # Slot id + type_name_idx, pos = u30(pos=pos) + vindex, pos = u30(pos=pos) + if vindex != 0: + _, pos = read_byte(pos=pos) # vkind + elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter + _, pos = u30(pos=pos) # disp_id + method_idx, pos = u30(pos=pos) + methods[multinames[trait_name_idx]] = method_idx + elif kind == 0x04: # Class + _, pos = u30(pos=pos) # slot_id + _, pos = u30(pos=pos) # classi + elif kind == 0x05: # Function + _, pos = u30(pos=pos) # slot_id + function_idx, pos = u30(pos=pos) + methods[function_idx] = multinames[trait_name_idx] + else: + raise ExtractorError(u'Unsupported trait kind %d' % kind) + + if attrs & 0x4 != 0: # Metadata present + metadata_count, pos = u30(pos=pos) + for _c3 in range(metadata_count): + _, pos = u30(pos=pos) + + return (methods, pos) + + # Classes + TARGET_CLASSNAME = u'SignatureDecipher' + searched_idx = multinames.index(TARGET_CLASSNAME) + searched_class_id = None + class_count, p = u30() + for class_id in range(class_count): + name_idx, p = u30() + if name_idx == searched_idx: + # We found the class we're looking for! + searched_class_id = class_id + _, p = u30() # super_name idx + flags, p = read_byte() + if flags & 0x08 != 0: # Protected namespace is present + protected_ns_idx, p = u30() + intrf_count, p = u30() + for _c2 in range(intrf_count): + _, p = u30() + _, p = u30() # iinit + trait_count, p = u30() + for _c2 in range(trait_count): + _, p = parse_traits_info() + + if searched_class_id is None: + raise ExtractorError(u'Target class %r not found' % + TARGET_CLASSNAME) + + method_names = {} + method_idxs = {} + for class_id in range(class_count): + _, p = u30() # cinit + trait_count, p = u30() + for _c2 in range(trait_count): + trait_methods, p = parse_traits_info() + if class_id == searched_class_id: + method_names.update(trait_methods.items()) + method_idxs.update(dict( + (idx, name) + for name, idx in trait_methods.items())) + + # Scripts + script_count, p = u30() + for _c in range(script_count): + _, p = u30() # init + trait_count, p = u30() + for _c2 in range(trait_count): + _, p = parse_traits_info() + + # Method bodies + method_body_count, p = u30() + Method = collections.namedtuple('Method', ['code', 'local_count']) + methods = {} + for _c in range(method_body_count): + method_idx, p = u30() + max_stack, p = u30() + local_count, p = u30() + init_scope_depth, p = u30() + max_scope_depth, p = u30() + code_length, p = u30() + if method_idx in method_idxs: + m = Method(code_tag[p:p+code_length], local_count) + methods[method_idxs[method_idx]] = m + p += code_length + exception_count, p = u30() + for _c2 in range(exception_count): + _, p = u30() # from + _, p = u30() # to + _, p = u30() # target + _, p = u30() # exc_type + _, p = u30() # var_name + trait_count, p = u30() + for _c2 in range(trait_count): + _, p = parse_traits_info() + + assert p == len(code_tag) + assert len(methods) == len(method_idxs) + + method_pyfunctions = {} + + def extract_function(func_name): + if func_name in method_pyfunctions: + return method_pyfunctions[func_name] + if func_name not in methods: + raise ExtractorError(u'Cannot find function %r' % func_name) + m = methods[func_name] + + def resfunc(args): + print('Entering function %s(%r)' % (func_name, args)) + registers = ['(this)'] + list(args) + [None] * m.local_count + stack = [] + coder = io.BytesIO(m.code) + while True: + opcode = struct.unpack('!B', coder.read(1))[0] + if opcode == 208: # getlocal_0 + stack.append(registers[0]) + elif opcode == 209: # getlocal_1 + stack.append(registers[1]) + elif opcode == 210: # getlocal_2 + stack.append(registers[2]) + elif opcode == 36: # pushbyte + v = struct.unpack('!B', coder.read(1))[0] + stack.append(v) + elif opcode == 44: # pushstring + idx = u30(coder) + stack.append(constant_strings[idx]) + elif opcode == 48: # pushscope + # We don't implement the scope register, so we'll just + # ignore the popped value + stack.pop() + elif opcode == 70: # callproperty + index = u30(coder) + mname = multinames[index] + arg_count = u30(coder) + args = list(reversed( + [stack.pop() for _ in range(arg_count)])) + obj = stack.pop() + if mname == u'split': + assert len(args) == 1 + assert isinstance(args[0], compat_str) + assert isinstance(obj, compat_str) + if args[0] == u'': + res = list(obj) + else: + res = obj.split(args[0]) + stack.append(res) + elif mname in method_pyfunctions: + stack.append(method_pyfunctions[mname](args)) + else: + raise NotImplementedError( + u'Unsupported property %r on %r' + % (mname, obj)) + elif opcode == 93: # findpropstrict + index = u30(coder) + mname = multinames[index] + res = extract_function(mname) + stack.append(res) + elif opcode == 97: # setproperty + index = u30(coder) + value = stack.pop() + idx = stack.pop() + obj = stack.pop() + assert isinstance(obj, list) + assert isinstance(idx, int) + obj[idx] = value + elif opcode == 98: # getlocal + index = u30(coder) + stack.append(registers[index]) + elif opcode == 99: # setlocal + index = u30(coder) + value = stack.pop() + registers[index] = value + elif opcode == 102: # getproperty + index = u30(coder) + pname = multinames[index] + if pname == u'length': + obj = stack.pop() + assert isinstance(obj, list) + stack.append(len(obj)) + else: # Assume attribute access + idx = stack.pop() + assert isinstance(idx, int) + obj = stack.pop() + assert isinstance(obj, list) + stack.append(obj[idx]) + elif opcode == 128: # coerce + _ = u30(coder) + elif opcode == 133: # coerce_s + assert isinstance(stack[-1], (type(None), compat_str)) + elif opcode == 164: # modulo + value2 = stack.pop() + value1 = stack.pop() + res = value1 % value2 + stack.append(res) + elif opcode == 214: # setlocal_2 + registers[2] = stack.pop() + elif opcode == 215: # setlocal_3 + registers[3] = stack.pop() + else: + raise NotImplementedError( + u'Unsupported opcode %d' % opcode) + + method_pyfunctions[func_name] = resfunc + return resfunc + + initial_function = extract_function(u'decipher') + return lambda s: initial_function([s]) + + def _decrypt_signature(self, s, video_id, jsplayer_url, age_gate=False): """Turn the encrypted s field into a working signature""" - if len(s) == 93: - return s[86:29:-1] + s[88] + s[28:5:-1] - elif len(s) == 92: + if jsplayer_url is not None: + try: + if jsplayer_url not in self._jsplayer_cache: + self._jsplayer_cache[jsplayer_url] = self._extract_signature_function( + video_id, jsplayer_url + ) + return self._jsplayer_cache[jsplayer_url]([s]) + except Exception as e: + tb = traceback.format_exc() + self._downloader.report_warning(u'Automatic signature extraction failed: ' + tb) + + self._downloader.report_warning(u'Warning: Falling back to static signature algorithm') + + if age_gate: + # The videos with age protection use another player, so the + # algorithms can be different. + if len(s) == 86: + return s[2:63] + s[82] + s[64:82] + s[63] + + if len(s) == 92: return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83] - elif len(s) == 91: - return s[84:27:-1] + s[86] + s[26:5:-1] elif len(s) == 90: return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81] elif len(s) == 89: @@ -631,7 +1190,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): video_webpage = video_webpage_bytes.decode('utf-8', 'ignore') # Attempt to extract SWF player URL - mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage) + mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage) if mobj is not None: player_url = re.sub(r'\\(.)', r'\1', mobj.group(1)) else: @@ -784,21 +1343,31 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): if 'sig' in url_data: url += '&signature=' + url_data['sig'][0] elif 's' in url_data: + encrypted_sig = url_data['s'][0] if self._downloader.params.get('verbose'): - s = url_data['s'][0] if age_gate: - player = 'flash player' + player_version = self._search_regex(r'-(.+)\.swf$', + player_url if player_url else 'NOT FOUND', + 'flash player', fatal=False) + player_desc = 'flash player %s' % player_version else: - player = u'html5 player %s' % self._search_regex(r'html5player-(.+?)\.js', video_webpage, + player_version = self._search_regex(r'html5player-(.+?)\.js', video_webpage, 'html5 player', fatal=False) - parts_sizes = u'.'.join(compat_str(len(part)) for part in s.split('.')) + player_desc = u'html5 player %s' % player_version + + parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.')) self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' % - (len(s), parts_sizes, url_data['itag'][0], player)) - encrypted_sig = url_data['s'][0] + (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc)) + if age_gate: - signature = self._decrypt_signature_age_gate(encrypted_sig) + jsplayer_url = None else: - signature = self._decrypt_signature(encrypted_sig) + jsplayer_url_json = self._search_regex( + r'"assets":.+?"js":\s*("[^"]+")', + video_webpage, u'JS player URL') + jsplayer_url = json.loads(jsplayer_url_json) + + signature = self._decrypt_signature(encrypted_sig, video_id, jsplayer_url, age_gate) url += '&signature=' + signature if 'ratebypass' not in url: url += '&ratebypass=yes' |