diff options
Diffstat (limited to 'youtube_dl/extractor/youtube.py')
| -rw-r--r-- | youtube_dl/extractor/youtube.py | 476 | 
1 files changed, 36 insertions, 440 deletions
| diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 6123e1256..072e711c2 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1,19 +1,17 @@  # coding: utf-8 -import collections  import errno  import io  import itertools  import json  import os.path  import re -import struct  import traceback -import zlib  from .common import InfoExtractor, SearchInfoExtractor  from .subtitles import SubtitlesInfoExtractor  from ..jsinterp import JSInterpreter +from ..swfinterp import SWFInterpreter  from ..utils import (      compat_chr,      compat_parse_qs, @@ -347,8 +345,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):          self.to_screen(u'RTMP download detected')      def _extract_signature_function(self, video_id, player_url, slen): -        id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$', -                        player_url) +        id_m = re.match( +            r'.*-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3)?\.(?P<ext>[a-z]+)$', +            player_url)          player_type = id_m.group('ext')          player_id = id_m.group('id') @@ -449,417 +448,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):          return lambda s: initial_function([s])      def _parse_sig_swf(self, file_contents): -        if file_contents[1:3] != b'WS': -            raise ExtractorError( -                u'Not an SWF file; header is %r' % file_contents[:3]) -        if file_contents[:1] == b'C': -            content = zlib.decompress(file_contents[8:]) -        else: -            raise NotImplementedError(u'Unsupported compression format %r' % -                                      file_contents[:1]) - -        def extract_tags(content): -            pos = 0 -            while pos < len(content): -                header16 = struct.unpack('<H', content[pos:pos+2])[0] -                pos += 2 -                tag_code = header16 >> 6 -                tag_len = header16 & 0x3f -                if tag_len == 0x3f: -                    tag_len = struct.unpack('<I', content[pos:pos+4])[0] -                    pos += 4 -                assert pos+tag_len <= len(content) -                yield (tag_code, content[pos:pos+tag_len]) -                pos += tag_len - -        code_tag = next(tag -                        for tag_code, tag in extract_tags(content) -                        if tag_code == 82) -        p = code_tag.index(b'\0', 4) + 1 -        code_reader = io.BytesIO(code_tag[p:]) - -        # Parse ABC (AVM2 ByteCode) -        def read_int(reader=None): -            if reader is None: -                reader = code_reader -            res = 0 -            shift = 0 -            for _ in range(5): -                buf = reader.read(1) -                assert len(buf) == 1 -                b = struct.unpack('<B', buf)[0] -                res = res | ((b & 0x7f) << shift) -                if b & 0x80 == 0: -                    break -                shift += 7 -            return res - -        def u30(reader=None): -            res = read_int(reader) -            assert res & 0xf0000000 == 0 -            return res -        u32 = read_int - -        def s32(reader=None): -            v = read_int(reader) -            if v & 0x80000000 != 0: -                v = - ((v ^ 0xffffffff) + 1) -            return v - -        def read_string(reader=None): -            if reader is None: -                reader = code_reader -            slen = u30(reader) -            resb = reader.read(slen) -            assert len(resb) == slen -            return resb.decode('utf-8') - -        def read_bytes(count, reader=None): -            if reader is None: -                reader = code_reader -            resb = reader.read(count) -            assert len(resb) == count -            return resb - -        def read_byte(reader=None): -            resb = read_bytes(1, reader=reader) -            res = struct.unpack('<B', resb)[0] -            return res - -        # minor_version + major_version -        read_bytes(2 + 2) - -        # Constant pool -        int_count = u30() -        for _c in range(1, int_count): -            s32() -        uint_count = u30() -        for _c in range(1, uint_count): -            u32() -        double_count = u30() -        read_bytes((double_count-1) * 8) -        string_count = u30() -        constant_strings = [u''] -        for _c in range(1, string_count): -            s = read_string() -            constant_strings.append(s) -        namespace_count = u30() -        for _c in range(1, namespace_count): -            read_bytes(1)  # kind -            u30()  # name -        ns_set_count = u30() -        for _c in range(1, ns_set_count): -            count = u30() -            for _c2 in range(count): -                u30() -        multiname_count = u30() -        MULTINAME_SIZES = { -            0x07: 2,  # QName -            0x0d: 2,  # QNameA -            0x0f: 1,  # RTQName -            0x10: 1,  # RTQNameA -            0x11: 0,  # RTQNameL -            0x12: 0,  # RTQNameLA -            0x09: 2,  # Multiname -            0x0e: 2,  # MultinameA -            0x1b: 1,  # MultinameL -            0x1c: 1,  # MultinameLA -        } -        multinames = [u''] -        for _c in range(1, multiname_count): -            kind = u30() -            assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind -            if kind == 0x07: -                u30()  # namespace_idx -                name_idx = u30() -                multinames.append(constant_strings[name_idx]) -            else: -                multinames.append('[MULTINAME kind: %d]' % kind) -                for _c2 in range(MULTINAME_SIZES[kind]): -                    u30() - -        # Methods -        method_count = u30() -        MethodInfo = collections.namedtuple( -            'MethodInfo', -            ['NEED_ARGUMENTS', 'NEED_REST']) -        method_infos = [] -        for method_id in range(method_count): -            param_count = u30() -            u30()  # return type -            for _ in range(param_count): -                u30()  # param type -            u30()  # name index (always 0 for youtube) -            flags = read_byte() -            if flags & 0x08 != 0: -                # Options present -                option_count = u30() -                for c in range(option_count): -                    u30()  # val -                    read_bytes(1)  # kind -            if flags & 0x80 != 0: -                # Param names present -                for _ in range(param_count): -                    u30()  # param name -            mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0) -            method_infos.append(mi) - -        # Metadata -        metadata_count = u30() -        for _c in range(metadata_count): -            u30()  # name -            item_count = u30() -            for _c2 in range(item_count): -                u30()  # key -                u30()  # value - -        def parse_traits_info(): -            trait_name_idx = u30() -            kind_full = read_byte() -            kind = kind_full & 0x0f -            attrs = kind_full >> 4 -            methods = {} -            if kind in [0x00, 0x06]:  # Slot or Const -                u30()  # Slot id -                u30()  # type_name_idx -                vindex = u30() -                if vindex != 0: -                    read_byte()  # vkind -            elif kind in [0x01, 0x02, 0x03]:  # Method / Getter / Setter -                u30()  # disp_id -                method_idx = u30() -                methods[multinames[trait_name_idx]] = method_idx -            elif kind == 0x04:  # Class -                u30()  # slot_id -                u30()  # classi -            elif kind == 0x05:  # Function -                u30()  # slot_id -                function_idx = u30() -                methods[function_idx] = multinames[trait_name_idx] -            else: -                raise ExtractorError(u'Unsupported trait kind %d' % kind) - -            if attrs & 0x4 != 0:  # Metadata present -                metadata_count = u30() -                for _c3 in range(metadata_count): -                    u30()  # metadata index - -            return methods - -        # Classes +        swfi = SWFInterpreter(file_contents)          TARGET_CLASSNAME = u'SignatureDecipher' -        searched_idx = multinames.index(TARGET_CLASSNAME) -        searched_class_id = None -        class_count = u30() -        for class_id in range(class_count): -            name_idx = u30() -            if name_idx == searched_idx: -                # We found the class we're looking for! -                searched_class_id = class_id -            u30()  # super_name idx -            flags = read_byte() -            if flags & 0x08 != 0:  # Protected namespace is present -                u30()  # protected_ns_idx -            intrf_count = u30() -            for _c2 in range(intrf_count): -                u30() -            u30()  # iinit -            trait_count = u30() -            for _c2 in range(trait_count): -                parse_traits_info() - -        if searched_class_id is None: -            raise ExtractorError(u'Target class %r not found' % -                                 TARGET_CLASSNAME) - -        method_names = {} -        method_idxs = {} -        for class_id in range(class_count): -            u30()  # cinit -            trait_count = u30() -            for _c2 in range(trait_count): -                trait_methods = parse_traits_info() -                if class_id == searched_class_id: -                    method_names.update(trait_methods.items()) -                    method_idxs.update(dict( -                        (idx, name) -                        for name, idx in trait_methods.items())) - -        # Scripts -        script_count = u30() -        for _c in range(script_count): -            u30()  # init -            trait_count = u30() -            for _c2 in range(trait_count): -                parse_traits_info() - -        # Method bodies -        method_body_count = u30() -        Method = collections.namedtuple('Method', ['code', 'local_count']) -        methods = {} -        for _c in range(method_body_count): -            method_idx = u30() -            u30()  # max_stack -            local_count = u30() -            u30()  # init_scope_depth -            u30()  # max_scope_depth -            code_length = u30() -            code = read_bytes(code_length) -            if method_idx in method_idxs: -                m = Method(code, local_count) -                methods[method_idxs[method_idx]] = m -            exception_count = u30() -            for _c2 in range(exception_count): -                u30()  # from -                u30()  # to -                u30()  # target -                u30()  # exc_type -                u30()  # var_name -            trait_count = u30() -            for _c2 in range(trait_count): -                parse_traits_info() - -        assert p + code_reader.tell() == len(code_tag) -        assert len(methods) == len(method_idxs) - -        method_pyfunctions = {} - -        def extract_function(func_name): -            if func_name in method_pyfunctions: -                return method_pyfunctions[func_name] -            if func_name not in methods: -                raise ExtractorError(u'Cannot find function %r' % func_name) -            m = methods[func_name] - -            def resfunc(args): -                registers = ['(this)'] + list(args) + [None] * m.local_count -                stack = [] -                coder = io.BytesIO(m.code) -                while True: -                    opcode = struct.unpack('!B', coder.read(1))[0] -                    if opcode == 36:  # pushbyte -                        v = struct.unpack('!B', coder.read(1))[0] -                        stack.append(v) -                    elif opcode == 44:  # pushstring -                        idx = u30(coder) -                        stack.append(constant_strings[idx]) -                    elif opcode == 48:  # pushscope -                        # We don't implement the scope register, so we'll just -                        # ignore the popped value -                        stack.pop() -                    elif opcode == 70:  # callproperty -                        index = u30(coder) -                        mname = multinames[index] -                        arg_count = u30(coder) -                        args = list(reversed( -                            [stack.pop() for _ in range(arg_count)])) -                        obj = stack.pop() -                        if mname == u'split': -                            assert len(args) == 1 -                            assert isinstance(args[0], compat_str) -                            assert isinstance(obj, compat_str) -                            if args[0] == u'': -                                res = list(obj) -                            else: -                                res = obj.split(args[0]) -                            stack.append(res) -                        elif mname == u'slice': -                            assert len(args) == 1 -                            assert isinstance(args[0], int) -                            assert isinstance(obj, list) -                            res = obj[args[0]:] -                            stack.append(res) -                        elif mname == u'join': -                            assert len(args) == 1 -                            assert isinstance(args[0], compat_str) -                            assert isinstance(obj, list) -                            res = args[0].join(obj) -                            stack.append(res) -                        elif mname in method_pyfunctions: -                            stack.append(method_pyfunctions[mname](args)) -                        else: -                            raise NotImplementedError( -                                u'Unsupported property %r on %r' -                                % (mname, obj)) -                    elif opcode == 72:  # returnvalue -                        res = stack.pop() -                        return res -                    elif opcode == 79:  # callpropvoid -                        index = u30(coder) -                        mname = multinames[index] -                        arg_count = u30(coder) -                        args = list(reversed( -                            [stack.pop() for _ in range(arg_count)])) -                        obj = stack.pop() -                        if mname == u'reverse': -                            assert isinstance(obj, list) -                            obj.reverse() -                        else: -                            raise NotImplementedError( -                                u'Unsupported (void) property %r on %r' -                                % (mname, obj)) -                    elif opcode == 93:  # findpropstrict -                        index = u30(coder) -                        mname = multinames[index] -                        res = extract_function(mname) -                        stack.append(res) -                    elif opcode == 97:  # setproperty -                        index = u30(coder) -                        value = stack.pop() -                        idx = stack.pop() -                        obj = stack.pop() -                        assert isinstance(obj, list) -                        assert isinstance(idx, int) -                        obj[idx] = value -                    elif opcode == 98:  # getlocal -                        index = u30(coder) -                        stack.append(registers[index]) -                    elif opcode == 99:  # setlocal -                        index = u30(coder) -                        value = stack.pop() -                        registers[index] = value -                    elif opcode == 102:  # getproperty -                        index = u30(coder) -                        pname = multinames[index] -                        if pname == u'length': -                            obj = stack.pop() -                            assert isinstance(obj, list) -                            stack.append(len(obj)) -                        else:  # Assume attribute access -                            idx = stack.pop() -                            assert isinstance(idx, int) -                            obj = stack.pop() -                            assert isinstance(obj, list) -                            stack.append(obj[idx]) -                    elif opcode == 128:  # coerce -                        u30(coder) -                    elif opcode == 133:  # coerce_s -                        assert isinstance(stack[-1], (type(None), compat_str)) -                    elif opcode == 164:  # modulo -                        value2 = stack.pop() -                        value1 = stack.pop() -                        res = value1 % value2 -                        stack.append(res) -                    elif opcode == 208:  # getlocal_0 -                        stack.append(registers[0]) -                    elif opcode == 209:  # getlocal_1 -                        stack.append(registers[1]) -                    elif opcode == 210:  # getlocal_2 -                        stack.append(registers[2]) -                    elif opcode == 211:  # getlocal_3 -                        stack.append(registers[3]) -                    elif opcode == 214:  # setlocal_2 -                        registers[2] = stack.pop() -                    elif opcode == 215:  # setlocal_3 -                        registers[3] = stack.pop() -                    else: -                        raise NotImplementedError( -                            u'Unsupported opcode %d' % opcode) - -            method_pyfunctions[func_name] = resfunc -            return resfunc - -        initial_function = extract_function(u'decipher') +        searched_class = swfi.extract_class(TARGET_CLASSNAME) +        initial_function = swfi.extract_function(searched_class, u'decipher')          return lambda s: initial_function([s])      def _decrypt_signature(self, s, video_id, player_url, age_gate=False): @@ -1014,14 +606,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):              age_gate = True              # We simulate the access to the video from www.youtube.com/v/{video_id}              # this can be viewed without login into Youtube -            data = compat_urllib_parse.urlencode({'video_id': video_id, -                                                  'el': 'player_embedded', -                                                  'gl': 'US', -                                                  'hl': 'en', -                                                  'eurl': 'https://youtube.googleapis.com/v/' + video_id, -                                                  'asv': 3, -                                                  'sts':'1588', -                                                  }) +            data = compat_urllib_parse.urlencode({ +                'video_id': video_id, +                'eurl': 'https://youtube.googleapis.com/v/' + video_id, +                'sts':'16268', +            })              video_info_url = proto + '://www.youtube.com/get_video_info?' + data              video_info_webpage = self._download_webpage(video_info_url, video_id,                                      note=False, @@ -1220,31 +809,38 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):                          url += '&signature=' + url_data['sig'][0]                      elif 's' in url_data:                          encrypted_sig = url_data['s'][0] + +                        if not age_gate: +                            jsplayer_url_json = self._search_regex( +                                r'"assets":.+?"js":\s*("[^"]+")', +                                video_webpage, u'JS player URL') +                            player_url = json.loads(jsplayer_url_json) +                        if player_url is None: +                            player_url_json = self._search_regex( +                                r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")', +                                video_webpage, u'age gate player URL') +                            player_url = json.loads(player_url_json) +                          if self._downloader.params.get('verbose'): -                            if age_gate: -                                if player_url is None: -                                    player_version = 'unknown' -                                else: +                            if player_url is None: +                                player_version = 'unknown' +                                player_desc = 'unknown' +                            else: +                                if player_url.endswith('swf'):                                      player_version = self._search_regex( -                                        r'-(.+)\.swf$', player_url, +                                        r'-(.+?)(?:/watch_as3)?\.swf$', player_url,                                          u'flash player', fatal=False) -                                player_desc = 'flash player %s' % player_version -                            else: -                                player_version = self._search_regex( -                                    r'html5player-(.+?)\.js', video_webpage, -                                    'html5 player', fatal=False) -                                player_desc = u'html5 player %s' % player_version +                                    player_desc = 'flash player %s' % player_version +                                else: +                                    player_version = self._search_regex( +                                        r'html5player-(.+?)\.js', video_webpage, +                                        'html5 player', fatal=False) +                                    player_desc = u'html5 player %s' % player_version                              parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))                              self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %                                  (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc)) -                        if not age_gate: -                            jsplayer_url_json = self._search_regex( -                                r'"assets":.+?"js":\s*("[^"]+")', -                                video_webpage, u'JS player URL') -                            player_url = json.loads(jsplayer_url_json) -                          signature = self._decrypt_signature(                              encrypted_sig, video_id, player_url, age_gate)                          url += '&signature=' + signature | 
