diff options
Diffstat (limited to 'tcg')
-rw-r--r-- | tcg/aarch64/tcg-target.inc.c | 7 | ||||
-rw-r--r-- | tcg/arm/tcg-target.inc.c | 78 | ||||
-rw-r--r-- | tcg/ppc/tcg-target.inc.c | 71 | ||||
-rw-r--r-- | tcg/tcg-runtime.c | 28 | ||||
-rw-r--r-- | tcg/tcg.c | 20 | ||||
-rw-r--r-- | tcg/tcg.h | 2 |
6 files changed, 89 insertions, 117 deletions
diff --git a/tcg/aarch64/tcg-target.inc.c b/tcg/aarch64/tcg-target.inc.c index 5f185458f1..1fa3bccc89 100644 --- a/tcg/aarch64/tcg-target.inc.c +++ b/tcg/aarch64/tcg-target.inc.c @@ -616,7 +616,12 @@ static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg rd, /* Look for host pointer values within 4G of the PC. This happens often when loading pointers to QEMU's own data structures. */ if (type == TCG_TYPE_I64) { - tcg_target_long disp = (value >> 12) - ((intptr_t)s->code_ptr >> 12); + tcg_target_long disp = value - (intptr_t)s->code_ptr; + if (disp == sextract64(disp, 0, 21)) { + tcg_out_insn(s, 3406, ADR, rd, disp); + return; + } + disp = (value >> 12) - ((intptr_t)s->code_ptr >> 12); if (disp == sextract64(disp, 0, 21)) { tcg_out_insn(s, 3406, ADRP, rd, disp); if (value & 0xfff) { diff --git a/tcg/arm/tcg-target.inc.c b/tcg/arm/tcg-target.inc.c index 9f5cb66718..d1793ec77d 100644 --- a/tcg/arm/tcg-target.inc.c +++ b/tcg/arm/tcg-target.inc.c @@ -418,23 +418,37 @@ static inline void tcg_out_dat_imm(TCGContext *s, static void tcg_out_movi32(TCGContext *s, int cond, int rd, uint32_t arg) { - int rot, opc, rn; - - /* For armv7, make sure not to use movw+movt when mov/mvn would do. - Speed things up by only checking when movt would be required. - Prior to armv7, have one go at fully rotated immediates before - doing the decomposition thing below. */ - if (!use_armv7_instructions || (arg & 0xffff0000)) { - rot = encode_imm(arg); + int rot, opc, rn, diff; + + /* Check a single MOV/MVN before anything else. */ + rot = encode_imm(arg); + if (rot >= 0) { + tcg_out_dat_imm(s, cond, ARITH_MOV, rd, 0, + rotl(arg, rot) | (rot << 7)); + return; + } + rot = encode_imm(~arg); + if (rot >= 0) { + tcg_out_dat_imm(s, cond, ARITH_MVN, rd, 0, + rotl(~arg, rot) | (rot << 7)); + return; + } + + /* Check for a pc-relative address. This will usually be the TB, + or within the TB, which is immediately before the code block. */ + diff = arg - ((intptr_t)s->code_ptr + 8); + if (diff >= 0) { + rot = encode_imm(diff); if (rot >= 0) { - tcg_out_dat_imm(s, cond, ARITH_MOV, rd, 0, - rotl(arg, rot) | (rot << 7)); + tcg_out_dat_imm(s, cond, ARITH_ADD, rd, TCG_REG_PC, + rotl(diff, rot) | (rot << 7)); return; } - rot = encode_imm(~arg); + } else { + rot = encode_imm(-diff); if (rot >= 0) { - tcg_out_dat_imm(s, cond, ARITH_MVN, rd, 0, - rotl(~arg, rot) | (rot << 7)); + tcg_out_dat_imm(s, cond, ARITH_SUB, rd, TCG_REG_PC, + rotl(-diff, rot) | (rot << 7)); return; } } @@ -1026,16 +1040,6 @@ static void tcg_out_call(TCGContext *s, tcg_insn_unit *addr) } } -void arm_tb_set_jmp_target(uintptr_t jmp_addr, uintptr_t addr) -{ - tcg_insn_unit *code_ptr = (tcg_insn_unit *)jmp_addr; - tcg_insn_unit *target = (tcg_insn_unit *)addr; - - /* we could use a ldr pc, [pc, #-4] kind of branch and avoid the flush */ - reloc_pc24_atomic(code_ptr, target); - flush_icache_range(jmp_addr, jmp_addr + 4); -} - static inline void tcg_out_goto_label(TCGContext *s, int cond, TCGLabel *l) { if (l->has_value) { @@ -1665,17 +1669,27 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, } break; case INDEX_op_goto_tb: - if (s->tb_jmp_insn_offset) { - /* Direct jump method */ - s->tb_jmp_insn_offset[args[0]] = tcg_current_code_size(s); - tcg_out_b_noaddr(s, COND_AL); - } else { + { /* Indirect jump method */ - intptr_t ptr = (intptr_t)(s->tb_jmp_target_addr + args[0]); - tcg_out_movi32(s, COND_AL, TCG_REG_R0, ptr & ~0xfff); - tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, TCG_REG_R0, ptr & 0xfff); + intptr_t ptr, dif, dil; + TCGReg base = TCG_REG_PC; + + tcg_debug_assert(s->tb_jmp_insn_offset == 0); + ptr = (intptr_t)(s->tb_jmp_target_addr + args[0]); + dif = ptr - ((intptr_t)s->code_ptr + 8); + dil = sextract32(dif, 0, 12); + if (dif != dil) { + /* The TB is close, but outside the 12 bits addressable by + the load. We can extend this to 20 bits with a sub of a + shifted immediate from pc. In the vastly unlikely event + the code requires more than 1MB, we'll use 2 insns and + be no worse off. */ + base = TCG_REG_R0; + tcg_out_movi32(s, COND_AL, base, ptr - dil); + } + tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, base, dil); + s->tb_jmp_reset_offset[args[0]] = tcg_current_code_size(s); } - s->tb_jmp_reset_offset[args[0]] = tcg_current_code_size(s); break; case INDEX_op_goto_ptr: tcg_out_bx(s, COND_AL, args[0]); diff --git a/tcg/ppc/tcg-target.inc.c b/tcg/ppc/tcg-target.inc.c index 8d50f18328..1f690df20d 100644 --- a/tcg/ppc/tcg-target.inc.c +++ b/tcg/ppc/tcg-target.inc.c @@ -2820,14 +2820,11 @@ void tcg_register_jit(void *buf, size_t buf_size) } #endif /* __ELF__ */ -static size_t dcache_bsize = 16; -static size_t icache_bsize = 16; - void flush_icache_range(uintptr_t start, uintptr_t stop) { uintptr_t p, start1, stop1; - size_t dsize = dcache_bsize; - size_t isize = icache_bsize; + size_t dsize = qemu_dcache_linesize; + size_t isize = qemu_icache_linesize; start1 = start & ~(dsize - 1); stop1 = (stop + dsize - 1) & ~(dsize - 1); @@ -2844,67 +2841,3 @@ void flush_icache_range(uintptr_t start, uintptr_t stop) asm volatile ("sync" : : : "memory"); asm volatile ("isync" : : : "memory"); } - -#if defined _AIX -#include <sys/systemcfg.h> - -static void __attribute__((constructor)) tcg_cache_init(void) -{ - icache_bsize = _system_configuration.icache_line; - dcache_bsize = _system_configuration.dcache_line; -} - -#elif defined __linux__ -static void __attribute__((constructor)) tcg_cache_init(void) -{ - unsigned long dsize = qemu_getauxval(AT_DCACHEBSIZE); - unsigned long isize = qemu_getauxval(AT_ICACHEBSIZE); - - if (dsize == 0 || isize == 0) { - if (dsize == 0) { - fprintf(stderr, "getauxval AT_DCACHEBSIZE failed\n"); - } - if (isize == 0) { - fprintf(stderr, "getauxval AT_ICACHEBSIZE failed\n"); - } - exit(1); - } - dcache_bsize = dsize; - icache_bsize = isize; -} - -#elif defined __APPLE__ -#include <sys/sysctl.h> - -static void __attribute__((constructor)) tcg_cache_init(void) -{ - size_t len; - unsigned cacheline; - int name[2] = { CTL_HW, HW_CACHELINE }; - - len = sizeof(cacheline); - if (sysctl(name, 2, &cacheline, &len, NULL, 0)) { - perror("sysctl CTL_HW HW_CACHELINE failed"); - exit(1); - } - dcache_bsize = cacheline; - icache_bsize = cacheline; -} - -#elif defined(__FreeBSD__) || defined(__FreeBSD_kernel__) -#include <sys/sysctl.h> - -static void __attribute__((constructor)) tcg_cache_init(void) -{ - size_t len = 4; - unsigned cacheline; - - if (sysctlbyname ("machdep.cacheline_size", &cacheline, &len, NULL, 0)) { - fprintf(stderr, "sysctlbyname machdep.cacheline_size failed: %s\n", - strerror(errno)); - exit(1); - } - dcache_bsize = cacheline; - icache_bsize = cacheline; -} -#endif diff --git a/tcg/tcg-runtime.c b/tcg/tcg-runtime.c index 7fa90ce508..ec3a34e461 100644 --- a/tcg/tcg-runtime.c +++ b/tcg/tcg-runtime.c @@ -149,23 +149,23 @@ void *HELPER(lookup_tb_ptr)(CPUArchState *env, target_ulong addr) CPUState *cpu = ENV_GET_CPU(env); TranslationBlock *tb; target_ulong cs_base, pc; - uint32_t flags; - - tb = atomic_rcu_read(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(addr)]); - if (likely(tb)) { - cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags); - if (likely(tb->pc == addr && tb->cs_base == cs_base && - tb->flags == flags)) { - goto found; - } + uint32_t flags, addr_hash; + + addr_hash = tb_jmp_cache_hash_func(addr); + tb = atomic_rcu_read(&cpu->tb_jmp_cache[addr_hash]); + cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags); + + if (unlikely(!(tb + && tb->pc == addr + && tb->cs_base == cs_base + && tb->flags == flags))) { tb = tb_htable_lookup(cpu, addr, cs_base, flags); - if (likely(tb)) { - atomic_set(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(addr)], tb); - goto found; + if (!tb) { + return tcg_ctx.code_gen_epilogue; } + atomic_set(&cpu->tb_jmp_cache[addr_hash], tb); } - return tcg_ctx.code_gen_epilogue; - found: + qemu_log_mask_and_addr(CPU_LOG_EXEC, addr, "Chain %p [%d: " TARGET_FMT_lx "] %s\n", tb->tc_ptr, cpu->cpu_index, addr, @@ -383,6 +383,26 @@ void tcg_context_init(TCGContext *s) } } +/* + * Allocate TBs right before their corresponding translated code, making + * sure that TBs and code are on different cache lines. + */ +TranslationBlock *tcg_tb_alloc(TCGContext *s) +{ + uintptr_t align = qemu_icache_linesize; + TranslationBlock *tb; + void *next; + + tb = (void *)ROUND_UP((uintptr_t)s->code_gen_ptr, align); + next = (void *)ROUND_UP((uintptr_t)(tb + 1), align); + + if (unlikely(next > s->code_gen_highwater)) { + return NULL; + } + s->code_gen_ptr = next; + return tb; +} + void tcg_prologue_init(TCGContext *s) { size_t prologue_size, total_size; @@ -697,7 +697,6 @@ struct TCGContext { here, because there's too much arithmetic throughout that relies on addition and subtraction working on bytes. Rely on the GCC extension that allows arithmetic on void*. */ - int code_gen_max_blocks; void *code_gen_prologue; void *code_gen_epilogue; void *code_gen_buffer; @@ -756,6 +755,7 @@ static inline bool tcg_op_buf_full(void) /* tb_lock must be held for tcg_malloc_internal. */ void *tcg_malloc_internal(TCGContext *s, int size); void tcg_pool_reset(TCGContext *s); +TranslationBlock *tcg_tb_alloc(TCGContext *s); void tb_lock(void); void tb_unlock(void); |