Diffstat (limited to 'tcg')
-rw-r--r--  tcg/aarch64/tcg-target.inc.c   7
-rw-r--r--  tcg/arm/tcg-target.inc.c      78
-rw-r--r--  tcg/ppc/tcg-target.inc.c      71
-rw-r--r--  tcg/tcg-runtime.c             28
-rw-r--r--  tcg/tcg.c                     20
-rw-r--r--  tcg/tcg.h                      2
6 files changed, 89 insertions, 117 deletions
diff --git a/tcg/aarch64/tcg-target.inc.c b/tcg/aarch64/tcg-target.inc.c
index 5f185458f1..1fa3bccc89 100644
--- a/tcg/aarch64/tcg-target.inc.c
+++ b/tcg/aarch64/tcg-target.inc.c
@@ -616,7 +616,12 @@ static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg rd,
/* Look for host pointer values within 4G of the PC. This happens
often when loading pointers to QEMU's own data structures. */
if (type == TCG_TYPE_I64) {
- tcg_target_long disp = (value >> 12) - ((intptr_t)s->code_ptr >> 12);
+ tcg_target_long disp = value - (intptr_t)s->code_ptr;
+ if (disp == sextract64(disp, 0, 21)) {
+ tcg_out_insn(s, 3406, ADR, rd, disp);
+ return;
+ }
+ disp = (value >> 12) - ((intptr_t)s->code_ptr >> 12);
if (disp == sextract64(disp, 0, 21)) {
tcg_out_insn(s, 3406, ADRP, rd, disp);
if (value & 0xfff) {
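[Illustrative sketch, not part of the patch.] The aarch64 hunk above adds a byte-accurate ADR attempt (+/-1MB) before the existing page-based ADRP path (+/-4GB). As a standalone illustration in plain C, using hypothetical helper names rather than QEMU's emitters, the two displacement checks amount to:

#include <stdint.h>

/* Sign-extend the low 'len' bits of 'v', like QEMU's sextract64(v, 0, len). */
static int64_t sext64(uint64_t v, unsigned len)
{
    return (int64_t)(v << (64 - len)) >> (64 - len);
}

/* Hypothetical classifier for the two pc-relative forms tried above:
   1 = a single ADR reaches 'value' (signed 21-bit byte offset),
   2 = ADRP reaches its 4K page (signed 21-bit page offset; the low 12 bits
       are added separately when nonzero),
   0 = neither, so the caller materializes the constant another way. */
static int pc_relative_form(int64_t value, int64_t pc)
{
    int64_t disp = value - pc;
    if (disp == sext64(disp, 21)) {
        return 1;                         /* ADR  rd, value                  */
    }
    disp = (value >> 12) - (pc >> 12);
    if (disp == sext64(disp, 21)) {
        return 2;                         /* ADRP rd, page; ADD rd, rd, lo12 */
    }
    return 0;
}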
diff --git a/tcg/arm/tcg-target.inc.c b/tcg/arm/tcg-target.inc.c
index 9f5cb66718..d1793ec77d 100644
--- a/tcg/arm/tcg-target.inc.c
+++ b/tcg/arm/tcg-target.inc.c
@@ -418,23 +418,37 @@ static inline void tcg_out_dat_imm(TCGContext *s,
static void tcg_out_movi32(TCGContext *s, int cond, int rd, uint32_t arg)
{
- int rot, opc, rn;
-
- /* For armv7, make sure not to use movw+movt when mov/mvn would do.
- Speed things up by only checking when movt would be required.
- Prior to armv7, have one go at fully rotated immediates before
- doing the decomposition thing below. */
- if (!use_armv7_instructions || (arg & 0xffff0000)) {
- rot = encode_imm(arg);
+ int rot, opc, rn, diff;
+
+ /* Check a single MOV/MVN before anything else. */
+ rot = encode_imm(arg);
+ if (rot >= 0) {
+ tcg_out_dat_imm(s, cond, ARITH_MOV, rd, 0,
+ rotl(arg, rot) | (rot << 7));
+ return;
+ }
+ rot = encode_imm(~arg);
+ if (rot >= 0) {
+ tcg_out_dat_imm(s, cond, ARITH_MVN, rd, 0,
+ rotl(~arg, rot) | (rot << 7));
+ return;
+ }
+
+ /* Check for a pc-relative address. This will usually be the TB,
+ or within the TB, which is immediately before the code block. */
+ diff = arg - ((intptr_t)s->code_ptr + 8);
+ if (diff >= 0) {
+ rot = encode_imm(diff);
if (rot >= 0) {
- tcg_out_dat_imm(s, cond, ARITH_MOV, rd, 0,
- rotl(arg, rot) | (rot << 7));
+ tcg_out_dat_imm(s, cond, ARITH_ADD, rd, TCG_REG_PC,
+ rotl(diff, rot) | (rot << 7));
return;
}
- rot = encode_imm(~arg);
+ } else {
+ rot = encode_imm(-diff);
if (rot >= 0) {
- tcg_out_dat_imm(s, cond, ARITH_MVN, rd, 0,
- rotl(~arg, rot) | (rot << 7));
+ tcg_out_dat_imm(s, cond, ARITH_SUB, rd, TCG_REG_PC,
+ rotl(-diff, rot) | (rot << 7));
return;
}
}
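[Illustrative sketch, not part of the patch.] The reordered tcg_out_movi32 above tries a plain MOV/MVN of a rotated immediate first, then a pc-relative ADD/SUB. For readers unfamiliar with encode_imm: an ARM data-processing immediate is an 8-bit value rotated right by an even amount (0..30). A standalone sketch of that test, assuming only that definition:

#include <stdint.h>

/* Rotate left, as the backend's rotl() does. */
static uint32_t rotl32(uint32_t v, unsigned r)
{
    return (v << r) | (v >> ((32 - r) & 31));
}

/* Return the even rotation (0..30) for which 'imm' is an 8-bit value
   rotated right by that amount, or -1 if there is none.  This is the
   property encode_imm() is used to check above; the instruction's
   immediate field is then rotl(imm, rot) | (rot << 7). */
static int arm_encode_imm(uint32_t imm)
{
    for (int rot = 0; rot < 32; rot += 2) {
        if ((rotl32(imm, rot) & ~0xffu) == 0) {
            return rot;
        }
    }
    return -1;
}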
@@ -1026,16 +1040,6 @@ static void tcg_out_call(TCGContext *s, tcg_insn_unit *addr)
}
}
-void arm_tb_set_jmp_target(uintptr_t jmp_addr, uintptr_t addr)
-{
- tcg_insn_unit *code_ptr = (tcg_insn_unit *)jmp_addr;
- tcg_insn_unit *target = (tcg_insn_unit *)addr;
-
- /* we could use a ldr pc, [pc, #-4] kind of branch and avoid the flush */
- reloc_pc24_atomic(code_ptr, target);
- flush_icache_range(jmp_addr, jmp_addr + 4);
-}
-
static inline void tcg_out_goto_label(TCGContext *s, int cond, TCGLabel *l)
{
if (l->has_value) {
@@ -1665,17 +1669,27 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
}
break;
case INDEX_op_goto_tb:
- if (s->tb_jmp_insn_offset) {
- /* Direct jump method */
- s->tb_jmp_insn_offset[args[0]] = tcg_current_code_size(s);
- tcg_out_b_noaddr(s, COND_AL);
- } else {
+ {
/* Indirect jump method */
- intptr_t ptr = (intptr_t)(s->tb_jmp_target_addr + args[0]);
- tcg_out_movi32(s, COND_AL, TCG_REG_R0, ptr & ~0xfff);
- tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, TCG_REG_R0, ptr & 0xfff);
+ intptr_t ptr, dif, dil;
+ TCGReg base = TCG_REG_PC;
+
+ tcg_debug_assert(s->tb_jmp_insn_offset == 0);
+ ptr = (intptr_t)(s->tb_jmp_target_addr + args[0]);
+ dif = ptr - ((intptr_t)s->code_ptr + 8);
+ dil = sextract32(dif, 0, 12);
+ if (dif != dil) {
+ /* The TB is close, but outside the 12 bits addressable by
+ the load. We can extend this to 20 bits with a sub of a
+ shifted immediate from pc. In the vastly unlikely event
+ the code requires more than 1MB, we'll use 2 insns and
+ be no worse off. */
+ base = TCG_REG_R0;
+ tcg_out_movi32(s, COND_AL, base, ptr - dil);
+ }
+ tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, base, dil);
+ s->tb_jmp_reset_offset[args[0]] = tcg_current_code_size(s);
}
- s->tb_jmp_reset_offset[args[0]] = tcg_current_code_size(s);
break;
case INDEX_op_goto_ptr:
tcg_out_bx(s, COND_AL, args[0]);
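[Illustrative sketch, not part of the patch.] The rewritten goto_tb case always loads pc from the jump slot; when the slot lies outside the signed 12-bit reach of a pc-relative load, it first materializes slot minus the low 12 bits as the base register. A small standalone sketch of that split, with hypothetical names:

#include <assert.h>
#include <stdint.h>

/* Sign-extract the low 12 bits, as sextract32(dif, 0, 12) does above. */
static int32_t low12_signed(int32_t v)
{
    return (int32_t)((uint32_t)v << 20) >> 20;
}

/* 'slot' is the address of tb_jmp_target_addr[n]; 'pc8' is the pc value the
   load sees (instruction address + 8).  On the 32-bit host the difference
   always fits in 32 bits. */
static void split_goto_tb(intptr_t slot, intptr_t pc8)
{
    intptr_t dif = slot - pc8;
    intptr_t dil = low12_signed((int32_t)dif);
    /* The base stays pc when the offset fits; otherwise it is loaded with
       tcg_out_movi32(ptr - dil), which the diff places in TCG_REG_R0. */
    intptr_t base = (dif != dil) ? slot - dil : pc8;

    /* Either way, "ldr pc, [base, #dil]" targets exactly the slot. */
    assert(base + dil == slot);
}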
diff --git a/tcg/ppc/tcg-target.inc.c b/tcg/ppc/tcg-target.inc.c
index 8d50f18328..1f690df20d 100644
--- a/tcg/ppc/tcg-target.inc.c
+++ b/tcg/ppc/tcg-target.inc.c
@@ -2820,14 +2820,11 @@ void tcg_register_jit(void *buf, size_t buf_size)
}
#endif /* __ELF__ */
-static size_t dcache_bsize = 16;
-static size_t icache_bsize = 16;
-
void flush_icache_range(uintptr_t start, uintptr_t stop)
{
uintptr_t p, start1, stop1;
- size_t dsize = dcache_bsize;
- size_t isize = icache_bsize;
+ size_t dsize = qemu_dcache_linesize;
+ size_t isize = qemu_icache_linesize;
start1 = start & ~(dsize - 1);
stop1 = (stop + dsize - 1) & ~(dsize - 1);
@@ -2844,67 +2841,3 @@ void flush_icache_range(uintptr_t start, uintptr_t stop)
asm volatile ("sync" : : : "memory");
asm volatile ("isync" : : : "memory");
}
-
-#if defined _AIX
-#include <sys/systemcfg.h>
-
-static void __attribute__((constructor)) tcg_cache_init(void)
-{
- icache_bsize = _system_configuration.icache_line;
- dcache_bsize = _system_configuration.dcache_line;
-}
-
-#elif defined __linux__
-static void __attribute__((constructor)) tcg_cache_init(void)
-{
- unsigned long dsize = qemu_getauxval(AT_DCACHEBSIZE);
- unsigned long isize = qemu_getauxval(AT_ICACHEBSIZE);
-
- if (dsize == 0 || isize == 0) {
- if (dsize == 0) {
- fprintf(stderr, "getauxval AT_DCACHEBSIZE failed\n");
- }
- if (isize == 0) {
- fprintf(stderr, "getauxval AT_ICACHEBSIZE failed\n");
- }
- exit(1);
- }
- dcache_bsize = dsize;
- icache_bsize = isize;
-}
-
-#elif defined __APPLE__
-#include <sys/sysctl.h>
-
-static void __attribute__((constructor)) tcg_cache_init(void)
-{
- size_t len;
- unsigned cacheline;
- int name[2] = { CTL_HW, HW_CACHELINE };
-
- len = sizeof(cacheline);
- if (sysctl(name, 2, &cacheline, &len, NULL, 0)) {
- perror("sysctl CTL_HW HW_CACHELINE failed");
- exit(1);
- }
- dcache_bsize = cacheline;
- icache_bsize = cacheline;
-}
-
-#elif defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
-#include <sys/sysctl.h>
-
-static void __attribute__((constructor)) tcg_cache_init(void)
-{
- size_t len = 4;
- unsigned cacheline;
-
- if (sysctlbyname ("machdep.cacheline_size", &cacheline, &len, NULL, 0)) {
- fprintf(stderr, "sysctlbyname machdep.cacheline_size failed: %s\n",
- strerror(errno));
- exit(1);
- }
- dcache_bsize = cacheline;
- icache_bsize = cacheline;
-}
-#endif
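[Illustrative sketch, not part of the patch.] The surviving flush_icache_range() body (first hunk above) keeps its structure; the patch only changes where the line sizes come from, replacing the per-backend constructors deleted here with the shared qemu_dcache_linesize / qemu_icache_linesize globals probed at startup outside the backend. A minimal sketch of the loop-bound rounding, with a worked example in the comment:

#include <stddef.h>
#include <stdint.h>

/* Round [start, stop) out to 'lsize'-byte cache lines, as the loop bounds in
   flush_icache_range() do.  Example: lsize = 64, start = 0x1005, stop = 0x1085
   gives start1 = 0x1000 and stop1 = 0x10c0, i.e. three lines are walked. */
static void line_bounds(uintptr_t start, uintptr_t stop, size_t lsize,
                        uintptr_t *start1, uintptr_t *stop1)
{
    *start1 = start & ~(uintptr_t)(lsize - 1);
    *stop1 = (stop + lsize - 1) & ~(uintptr_t)(lsize - 1);
    /* The real function walks [start1, stop1) issuing "dcbst 0,p", does a
       "sync", repeats the walk with the icache line size issuing "icbi 0,p",
       and finishes with "sync; isync". */
}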
diff --git a/tcg/tcg-runtime.c b/tcg/tcg-runtime.c
index 7fa90ce508..ec3a34e461 100644
--- a/tcg/tcg-runtime.c
+++ b/tcg/tcg-runtime.c
@@ -149,23 +149,23 @@ void *HELPER(lookup_tb_ptr)(CPUArchState *env, target_ulong addr)
CPUState *cpu = ENV_GET_CPU(env);
TranslationBlock *tb;
target_ulong cs_base, pc;
- uint32_t flags;
-
- tb = atomic_rcu_read(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(addr)]);
- if (likely(tb)) {
- cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
- if (likely(tb->pc == addr && tb->cs_base == cs_base &&
- tb->flags == flags)) {
- goto found;
- }
+ uint32_t flags, addr_hash;
+
+ addr_hash = tb_jmp_cache_hash_func(addr);
+ tb = atomic_rcu_read(&cpu->tb_jmp_cache[addr_hash]);
+ cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
+
+ if (unlikely(!(tb
+ && tb->pc == addr
+ && tb->cs_base == cs_base
+ && tb->flags == flags))) {
tb = tb_htable_lookup(cpu, addr, cs_base, flags);
- if (likely(tb)) {
- atomic_set(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(addr)], tb);
- goto found;
+ if (!tb) {
+ return tcg_ctx.code_gen_epilogue;
}
+ atomic_set(&cpu->tb_jmp_cache[addr_hash], tb);
}
- return tcg_ctx.code_gen_epilogue;
- found:
+
qemu_log_mask_and_addr(CPU_LOG_EXEC, addr,
"Chain %p [%d: " TARGET_FMT_lx "] %s\n",
tb->tc_ptr, cpu->cpu_index, addr,
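[Illustrative sketch, not part of the patch.] The restructured helper hashes the address once and folds the two former "goto found" paths into a single exit: probe the direct-mapped tb_jmp_cache, fall back to tb_htable_lookup on a miss (refilling the cache), and return code_gen_epilogue only when both miss. A generic, self-contained sketch of that control flow; the real key is the (pc, cs_base, flags) triple, collapsed here to one integer, and the types are stand-ins:

#include <stddef.h>
#include <stdint.h>

typedef struct Entry { uint64_t key; void *payload; } Entry;

#define CACHE_BITS 12
static Entry *cache[1 << CACHE_BITS];

/* Stub standing in for the slow-path lookup (tb_htable_lookup); it always
   misses in this sketch. */
static Entry *slow_lookup(uint64_t key)
{
    (void)key;
    return NULL;
}

static void *lookup(uint64_t key, void *miss_result)
{
    size_t hash = key & ((1u << CACHE_BITS) - 1);
    Entry *e = cache[hash];

    if (!(e && e->key == key)) {          /* fast path missed */
        e = slow_lookup(key);
        if (!e) {
            return miss_result;           /* like returning code_gen_epilogue */
        }
        cache[hash] = e;                  /* refill the direct-mapped cache */
    }
    return e->payload;                    /* like falling through to the TB */
}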
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 564292f54d..35598296c5 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -383,6 +383,26 @@ void tcg_context_init(TCGContext *s)
}
}
+/*
+ * Allocate TBs right before their corresponding translated code, making
+ * sure that TBs and code are on different cache lines.
+ */
+TranslationBlock *tcg_tb_alloc(TCGContext *s)
+{
+ uintptr_t align = qemu_icache_linesize;
+ TranslationBlock *tb;
+ void *next;
+
+ tb = (void *)ROUND_UP((uintptr_t)s->code_gen_ptr, align);
+ next = (void *)ROUND_UP((uintptr_t)(tb + 1), align);
+
+ if (unlikely(next > s->code_gen_highwater)) {
+ return NULL;
+ }
+ s->code_gen_ptr = next;
+ return tb;
+}
+
void tcg_prologue_init(TCGContext *s)
{
size_t prologue_size, total_size;
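[Illustrative sketch, not part of the patch.] The new tcg_tb_alloc() bump-allocates each TranslationBlock header at an icache-line boundary and leaves code_gen_ptr at the next boundary, so the header and the translated code that follows never share a cache line; a NULL return signals the buffer is (nearly) full and the caller is expected to flush. A self-contained sketch of the same arithmetic, using a local buffer and stand-in types (ROUND_UP here assumes a power-of-two alignment):

#include <stdint.h>
#include <stdio.h>

#define ROUND_UP(n, d) (((n) + (d) - 1) & -(d))

/* Stand-ins for the real structures and fields. */
typedef struct TB { char opaque[160]; } TB;
static char buffer[4096];
static void *gen_ptr = buffer;                           /* code_gen_ptr       */
static void *highwater = buffer + sizeof(buffer) - 1024; /* code_gen_highwater */

/* Allocate a TB header aligned to 'align' and advance gen_ptr to the next
   aligned address, where code generation would begin. */
static TB *tb_alloc_sketch(uintptr_t align)
{
    TB *tb = (void *)ROUND_UP((uintptr_t)gen_ptr, align);
    void *next = (void *)ROUND_UP((uintptr_t)(tb + 1), align);

    if (next > highwater) {
        return NULL;              /* buffer exhausted: caller flushes/retries */
    }
    gen_ptr = next;
    return tb;
}

int main(void)
{
    TB *tb = tb_alloc_sketch(64);
    printf("tb at %p, code would start at %p\n", (void *)tb, gen_ptr);
    return 0;
}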
diff --git a/tcg/tcg.h b/tcg/tcg.h
index 5ec48d1787..9e37722799 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -697,7 +697,6 @@ struct TCGContext {
here, because there's too much arithmetic throughout that relies
on addition and subtraction working on bytes. Rely on the GCC
extension that allows arithmetic on void*. */
- int code_gen_max_blocks;
void *code_gen_prologue;
void *code_gen_epilogue;
void *code_gen_buffer;
@@ -756,6 +755,7 @@ static inline bool tcg_op_buf_full(void)
/* tb_lock must be held for tcg_malloc_internal. */
void *tcg_malloc_internal(TCGContext *s, int size);
void tcg_pool_reset(TCGContext *s);
+TranslationBlock *tcg_tb_alloc(TCGContext *s);
void tb_lock(void);
void tb_unlock(void);