aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAnthony Liguori <aliguori@us.ibm.com>2013-07-26 13:04:21 -0500
committerAnthony Liguori <aliguori@us.ibm.com>2013-07-26 13:04:21 -0500
commitf0ef1cf4d6c996d90df6e4a32a82423d180f3e4d (patch)
treececa64dbd292ac8cddb28752a6d6495d138a6c28
parentb142d79328cd0a0556634b0eeb891a15bfd5a00c (diff)
parentf290e4988dd8eb012de0517a1ff93df130e87da1 (diff)
Merge remote-tracking branch 'rth/tcg-next' into staging
# By Claudio Fontana (1) and others # Via Richard Henderson * rth/tcg-next: tcg: Remove temp_buf tcg/aarch64: Implement tlb lookup fast path tcg/aarch64: implement ldst 12bit scaled uimm offset Message-id: 1373919944-8521-1-git-send-email-rth@twiddle.net Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
-rwxr-xr-xconfigure2
-rw-r--r--include/exec/cpu-defs.h2
-rw-r--r--include/exec/exec-all.h14
-rw-r--r--tcg/aarch64/tcg-target.c256
4 files changed, 218 insertions, 56 deletions
diff --git a/configure b/configure
index 3d83e168bc..f5e299adcf 100755
--- a/configure
+++ b/configure
@@ -3645,7 +3645,7 @@ echo "libs_softmmu=$libs_softmmu" >> $config_host_mak
echo "ARCH=$ARCH" >> $config_host_mak
case "$cpu" in
- arm|i386|x86_64|ppc)
+ arm|i386|x86_64|ppc|aarch64)
# The TCG interpreter currently does not support ld/st optimization.
if test "$tcg_interpreter" = "no" ; then
echo "CONFIG_QEMU_LDST_OPTIMIZATION=y" >> $config_host_mak
diff --git a/include/exec/cpu-defs.h b/include/exec/cpu-defs.h
index b5b93db842..a5c028c536 100644
--- a/include/exec/cpu-defs.h
+++ b/include/exec/cpu-defs.h
@@ -154,8 +154,6 @@ typedef struct CPUWatchpoint {
memory was accessed */ \
CPU_COMMON_TLB \
struct TranslationBlock *tb_jmp_cache[TB_JMP_CACHE_SIZE]; \
- /* buffer for temporaries in the code generator */ \
- long temp_buf[CPU_TEMP_BUF_NLONGS]; \
\
int64_t icount_extra; /* Instructions until next timer event. */ \
/* Number of cycles left, with interrupt flag in high bit. \
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index b2162a4ec4..5920f73c90 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -358,6 +358,20 @@ static inline uintptr_t tcg_getpc_ldst(uintptr_t ra)
not the start of the next opcode */
return ra;
}
+#elif defined(__aarch64__)
+# define GETRA() ((uintptr_t)__builtin_return_address(0))
+# define GETPC_LDST() tcg_getpc_ldst(GETRA())
+static inline uintptr_t tcg_getpc_ldst(uintptr_t ra)
+{
+ int32_t b;
+ ra += 4; /* skip one instruction */
+ b = *(int32_t *)ra; /* load the branch insn */
+ b = (b << 6) >> (6 - 2); /* extract the displacement */
+ ra += b; /* apply the displacement */
+ ra -= 4; /* return a pointer into the current opcode,
+ not the start of the next opcode */
+ return ra;
+}
# else
# error "CONFIG_QEMU_LDST_OPTIMIZATION needs GETPC_LDST() implementation!"
# endif
diff --git a/tcg/aarch64/tcg-target.c b/tcg/aarch64/tcg-target.c
index 562a549dab..41a17f8a62 100644
--- a/tcg/aarch64/tcg-target.c
+++ b/tcg/aarch64/tcg-target.c
@@ -315,6 +315,17 @@ static inline void tcg_out_ldst_9(TCGContext *s,
tcg_out32(s, op_data << 24 | mod << 20 | off << 12 | rn << 5 | rd);
}
+/* tcg_out_ldst_12 expects a scaled unsigned immediate offset */
+static inline void tcg_out_ldst_12(TCGContext *s,
+ enum aarch64_ldst_op_data op_data,
+ enum aarch64_ldst_op_type op_type,
+ TCGReg rd, TCGReg rn,
+ tcg_target_ulong scaled_uimm)
+{
+ tcg_out32(s, (op_data | 1) << 24
+ | op_type << 20 | scaled_uimm << 10 | rn << 5 | rd);
+}
+
static inline void tcg_out_movr(TCGContext *s, int ext, TCGReg rd, TCGReg src)
{
/* register to register move using MOV (shifted register with no shift) */
@@ -374,10 +385,25 @@ static inline void tcg_out_ldst(TCGContext *s, enum aarch64_ldst_op_data data,
{
if (offset >= -256 && offset < 256) {
tcg_out_ldst_9(s, data, type, rd, rn, offset);
- } else {
- tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP, offset);
- tcg_out_ldst_r(s, data, type, rd, rn, TCG_REG_TMP);
+ return;
+ }
+
+ if (offset >= 256) {
+ /* if the offset is naturally aligned and in range,
+ then we can use the scaled uimm12 encoding */
+ unsigned int s_bits = data >> 6;
+ if (!(offset & ((1 << s_bits) - 1))) {
+ tcg_target_ulong scaled_uimm = offset >> s_bits;
+ if (scaled_uimm <= 0xfff) {
+ tcg_out_ldst_12(s, data, type, rd, rn, scaled_uimm);
+ return;
+ }
+ }
}
+
+ /* worst-case scenario, move offset to temp register, use reg offset */
+ tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP, offset);
+ tcg_out_ldst_r(s, data, type, rd, rn, TCG_REG_TMP);
}
/* mov alias implemented with add immediate, useful to move to/from SP */
@@ -706,6 +732,51 @@ static inline void tcg_out_uxt(TCGContext *s, int s_bits,
tcg_out_ubfm(s, 0, rd, rn, 0, bits);
}
+static inline void tcg_out_addi(TCGContext *s, int ext,
+ TCGReg rd, TCGReg rn, unsigned int aimm)
+{
+ /* add immediate aimm unsigned 12bit value (with LSL 0 or 12) */
+ /* using ADD 0x11000000 | (ext) | (aimm << 10) | (rn << 5) | rd */
+ unsigned int base = ext ? 0x91000000 : 0x11000000;
+
+ if (aimm <= 0xfff) {
+ aimm <<= 10;
+ } else {
+ /* we can only shift left by 12, on assert we cannot represent */
+ assert(!(aimm & 0xfff));
+ assert(aimm <= 0xfff000);
+ base |= 1 << 22; /* apply LSL 12 */
+ aimm >>= 2;
+ }
+
+ tcg_out32(s, base | aimm | (rn << 5) | rd);
+}
+
+static inline void tcg_out_subi(TCGContext *s, int ext,
+ TCGReg rd, TCGReg rn, unsigned int aimm)
+{
+ /* sub immediate aimm unsigned 12bit value (with LSL 0 or 12) */
+ /* using SUB 0x51000000 | (ext) | (aimm << 10) | (rn << 5) | rd */
+ unsigned int base = ext ? 0xd1000000 : 0x51000000;
+
+ if (aimm <= 0xfff) {
+ aimm <<= 10;
+ } else {
+ /* we can only shift left by 12, on assert we cannot represent */
+ assert(!(aimm & 0xfff));
+ assert(aimm <= 0xfff000);
+ base |= 1 << 22; /* apply LSL 12 */
+ aimm >>= 2;
+ }
+
+ tcg_out32(s, base | aimm | (rn << 5) | rd);
+}
+
+static inline void tcg_out_nop(TCGContext *s)
+{
+ tcg_out32(s, 0xd503201f);
+}
+
#ifdef CONFIG_SOFTMMU
#include "exec/softmmu_defs.h"
@@ -727,7 +798,125 @@ static const void * const qemu_st_helpers[4] = {
helper_stq_mmu,
};
-#else /* !CONFIG_SOFTMMU */
+static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
+{
+ reloc_pc19(lb->label_ptr[0], (tcg_target_long)s->code_ptr);
+ tcg_out_movr(s, 1, TCG_REG_X0, TCG_AREG0);
+ tcg_out_movr(s, (TARGET_LONG_BITS == 64), TCG_REG_X1, lb->addrlo_reg);
+ tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_X2, lb->mem_index);
+ tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP,
+ (tcg_target_long)qemu_ld_helpers[lb->opc & 3]);
+ tcg_out_callr(s, TCG_REG_TMP);
+ if (lb->opc & 0x04) {
+ tcg_out_sxt(s, 1, lb->opc & 3, lb->datalo_reg, TCG_REG_X0);
+ } else {
+ tcg_out_movr(s, 1, lb->datalo_reg, TCG_REG_X0);
+ }
+
+ tcg_out_goto(s, (tcg_target_long)lb->raddr);
+}
+
+static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
+{
+ reloc_pc19(lb->label_ptr[0], (tcg_target_long)s->code_ptr);
+
+ tcg_out_movr(s, 1, TCG_REG_X0, TCG_AREG0);
+ tcg_out_movr(s, (TARGET_LONG_BITS == 64), TCG_REG_X1, lb->addrlo_reg);
+ tcg_out_movr(s, 1, TCG_REG_X2, lb->datalo_reg);
+ tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_X3, lb->mem_index);
+ tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP,
+ (tcg_target_long)qemu_st_helpers[lb->opc & 3]);
+ tcg_out_callr(s, TCG_REG_TMP);
+
+ tcg_out_nop(s);
+ tcg_out_goto(s, (tcg_target_long)lb->raddr);
+}
+
+void tcg_out_tb_finalize(TCGContext *s)
+{
+ int i;
+ for (i = 0; i < s->nb_qemu_ldst_labels; i++) {
+ TCGLabelQemuLdst *label = &s->qemu_ldst_labels[i];
+ if (label->is_ld) {
+ tcg_out_qemu_ld_slow_path(s, label);
+ } else {
+ tcg_out_qemu_st_slow_path(s, label);
+ }
+ }
+}
+
+static void add_qemu_ldst_label(TCGContext *s, int is_ld, int opc,
+ TCGReg data_reg, TCGReg addr_reg,
+ int mem_index,
+ uint8_t *raddr, uint8_t *label_ptr)
+{
+ int idx;
+ TCGLabelQemuLdst *label;
+
+ if (s->nb_qemu_ldst_labels >= TCG_MAX_QEMU_LDST) {
+ tcg_abort();
+ }
+
+ idx = s->nb_qemu_ldst_labels++;
+ label = &s->qemu_ldst_labels[idx];
+ label->is_ld = is_ld;
+ label->opc = opc;
+ label->datalo_reg = data_reg;
+ label->addrlo_reg = addr_reg;
+ label->mem_index = mem_index;
+ label->raddr = raddr;
+ label->label_ptr[0] = label_ptr;
+}
+
+/* Load and compare a TLB entry, emitting the conditional jump to the
+ slow path for the failure case, which will be patched later when finalizing
+ the slow path. Generated code returns the host addend in X1,
+ clobbers X0,X2,X3,TMP. */
+static void tcg_out_tlb_read(TCGContext *s, TCGReg addr_reg,
+ int s_bits, uint8_t **label_ptr, int mem_index, int is_read)
+{
+ TCGReg base = TCG_AREG0;
+ int tlb_offset = is_read ?
+ offsetof(CPUArchState, tlb_table[mem_index][0].addr_read)
+ : offsetof(CPUArchState, tlb_table[mem_index][0].addr_write);
+ /* Extract the TLB index from the address into X0.
+ X0<CPU_TLB_BITS:0> =
+ addr_reg<TARGET_PAGE_BITS+CPU_TLB_BITS:TARGET_PAGE_BITS> */
+ tcg_out_ubfm(s, (TARGET_LONG_BITS == 64), TCG_REG_X0, addr_reg,
+ TARGET_PAGE_BITS, TARGET_PAGE_BITS + CPU_TLB_BITS);
+ /* Store the page mask part of the address and the low s_bits into X3.
+ Later this allows checking for equality and alignment at the same time.
+ X3 = addr_reg & (PAGE_MASK | ((1 << s_bits) - 1)) */
+ tcg_out_andi(s, (TARGET_LONG_BITS == 64), TCG_REG_X3, addr_reg,
+ (TARGET_LONG_BITS - TARGET_PAGE_BITS) + s_bits,
+ (TARGET_LONG_BITS - TARGET_PAGE_BITS));
+ /* Add any "high bits" from the tlb offset to the env address into X2,
+ to take advantage of the LSL12 form of the addi instruction.
+ X2 = env + (tlb_offset & 0xfff000) */
+ tcg_out_addi(s, 1, TCG_REG_X2, base, tlb_offset & 0xfff000);
+ /* Merge the tlb index contribution into X2.
+ X2 = X2 + (X0 << CPU_TLB_ENTRY_BITS) */
+ tcg_out_arith(s, ARITH_ADD, 1, TCG_REG_X2, TCG_REG_X2,
+ TCG_REG_X0, -CPU_TLB_ENTRY_BITS);
+ /* Merge "low bits" from tlb offset, load the tlb comparator into X0.
+ X0 = load [X2 + (tlb_offset & 0x000fff)] */
+ tcg_out_ldst(s, TARGET_LONG_BITS == 64 ? LDST_64 : LDST_32,
+ LDST_LD, TCG_REG_X0, TCG_REG_X2,
+ (tlb_offset & 0xfff));
+ /* Load the tlb addend. Do that early to avoid stalling.
+ X1 = load [X2 + (tlb_offset & 0xfff) + offsetof(addend)] */
+ tcg_out_ldst(s, LDST_64, LDST_LD, TCG_REG_X1, TCG_REG_X2,
+ (tlb_offset & 0xfff) + (offsetof(CPUTLBEntry, addend)) -
+ (is_read ? offsetof(CPUTLBEntry, addr_read)
+ : offsetof(CPUTLBEntry, addr_write)));
+ /* Perform the address comparison. */
+ tcg_out_cmp(s, (TARGET_LONG_BITS == 64), TCG_REG_X0, TCG_REG_X3, 0);
+ *label_ptr = s->code_ptr;
+ /* If not equal, we jump to the slow path. */
+ tcg_out_goto_cond_noaddr(s, TCG_COND_NE);
+}
+
+#endif /* CONFIG_SOFTMMU */
static void tcg_out_qemu_ld_direct(TCGContext *s, int opc, TCGReg data_r,
TCGReg addr_r, TCGReg off_r)
@@ -815,13 +1004,13 @@ static void tcg_out_qemu_st_direct(TCGContext *s, int opc, TCGReg data_r,
tcg_abort();
}
}
-#endif /* CONFIG_SOFTMMU */
static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc)
{
TCGReg addr_reg, data_reg;
#ifdef CONFIG_SOFTMMU
int mem_index, s_bits;
+ uint8_t *label_ptr;
#endif
data_reg = args[0];
addr_reg = args[1];
@@ -829,23 +1018,10 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc)
#ifdef CONFIG_SOFTMMU
mem_index = args[2];
s_bits = opc & 3;
-
- /* TODO: insert TLB lookup here */
-
- /* all arguments passed via registers */
- tcg_out_movr(s, 1, TCG_REG_X0, TCG_AREG0);
- tcg_out_movr(s, (TARGET_LONG_BITS == 64), TCG_REG_X1, addr_reg);
- tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_X2, mem_index);
- tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP,
- (tcg_target_long)qemu_ld_helpers[s_bits]);
- tcg_out_callr(s, TCG_REG_TMP);
-
- if (opc & 0x04) { /* sign extend */
- tcg_out_sxt(s, 1, s_bits, data_reg, TCG_REG_X0);
- } else {
- tcg_out_movr(s, 1, data_reg, TCG_REG_X0);
- }
-
+ tcg_out_tlb_read(s, addr_reg, s_bits, &label_ptr, mem_index, 1);
+ tcg_out_qemu_ld_direct(s, opc, data_reg, addr_reg, TCG_REG_X1);
+ add_qemu_ldst_label(s, 1, opc, data_reg, addr_reg,
+ mem_index, s->code_ptr, label_ptr);
#else /* !CONFIG_SOFTMMU */
tcg_out_qemu_ld_direct(s, opc, data_reg, addr_reg,
GUEST_BASE ? TCG_REG_GUEST_BASE : TCG_REG_XZR);
@@ -857,6 +1033,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, int opc)
TCGReg addr_reg, data_reg;
#ifdef CONFIG_SOFTMMU
int mem_index, s_bits;
+ uint8_t *label_ptr;
#endif
data_reg = args[0];
addr_reg = args[1];
@@ -865,17 +1042,10 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, int opc)
mem_index = args[2];
s_bits = opc & 3;
- /* TODO: insert TLB lookup here */
-
- /* all arguments passed via registers */
- tcg_out_movr(s, 1, TCG_REG_X0, TCG_AREG0);
- tcg_out_movr(s, (TARGET_LONG_BITS == 64), TCG_REG_X1, addr_reg);
- tcg_out_movr(s, 1, TCG_REG_X2, data_reg);
- tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_X3, mem_index);
- tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP,
- (tcg_target_long)qemu_st_helpers[s_bits]);
- tcg_out_callr(s, TCG_REG_TMP);
-
+ tcg_out_tlb_read(s, addr_reg, s_bits, &label_ptr, mem_index, 0);
+ tcg_out_qemu_st_direct(s, opc, data_reg, addr_reg, TCG_REG_X1);
+ add_qemu_ldst_label(s, 0, opc, data_reg, addr_reg,
+ mem_index, s->code_ptr, label_ptr);
#else /* !CONFIG_SOFTMMU */
tcg_out_qemu_st_direct(s, opc, data_reg, addr_reg,
GUEST_BASE ? TCG_REG_GUEST_BASE : TCG_REG_XZR);
@@ -1318,26 +1488,6 @@ static void tcg_target_init(TCGContext *s)
tcg_add_target_add_op_defs(aarch64_op_defs);
}
-static inline void tcg_out_addi(TCGContext *s, int ext,
- TCGReg rd, TCGReg rn, unsigned int aimm)
-{
- /* add immediate aimm unsigned 12bit value (we use LSL 0 - no shift) */
- /* using ADD 0x11000000 | (ext) | (aimm << 10) | (rn << 5) | rd */
- unsigned int base = ext ? 0x91000000 : 0x11000000;
- assert(aimm <= 0xfff);
- tcg_out32(s, base | (aimm << 10) | (rn << 5) | rd);
-}
-
-static inline void tcg_out_subi(TCGContext *s, int ext,
- TCGReg rd, TCGReg rn, unsigned int aimm)
-{
- /* sub immediate aimm unsigned 12bit value (we use LSL 0 - no shift) */
- /* using SUB 0x51000000 | (ext) | (aimm << 10) | (rn << 5) | rd */
- unsigned int base = ext ? 0xd1000000 : 0x51000000;
- assert(aimm <= 0xfff);
- tcg_out32(s, base | (aimm << 10) | (rn << 5) | rd);
-}
-
static void tcg_target_qemu_prologue(TCGContext *s)
{
/* NB: frame sizes are in 16 byte stack units! */