50 files changed, 5320 insertions, 3327 deletions
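For orientation before the per-file hunks: the patch collapses the old per-endianness helpers (helper_le_*/helper_be_*) into single entry points that take a 64-bit guest address and a MemOpIdx which now also carries an atomicity requirement (MO_ATOM_*). A minimal call-site sketch, assuming env, addr, mmu_idx and ra are already in scope (placeholder names, not taken from the patch):

/* An 8-byte little-endian load that must be presented atomically,
 * provided the access does not cross a 16-byte boundary. */
MemOpIdx oi = make_memop_idx(MO_LEUQ | MO_ATOM_WITHIN16, mmu_idx);
uint64_t val = helper_ldq_mmu(env, addr, oi, ra);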
diff --git a/accel/tcg/atomic_common.c.inc b/accel/tcg/atomic_common.c.inc index 8f2ce43ee6..fe0eea018f 100644 --- a/accel/tcg/atomic_common.c.inc +++ b/accel/tcg/atomic_common.c.inc @@ -13,20 +13,20 @@ * See the COPYING file in the top-level directory. */ -static void atomic_trace_rmw_post(CPUArchState *env, target_ulong addr, +static void atomic_trace_rmw_post(CPUArchState *env, uint64_t addr, MemOpIdx oi) { qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_RW); } #if HAVE_ATOMIC128 -static void atomic_trace_ld_post(CPUArchState *env, target_ulong addr, +static void atomic_trace_ld_post(CPUArchState *env, uint64_t addr, MemOpIdx oi) { qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R); } -static void atomic_trace_st_post(CPUArchState *env, target_ulong addr, +static void atomic_trace_st_post(CPUArchState *env, uint64_t addr, MemOpIdx oi) { qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W); @@ -40,7 +40,7 @@ static void atomic_trace_st_post(CPUArchState *env, target_ulong addr, */ #define CMPXCHG_HELPER(OP, TYPE) \ - TYPE HELPER(atomic_##OP)(CPUArchState *env, target_ulong addr, \ + TYPE HELPER(atomic_##OP)(CPUArchState *env, uint64_t addr, \ TYPE oldv, TYPE newv, uint32_t oi) \ { return cpu_atomic_##OP##_mmu(env, addr, oldv, newv, oi, GETPC()); } @@ -62,7 +62,7 @@ CMPXCHG_HELPER(cmpxchgo_le, Int128) #undef CMPXCHG_HELPER -Int128 HELPER(nonatomic_cmpxchgo_be)(CPUArchState *env, target_ulong addr, +Int128 HELPER(nonatomic_cmpxchgo_be)(CPUArchState *env, uint64_t addr, Int128 cmpv, Int128 newv, uint32_t oi) { #if TCG_TARGET_REG_BITS == 32 @@ -82,7 +82,7 @@ Int128 HELPER(nonatomic_cmpxchgo_be)(CPUArchState *env, target_ulong addr, #endif } -Int128 HELPER(nonatomic_cmpxchgo_le)(CPUArchState *env, target_ulong addr, +Int128 HELPER(nonatomic_cmpxchgo_le)(CPUArchState *env, uint64_t addr, Int128 cmpv, Int128 newv, uint32_t oi) { #if TCG_TARGET_REG_BITS == 32 @@ -103,7 +103,7 @@ Int128 HELPER(nonatomic_cmpxchgo_le)(CPUArchState *env, target_ulong addr, } #define ATOMIC_HELPER(OP, TYPE) \ - TYPE HELPER(glue(atomic_,OP))(CPUArchState *env, target_ulong addr, \ + TYPE HELPER(glue(atomic_,OP))(CPUArchState *env, uint64_t addr, \ TYPE val, uint32_t oi) \ { return glue(glue(cpu_atomic_,OP),_mmu)(env, addr, val, oi, GETPC()); } diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c index 617777055a..ae0fbcdee2 100644 --- a/accel/tcg/cputlb.c +++ b/accel/tcg/cputlb.c @@ -40,6 +40,7 @@ #include "qemu/plugin-memory.h" #endif #include "tcg/tcg-ldst.h" +#include "exec/helper-proto.h" /* DEBUG defines, enable DEBUG_TLB_LOG to log to the CPU_LOG_MMU target */ /* #define DEBUG_TLB */ @@ -1668,6 +1669,9 @@ tb_page_addr_t get_page_addr_code_hostp(CPUArchState *env, target_ulong addr, return qemu_ram_addr_from_host_nofail(p); } +/* Load/store with atomicity primitives. */ +#include "ldst_atomicity.c.inc" + #ifdef CONFIG_PLUGIN /* * Perform a TLB lookup and populate the qemu_plugin_hwaddr structure. @@ -2010,60 +2014,13 @@ static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr, } /* - * Verify that we have passed the correct MemOp to the correct function. - * - * In the case of the helper_*_mmu functions, we will have done this by - * using the MemOp to look up the helper during code generation. - * - * In the case of the cpu_*_mmu functions, this is up to the caller. - * We could present one function to target code, and dispatch based on - * the MemOp, but so far we have worked hard to avoid an indirect function - * call along the memory path. 
- */ -static void validate_memop(MemOpIdx oi, MemOp expected) -{ -#ifdef CONFIG_DEBUG_TCG - MemOp have = get_memop(oi) & (MO_SIZE | MO_BSWAP); - assert(have == expected); -#endif -} - -/* * Load Helpers * * We support two different access types. SOFTMMU_CODE_ACCESS is * specifically for reading instructions from system memory. It is * called by the translation loop and in some helpers where the code * is disassembled. It shouldn't be called directly by guest code. - */ - -typedef uint64_t FullLoadHelper(CPUArchState *env, target_ulong addr, - MemOpIdx oi, uintptr_t retaddr); - -static inline uint64_t QEMU_ALWAYS_INLINE -load_memop(const void *haddr, MemOp op) -{ - switch (op) { - case MO_UB: - return ldub_p(haddr); - case MO_BEUW: - return lduw_be_p(haddr); - case MO_LEUW: - return lduw_le_p(haddr); - case MO_BEUL: - return (uint32_t)ldl_be_p(haddr); - case MO_LEUL: - return (uint32_t)ldl_le_p(haddr); - case MO_BEUQ: - return ldq_be_p(haddr); - case MO_LEUQ: - return ldq_le_p(haddr); - default: - qemu_build_not_reached(); - } -} - -/* + * * For the benefit of TCG generated code, we want to avoid the * complication of ABI-specific return type promotion and always * return a value extended to the register size of the host. This is @@ -2119,20 +2076,224 @@ static uint64_t do_ld_bytes_beN(MMULookupPageData *p, uint64_t ret_be) return ret_be; } +/** + * do_ld_parts_beN + * @p: translation parameters + * @ret_be: accumulated data + * + * As do_ld_bytes_beN, but atomically on each aligned part. + */ +static uint64_t do_ld_parts_beN(MMULookupPageData *p, uint64_t ret_be) +{ + void *haddr = p->haddr; + int size = p->size; + + do { + uint64_t x; + int n; + + /* + * Find minimum of alignment and size. + * This is slightly stronger than required by MO_ATOM_SUBALIGN, which + * would have only checked the low bits of addr|size once at the start, + * but is just as easy. + */ + switch (((uintptr_t)haddr | size) & 7) { + case 4: + x = cpu_to_be32(load_atomic4(haddr)); + ret_be = (ret_be << 32) | x; + n = 4; + break; + case 2: + case 6: + x = cpu_to_be16(load_atomic2(haddr)); + ret_be = (ret_be << 16) | x; + n = 2; + break; + default: + x = *(uint8_t *)haddr; + ret_be = (ret_be << 8) | x; + n = 1; + break; + case 0: + g_assert_not_reached(); + } + haddr += n; + size -= n; + } while (size != 0); + return ret_be; +} + +/** + * do_ld_parts_be4 + * @p: translation parameters + * @ret_be: accumulated data + * + * As do_ld_bytes_beN, but with one atomic load. + * Four aligned bytes are guaranteed to cover the load. + */ +static uint64_t do_ld_whole_be4(MMULookupPageData *p, uint64_t ret_be) +{ + int o = p->addr & 3; + uint32_t x = load_atomic4(p->haddr - o); + + x = cpu_to_be32(x); + x <<= o * 8; + x >>= (4 - p->size) * 8; + return (ret_be << (p->size * 8)) | x; +} + +/** + * do_ld_parts_be8 + * @p: translation parameters + * @ret_be: accumulated data + * + * As do_ld_bytes_beN, but with one atomic load. + * Eight aligned bytes are guaranteed to cover the load. + */ +static uint64_t do_ld_whole_be8(CPUArchState *env, uintptr_t ra, + MMULookupPageData *p, uint64_t ret_be) +{ + int o = p->addr & 7; + uint64_t x = load_atomic8_or_exit(env, ra, p->haddr - o); + + x = cpu_to_be64(x); + x <<= o * 8; + x >>= (8 - p->size) * 8; + return (ret_be << (p->size * 8)) | x; +} + +/** + * do_ld_parts_be16 + * @p: translation parameters + * @ret_be: accumulated data + * + * As do_ld_bytes_beN, but with one atomic load. + * 16 aligned bytes are guaranteed to cover the load. 
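+ *
+ * For example, with p->addr % 16 == 12 and p->size == 4, the single
+ * atomic 16-byte load at p->haddr - 12 covers the four requested bytes:
+ * shifting left by 12 * 8 bits discards the leading bytes, and shifting
+ * right by (16 - 4) * 8 bits leaves exactly those four bytes in the low
+ * part of the result.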
+ */ +static Int128 do_ld_whole_be16(CPUArchState *env, uintptr_t ra, + MMULookupPageData *p, uint64_t ret_be) +{ + int o = p->addr & 15; + Int128 x, y = load_atomic16_or_exit(env, ra, p->haddr - o); + int size = p->size; + + if (!HOST_BIG_ENDIAN) { + y = bswap128(y); + } + y = int128_lshift(y, o * 8); + y = int128_urshift(y, (16 - size) * 8); + x = int128_make64(ret_be); + x = int128_lshift(x, size * 8); + return int128_or(x, y); +} + /* * Wrapper for the above. */ static uint64_t do_ld_beN(CPUArchState *env, MMULookupPageData *p, - uint64_t ret_be, int mmu_idx, - MMUAccessType type, uintptr_t ra) + uint64_t ret_be, int mmu_idx, MMUAccessType type, + MemOp mop, uintptr_t ra) { + MemOp atom; + unsigned tmp, half_size; + if (unlikely(p->flags & TLB_MMIO)) { return do_ld_mmio_beN(env, p, ret_be, mmu_idx, type, ra); - } else { + } + + /* + * It is a given that we cross a page and therefore there is no + * atomicity for the load as a whole, but subobjects may need attention. + */ + atom = mop & MO_ATOM_MASK; + switch (atom) { + case MO_ATOM_SUBALIGN: + return do_ld_parts_beN(p, ret_be); + + case MO_ATOM_IFALIGN_PAIR: + case MO_ATOM_WITHIN16_PAIR: + tmp = mop & MO_SIZE; + tmp = tmp ? tmp - 1 : 0; + half_size = 1 << tmp; + if (atom == MO_ATOM_IFALIGN_PAIR + ? p->size == half_size + : p->size >= half_size) { + if (!HAVE_al8_fast && p->size < 4) { + return do_ld_whole_be4(p, ret_be); + } else { + return do_ld_whole_be8(env, ra, p, ret_be); + } + } + /* fall through */ + + case MO_ATOM_IFALIGN: + case MO_ATOM_WITHIN16: + case MO_ATOM_NONE: return do_ld_bytes_beN(p, ret_be); + + default: + g_assert_not_reached(); } } +/* + * Wrapper for the above, for 8 < size < 16. + */ +static Int128 do_ld16_beN(CPUArchState *env, MMULookupPageData *p, + uint64_t a, int mmu_idx, MemOp mop, uintptr_t ra) +{ + int size = p->size; + uint64_t b; + MemOp atom; + + if (unlikely(p->flags & TLB_MMIO)) { + p->size = size - 8; + a = do_ld_mmio_beN(env, p, a, mmu_idx, MMU_DATA_LOAD, ra); + p->addr += p->size; + p->size = 8; + b = do_ld_mmio_beN(env, p, 0, mmu_idx, MMU_DATA_LOAD, ra); + return int128_make128(b, a); + } + + /* + * It is a given that we cross a page and therefore there is no + * atomicity for the load as a whole, but subobjects may need attention. + */ + atom = mop & MO_ATOM_MASK; + switch (atom) { + case MO_ATOM_SUBALIGN: + p->size = size - 8; + a = do_ld_parts_beN(p, a); + p->haddr += size - 8; + p->size = 8; + b = do_ld_parts_beN(p, 0); + break; + + case MO_ATOM_WITHIN16_PAIR: + /* Since size > 8, this is the half that must be atomic. */ + return do_ld_whole_be16(env, ra, p, a); + + case MO_ATOM_IFALIGN_PAIR: + /* + * Since size > 8, both halves are misaligned, + * and so neither is atomic. + */ + case MO_ATOM_IFALIGN: + case MO_ATOM_WITHIN16: + case MO_ATOM_NONE: + p->size = size - 8; + a = do_ld_bytes_beN(p, a); + b = ldq_be_p(p->haddr + size - 8); + break; + + default: + g_assert_not_reached(); + } + + return int128_make128(b, a); +} + static uint8_t do_ld_1(CPUArchState *env, MMULookupPageData *p, int mmu_idx, MMUAccessType type, uintptr_t ra) { @@ -2153,7 +2314,7 @@ static uint16_t do_ld_2(CPUArchState *env, MMULookupPageData *p, int mmu_idx, } /* Perform the load host endian, then swap if necessary. */ - ret = load_memop(p->haddr, MO_UW); + ret = load_atom_2(env, ra, p->haddr, memop); if (memop & MO_BSWAP) { ret = bswap16(ret); } @@ -2170,7 +2331,7 @@ static uint32_t do_ld_4(CPUArchState *env, MMULookupPageData *p, int mmu_idx, } /* Perform the load host endian. 
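 * For example, guest bytes 12 34 56 78 read as big-endian MO_BEUL on a
 * little-endian host: load_atom_4() returns 0x78563412 and the bswap32()
 * below restores 0x12345678.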
*/ - ret = load_memop(p->haddr, MO_UL); + ret = load_atom_4(env, ra, p->haddr, memop); if (memop & MO_BSWAP) { ret = bswap32(ret); } @@ -2187,7 +2348,7 @@ static uint64_t do_ld_8(CPUArchState *env, MMULookupPageData *p, int mmu_idx, } /* Perform the load host endian. */ - ret = load_memop(p->haddr, MO_UQ); + ret = load_atom_8(env, ra, p->haddr, memop); if (memop & MO_BSWAP) { ret = bswap64(ret); } @@ -2206,10 +2367,10 @@ static uint8_t do_ld1_mmu(CPUArchState *env, target_ulong addr, MemOpIdx oi, return do_ld_1(env, &l.page[0], l.mmu_idx, access_type, ra); } -tcg_target_ulong helper_ret_ldub_mmu(CPUArchState *env, target_ulong addr, - MemOpIdx oi, uintptr_t retaddr) +tcg_target_ulong helper_ldub_mmu(CPUArchState *env, uint64_t addr, + MemOpIdx oi, uintptr_t retaddr) { - validate_memop(oi, MO_UB); + tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_8); return do_ld1_mmu(env, addr, oi, retaddr, MMU_DATA_LOAD); } @@ -2237,17 +2398,10 @@ static uint16_t do_ld2_mmu(CPUArchState *env, target_ulong addr, MemOpIdx oi, return ret; } -tcg_target_ulong helper_le_lduw_mmu(CPUArchState *env, target_ulong addr, - MemOpIdx oi, uintptr_t retaddr) +tcg_target_ulong helper_lduw_mmu(CPUArchState *env, uint64_t addr, + MemOpIdx oi, uintptr_t retaddr) { - validate_memop(oi, MO_LEUW); - return do_ld2_mmu(env, addr, oi, retaddr, MMU_DATA_LOAD); -} - -tcg_target_ulong helper_be_lduw_mmu(CPUArchState *env, target_ulong addr, - MemOpIdx oi, uintptr_t retaddr) -{ - validate_memop(oi, MO_BEUW); + tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_16); return do_ld2_mmu(env, addr, oi, retaddr, MMU_DATA_LOAD); } @@ -2263,25 +2417,18 @@ static uint32_t do_ld4_mmu(CPUArchState *env, target_ulong addr, MemOpIdx oi, return do_ld_4(env, &l.page[0], l.mmu_idx, access_type, l.memop, ra); } - ret = do_ld_beN(env, &l.page[0], 0, l.mmu_idx, access_type, ra); - ret = do_ld_beN(env, &l.page[1], ret, l.mmu_idx, access_type, ra); + ret = do_ld_beN(env, &l.page[0], 0, l.mmu_idx, access_type, l.memop, ra); + ret = do_ld_beN(env, &l.page[1], ret, l.mmu_idx, access_type, l.memop, ra); if ((l.memop & MO_BSWAP) == MO_LE) { ret = bswap32(ret); } return ret; } -tcg_target_ulong helper_le_ldul_mmu(CPUArchState *env, target_ulong addr, - MemOpIdx oi, uintptr_t retaddr) +tcg_target_ulong helper_ldul_mmu(CPUArchState *env, uint64_t addr, + MemOpIdx oi, uintptr_t retaddr) { - validate_memop(oi, MO_LEUL); - return do_ld4_mmu(env, addr, oi, retaddr, MMU_DATA_LOAD); -} - -tcg_target_ulong helper_be_ldul_mmu(CPUArchState *env, target_ulong addr, - MemOpIdx oi, uintptr_t retaddr) -{ - validate_memop(oi, MO_BEUL); + tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_32); return do_ld4_mmu(env, addr, oi, retaddr, MMU_DATA_LOAD); } @@ -2297,25 +2444,18 @@ static uint64_t do_ld8_mmu(CPUArchState *env, target_ulong addr, MemOpIdx oi, return do_ld_8(env, &l.page[0], l.mmu_idx, access_type, l.memop, ra); } - ret = do_ld_beN(env, &l.page[0], 0, l.mmu_idx, access_type, ra); - ret = do_ld_beN(env, &l.page[1], ret, l.mmu_idx, access_type, ra); + ret = do_ld_beN(env, &l.page[0], 0, l.mmu_idx, access_type, l.memop, ra); + ret = do_ld_beN(env, &l.page[1], ret, l.mmu_idx, access_type, l.memop, ra); if ((l.memop & MO_BSWAP) == MO_LE) { ret = bswap64(ret); } return ret; } -uint64_t helper_le_ldq_mmu(CPUArchState *env, target_ulong addr, - MemOpIdx oi, uintptr_t retaddr) -{ - validate_memop(oi, MO_LEUQ); - return do_ld8_mmu(env, addr, oi, retaddr, MMU_DATA_LOAD); -} - -uint64_t helper_be_ldq_mmu(CPUArchState *env, target_ulong addr, - MemOpIdx oi, uintptr_t retaddr) 
+uint64_t helper_ldq_mmu(CPUArchState *env, uint64_t addr, + MemOpIdx oi, uintptr_t retaddr) { - validate_memop(oi, MO_BEUQ); + tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_64); return do_ld8_mmu(env, addr, oi, retaddr, MMU_DATA_LOAD); } @@ -2324,35 +2464,96 @@ uint64_t helper_be_ldq_mmu(CPUArchState *env, target_ulong addr, * avoid this for 64-bit data, or for 32-bit data on 32-bit host. */ +tcg_target_ulong helper_ldsb_mmu(CPUArchState *env, uint64_t addr, + MemOpIdx oi, uintptr_t retaddr) +{ + return (int8_t)helper_ldub_mmu(env, addr, oi, retaddr); +} -tcg_target_ulong helper_ret_ldsb_mmu(CPUArchState *env, target_ulong addr, - MemOpIdx oi, uintptr_t retaddr) +tcg_target_ulong helper_ldsw_mmu(CPUArchState *env, uint64_t addr, + MemOpIdx oi, uintptr_t retaddr) { - return (int8_t)helper_ret_ldub_mmu(env, addr, oi, retaddr); + return (int16_t)helper_lduw_mmu(env, addr, oi, retaddr); } -tcg_target_ulong helper_le_ldsw_mmu(CPUArchState *env, target_ulong addr, - MemOpIdx oi, uintptr_t retaddr) +tcg_target_ulong helper_ldsl_mmu(CPUArchState *env, uint64_t addr, + MemOpIdx oi, uintptr_t retaddr) { - return (int16_t)helper_le_lduw_mmu(env, addr, oi, retaddr); + return (int32_t)helper_ldul_mmu(env, addr, oi, retaddr); } -tcg_target_ulong helper_be_ldsw_mmu(CPUArchState *env, target_ulong addr, - MemOpIdx oi, uintptr_t retaddr) +static Int128 do_ld16_mmu(CPUArchState *env, target_ulong addr, + MemOpIdx oi, uintptr_t ra) { - return (int16_t)helper_be_lduw_mmu(env, addr, oi, retaddr); + MMULookupLocals l; + bool crosspage; + uint64_t a, b; + Int128 ret; + int first; + + crosspage = mmu_lookup(env, addr, oi, ra, MMU_DATA_LOAD, &l); + if (likely(!crosspage)) { + /* Perform the load host endian. */ + if (unlikely(l.page[0].flags & TLB_MMIO)) { + QEMU_IOTHREAD_LOCK_GUARD(); + a = io_readx(env, l.page[0].full, l.mmu_idx, addr, + ra, MMU_DATA_LOAD, MO_64); + b = io_readx(env, l.page[0].full, l.mmu_idx, addr + 8, + ra, MMU_DATA_LOAD, MO_64); + ret = int128_make128(HOST_BIG_ENDIAN ? b : a, + HOST_BIG_ENDIAN ? 
a : b); + } else { + ret = load_atom_16(env, ra, l.page[0].haddr, l.memop); + } + if (l.memop & MO_BSWAP) { + ret = bswap128(ret); + } + return ret; + } + + first = l.page[0].size; + if (first == 8) { + MemOp mop8 = (l.memop & ~MO_SIZE) | MO_64; + + a = do_ld_8(env, &l.page[0], l.mmu_idx, MMU_DATA_LOAD, mop8, ra); + b = do_ld_8(env, &l.page[1], l.mmu_idx, MMU_DATA_LOAD, mop8, ra); + if ((mop8 & MO_BSWAP) == MO_LE) { + ret = int128_make128(a, b); + } else { + ret = int128_make128(b, a); + } + return ret; + } + + if (first < 8) { + a = do_ld_beN(env, &l.page[0], 0, l.mmu_idx, + MMU_DATA_LOAD, l.memop, ra); + ret = do_ld16_beN(env, &l.page[1], a, l.mmu_idx, l.memop, ra); + } else { + ret = do_ld16_beN(env, &l.page[0], 0, l.mmu_idx, l.memop, ra); + b = int128_getlo(ret); + ret = int128_lshift(ret, l.page[1].size * 8); + a = int128_gethi(ret); + b = do_ld_beN(env, &l.page[1], b, l.mmu_idx, + MMU_DATA_LOAD, l.memop, ra); + ret = int128_make128(b, a); + } + if ((l.memop & MO_BSWAP) == MO_LE) { + ret = bswap128(ret); + } + return ret; } -tcg_target_ulong helper_le_ldsl_mmu(CPUArchState *env, target_ulong addr, - MemOpIdx oi, uintptr_t retaddr) +Int128 helper_ld16_mmu(CPUArchState *env, uint64_t addr, + uint32_t oi, uintptr_t retaddr) { - return (int32_t)helper_le_ldul_mmu(env, addr, oi, retaddr); + tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_128); + return do_ld16_mmu(env, addr, oi, retaddr); } -tcg_target_ulong helper_be_ldsl_mmu(CPUArchState *env, target_ulong addr, - MemOpIdx oi, uintptr_t retaddr) +Int128 helper_ld_i128(CPUArchState *env, uint64_t addr, uint32_t oi) { - return (int32_t)helper_be_ldul_mmu(env, addr, oi, retaddr); + return helper_ld16_mmu(env, addr, oi, GETPC()); } /* @@ -2368,7 +2569,7 @@ uint8_t cpu_ldb_mmu(CPUArchState *env, abi_ptr addr, MemOpIdx oi, uintptr_t ra) { uint8_t ret; - validate_memop(oi, MO_UB); + tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_UB); ret = do_ld1_mmu(env, addr, oi, ra, MMU_DATA_LOAD); plugin_load_cb(env, addr, oi); return ret; @@ -2379,7 +2580,7 @@ uint16_t cpu_ldw_be_mmu(CPUArchState *env, abi_ptr addr, { uint16_t ret; - validate_memop(oi, MO_BEUW); + tcg_debug_assert((get_memop(oi) & (MO_BSWAP | MO_SIZE)) == MO_BEUW); ret = do_ld2_mmu(env, addr, oi, ra, MMU_DATA_LOAD); plugin_load_cb(env, addr, oi); return ret; @@ -2390,7 +2591,7 @@ uint32_t cpu_ldl_be_mmu(CPUArchState *env, abi_ptr addr, { uint32_t ret; - validate_memop(oi, MO_BEUL); + tcg_debug_assert((get_memop(oi) & (MO_BSWAP | MO_SIZE)) == MO_BEUL); ret = do_ld4_mmu(env, addr, oi, ra, MMU_DATA_LOAD); plugin_load_cb(env, addr, oi); return ret; @@ -2401,7 +2602,7 @@ uint64_t cpu_ldq_be_mmu(CPUArchState *env, abi_ptr addr, { uint64_t ret; - validate_memop(oi, MO_BEUQ); + tcg_debug_assert((get_memop(oi) & (MO_BSWAP | MO_SIZE)) == MO_BEUQ); ret = do_ld8_mmu(env, addr, oi, ra, MMU_DATA_LOAD); plugin_load_cb(env, addr, oi); return ret; @@ -2412,7 +2613,7 @@ uint16_t cpu_ldw_le_mmu(CPUArchState *env, abi_ptr addr, { uint16_t ret; - validate_memop(oi, MO_LEUW); + tcg_debug_assert((get_memop(oi) & (MO_BSWAP | MO_SIZE)) == MO_LEUW); ret = do_ld2_mmu(env, addr, oi, ra, MMU_DATA_LOAD); plugin_load_cb(env, addr, oi); return ret; @@ -2423,7 +2624,7 @@ uint32_t cpu_ldl_le_mmu(CPUArchState *env, abi_ptr addr, { uint32_t ret; - validate_memop(oi, MO_LEUL); + tcg_debug_assert((get_memop(oi) & (MO_BSWAP | MO_SIZE)) == MO_LEUL); ret = do_ld4_mmu(env, addr, oi, ra, MMU_DATA_LOAD); plugin_load_cb(env, addr, oi); return ret; @@ -2434,7 +2635,7 @@ uint64_t cpu_ldq_le_mmu(CPUArchState *env, abi_ptr addr, { 
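/*
 * The cpu_*_mmu wrappers here perform the access exactly as the TCG
 * helpers above do, then raise the plugin callback.  Hedged usage
 * sketch (addr, mmu_idx and ra are placeholder names):
 *
 *     MemOpIdx oi = make_memop_idx(MO_UB, mmu_idx);
 *     uint8_t byte = cpu_ldb_mmu(env, addr, oi, ra);
 */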
uint64_t ret; - validate_memop(oi, MO_LEUQ); + tcg_debug_assert((get_memop(oi) & (MO_BSWAP | MO_SIZE)) == MO_LEUQ); ret = do_ld8_mmu(env, addr, oi, ra, MMU_DATA_LOAD); plugin_load_cb(env, addr, oi); return ret; @@ -2443,95 +2644,29 @@ uint64_t cpu_ldq_le_mmu(CPUArchState *env, abi_ptr addr, Int128 cpu_ld16_be_mmu(CPUArchState *env, abi_ptr addr, MemOpIdx oi, uintptr_t ra) { - MemOp mop = get_memop(oi); - int mmu_idx = get_mmuidx(oi); - MemOpIdx new_oi; - unsigned a_bits; - uint64_t h, l; - - tcg_debug_assert((mop & (MO_BSWAP|MO_SSIZE)) == (MO_BE|MO_128)); - a_bits = get_alignment_bits(mop); - - /* Handle CPU specific unaligned behaviour */ - if (addr & ((1 << a_bits) - 1)) { - cpu_unaligned_access(env_cpu(env), addr, MMU_DATA_LOAD, - mmu_idx, ra); - } + Int128 ret; - /* Construct an unaligned 64-bit replacement MemOpIdx. */ - mop = (mop & ~(MO_SIZE | MO_AMASK)) | MO_64 | MO_UNALN; - new_oi = make_memop_idx(mop, mmu_idx); - - h = helper_be_ldq_mmu(env, addr, new_oi, ra); - l = helper_be_ldq_mmu(env, addr + 8, new_oi, ra); - - qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R); - return int128_make128(l, h); + tcg_debug_assert((get_memop(oi) & (MO_BSWAP|MO_SIZE)) == (MO_BE|MO_128)); + ret = do_ld16_mmu(env, addr, oi, ra); + plugin_load_cb(env, addr, oi); + return ret; } Int128 cpu_ld16_le_mmu(CPUArchState *env, abi_ptr addr, MemOpIdx oi, uintptr_t ra) { - MemOp mop = get_memop(oi); - int mmu_idx = get_mmuidx(oi); - MemOpIdx new_oi; - unsigned a_bits; - uint64_t h, l; - - tcg_debug_assert((mop & (MO_BSWAP|MO_SSIZE)) == (MO_LE|MO_128)); - a_bits = get_alignment_bits(mop); + Int128 ret; - /* Handle CPU specific unaligned behaviour */ - if (addr & ((1 << a_bits) - 1)) { - cpu_unaligned_access(env_cpu(env), addr, MMU_DATA_LOAD, - mmu_idx, ra); - } - - /* Construct an unaligned 64-bit replacement MemOpIdx. */ - mop = (mop & ~(MO_SIZE | MO_AMASK)) | MO_64 | MO_UNALN; - new_oi = make_memop_idx(mop, mmu_idx); - - l = helper_le_ldq_mmu(env, addr, new_oi, ra); - h = helper_le_ldq_mmu(env, addr + 8, new_oi, ra); - - qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R); - return int128_make128(l, h); + tcg_debug_assert((get_memop(oi) & (MO_BSWAP|MO_SIZE)) == (MO_LE|MO_128)); + ret = do_ld16_mmu(env, addr, oi, ra); + plugin_load_cb(env, addr, oi); + return ret; } /* * Store Helpers */ -static inline void QEMU_ALWAYS_INLINE -store_memop(void *haddr, uint64_t val, MemOp op) -{ - switch (op) { - case MO_UB: - stb_p(haddr, val); - break; - case MO_BEUW: - stw_be_p(haddr, val); - break; - case MO_LEUW: - stw_le_p(haddr, val); - break; - case MO_BEUL: - stl_be_p(haddr, val); - break; - case MO_LEUL: - stl_le_p(haddr, val); - break; - case MO_BEUQ: - stq_be_p(haddr, val); - break; - case MO_LEUQ: - stq_le_p(haddr, val); - break; - default: - qemu_build_not_reached(); - } -} - /** * do_st_mmio_leN: * @env: cpu context @@ -2558,38 +2693,110 @@ static uint64_t do_st_mmio_leN(CPUArchState *env, MMULookupPageData *p, return val_le; } -/** - * do_st_bytes_leN: - * @p: translation parameters - * @val_le: data to store - * - * Store @p->size bytes at @p->haddr, which is RAM. - * The bytes to store are extracted in little-endian order from @val_le; - * return the bytes of @val_le beyond @p->size that have not been stored. +/* + * Wrapper for the above. 
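+ * The wrapper now receives the MemOp so that a page-crossing store can
+ * honor MO_ATOM_* subobject atomicity, mirroring do_ld_beN. For example,
+ * a 4-byte MO_ATOM_IFALIGN_PAIR store split 2/2 across the page boundary
+ * has half_size == 2 == p->size, so the two bytes on this page go out as
+ * one atomic read-modify-write (store_whole_le4 or store_whole_le8).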
*/ -static uint64_t do_st_bytes_leN(MMULookupPageData *p, uint64_t val_le) +static uint64_t do_st_leN(CPUArchState *env, MMULookupPageData *p, + uint64_t val_le, int mmu_idx, + MemOp mop, uintptr_t ra) { - uint8_t *haddr = p->haddr; - int i, size = p->size; + MemOp atom; + unsigned tmp, half_size; - for (i = 0; i < size; i++, val_le >>= 8) { - haddr[i] = val_le; + if (unlikely(p->flags & TLB_MMIO)) { + return do_st_mmio_leN(env, p, val_le, mmu_idx, ra); + } else if (unlikely(p->flags & TLB_DISCARD_WRITE)) { + return val_le >> (p->size * 8); + } + + /* + * It is a given that we cross a page and therefore there is no atomicity + * for the store as a whole, but subobjects may need attention. + */ + atom = mop & MO_ATOM_MASK; + switch (atom) { + case MO_ATOM_SUBALIGN: + return store_parts_leN(p->haddr, p->size, val_le); + + case MO_ATOM_IFALIGN_PAIR: + case MO_ATOM_WITHIN16_PAIR: + tmp = mop & MO_SIZE; + tmp = tmp ? tmp - 1 : 0; + half_size = 1 << tmp; + if (atom == MO_ATOM_IFALIGN_PAIR + ? p->size == half_size + : p->size >= half_size) { + if (!HAVE_al8_fast && p->size <= 4) { + return store_whole_le4(p->haddr, p->size, val_le); + } else if (HAVE_al8) { + return store_whole_le8(p->haddr, p->size, val_le); + } else { + cpu_loop_exit_atomic(env_cpu(env), ra); + } + } + /* fall through */ + + case MO_ATOM_IFALIGN: + case MO_ATOM_WITHIN16: + case MO_ATOM_NONE: + return store_bytes_leN(p->haddr, p->size, val_le); + + default: + g_assert_not_reached(); } - return val_le; } /* - * Wrapper for the above. + * Wrapper for the above, for 8 < size < 16. */ -static uint64_t do_st_leN(CPUArchState *env, MMULookupPageData *p, - uint64_t val_le, int mmu_idx, uintptr_t ra) +static uint64_t do_st16_leN(CPUArchState *env, MMULookupPageData *p, + Int128 val_le, int mmu_idx, + MemOp mop, uintptr_t ra) { + int size = p->size; + MemOp atom; + if (unlikely(p->flags & TLB_MMIO)) { - return do_st_mmio_leN(env, p, val_le, mmu_idx, ra); + p->size = 8; + do_st_mmio_leN(env, p, int128_getlo(val_le), mmu_idx, ra); + p->size = size - 8; + p->addr += 8; + return do_st_mmio_leN(env, p, int128_gethi(val_le), mmu_idx, ra); } else if (unlikely(p->flags & TLB_DISCARD_WRITE)) { - return val_le >> (p->size * 8); - } else { - return do_st_bytes_leN(p, val_le); + return int128_gethi(val_le) >> ((size - 8) * 8); + } + + /* + * It is a given that we cross a page and therefore there is no atomicity + * for the store as a whole, but subobjects may need attention. + */ + atom = mop & MO_ATOM_MASK; + switch (atom) { + case MO_ATOM_SUBALIGN: + store_parts_leN(p->haddr, 8, int128_getlo(val_le)); + return store_parts_leN(p->haddr + 8, p->size - 8, + int128_gethi(val_le)); + + case MO_ATOM_WITHIN16_PAIR: + /* Since size > 8, this is the half that must be atomic. */ + if (!HAVE_al16) { + cpu_loop_exit_atomic(env_cpu(env), ra); + } + return store_whole_le16(p->haddr, p->size, val_le); + + case MO_ATOM_IFALIGN_PAIR: + /* + * Since size > 8, both halves are misaligned, + * and so neither is atomic. 
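+ * Contrast MO_ATOM_WITHIN16_PAIR above: there the bytes on this page are
+ * the half that must be atomic, so e.g. 11 bytes on this page go out as a
+ * single 16-byte atomic insert (store_whole_le16, given host 16-byte
+ * atomics) and the remaining 5 bytes spill to the next page.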
+ */ + case MO_ATOM_IFALIGN: + case MO_ATOM_NONE: + stq_le_p(p->haddr, int128_getlo(val_le)); + return store_bytes_leN(p->haddr + 8, p->size - 8, + int128_gethi(val_le)); + + default: + g_assert_not_reached(); } } @@ -2617,7 +2824,7 @@ static void do_st_2(CPUArchState *env, MMULookupPageData *p, uint16_t val, if (memop & MO_BSWAP) { val = bswap16(val); } - store_memop(p->haddr, val, MO_UW); + store_atom_2(env, ra, p->haddr, memop, val); } } @@ -2633,7 +2840,7 @@ static void do_st_4(CPUArchState *env, MMULookupPageData *p, uint32_t val, if (memop & MO_BSWAP) { val = bswap32(val); } - store_memop(p->haddr, val, MO_UL); + store_atom_4(env, ra, p->haddr, memop, val); } } @@ -2649,17 +2856,17 @@ static void do_st_8(CPUArchState *env, MMULookupPageData *p, uint64_t val, if (memop & MO_BSWAP) { val = bswap64(val); } - store_memop(p->haddr, val, MO_UQ); + store_atom_8(env, ra, p->haddr, memop, val); } } -void helper_ret_stb_mmu(CPUArchState *env, target_ulong addr, uint32_t val, - MemOpIdx oi, uintptr_t ra) +void helper_stb_mmu(CPUArchState *env, uint64_t addr, uint32_t val, + MemOpIdx oi, uintptr_t ra) { MMULookupLocals l; bool crosspage; - validate_memop(oi, MO_UB); + tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_8); crosspage = mmu_lookup(env, addr, oi, ra, MMU_DATA_STORE, &l); tcg_debug_assert(!crosspage); @@ -2688,17 +2895,10 @@ static void do_st2_mmu(CPUArchState *env, target_ulong addr, uint16_t val, do_st_1(env, &l.page[1], b, l.mmu_idx, ra); } -void helper_le_stw_mmu(CPUArchState *env, target_ulong addr, uint32_t val, - MemOpIdx oi, uintptr_t retaddr) -{ - validate_memop(oi, MO_LEUW); - do_st2_mmu(env, addr, val, oi, retaddr); -} - -void helper_be_stw_mmu(CPUArchState *env, target_ulong addr, uint32_t val, - MemOpIdx oi, uintptr_t retaddr) +void helper_stw_mmu(CPUArchState *env, uint64_t addr, uint32_t val, + MemOpIdx oi, uintptr_t retaddr) { - validate_memop(oi, MO_BEUW); + tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_16); do_st2_mmu(env, addr, val, oi, retaddr); } @@ -2718,21 +2918,14 @@ static void do_st4_mmu(CPUArchState *env, target_ulong addr, uint32_t val, if ((l.memop & MO_BSWAP) != MO_LE) { val = bswap32(val); } - val = do_st_leN(env, &l.page[0], val, l.mmu_idx, ra); - (void) do_st_leN(env, &l.page[1], val, l.mmu_idx, ra); + val = do_st_leN(env, &l.page[0], val, l.mmu_idx, l.memop, ra); + (void) do_st_leN(env, &l.page[1], val, l.mmu_idx, l.memop, ra); } -void helper_le_stl_mmu(CPUArchState *env, target_ulong addr, uint32_t val, - MemOpIdx oi, uintptr_t retaddr) -{ - validate_memop(oi, MO_LEUL); - do_st4_mmu(env, addr, val, oi, retaddr); -} - -void helper_be_stl_mmu(CPUArchState *env, target_ulong addr, uint32_t val, - MemOpIdx oi, uintptr_t retaddr) +void helper_stl_mmu(CPUArchState *env, uint64_t addr, uint32_t val, + MemOpIdx oi, uintptr_t retaddr) { - validate_memop(oi, MO_BEUL); + tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_32); do_st4_mmu(env, addr, val, oi, retaddr); } @@ -2752,22 +2945,88 @@ static void do_st8_mmu(CPUArchState *env, target_ulong addr, uint64_t val, if ((l.memop & MO_BSWAP) != MO_LE) { val = bswap64(val); } - val = do_st_leN(env, &l.page[0], val, l.mmu_idx, ra); - (void) do_st_leN(env, &l.page[1], val, l.mmu_idx, ra); + val = do_st_leN(env, &l.page[0], val, l.mmu_idx, l.memop, ra); + (void) do_st_leN(env, &l.page[1], val, l.mmu_idx, l.memop, ra); } -void helper_le_stq_mmu(CPUArchState *env, target_ulong addr, uint64_t val, - MemOpIdx oi, uintptr_t retaddr) +void helper_stq_mmu(CPUArchState *env, uint64_t addr, uint64_t val, + MemOpIdx oi, 
uintptr_t retaddr) { - validate_memop(oi, MO_LEUQ); + tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_64); do_st8_mmu(env, addr, val, oi, retaddr); } -void helper_be_stq_mmu(CPUArchState *env, target_ulong addr, uint64_t val, - MemOpIdx oi, uintptr_t retaddr) +static void do_st16_mmu(CPUArchState *env, target_ulong addr, Int128 val, + MemOpIdx oi, uintptr_t ra) { - validate_memop(oi, MO_BEUQ); - do_st8_mmu(env, addr, val, oi, retaddr); + MMULookupLocals l; + bool crosspage; + uint64_t a, b; + int first; + + crosspage = mmu_lookup(env, addr, oi, ra, MMU_DATA_STORE, &l); + if (likely(!crosspage)) { + /* Swap to host endian if necessary, then store. */ + if (l.memop & MO_BSWAP) { + val = bswap128(val); + } + if (unlikely(l.page[0].flags & TLB_MMIO)) { + QEMU_IOTHREAD_LOCK_GUARD(); + if (HOST_BIG_ENDIAN) { + b = int128_getlo(val), a = int128_gethi(val); + } else { + a = int128_getlo(val), b = int128_gethi(val); + } + io_writex(env, l.page[0].full, l.mmu_idx, a, addr, ra, MO_64); + io_writex(env, l.page[0].full, l.mmu_idx, b, addr + 8, ra, MO_64); + } else if (unlikely(l.page[0].flags & TLB_DISCARD_WRITE)) { + /* nothing */ + } else { + store_atom_16(env, ra, l.page[0].haddr, l.memop, val); + } + return; + } + + first = l.page[0].size; + if (first == 8) { + MemOp mop8 = (l.memop & ~(MO_SIZE | MO_BSWAP)) | MO_64; + + if (l.memop & MO_BSWAP) { + val = bswap128(val); + } + if (HOST_BIG_ENDIAN) { + b = int128_getlo(val), a = int128_gethi(val); + } else { + a = int128_getlo(val), b = int128_gethi(val); + } + do_st_8(env, &l.page[0], a, l.mmu_idx, mop8, ra); + do_st_8(env, &l.page[1], b, l.mmu_idx, mop8, ra); + return; + } + + if ((l.memop & MO_BSWAP) != MO_LE) { + val = bswap128(val); + } + if (first < 8) { + do_st_leN(env, &l.page[0], int128_getlo(val), l.mmu_idx, l.memop, ra); + val = int128_urshift(val, first * 8); + do_st16_leN(env, &l.page[1], val, l.mmu_idx, l.memop, ra); + } else { + b = do_st16_leN(env, &l.page[0], val, l.mmu_idx, l.memop, ra); + do_st_leN(env, &l.page[1], b, l.mmu_idx, l.memop, ra); + } +} + +void helper_st16_mmu(CPUArchState *env, uint64_t addr, Int128 val, + MemOpIdx oi, uintptr_t retaddr) +{ + tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_128); + do_st16_mmu(env, addr, val, oi, retaddr); +} + +void helper_st_i128(CPUArchState *env, uint64_t addr, Int128 val, MemOpIdx oi) +{ + helper_st16_mmu(env, addr, val, oi, GETPC()); } /* @@ -2782,104 +3041,72 @@ static void plugin_store_cb(CPUArchState *env, abi_ptr addr, MemOpIdx oi) void cpu_stb_mmu(CPUArchState *env, target_ulong addr, uint8_t val, MemOpIdx oi, uintptr_t retaddr) { - helper_ret_stb_mmu(env, addr, val, oi, retaddr); + helper_stb_mmu(env, addr, val, oi, retaddr); plugin_store_cb(env, addr, oi); } void cpu_stw_be_mmu(CPUArchState *env, target_ulong addr, uint16_t val, MemOpIdx oi, uintptr_t retaddr) { - helper_be_stw_mmu(env, addr, val, oi, retaddr); + tcg_debug_assert((get_memop(oi) & (MO_BSWAP | MO_SIZE)) == MO_BEUW); + do_st2_mmu(env, addr, val, oi, retaddr); plugin_store_cb(env, addr, oi); } void cpu_stl_be_mmu(CPUArchState *env, target_ulong addr, uint32_t val, MemOpIdx oi, uintptr_t retaddr) { - helper_be_stl_mmu(env, addr, val, oi, retaddr); + tcg_debug_assert((get_memop(oi) & (MO_BSWAP | MO_SIZE)) == MO_BEUL); + do_st4_mmu(env, addr, val, oi, retaddr); plugin_store_cb(env, addr, oi); } void cpu_stq_be_mmu(CPUArchState *env, target_ulong addr, uint64_t val, MemOpIdx oi, uintptr_t retaddr) { - helper_be_stq_mmu(env, addr, val, oi, retaddr); + tcg_debug_assert((get_memop(oi) & (MO_BSWAP | MO_SIZE)) == 
MO_BEUQ); + do_st8_mmu(env, addr, val, oi, retaddr); plugin_store_cb(env, addr, oi); } void cpu_stw_le_mmu(CPUArchState *env, target_ulong addr, uint16_t val, MemOpIdx oi, uintptr_t retaddr) { - helper_le_stw_mmu(env, addr, val, oi, retaddr); + tcg_debug_assert((get_memop(oi) & (MO_BSWAP | MO_SIZE)) == MO_LEUW); + do_st2_mmu(env, addr, val, oi, retaddr); plugin_store_cb(env, addr, oi); } void cpu_stl_le_mmu(CPUArchState *env, target_ulong addr, uint32_t val, MemOpIdx oi, uintptr_t retaddr) { - helper_le_stl_mmu(env, addr, val, oi, retaddr); + tcg_debug_assert((get_memop(oi) & (MO_BSWAP | MO_SIZE)) == MO_LEUL); + do_st4_mmu(env, addr, val, oi, retaddr); plugin_store_cb(env, addr, oi); } void cpu_stq_le_mmu(CPUArchState *env, target_ulong addr, uint64_t val, MemOpIdx oi, uintptr_t retaddr) { - helper_le_stq_mmu(env, addr, val, oi, retaddr); + tcg_debug_assert((get_memop(oi) & (MO_BSWAP | MO_SIZE)) == MO_LEUQ); + do_st8_mmu(env, addr, val, oi, retaddr); plugin_store_cb(env, addr, oi); } -void cpu_st16_be_mmu(CPUArchState *env, abi_ptr addr, Int128 val, - MemOpIdx oi, uintptr_t ra) +void cpu_st16_be_mmu(CPUArchState *env, target_ulong addr, Int128 val, + MemOpIdx oi, uintptr_t retaddr) { - MemOp mop = get_memop(oi); - int mmu_idx = get_mmuidx(oi); - MemOpIdx new_oi; - unsigned a_bits; - - tcg_debug_assert((mop & (MO_BSWAP|MO_SSIZE)) == (MO_BE|MO_128)); - a_bits = get_alignment_bits(mop); - - /* Handle CPU specific unaligned behaviour */ - if (addr & ((1 << a_bits) - 1)) { - cpu_unaligned_access(env_cpu(env), addr, MMU_DATA_STORE, - mmu_idx, ra); - } - - /* Construct an unaligned 64-bit replacement MemOpIdx. */ - mop = (mop & ~(MO_SIZE | MO_AMASK)) | MO_64 | MO_UNALN; - new_oi = make_memop_idx(mop, mmu_idx); - - helper_be_stq_mmu(env, addr, int128_gethi(val), new_oi, ra); - helper_be_stq_mmu(env, addr + 8, int128_getlo(val), new_oi, ra); - - qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W); + tcg_debug_assert((get_memop(oi) & (MO_BSWAP|MO_SIZE)) == (MO_BE|MO_128)); + do_st16_mmu(env, addr, val, oi, retaddr); + plugin_store_cb(env, addr, oi); } -void cpu_st16_le_mmu(CPUArchState *env, abi_ptr addr, Int128 val, - MemOpIdx oi, uintptr_t ra) +void cpu_st16_le_mmu(CPUArchState *env, target_ulong addr, Int128 val, + MemOpIdx oi, uintptr_t retaddr) { - MemOp mop = get_memop(oi); - int mmu_idx = get_mmuidx(oi); - MemOpIdx new_oi; - unsigned a_bits; - - tcg_debug_assert((mop & (MO_BSWAP|MO_SSIZE)) == (MO_LE|MO_128)); - a_bits = get_alignment_bits(mop); - - /* Handle CPU specific unaligned behaviour */ - if (addr & ((1 << a_bits) - 1)) { - cpu_unaligned_access(env_cpu(env), addr, MMU_DATA_STORE, - mmu_idx, ra); - } - - /* Construct an unaligned 64-bit replacement MemOpIdx. */ - mop = (mop & ~(MO_SIZE | MO_AMASK)) | MO_64 | MO_UNALN; - new_oi = make_memop_idx(mop, mmu_idx); - - helper_le_stq_mmu(env, addr, int128_getlo(val), new_oi, ra); - helper_le_stq_mmu(env, addr + 8, int128_gethi(val), new_oi, ra); - - qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W); + tcg_debug_assert((get_memop(oi) & (MO_BSWAP|MO_SIZE)) == (MO_LE|MO_128)); + do_st16_mmu(env, addr, val, oi, retaddr); + plugin_store_cb(env, addr, oi); } #include "ldst_common.c.inc" diff --git a/accel/tcg/ldst_atomicity.c.inc b/accel/tcg/ldst_atomicity.c.inc new file mode 100644 index 0000000000..ba5db7c366 --- /dev/null +++ b/accel/tcg/ldst_atomicity.c.inc @@ -0,0 +1,1262 @@ +/* + * Routines common to user and system emulation of load/store. + * + * Copyright (c) 2022 Linaro, Ltd. 
+ * + * SPDX-License-Identifier: GPL-2.0-or-later + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#ifdef CONFIG_ATOMIC64 +# define HAVE_al8 true +#else +# define HAVE_al8 false +#endif +#define HAVE_al8_fast (ATOMIC_REG_SIZE >= 8) + +/* + * If __alignof(unsigned __int128) < 16, GCC may refuse to inline atomics + * that are supported by the host, e.g. s390x. We can force the pointer to + * have our known alignment with __builtin_assume_aligned, however prior to + * GCC 13 that was only reliable with optimization enabled. See + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107389 + */ +#if defined(CONFIG_ATOMIC128_OPT) +# if !defined(__OPTIMIZE__) +# define ATTRIBUTE_ATOMIC128_OPT __attribute__((optimize("O1"))) +# endif +# define CONFIG_ATOMIC128 +#endif +#ifndef ATTRIBUTE_ATOMIC128_OPT +# define ATTRIBUTE_ATOMIC128_OPT +#endif + +#if defined(CONFIG_ATOMIC128) +# define HAVE_al16_fast true +#else +# define HAVE_al16_fast false +#endif +#if defined(CONFIG_ATOMIC128) || defined(CONFIG_CMPXCHG128) +# define HAVE_al16 true +#else +# define HAVE_al16 false +#endif + + +/** + * required_atomicity: + * + * Return the lg2 bytes of atomicity required by @memop for @p. + * If the operation must be split into two operations to be + * examined separately for atomicity, return -lg2. + */ +static int required_atomicity(CPUArchState *env, uintptr_t p, MemOp memop) +{ + MemOp atom = memop & MO_ATOM_MASK; + MemOp size = memop & MO_SIZE; + MemOp half = size ? size - 1 : 0; + unsigned tmp; + int atmax; + + switch (atom) { + case MO_ATOM_NONE: + atmax = MO_8; + break; + + case MO_ATOM_IFALIGN_PAIR: + size = half; + /* fall through */ + + case MO_ATOM_IFALIGN: + tmp = (1 << size) - 1; + atmax = p & tmp ? MO_8 : size; + break; + + case MO_ATOM_WITHIN16: + tmp = p & 15; + atmax = (tmp + (1 << size) <= 16 ? size : MO_8); + break; + + case MO_ATOM_WITHIN16_PAIR: + tmp = p & 15; + if (tmp + (1 << size) <= 16) { + atmax = size; + } else if (tmp + (1 << half) == 16) { + /* + * The pair exactly straddles the boundary. + * Both halves are naturally aligned and atomic. + */ + atmax = half; + } else { + /* + * One of the pair crosses the boundary, and is non-atomic. + * The other of the pair does not cross, and is atomic. + */ + atmax = -half; + } + break; + + case MO_ATOM_SUBALIGN: + /* + * Examine the alignment of p to determine if there are subobjects + * that must be aligned. Note that we only really need ctz4() -- + * any more sigificant bits are discarded by the immediately + * following comparison. + */ + tmp = ctz32(p); + atmax = MIN(size, tmp); + break; + + default: + g_assert_not_reached(); + } + + /* + * Here we have the architectural atomicity of the operation. + * However, when executing in a serial context, we need no extra + * host atomicity in order to avoid racing. This reduction + * avoids looping with cpu_loop_exit_atomic. + */ + if (cpu_in_serial_context(env_cpu(env))) { + return MO_8; + } + return atmax; +} + +/** + * load_atomic2: + * @pv: host address + * + * Atomically load 2 aligned bytes from @pv. + */ +static inline uint16_t load_atomic2(void *pv) +{ + uint16_t *p = __builtin_assume_aligned(pv, 2); + return qatomic_read(p); +} + +/** + * load_atomic4: + * @pv: host address + * + * Atomically load 4 aligned bytes from @pv. 
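+ *
+ * For reference, required_atomicity() above yields, for example:
+ *   MO_ATOM_IFALIGN,  4-byte op, p % 4 == 0   -> MO_32 (fully atomic)
+ *   MO_ATOM_IFALIGN,  4-byte op, p % 4 == 2   -> MO_8  (no atomicity)
+ *   MO_ATOM_WITHIN16, 8-byte op, p % 16 == 4  -> MO_64 (fits in 16 bytes)
+ *   MO_ATOM_WITHIN16_PAIR, 8-byte op, p % 16 == 12 -> MO_32 (the halves
+ *   exactly straddle the boundary, each naturally aligned and atomic)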
+ */ +static inline uint32_t load_atomic4(void *pv) +{ + uint32_t *p = __builtin_assume_aligned(pv, 4); + return qatomic_read(p); +} + +/** + * load_atomic8: + * @pv: host address + * + * Atomically load 8 aligned bytes from @pv. + */ +static inline uint64_t load_atomic8(void *pv) +{ + uint64_t *p = __builtin_assume_aligned(pv, 8); + + qemu_build_assert(HAVE_al8); + return qatomic_read__nocheck(p); +} + +/** + * load_atomic16: + * @pv: host address + * + * Atomically load 16 aligned bytes from @pv. + */ +static inline Int128 ATTRIBUTE_ATOMIC128_OPT +load_atomic16(void *pv) +{ +#ifdef CONFIG_ATOMIC128 + __uint128_t *p = __builtin_assume_aligned(pv, 16); + Int128Alias r; + + r.u = qatomic_read__nocheck(p); + return r.s; +#else + qemu_build_not_reached(); +#endif +} + +/** + * load_atomic8_or_exit: + * @env: cpu context + * @ra: host unwind address + * @pv: host address + * + * Atomically load 8 aligned bytes from @pv. + * If this is not possible, longjmp out to restart serially. + */ +static uint64_t load_atomic8_or_exit(CPUArchState *env, uintptr_t ra, void *pv) +{ + if (HAVE_al8) { + return load_atomic8(pv); + } + +#ifdef CONFIG_USER_ONLY + /* + * If the page is not writable, then assume the value is immutable + * and requires no locking. This ignores the case of MAP_SHARED with + * another process, because the fallback start_exclusive solution + * provides no protection across processes. + */ + if (!page_check_range(h2g(pv), 8, PAGE_WRITE)) { + uint64_t *p = __builtin_assume_aligned(pv, 8); + return *p; + } +#endif + + /* Ultimate fallback: re-execute in serial context. */ + cpu_loop_exit_atomic(env_cpu(env), ra); +} + +/** + * load_atomic16_or_exit: + * @env: cpu context + * @ra: host unwind address + * @pv: host address + * + * Atomically load 16 aligned bytes from @pv. + * If this is not possible, longjmp out to restart serially. + */ +static Int128 load_atomic16_or_exit(CPUArchState *env, uintptr_t ra, void *pv) +{ + Int128 *p = __builtin_assume_aligned(pv, 16); + + if (HAVE_al16_fast) { + return load_atomic16(p); + } + +#ifdef CONFIG_USER_ONLY + /* + * We can only use cmpxchg to emulate a load if the page is writable. + * If the page is not writable, then assume the value is immutable + * and requires no locking. This ignores the case of MAP_SHARED with + * another process, because the fallback start_exclusive solution + * provides no protection across processes. + */ + if (!page_check_range(h2g(p), 16, PAGE_WRITE)) { + return *p; + } +#endif + + /* + * In system mode all guest pages are writable, and for user-only + * we have just checked writability. Try cmpxchg. + */ +#if defined(CONFIG_CMPXCHG128) + /* Swap 0 with 0, with the side-effect of returning the old value. */ + { + Int128Alias r; + r.u = __sync_val_compare_and_swap_16((__uint128_t *)p, 0, 0); + return r.s; + } +#endif + + /* Ultimate fallback: re-execute in serial context. */ + cpu_loop_exit_atomic(env_cpu(env), ra); +} + +/** + * load_atom_extract_al4x2: + * @pv: host address + * + * Load 4 bytes from @p, from two sequential atomic 4-byte loads. + */ +static uint32_t load_atom_extract_al4x2(void *pv) +{ + uintptr_t pi = (uintptr_t)pv; + int sh = (pi & 3) * 8; + uint32_t a, b; + + pv = (void *)(pi & ~3); + a = load_atomic4(pv); + b = load_atomic4(pv + 4); + + if (HOST_BIG_ENDIAN) { + return (a << sh) | (b >> (-sh & 31)); + } else { + return (a >> sh) | (b << (-sh & 31)); + } +} + +/** + * load_atom_extract_al8x2: + * @pv: host address + * + * Load 8 bytes from @p, from two sequential atomic 8-byte loads. 
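+ * For example, pv % 8 == 3 on a little-endian host gives sh == 24, so
+ * the result is (a >> 24) | (b << 40): five bytes from the first aligned
+ * load and three from the second.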
+ */ +static uint64_t load_atom_extract_al8x2(void *pv) +{ + uintptr_t pi = (uintptr_t)pv; + int sh = (pi & 7) * 8; + uint64_t a, b; + + pv = (void *)(pi & ~7); + a = load_atomic8(pv); + b = load_atomic8(pv + 8); + + if (HOST_BIG_ENDIAN) { + return (a << sh) | (b >> (-sh & 63)); + } else { + return (a >> sh) | (b << (-sh & 63)); + } +} + +/** + * load_atom_extract_al8_or_exit: + * @env: cpu context + * @ra: host unwind address + * @pv: host address + * @s: object size in bytes, @s <= 4. + * + * Atomically load @s bytes from @p, when p % s != 0, and [p, p+s-1] does + * not cross an 8-byte boundary. This means that we can perform an atomic + * 8-byte load and extract. + * The value is returned in the low bits of a uint32_t. + */ +static uint32_t load_atom_extract_al8_or_exit(CPUArchState *env, uintptr_t ra, + void *pv, int s) +{ + uintptr_t pi = (uintptr_t)pv; + int o = pi & 7; + int shr = (HOST_BIG_ENDIAN ? 8 - s - o : o) * 8; + + pv = (void *)(pi & ~7); + return load_atomic8_or_exit(env, ra, pv) >> shr; +} + +/** + * load_atom_extract_al16_or_exit: + * @env: cpu context + * @ra: host unwind address + * @p: host address + * @s: object size in bytes, @s <= 8. + * + * Atomically load @s bytes from @p, when p % 16 < 8 + * and p % 16 + s > 8. I.e. does not cross a 16-byte + * boundary, but *does* cross an 8-byte boundary. + * This is the slow version, so we must have eliminated + * any faster load_atom_extract_al8_or_exit case. + * + * If this is not possible, longjmp out to restart serially. + */ +static uint64_t load_atom_extract_al16_or_exit(CPUArchState *env, uintptr_t ra, + void *pv, int s) +{ + uintptr_t pi = (uintptr_t)pv; + int o = pi & 7; + int shr = (HOST_BIG_ENDIAN ? 16 - s - o : o) * 8; + Int128 r; + + /* + * Note constraints above: p & 8 must be clear. + * Provoke SIGBUS if possible otherwise. + */ + pv = (void *)(pi & ~7); + r = load_atomic16_or_exit(env, ra, pv); + + r = int128_urshift(r, shr); + return int128_getlo(r); +} + +/** + * load_atom_extract_al16_or_al8: + * @p: host address + * @s: object size in bytes, @s <= 8. + * + * Load @s bytes from @p, when p % s != 0. If [p, p+s-1] does not + * cross an 16-byte boundary then the access must be 16-byte atomic, + * otherwise the access must be 8-byte atomic. + */ +static inline uint64_t ATTRIBUTE_ATOMIC128_OPT +load_atom_extract_al16_or_al8(void *pv, int s) +{ +#if defined(CONFIG_ATOMIC128) + uintptr_t pi = (uintptr_t)pv; + int o = pi & 7; + int shr = (HOST_BIG_ENDIAN ? 16 - s - o : o) * 8; + __uint128_t r; + + pv = (void *)(pi & ~7); + if (pi & 8) { + uint64_t *p8 = __builtin_assume_aligned(pv, 16, 8); + uint64_t a = qatomic_read__nocheck(p8); + uint64_t b = qatomic_read__nocheck(p8 + 1); + + if (HOST_BIG_ENDIAN) { + r = ((__uint128_t)a << 64) | b; + } else { + r = ((__uint128_t)b << 64) | a; + } + } else { + __uint128_t *p16 = __builtin_assume_aligned(pv, 16, 0); + r = qatomic_read__nocheck(p16); + } + return r >> shr; +#else + qemu_build_not_reached(); +#endif +} + +/** + * load_atom_4_by_2: + * @pv: host address + * + * Load 4 bytes from @pv, with two 2-byte atomic loads. + */ +static inline uint32_t load_atom_4_by_2(void *pv) +{ + uint32_t a = load_atomic2(pv); + uint32_t b = load_atomic2(pv + 2); + + if (HOST_BIG_ENDIAN) { + return (a << 16) | b; + } else { + return (b << 16) | a; + } +} + +/** + * load_atom_8_by_2: + * @pv: host address + * + * Load 8 bytes from @pv, with four 2-byte atomic loads. 
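+ * On a little-endian host the half loaded from pv + 4 supplies the high
+ * 32 bits: ((uint64_t)b << 32) | a.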
+ */ +static inline uint64_t load_atom_8_by_2(void *pv) +{ + uint32_t a = load_atom_4_by_2(pv); + uint32_t b = load_atom_4_by_2(pv + 4); + + if (HOST_BIG_ENDIAN) { + return ((uint64_t)a << 32) | b; + } else { + return ((uint64_t)b << 32) | a; + } +} + +/** + * load_atom_8_by_4: + * @pv: host address + * + * Load 8 bytes from @pv, with two 4-byte atomic loads. + */ +static inline uint64_t load_atom_8_by_4(void *pv) +{ + uint32_t a = load_atomic4(pv); + uint32_t b = load_atomic4(pv + 4); + + if (HOST_BIG_ENDIAN) { + return ((uint64_t)a << 32) | b; + } else { + return ((uint64_t)b << 32) | a; + } +} + +/** + * load_atom_8_by_8_or_4: + * @pv: host address + * + * Load 8 bytes from aligned @pv, with at least 4-byte atomicity. + */ +static inline uint64_t load_atom_8_by_8_or_4(void *pv) +{ + if (HAVE_al8_fast) { + return load_atomic8(pv); + } else { + return load_atom_8_by_4(pv); + } +} + +/** + * load_atom_2: + * @p: host address + * @memop: the full memory op + * + * Load 2 bytes from @p, honoring the atomicity of @memop. + */ +static uint16_t load_atom_2(CPUArchState *env, uintptr_t ra, + void *pv, MemOp memop) +{ + uintptr_t pi = (uintptr_t)pv; + int atmax; + + if (likely((pi & 1) == 0)) { + return load_atomic2(pv); + } + if (HAVE_al16_fast) { + return load_atom_extract_al16_or_al8(pv, 2); + } + + atmax = required_atomicity(env, pi, memop); + switch (atmax) { + case MO_8: + return lduw_he_p(pv); + case MO_16: + /* The only case remaining is MO_ATOM_WITHIN16. */ + if (!HAVE_al8_fast && (pi & 3) == 1) { + /* Big or little endian, we want the middle two bytes. */ + return load_atomic4(pv - 1) >> 8; + } + if ((pi & 15) != 7) { + return load_atom_extract_al8_or_exit(env, ra, pv, 2); + } + return load_atom_extract_al16_or_exit(env, ra, pv, 2); + default: + g_assert_not_reached(); + } +} + +/** + * load_atom_4: + * @p: host address + * @memop: the full memory op + * + * Load 4 bytes from @p, honoring the atomicity of @memop. + */ +static uint32_t load_atom_4(CPUArchState *env, uintptr_t ra, + void *pv, MemOp memop) +{ + uintptr_t pi = (uintptr_t)pv; + int atmax; + + if (likely((pi & 3) == 0)) { + return load_atomic4(pv); + } + if (HAVE_al16_fast) { + return load_atom_extract_al16_or_al8(pv, 4); + } + + atmax = required_atomicity(env, pi, memop); + switch (atmax) { + case MO_8: + case MO_16: + case -MO_16: + /* + * For MO_ATOM_IFALIGN, this is more atomicity than required, + * but it's trivially supported on all hosts, better than 4 + * individual byte loads (when the host requires alignment), + * and overlaps with the MO_ATOM_SUBALIGN case of p % 2 == 0. + */ + return load_atom_extract_al4x2(pv); + case MO_32: + if (!(pi & 4)) { + return load_atom_extract_al8_or_exit(env, ra, pv, 4); + } + return load_atom_extract_al16_or_exit(env, ra, pv, 4); + default: + g_assert_not_reached(); + } +} + +/** + * load_atom_8: + * @p: host address + * @memop: the full memory op + * + * Load 8 bytes from @p, honoring the atomicity of @memop. + */ +static uint64_t load_atom_8(CPUArchState *env, uintptr_t ra, + void *pv, MemOp memop) +{ + uintptr_t pi = (uintptr_t)pv; + int atmax; + + /* + * If the host does not support 8-byte atomics, wait until we have + * examined the atomicity parameters below. 
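+ * Even without host 8-byte atomics, a MO_ATOM_NONE access, or a
+ * misaligned MO_ATOM_IFALIGN one, requires no atomicity at all, and the
+ * byte-wise and pair-wise fallbacks below suffice.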
+ */ + if (HAVE_al8 && likely((pi & 7) == 0)) { + return load_atomic8(pv); + } + if (HAVE_al16_fast) { + return load_atom_extract_al16_or_al8(pv, 8); + } + + atmax = required_atomicity(env, pi, memop); + if (atmax == MO_64) { + if (!HAVE_al8 && (pi & 7) == 0) { + load_atomic8_or_exit(env, ra, pv); + } + return load_atom_extract_al16_or_exit(env, ra, pv, 8); + } + if (HAVE_al8_fast) { + return load_atom_extract_al8x2(pv); + } + switch (atmax) { + case MO_8: + return ldq_he_p(pv); + case MO_16: + return load_atom_8_by_2(pv); + case MO_32: + return load_atom_8_by_4(pv); + case -MO_32: + if (HAVE_al8) { + return load_atom_extract_al8x2(pv); + } + cpu_loop_exit_atomic(env_cpu(env), ra); + default: + g_assert_not_reached(); + } +} + +/** + * load_atom_16: + * @p: host address + * @memop: the full memory op + * + * Load 16 bytes from @p, honoring the atomicity of @memop. + */ +static Int128 load_atom_16(CPUArchState *env, uintptr_t ra, + void *pv, MemOp memop) +{ + uintptr_t pi = (uintptr_t)pv; + int atmax; + Int128 r; + uint64_t a, b; + + /* + * If the host does not support 16-byte atomics, wait until we have + * examined the atomicity parameters below. + */ + if (HAVE_al16_fast && likely((pi & 15) == 0)) { + return load_atomic16(pv); + } + + atmax = required_atomicity(env, pi, memop); + switch (atmax) { + case MO_8: + memcpy(&r, pv, 16); + return r; + case MO_16: + a = load_atom_8_by_2(pv); + b = load_atom_8_by_2(pv + 8); + break; + case MO_32: + a = load_atom_8_by_4(pv); + b = load_atom_8_by_4(pv + 8); + break; + case MO_64: + if (!HAVE_al8) { + cpu_loop_exit_atomic(env_cpu(env), ra); + } + a = load_atomic8(pv); + b = load_atomic8(pv + 8); + break; + case -MO_64: + if (!HAVE_al8) { + cpu_loop_exit_atomic(env_cpu(env), ra); + } + a = load_atom_extract_al8x2(pv); + b = load_atom_extract_al8x2(pv + 8); + break; + case MO_128: + return load_atomic16_or_exit(env, ra, pv); + default: + g_assert_not_reached(); + } + return int128_make128(HOST_BIG_ENDIAN ? b : a, HOST_BIG_ENDIAN ? a : b); +} + +/** + * store_atomic2: + * @pv: host address + * @val: value to store + * + * Atomically store 2 aligned bytes to @pv. + */ +static inline void store_atomic2(void *pv, uint16_t val) +{ + uint16_t *p = __builtin_assume_aligned(pv, 2); + qatomic_set(p, val); +} + +/** + * store_atomic4: + * @pv: host address + * @val: value to store + * + * Atomically store 4 aligned bytes to @pv. + */ +static inline void store_atomic4(void *pv, uint32_t val) +{ + uint32_t *p = __builtin_assume_aligned(pv, 4); + qatomic_set(p, val); +} + +/** + * store_atomic8: + * @pv: host address + * @val: value to store + * + * Atomically store 8 aligned bytes to @pv. + */ +static inline void store_atomic8(void *pv, uint64_t val) +{ + uint64_t *p = __builtin_assume_aligned(pv, 8); + + qemu_build_assert(HAVE_al8); + qatomic_set__nocheck(p, val); +} + +/** + * store_atomic16: + * @pv: host address + * @val: value to store + * + * Atomically store 16 aligned bytes to @pv. + */ +static inline void ATTRIBUTE_ATOMIC128_OPT +store_atomic16(void *pv, Int128Alias val) +{ +#if defined(CONFIG_ATOMIC128) + __uint128_t *pu = __builtin_assume_aligned(pv, 16); + qatomic_set__nocheck(pu, val.u); +#elif defined(CONFIG_CMPXCHG128) + __uint128_t *pu = __builtin_assume_aligned(pv, 16); + __uint128_t o; + + /* + * Without CONFIG_ATOMIC128, __atomic_compare_exchange_n will always + * defer to libatomic, so we must use __sync_*_compare_and_swap_16 + * and accept the sequential consistency that comes with it. 
+ */ + do { + o = *pu; + } while (!__sync_bool_compare_and_swap_16(pu, o, val.u)); +#else + qemu_build_not_reached(); +#endif +} + +/** + * store_atom_4x2 + */ +static inline void store_atom_4_by_2(void *pv, uint32_t val) +{ + store_atomic2(pv, val >> (HOST_BIG_ENDIAN ? 16 : 0)); + store_atomic2(pv + 2, val >> (HOST_BIG_ENDIAN ? 0 : 16)); +} + +/** + * store_atom_8_by_2 + */ +static inline void store_atom_8_by_2(void *pv, uint64_t val) +{ + store_atom_4_by_2(pv, val >> (HOST_BIG_ENDIAN ? 32 : 0)); + store_atom_4_by_2(pv + 4, val >> (HOST_BIG_ENDIAN ? 0 : 32)); +} + +/** + * store_atom_8_by_4 + */ +static inline void store_atom_8_by_4(void *pv, uint64_t val) +{ + store_atomic4(pv, val >> (HOST_BIG_ENDIAN ? 32 : 0)); + store_atomic4(pv + 4, val >> (HOST_BIG_ENDIAN ? 0 : 32)); +} + +/** + * store_atom_insert_al4: + * @p: host address + * @val: shifted value to store + * @msk: mask for value to store + * + * Atomically store @val to @p, masked by @msk. + */ +static void store_atom_insert_al4(uint32_t *p, uint32_t val, uint32_t msk) +{ + uint32_t old, new; + + p = __builtin_assume_aligned(p, 4); + old = qatomic_read(p); + do { + new = (old & ~msk) | val; + } while (!__atomic_compare_exchange_n(p, &old, new, true, + __ATOMIC_RELAXED, __ATOMIC_RELAXED)); +} + +/** + * store_atom_insert_al8: + * @p: host address + * @val: shifted value to store + * @msk: mask for value to store + * + * Atomically store @val to @p masked by @msk. + */ +static void store_atom_insert_al8(uint64_t *p, uint64_t val, uint64_t msk) +{ + uint64_t old, new; + + qemu_build_assert(HAVE_al8); + p = __builtin_assume_aligned(p, 8); + old = qatomic_read__nocheck(p); + do { + new = (old & ~msk) | val; + } while (!__atomic_compare_exchange_n(p, &old, new, true, + __ATOMIC_RELAXED, __ATOMIC_RELAXED)); +} + +/** + * store_atom_insert_al16: + * @p: host address + * @val: shifted value to store + * @msk: mask for value to store + * + * Atomically store @val to @p masked by @msk. + */ +static void ATTRIBUTE_ATOMIC128_OPT +store_atom_insert_al16(Int128 *ps, Int128Alias val, Int128Alias msk) +{ +#if defined(CONFIG_ATOMIC128) + __uint128_t *pu, old, new; + + /* With CONFIG_ATOMIC128, we can avoid the memory barriers. */ + pu = __builtin_assume_aligned(ps, 16); + old = *pu; + do { + new = (old & ~msk.u) | val.u; + } while (!__atomic_compare_exchange_n(pu, &old, new, true, + __ATOMIC_RELAXED, __ATOMIC_RELAXED)); +#elif defined(CONFIG_CMPXCHG128) + __uint128_t *pu, old, new; + + /* + * Without CONFIG_ATOMIC128, __atomic_compare_exchange_n will always + * defer to libatomic, so we must use __sync_*_compare_and_swap_16 + * and accept the sequential consistency that comes with it. + */ + pu = __builtin_assume_aligned(ps, 16); + do { + old = *pu; + new = (old & ~msk.u) | val.u; + } while (!__sync_bool_compare_and_swap_16(pu, old, new)); +#else + qemu_build_not_reached(); +#endif +} + +/** + * store_bytes_leN: + * @pv: host address + * @size: number of bytes to store + * @val_le: data to store + * + * Store @size bytes at @p. The bytes to store are extracted in little-endian order + * from @val_le; return the bytes of @val_le beyond @size that have not been stored. + */ +static uint64_t store_bytes_leN(void *pv, int size, uint64_t val_le) +{ + uint8_t *p = pv; + for (int i = 0; i < size; i++, val_le >>= 8) { + p[i] = val_le; + } + return val_le; +} + +/** + * store_parts_leN + * @pv: host address + * @size: number of bytes to store + * @val_le: data to store + * + * As store_bytes_leN, but atomically on each aligned part. 
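+ * For example, 7 bytes starting at pv % 8 == 1 go out as 1 + 2 + 4 byte
+ * stores: (pv | size) & 7 selects a byte store first, then a 2-aligned
+ * 2-byte store, then a 4-aligned 4-byte store.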
+ */ +G_GNUC_UNUSED +static uint64_t store_parts_leN(void *pv, int size, uint64_t val_le) +{ + do { + int n; + + /* Find minimum of alignment and size */ + switch (((uintptr_t)pv | size) & 7) { + case 4: + store_atomic4(pv, le32_to_cpu(val_le)); + val_le >>= 32; + n = 4; + break; + case 2: + case 6: + store_atomic2(pv, le16_to_cpu(val_le)); + val_le >>= 16; + n = 2; + break; + default: + *(uint8_t *)pv = val_le; + val_le >>= 8; + n = 1; + break; + case 0: + g_assert_not_reached(); + } + pv += n; + size -= n; + } while (size != 0); + + return val_le; +} + +/** + * store_whole_le4 + * @pv: host address + * @size: number of bytes to store + * @val_le: data to store + * + * As store_bytes_leN, but atomically as a whole. + * Four aligned bytes are guaranteed to cover the store. + */ +static uint64_t store_whole_le4(void *pv, int size, uint64_t val_le) +{ + int sz = size * 8; + int o = (uintptr_t)pv & 3; + int sh = o * 8; + uint32_t m = MAKE_64BIT_MASK(0, sz); + uint32_t v; + + if (HOST_BIG_ENDIAN) { + v = bswap32(val_le) >> sh; + m = bswap32(m) >> sh; + } else { + v = val_le << sh; + m <<= sh; + } + store_atom_insert_al4(pv - o, v, m); + return val_le >> sz; +} + +/** + * store_whole_le8 + * @pv: host address + * @size: number of bytes to store + * @val_le: data to store + * + * As store_bytes_leN, but atomically as a whole. + * Eight aligned bytes are guaranteed to cover the store. + */ +static uint64_t store_whole_le8(void *pv, int size, uint64_t val_le) +{ + int sz = size * 8; + int o = (uintptr_t)pv & 7; + int sh = o * 8; + uint64_t m = MAKE_64BIT_MASK(0, sz); + uint64_t v; + + qemu_build_assert(HAVE_al8); + if (HOST_BIG_ENDIAN) { + v = bswap64(val_le) >> sh; + m = bswap64(m) >> sh; + } else { + v = val_le << sh; + m <<= sh; + } + store_atom_insert_al8(pv - o, v, m); + return val_le >> sz; +} + +/** + * store_whole_le16 + * @pv: host address + * @size: number of bytes to store + * @val_le: data to store + * + * As store_bytes_leN, but atomically as a whole. + * 16 aligned bytes are guaranteed to cover the store. + */ +static uint64_t store_whole_le16(void *pv, int size, Int128 val_le) +{ + int sz = size * 8; + int o = (uintptr_t)pv & 15; + int sh = o * 8; + Int128 m, v; + + qemu_build_assert(HAVE_al16); + + /* Like MAKE_64BIT_MASK(0, sz), but larger. */ + if (sz <= 64) { + m = int128_make64(MAKE_64BIT_MASK(0, sz)); + } else { + m = int128_make128(-1, MAKE_64BIT_MASK(0, sz - 64)); + } + + if (HOST_BIG_ENDIAN) { + v = int128_urshift(bswap128(val_le), sh); + m = int128_urshift(bswap128(m), sh); + } else { + v = int128_lshift(val_le, sh); + m = int128_lshift(m, sh); + } + store_atom_insert_al16(pv - o, v, m); + + /* Unused if sz <= 64. */ + return int128_gethi(val_le) >> (sz - 64); +} + +/** + * store_atom_2: + * @p: host address + * @val: the value to store + * @memop: the full memory op + * + * Store 2 bytes to @p, honoring the atomicity of @memop. + */ +static void store_atom_2(CPUArchState *env, uintptr_t ra, + void *pv, MemOp memop, uint16_t val) +{ + uintptr_t pi = (uintptr_t)pv; + int atmax; + + if (likely((pi & 1) == 0)) { + store_atomic2(pv, val); + return; + } + + atmax = required_atomicity(env, pi, memop); + if (atmax == MO_8) { + stw_he_p(pv, val); + return; + } + + /* + * The only case remaining is MO_ATOM_WITHIN16. + * Big or little endian, we want the middle two bytes in each test. 
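+     * The one sub-case not tested below, (pi & 15) == 15, would cross
+     * a 16-byte boundary; MO_ATOM_WITHIN16 requires no atomicity there,
+     * so it was already handled by the MO_8 case above.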
+ */ + if ((pi & 3) == 1) { + store_atom_insert_al4(pv - 1, (uint32_t)val << 8, MAKE_64BIT_MASK(8, 16)); + return; + } else if ((pi & 7) == 3) { + if (HAVE_al8) { + store_atom_insert_al8(pv - 3, (uint64_t)val << 24, MAKE_64BIT_MASK(24, 16)); + return; + } + } else if ((pi & 15) == 7) { + if (HAVE_al16) { + Int128 v = int128_lshift(int128_make64(val), 56); + Int128 m = int128_lshift(int128_make64(0xffff), 56); + store_atom_insert_al16(pv - 7, v, m); + return; + } + } else { + g_assert_not_reached(); + } + + cpu_loop_exit_atomic(env_cpu(env), ra); +} + +/** + * store_atom_4: + * @p: host address + * @val: the value to store + * @memop: the full memory op + * + * Store 4 bytes to @p, honoring the atomicity of @memop. + */ +static void store_atom_4(CPUArchState *env, uintptr_t ra, + void *pv, MemOp memop, uint32_t val) +{ + uintptr_t pi = (uintptr_t)pv; + int atmax; + + if (likely((pi & 3) == 0)) { + store_atomic4(pv, val); + return; + } + + atmax = required_atomicity(env, pi, memop); + switch (atmax) { + case MO_8: + stl_he_p(pv, val); + return; + case MO_16: + store_atom_4_by_2(pv, val); + return; + case -MO_16: + { + uint32_t val_le = cpu_to_le32(val); + int s2 = pi & 3; + int s1 = 4 - s2; + + switch (s2) { + case 1: + val_le = store_whole_le4(pv, s1, val_le); + *(uint8_t *)(pv + 3) = val_le; + break; + case 3: + *(uint8_t *)pv = val_le; + store_whole_le4(pv + 1, s2, val_le >> 8); + break; + case 0: /* aligned */ + case 2: /* atmax MO_16 */ + default: + g_assert_not_reached(); + } + } + return; + case MO_32: + if ((pi & 7) < 4) { + if (HAVE_al8) { + store_whole_le8(pv, 4, cpu_to_le32(val)); + return; + } + } else { + if (HAVE_al16) { + store_whole_le16(pv, 4, int128_make64(cpu_to_le32(val))); + return; + } + } + cpu_loop_exit_atomic(env_cpu(env), ra); + default: + g_assert_not_reached(); + } +} + +/** + * store_atom_8: + * @p: host address + * @val: the value to store + * @memop: the full memory op + * + * Store 8 bytes to @p, honoring the atomicity of @memop. + */ +static void store_atom_8(CPUArchState *env, uintptr_t ra, + void *pv, MemOp memop, uint64_t val) +{ + uintptr_t pi = (uintptr_t)pv; + int atmax; + + if (HAVE_al8 && likely((pi & 7) == 0)) { + store_atomic8(pv, val); + return; + } + + atmax = required_atomicity(env, pi, memop); + switch (atmax) { + case MO_8: + stq_he_p(pv, val); + return; + case MO_16: + store_atom_8_by_2(pv, val); + return; + case MO_32: + store_atom_8_by_4(pv, val); + return; + case -MO_32: + if (HAVE_al8) { + uint64_t val_le = cpu_to_le64(val); + int s2 = pi & 7; + int s1 = 8 - s2; + + switch (s2) { + case 1 ... 3: + val_le = store_whole_le8(pv, s1, val_le); + store_bytes_leN(pv + s1, s2, val_le); + break; + case 5 ... 7: + val_le = store_bytes_leN(pv, s1, val_le); + store_whole_le8(pv + s1, s2, val_le); + break; + case 0: /* aligned */ + case 4: /* atmax MO_32 */ + default: + g_assert_not_reached(); + } + return; + } + break; + case MO_64: + if (HAVE_al16) { + store_whole_le16(pv, 8, int128_make64(cpu_to_le64(val))); + return; + } + break; + default: + g_assert_not_reached(); + } + cpu_loop_exit_atomic(env_cpu(env), ra); +} + +/** + * store_atom_16: + * @p: host address + * @val: the value to store + * @memop: the full memory op + * + * Store 16 bytes to @p, honoring the atomicity of @memop. 
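+ *
+ * If the host cannot provide the required atomicity (e.g. it lacks
+ * 8- or 16-byte atomic operations), exit via cpu_loop_exit_atomic.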
+ */ +static void store_atom_16(CPUArchState *env, uintptr_t ra, + void *pv, MemOp memop, Int128 val) +{ + uintptr_t pi = (uintptr_t)pv; + uint64_t a, b; + int atmax; + + if (HAVE_al16_fast && likely((pi & 15) == 0)) { + store_atomic16(pv, val); + return; + } + + atmax = required_atomicity(env, pi, memop); + + a = HOST_BIG_ENDIAN ? int128_gethi(val) : int128_getlo(val); + b = HOST_BIG_ENDIAN ? int128_getlo(val) : int128_gethi(val); + switch (atmax) { + case MO_8: + memcpy(pv, &val, 16); + return; + case MO_16: + store_atom_8_by_2(pv, a); + store_atom_8_by_2(pv + 8, b); + return; + case MO_32: + store_atom_8_by_4(pv, a); + store_atom_8_by_4(pv + 8, b); + return; + case MO_64: + if (HAVE_al8) { + store_atomic8(pv, a); + store_atomic8(pv + 8, b); + return; + } + break; + case -MO_64: + if (HAVE_al16) { + uint64_t val_le; + int s2 = pi & 15; + int s1 = 16 - s2; + + if (HOST_BIG_ENDIAN) { + val = bswap128(val); + } + switch (s2) { + case 1 ... 7: + val_le = store_whole_le16(pv, s1, val); + store_bytes_leN(pv + s1, s2, val_le); + break; + case 9 ... 15: + store_bytes_leN(pv, s1, int128_getlo(val)); + val = int128_urshift(val, s1 * 8); + store_whole_le16(pv + s1, s2, val); + break; + case 0: /* aligned */ + case 8: /* atmax MO_64 */ + default: + g_assert_not_reached(); + } + return; + } + break; + case MO_128: + if (HAVE_al16) { + store_atomic16(pv, val); + return; + } + break; + default: + g_assert_not_reached(); + } + cpu_loop_exit_atomic(env_cpu(env), ra); +} diff --git a/accel/tcg/plugin-gen.c b/accel/tcg/plugin-gen.c index 5efb8db258..34be1b940c 100644 --- a/accel/tcg/plugin-gen.c +++ b/accel/tcg/plugin-gen.c @@ -92,27 +92,6 @@ void HELPER(plugin_vcpu_mem_cb)(unsigned int vcpu_index, void *userdata) { } -static void do_gen_mem_cb(TCGv vaddr, uint32_t info) -{ - TCGv_i32 cpu_index = tcg_temp_ebb_new_i32(); - TCGv_i32 meminfo = tcg_temp_ebb_new_i32(); - TCGv_i64 vaddr64 = tcg_temp_ebb_new_i64(); - TCGv_ptr udata = tcg_temp_ebb_new_ptr(); - - tcg_gen_movi_i32(meminfo, info); - tcg_gen_movi_ptr(udata, 0); - tcg_gen_ld_i32(cpu_index, cpu_env, - -offsetof(ArchCPU, env) + offsetof(CPUState, cpu_index)); - tcg_gen_extu_tl_i64(vaddr64, vaddr); - - gen_helper_plugin_vcpu_mem_cb(cpu_index, meminfo, vaddr64, udata); - - tcg_temp_free_ptr(udata); - tcg_temp_free_i64(vaddr64); - tcg_temp_free_i32(meminfo); - tcg_temp_free_i32(cpu_index); -} - static void gen_empty_udata_cb(void) { TCGv_i32 cpu_index = tcg_temp_ebb_new_i32(); @@ -145,9 +124,22 @@ static void gen_empty_inline_cb(void) tcg_temp_free_i64(val); } -static void gen_empty_mem_cb(TCGv addr, uint32_t info) +static void gen_empty_mem_cb(TCGv_i64 addr, uint32_t info) { - do_gen_mem_cb(addr, info); + TCGv_i32 cpu_index = tcg_temp_ebb_new_i32(); + TCGv_i32 meminfo = tcg_temp_ebb_new_i32(); + TCGv_ptr udata = tcg_temp_ebb_new_ptr(); + + tcg_gen_movi_i32(meminfo, info); + tcg_gen_movi_ptr(udata, 0); + tcg_gen_ld_i32(cpu_index, cpu_env, + -offsetof(ArchCPU, env) + offsetof(CPUState, cpu_index)); + + gen_helper_plugin_vcpu_mem_cb(cpu_index, meminfo, addr, udata); + + tcg_temp_free_ptr(udata); + tcg_temp_free_i32(meminfo); + tcg_temp_free_i32(cpu_index); } /* @@ -202,35 +194,17 @@ static void plugin_gen_empty_callback(enum plugin_gen_from from) } } -union mem_gen_fn { - void (*mem_fn)(TCGv, uint32_t); - void (*inline_fn)(void); -}; - -static void gen_mem_wrapped(enum plugin_gen_cb type, - const union mem_gen_fn *f, TCGv addr, - uint32_t info, bool is_mem) +void plugin_gen_empty_mem_callback(TCGv_i64 addr, uint32_t info) { enum qemu_plugin_mem_rw rw = 
get_plugin_meminfo_rw(info); - gen_plugin_cb_start(PLUGIN_GEN_FROM_MEM, type, rw); - if (is_mem) { - f->mem_fn(addr, info); - } else { - f->inline_fn(); - } + gen_plugin_cb_start(PLUGIN_GEN_FROM_MEM, PLUGIN_GEN_CB_MEM, rw); + gen_empty_mem_cb(addr, info); tcg_gen_plugin_cb_end(); -} -void plugin_gen_empty_mem_callback(TCGv addr, uint32_t info) -{ - union mem_gen_fn fn; - - fn.mem_fn = gen_empty_mem_cb; - gen_mem_wrapped(PLUGIN_GEN_CB_MEM, &fn, addr, info, true); - - fn.inline_fn = gen_empty_inline_cb; - gen_mem_wrapped(PLUGIN_GEN_CB_INLINE, &fn, 0, info, false); + gen_plugin_cb_start(PLUGIN_GEN_FROM_MEM, PLUGIN_GEN_CB_INLINE, rw); + gen_empty_inline_cb(); + tcg_gen_plugin_cb_end(); } static TCGOp *find_op(TCGOp *op, TCGOpcode opc) diff --git a/accel/tcg/tcg-runtime.h b/accel/tcg/tcg-runtime.h index b8e6421c8a..6f8c2061d0 100644 --- a/accel/tcg/tcg-runtime.h +++ b/accel/tcg/tcg-runtime.h @@ -39,62 +39,65 @@ DEF_HELPER_FLAGS_1(exit_atomic, TCG_CALL_NO_WG, noreturn, env) DEF_HELPER_FLAGS_3(memset, TCG_CALL_NO_RWG, ptr, ptr, int, ptr) #endif /* IN_HELPER_PROTO */ +DEF_HELPER_FLAGS_3(ld_i128, TCG_CALL_NO_WG, i128, env, i64, i32) +DEF_HELPER_FLAGS_4(st_i128, TCG_CALL_NO_WG, void, env, i64, i128, i32) + DEF_HELPER_FLAGS_5(atomic_cmpxchgb, TCG_CALL_NO_WG, - i32, env, tl, i32, i32, i32) + i32, env, i64, i32, i32, i32) DEF_HELPER_FLAGS_5(atomic_cmpxchgw_be, TCG_CALL_NO_WG, - i32, env, tl, i32, i32, i32) + i32, env, i64, i32, i32, i32) DEF_HELPER_FLAGS_5(atomic_cmpxchgw_le, TCG_CALL_NO_WG, - i32, env, tl, i32, i32, i32) + i32, env, i64, i32, i32, i32) DEF_HELPER_FLAGS_5(atomic_cmpxchgl_be, TCG_CALL_NO_WG, - i32, env, tl, i32, i32, i32) + i32, env, i64, i32, i32, i32) DEF_HELPER_FLAGS_5(atomic_cmpxchgl_le, TCG_CALL_NO_WG, - i32, env, tl, i32, i32, i32) + i32, env, i64, i32, i32, i32) #ifdef CONFIG_ATOMIC64 DEF_HELPER_FLAGS_5(atomic_cmpxchgq_be, TCG_CALL_NO_WG, - i64, env, tl, i64, i64, i32) + i64, env, i64, i64, i64, i32) DEF_HELPER_FLAGS_5(atomic_cmpxchgq_le, TCG_CALL_NO_WG, - i64, env, tl, i64, i64, i32) + i64, env, i64, i64, i64, i32) #endif #ifdef CONFIG_CMPXCHG128 DEF_HELPER_FLAGS_5(atomic_cmpxchgo_be, TCG_CALL_NO_WG, - i128, env, tl, i128, i128, i32) + i128, env, i64, i128, i128, i32) DEF_HELPER_FLAGS_5(atomic_cmpxchgo_le, TCG_CALL_NO_WG, - i128, env, tl, i128, i128, i32) + i128, env, i64, i128, i128, i32) #endif DEF_HELPER_FLAGS_5(nonatomic_cmpxchgo_be, TCG_CALL_NO_WG, - i128, env, tl, i128, i128, i32) + i128, env, i64, i128, i128, i32) DEF_HELPER_FLAGS_5(nonatomic_cmpxchgo_le, TCG_CALL_NO_WG, - i128, env, tl, i128, i128, i32) + i128, env, i64, i128, i128, i32) #ifdef CONFIG_ATOMIC64 #define GEN_ATOMIC_HELPERS(NAME) \ DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), b), \ - TCG_CALL_NO_WG, i32, env, tl, i32, i32) \ + TCG_CALL_NO_WG, i32, env, i64, i32, i32) \ DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), w_le), \ - TCG_CALL_NO_WG, i32, env, tl, i32, i32) \ + TCG_CALL_NO_WG, i32, env, i64, i32, i32) \ DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), w_be), \ - TCG_CALL_NO_WG, i32, env, tl, i32, i32) \ + TCG_CALL_NO_WG, i32, env, i64, i32, i32) \ DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), l_le), \ - TCG_CALL_NO_WG, i32, env, tl, i32, i32) \ + TCG_CALL_NO_WG, i32, env, i64, i32, i32) \ DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), l_be), \ - TCG_CALL_NO_WG, i32, env, tl, i32, i32) \ + TCG_CALL_NO_WG, i32, env, i64, i32, i32) \ DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), q_le), \ - TCG_CALL_NO_WG, i64, env, tl, i64, i32) \ + TCG_CALL_NO_WG, i64, env, i64, i64, i32) \ DEF_HELPER_FLAGS_4(glue(glue(atomic_, 
NAME), q_be), \ - TCG_CALL_NO_WG, i64, env, tl, i64, i32) + TCG_CALL_NO_WG, i64, env, i64, i64, i32) #else #define GEN_ATOMIC_HELPERS(NAME) \ DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), b), \ - TCG_CALL_NO_WG, i32, env, tl, i32, i32) \ + TCG_CALL_NO_WG, i32, env, i64, i32, i32) \ DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), w_le), \ - TCG_CALL_NO_WG, i32, env, tl, i32, i32) \ + TCG_CALL_NO_WG, i32, env, i64, i32, i32) \ DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), w_be), \ - TCG_CALL_NO_WG, i32, env, tl, i32, i32) \ + TCG_CALL_NO_WG, i32, env, i64, i32, i32) \ DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), l_le), \ - TCG_CALL_NO_WG, i32, env, tl, i32, i32) \ + TCG_CALL_NO_WG, i32, env, i64, i32, i32) \ DEF_HELPER_FLAGS_4(glue(glue(atomic_, NAME), l_be), \ - TCG_CALL_NO_WG, i32, env, tl, i32, i32) + TCG_CALL_NO_WG, i32, env, i64, i32, i32) #endif /* CONFIG_ATOMIC64 */ GEN_ATOMIC_HELPERS(fetch_add) diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c index 5b13281119..353849ca6d 100644 --- a/accel/tcg/translate-all.c +++ b/accel/tcg/translate-all.c @@ -72,9 +72,11 @@ QEMU_BUILD_BUG_ON(CPU_TRACE_DSTATE_MAX_EVENTS > TBContext tb_ctx; -/* Encode VAL as a signed leb128 sequence at P. - Return P incremented past the encoded value. */ -static uint8_t *encode_sleb128(uint8_t *p, target_long val) +/* + * Encode VAL as a signed leb128 sequence at P. + * Return P incremented past the encoded value. + */ +static uint8_t *encode_sleb128(uint8_t *p, int64_t val) { int more, byte; @@ -92,21 +94,23 @@ static uint8_t *encode_sleb128(uint8_t *p, target_long val) return p; } -/* Decode a signed leb128 sequence at *PP; increment *PP past the - decoded value. Return the decoded value. */ -static target_long decode_sleb128(const uint8_t **pp) +/* + * Decode a signed leb128 sequence at *PP; increment *PP past the + * decoded value. Return the decoded value. 
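+ * E.g. -1 decodes from the single byte 0x7f, while 64 needs the two
+ * bytes 0xc0 0x00: the sign bit of 0x40 forces the extra byte.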
+ */ +static int64_t decode_sleb128(const uint8_t **pp) { const uint8_t *p = *pp; - target_long val = 0; + int64_t val = 0; int byte, shift = 0; do { byte = *p++; - val |= (target_ulong)(byte & 0x7f) << shift; + val |= (int64_t)(byte & 0x7f) << shift; shift += 7; } while (byte & 0x80); if (shift < TARGET_LONG_BITS && (byte & 0x40)) { - val |= -(target_ulong)1 << shift; + val |= -(int64_t)1 << shift; } *pp = p; @@ -132,7 +136,7 @@ static int encode_search(TranslationBlock *tb, uint8_t *block) int i, j, n; for (i = 0, n = tb->icount; i < n; ++i) { - target_ulong prev; + uint64_t prev; for (j = 0; j < TARGET_INSN_START_WORDS; ++j) { if (i == 0) { @@ -352,6 +356,13 @@ TranslationBlock *tb_gen_code(CPUState *cpu, tb_set_page_addr0(tb, phys_pc); tb_set_page_addr1(tb, -1); tcg_ctx->gen_tb = tb; + tcg_ctx->addr_type = TCG_TYPE_TL; +#ifdef CONFIG_SOFTMMU + tcg_ctx->page_bits = TARGET_PAGE_BITS; + tcg_ctx->page_mask = TARGET_PAGE_MASK; + tcg_ctx->tlb_dyn_max_bits = CPU_TLB_DYN_MAX_BITS; +#endif + tb_overflow: #ifdef CONFIG_PROFILER @@ -444,7 +455,7 @@ TranslationBlock *tb_gen_code(CPUState *cpu, /* Dump header and the first instruction */ fprintf(logfile, "OUT: [size=%d]\n", gen_code_size); fprintf(logfile, - " -- guest addr 0x" TARGET_FMT_lx " + tb prologue\n", + " -- guest addr 0x%016" PRIx64 " + tb prologue\n", tcg_ctx->gen_insn_data[insn][0]); chunk_start = tcg_ctx->gen_insn_end_off[insn]; disas(logfile, tb->tc.ptr, chunk_start); @@ -457,7 +468,7 @@ TranslationBlock *tb_gen_code(CPUState *cpu, while (insn < tb->icount) { size_t chunk_end = tcg_ctx->gen_insn_end_off[insn]; if (chunk_end > chunk_start) { - fprintf(logfile, " -- guest addr 0x" TARGET_FMT_lx "\n", + fprintf(logfile, " -- guest addr 0x%016" PRIx64 "\n", tcg_ctx->gen_insn_data[insn][0]); disas(logfile, tb->tc.ptr + chunk_start, chunk_end - chunk_start); diff --git a/accel/tcg/user-exec.c b/accel/tcg/user-exec.c index fc597a010d..36ad8284a5 100644 --- a/accel/tcg/user-exec.c +++ b/accel/tcg/user-exec.c @@ -889,35 +889,9 @@ void page_reset_target_data(target_ulong start, target_ulong last) { } /* The softmmu versions of these helpers are in cputlb.c. */ -/* - * Verify that we have passed the correct MemOp to the correct function. - * - * We could present one function to target code, and dispatch based on - * the MemOp, but so far we have worked hard to avoid an indirect function - * call along the memory path. 
- */ -static void validate_memop(MemOpIdx oi, MemOp expected) -{ -#ifdef CONFIG_DEBUG_TCG - MemOp have = get_memop(oi) & (MO_SIZE | MO_BSWAP); - assert(have == expected); -#endif -} - -void helper_unaligned_ld(CPUArchState *env, target_ulong addr) -{ - cpu_loop_exit_sigbus(env_cpu(env), addr, MMU_DATA_LOAD, GETPC()); -} - -void helper_unaligned_st(CPUArchState *env, target_ulong addr) -{ - cpu_loop_exit_sigbus(env_cpu(env), addr, MMU_DATA_STORE, GETPC()); -} - -static void *cpu_mmu_lookup(CPUArchState *env, target_ulong addr, - MemOpIdx oi, uintptr_t ra, MMUAccessType type) +static void *cpu_mmu_lookup(CPUArchState *env, abi_ptr addr, + MemOp mop, uintptr_t ra, MMUAccessType type) { - MemOp mop = get_memop(oi); int a_bits = get_alignment_bits(mop); void *ret; @@ -931,116 +905,251 @@ static void *cpu_mmu_lookup(CPUArchState *env, target_ulong addr, return ret; } -uint8_t cpu_ldb_mmu(CPUArchState *env, abi_ptr addr, - MemOpIdx oi, uintptr_t ra) +#include "ldst_atomicity.c.inc" + +static uint8_t do_ld1_mmu(CPUArchState *env, abi_ptr addr, + MemOp mop, uintptr_t ra) { void *haddr; uint8_t ret; - validate_memop(oi, MO_UB); - haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_LOAD); + tcg_debug_assert((mop & MO_SIZE) == MO_8); + haddr = cpu_mmu_lookup(env, addr, mop, ra, MMU_DATA_LOAD); ret = ldub_p(haddr); clear_helper_retaddr(); + return ret; +} + +tcg_target_ulong helper_ldub_mmu(CPUArchState *env, uint64_t addr, + MemOpIdx oi, uintptr_t ra) +{ + return do_ld1_mmu(env, addr, get_memop(oi), ra); +} + +tcg_target_ulong helper_ldsb_mmu(CPUArchState *env, uint64_t addr, + MemOpIdx oi, uintptr_t ra) +{ + return (int8_t)do_ld1_mmu(env, addr, get_memop(oi), ra); +} + +uint8_t cpu_ldb_mmu(CPUArchState *env, abi_ptr addr, + MemOpIdx oi, uintptr_t ra) +{ + uint8_t ret = do_ld1_mmu(env, addr, get_memop(oi), ra); qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R); return ret; } -uint16_t cpu_ldw_be_mmu(CPUArchState *env, abi_ptr addr, - MemOpIdx oi, uintptr_t ra) +static uint16_t do_ld2_he_mmu(CPUArchState *env, abi_ptr addr, + MemOp mop, uintptr_t ra) { void *haddr; uint16_t ret; - validate_memop(oi, MO_BEUW); - haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_LOAD); - ret = lduw_be_p(haddr); + tcg_debug_assert((mop & MO_SIZE) == MO_16); + haddr = cpu_mmu_lookup(env, addr, mop, ra, MMU_DATA_LOAD); + ret = load_atom_2(env, ra, haddr, mop); clear_helper_retaddr(); - qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R); return ret; } -uint32_t cpu_ldl_be_mmu(CPUArchState *env, abi_ptr addr, +tcg_target_ulong helper_lduw_mmu(CPUArchState *env, uint64_t addr, + MemOpIdx oi, uintptr_t ra) +{ + MemOp mop = get_memop(oi); + uint16_t ret = do_ld2_he_mmu(env, addr, mop, ra); + + if (mop & MO_BSWAP) { + ret = bswap16(ret); + } + return ret; +} + +tcg_target_ulong helper_ldsw_mmu(CPUArchState *env, uint64_t addr, + MemOpIdx oi, uintptr_t ra) +{ + MemOp mop = get_memop(oi); + int16_t ret = do_ld2_he_mmu(env, addr, mop, ra); + + if (mop & MO_BSWAP) { + ret = bswap16(ret); + } + return ret; +} + +uint16_t cpu_ldw_be_mmu(CPUArchState *env, abi_ptr addr, MemOpIdx oi, uintptr_t ra) { + MemOp mop = get_memop(oi); + uint16_t ret; + + tcg_debug_assert((mop & MO_BSWAP) == MO_BE); + ret = do_ld2_he_mmu(env, addr, mop, ra); + qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R); + return cpu_to_be16(ret); +} + +uint16_t cpu_ldw_le_mmu(CPUArchState *env, abi_ptr addr, + MemOpIdx oi, uintptr_t ra) +{ + MemOp mop = get_memop(oi); + uint16_t ret; + + tcg_debug_assert((mop & MO_BSWAP) 
== MO_LE); + ret = do_ld2_he_mmu(env, addr, mop, ra); + qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R); + return cpu_to_le16(ret); +} + +static uint32_t do_ld4_he_mmu(CPUArchState *env, abi_ptr addr, + MemOp mop, uintptr_t ra) +{ void *haddr; uint32_t ret; - validate_memop(oi, MO_BEUL); - haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_LOAD); - ret = ldl_be_p(haddr); + tcg_debug_assert((mop & MO_SIZE) == MO_32); + haddr = cpu_mmu_lookup(env, addr, mop, ra, MMU_DATA_LOAD); + ret = load_atom_4(env, ra, haddr, mop); clear_helper_retaddr(); - qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R); return ret; } -uint64_t cpu_ldq_be_mmu(CPUArchState *env, abi_ptr addr, +tcg_target_ulong helper_ldul_mmu(CPUArchState *env, uint64_t addr, + MemOpIdx oi, uintptr_t ra) +{ + MemOp mop = get_memop(oi); + uint32_t ret = do_ld4_he_mmu(env, addr, mop, ra); + + if (mop & MO_BSWAP) { + ret = bswap32(ret); + } + return ret; +} + +tcg_target_ulong helper_ldsl_mmu(CPUArchState *env, uint64_t addr, + MemOpIdx oi, uintptr_t ra) +{ + MemOp mop = get_memop(oi); + int32_t ret = do_ld4_he_mmu(env, addr, mop, ra); + + if (mop & MO_BSWAP) { + ret = bswap32(ret); + } + return ret; +} + +uint32_t cpu_ldl_be_mmu(CPUArchState *env, abi_ptr addr, MemOpIdx oi, uintptr_t ra) { + MemOp mop = get_memop(oi); + uint32_t ret; + + tcg_debug_assert((mop & MO_BSWAP) == MO_BE); + ret = do_ld4_he_mmu(env, addr, mop, ra); + qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R); + return cpu_to_be32(ret); +} + +uint32_t cpu_ldl_le_mmu(CPUArchState *env, abi_ptr addr, + MemOpIdx oi, uintptr_t ra) +{ + MemOp mop = get_memop(oi); + uint32_t ret; + + tcg_debug_assert((mop & MO_BSWAP) == MO_LE); + ret = do_ld4_he_mmu(env, addr, mop, ra); + qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R); + return cpu_to_le32(ret); +} + +static uint64_t do_ld8_he_mmu(CPUArchState *env, abi_ptr addr, + MemOp mop, uintptr_t ra) +{ void *haddr; uint64_t ret; - validate_memop(oi, MO_BEUQ); - haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_LOAD); - ret = ldq_be_p(haddr); + tcg_debug_assert((mop & MO_SIZE) == MO_64); + haddr = cpu_mmu_lookup(env, addr, mop, ra, MMU_DATA_LOAD); + ret = load_atom_8(env, ra, haddr, mop); clear_helper_retaddr(); - qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R); return ret; } -uint16_t cpu_ldw_le_mmu(CPUArchState *env, abi_ptr addr, +uint64_t helper_ldq_mmu(CPUArchState *env, uint64_t addr, MemOpIdx oi, uintptr_t ra) { - void *haddr; - uint16_t ret; + MemOp mop = get_memop(oi); + uint64_t ret = do_ld8_he_mmu(env, addr, mop, ra); - validate_memop(oi, MO_LEUW); - haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_LOAD); - ret = lduw_le_p(haddr); - clear_helper_retaddr(); - qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R); + if (mop & MO_BSWAP) { + ret = bswap64(ret); + } return ret; } -uint32_t cpu_ldl_le_mmu(CPUArchState *env, abi_ptr addr, +uint64_t cpu_ldq_be_mmu(CPUArchState *env, abi_ptr addr, MemOpIdx oi, uintptr_t ra) { - void *haddr; - uint32_t ret; + MemOp mop = get_memop(oi); + uint64_t ret; - validate_memop(oi, MO_LEUL); - haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_LOAD); - ret = ldl_le_p(haddr); - clear_helper_retaddr(); + tcg_debug_assert((mop & MO_BSWAP) == MO_BE); + ret = do_ld8_he_mmu(env, addr, mop, ra); qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R); - return ret; + return cpu_to_be64(ret); } uint64_t cpu_ldq_le_mmu(CPUArchState *env, abi_ptr addr, MemOpIdx oi, uintptr_t ra) { - void 
*haddr; + MemOp mop = get_memop(oi); uint64_t ret; - validate_memop(oi, MO_LEUQ); - haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_LOAD); - ret = ldq_le_p(haddr); - clear_helper_retaddr(); + tcg_debug_assert((mop & MO_BSWAP) == MO_LE); + ret = do_ld8_he_mmu(env, addr, mop, ra); qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R); + return cpu_to_le64(ret); +} + +static Int128 do_ld16_he_mmu(CPUArchState *env, abi_ptr addr, + MemOp mop, uintptr_t ra) +{ + void *haddr; + Int128 ret; + + tcg_debug_assert((mop & MO_SIZE) == MO_128); + haddr = cpu_mmu_lookup(env, addr, mop, ra, MMU_DATA_LOAD); + ret = load_atom_16(env, ra, haddr, mop); + clear_helper_retaddr(); return ret; } +Int128 helper_ld16_mmu(CPUArchState *env, uint64_t addr, + MemOpIdx oi, uintptr_t ra) +{ + MemOp mop = get_memop(oi); + Int128 ret = do_ld16_he_mmu(env, addr, mop, ra); + + if (mop & MO_BSWAP) { + ret = bswap128(ret); + } + return ret; +} + +Int128 helper_ld_i128(CPUArchState *env, uint64_t addr, MemOpIdx oi) +{ + return helper_ld16_mmu(env, addr, oi, GETPC()); +} + Int128 cpu_ld16_be_mmu(CPUArchState *env, abi_ptr addr, MemOpIdx oi, uintptr_t ra) { - void *haddr; + MemOp mop = get_memop(oi); Int128 ret; - validate_memop(oi, MO_128 | MO_BE); - haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_LOAD); - memcpy(&ret, haddr, 16); - clear_helper_retaddr(); + tcg_debug_assert((mop & MO_BSWAP) == MO_BE); + ret = do_ld16_he_mmu(env, addr, mop, ra); qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R); - if (!HOST_BIG_ENDIAN) { ret = bswap128(ret); } @@ -1050,132 +1159,218 @@ Int128 cpu_ld16_be_mmu(CPUArchState *env, abi_ptr addr, Int128 cpu_ld16_le_mmu(CPUArchState *env, abi_ptr addr, MemOpIdx oi, uintptr_t ra) { - void *haddr; + MemOp mop = get_memop(oi); Int128 ret; - validate_memop(oi, MO_128 | MO_LE); - haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_LOAD); - memcpy(&ret, haddr, 16); - clear_helper_retaddr(); + tcg_debug_assert((mop & MO_BSWAP) == MO_LE); + ret = do_ld16_he_mmu(env, addr, mop, ra); qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R); - if (HOST_BIG_ENDIAN) { ret = bswap128(ret); } return ret; } -void cpu_stb_mmu(CPUArchState *env, abi_ptr addr, uint8_t val, - MemOpIdx oi, uintptr_t ra) +static void do_st1_mmu(CPUArchState *env, abi_ptr addr, uint8_t val, + MemOp mop, uintptr_t ra) { void *haddr; - validate_memop(oi, MO_UB); - haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_STORE); + tcg_debug_assert((mop & MO_SIZE) == MO_8); + haddr = cpu_mmu_lookup(env, addr, mop, ra, MMU_DATA_STORE); stb_p(haddr, val); clear_helper_retaddr(); - qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W); } -void cpu_stw_be_mmu(CPUArchState *env, abi_ptr addr, uint16_t val, +void helper_stb_mmu(CPUArchState *env, uint64_t addr, uint32_t val, MemOpIdx oi, uintptr_t ra) { - void *haddr; + do_st1_mmu(env, addr, val, get_memop(oi), ra); +} - validate_memop(oi, MO_BEUW); - haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_STORE); - stw_be_p(haddr, val); - clear_helper_retaddr(); +void cpu_stb_mmu(CPUArchState *env, abi_ptr addr, uint8_t val, + MemOpIdx oi, uintptr_t ra) +{ + do_st1_mmu(env, addr, val, get_memop(oi), ra); qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W); } -void cpu_stl_be_mmu(CPUArchState *env, abi_ptr addr, uint32_t val, - MemOpIdx oi, uintptr_t ra) +static void do_st2_he_mmu(CPUArchState *env, abi_ptr addr, uint16_t val, + MemOp mop, uintptr_t ra) { void *haddr; - validate_memop(oi, MO_BEUL); - haddr = cpu_mmu_lookup(env, addr, oi, 
ra, MMU_DATA_STORE); - stl_be_p(haddr, val); + tcg_debug_assert((mop & MO_SIZE) == MO_16); + haddr = cpu_mmu_lookup(env, addr, mop, ra, MMU_DATA_STORE); + store_atom_2(env, ra, haddr, mop, val); clear_helper_retaddr(); - qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W); } -void cpu_stq_be_mmu(CPUArchState *env, abi_ptr addr, uint64_t val, +void helper_stw_mmu(CPUArchState *env, uint64_t addr, uint32_t val, MemOpIdx oi, uintptr_t ra) { - void *haddr; + MemOp mop = get_memop(oi); - validate_memop(oi, MO_BEUQ); - haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_STORE); - stq_be_p(haddr, val); - clear_helper_retaddr(); + if (mop & MO_BSWAP) { + val = bswap16(val); + } + do_st2_he_mmu(env, addr, val, mop, ra); +} + +void cpu_stw_be_mmu(CPUArchState *env, abi_ptr addr, uint16_t val, + MemOpIdx oi, uintptr_t ra) +{ + MemOp mop = get_memop(oi); + + tcg_debug_assert((mop & MO_BSWAP) == MO_BE); + do_st2_he_mmu(env, addr, be16_to_cpu(val), mop, ra); qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W); } void cpu_stw_le_mmu(CPUArchState *env, abi_ptr addr, uint16_t val, MemOpIdx oi, uintptr_t ra) { + MemOp mop = get_memop(oi); + + tcg_debug_assert((mop & MO_BSWAP) == MO_LE); + do_st2_he_mmu(env, addr, le16_to_cpu(val), mop, ra); + qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W); +} + +static void do_st4_he_mmu(CPUArchState *env, abi_ptr addr, uint32_t val, + MemOp mop, uintptr_t ra) +{ void *haddr; - validate_memop(oi, MO_LEUW); - haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_STORE); - stw_le_p(haddr, val); + tcg_debug_assert((mop & MO_SIZE) == MO_32); + haddr = cpu_mmu_lookup(env, addr, mop, ra, MMU_DATA_STORE); + store_atom_4(env, ra, haddr, mop, val); clear_helper_retaddr(); +} + +void helper_stl_mmu(CPUArchState *env, uint64_t addr, uint32_t val, + MemOpIdx oi, uintptr_t ra) +{ + MemOp mop = get_memop(oi); + + if (mop & MO_BSWAP) { + val = bswap32(val); + } + do_st4_he_mmu(env, addr, val, mop, ra); +} + +void cpu_stl_be_mmu(CPUArchState *env, abi_ptr addr, uint32_t val, + MemOpIdx oi, uintptr_t ra) +{ + MemOp mop = get_memop(oi); + + tcg_debug_assert((mop & MO_BSWAP) == MO_BE); + do_st4_he_mmu(env, addr, be32_to_cpu(val), mop, ra); qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W); } void cpu_stl_le_mmu(CPUArchState *env, abi_ptr addr, uint32_t val, MemOpIdx oi, uintptr_t ra) { + MemOp mop = get_memop(oi); + + tcg_debug_assert((mop & MO_BSWAP) == MO_LE); + do_st4_he_mmu(env, addr, le32_to_cpu(val), mop, ra); + qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W); +} + +static void do_st8_he_mmu(CPUArchState *env, abi_ptr addr, uint64_t val, + MemOp mop, uintptr_t ra) +{ void *haddr; - validate_memop(oi, MO_LEUL); - haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_STORE); - stl_le_p(haddr, val); + tcg_debug_assert((mop & MO_SIZE) == MO_64); + haddr = cpu_mmu_lookup(env, addr, mop, ra, MMU_DATA_STORE); + store_atom_8(env, ra, haddr, mop, val); clear_helper_retaddr(); +} + +void helper_stq_mmu(CPUArchState *env, uint64_t addr, uint64_t val, + MemOpIdx oi, uintptr_t ra) +{ + MemOp mop = get_memop(oi); + + if (mop & MO_BSWAP) { + val = bswap64(val); + } + do_st8_he_mmu(env, addr, val, mop, ra); +} + +void cpu_stq_be_mmu(CPUArchState *env, abi_ptr addr, uint64_t val, + MemOpIdx oi, uintptr_t ra) +{ + MemOp mop = get_memop(oi); + + tcg_debug_assert((mop & MO_BSWAP) == MO_BE); + do_st8_he_mmu(env, addr, cpu_to_be64(val), mop, ra); qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W); } void 
cpu_stq_le_mmu(CPUArchState *env, abi_ptr addr, uint64_t val, MemOpIdx oi, uintptr_t ra) { + MemOp mop = get_memop(oi); + + tcg_debug_assert((mop & MO_BSWAP) == MO_LE); + do_st8_he_mmu(env, addr, cpu_to_le64(val), mop, ra); + qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W); +} + +static void do_st16_he_mmu(CPUArchState *env, abi_ptr addr, Int128 val, + MemOp mop, uintptr_t ra) +{ void *haddr; - validate_memop(oi, MO_LEUQ); - haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_STORE); - stq_le_p(haddr, val); + tcg_debug_assert((mop & MO_SIZE) == MO_128); + haddr = cpu_mmu_lookup(env, addr, mop, ra, MMU_DATA_STORE); + store_atom_16(env, ra, haddr, mop, val); clear_helper_retaddr(); - qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W); +} + +void helper_st16_mmu(CPUArchState *env, uint64_t addr, Int128 val, + MemOpIdx oi, uintptr_t ra) +{ + MemOp mop = get_memop(oi); + + if (mop & MO_BSWAP) { + val = bswap128(val); + } + do_st16_he_mmu(env, addr, val, mop, ra); +} + +void helper_st_i128(CPUArchState *env, uint64_t addr, Int128 val, MemOpIdx oi) +{ + helper_st16_mmu(env, addr, val, oi, GETPC()); } void cpu_st16_be_mmu(CPUArchState *env, abi_ptr addr, Int128 val, MemOpIdx oi, uintptr_t ra) { - void *haddr; + MemOp mop = get_memop(oi); - validate_memop(oi, MO_128 | MO_BE); - haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_STORE); + tcg_debug_assert((mop & MO_BSWAP) == MO_BE); if (!HOST_BIG_ENDIAN) { val = bswap128(val); } - memcpy(haddr, &val, 16); - clear_helper_retaddr(); + do_st16_he_mmu(env, addr, val, mop, ra); qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W); } void cpu_st16_le_mmu(CPUArchState *env, abi_ptr addr, Int128 val, MemOpIdx oi, uintptr_t ra) { - void *haddr; + MemOp mop = get_memop(oi); - validate_memop(oi, MO_128 | MO_LE); - haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_STORE); + tcg_debug_assert((mop & MO_BSWAP) == MO_LE); if (HOST_BIG_ENDIAN) { val = bswap128(val); } - memcpy(haddr, &val, 16); - clear_helper_retaddr(); + do_st16_he_mmu(env, addr, val, mop, ra); qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W); } @@ -1267,7 +1462,6 @@ uint64_t cpu_ldq_code_mmu(CPUArchState *env, abi_ptr addr, void *haddr; uint64_t ret; - validate_memop(oi, MO_BEUQ); haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_LOAD); ret = ldq_p(haddr); clear_helper_retaddr(); diff --git a/docs/devel/loads-stores.rst b/docs/devel/loads-stores.rst index ad5dfe133e..d2cefc77a2 100644 --- a/docs/devel/loads-stores.rst +++ b/docs/devel/loads-stores.rst @@ -297,31 +297,20 @@ swap: ``translator_ld{sign}{size}_swap(env, ptr, swap)`` Regexes for git grep - ``\<translator_ld[us]\?[bwlq]\(_swap\)\?\>`` -``helper_*_{ld,st}*_mmu`` +``helper_{ld,st}*_mmu`` ~~~~~~~~~~~~~~~~~~~~~~~~~ These functions are intended primarily to be called by the code -generated by the TCG backend. They may also be called by target -CPU helper function code. Like the ``cpu_{ld,st}_mmuidx_ra`` functions -they perform accesses by guest virtual address, with a given ``mmuidx``. +generated by the TCG backend. Like the ``cpu_{ld,st}_mmu`` functions +they perform accesses by guest virtual address, with a given ``MemOpIdx``. -These functions specify an ``opindex`` parameter which encodes -(among other things) the mmu index to use for the access. This parameter -should be created by calling ``make_memop_idx()``. 
+They differ from ``cpu_{ld,st}_mmu`` in that they take the endianness
+of the operation only from the MemOpIdx, and loads extend the return
+value to the size of a host general register (``tcg_target_ulong``).
 
-The ``retaddr`` parameter should be the result of GETPC() called directly
-from the top level HELPER(foo) function (or 0 if no guest CPU state
-unwinding is required).
+load: ``helper_ld{sign}{size}_mmu(env, addr, opindex, retaddr)``
 
-**TODO** The names of these functions are a bit odd for historical
-reasons because they were originally expected to be called only from
-within generated code. We should rename them to bring them more in
-line with the other memory access functions. The explicit endianness
-is the only feature they have beyond ``*_mmuidx_ra``.
-
-load: ``helper_{endian}_ld{sign}{size}_mmu(env, addr, opindex, retaddr)``
-
-store: ``helper_{endian}_st{size}_mmu(env, addr, val, opindex, retaddr)``
+store: ``helper_st{size}_mmu(env, addr, val, opindex, retaddr)``
 
 ``sign``
  - (empty) : for 32 or 64 bit sizes
@@ -334,14 +323,9 @@ store: ``helper_{endian}_st{size}_mmu(env, addr, val, opindex, retaddr)``
  - ``l`` : 32 bits
  - ``q`` : 64 bits
 
-``endian``
- - ``le`` : little endian
- - ``be`` : big endian
- - ``ret`` : target endianness
-
 Regexes for git grep
- - ``\<helper_\(le\|be\|ret\)_ld[us]\?[bwlq]_mmu\>``
- - ``\<helper_\(le\|be\|ret\)_st[bwlq]_mmu\>``
+ - ``\<helper_ld[us]\?[bwlq]_mmu\>``
+ - ``\<helper_st[bwlq]_mmu\>``
 
 ``address_space_*``
 ~~~~~~~~~~~~~~~~~~~
diff --git a/docs/devel/tcg-ops.rst b/docs/devel/tcg-ops.rst
index f3f451b77f..6a166c5665 100644
--- a/docs/devel/tcg-ops.rst
+++ b/docs/devel/tcg-ops.rst
@@ -672,19 +672,20 @@ QEMU specific operations
        | This operation is optional. If the TCG backend does not implement the
          goto_ptr opcode, emitting this op is equivalent to emitting exit_tb(0).
 
-   * - qemu_ld_i32/i64 *t0*, *t1*, *flags*, *memidx*
+   * - qemu_ld_i32/i64/i128 *t0*, *t1*, *flags*, *memidx*
 
-       qemu_st_i32/i64 *t0*, *t1*, *flags*, *memidx*
+       qemu_st_i32/i64/i128 *t0*, *t1*, *flags*, *memidx*
 
        qemu_st8_i32 *t0*, *t1*, *flags*, *memidx*
 
     - | Load data at the guest address *t1* into *t0*, or store data in *t0* at guest
-        address *t1*.  The _i32/_i64 size applies to the size of the input/output
+        address *t1*.  The _i32/_i64/_i128 size applies to the size of the input/output
         register *t0* only.  The address *t1* is always sized according to the guest,
         and the width of the memory operation is controlled by *flags*.
       |
       | Both *t0* and *t1* may be split into little-endian ordered pairs of registers
-        if dealing with 64-bit quantities on a 32-bit host.
+        if dealing with 64-bit quantities on a 32-bit host, or 128-bit quantities on
+        a 64-bit host.
       |
       | The *memidx* selects the qemu tlb index to use (e.g. user or kernel access).
         The flags are the MemOp bits, selecting the sign, width, and endianness
@@ -693,6 +694,8 @@ QEMU specific operations
       | For a 32-bit host, qemu_ld/st_i64 is guaranteed to only be used with a
         64-bit memory access specified in *flags*.
       |
+      | For qemu_ld/st_i128, these are only supported for a 64-bit host.
+      |
       | For i386, qemu_st8_i32 is exactly like qemu_st_i32, except the size of the
         memory operation is known to be 8-bit.  This allows the backend to provide
         a different set of register constraints.
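As a concrete illustration of the interfaces documented above, a minimal
sketch follows. It is not taken from this series; ``addr``, ``mem_idx`` and
``mmu_idx`` are placeholder names from an assumed translator or helper
context:

    /* Front end: a 16-byte little-endian guest load whose single-copy
     * atomicity follows the FEAT_LSE2-style MO_ATOM_WITHIN16 rule. */
    TCGv_i128 t16 = tcg_temp_new_i128();
    tcg_gen_qemu_ld_i128(t16, addr, mem_idx,
                         MO_LE | MO_128 | MO_ATOM_WITHIN16);

    /* Target helper: the cpu_*_mmu flavor, with the MemOpIdx built
     * explicitly and the endianness also named in the function. */
    MemOpIdx oi = make_memop_idx(MO_LEUQ | MO_ALIGN, mmu_idx);
    uint64_t val = cpu_ldq_le_mmu(env, addr, oi, GETPC());

TCG-generated code reaches the same access out of line through the new
``helper_ldq_mmu(env, addr, oi, retaddr)`` entry point, with the endianness
taken only from the MemOpIdx.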
diff --git a/include/exec/cpu-all.h b/include/exec/cpu-all.h index ad824fee52..78d258af44 100644 --- a/include/exec/cpu-all.h +++ b/include/exec/cpu-all.h @@ -84,11 +84,8 @@ #if defined(CONFIG_USER_ONLY) #include "exec/user/abitypes.h" +#include "exec/user/guest-base.h" -/* On some host systems the guest address space is reserved on the host. - * This allows the guest address space to be offset to a convenient location. - */ -extern uintptr_t guest_base; extern bool have_guest_base; /* diff --git a/include/exec/memop.h b/include/exec/memop.h index 07f5f88188..a86dc6743a 100644 --- a/include/exec/memop.h +++ b/include/exec/memop.h @@ -72,6 +72,43 @@ typedef enum MemOp { MO_ALIGN_64 = 6 << MO_ASHIFT, MO_ALIGN = MO_AMASK, + /* + * MO_ATOM_* describes the atomicity requirements of the operation: + * MO_ATOM_IFALIGN: the operation must be single-copy atomic if it + * is aligned; if unaligned there is no atomicity. + * MO_ATOM_IFALIGN_PAIR: the entire operation may be considered to + * be a pair of half-sized operations which are packed together + * for convenience, with single-copy atomicity on each half if + * the half is aligned. + * This is the atomicity e.g. of Arm pre-FEAT_LSE2 LDP. + * MO_ATOM_WITHIN16: the operation is single-copy atomic, even if it + * is unaligned, so long as it does not cross a 16-byte boundary; + * if it crosses a 16-byte boundary there is no atomicity. + * This is the atomicity e.g. of Arm FEAT_LSE2 LDR. + * MO_ATOM_WITHIN16_PAIR: the entire operation is single-copy atomic, + * if it happens to be within a 16-byte boundary, otherwise it + * devolves to a pair of half-sized MO_ATOM_WITHIN16 operations. + * Depending on alignment, one or both will be single-copy atomic. + * This is the atomicity e.g. of Arm FEAT_LSE2 LDP. + * MO_ATOM_SUBALIGN: the operation is single-copy atomic by parts + * by the alignment. E.g. if the address is 0 mod 4, then each + * 4-byte subobject is single-copy atomic. + * This is the atomicity e.g. of IBM Power. + * MO_ATOM_NONE: the operation has no atomicity requirements. + * + * Note the default (i.e. 0) value is single-copy atomic to the + * size of the operation, if aligned. This retains the behaviour + * from before this field was introduced. + */ + MO_ATOM_SHIFT = 8, + MO_ATOM_IFALIGN = 0 << MO_ATOM_SHIFT, + MO_ATOM_IFALIGN_PAIR = 1 << MO_ATOM_SHIFT, + MO_ATOM_WITHIN16 = 2 << MO_ATOM_SHIFT, + MO_ATOM_WITHIN16_PAIR = 3 << MO_ATOM_SHIFT, + MO_ATOM_SUBALIGN = 4 << MO_ATOM_SHIFT, + MO_ATOM_NONE = 5 << MO_ATOM_SHIFT, + MO_ATOM_MASK = 7 << MO_ATOM_SHIFT, + /* Combinations of the above, for ease of use. 
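+     * E.g. MO_LE | MO_UQ | MO_ATOM_WITHIN16 combines into an 8-byte
+     * little-endian access that must be single-copy atomic unless it
+     * crosses a 16-byte boundary.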
*/ MO_UB = MO_8, MO_UW = MO_16, diff --git a/include/exec/plugin-gen.h b/include/exec/plugin-gen.h index 5f5506f1cc..3af0168e65 100644 --- a/include/exec/plugin-gen.h +++ b/include/exec/plugin-gen.h @@ -27,7 +27,7 @@ void plugin_gen_insn_start(CPUState *cpu, const struct DisasContextBase *db); void plugin_gen_insn_end(void); void plugin_gen_disable_mem_helpers(void); -void plugin_gen_empty_mem_callback(TCGv addr, uint32_t info); +void plugin_gen_empty_mem_callback(TCGv_i64 addr, uint32_t info); static inline void plugin_insn_append(abi_ptr pc, const void *from, size_t size) { @@ -69,7 +69,7 @@ static inline void plugin_gen_tb_end(CPUState *cpu) static inline void plugin_gen_disable_mem_helpers(void) { } -static inline void plugin_gen_empty_mem_callback(TCGv addr, uint32_t info) +static inline void plugin_gen_empty_mem_callback(TCGv_i64 addr, uint32_t info) { } static inline void plugin_insn_append(abi_ptr pc, const void *from, size_t size) diff --git a/include/exec/user/guest-base.h b/include/exec/user/guest-base.h new file mode 100644 index 0000000000..afe2ab7fbb --- /dev/null +++ b/include/exec/user/guest-base.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/* + * Declaration of guest_base. + * Copyright (c) 2003 Fabrice Bellard + */ + +#ifndef EXEC_USER_GUEST_BASE_H +#define EXEC_USER_GUEST_BASE_H + +extern uintptr_t guest_base; + +#endif diff --git a/include/qemu/cpuid.h b/include/qemu/cpuid.h index 1451e8ef2f..35325f1995 100644 --- a/include/qemu/cpuid.h +++ b/include/qemu/cpuid.h @@ -71,6 +71,24 @@ #define bit_LZCNT (1 << 5) #endif +/* + * Signatures for different CPU implementations as returned from Leaf 0. + */ + +#ifndef signature_INTEL_ecx +/* "Genu" "ineI" "ntel" */ +#define signature_INTEL_ebx 0x756e6547 +#define signature_INTEL_edx 0x49656e69 +#define signature_INTEL_ecx 0x6c65746e +#endif + +#ifndef signature_AMD_ecx +/* "Auth" "enti" "cAMD" */ +#define signature_AMD_ebx 0x68747541 +#define signature_AMD_edx 0x69746e65 +#define signature_AMD_ecx 0x444d4163 +#endif + static inline unsigned xgetbv_low(unsigned c) { unsigned a, d; diff --git a/include/tcg/tcg-ldst.h b/include/tcg/tcg-ldst.h index 684e394b06..6ccfe9131d 100644 --- a/include/tcg/tcg-ldst.h +++ b/include/tcg/tcg-ldst.h @@ -25,59 +25,39 @@ #ifndef TCG_LDST_H #define TCG_LDST_H -#ifdef CONFIG_SOFTMMU - /* Value zero-extended to tcg register size. 
*/ -tcg_target_ulong helper_ret_ldub_mmu(CPUArchState *env, target_ulong addr, - MemOpIdx oi, uintptr_t retaddr); -tcg_target_ulong helper_le_lduw_mmu(CPUArchState *env, target_ulong addr, - MemOpIdx oi, uintptr_t retaddr); -tcg_target_ulong helper_le_ldul_mmu(CPUArchState *env, target_ulong addr, - MemOpIdx oi, uintptr_t retaddr); -uint64_t helper_le_ldq_mmu(CPUArchState *env, target_ulong addr, - MemOpIdx oi, uintptr_t retaddr); -tcg_target_ulong helper_be_lduw_mmu(CPUArchState *env, target_ulong addr, - MemOpIdx oi, uintptr_t retaddr); -tcg_target_ulong helper_be_ldul_mmu(CPUArchState *env, target_ulong addr, - MemOpIdx oi, uintptr_t retaddr); -uint64_t helper_be_ldq_mmu(CPUArchState *env, target_ulong addr, - MemOpIdx oi, uintptr_t retaddr); +tcg_target_ulong helper_ldub_mmu(CPUArchState *env, uint64_t addr, + MemOpIdx oi, uintptr_t retaddr); +tcg_target_ulong helper_lduw_mmu(CPUArchState *env, uint64_t addr, + MemOpIdx oi, uintptr_t retaddr); +tcg_target_ulong helper_ldul_mmu(CPUArchState *env, uint64_t addr, + MemOpIdx oi, uintptr_t retaddr); +uint64_t helper_ldq_mmu(CPUArchState *env, uint64_t addr, + MemOpIdx oi, uintptr_t retaddr); +Int128 helper_ld16_mmu(CPUArchState *env, uint64_t addr, + MemOpIdx oi, uintptr_t retaddr); /* Value sign-extended to tcg register size. */ -tcg_target_ulong helper_ret_ldsb_mmu(CPUArchState *env, target_ulong addr, - MemOpIdx oi, uintptr_t retaddr); -tcg_target_ulong helper_le_ldsw_mmu(CPUArchState *env, target_ulong addr, - MemOpIdx oi, uintptr_t retaddr); -tcg_target_ulong helper_le_ldsl_mmu(CPUArchState *env, target_ulong addr, - MemOpIdx oi, uintptr_t retaddr); -tcg_target_ulong helper_be_ldsw_mmu(CPUArchState *env, target_ulong addr, - MemOpIdx oi, uintptr_t retaddr); -tcg_target_ulong helper_be_ldsl_mmu(CPUArchState *env, target_ulong addr, - MemOpIdx oi, uintptr_t retaddr); +tcg_target_ulong helper_ldsb_mmu(CPUArchState *env, uint64_t addr, + MemOpIdx oi, uintptr_t retaddr); +tcg_target_ulong helper_ldsw_mmu(CPUArchState *env, uint64_t addr, + MemOpIdx oi, uintptr_t retaddr); +tcg_target_ulong helper_ldsl_mmu(CPUArchState *env, uint64_t addr, + MemOpIdx oi, uintptr_t retaddr); /* * Value extended to at least uint32_t, so that some ABIs do not require * zero-extension from uint8_t or uint16_t. 
*/ -void helper_ret_stb_mmu(CPUArchState *env, target_ulong addr, uint32_t val, - MemOpIdx oi, uintptr_t retaddr); -void helper_le_stw_mmu(CPUArchState *env, target_ulong addr, uint32_t val, - MemOpIdx oi, uintptr_t retaddr); -void helper_le_stl_mmu(CPUArchState *env, target_ulong addr, uint32_t val, - MemOpIdx oi, uintptr_t retaddr); -void helper_le_stq_mmu(CPUArchState *env, target_ulong addr, uint64_t val, - MemOpIdx oi, uintptr_t retaddr); -void helper_be_stw_mmu(CPUArchState *env, target_ulong addr, uint32_t val, - MemOpIdx oi, uintptr_t retaddr); -void helper_be_stl_mmu(CPUArchState *env, target_ulong addr, uint32_t val, - MemOpIdx oi, uintptr_t retaddr); -void helper_be_stq_mmu(CPUArchState *env, target_ulong addr, uint64_t val, - MemOpIdx oi, uintptr_t retaddr); - -#else - -G_NORETURN void helper_unaligned_ld(CPUArchState *env, target_ulong addr); -G_NORETURN void helper_unaligned_st(CPUArchState *env, target_ulong addr); +void helper_stb_mmu(CPUArchState *env, uint64_t addr, uint32_t val, + MemOpIdx oi, uintptr_t retaddr); +void helper_stw_mmu(CPUArchState *env, uint64_t addr, uint32_t val, + MemOpIdx oi, uintptr_t retaddr); +void helper_stl_mmu(CPUArchState *env, uint64_t addr, uint32_t val, + MemOpIdx oi, uintptr_t retaddr); +void helper_stq_mmu(CPUArchState *env, uint64_t addr, uint64_t val, + MemOpIdx oi, uintptr_t retaddr); +void helper_st16_mmu(CPUArchState *env, uint64_t addr, Int128 val, + MemOpIdx oi, uintptr_t retaddr); -#endif /* CONFIG_SOFTMMU */ #endif /* TCG_LDST_H */ diff --git a/include/tcg/tcg-op.h b/include/tcg/tcg-op.h index 4401fa493c..35c5700183 100644 --- a/include/tcg/tcg-op.h +++ b/include/tcg/tcg-op.h @@ -723,48 +723,27 @@ static inline void tcg_gen_concat32_i64(TCGv_i64 ret, TCGv_i64 lo, TCGv_i64 hi) #endif #if TARGET_INSN_START_WORDS == 1 -# if TARGET_LONG_BITS <= TCG_TARGET_REG_BITS static inline void tcg_gen_insn_start(target_ulong pc) { - tcg_gen_op1(INDEX_op_insn_start, pc); + TCGOp *op = tcg_emit_op(INDEX_op_insn_start, 64 / TCG_TARGET_REG_BITS); + tcg_set_insn_start_param(op, 0, pc); } -# else -static inline void tcg_gen_insn_start(target_ulong pc) -{ - tcg_gen_op2(INDEX_op_insn_start, (uint32_t)pc, (uint32_t)(pc >> 32)); -} -# endif #elif TARGET_INSN_START_WORDS == 2 -# if TARGET_LONG_BITS <= TCG_TARGET_REG_BITS -static inline void tcg_gen_insn_start(target_ulong pc, target_ulong a1) -{ - tcg_gen_op2(INDEX_op_insn_start, pc, a1); -} -# else static inline void tcg_gen_insn_start(target_ulong pc, target_ulong a1) { - tcg_gen_op4(INDEX_op_insn_start, - (uint32_t)pc, (uint32_t)(pc >> 32), - (uint32_t)a1, (uint32_t)(a1 >> 32)); + TCGOp *op = tcg_emit_op(INDEX_op_insn_start, 2 * 64 / TCG_TARGET_REG_BITS); + tcg_set_insn_start_param(op, 0, pc); + tcg_set_insn_start_param(op, 1, a1); } -# endif #elif TARGET_INSN_START_WORDS == 3 -# if TARGET_LONG_BITS <= TCG_TARGET_REG_BITS -static inline void tcg_gen_insn_start(target_ulong pc, target_ulong a1, - target_ulong a2) -{ - tcg_gen_op3(INDEX_op_insn_start, pc, a1, a2); -} -# else static inline void tcg_gen_insn_start(target_ulong pc, target_ulong a1, target_ulong a2) { - tcg_gen_op6(INDEX_op_insn_start, - (uint32_t)pc, (uint32_t)(pc >> 32), - (uint32_t)a1, (uint32_t)(a1 >> 32), - (uint32_t)a2, (uint32_t)(a2 >> 32)); + TCGOp *op = tcg_emit_op(INDEX_op_insn_start, 3 * 64 / TCG_TARGET_REG_BITS); + tcg_set_insn_start_param(op, 0, pc); + tcg_set_insn_start_param(op, 1, a1); + tcg_set_insn_start_param(op, 2, a2); } -# endif #else # error "Unhandled number of operands to insn_start" #endif @@ -824,73 +803,203 @@ 
static inline void tcg_gen_plugin_cb_end(void) #define tcg_temp_new() tcg_temp_new_i32() #define tcg_global_mem_new tcg_global_mem_new_i32 #define tcg_temp_free tcg_temp_free_i32 +#define tcgv_tl_temp tcgv_i32_temp #define tcg_gen_qemu_ld_tl tcg_gen_qemu_ld_i32 #define tcg_gen_qemu_st_tl tcg_gen_qemu_st_i32 #else #define tcg_temp_new() tcg_temp_new_i64() #define tcg_global_mem_new tcg_global_mem_new_i64 #define tcg_temp_free tcg_temp_free_i64 +#define tcgv_tl_temp tcgv_i64_temp #define tcg_gen_qemu_ld_tl tcg_gen_qemu_ld_i64 #define tcg_gen_qemu_st_tl tcg_gen_qemu_st_i64 #endif -void tcg_gen_qemu_ld_i32(TCGv_i32, TCGv, TCGArg, MemOp); -void tcg_gen_qemu_st_i32(TCGv_i32, TCGv, TCGArg, MemOp); -void tcg_gen_qemu_ld_i64(TCGv_i64, TCGv, TCGArg, MemOp); -void tcg_gen_qemu_st_i64(TCGv_i64, TCGv, TCGArg, MemOp); -void tcg_gen_qemu_ld_i128(TCGv_i128, TCGv, TCGArg, MemOp); -void tcg_gen_qemu_st_i128(TCGv_i128, TCGv, TCGArg, MemOp); - -void tcg_gen_atomic_cmpxchg_i32(TCGv_i32, TCGv, TCGv_i32, TCGv_i32, - TCGArg, MemOp); -void tcg_gen_atomic_cmpxchg_i64(TCGv_i64, TCGv, TCGv_i64, TCGv_i64, - TCGArg, MemOp); -void tcg_gen_atomic_cmpxchg_i128(TCGv_i128, TCGv, TCGv_i128, TCGv_i128, - TCGArg, MemOp); - -void tcg_gen_nonatomic_cmpxchg_i32(TCGv_i32, TCGv, TCGv_i32, TCGv_i32, - TCGArg, MemOp); -void tcg_gen_nonatomic_cmpxchg_i64(TCGv_i64, TCGv, TCGv_i64, TCGv_i64, - TCGArg, MemOp); -void tcg_gen_nonatomic_cmpxchg_i128(TCGv_i128, TCGv, TCGv_i128, TCGv_i128, - TCGArg, MemOp); - -void tcg_gen_atomic_xchg_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, MemOp); -void tcg_gen_atomic_xchg_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, MemOp); - -void tcg_gen_atomic_fetch_add_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, MemOp); -void tcg_gen_atomic_fetch_add_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, MemOp); -void tcg_gen_atomic_fetch_and_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, MemOp); -void tcg_gen_atomic_fetch_and_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, MemOp); -void tcg_gen_atomic_fetch_or_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, MemOp); -void tcg_gen_atomic_fetch_or_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, MemOp); -void tcg_gen_atomic_fetch_xor_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, MemOp); -void tcg_gen_atomic_fetch_xor_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, MemOp); -void tcg_gen_atomic_fetch_smin_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, MemOp); -void tcg_gen_atomic_fetch_smin_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, MemOp); -void tcg_gen_atomic_fetch_umin_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, MemOp); -void tcg_gen_atomic_fetch_umin_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, MemOp); -void tcg_gen_atomic_fetch_smax_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, MemOp); -void tcg_gen_atomic_fetch_smax_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, MemOp); -void tcg_gen_atomic_fetch_umax_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, MemOp); -void tcg_gen_atomic_fetch_umax_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, MemOp); - -void tcg_gen_atomic_add_fetch_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, MemOp); -void tcg_gen_atomic_add_fetch_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, MemOp); -void tcg_gen_atomic_and_fetch_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, MemOp); -void tcg_gen_atomic_and_fetch_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, MemOp); -void tcg_gen_atomic_or_fetch_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, MemOp); -void tcg_gen_atomic_or_fetch_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, MemOp); -void tcg_gen_atomic_xor_fetch_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, MemOp); -void tcg_gen_atomic_xor_fetch_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, MemOp); -void tcg_gen_atomic_smin_fetch_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, 
MemOp); -void tcg_gen_atomic_smin_fetch_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, MemOp); -void tcg_gen_atomic_umin_fetch_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, MemOp); -void tcg_gen_atomic_umin_fetch_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, MemOp); -void tcg_gen_atomic_smax_fetch_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, MemOp); -void tcg_gen_atomic_smax_fetch_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, MemOp); -void tcg_gen_atomic_umax_fetch_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, MemOp); -void tcg_gen_atomic_umax_fetch_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, MemOp); +void tcg_gen_qemu_ld_i32_chk(TCGv_i32, TCGTemp *, TCGArg, MemOp, TCGType); +void tcg_gen_qemu_st_i32_chk(TCGv_i32, TCGTemp *, TCGArg, MemOp, TCGType); +void tcg_gen_qemu_ld_i64_chk(TCGv_i64, TCGTemp *, TCGArg, MemOp, TCGType); +void tcg_gen_qemu_st_i64_chk(TCGv_i64, TCGTemp *, TCGArg, MemOp, TCGType); +void tcg_gen_qemu_ld_i128_chk(TCGv_i128, TCGTemp *, TCGArg, MemOp, TCGType); +void tcg_gen_qemu_st_i128_chk(TCGv_i128, TCGTemp *, TCGArg, MemOp, TCGType); + +static inline void +tcg_gen_qemu_ld_i32(TCGv_i32 v, TCGv a, TCGArg i, MemOp m) +{ + tcg_gen_qemu_ld_i32_chk(v, tcgv_tl_temp(a), i, m, TCG_TYPE_TL); +} + +static inline void +tcg_gen_qemu_st_i32(TCGv_i32 v, TCGv a, TCGArg i, MemOp m) +{ + tcg_gen_qemu_st_i32_chk(v, tcgv_tl_temp(a), i, m, TCG_TYPE_TL); +} + +static inline void +tcg_gen_qemu_ld_i64(TCGv_i64 v, TCGv a, TCGArg i, MemOp m) +{ + tcg_gen_qemu_ld_i64_chk(v, tcgv_tl_temp(a), i, m, TCG_TYPE_TL); +} + +static inline void +tcg_gen_qemu_st_i64(TCGv_i64 v, TCGv a, TCGArg i, MemOp m) +{ + tcg_gen_qemu_st_i64_chk(v, tcgv_tl_temp(a), i, m, TCG_TYPE_TL); +} + +static inline void +tcg_gen_qemu_ld_i128(TCGv_i128 v, TCGv a, TCGArg i, MemOp m) +{ + tcg_gen_qemu_ld_i128_chk(v, tcgv_tl_temp(a), i, m, TCG_TYPE_TL); +} + +static inline void +tcg_gen_qemu_st_i128(TCGv_i128 v, TCGv a, TCGArg i, MemOp m) +{ + tcg_gen_qemu_st_i128_chk(v, tcgv_tl_temp(a), i, m, TCG_TYPE_TL); +} + +void tcg_gen_atomic_cmpxchg_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32, TCGv_i32, + TCGArg, MemOp, TCGType); +void tcg_gen_atomic_cmpxchg_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64, TCGv_i64, + TCGArg, MemOp, TCGType); +void tcg_gen_atomic_cmpxchg_i128_chk(TCGv_i128, TCGTemp *, TCGv_i128, + TCGv_i128, TCGArg, MemOp, TCGType); + +void tcg_gen_nonatomic_cmpxchg_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32, TCGv_i32, + TCGArg, MemOp, TCGType); +void tcg_gen_nonatomic_cmpxchg_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64, TCGv_i64, + TCGArg, MemOp, TCGType); +void tcg_gen_nonatomic_cmpxchg_i128_chk(TCGv_i128, TCGTemp *, TCGv_i128, + TCGv_i128, TCGArg, MemOp, TCGType); + +void tcg_gen_atomic_xchg_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32, + TCGArg, MemOp, TCGType); +void tcg_gen_atomic_xchg_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64, + TCGArg, MemOp, TCGType); + +void tcg_gen_atomic_fetch_add_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32, + TCGArg, MemOp, TCGType); +void tcg_gen_atomic_fetch_add_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64, + TCGArg, MemOp, TCGType); +void tcg_gen_atomic_fetch_and_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32, + TCGArg, MemOp, TCGType); +void tcg_gen_atomic_fetch_and_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64, + TCGArg, MemOp, TCGType); +void tcg_gen_atomic_fetch_or_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32, + TCGArg, MemOp, TCGType); +void tcg_gen_atomic_fetch_or_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64, + TCGArg, MemOp, TCGType); +void tcg_gen_atomic_fetch_xor_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32, + TCGArg, MemOp, TCGType); +void tcg_gen_atomic_fetch_xor_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64, + TCGArg, MemOp, 
TCGType); +void tcg_gen_atomic_fetch_smin_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32, + TCGArg, MemOp, TCGType); +void tcg_gen_atomic_fetch_smin_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64, + TCGArg, MemOp, TCGType); +void tcg_gen_atomic_fetch_umin_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32, + TCGArg, MemOp, TCGType); +void tcg_gen_atomic_fetch_umin_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64, + TCGArg, MemOp, TCGType); +void tcg_gen_atomic_fetch_smax_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32, + TCGArg, MemOp, TCGType); +void tcg_gen_atomic_fetch_smax_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64, + TCGArg, MemOp, TCGType); +void tcg_gen_atomic_fetch_umax_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32, + TCGArg, MemOp, TCGType); +void tcg_gen_atomic_fetch_umax_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64, + TCGArg, MemOp, TCGType); + +void tcg_gen_atomic_add_fetch_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32, + TCGArg, MemOp, TCGType); +void tcg_gen_atomic_add_fetch_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64, + TCGArg, MemOp, TCGType); +void tcg_gen_atomic_and_fetch_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32, + TCGArg, MemOp, TCGType); +void tcg_gen_atomic_and_fetch_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64, + TCGArg, MemOp, TCGType); +void tcg_gen_atomic_or_fetch_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32, + TCGArg, MemOp, TCGType); +void tcg_gen_atomic_or_fetch_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64, + TCGArg, MemOp, TCGType); +void tcg_gen_atomic_xor_fetch_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32, + TCGArg, MemOp, TCGType); +void tcg_gen_atomic_xor_fetch_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64, + TCGArg, MemOp, TCGType); +void tcg_gen_atomic_smin_fetch_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32, + TCGArg, MemOp, TCGType); +void tcg_gen_atomic_smin_fetch_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64, + TCGArg, MemOp, TCGType); +void tcg_gen_atomic_umin_fetch_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32, + TCGArg, MemOp, TCGType); +void tcg_gen_atomic_umin_fetch_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64, + TCGArg, MemOp, TCGType); +void tcg_gen_atomic_smax_fetch_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32, + TCGArg, MemOp, TCGType); +void tcg_gen_atomic_smax_fetch_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64, + TCGArg, MemOp, TCGType); +void tcg_gen_atomic_umax_fetch_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32, + TCGArg, MemOp, TCGType); +void tcg_gen_atomic_umax_fetch_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64, + TCGArg, MemOp, TCGType); + +#define DEF_ATOMIC2(N, S) \ + static inline void N##_##S(TCGv_##S r, TCGv a, TCGv_##S v, \ + TCGArg i, MemOp m) \ + { N##_##S##_chk(r, tcgv_tl_temp(a), v, i, m, TCG_TYPE_TL); } + +#define DEF_ATOMIC3(N, S) \ + static inline void N##_##S(TCGv_##S r, TCGv a, TCGv_##S o, \ + TCGv_##S n, TCGArg i, MemOp m) \ + { N##_##S##_chk(r, tcgv_tl_temp(a), o, n, i, m, TCG_TYPE_TL); } + +DEF_ATOMIC3(tcg_gen_atomic_cmpxchg, i32) +DEF_ATOMIC3(tcg_gen_atomic_cmpxchg, i64) +DEF_ATOMIC3(tcg_gen_atomic_cmpxchg, i128) + +DEF_ATOMIC3(tcg_gen_nonatomic_cmpxchg, i32) +DEF_ATOMIC3(tcg_gen_nonatomic_cmpxchg, i64) +DEF_ATOMIC3(tcg_gen_nonatomic_cmpxchg, i128) + +DEF_ATOMIC2(tcg_gen_atomic_xchg, i32) +DEF_ATOMIC2(tcg_gen_atomic_xchg, i64) + +DEF_ATOMIC2(tcg_gen_atomic_fetch_add, i32) +DEF_ATOMIC2(tcg_gen_atomic_fetch_add, i64) +DEF_ATOMIC2(tcg_gen_atomic_fetch_and, i32) +DEF_ATOMIC2(tcg_gen_atomic_fetch_and, i64) +DEF_ATOMIC2(tcg_gen_atomic_fetch_or, i32) +DEF_ATOMIC2(tcg_gen_atomic_fetch_or, i64) +DEF_ATOMIC2(tcg_gen_atomic_fetch_xor, i32) +DEF_ATOMIC2(tcg_gen_atomic_fetch_xor, i64) +DEF_ATOMIC2(tcg_gen_atomic_fetch_smin, i32) +DEF_ATOMIC2(tcg_gen_atomic_fetch_smin, i64) 
+DEF_ATOMIC2(tcg_gen_atomic_fetch_umin, i32) +DEF_ATOMIC2(tcg_gen_atomic_fetch_umin, i64) +DEF_ATOMIC2(tcg_gen_atomic_fetch_smax, i32) +DEF_ATOMIC2(tcg_gen_atomic_fetch_smax, i64) +DEF_ATOMIC2(tcg_gen_atomic_fetch_umax, i32) +DEF_ATOMIC2(tcg_gen_atomic_fetch_umax, i64) + +DEF_ATOMIC2(tcg_gen_atomic_add_fetch, i32) +DEF_ATOMIC2(tcg_gen_atomic_add_fetch, i64) +DEF_ATOMIC2(tcg_gen_atomic_and_fetch, i32) +DEF_ATOMIC2(tcg_gen_atomic_and_fetch, i64) +DEF_ATOMIC2(tcg_gen_atomic_or_fetch, i32) +DEF_ATOMIC2(tcg_gen_atomic_or_fetch, i64) +DEF_ATOMIC2(tcg_gen_atomic_xor_fetch, i32) +DEF_ATOMIC2(tcg_gen_atomic_xor_fetch, i64) +DEF_ATOMIC2(tcg_gen_atomic_smin_fetch, i32) +DEF_ATOMIC2(tcg_gen_atomic_smin_fetch, i64) +DEF_ATOMIC2(tcg_gen_atomic_umin_fetch, i32) +DEF_ATOMIC2(tcg_gen_atomic_umin_fetch, i64) +DEF_ATOMIC2(tcg_gen_atomic_smax_fetch, i32) +DEF_ATOMIC2(tcg_gen_atomic_smax_fetch, i64) +DEF_ATOMIC2(tcg_gen_atomic_umax_fetch, i32) +DEF_ATOMIC2(tcg_gen_atomic_umax_fetch, i64) + +#undef DEF_ATOMIC2 +#undef DEF_ATOMIC3 void tcg_gen_mov_vec(TCGv_vec, TCGv_vec); void tcg_gen_dup_i32_vec(unsigned vece, TCGv_vec, TCGv_i32); diff --git a/include/tcg/tcg-opc.h b/include/tcg/tcg-opc.h index dd444734d9..21594c1590 100644 --- a/include/tcg/tcg-opc.h +++ b/include/tcg/tcg-opc.h @@ -186,11 +186,10 @@ DEF(muls2_i64, 2, 2, 0, IMPL64 | IMPL(TCG_TARGET_HAS_muls2_i64)) DEF(muluh_i64, 1, 2, 0, IMPL64 | IMPL(TCG_TARGET_HAS_muluh_i64)) DEF(mulsh_i64, 1, 2, 0, IMPL64 | IMPL(TCG_TARGET_HAS_mulsh_i64)) -#define TLADDR_ARGS (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? 1 : 2) #define DATA64_ARGS (TCG_TARGET_REG_BITS == 64 ? 1 : 2) /* QEMU specific */ -DEF(insn_start, 0, 0, TLADDR_ARGS * TARGET_INSN_START_WORDS, +DEF(insn_start, 0, 0, DATA64_ARGS * TARGET_INSN_START_WORDS, TCG_OPF_NOT_PRESENT) DEF(exit_tb, 0, 0, 1, TCG_OPF_BB_EXIT | TCG_OPF_BB_END) DEF(goto_tb, 0, 0, 1, TCG_OPF_BB_EXIT | TCG_OPF_BB_END) @@ -199,20 +198,47 @@ DEF(goto_ptr, 0, 1, 0, TCG_OPF_BB_EXIT | TCG_OPF_BB_END) DEF(plugin_cb_start, 0, 0, 3, TCG_OPF_NOT_PRESENT) DEF(plugin_cb_end, 0, 0, 0, TCG_OPF_NOT_PRESENT) -DEF(qemu_ld_i32, 1, TLADDR_ARGS, 1, +/* Replicate ld/st ops for 32 and 64-bit guest addresses. */ +DEF(qemu_ld_a32_i32, 1, 1, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS) -DEF(qemu_st_i32, 0, TLADDR_ARGS + 1, 1, +DEF(qemu_st_a32_i32, 0, 1 + 1, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS) -DEF(qemu_ld_i64, DATA64_ARGS, TLADDR_ARGS, 1, +DEF(qemu_ld_a32_i64, DATA64_ARGS, 1, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS | TCG_OPF_64BIT) -DEF(qemu_st_i64, 0, TLADDR_ARGS + DATA64_ARGS, 1, +DEF(qemu_st_a32_i64, 0, DATA64_ARGS + 1, 1, + TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS | TCG_OPF_64BIT) + +DEF(qemu_ld_a64_i32, 1, DATA64_ARGS, 1, + TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS) +DEF(qemu_st_a64_i32, 0, 1 + DATA64_ARGS, 1, + TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS) +DEF(qemu_ld_a64_i64, DATA64_ARGS, DATA64_ARGS, 1, + TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS | TCG_OPF_64BIT) +DEF(qemu_st_a64_i64, 0, DATA64_ARGS + DATA64_ARGS, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS | TCG_OPF_64BIT) /* Only used by i386 to cope with stupid register constraints. */ -DEF(qemu_st8_i32, 0, TLADDR_ARGS + 1, 1, +DEF(qemu_st8_a32_i32, 0, 1 + 1, 1, + TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS | + IMPL(TCG_TARGET_HAS_qemu_st8_i32)) +DEF(qemu_st8_a64_i32, 0, 1 + DATA64_ARGS, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS | IMPL(TCG_TARGET_HAS_qemu_st8_i32)) +/* Only for 64-bit hosts at the moment. 
*/ +DEF(qemu_ld_a32_i128, 2, 1, 1, + TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS | TCG_OPF_64BIT | + IMPL(TCG_TARGET_HAS_qemu_ldst_i128)) +DEF(qemu_ld_a64_i128, 2, 1, 1, + TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS | TCG_OPF_64BIT | + IMPL(TCG_TARGET_HAS_qemu_ldst_i128)) +DEF(qemu_st_a32_i128, 0, 3, 1, + TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS | TCG_OPF_64BIT | + IMPL(TCG_TARGET_HAS_qemu_ldst_i128)) +DEF(qemu_st_a64_i128, 0, 3, 1, + TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS | TCG_OPF_64BIT | + IMPL(TCG_TARGET_HAS_qemu_ldst_i128)) + /* Host vector support. */ #define IMPLVEC TCG_OPF_VECTOR | IMPL(TCG_TARGET_MAYBE_vec) @@ -283,7 +309,6 @@ DEF(tci_movi, 1, 0, 1, TCG_OPF_NOT_PRESENT) DEF(tci_movl, 1, 0, 1, TCG_OPF_NOT_PRESENT) #endif -#undef TLADDR_ARGS #undef DATA64_ARGS #undef IMPL #undef IMPL64 diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h index b19e167e1d..cd6327b175 100644 --- a/include/tcg/tcg.h +++ b/include/tcg/tcg.h @@ -558,6 +558,13 @@ struct TCGContext { int nb_temps; int nb_indirects; int nb_ops; + TCGType addr_type; /* TCG_TYPE_I32 or TCG_TYPE_I64 */ + +#ifdef CONFIG_SOFTMMU + int page_mask; + uint8_t page_bits; + uint8_t tlb_dyn_max_bits; +#endif TCGRegSet reserved_regs; intptr_t current_frame_offset; @@ -629,7 +636,7 @@ struct TCGContext { TCGTemp *reg_to_temp[TCG_TARGET_NB_REGS]; uint16_t gen_insn_end_off[TCG_MAX_INSNS]; - target_ulong gen_insn_data[TCG_MAX_INSNS][TARGET_INSN_START_WORDS]; + uint64_t gen_insn_data[TCG_MAX_INSNS][TARGET_INSN_START_WORDS]; /* Exit to translator on overflow. */ sigjmp_buf jmp_trans; @@ -771,24 +778,24 @@ static inline void tcg_set_insn_param(TCGOp *op, int arg, TCGArg v) op->args[arg] = v; } -static inline target_ulong tcg_get_insn_start_param(TCGOp *op, int arg) +static inline uint64_t tcg_get_insn_start_param(TCGOp *op, int arg) { -#if TARGET_LONG_BITS <= TCG_TARGET_REG_BITS - return tcg_get_insn_param(op, arg); -#else - return tcg_get_insn_param(op, arg * 2) | - ((uint64_t)tcg_get_insn_param(op, arg * 2 + 1) << 32); -#endif + if (TCG_TARGET_REG_BITS == 64) { + return tcg_get_insn_param(op, arg); + } else { + return deposit64(tcg_get_insn_param(op, arg * 2), 32, 32, + tcg_get_insn_param(op, arg * 2 + 1)); + } } -static inline void tcg_set_insn_start_param(TCGOp *op, int arg, target_ulong v) +static inline void tcg_set_insn_start_param(TCGOp *op, int arg, uint64_t v) { -#if TARGET_LONG_BITS <= TCG_TARGET_REG_BITS - tcg_set_insn_param(op, arg, v); -#else - tcg_set_insn_param(op, arg * 2, v); - tcg_set_insn_param(op, arg * 2 + 1, v >> 32); -#endif + if (TCG_TARGET_REG_BITS == 64) { + tcg_set_insn_param(op, arg, v); + } else { + tcg_set_insn_param(op, arg * 2, v); + tcg_set_insn_param(op, arg * 2 + 1, v >> 32); + } } /* The last op that was emitted. */ @@ -852,7 +859,7 @@ void tcg_register_thread(void); void tcg_prologue_init(TCGContext *s); void tcg_func_start(TCGContext *s); -int tcg_gen_code(TCGContext *s, TranslationBlock *tb, target_ulong pc_start); +int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start); void tb_target_set_jmp_target(const TranslationBlock *, int, uintptr_t, uintptr_t); diff --git a/meson.build b/meson.build index 5e2807ea7c..4dddccb890 100644 --- a/meson.build +++ b/meson.build @@ -2259,23 +2259,21 @@ config_host_data.set('HAVE_BROKEN_SIZE_MAX', not cc.compiles(''' return printf("%zu", SIZE_MAX); }''', args: ['-Werror'])) -atomic_test = ''' +# See if 64-bit atomic operations are supported. +# Note that without __atomic builtins, we can only +# assume atomic loads/stores max at pointer size. 
+config_host_data.set('CONFIG_ATOMIC64', cc.links(''' #include <stdint.h> int main(void) { - @0@ x = 0, y = 0; + uint64_t x = 0, y = 0; y = __atomic_load_n(&x, __ATOMIC_RELAXED); __atomic_store_n(&x, y, __ATOMIC_RELAXED); __atomic_compare_exchange_n(&x, &y, x, 0, __ATOMIC_RELAXED, __ATOMIC_RELAXED); __atomic_exchange_n(&x, y, __ATOMIC_RELAXED); __atomic_fetch_add(&x, y, __ATOMIC_RELAXED); return 0; - }''' - -# See if 64-bit atomic operations are supported. -# Note that without __atomic builtins, we can only -# assume atomic loads/stores max at pointer size. -config_host_data.set('CONFIG_ATOMIC64', cc.links(atomic_test.format('uint64_t'))) + }''')) has_int128 = cc.links(''' __int128_t a; @@ -2293,21 +2291,39 @@ if has_int128 # "do we have 128-bit atomics which are handled inline and specifically not # via libatomic". The reason we can't use libatomic is documented in the # comment starting "GCC is a house divided" in include/qemu/atomic128.h. - has_atomic128 = cc.links(atomic_test.format('unsigned __int128')) + # We only care about these operations on 16-byte aligned pointers, so + # force 16-byte alignment of the pointer, which may be greater than + # __alignof(unsigned __int128) for the host. + atomic_test_128 = ''' + int main(int ac, char **av) { + unsigned __int128 *p = __builtin_assume_aligned(av[ac - 1], 16); + p[1] = __atomic_load_n(&p[0], __ATOMIC_RELAXED); + __atomic_store_n(&p[2], p[3], __ATOMIC_RELAXED); + __atomic_compare_exchange_n(&p[4], &p[5], p[6], 0, __ATOMIC_RELAXED, __ATOMIC_RELAXED); + return 0; + }''' + has_atomic128 = cc.links(atomic_test_128) config_host_data.set('CONFIG_ATOMIC128', has_atomic128) if not has_atomic128 - has_cmpxchg128 = cc.links(''' - int main(void) - { - unsigned __int128 x = 0, y = 0; - __sync_val_compare_and_swap_16(&x, y, x); - return 0; - } - ''') - - config_host_data.set('CONFIG_CMPXCHG128', has_cmpxchg128) + # Even with __builtin_assume_aligned, the above test may have failed + # without optimization enabled. Try again with optimizations locally + # enabled for the function. See + # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107389 + has_atomic128_opt = cc.links('__attribute__((optimize("O1")))' + atomic_test_128) + config_host_data.set('CONFIG_ATOMIC128_OPT', has_atomic128_opt) + + if not has_atomic128_opt + config_host_data.set('CONFIG_CMPXCHG128', cc.links(''' + int main(void) + { + unsigned __int128 x = 0, y = 0; + __sync_val_compare_and_swap_16(&x, y, x); + return 0; + } + ''')) + endif endif endif diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc index 62dd22d73c..bc6b99a1bd 100644 --- a/tcg/aarch64/tcg-target.c.inc +++ b/tcg/aarch64/tcg-target.c.inc @@ -13,6 +13,12 @@ #include "../tcg-ldst.c.inc" #include "../tcg-pool.c.inc" #include "qemu/bitops.h" +#ifdef __linux__ +#include <asm/hwcap.h> +#endif +#ifdef CONFIG_DARWIN +#include <sys/sysctl.h> +#endif /* We're going to re-use TCGType in setting of the SF bit, which controls the size of the operation performed. If we know the values match, it @@ -71,15 +77,13 @@ static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot) return TCG_REG_X0 + slot; } +bool have_lse; +bool have_lse2; + #define TCG_REG_TMP TCG_REG_X30 #define TCG_VEC_TMP TCG_REG_V31 #ifndef CONFIG_SOFTMMU -/* Note that XZR cannot be encoded in the address base register slot, - as that actaully encodes SP. So if we need to zero-extend the guest - address, via the address index register slot, we need to load even - a zero guest base into a register.
*/ -#define USE_GUEST_BASE (guest_base != 0 || TARGET_LONG_BITS == 32) #define TCG_REG_GUEST_BASE TCG_REG_X28 #endif @@ -1584,41 +1588,13 @@ typedef struct { TCGReg base; TCGReg index; TCGType index_ext; + TCGAtomAlign aa; } HostAddress; -#ifdef CONFIG_SOFTMMU -/* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr, - * MemOpIdx oi, uintptr_t ra) - */ -static void * const qemu_ld_helpers[MO_SIZE + 1] = { - [MO_8] = helper_ret_ldub_mmu, -#if HOST_BIG_ENDIAN - [MO_16] = helper_be_lduw_mmu, - [MO_32] = helper_be_ldul_mmu, - [MO_64] = helper_be_ldq_mmu, -#else - [MO_16] = helper_le_lduw_mmu, - [MO_32] = helper_le_ldul_mmu, - [MO_64] = helper_le_ldq_mmu, -#endif -}; - -/* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr, - * uintxx_t val, MemOpIdx oi, - * uintptr_t ra) - */ -static void * const qemu_st_helpers[MO_SIZE + 1] = { - [MO_8] = helper_ret_stb_mmu, -#if HOST_BIG_ENDIAN - [MO_16] = helper_be_stw_mmu, - [MO_32] = helper_be_stl_mmu, - [MO_64] = helper_be_stq_mmu, -#else - [MO_16] = helper_le_stw_mmu, - [MO_32] = helper_le_stl_mmu, - [MO_64] = helper_le_stq_mmu, -#endif -}; +bool tcg_target_has_memory_bswap(MemOp memop) +{ + return false; +} static const TCGLdstHelperParam ldst_helper_param = { .ntmp = 1, .tmp = { TCG_REG_TMP } @@ -1652,40 +1628,6 @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb) tcg_out_goto(s, lb->raddr); return true; } -#else -static void tcg_out_adr(TCGContext *s, TCGReg rd, const void *target) -{ - ptrdiff_t offset = tcg_pcrel_diff(s, target); - tcg_debug_assert(offset == sextract64(offset, 0, 21)); - tcg_out_insn(s, 3406, ADR, rd, offset); -} - -static bool tcg_out_fail_alignment(TCGContext *s, TCGLabelQemuLdst *l) -{ - if (!reloc_pc19(l->label_ptr[0], tcg_splitwx_to_rx(s->code_ptr))) { - return false; - } - - tcg_out_mov(s, TCG_TYPE_TL, TCG_REG_X1, l->addrlo_reg); - tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_X0, TCG_AREG0); - - /* "Tail call" to the helper, with the return address back inline. */ - tcg_out_adr(s, TCG_REG_LR, l->raddr); - tcg_out_goto_long(s, (const void *)(l->is_ld ? helper_unaligned_ld - : helper_unaligned_st)); - return true; -} - -static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l) -{ - return tcg_out_fail_alignment(s, l); -} - -static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l) -{ - return tcg_out_fail_alignment(s, l); -} -#endif /* CONFIG_SOFTMMU */ /* * For softmmu, perform the TLB load and compare. @@ -1697,11 +1639,16 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, TCGReg addr_reg, MemOpIdx oi, bool is_ld) { - TCGType addr_type = TARGET_LONG_BITS == 64 ? TCG_TYPE_I64 : TCG_TYPE_I32; + TCGType addr_type = s->addr_type; TCGLabelQemuLdst *ldst = NULL; MemOp opc = get_memop(oi); - unsigned a_bits = get_alignment_bits(opc); - unsigned a_mask = (1u << a_bits) - 1; + unsigned a_mask; + + h->aa = atom_and_align_for_opc(s, opc, + have_lse2 ? MO_ATOM_WITHIN16 + : MO_ATOM_IFALIGN, + false); + a_mask = (1 << h->aa.align) - 1; #ifdef CONFIG_SOFTMMU unsigned s_bits = opc & MO_SIZE; @@ -1716,7 +1663,7 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, ldst->oi = oi; ldst->addrlo_reg = addr_reg; - mask_type = (TARGET_PAGE_BITS + CPU_TLB_DYN_MAX_BITS > 32 + mask_type = (s->page_bits + s->tlb_dyn_max_bits > 32 ? TCG_TYPE_I64 : TCG_TYPE_I32); /* Load env_tlb(env)->f[mmu_idx].{mask,table} into {x0,x1}. 
*/ @@ -1730,13 +1677,13 @@ /* Extract the TLB index from the address into X0. */ tcg_out_insn(s, 3502S, AND_LSR, mask_type == TCG_TYPE_I64, TCG_REG_X0, TCG_REG_X0, addr_reg, - TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS); + s->page_bits - CPU_TLB_ENTRY_BITS); /* Add the tlb_table pointer, creating the CPUTLBEntry address into X1. */ tcg_out_insn(s, 3502, ADD, 1, TCG_REG_X1, TCG_REG_X1, TCG_REG_X0); /* Load the tlb comparator into X0, and the fast path addend into X1. */ - tcg_out_ld(s, TCG_TYPE_TL, TCG_REG_X0, TCG_REG_X1, + tcg_out_ld(s, addr_type, TCG_REG_X0, TCG_REG_X1, is_ld ? offsetof(CPUTLBEntry, addr_read) : offsetof(CPUTLBEntry, addr_write)); tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_X1, TCG_REG_X1, @@ -1747,31 +1694,28 @@ * bits within the address. For unaligned access, we check that we don't * cross pages using the address of the last byte of the access. */ - if (a_bits >= s_bits) { + if (a_mask >= s_mask) { x3 = addr_reg; } else { - tcg_out_insn(s, 3401, ADDI, TARGET_LONG_BITS == 64, + tcg_out_insn(s, 3401, ADDI, addr_type, TCG_REG_X3, addr_reg, s_mask - a_mask); x3 = TCG_REG_X3; } - compare_mask = (uint64_t)TARGET_PAGE_MASK | a_mask; + compare_mask = (uint64_t)s->page_mask | a_mask; /* Store the page mask part of the address into X3. */ - tcg_out_logicali(s, I3404_ANDI, TARGET_LONG_BITS == 64, - TCG_REG_X3, x3, compare_mask); + tcg_out_logicali(s, I3404_ANDI, addr_type, TCG_REG_X3, x3, compare_mask); /* Perform the address comparison. */ - tcg_out_cmp(s, TARGET_LONG_BITS == 64, TCG_REG_X0, TCG_REG_X3, 0); + tcg_out_cmp(s, addr_type, TCG_REG_X0, TCG_REG_X3, 0); /* If not equal, we jump to the slow path. */ ldst->label_ptr[0] = s->code_ptr; tcg_out_insn(s, 3202, B_C, TCG_COND_NE, 0); - *h = (HostAddress){ - .base = TCG_REG_X1, - .index = addr_reg, - .index_ext = addr_type - }; + h->base = TCG_REG_X1; + h->index = addr_reg; + h->index_ext = addr_type; #else if (a_mask) { ldst = new_ldst_label(s); @@ -1788,18 +1732,14 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, tcg_out_insn(s, 3202, B_C, TCG_COND_NE, 0); } - if (USE_GUEST_BASE) { - *h = (HostAddress){ - .base = TCG_REG_GUEST_BASE, - .index = addr_reg, - .index_ext = addr_type - }; + if (guest_base || addr_type == TCG_TYPE_I32) { + h->base = TCG_REG_GUEST_BASE; + h->index = addr_reg; + h->index_ext = addr_type; } else { - *h = (HostAddress){ - .base = addr_reg, - .index = TCG_REG_XZR, - .index_ext = TCG_TYPE_I64 - }; + h->base = addr_reg; + h->index = TCG_REG_XZR; + h->index_ext = TCG_TYPE_I64; } #endif @@ -2218,12 +2158,16 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, tcg_out_insn(s, 3506, CSEL, ext, a0, REG0(3), REG0(4), args[5]); break; - case INDEX_op_qemu_ld_i32: - case INDEX_op_qemu_ld_i64: + case INDEX_op_qemu_ld_a32_i32: + case INDEX_op_qemu_ld_a64_i32: + case INDEX_op_qemu_ld_a32_i64: + case INDEX_op_qemu_ld_a64_i64: tcg_out_qemu_ld(s, a0, a1, a2, ext); break; - case INDEX_op_qemu_st_i32: - case INDEX_op_qemu_st_i64: + case INDEX_op_qemu_st_a32_i32: + case INDEX_op_qemu_st_a64_i32: + case INDEX_op_qemu_st_a32_i64: + case INDEX_op_qemu_st_a64_i64: tcg_out_qemu_st(s, REG0(0), a1, a2, ext); break; @@ -2860,11 +2804,15 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) case INDEX_op_movcond_i64: return C_O1_I4(r, r, rA, rZ, rZ); - case INDEX_op_qemu_ld_i32: - case INDEX_op_qemu_ld_i64: + case INDEX_op_qemu_ld_a32_i32: + case
INDEX_op_qemu_ld_a64_i32: + case INDEX_op_qemu_ld_a32_i64: + case INDEX_op_qemu_ld_a64_i64: return C_O1_I1(r, l); - case INDEX_op_qemu_st_i32: - case INDEX_op_qemu_st_i64: + case INDEX_op_qemu_st_a32_i32: + case INDEX_op_qemu_st_a64_i32: + case INDEX_op_qemu_st_a32_i64: + case INDEX_op_qemu_st_a64_i64: return C_O0_I2(lZ, l); case INDEX_op_deposit_i32: @@ -2930,8 +2878,39 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) } } +#ifdef CONFIG_DARWIN +static bool sysctl_for_bool(const char *name) +{ + int val = 0; + size_t len = sizeof(val); + + if (sysctlbyname(name, &val, &len, NULL, 0) == 0) { + return val != 0; + } + + /* + * We might in the future ask for properties not present in older kernels, + * but we're only asking about static properties, all of which should be + * 'int'. So we shouldn't see ENOMEM (val too small), or any of the other + * more exotic errors. + */ + assert(errno == ENOENT); + return false; +} +#endif + static void tcg_target_init(TCGContext *s) { +#ifdef __linux__ + unsigned long hwcap = qemu_getauxval(AT_HWCAP); + have_lse = hwcap & HWCAP_ATOMICS; + have_lse2 = hwcap & HWCAP_USCAT; +#endif +#ifdef CONFIG_DARWIN + have_lse = sysctl_for_bool("hw.optional.arm.FEAT_LSE"); + have_lse2 = sysctl_for_bool("hw.optional.arm.FEAT_LSE2"); +#endif + tcg_target_available_regs[TCG_TYPE_I32] = 0xffffffffu; tcg_target_available_regs[TCG_TYPE_I64] = 0xffffffffu; tcg_target_available_regs[TCG_TYPE_V64] = 0xffffffff00000000ull; @@ -3008,10 +2987,14 @@ static void tcg_target_qemu_prologue(TCGContext *s) CPU_TEMP_BUF_NLONGS * sizeof(long)); #if !defined(CONFIG_SOFTMMU) - if (USE_GUEST_BASE) { - tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_GUEST_BASE, guest_base); - tcg_regset_set_reg(s->reserved_regs, TCG_REG_GUEST_BASE); - } + /* + * Note that XZR cannot be encoded in the address base register slot, + * as that actually encodes SP. Depending on the guest, we may need + * to zero-extend the guest address via the address index register slot, + * therefore we need to load even a zero guest base into a register.
+ */ + tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_GUEST_BASE, guest_base); + tcg_regset_set_reg(s->reserved_regs, TCG_REG_GUEST_BASE); #endif tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]); diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h index c0b0f614ba..74ee2ed255 100644 --- a/tcg/aarch64/tcg-target.h +++ b/tcg/aarch64/tcg-target.h @@ -57,6 +57,9 @@ typedef enum { #define TCG_TARGET_CALL_ARG_I128 TCG_CALL_ARG_EVEN #define TCG_TARGET_CALL_RET_I128 TCG_CALL_RET_NORMAL +extern bool have_lse; +extern bool have_lse2; + /* optional instructions */ #define TCG_TARGET_HAS_div_i32 1 #define TCG_TARGET_HAS_rem_i32 1 @@ -126,6 +129,8 @@ typedef enum { #define TCG_TARGET_HAS_muluh_i64 1 #define TCG_TARGET_HAS_mulsh_i64 1 +#define TCG_TARGET_HAS_qemu_ldst_i128 0 + #define TCG_TARGET_HAS_v64 1 #define TCG_TARGET_HAS_v128 1 #define TCG_TARGET_HAS_v256 0 @@ -151,7 +156,6 @@ typedef enum { #define TCG_TARGET_HAS_cmpsel_vec 0 #define TCG_TARGET_DEFAULT_MO (0) -#define TCG_TARGET_HAS_MEMORY_BSWAP 0 #define TCG_TARGET_NEED_LDST_LABELS #define TCG_TARGET_NEED_POOL_LABELS diff --git a/tcg/arm/tcg-target-con-set.h b/tcg/arm/tcg-target-con-set.h index b8849b2478..229ae258ac 100644 --- a/tcg/arm/tcg-target-con-set.h +++ b/tcg/arm/tcg-target-con-set.h @@ -12,19 +12,19 @@ C_O0_I1(r) C_O0_I2(r, r) C_O0_I2(r, rIN) -C_O0_I2(s, s) +C_O0_I2(q, q) C_O0_I2(w, r) -C_O0_I3(s, s, s) -C_O0_I3(S, p, s) +C_O0_I3(q, q, q) +C_O0_I3(Q, p, q) C_O0_I4(r, r, rI, rI) -C_O0_I4(S, p, s, s) -C_O1_I1(r, l) +C_O0_I4(Q, p, q, q) +C_O1_I1(r, q) C_O1_I1(r, r) C_O1_I1(w, r) C_O1_I1(w, w) C_O1_I1(w, wr) C_O1_I2(r, 0, rZ) -C_O1_I2(r, l, l) +C_O1_I2(r, q, q) C_O1_I2(r, r, r) C_O1_I2(r, r, rI) C_O1_I2(r, r, rIK) @@ -39,8 +39,8 @@ C_O1_I2(w, w, wZ) C_O1_I3(w, w, w, w) C_O1_I4(r, r, r, rI, rI) C_O1_I4(r, r, rIN, rIK, 0) -C_O2_I1(e, p, l) -C_O2_I2(e, p, l, l) +C_O2_I1(e, p, q) +C_O2_I2(e, p, q, q) C_O2_I2(r, r, r, r) C_O2_I4(r, r, r, r, rIN, rIK) C_O2_I4(r, r, rI, rI, rIN, rIK) diff --git a/tcg/arm/tcg-target-con-str.h b/tcg/arm/tcg-target-con-str.h index 24b4b59feb..f83f1d3919 100644 --- a/tcg/arm/tcg-target-con-str.h +++ b/tcg/arm/tcg-target-con-str.h @@ -10,9 +10,8 @@ */ REGS('e', ALL_GENERAL_REGS & 0x5555) /* even regs */ REGS('r', ALL_GENERAL_REGS) -REGS('l', ALL_QLOAD_REGS) -REGS('s', ALL_QSTORE_REGS) -REGS('S', ALL_QSTORE_REGS & 0x5555) /* even qstore */ +REGS('q', ALL_QLDST_REGS) +REGS('Q', ALL_QLDST_REGS & 0x5555) /* even qldst */ REGS('w', ALL_VECTOR_REGS) /* diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc index df514e56fc..20cc1cc477 100644 --- a/tcg/arm/tcg-target.c.inc +++ b/tcg/arm/tcg-target.c.inc @@ -353,23 +353,16 @@ static bool patch_reloc(tcg_insn_unit *code_ptr, int type, #define ALL_VECTOR_REGS 0xffff0000u /* - * r0-r2 will be overwritten when reading the tlb entry (softmmu only) - * and r0-r1 doing the byte swapping, so don't use these. - * r3 is removed for softmmu to avoid clashes with helper arguments. + * r0-r3 will be overwritten when reading the tlb entry (softmmu only); + * r14 will be overwritten by the BLNE branching to the slow path. 
*/ #ifdef CONFIG_SOFTMMU -#define ALL_QLOAD_REGS \ +#define ALL_QLDST_REGS \ (ALL_GENERAL_REGS & ~((1 << TCG_REG_R0) | (1 << TCG_REG_R1) | \ (1 << TCG_REG_R2) | (1 << TCG_REG_R3) | \ (1 << TCG_REG_R14))) -#define ALL_QSTORE_REGS \ - (ALL_GENERAL_REGS & ~((1 << TCG_REG_R0) | (1 << TCG_REG_R1) | \ - (1 << TCG_REG_R2) | (1 << TCG_REG_R14) | \ - ((TARGET_LONG_BITS == 64) << TCG_REG_R3))) #else -#define ALL_QLOAD_REGS ALL_GENERAL_REGS -#define ALL_QSTORE_REGS \ - (ALL_GENERAL_REGS & ~((1 << TCG_REG_R0) | (1 << TCG_REG_R1))) +#define ALL_QLDST_REGS (ALL_GENERAL_REGS & ~(1 << TCG_REG_R14)) #endif /* @@ -1330,45 +1323,13 @@ typedef struct { TCGReg base; int index; bool index_scratch; + TCGAtomAlign aa; } HostAddress; -#ifdef CONFIG_SOFTMMU -/* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr, - * int mmu_idx, uintptr_t ra) - */ -static void * const qemu_ld_helpers[MO_SSIZE + 1] = { - [MO_UB] = helper_ret_ldub_mmu, - [MO_SB] = helper_ret_ldsb_mmu, -#if HOST_BIG_ENDIAN - [MO_UW] = helper_be_lduw_mmu, - [MO_UL] = helper_be_ldul_mmu, - [MO_UQ] = helper_be_ldq_mmu, - [MO_SW] = helper_be_ldsw_mmu, - [MO_SL] = helper_be_ldul_mmu, -#else - [MO_UW] = helper_le_lduw_mmu, - [MO_UL] = helper_le_ldul_mmu, - [MO_UQ] = helper_le_ldq_mmu, - [MO_SW] = helper_le_ldsw_mmu, - [MO_SL] = helper_le_ldul_mmu, -#endif -}; - -/* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr, - * uintxx_t val, int mmu_idx, uintptr_t ra) - */ -static void * const qemu_st_helpers[MO_SIZE + 1] = { - [MO_8] = helper_ret_stb_mmu, -#if HOST_BIG_ENDIAN - [MO_16] = helper_be_stw_mmu, - [MO_32] = helper_be_stl_mmu, - [MO_64] = helper_be_stq_mmu, -#else - [MO_16] = helper_le_stw_mmu, - [MO_32] = helper_le_stl_mmu, - [MO_64] = helper_le_stq_mmu, -#endif -}; +bool tcg_target_has_memory_bswap(MemOp memop) +{ + return false; +} static TCGReg ldst_ra_gen(TCGContext *s, const TCGLabelQemuLdst *l, int arg) { @@ -1412,50 +1373,6 @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb) tcg_out_goto(s, COND_AL, qemu_st_helpers[opc & MO_SIZE]); return true; } -#else -static bool tcg_out_fail_alignment(TCGContext *s, TCGLabelQemuLdst *l) -{ - if (!reloc_pc24(l->label_ptr[0], tcg_splitwx_to_rx(s->code_ptr))) { - return false; - } - - if (TARGET_LONG_BITS == 64) { - /* 64-bit target address is aligned into R2:R3. */ - TCGMovExtend ext[2] = { - { .dst = TCG_REG_R2, .dst_type = TCG_TYPE_I32, - .src = l->addrlo_reg, - .src_type = TCG_TYPE_I32, .src_ext = MO_UL }, - { .dst = TCG_REG_R3, .dst_type = TCG_TYPE_I32, - .src = l->addrhi_reg, - .src_type = TCG_TYPE_I32, .src_ext = MO_UL }, - }; - tcg_out_movext2(s, &ext[0], &ext[1], TCG_REG_TMP); - } else { - tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R1, l->addrlo_reg); - } - tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_R0, TCG_AREG0); - - /* - * Tail call to the helper, with the return address back inline, - * just for the clarity of the debugging traceback -- the helper - * cannot return. We have used BLNE to arrive here, so LR is - * already set. - */ - tcg_out_goto(s, COND_AL, (const void *) - (l->is_ld ? 
helper_unaligned_ld : helper_unaligned_st)); - return true; -} - -static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l) -{ - return tcg_out_fail_alignment(s, l); -} - -static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l) -{ - return tcg_out_fail_alignment(s, l); -} -#endif /* SOFTMMU */ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, TCGReg addrlo, TCGReg addrhi, @@ -1463,8 +1380,26 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, { TCGLabelQemuLdst *ldst = NULL; MemOp opc = get_memop(oi); - MemOp a_bits = get_alignment_bits(opc); - unsigned a_mask = (1 << a_bits) - 1; + unsigned a_mask; + +#ifdef CONFIG_SOFTMMU + *h = (HostAddress){ + .cond = COND_AL, + .base = addrlo, + .index = TCG_REG_R1, + .index_scratch = true, + }; +#else + *h = (HostAddress){ + .cond = COND_AL, + .base = addrlo, + .index = guest_base ? TCG_REG_GUEST_BASE : -1, + .index_scratch = false, + }; +#endif + + h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, false); + a_mask = (1 << h->aa.align) - 1; #ifdef CONFIG_SOFTMMU int mem_index = get_mmuidx(oi); @@ -1489,25 +1424,25 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, /* Extract the tlb index from the address into R0. */ tcg_out_dat_reg(s, COND_AL, ARITH_AND, TCG_REG_R0, TCG_REG_R0, addrlo, - SHIFT_IMM_LSR(TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS)); + SHIFT_IMM_LSR(s->page_bits - CPU_TLB_ENTRY_BITS)); /* * Add the tlb_table pointer, creating the CPUTLBEntry address in R1. * Load the tlb comparator into R2/R3 and the fast path addend into R1. */ if (cmp_off == 0) { - if (TARGET_LONG_BITS == 64) { - tcg_out_ldrd_rwb(s, COND_AL, TCG_REG_R2, TCG_REG_R1, TCG_REG_R0); - } else { + if (s->addr_type == TCG_TYPE_I32) { tcg_out_ld32_rwb(s, COND_AL, TCG_REG_R2, TCG_REG_R1, TCG_REG_R0); + } else { + tcg_out_ldrd_rwb(s, COND_AL, TCG_REG_R2, TCG_REG_R1, TCG_REG_R0); } } else { tcg_out_dat_reg(s, COND_AL, ARITH_ADD, TCG_REG_R1, TCG_REG_R1, TCG_REG_R0, 0); - if (TARGET_LONG_BITS == 64) { - tcg_out_ldrd_8(s, COND_AL, TCG_REG_R2, TCG_REG_R1, cmp_off); - } else { + if (s->addr_type == TCG_TYPE_I32) { tcg_out_ld32_12(s, COND_AL, TCG_REG_R2, TCG_REG_R1, cmp_off); + } else { + tcg_out_ldrd_8(s, COND_AL, TCG_REG_R2, TCG_REG_R1, cmp_off); } } @@ -1533,8 +1468,8 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, tcg_out_dat_imm(s, COND_AL, ARITH_ADD, t_addr, addrlo, s_mask - a_mask); } - if (use_armv7_instructions && TARGET_PAGE_BITS <= 16) { - tcg_out_movi32(s, COND_AL, TCG_REG_TMP, ~(TARGET_PAGE_MASK | a_mask)); + if (use_armv7_instructions && s->page_bits <= 16) { + tcg_out_movi32(s, COND_AL, TCG_REG_TMP, ~(s->page_mask | a_mask)); tcg_out_dat_reg(s, COND_AL, ARITH_BIC, TCG_REG_TMP, t_addr, TCG_REG_TMP, 0); tcg_out_dat_reg(s, COND_AL, ARITH_CMP, 0, TCG_REG_R2, TCG_REG_TMP, 0); @@ -1544,22 +1479,15 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, tcg_out_dat_imm(s, COND_AL, ARITH_TST, 0, addrlo, a_mask); } tcg_out_dat_reg(s, COND_AL, ARITH_MOV, TCG_REG_TMP, 0, t_addr, - SHIFT_IMM_LSR(TARGET_PAGE_BITS)); + SHIFT_IMM_LSR(s->page_bits)); tcg_out_dat_reg(s, (a_mask ? 
COND_EQ : COND_AL), ARITH_CMP, 0, TCG_REG_R2, TCG_REG_TMP, - SHIFT_IMM_LSL(TARGET_PAGE_BITS)); + SHIFT_IMM_LSL(s->page_bits)); } - if (TARGET_LONG_BITS == 64) { + if (s->addr_type != TCG_TYPE_I32) { tcg_out_dat_reg(s, COND_EQ, ARITH_CMP, 0, TCG_REG_R3, addrhi, 0); } - - *h = (HostAddress){ - .cond = COND_AL, - .base = addrlo, - .index = TCG_REG_R1, - .index_scratch = true, - }; #else if (a_mask) { ldst = new_ldst_label(s); @@ -1568,18 +1496,11 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, ldst->addrlo_reg = addrlo; ldst->addrhi_reg = addrhi; - /* We are expecting a_bits to max out at 7 */ + /* We are expecting alignment to max out at 7 */ tcg_debug_assert(a_mask <= 0xff); /* tst addr, #mask */ tcg_out_dat_imm(s, COND_AL, ARITH_TST, 0, addrlo, a_mask); } - - *h = (HostAddress){ - .cond = COND_AL, - .base = addrlo, - .index = guest_base ? TCG_REG_GUEST_BASE : -1, - .index_scratch = false, - }; #endif return ldst; @@ -2064,41 +1985,36 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, ARITH_MOV, args[0], 0, 0); break; - case INDEX_op_qemu_ld_i32: - if (TARGET_LONG_BITS == 32) { - tcg_out_qemu_ld(s, args[0], -1, args[1], -1, - args[2], TCG_TYPE_I32); - } else { - tcg_out_qemu_ld(s, args[0], -1, args[1], args[2], - args[3], TCG_TYPE_I32); - } + case INDEX_op_qemu_ld_a32_i32: + tcg_out_qemu_ld(s, args[0], -1, args[1], -1, args[2], TCG_TYPE_I32); break; - case INDEX_op_qemu_ld_i64: - if (TARGET_LONG_BITS == 32) { - tcg_out_qemu_ld(s, args[0], args[1], args[2], -1, - args[3], TCG_TYPE_I64); - } else { - tcg_out_qemu_ld(s, args[0], args[1], args[2], args[3], - args[4], TCG_TYPE_I64); - } + case INDEX_op_qemu_ld_a64_i32: + tcg_out_qemu_ld(s, args[0], -1, args[1], args[2], + args[3], TCG_TYPE_I32); break; - case INDEX_op_qemu_st_i32: - if (TARGET_LONG_BITS == 32) { - tcg_out_qemu_st(s, args[0], -1, args[1], -1, - args[2], TCG_TYPE_I32); - } else { - tcg_out_qemu_st(s, args[0], -1, args[1], args[2], - args[3], TCG_TYPE_I32); - } + case INDEX_op_qemu_ld_a32_i64: + tcg_out_qemu_ld(s, args[0], args[1], args[2], -1, + args[3], TCG_TYPE_I64); break; - case INDEX_op_qemu_st_i64: - if (TARGET_LONG_BITS == 32) { - tcg_out_qemu_st(s, args[0], args[1], args[2], -1, - args[3], TCG_TYPE_I64); - } else { - tcg_out_qemu_st(s, args[0], args[1], args[2], args[3], - args[4], TCG_TYPE_I64); - } + case INDEX_op_qemu_ld_a64_i64: + tcg_out_qemu_ld(s, args[0], args[1], args[2], args[3], + args[4], TCG_TYPE_I64); + break; + + case INDEX_op_qemu_st_a32_i32: + tcg_out_qemu_st(s, args[0], -1, args[1], -1, args[2], TCG_TYPE_I32); + break; + case INDEX_op_qemu_st_a64_i32: + tcg_out_qemu_st(s, args[0], -1, args[1], args[2], + args[3], TCG_TYPE_I32); + break; + case INDEX_op_qemu_st_a32_i64: + tcg_out_qemu_st(s, args[0], args[1], args[2], -1, + args[3], TCG_TYPE_I64); + break; + case INDEX_op_qemu_st_a64_i64: + tcg_out_qemu_st(s, args[0], args[1], args[2], args[3], + args[4], TCG_TYPE_I64); break; case INDEX_op_bswap16_i32: @@ -2239,14 +2155,22 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) case INDEX_op_setcond2_i32: return C_O1_I4(r, r, r, rI, rI); - case INDEX_op_qemu_ld_i32: - return TARGET_LONG_BITS == 32 ? C_O1_I1(r, l) : C_O1_I2(r, l, l); - case INDEX_op_qemu_ld_i64: - return TARGET_LONG_BITS == 32 ? C_O2_I1(e, p, l) : C_O2_I2(e, p, l, l); - case INDEX_op_qemu_st_i32: - return TARGET_LONG_BITS == 32 ? C_O0_I2(s, s) : C_O0_I3(s, s, s); - case INDEX_op_qemu_st_i64: - return TARGET_LONG_BITS == 32 ? 
C_O0_I3(S, p, s) : C_O0_I4(S, p, s, s); + case INDEX_op_qemu_ld_a32_i32: + return C_O1_I1(r, q); + case INDEX_op_qemu_ld_a64_i32: + return C_O1_I2(r, q, q); + case INDEX_op_qemu_ld_a32_i64: + return C_O2_I1(e, p, q); + case INDEX_op_qemu_ld_a64_i64: + return C_O2_I2(e, p, q, q); + case INDEX_op_qemu_st_a32_i32: + return C_O0_I2(q, q); + case INDEX_op_qemu_st_a64_i32: + return C_O0_I3(q, q, q); + case INDEX_op_qemu_st_a32_i64: + return C_O0_I3(Q, p, q); + case INDEX_op_qemu_st_a64_i64: + return C_O0_I4(Q, p, q, q); case INDEX_op_st_vec: return C_O0_I2(w, r); diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h index def2a189e6..65efc538f4 100644 --- a/tcg/arm/tcg-target.h +++ b/tcg/arm/tcg-target.h @@ -125,6 +125,8 @@ extern bool use_neon_instructions; #define TCG_TARGET_HAS_rem_i32 0 #define TCG_TARGET_HAS_qemu_st8_i32 0 +#define TCG_TARGET_HAS_qemu_ldst_i128 0 + #define TCG_TARGET_HAS_v64 use_neon_instructions #define TCG_TARGET_HAS_v128 use_neon_instructions #define TCG_TARGET_HAS_v256 0 @@ -150,7 +152,6 @@ extern bool use_neon_instructions; #define TCG_TARGET_HAS_cmpsel_vec 0 #define TCG_TARGET_DEFAULT_MO (0) -#define TCG_TARGET_HAS_MEMORY_BSWAP 0 #define TCG_TARGET_NEED_LDST_LABELS #define TCG_TARGET_NEED_POOL_LABELS diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc index a01bfad773..8b9a5f00e5 100644 --- a/tcg/i386/tcg-target.c.inc +++ b/tcg/i386/tcg-target.c.inc @@ -185,6 +185,7 @@ bool have_avx512dq; bool have_avx512vbmi2; bool have_avx512vl; bool have_movbe; +bool have_atomic16; #ifdef CONFIG_CPUID_H static bool have_bmi2; @@ -1091,7 +1092,7 @@ static void tcg_out_addi_ptr(TCGContext *s, TCGReg rd, TCGReg rs, { /* This function is only used for passing structs by reference. */ tcg_debug_assert(imm == (int32_t)imm); - tcg_out_modrm_offset(s, OPC_LEA, rd, rs, imm); + tcg_out_modrm_offset(s, OPC_LEA | P_REXW, rd, rs, imm); } static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val) @@ -1314,7 +1315,9 @@ static void tcg_out_exts_i32_i64(TCGContext *s, TCGReg dest, TCGReg src) static void tcg_out_extu_i32_i64(TCGContext *s, TCGReg dest, TCGReg src) { - tcg_out_ext32u(s, dest, src); + if (dest != src) { + tcg_out_ext32u(s, dest, src); + } } static void tcg_out_extrl_i64_i32(TCGContext *s, TCGReg dest, TCGReg src) @@ -1773,34 +1776,13 @@ typedef struct { int index; int ofs; int seg; + TCGAtomAlign aa; } HostAddress; -#if defined(CONFIG_SOFTMMU) -/* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr, - * int mmu_idx, uintptr_t ra) - */ -static void * const qemu_ld_helpers[(MO_SIZE | MO_BSWAP) + 1] = { - [MO_UB] = helper_ret_ldub_mmu, - [MO_LEUW] = helper_le_lduw_mmu, - [MO_LEUL] = helper_le_ldul_mmu, - [MO_LEUQ] = helper_le_ldq_mmu, - [MO_BEUW] = helper_be_lduw_mmu, - [MO_BEUL] = helper_be_ldul_mmu, - [MO_BEUQ] = helper_be_ldq_mmu, -}; - -/* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr, - * uintxx_t val, int mmu_idx, uintptr_t ra) - */ -static void * const qemu_st_helpers[(MO_SIZE | MO_BSWAP) + 1] = { - [MO_UB] = helper_ret_stb_mmu, - [MO_LEUW] = helper_le_stw_mmu, - [MO_LEUL] = helper_le_stl_mmu, - [MO_LEUQ] = helper_le_stq_mmu, - [MO_BEUW] = helper_be_stw_mmu, - [MO_BEUL] = helper_be_stl_mmu, - [MO_BEUQ] = helper_be_stq_mmu, -}; +bool tcg_target_has_memory_bswap(MemOp memop) +{ + return have_movbe; +} /* * Because i686 has no register parameters and because x86_64 has xchg @@ -1837,12 +1819,12 @@ static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l) /* resolve label address */ 
tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4); - if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) { + if (label_ptr[1]) { tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4); } tcg_out_ld_helper_args(s, l, &ldst_helper_param); - tcg_out_branch(s, 1, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]); + tcg_out_branch(s, 1, qemu_ld_helpers[opc & MO_SIZE]); tcg_out_ld_helper_ret(s, l, false, &ldst_helper_param); tcg_out_jmp(s, l->raddr); @@ -1859,61 +1841,18 @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l) /* resolve label address */ tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4); - if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) { + if (label_ptr[1]) { tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4); } tcg_out_st_helper_args(s, l, &ldst_helper_param); - tcg_out_branch(s, 1, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]); + tcg_out_branch(s, 1, qemu_st_helpers[opc & MO_SIZE]); tcg_out_jmp(s, l->raddr); return true; } -#else -static bool tcg_out_fail_alignment(TCGContext *s, TCGLabelQemuLdst *l) -{ - /* resolve label address */ - tcg_patch32(l->label_ptr[0], s->code_ptr - l->label_ptr[0] - 4); - - if (TCG_TARGET_REG_BITS == 32) { - int ofs = 0; - - tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs); - ofs += 4; - - tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs); - ofs += 4; - if (TARGET_LONG_BITS == 64) { - tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs); - ofs += 4; - } - - tcg_out_pushi(s, (uintptr_t)l->raddr); - } else { - tcg_out_mov(s, TCG_TYPE_TL, tcg_target_call_iarg_regs[1], - l->addrlo_reg); - tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0); - - tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_RAX, (uintptr_t)l->raddr); - tcg_out_push(s, TCG_REG_RAX); - } - - /* "Tail call" to the helper, with the return address back inline. */ - tcg_out_jmp(s, (const void *)(l->is_ld ? helper_unaligned_ld - : helper_unaligned_st)); - return true; -} - -static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l) -{ - return tcg_out_fail_alignment(s, l); -} - -static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l) -{ - return tcg_out_fail_alignment(s, l); -} +#ifndef CONFIG_SOFTMMU static HostAddress x86_guest_base = { .index = -1 }; @@ -1945,7 +1884,7 @@ static inline int setup_guest_base_seg(void) return 0; } #endif /* setup_guest_base_seg */ -#endif /* SOFTMMU */ +#endif /* !SOFTMMU */ /* * For softmmu, perform the TLB load and compare. @@ -1959,8 +1898,18 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, { TCGLabelQemuLdst *ldst = NULL; MemOp opc = get_memop(oi); - unsigned a_bits = get_alignment_bits(opc); - unsigned a_mask = (1 << a_bits) - 1; + unsigned a_mask; + +#ifdef CONFIG_SOFTMMU + h->index = TCG_REG_L0; + h->ofs = 0; + h->seg = 0; +#else + *h = x86_guest_base; +#endif + h->base = addrlo; + h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, false); + a_mask = (1 << h->aa.align) - 1; #ifdef CONFIG_SOFTMMU int cmp_ofs = is_ld ? 
offsetof(CPUTLBEntry, addr_read) @@ -1971,7 +1920,7 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, unsigned mem_index = get_mmuidx(oi); unsigned s_bits = opc & MO_SIZE; unsigned s_mask = (1 << s_bits) - 1; - target_ulong tlb_mask; + int tlb_mask; ldst = new_ldst_label(s); ldst->is_ld = is_ld; @@ -1980,13 +1929,11 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, ldst->addrhi_reg = addrhi; if (TCG_TARGET_REG_BITS == 64) { - if (TARGET_LONG_BITS == 64) { - ttype = TCG_TYPE_I64; - trexw = P_REXW; - } + ttype = s->addr_type; + trexw = (ttype == TCG_TYPE_I32 ? 0 : P_REXW); if (TCG_TYPE_PTR == TCG_TYPE_I64) { hrexw = P_REXW; - if (TARGET_PAGE_BITS + CPU_TLB_DYN_MAX_BITS > 32) { + if (s->page_bits + s->tlb_dyn_max_bits > 32) { tlbtype = TCG_TYPE_I64; tlbrexw = P_REXW; } @@ -1995,7 +1942,7 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, tcg_out_mov(s, tlbtype, TCG_REG_L0, addrlo); tcg_out_shifti(s, SHIFT_SHR + tlbrexw, TCG_REG_L0, - TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS); + s->page_bits - CPU_TLB_ENTRY_BITS); tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, TCG_REG_L0, TCG_AREG0, TLB_MASK_TABLE_OFS(mem_index) + @@ -2010,13 +1957,13 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, * copy the address and mask. For lesser alignments, check that we don't * cross pages for the complete access. */ - if (a_bits >= s_bits) { + if (a_mask >= s_mask) { tcg_out_mov(s, ttype, TCG_REG_L1, addrlo); } else { tcg_out_modrm_offset(s, OPC_LEA + trexw, TCG_REG_L1, addrlo, s_mask - a_mask); } - tlb_mask = (target_ulong)TARGET_PAGE_MASK | a_mask; + tlb_mask = s->page_mask | a_mask; tgen_arithi(s, ARITH_AND + trexw, TCG_REG_L1, tlb_mask, 0); /* cmp 0(TCG_REG_L0), TCG_REG_L1 */ @@ -2028,7 +1975,7 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, ldst->label_ptr[0] = s->code_ptr; s->code_ptr += 4; - if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) { + if (TCG_TARGET_REG_BITS == 32 && s->addr_type == TCG_TYPE_I64) { /* cmp 4(TCG_REG_L0), addrhi */ tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, TCG_REG_L0, cmp_ofs + 4); @@ -2041,13 +1988,8 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, /* TLB Hit. 
*/ tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_L0, TCG_REG_L0, offsetof(CPUTLBEntry, addend)); - - *h = (HostAddress) { - .base = addrlo, - .index = TCG_REG_L0, - }; #else - if (a_bits) { + if (a_mask) { ldst = new_ldst_label(s); ldst->is_ld = is_ld; @@ -2061,9 +2003,6 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, ldst->label_ptr[0] = s->code_ptr; s->code_ptr += 4; } - - *h = x86_guest_base; - h->base = addrlo; #endif return ldst; @@ -2536,35 +2475,51 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0); break; - case INDEX_op_qemu_ld_i32: - if (TCG_TARGET_REG_BITS >= TARGET_LONG_BITS) { - tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I32); - } else { + case INDEX_op_qemu_ld_a64_i32: + if (TCG_TARGET_REG_BITS == 32) { tcg_out_qemu_ld(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32); + break; } + /* fall through */ + case INDEX_op_qemu_ld_a32_i32: + tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I32); break; - case INDEX_op_qemu_ld_i64: + case INDEX_op_qemu_ld_a32_i64: if (TCG_TARGET_REG_BITS == 64) { tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64); - } else if (TARGET_LONG_BITS == 32) { + } else { tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64); + } + break; + case INDEX_op_qemu_ld_a64_i64: + if (TCG_TARGET_REG_BITS == 64) { + tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64); } else { tcg_out_qemu_ld(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64); } break; - case INDEX_op_qemu_st_i32: - case INDEX_op_qemu_st8_i32: - if (TCG_TARGET_REG_BITS >= TARGET_LONG_BITS) { - tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I32); - } else { + + case INDEX_op_qemu_st_a64_i32: + case INDEX_op_qemu_st8_a64_i32: + if (TCG_TARGET_REG_BITS == 32) { tcg_out_qemu_st(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32); + break; } + /* fall through */ + case INDEX_op_qemu_st_a32_i32: + case INDEX_op_qemu_st8_a32_i32: + tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I32); break; - case INDEX_op_qemu_st_i64: + case INDEX_op_qemu_st_a32_i64: if (TCG_TARGET_REG_BITS == 64) { tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64); - } else if (TARGET_LONG_BITS == 32) { + } else { tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64); + } + break; + case INDEX_op_qemu_st_a64_i64: + if (TCG_TARGET_REG_BITS == 64) { + tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64); } else { tcg_out_qemu_st(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64); } @@ -3242,26 +3197,29 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) case INDEX_op_clz_i64: return have_lzcnt ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r); - case INDEX_op_qemu_ld_i32: - return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS - ? C_O1_I1(r, L) : C_O1_I2(r, L, L)); - - case INDEX_op_qemu_st_i32: - return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS - ? C_O0_I2(L, L) : C_O0_I3(L, L, L)); - case INDEX_op_qemu_st8_i32: - return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS - ? C_O0_I2(s, L) : C_O0_I3(s, L, L)); - - case INDEX_op_qemu_ld_i64: - return (TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) - : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? C_O2_I1(r, r, L) - : C_O2_I2(r, r, L, L)); - - case INDEX_op_qemu_st_i64: - return (TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) - : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? C_O0_I3(L, L, L) - : C_O0_I4(L, L, L, L)); + case INDEX_op_qemu_ld_a32_i32: + return C_O1_I1(r, L); + case INDEX_op_qemu_ld_a64_i32: + return TCG_TARGET_REG_BITS == 64 ? 
C_O1_I1(r, L) : C_O1_I2(r, L, L); + + case INDEX_op_qemu_st_a32_i32: + return C_O0_I2(L, L); + case INDEX_op_qemu_st_a64_i32: + return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L); + case INDEX_op_qemu_st8_a32_i32: + return C_O0_I2(s, L); + case INDEX_op_qemu_st8_a64_i32: + return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(s, L) : C_O0_I3(s, L, L); + + case INDEX_op_qemu_ld_a32_i64: + return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I1(r, r, L); + case INDEX_op_qemu_ld_a64_i64: + return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I2(r, r, L, L); + + case INDEX_op_qemu_st_a32_i64: + return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L); + case INDEX_op_qemu_st_a64_i64: + return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I4(L, L, L, L); case INDEX_op_brcond2_i32: return C_O0_I4(r, r, ri, ri); @@ -4052,6 +4010,32 @@ static void tcg_target_init(TCGContext *s) have_avx512dq = (b7 & bit_AVX512DQ) != 0; have_avx512vbmi2 = (c7 & bit_AVX512VBMI2) != 0; } + + /* + * The Intel SDM has added: + * Processors that enumerate support for Intel® AVX + * (by setting the feature flag CPUID.01H:ECX.AVX[bit 28]) + * guarantee that the 16-byte memory operations performed + * by the following instructions will always be carried + * out atomically: + * - MOVAPD, MOVAPS, and MOVDQA. + * - VMOVAPD, VMOVAPS, and VMOVDQA when encoded with VEX.128. + * - VMOVAPD, VMOVAPS, VMOVDQA32, and VMOVDQA64 when encoded + * with EVEX.128 and k0 (masking disabled). + * Note that these instructions require the linear addresses + * of their memory operands to be 16-byte aligned. + * + * AMD has provided an even stronger guarantee that processors + * with AVX provide 16-byte atomicity for all cachable, + * naturally aligned single loads and stores, e.g. MOVDQU. + * + * See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104688 + */ + if (have_avx1) { + __cpuid(0, a, b, c, d); + have_atomic16 = (c == signature_INTEL_ecx || + c == signature_AMD_ecx); + } } } } diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h index d4f2a6f8c2..0b5a2c68c5 100644 --- a/tcg/i386/tcg-target.h +++ b/tcg/i386/tcg-target.h @@ -120,6 +120,7 @@ extern bool have_avx512dq; extern bool have_avx512vbmi2; extern bool have_avx512vl; extern bool have_movbe; +extern bool have_atomic16; /* optional instructions */ #define TCG_TARGET_HAS_div2_i32 1 @@ -153,9 +154,9 @@ extern bool have_movbe; #define TCG_TARGET_HAS_mulsh_i32 0 #if TCG_TARGET_REG_BITS == 64 -/* Keep target addresses zero-extended in a register. */ -#define TCG_TARGET_HAS_extrl_i64_i32 (TARGET_LONG_BITS == 32) -#define TCG_TARGET_HAS_extrh_i64_i32 (TARGET_LONG_BITS == 32) +/* Keep 32-bit values zero-extended in a register. */ +#define TCG_TARGET_HAS_extrl_i64_i32 1 +#define TCG_TARGET_HAS_extrh_i64_i32 1 #define TCG_TARGET_HAS_div2_i64 1 #define TCG_TARGET_HAS_rot_i64 1 #define TCG_TARGET_HAS_ext8s_i64 1 @@ -193,6 +194,8 @@ extern bool have_movbe; #define TCG_TARGET_HAS_qemu_st8_i32 1 #endif +#define TCG_TARGET_HAS_qemu_ldst_i128 0 + /* We do not support older SSE systems, only beginning with AVX1. 
*/ #define TCG_TARGET_HAS_v64 have_avx1 #define TCG_TARGET_HAS_v128 have_avx1 @@ -239,9 +242,6 @@ extern bool have_movbe; #include "tcg/tcg-mo.h" #define TCG_TARGET_DEFAULT_MO (TCG_MO_ALL & ~TCG_MO_ST_LD) - -#define TCG_TARGET_HAS_MEMORY_BSWAP have_movbe - #define TCG_TARGET_NEED_LDST_LABELS #define TCG_TARGET_NEED_POOL_LABELS diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc index 83fa45c802..0bae922982 100644 --- a/tcg/loongarch64/tcg-target.c.inc +++ b/tcg/loongarch64/tcg-target.c.inc @@ -30,6 +30,7 @@ */ #include "../tcg-ldst.c.inc" +#include <asm/hwcap.h> #ifdef CONFIG_DEBUG_TCG static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = { @@ -783,30 +784,6 @@ static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val, * Load/store helpers for SoftMMU, and qemu_ld/st implementations */ -#if defined(CONFIG_SOFTMMU) -/* - * helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr, - * MemOpIdx oi, uintptr_t ra) - */ -static void * const qemu_ld_helpers[4] = { - [MO_8] = helper_ret_ldub_mmu, - [MO_16] = helper_le_lduw_mmu, - [MO_32] = helper_le_ldul_mmu, - [MO_64] = helper_le_ldq_mmu, -}; - -/* - * helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr, - * uintxx_t val, MemOpIdx oi, - * uintptr_t ra) - */ -static void * const qemu_st_helpers[4] = { - [MO_8] = helper_ret_stb_mmu, - [MO_16] = helper_le_stw_mmu, - [MO_32] = helper_le_stl_mmu, - [MO_64] = helper_le_stq_mmu, -}; - static bool tcg_out_goto(TCGContext *s, const tcg_insn_unit *target) { tcg_out_opc_b(s, 0); @@ -845,41 +822,18 @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l) tcg_out_call_int(s, qemu_st_helpers[opc & MO_SIZE], false); return tcg_out_goto(s, l->raddr); } -#else -static bool tcg_out_fail_alignment(TCGContext *s, TCGLabelQemuLdst *l) -{ - /* resolve label address */ - if (!reloc_br_sk16(l->label_ptr[0], tcg_splitwx_to_rx(s->code_ptr))) { - return false; - } - - tcg_out_mov(s, TCG_TYPE_TL, TCG_REG_A1, l->addrlo_reg); - tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_A0, TCG_AREG0); - - /* tail call, with the return address back inline. */ - tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_RA, (uintptr_t)l->raddr); - tcg_out_call_int(s, (const void *)(l->is_ld ? helper_unaligned_ld - : helper_unaligned_st), true); - return true; -} - -static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l) -{ - return tcg_out_fail_alignment(s, l); -} - -static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l) -{ - return tcg_out_fail_alignment(s, l); -} - -#endif /* CONFIG_SOFTMMU */ typedef struct { TCGReg base; TCGReg index; + TCGAtomAlign aa; } HostAddress; +bool tcg_target_has_memory_bswap(MemOp memop) +{ + return false; +} + /* * For softmmu, perform the TLB load and compare. * For useronly, perform any required alignment tests. 
@@ -890,9 +844,13 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, TCGReg addr_reg, MemOpIdx oi, bool is_ld) { + TCGType addr_type = s->addr_type; TCGLabelQemuLdst *ldst = NULL; MemOp opc = get_memop(oi); - unsigned a_bits = get_alignment_bits(opc); + MemOp a_bits; + + h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, false); + a_bits = h->aa.align; #ifdef CONFIG_SOFTMMU unsigned s_bits = opc & MO_SIZE; @@ -900,7 +858,6 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, int fast_ofs = TLB_MASK_TABLE_OFS(mem_index); int mask_ofs = fast_ofs + offsetof(CPUTLBDescFast, mask); int table_ofs = fast_ofs + offsetof(CPUTLBDescFast, table); - tcg_target_long compare_mask; ldst = new_ldst_label(s); ldst->is_ld = is_ld; @@ -913,25 +870,31 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_AREG0, table_ofs); tcg_out_opc_srli_d(s, TCG_REG_TMP2, addr_reg, - TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS); + s->page_bits - CPU_TLB_ENTRY_BITS); tcg_out_opc_and(s, TCG_REG_TMP2, TCG_REG_TMP2, TCG_REG_TMP0); tcg_out_opc_add_d(s, TCG_REG_TMP2, TCG_REG_TMP2, TCG_REG_TMP1); /* Load the tlb comparator and the addend. */ - tcg_out_ld(s, TCG_TYPE_TL, TCG_REG_TMP0, TCG_REG_TMP2, + tcg_out_ld(s, addr_type, TCG_REG_TMP0, TCG_REG_TMP2, is_ld ? offsetof(CPUTLBEntry, addr_read) : offsetof(CPUTLBEntry, addr_write)); tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP2, TCG_REG_TMP2, offsetof(CPUTLBEntry, addend)); - /* We don't support unaligned accesses. */ + /* + * For aligned accesses, we check the first byte and include the alignment + * bits within the address. For unaligned access, we check that we don't + * cross pages using the address of the last byte of the access. + */ if (a_bits < s_bits) { - a_bits = s_bits; + unsigned a_mask = (1u << a_bits) - 1; + unsigned s_mask = (1u << s_bits) - 1; + tcg_out_addi(s, addr_type, TCG_REG_TMP1, addr_reg, s_mask - a_mask); + } else { + tcg_out_mov(s, addr_type, TCG_REG_TMP1, addr_reg); } - /* Clear the non-page, non-alignment bits from the address. */ - compare_mask = (tcg_target_long)TARGET_PAGE_MASK | ((1 << a_bits) - 1); - tcg_out_movi(s, TCG_TYPE_TL, TCG_REG_TMP1, compare_mask); - tcg_out_opc_and(s, TCG_REG_TMP1, TCG_REG_TMP1, addr_reg); + tcg_out_opc_bstrins_d(s, TCG_REG_TMP1, TCG_REG_ZERO, + a_bits, s->page_bits - 1); /* Compare masked address with the TLB entry. */ ldst->label_ptr[0] = s->code_ptr; @@ -961,7 +924,7 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, h->index = USE_GUEST_BASE ? 
TCG_GUEST_BASE_REG : TCG_REG_ZERO; #endif - if (TARGET_LONG_BITS == 32) { + if (addr_type == TCG_TYPE_I32) { h->base = TCG_REG_TMP0; tcg_out_ext32u(s, h->base, addr_reg); } else { @@ -1481,16 +1444,20 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, tcg_out_ldst(s, OPC_ST_D, a0, a1, a2); break; - case INDEX_op_qemu_ld_i32: + case INDEX_op_qemu_ld_a32_i32: + case INDEX_op_qemu_ld_a64_i32: tcg_out_qemu_ld(s, a0, a1, a2, TCG_TYPE_I32); break; - case INDEX_op_qemu_ld_i64: + case INDEX_op_qemu_ld_a32_i64: + case INDEX_op_qemu_ld_a64_i64: tcg_out_qemu_ld(s, a0, a1, a2, TCG_TYPE_I64); break; - case INDEX_op_qemu_st_i32: + case INDEX_op_qemu_st_a32_i32: + case INDEX_op_qemu_st_a64_i32: tcg_out_qemu_st(s, a0, a1, a2, TCG_TYPE_I32); break; - case INDEX_op_qemu_st_i64: + case INDEX_op_qemu_st_a32_i64: + case INDEX_op_qemu_st_a64_i64: tcg_out_qemu_st(s, a0, a1, a2, TCG_TYPE_I64); break; @@ -1530,8 +1497,10 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) case INDEX_op_st32_i64: case INDEX_op_st_i32: case INDEX_op_st_i64: - case INDEX_op_qemu_st_i32: - case INDEX_op_qemu_st_i64: + case INDEX_op_qemu_st_a32_i32: + case INDEX_op_qemu_st_a64_i32: + case INDEX_op_qemu_st_a32_i64: + case INDEX_op_qemu_st_a64_i64: return C_O0_I2(rZ, r); case INDEX_op_brcond_i32: @@ -1573,8 +1542,10 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) case INDEX_op_ld32u_i64: case INDEX_op_ld_i32: case INDEX_op_ld_i64: - case INDEX_op_qemu_ld_i32: - case INDEX_op_qemu_ld_i64: + case INDEX_op_qemu_ld_a32_i32: + case INDEX_op_qemu_ld_a64_i32: + case INDEX_op_qemu_ld_a32_i64: + case INDEX_op_qemu_ld_a64_i64: return C_O1_I1(r, r); case INDEX_op_andc_i32: @@ -1727,6 +1698,14 @@ static void tcg_target_qemu_prologue(TCGContext *s) static void tcg_target_init(TCGContext *s) { + unsigned long hwcap = qemu_getauxval(AT_HWCAP); + + /* Server and desktop class cpus have UAL; embedded cpus do not. */ + if (!(hwcap & HWCAP_LOONGARCH_UAL)) { + error_report("TCG: unaligned access support required; exiting"); + exit(EXIT_FAILURE); + } + tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS; tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS; diff --git a/tcg/loongarch64/tcg-target.h b/tcg/loongarch64/tcg-target.h index 17b8193aa5..482901ac15 100644 --- a/tcg/loongarch64/tcg-target.h +++ b/tcg/loongarch64/tcg-target.h @@ -168,11 +168,10 @@ typedef enum { #define TCG_TARGET_HAS_muls2_i64 0 #define TCG_TARGET_HAS_muluh_i64 1 #define TCG_TARGET_HAS_mulsh_i64 1 +#define TCG_TARGET_HAS_qemu_ldst_i128 0 #define TCG_TARGET_DEFAULT_MO (0) #define TCG_TARGET_NEED_LDST_LABELS -#define TCG_TARGET_HAS_MEMORY_BSWAP 0 - #endif /* LOONGARCH_TCG_TARGET_H */ diff --git a/tcg/meson.build b/tcg/meson.build index c4c63b19d4..f56c465f4d 100644 --- a/tcg/meson.build +++ b/tcg/meson.build @@ -6,6 +6,7 @@ tcg_ss.add(files( 'tcg.c', 'tcg-common.c', 'tcg-op.c', + 'tcg-op-ldst.c', 'tcg-op-gvec.c', 'tcg-op-vec.c', )) diff --git a/tcg/mips/tcg-target.c.inc b/tcg/mips/tcg-target.c.inc index 5ad9867882..ef146b193c 100644 --- a/tcg/mips/tcg-target.c.inc +++ b/tcg/mips/tcg-target.c.inc @@ -354,10 +354,6 @@ typedef enum { /* Aliases for convenience. */ ALIAS_PADD = sizeof(void *) == 4 ? OPC_ADDU : OPC_DADDU, ALIAS_PADDI = sizeof(void *) == 4 ? OPC_ADDIU : OPC_DADDIU, - ALIAS_TSRL = TARGET_LONG_BITS == 32 || TCG_TARGET_REG_BITS == 32 - ? OPC_SRL : OPC_DSRL, - ALIAS_TADDI = TARGET_LONG_BITS == 32 || TCG_TARGET_REG_BITS == 32 - ? 
OPC_ADDIU : OPC_DADDIU, } MIPSInsn; /* @@ -1075,38 +1071,6 @@ static void tcg_out_call(TCGContext *s, const tcg_insn_unit *arg, tcg_out_nop(s); } -#if defined(CONFIG_SOFTMMU) -static void * const qemu_ld_helpers[MO_SSIZE + 1] = { - [MO_UB] = helper_ret_ldub_mmu, - [MO_SB] = helper_ret_ldsb_mmu, -#if HOST_BIG_ENDIAN - [MO_UW] = helper_be_lduw_mmu, - [MO_SW] = helper_be_ldsw_mmu, - [MO_UL] = helper_be_ldul_mmu, - [MO_SL] = helper_be_ldsl_mmu, - [MO_UQ] = helper_be_ldq_mmu, -#else - [MO_UW] = helper_le_lduw_mmu, - [MO_SW] = helper_le_ldsw_mmu, - [MO_UL] = helper_le_ldul_mmu, - [MO_UQ] = helper_le_ldq_mmu, - [MO_SL] = helper_le_ldsl_mmu, -#endif -}; - -static void * const qemu_st_helpers[MO_SIZE + 1] = { - [MO_UB] = helper_ret_stb_mmu, -#if HOST_BIG_ENDIAN - [MO_UW] = helper_be_stw_mmu, - [MO_UL] = helper_be_stl_mmu, - [MO_UQ] = helper_be_stq_mmu, -#else - [MO_UW] = helper_le_stw_mmu, - [MO_UL] = helper_le_stl_mmu, - [MO_UQ] = helper_le_stq_mmu, -#endif -}; - /* We have four temps, we might as well expose three of them. */ static const TCGLdstHelperParam ldst_helper_param = { .ntmp = 3, .tmp = { TCG_TMP0, TCG_TMP1, TCG_TMP2 } @@ -1119,8 +1083,7 @@ static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l) /* resolve label address */ if (!reloc_pc16(l->label_ptr[0], tgt_rx) - || (TCG_TARGET_REG_BITS < TARGET_LONG_BITS - && !reloc_pc16(l->label_ptr[1], tgt_rx))) { + || (l->label_ptr[1] && !reloc_pc16(l->label_ptr[1], tgt_rx))) { return false; } @@ -1149,8 +1112,7 @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l) /* resolve label address */ if (!reloc_pc16(l->label_ptr[0], tgt_rx) - || (TCG_TARGET_REG_BITS < TARGET_LONG_BITS - && !reloc_pc16(l->label_ptr[1], tgt_rx))) { + || (l->label_ptr[1] && !reloc_pc16(l->label_ptr[1], tgt_rx))) { return false; } @@ -1170,61 +1132,16 @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l) return true; } -#else -static bool tcg_out_fail_alignment(TCGContext *s, TCGLabelQemuLdst *l) -{ - void *target; - - if (!reloc_pc16(l->label_ptr[0], tcg_splitwx_to_rx(s->code_ptr))) { - return false; - } - - if (TCG_TARGET_REG_BITS < TARGET_LONG_BITS) { - /* A0 is env, A1 is skipped, A2:A3 is the uint64_t address. */ - TCGReg a2 = MIPS_BE ? l->addrhi_reg : l->addrlo_reg; - TCGReg a3 = MIPS_BE ? l->addrlo_reg : l->addrhi_reg; - - if (a3 != TCG_REG_A2) { - tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_A2, a2); - tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_A3, a3); - } else if (a2 != TCG_REG_A3) { - tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_A3, a3); - tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_A2, a2); - } else { - tcg_out_mov(s, TCG_TYPE_I32, TCG_TMP0, TCG_REG_A2); - tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_A2, TCG_REG_A3); - tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_A3, TCG_TMP0); - } - } else { - tcg_out_mov(s, TCG_TYPE_TL, TCG_REG_A1, l->addrlo_reg); - } - tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_A0, TCG_AREG0); - - /* - * Tail call to the helper, with the return address back inline. - * We have arrived here via BNEL, so $31 is already set. - */ - target = (l->is_ld ? 
helper_unaligned_ld : helper_unaligned_st); - tcg_out_call_int(s, target, true); - return true; -} - -static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l) -{ - return tcg_out_fail_alignment(s, l); -} - -static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l) -{ - return tcg_out_fail_alignment(s, l); -} -#endif /* SOFTMMU */ - typedef struct { TCGReg base; - MemOp align; + TCGAtomAlign aa; } HostAddress; +bool tcg_target_has_memory_bswap(MemOp memop) +{ + return false; +} + /* * For softmmu, perform the TLB load and compare. * For useronly, perform any required alignment tests. @@ -1235,13 +1152,18 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, TCGReg addrlo, TCGReg addrhi, MemOpIdx oi, bool is_ld) { + TCGType addr_type = s->addr_type; TCGLabelQemuLdst *ldst = NULL; MemOp opc = get_memop(oi); - unsigned a_bits = get_alignment_bits(opc); + MemOp a_bits; unsigned s_bits = opc & MO_SIZE; - unsigned a_mask = (1 << a_bits) - 1; + unsigned a_mask; TCGReg base; + h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, false); + a_bits = h->aa.align; + a_mask = (1 << a_bits) - 1; + #ifdef CONFIG_SOFTMMU unsigned s_mask = (1 << s_bits) - 1; int mem_index = get_mmuidx(oi); @@ -1265,23 +1187,26 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, tcg_out_ld(s, TCG_TYPE_PTR, TCG_TMP1, TCG_AREG0, table_off); /* Extract the TLB index from the address into TMP3. */ - tcg_out_opc_sa(s, ALIAS_TSRL, TCG_TMP3, addrlo, - TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS); + if (TCG_TARGET_REG_BITS == 32 || addr_type == TCG_TYPE_I32) { + tcg_out_opc_sa(s, OPC_SRL, TCG_TMP3, addrlo, + s->page_bits - CPU_TLB_ENTRY_BITS); + } else { + tcg_out_dsrl(s, TCG_TMP3, addrlo, + s->page_bits - CPU_TLB_ENTRY_BITS); + } tcg_out_opc_reg(s, OPC_AND, TCG_TMP3, TCG_TMP3, TCG_TMP0); /* Add the tlb_table pointer, creating the CPUTLBEntry address in TMP3. */ tcg_out_opc_reg(s, ALIAS_PADD, TCG_TMP3, TCG_TMP3, TCG_TMP1); - /* Load the (low-half) tlb comparator. */ - if (TCG_TARGET_REG_BITS < TARGET_LONG_BITS) { - tcg_out_ldst(s, OPC_LW, TCG_TMP0, TCG_TMP3, cmp_off + LO_OFF); - } else { - tcg_out_ld(s, TCG_TYPE_TL, TCG_TMP0, TCG_TMP3, cmp_off); - } - - if (TCG_TARGET_REG_BITS >= TARGET_LONG_BITS) { + if (TCG_TARGET_REG_BITS == 64 || addr_type == TCG_TYPE_I32) { + /* Load the tlb comparator. */ + tcg_out_ld(s, addr_type, TCG_TMP0, TCG_TMP3, cmp_off); /* Load the tlb addend for the fast path. */ tcg_out_ld(s, TCG_TYPE_PTR, TCG_TMP3, TCG_TMP3, add_off); + } else { + /* Load the low half of the tlb comparator. */ + tcg_out_ldst(s, OPC_LW, TCG_TMP0, TCG_TMP3, cmp_off + LO_OFF); } /* @@ -1289,16 +1214,20 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, * For unaligned accesses, compare against the end of the access to * verify that it does not cross a page boundary. */ - tcg_out_movi(s, TCG_TYPE_TL, TCG_TMP1, TARGET_PAGE_MASK | a_mask); + tcg_out_movi(s, addr_type, TCG_TMP1, s->page_mask | a_mask); if (a_mask < s_mask) { - tcg_out_opc_imm(s, ALIAS_TADDI, TCG_TMP2, addrlo, s_mask - a_mask); + if (TCG_TARGET_REG_BITS == 32 || addr_type == TCG_TYPE_I32) { + tcg_out_opc_imm(s, OPC_ADDIU, TCG_TMP2, addrlo, s_mask - a_mask); + } else { + tcg_out_opc_imm(s, OPC_DADDIU, TCG_TMP2, addrlo, s_mask - a_mask); + } tcg_out_opc_reg(s, OPC_AND, TCG_TMP1, TCG_TMP1, TCG_TMP2); } else { tcg_out_opc_reg(s, OPC_AND, TCG_TMP1, TCG_TMP1, addrlo); } /* Zero extend a 32-bit guest address for a 64-bit host. 
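
The rewritten MIPS prepare_host_addr() above is one instance of the softmmu fast path every backend in this series now emits by hand. A plain-C model of that logic, using a simplified single-comparator TLB entry; ExampleTLBEntry, tlb_fast_path_hit and all parameters are illustrative sketch names, not QEMU API:

#include <stdbool.h>
#include <stdint.h>

typedef struct {
    uint64_t addr_read;   /* comparator: masked page address (loads) */
    uintptr_t addend;     /* guest-to-host offset on hit */
} ExampleTLBEntry;

/* mask is (n_entries - 1) << log2(sizeof(entry)), as in CPUTLBDescFast,
 * so the shifted-and-masked address is a byte offset into the table. */
static bool tlb_fast_path_hit(const void *table, uint64_t mask, uint64_t addr,
                              unsigned page_bits, unsigned entry_bits,
                              uint64_t a_mask, uint64_t s_mask,
                              uintptr_t *host)
{
    /* Extract the TLB index from the address (the SRL + AND + ADD above). */
    const ExampleTLBEntry *e = (const ExampleTLBEntry *)
        ((const char *)table + ((addr >> (page_bits - entry_bits)) & mask));

    /* For unaligned accesses, compare against the last byte of the access
     * so that an access crossing a page boundary misses. */
    uint64_t adj = addr + (a_mask < s_mask ? s_mask - a_mask : 0);
    uint64_t page_mask = ~(((uint64_t)1 << page_bits) - 1);

    if ((adj & (page_mask | a_mask)) != e->addr_read) {
        return false;                      /* branch to the slow path */
    }
    *host = (uintptr_t)addr + e->addend;   /* TLB hit: translate */
    return true;
}
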
*/ - if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS) { + if (TCG_TARGET_REG_BITS == 64 && addr_type == TCG_TYPE_I32) { tcg_out_ext32u(s, TCG_TMP2, addrlo); addrlo = TCG_TMP2; } @@ -1307,7 +1236,7 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, tcg_out_opc_br(s, OPC_BNE, TCG_TMP1, TCG_TMP0); /* Load and test the high half tlb comparator. */ - if (TCG_TARGET_REG_BITS < TARGET_LONG_BITS) { + if (TCG_TARGET_REG_BITS == 32 && addr_type != TCG_TYPE_I32) { /* delay slot */ tcg_out_ldst(s, OPC_LW, TCG_TMP0, TCG_TMP3, cmp_off + HI_OFF); @@ -1344,7 +1273,7 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, } base = addrlo; - if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS) { + if (TCG_TARGET_REG_BITS == 64 && addr_type == TCG_TYPE_I32) { tcg_out_ext32u(s, TCG_REG_A0, base); base = TCG_REG_A0; } @@ -1360,7 +1289,6 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, #endif h->base = base; - h->align = a_bits; return ldst; } @@ -1473,7 +1401,7 @@ static void tcg_out_qemu_ld(TCGContext *s, TCGReg datalo, TCGReg datahi, ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, true); - if (use_mips32r6_instructions || h.align >= (opc & MO_SIZE)) { + if (use_mips32r6_instructions || h.aa.align >= (opc & MO_SIZE)) { tcg_out_qemu_ld_direct(s, datalo, datahi, h.base, opc, data_type); } else { tcg_out_qemu_ld_unalign(s, datalo, datahi, h.base, opc, data_type); @@ -1560,7 +1488,7 @@ static void tcg_out_qemu_st(TCGContext *s, TCGReg datalo, TCGReg datahi, ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, false); - if (use_mips32r6_instructions || h.align >= (opc & MO_SIZE)) { + if (use_mips32r6_instructions || h.aa.align >= (opc & MO_SIZE)) { tcg_out_qemu_st_direct(s, datalo, datahi, h.base, opc); } else { tcg_out_qemu_st_unalign(s, datalo, datahi, h.base, opc); @@ -2030,34 +1958,49 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, tcg_out_setcond2(s, args[5], a0, a1, a2, args[3], args[4]); break; - case INDEX_op_qemu_ld_i32: - if (TCG_TARGET_REG_BITS >= TARGET_LONG_BITS) { - tcg_out_qemu_ld(s, a0, 0, a1, 0, a2, TCG_TYPE_I32); - } else { + case INDEX_op_qemu_ld_a64_i32: + if (TCG_TARGET_REG_BITS == 32) { tcg_out_qemu_ld(s, a0, 0, a1, a2, args[3], TCG_TYPE_I32); + break; } + /* fall through */ + case INDEX_op_qemu_ld_a32_i32: + tcg_out_qemu_ld(s, a0, 0, a1, 0, a2, TCG_TYPE_I32); break; - case INDEX_op_qemu_ld_i64: + case INDEX_op_qemu_ld_a32_i64: if (TCG_TARGET_REG_BITS == 64) { tcg_out_qemu_ld(s, a0, 0, a1, 0, a2, TCG_TYPE_I64); - } else if (TARGET_LONG_BITS == 32) { + } else { tcg_out_qemu_ld(s, a0, a1, a2, 0, args[3], TCG_TYPE_I64); + } + break; + case INDEX_op_qemu_ld_a64_i64: + if (TCG_TARGET_REG_BITS == 64) { + tcg_out_qemu_ld(s, a0, 0, a1, 0, a2, TCG_TYPE_I64); } else { tcg_out_qemu_ld(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64); } break; - case INDEX_op_qemu_st_i32: - if (TCG_TARGET_REG_BITS >= TARGET_LONG_BITS) { - tcg_out_qemu_st(s, a0, 0, a1, 0, a2, TCG_TYPE_I32); - } else { + + case INDEX_op_qemu_st_a64_i32: + if (TCG_TARGET_REG_BITS == 32) { tcg_out_qemu_st(s, a0, 0, a1, a2, args[3], TCG_TYPE_I32); + break; } + /* fall through */ + case INDEX_op_qemu_st_a32_i32: + tcg_out_qemu_st(s, a0, 0, a1, 0, a2, TCG_TYPE_I32); break; - case INDEX_op_qemu_st_i64: + case INDEX_op_qemu_st_a32_i64: if (TCG_TARGET_REG_BITS == 64) { tcg_out_qemu_st(s, a0, 0, a1, 0, a2, TCG_TYPE_I64); - } else if (TARGET_LONG_BITS == 32) { + } else { tcg_out_qemu_st(s, a0, a1, a2, 0, args[3], TCG_TYPE_I64); + } + break; + case INDEX_op_qemu_st_a64_i64: + if 
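
The a32/a64 opcode split above doubles each qemu_ld/st case, but on a 64-bit host both address widths take the same path, hence the fall-through pattern in the dispatch. A self-contained illustration of that shape, with hypothetical op names:

enum { QEMU_LD_A32_I32, QEMU_LD_A64_I32 };

static const char *emit_ld(int op, int host_bits)
{
    switch (op) {
    case QEMU_LD_A64_I32:
        if (host_bits == 32) {
            return "ld with addrlo+addrhi";   /* 64-bit addr in two regs */
        }
        /* fall through: a64 equals a32 on a 64-bit host */
    case QEMU_LD_A32_I32:
        return "ld with single addr reg";
    }
    return 0;
}
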
(TCG_TARGET_REG_BITS == 64) { + tcg_out_qemu_st(s, a0, 0, a1, 0, a2, TCG_TYPE_I64); } else { tcg_out_qemu_st(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64); } @@ -2216,19 +2159,22 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) case INDEX_op_brcond2_i32: return C_O0_I4(rZ, rZ, rZ, rZ); - case INDEX_op_qemu_ld_i32: - return (TCG_TARGET_REG_BITS == 64 || TARGET_LONG_BITS == 32 - ? C_O1_I1(r, r) : C_O1_I2(r, r, r)); - case INDEX_op_qemu_st_i32: - return (TCG_TARGET_REG_BITS == 64 || TARGET_LONG_BITS == 32 - ? C_O0_I2(rZ, r) : C_O0_I3(rZ, r, r)); - case INDEX_op_qemu_ld_i64: - return (TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, r) - : TARGET_LONG_BITS == 32 ? C_O2_I1(r, r, r) - : C_O2_I2(r, r, r, r)); - case INDEX_op_qemu_st_i64: + case INDEX_op_qemu_ld_a32_i32: + return C_O1_I1(r, r); + case INDEX_op_qemu_ld_a64_i32: + return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, r) : C_O1_I2(r, r, r); + case INDEX_op_qemu_st_a32_i32: + return C_O0_I2(rZ, r); + case INDEX_op_qemu_st_a64_i32: + return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(rZ, r) : C_O0_I3(rZ, r, r); + case INDEX_op_qemu_ld_a32_i64: + return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, r) : C_O2_I1(r, r, r); + case INDEX_op_qemu_ld_a64_i64: + return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, r) : C_O2_I2(r, r, r, r); + case INDEX_op_qemu_st_a32_i64: + return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(rZ, r) : C_O0_I3(rZ, rZ, r); + case INDEX_op_qemu_st_a64_i64: return (TCG_TARGET_REG_BITS == 64 ? C_O0_I2(rZ, r) - : TARGET_LONG_BITS == 32 ? C_O0_I3(rZ, rZ, r) : C_O0_I4(rZ, rZ, r, r)); default: diff --git a/tcg/mips/tcg-target.h b/tcg/mips/tcg-target.h index 42bd7fff01..7277a117ef 100644 --- a/tcg/mips/tcg-target.h +++ b/tcg/mips/tcg-target.h @@ -204,9 +204,9 @@ extern bool use_mips32r2_instructions; #define TCG_TARGET_HAS_ext16u_i64 0 /* andi rt, rs, 0xffff */ #endif -#define TCG_TARGET_DEFAULT_MO 0 -#define TCG_TARGET_HAS_MEMORY_BSWAP 0 +#define TCG_TARGET_HAS_qemu_ldst_i128 0 +#define TCG_TARGET_DEFAULT_MO 0 #define TCG_TARGET_NEED_LDST_LABELS #endif diff --git a/tcg/optimize.c b/tcg/optimize.c index 9614fa3638..bf975a3a6c 100644 --- a/tcg/optimize.c +++ b/tcg/optimize.c @@ -2184,13 +2184,22 @@ void tcg_optimize(TCGContext *s) CASE_OP_32_64_VEC(orc): done = fold_orc(&ctx, op); break; - case INDEX_op_qemu_ld_i32: - case INDEX_op_qemu_ld_i64: + case INDEX_op_qemu_ld_a32_i32: + case INDEX_op_qemu_ld_a64_i32: + case INDEX_op_qemu_ld_a32_i64: + case INDEX_op_qemu_ld_a64_i64: + case INDEX_op_qemu_ld_a32_i128: + case INDEX_op_qemu_ld_a64_i128: done = fold_qemu_ld(&ctx, op); break; - case INDEX_op_qemu_st_i32: - case INDEX_op_qemu_st8_i32: - case INDEX_op_qemu_st_i64: + case INDEX_op_qemu_st8_a32_i32: + case INDEX_op_qemu_st8_a64_i32: + case INDEX_op_qemu_st_a32_i32: + case INDEX_op_qemu_st_a64_i32: + case INDEX_op_qemu_st_a32_i64: + case INDEX_op_qemu_st_a64_i64: + case INDEX_op_qemu_st_a32_i128: + case INDEX_op_qemu_st_a64_i128: done = fold_qemu_st(&ctx, op); break; CASE_OP_32_64(rem): diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc index 29bfbfcc61..d4269dffcf 100644 --- a/tcg/ppc/tcg-target.c.inc +++ b/tcg/ppc/tcg-target.c.inc @@ -1962,33 +1962,6 @@ static const uint32_t qemu_stx_opc[(MO_SIZE + MO_BSWAP) + 1] = { [MO_BSWAP | MO_UQ] = STDBRX, }; -#if defined (CONFIG_SOFTMMU) -/* helper signature: helper_ld_mmu(CPUState *env, target_ulong addr, - * int mmu_idx, uintptr_t ra) - */ -static void * const qemu_ld_helpers[(MO_SIZE | MO_BSWAP) + 1] = { - [MO_UB] = helper_ret_ldub_mmu, - [MO_LEUW] = helper_le_lduw_mmu, - [MO_LEUL] = 
helper_le_ldul_mmu, - [MO_LEUQ] = helper_le_ldq_mmu, - [MO_BEUW] = helper_be_lduw_mmu, - [MO_BEUL] = helper_be_ldul_mmu, - [MO_BEUQ] = helper_be_ldq_mmu, -}; - -/* helper signature: helper_st_mmu(CPUState *env, target_ulong addr, - * uintxx_t val, int mmu_idx, uintptr_t ra) - */ -static void * const qemu_st_helpers[(MO_SIZE | MO_BSWAP) + 1] = { - [MO_UB] = helper_ret_stb_mmu, - [MO_LEUW] = helper_le_stw_mmu, - [MO_LEUL] = helper_le_stl_mmu, - [MO_LEUQ] = helper_le_stq_mmu, - [MO_BEUW] = helper_be_stw_mmu, - [MO_BEUL] = helper_be_stl_mmu, - [MO_BEUQ] = helper_be_stq_mmu, -}; - static TCGReg ldst_ra_gen(TCGContext *s, const TCGLabelQemuLdst *l, int arg) { if (arg < 0) { @@ -2017,7 +1990,7 @@ static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb) } tcg_out_ld_helper_args(s, lb, &ldst_helper_param); - tcg_out_call_int(s, LK, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]); + tcg_out_call_int(s, LK, qemu_ld_helpers[opc & MO_SIZE]); tcg_out_ld_helper_ret(s, lb, false, &ldst_helper_param); tcg_out_b(s, 0, lb->raddr); @@ -2033,60 +2006,23 @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb) } tcg_out_st_helper_args(s, lb, &ldst_helper_param); - tcg_out_call_int(s, LK, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]); + tcg_out_call_int(s, LK, qemu_st_helpers[opc & MO_SIZE]); tcg_out_b(s, 0, lb->raddr); return true; } -#else -static bool tcg_out_fail_alignment(TCGContext *s, TCGLabelQemuLdst *l) -{ - if (!reloc_pc14(l->label_ptr[0], tcg_splitwx_to_rx(s->code_ptr))) { - return false; - } - - if (TCG_TARGET_REG_BITS < TARGET_LONG_BITS) { - TCGReg arg = TCG_REG_R4; - - arg |= (TCG_TARGET_CALL_ARG_I64 == TCG_CALL_ARG_EVEN); - if (l->addrlo_reg != arg) { - tcg_out_mov(s, TCG_TYPE_I32, arg, l->addrhi_reg); - tcg_out_mov(s, TCG_TYPE_I32, arg + 1, l->addrlo_reg); - } else if (l->addrhi_reg != arg + 1) { - tcg_out_mov(s, TCG_TYPE_I32, arg + 1, l->addrlo_reg); - tcg_out_mov(s, TCG_TYPE_I32, arg, l->addrhi_reg); - } else { - tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R0, arg); - tcg_out_mov(s, TCG_TYPE_I32, arg, arg + 1); - tcg_out_mov(s, TCG_TYPE_I32, arg + 1, TCG_REG_R0); - } - } else { - tcg_out_mov(s, TCG_TYPE_TL, TCG_REG_R4, l->addrlo_reg); - } - tcg_out_mov(s, TCG_TYPE_TL, TCG_REG_R3, TCG_AREG0); - - /* "Tail call" to the helper, with the return address back inline. */ - tcg_out_call_int(s, 0, (const void *)(l->is_ld ? helper_unaligned_ld - : helper_unaligned_st)); - return true; -} - -static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l) -{ - return tcg_out_fail_alignment(s, l); -} - -static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l) -{ - return tcg_out_fail_alignment(s, l); -} -#endif /* SOFTMMU */ typedef struct { TCGReg base; TCGReg index; + TCGAtomAlign aa; } HostAddress; +bool tcg_target_has_memory_bswap(MemOp memop) +{ + return true; +} + /* * For softmmu, perform the TLB load and compare. * For useronly, perform any required alignment tests. @@ -2099,7 +2035,23 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, { TCGLabelQemuLdst *ldst = NULL; MemOp opc = get_memop(oi); - unsigned a_bits = get_alignment_bits(opc); + MemOp a_bits; + + /* + * Book II, Section 1.4, Single-Copy Atomicity, specifies: + * + * Before 3.0, "An access that is not atomic is performed as a set of + * smaller disjoint atomic accesses. In general, the number and alignment + * of these accesses are implementation-dependent." Thus MO_ATOM_IFALIGN. 
+ * + * As of 3.0, "the non-atomic access is performed as described in + * the corresponding list", which matches MO_ATOM_SUBALIGN. + */ + h->aa = atom_and_align_for_opc(s, opc, + have_isa_3_00 ? MO_ATOM_SUBALIGN + : MO_ATOM_IFALIGN, + false); + a_bits = h->aa.align; #ifdef CONFIG_SOFTMMU int mem_index = get_mmuidx(oi); @@ -2125,10 +2077,10 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, /* Extract the page index, shifted into place for tlb index. */ if (TCG_TARGET_REG_BITS == 32) { tcg_out_shri32(s, TCG_REG_R0, addrlo, - TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS); + s->page_bits - CPU_TLB_ENTRY_BITS); } else { tcg_out_shri64(s, TCG_REG_R0, addrlo, - TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS); + s->page_bits - CPU_TLB_ENTRY_BITS); } tcg_out32(s, AND | SAB(TCG_REG_TMP1, TCG_REG_TMP1, TCG_REG_R0)); @@ -2167,7 +2119,7 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, a_bits = s_bits; } tcg_out_rlw(s, RLWINM, TCG_REG_R0, addrlo, 0, - (32 - a_bits) & 31, 31 - TARGET_PAGE_BITS); + (32 - a_bits) & 31, 31 - s->page_bits); } else { TCGReg t = addrlo; @@ -2188,13 +2140,13 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, /* Mask the address for the requested alignment. */ if (TARGET_LONG_BITS == 32) { tcg_out_rlw(s, RLWINM, TCG_REG_R0, t, 0, - (32 - a_bits) & 31, 31 - TARGET_PAGE_BITS); + (32 - a_bits) & 31, 31 - s->page_bits); } else if (a_bits == 0) { - tcg_out_rld(s, RLDICR, TCG_REG_R0, t, 0, 63 - TARGET_PAGE_BITS); + tcg_out_rld(s, RLDICR, TCG_REG_R0, t, 0, 63 - s->page_bits); } else { tcg_out_rld(s, RLDICL, TCG_REG_R0, t, - 64 - TARGET_PAGE_BITS, TARGET_PAGE_BITS - a_bits); - tcg_out_rld(s, RLDICL, TCG_REG_R0, TCG_REG_R0, TARGET_PAGE_BITS, 0); + 64 - s->page_bits, s->page_bits - a_bits); + tcg_out_rld(s, RLDICL, TCG_REG_R0, TCG_REG_R0, s->page_bits, 0); } } @@ -2880,43 +2832,58 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, tcg_out32(s, MODUD | TAB(args[0], args[1], args[2])); break; - case INDEX_op_qemu_ld_i32: - if (TCG_TARGET_REG_BITS >= TARGET_LONG_BITS) { - tcg_out_qemu_ld(s, args[0], -1, args[1], -1, - args[2], TCG_TYPE_I32); - } else { + case INDEX_op_qemu_ld_a64_i32: + if (TCG_TARGET_REG_BITS == 32) { tcg_out_qemu_ld(s, args[0], -1, args[1], args[2], args[3], TCG_TYPE_I32); + break; } + /* fall through */ + case INDEX_op_qemu_ld_a32_i32: + tcg_out_qemu_ld(s, args[0], -1, args[1], -1, args[2], TCG_TYPE_I32); break; - case INDEX_op_qemu_ld_i64: + case INDEX_op_qemu_ld_a32_i64: if (TCG_TARGET_REG_BITS == 64) { tcg_out_qemu_ld(s, args[0], -1, args[1], -1, args[2], TCG_TYPE_I64); - } else if (TARGET_LONG_BITS == 32) { + } else { tcg_out_qemu_ld(s, args[0], args[1], args[2], -1, args[3], TCG_TYPE_I64); + } + break; + case INDEX_op_qemu_ld_a64_i64: + if (TCG_TARGET_REG_BITS == 64) { + tcg_out_qemu_ld(s, args[0], -1, args[1], -1, + args[2], TCG_TYPE_I64); } else { tcg_out_qemu_ld(s, args[0], args[1], args[2], args[3], args[4], TCG_TYPE_I64); } break; - case INDEX_op_qemu_st_i32: - if (TCG_TARGET_REG_BITS >= TARGET_LONG_BITS) { - tcg_out_qemu_st(s, args[0], -1, args[1], -1, - args[2], TCG_TYPE_I32); - } else { + + case INDEX_op_qemu_st_a64_i32: + if (TCG_TARGET_REG_BITS == 32) { tcg_out_qemu_st(s, args[0], -1, args[1], args[2], args[3], TCG_TYPE_I32); + break; } + /* fall through */ + case INDEX_op_qemu_st_a32_i32: + tcg_out_qemu_st(s, args[0], -1, args[1], -1, args[2], TCG_TYPE_I32); break; - case INDEX_op_qemu_st_i64: + case INDEX_op_qemu_st_a32_i64: if (TCG_TARGET_REG_BITS == 64) { tcg_out_qemu_st(s, args[0], 
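
The Power ISA comment above is what motivates choosing MO_ATOM_SUBALIGN on v3.0+ hosts and MO_ATOM_IFALIGN elsewhere. A hedged sketch of what the two policies promise, with illustrative enum and function names; the real semantics live in QEMU's MemOp flags and the common ldst atomicity code:

#include <stdint.h>

typedef enum {
    ATOM_IFALIGN,   /* atomic only when the whole access is aligned */
    ATOM_SUBALIGN,  /* unaligned: naturally aligned sub-pieces are atomic */
} ExampleAtomPolicy;

/* log2 of the largest unit guaranteed atomic for a (1 << size_l2)-byte
 * access at addr; an illustrative approximation of the policy semantics. */
static unsigned atomic_unit_l2(ExampleAtomPolicy p, uint64_t addr,
                               unsigned size_l2)
{
    if ((addr & (((uint64_t)1 << size_l2) - 1)) == 0) {
        return size_l2;                 /* fully aligned: fully atomic */
    }
    if (p == ATOM_SUBALIGN && addr != 0) {
        return __builtin_ctzll(addr);   /* pieces of the address alignment */
    }
    return 0;                           /* IFALIGN: only bytes guaranteed */
}
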
-1, args[1], -1, args[2], TCG_TYPE_I64); - } else if (TARGET_LONG_BITS == 32) { + } else { tcg_out_qemu_st(s, args[0], args[1], args[2], -1, args[3], TCG_TYPE_I64); + } + break; + case INDEX_op_qemu_st_a64_i64: + if (TCG_TARGET_REG_BITS == 64) { + tcg_out_qemu_st(s, args[0], -1, args[1], -1, + args[2], TCG_TYPE_I64); } else { tcg_out_qemu_st(s, args[0], args[1], args[2], args[3], args[4], TCG_TYPE_I64); @@ -3737,25 +3704,23 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) case INDEX_op_sub2_i32: return C_O2_I4(r, r, rI, rZM, r, r); - case INDEX_op_qemu_ld_i32: - return (TCG_TARGET_REG_BITS == 64 || TARGET_LONG_BITS == 32 - ? C_O1_I1(r, r) - : C_O1_I2(r, r, r)); - - case INDEX_op_qemu_st_i32: - return (TCG_TARGET_REG_BITS == 64 || TARGET_LONG_BITS == 32 - ? C_O0_I2(r, r) - : C_O0_I3(r, r, r)); - - case INDEX_op_qemu_ld_i64: - return (TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, r) - : TARGET_LONG_BITS == 32 ? C_O2_I1(r, r, r) - : C_O2_I2(r, r, r, r)); - - case INDEX_op_qemu_st_i64: - return (TCG_TARGET_REG_BITS == 64 ? C_O0_I2(r, r) - : TARGET_LONG_BITS == 32 ? C_O0_I3(r, r, r) - : C_O0_I4(r, r, r, r)); + case INDEX_op_qemu_ld_a32_i32: + return C_O1_I1(r, r); + case INDEX_op_qemu_ld_a64_i32: + return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, r) : C_O1_I2(r, r, r); + case INDEX_op_qemu_ld_a32_i64: + return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, r) : C_O2_I1(r, r, r); + case INDEX_op_qemu_ld_a64_i64: + return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, r) : C_O2_I2(r, r, r, r); + + case INDEX_op_qemu_st_a32_i32: + return C_O0_I2(r, r); + case INDEX_op_qemu_st_a64_i32: + return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(r, r) : C_O0_I3(r, r, r); + case INDEX_op_qemu_st_a32_i64: + return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(r, r) : C_O0_I3(r, r, r); + case INDEX_op_qemu_st_a64_i64: + return TCG_TARGET_REG_BITS == 64 ? 
C_O0_I2(r, r) : C_O0_I4(r, r, r, r); case INDEX_op_add_vec: case INDEX_op_sub_vec: diff --git a/tcg/ppc/tcg-target.h b/tcg/ppc/tcg-target.h index af81c5a57f..0914380bd7 100644 --- a/tcg/ppc/tcg-target.h +++ b/tcg/ppc/tcg-target.h @@ -149,6 +149,8 @@ extern bool have_vsx; #define TCG_TARGET_HAS_mulsh_i64 1 #endif +#define TCG_TARGET_HAS_qemu_ldst_i128 0 + /* * While technically Altivec could support V64, it has no 64-bit store * instruction and substituting two 32-bit stores makes the generated @@ -179,7 +181,6 @@ extern bool have_vsx; #define TCG_TARGET_HAS_cmpsel_vec 0 #define TCG_TARGET_DEFAULT_MO (0) -#define TCG_TARGET_HAS_MEMORY_BSWAP 1 #define TCG_TARGET_NEED_LDST_LABELS #define TCG_TARGET_NEED_POOL_LABELS diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc index d12b824d8c..ff6334980f 100644 --- a/tcg/riscv/tcg-target.c.inc +++ b/tcg/riscv/tcg-target.c.inc @@ -846,49 +846,6 @@ static void tcg_out_mb(TCGContext *s, TCGArg a0) * Load/store and TLB */ -#if defined(CONFIG_SOFTMMU) -/* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr, - * MemOpIdx oi, uintptr_t ra) - */ -static void * const qemu_ld_helpers[MO_SSIZE + 1] = { - [MO_UB] = helper_ret_ldub_mmu, - [MO_SB] = helper_ret_ldsb_mmu, -#if HOST_BIG_ENDIAN - [MO_UW] = helper_be_lduw_mmu, - [MO_SW] = helper_be_ldsw_mmu, - [MO_UL] = helper_be_ldul_mmu, -#if TCG_TARGET_REG_BITS == 64 - [MO_SL] = helper_be_ldsl_mmu, -#endif - [MO_UQ] = helper_be_ldq_mmu, -#else - [MO_UW] = helper_le_lduw_mmu, - [MO_SW] = helper_le_ldsw_mmu, - [MO_UL] = helper_le_ldul_mmu, -#if TCG_TARGET_REG_BITS == 64 - [MO_SL] = helper_le_ldsl_mmu, -#endif - [MO_UQ] = helper_le_ldq_mmu, -#endif -}; - -/* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr, - * uintxx_t val, MemOpIdx oi, - * uintptr_t ra) - */ -static void * const qemu_st_helpers[MO_SIZE + 1] = { - [MO_8] = helper_ret_stb_mmu, -#if HOST_BIG_ENDIAN - [MO_16] = helper_be_stw_mmu, - [MO_32] = helper_be_stl_mmu, - [MO_64] = helper_be_stq_mmu, -#else - [MO_16] = helper_le_stw_mmu, - [MO_32] = helper_le_stl_mmu, - [MO_64] = helper_le_stq_mmu, -#endif -}; - static void tcg_out_goto(TCGContext *s, const tcg_insn_unit *target) { tcg_out_opc_jump(s, OPC_JAL, TCG_REG_ZERO, 0); @@ -896,6 +853,11 @@ static void tcg_out_goto(TCGContext *s, const tcg_insn_unit *target) tcg_debug_assert(ok); } +bool tcg_target_has_memory_bswap(MemOp memop) +{ + return false; +} + /* We have three temps, we might as well expose them. */ static const TCGLdstHelperParam ldst_helper_param = { .ntmp = 3, .tmp = { TCG_REG_TMP0, TCG_REG_TMP1, TCG_REG_TMP2 } @@ -935,34 +897,6 @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l) tcg_out_goto(s, l->raddr); return true; } -#else -static bool tcg_out_fail_alignment(TCGContext *s, TCGLabelQemuLdst *l) -{ - /* resolve label address */ - if (!reloc_sbimm12(l->label_ptr[0], tcg_splitwx_to_rx(s->code_ptr))) { - return false; - } - - tcg_out_mov(s, TCG_TYPE_TL, TCG_REG_A1, l->addrlo_reg); - tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_A0, TCG_AREG0); - - /* tail call, with the return address back inline. */ - tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_RA, (uintptr_t)l->raddr); - tcg_out_call_int(s, (const void *)(l->is_ld ? 
helper_unaligned_ld - : helper_unaligned_st), true); - return true; -} - -static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l) -{ - return tcg_out_fail_alignment(s, l); -} - -static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l) -{ - return tcg_out_fail_alignment(s, l); -} -#endif /* CONFIG_SOFTMMU */ /* * For softmmu, perform the TLB load and compare. @@ -976,17 +910,21 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, TCGReg *pbase, { TCGLabelQemuLdst *ldst = NULL; MemOp opc = get_memop(oi); - unsigned a_bits = get_alignment_bits(opc); - unsigned a_mask = (1u << a_bits) - 1; + TCGAtomAlign aa; + unsigned a_mask; + + aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, false); + a_mask = (1u << aa.align) - 1; #ifdef CONFIG_SOFTMMU unsigned s_bits = opc & MO_SIZE; + unsigned s_mask = (1u << s_bits) - 1; int mem_index = get_mmuidx(oi); int fast_ofs = TLB_MASK_TABLE_OFS(mem_index); int mask_ofs = fast_ofs + offsetof(CPUTLBDescFast, mask); int table_ofs = fast_ofs + offsetof(CPUTLBDescFast, table); - TCGReg mask_base = TCG_AREG0, table_base = TCG_AREG0; - tcg_target_long compare_mask; + int compare_mask; + TCGReg addr_adj; ldst = new_ldst_label(s); ldst->is_ld = is_ld; @@ -995,14 +933,33 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, TCGReg *pbase, QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) > 0); QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) < -(1 << 11)); - tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP0, mask_base, mask_ofs); - tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, table_base, table_ofs); + tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP0, TCG_AREG0, mask_ofs); + tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_AREG0, table_ofs); tcg_out_opc_imm(s, OPC_SRLI, TCG_REG_TMP2, addr_reg, - TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS); + s->page_bits - CPU_TLB_ENTRY_BITS); tcg_out_opc_reg(s, OPC_AND, TCG_REG_TMP2, TCG_REG_TMP2, TCG_REG_TMP0); tcg_out_opc_reg(s, OPC_ADD, TCG_REG_TMP2, TCG_REG_TMP2, TCG_REG_TMP1); + /* + * For aligned accesses, we check the first byte and include the alignment + * bits within the address. For unaligned access, we check that we don't + * cross pages using the address of the last byte of the access. + */ + addr_adj = addr_reg; + if (a_mask < s_mask) { + addr_adj = TCG_REG_TMP0; + tcg_out_opc_imm(s, TARGET_LONG_BITS == 32 ? OPC_ADDIW : OPC_ADDI, + addr_adj, addr_reg, s_mask - a_mask); + } + compare_mask = s->page_mask | a_mask; + if (compare_mask == sextreg(compare_mask, 0, 12)) { + tcg_out_opc_imm(s, OPC_ANDI, TCG_REG_TMP1, addr_adj, compare_mask); + } else { + tcg_out_movi(s, TCG_TYPE_TL, TCG_REG_TMP1, compare_mask); + tcg_out_opc_reg(s, OPC_AND, TCG_REG_TMP1, TCG_REG_TMP1, addr_adj); + } + /* Load the tlb comparator and the addend. */ tcg_out_ld(s, TCG_TYPE_TL, TCG_REG_TMP0, TCG_REG_TMP2, is_ld ? offsetof(CPUTLBEntry, addr_read) @@ -1010,29 +967,17 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, TCGReg *pbase, tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP2, TCG_REG_TMP2, offsetof(CPUTLBEntry, addend)); - /* We don't support unaligned accesses. */ - if (a_bits < s_bits) { - a_bits = s_bits; - } - /* Clear the non-page, non-alignment bits from the address. 
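
The compare_mask test above decides whether ANDI's signed 12-bit immediate can hold page_mask | a_mask directly; sextreg(x, 0, 12) returns x sign-extended from bit 11. The same test in portable C, with fits_simm12 as an illustrative name:

#include <stdbool.h>
#include <stdint.h>

static bool fits_simm12(int64_t x)
{
    /* Arithmetic right shift assumed, as on all hosts QEMU supports. */
    return x == ((int64_t)((uint64_t)x << 52) >> 52);
}

For example, with 4 KiB pages and a_mask == 3 the mask is ~0xfffull | 3: its low 12 bits sign-extend to 3, not the full value, so the test fails and the mask is materialized with movi plus a register AND.
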
*/ - compare_mask = (tcg_target_long)TARGET_PAGE_MASK | a_mask; - if (compare_mask == sextreg(compare_mask, 0, 12)) { - tcg_out_opc_imm(s, OPC_ANDI, TCG_REG_TMP1, addr_reg, compare_mask); - } else { - tcg_out_movi(s, TCG_TYPE_TL, TCG_REG_TMP1, compare_mask); - tcg_out_opc_reg(s, OPC_AND, TCG_REG_TMP1, TCG_REG_TMP1, addr_reg); - } - /* Compare masked address with the TLB entry. */ ldst->label_ptr[0] = s->code_ptr; tcg_out_opc_branch(s, OPC_BNE, TCG_REG_TMP0, TCG_REG_TMP1, 0); /* TLB Hit - translate address using addend. */ + addr_adj = addr_reg; if (TARGET_LONG_BITS == 32) { - tcg_out_ext32u(s, TCG_REG_TMP0, addr_reg); - addr_reg = TCG_REG_TMP0; + addr_adj = TCG_REG_TMP0; + tcg_out_ext32u(s, addr_adj, addr_reg); } - tcg_out_opc_reg(s, OPC_ADD, TCG_REG_TMP0, TCG_REG_TMP2, addr_reg); + tcg_out_opc_reg(s, OPC_ADD, TCG_REG_TMP0, TCG_REG_TMP2, addr_adj); *pbase = TCG_REG_TMP0; #else if (a_mask) { @@ -1041,8 +986,8 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, TCGReg *pbase, ldst->oi = oi; ldst->addrlo_reg = addr_reg; - /* We are expecting a_bits max 7, so we can always use andi. */ - tcg_debug_assert(a_bits < 12); + /* We are expecting alignment max 7, so we can always use andi. */ + tcg_debug_assert(a_mask == sextreg(a_mask, 0, 12)); tcg_out_opc_imm(s, OPC_ANDI, TCG_REG_TMP1, addr_reg, a_mask); ldst->label_ptr[0] = s->code_ptr; @@ -1437,16 +1382,20 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, tcg_out_setcond(s, args[3], a0, a1, a2); break; - case INDEX_op_qemu_ld_i32: + case INDEX_op_qemu_ld_a32_i32: + case INDEX_op_qemu_ld_a64_i32: tcg_out_qemu_ld(s, a0, a1, a2, TCG_TYPE_I32); break; - case INDEX_op_qemu_ld_i64: + case INDEX_op_qemu_ld_a32_i64: + case INDEX_op_qemu_ld_a64_i64: tcg_out_qemu_ld(s, a0, a1, a2, TCG_TYPE_I64); break; - case INDEX_op_qemu_st_i32: + case INDEX_op_qemu_st_a32_i32: + case INDEX_op_qemu_st_a64_i32: tcg_out_qemu_st(s, a0, a1, a2, TCG_TYPE_I32); break; - case INDEX_op_qemu_st_i64: + case INDEX_op_qemu_st_a32_i64: + case INDEX_op_qemu_st_a64_i64: tcg_out_qemu_st(s, a0, a1, a2, TCG_TYPE_I64); break; @@ -1588,11 +1537,15 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) case INDEX_op_sub2_i64: return C_O2_I4(r, r, rZ, rZ, rM, rM); - case INDEX_op_qemu_ld_i32: - case INDEX_op_qemu_ld_i64: + case INDEX_op_qemu_ld_a32_i32: + case INDEX_op_qemu_ld_a64_i32: + case INDEX_op_qemu_ld_a32_i64: + case INDEX_op_qemu_ld_a64_i64: return C_O1_I1(r, r); - case INDEX_op_qemu_st_i32: - case INDEX_op_qemu_st_i64: + case INDEX_op_qemu_st_a32_i32: + case INDEX_op_qemu_st_a64_i32: + case INDEX_op_qemu_st_a32_i64: + case INDEX_op_qemu_st_a64_i64: return C_O0_I2(rZ, r); default: diff --git a/tcg/riscv/tcg-target.h b/tcg/riscv/tcg-target.h index dddf2486c1..494c986b49 100644 --- a/tcg/riscv/tcg-target.h +++ b/tcg/riscv/tcg-target.h @@ -163,11 +163,11 @@ typedef enum { #define TCG_TARGET_HAS_muluh_i64 1 #define TCG_TARGET_HAS_mulsh_i64 1 +#define TCG_TARGET_HAS_qemu_ldst_i128 0 + #define TCG_TARGET_DEFAULT_MO (0) #define TCG_TARGET_NEED_LDST_LABELS #define TCG_TARGET_NEED_POOL_LABELS -#define TCG_TARGET_HAS_MEMORY_BSWAP 0 - #endif diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc index aacbaf21d5..dfaa34c264 100644 --- a/tcg/s390x/tcg-target.c.inc +++ b/tcg/s390x/tcg-target.c.inc @@ -438,33 +438,6 @@ static const uint8_t tcg_cond_to_ltr_cond[] = { [TCG_COND_GEU] = S390_CC_ALWAYS, }; -#ifdef CONFIG_SOFTMMU -static void * const qemu_ld_helpers[(MO_SSIZE | MO_BSWAP) + 1] = { - [MO_UB] = helper_ret_ldub_mmu, - [MO_SB] = helper_ret_ldsb_mmu, - 
[MO_LEUW] = helper_le_lduw_mmu, - [MO_LESW] = helper_le_ldsw_mmu, - [MO_LEUL] = helper_le_ldul_mmu, - [MO_LESL] = helper_le_ldsl_mmu, - [MO_LEUQ] = helper_le_ldq_mmu, - [MO_BEUW] = helper_be_lduw_mmu, - [MO_BESW] = helper_be_ldsw_mmu, - [MO_BEUL] = helper_be_ldul_mmu, - [MO_BESL] = helper_be_ldsl_mmu, - [MO_BEUQ] = helper_be_ldq_mmu, -}; - -static void * const qemu_st_helpers[(MO_SIZE | MO_BSWAP) + 1] = { - [MO_UB] = helper_ret_stb_mmu, - [MO_LEUW] = helper_le_stw_mmu, - [MO_LEUL] = helper_le_stl_mmu, - [MO_LEUQ] = helper_le_stq_mmu, - [MO_BEUW] = helper_be_stw_mmu, - [MO_BEUL] = helper_be_stl_mmu, - [MO_BEUQ] = helper_be_stq_mmu, -}; -#endif - static const tcg_insn_unit *tb_ret_addr; uint64_t s390_facilities[3]; @@ -1599,8 +1572,14 @@ typedef struct { TCGReg base; TCGReg index; int disp; + TCGAtomAlign aa; } HostAddress; +bool tcg_target_has_memory_bswap(MemOp memop) +{ + return true; +} + static void tcg_out_qemu_ld_direct(TCGContext *s, MemOp opc, TCGReg data, HostAddress h) { @@ -1706,7 +1685,6 @@ static void tcg_out_qemu_st_direct(TCGContext *s, MemOp opc, TCGReg data, } } -#if defined(CONFIG_SOFTMMU) static const TCGLdstHelperParam ldst_helper_param = { .ntmp = 1, .tmp = { TCG_TMP0 } }; @@ -1721,7 +1699,7 @@ static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb) } tcg_out_ld_helper_args(s, lb, &ldst_helper_param); - tcg_out_call_int(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]); + tcg_out_call_int(s, qemu_ld_helpers[opc & MO_SIZE]); tcg_out_ld_helper_ret(s, lb, false, &ldst_helper_param); tgen_gotoi(s, S390_CC_ALWAYS, lb->raddr); @@ -1738,39 +1716,11 @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb) } tcg_out_st_helper_args(s, lb, &ldst_helper_param); - tcg_out_call_int(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]); + tcg_out_call_int(s, qemu_st_helpers[opc & MO_SIZE]); tgen_gotoi(s, S390_CC_ALWAYS, lb->raddr); return true; } -#else -static bool tcg_out_fail_alignment(TCGContext *s, TCGLabelQemuLdst *l) -{ - if (!patch_reloc(l->label_ptr[0], R_390_PC16DBL, - (intptr_t)tcg_splitwx_to_rx(s->code_ptr), 2)) { - return false; - } - - tcg_out_mov(s, TCG_TYPE_TL, TCG_REG_R3, l->addrlo_reg); - tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_R2, TCG_AREG0); - - /* "Tail call" to the helper, with the return address back inline. */ - tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R14, (uintptr_t)l->raddr); - tgen_gotoi(s, S390_CC_ALWAYS, (const void *)(l->is_ld ? helper_unaligned_ld - : helper_unaligned_st)); - return true; -} - -static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l) -{ - return tcg_out_fail_alignment(s, l); -} - -static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l) -{ - return tcg_out_fail_alignment(s, l); -} -#endif /* CONFIG_SOFTMMU */ /* * For softmmu, perform the TLB load and compare. 
@@ -1784,8 +1734,10 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, { TCGLabelQemuLdst *ldst = NULL; MemOp opc = get_memop(oi); - unsigned a_bits = get_alignment_bits(opc); - unsigned a_mask = (1u << a_bits) - 1; + unsigned a_mask; + + h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, false); + a_mask = (1 << h->aa.align) - 1; #ifdef CONFIG_SOFTMMU unsigned s_bits = opc & MO_SIZE; @@ -1803,7 +1755,7 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, ldst->addrlo_reg = addr_reg; tcg_out_sh64(s, RSY_SRLG, TCG_TMP0, addr_reg, TCG_REG_NONE, - TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS); + s->page_bits - CPU_TLB_ENTRY_BITS); QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) > 0); QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) < -(1 << 19)); @@ -1815,8 +1767,8 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, * bits within the address. For unaligned access, we check that we don't * cross pages using the address of the last byte of the access. */ - a_off = (a_bits >= s_bits ? 0 : s_mask - a_mask); - tlb_mask = (uint64_t)TARGET_PAGE_MASK | a_mask; + a_off = (a_mask >= s_mask ? 0 : s_mask - a_mask); + tlb_mask = (uint64_t)s->page_mask | a_mask; if (a_off == 0) { tgen_andi_risbg(s, TCG_REG_R0, addr_reg, tlb_mask); } else { @@ -1857,7 +1809,7 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, ldst->addrlo_reg = addr_reg; /* We are expecting a_bits to max out at 7, much lower than TMLL. */ - tcg_debug_assert(a_bits < 16); + tcg_debug_assert(a_mask <= 0xffff); tcg_out_insn(s, RI, TMLL, addr_reg, a_mask); tcg_out16(s, RI_BRC | (7 << 4)); /* CC in {1,2,3} */ @@ -2258,16 +2210,20 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, args[2], const_args[2], args[3], const_args[3], args[4]); break; - case INDEX_op_qemu_ld_i32: + case INDEX_op_qemu_ld_a32_i32: + case INDEX_op_qemu_ld_a64_i32: tcg_out_qemu_ld(s, args[0], args[1], args[2], TCG_TYPE_I32); break; - case INDEX_op_qemu_ld_i64: + case INDEX_op_qemu_ld_a32_i64: + case INDEX_op_qemu_ld_a64_i64: tcg_out_qemu_ld(s, args[0], args[1], args[2], TCG_TYPE_I64); break; - case INDEX_op_qemu_st_i32: + case INDEX_op_qemu_st_a32_i32: + case INDEX_op_qemu_st_a64_i32: tcg_out_qemu_st(s, args[0], args[1], args[2], TCG_TYPE_I32); break; - case INDEX_op_qemu_st_i64: + case INDEX_op_qemu_st_a32_i64: + case INDEX_op_qemu_st_a64_i64: tcg_out_qemu_st(s, args[0], args[1], args[2], TCG_TYPE_I64); break; @@ -3141,11 +3097,15 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) case INDEX_op_ctpop_i64: return C_O1_I1(r, r); - case INDEX_op_qemu_ld_i32: - case INDEX_op_qemu_ld_i64: + case INDEX_op_qemu_ld_a32_i32: + case INDEX_op_qemu_ld_a64_i32: + case INDEX_op_qemu_ld_a32_i64: + case INDEX_op_qemu_ld_a64_i64: return C_O1_I1(r, r); - case INDEX_op_qemu_st_i64: - case INDEX_op_qemu_st_i32: + case INDEX_op_qemu_st_a32_i64: + case INDEX_op_qemu_st_a64_i64: + case INDEX_op_qemu_st_a32_i32: + case INDEX_op_qemu_st_a64_i32: return C_O0_I2(r, r); case INDEX_op_deposit_i32: diff --git a/tcg/s390x/tcg-target.h b/tcg/s390x/tcg-target.h index a05b473117..170007bea5 100644 --- a/tcg/s390x/tcg-target.h +++ b/tcg/s390x/tcg-target.h @@ -140,6 +140,8 @@ extern uint64_t s390_facilities[3]; #define TCG_TARGET_HAS_muluh_i64 0 #define TCG_TARGET_HAS_mulsh_i64 0 +#define TCG_TARGET_HAS_qemu_ldst_i128 0 + #define TCG_TARGET_HAS_v64 HAVE_FACILITY(VECTOR) #define TCG_TARGET_HAS_v128 HAVE_FACILITY(VECTOR) #define TCG_TARGET_HAS_v256 0 @@ -172,8 +174,6 @@ extern uint64_t s390_facilities[3]; 
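
From here on the s390x code follows the same shape as the other backends: HostAddress gains the pair returned by atom_and_align_for_opc(). A hypothetical mirror of that pair, matching the h->aa.align uses in these hunks, with a stand-in type to keep the sketch self-contained:

typedef unsigned MemOpBits;   /* stand-in for QEMU's MemOp */

typedef struct {
    MemOpBits atom;    /* granule that must remain single-copy atomic */
    MemOpBits align;   /* alignment enforced before the fast path */
} ExampleAtomAlign;
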
#define TCG_TARGET_CALL_ARG_I128 TCG_CALL_ARG_BY_REF #define TCG_TARGET_CALL_RET_I128 TCG_CALL_RET_BY_REF -#define TCG_TARGET_HAS_MEMORY_BSWAP 1 - #define TCG_TARGET_DEFAULT_MO (TCG_MO_ALL & ~TCG_MO_ST_LD) #define TCG_TARGET_NEED_LDST_LABELS #define TCG_TARGET_NEED_POOL_LABELS diff --git a/tcg/sparc64/tcg-target-con-set.h b/tcg/sparc64/tcg-target-con-set.h index 31e6fea1fc..434bf25072 100644 --- a/tcg/sparc64/tcg-target-con-set.h +++ b/tcg/sparc64/tcg-target-con-set.h @@ -12,8 +12,6 @@ C_O0_I1(r) C_O0_I2(rZ, r) C_O0_I2(rZ, rJ) -C_O0_I2(sZ, s) -C_O1_I1(r, s) C_O1_I1(r, r) C_O1_I2(r, r, r) C_O1_I2(r, rZ, rJ) diff --git a/tcg/sparc64/tcg-target-con-str.h b/tcg/sparc64/tcg-target-con-str.h index 8f5c7aef97..0577ec4942 100644 --- a/tcg/sparc64/tcg-target-con-str.h +++ b/tcg/sparc64/tcg-target-con-str.h @@ -9,7 +9,6 @@ * REGS(letter, register_mask) */ REGS('r', ALL_GENERAL_REGS) -REGS('s', ALL_QLDST_REGS) /* * Define constraint letters for constants: diff --git a/tcg/sparc64/tcg-target.c.inc b/tcg/sparc64/tcg-target.c.inc index 7e6466d3b6..d2d0f604c2 100644 --- a/tcg/sparc64/tcg-target.c.inc +++ b/tcg/sparc64/tcg-target.c.inc @@ -27,6 +27,7 @@ #error "unsupported code generation mode" #endif +#include "../tcg-ldst.c.inc" #include "../tcg-pool.c.inc" #ifdef CONFIG_DEBUG_TCG @@ -70,22 +71,12 @@ static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = { #define TCG_CT_CONST_S13 0x200 #define TCG_CT_CONST_ZERO 0x400 -/* - * For softmmu, we need to avoid conflicts with the first 3 - * argument registers to perform the tlb lookup, and to call - * the helper function. - */ -#ifdef CONFIG_SOFTMMU -#define SOFTMMU_RESERVE_REGS MAKE_64BIT_MASK(TCG_REG_O0, 3) -#else -#define SOFTMMU_RESERVE_REGS 0 -#endif -#define ALL_GENERAL_REGS MAKE_64BIT_MASK(0, 32) -#define ALL_QLDST_REGS (ALL_GENERAL_REGS & ~SOFTMMU_RESERVE_REGS) +#define ALL_GENERAL_REGS MAKE_64BIT_MASK(0, 32) -/* Define some temporary registers. T2 is used for constant generation. */ +/* Define some temporary registers. T3 is used for constant generation. */ #define TCG_REG_T1 TCG_REG_G1 -#define TCG_REG_T2 TCG_REG_O7 +#define TCG_REG_T2 TCG_REG_G2 +#define TCG_REG_T3 TCG_REG_O7 #ifndef CONFIG_SOFTMMU # define TCG_GUEST_BASE_REG TCG_REG_I5 @@ -110,7 +101,6 @@ static const int tcg_target_reg_alloc_order[] = { TCG_REG_I4, TCG_REG_I5, - TCG_REG_G2, TCG_REG_G3, TCG_REG_G4, TCG_REG_G5, @@ -399,22 +389,25 @@ static void tcg_out_sethi(TCGContext *s, TCGReg ret, uint32_t arg) tcg_out32(s, SETHI | INSN_RD(ret) | ((arg & 0xfffffc00) >> 10)); } -static void tcg_out_movi_imm13(TCGContext *s, TCGReg ret, int32_t arg) +/* A 13-bit constant sign-extended to 64 bits. */ +static void tcg_out_movi_s13(TCGContext *s, TCGReg ret, int32_t arg) { tcg_out_arithi(s, ret, TCG_REG_G0, arg, ARITH_OR); } -static void tcg_out_movi_imm32(TCGContext *s, TCGReg ret, int32_t arg) +/* A 32-bit constant sign-extended to 64 bits. */ +static void tcg_out_movi_s32(TCGContext *s, TCGReg ret, int32_t arg) { - if (check_fit_i32(arg, 13)) { - /* A 13-bit constant sign-extended to 64-bits. */ - tcg_out_movi_imm13(s, ret, arg); - } else { - /* A 32-bit constant zero-extended to 64 bits. */ - tcg_out_sethi(s, ret, arg); - if (arg & 0x3ff) { - tcg_out_arithi(s, ret, ret, arg & 0x3ff, ARITH_OR); - } + tcg_out_sethi(s, ret, ~arg); + tcg_out_arithi(s, ret, ret, (arg & 0x3ff) | -0x400, ARITH_XOR); +} + +/* A 32-bit constant zero-extended to 64 bits. 
*/ +static void tcg_out_movi_u32(TCGContext *s, TCGReg ret, uint32_t arg) +{ + tcg_out_sethi(s, ret, arg); + if (arg & 0x3ff) { + tcg_out_arithi(s, ret, ret, arg & 0x3ff, ARITH_OR); } } @@ -425,15 +418,15 @@ static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret, tcg_target_long hi, lo = (int32_t)arg; tcg_target_long test, lsb; - /* A 32-bit constant, or 32-bit zero-extended to 64-bits. */ - if (type == TCG_TYPE_I32 || arg == (uint32_t)arg) { - tcg_out_movi_imm32(s, ret, arg); + /* A 13-bit constant sign-extended to 64-bits. */ + if (check_fit_tl(arg, 13)) { + tcg_out_movi_s13(s, ret, arg); return; } - /* A 13-bit constant sign-extended to 64-bits. */ - if (check_fit_tl(arg, 13)) { - tcg_out_movi_imm13(s, ret, arg); + /* A 32-bit constant, or 32-bit zero-extended to 64-bits. */ + if (type == TCG_TYPE_I32 || arg == (uint32_t)arg) { + tcg_out_movi_u32(s, ret, arg); return; } @@ -448,8 +441,7 @@ static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret, /* A 32-bit constant sign-extended to 64-bits. */ if (arg == lo) { - tcg_out_sethi(s, ret, ~arg); - tcg_out_arithi(s, ret, ret, (arg & 0x3ff) | -0x400, ARITH_XOR); + tcg_out_movi_s32(s, ret, arg); return; } @@ -477,13 +469,13 @@ static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret, /* A 64-bit constant decomposed into 2 32-bit pieces. */ if (check_fit_i32(lo, 13)) { hi = (arg - lo) >> 32; - tcg_out_movi_imm32(s, ret, hi); + tcg_out_movi_u32(s, ret, hi); tcg_out_arithi(s, ret, ret, 32, SHIFT_SLLX); tcg_out_arithi(s, ret, ret, lo, ARITH_ADD); } else { hi = arg >> 32; - tcg_out_movi_imm32(s, ret, hi); - tcg_out_movi_imm32(s, scratch, lo); + tcg_out_movi_u32(s, ret, hi); + tcg_out_movi_u32(s, scratch, lo); tcg_out_arithi(s, ret, ret, 32, SHIFT_SLLX); tcg_out_arith(s, ret, ret, scratch, ARITH_OR); } @@ -492,8 +484,8 @@ static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret, static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg ret, tcg_target_long arg) { - tcg_debug_assert(ret != TCG_REG_T2); - tcg_out_movi_int(s, type, ret, arg, false, TCG_REG_T2); + tcg_debug_assert(ret != TCG_REG_T3); + tcg_out_movi_int(s, type, ret, arg, false, TCG_REG_T3); } static void tcg_out_ext8s(TCGContext *s, TCGType type, TCGReg rd, TCGReg rs) @@ -767,7 +759,7 @@ static void tcg_out_setcond_i32(TCGContext *s, TCGCond cond, TCGReg ret, default: tcg_out_cmp(s, c1, c2, c2const); - tcg_out_movi_imm13(s, ret, 0); + tcg_out_movi_s13(s, ret, 0); tcg_out_movcc(s, cond, MOVCC_ICC, ret, 1, 1); return; } @@ -803,11 +795,11 @@ static void tcg_out_setcond_i64(TCGContext *s, TCGCond cond, TCGReg ret, /* For 64-bit signed comparisons vs zero, we can avoid the compare if the input does not overlap the output. */ if (c2 == 0 && !is_unsigned_cond(cond) && c1 != ret) { - tcg_out_movi_imm13(s, ret, 0); + tcg_out_movi_s13(s, ret, 0); tcg_out_movr(s, cond, ret, c1, 1, 1); } else { tcg_out_cmp(s, c1, c2, c2const); - tcg_out_movi_imm13(s, ret, 0); + tcg_out_movi_s13(s, ret, 0); tcg_out_movcc(s, cond, MOVCC_XCC, ret, 1, 1); } } @@ -844,7 +836,7 @@ static void tcg_out_addsub2_i64(TCGContext *s, TCGReg rl, TCGReg rh, if (use_vis3_instructions && !is_sub) { /* Note that ADDXC doesn't accept immediates. */ if (bhconst && bh != 0) { - tcg_out_movi_imm13(s, TCG_REG_T2, bh); + tcg_out_movi_s13(s, TCG_REG_T2, bh); bh = TCG_REG_T2; } tcg_out_arith(s, rh, ah, bh, ARITH_ADDXC); @@ -866,7 +858,7 @@ static void tcg_out_addsub2_i64(TCGContext *s, TCGReg rl, TCGReg rh, * so the adjustment fits 12 bits. 
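
tcg_out_movi_s32() above builds a negative sign-extended 32-bit constant in two instructions: SETHI loads bits 31..10 of the complement (upper half zero), and XOR with a sign-extended 13-bit immediate flips everything into place, including the upper 32 one-bits. A host-side simulation; sethi() and movi_s32() are illustrative stand-ins, valid for the negative constants this path is actually used for:

#include <assert.h>
#include <stdint.h>

static uint64_t sethi(uint32_t imm)
{
    return imm & 0xfffffc00u;             /* bits 31..10, zero-extended */
}

static int64_t movi_s32(int32_t arg)
{
    uint64_t r = sethi(~(uint32_t)arg);   /* complement of the high part */
    int64_t simm13 = (arg & 0x3ff) | -0x400;   /* low 10 bits, rest ones */
    return (int64_t)(r ^ (uint64_t)simm13);
}

int main(void)
{
    assert(movi_s32(-1) == -1);
    assert(movi_s32(-0x76543210) == -0x76543210);
    return 0;
}
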
*/ if (bhconst) { - tcg_out_movi_imm13(s, TCG_REG_T2, bh + (is_sub ? -1 : 1)); + tcg_out_movi_s13(s, TCG_REG_T2, bh + (is_sub ? -1 : 1)); } else { tcg_out_arithi(s, TCG_REG_T2, bh, 1, is_sub ? ARITH_SUB : ARITH_ADD); @@ -885,10 +877,8 @@ static void tcg_out_jmpl_const(TCGContext *s, const tcg_insn_unit *dest, { uintptr_t desti = (uintptr_t)dest; - /* Be careful not to clobber %o7 for a tail call. */ tcg_out_movi_int(s, TCG_TYPE_PTR, TCG_REG_T1, - desti & ~0xfff, in_prologue, - tail_call ? TCG_REG_G2 : TCG_REG_O7); + desti & ~0xfff, in_prologue, TCG_REG_T2); tcg_out_arithi(s, tail_call ? TCG_REG_G0 : TCG_REG_O7, TCG_REG_T1, desti & 0xfff, JMPL); } @@ -918,104 +908,6 @@ static void tcg_out_mb(TCGContext *s, TCGArg a0) tcg_out32(s, MEMBAR | (a0 & TCG_MO_ALL)); } -#ifdef CONFIG_SOFTMMU -static const tcg_insn_unit *qemu_ld_trampoline[(MO_SSIZE | MO_BSWAP) + 1]; -static const tcg_insn_unit *qemu_st_trampoline[(MO_SIZE | MO_BSWAP) + 1]; - -static void build_trampolines(TCGContext *s) -{ - static void * const qemu_ld_helpers[] = { - [MO_UB] = helper_ret_ldub_mmu, - [MO_SB] = helper_ret_ldsb_mmu, - [MO_LEUW] = helper_le_lduw_mmu, - [MO_LESW] = helper_le_ldsw_mmu, - [MO_LEUL] = helper_le_ldul_mmu, - [MO_LEUQ] = helper_le_ldq_mmu, - [MO_BEUW] = helper_be_lduw_mmu, - [MO_BESW] = helper_be_ldsw_mmu, - [MO_BEUL] = helper_be_ldul_mmu, - [MO_BEUQ] = helper_be_ldq_mmu, - }; - static void * const qemu_st_helpers[] = { - [MO_UB] = helper_ret_stb_mmu, - [MO_LEUW] = helper_le_stw_mmu, - [MO_LEUL] = helper_le_stl_mmu, - [MO_LEUQ] = helper_le_stq_mmu, - [MO_BEUW] = helper_be_stw_mmu, - [MO_BEUL] = helper_be_stl_mmu, - [MO_BEUQ] = helper_be_stq_mmu, - }; - - int i; - - for (i = 0; i < ARRAY_SIZE(qemu_ld_helpers); ++i) { - if (qemu_ld_helpers[i] == NULL) { - continue; - } - - /* May as well align the trampoline. */ - while ((uintptr_t)s->code_ptr & 15) { - tcg_out_nop(s); - } - qemu_ld_trampoline[i] = tcg_splitwx_to_rx(s->code_ptr); - - /* Set the retaddr operand. */ - tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_O3, TCG_REG_O7); - /* Tail call. */ - tcg_out_jmpl_const(s, qemu_ld_helpers[i], true, true); - /* delay slot -- set the env argument */ - tcg_out_mov_delay(s, TCG_REG_O0, TCG_AREG0); - } - - for (i = 0; i < ARRAY_SIZE(qemu_st_helpers); ++i) { - if (qemu_st_helpers[i] == NULL) { - continue; - } - - /* May as well align the trampoline. */ - while ((uintptr_t)s->code_ptr & 15) { - tcg_out_nop(s); - } - qemu_st_trampoline[i] = tcg_splitwx_to_rx(s->code_ptr); - - /* Set the retaddr operand. */ - tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_O4, TCG_REG_O7); - - /* Tail call. */ - tcg_out_jmpl_const(s, qemu_st_helpers[i], true, true); - /* delay slot -- set the env argument */ - tcg_out_mov_delay(s, TCG_REG_O0, TCG_AREG0); - } -} -#else -static const tcg_insn_unit *qemu_unalign_ld_trampoline; -static const tcg_insn_unit *qemu_unalign_st_trampoline; - -static void build_trampolines(TCGContext *s) -{ - for (int ld = 0; ld < 2; ++ld) { - void *helper; - - while ((uintptr_t)s->code_ptr & 15) { - tcg_out_nop(s); - } - - if (ld) { - helper = helper_unaligned_ld; - qemu_unalign_ld_trampoline = tcg_splitwx_to_rx(s->code_ptr); - } else { - helper = helper_unaligned_st; - qemu_unalign_st_trampoline = tcg_splitwx_to_rx(s->code_ptr); - } - - /* Tail call. 
*/ - tcg_out_jmpl_const(s, helper, true, true); - /* delay slot -- set the env argument */ - tcg_out_mov_delay(s, TCG_REG_O0, TCG_AREG0); - } -} -#endif - /* Generate global QEMU prologue and epilogue code */ static void tcg_target_qemu_prologue(TCGContext *s) { @@ -1060,9 +952,7 @@ static void tcg_target_qemu_prologue(TCGContext *s) tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr); tcg_out_arithi(s, TCG_REG_G0, TCG_REG_I7, 8, RETURN); /* delay slot */ - tcg_out_movi_imm13(s, TCG_REG_O0, 0); - - build_trampolines(s); + tcg_out_movi_s13(s, TCG_REG_O0, 0); } static void tcg_out_nop_fill(tcg_insn_unit *p, int count) @@ -1073,388 +963,239 @@ static void tcg_out_nop_fill(tcg_insn_unit *p, int count) } } -#if defined(CONFIG_SOFTMMU) - -/* We expect to use a 13-bit negative offset from ENV. */ -QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) > 0); -QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) < -(1 << 12)); +static const TCGLdstHelperParam ldst_helper_param = { + .ntmp = 1, .tmp = { TCG_REG_T1 } +}; -/* Perform the TLB load and compare. +static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb) +{ + MemOp opc = get_memop(lb->oi); + MemOp sgn; - Inputs: - ADDRLO and ADDRHI contain the possible two parts of the address. + if (!patch_reloc(lb->label_ptr[0], R_SPARC_WDISP19, + (intptr_t)tcg_splitwx_to_rx(s->code_ptr), 0)) { + return false; + } - MEM_INDEX and S_BITS are the memory context and log2 size of the load. + /* Use inline tcg_out_ext32s; otherwise let the helper sign-extend. */ + sgn = (opc & MO_SIZE) < MO_32 ? MO_SIGN : 0; - WHICH is the offset into the CPUTLBEntry structure of the slot to read. - This should be offsetof addr_read or addr_write. + tcg_out_ld_helper_args(s, lb, &ldst_helper_param); + tcg_out_call(s, qemu_ld_helpers[opc & (MO_SIZE | sgn)], NULL); + tcg_out_ld_helper_ret(s, lb, sgn, &ldst_helper_param); - The result of the TLB comparison is in %[ix]cc. The sanitized address - is in the returned register, maybe %o0. The TLB addend is in %o1. */ + tcg_out_bpcc0(s, COND_A, BPCC_A | BPCC_PT, 0); + return patch_reloc(s->code_ptr - 1, R_SPARC_WDISP19, + (intptr_t)lb->raddr, 0); +} -static TCGReg tcg_out_tlb_load(TCGContext *s, TCGReg addr, int mem_index, - MemOp opc, int which) +static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb) { - int fast_off = TLB_MASK_TABLE_OFS(mem_index); - int mask_off = fast_off + offsetof(CPUTLBDescFast, mask); - int table_off = fast_off + offsetof(CPUTLBDescFast, table); - const TCGReg r0 = TCG_REG_O0; - const TCGReg r1 = TCG_REG_O1; - const TCGReg r2 = TCG_REG_O2; - unsigned s_bits = opc & MO_SIZE; - unsigned a_bits = get_alignment_bits(opc); - tcg_target_long compare_mask; - - /* Load tlb_mask[mmu_idx] and tlb_table[mmu_idx]. */ - tcg_out_ld(s, TCG_TYPE_PTR, r0, TCG_AREG0, mask_off); - tcg_out_ld(s, TCG_TYPE_PTR, r1, TCG_AREG0, table_off); - - /* Extract the page index, shifted into place for tlb index. */ - tcg_out_arithi(s, r2, addr, TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS, - SHIFT_SRL); - tcg_out_arith(s, r2, r2, r0, ARITH_AND); - - /* Add the tlb_table pointer, creating the CPUTLBEntry address into R2. */ - tcg_out_arith(s, r2, r2, r1, ARITH_ADD); - - /* Load the tlb comparator and the addend. */ - tcg_out_ld(s, TCG_TYPE_TL, r0, r2, which); - tcg_out_ld(s, TCG_TYPE_PTR, r1, r2, offsetof(CPUTLBEntry, addend)); + MemOp opc = get_memop(lb->oi); - /* Mask out the page offset, except for the required alignment. - We don't support unaligned accesses. 
*/ - if (a_bits < s_bits) { - a_bits = s_bits; + if (!patch_reloc(lb->label_ptr[0], R_SPARC_WDISP19, + (intptr_t)tcg_splitwx_to_rx(s->code_ptr), 0)) { + return false; } - compare_mask = (tcg_target_ulong)TARGET_PAGE_MASK | ((1 << a_bits) - 1); - if (check_fit_tl(compare_mask, 13)) { - tcg_out_arithi(s, r2, addr, compare_mask, ARITH_AND); - } else { - tcg_out_movi(s, TCG_TYPE_TL, r2, compare_mask); - tcg_out_arith(s, r2, addr, r2, ARITH_AND); - } - tcg_out_cmp(s, r0, r2, 0); - /* If the guest address must be zero-extended, do so now. */ - if (TARGET_LONG_BITS == 32) { - tcg_out_ext32u(s, r0, addr); - return r0; - } - return addr; -} -#endif /* CONFIG_SOFTMMU */ - -static const int qemu_ld_opc[(MO_SSIZE | MO_BSWAP) + 1] = { - [MO_UB] = LDUB, - [MO_SB] = LDSB, - [MO_UB | MO_LE] = LDUB, - [MO_SB | MO_LE] = LDSB, - - [MO_BEUW] = LDUH, - [MO_BESW] = LDSH, - [MO_BEUL] = LDUW, - [MO_BESL] = LDSW, - [MO_BEUQ] = LDX, - [MO_BESQ] = LDX, - - [MO_LEUW] = LDUH_LE, - [MO_LESW] = LDSH_LE, - [MO_LEUL] = LDUW_LE, - [MO_LESL] = LDSW_LE, - [MO_LEUQ] = LDX_LE, - [MO_LESQ] = LDX_LE, -}; + tcg_out_st_helper_args(s, lb, &ldst_helper_param); + tcg_out_call(s, qemu_st_helpers[opc & MO_SIZE], NULL); -static const int qemu_st_opc[(MO_SIZE | MO_BSWAP) + 1] = { - [MO_UB] = STB, + tcg_out_bpcc0(s, COND_A, BPCC_A | BPCC_PT, 0); + return patch_reloc(s->code_ptr - 1, R_SPARC_WDISP19, + (intptr_t)lb->raddr, 0); +} - [MO_BEUW] = STH, - [MO_BEUL] = STW, - [MO_BEUQ] = STX, +typedef struct { + TCGReg base; + TCGReg index; + TCGAtomAlign aa; +} HostAddress; - [MO_LEUW] = STH_LE, - [MO_LEUL] = STW_LE, - [MO_LEUQ] = STX_LE, -}; +bool tcg_target_has_memory_bswap(MemOp memop) +{ + return true; +} -static void tcg_out_qemu_ld(TCGContext *s, TCGReg data, TCGReg addr, - MemOpIdx oi, TCGType data_type) +/* + * For softmmu, perform the TLB load and compare. + * For useronly, perform any required alignment tests. + * In both cases, return a TCGLabelQemuLdst structure if the slow path + * is required and fill in @h with the host address for the fast path. + */ +static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, + TCGReg addr_reg, MemOpIdx oi, + bool is_ld) { - MemOp memop = get_memop(oi); - tcg_insn_unit *label_ptr; + TCGLabelQemuLdst *ldst = NULL; + MemOp opc = get_memop(oi); + MemOp s_bits = opc & MO_SIZE; + unsigned a_mask; -#ifdef CONFIG_SOFTMMU - unsigned memi = get_mmuidx(oi); - TCGReg addrz; - const tcg_insn_unit *func; + /* We don't support unaligned accesses. */ + h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, false); + h->aa.align = MAX(h->aa.align, s_bits); + a_mask = (1u << h->aa.align) - 1; - addrz = tcg_out_tlb_load(s, addr, memi, memop, - offsetof(CPUTLBEntry, addr_read)); +#ifdef CONFIG_SOFTMMU + int mem_index = get_mmuidx(oi); + int fast_off = TLB_MASK_TABLE_OFS(mem_index); + int mask_off = fast_off + offsetof(CPUTLBDescFast, mask); + int table_off = fast_off + offsetof(CPUTLBDescFast, table); + int cmp_off = is_ld ? offsetof(CPUTLBEntry, addr_read) + : offsetof(CPUTLBEntry, addr_write); + int add_off = offsetof(CPUTLBEntry, addend); + int compare_mask; + int cc; - /* The fast path is exactly one insn. Thus we can perform the - entire TLB Hit in the (annulled) delay slot of the branch - over the TLB Miss case. */ + /* Load tlb_mask[mmu_idx] and tlb_table[mmu_idx]. 
*/ + QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) > 0); + QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) < -(1 << 12)); + tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_T2, TCG_AREG0, mask_off); + tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_T3, TCG_AREG0, table_off); - /* beq,a,pt %[xi]cc, label0 */ - label_ptr = s->code_ptr; - tcg_out_bpcc0(s, COND_E, BPCC_A | BPCC_PT - | (TARGET_LONG_BITS == 64 ? BPCC_XCC : BPCC_ICC), 0); - /* delay slot */ - tcg_out_ldst_rr(s, data, addrz, TCG_REG_O1, - qemu_ld_opc[memop & (MO_BSWAP | MO_SSIZE)]); + /* Extract the page index, shifted into place for tlb index. */ + tcg_out_arithi(s, TCG_REG_T1, addr_reg, + s->page_bits - CPU_TLB_ENTRY_BITS, SHIFT_SRL); + tcg_out_arith(s, TCG_REG_T1, TCG_REG_T1, TCG_REG_T2, ARITH_AND); - /* TLB Miss. */ + /* Add the tlb_table pointer, creating the CPUTLBEntry address into R2. */ + tcg_out_arith(s, TCG_REG_T1, TCG_REG_T1, TCG_REG_T3, ARITH_ADD); - tcg_out_mov(s, TCG_TYPE_REG, TCG_REG_O1, addrz); + /* Load the tlb comparator and the addend. */ + tcg_out_ld(s, TCG_TYPE_TL, TCG_REG_T2, TCG_REG_T1, cmp_off); + tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_T1, TCG_REG_T1, add_off); + h->base = TCG_REG_T1; - /* We use the helpers to extend SB and SW data, leaving the case - of SL needing explicit extending below. */ - if ((memop & MO_SSIZE) == MO_SL) { - func = qemu_ld_trampoline[memop & (MO_BSWAP | MO_SIZE)]; + /* Mask out the page offset, except for the required alignment. */ + compare_mask = s->page_mask | a_mask; + if (check_fit_tl(compare_mask, 13)) { + tcg_out_arithi(s, TCG_REG_T3, addr_reg, compare_mask, ARITH_AND); } else { - func = qemu_ld_trampoline[memop & (MO_BSWAP | MO_SSIZE)]; + tcg_out_movi_s32(s, TCG_REG_T3, compare_mask); + tcg_out_arith(s, TCG_REG_T3, addr_reg, TCG_REG_T3, ARITH_AND); } - tcg_debug_assert(func != NULL); - tcg_out_call_nodelay(s, func, false); - /* delay slot */ - tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_O2, oi); + tcg_out_cmp(s, TCG_REG_T2, TCG_REG_T3, 0); - /* We let the helper sign-extend SB and SW, but leave SL for here. */ - if ((memop & MO_SSIZE) == MO_SL) { - tcg_out_ext32s(s, data, TCG_REG_O0); - } else { - tcg_out_mov(s, TCG_TYPE_REG, data, TCG_REG_O0); - } + ldst = new_ldst_label(s); + ldst->is_ld = is_ld; + ldst->oi = oi; + ldst->addrlo_reg = addr_reg; + ldst->label_ptr[0] = s->code_ptr; - *label_ptr |= INSN_OFF19(tcg_ptr_byte_diff(s->code_ptr, label_ptr)); + /* bne,pn %[xi]cc, label0 */ + cc = TARGET_LONG_BITS == 64 ? BPCC_XCC : BPCC_ICC; + tcg_out_bpcc0(s, COND_NE, BPCC_PN | cc, 0); #else - TCGReg index = (guest_base ? TCG_GUEST_BASE_REG : TCG_REG_G0); - unsigned a_bits = get_alignment_bits(memop); - unsigned s_bits = memop & MO_SIZE; - unsigned t_bits; - - if (TARGET_LONG_BITS == 32) { - tcg_out_ext32u(s, TCG_REG_T1, addr); - addr = TCG_REG_T1; - } - /* - * Normal case: alignment equal to access size. + * If the size equals the required alignment, we can skip the test + * and allow host SIGBUS to deliver SIGBUS to the guest. + * Otherwise, test for at least natural alignment and defer + * everything else to the helper functions. 
*/ - if (a_bits == s_bits) { - tcg_out_ldst_rr(s, data, addr, index, - qemu_ld_opc[memop & (MO_BSWAP | MO_SSIZE)]); - return; + if (s_bits != get_alignment_bits(opc)) { + tcg_debug_assert(check_fit_tl(a_mask, 13)); + tcg_out_arithi(s, TCG_REG_G0, addr_reg, a_mask, ARITH_ANDCC); + + ldst = new_ldst_label(s); + ldst->is_ld = is_ld; + ldst->oi = oi; + ldst->addrlo_reg = addr_reg; + ldst->label_ptr[0] = s->code_ptr; + + /* bne,pn %icc, label0 */ + tcg_out_bpcc0(s, COND_NE, BPCC_PN | BPCC_ICC, 0); } + h->base = guest_base ? TCG_GUEST_BASE_REG : TCG_REG_G0; +#endif - /* - * Test for at least natural alignment, and assume most accesses - * will be aligned -- perform a straight load in the delay slot. - * This is required to preserve atomicity for aligned accesses. - */ - t_bits = MAX(a_bits, s_bits); - tcg_debug_assert(t_bits < 13); - tcg_out_arithi(s, TCG_REG_G0, addr, (1u << t_bits) - 1, ARITH_ANDCC); - - /* beq,a,pt %icc, label */ - label_ptr = s->code_ptr; - tcg_out_bpcc0(s, COND_E, BPCC_A | BPCC_PT | BPCC_ICC, 0); - /* delay slot */ - tcg_out_ldst_rr(s, data, addr, index, - qemu_ld_opc[memop & (MO_BSWAP | MO_SSIZE)]); - - if (a_bits >= s_bits) { - /* - * Overalignment: A successful alignment test will perform the memory - * operation in the delay slot, and failure need only invoke the - * handler for SIGBUS. - */ - tcg_out_call_nodelay(s, qemu_unalign_ld_trampoline, false); - /* delay slot -- move to low part of argument reg */ - tcg_out_mov_delay(s, TCG_REG_O1, addr); + /* If the guest address must be zero-extended, do in the delay slot. */ + if (TARGET_LONG_BITS == 32) { + tcg_out_ext32u(s, TCG_REG_T2, addr_reg); + h->index = TCG_REG_T2; } else { - /* Underalignment: load by pieces of minimum alignment. */ - int ld_opc, a_size, s_size, i; - - /* - * Force full address into T1 early; avoids problems with - * overlap between @addr and @data. 
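
The user-only branch above skips the explicit test whenever the access size already equals its alignment requirement, since a misaligned host access then faults on its own and SIGBUS can be delivered to the guest; only over- or under-aligned operations need the ANDCC + bne,pn check. The decision as plain C, with needs_slow_path as an illustrative name:

#include <stdbool.h>
#include <stdint.h>

static bool needs_slow_path(uint64_t addr, unsigned s_bits, unsigned a_bits)
{
    if (a_bits == s_bits) {
        return false;    /* natural alignment: let the host fault deliver */
    }
    /* Alignment is raised to at least the access size, as done above. */
    unsigned a_mask = (1u << (a_bits > s_bits ? a_bits : s_bits)) - 1;
    return (addr & a_mask) != 0;
}
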
- */ - tcg_out_arith(s, TCG_REG_T1, addr, index, ARITH_ADD); - - a_size = 1 << a_bits; - s_size = 1 << s_bits; - if ((memop & MO_BSWAP) == MO_BE) { - ld_opc = qemu_ld_opc[a_bits | MO_BE | (memop & MO_SIGN)]; - tcg_out_ldst(s, data, TCG_REG_T1, 0, ld_opc); - ld_opc = qemu_ld_opc[a_bits | MO_BE]; - for (i = a_size; i < s_size; i += a_size) { - tcg_out_ldst(s, TCG_REG_T2, TCG_REG_T1, i, ld_opc); - tcg_out_arithi(s, data, data, a_size, SHIFT_SLLX); - tcg_out_arith(s, data, data, TCG_REG_T2, ARITH_OR); - } - } else if (a_bits == 0) { - ld_opc = LDUB; - tcg_out_ldst(s, data, TCG_REG_T1, 0, ld_opc); - for (i = a_size; i < s_size; i += a_size) { - if ((memop & MO_SIGN) && i == s_size - a_size) { - ld_opc = LDSB; - } - tcg_out_ldst(s, TCG_REG_T2, TCG_REG_T1, i, ld_opc); - tcg_out_arithi(s, TCG_REG_T2, TCG_REG_T2, i * 8, SHIFT_SLLX); - tcg_out_arith(s, data, data, TCG_REG_T2, ARITH_OR); - } - } else { - ld_opc = qemu_ld_opc[a_bits | MO_LE]; - tcg_out_ldst_rr(s, data, TCG_REG_T1, TCG_REG_G0, ld_opc); - for (i = a_size; i < s_size; i += a_size) { - tcg_out_arithi(s, TCG_REG_T1, TCG_REG_T1, a_size, ARITH_ADD); - if ((memop & MO_SIGN) && i == s_size - a_size) { - ld_opc = qemu_ld_opc[a_bits | MO_LE | MO_SIGN]; - } - tcg_out_ldst_rr(s, TCG_REG_T2, TCG_REG_T1, TCG_REG_G0, ld_opc); - tcg_out_arithi(s, TCG_REG_T2, TCG_REG_T2, i * 8, SHIFT_SLLX); - tcg_out_arith(s, data, data, TCG_REG_T2, ARITH_OR); - } + if (ldst) { + tcg_out_nop(s); } + h->index = addr_reg; } - - *label_ptr |= INSN_OFF19(tcg_ptr_byte_diff(s->code_ptr, label_ptr)); -#endif /* CONFIG_SOFTMMU */ + return ldst; } -static void tcg_out_qemu_st(TCGContext *s, TCGReg data, TCGReg addr, +static void tcg_out_qemu_ld(TCGContext *s, TCGReg data, TCGReg addr, MemOpIdx oi, TCGType data_type) { - MemOp memop = get_memop(oi); - tcg_insn_unit *label_ptr; + static const int ld_opc[(MO_SSIZE | MO_BSWAP) + 1] = { + [MO_UB] = LDUB, + [MO_SB] = LDSB, + [MO_UB | MO_LE] = LDUB, + [MO_SB | MO_LE] = LDSB, + + [MO_BEUW] = LDUH, + [MO_BESW] = LDSH, + [MO_BEUL] = LDUW, + [MO_BESL] = LDSW, + [MO_BEUQ] = LDX, + [MO_BESQ] = LDX, + + [MO_LEUW] = LDUH_LE, + [MO_LESW] = LDSH_LE, + [MO_LEUL] = LDUW_LE, + [MO_LESL] = LDSW_LE, + [MO_LEUQ] = LDX_LE, + [MO_LESQ] = LDX_LE, + }; -#ifdef CONFIG_SOFTMMU - unsigned memi = get_mmuidx(oi); - TCGReg addrz; - const tcg_insn_unit *func; - - addrz = tcg_out_tlb_load(s, addr, memi, memop, - offsetof(CPUTLBEntry, addr_write)); - - /* The fast path is exactly one insn. Thus we can perform the entire - TLB Hit in the (annulled) delay slot of the branch over TLB Miss. */ - /* beq,a,pt %[xi]cc, label0 */ - label_ptr = s->code_ptr; - tcg_out_bpcc0(s, COND_E, BPCC_A | BPCC_PT - | (TARGET_LONG_BITS == 64 ? BPCC_XCC : BPCC_ICC), 0); - /* delay slot */ - tcg_out_ldst_rr(s, data, addrz, TCG_REG_O1, - qemu_st_opc[memop & (MO_BSWAP | MO_SIZE)]); + TCGLabelQemuLdst *ldst; + HostAddress h; - /* TLB Miss. */ + ldst = prepare_host_addr(s, &h, addr, oi, true); - tcg_out_mov(s, TCG_TYPE_REG, TCG_REG_O1, addrz); - tcg_out_movext(s, (memop & MO_SIZE) == MO_64 ? 
TCG_TYPE_I64 : TCG_TYPE_I32, - TCG_REG_O2, data_type, memop & MO_SIZE, data); + tcg_out_ldst_rr(s, data, h.base, h.index, + ld_opc[get_memop(oi) & (MO_BSWAP | MO_SSIZE)]); - func = qemu_st_trampoline[memop & (MO_BSWAP | MO_SIZE)]; - tcg_debug_assert(func != NULL); - tcg_out_call_nodelay(s, func, false); - /* delay slot */ - tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_O3, oi); + if (ldst) { + ldst->type = data_type; + ldst->datalo_reg = data; + ldst->raddr = tcg_splitwx_to_rx(s->code_ptr); + } +} - *label_ptr |= INSN_OFF19(tcg_ptr_byte_diff(s->code_ptr, label_ptr)); -#else - TCGReg index = (guest_base ? TCG_GUEST_BASE_REG : TCG_REG_G0); - unsigned a_bits = get_alignment_bits(memop); - unsigned s_bits = memop & MO_SIZE; - unsigned t_bits; +static void tcg_out_qemu_st(TCGContext *s, TCGReg data, TCGReg addr, + MemOpIdx oi, TCGType data_type) +{ + static const int st_opc[(MO_SIZE | MO_BSWAP) + 1] = { + [MO_UB] = STB, - if (TARGET_LONG_BITS == 32) { - tcg_out_ext32u(s, TCG_REG_T1, addr); - addr = TCG_REG_T1; - } + [MO_BEUW] = STH, + [MO_BEUL] = STW, + [MO_BEUQ] = STX, - /* - * Normal case: alignment equal to access size. - */ - if (a_bits == s_bits) { - tcg_out_ldst_rr(s, data, addr, index, - qemu_st_opc[memop & (MO_BSWAP | MO_SIZE)]); - return; - } + [MO_LEUW] = STH_LE, + [MO_LEUL] = STW_LE, + [MO_LEUQ] = STX_LE, + }; - /* - * Test for at least natural alignment, and assume most accesses - * will be aligned -- perform a straight store in the delay slot. - * This is required to preserve atomicity for aligned accesses. - */ - t_bits = MAX(a_bits, s_bits); - tcg_debug_assert(t_bits < 13); - tcg_out_arithi(s, TCG_REG_G0, addr, (1u << t_bits) - 1, ARITH_ANDCC); + TCGLabelQemuLdst *ldst; + HostAddress h; - /* beq,a,pt %icc, label */ - label_ptr = s->code_ptr; - tcg_out_bpcc0(s, COND_E, BPCC_A | BPCC_PT | BPCC_ICC, 0); - /* delay slot */ - tcg_out_ldst_rr(s, data, addr, index, - qemu_st_opc[memop & (MO_BSWAP | MO_SIZE)]); + ldst = prepare_host_addr(s, &h, addr, oi, false); - if (a_bits >= s_bits) { - /* - * Overalignment: A successful alignment test will perform the memory - * operation in the delay slot, and failure need only invoke the - * handler for SIGBUS. - */ - tcg_out_call_nodelay(s, qemu_unalign_st_trampoline, false); - /* delay slot -- move to low part of argument reg */ - tcg_out_mov_delay(s, TCG_REG_O1, addr); - } else { - /* Underalignment: store by pieces of minimum alignment. */ - int st_opc, a_size, s_size, i; + tcg_out_ldst_rr(s, data, h.base, h.index, + st_opc[get_memop(oi) & (MO_BSWAP | MO_SIZE)]); - /* - * Force full address into T1 early; avoids problems with - * overlap between @addr and @data. - */ - tcg_out_arith(s, TCG_REG_T1, addr, index, ARITH_ADD); - - a_size = 1 << a_bits; - s_size = 1 << s_bits; - if ((memop & MO_BSWAP) == MO_BE) { - st_opc = qemu_st_opc[a_bits | MO_BE]; - for (i = 0; i < s_size; i += a_size) { - TCGReg d = data; - int shift = (s_size - a_size - i) * 8; - if (shift) { - d = TCG_REG_T2; - tcg_out_arithi(s, d, data, shift, SHIFT_SRLX); - } - tcg_out_ldst(s, d, TCG_REG_T1, i, st_opc); - } - } else if (a_bits == 0) { - tcg_out_ldst(s, data, TCG_REG_T1, 0, STB); - for (i = 1; i < s_size; i++) { - tcg_out_arithi(s, TCG_REG_T2, data, i * 8, SHIFT_SRLX); - tcg_out_ldst(s, TCG_REG_T2, TCG_REG_T1, i, STB); - } - } else { - /* Note that ST*A with immediate asi must use indexed address. 
*/ - st_opc = qemu_st_opc[a_bits + MO_LE]; - tcg_out_ldst_rr(s, data, TCG_REG_T1, TCG_REG_G0, st_opc); - for (i = a_size; i < s_size; i += a_size) { - tcg_out_arithi(s, TCG_REG_T2, data, i * 8, SHIFT_SRLX); - tcg_out_arithi(s, TCG_REG_T1, TCG_REG_T1, a_size, ARITH_ADD); - tcg_out_ldst_rr(s, TCG_REG_T2, TCG_REG_T1, TCG_REG_G0, st_opc); - } - } + if (ldst) { + ldst->type = data_type; + ldst->datalo_reg = data; + ldst->raddr = tcg_splitwx_to_rx(s->code_ptr); } - - *label_ptr |= INSN_OFF19(tcg_ptr_byte_diff(s->code_ptr, label_ptr)); -#endif /* CONFIG_SOFTMMU */ } static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0) { if (check_fit_ptr(a0, 13)) { tcg_out_arithi(s, TCG_REG_G0, TCG_REG_I7, 8, RETURN); - tcg_out_movi_imm13(s, TCG_REG_O0, a0); + tcg_out_movi_s13(s, TCG_REG_O0, a0); return; } else { intptr_t tb_diff = tcg_tbrel_diff(s, (void *)a0); @@ -1635,16 +1376,20 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, tcg_out_arithi(s, a1, a0, 32, SHIFT_SRLX); break; - case INDEX_op_qemu_ld_i32: + case INDEX_op_qemu_ld_a32_i32: + case INDEX_op_qemu_ld_a64_i32: tcg_out_qemu_ld(s, a0, a1, a2, TCG_TYPE_I32); break; - case INDEX_op_qemu_ld_i64: + case INDEX_op_qemu_ld_a32_i64: + case INDEX_op_qemu_ld_a64_i64: tcg_out_qemu_ld(s, a0, a1, a2, TCG_TYPE_I64); break; - case INDEX_op_qemu_st_i32: + case INDEX_op_qemu_st_a32_i32: + case INDEX_op_qemu_st_a64_i32: tcg_out_qemu_st(s, a0, a1, a2, TCG_TYPE_I32); break; - case INDEX_op_qemu_st_i64: + case INDEX_op_qemu_st_a32_i64: + case INDEX_op_qemu_st_a64_i64: tcg_out_qemu_st(s, a0, a1, a2, TCG_TYPE_I64); break; @@ -1766,6 +1511,10 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) case INDEX_op_extu_i32_i64: case INDEX_op_extrl_i64_i32: case INDEX_op_extrh_i64_i32: + case INDEX_op_qemu_ld_a32_i32: + case INDEX_op_qemu_ld_a64_i32: + case INDEX_op_qemu_ld_a32_i64: + case INDEX_op_qemu_ld_a64_i64: return C_O1_I1(r, r); case INDEX_op_st8_i32: @@ -1775,6 +1524,10 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) case INDEX_op_st_i32: case INDEX_op_st32_i64: case INDEX_op_st_i64: + case INDEX_op_qemu_st_a32_i32: + case INDEX_op_qemu_st_a64_i32: + case INDEX_op_qemu_st_a32_i64: + case INDEX_op_qemu_st_a64_i64: return C_O0_I2(rZ, r); case INDEX_op_add_i32: @@ -1824,13 +1577,6 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) case INDEX_op_muluh_i64: return C_O1_I2(r, r, r); - case INDEX_op_qemu_ld_i32: - case INDEX_op_qemu_ld_i64: - return C_O1_I1(r, s); - case INDEX_op_qemu_st_i32: - case INDEX_op_qemu_st_i64: - return C_O0_I2(sZ, s); - default: g_assert_not_reached(); } @@ -1878,6 +1624,7 @@ static void tcg_target_init(TCGContext *s) tcg_regset_set_reg(s->reserved_regs, TCG_REG_O6); /* stack pointer */ tcg_regset_set_reg(s->reserved_regs, TCG_REG_T1); /* for internal use */ tcg_regset_set_reg(s->reserved_regs, TCG_REG_T2); /* for internal use */ + tcg_regset_set_reg(s->reserved_regs, TCG_REG_T3); /* for internal use */ } #define ELF_HOST_MACHINE EM_SPARCV9 diff --git a/tcg/sparc64/tcg-target.h b/tcg/sparc64/tcg-target.h index ffe22b1d21..31c5537379 100644 --- a/tcg/sparc64/tcg-target.h +++ b/tcg/sparc64/tcg-target.h @@ -151,10 +151,12 @@ extern bool use_vis3_instructions; #define TCG_TARGET_HAS_muluh_i64 use_vis3_instructions #define TCG_TARGET_HAS_mulsh_i64 0 +#define TCG_TARGET_HAS_qemu_ldst_i128 0 + #define TCG_AREG0 TCG_REG_I0 #define TCG_TARGET_DEFAULT_MO (0) -#define TCG_TARGET_HAS_MEMORY_BSWAP 1 +#define TCG_TARGET_NEED_LDST_LABELS #define TCG_TARGET_NEED_POOL_LABELS #endif diff --git a/tcg/tcg-internal.h 
b/tcg/tcg-internal.h index 0f1ba01a9a..67b698bd5c 100644 --- a/tcg/tcg-internal.h +++ b/tcg/tcg-internal.h @@ -126,4 +126,6 @@ static inline TCGv_i64 TCGV128_HIGH(TCGv_i128 t) return temp_tcgv_i64(tcgv_i128_temp(t) + o); } +bool tcg_target_has_memory_bswap(MemOp memop); + #endif /* TCG_INTERNAL_H */ diff --git a/tcg/tcg-op-ldst.c b/tcg/tcg-op-ldst.c new file mode 100644 index 0000000000..f4e508cb68 --- /dev/null +++ b/tcg/tcg-op-ldst.c @@ -0,0 +1,1232 @@ +/* + * Tiny Code Generator for QEMU + * + * Copyright (c) 2008 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "qemu/osdep.h" +#include "exec/exec-all.h" +#include "tcg/tcg.h" +#include "tcg/tcg-temp-internal.h" +#include "tcg/tcg-op.h" +#include "tcg/tcg-mo.h" +#include "exec/plugin-gen.h" +#include "tcg-internal.h" + + +static inline MemOp tcg_canonicalize_memop(MemOp op, bool is64, bool st) +{ + /* Trigger the asserts within as early as possible. */ + unsigned a_bits = get_alignment_bits(op); + + /* Prefer MO_ALIGN+MO_XX over MO_ALIGN_XX+MO_XX */ + if (a_bits == (op & MO_SIZE)) { + op = (op & ~MO_AMASK) | MO_ALIGN; + } + + switch (op & MO_SIZE) { + case MO_8: + op &= ~MO_BSWAP; + break; + case MO_16: + break; + case MO_32: + if (!is64) { + op &= ~MO_SIGN; + } + break; + case MO_64: + if (is64) { + op &= ~MO_SIGN; + break; + } + /* fall through */ + default: + g_assert_not_reached(); + } + if (st) { + op &= ~MO_SIGN; + } + return op; +} + +static void gen_ldst(TCGOpcode opc, TCGTemp *vl, TCGTemp *vh, + TCGTemp *addr, MemOpIdx oi) +{ + if (TCG_TARGET_REG_BITS == 64 || tcg_ctx->addr_type == TCG_TYPE_I32) { + if (vh) { + tcg_gen_op4(opc, temp_arg(vl), temp_arg(vh), temp_arg(addr), oi); + } else { + tcg_gen_op3(opc, temp_arg(vl), temp_arg(addr), oi); + } + } else { + /* See TCGV_LOW/HIGH. 
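+ * On a 32-bit host, a 64-bit address temp is a pair of adjacent + * 32-bit temps; index by HOST_BIG_ENDIAN to pick the low/high halves.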
*/ + TCGTemp *al = addr + HOST_BIG_ENDIAN; + TCGTemp *ah = addr + !HOST_BIG_ENDIAN; + + if (vh) { + tcg_gen_op5(opc, temp_arg(vl), temp_arg(vh), + temp_arg(al), temp_arg(ah), oi); + } else { + tcg_gen_op4(opc, temp_arg(vl), temp_arg(al), temp_arg(ah), oi); + } + } +} + +static void gen_ldst_i64(TCGOpcode opc, TCGv_i64 v, TCGTemp *addr, MemOpIdx oi) +{ + if (TCG_TARGET_REG_BITS == 32) { + TCGTemp *vl = tcgv_i32_temp(TCGV_LOW(v)); + TCGTemp *vh = tcgv_i32_temp(TCGV_HIGH(v)); + gen_ldst(opc, vl, vh, addr, oi); + } else { + gen_ldst(opc, tcgv_i64_temp(v), NULL, addr, oi); + } +} + +static void tcg_gen_req_mo(TCGBar type) +{ +#ifdef TCG_GUEST_DEFAULT_MO + type &= TCG_GUEST_DEFAULT_MO; +#endif + type &= ~TCG_TARGET_DEFAULT_MO; + if (type) { + tcg_gen_mb(type | TCG_BAR_SC); + } +} + +/* Only required for loads, where value might overlap addr. */ +static TCGv_i64 plugin_maybe_preserve_addr(TCGTemp *addr) +{ +#ifdef CONFIG_PLUGIN + if (tcg_ctx->plugin_insn != NULL) { + /* Save a copy of the vaddr for use after a load. */ + TCGv_i64 temp = tcg_temp_ebb_new_i64(); + if (tcg_ctx->addr_type == TCG_TYPE_I32) { + tcg_gen_extu_i32_i64(temp, temp_tcgv_i32(addr)); + } else { + tcg_gen_mov_i64(temp, temp_tcgv_i64(addr)); + } + return temp; + } +#endif + return NULL; +} + +static void +plugin_gen_mem_callbacks(TCGv_i64 copy_addr, TCGTemp *orig_addr, MemOpIdx oi, + enum qemu_plugin_mem_rw rw) +{ +#ifdef CONFIG_PLUGIN + if (tcg_ctx->plugin_insn != NULL) { + qemu_plugin_meminfo_t info = make_plugin_meminfo(oi, rw); + + if (tcg_ctx->addr_type == TCG_TYPE_I32) { + if (!copy_addr) { + copy_addr = tcg_temp_ebb_new_i64(); + tcg_gen_extu_i32_i64(copy_addr, temp_tcgv_i32(orig_addr)); + } + plugin_gen_empty_mem_callback(copy_addr, info); + tcg_temp_free_i64(copy_addr); + } else { + if (copy_addr) { + plugin_gen_empty_mem_callback(copy_addr, info); + tcg_temp_free_i64(copy_addr); + } else { + plugin_gen_empty_mem_callback(temp_tcgv_i64(orig_addr), info); + } + } + } +#endif +} + +static void tcg_gen_qemu_ld_i32_int(TCGv_i32 val, TCGTemp *addr, + TCGArg idx, MemOp memop) +{ + MemOp orig_memop; + MemOpIdx orig_oi, oi; + TCGv_i64 copy_addr; + TCGOpcode opc; + + tcg_gen_req_mo(TCG_MO_LD_LD | TCG_MO_ST_LD); + orig_memop = memop = tcg_canonicalize_memop(memop, 0, 0); + orig_oi = oi = make_memop_idx(memop, idx); + + if ((memop & MO_BSWAP) && !tcg_target_has_memory_bswap(memop)) { + memop &= ~MO_BSWAP; + /* The bswap primitive benefits from zero-extended input. */ + if ((memop & MO_SSIZE) == MO_SW) { + memop &= ~MO_SIGN; + } + oi = make_memop_idx(memop, idx); + } + + copy_addr = plugin_maybe_preserve_addr(addr); + if (tcg_ctx->addr_type == TCG_TYPE_I32) { + opc = INDEX_op_qemu_ld_a32_i32; + } else { + opc = INDEX_op_qemu_ld_a64_i32; + } + gen_ldst(opc, tcgv_i32_temp(val), NULL, addr, oi); + plugin_gen_mem_callbacks(copy_addr, addr, orig_oi, QEMU_PLUGIN_MEM_R); + + if ((orig_memop ^ memop) & MO_BSWAP) { + switch (orig_memop & MO_SIZE) { + case MO_16: + tcg_gen_bswap16_i32(val, val, (orig_memop & MO_SIGN + ? 
TCG_BSWAP_IZ | TCG_BSWAP_OS + : TCG_BSWAP_IZ | TCG_BSWAP_OZ)); + break; + case MO_32: + tcg_gen_bswap32_i32(val, val); + break; + default: + g_assert_not_reached(); + } + } +} + +void tcg_gen_qemu_ld_i32_chk(TCGv_i32 val, TCGTemp *addr, TCGArg idx, + MemOp memop, TCGType addr_type) +{ + tcg_debug_assert(addr_type == tcg_ctx->addr_type); + tcg_debug_assert((memop & MO_SIZE) <= MO_32); + tcg_gen_qemu_ld_i32_int(val, addr, idx, memop); +} + +static void tcg_gen_qemu_st_i32_int(TCGv_i32 val, TCGTemp *addr, + TCGArg idx, MemOp memop) +{ + TCGv_i32 swap = NULL; + MemOpIdx orig_oi, oi; + TCGOpcode opc; + + tcg_gen_req_mo(TCG_MO_LD_ST | TCG_MO_ST_ST); + memop = tcg_canonicalize_memop(memop, 0, 1); + orig_oi = oi = make_memop_idx(memop, idx); + + if ((memop & MO_BSWAP) && !tcg_target_has_memory_bswap(memop)) { + swap = tcg_temp_ebb_new_i32(); + switch (memop & MO_SIZE) { + case MO_16: + tcg_gen_bswap16_i32(swap, val, 0); + break; + case MO_32: + tcg_gen_bswap32_i32(swap, val); + break; + default: + g_assert_not_reached(); + } + val = swap; + memop &= ~MO_BSWAP; + oi = make_memop_idx(memop, idx); + } + + if (TCG_TARGET_HAS_qemu_st8_i32 && (memop & MO_SIZE) == MO_8) { + if (tcg_ctx->addr_type == TCG_TYPE_I32) { + opc = INDEX_op_qemu_st8_a32_i32; + } else { + opc = INDEX_op_qemu_st8_a64_i32; + } + } else { + if (tcg_ctx->addr_type == TCG_TYPE_I32) { + opc = INDEX_op_qemu_st_a32_i32; + } else { + opc = INDEX_op_qemu_st_a64_i32; + } + } + gen_ldst(opc, tcgv_i32_temp(val), NULL, addr, oi); + plugin_gen_mem_callbacks(NULL, addr, orig_oi, QEMU_PLUGIN_MEM_W); + + if (swap) { + tcg_temp_free_i32(swap); + } +} + +void tcg_gen_qemu_st_i32_chk(TCGv_i32 val, TCGTemp *addr, TCGArg idx, + MemOp memop, TCGType addr_type) +{ + tcg_debug_assert(addr_type == tcg_ctx->addr_type); + tcg_debug_assert((memop & MO_SIZE) <= MO_32); + tcg_gen_qemu_st_i32_int(val, addr, idx, memop); +} + +static void tcg_gen_qemu_ld_i64_int(TCGv_i64 val, TCGTemp *addr, + TCGArg idx, MemOp memop) +{ + MemOp orig_memop; + MemOpIdx orig_oi, oi; + TCGv_i64 copy_addr; + TCGOpcode opc; + + if (TCG_TARGET_REG_BITS == 32 && (memop & MO_SIZE) < MO_64) { + tcg_gen_qemu_ld_i32_int(TCGV_LOW(val), addr, idx, memop); + if (memop & MO_SIGN) { + tcg_gen_sari_i32(TCGV_HIGH(val), TCGV_LOW(val), 31); + } else { + tcg_gen_movi_i32(TCGV_HIGH(val), 0); + } + return; + } + + tcg_gen_req_mo(TCG_MO_LD_LD | TCG_MO_ST_LD); + orig_memop = memop = tcg_canonicalize_memop(memop, 1, 0); + orig_oi = oi = make_memop_idx(memop, idx); + + if ((memop & MO_BSWAP) && !tcg_target_has_memory_bswap(memop)) { + memop &= ~MO_BSWAP; + /* The bswap primitive benefits from zero-extended input. */ + if ((memop & MO_SIGN) && (memop & MO_SIZE) < MO_64) { + memop &= ~MO_SIGN; + } + oi = make_memop_idx(memop, idx); + } + + copy_addr = plugin_maybe_preserve_addr(addr); + if (tcg_ctx->addr_type == TCG_TYPE_I32) { + opc = INDEX_op_qemu_ld_a32_i64; + } else { + opc = INDEX_op_qemu_ld_a64_i64; + } + gen_ldst_i64(opc, val, addr, oi); + plugin_gen_mem_callbacks(copy_addr, addr, orig_oi, QEMU_PLUGIN_MEM_R); + + if ((orig_memop ^ memop) & MO_BSWAP) { + int flags = (orig_memop & MO_SIGN + ? 
TCG_BSWAP_IZ | TCG_BSWAP_OS + : TCG_BSWAP_IZ | TCG_BSWAP_OZ); + switch (orig_memop & MO_SIZE) { + case MO_16: + tcg_gen_bswap16_i64(val, val, flags); + break; + case MO_32: + tcg_gen_bswap32_i64(val, val, flags); + break; + case MO_64: + tcg_gen_bswap64_i64(val, val); + break; + default: + g_assert_not_reached(); + } + } +} + +void tcg_gen_qemu_ld_i64_chk(TCGv_i64 val, TCGTemp *addr, TCGArg idx, + MemOp memop, TCGType addr_type) +{ + tcg_debug_assert(addr_type == tcg_ctx->addr_type); + tcg_debug_assert((memop & MO_SIZE) <= MO_64); + tcg_gen_qemu_ld_i64_int(val, addr, idx, memop); +} + +static void tcg_gen_qemu_st_i64_int(TCGv_i64 val, TCGTemp *addr, + TCGArg idx, MemOp memop) +{ + TCGv_i64 swap = NULL; + MemOpIdx orig_oi, oi; + TCGOpcode opc; + + if (TCG_TARGET_REG_BITS == 32 && (memop & MO_SIZE) < MO_64) { + tcg_gen_qemu_st_i32_int(TCGV_LOW(val), addr, idx, memop); + return; + } + + tcg_gen_req_mo(TCG_MO_LD_ST | TCG_MO_ST_ST); + memop = tcg_canonicalize_memop(memop, 1, 1); + orig_oi = oi = make_memop_idx(memop, idx); + + if ((memop & MO_BSWAP) && !tcg_target_has_memory_bswap(memop)) { + swap = tcg_temp_ebb_new_i64(); + switch (memop & MO_SIZE) { + case MO_16: + tcg_gen_bswap16_i64(swap, val, 0); + break; + case MO_32: + tcg_gen_bswap32_i64(swap, val, 0); + break; + case MO_64: + tcg_gen_bswap64_i64(swap, val); + break; + default: + g_assert_not_reached(); + } + val = swap; + memop &= ~MO_BSWAP; + oi = make_memop_idx(memop, idx); + } + + if (tcg_ctx->addr_type == TCG_TYPE_I32) { + opc = INDEX_op_qemu_st_a32_i64; + } else { + opc = INDEX_op_qemu_st_a64_i64; + } + gen_ldst_i64(opc, val, addr, oi); + plugin_gen_mem_callbacks(NULL, addr, orig_oi, QEMU_PLUGIN_MEM_W); + + if (swap) { + tcg_temp_free_i64(swap); + } +} + +void tcg_gen_qemu_st_i64_chk(TCGv_i64 val, TCGTemp *addr, TCGArg idx, + MemOp memop, TCGType addr_type) +{ + tcg_debug_assert(addr_type == tcg_ctx->addr_type); + tcg_debug_assert((memop & MO_SIZE) <= MO_64); + tcg_gen_qemu_st_i64_int(val, addr, idx, memop); +} + +/* + * Return true if @mop, without knowledge of the pointer alignment, + * does not require 16-byte atomicity, and it would be advantageous + * to avoid a call to a helper function. + */ +static bool use_two_i64_for_i128(MemOp mop) +{ +#ifdef CONFIG_SOFTMMU + /* Two softmmu tlb lookups are larger than one function call. */ + return false; +#else + /* + * For user-only, two 64-bit operations may well be smaller than a call. + * Determine if that would be legal for the requested atomicity. + */ + switch (mop & MO_ATOM_MASK) { + case MO_ATOM_NONE: + case MO_ATOM_IFALIGN_PAIR: + return true; + case MO_ATOM_IFALIGN: + case MO_ATOM_SUBALIGN: + case MO_ATOM_WITHIN16: + case MO_ATOM_WITHIN16_PAIR: + /* In a serialized context, no atomicity is required. */ + return !(tcg_ctx->gen_tb->cflags & CF_PARALLEL); + default: + g_assert_not_reached(); + } +#endif +} + +static void canonicalize_memop_i128_as_i64(MemOp ret[2], MemOp orig) +{ + MemOp mop_1 = orig, mop_2; + + /* Reduce the size to 64-bit. */ + mop_1 = (mop_1 & ~MO_SIZE) | MO_64; + + /* Retain the alignment constraints of the original. */ + switch (orig & MO_AMASK) { + case MO_UNALN: + case MO_ALIGN_2: + case MO_ALIGN_4: + mop_2 = mop_1; + break; + case MO_ALIGN_8: + /* Prefer MO_ALIGN+MO_64 to MO_ALIGN_8+MO_64. */ + mop_1 = (mop_1 & ~MO_AMASK) | MO_ALIGN; + mop_2 = mop_1; + break; + case MO_ALIGN: + /* Second has 8-byte alignment; first has 16-byte alignment.
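+ * (MO_ALIGN requests natural alignment: 16 bytes for the original + * access, 8 bytes for each 64-bit half.)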
*/ + mop_2 = mop_1; + mop_1 = (mop_1 & ~MO_AMASK) | MO_ALIGN_16; + break; + case MO_ALIGN_16: + case MO_ALIGN_32: + case MO_ALIGN_64: + /* Second has 8-byte alignment; first retains original. */ + mop_2 = (mop_1 & ~MO_AMASK) | MO_ALIGN; + break; + default: + g_assert_not_reached(); + } + + /* Use a memory ordering implemented by the host. */ + if ((orig & MO_BSWAP) && !tcg_target_has_memory_bswap(mop_1)) { + mop_1 &= ~MO_BSWAP; + mop_2 &= ~MO_BSWAP; + } + + ret[0] = mop_1; + ret[1] = mop_2; +} + +static TCGv_i64 maybe_extend_addr64(TCGTemp *addr) +{ + if (tcg_ctx->addr_type == TCG_TYPE_I32) { + TCGv_i64 a64 = tcg_temp_ebb_new_i64(); + tcg_gen_extu_i32_i64(a64, temp_tcgv_i32(addr)); + return a64; + } + return temp_tcgv_i64(addr); +} + +static void maybe_free_addr64(TCGv_i64 a64) +{ + if (tcg_ctx->addr_type == TCG_TYPE_I32) { + tcg_temp_free_i64(a64); + } +} + +static void tcg_gen_qemu_ld_i128_int(TCGv_i128 val, TCGTemp *addr, + TCGArg idx, MemOp memop) +{ + const MemOpIdx orig_oi = make_memop_idx(memop, idx); + TCGv_i64 ext_addr = NULL; + TCGOpcode opc; + + tcg_gen_req_mo(TCG_MO_LD_LD | TCG_MO_ST_LD); + + /* TODO: For now, force 32-bit hosts to use the helper. */ + if (TCG_TARGET_HAS_qemu_ldst_i128 && TCG_TARGET_REG_BITS == 64) { + TCGv_i64 lo, hi; + bool need_bswap = false; + MemOpIdx oi = orig_oi; + + if ((memop & MO_BSWAP) && !tcg_target_has_memory_bswap(memop)) { + lo = TCGV128_HIGH(val); + hi = TCGV128_LOW(val); + oi = make_memop_idx(memop & ~MO_BSWAP, idx); + need_bswap = true; + } else { + lo = TCGV128_LOW(val); + hi = TCGV128_HIGH(val); + } + + if (tcg_ctx->addr_type == TCG_TYPE_I32) { + opc = INDEX_op_qemu_ld_a32_i128; + } else { + opc = INDEX_op_qemu_ld_a64_i128; + } + gen_ldst(opc, tcgv_i64_temp(lo), tcgv_i64_temp(hi), addr, oi); + + if (need_bswap) { + tcg_gen_bswap64_i64(lo, lo); + tcg_gen_bswap64_i64(hi, hi); + } + } else if (use_two_i64_for_i128(memop)) { + MemOp mop[2]; + TCGTemp *addr_p8; + TCGv_i64 x, y; + bool need_bswap; + + canonicalize_memop_i128_as_i64(mop, memop); + need_bswap = (mop[0] ^ memop) & MO_BSWAP; + + if (tcg_ctx->addr_type == TCG_TYPE_I32) { + opc = INDEX_op_qemu_ld_a32_i64; + } else { + opc = INDEX_op_qemu_ld_a64_i64; + } + + /* + * Since there are no global TCGv_i128, there is no visible state + * changed if the second load faults. Load directly into the two + * subwords. 
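+ * (If the second load faults, the exception unwinds and the insn + * restarts; the guest never observes a half-updated i128.)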
+ */ + if ((memop & MO_BSWAP) == MO_LE) { + x = TCGV128_LOW(val); + y = TCGV128_HIGH(val); + } else { + x = TCGV128_HIGH(val); + y = TCGV128_LOW(val); + } + + gen_ldst_i64(opc, x, addr, make_memop_idx(mop[0], idx)); + + if (need_bswap) { + tcg_gen_bswap64_i64(x, x); + } + + if (tcg_ctx->addr_type == TCG_TYPE_I32) { + TCGv_i32 t = tcg_temp_ebb_new_i32(); + tcg_gen_addi_i32(t, temp_tcgv_i32(addr), 8); + addr_p8 = tcgv_i32_temp(t); + } else { + TCGv_i64 t = tcg_temp_ebb_new_i64(); + tcg_gen_addi_i64(t, temp_tcgv_i64(addr), 8); + addr_p8 = tcgv_i64_temp(t); + } + + gen_ldst_i64(opc, y, addr_p8, make_memop_idx(mop[1], idx)); + tcg_temp_free_internal(addr_p8); + + if (need_bswap) { + tcg_gen_bswap64_i64(y, y); + } + } else { + if (tcg_ctx->addr_type == TCG_TYPE_I32) { + ext_addr = tcg_temp_ebb_new_i64(); + tcg_gen_extu_i32_i64(ext_addr, temp_tcgv_i32(addr)); + addr = tcgv_i64_temp(ext_addr); + } + gen_helper_ld_i128(val, cpu_env, temp_tcgv_i64(addr), + tcg_constant_i32(orig_oi)); + } + + plugin_gen_mem_callbacks(ext_addr, addr, orig_oi, QEMU_PLUGIN_MEM_R); +} + +void tcg_gen_qemu_ld_i128_chk(TCGv_i128 val, TCGTemp *addr, TCGArg idx, + MemOp memop, TCGType addr_type) +{ + tcg_debug_assert(addr_type == tcg_ctx->addr_type); + tcg_debug_assert((memop & MO_SIZE) == MO_128); + tcg_debug_assert((memop & MO_SIGN) == 0); + tcg_gen_qemu_ld_i128_int(val, addr, idx, memop); +} + +static void tcg_gen_qemu_st_i128_int(TCGv_i128 val, TCGTemp *addr, + TCGArg idx, MemOp memop) +{ + const MemOpIdx orig_oi = make_memop_idx(memop, idx); + TCGv_i64 ext_addr = NULL; + TCGOpcode opc; + + tcg_gen_req_mo(TCG_MO_ST_LD | TCG_MO_ST_ST); + + /* TODO: For now, force 32-bit hosts to use the helper. */ + + if (TCG_TARGET_HAS_qemu_ldst_i128 && TCG_TARGET_REG_BITS == 64) { + TCGv_i64 lo, hi; + MemOpIdx oi = orig_oi; + bool need_bswap = false; + + if ((memop & MO_BSWAP) && !tcg_target_has_memory_bswap(memop)) { + lo = tcg_temp_ebb_new_i64(); + hi = tcg_temp_ebb_new_i64(); + tcg_gen_bswap64_i64(lo, TCGV128_HIGH(val)); + tcg_gen_bswap64_i64(hi, TCGV128_LOW(val)); + oi = make_memop_idx(memop & ~MO_BSWAP, idx); + need_bswap = true; + } else { + lo = TCGV128_LOW(val); + hi = TCGV128_HIGH(val); + } + + if (tcg_ctx->addr_type == TCG_TYPE_I32) { + opc = INDEX_op_qemu_st_a32_i128; + } else { + opc = INDEX_op_qemu_st_a64_i128; + } + gen_ldst(opc, tcgv_i64_temp(lo), tcgv_i64_temp(hi), addr, oi); + + if (need_bswap) { + tcg_temp_free_i64(lo); + tcg_temp_free_i64(hi); + } + } else if (use_two_i64_for_i128(memop)) { + MemOp mop[2]; + TCGTemp *addr_p8; + TCGv_i64 x, y, b = NULL; + + canonicalize_memop_i128_as_i64(mop, memop); + + if (tcg_ctx->addr_type == TCG_TYPE_I32) { + opc = INDEX_op_qemu_st_a32_i64; + } else { + opc = INDEX_op_qemu_st_a64_i64; + } + + if ((memop & MO_BSWAP) == MO_LE) { + x = TCGV128_LOW(val); + y = TCGV128_HIGH(val); + } else { + x = TCGV128_HIGH(val); + y = TCGV128_LOW(val); + } + + if ((mop[0] ^ memop) & MO_BSWAP) { + b = tcg_temp_ebb_new_i64(); + tcg_gen_bswap64_i64(b, x); + x = b; + } + + gen_ldst_i64(opc, x, addr, make_memop_idx(mop[0], idx)); + + if (tcg_ctx->addr_type == TCG_TYPE_I32) { + TCGv_i32 t = tcg_temp_ebb_new_i32(); + tcg_gen_addi_i32(t, temp_tcgv_i32(addr), 8); + addr_p8 = tcgv_i32_temp(t); + } else { + TCGv_i64 t = tcg_temp_ebb_new_i64(); + tcg_gen_addi_i64(t, temp_tcgv_i64(addr), 8); + addr_p8 = tcgv_i64_temp(t); + } + + if (b) { + tcg_gen_bswap64_i64(b, y); + gen_ldst_i64(opc, b, addr_p8, make_memop_idx(mop[1], idx)); + tcg_temp_free_i64(b); + } else { + gen_ldst_i64(opc, y, addr_p8, 
make_memop_idx(mop[1], idx)); + } + tcg_temp_free_internal(addr_p8); + } else { + if (tcg_ctx->addr_type == TCG_TYPE_I32) { + ext_addr = tcg_temp_ebb_new_i64(); + tcg_gen_extu_i32_i64(ext_addr, temp_tcgv_i32(addr)); + addr = tcgv_i64_temp(ext_addr); + } + gen_helper_st_i128(cpu_env, temp_tcgv_i64(addr), val, + tcg_constant_i32(orig_oi)); + } + + plugin_gen_mem_callbacks(ext_addr, addr, orig_oi, QEMU_PLUGIN_MEM_W); +} + +void tcg_gen_qemu_st_i128_chk(TCGv_i128 val, TCGTemp *addr, TCGArg idx, + MemOp memop, TCGType addr_type) +{ + tcg_debug_assert(addr_type == tcg_ctx->addr_type); + tcg_debug_assert((memop & MO_SIZE) == MO_128); + tcg_debug_assert((memop & MO_SIGN) == 0); + tcg_gen_qemu_st_i128_int(val, addr, idx, memop); +} + +static void tcg_gen_ext_i32(TCGv_i32 ret, TCGv_i32 val, MemOp opc) +{ + switch (opc & MO_SSIZE) { + case MO_SB: + tcg_gen_ext8s_i32(ret, val); + break; + case MO_UB: + tcg_gen_ext8u_i32(ret, val); + break; + case MO_SW: + tcg_gen_ext16s_i32(ret, val); + break; + case MO_UW: + tcg_gen_ext16u_i32(ret, val); + break; + default: + tcg_gen_mov_i32(ret, val); + break; + } +} + +static void tcg_gen_ext_i64(TCGv_i64 ret, TCGv_i64 val, MemOp opc) +{ + switch (opc & MO_SSIZE) { + case MO_SB: + tcg_gen_ext8s_i64(ret, val); + break; + case MO_UB: + tcg_gen_ext8u_i64(ret, val); + break; + case MO_SW: + tcg_gen_ext16s_i64(ret, val); + break; + case MO_UW: + tcg_gen_ext16u_i64(ret, val); + break; + case MO_SL: + tcg_gen_ext32s_i64(ret, val); + break; + case MO_UL: + tcg_gen_ext32u_i64(ret, val); + break; + default: + tcg_gen_mov_i64(ret, val); + break; + } +} + +typedef void (*gen_atomic_cx_i32)(TCGv_i32, TCGv_env, TCGv_i64, + TCGv_i32, TCGv_i32, TCGv_i32); +typedef void (*gen_atomic_cx_i64)(TCGv_i64, TCGv_env, TCGv_i64, + TCGv_i64, TCGv_i64, TCGv_i32); +typedef void (*gen_atomic_cx_i128)(TCGv_i128, TCGv_env, TCGv_i64, + TCGv_i128, TCGv_i128, TCGv_i32); +typedef void (*gen_atomic_op_i32)(TCGv_i32, TCGv_env, TCGv_i64, + TCGv_i32, TCGv_i32); +typedef void (*gen_atomic_op_i64)(TCGv_i64, TCGv_env, TCGv_i64, + TCGv_i64, TCGv_i32); + +#ifdef CONFIG_ATOMIC64 +# define WITH_ATOMIC64(X) X, +#else +# define WITH_ATOMIC64(X) +#endif +#ifdef CONFIG_CMPXCHG128 +# define WITH_ATOMIC128(X) X, +#else +# define WITH_ATOMIC128(X) +#endif + +static void * const table_cmpxchg[(MO_SIZE | MO_BSWAP) + 1] = { + [MO_8] = gen_helper_atomic_cmpxchgb, + [MO_16 | MO_LE] = gen_helper_atomic_cmpxchgw_le, + [MO_16 | MO_BE] = gen_helper_atomic_cmpxchgw_be, + [MO_32 | MO_LE] = gen_helper_atomic_cmpxchgl_le, + [MO_32 | MO_BE] = gen_helper_atomic_cmpxchgl_be, + WITH_ATOMIC64([MO_64 | MO_LE] = gen_helper_atomic_cmpxchgq_le) + WITH_ATOMIC64([MO_64 | MO_BE] = gen_helper_atomic_cmpxchgq_be) + WITH_ATOMIC128([MO_128 | MO_LE] = gen_helper_atomic_cmpxchgo_le) + WITH_ATOMIC128([MO_128 | MO_BE] = gen_helper_atomic_cmpxchgo_be) +}; + +static void tcg_gen_nonatomic_cmpxchg_i32_int(TCGv_i32 retv, TCGTemp *addr, + TCGv_i32 cmpv, TCGv_i32 newv, + TCGArg idx, MemOp memop) +{ + TCGv_i32 t1 = tcg_temp_ebb_new_i32(); + TCGv_i32 t2 = tcg_temp_ebb_new_i32(); + + tcg_gen_ext_i32(t2, cmpv, memop & MO_SIZE); + + tcg_gen_qemu_ld_i32_int(t1, addr, idx, memop & ~MO_SIGN); + tcg_gen_movcond_i32(TCG_COND_EQ, t2, t1, t2, newv, t1); + tcg_gen_qemu_st_i32_int(t2, addr, idx, memop); + tcg_temp_free_i32(t2); + + if (memop & MO_SIGN) { + tcg_gen_ext_i32(retv, t1, memop); + } else { + tcg_gen_mov_i32(retv, t1); + } + tcg_temp_free_i32(t1); +} + +void tcg_gen_nonatomic_cmpxchg_i32_chk(TCGv_i32 retv, TCGTemp *addr, + TCGv_i32 cmpv, TCGv_i32 newv, + 
TCGArg idx, MemOp memop, + TCGType addr_type) +{ + tcg_debug_assert(addr_type == tcg_ctx->addr_type); + tcg_debug_assert((memop & MO_SIZE) <= MO_32); + tcg_gen_nonatomic_cmpxchg_i32_int(retv, addr, cmpv, newv, idx, memop); +} + +static void tcg_gen_atomic_cmpxchg_i32_int(TCGv_i32 retv, TCGTemp *addr, + TCGv_i32 cmpv, TCGv_i32 newv, + TCGArg idx, MemOp memop) +{ + gen_atomic_cx_i32 gen; + TCGv_i64 a64; + MemOpIdx oi; + + if (!(tcg_ctx->gen_tb->cflags & CF_PARALLEL)) { + tcg_gen_nonatomic_cmpxchg_i32_int(retv, addr, cmpv, newv, idx, memop); + return; + } + + memop = tcg_canonicalize_memop(memop, 0, 0); + gen = table_cmpxchg[memop & (MO_SIZE | MO_BSWAP)]; + tcg_debug_assert(gen != NULL); + + oi = make_memop_idx(memop & ~MO_SIGN, idx); + a64 = maybe_extend_addr64(addr); + gen(retv, cpu_env, a64, cmpv, newv, tcg_constant_i32(oi)); + maybe_free_addr64(a64); + + if (memop & MO_SIGN) { + tcg_gen_ext_i32(retv, retv, memop); + } +} + +void tcg_gen_atomic_cmpxchg_i32_chk(TCGv_i32 retv, TCGTemp *addr, + TCGv_i32 cmpv, TCGv_i32 newv, + TCGArg idx, MemOp memop, + TCGType addr_type) +{ + tcg_debug_assert(addr_type == tcg_ctx->addr_type); + tcg_debug_assert((memop & MO_SIZE) <= MO_32); + tcg_gen_atomic_cmpxchg_i32_int(retv, addr, cmpv, newv, idx, memop); +} + +static void tcg_gen_nonatomic_cmpxchg_i64_int(TCGv_i64 retv, TCGTemp *addr, + TCGv_i64 cmpv, TCGv_i64 newv, + TCGArg idx, MemOp memop) +{ + TCGv_i64 t1, t2; + + if (TCG_TARGET_REG_BITS == 32 && (memop & MO_SIZE) < MO_64) { + tcg_gen_nonatomic_cmpxchg_i32_int(TCGV_LOW(retv), addr, TCGV_LOW(cmpv), + TCGV_LOW(newv), idx, memop); + if (memop & MO_SIGN) { + tcg_gen_sari_i32(TCGV_HIGH(retv), TCGV_LOW(retv), 31); + } else { + tcg_gen_movi_i32(TCGV_HIGH(retv), 0); + } + return; + } + + t1 = tcg_temp_ebb_new_i64(); + t2 = tcg_temp_ebb_new_i64(); + + tcg_gen_ext_i64(t2, cmpv, memop & MO_SIZE); + + tcg_gen_qemu_ld_i64_int(t1, addr, idx, memop & ~MO_SIGN); + tcg_gen_movcond_i64(TCG_COND_EQ, t2, t1, t2, newv, t1); + tcg_gen_qemu_st_i64_int(t2, addr, idx, memop); + tcg_temp_free_i64(t2); + + if (memop & MO_SIGN) { + tcg_gen_ext_i64(retv, t1, memop); + } else { + tcg_gen_mov_i64(retv, t1); + } + tcg_temp_free_i64(t1); +} + +void tcg_gen_nonatomic_cmpxchg_i64_chk(TCGv_i64 retv, TCGTemp *addr, + TCGv_i64 cmpv, TCGv_i64 newv, + TCGArg idx, MemOp memop, + TCGType addr_type) +{ + tcg_debug_assert(addr_type == tcg_ctx->addr_type); + tcg_debug_assert((memop & MO_SIZE) <= MO_64); + tcg_gen_nonatomic_cmpxchg_i64_int(retv, addr, cmpv, newv, idx, memop); +} + +static void tcg_gen_atomic_cmpxchg_i64_int(TCGv_i64 retv, TCGTemp *addr, + TCGv_i64 cmpv, TCGv_i64 newv, + TCGArg idx, MemOp memop) +{ + if (!(tcg_ctx->gen_tb->cflags & CF_PARALLEL)) { + tcg_gen_nonatomic_cmpxchg_i64_int(retv, addr, cmpv, newv, idx, memop); + return; + } + + if ((memop & MO_SIZE) == MO_64) { + gen_atomic_cx_i64 gen; + + memop = tcg_canonicalize_memop(memop, 1, 0); + gen = table_cmpxchg[memop & (MO_SIZE | MO_BSWAP)]; + if (gen) { + MemOpIdx oi = make_memop_idx(memop, idx); + TCGv_i64 a64 = maybe_extend_addr64(addr); + gen(retv, cpu_env, a64, cmpv, newv, tcg_constant_i32(oi)); + maybe_free_addr64(a64); + return; + } + + gen_helper_exit_atomic(cpu_env); + + /* + * Produce a result for a well-formed opcode stream. This satisfies + * liveness for set before used, which happens before this dead code + * is removed. 
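+ * (gen_helper_exit_atomic raises EXCP_ATOMIC and does not return, + * so the zero stored below is never observed at runtime.)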
+ */ + tcg_gen_movi_i64(retv, 0); + return; + } + + if (TCG_TARGET_REG_BITS == 32) { + tcg_gen_atomic_cmpxchg_i32_int(TCGV_LOW(retv), addr, TCGV_LOW(cmpv), + TCGV_LOW(newv), idx, memop); + if (memop & MO_SIGN) { + tcg_gen_sari_i32(TCGV_HIGH(retv), TCGV_LOW(retv), 31); + } else { + tcg_gen_movi_i32(TCGV_HIGH(retv), 0); + } + } else { + TCGv_i32 c32 = tcg_temp_ebb_new_i32(); + TCGv_i32 n32 = tcg_temp_ebb_new_i32(); + TCGv_i32 r32 = tcg_temp_ebb_new_i32(); + + tcg_gen_extrl_i64_i32(c32, cmpv); + tcg_gen_extrl_i64_i32(n32, newv); + tcg_gen_atomic_cmpxchg_i32_int(r32, addr, c32, n32, + idx, memop & ~MO_SIGN); + tcg_temp_free_i32(c32); + tcg_temp_free_i32(n32); + + tcg_gen_extu_i32_i64(retv, r32); + tcg_temp_free_i32(r32); + + if (memop & MO_SIGN) { + tcg_gen_ext_i64(retv, retv, memop); + } + } +} + +void tcg_gen_atomic_cmpxchg_i64_chk(TCGv_i64 retv, TCGTemp *addr, + TCGv_i64 cmpv, TCGv_i64 newv, + TCGArg idx, MemOp memop, TCGType addr_type) +{ + tcg_debug_assert(addr_type == tcg_ctx->addr_type); + tcg_debug_assert((memop & MO_SIZE) <= MO_64); + tcg_gen_atomic_cmpxchg_i64_int(retv, addr, cmpv, newv, idx, memop); +} + +static void tcg_gen_nonatomic_cmpxchg_i128_int(TCGv_i128 retv, TCGTemp *addr, + TCGv_i128 cmpv, TCGv_i128 newv, + TCGArg idx, MemOp memop) +{ + if (TCG_TARGET_REG_BITS == 32) { + /* Inline expansion below is simply too large for 32-bit hosts. */ + gen_atomic_cx_i128 gen = ((memop & MO_BSWAP) == MO_LE + ? gen_helper_nonatomic_cmpxchgo_le + : gen_helper_nonatomic_cmpxchgo_be); + MemOpIdx oi = make_memop_idx(memop, idx); + TCGv_i64 a64 = maybe_extend_addr64(addr); + + gen(retv, cpu_env, a64, cmpv, newv, tcg_constant_i32(oi)); + maybe_free_addr64(a64); + } else { + TCGv_i128 oldv = tcg_temp_ebb_new_i128(); + TCGv_i128 tmpv = tcg_temp_ebb_new_i128(); + TCGv_i64 t0 = tcg_temp_ebb_new_i64(); + TCGv_i64 t1 = tcg_temp_ebb_new_i64(); + TCGv_i64 z = tcg_constant_i64(0); + + tcg_gen_qemu_ld_i128_int(oldv, addr, idx, memop); + + /* Compare i128 */ + tcg_gen_xor_i64(t0, TCGV128_LOW(oldv), TCGV128_LOW(cmpv)); + tcg_gen_xor_i64(t1, TCGV128_HIGH(oldv), TCGV128_HIGH(cmpv)); + tcg_gen_or_i64(t0, t0, t1); + + /* tmpv = equal ? newv : oldv */ + tcg_gen_movcond_i64(TCG_COND_EQ, TCGV128_LOW(tmpv), t0, z, + TCGV128_LOW(newv), TCGV128_LOW(oldv)); + tcg_gen_movcond_i64(TCG_COND_EQ, TCGV128_HIGH(tmpv), t0, z, + TCGV128_HIGH(newv), TCGV128_HIGH(oldv)); + + /* Unconditional writeback. 
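+ * On failure tmpv equals oldv, so the store rewrites the same + * value; cheaper than branching around the store.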
*/ + tcg_gen_qemu_st_i128_int(tmpv, addr, idx, memop); + tcg_gen_mov_i128(retv, oldv); + + tcg_temp_free_i64(t0); + tcg_temp_free_i64(t1); + tcg_temp_free_i128(tmpv); + tcg_temp_free_i128(oldv); + } +} + +void tcg_gen_nonatomic_cmpxchg_i128_chk(TCGv_i128 retv, TCGTemp *addr, + TCGv_i128 cmpv, TCGv_i128 newv, + TCGArg idx, MemOp memop, + TCGType addr_type) +{ + tcg_debug_assert(addr_type == tcg_ctx->addr_type); + tcg_debug_assert((memop & (MO_SIZE | MO_SIGN)) == MO_128); + tcg_gen_nonatomic_cmpxchg_i128_int(retv, addr, cmpv, newv, idx, memop); +} + +static void tcg_gen_atomic_cmpxchg_i128_int(TCGv_i128 retv, TCGTemp *addr, + TCGv_i128 cmpv, TCGv_i128 newv, + TCGArg idx, MemOp memop) +{ + gen_atomic_cx_i128 gen; + + if (!(tcg_ctx->gen_tb->cflags & CF_PARALLEL)) { + tcg_gen_nonatomic_cmpxchg_i128_int(retv, addr, cmpv, newv, idx, memop); + return; + } + + gen = table_cmpxchg[memop & (MO_SIZE | MO_BSWAP)]; + if (gen) { + MemOpIdx oi = make_memop_idx(memop, idx); + TCGv_i64 a64 = maybe_extend_addr64(addr); + gen(retv, cpu_env, a64, cmpv, newv, tcg_constant_i32(oi)); + maybe_free_addr64(a64); + return; + } + + gen_helper_exit_atomic(cpu_env); + + /* + * Produce a result for a well-formed opcode stream. This satisfies + * liveness for set before used, which happens before this dead code + * is removed. + */ + tcg_gen_movi_i64(TCGV128_LOW(retv), 0); + tcg_gen_movi_i64(TCGV128_HIGH(retv), 0); +} + +void tcg_gen_atomic_cmpxchg_i128_chk(TCGv_i128 retv, TCGTemp *addr, + TCGv_i128 cmpv, TCGv_i128 newv, + TCGArg idx, MemOp memop, + TCGType addr_type) +{ + tcg_debug_assert(addr_type == tcg_ctx->addr_type); + tcg_debug_assert((memop & (MO_SIZE | MO_SIGN)) == MO_128); + tcg_gen_atomic_cmpxchg_i128_int(retv, addr, cmpv, newv, idx, memop); +} + +static void do_nonatomic_op_i32(TCGv_i32 ret, TCGTemp *addr, TCGv_i32 val, + TCGArg idx, MemOp memop, bool new_val, + void (*gen)(TCGv_i32, TCGv_i32, TCGv_i32)) +{ + TCGv_i32 t1 = tcg_temp_ebb_new_i32(); + TCGv_i32 t2 = tcg_temp_ebb_new_i32(); + + memop = tcg_canonicalize_memop(memop, 0, 0); + + tcg_gen_qemu_ld_i32_int(t1, addr, idx, memop); + tcg_gen_ext_i32(t2, val, memop); + gen(t2, t1, t2); + tcg_gen_qemu_st_i32_int(t2, addr, idx, memop); + + tcg_gen_ext_i32(ret, (new_val ? t2 : t1), memop); + tcg_temp_free_i32(t1); + tcg_temp_free_i32(t2); +} + +static void do_atomic_op_i32(TCGv_i32 ret, TCGTemp *addr, TCGv_i32 val, + TCGArg idx, MemOp memop, void * const table[]) +{ + gen_atomic_op_i32 gen; + TCGv_i64 a64; + MemOpIdx oi; + + memop = tcg_canonicalize_memop(memop, 0, 0); + + gen = table[memop & (MO_SIZE | MO_BSWAP)]; + tcg_debug_assert(gen != NULL); + + oi = make_memop_idx(memop & ~MO_SIGN, idx); + a64 = maybe_extend_addr64(addr); + gen(ret, cpu_env, a64, val, tcg_constant_i32(oi)); + maybe_free_addr64(a64); + + if (memop & MO_SIGN) { + tcg_gen_ext_i32(ret, ret, memop); + } +} + +static void do_nonatomic_op_i64(TCGv_i64 ret, TCGTemp *addr, TCGv_i64 val, + TCGArg idx, MemOp memop, bool new_val, + void (*gen)(TCGv_i64, TCGv_i64, TCGv_i64)) +{ + TCGv_i64 t1 = tcg_temp_ebb_new_i64(); + TCGv_i64 t2 = tcg_temp_ebb_new_i64(); + + memop = tcg_canonicalize_memop(memop, 1, 0); + + tcg_gen_qemu_ld_i64_int(t1, addr, idx, memop); + tcg_gen_ext_i64(t2, val, memop); + gen(t2, t1, t2); + tcg_gen_qemu_st_i64_int(t2, addr, idx, memop); + + tcg_gen_ext_i64(ret, (new_val ? 
t2 : t1), memop); + tcg_temp_free_i64(t1); + tcg_temp_free_i64(t2); +} + +static void do_atomic_op_i64(TCGv_i64 ret, TCGTemp *addr, TCGv_i64 val, + TCGArg idx, MemOp memop, void * const table[]) +{ + memop = tcg_canonicalize_memop(memop, 1, 0); + + if ((memop & MO_SIZE) == MO_64) { + gen_atomic_op_i64 gen = table[memop & (MO_SIZE | MO_BSWAP)]; + + if (gen) { + MemOpIdx oi = make_memop_idx(memop & ~MO_SIGN, idx); + TCGv_i64 a64 = maybe_extend_addr64(addr); + gen(ret, cpu_env, a64, val, tcg_constant_i32(oi)); + maybe_free_addr64(a64); + return; + } + + gen_helper_exit_atomic(cpu_env); + /* Produce a result, so that we have a well-formed opcode stream + with respect to uses of the result in the (dead) code following. */ + tcg_gen_movi_i64(ret, 0); + } else { + TCGv_i32 v32 = tcg_temp_ebb_new_i32(); + TCGv_i32 r32 = tcg_temp_ebb_new_i32(); + + tcg_gen_extrl_i64_i32(v32, val); + do_atomic_op_i32(r32, addr, v32, idx, memop & ~MO_SIGN, table); + tcg_temp_free_i32(v32); + + tcg_gen_extu_i32_i64(ret, r32); + tcg_temp_free_i32(r32); + + if (memop & MO_SIGN) { + tcg_gen_ext_i64(ret, ret, memop); + } + } +} + +#define GEN_ATOMIC_HELPER(NAME, OP, NEW) \ +static void * const table_##NAME[(MO_SIZE | MO_BSWAP) + 1] = { \ + [MO_8] = gen_helper_atomic_##NAME##b, \ + [MO_16 | MO_LE] = gen_helper_atomic_##NAME##w_le, \ + [MO_16 | MO_BE] = gen_helper_atomic_##NAME##w_be, \ + [MO_32 | MO_LE] = gen_helper_atomic_##NAME##l_le, \ + [MO_32 | MO_BE] = gen_helper_atomic_##NAME##l_be, \ + WITH_ATOMIC64([MO_64 | MO_LE] = gen_helper_atomic_##NAME##q_le) \ + WITH_ATOMIC64([MO_64 | MO_BE] = gen_helper_atomic_##NAME##q_be) \ +}; \ +void tcg_gen_atomic_##NAME##_i32_chk(TCGv_i32 ret, TCGTemp *addr, \ + TCGv_i32 val, TCGArg idx, \ + MemOp memop, TCGType addr_type) \ +{ \ + tcg_debug_assert(addr_type == tcg_ctx->addr_type); \ + tcg_debug_assert((memop & MO_SIZE) <= MO_32); \ + if (tcg_ctx->gen_tb->cflags & CF_PARALLEL) { \ + do_atomic_op_i32(ret, addr, val, idx, memop, table_##NAME); \ + } else { \ + do_nonatomic_op_i32(ret, addr, val, idx, memop, NEW, \ + tcg_gen_##OP##_i32); \ + } \ +} \ +void tcg_gen_atomic_##NAME##_i64_chk(TCGv_i64 ret, TCGTemp *addr, \ + TCGv_i64 val, TCGArg idx, \ + MemOp memop, TCGType addr_type) \ +{ \ + tcg_debug_assert(addr_type == tcg_ctx->addr_type); \ + tcg_debug_assert((memop & MO_SIZE) <= MO_64); \ + if (tcg_ctx->gen_tb->cflags & CF_PARALLEL) { \ + do_atomic_op_i64(ret, addr, val, idx, memop, table_##NAME); \ + } else { \ + do_nonatomic_op_i64(ret, addr, val, idx, memop, NEW, \ + tcg_gen_##OP##_i64); \ + } \ +} + +GEN_ATOMIC_HELPER(fetch_add, add, 0) +GEN_ATOMIC_HELPER(fetch_and, and, 0) +GEN_ATOMIC_HELPER(fetch_or, or, 0) +GEN_ATOMIC_HELPER(fetch_xor, xor, 0) +GEN_ATOMIC_HELPER(fetch_smin, smin, 0) +GEN_ATOMIC_HELPER(fetch_umin, umin, 0) +GEN_ATOMIC_HELPER(fetch_smax, smax, 0) +GEN_ATOMIC_HELPER(fetch_umax, umax, 0) + +GEN_ATOMIC_HELPER(add_fetch, add, 1) +GEN_ATOMIC_HELPER(and_fetch, and, 1) +GEN_ATOMIC_HELPER(or_fetch, or, 1) +GEN_ATOMIC_HELPER(xor_fetch, xor, 1) +GEN_ATOMIC_HELPER(smin_fetch, smin, 1) +GEN_ATOMIC_HELPER(umin_fetch, umin, 1) +GEN_ATOMIC_HELPER(smax_fetch, smax, 1) +GEN_ATOMIC_HELPER(umax_fetch, umax, 1) + +static void tcg_gen_mov2_i32(TCGv_i32 r, TCGv_i32 a, TCGv_i32 b) +{ + tcg_gen_mov_i32(r, b); +} + +static void tcg_gen_mov2_i64(TCGv_i64 r, TCGv_i64 a, TCGv_i64 b) +{ + tcg_gen_mov_i64(r, b); +} + +GEN_ATOMIC_HELPER(xchg, mov2, 0) + +#undef GEN_ATOMIC_HELPER diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c index 3136cef81a..edbd1c61d7 100644 --- a/tcg/tcg-op.c +++ b/tcg/tcg-op.c @@ 
-27,7 +27,6 @@ #include "tcg/tcg.h" #include "tcg/tcg-temp-internal.h" #include "tcg/tcg-op.h" -#include "tcg/tcg-mo.h" #include "exec/plugin-gen.h" #include "tcg-internal.h" @@ -2841,866 +2840,3 @@ void tcg_gen_lookup_and_goto_ptr(void) tcg_gen_op1i(INDEX_op_goto_ptr, tcgv_ptr_arg(ptr)); tcg_temp_free_ptr(ptr); } - -static inline MemOp tcg_canonicalize_memop(MemOp op, bool is64, bool st) -{ - /* Trigger the asserts within as early as possible. */ - unsigned a_bits = get_alignment_bits(op); - - /* Prefer MO_ALIGN+MO_XX over MO_ALIGN_XX+MO_XX */ - if (a_bits == (op & MO_SIZE)) { - op = (op & ~MO_AMASK) | MO_ALIGN; - } - - switch (op & MO_SIZE) { - case MO_8: - op &= ~MO_BSWAP; - break; - case MO_16: - break; - case MO_32: - if (!is64) { - op &= ~MO_SIGN; - } - break; - case MO_64: - if (is64) { - op &= ~MO_SIGN; - break; - } - /* fall through */ - default: - g_assert_not_reached(); - } - if (st) { - op &= ~MO_SIGN; - } - return op; -} - -static void gen_ldst_i32(TCGOpcode opc, TCGv_i32 val, TCGv addr, - MemOp memop, TCGArg idx) -{ - MemOpIdx oi = make_memop_idx(memop, idx); -#if TARGET_LONG_BITS == 32 - tcg_gen_op3i_i32(opc, val, addr, oi); -#else - if (TCG_TARGET_REG_BITS == 32) { - tcg_gen_op4i_i32(opc, val, TCGV_LOW(addr), TCGV_HIGH(addr), oi); - } else { - tcg_gen_op3(opc, tcgv_i32_arg(val), tcgv_i64_arg(addr), oi); - } -#endif -} - -static void gen_ldst_i64(TCGOpcode opc, TCGv_i64 val, TCGv addr, - MemOp memop, TCGArg idx) -{ - MemOpIdx oi = make_memop_idx(memop, idx); -#if TARGET_LONG_BITS == 32 - if (TCG_TARGET_REG_BITS == 32) { - tcg_gen_op4i_i32(opc, TCGV_LOW(val), TCGV_HIGH(val), addr, oi); - } else { - tcg_gen_op3(opc, tcgv_i64_arg(val), tcgv_i32_arg(addr), oi); - } -#else - if (TCG_TARGET_REG_BITS == 32) { - tcg_gen_op5i_i32(opc, TCGV_LOW(val), TCGV_HIGH(val), - TCGV_LOW(addr), TCGV_HIGH(addr), oi); - } else { - tcg_gen_op3i_i64(opc, val, addr, oi); - } -#endif -} - -static void tcg_gen_req_mo(TCGBar type) -{ -#ifdef TCG_GUEST_DEFAULT_MO - type &= TCG_GUEST_DEFAULT_MO; -#endif - type &= ~TCG_TARGET_DEFAULT_MO; - if (type) { - tcg_gen_mb(type | TCG_BAR_SC); - } -} - -static inline TCGv plugin_prep_mem_callbacks(TCGv vaddr) -{ -#ifdef CONFIG_PLUGIN - if (tcg_ctx->plugin_insn != NULL) { - /* Save a copy of the vaddr for use after a load. */ - TCGv temp = tcg_temp_new(); - tcg_gen_mov_tl(temp, vaddr); - return temp; - } -#endif - return vaddr; -} - -static void plugin_gen_mem_callbacks(TCGv vaddr, MemOpIdx oi, - enum qemu_plugin_mem_rw rw) -{ -#ifdef CONFIG_PLUGIN - if (tcg_ctx->plugin_insn != NULL) { - qemu_plugin_meminfo_t info = make_plugin_meminfo(oi, rw); - plugin_gen_empty_mem_callback(vaddr, info); - tcg_temp_free(vaddr); - } -#endif -} - -void tcg_gen_qemu_ld_i32(TCGv_i32 val, TCGv addr, TCGArg idx, MemOp memop) -{ - MemOp orig_memop; - MemOpIdx oi; - - tcg_gen_req_mo(TCG_MO_LD_LD | TCG_MO_ST_LD); - memop = tcg_canonicalize_memop(memop, 0, 0); - oi = make_memop_idx(memop, idx); - - orig_memop = memop; - if (!TCG_TARGET_HAS_MEMORY_BSWAP && (memop & MO_BSWAP)) { - memop &= ~MO_BSWAP; - /* The bswap primitive benefits from zero-extended input. */ - if ((memop & MO_SSIZE) == MO_SW) { - memop &= ~MO_SIGN; - } - } - - addr = plugin_prep_mem_callbacks(addr); - gen_ldst_i32(INDEX_op_qemu_ld_i32, val, addr, memop, idx); - plugin_gen_mem_callbacks(addr, oi, QEMU_PLUGIN_MEM_R); - - if ((orig_memop ^ memop) & MO_BSWAP) { - switch (orig_memop & MO_SIZE) { - case MO_16: - tcg_gen_bswap16_i32(val, val, (orig_memop & MO_SIGN - ? 
TCG_BSWAP_IZ | TCG_BSWAP_OS - : TCG_BSWAP_IZ | TCG_BSWAP_OZ)); - break; - case MO_32: - tcg_gen_bswap32_i32(val, val); - break; - default: - g_assert_not_reached(); - } - } -} - -void tcg_gen_qemu_st_i32(TCGv_i32 val, TCGv addr, TCGArg idx, MemOp memop) -{ - TCGv_i32 swap = NULL; - MemOpIdx oi; - - tcg_gen_req_mo(TCG_MO_LD_ST | TCG_MO_ST_ST); - memop = tcg_canonicalize_memop(memop, 0, 1); - oi = make_memop_idx(memop, idx); - - if (!TCG_TARGET_HAS_MEMORY_BSWAP && (memop & MO_BSWAP)) { - swap = tcg_temp_ebb_new_i32(); - switch (memop & MO_SIZE) { - case MO_16: - tcg_gen_bswap16_i32(swap, val, 0); - break; - case MO_32: - tcg_gen_bswap32_i32(swap, val); - break; - default: - g_assert_not_reached(); - } - val = swap; - memop &= ~MO_BSWAP; - } - - addr = plugin_prep_mem_callbacks(addr); - if (TCG_TARGET_HAS_qemu_st8_i32 && (memop & MO_SIZE) == MO_8) { - gen_ldst_i32(INDEX_op_qemu_st8_i32, val, addr, memop, idx); - } else { - gen_ldst_i32(INDEX_op_qemu_st_i32, val, addr, memop, idx); - } - plugin_gen_mem_callbacks(addr, oi, QEMU_PLUGIN_MEM_W); - - if (swap) { - tcg_temp_free_i32(swap); - } -} - -void tcg_gen_qemu_ld_i64(TCGv_i64 val, TCGv addr, TCGArg idx, MemOp memop) -{ - MemOp orig_memop; - MemOpIdx oi; - - if (TCG_TARGET_REG_BITS == 32 && (memop & MO_SIZE) < MO_64) { - tcg_gen_qemu_ld_i32(TCGV_LOW(val), addr, idx, memop); - if (memop & MO_SIGN) { - tcg_gen_sari_i32(TCGV_HIGH(val), TCGV_LOW(val), 31); - } else { - tcg_gen_movi_i32(TCGV_HIGH(val), 0); - } - return; - } - - tcg_gen_req_mo(TCG_MO_LD_LD | TCG_MO_ST_LD); - memop = tcg_canonicalize_memop(memop, 1, 0); - oi = make_memop_idx(memop, idx); - - orig_memop = memop; - if (!TCG_TARGET_HAS_MEMORY_BSWAP && (memop & MO_BSWAP)) { - memop &= ~MO_BSWAP; - /* The bswap primitive benefits from zero-extended input. */ - if ((memop & MO_SIGN) && (memop & MO_SIZE) < MO_64) { - memop &= ~MO_SIGN; - } - } - - addr = plugin_prep_mem_callbacks(addr); - gen_ldst_i64(INDEX_op_qemu_ld_i64, val, addr, memop, idx); - plugin_gen_mem_callbacks(addr, oi, QEMU_PLUGIN_MEM_R); - - if ((orig_memop ^ memop) & MO_BSWAP) { - int flags = (orig_memop & MO_SIGN - ? 
TCG_BSWAP_IZ | TCG_BSWAP_OS - : TCG_BSWAP_IZ | TCG_BSWAP_OZ); - switch (orig_memop & MO_SIZE) { - case MO_16: - tcg_gen_bswap16_i64(val, val, flags); - break; - case MO_32: - tcg_gen_bswap32_i64(val, val, flags); - break; - case MO_64: - tcg_gen_bswap64_i64(val, val); - break; - default: - g_assert_not_reached(); - } - } -} - -void tcg_gen_qemu_st_i64(TCGv_i64 val, TCGv addr, TCGArg idx, MemOp memop) -{ - TCGv_i64 swap = NULL; - MemOpIdx oi; - - if (TCG_TARGET_REG_BITS == 32 && (memop & MO_SIZE) < MO_64) { - tcg_gen_qemu_st_i32(TCGV_LOW(val), addr, idx, memop); - return; - } - - tcg_gen_req_mo(TCG_MO_LD_ST | TCG_MO_ST_ST); - memop = tcg_canonicalize_memop(memop, 1, 1); - oi = make_memop_idx(memop, idx); - - if (!TCG_TARGET_HAS_MEMORY_BSWAP && (memop & MO_BSWAP)) { - swap = tcg_temp_ebb_new_i64(); - switch (memop & MO_SIZE) { - case MO_16: - tcg_gen_bswap16_i64(swap, val, 0); - break; - case MO_32: - tcg_gen_bswap32_i64(swap, val, 0); - break; - case MO_64: - tcg_gen_bswap64_i64(swap, val); - break; - default: - g_assert_not_reached(); - } - val = swap; - memop &= ~MO_BSWAP; - } - - addr = plugin_prep_mem_callbacks(addr); - gen_ldst_i64(INDEX_op_qemu_st_i64, val, addr, memop, idx); - plugin_gen_mem_callbacks(addr, oi, QEMU_PLUGIN_MEM_W); - - if (swap) { - tcg_temp_free_i64(swap); - } -} - -static void canonicalize_memop_i128_as_i64(MemOp ret[2], MemOp orig) -{ - MemOp mop_1 = orig, mop_2; - - tcg_debug_assert((orig & MO_SIZE) == MO_128); - tcg_debug_assert((orig & MO_SIGN) == 0); - - /* Use a memory ordering implemented by the host. */ - if (!TCG_TARGET_HAS_MEMORY_BSWAP && (orig & MO_BSWAP)) { - mop_1 &= ~MO_BSWAP; - } - - /* Reduce the size to 64-bit. */ - mop_1 = (mop_1 & ~MO_SIZE) | MO_64; - - /* Retain the alignment constraints of the original. */ - switch (orig & MO_AMASK) { - case MO_UNALN: - case MO_ALIGN_2: - case MO_ALIGN_4: - mop_2 = mop_1; - break; - case MO_ALIGN_8: - /* Prefer MO_ALIGN+MO_64 to MO_ALIGN_8+MO_64. */ - mop_1 = (mop_1 & ~MO_AMASK) | MO_ALIGN; - mop_2 = mop_1; - break; - case MO_ALIGN: - /* Second has 8-byte alignment; first has 16-byte alignment. */ - mop_2 = mop_1; - mop_1 = (mop_1 & ~MO_AMASK) | MO_ALIGN_16; - break; - case MO_ALIGN_16: - case MO_ALIGN_32: - case MO_ALIGN_64: - /* Second has 8-byte alignment; first retains original. */ - mop_2 = (mop_1 & ~MO_AMASK) | MO_ALIGN; - break; - default: - g_assert_not_reached(); - } - ret[0] = mop_1; - ret[1] = mop_2; -} - -void tcg_gen_qemu_ld_i128(TCGv_i128 val, TCGv addr, TCGArg idx, MemOp memop) -{ - MemOp mop[2]; - TCGv addr_p8; - TCGv_i64 x, y; - - canonicalize_memop_i128_as_i64(mop, memop); - - tcg_gen_req_mo(TCG_MO_LD_LD | TCG_MO_ST_LD); - addr = plugin_prep_mem_callbacks(addr); - - /* TODO: respect atomicity of the operation. */ - /* TODO: allow the tcg backend to see the whole operation. */ - - /* - * Since there are no global TCGv_i128, there is no visible state - * changed if the second load faults. Load directly into the two - * subwords. 
- */ - if ((memop & MO_BSWAP) == MO_LE) { - x = TCGV128_LOW(val); - y = TCGV128_HIGH(val); - } else { - x = TCGV128_HIGH(val); - y = TCGV128_LOW(val); - } - - gen_ldst_i64(INDEX_op_qemu_ld_i64, x, addr, mop[0], idx); - - if ((mop[0] ^ memop) & MO_BSWAP) { - tcg_gen_bswap64_i64(x, x); - } - - addr_p8 = tcg_temp_new(); - tcg_gen_addi_tl(addr_p8, addr, 8); - gen_ldst_i64(INDEX_op_qemu_ld_i64, y, addr_p8, mop[1], idx); - tcg_temp_free(addr_p8); - - if ((mop[0] ^ memop) & MO_BSWAP) { - tcg_gen_bswap64_i64(y, y); - } - - plugin_gen_mem_callbacks(addr, make_memop_idx(memop, idx), - QEMU_PLUGIN_MEM_R); -} - -void tcg_gen_qemu_st_i128(TCGv_i128 val, TCGv addr, TCGArg idx, MemOp memop) -{ - MemOp mop[2]; - TCGv addr_p8; - TCGv_i64 x, y; - - canonicalize_memop_i128_as_i64(mop, memop); - - tcg_gen_req_mo(TCG_MO_ST_LD | TCG_MO_ST_ST); - addr = plugin_prep_mem_callbacks(addr); - - /* TODO: respect atomicity of the operation. */ - /* TODO: allow the tcg backend to see the whole operation. */ - - if ((memop & MO_BSWAP) == MO_LE) { - x = TCGV128_LOW(val); - y = TCGV128_HIGH(val); - } else { - x = TCGV128_HIGH(val); - y = TCGV128_LOW(val); - } - - addr_p8 = tcg_temp_new(); - if ((mop[0] ^ memop) & MO_BSWAP) { - TCGv_i64 t = tcg_temp_ebb_new_i64(); - - tcg_gen_bswap64_i64(t, x); - gen_ldst_i64(INDEX_op_qemu_st_i64, t, addr, mop[0], idx); - tcg_gen_bswap64_i64(t, y); - tcg_gen_addi_tl(addr_p8, addr, 8); - gen_ldst_i64(INDEX_op_qemu_st_i64, t, addr_p8, mop[1], idx); - tcg_temp_free_i64(t); - } else { - gen_ldst_i64(INDEX_op_qemu_st_i64, x, addr, mop[0], idx); - tcg_gen_addi_tl(addr_p8, addr, 8); - gen_ldst_i64(INDEX_op_qemu_st_i64, y, addr_p8, mop[1], idx); - } - tcg_temp_free(addr_p8); - - plugin_gen_mem_callbacks(addr, make_memop_idx(memop, idx), - QEMU_PLUGIN_MEM_W); -} - -static void tcg_gen_ext_i32(TCGv_i32 ret, TCGv_i32 val, MemOp opc) -{ - switch (opc & MO_SSIZE) { - case MO_SB: - tcg_gen_ext8s_i32(ret, val); - break; - case MO_UB: - tcg_gen_ext8u_i32(ret, val); - break; - case MO_SW: - tcg_gen_ext16s_i32(ret, val); - break; - case MO_UW: - tcg_gen_ext16u_i32(ret, val); - break; - default: - tcg_gen_mov_i32(ret, val); - break; - } -} - -static void tcg_gen_ext_i64(TCGv_i64 ret, TCGv_i64 val, MemOp opc) -{ - switch (opc & MO_SSIZE) { - case MO_SB: - tcg_gen_ext8s_i64(ret, val); - break; - case MO_UB: - tcg_gen_ext8u_i64(ret, val); - break; - case MO_SW: - tcg_gen_ext16s_i64(ret, val); - break; - case MO_UW: - tcg_gen_ext16u_i64(ret, val); - break; - case MO_SL: - tcg_gen_ext32s_i64(ret, val); - break; - case MO_UL: - tcg_gen_ext32u_i64(ret, val); - break; - default: - tcg_gen_mov_i64(ret, val); - break; - } -} - -typedef void (*gen_atomic_cx_i32)(TCGv_i32, TCGv_env, TCGv, - TCGv_i32, TCGv_i32, TCGv_i32); -typedef void (*gen_atomic_cx_i64)(TCGv_i64, TCGv_env, TCGv, - TCGv_i64, TCGv_i64, TCGv_i32); -typedef void (*gen_atomic_cx_i128)(TCGv_i128, TCGv_env, TCGv, - TCGv_i128, TCGv_i128, TCGv_i32); -typedef void (*gen_atomic_op_i32)(TCGv_i32, TCGv_env, TCGv, - TCGv_i32, TCGv_i32); -typedef void (*gen_atomic_op_i64)(TCGv_i64, TCGv_env, TCGv, - TCGv_i64, TCGv_i32); - -#ifdef CONFIG_ATOMIC64 -# define WITH_ATOMIC64(X) X, -#else -# define WITH_ATOMIC64(X) -#endif -#ifdef CONFIG_CMPXCHG128 -# define WITH_ATOMIC128(X) X, -#else -# define WITH_ATOMIC128(X) -#endif - -static void * const table_cmpxchg[(MO_SIZE | MO_BSWAP) + 1] = { - [MO_8] = gen_helper_atomic_cmpxchgb, - [MO_16 | MO_LE] = gen_helper_atomic_cmpxchgw_le, - [MO_16 | MO_BE] = gen_helper_atomic_cmpxchgw_be, - [MO_32 | MO_LE] = 
gen_helper_atomic_cmpxchgl_le, - [MO_32 | MO_BE] = gen_helper_atomic_cmpxchgl_be, - WITH_ATOMIC64([MO_64 | MO_LE] = gen_helper_atomic_cmpxchgq_le) - WITH_ATOMIC64([MO_64 | MO_BE] = gen_helper_atomic_cmpxchgq_be) - WITH_ATOMIC128([MO_128 | MO_LE] = gen_helper_atomic_cmpxchgo_le) - WITH_ATOMIC128([MO_128 | MO_BE] = gen_helper_atomic_cmpxchgo_be) -}; - -void tcg_gen_nonatomic_cmpxchg_i32(TCGv_i32 retv, TCGv addr, TCGv_i32 cmpv, - TCGv_i32 newv, TCGArg idx, MemOp memop) -{ - TCGv_i32 t1 = tcg_temp_ebb_new_i32(); - TCGv_i32 t2 = tcg_temp_ebb_new_i32(); - - tcg_gen_ext_i32(t2, cmpv, memop & MO_SIZE); - - tcg_gen_qemu_ld_i32(t1, addr, idx, memop & ~MO_SIGN); - tcg_gen_movcond_i32(TCG_COND_EQ, t2, t1, t2, newv, t1); - tcg_gen_qemu_st_i32(t2, addr, idx, memop); - tcg_temp_free_i32(t2); - - if (memop & MO_SIGN) { - tcg_gen_ext_i32(retv, t1, memop); - } else { - tcg_gen_mov_i32(retv, t1); - } - tcg_temp_free_i32(t1); -} - -void tcg_gen_atomic_cmpxchg_i32(TCGv_i32 retv, TCGv addr, TCGv_i32 cmpv, - TCGv_i32 newv, TCGArg idx, MemOp memop) -{ - gen_atomic_cx_i32 gen; - MemOpIdx oi; - - if (!(tcg_ctx->gen_tb->cflags & CF_PARALLEL)) { - tcg_gen_nonatomic_cmpxchg_i32(retv, addr, cmpv, newv, idx, memop); - return; - } - - memop = tcg_canonicalize_memop(memop, 0, 0); - gen = table_cmpxchg[memop & (MO_SIZE | MO_BSWAP)]; - tcg_debug_assert(gen != NULL); - - oi = make_memop_idx(memop & ~MO_SIGN, idx); - gen(retv, cpu_env, addr, cmpv, newv, tcg_constant_i32(oi)); - - if (memop & MO_SIGN) { - tcg_gen_ext_i32(retv, retv, memop); - } -} - -void tcg_gen_nonatomic_cmpxchg_i64(TCGv_i64 retv, TCGv addr, TCGv_i64 cmpv, - TCGv_i64 newv, TCGArg idx, MemOp memop) -{ - TCGv_i64 t1, t2; - - if (TCG_TARGET_REG_BITS == 32 && (memop & MO_SIZE) < MO_64) { - tcg_gen_nonatomic_cmpxchg_i32(TCGV_LOW(retv), addr, TCGV_LOW(cmpv), - TCGV_LOW(newv), idx, memop); - if (memop & MO_SIGN) { - tcg_gen_sari_i32(TCGV_HIGH(retv), TCGV_LOW(retv), 31); - } else { - tcg_gen_movi_i32(TCGV_HIGH(retv), 0); - } - return; - } - - t1 = tcg_temp_ebb_new_i64(); - t2 = tcg_temp_ebb_new_i64(); - - tcg_gen_ext_i64(t2, cmpv, memop & MO_SIZE); - - tcg_gen_qemu_ld_i64(t1, addr, idx, memop & ~MO_SIGN); - tcg_gen_movcond_i64(TCG_COND_EQ, t2, t1, t2, newv, t1); - tcg_gen_qemu_st_i64(t2, addr, idx, memop); - tcg_temp_free_i64(t2); - - if (memop & MO_SIGN) { - tcg_gen_ext_i64(retv, t1, memop); - } else { - tcg_gen_mov_i64(retv, t1); - } - tcg_temp_free_i64(t1); -} - -void tcg_gen_atomic_cmpxchg_i64(TCGv_i64 retv, TCGv addr, TCGv_i64 cmpv, - TCGv_i64 newv, TCGArg idx, MemOp memop) -{ - if (!(tcg_ctx->gen_tb->cflags & CF_PARALLEL)) { - tcg_gen_nonatomic_cmpxchg_i64(retv, addr, cmpv, newv, idx, memop); - return; - } - - if ((memop & MO_SIZE) == MO_64) { - gen_atomic_cx_i64 gen; - - memop = tcg_canonicalize_memop(memop, 1, 0); - gen = table_cmpxchg[memop & (MO_SIZE | MO_BSWAP)]; - if (gen) { - MemOpIdx oi = make_memop_idx(memop, idx); - gen(retv, cpu_env, addr, cmpv, newv, tcg_constant_i32(oi)); - return; - } - - gen_helper_exit_atomic(cpu_env); - - /* - * Produce a result for a well-formed opcode stream. This satisfies - * liveness for set before used, which happens before this dead code - * is removed. 
- */ - tcg_gen_movi_i64(retv, 0); - return; - } - - if (TCG_TARGET_REG_BITS == 32) { - tcg_gen_atomic_cmpxchg_i32(TCGV_LOW(retv), addr, TCGV_LOW(cmpv), - TCGV_LOW(newv), idx, memop); - if (memop & MO_SIGN) { - tcg_gen_sari_i32(TCGV_HIGH(retv), TCGV_LOW(retv), 31); - } else { - tcg_gen_movi_i32(TCGV_HIGH(retv), 0); - } - } else { - TCGv_i32 c32 = tcg_temp_ebb_new_i32(); - TCGv_i32 n32 = tcg_temp_ebb_new_i32(); - TCGv_i32 r32 = tcg_temp_ebb_new_i32(); - - tcg_gen_extrl_i64_i32(c32, cmpv); - tcg_gen_extrl_i64_i32(n32, newv); - tcg_gen_atomic_cmpxchg_i32(r32, addr, c32, n32, idx, memop & ~MO_SIGN); - tcg_temp_free_i32(c32); - tcg_temp_free_i32(n32); - - tcg_gen_extu_i32_i64(retv, r32); - tcg_temp_free_i32(r32); - - if (memop & MO_SIGN) { - tcg_gen_ext_i64(retv, retv, memop); - } - } -} - -void tcg_gen_nonatomic_cmpxchg_i128(TCGv_i128 retv, TCGv addr, TCGv_i128 cmpv, - TCGv_i128 newv, TCGArg idx, MemOp memop) -{ - if (TCG_TARGET_REG_BITS == 32) { - /* Inline expansion below is simply too large for 32-bit hosts. */ - gen_atomic_cx_i128 gen = ((memop & MO_BSWAP) == MO_LE - ? gen_helper_nonatomic_cmpxchgo_le - : gen_helper_nonatomic_cmpxchgo_be); - MemOpIdx oi = make_memop_idx(memop, idx); - - tcg_debug_assert((memop & MO_SIZE) == MO_128); - tcg_debug_assert((memop & MO_SIGN) == 0); - - gen(retv, cpu_env, addr, cmpv, newv, tcg_constant_i32(oi)); - } else { - TCGv_i128 oldv = tcg_temp_ebb_new_i128(); - TCGv_i128 tmpv = tcg_temp_ebb_new_i128(); - TCGv_i64 t0 = tcg_temp_ebb_new_i64(); - TCGv_i64 t1 = tcg_temp_ebb_new_i64(); - TCGv_i64 z = tcg_constant_i64(0); - - tcg_gen_qemu_ld_i128(oldv, addr, idx, memop); - - /* Compare i128 */ - tcg_gen_xor_i64(t0, TCGV128_LOW(oldv), TCGV128_LOW(cmpv)); - tcg_gen_xor_i64(t1, TCGV128_HIGH(oldv), TCGV128_HIGH(cmpv)); - tcg_gen_or_i64(t0, t0, t1); - - /* tmpv = equal ? newv : oldv */ - tcg_gen_movcond_i64(TCG_COND_EQ, TCGV128_LOW(tmpv), t0, z, - TCGV128_LOW(newv), TCGV128_LOW(oldv)); - tcg_gen_movcond_i64(TCG_COND_EQ, TCGV128_HIGH(tmpv), t0, z, - TCGV128_HIGH(newv), TCGV128_HIGH(oldv)); - - /* Unconditional writeback. */ - tcg_gen_qemu_st_i128(tmpv, addr, idx, memop); - tcg_gen_mov_i128(retv, oldv); - - tcg_temp_free_i64(t0); - tcg_temp_free_i64(t1); - tcg_temp_free_i128(tmpv); - tcg_temp_free_i128(oldv); - } -} - -void tcg_gen_atomic_cmpxchg_i128(TCGv_i128 retv, TCGv addr, TCGv_i128 cmpv, - TCGv_i128 newv, TCGArg idx, MemOp memop) -{ - gen_atomic_cx_i128 gen; - - if (!(tcg_ctx->gen_tb->cflags & CF_PARALLEL)) { - tcg_gen_nonatomic_cmpxchg_i128(retv, addr, cmpv, newv, idx, memop); - return; - } - - tcg_debug_assert((memop & MO_SIZE) == MO_128); - tcg_debug_assert((memop & MO_SIGN) == 0); - gen = table_cmpxchg[memop & (MO_SIZE | MO_BSWAP)]; - - if (gen) { - MemOpIdx oi = make_memop_idx(memop, idx); - gen(retv, cpu_env, addr, cmpv, newv, tcg_constant_i32(oi)); - return; - } - - gen_helper_exit_atomic(cpu_env); - - /* - * Produce a result for a well-formed opcode stream. This satisfies - * liveness for set before used, which happens before this dead code - * is removed. 
- */ - tcg_gen_movi_i64(TCGV128_LOW(retv), 0); - tcg_gen_movi_i64(TCGV128_HIGH(retv), 0); -} - -static void do_nonatomic_op_i32(TCGv_i32 ret, TCGv addr, TCGv_i32 val, - TCGArg idx, MemOp memop, bool new_val, - void (*gen)(TCGv_i32, TCGv_i32, TCGv_i32)) -{ - TCGv_i32 t1 = tcg_temp_ebb_new_i32(); - TCGv_i32 t2 = tcg_temp_ebb_new_i32(); - - memop = tcg_canonicalize_memop(memop, 0, 0); - - tcg_gen_qemu_ld_i32(t1, addr, idx, memop); - tcg_gen_ext_i32(t2, val, memop); - gen(t2, t1, t2); - tcg_gen_qemu_st_i32(t2, addr, idx, memop); - - tcg_gen_ext_i32(ret, (new_val ? t2 : t1), memop); - tcg_temp_free_i32(t1); - tcg_temp_free_i32(t2); -} - -static void do_atomic_op_i32(TCGv_i32 ret, TCGv addr, TCGv_i32 val, - TCGArg idx, MemOp memop, void * const table[]) -{ - gen_atomic_op_i32 gen; - MemOpIdx oi; - - memop = tcg_canonicalize_memop(memop, 0, 0); - - gen = table[memop & (MO_SIZE | MO_BSWAP)]; - tcg_debug_assert(gen != NULL); - - oi = make_memop_idx(memop & ~MO_SIGN, idx); - gen(ret, cpu_env, addr, val, tcg_constant_i32(oi)); - - if (memop & MO_SIGN) { - tcg_gen_ext_i32(ret, ret, memop); - } -} - -static void do_nonatomic_op_i64(TCGv_i64 ret, TCGv addr, TCGv_i64 val, - TCGArg idx, MemOp memop, bool new_val, - void (*gen)(TCGv_i64, TCGv_i64, TCGv_i64)) -{ - TCGv_i64 t1 = tcg_temp_ebb_new_i64(); - TCGv_i64 t2 = tcg_temp_ebb_new_i64(); - - memop = tcg_canonicalize_memop(memop, 1, 0); - - tcg_gen_qemu_ld_i64(t1, addr, idx, memop); - tcg_gen_ext_i64(t2, val, memop); - gen(t2, t1, t2); - tcg_gen_qemu_st_i64(t2, addr, idx, memop); - - tcg_gen_ext_i64(ret, (new_val ? t2 : t1), memop); - tcg_temp_free_i64(t1); - tcg_temp_free_i64(t2); -} - -static void do_atomic_op_i64(TCGv_i64 ret, TCGv addr, TCGv_i64 val, - TCGArg idx, MemOp memop, void * const table[]) -{ - memop = tcg_canonicalize_memop(memop, 1, 0); - - if ((memop & MO_SIZE) == MO_64) { -#ifdef CONFIG_ATOMIC64 - gen_atomic_op_i64 gen; - MemOpIdx oi; - - gen = table[memop & (MO_SIZE | MO_BSWAP)]; - tcg_debug_assert(gen != NULL); - - oi = make_memop_idx(memop & ~MO_SIGN, idx); - gen(ret, cpu_env, addr, val, tcg_constant_i32(oi)); -#else - gen_helper_exit_atomic(cpu_env); - /* Produce a result, so that we have a well-formed opcode stream - with respect to uses of the result in the (dead) code following. 
*/ - tcg_gen_movi_i64(ret, 0); -#endif /* CONFIG_ATOMIC64 */ - } else { - TCGv_i32 v32 = tcg_temp_ebb_new_i32(); - TCGv_i32 r32 = tcg_temp_ebb_new_i32(); - - tcg_gen_extrl_i64_i32(v32, val); - do_atomic_op_i32(r32, addr, v32, idx, memop & ~MO_SIGN, table); - tcg_temp_free_i32(v32); - - tcg_gen_extu_i32_i64(ret, r32); - tcg_temp_free_i32(r32); - - if (memop & MO_SIGN) { - tcg_gen_ext_i64(ret, ret, memop); - } - } -} - -#define GEN_ATOMIC_HELPER(NAME, OP, NEW) \ -static void * const table_##NAME[(MO_SIZE | MO_BSWAP) + 1] = { \ - [MO_8] = gen_helper_atomic_##NAME##b, \ - [MO_16 | MO_LE] = gen_helper_atomic_##NAME##w_le, \ - [MO_16 | MO_BE] = gen_helper_atomic_##NAME##w_be, \ - [MO_32 | MO_LE] = gen_helper_atomic_##NAME##l_le, \ - [MO_32 | MO_BE] = gen_helper_atomic_##NAME##l_be, \ - WITH_ATOMIC64([MO_64 | MO_LE] = gen_helper_atomic_##NAME##q_le) \ - WITH_ATOMIC64([MO_64 | MO_BE] = gen_helper_atomic_##NAME##q_be) \ -}; \ -void tcg_gen_atomic_##NAME##_i32 \ - (TCGv_i32 ret, TCGv addr, TCGv_i32 val, TCGArg idx, MemOp memop) \ -{ \ - if (tcg_ctx->gen_tb->cflags & CF_PARALLEL) { \ - do_atomic_op_i32(ret, addr, val, idx, memop, table_##NAME); \ - } else { \ - do_nonatomic_op_i32(ret, addr, val, idx, memop, NEW, \ - tcg_gen_##OP##_i32); \ - } \ -} \ -void tcg_gen_atomic_##NAME##_i64 \ - (TCGv_i64 ret, TCGv addr, TCGv_i64 val, TCGArg idx, MemOp memop) \ -{ \ - if (tcg_ctx->gen_tb->cflags & CF_PARALLEL) { \ - do_atomic_op_i64(ret, addr, val, idx, memop, table_##NAME); \ - } else { \ - do_nonatomic_op_i64(ret, addr, val, idx, memop, NEW, \ - tcg_gen_##OP##_i64); \ - } \ -} - -GEN_ATOMIC_HELPER(fetch_add, add, 0) -GEN_ATOMIC_HELPER(fetch_and, and, 0) -GEN_ATOMIC_HELPER(fetch_or, or, 0) -GEN_ATOMIC_HELPER(fetch_xor, xor, 0) -GEN_ATOMIC_HELPER(fetch_smin, smin, 0) -GEN_ATOMIC_HELPER(fetch_umin, umin, 0) -GEN_ATOMIC_HELPER(fetch_smax, smax, 0) -GEN_ATOMIC_HELPER(fetch_umax, umax, 0) - -GEN_ATOMIC_HELPER(add_fetch, add, 1) -GEN_ATOMIC_HELPER(and_fetch, and, 1) -GEN_ATOMIC_HELPER(or_fetch, or, 1) -GEN_ATOMIC_HELPER(xor_fetch, xor, 1) -GEN_ATOMIC_HELPER(smin_fetch, smin, 1) -GEN_ATOMIC_HELPER(umin_fetch, umin, 1) -GEN_ATOMIC_HELPER(smax_fetch, smax, 1) -GEN_ATOMIC_HELPER(umax_fetch, umax, 1) - -static void tcg_gen_mov2_i32(TCGv_i32 r, TCGv_i32 a, TCGv_i32 b) -{ - tcg_gen_mov_i32(r, b); -} - -static void tcg_gen_mov2_i64(TCGv_i64 r, TCGv_i64 a, TCGv_i64 b) -{ - tcg_gen_mov_i64(r, b); -} - -GEN_ATOMIC_HELPER(xchg, mov2, 0) - -#undef GEN_ATOMIC_HELPER @@ -63,6 +63,9 @@ #include "tcg/tcg-temp-internal.h" #include "tcg-internal.h" #include "accel/tcg/perf.h" +#ifdef CONFIG_USER_ONLY +#include "exec/user/guest-base.h" +#endif /* Forward declarations for functions declared in tcg-target.c.inc and used here. 
*/ @@ -197,6 +200,38 @@ static void tcg_out_st_helper_args(TCGContext *s, const TCGLabelQemuLdst *l, const TCGLdstHelperParam *p) __attribute__((unused)); +static void * const qemu_ld_helpers[MO_SSIZE + 1] __attribute__((unused)) = { + [MO_UB] = helper_ldub_mmu, + [MO_SB] = helper_ldsb_mmu, + [MO_UW] = helper_lduw_mmu, + [MO_SW] = helper_ldsw_mmu, + [MO_UL] = helper_ldul_mmu, + [MO_UQ] = helper_ldq_mmu, +#if TCG_TARGET_REG_BITS == 64 + [MO_SL] = helper_ldsl_mmu, + [MO_128] = helper_ld16_mmu, +#endif +}; + +static void * const qemu_st_helpers[MO_SIZE + 1] __attribute__((unused)) = { + [MO_8] = helper_stb_mmu, + [MO_16] = helper_stw_mmu, + [MO_32] = helper_stl_mmu, + [MO_64] = helper_stq_mmu, +#if TCG_TARGET_REG_BITS == 64 + [MO_128] = helper_st16_mmu, +#endif +}; + +typedef struct { + MemOp atom; /* lg2 bits of atomicity required */ + MemOp align; /* lg2 bits of alignment to use */ +} TCGAtomAlign; + +static TCGAtomAlign atom_and_align_for_opc(TCGContext *s, MemOp opc, + MemOp host_atom, bool allow_two_ops) + __attribute__((unused)); + TCGContext tcg_init_ctx; __thread TCGContext *tcg_ctx; @@ -513,6 +548,82 @@ static void tcg_out_movext2(TCGContext *s, const TCGMovExtend *i1, tcg_out_movext1_new_src(s, i1, src1); } +/** + * tcg_out_movext3 -- move and extend three pairs + * @s: tcg context + * @i1: first move description + * @i2: second move description + * @i3: third move description + * @scratch: temporary register, or -1 for none + * + * As tcg_out_movext, for all of @i1, @i2 and @i3, caring for overlap + * between the sources and destinations. + */ + +static void tcg_out_movext3(TCGContext *s, const TCGMovExtend *i1, + const TCGMovExtend *i2, const TCGMovExtend *i3, + int scratch) +{ + TCGReg src1 = i1->src; + TCGReg src2 = i2->src; + TCGReg src3 = i3->src; + + if (i1->dst != src2 && i1->dst != src3) { + tcg_out_movext1(s, i1); + tcg_out_movext2(s, i2, i3, scratch); + return; + } + if (i2->dst != src1 && i2->dst != src3) { + tcg_out_movext1(s, i2); + tcg_out_movext2(s, i1, i3, scratch); + return; + } + if (i3->dst != src1 && i3->dst != src2) { + tcg_out_movext1(s, i3); + tcg_out_movext2(s, i1, i2, scratch); + return; + } + + /* + * There is a cycle. Since there are only 3 nodes, the cycle is + * either "clockwise" or "anti-clockwise", and can be solved with + * a single scratch or two xchg. + */ + if (i1->dst == src2 && i2->dst == src3 && i3->dst == src1) { + /* "Clockwise" */ + if (tcg_out_xchg(s, MAX(i1->src_type, i2->src_type), src1, src2)) { + tcg_out_xchg(s, MAX(i2->src_type, i3->src_type), src2, src3); + /* The data is now in the correct registers, now extend. */ + tcg_out_movext1_new_src(s, i1, i1->dst); + tcg_out_movext1_new_src(s, i2, i2->dst); + tcg_out_movext1_new_src(s, i3, i3->dst); + } else { + tcg_debug_assert(scratch >= 0); + tcg_out_mov(s, i1->src_type, scratch, src1); + tcg_out_movext1(s, i3); + tcg_out_movext1(s, i2); + tcg_out_movext1_new_src(s, i1, scratch); + } + } else if (i1->dst == src3 && i2->dst == src1 && i3->dst == src2) { + /* "Anti-clockwise" */ + if (tcg_out_xchg(s, MAX(i2->src_type, i3->src_type), src2, src3)) { + tcg_out_xchg(s, MAX(i1->src_type, i2->src_type), src1, src2); + /* The data is now in the correct registers, now extend. 
*/ + tcg_out_movext1_new_src(s, i1, i1->dst); + tcg_out_movext1_new_src(s, i2, i2->dst); + tcg_out_movext1_new_src(s, i3, i3->dst); + } else { + tcg_debug_assert(scratch >= 0); + tcg_out_mov(s, i1->src_type, scratch, src1); + tcg_out_movext1(s, i2); + tcg_out_movext1(s, i3); + tcg_out_movext1_new_src(s, i1, scratch); + } + } else { + g_assert_not_reached(); + } +} + #define C_PFX1(P, A) P##A #define C_PFX2(P, A, B) P##A##_##B #define C_PFX3(P, A, B, C) P##A##_##B##_##C @@ -757,7 +868,7 @@ static TCGHelperInfo info_helper_ld32_mmu = { .flags = TCG_CALL_NO_WG, .typemask = dh_typemask(ttl, 0) /* return tcg_target_ulong */ | dh_typemask(env, 1) - | dh_typemask(tl, 2) /* target_ulong addr */ + | dh_typemask(i64, 2) /* uint64_t addr */ | dh_typemask(i32, 3) /* unsigned oi */ | dh_typemask(ptr, 4) /* uintptr_t ra */ }; @@ -766,7 +877,16 @@ static TCGHelperInfo info_helper_ld64_mmu = { .flags = TCG_CALL_NO_WG, .typemask = dh_typemask(i64, 0) /* return uint64_t */ | dh_typemask(env, 1) - | dh_typemask(tl, 2) /* target_ulong addr */ + | dh_typemask(i64, 2) /* uint64_t addr */ + | dh_typemask(i32, 3) /* unsigned oi */ + | dh_typemask(ptr, 4) /* uintptr_t ra */ +}; + +static TCGHelperInfo info_helper_ld128_mmu = { + .flags = TCG_CALL_NO_WG, + .typemask = dh_typemask(i128, 0) /* return Int128 */ + | dh_typemask(env, 1) + | dh_typemask(i64, 2) /* uint64_t addr */ | dh_typemask(i32, 3) /* unsigned oi */ | dh_typemask(ptr, 4) /* uintptr_t ra */ }; @@ -775,7 +895,7 @@ static TCGHelperInfo info_helper_st32_mmu = { .flags = TCG_CALL_NO_WG, .typemask = dh_typemask(void, 0) | dh_typemask(env, 1) - | dh_typemask(tl, 2) /* target_ulong addr */ + | dh_typemask(i64, 2) /* uint64_t addr */ | dh_typemask(i32, 3) /* uint32_t data */ | dh_typemask(i32, 4) /* unsigned oi */ | dh_typemask(ptr, 5) /* uintptr_t ra */ @@ -785,12 +905,22 @@ static TCGHelperInfo info_helper_st64_mmu = { .flags = TCG_CALL_NO_WG, .typemask = dh_typemask(void, 0) | dh_typemask(env, 1) - | dh_typemask(tl, 2) /* target_ulong addr */ + | dh_typemask(i64, 2) /* uint64_t addr */ | dh_typemask(i64, 3) /* uint64_t data */ | dh_typemask(i32, 4) /* unsigned oi */ | dh_typemask(ptr, 5) /* uintptr_t ra */ }; +static TCGHelperInfo info_helper_st128_mmu = { + .flags = TCG_CALL_NO_WG, + .typemask = dh_typemask(void, 0) + | dh_typemask(env, 1) + | dh_typemask(i64, 2) /* uint64_t addr */ + | dh_typemask(i128, 3) /* Int128 data */ + | dh_typemask(i32, 4) /* unsigned oi */ + | dh_typemask(ptr, 5) /* uintptr_t ra */ +}; + #ifdef CONFIG_TCG_INTERPRETER static ffi_type *typecode_to_ffi(int argmask) { @@ -1204,8 +1334,10 @@ static void tcg_context_init(unsigned max_cpus) init_call_layout(&info_helper_ld32_mmu); init_call_layout(&info_helper_ld64_mmu); + init_call_layout(&info_helper_ld128_mmu); init_call_layout(&info_helper_st32_mmu); init_call_layout(&info_helper_st64_mmu); + init_call_layout(&info_helper_st128_mmu); #ifdef CONFIG_TCG_INTERPRETER init_ffi_layouts(); @@ -1391,6 +1523,9 @@ void tcg_func_start(TCGContext *s) QTAILQ_INIT(&s->ops); QTAILQ_INIT(&s->free_ops); QSIMPLEQ_INIT(&s->labels); + + tcg_debug_assert(s->addr_type == TCG_TYPE_I32 || + s->addr_type == TCG_TYPE_I64); } static TCGTemp *tcg_temp_alloc(TCGContext *s) @@ -1707,15 +1842,26 @@ bool tcg_op_supported(TCGOpcode op) case INDEX_op_exit_tb: case INDEX_op_goto_tb: case INDEX_op_goto_ptr: - case INDEX_op_qemu_ld_i32: - case INDEX_op_qemu_st_i32: - case INDEX_op_qemu_ld_i64: - case INDEX_op_qemu_st_i64: + case INDEX_op_qemu_ld_a32_i32: + case INDEX_op_qemu_ld_a64_i32: + case 
INDEX_op_qemu_st_a32_i32: + case INDEX_op_qemu_st_a64_i32: + case INDEX_op_qemu_ld_a32_i64: + case INDEX_op_qemu_ld_a64_i64: + case INDEX_op_qemu_st_a32_i64: + case INDEX_op_qemu_st_a64_i64: return true; - case INDEX_op_qemu_st8_i32: + case INDEX_op_qemu_st8_a32_i32: + case INDEX_op_qemu_st8_a64_i32: return TCG_TARGET_HAS_qemu_st8_i32; + case INDEX_op_qemu_ld_a32_i128: + case INDEX_op_qemu_ld_a64_i128: + case INDEX_op_qemu_st_a32_i128: + case INDEX_op_qemu_st_a64_i128: + return TCG_TARGET_HAS_qemu_ldst_i128; + case INDEX_op_mov_i32: case INDEX_op_setcond_i32: case INDEX_op_brcond_i32: @@ -2168,7 +2314,7 @@ static const char * const cond_name[] = [TCG_COND_GTU] = "gtu" }; -static const char * const ldst_name[] = +static const char * const ldst_name[(MO_BSWAP | MO_SSIZE) + 1] = { [MO_UB] = "ub", [MO_SB] = "sb", @@ -2182,6 +2328,8 @@ static const char * const ldst_name[] = [MO_BEUL] = "beul", [MO_BESL] = "besl", [MO_BEUQ] = "beq", + [MO_128 + MO_BE] = "beo", + [MO_128 + MO_LE] = "leo", }; static const char * const alignment_name[(MO_AMASK >> MO_ASHIFT) + 1] = { @@ -2195,6 +2343,15 @@ static const char * const alignment_name[(MO_AMASK >> MO_ASHIFT) + 1] = { [MO_ALIGN_64 >> MO_ASHIFT] = "al64+", }; +static const char * const atom_name[(MO_ATOM_MASK >> MO_ATOM_SHIFT) + 1] = { + [MO_ATOM_IFALIGN >> MO_ATOM_SHIFT] = "", + [MO_ATOM_IFALIGN_PAIR >> MO_ATOM_SHIFT] = "pair+", + [MO_ATOM_WITHIN16 >> MO_ATOM_SHIFT] = "w16+", + [MO_ATOM_WITHIN16_PAIR >> MO_ATOM_SHIFT] = "w16p+", + [MO_ATOM_SUBALIGN >> MO_ATOM_SHIFT] = "sub+", + [MO_ATOM_NONE >> MO_ATOM_SHIFT] = "noat+", +}; + static const char bswap_flag_name[][6] = { [TCG_BSWAP_IZ] = "iz", [TCG_BSWAP_OZ] = "oz", @@ -2240,13 +2397,8 @@ static void tcg_dump_ops(TCGContext *s, FILE *f, bool have_prefs) col += ne_fprintf(f, "\n ----"); for (i = 0; i < TARGET_INSN_START_WORDS; ++i) { - target_ulong a; -#if TARGET_LONG_BITS > TCG_TARGET_REG_BITS - a = deposit64(op->args[i * 2], 32, 32, op->args[i * 2 + 1]); -#else - a = op->args[i]; -#endif - col += ne_fprintf(f, " " TARGET_FMT_lx, a); + col += ne_fprintf(f, " %016" PRIx64, + tcg_get_insn_start_param(op, i)); } } else if (c == INDEX_op_call) { const TCGHelperInfo *info = tcg_call_info(op); @@ -2324,23 +2476,38 @@ static void tcg_dump_ops(TCGContext *s, FILE *f, bool have_prefs) } i = 1; break; - case INDEX_op_qemu_ld_i32: - case INDEX_op_qemu_st_i32: - case INDEX_op_qemu_st8_i32: - case INDEX_op_qemu_ld_i64: - case INDEX_op_qemu_st_i64: + case INDEX_op_qemu_ld_a32_i32: + case INDEX_op_qemu_ld_a64_i32: + case INDEX_op_qemu_st_a32_i32: + case INDEX_op_qemu_st_a64_i32: + case INDEX_op_qemu_st8_a32_i32: + case INDEX_op_qemu_st8_a64_i32: + case INDEX_op_qemu_ld_a32_i64: + case INDEX_op_qemu_ld_a64_i64: + case INDEX_op_qemu_st_a32_i64: + case INDEX_op_qemu_st_a64_i64: + case INDEX_op_qemu_ld_a32_i128: + case INDEX_op_qemu_ld_a64_i128: + case INDEX_op_qemu_st_a32_i128: + case INDEX_op_qemu_st_a64_i128: { + const char *s_al, *s_op, *s_at; MemOpIdx oi = op->args[k++]; MemOp op = get_memop(oi); unsigned ix = get_mmuidx(oi); - if (op & ~(MO_AMASK | MO_BSWAP | MO_SSIZE)) { - col += ne_fprintf(f, ",$0x%x,%u", op, ix); + s_al = alignment_name[(op & MO_AMASK) >> MO_ASHIFT]; + s_op = ldst_name[op & (MO_BSWAP | MO_SSIZE)]; + s_at = atom_name[(op & MO_ATOM_MASK) >> MO_ATOM_SHIFT]; + op &= ~(MO_AMASK | MO_BSWAP | MO_SSIZE | MO_ATOM_MASK); + + /* If all fields are accounted for, print symbolically. 
*/ + if (!op && s_al && s_op && s_at) { + col += ne_fprintf(f, ",%s%s%s,%u", + s_at, s_al, s_op, ix); } else { - const char *s_al, *s_op; - s_al = alignment_name[(op & MO_AMASK) >> MO_ASHIFT]; - s_op = ldst_name[op & (MO_BSWAP | MO_SSIZE)]; - col += ne_fprintf(f, ",%s%s,%u", s_al, s_op, ix); + op = get_memop(oi); + col += ne_fprintf(f, ",$0x%x,%u", op, ix); } i = 1; } @@ -5087,6 +5254,92 @@ static void tcg_reg_alloc_call(TCGContext *s, TCGOp *op) } } +/** + * atom_and_align_for_opc: + * @s: tcg context + * @opc: memory operation code + * @host_atom: MO_ATOM_{IFALIGN,WITHIN16,SUBALIGN} for host operations + * @allow_two_ops: true if we are prepared to issue two operations + * + * Return the alignment and atomicity to use for the inline fast path + * for the given memory operation. The alignment may be larger than + * that specified in @opc, and the correct alignment will be diagnosed + * by the slow path helper. + * + * If @allow_two_ops, the host is prepared to test for 2x alignment, + * and issue two loads or stores for subalignment. + */ +static TCGAtomAlign atom_and_align_for_opc(TCGContext *s, MemOp opc, + MemOp host_atom, bool allow_two_ops) +{ + MemOp align = get_alignment_bits(opc); + MemOp size = opc & MO_SIZE; + MemOp half = size ? size - 1 : 0; + MemOp atmax; + MemOp atom; + + /* When serialized, no further atomicity required. */ + if (s->gen_tb->cflags & CF_PARALLEL) { + atom = opc & MO_ATOM_MASK; + } else { + atom = MO_ATOM_NONE; + } + + switch (atom) { + case MO_ATOM_NONE: + /* The operation requires no specific atomicity. */ + atmax = MO_8; + break; + + case MO_ATOM_IFALIGN: + atmax = size; + break; + + case MO_ATOM_IFALIGN_PAIR: + atmax = half; + break; + + case MO_ATOM_WITHIN16: + atmax = size; + if (size == MO_128) { + /* Misalignment implies !within16, and therefore no atomicity. */ + } else if (host_atom != MO_ATOM_WITHIN16) { + /* The host does not implement within16, so require alignment. */ + align = MAX(align, size); + } + break; + + case MO_ATOM_WITHIN16_PAIR: + atmax = size; + /* + * Misalignment implies !within16, and therefore half atomicity. + * Any host prepared for two operations can implement this with + * half alignment. + */ + if (host_atom != MO_ATOM_WITHIN16 && allow_two_ops) { + align = MAX(align, half); + } + break; + + case MO_ATOM_SUBALIGN: + atmax = size; + if (host_atom != MO_ATOM_SUBALIGN) { + /* If unaligned but not odd, there are subobjects up to half. */ + if (allow_two_ops) { + align = MAX(align, half); + } else { + align = MAX(align, size); + } + } + break; + + default: + g_assert_not_reached(); + } + + return (TCGAtomAlign){ .atom = atmax, .align = align }; +} + /* * Similarly for qemu_ld/st slow path helpers. * We must re-implement tcg_gen_callN and tcg_reg_alloc_call simultaneously, @@ -5109,57 +5362,12 @@ static int tcg_out_helper_stk_ofs(TCGType type, unsigned slot) return ofs; } -static void tcg_out_helper_load_regs(TCGContext *s, - unsigned nmov, TCGMovExtend *mov, - unsigned ntmp, const int *tmp) -{ - switch (nmov) { - default: - /* The backend must have provided enough temps for the worst case. */ - tcg_debug_assert(ntmp + 1 >= nmov); - - for (unsigned i = nmov - 1; i >= 2; --i) { - TCGReg dst = mov[i].dst; - - for (unsigned j = 0; j < i; ++j) { - if (dst == mov[j].src) { - /* - * Conflict. - * Copy the source to a temporary, recurse for the - * remaining moves, perform the extension from our - * scratch on the way out. 
- */ - TCGReg scratch = tmp[--ntmp]; - tcg_out_mov(s, mov[i].src_type, scratch, mov[i].src); - mov[i].src = scratch; - - tcg_out_helper_load_regs(s, i, mov, ntmp, tmp); - tcg_out_movext1(s, &mov[i]); - return; - } - } - - /* No conflicts: perform this move and continue. */ - tcg_out_movext1(s, &mov[i]); - } - /* fall through for the final two moves */ - - case 2: - tcg_out_movext2(s, mov, mov + 1, ntmp ? tmp[0] : -1); - return; - case 1: - tcg_out_movext1(s, mov); - return; - case 0: - g_assert_not_reached(); - } -} - static void tcg_out_helper_load_slots(TCGContext *s, unsigned nmov, TCGMovExtend *mov, const TCGLdstHelperParam *parm) { unsigned i; + TCGReg dst3; /* * Start from the end, storing to the stack first. @@ -5197,7 +5405,47 @@ static void tcg_out_helper_load_slots(TCGContext *s, for (i = 0; i < nmov; ++i) { mov[i].dst = tcg_target_call_iarg_regs[mov[i].dst]; } - tcg_out_helper_load_regs(s, nmov, mov, parm->ntmp, parm->tmp); + + switch (nmov) { + case 4: + /* The backend must have provided enough temps for the worst case. */ + tcg_debug_assert(parm->ntmp >= 2); + + dst3 = mov[3].dst; + for (unsigned j = 0; j < 3; ++j) { + if (dst3 == mov[j].src) { + /* + * Conflict. Copy the source to a temporary, perform the + * remaining moves, then the extension from our scratch + * on the way out. + */ + TCGReg scratch = parm->tmp[1]; + + tcg_out_mov(s, mov[3].src_type, scratch, mov[3].src); + tcg_out_movext3(s, mov, mov + 1, mov + 2, parm->tmp[0]); + tcg_out_movext1_new_src(s, &mov[3], scratch); + break; + } + } + + /* No conflicts: perform this move and continue. */ + tcg_out_movext1(s, &mov[3]); + /* fall through */ + + case 3: + tcg_out_movext3(s, mov, mov + 1, mov + 2, + parm->ntmp ? parm->tmp[0] : -1); + break; + case 2: + tcg_out_movext2(s, mov, mov + 1, + parm->ntmp ? 
parm->tmp[0] : -1); + break; + case 1: + tcg_out_movext1(s, mov); + break; + default: + g_assert_not_reached(); + } } static void tcg_out_helper_load_imm(TCGContext *s, unsigned slot, @@ -5288,6 +5536,8 @@ static unsigned tcg_out_helper_add_mov(TCGMovExtend *mov, TCGType dst_type, TCGType src_type, TCGReg lo, TCGReg hi) { + MemOp reg_mo; + if (dst_type <= TCG_TYPE_REG) { MemOp src_ext; @@ -5315,19 +5565,25 @@ static unsigned tcg_out_helper_add_mov(TCGMovExtend *mov, return 1; } - assert(TCG_TARGET_REG_BITS == 32); + if (TCG_TARGET_REG_BITS == 32) { + assert(dst_type == TCG_TYPE_I64); + reg_mo = MO_32; + } else { + assert(dst_type == TCG_TYPE_I128); + reg_mo = MO_64; + } mov[0].dst = loc[HOST_BIG_ENDIAN].arg_slot; mov[0].src = lo; - mov[0].dst_type = TCG_TYPE_I32; - mov[0].src_type = TCG_TYPE_I32; - mov[0].src_ext = MO_32; + mov[0].dst_type = TCG_TYPE_REG; + mov[0].src_type = TCG_TYPE_REG; + mov[0].src_ext = reg_mo; mov[1].dst = loc[!HOST_BIG_ENDIAN].arg_slot; mov[1].src = hi; - mov[1].dst_type = TCG_TYPE_I32; - mov[1].src_type = TCG_TYPE_I32; - mov[1].src_ext = MO_32; + mov[1].dst_type = TCG_TYPE_REG; + mov[1].src_type = TCG_TYPE_REG; + mov[1].src_ext = reg_mo; return 2; } @@ -5350,6 +5606,9 @@ static void tcg_out_ld_helper_args(TCGContext *s, const TCGLabelQemuLdst *ldst, case MO_64: info = &info_helper_ld64_mmu; break; + case MO_128: + info = &info_helper_ld128_mmu; + break; default: g_assert_not_reached(); } @@ -5358,14 +5617,54 @@ static void tcg_out_ld_helper_args(TCGContext *s, const TCGLabelQemuLdst *ldst, next_arg = 1; loc = &info->in[next_arg]; - nmov = tcg_out_helper_add_mov(mov, loc, TCG_TYPE_TL, TCG_TYPE_TL, - ldst->addrlo_reg, ldst->addrhi_reg); - next_arg += nmov; + if (TCG_TARGET_REG_BITS == 32 && s->addr_type == TCG_TYPE_I32) { + /* + * 32-bit host with 32-bit guest: zero-extend the guest address + * to 64-bits for the helper by storing the low part, then + * load a zero for the high part. + */ + tcg_out_helper_add_mov(mov, loc + HOST_BIG_ENDIAN, + TCG_TYPE_I32, TCG_TYPE_I32, + ldst->addrlo_reg, -1); + tcg_out_helper_load_slots(s, 1, mov, parm); + + tcg_out_helper_load_imm(s, loc[!HOST_BIG_ENDIAN].arg_slot, + TCG_TYPE_I32, 0, parm); + next_arg += 2; + } else { + nmov = tcg_out_helper_add_mov(mov, loc, TCG_TYPE_I64, s->addr_type, + ldst->addrlo_reg, ldst->addrhi_reg); + tcg_out_helper_load_slots(s, nmov, mov, parm); + next_arg += nmov; + } - tcg_out_helper_load_slots(s, nmov, mov, parm); + switch (info->out_kind) { + case TCG_CALL_RET_NORMAL: + case TCG_CALL_RET_BY_VEC: + break; + case TCG_CALL_RET_BY_REF: + /* + * The return reference is in the first argument slot. + * We need memory in which to return: re-use the top of stack. + */ + { + int ofs_slot0 = TCG_TARGET_CALL_STACK_OFFSET; - /* No special attention for 32 and 64-bit return values. 
*/ - tcg_debug_assert(info->out_kind == TCG_CALL_RET_NORMAL); + if (arg_slot_reg_p(0)) { + tcg_out_addi_ptr(s, tcg_target_call_iarg_regs[0], + TCG_REG_CALL_STACK, ofs_slot0); + } else { + tcg_debug_assert(parm->ntmp != 0); + tcg_out_addi_ptr(s, parm->tmp[0], + TCG_REG_CALL_STACK, ofs_slot0); + tcg_out_st(s, TCG_TYPE_PTR, parm->tmp[0], + TCG_REG_CALL_STACK, ofs_slot0); + } + } + break; + default: + g_assert_not_reached(); + } tcg_out_helper_load_common_args(s, ldst, parm, info, next_arg); } @@ -5374,11 +5673,18 @@ static void tcg_out_ld_helper_ret(TCGContext *s, const TCGLabelQemuLdst *ldst, bool load_sign, const TCGLdstHelperParam *parm) { + MemOp mop = get_memop(ldst->oi); TCGMovExtend mov[2]; + int ofs_slot0; - if (ldst->type <= TCG_TYPE_REG) { - MemOp mop = get_memop(ldst->oi); + switch (ldst->type) { + case TCG_TYPE_I64: + if (TCG_TARGET_REG_BITS == 32) { + break; + } + /* fall through */ + case TCG_TYPE_I32: mov[0].dst = ldst->datalo_reg; mov[0].src = tcg_target_call_oarg_reg(TCG_CALL_RET_NORMAL, 0); mov[0].dst_type = ldst->type; @@ -5404,25 +5710,49 @@ static void tcg_out_ld_helper_ret(TCGContext *s, const TCGLabelQemuLdst *ldst, mov[0].src_ext = mop & MO_SSIZE; } tcg_out_movext1(s, mov); - } else { - assert(TCG_TARGET_REG_BITS == 32); - - mov[0].dst = ldst->datalo_reg; - mov[0].src = - tcg_target_call_oarg_reg(TCG_CALL_RET_NORMAL, HOST_BIG_ENDIAN); - mov[0].dst_type = TCG_TYPE_I32; - mov[0].src_type = TCG_TYPE_I32; - mov[0].src_ext = MO_32; + return; - mov[1].dst = ldst->datahi_reg; - mov[1].src = - tcg_target_call_oarg_reg(TCG_CALL_RET_NORMAL, !HOST_BIG_ENDIAN); - mov[1].dst_type = TCG_TYPE_REG; - mov[1].src_type = TCG_TYPE_REG; - mov[1].src_ext = MO_32; + case TCG_TYPE_I128: + tcg_debug_assert(TCG_TARGET_REG_BITS == 64); + ofs_slot0 = TCG_TARGET_CALL_STACK_OFFSET; + switch (TCG_TARGET_CALL_RET_I128) { + case TCG_CALL_RET_NORMAL: + break; + case TCG_CALL_RET_BY_VEC: + tcg_out_st(s, TCG_TYPE_V128, + tcg_target_call_oarg_reg(TCG_CALL_RET_BY_VEC, 0), + TCG_REG_CALL_STACK, ofs_slot0); + /* fall through */ + case TCG_CALL_RET_BY_REF: + tcg_out_ld(s, TCG_TYPE_I64, ldst->datalo_reg, + TCG_REG_CALL_STACK, ofs_slot0 + 8 * HOST_BIG_ENDIAN); + tcg_out_ld(s, TCG_TYPE_I64, ldst->datahi_reg, + TCG_REG_CALL_STACK, ofs_slot0 + 8 * !HOST_BIG_ENDIAN); + return; + default: + g_assert_not_reached(); + } + break; - tcg_out_movext2(s, mov, mov + 1, parm->ntmp ? parm->tmp[0] : -1); + default: + g_assert_not_reached(); } + + mov[0].dst = ldst->datalo_reg; + mov[0].src = + tcg_target_call_oarg_reg(TCG_CALL_RET_NORMAL, HOST_BIG_ENDIAN); + mov[0].dst_type = TCG_TYPE_I32; + mov[0].src_type = TCG_TYPE_I32; + mov[0].src_ext = TCG_TARGET_REG_BITS == 32 ? MO_32 : MO_64; + + mov[1].dst = ldst->datahi_reg; + mov[1].src = + tcg_target_call_oarg_reg(TCG_CALL_RET_NORMAL, !HOST_BIG_ENDIAN); + mov[1].dst_type = TCG_TYPE_REG; + mov[1].src_type = TCG_TYPE_REG; + mov[1].src_ext = TCG_TARGET_REG_BITS == 32 ? MO_32 : MO_64; + + tcg_out_movext2(s, mov, mov + 1, parm->ntmp ? parm->tmp[0] : -1); } static void tcg_out_st_helper_args(TCGContext *s, const TCGLabelQemuLdst *ldst, @@ -5446,6 +5776,10 @@ static void tcg_out_st_helper_args(TCGContext *s, const TCGLabelQemuLdst *ldst, info = &info_helper_st64_mmu; data_type = TCG_TYPE_I64; break; + case MO_128: + info = &info_helper_st128_mmu; + data_type = TCG_TYPE_I128; + break; default: g_assert_not_reached(); } @@ -5456,20 +5790,74 @@ static void tcg_out_st_helper_args(TCGContext *s, const TCGLabelQemuLdst *ldst, /* Handle addr argument. 
*/ loc = &info->in[next_arg]; - n = tcg_out_helper_add_mov(mov, loc, TCG_TYPE_TL, TCG_TYPE_TL, - ldst->addrlo_reg, ldst->addrhi_reg); - next_arg += n; - nmov += n; + if (TCG_TARGET_REG_BITS == 32 && s->addr_type == TCG_TYPE_I32) { + /* + * 32-bit host with 32-bit guest: zero-extend the guest address + * to 64-bits for the helper by storing the low part. Later, + * after we have processed the register inputs, we will load a + * zero for the high part. + */ + tcg_out_helper_add_mov(mov, loc + HOST_BIG_ENDIAN, + TCG_TYPE_I32, TCG_TYPE_I32, + ldst->addrlo_reg, -1); + next_arg += 2; + nmov += 1; + } else { + n = tcg_out_helper_add_mov(mov, loc, TCG_TYPE_I64, s->addr_type, + ldst->addrlo_reg, ldst->addrhi_reg); + next_arg += n; + nmov += n; + } /* Handle data argument. */ loc = &info->in[next_arg]; - n = tcg_out_helper_add_mov(mov + nmov, loc, data_type, ldst->type, - ldst->datalo_reg, ldst->datahi_reg); - next_arg += n; - nmov += n; - tcg_debug_assert(nmov <= ARRAY_SIZE(mov)); + switch (loc->kind) { + case TCG_CALL_ARG_NORMAL: + case TCG_CALL_ARG_EXTEND_U: + case TCG_CALL_ARG_EXTEND_S: + n = tcg_out_helper_add_mov(mov + nmov, loc, data_type, ldst->type, + ldst->datalo_reg, ldst->datahi_reg); + next_arg += n; + nmov += n; + tcg_out_helper_load_slots(s, nmov, mov, parm); + break; + + case TCG_CALL_ARG_BY_REF: + tcg_debug_assert(TCG_TARGET_REG_BITS == 64); + tcg_debug_assert(data_type == TCG_TYPE_I128); + tcg_out_st(s, TCG_TYPE_I64, + HOST_BIG_ENDIAN ? ldst->datahi_reg : ldst->datalo_reg, + TCG_REG_CALL_STACK, arg_slot_stk_ofs(loc[0].ref_slot)); + tcg_out_st(s, TCG_TYPE_I64, + HOST_BIG_ENDIAN ? ldst->datalo_reg : ldst->datahi_reg, + TCG_REG_CALL_STACK, arg_slot_stk_ofs(loc[1].ref_slot)); + + tcg_out_helper_load_slots(s, nmov, mov, parm); + + if (arg_slot_reg_p(loc->arg_slot)) { + tcg_out_addi_ptr(s, tcg_target_call_iarg_regs[loc->arg_slot], + TCG_REG_CALL_STACK, + arg_slot_stk_ofs(loc->ref_slot)); + } else { + tcg_debug_assert(parm->ntmp != 0); + tcg_out_addi_ptr(s, parm->tmp[0], TCG_REG_CALL_STACK, + arg_slot_stk_ofs(loc->ref_slot)); + tcg_out_st(s, TCG_TYPE_PTR, parm->tmp[0], + TCG_REG_CALL_STACK, arg_slot_stk_ofs(loc->arg_slot)); + } + next_arg += 2; + break; + + default: + g_assert_not_reached(); + } + + if (TCG_TARGET_REG_BITS == 32 && s->addr_type == TCG_TYPE_I32) { + /* Zero extend the address by loading a zero for the high part. 
*/ + loc = &info->in[1 + !HOST_BIG_ENDIAN]; + tcg_out_helper_load_imm(s, loc->arg_slot, TCG_TYPE_I32, 0, parm); + } - tcg_out_helper_load_slots(s, nmov, mov, parm); tcg_out_helper_load_common_args(s, ldst, parm, info, next_arg); } @@ -5582,7 +5970,7 @@ int64_t tcg_cpu_exec_time(void) #endif -int tcg_gen_code(TCGContext *s, TranslationBlock *tb, target_ulong pc_start) +int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start) { #ifdef CONFIG_PROFILER TCGProfile *prof = &s->prof; @@ -5743,13 +6131,8 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, target_ulong pc_start) } num_insns++; for (i = 0; i < TARGET_INSN_START_WORDS; ++i) { - target_ulong a; -#if TARGET_LONG_BITS > TCG_TARGET_REG_BITS - a = deposit64(op->args[i * 2], 32, 32, op->args[i * 2 + 1]); -#else - a = op->args[i]; -#endif - s->gen_insn_data[num_insns][i] = a; + s->gen_insn_data[num_insns][i] = + tcg_get_insn_start_param(op, i); } break; case INDEX_op_discard: @@ -286,162 +286,54 @@ static bool tci_compare64(uint64_t u0, uint64_t u1, TCGCond condition) return result; } -static uint64_t tci_qemu_ld(CPUArchState *env, target_ulong taddr, +static uint64_t tci_qemu_ld(CPUArchState *env, uint64_t taddr, MemOpIdx oi, const void *tb_ptr) { MemOp mop = get_memop(oi); uintptr_t ra = (uintptr_t)tb_ptr; -#ifdef CONFIG_SOFTMMU - switch (mop & (MO_BSWAP | MO_SSIZE)) { + switch (mop & MO_SSIZE) { case MO_UB: - return helper_ret_ldub_mmu(env, taddr, oi, ra); + return helper_ldub_mmu(env, taddr, oi, ra); case MO_SB: - return helper_ret_ldsb_mmu(env, taddr, oi, ra); - case MO_LEUW: - return helper_le_lduw_mmu(env, taddr, oi, ra); - case MO_LESW: - return helper_le_ldsw_mmu(env, taddr, oi, ra); - case MO_LEUL: - return helper_le_ldul_mmu(env, taddr, oi, ra); - case MO_LESL: - return helper_le_ldsl_mmu(env, taddr, oi, ra); - case MO_LEUQ: - return helper_le_ldq_mmu(env, taddr, oi, ra); - case MO_BEUW: - return helper_be_lduw_mmu(env, taddr, oi, ra); - case MO_BESW: - return helper_be_ldsw_mmu(env, taddr, oi, ra); - case MO_BEUL: - return helper_be_ldul_mmu(env, taddr, oi, ra); - case MO_BESL: - return helper_be_ldsl_mmu(env, taddr, oi, ra); - case MO_BEUQ: - return helper_be_ldq_mmu(env, taddr, oi, ra); + return helper_ldsb_mmu(env, taddr, oi, ra); + case MO_UW: + return helper_lduw_mmu(env, taddr, oi, ra); + case MO_SW: + return helper_ldsw_mmu(env, taddr, oi, ra); + case MO_UL: + return helper_ldul_mmu(env, taddr, oi, ra); + case MO_SL: + return helper_ldsl_mmu(env, taddr, oi, ra); + case MO_UQ: + return helper_ldq_mmu(env, taddr, oi, ra); default: g_assert_not_reached(); } -#else - void *haddr = g2h(env_cpu(env), taddr); - unsigned a_mask = (1u << get_alignment_bits(mop)) - 1; - uint64_t ret; - - set_helper_retaddr(ra); - if (taddr & a_mask) { - helper_unaligned_ld(env, taddr); - } - switch (mop & (MO_BSWAP | MO_SSIZE)) { - case MO_UB: - ret = ldub_p(haddr); - break; - case MO_SB: - ret = ldsb_p(haddr); - break; - case MO_LEUW: - ret = lduw_le_p(haddr); - break; - case MO_LESW: - ret = ldsw_le_p(haddr); - break; - case MO_LEUL: - ret = (uint32_t)ldl_le_p(haddr); - break; - case MO_LESL: - ret = (int32_t)ldl_le_p(haddr); - break; - case MO_LEUQ: - ret = ldq_le_p(haddr); - break; - case MO_BEUW: - ret = lduw_be_p(haddr); - break; - case MO_BESW: - ret = ldsw_be_p(haddr); - break; - case MO_BEUL: - ret = (uint32_t)ldl_be_p(haddr); - break; - case MO_BESL: - ret = (int32_t)ldl_be_p(haddr); - break; - case MO_BEUQ: - ret = ldq_be_p(haddr); - break; - default: - g_assert_not_reached(); - } - clear_helper_retaddr(); - return 
ret; -#endif } -static void tci_qemu_st(CPUArchState *env, target_ulong taddr, uint64_t val, +static void tci_qemu_st(CPUArchState *env, uint64_t taddr, uint64_t val, MemOpIdx oi, const void *tb_ptr) { MemOp mop = get_memop(oi); uintptr_t ra = (uintptr_t)tb_ptr; -#ifdef CONFIG_SOFTMMU - switch (mop & (MO_BSWAP | MO_SIZE)) { + switch (mop & MO_SIZE) { case MO_UB: - helper_ret_stb_mmu(env, taddr, val, oi, ra); + helper_stb_mmu(env, taddr, val, oi, ra); break; - case MO_LEUW: - helper_le_stw_mmu(env, taddr, val, oi, ra); + case MO_UW: + helper_stw_mmu(env, taddr, val, oi, ra); break; - case MO_LEUL: - helper_le_stl_mmu(env, taddr, val, oi, ra); + case MO_UL: + helper_stl_mmu(env, taddr, val, oi, ra); break; - case MO_LEUQ: - helper_le_stq_mmu(env, taddr, val, oi, ra); - break; - case MO_BEUW: - helper_be_stw_mmu(env, taddr, val, oi, ra); - break; - case MO_BEUL: - helper_be_stl_mmu(env, taddr, val, oi, ra); - break; - case MO_BEUQ: - helper_be_stq_mmu(env, taddr, val, oi, ra); + case MO_UQ: + helper_stq_mmu(env, taddr, val, oi, ra); break; default: g_assert_not_reached(); } -#else - void *haddr = g2h(env_cpu(env), taddr); - unsigned a_mask = (1u << get_alignment_bits(mop)) - 1; - - set_helper_retaddr(ra); - if (taddr & a_mask) { - helper_unaligned_st(env, taddr); - } - switch (mop & (MO_BSWAP | MO_SIZE)) { - case MO_UB: - stb_p(haddr, val); - break; - case MO_LEUW: - stw_le_p(haddr, val); - break; - case MO_LEUL: - stl_le_p(haddr, val); - break; - case MO_LEUQ: - stq_le_p(haddr, val); - break; - case MO_BEUW: - stw_be_p(haddr, val); - break; - case MO_BEUL: - stl_be_p(haddr, val); - break; - case MO_BEUQ: - stq_be_p(haddr, val); - break; - default: - g_assert_not_reached(); - } - clear_helper_retaddr(); -#endif } #if TCG_TARGET_REG_BITS == 64 @@ -480,10 +372,9 @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env, TCGReg r0, r1, r2, r3, r4, r5; tcg_target_ulong t1; TCGCond condition; - target_ulong taddr; uint8_t pos, len; uint32_t tmp32; - uint64_t tmp64; + uint64_t tmp64, taddr; uint64_t T1, T2; MemOpIdx oi; int32_t ofs; @@ -1030,30 +921,41 @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env, tb_ptr = ptr; break; - case INDEX_op_qemu_ld_i32: - if (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS) { + case INDEX_op_qemu_ld_a32_i32: + tci_args_rrm(insn, &r0, &r1, &oi); + taddr = (uint32_t)regs[r1]; + goto do_ld_i32; + case INDEX_op_qemu_ld_a64_i32: + if (TCG_TARGET_REG_BITS == 64) { tci_args_rrm(insn, &r0, &r1, &oi); taddr = regs[r1]; } else { tci_args_rrrm(insn, &r0, &r1, &r2, &oi); taddr = tci_uint64(regs[r2], regs[r1]); } - tmp32 = tci_qemu_ld(env, taddr, oi, tb_ptr); - regs[r0] = tmp32; + do_ld_i32: + regs[r0] = tci_qemu_ld(env, taddr, oi, tb_ptr); break; - case INDEX_op_qemu_ld_i64: + case INDEX_op_qemu_ld_a32_i64: if (TCG_TARGET_REG_BITS == 64) { tci_args_rrm(insn, &r0, &r1, &oi); - taddr = regs[r1]; - } else if (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS) { + taddr = (uint32_t)regs[r1]; + } else { tci_args_rrrm(insn, &r0, &r1, &r2, &oi); - taddr = regs[r2]; + taddr = (uint32_t)regs[r2]; + } + goto do_ld_i64; + case INDEX_op_qemu_ld_a64_i64: + if (TCG_TARGET_REG_BITS == 64) { + tci_args_rrm(insn, &r0, &r1, &oi); + taddr = regs[r1]; } else { tci_args_rrrrr(insn, &r0, &r1, &r2, &r3, &r4); taddr = tci_uint64(regs[r3], regs[r2]); oi = regs[r4]; } + do_ld_i64: tmp64 = tci_qemu_ld(env, taddr, oi, tb_ptr); if (TCG_TARGET_REG_BITS == 32) { tci_write_reg64(regs, r1, r0, tmp64); @@ -1062,34 +964,45 @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env, } break; - case 
INDEX_op_qemu_st_i32: - if (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS) { + case INDEX_op_qemu_st_a32_i32: + tci_args_rrm(insn, &r0, &r1, &oi); + taddr = (uint32_t)regs[r1]; + goto do_st_i32; + case INDEX_op_qemu_st_a64_i32: + if (TCG_TARGET_REG_BITS == 64) { tci_args_rrm(insn, &r0, &r1, &oi); taddr = regs[r1]; } else { tci_args_rrrm(insn, &r0, &r1, &r2, &oi); taddr = tci_uint64(regs[r2], regs[r1]); } - tmp32 = regs[r0]; - tci_qemu_st(env, taddr, tmp32, oi, tb_ptr); + do_st_i32: + tci_qemu_st(env, taddr, regs[r0], oi, tb_ptr); break; - case INDEX_op_qemu_st_i64: + case INDEX_op_qemu_st_a32_i64: if (TCG_TARGET_REG_BITS == 64) { tci_args_rrm(insn, &r0, &r1, &oi); - taddr = regs[r1]; tmp64 = regs[r0]; + taddr = (uint32_t)regs[r1]; } else { - if (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS) { - tci_args_rrrm(insn, &r0, &r1, &r2, &oi); - taddr = regs[r2]; - } else { - tci_args_rrrrr(insn, &r0, &r1, &r2, &r3, &r4); - taddr = tci_uint64(regs[r3], regs[r2]); - oi = regs[r4]; - } + tci_args_rrrm(insn, &r0, &r1, &r2, &oi); + tmp64 = tci_uint64(regs[r1], regs[r0]); + taddr = (uint32_t)regs[r2]; + } + goto do_st_i64; + case INDEX_op_qemu_st_a64_i64: + if (TCG_TARGET_REG_BITS == 64) { + tci_args_rrm(insn, &r0, &r1, &oi); + tmp64 = regs[r0]; + taddr = regs[r1]; + } else { + tci_args_rrrrr(insn, &r0, &r1, &r2, &r3, &r4); tmp64 = tci_uint64(regs[r1], regs[r0]); + taddr = tci_uint64(regs[r3], regs[r2]); + oi = regs[r4]; } + do_st_i64: tci_qemu_st(env, taddr, tmp64, oi, tb_ptr); break; @@ -1359,15 +1272,21 @@ int print_insn_tci(bfd_vma addr, disassemble_info *info) str_r(r3), str_r(r4), str_r(r5)); break; - case INDEX_op_qemu_ld_i64: - case INDEX_op_qemu_st_i64: - len = DIV_ROUND_UP(64, TCG_TARGET_REG_BITS); + case INDEX_op_qemu_ld_a32_i32: + case INDEX_op_qemu_st_a32_i32: + len = 1 + 1; + goto do_qemu_ldst; + case INDEX_op_qemu_ld_a32_i64: + case INDEX_op_qemu_st_a32_i64: + case INDEX_op_qemu_ld_a64_i32: + case INDEX_op_qemu_st_a64_i32: + len = 1 + DIV_ROUND_UP(64, TCG_TARGET_REG_BITS); + goto do_qemu_ldst; + case INDEX_op_qemu_ld_a64_i64: + case INDEX_op_qemu_st_a64_i64: + len = 2 * DIV_ROUND_UP(64, TCG_TARGET_REG_BITS); goto do_qemu_ldst; - case INDEX_op_qemu_ld_i32: - case INDEX_op_qemu_st_i32: - len = 1; do_qemu_ldst: - len += DIV_ROUND_UP(TARGET_LONG_BITS, TCG_TARGET_REG_BITS); switch (len) { case 2: tci_args_rrm(insn, &r0, &r1, &oi); diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc index 4cf03a579c..c9516a5e8b 100644 --- a/tcg/tci/tcg-target.c.inc +++ b/tcg/tci/tcg-target.c.inc @@ -156,22 +156,22 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) case INDEX_op_setcond2_i32: return C_O1_I4(r, r, r, r, r); - case INDEX_op_qemu_ld_i32: - return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS - ? C_O1_I1(r, r) - : C_O1_I2(r, r, r)); - case INDEX_op_qemu_ld_i64: - return (TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, r) - : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? C_O2_I1(r, r, r) - : C_O2_I2(r, r, r, r)); - case INDEX_op_qemu_st_i32: - return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS - ? C_O0_I2(r, r) - : C_O0_I3(r, r, r)); - case INDEX_op_qemu_st_i64: - return (TCG_TARGET_REG_BITS == 64 ? C_O0_I2(r, r) - : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? C_O0_I3(r, r, r) - : C_O0_I4(r, r, r, r)); + case INDEX_op_qemu_ld_a32_i32: + return C_O1_I1(r, r); + case INDEX_op_qemu_ld_a64_i32: + return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, r) : C_O1_I2(r, r, r); + case INDEX_op_qemu_ld_a32_i64: + return TCG_TARGET_REG_BITS == 64 ? 
C_O1_I1(r, r) : C_O2_I1(r, r, r); + case INDEX_op_qemu_ld_a64_i64: + return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, r) : C_O2_I2(r, r, r, r); + case INDEX_op_qemu_st_a32_i32: + return C_O0_I2(r, r); + case INDEX_op_qemu_st_a64_i32: + return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(r, r) : C_O0_I3(r, r, r); + case INDEX_op_qemu_st_a32_i64: + return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(r, r) : C_O0_I3(r, r, r); + case INDEX_op_qemu_st_a64_i64: + return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(r, r) : C_O0_I4(r, r, r, r); default: g_assert_not_reached(); @@ -243,7 +243,7 @@ static bool patch_reloc(tcg_insn_unit *code_ptr, int type, return false; } -static void stack_bounds_check(TCGReg base, target_long offset) +static void stack_bounds_check(TCGReg base, intptr_t offset) { if (base == TCG_REG_CALL_STACK) { tcg_debug_assert(offset >= 0); @@ -849,21 +849,24 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, tcg_out_op_rrrr(s, opc, args[0], args[1], args[2], args[3]); break; - case INDEX_op_qemu_ld_i32: - case INDEX_op_qemu_st_i32: - if (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS) { + case INDEX_op_qemu_ld_a32_i32: + case INDEX_op_qemu_st_a32_i32: + tcg_out_op_rrm(s, opc, args[0], args[1], args[2]); + break; + case INDEX_op_qemu_ld_a64_i32: + case INDEX_op_qemu_st_a64_i32: + case INDEX_op_qemu_ld_a32_i64: + case INDEX_op_qemu_st_a32_i64: + if (TCG_TARGET_REG_BITS == 64) { tcg_out_op_rrm(s, opc, args[0], args[1], args[2]); } else { tcg_out_op_rrrm(s, opc, args[0], args[1], args[2], args[3]); } break; - - case INDEX_op_qemu_ld_i64: - case INDEX_op_qemu_st_i64: + case INDEX_op_qemu_ld_a64_i64: + case INDEX_op_qemu_st_a64_i64: if (TCG_TARGET_REG_BITS == 64) { tcg_out_op_rrm(s, opc, args[0], args[1], args[2]); - } else if (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS) { - tcg_out_op_rrrm(s, opc, args[0], args[1], args[2], args[3]); } else { tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_TMP, args[4]); tcg_out_op_rrrrr(s, opc, args[0], args[1], @@ -963,3 +966,8 @@ static void tcg_target_init(TCGContext *s) static inline void tcg_target_qemu_prologue(TCGContext *s) { } + +bool tcg_target_has_memory_bswap(MemOp memop) +{ + return true; +} diff --git a/tcg/tci/tcg-target.h b/tcg/tci/tcg-target.h index 7140a76a73..28dc6d5cfc 100644 --- a/tcg/tci/tcg-target.h +++ b/tcg/tci/tcg-target.h @@ -127,6 +127,8 @@ #define TCG_TARGET_HAS_mulu2_i32 1 #endif /* TCG_TARGET_REG_BITS == 64 */ +#define TCG_TARGET_HAS_qemu_ldst_i128 0 + /* Number of registers available. */ #define TCG_TARGET_NB_REGS 16 @@ -176,6 +178,4 @@ typedef enum { We prefer consistency across hosts on this. */ #define TCG_TARGET_DEFAULT_MO (0) -#define TCG_TARGET_HAS_MEMORY_BSWAP 1 - #endif /* TCG_TARGET_H */
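
For reference, the value-level semantics of the tcg_gen_nonatomic_cmpxchg_i128 expansion in the tcg-op.c hunks above reduce to the short plain-C sketch below. This is an illustration only, assuming nothing but the standard library: the u128 struct and cmpxchg128_sketch function are hypothetical stand-ins, not QEMU API, and the C ternaries model the two movcond_i64 ops that select between newv and oldv per 64-bit half.

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-in for TCGv_i128: two 64-bit halves. */
typedef struct { uint64_t lo, hi; } u128;

static u128 cmpxchg128_sketch(u128 *mem, u128 cmpv, u128 newv)
{
    u128 oldv = *mem;                    /* qemu_ld_i128 */

    /* "Compare i128": fold both halves into a single zero test. */
    uint64_t t0 = (oldv.lo ^ cmpv.lo) | (oldv.hi ^ cmpv.hi);
    bool equal = (t0 == 0);

    /* tmpv = equal ? newv : oldv, one movcond per half in the TCG version. */
    u128 tmpv;
    tmpv.lo = equal ? newv.lo : oldv.lo;
    tmpv.hi = equal ? newv.hi : oldv.hi;

    *mem = tmpv;                         /* unconditional writeback */
    return oldv;                         /* retv is always the old value */
}

int main(void)
{
    u128 mem = { 1, 2 }, cmpv = { 1, 2 }, newv = { 3, 4 };
    u128 old = cmpxchg128_sketch(&mem, cmpv, newv);
    printf("old=%016llx:%016llx mem=%016llx:%016llx\n",
           (unsigned long long)old.hi, (unsigned long long)old.lo,
           (unsigned long long)mem.hi, (unsigned long long)mem.lo);
    return 0;
}

Note that the store to memory happens whether or not the compare succeeded, matching the "Unconditional writeback" comment in the expansion. That is only correct because this path is generated when CF_PARALLEL is clear; under CF_PARALLEL, tcg_gen_atomic_cmpxchg_i128 dispatches to the table_cmpxchg helpers instead.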