path: root/target-arm/helper-a64.c
author     Emilio G. Cota <cota@braap.org>    2016-06-27 15:02:13 -0400
committer  Richard Henderson <rth@twiddle.net>    2016-10-26 08:29:02 -0700
commit     1dd089d0eec060dcd8478735114d98421d414805 (patch)
tree       32f50b948a31b291aa092c3dd7c70a4e198d3bed /target-arm/helper-a64.c
parent     cf12bce088f22b92bf62ffa0d7f6a3e951e355a9 (diff)
target-arm: emulate aarch64's LL/SC using cmpxchg helpers
Emulating LL/SC with cmpxchg is not correct, since it can suffer from
the ABA problem. Portable parallel code, however, is written assuming
only cmpxchg--and not LL/SC--is available. This means that in practice
emulating LL/SC with cmpxchg is a viable alternative.

The appended patch emulates LL/SC pairs in aarch64 with cmpxchg
helpers. This works in both user and system mode. In user mode, it
avoids pausing all other CPUs to perform the LL/SC pair. The resulting
performance and scalability improvement is significant, as the plots
below show. They plot the throughput of atomic_add-bench compiled for
ARM and executed on a 64-core x86 machine.

Hi-res plots: http://imgur.com/a/JVc8Y

[plot: atomic_add-bench, 1000000 ops/thread, [0,1] range -- throughput
 vs. number of threads (0-60), cmpxchg vs. master]
[plot: atomic_add-bench, 1000000 ops/thread, [0,2] range -- same axes]
[plot: atomic_add-bench, 1000000 ops/thread, [0,128] range -- same axes]
[plot: atomic_add-bench, 1000000 ops/thread, [0,1024] range -- same axes]

[rth: Rearrange 128-bit cmpxchg helper. Enforce alignment on LL.]

Signed-off-by: Emilio G. Cota <cota@braap.org>
Message-Id: <1467054136-10430-28-git-send-email-cota@braap.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
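(Editorial context for the message above, not part of the patch: a
rough, self-contained C sketch of the emulation scheme and of where
ABA creeps in. The emu_ldxr/emu_stxr names and the global exclusive
state are illustrative only; QEMU keeps the equivalent state per CPU
in env->exclusive_val and friends, as the diff below shows.)

#include <stdatomic.h>
#include <stdint.h>

/* Illustrative only: QEMU tracks this state per CPU, not globally. */
static uint64_t excl_addr;  /* address marked by the load-exclusive */
static uint64_t excl_val;   /* value observed by the load-exclusive */

static uint64_t emu_ldxr(_Atomic uint64_t *mem)
{
    excl_addr = (uint64_t)(uintptr_t)mem;
    excl_val = atomic_load(mem);
    return excl_val;
}

/* Returns 0 on success, 1 on failure, mirroring STXR's status result.
 * If the location changed A -> B -> A between the load and the store,
 * the cmpxchg still succeeds even though hardware LL/SC would fail:
 * that is the ABA problem the message above concedes. */
static uint64_t emu_stxr(_Atomic uint64_t *mem, uint64_t newval)
{
    uint64_t expected = excl_val;

    if ((uint64_t)(uintptr_t)mem != excl_addr) {
        return 1;
    }
    return atomic_compare_exchange_strong(mem, &expected, newval) ? 0 : 1;
}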
Diffstat (limited to 'target-arm/helper-a64.c')
-rw-r--r--  target-arm/helper-a64.c | 113 +++++++++++++++++++++++++
1 file changed, 113 insertions(+), 0 deletions(-)
diff --git a/target-arm/helper-a64.c b/target-arm/helper-a64.c
index 41e48a41b4..98b97df461 100644
--- a/target-arm/helper-a64.c
+++ b/target-arm/helper-a64.c
@@ -27,6 +27,10 @@
#include "qemu/bitops.h"
#include "internals.h"
#include "qemu/crc32c.h"
+#include "exec/exec-all.h"
+#include "exec/cpu_ldst.h"
+#include "qemu/int128.h"
+#include "tcg.h"
#include <zlib.h> /* For crc32 */

/* C2.4.7 Multiply and divide */
@@ -444,3 +448,112 @@ uint64_t HELPER(crc32c_64)(uint64_t acc, uint64_t val, uint32_t bytes)
/* Linux crc32c converts the output to one's complement. */
return crc32c(acc, buf, bytes) ^ 0xffffffff;
}
+
+/* Returns 0 on success; 1 otherwise. */
+uint64_t HELPER(paired_cmpxchg64_le)(CPUARMState *env, uint64_t addr,
+ uint64_t new_lo, uint64_t new_hi)
+{
+ uintptr_t ra = GETPC();
+ Int128 oldv, cmpv, newv;
+ bool success;
+
+ cmpv = int128_make128(env->exclusive_val, env->exclusive_high);
+ newv = int128_make128(new_lo, new_hi);
+
+ if (parallel_cpus) {
+#ifndef CONFIG_ATOMIC128
+ cpu_loop_exit_atomic(ENV_GET_CPU(env), ra);
+#else
+ int mem_idx = cpu_mmu_index(env, false);
+ TCGMemOpIdx oi = make_memop_idx(MO_LEQ | MO_ALIGN_16, mem_idx);
+ oldv = helper_atomic_cmpxchgo_le_mmu(env, addr, cmpv, newv, oi, ra);
+ success = int128_eq(oldv, cmpv);
+#endif
+ } else {
+ uint64_t o0, o1;
+
+#ifdef CONFIG_USER_ONLY
+ /* ??? Enforce alignment. */
+ uint64_t *haddr = g2h(addr);
+ o0 = ldq_le_p(haddr + 0);
+ o1 = ldq_le_p(haddr + 1);
+ oldv = int128_make128(o0, o1);
+
+ success = int128_eq(oldv, cmpv);
+ if (success) {
+ stq_le_p(haddr + 0, int128_getlo(newv));
+ stq_le_p(haddr + 1, int128_gethi(newv));
+ }
+#else
+ int mem_idx = cpu_mmu_index(env, false);
+ TCGMemOpIdx oi0 = make_memop_idx(MO_LEQ | MO_ALIGN_16, mem_idx);
+ TCGMemOpIdx oi1 = make_memop_idx(MO_LEQ, mem_idx);
+
+ o0 = helper_le_ldq_mmu(env, addr + 0, oi0, ra);
+ o1 = helper_le_ldq_mmu(env, addr + 8, oi1, ra);
+ oldv = int128_make128(o0, o1);
+
+ success = int128_eq(oldv, cmpv);
+ if (success) {
+ helper_le_stq_mmu(env, addr + 0, int128_getlo(newv), oi1, ra);
+ helper_le_stq_mmu(env, addr + 8, int128_gethi(newv), oi1, ra);
+ }
+#endif
+ }
+
+ return !success;
+}
+
+uint64_t HELPER(paired_cmpxchg64_be)(CPUARMState *env, uint64_t addr,
+ uint64_t new_lo, uint64_t new_hi)
+{
+ uintptr_t ra = GETPC();
+ Int128 oldv, cmpv, newv;
+ bool success;
+
+ cmpv = int128_make128(env->exclusive_val, env->exclusive_high);
+ newv = int128_make128(new_lo, new_hi);
+
+ if (parallel_cpus) {
+#ifndef CONFIG_ATOMIC128
+ cpu_loop_exit_atomic(ENV_GET_CPU(env), ra);
+#else
+ int mem_idx = cpu_mmu_index(env, false);
+ TCGMemOpIdx oi = make_memop_idx(MO_BEQ | MO_ALIGN_16, mem_idx);
+ oldv = helper_atomic_cmpxchgo_be_mmu(env, addr, cmpv, newv, oi, ra);
+ success = int128_eq(oldv, cmpv);
+#endif
+ } else {
+ uint64_t o0, o1;
+
+#ifdef CONFIG_USER_ONLY
+ /* ??? Enforce alignment. */
+ uint64_t *haddr = g2h(addr);
+ o1 = ldq_be_p(haddr + 0);
+ o0 = ldq_be_p(haddr + 1);
+ oldv = int128_make128(o0, o1);
+
+ success = int128_eq(oldv, cmpv);
+ if (success) {
+ stq_be_p(haddr + 0, int128_gethi(newv));
+ stq_be_p(haddr + 1, int128_getlo(newv));
+ }
+#else
+ int mem_idx = cpu_mmu_index(env, false);
+ TCGMemOpIdx oi0 = make_memop_idx(MO_BEQ | MO_ALIGN_16, mem_idx);
+ TCGMemOpIdx oi1 = make_memop_idx(MO_BEQ, mem_idx);
+
+ o1 = helper_be_ldq_mmu(env, addr + 0, oi0, ra);
+ o0 = helper_be_ldq_mmu(env, addr + 8, oi1, ra);
+ oldv = int128_make128(o0, o1);
+
+ success = int128_eq(oldv, cmpv);
+ if (success) {
+ helper_be_stq_mmu(env, addr + 0, int128_gethi(newv), oi1, ra);
+ helper_be_stq_mmu(env, addr + 8, int128_getlo(newv), oi1, ra);
+ }
+#endif
+ }
+
+ return !success;
+}
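A note on operand order that the big-endian helper above relies on:
int128_make128(lo, hi) always takes the low 64 bits first, so the BE
path deliberately loads the doubleword at addr + 0 (the most
significant half of a big-endian pair) into the high half of the
Int128, and stores int128_gethi(newv) back at addr + 0. Below is a
minimal stand-alone sketch of that convention, using unsigned __int128
as a simplified stand-in for QEMU's Int128 (which is a struct on hosts
without native 128-bit support); the example values are ours, not from
the patch.

#include <stdint.h>
#include <stdio.h>

typedef unsigned __int128 Int128;

/* Same argument order as QEMU's int128_make128(): low half first. */
static Int128 int128_make128(uint64_t lo, uint64_t hi)
{
    return ((Int128)hi << 64) | lo;
}

static uint64_t int128_getlo(Int128 a) { return (uint64_t)a; }
static uint64_t int128_gethi(Int128 a) { return (uint64_t)(a >> 64); }

int main(void)
{
    /* For a big-endian pair, the doubleword at addr + 0 is the most
     * significant half, so it becomes the 'hi' argument -- matching
     * o1 = load(addr + 0); oldv = int128_make128(o0, o1) above. */
    uint64_t at_addr_0 = 0x1111111111111111ull;
    uint64_t at_addr_8 = 0x2222222222222222ull;
    Int128 v = int128_make128(at_addr_8, at_addr_0);

    printf("hi=%016llx lo=%016llx\n",
           (unsigned long long)int128_gethi(v),
           (unsigned long long)int128_getlo(v));
    return 0;
}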