diff options
author | Emilio G. Cota <cota@braap.org> | 2016-06-27 15:02:13 -0400 |
---|---|---|
committer | Richard Henderson <rth@twiddle.net> | 2016-10-26 08:29:02 -0700 |
commit | 1dd089d0eec060dcd8478735114d98421d414805 (patch) | |
tree | 32f50b948a31b291aa092c3dd7c70a4e198d3bed /target-arm/helper-a64.c | |
parent | cf12bce088f22b92bf62ffa0d7f6a3e951e355a9 (diff) |
target-arm: emulate aarch64's LL/SC using cmpxchg helpers
Emulating LL/SC with cmpxchg is not correct, since it can
suffer from the ABA problem. Portable parallel code, however,
is written assuming only cmpxchg--and not LL/SC--is available.
This means that in practice emulating LL/SC with cmpxchg is
a viable alternative.
The appended emulates LL/SC pairs in aarch64 with cmpxchg helpers.
This works in both user and system mode. In usermode, it avoids
pausing all other CPUs to perform the LL/SC pair. The subsequent
performance and scalability improvement is significant, as the
plots below show. They plot the throughput of atomic_add-bench
compiled for ARM and executed on a 64-core x86 machine.
Hi-res plots: http://imgur.com/a/JVc8Y
atomic_add-bench: 1000000 ops/thread, [0,1] range
18 ++---------+----------+---------+----------+----------+----------+---++
+cmpxchg +-E--+ + + + + + |
16 ++master +-H--+ ++
|| |
14 ++ ++
| | |
12 ++| ++
| | |
10 ++++ ++
8 ++E ++
|+++ |
6 ++ | ++
| | |
4 ++ | ++
| | |
2 +H++E+--- ++
+ | +E++----+E+---+--+E+----++E+------+E+------+E++----+E+---+--+E|
0 ++H-H----H-+-----H----+---------+----------+----------+----------+---++
0 10 20 30 40 50 60
Number of threads
atomic_add-bench: 1000000 ops/thread, [0,2] range
18 ++---------+----------+---------+----------+----------+----------+---++
+cmpxchg +-E--+ + + + + + |
16 ++master +-H--+ ++
| | |
14 ++E ++
| | |
12 ++| ++
|+++ |
10 ++ | ++
8 ++ | ++
| | |
6 ++ | ++
| | |
4 ++ | ++
| +E+--- |
2 +H+ +E+-----+++ +++ +++ ---+E+-----+E+------+++
+++ + +E+---+--+E+----++E+------+E+--- ++++ +++ + +E|
0 ++H-H----H-+-----H----+---------+----------+----------+----------+---++
0 10 20 30 40 50 60
Number of threads
atomic_add-bench: 1000000 ops/thread, [0,128] range
70 ++---------+----------+---------+----------+----------+----------+---++
+cmpxchg +-E--+ + + + + + |
60 ++master +-H--+ +++ ---+E+-----+E+------+E+
| +E+------E-------+E+--- |
| --- +++ |
50 ++ +++--- ++
| -+E+ |
40 ++ +++---- ++
| E- |
| --| |
30 ++ -- +++ ++
| +E+ |
20 ++E+ ++
|E+ |
| |
10 ++ ++
+ + + + + + + |
0 +HH-H----H-+-----H----+---------+----------+----------+----------+---++
0 10 20 30 40 50 60
Number of threads
atomic_add-bench: 1000000 ops/thread, [0,1024] range
160 ++---------+---------+----------+---------+----------+----------+---++
+cmpxchg +-E--+ + + + + + |
140 ++master +-H--+ +++ +++
| -+E+-----+E+-------E|
120 ++ +++ ---- +++
| +++ ----E-- |
100 ++ --E--- +++ ++
| +++ ---- +++ |
80 ++ --E-- ++
| ---- +++ |
| -+E+ |
60 ++ ---- +++ ++
| +E+- |
40 ++ -- ++
| +E+ |
20 +EE+ ++
+++ + + + + + + |
0 +HH-H---H--+-----H---+----------+---------+----------+----------+---++
0 10 20 30 40 50 60
Number of threads
[rth: Rearrange 128-bit cmpxchg helper. Enforce alignment on LL.]
Signed-off-by: Emilio G. Cota <cota@braap.org>
Message-Id: <1467054136-10430-28-git-send-email-cota@braap.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
Diffstat (limited to 'target-arm/helper-a64.c')
-rw-r--r-- | target-arm/helper-a64.c | 113 |
1 files changed, 113 insertions, 0 deletions
diff --git a/target-arm/helper-a64.c b/target-arm/helper-a64.c index 41e48a41b4..98b97df461 100644 --- a/target-arm/helper-a64.c +++ b/target-arm/helper-a64.c @@ -27,6 +27,10 @@ #include "qemu/bitops.h" #include "internals.h" #include "qemu/crc32c.h" +#include "exec/exec-all.h" +#include "exec/cpu_ldst.h" +#include "qemu/int128.h" +#include "tcg.h" #include <zlib.h> /* For crc32 */ /* C2.4.7 Multiply and divide */ @@ -444,3 +448,112 @@ uint64_t HELPER(crc32c_64)(uint64_t acc, uint64_t val, uint32_t bytes) /* Linux crc32c converts the output to one's complement. */ return crc32c(acc, buf, bytes) ^ 0xffffffff; } + +/* Returns 0 on success; 1 otherwise. */ +uint64_t HELPER(paired_cmpxchg64_le)(CPUARMState *env, uint64_t addr, + uint64_t new_lo, uint64_t new_hi) +{ + uintptr_t ra = GETPC(); + Int128 oldv, cmpv, newv; + bool success; + + cmpv = int128_make128(env->exclusive_val, env->exclusive_high); + newv = int128_make128(new_lo, new_hi); + + if (parallel_cpus) { +#ifndef CONFIG_ATOMIC128 + cpu_loop_exit_atomic(ENV_GET_CPU(env), ra); +#else + int mem_idx = cpu_mmu_index(env, false); + TCGMemOpIdx oi = make_memop_idx(MO_LEQ | MO_ALIGN_16, mem_idx); + oldv = helper_atomic_cmpxchgo_le_mmu(env, addr, cmpv, newv, oi, ra); + success = int128_eq(oldv, cmpv); +#endif + } else { + uint64_t o0, o1; + +#ifdef CONFIG_USER_ONLY + /* ??? Enforce alignment. */ + uint64_t *haddr = g2h(addr); + o0 = ldq_le_p(haddr + 0); + o1 = ldq_le_p(haddr + 1); + oldv = int128_make128(o0, o1); + + success = int128_eq(oldv, cmpv); + if (success) { + stq_le_p(haddr + 0, int128_getlo(newv)); + stq_le_p(haddr + 1, int128_gethi(newv)); + } +#else + int mem_idx = cpu_mmu_index(env, false); + TCGMemOpIdx oi0 = make_memop_idx(MO_LEQ | MO_ALIGN_16, mem_idx); + TCGMemOpIdx oi1 = make_memop_idx(MO_LEQ, mem_idx); + + o0 = helper_le_ldq_mmu(env, addr + 0, oi0, ra); + o1 = helper_le_ldq_mmu(env, addr + 8, oi1, ra); + oldv = int128_make128(o0, o1); + + success = int128_eq(oldv, cmpv); + if (success) { + helper_le_stq_mmu(env, addr + 0, int128_getlo(newv), oi1, ra); + helper_le_stq_mmu(env, addr + 8, int128_gethi(newv), oi1, ra); + } +#endif + } + + return !success; +} + +uint64_t HELPER(paired_cmpxchg64_be)(CPUARMState *env, uint64_t addr, + uint64_t new_lo, uint64_t new_hi) +{ + uintptr_t ra = GETPC(); + Int128 oldv, cmpv, newv; + bool success; + + cmpv = int128_make128(env->exclusive_val, env->exclusive_high); + newv = int128_make128(new_lo, new_hi); + + if (parallel_cpus) { +#ifndef CONFIG_ATOMIC128 + cpu_loop_exit_atomic(ENV_GET_CPU(env), ra); +#else + int mem_idx = cpu_mmu_index(env, false); + TCGMemOpIdx oi = make_memop_idx(MO_BEQ | MO_ALIGN_16, mem_idx); + oldv = helper_atomic_cmpxchgo_be_mmu(env, addr, cmpv, newv, oi, ra); + success = int128_eq(oldv, cmpv); +#endif + } else { + uint64_t o0, o1; + +#ifdef CONFIG_USER_ONLY + /* ??? Enforce alignment. */ + uint64_t *haddr = g2h(addr); + o1 = ldq_be_p(haddr + 0); + o0 = ldq_be_p(haddr + 1); + oldv = int128_make128(o0, o1); + + success = int128_eq(oldv, cmpv); + if (success) { + stq_be_p(haddr + 0, int128_gethi(newv)); + stq_be_p(haddr + 1, int128_getlo(newv)); + } +#else + int mem_idx = cpu_mmu_index(env, false); + TCGMemOpIdx oi0 = make_memop_idx(MO_BEQ | MO_ALIGN_16, mem_idx); + TCGMemOpIdx oi1 = make_memop_idx(MO_BEQ, mem_idx); + + o1 = helper_be_ldq_mmu(env, addr + 0, oi0, ra); + o0 = helper_be_ldq_mmu(env, addr + 8, oi1, ra); + oldv = int128_make128(o0, o1); + + success = int128_eq(oldv, cmpv); + if (success) { + helper_be_stq_mmu(env, addr + 0, int128_gethi(newv), oi1, ra); + helper_be_stq_mmu(env, addr + 8, int128_getlo(newv), oi1, ra); + } +#endif + } + + return !success; +} |