diff options
author | Emilio G. Cota <cota@braap.org> | 2016-06-27 15:02:08 -0400 |
---|---|---|
committer | Richard Henderson <rth@twiddle.net> | 2016-10-26 08:29:02 -0700 |
commit | 354161b37c6465a32073eac5f16fa35939af2bb4 (patch) | |
tree | b55943826699fbdc712b4b0d3a0b8086dcf17a3c | |
parent | 7f5616f53896a4e08ad37de3ac50d3a4cc8eff7a (diff) |
target-arm: emulate LL/SC using cmpxchg helpers
Emulating LL/SC with cmpxchg is not correct, since it can
suffer from the ABA problem. Portable parallel code, however,
is written assuming only cmpxchg--and not LL/SC--is available.
This means that in practice emulating LL/SC with cmpxchg is
a viable alternative.
The appended emulates LL/SC pairs in ARM with cmpxchg helpers.
This works in both user and system mode. In usermode, it avoids
pausing all other CPUs to perform the LL/SC pair. The subsequent
performance and scalability improvement is significant, as the
plots below show. They plot the throughput of atomic_add-bench
compiled for ARM and executed on a 64-core x86 machine.
Hi-res plots: http://imgur.com/a/aNQpB
atomic_add-bench: 1000000 ops/thread, [0,1] range
9 ++---------+----------+----------+----------+----------+----------+---++
+cmpxchg +-E--+ + + + + + |
8 +Emaster +-H--+ ++
| | |
7 ++E ++
| | |
6 ++++ ++
| | |
5 ++ | ++
4 ++ | ++
| | |
3 ++ | ++
| | |
2 ++ | ++
|H++E+--- +++ ---+E+------+E+------+E|
1 +++ +E+-----+E+------+E+------+E+------+E+-- +++ +++ ++
++H+ + +++ + +++ ++++ + + + |
0 ++--H----H-+-----H----+----------+----------+----------+----------+---++
0 10 20 30 40 50 60
Number of threads
atomic_add-bench: 1000000 ops/thread, [0,2] range
16 ++---------+----------+---------+----------+----------+----------+---++
+cmpxchg +-E--+ + + + + + |
14 ++master +-H--+ ++
| | |
12 ++| ++
| E |
10 ++| ++
| | |
8 ++++ ++
|E+| |
| | |
6 ++ | ++
| | |
4 ++ | ++
| +E+--- +++ +++ +++ ---+E+------+E|
2 +H+ +E+------E-------+E+-----+E+------+E+------+E+-- +++
+ | + +++ + ++++ + + + |
0 ++H-H----H-+-----H----+---------+----------+----------+----------+---++
0 10 20 30 40 50 60
Number of threads
atomic_add-bench: 1000000 ops/thread, [0,128] range
70 ++---------+----------+---------+----------+----------+----------+---++
+cmpxchg +-E--+ + + + ++++ + |
60 ++master +-H--+ ----E------+E+-------++
| -+E+--- +++ +++ +E|
| +++ ---- +++ ++|
50 ++ +++ ---+E+- ++
| -E--- |
40 ++ ---+++ ++
| +++--- |
| -+E+ |
30 ++ +++---- ++
| +E+ |
20 ++ +++-- ++
| +E+ |
|+E+ |
10 +E+ ++
+ + + + + + + |
0 +HH-H----H-+-----H----+---------+----------+----------+----------+---++
0 10 20 30 40 50 60
Number of threads
atomic_add-bench: 1000000 ops/thread, [0,1024] range
120 ++---------+---------+----------+---------+----------+----------+---++
+cmpxchg +-E--+ + + + + + |
| master +-H--+ ++|
100 ++ ----E+
| +++ ---+E+--- ++|
| --E--- +++ |
80 ++ ---- +++ ++
| ---+E+- |
60 ++ -+E+-- ++
| +++ ---- +++ |
| -+E+- |
40 ++ +++---- ++
| +++ ---+E+ |
| -+E+--- |
20 ++ +E+ ++
|+E+++ |
+E+ + + + + + + |
0 +HH-H---H--+-----H---+----------+---------+----------+----------+---++
0 10 20 30 40 50 60
Number of threads
[rth: Enforce alignment for ldrexd.]
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Emilio G. Cota <cota@braap.org>
Message-Id: <1467054136-10430-23-git-send-email-cota@braap.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
-rw-r--r-- | target-arm/translate.c | 140 |
1 files changed, 45 insertions, 95 deletions
diff --git a/target-arm/translate.c b/target-arm/translate.c index ee2bf57f90..cc97d1ed46 100644 --- a/target-arm/translate.c +++ b/target-arm/translate.c @@ -1026,9 +1026,6 @@ DO_GEN_LD(8u, MO_UB) DO_GEN_LD(16s, MO_SW) DO_GEN_LD(16u, MO_UW) DO_GEN_LD(32u, MO_UL) -/* 'a' variants include an alignment check */ -DO_GEN_LD(16ua, MO_UW | MO_ALIGN) -DO_GEN_LD(32ua, MO_UL | MO_ALIGN) DO_GEN_ST(8, MO_UB) DO_GEN_ST(16, MO_UW) DO_GEN_ST(32, MO_UL) @@ -7720,45 +7717,30 @@ static void gen_logicq_cc(TCGv_i32 lo, TCGv_i32 hi) /* Load/Store exclusive instructions are implemented by remembering the value/address loaded, and seeing if these are the same - when the store is performed. This should be sufficient to implement + when the store is performed. This should be sufficient to implement the architecturally mandated semantics, and avoids having to monitor - regular stores. - - In system emulation mode only one CPU will be running at once, so - this sequence is effectively atomic. In user emulation mode we - throw an exception and handle the atomic operation elsewhere. */ + regular stores. The compare vs the remembered value is done during + the cmpxchg operation, but we must compare the addresses manually. */ static void gen_load_exclusive(DisasContext *s, int rt, int rt2, TCGv_i32 addr, int size) { TCGv_i32 tmp = tcg_temp_new_i32(); + TCGMemOp opc = size | MO_ALIGN | s->be_data; s->is_ldex = true; - switch (size) { - case 0: - gen_aa32_ld8u(s, tmp, addr, get_mem_index(s)); - break; - case 1: - gen_aa32_ld16ua(s, tmp, addr, get_mem_index(s)); - break; - case 2: - case 3: - gen_aa32_ld32ua(s, tmp, addr, get_mem_index(s)); - break; - default: - abort(); - } - if (size == 3) { TCGv_i32 tmp2 = tcg_temp_new_i32(); - TCGv_i32 tmp3 = tcg_temp_new_i32(); + TCGv_i64 t64 = tcg_temp_new_i64(); - tcg_gen_addi_i32(tmp2, addr, 4); - gen_aa32_ld32u(s, tmp3, tmp2, get_mem_index(s)); - tcg_temp_free_i32(tmp2); - tcg_gen_concat_i32_i64(cpu_exclusive_val, tmp, tmp3); - store_reg(s, rt2, tmp3); + gen_aa32_ld_i64(s, t64, addr, get_mem_index(s), opc); + tcg_gen_mov_i64(cpu_exclusive_val, t64); + tcg_gen_extr_i64_i32(tmp, tmp2, t64); + tcg_temp_free_i64(t64); + + store_reg(s, rt2, tmp2); } else { + gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s), opc); tcg_gen_extu_i32_i64(cpu_exclusive_val, tmp); } @@ -7771,23 +7753,15 @@ static void gen_clrex(DisasContext *s) tcg_gen_movi_i64(cpu_exclusive_addr, -1); } -#ifdef CONFIG_USER_ONLY static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2, TCGv_i32 addr, int size) { - tcg_gen_extu_i32_i64(cpu_exclusive_test, addr); - tcg_gen_movi_i32(cpu_exclusive_info, - size | (rd << 4) | (rt << 8) | (rt2 << 12)); - gen_exception_internal_insn(s, 4, EXCP_STREX); -} -#else -static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2, - TCGv_i32 addr, int size) -{ - TCGv_i32 tmp; - TCGv_i64 val64, extaddr; + TCGv_i32 t0, t1, t2; + TCGv_i64 extaddr; + TCGv taddr; TCGLabel *done_label; TCGLabel *fail_label; + TCGMemOp opc = size | MO_ALIGN | s->be_data; /* if (env->exclusive_addr == addr && env->exclusive_val == [addr]) { [addr] = {Rt}; @@ -7802,69 +7776,45 @@ static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2, tcg_gen_brcond_i64(TCG_COND_NE, extaddr, cpu_exclusive_addr, fail_label); tcg_temp_free_i64(extaddr); - tmp = tcg_temp_new_i32(); - switch (size) { - case 0: - gen_aa32_ld8u(s, tmp, addr, get_mem_index(s)); - break; - case 1: - gen_aa32_ld16u(s, tmp, addr, get_mem_index(s)); - break; - case 2: - case 3: - gen_aa32_ld32u(s, tmp, addr, get_mem_index(s)); - break; - default: - abort(); - } - - val64 = tcg_temp_new_i64(); + taddr = gen_aa32_addr(s, addr, opc); + t0 = tcg_temp_new_i32(); + t1 = load_reg(s, rt); if (size == 3) { - TCGv_i32 tmp2 = tcg_temp_new_i32(); - TCGv_i32 tmp3 = tcg_temp_new_i32(); - tcg_gen_addi_i32(tmp2, addr, 4); - gen_aa32_ld32u(s, tmp3, tmp2, get_mem_index(s)); - tcg_temp_free_i32(tmp2); - tcg_gen_concat_i32_i64(val64, tmp, tmp3); - tcg_temp_free_i32(tmp3); - } else { - tcg_gen_extu_i32_i64(val64, tmp); - } - tcg_temp_free_i32(tmp); + TCGv_i64 o64 = tcg_temp_new_i64(); + TCGv_i64 n64 = tcg_temp_new_i64(); - tcg_gen_brcond_i64(TCG_COND_NE, val64, cpu_exclusive_val, fail_label); - tcg_temp_free_i64(val64); + t2 = load_reg(s, rt2); + tcg_gen_concat_i32_i64(n64, t1, t2); + tcg_temp_free_i32(t2); + gen_aa32_frob64(s, n64); - tmp = load_reg(s, rt); - switch (size) { - case 0: - gen_aa32_st8(s, tmp, addr, get_mem_index(s)); - break; - case 1: - gen_aa32_st16(s, tmp, addr, get_mem_index(s)); - break; - case 2: - case 3: - gen_aa32_st32(s, tmp, addr, get_mem_index(s)); - break; - default: - abort(); - } - tcg_temp_free_i32(tmp); - if (size == 3) { - tcg_gen_addi_i32(addr, addr, 4); - tmp = load_reg(s, rt2); - gen_aa32_st32(s, tmp, addr, get_mem_index(s)); - tcg_temp_free_i32(tmp); + tcg_gen_atomic_cmpxchg_i64(o64, taddr, cpu_exclusive_val, n64, + get_mem_index(s), opc); + tcg_temp_free_i64(n64); + + gen_aa32_frob64(s, o64); + tcg_gen_setcond_i64(TCG_COND_NE, o64, o64, cpu_exclusive_val); + tcg_gen_extrl_i64_i32(t0, o64); + + tcg_temp_free_i64(o64); + } else { + t2 = tcg_temp_new_i32(); + tcg_gen_extrl_i64_i32(t2, cpu_exclusive_val); + tcg_gen_atomic_cmpxchg_i32(t0, taddr, t2, t1, get_mem_index(s), opc); + tcg_gen_setcond_i32(TCG_COND_NE, t0, t0, t2); + tcg_temp_free_i32(t2); } - tcg_gen_movi_i32(cpu_R[rd], 0); + tcg_temp_free_i32(t1); + tcg_temp_free(taddr); + tcg_gen_mov_i32(cpu_R[rd], t0); + tcg_temp_free_i32(t0); tcg_gen_br(done_label); + gen_set_label(fail_label); tcg_gen_movi_i32(cpu_R[rd], 1); gen_set_label(done_label); tcg_gen_movi_i64(cpu_exclusive_addr, -1); } -#endif /* gen_srs: * @env: CPUARMState |