author      Emilio G. Cota <cota@braap.org>                    2017-08-04 23:46:31 -0400
committer   Richard Henderson <richard.henderson@linaro.org>   2018-06-15 08:18:48 -1000
commit      0ac20318ce16f4de288969b2007ef5a654176058
tree        ba285e1540b4d875b476acd72f364d8aaa7165d5 /accel/tcg/cpu-exec.c
parent      705ad1ff0ce264475cb4c9a3aa31ba94a04869fe
tcg: remove tb_lock
Use mmap_lock in user-mode to protect TCG state and the page descriptors.
In !user-mode, each vCPU has its own TCG state, so no locks needed.
Per-page locks are used to protect the page descriptors.
Per-TB locks are used in both modes to protect TB jumps.
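As a rough illustration of the resulting discipline (a minimal sketch, not code added by this patch; the helper name gen_tb_locked is invented), translation ends up bracketed only by mmap_lock, exactly as the tb_find and cpu_exec_step_atomic hunks in the diff below do:

/* Hypothetical helper, for illustration only.  After tb_lock removal,
 * code generation is guarded by mmap_lock alone: in user-mode it
 * protects TCG state and the page descriptors, while in !user-mode it
 * is a stub, since each vCPU has its own TCG state.
 */
static TranslationBlock *gen_tb_locked(CPUState *cpu, target_ulong pc,
                                       target_ulong cs_base,
                                       uint32_t flags, uint32_t cflags)
{
    TranslationBlock *tb;

    mmap_lock();
    tb = tb_gen_code(cpu, pc, cs_base, flags, cflags);
    mmap_unlock();
    return tb;
}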
Some notes:
- tb_lock is removed from notdirty_mem_write by passing a
locked page_collection to tb_invalidate_phys_page_fast.
- tcg_tb_lookup/remove/insert/etc have their own internal lock(s),
so there is no need to further serialize access to them.
- do_tb_flush is run in a safe async context, meaning no other
vCPU threads are running. Therefore acquiring mmap_lock there
is just to please tools such as thread sanitizer.
- Not visible in the diff, but tb_invalidate_phys_page already
has an assert_memory_lock.
- cpu_io_recompile is !user-only, so no mmap_lock there.
- Added mmap_unlock()'s before all siglongjmp's that could
be called in user-mode while mmap_lock is held.
  + Added an assert for !have_mmap_lock() after returning from
    the longjmp in cpu_exec, just like we do in cpu_exec_step_atomic
    (both halves of this pattern are sketched right after these notes).
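To make the last note concrete, here is an abridged, user-mode-only sketch; the unwind helper below is hypothetical and only the assertion itself appears in the diff:

/* Hypothetical unwind path, shown only to illustrate the note above:
 * drop mmap_lock before the siglongjmp so the lock cannot leak across
 * the non-local exit back into the execution loop.
 */
static void QEMU_NORETURN cpu_unwind_to_exec_loop(CPUState *cpu)
{
    mmap_unlock();
    siglongjmp(cpu->jmp_env, 1);
}

The matching tcg_debug_assert(!have_mmap_lock()) at the sigsetjmp landing pads is visible in the cpu_exec and cpu_exec_step_atomic hunks of the diff below.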
Performance numbers before/after:
Host: AMD Opteron(tm) Processor 6376
ubuntu 17.04 ppc64 bootup+shutdown time
[ASCII plot: "before" vs "tb lock removal", x axis: Guest CPUs (1-64)]
png: https://imgur.com/HwmBHXe
debian jessie aarch64 bootup+shutdown time
[ASCII plot: "before" vs "tb lock removal", x axis: Guest CPUs (1-64)]
png: https://imgur.com/iGpGFtv
The gains are high for 4-8 CPUs. Beyond that point, however, unrelated
lock contention significantly hurts scalability.
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Diffstat (limited to 'accel/tcg/cpu-exec.c')
-rw-r--r--   accel/tcg/cpu-exec.c | 34
1 file changed, 8 insertions(+), 26 deletions(-)
diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
index c482008bc7..c738b7f7d6 100644
--- a/accel/tcg/cpu-exec.c
+++ b/accel/tcg/cpu-exec.c
@@ -212,20 +212,20 @@ static void cpu_exec_nocache(CPUState *cpu, int max_cycles,
        We only end up here when an existing TB is too long.  */
     cflags |= MIN(max_cycles, CF_COUNT_MASK);
 
-    tb_lock();
+    mmap_lock();
     tb = tb_gen_code(cpu, orig_tb->pc, orig_tb->cs_base,
                      orig_tb->flags, cflags);
     tb->orig_tb = orig_tb;
-    tb_unlock();
+    mmap_unlock();
 
     /* execute the generated code */
     trace_exec_tb_nocache(tb, tb->pc);
     cpu_tb_exec(cpu, tb);
 
-    tb_lock();
+    mmap_lock();
     tb_phys_invalidate(tb, -1);
+    mmap_unlock();
     tcg_tb_remove(tb);
-    tb_unlock();
 }
 #endif
 
@@ -244,9 +244,7 @@ void cpu_exec_step_atomic(CPUState *cpu)
         tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags, cf_mask);
         if (tb == NULL) {
             mmap_lock();
-            tb_lock();
             tb = tb_gen_code(cpu, pc, cs_base, flags, cflags);
-            tb_unlock();
             mmap_unlock();
         }
 
@@ -261,15 +259,13 @@ void cpu_exec_step_atomic(CPUState *cpu)
         cpu_tb_exec(cpu, tb);
         cc->cpu_exec_exit(cpu);
     } else {
-        /* We may have exited due to another problem here, so we need
-         * to reset any tb_locks we may have taken but didn't release.
+        /*
          * The mmap_lock is dropped by tb_gen_code if it runs out of
          * memory.
          */
 #ifndef CONFIG_SOFTMMU
         tcg_debug_assert(!have_mmap_lock());
 #endif
-        tb_lock_reset();
         assert_no_pages_locked();
     }
 
@@ -398,20 +394,11 @@ static inline TranslationBlock *tb_find(CPUState *cpu,
     TranslationBlock *tb;
     target_ulong cs_base, pc;
     uint32_t flags;
-    bool acquired_tb_lock = false;
 
     tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags, cf_mask);
     if (tb == NULL) {
-        /* mmap_lock is needed by tb_gen_code, and mmap_lock must be
-         * taken outside tb_lock. As system emulation is currently
-         * single threaded the locks are NOPs.
-         */
         mmap_lock();
-        tb_lock();
-        acquired_tb_lock = true;
-
         tb = tb_gen_code(cpu, pc, cs_base, flags, cf_mask);
-
         mmap_unlock();
         /* We add the TB in the virtual pc hash table for the fast lookup */
         atomic_set(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)], tb);
@@ -427,15 +414,8 @@ static inline TranslationBlock *tb_find(CPUState *cpu,
 #endif
     /* See if we can patch the calling TB. */
     if (last_tb && !qemu_loglevel_mask(CPU_LOG_TB_NOCHAIN)) {
-        if (!acquired_tb_lock) {
-            tb_lock();
-            acquired_tb_lock = true;
-        }
         tb_add_jump(last_tb, tb_exit, tb);
     }
-    if (acquired_tb_lock) {
-        tb_unlock();
-    }
     return tb;
 }
 
@@ -710,7 +690,9 @@ int cpu_exec(CPUState *cpu)
         g_assert(cpu == current_cpu);
         g_assert(cc == CPU_GET_CLASS(cpu));
 #endif /* buggy compiler */
-        tb_lock_reset();
+#ifndef CONFIG_SOFTMMU
+        tcg_debug_assert(!have_mmap_lock());
+#endif
         if (qemu_mutex_iothread_locked()) {
             qemu_mutex_unlock_iothread();
         }