From 0ac20318ce16f4de288969b2007ef5a654176058 Mon Sep 17 00:00:00 2001
From: "Emilio G. Cota"
Date: Fri, 4 Aug 2017 23:46:31 -0400
Subject: tcg: remove tb_lock
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Use mmap_lock in user-mode to protect TCG state and the page descriptors.
In !user-mode, each vCPU has its own TCG state, so no locks needed.
Per-page locks are used to protect the page descriptors.

Per-TB locks are used in both modes to protect TB jumps.

Some notes:

- tb_lock is removed from notdirty_mem_write by passing a
  locked page_collection to tb_invalidate_phys_page_fast.

- tcg_tb_lookup/remove/insert/etc have their own internal lock(s),
  so there is no need to further serialize access to them.

- do_tb_flush is run in a safe async context, meaning no other
  vCPU threads are running. Therefore acquiring mmap_lock there
  is just to please tools such as thread sanitizer.

- Not visible in the diff, but tb_invalidate_phys_page already
  has an assert_memory_lock.

- cpu_io_recompile is !user-only, so no mmap_lock there.

- Added mmap_unlock()'s before all siglongjmp's that could
  be called in user-mode while mmap_lock is held.
  + Added an assert for !have_mmap_lock() after returning from
    the longjmp in cpu_exec, just like we do in cpu_exec_step_atomic.

Performance numbers before/after:

Host: AMD Opteron(tm) Processor 6376

[ASCII plot: "ubuntu 17.04 ppc64 bootup+shutdown time" (y: 100-700) vs.
 Guest CPUs (x: 1, 8, 16, 48, 64); series "before" and "tb lock removal"]
png: https://imgur.com/HwmBHXe

[ASCII plot: "debian jessie aarch64 bootup+shutdown time" (y: 10-90) vs.
 Guest CPUs (x: 1, 8, 16, 48, 64); series "before" and "tb lock removal"]
png: https://imgur.com/iGpGFtv

The gains are high for 4-8 CPUs. Beyond that point, however, unrelated
lock contention significantly hurts scalability.

Reviewed-by: Richard Henderson
Reviewed-by: Alex Bennée
Signed-off-by: Emilio G. Cota
Signed-off-by: Richard Henderson
---
 accel/tcg/cpu-exec.c      |  34 +++---------
 accel/tcg/translate-all.c | 132 ++++++++++++++--------------------------------
 accel/tcg/translate-all.h |   3 +-
 3 files changed, 50 insertions(+), 119 deletions(-)

(limited to 'accel/tcg')

diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
index c482008bc7..c738b7f7d6 100644
--- a/accel/tcg/cpu-exec.c
+++ b/accel/tcg/cpu-exec.c
@@ -212,20 +212,20 @@ static void cpu_exec_nocache(CPUState *cpu, int max_cycles,
        We only end up here when an existing TB is too long.  */
     cflags |= MIN(max_cycles, CF_COUNT_MASK);
 
-    tb_lock();
+    mmap_lock();
     tb = tb_gen_code(cpu, orig_tb->pc, orig_tb->cs_base,
                      orig_tb->flags, cflags);
     tb->orig_tb = orig_tb;
-    tb_unlock();
+    mmap_unlock();
 
     /* execute the generated code */
     trace_exec_tb_nocache(tb, tb->pc);
     cpu_tb_exec(cpu, tb);
 
-    tb_lock();
+    mmap_lock();
     tb_phys_invalidate(tb, -1);
+    mmap_unlock();
     tcg_tb_remove(tb);
-    tb_unlock();
 }
 #endif
@@ -244,9 +244,7 @@ void cpu_exec_step_atomic(CPUState *cpu)
     tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags, cf_mask);
     if (tb == NULL) {
         mmap_lock();
-        tb_lock();
         tb = tb_gen_code(cpu, pc, cs_base, flags, cflags);
-        tb_unlock();
         mmap_unlock();
     }
@@ -261,15 +259,13 @@ void cpu_exec_step_atomic(CPUState *cpu)
         cpu_tb_exec(cpu, tb);
         cc->cpu_exec_exit(cpu);
     } else {
-        /* We may have exited due to another problem here, so we need
-         * to reset any tb_locks we may have taken but didn't release.
+        /*
          * The mmap_lock is dropped by tb_gen_code if it runs out of
         * memory.
          */
 #ifndef CONFIG_SOFTMMU
         tcg_debug_assert(!have_mmap_lock());
 #endif
-        tb_lock_reset();
         assert_no_pages_locked();
     }
@@ -398,20 +394,11 @@ static inline TranslationBlock *tb_find(CPUState *cpu,
     TranslationBlock *tb;
     target_ulong cs_base, pc;
     uint32_t flags;
-    bool acquired_tb_lock = false;
 
     tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags, cf_mask);
     if (tb == NULL) {
-        /* mmap_lock is needed by tb_gen_code, and mmap_lock must be
-         * taken outside tb_lock. As system emulation is currently
-         * single threaded the locks are NOPs.
-         */
         mmap_lock();
-        tb_lock();
-        acquired_tb_lock = true;
-
         tb = tb_gen_code(cpu, pc, cs_base, flags, cf_mask);
-
         mmap_unlock();
         /* We add the TB in the virtual pc hash table for the fast lookup */
         atomic_set(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)], tb);
@@ -427,15 +414,8 @@ static inline TranslationBlock *tb_find(CPUState *cpu,
 #endif
     /* See if we can patch the calling TB. */
     if (last_tb && !qemu_loglevel_mask(CPU_LOG_TB_NOCHAIN)) {
-        if (!acquired_tb_lock) {
-            tb_lock();
-            acquired_tb_lock = true;
-        }
         tb_add_jump(last_tb, tb_exit, tb);
     }
-    if (acquired_tb_lock) {
-        tb_unlock();
-    }
     return tb;
 }
@@ -710,7 +690,9 @@ int cpu_exec(CPUState *cpu)
         g_assert(cpu == current_cpu);
         g_assert(cc == CPU_GET_CLASS(cpu));
 #endif /* buggy compiler */
-        tb_lock_reset();
+#ifndef CONFIG_SOFTMMU
+        tcg_debug_assert(!have_mmap_lock());
+#endif
         if (qemu_mutex_iothread_locked()) {
             qemu_mutex_unlock_iothread();
         }
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index b33cd9bc98..f0c3fd4d03 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -88,13 +88,13 @@
 #endif
 
 /* Access to the various translations structures need to be serialised via locks
- * for consistency. This is automatic for SoftMMU based system
- * emulation due to its single threaded nature. In user-mode emulation
- * access to the memory related structures are protected with the
- * mmap_lock.
+ * for consistency.
+ * In user-mode emulation access to the memory related structures are protected
+ * with mmap_lock.
+ * In !user-mode we use per-page locks.
  */
 #ifdef CONFIG_SOFTMMU
-#define assert_memory_lock() tcg_debug_assert(have_tb_lock)
+#define assert_memory_lock()
 #else
 #define assert_memory_lock() tcg_debug_assert(have_mmap_lock())
 #endif
@@ -216,9 +216,6 @@ __thread TCGContext *tcg_ctx;
 TBContext tb_ctx;
 bool parallel_cpus;
 
-/* translation block context */
-static __thread int have_tb_lock;
-
 static void page_table_config_init(void)
 {
     uint32_t v_l1_bits;
@@ -239,31 +236,6 @@ static void page_table_config_init(void)
     assert(v_l2_levels >= 0);
 }
 
-#define assert_tb_locked() tcg_debug_assert(have_tb_lock)
-#define assert_tb_unlocked() tcg_debug_assert(!have_tb_lock)
-
-void tb_lock(void)
-{
-    assert_tb_unlocked();
-    qemu_mutex_lock(&tb_ctx.tb_lock);
-    have_tb_lock++;
-}
-
-void tb_unlock(void)
-{
-    assert_tb_locked();
-    have_tb_lock--;
-    qemu_mutex_unlock(&tb_ctx.tb_lock);
-}
-
-void tb_lock_reset(void)
-{
-    if (have_tb_lock) {
-        qemu_mutex_unlock(&tb_ctx.tb_lock);
-        have_tb_lock = 0;
-    }
-}
-
 void cpu_gen_init(void)
 {
     tcg_context_init(&tcg_init_ctx);
@@ -420,8 +392,7 @@ bool cpu_restore_state(CPUState *cpu, uintptr_t host_pc, bool will_exit)
      *  - fault during translation (instruction fetch)
      *  - fault from helper (not using GETPC() macro)
      *
-     * Either way we need return early to avoid blowing up on a
-     * recursive tb_lock() as we can't resolve it here.
+     * Either way we need return early as we can't resolve it here.
      *
      * We are using unsigned arithmetic so if host_pc <
      * tcg_init_ctx.code_gen_buffer check_offset will wrap to way
@@ -430,7 +401,6 @@ bool cpu_restore_state(CPUState *cpu, uintptr_t host_pc, bool will_exit)
     check_offset = host_pc - (uintptr_t) tcg_init_ctx.code_gen_buffer;
 
     if (check_offset < tcg_init_ctx.code_gen_buffer_size) {
-        tb_lock();
         tb = tcg_tb_lookup(host_pc);
         if (tb) {
             cpu_restore_state_from_tb(cpu, tb, host_pc, will_exit);
@@ -441,7 +411,6 @@ bool cpu_restore_state(CPUState *cpu, uintptr_t host_pc, bool will_exit)
             }
             r = true;
         }
-        tb_unlock();
     }
 
     return r;
@@ -1130,7 +1099,6 @@ static inline void code_gen_alloc(size_t tb_size)
         fprintf(stderr, "Could not allocate dynamic translator buffer\n");
         exit(1);
     }
-    qemu_mutex_init(&tb_ctx.tb_lock);
 }
 
 static bool tb_cmp(const void *ap, const void *bp)
@@ -1174,14 +1142,12 @@ void tcg_exec_init(unsigned long tb_size)
 /*
  * Allocate a new translation block. Flush the translation buffer if
  * too many translation blocks or too much generated code.
- *
- * Called with tb_lock held.
  */
 static TranslationBlock *tb_alloc(target_ulong pc)
 {
     TranslationBlock *tb;
 
-    assert_tb_locked();
+    assert_memory_lock();
 
     tb = tcg_tb_alloc(tcg_ctx);
     if (unlikely(tb == NULL)) {
@@ -1248,8 +1214,7 @@ static gboolean tb_host_size_iter(gpointer key, gpointer value, gpointer data)
 /* flush all the translation blocks */
 static void do_tb_flush(CPUState *cpu, run_on_cpu_data tb_flush_count)
 {
-    tb_lock();
-
+    mmap_lock();
     /* If it is already been done on request of another CPU,
      * just retry.
      */
@@ -1279,7 +1244,7 @@ static void do_tb_flush(CPUState *cpu, run_on_cpu_data tb_flush_count)
     atomic_mb_set(&tb_ctx.tb_flush_count, tb_ctx.tb_flush_count + 1);
 
 done:
-    tb_unlock();
+    mmap_unlock();
 }
 
 void tb_flush(CPUState *cpu)
@@ -1313,7 +1278,7 @@ do_tb_invalidate_check(struct qht *ht, void *p, uint32_t hash, void *userp)
 
 /* verify that all the pages have correct rights for code
  *
- * Called with tb_lock held.
+ * Called with mmap_lock held.
  */
 static void tb_invalidate_check(target_ulong address)
 {
@@ -1343,7 +1308,10 @@ static void tb_page_check(void)
 
 #endif /* CONFIG_USER_ONLY */
 
-/* call with @pd->lock held */
+/*
+ * user-mode: call with mmap_lock held
+ * !user-mode: call with @pd->lock held
+ */
 static inline void tb_page_remove(PageDesc *pd, TranslationBlock *tb)
 {
     TranslationBlock *tb1;
@@ -1437,7 +1405,11 @@ static inline void tb_jmp_unlink(TranslationBlock *dest)
     qemu_spin_unlock(&dest->jmp_lock);
 }
 
-/* If @rm_from_page_list is set, call with the TB's pages' locks held */
+/*
+ * In user-mode, call with mmap_lock held.
+ * In !user-mode, if @rm_from_page_list is set, call with the TB's pages'
+ * locks held.
+ */
 static void do_tb_phys_invalidate(TranslationBlock *tb, bool rm_from_page_list)
 {
     CPUState *cpu;
@@ -1445,7 +1417,7 @@ static void do_tb_phys_invalidate(TranslationBlock *tb, bool rm_from_page_list)
     uint32_t h;
     tb_page_addr_t phys_pc;
 
-    assert_tb_locked();
+    assert_memory_lock();
 
     /* make sure no further incoming jumps will be chained to this TB */
     qemu_spin_lock(&tb->jmp_lock);
@@ -1498,7 +1470,7 @@ static void tb_phys_invalidate__locked(TranslationBlock *tb)
 
 /* invalidate one TB
  *
- * Called with tb_lock held.
+ * Called with mmap_lock held in user-mode.
  */
 void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr)
 {
@@ -1543,7 +1515,7 @@ static void build_page_bitmap(PageDesc *p)
 /* add the tb in the target page and protect it if necessary
 *
 * Called with mmap_lock held for user-mode emulation.
- * Called with @p->lock held.
+ * Called with @p->lock held in !user-mode.
  */
 static inline void tb_page_add(PageDesc *p, TranslationBlock *tb,
                                unsigned int n, tb_page_addr_t page_addr)
@@ -1815,10 +1787,9 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
     if ((pc & TARGET_PAGE_MASK) != virt_page2) {
         phys_page2 = get_page_addr_code(env, virt_page2);
     }
-    /* As long as consistency of the TB stuff is provided by tb_lock in user
-     * mode and is implicit in single-threaded softmmu emulation, no explicit
-     * memory barrier is required before tb_link_page() makes the TB visible
-     * through the physical hash table and physical page list.
+    /*
+     * No explicit memory barrier is required -- tb_link_page() makes the
+     * TB visible in a consistent state.
      */
     existing_tb = tb_link_page(tb, phys_pc, phys_page2);
     /* if the TB already exists, discard what we just translated */
@@ -1834,8 +1805,9 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
 }
 
 /*
- * Call with all @pages locked.
  * @p must be non-NULL.
+ * user-mode: call with mmap_lock held.
+ * !user-mode: call with all @pages locked.
  */
 static void
 tb_invalidate_phys_page_range__locked(struct page_collection *pages,
@@ -1920,6 +1892,7 @@ tb_invalidate_phys_page_range__locked(struct page_collection *pages,
         page_collection_unlock(pages);
         /* Force execution of one insn next time.  */
         cpu->cflags_next_tb = 1 | curr_cflags();
+        mmap_unlock();
         cpu_loop_exit_noexc(cpu);
     }
 #endif
@@ -1932,8 +1905,7 @@ tb_invalidate_phys_page_range__locked(struct page_collection *pages,
 * access: the virtual CPU will exit the current TB if code is modified inside
 * this TB.
  *
- * Called with tb_lock/mmap_lock held for user-mode emulation
- * Called with tb_lock held for system-mode emulation
+ * Called with mmap_lock held for user-mode emulation
  */
 void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end,
                                    int is_cpu_write_access)
 {
     struct page_collection *pages;
     PageDesc *p;
 
     assert_memory_lock();
-    assert_tb_locked();
 
     p = page_find(start >> TARGET_PAGE_BITS);
     if (p == NULL) {
@@ -1961,14 +1932,15 @@ void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end,
  * access: the virtual CPU will exit the current TB if code is modified inside
  * this TB.
  *
- * Called with mmap_lock held for user-mode emulation, grabs tb_lock
- * Called with tb_lock held for system-mode emulation
+ * Called with mmap_lock held for user-mode emulation.
  */
-static void tb_invalidate_phys_range_1(tb_page_addr_t start, tb_page_addr_t end)
+void tb_invalidate_phys_range(tb_page_addr_t start, tb_page_addr_t end)
 {
     struct page_collection *pages;
     tb_page_addr_t next;
 
+    assert_memory_lock();
+
     pages = page_collection_lock(start, end);
     for (next = (start & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE;
          start < end;
@@ -1984,30 +1956,16 @@ static void tb_invalidate_phys_range_1(tb_page_addr_t start, tb_page_addr_t end)
     page_collection_unlock(pages);
 }
 
-#ifdef CONFIG_SOFTMMU
-void tb_invalidate_phys_range(tb_page_addr_t start, tb_page_addr_t end)
-{
-    assert_tb_locked();
-    tb_invalidate_phys_range_1(start, end);
-}
-#else
-void tb_invalidate_phys_range(tb_page_addr_t start, tb_page_addr_t end)
-{
-    assert_memory_lock();
-    tb_lock();
-    tb_invalidate_phys_range_1(start, end);
-    tb_unlock();
-}
-#endif
-
 #ifdef CONFIG_SOFTMMU
 /* len must be <= 8 and start must be a multiple of len.
  * Called via softmmu_template.h when code areas are written to with
  * iothread mutex not held.
+ *
+ * Call with all @pages in the range [@start, @start + len[ locked.
  */
-void tb_invalidate_phys_page_fast(tb_page_addr_t start, int len)
+void tb_invalidate_phys_page_fast(struct page_collection *pages,
+                                  tb_page_addr_t start, int len)
 {
-    struct page_collection *pages;
     PageDesc *p;
 
 #if 0
@@ -2026,7 +1984,6 @@ void tb_invalidate_phys_page_fast(tb_page_addr_t start, int len)
         return;
     }
 
-    pages = page_collection_lock(start, start + len);
     assert_page_locked(p);
     if (!p->code_bitmap &&
         ++p->code_write_count >= SMC_BITMAP_USE_THRESHOLD) {
@@ -2045,7 +2002,6 @@ void tb_invalidate_phys_page_fast(tb_page_addr_t start, int len)
     do_invalidate:
         tb_invalidate_phys_page_range__locked(pages, p, start, start + len, 1);
     }
-    page_collection_unlock(pages);
 }
 #else
 /* Called with mmap_lock held. If pc is not 0 then it indicates the
@@ -2077,7 +2033,6 @@ static bool tb_invalidate_phys_page(tb_page_addr_t addr, uintptr_t pc)
         return false;
     }
 
-    tb_lock();
 #ifdef TARGET_HAS_PRECISE_SMC
     if (p->first_tb && pc != 0) {
         current_tb = tcg_tb_lookup(pc);
@@ -2110,12 +2065,9 @@ static bool tb_invalidate_phys_page(tb_page_addr_t addr, uintptr_t pc)
     if (current_tb_modified) {
         /* Force execution of one insn next time.  */
         cpu->cflags_next_tb = 1 | curr_cflags();
-        /* tb_lock will be reset after cpu_loop_exit_noexc longjmps
-         * back into the cpu_exec loop. */
         return true;
     }
 #endif
-    tb_unlock();
 
     return false;
 }
@@ -2136,18 +2088,18 @@ void tb_invalidate_phys_addr(AddressSpace *as, hwaddr addr, MemTxAttrs attrs)
         return;
     }
     ram_addr = memory_region_get_ram_addr(mr) + addr;
-    tb_lock();
     tb_invalidate_phys_page_range(ram_addr, ram_addr + 1, 0);
-    tb_unlock();
     rcu_read_unlock();
 }
 #endif /* !defined(CONFIG_USER_ONLY) */
 
-/* Called with tb_lock held.  */
+/* user-mode: call with mmap_lock held */
 void tb_check_watchpoint(CPUState *cpu)
 {
     TranslationBlock *tb;
 
+    assert_memory_lock();
+
     tb = tcg_tb_lookup(cpu->mem_io_pc);
     if (tb) {
         /* We can use retranslation to find the PC.  */
@@ -2181,7 +2133,6 @@ void cpu_io_recompile(CPUState *cpu, uintptr_t retaddr)
     TranslationBlock *tb;
     uint32_t n;
 
-    tb_lock();
     tb = tcg_tb_lookup(retaddr);
     if (!tb) {
         cpu_abort(cpu, "cpu_io_recompile: could not find TB for pc=%p",
@@ -2229,9 +2180,6 @@ void cpu_io_recompile(CPUState *cpu, uintptr_t retaddr)
      *  repeating the fault, which is horribly inefficient.
      *  Better would be to execute just this insn uncached, or generate a
      *  second new TB.
-     *
-     * cpu_loop_exit_noexc will longjmp back to cpu_exec where the
-     * tb_lock gets reset.
      */
     cpu_loop_exit_noexc(cpu);
 }
diff --git a/accel/tcg/translate-all.h b/accel/tcg/translate-all.h
index 6d1d2588b5..e6cb963d7e 100644
--- a/accel/tcg/translate-all.h
+++ b/accel/tcg/translate-all.h
@@ -26,7 +26,8 @@ struct page_collection *page_collection_lock(tb_page_addr_t start,
                                              tb_page_addr_t end);
 void page_collection_unlock(struct page_collection *set);
 
-void tb_invalidate_phys_page_fast(tb_page_addr_t start, int len);
+void tb_invalidate_phys_page_fast(struct page_collection *pages,
+                                  tb_page_addr_t start, int len);
 void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end,
                                    int is_cpu_write_access);
 void tb_invalidate_phys_range(tb_page_addr_t start, tb_page_addr_t end);
--
cgit v1.2.3
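
Usage note: the commit message says tb_lock is removed from notdirty_mem_write
by passing a locked page_collection to tb_invalidate_phys_page_fast. A minimal
sketch of that calling pattern under the new prototype in translate-all.h
follows; the function name and the surrounding softmmu plumbing are
illustrative assumptions, not code from this diff:

    /*
     * Illustrative sketch (not part of this patch): a softmmu write path
     * invalidating TBs for a just-written range under per-page locks.
     * "notdirty_write_sketch" is a hypothetical name; it assumes the
     * QEMU-internal declarations from accel/tcg/translate-all.h above.
     */
    static void notdirty_write_sketch(tb_page_addr_t ram_addr, int size)
    {
        struct page_collection *pages;

        /* Take the per-page locks covering [ram_addr, ram_addr + size). */
        pages = page_collection_lock(ram_addr, ram_addr + size);
        /*
         * The callee now asserts the pages are locked (assert_page_locked)
         * rather than taking the old global tb_lock itself.
         */
        tb_invalidate_phys_page_fast(pages, ram_addr, size);
        page_collection_unlock(pages);
    }

The design point is that the writer already knows exactly which pages it is
dirtying, so handing the locked collection down the call chain replaces the
single global tb_lock with locks scoped to just the affected pages.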