diff options
40 files changed, 1878 insertions, 476 deletions
@@ -5879,6 +5879,7 @@ mkdir -p $target_dir echo "# Automatically generated by configure - do not modify" > $config_target_mak bflt="no" +mttcg="no" interp_prefix1=$(echo "$interp_prefix" | sed "s/%M/$target_name/g") gdb_xml_files="" @@ -5897,11 +5898,13 @@ case "$target_name" in arm|armeb) TARGET_ARCH=arm bflt="yes" + mttcg="yes" gdb_xml_files="arm-core.xml arm-vfp.xml arm-vfp3.xml arm-neon.xml" ;; aarch64) TARGET_BASE_ARCH=arm bflt="yes" + mttcg="yes" gdb_xml_files="aarch64-core.xml aarch64-fpu.xml arm-core.xml arm-vfp.xml arm-vfp3.xml arm-neon.xml" ;; cris) @@ -6066,6 +6069,9 @@ if test "$target_bigendian" = "yes" ; then fi if test "$target_softmmu" = "yes" ; then echo "CONFIG_SOFTMMU=y" >> $config_target_mak + if test "$mttcg" = "yes" ; then + echo "TARGET_SUPPORTS_MTTCG=y" >> $config_target_mak + fi fi if test "$target_user_only" = "yes" ; then echo "CONFIG_USER_ONLY=y" >> $config_target_mak diff --git a/cpu-exec-common.c b/cpu-exec-common.c index 767d9c6f0c..0504a9457b 100644 --- a/cpu-exec-common.c +++ b/cpu-exec-common.c @@ -23,9 +23,6 @@ #include "exec/exec-all.h" #include "exec/memory-internal.h" -bool exit_request; -CPUState *tcg_current_cpu; - /* exit the current TB, but without causing any exception to be raised */ void cpu_loop_exit_noexc(CPUState *cpu) { diff --git a/cpu-exec.c b/cpu-exec.c index 142a5862fc..1a5ad4889d 100644 --- a/cpu-exec.c +++ b/cpu-exec.c @@ -29,6 +29,7 @@ #include "qemu/rcu.h" #include "exec/tb-hash.h" #include "exec/log.h" +#include "qemu/main-loop.h" #if defined(TARGET_I386) && !defined(CONFIG_USER_ONLY) #include "hw/i386/apic.h" #endif @@ -227,20 +228,43 @@ static void cpu_exec_nocache(CPUState *cpu, int max_cycles, static void cpu_exec_step(CPUState *cpu) { + CPUClass *cc = CPU_GET_CLASS(cpu); CPUArchState *env = (CPUArchState *)cpu->env_ptr; TranslationBlock *tb; target_ulong cs_base, pc; uint32_t flags; cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags); - tb = tb_gen_code(cpu, pc, cs_base, flags, - 1 | CF_NOCACHE | CF_IGNORE_ICOUNT); - tb->orig_tb = NULL; - /* execute the generated code */ - trace_exec_tb_nocache(tb, pc); - cpu_tb_exec(cpu, tb); - tb_phys_invalidate(tb, -1); - tb_free(tb); + if (sigsetjmp(cpu->jmp_env, 0) == 0) { + mmap_lock(); + tb_lock(); + tb = tb_gen_code(cpu, pc, cs_base, flags, + 1 | CF_NOCACHE | CF_IGNORE_ICOUNT); + tb->orig_tb = NULL; + tb_unlock(); + mmap_unlock(); + + cc->cpu_exec_enter(cpu); + /* execute the generated code */ + trace_exec_tb_nocache(tb, pc); + cpu_tb_exec(cpu, tb); + cc->cpu_exec_exit(cpu); + + tb_lock(); + tb_phys_invalidate(tb, -1); + tb_free(tb); + tb_unlock(); + } else { + /* We may have exited due to another problem here, so we need + * to reset any tb_locks we may have taken but didn't release. + * The mmap_lock is dropped by tb_gen_code if it runs out of + * memory. + */ +#ifndef CONFIG_SOFTMMU + tcg_debug_assert(!have_mmap_lock()); +#endif + tb_lock_reset(); + } } void cpu_exec_step_atomic(CPUState *cpu) @@ -384,12 +408,13 @@ static inline bool cpu_handle_halt(CPUState *cpu) if ((cpu->interrupt_request & CPU_INTERRUPT_POLL) && replay_interrupt()) { X86CPU *x86_cpu = X86_CPU(cpu); + qemu_mutex_lock_iothread(); apic_poll_irq(x86_cpu->apic_state); cpu_reset_interrupt(cpu, CPU_INTERRUPT_POLL); + qemu_mutex_unlock_iothread(); } #endif if (!cpu_has_work(cpu)) { - current_cpu = NULL; return true; } @@ -439,7 +464,9 @@ static inline bool cpu_handle_exception(CPUState *cpu, int *ret) #else if (replay_exception()) { CPUClass *cc = CPU_GET_CLASS(cpu); + qemu_mutex_lock_iothread(); cc->do_interrupt(cpu); + qemu_mutex_unlock_iothread(); cpu->exception_index = -1; } else if (!replay_has_interrupt()) { /* give a chance to iothread in replay mode */ @@ -465,9 +492,11 @@ static inline bool cpu_handle_interrupt(CPUState *cpu, TranslationBlock **last_tb) { CPUClass *cc = CPU_GET_CLASS(cpu); - int interrupt_request = cpu->interrupt_request; - if (unlikely(interrupt_request)) { + if (unlikely(atomic_read(&cpu->interrupt_request))) { + int interrupt_request; + qemu_mutex_lock_iothread(); + interrupt_request = cpu->interrupt_request; if (unlikely(cpu->singlestep_enabled & SSTEP_NOIRQ)) { /* Mask out external interrupts for this step. */ interrupt_request &= ~CPU_INTERRUPT_SSTEP_MASK; @@ -475,6 +504,7 @@ static inline bool cpu_handle_interrupt(CPUState *cpu, if (interrupt_request & CPU_INTERRUPT_DEBUG) { cpu->interrupt_request &= ~CPU_INTERRUPT_DEBUG; cpu->exception_index = EXCP_DEBUG; + qemu_mutex_unlock_iothread(); return true; } if (replay_mode == REPLAY_MODE_PLAY && !replay_has_interrupt()) { @@ -484,6 +514,7 @@ static inline bool cpu_handle_interrupt(CPUState *cpu, cpu->interrupt_request &= ~CPU_INTERRUPT_HALT; cpu->halted = 1; cpu->exception_index = EXCP_HLT; + qemu_mutex_unlock_iothread(); return true; } #if defined(TARGET_I386) @@ -494,12 +525,14 @@ static inline bool cpu_handle_interrupt(CPUState *cpu, cpu_svm_check_intercept_param(env, SVM_EXIT_INIT, 0, 0); do_cpu_init(x86_cpu); cpu->exception_index = EXCP_HALTED; + qemu_mutex_unlock_iothread(); return true; } #else else if (interrupt_request & CPU_INTERRUPT_RESET) { replay_interrupt(); cpu_reset(cpu); + qemu_mutex_unlock_iothread(); return true; } #endif @@ -522,7 +555,12 @@ static inline bool cpu_handle_interrupt(CPUState *cpu, the program flow was changed */ *last_tb = NULL; } + + /* If we exit via cpu_loop_exit/longjmp it is reset in cpu_exec */ + qemu_mutex_unlock_iothread(); } + + if (unlikely(atomic_read(&cpu->exit_request) || replay_has_interrupt())) { atomic_set(&cpu->exit_request, 0); cpu->exception_index = EXCP_INTERRUPT; @@ -548,15 +586,13 @@ static inline void cpu_loop_exec_tb(CPUState *cpu, TranslationBlock *tb, *tb_exit = ret & TB_EXIT_MASK; switch (*tb_exit) { case TB_EXIT_REQUESTED: - /* Something asked us to stop executing - * chained TBs; just continue round the main - * loop. Whatever requested the exit will also - * have set something else (eg exit_request or - * interrupt_request) which we will handle - * next time around the loop. But we need to - * ensure the zeroing of tcg_exit_req (see cpu_tb_exec) - * comes before the next read of cpu->exit_request - * or cpu->interrupt_request. + /* Something asked us to stop executing chained TBs; just + * continue round the main loop. Whatever requested the exit + * will also have set something else (eg interrupt_request) + * which we will handle next time around the loop. But we + * need to ensure the tcg_exit_req read in generated code + * comes before the next read of cpu->exit_request or + * cpu->interrupt_request. */ smp_mb(); *last_tb = NULL; @@ -608,13 +644,8 @@ int cpu_exec(CPUState *cpu) return EXCP_HALTED; } - atomic_mb_set(&tcg_current_cpu, cpu); rcu_read_lock(); - if (unlikely(atomic_mb_read(&exit_request))) { - cpu->exit_request = 1; - } - cc->cpu_exec_enter(cpu); /* Calculate difference between guest clock and host clock. @@ -640,6 +671,9 @@ int cpu_exec(CPUState *cpu) #endif /* buggy compiler */ cpu->can_do_io = 1; tb_lock_reset(); + if (qemu_mutex_iothread_locked()) { + qemu_mutex_unlock_iothread(); + } } /* if an exception is pending, we execute it here */ @@ -659,10 +693,5 @@ int cpu_exec(CPUState *cpu) cc->cpu_exec_exit(cpu); rcu_read_unlock(); - /* fail safe : never use current_cpu outside cpu_exec() */ - current_cpu = NULL; - - /* Does not need atomic_mb_set because a spurious wakeup is okay. */ - atomic_set(&tcg_current_cpu, NULL); return ret; } @@ -25,6 +25,7 @@ /* Needed early for CONFIG_BSD etc. */ #include "qemu/osdep.h" #include "qemu-common.h" +#include "qemu/config-file.h" #include "cpu.h" #include "monitor/monitor.h" #include "qapi/qmp/qerror.h" @@ -45,6 +46,7 @@ #include "qemu/main-loop.h" #include "qemu/bitmap.h" #include "qemu/seqlock.h" +#include "tcg.h" #include "qapi-event.h" #include "hw/nmi.h" #include "sysemu/replay.h" @@ -150,6 +152,77 @@ typedef struct TimersState { } TimersState; static TimersState timers_state; +bool mttcg_enabled; + +/* + * We default to false if we know other options have been enabled + * which are currently incompatible with MTTCG. Otherwise when each + * guest (target) has been updated to support: + * - atomic instructions + * - memory ordering primitives (barriers) + * they can set the appropriate CONFIG flags in ${target}-softmmu.mak + * + * Once a guest architecture has been converted to the new primitives + * there are two remaining limitations to check. + * + * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host) + * - The host must have a stronger memory order than the guest + * + * It may be possible in future to support strong guests on weak hosts + * but that will require tagging all load/stores in a guest with their + * implicit memory order requirements which would likely slow things + * down a lot. + */ + +static bool check_tcg_memory_orders_compatible(void) +{ +#if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO) + return (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO) == 0; +#else + return false; +#endif +} + +static bool default_mttcg_enabled(void) +{ + QemuOpts *icount_opts = qemu_find_opts_singleton("icount"); + const char *rr = qemu_opt_get(icount_opts, "rr"); + + if (rr || TCG_OVERSIZED_GUEST) { + return false; + } else { +#ifdef TARGET_SUPPORTS_MTTCG + return check_tcg_memory_orders_compatible(); +#else + return false; +#endif + } +} + +void qemu_tcg_configure(QemuOpts *opts, Error **errp) +{ + const char *t = qemu_opt_get(opts, "thread"); + if (t) { + if (strcmp(t, "multi") == 0) { + if (TCG_OVERSIZED_GUEST) { + error_setg(errp, "No MTTCG when guest word size > hosts"); + } else { + if (!check_tcg_memory_orders_compatible()) { + error_report("Guest expects a stronger memory ordering " + "than the host provides"); + error_printf("This may cause strange/hard to debug errors"); + } + mttcg_enabled = true; + } + } else if (strcmp(t, "single") == 0) { + mttcg_enabled = false; + } else { + error_setg(errp, "Invalid 'thread' setting %s", t); + } + } else { + mttcg_enabled = default_mttcg_enabled(); + } +} int64_t cpu_get_icount_raw(void) { @@ -695,6 +768,63 @@ void configure_icount(QemuOpts *opts, Error **errp) } /***********************************************************/ +/* TCG vCPU kick timer + * + * The kick timer is responsible for moving single threaded vCPU + * emulation on to the next vCPU. If more than one vCPU is running a + * timer event with force a cpu->exit so the next vCPU can get + * scheduled. + * + * The timer is removed if all vCPUs are idle and restarted again once + * idleness is complete. + */ + +static QEMUTimer *tcg_kick_vcpu_timer; +static CPUState *tcg_current_rr_cpu; + +#define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10) + +static inline int64_t qemu_tcg_next_kick(void) +{ + return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD; +} + +/* Kick the currently round-robin scheduled vCPU */ +static void qemu_cpu_kick_rr_cpu(void) +{ + CPUState *cpu; + do { + cpu = atomic_mb_read(&tcg_current_rr_cpu); + if (cpu) { + cpu_exit(cpu); + } + } while (cpu != atomic_mb_read(&tcg_current_rr_cpu)); +} + +static void kick_tcg_thread(void *opaque) +{ + timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick()); + qemu_cpu_kick_rr_cpu(); +} + +static void start_tcg_kick_timer(void) +{ + if (!mttcg_enabled && !tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) { + tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, + kick_tcg_thread, NULL); + timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick()); + } +} + +static void stop_tcg_kick_timer(void) +{ + if (tcg_kick_vcpu_timer) { + timer_del(tcg_kick_vcpu_timer); + tcg_kick_vcpu_timer = NULL; + } +} + +/***********************************************************/ void hw_error(const char *fmt, ...) { va_list ap; @@ -896,8 +1026,6 @@ static void qemu_kvm_init_cpu_signals(CPUState *cpu) #endif /* _WIN32 */ static QemuMutex qemu_global_mutex; -static QemuCond qemu_io_proceeded_cond; -static unsigned iothread_requesting_mutex; static QemuThread io_thread; @@ -911,7 +1039,6 @@ void qemu_init_cpu_loop(void) qemu_init_sigbus(); qemu_cond_init(&qemu_cpu_cond); qemu_cond_init(&qemu_pause_cond); - qemu_cond_init(&qemu_io_proceeded_cond); qemu_mutex_init(&qemu_global_mutex); qemu_thread_get_self(&io_thread); @@ -936,28 +1063,34 @@ static void qemu_tcg_destroy_vcpu(CPUState *cpu) static void qemu_wait_io_event_common(CPUState *cpu) { + atomic_mb_set(&cpu->thread_kicked, false); if (cpu->stop) { cpu->stop = false; cpu->stopped = true; qemu_cond_broadcast(&qemu_pause_cond); } process_queued_cpu_work(cpu); - cpu->thread_kicked = false; +} + +static bool qemu_tcg_should_sleep(CPUState *cpu) +{ + if (mttcg_enabled) { + return cpu_thread_is_idle(cpu); + } else { + return all_cpu_threads_idle(); + } } static void qemu_tcg_wait_io_event(CPUState *cpu) { - while (all_cpu_threads_idle()) { + while (qemu_tcg_should_sleep(cpu)) { + stop_tcg_kick_timer(); qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex); } - while (iothread_requesting_mutex) { - qemu_cond_wait(&qemu_io_proceeded_cond, &qemu_global_mutex); - } + start_tcg_kick_timer(); - CPU_FOREACH(cpu) { - qemu_wait_io_event_common(cpu); - } + qemu_wait_io_event_common(cpu); } static void qemu_kvm_wait_io_event(CPUState *cpu) @@ -1028,6 +1161,7 @@ static void *qemu_dummy_cpu_thread_fn(void *arg) qemu_thread_get_self(cpu->thread); cpu->thread_id = qemu_get_thread_id(); cpu->can_do_io = 1; + current_cpu = cpu; sigemptyset(&waitset); sigaddset(&waitset, SIG_IPI); @@ -1036,9 +1170,7 @@ static void *qemu_dummy_cpu_thread_fn(void *arg) cpu->created = true; qemu_cond_signal(&qemu_cpu_cond); - current_cpu = cpu; while (1) { - current_cpu = NULL; qemu_mutex_unlock_iothread(); do { int sig; @@ -1049,7 +1181,6 @@ static void *qemu_dummy_cpu_thread_fn(void *arg) exit(1); } qemu_mutex_lock_iothread(); - current_cpu = cpu; qemu_wait_io_event_common(cpu); } @@ -1115,9 +1246,11 @@ static int tcg_cpu_exec(CPUState *cpu) cpu->icount_decr.u16.low = decr; cpu->icount_extra = count; } + qemu_mutex_unlock_iothread(); cpu_exec_start(cpu); ret = cpu_exec(cpu); cpu_exec_end(cpu); + qemu_mutex_lock_iothread(); #ifdef CONFIG_PROFILER tcg_time += profile_getclock() - ti; #endif @@ -1150,7 +1283,16 @@ static void deal_with_unplugged_cpus(void) } } -static void *qemu_tcg_cpu_thread_fn(void *arg) +/* Single-threaded TCG + * + * In the single-threaded case each vCPU is simulated in turn. If + * there is more than a single vCPU we create a simple timer to kick + * the vCPU and ensure we don't get stuck in a tight loop in one vCPU. + * This is done explicitly rather than relying on side-effects + * elsewhere. + */ + +static void *qemu_tcg_rr_cpu_thread_fn(void *arg) { CPUState *cpu = arg; @@ -1172,15 +1314,18 @@ static void *qemu_tcg_cpu_thread_fn(void *arg) /* process any pending work */ CPU_FOREACH(cpu) { + current_cpu = cpu; qemu_wait_io_event_common(cpu); } } - /* process any pending work */ - atomic_mb_set(&exit_request, 1); + start_tcg_kick_timer(); cpu = first_cpu; + /* process any pending work */ + cpu->exit_request = 1; + while (1) { /* Account partial waits to QEMU_CLOCK_VIRTUAL. */ qemu_account_warp_timer(); @@ -1189,7 +1334,10 @@ static void *qemu_tcg_cpu_thread_fn(void *arg) cpu = first_cpu; } - for (; cpu != NULL && !exit_request; cpu = CPU_NEXT(cpu)) { + while (cpu && !cpu->queued_work_first && !cpu->exit_request) { + + atomic_mb_set(&tcg_current_rr_cpu, cpu); + current_cpu = cpu; qemu_clock_enable(QEMU_CLOCK_VIRTUAL, (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0); @@ -1200,22 +1348,32 @@ static void *qemu_tcg_cpu_thread_fn(void *arg) if (r == EXCP_DEBUG) { cpu_handle_guest_debug(cpu); break; + } else if (r == EXCP_ATOMIC) { + qemu_mutex_unlock_iothread(); + cpu_exec_step_atomic(cpu); + qemu_mutex_lock_iothread(); + break; } - } else if (cpu->stop || cpu->stopped) { + } else if (cpu->stop) { if (cpu->unplug) { cpu = CPU_NEXT(cpu); } break; } - } /* for cpu.. */ + cpu = CPU_NEXT(cpu); + } /* while (cpu && !cpu->exit_request).. */ + + /* Does not need atomic_mb_set because a spurious wakeup is okay. */ + atomic_set(&tcg_current_rr_cpu, NULL); - /* Pairs with smp_wmb in qemu_cpu_kick. */ - atomic_mb_set(&exit_request, 0); + if (cpu && cpu->exit_request) { + atomic_mb_set(&cpu->exit_request, 0); + } handle_icount_deadline(); - qemu_tcg_wait_io_event(QTAILQ_FIRST(&cpus)); + qemu_tcg_wait_io_event(cpu ? cpu : QTAILQ_FIRST(&cpus)); deal_with_unplugged_cpus(); } @@ -1262,6 +1420,68 @@ static void CALLBACK dummy_apc_func(ULONG_PTR unused) } #endif +/* Multi-threaded TCG + * + * In the multi-threaded case each vCPU has its own thread. The TLS + * variable current_cpu can be used deep in the code to find the + * current CPUState for a given thread. + */ + +static void *qemu_tcg_cpu_thread_fn(void *arg) +{ + CPUState *cpu = arg; + + rcu_register_thread(); + + qemu_mutex_lock_iothread(); + qemu_thread_get_self(cpu->thread); + + cpu->thread_id = qemu_get_thread_id(); + cpu->created = true; + cpu->can_do_io = 1; + current_cpu = cpu; + qemu_cond_signal(&qemu_cpu_cond); + + /* process any pending work */ + cpu->exit_request = 1; + + while (1) { + if (cpu_can_run(cpu)) { + int r; + r = tcg_cpu_exec(cpu); + switch (r) { + case EXCP_DEBUG: + cpu_handle_guest_debug(cpu); + break; + case EXCP_HALTED: + /* during start-up the vCPU is reset and the thread is + * kicked several times. If we don't ensure we go back + * to sleep in the halted state we won't cleanly + * start-up when the vCPU is enabled. + * + * cpu->halted should ensure we sleep in wait_io_event + */ + g_assert(cpu->halted); + break; + case EXCP_ATOMIC: + qemu_mutex_unlock_iothread(); + cpu_exec_step_atomic(cpu); + qemu_mutex_lock_iothread(); + default: + /* Ignore everything else? */ + break; + } + } + + handle_icount_deadline(); + + atomic_mb_set(&cpu->exit_request, 0); + qemu_tcg_wait_io_event(cpu); + } + + return NULL; +} + static void qemu_cpu_kick_thread(CPUState *cpu) { #ifndef _WIN32 @@ -1287,24 +1507,13 @@ static void qemu_cpu_kick_thread(CPUState *cpu) #endif } -static void qemu_cpu_kick_no_halt(void) -{ - CPUState *cpu; - /* Ensure whatever caused the exit has reached the CPU threads before - * writing exit_request. - */ - atomic_mb_set(&exit_request, 1); - cpu = atomic_mb_read(&tcg_current_cpu); - if (cpu) { - cpu_exit(cpu); - } -} - void qemu_cpu_kick(CPUState *cpu) { qemu_cond_broadcast(cpu->halt_cond); if (tcg_enabled()) { - qemu_cpu_kick_no_halt(); + cpu_exit(cpu); + /* NOP unless doing single-thread RR */ + qemu_cpu_kick_rr_cpu(); } else { if (hax_enabled()) { /* @@ -1342,27 +1551,14 @@ bool qemu_mutex_iothread_locked(void) void qemu_mutex_lock_iothread(void) { - atomic_inc(&iothread_requesting_mutex); - /* In the simple case there is no need to bump the VCPU thread out of - * TCG code execution. - */ - if (!tcg_enabled() || qemu_in_vcpu_thread() || - !first_cpu || !first_cpu->created) { - qemu_mutex_lock(&qemu_global_mutex); - atomic_dec(&iothread_requesting_mutex); - } else { - if (qemu_mutex_trylock(&qemu_global_mutex)) { - qemu_cpu_kick_no_halt(); - qemu_mutex_lock(&qemu_global_mutex); - } - atomic_dec(&iothread_requesting_mutex); - qemu_cond_broadcast(&qemu_io_proceeded_cond); - } + g_assert(!qemu_mutex_iothread_locked()); + qemu_mutex_lock(&qemu_global_mutex); iothread_locked = true; } void qemu_mutex_unlock_iothread(void) { + g_assert(qemu_mutex_iothread_locked()); iothread_locked = false; qemu_mutex_unlock(&qemu_global_mutex); } @@ -1392,13 +1588,6 @@ void pause_all_vcpus(void) if (qemu_in_vcpu_thread()) { cpu_stop_current(); - if (!kvm_enabled()) { - CPU_FOREACH(cpu) { - cpu->stop = false; - cpu->stopped = true; - } - return; - } } while (!all_vcpus_paused()) { @@ -1447,29 +1636,43 @@ void cpu_remove_sync(CPUState *cpu) static void qemu_tcg_init_vcpu(CPUState *cpu) { char thread_name[VCPU_THREAD_NAME_SIZE]; - static QemuCond *tcg_halt_cond; - static QemuThread *tcg_cpu_thread; + static QemuCond *single_tcg_halt_cond; + static QemuThread *single_tcg_cpu_thread; - /* share a single thread for all cpus with TCG */ - if (!tcg_cpu_thread) { + if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) { cpu->thread = g_malloc0(sizeof(QemuThread)); cpu->halt_cond = g_malloc0(sizeof(QemuCond)); qemu_cond_init(cpu->halt_cond); - tcg_halt_cond = cpu->halt_cond; - snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG", + + if (qemu_tcg_mttcg_enabled()) { + /* create a thread per vCPU with TCG (MTTCG) */ + parallel_cpus = true; + snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG", cpu->cpu_index); - qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn, - cpu, QEMU_THREAD_JOINABLE); + + qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn, + cpu, QEMU_THREAD_JOINABLE); + + } else { + /* share a single thread for all cpus with TCG */ + snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG"); + qemu_thread_create(cpu->thread, thread_name, + qemu_tcg_rr_cpu_thread_fn, + cpu, QEMU_THREAD_JOINABLE); + + single_tcg_halt_cond = cpu->halt_cond; + single_tcg_cpu_thread = cpu->thread; + } #ifdef _WIN32 cpu->hThread = qemu_thread_get_handle(cpu->thread); #endif while (!cpu->created) { qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex); } - tcg_cpu_thread = cpu->thread; } else { - cpu->thread = tcg_cpu_thread; - cpu->halt_cond = tcg_halt_cond; + /* For non-MTTCG cases we share the thread */ + cpu->thread = single_tcg_cpu_thread; + cpu->halt_cond = single_tcg_halt_cond; } } @@ -18,6 +18,7 @@ */ #include "qemu/osdep.h" +#include "qemu/main-loop.h" #include "cpu.h" #include "exec/exec-all.h" #include "exec/memory.h" @@ -57,6 +58,40 @@ } \ } while (0) +#define assert_cpu_is_self(this_cpu) do { \ + if (DEBUG_TLB_GATE) { \ + g_assert(!cpu->created || qemu_cpu_is_self(cpu)); \ + } \ + } while (0) + +/* run_on_cpu_data.target_ptr should always be big enough for a + * target_ulong even on 32 bit builds */ +QEMU_BUILD_BUG_ON(sizeof(target_ulong) > sizeof(run_on_cpu_data)); + +/* We currently can't handle more than 16 bits in the MMUIDX bitmask. + */ +QEMU_BUILD_BUG_ON(NB_MMU_MODES > 16); +#define ALL_MMUIDX_BITS ((1 << NB_MMU_MODES) - 1) + +/* flush_all_helper: run fn across all cpus + * + * If the wait flag is set then the src cpu's helper will be queued as + * "safe" work and the loop exited creating a synchronisation point + * where all queued work will be finished before execution starts + * again. + */ +static void flush_all_helper(CPUState *src, run_on_cpu_func fn, + run_on_cpu_data d) +{ + CPUState *cpu; + + CPU_FOREACH(cpu) { + if (cpu != src) { + async_run_on_cpu(cpu, fn, d); + } + } +} + /* statistics */ int tlb_flush_count; @@ -65,10 +100,22 @@ int tlb_flush_count; * flushing more entries than required is only an efficiency issue, * not a correctness issue. */ -void tlb_flush(CPUState *cpu) +static void tlb_flush_nocheck(CPUState *cpu) { CPUArchState *env = cpu->env_ptr; + /* The QOM tests will trigger tlb_flushes without setting up TCG + * so we bug out here in that case. + */ + if (!tcg_enabled()) { + return; + } + + assert_cpu_is_self(cpu); + tlb_debug("(count: %d)\n", tlb_flush_count++); + + tb_lock(); + memset(env->tlb_table, -1, sizeof(env->tlb_table)); memset(env->tlb_v_table, -1, sizeof(env->tlb_v_table)); memset(cpu->tb_jmp_cache, 0, sizeof(cpu->tb_jmp_cache)); @@ -76,39 +123,117 @@ void tlb_flush(CPUState *cpu) env->vtlb_index = 0; env->tlb_flush_addr = -1; env->tlb_flush_mask = 0; - tlb_flush_count++; + + tb_unlock(); + + atomic_mb_set(&cpu->pending_tlb_flush, 0); +} + +static void tlb_flush_global_async_work(CPUState *cpu, run_on_cpu_data data) +{ + tlb_flush_nocheck(cpu); +} + +void tlb_flush(CPUState *cpu) +{ + if (cpu->created && !qemu_cpu_is_self(cpu)) { + if (atomic_mb_read(&cpu->pending_tlb_flush) != ALL_MMUIDX_BITS) { + atomic_mb_set(&cpu->pending_tlb_flush, ALL_MMUIDX_BITS); + async_run_on_cpu(cpu, tlb_flush_global_async_work, + RUN_ON_CPU_NULL); + } + } else { + tlb_flush_nocheck(cpu); + } +} + +void tlb_flush_all_cpus(CPUState *src_cpu) +{ + const run_on_cpu_func fn = tlb_flush_global_async_work; + flush_all_helper(src_cpu, fn, RUN_ON_CPU_NULL); + fn(src_cpu, RUN_ON_CPU_NULL); +} + +void tlb_flush_all_cpus_synced(CPUState *src_cpu) +{ + const run_on_cpu_func fn = tlb_flush_global_async_work; + flush_all_helper(src_cpu, fn, RUN_ON_CPU_NULL); + async_safe_run_on_cpu(src_cpu, fn, RUN_ON_CPU_NULL); } -static inline void v_tlb_flush_by_mmuidx(CPUState *cpu, va_list argp) +static void tlb_flush_by_mmuidx_async_work(CPUState *cpu, run_on_cpu_data data) { CPUArchState *env = cpu->env_ptr; + unsigned long mmu_idx_bitmask = data.host_int; + int mmu_idx; - tlb_debug("start\n"); + assert_cpu_is_self(cpu); - for (;;) { - int mmu_idx = va_arg(argp, int); + tb_lock(); - if (mmu_idx < 0) { - break; - } + tlb_debug("start: mmu_idx:0x%04lx\n", mmu_idx_bitmask); - tlb_debug("%d\n", mmu_idx); + for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) { + + if (test_bit(mmu_idx, &mmu_idx_bitmask)) { + tlb_debug("%d\n", mmu_idx); - memset(env->tlb_table[mmu_idx], -1, sizeof(env->tlb_table[0])); - memset(env->tlb_v_table[mmu_idx], -1, sizeof(env->tlb_v_table[0])); + memset(env->tlb_table[mmu_idx], -1, sizeof(env->tlb_table[0])); + memset(env->tlb_v_table[mmu_idx], -1, sizeof(env->tlb_v_table[0])); + } } memset(cpu->tb_jmp_cache, 0, sizeof(cpu->tb_jmp_cache)); + + tlb_debug("done\n"); + + tb_unlock(); } -void tlb_flush_by_mmuidx(CPUState *cpu, ...) +void tlb_flush_by_mmuidx(CPUState *cpu, uint16_t idxmap) { - va_list argp; - va_start(argp, cpu); - v_tlb_flush_by_mmuidx(cpu, argp); - va_end(argp); + tlb_debug("mmu_idx: 0x%" PRIx16 "\n", idxmap); + + if (!qemu_cpu_is_self(cpu)) { + uint16_t pending_flushes = idxmap; + pending_flushes &= ~atomic_mb_read(&cpu->pending_tlb_flush); + + if (pending_flushes) { + tlb_debug("reduced mmu_idx: 0x%" PRIx16 "\n", pending_flushes); + + atomic_or(&cpu->pending_tlb_flush, pending_flushes); + async_run_on_cpu(cpu, tlb_flush_by_mmuidx_async_work, + RUN_ON_CPU_HOST_INT(pending_flushes)); + } + } else { + tlb_flush_by_mmuidx_async_work(cpu, + RUN_ON_CPU_HOST_INT(idxmap)); + } +} + +void tlb_flush_by_mmuidx_all_cpus(CPUState *src_cpu, uint16_t idxmap) +{ + const run_on_cpu_func fn = tlb_flush_by_mmuidx_async_work; + + tlb_debug("mmu_idx: 0x%"PRIx16"\n", idxmap); + + flush_all_helper(src_cpu, fn, RUN_ON_CPU_HOST_INT(idxmap)); + fn(src_cpu, RUN_ON_CPU_HOST_INT(idxmap)); } +void tlb_flush_by_mmuidx_all_cpus_synced(CPUState *src_cpu, + uint16_t idxmap) +{ + const run_on_cpu_func fn = tlb_flush_by_mmuidx_async_work; + + tlb_debug("mmu_idx: 0x%"PRIx16"\n", idxmap); + + flush_all_helper(src_cpu, fn, RUN_ON_CPU_HOST_INT(idxmap)); + async_safe_run_on_cpu(src_cpu, fn, RUN_ON_CPU_HOST_INT(idxmap)); +} + + + static inline void tlb_flush_entry(CPUTLBEntry *tlb_entry, target_ulong addr) { if (addr == (tlb_entry->addr_read & @@ -121,12 +246,15 @@ static inline void tlb_flush_entry(CPUTLBEntry *tlb_entry, target_ulong addr) } } -void tlb_flush_page(CPUState *cpu, target_ulong addr) +static void tlb_flush_page_async_work(CPUState *cpu, run_on_cpu_data data) { CPUArchState *env = cpu->env_ptr; + target_ulong addr = (target_ulong) data.target_ptr; int i; int mmu_idx; + assert_cpu_is_self(cpu); + tlb_debug("page :" TARGET_FMT_lx "\n", addr); /* Check if we need to flush due to large pages. */ @@ -156,15 +284,62 @@ void tlb_flush_page(CPUState *cpu, target_ulong addr) tb_flush_jmp_cache(cpu, addr); } -void tlb_flush_page_by_mmuidx(CPUState *cpu, target_ulong addr, ...) +void tlb_flush_page(CPUState *cpu, target_ulong addr) +{ + tlb_debug("page :" TARGET_FMT_lx "\n", addr); + + if (!qemu_cpu_is_self(cpu)) { + async_run_on_cpu(cpu, tlb_flush_page_async_work, + RUN_ON_CPU_TARGET_PTR(addr)); + } else { + tlb_flush_page_async_work(cpu, RUN_ON_CPU_TARGET_PTR(addr)); + } +} + +/* As we are going to hijack the bottom bits of the page address for a + * mmuidx bit mask we need to fail to build if we can't do that + */ +QEMU_BUILD_BUG_ON(NB_MMU_MODES > TARGET_PAGE_BITS_MIN); + +static void tlb_flush_page_by_mmuidx_async_work(CPUState *cpu, + run_on_cpu_data data) { CPUArchState *env = cpu->env_ptr; - int i, k; - va_list argp; + target_ulong addr_and_mmuidx = (target_ulong) data.target_ptr; + target_ulong addr = addr_and_mmuidx & TARGET_PAGE_MASK; + unsigned long mmu_idx_bitmap = addr_and_mmuidx & ALL_MMUIDX_BITS; + int page = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1); + int mmu_idx; + int i; + + assert_cpu_is_self(cpu); - va_start(argp, addr); + tlb_debug("page:%d addr:"TARGET_FMT_lx" mmu_idx:0x%lx\n", + page, addr, mmu_idx_bitmap); + + for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) { + if (test_bit(mmu_idx, &mmu_idx_bitmap)) { + tlb_flush_entry(&env->tlb_table[mmu_idx][page], addr); + + /* check whether there are vltb entries that need to be flushed */ + for (i = 0; i < CPU_VTLB_SIZE; i++) { + tlb_flush_entry(&env->tlb_v_table[mmu_idx][i], addr); + } + } + } + + tb_flush_jmp_cache(cpu, addr); +} + +static void tlb_check_page_and_flush_by_mmuidx_async_work(CPUState *cpu, + run_on_cpu_data data) +{ + CPUArchState *env = cpu->env_ptr; + target_ulong addr_and_mmuidx = (target_ulong) data.target_ptr; + target_ulong addr = addr_and_mmuidx & TARGET_PAGE_MASK; + unsigned long mmu_idx_bitmap = addr_and_mmuidx & ALL_MMUIDX_BITS; - tlb_debug("addr "TARGET_FMT_lx"\n", addr); + tlb_debug("addr:"TARGET_FMT_lx" mmu_idx: %04lx\n", addr, mmu_idx_bitmap); /* Check if we need to flush due to large pages. */ if ((addr & env->tlb_flush_mask) == env->tlb_flush_addr) { @@ -172,33 +347,80 @@ void tlb_flush_page_by_mmuidx(CPUState *cpu, target_ulong addr, ...) TARGET_FMT_lx "/" TARGET_FMT_lx ")\n", env->tlb_flush_addr, env->tlb_flush_mask); - v_tlb_flush_by_mmuidx(cpu, argp); - va_end(argp); - return; + tlb_flush_by_mmuidx_async_work(cpu, + RUN_ON_CPU_HOST_INT(mmu_idx_bitmap)); + } else { + tlb_flush_page_by_mmuidx_async_work(cpu, data); } +} - addr &= TARGET_PAGE_MASK; - i = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1); +void tlb_flush_page_by_mmuidx(CPUState *cpu, target_ulong addr, uint16_t idxmap) +{ + target_ulong addr_and_mmu_idx; - for (;;) { - int mmu_idx = va_arg(argp, int); + tlb_debug("addr: "TARGET_FMT_lx" mmu_idx:%" PRIx16 "\n", addr, idxmap); - if (mmu_idx < 0) { - break; - } + /* This should already be page aligned */ + addr_and_mmu_idx = addr & TARGET_PAGE_MASK; + addr_and_mmu_idx |= idxmap; - tlb_debug("idx %d\n", mmu_idx); + if (!qemu_cpu_is_self(cpu)) { + async_run_on_cpu(cpu, tlb_check_page_and_flush_by_mmuidx_async_work, + RUN_ON_CPU_TARGET_PTR(addr_and_mmu_idx)); + } else { + tlb_check_page_and_flush_by_mmuidx_async_work( + cpu, RUN_ON_CPU_TARGET_PTR(addr_and_mmu_idx)); + } +} - tlb_flush_entry(&env->tlb_table[mmu_idx][i], addr); +void tlb_flush_page_by_mmuidx_all_cpus(CPUState *src_cpu, target_ulong addr, + uint16_t idxmap) +{ + const run_on_cpu_func fn = tlb_check_page_and_flush_by_mmuidx_async_work; + target_ulong addr_and_mmu_idx; - /* check whether there are vltb entries that need to be flushed */ - for (k = 0; k < CPU_VTLB_SIZE; k++) { - tlb_flush_entry(&env->tlb_v_table[mmu_idx][k], addr); - } - } - va_end(argp); + tlb_debug("addr: "TARGET_FMT_lx" mmu_idx:%"PRIx16"\n", addr, idxmap); - tb_flush_jmp_cache(cpu, addr); + /* This should already be page aligned */ + addr_and_mmu_idx = addr & TARGET_PAGE_MASK; + addr_and_mmu_idx |= idxmap; + + flush_all_helper(src_cpu, fn, RUN_ON_CPU_TARGET_PTR(addr_and_mmu_idx)); + fn(src_cpu, RUN_ON_CPU_TARGET_PTR(addr_and_mmu_idx)); +} + +void tlb_flush_page_by_mmuidx_all_cpus_synced(CPUState *src_cpu, + target_ulong addr, + uint16_t idxmap) +{ + const run_on_cpu_func fn = tlb_check_page_and_flush_by_mmuidx_async_work; + target_ulong addr_and_mmu_idx; + + tlb_debug("addr: "TARGET_FMT_lx" mmu_idx:%"PRIx16"\n", addr, idxmap); + + /* This should already be page aligned */ + addr_and_mmu_idx = addr & TARGET_PAGE_MASK; + addr_and_mmu_idx |= idxmap; + + flush_all_helper(src_cpu, fn, RUN_ON_CPU_TARGET_PTR(addr_and_mmu_idx)); + async_safe_run_on_cpu(src_cpu, fn, RUN_ON_CPU_TARGET_PTR(addr_and_mmu_idx)); +} + +void tlb_flush_page_all_cpus(CPUState *src, target_ulong addr) +{ + const run_on_cpu_func fn = tlb_flush_page_async_work; + + flush_all_helper(src, fn, RUN_ON_CPU_TARGET_PTR(addr)); + fn(src, RUN_ON_CPU_TARGET_PTR(addr)); +} + +void tlb_flush_page_all_cpus_synced(CPUState *src, + target_ulong addr) +{ + const run_on_cpu_func fn = tlb_flush_page_async_work; + + flush_all_helper(src, fn, RUN_ON_CPU_TARGET_PTR(addr)); + async_safe_run_on_cpu(src, fn, RUN_ON_CPU_TARGET_PTR(addr)); } /* update the TLBs so that writes to code in the virtual page 'addr' @@ -216,36 +438,84 @@ void tlb_unprotect_code(ram_addr_t ram_addr) cpu_physical_memory_set_dirty_flag(ram_addr, DIRTY_MEMORY_CODE); } -static bool tlb_is_dirty_ram(CPUTLBEntry *tlbe) -{ - return (tlbe->addr_write & (TLB_INVALID_MASK|TLB_MMIO|TLB_NOTDIRTY)) == 0; -} -void tlb_reset_dirty_range(CPUTLBEntry *tlb_entry, uintptr_t start, +/* + * Dirty write flag handling + * + * When the TCG code writes to a location it looks up the address in + * the TLB and uses that data to compute the final address. If any of + * the lower bits of the address are set then the slow path is forced. + * There are a number of reasons to do this but for normal RAM the + * most usual is detecting writes to code regions which may invalidate + * generated code. + * + * Because we want other vCPUs to respond to changes straight away we + * update the te->addr_write field atomically. If the TLB entry has + * been changed by the vCPU in the mean time we skip the update. + * + * As this function uses atomic accesses we also need to ensure + * updates to tlb_entries follow the same access rules. We don't need + * to worry about this for oversized guests as MTTCG is disabled for + * them. + */ + +static void tlb_reset_dirty_range(CPUTLBEntry *tlb_entry, uintptr_t start, uintptr_t length) { - uintptr_t addr; +#if TCG_OVERSIZED_GUEST + uintptr_t addr = tlb_entry->addr_write; - if (tlb_is_dirty_ram(tlb_entry)) { - addr = (tlb_entry->addr_write & TARGET_PAGE_MASK) + tlb_entry->addend; + if ((addr & (TLB_INVALID_MASK | TLB_MMIO | TLB_NOTDIRTY)) == 0) { + addr &= TARGET_PAGE_MASK; + addr += tlb_entry->addend; if ((addr - start) < length) { tlb_entry->addr_write |= TLB_NOTDIRTY; } } +#else + /* paired with atomic_mb_set in tlb_set_page_with_attrs */ + uintptr_t orig_addr = atomic_mb_read(&tlb_entry->addr_write); + uintptr_t addr = orig_addr; + + if ((addr & (TLB_INVALID_MASK | TLB_MMIO | TLB_NOTDIRTY)) == 0) { + addr &= TARGET_PAGE_MASK; + addr += atomic_read(&tlb_entry->addend); + if ((addr - start) < length) { + uintptr_t notdirty_addr = orig_addr | TLB_NOTDIRTY; + atomic_cmpxchg(&tlb_entry->addr_write, orig_addr, notdirty_addr); + } + } +#endif } -static inline ram_addr_t qemu_ram_addr_from_host_nofail(void *ptr) +/* For atomic correctness when running MTTCG we need to use the right + * primitives when copying entries */ +static inline void copy_tlb_helper(CPUTLBEntry *d, CPUTLBEntry *s, + bool atomic_set) { - ram_addr_t ram_addr; - - ram_addr = qemu_ram_addr_from_host(ptr); - if (ram_addr == RAM_ADDR_INVALID) { - fprintf(stderr, "Bad ram pointer %p\n", ptr); - abort(); +#if TCG_OVERSIZED_GUEST + *d = *s; +#else + if (atomic_set) { + d->addr_read = s->addr_read; + d->addr_code = s->addr_code; + atomic_set(&d->addend, atomic_read(&s->addend)); + /* Pairs with flag setting in tlb_reset_dirty_range */ + atomic_mb_set(&d->addr_write, atomic_read(&s->addr_write)); + } else { + d->addr_read = s->addr_read; + d->addr_write = atomic_read(&s->addr_write); + d->addr_code = s->addr_code; + d->addend = atomic_read(&s->addend); } - return ram_addr; +#endif } +/* This is a cross vCPU call (i.e. another vCPU resetting the flags of + * the target vCPU). As such care needs to be taken that we don't + * dangerously race with another vCPU update. The only thing actually + * updated is the target TLB entry ->addr_write flags. + */ void tlb_reset_dirty(CPUState *cpu, ram_addr_t start1, ram_addr_t length) { CPUArchState *env; @@ -283,6 +553,8 @@ void tlb_set_dirty(CPUState *cpu, target_ulong vaddr) int i; int mmu_idx; + assert_cpu_is_self(cpu); + vaddr &= TARGET_PAGE_MASK; i = (vaddr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1); for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) { @@ -337,11 +609,12 @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr, target_ulong address; target_ulong code_address; uintptr_t addend; - CPUTLBEntry *te; + CPUTLBEntry *te, *tv, tn; hwaddr iotlb, xlat, sz; unsigned vidx = env->vtlb_index++ % CPU_VTLB_SIZE; int asidx = cpu_asidx_from_attrs(cpu, attrs); + assert_cpu_is_self(cpu); assert(size >= TARGET_PAGE_SIZE); if (size != TARGET_PAGE_SIZE) { tlb_add_large_page(env, vaddr, size); @@ -371,41 +644,50 @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr, index = (vaddr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1); te = &env->tlb_table[mmu_idx][index]; - /* do not discard the translation in te, evict it into a victim tlb */ - env->tlb_v_table[mmu_idx][vidx] = *te; + tv = &env->tlb_v_table[mmu_idx][vidx]; + + /* addr_write can race with tlb_reset_dirty_range */ + copy_tlb_helper(tv, te, true); + env->iotlb_v[mmu_idx][vidx] = env->iotlb[mmu_idx][index]; /* refill the tlb */ env->iotlb[mmu_idx][index].addr = iotlb - vaddr; env->iotlb[mmu_idx][index].attrs = attrs; - te->addend = addend - vaddr; + + /* Now calculate the new entry */ + tn.addend = addend - vaddr; if (prot & PAGE_READ) { - te->addr_read = address; + tn.addr_read = address; } else { - te->addr_read = -1; + tn.addr_read = -1; } if (prot & PAGE_EXEC) { - te->addr_code = code_address; + tn.addr_code = code_address; } else { - te->addr_code = -1; + tn.addr_code = -1; } + + tn.addr_write = -1; if (prot & PAGE_WRITE) { if ((memory_region_is_ram(section->mr) && section->readonly) || memory_region_is_romd(section->mr)) { /* Write access calls the I/O callback. */ - te->addr_write = address | TLB_MMIO; + tn.addr_write = address | TLB_MMIO; } else if (memory_region_is_ram(section->mr) && cpu_physical_memory_is_clean( memory_region_get_ram_addr(section->mr) + xlat)) { - te->addr_write = address | TLB_NOTDIRTY; + tn.addr_write = address | TLB_NOTDIRTY; } else { - te->addr_write = address; + tn.addr_write = address; } - } else { - te->addr_write = -1; } + + /* Pairs with flag setting in tlb_reset_dirty_range */ + copy_tlb_helper(te, &tn, true); + /* atomic_mb_set(&te->addr_write, write_address); */ } /* Add a new TLB entry, but without specifying the memory @@ -452,6 +734,18 @@ static void report_bad_exec(CPUState *cpu, target_ulong addr) log_cpu_state_mask(LOG_GUEST_ERROR, cpu, CPU_DUMP_FPU | CPU_DUMP_CCOP); } +static inline ram_addr_t qemu_ram_addr_from_host_nofail(void *ptr) +{ + ram_addr_t ram_addr; + + ram_addr = qemu_ram_addr_from_host(ptr); + if (ram_addr == RAM_ADDR_INVALID) { + error_report("Bad ram pointer %p", ptr); + abort(); + } + return ram_addr; +} + /* NOTE: this function can trigger an exception */ /* NOTE2: the returned address is not exactly the physical address: it * is actually a ram_addr_t (in system mode; the user mode emulation @@ -495,6 +789,7 @@ static uint64_t io_readx(CPUArchState *env, CPUIOTLBEntry *iotlbentry, hwaddr physaddr = iotlbentry->addr; MemoryRegion *mr = iotlb_to_region(cpu, physaddr, iotlbentry->attrs); uint64_t val; + bool locked = false; physaddr = (physaddr & TARGET_PAGE_MASK) + addr; cpu->mem_io_pc = retaddr; @@ -503,7 +798,16 @@ static uint64_t io_readx(CPUArchState *env, CPUIOTLBEntry *iotlbentry, } cpu->mem_io_vaddr = addr; + + if (mr->global_locking) { + qemu_mutex_lock_iothread(); + locked = true; + } memory_region_dispatch_read(mr, physaddr, &val, size, iotlbentry->attrs); + if (locked) { + qemu_mutex_unlock_iothread(); + } + return val; } @@ -514,15 +818,23 @@ static void io_writex(CPUArchState *env, CPUIOTLBEntry *iotlbentry, CPUState *cpu = ENV_GET_CPU(env); hwaddr physaddr = iotlbentry->addr; MemoryRegion *mr = iotlb_to_region(cpu, physaddr, iotlbentry->attrs); + bool locked = false; physaddr = (physaddr & TARGET_PAGE_MASK) + addr; if (mr != &io_mem_rom && mr != &io_mem_notdirty && !cpu->can_do_io) { cpu_io_recompile(cpu, retaddr); } - cpu->mem_io_vaddr = addr; cpu->mem_io_pc = retaddr; + + if (mr->global_locking) { + qemu_mutex_lock_iothread(); + locked = true; + } memory_region_dispatch_write(mr, physaddr, val, size, iotlbentry->attrs); + if (locked) { + qemu_mutex_unlock_iothread(); + } } /* Return true if ADDR is present in the victim tlb, and has been copied @@ -538,10 +850,13 @@ static bool victim_tlb_hit(CPUArchState *env, size_t mmu_idx, size_t index, if (cmp == page) { /* Found entry in victim tlb, swap tlb and iotlb. */ CPUTLBEntry tmptlb, *tlb = &env->tlb_table[mmu_idx][index]; + + copy_tlb_helper(&tmptlb, tlb, false); + copy_tlb_helper(tlb, vtlb, true); + copy_tlb_helper(vtlb, &tmptlb, true); + CPUIOTLBEntry tmpio, *io = &env->iotlb[mmu_idx][index]; CPUIOTLBEntry *vio = &env->iotlb_v[mmu_idx][vidx]; - - tmptlb = *tlb; *tlb = *vtlb; *vtlb = tmptlb; tmpio = *io; *io = *vio; *vio = tmpio; return true; } diff --git a/docs/multi-thread-tcg.txt b/docs/multi-thread-tcg.txt new file mode 100644 index 0000000000..a99b4564c6 --- /dev/null +++ b/docs/multi-thread-tcg.txt @@ -0,0 +1,350 @@ +Copyright (c) 2015-2016 Linaro Ltd. + +This work is licensed under the terms of the GNU GPL, version 2 or +later. See the COPYING file in the top-level directory. + +Introduction +============ + +This document outlines the design for multi-threaded TCG system-mode +emulation. The current user-mode emulation mirrors the thread +structure of the translated executable. Some of the work will be +applicable to both system and linux-user emulation. + +The original system-mode TCG implementation was single threaded and +dealt with multiple CPUs with simple round-robin scheduling. This +simplified a lot of things but became increasingly limited as systems +being emulated gained additional cores and per-core performance gains +for host systems started to level off. + +vCPU Scheduling +=============== + +We introduce a new running mode where each vCPU will run on its own +user-space thread. This will be enabled by default for all FE/BE +combinations that have had the required work done to support this +safely. + +In the general case of running translated code there should be no +inter-vCPU dependencies and all vCPUs should be able to run at full +speed. Synchronisation will only be required while accessing internal +shared data structures or when the emulated architecture requires a +coherent representation of the emulated machine state. + +Shared Data Structures +====================== + +Main Run Loop +------------- + +Even when there is no code being generated there are a number of +structures associated with the hot-path through the main run-loop. +These are associated with looking up the next translation block to +execute. These include: + + tb_jmp_cache (per-vCPU, cache of recent jumps) + tb_ctx.htable (global hash table, phys address->tb lookup) + +As TB linking only occurs when blocks are in the same page this code +is critical to performance as looking up the next TB to execute is the +most common reason to exit the generated code. + +DESIGN REQUIREMENT: Make access to lookup structures safe with +multiple reader/writer threads. Minimise any lock contention to do it. + +The hot-path avoids using locks where possible. The tb_jmp_cache is +updated with atomic accesses to ensure consistent results. The fall +back QHT based hash table is also designed for lockless lookups. Locks +are only taken when code generation is required or TranslationBlocks +have their block-to-block jumps patched. + +Global TCG State +---------------- + +We need to protect the entire code generation cycle including any post +generation patching of the translated code. This also implies a shared +translation buffer which contains code running on all cores. Any +execution path that comes to the main run loop will need to hold a +mutex for code generation. This also includes times when we need flush +code or entries from any shared lookups/caches. Structures held on a +per-vCPU basis won't need locking unless other vCPUs will need to +modify them. + +DESIGN REQUIREMENT: Add locking around all code generation and TB +patching. + +(Current solution) + +Mainly as part of the linux-user work all code generation is +serialised with a tb_lock(). For the SoftMMU tb_lock() also takes the +place of mmap_lock() in linux-user. + +Translation Blocks +------------------ + +Currently the whole system shares a single code generation buffer +which when full will force a flush of all translations and start from +scratch again. Some operations also force a full flush of translations +including: + + - debugging operations (breakpoint insertion/removal) + - some CPU helper functions + +This is done with the async_safe_run_on_cpu() mechanism to ensure all +vCPUs are quiescent when changes are being made to shared global +structures. + +More granular translation invalidation events are typically due +to a change of the state of a physical page: + + - code modification (self modify code, patching code) + - page changes (new page mapping in linux-user mode) + +While setting the invalid flag in a TranslationBlock will stop it +being used when looked up in the hot-path there are a number of other +book-keeping structures that need to be safely cleared. + +Any TranslationBlocks which have been patched to jump directly to the +now invalid blocks need the jump patches reversing so they will return +to the C code. + +There are a number of look-up caches that need to be properly updated +including the: + + - jump lookup cache + - the physical-to-tb lookup hash table + - the global page table + +The global page table (l1_map) which provides a multi-level look-up +for PageDesc structures which contain pointers to the start of a +linked list of all Translation Blocks in that page (see page_next). + +Both the jump patching and the page cache involve linked lists that +the invalidated TranslationBlock needs to be removed from. + +DESIGN REQUIREMENT: Safely handle invalidation of TBs + - safely patch/revert direct jumps + - remove central PageDesc lookup entries + - ensure lookup caches/hashes are safely updated + +(Current solution) + +The direct jump themselves are updated atomically by the TCG +tb_set_jmp_target() code. Modification to the linked lists that allow +searching for linked pages are done under the protect of the +tb_lock(). + +The global page table is protected by the tb_lock() in system-mode and +mmap_lock() in linux-user mode. + +The lookup caches are updated atomically and the lookup hash uses QHT +which is designed for concurrent safe lookup. + + +Memory maps and TLBs +-------------------- + +The memory handling code is fairly critical to the speed of memory +access in the emulated system. The SoftMMU code is designed so the +hot-path can be handled entirely within translated code. This is +handled with a per-vCPU TLB structure which once populated will allow +a series of accesses to the page to occur without exiting the +translated code. It is possible to set flags in the TLB address which +will ensure the slow-path is taken for each access. This can be done +to support: + + - Memory regions (dividing up access to PIO, MMIO and RAM) + - Dirty page tracking (for code gen, SMC detection, migration and display) + - Virtual TLB (for translating guest address->real address) + +When the TLB tables are updated by a vCPU thread other than their own +we need to ensure it is done in a safe way so no inconsistent state is +seen by the vCPU thread. + +Some operations require updating a number of vCPUs TLBs at the same +time in a synchronised manner. + +DESIGN REQUIREMENTS: + + - TLB Flush All/Page + - can be across-vCPUs + - cross vCPU TLB flush may need other vCPU brought to halt + - change may need to be visible to the calling vCPU immediately + - TLB Flag Update + - usually cross-vCPU + - want change to be visible as soon as possible + - TLB Update (update a CPUTLBEntry, via tlb_set_page_with_attrs) + - This is a per-vCPU table - by definition can't race + - updated by its own thread when the slow-path is forced + +(Current solution) + +We have updated cputlb.c to defer operations when a cross-vCPU +operation with async_run_on_cpu() which ensures each vCPU sees a +coherent state when it next runs its work (in a few instructions +time). + +A new set up operations (tlb_flush_*_all_cpus) take an additional flag +which when set will force synchronisation by setting the source vCPUs +work as "safe work" and exiting the cpu run loop. This ensure by the +time execution restarts all flush operations have completed. + +TLB flag updates are all done atomically and are also protected by the +tb_lock() which is used by the functions that update the TLB in bulk. + +(Known limitation) + +Not really a limitation but the wait mechanism is overly strict for +some architectures which only need flushes completed by a barrier +instruction. This could be a future optimisation. + +Emulated hardware state +----------------------- + +Currently thanks to KVM work any access to IO memory is automatically +protected by the global iothread mutex, also known as the BQL (Big +Qemu Lock). Any IO region that doesn't use global mutex is expected to +do its own locking. + +However IO memory isn't the only way emulated hardware state can be +modified. Some architectures have model specific registers that +trigger hardware emulation features. Generally any translation helper +that needs to update more than a single vCPUs of state should take the +BQL. + +As the BQL, or global iothread mutex is shared across the system we +push the use of the lock as far down into the TCG code as possible to +minimise contention. + +(Current solution) + +MMIO access automatically serialises hardware emulation by way of the +BQL. Currently ARM targets serialise all ARM_CP_IO register accesses +and also defer the reset/startup of vCPUs to the vCPU context by way +of async_run_on_cpu(). + +Updates to interrupt state are also protected by the BQL as they can +often be cross vCPU. + +Memory Consistency +================== + +Between emulated guests and host systems there are a range of memory +consistency models. Even emulating weakly ordered systems on strongly +ordered hosts needs to ensure things like store-after-load re-ordering +can be prevented when the guest wants to. + +Memory Barriers +--------------- + +Barriers (sometimes known as fences) provide a mechanism for software +to enforce a particular ordering of memory operations from the point +of view of external observers (e.g. another processor core). They can +apply to any memory operations as well as just loads or stores. + +The Linux kernel has an excellent write-up on the various forms of +memory barrier and the guarantees they can provide [1]. + +Barriers are often wrapped around synchronisation primitives to +provide explicit memory ordering semantics. However they can be used +by themselves to provide safe lockless access by ensuring for example +a change to a signal flag will only be visible once the changes to +payload are. + +DESIGN REQUIREMENT: Add a new tcg_memory_barrier op + +This would enforce a strong load/store ordering so all loads/stores +complete at the memory barrier. On single-core non-SMP strongly +ordered backends this could become a NOP. + +Aside from explicit standalone memory barrier instructions there are +also implicit memory ordering semantics which comes with each guest +memory access instruction. For example all x86 load/stores come with +fairly strong guarantees of sequential consistency where as ARM has +special variants of load/store instructions that imply acquire/release +semantics. + +In the case of a strongly ordered guest architecture being emulated on +a weakly ordered host the scope for a heavy performance impact is +quite high. + +DESIGN REQUIREMENTS: Be efficient with use of memory barriers + - host systems with stronger implied guarantees can skip some barriers + - merge consecutive barriers to the strongest one + +(Current solution) + +The system currently has a tcg_gen_mb() which will add memory barrier +operations if code generation is being done in a parallel context. The +tcg_optimize() function attempts to merge barriers up to their +strongest form before any load/store operations. The solution was +originally developed and tested for linux-user based systems. All +backends have been converted to emit fences when required. So far the +following front-ends have been updated to emit fences when required: + + - target-i386 + - target-arm + - target-aarch64 + - target-alpha + - target-mips + +Memory Control and Maintenance +------------------------------ + +This includes a class of instructions for controlling system cache +behaviour. While QEMU doesn't model cache behaviour these instructions +are often seen when code modification has taken place to ensure the +changes take effect. + +Synchronisation Primitives +-------------------------- + +There are two broad types of synchronisation primitives found in +modern ISAs: atomic instructions and exclusive regions. + +The first type offer a simple atomic instruction which will guarantee +some sort of test and conditional store will be truly atomic w.r.t. +other cores sharing access to the memory. The classic example is the +x86 cmpxchg instruction. + +The second type offer a pair of load/store instructions which offer a +guarantee that an region of memory has not been touched between the +load and store instructions. An example of this is ARM's ldrex/strex +pair where the strex instruction will return a flag indicating a +successful store only if no other CPU has accessed the memory region +since the ldrex. + +Traditionally TCG has generated a series of operations that work +because they are within the context of a single translation block so +will have completed before another CPU is scheduled. However with +the ability to have multiple threads running to emulate multiple CPUs +we will need to explicitly expose these semantics. + +DESIGN REQUIREMENTS: + - Support classic atomic instructions + - Support load/store exclusive (or load link/store conditional) pairs + - Generic enough infrastructure to support all guest architectures +CURRENT OPEN QUESTIONS: + - How problematic is the ABA problem in general? + +(Current solution) + +The TCG provides a number of atomic helpers (tcg_gen_atomic_*) which +can be used directly or combined to emulate other instructions like +ARM's ldrex/strex instructions. While they are susceptible to the ABA +problem so far common guests have not implemented patterns where +this may be a problem - typically presenting a locking ABI which +assumes cmpxchg like semantics. + +The code also includes a fall-back for cases where multi-threaded TCG +ops can't work (e.g. guest atomic width > host atomic width). In this +case an EXCP_ATOMIC exit occurs and the instruction is emulated with +an exclusive lock which ensures all emulation is serialised. + +While the atomic helpers look good enough for now there may be a need +to look at solutions that can more closely model the guest +architectures semantics. + +========== + +[1] https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/plain/Documentation/memory-barriers.txt @@ -2134,9 +2134,9 @@ static void check_watchpoint(int offset, int len, MemTxAttrs attrs, int flags) } cpu->watchpoint_hit = wp; - /* The tb_lock will be reset when cpu_loop_exit or - * cpu_loop_exit_noexc longjmp back into the cpu_exec - * main loop. + /* Both tb_lock and iothread_mutex will be reset when + * cpu_loop_exit or cpu_loop_exit_noexc longjmp + * back into the cpu_exec main loop. */ tb_lock(); tb_check_watchpoint(cpu); @@ -2371,8 +2371,14 @@ static void io_mem_init(void) memory_region_init_io(&io_mem_rom, NULL, &unassigned_mem_ops, NULL, NULL, UINT64_MAX); memory_region_init_io(&io_mem_unassigned, NULL, &unassigned_mem_ops, NULL, NULL, UINT64_MAX); + + /* io_mem_notdirty calls tb_invalidate_phys_page_fast, + * which can be called without the iothread mutex. + */ memory_region_init_io(&io_mem_notdirty, NULL, ¬dirty_mem_ops, NULL, NULL, UINT64_MAX); + memory_region_clear_global_locking(&io_mem_notdirty); + memory_region_init_io(&io_mem_watch, NULL, &watch_mem_ops, NULL, NULL, UINT64_MAX); } diff --git a/hw/core/irq.c b/hw/core/irq.c index 49ff2e64fe..b98d1d69f5 100644 --- a/hw/core/irq.c +++ b/hw/core/irq.c @@ -22,6 +22,7 @@ * THE SOFTWARE. */ #include "qemu/osdep.h" +#include "qemu/main-loop.h" #include "qemu-common.h" #include "hw/irq.h" #include "qom/object.h" diff --git a/hw/i386/kvmvapic.c b/hw/i386/kvmvapic.c index 7135633863..82a49556af 100644 --- a/hw/i386/kvmvapic.c +++ b/hw/i386/kvmvapic.c @@ -457,8 +457,8 @@ static void patch_instruction(VAPICROMState *s, X86CPU *cpu, target_ulong ip) resume_all_vcpus(); if (!kvm_enabled()) { - /* tb_lock will be reset when cpu_loop_exit_noexc longjmps - * back into the cpu_exec loop. */ + /* Both tb_lock and iothread_mutex will be reset when + * longjmps back into the cpu_exec loop. */ tb_lock(); tb_gen_code(cs, current_pc, current_cs_base, current_flags, 1); cpu_loop_exit_noexc(cs); diff --git a/hw/intc/arm_gicv3_cpuif.c b/hw/intc/arm_gicv3_cpuif.c index c25ee03556..f775aba507 100644 --- a/hw/intc/arm_gicv3_cpuif.c +++ b/hw/intc/arm_gicv3_cpuif.c @@ -14,6 +14,7 @@ #include "qemu/osdep.h" #include "qemu/bitops.h" +#include "qemu/main-loop.h" #include "trace.h" #include "gicv3_internal.h" #include "cpu.h" @@ -733,6 +734,8 @@ void gicv3_cpuif_update(GICv3CPUState *cs) ARMCPU *cpu = ARM_CPU(cs->cpu); CPUARMState *env = &cpu->env; + g_assert(qemu_mutex_iothread_locked()); + trace_gicv3_cpuif_update(gicv3_redist_affid(cs), cs->hppi.irq, cs->hppi.grp, cs->hppi.prio); diff --git a/hw/misc/imx6_src.c b/hw/misc/imx6_src.c index 55b817b8d7..edbb756c36 100644 --- a/hw/misc/imx6_src.c +++ b/hw/misc/imx6_src.c @@ -14,6 +14,7 @@ #include "qemu/bitops.h" #include "qemu/log.h" #include "arm-powerctl.h" +#include "qom/cpu.h" #ifndef DEBUG_IMX6_SRC #define DEBUG_IMX6_SRC 0 @@ -113,6 +114,45 @@ static uint64_t imx6_src_read(void *opaque, hwaddr offset, unsigned size) return value; } + +/* The reset is asynchronous so we need to defer clearing the reset + * bit until the work is completed. + */ + +struct SRCSCRResetInfo { + IMX6SRCState *s; + int reset_bit; +}; + +static void imx6_clear_reset_bit(CPUState *cpu, run_on_cpu_data data) +{ + struct SRCSCRResetInfo *ri = data.host_ptr; + IMX6SRCState *s = ri->s; + + assert(qemu_mutex_iothread_locked()); + + s->regs[SRC_SCR] = deposit32(s->regs[SRC_SCR], ri->reset_bit, 1, 0); + DPRINTF("reg[%s] <= 0x%" PRIx32 "\n", + imx6_src_reg_name(SRC_SCR), s->regs[SRC_SCR]); + + g_free(ri); +} + +static void imx6_defer_clear_reset_bit(int cpuid, + IMX6SRCState *s, + unsigned long reset_shift) +{ + struct SRCSCRResetInfo *ri; + + ri = g_malloc(sizeof(struct SRCSCRResetInfo)); + ri->s = s; + ri->reset_bit = reset_shift; + + async_run_on_cpu(arm_get_cpu_by_id(cpuid), imx6_clear_reset_bit, + RUN_ON_CPU_HOST_PTR(ri)); +} + + static void imx6_src_write(void *opaque, hwaddr offset, uint64_t value, unsigned size) { @@ -153,7 +193,7 @@ static void imx6_src_write(void *opaque, hwaddr offset, uint64_t value, arm_set_cpu_off(3); } /* We clear the reset bits as the processor changed state */ - clear_bit(CORE3_RST_SHIFT, ¤t_value); + imx6_defer_clear_reset_bit(3, s, CORE3_RST_SHIFT); clear_bit(CORE3_RST_SHIFT, &change_mask); } if (EXTRACT(change_mask, CORE2_ENABLE)) { @@ -162,11 +202,11 @@ static void imx6_src_write(void *opaque, hwaddr offset, uint64_t value, arm_set_cpu_on(2, s->regs[SRC_GPR5], s->regs[SRC_GPR6], 3, false); } else { - /* CORE 3 is shut down */ + /* CORE 2 is shut down */ arm_set_cpu_off(2); } /* We clear the reset bits as the processor changed state */ - clear_bit(CORE2_RST_SHIFT, ¤t_value); + imx6_defer_clear_reset_bit(2, s, CORE2_RST_SHIFT); clear_bit(CORE2_RST_SHIFT, &change_mask); } if (EXTRACT(change_mask, CORE1_ENABLE)) { @@ -175,28 +215,28 @@ static void imx6_src_write(void *opaque, hwaddr offset, uint64_t value, arm_set_cpu_on(1, s->regs[SRC_GPR3], s->regs[SRC_GPR4], 3, false); } else { - /* CORE 3 is shut down */ + /* CORE 1 is shut down */ arm_set_cpu_off(1); } /* We clear the reset bits as the processor changed state */ - clear_bit(CORE1_RST_SHIFT, ¤t_value); + imx6_defer_clear_reset_bit(1, s, CORE1_RST_SHIFT); clear_bit(CORE1_RST_SHIFT, &change_mask); } if (EXTRACT(change_mask, CORE0_RST)) { arm_reset_cpu(0); - clear_bit(CORE0_RST_SHIFT, ¤t_value); + imx6_defer_clear_reset_bit(0, s, CORE0_RST_SHIFT); } if (EXTRACT(change_mask, CORE1_RST)) { arm_reset_cpu(1); - clear_bit(CORE1_RST_SHIFT, ¤t_value); + imx6_defer_clear_reset_bit(1, s, CORE1_RST_SHIFT); } if (EXTRACT(change_mask, CORE2_RST)) { arm_reset_cpu(2); - clear_bit(CORE2_RST_SHIFT, ¤t_value); + imx6_defer_clear_reset_bit(2, s, CORE2_RST_SHIFT); } if (EXTRACT(change_mask, CORE3_RST)) { arm_reset_cpu(3); - clear_bit(CORE3_RST_SHIFT, ¤t_value); + imx6_defer_clear_reset_bit(3, s, CORE3_RST_SHIFT); } if (EXTRACT(change_mask, SW_IPU2_RST)) { /* We pretend the IPU2 is reset */ diff --git a/hw/ppc/ppc.c b/hw/ppc/ppc.c index d171e60b5c..5f93083d4a 100644 --- a/hw/ppc/ppc.c +++ b/hw/ppc/ppc.c @@ -62,7 +62,16 @@ void ppc_set_irq(PowerPCCPU *cpu, int n_IRQ, int level) { CPUState *cs = CPU(cpu); CPUPPCState *env = &cpu->env; - unsigned int old_pending = env->pending_interrupts; + unsigned int old_pending; + bool locked = false; + + /* We may already have the BQL if coming from the reset path */ + if (!qemu_mutex_iothread_locked()) { + locked = true; + qemu_mutex_lock_iothread(); + } + + old_pending = env->pending_interrupts; if (level) { env->pending_interrupts |= 1 << n_IRQ; @@ -80,9 +89,14 @@ void ppc_set_irq(PowerPCCPU *cpu, int n_IRQ, int level) #endif } + LOG_IRQ("%s: %p n_IRQ %d level %d => pending %08" PRIx32 "req %08x\n", __func__, env, n_IRQ, level, env->pending_interrupts, CPU(cpu)->interrupt_request); + + if (locked) { + qemu_mutex_unlock_iothread(); + } } /* PowerPC 6xx / 7xx internal IRQ controller */ diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index 5904e6498f..87d8366c44 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -1010,6 +1010,9 @@ static void emulate_spapr_hypercall(PPCVirtualHypervisor *vhyp, { CPUPPCState *env = &cpu->env; + /* The TCG path should also be holding the BQL at this point */ + g_assert(qemu_mutex_iothread_locked()); + if (msr_pr) { hcall_dprintf("Hypercall made with MSR[PR]=1\n"); env->gpr[3] = H_PRIVILEGE; diff --git a/include/exec/cputlb.h b/include/exec/cputlb.h index d454c005b7..3f941783c5 100644 --- a/include/exec/cputlb.h +++ b/include/exec/cputlb.h @@ -23,8 +23,6 @@ /* cputlb.c */ void tlb_protect_code(ram_addr_t ram_addr); void tlb_unprotect_code(ram_addr_t ram_addr); -void tlb_reset_dirty_range(CPUTLBEntry *tlb_entry, uintptr_t start, - uintptr_t length); extern int tlb_flush_count; #endif diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h index 21ab7bf3fd..bcde1e6a14 100644 --- a/include/exec/exec-all.h +++ b/include/exec/exec-all.h @@ -93,6 +93,27 @@ void cpu_address_space_init(CPUState *cpu, AddressSpace *as, int asidx); */ void tlb_flush_page(CPUState *cpu, target_ulong addr); /** + * tlb_flush_page_all_cpus: + * @cpu: src CPU of the flush + * @addr: virtual address of page to be flushed + * + * Flush one page from the TLB of the specified CPU, for all + * MMU indexes. + */ +void tlb_flush_page_all_cpus(CPUState *src, target_ulong addr); +/** + * tlb_flush_page_all_cpus_synced: + * @cpu: src CPU of the flush + * @addr: virtual address of page to be flushed + * + * Flush one page from the TLB of the specified CPU, for all MMU + * indexes like tlb_flush_page_all_cpus except the source vCPUs work + * is scheduled as safe work meaning all flushes will be complete once + * the source vCPUs safe work is complete. This will depend on when + * the guests translation ends the TB. + */ +void tlb_flush_page_all_cpus_synced(CPUState *src, target_ulong addr); +/** * tlb_flush: * @cpu: CPU whose TLB should be flushed * @@ -103,24 +124,87 @@ void tlb_flush_page(CPUState *cpu, target_ulong addr); */ void tlb_flush(CPUState *cpu); /** + * tlb_flush_all_cpus: + * @cpu: src CPU of the flush + */ +void tlb_flush_all_cpus(CPUState *src_cpu); +/** + * tlb_flush_all_cpus_synced: + * @cpu: src CPU of the flush + * + * Like tlb_flush_all_cpus except this except the source vCPUs work is + * scheduled as safe work meaning all flushes will be complete once + * the source vCPUs safe work is complete. This will depend on when + * the guests translation ends the TB. + */ +void tlb_flush_all_cpus_synced(CPUState *src_cpu); +/** * tlb_flush_page_by_mmuidx: * @cpu: CPU whose TLB should be flushed * @addr: virtual address of page to be flushed - * @...: list of MMU indexes to flush, terminated by a negative value + * @idxmap: bitmap of MMU indexes to flush * * Flush one page from the TLB of the specified CPU, for the specified * MMU indexes. */ -void tlb_flush_page_by_mmuidx(CPUState *cpu, target_ulong addr, ...); +void tlb_flush_page_by_mmuidx(CPUState *cpu, target_ulong addr, + uint16_t idxmap); +/** + * tlb_flush_page_by_mmuidx_all_cpus: + * @cpu: Originating CPU of the flush + * @addr: virtual address of page to be flushed + * @idxmap: bitmap of MMU indexes to flush + * + * Flush one page from the TLB of all CPUs, for the specified + * MMU indexes. + */ +void tlb_flush_page_by_mmuidx_all_cpus(CPUState *cpu, target_ulong addr, + uint16_t idxmap); +/** + * tlb_flush_page_by_mmuidx_all_cpus_synced: + * @cpu: Originating CPU of the flush + * @addr: virtual address of page to be flushed + * @idxmap: bitmap of MMU indexes to flush + * + * Flush one page from the TLB of all CPUs, for the specified MMU + * indexes like tlb_flush_page_by_mmuidx_all_cpus except the source + * vCPUs work is scheduled as safe work meaning all flushes will be + * complete once the source vCPUs safe work is complete. This will + * depend on when the guests translation ends the TB. + */ +void tlb_flush_page_by_mmuidx_all_cpus_synced(CPUState *cpu, target_ulong addr, + uint16_t idxmap); /** * tlb_flush_by_mmuidx: * @cpu: CPU whose TLB should be flushed - * @...: list of MMU indexes to flush, terminated by a negative value + * @wait: If true ensure synchronisation by exiting the cpu_loop + * @idxmap: bitmap of MMU indexes to flush * * Flush all entries from the TLB of the specified CPU, for the specified * MMU indexes. */ -void tlb_flush_by_mmuidx(CPUState *cpu, ...); +void tlb_flush_by_mmuidx(CPUState *cpu, uint16_t idxmap); +/** + * tlb_flush_by_mmuidx_all_cpus: + * @cpu: Originating CPU of the flush + * @idxmap: bitmap of MMU indexes to flush + * + * Flush all entries from all TLBs of all CPUs, for the specified + * MMU indexes. + */ +void tlb_flush_by_mmuidx_all_cpus(CPUState *cpu, uint16_t idxmap); +/** + * tlb_flush_by_mmuidx_all_cpus_synced: + * @cpu: Originating CPU of the flush + * @idxmap: bitmap of MMU indexes to flush + * + * Flush all entries from all TLBs of all CPUs, for the specified + * MMU indexes like tlb_flush_by_mmuidx_all_cpus except except the source + * vCPUs work is scheduled as safe work meaning all flushes will be + * complete once the source vCPUs safe work is complete. This will + * depend on when the guests translation ends the TB. + */ +void tlb_flush_by_mmuidx_all_cpus_synced(CPUState *cpu, uint16_t idxmap); /** * tlb_set_page_with_attrs: * @cpu: CPU to add this TLB entry for @@ -162,17 +246,45 @@ void probe_write(CPUArchState *env, target_ulong addr, int mmu_idx, static inline void tlb_flush_page(CPUState *cpu, target_ulong addr) { } - +static inline void tlb_flush_page_all_cpus(CPUState *src, target_ulong addr) +{ +} +static inline void tlb_flush_page_all_cpus_synced(CPUState *src, + target_ulong addr) +{ +} static inline void tlb_flush(CPUState *cpu) { } - +static inline void tlb_flush_all_cpus(CPUState *src_cpu) +{ +} +static inline void tlb_flush_all_cpus_synced(CPUState *src_cpu) +{ +} static inline void tlb_flush_page_by_mmuidx(CPUState *cpu, - target_ulong addr, ...) + target_ulong addr, uint16_t idxmap) { } -static inline void tlb_flush_by_mmuidx(CPUState *cpu, ...) +static inline void tlb_flush_by_mmuidx(CPUState *cpu, uint16_t idxmap) +{ +} +static inline void tlb_flush_page_by_mmuidx_all_cpus(CPUState *cpu, + target_ulong addr, + uint16_t idxmap) +{ +} +static inline void tlb_flush_page_by_mmuidx_all_cpus_synced(CPUState *cpu, + target_ulong addr, + uint16_t idxmap) +{ +} +static inline void tlb_flush_by_mmuidx_all_cpus(CPUState *cpu, uint16_t idxmap) +{ +} +static inline void tlb_flush_by_mmuidx_all_cpus_synced(CPUState *cpu, + uint16_t idxmap) { } #endif @@ -404,8 +516,4 @@ bool memory_region_is_unassigned(MemoryRegion *mr); /* vl.c */ extern int singlestep; -/* cpu-exec.c, accessed with atomic_mb_read/atomic_mb_set */ -extern CPUState *tcg_current_cpu; -extern bool exit_request; - #endif diff --git a/include/qom/cpu.h b/include/qom/cpu.h index f69b2407ea..3e61c880da 100644 --- a/include/qom/cpu.h +++ b/include/qom/cpu.h @@ -329,6 +329,7 @@ struct CPUState { bool unplug; bool crash_occurred; bool exit_request; + /* updates protected by BQL */ uint32_t interrupt_request; int singlestep_enabled; int64_t icount_extra; @@ -401,6 +402,12 @@ struct CPUState { bool hax_vcpu_dirty; struct hax_vcpu_state *hax_vcpu; + + /* The pending_tlb_flush flag is set and cleared atomically to + * avoid potential races. The aim of the flag is to avoid + * unnecessary flushes. + */ + uint16_t pending_tlb_flush; }; QTAILQ_HEAD(CPUTailQ, CPUState); @@ -416,6 +423,15 @@ extern struct CPUTailQ cpus; extern __thread CPUState *current_cpu; /** + * qemu_tcg_mttcg_enabled: + * Check whether we are running MultiThread TCG or not. + * + * Returns: %true if we are in MTTCG mode %false otherwise. + */ +extern bool mttcg_enabled; +#define qemu_tcg_mttcg_enabled() (mttcg_enabled) + +/** * cpu_paging_enabled: * @cpu: The CPU whose state is to be inspected. * diff --git a/include/sysemu/cpus.h b/include/sysemu/cpus.h index 3728a1ea7e..a73b5d4bce 100644 --- a/include/sysemu/cpus.h +++ b/include/sysemu/cpus.h @@ -36,4 +36,6 @@ extern int smp_threads; void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg); +void qemu_tcg_configure(QemuOpts *opts, Error **errp); + #endif @@ -917,6 +917,8 @@ void memory_region_transaction_commit(void) AddressSpace *as; assert(memory_region_transaction_depth); + assert(qemu_mutex_iothread_locked()); + --memory_region_transaction_depth; if (!memory_region_transaction_depth) { if (memory_region_update_pending) { diff --git a/qemu-options.hx b/qemu-options.hx index 9936cf38f3..bf458f83c3 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -95,6 +95,26 @@ STEXI Select CPU model (@code{-cpu help} for list and additional feature selection) ETEXI +DEF("accel", HAS_ARG, QEMU_OPTION_accel, + "-accel [accel=]accelerator[,thread=single|multi]\n" + " select accelerator ('-accel help for list')\n" + " thread=single|multi (enable multi-threaded TCG)", QEMU_ARCH_ALL) +STEXI +@item -accel @var{name}[,prop=@var{value}[,...]] +@findex -accel +This is used to enable an accelerator. Depending on the target architecture, +kvm, xen, or tcg can be available. By default, tcg is used. If there is more +than one accelerator specified, the next one is used if the previous one fails +to initialize. +@table @option +@item thread=single|multi +Controls number of TCG threads. When the TCG is multi-threaded there will be one +thread per vCPU therefor taking advantage of additional host cores. The default +is to enable multi-threading where both the back-end and front-ends support it and +no incompatible TCG features have been enabled (e.g. icount/replay). +@end table +ETEXI + DEF("smp", HAS_ARG, QEMU_OPTION_smp, "-smp [cpus=]n[,maxcpus=cpus][,cores=cores][,threads=threads][,sockets=sockets]\n" " set the number of CPUs to 'n' [default=1]\n" @@ -113,9 +113,19 @@ static void cpu_common_get_memory_mapping(CPUState *cpu, error_setg(errp, "Obtaining memory mappings is unsupported on this CPU."); } +/* Resetting the IRQ comes from across the code base so we take the + * BQL here if we need to. cpu_interrupt assumes it is held.*/ void cpu_reset_interrupt(CPUState *cpu, int mask) { + bool need_lock = !qemu_mutex_iothread_locked(); + + if (need_lock) { + qemu_mutex_lock_iothread(); + } cpu->interrupt_request &= ~mask; + if (need_lock) { + qemu_mutex_unlock_iothread(); + } } void cpu_exit(CPUState *cpu) diff --git a/target/arm/arm-powerctl.c b/target/arm/arm-powerctl.c index fbb7a15daa..25207cb850 100644 --- a/target/arm/arm-powerctl.c +++ b/target/arm/arm-powerctl.c @@ -14,6 +14,7 @@ #include "internals.h" #include "arm-powerctl.h" #include "qemu/log.h" +#include "qemu/main-loop.h" #include "exec/exec-all.h" #ifndef DEBUG_ARM_POWERCTL @@ -48,11 +49,93 @@ CPUState *arm_get_cpu_by_id(uint64_t id) return NULL; } +struct CpuOnInfo { + uint64_t entry; + uint64_t context_id; + uint32_t target_el; + bool target_aa64; +}; + + +static void arm_set_cpu_on_async_work(CPUState *target_cpu_state, + run_on_cpu_data data) +{ + ARMCPU *target_cpu = ARM_CPU(target_cpu_state); + struct CpuOnInfo *info = (struct CpuOnInfo *) data.host_ptr; + + /* Initialize the cpu we are turning on */ + cpu_reset(target_cpu_state); + target_cpu_state->halted = 0; + + if (info->target_aa64) { + if ((info->target_el < 3) && arm_feature(&target_cpu->env, + ARM_FEATURE_EL3)) { + /* + * As target mode is AArch64, we need to set lower + * exception level (the requested level 2) to AArch64 + */ + target_cpu->env.cp15.scr_el3 |= SCR_RW; + } + + if ((info->target_el < 2) && arm_feature(&target_cpu->env, + ARM_FEATURE_EL2)) { + /* + * As target mode is AArch64, we need to set lower + * exception level (the requested level 1) to AArch64 + */ + target_cpu->env.cp15.hcr_el2 |= HCR_RW; + } + + target_cpu->env.pstate = aarch64_pstate_mode(info->target_el, true); + } else { + /* We are requested to boot in AArch32 mode */ + static const uint32_t mode_for_el[] = { 0, + ARM_CPU_MODE_SVC, + ARM_CPU_MODE_HYP, + ARM_CPU_MODE_SVC }; + + cpsr_write(&target_cpu->env, mode_for_el[info->target_el], CPSR_M, + CPSRWriteRaw); + } + + if (info->target_el == 3) { + /* Processor is in secure mode */ + target_cpu->env.cp15.scr_el3 &= ~SCR_NS; + } else { + /* Processor is not in secure mode */ + target_cpu->env.cp15.scr_el3 |= SCR_NS; + } + + /* We check if the started CPU is now at the correct level */ + assert(info->target_el == arm_current_el(&target_cpu->env)); + + if (info->target_aa64) { + target_cpu->env.xregs[0] = info->context_id; + target_cpu->env.thumb = false; + } else { + target_cpu->env.regs[0] = info->context_id; + target_cpu->env.thumb = info->entry & 1; + info->entry &= 0xfffffffe; + } + + /* Start the new CPU at the requested address */ + cpu_set_pc(target_cpu_state, info->entry); + + g_free(info); + + /* Finally set the power status */ + assert(qemu_mutex_iothread_locked()); + target_cpu->power_state = PSCI_ON; +} + int arm_set_cpu_on(uint64_t cpuid, uint64_t entry, uint64_t context_id, uint32_t target_el, bool target_aa64) { CPUState *target_cpu_state; ARMCPU *target_cpu; + struct CpuOnInfo *info; + + assert(qemu_mutex_iothread_locked()); DPRINTF("cpu %" PRId64 " (EL %d, %s) @ 0x%" PRIx64 " with R0 = 0x%" PRIx64 "\n", cpuid, target_el, target_aa64 ? "aarch64" : "aarch32", entry, @@ -77,7 +160,7 @@ int arm_set_cpu_on(uint64_t cpuid, uint64_t entry, uint64_t context_id, } target_cpu = ARM_CPU(target_cpu_state); - if (!target_cpu->powered_off) { + if (target_cpu->power_state == PSCI_ON) { qemu_log_mask(LOG_GUEST_ERROR, "[ARM]%s: CPU %" PRId64 " is already on\n", __func__, cpuid); @@ -109,74 +192,54 @@ int arm_set_cpu_on(uint64_t cpuid, uint64_t entry, uint64_t context_id, return QEMU_ARM_POWERCTL_INVALID_PARAM; } - /* Initialize the cpu we are turning on */ - cpu_reset(target_cpu_state); - target_cpu->powered_off = false; - target_cpu_state->halted = 0; - - if (target_aa64) { - if ((target_el < 3) && arm_feature(&target_cpu->env, ARM_FEATURE_EL3)) { - /* - * As target mode is AArch64, we need to set lower - * exception level (the requested level 2) to AArch64 - */ - target_cpu->env.cp15.scr_el3 |= SCR_RW; - } - - if ((target_el < 2) && arm_feature(&target_cpu->env, ARM_FEATURE_EL2)) { - /* - * As target mode is AArch64, we need to set lower - * exception level (the requested level 1) to AArch64 - */ - target_cpu->env.cp15.hcr_el2 |= HCR_RW; - } - - target_cpu->env.pstate = aarch64_pstate_mode(target_el, true); - } else { - /* We are requested to boot in AArch32 mode */ - static uint32_t mode_for_el[] = { 0, - ARM_CPU_MODE_SVC, - ARM_CPU_MODE_HYP, - ARM_CPU_MODE_SVC }; - - cpsr_write(&target_cpu->env, mode_for_el[target_el], CPSR_M, - CPSRWriteRaw); - } - - if (target_el == 3) { - /* Processor is in secure mode */ - target_cpu->env.cp15.scr_el3 &= ~SCR_NS; - } else { - /* Processor is not in secure mode */ - target_cpu->env.cp15.scr_el3 |= SCR_NS; - } - - /* We check if the started CPU is now at the correct level */ - assert(target_el == arm_current_el(&target_cpu->env)); - - if (target_aa64) { - target_cpu->env.xregs[0] = context_id; - target_cpu->env.thumb = false; - } else { - target_cpu->env.regs[0] = context_id; - target_cpu->env.thumb = entry & 1; - entry &= 0xfffffffe; + /* + * If another CPU has powered the target on we are in the state + * ON_PENDING and additional attempts to power on the CPU should + * fail (see 6.6 Implementation CPU_ON/CPU_OFF races in the PSCI + * spec) + */ + if (target_cpu->power_state == PSCI_ON_PENDING) { + qemu_log_mask(LOG_GUEST_ERROR, + "[ARM]%s: CPU %" PRId64 " is already powering on\n", + __func__, cpuid); + return QEMU_ARM_POWERCTL_ON_PENDING; } - /* Start the new CPU at the requested address */ - cpu_set_pc(target_cpu_state, entry); + /* To avoid racing with a CPU we are just kicking off we do the + * final bit of preparation for the work in the target CPUs + * context. + */ + info = g_new(struct CpuOnInfo, 1); + info->entry = entry; + info->context_id = context_id; + info->target_el = target_el; + info->target_aa64 = target_aa64; - qemu_cpu_kick(target_cpu_state); + async_run_on_cpu(target_cpu_state, arm_set_cpu_on_async_work, + RUN_ON_CPU_HOST_PTR(info)); /* We are good to go */ return QEMU_ARM_POWERCTL_RET_SUCCESS; } +static void arm_set_cpu_off_async_work(CPUState *target_cpu_state, + run_on_cpu_data data) +{ + ARMCPU *target_cpu = ARM_CPU(target_cpu_state); + + assert(qemu_mutex_iothread_locked()); + target_cpu->power_state = PSCI_OFF; + target_cpu_state->halted = 1; + target_cpu_state->exception_index = EXCP_HLT; +} + int arm_set_cpu_off(uint64_t cpuid) { CPUState *target_cpu_state; ARMCPU *target_cpu; + assert(qemu_mutex_iothread_locked()); + DPRINTF("cpu %" PRId64 "\n", cpuid); /* change to the cpu we are powering up */ @@ -185,27 +248,34 @@ int arm_set_cpu_off(uint64_t cpuid) return QEMU_ARM_POWERCTL_INVALID_PARAM; } target_cpu = ARM_CPU(target_cpu_state); - if (target_cpu->powered_off) { + if (target_cpu->power_state == PSCI_OFF) { qemu_log_mask(LOG_GUEST_ERROR, "[ARM]%s: CPU %" PRId64 " is already off\n", __func__, cpuid); return QEMU_ARM_POWERCTL_IS_OFF; } - target_cpu->powered_off = true; - target_cpu_state->halted = 1; - target_cpu_state->exception_index = EXCP_HLT; - cpu_loop_exit(target_cpu_state); - /* notreached */ + /* Queue work to run under the target vCPUs context */ + async_run_on_cpu(target_cpu_state, arm_set_cpu_off_async_work, + RUN_ON_CPU_NULL); return QEMU_ARM_POWERCTL_RET_SUCCESS; } +static void arm_reset_cpu_async_work(CPUState *target_cpu_state, + run_on_cpu_data data) +{ + /* Reset the cpu */ + cpu_reset(target_cpu_state); +} + int arm_reset_cpu(uint64_t cpuid) { CPUState *target_cpu_state; ARMCPU *target_cpu; + assert(qemu_mutex_iothread_locked()); + DPRINTF("cpu %" PRId64 "\n", cpuid); /* change to the cpu we are resetting */ @@ -214,15 +284,17 @@ int arm_reset_cpu(uint64_t cpuid) return QEMU_ARM_POWERCTL_INVALID_PARAM; } target_cpu = ARM_CPU(target_cpu_state); - if (target_cpu->powered_off) { + + if (target_cpu->power_state == PSCI_OFF) { qemu_log_mask(LOG_GUEST_ERROR, "[ARM]%s: CPU %" PRId64 " is off\n", __func__, cpuid); return QEMU_ARM_POWERCTL_IS_OFF; } - /* Reset the cpu */ - cpu_reset(target_cpu_state); + /* Queue work to run under the target vCPUs context */ + async_run_on_cpu(target_cpu_state, arm_reset_cpu_async_work, + RUN_ON_CPU_NULL); return QEMU_ARM_POWERCTL_RET_SUCCESS; } diff --git a/target/arm/arm-powerctl.h b/target/arm/arm-powerctl.h index 98ee04989b..04353923c0 100644 --- a/target/arm/arm-powerctl.h +++ b/target/arm/arm-powerctl.h @@ -17,6 +17,7 @@ #define QEMU_ARM_POWERCTL_INVALID_PARAM QEMU_PSCI_RET_INVALID_PARAMS #define QEMU_ARM_POWERCTL_ALREADY_ON QEMU_PSCI_RET_ALREADY_ON #define QEMU_ARM_POWERCTL_IS_OFF QEMU_PSCI_RET_DENIED +#define QEMU_ARM_POWERCTL_ON_PENDING QEMU_PSCI_RET_ON_PENDING /* * arm_get_cpu_by_id: @@ -43,6 +44,7 @@ CPUState *arm_get_cpu_by_id(uint64_t cpuid); * Returns: QEMU_ARM_POWERCTL_RET_SUCCESS on success. * QEMU_ARM_POWERCTL_INVALID_PARAM if bad parameters are provided. * QEMU_ARM_POWERCTL_ALREADY_ON if the CPU was already started. + * QEMU_ARM_POWERCTL_ON_PENDING if the CPU is still powering up */ int arm_set_cpu_on(uint64_t cpuid, uint64_t entry, uint64_t context_id, uint32_t target_el, bool target_aa64); diff --git a/target/arm/cpu.c b/target/arm/cpu.c index 4a069f6985..f7157dc0e5 100644 --- a/target/arm/cpu.c +++ b/target/arm/cpu.c @@ -45,7 +45,7 @@ static bool arm_cpu_has_work(CPUState *cs) { ARMCPU *cpu = ARM_CPU(cs); - return !cpu->powered_off + return (cpu->power_state != PSCI_OFF) && cs->interrupt_request & (CPU_INTERRUPT_FIQ | CPU_INTERRUPT_HARD | CPU_INTERRUPT_VFIQ | CPU_INTERRUPT_VIRQ @@ -132,7 +132,7 @@ static void arm_cpu_reset(CPUState *s) env->vfp.xregs[ARM_VFP_MVFR1] = cpu->mvfr1; env->vfp.xregs[ARM_VFP_MVFR2] = cpu->mvfr2; - cpu->powered_off = cpu->start_powered_off; + cpu->power_state = cpu->start_powered_off ? PSCI_OFF : PSCI_ON; s->halted = cpu->start_powered_off; if (arm_feature(env, ARM_FEATURE_IWMMXT)) { diff --git a/target/arm/cpu.h b/target/arm/cpu.h index 0956a54e89..38a8e00908 100644 --- a/target/arm/cpu.h +++ b/target/arm/cpu.h @@ -30,6 +30,9 @@ # define TARGET_LONG_BITS 32 #endif +/* ARM processors have a weak memory model */ +#define TCG_GUEST_DEFAULT_MO (0) + #define CPUArchState struct CPUARMState #include "qemu-common.h" @@ -526,6 +529,15 @@ typedef struct CPUARMState { */ typedef void ARMELChangeHook(ARMCPU *cpu, void *opaque); + +/* These values map onto the return values for + * QEMU_PSCI_0_2_FN_AFFINITY_INFO */ +typedef enum ARMPSCIState { + PSCI_OFF = 0, + PSCI_ON = 1, + PSCI_ON_PENDING = 2 +} ARMPSCIState; + /** * ARMCPU: * @env: #CPUARMState @@ -582,8 +594,10 @@ struct ARMCPU { /* Should CPU start in PSCI powered-off state? */ bool start_powered_off; - /* CPU currently in PSCI powered-off state */ - bool powered_off; + + /* Current power state, access guarded by BQL */ + ARMPSCIState power_state; + /* CPU has virtualization extension */ bool has_el2; /* CPU has security extension */ diff --git a/target/arm/helper.c b/target/arm/helper.c index 47250bcf16..bcedb4a808 100644 --- a/target/arm/helper.c +++ b/target/arm/helper.c @@ -536,41 +536,33 @@ static void tlbimvaa_write(CPUARMState *env, const ARMCPRegInfo *ri, static void tlbiall_is_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value) { - CPUState *other_cs; + CPUState *cs = ENV_GET_CPU(env); - CPU_FOREACH(other_cs) { - tlb_flush(other_cs); - } + tlb_flush_all_cpus_synced(cs); } static void tlbiasid_is_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value) { - CPUState *other_cs; + CPUState *cs = ENV_GET_CPU(env); - CPU_FOREACH(other_cs) { - tlb_flush(other_cs); - } + tlb_flush_all_cpus_synced(cs); } static void tlbimva_is_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value) { - CPUState *other_cs; + CPUState *cs = ENV_GET_CPU(env); - CPU_FOREACH(other_cs) { - tlb_flush_page(other_cs, value & TARGET_PAGE_MASK); - } + tlb_flush_page_all_cpus_synced(cs, value & TARGET_PAGE_MASK); } static void tlbimvaa_is_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value) { - CPUState *other_cs; + CPUState *cs = ENV_GET_CPU(env); - CPU_FOREACH(other_cs) { - tlb_flush_page(other_cs, value & TARGET_PAGE_MASK); - } + tlb_flush_page_all_cpus_synced(cs, value & TARGET_PAGE_MASK); } static void tlbiall_nsnh_write(CPUARMState *env, const ARMCPRegInfo *ri, @@ -578,19 +570,21 @@ static void tlbiall_nsnh_write(CPUARMState *env, const ARMCPRegInfo *ri, { CPUState *cs = ENV_GET_CPU(env); - tlb_flush_by_mmuidx(cs, ARMMMUIdx_S12NSE1, ARMMMUIdx_S12NSE0, - ARMMMUIdx_S2NS, -1); + tlb_flush_by_mmuidx(cs, + (1 << ARMMMUIdx_S12NSE1) | + (1 << ARMMMUIdx_S12NSE0) | + (1 << ARMMMUIdx_S2NS)); } static void tlbiall_nsnh_is_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value) { - CPUState *other_cs; + CPUState *cs = ENV_GET_CPU(env); - CPU_FOREACH(other_cs) { - tlb_flush_by_mmuidx(other_cs, ARMMMUIdx_S12NSE1, - ARMMMUIdx_S12NSE0, ARMMMUIdx_S2NS, -1); - } + tlb_flush_by_mmuidx_all_cpus_synced(cs, + (1 << ARMMMUIdx_S12NSE1) | + (1 << ARMMMUIdx_S12NSE0) | + (1 << ARMMMUIdx_S2NS)); } static void tlbiipas2_write(CPUARMState *env, const ARMCPRegInfo *ri, @@ -611,13 +605,13 @@ static void tlbiipas2_write(CPUARMState *env, const ARMCPRegInfo *ri, pageaddr = sextract64(value << 12, 0, 40); - tlb_flush_page_by_mmuidx(cs, pageaddr, ARMMMUIdx_S2NS, -1); + tlb_flush_page_by_mmuidx(cs, pageaddr, (1 << ARMMMUIdx_S2NS)); } static void tlbiipas2_is_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value) { - CPUState *other_cs; + CPUState *cs = ENV_GET_CPU(env); uint64_t pageaddr; if (!arm_feature(env, ARM_FEATURE_EL2) || !(env->cp15.scr_el3 & SCR_NS)) { @@ -626,9 +620,8 @@ static void tlbiipas2_is_write(CPUARMState *env, const ARMCPRegInfo *ri, pageaddr = sextract64(value << 12, 0, 40); - CPU_FOREACH(other_cs) { - tlb_flush_page_by_mmuidx(other_cs, pageaddr, ARMMMUIdx_S2NS, -1); - } + tlb_flush_page_by_mmuidx_all_cpus_synced(cs, pageaddr, + (1 << ARMMMUIdx_S2NS)); } static void tlbiall_hyp_write(CPUARMState *env, const ARMCPRegInfo *ri, @@ -636,17 +629,15 @@ static void tlbiall_hyp_write(CPUARMState *env, const ARMCPRegInfo *ri, { CPUState *cs = ENV_GET_CPU(env); - tlb_flush_by_mmuidx(cs, ARMMMUIdx_S1E2, -1); + tlb_flush_by_mmuidx(cs, (1 << ARMMMUIdx_S1E2)); } static void tlbiall_hyp_is_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value) { - CPUState *other_cs; + CPUState *cs = ENV_GET_CPU(env); - CPU_FOREACH(other_cs) { - tlb_flush_by_mmuidx(other_cs, ARMMMUIdx_S1E2, -1); - } + tlb_flush_by_mmuidx_all_cpus_synced(cs, (1 << ARMMMUIdx_S1E2)); } static void tlbimva_hyp_write(CPUARMState *env, const ARMCPRegInfo *ri, @@ -655,18 +646,17 @@ static void tlbimva_hyp_write(CPUARMState *env, const ARMCPRegInfo *ri, CPUState *cs = ENV_GET_CPU(env); uint64_t pageaddr = value & ~MAKE_64BIT_MASK(0, 12); - tlb_flush_page_by_mmuidx(cs, pageaddr, ARMMMUIdx_S1E2, -1); + tlb_flush_page_by_mmuidx(cs, pageaddr, (1 << ARMMMUIdx_S1E2)); } static void tlbimva_hyp_is_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value) { - CPUState *other_cs; + CPUState *cs = ENV_GET_CPU(env); uint64_t pageaddr = value & ~MAKE_64BIT_MASK(0, 12); - CPU_FOREACH(other_cs) { - tlb_flush_page_by_mmuidx(other_cs, pageaddr, ARMMMUIdx_S1E2, -1); - } + tlb_flush_page_by_mmuidx_all_cpus_synced(cs, pageaddr, + (1 << ARMMMUIdx_S1E2)); } static const ARMCPRegInfo cp_reginfo[] = { @@ -2542,8 +2532,10 @@ static void vttbr_write(CPUARMState *env, const ARMCPRegInfo *ri, /* Accesses to VTTBR may change the VMID so we must flush the TLB. */ if (raw_read(env, ri) != value) { - tlb_flush_by_mmuidx(cs, ARMMMUIdx_S12NSE1, ARMMMUIdx_S12NSE0, - ARMMMUIdx_S2NS, -1); + tlb_flush_by_mmuidx(cs, + (1 << ARMMMUIdx_S12NSE1) | + (1 << ARMMMUIdx_S12NSE0) | + (1 << ARMMMUIdx_S2NS)); raw_write(env, ri, value); } } @@ -2898,29 +2890,33 @@ static CPAccessResult aa64_cacheop_access(CPUARMState *env, static void tlbi_aa64_vmalle1_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value) { - ARMCPU *cpu = arm_env_get_cpu(env); - CPUState *cs = CPU(cpu); + CPUState *cs = ENV_GET_CPU(env); if (arm_is_secure_below_el3(env)) { - tlb_flush_by_mmuidx(cs, ARMMMUIdx_S1SE1, ARMMMUIdx_S1SE0, -1); + tlb_flush_by_mmuidx(cs, + (1 << ARMMMUIdx_S1SE1) | + (1 << ARMMMUIdx_S1SE0)); } else { - tlb_flush_by_mmuidx(cs, ARMMMUIdx_S12NSE1, ARMMMUIdx_S12NSE0, -1); + tlb_flush_by_mmuidx(cs, + (1 << ARMMMUIdx_S12NSE1) | + (1 << ARMMMUIdx_S12NSE0)); } } static void tlbi_aa64_vmalle1is_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value) { + CPUState *cs = ENV_GET_CPU(env); bool sec = arm_is_secure_below_el3(env); - CPUState *other_cs; - CPU_FOREACH(other_cs) { - if (sec) { - tlb_flush_by_mmuidx(other_cs, ARMMMUIdx_S1SE1, ARMMMUIdx_S1SE0, -1); - } else { - tlb_flush_by_mmuidx(other_cs, ARMMMUIdx_S12NSE1, - ARMMMUIdx_S12NSE0, -1); - } + if (sec) { + tlb_flush_by_mmuidx_all_cpus_synced(cs, + (1 << ARMMMUIdx_S1SE1) | + (1 << ARMMMUIdx_S1SE0)); + } else { + tlb_flush_by_mmuidx_all_cpus_synced(cs, + (1 << ARMMMUIdx_S12NSE1) | + (1 << ARMMMUIdx_S12NSE0)); } } @@ -2935,13 +2931,19 @@ static void tlbi_aa64_alle1_write(CPUARMState *env, const ARMCPRegInfo *ri, CPUState *cs = CPU(cpu); if (arm_is_secure_below_el3(env)) { - tlb_flush_by_mmuidx(cs, ARMMMUIdx_S1SE1, ARMMMUIdx_S1SE0, -1); + tlb_flush_by_mmuidx(cs, + (1 << ARMMMUIdx_S1SE1) | + (1 << ARMMMUIdx_S1SE0)); } else { if (arm_feature(env, ARM_FEATURE_EL2)) { - tlb_flush_by_mmuidx(cs, ARMMMUIdx_S12NSE1, ARMMMUIdx_S12NSE0, - ARMMMUIdx_S2NS, -1); + tlb_flush_by_mmuidx(cs, + (1 << ARMMMUIdx_S12NSE1) | + (1 << ARMMMUIdx_S12NSE0) | + (1 << ARMMMUIdx_S2NS)); } else { - tlb_flush_by_mmuidx(cs, ARMMMUIdx_S12NSE1, ARMMMUIdx_S12NSE0, -1); + tlb_flush_by_mmuidx(cs, + (1 << ARMMMUIdx_S12NSE1) | + (1 << ARMMMUIdx_S12NSE0)); } } } @@ -2952,7 +2954,7 @@ static void tlbi_aa64_alle2_write(CPUARMState *env, const ARMCPRegInfo *ri, ARMCPU *cpu = arm_env_get_cpu(env); CPUState *cs = CPU(cpu); - tlb_flush_by_mmuidx(cs, ARMMMUIdx_S1E2, -1); + tlb_flush_by_mmuidx(cs, (1 << ARMMMUIdx_S1E2)); } static void tlbi_aa64_alle3_write(CPUARMState *env, const ARMCPRegInfo *ri, @@ -2961,7 +2963,7 @@ static void tlbi_aa64_alle3_write(CPUARMState *env, const ARMCPRegInfo *ri, ARMCPU *cpu = arm_env_get_cpu(env); CPUState *cs = CPU(cpu); - tlb_flush_by_mmuidx(cs, ARMMMUIdx_S1E3, -1); + tlb_flush_by_mmuidx(cs, (1 << ARMMMUIdx_S1E3)); } static void tlbi_aa64_alle1is_write(CPUARMState *env, const ARMCPRegInfo *ri, @@ -2971,41 +2973,40 @@ static void tlbi_aa64_alle1is_write(CPUARMState *env, const ARMCPRegInfo *ri, * stage 2 translations, whereas most other scopes only invalidate * stage 1 translations. */ + CPUState *cs = ENV_GET_CPU(env); bool sec = arm_is_secure_below_el3(env); bool has_el2 = arm_feature(env, ARM_FEATURE_EL2); - CPUState *other_cs; - - CPU_FOREACH(other_cs) { - if (sec) { - tlb_flush_by_mmuidx(other_cs, ARMMMUIdx_S1SE1, ARMMMUIdx_S1SE0, -1); - } else if (has_el2) { - tlb_flush_by_mmuidx(other_cs, ARMMMUIdx_S12NSE1, - ARMMMUIdx_S12NSE0, ARMMMUIdx_S2NS, -1); - } else { - tlb_flush_by_mmuidx(other_cs, ARMMMUIdx_S12NSE1, - ARMMMUIdx_S12NSE0, -1); - } + + if (sec) { + tlb_flush_by_mmuidx_all_cpus_synced(cs, + (1 << ARMMMUIdx_S1SE1) | + (1 << ARMMMUIdx_S1SE0)); + } else if (has_el2) { + tlb_flush_by_mmuidx_all_cpus_synced(cs, + (1 << ARMMMUIdx_S12NSE1) | + (1 << ARMMMUIdx_S12NSE0) | + (1 << ARMMMUIdx_S2NS)); + } else { + tlb_flush_by_mmuidx_all_cpus_synced(cs, + (1 << ARMMMUIdx_S12NSE1) | + (1 << ARMMMUIdx_S12NSE0)); } } static void tlbi_aa64_alle2is_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value) { - CPUState *other_cs; + CPUState *cs = ENV_GET_CPU(env); - CPU_FOREACH(other_cs) { - tlb_flush_by_mmuidx(other_cs, ARMMMUIdx_S1E2, -1); - } + tlb_flush_by_mmuidx_all_cpus_synced(cs, (1 << ARMMMUIdx_S1E2)); } static void tlbi_aa64_alle3is_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value) { - CPUState *other_cs; + CPUState *cs = ENV_GET_CPU(env); - CPU_FOREACH(other_cs) { - tlb_flush_by_mmuidx(other_cs, ARMMMUIdx_S1E3, -1); - } + tlb_flush_by_mmuidx_all_cpus_synced(cs, (1 << ARMMMUIdx_S1E3)); } static void tlbi_aa64_vae1_write(CPUARMState *env, const ARMCPRegInfo *ri, @@ -3021,11 +3022,13 @@ static void tlbi_aa64_vae1_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t pageaddr = sextract64(value << 12, 0, 56); if (arm_is_secure_below_el3(env)) { - tlb_flush_page_by_mmuidx(cs, pageaddr, ARMMMUIdx_S1SE1, - ARMMMUIdx_S1SE0, -1); + tlb_flush_page_by_mmuidx(cs, pageaddr, + (1 << ARMMMUIdx_S1SE1) | + (1 << ARMMMUIdx_S1SE0)); } else { - tlb_flush_page_by_mmuidx(cs, pageaddr, ARMMMUIdx_S12NSE1, - ARMMMUIdx_S12NSE0, -1); + tlb_flush_page_by_mmuidx(cs, pageaddr, + (1 << ARMMMUIdx_S12NSE1) | + (1 << ARMMMUIdx_S12NSE0)); } } @@ -3040,7 +3043,7 @@ static void tlbi_aa64_vae2_write(CPUARMState *env, const ARMCPRegInfo *ri, CPUState *cs = CPU(cpu); uint64_t pageaddr = sextract64(value << 12, 0, 56); - tlb_flush_page_by_mmuidx(cs, pageaddr, ARMMMUIdx_S1E2, -1); + tlb_flush_page_by_mmuidx(cs, pageaddr, (1 << ARMMMUIdx_S1E2)); } static void tlbi_aa64_vae3_write(CPUARMState *env, const ARMCPRegInfo *ri, @@ -3054,47 +3057,46 @@ static void tlbi_aa64_vae3_write(CPUARMState *env, const ARMCPRegInfo *ri, CPUState *cs = CPU(cpu); uint64_t pageaddr = sextract64(value << 12, 0, 56); - tlb_flush_page_by_mmuidx(cs, pageaddr, ARMMMUIdx_S1E3, -1); + tlb_flush_page_by_mmuidx(cs, pageaddr, (1 << ARMMMUIdx_S1E3)); } static void tlbi_aa64_vae1is_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value) { + ARMCPU *cpu = arm_env_get_cpu(env); + CPUState *cs = CPU(cpu); bool sec = arm_is_secure_below_el3(env); - CPUState *other_cs; uint64_t pageaddr = sextract64(value << 12, 0, 56); - CPU_FOREACH(other_cs) { - if (sec) { - tlb_flush_page_by_mmuidx(other_cs, pageaddr, ARMMMUIdx_S1SE1, - ARMMMUIdx_S1SE0, -1); - } else { - tlb_flush_page_by_mmuidx(other_cs, pageaddr, ARMMMUIdx_S12NSE1, - ARMMMUIdx_S12NSE0, -1); - } + if (sec) { + tlb_flush_page_by_mmuidx_all_cpus_synced(cs, pageaddr, + (1 << ARMMMUIdx_S1SE1) | + (1 << ARMMMUIdx_S1SE0)); + } else { + tlb_flush_page_by_mmuidx_all_cpus_synced(cs, pageaddr, + (1 << ARMMMUIdx_S12NSE1) | + (1 << ARMMMUIdx_S12NSE0)); } } static void tlbi_aa64_vae2is_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value) { - CPUState *other_cs; + CPUState *cs = ENV_GET_CPU(env); uint64_t pageaddr = sextract64(value << 12, 0, 56); - CPU_FOREACH(other_cs) { - tlb_flush_page_by_mmuidx(other_cs, pageaddr, ARMMMUIdx_S1E2, -1); - } + tlb_flush_page_by_mmuidx_all_cpus_synced(cs, pageaddr, + (1 << ARMMMUIdx_S1E2)); } static void tlbi_aa64_vae3is_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value) { - CPUState *other_cs; + CPUState *cs = ENV_GET_CPU(env); uint64_t pageaddr = sextract64(value << 12, 0, 56); - CPU_FOREACH(other_cs) { - tlb_flush_page_by_mmuidx(other_cs, pageaddr, ARMMMUIdx_S1E3, -1); - } + tlb_flush_page_by_mmuidx_all_cpus_synced(cs, pageaddr, + (1 << ARMMMUIdx_S1E3)); } static void tlbi_aa64_ipas2e1_write(CPUARMState *env, const ARMCPRegInfo *ri, @@ -3116,13 +3118,13 @@ static void tlbi_aa64_ipas2e1_write(CPUARMState *env, const ARMCPRegInfo *ri, pageaddr = sextract64(value << 12, 0, 48); - tlb_flush_page_by_mmuidx(cs, pageaddr, ARMMMUIdx_S2NS, -1); + tlb_flush_page_by_mmuidx(cs, pageaddr, (1 << ARMMMUIdx_S2NS)); } static void tlbi_aa64_ipas2e1is_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value) { - CPUState *other_cs; + CPUState *cs = ENV_GET_CPU(env); uint64_t pageaddr; if (!arm_feature(env, ARM_FEATURE_EL2) || !(env->cp15.scr_el3 & SCR_NS)) { @@ -3131,9 +3133,8 @@ static void tlbi_aa64_ipas2e1is_write(CPUARMState *env, const ARMCPRegInfo *ri, pageaddr = sextract64(value << 12, 0, 48); - CPU_FOREACH(other_cs) { - tlb_flush_page_by_mmuidx(other_cs, pageaddr, ARMMMUIdx_S2NS, -1); - } + tlb_flush_page_by_mmuidx_all_cpus_synced(cs, pageaddr, + (1 << ARMMMUIdx_S2NS)); } static CPAccessResult aa64_zva_access(CPUARMState *env, const ARMCPRegInfo *ri, @@ -6769,6 +6770,12 @@ void arm_cpu_do_interrupt(CPUState *cs) arm_cpu_do_interrupt_aarch32(cs); } + /* Hooks may change global state so BQL should be held, also the + * BQL needs to be held for any modification of + * cs->interrupt_request. + */ + g_assert(qemu_mutex_iothread_locked()); + arm_call_el_change_hook(cpu); if (!kvm_enabled()) { diff --git a/target/arm/kvm.c b/target/arm/kvm.c index c00b94e42a..395e986973 100644 --- a/target/arm/kvm.c +++ b/target/arm/kvm.c @@ -488,8 +488,8 @@ int kvm_arm_sync_mpstate_to_kvm(ARMCPU *cpu) { if (cap_has_mp_state) { struct kvm_mp_state mp_state = { - .mp_state = - cpu->powered_off ? KVM_MP_STATE_STOPPED : KVM_MP_STATE_RUNNABLE + .mp_state = (cpu->power_state == PSCI_OFF) ? + KVM_MP_STATE_STOPPED : KVM_MP_STATE_RUNNABLE }; int ret = kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MP_STATE, &mp_state); if (ret) { @@ -515,7 +515,8 @@ int kvm_arm_sync_mpstate_to_qemu(ARMCPU *cpu) __func__, ret, strerror(-ret)); abort(); } - cpu->powered_off = (mp_state.mp_state == KVM_MP_STATE_STOPPED); + cpu->power_state = (mp_state.mp_state == KVM_MP_STATE_STOPPED) ? + PSCI_OFF : PSCI_ON; } return 0; diff --git a/target/arm/machine.c b/target/arm/machine.c index fa5ec76090..d8094a840b 100644 --- a/target/arm/machine.c +++ b/target/arm/machine.c @@ -211,6 +211,38 @@ static const VMStateInfo vmstate_cpsr = { .put = put_cpsr, }; +static int get_power(QEMUFile *f, void *opaque, size_t size, + VMStateField *field) +{ + ARMCPU *cpu = opaque; + bool powered_off = qemu_get_byte(f); + cpu->power_state = powered_off ? PSCI_OFF : PSCI_ON; + return 0; +} + +static int put_power(QEMUFile *f, void *opaque, size_t size, + VMStateField *field, QJSON *vmdesc) +{ + ARMCPU *cpu = opaque; + + /* Migration should never happen while we transition power states */ + + if (cpu->power_state == PSCI_ON || + cpu->power_state == PSCI_OFF) { + bool powered_off = (cpu->power_state == PSCI_OFF) ? true : false; + qemu_put_byte(f, powered_off); + return 0; + } else { + return 1; + } +} + +static const VMStateInfo vmstate_powered_off = { + .name = "powered_off", + .get = get_power, + .put = put_power, +}; + static void cpu_pre_save(void *opaque) { ARMCPU *cpu = opaque; @@ -329,7 +361,14 @@ const VMStateDescription vmstate_arm_cpu = { VMSTATE_UINT64(env.exception.vaddress, ARMCPU), VMSTATE_TIMER_PTR(gt_timer[GTIMER_PHYS], ARMCPU), VMSTATE_TIMER_PTR(gt_timer[GTIMER_VIRT], ARMCPU), - VMSTATE_BOOL(powered_off, ARMCPU), + { + .name = "power_state", + .version_id = 0, + .size = sizeof(bool), + .info = &vmstate_powered_off, + .flags = VMS_SINGLE, + .offset = 0, + }, VMSTATE_END_OF_LIST() }, .subsections = (const VMStateDescription*[]) { diff --git a/target/arm/op_helper.c b/target/arm/op_helper.c index fb366fdc35..d64c8670fa 100644 --- a/target/arm/op_helper.c +++ b/target/arm/op_helper.c @@ -18,6 +18,7 @@ */ #include "qemu/osdep.h" #include "qemu/log.h" +#include "qemu/main-loop.h" #include "cpu.h" #include "exec/helper-proto.h" #include "internals.h" @@ -435,6 +436,13 @@ void HELPER(yield)(CPUARMState *env) ARMCPU *cpu = arm_env_get_cpu(env); CPUState *cs = CPU(cpu); + /* When running in MTTCG we don't generate jumps to the yield and + * WFE helpers as it won't affect the scheduling of other vCPUs. + * If we wanted to more completely model WFE/SEV so we don't busy + * spin unnecessarily we would need to do something more involved. + */ + g_assert(!parallel_cpus); + /* This is a non-trappable hint instruction that generally indicates * that the guest is currently busy-looping. Yield control back to the * top level loop so that a more deserving VCPU has a chance to run. @@ -487,7 +495,9 @@ void HELPER(cpsr_write_eret)(CPUARMState *env, uint32_t val) */ env->regs[15] &= (env->thumb ? ~1 : ~3); + qemu_mutex_lock_iothread(); arm_call_el_change_hook(arm_env_get_cpu(env)); + qemu_mutex_unlock_iothread(); } /* Access to user mode registers from privileged modes. */ @@ -735,28 +745,58 @@ void HELPER(set_cp_reg)(CPUARMState *env, void *rip, uint32_t value) { const ARMCPRegInfo *ri = rip; - ri->writefn(env, ri, value); + if (ri->type & ARM_CP_IO) { + qemu_mutex_lock_iothread(); + ri->writefn(env, ri, value); + qemu_mutex_unlock_iothread(); + } else { + ri->writefn(env, ri, value); + } } uint32_t HELPER(get_cp_reg)(CPUARMState *env, void *rip) { const ARMCPRegInfo *ri = rip; + uint32_t res; + + if (ri->type & ARM_CP_IO) { + qemu_mutex_lock_iothread(); + res = ri->readfn(env, ri); + qemu_mutex_unlock_iothread(); + } else { + res = ri->readfn(env, ri); + } - return ri->readfn(env, ri); + return res; } void HELPER(set_cp_reg64)(CPUARMState *env, void *rip, uint64_t value) { const ARMCPRegInfo *ri = rip; - ri->writefn(env, ri, value); + if (ri->type & ARM_CP_IO) { + qemu_mutex_lock_iothread(); + ri->writefn(env, ri, value); + qemu_mutex_unlock_iothread(); + } else { + ri->writefn(env, ri, value); + } } uint64_t HELPER(get_cp_reg64)(CPUARMState *env, void *rip) { const ARMCPRegInfo *ri = rip; + uint64_t res; - return ri->readfn(env, ri); + if (ri->type & ARM_CP_IO) { + qemu_mutex_lock_iothread(); + res = ri->readfn(env, ri); + qemu_mutex_unlock_iothread(); + } else { + res = ri->readfn(env, ri); + } + + return res; } void HELPER(msr_i_pstate)(CPUARMState *env, uint32_t op, uint32_t imm) @@ -989,7 +1029,9 @@ void HELPER(exception_return)(CPUARMState *env) cur_el, new_el, env->pc); } + qemu_mutex_lock_iothread(); arm_call_el_change_hook(arm_env_get_cpu(env)); + qemu_mutex_unlock_iothread(); return; diff --git a/target/arm/psci.c b/target/arm/psci.c index 64bf82eea1..ade9fe2ede 100644 --- a/target/arm/psci.c +++ b/target/arm/psci.c @@ -127,7 +127,9 @@ void arm_handle_psci_call(ARMCPU *cpu) break; } target_cpu = ARM_CPU(target_cpu_state); - ret = target_cpu->powered_off ? 1 : 0; + + g_assert(qemu_mutex_iothread_locked()); + ret = target_cpu->power_state; break; default: /* Everything above affinity level 0 is always on. */ diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c index e61bbd6b3b..e15eae6d41 100644 --- a/target/arm/translate-a64.c +++ b/target/arm/translate-a64.c @@ -1328,10 +1328,14 @@ static void handle_hint(DisasContext *s, uint32_t insn, s->is_jmp = DISAS_WFI; return; case 1: /* YIELD */ - s->is_jmp = DISAS_YIELD; + if (!parallel_cpus) { + s->is_jmp = DISAS_YIELD; + } return; case 2: /* WFE */ - s->is_jmp = DISAS_WFE; + if (!parallel_cpus) { + s->is_jmp = DISAS_WFE; + } return; case 4: /* SEV */ case 5: /* SEVL */ diff --git a/target/arm/translate.c b/target/arm/translate.c index 4436d8f3a2..abc1f77ee4 100644 --- a/target/arm/translate.c +++ b/target/arm/translate.c @@ -4404,20 +4404,32 @@ static void gen_exception_return(DisasContext *s, TCGv_i32 pc) gen_rfe(s, pc, load_cpu_field(spsr)); } +/* + * For WFI we will halt the vCPU until an IRQ. For WFE and YIELD we + * only call the helper when running single threaded TCG code to ensure + * the next round-robin scheduled vCPU gets a crack. In MTTCG mode we + * just skip this instruction. Currently the SEV/SEVL instructions + * which are *one* of many ways to wake the CPU from WFE are not + * implemented so we can't sleep like WFI does. + */ static void gen_nop_hint(DisasContext *s, int val) { switch (val) { case 1: /* yield */ - gen_set_pc_im(s, s->pc); - s->is_jmp = DISAS_YIELD; + if (!parallel_cpus) { + gen_set_pc_im(s, s->pc); + s->is_jmp = DISAS_YIELD; + } break; case 3: /* wfi */ gen_set_pc_im(s, s->pc); s->is_jmp = DISAS_WFI; break; case 2: /* wfe */ - gen_set_pc_im(s, s->pc); - s->is_jmp = DISAS_WFE; + if (!parallel_cpus) { + gen_set_pc_im(s, s->pc); + s->is_jmp = DISAS_WFE; + } break; case 4: /* sev */ case 5: /* sevl */ diff --git a/target/i386/smm_helper.c b/target/i386/smm_helper.c index 4dd6a2c544..f051a77c4a 100644 --- a/target/i386/smm_helper.c +++ b/target/i386/smm_helper.c @@ -18,6 +18,7 @@ */ #include "qemu/osdep.h" +#include "qemu/main-loop.h" #include "cpu.h" #include "exec/helper-proto.h" #include "exec/log.h" @@ -42,11 +43,14 @@ void helper_rsm(CPUX86State *env) #define SMM_REVISION_ID 0x00020000 #endif +/* Called with iothread lock taken */ void cpu_smm_update(X86CPU *cpu) { CPUX86State *env = &cpu->env; bool smm_enabled = (env->hflags & HF_SMM_MASK); + g_assert(qemu_mutex_iothread_locked()); + if (cpu->smram) { memory_region_set_enabled(cpu->smram, smm_enabled); } @@ -333,7 +337,10 @@ void helper_rsm(CPUX86State *env) } env->hflags2 &= ~HF2_SMM_INSIDE_NMI_MASK; env->hflags &= ~HF_SMM_MASK; + + qemu_mutex_lock_iothread(); cpu_smm_update(cpu); + qemu_mutex_unlock_iothread(); qemu_log_mask(CPU_LOG_INT, "SMM: after RSM\n"); log_cpu_state_mask(CPU_LOG_INT, CPU(cpu), CPU_DUMP_CCOP); diff --git a/target/s390x/misc_helper.c b/target/s390x/misc_helper.c index c9604ea9c7..3cb942e8bb 100644 --- a/target/s390x/misc_helper.c +++ b/target/s390x/misc_helper.c @@ -25,6 +25,7 @@ #include "exec/helper-proto.h" #include "sysemu/kvm.h" #include "qemu/timer.h" +#include "qemu/main-loop.h" #include "exec/address-spaces.h" #ifdef CONFIG_KVM #include <linux/kvm.h> @@ -109,11 +110,13 @@ void program_interrupt(CPUS390XState *env, uint32_t code, int ilen) /* SCLP service call */ uint32_t HELPER(servc)(CPUS390XState *env, uint64_t r1, uint64_t r2) { + qemu_mutex_lock_iothread(); int r = sclp_service_call(env, r1, r2); if (r < 0) { program_interrupt(env, -r, 4); - return 0; + r = 0; } + qemu_mutex_unlock_iothread(); return r; } diff --git a/target/sparc/ldst_helper.c b/target/sparc/ldst_helper.c index 2c05d6af75..57968d9143 100644 --- a/target/sparc/ldst_helper.c +++ b/target/sparc/ldst_helper.c @@ -1768,13 +1768,15 @@ void helper_st_asi(CPUSPARCState *env, target_ulong addr, target_ulong val, case 1: env->dmmu.mmu_primary_context = val; env->immu.mmu_primary_context = val; - tlb_flush_by_mmuidx(CPU(cpu), MMU_USER_IDX, MMU_KERNEL_IDX, -1); + tlb_flush_by_mmuidx(CPU(cpu), + (1 << MMU_USER_IDX) | (1 << MMU_KERNEL_IDX)); break; case 2: env->dmmu.mmu_secondary_context = val; env->immu.mmu_secondary_context = val; - tlb_flush_by_mmuidx(CPU(cpu), MMU_USER_SECONDARY_IDX, - MMU_KERNEL_SECONDARY_IDX, -1); + tlb_flush_by_mmuidx(CPU(cpu), + (1 << MMU_USER_SECONDARY_IDX) | + (1 << MMU_KERNEL_SECONDARY_IDX)); break; default: cpu_unassigned_access(cs, addr, true, false, 1, size); diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h index 21d96ec35c..4275787db9 100644 --- a/tcg/i386/tcg-target.h +++ b/tcg/i386/tcg-target.h @@ -165,4 +165,15 @@ static inline void flush_icache_range(uintptr_t start, uintptr_t stop) { } +/* This defines the natural memory order supported by this + * architecture before guarantees made by various barrier + * instructions. + * + * The x86 has a pretty strong memory ordering which only really + * allows for some stores to be re-ordered after loads. + */ +#include "tcg-mo.h" + +#define TCG_TARGET_DEFAULT_MO (TCG_MO_ALL & ~TCG_MO_ST_LD) + #endif diff --git a/tcg/tcg-mo.h b/tcg/tcg-mo.h new file mode 100644 index 0000000000..c2c55704e1 --- /dev/null +++ b/tcg/tcg-mo.h @@ -0,0 +1,48 @@ +/* + * Tiny Code Generator for QEMU + * + * Copyright (c) 2008 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#ifndef TCG_MO_H +#define TCG_MO_H + +typedef enum { + /* Used to indicate the type of accesses on which ordering + is to be ensured. Modeled after SPARC barriers. + + This is of the form TCG_MO_A_B where A is before B in program order. + */ + TCG_MO_LD_LD = 0x01, + TCG_MO_ST_LD = 0x02, + TCG_MO_LD_ST = 0x04, + TCG_MO_ST_ST = 0x08, + TCG_MO_ALL = 0x0F, /* OR of the above */ + + /* Used to indicate the kind of ordering which is to be ensured by the + instruction. These types are derived from x86/aarch64 instructions. + It should be noted that these are different from C11 semantics. */ + TCG_BAR_LDAQ = 0x10, /* Following ops will not come forward */ + TCG_BAR_STRL = 0x20, /* Previous ops will not be delayed */ + TCG_BAR_SC = 0x30, /* No ops cross barrier; OR of the above */ +} TCGBar; + +#endif /* TCG_MO_H */ @@ -29,6 +29,7 @@ #include "cpu.h" #include "exec/tb-context.h" #include "qemu/bitops.h" +#include "tcg-mo.h" #include "tcg-target.h" /* XXX: make safe guess about sizes */ @@ -79,6 +80,15 @@ typedef uint64_t tcg_target_ulong; #error unsupported #endif +/* Oversized TCG guests make things like MTTCG hard + * as we can't use atomics for cputlb updates. + */ +#if TARGET_LONG_BITS > TCG_TARGET_REG_BITS +#define TCG_OVERSIZED_GUEST 1 +#else +#define TCG_OVERSIZED_GUEST 0 +#endif + #if TCG_TARGET_NB_REGS <= 32 typedef uint32_t TCGRegSet; #elif TCG_TARGET_NB_REGS <= 64 @@ -498,23 +508,6 @@ static inline intptr_t QEMU_ARTIFICIAL GET_TCGV_PTR(TCGv_ptr t) #define TCG_CALL_DUMMY_TCGV MAKE_TCGV_I32(-1) #define TCG_CALL_DUMMY_ARG ((TCGArg)(-1)) -typedef enum { - /* Used to indicate the type of accesses on which ordering - is to be ensured. Modeled after SPARC barriers. */ - TCG_MO_LD_LD = 0x01, - TCG_MO_ST_LD = 0x02, - TCG_MO_LD_ST = 0x04, - TCG_MO_ST_ST = 0x08, - TCG_MO_ALL = 0x0F, /* OR of the above */ - - /* Used to indicate the kind of ordering which is to be ensured by the - instruction. These types are derived from x86/aarch64 instructions. - It should be noted that these are different from C11 semantics. */ - TCG_BAR_LDAQ = 0x10, /* Following ops will not come forward */ - TCG_BAR_STRL = 0x20, /* Previous ops will not be delayed */ - TCG_BAR_SC = 0x30, /* No ops cross barrier; OR of the above */ -} TCGBar; - /* Conditions. Note that these are laid out for easy manipulation by the functions below: bit 0 is used for inverting; diff --git a/translate-all.c b/translate-all.c index 5f44ec844e..9bac061c9b 100644 --- a/translate-all.c +++ b/translate-all.c @@ -55,11 +55,11 @@ #include "translate-all.h" #include "qemu/bitmap.h" #include "qemu/timer.h" +#include "qemu/main-loop.h" #include "exec/log.h" /* #define DEBUG_TB_INVALIDATE */ /* #define DEBUG_TB_FLUSH */ -/* #define DEBUG_LOCKING */ /* make various TB consistency checks */ /* #define DEBUG_TB_CHECK */ @@ -74,20 +74,10 @@ * access to the memory related structures are protected with the * mmap_lock. */ -#ifdef DEBUG_LOCKING -#define DEBUG_MEM_LOCKS 1 -#else -#define DEBUG_MEM_LOCKS 0 -#endif - #ifdef CONFIG_SOFTMMU -#define assert_memory_lock() do { /* nothing */ } while (0) +#define assert_memory_lock() tcg_debug_assert(have_tb_lock) #else -#define assert_memory_lock() do { \ - if (DEBUG_MEM_LOCKS) { \ - g_assert(have_mmap_lock()); \ - } \ - } while (0) +#define assert_memory_lock() tcg_debug_assert(have_mmap_lock()) #endif #define SMC_BITMAP_USE_THRESHOLD 10 @@ -145,9 +135,7 @@ TCGContext tcg_ctx; bool parallel_cpus; /* translation block context */ -#ifdef CONFIG_USER_ONLY __thread int have_tb_lock; -#endif static void page_table_config_init(void) { @@ -169,51 +157,31 @@ static void page_table_config_init(void) assert(v_l2_levels >= 0); } +#define assert_tb_locked() tcg_debug_assert(have_tb_lock) +#define assert_tb_unlocked() tcg_debug_assert(!have_tb_lock) + void tb_lock(void) { -#ifdef CONFIG_USER_ONLY - assert(!have_tb_lock); + assert_tb_unlocked(); qemu_mutex_lock(&tcg_ctx.tb_ctx.tb_lock); have_tb_lock++; -#endif } void tb_unlock(void) { -#ifdef CONFIG_USER_ONLY - assert(have_tb_lock); + assert_tb_locked(); have_tb_lock--; qemu_mutex_unlock(&tcg_ctx.tb_ctx.tb_lock); -#endif } void tb_lock_reset(void) { -#ifdef CONFIG_USER_ONLY if (have_tb_lock) { qemu_mutex_unlock(&tcg_ctx.tb_ctx.tb_lock); have_tb_lock = 0; } -#endif } -#ifdef DEBUG_LOCKING -#define DEBUG_TB_LOCKS 1 -#else -#define DEBUG_TB_LOCKS 0 -#endif - -#ifdef CONFIG_SOFTMMU -#define assert_tb_lock() do { /* nothing */ } while (0) -#else -#define assert_tb_lock() do { \ - if (DEBUG_TB_LOCKS) { \ - g_assert(have_tb_lock); \ - } \ - } while (0) -#endif - - static TranslationBlock *tb_find_pc(uintptr_t tc_ptr); void cpu_gen_init(void) @@ -847,7 +815,7 @@ static TranslationBlock *tb_alloc(target_ulong pc) { TranslationBlock *tb; - assert_tb_lock(); + assert_tb_locked(); if (tcg_ctx.tb_ctx.nb_tbs >= tcg_ctx.code_gen_max_blocks) { return NULL; @@ -862,7 +830,7 @@ static TranslationBlock *tb_alloc(target_ulong pc) /* Called with tb_lock held. */ void tb_free(TranslationBlock *tb) { - assert_tb_lock(); + assert_tb_locked(); /* In practice this is mostly used for single use temporary TB Ignore the hard cases and just back up if this TB happens to @@ -1104,7 +1072,7 @@ void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr) uint32_t h; tb_page_addr_t phys_pc; - assert_tb_lock(); + assert_tb_locked(); atomic_set(&tb->invalid, true); @@ -1421,7 +1389,7 @@ static void tb_invalidate_phys_range_1(tb_page_addr_t start, tb_page_addr_t end) #ifdef CONFIG_SOFTMMU void tb_invalidate_phys_range(tb_page_addr_t start, tb_page_addr_t end) { - assert_tb_lock(); + assert_tb_locked(); tb_invalidate_phys_range_1(start, end); } #else @@ -1464,7 +1432,7 @@ void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end, #endif /* TARGET_HAS_PRECISE_SMC */ assert_memory_lock(); - assert_tb_lock(); + assert_tb_locked(); p = page_find(start >> TARGET_PAGE_BITS); if (!p) { @@ -1543,7 +1511,7 @@ void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end, #ifdef CONFIG_SOFTMMU /* len must be <= 8 and start must be a multiple of len. * Called via softmmu_template.h when code areas are written to with - * tb_lock held. + * iothread mutex not held. */ void tb_invalidate_phys_page_fast(tb_page_addr_t start, int len) { @@ -1745,7 +1713,10 @@ void tb_check_watchpoint(CPUState *cpu) #ifndef CONFIG_USER_ONLY /* in deterministic execution mode, instructions doing device I/Os - must be at the end of the TB */ + * must be at the end of the TB. + * + * Called by softmmu_template.h, with iothread mutex not held. + */ void cpu_io_recompile(CPUState *cpu, uintptr_t retaddr) { #if defined(TARGET_MIPS) || defined(TARGET_SH4) @@ -1957,6 +1928,7 @@ void dump_opcount_info(FILE *f, fprintf_function cpu_fprintf) void cpu_interrupt(CPUState *cpu, int mask) { + g_assert(qemu_mutex_iothread_locked()); cpu->interrupt_request |= mask; cpu->tcg_exit_req = 1; } diff --git a/translate-common.c b/translate-common.c index 5e989cdf70..d504dd0d33 100644 --- a/translate-common.c +++ b/translate-common.c @@ -21,6 +21,7 @@ #include "qemu-common.h" #include "qom/cpu.h" #include "sysemu/cpus.h" +#include "qemu/main-loop.h" uintptr_t qemu_real_host_page_size; intptr_t qemu_real_host_page_mask; @@ -30,6 +31,7 @@ intptr_t qemu_real_host_page_mask; static void tcg_handle_interrupt(CPUState *cpu, int mask) { int old_mask; + g_assert(qemu_mutex_iothread_locked()); old_mask = cpu->interrupt_request; cpu->interrupt_request |= mask; @@ -40,17 +42,16 @@ static void tcg_handle_interrupt(CPUState *cpu, int mask) */ if (!qemu_cpu_is_self(cpu)) { qemu_cpu_kick(cpu); - return; - } - - if (use_icount) { - cpu->icount_decr.u16.high = 0xffff; - if (!cpu->can_do_io - && (mask & ~old_mask) != 0) { - cpu_abort(cpu, "Raised interrupt while not in I/O function"); - } } else { - cpu->tcg_exit_req = 1; + if (use_icount) { + cpu->icount_decr.u16.high = 0xffff; + if (!cpu->can_do_io + && (mask & ~old_mask) != 0) { + cpu_abort(cpu, "Raised interrupt while not in I/O function"); + } + } else { + cpu->tcg_exit_req = 1; + } } } @@ -300,6 +300,26 @@ static QemuOptsList qemu_machine_opts = { }, }; +static QemuOptsList qemu_accel_opts = { + .name = "accel", + .implied_opt_name = "accel", + .head = QTAILQ_HEAD_INITIALIZER(qemu_accel_opts.head), + .merge_lists = true, + .desc = { + { + .name = "accel", + .type = QEMU_OPT_STRING, + .help = "Select the type of accelerator", + }, + { + .name = "thread", + .type = QEMU_OPT_STRING, + .help = "Enable/disable multi-threaded TCG", + }, + { /* end of list */ } + }, +}; + static QemuOptsList qemu_boot_opts = { .name = "boot-opts", .implied_opt_name = "order", @@ -2928,7 +2948,8 @@ int main(int argc, char **argv, char **envp) const char *boot_once = NULL; DisplayState *ds; int cyls, heads, secs, translation; - QemuOpts *hda_opts = NULL, *opts, *machine_opts, *icount_opts = NULL; + QemuOpts *opts, *machine_opts; + QemuOpts *hda_opts = NULL, *icount_opts = NULL, *accel_opts = NULL; QemuOptsList *olist; int optind; const char *optarg; @@ -2983,6 +3004,7 @@ int main(int argc, char **argv, char **envp) qemu_add_opts(&qemu_trace_opts); qemu_add_opts(&qemu_option_rom_opts); qemu_add_opts(&qemu_machine_opts); + qemu_add_opts(&qemu_accel_opts); qemu_add_opts(&qemu_mem_opts); qemu_add_opts(&qemu_smp_opts); qemu_add_opts(&qemu_boot_opts); @@ -3675,6 +3697,26 @@ int main(int argc, char **argv, char **envp) qdev_prop_register_global(&kvm_pit_lost_tick_policy); break; } + case QEMU_OPTION_accel: + accel_opts = qemu_opts_parse_noisily(qemu_find_opts("accel"), + optarg, true); + optarg = qemu_opt_get(accel_opts, "accel"); + + olist = qemu_find_opts("machine"); + if (strcmp("kvm", optarg) == 0) { + qemu_opts_parse_noisily(olist, "accel=kvm", false); + } else if (strcmp("xen", optarg) == 0) { + qemu_opts_parse_noisily(olist, "accel=xen", false); + } else if (strcmp("tcg", optarg) == 0) { + qemu_opts_parse_noisily(olist, "accel=tcg", false); + } else { + if (!is_help_option(optarg)) { + error_printf("Unknown accelerator: %s", optarg); + } + error_printf("Supported accelerators: kvm, xen, tcg\n"); + exit(1); + } + break; case QEMU_OPTION_usb: olist = qemu_find_opts("machine"); qemu_opts_parse_noisily(olist, "usb=on", false); @@ -3983,6 +4025,8 @@ int main(int argc, char **argv, char **envp) replay_configure(icount_opts); + qemu_tcg_configure(accel_opts, &error_fatal); + machine_class = select_machine(); set_memory_options(&ram_slots, &maxram_size, machine_class); @@ -4349,6 +4393,9 @@ int main(int argc, char **argv, char **envp) if (!tcg_enabled()) { error_report("-icount is not allowed with hardware virtualization"); exit(1); + } else if (qemu_tcg_mttcg_enabled()) { + error_report("-icount does not currently work with MTTCG"); + exit(1); } configure_icount(icount_opts, &error_abort); qemu_opts_del(icount_opts); |