diff options
author | Philippe Mathieu-Daudé <philmd@linaro.org> | 2023-10-04 11:06:28 +0200 |
---|---|---|
committer | Paolo Bonzini <pbonzini@redhat.com> | 2023-10-08 21:08:08 +0200 |
commit | 8d7f2e767d8cd058c817dbe31430b89f2e11535d (patch) | |
tree | 5b7c25cddf7e6c1b9dfc93c5169541e8c14e2901 /system | |
parent | 01c85e60a4d0675f0ad203fe1fe119381e7ad15b (diff) |
system: Rename softmmu/ directory as system/
The softmmu/ directory contains files specific to system
emulation. Rename it as system/. Update meson rules, the
MAINTAINERS file and all the documentation and comments.
Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Message-ID: <20231004090629.37473-14-philmd@linaro.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Diffstat (limited to 'system')
-rw-r--r-- | system/arch_init.c | 50 | ||||
-rw-r--r-- | system/async-teardown.c | 143 | ||||
-rw-r--r-- | system/balloon.c | 106 | ||||
-rw-r--r-- | system/bootdevice.c | 430 | ||||
-rw-r--r-- | system/cpu-throttle.c | 128 | ||||
-rw-r--r-- | system/cpu-timers.c | 277 | ||||
-rw-r--r-- | system/cpus.c | 822 | ||||
-rw-r--r-- | system/datadir.c | 110 | ||||
-rw-r--r-- | system/device_tree.c | 703 | ||||
-rw-r--r-- | system/dirtylimit.c | 678 | ||||
-rw-r--r-- | system/dma-helpers.c | 347 | ||||
-rw-r--r-- | system/globals.c | 70 | ||||
-rw-r--r-- | system/ioport.c | 346 | ||||
-rw-r--r-- | system/main.c | 49 | ||||
-rw-r--r-- | system/memory.c | 3683 | ||||
-rw-r--r-- | system/memory_mapping.c | 377 | ||||
-rw-r--r-- | system/meson.build | 36 | ||||
-rw-r--r-- | system/physmem.c | 3796 | ||||
-rw-r--r-- | system/qdev-monitor.c | 1148 | ||||
-rw-r--r-- | system/qemu-seccomp.c | 486 | ||||
-rw-r--r-- | system/qtest.c | 1070 | ||||
-rw-r--r-- | system/rtc.c | 192 | ||||
-rw-r--r-- | system/runstate-action.c | 46 | ||||
-rw-r--r-- | system/runstate-hmp-cmds.c | 95 | ||||
-rw-r--r-- | system/runstate.c | 871 | ||||
-rw-r--r-- | system/tpm-hmp-cmds.c | 65 | ||||
-rw-r--r-- | system/tpm.c | 239 | ||||
-rw-r--r-- | system/trace-events | 40 | ||||
-rw-r--r-- | system/trace.h | 1 | ||||
-rw-r--r-- | system/vl.c | 3730 | ||||
-rw-r--r-- | system/watchpoint.c | 226 |
31 files changed, 20360 insertions, 0 deletions
diff --git a/system/arch_init.c b/system/arch_init.c new file mode 100644 index 0000000000..79716f959b --- /dev/null +++ b/system/arch_init.c @@ -0,0 +1,50 @@ +/* + * QEMU System Emulator + * + * Copyright (c) 2003-2008 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "qemu/osdep.h" +#include "qemu/module.h" +#include "sysemu/arch_init.h" + +#ifdef TARGET_SPARC +int graphic_width = 1024; +int graphic_height = 768; +int graphic_depth = 8; +#elif defined(TARGET_M68K) +int graphic_width = 800; +int graphic_height = 600; +int graphic_depth = 8; +#else +int graphic_width = 800; +int graphic_height = 600; +int graphic_depth = 32; +#endif + +const uint32_t arch_type = QEMU_ARCH; + +void qemu_init_arch_modules(void) +{ +#ifdef CONFIG_MODULES + module_init_info(qemu_modinfo); + module_allow_arch(TARGET_NAME); +#endif +} diff --git a/system/async-teardown.c b/system/async-teardown.c new file mode 100644 index 0000000000..396963c091 --- /dev/null +++ b/system/async-teardown.c @@ -0,0 +1,143 @@ +/* + * Asynchronous teardown + * + * Copyright IBM, Corp. 2022 + * + * Authors: + * Claudio Imbrenda <imbrenda@linux.ibm.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or (at your + * option) any later version. See the COPYING file in the top-level directory. + * + */ + +#include "qemu/osdep.h" +#include <dirent.h> +#include <sys/prctl.h> +#include <sched.h> + +#include "qemu/async-teardown.h" + +#ifdef _SC_THREAD_STACK_MIN +#define CLONE_STACK_SIZE sysconf(_SC_THREAD_STACK_MIN) +#else +#define CLONE_STACK_SIZE 16384 +#endif + +static pid_t the_ppid; + +/* + * Close all open file descriptors. + */ +static void close_all_open_fd(void) +{ + struct dirent *de; + int fd, dfd; + DIR *dir; + +#ifdef CONFIG_CLOSE_RANGE + int r = close_range(0, ~0U, 0); + if (!r) { + /* Success, no need to try other ways. */ + return; + } +#endif + + dir = opendir("/proc/self/fd"); + if (!dir) { + /* If /proc is not mounted, there is nothing that can be done. */ + return; + } + /* Avoid closing the directory. */ + dfd = dirfd(dir); + + for (de = readdir(dir); de; de = readdir(dir)) { + fd = atoi(de->d_name); + if (fd != dfd) { + close(fd); + } + } + closedir(dir); +} + +static void hup_handler(int signal) +{ + /* Check every second if this process has been reparented. */ + while (the_ppid == getppid()) { + /* sleep() is safe to use in a signal handler. */ + sleep(1); + } + + /* At this point the parent process has terminated completely. */ + _exit(0); +} + +static int async_teardown_fn(void *arg) +{ + struct sigaction sa = { .sa_handler = hup_handler }; + sigset_t hup_signal; + char name[16]; + + /* Set a meaningful name for this process. */ + snprintf(name, 16, "cleanup/%d", the_ppid); + prctl(PR_SET_NAME, (unsigned long)name); + + /* + * Close all file descriptors that might have been inherited from the + * main qemu process when doing clone, needed to make libvirt happy. + * Not using close_range for increased compatibility with older kernels. + */ + close_all_open_fd(); + + /* Set up a handler for SIGHUP and unblock SIGHUP. */ + sigaction(SIGHUP, &sa, NULL); + sigemptyset(&hup_signal); + sigaddset(&hup_signal, SIGHUP); + sigprocmask(SIG_UNBLOCK, &hup_signal, NULL); + + /* Ask to receive SIGHUP when the parent dies. */ + prctl(PR_SET_PDEATHSIG, SIGHUP); + + /* + * Sleep forever, unless the parent process has already terminated. The + * only interruption can come from the SIGHUP signal, which in normal + * operation is received when the parent process dies. + */ + if (the_ppid == getppid()) { + pause(); + } + + /* At this point the parent process has terminated completely. */ + _exit(0); +} + +/* + * Allocate a new stack of a reasonable size, and return a pointer to its top. + */ +static void *new_stack_for_clone(void) +{ + size_t stack_size = CLONE_STACK_SIZE; + char *stack_ptr; + + /* Allocate a new stack and get a pointer to its top. */ + stack_ptr = qemu_alloc_stack(&stack_size); + stack_ptr += stack_size; + + return stack_ptr; +} + +/* + * Block all signals, start (clone) a new process sharing the address space + * with qemu (CLONE_VM), then restore signals. + */ +void init_async_teardown(void) +{ + sigset_t all_signals, old_signals; + + the_ppid = getpid(); + + sigfillset(&all_signals); + sigprocmask(SIG_BLOCK, &all_signals, &old_signals); + clone(async_teardown_fn, new_stack_for_clone(), CLONE_VM, NULL); + sigprocmask(SIG_SETMASK, &old_signals, NULL); +} diff --git a/system/balloon.c b/system/balloon.c new file mode 100644 index 0000000000..e0e8969a4b --- /dev/null +++ b/system/balloon.c @@ -0,0 +1,106 @@ +/* + * Generic Balloon handlers and management + * + * Copyright (c) 2003-2008 Fabrice Bellard + * Copyright (C) 2011 Red Hat, Inc. + * Copyright (C) 2011 Amit Shah <amit.shah@redhat.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "qemu/osdep.h" +#include "qemu/atomic.h" +#include "sysemu/kvm.h" +#include "sysemu/balloon.h" +#include "qapi/error.h" +#include "qapi/qapi-commands-machine.h" +#include "qapi/qmp/qerror.h" +#include "trace.h" + +static QEMUBalloonEvent *balloon_event_fn; +static QEMUBalloonStatus *balloon_stat_fn; +static void *balloon_opaque; + +static bool have_balloon(Error **errp) +{ + if (kvm_enabled() && !kvm_has_sync_mmu()) { + error_set(errp, ERROR_CLASS_KVM_MISSING_CAP, + "Using KVM without synchronous MMU, balloon unavailable"); + return false; + } + if (!balloon_event_fn) { + error_set(errp, ERROR_CLASS_DEVICE_NOT_ACTIVE, + "No balloon device has been activated"); + return false; + } + return true; +} + +int qemu_add_balloon_handler(QEMUBalloonEvent *event_func, + QEMUBalloonStatus *stat_func, void *opaque) +{ + if (balloon_event_fn || balloon_stat_fn || balloon_opaque) { + /* We're already registered one balloon handler. How many can + * a guest really have? + */ + return -1; + } + balloon_event_fn = event_func; + balloon_stat_fn = stat_func; + balloon_opaque = opaque; + return 0; +} + +void qemu_remove_balloon_handler(void *opaque) +{ + if (balloon_opaque != opaque) { + return; + } + balloon_event_fn = NULL; + balloon_stat_fn = NULL; + balloon_opaque = NULL; +} + +BalloonInfo *qmp_query_balloon(Error **errp) +{ + BalloonInfo *info; + + if (!have_balloon(errp)) { + return NULL; + } + + info = g_malloc0(sizeof(*info)); + balloon_stat_fn(balloon_opaque, info); + return info; +} + +void qmp_balloon(int64_t target, Error **errp) +{ + if (!have_balloon(errp)) { + return; + } + + if (target <= 0) { + error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "target", "a size"); + return; + } + + trace_balloon_event(balloon_opaque, target); + balloon_event_fn(balloon_opaque, target); +} diff --git a/system/bootdevice.c b/system/bootdevice.c new file mode 100644 index 0000000000..2106f1026f --- /dev/null +++ b/system/bootdevice.c @@ -0,0 +1,430 @@ +/* + * QEMU Boot Device Implement + * + * Copyright (c) 2014 HUAWEI TECHNOLOGIES CO., LTD. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "sysemu/sysemu.h" +#include "qapi/visitor.h" +#include "qemu/error-report.h" +#include "sysemu/reset.h" +#include "hw/qdev-core.h" +#include "hw/boards.h" + +typedef struct FWBootEntry FWBootEntry; + +struct FWBootEntry { + QTAILQ_ENTRY(FWBootEntry) link; + int32_t bootindex; + DeviceState *dev; + char *suffix; +}; + +static QTAILQ_HEAD(, FWBootEntry) fw_boot_order = + QTAILQ_HEAD_INITIALIZER(fw_boot_order); +static QEMUBootSetHandler *boot_set_handler; +static void *boot_set_opaque; + +void qemu_register_boot_set(QEMUBootSetHandler *func, void *opaque) +{ + boot_set_handler = func; + boot_set_opaque = opaque; +} + +void qemu_boot_set(const char *boot_order, Error **errp) +{ + Error *local_err = NULL; + + if (!boot_set_handler) { + error_setg(errp, "no function defined to set boot device list for" + " this architecture"); + return; + } + + validate_bootdevices(boot_order, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + + boot_set_handler(boot_set_opaque, boot_order, errp); +} + +void validate_bootdevices(const char *devices, Error **errp) +{ + /* We just do some generic consistency checks */ + const char *p; + int bitmap = 0; + + for (p = devices; *p != '\0'; p++) { + /* Allowed boot devices are: + * a-b: floppy disk drives + * c-f: IDE disk drives + * g-m: machine implementation dependent drives + * n-p: network devices + * It's up to each machine implementation to check if the given boot + * devices match the actual hardware implementation and firmware + * features. + */ + if (*p < 'a' || *p > 'p') { + error_setg(errp, "Invalid boot device '%c'", *p); + return; + } + if (bitmap & (1 << (*p - 'a'))) { + error_setg(errp, "Boot device '%c' was given twice", *p); + return; + } + bitmap |= 1 << (*p - 'a'); + } +} + +void restore_boot_order(void *opaque) +{ + char *normal_boot_order = opaque; + static int first = 1; + + /* Restore boot order and remove ourselves after the first boot */ + if (first) { + first = 0; + return; + } + + if (boot_set_handler) { + qemu_boot_set(normal_boot_order, &error_abort); + } + + qemu_unregister_reset(restore_boot_order, normal_boot_order); + g_free(normal_boot_order); +} + +void check_boot_index(int32_t bootindex, Error **errp) +{ + FWBootEntry *i; + + if (bootindex >= 0) { + QTAILQ_FOREACH(i, &fw_boot_order, link) { + if (i->bootindex == bootindex) { + error_setg(errp, "The bootindex %d has already been used", + bootindex); + return; + } + } + } +} + +void del_boot_device_path(DeviceState *dev, const char *suffix) +{ + FWBootEntry *i; + + if (dev == NULL) { + return; + } + + QTAILQ_FOREACH(i, &fw_boot_order, link) { + if ((!suffix || !g_strcmp0(i->suffix, suffix)) && + i->dev == dev) { + QTAILQ_REMOVE(&fw_boot_order, i, link); + g_free(i->suffix); + g_free(i); + + break; + } + } +} + +void add_boot_device_path(int32_t bootindex, DeviceState *dev, + const char *suffix) +{ + FWBootEntry *node, *i; + + if (bootindex < 0) { + del_boot_device_path(dev, suffix); + return; + } + + assert(dev != NULL || suffix != NULL); + + del_boot_device_path(dev, suffix); + + node = g_new0(FWBootEntry, 1); + node->bootindex = bootindex; + node->suffix = g_strdup(suffix); + node->dev = dev; + + QTAILQ_FOREACH(i, &fw_boot_order, link) { + if (i->bootindex == bootindex) { + error_report("Two devices with same boot index %d", bootindex); + exit(1); + } else if (i->bootindex < bootindex) { + continue; + } + QTAILQ_INSERT_BEFORE(i, node, link); + return; + } + QTAILQ_INSERT_TAIL(&fw_boot_order, node, link); +} + +DeviceState *get_boot_device(uint32_t position) +{ + uint32_t counter = 0; + FWBootEntry *i = NULL; + DeviceState *res = NULL; + + if (!QTAILQ_EMPTY(&fw_boot_order)) { + QTAILQ_FOREACH(i, &fw_boot_order, link) { + if (counter == position) { + res = i->dev; + break; + } + counter++; + } + } + return res; +} + +static char *get_boot_device_path(DeviceState *dev, bool ignore_suffixes, + const char *suffix) +{ + char *devpath = NULL, *s = NULL, *d, *bootpath; + + if (dev) { + devpath = qdev_get_fw_dev_path(dev); + assert(devpath); + } + + if (!ignore_suffixes) { + if (dev) { + d = qdev_get_own_fw_dev_path_from_handler(dev->parent_bus, dev); + if (d) { + assert(!suffix); + s = d; + } else { + s = g_strdup(suffix); + } + } else { + s = g_strdup(suffix); + } + } + + bootpath = g_strdup_printf("%s%s", + devpath ? devpath : "", + s ? s : ""); + g_free(devpath); + g_free(s); + + return bootpath; +} + +/* + * This function returns null terminated string that consist of new line + * separated device paths. + * + * memory pointed by "size" is assigned total length of the array in bytes + * + */ +char *get_boot_devices_list(size_t *size) +{ + FWBootEntry *i; + size_t total = 0; + char *list = NULL; + MachineClass *mc = MACHINE_GET_CLASS(qdev_get_machine()); + bool ignore_suffixes = mc->ignore_boot_device_suffixes; + + QTAILQ_FOREACH(i, &fw_boot_order, link) { + char *bootpath; + size_t len; + + bootpath = get_boot_device_path(i->dev, ignore_suffixes, i->suffix); + + if (total) { + list[total-1] = '\n'; + } + len = strlen(bootpath) + 1; + list = g_realloc(list, total + len); + memcpy(&list[total], bootpath, len); + total += len; + g_free(bootpath); + } + + *size = total; + + if (current_machine->boot_config.has_strict && + current_machine->boot_config.strict && *size > 0) { + list[total-1] = '\n'; + list = g_realloc(list, total + 5); + memcpy(&list[total], "HALT", 5); + *size = total + 5; + } + return list; +} + +typedef struct { + int32_t *bootindex; + const char *suffix; + DeviceState *dev; +} BootIndexProperty; + +static void device_get_bootindex(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + BootIndexProperty *prop = opaque; + visit_type_int32(v, name, prop->bootindex, errp); +} + +static void device_set_bootindex(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + BootIndexProperty *prop = opaque; + int32_t boot_index; + Error *local_err = NULL; + + if (!visit_type_int32(v, name, &boot_index, errp)) { + return; + } + /* check whether bootindex is present in fw_boot_order list */ + check_boot_index(boot_index, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + /* change bootindex to a new one */ + *prop->bootindex = boot_index; + + add_boot_device_path(*prop->bootindex, prop->dev, prop->suffix); +} + +static void property_release_bootindex(Object *obj, const char *name, + void *opaque) + +{ + BootIndexProperty *prop = opaque; + + del_boot_device_path(prop->dev, prop->suffix); + g_free(prop); +} + +void device_add_bootindex_property(Object *obj, int32_t *bootindex, + const char *name, const char *suffix, + DeviceState *dev) +{ + BootIndexProperty *prop = g_malloc0(sizeof(*prop)); + + prop->bootindex = bootindex; + prop->suffix = suffix; + prop->dev = dev; + + object_property_add(obj, name, "int32", + device_get_bootindex, + device_set_bootindex, + property_release_bootindex, + prop); + + /* initialize devices' bootindex property to -1 */ + object_property_set_int(obj, name, -1, NULL); +} + +typedef struct FWLCHSEntry FWLCHSEntry; + +struct FWLCHSEntry { + QTAILQ_ENTRY(FWLCHSEntry) link; + DeviceState *dev; + char *suffix; + uint32_t lcyls; + uint32_t lheads; + uint32_t lsecs; +}; + +static QTAILQ_HEAD(, FWLCHSEntry) fw_lchs = + QTAILQ_HEAD_INITIALIZER(fw_lchs); + +void add_boot_device_lchs(DeviceState *dev, const char *suffix, + uint32_t lcyls, uint32_t lheads, uint32_t lsecs) +{ + FWLCHSEntry *node; + + if (!lcyls && !lheads && !lsecs) { + return; + } + + assert(dev != NULL || suffix != NULL); + + node = g_new0(FWLCHSEntry, 1); + node->suffix = g_strdup(suffix); + node->dev = dev; + node->lcyls = lcyls; + node->lheads = lheads; + node->lsecs = lsecs; + + QTAILQ_INSERT_TAIL(&fw_lchs, node, link); +} + +void del_boot_device_lchs(DeviceState *dev, const char *suffix) +{ + FWLCHSEntry *i; + + if (dev == NULL) { + return; + } + + QTAILQ_FOREACH(i, &fw_lchs, link) { + if ((!suffix || !g_strcmp0(i->suffix, suffix)) && + i->dev == dev) { + QTAILQ_REMOVE(&fw_lchs, i, link); + g_free(i->suffix); + g_free(i); + + break; + } + } +} + +char *get_boot_devices_lchs_list(size_t *size) +{ + FWLCHSEntry *i; + size_t total = 0; + char *list = NULL; + + QTAILQ_FOREACH(i, &fw_lchs, link) { + char *bootpath; + char *chs_string; + size_t len; + + bootpath = get_boot_device_path(i->dev, false, i->suffix); + chs_string = g_strdup_printf("%s %" PRIu32 " %" PRIu32 " %" PRIu32, + bootpath, i->lcyls, i->lheads, i->lsecs); + + if (total) { + list[total - 1] = '\n'; + } + len = strlen(chs_string) + 1; + list = g_realloc(list, total + len); + memcpy(&list[total], chs_string, len); + total += len; + g_free(chs_string); + g_free(bootpath); + } + + *size = total; + + return list; +} diff --git a/system/cpu-throttle.c b/system/cpu-throttle.c new file mode 100644 index 0000000000..d9bb30a223 --- /dev/null +++ b/system/cpu-throttle.c @@ -0,0 +1,128 @@ +/* + * QEMU System Emulator + * + * Copyright (c) 2003-2008 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "qemu/osdep.h" +#include "qemu/thread.h" +#include "hw/core/cpu.h" +#include "qemu/main-loop.h" +#include "sysemu/cpus.h" +#include "sysemu/cpu-throttle.h" + +/* vcpu throttling controls */ +static QEMUTimer *throttle_timer; +static unsigned int throttle_percentage; + +#define CPU_THROTTLE_PCT_MIN 1 +#define CPU_THROTTLE_PCT_MAX 99 +#define CPU_THROTTLE_TIMESLICE_NS 10000000 + +static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque) +{ + double pct; + double throttle_ratio; + int64_t sleeptime_ns, endtime_ns; + + if (!cpu_throttle_get_percentage()) { + return; + } + + pct = (double)cpu_throttle_get_percentage() / 100; + throttle_ratio = pct / (1 - pct); + /* Add 1ns to fix double's rounding error (like 0.9999999...) */ + sleeptime_ns = (int64_t)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS + 1); + endtime_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + sleeptime_ns; + while (sleeptime_ns > 0 && !cpu->stop) { + if (sleeptime_ns > SCALE_MS) { + qemu_cond_timedwait_iothread(cpu->halt_cond, + sleeptime_ns / SCALE_MS); + } else { + qemu_mutex_unlock_iothread(); + g_usleep(sleeptime_ns / SCALE_US); + qemu_mutex_lock_iothread(); + } + sleeptime_ns = endtime_ns - qemu_clock_get_ns(QEMU_CLOCK_REALTIME); + } + qatomic_set(&cpu->throttle_thread_scheduled, 0); +} + +static void cpu_throttle_timer_tick(void *opaque) +{ + CPUState *cpu; + double pct; + + /* Stop the timer if needed */ + if (!cpu_throttle_get_percentage()) { + return; + } + CPU_FOREACH(cpu) { + if (!qatomic_xchg(&cpu->throttle_thread_scheduled, 1)) { + async_run_on_cpu(cpu, cpu_throttle_thread, + RUN_ON_CPU_NULL); + } + } + + pct = (double)cpu_throttle_get_percentage() / 100; + timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) + + CPU_THROTTLE_TIMESLICE_NS / (1 - pct)); +} + +void cpu_throttle_set(int new_throttle_pct) +{ + /* + * boolean to store whether throttle is already active or not, + * before modifying throttle_percentage + */ + bool throttle_active = cpu_throttle_active(); + + /* Ensure throttle percentage is within valid range */ + new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX); + new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN); + + qatomic_set(&throttle_percentage, new_throttle_pct); + + if (!throttle_active) { + cpu_throttle_timer_tick(NULL); + } +} + +void cpu_throttle_stop(void) +{ + qatomic_set(&throttle_percentage, 0); +} + +bool cpu_throttle_active(void) +{ + return (cpu_throttle_get_percentage() != 0); +} + +int cpu_throttle_get_percentage(void) +{ + return qatomic_read(&throttle_percentage); +} + +void cpu_throttle_init(void) +{ + throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT, + cpu_throttle_timer_tick, NULL); +} diff --git a/system/cpu-timers.c b/system/cpu-timers.c new file mode 100644 index 0000000000..7452d97b67 --- /dev/null +++ b/system/cpu-timers.c @@ -0,0 +1,277 @@ +/* + * QEMU System Emulator + * + * Copyright (c) 2003-2008 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "qemu/osdep.h" +#include "qemu/cutils.h" +#include "migration/vmstate.h" +#include "qapi/error.h" +#include "qemu/error-report.h" +#include "sysemu/cpus.h" +#include "qemu/main-loop.h" +#include "qemu/option.h" +#include "qemu/seqlock.h" +#include "sysemu/replay.h" +#include "sysemu/runstate.h" +#include "hw/core/cpu.h" +#include "sysemu/cpu-timers.h" +#include "sysemu/cpu-throttle.h" +#include "sysemu/cpu-timers-internal.h" + +/* clock and ticks */ + +static int64_t cpu_get_ticks_locked(void) +{ + int64_t ticks = timers_state.cpu_ticks_offset; + if (timers_state.cpu_ticks_enabled) { + ticks += cpu_get_host_ticks(); + } + + if (timers_state.cpu_ticks_prev > ticks) { + /* Non increasing ticks may happen if the host uses software suspend. */ + timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks; + ticks = timers_state.cpu_ticks_prev; + } + + timers_state.cpu_ticks_prev = ticks; + return ticks; +} + +/* + * return the time elapsed in VM between vm_start and vm_stop. + * cpu_get_ticks() uses units of the host CPU cycle counter. + */ +int64_t cpu_get_ticks(void) +{ + int64_t ticks; + + qemu_spin_lock(&timers_state.vm_clock_lock); + ticks = cpu_get_ticks_locked(); + qemu_spin_unlock(&timers_state.vm_clock_lock); + return ticks; +} + +int64_t cpu_get_clock_locked(void) +{ + int64_t time; + + time = timers_state.cpu_clock_offset; + if (timers_state.cpu_ticks_enabled) { + time += get_clock(); + } + + return time; +} + +/* + * Return the monotonic time elapsed in VM, i.e., + * the time between vm_start and vm_stop + */ +int64_t cpu_get_clock(void) +{ + int64_t ti; + unsigned start; + + do { + start = seqlock_read_begin(&timers_state.vm_clock_seqlock); + ti = cpu_get_clock_locked(); + } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start)); + + return ti; +} + +/* + * enable cpu_get_ticks() + * Caller must hold BQL which serves as mutex for vm_clock_seqlock. + */ +void cpu_enable_ticks(void) +{ + seqlock_write_lock(&timers_state.vm_clock_seqlock, + &timers_state.vm_clock_lock); + if (!timers_state.cpu_ticks_enabled) { + timers_state.cpu_ticks_offset -= cpu_get_host_ticks(); + timers_state.cpu_clock_offset -= get_clock(); + timers_state.cpu_ticks_enabled = 1; + } + seqlock_write_unlock(&timers_state.vm_clock_seqlock, + &timers_state.vm_clock_lock); +} + +/* + * disable cpu_get_ticks() : the clock is stopped. You must not call + * cpu_get_ticks() after that. + * Caller must hold BQL which serves as mutex for vm_clock_seqlock. + */ +void cpu_disable_ticks(void) +{ + seqlock_write_lock(&timers_state.vm_clock_seqlock, + &timers_state.vm_clock_lock); + if (timers_state.cpu_ticks_enabled) { + timers_state.cpu_ticks_offset += cpu_get_host_ticks(); + timers_state.cpu_clock_offset = cpu_get_clock_locked(); + timers_state.cpu_ticks_enabled = 0; + } + seqlock_write_unlock(&timers_state.vm_clock_seqlock, + &timers_state.vm_clock_lock); +} + +static bool icount_state_needed(void *opaque) +{ + return icount_enabled(); +} + +static bool warp_timer_state_needed(void *opaque) +{ + TimersState *s = opaque; + return s->icount_warp_timer != NULL; +} + +static bool adjust_timers_state_needed(void *opaque) +{ + TimersState *s = opaque; + return s->icount_rt_timer != NULL; +} + +static bool icount_shift_state_needed(void *opaque) +{ + return icount_enabled() == 2; +} + +/* + * Subsection for warp timer migration is optional, because may not be created + */ +static const VMStateDescription icount_vmstate_warp_timer = { + .name = "timer/icount/warp_timer", + .version_id = 1, + .minimum_version_id = 1, + .needed = warp_timer_state_needed, + .fields = (VMStateField[]) { + VMSTATE_INT64(vm_clock_warp_start, TimersState), + VMSTATE_TIMER_PTR(icount_warp_timer, TimersState), + VMSTATE_END_OF_LIST() + } +}; + +static const VMStateDescription icount_vmstate_adjust_timers = { + .name = "timer/icount/timers", + .version_id = 1, + .minimum_version_id = 1, + .needed = adjust_timers_state_needed, + .fields = (VMStateField[]) { + VMSTATE_TIMER_PTR(icount_rt_timer, TimersState), + VMSTATE_TIMER_PTR(icount_vm_timer, TimersState), + VMSTATE_END_OF_LIST() + } +}; + +static const VMStateDescription icount_vmstate_shift = { + .name = "timer/icount/shift", + .version_id = 2, + .minimum_version_id = 2, + .needed = icount_shift_state_needed, + .fields = (VMStateField[]) { + VMSTATE_INT16(icount_time_shift, TimersState), + VMSTATE_INT64(last_delta, TimersState), + VMSTATE_END_OF_LIST() + } +}; + +/* + * This is a subsection for icount migration. + */ +static const VMStateDescription icount_vmstate_timers = { + .name = "timer/icount", + .version_id = 1, + .minimum_version_id = 1, + .needed = icount_state_needed, + .fields = (VMStateField[]) { + VMSTATE_INT64(qemu_icount_bias, TimersState), + VMSTATE_INT64(qemu_icount, TimersState), + VMSTATE_END_OF_LIST() + }, + .subsections = (const VMStateDescription * []) { + &icount_vmstate_warp_timer, + &icount_vmstate_adjust_timers, + &icount_vmstate_shift, + NULL + } +}; + +static const VMStateDescription vmstate_timers = { + .name = "timer", + .version_id = 2, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_INT64(cpu_ticks_offset, TimersState), + VMSTATE_UNUSED(8), + VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2), + VMSTATE_END_OF_LIST() + }, + .subsections = (const VMStateDescription * []) { + &icount_vmstate_timers, + NULL + } +}; + +static void do_nothing(CPUState *cpu, run_on_cpu_data unused) +{ +} + +void qemu_timer_notify_cb(void *opaque, QEMUClockType type) +{ + if (!icount_enabled() || type != QEMU_CLOCK_VIRTUAL) { + qemu_notify_event(); + return; + } + + if (qemu_in_vcpu_thread()) { + /* + * A CPU is currently running; kick it back out to the + * tcg_cpu_exec() loop so it will recalculate its + * icount deadline immediately. + */ + qemu_cpu_kick(current_cpu); + } else if (first_cpu) { + /* + * qemu_cpu_kick is not enough to kick a halted CPU out of + * qemu_tcg_wait_io_event. async_run_on_cpu, instead, + * causes cpu_thread_is_idle to return false. This way, + * handle_icount_deadline can run. + * If we have no CPUs at all for some reason, we don't + * need to do anything. + */ + async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL); + } +} + +TimersState timers_state; + +/* initialize timers state and the cpu throttle for convenience */ +void cpu_timers_init(void) +{ + seqlock_init(&timers_state.vm_clock_seqlock); + qemu_spin_init(&timers_state.vm_clock_lock); + vmstate_register(NULL, 0, &vmstate_timers, &timers_state); + + cpu_throttle_init(); +} diff --git a/system/cpus.c b/system/cpus.c new file mode 100644 index 0000000000..0848e0dbdb --- /dev/null +++ b/system/cpus.c @@ -0,0 +1,822 @@ +/* + * QEMU System Emulator + * + * Copyright (c) 2003-2008 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "qemu/osdep.h" +#include "monitor/monitor.h" +#include "qemu/coroutine-tls.h" +#include "qapi/error.h" +#include "qapi/qapi-commands-machine.h" +#include "qapi/qapi-commands-misc.h" +#include "qapi/qapi-events-run-state.h" +#include "qapi/qmp/qerror.h" +#include "exec/gdbstub.h" +#include "sysemu/hw_accel.h" +#include "exec/cpu-common.h" +#include "qemu/thread.h" +#include "qemu/main-loop.h" +#include "qemu/plugin.h" +#include "sysemu/cpus.h" +#include "qemu/guest-random.h" +#include "hw/nmi.h" +#include "sysemu/replay.h" +#include "sysemu/runstate.h" +#include "sysemu/cpu-timers.h" +#include "sysemu/whpx.h" +#include "hw/boards.h" +#include "hw/hw.h" +#include "trace.h" + +#ifdef CONFIG_LINUX + +#include <sys/prctl.h> + +#ifndef PR_MCE_KILL +#define PR_MCE_KILL 33 +#endif + +#ifndef PR_MCE_KILL_SET +#define PR_MCE_KILL_SET 1 +#endif + +#ifndef PR_MCE_KILL_EARLY +#define PR_MCE_KILL_EARLY 1 +#endif + +#endif /* CONFIG_LINUX */ + +static QemuMutex qemu_global_mutex; + +/* + * The chosen accelerator is supposed to register this. + */ +static const AccelOpsClass *cpus_accel; + +bool cpu_is_stopped(CPUState *cpu) +{ + return cpu->stopped || !runstate_is_running(); +} + +bool cpu_work_list_empty(CPUState *cpu) +{ + return QSIMPLEQ_EMPTY_ATOMIC(&cpu->work_list); +} + +bool cpu_thread_is_idle(CPUState *cpu) +{ + if (cpu->stop || !cpu_work_list_empty(cpu)) { + return false; + } + if (cpu_is_stopped(cpu)) { + return true; + } + if (!cpu->halted || cpu_has_work(cpu)) { + return false; + } + if (cpus_accel->cpu_thread_is_idle) { + return cpus_accel->cpu_thread_is_idle(cpu); + } + return true; +} + +bool all_cpu_threads_idle(void) +{ + CPUState *cpu; + + CPU_FOREACH(cpu) { + if (!cpu_thread_is_idle(cpu)) { + return false; + } + } + return true; +} + +/***********************************************************/ +void hw_error(const char *fmt, ...) +{ + va_list ap; + CPUState *cpu; + + va_start(ap, fmt); + fprintf(stderr, "qemu: hardware error: "); + vfprintf(stderr, fmt, ap); + fprintf(stderr, "\n"); + CPU_FOREACH(cpu) { + fprintf(stderr, "CPU #%d:\n", cpu->cpu_index); + cpu_dump_state(cpu, stderr, CPU_DUMP_FPU); + } + va_end(ap); + abort(); +} + +void cpu_synchronize_all_states(void) +{ + CPUState *cpu; + + CPU_FOREACH(cpu) { + cpu_synchronize_state(cpu); + } +} + +void cpu_synchronize_all_post_reset(void) +{ + CPUState *cpu; + + CPU_FOREACH(cpu) { + cpu_synchronize_post_reset(cpu); + } +} + +void cpu_synchronize_all_post_init(void) +{ + CPUState *cpu; + + CPU_FOREACH(cpu) { + cpu_synchronize_post_init(cpu); + } +} + +void cpu_synchronize_all_pre_loadvm(void) +{ + CPUState *cpu; + + CPU_FOREACH(cpu) { + cpu_synchronize_pre_loadvm(cpu); + } +} + +void cpu_synchronize_state(CPUState *cpu) +{ + if (cpus_accel->synchronize_state) { + cpus_accel->synchronize_state(cpu); + } +} + +void cpu_synchronize_post_reset(CPUState *cpu) +{ + if (cpus_accel->synchronize_post_reset) { + cpus_accel->synchronize_post_reset(cpu); + } +} + +void cpu_synchronize_post_init(CPUState *cpu) +{ + if (cpus_accel->synchronize_post_init) { + cpus_accel->synchronize_post_init(cpu); + } +} + +void cpu_synchronize_pre_loadvm(CPUState *cpu) +{ + if (cpus_accel->synchronize_pre_loadvm) { + cpus_accel->synchronize_pre_loadvm(cpu); + } +} + +bool cpus_are_resettable(void) +{ + if (cpus_accel->cpus_are_resettable) { + return cpus_accel->cpus_are_resettable(); + } + return true; +} + +int64_t cpus_get_virtual_clock(void) +{ + /* + * XXX + * + * need to check that cpus_accel is not NULL, because qcow2 calls + * qemu_get_clock_ns(CLOCK_VIRTUAL) without any accel initialized and + * with ticks disabled in some io-tests: + * 030 040 041 060 099 120 127 140 156 161 172 181 191 192 195 203 229 249 256 267 + * + * is this expected? + * + * XXX + */ + if (cpus_accel && cpus_accel->get_virtual_clock) { + return cpus_accel->get_virtual_clock(); + } + return cpu_get_clock(); +} + +/* + * return the time elapsed in VM between vm_start and vm_stop. Unless + * icount is active, cpus_get_elapsed_ticks() uses units of the host CPU cycle + * counter. + */ +int64_t cpus_get_elapsed_ticks(void) +{ + if (cpus_accel->get_elapsed_ticks) { + return cpus_accel->get_elapsed_ticks(); + } + return cpu_get_ticks(); +} + +static void generic_handle_interrupt(CPUState *cpu, int mask) +{ + cpu->interrupt_request |= mask; + + if (!qemu_cpu_is_self(cpu)) { + qemu_cpu_kick(cpu); + } +} + +void cpu_interrupt(CPUState *cpu, int mask) +{ + if (cpus_accel->handle_interrupt) { + cpus_accel->handle_interrupt(cpu, mask); + } else { + generic_handle_interrupt(cpu, mask); + } +} + +static int do_vm_stop(RunState state, bool send_stop) +{ + int ret = 0; + + if (runstate_is_running()) { + runstate_set(state); + cpu_disable_ticks(); + pause_all_vcpus(); + vm_state_notify(0, state); + if (send_stop) { + qapi_event_send_stop(); + } + } + + bdrv_drain_all(); + ret = bdrv_flush_all(); + trace_vm_stop_flush_all(ret); + + return ret; +} + +/* Special vm_stop() variant for terminating the process. Historically clients + * did not expect a QMP STOP event and so we need to retain compatibility. + */ +int vm_shutdown(void) +{ + return do_vm_stop(RUN_STATE_SHUTDOWN, false); +} + +bool cpu_can_run(CPUState *cpu) +{ + if (cpu->stop) { + return false; + } + if (cpu_is_stopped(cpu)) { + return false; + } + return true; +} + +void cpu_handle_guest_debug(CPUState *cpu) +{ + if (replay_running_debug()) { + if (!cpu->singlestep_enabled) { + /* + * Report about the breakpoint and + * make a single step to skip it + */ + replay_breakpoint(); + cpu_single_step(cpu, SSTEP_ENABLE); + } else { + cpu_single_step(cpu, 0); + } + } else { + gdb_set_stop_cpu(cpu); + qemu_system_debug_request(); + cpu->stopped = true; + } +} + +#ifdef CONFIG_LINUX +static void sigbus_reraise(void) +{ + sigset_t set; + struct sigaction action; + + memset(&action, 0, sizeof(action)); + action.sa_handler = SIG_DFL; + if (!sigaction(SIGBUS, &action, NULL)) { + raise(SIGBUS); + sigemptyset(&set); + sigaddset(&set, SIGBUS); + pthread_sigmask(SIG_UNBLOCK, &set, NULL); + } + perror("Failed to re-raise SIGBUS!"); + abort(); +} + +static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx) +{ + if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) { + sigbus_reraise(); + } + + if (current_cpu) { + /* Called asynchronously in VCPU thread. */ + if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) { + sigbus_reraise(); + } + } else { + /* Called synchronously (via signalfd) in main thread. */ + if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) { + sigbus_reraise(); + } + } +} + +static void qemu_init_sigbus(void) +{ + struct sigaction action; + + /* + * ALERT: when modifying this, take care that SIGBUS forwarding in + * qemu_prealloc_mem() will continue working as expected. + */ + memset(&action, 0, sizeof(action)); + action.sa_flags = SA_SIGINFO; + action.sa_sigaction = sigbus_handler; + sigaction(SIGBUS, &action, NULL); + + prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0); +} +#else /* !CONFIG_LINUX */ +static void qemu_init_sigbus(void) +{ +} +#endif /* !CONFIG_LINUX */ + +static QemuThread io_thread; + +/* cpu creation */ +static QemuCond qemu_cpu_cond; +/* system init */ +static QemuCond qemu_pause_cond; + +void qemu_init_cpu_loop(void) +{ + qemu_init_sigbus(); + qemu_cond_init(&qemu_cpu_cond); + qemu_cond_init(&qemu_pause_cond); + qemu_mutex_init(&qemu_global_mutex); + + qemu_thread_get_self(&io_thread); +} + +void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data) +{ + do_run_on_cpu(cpu, func, data, &qemu_global_mutex); +} + +static void qemu_cpu_stop(CPUState *cpu, bool exit) +{ + g_assert(qemu_cpu_is_self(cpu)); + cpu->stop = false; + cpu->stopped = true; + if (exit) { + cpu_exit(cpu); + } + qemu_cond_broadcast(&qemu_pause_cond); +} + +void qemu_wait_io_event_common(CPUState *cpu) +{ + qatomic_set_mb(&cpu->thread_kicked, false); + if (cpu->stop) { + qemu_cpu_stop(cpu, false); + } + process_queued_cpu_work(cpu); +} + +void qemu_wait_io_event(CPUState *cpu) +{ + bool slept = false; + + while (cpu_thread_is_idle(cpu)) { + if (!slept) { + slept = true; + qemu_plugin_vcpu_idle_cb(cpu); + } + qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex); + } + if (slept) { + qemu_plugin_vcpu_resume_cb(cpu); + } + + qemu_wait_io_event_common(cpu); +} + +void cpus_kick_thread(CPUState *cpu) +{ + if (cpu->thread_kicked) { + return; + } + cpu->thread_kicked = true; + +#ifndef _WIN32 + int err = pthread_kill(cpu->thread->thread, SIG_IPI); + if (err && err != ESRCH) { + fprintf(stderr, "qemu:%s: %s", __func__, strerror(err)); + exit(1); + } +#else + qemu_sem_post(&cpu->sem); +#endif +} + +void qemu_cpu_kick(CPUState *cpu) +{ + qemu_cond_broadcast(cpu->halt_cond); + if (cpus_accel->kick_vcpu_thread) { + cpus_accel->kick_vcpu_thread(cpu); + } else { /* default */ + cpus_kick_thread(cpu); + } +} + +void qemu_cpu_kick_self(void) +{ + assert(current_cpu); + cpus_kick_thread(current_cpu); +} + +bool qemu_cpu_is_self(CPUState *cpu) +{ + return qemu_thread_is_self(cpu->thread); +} + +bool qemu_in_vcpu_thread(void) +{ + return current_cpu && qemu_cpu_is_self(current_cpu); +} + +QEMU_DEFINE_STATIC_CO_TLS(bool, iothread_locked) + +bool qemu_mutex_iothread_locked(void) +{ + return get_iothread_locked(); +} + +bool qemu_in_main_thread(void) +{ + return qemu_mutex_iothread_locked(); +} + +/* + * The BQL is taken from so many places that it is worth profiling the + * callers directly, instead of funneling them all through a single function. + */ +void qemu_mutex_lock_iothread_impl(const char *file, int line) +{ + QemuMutexLockFunc bql_lock = qatomic_read(&qemu_bql_mutex_lock_func); + + g_assert(!qemu_mutex_iothread_locked()); + bql_lock(&qemu_global_mutex, file, line); + set_iothread_locked(true); +} + +void qemu_mutex_unlock_iothread(void) +{ + g_assert(qemu_mutex_iothread_locked()); + set_iothread_locked(false); + qemu_mutex_unlock(&qemu_global_mutex); +} + +void qemu_cond_wait_iothread(QemuCond *cond) +{ + qemu_cond_wait(cond, &qemu_global_mutex); +} + +void qemu_cond_timedwait_iothread(QemuCond *cond, int ms) +{ + qemu_cond_timedwait(cond, &qemu_global_mutex, ms); +} + +/* signal CPU creation */ +void cpu_thread_signal_created(CPUState *cpu) +{ + cpu->created = true; + qemu_cond_signal(&qemu_cpu_cond); +} + +/* signal CPU destruction */ +void cpu_thread_signal_destroyed(CPUState *cpu) +{ + cpu->created = false; + qemu_cond_signal(&qemu_cpu_cond); +} + + +static bool all_vcpus_paused(void) +{ + CPUState *cpu; + + CPU_FOREACH(cpu) { + if (!cpu->stopped) { + return false; + } + } + + return true; +} + +void pause_all_vcpus(void) +{ + CPUState *cpu; + + qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false); + CPU_FOREACH(cpu) { + if (qemu_cpu_is_self(cpu)) { + qemu_cpu_stop(cpu, true); + } else { + cpu->stop = true; + qemu_cpu_kick(cpu); + } + } + + /* We need to drop the replay_lock so any vCPU threads woken up + * can finish their replay tasks + */ + replay_mutex_unlock(); + + while (!all_vcpus_paused()) { + qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex); + CPU_FOREACH(cpu) { + qemu_cpu_kick(cpu); + } + } + + qemu_mutex_unlock_iothread(); + replay_mutex_lock(); + qemu_mutex_lock_iothread(); +} + +void cpu_resume(CPUState *cpu) +{ + cpu->stop = false; + cpu->stopped = false; + qemu_cpu_kick(cpu); +} + +void resume_all_vcpus(void) +{ + CPUState *cpu; + + if (!runstate_is_running()) { + return; + } + + qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true); + CPU_FOREACH(cpu) { + cpu_resume(cpu); + } +} + +void cpu_remove_sync(CPUState *cpu) +{ + cpu->stop = true; + cpu->unplug = true; + qemu_cpu_kick(cpu); + qemu_mutex_unlock_iothread(); + qemu_thread_join(cpu->thread); + qemu_mutex_lock_iothread(); +} + +void cpus_register_accel(const AccelOpsClass *ops) +{ + assert(ops != NULL); + assert(ops->create_vcpu_thread != NULL); /* mandatory */ + cpus_accel = ops; +} + +const AccelOpsClass *cpus_get_accel(void) +{ + /* broken if we call this early */ + assert(cpus_accel); + return cpus_accel; +} + +void qemu_init_vcpu(CPUState *cpu) +{ + MachineState *ms = MACHINE(qdev_get_machine()); + + cpu->nr_cores = ms->smp.cores; + cpu->nr_threads = ms->smp.threads; + cpu->stopped = true; + cpu->random_seed = qemu_guest_random_seed_thread_part1(); + + if (!cpu->as) { + /* If the target cpu hasn't set up any address spaces itself, + * give it the default one. + */ + cpu->num_ases = 1; + cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory); + } + + /* accelerators all implement the AccelOpsClass */ + g_assert(cpus_accel != NULL && cpus_accel->create_vcpu_thread != NULL); + cpus_accel->create_vcpu_thread(cpu); + + while (!cpu->created) { + qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex); + } +} + +void cpu_stop_current(void) +{ + if (current_cpu) { + current_cpu->stop = true; + cpu_exit(current_cpu); + } +} + +int vm_stop(RunState state) +{ + if (qemu_in_vcpu_thread()) { + qemu_system_vmstop_request_prepare(); + qemu_system_vmstop_request(state); + /* + * FIXME: should not return to device code in case + * vm_stop() has been requested. + */ + cpu_stop_current(); + return 0; + } + + return do_vm_stop(state, true); +} + +/** + * Prepare for (re)starting the VM. + * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already + * running or in case of an error condition), 0 otherwise. + */ +int vm_prepare_start(bool step_pending) +{ + RunState requested; + + qemu_vmstop_requested(&requested); + if (runstate_is_running() && requested == RUN_STATE__MAX) { + return -1; + } + + /* Ensure that a STOP/RESUME pair of events is emitted if a + * vmstop request was pending. The BLOCK_IO_ERROR event, for + * example, according to documentation is always followed by + * the STOP event. + */ + if (runstate_is_running()) { + qapi_event_send_stop(); + qapi_event_send_resume(); + return -1; + } + + /* + * WHPX accelerator needs to know whether we are going to step + * any CPUs, before starting the first one. + */ + if (cpus_accel->synchronize_pre_resume) { + cpus_accel->synchronize_pre_resume(step_pending); + } + + /* We are sending this now, but the CPUs will be resumed shortly later */ + qapi_event_send_resume(); + + cpu_enable_ticks(); + runstate_set(RUN_STATE_RUNNING); + vm_state_notify(1, RUN_STATE_RUNNING); + return 0; +} + +void vm_start(void) +{ + if (!vm_prepare_start(false)) { + resume_all_vcpus(); + } +} + +/* does a state transition even if the VM is already stopped, + current state is forgotten forever */ +int vm_stop_force_state(RunState state) +{ + if (runstate_is_running()) { + return vm_stop(state); + } else { + int ret; + runstate_set(state); + + bdrv_drain_all(); + /* Make sure to return an error if the flush in a previous vm_stop() + * failed. */ + ret = bdrv_flush_all(); + trace_vm_stop_flush_all(ret); + return ret; + } +} + +void qmp_memsave(int64_t addr, int64_t size, const char *filename, + bool has_cpu, int64_t cpu_index, Error **errp) +{ + FILE *f; + uint32_t l; + CPUState *cpu; + uint8_t buf[1024]; + int64_t orig_addr = addr, orig_size = size; + + if (!has_cpu) { + cpu_index = 0; + } + + cpu = qemu_get_cpu(cpu_index); + if (cpu == NULL) { + error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index", + "a CPU number"); + return; + } + + f = fopen(filename, "wb"); + if (!f) { + error_setg_file_open(errp, errno, filename); + return; + } + + while (size != 0) { + l = sizeof(buf); + if (l > size) + l = size; + if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) { + error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64 + " specified", orig_addr, orig_size); + goto exit; + } + if (fwrite(buf, 1, l, f) != l) { + error_setg(errp, QERR_IO_ERROR); + goto exit; + } + addr += l; + size -= l; + } + +exit: + fclose(f); +} + +void qmp_pmemsave(int64_t addr, int64_t size, const char *filename, + Error **errp) +{ + FILE *f; + uint32_t l; + uint8_t buf[1024]; + + f = fopen(filename, "wb"); + if (!f) { + error_setg_file_open(errp, errno, filename); + return; + } + + while (size != 0) { + l = sizeof(buf); + if (l > size) + l = size; + cpu_physical_memory_read(addr, buf, l); + if (fwrite(buf, 1, l, f) != l) { + error_setg(errp, QERR_IO_ERROR); + goto exit; + } + addr += l; + size -= l; + } + +exit: + fclose(f); +} + +void qmp_inject_nmi(Error **errp) +{ + nmi_monitor_handle(monitor_get_cpu_index(monitor_cur()), errp); +} + diff --git a/system/datadir.c b/system/datadir.c new file mode 100644 index 0000000000..c9237cb5d4 --- /dev/null +++ b/system/datadir.c @@ -0,0 +1,110 @@ +/* + * QEMU firmware and keymap file search + * + * Copyright (c) 2003-2020 QEMU contributors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "qemu/osdep.h" +#include "qemu/datadir.h" +#include "qemu/cutils.h" +#include "trace.h" + +static const char *data_dir[16]; +static int data_dir_idx; + +char *qemu_find_file(int type, const char *name) +{ + int i; + const char *subdir; + char *buf; + + /* Try the name as a straight path first */ + if (access(name, R_OK) == 0) { + trace_load_file(name, name); + return g_strdup(name); + } + + switch (type) { + case QEMU_FILE_TYPE_BIOS: + subdir = ""; + break; + case QEMU_FILE_TYPE_KEYMAP: + subdir = "keymaps/"; + break; + default: + abort(); + } + + for (i = 0; i < data_dir_idx; i++) { + buf = g_strdup_printf("%s/%s%s", data_dir[i], subdir, name); + if (access(buf, R_OK) == 0) { + trace_load_file(name, buf); + return buf; + } + g_free(buf); + } + return NULL; +} + +void qemu_add_data_dir(char *path) +{ + int i; + + if (path == NULL) { + return; + } + if (data_dir_idx == ARRAY_SIZE(data_dir)) { + return; + } + for (i = 0; i < data_dir_idx; i++) { + if (strcmp(data_dir[i], path) == 0) { + g_free(path); /* duplicate */ + return; + } + } + data_dir[data_dir_idx++] = path; +} + +void qemu_add_default_firmwarepath(void) +{ + static const char * const dirs[] = { + CONFIG_QEMU_FIRMWAREPATH + NULL + }; + + size_t i; + + /* add configured firmware directories */ + for (i = 0; dirs[i] != NULL; i++) { + qemu_add_data_dir(get_relocated_path(dirs[i])); + } + + /* try to find datadir relative to the executable path */ + qemu_add_data_dir(get_relocated_path(CONFIG_QEMU_DATADIR)); +} + +void qemu_list_data_dirs(void) +{ + int i; + for (i = 0; i < data_dir_idx; i++) { + printf("%s\n", data_dir[i]); + } +} diff --git a/system/device_tree.c b/system/device_tree.c new file mode 100644 index 0000000000..eb5166ca36 --- /dev/null +++ b/system/device_tree.c @@ -0,0 +1,703 @@ +/* + * Functions to help device tree manipulation using libfdt. + * It also provides functions to read entries from device tree proc + * interface. + * + * Copyright 2008 IBM Corporation. + * Authors: Jerone Young <jyoung5@us.ibm.com> + * Hollis Blanchard <hollisb@us.ibm.com> + * + * This work is licensed under the GNU GPL license version 2 or later. + * + */ + +#include "qemu/osdep.h" + +#ifdef CONFIG_LINUX +#include <dirent.h> +#endif + +#include "qapi/error.h" +#include "qemu/error-report.h" +#include "qemu/option.h" +#include "qemu/bswap.h" +#include "qemu/cutils.h" +#include "qemu/guest-random.h" +#include "sysemu/device_tree.h" +#include "hw/loader.h" +#include "hw/boards.h" +#include "qemu/config-file.h" +#include "qapi/qapi-commands-machine.h" +#include "qapi/qmp/qdict.h" +#include "monitor/hmp.h" + +#include <libfdt.h> + +#define FDT_MAX_SIZE 0x100000 + +void *create_device_tree(int *sizep) +{ + void *fdt; + int ret; + + *sizep = FDT_MAX_SIZE; + fdt = g_malloc0(FDT_MAX_SIZE); + ret = fdt_create(fdt, FDT_MAX_SIZE); + if (ret < 0) { + goto fail; + } + ret = fdt_finish_reservemap(fdt); + if (ret < 0) { + goto fail; + } + ret = fdt_begin_node(fdt, ""); + if (ret < 0) { + goto fail; + } + ret = fdt_end_node(fdt); + if (ret < 0) { + goto fail; + } + ret = fdt_finish(fdt); + if (ret < 0) { + goto fail; + } + ret = fdt_open_into(fdt, fdt, *sizep); + if (ret) { + error_report("%s: Unable to copy device tree into memory: %s", + __func__, fdt_strerror(ret)); + exit(1); + } + + return fdt; +fail: + error_report("%s Couldn't create dt: %s", __func__, fdt_strerror(ret)); + exit(1); +} + +void *load_device_tree(const char *filename_path, int *sizep) +{ + int dt_size; + int dt_file_load_size; + int ret; + void *fdt = NULL; + + *sizep = 0; + dt_size = get_image_size(filename_path); + if (dt_size < 0) { + error_report("Unable to get size of device tree file '%s'", + filename_path); + goto fail; + } + if (dt_size > INT_MAX / 2 - 10000) { + error_report("Device tree file '%s' is too large", filename_path); + goto fail; + } + + /* Expand to 2x size to give enough room for manipulation. */ + dt_size += 10000; + dt_size *= 2; + /* First allocate space in qemu for device tree */ + fdt = g_malloc0(dt_size); + + dt_file_load_size = load_image_size(filename_path, fdt, dt_size); + if (dt_file_load_size < 0) { + error_report("Unable to open device tree file '%s'", + filename_path); + goto fail; + } + + ret = fdt_open_into(fdt, fdt, dt_size); + if (ret) { + error_report("%s: Unable to copy device tree into memory: %s", + __func__, fdt_strerror(ret)); + goto fail; + } + + /* Check sanity of device tree */ + if (fdt_check_header(fdt)) { + error_report("Device tree file loaded into memory is invalid: %s", + filename_path); + goto fail; + } + *sizep = dt_size; + return fdt; + +fail: + g_free(fdt); + return NULL; +} + +#ifdef CONFIG_LINUX + +#define SYSFS_DT_BASEDIR "/proc/device-tree" + +/** + * read_fstree: this function is inspired from dtc read_fstree + * @fdt: preallocated fdt blob buffer, to be populated + * @dirname: directory to scan under SYSFS_DT_BASEDIR + * the search is recursive and the tree is searched down to the + * leaves (property files). + * + * the function asserts in case of error + */ +static void read_fstree(void *fdt, const char *dirname) +{ + DIR *d; + struct dirent *de; + struct stat st; + const char *root_dir = SYSFS_DT_BASEDIR; + const char *parent_node; + + if (strstr(dirname, root_dir) != dirname) { + error_report("%s: %s must be searched within %s", + __func__, dirname, root_dir); + exit(1); + } + parent_node = &dirname[strlen(SYSFS_DT_BASEDIR)]; + + d = opendir(dirname); + if (!d) { + error_report("%s cannot open %s", __func__, dirname); + exit(1); + } + + while ((de = readdir(d)) != NULL) { + char *tmpnam; + + if (!g_strcmp0(de->d_name, ".") + || !g_strcmp0(de->d_name, "..")) { + continue; + } + + tmpnam = g_strdup_printf("%s/%s", dirname, de->d_name); + + if (lstat(tmpnam, &st) < 0) { + error_report("%s cannot lstat %s", __func__, tmpnam); + exit(1); + } + + if (S_ISREG(st.st_mode)) { + gchar *val; + gsize len; + + if (!g_file_get_contents(tmpnam, &val, &len, NULL)) { + error_report("%s not able to extract info from %s", + __func__, tmpnam); + exit(1); + } + + if (strlen(parent_node) > 0) { + qemu_fdt_setprop(fdt, parent_node, + de->d_name, val, len); + } else { + qemu_fdt_setprop(fdt, "/", de->d_name, val, len); + } + g_free(val); + } else if (S_ISDIR(st.st_mode)) { + char *node_name; + + node_name = g_strdup_printf("%s/%s", + parent_node, de->d_name); + qemu_fdt_add_subnode(fdt, node_name); + g_free(node_name); + read_fstree(fdt, tmpnam); + } + + g_free(tmpnam); + } + + closedir(d); +} + +/* load_device_tree_from_sysfs: extract the dt blob from host sysfs */ +void *load_device_tree_from_sysfs(void) +{ + void *host_fdt; + int host_fdt_size; + + host_fdt = create_device_tree(&host_fdt_size); + read_fstree(host_fdt, SYSFS_DT_BASEDIR); + if (fdt_check_header(host_fdt)) { + error_report("%s host device tree extracted into memory is invalid", + __func__); + exit(1); + } + return host_fdt; +} + +#endif /* CONFIG_LINUX */ + +static int findnode_nofail(void *fdt, const char *node_path) +{ + int offset; + + offset = fdt_path_offset(fdt, node_path); + if (offset < 0) { + error_report("%s Couldn't find node %s: %s", __func__, node_path, + fdt_strerror(offset)); + exit(1); + } + + return offset; +} + +char **qemu_fdt_node_unit_path(void *fdt, const char *name, Error **errp) +{ + char *prefix = g_strdup_printf("%s@", name); + unsigned int path_len = 16, n = 0; + GSList *path_list = NULL, *iter; + const char *iter_name; + int offset, len, ret; + char **path_array; + + offset = fdt_next_node(fdt, -1, NULL); + + while (offset >= 0) { + iter_name = fdt_get_name(fdt, offset, &len); + if (!iter_name) { + offset = len; + break; + } + if (!strcmp(iter_name, name) || g_str_has_prefix(iter_name, prefix)) { + char *path; + + path = g_malloc(path_len); + while ((ret = fdt_get_path(fdt, offset, path, path_len)) + == -FDT_ERR_NOSPACE) { + path_len += 16; + path = g_realloc(path, path_len); + } + path_list = g_slist_prepend(path_list, path); + n++; + } + offset = fdt_next_node(fdt, offset, NULL); + } + g_free(prefix); + + if (offset < 0 && offset != -FDT_ERR_NOTFOUND) { + error_setg(errp, "%s: abort parsing dt for %s node units: %s", + __func__, name, fdt_strerror(offset)); + for (iter = path_list; iter; iter = iter->next) { + g_free(iter->data); + } + g_slist_free(path_list); + return NULL; + } + + path_array = g_new(char *, n + 1); + path_array[n--] = NULL; + + for (iter = path_list; iter; iter = iter->next) { + path_array[n--] = iter->data; + } + + g_slist_free(path_list); + + return path_array; +} + +char **qemu_fdt_node_path(void *fdt, const char *name, const char *compat, + Error **errp) +{ + int offset, len, ret; + const char *iter_name; + unsigned int path_len = 16, n = 0; + GSList *path_list = NULL, *iter; + char **path_array; + + offset = fdt_node_offset_by_compatible(fdt, -1, compat); + + while (offset >= 0) { + iter_name = fdt_get_name(fdt, offset, &len); + if (!iter_name) { + offset = len; + break; + } + if (!name || !strcmp(iter_name, name)) { + char *path; + + path = g_malloc(path_len); + while ((ret = fdt_get_path(fdt, offset, path, path_len)) + == -FDT_ERR_NOSPACE) { + path_len += 16; + path = g_realloc(path, path_len); + } + path_list = g_slist_prepend(path_list, path); + n++; + } + offset = fdt_node_offset_by_compatible(fdt, offset, compat); + } + + if (offset < 0 && offset != -FDT_ERR_NOTFOUND) { + error_setg(errp, "%s: abort parsing dt for %s/%s: %s", + __func__, name, compat, fdt_strerror(offset)); + for (iter = path_list; iter; iter = iter->next) { + g_free(iter->data); + } + g_slist_free(path_list); + return NULL; + } + + path_array = g_new(char *, n + 1); + path_array[n--] = NULL; + + for (iter = path_list; iter; iter = iter->next) { + path_array[n--] = iter->data; + } + + g_slist_free(path_list); + + return path_array; +} + +int qemu_fdt_setprop(void *fdt, const char *node_path, + const char *property, const void *val, int size) +{ + int r; + + r = fdt_setprop(fdt, findnode_nofail(fdt, node_path), property, val, size); + if (r < 0) { + error_report("%s: Couldn't set %s/%s: %s", __func__, node_path, + property, fdt_strerror(r)); + exit(1); + } + + return r; +} + +int qemu_fdt_setprop_cell(void *fdt, const char *node_path, + const char *property, uint32_t val) +{ + int r; + + r = fdt_setprop_cell(fdt, findnode_nofail(fdt, node_path), property, val); + if (r < 0) { + error_report("%s: Couldn't set %s/%s = %#08x: %s", __func__, + node_path, property, val, fdt_strerror(r)); + exit(1); + } + + return r; +} + +int qemu_fdt_setprop_u64(void *fdt, const char *node_path, + const char *property, uint64_t val) +{ + val = cpu_to_be64(val); + return qemu_fdt_setprop(fdt, node_path, property, &val, sizeof(val)); +} + +int qemu_fdt_setprop_string(void *fdt, const char *node_path, + const char *property, const char *string) +{ + int r; + + r = fdt_setprop_string(fdt, findnode_nofail(fdt, node_path), property, string); + if (r < 0) { + error_report("%s: Couldn't set %s/%s = %s: %s", __func__, + node_path, property, string, fdt_strerror(r)); + exit(1); + } + + return r; +} + +/* + * libfdt doesn't allow us to add string arrays directly but they are + * test a series of null terminated strings with a length. We build + * the string up here so we can calculate the final length. + */ +int qemu_fdt_setprop_string_array(void *fdt, const char *node_path, + const char *prop, char **array, int len) +{ + int ret, i, total_len = 0; + char *str, *p; + for (i = 0; i < len; i++) { + total_len += strlen(array[i]) + 1; + } + p = str = g_malloc0(total_len); + for (i = 0; i < len; i++) { + int offset = strlen(array[i]) + 1; + pstrcpy(p, offset, array[i]); + p += offset; + } + + ret = qemu_fdt_setprop(fdt, node_path, prop, str, total_len); + g_free(str); + return ret; +} + +const void *qemu_fdt_getprop(void *fdt, const char *node_path, + const char *property, int *lenp, Error **errp) +{ + int len; + const void *r; + + if (!lenp) { + lenp = &len; + } + r = fdt_getprop(fdt, findnode_nofail(fdt, node_path), property, lenp); + if (!r) { + error_setg(errp, "%s: Couldn't get %s/%s: %s", __func__, + node_path, property, fdt_strerror(*lenp)); + } + return r; +} + +uint32_t qemu_fdt_getprop_cell(void *fdt, const char *node_path, + const char *property, int *lenp, Error **errp) +{ + int len; + const uint32_t *p; + + if (!lenp) { + lenp = &len; + } + p = qemu_fdt_getprop(fdt, node_path, property, lenp, errp); + if (!p) { + return 0; + } else if (*lenp != 4) { + error_setg(errp, "%s: %s/%s not 4 bytes long (not a cell?)", + __func__, node_path, property); + *lenp = -EINVAL; + return 0; + } + return be32_to_cpu(*p); +} + +uint32_t qemu_fdt_get_phandle(void *fdt, const char *path) +{ + uint32_t r; + + r = fdt_get_phandle(fdt, findnode_nofail(fdt, path)); + if (r == 0) { + error_report("%s: Couldn't get phandle for %s: %s", __func__, + path, fdt_strerror(r)); + exit(1); + } + + return r; +} + +int qemu_fdt_setprop_phandle(void *fdt, const char *node_path, + const char *property, + const char *target_node_path) +{ + uint32_t phandle = qemu_fdt_get_phandle(fdt, target_node_path); + return qemu_fdt_setprop_cell(fdt, node_path, property, phandle); +} + +uint32_t qemu_fdt_alloc_phandle(void *fdt) +{ + static int phandle = 0x0; + + /* + * We need to find out if the user gave us special instruction at + * which phandle id to start allocating phandles. + */ + if (!phandle) { + phandle = machine_phandle_start(current_machine); + } + + if (!phandle) { + /* + * None or invalid phandle given on the command line, so fall back to + * default starting point. + */ + phandle = 0x8000; + } + + return phandle++; +} + +int qemu_fdt_nop_node(void *fdt, const char *node_path) +{ + int r; + + r = fdt_nop_node(fdt, findnode_nofail(fdt, node_path)); + if (r < 0) { + error_report("%s: Couldn't nop node %s: %s", __func__, node_path, + fdt_strerror(r)); + exit(1); + } + + return r; +} + +int qemu_fdt_add_subnode(void *fdt, const char *name) +{ + char *dupname = g_strdup(name); + char *basename = strrchr(dupname, '/'); + int retval; + int parent = 0; + + if (!basename) { + g_free(dupname); + return -1; + } + + basename[0] = '\0'; + basename++; + + if (dupname[0]) { + parent = findnode_nofail(fdt, dupname); + } + + retval = fdt_add_subnode(fdt, parent, basename); + if (retval < 0) { + error_report("%s: Failed to create subnode %s: %s", + __func__, name, fdt_strerror(retval)); + exit(1); + } + + g_free(dupname); + return retval; +} + +/* + * qemu_fdt_add_path: Like qemu_fdt_add_subnode(), but will add + * all missing subnodes from the given path. + */ +int qemu_fdt_add_path(void *fdt, const char *path) +{ + const char *name; + int namelen, retval; + int parent = 0; + + if (path[0] != '/') { + return -1; + } + + do { + name = path + 1; + path = strchr(name, '/'); + namelen = path != NULL ? path - name : strlen(name); + + retval = fdt_subnode_offset_namelen(fdt, parent, name, namelen); + if (retval < 0 && retval != -FDT_ERR_NOTFOUND) { + error_report("%s: Unexpected error in finding subnode %.*s: %s", + __func__, namelen, name, fdt_strerror(retval)); + exit(1); + } else if (retval == -FDT_ERR_NOTFOUND) { + retval = fdt_add_subnode_namelen(fdt, parent, name, namelen); + if (retval < 0) { + error_report("%s: Failed to create subnode %.*s: %s", + __func__, namelen, name, fdt_strerror(retval)); + exit(1); + } + } + + parent = retval; + } while (path); + + return retval; +} + +void qemu_fdt_dumpdtb(void *fdt, int size) +{ + const char *dumpdtb = current_machine->dumpdtb; + + if (dumpdtb) { + /* Dump the dtb to a file and quit */ + if (g_file_set_contents(dumpdtb, fdt, size, NULL)) { + info_report("dtb dumped to %s. Exiting.", dumpdtb); + exit(0); + } + error_report("%s: Failed dumping dtb to %s", __func__, dumpdtb); + exit(1); + } +} + +int qemu_fdt_setprop_sized_cells_from_array(void *fdt, + const char *node_path, + const char *property, + int numvalues, + uint64_t *values) +{ + uint32_t *propcells; + uint64_t value; + int cellnum, vnum, ncells; + uint32_t hival; + int ret; + + propcells = g_new0(uint32_t, numvalues * 2); + + cellnum = 0; + for (vnum = 0; vnum < numvalues; vnum++) { + ncells = values[vnum * 2]; + if (ncells != 1 && ncells != 2) { + ret = -1; + goto out; + } + value = values[vnum * 2 + 1]; + hival = cpu_to_be32(value >> 32); + if (ncells > 1) { + propcells[cellnum++] = hival; + } else if (hival != 0) { + ret = -1; + goto out; + } + propcells[cellnum++] = cpu_to_be32(value); + } + + ret = qemu_fdt_setprop(fdt, node_path, property, propcells, + cellnum * sizeof(uint32_t)); +out: + g_free(propcells); + return ret; +} + +void qmp_dumpdtb(const char *filename, Error **errp) +{ + g_autoptr(GError) err = NULL; + uint32_t size; + + if (!current_machine->fdt) { + error_setg(errp, "This machine doesn't have a FDT"); + return; + } + + size = fdt_totalsize(current_machine->fdt); + + g_assert(size > 0); + + if (!g_file_set_contents(filename, current_machine->fdt, size, &err)) { + error_setg(errp, "Error saving FDT to file %s: %s", + filename, err->message); + } +} + +void hmp_dumpdtb(Monitor *mon, const QDict *qdict) +{ + const char *filename = qdict_get_str(qdict, "filename"); + Error *local_err = NULL; + + qmp_dumpdtb(filename, &local_err); + + if (hmp_handle_error(mon, local_err)) { + return; + } + + info_report("dtb dumped to %s", filename); +} + +void qemu_fdt_randomize_seeds(void *fdt) +{ + int noffset, poffset, len; + const char *name; + uint8_t *data; + + for (noffset = fdt_next_node(fdt, 0, NULL); + noffset >= 0; + noffset = fdt_next_node(fdt, noffset, NULL)) { + for (poffset = fdt_first_property_offset(fdt, noffset); + poffset >= 0; + poffset = fdt_next_property_offset(fdt, poffset)) { + data = (uint8_t *)fdt_getprop_by_offset(fdt, poffset, &name, &len); + if (!data || strcmp(name, "rng-seed")) + continue; + qemu_guest_getrandom_nofail(data, len); + } + } +} diff --git a/system/dirtylimit.c b/system/dirtylimit.c new file mode 100644 index 0000000000..fa959d7743 --- /dev/null +++ b/system/dirtylimit.c @@ -0,0 +1,678 @@ +/* + * Dirty page rate limit implementation code + * + * Copyright (c) 2022 CHINA TELECOM CO.,LTD. + * + * Authors: + * Hyman Huang(黄勇) <huangy81@chinatelecom.cn> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "qemu/main-loop.h" +#include "qapi/qapi-commands-migration.h" +#include "qapi/qmp/qdict.h" +#include "qapi/error.h" +#include "sysemu/dirtyrate.h" +#include "sysemu/dirtylimit.h" +#include "monitor/hmp.h" +#include "monitor/monitor.h" +#include "exec/memory.h" +#include "exec/target_page.h" +#include "hw/boards.h" +#include "sysemu/kvm.h" +#include "trace.h" +#include "migration/misc.h" +#include "migration/migration.h" +#include "migration/options.h" + +/* + * Dirtylimit stop working if dirty page rate error + * value less than DIRTYLIMIT_TOLERANCE_RANGE + */ +#define DIRTYLIMIT_TOLERANCE_RANGE 25 /* MB/s */ +/* + * Plus or minus vcpu sleep time linearly if dirty + * page rate error value percentage over + * DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT. + * Otherwise, plus or minus a fixed vcpu sleep time. + */ +#define DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT 50 +/* + * Max vcpu sleep time percentage during a cycle + * composed of dirty ring full and sleep time. + */ +#define DIRTYLIMIT_THROTTLE_PCT_MAX 99 + +struct { + VcpuStat stat; + bool running; + QemuThread thread; +} *vcpu_dirty_rate_stat; + +typedef struct VcpuDirtyLimitState { + int cpu_index; + bool enabled; + /* + * Quota dirty page rate, unit is MB/s + * zero if not enabled. + */ + uint64_t quota; +} VcpuDirtyLimitState; + +struct { + VcpuDirtyLimitState *states; + /* Max cpus number configured by user */ + int max_cpus; + /* Number of vcpu under dirtylimit */ + int limited_nvcpu; +} *dirtylimit_state; + +/* protect dirtylimit_state */ +static QemuMutex dirtylimit_mutex; + +/* dirtylimit thread quit if dirtylimit_quit is true */ +static bool dirtylimit_quit; + +static void vcpu_dirty_rate_stat_collect(void) +{ + MigrationState *s = migrate_get_current(); + VcpuStat stat; + int i = 0; + int64_t period = DIRTYLIMIT_CALC_TIME_MS; + + if (migrate_dirty_limit() && + migration_is_active(s)) { + period = s->parameters.x_vcpu_dirty_limit_period; + } + + /* calculate vcpu dirtyrate */ + vcpu_calculate_dirtyrate(period, + &stat, + GLOBAL_DIRTY_LIMIT, + false); + + for (i = 0; i < stat.nvcpu; i++) { + vcpu_dirty_rate_stat->stat.rates[i].id = i; + vcpu_dirty_rate_stat->stat.rates[i].dirty_rate = + stat.rates[i].dirty_rate; + } + + g_free(stat.rates); +} + +static void *vcpu_dirty_rate_stat_thread(void *opaque) +{ + rcu_register_thread(); + + /* start log sync */ + global_dirty_log_change(GLOBAL_DIRTY_LIMIT, true); + + while (qatomic_read(&vcpu_dirty_rate_stat->running)) { + vcpu_dirty_rate_stat_collect(); + if (dirtylimit_in_service()) { + dirtylimit_process(); + } + } + + /* stop log sync */ + global_dirty_log_change(GLOBAL_DIRTY_LIMIT, false); + + rcu_unregister_thread(); + return NULL; +} + +int64_t vcpu_dirty_rate_get(int cpu_index) +{ + DirtyRateVcpu *rates = vcpu_dirty_rate_stat->stat.rates; + return qatomic_read_i64(&rates[cpu_index].dirty_rate); +} + +void vcpu_dirty_rate_stat_start(void) +{ + if (qatomic_read(&vcpu_dirty_rate_stat->running)) { + return; + } + + qatomic_set(&vcpu_dirty_rate_stat->running, 1); + qemu_thread_create(&vcpu_dirty_rate_stat->thread, + "dirtyrate-stat", + vcpu_dirty_rate_stat_thread, + NULL, + QEMU_THREAD_JOINABLE); +} + +void vcpu_dirty_rate_stat_stop(void) +{ + qatomic_set(&vcpu_dirty_rate_stat->running, 0); + dirtylimit_state_unlock(); + qemu_mutex_unlock_iothread(); + qemu_thread_join(&vcpu_dirty_rate_stat->thread); + qemu_mutex_lock_iothread(); + dirtylimit_state_lock(); +} + +void vcpu_dirty_rate_stat_initialize(void) +{ + MachineState *ms = MACHINE(qdev_get_machine()); + int max_cpus = ms->smp.max_cpus; + + vcpu_dirty_rate_stat = + g_malloc0(sizeof(*vcpu_dirty_rate_stat)); + + vcpu_dirty_rate_stat->stat.nvcpu = max_cpus; + vcpu_dirty_rate_stat->stat.rates = + g_new0(DirtyRateVcpu, max_cpus); + + vcpu_dirty_rate_stat->running = false; +} + +void vcpu_dirty_rate_stat_finalize(void) +{ + g_free(vcpu_dirty_rate_stat->stat.rates); + vcpu_dirty_rate_stat->stat.rates = NULL; + + g_free(vcpu_dirty_rate_stat); + vcpu_dirty_rate_stat = NULL; +} + +void dirtylimit_state_lock(void) +{ + qemu_mutex_lock(&dirtylimit_mutex); +} + +void dirtylimit_state_unlock(void) +{ + qemu_mutex_unlock(&dirtylimit_mutex); +} + +static void +__attribute__((__constructor__)) dirtylimit_mutex_init(void) +{ + qemu_mutex_init(&dirtylimit_mutex); +} + +static inline VcpuDirtyLimitState *dirtylimit_vcpu_get_state(int cpu_index) +{ + return &dirtylimit_state->states[cpu_index]; +} + +void dirtylimit_state_initialize(void) +{ + MachineState *ms = MACHINE(qdev_get_machine()); + int max_cpus = ms->smp.max_cpus; + int i; + + dirtylimit_state = g_malloc0(sizeof(*dirtylimit_state)); + + dirtylimit_state->states = + g_new0(VcpuDirtyLimitState, max_cpus); + + for (i = 0; i < max_cpus; i++) { + dirtylimit_state->states[i].cpu_index = i; + } + + dirtylimit_state->max_cpus = max_cpus; + trace_dirtylimit_state_initialize(max_cpus); +} + +void dirtylimit_state_finalize(void) +{ + g_free(dirtylimit_state->states); + dirtylimit_state->states = NULL; + + g_free(dirtylimit_state); + dirtylimit_state = NULL; + + trace_dirtylimit_state_finalize(); +} + +bool dirtylimit_in_service(void) +{ + return !!dirtylimit_state; +} + +bool dirtylimit_vcpu_index_valid(int cpu_index) +{ + MachineState *ms = MACHINE(qdev_get_machine()); + + return !(cpu_index < 0 || + cpu_index >= ms->smp.max_cpus); +} + +static uint64_t dirtylimit_dirty_ring_full_time(uint64_t dirtyrate) +{ + static uint64_t max_dirtyrate; + uint64_t dirty_ring_size_MiB; + + dirty_ring_size_MiB = qemu_target_pages_to_MiB(kvm_dirty_ring_size()); + + if (max_dirtyrate < dirtyrate) { + max_dirtyrate = dirtyrate; + } + + return dirty_ring_size_MiB * 1000000 / max_dirtyrate; +} + +static inline bool dirtylimit_done(uint64_t quota, + uint64_t current) +{ + uint64_t min, max; + + min = MIN(quota, current); + max = MAX(quota, current); + + return ((max - min) <= DIRTYLIMIT_TOLERANCE_RANGE) ? true : false; +} + +static inline bool +dirtylimit_need_linear_adjustment(uint64_t quota, + uint64_t current) +{ + uint64_t min, max; + + min = MIN(quota, current); + max = MAX(quota, current); + + return ((max - min) * 100 / max) > DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT; +} + +static void dirtylimit_set_throttle(CPUState *cpu, + uint64_t quota, + uint64_t current) +{ + int64_t ring_full_time_us = 0; + uint64_t sleep_pct = 0; + uint64_t throttle_us = 0; + + if (current == 0) { + cpu->throttle_us_per_full = 0; + return; + } + + ring_full_time_us = dirtylimit_dirty_ring_full_time(current); + + if (dirtylimit_need_linear_adjustment(quota, current)) { + if (quota < current) { + sleep_pct = (current - quota) * 100 / current; + throttle_us = + ring_full_time_us * sleep_pct / (double)(100 - sleep_pct); + cpu->throttle_us_per_full += throttle_us; + } else { + sleep_pct = (quota - current) * 100 / quota; + throttle_us = + ring_full_time_us * sleep_pct / (double)(100 - sleep_pct); + cpu->throttle_us_per_full -= throttle_us; + } + + trace_dirtylimit_throttle_pct(cpu->cpu_index, + sleep_pct, + throttle_us); + } else { + if (quota < current) { + cpu->throttle_us_per_full += ring_full_time_us / 10; + } else { + cpu->throttle_us_per_full -= ring_full_time_us / 10; + } + } + + /* + * TODO: in the big kvm_dirty_ring_size case (eg: 65536, or other scenario), + * current dirty page rate may never reach the quota, we should stop + * increasing sleep time? + */ + cpu->throttle_us_per_full = MIN(cpu->throttle_us_per_full, + ring_full_time_us * DIRTYLIMIT_THROTTLE_PCT_MAX); + + cpu->throttle_us_per_full = MAX(cpu->throttle_us_per_full, 0); +} + +static void dirtylimit_adjust_throttle(CPUState *cpu) +{ + uint64_t quota = 0; + uint64_t current = 0; + int cpu_index = cpu->cpu_index; + + quota = dirtylimit_vcpu_get_state(cpu_index)->quota; + current = vcpu_dirty_rate_get(cpu_index); + + if (!dirtylimit_done(quota, current)) { + dirtylimit_set_throttle(cpu, quota, current); + } + + return; +} + +void dirtylimit_process(void) +{ + CPUState *cpu; + + if (!qatomic_read(&dirtylimit_quit)) { + dirtylimit_state_lock(); + + if (!dirtylimit_in_service()) { + dirtylimit_state_unlock(); + return; + } + + CPU_FOREACH(cpu) { + if (!dirtylimit_vcpu_get_state(cpu->cpu_index)->enabled) { + continue; + } + dirtylimit_adjust_throttle(cpu); + } + dirtylimit_state_unlock(); + } +} + +void dirtylimit_change(bool start) +{ + if (start) { + qatomic_set(&dirtylimit_quit, 0); + } else { + qatomic_set(&dirtylimit_quit, 1); + } +} + +void dirtylimit_set_vcpu(int cpu_index, + uint64_t quota, + bool enable) +{ + trace_dirtylimit_set_vcpu(cpu_index, quota); + + if (enable) { + dirtylimit_state->states[cpu_index].quota = quota; + if (!dirtylimit_vcpu_get_state(cpu_index)->enabled) { + dirtylimit_state->limited_nvcpu++; + } + } else { + dirtylimit_state->states[cpu_index].quota = 0; + if (dirtylimit_state->states[cpu_index].enabled) { + dirtylimit_state->limited_nvcpu--; + } + } + + dirtylimit_state->states[cpu_index].enabled = enable; +} + +void dirtylimit_set_all(uint64_t quota, + bool enable) +{ + MachineState *ms = MACHINE(qdev_get_machine()); + int max_cpus = ms->smp.max_cpus; + int i; + + for (i = 0; i < max_cpus; i++) { + dirtylimit_set_vcpu(i, quota, enable); + } +} + +void dirtylimit_vcpu_execute(CPUState *cpu) +{ + if (dirtylimit_in_service() && + dirtylimit_vcpu_get_state(cpu->cpu_index)->enabled && + cpu->throttle_us_per_full) { + trace_dirtylimit_vcpu_execute(cpu->cpu_index, + cpu->throttle_us_per_full); + usleep(cpu->throttle_us_per_full); + } +} + +static void dirtylimit_init(void) +{ + dirtylimit_state_initialize(); + dirtylimit_change(true); + vcpu_dirty_rate_stat_initialize(); + vcpu_dirty_rate_stat_start(); +} + +static void dirtylimit_cleanup(void) +{ + vcpu_dirty_rate_stat_stop(); + vcpu_dirty_rate_stat_finalize(); + dirtylimit_change(false); + dirtylimit_state_finalize(); +} + +/* + * dirty page rate limit is not allowed to set if migration + * is running with dirty-limit capability enabled. + */ +static bool dirtylimit_is_allowed(void) +{ + MigrationState *ms = migrate_get_current(); + + if (migration_is_running(ms->state) && + (!qemu_thread_is_self(&ms->thread)) && + migrate_dirty_limit() && + dirtylimit_in_service()) { + return false; + } + return true; +} + +void qmp_cancel_vcpu_dirty_limit(bool has_cpu_index, + int64_t cpu_index, + Error **errp) +{ + if (!kvm_enabled() || !kvm_dirty_ring_enabled()) { + return; + } + + if (has_cpu_index && !dirtylimit_vcpu_index_valid(cpu_index)) { + error_setg(errp, "incorrect cpu index specified"); + return; + } + + if (!dirtylimit_is_allowed()) { + error_setg(errp, "can't cancel dirty page rate limit while" + " migration is running"); + return; + } + + if (!dirtylimit_in_service()) { + return; + } + + dirtylimit_state_lock(); + + if (has_cpu_index) { + dirtylimit_set_vcpu(cpu_index, 0, false); + } else { + dirtylimit_set_all(0, false); + } + + if (!dirtylimit_state->limited_nvcpu) { + dirtylimit_cleanup(); + } + + dirtylimit_state_unlock(); +} + +void hmp_cancel_vcpu_dirty_limit(Monitor *mon, const QDict *qdict) +{ + int64_t cpu_index = qdict_get_try_int(qdict, "cpu_index", -1); + Error *err = NULL; + + qmp_cancel_vcpu_dirty_limit(!!(cpu_index != -1), cpu_index, &err); + if (err) { + hmp_handle_error(mon, err); + return; + } + + monitor_printf(mon, "[Please use 'info vcpu_dirty_limit' to query " + "dirty limit for virtual CPU]\n"); +} + +void qmp_set_vcpu_dirty_limit(bool has_cpu_index, + int64_t cpu_index, + uint64_t dirty_rate, + Error **errp) +{ + if (!kvm_enabled() || !kvm_dirty_ring_enabled()) { + error_setg(errp, "dirty page limit feature requires KVM with" + " accelerator property 'dirty-ring-size' set'"); + return; + } + + if (has_cpu_index && !dirtylimit_vcpu_index_valid(cpu_index)) { + error_setg(errp, "incorrect cpu index specified"); + return; + } + + if (!dirtylimit_is_allowed()) { + error_setg(errp, "can't set dirty page rate limit while" + " migration is running"); + return; + } + + if (!dirty_rate) { + qmp_cancel_vcpu_dirty_limit(has_cpu_index, cpu_index, errp); + return; + } + + dirtylimit_state_lock(); + + if (!dirtylimit_in_service()) { + dirtylimit_init(); + } + + if (has_cpu_index) { + dirtylimit_set_vcpu(cpu_index, dirty_rate, true); + } else { + dirtylimit_set_all(dirty_rate, true); + } + + dirtylimit_state_unlock(); +} + +void hmp_set_vcpu_dirty_limit(Monitor *mon, const QDict *qdict) +{ + int64_t dirty_rate = qdict_get_int(qdict, "dirty_rate"); + int64_t cpu_index = qdict_get_try_int(qdict, "cpu_index", -1); + Error *err = NULL; + + if (dirty_rate < 0) { + error_setg(&err, "invalid dirty page limit %" PRId64, dirty_rate); + goto out; + } + + qmp_set_vcpu_dirty_limit(!!(cpu_index != -1), cpu_index, dirty_rate, &err); + +out: + hmp_handle_error(mon, err); +} + +/* Return the max throttle time of each virtual CPU */ +uint64_t dirtylimit_throttle_time_per_round(void) +{ + CPUState *cpu; + int64_t max = 0; + + CPU_FOREACH(cpu) { + if (cpu->throttle_us_per_full > max) { + max = cpu->throttle_us_per_full; + } + } + + return max; +} + +/* + * Estimate average dirty ring full time of each virtaul CPU. + * Return 0 if guest doesn't dirty memory. + */ +uint64_t dirtylimit_ring_full_time(void) +{ + CPUState *cpu; + uint64_t curr_rate = 0; + int nvcpus = 0; + + CPU_FOREACH(cpu) { + if (cpu->running) { + nvcpus++; + curr_rate += vcpu_dirty_rate_get(cpu->cpu_index); + } + } + + if (!curr_rate || !nvcpus) { + return 0; + } + + return dirtylimit_dirty_ring_full_time(curr_rate / nvcpus); +} + +static struct DirtyLimitInfo *dirtylimit_query_vcpu(int cpu_index) +{ + DirtyLimitInfo *info = NULL; + + info = g_malloc0(sizeof(*info)); + info->cpu_index = cpu_index; + info->limit_rate = dirtylimit_vcpu_get_state(cpu_index)->quota; + info->current_rate = vcpu_dirty_rate_get(cpu_index); + + return info; +} + +static struct DirtyLimitInfoList *dirtylimit_query_all(void) +{ + int i, index; + DirtyLimitInfo *info = NULL; + DirtyLimitInfoList *head = NULL, **tail = &head; + + dirtylimit_state_lock(); + + if (!dirtylimit_in_service()) { + dirtylimit_state_unlock(); + return NULL; + } + + for (i = 0; i < dirtylimit_state->max_cpus; i++) { + index = dirtylimit_state->states[i].cpu_index; + if (dirtylimit_vcpu_get_state(index)->enabled) { + info = dirtylimit_query_vcpu(index); + QAPI_LIST_APPEND(tail, info); + } + } + + dirtylimit_state_unlock(); + + return head; +} + +struct DirtyLimitInfoList *qmp_query_vcpu_dirty_limit(Error **errp) +{ + if (!dirtylimit_in_service()) { + return NULL; + } + + return dirtylimit_query_all(); +} + +void hmp_info_vcpu_dirty_limit(Monitor *mon, const QDict *qdict) +{ + DirtyLimitInfoList *info; + g_autoptr(DirtyLimitInfoList) head = NULL; + Error *err = NULL; + + if (!dirtylimit_in_service()) { + monitor_printf(mon, "Dirty page limit not enabled!\n"); + return; + } + + head = qmp_query_vcpu_dirty_limit(&err); + if (err) { + hmp_handle_error(mon, err); + return; + } + + for (info = head; info != NULL; info = info->next) { + monitor_printf(mon, "vcpu[%"PRIi64"], limit rate %"PRIi64 " (MB/s)," + " current rate %"PRIi64 " (MB/s)\n", + info->value->cpu_index, + info->value->limit_rate, + info->value->current_rate); + } +} diff --git a/system/dma-helpers.c b/system/dma-helpers.c new file mode 100644 index 0000000000..36211acc7e --- /dev/null +++ b/system/dma-helpers.c @@ -0,0 +1,347 @@ +/* + * DMA helper functions + * + * Copyright (c) 2009,2020 Red Hat + * + * This work is licensed under the terms of the GNU General Public License + * (GNU GPL), version 2 or later. + */ + +#include "qemu/osdep.h" +#include "sysemu/block-backend.h" +#include "sysemu/dma.h" +#include "trace/trace-root.h" +#include "qemu/thread.h" +#include "qemu/main-loop.h" +#include "sysemu/cpu-timers.h" +#include "qemu/range.h" + +/* #define DEBUG_IOMMU */ + +MemTxResult dma_memory_set(AddressSpace *as, dma_addr_t addr, + uint8_t c, dma_addr_t len, MemTxAttrs attrs) +{ + dma_barrier(as, DMA_DIRECTION_FROM_DEVICE); + + return address_space_set(as, addr, c, len, attrs); +} + +void qemu_sglist_init(QEMUSGList *qsg, DeviceState *dev, int alloc_hint, + AddressSpace *as) +{ + qsg->sg = g_new(ScatterGatherEntry, alloc_hint); + qsg->nsg = 0; + qsg->nalloc = alloc_hint; + qsg->size = 0; + qsg->as = as; + qsg->dev = dev; + object_ref(OBJECT(dev)); +} + +void qemu_sglist_add(QEMUSGList *qsg, dma_addr_t base, dma_addr_t len) +{ + if (qsg->nsg == qsg->nalloc) { + qsg->nalloc = 2 * qsg->nalloc + 1; + qsg->sg = g_renew(ScatterGatherEntry, qsg->sg, qsg->nalloc); + } + qsg->sg[qsg->nsg].base = base; + qsg->sg[qsg->nsg].len = len; + qsg->size += len; + ++qsg->nsg; +} + +void qemu_sglist_destroy(QEMUSGList *qsg) +{ + object_unref(OBJECT(qsg->dev)); + g_free(qsg->sg); + memset(qsg, 0, sizeof(*qsg)); +} + +typedef struct { + BlockAIOCB common; + AioContext *ctx; + BlockAIOCB *acb; + QEMUSGList *sg; + uint32_t align; + uint64_t offset; + DMADirection dir; + int sg_cur_index; + dma_addr_t sg_cur_byte; + QEMUIOVector iov; + QEMUBH *bh; + DMAIOFunc *io_func; + void *io_func_opaque; +} DMAAIOCB; + +static void dma_blk_cb(void *opaque, int ret); + +static void reschedule_dma(void *opaque) +{ + DMAAIOCB *dbs = (DMAAIOCB *)opaque; + + assert(!dbs->acb && dbs->bh); + qemu_bh_delete(dbs->bh); + dbs->bh = NULL; + dma_blk_cb(dbs, 0); +} + +static void dma_blk_unmap(DMAAIOCB *dbs) +{ + int i; + + for (i = 0; i < dbs->iov.niov; ++i) { + dma_memory_unmap(dbs->sg->as, dbs->iov.iov[i].iov_base, + dbs->iov.iov[i].iov_len, dbs->dir, + dbs->iov.iov[i].iov_len); + } + qemu_iovec_reset(&dbs->iov); +} + +static void dma_complete(DMAAIOCB *dbs, int ret) +{ + trace_dma_complete(dbs, ret, dbs->common.cb); + + assert(!dbs->acb && !dbs->bh); + dma_blk_unmap(dbs); + if (dbs->common.cb) { + dbs->common.cb(dbs->common.opaque, ret); + } + qemu_iovec_destroy(&dbs->iov); + qemu_aio_unref(dbs); +} + +static void dma_blk_cb(void *opaque, int ret) +{ + DMAAIOCB *dbs = (DMAAIOCB *)opaque; + AioContext *ctx = dbs->ctx; + dma_addr_t cur_addr, cur_len; + void *mem; + + trace_dma_blk_cb(dbs, ret); + + aio_context_acquire(ctx); + dbs->acb = NULL; + dbs->offset += dbs->iov.size; + + if (dbs->sg_cur_index == dbs->sg->nsg || ret < 0) { + dma_complete(dbs, ret); + goto out; + } + dma_blk_unmap(dbs); + + while (dbs->sg_cur_index < dbs->sg->nsg) { + cur_addr = dbs->sg->sg[dbs->sg_cur_index].base + dbs->sg_cur_byte; + cur_len = dbs->sg->sg[dbs->sg_cur_index].len - dbs->sg_cur_byte; + mem = dma_memory_map(dbs->sg->as, cur_addr, &cur_len, dbs->dir, + MEMTXATTRS_UNSPECIFIED); + /* + * Make reads deterministic in icount mode. Windows sometimes issues + * disk read requests with overlapping SGs. It leads + * to non-determinism, because resulting buffer contents may be mixed + * from several sectors. This code splits all SGs into several + * groups. SGs in every group do not overlap. + */ + if (mem && icount_enabled() && dbs->dir == DMA_DIRECTION_FROM_DEVICE) { + int i; + for (i = 0 ; i < dbs->iov.niov ; ++i) { + if (ranges_overlap((intptr_t)dbs->iov.iov[i].iov_base, + dbs->iov.iov[i].iov_len, (intptr_t)mem, + cur_len)) { + dma_memory_unmap(dbs->sg->as, mem, cur_len, + dbs->dir, cur_len); + mem = NULL; + break; + } + } + } + if (!mem) + break; + qemu_iovec_add(&dbs->iov, mem, cur_len); + dbs->sg_cur_byte += cur_len; + if (dbs->sg_cur_byte == dbs->sg->sg[dbs->sg_cur_index].len) { + dbs->sg_cur_byte = 0; + ++dbs->sg_cur_index; + } + } + + if (dbs->iov.size == 0) { + trace_dma_map_wait(dbs); + dbs->bh = aio_bh_new(ctx, reschedule_dma, dbs); + cpu_register_map_client(dbs->bh); + goto out; + } + + if (!QEMU_IS_ALIGNED(dbs->iov.size, dbs->align)) { + qemu_iovec_discard_back(&dbs->iov, + QEMU_ALIGN_DOWN(dbs->iov.size, dbs->align)); + } + + dbs->acb = dbs->io_func(dbs->offset, &dbs->iov, + dma_blk_cb, dbs, dbs->io_func_opaque); + assert(dbs->acb); +out: + aio_context_release(ctx); +} + +static void dma_aio_cancel(BlockAIOCB *acb) +{ + DMAAIOCB *dbs = container_of(acb, DMAAIOCB, common); + + trace_dma_aio_cancel(dbs); + + assert(!(dbs->acb && dbs->bh)); + if (dbs->acb) { + /* This will invoke dma_blk_cb. */ + blk_aio_cancel_async(dbs->acb); + return; + } + + if (dbs->bh) { + cpu_unregister_map_client(dbs->bh); + qemu_bh_delete(dbs->bh); + dbs->bh = NULL; + } + if (dbs->common.cb) { + dbs->common.cb(dbs->common.opaque, -ECANCELED); + } +} + +static const AIOCBInfo dma_aiocb_info = { + .aiocb_size = sizeof(DMAAIOCB), + .cancel_async = dma_aio_cancel, +}; + +BlockAIOCB *dma_blk_io(AioContext *ctx, + QEMUSGList *sg, uint64_t offset, uint32_t align, + DMAIOFunc *io_func, void *io_func_opaque, + BlockCompletionFunc *cb, + void *opaque, DMADirection dir) +{ + DMAAIOCB *dbs = qemu_aio_get(&dma_aiocb_info, NULL, cb, opaque); + + trace_dma_blk_io(dbs, io_func_opaque, offset, (dir == DMA_DIRECTION_TO_DEVICE)); + + dbs->acb = NULL; + dbs->sg = sg; + dbs->ctx = ctx; + dbs->offset = offset; + dbs->align = align; + dbs->sg_cur_index = 0; + dbs->sg_cur_byte = 0; + dbs->dir = dir; + dbs->io_func = io_func; + dbs->io_func_opaque = io_func_opaque; + dbs->bh = NULL; + qemu_iovec_init(&dbs->iov, sg->nsg); + dma_blk_cb(dbs, 0); + return &dbs->common; +} + + +static +BlockAIOCB *dma_blk_read_io_func(int64_t offset, QEMUIOVector *iov, + BlockCompletionFunc *cb, void *cb_opaque, + void *opaque) +{ + BlockBackend *blk = opaque; + return blk_aio_preadv(blk, offset, iov, 0, cb, cb_opaque); +} + +BlockAIOCB *dma_blk_read(BlockBackend *blk, + QEMUSGList *sg, uint64_t offset, uint32_t align, + void (*cb)(void *opaque, int ret), void *opaque) +{ + return dma_blk_io(blk_get_aio_context(blk), sg, offset, align, + dma_blk_read_io_func, blk, cb, opaque, + DMA_DIRECTION_FROM_DEVICE); +} + +static +BlockAIOCB *dma_blk_write_io_func(int64_t offset, QEMUIOVector *iov, + BlockCompletionFunc *cb, void *cb_opaque, + void *opaque) +{ + BlockBackend *blk = opaque; + return blk_aio_pwritev(blk, offset, iov, 0, cb, cb_opaque); +} + +BlockAIOCB *dma_blk_write(BlockBackend *blk, + QEMUSGList *sg, uint64_t offset, uint32_t align, + void (*cb)(void *opaque, int ret), void *opaque) +{ + return dma_blk_io(blk_get_aio_context(blk), sg, offset, align, + dma_blk_write_io_func, blk, cb, opaque, + DMA_DIRECTION_TO_DEVICE); +} + + +static MemTxResult dma_buf_rw(void *buf, dma_addr_t len, dma_addr_t *residual, + QEMUSGList *sg, DMADirection dir, + MemTxAttrs attrs) +{ + uint8_t *ptr = buf; + dma_addr_t xresidual; + int sg_cur_index; + MemTxResult res = MEMTX_OK; + + xresidual = sg->size; + sg_cur_index = 0; + len = MIN(len, xresidual); + while (len > 0) { + ScatterGatherEntry entry = sg->sg[sg_cur_index++]; + dma_addr_t xfer = MIN(len, entry.len); + res |= dma_memory_rw(sg->as, entry.base, ptr, xfer, dir, attrs); + ptr += xfer; + len -= xfer; + xresidual -= xfer; + } + + if (residual) { + *residual = xresidual; + } + return res; +} + +MemTxResult dma_buf_read(void *ptr, dma_addr_t len, dma_addr_t *residual, + QEMUSGList *sg, MemTxAttrs attrs) +{ + return dma_buf_rw(ptr, len, residual, sg, DMA_DIRECTION_FROM_DEVICE, attrs); +} + +MemTxResult dma_buf_write(void *ptr, dma_addr_t len, dma_addr_t *residual, + QEMUSGList *sg, MemTxAttrs attrs) +{ + return dma_buf_rw(ptr, len, residual, sg, DMA_DIRECTION_TO_DEVICE, attrs); +} + +void dma_acct_start(BlockBackend *blk, BlockAcctCookie *cookie, + QEMUSGList *sg, enum BlockAcctType type) +{ + block_acct_start(blk_get_stats(blk), cookie, sg->size, type); +} + +uint64_t dma_aligned_pow2_mask(uint64_t start, uint64_t end, int max_addr_bits) +{ + uint64_t max_mask = UINT64_MAX, addr_mask = end - start; + uint64_t alignment_mask, size_mask; + + if (max_addr_bits != 64) { + max_mask = (1ULL << max_addr_bits) - 1; + } + + alignment_mask = start ? (start & -start) - 1 : max_mask; + alignment_mask = MIN(alignment_mask, max_mask); + size_mask = MIN(addr_mask, max_mask); + + if (alignment_mask <= size_mask) { + /* Increase the alignment of start */ + return alignment_mask; + } else { + /* Find the largest page mask from size */ + if (addr_mask == UINT64_MAX) { + return UINT64_MAX; + } + return (1ULL << (63 - clz64(addr_mask + 1))) - 1; + } +} + diff --git a/system/globals.c b/system/globals.c new file mode 100644 index 0000000000..e83b5428d1 --- /dev/null +++ b/system/globals.c @@ -0,0 +1,70 @@ +/* + * Global variables that (mostly) should not exist + * + * Copyright (c) 2003-2020 QEMU contributors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "qemu/osdep.h" +#include "exec/cpu-common.h" +#include "hw/display/vga.h" +#include "hw/loader.h" +#include "hw/xen/xen.h" +#include "net/net.h" +#include "sysemu/cpus.h" +#include "sysemu/sysemu.h" + +enum vga_retrace_method vga_retrace_method = VGA_RETRACE_DUMB; +int display_opengl; +const char* keyboard_layout; +bool enable_mlock; +bool enable_cpu_pm; +int nb_nics; +NICInfo nd_table[MAX_NICS]; +int autostart = 1; +int vga_interface_type = VGA_NONE; +bool vga_interface_created; +Chardev *parallel_hds[MAX_PARALLEL_PORTS]; +int win2k_install_hack; +int fd_bootchk = 1; +int graphic_rotate; +QEMUOptionRom option_rom[MAX_OPTION_ROMS]; +int nb_option_roms; +int old_param; +const char *qemu_name; +unsigned int nb_prom_envs; +const char *prom_envs[MAX_PROM_ENVS]; +uint8_t *boot_splash_filedata; +int only_migratable; /* turn it off unless user states otherwise */ +int icount_align_option; + +/* The bytes in qemu_uuid are in the order specified by RFC4122, _not_ in the + * little-endian "wire format" described in the SMBIOS 2.6 specification. + */ +QemuUUID qemu_uuid; +bool qemu_uuid_set; + +uint32_t xen_domid; +enum xen_mode xen_mode = XEN_DISABLED; +bool xen_domid_restrict; +struct evtchn_backend_ops *xen_evtchn_ops; +struct gnttab_backend_ops *xen_gnttab_ops; +struct foreignmem_backend_ops *xen_foreignmem_ops; +struct xenstore_backend_ops *xen_xenstore_ops; diff --git a/system/ioport.c b/system/ioport.c new file mode 100644 index 0000000000..1824aa808c --- /dev/null +++ b/system/ioport.c @@ -0,0 +1,346 @@ +/* + * QEMU System Emulator + * + * Copyright (c) 2003-2008 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +/* + * split out ioport related stuffs from vl.c. + */ + +#include "qemu/osdep.h" +#include "cpu.h" +#include "exec/ioport.h" +#include "exec/memory.h" +#include "exec/address-spaces.h" +#include "trace.h" + +struct MemoryRegionPortioList { + Object obj; + + MemoryRegion mr; + void *portio_opaque; + MemoryRegionPortio *ports; +}; + +#define TYPE_MEMORY_REGION_PORTIO_LIST "memory-region-portio-list" +OBJECT_DECLARE_SIMPLE_TYPE(MemoryRegionPortioList, MEMORY_REGION_PORTIO_LIST) + +static uint64_t unassigned_io_read(void *opaque, hwaddr addr, unsigned size) +{ + return -1ULL; +} + +static void unassigned_io_write(void *opaque, hwaddr addr, uint64_t val, + unsigned size) +{ +} + +const MemoryRegionOps unassigned_io_ops = { + .read = unassigned_io_read, + .write = unassigned_io_write, + .endianness = DEVICE_NATIVE_ENDIAN, +}; + +void cpu_outb(uint32_t addr, uint8_t val) +{ + trace_cpu_out(addr, 'b', val); + address_space_write(&address_space_io, addr, MEMTXATTRS_UNSPECIFIED, + &val, 1); +} + +void cpu_outw(uint32_t addr, uint16_t val) +{ + uint8_t buf[2]; + + trace_cpu_out(addr, 'w', val); + stw_p(buf, val); + address_space_write(&address_space_io, addr, MEMTXATTRS_UNSPECIFIED, + buf, 2); +} + +void cpu_outl(uint32_t addr, uint32_t val) +{ + uint8_t buf[4]; + + trace_cpu_out(addr, 'l', val); + stl_p(buf, val); + address_space_write(&address_space_io, addr, MEMTXATTRS_UNSPECIFIED, + buf, 4); +} + +uint8_t cpu_inb(uint32_t addr) +{ + uint8_t val; + + address_space_read(&address_space_io, addr, MEMTXATTRS_UNSPECIFIED, + &val, 1); + trace_cpu_in(addr, 'b', val); + return val; +} + +uint16_t cpu_inw(uint32_t addr) +{ + uint8_t buf[2]; + uint16_t val; + + address_space_read(&address_space_io, addr, MEMTXATTRS_UNSPECIFIED, buf, 2); + val = lduw_p(buf); + trace_cpu_in(addr, 'w', val); + return val; +} + +uint32_t cpu_inl(uint32_t addr) +{ + uint8_t buf[4]; + uint32_t val; + + address_space_read(&address_space_io, addr, MEMTXATTRS_UNSPECIFIED, buf, 4); + val = ldl_p(buf); + trace_cpu_in(addr, 'l', val); + return val; +} + +void portio_list_init(PortioList *piolist, + Object *owner, + const MemoryRegionPortio *callbacks, + void *opaque, const char *name) +{ + unsigned n = 0; + + while (callbacks[n].size) { + ++n; + } + + piolist->ports = callbacks; + piolist->nr = 0; + piolist->regions = g_new0(MemoryRegion *, n); + piolist->address_space = NULL; + piolist->opaque = opaque; + piolist->owner = owner; + piolist->name = name; + piolist->flush_coalesced_mmio = false; +} + +void portio_list_set_flush_coalesced(PortioList *piolist) +{ + piolist->flush_coalesced_mmio = true; +} + +void portio_list_destroy(PortioList *piolist) +{ + MemoryRegionPortioList *mrpio; + unsigned i; + + for (i = 0; i < piolist->nr; ++i) { + mrpio = container_of(piolist->regions[i], MemoryRegionPortioList, mr); + object_unparent(OBJECT(&mrpio->mr)); + object_unref(mrpio); + } + g_free(piolist->regions); +} + +static const MemoryRegionPortio *find_portio(MemoryRegionPortioList *mrpio, + uint64_t offset, unsigned size, + bool write) +{ + const MemoryRegionPortio *mrp; + + for (mrp = mrpio->ports; mrp->size; ++mrp) { + if (offset >= mrp->offset && offset < mrp->offset + mrp->len && + size == mrp->size && + (write ? (bool)mrp->write : (bool)mrp->read)) { + return mrp; + } + } + return NULL; +} + +static uint64_t portio_read(void *opaque, hwaddr addr, unsigned size) +{ + MemoryRegionPortioList *mrpio = opaque; + const MemoryRegionPortio *mrp = find_portio(mrpio, addr, size, false); + uint64_t data; + + data = ((uint64_t)1 << (size * 8)) - 1; + if (mrp) { + data = mrp->read(mrpio->portio_opaque, mrp->base + addr); + } else if (size == 2) { + mrp = find_portio(mrpio, addr, 1, false); + if (mrp) { + data = mrp->read(mrpio->portio_opaque, mrp->base + addr); + if (addr + 1 < mrp->offset + mrp->len) { + data |= mrp->read(mrpio->portio_opaque, mrp->base + addr + 1) << 8; + } else { + data |= 0xff00; + } + } + } + return data; +} + +static void portio_write(void *opaque, hwaddr addr, uint64_t data, + unsigned size) +{ + MemoryRegionPortioList *mrpio = opaque; + const MemoryRegionPortio *mrp = find_portio(mrpio, addr, size, true); + + if (mrp) { + mrp->write(mrpio->portio_opaque, mrp->base + addr, data); + } else if (size == 2) { + mrp = find_portio(mrpio, addr, 1, true); + if (mrp) { + mrp->write(mrpio->portio_opaque, mrp->base + addr, data & 0xff); + if (addr + 1 < mrp->offset + mrp->len) { + mrp->write(mrpio->portio_opaque, mrp->base + addr + 1, data >> 8); + } + } + } +} + +static const MemoryRegionOps portio_ops = { + .read = portio_read, + .write = portio_write, + .endianness = DEVICE_LITTLE_ENDIAN, + .valid.unaligned = true, + .impl.unaligned = true, +}; + +static void portio_list_add_1(PortioList *piolist, + const MemoryRegionPortio *pio_init, + unsigned count, unsigned start, + unsigned off_low, unsigned off_high) +{ + MemoryRegionPortioList *mrpio; + Object *owner; + char *name; + unsigned i; + + /* Copy the sub-list and null-terminate it. */ + mrpio = MEMORY_REGION_PORTIO_LIST( + object_new(TYPE_MEMORY_REGION_PORTIO_LIST)); + mrpio->portio_opaque = piolist->opaque; + mrpio->ports = g_malloc0(sizeof(MemoryRegionPortio) * (count + 1)); + memcpy(mrpio->ports, pio_init, sizeof(MemoryRegionPortio) * count); + memset(mrpio->ports + count, 0, sizeof(MemoryRegionPortio)); + + /* Adjust the offsets to all be zero-based for the region. */ + for (i = 0; i < count; ++i) { + mrpio->ports[i].offset -= off_low; + mrpio->ports[i].base = start + off_low; + } + + /* + * The MemoryRegion owner is the MemoryRegionPortioList since that manages + * the lifecycle via the refcount + */ + memory_region_init_io(&mrpio->mr, OBJECT(mrpio), &portio_ops, mrpio, + piolist->name, off_high - off_low); + + /* Reparent the MemoryRegion to the piolist owner */ + object_ref(&mrpio->mr); + object_unparent(OBJECT(&mrpio->mr)); + if (!piolist->owner) { + owner = container_get(qdev_get_machine(), "/unattached"); + } else { + owner = piolist->owner; + } + name = g_strdup_printf("%s[*]", piolist->name); + object_property_add_child(owner, name, OBJECT(&mrpio->mr)); + g_free(name); + + if (piolist->flush_coalesced_mmio) { + memory_region_set_flush_coalesced(&mrpio->mr); + } + memory_region_add_subregion(piolist->address_space, + start + off_low, &mrpio->mr); + piolist->regions[piolist->nr] = &mrpio->mr; + ++piolist->nr; +} + +void portio_list_add(PortioList *piolist, + MemoryRegion *address_space, + uint32_t start) +{ + const MemoryRegionPortio *pio, *pio_start = piolist->ports; + unsigned int off_low, off_high, off_last, count; + + piolist->address_space = address_space; + + /* Handle the first entry specially. */ + off_last = off_low = pio_start->offset; + off_high = off_low + pio_start->len + pio_start->size - 1; + count = 1; + + for (pio = pio_start + 1; pio->size != 0; pio++, count++) { + /* All entries must be sorted by offset. */ + assert(pio->offset >= off_last); + off_last = pio->offset; + + /* If we see a hole, break the region. */ + if (off_last > off_high) { + portio_list_add_1(piolist, pio_start, count, start, off_low, + off_high); + /* ... and start collecting anew. */ + pio_start = pio; + off_low = off_last; + off_high = off_low + pio->len + pio_start->size - 1; + count = 0; + } else if (off_last + pio->len > off_high) { + off_high = off_last + pio->len + pio_start->size - 1; + } + } + + /* There will always be an open sub-list. */ + portio_list_add_1(piolist, pio_start, count, start, off_low, off_high); +} + +void portio_list_del(PortioList *piolist) +{ + MemoryRegionPortioList *mrpio; + unsigned i; + + for (i = 0; i < piolist->nr; ++i) { + mrpio = container_of(piolist->regions[i], MemoryRegionPortioList, mr); + memory_region_del_subregion(piolist->address_space, &mrpio->mr); + } +} + +static void memory_region_portio_list_finalize(Object *obj) +{ + MemoryRegionPortioList *mrpio = MEMORY_REGION_PORTIO_LIST(obj); + + object_unref(&mrpio->mr); + g_free(mrpio->ports); +} + +static const TypeInfo memory_region_portio_list_info = { + .parent = TYPE_OBJECT, + .name = TYPE_MEMORY_REGION_PORTIO_LIST, + .instance_size = sizeof(MemoryRegionPortioList), + .instance_finalize = memory_region_portio_list_finalize, +}; + +static void ioport_register_types(void) +{ + type_register_static(&memory_region_portio_list_info); +} + +type_init(ioport_register_types) diff --git a/system/main.c b/system/main.c new file mode 100644 index 0000000000..694388bd7f --- /dev/null +++ b/system/main.c @@ -0,0 +1,49 @@ +/* + * QEMU System Emulator + * + * Copyright (c) 2003-2020 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "qemu/osdep.h" +#include "qemu-main.h" +#include "sysemu/sysemu.h" + +#ifdef CONFIG_SDL +#include <SDL.h> +#endif + +int qemu_default_main(void) +{ + int status; + + status = qemu_main_loop(); + qemu_cleanup(); + + return status; +} + +int (*qemu_main)(void) = qemu_default_main; + +int main(int argc, char **argv) +{ + qemu_init(argc, argv); + return qemu_main(); +} diff --git a/system/memory.c b/system/memory.c new file mode 100644 index 0000000000..fa1c99f9ba --- /dev/null +++ b/system/memory.c @@ -0,0 +1,3683 @@ +/* + * Physical memory management + * + * Copyright 2011 Red Hat, Inc. and/or its affiliates + * + * Authors: + * Avi Kivity <avi@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Contributions after 2012-01-13 are licensed under the terms of the + * GNU GPL, version 2 or (at your option) any later version. + */ + +#include "qemu/osdep.h" +#include "qemu/log.h" +#include "qapi/error.h" +#include "exec/memory.h" +#include "qapi/visitor.h" +#include "qemu/bitops.h" +#include "qemu/error-report.h" +#include "qemu/main-loop.h" +#include "qemu/qemu-print.h" +#include "qom/object.h" +#include "trace.h" + +#include "exec/memory-internal.h" +#include "exec/ram_addr.h" +#include "sysemu/kvm.h" +#include "sysemu/runstate.h" +#include "sysemu/tcg.h" +#include "qemu/accel.h" +#include "hw/boards.h" +#include "migration/vmstate.h" +#include "exec/address-spaces.h" + +//#define DEBUG_UNASSIGNED + +static unsigned memory_region_transaction_depth; +static bool memory_region_update_pending; +static bool ioeventfd_update_pending; +unsigned int global_dirty_tracking; + +static QTAILQ_HEAD(, MemoryListener) memory_listeners + = QTAILQ_HEAD_INITIALIZER(memory_listeners); + +static QTAILQ_HEAD(, AddressSpace) address_spaces + = QTAILQ_HEAD_INITIALIZER(address_spaces); + +static GHashTable *flat_views; + +typedef struct AddrRange AddrRange; + +/* + * Note that signed integers are needed for negative offsetting in aliases + * (large MemoryRegion::alias_offset). + */ +struct AddrRange { + Int128 start; + Int128 size; +}; + +static AddrRange addrrange_make(Int128 start, Int128 size) +{ + return (AddrRange) { start, size }; +} + +static bool addrrange_equal(AddrRange r1, AddrRange r2) +{ + return int128_eq(r1.start, r2.start) && int128_eq(r1.size, r2.size); +} + +static Int128 addrrange_end(AddrRange r) +{ + return int128_add(r.start, r.size); +} + +static AddrRange addrrange_shift(AddrRange range, Int128 delta) +{ + int128_addto(&range.start, delta); + return range; +} + +static bool addrrange_contains(AddrRange range, Int128 addr) +{ + return int128_ge(addr, range.start) + && int128_lt(addr, addrrange_end(range)); +} + +static bool addrrange_intersects(AddrRange r1, AddrRange r2) +{ + return addrrange_contains(r1, r2.start) + || addrrange_contains(r2, r1.start); +} + +static AddrRange addrrange_intersection(AddrRange r1, AddrRange r2) +{ + Int128 start = int128_max(r1.start, r2.start); + Int128 end = int128_min(addrrange_end(r1), addrrange_end(r2)); + return addrrange_make(start, int128_sub(end, start)); +} + +enum ListenerDirection { Forward, Reverse }; + +#define MEMORY_LISTENER_CALL_GLOBAL(_callback, _direction, _args...) \ + do { \ + MemoryListener *_listener; \ + \ + switch (_direction) { \ + case Forward: \ + QTAILQ_FOREACH(_listener, &memory_listeners, link) { \ + if (_listener->_callback) { \ + _listener->_callback(_listener, ##_args); \ + } \ + } \ + break; \ + case Reverse: \ + QTAILQ_FOREACH_REVERSE(_listener, &memory_listeners, link) { \ + if (_listener->_callback) { \ + _listener->_callback(_listener, ##_args); \ + } \ + } \ + break; \ + default: \ + abort(); \ + } \ + } while (0) + +#define MEMORY_LISTENER_CALL(_as, _callback, _direction, _section, _args...) \ + do { \ + MemoryListener *_listener; \ + \ + switch (_direction) { \ + case Forward: \ + QTAILQ_FOREACH(_listener, &(_as)->listeners, link_as) { \ + if (_listener->_callback) { \ + _listener->_callback(_listener, _section, ##_args); \ + } \ + } \ + break; \ + case Reverse: \ + QTAILQ_FOREACH_REVERSE(_listener, &(_as)->listeners, link_as) { \ + if (_listener->_callback) { \ + _listener->_callback(_listener, _section, ##_args); \ + } \ + } \ + break; \ + default: \ + abort(); \ + } \ + } while (0) + +/* No need to ref/unref .mr, the FlatRange keeps it alive. */ +#define MEMORY_LISTENER_UPDATE_REGION(fr, as, dir, callback, _args...) \ + do { \ + MemoryRegionSection mrs = section_from_flat_range(fr, \ + address_space_to_flatview(as)); \ + MEMORY_LISTENER_CALL(as, callback, dir, &mrs, ##_args); \ + } while(0) + +struct CoalescedMemoryRange { + AddrRange addr; + QTAILQ_ENTRY(CoalescedMemoryRange) link; +}; + +struct MemoryRegionIoeventfd { + AddrRange addr; + bool match_data; + uint64_t data; + EventNotifier *e; +}; + +static bool memory_region_ioeventfd_before(MemoryRegionIoeventfd *a, + MemoryRegionIoeventfd *b) +{ + if (int128_lt(a->addr.start, b->addr.start)) { + return true; + } else if (int128_gt(a->addr.start, b->addr.start)) { + return false; + } else if (int128_lt(a->addr.size, b->addr.size)) { + return true; + } else if (int128_gt(a->addr.size, b->addr.size)) { + return false; + } else if (a->match_data < b->match_data) { + return true; + } else if (a->match_data > b->match_data) { + return false; + } else if (a->match_data) { + if (a->data < b->data) { + return true; + } else if (a->data > b->data) { + return false; + } + } + if (a->e < b->e) { + return true; + } else if (a->e > b->e) { + return false; + } + return false; +} + +static bool memory_region_ioeventfd_equal(MemoryRegionIoeventfd *a, + MemoryRegionIoeventfd *b) +{ + if (int128_eq(a->addr.start, b->addr.start) && + (!int128_nz(a->addr.size) || !int128_nz(b->addr.size) || + (int128_eq(a->addr.size, b->addr.size) && + (a->match_data == b->match_data) && + ((a->match_data && (a->data == b->data)) || !a->match_data) && + (a->e == b->e)))) + return true; + + return false; +} + +/* Range of memory in the global map. Addresses are absolute. */ +struct FlatRange { + MemoryRegion *mr; + hwaddr offset_in_region; + AddrRange addr; + uint8_t dirty_log_mask; + bool romd_mode; + bool readonly; + bool nonvolatile; +}; + +#define FOR_EACH_FLAT_RANGE(var, view) \ + for (var = (view)->ranges; var < (view)->ranges + (view)->nr; ++var) + +static inline MemoryRegionSection +section_from_flat_range(FlatRange *fr, FlatView *fv) +{ + return (MemoryRegionSection) { + .mr = fr->mr, + .fv = fv, + .offset_within_region = fr->offset_in_region, + .size = fr->addr.size, + .offset_within_address_space = int128_get64(fr->addr.start), + .readonly = fr->readonly, + .nonvolatile = fr->nonvolatile, + }; +} + +static bool flatrange_equal(FlatRange *a, FlatRange *b) +{ + return a->mr == b->mr + && addrrange_equal(a->addr, b->addr) + && a->offset_in_region == b->offset_in_region + && a->romd_mode == b->romd_mode + && a->readonly == b->readonly + && a->nonvolatile == b->nonvolatile; +} + +static FlatView *flatview_new(MemoryRegion *mr_root) +{ + FlatView *view; + + view = g_new0(FlatView, 1); + view->ref = 1; + view->root = mr_root; + memory_region_ref(mr_root); + trace_flatview_new(view, mr_root); + + return view; +} + +/* Insert a range into a given position. Caller is responsible for maintaining + * sorting order. + */ +static void flatview_insert(FlatView *view, unsigned pos, FlatRange *range) +{ + if (view->nr == view->nr_allocated) { + view->nr_allocated = MAX(2 * view->nr, 10); + view->ranges = g_realloc(view->ranges, + view->nr_allocated * sizeof(*view->ranges)); + } + memmove(view->ranges + pos + 1, view->ranges + pos, + (view->nr - pos) * sizeof(FlatRange)); + view->ranges[pos] = *range; + memory_region_ref(range->mr); + ++view->nr; +} + +static void flatview_destroy(FlatView *view) +{ + int i; + + trace_flatview_destroy(view, view->root); + if (view->dispatch) { + address_space_dispatch_free(view->dispatch); + } + for (i = 0; i < view->nr; i++) { + memory_region_unref(view->ranges[i].mr); + } + g_free(view->ranges); + memory_region_unref(view->root); + g_free(view); +} + +static bool flatview_ref(FlatView *view) +{ + return qatomic_fetch_inc_nonzero(&view->ref) > 0; +} + +void flatview_unref(FlatView *view) +{ + if (qatomic_fetch_dec(&view->ref) == 1) { + trace_flatview_destroy_rcu(view, view->root); + assert(view->root); + call_rcu(view, flatview_destroy, rcu); + } +} + +static bool can_merge(FlatRange *r1, FlatRange *r2) +{ + return int128_eq(addrrange_end(r1->addr), r2->addr.start) + && r1->mr == r2->mr + && int128_eq(int128_add(int128_make64(r1->offset_in_region), + r1->addr.size), + int128_make64(r2->offset_in_region)) + && r1->dirty_log_mask == r2->dirty_log_mask + && r1->romd_mode == r2->romd_mode + && r1->readonly == r2->readonly + && r1->nonvolatile == r2->nonvolatile; +} + +/* Attempt to simplify a view by merging adjacent ranges */ +static void flatview_simplify(FlatView *view) +{ + unsigned i, j, k; + + i = 0; + while (i < view->nr) { + j = i + 1; + while (j < view->nr + && can_merge(&view->ranges[j-1], &view->ranges[j])) { + int128_addto(&view->ranges[i].addr.size, view->ranges[j].addr.size); + ++j; + } + ++i; + for (k = i; k < j; k++) { + memory_region_unref(view->ranges[k].mr); + } + memmove(&view->ranges[i], &view->ranges[j], + (view->nr - j) * sizeof(view->ranges[j])); + view->nr -= j - i; + } +} + +static bool memory_region_big_endian(MemoryRegion *mr) +{ +#if TARGET_BIG_ENDIAN + return mr->ops->endianness != DEVICE_LITTLE_ENDIAN; +#else + return mr->ops->endianness == DEVICE_BIG_ENDIAN; +#endif +} + +static void adjust_endianness(MemoryRegion *mr, uint64_t *data, MemOp op) +{ + if ((op & MO_BSWAP) != devend_memop(mr->ops->endianness)) { + switch (op & MO_SIZE) { + case MO_8: + break; + case MO_16: + *data = bswap16(*data); + break; + case MO_32: + *data = bswap32(*data); + break; + case MO_64: + *data = bswap64(*data); + break; + default: + g_assert_not_reached(); + } + } +} + +static inline void memory_region_shift_read_access(uint64_t *value, + signed shift, + uint64_t mask, + uint64_t tmp) +{ + if (shift >= 0) { + *value |= (tmp & mask) << shift; + } else { + *value |= (tmp & mask) >> -shift; + } +} + +static inline uint64_t memory_region_shift_write_access(uint64_t *value, + signed shift, + uint64_t mask) +{ + uint64_t tmp; + + if (shift >= 0) { + tmp = (*value >> shift) & mask; + } else { + tmp = (*value << -shift) & mask; + } + + return tmp; +} + +static hwaddr memory_region_to_absolute_addr(MemoryRegion *mr, hwaddr offset) +{ + MemoryRegion *root; + hwaddr abs_addr = offset; + + abs_addr += mr->addr; + for (root = mr; root->container; ) { + root = root->container; + abs_addr += root->addr; + } + + return abs_addr; +} + +static int get_cpu_index(void) +{ + if (current_cpu) { + return current_cpu->cpu_index; + } + return -1; +} + +static MemTxResult memory_region_read_accessor(MemoryRegion *mr, + hwaddr addr, + uint64_t *value, + unsigned size, + signed shift, + uint64_t mask, + MemTxAttrs attrs) +{ + uint64_t tmp; + + tmp = mr->ops->read(mr->opaque, addr, size); + if (mr->subpage) { + trace_memory_region_subpage_read(get_cpu_index(), mr, addr, tmp, size); + } else if (trace_event_get_state_backends(TRACE_MEMORY_REGION_OPS_READ)) { + hwaddr abs_addr = memory_region_to_absolute_addr(mr, addr); + trace_memory_region_ops_read(get_cpu_index(), mr, abs_addr, tmp, size, + memory_region_name(mr)); + } + memory_region_shift_read_access(value, shift, mask, tmp); + return MEMTX_OK; +} + +static MemTxResult memory_region_read_with_attrs_accessor(MemoryRegion *mr, + hwaddr addr, + uint64_t *value, + unsigned size, + signed shift, + uint64_t mask, + MemTxAttrs attrs) +{ + uint64_t tmp = 0; + MemTxResult r; + + r = mr->ops->read_with_attrs(mr->opaque, addr, &tmp, size, attrs); + if (mr->subpage) { + trace_memory_region_subpage_read(get_cpu_index(), mr, addr, tmp, size); + } else if (trace_event_get_state_backends(TRACE_MEMORY_REGION_OPS_READ)) { + hwaddr abs_addr = memory_region_to_absolute_addr(mr, addr); + trace_memory_region_ops_read(get_cpu_index(), mr, abs_addr, tmp, size, + memory_region_name(mr)); + } + memory_region_shift_read_access(value, shift, mask, tmp); + return r; +} + +static MemTxResult memory_region_write_accessor(MemoryRegion *mr, + hwaddr addr, + uint64_t *value, + unsigned size, + signed shift, + uint64_t mask, + MemTxAttrs attrs) +{ + uint64_t tmp = memory_region_shift_write_access(value, shift, mask); + + if (mr->subpage) { + trace_memory_region_subpage_write(get_cpu_index(), mr, addr, tmp, size); + } else if (trace_event_get_state_backends(TRACE_MEMORY_REGION_OPS_WRITE)) { + hwaddr abs_addr = memory_region_to_absolute_addr(mr, addr); + trace_memory_region_ops_write(get_cpu_index(), mr, abs_addr, tmp, size, + memory_region_name(mr)); + } + mr->ops->write(mr->opaque, addr, tmp, size); + return MEMTX_OK; +} + +static MemTxResult memory_region_write_with_attrs_accessor(MemoryRegion *mr, + hwaddr addr, + uint64_t *value, + unsigned size, + signed shift, + uint64_t mask, + MemTxAttrs attrs) +{ + uint64_t tmp = memory_region_shift_write_access(value, shift, mask); + + if (mr->subpage) { + trace_memory_region_subpage_write(get_cpu_index(), mr, addr, tmp, size); + } else if (trace_event_get_state_backends(TRACE_MEMORY_REGION_OPS_WRITE)) { + hwaddr abs_addr = memory_region_to_absolute_addr(mr, addr); + trace_memory_region_ops_write(get_cpu_index(), mr, abs_addr, tmp, size, + memory_region_name(mr)); + } + return mr->ops->write_with_attrs(mr->opaque, addr, tmp, size, attrs); +} + +static MemTxResult access_with_adjusted_size(hwaddr addr, + uint64_t *value, + unsigned size, + unsigned access_size_min, + unsigned access_size_max, + MemTxResult (*access_fn) + (MemoryRegion *mr, + hwaddr addr, + uint64_t *value, + unsigned size, + signed shift, + uint64_t mask, + MemTxAttrs attrs), + MemoryRegion *mr, + MemTxAttrs attrs) +{ + uint64_t access_mask; + unsigned access_size; + unsigned i; + MemTxResult r = MEMTX_OK; + bool reentrancy_guard_applied = false; + + if (!access_size_min) { + access_size_min = 1; + } + if (!access_size_max) { + access_size_max = 4; + } + + /* Do not allow more than one simultaneous access to a device's IO Regions */ + if (mr->dev && !mr->disable_reentrancy_guard && + !mr->ram_device && !mr->ram && !mr->rom_device && !mr->readonly) { + if (mr->dev->mem_reentrancy_guard.engaged_in_io) { + warn_report_once("Blocked re-entrant IO on MemoryRegion: " + "%s at addr: 0x%" HWADDR_PRIX, + memory_region_name(mr), addr); + return MEMTX_ACCESS_ERROR; + } + mr->dev->mem_reentrancy_guard.engaged_in_io = true; + reentrancy_guard_applied = true; + } + + /* FIXME: support unaligned access? */ + access_size = MAX(MIN(size, access_size_max), access_size_min); + access_mask = MAKE_64BIT_MASK(0, access_size * 8); + if (memory_region_big_endian(mr)) { + for (i = 0; i < size; i += access_size) { + r |= access_fn(mr, addr + i, value, access_size, + (size - access_size - i) * 8, access_mask, attrs); + } + } else { + for (i = 0; i < size; i += access_size) { + r |= access_fn(mr, addr + i, value, access_size, i * 8, + access_mask, attrs); + } + } + if (mr->dev && reentrancy_guard_applied) { + mr->dev->mem_reentrancy_guard.engaged_in_io = false; + } + return r; +} + +static AddressSpace *memory_region_to_address_space(MemoryRegion *mr) +{ + AddressSpace *as; + + while (mr->container) { + mr = mr->container; + } + QTAILQ_FOREACH(as, &address_spaces, address_spaces_link) { + if (mr == as->root) { + return as; + } + } + return NULL; +} + +/* Render a memory region into the global view. Ranges in @view obscure + * ranges in @mr. + */ +static void render_memory_region(FlatView *view, + MemoryRegion *mr, + Int128 base, + AddrRange clip, + bool readonly, + bool nonvolatile) +{ + MemoryRegion *subregion; + unsigned i; + hwaddr offset_in_region; + Int128 remain; + Int128 now; + FlatRange fr; + AddrRange tmp; + + if (!mr->enabled) { + return; + } + + int128_addto(&base, int128_make64(mr->addr)); + readonly |= mr->readonly; + nonvolatile |= mr->nonvolatile; + + tmp = addrrange_make(base, mr->size); + + if (!addrrange_intersects(tmp, clip)) { + return; + } + + clip = addrrange_intersection(tmp, clip); + + if (mr->alias) { + int128_subfrom(&base, int128_make64(mr->alias->addr)); + int128_subfrom(&base, int128_make64(mr->alias_offset)); + render_memory_region(view, mr->alias, base, clip, + readonly, nonvolatile); + return; + } + + /* Render subregions in priority order. */ + QTAILQ_FOREACH(subregion, &mr->subregions, subregions_link) { + render_memory_region(view, subregion, base, clip, + readonly, nonvolatile); + } + + if (!mr->terminates) { + return; + } + + offset_in_region = int128_get64(int128_sub(clip.start, base)); + base = clip.start; + remain = clip.size; + + fr.mr = mr; + fr.dirty_log_mask = memory_region_get_dirty_log_mask(mr); + fr.romd_mode = mr->romd_mode; + fr.readonly = readonly; + fr.nonvolatile = nonvolatile; + + /* Render the region itself into any gaps left by the current view. */ + for (i = 0; i < view->nr && int128_nz(remain); ++i) { + if (int128_ge(base, addrrange_end(view->ranges[i].addr))) { + continue; + } + if (int128_lt(base, view->ranges[i].addr.start)) { + now = int128_min(remain, + int128_sub(view->ranges[i].addr.start, base)); + fr.offset_in_region = offset_in_region; + fr.addr = addrrange_make(base, now); + flatview_insert(view, i, &fr); + ++i; + int128_addto(&base, now); + offset_in_region += int128_get64(now); + int128_subfrom(&remain, now); + } + now = int128_sub(int128_min(int128_add(base, remain), + addrrange_end(view->ranges[i].addr)), + base); + int128_addto(&base, now); + offset_in_region += int128_get64(now); + int128_subfrom(&remain, now); + } + if (int128_nz(remain)) { + fr.offset_in_region = offset_in_region; + fr.addr = addrrange_make(base, remain); + flatview_insert(view, i, &fr); + } +} + +void flatview_for_each_range(FlatView *fv, flatview_cb cb , void *opaque) +{ + FlatRange *fr; + + assert(fv); + assert(cb); + + FOR_EACH_FLAT_RANGE(fr, fv) { + if (cb(fr->addr.start, fr->addr.size, fr->mr, + fr->offset_in_region, opaque)) { + break; + } + } +} + +static MemoryRegion *memory_region_get_flatview_root(MemoryRegion *mr) +{ + while (mr->enabled) { + if (mr->alias) { + if (!mr->alias_offset && int128_ge(mr->size, mr->alias->size)) { + /* The alias is included in its entirety. Use it as + * the "real" root, so that we can share more FlatViews. + */ + mr = mr->alias; + continue; + } + } else if (!mr->terminates) { + unsigned int found = 0; + MemoryRegion *child, *next = NULL; + QTAILQ_FOREACH(child, &mr->subregions, subregions_link) { + if (child->enabled) { + if (++found > 1) { + next = NULL; + break; + } + if (!child->addr && int128_ge(mr->size, child->size)) { + /* A child is included in its entirety. If it's the only + * enabled one, use it in the hope of finding an alias down the + * way. This will also let us share FlatViews. + */ + next = child; + } + } + } + if (found == 0) { + return NULL; + } + if (next) { + mr = next; + continue; + } + } + + return mr; + } + + return NULL; +} + +/* Render a memory topology into a list of disjoint absolute ranges. */ +static FlatView *generate_memory_topology(MemoryRegion *mr) +{ + int i; + FlatView *view; + + view = flatview_new(mr); + + if (mr) { + render_memory_region(view, mr, int128_zero(), + addrrange_make(int128_zero(), int128_2_64()), + false, false); + } + flatview_simplify(view); + + view->dispatch = address_space_dispatch_new(view); + for (i = 0; i < view->nr; i++) { + MemoryRegionSection mrs = + section_from_flat_range(&view->ranges[i], view); + flatview_add_to_dispatch(view, &mrs); + } + address_space_dispatch_compact(view->dispatch); + g_hash_table_replace(flat_views, mr, view); + + return view; +} + +static void address_space_add_del_ioeventfds(AddressSpace *as, + MemoryRegionIoeventfd *fds_new, + unsigned fds_new_nb, + MemoryRegionIoeventfd *fds_old, + unsigned fds_old_nb) +{ + unsigned iold, inew; + MemoryRegionIoeventfd *fd; + MemoryRegionSection section; + + /* Generate a symmetric difference of the old and new fd sets, adding + * and deleting as necessary. + */ + + iold = inew = 0; + while (iold < fds_old_nb || inew < fds_new_nb) { + if (iold < fds_old_nb + && (inew == fds_new_nb + || memory_region_ioeventfd_before(&fds_old[iold], + &fds_new[inew]))) { + fd = &fds_old[iold]; + section = (MemoryRegionSection) { + .fv = address_space_to_flatview(as), + .offset_within_address_space = int128_get64(fd->addr.start), + .size = fd->addr.size, + }; + MEMORY_LISTENER_CALL(as, eventfd_del, Forward, §ion, + fd->match_data, fd->data, fd->e); + ++iold; + } else if (inew < fds_new_nb + && (iold == fds_old_nb + || memory_region_ioeventfd_before(&fds_new[inew], + &fds_old[iold]))) { + fd = &fds_new[inew]; + section = (MemoryRegionSection) { + .fv = address_space_to_flatview(as), + .offset_within_address_space = int128_get64(fd->addr.start), + .size = fd->addr.size, + }; + MEMORY_LISTENER_CALL(as, eventfd_add, Reverse, §ion, + fd->match_data, fd->data, fd->e); + ++inew; + } else { + ++iold; + ++inew; + } + } +} + +FlatView *address_space_get_flatview(AddressSpace *as) +{ + FlatView *view; + + RCU_READ_LOCK_GUARD(); + do { + view = address_space_to_flatview(as); + /* If somebody has replaced as->current_map concurrently, + * flatview_ref returns false. + */ + } while (!flatview_ref(view)); + return view; +} + +static void address_space_update_ioeventfds(AddressSpace *as) +{ + FlatView *view; + FlatRange *fr; + unsigned ioeventfd_nb = 0; + unsigned ioeventfd_max; + MemoryRegionIoeventfd *ioeventfds; + AddrRange tmp; + unsigned i; + + if (!as->ioeventfd_notifiers) { + return; + } + + /* + * It is likely that the number of ioeventfds hasn't changed much, so use + * the previous size as the starting value, with some headroom to avoid + * gratuitous reallocations. + */ + ioeventfd_max = QEMU_ALIGN_UP(as->ioeventfd_nb, 4); + ioeventfds = g_new(MemoryRegionIoeventfd, ioeventfd_max); + + view = address_space_get_flatview(as); + FOR_EACH_FLAT_RANGE(fr, view) { + for (i = 0; i < fr->mr->ioeventfd_nb; ++i) { + tmp = addrrange_shift(fr->mr->ioeventfds[i].addr, + int128_sub(fr->addr.start, + int128_make64(fr->offset_in_region))); + if (addrrange_intersects(fr->addr, tmp)) { + ++ioeventfd_nb; + if (ioeventfd_nb > ioeventfd_max) { + ioeventfd_max = MAX(ioeventfd_max * 2, 4); + ioeventfds = g_realloc(ioeventfds, + ioeventfd_max * sizeof(*ioeventfds)); + } + ioeventfds[ioeventfd_nb-1] = fr->mr->ioeventfds[i]; + ioeventfds[ioeventfd_nb-1].addr = tmp; + } + } + } + + address_space_add_del_ioeventfds(as, ioeventfds, ioeventfd_nb, + as->ioeventfds, as->ioeventfd_nb); + + g_free(as->ioeventfds); + as->ioeventfds = ioeventfds; + as->ioeventfd_nb = ioeventfd_nb; + flatview_unref(view); +} + +/* + * Notify the memory listeners about the coalesced IO change events of + * range `cmr'. Only the part that has intersection of the specified + * FlatRange will be sent. + */ +static void flat_range_coalesced_io_notify(FlatRange *fr, AddressSpace *as, + CoalescedMemoryRange *cmr, bool add) +{ + AddrRange tmp; + + tmp = addrrange_shift(cmr->addr, + int128_sub(fr->addr.start, + int128_make64(fr->offset_in_region))); + if (!addrrange_intersects(tmp, fr->addr)) { + return; + } + tmp = addrrange_intersection(tmp, fr->addr); + + if (add) { + MEMORY_LISTENER_UPDATE_REGION(fr, as, Forward, coalesced_io_add, + int128_get64(tmp.start), + int128_get64(tmp.size)); + } else { + MEMORY_LISTENER_UPDATE_REGION(fr, as, Reverse, coalesced_io_del, + int128_get64(tmp.start), + int128_get64(tmp.size)); + } +} + +static void flat_range_coalesced_io_del(FlatRange *fr, AddressSpace *as) +{ + CoalescedMemoryRange *cmr; + + QTAILQ_FOREACH(cmr, &fr->mr->coalesced, link) { + flat_range_coalesced_io_notify(fr, as, cmr, false); + } +} + +static void flat_range_coalesced_io_add(FlatRange *fr, AddressSpace *as) +{ + MemoryRegion *mr = fr->mr; + CoalescedMemoryRange *cmr; + + if (QTAILQ_EMPTY(&mr->coalesced)) { + return; + } + + QTAILQ_FOREACH(cmr, &mr->coalesced, link) { + flat_range_coalesced_io_notify(fr, as, cmr, true); + } +} + +static void address_space_update_topology_pass(AddressSpace *as, + const FlatView *old_view, + const FlatView *new_view, + bool adding) +{ + unsigned iold, inew; + FlatRange *frold, *frnew; + + /* Generate a symmetric difference of the old and new memory maps. + * Kill ranges in the old map, and instantiate ranges in the new map. + */ + iold = inew = 0; + while (iold < old_view->nr || inew < new_view->nr) { + if (iold < old_view->nr) { + frold = &old_view->ranges[iold]; + } else { + frold = NULL; + } + if (inew < new_view->nr) { + frnew = &new_view->ranges[inew]; + } else { + frnew = NULL; + } + + if (frold + && (!frnew + || int128_lt(frold->addr.start, frnew->addr.start) + || (int128_eq(frold->addr.start, frnew->addr.start) + && !flatrange_equal(frold, frnew)))) { + /* In old but not in new, or in both but attributes changed. */ + + if (!adding) { + flat_range_coalesced_io_del(frold, as); + MEMORY_LISTENER_UPDATE_REGION(frold, as, Reverse, region_del); + } + + ++iold; + } else if (frold && frnew && flatrange_equal(frold, frnew)) { + /* In both and unchanged (except logging may have changed) */ + + if (adding) { + MEMORY_LISTENER_UPDATE_REGION(frnew, as, Forward, region_nop); + if (frnew->dirty_log_mask & ~frold->dirty_log_mask) { + MEMORY_LISTENER_UPDATE_REGION(frnew, as, Forward, log_start, + frold->dirty_log_mask, + frnew->dirty_log_mask); + } + if (frold->dirty_log_mask & ~frnew->dirty_log_mask) { + MEMORY_LISTENER_UPDATE_REGION(frnew, as, Reverse, log_stop, + frold->dirty_log_mask, + frnew->dirty_log_mask); + } + } + + ++iold; + ++inew; + } else { + /* In new */ + + if (adding) { + MEMORY_LISTENER_UPDATE_REGION(frnew, as, Forward, region_add); + flat_range_coalesced_io_add(frnew, as); + } + + ++inew; + } + } +} + +static void flatviews_init(void) +{ + static FlatView *empty_view; + + if (flat_views) { + return; + } + + flat_views = g_hash_table_new_full(g_direct_hash, g_direct_equal, NULL, + (GDestroyNotify) flatview_unref); + if (!empty_view) { + empty_view = generate_memory_topology(NULL); + /* We keep it alive forever in the global variable. */ + flatview_ref(empty_view); + } else { + g_hash_table_replace(flat_views, NULL, empty_view); + flatview_ref(empty_view); + } +} + +static void flatviews_reset(void) +{ + AddressSpace *as; + + if (flat_views) { + g_hash_table_unref(flat_views); + flat_views = NULL; + } + flatviews_init(); + + /* Render unique FVs */ + QTAILQ_FOREACH(as, &address_spaces, address_spaces_link) { + MemoryRegion *physmr = memory_region_get_flatview_root(as->root); + + if (g_hash_table_lookup(flat_views, physmr)) { + continue; + } + + generate_memory_topology(physmr); + } +} + +static void address_space_set_flatview(AddressSpace *as) +{ + FlatView *old_view = address_space_to_flatview(as); + MemoryRegion *physmr = memory_region_get_flatview_root(as->root); + FlatView *new_view = g_hash_table_lookup(flat_views, physmr); + + assert(new_view); + + if (old_view == new_view) { + return; + } + + if (old_view) { + flatview_ref(old_view); + } + + flatview_ref(new_view); + + if (!QTAILQ_EMPTY(&as->listeners)) { + FlatView tmpview = { .nr = 0 }, *old_view2 = old_view; + + if (!old_view2) { + old_view2 = &tmpview; + } + address_space_update_topology_pass(as, old_view2, new_view, false); + address_space_update_topology_pass(as, old_view2, new_view, true); + } + + /* Writes are protected by the BQL. */ + qatomic_rcu_set(&as->current_map, new_view); + if (old_view) { + flatview_unref(old_view); + } + + /* Note that all the old MemoryRegions are still alive up to this + * point. This relieves most MemoryListeners from the need to + * ref/unref the MemoryRegions they get---unless they use them + * outside the iothread mutex, in which case precise reference + * counting is necessary. + */ + if (old_view) { + flatview_unref(old_view); + } +} + +static void address_space_update_topology(AddressSpace *as) +{ + MemoryRegion *physmr = memory_region_get_flatview_root(as->root); + + flatviews_init(); + if (!g_hash_table_lookup(flat_views, physmr)) { + generate_memory_topology(physmr); + } + address_space_set_flatview(as); +} + +void memory_region_transaction_begin(void) +{ + qemu_flush_coalesced_mmio_buffer(); + ++memory_region_transaction_depth; +} + +void memory_region_transaction_commit(void) +{ + AddressSpace *as; + + assert(memory_region_transaction_depth); + assert(qemu_mutex_iothread_locked()); + + --memory_region_transaction_depth; + if (!memory_region_transaction_depth) { + if (memory_region_update_pending) { + flatviews_reset(); + + MEMORY_LISTENER_CALL_GLOBAL(begin, Forward); + + QTAILQ_FOREACH(as, &address_spaces, address_spaces_link) { + address_space_set_flatview(as); + address_space_update_ioeventfds(as); + } + memory_region_update_pending = false; + ioeventfd_update_pending = false; + MEMORY_LISTENER_CALL_GLOBAL(commit, Forward); + } else if (ioeventfd_update_pending) { + QTAILQ_FOREACH(as, &address_spaces, address_spaces_link) { + address_space_update_ioeventfds(as); + } + ioeventfd_update_pending = false; + } + } +} + +static void memory_region_destructor_none(MemoryRegion *mr) +{ +} + +static void memory_region_destructor_ram(MemoryRegion *mr) +{ + qemu_ram_free(mr->ram_block); +} + +static bool memory_region_need_escape(char c) +{ + return c == '/' || c == '[' || c == '\\' || c == ']'; +} + +static char *memory_region_escape_name(const char *name) +{ + const char *p; + char *escaped, *q; + uint8_t c; + size_t bytes = 0; + + for (p = name; *p; p++) { + bytes += memory_region_need_escape(*p) ? 4 : 1; + } + if (bytes == p - name) { + return g_memdup(name, bytes + 1); + } + + escaped = g_malloc(bytes + 1); + for (p = name, q = escaped; *p; p++) { + c = *p; + if (unlikely(memory_region_need_escape(c))) { + *q++ = '\\'; + *q++ = 'x'; + *q++ = "0123456789abcdef"[c >> 4]; + c = "0123456789abcdef"[c & 15]; + } + *q++ = c; + } + *q = 0; + return escaped; +} + +static void memory_region_do_init(MemoryRegion *mr, + Object *owner, + const char *name, + uint64_t size) +{ + mr->size = int128_make64(size); + if (size == UINT64_MAX) { + mr->size = int128_2_64(); + } + mr->name = g_strdup(name); + mr->owner = owner; + mr->dev = (DeviceState *) object_dynamic_cast(mr->owner, TYPE_DEVICE); + mr->ram_block = NULL; + + if (name) { + char *escaped_name = memory_region_escape_name(name); + char *name_array = g_strdup_printf("%s[*]", escaped_name); + + if (!owner) { + owner = container_get(qdev_get_machine(), "/unattached"); + } + + object_property_add_child(owner, name_array, OBJECT(mr)); + object_unref(OBJECT(mr)); + g_free(name_array); + g_free(escaped_name); + } +} + +void memory_region_init(MemoryRegion *mr, + Object *owner, + const char *name, + uint64_t size) +{ + object_initialize(mr, sizeof(*mr), TYPE_MEMORY_REGION); + memory_region_do_init(mr, owner, name, size); +} + +static void memory_region_get_container(Object *obj, Visitor *v, + const char *name, void *opaque, + Error **errp) +{ + MemoryRegion *mr = MEMORY_REGION(obj); + char *path = (char *)""; + + if (mr->container) { + path = object_get_canonical_path(OBJECT(mr->container)); + } + visit_type_str(v, name, &path, errp); + if (mr->container) { + g_free(path); + } +} + +static Object *memory_region_resolve_container(Object *obj, void *opaque, + const char *part) +{ + MemoryRegion *mr = MEMORY_REGION(obj); + + return OBJECT(mr->container); +} + +static void memory_region_get_priority(Object *obj, Visitor *v, + const char *name, void *opaque, + Error **errp) +{ + MemoryRegion *mr = MEMORY_REGION(obj); + int32_t value = mr->priority; + + visit_type_int32(v, name, &value, errp); +} + +static void memory_region_get_size(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + MemoryRegion *mr = MEMORY_REGION(obj); + uint64_t value = memory_region_size(mr); + + visit_type_uint64(v, name, &value, errp); +} + +static void memory_region_initfn(Object *obj) +{ + MemoryRegion *mr = MEMORY_REGION(obj); + ObjectProperty *op; + + mr->ops = &unassigned_mem_ops; + mr->enabled = true; + mr->romd_mode = true; + mr->destructor = memory_region_destructor_none; + QTAILQ_INIT(&mr->subregions); + QTAILQ_INIT(&mr->coalesced); + + op = object_property_add(OBJECT(mr), "container", + "link<" TYPE_MEMORY_REGION ">", + memory_region_get_container, + NULL, /* memory_region_set_container */ + NULL, NULL); + op->resolve = memory_region_resolve_container; + + object_property_add_uint64_ptr(OBJECT(mr), "addr", + &mr->addr, OBJ_PROP_FLAG_READ); + object_property_add(OBJECT(mr), "priority", "uint32", + memory_region_get_priority, + NULL, /* memory_region_set_priority */ + NULL, NULL); + object_property_add(OBJECT(mr), "size", "uint64", + memory_region_get_size, + NULL, /* memory_region_set_size, */ + NULL, NULL); +} + +static void iommu_memory_region_initfn(Object *obj) +{ + MemoryRegion *mr = MEMORY_REGION(obj); + + mr->is_iommu = true; +} + +static uint64_t unassigned_mem_read(void *opaque, hwaddr addr, + unsigned size) +{ +#ifdef DEBUG_UNASSIGNED + printf("Unassigned mem read " HWADDR_FMT_plx "\n", addr); +#endif + return 0; +} + +static void unassigned_mem_write(void *opaque, hwaddr addr, + uint64_t val, unsigned size) +{ +#ifdef DEBUG_UNASSIGNED + printf("Unassigned mem write " HWADDR_FMT_plx " = 0x%"PRIx64"\n", addr, val); +#endif +} + +static bool unassigned_mem_accepts(void *opaque, hwaddr addr, + unsigned size, bool is_write, + MemTxAttrs attrs) +{ + return false; +} + +const MemoryRegionOps unassigned_mem_ops = { + .valid.accepts = unassigned_mem_accepts, + .endianness = DEVICE_NATIVE_ENDIAN, +}; + +static uint64_t memory_region_ram_device_read(void *opaque, + hwaddr addr, unsigned size) +{ + MemoryRegion *mr = opaque; + uint64_t data = (uint64_t)~0; + + switch (size) { + case 1: + data = *(uint8_t *)(mr->ram_block->host + addr); + break; + case 2: + data = *(uint16_t *)(mr->ram_block->host + addr); + break; + case 4: + data = *(uint32_t *)(mr->ram_block->host + addr); + break; + case 8: + data = *(uint64_t *)(mr->ram_block->host + addr); + break; + } + + trace_memory_region_ram_device_read(get_cpu_index(), mr, addr, data, size); + + return data; +} + +static void memory_region_ram_device_write(void *opaque, hwaddr addr, + uint64_t data, unsigned size) +{ + MemoryRegion *mr = opaque; + + trace_memory_region_ram_device_write(get_cpu_index(), mr, addr, data, size); + + switch (size) { + case 1: + *(uint8_t *)(mr->ram_block->host + addr) = (uint8_t)data; + break; + case 2: + *(uint16_t *)(mr->ram_block->host + addr) = (uint16_t)data; + break; + case 4: + *(uint32_t *)(mr->ram_block->host + addr) = (uint32_t)data; + break; + case 8: + *(uint64_t *)(mr->ram_block->host + addr) = data; + break; + } +} + +static const MemoryRegionOps ram_device_mem_ops = { + .read = memory_region_ram_device_read, + .write = memory_region_ram_device_write, + .endianness = DEVICE_HOST_ENDIAN, + .valid = { + .min_access_size = 1, + .max_access_size = 8, + .unaligned = true, + }, + .impl = { + .min_access_size = 1, + .max_access_size = 8, + .unaligned = true, + }, +}; + +bool memory_region_access_valid(MemoryRegion *mr, + hwaddr addr, + unsigned size, + bool is_write, + MemTxAttrs attrs) +{ + if (mr->ops->valid.accepts + && !mr->ops->valid.accepts(mr->opaque, addr, size, is_write, attrs)) { + qemu_log_mask(LOG_GUEST_ERROR, "Invalid %s at addr 0x%" HWADDR_PRIX + ", size %u, region '%s', reason: rejected\n", + is_write ? "write" : "read", + addr, size, memory_region_name(mr)); + return false; + } + + if (!mr->ops->valid.unaligned && (addr & (size - 1))) { + qemu_log_mask(LOG_GUEST_ERROR, "Invalid %s at addr 0x%" HWADDR_PRIX + ", size %u, region '%s', reason: unaligned\n", + is_write ? "write" : "read", + addr, size, memory_region_name(mr)); + return false; + } + + /* Treat zero as compatibility all valid */ + if (!mr->ops->valid.max_access_size) { + return true; + } + + if (size > mr->ops->valid.max_access_size + || size < mr->ops->valid.min_access_size) { + qemu_log_mask(LOG_GUEST_ERROR, "Invalid %s at addr 0x%" HWADDR_PRIX + ", size %u, region '%s', reason: invalid size " + "(min:%u max:%u)\n", + is_write ? "write" : "read", + addr, size, memory_region_name(mr), + mr->ops->valid.min_access_size, + mr->ops->valid.max_access_size); + return false; + } + return true; +} + +static MemTxResult memory_region_dispatch_read1(MemoryRegion *mr, + hwaddr addr, + uint64_t *pval, + unsigned size, + MemTxAttrs attrs) +{ + *pval = 0; + + if (mr->ops->read) { + return access_with_adjusted_size(addr, pval, size, + mr->ops->impl.min_access_size, + mr->ops->impl.max_access_size, + memory_region_read_accessor, + mr, attrs); + } else { + return access_with_adjusted_size(addr, pval, size, + mr->ops->impl.min_access_size, + mr->ops->impl.max_access_size, + memory_region_read_with_attrs_accessor, + mr, attrs); + } +} + +MemTxResult memory_region_dispatch_read(MemoryRegion *mr, + hwaddr addr, + uint64_t *pval, + MemOp op, + MemTxAttrs attrs) +{ + unsigned size = memop_size(op); + MemTxResult r; + + if (mr->alias) { + return memory_region_dispatch_read(mr->alias, + mr->alias_offset + addr, + pval, op, attrs); + } + if (!memory_region_access_valid(mr, addr, size, false, attrs)) { + *pval = unassigned_mem_read(mr, addr, size); + return MEMTX_DECODE_ERROR; + } + + r = memory_region_dispatch_read1(mr, addr, pval, size, attrs); + adjust_endianness(mr, pval, op); + return r; +} + +/* Return true if an eventfd was signalled */ +static bool memory_region_dispatch_write_eventfds(MemoryRegion *mr, + hwaddr addr, + uint64_t data, + unsigned size, + MemTxAttrs attrs) +{ + MemoryRegionIoeventfd ioeventfd = { + .addr = addrrange_make(int128_make64(addr), int128_make64(size)), + .data = data, + }; + unsigned i; + + for (i = 0; i < mr->ioeventfd_nb; i++) { + ioeventfd.match_data = mr->ioeventfds[i].match_data; + ioeventfd.e = mr->ioeventfds[i].e; + + if (memory_region_ioeventfd_equal(&ioeventfd, &mr->ioeventfds[i])) { + event_notifier_set(ioeventfd.e); + return true; + } + } + + return false; +} + +MemTxResult memory_region_dispatch_write(MemoryRegion *mr, + hwaddr addr, + uint64_t data, + MemOp op, + MemTxAttrs attrs) +{ + unsigned size = memop_size(op); + + if (mr->alias) { + return memory_region_dispatch_write(mr->alias, + mr->alias_offset + addr, + data, op, attrs); + } + if (!memory_region_access_valid(mr, addr, size, true, attrs)) { + unassigned_mem_write(mr, addr, data, size); + return MEMTX_DECODE_ERROR; + } + + adjust_endianness(mr, &data, op); + + if ((!kvm_eventfds_enabled()) && + memory_region_dispatch_write_eventfds(mr, addr, data, size, attrs)) { + return MEMTX_OK; + } + + if (mr->ops->write) { + return access_with_adjusted_size(addr, &data, size, + mr->ops->impl.min_access_size, + mr->ops->impl.max_access_size, + memory_region_write_accessor, mr, + attrs); + } else { + return + access_with_adjusted_size(addr, &data, size, + mr->ops->impl.min_access_size, + mr->ops->impl.max_access_size, + memory_region_write_with_attrs_accessor, + mr, attrs); + } +} + +void memory_region_init_io(MemoryRegion *mr, + Object *owner, + const MemoryRegionOps *ops, + void *opaque, + const char *name, + uint64_t size) +{ + memory_region_init(mr, owner, name, size); + mr->ops = ops ? ops : &unassigned_mem_ops; + mr->opaque = opaque; + mr->terminates = true; +} + +void memory_region_init_ram_nomigrate(MemoryRegion *mr, + Object *owner, + const char *name, + uint64_t size, + Error **errp) +{ + memory_region_init_ram_flags_nomigrate(mr, owner, name, size, 0, errp); +} + +void memory_region_init_ram_flags_nomigrate(MemoryRegion *mr, + Object *owner, + const char *name, + uint64_t size, + uint32_t ram_flags, + Error **errp) +{ + Error *err = NULL; + memory_region_init(mr, owner, name, size); + mr->ram = true; + mr->terminates = true; + mr->destructor = memory_region_destructor_ram; + mr->ram_block = qemu_ram_alloc(size, ram_flags, mr, &err); + if (err) { + mr->size = int128_zero(); + object_unparent(OBJECT(mr)); + error_propagate(errp, err); + } +} + +void memory_region_init_resizeable_ram(MemoryRegion *mr, + Object *owner, + const char *name, + uint64_t size, + uint64_t max_size, + void (*resized)(const char*, + uint64_t length, + void *host), + Error **errp) +{ + Error *err = NULL; + memory_region_init(mr, owner, name, size); + mr->ram = true; + mr->terminates = true; + mr->destructor = memory_region_destructor_ram; + mr->ram_block = qemu_ram_alloc_resizeable(size, max_size, resized, + mr, &err); + if (err) { + mr->size = int128_zero(); + object_unparent(OBJECT(mr)); + error_propagate(errp, err); + } +} + +#ifdef CONFIG_POSIX +void memory_region_init_ram_from_file(MemoryRegion *mr, + Object *owner, + const char *name, + uint64_t size, + uint64_t align, + uint32_t ram_flags, + const char *path, + ram_addr_t offset, + Error **errp) +{ + Error *err = NULL; + memory_region_init(mr, owner, name, size); + mr->ram = true; + mr->readonly = !!(ram_flags & RAM_READONLY); + mr->terminates = true; + mr->destructor = memory_region_destructor_ram; + mr->align = align; + mr->ram_block = qemu_ram_alloc_from_file(size, mr, ram_flags, path, + offset, &err); + if (err) { + mr->size = int128_zero(); + object_unparent(OBJECT(mr)); + error_propagate(errp, err); + } +} + +void memory_region_init_ram_from_fd(MemoryRegion *mr, + Object *owner, + const char *name, + uint64_t size, + uint32_t ram_flags, + int fd, + ram_addr_t offset, + Error **errp) +{ + Error *err = NULL; + memory_region_init(mr, owner, name, size); + mr->ram = true; + mr->readonly = !!(ram_flags & RAM_READONLY); + mr->terminates = true; + mr->destructor = memory_region_destructor_ram; + mr->ram_block = qemu_ram_alloc_from_fd(size, mr, ram_flags, fd, offset, + &err); + if (err) { + mr->size = int128_zero(); + object_unparent(OBJECT(mr)); + error_propagate(errp, err); + } +} +#endif + +void memory_region_init_ram_ptr(MemoryRegion *mr, + Object *owner, + const char *name, + uint64_t size, + void *ptr) +{ + memory_region_init(mr, owner, name, size); + mr->ram = true; + mr->terminates = true; + mr->destructor = memory_region_destructor_ram; + + /* qemu_ram_alloc_from_ptr cannot fail with ptr != NULL. */ + assert(ptr != NULL); + mr->ram_block = qemu_ram_alloc_from_ptr(size, ptr, mr, &error_fatal); +} + +void memory_region_init_ram_device_ptr(MemoryRegion *mr, + Object *owner, + const char *name, + uint64_t size, + void *ptr) +{ + memory_region_init(mr, owner, name, size); + mr->ram = true; + mr->terminates = true; + mr->ram_device = true; + mr->ops = &ram_device_mem_ops; + mr->opaque = mr; + mr->destructor = memory_region_destructor_ram; + + /* qemu_ram_alloc_from_ptr cannot fail with ptr != NULL. */ + assert(ptr != NULL); + mr->ram_block = qemu_ram_alloc_from_ptr(size, ptr, mr, &error_fatal); +} + +void memory_region_init_alias(MemoryRegion *mr, + Object *owner, + const char *name, + MemoryRegion *orig, + hwaddr offset, + uint64_t size) +{ + memory_region_init(mr, owner, name, size); + mr->alias = orig; + mr->alias_offset = offset; +} + +void memory_region_init_rom_nomigrate(MemoryRegion *mr, + Object *owner, + const char *name, + uint64_t size, + Error **errp) +{ + memory_region_init_ram_flags_nomigrate(mr, owner, name, size, 0, errp); + mr->readonly = true; +} + +void memory_region_init_rom_device_nomigrate(MemoryRegion *mr, + Object *owner, + const MemoryRegionOps *ops, + void *opaque, + const char *name, + uint64_t size, + Error **errp) +{ + Error *err = NULL; + assert(ops); + memory_region_init(mr, owner, name, size); + mr->ops = ops; + mr->opaque = opaque; + mr->terminates = true; + mr->rom_device = true; + mr->destructor = memory_region_destructor_ram; + mr->ram_block = qemu_ram_alloc(size, 0, mr, &err); + if (err) { + mr->size = int128_zero(); + object_unparent(OBJECT(mr)); + error_propagate(errp, err); + } +} + +void memory_region_init_iommu(void *_iommu_mr, + size_t instance_size, + const char *mrtypename, + Object *owner, + const char *name, + uint64_t size) +{ + struct IOMMUMemoryRegion *iommu_mr; + struct MemoryRegion *mr; + + object_initialize(_iommu_mr, instance_size, mrtypename); + mr = MEMORY_REGION(_iommu_mr); + memory_region_do_init(mr, owner, name, size); + iommu_mr = IOMMU_MEMORY_REGION(mr); + mr->terminates = true; /* then re-forwards */ + QLIST_INIT(&iommu_mr->iommu_notify); + iommu_mr->iommu_notify_flags = IOMMU_NOTIFIER_NONE; +} + +static void memory_region_finalize(Object *obj) +{ + MemoryRegion *mr = MEMORY_REGION(obj); + + assert(!mr->container); + + /* We know the region is not visible in any address space (it + * does not have a container and cannot be a root either because + * it has no references, so we can blindly clear mr->enabled. + * memory_region_set_enabled instead could trigger a transaction + * and cause an infinite loop. + */ + mr->enabled = false; + memory_region_transaction_begin(); + while (!QTAILQ_EMPTY(&mr->subregions)) { + MemoryRegion *subregion = QTAILQ_FIRST(&mr->subregions); + memory_region_del_subregion(mr, subregion); + } + memory_region_transaction_commit(); + + mr->destructor(mr); + memory_region_clear_coalescing(mr); + g_free((char *)mr->name); + g_free(mr->ioeventfds); +} + +Object *memory_region_owner(MemoryRegion *mr) +{ + Object *obj = OBJECT(mr); + return obj->parent; +} + +void memory_region_ref(MemoryRegion *mr) +{ + /* MMIO callbacks most likely will access data that belongs + * to the owner, hence the need to ref/unref the owner whenever + * the memory region is in use. + * + * The memory region is a child of its owner. As long as the + * owner doesn't call unparent itself on the memory region, + * ref-ing the owner will also keep the memory region alive. + * Memory regions without an owner are supposed to never go away; + * we do not ref/unref them because it slows down DMA sensibly. + */ + if (mr && mr->owner) { + object_ref(mr->owner); + } +} + +void memory_region_unref(MemoryRegion *mr) +{ + if (mr && mr->owner) { + object_unref(mr->owner); + } +} + +uint64_t memory_region_size(MemoryRegion *mr) +{ + if (int128_eq(mr->size, int128_2_64())) { + return UINT64_MAX; + } + return int128_get64(mr->size); +} + +const char *memory_region_name(const MemoryRegion *mr) +{ + if (!mr->name) { + ((MemoryRegion *)mr)->name = + g_strdup(object_get_canonical_path_component(OBJECT(mr))); + } + return mr->name; +} + +bool memory_region_is_ram_device(MemoryRegion *mr) +{ + return mr->ram_device; +} + +bool memory_region_is_protected(MemoryRegion *mr) +{ + return mr->ram && (mr->ram_block->flags & RAM_PROTECTED); +} + +uint8_t memory_region_get_dirty_log_mask(MemoryRegion *mr) +{ + uint8_t mask = mr->dirty_log_mask; + RAMBlock *rb = mr->ram_block; + + if (global_dirty_tracking && ((rb && qemu_ram_is_migratable(rb)) || + memory_region_is_iommu(mr))) { + mask |= (1 << DIRTY_MEMORY_MIGRATION); + } + + if (tcg_enabled() && rb) { + /* TCG only cares about dirty memory logging for RAM, not IOMMU. */ + mask |= (1 << DIRTY_MEMORY_CODE); + } + return mask; +} + +bool memory_region_is_logging(MemoryRegion *mr, uint8_t client) +{ + return memory_region_get_dirty_log_mask(mr) & (1 << client); +} + +static int memory_region_update_iommu_notify_flags(IOMMUMemoryRegion *iommu_mr, + Error **errp) +{ + IOMMUNotifierFlag flags = IOMMU_NOTIFIER_NONE; + IOMMUNotifier *iommu_notifier; + IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_GET_CLASS(iommu_mr); + int ret = 0; + + IOMMU_NOTIFIER_FOREACH(iommu_notifier, iommu_mr) { + flags |= iommu_notifier->notifier_flags; + } + + if (flags != iommu_mr->iommu_notify_flags && imrc->notify_flag_changed) { + ret = imrc->notify_flag_changed(iommu_mr, + iommu_mr->iommu_notify_flags, + flags, errp); + } + + if (!ret) { + iommu_mr->iommu_notify_flags = flags; + } + return ret; +} + +int memory_region_iommu_set_page_size_mask(IOMMUMemoryRegion *iommu_mr, + uint64_t page_size_mask, + Error **errp) +{ + IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_GET_CLASS(iommu_mr); + int ret = 0; + + if (imrc->iommu_set_page_size_mask) { + ret = imrc->iommu_set_page_size_mask(iommu_mr, page_size_mask, errp); + } + return ret; +} + +int memory_region_register_iommu_notifier(MemoryRegion *mr, + IOMMUNotifier *n, Error **errp) +{ + IOMMUMemoryRegion *iommu_mr; + int ret; + + if (mr->alias) { + return memory_region_register_iommu_notifier(mr->alias, n, errp); + } + + /* We need to register for at least one bitfield */ + iommu_mr = IOMMU_MEMORY_REGION(mr); + assert(n->notifier_flags != IOMMU_NOTIFIER_NONE); + assert(n->start <= n->end); + assert(n->iommu_idx >= 0 && + n->iommu_idx < memory_region_iommu_num_indexes(iommu_mr)); + + QLIST_INSERT_HEAD(&iommu_mr->iommu_notify, n, node); + ret = memory_region_update_iommu_notify_flags(iommu_mr, errp); + if (ret) { + QLIST_REMOVE(n, node); + } + return ret; +} + +uint64_t memory_region_iommu_get_min_page_size(IOMMUMemoryRegion *iommu_mr) +{ + IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_GET_CLASS(iommu_mr); + + if (imrc->get_min_page_size) { + return imrc->get_min_page_size(iommu_mr); + } + return TARGET_PAGE_SIZE; +} + +void memory_region_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n) +{ + MemoryRegion *mr = MEMORY_REGION(iommu_mr); + IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_GET_CLASS(iommu_mr); + hwaddr addr, granularity; + IOMMUTLBEntry iotlb; + + /* If the IOMMU has its own replay callback, override */ + if (imrc->replay) { + imrc->replay(iommu_mr, n); + return; + } + + granularity = memory_region_iommu_get_min_page_size(iommu_mr); + + for (addr = 0; addr < memory_region_size(mr); addr += granularity) { + iotlb = imrc->translate(iommu_mr, addr, IOMMU_NONE, n->iommu_idx); + if (iotlb.perm != IOMMU_NONE) { + n->notify(n, &iotlb); + } + + /* if (2^64 - MR size) < granularity, it's possible to get an + * infinite loop here. This should catch such a wraparound */ + if ((addr + granularity) < addr) { + break; + } + } +} + +void memory_region_unregister_iommu_notifier(MemoryRegion *mr, + IOMMUNotifier *n) +{ + IOMMUMemoryRegion *iommu_mr; + + if (mr->alias) { + memory_region_unregister_iommu_notifier(mr->alias, n); + return; + } + QLIST_REMOVE(n, node); + iommu_mr = IOMMU_MEMORY_REGION(mr); + memory_region_update_iommu_notify_flags(iommu_mr, NULL); +} + +void memory_region_notify_iommu_one(IOMMUNotifier *notifier, + IOMMUTLBEvent *event) +{ + IOMMUTLBEntry *entry = &event->entry; + hwaddr entry_end = entry->iova + entry->addr_mask; + IOMMUTLBEntry tmp = *entry; + + if (event->type == IOMMU_NOTIFIER_UNMAP) { + assert(entry->perm == IOMMU_NONE); + } + + /* + * Skip the notification if the notification does not overlap + * with registered range. + */ + if (notifier->start > entry_end || notifier->end < entry->iova) { + return; + } + + if (notifier->notifier_flags & IOMMU_NOTIFIER_DEVIOTLB_UNMAP) { + /* Crop (iova, addr_mask) to range */ + tmp.iova = MAX(tmp.iova, notifier->start); + tmp.addr_mask = MIN(entry_end, notifier->end) - tmp.iova; + } else { + assert(entry->iova >= notifier->start && entry_end <= notifier->end); + } + + if (event->type & notifier->notifier_flags) { + notifier->notify(notifier, &tmp); + } +} + +void memory_region_unmap_iommu_notifier_range(IOMMUNotifier *notifier) +{ + IOMMUTLBEvent event; + + event.type = IOMMU_NOTIFIER_UNMAP; + event.entry.target_as = &address_space_memory; + event.entry.iova = notifier->start; + event.entry.perm = IOMMU_NONE; + event.entry.addr_mask = notifier->end - notifier->start; + + memory_region_notify_iommu_one(notifier, &event); +} + +void memory_region_notify_iommu(IOMMUMemoryRegion *iommu_mr, + int iommu_idx, + IOMMUTLBEvent event) +{ + IOMMUNotifier *iommu_notifier; + + assert(memory_region_is_iommu(MEMORY_REGION(iommu_mr))); + + IOMMU_NOTIFIER_FOREACH(iommu_notifier, iommu_mr) { + if (iommu_notifier->iommu_idx == iommu_idx) { + memory_region_notify_iommu_one(iommu_notifier, &event); + } + } +} + +int memory_region_iommu_get_attr(IOMMUMemoryRegion *iommu_mr, + enum IOMMUMemoryRegionAttr attr, + void *data) +{ + IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_GET_CLASS(iommu_mr); + + if (!imrc->get_attr) { + return -EINVAL; + } + + return imrc->get_attr(iommu_mr, attr, data); +} + +int memory_region_iommu_attrs_to_index(IOMMUMemoryRegion *iommu_mr, + MemTxAttrs attrs) +{ + IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_GET_CLASS(iommu_mr); + + if (!imrc->attrs_to_index) { + return 0; + } + + return imrc->attrs_to_index(iommu_mr, attrs); +} + +int memory_region_iommu_num_indexes(IOMMUMemoryRegion *iommu_mr) +{ + IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_GET_CLASS(iommu_mr); + + if (!imrc->num_indexes) { + return 1; + } + + return imrc->num_indexes(iommu_mr); +} + +RamDiscardManager *memory_region_get_ram_discard_manager(MemoryRegion *mr) +{ + if (!memory_region_is_mapped(mr) || !memory_region_is_ram(mr)) { + return NULL; + } + return mr->rdm; +} + +void memory_region_set_ram_discard_manager(MemoryRegion *mr, + RamDiscardManager *rdm) +{ + g_assert(memory_region_is_ram(mr) && !memory_region_is_mapped(mr)); + g_assert(!rdm || !mr->rdm); + mr->rdm = rdm; +} + +uint64_t ram_discard_manager_get_min_granularity(const RamDiscardManager *rdm, + const MemoryRegion *mr) +{ + RamDiscardManagerClass *rdmc = RAM_DISCARD_MANAGER_GET_CLASS(rdm); + + g_assert(rdmc->get_min_granularity); + return rdmc->get_min_granularity(rdm, mr); +} + +bool ram_discard_manager_is_populated(const RamDiscardManager *rdm, + const MemoryRegionSection *section) +{ + RamDiscardManagerClass *rdmc = RAM_DISCARD_MANAGER_GET_CLASS(rdm); + + g_assert(rdmc->is_populated); + return rdmc->is_populated(rdm, section); +} + +int ram_discard_manager_replay_populated(const RamDiscardManager *rdm, + MemoryRegionSection *section, + ReplayRamPopulate replay_fn, + void *opaque) +{ + RamDiscardManagerClass *rdmc = RAM_DISCARD_MANAGER_GET_CLASS(rdm); + + g_assert(rdmc->replay_populated); + return rdmc->replay_populated(rdm, section, replay_fn, opaque); +} + +void ram_discard_manager_replay_discarded(const RamDiscardManager *rdm, + MemoryRegionSection *section, + ReplayRamDiscard replay_fn, + void *opaque) +{ + RamDiscardManagerClass *rdmc = RAM_DISCARD_MANAGER_GET_CLASS(rdm); + + g_assert(rdmc->replay_discarded); + rdmc->replay_discarded(rdm, section, replay_fn, opaque); +} + +void ram_discard_manager_register_listener(RamDiscardManager *rdm, + RamDiscardListener *rdl, + MemoryRegionSection *section) +{ + RamDiscardManagerClass *rdmc = RAM_DISCARD_MANAGER_GET_CLASS(rdm); + + g_assert(rdmc->register_listener); + rdmc->register_listener(rdm, rdl, section); +} + +void ram_discard_manager_unregister_listener(RamDiscardManager *rdm, + RamDiscardListener *rdl) +{ + RamDiscardManagerClass *rdmc = RAM_DISCARD_MANAGER_GET_CLASS(rdm); + + g_assert(rdmc->unregister_listener); + rdmc->unregister_listener(rdm, rdl); +} + +/* Called with rcu_read_lock held. */ +bool memory_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, + ram_addr_t *ram_addr, bool *read_only, + bool *mr_has_discard_manager) +{ + MemoryRegion *mr; + hwaddr xlat; + hwaddr len = iotlb->addr_mask + 1; + bool writable = iotlb->perm & IOMMU_WO; + + if (mr_has_discard_manager) { + *mr_has_discard_manager = false; + } + /* + * The IOMMU TLB entry we have just covers translation through + * this IOMMU to its immediate target. We need to translate + * it the rest of the way through to memory. + */ + mr = address_space_translate(&address_space_memory, iotlb->translated_addr, + &xlat, &len, writable, MEMTXATTRS_UNSPECIFIED); + if (!memory_region_is_ram(mr)) { + error_report("iommu map to non memory area %" HWADDR_PRIx "", xlat); + return false; + } else if (memory_region_has_ram_discard_manager(mr)) { + RamDiscardManager *rdm = memory_region_get_ram_discard_manager(mr); + MemoryRegionSection tmp = { + .mr = mr, + .offset_within_region = xlat, + .size = int128_make64(len), + }; + if (mr_has_discard_manager) { + *mr_has_discard_manager = true; + } + /* + * Malicious VMs can map memory into the IOMMU, which is expected + * to remain discarded. vfio will pin all pages, populating memory. + * Disallow that. vmstate priorities make sure any RamDiscardManager + * were already restored before IOMMUs are restored. + */ + if (!ram_discard_manager_is_populated(rdm, &tmp)) { + error_report("iommu map to discarded memory (e.g., unplugged via" + " virtio-mem): %" HWADDR_PRIx "", + iotlb->translated_addr); + return false; + } + } + + /* + * Translation truncates length to the IOMMU page size, + * check that it did not truncate too much. + */ + if (len & iotlb->addr_mask) { + error_report("iommu has granularity incompatible with target AS"); + return false; + } + + if (vaddr) { + *vaddr = memory_region_get_ram_ptr(mr) + xlat; + } + + if (ram_addr) { + *ram_addr = memory_region_get_ram_addr(mr) + xlat; + } + + if (read_only) { + *read_only = !writable || mr->readonly; + } + + return true; +} + +void memory_region_set_log(MemoryRegion *mr, bool log, unsigned client) +{ + uint8_t mask = 1 << client; + uint8_t old_logging; + + assert(client == DIRTY_MEMORY_VGA); + old_logging = mr->vga_logging_count; + mr->vga_logging_count += log ? 1 : -1; + if (!!old_logging == !!mr->vga_logging_count) { + return; + } + + memory_region_transaction_begin(); + mr->dirty_log_mask = (mr->dirty_log_mask & ~mask) | (log * mask); + memory_region_update_pending |= mr->enabled; + memory_region_transaction_commit(); +} + +void memory_region_set_dirty(MemoryRegion *mr, hwaddr addr, + hwaddr size) +{ + assert(mr->ram_block); + cpu_physical_memory_set_dirty_range(memory_region_get_ram_addr(mr) + addr, + size, + memory_region_get_dirty_log_mask(mr)); +} + +/* + * If memory region `mr' is NULL, do global sync. Otherwise, sync + * dirty bitmap for the specified memory region. + */ +static void memory_region_sync_dirty_bitmap(MemoryRegion *mr, bool last_stage) +{ + MemoryListener *listener; + AddressSpace *as; + FlatView *view; + FlatRange *fr; + + /* If the same address space has multiple log_sync listeners, we + * visit that address space's FlatView multiple times. But because + * log_sync listeners are rare, it's still cheaper than walking each + * address space once. + */ + QTAILQ_FOREACH(listener, &memory_listeners, link) { + if (listener->log_sync) { + as = listener->address_space; + view = address_space_get_flatview(as); + FOR_EACH_FLAT_RANGE(fr, view) { + if (fr->dirty_log_mask && (!mr || fr->mr == mr)) { + MemoryRegionSection mrs = section_from_flat_range(fr, view); + listener->log_sync(listener, &mrs); + } + } + flatview_unref(view); + trace_memory_region_sync_dirty(mr ? mr->name : "(all)", listener->name, 0); + } else if (listener->log_sync_global) { + /* + * No matter whether MR is specified, what we can do here + * is to do a global sync, because we are not capable to + * sync in a finer granularity. + */ + listener->log_sync_global(listener, last_stage); + trace_memory_region_sync_dirty(mr ? mr->name : "(all)", listener->name, 1); + } + } +} + +void memory_region_clear_dirty_bitmap(MemoryRegion *mr, hwaddr start, + hwaddr len) +{ + MemoryRegionSection mrs; + MemoryListener *listener; + AddressSpace *as; + FlatView *view; + FlatRange *fr; + hwaddr sec_start, sec_end, sec_size; + + QTAILQ_FOREACH(listener, &memory_listeners, link) { + if (!listener->log_clear) { + continue; + } + as = listener->address_space; + view = address_space_get_flatview(as); + FOR_EACH_FLAT_RANGE(fr, view) { + if (!fr->dirty_log_mask || fr->mr != mr) { + /* + * Clear dirty bitmap operation only applies to those + * regions whose dirty logging is at least enabled + */ + continue; + } + + mrs = section_from_flat_range(fr, view); + + sec_start = MAX(mrs.offset_within_region, start); + sec_end = mrs.offset_within_region + int128_get64(mrs.size); + sec_end = MIN(sec_end, start + len); + + if (sec_start >= sec_end) { + /* + * If this memory region section has no intersection + * with the requested range, skip. + */ + continue; + } + + /* Valid case; shrink the section if needed */ + mrs.offset_within_address_space += + sec_start - mrs.offset_within_region; + mrs.offset_within_region = sec_start; + sec_size = sec_end - sec_start; + mrs.size = int128_make64(sec_size); + listener->log_clear(listener, &mrs); + } + flatview_unref(view); + } +} + +DirtyBitmapSnapshot *memory_region_snapshot_and_clear_dirty(MemoryRegion *mr, + hwaddr addr, + hwaddr size, + unsigned client) +{ + DirtyBitmapSnapshot *snapshot; + assert(mr->ram_block); + memory_region_sync_dirty_bitmap(mr, false); + snapshot = cpu_physical_memory_snapshot_and_clear_dirty(mr, addr, size, client); + memory_global_after_dirty_log_sync(); + return snapshot; +} + +bool memory_region_snapshot_get_dirty(MemoryRegion *mr, DirtyBitmapSnapshot *snap, + hwaddr addr, hwaddr size) +{ + assert(mr->ram_block); + return cpu_physical_memory_snapshot_get_dirty(snap, + memory_region_get_ram_addr(mr) + addr, size); +} + +void memory_region_set_readonly(MemoryRegion *mr, bool readonly) +{ + if (mr->readonly != readonly) { + memory_region_transaction_begin(); + mr->readonly = readonly; + memory_region_update_pending |= mr->enabled; + memory_region_transaction_commit(); + } +} + +void memory_region_set_nonvolatile(MemoryRegion *mr, bool nonvolatile) +{ + if (mr->nonvolatile != nonvolatile) { + memory_region_transaction_begin(); + mr->nonvolatile = nonvolatile; + memory_region_update_pending |= mr->enabled; + memory_region_transaction_commit(); + } +} + +void memory_region_rom_device_set_romd(MemoryRegion *mr, bool romd_mode) +{ + if (mr->romd_mode != romd_mode) { + memory_region_transaction_begin(); + mr->romd_mode = romd_mode; + memory_region_update_pending |= mr->enabled; + memory_region_transaction_commit(); + } +} + +void memory_region_reset_dirty(MemoryRegion *mr, hwaddr addr, + hwaddr size, unsigned client) +{ + assert(mr->ram_block); + cpu_physical_memory_test_and_clear_dirty( + memory_region_get_ram_addr(mr) + addr, size, client); +} + +int memory_region_get_fd(MemoryRegion *mr) +{ + RCU_READ_LOCK_GUARD(); + while (mr->alias) { + mr = mr->alias; + } + return mr->ram_block->fd; +} + +void *memory_region_get_ram_ptr(MemoryRegion *mr) +{ + uint64_t offset = 0; + + RCU_READ_LOCK_GUARD(); + while (mr->alias) { + offset += mr->alias_offset; + mr = mr->alias; + } + assert(mr->ram_block); + return qemu_map_ram_ptr(mr->ram_block, offset); +} + +MemoryRegion *memory_region_from_host(void *ptr, ram_addr_t *offset) +{ + RAMBlock *block; + + block = qemu_ram_block_from_host(ptr, false, offset); + if (!block) { + return NULL; + } + + return block->mr; +} + +ram_addr_t memory_region_get_ram_addr(MemoryRegion *mr) +{ + return mr->ram_block ? mr->ram_block->offset : RAM_ADDR_INVALID; +} + +void memory_region_ram_resize(MemoryRegion *mr, ram_addr_t newsize, Error **errp) +{ + assert(mr->ram_block); + + qemu_ram_resize(mr->ram_block, newsize, errp); +} + +void memory_region_msync(MemoryRegion *mr, hwaddr addr, hwaddr size) +{ + if (mr->ram_block) { + qemu_ram_msync(mr->ram_block, addr, size); + } +} + +void memory_region_writeback(MemoryRegion *mr, hwaddr addr, hwaddr size) +{ + /* + * Might be extended case needed to cover + * different types of memory regions + */ + if (mr->dirty_log_mask) { + memory_region_msync(mr, addr, size); + } +} + +/* + * Call proper memory listeners about the change on the newly + * added/removed CoalescedMemoryRange. + */ +static void memory_region_update_coalesced_range(MemoryRegion *mr, + CoalescedMemoryRange *cmr, + bool add) +{ + AddressSpace *as; + FlatView *view; + FlatRange *fr; + + QTAILQ_FOREACH(as, &address_spaces, address_spaces_link) { + view = address_space_get_flatview(as); + FOR_EACH_FLAT_RANGE(fr, view) { + if (fr->mr == mr) { + flat_range_coalesced_io_notify(fr, as, cmr, add); + } + } + flatview_unref(view); + } +} + +void memory_region_set_coalescing(MemoryRegion *mr) +{ + memory_region_clear_coalescing(mr); + memory_region_add_coalescing(mr, 0, int128_get64(mr->size)); +} + +void memory_region_add_coalescing(MemoryRegion *mr, + hwaddr offset, + uint64_t size) +{ + CoalescedMemoryRange *cmr = g_malloc(sizeof(*cmr)); + + cmr->addr = addrrange_make(int128_make64(offset), int128_make64(size)); + QTAILQ_INSERT_TAIL(&mr->coalesced, cmr, link); + memory_region_update_coalesced_range(mr, cmr, true); + memory_region_set_flush_coalesced(mr); +} + +void memory_region_clear_coalescing(MemoryRegion *mr) +{ + CoalescedMemoryRange *cmr; + + if (QTAILQ_EMPTY(&mr->coalesced)) { + return; + } + + qemu_flush_coalesced_mmio_buffer(); + mr->flush_coalesced_mmio = false; + + while (!QTAILQ_EMPTY(&mr->coalesced)) { + cmr = QTAILQ_FIRST(&mr->coalesced); + QTAILQ_REMOVE(&mr->coalesced, cmr, link); + memory_region_update_coalesced_range(mr, cmr, false); + g_free(cmr); + } +} + +void memory_region_set_flush_coalesced(MemoryRegion *mr) +{ + mr->flush_coalesced_mmio = true; +} + +void memory_region_clear_flush_coalesced(MemoryRegion *mr) +{ + qemu_flush_coalesced_mmio_buffer(); + if (QTAILQ_EMPTY(&mr->coalesced)) { + mr->flush_coalesced_mmio = false; + } +} + +static bool userspace_eventfd_warning; + +void memory_region_add_eventfd(MemoryRegion *mr, + hwaddr addr, + unsigned size, + bool match_data, + uint64_t data, + EventNotifier *e) +{ + MemoryRegionIoeventfd mrfd = { + .addr.start = int128_make64(addr), + .addr.size = int128_make64(size), + .match_data = match_data, + .data = data, + .e = e, + }; + unsigned i; + + if (kvm_enabled() && (!(kvm_eventfds_enabled() || + userspace_eventfd_warning))) { + userspace_eventfd_warning = true; + error_report("Using eventfd without MMIO binding in KVM. " + "Suboptimal performance expected"); + } + + if (size) { + adjust_endianness(mr, &mrfd.data, size_memop(size) | MO_TE); + } + memory_region_transaction_begin(); + for (i = 0; i < mr->ioeventfd_nb; ++i) { + if (memory_region_ioeventfd_before(&mrfd, &mr->ioeventfds[i])) { + break; + } + } + ++mr->ioeventfd_nb; + mr->ioeventfds = g_realloc(mr->ioeventfds, + sizeof(*mr->ioeventfds) * mr->ioeventfd_nb); + memmove(&mr->ioeventfds[i+1], &mr->ioeventfds[i], + sizeof(*mr->ioeventfds) * (mr->ioeventfd_nb-1 - i)); + mr->ioeventfds[i] = mrfd; + ioeventfd_update_pending |= mr->enabled; + memory_region_transaction_commit(); +} + +void memory_region_del_eventfd(MemoryRegion *mr, + hwaddr addr, + unsigned size, + bool match_data, + uint64_t data, + EventNotifier *e) +{ + MemoryRegionIoeventfd mrfd = { + .addr.start = int128_make64(addr), + .addr.size = int128_make64(size), + .match_data = match_data, + .data = data, + .e = e, + }; + unsigned i; + + if (size) { + adjust_endianness(mr, &mrfd.data, size_memop(size) | MO_TE); + } + memory_region_transaction_begin(); + for (i = 0; i < mr->ioeventfd_nb; ++i) { + if (memory_region_ioeventfd_equal(&mrfd, &mr->ioeventfds[i])) { + break; + } + } + assert(i != mr->ioeventfd_nb); + memmove(&mr->ioeventfds[i], &mr->ioeventfds[i+1], + sizeof(*mr->ioeventfds) * (mr->ioeventfd_nb - (i+1))); + --mr->ioeventfd_nb; + mr->ioeventfds = g_realloc(mr->ioeventfds, + sizeof(*mr->ioeventfds)*mr->ioeventfd_nb + 1); + ioeventfd_update_pending |= mr->enabled; + memory_region_transaction_commit(); +} + +static void memory_region_update_container_subregions(MemoryRegion *subregion) +{ + MemoryRegion *mr = subregion->container; + MemoryRegion *other; + + memory_region_transaction_begin(); + + memory_region_ref(subregion); + QTAILQ_FOREACH(other, &mr->subregions, subregions_link) { + if (subregion->priority >= other->priority) { + QTAILQ_INSERT_BEFORE(other, subregion, subregions_link); + goto done; + } + } + QTAILQ_INSERT_TAIL(&mr->subregions, subregion, subregions_link); +done: + memory_region_update_pending |= mr->enabled && subregion->enabled; + memory_region_transaction_commit(); +} + +static void memory_region_add_subregion_common(MemoryRegion *mr, + hwaddr offset, + MemoryRegion *subregion) +{ + MemoryRegion *alias; + + assert(!subregion->container); + subregion->container = mr; + for (alias = subregion->alias; alias; alias = alias->alias) { + alias->mapped_via_alias++; + } + subregion->addr = offset; + memory_region_update_container_subregions(subregion); +} + +void memory_region_add_subregion(MemoryRegion *mr, + hwaddr offset, + MemoryRegion *subregion) +{ + subregion->priority = 0; + memory_region_add_subregion_common(mr, offset, subregion); +} + +void memory_region_add_subregion_overlap(MemoryRegion *mr, + hwaddr offset, + MemoryRegion *subregion, + int priority) +{ + subregion->priority = priority; + memory_region_add_subregion_common(mr, offset, subregion); +} + +void memory_region_del_subregion(MemoryRegion *mr, + MemoryRegion *subregion) +{ + MemoryRegion *alias; + + memory_region_transaction_begin(); + assert(subregion->container == mr); + subregion->container = NULL; + for (alias = subregion->alias; alias; alias = alias->alias) { + alias->mapped_via_alias--; + assert(alias->mapped_via_alias >= 0); + } + QTAILQ_REMOVE(&mr->subregions, subregion, subregions_link); + memory_region_unref(subregion); + memory_region_update_pending |= mr->enabled && subregion->enabled; + memory_region_transaction_commit(); +} + +void memory_region_set_enabled(MemoryRegion *mr, bool enabled) +{ + if (enabled == mr->enabled) { + return; + } + memory_region_transaction_begin(); + mr->enabled = enabled; + memory_region_update_pending = true; + memory_region_transaction_commit(); +} + +void memory_region_set_size(MemoryRegion *mr, uint64_t size) +{ + Int128 s = int128_make64(size); + + if (size == UINT64_MAX) { + s = int128_2_64(); + } + if (int128_eq(s, mr->size)) { + return; + } + memory_region_transaction_begin(); + mr->size = s; + memory_region_update_pending = true; + memory_region_transaction_commit(); +} + +static void memory_region_readd_subregion(MemoryRegion *mr) +{ + MemoryRegion *container = mr->container; + + if (container) { + memory_region_transaction_begin(); + memory_region_ref(mr); + memory_region_del_subregion(container, mr); + memory_region_add_subregion_common(container, mr->addr, mr); + memory_region_unref(mr); + memory_region_transaction_commit(); + } +} + +void memory_region_set_address(MemoryRegion *mr, hwaddr addr) +{ + if (addr != mr->addr) { + mr->addr = addr; + memory_region_readd_subregion(mr); + } +} + +void memory_region_set_alias_offset(MemoryRegion *mr, hwaddr offset) +{ + assert(mr->alias); + + if (offset == mr->alias_offset) { + return; + } + + memory_region_transaction_begin(); + mr->alias_offset = offset; + memory_region_update_pending |= mr->enabled; + memory_region_transaction_commit(); +} + +uint64_t memory_region_get_alignment(const MemoryRegion *mr) +{ + return mr->align; +} + +static int cmp_flatrange_addr(const void *addr_, const void *fr_) +{ + const AddrRange *addr = addr_; + const FlatRange *fr = fr_; + + if (int128_le(addrrange_end(*addr), fr->addr.start)) { + return -1; + } else if (int128_ge(addr->start, addrrange_end(fr->addr))) { + return 1; + } + return 0; +} + +static FlatRange *flatview_lookup(FlatView *view, AddrRange addr) +{ + return bsearch(&addr, view->ranges, view->nr, + sizeof(FlatRange), cmp_flatrange_addr); +} + +bool memory_region_is_mapped(MemoryRegion *mr) +{ + return !!mr->container || mr->mapped_via_alias; +} + +/* Same as memory_region_find, but it does not add a reference to the + * returned region. It must be called from an RCU critical section. + */ +static MemoryRegionSection memory_region_find_rcu(MemoryRegion *mr, + hwaddr addr, uint64_t size) +{ + MemoryRegionSection ret = { .mr = NULL }; + MemoryRegion *root; + AddressSpace *as; + AddrRange range; + FlatView *view; + FlatRange *fr; + + addr += mr->addr; + for (root = mr; root->container; ) { + root = root->container; + addr += root->addr; + } + + as = memory_region_to_address_space(root); + if (!as) { + return ret; + } + range = addrrange_make(int128_make64(addr), int128_make64(size)); + + view = address_space_to_flatview(as); + fr = flatview_lookup(view, range); + if (!fr) { + return ret; + } + + while (fr > view->ranges && addrrange_intersects(fr[-1].addr, range)) { + --fr; + } + + ret.mr = fr->mr; + ret.fv = view; + range = addrrange_intersection(range, fr->addr); + ret.offset_within_region = fr->offset_in_region; + ret.offset_within_region += int128_get64(int128_sub(range.start, + fr->addr.start)); + ret.size = range.size; + ret.offset_within_address_space = int128_get64(range.start); + ret.readonly = fr->readonly; + ret.nonvolatile = fr->nonvolatile; + return ret; +} + +MemoryRegionSection memory_region_find(MemoryRegion *mr, + hwaddr addr, uint64_t size) +{ + MemoryRegionSection ret; + RCU_READ_LOCK_GUARD(); + ret = memory_region_find_rcu(mr, addr, size); + if (ret.mr) { + memory_region_ref(ret.mr); + } + return ret; +} + +MemoryRegionSection *memory_region_section_new_copy(MemoryRegionSection *s) +{ + MemoryRegionSection *tmp = g_new(MemoryRegionSection, 1); + + *tmp = *s; + if (tmp->mr) { + memory_region_ref(tmp->mr); + } + if (tmp->fv) { + bool ret = flatview_ref(tmp->fv); + + g_assert(ret); + } + return tmp; +} + +void memory_region_section_free_copy(MemoryRegionSection *s) +{ + if (s->fv) { + flatview_unref(s->fv); + } + if (s->mr) { + memory_region_unref(s->mr); + } + g_free(s); +} + +bool memory_region_present(MemoryRegion *container, hwaddr addr) +{ + MemoryRegion *mr; + + RCU_READ_LOCK_GUARD(); + mr = memory_region_find_rcu(container, addr, 1).mr; + return mr && mr != container; +} + +void memory_global_dirty_log_sync(bool last_stage) +{ + memory_region_sync_dirty_bitmap(NULL, last_stage); +} + +void memory_global_after_dirty_log_sync(void) +{ + MEMORY_LISTENER_CALL_GLOBAL(log_global_after_sync, Forward); +} + +/* + * Dirty track stop flags that are postponed due to VM being stopped. Should + * only be used within vmstate_change hook. + */ +static unsigned int postponed_stop_flags; +static VMChangeStateEntry *vmstate_change; +static void memory_global_dirty_log_stop_postponed_run(void); + +void memory_global_dirty_log_start(unsigned int flags) +{ + unsigned int old_flags; + + assert(flags && !(flags & (~GLOBAL_DIRTY_MASK))); + + if (vmstate_change) { + /* If there is postponed stop(), operate on it first */ + postponed_stop_flags &= ~flags; + memory_global_dirty_log_stop_postponed_run(); + } + + flags &= ~global_dirty_tracking; + if (!flags) { + return; + } + + old_flags = global_dirty_tracking; + global_dirty_tracking |= flags; + trace_global_dirty_changed(global_dirty_tracking); + + if (!old_flags) { + MEMORY_LISTENER_CALL_GLOBAL(log_global_start, Forward); + memory_region_transaction_begin(); + memory_region_update_pending = true; + memory_region_transaction_commit(); + } +} + +static void memory_global_dirty_log_do_stop(unsigned int flags) +{ + assert(flags && !(flags & (~GLOBAL_DIRTY_MASK))); + assert((global_dirty_tracking & flags) == flags); + global_dirty_tracking &= ~flags; + + trace_global_dirty_changed(global_dirty_tracking); + + if (!global_dirty_tracking) { + memory_region_transaction_begin(); + memory_region_update_pending = true; + memory_region_transaction_commit(); + MEMORY_LISTENER_CALL_GLOBAL(log_global_stop, Reverse); + } +} + +/* + * Execute the postponed dirty log stop operations if there is, then reset + * everything (including the flags and the vmstate change hook). + */ +static void memory_global_dirty_log_stop_postponed_run(void) +{ + /* This must be called with the vmstate handler registered */ + assert(vmstate_change); + + /* Note: postponed_stop_flags can be cleared in log start routine */ + if (postponed_stop_flags) { + memory_global_dirty_log_do_stop(postponed_stop_flags); + postponed_stop_flags = 0; + } + + qemu_del_vm_change_state_handler(vmstate_change); + vmstate_change = NULL; +} + +static void memory_vm_change_state_handler(void *opaque, bool running, + RunState state) +{ + if (running) { + memory_global_dirty_log_stop_postponed_run(); + } +} + +void memory_global_dirty_log_stop(unsigned int flags) +{ + if (!runstate_is_running()) { + /* Postpone the dirty log stop, e.g., to when VM starts again */ + if (vmstate_change) { + /* Batch with previous postponed flags */ + postponed_stop_flags |= flags; + } else { + postponed_stop_flags = flags; + vmstate_change = qemu_add_vm_change_state_handler( + memory_vm_change_state_handler, NULL); + } + return; + } + + memory_global_dirty_log_do_stop(flags); +} + +static void listener_add_address_space(MemoryListener *listener, + AddressSpace *as) +{ + FlatView *view; + FlatRange *fr; + + if (listener->begin) { + listener->begin(listener); + } + if (global_dirty_tracking) { + if (listener->log_global_start) { + listener->log_global_start(listener); + } + } + + view = address_space_get_flatview(as); + FOR_EACH_FLAT_RANGE(fr, view) { + MemoryRegionSection section = section_from_flat_range(fr, view); + + if (listener->region_add) { + listener->region_add(listener, §ion); + } + if (fr->dirty_log_mask && listener->log_start) { + listener->log_start(listener, §ion, 0, fr->dirty_log_mask); + } + } + if (listener->commit) { + listener->commit(listener); + } + flatview_unref(view); +} + +static void listener_del_address_space(MemoryListener *listener, + AddressSpace *as) +{ + FlatView *view; + FlatRange *fr; + + if (listener->begin) { + listener->begin(listener); + } + view = address_space_get_flatview(as); + FOR_EACH_FLAT_RANGE(fr, view) { + MemoryRegionSection section = section_from_flat_range(fr, view); + + if (fr->dirty_log_mask && listener->log_stop) { + listener->log_stop(listener, §ion, fr->dirty_log_mask, 0); + } + if (listener->region_del) { + listener->region_del(listener, §ion); + } + } + if (listener->commit) { + listener->commit(listener); + } + flatview_unref(view); +} + +void memory_listener_register(MemoryListener *listener, AddressSpace *as) +{ + MemoryListener *other = NULL; + + /* Only one of them can be defined for a listener */ + assert(!(listener->log_sync && listener->log_sync_global)); + + listener->address_space = as; + if (QTAILQ_EMPTY(&memory_listeners) + || listener->priority >= QTAILQ_LAST(&memory_listeners)->priority) { + QTAILQ_INSERT_TAIL(&memory_listeners, listener, link); + } else { + QTAILQ_FOREACH(other, &memory_listeners, link) { + if (listener->priority < other->priority) { + break; + } + } + QTAILQ_INSERT_BEFORE(other, listener, link); + } + + if (QTAILQ_EMPTY(&as->listeners) + || listener->priority >= QTAILQ_LAST(&as->listeners)->priority) { + QTAILQ_INSERT_TAIL(&as->listeners, listener, link_as); + } else { + QTAILQ_FOREACH(other, &as->listeners, link_as) { + if (listener->priority < other->priority) { + break; + } + } + QTAILQ_INSERT_BEFORE(other, listener, link_as); + } + + listener_add_address_space(listener, as); + + if (listener->eventfd_add || listener->eventfd_del) { + as->ioeventfd_notifiers++; + } +} + +void memory_listener_unregister(MemoryListener *listener) +{ + if (!listener->address_space) { + return; + } + + if (listener->eventfd_add || listener->eventfd_del) { + listener->address_space->ioeventfd_notifiers--; + } + + listener_del_address_space(listener, listener->address_space); + QTAILQ_REMOVE(&memory_listeners, listener, link); + QTAILQ_REMOVE(&listener->address_space->listeners, listener, link_as); + listener->address_space = NULL; +} + +void address_space_remove_listeners(AddressSpace *as) +{ + while (!QTAILQ_EMPTY(&as->listeners)) { + memory_listener_unregister(QTAILQ_FIRST(&as->listeners)); + } +} + +void address_space_init(AddressSpace *as, MemoryRegion *root, const char *name) +{ + memory_region_ref(root); + as->root = root; + as->current_map = NULL; + as->ioeventfd_nb = 0; + as->ioeventfds = NULL; + QTAILQ_INIT(&as->listeners); + QTAILQ_INSERT_TAIL(&address_spaces, as, address_spaces_link); + as->name = g_strdup(name ? name : "anonymous"); + address_space_update_topology(as); + address_space_update_ioeventfds(as); +} + +static void do_address_space_destroy(AddressSpace *as) +{ + assert(QTAILQ_EMPTY(&as->listeners)); + + flatview_unref(as->current_map); + g_free(as->name); + g_free(as->ioeventfds); + memory_region_unref(as->root); +} + +void address_space_destroy(AddressSpace *as) +{ + MemoryRegion *root = as->root; + + /* Flush out anything from MemoryListeners listening in on this */ + memory_region_transaction_begin(); + as->root = NULL; + memory_region_transaction_commit(); + QTAILQ_REMOVE(&address_spaces, as, address_spaces_link); + + /* At this point, as->dispatch and as->current_map are dummy + * entries that the guest should never use. Wait for the old + * values to expire before freeing the data. + */ + as->root = root; + call_rcu(as, do_address_space_destroy, rcu); +} + +static const char *memory_region_type(MemoryRegion *mr) +{ + if (mr->alias) { + return memory_region_type(mr->alias); + } + if (memory_region_is_ram_device(mr)) { + return "ramd"; + } else if (memory_region_is_romd(mr)) { + return "romd"; + } else if (memory_region_is_rom(mr)) { + return "rom"; + } else if (memory_region_is_ram(mr)) { + return "ram"; + } else { + return "i/o"; + } +} + +typedef struct MemoryRegionList MemoryRegionList; + +struct MemoryRegionList { + const MemoryRegion *mr; + QTAILQ_ENTRY(MemoryRegionList) mrqueue; +}; + +typedef QTAILQ_HEAD(, MemoryRegionList) MemoryRegionListHead; + +#define MR_SIZE(size) (int128_nz(size) ? (hwaddr)int128_get64( \ + int128_sub((size), int128_one())) : 0) +#define MTREE_INDENT " " + +static void mtree_expand_owner(const char *label, Object *obj) +{ + DeviceState *dev = (DeviceState *) object_dynamic_cast(obj, TYPE_DEVICE); + + qemu_printf(" %s:{%s", label, dev ? "dev" : "obj"); + if (dev && dev->id) { + qemu_printf(" id=%s", dev->id); + } else { + char *canonical_path = object_get_canonical_path(obj); + if (canonical_path) { + qemu_printf(" path=%s", canonical_path); + g_free(canonical_path); + } else { + qemu_printf(" type=%s", object_get_typename(obj)); + } + } + qemu_printf("}"); +} + +static void mtree_print_mr_owner(const MemoryRegion *mr) +{ + Object *owner = mr->owner; + Object *parent = memory_region_owner((MemoryRegion *)mr); + + if (!owner && !parent) { + qemu_printf(" orphan"); + return; + } + if (owner) { + mtree_expand_owner("owner", owner); + } + if (parent && parent != owner) { + mtree_expand_owner("parent", parent); + } +} + +static void mtree_print_mr(const MemoryRegion *mr, unsigned int level, + hwaddr base, + MemoryRegionListHead *alias_print_queue, + bool owner, bool display_disabled) +{ + MemoryRegionList *new_ml, *ml, *next_ml; + MemoryRegionListHead submr_print_queue; + const MemoryRegion *submr; + unsigned int i; + hwaddr cur_start, cur_end; + + if (!mr) { + return; + } + + cur_start = base + mr->addr; + cur_end = cur_start + MR_SIZE(mr->size); + + /* + * Try to detect overflow of memory region. This should never + * happen normally. When it happens, we dump something to warn the + * user who is observing this. + */ + if (cur_start < base || cur_end < cur_start) { + qemu_printf("[DETECTED OVERFLOW!] "); + } + + if (mr->alias) { + bool found = false; + + /* check if the alias is already in the queue */ + QTAILQ_FOREACH(ml, alias_print_queue, mrqueue) { + if (ml->mr == mr->alias) { + found = true; + } + } + + if (!found) { + ml = g_new(MemoryRegionList, 1); + ml->mr = mr->alias; + QTAILQ_INSERT_TAIL(alias_print_queue, ml, mrqueue); + } + if (mr->enabled || display_disabled) { + for (i = 0; i < level; i++) { + qemu_printf(MTREE_INDENT); + } + qemu_printf(HWADDR_FMT_plx "-" HWADDR_FMT_plx + " (prio %d, %s%s): alias %s @%s " HWADDR_FMT_plx + "-" HWADDR_FMT_plx "%s", + cur_start, cur_end, + mr->priority, + mr->nonvolatile ? "nv-" : "", + memory_region_type((MemoryRegion *)mr), + memory_region_name(mr), + memory_region_name(mr->alias), + mr->alias_offset, + mr->alias_offset + MR_SIZE(mr->size), + mr->enabled ? "" : " [disabled]"); + if (owner) { + mtree_print_mr_owner(mr); + } + qemu_printf("\n"); + } + } else { + if (mr->enabled || display_disabled) { + for (i = 0; i < level; i++) { + qemu_printf(MTREE_INDENT); + } + qemu_printf(HWADDR_FMT_plx "-" HWADDR_FMT_plx + " (prio %d, %s%s): %s%s", + cur_start, cur_end, + mr->priority, + mr->nonvolatile ? "nv-" : "", + memory_region_type((MemoryRegion *)mr), + memory_region_name(mr), + mr->enabled ? "" : " [disabled]"); + if (owner) { + mtree_print_mr_owner(mr); + } + qemu_printf("\n"); + } + } + + QTAILQ_INIT(&submr_print_queue); + + QTAILQ_FOREACH(submr, &mr->subregions, subregions_link) { + new_ml = g_new(MemoryRegionList, 1); + new_ml->mr = submr; + QTAILQ_FOREACH(ml, &submr_print_queue, mrqueue) { + if (new_ml->mr->addr < ml->mr->addr || + (new_ml->mr->addr == ml->mr->addr && + new_ml->mr->priority > ml->mr->priority)) { + QTAILQ_INSERT_BEFORE(ml, new_ml, mrqueue); + new_ml = NULL; + break; + } + } + if (new_ml) { + QTAILQ_INSERT_TAIL(&submr_print_queue, new_ml, mrqueue); + } + } + + QTAILQ_FOREACH(ml, &submr_print_queue, mrqueue) { + mtree_print_mr(ml->mr, level + 1, cur_start, + alias_print_queue, owner, display_disabled); + } + + QTAILQ_FOREACH_SAFE(ml, &submr_print_queue, mrqueue, next_ml) { + g_free(ml); + } +} + +struct FlatViewInfo { + int counter; + bool dispatch_tree; + bool owner; + AccelClass *ac; +}; + +static void mtree_print_flatview(gpointer key, gpointer value, + gpointer user_data) +{ + FlatView *view = key; + GArray *fv_address_spaces = value; + struct FlatViewInfo *fvi = user_data; + FlatRange *range = &view->ranges[0]; + MemoryRegion *mr; + int n = view->nr; + int i; + AddressSpace *as; + + qemu_printf("FlatView #%d\n", fvi->counter); + ++fvi->counter; + + for (i = 0; i < fv_address_spaces->len; ++i) { + as = g_array_index(fv_address_spaces, AddressSpace*, i); + qemu_printf(" AS \"%s\", root: %s", + as->name, memory_region_name(as->root)); + if (as->root->alias) { + qemu_printf(", alias %s", memory_region_name(as->root->alias)); + } + qemu_printf("\n"); + } + + qemu_printf(" Root memory region: %s\n", + view->root ? memory_region_name(view->root) : "(none)"); + + if (n <= 0) { + qemu_printf(MTREE_INDENT "No rendered FlatView\n\n"); + return; + } + + while (n--) { + mr = range->mr; + if (range->offset_in_region) { + qemu_printf(MTREE_INDENT HWADDR_FMT_plx "-" HWADDR_FMT_plx + " (prio %d, %s%s): %s @" HWADDR_FMT_plx, + int128_get64(range->addr.start), + int128_get64(range->addr.start) + + MR_SIZE(range->addr.size), + mr->priority, + range->nonvolatile ? "nv-" : "", + range->readonly ? "rom" : memory_region_type(mr), + memory_region_name(mr), + range->offset_in_region); + } else { + qemu_printf(MTREE_INDENT HWADDR_FMT_plx "-" HWADDR_FMT_plx + " (prio %d, %s%s): %s", + int128_get64(range->addr.start), + int128_get64(range->addr.start) + + MR_SIZE(range->addr.size), + mr->priority, + range->nonvolatile ? "nv-" : "", + range->readonly ? "rom" : memory_region_type(mr), + memory_region_name(mr)); + } + if (fvi->owner) { + mtree_print_mr_owner(mr); + } + + if (fvi->ac) { + for (i = 0; i < fv_address_spaces->len; ++i) { + as = g_array_index(fv_address_spaces, AddressSpace*, i); + if (fvi->ac->has_memory(current_machine, as, + int128_get64(range->addr.start), + MR_SIZE(range->addr.size) + 1)) { + qemu_printf(" %s", fvi->ac->name); + } + } + } + qemu_printf("\n"); + range++; + } + +#if !defined(CONFIG_USER_ONLY) + if (fvi->dispatch_tree && view->root) { + mtree_print_dispatch(view->dispatch, view->root); + } +#endif + + qemu_printf("\n"); +} + +static gboolean mtree_info_flatview_free(gpointer key, gpointer value, + gpointer user_data) +{ + FlatView *view = key; + GArray *fv_address_spaces = value; + + g_array_unref(fv_address_spaces); + flatview_unref(view); + + return true; +} + +static void mtree_info_flatview(bool dispatch_tree, bool owner) +{ + struct FlatViewInfo fvi = { + .counter = 0, + .dispatch_tree = dispatch_tree, + .owner = owner, + }; + AddressSpace *as; + FlatView *view; + GArray *fv_address_spaces; + GHashTable *views = g_hash_table_new(g_direct_hash, g_direct_equal); + AccelClass *ac = ACCEL_GET_CLASS(current_accel()); + + if (ac->has_memory) { + fvi.ac = ac; + } + + /* Gather all FVs in one table */ + QTAILQ_FOREACH(as, &address_spaces, address_spaces_link) { + view = address_space_get_flatview(as); + + fv_address_spaces = g_hash_table_lookup(views, view); + if (!fv_address_spaces) { + fv_address_spaces = g_array_new(false, false, sizeof(as)); + g_hash_table_insert(views, view, fv_address_spaces); + } + + g_array_append_val(fv_address_spaces, as); + } + + /* Print */ + g_hash_table_foreach(views, mtree_print_flatview, &fvi); + + /* Free */ + g_hash_table_foreach_remove(views, mtree_info_flatview_free, 0); + g_hash_table_unref(views); +} + +struct AddressSpaceInfo { + MemoryRegionListHead *ml_head; + bool owner; + bool disabled; +}; + +/* Returns negative value if a < b; zero if a = b; positive value if a > b. */ +static gint address_space_compare_name(gconstpointer a, gconstpointer b) +{ + const AddressSpace *as_a = a; + const AddressSpace *as_b = b; + + return g_strcmp0(as_a->name, as_b->name); +} + +static void mtree_print_as_name(gpointer data, gpointer user_data) +{ + AddressSpace *as = data; + + qemu_printf("address-space: %s\n", as->name); +} + +static void mtree_print_as(gpointer key, gpointer value, gpointer user_data) +{ + MemoryRegion *mr = key; + GSList *as_same_root_mr_list = value; + struct AddressSpaceInfo *asi = user_data; + + g_slist_foreach(as_same_root_mr_list, mtree_print_as_name, NULL); + mtree_print_mr(mr, 1, 0, asi->ml_head, asi->owner, asi->disabled); + qemu_printf("\n"); +} + +static gboolean mtree_info_as_free(gpointer key, gpointer value, + gpointer user_data) +{ + GSList *as_same_root_mr_list = value; + + g_slist_free(as_same_root_mr_list); + + return true; +} + +static void mtree_info_as(bool dispatch_tree, bool owner, bool disabled) +{ + MemoryRegionListHead ml_head; + MemoryRegionList *ml, *ml2; + AddressSpace *as; + GHashTable *views = g_hash_table_new(g_direct_hash, g_direct_equal); + GSList *as_same_root_mr_list; + struct AddressSpaceInfo asi = { + .ml_head = &ml_head, + .owner = owner, + .disabled = disabled, + }; + + QTAILQ_INIT(&ml_head); + + QTAILQ_FOREACH(as, &address_spaces, address_spaces_link) { + /* Create hashtable, key=AS root MR, value = list of AS */ + as_same_root_mr_list = g_hash_table_lookup(views, as->root); + as_same_root_mr_list = g_slist_insert_sorted(as_same_root_mr_list, as, + address_space_compare_name); + g_hash_table_insert(views, as->root, as_same_root_mr_list); + } + + /* print address spaces */ + g_hash_table_foreach(views, mtree_print_as, &asi); + g_hash_table_foreach_remove(views, mtree_info_as_free, 0); + g_hash_table_unref(views); + + /* print aliased regions */ + QTAILQ_FOREACH(ml, &ml_head, mrqueue) { + qemu_printf("memory-region: %s\n", memory_region_name(ml->mr)); + mtree_print_mr(ml->mr, 1, 0, &ml_head, owner, disabled); + qemu_printf("\n"); + } + + QTAILQ_FOREACH_SAFE(ml, &ml_head, mrqueue, ml2) { + g_free(ml); + } +} + +void mtree_info(bool flatview, bool dispatch_tree, bool owner, bool disabled) +{ + if (flatview) { + mtree_info_flatview(dispatch_tree, owner); + } else { + mtree_info_as(dispatch_tree, owner, disabled); + } +} + +void memory_region_init_ram(MemoryRegion *mr, + Object *owner, + const char *name, + uint64_t size, + Error **errp) +{ + DeviceState *owner_dev; + Error *err = NULL; + + memory_region_init_ram_nomigrate(mr, owner, name, size, &err); + if (err) { + error_propagate(errp, err); + return; + } + /* This will assert if owner is neither NULL nor a DeviceState. + * We only want the owner here for the purposes of defining a + * unique name for migration. TODO: Ideally we should implement + * a naming scheme for Objects which are not DeviceStates, in + * which case we can relax this restriction. + */ + owner_dev = DEVICE(owner); + vmstate_register_ram(mr, owner_dev); +} + +void memory_region_init_rom(MemoryRegion *mr, + Object *owner, + const char *name, + uint64_t size, + Error **errp) +{ + DeviceState *owner_dev; + Error *err = NULL; + + memory_region_init_rom_nomigrate(mr, owner, name, size, &err); + if (err) { + error_propagate(errp, err); + return; + } + /* This will assert if owner is neither NULL nor a DeviceState. + * We only want the owner here for the purposes of defining a + * unique name for migration. TODO: Ideally we should implement + * a naming scheme for Objects which are not DeviceStates, in + * which case we can relax this restriction. + */ + owner_dev = DEVICE(owner); + vmstate_register_ram(mr, owner_dev); +} + +void memory_region_init_rom_device(MemoryRegion *mr, + Object *owner, + const MemoryRegionOps *ops, + void *opaque, + const char *name, + uint64_t size, + Error **errp) +{ + DeviceState *owner_dev; + Error *err = NULL; + + memory_region_init_rom_device_nomigrate(mr, owner, ops, opaque, + name, size, &err); + if (err) { + error_propagate(errp, err); + return; + } + /* This will assert if owner is neither NULL nor a DeviceState. + * We only want the owner here for the purposes of defining a + * unique name for migration. TODO: Ideally we should implement + * a naming scheme for Objects which are not DeviceStates, in + * which case we can relax this restriction. + */ + owner_dev = DEVICE(owner); + vmstate_register_ram(mr, owner_dev); +} + +/* + * Support system builds with CONFIG_FUZZ using a weak symbol and a stub for + * the fuzz_dma_read_cb callback + */ +#ifdef CONFIG_FUZZ +void __attribute__((weak)) fuzz_dma_read_cb(size_t addr, + size_t len, + MemoryRegion *mr) +{ +} +#endif + +static const TypeInfo memory_region_info = { + .parent = TYPE_OBJECT, + .name = TYPE_MEMORY_REGION, + .class_size = sizeof(MemoryRegionClass), + .instance_size = sizeof(MemoryRegion), + .instance_init = memory_region_initfn, + .instance_finalize = memory_region_finalize, +}; + +static const TypeInfo iommu_memory_region_info = { + .parent = TYPE_MEMORY_REGION, + .name = TYPE_IOMMU_MEMORY_REGION, + .class_size = sizeof(IOMMUMemoryRegionClass), + .instance_size = sizeof(IOMMUMemoryRegion), + .instance_init = iommu_memory_region_initfn, + .abstract = true, +}; + +static const TypeInfo ram_discard_manager_info = { + .parent = TYPE_INTERFACE, + .name = TYPE_RAM_DISCARD_MANAGER, + .class_size = sizeof(RamDiscardManagerClass), +}; + +static void memory_register_types(void) +{ + type_register_static(&memory_region_info); + type_register_static(&iommu_memory_region_info); + type_register_static(&ram_discard_manager_info); +} + +type_init(memory_register_types) diff --git a/system/memory_mapping.c b/system/memory_mapping.c new file mode 100644 index 0000000000..d7f1d096e0 --- /dev/null +++ b/system/memory_mapping.c @@ -0,0 +1,377 @@ +/* + * QEMU memory mapping + * + * Copyright Fujitsu, Corp. 2011, 2012 + * + * Authors: + * Wen Congyang <wency@cn.fujitsu.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include "qemu/osdep.h" +#include "qapi/error.h" + +#include "sysemu/memory_mapping.h" +#include "exec/memory.h" +#include "exec/address-spaces.h" +#include "hw/core/cpu.h" + +//#define DEBUG_GUEST_PHYS_REGION_ADD + +static void memory_mapping_list_add_mapping_sorted(MemoryMappingList *list, + MemoryMapping *mapping) +{ + MemoryMapping *p; + + QTAILQ_FOREACH(p, &list->head, next) { + if (p->phys_addr >= mapping->phys_addr) { + QTAILQ_INSERT_BEFORE(p, mapping, next); + return; + } + } + QTAILQ_INSERT_TAIL(&list->head, mapping, next); +} + +static void create_new_memory_mapping(MemoryMappingList *list, + hwaddr phys_addr, + hwaddr virt_addr, + ram_addr_t length) +{ + MemoryMapping *memory_mapping; + + memory_mapping = g_new(MemoryMapping, 1); + memory_mapping->phys_addr = phys_addr; + memory_mapping->virt_addr = virt_addr; + memory_mapping->length = length; + list->last_mapping = memory_mapping; + list->num++; + memory_mapping_list_add_mapping_sorted(list, memory_mapping); +} + +static inline bool mapping_contiguous(MemoryMapping *map, + hwaddr phys_addr, + hwaddr virt_addr) +{ + return phys_addr == map->phys_addr + map->length && + virt_addr == map->virt_addr + map->length; +} + +/* + * [map->phys_addr, map->phys_addr + map->length) and + * [phys_addr, phys_addr + length) have intersection? + */ +static inline bool mapping_have_same_region(MemoryMapping *map, + hwaddr phys_addr, + ram_addr_t length) +{ + return !(phys_addr + length < map->phys_addr || + phys_addr >= map->phys_addr + map->length); +} + +/* + * [map->phys_addr, map->phys_addr + map->length) and + * [phys_addr, phys_addr + length) have intersection. The virtual address in the + * intersection are the same? + */ +static inline bool mapping_conflict(MemoryMapping *map, + hwaddr phys_addr, + hwaddr virt_addr) +{ + return virt_addr - map->virt_addr != phys_addr - map->phys_addr; +} + +/* + * [map->virt_addr, map->virt_addr + map->length) and + * [virt_addr, virt_addr + length) have intersection. And the physical address + * in the intersection are the same. + */ +static inline void mapping_merge(MemoryMapping *map, + hwaddr virt_addr, + ram_addr_t length) +{ + if (virt_addr < map->virt_addr) { + map->length += map->virt_addr - virt_addr; + map->virt_addr = virt_addr; + } + + if ((virt_addr + length) > + (map->virt_addr + map->length)) { + map->length = virt_addr + length - map->virt_addr; + } +} + +void memory_mapping_list_add_merge_sorted(MemoryMappingList *list, + hwaddr phys_addr, + hwaddr virt_addr, + ram_addr_t length) +{ + MemoryMapping *memory_mapping, *last_mapping; + + if (QTAILQ_EMPTY(&list->head)) { + create_new_memory_mapping(list, phys_addr, virt_addr, length); + return; + } + + last_mapping = list->last_mapping; + if (last_mapping) { + if (mapping_contiguous(last_mapping, phys_addr, virt_addr)) { + last_mapping->length += length; + return; + } + } + + QTAILQ_FOREACH(memory_mapping, &list->head, next) { + if (mapping_contiguous(memory_mapping, phys_addr, virt_addr)) { + memory_mapping->length += length; + list->last_mapping = memory_mapping; + return; + } + + if (phys_addr + length < memory_mapping->phys_addr) { + /* create a new region before memory_mapping */ + break; + } + + if (mapping_have_same_region(memory_mapping, phys_addr, length)) { + if (mapping_conflict(memory_mapping, phys_addr, virt_addr)) { + continue; + } + + /* merge this region into memory_mapping */ + mapping_merge(memory_mapping, virt_addr, length); + list->last_mapping = memory_mapping; + return; + } + } + + /* this region can not be merged into any existed memory mapping. */ + create_new_memory_mapping(list, phys_addr, virt_addr, length); +} + +void memory_mapping_list_free(MemoryMappingList *list) +{ + MemoryMapping *p, *q; + + QTAILQ_FOREACH_SAFE(p, &list->head, next, q) { + QTAILQ_REMOVE(&list->head, p, next); + g_free(p); + } + + list->num = 0; + list->last_mapping = NULL; +} + +void memory_mapping_list_init(MemoryMappingList *list) +{ + list->num = 0; + list->last_mapping = NULL; + QTAILQ_INIT(&list->head); +} + +void guest_phys_blocks_free(GuestPhysBlockList *list) +{ + GuestPhysBlock *p, *q; + + QTAILQ_FOREACH_SAFE(p, &list->head, next, q) { + QTAILQ_REMOVE(&list->head, p, next); + memory_region_unref(p->mr); + g_free(p); + } + list->num = 0; +} + +void guest_phys_blocks_init(GuestPhysBlockList *list) +{ + list->num = 0; + QTAILQ_INIT(&list->head); +} + +typedef struct GuestPhysListener { + GuestPhysBlockList *list; + MemoryListener listener; +} GuestPhysListener; + +static void guest_phys_block_add_section(GuestPhysListener *g, + MemoryRegionSection *section) +{ + const hwaddr target_start = section->offset_within_address_space; + const hwaddr target_end = target_start + int128_get64(section->size); + uint8_t *host_addr = memory_region_get_ram_ptr(section->mr) + + section->offset_within_region; + GuestPhysBlock *predecessor = NULL; + + /* find continuity in guest physical address space */ + if (!QTAILQ_EMPTY(&g->list->head)) { + hwaddr predecessor_size; + + predecessor = QTAILQ_LAST(&g->list->head); + predecessor_size = predecessor->target_end - predecessor->target_start; + + /* the memory API guarantees monotonically increasing traversal */ + g_assert(predecessor->target_end <= target_start); + + /* we want continuity in both guest-physical and host-virtual memory */ + if (predecessor->target_end < target_start || + predecessor->host_addr + predecessor_size != host_addr || + predecessor->mr != section->mr) { + predecessor = NULL; + } + } + + if (predecessor == NULL) { + /* isolated mapping, allocate it and add it to the list */ + GuestPhysBlock *block = g_malloc0(sizeof *block); + + block->target_start = target_start; + block->target_end = target_end; + block->host_addr = host_addr; + block->mr = section->mr; + memory_region_ref(section->mr); + + QTAILQ_INSERT_TAIL(&g->list->head, block, next); + ++g->list->num; + } else { + /* expand predecessor until @target_end; predecessor's start doesn't + * change + */ + predecessor->target_end = target_end; + } + +#ifdef DEBUG_GUEST_PHYS_REGION_ADD + fprintf(stderr, "%s: target_start=" HWADDR_FMT_plx " target_end=" + HWADDR_FMT_plx ": %s (count: %u)\n", __func__, target_start, + target_end, predecessor ? "joined" : "added", g->list->num); +#endif +} + +static int guest_phys_ram_populate_cb(MemoryRegionSection *section, + void *opaque) +{ + GuestPhysListener *g = opaque; + + guest_phys_block_add_section(g, section); + return 0; +} + +static void guest_phys_blocks_region_add(MemoryListener *listener, + MemoryRegionSection *section) +{ + GuestPhysListener *g = container_of(listener, GuestPhysListener, listener); + + /* we only care about RAM */ + if (!memory_region_is_ram(section->mr) || + memory_region_is_ram_device(section->mr) || + memory_region_is_nonvolatile(section->mr)) { + return; + } + + /* for special sparse regions, only add populated parts */ + if (memory_region_has_ram_discard_manager(section->mr)) { + RamDiscardManager *rdm; + + rdm = memory_region_get_ram_discard_manager(section->mr); + ram_discard_manager_replay_populated(rdm, section, + guest_phys_ram_populate_cb, g); + return; + } + + guest_phys_block_add_section(g, section); +} + +void guest_phys_blocks_append(GuestPhysBlockList *list) +{ + GuestPhysListener g = { 0 }; + + g.list = list; + g.listener.region_add = &guest_phys_blocks_region_add; + memory_listener_register(&g.listener, &address_space_memory); + memory_listener_unregister(&g.listener); +} + +static CPUState *find_paging_enabled_cpu(CPUState *start_cpu) +{ + CPUState *cpu; + + CPU_FOREACH(cpu) { + if (cpu_paging_enabled(cpu)) { + return cpu; + } + } + + return NULL; +} + +void qemu_get_guest_memory_mapping(MemoryMappingList *list, + const GuestPhysBlockList *guest_phys_blocks, + Error **errp) +{ + CPUState *cpu, *first_paging_enabled_cpu; + GuestPhysBlock *block; + ram_addr_t offset, length; + + first_paging_enabled_cpu = find_paging_enabled_cpu(first_cpu); + if (first_paging_enabled_cpu) { + for (cpu = first_paging_enabled_cpu; cpu != NULL; + cpu = CPU_NEXT(cpu)) { + Error *err = NULL; + cpu_get_memory_mapping(cpu, list, &err); + if (err) { + error_propagate(errp, err); + return; + } + } + return; + } + + /* + * If the guest doesn't use paging, the virtual address is equal to physical + * address. + */ + QTAILQ_FOREACH(block, &guest_phys_blocks->head, next) { + offset = block->target_start; + length = block->target_end - block->target_start; + create_new_memory_mapping(list, offset, offset, length); + } +} + +void qemu_get_guest_simple_memory_mapping(MemoryMappingList *list, + const GuestPhysBlockList *guest_phys_blocks) +{ + GuestPhysBlock *block; + + QTAILQ_FOREACH(block, &guest_phys_blocks->head, next) { + create_new_memory_mapping(list, block->target_start, 0, + block->target_end - block->target_start); + } +} + +void memory_mapping_filter(MemoryMappingList *list, int64_t begin, + int64_t length) +{ + MemoryMapping *cur, *next; + + QTAILQ_FOREACH_SAFE(cur, &list->head, next, next) { + if (cur->phys_addr >= begin + length || + cur->phys_addr + cur->length <= begin) { + QTAILQ_REMOVE(&list->head, cur, next); + g_free(cur); + list->num--; + continue; + } + + if (cur->phys_addr < begin) { + cur->length -= begin - cur->phys_addr; + if (cur->virt_addr) { + cur->virt_addr += begin - cur->phys_addr; + } + cur->phys_addr = begin; + } + + if (cur->phys_addr + cur->length > begin + length) { + cur->length -= cur->phys_addr + cur->length - begin - length; + } + } +} diff --git a/system/meson.build b/system/meson.build new file mode 100644 index 0000000000..3a64dd89de --- /dev/null +++ b/system/meson.build @@ -0,0 +1,36 @@ +specific_ss.add(when: 'CONFIG_SYSTEM_ONLY', if_true: [files( + 'arch_init.c', + 'ioport.c', + 'memory.c', + 'physmem.c', + 'watchpoint.c', +)]) + +system_ss.add(files( + 'balloon.c', + 'bootdevice.c', + 'cpus.c', + 'cpu-throttle.c', + 'cpu-timers.c', + 'datadir.c', + 'dirtylimit.c', + 'dma-helpers.c', + 'globals.c', + 'memory_mapping.c', + 'qdev-monitor.c', + 'qtest.c', + 'rtc.c', + 'runstate-action.c', + 'runstate-hmp-cmds.c', + 'runstate.c', + 'tpm-hmp-cmds.c', + 'vl.c', +), sdl, libpmem, libdaxctl) + +if have_tpm + system_ss.add(files('tpm.c')) +endif + +system_ss.add(when: seccomp, if_true: files('qemu-seccomp.c')) +system_ss.add(when: fdt, if_true: files('device_tree.c')) +system_ss.add(when: 'CONFIG_LINUX', if_true: files('async-teardown.c')) diff --git a/system/physmem.c b/system/physmem.c new file mode 100644 index 0000000000..edc3ed8ab9 --- /dev/null +++ b/system/physmem.c @@ -0,0 +1,3796 @@ +/* + * RAM allocation and memory access + * + * Copyright (c) 2003 Fabrice Bellard + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +#include "qemu/osdep.h" +#include "exec/page-vary.h" +#include "qapi/error.h" + +#include "qemu/cutils.h" +#include "qemu/cacheflush.h" +#include "qemu/hbitmap.h" +#include "qemu/madvise.h" + +#ifdef CONFIG_TCG +#include "hw/core/tcg-cpu-ops.h" +#endif /* CONFIG_TCG */ + +#include "exec/exec-all.h" +#include "exec/target_page.h" +#include "hw/qdev-core.h" +#include "hw/qdev-properties.h" +#include "hw/boards.h" +#include "hw/xen/xen.h" +#include "sysemu/kvm.h" +#include "sysemu/tcg.h" +#include "sysemu/qtest.h" +#include "qemu/timer.h" +#include "qemu/config-file.h" +#include "qemu/error-report.h" +#include "qemu/qemu-print.h" +#include "qemu/log.h" +#include "qemu/memalign.h" +#include "exec/memory.h" +#include "exec/ioport.h" +#include "sysemu/dma.h" +#include "sysemu/hostmem.h" +#include "sysemu/hw_accel.h" +#include "sysemu/xen-mapcache.h" +#include "trace/trace-root.h" + +#ifdef CONFIG_FALLOCATE_PUNCH_HOLE +#include <linux/falloc.h> +#endif + +#include "qemu/rcu_queue.h" +#include "qemu/main-loop.h" +#include "exec/translate-all.h" +#include "sysemu/replay.h" + +#include "exec/memory-internal.h" +#include "exec/ram_addr.h" + +#include "qemu/pmem.h" + +#include "migration/vmstate.h" + +#include "qemu/range.h" +#ifndef _WIN32 +#include "qemu/mmap-alloc.h" +#endif + +#include "monitor/monitor.h" + +#ifdef CONFIG_LIBDAXCTL +#include <daxctl/libdaxctl.h> +#endif + +//#define DEBUG_SUBPAGE + +/* ram_list is read under rcu_read_lock()/rcu_read_unlock(). Writes + * are protected by the ramlist lock. + */ +RAMList ram_list = { .blocks = QLIST_HEAD_INITIALIZER(ram_list.blocks) }; + +static MemoryRegion *system_memory; +static MemoryRegion *system_io; + +AddressSpace address_space_io; +AddressSpace address_space_memory; + +static MemoryRegion io_mem_unassigned; + +typedef struct PhysPageEntry PhysPageEntry; + +struct PhysPageEntry { + /* How many bits skip to next level (in units of L2_SIZE). 0 for a leaf. */ + uint32_t skip : 6; + /* index into phys_sections (!skip) or phys_map_nodes (skip) */ + uint32_t ptr : 26; +}; + +#define PHYS_MAP_NODE_NIL (((uint32_t)~0) >> 6) + +/* Size of the L2 (and L3, etc) page tables. */ +#define ADDR_SPACE_BITS 64 + +#define P_L2_BITS 9 +#define P_L2_SIZE (1 << P_L2_BITS) + +#define P_L2_LEVELS (((ADDR_SPACE_BITS - TARGET_PAGE_BITS - 1) / P_L2_BITS) + 1) + +typedef PhysPageEntry Node[P_L2_SIZE]; + +typedef struct PhysPageMap { + struct rcu_head rcu; + + unsigned sections_nb; + unsigned sections_nb_alloc; + unsigned nodes_nb; + unsigned nodes_nb_alloc; + Node *nodes; + MemoryRegionSection *sections; +} PhysPageMap; + +struct AddressSpaceDispatch { + MemoryRegionSection *mru_section; + /* This is a multi-level map on the physical address space. + * The bottom level has pointers to MemoryRegionSections. + */ + PhysPageEntry phys_map; + PhysPageMap map; +}; + +#define SUBPAGE_IDX(addr) ((addr) & ~TARGET_PAGE_MASK) +typedef struct subpage_t { + MemoryRegion iomem; + FlatView *fv; + hwaddr base; + uint16_t sub_section[]; +} subpage_t; + +#define PHYS_SECTION_UNASSIGNED 0 + +static void io_mem_init(void); +static void memory_map_init(void); +static void tcg_log_global_after_sync(MemoryListener *listener); +static void tcg_commit(MemoryListener *listener); + +/** + * CPUAddressSpace: all the information a CPU needs about an AddressSpace + * @cpu: the CPU whose AddressSpace this is + * @as: the AddressSpace itself + * @memory_dispatch: its dispatch pointer (cached, RCU protected) + * @tcg_as_listener: listener for tracking changes to the AddressSpace + */ +struct CPUAddressSpace { + CPUState *cpu; + AddressSpace *as; + struct AddressSpaceDispatch *memory_dispatch; + MemoryListener tcg_as_listener; +}; + +struct DirtyBitmapSnapshot { + ram_addr_t start; + ram_addr_t end; + unsigned long dirty[]; +}; + +static void phys_map_node_reserve(PhysPageMap *map, unsigned nodes) +{ + static unsigned alloc_hint = 16; + if (map->nodes_nb + nodes > map->nodes_nb_alloc) { + map->nodes_nb_alloc = MAX(alloc_hint, map->nodes_nb + nodes); + map->nodes = g_renew(Node, map->nodes, map->nodes_nb_alloc); + alloc_hint = map->nodes_nb_alloc; + } +} + +static uint32_t phys_map_node_alloc(PhysPageMap *map, bool leaf) +{ + unsigned i; + uint32_t ret; + PhysPageEntry e; + PhysPageEntry *p; + + ret = map->nodes_nb++; + p = map->nodes[ret]; + assert(ret != PHYS_MAP_NODE_NIL); + assert(ret != map->nodes_nb_alloc); + + e.skip = leaf ? 0 : 1; + e.ptr = leaf ? PHYS_SECTION_UNASSIGNED : PHYS_MAP_NODE_NIL; + for (i = 0; i < P_L2_SIZE; ++i) { + memcpy(&p[i], &e, sizeof(e)); + } + return ret; +} + +static void phys_page_set_level(PhysPageMap *map, PhysPageEntry *lp, + hwaddr *index, uint64_t *nb, uint16_t leaf, + int level) +{ + PhysPageEntry *p; + hwaddr step = (hwaddr)1 << (level * P_L2_BITS); + + if (lp->skip && lp->ptr == PHYS_MAP_NODE_NIL) { + lp->ptr = phys_map_node_alloc(map, level == 0); + } + p = map->nodes[lp->ptr]; + lp = &p[(*index >> (level * P_L2_BITS)) & (P_L2_SIZE - 1)]; + + while (*nb && lp < &p[P_L2_SIZE]) { + if ((*index & (step - 1)) == 0 && *nb >= step) { + lp->skip = 0; + lp->ptr = leaf; + *index += step; + *nb -= step; + } else { + phys_page_set_level(map, lp, index, nb, leaf, level - 1); + } + ++lp; + } +} + +static void phys_page_set(AddressSpaceDispatch *d, + hwaddr index, uint64_t nb, + uint16_t leaf) +{ + /* Wildly overreserve - it doesn't matter much. */ + phys_map_node_reserve(&d->map, 3 * P_L2_LEVELS); + + phys_page_set_level(&d->map, &d->phys_map, &index, &nb, leaf, P_L2_LEVELS - 1); +} + +/* Compact a non leaf page entry. Simply detect that the entry has a single child, + * and update our entry so we can skip it and go directly to the destination. + */ +static void phys_page_compact(PhysPageEntry *lp, Node *nodes) +{ + unsigned valid_ptr = P_L2_SIZE; + int valid = 0; + PhysPageEntry *p; + int i; + + if (lp->ptr == PHYS_MAP_NODE_NIL) { + return; + } + + p = nodes[lp->ptr]; + for (i = 0; i < P_L2_SIZE; i++) { + if (p[i].ptr == PHYS_MAP_NODE_NIL) { + continue; + } + + valid_ptr = i; + valid++; + if (p[i].skip) { + phys_page_compact(&p[i], nodes); + } + } + + /* We can only compress if there's only one child. */ + if (valid != 1) { + return; + } + + assert(valid_ptr < P_L2_SIZE); + + /* Don't compress if it won't fit in the # of bits we have. */ + if (P_L2_LEVELS >= (1 << 6) && + lp->skip + p[valid_ptr].skip >= (1 << 6)) { + return; + } + + lp->ptr = p[valid_ptr].ptr; + if (!p[valid_ptr].skip) { + /* If our only child is a leaf, make this a leaf. */ + /* By design, we should have made this node a leaf to begin with so we + * should never reach here. + * But since it's so simple to handle this, let's do it just in case we + * change this rule. + */ + lp->skip = 0; + } else { + lp->skip += p[valid_ptr].skip; + } +} + +void address_space_dispatch_compact(AddressSpaceDispatch *d) +{ + if (d->phys_map.skip) { + phys_page_compact(&d->phys_map, d->map.nodes); + } +} + +static inline bool section_covers_addr(const MemoryRegionSection *section, + hwaddr addr) +{ + /* Memory topology clips a memory region to [0, 2^64); size.hi > 0 means + * the section must cover the entire address space. + */ + return int128_gethi(section->size) || + range_covers_byte(section->offset_within_address_space, + int128_getlo(section->size), addr); +} + +static MemoryRegionSection *phys_page_find(AddressSpaceDispatch *d, hwaddr addr) +{ + PhysPageEntry lp = d->phys_map, *p; + Node *nodes = d->map.nodes; + MemoryRegionSection *sections = d->map.sections; + hwaddr index = addr >> TARGET_PAGE_BITS; + int i; + + for (i = P_L2_LEVELS; lp.skip && (i -= lp.skip) >= 0;) { + if (lp.ptr == PHYS_MAP_NODE_NIL) { + return §ions[PHYS_SECTION_UNASSIGNED]; + } + p = nodes[lp.ptr]; + lp = p[(index >> (i * P_L2_BITS)) & (P_L2_SIZE - 1)]; + } + + if (section_covers_addr(§ions[lp.ptr], addr)) { + return §ions[lp.ptr]; + } else { + return §ions[PHYS_SECTION_UNASSIGNED]; + } +} + +/* Called from RCU critical section */ +static MemoryRegionSection *address_space_lookup_region(AddressSpaceDispatch *d, + hwaddr addr, + bool resolve_subpage) +{ + MemoryRegionSection *section = qatomic_read(&d->mru_section); + subpage_t *subpage; + + if (!section || section == &d->map.sections[PHYS_SECTION_UNASSIGNED] || + !section_covers_addr(section, addr)) { + section = phys_page_find(d, addr); + qatomic_set(&d->mru_section, section); + } + if (resolve_subpage && section->mr->subpage) { + subpage = container_of(section->mr, subpage_t, iomem); + section = &d->map.sections[subpage->sub_section[SUBPAGE_IDX(addr)]]; + } + return section; +} + +/* Called from RCU critical section */ +static MemoryRegionSection * +address_space_translate_internal(AddressSpaceDispatch *d, hwaddr addr, hwaddr *xlat, + hwaddr *plen, bool resolve_subpage) +{ + MemoryRegionSection *section; + MemoryRegion *mr; + Int128 diff; + + section = address_space_lookup_region(d, addr, resolve_subpage); + /* Compute offset within MemoryRegionSection */ + addr -= section->offset_within_address_space; + + /* Compute offset within MemoryRegion */ + *xlat = addr + section->offset_within_region; + + mr = section->mr; + + /* MMIO registers can be expected to perform full-width accesses based only + * on their address, without considering adjacent registers that could + * decode to completely different MemoryRegions. When such registers + * exist (e.g. I/O ports 0xcf8 and 0xcf9 on most PC chipsets), MMIO + * regions overlap wildly. For this reason we cannot clamp the accesses + * here. + * + * If the length is small (as is the case for address_space_ldl/stl), + * everything works fine. If the incoming length is large, however, + * the caller really has to do the clamping through memory_access_size. + */ + if (memory_region_is_ram(mr)) { + diff = int128_sub(section->size, int128_make64(addr)); + *plen = int128_get64(int128_min(diff, int128_make64(*plen))); + } + return section; +} + +/** + * address_space_translate_iommu - translate an address through an IOMMU + * memory region and then through the target address space. + * + * @iommu_mr: the IOMMU memory region that we start the translation from + * @addr: the address to be translated through the MMU + * @xlat: the translated address offset within the destination memory region. + * It cannot be %NULL. + * @plen_out: valid read/write length of the translated address. It + * cannot be %NULL. + * @page_mask_out: page mask for the translated address. This + * should only be meaningful for IOMMU translated + * addresses, since there may be huge pages that this bit + * would tell. It can be %NULL if we don't care about it. + * @is_write: whether the translation operation is for write + * @is_mmio: whether this can be MMIO, set true if it can + * @target_as: the address space targeted by the IOMMU + * @attrs: transaction attributes + * + * This function is called from RCU critical section. It is the common + * part of flatview_do_translate and address_space_translate_cached. + */ +static MemoryRegionSection address_space_translate_iommu(IOMMUMemoryRegion *iommu_mr, + hwaddr *xlat, + hwaddr *plen_out, + hwaddr *page_mask_out, + bool is_write, + bool is_mmio, + AddressSpace **target_as, + MemTxAttrs attrs) +{ + MemoryRegionSection *section; + hwaddr page_mask = (hwaddr)-1; + + do { + hwaddr addr = *xlat; + IOMMUMemoryRegionClass *imrc = memory_region_get_iommu_class_nocheck(iommu_mr); + int iommu_idx = 0; + IOMMUTLBEntry iotlb; + + if (imrc->attrs_to_index) { + iommu_idx = imrc->attrs_to_index(iommu_mr, attrs); + } + + iotlb = imrc->translate(iommu_mr, addr, is_write ? + IOMMU_WO : IOMMU_RO, iommu_idx); + + if (!(iotlb.perm & (1 << is_write))) { + goto unassigned; + } + + addr = ((iotlb.translated_addr & ~iotlb.addr_mask) + | (addr & iotlb.addr_mask)); + page_mask &= iotlb.addr_mask; + *plen_out = MIN(*plen_out, (addr | iotlb.addr_mask) - addr + 1); + *target_as = iotlb.target_as; + + section = address_space_translate_internal( + address_space_to_dispatch(iotlb.target_as), addr, xlat, + plen_out, is_mmio); + + iommu_mr = memory_region_get_iommu(section->mr); + } while (unlikely(iommu_mr)); + + if (page_mask_out) { + *page_mask_out = page_mask; + } + return *section; + +unassigned: + return (MemoryRegionSection) { .mr = &io_mem_unassigned }; +} + +/** + * flatview_do_translate - translate an address in FlatView + * + * @fv: the flat view that we want to translate on + * @addr: the address to be translated in above address space + * @xlat: the translated address offset within memory region. It + * cannot be @NULL. + * @plen_out: valid read/write length of the translated address. It + * can be @NULL when we don't care about it. + * @page_mask_out: page mask for the translated address. This + * should only be meaningful for IOMMU translated + * addresses, since there may be huge pages that this bit + * would tell. It can be @NULL if we don't care about it. + * @is_write: whether the translation operation is for write + * @is_mmio: whether this can be MMIO, set true if it can + * @target_as: the address space targeted by the IOMMU + * @attrs: memory transaction attributes + * + * This function is called from RCU critical section + */ +static MemoryRegionSection flatview_do_translate(FlatView *fv, + hwaddr addr, + hwaddr *xlat, + hwaddr *plen_out, + hwaddr *page_mask_out, + bool is_write, + bool is_mmio, + AddressSpace **target_as, + MemTxAttrs attrs) +{ + MemoryRegionSection *section; + IOMMUMemoryRegion *iommu_mr; + hwaddr plen = (hwaddr)(-1); + + if (!plen_out) { + plen_out = &plen; + } + + section = address_space_translate_internal( + flatview_to_dispatch(fv), addr, xlat, + plen_out, is_mmio); + + iommu_mr = memory_region_get_iommu(section->mr); + if (unlikely(iommu_mr)) { + return address_space_translate_iommu(iommu_mr, xlat, + plen_out, page_mask_out, + is_write, is_mmio, + target_as, attrs); + } + if (page_mask_out) { + /* Not behind an IOMMU, use default page size. */ + *page_mask_out = ~TARGET_PAGE_MASK; + } + + return *section; +} + +/* Called from RCU critical section */ +IOMMUTLBEntry address_space_get_iotlb_entry(AddressSpace *as, hwaddr addr, + bool is_write, MemTxAttrs attrs) +{ + MemoryRegionSection section; + hwaddr xlat, page_mask; + + /* + * This can never be MMIO, and we don't really care about plen, + * but page mask. + */ + section = flatview_do_translate(address_space_to_flatview(as), addr, &xlat, + NULL, &page_mask, is_write, false, &as, + attrs); + + /* Illegal translation */ + if (section.mr == &io_mem_unassigned) { + goto iotlb_fail; + } + + /* Convert memory region offset into address space offset */ + xlat += section.offset_within_address_space - + section.offset_within_region; + + return (IOMMUTLBEntry) { + .target_as = as, + .iova = addr & ~page_mask, + .translated_addr = xlat & ~page_mask, + .addr_mask = page_mask, + /* IOTLBs are for DMAs, and DMA only allows on RAMs. */ + .perm = IOMMU_RW, + }; + +iotlb_fail: + return (IOMMUTLBEntry) {0}; +} + +/* Called from RCU critical section */ +MemoryRegion *flatview_translate(FlatView *fv, hwaddr addr, hwaddr *xlat, + hwaddr *plen, bool is_write, + MemTxAttrs attrs) +{ + MemoryRegion *mr; + MemoryRegionSection section; + AddressSpace *as = NULL; + + /* This can be MMIO, so setup MMIO bit. */ + section = flatview_do_translate(fv, addr, xlat, plen, NULL, + is_write, true, &as, attrs); + mr = section.mr; + + if (xen_enabled() && memory_access_is_direct(mr, is_write)) { + hwaddr page = ((addr & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE) - addr; + *plen = MIN(page, *plen); + } + + return mr; +} + +typedef struct TCGIOMMUNotifier { + IOMMUNotifier n; + MemoryRegion *mr; + CPUState *cpu; + int iommu_idx; + bool active; +} TCGIOMMUNotifier; + +static void tcg_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) +{ + TCGIOMMUNotifier *notifier = container_of(n, TCGIOMMUNotifier, n); + + if (!notifier->active) { + return; + } + tlb_flush(notifier->cpu); + notifier->active = false; + /* We leave the notifier struct on the list to avoid reallocating it later. + * Generally the number of IOMMUs a CPU deals with will be small. + * In any case we can't unregister the iommu notifier from a notify + * callback. + */ +} + +static void tcg_register_iommu_notifier(CPUState *cpu, + IOMMUMemoryRegion *iommu_mr, + int iommu_idx) +{ + /* Make sure this CPU has an IOMMU notifier registered for this + * IOMMU/IOMMU index combination, so that we can flush its TLB + * when the IOMMU tells us the mappings we've cached have changed. + */ + MemoryRegion *mr = MEMORY_REGION(iommu_mr); + TCGIOMMUNotifier *notifier = NULL; + int i; + + for (i = 0; i < cpu->iommu_notifiers->len; i++) { + notifier = g_array_index(cpu->iommu_notifiers, TCGIOMMUNotifier *, i); + if (notifier->mr == mr && notifier->iommu_idx == iommu_idx) { + break; + } + } + if (i == cpu->iommu_notifiers->len) { + /* Not found, add a new entry at the end of the array */ + cpu->iommu_notifiers = g_array_set_size(cpu->iommu_notifiers, i + 1); + notifier = g_new0(TCGIOMMUNotifier, 1); + g_array_index(cpu->iommu_notifiers, TCGIOMMUNotifier *, i) = notifier; + + notifier->mr = mr; + notifier->iommu_idx = iommu_idx; + notifier->cpu = cpu; + /* Rather than trying to register interest in the specific part + * of the iommu's address space that we've accessed and then + * expand it later as subsequent accesses touch more of it, we + * just register interest in the whole thing, on the assumption + * that iommu reconfiguration will be rare. + */ + iommu_notifier_init(¬ifier->n, + tcg_iommu_unmap_notify, + IOMMU_NOTIFIER_UNMAP, + 0, + HWADDR_MAX, + iommu_idx); + memory_region_register_iommu_notifier(notifier->mr, ¬ifier->n, + &error_fatal); + } + + if (!notifier->active) { + notifier->active = true; + } +} + +void tcg_iommu_free_notifier_list(CPUState *cpu) +{ + /* Destroy the CPU's notifier list */ + int i; + TCGIOMMUNotifier *notifier; + + for (i = 0; i < cpu->iommu_notifiers->len; i++) { + notifier = g_array_index(cpu->iommu_notifiers, TCGIOMMUNotifier *, i); + memory_region_unregister_iommu_notifier(notifier->mr, ¬ifier->n); + g_free(notifier); + } + g_array_free(cpu->iommu_notifiers, true); +} + +void tcg_iommu_init_notifier_list(CPUState *cpu) +{ + cpu->iommu_notifiers = g_array_new(false, true, sizeof(TCGIOMMUNotifier *)); +} + +/* Called from RCU critical section */ +MemoryRegionSection * +address_space_translate_for_iotlb(CPUState *cpu, int asidx, hwaddr orig_addr, + hwaddr *xlat, hwaddr *plen, + MemTxAttrs attrs, int *prot) +{ + MemoryRegionSection *section; + IOMMUMemoryRegion *iommu_mr; + IOMMUMemoryRegionClass *imrc; + IOMMUTLBEntry iotlb; + int iommu_idx; + hwaddr addr = orig_addr; + AddressSpaceDispatch *d = cpu->cpu_ases[asidx].memory_dispatch; + + for (;;) { + section = address_space_translate_internal(d, addr, &addr, plen, false); + + iommu_mr = memory_region_get_iommu(section->mr); + if (!iommu_mr) { + break; + } + + imrc = memory_region_get_iommu_class_nocheck(iommu_mr); + + iommu_idx = imrc->attrs_to_index(iommu_mr, attrs); + tcg_register_iommu_notifier(cpu, iommu_mr, iommu_idx); + /* We need all the permissions, so pass IOMMU_NONE so the IOMMU + * doesn't short-cut its translation table walk. + */ + iotlb = imrc->translate(iommu_mr, addr, IOMMU_NONE, iommu_idx); + addr = ((iotlb.translated_addr & ~iotlb.addr_mask) + | (addr & iotlb.addr_mask)); + /* Update the caller's prot bits to remove permissions the IOMMU + * is giving us a failure response for. If we get down to no + * permissions left at all we can give up now. + */ + if (!(iotlb.perm & IOMMU_RO)) { + *prot &= ~(PAGE_READ | PAGE_EXEC); + } + if (!(iotlb.perm & IOMMU_WO)) { + *prot &= ~PAGE_WRITE; + } + + if (!*prot) { + goto translate_fail; + } + + d = flatview_to_dispatch(address_space_to_flatview(iotlb.target_as)); + } + + assert(!memory_region_is_iommu(section->mr)); + *xlat = addr; + return section; + +translate_fail: + /* + * We should be given a page-aligned address -- certainly + * tlb_set_page_with_attrs() does so. The page offset of xlat + * is used to index sections[], and PHYS_SECTION_UNASSIGNED = 0. + * The page portion of xlat will be logged by memory_region_access_valid() + * when this memory access is rejected, so use the original untranslated + * physical address. + */ + assert((orig_addr & ~TARGET_PAGE_MASK) == 0); + *xlat = orig_addr; + return &d->map.sections[PHYS_SECTION_UNASSIGNED]; +} + +void cpu_address_space_init(CPUState *cpu, int asidx, + const char *prefix, MemoryRegion *mr) +{ + CPUAddressSpace *newas; + AddressSpace *as = g_new0(AddressSpace, 1); + char *as_name; + + assert(mr); + as_name = g_strdup_printf("%s-%d", prefix, cpu->cpu_index); + address_space_init(as, mr, as_name); + g_free(as_name); + + /* Target code should have set num_ases before calling us */ + assert(asidx < cpu->num_ases); + + if (asidx == 0) { + /* address space 0 gets the convenience alias */ + cpu->as = as; + } + + /* KVM cannot currently support multiple address spaces. */ + assert(asidx == 0 || !kvm_enabled()); + + if (!cpu->cpu_ases) { + cpu->cpu_ases = g_new0(CPUAddressSpace, cpu->num_ases); + } + + newas = &cpu->cpu_ases[asidx]; + newas->cpu = cpu; + newas->as = as; + if (tcg_enabled()) { + newas->tcg_as_listener.log_global_after_sync = tcg_log_global_after_sync; + newas->tcg_as_listener.commit = tcg_commit; + newas->tcg_as_listener.name = "tcg"; + memory_listener_register(&newas->tcg_as_listener, as); + } +} + +AddressSpace *cpu_get_address_space(CPUState *cpu, int asidx) +{ + /* Return the AddressSpace corresponding to the specified index */ + return cpu->cpu_ases[asidx].as; +} + +/* Called from RCU critical section */ +static RAMBlock *qemu_get_ram_block(ram_addr_t addr) +{ + RAMBlock *block; + + block = qatomic_rcu_read(&ram_list.mru_block); + if (block && addr - block->offset < block->max_length) { + return block; + } + RAMBLOCK_FOREACH(block) { + if (addr - block->offset < block->max_length) { + goto found; + } + } + + fprintf(stderr, "Bad ram offset %" PRIx64 "\n", (uint64_t)addr); + abort(); + +found: + /* It is safe to write mru_block outside the iothread lock. This + * is what happens: + * + * mru_block = xxx + * rcu_read_unlock() + * xxx removed from list + * rcu_read_lock() + * read mru_block + * mru_block = NULL; + * call_rcu(reclaim_ramblock, xxx); + * rcu_read_unlock() + * + * qatomic_rcu_set is not needed here. The block was already published + * when it was placed into the list. Here we're just making an extra + * copy of the pointer. + */ + ram_list.mru_block = block; + return block; +} + +static void tlb_reset_dirty_range_all(ram_addr_t start, ram_addr_t length) +{ + CPUState *cpu; + ram_addr_t start1; + RAMBlock *block; + ram_addr_t end; + + assert(tcg_enabled()); + end = TARGET_PAGE_ALIGN(start + length); + start &= TARGET_PAGE_MASK; + + RCU_READ_LOCK_GUARD(); + block = qemu_get_ram_block(start); + assert(block == qemu_get_ram_block(end - 1)); + start1 = (uintptr_t)ramblock_ptr(block, start - block->offset); + CPU_FOREACH(cpu) { + tlb_reset_dirty(cpu, start1, length); + } +} + +/* Note: start and end must be within the same ram block. */ +bool cpu_physical_memory_test_and_clear_dirty(ram_addr_t start, + ram_addr_t length, + unsigned client) +{ + DirtyMemoryBlocks *blocks; + unsigned long end, page, start_page; + bool dirty = false; + RAMBlock *ramblock; + uint64_t mr_offset, mr_size; + + if (length == 0) { + return false; + } + + end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS; + start_page = start >> TARGET_PAGE_BITS; + page = start_page; + + WITH_RCU_READ_LOCK_GUARD() { + blocks = qatomic_rcu_read(&ram_list.dirty_memory[client]); + ramblock = qemu_get_ram_block(start); + /* Range sanity check on the ramblock */ + assert(start >= ramblock->offset && + start + length <= ramblock->offset + ramblock->used_length); + + while (page < end) { + unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE; + unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE; + unsigned long num = MIN(end - page, + DIRTY_MEMORY_BLOCK_SIZE - offset); + + dirty |= bitmap_test_and_clear_atomic(blocks->blocks[idx], + offset, num); + page += num; + } + + mr_offset = (ram_addr_t)(start_page << TARGET_PAGE_BITS) - ramblock->offset; + mr_size = (end - start_page) << TARGET_PAGE_BITS; + memory_region_clear_dirty_bitmap(ramblock->mr, mr_offset, mr_size); + } + + if (dirty && tcg_enabled()) { + tlb_reset_dirty_range_all(start, length); + } + + return dirty; +} + +DirtyBitmapSnapshot *cpu_physical_memory_snapshot_and_clear_dirty + (MemoryRegion *mr, hwaddr offset, hwaddr length, unsigned client) +{ + DirtyMemoryBlocks *blocks; + ram_addr_t start = memory_region_get_ram_addr(mr) + offset; + unsigned long align = 1UL << (TARGET_PAGE_BITS + BITS_PER_LEVEL); + ram_addr_t first = QEMU_ALIGN_DOWN(start, align); + ram_addr_t last = QEMU_ALIGN_UP(start + length, align); + DirtyBitmapSnapshot *snap; + unsigned long page, end, dest; + + snap = g_malloc0(sizeof(*snap) + + ((last - first) >> (TARGET_PAGE_BITS + 3))); + snap->start = first; + snap->end = last; + + page = first >> TARGET_PAGE_BITS; + end = last >> TARGET_PAGE_BITS; + dest = 0; + + WITH_RCU_READ_LOCK_GUARD() { + blocks = qatomic_rcu_read(&ram_list.dirty_memory[client]); + + while (page < end) { + unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE; + unsigned long ofs = page % DIRTY_MEMORY_BLOCK_SIZE; + unsigned long num = MIN(end - page, + DIRTY_MEMORY_BLOCK_SIZE - ofs); + + assert(QEMU_IS_ALIGNED(ofs, (1 << BITS_PER_LEVEL))); + assert(QEMU_IS_ALIGNED(num, (1 << BITS_PER_LEVEL))); + ofs >>= BITS_PER_LEVEL; + + bitmap_copy_and_clear_atomic(snap->dirty + dest, + blocks->blocks[idx] + ofs, + num); + page += num; + dest += num >> BITS_PER_LEVEL; + } + } + + if (tcg_enabled()) { + tlb_reset_dirty_range_all(start, length); + } + + memory_region_clear_dirty_bitmap(mr, offset, length); + + return snap; +} + +bool cpu_physical_memory_snapshot_get_dirty(DirtyBitmapSnapshot *snap, + ram_addr_t start, + ram_addr_t length) +{ + unsigned long page, end; + + assert(start >= snap->start); + assert(start + length <= snap->end); + + end = TARGET_PAGE_ALIGN(start + length - snap->start) >> TARGET_PAGE_BITS; + page = (start - snap->start) >> TARGET_PAGE_BITS; + + while (page < end) { + if (test_bit(page, snap->dirty)) { + return true; + } + page++; + } + return false; +} + +/* Called from RCU critical section */ +hwaddr memory_region_section_get_iotlb(CPUState *cpu, + MemoryRegionSection *section) +{ + AddressSpaceDispatch *d = flatview_to_dispatch(section->fv); + return section - d->map.sections; +} + +static int subpage_register(subpage_t *mmio, uint32_t start, uint32_t end, + uint16_t section); +static subpage_t *subpage_init(FlatView *fv, hwaddr base); + +static uint16_t phys_section_add(PhysPageMap *map, + MemoryRegionSection *section) +{ + /* The physical section number is ORed with a page-aligned + * pointer to produce the iotlb entries. Thus it should + * never overflow into the page-aligned value. + */ + assert(map->sections_nb < TARGET_PAGE_SIZE); + + if (map->sections_nb == map->sections_nb_alloc) { + map->sections_nb_alloc = MAX(map->sections_nb_alloc * 2, 16); + map->sections = g_renew(MemoryRegionSection, map->sections, + map->sections_nb_alloc); + } + map->sections[map->sections_nb] = *section; + memory_region_ref(section->mr); + return map->sections_nb++; +} + +static void phys_section_destroy(MemoryRegion *mr) +{ + bool have_sub_page = mr->subpage; + + memory_region_unref(mr); + + if (have_sub_page) { + subpage_t *subpage = container_of(mr, subpage_t, iomem); + object_unref(OBJECT(&subpage->iomem)); + g_free(subpage); + } +} + +static void phys_sections_free(PhysPageMap *map) +{ + while (map->sections_nb > 0) { + MemoryRegionSection *section = &map->sections[--map->sections_nb]; + phys_section_destroy(section->mr); + } + g_free(map->sections); + g_free(map->nodes); +} + +static void register_subpage(FlatView *fv, MemoryRegionSection *section) +{ + AddressSpaceDispatch *d = flatview_to_dispatch(fv); + subpage_t *subpage; + hwaddr base = section->offset_within_address_space + & TARGET_PAGE_MASK; + MemoryRegionSection *existing = phys_page_find(d, base); + MemoryRegionSection subsection = { + .offset_within_address_space = base, + .size = int128_make64(TARGET_PAGE_SIZE), + }; + hwaddr start, end; + + assert(existing->mr->subpage || existing->mr == &io_mem_unassigned); + + if (!(existing->mr->subpage)) { + subpage = subpage_init(fv, base); + subsection.fv = fv; + subsection.mr = &subpage->iomem; + phys_page_set(d, base >> TARGET_PAGE_BITS, 1, + phys_section_add(&d->map, &subsection)); + } else { + subpage = container_of(existing->mr, subpage_t, iomem); + } + start = section->offset_within_address_space & ~TARGET_PAGE_MASK; + end = start + int128_get64(section->size) - 1; + subpage_register(subpage, start, end, + phys_section_add(&d->map, section)); +} + + +static void register_multipage(FlatView *fv, + MemoryRegionSection *section) +{ + AddressSpaceDispatch *d = flatview_to_dispatch(fv); + hwaddr start_addr = section->offset_within_address_space; + uint16_t section_index = phys_section_add(&d->map, section); + uint64_t num_pages = int128_get64(int128_rshift(section->size, + TARGET_PAGE_BITS)); + + assert(num_pages); + phys_page_set(d, start_addr >> TARGET_PAGE_BITS, num_pages, section_index); +} + +/* + * The range in *section* may look like this: + * + * |s|PPPPPPP|s| + * + * where s stands for subpage and P for page. + */ +void flatview_add_to_dispatch(FlatView *fv, MemoryRegionSection *section) +{ + MemoryRegionSection remain = *section; + Int128 page_size = int128_make64(TARGET_PAGE_SIZE); + + /* register first subpage */ + if (remain.offset_within_address_space & ~TARGET_PAGE_MASK) { + uint64_t left = TARGET_PAGE_ALIGN(remain.offset_within_address_space) + - remain.offset_within_address_space; + + MemoryRegionSection now = remain; + now.size = int128_min(int128_make64(left), now.size); + register_subpage(fv, &now); + if (int128_eq(remain.size, now.size)) { + return; + } + remain.size = int128_sub(remain.size, now.size); + remain.offset_within_address_space += int128_get64(now.size); + remain.offset_within_region += int128_get64(now.size); + } + + /* register whole pages */ + if (int128_ge(remain.size, page_size)) { + MemoryRegionSection now = remain; + now.size = int128_and(now.size, int128_neg(page_size)); + register_multipage(fv, &now); + if (int128_eq(remain.size, now.size)) { + return; + } + remain.size = int128_sub(remain.size, now.size); + remain.offset_within_address_space += int128_get64(now.size); + remain.offset_within_region += int128_get64(now.size); + } + + /* register last subpage */ + register_subpage(fv, &remain); +} + +void qemu_flush_coalesced_mmio_buffer(void) +{ + if (kvm_enabled()) + kvm_flush_coalesced_mmio_buffer(); +} + +void qemu_mutex_lock_ramlist(void) +{ + qemu_mutex_lock(&ram_list.mutex); +} + +void qemu_mutex_unlock_ramlist(void) +{ + qemu_mutex_unlock(&ram_list.mutex); +} + +GString *ram_block_format(void) +{ + RAMBlock *block; + char *psize; + GString *buf = g_string_new(""); + + RCU_READ_LOCK_GUARD(); + g_string_append_printf(buf, "%24s %8s %18s %18s %18s %18s %3s\n", + "Block Name", "PSize", "Offset", "Used", "Total", + "HVA", "RO"); + + RAMBLOCK_FOREACH(block) { + psize = size_to_str(block->page_size); + g_string_append_printf(buf, "%24s %8s 0x%016" PRIx64 " 0x%016" PRIx64 + " 0x%016" PRIx64 " 0x%016" PRIx64 " %3s\n", + block->idstr, psize, + (uint64_t)block->offset, + (uint64_t)block->used_length, + (uint64_t)block->max_length, + (uint64_t)(uintptr_t)block->host, + block->mr->readonly ? "ro" : "rw"); + + g_free(psize); + } + + return buf; +} + +static int find_min_backend_pagesize(Object *obj, void *opaque) +{ + long *hpsize_min = opaque; + + if (object_dynamic_cast(obj, TYPE_MEMORY_BACKEND)) { + HostMemoryBackend *backend = MEMORY_BACKEND(obj); + long hpsize = host_memory_backend_pagesize(backend); + + if (host_memory_backend_is_mapped(backend) && (hpsize < *hpsize_min)) { + *hpsize_min = hpsize; + } + } + + return 0; +} + +static int find_max_backend_pagesize(Object *obj, void *opaque) +{ + long *hpsize_max = opaque; + + if (object_dynamic_cast(obj, TYPE_MEMORY_BACKEND)) { + HostMemoryBackend *backend = MEMORY_BACKEND(obj); + long hpsize = host_memory_backend_pagesize(backend); + + if (host_memory_backend_is_mapped(backend) && (hpsize > *hpsize_max)) { + *hpsize_max = hpsize; + } + } + + return 0; +} + +/* + * TODO: We assume right now that all mapped host memory backends are + * used as RAM, however some might be used for different purposes. + */ +long qemu_minrampagesize(void) +{ + long hpsize = LONG_MAX; + Object *memdev_root = object_resolve_path("/objects", NULL); + + object_child_foreach(memdev_root, find_min_backend_pagesize, &hpsize); + return hpsize; +} + +long qemu_maxrampagesize(void) +{ + long pagesize = 0; + Object *memdev_root = object_resolve_path("/objects", NULL); + + object_child_foreach(memdev_root, find_max_backend_pagesize, &pagesize); + return pagesize; +} + +#ifdef CONFIG_POSIX +static int64_t get_file_size(int fd) +{ + int64_t size; +#if defined(__linux__) + struct stat st; + + if (fstat(fd, &st) < 0) { + return -errno; + } + + /* Special handling for devdax character devices */ + if (S_ISCHR(st.st_mode)) { + g_autofree char *subsystem_path = NULL; + g_autofree char *subsystem = NULL; + + subsystem_path = g_strdup_printf("/sys/dev/char/%d:%d/subsystem", + major(st.st_rdev), minor(st.st_rdev)); + subsystem = g_file_read_link(subsystem_path, NULL); + + if (subsystem && g_str_has_suffix(subsystem, "/dax")) { + g_autofree char *size_path = NULL; + g_autofree char *size_str = NULL; + + size_path = g_strdup_printf("/sys/dev/char/%d:%d/size", + major(st.st_rdev), minor(st.st_rdev)); + + if (g_file_get_contents(size_path, &size_str, NULL, NULL)) { + return g_ascii_strtoll(size_str, NULL, 0); + } + } + } +#endif /* defined(__linux__) */ + + /* st.st_size may be zero for special files yet lseek(2) works */ + size = lseek(fd, 0, SEEK_END); + if (size < 0) { + return -errno; + } + return size; +} + +static int64_t get_file_align(int fd) +{ + int64_t align = -1; +#if defined(__linux__) && defined(CONFIG_LIBDAXCTL) + struct stat st; + + if (fstat(fd, &st) < 0) { + return -errno; + } + + /* Special handling for devdax character devices */ + if (S_ISCHR(st.st_mode)) { + g_autofree char *path = NULL; + g_autofree char *rpath = NULL; + struct daxctl_ctx *ctx; + struct daxctl_region *region; + int rc = 0; + + path = g_strdup_printf("/sys/dev/char/%d:%d", + major(st.st_rdev), minor(st.st_rdev)); + rpath = realpath(path, NULL); + if (!rpath) { + return -errno; + } + + rc = daxctl_new(&ctx); + if (rc) { + return -1; + } + + daxctl_region_foreach(ctx, region) { + if (strstr(rpath, daxctl_region_get_path(region))) { + align = daxctl_region_get_align(region); + break; + } + } + daxctl_unref(ctx); + } +#endif /* defined(__linux__) && defined(CONFIG_LIBDAXCTL) */ + + return align; +} + +static int file_ram_open(const char *path, + const char *region_name, + bool readonly, + bool *created) +{ + char *filename; + char *sanitized_name; + char *c; + int fd = -1; + + *created = false; + for (;;) { + fd = open(path, readonly ? O_RDONLY : O_RDWR); + if (fd >= 0) { + /* + * open(O_RDONLY) won't fail with EISDIR. Check manually if we + * opened a directory and fail similarly to how we fail ENOENT + * in readonly mode. Note that mkstemp() would imply O_RDWR. + */ + if (readonly) { + struct stat file_stat; + + if (fstat(fd, &file_stat)) { + close(fd); + if (errno == EINTR) { + continue; + } + return -errno; + } else if (S_ISDIR(file_stat.st_mode)) { + close(fd); + return -EISDIR; + } + } + /* @path names an existing file, use it */ + break; + } + if (errno == ENOENT) { + if (readonly) { + /* Refuse to create new, readonly files. */ + return -ENOENT; + } + /* @path names a file that doesn't exist, create it */ + fd = open(path, O_RDWR | O_CREAT | O_EXCL, 0644); + if (fd >= 0) { + *created = true; + break; + } + } else if (errno == EISDIR) { + /* @path names a directory, create a file there */ + /* Make name safe to use with mkstemp by replacing '/' with '_'. */ + sanitized_name = g_strdup(region_name); + for (c = sanitized_name; *c != '\0'; c++) { + if (*c == '/') { + *c = '_'; + } + } + + filename = g_strdup_printf("%s/qemu_back_mem.%s.XXXXXX", path, + sanitized_name); + g_free(sanitized_name); + + fd = mkstemp(filename); + if (fd >= 0) { + unlink(filename); + g_free(filename); + break; + } + g_free(filename); + } + if (errno != EEXIST && errno != EINTR) { + return -errno; + } + /* + * Try again on EINTR and EEXIST. The latter happens when + * something else creates the file between our two open(). + */ + } + + return fd; +} + +static void *file_ram_alloc(RAMBlock *block, + ram_addr_t memory, + int fd, + bool truncate, + off_t offset, + Error **errp) +{ + uint32_t qemu_map_flags; + void *area; + + block->page_size = qemu_fd_getpagesize(fd); + if (block->mr->align % block->page_size) { + error_setg(errp, "alignment 0x%" PRIx64 + " must be multiples of page size 0x%zx", + block->mr->align, block->page_size); + return NULL; + } else if (block->mr->align && !is_power_of_2(block->mr->align)) { + error_setg(errp, "alignment 0x%" PRIx64 + " must be a power of two", block->mr->align); + return NULL; + } else if (offset % block->page_size) { + error_setg(errp, "offset 0x%" PRIx64 + " must be multiples of page size 0x%zx", + offset, block->page_size); + return NULL; + } + block->mr->align = MAX(block->page_size, block->mr->align); +#if defined(__s390x__) + if (kvm_enabled()) { + block->mr->align = MAX(block->mr->align, QEMU_VMALLOC_ALIGN); + } +#endif + + if (memory < block->page_size) { + error_setg(errp, "memory size 0x" RAM_ADDR_FMT " must be equal to " + "or larger than page size 0x%zx", + memory, block->page_size); + return NULL; + } + + memory = ROUND_UP(memory, block->page_size); + + /* + * ftruncate is not supported by hugetlbfs in older + * hosts, so don't bother bailing out on errors. + * If anything goes wrong with it under other filesystems, + * mmap will fail. + * + * Do not truncate the non-empty backend file to avoid corrupting + * the existing data in the file. Disabling shrinking is not + * enough. For example, the current vNVDIMM implementation stores + * the guest NVDIMM labels at the end of the backend file. If the + * backend file is later extended, QEMU will not be able to find + * those labels. Therefore, extending the non-empty backend file + * is disabled as well. + */ + if (truncate && ftruncate(fd, offset + memory)) { + perror("ftruncate"); + } + + qemu_map_flags = (block->flags & RAM_READONLY) ? QEMU_MAP_READONLY : 0; + qemu_map_flags |= (block->flags & RAM_SHARED) ? QEMU_MAP_SHARED : 0; + qemu_map_flags |= (block->flags & RAM_PMEM) ? QEMU_MAP_SYNC : 0; + qemu_map_flags |= (block->flags & RAM_NORESERVE) ? QEMU_MAP_NORESERVE : 0; + area = qemu_ram_mmap(fd, memory, block->mr->align, qemu_map_flags, offset); + if (area == MAP_FAILED) { + error_setg_errno(errp, errno, + "unable to map backing store for guest RAM"); + return NULL; + } + + block->fd = fd; + block->fd_offset = offset; + return area; +} +#endif + +/* Allocate space within the ram_addr_t space that governs the + * dirty bitmaps. + * Called with the ramlist lock held. + */ +static ram_addr_t find_ram_offset(ram_addr_t size) +{ + RAMBlock *block, *next_block; + ram_addr_t offset = RAM_ADDR_MAX, mingap = RAM_ADDR_MAX; + + assert(size != 0); /* it would hand out same offset multiple times */ + + if (QLIST_EMPTY_RCU(&ram_list.blocks)) { + return 0; + } + + RAMBLOCK_FOREACH(block) { + ram_addr_t candidate, next = RAM_ADDR_MAX; + + /* Align blocks to start on a 'long' in the bitmap + * which makes the bitmap sync'ing take the fast path. + */ + candidate = block->offset + block->max_length; + candidate = ROUND_UP(candidate, BITS_PER_LONG << TARGET_PAGE_BITS); + + /* Search for the closest following block + * and find the gap. + */ + RAMBLOCK_FOREACH(next_block) { + if (next_block->offset >= candidate) { + next = MIN(next, next_block->offset); + } + } + + /* If it fits remember our place and remember the size + * of gap, but keep going so that we might find a smaller + * gap to fill so avoiding fragmentation. + */ + if (next - candidate >= size && next - candidate < mingap) { + offset = candidate; + mingap = next - candidate; + } + + trace_find_ram_offset_loop(size, candidate, offset, next, mingap); + } + + if (offset == RAM_ADDR_MAX) { + fprintf(stderr, "Failed to find gap of requested size: %" PRIu64 "\n", + (uint64_t)size); + abort(); + } + + trace_find_ram_offset(size, offset); + + return offset; +} + +static unsigned long last_ram_page(void) +{ + RAMBlock *block; + ram_addr_t last = 0; + + RCU_READ_LOCK_GUARD(); + RAMBLOCK_FOREACH(block) { + last = MAX(last, block->offset + block->max_length); + } + return last >> TARGET_PAGE_BITS; +} + +static void qemu_ram_setup_dump(void *addr, ram_addr_t size) +{ + int ret; + + /* Use MADV_DONTDUMP, if user doesn't want the guest memory in the core */ + if (!machine_dump_guest_core(current_machine)) { + ret = qemu_madvise(addr, size, QEMU_MADV_DONTDUMP); + if (ret) { + perror("qemu_madvise"); + fprintf(stderr, "madvise doesn't support MADV_DONTDUMP, " + "but dump_guest_core=off specified\n"); + } + } +} + +const char *qemu_ram_get_idstr(RAMBlock *rb) +{ + return rb->idstr; +} + +void *qemu_ram_get_host_addr(RAMBlock *rb) +{ + return rb->host; +} + +ram_addr_t qemu_ram_get_offset(RAMBlock *rb) +{ + return rb->offset; +} + +ram_addr_t qemu_ram_get_used_length(RAMBlock *rb) +{ + return rb->used_length; +} + +ram_addr_t qemu_ram_get_max_length(RAMBlock *rb) +{ + return rb->max_length; +} + +bool qemu_ram_is_shared(RAMBlock *rb) +{ + return rb->flags & RAM_SHARED; +} + +bool qemu_ram_is_noreserve(RAMBlock *rb) +{ + return rb->flags & RAM_NORESERVE; +} + +/* Note: Only set at the start of postcopy */ +bool qemu_ram_is_uf_zeroable(RAMBlock *rb) +{ + return rb->flags & RAM_UF_ZEROPAGE; +} + +void qemu_ram_set_uf_zeroable(RAMBlock *rb) +{ + rb->flags |= RAM_UF_ZEROPAGE; +} + +bool qemu_ram_is_migratable(RAMBlock *rb) +{ + return rb->flags & RAM_MIGRATABLE; +} + +void qemu_ram_set_migratable(RAMBlock *rb) +{ + rb->flags |= RAM_MIGRATABLE; +} + +void qemu_ram_unset_migratable(RAMBlock *rb) +{ + rb->flags &= ~RAM_MIGRATABLE; +} + +bool qemu_ram_is_named_file(RAMBlock *rb) +{ + return rb->flags & RAM_NAMED_FILE; +} + +int qemu_ram_get_fd(RAMBlock *rb) +{ + return rb->fd; +} + +/* Called with iothread lock held. */ +void qemu_ram_set_idstr(RAMBlock *new_block, const char *name, DeviceState *dev) +{ + RAMBlock *block; + + assert(new_block); + assert(!new_block->idstr[0]); + + if (dev) { + char *id = qdev_get_dev_path(dev); + if (id) { + snprintf(new_block->idstr, sizeof(new_block->idstr), "%s/", id); + g_free(id); + } + } + pstrcat(new_block->idstr, sizeof(new_block->idstr), name); + + RCU_READ_LOCK_GUARD(); + RAMBLOCK_FOREACH(block) { + if (block != new_block && + !strcmp(block->idstr, new_block->idstr)) { + fprintf(stderr, "RAMBlock \"%s\" already registered, abort!\n", + new_block->idstr); + abort(); + } + } +} + +/* Called with iothread lock held. */ +void qemu_ram_unset_idstr(RAMBlock *block) +{ + /* FIXME: arch_init.c assumes that this is not called throughout + * migration. Ignore the problem since hot-unplug during migration + * does not work anyway. + */ + if (block) { + memset(block->idstr, 0, sizeof(block->idstr)); + } +} + +size_t qemu_ram_pagesize(RAMBlock *rb) +{ + return rb->page_size; +} + +/* Returns the largest size of page in use */ +size_t qemu_ram_pagesize_largest(void) +{ + RAMBlock *block; + size_t largest = 0; + + RAMBLOCK_FOREACH(block) { + largest = MAX(largest, qemu_ram_pagesize(block)); + } + + return largest; +} + +static int memory_try_enable_merging(void *addr, size_t len) +{ + if (!machine_mem_merge(current_machine)) { + /* disabled by the user */ + return 0; + } + + return qemu_madvise(addr, len, QEMU_MADV_MERGEABLE); +} + +/* + * Resizing RAM while migrating can result in the migration being canceled. + * Care has to be taken if the guest might have already detected the memory. + * + * As memory core doesn't know how is memory accessed, it is up to + * resize callback to update device state and/or add assertions to detect + * misuse, if necessary. + */ +int qemu_ram_resize(RAMBlock *block, ram_addr_t newsize, Error **errp) +{ + const ram_addr_t oldsize = block->used_length; + const ram_addr_t unaligned_size = newsize; + + assert(block); + + newsize = HOST_PAGE_ALIGN(newsize); + + if (block->used_length == newsize) { + /* + * We don't have to resize the ram block (which only knows aligned + * sizes), however, we have to notify if the unaligned size changed. + */ + if (unaligned_size != memory_region_size(block->mr)) { + memory_region_set_size(block->mr, unaligned_size); + if (block->resized) { + block->resized(block->idstr, unaligned_size, block->host); + } + } + return 0; + } + + if (!(block->flags & RAM_RESIZEABLE)) { + error_setg_errno(errp, EINVAL, + "Size mismatch: %s: 0x" RAM_ADDR_FMT + " != 0x" RAM_ADDR_FMT, block->idstr, + newsize, block->used_length); + return -EINVAL; + } + + if (block->max_length < newsize) { + error_setg_errno(errp, EINVAL, + "Size too large: %s: 0x" RAM_ADDR_FMT + " > 0x" RAM_ADDR_FMT, block->idstr, + newsize, block->max_length); + return -EINVAL; + } + + /* Notify before modifying the ram block and touching the bitmaps. */ + if (block->host) { + ram_block_notify_resize(block->host, oldsize, newsize); + } + + cpu_physical_memory_clear_dirty_range(block->offset, block->used_length); + block->used_length = newsize; + cpu_physical_memory_set_dirty_range(block->offset, block->used_length, + DIRTY_CLIENTS_ALL); + memory_region_set_size(block->mr, unaligned_size); + if (block->resized) { + block->resized(block->idstr, unaligned_size, block->host); + } + return 0; +} + +/* + * Trigger sync on the given ram block for range [start, start + length] + * with the backing store if one is available. + * Otherwise no-op. + * @Note: this is supposed to be a synchronous op. + */ +void qemu_ram_msync(RAMBlock *block, ram_addr_t start, ram_addr_t length) +{ + /* The requested range should fit in within the block range */ + g_assert((start + length) <= block->used_length); + +#ifdef CONFIG_LIBPMEM + /* The lack of support for pmem should not block the sync */ + if (ramblock_is_pmem(block)) { + void *addr = ramblock_ptr(block, start); + pmem_persist(addr, length); + return; + } +#endif + if (block->fd >= 0) { + /** + * Case there is no support for PMEM or the memory has not been + * specified as persistent (or is not one) - use the msync. + * Less optimal but still achieves the same goal + */ + void *addr = ramblock_ptr(block, start); + if (qemu_msync(addr, length, block->fd)) { + warn_report("%s: failed to sync memory range: start: " + RAM_ADDR_FMT " length: " RAM_ADDR_FMT, + __func__, start, length); + } + } +} + +/* Called with ram_list.mutex held */ +static void dirty_memory_extend(ram_addr_t old_ram_size, + ram_addr_t new_ram_size) +{ + ram_addr_t old_num_blocks = DIV_ROUND_UP(old_ram_size, + DIRTY_MEMORY_BLOCK_SIZE); + ram_addr_t new_num_blocks = DIV_ROUND_UP(new_ram_size, + DIRTY_MEMORY_BLOCK_SIZE); + int i; + + /* Only need to extend if block count increased */ + if (new_num_blocks <= old_num_blocks) { + return; + } + + for (i = 0; i < DIRTY_MEMORY_NUM; i++) { + DirtyMemoryBlocks *old_blocks; + DirtyMemoryBlocks *new_blocks; + int j; + + old_blocks = qatomic_rcu_read(&ram_list.dirty_memory[i]); + new_blocks = g_malloc(sizeof(*new_blocks) + + sizeof(new_blocks->blocks[0]) * new_num_blocks); + + if (old_num_blocks) { + memcpy(new_blocks->blocks, old_blocks->blocks, + old_num_blocks * sizeof(old_blocks->blocks[0])); + } + + for (j = old_num_blocks; j < new_num_blocks; j++) { + new_blocks->blocks[j] = bitmap_new(DIRTY_MEMORY_BLOCK_SIZE); + } + + qatomic_rcu_set(&ram_list.dirty_memory[i], new_blocks); + + if (old_blocks) { + g_free_rcu(old_blocks, rcu); + } + } +} + +static void ram_block_add(RAMBlock *new_block, Error **errp) +{ + const bool noreserve = qemu_ram_is_noreserve(new_block); + const bool shared = qemu_ram_is_shared(new_block); + RAMBlock *block; + RAMBlock *last_block = NULL; + ram_addr_t old_ram_size, new_ram_size; + Error *err = NULL; + + old_ram_size = last_ram_page(); + + qemu_mutex_lock_ramlist(); + new_block->offset = find_ram_offset(new_block->max_length); + + if (!new_block->host) { + if (xen_enabled()) { + xen_ram_alloc(new_block->offset, new_block->max_length, + new_block->mr, &err); + if (err) { + error_propagate(errp, err); + qemu_mutex_unlock_ramlist(); + return; + } + } else { + new_block->host = qemu_anon_ram_alloc(new_block->max_length, + &new_block->mr->align, + shared, noreserve); + if (!new_block->host) { + error_setg_errno(errp, errno, + "cannot set up guest memory '%s'", + memory_region_name(new_block->mr)); + qemu_mutex_unlock_ramlist(); + return; + } + memory_try_enable_merging(new_block->host, new_block->max_length); + } + } + + new_ram_size = MAX(old_ram_size, + (new_block->offset + new_block->max_length) >> TARGET_PAGE_BITS); + if (new_ram_size > old_ram_size) { + dirty_memory_extend(old_ram_size, new_ram_size); + } + /* Keep the list sorted from biggest to smallest block. Unlike QTAILQ, + * QLIST (which has an RCU-friendly variant) does not have insertion at + * tail, so save the last element in last_block. + */ + RAMBLOCK_FOREACH(block) { + last_block = block; + if (block->max_length < new_block->max_length) { + break; + } + } + if (block) { + QLIST_INSERT_BEFORE_RCU(block, new_block, next); + } else if (last_block) { + QLIST_INSERT_AFTER_RCU(last_block, new_block, next); + } else { /* list is empty */ + QLIST_INSERT_HEAD_RCU(&ram_list.blocks, new_block, next); + } + ram_list.mru_block = NULL; + + /* Write list before version */ + smp_wmb(); + ram_list.version++; + qemu_mutex_unlock_ramlist(); + + cpu_physical_memory_set_dirty_range(new_block->offset, + new_block->used_length, + DIRTY_CLIENTS_ALL); + + if (new_block->host) { + qemu_ram_setup_dump(new_block->host, new_block->max_length); + qemu_madvise(new_block->host, new_block->max_length, QEMU_MADV_HUGEPAGE); + /* + * MADV_DONTFORK is also needed by KVM in absence of synchronous MMU + * Configure it unless the machine is a qtest server, in which case + * KVM is not used and it may be forked (eg for fuzzing purposes). + */ + if (!qtest_enabled()) { + qemu_madvise(new_block->host, new_block->max_length, + QEMU_MADV_DONTFORK); + } + ram_block_notify_add(new_block->host, new_block->used_length, + new_block->max_length); + } +} + +#ifdef CONFIG_POSIX +RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr, + uint32_t ram_flags, int fd, off_t offset, + Error **errp) +{ + RAMBlock *new_block; + Error *local_err = NULL; + int64_t file_size, file_align; + + /* Just support these ram flags by now. */ + assert((ram_flags & ~(RAM_SHARED | RAM_PMEM | RAM_NORESERVE | + RAM_PROTECTED | RAM_NAMED_FILE | RAM_READONLY | + RAM_READONLY_FD)) == 0); + + if (xen_enabled()) { + error_setg(errp, "-mem-path not supported with Xen"); + return NULL; + } + + if (kvm_enabled() && !kvm_has_sync_mmu()) { + error_setg(errp, + "host lacks kvm mmu notifiers, -mem-path unsupported"); + return NULL; + } + + size = HOST_PAGE_ALIGN(size); + file_size = get_file_size(fd); + if (file_size > offset && file_size < (offset + size)) { + error_setg(errp, "backing store size 0x%" PRIx64 + " does not match 'size' option 0x" RAM_ADDR_FMT, + file_size, size); + return NULL; + } + + file_align = get_file_align(fd); + if (file_align > 0 && file_align > mr->align) { + error_setg(errp, "backing store align 0x%" PRIx64 + " is larger than 'align' option 0x%" PRIx64, + file_align, mr->align); + return NULL; + } + + new_block = g_malloc0(sizeof(*new_block)); + new_block->mr = mr; + new_block->used_length = size; + new_block->max_length = size; + new_block->flags = ram_flags; + new_block->host = file_ram_alloc(new_block, size, fd, !file_size, offset, + errp); + if (!new_block->host) { + g_free(new_block); + return NULL; + } + + ram_block_add(new_block, &local_err); + if (local_err) { + g_free(new_block); + error_propagate(errp, local_err); + return NULL; + } + return new_block; + +} + + +RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr, + uint32_t ram_flags, const char *mem_path, + off_t offset, Error **errp) +{ + int fd; + bool created; + RAMBlock *block; + + fd = file_ram_open(mem_path, memory_region_name(mr), + !!(ram_flags & RAM_READONLY_FD), &created); + if (fd < 0) { + error_setg_errno(errp, -fd, "can't open backing store %s for guest RAM", + mem_path); + if (!(ram_flags & RAM_READONLY_FD) && !(ram_flags & RAM_SHARED) && + fd == -EACCES) { + /* + * If we can open the file R/O (note: will never create a new file) + * and we are dealing with a private mapping, there are still ways + * to consume such files and get RAM instead of ROM. + */ + fd = file_ram_open(mem_path, memory_region_name(mr), true, + &created); + if (fd < 0) { + return NULL; + } + assert(!created); + close(fd); + error_append_hint(errp, "Consider opening the backing store" + " read-only but still creating writable RAM using" + " '-object memory-backend-file,readonly=on,rom=off...'" + " (see \"VM templating\" documentation)\n"); + } + return NULL; + } + + block = qemu_ram_alloc_from_fd(size, mr, ram_flags, fd, offset, errp); + if (!block) { + if (created) { + unlink(mem_path); + } + close(fd); + return NULL; + } + + return block; +} +#endif + +static +RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size, + void (*resized)(const char*, + uint64_t length, + void *host), + void *host, uint32_t ram_flags, + MemoryRegion *mr, Error **errp) +{ + RAMBlock *new_block; + Error *local_err = NULL; + + assert((ram_flags & ~(RAM_SHARED | RAM_RESIZEABLE | RAM_PREALLOC | + RAM_NORESERVE)) == 0); + assert(!host ^ (ram_flags & RAM_PREALLOC)); + + size = HOST_PAGE_ALIGN(size); + max_size = HOST_PAGE_ALIGN(max_size); + new_block = g_malloc0(sizeof(*new_block)); + new_block->mr = mr; + new_block->resized = resized; + new_block->used_length = size; + new_block->max_length = max_size; + assert(max_size >= size); + new_block->fd = -1; + new_block->page_size = qemu_real_host_page_size(); + new_block->host = host; + new_block->flags = ram_flags; + ram_block_add(new_block, &local_err); + if (local_err) { + g_free(new_block); + error_propagate(errp, local_err); + return NULL; + } + return new_block; +} + +RAMBlock *qemu_ram_alloc_from_ptr(ram_addr_t size, void *host, + MemoryRegion *mr, Error **errp) +{ + return qemu_ram_alloc_internal(size, size, NULL, host, RAM_PREALLOC, mr, + errp); +} + +RAMBlock *qemu_ram_alloc(ram_addr_t size, uint32_t ram_flags, + MemoryRegion *mr, Error **errp) +{ + assert((ram_flags & ~(RAM_SHARED | RAM_NORESERVE)) == 0); + return qemu_ram_alloc_internal(size, size, NULL, NULL, ram_flags, mr, errp); +} + +RAMBlock *qemu_ram_alloc_resizeable(ram_addr_t size, ram_addr_t maxsz, + void (*resized)(const char*, + uint64_t length, + void *host), + MemoryRegion *mr, Error **errp) +{ + return qemu_ram_alloc_internal(size, maxsz, resized, NULL, + RAM_RESIZEABLE, mr, errp); +} + +static void reclaim_ramblock(RAMBlock *block) +{ + if (block->flags & RAM_PREALLOC) { + ; + } else if (xen_enabled()) { + xen_invalidate_map_cache_entry(block->host); +#ifndef _WIN32 + } else if (block->fd >= 0) { + qemu_ram_munmap(block->fd, block->host, block->max_length); + close(block->fd); +#endif + } else { + qemu_anon_ram_free(block->host, block->max_length); + } + g_free(block); +} + +void qemu_ram_free(RAMBlock *block) +{ + if (!block) { + return; + } + + if (block->host) { + ram_block_notify_remove(block->host, block->used_length, + block->max_length); + } + + qemu_mutex_lock_ramlist(); + QLIST_REMOVE_RCU(block, next); + ram_list.mru_block = NULL; + /* Write list before version */ + smp_wmb(); + ram_list.version++; + call_rcu(block, reclaim_ramblock, rcu); + qemu_mutex_unlock_ramlist(); +} + +#ifndef _WIN32 +void qemu_ram_remap(ram_addr_t addr, ram_addr_t length) +{ + RAMBlock *block; + ram_addr_t offset; + int flags; + void *area, *vaddr; + int prot; + + RAMBLOCK_FOREACH(block) { + offset = addr - block->offset; + if (offset < block->max_length) { + vaddr = ramblock_ptr(block, offset); + if (block->flags & RAM_PREALLOC) { + ; + } else if (xen_enabled()) { + abort(); + } else { + flags = MAP_FIXED; + flags |= block->flags & RAM_SHARED ? + MAP_SHARED : MAP_PRIVATE; + flags |= block->flags & RAM_NORESERVE ? MAP_NORESERVE : 0; + prot = PROT_READ; + prot |= block->flags & RAM_READONLY ? 0 : PROT_WRITE; + if (block->fd >= 0) { + area = mmap(vaddr, length, prot, flags, block->fd, + offset + block->fd_offset); + } else { + flags |= MAP_ANONYMOUS; + area = mmap(vaddr, length, prot, flags, -1, 0); + } + if (area != vaddr) { + error_report("Could not remap addr: " + RAM_ADDR_FMT "@" RAM_ADDR_FMT "", + length, addr); + exit(1); + } + memory_try_enable_merging(vaddr, length); + qemu_ram_setup_dump(vaddr, length); + } + } + } +} +#endif /* !_WIN32 */ + +/* Return a host pointer to ram allocated with qemu_ram_alloc. + * This should not be used for general purpose DMA. Use address_space_map + * or address_space_rw instead. For local memory (e.g. video ram) that the + * device owns, use memory_region_get_ram_ptr. + * + * Called within RCU critical section. + */ +void *qemu_map_ram_ptr(RAMBlock *ram_block, ram_addr_t addr) +{ + RAMBlock *block = ram_block; + + if (block == NULL) { + block = qemu_get_ram_block(addr); + addr -= block->offset; + } + + if (xen_enabled() && block->host == NULL) { + /* We need to check if the requested address is in the RAM + * because we don't want to map the entire memory in QEMU. + * In that case just map until the end of the page. + */ + if (block->offset == 0) { + return xen_map_cache(addr, 0, 0, false); + } + + block->host = xen_map_cache(block->offset, block->max_length, 1, false); + } + return ramblock_ptr(block, addr); +} + +/* Return a host pointer to guest's ram. Similar to qemu_map_ram_ptr + * but takes a size argument. + * + * Called within RCU critical section. + */ +static void *qemu_ram_ptr_length(RAMBlock *ram_block, ram_addr_t addr, + hwaddr *size, bool lock) +{ + RAMBlock *block = ram_block; + if (*size == 0) { + return NULL; + } + + if (block == NULL) { + block = qemu_get_ram_block(addr); + addr -= block->offset; + } + *size = MIN(*size, block->max_length - addr); + + if (xen_enabled() && block->host == NULL) { + /* We need to check if the requested address is in the RAM + * because we don't want to map the entire memory in QEMU. + * In that case just map the requested area. + */ + if (block->offset == 0) { + return xen_map_cache(addr, *size, lock, lock); + } + + block->host = xen_map_cache(block->offset, block->max_length, 1, lock); + } + + return ramblock_ptr(block, addr); +} + +/* Return the offset of a hostpointer within a ramblock */ +ram_addr_t qemu_ram_block_host_offset(RAMBlock *rb, void *host) +{ + ram_addr_t res = (uint8_t *)host - (uint8_t *)rb->host; + assert((uintptr_t)host >= (uintptr_t)rb->host); + assert(res < rb->max_length); + + return res; +} + +/* + * Translates a host ptr back to a RAMBlock, a ram_addr and an offset + * in that RAMBlock. + * + * ptr: Host pointer to look up + * round_offset: If true round the result offset down to a page boundary + * *ram_addr: set to result ram_addr + * *offset: set to result offset within the RAMBlock + * + * Returns: RAMBlock (or NULL if not found) + * + * By the time this function returns, the returned pointer is not protected + * by RCU anymore. If the caller is not within an RCU critical section and + * does not hold the iothread lock, it must have other means of protecting the + * pointer, such as a reference to the region that includes the incoming + * ram_addr_t. + */ +RAMBlock *qemu_ram_block_from_host(void *ptr, bool round_offset, + ram_addr_t *offset) +{ + RAMBlock *block; + uint8_t *host = ptr; + + if (xen_enabled()) { + ram_addr_t ram_addr; + RCU_READ_LOCK_GUARD(); + ram_addr = xen_ram_addr_from_mapcache(ptr); + block = qemu_get_ram_block(ram_addr); + if (block) { + *offset = ram_addr - block->offset; + } + return block; + } + + RCU_READ_LOCK_GUARD(); + block = qatomic_rcu_read(&ram_list.mru_block); + if (block && block->host && host - block->host < block->max_length) { + goto found; + } + + RAMBLOCK_FOREACH(block) { + /* This case append when the block is not mapped. */ + if (block->host == NULL) { + continue; + } + if (host - block->host < block->max_length) { + goto found; + } + } + + return NULL; + +found: + *offset = (host - block->host); + if (round_offset) { + *offset &= TARGET_PAGE_MASK; + } + return block; +} + +/* + * Finds the named RAMBlock + * + * name: The name of RAMBlock to find + * + * Returns: RAMBlock (or NULL if not found) + */ +RAMBlock *qemu_ram_block_by_name(const char *name) +{ + RAMBlock *block; + + RAMBLOCK_FOREACH(block) { + if (!strcmp(name, block->idstr)) { + return block; + } + } + + return NULL; +} + +/* + * Some of the system routines need to translate from a host pointer + * (typically a TLB entry) back to a ram offset. + */ +ram_addr_t qemu_ram_addr_from_host(void *ptr) +{ + RAMBlock *block; + ram_addr_t offset; + + block = qemu_ram_block_from_host(ptr, false, &offset); + if (!block) { + return RAM_ADDR_INVALID; + } + + return block->offset + offset; +} + +ram_addr_t qemu_ram_addr_from_host_nofail(void *ptr) +{ + ram_addr_t ram_addr; + + ram_addr = qemu_ram_addr_from_host(ptr); + if (ram_addr == RAM_ADDR_INVALID) { + error_report("Bad ram pointer %p", ptr); + abort(); + } + return ram_addr; +} + +static MemTxResult flatview_read(FlatView *fv, hwaddr addr, + MemTxAttrs attrs, void *buf, hwaddr len); +static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs, + const void *buf, hwaddr len); +static bool flatview_access_valid(FlatView *fv, hwaddr addr, hwaddr len, + bool is_write, MemTxAttrs attrs); + +static MemTxResult subpage_read(void *opaque, hwaddr addr, uint64_t *data, + unsigned len, MemTxAttrs attrs) +{ + subpage_t *subpage = opaque; + uint8_t buf[8]; + MemTxResult res; + +#if defined(DEBUG_SUBPAGE) + printf("%s: subpage %p len %u addr " HWADDR_FMT_plx "\n", __func__, + subpage, len, addr); +#endif + res = flatview_read(subpage->fv, addr + subpage->base, attrs, buf, len); + if (res) { + return res; + } + *data = ldn_p(buf, len); + return MEMTX_OK; +} + +static MemTxResult subpage_write(void *opaque, hwaddr addr, + uint64_t value, unsigned len, MemTxAttrs attrs) +{ + subpage_t *subpage = opaque; + uint8_t buf[8]; + +#if defined(DEBUG_SUBPAGE) + printf("%s: subpage %p len %u addr " HWADDR_FMT_plx + " value %"PRIx64"\n", + __func__, subpage, len, addr, value); +#endif + stn_p(buf, len, value); + return flatview_write(subpage->fv, addr + subpage->base, attrs, buf, len); +} + +static bool subpage_accepts(void *opaque, hwaddr addr, + unsigned len, bool is_write, + MemTxAttrs attrs) +{ + subpage_t *subpage = opaque; +#if defined(DEBUG_SUBPAGE) + printf("%s: subpage %p %c len %u addr " HWADDR_FMT_plx "\n", + __func__, subpage, is_write ? 'w' : 'r', len, addr); +#endif + + return flatview_access_valid(subpage->fv, addr + subpage->base, + len, is_write, attrs); +} + +static const MemoryRegionOps subpage_ops = { + .read_with_attrs = subpage_read, + .write_with_attrs = subpage_write, + .impl.min_access_size = 1, + .impl.max_access_size = 8, + .valid.min_access_size = 1, + .valid.max_access_size = 8, + .valid.accepts = subpage_accepts, + .endianness = DEVICE_NATIVE_ENDIAN, +}; + +static int subpage_register(subpage_t *mmio, uint32_t start, uint32_t end, + uint16_t section) +{ + int idx, eidx; + + if (start >= TARGET_PAGE_SIZE || end >= TARGET_PAGE_SIZE) + return -1; + idx = SUBPAGE_IDX(start); + eidx = SUBPAGE_IDX(end); +#if defined(DEBUG_SUBPAGE) + printf("%s: %p start %08x end %08x idx %08x eidx %08x section %d\n", + __func__, mmio, start, end, idx, eidx, section); +#endif + for (; idx <= eidx; idx++) { + mmio->sub_section[idx] = section; + } + + return 0; +} + +static subpage_t *subpage_init(FlatView *fv, hwaddr base) +{ + subpage_t *mmio; + + /* mmio->sub_section is set to PHYS_SECTION_UNASSIGNED with g_malloc0 */ + mmio = g_malloc0(sizeof(subpage_t) + TARGET_PAGE_SIZE * sizeof(uint16_t)); + mmio->fv = fv; + mmio->base = base; + memory_region_init_io(&mmio->iomem, NULL, &subpage_ops, mmio, + NULL, TARGET_PAGE_SIZE); + mmio->iomem.subpage = true; +#if defined(DEBUG_SUBPAGE) + printf("%s: %p base " HWADDR_FMT_plx " len %08x\n", __func__, + mmio, base, TARGET_PAGE_SIZE); +#endif + + return mmio; +} + +static uint16_t dummy_section(PhysPageMap *map, FlatView *fv, MemoryRegion *mr) +{ + assert(fv); + MemoryRegionSection section = { + .fv = fv, + .mr = mr, + .offset_within_address_space = 0, + .offset_within_region = 0, + .size = int128_2_64(), + }; + + return phys_section_add(map, §ion); +} + +MemoryRegionSection *iotlb_to_section(CPUState *cpu, + hwaddr index, MemTxAttrs attrs) +{ + int asidx = cpu_asidx_from_attrs(cpu, attrs); + CPUAddressSpace *cpuas = &cpu->cpu_ases[asidx]; + AddressSpaceDispatch *d = cpuas->memory_dispatch; + int section_index = index & ~TARGET_PAGE_MASK; + MemoryRegionSection *ret; + + assert(section_index < d->map.sections_nb); + ret = d->map.sections + section_index; + assert(ret->mr); + assert(ret->mr->ops); + + return ret; +} + +static void io_mem_init(void) +{ + memory_region_init_io(&io_mem_unassigned, NULL, &unassigned_mem_ops, NULL, + NULL, UINT64_MAX); +} + +AddressSpaceDispatch *address_space_dispatch_new(FlatView *fv) +{ + AddressSpaceDispatch *d = g_new0(AddressSpaceDispatch, 1); + uint16_t n; + + n = dummy_section(&d->map, fv, &io_mem_unassigned); + assert(n == PHYS_SECTION_UNASSIGNED); + + d->phys_map = (PhysPageEntry) { .ptr = PHYS_MAP_NODE_NIL, .skip = 1 }; + + return d; +} + +void address_space_dispatch_free(AddressSpaceDispatch *d) +{ + phys_sections_free(&d->map); + g_free(d); +} + +static void do_nothing(CPUState *cpu, run_on_cpu_data d) +{ +} + +static void tcg_log_global_after_sync(MemoryListener *listener) +{ + CPUAddressSpace *cpuas; + + /* Wait for the CPU to end the current TB. This avoids the following + * incorrect race: + * + * vCPU migration + * ---------------------- ------------------------- + * TLB check -> slow path + * notdirty_mem_write + * write to RAM + * mark dirty + * clear dirty flag + * TLB check -> fast path + * read memory + * write to RAM + * + * by pushing the migration thread's memory read after the vCPU thread has + * written the memory. + */ + if (replay_mode == REPLAY_MODE_NONE) { + /* + * VGA can make calls to this function while updating the screen. + * In record/replay mode this causes a deadlock, because + * run_on_cpu waits for rr mutex. Therefore no races are possible + * in this case and no need for making run_on_cpu when + * record/replay is enabled. + */ + cpuas = container_of(listener, CPUAddressSpace, tcg_as_listener); + run_on_cpu(cpuas->cpu, do_nothing, RUN_ON_CPU_NULL); + } +} + +static void tcg_commit_cpu(CPUState *cpu, run_on_cpu_data data) +{ + CPUAddressSpace *cpuas = data.host_ptr; + + cpuas->memory_dispatch = address_space_to_dispatch(cpuas->as); + tlb_flush(cpu); +} + +static void tcg_commit(MemoryListener *listener) +{ + CPUAddressSpace *cpuas; + CPUState *cpu; + + assert(tcg_enabled()); + /* since each CPU stores ram addresses in its TLB cache, we must + reset the modified entries */ + cpuas = container_of(listener, CPUAddressSpace, tcg_as_listener); + cpu = cpuas->cpu; + + /* + * Defer changes to as->memory_dispatch until the cpu is quiescent. + * Otherwise we race between (1) other cpu threads and (2) ongoing + * i/o for the current cpu thread, with data cached by mmu_lookup(). + * + * In addition, queueing the work function will kick the cpu back to + * the main loop, which will end the RCU critical section and reclaim + * the memory data structures. + * + * That said, the listener is also called during realize, before + * all of the tcg machinery for run-on is initialized: thus halt_cond. + */ + if (cpu->halt_cond) { + async_run_on_cpu(cpu, tcg_commit_cpu, RUN_ON_CPU_HOST_PTR(cpuas)); + } else { + tcg_commit_cpu(cpu, RUN_ON_CPU_HOST_PTR(cpuas)); + } +} + +static void memory_map_init(void) +{ + system_memory = g_malloc(sizeof(*system_memory)); + + memory_region_init(system_memory, NULL, "system", UINT64_MAX); + address_space_init(&address_space_memory, system_memory, "memory"); + + system_io = g_malloc(sizeof(*system_io)); + memory_region_init_io(system_io, NULL, &unassigned_io_ops, NULL, "io", + 65536); + address_space_init(&address_space_io, system_io, "I/O"); +} + +MemoryRegion *get_system_memory(void) +{ + return system_memory; +} + +MemoryRegion *get_system_io(void) +{ + return system_io; +} + +static void invalidate_and_set_dirty(MemoryRegion *mr, hwaddr addr, + hwaddr length) +{ + uint8_t dirty_log_mask = memory_region_get_dirty_log_mask(mr); + addr += memory_region_get_ram_addr(mr); + + /* No early return if dirty_log_mask is or becomes 0, because + * cpu_physical_memory_set_dirty_range will still call + * xen_modified_memory. + */ + if (dirty_log_mask) { + dirty_log_mask = + cpu_physical_memory_range_includes_clean(addr, length, dirty_log_mask); + } + if (dirty_log_mask & (1 << DIRTY_MEMORY_CODE)) { + assert(tcg_enabled()); + tb_invalidate_phys_range(addr, addr + length - 1); + dirty_log_mask &= ~(1 << DIRTY_MEMORY_CODE); + } + cpu_physical_memory_set_dirty_range(addr, length, dirty_log_mask); +} + +void memory_region_flush_rom_device(MemoryRegion *mr, hwaddr addr, hwaddr size) +{ + /* + * In principle this function would work on other memory region types too, + * but the ROM device use case is the only one where this operation is + * necessary. Other memory regions should use the + * address_space_read/write() APIs. + */ + assert(memory_region_is_romd(mr)); + + invalidate_and_set_dirty(mr, addr, size); +} + +int memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr) +{ + unsigned access_size_max = mr->ops->valid.max_access_size; + + /* Regions are assumed to support 1-4 byte accesses unless + otherwise specified. */ + if (access_size_max == 0) { + access_size_max = 4; + } + + /* Bound the maximum access by the alignment of the address. */ + if (!mr->ops->impl.unaligned) { + unsigned align_size_max = addr & -addr; + if (align_size_max != 0 && align_size_max < access_size_max) { + access_size_max = align_size_max; + } + } + + /* Don't attempt accesses larger than the maximum. */ + if (l > access_size_max) { + l = access_size_max; + } + l = pow2floor(l); + + return l; +} + +bool prepare_mmio_access(MemoryRegion *mr) +{ + bool release_lock = false; + + if (!qemu_mutex_iothread_locked()) { + qemu_mutex_lock_iothread(); + release_lock = true; + } + if (mr->flush_coalesced_mmio) { + qemu_flush_coalesced_mmio_buffer(); + } + + return release_lock; +} + +/** + * flatview_access_allowed + * @mr: #MemoryRegion to be accessed + * @attrs: memory transaction attributes + * @addr: address within that memory region + * @len: the number of bytes to access + * + * Check if a memory transaction is allowed. + * + * Returns: true if transaction is allowed, false if denied. + */ +static bool flatview_access_allowed(MemoryRegion *mr, MemTxAttrs attrs, + hwaddr addr, hwaddr len) +{ + if (likely(!attrs.memory)) { + return true; + } + if (memory_region_is_ram(mr)) { + return true; + } + qemu_log_mask(LOG_GUEST_ERROR, + "Invalid access to non-RAM device at " + "addr 0x%" HWADDR_PRIX ", size %" HWADDR_PRIu ", " + "region '%s'\n", addr, len, memory_region_name(mr)); + return false; +} + +/* Called within RCU critical section. */ +static MemTxResult flatview_write_continue(FlatView *fv, hwaddr addr, + MemTxAttrs attrs, + const void *ptr, + hwaddr len, hwaddr addr1, + hwaddr l, MemoryRegion *mr) +{ + uint8_t *ram_ptr; + uint64_t val; + MemTxResult result = MEMTX_OK; + bool release_lock = false; + const uint8_t *buf = ptr; + + for (;;) { + if (!flatview_access_allowed(mr, attrs, addr1, l)) { + result |= MEMTX_ACCESS_ERROR; + /* Keep going. */ + } else if (!memory_access_is_direct(mr, true)) { + release_lock |= prepare_mmio_access(mr); + l = memory_access_size(mr, l, addr1); + /* XXX: could force current_cpu to NULL to avoid + potential bugs */ + val = ldn_he_p(buf, l); + result |= memory_region_dispatch_write(mr, addr1, val, + size_memop(l), attrs); + } else { + /* RAM case */ + ram_ptr = qemu_ram_ptr_length(mr->ram_block, addr1, &l, false); + memmove(ram_ptr, buf, l); + invalidate_and_set_dirty(mr, addr1, l); + } + + if (release_lock) { + qemu_mutex_unlock_iothread(); + release_lock = false; + } + + len -= l; + buf += l; + addr += l; + + if (!len) { + break; + } + + l = len; + mr = flatview_translate(fv, addr, &addr1, &l, true, attrs); + } + + return result; +} + +/* Called from RCU critical section. */ +static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs, + const void *buf, hwaddr len) +{ + hwaddr l; + hwaddr addr1; + MemoryRegion *mr; + + l = len; + mr = flatview_translate(fv, addr, &addr1, &l, true, attrs); + if (!flatview_access_allowed(mr, attrs, addr, len)) { + return MEMTX_ACCESS_ERROR; + } + return flatview_write_continue(fv, addr, attrs, buf, len, + addr1, l, mr); +} + +/* Called within RCU critical section. */ +MemTxResult flatview_read_continue(FlatView *fv, hwaddr addr, + MemTxAttrs attrs, void *ptr, + hwaddr len, hwaddr addr1, hwaddr l, + MemoryRegion *mr) +{ + uint8_t *ram_ptr; + uint64_t val; + MemTxResult result = MEMTX_OK; + bool release_lock = false; + uint8_t *buf = ptr; + + fuzz_dma_read_cb(addr, len, mr); + for (;;) { + if (!flatview_access_allowed(mr, attrs, addr1, l)) { + result |= MEMTX_ACCESS_ERROR; + /* Keep going. */ + } else if (!memory_access_is_direct(mr, false)) { + /* I/O case */ + release_lock |= prepare_mmio_access(mr); + l = memory_access_size(mr, l, addr1); + result |= memory_region_dispatch_read(mr, addr1, &val, + size_memop(l), attrs); + stn_he_p(buf, l, val); + } else { + /* RAM case */ + ram_ptr = qemu_ram_ptr_length(mr->ram_block, addr1, &l, false); + memcpy(buf, ram_ptr, l); + } + + if (release_lock) { + qemu_mutex_unlock_iothread(); + release_lock = false; + } + + len -= l; + buf += l; + addr += l; + + if (!len) { + break; + } + + l = len; + mr = flatview_translate(fv, addr, &addr1, &l, false, attrs); + } + + return result; +} + +/* Called from RCU critical section. */ +static MemTxResult flatview_read(FlatView *fv, hwaddr addr, + MemTxAttrs attrs, void *buf, hwaddr len) +{ + hwaddr l; + hwaddr addr1; + MemoryRegion *mr; + + l = len; + mr = flatview_translate(fv, addr, &addr1, &l, false, attrs); + if (!flatview_access_allowed(mr, attrs, addr, len)) { + return MEMTX_ACCESS_ERROR; + } + return flatview_read_continue(fv, addr, attrs, buf, len, + addr1, l, mr); +} + +MemTxResult address_space_read_full(AddressSpace *as, hwaddr addr, + MemTxAttrs attrs, void *buf, hwaddr len) +{ + MemTxResult result = MEMTX_OK; + FlatView *fv; + + if (len > 0) { + RCU_READ_LOCK_GUARD(); + fv = address_space_to_flatview(as); + result = flatview_read(fv, addr, attrs, buf, len); + } + + return result; +} + +MemTxResult address_space_write(AddressSpace *as, hwaddr addr, + MemTxAttrs attrs, + const void *buf, hwaddr len) +{ + MemTxResult result = MEMTX_OK; + FlatView *fv; + + if (len > 0) { + RCU_READ_LOCK_GUARD(); + fv = address_space_to_flatview(as); + result = flatview_write(fv, addr, attrs, buf, len); + } + + return result; +} + +MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs, + void *buf, hwaddr len, bool is_write) +{ + if (is_write) { + return address_space_write(as, addr, attrs, buf, len); + } else { + return address_space_read_full(as, addr, attrs, buf, len); + } +} + +MemTxResult address_space_set(AddressSpace *as, hwaddr addr, + uint8_t c, hwaddr len, MemTxAttrs attrs) +{ +#define FILLBUF_SIZE 512 + uint8_t fillbuf[FILLBUF_SIZE]; + int l; + MemTxResult error = MEMTX_OK; + + memset(fillbuf, c, FILLBUF_SIZE); + while (len > 0) { + l = len < FILLBUF_SIZE ? len : FILLBUF_SIZE; + error |= address_space_write(as, addr, attrs, fillbuf, l); + len -= l; + addr += l; + } + + return error; +} + +void cpu_physical_memory_rw(hwaddr addr, void *buf, + hwaddr len, bool is_write) +{ + address_space_rw(&address_space_memory, addr, MEMTXATTRS_UNSPECIFIED, + buf, len, is_write); +} + +enum write_rom_type { + WRITE_DATA, + FLUSH_CACHE, +}; + +static inline MemTxResult address_space_write_rom_internal(AddressSpace *as, + hwaddr addr, + MemTxAttrs attrs, + const void *ptr, + hwaddr len, + enum write_rom_type type) +{ + hwaddr l; + uint8_t *ram_ptr; + hwaddr addr1; + MemoryRegion *mr; + const uint8_t *buf = ptr; + + RCU_READ_LOCK_GUARD(); + while (len > 0) { + l = len; + mr = address_space_translate(as, addr, &addr1, &l, true, attrs); + + if (!(memory_region_is_ram(mr) || + memory_region_is_romd(mr))) { + l = memory_access_size(mr, l, addr1); + } else { + /* ROM/RAM case */ + ram_ptr = qemu_map_ram_ptr(mr->ram_block, addr1); + switch (type) { + case WRITE_DATA: + memcpy(ram_ptr, buf, l); + invalidate_and_set_dirty(mr, addr1, l); + break; + case FLUSH_CACHE: + flush_idcache_range((uintptr_t)ram_ptr, (uintptr_t)ram_ptr, l); + break; + } + } + len -= l; + buf += l; + addr += l; + } + return MEMTX_OK; +} + +/* used for ROM loading : can write in RAM and ROM */ +MemTxResult address_space_write_rom(AddressSpace *as, hwaddr addr, + MemTxAttrs attrs, + const void *buf, hwaddr len) +{ + return address_space_write_rom_internal(as, addr, attrs, + buf, len, WRITE_DATA); +} + +void cpu_flush_icache_range(hwaddr start, hwaddr len) +{ + /* + * This function should do the same thing as an icache flush that was + * triggered from within the guest. For TCG we are always cache coherent, + * so there is no need to flush anything. For KVM / Xen we need to flush + * the host's instruction cache at least. + */ + if (tcg_enabled()) { + return; + } + + address_space_write_rom_internal(&address_space_memory, + start, MEMTXATTRS_UNSPECIFIED, + NULL, len, FLUSH_CACHE); +} + +typedef struct { + MemoryRegion *mr; + void *buffer; + hwaddr addr; + hwaddr len; + bool in_use; +} BounceBuffer; + +static BounceBuffer bounce; + +typedef struct MapClient { + QEMUBH *bh; + QLIST_ENTRY(MapClient) link; +} MapClient; + +QemuMutex map_client_list_lock; +static QLIST_HEAD(, MapClient) map_client_list + = QLIST_HEAD_INITIALIZER(map_client_list); + +static void cpu_unregister_map_client_do(MapClient *client) +{ + QLIST_REMOVE(client, link); + g_free(client); +} + +static void cpu_notify_map_clients_locked(void) +{ + MapClient *client; + + while (!QLIST_EMPTY(&map_client_list)) { + client = QLIST_FIRST(&map_client_list); + qemu_bh_schedule(client->bh); + cpu_unregister_map_client_do(client); + } +} + +void cpu_register_map_client(QEMUBH *bh) +{ + MapClient *client = g_malloc(sizeof(*client)); + + qemu_mutex_lock(&map_client_list_lock); + client->bh = bh; + QLIST_INSERT_HEAD(&map_client_list, client, link); + /* Write map_client_list before reading in_use. */ + smp_mb(); + if (!qatomic_read(&bounce.in_use)) { + cpu_notify_map_clients_locked(); + } + qemu_mutex_unlock(&map_client_list_lock); +} + +void cpu_exec_init_all(void) +{ + qemu_mutex_init(&ram_list.mutex); + /* The data structures we set up here depend on knowing the page size, + * so no more changes can be made after this point. + * In an ideal world, nothing we did before we had finished the + * machine setup would care about the target page size, and we could + * do this much later, rather than requiring board models to state + * up front what their requirements are. + */ + finalize_target_page_bits(); + io_mem_init(); + memory_map_init(); + qemu_mutex_init(&map_client_list_lock); +} + +void cpu_unregister_map_client(QEMUBH *bh) +{ + MapClient *client; + + qemu_mutex_lock(&map_client_list_lock); + QLIST_FOREACH(client, &map_client_list, link) { + if (client->bh == bh) { + cpu_unregister_map_client_do(client); + break; + } + } + qemu_mutex_unlock(&map_client_list_lock); +} + +static void cpu_notify_map_clients(void) +{ + qemu_mutex_lock(&map_client_list_lock); + cpu_notify_map_clients_locked(); + qemu_mutex_unlock(&map_client_list_lock); +} + +static bool flatview_access_valid(FlatView *fv, hwaddr addr, hwaddr len, + bool is_write, MemTxAttrs attrs) +{ + MemoryRegion *mr; + hwaddr l, xlat; + + while (len > 0) { + l = len; + mr = flatview_translate(fv, addr, &xlat, &l, is_write, attrs); + if (!memory_access_is_direct(mr, is_write)) { + l = memory_access_size(mr, l, addr); + if (!memory_region_access_valid(mr, xlat, l, is_write, attrs)) { + return false; + } + } + + len -= l; + addr += l; + } + return true; +} + +bool address_space_access_valid(AddressSpace *as, hwaddr addr, + hwaddr len, bool is_write, + MemTxAttrs attrs) +{ + FlatView *fv; + + RCU_READ_LOCK_GUARD(); + fv = address_space_to_flatview(as); + return flatview_access_valid(fv, addr, len, is_write, attrs); +} + +static hwaddr +flatview_extend_translation(FlatView *fv, hwaddr addr, + hwaddr target_len, + MemoryRegion *mr, hwaddr base, hwaddr len, + bool is_write, MemTxAttrs attrs) +{ + hwaddr done = 0; + hwaddr xlat; + MemoryRegion *this_mr; + + for (;;) { + target_len -= len; + addr += len; + done += len; + if (target_len == 0) { + return done; + } + + len = target_len; + this_mr = flatview_translate(fv, addr, &xlat, + &len, is_write, attrs); + if (this_mr != mr || xlat != base + done) { + return done; + } + } +} + +/* Map a physical memory region into a host virtual address. + * May map a subset of the requested range, given by and returned in *plen. + * May return NULL if resources needed to perform the mapping are exhausted. + * Use only for reads OR writes - not for read-modify-write operations. + * Use cpu_register_map_client() to know when retrying the map operation is + * likely to succeed. + */ +void *address_space_map(AddressSpace *as, + hwaddr addr, + hwaddr *plen, + bool is_write, + MemTxAttrs attrs) +{ + hwaddr len = *plen; + hwaddr l, xlat; + MemoryRegion *mr; + FlatView *fv; + + if (len == 0) { + return NULL; + } + + l = len; + RCU_READ_LOCK_GUARD(); + fv = address_space_to_flatview(as); + mr = flatview_translate(fv, addr, &xlat, &l, is_write, attrs); + + if (!memory_access_is_direct(mr, is_write)) { + if (qatomic_xchg(&bounce.in_use, true)) { + *plen = 0; + return NULL; + } + /* Avoid unbounded allocations */ + l = MIN(l, TARGET_PAGE_SIZE); + bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, l); + bounce.addr = addr; + bounce.len = l; + + memory_region_ref(mr); + bounce.mr = mr; + if (!is_write) { + flatview_read(fv, addr, MEMTXATTRS_UNSPECIFIED, + bounce.buffer, l); + } + + *plen = l; + return bounce.buffer; + } + + + memory_region_ref(mr); + *plen = flatview_extend_translation(fv, addr, len, mr, xlat, + l, is_write, attrs); + fuzz_dma_read_cb(addr, *plen, mr); + return qemu_ram_ptr_length(mr->ram_block, xlat, plen, true); +} + +/* Unmaps a memory region previously mapped by address_space_map(). + * Will also mark the memory as dirty if is_write is true. access_len gives + * the amount of memory that was actually read or written by the caller. + */ +void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len, + bool is_write, hwaddr access_len) +{ + if (buffer != bounce.buffer) { + MemoryRegion *mr; + ram_addr_t addr1; + + mr = memory_region_from_host(buffer, &addr1); + assert(mr != NULL); + if (is_write) { + invalidate_and_set_dirty(mr, addr1, access_len); + } + if (xen_enabled()) { + xen_invalidate_map_cache_entry(buffer); + } + memory_region_unref(mr); + return; + } + if (is_write) { + address_space_write(as, bounce.addr, MEMTXATTRS_UNSPECIFIED, + bounce.buffer, access_len); + } + qemu_vfree(bounce.buffer); + bounce.buffer = NULL; + memory_region_unref(bounce.mr); + /* Clear in_use before reading map_client_list. */ + qatomic_set_mb(&bounce.in_use, false); + cpu_notify_map_clients(); +} + +void *cpu_physical_memory_map(hwaddr addr, + hwaddr *plen, + bool is_write) +{ + return address_space_map(&address_space_memory, addr, plen, is_write, + MEMTXATTRS_UNSPECIFIED); +} + +void cpu_physical_memory_unmap(void *buffer, hwaddr len, + bool is_write, hwaddr access_len) +{ + return address_space_unmap(&address_space_memory, buffer, len, is_write, access_len); +} + +#define ARG1_DECL AddressSpace *as +#define ARG1 as +#define SUFFIX +#define TRANSLATE(...) address_space_translate(as, __VA_ARGS__) +#define RCU_READ_LOCK(...) rcu_read_lock() +#define RCU_READ_UNLOCK(...) rcu_read_unlock() +#include "memory_ldst.c.inc" + +int64_t address_space_cache_init(MemoryRegionCache *cache, + AddressSpace *as, + hwaddr addr, + hwaddr len, + bool is_write) +{ + AddressSpaceDispatch *d; + hwaddr l; + MemoryRegion *mr; + Int128 diff; + + assert(len > 0); + + l = len; + cache->fv = address_space_get_flatview(as); + d = flatview_to_dispatch(cache->fv); + cache->mrs = *address_space_translate_internal(d, addr, &cache->xlat, &l, true); + + /* + * cache->xlat is now relative to cache->mrs.mr, not to the section itself. + * Take that into account to compute how many bytes are there between + * cache->xlat and the end of the section. + */ + diff = int128_sub(cache->mrs.size, + int128_make64(cache->xlat - cache->mrs.offset_within_region)); + l = int128_get64(int128_min(diff, int128_make64(l))); + + mr = cache->mrs.mr; + memory_region_ref(mr); + if (memory_access_is_direct(mr, is_write)) { + /* We don't care about the memory attributes here as we're only + * doing this if we found actual RAM, which behaves the same + * regardless of attributes; so UNSPECIFIED is fine. + */ + l = flatview_extend_translation(cache->fv, addr, len, mr, + cache->xlat, l, is_write, + MEMTXATTRS_UNSPECIFIED); + cache->ptr = qemu_ram_ptr_length(mr->ram_block, cache->xlat, &l, true); + } else { + cache->ptr = NULL; + } + + cache->len = l; + cache->is_write = is_write; + return l; +} + +void address_space_cache_invalidate(MemoryRegionCache *cache, + hwaddr addr, + hwaddr access_len) +{ + assert(cache->is_write); + if (likely(cache->ptr)) { + invalidate_and_set_dirty(cache->mrs.mr, addr + cache->xlat, access_len); + } +} + +void address_space_cache_destroy(MemoryRegionCache *cache) +{ + if (!cache->mrs.mr) { + return; + } + + if (xen_enabled()) { + xen_invalidate_map_cache_entry(cache->ptr); + } + memory_region_unref(cache->mrs.mr); + flatview_unref(cache->fv); + cache->mrs.mr = NULL; + cache->fv = NULL; +} + +/* Called from RCU critical section. This function has the same + * semantics as address_space_translate, but it only works on a + * predefined range of a MemoryRegion that was mapped with + * address_space_cache_init. + */ +static inline MemoryRegion *address_space_translate_cached( + MemoryRegionCache *cache, hwaddr addr, hwaddr *xlat, + hwaddr *plen, bool is_write, MemTxAttrs attrs) +{ + MemoryRegionSection section; + MemoryRegion *mr; + IOMMUMemoryRegion *iommu_mr; + AddressSpace *target_as; + + assert(!cache->ptr); + *xlat = addr + cache->xlat; + + mr = cache->mrs.mr; + iommu_mr = memory_region_get_iommu(mr); + if (!iommu_mr) { + /* MMIO region. */ + return mr; + } + + section = address_space_translate_iommu(iommu_mr, xlat, plen, + NULL, is_write, true, + &target_as, attrs); + return section.mr; +} + +/* Called from RCU critical section. address_space_read_cached uses this + * out of line function when the target is an MMIO or IOMMU region. + */ +MemTxResult +address_space_read_cached_slow(MemoryRegionCache *cache, hwaddr addr, + void *buf, hwaddr len) +{ + hwaddr addr1, l; + MemoryRegion *mr; + + l = len; + mr = address_space_translate_cached(cache, addr, &addr1, &l, false, + MEMTXATTRS_UNSPECIFIED); + return flatview_read_continue(cache->fv, + addr, MEMTXATTRS_UNSPECIFIED, buf, len, + addr1, l, mr); +} + +/* Called from RCU critical section. address_space_write_cached uses this + * out of line function when the target is an MMIO or IOMMU region. + */ +MemTxResult +address_space_write_cached_slow(MemoryRegionCache *cache, hwaddr addr, + const void *buf, hwaddr len) +{ + hwaddr addr1, l; + MemoryRegion *mr; + + l = len; + mr = address_space_translate_cached(cache, addr, &addr1, &l, true, + MEMTXATTRS_UNSPECIFIED); + return flatview_write_continue(cache->fv, + addr, MEMTXATTRS_UNSPECIFIED, buf, len, + addr1, l, mr); +} + +#define ARG1_DECL MemoryRegionCache *cache +#define ARG1 cache +#define SUFFIX _cached_slow +#define TRANSLATE(...) address_space_translate_cached(cache, __VA_ARGS__) +#define RCU_READ_LOCK() ((void)0) +#define RCU_READ_UNLOCK() ((void)0) +#include "memory_ldst.c.inc" + +/* virtual memory access for debug (includes writing to ROM) */ +int cpu_memory_rw_debug(CPUState *cpu, vaddr addr, + void *ptr, size_t len, bool is_write) +{ + hwaddr phys_addr; + vaddr l, page; + uint8_t *buf = ptr; + + cpu_synchronize_state(cpu); + while (len > 0) { + int asidx; + MemTxAttrs attrs; + MemTxResult res; + + page = addr & TARGET_PAGE_MASK; + phys_addr = cpu_get_phys_page_attrs_debug(cpu, page, &attrs); + asidx = cpu_asidx_from_attrs(cpu, attrs); + /* if no physical page mapped, return an error */ + if (phys_addr == -1) + return -1; + l = (page + TARGET_PAGE_SIZE) - addr; + if (l > len) + l = len; + phys_addr += (addr & ~TARGET_PAGE_MASK); + if (is_write) { + res = address_space_write_rom(cpu->cpu_ases[asidx].as, phys_addr, + attrs, buf, l); + } else { + res = address_space_read(cpu->cpu_ases[asidx].as, phys_addr, + attrs, buf, l); + } + if (res != MEMTX_OK) { + return -1; + } + len -= l; + buf += l; + addr += l; + } + return 0; +} + +/* + * Allows code that needs to deal with migration bitmaps etc to still be built + * target independent. + */ +size_t qemu_target_page_size(void) +{ + return TARGET_PAGE_SIZE; +} + +int qemu_target_page_mask(void) +{ + return TARGET_PAGE_MASK; +} + +int qemu_target_page_bits(void) +{ + return TARGET_PAGE_BITS; +} + +int qemu_target_page_bits_min(void) +{ + return TARGET_PAGE_BITS_MIN; +} + +/* Convert target pages to MiB (2**20). */ +size_t qemu_target_pages_to_MiB(size_t pages) +{ + int page_bits = TARGET_PAGE_BITS; + + /* So far, the largest (non-huge) page size is 64k, i.e. 16 bits. */ + g_assert(page_bits < 20); + + return pages >> (20 - page_bits); +} + +bool cpu_physical_memory_is_io(hwaddr phys_addr) +{ + MemoryRegion*mr; + hwaddr l = 1; + + RCU_READ_LOCK_GUARD(); + mr = address_space_translate(&address_space_memory, + phys_addr, &phys_addr, &l, false, + MEMTXATTRS_UNSPECIFIED); + + return !(memory_region_is_ram(mr) || memory_region_is_romd(mr)); +} + +int qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque) +{ + RAMBlock *block; + int ret = 0; + + RCU_READ_LOCK_GUARD(); + RAMBLOCK_FOREACH(block) { + ret = func(block, opaque); + if (ret) { + break; + } + } + return ret; +} + +/* + * Unmap pages of memory from start to start+length such that + * they a) read as 0, b) Trigger whatever fault mechanism + * the OS provides for postcopy. + * The pages must be unmapped by the end of the function. + * Returns: 0 on success, none-0 on failure + * + */ +int ram_block_discard_range(RAMBlock *rb, uint64_t start, size_t length) +{ + int ret = -1; + + uint8_t *host_startaddr = rb->host + start; + + if (!QEMU_PTR_IS_ALIGNED(host_startaddr, rb->page_size)) { + error_report("ram_block_discard_range: Unaligned start address: %p", + host_startaddr); + goto err; + } + + if ((start + length) <= rb->max_length) { + bool need_madvise, need_fallocate; + if (!QEMU_IS_ALIGNED(length, rb->page_size)) { + error_report("ram_block_discard_range: Unaligned length: %zx", + length); + goto err; + } + + errno = ENOTSUP; /* If we are missing MADVISE etc */ + + /* The logic here is messy; + * madvise DONTNEED fails for hugepages + * fallocate works on hugepages and shmem + * shared anonymous memory requires madvise REMOVE + */ + need_madvise = (rb->page_size == qemu_host_page_size); + need_fallocate = rb->fd != -1; + if (need_fallocate) { + /* For a file, this causes the area of the file to be zero'd + * if read, and for hugetlbfs also causes it to be unmapped + * so a userfault will trigger. + */ +#ifdef CONFIG_FALLOCATE_PUNCH_HOLE + /* + * fallocate() will fail with readonly files. Let's print a + * proper error message. + */ + if (rb->flags & RAM_READONLY_FD) { + error_report("ram_block_discard_range: Discarding RAM" + " with readonly files is not supported"); + goto err; + + } + /* + * We'll discard data from the actual file, even though we only + * have a MAP_PRIVATE mapping, possibly messing with other + * MAP_PRIVATE/MAP_SHARED mappings. There is no easy way to + * change that behavior whithout violating the promised + * semantics of ram_block_discard_range(). + * + * Only warn, because it works as long as nobody else uses that + * file. + */ + if (!qemu_ram_is_shared(rb)) { + warn_report_once("ram_block_discard_range: Discarding RAM" + " in private file mappings is possibly" + " dangerous, because it will modify the" + " underlying file and will affect other" + " users of the file"); + } + + ret = fallocate(rb->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + start, length); + if (ret) { + ret = -errno; + error_report("ram_block_discard_range: Failed to fallocate " + "%s:%" PRIx64 " +%zx (%d)", + rb->idstr, start, length, ret); + goto err; + } +#else + ret = -ENOSYS; + error_report("ram_block_discard_range: fallocate not available/file" + "%s:%" PRIx64 " +%zx (%d)", + rb->idstr, start, length, ret); + goto err; +#endif + } + if (need_madvise) { + /* For normal RAM this causes it to be unmapped, + * for shared memory it causes the local mapping to disappear + * and to fall back on the file contents (which we just + * fallocate'd away). + */ +#if defined(CONFIG_MADVISE) + if (qemu_ram_is_shared(rb) && rb->fd < 0) { + ret = madvise(host_startaddr, length, QEMU_MADV_REMOVE); + } else { + ret = madvise(host_startaddr, length, QEMU_MADV_DONTNEED); + } + if (ret) { + ret = -errno; + error_report("ram_block_discard_range: Failed to discard range " + "%s:%" PRIx64 " +%zx (%d)", + rb->idstr, start, length, ret); + goto err; + } +#else + ret = -ENOSYS; + error_report("ram_block_discard_range: MADVISE not available" + "%s:%" PRIx64 " +%zx (%d)", + rb->idstr, start, length, ret); + goto err; +#endif + } + trace_ram_block_discard_range(rb->idstr, host_startaddr, length, + need_madvise, need_fallocate, ret); + } else { + error_report("ram_block_discard_range: Overrun block '%s' (%" PRIu64 + "/%zx/" RAM_ADDR_FMT")", + rb->idstr, start, length, rb->max_length); + } + +err: + return ret; +} + +bool ramblock_is_pmem(RAMBlock *rb) +{ + return rb->flags & RAM_PMEM; +} + +static void mtree_print_phys_entries(int start, int end, int skip, int ptr) +{ + if (start == end - 1) { + qemu_printf("\t%3d ", start); + } else { + qemu_printf("\t%3d..%-3d ", start, end - 1); + } + qemu_printf(" skip=%d ", skip); + if (ptr == PHYS_MAP_NODE_NIL) { + qemu_printf(" ptr=NIL"); + } else if (!skip) { + qemu_printf(" ptr=#%d", ptr); + } else { + qemu_printf(" ptr=[%d]", ptr); + } + qemu_printf("\n"); +} + +#define MR_SIZE(size) (int128_nz(size) ? (hwaddr)int128_get64( \ + int128_sub((size), int128_one())) : 0) + +void mtree_print_dispatch(AddressSpaceDispatch *d, MemoryRegion *root) +{ + int i; + + qemu_printf(" Dispatch\n"); + qemu_printf(" Physical sections\n"); + + for (i = 0; i < d->map.sections_nb; ++i) { + MemoryRegionSection *s = d->map.sections + i; + const char *names[] = { " [unassigned]", " [not dirty]", + " [ROM]", " [watch]" }; + + qemu_printf(" #%d @" HWADDR_FMT_plx ".." HWADDR_FMT_plx + " %s%s%s%s%s", + i, + s->offset_within_address_space, + s->offset_within_address_space + MR_SIZE(s->size), + s->mr->name ? s->mr->name : "(noname)", + i < ARRAY_SIZE(names) ? names[i] : "", + s->mr == root ? " [ROOT]" : "", + s == d->mru_section ? " [MRU]" : "", + s->mr->is_iommu ? " [iommu]" : ""); + + if (s->mr->alias) { + qemu_printf(" alias=%s", s->mr->alias->name ? + s->mr->alias->name : "noname"); + } + qemu_printf("\n"); + } + + qemu_printf(" Nodes (%d bits per level, %d levels) ptr=[%d] skip=%d\n", + P_L2_BITS, P_L2_LEVELS, d->phys_map.ptr, d->phys_map.skip); + for (i = 0; i < d->map.nodes_nb; ++i) { + int j, jprev; + PhysPageEntry prev; + Node *n = d->map.nodes + i; + + qemu_printf(" [%d]\n", i); + + for (j = 0, jprev = 0, prev = *n[0]; j < ARRAY_SIZE(*n); ++j) { + PhysPageEntry *pe = *n + j; + + if (pe->ptr == prev.ptr && pe->skip == prev.skip) { + continue; + } + + mtree_print_phys_entries(jprev, j, prev.skip, prev.ptr); + + jprev = j; + prev = *pe; + } + + if (jprev != ARRAY_SIZE(*n)) { + mtree_print_phys_entries(jprev, j, prev.skip, prev.ptr); + } + } +} + +/* Require any discards to work. */ +static unsigned int ram_block_discard_required_cnt; +/* Require only coordinated discards to work. */ +static unsigned int ram_block_coordinated_discard_required_cnt; +/* Disable any discards. */ +static unsigned int ram_block_discard_disabled_cnt; +/* Disable only uncoordinated discards. */ +static unsigned int ram_block_uncoordinated_discard_disabled_cnt; +static QemuMutex ram_block_discard_disable_mutex; + +static void ram_block_discard_disable_mutex_lock(void) +{ + static gsize initialized; + + if (g_once_init_enter(&initialized)) { + qemu_mutex_init(&ram_block_discard_disable_mutex); + g_once_init_leave(&initialized, 1); + } + qemu_mutex_lock(&ram_block_discard_disable_mutex); +} + +static void ram_block_discard_disable_mutex_unlock(void) +{ + qemu_mutex_unlock(&ram_block_discard_disable_mutex); +} + +int ram_block_discard_disable(bool state) +{ + int ret = 0; + + ram_block_discard_disable_mutex_lock(); + if (!state) { + ram_block_discard_disabled_cnt--; + } else if (ram_block_discard_required_cnt || + ram_block_coordinated_discard_required_cnt) { + ret = -EBUSY; + } else { + ram_block_discard_disabled_cnt++; + } + ram_block_discard_disable_mutex_unlock(); + return ret; +} + +int ram_block_uncoordinated_discard_disable(bool state) +{ + int ret = 0; + + ram_block_discard_disable_mutex_lock(); + if (!state) { + ram_block_uncoordinated_discard_disabled_cnt--; + } else if (ram_block_discard_required_cnt) { + ret = -EBUSY; + } else { + ram_block_uncoordinated_discard_disabled_cnt++; + } + ram_block_discard_disable_mutex_unlock(); + return ret; +} + +int ram_block_discard_require(bool state) +{ + int ret = 0; + + ram_block_discard_disable_mutex_lock(); + if (!state) { + ram_block_discard_required_cnt--; + } else if (ram_block_discard_disabled_cnt || + ram_block_uncoordinated_discard_disabled_cnt) { + ret = -EBUSY; + } else { + ram_block_discard_required_cnt++; + } + ram_block_discard_disable_mutex_unlock(); + return ret; +} + +int ram_block_coordinated_discard_require(bool state) +{ + int ret = 0; + + ram_block_discard_disable_mutex_lock(); + if (!state) { + ram_block_coordinated_discard_required_cnt--; + } else if (ram_block_discard_disabled_cnt) { + ret = -EBUSY; + } else { + ram_block_coordinated_discard_required_cnt++; + } + ram_block_discard_disable_mutex_unlock(); + return ret; +} + +bool ram_block_discard_is_disabled(void) +{ + return qatomic_read(&ram_block_discard_disabled_cnt) || + qatomic_read(&ram_block_uncoordinated_discard_disabled_cnt); +} + +bool ram_block_discard_is_required(void) +{ + return qatomic_read(&ram_block_discard_required_cnt) || + qatomic_read(&ram_block_coordinated_discard_required_cnt); +} diff --git a/system/qdev-monitor.c b/system/qdev-monitor.c new file mode 100644 index 0000000000..74f4e41338 --- /dev/null +++ b/system/qdev-monitor.c @@ -0,0 +1,1148 @@ +/* + * Dynamic device configuration and creation. + * + * Copyright (c) 2009 CodeSourcery + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +#include "qemu/osdep.h" +#include "hw/sysbus.h" +#include "monitor/hmp.h" +#include "monitor/monitor.h" +#include "monitor/qdev.h" +#include "sysemu/arch_init.h" +#include "qapi/error.h" +#include "qapi/qapi-commands-qdev.h" +#include "qapi/qmp/dispatch.h" +#include "qapi/qmp/qdict.h" +#include "qapi/qmp/qerror.h" +#include "qapi/qmp/qstring.h" +#include "qapi/qobject-input-visitor.h" +#include "qemu/config-file.h" +#include "qemu/error-report.h" +#include "qemu/help_option.h" +#include "qemu/option.h" +#include "qemu/qemu-print.h" +#include "qemu/option_int.h" +#include "sysemu/block-backend.h" +#include "migration/misc.h" +#include "migration/migration.h" +#include "qemu/cutils.h" +#include "hw/qdev-properties.h" +#include "hw/clock.h" +#include "hw/boards.h" + +/* + * Aliases were a bad idea from the start. Let's keep them + * from spreading further. + */ +typedef struct QDevAlias +{ + const char *typename; + const char *alias; + uint32_t arch_mask; +} QDevAlias; + +/* default virtio transport per architecture */ +#define QEMU_ARCH_VIRTIO_PCI (QEMU_ARCH_ALPHA | QEMU_ARCH_ARM | \ + QEMU_ARCH_HPPA | QEMU_ARCH_I386 | \ + QEMU_ARCH_MIPS | QEMU_ARCH_PPC | \ + QEMU_ARCH_RISCV | QEMU_ARCH_SH4 | \ + QEMU_ARCH_SPARC | QEMU_ARCH_XTENSA | \ + QEMU_ARCH_LOONGARCH) +#define QEMU_ARCH_VIRTIO_CCW (QEMU_ARCH_S390X) +#define QEMU_ARCH_VIRTIO_MMIO (QEMU_ARCH_M68K) + +/* Please keep this table sorted by typename. */ +static const QDevAlias qdev_alias_table[] = { + { "AC97", "ac97" }, /* -soundhw name */ + { "e1000", "e1000-82540em" }, + { "ES1370", "es1370" }, /* -soundhw name */ + { "ich9-ahci", "ahci" }, + { "lsi53c895a", "lsi" }, + { "virtio-9p-device", "virtio-9p", QEMU_ARCH_VIRTIO_MMIO }, + { "virtio-9p-ccw", "virtio-9p", QEMU_ARCH_VIRTIO_CCW }, + { "virtio-9p-pci", "virtio-9p", QEMU_ARCH_VIRTIO_PCI }, + { "virtio-balloon-device", "virtio-balloon", QEMU_ARCH_VIRTIO_MMIO }, + { "virtio-balloon-ccw", "virtio-balloon", QEMU_ARCH_VIRTIO_CCW }, + { "virtio-balloon-pci", "virtio-balloon", QEMU_ARCH_VIRTIO_PCI }, + { "virtio-blk-device", "virtio-blk", QEMU_ARCH_VIRTIO_MMIO }, + { "virtio-blk-ccw", "virtio-blk", QEMU_ARCH_VIRTIO_CCW }, + { "virtio-blk-pci", "virtio-blk", QEMU_ARCH_VIRTIO_PCI }, + { "virtio-gpu-device", "virtio-gpu", QEMU_ARCH_VIRTIO_MMIO }, + { "virtio-gpu-ccw", "virtio-gpu", QEMU_ARCH_VIRTIO_CCW }, + { "virtio-gpu-pci", "virtio-gpu", QEMU_ARCH_VIRTIO_PCI }, + { "virtio-gpu-gl-device", "virtio-gpu-gl", QEMU_ARCH_VIRTIO_MMIO }, + { "virtio-gpu-gl-pci", "virtio-gpu-gl", QEMU_ARCH_VIRTIO_PCI }, + { "virtio-input-host-device", "virtio-input-host", QEMU_ARCH_VIRTIO_MMIO }, + { "virtio-input-host-ccw", "virtio-input-host", QEMU_ARCH_VIRTIO_CCW }, + { "virtio-input-host-pci", "virtio-input-host", QEMU_ARCH_VIRTIO_PCI }, + { "virtio-iommu-pci", "virtio-iommu", QEMU_ARCH_VIRTIO_PCI }, + { "virtio-keyboard-device", "virtio-keyboard", QEMU_ARCH_VIRTIO_MMIO }, + { "virtio-keyboard-ccw", "virtio-keyboard", QEMU_ARCH_VIRTIO_CCW }, + { "virtio-keyboard-pci", "virtio-keyboard", QEMU_ARCH_VIRTIO_PCI }, + { "virtio-mouse-device", "virtio-mouse", QEMU_ARCH_VIRTIO_MMIO }, + { "virtio-mouse-ccw", "virtio-mouse", QEMU_ARCH_VIRTIO_CCW }, + { "virtio-mouse-pci", "virtio-mouse", QEMU_ARCH_VIRTIO_PCI }, + { "virtio-net-device", "virtio-net", QEMU_ARCH_VIRTIO_MMIO }, + { "virtio-net-ccw", "virtio-net", QEMU_ARCH_VIRTIO_CCW }, + { "virtio-net-pci", "virtio-net", QEMU_ARCH_VIRTIO_PCI }, + { "virtio-rng-device", "virtio-rng", QEMU_ARCH_VIRTIO_MMIO }, + { "virtio-rng-ccw", "virtio-rng", QEMU_ARCH_VIRTIO_CCW }, + { "virtio-rng-pci", "virtio-rng", QEMU_ARCH_VIRTIO_PCI }, + { "virtio-scsi-device", "virtio-scsi", QEMU_ARCH_VIRTIO_MMIO }, + { "virtio-scsi-ccw", "virtio-scsi", QEMU_ARCH_VIRTIO_CCW }, + { "virtio-scsi-pci", "virtio-scsi", QEMU_ARCH_VIRTIO_PCI }, + { "virtio-serial-device", "virtio-serial", QEMU_ARCH_VIRTIO_MMIO }, + { "virtio-serial-ccw", "virtio-serial", QEMU_ARCH_VIRTIO_CCW }, + { "virtio-serial-pci", "virtio-serial", QEMU_ARCH_VIRTIO_PCI}, + { "virtio-tablet-device", "virtio-tablet", QEMU_ARCH_VIRTIO_MMIO }, + { "virtio-tablet-ccw", "virtio-tablet", QEMU_ARCH_VIRTIO_CCW }, + { "virtio-tablet-pci", "virtio-tablet", QEMU_ARCH_VIRTIO_PCI }, + { } +}; + +static const char *qdev_class_get_alias(DeviceClass *dc) +{ + const char *typename = object_class_get_name(OBJECT_CLASS(dc)); + int i; + + for (i = 0; qdev_alias_table[i].typename; i++) { + if (qdev_alias_table[i].arch_mask && + !(qdev_alias_table[i].arch_mask & arch_type)) { + continue; + } + + if (strcmp(qdev_alias_table[i].typename, typename) == 0) { + return qdev_alias_table[i].alias; + } + } + + return NULL; +} + +static bool qdev_class_has_alias(DeviceClass *dc) +{ + return (qdev_class_get_alias(dc) != NULL); +} + +static void qdev_print_devinfo(DeviceClass *dc) +{ + qemu_printf("name \"%s\"", object_class_get_name(OBJECT_CLASS(dc))); + if (dc->bus_type) { + qemu_printf(", bus %s", dc->bus_type); + } + if (qdev_class_has_alias(dc)) { + qemu_printf(", alias \"%s\"", qdev_class_get_alias(dc)); + } + if (dc->desc) { + qemu_printf(", desc \"%s\"", dc->desc); + } + if (!dc->user_creatable) { + qemu_printf(", no-user"); + } + qemu_printf("\n"); +} + +static void qdev_print_devinfos(bool show_no_user) +{ + static const char *cat_name[DEVICE_CATEGORY_MAX + 1] = { + [DEVICE_CATEGORY_BRIDGE] = "Controller/Bridge/Hub", + [DEVICE_CATEGORY_USB] = "USB", + [DEVICE_CATEGORY_STORAGE] = "Storage", + [DEVICE_CATEGORY_NETWORK] = "Network", + [DEVICE_CATEGORY_INPUT] = "Input", + [DEVICE_CATEGORY_DISPLAY] = "Display", + [DEVICE_CATEGORY_SOUND] = "Sound", + [DEVICE_CATEGORY_MISC] = "Misc", + [DEVICE_CATEGORY_CPU] = "CPU", + [DEVICE_CATEGORY_WATCHDOG]= "Watchdog", + [DEVICE_CATEGORY_MAX] = "Uncategorized", + }; + GSList *list, *elt; + int i; + bool cat_printed; + + module_load_qom_all(); + list = object_class_get_list_sorted(TYPE_DEVICE, false); + + for (i = 0; i <= DEVICE_CATEGORY_MAX; i++) { + cat_printed = false; + for (elt = list; elt; elt = elt->next) { + DeviceClass *dc = OBJECT_CLASS_CHECK(DeviceClass, elt->data, + TYPE_DEVICE); + if ((i < DEVICE_CATEGORY_MAX + ? !test_bit(i, dc->categories) + : !bitmap_empty(dc->categories, DEVICE_CATEGORY_MAX)) + || (!show_no_user + && !dc->user_creatable)) { + continue; + } + if (!cat_printed) { + qemu_printf("%s%s devices:\n", i ? "\n" : "", cat_name[i]); + cat_printed = true; + } + qdev_print_devinfo(dc); + } + } + + g_slist_free(list); +} + +static const char *find_typename_by_alias(const char *alias) +{ + int i; + + for (i = 0; qdev_alias_table[i].alias; i++) { + if (qdev_alias_table[i].arch_mask && + !(qdev_alias_table[i].arch_mask & arch_type)) { + continue; + } + + if (strcmp(qdev_alias_table[i].alias, alias) == 0) { + return qdev_alias_table[i].typename; + } + } + + return NULL; +} + +static DeviceClass *qdev_get_device_class(const char **driver, Error **errp) +{ + ObjectClass *oc; + DeviceClass *dc; + const char *original_name = *driver; + + oc = module_object_class_by_name(*driver); + if (!oc) { + const char *typename = find_typename_by_alias(*driver); + + if (typename) { + *driver = typename; + oc = module_object_class_by_name(*driver); + } + } + + if (!object_class_dynamic_cast(oc, TYPE_DEVICE)) { + if (*driver != original_name) { + error_setg(errp, "'%s' (alias '%s') is not a valid device model" + " name", original_name, *driver); + } else { + error_setg(errp, "'%s' is not a valid device model name", *driver); + } + return NULL; + } + + if (object_class_is_abstract(oc)) { + error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "driver", + "a non-abstract device type"); + return NULL; + } + + dc = DEVICE_CLASS(oc); + if (!dc->user_creatable || + (phase_check(PHASE_MACHINE_READY) && !dc->hotpluggable)) { + error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "driver", + "a pluggable device type"); + return NULL; + } + + if (object_class_dynamic_cast(oc, TYPE_SYS_BUS_DEVICE)) { + /* sysbus devices need to be allowed by the machine */ + MachineClass *mc = MACHINE_CLASS(object_get_class(qdev_get_machine())); + if (!device_type_is_dynamic_sysbus(mc, *driver)) { + error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "driver", + "a dynamic sysbus device type for the machine"); + return NULL; + } + } + + return dc; +} + + +int qdev_device_help(QemuOpts *opts) +{ + Error *local_err = NULL; + const char *driver; + ObjectPropertyInfoList *prop_list; + ObjectPropertyInfoList *prop; + GPtrArray *array; + int i; + + driver = qemu_opt_get(opts, "driver"); + if (driver && is_help_option(driver)) { + qdev_print_devinfos(false); + return 1; + } + + if (!driver || !qemu_opt_has_help_opt(opts)) { + return 0; + } + + if (!object_class_by_name(driver)) { + const char *typename = find_typename_by_alias(driver); + + if (typename) { + driver = typename; + } + } + + prop_list = qmp_device_list_properties(driver, &local_err); + if (local_err) { + goto error; + } + + if (prop_list) { + qemu_printf("%s options:\n", driver); + } else { + qemu_printf("There are no options for %s.\n", driver); + } + array = g_ptr_array_new(); + for (prop = prop_list; prop; prop = prop->next) { + g_ptr_array_add(array, + object_property_help(prop->value->name, + prop->value->type, + prop->value->default_value, + prop->value->description)); + } + g_ptr_array_sort(array, (GCompareFunc)qemu_pstrcmp0); + for (i = 0; i < array->len; i++) { + qemu_printf("%s\n", (char *)array->pdata[i]); + } + g_ptr_array_set_free_func(array, g_free); + g_ptr_array_free(array, true); + qapi_free_ObjectPropertyInfoList(prop_list); + return 1; + +error: + error_report_err(local_err); + return 1; +} + +static Object *qdev_get_peripheral(void) +{ + static Object *dev; + + if (dev == NULL) { + dev = container_get(qdev_get_machine(), "/peripheral"); + } + + return dev; +} + +static Object *qdev_get_peripheral_anon(void) +{ + static Object *dev; + + if (dev == NULL) { + dev = container_get(qdev_get_machine(), "/peripheral-anon"); + } + + return dev; +} + +static void qbus_error_append_bus_list_hint(DeviceState *dev, + Error *const *errp) +{ + BusState *child; + const char *sep = " "; + + error_append_hint(errp, "child buses at \"%s\":", + dev->id ? dev->id : object_get_typename(OBJECT(dev))); + QLIST_FOREACH(child, &dev->child_bus, sibling) { + error_append_hint(errp, "%s\"%s\"", sep, child->name); + sep = ", "; + } + error_append_hint(errp, "\n"); +} + +static void qbus_error_append_dev_list_hint(BusState *bus, + Error *const *errp) +{ + BusChild *kid; + const char *sep = " "; + + error_append_hint(errp, "devices at \"%s\":", bus->name); + QTAILQ_FOREACH(kid, &bus->children, sibling) { + DeviceState *dev = kid->child; + error_append_hint(errp, "%s\"%s\"", sep, + object_get_typename(OBJECT(dev))); + if (dev->id) { + error_append_hint(errp, "/\"%s\"", dev->id); + } + sep = ", "; + } + error_append_hint(errp, "\n"); +} + +static BusState *qbus_find_bus(DeviceState *dev, char *elem) +{ + BusState *child; + + QLIST_FOREACH(child, &dev->child_bus, sibling) { + if (strcmp(child->name, elem) == 0) { + return child; + } + } + return NULL; +} + +static DeviceState *qbus_find_dev(BusState *bus, char *elem) +{ + BusChild *kid; + + /* + * try to match in order: + * (1) instance id, if present + * (2) driver name + * (3) driver alias, if present + */ + QTAILQ_FOREACH(kid, &bus->children, sibling) { + DeviceState *dev = kid->child; + if (dev->id && strcmp(dev->id, elem) == 0) { + return dev; + } + } + QTAILQ_FOREACH(kid, &bus->children, sibling) { + DeviceState *dev = kid->child; + if (strcmp(object_get_typename(OBJECT(dev)), elem) == 0) { + return dev; + } + } + QTAILQ_FOREACH(kid, &bus->children, sibling) { + DeviceState *dev = kid->child; + DeviceClass *dc = DEVICE_GET_CLASS(dev); + + if (qdev_class_has_alias(dc) && + strcmp(qdev_class_get_alias(dc), elem) == 0) { + return dev; + } + } + return NULL; +} + +static inline bool qbus_is_full(BusState *bus) +{ + BusClass *bus_class; + + if (bus->full) { + return true; + } + bus_class = BUS_GET_CLASS(bus); + return bus_class->max_dev && bus->num_children >= bus_class->max_dev; +} + +/* + * Search the tree rooted at @bus for a bus. + * If @name, search for a bus with that name. Note that bus names + * need not be unique. Yes, that's screwed up. + * Else search for a bus that is a subtype of @bus_typename. + * If more than one exists, prefer one that can take another device. + * Return the bus if found, else %NULL. + */ +static BusState *qbus_find_recursive(BusState *bus, const char *name, + const char *bus_typename) +{ + BusChild *kid; + BusState *pick, *child, *ret; + bool match; + + assert(name || bus_typename); + if (name) { + match = !strcmp(bus->name, name); + } else { + match = !!object_dynamic_cast(OBJECT(bus), bus_typename); + } + + if (match && !qbus_is_full(bus)) { + return bus; /* root matches and isn't full */ + } + + pick = match ? bus : NULL; + + QTAILQ_FOREACH(kid, &bus->children, sibling) { + DeviceState *dev = kid->child; + QLIST_FOREACH(child, &dev->child_bus, sibling) { + ret = qbus_find_recursive(child, name, bus_typename); + if (ret && !qbus_is_full(ret)) { + return ret; /* a descendant matches and isn't full */ + } + if (ret && !pick) { + pick = ret; + } + } + } + + /* root or a descendant matches, but is full */ + return pick; +} + +static BusState *qbus_find(const char *path, Error **errp) +{ + DeviceState *dev; + BusState *bus; + char elem[128]; + int pos, len; + + /* find start element */ + if (path[0] == '/') { + bus = sysbus_get_default(); + pos = 0; + } else { + if (sscanf(path, "%127[^/]%n", elem, &len) != 1) { + assert(!path[0]); + elem[0] = len = 0; + } + bus = qbus_find_recursive(sysbus_get_default(), elem, NULL); + if (!bus) { + error_setg(errp, "Bus '%s' not found", elem); + return NULL; + } + pos = len; + } + + for (;;) { + assert(path[pos] == '/' || !path[pos]); + while (path[pos] == '/') { + pos++; + } + if (path[pos] == '\0') { + break; + } + + /* find device */ + if (sscanf(path+pos, "%127[^/]%n", elem, &len) != 1) { + g_assert_not_reached(); + elem[0] = len = 0; + } + pos += len; + dev = qbus_find_dev(bus, elem); + if (!dev) { + error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND, + "Device '%s' not found", elem); + qbus_error_append_dev_list_hint(bus, errp); + return NULL; + } + + assert(path[pos] == '/' || !path[pos]); + while (path[pos] == '/') { + pos++; + } + if (path[pos] == '\0') { + /* last specified element is a device. If it has exactly + * one child bus accept it nevertheless */ + if (dev->num_child_bus == 1) { + bus = QLIST_FIRST(&dev->child_bus); + break; + } + if (dev->num_child_bus) { + error_setg(errp, "Device '%s' has multiple child buses", + elem); + qbus_error_append_bus_list_hint(dev, errp); + } else { + error_setg(errp, "Device '%s' has no child bus", elem); + } + return NULL; + } + + /* find bus */ + if (sscanf(path+pos, "%127[^/]%n", elem, &len) != 1) { + g_assert_not_reached(); + elem[0] = len = 0; + } + pos += len; + bus = qbus_find_bus(dev, elem); + if (!bus) { + error_setg(errp, "Bus '%s' not found", elem); + qbus_error_append_bus_list_hint(dev, errp); + return NULL; + } + } + + if (qbus_is_full(bus)) { + error_setg(errp, "Bus '%s' is full", path); + return NULL; + } + return bus; +} + +/* Takes ownership of @id, will be freed when deleting the device */ +const char *qdev_set_id(DeviceState *dev, char *id, Error **errp) +{ + ObjectProperty *prop; + + assert(!dev->id && !dev->realized); + + /* + * object_property_[try_]add_child() below will assert the device + * has no parent + */ + if (id) { + prop = object_property_try_add_child(qdev_get_peripheral(), id, + OBJECT(dev), NULL); + if (prop) { + dev->id = id; + } else { + error_setg(errp, "Duplicate device ID '%s'", id); + g_free(id); + return NULL; + } + } else { + static int anon_count; + gchar *name = g_strdup_printf("device[%d]", anon_count++); + prop = object_property_add_child(qdev_get_peripheral_anon(), name, + OBJECT(dev)); + g_free(name); + } + + return prop->name; +} + +DeviceState *qdev_device_add_from_qdict(const QDict *opts, + bool from_json, Error **errp) +{ + ERRP_GUARD(); + DeviceClass *dc; + const char *driver, *path; + char *id; + DeviceState *dev = NULL; + BusState *bus = NULL; + + driver = qdict_get_try_str(opts, "driver"); + if (!driver) { + error_setg(errp, QERR_MISSING_PARAMETER, "driver"); + return NULL; + } + + /* find driver */ + dc = qdev_get_device_class(&driver, errp); + if (!dc) { + return NULL; + } + + /* find bus */ + path = qdict_get_try_str(opts, "bus"); + if (path != NULL) { + bus = qbus_find(path, errp); + if (!bus) { + return NULL; + } + if (!object_dynamic_cast(OBJECT(bus), dc->bus_type)) { + error_setg(errp, "Device '%s' can't go on %s bus", + driver, object_get_typename(OBJECT(bus))); + return NULL; + } + } else if (dc->bus_type != NULL) { + bus = qbus_find_recursive(sysbus_get_default(), NULL, dc->bus_type); + if (!bus || qbus_is_full(bus)) { + error_setg(errp, "No '%s' bus found for device '%s'", + dc->bus_type, driver); + return NULL; + } + } + + if (qdev_should_hide_device(opts, from_json, errp)) { + if (bus && !qbus_is_hotpluggable(bus)) { + error_setg(errp, QERR_BUS_NO_HOTPLUG, bus->name); + } + return NULL; + } else if (*errp) { + return NULL; + } + + if (phase_check(PHASE_MACHINE_READY) && bus && !qbus_is_hotpluggable(bus)) { + error_setg(errp, QERR_BUS_NO_HOTPLUG, bus->name); + return NULL; + } + + if (!migration_is_idle()) { + error_setg(errp, "device_add not allowed while migrating"); + return NULL; + } + + /* create device */ + dev = qdev_new(driver); + + /* Check whether the hotplug is allowed by the machine */ + if (phase_check(PHASE_MACHINE_READY)) { + if (!qdev_hotplug_allowed(dev, errp)) { + goto err_del_dev; + } + + if (!bus && !qdev_get_machine_hotplug_handler(dev)) { + /* No bus, no machine hotplug handler --> device is not hotpluggable */ + error_setg(errp, "Device '%s' can not be hotplugged on this machine", + driver); + goto err_del_dev; + } + } + + /* + * set dev's parent and register its id. + * If it fails it means the id is already taken. + */ + id = g_strdup(qdict_get_try_str(opts, "id")); + if (!qdev_set_id(dev, id, errp)) { + goto err_del_dev; + } + + /* set properties */ + dev->opts = qdict_clone_shallow(opts); + qdict_del(dev->opts, "driver"); + qdict_del(dev->opts, "bus"); + qdict_del(dev->opts, "id"); + + object_set_properties_from_keyval(&dev->parent_obj, dev->opts, from_json, + errp); + if (*errp) { + goto err_del_dev; + } + + if (!qdev_realize(dev, bus, errp)) { + goto err_del_dev; + } + return dev; + +err_del_dev: + if (dev) { + object_unparent(OBJECT(dev)); + object_unref(OBJECT(dev)); + } + return NULL; +} + +/* Takes ownership of @opts on success */ +DeviceState *qdev_device_add(QemuOpts *opts, Error **errp) +{ + QDict *qdict = qemu_opts_to_qdict(opts, NULL); + DeviceState *ret; + + ret = qdev_device_add_from_qdict(qdict, false, errp); + if (ret) { + qemu_opts_del(opts); + } + qobject_unref(qdict); + return ret; +} + +#define qdev_printf(fmt, ...) monitor_printf(mon, "%*s" fmt, indent, "", ## __VA_ARGS__) +static void qbus_print(Monitor *mon, BusState *bus, int indent); + +static void qdev_print_props(Monitor *mon, DeviceState *dev, Property *props, + int indent) +{ + if (!props) + return; + for (; props->name; props++) { + char *value; + char *legacy_name = g_strdup_printf("legacy-%s", props->name); + + if (object_property_get_type(OBJECT(dev), legacy_name, NULL)) { + value = object_property_get_str(OBJECT(dev), legacy_name, NULL); + } else { + value = object_property_print(OBJECT(dev), props->name, true, + NULL); + } + g_free(legacy_name); + + if (!value) { + continue; + } + qdev_printf("%s = %s\n", props->name, + *value ? value : "<null>"); + g_free(value); + } +} + +static void bus_print_dev(BusState *bus, Monitor *mon, DeviceState *dev, int indent) +{ + BusClass *bc = BUS_GET_CLASS(bus); + + if (bc->print_dev) { + bc->print_dev(mon, dev, indent); + } +} + +static void qdev_print(Monitor *mon, DeviceState *dev, int indent) +{ + ObjectClass *class; + BusState *child; + NamedGPIOList *ngl; + NamedClockList *ncl; + + qdev_printf("dev: %s, id \"%s\"\n", object_get_typename(OBJECT(dev)), + dev->id ? dev->id : ""); + indent += 2; + QLIST_FOREACH(ngl, &dev->gpios, node) { + if (ngl->num_in) { + qdev_printf("gpio-in \"%s\" %d\n", ngl->name ? ngl->name : "", + ngl->num_in); + } + if (ngl->num_out) { + qdev_printf("gpio-out \"%s\" %d\n", ngl->name ? ngl->name : "", + ngl->num_out); + } + } + QLIST_FOREACH(ncl, &dev->clocks, node) { + g_autofree char *freq_str = clock_display_freq(ncl->clock); + qdev_printf("clock-%s%s \"%s\" freq_hz=%s\n", + ncl->output ? "out" : "in", + ncl->alias ? " (alias)" : "", + ncl->name, freq_str); + } + class = object_get_class(OBJECT(dev)); + do { + qdev_print_props(mon, dev, DEVICE_CLASS(class)->props_, indent); + class = object_class_get_parent(class); + } while (class != object_class_by_name(TYPE_DEVICE)); + bus_print_dev(dev->parent_bus, mon, dev, indent); + QLIST_FOREACH(child, &dev->child_bus, sibling) { + qbus_print(mon, child, indent); + } +} + +static void qbus_print(Monitor *mon, BusState *bus, int indent) +{ + BusChild *kid; + + qdev_printf("bus: %s\n", bus->name); + indent += 2; + qdev_printf("type %s\n", object_get_typename(OBJECT(bus))); + QTAILQ_FOREACH(kid, &bus->children, sibling) { + DeviceState *dev = kid->child; + qdev_print(mon, dev, indent); + } +} +#undef qdev_printf + +void hmp_info_qtree(Monitor *mon, const QDict *qdict) +{ + if (sysbus_get_default()) + qbus_print(mon, sysbus_get_default(), 0); +} + +void hmp_info_qdm(Monitor *mon, const QDict *qdict) +{ + qdev_print_devinfos(true); +} + +void qmp_device_add(QDict *qdict, QObject **ret_data, Error **errp) +{ + QemuOpts *opts; + DeviceState *dev; + + opts = qemu_opts_from_qdict(qemu_find_opts("device"), qdict, errp); + if (!opts) { + return; + } + if (!monitor_cur_is_qmp() && qdev_device_help(opts)) { + qemu_opts_del(opts); + return; + } + dev = qdev_device_add(opts, errp); + + /* + * Drain all pending RCU callbacks. This is done because + * some bus related operations can delay a device removal + * (in this case this can happen if device is added and then + * removed due to a configuration error) + * to a RCU callback, but user might expect that this interface + * will finish its job completely once qmp command returns result + * to the user + */ + drain_call_rcu(); + + if (!dev) { + qemu_opts_del(opts); + return; + } + object_unref(OBJECT(dev)); +} + +static DeviceState *find_device_state(const char *id, Error **errp) +{ + Object *obj = object_resolve_path_at(qdev_get_peripheral(), id); + DeviceState *dev; + + if (!obj) { + error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND, + "Device '%s' not found", id); + return NULL; + } + + dev = (DeviceState *)object_dynamic_cast(obj, TYPE_DEVICE); + if (!dev) { + error_setg(errp, "%s is not a hotpluggable device", id); + return NULL; + } + + return dev; +} + +void qdev_unplug(DeviceState *dev, Error **errp) +{ + DeviceClass *dc = DEVICE_GET_CLASS(dev); + HotplugHandler *hotplug_ctrl; + HotplugHandlerClass *hdc; + Error *local_err = NULL; + + if (qdev_unplug_blocked(dev, errp)) { + return; + } + + if (dev->parent_bus && !qbus_is_hotpluggable(dev->parent_bus)) { + error_setg(errp, QERR_BUS_NO_HOTPLUG, dev->parent_bus->name); + return; + } + + if (!dc->hotpluggable) { + error_setg(errp, QERR_DEVICE_NO_HOTPLUG, + object_get_typename(OBJECT(dev))); + return; + } + + if (!migration_is_idle() && !dev->allow_unplug_during_migration) { + error_setg(errp, "device_del not allowed while migrating"); + return; + } + + qdev_hot_removed = true; + + hotplug_ctrl = qdev_get_hotplug_handler(dev); + /* hotpluggable device MUST have HotplugHandler, if it doesn't + * then something is very wrong with it */ + g_assert(hotplug_ctrl); + + /* If device supports async unplug just request it to be done, + * otherwise just remove it synchronously */ + hdc = HOTPLUG_HANDLER_GET_CLASS(hotplug_ctrl); + if (hdc->unplug_request) { + hotplug_handler_unplug_request(hotplug_ctrl, dev, &local_err); + } else { + hotplug_handler_unplug(hotplug_ctrl, dev, &local_err); + if (!local_err) { + object_unparent(OBJECT(dev)); + } + } + error_propagate(errp, local_err); +} + +void qmp_device_del(const char *id, Error **errp) +{ + DeviceState *dev = find_device_state(id, errp); + if (dev != NULL) { + if (dev->pending_deleted_event && + (dev->pending_deleted_expires_ms == 0 || + dev->pending_deleted_expires_ms > qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL))) { + error_setg(errp, "Device %s is already in the " + "process of unplug", id); + return; + } + + qdev_unplug(dev, errp); + } +} + +void hmp_device_add(Monitor *mon, const QDict *qdict) +{ + Error *err = NULL; + + qmp_device_add((QDict *)qdict, NULL, &err); + hmp_handle_error(mon, err); +} + +void hmp_device_del(Monitor *mon, const QDict *qdict) +{ + const char *id = qdict_get_str(qdict, "id"); + Error *err = NULL; + + qmp_device_del(id, &err); + hmp_handle_error(mon, err); +} + +void device_add_completion(ReadLineState *rs, int nb_args, const char *str) +{ + GSList *list, *elt; + size_t len; + + if (nb_args != 2) { + return; + } + + len = strlen(str); + readline_set_completion_index(rs, len); + list = elt = object_class_get_list(TYPE_DEVICE, false); + while (elt) { + DeviceClass *dc = OBJECT_CLASS_CHECK(DeviceClass, elt->data, + TYPE_DEVICE); + + if (dc->user_creatable) { + readline_add_completion_of(rs, str, + object_class_get_name(OBJECT_CLASS(dc))); + } + elt = elt->next; + } + g_slist_free(list); +} + +static int qdev_add_hotpluggable_device(Object *obj, void *opaque) +{ + GSList **list = opaque; + DeviceState *dev = (DeviceState *)object_dynamic_cast(obj, TYPE_DEVICE); + + if (dev == NULL) { + return 0; + } + + if (dev->realized && object_property_get_bool(obj, "hotpluggable", NULL)) { + *list = g_slist_append(*list, dev); + } + + return 0; +} + +static GSList *qdev_build_hotpluggable_device_list(Object *peripheral) +{ + GSList *list = NULL; + + object_child_foreach(peripheral, qdev_add_hotpluggable_device, &list); + + return list; +} + +static void peripheral_device_del_completion(ReadLineState *rs, + const char *str) +{ + Object *peripheral = container_get(qdev_get_machine(), "/peripheral"); + GSList *list, *item; + + list = qdev_build_hotpluggable_device_list(peripheral); + if (!list) { + return; + } + + for (item = list; item; item = g_slist_next(item)) { + DeviceState *dev = item->data; + + if (dev->id) { + readline_add_completion_of(rs, str, dev->id); + } + } + + g_slist_free(list); +} + +void device_del_completion(ReadLineState *rs, int nb_args, const char *str) +{ + if (nb_args != 2) { + return; + } + + readline_set_completion_index(rs, strlen(str)); + peripheral_device_del_completion(rs, str); +} + +BlockBackend *blk_by_qdev_id(const char *id, Error **errp) +{ + DeviceState *dev; + BlockBackend *blk; + + GLOBAL_STATE_CODE(); + + dev = find_device_state(id, errp); + if (dev == NULL) { + return NULL; + } + + blk = blk_by_dev(dev); + if (!blk) { + error_setg(errp, "Device does not have a block device backend"); + } + return blk; +} + +QemuOptsList qemu_device_opts = { + .name = "device", + .implied_opt_name = "driver", + .head = QTAILQ_HEAD_INITIALIZER(qemu_device_opts.head), + .desc = { + /* + * no elements => accept any + * sanity checking will happen later + * when setting device properties + */ + { /* end of list */ } + }, +}; + +QemuOptsList qemu_global_opts = { + .name = "global", + .head = QTAILQ_HEAD_INITIALIZER(qemu_global_opts.head), + .desc = { + { + .name = "driver", + .type = QEMU_OPT_STRING, + },{ + .name = "property", + .type = QEMU_OPT_STRING, + },{ + .name = "value", + .type = QEMU_OPT_STRING, + }, + { /* end of list */ } + }, +}; + +int qemu_global_option(const char *str) +{ + char driver[64], property[64]; + QemuOpts *opts; + int rc, offset; + + rc = sscanf(str, "%63[^.=].%63[^=]%n", driver, property, &offset); + if (rc == 2 && str[offset] == '=') { + opts = qemu_opts_create(&qemu_global_opts, NULL, 0, &error_abort); + qemu_opt_set(opts, "driver", driver, &error_abort); + qemu_opt_set(opts, "property", property, &error_abort); + qemu_opt_set(opts, "value", str + offset + 1, &error_abort); + return 0; + } + + opts = qemu_opts_parse_noisily(&qemu_global_opts, str, false); + if (!opts) { + return -1; + } + if (!qemu_opt_get(opts, "driver") + || !qemu_opt_get(opts, "property") + || !qemu_opt_get(opts, "value")) { + error_report("options 'driver', 'property', and 'value'" + " are required"); + return -1; + } + + return 0; +} + +bool qmp_command_available(const QmpCommand *cmd, Error **errp) +{ + if (!phase_check(PHASE_MACHINE_READY) && + !(cmd->options & QCO_ALLOW_PRECONFIG)) { + error_setg(errp, "The command '%s' is permitted only after machine initialization has completed", + cmd->name); + return false; + } + return true; +} diff --git a/system/qemu-seccomp.c b/system/qemu-seccomp.c new file mode 100644 index 0000000000..4d7439e7f7 --- /dev/null +++ b/system/qemu-seccomp.c @@ -0,0 +1,486 @@ +/* + * QEMU seccomp mode 2 support with libseccomp + * + * Copyright IBM, Corp. 2012 + * + * Authors: + * Eduardo Otubo <eotubo@br.ibm.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Contributions after 2012-01-13 are licensed under the terms of the + * GNU GPL, version 2 or (at your option) any later version. + */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "qemu/config-file.h" +#include "qemu/option.h" +#include "qemu/module.h" +#include <sys/prctl.h> +#include <seccomp.h> +#include "sysemu/seccomp.h" +#include <linux/seccomp.h> + +/* For some architectures (notably ARM) cacheflush is not supported until + * libseccomp 2.2.3, but configure enforces that we are using a more recent + * version on those hosts, so it is OK for this check to be less strict. + */ +#if SCMP_VER_MAJOR >= 3 + #define HAVE_CACHEFLUSH +#elif SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR >= 2 + #define HAVE_CACHEFLUSH +#endif + +struct QemuSeccompSyscall { + int32_t num; + uint8_t set; + uint8_t narg; + const struct scmp_arg_cmp *arg_cmp; + uint32_t action; +}; + +const struct scmp_arg_cmp sched_setscheduler_arg[] = { + /* was SCMP_A1(SCMP_CMP_NE, SCHED_IDLE), but expanded due to GCC 4.x bug */ + { .arg = 1, .op = SCMP_CMP_NE, .datum_a = SCHED_IDLE } +}; + +/* + * See 'NOTES' in 'man 2 clone' - s390 & cross have 'flags' in + * different position to other architectures + */ +#if defined(HOST_S390X) || defined(HOST_S390) || defined(HOST_CRIS) +#define CLONE_FLAGS_ARG 1 +#else +#define CLONE_FLAGS_ARG 0 +#endif + +#ifndef CLONE_PIDFD +# define CLONE_PIDFD 0x00001000 +#endif + +#define REQUIRE_CLONE_FLAG(flag) \ + const struct scmp_arg_cmp clone_arg ## flag[] = { \ + { .arg = CLONE_FLAGS_ARG, \ + .op = SCMP_CMP_MASKED_EQ, \ + .datum_a = flag, .datum_b = 0 } } + +#define FORBID_CLONE_FLAG(flag) \ + const struct scmp_arg_cmp clone_arg ## flag[] = { \ + { .arg = CLONE_FLAGS_ARG, \ + .op = SCMP_CMP_MASKED_EQ, \ + .datum_a = flag, .datum_b = flag } } + +#define RULE_CLONE_FLAG(flag) \ + { SCMP_SYS(clone), QEMU_SECCOMP_SET_SPAWN, \ + ARRAY_SIZE(clone_arg ## flag), clone_arg ## flag, SCMP_ACT_TRAP } + +/* If no CLONE_* flags are set, except CSIGNAL, deny */ +const struct scmp_arg_cmp clone_arg_none[] = { + { .arg = CLONE_FLAGS_ARG, + .op = SCMP_CMP_MASKED_EQ, + .datum_a = ~(CSIGNAL), .datum_b = 0 } +}; + +/* + * pthread_create should always set all of these. + */ +REQUIRE_CLONE_FLAG(CLONE_VM); +REQUIRE_CLONE_FLAG(CLONE_FS); +REQUIRE_CLONE_FLAG(CLONE_FILES); +REQUIRE_CLONE_FLAG(CLONE_SIGHAND); +REQUIRE_CLONE_FLAG(CLONE_THREAD); +REQUIRE_CLONE_FLAG(CLONE_SYSVSEM); +REQUIRE_CLONE_FLAG(CLONE_SETTLS); +REQUIRE_CLONE_FLAG(CLONE_PARENT_SETTID); +REQUIRE_CLONE_FLAG(CLONE_CHILD_CLEARTID); +/* + * Musl sets this in pthread_create too, but it is + * obsolete and harmless since its behaviour is + * subsumed under CLONE_THREAD + */ +/*REQUIRE_CLONE_FLAG(CLONE_DETACHED);*/ + + +/* + * These all indicate an attempt to spawn a process + * instead of a thread, or other undesirable scenarios + */ +FORBID_CLONE_FLAG(CLONE_PIDFD); +FORBID_CLONE_FLAG(CLONE_PTRACE); +FORBID_CLONE_FLAG(CLONE_VFORK); +FORBID_CLONE_FLAG(CLONE_PARENT); +FORBID_CLONE_FLAG(CLONE_NEWNS); +FORBID_CLONE_FLAG(CLONE_UNTRACED); +FORBID_CLONE_FLAG(CLONE_NEWCGROUP); +FORBID_CLONE_FLAG(CLONE_NEWUTS); +FORBID_CLONE_FLAG(CLONE_NEWIPC); +FORBID_CLONE_FLAG(CLONE_NEWUSER); +FORBID_CLONE_FLAG(CLONE_NEWPID); +FORBID_CLONE_FLAG(CLONE_NEWNET); +FORBID_CLONE_FLAG(CLONE_IO); + + +static const struct QemuSeccompSyscall denylist[] = { + /* default set of syscalls that should get blocked */ + { SCMP_SYS(reboot), QEMU_SECCOMP_SET_DEFAULT, + 0, NULL, SCMP_ACT_TRAP }, + { SCMP_SYS(swapon), QEMU_SECCOMP_SET_DEFAULT, + 0, NULL, SCMP_ACT_TRAP }, + { SCMP_SYS(swapoff), QEMU_SECCOMP_SET_DEFAULT, + 0, NULL, SCMP_ACT_TRAP }, + { SCMP_SYS(syslog), QEMU_SECCOMP_SET_DEFAULT, + 0, NULL, SCMP_ACT_TRAP }, + { SCMP_SYS(mount), QEMU_SECCOMP_SET_DEFAULT, + 0, NULL, SCMP_ACT_TRAP }, + { SCMP_SYS(umount), QEMU_SECCOMP_SET_DEFAULT, + 0, NULL, SCMP_ACT_TRAP }, + { SCMP_SYS(kexec_load), QEMU_SECCOMP_SET_DEFAULT, + 0, NULL, SCMP_ACT_TRAP }, + { SCMP_SYS(afs_syscall), QEMU_SECCOMP_SET_DEFAULT, + 0, NULL, SCMP_ACT_TRAP }, + { SCMP_SYS(break), QEMU_SECCOMP_SET_DEFAULT, + 0, NULL, SCMP_ACT_TRAP }, + { SCMP_SYS(ftime), QEMU_SECCOMP_SET_DEFAULT, + 0, NULL, SCMP_ACT_TRAP }, + { SCMP_SYS(getpmsg), QEMU_SECCOMP_SET_DEFAULT, + 0, NULL, SCMP_ACT_TRAP }, + { SCMP_SYS(gtty), QEMU_SECCOMP_SET_DEFAULT, + 0, NULL, SCMP_ACT_TRAP }, + { SCMP_SYS(lock), QEMU_SECCOMP_SET_DEFAULT, + 0, NULL, SCMP_ACT_TRAP }, + { SCMP_SYS(mpx), QEMU_SECCOMP_SET_DEFAULT, + 0, NULL, SCMP_ACT_TRAP }, + { SCMP_SYS(prof), QEMU_SECCOMP_SET_DEFAULT, + 0, NULL, SCMP_ACT_TRAP }, + { SCMP_SYS(profil), QEMU_SECCOMP_SET_DEFAULT, + 0, NULL, SCMP_ACT_TRAP }, + { SCMP_SYS(putpmsg), QEMU_SECCOMP_SET_DEFAULT, + 0, NULL, SCMP_ACT_TRAP }, + { SCMP_SYS(security), QEMU_SECCOMP_SET_DEFAULT, + 0, NULL, SCMP_ACT_TRAP }, + { SCMP_SYS(stty), QEMU_SECCOMP_SET_DEFAULT, + 0, NULL, SCMP_ACT_TRAP }, + { SCMP_SYS(tuxcall), QEMU_SECCOMP_SET_DEFAULT, + 0, NULL, SCMP_ACT_TRAP }, + { SCMP_SYS(ulimit), QEMU_SECCOMP_SET_DEFAULT, + 0, NULL, SCMP_ACT_TRAP }, + { SCMP_SYS(vserver), QEMU_SECCOMP_SET_DEFAULT, + 0, NULL, SCMP_ACT_TRAP }, + /* obsolete */ + { SCMP_SYS(readdir), QEMU_SECCOMP_SET_OBSOLETE, + 0, NULL, SCMP_ACT_TRAP }, + { SCMP_SYS(_sysctl), QEMU_SECCOMP_SET_OBSOLETE, + 0, NULL, SCMP_ACT_TRAP }, + { SCMP_SYS(bdflush), QEMU_SECCOMP_SET_OBSOLETE, + 0, NULL, SCMP_ACT_TRAP }, + { SCMP_SYS(create_module), QEMU_SECCOMP_SET_OBSOLETE, + 0, NULL, SCMP_ACT_TRAP }, + { SCMP_SYS(get_kernel_syms), QEMU_SECCOMP_SET_OBSOLETE, + 0, NULL, SCMP_ACT_TRAP }, + { SCMP_SYS(query_module), QEMU_SECCOMP_SET_OBSOLETE, + 0, NULL, SCMP_ACT_TRAP }, + { SCMP_SYS(sgetmask), QEMU_SECCOMP_SET_OBSOLETE, + 0, NULL, SCMP_ACT_TRAP }, + { SCMP_SYS(ssetmask), QEMU_SECCOMP_SET_OBSOLETE, + 0, NULL, SCMP_ACT_TRAP }, + { SCMP_SYS(sysfs), QEMU_SECCOMP_SET_OBSOLETE, + 0, NULL, SCMP_ACT_TRAP }, + { SCMP_SYS(uselib), QEMU_SECCOMP_SET_OBSOLETE, + 0, NULL, SCMP_ACT_TRAP }, + { SCMP_SYS(ustat), QEMU_SECCOMP_SET_OBSOLETE, + 0, NULL, SCMP_ACT_TRAP }, + /* privileged */ + { SCMP_SYS(setuid), QEMU_SECCOMP_SET_PRIVILEGED, + 0, NULL, SCMP_ACT_TRAP }, + { SCMP_SYS(setgid), QEMU_SECCOMP_SET_PRIVILEGED, + 0, NULL, SCMP_ACT_TRAP }, + { SCMP_SYS(setpgid), QEMU_SECCOMP_SET_PRIVILEGED, + 0, NULL, SCMP_ACT_TRAP }, + { SCMP_SYS(setsid), QEMU_SECCOMP_SET_PRIVILEGED, + 0, NULL, SCMP_ACT_TRAP }, + { SCMP_SYS(setreuid), QEMU_SECCOMP_SET_PRIVILEGED, + 0, NULL, SCMP_ACT_TRAP }, + { SCMP_SYS(setregid), QEMU_SECCOMP_SET_PRIVILEGED, + 0, NULL, SCMP_ACT_TRAP }, + { SCMP_SYS(setresuid), QEMU_SECCOMP_SET_PRIVILEGED, + 0, NULL, SCMP_ACT_TRAP }, + { SCMP_SYS(setresgid), QEMU_SECCOMP_SET_PRIVILEGED, + 0, NULL, SCMP_ACT_TRAP }, + { SCMP_SYS(setfsuid), QEMU_SECCOMP_SET_PRIVILEGED, + 0, NULL, SCMP_ACT_TRAP }, + { SCMP_SYS(setfsgid), QEMU_SECCOMP_SET_PRIVILEGED, + 0, NULL, SCMP_ACT_TRAP }, + /* spawn */ + { SCMP_SYS(fork), QEMU_SECCOMP_SET_SPAWN, + 0, NULL, SCMP_ACT_TRAP }, + { SCMP_SYS(vfork), QEMU_SECCOMP_SET_SPAWN, + 0, NULL, SCMP_ACT_TRAP }, + { SCMP_SYS(execve), QEMU_SECCOMP_SET_SPAWN, + 0, NULL, SCMP_ACT_TRAP }, + { SCMP_SYS(clone), QEMU_SECCOMP_SET_SPAWN, + ARRAY_SIZE(clone_arg_none), clone_arg_none, SCMP_ACT_TRAP }, + RULE_CLONE_FLAG(CLONE_VM), + RULE_CLONE_FLAG(CLONE_FS), + RULE_CLONE_FLAG(CLONE_FILES), + RULE_CLONE_FLAG(CLONE_SIGHAND), + RULE_CLONE_FLAG(CLONE_THREAD), + RULE_CLONE_FLAG(CLONE_SYSVSEM), + RULE_CLONE_FLAG(CLONE_SETTLS), + RULE_CLONE_FLAG(CLONE_PARENT_SETTID), + RULE_CLONE_FLAG(CLONE_CHILD_CLEARTID), + /*RULE_CLONE_FLAG(CLONE_DETACHED),*/ + RULE_CLONE_FLAG(CLONE_PIDFD), + RULE_CLONE_FLAG(CLONE_PTRACE), + RULE_CLONE_FLAG(CLONE_VFORK), + RULE_CLONE_FLAG(CLONE_PARENT), + RULE_CLONE_FLAG(CLONE_NEWNS), + RULE_CLONE_FLAG(CLONE_UNTRACED), + RULE_CLONE_FLAG(CLONE_NEWCGROUP), + RULE_CLONE_FLAG(CLONE_NEWUTS), + RULE_CLONE_FLAG(CLONE_NEWIPC), + RULE_CLONE_FLAG(CLONE_NEWUSER), + RULE_CLONE_FLAG(CLONE_NEWPID), + RULE_CLONE_FLAG(CLONE_NEWNET), + RULE_CLONE_FLAG(CLONE_IO), +#ifdef __SNR_clone3 + { SCMP_SYS(clone3), QEMU_SECCOMP_SET_SPAWN, + 0, NULL, SCMP_ACT_ERRNO(ENOSYS) }, +#endif +#ifdef __SNR_execveat + { SCMP_SYS(execveat), QEMU_SECCOMP_SET_SPAWN }, +#endif + { SCMP_SYS(setns), QEMU_SECCOMP_SET_SPAWN }, + { SCMP_SYS(unshare), QEMU_SECCOMP_SET_SPAWN }, + /* resource control */ + { SCMP_SYS(setpriority), QEMU_SECCOMP_SET_RESOURCECTL, + 0, NULL, SCMP_ACT_ERRNO(EPERM) }, + { SCMP_SYS(sched_setparam), QEMU_SECCOMP_SET_RESOURCECTL, + 0, NULL, SCMP_ACT_ERRNO(EPERM) }, + { SCMP_SYS(sched_setscheduler), QEMU_SECCOMP_SET_RESOURCECTL, + ARRAY_SIZE(sched_setscheduler_arg), sched_setscheduler_arg, + SCMP_ACT_ERRNO(EPERM) }, + { SCMP_SYS(sched_setaffinity), QEMU_SECCOMP_SET_RESOURCECTL, + 0, NULL, SCMP_ACT_ERRNO(EPERM) }, +}; + +static inline __attribute__((unused)) int +qemu_seccomp(unsigned int operation, unsigned int flags, void *args) +{ +#ifdef __NR_seccomp + return syscall(__NR_seccomp, operation, flags, args); +#else + errno = ENOSYS; + return -1; +#endif +} + +static uint32_t qemu_seccomp_update_action(uint32_t action) +{ +#if defined(SECCOMP_GET_ACTION_AVAIL) && defined(SCMP_ACT_KILL_PROCESS) && \ + defined(SECCOMP_RET_KILL_PROCESS) + if (action == SCMP_ACT_TRAP) { + static int kill_process = -1; + if (kill_process == -1) { + uint32_t testaction = SECCOMP_RET_KILL_PROCESS; + + if (qemu_seccomp(SECCOMP_GET_ACTION_AVAIL, 0, &testaction) == 0) { + kill_process = 1; + } else { + kill_process = 0; + } + } + if (kill_process == 1) { + return SCMP_ACT_KILL_PROCESS; + } + } +#endif + return action; +} + + +static int seccomp_start(uint32_t seccomp_opts, Error **errp) +{ + int rc = -1; + unsigned int i = 0; + scmp_filter_ctx ctx; + + ctx = seccomp_init(SCMP_ACT_ALLOW); + if (ctx == NULL) { + error_setg(errp, "failed to initialize seccomp context"); + goto seccomp_return; + } + +#if defined(CONFIG_SECCOMP_SYSRAWRC) + /* + * This must be the first seccomp_attr_set() call to have full + * error propagation from subsequent seccomp APIs. + */ + rc = seccomp_attr_set(ctx, SCMP_FLTATR_API_SYSRAWRC, 1); + if (rc != 0) { + error_setg_errno(errp, -rc, + "failed to set seccomp rawrc attribute"); + goto seccomp_return; + } +#endif + + rc = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_TSYNC, 1); + if (rc != 0) { + error_setg_errno(errp, -rc, + "failed to set seccomp thread synchronization"); + goto seccomp_return; + } + + for (i = 0; i < ARRAY_SIZE(denylist); i++) { + uint32_t action; + if (!(seccomp_opts & denylist[i].set)) { + continue; + } + + action = qemu_seccomp_update_action(denylist[i].action); + rc = seccomp_rule_add_array(ctx, action, denylist[i].num, + denylist[i].narg, denylist[i].arg_cmp); + if (rc < 0) { + error_setg_errno(errp, -rc, + "failed to add seccomp denylist rules"); + goto seccomp_return; + } + } + + rc = seccomp_load(ctx); + if (rc < 0) { + error_setg_errno(errp, -rc, + "failed to load seccomp syscall filter in kernel"); + } + + seccomp_return: + seccomp_release(ctx); + return rc < 0 ? -1 : 0; +} + +int parse_sandbox(void *opaque, QemuOpts *opts, Error **errp) +{ + if (qemu_opt_get_bool(opts, "enable", false)) { + uint32_t seccomp_opts = QEMU_SECCOMP_SET_DEFAULT + | QEMU_SECCOMP_SET_OBSOLETE; + const char *value = NULL; + + value = qemu_opt_get(opts, "obsolete"); + if (value) { + if (g_str_equal(value, "allow")) { + seccomp_opts &= ~QEMU_SECCOMP_SET_OBSOLETE; + } else if (g_str_equal(value, "deny")) { + /* this is the default option, this if is here + * to provide a little bit of consistency for + * the command line */ + } else { + error_setg(errp, "invalid argument for obsolete"); + return -1; + } + } + + value = qemu_opt_get(opts, "elevateprivileges"); + if (value) { + if (g_str_equal(value, "deny")) { + seccomp_opts |= QEMU_SECCOMP_SET_PRIVILEGED; + } else if (g_str_equal(value, "children")) { + seccomp_opts |= QEMU_SECCOMP_SET_PRIVILEGED; + + /* calling prctl directly because we're + * not sure if host has CAP_SYS_ADMIN set*/ + if (prctl(PR_SET_NO_NEW_PRIVS, 1)) { + error_setg(errp, "failed to set no_new_privs aborting"); + return -1; + } + } else if (g_str_equal(value, "allow")) { + /* default value */ + } else { + error_setg(errp, "invalid argument for elevateprivileges"); + return -1; + } + } + + value = qemu_opt_get(opts, "spawn"); + if (value) { + if (g_str_equal(value, "deny")) { + seccomp_opts |= QEMU_SECCOMP_SET_SPAWN; + } else if (g_str_equal(value, "allow")) { + /* default value */ + } else { + error_setg(errp, "invalid argument for spawn"); + return -1; + } + } + + value = qemu_opt_get(opts, "resourcecontrol"); + if (value) { + if (g_str_equal(value, "deny")) { + seccomp_opts |= QEMU_SECCOMP_SET_RESOURCECTL; + } else if (g_str_equal(value, "allow")) { + /* default value */ + } else { + error_setg(errp, "invalid argument for resourcecontrol"); + return -1; + } + } + + if (seccomp_start(seccomp_opts, errp) < 0) { + return -1; + } + } + + return 0; +} + +static QemuOptsList qemu_sandbox_opts = { + .name = "sandbox", + .implied_opt_name = "enable", + .head = QTAILQ_HEAD_INITIALIZER(qemu_sandbox_opts.head), + .desc = { + { + .name = "enable", + .type = QEMU_OPT_BOOL, + }, + { + .name = "obsolete", + .type = QEMU_OPT_STRING, + }, + { + .name = "elevateprivileges", + .type = QEMU_OPT_STRING, + }, + { + .name = "spawn", + .type = QEMU_OPT_STRING, + }, + { + .name = "resourcecontrol", + .type = QEMU_OPT_STRING, + }, + { /* end of list */ } + }, +}; + +static void seccomp_register(void) +{ + bool add = false; + + /* FIXME: use seccomp_api_get() >= 2 check when released */ + +#if defined(SECCOMP_FILTER_FLAG_TSYNC) + int check; + + /* check host TSYNC capability, it returns errno == ENOSYS if unavailable */ + check = qemu_seccomp(SECCOMP_SET_MODE_FILTER, + SECCOMP_FILTER_FLAG_TSYNC, NULL); + if (check < 0 && errno == EFAULT) { + add = true; + } +#endif + + if (add) { + qemu_add_opts(&qemu_sandbox_opts); + } +} +opts_init(seccomp_register); diff --git a/system/qtest.c b/system/qtest.c new file mode 100644 index 0000000000..35b643a274 --- /dev/null +++ b/system/qtest.c @@ -0,0 +1,1070 @@ +/* + * Test Server + * + * Copyright IBM, Corp. 2011 + * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "sysemu/qtest.h" +#include "sysemu/runstate.h" +#include "chardev/char-fe.h" +#include "exec/ioport.h" +#include "exec/memory.h" +#include "exec/tswap.h" +#include "hw/qdev-core.h" +#include "hw/irq.h" +#include "qemu/accel.h" +#include "sysemu/cpu-timers.h" +#include "qemu/config-file.h" +#include "qemu/option.h" +#include "qemu/error-report.h" +#include "qemu/module.h" +#include "qemu/cutils.h" +#include "qom/object_interfaces.h" + +#define MAX_IRQ 256 + +#define TYPE_QTEST "qtest" + +OBJECT_DECLARE_SIMPLE_TYPE(QTest, QTEST) + +struct QTest { + Object parent; + + bool has_machine_link; + char *chr_name; + Chardev *chr; + CharBackend qtest_chr; + char *log; +}; + +bool qtest_allowed; + +static DeviceState *irq_intercept_dev; +static FILE *qtest_log_fp; +static QTest *qtest; +static GString *inbuf; +static int irq_levels[MAX_IRQ]; +static GTimer *timer; +static bool qtest_opened; +static void (*qtest_server_send)(void*, const char*); +static void *qtest_server_send_opaque; + +#define FMT_timeval "%.06f" + +/** + * DOC: QTest Protocol + * + * Line based protocol, request/response based. Server can send async messages + * so clients should always handle many async messages before the response + * comes in. + * + * Valid requests + * ^^^^^^^^^^^^^^ + * + * Clock management: + * """"""""""""""""" + * + * The qtest client is completely in charge of the QEMU_CLOCK_VIRTUAL. qtest commands + * let you adjust the value of the clock (monotonically). All the commands + * return the current value of the clock in nanoseconds. + * + * .. code-block:: none + * + * > clock_step + * < OK VALUE + * + * Advance the clock to the next deadline. Useful when waiting for + * asynchronous events. + * + * .. code-block:: none + * + * > clock_step NS + * < OK VALUE + * + * Advance the clock by NS nanoseconds. + * + * .. code-block:: none + * + * > clock_set NS + * < OK VALUE + * + * Advance the clock to NS nanoseconds (do nothing if it's already past). + * + * PIO and memory access: + * """""""""""""""""""""" + * + * .. code-block:: none + * + * > outb ADDR VALUE + * < OK + * + * .. code-block:: none + * + * > outw ADDR VALUE + * < OK + * + * .. code-block:: none + * + * > outl ADDR VALUE + * < OK + * + * .. code-block:: none + * + * > inb ADDR + * < OK VALUE + * + * .. code-block:: none + * + * > inw ADDR + * < OK VALUE + * + * .. code-block:: none + * + * > inl ADDR + * < OK VALUE + * + * .. code-block:: none + * + * > writeb ADDR VALUE + * < OK + * + * .. code-block:: none + * + * > writew ADDR VALUE + * < OK + * + * .. code-block:: none + * + * > writel ADDR VALUE + * < OK + * + * .. code-block:: none + * + * > writeq ADDR VALUE + * < OK + * + * .. code-block:: none + * + * > readb ADDR + * < OK VALUE + * + * .. code-block:: none + * + * > readw ADDR + * < OK VALUE + * + * .. code-block:: none + * + * > readl ADDR + * < OK VALUE + * + * .. code-block:: none + * + * > readq ADDR + * < OK VALUE + * + * .. code-block:: none + * + * > read ADDR SIZE + * < OK DATA + * + * .. code-block:: none + * + * > write ADDR SIZE DATA + * < OK + * + * .. code-block:: none + * + * > b64read ADDR SIZE + * < OK B64_DATA + * + * .. code-block:: none + * + * > b64write ADDR SIZE B64_DATA + * < OK + * + * .. code-block:: none + * + * > memset ADDR SIZE VALUE + * < OK + * + * ADDR, SIZE, VALUE are all integers parsed with strtoul() with a base of 0. + * For 'memset' a zero size is permitted and does nothing. + * + * DATA is an arbitrarily long hex number prefixed with '0x'. If it's smaller + * than the expected size, the value will be zero filled at the end of the data + * sequence. + * + * B64_DATA is an arbitrarily long base64 encoded string. + * If the sizes do not match, the data will be truncated. + * + * IRQ management: + * """"""""""""""" + * + * .. code-block:: none + * + * > irq_intercept_in QOM-PATH + * < OK + * + * .. code-block:: none + * + * > irq_intercept_out QOM-PATH + * < OK + * + * Attach to the gpio-in (resp. gpio-out) pins exported by the device at + * QOM-PATH. When the pin is triggered, one of the following async messages + * will be printed to the qtest stream:: + * + * IRQ raise NUM + * IRQ lower NUM + * + * where NUM is an IRQ number. For the PC, interrupts can be intercepted + * simply with "irq_intercept_in ioapic" (note that IRQ0 comes out with + * NUM=0 even though it is remapped to GSI 2). + * + * Setting interrupt level: + * """""""""""""""""""""""" + * + * .. code-block:: none + * + * > set_irq_in QOM-PATH NAME NUM LEVEL + * < OK + * + * where NAME is the name of the irq/gpio list, NUM is an IRQ number and + * LEVEL is an signed integer IRQ level. + * + * Forcibly set the given interrupt pin to the given level. + * + */ + +static int hex2nib(char ch) +{ + if (ch >= '0' && ch <= '9') { + return ch - '0'; + } else if (ch >= 'a' && ch <= 'f') { + return 10 + (ch - 'a'); + } else if (ch >= 'A' && ch <= 'F') { + return 10 + (ch - 'A'); + } else { + return -1; + } +} + +void qtest_send_prefix(CharBackend *chr) +{ + if (!qtest_log_fp || !qtest_opened) { + return; + } + + fprintf(qtest_log_fp, "[S +" FMT_timeval "] ", g_timer_elapsed(timer, NULL)); +} + +static void G_GNUC_PRINTF(1, 2) qtest_log_send(const char *fmt, ...) +{ + va_list ap; + + if (!qtest_log_fp || !qtest_opened) { + return; + } + + qtest_send_prefix(NULL); + + va_start(ap, fmt); + vfprintf(qtest_log_fp, fmt, ap); + va_end(ap); +} + +static void qtest_server_char_be_send(void *opaque, const char *str) +{ + size_t len = strlen(str); + CharBackend* chr = (CharBackend *)opaque; + qemu_chr_fe_write_all(chr, (uint8_t *)str, len); + if (qtest_log_fp && qtest_opened) { + fprintf(qtest_log_fp, "%s", str); + } +} + +static void qtest_send(CharBackend *chr, const char *str) +{ + qtest_server_send(qtest_server_send_opaque, str); +} + +void qtest_sendf(CharBackend *chr, const char *fmt, ...) +{ + va_list ap; + gchar *buffer; + + va_start(ap, fmt); + buffer = g_strdup_vprintf(fmt, ap); + qtest_send(chr, buffer); + g_free(buffer); + va_end(ap); +} + +static void qtest_irq_handler(void *opaque, int n, int level) +{ + qemu_irq old_irq = *(qemu_irq *)opaque; + qemu_set_irq(old_irq, level); + + if (irq_levels[n] != level) { + CharBackend *chr = &qtest->qtest_chr; + irq_levels[n] = level; + qtest_send_prefix(chr); + qtest_sendf(chr, "IRQ %s %d\n", + level ? "raise" : "lower", n); + } +} + +static int64_t qtest_clock_counter; + +int64_t qtest_get_virtual_clock(void) +{ + return qatomic_read_i64(&qtest_clock_counter); +} + +static void qtest_set_virtual_clock(int64_t count) +{ + qatomic_set_i64(&qtest_clock_counter, count); +} + +static void qtest_clock_warp(int64_t dest) +{ + int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL); + AioContext *aio_context; + assert(qtest_enabled()); + aio_context = qemu_get_aio_context(); + while (clock < dest) { + int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL, + QEMU_TIMER_ATTR_ALL); + int64_t warp = qemu_soonest_timeout(dest - clock, deadline); + + qtest_set_virtual_clock(qtest_get_virtual_clock() + warp); + + qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL); + timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]); + clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL); + } + qemu_clock_notify(QEMU_CLOCK_VIRTUAL); +} + +static bool (*process_command_cb)(CharBackend *chr, gchar **words); + +void qtest_set_command_cb(bool (*pc_cb)(CharBackend *chr, gchar **words)) +{ + assert(!process_command_cb); /* Switch to a list if we need more than one */ + + process_command_cb = pc_cb; +} + +static void qtest_install_gpio_out_intercept(DeviceState *dev, const char *name, int n) +{ + qemu_irq *disconnected = g_new0(qemu_irq, 1); + qemu_irq icpt = qemu_allocate_irq(qtest_irq_handler, + disconnected, n); + + *disconnected = qdev_intercept_gpio_out(dev, icpt, name, n); +} + +static void qtest_process_command(CharBackend *chr, gchar **words) +{ + const gchar *command; + + g_assert(words); + + command = words[0]; + + if (qtest_log_fp) { + int i; + + fprintf(qtest_log_fp, "[R +" FMT_timeval "]", g_timer_elapsed(timer, NULL)); + for (i = 0; words[i]; i++) { + fprintf(qtest_log_fp, " %s", words[i]); + } + fprintf(qtest_log_fp, "\n"); + } + + g_assert(command); + if (strcmp(words[0], "irq_intercept_out") == 0 + || strcmp(words[0], "irq_intercept_in") == 0) { + DeviceState *dev; + NamedGPIOList *ngl; + bool is_named; + bool is_outbound; + bool interception_succeeded = false; + + g_assert(words[1]); + is_named = words[2] != NULL; + is_outbound = words[0][14] == 'o'; + dev = DEVICE(object_resolve_path(words[1], NULL)); + if (!dev) { + qtest_send_prefix(chr); + qtest_send(chr, "FAIL Unknown device\n"); + return; + } + + if (is_named && !is_outbound) { + qtest_send_prefix(chr); + qtest_send(chr, "FAIL Interception of named in-GPIOs not yet supported\n"); + return; + } + + if (irq_intercept_dev) { + qtest_send_prefix(chr); + if (irq_intercept_dev != dev) { + qtest_send(chr, "FAIL IRQ intercept already enabled\n"); + } else { + qtest_send(chr, "OK\n"); + } + return; + } + + QLIST_FOREACH(ngl, &dev->gpios, node) { + /* We don't support inbound interception of named GPIOs yet */ + if (is_outbound) { + /* NULL is valid and matchable, for "unnamed GPIO" */ + if (g_strcmp0(ngl->name, words[2]) == 0) { + int i; + for (i = 0; i < ngl->num_out; ++i) { + qtest_install_gpio_out_intercept(dev, ngl->name, i); + } + interception_succeeded = true; + } + } else { + qemu_irq_intercept_in(ngl->in, qtest_irq_handler, + ngl->num_in); + interception_succeeded = true; + } + } + + qtest_send_prefix(chr); + if (interception_succeeded) { + irq_intercept_dev = dev; + qtest_send(chr, "OK\n"); + } else { + qtest_send(chr, "FAIL No intercepts installed\n"); + } + } else if (strcmp(words[0], "set_irq_in") == 0) { + DeviceState *dev; + qemu_irq irq; + char *name; + int ret; + int num; + int level; + + g_assert(words[1] && words[2] && words[3] && words[4]); + + dev = DEVICE(object_resolve_path(words[1], NULL)); + if (!dev) { + qtest_send_prefix(chr); + qtest_send(chr, "FAIL Unknown device\n"); + return; + } + + if (strcmp(words[2], "unnamed-gpio-in") == 0) { + name = NULL; + } else { + name = words[2]; + } + + ret = qemu_strtoi(words[3], NULL, 0, &num); + g_assert(!ret); + ret = qemu_strtoi(words[4], NULL, 0, &level); + g_assert(!ret); + + irq = qdev_get_gpio_in_named(dev, name, num); + + qemu_set_irq(irq, level); + qtest_send_prefix(chr); + qtest_send(chr, "OK\n"); + } else if (strcmp(words[0], "outb") == 0 || + strcmp(words[0], "outw") == 0 || + strcmp(words[0], "outl") == 0) { + unsigned long addr; + unsigned long value; + int ret; + + g_assert(words[1] && words[2]); + ret = qemu_strtoul(words[1], NULL, 0, &addr); + g_assert(ret == 0); + ret = qemu_strtoul(words[2], NULL, 0, &value); + g_assert(ret == 0); + g_assert(addr <= 0xffff); + + if (words[0][3] == 'b') { + cpu_outb(addr, value); + } else if (words[0][3] == 'w') { + cpu_outw(addr, value); + } else if (words[0][3] == 'l') { + cpu_outl(addr, value); + } + qtest_send_prefix(chr); + qtest_send(chr, "OK\n"); + } else if (strcmp(words[0], "inb") == 0 || + strcmp(words[0], "inw") == 0 || + strcmp(words[0], "inl") == 0) { + unsigned long addr; + uint32_t value = -1U; + int ret; + + g_assert(words[1]); + ret = qemu_strtoul(words[1], NULL, 0, &addr); + g_assert(ret == 0); + g_assert(addr <= 0xffff); + + if (words[0][2] == 'b') { + value = cpu_inb(addr); + } else if (words[0][2] == 'w') { + value = cpu_inw(addr); + } else if (words[0][2] == 'l') { + value = cpu_inl(addr); + } + qtest_send_prefix(chr); + qtest_sendf(chr, "OK 0x%04x\n", value); + } else if (strcmp(words[0], "writeb") == 0 || + strcmp(words[0], "writew") == 0 || + strcmp(words[0], "writel") == 0 || + strcmp(words[0], "writeq") == 0) { + uint64_t addr; + uint64_t value; + int ret; + + g_assert(words[1] && words[2]); + ret = qemu_strtou64(words[1], NULL, 0, &addr); + g_assert(ret == 0); + ret = qemu_strtou64(words[2], NULL, 0, &value); + g_assert(ret == 0); + + if (words[0][5] == 'b') { + uint8_t data = value; + address_space_write(first_cpu->as, addr, MEMTXATTRS_UNSPECIFIED, + &data, 1); + } else if (words[0][5] == 'w') { + uint16_t data = value; + tswap16s(&data); + address_space_write(first_cpu->as, addr, MEMTXATTRS_UNSPECIFIED, + &data, 2); + } else if (words[0][5] == 'l') { + uint32_t data = value; + tswap32s(&data); + address_space_write(first_cpu->as, addr, MEMTXATTRS_UNSPECIFIED, + &data, 4); + } else if (words[0][5] == 'q') { + uint64_t data = value; + tswap64s(&data); + address_space_write(first_cpu->as, addr, MEMTXATTRS_UNSPECIFIED, + &data, 8); + } + qtest_send_prefix(chr); + qtest_send(chr, "OK\n"); + } else if (strcmp(words[0], "readb") == 0 || + strcmp(words[0], "readw") == 0 || + strcmp(words[0], "readl") == 0 || + strcmp(words[0], "readq") == 0) { + uint64_t addr; + uint64_t value = UINT64_C(-1); + int ret; + + g_assert(words[1]); + ret = qemu_strtou64(words[1], NULL, 0, &addr); + g_assert(ret == 0); + + if (words[0][4] == 'b') { + uint8_t data; + address_space_read(first_cpu->as, addr, MEMTXATTRS_UNSPECIFIED, + &data, 1); + value = data; + } else if (words[0][4] == 'w') { + uint16_t data; + address_space_read(first_cpu->as, addr, MEMTXATTRS_UNSPECIFIED, + &data, 2); + value = tswap16(data); + } else if (words[0][4] == 'l') { + uint32_t data; + address_space_read(first_cpu->as, addr, MEMTXATTRS_UNSPECIFIED, + &data, 4); + value = tswap32(data); + } else if (words[0][4] == 'q') { + address_space_read(first_cpu->as, addr, MEMTXATTRS_UNSPECIFIED, + &value, 8); + tswap64s(&value); + } + qtest_send_prefix(chr); + qtest_sendf(chr, "OK 0x%016" PRIx64 "\n", value); + } else if (strcmp(words[0], "read") == 0) { + uint64_t addr, len, i; + uint8_t *data; + char *enc; + int ret; + + g_assert(words[1] && words[2]); + ret = qemu_strtou64(words[1], NULL, 0, &addr); + g_assert(ret == 0); + ret = qemu_strtou64(words[2], NULL, 0, &len); + g_assert(ret == 0); + /* We'd send garbage to libqtest if len is 0 */ + g_assert(len); + + data = g_malloc(len); + address_space_read(first_cpu->as, addr, MEMTXATTRS_UNSPECIFIED, data, + len); + + enc = g_malloc(2 * len + 1); + for (i = 0; i < len; i++) { + sprintf(&enc[i * 2], "%02x", data[i]); + } + + qtest_send_prefix(chr); + qtest_sendf(chr, "OK 0x%s\n", enc); + + g_free(data); + g_free(enc); + } else if (strcmp(words[0], "b64read") == 0) { + uint64_t addr, len; + uint8_t *data; + gchar *b64_data; + int ret; + + g_assert(words[1] && words[2]); + ret = qemu_strtou64(words[1], NULL, 0, &addr); + g_assert(ret == 0); + ret = qemu_strtou64(words[2], NULL, 0, &len); + g_assert(ret == 0); + + data = g_malloc(len); + address_space_read(first_cpu->as, addr, MEMTXATTRS_UNSPECIFIED, data, + len); + b64_data = g_base64_encode(data, len); + qtest_send_prefix(chr); + qtest_sendf(chr, "OK %s\n", b64_data); + + g_free(data); + g_free(b64_data); + } else if (strcmp(words[0], "write") == 0) { + uint64_t addr, len, i; + uint8_t *data; + size_t data_len; + int ret; + + g_assert(words[1] && words[2] && words[3]); + ret = qemu_strtou64(words[1], NULL, 0, &addr); + g_assert(ret == 0); + ret = qemu_strtou64(words[2], NULL, 0, &len); + g_assert(ret == 0); + + data_len = strlen(words[3]); + if (data_len < 3) { + qtest_send(chr, "ERR invalid argument size\n"); + return; + } + + data = g_malloc(len); + for (i = 0; i < len; i++) { + if ((i * 2 + 4) <= data_len) { + data[i] = hex2nib(words[3][i * 2 + 2]) << 4; + data[i] |= hex2nib(words[3][i * 2 + 3]); + } else { + data[i] = 0; + } + } + address_space_write(first_cpu->as, addr, MEMTXATTRS_UNSPECIFIED, data, + len); + g_free(data); + + qtest_send_prefix(chr); + qtest_send(chr, "OK\n"); + } else if (strcmp(words[0], "memset") == 0) { + uint64_t addr, len; + uint8_t *data; + unsigned long pattern; + int ret; + + g_assert(words[1] && words[2] && words[3]); + ret = qemu_strtou64(words[1], NULL, 0, &addr); + g_assert(ret == 0); + ret = qemu_strtou64(words[2], NULL, 0, &len); + g_assert(ret == 0); + ret = qemu_strtoul(words[3], NULL, 0, &pattern); + g_assert(ret == 0); + + if (len) { + data = g_malloc(len); + memset(data, pattern, len); + address_space_write(first_cpu->as, addr, MEMTXATTRS_UNSPECIFIED, + data, len); + g_free(data); + } + + qtest_send_prefix(chr); + qtest_send(chr, "OK\n"); + } else if (strcmp(words[0], "b64write") == 0) { + uint64_t addr, len; + uint8_t *data; + size_t data_len; + gsize out_len; + int ret; + + g_assert(words[1] && words[2] && words[3]); + ret = qemu_strtou64(words[1], NULL, 0, &addr); + g_assert(ret == 0); + ret = qemu_strtou64(words[2], NULL, 0, &len); + g_assert(ret == 0); + + data_len = strlen(words[3]); + if (data_len < 3) { + qtest_send(chr, "ERR invalid argument size\n"); + return; + } + + data = g_base64_decode_inplace(words[3], &out_len); + if (out_len != len) { + qtest_log_send("b64write: data length mismatch (told %"PRIu64", " + "found %zu)\n", + len, out_len); + out_len = MIN(out_len, len); + } + + address_space_write(first_cpu->as, addr, MEMTXATTRS_UNSPECIFIED, data, + len); + + qtest_send_prefix(chr); + qtest_send(chr, "OK\n"); + } else if (strcmp(words[0], "endianness") == 0) { + qtest_send_prefix(chr); + if (target_words_bigendian()) { + qtest_sendf(chr, "OK big\n"); + } else { + qtest_sendf(chr, "OK little\n"); + } + } else if (qtest_enabled() && strcmp(words[0], "clock_step") == 0) { + int64_t ns; + + if (words[1]) { + int ret = qemu_strtoi64(words[1], NULL, 0, &ns); + g_assert(ret == 0); + } else { + ns = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL, + QEMU_TIMER_ATTR_ALL); + } + qtest_clock_warp(qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + ns); + qtest_send_prefix(chr); + qtest_sendf(chr, "OK %"PRIi64"\n", + (int64_t)qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL)); + } else if (strcmp(words[0], "module_load") == 0) { + Error *local_err = NULL; + int rv; + g_assert(words[1] && words[2]); + + qtest_send_prefix(chr); + rv = module_load(words[1], words[2], &local_err); + if (rv > 0) { + qtest_sendf(chr, "OK\n"); + } else { + if (rv < 0) { + error_report_err(local_err); + } + qtest_sendf(chr, "FAIL\n"); + } + } else if (qtest_enabled() && strcmp(words[0], "clock_set") == 0) { + int64_t ns; + int ret; + + g_assert(words[1]); + ret = qemu_strtoi64(words[1], NULL, 0, &ns); + g_assert(ret == 0); + qtest_clock_warp(ns); + qtest_send_prefix(chr); + qtest_sendf(chr, "OK %"PRIi64"\n", + (int64_t)qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL)); + } else if (process_command_cb && process_command_cb(chr, words)) { + /* Command got consumed by the callback handler */ + } else { + qtest_send_prefix(chr); + qtest_sendf(chr, "FAIL Unknown command '%s'\n", words[0]); + } +} + +static void qtest_process_inbuf(CharBackend *chr, GString *inbuf) +{ + char *end; + + while ((end = strchr(inbuf->str, '\n')) != NULL) { + size_t offset; + GString *cmd; + gchar **words; + + offset = end - inbuf->str; + + cmd = g_string_new_len(inbuf->str, offset); + g_string_erase(inbuf, 0, offset + 1); + + words = g_strsplit(cmd->str, " ", 0); + qtest_process_command(chr, words); + g_strfreev(words); + + g_string_free(cmd, TRUE); + } +} + +static void qtest_read(void *opaque, const uint8_t *buf, int size) +{ + CharBackend *chr = opaque; + + g_string_append_len(inbuf, (const gchar *)buf, size); + qtest_process_inbuf(chr, inbuf); +} + +static int qtest_can_read(void *opaque) +{ + return 1024; +} + +static void qtest_event(void *opaque, QEMUChrEvent event) +{ + int i; + + switch (event) { + case CHR_EVENT_OPENED: + /* + * We used to call qemu_system_reset() here, hoping we could + * use the same process for multiple tests that way. Never + * used. Injects an extra reset even when it's not used, and + * that can mess up tests, e.g. -boot once. + */ + for (i = 0; i < ARRAY_SIZE(irq_levels); i++) { + irq_levels[i] = 0; + } + + g_clear_pointer(&timer, g_timer_destroy); + timer = g_timer_new(); + qtest_opened = true; + if (qtest_log_fp) { + fprintf(qtest_log_fp, "[I " FMT_timeval "] OPENED\n", g_timer_elapsed(timer, NULL)); + } + break; + case CHR_EVENT_CLOSED: + qtest_opened = false; + if (qtest_log_fp) { + fprintf(qtest_log_fp, "[I +" FMT_timeval "] CLOSED\n", g_timer_elapsed(timer, NULL)); + } + g_clear_pointer(&timer, g_timer_destroy); + break; + default: + break; + } +} + +void qtest_server_init(const char *qtest_chrdev, const char *qtest_log, Error **errp) +{ + ERRP_GUARD(); + Chardev *chr; + Object *qtest; + + chr = qemu_chr_new("qtest", qtest_chrdev, NULL); + if (chr == NULL) { + error_setg(errp, "Failed to initialize device for qtest: \"%s\"", + qtest_chrdev); + return; + } + + qtest = object_new(TYPE_QTEST); + object_property_set_str(qtest, "chardev", chr->label, &error_abort); + if (qtest_log) { + object_property_set_str(qtest, "log", qtest_log, &error_abort); + } + object_property_add_child(qdev_get_machine(), "qtest", qtest); + user_creatable_complete(USER_CREATABLE(qtest), errp); + if (*errp) { + object_unparent(qtest); + } + object_unref(OBJECT(chr)); + object_unref(qtest); +} + +static bool qtest_server_start(QTest *q, Error **errp) +{ + Chardev *chr = q->chr; + const char *qtest_log = q->log; + + if (qtest_log) { + if (strcmp(qtest_log, "none") != 0) { + qtest_log_fp = fopen(qtest_log, "w+"); + } + } else { + qtest_log_fp = stderr; + } + + if (!qemu_chr_fe_init(&q->qtest_chr, chr, errp)) { + return false; + } + qemu_chr_fe_set_handlers(&q->qtest_chr, qtest_can_read, qtest_read, + qtest_event, NULL, &q->qtest_chr, NULL, true); + qemu_chr_fe_set_echo(&q->qtest_chr, true); + + inbuf = g_string_new(""); + + if (!qtest_server_send) { + qtest_server_set_send_handler(qtest_server_char_be_send, &q->qtest_chr); + } + qtest = q; + return true; +} + +void qtest_server_set_send_handler(void (*send)(void*, const char*), + void *opaque) +{ + qtest_server_send = send; + qtest_server_send_opaque = opaque; +} + +bool qtest_driver(void) +{ + return qtest && qtest->qtest_chr.chr != NULL; +} + +void qtest_server_inproc_recv(void *dummy, const char *buf) +{ + static GString *gstr; + if (!gstr) { + gstr = g_string_new(NULL); + } + g_string_append(gstr, buf); + if (gstr->str[gstr->len - 1] == '\n') { + qtest_process_inbuf(NULL, gstr); + g_string_truncate(gstr, 0); + } +} + +static void qtest_complete(UserCreatable *uc, Error **errp) +{ + QTest *q = QTEST(uc); + if (qtest) { + error_setg(errp, "Only one instance of qtest can be created"); + return; + } + if (!q->chr_name) { + error_setg(errp, "No backend specified"); + return; + } + + if (OBJECT(uc)->parent != qdev_get_machine()) { + q->has_machine_link = true; + object_property_add_const_link(qdev_get_machine(), "qtest", OBJECT(uc)); + } else { + /* -qtest was used. */ + } + + qtest_server_start(q, errp); +} + +static void qtest_unparent(Object *obj) +{ + QTest *q = QTEST(obj); + + if (qtest == q) { + qemu_chr_fe_disconnect(&q->qtest_chr); + assert(!qtest_opened); + qemu_chr_fe_deinit(&q->qtest_chr, false); + if (qtest_log_fp) { + fclose(qtest_log_fp); + qtest_log_fp = NULL; + } + qtest = NULL; + } + + if (q->has_machine_link) { + object_property_del(qdev_get_machine(), "qtest"); + q->has_machine_link = false; + } +} + +static void qtest_set_log(Object *obj, const char *value, Error **errp) +{ + QTest *q = QTEST(obj); + + if (qtest == q) { + error_setg(errp, "Property 'log' can not be set now"); + } else { + g_free(q->log); + q->log = g_strdup(value); + } +} + +static char *qtest_get_log(Object *obj, Error **errp) +{ + QTest *q = QTEST(obj); + + return g_strdup(q->log); +} + +static void qtest_set_chardev(Object *obj, const char *value, Error **errp) +{ + QTest *q = QTEST(obj); + Chardev *chr; + + if (qtest == q) { + error_setg(errp, "Property 'chardev' can not be set now"); + return; + } + + chr = qemu_chr_find(value); + if (!chr) { + error_setg(errp, "Cannot find character device '%s'", value); + return; + } + + g_free(q->chr_name); + q->chr_name = g_strdup(value); + + if (q->chr) { + object_unref(q->chr); + } + q->chr = chr; + object_ref(chr); +} + +static char *qtest_get_chardev(Object *obj, Error **errp) +{ + QTest *q = QTEST(obj); + + return g_strdup(q->chr_name); +} + +static void qtest_class_init(ObjectClass *oc, void *data) +{ + UserCreatableClass *ucc = USER_CREATABLE_CLASS(oc); + + oc->unparent = qtest_unparent; + ucc->complete = qtest_complete; + + object_class_property_add_str(oc, "chardev", + qtest_get_chardev, qtest_set_chardev); + object_class_property_add_str(oc, "log", + qtest_get_log, qtest_set_log); +} + +static const TypeInfo qtest_info = { + .name = TYPE_QTEST, + .parent = TYPE_OBJECT, + .class_init = qtest_class_init, + .instance_size = sizeof(QTest), + .interfaces = (InterfaceInfo[]) { + { TYPE_USER_CREATABLE }, + { } + } +}; + +static void register_types(void) +{ + type_register_static(&qtest_info); +} + +type_init(register_types); diff --git a/system/rtc.c b/system/rtc.c new file mode 100644 index 0000000000..4904581abe --- /dev/null +++ b/system/rtc.c @@ -0,0 +1,192 @@ +/* + * RTC configuration and clock read + * + * Copyright (c) 2003-2020 QEMU contributors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "qemu/osdep.h" +#include "qemu/cutils.h" +#include "qapi/error.h" +#include "qapi/qmp/qerror.h" +#include "qemu/error-report.h" +#include "qemu/option.h" +#include "qemu/timer.h" +#include "qom/object.h" +#include "sysemu/replay.h" +#include "sysemu/sysemu.h" +#include "sysemu/rtc.h" +#include "hw/rtc/mc146818rtc.h" + +static enum { + RTC_BASE_UTC, + RTC_BASE_LOCALTIME, + RTC_BASE_DATETIME, +} rtc_base_type = RTC_BASE_UTC; +static time_t rtc_ref_start_datetime; +static int rtc_realtime_clock_offset; /* used only with QEMU_CLOCK_REALTIME */ +static int rtc_host_datetime_offset = -1; /* valid & used only with + RTC_BASE_DATETIME */ +QEMUClockType rtc_clock; +/***********************************************************/ +/* RTC reference time/date access */ +static time_t qemu_ref_timedate(QEMUClockType clock) +{ + time_t value = qemu_clock_get_ms(clock) / 1000; + switch (clock) { + case QEMU_CLOCK_REALTIME: + value -= rtc_realtime_clock_offset; + /* fall through */ + case QEMU_CLOCK_VIRTUAL: + value += rtc_ref_start_datetime; + break; + case QEMU_CLOCK_HOST: + if (rtc_base_type == RTC_BASE_DATETIME) { + value -= rtc_host_datetime_offset; + } + break; + default: + assert(0); + } + return value; +} + +void qemu_get_timedate(struct tm *tm, time_t offset) +{ + time_t ti = qemu_ref_timedate(rtc_clock); + + ti += offset; + + switch (rtc_base_type) { + case RTC_BASE_DATETIME: + case RTC_BASE_UTC: + gmtime_r(&ti, tm); + break; + case RTC_BASE_LOCALTIME: + localtime_r(&ti, tm); + break; + } +} + +time_t qemu_timedate_diff(struct tm *tm) +{ + time_t seconds; + + switch (rtc_base_type) { + case RTC_BASE_DATETIME: + case RTC_BASE_UTC: + seconds = mktimegm(tm); + break; + case RTC_BASE_LOCALTIME: + { + struct tm tmp = *tm; + tmp.tm_isdst = -1; /* use timezone to figure it out */ + seconds = mktime(&tmp); + break; + } + default: + abort(); + } + + return seconds - qemu_ref_timedate(QEMU_CLOCK_HOST); +} + +static void configure_rtc_base_datetime(const char *startdate) +{ + time_t rtc_start_datetime; + struct tm tm; + + if (sscanf(startdate, "%d-%d-%dT%d:%d:%d", &tm.tm_year, &tm.tm_mon, + &tm.tm_mday, &tm.tm_hour, &tm.tm_min, &tm.tm_sec) == 6) { + /* OK */ + } else if (sscanf(startdate, "%d-%d-%d", + &tm.tm_year, &tm.tm_mon, &tm.tm_mday) == 3) { + tm.tm_hour = 0; + tm.tm_min = 0; + tm.tm_sec = 0; + } else { + goto date_fail; + } + tm.tm_year -= 1900; + tm.tm_mon--; + rtc_start_datetime = mktimegm(&tm); + if (rtc_start_datetime == -1) { + date_fail: + error_report("invalid datetime format"); + error_printf("valid formats: " + "'2006-06-17T16:01:21' or '2006-06-17'\n"); + exit(1); + } + rtc_host_datetime_offset = rtc_ref_start_datetime - rtc_start_datetime; + rtc_ref_start_datetime = rtc_start_datetime; +} + +void configure_rtc(QemuOpts *opts) +{ + const char *value; + + /* Set defaults */ + rtc_clock = QEMU_CLOCK_HOST; + rtc_ref_start_datetime = qemu_clock_get_ms(QEMU_CLOCK_HOST) / 1000; + rtc_realtime_clock_offset = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) / 1000; + + value = qemu_opt_get(opts, "base"); + if (value) { + if (!strcmp(value, "utc")) { + rtc_base_type = RTC_BASE_UTC; + } else if (!strcmp(value, "localtime")) { + rtc_base_type = RTC_BASE_LOCALTIME; + replay_add_blocker("-rtc base=localtime"); + } else { + rtc_base_type = RTC_BASE_DATETIME; + configure_rtc_base_datetime(value); + } + } + value = qemu_opt_get(opts, "clock"); + if (value) { + if (!strcmp(value, "host")) { + rtc_clock = QEMU_CLOCK_HOST; + } else if (!strcmp(value, "rt")) { + rtc_clock = QEMU_CLOCK_REALTIME; + } else if (!strcmp(value, "vm")) { + rtc_clock = QEMU_CLOCK_VIRTUAL; + } else { + error_report("invalid option value '%s'", value); + exit(1); + } + } + value = qemu_opt_get(opts, "driftfix"); + if (value) { + if (!strcmp(value, "slew")) { + object_register_sugar_prop(TYPE_MC146818_RTC, + "lost_tick_policy", + "slew", + false); + if (!object_class_by_name(TYPE_MC146818_RTC)) { + warn_report("driftfix 'slew' is not available with this machine"); + } + } else if (!strcmp(value, "none")) { + /* discard is default */ + } else { + error_report("invalid option value '%s'", value); + exit(1); + } + } +} diff --git a/system/runstate-action.c b/system/runstate-action.c new file mode 100644 index 0000000000..ae0761a9c3 --- /dev/null +++ b/system/runstate-action.c @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2020 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2. + * See the COPYING file in the top-level directory. + * + */ + +#include "qemu/osdep.h" +#include "sysemu/runstate-action.h" +#include "sysemu/watchdog.h" +#include "qemu/config-file.h" +#include "qapi/error.h" +#include "qemu/option_int.h" + +RebootAction reboot_action = REBOOT_ACTION_RESET; +ShutdownAction shutdown_action = SHUTDOWN_ACTION_POWEROFF; +PanicAction panic_action = PANIC_ACTION_SHUTDOWN; + +/* + * Receives actions to be applied for specific guest events + * and sets the internal state as requested. + */ +void qmp_set_action(bool has_reboot, RebootAction reboot, + bool has_shutdown, ShutdownAction shutdown, + bool has_panic, PanicAction panic, + bool has_watchdog, WatchdogAction watchdog, + Error **errp) +{ + if (has_reboot) { + reboot_action = reboot; + } + + if (has_panic) { + panic_action = panic; + } + + if (has_watchdog) { + qmp_watchdog_set_action(watchdog, errp); + } + + /* Process shutdown last, in case the panic action needs to be altered */ + if (has_shutdown) { + shutdown_action = shutdown; + } +} diff --git a/system/runstate-hmp-cmds.c b/system/runstate-hmp-cmds.c new file mode 100644 index 0000000000..2df670f0c0 --- /dev/null +++ b/system/runstate-hmp-cmds.c @@ -0,0 +1,95 @@ +/* + * HMP commands related to run state + * + * Copyright IBM, Corp. 2011 + * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Contributions after 2012-01-13 are licensed under the terms of the + * GNU GPL, version 2 or (at your option) any later version. + */ + +#include "qemu/osdep.h" +#include "exec/cpu-common.h" +#include "monitor/hmp.h" +#include "monitor/monitor.h" +#include "qapi/error.h" +#include "qapi/qapi-commands-run-state.h" +#include "qapi/qmp/qdict.h" +#include "qemu/accel.h" + +void hmp_info_status(Monitor *mon, const QDict *qdict) +{ + StatusInfo *info; + + info = qmp_query_status(NULL); + + monitor_printf(mon, "VM status: %s", + info->running ? "running" : "paused"); + + if (!info->running && info->status != RUN_STATE_PAUSED) { + monitor_printf(mon, " (%s)", RunState_str(info->status)); + } + + monitor_printf(mon, "\n"); + + qapi_free_StatusInfo(info); +} + +void hmp_one_insn_per_tb(Monitor *mon, const QDict *qdict) +{ + const char *option = qdict_get_try_str(qdict, "option"); + AccelState *accel = current_accel(); + bool newval; + + if (!object_property_find(OBJECT(accel), "one-insn-per-tb")) { + monitor_printf(mon, + "This accelerator does not support setting one-insn-per-tb\n"); + return; + } + + if (!option || !strcmp(option, "on")) { + newval = true; + } else if (!strcmp(option, "off")) { + newval = false; + } else { + monitor_printf(mon, "unexpected option %s\n", option); + return; + } + /* If the property exists then setting it can never fail */ + object_property_set_bool(OBJECT(accel), "one-insn-per-tb", + newval, &error_abort); +} + +void hmp_watchdog_action(Monitor *mon, const QDict *qdict) +{ + Error *err = NULL; + WatchdogAction action; + char *qapi_value; + + qapi_value = g_ascii_strdown(qdict_get_str(qdict, "action"), -1); + action = qapi_enum_parse(&WatchdogAction_lookup, qapi_value, -1, &err); + g_free(qapi_value); + if (err) { + hmp_handle_error(mon, err); + return; + } + qmp_watchdog_set_action(action, &error_abort); +} + +void watchdog_action_completion(ReadLineState *rs, int nb_args, const char *str) +{ + int i; + + if (nb_args != 2) { + return; + } + readline_set_completion_index(rs, strlen(str)); + for (i = 0; i < WATCHDOG_ACTION__MAX; i++) { + readline_add_completion_of(rs, str, WatchdogAction_str(i)); + } +} diff --git a/system/runstate.c b/system/runstate.c new file mode 100644 index 0000000000..1652ed0439 --- /dev/null +++ b/system/runstate.c @@ -0,0 +1,871 @@ +/* + * QEMU main system emulation loop + * + * Copyright (c) 2003-2020 QEMU contributors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "qemu/osdep.h" +#include "audio/audio.h" +#include "block/block.h" +#include "block/export.h" +#include "chardev/char.h" +#include "crypto/cipher.h" +#include "crypto/init.h" +#include "exec/cpu-common.h" +#include "gdbstub/syscalls.h" +#include "hw/boards.h" +#include "migration/misc.h" +#include "migration/postcopy-ram.h" +#include "monitor/monitor.h" +#include "net/net.h" +#include "net/vhost_net.h" +#include "qapi/error.h" +#include "qapi/qapi-commands-run-state.h" +#include "qapi/qapi-events-run-state.h" +#include "qemu/accel.h" +#include "qemu/error-report.h" +#include "qemu/job.h" +#include "qemu/log.h" +#include "qemu/module.h" +#include "qemu/plugin.h" +#include "qemu/sockets.h" +#include "qemu/timer.h" +#include "qemu/thread.h" +#include "qom/object.h" +#include "qom/object_interfaces.h" +#include "sysemu/cpus.h" +#include "sysemu/qtest.h" +#include "sysemu/replay.h" +#include "sysemu/reset.h" +#include "sysemu/runstate.h" +#include "sysemu/runstate-action.h" +#include "sysemu/sysemu.h" +#include "sysemu/tpm.h" +#include "trace.h" + +static NotifierList exit_notifiers = + NOTIFIER_LIST_INITIALIZER(exit_notifiers); + +static RunState current_run_state = RUN_STATE_PRELAUNCH; + +/* We use RUN_STATE__MAX but any invalid value will do */ +static RunState vmstop_requested = RUN_STATE__MAX; +static QemuMutex vmstop_lock; + +typedef struct { + RunState from; + RunState to; +} RunStateTransition; + +static const RunStateTransition runstate_transitions_def[] = { + { RUN_STATE_PRELAUNCH, RUN_STATE_INMIGRATE }, + + { RUN_STATE_DEBUG, RUN_STATE_RUNNING }, + { RUN_STATE_DEBUG, RUN_STATE_FINISH_MIGRATE }, + { RUN_STATE_DEBUG, RUN_STATE_PRELAUNCH }, + + { RUN_STATE_INMIGRATE, RUN_STATE_INTERNAL_ERROR }, + { RUN_STATE_INMIGRATE, RUN_STATE_IO_ERROR }, + { RUN_STATE_INMIGRATE, RUN_STATE_PAUSED }, + { RUN_STATE_INMIGRATE, RUN_STATE_RUNNING }, + { RUN_STATE_INMIGRATE, RUN_STATE_SHUTDOWN }, + { RUN_STATE_INMIGRATE, RUN_STATE_SUSPENDED }, + { RUN_STATE_INMIGRATE, RUN_STATE_WATCHDOG }, + { RUN_STATE_INMIGRATE, RUN_STATE_GUEST_PANICKED }, + { RUN_STATE_INMIGRATE, RUN_STATE_FINISH_MIGRATE }, + { RUN_STATE_INMIGRATE, RUN_STATE_PRELAUNCH }, + { RUN_STATE_INMIGRATE, RUN_STATE_POSTMIGRATE }, + { RUN_STATE_INMIGRATE, RUN_STATE_COLO }, + + { RUN_STATE_INTERNAL_ERROR, RUN_STATE_PAUSED }, + { RUN_STATE_INTERNAL_ERROR, RUN_STATE_FINISH_MIGRATE }, + { RUN_STATE_INTERNAL_ERROR, RUN_STATE_PRELAUNCH }, + + { RUN_STATE_IO_ERROR, RUN_STATE_RUNNING }, + { RUN_STATE_IO_ERROR, RUN_STATE_FINISH_MIGRATE }, + { RUN_STATE_IO_ERROR, RUN_STATE_PRELAUNCH }, + + { RUN_STATE_PAUSED, RUN_STATE_RUNNING }, + { RUN_STATE_PAUSED, RUN_STATE_FINISH_MIGRATE }, + { RUN_STATE_PAUSED, RUN_STATE_POSTMIGRATE }, + { RUN_STATE_PAUSED, RUN_STATE_PRELAUNCH }, + { RUN_STATE_PAUSED, RUN_STATE_COLO}, + + { RUN_STATE_POSTMIGRATE, RUN_STATE_RUNNING }, + { RUN_STATE_POSTMIGRATE, RUN_STATE_FINISH_MIGRATE }, + { RUN_STATE_POSTMIGRATE, RUN_STATE_PRELAUNCH }, + + { RUN_STATE_PRELAUNCH, RUN_STATE_RUNNING }, + { RUN_STATE_PRELAUNCH, RUN_STATE_FINISH_MIGRATE }, + { RUN_STATE_PRELAUNCH, RUN_STATE_INMIGRATE }, + + { RUN_STATE_FINISH_MIGRATE, RUN_STATE_RUNNING }, + { RUN_STATE_FINISH_MIGRATE, RUN_STATE_PAUSED }, + { RUN_STATE_FINISH_MIGRATE, RUN_STATE_POSTMIGRATE }, + { RUN_STATE_FINISH_MIGRATE, RUN_STATE_PRELAUNCH }, + { RUN_STATE_FINISH_MIGRATE, RUN_STATE_COLO }, + { RUN_STATE_FINISH_MIGRATE, RUN_STATE_INTERNAL_ERROR }, + { RUN_STATE_FINISH_MIGRATE, RUN_STATE_IO_ERROR }, + { RUN_STATE_FINISH_MIGRATE, RUN_STATE_SHUTDOWN }, + { RUN_STATE_FINISH_MIGRATE, RUN_STATE_SUSPENDED }, + { RUN_STATE_FINISH_MIGRATE, RUN_STATE_WATCHDOG }, + { RUN_STATE_FINISH_MIGRATE, RUN_STATE_GUEST_PANICKED }, + + { RUN_STATE_RESTORE_VM, RUN_STATE_RUNNING }, + { RUN_STATE_RESTORE_VM, RUN_STATE_PRELAUNCH }, + + { RUN_STATE_COLO, RUN_STATE_RUNNING }, + { RUN_STATE_COLO, RUN_STATE_PRELAUNCH }, + { RUN_STATE_COLO, RUN_STATE_SHUTDOWN}, + + { RUN_STATE_RUNNING, RUN_STATE_DEBUG }, + { RUN_STATE_RUNNING, RUN_STATE_INTERNAL_ERROR }, + { RUN_STATE_RUNNING, RUN_STATE_IO_ERROR }, + { RUN_STATE_RUNNING, RUN_STATE_PAUSED }, + { RUN_STATE_RUNNING, RUN_STATE_FINISH_MIGRATE }, + { RUN_STATE_RUNNING, RUN_STATE_RESTORE_VM }, + { RUN_STATE_RUNNING, RUN_STATE_SAVE_VM }, + { RUN_STATE_RUNNING, RUN_STATE_SHUTDOWN }, + { RUN_STATE_RUNNING, RUN_STATE_WATCHDOG }, + { RUN_STATE_RUNNING, RUN_STATE_GUEST_PANICKED }, + { RUN_STATE_RUNNING, RUN_STATE_COLO}, + + { RUN_STATE_SAVE_VM, RUN_STATE_RUNNING }, + + { RUN_STATE_SHUTDOWN, RUN_STATE_PAUSED }, + { RUN_STATE_SHUTDOWN, RUN_STATE_FINISH_MIGRATE }, + { RUN_STATE_SHUTDOWN, RUN_STATE_PRELAUNCH }, + { RUN_STATE_SHUTDOWN, RUN_STATE_COLO }, + + { RUN_STATE_DEBUG, RUN_STATE_SUSPENDED }, + { RUN_STATE_RUNNING, RUN_STATE_SUSPENDED }, + { RUN_STATE_SUSPENDED, RUN_STATE_RUNNING }, + { RUN_STATE_SUSPENDED, RUN_STATE_FINISH_MIGRATE }, + { RUN_STATE_SUSPENDED, RUN_STATE_PRELAUNCH }, + { RUN_STATE_SUSPENDED, RUN_STATE_COLO}, + + { RUN_STATE_WATCHDOG, RUN_STATE_RUNNING }, + { RUN_STATE_WATCHDOG, RUN_STATE_FINISH_MIGRATE }, + { RUN_STATE_WATCHDOG, RUN_STATE_PRELAUNCH }, + { RUN_STATE_WATCHDOG, RUN_STATE_COLO}, + + { RUN_STATE_GUEST_PANICKED, RUN_STATE_RUNNING }, + { RUN_STATE_GUEST_PANICKED, RUN_STATE_FINISH_MIGRATE }, + { RUN_STATE_GUEST_PANICKED, RUN_STATE_PRELAUNCH }, + + { RUN_STATE__MAX, RUN_STATE__MAX }, +}; + +static bool runstate_valid_transitions[RUN_STATE__MAX][RUN_STATE__MAX]; + +bool runstate_check(RunState state) +{ + return current_run_state == state; +} + +static void runstate_init(void) +{ + const RunStateTransition *p; + + memset(&runstate_valid_transitions, 0, sizeof(runstate_valid_transitions)); + for (p = &runstate_transitions_def[0]; p->from != RUN_STATE__MAX; p++) { + runstate_valid_transitions[p->from][p->to] = true; + } + + qemu_mutex_init(&vmstop_lock); +} + +/* This function will abort() on invalid state transitions */ +void runstate_set(RunState new_state) +{ + assert(new_state < RUN_STATE__MAX); + + trace_runstate_set(current_run_state, RunState_str(current_run_state), + new_state, RunState_str(new_state)); + + if (current_run_state == new_state) { + return; + } + + if (!runstate_valid_transitions[current_run_state][new_state]) { + error_report("invalid runstate transition: '%s' -> '%s'", + RunState_str(current_run_state), + RunState_str(new_state)); + abort(); + } + + current_run_state = new_state; +} + +RunState runstate_get(void) +{ + return current_run_state; +} + +bool runstate_is_running(void) +{ + return runstate_check(RUN_STATE_RUNNING); +} + +bool runstate_needs_reset(void) +{ + return runstate_check(RUN_STATE_INTERNAL_ERROR) || + runstate_check(RUN_STATE_SHUTDOWN); +} + +StatusInfo *qmp_query_status(Error **errp) +{ + StatusInfo *info = g_malloc0(sizeof(*info)); + AccelState *accel = current_accel(); + + /* + * We ignore errors, which will happen if the accelerator + * is not TCG. "singlestep" is meaningless for other accelerators, + * so we will set the StatusInfo field to false for those. + */ + info->singlestep = object_property_get_bool(OBJECT(accel), + "one-insn-per-tb", NULL); + info->running = runstate_is_running(); + info->status = current_run_state; + + return info; +} + +bool qemu_vmstop_requested(RunState *r) +{ + qemu_mutex_lock(&vmstop_lock); + *r = vmstop_requested; + vmstop_requested = RUN_STATE__MAX; + qemu_mutex_unlock(&vmstop_lock); + return *r < RUN_STATE__MAX; +} + +void qemu_system_vmstop_request_prepare(void) +{ + qemu_mutex_lock(&vmstop_lock); +} + +void qemu_system_vmstop_request(RunState state) +{ + vmstop_requested = state; + qemu_mutex_unlock(&vmstop_lock); + qemu_notify_event(); +} +struct VMChangeStateEntry { + VMChangeStateHandler *cb; + VMChangeStateHandler *prepare_cb; + void *opaque; + QTAILQ_ENTRY(VMChangeStateEntry) entries; + int priority; +}; + +static QTAILQ_HEAD(, VMChangeStateEntry) vm_change_state_head = + QTAILQ_HEAD_INITIALIZER(vm_change_state_head); + +/** + * qemu_add_vm_change_state_handler_prio: + * @cb: the callback to invoke + * @opaque: user data passed to the callback + * @priority: low priorities execute first when the vm runs and the reverse is + * true when the vm stops + * + * Register a callback function that is invoked when the vm starts or stops + * running. + * + * Returns: an entry to be freed using qemu_del_vm_change_state_handler() + */ +VMChangeStateEntry *qemu_add_vm_change_state_handler_prio( + VMChangeStateHandler *cb, void *opaque, int priority) +{ + return qemu_add_vm_change_state_handler_prio_full(cb, NULL, opaque, + priority); +} + +/** + * qemu_add_vm_change_state_handler_prio_full: + * @cb: the main callback to invoke + * @prepare_cb: a callback to invoke before the main callback + * @opaque: user data passed to the callbacks + * @priority: low priorities execute first when the vm runs and the reverse is + * true when the vm stops + * + * Register a main callback function and an optional prepare callback function + * that are invoked when the vm starts or stops running. The main callback and + * the prepare callback are called in two separate phases: First all prepare + * callbacks are called and only then all main callbacks are called. As its + * name suggests, the prepare callback can be used to do some preparatory work + * before invoking the main callback. + * + * Returns: an entry to be freed using qemu_del_vm_change_state_handler() + */ +VMChangeStateEntry * +qemu_add_vm_change_state_handler_prio_full(VMChangeStateHandler *cb, + VMChangeStateHandler *prepare_cb, + void *opaque, int priority) +{ + VMChangeStateEntry *e; + VMChangeStateEntry *other; + + e = g_malloc0(sizeof(*e)); + e->cb = cb; + e->prepare_cb = prepare_cb; + e->opaque = opaque; + e->priority = priority; + + /* Keep list sorted in ascending priority order */ + QTAILQ_FOREACH(other, &vm_change_state_head, entries) { + if (priority < other->priority) { + QTAILQ_INSERT_BEFORE(other, e, entries); + return e; + } + } + + QTAILQ_INSERT_TAIL(&vm_change_state_head, e, entries); + return e; +} + +VMChangeStateEntry *qemu_add_vm_change_state_handler(VMChangeStateHandler *cb, + void *opaque) +{ + return qemu_add_vm_change_state_handler_prio(cb, opaque, 0); +} + +void qemu_del_vm_change_state_handler(VMChangeStateEntry *e) +{ + QTAILQ_REMOVE(&vm_change_state_head, e, entries); + g_free(e); +} + +void vm_state_notify(bool running, RunState state) +{ + VMChangeStateEntry *e, *next; + + trace_vm_state_notify(running, state, RunState_str(state)); + + if (running) { + QTAILQ_FOREACH_SAFE(e, &vm_change_state_head, entries, next) { + if (e->prepare_cb) { + e->prepare_cb(e->opaque, running, state); + } + } + + QTAILQ_FOREACH_SAFE(e, &vm_change_state_head, entries, next) { + e->cb(e->opaque, running, state); + } + } else { + QTAILQ_FOREACH_REVERSE_SAFE(e, &vm_change_state_head, entries, next) { + if (e->prepare_cb) { + e->prepare_cb(e->opaque, running, state); + } + } + + QTAILQ_FOREACH_REVERSE_SAFE(e, &vm_change_state_head, entries, next) { + e->cb(e->opaque, running, state); + } + } +} + +static ShutdownCause reset_requested; +static ShutdownCause shutdown_requested; +static int shutdown_signal; +static pid_t shutdown_pid; +static int powerdown_requested; +static int debug_requested; +static int suspend_requested; +static WakeupReason wakeup_reason; +static NotifierList powerdown_notifiers = + NOTIFIER_LIST_INITIALIZER(powerdown_notifiers); +static NotifierList suspend_notifiers = + NOTIFIER_LIST_INITIALIZER(suspend_notifiers); +static NotifierList wakeup_notifiers = + NOTIFIER_LIST_INITIALIZER(wakeup_notifiers); +static NotifierList shutdown_notifiers = + NOTIFIER_LIST_INITIALIZER(shutdown_notifiers); +static uint32_t wakeup_reason_mask = ~(1 << QEMU_WAKEUP_REASON_NONE); + +ShutdownCause qemu_shutdown_requested_get(void) +{ + return shutdown_requested; +} + +ShutdownCause qemu_reset_requested_get(void) +{ + return reset_requested; +} + +static int qemu_shutdown_requested(void) +{ + return qatomic_xchg(&shutdown_requested, SHUTDOWN_CAUSE_NONE); +} + +static void qemu_kill_report(void) +{ + if (!qtest_driver() && shutdown_signal) { + if (shutdown_pid == 0) { + /* This happens for eg ^C at the terminal, so it's worth + * avoiding printing an odd message in that case. + */ + error_report("terminating on signal %d", shutdown_signal); + } else { + char *shutdown_cmd = qemu_get_pid_name(shutdown_pid); + + error_report("terminating on signal %d from pid " FMT_pid " (%s)", + shutdown_signal, shutdown_pid, + shutdown_cmd ? shutdown_cmd : "<unknown process>"); + g_free(shutdown_cmd); + } + shutdown_signal = 0; + } +} + +static ShutdownCause qemu_reset_requested(void) +{ + ShutdownCause r = reset_requested; + + if (r && replay_checkpoint(CHECKPOINT_RESET_REQUESTED)) { + reset_requested = SHUTDOWN_CAUSE_NONE; + return r; + } + return SHUTDOWN_CAUSE_NONE; +} + +static int qemu_suspend_requested(void) +{ + int r = suspend_requested; + if (r && replay_checkpoint(CHECKPOINT_SUSPEND_REQUESTED)) { + suspend_requested = 0; + return r; + } + return false; +} + +static WakeupReason qemu_wakeup_requested(void) +{ + return wakeup_reason; +} + +static int qemu_powerdown_requested(void) +{ + int r = powerdown_requested; + powerdown_requested = 0; + return r; +} + +static int qemu_debug_requested(void) +{ + int r = debug_requested; + debug_requested = 0; + return r; +} + +/* + * Reset the VM. Issue an event unless @reason is SHUTDOWN_CAUSE_NONE. + */ +void qemu_system_reset(ShutdownCause reason) +{ + MachineClass *mc; + + mc = current_machine ? MACHINE_GET_CLASS(current_machine) : NULL; + + cpu_synchronize_all_states(); + + if (mc && mc->reset) { + mc->reset(current_machine, reason); + } else { + qemu_devices_reset(reason); + } + switch (reason) { + case SHUTDOWN_CAUSE_NONE: + case SHUTDOWN_CAUSE_SUBSYSTEM_RESET: + case SHUTDOWN_CAUSE_SNAPSHOT_LOAD: + break; + default: + qapi_event_send_reset(shutdown_caused_by_guest(reason), reason); + } + cpu_synchronize_all_post_reset(); +} + +/* + * Wake the VM after suspend. + */ +static void qemu_system_wakeup(void) +{ + MachineClass *mc; + + mc = current_machine ? MACHINE_GET_CLASS(current_machine) : NULL; + + if (mc && mc->wakeup) { + mc->wakeup(current_machine); + } +} + +void qemu_system_guest_panicked(GuestPanicInformation *info) +{ + qemu_log_mask(LOG_GUEST_ERROR, "Guest crashed"); + + if (current_cpu) { + current_cpu->crash_occurred = true; + } + /* + * TODO: Currently the available panic actions are: none, pause, and + * shutdown, but in principle debug and reset could be supported as well. + * Investigate any potential use cases for the unimplemented actions. + */ + if (panic_action == PANIC_ACTION_PAUSE + || (panic_action == PANIC_ACTION_SHUTDOWN && shutdown_action == SHUTDOWN_ACTION_PAUSE)) { + qapi_event_send_guest_panicked(GUEST_PANIC_ACTION_PAUSE, info); + vm_stop(RUN_STATE_GUEST_PANICKED); + } else if (panic_action == PANIC_ACTION_SHUTDOWN || + panic_action == PANIC_ACTION_EXIT_FAILURE) { + qapi_event_send_guest_panicked(GUEST_PANIC_ACTION_POWEROFF, info); + vm_stop(RUN_STATE_GUEST_PANICKED); + qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_PANIC); + } else { + qapi_event_send_guest_panicked(GUEST_PANIC_ACTION_RUN, info); + } + + if (info) { + if (info->type == GUEST_PANIC_INFORMATION_TYPE_HYPER_V) { + qemu_log_mask(LOG_GUEST_ERROR, "\nHV crash parameters: (%#"PRIx64 + " %#"PRIx64" %#"PRIx64" %#"PRIx64" %#"PRIx64")\n", + info->u.hyper_v.arg1, + info->u.hyper_v.arg2, + info->u.hyper_v.arg3, + info->u.hyper_v.arg4, + info->u.hyper_v.arg5); + } else if (info->type == GUEST_PANIC_INFORMATION_TYPE_S390) { + qemu_log_mask(LOG_GUEST_ERROR, " on cpu %d: %s\n" + "PSW: 0x%016" PRIx64 " 0x%016" PRIx64"\n", + info->u.s390.core, + S390CrashReason_str(info->u.s390.reason), + info->u.s390.psw_mask, + info->u.s390.psw_addr); + } + qapi_free_GuestPanicInformation(info); + } +} + +void qemu_system_guest_crashloaded(GuestPanicInformation *info) +{ + qemu_log_mask(LOG_GUEST_ERROR, "Guest crash loaded"); + qapi_event_send_guest_crashloaded(GUEST_PANIC_ACTION_RUN, info); + qapi_free_GuestPanicInformation(info); +} + +void qemu_system_reset_request(ShutdownCause reason) +{ + if (reboot_action == REBOOT_ACTION_SHUTDOWN && + reason != SHUTDOWN_CAUSE_SUBSYSTEM_RESET) { + shutdown_requested = reason; + } else if (!cpus_are_resettable()) { + error_report("cpus are not resettable, terminating"); + shutdown_requested = reason; + } else { + reset_requested = reason; + } + cpu_stop_current(); + qemu_notify_event(); +} + +static void qemu_system_suspend(void) +{ + pause_all_vcpus(); + notifier_list_notify(&suspend_notifiers, NULL); + runstate_set(RUN_STATE_SUSPENDED); + qapi_event_send_suspend(); +} + +void qemu_system_suspend_request(void) +{ + if (runstate_check(RUN_STATE_SUSPENDED)) { + return; + } + suspend_requested = 1; + cpu_stop_current(); + qemu_notify_event(); +} + +void qemu_register_suspend_notifier(Notifier *notifier) +{ + notifier_list_add(&suspend_notifiers, notifier); +} + +void qemu_system_wakeup_request(WakeupReason reason, Error **errp) +{ + trace_system_wakeup_request(reason); + + if (!runstate_check(RUN_STATE_SUSPENDED)) { + error_setg(errp, + "Unable to wake up: guest is not in suspended state"); + return; + } + if (!(wakeup_reason_mask & (1 << reason))) { + return; + } + runstate_set(RUN_STATE_RUNNING); + wakeup_reason = reason; + qemu_notify_event(); +} + +void qemu_system_wakeup_enable(WakeupReason reason, bool enabled) +{ + if (enabled) { + wakeup_reason_mask |= (1 << reason); + } else { + wakeup_reason_mask &= ~(1 << reason); + } +} + +void qemu_register_wakeup_notifier(Notifier *notifier) +{ + notifier_list_add(&wakeup_notifiers, notifier); +} + +static bool wakeup_suspend_enabled; + +void qemu_register_wakeup_support(void) +{ + wakeup_suspend_enabled = true; +} + +bool qemu_wakeup_suspend_enabled(void) +{ + return wakeup_suspend_enabled; +} + +void qemu_system_killed(int signal, pid_t pid) +{ + shutdown_signal = signal; + shutdown_pid = pid; + shutdown_action = SHUTDOWN_ACTION_POWEROFF; + + /* Cannot call qemu_system_shutdown_request directly because + * we are in a signal handler. + */ + shutdown_requested = SHUTDOWN_CAUSE_HOST_SIGNAL; + qemu_notify_event(); +} + +void qemu_system_shutdown_request(ShutdownCause reason) +{ + trace_qemu_system_shutdown_request(reason); + replay_shutdown_request(reason); + shutdown_requested = reason; + qemu_notify_event(); +} + +static void qemu_system_powerdown(void) +{ + qapi_event_send_powerdown(); + notifier_list_notify(&powerdown_notifiers, NULL); +} + +static void qemu_system_shutdown(ShutdownCause cause) +{ + qapi_event_send_shutdown(shutdown_caused_by_guest(cause), cause); + notifier_list_notify(&shutdown_notifiers, &cause); +} + +void qemu_system_powerdown_request(void) +{ + trace_qemu_system_powerdown_request(); + powerdown_requested = 1; + qemu_notify_event(); +} + +void qemu_register_powerdown_notifier(Notifier *notifier) +{ + notifier_list_add(&powerdown_notifiers, notifier); +} + +void qemu_register_shutdown_notifier(Notifier *notifier) +{ + notifier_list_add(&shutdown_notifiers, notifier); +} + +void qemu_system_debug_request(void) +{ + debug_requested = 1; + qemu_notify_event(); +} + +static bool main_loop_should_exit(int *status) +{ + RunState r; + ShutdownCause request; + + if (qemu_debug_requested()) { + vm_stop(RUN_STATE_DEBUG); + } + if (qemu_suspend_requested()) { + qemu_system_suspend(); + } + request = qemu_shutdown_requested(); + if (request) { + qemu_kill_report(); + qemu_system_shutdown(request); + if (shutdown_action == SHUTDOWN_ACTION_PAUSE) { + vm_stop(RUN_STATE_SHUTDOWN); + } else { + if (request == SHUTDOWN_CAUSE_GUEST_PANIC && + panic_action == PANIC_ACTION_EXIT_FAILURE) { + *status = EXIT_FAILURE; + } + return true; + } + } + request = qemu_reset_requested(); + if (request) { + pause_all_vcpus(); + qemu_system_reset(request); + resume_all_vcpus(); + /* + * runstate can change in pause_all_vcpus() + * as iothread mutex is unlocked + */ + if (!runstate_check(RUN_STATE_RUNNING) && + !runstate_check(RUN_STATE_INMIGRATE) && + !runstate_check(RUN_STATE_FINISH_MIGRATE)) { + runstate_set(RUN_STATE_PRELAUNCH); + } + } + if (qemu_wakeup_requested()) { + pause_all_vcpus(); + qemu_system_wakeup(); + notifier_list_notify(&wakeup_notifiers, &wakeup_reason); + wakeup_reason = QEMU_WAKEUP_REASON_NONE; + resume_all_vcpus(); + qapi_event_send_wakeup(); + } + if (qemu_powerdown_requested()) { + qemu_system_powerdown(); + } + if (qemu_vmstop_requested(&r)) { + vm_stop(r); + } + return false; +} + +int qemu_main_loop(void) +{ + int status = EXIT_SUCCESS; + + while (!main_loop_should_exit(&status)) { + main_loop_wait(false); + } + + return status; +} + +void qemu_add_exit_notifier(Notifier *notify) +{ + notifier_list_add(&exit_notifiers, notify); +} + +void qemu_remove_exit_notifier(Notifier *notify) +{ + notifier_remove(notify); +} + +static void qemu_run_exit_notifiers(void) +{ + notifier_list_notify(&exit_notifiers, NULL); +} + +void qemu_init_subsystems(void) +{ + Error *err = NULL; + + os_set_line_buffering(); + + module_call_init(MODULE_INIT_TRACE); + + qemu_init_cpu_list(); + qemu_init_cpu_loop(); + qemu_mutex_lock_iothread(); + + atexit(qemu_run_exit_notifiers); + + module_call_init(MODULE_INIT_QOM); + module_call_init(MODULE_INIT_MIGRATION); + + runstate_init(); + precopy_infrastructure_init(); + postcopy_infrastructure_init(); + monitor_init_globals(); + + if (qcrypto_init(&err) < 0) { + error_reportf_err(err, "cannot initialize crypto: "); + exit(1); + } + + os_setup_early_signal_handling(); + + bdrv_init_with_whitelist(); + socket_init(); +} + + +void qemu_cleanup(void) +{ + gdb_exit(0); + + /* + * cleaning up the migration object cancels any existing migration + * try to do this early so that it also stops using devices. + */ + migration_shutdown(); + + /* + * Close the exports before draining the block layer. The export + * drivers may have coroutines yielding on it, so we need to clean + * them up before the drain, as otherwise they may be get stuck in + * blk_wait_while_drained(). + */ + blk_exp_close_all(); + + + /* No more vcpu or device emulation activity beyond this point */ + vm_shutdown(); + replay_finish(); + + /* + * We must cancel all block jobs while the block layer is drained, + * or cancelling will be affected by throttling and thus may block + * for an extended period of time. + * Begin the drained section after vm_shutdown() to avoid requests being + * stuck in the BlockBackend's request queue. + * We do not need to end this section, because we do not want any + * requests happening from here on anyway. + */ + bdrv_drain_all_begin(); + job_cancel_sync_all(); + bdrv_close_all(); + + /* vhost-user must be cleaned up before chardevs. */ + tpm_cleanup(); + net_cleanup(); + audio_cleanup(); + monitor_cleanup(); + qemu_chr_cleanup(); + user_creatable_cleanup(); + /* TODO: unref root container, check all devices are ok */ +} diff --git a/system/tpm-hmp-cmds.c b/system/tpm-hmp-cmds.c new file mode 100644 index 0000000000..9ed6ad6c4d --- /dev/null +++ b/system/tpm-hmp-cmds.c @@ -0,0 +1,65 @@ +/* + * HMP commands related to TPM + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * (at your option) any later version. + */ + +#include "qemu/osdep.h" +#include "qapi/qapi-commands-tpm.h" +#include "monitor/monitor.h" +#include "monitor/hmp.h" +#include "qapi/error.h" + +void hmp_info_tpm(Monitor *mon, const QDict *qdict) +{ +#ifdef CONFIG_TPM + TPMInfoList *info_list, *info; + Error *err = NULL; + unsigned int c = 0; + TPMPassthroughOptions *tpo; + TPMEmulatorOptions *teo; + + info_list = qmp_query_tpm(&err); + if (err) { + monitor_printf(mon, "TPM device not supported\n"); + error_free(err); + return; + } + + if (info_list) { + monitor_printf(mon, "TPM device:\n"); + } + + for (info = info_list; info; info = info->next) { + TPMInfo *ti = info->value; + monitor_printf(mon, " tpm%d: model=%s\n", + c, TpmModel_str(ti->model)); + + monitor_printf(mon, " \\ %s: type=%s", + ti->id, TpmType_str(ti->options->type)); + + switch (ti->options->type) { + case TPM_TYPE_PASSTHROUGH: + tpo = ti->options->u.passthrough.data; + monitor_printf(mon, "%s%s%s%s", + tpo->path ? ",path=" : "", + tpo->path ?: "", + tpo->cancel_path ? ",cancel-path=" : "", + tpo->cancel_path ?: ""); + break; + case TPM_TYPE_EMULATOR: + teo = ti->options->u.emulator.data; + monitor_printf(mon, ",chardev=%s", teo->chardev); + break; + case TPM_TYPE__MAX: + break; + } + monitor_printf(mon, "\n"); + c++; + } + qapi_free_TPMInfoList(info_list); +#else + monitor_printf(mon, "TPM device not supported\n"); +#endif /* CONFIG_TPM */ +} diff --git a/system/tpm.c b/system/tpm.c new file mode 100644 index 0000000000..578563f05a --- /dev/null +++ b/system/tpm.c @@ -0,0 +1,239 @@ +/* + * TPM configuration + * + * Copyright (C) 2011-2013 IBM Corporation + * + * Authors: + * Stefan Berger <stefanb@us.ibm.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + * Based on net.c + */ + +#include "qemu/osdep.h" + +#include "qapi/error.h" +#include "qapi/qapi-commands-tpm.h" +#include "qapi/qmp/qerror.h" +#include "sysemu/tpm_backend.h" +#include "sysemu/tpm.h" +#include "qemu/config-file.h" +#include "qemu/error-report.h" + +static QLIST_HEAD(, TPMBackend) tpm_backends = + QLIST_HEAD_INITIALIZER(tpm_backends); + +static const TPMBackendClass * +tpm_be_find_by_type(enum TpmType type) +{ + ObjectClass *oc; + char *typename = g_strdup_printf("tpm-%s", TpmType_str(type)); + + oc = object_class_by_name(typename); + g_free(typename); + + if (!object_class_dynamic_cast(oc, TYPE_TPM_BACKEND)) { + return NULL; + } + + return TPM_BACKEND_CLASS(oc); +} + +/* + * Walk the list of available TPM backend drivers and display them on the + * screen. + */ +static void tpm_display_backend_drivers(void) +{ + bool got_one = false; + int i; + + for (i = 0; i < TPM_TYPE__MAX; i++) { + const TPMBackendClass *bc = tpm_be_find_by_type(i); + if (!bc) { + continue; + } + if (!got_one) { + error_printf("Supported TPM types (choose only one):\n"); + got_one = true; + } + error_printf("%12s %s\n", TpmType_str(i), bc->desc); + } + if (!got_one) { + error_printf("No TPM backend types are available\n"); + } +} + +/* + * Find the TPM with the given Id + */ +TPMBackend *qemu_find_tpm_be(const char *id) +{ + TPMBackend *drv; + + if (id) { + QLIST_FOREACH(drv, &tpm_backends, list) { + if (!strcmp(drv->id, id)) { + return drv; + } + } + } + + return NULL; +} + +static int tpm_init_tpmdev(void *dummy, QemuOpts *opts, Error **errp) +{ + /* + * Use of error_report() in a function with an Error ** parameter + * is suspicious. It is okay here. The parameter only exists to + * make the function usable with qemu_opts_foreach(). It is not + * actually used. + */ + const char *value; + const char *id; + const TPMBackendClass *be; + TPMBackend *drv; + Error *local_err = NULL; + int i; + + if (!QLIST_EMPTY(&tpm_backends)) { + error_report("Only one TPM is allowed."); + return 1; + } + + id = qemu_opts_id(opts); + if (id == NULL) { + error_report(QERR_MISSING_PARAMETER, "id"); + return 1; + } + + value = qemu_opt_get(opts, "type"); + if (!value) { + error_report(QERR_MISSING_PARAMETER, "type"); + tpm_display_backend_drivers(); + return 1; + } + + i = qapi_enum_parse(&TpmType_lookup, value, -1, NULL); + be = i >= 0 ? tpm_be_find_by_type(i) : NULL; + if (be == NULL) { + error_report(QERR_INVALID_PARAMETER_VALUE, + "type", "a TPM backend type"); + tpm_display_backend_drivers(); + return 1; + } + + /* validate backend specific opts */ + if (!qemu_opts_validate(opts, be->opts, &local_err)) { + error_report_err(local_err); + return 1; + } + + drv = be->create(opts); + if (!drv) { + return 1; + } + + drv->id = g_strdup(id); + QLIST_INSERT_HEAD(&tpm_backends, drv, list); + + return 0; +} + +/* + * Walk the list of TPM backend drivers that are in use and call their + * destroy function to have them cleaned up. + */ +void tpm_cleanup(void) +{ + TPMBackend *drv, *next; + + QLIST_FOREACH_SAFE(drv, &tpm_backends, list, next) { + QLIST_REMOVE(drv, list); + object_unref(OBJECT(drv)); + } +} + +/* + * Initialize the TPM. Process the tpmdev command line options describing the + * TPM backend. + */ +int tpm_init(void) +{ + if (qemu_opts_foreach(qemu_find_opts("tpmdev"), + tpm_init_tpmdev, NULL, NULL)) { + return -1; + } + + return 0; +} + +/* + * Parse the TPM configuration options. + * To display all available TPM backends the user may use '-tpmdev help' + */ +int tpm_config_parse(QemuOptsList *opts_list, const char *optarg) +{ + QemuOpts *opts; + + if (!strcmp(optarg, "help")) { + tpm_display_backend_drivers(); + return -1; + } + opts = qemu_opts_parse_noisily(opts_list, optarg, true); + if (!opts) { + return -1; + } + return 0; +} + +/* + * Walk the list of active TPM backends and collect information about them. + */ +TPMInfoList *qmp_query_tpm(Error **errp) +{ + TPMBackend *drv; + TPMInfoList *head = NULL, **tail = &head; + + QLIST_FOREACH(drv, &tpm_backends, list) { + if (!drv->tpmif) { + continue; + } + + QAPI_LIST_APPEND(tail, tpm_backend_query_tpm(drv)); + } + + return head; +} + +TpmTypeList *qmp_query_tpm_types(Error **errp) +{ + unsigned int i = 0; + TpmTypeList *head = NULL, **tail = &head; + + for (i = 0; i < TPM_TYPE__MAX; i++) { + if (!tpm_be_find_by_type(i)) { + continue; + } + QAPI_LIST_APPEND(tail, i); + } + + return head; +} +TpmModelList *qmp_query_tpm_models(Error **errp) +{ + TpmModelList *head = NULL, **tail = &head; + GSList *e, *l = object_class_get_list(TYPE_TPM_IF, false); + + for (e = l; e; e = e->next) { + TPMIfClass *c = TPM_IF_CLASS(e->data); + + QAPI_LIST_APPEND(tail, c->model); + } + g_slist_free(l); + + return head; +} diff --git a/system/trace-events b/system/trace-events new file mode 100644 index 0000000000..69c9044151 --- /dev/null +++ b/system/trace-events @@ -0,0 +1,40 @@ +# See docs/devel/tracing.rst for syntax documentation. + +# balloon.c +# Since requests are raised via monitor, not many tracepoints are needed. +balloon_event(void *opaque, unsigned long addr) "opaque %p addr %lu" + +# ioport.c +cpu_in(unsigned int addr, char size, unsigned int val) "addr 0x%x(%c) value %u" +cpu_out(unsigned int addr, char size, unsigned int val) "addr 0x%x(%c) value %u" + +# memory.c +memory_region_ops_read(int cpu_index, void *mr, uint64_t addr, uint64_t value, unsigned size, const char *name) "cpu %d mr %p addr 0x%"PRIx64" value 0x%"PRIx64" size %u name '%s'" +memory_region_ops_write(int cpu_index, void *mr, uint64_t addr, uint64_t value, unsigned size, const char *name) "cpu %d mr %p addr 0x%"PRIx64" value 0x%"PRIx64" size %u name '%s'" +memory_region_subpage_read(int cpu_index, void *mr, uint64_t offset, uint64_t value, unsigned size) "cpu %d mr %p offset 0x%"PRIx64" value 0x%"PRIx64" size %u" +memory_region_subpage_write(int cpu_index, void *mr, uint64_t offset, uint64_t value, unsigned size) "cpu %d mr %p offset 0x%"PRIx64" value 0x%"PRIx64" size %u" +memory_region_ram_device_read(int cpu_index, void *mr, uint64_t addr, uint64_t value, unsigned size) "cpu %d mr %p addr 0x%"PRIx64" value 0x%"PRIx64" size %u" +memory_region_ram_device_write(int cpu_index, void *mr, uint64_t addr, uint64_t value, unsigned size) "cpu %d mr %p addr 0x%"PRIx64" value 0x%"PRIx64" size %u" +memory_region_sync_dirty(const char *mr, const char *listener, int global) "mr '%s' listener '%s' synced (global=%d)" +flatview_new(void *view, void *root) "%p (root %p)" +flatview_destroy(void *view, void *root) "%p (root %p)" +flatview_destroy_rcu(void *view, void *root) "%p (root %p)" +global_dirty_changed(unsigned int bitmask) "bitmask 0x%"PRIx32 + +# cpus.c +vm_stop_flush_all(int ret) "ret %d" + +# vl.c +vm_state_notify(int running, int reason, const char *reason_str) "running %d reason %d (%s)" +load_file(const char *name, const char *path) "name %s location %s" +runstate_set(int current_state, const char *current_state_str, int new_state, const char *new_state_str) "current_run_state %d (%s) new_state %d (%s)" +system_wakeup_request(int reason) "reason=%d" +qemu_system_shutdown_request(int reason) "reason=%d" +qemu_system_powerdown_request(void) "" + +#dirtylimit.c +dirtylimit_state_initialize(int max_cpus) "dirtylimit state initialize: max cpus %d" +dirtylimit_state_finalize(void) +dirtylimit_throttle_pct(int cpu_index, uint64_t pct, int64_t time_us) "CPU[%d] throttle percent: %" PRIu64 ", throttle adjust time %"PRIi64 " us" +dirtylimit_set_vcpu(int cpu_index, uint64_t quota) "CPU[%d] set dirty page rate limit %"PRIu64 +dirtylimit_vcpu_execute(int cpu_index, int64_t sleep_time_us) "CPU[%d] sleep %"PRIi64 " us" diff --git a/system/trace.h b/system/trace.h new file mode 100644 index 0000000000..cd0136dcdc --- /dev/null +++ b/system/trace.h @@ -0,0 +1 @@ +#include "trace/trace-system.h" diff --git a/system/vl.c b/system/vl.c new file mode 100644 index 0000000000..98e071e63b --- /dev/null +++ b/system/vl.c @@ -0,0 +1,3730 @@ +/* + * QEMU System Emulator + * + * Copyright (c) 2003-2008 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "qemu/osdep.h" +#include "qemu/help-texts.h" +#include "qemu/datadir.h" +#include "qemu/units.h" +#include "exec/cpu-common.h" +#include "exec/page-vary.h" +#include "hw/qdev-properties.h" +#include "qapi/compat-policy.h" +#include "qapi/error.h" +#include "qapi/qmp/qdict.h" +#include "qapi/qmp/qstring.h" +#include "qapi/qmp/qjson.h" +#include "qemu-version.h" +#include "qemu/cutils.h" +#include "qemu/help_option.h" +#include "qemu/hw-version.h" +#include "qemu/uuid.h" +#include "sysemu/reset.h" +#include "sysemu/runstate.h" +#include "sysemu/runstate-action.h" +#include "sysemu/seccomp.h" +#include "sysemu/tcg.h" +#include "sysemu/xen.h" + +#include "qemu/error-report.h" +#include "qemu/sockets.h" +#include "qemu/accel.h" +#include "qemu/async-teardown.h" +#include "hw/usb.h" +#include "hw/isa/isa.h" +#include "hw/scsi/scsi.h" +#include "hw/display/vga.h" +#include "hw/firmware/smbios.h" +#include "hw/acpi/acpi.h" +#include "hw/xen/xen.h" +#include "hw/loader.h" +#include "monitor/qdev.h" +#include "net/net.h" +#include "net/slirp.h" +#include "monitor/monitor.h" +#include "ui/console.h" +#include "ui/input.h" +#include "sysemu/sysemu.h" +#include "sysemu/numa.h" +#include "sysemu/hostmem.h" +#include "exec/gdbstub.h" +#include "qemu/timer.h" +#include "chardev/char.h" +#include "qemu/bitmap.h" +#include "qemu/log.h" +#include "sysemu/blockdev.h" +#include "hw/block/block.h" +#include "hw/i386/x86.h" +#include "hw/i386/pc.h" +#include "migration/misc.h" +#include "migration/snapshot.h" +#include "sysemu/tpm.h" +#include "sysemu/dma.h" +#include "hw/audio/soundhw.h" +#include "audio/audio.h" +#include "sysemu/cpus.h" +#include "sysemu/cpu-timers.h" +#include "migration/colo.h" +#include "migration/postcopy-ram.h" +#include "sysemu/kvm.h" +#include "qapi/qobject-input-visitor.h" +#include "qemu/option.h" +#include "qemu/config-file.h" +#include "qemu/main-loop.h" +#ifdef CONFIG_VIRTFS +#include "fsdev/qemu-fsdev.h" +#endif +#include "sysemu/qtest.h" +#ifdef CONFIG_TCG +#include "accel/tcg/perf.h" +#endif + +#include "disas/disas.h" + +#include "trace.h" +#include "trace/control.h" +#include "qemu/plugin.h" +#include "qemu/queue.h" +#include "sysemu/arch_init.h" +#include "exec/confidential-guest-support.h" + +#include "ui/qemu-spice.h" +#include "qapi/string-input-visitor.h" +#include "qapi/opts-visitor.h" +#include "qapi/clone-visitor.h" +#include "qom/object_interfaces.h" +#include "semihosting/semihost.h" +#include "crypto/init.h" +#include "sysemu/replay.h" +#include "qapi/qapi-events-run-state.h" +#include "qapi/qapi-types-audio.h" +#include "qapi/qapi-visit-audio.h" +#include "qapi/qapi-visit-block-core.h" +#include "qapi/qapi-visit-compat.h" +#include "qapi/qapi-visit-machine.h" +#include "qapi/qapi-visit-ui.h" +#include "qapi/qapi-commands-block-core.h" +#include "qapi/qapi-commands-migration.h" +#include "qapi/qapi-commands-misc.h" +#include "qapi/qapi-visit-qom.h" +#include "qapi/qapi-commands-ui.h" +#include "block/qdict.h" +#include "qapi/qmp/qerror.h" +#include "sysemu/iothread.h" +#include "qemu/guest-random.h" +#include "qemu/keyval.h" + +#define MAX_VIRTIO_CONSOLES 1 + +typedef struct BlockdevOptionsQueueEntry { + BlockdevOptions *bdo; + Location loc; + QSIMPLEQ_ENTRY(BlockdevOptionsQueueEntry) entry; +} BlockdevOptionsQueueEntry; + +typedef QSIMPLEQ_HEAD(, BlockdevOptionsQueueEntry) BlockdevOptionsQueue; + +typedef struct ObjectOption { + ObjectOptions *opts; + QTAILQ_ENTRY(ObjectOption) next; +} ObjectOption; + +typedef struct DeviceOption { + QDict *opts; + Location loc; + QTAILQ_ENTRY(DeviceOption) next; +} DeviceOption; + +static const char *cpu_option; +static const char *mem_path; +static const char *incoming; +static const char *loadvm; +static const char *accelerators; +static bool have_custom_ram_size; +static const char *ram_memdev_id; +static QDict *machine_opts_dict; +static QTAILQ_HEAD(, ObjectOption) object_opts = QTAILQ_HEAD_INITIALIZER(object_opts); +static QTAILQ_HEAD(, DeviceOption) device_opts = QTAILQ_HEAD_INITIALIZER(device_opts); +static int display_remote; +static int snapshot; +static bool preconfig_requested; +static QemuPluginList plugin_list = QTAILQ_HEAD_INITIALIZER(plugin_list); +static BlockdevOptionsQueue bdo_queue = QSIMPLEQ_HEAD_INITIALIZER(bdo_queue); +static bool nographic = false; +static int mem_prealloc; /* force preallocation of physical target memory */ +static const char *vga_model = NULL; +static DisplayOptions dpy; +static int num_serial_hds; +static Chardev **serial_hds; +static const char *log_mask; +static const char *log_file; +static bool list_data_dirs; +static const char *qtest_chrdev; +static const char *qtest_log; +static bool opt_one_insn_per_tb; + +static int has_defaults = 1; +static int default_serial = 1; +static int default_parallel = 1; +static int default_monitor = 1; +static int default_floppy = 1; +static int default_cdrom = 1; +static int default_sdcard = 1; +static int default_vga = 1; +static int default_net = 1; + +static struct { + const char *driver; + int *flag; +} default_list[] = { + { .driver = "isa-serial", .flag = &default_serial }, + { .driver = "isa-parallel", .flag = &default_parallel }, + { .driver = "isa-fdc", .flag = &default_floppy }, + { .driver = "floppy", .flag = &default_floppy }, + { .driver = "ide-cd", .flag = &default_cdrom }, + { .driver = "ide-hd", .flag = &default_cdrom }, + { .driver = "scsi-cd", .flag = &default_cdrom }, + { .driver = "scsi-hd", .flag = &default_cdrom }, + { .driver = "VGA", .flag = &default_vga }, + { .driver = "isa-vga", .flag = &default_vga }, + { .driver = "cirrus-vga", .flag = &default_vga }, + { .driver = "isa-cirrus-vga", .flag = &default_vga }, + { .driver = "vmware-svga", .flag = &default_vga }, + { .driver = "qxl-vga", .flag = &default_vga }, + { .driver = "virtio-vga", .flag = &default_vga }, + { .driver = "ati-vga", .flag = &default_vga }, + { .driver = "vhost-user-vga", .flag = &default_vga }, + { .driver = "virtio-vga-gl", .flag = &default_vga }, +}; + +static QemuOptsList qemu_rtc_opts = { + .name = "rtc", + .head = QTAILQ_HEAD_INITIALIZER(qemu_rtc_opts.head), + .merge_lists = true, + .desc = { + { + .name = "base", + .type = QEMU_OPT_STRING, + },{ + .name = "clock", + .type = QEMU_OPT_STRING, + },{ + .name = "driftfix", + .type = QEMU_OPT_STRING, + }, + { /* end of list */ } + }, +}; + +static QemuOptsList qemu_option_rom_opts = { + .name = "option-rom", + .implied_opt_name = "romfile", + .head = QTAILQ_HEAD_INITIALIZER(qemu_option_rom_opts.head), + .desc = { + { + .name = "bootindex", + .type = QEMU_OPT_NUMBER, + }, { + .name = "romfile", + .type = QEMU_OPT_STRING, + }, + { /* end of list */ } + }, +}; + +static QemuOptsList qemu_accel_opts = { + .name = "accel", + .implied_opt_name = "accel", + .head = QTAILQ_HEAD_INITIALIZER(qemu_accel_opts.head), + .desc = { + /* + * no elements => accept any + * sanity checking will happen later + * when setting accelerator properties + */ + { } + }, +}; + +static QemuOptsList qemu_boot_opts = { + .name = "boot-opts", + .implied_opt_name = "order", + .merge_lists = true, + .head = QTAILQ_HEAD_INITIALIZER(qemu_boot_opts.head), + .desc = { + { + .name = "order", + .type = QEMU_OPT_STRING, + }, { + .name = "once", + .type = QEMU_OPT_STRING, + }, { + .name = "menu", + .type = QEMU_OPT_BOOL, + }, { + .name = "splash", + .type = QEMU_OPT_STRING, + }, { + .name = "splash-time", + .type = QEMU_OPT_NUMBER, + }, { + .name = "reboot-timeout", + .type = QEMU_OPT_NUMBER, + }, { + .name = "strict", + .type = QEMU_OPT_BOOL, + }, + { /*End of list */ } + }, +}; + +static QemuOptsList qemu_add_fd_opts = { + .name = "add-fd", + .head = QTAILQ_HEAD_INITIALIZER(qemu_add_fd_opts.head), + .desc = { + { + .name = "fd", + .type = QEMU_OPT_NUMBER, + .help = "file descriptor of which a duplicate is added to fd set", + },{ + .name = "set", + .type = QEMU_OPT_NUMBER, + .help = "ID of the fd set to add fd to", + },{ + .name = "opaque", + .type = QEMU_OPT_STRING, + .help = "free-form string used to describe fd", + }, + { /* end of list */ } + }, +}; + +static QemuOptsList qemu_object_opts = { + .name = "object", + .implied_opt_name = "qom-type", + .head = QTAILQ_HEAD_INITIALIZER(qemu_object_opts.head), + .desc = { + { } + }, +}; + +static QemuOptsList qemu_tpmdev_opts = { + .name = "tpmdev", + .implied_opt_name = "type", + .head = QTAILQ_HEAD_INITIALIZER(qemu_tpmdev_opts.head), + .desc = { + /* options are defined in the TPM backends */ + { /* end of list */ } + }, +}; + +static QemuOptsList qemu_overcommit_opts = { + .name = "overcommit", + .head = QTAILQ_HEAD_INITIALIZER(qemu_overcommit_opts.head), + .desc = { + { + .name = "mem-lock", + .type = QEMU_OPT_BOOL, + }, + { + .name = "cpu-pm", + .type = QEMU_OPT_BOOL, + }, + { /* end of list */ } + }, +}; + +static QemuOptsList qemu_msg_opts = { + .name = "msg", + .head = QTAILQ_HEAD_INITIALIZER(qemu_msg_opts.head), + .desc = { + { + .name = "timestamp", + .type = QEMU_OPT_BOOL, + }, + { + .name = "guest-name", + .type = QEMU_OPT_BOOL, + .help = "Prepends guest name for error messages but only if " + "-name guest is set otherwise option is ignored\n", + }, + { /* end of list */ } + }, +}; + +static QemuOptsList qemu_name_opts = { + .name = "name", + .implied_opt_name = "guest", + .merge_lists = true, + .head = QTAILQ_HEAD_INITIALIZER(qemu_name_opts.head), + .desc = { + { + .name = "guest", + .type = QEMU_OPT_STRING, + .help = "Sets the name of the guest.\n" + "This name will be displayed in the SDL window caption.\n" + "The name will also be used for the VNC server", + }, { + .name = "process", + .type = QEMU_OPT_STRING, + .help = "Sets the name of the QEMU process, as shown in top etc", + }, { + .name = "debug-threads", + .type = QEMU_OPT_BOOL, + .help = "When enabled, name the individual threads; defaults off.\n" + "NOTE: The thread names are for debugging and not a\n" + "stable API.", + }, + { /* End of list */ } + }, +}; + +static QemuOptsList qemu_mem_opts = { + .name = "memory", + .implied_opt_name = "size", + .head = QTAILQ_HEAD_INITIALIZER(qemu_mem_opts.head), + .merge_lists = true, + .desc = { + { + .name = "size", + .type = QEMU_OPT_SIZE, + }, + { + .name = "slots", + .type = QEMU_OPT_NUMBER, + }, + { + .name = "maxmem", + .type = QEMU_OPT_SIZE, + }, + { /* end of list */ } + }, +}; + +static QemuOptsList qemu_icount_opts = { + .name = "icount", + .implied_opt_name = "shift", + .merge_lists = true, + .head = QTAILQ_HEAD_INITIALIZER(qemu_icount_opts.head), + .desc = { + { + .name = "shift", + .type = QEMU_OPT_STRING, + }, { + .name = "align", + .type = QEMU_OPT_BOOL, + }, { + .name = "sleep", + .type = QEMU_OPT_BOOL, + }, { + .name = "rr", + .type = QEMU_OPT_STRING, + }, { + .name = "rrfile", + .type = QEMU_OPT_STRING, + }, { + .name = "rrsnapshot", + .type = QEMU_OPT_STRING, + }, + { /* end of list */ } + }, +}; + +static QemuOptsList qemu_fw_cfg_opts = { + .name = "fw_cfg", + .implied_opt_name = "name", + .head = QTAILQ_HEAD_INITIALIZER(qemu_fw_cfg_opts.head), + .desc = { + { + .name = "name", + .type = QEMU_OPT_STRING, + .help = "Sets the fw_cfg name of the blob to be inserted", + }, { + .name = "file", + .type = QEMU_OPT_STRING, + .help = "Sets the name of the file from which " + "the fw_cfg blob will be loaded", + }, { + .name = "string", + .type = QEMU_OPT_STRING, + .help = "Sets content of the blob to be inserted from a string", + }, { + .name = "gen_id", + .type = QEMU_OPT_STRING, + .help = "Sets id of the object generating the fw_cfg blob " + "to be inserted", + }, + { /* end of list */ } + }, +}; + +static QemuOptsList qemu_action_opts = { + .name = "action", + .merge_lists = true, + .head = QTAILQ_HEAD_INITIALIZER(qemu_action_opts.head), + .desc = { + { + .name = "shutdown", + .type = QEMU_OPT_STRING, + },{ + .name = "reboot", + .type = QEMU_OPT_STRING, + },{ + .name = "panic", + .type = QEMU_OPT_STRING, + },{ + .name = "watchdog", + .type = QEMU_OPT_STRING, + }, + { /* end of list */ } + }, +}; + +const char *qemu_get_vm_name(void) +{ + return qemu_name; +} + +static void default_driver_disable(const char *driver) +{ + int i; + + if (!driver) { + return; + } + + for (i = 0; i < ARRAY_SIZE(default_list); i++) { + if (strcmp(default_list[i].driver, driver) != 0) + continue; + *(default_list[i].flag) = 0; + } +} + +static int default_driver_check(void *opaque, QemuOpts *opts, Error **errp) +{ + const char *driver = qemu_opt_get(opts, "driver"); + + default_driver_disable(driver); + return 0; +} + +static void default_driver_check_json(void) +{ + DeviceOption *opt; + + QTAILQ_FOREACH(opt, &device_opts, next) { + const char *driver = qdict_get_try_str(opt->opts, "driver"); + default_driver_disable(driver); + } +} + +static int parse_name(void *opaque, QemuOpts *opts, Error **errp) +{ + const char *proc_name; + + if (qemu_opt_get(opts, "debug-threads")) { + qemu_thread_naming(qemu_opt_get_bool(opts, "debug-threads", false)); + } + qemu_name = qemu_opt_get(opts, "guest"); + + proc_name = qemu_opt_get(opts, "process"); + if (proc_name) { + os_set_proc_name(proc_name); + } + + return 0; +} + +bool defaults_enabled(void) +{ + return has_defaults; +} + +#ifndef _WIN32 +static int parse_add_fd(void *opaque, QemuOpts *opts, Error **errp) +{ + int fd, dupfd, flags; + int64_t fdset_id; + const char *fd_opaque = NULL; + AddfdInfo *fdinfo; + + fd = qemu_opt_get_number(opts, "fd", -1); + fdset_id = qemu_opt_get_number(opts, "set", -1); + fd_opaque = qemu_opt_get(opts, "opaque"); + + if (fd < 0) { + error_setg(errp, "fd option is required and must be non-negative"); + return -1; + } + + if (fd <= STDERR_FILENO) { + error_setg(errp, "fd cannot be a standard I/O stream"); + return -1; + } + + /* + * All fds inherited across exec() necessarily have FD_CLOEXEC + * clear, while qemu sets FD_CLOEXEC on all other fds used internally. + */ + flags = fcntl(fd, F_GETFD); + if (flags == -1 || (flags & FD_CLOEXEC)) { + error_setg(errp, "fd is not valid or already in use"); + return -1; + } + + if (fdset_id < 0) { + error_setg(errp, "set option is required and must be non-negative"); + return -1; + } + +#ifdef F_DUPFD_CLOEXEC + dupfd = fcntl(fd, F_DUPFD_CLOEXEC, 0); +#else + dupfd = dup(fd); + if (dupfd != -1) { + qemu_set_cloexec(dupfd); + } +#endif + if (dupfd == -1) { + error_setg(errp, "error duplicating fd: %s", strerror(errno)); + return -1; + } + + /* add the duplicate fd, and optionally the opaque string, to the fd set */ + fdinfo = monitor_fdset_add_fd(dupfd, true, fdset_id, fd_opaque, + &error_abort); + g_free(fdinfo); + + return 0; +} + +static int cleanup_add_fd(void *opaque, QemuOpts *opts, Error **errp) +{ + int fd; + + fd = qemu_opt_get_number(opts, "fd", -1); + close(fd); + + return 0; +} +#endif + +/***********************************************************/ +/* QEMU Block devices */ + +#define HD_OPTS "media=disk" +#define CDROM_OPTS "media=cdrom" +#define FD_OPTS "" +#define PFLASH_OPTS "" +#define MTD_OPTS "" +#define SD_OPTS "" + +static int drive_init_func(void *opaque, QemuOpts *opts, Error **errp) +{ + BlockInterfaceType *block_default_type = opaque; + + return drive_new(opts, *block_default_type, errp) == NULL; +} + +static int drive_enable_snapshot(void *opaque, QemuOpts *opts, Error **errp) +{ + if (qemu_opt_get(opts, "snapshot") == NULL) { + qemu_opt_set(opts, "snapshot", "on", &error_abort); + } + return 0; +} + +static void default_drive(int enable, int snapshot, BlockInterfaceType type, + int index, const char *optstr) +{ + QemuOpts *opts; + DriveInfo *dinfo; + + if (!enable || drive_get_by_index(type, index)) { + return; + } + + opts = drive_add(type, index, NULL, optstr); + if (snapshot) { + drive_enable_snapshot(NULL, opts, NULL); + } + + dinfo = drive_new(opts, type, &error_abort); + dinfo->is_default = true; + +} + +static void configure_blockdev(BlockdevOptionsQueue *bdo_queue, + MachineClass *machine_class, int snapshot) +{ + /* + * If the currently selected machine wishes to override the + * units-per-bus property of its default HBA interface type, do so + * now. + */ + if (machine_class->units_per_default_bus) { + override_max_devs(machine_class->block_default_type, + machine_class->units_per_default_bus); + } + + /* open the virtual block devices */ + while (!QSIMPLEQ_EMPTY(bdo_queue)) { + BlockdevOptionsQueueEntry *bdo = QSIMPLEQ_FIRST(bdo_queue); + + QSIMPLEQ_REMOVE_HEAD(bdo_queue, entry); + loc_push_restore(&bdo->loc); + qmp_blockdev_add(bdo->bdo, &error_fatal); + loc_pop(&bdo->loc); + qapi_free_BlockdevOptions(bdo->bdo); + g_free(bdo); + } + if (snapshot) { + qemu_opts_foreach(qemu_find_opts("drive"), drive_enable_snapshot, + NULL, NULL); + } + if (qemu_opts_foreach(qemu_find_opts("drive"), drive_init_func, + &machine_class->block_default_type, &error_fatal)) { + /* We printed help */ + exit(0); + } + + default_drive(default_cdrom, snapshot, machine_class->block_default_type, 2, + CDROM_OPTS); + default_drive(default_floppy, snapshot, IF_FLOPPY, 0, FD_OPTS); + default_drive(default_sdcard, snapshot, IF_SD, 0, SD_OPTS); + +} + +static QemuOptsList qemu_smp_opts = { + .name = "smp-opts", + .implied_opt_name = "cpus", + .merge_lists = true, + .head = QTAILQ_HEAD_INITIALIZER(qemu_smp_opts.head), + .desc = { + { + .name = "cpus", + .type = QEMU_OPT_NUMBER, + }, { + .name = "sockets", + .type = QEMU_OPT_NUMBER, + }, { + .name = "dies", + .type = QEMU_OPT_NUMBER, + }, { + .name = "clusters", + .type = QEMU_OPT_NUMBER, + }, { + .name = "cores", + .type = QEMU_OPT_NUMBER, + }, { + .name = "threads", + .type = QEMU_OPT_NUMBER, + }, { + .name = "maxcpus", + .type = QEMU_OPT_NUMBER, + }, + { /*End of list */ } + }, +}; + +#if defined(CONFIG_POSIX) +static QemuOptsList qemu_run_with_opts = { + .name = "run-with", + .head = QTAILQ_HEAD_INITIALIZER(qemu_run_with_opts.head), + .desc = { +#if defined(CONFIG_LINUX) + { + .name = "async-teardown", + .type = QEMU_OPT_BOOL, + }, +#endif + { + .name = "chroot", + .type = QEMU_OPT_STRING, + }, + { /* end of list */ } + }, +}; + +#define qemu_add_run_with_opts() qemu_add_opts(&qemu_run_with_opts) + +#else + +#define qemu_add_run_with_opts() + +#endif /* CONFIG_POSIX */ + +static void realtime_init(void) +{ + if (enable_mlock) { + if (os_mlock() < 0) { + error_report("locking memory failed"); + exit(1); + } + } +} + + +static void configure_msg(QemuOpts *opts) +{ + message_with_timestamp = qemu_opt_get_bool(opts, "timestamp", false); + error_with_guestname = qemu_opt_get_bool(opts, "guest-name", false); +} + + +/***********************************************************/ +/* USB devices */ + +static int usb_device_add(const char *devname) +{ + USBDevice *dev = NULL; + + if (!machine_usb(current_machine)) { + return -1; + } + + dev = usbdevice_create(devname); + if (!dev) + return -1; + + return 0; +} + +static int usb_parse(const char *cmdline) +{ + int r; + r = usb_device_add(cmdline); + if (r < 0) { + error_report("could not add USB device '%s'", cmdline); + } + return r; +} + +/***********************************************************/ +/* machine registration */ + +static MachineClass *find_machine(const char *name, GSList *machines) +{ + GSList *el; + + for (el = machines; el; el = el->next) { + MachineClass *mc = el->data; + + if (!strcmp(mc->name, name) || !g_strcmp0(mc->alias, name)) { + return mc; + } + } + + return NULL; +} + +static MachineClass *find_default_machine(GSList *machines) +{ + GSList *el; + MachineClass *default_machineclass = NULL; + + for (el = machines; el; el = el->next) { + MachineClass *mc = el->data; + + if (mc->is_default) { + assert(default_machineclass == NULL && "Multiple default machines"); + default_machineclass = mc; + } + } + + return default_machineclass; +} + +static void version(void) +{ + printf("QEMU emulator version " QEMU_FULL_VERSION "\n" + QEMU_COPYRIGHT "\n"); +} + +static void help(int exitcode) +{ + version(); + printf("usage: %s [options] [disk_image]\n\n" + "'disk_image' is a raw hard disk image for IDE hard disk 0\n\n", + g_get_prgname()); + +#define DEF(option, opt_arg, opt_enum, opt_help, arch_mask) \ + if ((arch_mask) & arch_type) \ + fputs(opt_help, stdout); + +#define ARCHHEADING(text, arch_mask) \ + if ((arch_mask) & arch_type) \ + puts(stringify(text)); + +#define DEFHEADING(text) ARCHHEADING(text, QEMU_ARCH_ALL) + +#include "qemu-options.def" + + printf("\nDuring emulation, the following keys are useful:\n" + "ctrl-alt-f toggle full screen\n" + "ctrl-alt-n switch to virtual console 'n'\n" + "ctrl-alt toggle mouse and keyboard grab\n" + "\n" + "When using -nographic, press 'ctrl-a h' to get some help.\n" + "\n" + QEMU_HELP_BOTTOM "\n"); + + exit(exitcode); +} + +enum { + +#define DEF(option, opt_arg, opt_enum, opt_help, arch_mask) \ + opt_enum, +#define DEFHEADING(text) +#define ARCHHEADING(text, arch_mask) + +#include "qemu-options.def" +}; + +#define HAS_ARG 0x0001 + +typedef struct QEMUOption { + const char *name; + int flags; + int index; + uint32_t arch_mask; +} QEMUOption; + +static const QEMUOption qemu_options[] = { + { "h", 0, QEMU_OPTION_h, QEMU_ARCH_ALL }, + +#define DEF(option, opt_arg, opt_enum, opt_help, arch_mask) \ + { option, opt_arg, opt_enum, arch_mask }, +#define DEFHEADING(text) +#define ARCHHEADING(text, arch_mask) + +#include "qemu-options.def" + { /* end of list */ } +}; + +typedef struct VGAInterfaceInfo { + const char *opt_name; /* option name */ + const char *name; /* human-readable name */ + /* Class names indicating that support is available. + * If no class is specified, the interface is always available */ + const char *class_names[2]; +} VGAInterfaceInfo; + +static const VGAInterfaceInfo vga_interfaces[VGA_TYPE_MAX] = { + [VGA_NONE] = { + .opt_name = "none", + .name = "no graphic card", + }, + [VGA_STD] = { + .opt_name = "std", + .name = "standard VGA", + .class_names = { "VGA", "isa-vga" }, + }, + [VGA_CIRRUS] = { + .opt_name = "cirrus", + .name = "Cirrus VGA", + .class_names = { "cirrus-vga", "isa-cirrus-vga" }, + }, + [VGA_VMWARE] = { + .opt_name = "vmware", + .name = "VMWare SVGA", + .class_names = { "vmware-svga" }, + }, + [VGA_VIRTIO] = { + .opt_name = "virtio", + .name = "Virtio VGA", + .class_names = { "virtio-vga" }, + }, + [VGA_QXL] = { + .opt_name = "qxl", + .name = "QXL VGA", + .class_names = { "qxl-vga" }, + }, + [VGA_TCX] = { + .opt_name = "tcx", + .name = "TCX framebuffer", + .class_names = { "sun-tcx" }, + }, + [VGA_CG3] = { + .opt_name = "cg3", + .name = "CG3 framebuffer", + .class_names = { "cgthree" }, + }, +#ifdef CONFIG_XEN_BACKEND + [VGA_XENFB] = { + .opt_name = "xenfb", + .name = "Xen paravirtualized framebuffer", + }, +#endif +}; + +static bool vga_interface_available(VGAInterfaceType t) +{ + const VGAInterfaceInfo *ti = &vga_interfaces[t]; + + assert(t < VGA_TYPE_MAX); + return !ti->class_names[0] || + module_object_class_by_name(ti->class_names[0]) || + module_object_class_by_name(ti->class_names[1]); +} + +static const char * +get_default_vga_model(const MachineClass *machine_class) +{ + if (machine_class->default_display) { + for (int t = 0; t < VGA_TYPE_MAX; t++) { + const VGAInterfaceInfo *ti = &vga_interfaces[t]; + + if (ti->opt_name && vga_interface_available(t) && + g_str_equal(ti->opt_name, machine_class->default_display)) { + return machine_class->default_display; + } + } + + warn_report_once("Default display '%s' is not available in this binary", + machine_class->default_display); + return NULL; + } else if (vga_interface_available(VGA_CIRRUS)) { + return "cirrus"; + } else if (vga_interface_available(VGA_STD)) { + return "std"; + } + + return NULL; +} + +static void select_vgahw(const MachineClass *machine_class, const char *p) +{ + const char *opts; + int t; + + if (g_str_equal(p, "help")) { + const char *def = get_default_vga_model(machine_class); + + for (t = 0; t < VGA_TYPE_MAX; t++) { + const VGAInterfaceInfo *ti = &vga_interfaces[t]; + + if (vga_interface_available(t) && ti->opt_name) { + printf("%-20s %s%s\n", ti->opt_name, ti->name ?: "", + (def && g_str_equal(ti->opt_name, def)) ? + " (default)" : ""); + } + } + exit(0); + } + + assert(vga_interface_type == VGA_NONE); + for (t = 0; t < VGA_TYPE_MAX; t++) { + const VGAInterfaceInfo *ti = &vga_interfaces[t]; + if (ti->opt_name && strstart(p, ti->opt_name, &opts)) { + if (!vga_interface_available(t)) { + error_report("%s not available", ti->name); + exit(1); + } + vga_interface_type = t; + break; + } + } + if (t == VGA_TYPE_MAX) { + invalid_vga: + error_report("unknown vga type: %s", p); + exit(1); + } + while (*opts) { + const char *nextopt; + + if (strstart(opts, ",retrace=", &nextopt)) { + opts = nextopt; + if (strstart(opts, "dumb", &nextopt)) + vga_retrace_method = VGA_RETRACE_DUMB; + else if (strstart(opts, "precise", &nextopt)) + vga_retrace_method = VGA_RETRACE_PRECISE; + else goto invalid_vga; + } else goto invalid_vga; + opts = nextopt; + } +} + +static void parse_display_qapi(const char *optarg) +{ + DisplayOptions *opts; + Visitor *v; + + v = qobject_input_visitor_new_str(optarg, "type", &error_fatal); + + visit_type_DisplayOptions(v, NULL, &opts, &error_fatal); + QAPI_CLONE_MEMBERS(DisplayOptions, &dpy, opts); + + qapi_free_DisplayOptions(opts); + visit_free(v); +} + +DisplayOptions *qmp_query_display_options(Error **errp) +{ + return QAPI_CLONE(DisplayOptions, &dpy); +} + +static void parse_display(const char *p) +{ + const char *opts; + + if (is_help_option(p)) { + qemu_display_help(); + exit(0); + } + + if (strstart(p, "vnc", &opts)) { + /* + * vnc isn't a (local) DisplayType but a protocol for remote + * display access. + */ + if (*opts == '=') { + vnc_parse(opts + 1); + } else { + error_report("VNC requires a display argument vnc=<display>"); + exit(1); + } + } else { + parse_display_qapi(p); + } +} + +static inline bool nonempty_str(const char *str) +{ + return str && *str; +} + +static int parse_fw_cfg(void *opaque, QemuOpts *opts, Error **errp) +{ + gchar *buf; + size_t size; + const char *name, *file, *str, *gen_id; + FWCfgState *fw_cfg = (FWCfgState *) opaque; + + if (fw_cfg == NULL) { + error_setg(errp, "fw_cfg device not available"); + return -1; + } + name = qemu_opt_get(opts, "name"); + file = qemu_opt_get(opts, "file"); + str = qemu_opt_get(opts, "string"); + gen_id = qemu_opt_get(opts, "gen_id"); + + /* we need the name, and exactly one of: file, content string, gen_id */ + if (!nonempty_str(name) || + nonempty_str(file) + nonempty_str(str) + nonempty_str(gen_id) != 1) { + error_setg(errp, "name, plus exactly one of file," + " string and gen_id, are needed"); + return -1; + } + if (strlen(name) > FW_CFG_MAX_FILE_PATH - 1) { + error_setg(errp, "name too long (max. %d char)", + FW_CFG_MAX_FILE_PATH - 1); + return -1; + } + if (nonempty_str(gen_id)) { + /* + * In this particular case where the content is populated + * internally, the "etc/" namespace protection is relaxed, + * so do not emit a warning. + */ + } else if (strncmp(name, "opt/", 4) != 0) { + warn_report("externally provided fw_cfg item names " + "should be prefixed with \"opt/\""); + } + if (nonempty_str(str)) { + size = strlen(str); /* NUL terminator NOT included in fw_cfg blob */ + buf = g_memdup(str, size); + } else if (nonempty_str(gen_id)) { + if (!fw_cfg_add_from_generator(fw_cfg, name, gen_id, errp)) { + return -1; + } + return 0; + } else { + GError *err = NULL; + if (!g_file_get_contents(file, &buf, &size, &err)) { + error_setg(errp, "can't load %s: %s", file, err->message); + g_error_free(err); + return -1; + } + } + /* For legacy, keep user files in a specific global order. */ + fw_cfg_set_order_override(fw_cfg, FW_CFG_ORDER_OVERRIDE_USER); + fw_cfg_add_file(fw_cfg, name, buf, size); + fw_cfg_reset_order_override(fw_cfg); + return 0; +} + +static int device_help_func(void *opaque, QemuOpts *opts, Error **errp) +{ + return qdev_device_help(opts); +} + +static int device_init_func(void *opaque, QemuOpts *opts, Error **errp) +{ + DeviceState *dev; + + dev = qdev_device_add(opts, errp); + if (!dev && *errp) { + error_report_err(*errp); + return -1; + } else if (dev) { + object_unref(OBJECT(dev)); + } + return 0; +} + +static int chardev_init_func(void *opaque, QemuOpts *opts, Error **errp) +{ + Error *local_err = NULL; + + if (!qemu_chr_new_from_opts(opts, NULL, &local_err)) { + if (local_err) { + error_propagate(errp, local_err); + return -1; + } + exit(0); + } + return 0; +} + +#ifdef CONFIG_VIRTFS +static int fsdev_init_func(void *opaque, QemuOpts *opts, Error **errp) +{ + return qemu_fsdev_add(opts, errp); +} +#endif + +static int mon_init_func(void *opaque, QemuOpts *opts, Error **errp) +{ + return monitor_init_opts(opts, errp); +} + +static void monitor_parse(const char *optarg, const char *mode, bool pretty) +{ + static int monitor_device_index = 0; + QemuOpts *opts; + const char *p; + char label[32]; + + if (strstart(optarg, "chardev:", &p)) { + snprintf(label, sizeof(label), "%s", p); + } else { + snprintf(label, sizeof(label), "compat_monitor%d", + monitor_device_index); + opts = qemu_chr_parse_compat(label, optarg, true); + if (!opts) { + error_report("parse error: %s", optarg); + exit(1); + } + } + + opts = qemu_opts_create(qemu_find_opts("mon"), label, 1, &error_fatal); + qemu_opt_set(opts, "mode", mode, &error_abort); + qemu_opt_set(opts, "chardev", label, &error_abort); + if (!strcmp(mode, "control")) { + qemu_opt_set_bool(opts, "pretty", pretty, &error_abort); + } else { + assert(pretty == false); + } + monitor_device_index++; +} + +struct device_config { + enum { + DEV_USB, /* -usbdevice */ + DEV_SERIAL, /* -serial */ + DEV_PARALLEL, /* -parallel */ + DEV_DEBUGCON, /* -debugcon */ + DEV_GDB, /* -gdb, -s */ + DEV_SCLP, /* s390 sclp */ + } type; + const char *cmdline; + Location loc; + QTAILQ_ENTRY(device_config) next; +}; + +static QTAILQ_HEAD(, device_config) device_configs = + QTAILQ_HEAD_INITIALIZER(device_configs); + +static void add_device_config(int type, const char *cmdline) +{ + struct device_config *conf; + + conf = g_malloc0(sizeof(*conf)); + conf->type = type; + conf->cmdline = cmdline; + loc_save(&conf->loc); + QTAILQ_INSERT_TAIL(&device_configs, conf, next); +} + +static int foreach_device_config(int type, int (*func)(const char *cmdline)) +{ + struct device_config *conf; + int rc; + + QTAILQ_FOREACH(conf, &device_configs, next) { + if (conf->type != type) + continue; + loc_push_restore(&conf->loc); + rc = func(conf->cmdline); + loc_pop(&conf->loc); + if (rc) { + return rc; + } + } + return 0; +} + +static void qemu_disable_default_devices(void) +{ + MachineClass *machine_class = MACHINE_GET_CLASS(current_machine); + + default_driver_check_json(); + qemu_opts_foreach(qemu_find_opts("device"), + default_driver_check, NULL, NULL); + qemu_opts_foreach(qemu_find_opts("global"), + default_driver_check, NULL, NULL); + + if (!vga_model && !default_vga) { + vga_interface_type = VGA_DEVICE; + vga_interface_created = true; + } + if (!has_defaults || machine_class->no_serial) { + default_serial = 0; + } + if (!has_defaults || machine_class->no_parallel) { + default_parallel = 0; + } + if (!has_defaults || machine_class->no_floppy) { + default_floppy = 0; + } + if (!has_defaults || machine_class->no_cdrom) { + default_cdrom = 0; + } + if (!has_defaults || machine_class->no_sdcard) { + default_sdcard = 0; + } + if (!has_defaults) { + default_monitor = 0; + default_net = 0; + default_vga = 0; + } else { + if (default_net && machine_class->default_nic && + !module_object_class_by_name(machine_class->default_nic)) { + warn_report("Default NIC '%s' is not available in this binary", + machine_class->default_nic); + default_net = 0; + } + } +} + +static void qemu_create_default_devices(void) +{ + MachineClass *machine_class = MACHINE_GET_CLASS(current_machine); + + if (is_daemonized()) { + /* According to documentation and historically, -nographic redirects + * serial port, parallel port and monitor to stdio, which does not work + * with -daemonize. We can redirect these to null instead, but since + * -nographic is legacy, let's just error out. + * We disallow -nographic only if all other ports are not redirected + * explicitly, to not break existing legacy setups which uses + * -nographic _and_ redirects all ports explicitly - this is valid + * usage, -nographic is just a no-op in this case. + */ + if (nographic + && (default_parallel || default_serial || default_monitor)) { + error_report("-nographic cannot be used with -daemonize"); + exit(1); + } + } + + if (nographic) { + if (default_parallel) + add_device_config(DEV_PARALLEL, "null"); + if (default_serial && default_monitor) { + add_device_config(DEV_SERIAL, "mon:stdio"); + } else { + if (default_serial) + add_device_config(DEV_SERIAL, "stdio"); + if (default_monitor) + monitor_parse("stdio", "readline", false); + } + } else { + if (default_serial) + add_device_config(DEV_SERIAL, "vc:80Cx24C"); + if (default_parallel) + add_device_config(DEV_PARALLEL, "vc:80Cx24C"); + if (default_monitor) + monitor_parse("vc:80Cx24C", "readline", false); + } + + if (default_net) { + QemuOptsList *net = qemu_find_opts("net"); + qemu_opts_parse(net, "nic", true, &error_abort); +#ifdef CONFIG_SLIRP + qemu_opts_parse(net, "user", true, &error_abort); +#endif + } + +#if defined(CONFIG_VNC) + if (!QTAILQ_EMPTY(&(qemu_find_opts("vnc")->head))) { + display_remote++; + } +#endif + if (dpy.type == DISPLAY_TYPE_DEFAULT && !display_remote) { + if (!qemu_display_find_default(&dpy)) { + dpy.type = DISPLAY_TYPE_NONE; +#if defined(CONFIG_VNC) + vnc_parse("localhost:0,to=99,id=default"); +#endif + } + } + if (dpy.type == DISPLAY_TYPE_DEFAULT) { + dpy.type = DISPLAY_TYPE_NONE; + } + + /* If no default VGA is requested, the default is "none". */ + if (default_vga) { + vga_model = get_default_vga_model(machine_class); + } + if (vga_model) { + select_vgahw(machine_class, vga_model); + } +} + +static int serial_parse(const char *devname) +{ + int index = num_serial_hds; + char label[32]; + + if (strcmp(devname, "none") == 0) + return 0; + snprintf(label, sizeof(label), "serial%d", index); + serial_hds = g_renew(Chardev *, serial_hds, index + 1); + + serial_hds[index] = qemu_chr_new_mux_mon(label, devname, NULL); + if (!serial_hds[index]) { + error_report("could not connect serial device" + " to character backend '%s'", devname); + return -1; + } + num_serial_hds++; + return 0; +} + +Chardev *serial_hd(int i) +{ + assert(i >= 0); + if (i < num_serial_hds) { + return serial_hds[i]; + } + return NULL; +} + +static int parallel_parse(const char *devname) +{ + static int index = 0; + char label[32]; + + if (strcmp(devname, "none") == 0) + return 0; + if (index == MAX_PARALLEL_PORTS) { + error_report("too many parallel ports"); + exit(1); + } + snprintf(label, sizeof(label), "parallel%d", index); + parallel_hds[index] = qemu_chr_new_mux_mon(label, devname, NULL); + if (!parallel_hds[index]) { + error_report("could not connect parallel device" + " to character backend '%s'", devname); + return -1; + } + index++; + return 0; +} + +static int debugcon_parse(const char *devname) +{ + QemuOpts *opts; + + if (!qemu_chr_new_mux_mon("debugcon", devname, NULL)) { + error_report("invalid character backend '%s'", devname); + exit(1); + } + opts = qemu_opts_create(qemu_find_opts("device"), "debugcon", 1, NULL); + if (!opts) { + error_report("already have a debugcon device"); + exit(1); + } + qemu_opt_set(opts, "driver", "isa-debugcon", &error_abort); + qemu_opt_set(opts, "chardev", "debugcon", &error_abort); + return 0; +} + +static gint machine_class_cmp(gconstpointer a, gconstpointer b) +{ + const MachineClass *mc1 = a, *mc2 = b; + int res; + + if (mc1->family == NULL) { + if (mc2->family == NULL) { + /* Compare standalone machine types against each other; they sort + * in increasing order. + */ + return strcmp(object_class_get_name(OBJECT_CLASS(mc1)), + object_class_get_name(OBJECT_CLASS(mc2))); + } + + /* Standalone machine types sort after families. */ + return 1; + } + + if (mc2->family == NULL) { + /* Families sort before standalone machine types. */ + return -1; + } + + /* Families sort between each other alphabetically increasingly. */ + res = strcmp(mc1->family, mc2->family); + if (res != 0) { + return res; + } + + /* Within the same family, machine types sort in decreasing order. */ + return strcmp(object_class_get_name(OBJECT_CLASS(mc2)), + object_class_get_name(OBJECT_CLASS(mc1))); +} + +static void machine_help_func(const QDict *qdict) +{ + GSList *machines, *el; + const char *type = qdict_get_try_str(qdict, "type"); + + machines = object_class_get_list(TYPE_MACHINE, false); + if (type) { + ObjectClass *machine_class = OBJECT_CLASS(find_machine(type, machines)); + if (machine_class) { + type_print_class_properties(object_class_get_name(machine_class)); + return; + } + } + + printf("Supported machines are:\n"); + machines = g_slist_sort(machines, machine_class_cmp); + for (el = machines; el; el = el->next) { + MachineClass *mc = el->data; + if (mc->alias) { + printf("%-20s %s (alias of %s)\n", mc->alias, mc->desc, mc->name); + } + printf("%-20s %s%s%s\n", mc->name, mc->desc, + mc->is_default ? " (default)" : "", + mc->deprecation_reason ? " (deprecated)" : ""); + } +} + +static void +machine_merge_property(const char *propname, QDict *prop, Error **errp) +{ + QDict *opts; + + opts = qdict_new(); + /* Preserve the caller's reference to prop. */ + qobject_ref(prop); + qdict_put(opts, propname, prop); + keyval_merge(machine_opts_dict, opts, errp); + qobject_unref(opts); +} + +static void +machine_parse_property_opt(QemuOptsList *opts_list, const char *propname, + const char *arg) +{ + QDict *prop = NULL; + bool help = false; + + prop = keyval_parse(arg, opts_list->implied_opt_name, &help, &error_fatal); + if (help) { + qemu_opts_print_help(opts_list, true); + exit(0); + } + machine_merge_property(propname, prop, &error_fatal); + qobject_unref(prop); +} + +static const char *pid_file; +struct UnlinkPidfileNotifier { + Notifier notifier; + char *pid_file_realpath; +}; +static struct UnlinkPidfileNotifier qemu_unlink_pidfile_notifier; + +static void qemu_unlink_pidfile(Notifier *n, void *data) +{ + struct UnlinkPidfileNotifier *upn; + + upn = DO_UPCAST(struct UnlinkPidfileNotifier, notifier, n); + unlink(upn->pid_file_realpath); +} + +static const QEMUOption *lookup_opt(int argc, char **argv, + const char **poptarg, int *poptind) +{ + const QEMUOption *popt; + int optind = *poptind; + char *r = argv[optind]; + const char *optarg; + + loc_set_cmdline(argv, optind, 1); + optind++; + /* Treat --foo the same as -foo. */ + if (r[1] == '-') + r++; + popt = qemu_options; + for(;;) { + if (!popt->name) { + error_report("invalid option"); + exit(1); + } + if (!strcmp(popt->name, r + 1)) + break; + popt++; + } + if (popt->flags & HAS_ARG) { + if (optind >= argc) { + error_report("requires an argument"); + exit(1); + } + optarg = argv[optind++]; + loc_set_cmdline(argv, optind - 2, 2); + } else { + optarg = NULL; + } + + *poptarg = optarg; + *poptind = optind; + + return popt; +} + +static MachineClass *select_machine(QDict *qdict, Error **errp) +{ + const char *optarg = qdict_get_try_str(qdict, "type"); + GSList *machines = object_class_get_list(TYPE_MACHINE, false); + MachineClass *machine_class; + Error *local_err = NULL; + + if (optarg) { + machine_class = find_machine(optarg, machines); + qdict_del(qdict, "type"); + if (!machine_class) { + error_setg(&local_err, "unsupported machine type"); + } + } else { + machine_class = find_default_machine(machines); + if (!machine_class) { + error_setg(&local_err, "No machine specified, and there is no default"); + } + } + + g_slist_free(machines); + if (local_err) { + error_append_hint(&local_err, "Use -machine help to list supported machines\n"); + error_propagate(errp, local_err); + } + return machine_class; +} + +static int object_parse_property_opt(Object *obj, + const char *name, const char *value, + const char *skip, Error **errp) +{ + if (g_str_equal(name, skip)) { + return 0; + } + + if (!object_property_parse(obj, name, value, errp)) { + return -1; + } + + return 0; +} + +/* *Non*recursively replace underscores with dashes in QDict keys. */ +static void keyval_dashify(QDict *qdict, Error **errp) +{ + const QDictEntry *ent, *next; + char *p; + + for (ent = qdict_first(qdict); ent; ent = next) { + g_autofree char *new_key = NULL; + + next = qdict_next(qdict, ent); + if (!strchr(ent->key, '_')) { + continue; + } + new_key = g_strdup(ent->key); + for (p = new_key; *p; p++) { + if (*p == '_') { + *p = '-'; + } + } + if (qdict_haskey(qdict, new_key)) { + error_setg(errp, "Conflict between '%s' and '%s'", ent->key, new_key); + return; + } + qobject_ref(ent->value); + qdict_put_obj(qdict, new_key, ent->value); + qdict_del(qdict, ent->key); + } +} + +static void qemu_apply_legacy_machine_options(QDict *qdict) +{ + const char *value; + QObject *prop; + + keyval_dashify(qdict, &error_fatal); + + /* Legacy options do not correspond to MachineState properties. */ + value = qdict_get_try_str(qdict, "accel"); + if (value) { + accelerators = g_strdup(value); + qdict_del(qdict, "accel"); + } + + value = qdict_get_try_str(qdict, "igd-passthru"); + if (value) { + object_register_sugar_prop(ACCEL_CLASS_NAME("xen"), "igd-passthru", value, + false); + qdict_del(qdict, "igd-passthru"); + } + + value = qdict_get_try_str(qdict, "kvm-shadow-mem"); + if (value) { + object_register_sugar_prop(ACCEL_CLASS_NAME("kvm"), "kvm-shadow-mem", value, + false); + qdict_del(qdict, "kvm-shadow-mem"); + } + + value = qdict_get_try_str(qdict, "kernel-irqchip"); + if (value) { + object_register_sugar_prop(ACCEL_CLASS_NAME("kvm"), "kernel-irqchip", value, + false); + object_register_sugar_prop(ACCEL_CLASS_NAME("whpx"), "kernel-irqchip", value, + false); + qdict_del(qdict, "kernel-irqchip"); + } + + value = qdict_get_try_str(qdict, "memory-backend"); + if (value) { + if (mem_path) { + error_report("'-mem-path' can't be used together with" + "'-machine memory-backend'"); + exit(EXIT_FAILURE); + } + + /* Resolved later. */ + ram_memdev_id = g_strdup(value); + qdict_del(qdict, "memory-backend"); + } + + prop = qdict_get(qdict, "memory"); + if (prop) { + have_custom_ram_size = + qobject_type(prop) == QTYPE_QDICT && + qdict_haskey(qobject_to(QDict, prop), "size"); + } +} + +static void object_option_foreach_add(bool (*type_opt_predicate)(const char *)) +{ + ObjectOption *opt, *next; + + QTAILQ_FOREACH_SAFE(opt, &object_opts, next, next) { + const char *type = ObjectType_str(opt->opts->qom_type); + if (type_opt_predicate(type)) { + user_creatable_add_qapi(opt->opts, &error_fatal); + qapi_free_ObjectOptions(opt->opts); + QTAILQ_REMOVE(&object_opts, opt, next); + g_free(opt); + } + } +} + +static void object_option_add_visitor(Visitor *v) +{ + ObjectOption *opt = g_new0(ObjectOption, 1); + visit_type_ObjectOptions(v, NULL, &opt->opts, &error_fatal); + QTAILQ_INSERT_TAIL(&object_opts, opt, next); +} + +static void object_option_parse(const char *optarg) +{ + QemuOpts *opts; + const char *type; + Visitor *v; + + if (optarg[0] == '{') { + QObject *obj = qobject_from_json(optarg, &error_fatal); + + v = qobject_input_visitor_new(obj); + qobject_unref(obj); + } else { + opts = qemu_opts_parse_noisily(qemu_find_opts("object"), + optarg, true); + if (!opts) { + exit(1); + } + + type = qemu_opt_get(opts, "qom-type"); + if (!type) { + error_setg(&error_fatal, QERR_MISSING_PARAMETER, "qom-type"); + } + if (user_creatable_print_help(type, opts)) { + exit(0); + } + + v = opts_visitor_new(opts); + } + + object_option_add_visitor(v); + visit_free(v); +} + +/* + * Very early object creation, before the sandbox options have been activated. + */ +static bool object_create_pre_sandbox(const char *type) +{ + /* + * Objects should in general not get initialized "too early" without + * a reason. If you add one, state the reason in a comment! + */ + + /* + * Reason: -sandbox on,resourcecontrol=deny disallows setting CPU + * affinity of threads. + */ + if (g_str_equal(type, "thread-context")) { + return true; + } + + return false; +} + +/* + * Initial object creation happens before all other + * QEMU data types are created. The majority of objects + * can be created at this point. The rng-egd object + * cannot be created here, as it depends on the chardev + * already existing. + */ +static bool object_create_early(const char *type) +{ + /* + * Objects should not be made "delayed" without a reason. If you + * add one, state the reason in a comment! + */ + + /* Reason: already created. */ + if (object_create_pre_sandbox(type)) { + return false; + } + + /* Reason: property "chardev" */ + if (g_str_equal(type, "rng-egd") || + g_str_equal(type, "qtest")) { + return false; + } + +#if defined(CONFIG_VHOST_USER) && defined(CONFIG_LINUX) + /* Reason: cryptodev-vhost-user property "chardev" */ + if (g_str_equal(type, "cryptodev-vhost-user")) { + return false; + } +#endif + + /* Reason: vhost-user-blk-server property "node-name" */ + if (g_str_equal(type, "vhost-user-blk-server")) { + return false; + } + /* + * Reason: filter-* property "netdev" etc. + */ + if (g_str_equal(type, "filter-buffer") || + g_str_equal(type, "filter-dump") || + g_str_equal(type, "filter-mirror") || + g_str_equal(type, "filter-redirector") || + g_str_equal(type, "colo-compare") || + g_str_equal(type, "filter-rewriter") || + g_str_equal(type, "filter-replay")) { + return false; + } + + /* + * Allocation of large amounts of memory may delay + * chardev initialization for too long, and trigger timeouts + * on software that waits for a monitor socket to be created + * (e.g. libvirt). + */ + if (g_str_has_prefix(type, "memory-backend-")) { + return false; + } + + return true; +} + +static void qemu_apply_machine_options(QDict *qdict) +{ + object_set_properties_from_keyval(OBJECT(current_machine), qdict, false, &error_fatal); + + if (semihosting_enabled(false) && !semihosting_get_argc()) { + /* fall back to the -kernel/-append */ + semihosting_arg_fallback(current_machine->kernel_filename, current_machine->kernel_cmdline); + } + + if (current_machine->smp.cpus > 1) { + replay_add_blocker("smp"); + } +} + +static void qemu_create_early_backends(void) +{ + MachineClass *machine_class = MACHINE_GET_CLASS(current_machine); +#if defined(CONFIG_SDL) + const bool use_sdl = (dpy.type == DISPLAY_TYPE_SDL); +#else + const bool use_sdl = false; +#endif +#if defined(CONFIG_GTK) + const bool use_gtk = (dpy.type == DISPLAY_TYPE_GTK); +#else + const bool use_gtk = false; +#endif + + if (dpy.has_window_close && !use_gtk && !use_sdl) { + error_report("window-close is only valid for GTK and SDL, " + "ignoring option"); + } + + qemu_display_early_init(&dpy); + qemu_console_early_init(); + + if (dpy.has_gl && dpy.gl != DISPLAYGL_MODE_OFF && display_opengl == 0) { +#if defined(CONFIG_OPENGL) + error_report("OpenGL is not supported by the display"); +#else + error_report("OpenGL support is disabled"); +#endif + exit(1); + } + + object_option_foreach_add(object_create_early); + + /* spice needs the timers to be initialized by this point */ + /* spice must initialize before audio as it changes the default audiodev */ + /* spice must initialize before chardevs (for spicevmc and spiceport) */ + qemu_spice.init(); + + qemu_opts_foreach(qemu_find_opts("chardev"), + chardev_init_func, NULL, &error_fatal); + +#ifdef CONFIG_VIRTFS + qemu_opts_foreach(qemu_find_opts("fsdev"), + fsdev_init_func, NULL, &error_fatal); +#endif + + /* + * Note: we need to create audio and block backends before + * setting machine properties, so they can be referred to. + */ + configure_blockdev(&bdo_queue, machine_class, snapshot); + audio_init_audiodevs(); +} + + +/* + * The remainder of object creation happens after the + * creation of chardev, fsdev, net clients and device data types. + */ +static bool object_create_late(const char *type) +{ + return !object_create_early(type) && !object_create_pre_sandbox(type); +} + +static void qemu_create_late_backends(void) +{ + if (qtest_chrdev) { + qtest_server_init(qtest_chrdev, qtest_log, &error_fatal); + } + + net_init_clients(); + + object_option_foreach_add(object_create_late); + + if (tpm_init() < 0) { + exit(1); + } + + qemu_opts_foreach(qemu_find_opts("mon"), + mon_init_func, NULL, &error_fatal); + + if (foreach_device_config(DEV_SERIAL, serial_parse) < 0) + exit(1); + if (foreach_device_config(DEV_PARALLEL, parallel_parse) < 0) + exit(1); + if (foreach_device_config(DEV_DEBUGCON, debugcon_parse) < 0) + exit(1); + + /* now chardevs have been created we may have semihosting to connect */ + qemu_semihosting_chardev_init(); +} + +static void qemu_resolve_machine_memdev(void) +{ + if (ram_memdev_id) { + Object *backend; + ram_addr_t backend_size; + + backend = object_resolve_path_type(ram_memdev_id, + TYPE_MEMORY_BACKEND, NULL); + if (!backend) { + error_report("Memory backend '%s' not found", ram_memdev_id); + exit(EXIT_FAILURE); + } + if (!have_custom_ram_size) { + backend_size = object_property_get_uint(backend, "size", &error_abort); + current_machine->ram_size = backend_size; + } + object_property_set_link(OBJECT(current_machine), + "memory-backend", backend, &error_fatal); + } +} + +static void parse_memory_options(void) +{ + QemuOpts *opts = qemu_find_opts_singleton("memory"); + QDict *dict, *prop; + const char *mem_str; + Location loc; + + loc_push_none(&loc); + qemu_opts_loc_restore(opts); + + prop = qdict_new(); + + if (qemu_opt_get_size(opts, "size", 0) != 0) { + /* Fix up legacy suffix-less format */ + mem_str = qemu_opt_get(opts, "size"); + if (g_ascii_isdigit(mem_str[strlen(mem_str) - 1])) { + g_autofree char *mib_str = g_strdup_printf("%sM", mem_str); + qdict_put_str(prop, "size", mib_str); + } else { + qdict_put_str(prop, "size", mem_str); + } + } + + if (qemu_opt_get(opts, "maxmem")) { + qdict_put_str(prop, "max-size", qemu_opt_get(opts, "maxmem")); + } + if (qemu_opt_get(opts, "slots")) { + qdict_put_str(prop, "slots", qemu_opt_get(opts, "slots")); + } + + dict = qdict_new(); + qdict_put(dict, "memory", prop); + keyval_merge(machine_opts_dict, dict, &error_fatal); + qobject_unref(dict); + loc_pop(&loc); +} + +static void qemu_create_machine(QDict *qdict) +{ + MachineClass *machine_class = select_machine(qdict, &error_fatal); + object_set_machine_compat_props(machine_class->compat_props); + + current_machine = MACHINE(object_new_with_class(OBJECT_CLASS(machine_class))); + object_property_add_child(object_get_root(), "machine", + OBJECT(current_machine)); + object_property_add_child(container_get(OBJECT(current_machine), + "/unattached"), + "sysbus", OBJECT(sysbus_get_default())); + + if (machine_class->minimum_page_bits) { + if (!set_preferred_target_page_bits(machine_class->minimum_page_bits)) { + /* This would be a board error: specifying a minimum smaller than + * a target's compile-time fixed setting. + */ + g_assert_not_reached(); + } + } + + cpu_exec_init_all(); + page_size_init(); + + if (machine_class->hw_version) { + qemu_set_hw_version(machine_class->hw_version); + } + + /* + * Get the default machine options from the machine if it is not already + * specified either by the configuration file or by the command line. + */ + if (machine_class->default_machine_opts) { + QDict *default_opts = + keyval_parse(machine_class->default_machine_opts, NULL, NULL, + &error_abort); + qemu_apply_legacy_machine_options(default_opts); + object_set_properties_from_keyval(OBJECT(current_machine), default_opts, + false, &error_abort); + qobject_unref(default_opts); + } +} + +static int global_init_func(void *opaque, QemuOpts *opts, Error **errp) +{ + GlobalProperty *g; + + g = g_malloc0(sizeof(*g)); + g->driver = qemu_opt_get(opts, "driver"); + g->property = qemu_opt_get(opts, "property"); + g->value = qemu_opt_get(opts, "value"); + qdev_prop_register_global(g); + return 0; +} + +/* + * Return whether configuration group @group is stored in QemuOpts, or + * recorded as one or more QDicts by qemu_record_config_group. + */ +static bool is_qemuopts_group(const char *group) +{ + if (g_str_equal(group, "object") || + g_str_equal(group, "audiodev") || + g_str_equal(group, "machine") || + g_str_equal(group, "smp-opts") || + g_str_equal(group, "boot-opts")) { + return false; + } + return true; +} + +static void qemu_record_config_group(const char *group, QDict *dict, + bool from_json, Error **errp) +{ + if (g_str_equal(group, "object")) { + Visitor *v = qobject_input_visitor_new_keyval(QOBJECT(dict)); + object_option_add_visitor(v); + visit_free(v); + + } else if (g_str_equal(group, "audiodev")) { + Audiodev *dev = NULL; + Visitor *v = qobject_input_visitor_new_keyval(QOBJECT(dict)); + if (visit_type_Audiodev(v, NULL, &dev, errp)) { + audio_define(dev); + } + visit_free(v); + + } else if (g_str_equal(group, "machine")) { + /* + * Cannot merge string-valued and type-safe dictionaries, so JSON + * is not accepted yet for -M. + */ + assert(!from_json); + keyval_merge(machine_opts_dict, dict, errp); + } else if (g_str_equal(group, "smp-opts")) { + machine_merge_property("smp", dict, &error_fatal); + } else if (g_str_equal(group, "boot-opts")) { + machine_merge_property("boot", dict, &error_fatal); + } else { + abort(); + } +} + +/* + * Parse non-QemuOpts config file groups, pass the rest to + * qemu_config_do_parse. + */ +static void qemu_parse_config_group(const char *group, QDict *qdict, + void *opaque, Error **errp) +{ + QObject *crumpled; + if (is_qemuopts_group(group)) { + qemu_config_do_parse(group, qdict, opaque, errp); + return; + } + + crumpled = qdict_crumple(qdict, errp); + if (!crumpled) { + return; + } + switch (qobject_type(crumpled)) { + case QTYPE_QDICT: + qemu_record_config_group(group, qobject_to(QDict, crumpled), false, errp); + break; + case QTYPE_QLIST: + error_setg(errp, "Lists cannot be at top level of a configuration section"); + break; + default: + g_assert_not_reached(); + } + qobject_unref(crumpled); +} + +static void qemu_read_default_config_file(Error **errp) +{ + ERRP_GUARD(); + int ret; + g_autofree char *file = get_relocated_path(CONFIG_QEMU_CONFDIR "/qemu.conf"); + + ret = qemu_read_config_file(file, qemu_parse_config_group, errp); + if (ret < 0) { + if (ret == -ENOENT) { + error_free(*errp); + *errp = NULL; + } + } +} + +static void qemu_set_option(const char *str, Error **errp) +{ + char group[64], id[64], arg[64]; + QemuOptsList *list; + QemuOpts *opts; + int rc, offset; + + rc = sscanf(str, "%63[^.].%63[^.].%63[^=]%n", group, id, arg, &offset); + if (rc < 3 || str[offset] != '=') { + error_setg(errp, "can't parse: \"%s\"", str); + return; + } + + if (!is_qemuopts_group(group)) { + error_setg(errp, "-set is not supported with %s", group); + } else { + list = qemu_find_opts_err(group, errp); + if (list) { + opts = qemu_opts_find(list, id); + if (!opts) { + error_setg(errp, "there is no %s \"%s\" defined", group, id); + return; + } + qemu_opt_set(opts, arg, str + offset + 1, errp); + } + } +} + +static void user_register_global_props(void) +{ + qemu_opts_foreach(qemu_find_opts("global"), + global_init_func, NULL, NULL); +} + +static int do_configure_icount(void *opaque, QemuOpts *opts, Error **errp) +{ + icount_configure(opts, errp); + return 0; +} + +static int accelerator_set_property(void *opaque, + const char *name, const char *value, + Error **errp) +{ + return object_parse_property_opt(opaque, name, value, "accel", errp); +} + +static int do_configure_accelerator(void *opaque, QemuOpts *opts, Error **errp) +{ + bool *p_init_failed = opaque; + const char *acc = qemu_opt_get(opts, "accel"); + AccelClass *ac = accel_find(acc); + AccelState *accel; + int ret; + bool qtest_with_kvm; + + if (!acc) { + error_setg(errp, QERR_MISSING_PARAMETER, "accel"); + goto bad; + } + + qtest_with_kvm = g_str_equal(acc, "kvm") && qtest_chrdev != NULL; + + if (!ac) { + if (!qtest_with_kvm) { + error_report("invalid accelerator %s", acc); + } + goto bad; + } + accel = ACCEL(object_new_with_class(OBJECT_CLASS(ac))); + object_apply_compat_props(OBJECT(accel)); + qemu_opt_foreach(opts, accelerator_set_property, + accel, + &error_fatal); + /* + * If legacy -singlestep option is set, honour it for TCG and + * silently ignore for any other accelerator (which is how this + * option has always behaved). + */ + if (opt_one_insn_per_tb) { + /* + * This will always succeed for TCG, and we want to ignore + * the error from trying to set a nonexistent property + * on any other accelerator. + */ + object_property_set_bool(OBJECT(accel), "one-insn-per-tb", true, NULL); + } + ret = accel_init_machine(accel, current_machine); + if (ret < 0) { + if (!qtest_with_kvm || ret != -ENOENT) { + error_report("failed to initialize %s: %s", acc, strerror(-ret)); + } + goto bad; + } + + return 1; + +bad: + *p_init_failed = true; + return 0; +} + +static void configure_accelerators(const char *progname) +{ + bool init_failed = false; + + qemu_opts_foreach(qemu_find_opts("icount"), + do_configure_icount, NULL, &error_fatal); + + if (QTAILQ_EMPTY(&qemu_accel_opts.head)) { + char **accel_list, **tmp; + + if (accelerators == NULL) { + /* Select the default accelerator */ + bool have_tcg = accel_find("tcg"); + bool have_kvm = accel_find("kvm"); + + if (have_tcg && have_kvm) { + if (g_str_has_suffix(progname, "kvm")) { + /* If the program name ends with "kvm", we prefer KVM */ + accelerators = "kvm:tcg"; + } else { + accelerators = "tcg:kvm"; + } + } else if (have_kvm) { + accelerators = "kvm"; + } else if (have_tcg) { + accelerators = "tcg"; + } else { + error_report("No accelerator selected and" + " no default accelerator available"); + exit(1); + } + } + accel_list = g_strsplit(accelerators, ":", 0); + + for (tmp = accel_list; *tmp; tmp++) { + /* + * Filter invalid accelerators here, to prevent obscenities + * such as "-machine accel=tcg,,thread=single". + */ + if (accel_find(*tmp)) { + qemu_opts_parse_noisily(qemu_find_opts("accel"), *tmp, true); + } else { + init_failed = true; + error_report("invalid accelerator %s", *tmp); + } + } + g_strfreev(accel_list); + } else { + if (accelerators != NULL) { + error_report("The -accel and \"-machine accel=\" options are incompatible"); + exit(1); + } + } + + if (!qemu_opts_foreach(qemu_find_opts("accel"), + do_configure_accelerator, &init_failed, &error_fatal)) { + if (!init_failed) { + error_report("no accelerator found"); + } + exit(1); + } + + if (init_failed && !qtest_chrdev) { + error_report("falling back to %s", current_accel_name()); + } + + if (icount_enabled() && !tcg_enabled()) { + error_report("-icount is not allowed with hardware virtualization"); + exit(1); + } +} + +static void qemu_validate_options(const QDict *machine_opts) +{ + const char *kernel_filename = qdict_get_try_str(machine_opts, "kernel"); + const char *initrd_filename = qdict_get_try_str(machine_opts, "initrd"); + const char *kernel_cmdline = qdict_get_try_str(machine_opts, "append"); + + if (kernel_filename == NULL) { + if (kernel_cmdline != NULL) { + error_report("-append only allowed with -kernel option"); + exit(1); + } + + if (initrd_filename != NULL) { + error_report("-initrd only allowed with -kernel option"); + exit(1); + } + } + + if (loadvm && preconfig_requested) { + error_report("'preconfig' and 'loadvm' options are " + "mutually exclusive"); + exit(EXIT_FAILURE); + } + if (incoming && preconfig_requested && strcmp(incoming, "defer") != 0) { + error_report("'preconfig' supports '-incoming defer' only"); + exit(EXIT_FAILURE); + } + +#ifdef CONFIG_CURSES + if (is_daemonized() && dpy.type == DISPLAY_TYPE_CURSES) { + error_report("curses display cannot be used with -daemonize"); + exit(1); + } +#endif +} + +static void qemu_process_sugar_options(void) +{ + if (mem_prealloc) { + QObject *smp = qdict_get(machine_opts_dict, "smp"); + if (smp && qobject_type(smp) == QTYPE_QDICT) { + QObject *cpus = qdict_get(qobject_to(QDict, smp), "cpus"); + if (cpus && qobject_type(cpus) == QTYPE_QSTRING) { + const char *val = qstring_get_str(qobject_to(QString, cpus)); + object_register_sugar_prop("memory-backend", "prealloc-threads", + val, false); + } + } + object_register_sugar_prop("memory-backend", "prealloc", "on", false); + } +} + +/* -action processing */ + +/* + * Process all the -action parameters parsed from cmdline. + */ +static int process_runstate_actions(void *opaque, QemuOpts *opts, Error **errp) +{ + Error *local_err = NULL; + QDict *qdict = qemu_opts_to_qdict(opts, NULL); + QObject *ret = NULL; + qmp_marshal_set_action(qdict, &ret, &local_err); + qobject_unref(ret); + qobject_unref(qdict); + if (local_err) { + error_propagate(errp, local_err); + return 1; + } + return 0; +} + +static void qemu_process_early_options(void) +{ + qemu_opts_foreach(qemu_find_opts("name"), + parse_name, NULL, &error_fatal); + + object_option_foreach_add(object_create_pre_sandbox); + +#ifdef CONFIG_SECCOMP + QemuOptsList *olist = qemu_find_opts_err("sandbox", NULL); + if (olist) { + qemu_opts_foreach(olist, parse_sandbox, NULL, &error_fatal); + } +#endif + + if (qemu_opts_foreach(qemu_find_opts("action"), + process_runstate_actions, NULL, &error_fatal)) { + exit(1); + } + +#ifndef _WIN32 + qemu_opts_foreach(qemu_find_opts("add-fd"), + parse_add_fd, NULL, &error_fatal); + + qemu_opts_foreach(qemu_find_opts("add-fd"), + cleanup_add_fd, NULL, &error_fatal); +#endif + + /* Open the logfile at this point and set the log mask if necessary. */ + { + int mask = 0; + if (log_mask) { + mask = qemu_str_to_log_mask(log_mask); + if (!mask) { + qemu_print_log_usage(stdout); + exit(1); + } + } + qemu_set_log_filename_flags(log_file, mask, &error_fatal); + } + + qemu_add_default_firmwarepath(); +} + +static void qemu_process_help_options(void) +{ + /* + * Check for -cpu help and -device help before we call select_machine(), + * which will return an error if the architecture has no default machine + * type and the user did not specify one, so that the user doesn't need + * to say '-cpu help -machine something'. + */ + if (cpu_option && is_help_option(cpu_option)) { + list_cpus(); + exit(0); + } + + if (qemu_opts_foreach(qemu_find_opts("device"), + device_help_func, NULL, NULL)) { + exit(0); + } + + /* -L help lists the data directories and exits. */ + if (list_data_dirs) { + qemu_list_data_dirs(); + exit(0); + } +} + +static void qemu_maybe_daemonize(const char *pid_file) +{ + Error *err = NULL; + + os_daemonize(); + rcu_disable_atfork(); + + if (pid_file) { + char *pid_file_realpath = NULL; + + if (!qemu_write_pidfile(pid_file, &err)) { + error_reportf_err(err, "cannot create PID file: "); + exit(1); + } + + pid_file_realpath = g_malloc0(PATH_MAX); + if (!realpath(pid_file, pid_file_realpath)) { + if (errno != ENOENT) { + warn_report("not removing PID file on exit: cannot resolve PID " + "file path: %s: %s", pid_file, strerror(errno)); + } + return; + } + + qemu_unlink_pidfile_notifier = (struct UnlinkPidfileNotifier) { + .notifier = { + .notify = qemu_unlink_pidfile, + }, + .pid_file_realpath = pid_file_realpath, + }; + qemu_add_exit_notifier(&qemu_unlink_pidfile_notifier.notifier); + } +} + +static void qemu_init_displays(void) +{ + DisplayState *ds; + + /* init local displays */ + ds = init_displaystate(); + qemu_display_init(ds, &dpy); + + /* must be after terminal init, SDL library changes signal handlers */ + os_setup_signal_handling(); + + /* init remote displays */ +#ifdef CONFIG_VNC + qemu_opts_foreach(qemu_find_opts("vnc"), + vnc_init_func, NULL, &error_fatal); +#endif + + if (using_spice) { + qemu_spice.display_init(); + } +} + +static void qemu_init_board(void) +{ + /* process plugin before CPUs are created, but once -smp has been parsed */ + qemu_plugin_load_list(&plugin_list, &error_fatal); + + /* From here on we enter MACHINE_PHASE_INITIALIZED. */ + machine_run_board_init(current_machine, mem_path, &error_fatal); + + drive_check_orphaned(); + + realtime_init(); +} + +static void qemu_create_cli_devices(void) +{ + DeviceOption *opt; + + soundhw_init(); + + qemu_opts_foreach(qemu_find_opts("fw_cfg"), + parse_fw_cfg, fw_cfg_find(), &error_fatal); + + /* init USB devices */ + if (machine_usb(current_machine)) { + if (foreach_device_config(DEV_USB, usb_parse) < 0) + exit(1); + } + + /* init generic devices */ + rom_set_order_override(FW_CFG_ORDER_OVERRIDE_DEVICE); + qemu_opts_foreach(qemu_find_opts("device"), + device_init_func, NULL, &error_fatal); + QTAILQ_FOREACH(opt, &device_opts, next) { + DeviceState *dev; + loc_push_restore(&opt->loc); + /* + * TODO Eventually we should call qmp_device_add() here to make sure it + * behaves the same, but QMP still has to accept incorrectly typed + * options until libvirt is fixed and we want to be strict on the CLI + * from the start, so call qdev_device_add_from_qdict() directly for + * now. + */ + dev = qdev_device_add_from_qdict(opt->opts, true, &error_fatal); + object_unref(OBJECT(dev)); + loc_pop(&opt->loc); + } + rom_reset_order_override(); +} + +static void qemu_machine_creation_done(void) +{ + MachineState *machine = MACHINE(qdev_get_machine()); + + /* Did we create any drives that we failed to create a device for? */ + drive_check_orphaned(); + + /* Don't warn about the default network setup that you get if + * no command line -net or -netdev options are specified. There + * are two cases that we would otherwise complain about: + * (1) board doesn't support a NIC but the implicit "-net nic" + * requested one + * (2) CONFIG_SLIRP not set, in which case the implicit "-net nic" + * sets up a nic that isn't connected to anything. + */ + if (!default_net && (!qtest_enabled() || has_defaults)) { + net_check_clients(); + } + + qdev_prop_check_globals(); + + qdev_machine_creation_done(); + + if (machine->cgs) { + /* + * Verify that Confidential Guest Support has actually been initialized + */ + assert(machine->cgs->ready); + } + + if (foreach_device_config(DEV_GDB, gdbserver_start) < 0) { + exit(1); + } + if (!vga_interface_created && !default_vga && + vga_interface_type != VGA_NONE) { + warn_report("A -vga option was passed but this machine " + "type does not use that option; " + "No VGA device has been created"); + } +} + +void qmp_x_exit_preconfig(Error **errp) +{ + if (phase_check(PHASE_MACHINE_INITIALIZED)) { + error_setg(errp, "The command is permitted only before machine initialization"); + return; + } + + qemu_init_board(); + qemu_create_cli_devices(); + qemu_machine_creation_done(); + + if (loadvm) { + load_snapshot(loadvm, NULL, false, NULL, &error_fatal); + } + if (replay_mode != REPLAY_MODE_NONE) { + replay_vmstate_init(); + } + + if (incoming) { + Error *local_err = NULL; + if (strcmp(incoming, "defer") != 0) { + qmp_migrate_incoming(incoming, &local_err); + if (local_err) { + error_reportf_err(local_err, "-incoming %s: ", incoming); + exit(1); + } + } + } else if (autostart) { + qmp_cont(NULL); + } +} + +void qemu_init(int argc, char **argv) +{ + QemuOpts *opts; + QemuOpts *icount_opts = NULL, *accel_opts = NULL; + QemuOptsList *olist; + int optind; + const char *optarg; + MachineClass *machine_class; + bool userconfig = true; + FILE *vmstate_dump_file = NULL; + + qemu_add_opts(&qemu_drive_opts); + qemu_add_drive_opts(&qemu_legacy_drive_opts); + qemu_add_drive_opts(&qemu_common_drive_opts); + qemu_add_drive_opts(&qemu_drive_opts); + qemu_add_drive_opts(&bdrv_runtime_opts); + qemu_add_opts(&qemu_chardev_opts); + qemu_add_opts(&qemu_device_opts); + qemu_add_opts(&qemu_netdev_opts); + qemu_add_opts(&qemu_nic_opts); + qemu_add_opts(&qemu_net_opts); + qemu_add_opts(&qemu_rtc_opts); + qemu_add_opts(&qemu_global_opts); + qemu_add_opts(&qemu_mon_opts); + qemu_add_opts(&qemu_trace_opts); + qemu_plugin_add_opts(); + qemu_add_opts(&qemu_option_rom_opts); + qemu_add_opts(&qemu_accel_opts); + qemu_add_opts(&qemu_mem_opts); + qemu_add_opts(&qemu_smp_opts); + qemu_add_opts(&qemu_boot_opts); + qemu_add_opts(&qemu_add_fd_opts); + qemu_add_opts(&qemu_object_opts); + qemu_add_opts(&qemu_tpmdev_opts); + qemu_add_opts(&qemu_overcommit_opts); + qemu_add_opts(&qemu_msg_opts); + qemu_add_opts(&qemu_name_opts); + qemu_add_opts(&qemu_numa_opts); + qemu_add_opts(&qemu_icount_opts); + qemu_add_opts(&qemu_semihosting_config_opts); + qemu_add_opts(&qemu_fw_cfg_opts); + qemu_add_opts(&qemu_action_opts); + qemu_add_run_with_opts(); + module_call_init(MODULE_INIT_OPTS); + + error_init(argv[0]); + qemu_init_exec_dir(argv[0]); + + qemu_init_arch_modules(); + + qemu_init_subsystems(); + + /* first pass of option parsing */ + optind = 1; + while (optind < argc) { + if (argv[optind][0] != '-') { + /* disk image */ + optind++; + } else { + const QEMUOption *popt; + + popt = lookup_opt(argc, argv, &optarg, &optind); + switch (popt->index) { + case QEMU_OPTION_nouserconfig: + userconfig = false; + break; + } + } + } + + machine_opts_dict = qdict_new(); + if (userconfig) { + qemu_read_default_config_file(&error_fatal); + } + + /* second pass of option parsing */ + optind = 1; + for(;;) { + if (optind >= argc) + break; + if (argv[optind][0] != '-') { + loc_set_cmdline(argv, optind, 1); + drive_add(IF_DEFAULT, 0, argv[optind++], HD_OPTS); + } else { + const QEMUOption *popt; + + popt = lookup_opt(argc, argv, &optarg, &optind); + if (!(popt->arch_mask & arch_type)) { + error_report("Option not supported for this target"); + exit(1); + } + switch(popt->index) { + case QEMU_OPTION_cpu: + /* hw initialization will check this */ + cpu_option = optarg; + break; + case QEMU_OPTION_hda: + case QEMU_OPTION_hdb: + case QEMU_OPTION_hdc: + case QEMU_OPTION_hdd: + drive_add(IF_DEFAULT, popt->index - QEMU_OPTION_hda, optarg, + HD_OPTS); + break; + case QEMU_OPTION_blockdev: + { + Visitor *v; + BlockdevOptionsQueueEntry *bdo; + + v = qobject_input_visitor_new_str(optarg, "driver", + &error_fatal); + + bdo = g_new(BlockdevOptionsQueueEntry, 1); + visit_type_BlockdevOptions(v, NULL, &bdo->bdo, + &error_fatal); + visit_free(v); + loc_save(&bdo->loc); + QSIMPLEQ_INSERT_TAIL(&bdo_queue, bdo, entry); + break; + } + case QEMU_OPTION_drive: + opts = qemu_opts_parse_noisily(qemu_find_opts("drive"), + optarg, false); + if (opts == NULL) { + exit(1); + } + break; + case QEMU_OPTION_set: + qemu_set_option(optarg, &error_fatal); + break; + case QEMU_OPTION_global: + if (qemu_global_option(optarg) != 0) + exit(1); + break; + case QEMU_OPTION_mtdblock: + drive_add(IF_MTD, -1, optarg, MTD_OPTS); + break; + case QEMU_OPTION_sd: + drive_add(IF_SD, -1, optarg, SD_OPTS); + break; + case QEMU_OPTION_pflash: + drive_add(IF_PFLASH, -1, optarg, PFLASH_OPTS); + break; + case QEMU_OPTION_snapshot: + snapshot = 1; + replay_add_blocker("-snapshot"); + break; + case QEMU_OPTION_numa: + opts = qemu_opts_parse_noisily(qemu_find_opts("numa"), + optarg, true); + if (!opts) { + exit(1); + } + break; + case QEMU_OPTION_display: + parse_display(optarg); + break; + case QEMU_OPTION_nographic: + qdict_put_str(machine_opts_dict, "graphics", "off"); + nographic = true; + dpy.type = DISPLAY_TYPE_NONE; + break; + case QEMU_OPTION_portrait: + graphic_rotate = 90; + break; + case QEMU_OPTION_rotate: + graphic_rotate = strtol(optarg, (char **) &optarg, 10); + if (graphic_rotate != 0 && graphic_rotate != 90 && + graphic_rotate != 180 && graphic_rotate != 270) { + error_report("only 90, 180, 270 deg rotation is available"); + exit(1); + } + break; + case QEMU_OPTION_kernel: + qdict_put_str(machine_opts_dict, "kernel", optarg); + break; + case QEMU_OPTION_initrd: + qdict_put_str(machine_opts_dict, "initrd", optarg); + break; + case QEMU_OPTION_append: + qdict_put_str(machine_opts_dict, "append", optarg); + break; + case QEMU_OPTION_dtb: + qdict_put_str(machine_opts_dict, "dtb", optarg); + break; + case QEMU_OPTION_cdrom: + drive_add(IF_DEFAULT, 2, optarg, CDROM_OPTS); + break; + case QEMU_OPTION_boot: + machine_parse_property_opt(qemu_find_opts("boot-opts"), "boot", optarg); + break; + case QEMU_OPTION_fda: + case QEMU_OPTION_fdb: + drive_add(IF_FLOPPY, popt->index - QEMU_OPTION_fda, + optarg, FD_OPTS); + break; + case QEMU_OPTION_no_fd_bootchk: + fd_bootchk = 0; + break; + case QEMU_OPTION_netdev: + default_net = 0; + if (netdev_is_modern(optarg)) { + netdev_parse_modern(optarg); + } else { + net_client_parse(qemu_find_opts("netdev"), optarg); + } + break; + case QEMU_OPTION_nic: + default_net = 0; + net_client_parse(qemu_find_opts("nic"), optarg); + break; + case QEMU_OPTION_net: + default_net = 0; + net_client_parse(qemu_find_opts("net"), optarg); + break; +#ifdef CONFIG_LIBISCSI + case QEMU_OPTION_iscsi: + opts = qemu_opts_parse_noisily(qemu_find_opts("iscsi"), + optarg, false); + if (!opts) { + exit(1); + } + break; +#endif + case QEMU_OPTION_audiodev: + audio_parse_option(optarg); + break; + case QEMU_OPTION_audio: { + bool help; + char *model; + Audiodev *dev = NULL; + Visitor *v; + QDict *dict = keyval_parse(optarg, "driver", &help, &error_fatal); + if (help || (qdict_haskey(dict, "driver") && + is_help_option(qdict_get_str(dict, "driver")))) { + audio_help(); + exit(EXIT_SUCCESS); + } + if (!qdict_haskey(dict, "id")) { + qdict_put_str(dict, "id", "audiodev0"); + } + if (!qdict_haskey(dict, "model")) { + error_setg(&error_fatal, "Parameter 'model' is missing"); + } + model = g_strdup(qdict_get_str(dict, "model")); + qdict_del(dict, "model"); + if (is_help_option(model)) { + show_valid_soundhw(); + exit(0); + } + v = qobject_input_visitor_new_keyval(QOBJECT(dict)); + qobject_unref(dict); + visit_type_Audiodev(v, NULL, &dev, &error_fatal); + visit_free(v); + audio_define(dev); + select_soundhw(model, dev->id); + g_free(model); + break; + } + case QEMU_OPTION_h: + help(0); + break; + case QEMU_OPTION_version: + version(); + exit(0); + break; + case QEMU_OPTION_m: + opts = qemu_opts_parse_noisily(qemu_find_opts("memory"), optarg, true); + if (opts == NULL) { + exit(1); + } + break; +#ifdef CONFIG_TPM + case QEMU_OPTION_tpmdev: + if (tpm_config_parse(qemu_find_opts("tpmdev"), optarg) < 0) { + exit(1); + } + break; +#endif + case QEMU_OPTION_mempath: + mem_path = optarg; + break; + case QEMU_OPTION_mem_prealloc: + mem_prealloc = 1; + break; + case QEMU_OPTION_d: + log_mask = optarg; + break; + case QEMU_OPTION_D: + log_file = optarg; + break; + case QEMU_OPTION_DFILTER: + qemu_set_dfilter_ranges(optarg, &error_fatal); + break; +#if defined(CONFIG_TCG) && defined(CONFIG_LINUX) + case QEMU_OPTION_perfmap: + perf_enable_perfmap(); + break; + case QEMU_OPTION_jitdump: + perf_enable_jitdump(); + break; +#endif + case QEMU_OPTION_seed: + qemu_guest_random_seed_main(optarg, &error_fatal); + break; + case QEMU_OPTION_s: + add_device_config(DEV_GDB, "tcp::" DEFAULT_GDBSTUB_PORT); + break; + case QEMU_OPTION_gdb: + add_device_config(DEV_GDB, optarg); + break; + case QEMU_OPTION_L: + if (is_help_option(optarg)) { + list_data_dirs = true; + } else { + qemu_add_data_dir(g_strdup(optarg)); + } + break; + case QEMU_OPTION_bios: + qdict_put_str(machine_opts_dict, "firmware", optarg); + break; + case QEMU_OPTION_singlestep: + opt_one_insn_per_tb = true; + break; + case QEMU_OPTION_S: + autostart = 0; + break; + case QEMU_OPTION_k: + keyboard_layout = optarg; + break; + case QEMU_OPTION_vga: + vga_model = optarg; + default_vga = 0; + break; + case QEMU_OPTION_g: + { + const char *p; + int w, h, depth; + p = optarg; + w = strtol(p, (char **)&p, 10); + if (w <= 0) { + graphic_error: + error_report("invalid resolution or depth"); + exit(1); + } + if (*p != 'x') + goto graphic_error; + p++; + h = strtol(p, (char **)&p, 10); + if (h <= 0) + goto graphic_error; + if (*p == 'x') { + p++; + depth = strtol(p, (char **)&p, 10); + if (depth != 1 && depth != 2 && depth != 4 && + depth != 8 && depth != 15 && depth != 16 && + depth != 24 && depth != 32) + goto graphic_error; + } else if (*p == '\0') { + depth = graphic_depth; + } else { + goto graphic_error; + } + + graphic_width = w; + graphic_height = h; + graphic_depth = depth; + } + break; + case QEMU_OPTION_echr: + { + char *r; + term_escape_char = strtol(optarg, &r, 0); + if (r == optarg) + printf("Bad argument to echr\n"); + break; + } + case QEMU_OPTION_monitor: + default_monitor = 0; + if (strncmp(optarg, "none", 4)) { + monitor_parse(optarg, "readline", false); + } + break; + case QEMU_OPTION_qmp: + monitor_parse(optarg, "control", false); + default_monitor = 0; + break; + case QEMU_OPTION_qmp_pretty: + monitor_parse(optarg, "control", true); + default_monitor = 0; + break; + case QEMU_OPTION_mon: + opts = qemu_opts_parse_noisily(qemu_find_opts("mon"), optarg, + true); + if (!opts) { + exit(1); + } + default_monitor = 0; + break; + case QEMU_OPTION_chardev: + opts = qemu_opts_parse_noisily(qemu_find_opts("chardev"), + optarg, true); + if (!opts) { + exit(1); + } + break; + case QEMU_OPTION_fsdev: + olist = qemu_find_opts("fsdev"); + if (!olist) { + error_report("fsdev support is disabled"); + exit(1); + } + opts = qemu_opts_parse_noisily(olist, optarg, true); + if (!opts) { + exit(1); + } + break; + case QEMU_OPTION_virtfs: { + QemuOpts *fsdev; + QemuOpts *device; + const char *writeout, *sock_fd, *socket, *path, *security_model, + *multidevs; + + olist = qemu_find_opts("virtfs"); + if (!olist) { + error_report("virtfs support is disabled"); + exit(1); + } + opts = qemu_opts_parse_noisily(olist, optarg, true); + if (!opts) { + exit(1); + } + + if (qemu_opt_get(opts, "fsdriver") == NULL || + qemu_opt_get(opts, "mount_tag") == NULL) { + error_report("Usage: -virtfs fsdriver,mount_tag=tag"); + exit(1); + } + fsdev = qemu_opts_create(qemu_find_opts("fsdev"), + qemu_opts_id(opts) ?: + qemu_opt_get(opts, "mount_tag"), + 1, NULL); + if (!fsdev) { + error_report("duplicate or invalid fsdev id: %s", + qemu_opt_get(opts, "mount_tag")); + exit(1); + } + + writeout = qemu_opt_get(opts, "writeout"); + if (writeout) { +#ifdef CONFIG_SYNC_FILE_RANGE + qemu_opt_set(fsdev, "writeout", writeout, &error_abort); +#else + error_report("writeout=immediate not supported " + "on this platform"); + exit(1); +#endif + } + qemu_opt_set(fsdev, "fsdriver", + qemu_opt_get(opts, "fsdriver"), &error_abort); + path = qemu_opt_get(opts, "path"); + if (path) { + qemu_opt_set(fsdev, "path", path, &error_abort); + } + security_model = qemu_opt_get(opts, "security_model"); + if (security_model) { + qemu_opt_set(fsdev, "security_model", security_model, + &error_abort); + } + socket = qemu_opt_get(opts, "socket"); + if (socket) { + qemu_opt_set(fsdev, "socket", socket, &error_abort); + } + sock_fd = qemu_opt_get(opts, "sock_fd"); + if (sock_fd) { + qemu_opt_set(fsdev, "sock_fd", sock_fd, &error_abort); + } + + qemu_opt_set_bool(fsdev, "readonly", + qemu_opt_get_bool(opts, "readonly", 0), + &error_abort); + multidevs = qemu_opt_get(opts, "multidevs"); + if (multidevs) { + qemu_opt_set(fsdev, "multidevs", multidevs, &error_abort); + } + device = qemu_opts_create(qemu_find_opts("device"), NULL, 0, + &error_abort); + qemu_opt_set(device, "driver", "virtio-9p-pci", &error_abort); + qemu_opt_set(device, "fsdev", + qemu_opts_id(fsdev), &error_abort); + qemu_opt_set(device, "mount_tag", + qemu_opt_get(opts, "mount_tag"), &error_abort); + break; + } + case QEMU_OPTION_serial: + add_device_config(DEV_SERIAL, optarg); + default_serial = 0; + if (strncmp(optarg, "mon:", 4) == 0) { + default_monitor = 0; + } + break; + case QEMU_OPTION_action: + olist = qemu_find_opts("action"); + if (!qemu_opts_parse_noisily(olist, optarg, false)) { + exit(1); + } + break; + case QEMU_OPTION_watchdog_action: { + opts = qemu_opts_create(qemu_find_opts("action"), NULL, 0, &error_abort); + qemu_opt_set(opts, "watchdog", optarg, &error_abort); + break; + } + case QEMU_OPTION_parallel: + add_device_config(DEV_PARALLEL, optarg); + default_parallel = 0; + if (strncmp(optarg, "mon:", 4) == 0) { + default_monitor = 0; + } + break; + case QEMU_OPTION_debugcon: + add_device_config(DEV_DEBUGCON, optarg); + break; + case QEMU_OPTION_loadvm: + loadvm = optarg; + break; + case QEMU_OPTION_full_screen: + dpy.has_full_screen = true; + dpy.full_screen = true; + break; + case QEMU_OPTION_pidfile: + pid_file = optarg; + break; + case QEMU_OPTION_win2k_hack: + win2k_install_hack = 1; + break; + case QEMU_OPTION_acpitable: + opts = qemu_opts_parse_noisily(qemu_find_opts("acpi"), + optarg, true); + if (!opts) { + exit(1); + } + acpi_table_add(opts, &error_fatal); + break; + case QEMU_OPTION_smbios: + opts = qemu_opts_parse_noisily(qemu_find_opts("smbios"), + optarg, false); + if (!opts) { + exit(1); + } + smbios_entry_add(opts, &error_fatal); + break; + case QEMU_OPTION_fwcfg: + opts = qemu_opts_parse_noisily(qemu_find_opts("fw_cfg"), + optarg, true); + if (opts == NULL) { + exit(1); + } + break; + case QEMU_OPTION_preconfig: + preconfig_requested = true; + break; + case QEMU_OPTION_enable_kvm: + qdict_put_str(machine_opts_dict, "accel", "kvm"); + break; + case QEMU_OPTION_M: + case QEMU_OPTION_machine: + { + bool help; + + keyval_parse_into(machine_opts_dict, optarg, "type", &help, &error_fatal); + if (help) { + machine_help_func(machine_opts_dict); + exit(EXIT_SUCCESS); + } + break; + } + case QEMU_OPTION_accel: + accel_opts = qemu_opts_parse_noisily(qemu_find_opts("accel"), + optarg, true); + optarg = qemu_opt_get(accel_opts, "accel"); + if (!optarg || is_help_option(optarg)) { + printf("Accelerators supported in QEMU binary:\n"); + GSList *el, *accel_list = object_class_get_list(TYPE_ACCEL, + false); + for (el = accel_list; el; el = el->next) { + gchar *typename = g_strdup(object_class_get_name( + OBJECT_CLASS(el->data))); + /* omit qtest which is used for tests only */ + if (g_strcmp0(typename, ACCEL_CLASS_NAME("qtest")) && + g_str_has_suffix(typename, ACCEL_CLASS_SUFFIX)) { + gchar **optname = g_strsplit(typename, + ACCEL_CLASS_SUFFIX, 0); + printf("%s\n", optname[0]); + g_strfreev(optname); + } + g_free(typename); + } + g_slist_free(accel_list); + exit(0); + } + break; + case QEMU_OPTION_usb: + qdict_put_str(machine_opts_dict, "usb", "on"); + break; + case QEMU_OPTION_usbdevice: + qdict_put_str(machine_opts_dict, "usb", "on"); + add_device_config(DEV_USB, optarg); + break; + case QEMU_OPTION_device: + if (optarg[0] == '{') { + QObject *obj = qobject_from_json(optarg, &error_fatal); + DeviceOption *opt = g_new0(DeviceOption, 1); + opt->opts = qobject_to(QDict, obj); + loc_save(&opt->loc); + assert(opt->opts != NULL); + QTAILQ_INSERT_TAIL(&device_opts, opt, next); + } else { + if (!qemu_opts_parse_noisily(qemu_find_opts("device"), + optarg, true)) { + exit(1); + } + } + break; + case QEMU_OPTION_smp: + machine_parse_property_opt(qemu_find_opts("smp-opts"), + "smp", optarg); + break; + case QEMU_OPTION_vnc: + vnc_parse(optarg); + break; + case QEMU_OPTION_no_acpi: + warn_report("-no-acpi is deprecated, use '-machine acpi=off' instead"); + qdict_put_str(machine_opts_dict, "acpi", "off"); + break; + case QEMU_OPTION_no_hpet: + warn_report("-no-hpet is deprecated, use '-machine hpet=off' instead"); + qdict_put_str(machine_opts_dict, "hpet", "off"); + break; + case QEMU_OPTION_no_reboot: + olist = qemu_find_opts("action"); + qemu_opts_parse_noisily(olist, "reboot=shutdown", false); + break; + case QEMU_OPTION_no_shutdown: + olist = qemu_find_opts("action"); + qemu_opts_parse_noisily(olist, "shutdown=pause", false); + break; + case QEMU_OPTION_uuid: + if (qemu_uuid_parse(optarg, &qemu_uuid) < 0) { + error_report("failed to parse UUID string: wrong format"); + exit(1); + } + qemu_uuid_set = true; + break; + case QEMU_OPTION_option_rom: + if (nb_option_roms >= MAX_OPTION_ROMS) { + error_report("too many option ROMs"); + exit(1); + } + opts = qemu_opts_parse_noisily(qemu_find_opts("option-rom"), + optarg, true); + if (!opts) { + exit(1); + } + option_rom[nb_option_roms].name = qemu_opt_get(opts, "romfile"); + option_rom[nb_option_roms].bootindex = + qemu_opt_get_number(opts, "bootindex", -1); + if (!option_rom[nb_option_roms].name) { + error_report("Option ROM file is not specified"); + exit(1); + } + nb_option_roms++; + break; + case QEMU_OPTION_semihosting: + qemu_semihosting_enable(); + break; + case QEMU_OPTION_semihosting_config: + if (qemu_semihosting_config_options(optarg) != 0) { + exit(1); + } + break; + case QEMU_OPTION_name: + opts = qemu_opts_parse_noisily(qemu_find_opts("name"), + optarg, true); + if (!opts) { + exit(1); + } + /* Capture guest name if -msg guest-name is used later */ + error_guest_name = qemu_opt_get(opts, "guest"); + break; + case QEMU_OPTION_prom_env: + if (nb_prom_envs >= MAX_PROM_ENVS) { + error_report("too many prom variables"); + exit(1); + } + prom_envs[nb_prom_envs] = optarg; + nb_prom_envs++; + break; + case QEMU_OPTION_old_param: + old_param = 1; + break; + case QEMU_OPTION_rtc: + opts = qemu_opts_parse_noisily(qemu_find_opts("rtc"), optarg, + false); + if (!opts) { + exit(1); + } + break; + case QEMU_OPTION_icount: + icount_opts = qemu_opts_parse_noisily(qemu_find_opts("icount"), + optarg, true); + if (!icount_opts) { + exit(1); + } + break; + case QEMU_OPTION_incoming: + if (!incoming) { + runstate_set(RUN_STATE_INMIGRATE); + } + incoming = optarg; + break; + case QEMU_OPTION_only_migratable: + only_migratable = 1; + break; + case QEMU_OPTION_nodefaults: + has_defaults = 0; + break; + case QEMU_OPTION_xen_domid: + if (!(accel_find("xen")) && !(accel_find("kvm"))) { + error_report("Option not supported for this target"); + exit(1); + } + xen_domid = atoi(optarg); + break; + case QEMU_OPTION_xen_attach: + if (!(accel_find("xen"))) { + error_report("Option not supported for this target"); + exit(1); + } + xen_mode = XEN_ATTACH; + break; + case QEMU_OPTION_xen_domid_restrict: + if (!(accel_find("xen"))) { + error_report("Option not supported for this target"); + exit(1); + } + xen_domid_restrict = true; + break; + case QEMU_OPTION_trace: + trace_opt_parse(optarg); + break; + case QEMU_OPTION_plugin: + qemu_plugin_opt_parse(optarg, &plugin_list); + break; + case QEMU_OPTION_readconfig: + qemu_read_config_file(optarg, qemu_parse_config_group, &error_fatal); + break; +#ifdef CONFIG_SPICE + case QEMU_OPTION_spice: + olist = qemu_find_opts_err("spice", NULL); + if (!olist) { + error_report("spice support is disabled"); + exit(1); + } + opts = qemu_opts_parse_noisily(olist, optarg, false); + if (!opts) { + exit(1); + } + display_remote++; + break; +#endif + case QEMU_OPTION_qtest: + qtest_chrdev = optarg; + break; + case QEMU_OPTION_qtest_log: + qtest_log = optarg; + break; + case QEMU_OPTION_sandbox: + olist = qemu_find_opts("sandbox"); + if (!olist) { +#ifndef CONFIG_SECCOMP + error_report("-sandbox support is not enabled " + "in this QEMU binary"); +#endif + exit(1); + } + + opts = qemu_opts_parse_noisily(olist, optarg, true); + if (!opts) { + exit(1); + } + break; + case QEMU_OPTION_add_fd: +#ifndef _WIN32 + opts = qemu_opts_parse_noisily(qemu_find_opts("add-fd"), + optarg, false); + if (!opts) { + exit(1); + } +#else + error_report("File descriptor passing is disabled on this " + "platform"); + exit(1); +#endif + break; + case QEMU_OPTION_object: + object_option_parse(optarg); + break; + case QEMU_OPTION_overcommit: + opts = qemu_opts_parse_noisily(qemu_find_opts("overcommit"), + optarg, false); + if (!opts) { + exit(1); + } + enable_mlock = qemu_opt_get_bool(opts, "mem-lock", false); + enable_cpu_pm = qemu_opt_get_bool(opts, "cpu-pm", false); + break; + case QEMU_OPTION_compat: + { + CompatPolicy *opts_policy; + Visitor *v; + + v = qobject_input_visitor_new_str(optarg, NULL, + &error_fatal); + + visit_type_CompatPolicy(v, NULL, &opts_policy, &error_fatal); + QAPI_CLONE_MEMBERS(CompatPolicy, &compat_policy, opts_policy); + + qapi_free_CompatPolicy(opts_policy); + visit_free(v); + break; + } + case QEMU_OPTION_msg: + opts = qemu_opts_parse_noisily(qemu_find_opts("msg"), optarg, + false); + if (!opts) { + exit(1); + } + configure_msg(opts); + break; + case QEMU_OPTION_dump_vmstate: + if (vmstate_dump_file) { + error_report("only one '-dump-vmstate' " + "option may be given"); + exit(1); + } + vmstate_dump_file = fopen(optarg, "w"); + if (vmstate_dump_file == NULL) { + error_report("open %s: %s", optarg, strerror(errno)); + exit(1); + } + break; + case QEMU_OPTION_enable_sync_profile: + qsp_enable(); + break; + case QEMU_OPTION_nouserconfig: + /* Nothing to be parsed here. Especially, do not error out below. */ + break; +#if defined(CONFIG_POSIX) + case QEMU_OPTION_runas: + if (!os_set_runas(optarg)) { + error_report("User \"%s\" doesn't exist" + " (and is not <uid>:<gid>)", + optarg); + exit(1); + } + break; + case QEMU_OPTION_chroot: + warn_report("option is deprecated," + " use '-run-with chroot=...' instead"); + os_set_chroot(optarg); + break; + case QEMU_OPTION_daemonize: + os_set_daemonize(true); + break; +#if defined(CONFIG_LINUX) + /* deprecated */ + case QEMU_OPTION_asyncteardown: + init_async_teardown(); + break; +#endif + case QEMU_OPTION_run_with: { + const char *str; + opts = qemu_opts_parse_noisily(qemu_find_opts("run-with"), + optarg, false); + if (!opts) { + exit(1); + } +#if defined(CONFIG_LINUX) + if (qemu_opt_get_bool(opts, "async-teardown", false)) { + init_async_teardown(); + } +#endif + str = qemu_opt_get(opts, "chroot"); + if (str) { + os_set_chroot(str); + } + break; + } +#endif /* CONFIG_POSIX */ + + default: + error_report("Option not supported in this build"); + exit(1); + } + } + } + /* + * Clear error location left behind by the loop. + * Best done right after the loop. Do not insert code here! + */ + loc_set_none(); + + qemu_validate_options(machine_opts_dict); + qemu_process_sugar_options(); + + /* + * These options affect everything else and should be processed + * before daemonizing. + */ + qemu_process_early_options(); + + qemu_process_help_options(); + qemu_maybe_daemonize(pid_file); + + /* + * The trace backend must be initialized after daemonizing. + * trace_init_backends() will call st_init(), which will create the + * trace thread in the parent, and also register st_flush_trace_buffer() + * in atexit(). This function will force the parent to wait for the + * writeout thread to finish, which will not occur, and the parent + * process will be left in the host. + */ + if (!trace_init_backends()) { + exit(1); + } + trace_init_file(); + + qemu_init_main_loop(&error_fatal); + cpu_timers_init(); + + user_register_global_props(); + replay_configure(icount_opts); + + configure_rtc(qemu_find_opts_singleton("rtc")); + + /* Transfer QemuOpts options into machine options */ + parse_memory_options(); + + qemu_create_machine(machine_opts_dict); + + suspend_mux_open(); + + qemu_disable_default_devices(); + qemu_create_default_devices(); + qemu_create_early_backends(); + + qemu_apply_legacy_machine_options(machine_opts_dict); + qemu_apply_machine_options(machine_opts_dict); + qobject_unref(machine_opts_dict); + phase_advance(PHASE_MACHINE_CREATED); + + /* + * Note: uses machine properties such as kernel-irqchip, must run + * after qemu_apply_machine_options. + */ + configure_accelerators(argv[0]); + phase_advance(PHASE_ACCEL_CREATED); + + /* + * Beware, QOM objects created before this point miss global and + * compat properties. + * + * Global properties get set up by qdev_prop_register_global(), + * called from user_register_global_props(), and certain option + * desugaring. Also in CPU feature desugaring (buried in + * parse_cpu_option()), which happens below this point, but may + * only target the CPU type, which can only be created after + * parse_cpu_option() returned the type. + * + * Machine compat properties: object_set_machine_compat_props(). + * Accelerator compat props: object_set_accelerator_compat_props(), + * called from do_configure_accelerator(). + */ + + machine_class = MACHINE_GET_CLASS(current_machine); + if (!qtest_enabled() && machine_class->deprecation_reason) { + warn_report("Machine type '%s' is deprecated: %s", + machine_class->name, machine_class->deprecation_reason); + } + + /* + * Create backends before creating migration objects, so that it can + * check against compatibilities on the backend memories (e.g. postcopy + * over memory-backend-file objects). + */ + qemu_create_late_backends(); + + /* + * Note: creates a QOM object, must run only after global and + * compat properties have been set up. + */ + migration_object_init(); + + /* parse features once if machine provides default cpu_type */ + current_machine->cpu_type = machine_class->default_cpu_type; + if (cpu_option) { + current_machine->cpu_type = parse_cpu_option(cpu_option); + } + /* NB: for machine none cpu_type could STILL be NULL here! */ + + qemu_resolve_machine_memdev(); + parse_numa_opts(current_machine); + + if (vmstate_dump_file) { + /* dump and exit */ + module_load_qom_all(); + dump_vmstate_json_to_file(vmstate_dump_file); + exit(0); + } + + if (!preconfig_requested) { + qmp_x_exit_preconfig(&error_fatal); + } + qemu_init_displays(); + accel_setup_post(current_machine); + os_setup_post(); + resume_mux_open(); +} diff --git a/system/watchpoint.c b/system/watchpoint.c new file mode 100644 index 0000000000..45d1f12faf --- /dev/null +++ b/system/watchpoint.c @@ -0,0 +1,226 @@ +/* + * CPU watchpoints + * + * Copyright (c) 2003 Fabrice Bellard + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +#include "qemu/osdep.h" +#include "qemu/main-loop.h" +#include "qemu/error-report.h" +#include "exec/exec-all.h" +#include "exec/translate-all.h" +#include "sysemu/tcg.h" +#include "sysemu/replay.h" +#include "hw/core/tcg-cpu-ops.h" +#include "hw/core/cpu.h" + +/* Add a watchpoint. */ +int cpu_watchpoint_insert(CPUState *cpu, vaddr addr, vaddr len, + int flags, CPUWatchpoint **watchpoint) +{ + CPUWatchpoint *wp; + vaddr in_page; + + /* forbid ranges which are empty or run off the end of the address space */ + if (len == 0 || (addr + len - 1) < addr) { + error_report("tried to set invalid watchpoint at %" + VADDR_PRIx ", len=%" VADDR_PRIu, addr, len); + return -EINVAL; + } + wp = g_malloc(sizeof(*wp)); + + wp->vaddr = addr; + wp->len = len; + wp->flags = flags; + + /* keep all GDB-injected watchpoints in front */ + if (flags & BP_GDB) { + QTAILQ_INSERT_HEAD(&cpu->watchpoints, wp, entry); + } else { + QTAILQ_INSERT_TAIL(&cpu->watchpoints, wp, entry); + } + + in_page = -(addr | TARGET_PAGE_MASK); + if (len <= in_page) { + tlb_flush_page(cpu, addr); + } else { + tlb_flush(cpu); + } + + if (watchpoint) { + *watchpoint = wp; + } + return 0; +} + +/* Remove a specific watchpoint. */ +int cpu_watchpoint_remove(CPUState *cpu, vaddr addr, vaddr len, + int flags) +{ + CPUWatchpoint *wp; + + QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) { + if (addr == wp->vaddr && len == wp->len + && flags == (wp->flags & ~BP_WATCHPOINT_HIT)) { + cpu_watchpoint_remove_by_ref(cpu, wp); + return 0; + } + } + return -ENOENT; +} + +/* Remove a specific watchpoint by reference. */ +void cpu_watchpoint_remove_by_ref(CPUState *cpu, CPUWatchpoint *watchpoint) +{ + QTAILQ_REMOVE(&cpu->watchpoints, watchpoint, entry); + + tlb_flush_page(cpu, watchpoint->vaddr); + + g_free(watchpoint); +} + +/* Remove all matching watchpoints. */ +void cpu_watchpoint_remove_all(CPUState *cpu, int mask) +{ + CPUWatchpoint *wp, *next; + + QTAILQ_FOREACH_SAFE(wp, &cpu->watchpoints, entry, next) { + if (wp->flags & mask) { + cpu_watchpoint_remove_by_ref(cpu, wp); + } + } +} + +#ifdef CONFIG_TCG + +/* + * Return true if this watchpoint address matches the specified + * access (ie the address range covered by the watchpoint overlaps + * partially or completely with the address range covered by the + * access). + */ +static inline bool watchpoint_address_matches(CPUWatchpoint *wp, + vaddr addr, vaddr len) +{ + /* + * We know the lengths are non-zero, but a little caution is + * required to avoid errors in the case where the range ends + * exactly at the top of the address space and so addr + len + * wraps round to zero. + */ + vaddr wpend = wp->vaddr + wp->len - 1; + vaddr addrend = addr + len - 1; + + return !(addr > wpend || wp->vaddr > addrend); +} + +/* Return flags for watchpoints that match addr + prot. */ +int cpu_watchpoint_address_matches(CPUState *cpu, vaddr addr, vaddr len) +{ + CPUWatchpoint *wp; + int ret = 0; + + QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) { + if (watchpoint_address_matches(wp, addr, len)) { + ret |= wp->flags; + } + } + return ret; +} + +/* Generate a debug exception if a watchpoint has been hit. */ +void cpu_check_watchpoint(CPUState *cpu, vaddr addr, vaddr len, + MemTxAttrs attrs, int flags, uintptr_t ra) +{ + CPUClass *cc = CPU_GET_CLASS(cpu); + CPUWatchpoint *wp; + + assert(tcg_enabled()); + if (cpu->watchpoint_hit) { + /* + * We re-entered the check after replacing the TB. + * Now raise the debug interrupt so that it will + * trigger after the current instruction. + */ + qemu_mutex_lock_iothread(); + cpu_interrupt(cpu, CPU_INTERRUPT_DEBUG); + qemu_mutex_unlock_iothread(); + return; + } + + if (cc->tcg_ops->adjust_watchpoint_address) { + /* this is currently used only by ARM BE32 */ + addr = cc->tcg_ops->adjust_watchpoint_address(cpu, addr, len); + } + + assert((flags & ~BP_MEM_ACCESS) == 0); + QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) { + int hit_flags = wp->flags & flags; + + if (hit_flags && watchpoint_address_matches(wp, addr, len)) { + if (replay_running_debug()) { + /* + * replay_breakpoint reads icount. + * Force recompile to succeed, because icount may + * be read only at the end of the block. + */ + if (!cpu->neg.can_do_io) { + /* Force execution of one insn next time. */ + cpu->cflags_next_tb = 1 | CF_LAST_IO | CF_NOIRQ + | curr_cflags(cpu); + cpu_loop_exit_restore(cpu, ra); + } + /* + * Don't process the watchpoints when we are + * in a reverse debugging operation. + */ + replay_breakpoint(); + return; + } + + wp->flags |= hit_flags << BP_HIT_SHIFT; + wp->hitaddr = MAX(addr, wp->vaddr); + wp->hitattrs = attrs; + + if (wp->flags & BP_CPU + && cc->tcg_ops->debug_check_watchpoint + && !cc->tcg_ops->debug_check_watchpoint(cpu, wp)) { + wp->flags &= ~BP_WATCHPOINT_HIT; + continue; + } + cpu->watchpoint_hit = wp; + + mmap_lock(); + /* This call also restores vCPU state */ + tb_check_watchpoint(cpu, ra); + if (wp->flags & BP_STOP_BEFORE_ACCESS) { + cpu->exception_index = EXCP_DEBUG; + mmap_unlock(); + cpu_loop_exit(cpu); + } else { + /* Force execution of one insn next time. */ + cpu->cflags_next_tb = 1 | CF_LAST_IO | CF_NOIRQ + | curr_cflags(cpu); + mmap_unlock(); + cpu_loop_exit_noexc(cpu); + } + } else { + wp->flags &= ~BP_WATCHPOINT_HIT; + } + } +} + +#endif /* CONFIG_TCG */ |