aboutsummaryrefslogtreecommitdiff
path: root/system
diff options
context:
space:
mode:
authorPhilippe Mathieu-Daudé <philmd@linaro.org>2023-10-04 11:06:28 +0200
committerPaolo Bonzini <pbonzini@redhat.com>2023-10-08 21:08:08 +0200
commit8d7f2e767d8cd058c817dbe31430b89f2e11535d (patch)
tree5b7c25cddf7e6c1b9dfc93c5169541e8c14e2901 /system
parent01c85e60a4d0675f0ad203fe1fe119381e7ad15b (diff)
system: Rename softmmu/ directory as system/
The softmmu/ directory contains files specific to system emulation. Rename it as system/. Update meson rules, the MAINTAINERS file and all the documentation and comments. Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org> Message-ID: <20231004090629.37473-14-philmd@linaro.org> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Diffstat (limited to 'system')
-rw-r--r--system/arch_init.c50
-rw-r--r--system/async-teardown.c143
-rw-r--r--system/balloon.c106
-rw-r--r--system/bootdevice.c430
-rw-r--r--system/cpu-throttle.c128
-rw-r--r--system/cpu-timers.c277
-rw-r--r--system/cpus.c822
-rw-r--r--system/datadir.c110
-rw-r--r--system/device_tree.c703
-rw-r--r--system/dirtylimit.c678
-rw-r--r--system/dma-helpers.c347
-rw-r--r--system/globals.c70
-rw-r--r--system/ioport.c346
-rw-r--r--system/main.c49
-rw-r--r--system/memory.c3683
-rw-r--r--system/memory_mapping.c377
-rw-r--r--system/meson.build36
-rw-r--r--system/physmem.c3796
-rw-r--r--system/qdev-monitor.c1148
-rw-r--r--system/qemu-seccomp.c486
-rw-r--r--system/qtest.c1070
-rw-r--r--system/rtc.c192
-rw-r--r--system/runstate-action.c46
-rw-r--r--system/runstate-hmp-cmds.c95
-rw-r--r--system/runstate.c871
-rw-r--r--system/tpm-hmp-cmds.c65
-rw-r--r--system/tpm.c239
-rw-r--r--system/trace-events40
-rw-r--r--system/trace.h1
-rw-r--r--system/vl.c3730
-rw-r--r--system/watchpoint.c226
31 files changed, 20360 insertions, 0 deletions
diff --git a/system/arch_init.c b/system/arch_init.c
new file mode 100644
index 0000000000..79716f959b
--- /dev/null
+++ b/system/arch_init.c
@@ -0,0 +1,50 @@
+/*
+ * QEMU System Emulator
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu/osdep.h"
+#include "qemu/module.h"
+#include "sysemu/arch_init.h"
+
+#ifdef TARGET_SPARC
+int graphic_width = 1024;
+int graphic_height = 768;
+int graphic_depth = 8;
+#elif defined(TARGET_M68K)
+int graphic_width = 800;
+int graphic_height = 600;
+int graphic_depth = 8;
+#else
+int graphic_width = 800;
+int graphic_height = 600;
+int graphic_depth = 32;
+#endif
+
+const uint32_t arch_type = QEMU_ARCH;
+
+void qemu_init_arch_modules(void)
+{
+#ifdef CONFIG_MODULES
+ module_init_info(qemu_modinfo);
+ module_allow_arch(TARGET_NAME);
+#endif
+}
diff --git a/system/async-teardown.c b/system/async-teardown.c
new file mode 100644
index 0000000000..396963c091
--- /dev/null
+++ b/system/async-teardown.c
@@ -0,0 +1,143 @@
+/*
+ * Asynchronous teardown
+ *
+ * Copyright IBM, Corp. 2022
+ *
+ * Authors:
+ * Claudio Imbrenda <imbrenda@linux.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or (at your
+ * option) any later version. See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include <dirent.h>
+#include <sys/prctl.h>
+#include <sched.h>
+
+#include "qemu/async-teardown.h"
+
+#ifdef _SC_THREAD_STACK_MIN
+#define CLONE_STACK_SIZE sysconf(_SC_THREAD_STACK_MIN)
+#else
+#define CLONE_STACK_SIZE 16384
+#endif
+
+static pid_t the_ppid;
+
+/*
+ * Close all open file descriptors.
+ */
+static void close_all_open_fd(void)
+{
+ struct dirent *de;
+ int fd, dfd;
+ DIR *dir;
+
+#ifdef CONFIG_CLOSE_RANGE
+ int r = close_range(0, ~0U, 0);
+ if (!r) {
+ /* Success, no need to try other ways. */
+ return;
+ }
+#endif
+
+ dir = opendir("/proc/self/fd");
+ if (!dir) {
+ /* If /proc is not mounted, there is nothing that can be done. */
+ return;
+ }
+ /* Avoid closing the directory. */
+ dfd = dirfd(dir);
+
+ for (de = readdir(dir); de; de = readdir(dir)) {
+ fd = atoi(de->d_name);
+ if (fd != dfd) {
+ close(fd);
+ }
+ }
+ closedir(dir);
+}
+
+static void hup_handler(int signal)
+{
+ /* Check every second if this process has been reparented. */
+ while (the_ppid == getppid()) {
+ /* sleep() is safe to use in a signal handler. */
+ sleep(1);
+ }
+
+ /* At this point the parent process has terminated completely. */
+ _exit(0);
+}
+
+static int async_teardown_fn(void *arg)
+{
+ struct sigaction sa = { .sa_handler = hup_handler };
+ sigset_t hup_signal;
+ char name[16];
+
+ /* Set a meaningful name for this process. */
+ snprintf(name, 16, "cleanup/%d", the_ppid);
+ prctl(PR_SET_NAME, (unsigned long)name);
+
+ /*
+ * Close all file descriptors that might have been inherited from the
+ * main qemu process when doing clone, needed to make libvirt happy.
+ * Not using close_range for increased compatibility with older kernels.
+ */
+ close_all_open_fd();
+
+ /* Set up a handler for SIGHUP and unblock SIGHUP. */
+ sigaction(SIGHUP, &sa, NULL);
+ sigemptyset(&hup_signal);
+ sigaddset(&hup_signal, SIGHUP);
+ sigprocmask(SIG_UNBLOCK, &hup_signal, NULL);
+
+ /* Ask to receive SIGHUP when the parent dies. */
+ prctl(PR_SET_PDEATHSIG, SIGHUP);
+
+ /*
+ * Sleep forever, unless the parent process has already terminated. The
+ * only interruption can come from the SIGHUP signal, which in normal
+ * operation is received when the parent process dies.
+ */
+ if (the_ppid == getppid()) {
+ pause();
+ }
+
+ /* At this point the parent process has terminated completely. */
+ _exit(0);
+}
+
+/*
+ * Allocate a new stack of a reasonable size, and return a pointer to its top.
+ */
+static void *new_stack_for_clone(void)
+{
+ size_t stack_size = CLONE_STACK_SIZE;
+ char *stack_ptr;
+
+ /* Allocate a new stack and get a pointer to its top. */
+ stack_ptr = qemu_alloc_stack(&stack_size);
+ stack_ptr += stack_size;
+
+ return stack_ptr;
+}
+
+/*
+ * Block all signals, start (clone) a new process sharing the address space
+ * with qemu (CLONE_VM), then restore signals.
+ */
+void init_async_teardown(void)
+{
+ sigset_t all_signals, old_signals;
+
+ the_ppid = getpid();
+
+ sigfillset(&all_signals);
+ sigprocmask(SIG_BLOCK, &all_signals, &old_signals);
+ clone(async_teardown_fn, new_stack_for_clone(), CLONE_VM, NULL);
+ sigprocmask(SIG_SETMASK, &old_signals, NULL);
+}
diff --git a/system/balloon.c b/system/balloon.c
new file mode 100644
index 0000000000..e0e8969a4b
--- /dev/null
+++ b/system/balloon.c
@@ -0,0 +1,106 @@
+/*
+ * Generic Balloon handlers and management
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ * Copyright (C) 2011 Red Hat, Inc.
+ * Copyright (C) 2011 Amit Shah <amit.shah@redhat.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/atomic.h"
+#include "sysemu/kvm.h"
+#include "sysemu/balloon.h"
+#include "qapi/error.h"
+#include "qapi/qapi-commands-machine.h"
+#include "qapi/qmp/qerror.h"
+#include "trace.h"
+
+static QEMUBalloonEvent *balloon_event_fn;
+static QEMUBalloonStatus *balloon_stat_fn;
+static void *balloon_opaque;
+
+static bool have_balloon(Error **errp)
+{
+ if (kvm_enabled() && !kvm_has_sync_mmu()) {
+ error_set(errp, ERROR_CLASS_KVM_MISSING_CAP,
+ "Using KVM without synchronous MMU, balloon unavailable");
+ return false;
+ }
+ if (!balloon_event_fn) {
+ error_set(errp, ERROR_CLASS_DEVICE_NOT_ACTIVE,
+ "No balloon device has been activated");
+ return false;
+ }
+ return true;
+}
+
+int qemu_add_balloon_handler(QEMUBalloonEvent *event_func,
+ QEMUBalloonStatus *stat_func, void *opaque)
+{
+ if (balloon_event_fn || balloon_stat_fn || balloon_opaque) {
+ /* We're already registered one balloon handler. How many can
+ * a guest really have?
+ */
+ return -1;
+ }
+ balloon_event_fn = event_func;
+ balloon_stat_fn = stat_func;
+ balloon_opaque = opaque;
+ return 0;
+}
+
+void qemu_remove_balloon_handler(void *opaque)
+{
+ if (balloon_opaque != opaque) {
+ return;
+ }
+ balloon_event_fn = NULL;
+ balloon_stat_fn = NULL;
+ balloon_opaque = NULL;
+}
+
+BalloonInfo *qmp_query_balloon(Error **errp)
+{
+ BalloonInfo *info;
+
+ if (!have_balloon(errp)) {
+ return NULL;
+ }
+
+ info = g_malloc0(sizeof(*info));
+ balloon_stat_fn(balloon_opaque, info);
+ return info;
+}
+
+void qmp_balloon(int64_t target, Error **errp)
+{
+ if (!have_balloon(errp)) {
+ return;
+ }
+
+ if (target <= 0) {
+ error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "target", "a size");
+ return;
+ }
+
+ trace_balloon_event(balloon_opaque, target);
+ balloon_event_fn(balloon_opaque, target);
+}
diff --git a/system/bootdevice.c b/system/bootdevice.c
new file mode 100644
index 0000000000..2106f1026f
--- /dev/null
+++ b/system/bootdevice.c
@@ -0,0 +1,430 @@
+/*
+ * QEMU Boot Device Implement
+ *
+ * Copyright (c) 2014 HUAWEI TECHNOLOGIES CO., LTD.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "sysemu/sysemu.h"
+#include "qapi/visitor.h"
+#include "qemu/error-report.h"
+#include "sysemu/reset.h"
+#include "hw/qdev-core.h"
+#include "hw/boards.h"
+
+typedef struct FWBootEntry FWBootEntry;
+
+struct FWBootEntry {
+ QTAILQ_ENTRY(FWBootEntry) link;
+ int32_t bootindex;
+ DeviceState *dev;
+ char *suffix;
+};
+
+static QTAILQ_HEAD(, FWBootEntry) fw_boot_order =
+ QTAILQ_HEAD_INITIALIZER(fw_boot_order);
+static QEMUBootSetHandler *boot_set_handler;
+static void *boot_set_opaque;
+
+void qemu_register_boot_set(QEMUBootSetHandler *func, void *opaque)
+{
+ boot_set_handler = func;
+ boot_set_opaque = opaque;
+}
+
+void qemu_boot_set(const char *boot_order, Error **errp)
+{
+ Error *local_err = NULL;
+
+ if (!boot_set_handler) {
+ error_setg(errp, "no function defined to set boot device list for"
+ " this architecture");
+ return;
+ }
+
+ validate_bootdevices(boot_order, &local_err);
+ if (local_err) {
+ error_propagate(errp, local_err);
+ return;
+ }
+
+ boot_set_handler(boot_set_opaque, boot_order, errp);
+}
+
+void validate_bootdevices(const char *devices, Error **errp)
+{
+ /* We just do some generic consistency checks */
+ const char *p;
+ int bitmap = 0;
+
+ for (p = devices; *p != '\0'; p++) {
+ /* Allowed boot devices are:
+ * a-b: floppy disk drives
+ * c-f: IDE disk drives
+ * g-m: machine implementation dependent drives
+ * n-p: network devices
+ * It's up to each machine implementation to check if the given boot
+ * devices match the actual hardware implementation and firmware
+ * features.
+ */
+ if (*p < 'a' || *p > 'p') {
+ error_setg(errp, "Invalid boot device '%c'", *p);
+ return;
+ }
+ if (bitmap & (1 << (*p - 'a'))) {
+ error_setg(errp, "Boot device '%c' was given twice", *p);
+ return;
+ }
+ bitmap |= 1 << (*p - 'a');
+ }
+}
+
+void restore_boot_order(void *opaque)
+{
+ char *normal_boot_order = opaque;
+ static int first = 1;
+
+ /* Restore boot order and remove ourselves after the first boot */
+ if (first) {
+ first = 0;
+ return;
+ }
+
+ if (boot_set_handler) {
+ qemu_boot_set(normal_boot_order, &error_abort);
+ }
+
+ qemu_unregister_reset(restore_boot_order, normal_boot_order);
+ g_free(normal_boot_order);
+}
+
+void check_boot_index(int32_t bootindex, Error **errp)
+{
+ FWBootEntry *i;
+
+ if (bootindex >= 0) {
+ QTAILQ_FOREACH(i, &fw_boot_order, link) {
+ if (i->bootindex == bootindex) {
+ error_setg(errp, "The bootindex %d has already been used",
+ bootindex);
+ return;
+ }
+ }
+ }
+}
+
+void del_boot_device_path(DeviceState *dev, const char *suffix)
+{
+ FWBootEntry *i;
+
+ if (dev == NULL) {
+ return;
+ }
+
+ QTAILQ_FOREACH(i, &fw_boot_order, link) {
+ if ((!suffix || !g_strcmp0(i->suffix, suffix)) &&
+ i->dev == dev) {
+ QTAILQ_REMOVE(&fw_boot_order, i, link);
+ g_free(i->suffix);
+ g_free(i);
+
+ break;
+ }
+ }
+}
+
+void add_boot_device_path(int32_t bootindex, DeviceState *dev,
+ const char *suffix)
+{
+ FWBootEntry *node, *i;
+
+ if (bootindex < 0) {
+ del_boot_device_path(dev, suffix);
+ return;
+ }
+
+ assert(dev != NULL || suffix != NULL);
+
+ del_boot_device_path(dev, suffix);
+
+ node = g_new0(FWBootEntry, 1);
+ node->bootindex = bootindex;
+ node->suffix = g_strdup(suffix);
+ node->dev = dev;
+
+ QTAILQ_FOREACH(i, &fw_boot_order, link) {
+ if (i->bootindex == bootindex) {
+ error_report("Two devices with same boot index %d", bootindex);
+ exit(1);
+ } else if (i->bootindex < bootindex) {
+ continue;
+ }
+ QTAILQ_INSERT_BEFORE(i, node, link);
+ return;
+ }
+ QTAILQ_INSERT_TAIL(&fw_boot_order, node, link);
+}
+
+DeviceState *get_boot_device(uint32_t position)
+{
+ uint32_t counter = 0;
+ FWBootEntry *i = NULL;
+ DeviceState *res = NULL;
+
+ if (!QTAILQ_EMPTY(&fw_boot_order)) {
+ QTAILQ_FOREACH(i, &fw_boot_order, link) {
+ if (counter == position) {
+ res = i->dev;
+ break;
+ }
+ counter++;
+ }
+ }
+ return res;
+}
+
+static char *get_boot_device_path(DeviceState *dev, bool ignore_suffixes,
+ const char *suffix)
+{
+ char *devpath = NULL, *s = NULL, *d, *bootpath;
+
+ if (dev) {
+ devpath = qdev_get_fw_dev_path(dev);
+ assert(devpath);
+ }
+
+ if (!ignore_suffixes) {
+ if (dev) {
+ d = qdev_get_own_fw_dev_path_from_handler(dev->parent_bus, dev);
+ if (d) {
+ assert(!suffix);
+ s = d;
+ } else {
+ s = g_strdup(suffix);
+ }
+ } else {
+ s = g_strdup(suffix);
+ }
+ }
+
+ bootpath = g_strdup_printf("%s%s",
+ devpath ? devpath : "",
+ s ? s : "");
+ g_free(devpath);
+ g_free(s);
+
+ return bootpath;
+}
+
+/*
+ * This function returns null terminated string that consist of new line
+ * separated device paths.
+ *
+ * memory pointed by "size" is assigned total length of the array in bytes
+ *
+ */
+char *get_boot_devices_list(size_t *size)
+{
+ FWBootEntry *i;
+ size_t total = 0;
+ char *list = NULL;
+ MachineClass *mc = MACHINE_GET_CLASS(qdev_get_machine());
+ bool ignore_suffixes = mc->ignore_boot_device_suffixes;
+
+ QTAILQ_FOREACH(i, &fw_boot_order, link) {
+ char *bootpath;
+ size_t len;
+
+ bootpath = get_boot_device_path(i->dev, ignore_suffixes, i->suffix);
+
+ if (total) {
+ list[total-1] = '\n';
+ }
+ len = strlen(bootpath) + 1;
+ list = g_realloc(list, total + len);
+ memcpy(&list[total], bootpath, len);
+ total += len;
+ g_free(bootpath);
+ }
+
+ *size = total;
+
+ if (current_machine->boot_config.has_strict &&
+ current_machine->boot_config.strict && *size > 0) {
+ list[total-1] = '\n';
+ list = g_realloc(list, total + 5);
+ memcpy(&list[total], "HALT", 5);
+ *size = total + 5;
+ }
+ return list;
+}
+
+typedef struct {
+ int32_t *bootindex;
+ const char *suffix;
+ DeviceState *dev;
+} BootIndexProperty;
+
+static void device_get_bootindex(Object *obj, Visitor *v, const char *name,
+ void *opaque, Error **errp)
+{
+ BootIndexProperty *prop = opaque;
+ visit_type_int32(v, name, prop->bootindex, errp);
+}
+
+static void device_set_bootindex(Object *obj, Visitor *v, const char *name,
+ void *opaque, Error **errp)
+{
+ BootIndexProperty *prop = opaque;
+ int32_t boot_index;
+ Error *local_err = NULL;
+
+ if (!visit_type_int32(v, name, &boot_index, errp)) {
+ return;
+ }
+ /* check whether bootindex is present in fw_boot_order list */
+ check_boot_index(boot_index, &local_err);
+ if (local_err) {
+ error_propagate(errp, local_err);
+ return;
+ }
+ /* change bootindex to a new one */
+ *prop->bootindex = boot_index;
+
+ add_boot_device_path(*prop->bootindex, prop->dev, prop->suffix);
+}
+
+static void property_release_bootindex(Object *obj, const char *name,
+ void *opaque)
+
+{
+ BootIndexProperty *prop = opaque;
+
+ del_boot_device_path(prop->dev, prop->suffix);
+ g_free(prop);
+}
+
+void device_add_bootindex_property(Object *obj, int32_t *bootindex,
+ const char *name, const char *suffix,
+ DeviceState *dev)
+{
+ BootIndexProperty *prop = g_malloc0(sizeof(*prop));
+
+ prop->bootindex = bootindex;
+ prop->suffix = suffix;
+ prop->dev = dev;
+
+ object_property_add(obj, name, "int32",
+ device_get_bootindex,
+ device_set_bootindex,
+ property_release_bootindex,
+ prop);
+
+ /* initialize devices' bootindex property to -1 */
+ object_property_set_int(obj, name, -1, NULL);
+}
+
+typedef struct FWLCHSEntry FWLCHSEntry;
+
+struct FWLCHSEntry {
+ QTAILQ_ENTRY(FWLCHSEntry) link;
+ DeviceState *dev;
+ char *suffix;
+ uint32_t lcyls;
+ uint32_t lheads;
+ uint32_t lsecs;
+};
+
+static QTAILQ_HEAD(, FWLCHSEntry) fw_lchs =
+ QTAILQ_HEAD_INITIALIZER(fw_lchs);
+
+void add_boot_device_lchs(DeviceState *dev, const char *suffix,
+ uint32_t lcyls, uint32_t lheads, uint32_t lsecs)
+{
+ FWLCHSEntry *node;
+
+ if (!lcyls && !lheads && !lsecs) {
+ return;
+ }
+
+ assert(dev != NULL || suffix != NULL);
+
+ node = g_new0(FWLCHSEntry, 1);
+ node->suffix = g_strdup(suffix);
+ node->dev = dev;
+ node->lcyls = lcyls;
+ node->lheads = lheads;
+ node->lsecs = lsecs;
+
+ QTAILQ_INSERT_TAIL(&fw_lchs, node, link);
+}
+
+void del_boot_device_lchs(DeviceState *dev, const char *suffix)
+{
+ FWLCHSEntry *i;
+
+ if (dev == NULL) {
+ return;
+ }
+
+ QTAILQ_FOREACH(i, &fw_lchs, link) {
+ if ((!suffix || !g_strcmp0(i->suffix, suffix)) &&
+ i->dev == dev) {
+ QTAILQ_REMOVE(&fw_lchs, i, link);
+ g_free(i->suffix);
+ g_free(i);
+
+ break;
+ }
+ }
+}
+
+char *get_boot_devices_lchs_list(size_t *size)
+{
+ FWLCHSEntry *i;
+ size_t total = 0;
+ char *list = NULL;
+
+ QTAILQ_FOREACH(i, &fw_lchs, link) {
+ char *bootpath;
+ char *chs_string;
+ size_t len;
+
+ bootpath = get_boot_device_path(i->dev, false, i->suffix);
+ chs_string = g_strdup_printf("%s %" PRIu32 " %" PRIu32 " %" PRIu32,
+ bootpath, i->lcyls, i->lheads, i->lsecs);
+
+ if (total) {
+ list[total - 1] = '\n';
+ }
+ len = strlen(chs_string) + 1;
+ list = g_realloc(list, total + len);
+ memcpy(&list[total], chs_string, len);
+ total += len;
+ g_free(chs_string);
+ g_free(bootpath);
+ }
+
+ *size = total;
+
+ return list;
+}
diff --git a/system/cpu-throttle.c b/system/cpu-throttle.c
new file mode 100644
index 0000000000..d9bb30a223
--- /dev/null
+++ b/system/cpu-throttle.c
@@ -0,0 +1,128 @@
+/*
+ * QEMU System Emulator
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/thread.h"
+#include "hw/core/cpu.h"
+#include "qemu/main-loop.h"
+#include "sysemu/cpus.h"
+#include "sysemu/cpu-throttle.h"
+
+/* vcpu throttling controls */
+static QEMUTimer *throttle_timer;
+static unsigned int throttle_percentage;
+
+#define CPU_THROTTLE_PCT_MIN 1
+#define CPU_THROTTLE_PCT_MAX 99
+#define CPU_THROTTLE_TIMESLICE_NS 10000000
+
+static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
+{
+ double pct;
+ double throttle_ratio;
+ int64_t sleeptime_ns, endtime_ns;
+
+ if (!cpu_throttle_get_percentage()) {
+ return;
+ }
+
+ pct = (double)cpu_throttle_get_percentage() / 100;
+ throttle_ratio = pct / (1 - pct);
+ /* Add 1ns to fix double's rounding error (like 0.9999999...) */
+ sleeptime_ns = (int64_t)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS + 1);
+ endtime_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + sleeptime_ns;
+ while (sleeptime_ns > 0 && !cpu->stop) {
+ if (sleeptime_ns > SCALE_MS) {
+ qemu_cond_timedwait_iothread(cpu->halt_cond,
+ sleeptime_ns / SCALE_MS);
+ } else {
+ qemu_mutex_unlock_iothread();
+ g_usleep(sleeptime_ns / SCALE_US);
+ qemu_mutex_lock_iothread();
+ }
+ sleeptime_ns = endtime_ns - qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
+ }
+ qatomic_set(&cpu->throttle_thread_scheduled, 0);
+}
+
+static void cpu_throttle_timer_tick(void *opaque)
+{
+ CPUState *cpu;
+ double pct;
+
+ /* Stop the timer if needed */
+ if (!cpu_throttle_get_percentage()) {
+ return;
+ }
+ CPU_FOREACH(cpu) {
+ if (!qatomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
+ async_run_on_cpu(cpu, cpu_throttle_thread,
+ RUN_ON_CPU_NULL);
+ }
+ }
+
+ pct = (double)cpu_throttle_get_percentage() / 100;
+ timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
+ CPU_THROTTLE_TIMESLICE_NS / (1 - pct));
+}
+
+void cpu_throttle_set(int new_throttle_pct)
+{
+ /*
+ * boolean to store whether throttle is already active or not,
+ * before modifying throttle_percentage
+ */
+ bool throttle_active = cpu_throttle_active();
+
+ /* Ensure throttle percentage is within valid range */
+ new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
+ new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
+
+ qatomic_set(&throttle_percentage, new_throttle_pct);
+
+ if (!throttle_active) {
+ cpu_throttle_timer_tick(NULL);
+ }
+}
+
+void cpu_throttle_stop(void)
+{
+ qatomic_set(&throttle_percentage, 0);
+}
+
+bool cpu_throttle_active(void)
+{
+ return (cpu_throttle_get_percentage() != 0);
+}
+
+int cpu_throttle_get_percentage(void)
+{
+ return qatomic_read(&throttle_percentage);
+}
+
+void cpu_throttle_init(void)
+{
+ throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
+ cpu_throttle_timer_tick, NULL);
+}
diff --git a/system/cpu-timers.c b/system/cpu-timers.c
new file mode 100644
index 0000000000..7452d97b67
--- /dev/null
+++ b/system/cpu-timers.c
@@ -0,0 +1,277 @@
+/*
+ * QEMU System Emulator
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/cutils.h"
+#include "migration/vmstate.h"
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "sysemu/cpus.h"
+#include "qemu/main-loop.h"
+#include "qemu/option.h"
+#include "qemu/seqlock.h"
+#include "sysemu/replay.h"
+#include "sysemu/runstate.h"
+#include "hw/core/cpu.h"
+#include "sysemu/cpu-timers.h"
+#include "sysemu/cpu-throttle.h"
+#include "sysemu/cpu-timers-internal.h"
+
+/* clock and ticks */
+
+static int64_t cpu_get_ticks_locked(void)
+{
+ int64_t ticks = timers_state.cpu_ticks_offset;
+ if (timers_state.cpu_ticks_enabled) {
+ ticks += cpu_get_host_ticks();
+ }
+
+ if (timers_state.cpu_ticks_prev > ticks) {
+ /* Non increasing ticks may happen if the host uses software suspend. */
+ timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
+ ticks = timers_state.cpu_ticks_prev;
+ }
+
+ timers_state.cpu_ticks_prev = ticks;
+ return ticks;
+}
+
+/*
+ * return the time elapsed in VM between vm_start and vm_stop.
+ * cpu_get_ticks() uses units of the host CPU cycle counter.
+ */
+int64_t cpu_get_ticks(void)
+{
+ int64_t ticks;
+
+ qemu_spin_lock(&timers_state.vm_clock_lock);
+ ticks = cpu_get_ticks_locked();
+ qemu_spin_unlock(&timers_state.vm_clock_lock);
+ return ticks;
+}
+
+int64_t cpu_get_clock_locked(void)
+{
+ int64_t time;
+
+ time = timers_state.cpu_clock_offset;
+ if (timers_state.cpu_ticks_enabled) {
+ time += get_clock();
+ }
+
+ return time;
+}
+
+/*
+ * Return the monotonic time elapsed in VM, i.e.,
+ * the time between vm_start and vm_stop
+ */
+int64_t cpu_get_clock(void)
+{
+ int64_t ti;
+ unsigned start;
+
+ do {
+ start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
+ ti = cpu_get_clock_locked();
+ } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
+
+ return ti;
+}
+
+/*
+ * enable cpu_get_ticks()
+ * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
+ */
+void cpu_enable_ticks(void)
+{
+ seqlock_write_lock(&timers_state.vm_clock_seqlock,
+ &timers_state.vm_clock_lock);
+ if (!timers_state.cpu_ticks_enabled) {
+ timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
+ timers_state.cpu_clock_offset -= get_clock();
+ timers_state.cpu_ticks_enabled = 1;
+ }
+ seqlock_write_unlock(&timers_state.vm_clock_seqlock,
+ &timers_state.vm_clock_lock);
+}
+
+/*
+ * disable cpu_get_ticks() : the clock is stopped. You must not call
+ * cpu_get_ticks() after that.
+ * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
+ */
+void cpu_disable_ticks(void)
+{
+ seqlock_write_lock(&timers_state.vm_clock_seqlock,
+ &timers_state.vm_clock_lock);
+ if (timers_state.cpu_ticks_enabled) {
+ timers_state.cpu_ticks_offset += cpu_get_host_ticks();
+ timers_state.cpu_clock_offset = cpu_get_clock_locked();
+ timers_state.cpu_ticks_enabled = 0;
+ }
+ seqlock_write_unlock(&timers_state.vm_clock_seqlock,
+ &timers_state.vm_clock_lock);
+}
+
+static bool icount_state_needed(void *opaque)
+{
+ return icount_enabled();
+}
+
+static bool warp_timer_state_needed(void *opaque)
+{
+ TimersState *s = opaque;
+ return s->icount_warp_timer != NULL;
+}
+
+static bool adjust_timers_state_needed(void *opaque)
+{
+ TimersState *s = opaque;
+ return s->icount_rt_timer != NULL;
+}
+
+static bool icount_shift_state_needed(void *opaque)
+{
+ return icount_enabled() == 2;
+}
+
+/*
+ * Subsection for warp timer migration is optional, because may not be created
+ */
+static const VMStateDescription icount_vmstate_warp_timer = {
+ .name = "timer/icount/warp_timer",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .needed = warp_timer_state_needed,
+ .fields = (VMStateField[]) {
+ VMSTATE_INT64(vm_clock_warp_start, TimersState),
+ VMSTATE_TIMER_PTR(icount_warp_timer, TimersState),
+ VMSTATE_END_OF_LIST()
+ }
+};
+
+static const VMStateDescription icount_vmstate_adjust_timers = {
+ .name = "timer/icount/timers",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .needed = adjust_timers_state_needed,
+ .fields = (VMStateField[]) {
+ VMSTATE_TIMER_PTR(icount_rt_timer, TimersState),
+ VMSTATE_TIMER_PTR(icount_vm_timer, TimersState),
+ VMSTATE_END_OF_LIST()
+ }
+};
+
+static const VMStateDescription icount_vmstate_shift = {
+ .name = "timer/icount/shift",
+ .version_id = 2,
+ .minimum_version_id = 2,
+ .needed = icount_shift_state_needed,
+ .fields = (VMStateField[]) {
+ VMSTATE_INT16(icount_time_shift, TimersState),
+ VMSTATE_INT64(last_delta, TimersState),
+ VMSTATE_END_OF_LIST()
+ }
+};
+
+/*
+ * This is a subsection for icount migration.
+ */
+static const VMStateDescription icount_vmstate_timers = {
+ .name = "timer/icount",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .needed = icount_state_needed,
+ .fields = (VMStateField[]) {
+ VMSTATE_INT64(qemu_icount_bias, TimersState),
+ VMSTATE_INT64(qemu_icount, TimersState),
+ VMSTATE_END_OF_LIST()
+ },
+ .subsections = (const VMStateDescription * []) {
+ &icount_vmstate_warp_timer,
+ &icount_vmstate_adjust_timers,
+ &icount_vmstate_shift,
+ NULL
+ }
+};
+
+static const VMStateDescription vmstate_timers = {
+ .name = "timer",
+ .version_id = 2,
+ .minimum_version_id = 1,
+ .fields = (VMStateField[]) {
+ VMSTATE_INT64(cpu_ticks_offset, TimersState),
+ VMSTATE_UNUSED(8),
+ VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
+ VMSTATE_END_OF_LIST()
+ },
+ .subsections = (const VMStateDescription * []) {
+ &icount_vmstate_timers,
+ NULL
+ }
+};
+
+static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
+{
+}
+
+void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
+{
+ if (!icount_enabled() || type != QEMU_CLOCK_VIRTUAL) {
+ qemu_notify_event();
+ return;
+ }
+
+ if (qemu_in_vcpu_thread()) {
+ /*
+ * A CPU is currently running; kick it back out to the
+ * tcg_cpu_exec() loop so it will recalculate its
+ * icount deadline immediately.
+ */
+ qemu_cpu_kick(current_cpu);
+ } else if (first_cpu) {
+ /*
+ * qemu_cpu_kick is not enough to kick a halted CPU out of
+ * qemu_tcg_wait_io_event. async_run_on_cpu, instead,
+ * causes cpu_thread_is_idle to return false. This way,
+ * handle_icount_deadline can run.
+ * If we have no CPUs at all for some reason, we don't
+ * need to do anything.
+ */
+ async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
+ }
+}
+
+TimersState timers_state;
+
+/* initialize timers state and the cpu throttle for convenience */
+void cpu_timers_init(void)
+{
+ seqlock_init(&timers_state.vm_clock_seqlock);
+ qemu_spin_init(&timers_state.vm_clock_lock);
+ vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
+
+ cpu_throttle_init();
+}
diff --git a/system/cpus.c b/system/cpus.c
new file mode 100644
index 0000000000..0848e0dbdb
--- /dev/null
+++ b/system/cpus.c
@@ -0,0 +1,822 @@
+/*
+ * QEMU System Emulator
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "monitor/monitor.h"
+#include "qemu/coroutine-tls.h"
+#include "qapi/error.h"
+#include "qapi/qapi-commands-machine.h"
+#include "qapi/qapi-commands-misc.h"
+#include "qapi/qapi-events-run-state.h"
+#include "qapi/qmp/qerror.h"
+#include "exec/gdbstub.h"
+#include "sysemu/hw_accel.h"
+#include "exec/cpu-common.h"
+#include "qemu/thread.h"
+#include "qemu/main-loop.h"
+#include "qemu/plugin.h"
+#include "sysemu/cpus.h"
+#include "qemu/guest-random.h"
+#include "hw/nmi.h"
+#include "sysemu/replay.h"
+#include "sysemu/runstate.h"
+#include "sysemu/cpu-timers.h"
+#include "sysemu/whpx.h"
+#include "hw/boards.h"
+#include "hw/hw.h"
+#include "trace.h"
+
+#ifdef CONFIG_LINUX
+
+#include <sys/prctl.h>
+
+#ifndef PR_MCE_KILL
+#define PR_MCE_KILL 33
+#endif
+
+#ifndef PR_MCE_KILL_SET
+#define PR_MCE_KILL_SET 1
+#endif
+
+#ifndef PR_MCE_KILL_EARLY
+#define PR_MCE_KILL_EARLY 1
+#endif
+
+#endif /* CONFIG_LINUX */
+
+static QemuMutex qemu_global_mutex;
+
+/*
+ * The chosen accelerator is supposed to register this.
+ */
+static const AccelOpsClass *cpus_accel;
+
+bool cpu_is_stopped(CPUState *cpu)
+{
+ return cpu->stopped || !runstate_is_running();
+}
+
+bool cpu_work_list_empty(CPUState *cpu)
+{
+ return QSIMPLEQ_EMPTY_ATOMIC(&cpu->work_list);
+}
+
+bool cpu_thread_is_idle(CPUState *cpu)
+{
+ if (cpu->stop || !cpu_work_list_empty(cpu)) {
+ return false;
+ }
+ if (cpu_is_stopped(cpu)) {
+ return true;
+ }
+ if (!cpu->halted || cpu_has_work(cpu)) {
+ return false;
+ }
+ if (cpus_accel->cpu_thread_is_idle) {
+ return cpus_accel->cpu_thread_is_idle(cpu);
+ }
+ return true;
+}
+
+bool all_cpu_threads_idle(void)
+{
+ CPUState *cpu;
+
+ CPU_FOREACH(cpu) {
+ if (!cpu_thread_is_idle(cpu)) {
+ return false;
+ }
+ }
+ return true;
+}
+
+/***********************************************************/
+void hw_error(const char *fmt, ...)
+{
+ va_list ap;
+ CPUState *cpu;
+
+ va_start(ap, fmt);
+ fprintf(stderr, "qemu: hardware error: ");
+ vfprintf(stderr, fmt, ap);
+ fprintf(stderr, "\n");
+ CPU_FOREACH(cpu) {
+ fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
+ cpu_dump_state(cpu, stderr, CPU_DUMP_FPU);
+ }
+ va_end(ap);
+ abort();
+}
+
+void cpu_synchronize_all_states(void)
+{
+ CPUState *cpu;
+
+ CPU_FOREACH(cpu) {
+ cpu_synchronize_state(cpu);
+ }
+}
+
+void cpu_synchronize_all_post_reset(void)
+{
+ CPUState *cpu;
+
+ CPU_FOREACH(cpu) {
+ cpu_synchronize_post_reset(cpu);
+ }
+}
+
+void cpu_synchronize_all_post_init(void)
+{
+ CPUState *cpu;
+
+ CPU_FOREACH(cpu) {
+ cpu_synchronize_post_init(cpu);
+ }
+}
+
+void cpu_synchronize_all_pre_loadvm(void)
+{
+ CPUState *cpu;
+
+ CPU_FOREACH(cpu) {
+ cpu_synchronize_pre_loadvm(cpu);
+ }
+}
+
+void cpu_synchronize_state(CPUState *cpu)
+{
+ if (cpus_accel->synchronize_state) {
+ cpus_accel->synchronize_state(cpu);
+ }
+}
+
+void cpu_synchronize_post_reset(CPUState *cpu)
+{
+ if (cpus_accel->synchronize_post_reset) {
+ cpus_accel->synchronize_post_reset(cpu);
+ }
+}
+
+void cpu_synchronize_post_init(CPUState *cpu)
+{
+ if (cpus_accel->synchronize_post_init) {
+ cpus_accel->synchronize_post_init(cpu);
+ }
+}
+
+void cpu_synchronize_pre_loadvm(CPUState *cpu)
+{
+ if (cpus_accel->synchronize_pre_loadvm) {
+ cpus_accel->synchronize_pre_loadvm(cpu);
+ }
+}
+
+bool cpus_are_resettable(void)
+{
+ if (cpus_accel->cpus_are_resettable) {
+ return cpus_accel->cpus_are_resettable();
+ }
+ return true;
+}
+
+int64_t cpus_get_virtual_clock(void)
+{
+ /*
+ * XXX
+ *
+ * need to check that cpus_accel is not NULL, because qcow2 calls
+ * qemu_get_clock_ns(CLOCK_VIRTUAL) without any accel initialized and
+ * with ticks disabled in some io-tests:
+ * 030 040 041 060 099 120 127 140 156 161 172 181 191 192 195 203 229 249 256 267
+ *
+ * is this expected?
+ *
+ * XXX
+ */
+ if (cpus_accel && cpus_accel->get_virtual_clock) {
+ return cpus_accel->get_virtual_clock();
+ }
+ return cpu_get_clock();
+}
+
+/*
+ * return the time elapsed in VM between vm_start and vm_stop. Unless
+ * icount is active, cpus_get_elapsed_ticks() uses units of the host CPU cycle
+ * counter.
+ */
+int64_t cpus_get_elapsed_ticks(void)
+{
+ if (cpus_accel->get_elapsed_ticks) {
+ return cpus_accel->get_elapsed_ticks();
+ }
+ return cpu_get_ticks();
+}
+
+static void generic_handle_interrupt(CPUState *cpu, int mask)
+{
+ cpu->interrupt_request |= mask;
+
+ if (!qemu_cpu_is_self(cpu)) {
+ qemu_cpu_kick(cpu);
+ }
+}
+
+void cpu_interrupt(CPUState *cpu, int mask)
+{
+ if (cpus_accel->handle_interrupt) {
+ cpus_accel->handle_interrupt(cpu, mask);
+ } else {
+ generic_handle_interrupt(cpu, mask);
+ }
+}
+
+static int do_vm_stop(RunState state, bool send_stop)
+{
+ int ret = 0;
+
+ if (runstate_is_running()) {
+ runstate_set(state);
+ cpu_disable_ticks();
+ pause_all_vcpus();
+ vm_state_notify(0, state);
+ if (send_stop) {
+ qapi_event_send_stop();
+ }
+ }
+
+ bdrv_drain_all();
+ ret = bdrv_flush_all();
+ trace_vm_stop_flush_all(ret);
+
+ return ret;
+}
+
+/* Special vm_stop() variant for terminating the process. Historically clients
+ * did not expect a QMP STOP event and so we need to retain compatibility.
+ */
+int vm_shutdown(void)
+{
+ return do_vm_stop(RUN_STATE_SHUTDOWN, false);
+}
+
+bool cpu_can_run(CPUState *cpu)
+{
+ if (cpu->stop) {
+ return false;
+ }
+ if (cpu_is_stopped(cpu)) {
+ return false;
+ }
+ return true;
+}
+
+void cpu_handle_guest_debug(CPUState *cpu)
+{
+ if (replay_running_debug()) {
+ if (!cpu->singlestep_enabled) {
+ /*
+ * Report about the breakpoint and
+ * make a single step to skip it
+ */
+ replay_breakpoint();
+ cpu_single_step(cpu, SSTEP_ENABLE);
+ } else {
+ cpu_single_step(cpu, 0);
+ }
+ } else {
+ gdb_set_stop_cpu(cpu);
+ qemu_system_debug_request();
+ cpu->stopped = true;
+ }
+}
+
+#ifdef CONFIG_LINUX
+static void sigbus_reraise(void)
+{
+ sigset_t set;
+ struct sigaction action;
+
+ memset(&action, 0, sizeof(action));
+ action.sa_handler = SIG_DFL;
+ if (!sigaction(SIGBUS, &action, NULL)) {
+ raise(SIGBUS);
+ sigemptyset(&set);
+ sigaddset(&set, SIGBUS);
+ pthread_sigmask(SIG_UNBLOCK, &set, NULL);
+ }
+ perror("Failed to re-raise SIGBUS!");
+ abort();
+}
+
+static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
+{
+ if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
+ sigbus_reraise();
+ }
+
+ if (current_cpu) {
+ /* Called asynchronously in VCPU thread. */
+ if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
+ sigbus_reraise();
+ }
+ } else {
+ /* Called synchronously (via signalfd) in main thread. */
+ if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
+ sigbus_reraise();
+ }
+ }
+}
+
+static void qemu_init_sigbus(void)
+{
+ struct sigaction action;
+
+ /*
+ * ALERT: when modifying this, take care that SIGBUS forwarding in
+ * qemu_prealloc_mem() will continue working as expected.
+ */
+ memset(&action, 0, sizeof(action));
+ action.sa_flags = SA_SIGINFO;
+ action.sa_sigaction = sigbus_handler;
+ sigaction(SIGBUS, &action, NULL);
+
+ prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
+}
+#else /* !CONFIG_LINUX */
+static void qemu_init_sigbus(void)
+{
+}
+#endif /* !CONFIG_LINUX */
+
+static QemuThread io_thread;
+
+/* cpu creation */
+static QemuCond qemu_cpu_cond;
+/* system init */
+static QemuCond qemu_pause_cond;
+
+void qemu_init_cpu_loop(void)
+{
+ qemu_init_sigbus();
+ qemu_cond_init(&qemu_cpu_cond);
+ qemu_cond_init(&qemu_pause_cond);
+ qemu_mutex_init(&qemu_global_mutex);
+
+ qemu_thread_get_self(&io_thread);
+}
+
+void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
+{
+ do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
+}
+
+static void qemu_cpu_stop(CPUState *cpu, bool exit)
+{
+ g_assert(qemu_cpu_is_self(cpu));
+ cpu->stop = false;
+ cpu->stopped = true;
+ if (exit) {
+ cpu_exit(cpu);
+ }
+ qemu_cond_broadcast(&qemu_pause_cond);
+}
+
+void qemu_wait_io_event_common(CPUState *cpu)
+{
+ qatomic_set_mb(&cpu->thread_kicked, false);
+ if (cpu->stop) {
+ qemu_cpu_stop(cpu, false);
+ }
+ process_queued_cpu_work(cpu);
+}
+
+void qemu_wait_io_event(CPUState *cpu)
+{
+ bool slept = false;
+
+ while (cpu_thread_is_idle(cpu)) {
+ if (!slept) {
+ slept = true;
+ qemu_plugin_vcpu_idle_cb(cpu);
+ }
+ qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
+ }
+ if (slept) {
+ qemu_plugin_vcpu_resume_cb(cpu);
+ }
+
+ qemu_wait_io_event_common(cpu);
+}
+
+void cpus_kick_thread(CPUState *cpu)
+{
+ if (cpu->thread_kicked) {
+ return;
+ }
+ cpu->thread_kicked = true;
+
+#ifndef _WIN32
+ int err = pthread_kill(cpu->thread->thread, SIG_IPI);
+ if (err && err != ESRCH) {
+ fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
+ exit(1);
+ }
+#else
+ qemu_sem_post(&cpu->sem);
+#endif
+}
+
+void qemu_cpu_kick(CPUState *cpu)
+{
+ qemu_cond_broadcast(cpu->halt_cond);
+ if (cpus_accel->kick_vcpu_thread) {
+ cpus_accel->kick_vcpu_thread(cpu);
+ } else { /* default */
+ cpus_kick_thread(cpu);
+ }
+}
+
+void qemu_cpu_kick_self(void)
+{
+ assert(current_cpu);
+ cpus_kick_thread(current_cpu);
+}
+
+bool qemu_cpu_is_self(CPUState *cpu)
+{
+ return qemu_thread_is_self(cpu->thread);
+}
+
+bool qemu_in_vcpu_thread(void)
+{
+ return current_cpu && qemu_cpu_is_self(current_cpu);
+}
+
+QEMU_DEFINE_STATIC_CO_TLS(bool, iothread_locked)
+
+bool qemu_mutex_iothread_locked(void)
+{
+ return get_iothread_locked();
+}
+
+bool qemu_in_main_thread(void)
+{
+ return qemu_mutex_iothread_locked();
+}
+
+/*
+ * The BQL is taken from so many places that it is worth profiling the
+ * callers directly, instead of funneling them all through a single function.
+ */
+void qemu_mutex_lock_iothread_impl(const char *file, int line)
+{
+ QemuMutexLockFunc bql_lock = qatomic_read(&qemu_bql_mutex_lock_func);
+
+ g_assert(!qemu_mutex_iothread_locked());
+ bql_lock(&qemu_global_mutex, file, line);
+ set_iothread_locked(true);
+}
+
+void qemu_mutex_unlock_iothread(void)
+{
+ g_assert(qemu_mutex_iothread_locked());
+ set_iothread_locked(false);
+ qemu_mutex_unlock(&qemu_global_mutex);
+}
+
+void qemu_cond_wait_iothread(QemuCond *cond)
+{
+ qemu_cond_wait(cond, &qemu_global_mutex);
+}
+
+void qemu_cond_timedwait_iothread(QemuCond *cond, int ms)
+{
+ qemu_cond_timedwait(cond, &qemu_global_mutex, ms);
+}
+
+/* signal CPU creation */
+void cpu_thread_signal_created(CPUState *cpu)
+{
+ cpu->created = true;
+ qemu_cond_signal(&qemu_cpu_cond);
+}
+
+/* signal CPU destruction */
+void cpu_thread_signal_destroyed(CPUState *cpu)
+{
+ cpu->created = false;
+ qemu_cond_signal(&qemu_cpu_cond);
+}
+
+
+static bool all_vcpus_paused(void)
+{
+ CPUState *cpu;
+
+ CPU_FOREACH(cpu) {
+ if (!cpu->stopped) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+void pause_all_vcpus(void)
+{
+ CPUState *cpu;
+
+ qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
+ CPU_FOREACH(cpu) {
+ if (qemu_cpu_is_self(cpu)) {
+ qemu_cpu_stop(cpu, true);
+ } else {
+ cpu->stop = true;
+ qemu_cpu_kick(cpu);
+ }
+ }
+
+ /* We need to drop the replay_lock so any vCPU threads woken up
+ * can finish their replay tasks
+ */
+ replay_mutex_unlock();
+
+ while (!all_vcpus_paused()) {
+ qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
+ CPU_FOREACH(cpu) {
+ qemu_cpu_kick(cpu);
+ }
+ }
+
+ qemu_mutex_unlock_iothread();
+ replay_mutex_lock();
+ qemu_mutex_lock_iothread();
+}
+
+void cpu_resume(CPUState *cpu)
+{
+ cpu->stop = false;
+ cpu->stopped = false;
+ qemu_cpu_kick(cpu);
+}
+
+void resume_all_vcpus(void)
+{
+ CPUState *cpu;
+
+ if (!runstate_is_running()) {
+ return;
+ }
+
+ qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
+ CPU_FOREACH(cpu) {
+ cpu_resume(cpu);
+ }
+}
+
+void cpu_remove_sync(CPUState *cpu)
+{
+ cpu->stop = true;
+ cpu->unplug = true;
+ qemu_cpu_kick(cpu);
+ qemu_mutex_unlock_iothread();
+ qemu_thread_join(cpu->thread);
+ qemu_mutex_lock_iothread();
+}
+
+void cpus_register_accel(const AccelOpsClass *ops)
+{
+ assert(ops != NULL);
+ assert(ops->create_vcpu_thread != NULL); /* mandatory */
+ cpus_accel = ops;
+}
+
+const AccelOpsClass *cpus_get_accel(void)
+{
+ /* broken if we call this early */
+ assert(cpus_accel);
+ return cpus_accel;
+}
+
+void qemu_init_vcpu(CPUState *cpu)
+{
+ MachineState *ms = MACHINE(qdev_get_machine());
+
+ cpu->nr_cores = ms->smp.cores;
+ cpu->nr_threads = ms->smp.threads;
+ cpu->stopped = true;
+ cpu->random_seed = qemu_guest_random_seed_thread_part1();
+
+ if (!cpu->as) {
+ /* If the target cpu hasn't set up any address spaces itself,
+ * give it the default one.
+ */
+ cpu->num_ases = 1;
+ cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
+ }
+
+ /* accelerators all implement the AccelOpsClass */
+ g_assert(cpus_accel != NULL && cpus_accel->create_vcpu_thread != NULL);
+ cpus_accel->create_vcpu_thread(cpu);
+
+ while (!cpu->created) {
+ qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
+ }
+}
+
+void cpu_stop_current(void)
+{
+ if (current_cpu) {
+ current_cpu->stop = true;
+ cpu_exit(current_cpu);
+ }
+}
+
+int vm_stop(RunState state)
+{
+ if (qemu_in_vcpu_thread()) {
+ qemu_system_vmstop_request_prepare();
+ qemu_system_vmstop_request(state);
+ /*
+ * FIXME: should not return to device code in case
+ * vm_stop() has been requested.
+ */
+ cpu_stop_current();
+ return 0;
+ }
+
+ return do_vm_stop(state, true);
+}
+
+/**
+ * Prepare for (re)starting the VM.
+ * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
+ * running or in case of an error condition), 0 otherwise.
+ */
+int vm_prepare_start(bool step_pending)
+{
+ RunState requested;
+
+ qemu_vmstop_requested(&requested);
+ if (runstate_is_running() && requested == RUN_STATE__MAX) {
+ return -1;
+ }
+
+ /* Ensure that a STOP/RESUME pair of events is emitted if a
+ * vmstop request was pending. The BLOCK_IO_ERROR event, for
+ * example, according to documentation is always followed by
+ * the STOP event.
+ */
+ if (runstate_is_running()) {
+ qapi_event_send_stop();
+ qapi_event_send_resume();
+ return -1;
+ }
+
+ /*
+ * WHPX accelerator needs to know whether we are going to step
+ * any CPUs, before starting the first one.
+ */
+ if (cpus_accel->synchronize_pre_resume) {
+ cpus_accel->synchronize_pre_resume(step_pending);
+ }
+
+ /* We are sending this now, but the CPUs will be resumed shortly later */
+ qapi_event_send_resume();
+
+ cpu_enable_ticks();
+ runstate_set(RUN_STATE_RUNNING);
+ vm_state_notify(1, RUN_STATE_RUNNING);
+ return 0;
+}
+
+void vm_start(void)
+{
+ if (!vm_prepare_start(false)) {
+ resume_all_vcpus();
+ }
+}
+
+/* does a state transition even if the VM is already stopped,
+ current state is forgotten forever */
+int vm_stop_force_state(RunState state)
+{
+ if (runstate_is_running()) {
+ return vm_stop(state);
+ } else {
+ int ret;
+ runstate_set(state);
+
+ bdrv_drain_all();
+ /* Make sure to return an error if the flush in a previous vm_stop()
+ * failed. */
+ ret = bdrv_flush_all();
+ trace_vm_stop_flush_all(ret);
+ return ret;
+ }
+}
+
+void qmp_memsave(int64_t addr, int64_t size, const char *filename,
+ bool has_cpu, int64_t cpu_index, Error **errp)
+{
+ FILE *f;
+ uint32_t l;
+ CPUState *cpu;
+ uint8_t buf[1024];
+ int64_t orig_addr = addr, orig_size = size;
+
+ if (!has_cpu) {
+ cpu_index = 0;
+ }
+
+ cpu = qemu_get_cpu(cpu_index);
+ if (cpu == NULL) {
+ error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
+ "a CPU number");
+ return;
+ }
+
+ f = fopen(filename, "wb");
+ if (!f) {
+ error_setg_file_open(errp, errno, filename);
+ return;
+ }
+
+ while (size != 0) {
+ l = sizeof(buf);
+ if (l > size)
+ l = size;
+ if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
+ error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
+ " specified", orig_addr, orig_size);
+ goto exit;
+ }
+ if (fwrite(buf, 1, l, f) != l) {
+ error_setg(errp, QERR_IO_ERROR);
+ goto exit;
+ }
+ addr += l;
+ size -= l;
+ }
+
+exit:
+ fclose(f);
+}
+
+void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
+ Error **errp)
+{
+ FILE *f;
+ uint32_t l;
+ uint8_t buf[1024];
+
+ f = fopen(filename, "wb");
+ if (!f) {
+ error_setg_file_open(errp, errno, filename);
+ return;
+ }
+
+ while (size != 0) {
+ l = sizeof(buf);
+ if (l > size)
+ l = size;
+ cpu_physical_memory_read(addr, buf, l);
+ if (fwrite(buf, 1, l, f) != l) {
+ error_setg(errp, QERR_IO_ERROR);
+ goto exit;
+ }
+ addr += l;
+ size -= l;
+ }
+
+exit:
+ fclose(f);
+}
+
+void qmp_inject_nmi(Error **errp)
+{
+ nmi_monitor_handle(monitor_get_cpu_index(monitor_cur()), errp);
+}
+
diff --git a/system/datadir.c b/system/datadir.c
new file mode 100644
index 0000000000..c9237cb5d4
--- /dev/null
+++ b/system/datadir.c
@@ -0,0 +1,110 @@
+/*
+ * QEMU firmware and keymap file search
+ *
+ * Copyright (c) 2003-2020 QEMU contributors
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/datadir.h"
+#include "qemu/cutils.h"
+#include "trace.h"
+
+static const char *data_dir[16];
+static int data_dir_idx;
+
+char *qemu_find_file(int type, const char *name)
+{
+ int i;
+ const char *subdir;
+ char *buf;
+
+ /* Try the name as a straight path first */
+ if (access(name, R_OK) == 0) {
+ trace_load_file(name, name);
+ return g_strdup(name);
+ }
+
+ switch (type) {
+ case QEMU_FILE_TYPE_BIOS:
+ subdir = "";
+ break;
+ case QEMU_FILE_TYPE_KEYMAP:
+ subdir = "keymaps/";
+ break;
+ default:
+ abort();
+ }
+
+ for (i = 0; i < data_dir_idx; i++) {
+ buf = g_strdup_printf("%s/%s%s", data_dir[i], subdir, name);
+ if (access(buf, R_OK) == 0) {
+ trace_load_file(name, buf);
+ return buf;
+ }
+ g_free(buf);
+ }
+ return NULL;
+}
+
+void qemu_add_data_dir(char *path)
+{
+ int i;
+
+ if (path == NULL) {
+ return;
+ }
+ if (data_dir_idx == ARRAY_SIZE(data_dir)) {
+ return;
+ }
+ for (i = 0; i < data_dir_idx; i++) {
+ if (strcmp(data_dir[i], path) == 0) {
+ g_free(path); /* duplicate */
+ return;
+ }
+ }
+ data_dir[data_dir_idx++] = path;
+}
+
+void qemu_add_default_firmwarepath(void)
+{
+ static const char * const dirs[] = {
+ CONFIG_QEMU_FIRMWAREPATH
+ NULL
+ };
+
+ size_t i;
+
+ /* add configured firmware directories */
+ for (i = 0; dirs[i] != NULL; i++) {
+ qemu_add_data_dir(get_relocated_path(dirs[i]));
+ }
+
+ /* try to find datadir relative to the executable path */
+ qemu_add_data_dir(get_relocated_path(CONFIG_QEMU_DATADIR));
+}
+
+void qemu_list_data_dirs(void)
+{
+ int i;
+ for (i = 0; i < data_dir_idx; i++) {
+ printf("%s\n", data_dir[i]);
+ }
+}
diff --git a/system/device_tree.c b/system/device_tree.c
new file mode 100644
index 0000000000..eb5166ca36
--- /dev/null
+++ b/system/device_tree.c
@@ -0,0 +1,703 @@
+/*
+ * Functions to help device tree manipulation using libfdt.
+ * It also provides functions to read entries from device tree proc
+ * interface.
+ *
+ * Copyright 2008 IBM Corporation.
+ * Authors: Jerone Young <jyoung5@us.ibm.com>
+ * Hollis Blanchard <hollisb@us.ibm.com>
+ *
+ * This work is licensed under the GNU GPL license version 2 or later.
+ *
+ */
+
+#include "qemu/osdep.h"
+
+#ifdef CONFIG_LINUX
+#include <dirent.h>
+#endif
+
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "qemu/option.h"
+#include "qemu/bswap.h"
+#include "qemu/cutils.h"
+#include "qemu/guest-random.h"
+#include "sysemu/device_tree.h"
+#include "hw/loader.h"
+#include "hw/boards.h"
+#include "qemu/config-file.h"
+#include "qapi/qapi-commands-machine.h"
+#include "qapi/qmp/qdict.h"
+#include "monitor/hmp.h"
+
+#include <libfdt.h>
+
+#define FDT_MAX_SIZE 0x100000
+
+void *create_device_tree(int *sizep)
+{
+ void *fdt;
+ int ret;
+
+ *sizep = FDT_MAX_SIZE;
+ fdt = g_malloc0(FDT_MAX_SIZE);
+ ret = fdt_create(fdt, FDT_MAX_SIZE);
+ if (ret < 0) {
+ goto fail;
+ }
+ ret = fdt_finish_reservemap(fdt);
+ if (ret < 0) {
+ goto fail;
+ }
+ ret = fdt_begin_node(fdt, "");
+ if (ret < 0) {
+ goto fail;
+ }
+ ret = fdt_end_node(fdt);
+ if (ret < 0) {
+ goto fail;
+ }
+ ret = fdt_finish(fdt);
+ if (ret < 0) {
+ goto fail;
+ }
+ ret = fdt_open_into(fdt, fdt, *sizep);
+ if (ret) {
+ error_report("%s: Unable to copy device tree into memory: %s",
+ __func__, fdt_strerror(ret));
+ exit(1);
+ }
+
+ return fdt;
+fail:
+ error_report("%s Couldn't create dt: %s", __func__, fdt_strerror(ret));
+ exit(1);
+}
+
+void *load_device_tree(const char *filename_path, int *sizep)
+{
+ int dt_size;
+ int dt_file_load_size;
+ int ret;
+ void *fdt = NULL;
+
+ *sizep = 0;
+ dt_size = get_image_size(filename_path);
+ if (dt_size < 0) {
+ error_report("Unable to get size of device tree file '%s'",
+ filename_path);
+ goto fail;
+ }
+ if (dt_size > INT_MAX / 2 - 10000) {
+ error_report("Device tree file '%s' is too large", filename_path);
+ goto fail;
+ }
+
+ /* Expand to 2x size to give enough room for manipulation. */
+ dt_size += 10000;
+ dt_size *= 2;
+ /* First allocate space in qemu for device tree */
+ fdt = g_malloc0(dt_size);
+
+ dt_file_load_size = load_image_size(filename_path, fdt, dt_size);
+ if (dt_file_load_size < 0) {
+ error_report("Unable to open device tree file '%s'",
+ filename_path);
+ goto fail;
+ }
+
+ ret = fdt_open_into(fdt, fdt, dt_size);
+ if (ret) {
+ error_report("%s: Unable to copy device tree into memory: %s",
+ __func__, fdt_strerror(ret));
+ goto fail;
+ }
+
+ /* Check sanity of device tree */
+ if (fdt_check_header(fdt)) {
+ error_report("Device tree file loaded into memory is invalid: %s",
+ filename_path);
+ goto fail;
+ }
+ *sizep = dt_size;
+ return fdt;
+
+fail:
+ g_free(fdt);
+ return NULL;
+}
+
+#ifdef CONFIG_LINUX
+
+#define SYSFS_DT_BASEDIR "/proc/device-tree"
+
+/**
+ * read_fstree: this function is inspired from dtc read_fstree
+ * @fdt: preallocated fdt blob buffer, to be populated
+ * @dirname: directory to scan under SYSFS_DT_BASEDIR
+ * the search is recursive and the tree is searched down to the
+ * leaves (property files).
+ *
+ * the function asserts in case of error
+ */
+static void read_fstree(void *fdt, const char *dirname)
+{
+ DIR *d;
+ struct dirent *de;
+ struct stat st;
+ const char *root_dir = SYSFS_DT_BASEDIR;
+ const char *parent_node;
+
+ if (strstr(dirname, root_dir) != dirname) {
+ error_report("%s: %s must be searched within %s",
+ __func__, dirname, root_dir);
+ exit(1);
+ }
+ parent_node = &dirname[strlen(SYSFS_DT_BASEDIR)];
+
+ d = opendir(dirname);
+ if (!d) {
+ error_report("%s cannot open %s", __func__, dirname);
+ exit(1);
+ }
+
+ while ((de = readdir(d)) != NULL) {
+ char *tmpnam;
+
+ if (!g_strcmp0(de->d_name, ".")
+ || !g_strcmp0(de->d_name, "..")) {
+ continue;
+ }
+
+ tmpnam = g_strdup_printf("%s/%s", dirname, de->d_name);
+
+ if (lstat(tmpnam, &st) < 0) {
+ error_report("%s cannot lstat %s", __func__, tmpnam);
+ exit(1);
+ }
+
+ if (S_ISREG(st.st_mode)) {
+ gchar *val;
+ gsize len;
+
+ if (!g_file_get_contents(tmpnam, &val, &len, NULL)) {
+ error_report("%s not able to extract info from %s",
+ __func__, tmpnam);
+ exit(1);
+ }
+
+ if (strlen(parent_node) > 0) {
+ qemu_fdt_setprop(fdt, parent_node,
+ de->d_name, val, len);
+ } else {
+ qemu_fdt_setprop(fdt, "/", de->d_name, val, len);
+ }
+ g_free(val);
+ } else if (S_ISDIR(st.st_mode)) {
+ char *node_name;
+
+ node_name = g_strdup_printf("%s/%s",
+ parent_node, de->d_name);
+ qemu_fdt_add_subnode(fdt, node_name);
+ g_free(node_name);
+ read_fstree(fdt, tmpnam);
+ }
+
+ g_free(tmpnam);
+ }
+
+ closedir(d);
+}
+
+/* load_device_tree_from_sysfs: extract the dt blob from host sysfs */
+void *load_device_tree_from_sysfs(void)
+{
+ void *host_fdt;
+ int host_fdt_size;
+
+ host_fdt = create_device_tree(&host_fdt_size);
+ read_fstree(host_fdt, SYSFS_DT_BASEDIR);
+ if (fdt_check_header(host_fdt)) {
+ error_report("%s host device tree extracted into memory is invalid",
+ __func__);
+ exit(1);
+ }
+ return host_fdt;
+}
+
+#endif /* CONFIG_LINUX */
+
+static int findnode_nofail(void *fdt, const char *node_path)
+{
+ int offset;
+
+ offset = fdt_path_offset(fdt, node_path);
+ if (offset < 0) {
+ error_report("%s Couldn't find node %s: %s", __func__, node_path,
+ fdt_strerror(offset));
+ exit(1);
+ }
+
+ return offset;
+}
+
+char **qemu_fdt_node_unit_path(void *fdt, const char *name, Error **errp)
+{
+ char *prefix = g_strdup_printf("%s@", name);
+ unsigned int path_len = 16, n = 0;
+ GSList *path_list = NULL, *iter;
+ const char *iter_name;
+ int offset, len, ret;
+ char **path_array;
+
+ offset = fdt_next_node(fdt, -1, NULL);
+
+ while (offset >= 0) {
+ iter_name = fdt_get_name(fdt, offset, &len);
+ if (!iter_name) {
+ offset = len;
+ break;
+ }
+ if (!strcmp(iter_name, name) || g_str_has_prefix(iter_name, prefix)) {
+ char *path;
+
+ path = g_malloc(path_len);
+ while ((ret = fdt_get_path(fdt, offset, path, path_len))
+ == -FDT_ERR_NOSPACE) {
+ path_len += 16;
+ path = g_realloc(path, path_len);
+ }
+ path_list = g_slist_prepend(path_list, path);
+ n++;
+ }
+ offset = fdt_next_node(fdt, offset, NULL);
+ }
+ g_free(prefix);
+
+ if (offset < 0 && offset != -FDT_ERR_NOTFOUND) {
+ error_setg(errp, "%s: abort parsing dt for %s node units: %s",
+ __func__, name, fdt_strerror(offset));
+ for (iter = path_list; iter; iter = iter->next) {
+ g_free(iter->data);
+ }
+ g_slist_free(path_list);
+ return NULL;
+ }
+
+ path_array = g_new(char *, n + 1);
+ path_array[n--] = NULL;
+
+ for (iter = path_list; iter; iter = iter->next) {
+ path_array[n--] = iter->data;
+ }
+
+ g_slist_free(path_list);
+
+ return path_array;
+}
+
+char **qemu_fdt_node_path(void *fdt, const char *name, const char *compat,
+ Error **errp)
+{
+ int offset, len, ret;
+ const char *iter_name;
+ unsigned int path_len = 16, n = 0;
+ GSList *path_list = NULL, *iter;
+ char **path_array;
+
+ offset = fdt_node_offset_by_compatible(fdt, -1, compat);
+
+ while (offset >= 0) {
+ iter_name = fdt_get_name(fdt, offset, &len);
+ if (!iter_name) {
+ offset = len;
+ break;
+ }
+ if (!name || !strcmp(iter_name, name)) {
+ char *path;
+
+ path = g_malloc(path_len);
+ while ((ret = fdt_get_path(fdt, offset, path, path_len))
+ == -FDT_ERR_NOSPACE) {
+ path_len += 16;
+ path = g_realloc(path, path_len);
+ }
+ path_list = g_slist_prepend(path_list, path);
+ n++;
+ }
+ offset = fdt_node_offset_by_compatible(fdt, offset, compat);
+ }
+
+ if (offset < 0 && offset != -FDT_ERR_NOTFOUND) {
+ error_setg(errp, "%s: abort parsing dt for %s/%s: %s",
+ __func__, name, compat, fdt_strerror(offset));
+ for (iter = path_list; iter; iter = iter->next) {
+ g_free(iter->data);
+ }
+ g_slist_free(path_list);
+ return NULL;
+ }
+
+ path_array = g_new(char *, n + 1);
+ path_array[n--] = NULL;
+
+ for (iter = path_list; iter; iter = iter->next) {
+ path_array[n--] = iter->data;
+ }
+
+ g_slist_free(path_list);
+
+ return path_array;
+}
+
+int qemu_fdt_setprop(void *fdt, const char *node_path,
+ const char *property, const void *val, int size)
+{
+ int r;
+
+ r = fdt_setprop(fdt, findnode_nofail(fdt, node_path), property, val, size);
+ if (r < 0) {
+ error_report("%s: Couldn't set %s/%s: %s", __func__, node_path,
+ property, fdt_strerror(r));
+ exit(1);
+ }
+
+ return r;
+}
+
+int qemu_fdt_setprop_cell(void *fdt, const char *node_path,
+ const char *property, uint32_t val)
+{
+ int r;
+
+ r = fdt_setprop_cell(fdt, findnode_nofail(fdt, node_path), property, val);
+ if (r < 0) {
+ error_report("%s: Couldn't set %s/%s = %#08x: %s", __func__,
+ node_path, property, val, fdt_strerror(r));
+ exit(1);
+ }
+
+ return r;
+}
+
+int qemu_fdt_setprop_u64(void *fdt, const char *node_path,
+ const char *property, uint64_t val)
+{
+ val = cpu_to_be64(val);
+ return qemu_fdt_setprop(fdt, node_path, property, &val, sizeof(val));
+}
+
+int qemu_fdt_setprop_string(void *fdt, const char *node_path,
+ const char *property, const char *string)
+{
+ int r;
+
+ r = fdt_setprop_string(fdt, findnode_nofail(fdt, node_path), property, string);
+ if (r < 0) {
+ error_report("%s: Couldn't set %s/%s = %s: %s", __func__,
+ node_path, property, string, fdt_strerror(r));
+ exit(1);
+ }
+
+ return r;
+}
+
+/*
+ * libfdt doesn't allow us to add string arrays directly but they are
+ * test a series of null terminated strings with a length. We build
+ * the string up here so we can calculate the final length.
+ */
+int qemu_fdt_setprop_string_array(void *fdt, const char *node_path,
+ const char *prop, char **array, int len)
+{
+ int ret, i, total_len = 0;
+ char *str, *p;
+ for (i = 0; i < len; i++) {
+ total_len += strlen(array[i]) + 1;
+ }
+ p = str = g_malloc0(total_len);
+ for (i = 0; i < len; i++) {
+ int offset = strlen(array[i]) + 1;
+ pstrcpy(p, offset, array[i]);
+ p += offset;
+ }
+
+ ret = qemu_fdt_setprop(fdt, node_path, prop, str, total_len);
+ g_free(str);
+ return ret;
+}
+
+const void *qemu_fdt_getprop(void *fdt, const char *node_path,
+ const char *property, int *lenp, Error **errp)
+{
+ int len;
+ const void *r;
+
+ if (!lenp) {
+ lenp = &len;
+ }
+ r = fdt_getprop(fdt, findnode_nofail(fdt, node_path), property, lenp);
+ if (!r) {
+ error_setg(errp, "%s: Couldn't get %s/%s: %s", __func__,
+ node_path, property, fdt_strerror(*lenp));
+ }
+ return r;
+}
+
+uint32_t qemu_fdt_getprop_cell(void *fdt, const char *node_path,
+ const char *property, int *lenp, Error **errp)
+{
+ int len;
+ const uint32_t *p;
+
+ if (!lenp) {
+ lenp = &len;
+ }
+ p = qemu_fdt_getprop(fdt, node_path, property, lenp, errp);
+ if (!p) {
+ return 0;
+ } else if (*lenp != 4) {
+ error_setg(errp, "%s: %s/%s not 4 bytes long (not a cell?)",
+ __func__, node_path, property);
+ *lenp = -EINVAL;
+ return 0;
+ }
+ return be32_to_cpu(*p);
+}
+
+uint32_t qemu_fdt_get_phandle(void *fdt, const char *path)
+{
+ uint32_t r;
+
+ r = fdt_get_phandle(fdt, findnode_nofail(fdt, path));
+ if (r == 0) {
+ error_report("%s: Couldn't get phandle for %s: %s", __func__,
+ path, fdt_strerror(r));
+ exit(1);
+ }
+
+ return r;
+}
+
+int qemu_fdt_setprop_phandle(void *fdt, const char *node_path,
+ const char *property,
+ const char *target_node_path)
+{
+ uint32_t phandle = qemu_fdt_get_phandle(fdt, target_node_path);
+ return qemu_fdt_setprop_cell(fdt, node_path, property, phandle);
+}
+
+uint32_t qemu_fdt_alloc_phandle(void *fdt)
+{
+ static int phandle = 0x0;
+
+ /*
+ * We need to find out if the user gave us special instruction at
+ * which phandle id to start allocating phandles.
+ */
+ if (!phandle) {
+ phandle = machine_phandle_start(current_machine);
+ }
+
+ if (!phandle) {
+ /*
+ * None or invalid phandle given on the command line, so fall back to
+ * default starting point.
+ */
+ phandle = 0x8000;
+ }
+
+ return phandle++;
+}
+
+int qemu_fdt_nop_node(void *fdt, const char *node_path)
+{
+ int r;
+
+ r = fdt_nop_node(fdt, findnode_nofail(fdt, node_path));
+ if (r < 0) {
+ error_report("%s: Couldn't nop node %s: %s", __func__, node_path,
+ fdt_strerror(r));
+ exit(1);
+ }
+
+ return r;
+}
+
+int qemu_fdt_add_subnode(void *fdt, const char *name)
+{
+ char *dupname = g_strdup(name);
+ char *basename = strrchr(dupname, '/');
+ int retval;
+ int parent = 0;
+
+ if (!basename) {
+ g_free(dupname);
+ return -1;
+ }
+
+ basename[0] = '\0';
+ basename++;
+
+ if (dupname[0]) {
+ parent = findnode_nofail(fdt, dupname);
+ }
+
+ retval = fdt_add_subnode(fdt, parent, basename);
+ if (retval < 0) {
+ error_report("%s: Failed to create subnode %s: %s",
+ __func__, name, fdt_strerror(retval));
+ exit(1);
+ }
+
+ g_free(dupname);
+ return retval;
+}
+
+/*
+ * qemu_fdt_add_path: Like qemu_fdt_add_subnode(), but will add
+ * all missing subnodes from the given path.
+ */
+int qemu_fdt_add_path(void *fdt, const char *path)
+{
+ const char *name;
+ int namelen, retval;
+ int parent = 0;
+
+ if (path[0] != '/') {
+ return -1;
+ }
+
+ do {
+ name = path + 1;
+ path = strchr(name, '/');
+ namelen = path != NULL ? path - name : strlen(name);
+
+ retval = fdt_subnode_offset_namelen(fdt, parent, name, namelen);
+ if (retval < 0 && retval != -FDT_ERR_NOTFOUND) {
+ error_report("%s: Unexpected error in finding subnode %.*s: %s",
+ __func__, namelen, name, fdt_strerror(retval));
+ exit(1);
+ } else if (retval == -FDT_ERR_NOTFOUND) {
+ retval = fdt_add_subnode_namelen(fdt, parent, name, namelen);
+ if (retval < 0) {
+ error_report("%s: Failed to create subnode %.*s: %s",
+ __func__, namelen, name, fdt_strerror(retval));
+ exit(1);
+ }
+ }
+
+ parent = retval;
+ } while (path);
+
+ return retval;
+}
+
+void qemu_fdt_dumpdtb(void *fdt, int size)
+{
+ const char *dumpdtb = current_machine->dumpdtb;
+
+ if (dumpdtb) {
+ /* Dump the dtb to a file and quit */
+ if (g_file_set_contents(dumpdtb, fdt, size, NULL)) {
+ info_report("dtb dumped to %s. Exiting.", dumpdtb);
+ exit(0);
+ }
+ error_report("%s: Failed dumping dtb to %s", __func__, dumpdtb);
+ exit(1);
+ }
+}
+
+int qemu_fdt_setprop_sized_cells_from_array(void *fdt,
+ const char *node_path,
+ const char *property,
+ int numvalues,
+ uint64_t *values)
+{
+ uint32_t *propcells;
+ uint64_t value;
+ int cellnum, vnum, ncells;
+ uint32_t hival;
+ int ret;
+
+ propcells = g_new0(uint32_t, numvalues * 2);
+
+ cellnum = 0;
+ for (vnum = 0; vnum < numvalues; vnum++) {
+ ncells = values[vnum * 2];
+ if (ncells != 1 && ncells != 2) {
+ ret = -1;
+ goto out;
+ }
+ value = values[vnum * 2 + 1];
+ hival = cpu_to_be32(value >> 32);
+ if (ncells > 1) {
+ propcells[cellnum++] = hival;
+ } else if (hival != 0) {
+ ret = -1;
+ goto out;
+ }
+ propcells[cellnum++] = cpu_to_be32(value);
+ }
+
+ ret = qemu_fdt_setprop(fdt, node_path, property, propcells,
+ cellnum * sizeof(uint32_t));
+out:
+ g_free(propcells);
+ return ret;
+}
+
+void qmp_dumpdtb(const char *filename, Error **errp)
+{
+ g_autoptr(GError) err = NULL;
+ uint32_t size;
+
+ if (!current_machine->fdt) {
+ error_setg(errp, "This machine doesn't have a FDT");
+ return;
+ }
+
+ size = fdt_totalsize(current_machine->fdt);
+
+ g_assert(size > 0);
+
+ if (!g_file_set_contents(filename, current_machine->fdt, size, &err)) {
+ error_setg(errp, "Error saving FDT to file %s: %s",
+ filename, err->message);
+ }
+}
+
+void hmp_dumpdtb(Monitor *mon, const QDict *qdict)
+{
+ const char *filename = qdict_get_str(qdict, "filename");
+ Error *local_err = NULL;
+
+ qmp_dumpdtb(filename, &local_err);
+
+ if (hmp_handle_error(mon, local_err)) {
+ return;
+ }
+
+ info_report("dtb dumped to %s", filename);
+}
+
+void qemu_fdt_randomize_seeds(void *fdt)
+{
+ int noffset, poffset, len;
+ const char *name;
+ uint8_t *data;
+
+ for (noffset = fdt_next_node(fdt, 0, NULL);
+ noffset >= 0;
+ noffset = fdt_next_node(fdt, noffset, NULL)) {
+ for (poffset = fdt_first_property_offset(fdt, noffset);
+ poffset >= 0;
+ poffset = fdt_next_property_offset(fdt, poffset)) {
+ data = (uint8_t *)fdt_getprop_by_offset(fdt, poffset, &name, &len);
+ if (!data || strcmp(name, "rng-seed"))
+ continue;
+ qemu_guest_getrandom_nofail(data, len);
+ }
+ }
+}
diff --git a/system/dirtylimit.c b/system/dirtylimit.c
new file mode 100644
index 0000000000..fa959d7743
--- /dev/null
+++ b/system/dirtylimit.c
@@ -0,0 +1,678 @@
+/*
+ * Dirty page rate limit implementation code
+ *
+ * Copyright (c) 2022 CHINA TELECOM CO.,LTD.
+ *
+ * Authors:
+ * Hyman Huang(黄勇) <huangy81@chinatelecom.cn>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/main-loop.h"
+#include "qapi/qapi-commands-migration.h"
+#include "qapi/qmp/qdict.h"
+#include "qapi/error.h"
+#include "sysemu/dirtyrate.h"
+#include "sysemu/dirtylimit.h"
+#include "monitor/hmp.h"
+#include "monitor/monitor.h"
+#include "exec/memory.h"
+#include "exec/target_page.h"
+#include "hw/boards.h"
+#include "sysemu/kvm.h"
+#include "trace.h"
+#include "migration/misc.h"
+#include "migration/migration.h"
+#include "migration/options.h"
+
+/*
+ * Dirtylimit stop working if dirty page rate error
+ * value less than DIRTYLIMIT_TOLERANCE_RANGE
+ */
+#define DIRTYLIMIT_TOLERANCE_RANGE 25 /* MB/s */
+/*
+ * Plus or minus vcpu sleep time linearly if dirty
+ * page rate error value percentage over
+ * DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT.
+ * Otherwise, plus or minus a fixed vcpu sleep time.
+ */
+#define DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT 50
+/*
+ * Max vcpu sleep time percentage during a cycle
+ * composed of dirty ring full and sleep time.
+ */
+#define DIRTYLIMIT_THROTTLE_PCT_MAX 99
+
+struct {
+ VcpuStat stat;
+ bool running;
+ QemuThread thread;
+} *vcpu_dirty_rate_stat;
+
+typedef struct VcpuDirtyLimitState {
+ int cpu_index;
+ bool enabled;
+ /*
+ * Quota dirty page rate, unit is MB/s
+ * zero if not enabled.
+ */
+ uint64_t quota;
+} VcpuDirtyLimitState;
+
+struct {
+ VcpuDirtyLimitState *states;
+ /* Max cpus number configured by user */
+ int max_cpus;
+ /* Number of vcpu under dirtylimit */
+ int limited_nvcpu;
+} *dirtylimit_state;
+
+/* protect dirtylimit_state */
+static QemuMutex dirtylimit_mutex;
+
+/* dirtylimit thread quit if dirtylimit_quit is true */
+static bool dirtylimit_quit;
+
+static void vcpu_dirty_rate_stat_collect(void)
+{
+ MigrationState *s = migrate_get_current();
+ VcpuStat stat;
+ int i = 0;
+ int64_t period = DIRTYLIMIT_CALC_TIME_MS;
+
+ if (migrate_dirty_limit() &&
+ migration_is_active(s)) {
+ period = s->parameters.x_vcpu_dirty_limit_period;
+ }
+
+ /* calculate vcpu dirtyrate */
+ vcpu_calculate_dirtyrate(period,
+ &stat,
+ GLOBAL_DIRTY_LIMIT,
+ false);
+
+ for (i = 0; i < stat.nvcpu; i++) {
+ vcpu_dirty_rate_stat->stat.rates[i].id = i;
+ vcpu_dirty_rate_stat->stat.rates[i].dirty_rate =
+ stat.rates[i].dirty_rate;
+ }
+
+ g_free(stat.rates);
+}
+
+static void *vcpu_dirty_rate_stat_thread(void *opaque)
+{
+ rcu_register_thread();
+
+ /* start log sync */
+ global_dirty_log_change(GLOBAL_DIRTY_LIMIT, true);
+
+ while (qatomic_read(&vcpu_dirty_rate_stat->running)) {
+ vcpu_dirty_rate_stat_collect();
+ if (dirtylimit_in_service()) {
+ dirtylimit_process();
+ }
+ }
+
+ /* stop log sync */
+ global_dirty_log_change(GLOBAL_DIRTY_LIMIT, false);
+
+ rcu_unregister_thread();
+ return NULL;
+}
+
+int64_t vcpu_dirty_rate_get(int cpu_index)
+{
+ DirtyRateVcpu *rates = vcpu_dirty_rate_stat->stat.rates;
+ return qatomic_read_i64(&rates[cpu_index].dirty_rate);
+}
+
+void vcpu_dirty_rate_stat_start(void)
+{
+ if (qatomic_read(&vcpu_dirty_rate_stat->running)) {
+ return;
+ }
+
+ qatomic_set(&vcpu_dirty_rate_stat->running, 1);
+ qemu_thread_create(&vcpu_dirty_rate_stat->thread,
+ "dirtyrate-stat",
+ vcpu_dirty_rate_stat_thread,
+ NULL,
+ QEMU_THREAD_JOINABLE);
+}
+
+void vcpu_dirty_rate_stat_stop(void)
+{
+ qatomic_set(&vcpu_dirty_rate_stat->running, 0);
+ dirtylimit_state_unlock();
+ qemu_mutex_unlock_iothread();
+ qemu_thread_join(&vcpu_dirty_rate_stat->thread);
+ qemu_mutex_lock_iothread();
+ dirtylimit_state_lock();
+}
+
+void vcpu_dirty_rate_stat_initialize(void)
+{
+ MachineState *ms = MACHINE(qdev_get_machine());
+ int max_cpus = ms->smp.max_cpus;
+
+ vcpu_dirty_rate_stat =
+ g_malloc0(sizeof(*vcpu_dirty_rate_stat));
+
+ vcpu_dirty_rate_stat->stat.nvcpu = max_cpus;
+ vcpu_dirty_rate_stat->stat.rates =
+ g_new0(DirtyRateVcpu, max_cpus);
+
+ vcpu_dirty_rate_stat->running = false;
+}
+
+void vcpu_dirty_rate_stat_finalize(void)
+{
+ g_free(vcpu_dirty_rate_stat->stat.rates);
+ vcpu_dirty_rate_stat->stat.rates = NULL;
+
+ g_free(vcpu_dirty_rate_stat);
+ vcpu_dirty_rate_stat = NULL;
+}
+
+void dirtylimit_state_lock(void)
+{
+ qemu_mutex_lock(&dirtylimit_mutex);
+}
+
+void dirtylimit_state_unlock(void)
+{
+ qemu_mutex_unlock(&dirtylimit_mutex);
+}
+
+static void
+__attribute__((__constructor__)) dirtylimit_mutex_init(void)
+{
+ qemu_mutex_init(&dirtylimit_mutex);
+}
+
+static inline VcpuDirtyLimitState *dirtylimit_vcpu_get_state(int cpu_index)
+{
+ return &dirtylimit_state->states[cpu_index];
+}
+
+void dirtylimit_state_initialize(void)
+{
+ MachineState *ms = MACHINE(qdev_get_machine());
+ int max_cpus = ms->smp.max_cpus;
+ int i;
+
+ dirtylimit_state = g_malloc0(sizeof(*dirtylimit_state));
+
+ dirtylimit_state->states =
+ g_new0(VcpuDirtyLimitState, max_cpus);
+
+ for (i = 0; i < max_cpus; i++) {
+ dirtylimit_state->states[i].cpu_index = i;
+ }
+
+ dirtylimit_state->max_cpus = max_cpus;
+ trace_dirtylimit_state_initialize(max_cpus);
+}
+
+void dirtylimit_state_finalize(void)
+{
+ g_free(dirtylimit_state->states);
+ dirtylimit_state->states = NULL;
+
+ g_free(dirtylimit_state);
+ dirtylimit_state = NULL;
+
+ trace_dirtylimit_state_finalize();
+}
+
+bool dirtylimit_in_service(void)
+{
+ return !!dirtylimit_state;
+}
+
+bool dirtylimit_vcpu_index_valid(int cpu_index)
+{
+ MachineState *ms = MACHINE(qdev_get_machine());
+
+ return !(cpu_index < 0 ||
+ cpu_index >= ms->smp.max_cpus);
+}
+
+static uint64_t dirtylimit_dirty_ring_full_time(uint64_t dirtyrate)
+{
+ static uint64_t max_dirtyrate;
+ uint64_t dirty_ring_size_MiB;
+
+ dirty_ring_size_MiB = qemu_target_pages_to_MiB(kvm_dirty_ring_size());
+
+ if (max_dirtyrate < dirtyrate) {
+ max_dirtyrate = dirtyrate;
+ }
+
+ return dirty_ring_size_MiB * 1000000 / max_dirtyrate;
+}
+
+static inline bool dirtylimit_done(uint64_t quota,
+ uint64_t current)
+{
+ uint64_t min, max;
+
+ min = MIN(quota, current);
+ max = MAX(quota, current);
+
+ return ((max - min) <= DIRTYLIMIT_TOLERANCE_RANGE) ? true : false;
+}
+
+static inline bool
+dirtylimit_need_linear_adjustment(uint64_t quota,
+ uint64_t current)
+{
+ uint64_t min, max;
+
+ min = MIN(quota, current);
+ max = MAX(quota, current);
+
+ return ((max - min) * 100 / max) > DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT;
+}
+
+static void dirtylimit_set_throttle(CPUState *cpu,
+ uint64_t quota,
+ uint64_t current)
+{
+ int64_t ring_full_time_us = 0;
+ uint64_t sleep_pct = 0;
+ uint64_t throttle_us = 0;
+
+ if (current == 0) {
+ cpu->throttle_us_per_full = 0;
+ return;
+ }
+
+ ring_full_time_us = dirtylimit_dirty_ring_full_time(current);
+
+ if (dirtylimit_need_linear_adjustment(quota, current)) {
+ if (quota < current) {
+ sleep_pct = (current - quota) * 100 / current;
+ throttle_us =
+ ring_full_time_us * sleep_pct / (double)(100 - sleep_pct);
+ cpu->throttle_us_per_full += throttle_us;
+ } else {
+ sleep_pct = (quota - current) * 100 / quota;
+ throttle_us =
+ ring_full_time_us * sleep_pct / (double)(100 - sleep_pct);
+ cpu->throttle_us_per_full -= throttle_us;
+ }
+
+ trace_dirtylimit_throttle_pct(cpu->cpu_index,
+ sleep_pct,
+ throttle_us);
+ } else {
+ if (quota < current) {
+ cpu->throttle_us_per_full += ring_full_time_us / 10;
+ } else {
+ cpu->throttle_us_per_full -= ring_full_time_us / 10;
+ }
+ }
+
+ /*
+ * TODO: in the big kvm_dirty_ring_size case (eg: 65536, or other scenario),
+ * current dirty page rate may never reach the quota, we should stop
+ * increasing sleep time?
+ */
+ cpu->throttle_us_per_full = MIN(cpu->throttle_us_per_full,
+ ring_full_time_us * DIRTYLIMIT_THROTTLE_PCT_MAX);
+
+ cpu->throttle_us_per_full = MAX(cpu->throttle_us_per_full, 0);
+}
+
+static void dirtylimit_adjust_throttle(CPUState *cpu)
+{
+ uint64_t quota = 0;
+ uint64_t current = 0;
+ int cpu_index = cpu->cpu_index;
+
+ quota = dirtylimit_vcpu_get_state(cpu_index)->quota;
+ current = vcpu_dirty_rate_get(cpu_index);
+
+ if (!dirtylimit_done(quota, current)) {
+ dirtylimit_set_throttle(cpu, quota, current);
+ }
+
+ return;
+}
+
+void dirtylimit_process(void)
+{
+ CPUState *cpu;
+
+ if (!qatomic_read(&dirtylimit_quit)) {
+ dirtylimit_state_lock();
+
+ if (!dirtylimit_in_service()) {
+ dirtylimit_state_unlock();
+ return;
+ }
+
+ CPU_FOREACH(cpu) {
+ if (!dirtylimit_vcpu_get_state(cpu->cpu_index)->enabled) {
+ continue;
+ }
+ dirtylimit_adjust_throttle(cpu);
+ }
+ dirtylimit_state_unlock();
+ }
+}
+
+void dirtylimit_change(bool start)
+{
+ if (start) {
+ qatomic_set(&dirtylimit_quit, 0);
+ } else {
+ qatomic_set(&dirtylimit_quit, 1);
+ }
+}
+
+void dirtylimit_set_vcpu(int cpu_index,
+ uint64_t quota,
+ bool enable)
+{
+ trace_dirtylimit_set_vcpu(cpu_index, quota);
+
+ if (enable) {
+ dirtylimit_state->states[cpu_index].quota = quota;
+ if (!dirtylimit_vcpu_get_state(cpu_index)->enabled) {
+ dirtylimit_state->limited_nvcpu++;
+ }
+ } else {
+ dirtylimit_state->states[cpu_index].quota = 0;
+ if (dirtylimit_state->states[cpu_index].enabled) {
+ dirtylimit_state->limited_nvcpu--;
+ }
+ }
+
+ dirtylimit_state->states[cpu_index].enabled = enable;
+}
+
+void dirtylimit_set_all(uint64_t quota,
+ bool enable)
+{
+ MachineState *ms = MACHINE(qdev_get_machine());
+ int max_cpus = ms->smp.max_cpus;
+ int i;
+
+ for (i = 0; i < max_cpus; i++) {
+ dirtylimit_set_vcpu(i, quota, enable);
+ }
+}
+
+void dirtylimit_vcpu_execute(CPUState *cpu)
+{
+ if (dirtylimit_in_service() &&
+ dirtylimit_vcpu_get_state(cpu->cpu_index)->enabled &&
+ cpu->throttle_us_per_full) {
+ trace_dirtylimit_vcpu_execute(cpu->cpu_index,
+ cpu->throttle_us_per_full);
+ usleep(cpu->throttle_us_per_full);
+ }
+}
+
+static void dirtylimit_init(void)
+{
+ dirtylimit_state_initialize();
+ dirtylimit_change(true);
+ vcpu_dirty_rate_stat_initialize();
+ vcpu_dirty_rate_stat_start();
+}
+
+static void dirtylimit_cleanup(void)
+{
+ vcpu_dirty_rate_stat_stop();
+ vcpu_dirty_rate_stat_finalize();
+ dirtylimit_change(false);
+ dirtylimit_state_finalize();
+}
+
+/*
+ * dirty page rate limit is not allowed to set if migration
+ * is running with dirty-limit capability enabled.
+ */
+static bool dirtylimit_is_allowed(void)
+{
+ MigrationState *ms = migrate_get_current();
+
+ if (migration_is_running(ms->state) &&
+ (!qemu_thread_is_self(&ms->thread)) &&
+ migrate_dirty_limit() &&
+ dirtylimit_in_service()) {
+ return false;
+ }
+ return true;
+}
+
+void qmp_cancel_vcpu_dirty_limit(bool has_cpu_index,
+ int64_t cpu_index,
+ Error **errp)
+{
+ if (!kvm_enabled() || !kvm_dirty_ring_enabled()) {
+ return;
+ }
+
+ if (has_cpu_index && !dirtylimit_vcpu_index_valid(cpu_index)) {
+ error_setg(errp, "incorrect cpu index specified");
+ return;
+ }
+
+ if (!dirtylimit_is_allowed()) {
+ error_setg(errp, "can't cancel dirty page rate limit while"
+ " migration is running");
+ return;
+ }
+
+ if (!dirtylimit_in_service()) {
+ return;
+ }
+
+ dirtylimit_state_lock();
+
+ if (has_cpu_index) {
+ dirtylimit_set_vcpu(cpu_index, 0, false);
+ } else {
+ dirtylimit_set_all(0, false);
+ }
+
+ if (!dirtylimit_state->limited_nvcpu) {
+ dirtylimit_cleanup();
+ }
+
+ dirtylimit_state_unlock();
+}
+
+void hmp_cancel_vcpu_dirty_limit(Monitor *mon, const QDict *qdict)
+{
+ int64_t cpu_index = qdict_get_try_int(qdict, "cpu_index", -1);
+ Error *err = NULL;
+
+ qmp_cancel_vcpu_dirty_limit(!!(cpu_index != -1), cpu_index, &err);
+ if (err) {
+ hmp_handle_error(mon, err);
+ return;
+ }
+
+ monitor_printf(mon, "[Please use 'info vcpu_dirty_limit' to query "
+ "dirty limit for virtual CPU]\n");
+}
+
+void qmp_set_vcpu_dirty_limit(bool has_cpu_index,
+ int64_t cpu_index,
+ uint64_t dirty_rate,
+ Error **errp)
+{
+ if (!kvm_enabled() || !kvm_dirty_ring_enabled()) {
+ error_setg(errp, "dirty page limit feature requires KVM with"
+ " accelerator property 'dirty-ring-size' set'");
+ return;
+ }
+
+ if (has_cpu_index && !dirtylimit_vcpu_index_valid(cpu_index)) {
+ error_setg(errp, "incorrect cpu index specified");
+ return;
+ }
+
+ if (!dirtylimit_is_allowed()) {
+ error_setg(errp, "can't set dirty page rate limit while"
+ " migration is running");
+ return;
+ }
+
+ if (!dirty_rate) {
+ qmp_cancel_vcpu_dirty_limit(has_cpu_index, cpu_index, errp);
+ return;
+ }
+
+ dirtylimit_state_lock();
+
+ if (!dirtylimit_in_service()) {
+ dirtylimit_init();
+ }
+
+ if (has_cpu_index) {
+ dirtylimit_set_vcpu(cpu_index, dirty_rate, true);
+ } else {
+ dirtylimit_set_all(dirty_rate, true);
+ }
+
+ dirtylimit_state_unlock();
+}
+
+void hmp_set_vcpu_dirty_limit(Monitor *mon, const QDict *qdict)
+{
+ int64_t dirty_rate = qdict_get_int(qdict, "dirty_rate");
+ int64_t cpu_index = qdict_get_try_int(qdict, "cpu_index", -1);
+ Error *err = NULL;
+
+ if (dirty_rate < 0) {
+ error_setg(&err, "invalid dirty page limit %" PRId64, dirty_rate);
+ goto out;
+ }
+
+ qmp_set_vcpu_dirty_limit(!!(cpu_index != -1), cpu_index, dirty_rate, &err);
+
+out:
+ hmp_handle_error(mon, err);
+}
+
+/* Return the max throttle time of each virtual CPU */
+uint64_t dirtylimit_throttle_time_per_round(void)
+{
+ CPUState *cpu;
+ int64_t max = 0;
+
+ CPU_FOREACH(cpu) {
+ if (cpu->throttle_us_per_full > max) {
+ max = cpu->throttle_us_per_full;
+ }
+ }
+
+ return max;
+}
+
+/*
+ * Estimate average dirty ring full time of each virtaul CPU.
+ * Return 0 if guest doesn't dirty memory.
+ */
+uint64_t dirtylimit_ring_full_time(void)
+{
+ CPUState *cpu;
+ uint64_t curr_rate = 0;
+ int nvcpus = 0;
+
+ CPU_FOREACH(cpu) {
+ if (cpu->running) {
+ nvcpus++;
+ curr_rate += vcpu_dirty_rate_get(cpu->cpu_index);
+ }
+ }
+
+ if (!curr_rate || !nvcpus) {
+ return 0;
+ }
+
+ return dirtylimit_dirty_ring_full_time(curr_rate / nvcpus);
+}
+
+static struct DirtyLimitInfo *dirtylimit_query_vcpu(int cpu_index)
+{
+ DirtyLimitInfo *info = NULL;
+
+ info = g_malloc0(sizeof(*info));
+ info->cpu_index = cpu_index;
+ info->limit_rate = dirtylimit_vcpu_get_state(cpu_index)->quota;
+ info->current_rate = vcpu_dirty_rate_get(cpu_index);
+
+ return info;
+}
+
+static struct DirtyLimitInfoList *dirtylimit_query_all(void)
+{
+ int i, index;
+ DirtyLimitInfo *info = NULL;
+ DirtyLimitInfoList *head = NULL, **tail = &head;
+
+ dirtylimit_state_lock();
+
+ if (!dirtylimit_in_service()) {
+ dirtylimit_state_unlock();
+ return NULL;
+ }
+
+ for (i = 0; i < dirtylimit_state->max_cpus; i++) {
+ index = dirtylimit_state->states[i].cpu_index;
+ if (dirtylimit_vcpu_get_state(index)->enabled) {
+ info = dirtylimit_query_vcpu(index);
+ QAPI_LIST_APPEND(tail, info);
+ }
+ }
+
+ dirtylimit_state_unlock();
+
+ return head;
+}
+
+struct DirtyLimitInfoList *qmp_query_vcpu_dirty_limit(Error **errp)
+{
+ if (!dirtylimit_in_service()) {
+ return NULL;
+ }
+
+ return dirtylimit_query_all();
+}
+
+void hmp_info_vcpu_dirty_limit(Monitor *mon, const QDict *qdict)
+{
+ DirtyLimitInfoList *info;
+ g_autoptr(DirtyLimitInfoList) head = NULL;
+ Error *err = NULL;
+
+ if (!dirtylimit_in_service()) {
+ monitor_printf(mon, "Dirty page limit not enabled!\n");
+ return;
+ }
+
+ head = qmp_query_vcpu_dirty_limit(&err);
+ if (err) {
+ hmp_handle_error(mon, err);
+ return;
+ }
+
+ for (info = head; info != NULL; info = info->next) {
+ monitor_printf(mon, "vcpu[%"PRIi64"], limit rate %"PRIi64 " (MB/s),"
+ " current rate %"PRIi64 " (MB/s)\n",
+ info->value->cpu_index,
+ info->value->limit_rate,
+ info->value->current_rate);
+ }
+}
diff --git a/system/dma-helpers.c b/system/dma-helpers.c
new file mode 100644
index 0000000000..36211acc7e
--- /dev/null
+++ b/system/dma-helpers.c
@@ -0,0 +1,347 @@
+/*
+ * DMA helper functions
+ *
+ * Copyright (c) 2009,2020 Red Hat
+ *
+ * This work is licensed under the terms of the GNU General Public License
+ * (GNU GPL), version 2 or later.
+ */
+
+#include "qemu/osdep.h"
+#include "sysemu/block-backend.h"
+#include "sysemu/dma.h"
+#include "trace/trace-root.h"
+#include "qemu/thread.h"
+#include "qemu/main-loop.h"
+#include "sysemu/cpu-timers.h"
+#include "qemu/range.h"
+
+/* #define DEBUG_IOMMU */
+
+MemTxResult dma_memory_set(AddressSpace *as, dma_addr_t addr,
+ uint8_t c, dma_addr_t len, MemTxAttrs attrs)
+{
+ dma_barrier(as, DMA_DIRECTION_FROM_DEVICE);
+
+ return address_space_set(as, addr, c, len, attrs);
+}
+
+void qemu_sglist_init(QEMUSGList *qsg, DeviceState *dev, int alloc_hint,
+ AddressSpace *as)
+{
+ qsg->sg = g_new(ScatterGatherEntry, alloc_hint);
+ qsg->nsg = 0;
+ qsg->nalloc = alloc_hint;
+ qsg->size = 0;
+ qsg->as = as;
+ qsg->dev = dev;
+ object_ref(OBJECT(dev));
+}
+
+void qemu_sglist_add(QEMUSGList *qsg, dma_addr_t base, dma_addr_t len)
+{
+ if (qsg->nsg == qsg->nalloc) {
+ qsg->nalloc = 2 * qsg->nalloc + 1;
+ qsg->sg = g_renew(ScatterGatherEntry, qsg->sg, qsg->nalloc);
+ }
+ qsg->sg[qsg->nsg].base = base;
+ qsg->sg[qsg->nsg].len = len;
+ qsg->size += len;
+ ++qsg->nsg;
+}
+
+void qemu_sglist_destroy(QEMUSGList *qsg)
+{
+ object_unref(OBJECT(qsg->dev));
+ g_free(qsg->sg);
+ memset(qsg, 0, sizeof(*qsg));
+}
+
+typedef struct {
+ BlockAIOCB common;
+ AioContext *ctx;
+ BlockAIOCB *acb;
+ QEMUSGList *sg;
+ uint32_t align;
+ uint64_t offset;
+ DMADirection dir;
+ int sg_cur_index;
+ dma_addr_t sg_cur_byte;
+ QEMUIOVector iov;
+ QEMUBH *bh;
+ DMAIOFunc *io_func;
+ void *io_func_opaque;
+} DMAAIOCB;
+
+static void dma_blk_cb(void *opaque, int ret);
+
+static void reschedule_dma(void *opaque)
+{
+ DMAAIOCB *dbs = (DMAAIOCB *)opaque;
+
+ assert(!dbs->acb && dbs->bh);
+ qemu_bh_delete(dbs->bh);
+ dbs->bh = NULL;
+ dma_blk_cb(dbs, 0);
+}
+
+static void dma_blk_unmap(DMAAIOCB *dbs)
+{
+ int i;
+
+ for (i = 0; i < dbs->iov.niov; ++i) {
+ dma_memory_unmap(dbs->sg->as, dbs->iov.iov[i].iov_base,
+ dbs->iov.iov[i].iov_len, dbs->dir,
+ dbs->iov.iov[i].iov_len);
+ }
+ qemu_iovec_reset(&dbs->iov);
+}
+
+static void dma_complete(DMAAIOCB *dbs, int ret)
+{
+ trace_dma_complete(dbs, ret, dbs->common.cb);
+
+ assert(!dbs->acb && !dbs->bh);
+ dma_blk_unmap(dbs);
+ if (dbs->common.cb) {
+ dbs->common.cb(dbs->common.opaque, ret);
+ }
+ qemu_iovec_destroy(&dbs->iov);
+ qemu_aio_unref(dbs);
+}
+
+static void dma_blk_cb(void *opaque, int ret)
+{
+ DMAAIOCB *dbs = (DMAAIOCB *)opaque;
+ AioContext *ctx = dbs->ctx;
+ dma_addr_t cur_addr, cur_len;
+ void *mem;
+
+ trace_dma_blk_cb(dbs, ret);
+
+ aio_context_acquire(ctx);
+ dbs->acb = NULL;
+ dbs->offset += dbs->iov.size;
+
+ if (dbs->sg_cur_index == dbs->sg->nsg || ret < 0) {
+ dma_complete(dbs, ret);
+ goto out;
+ }
+ dma_blk_unmap(dbs);
+
+ while (dbs->sg_cur_index < dbs->sg->nsg) {
+ cur_addr = dbs->sg->sg[dbs->sg_cur_index].base + dbs->sg_cur_byte;
+ cur_len = dbs->sg->sg[dbs->sg_cur_index].len - dbs->sg_cur_byte;
+ mem = dma_memory_map(dbs->sg->as, cur_addr, &cur_len, dbs->dir,
+ MEMTXATTRS_UNSPECIFIED);
+ /*
+ * Make reads deterministic in icount mode. Windows sometimes issues
+ * disk read requests with overlapping SGs. It leads
+ * to non-determinism, because resulting buffer contents may be mixed
+ * from several sectors. This code splits all SGs into several
+ * groups. SGs in every group do not overlap.
+ */
+ if (mem && icount_enabled() && dbs->dir == DMA_DIRECTION_FROM_DEVICE) {
+ int i;
+ for (i = 0 ; i < dbs->iov.niov ; ++i) {
+ if (ranges_overlap((intptr_t)dbs->iov.iov[i].iov_base,
+ dbs->iov.iov[i].iov_len, (intptr_t)mem,
+ cur_len)) {
+ dma_memory_unmap(dbs->sg->as, mem, cur_len,
+ dbs->dir, cur_len);
+ mem = NULL;
+ break;
+ }
+ }
+ }
+ if (!mem)
+ break;
+ qemu_iovec_add(&dbs->iov, mem, cur_len);
+ dbs->sg_cur_byte += cur_len;
+ if (dbs->sg_cur_byte == dbs->sg->sg[dbs->sg_cur_index].len) {
+ dbs->sg_cur_byte = 0;
+ ++dbs->sg_cur_index;
+ }
+ }
+
+ if (dbs->iov.size == 0) {
+ trace_dma_map_wait(dbs);
+ dbs->bh = aio_bh_new(ctx, reschedule_dma, dbs);
+ cpu_register_map_client(dbs->bh);
+ goto out;
+ }
+
+ if (!QEMU_IS_ALIGNED(dbs->iov.size, dbs->align)) {
+ qemu_iovec_discard_back(&dbs->iov,
+ QEMU_ALIGN_DOWN(dbs->iov.size, dbs->align));
+ }
+
+ dbs->acb = dbs->io_func(dbs->offset, &dbs->iov,
+ dma_blk_cb, dbs, dbs->io_func_opaque);
+ assert(dbs->acb);
+out:
+ aio_context_release(ctx);
+}
+
+static void dma_aio_cancel(BlockAIOCB *acb)
+{
+ DMAAIOCB *dbs = container_of(acb, DMAAIOCB, common);
+
+ trace_dma_aio_cancel(dbs);
+
+ assert(!(dbs->acb && dbs->bh));
+ if (dbs->acb) {
+ /* This will invoke dma_blk_cb. */
+ blk_aio_cancel_async(dbs->acb);
+ return;
+ }
+
+ if (dbs->bh) {
+ cpu_unregister_map_client(dbs->bh);
+ qemu_bh_delete(dbs->bh);
+ dbs->bh = NULL;
+ }
+ if (dbs->common.cb) {
+ dbs->common.cb(dbs->common.opaque, -ECANCELED);
+ }
+}
+
+static const AIOCBInfo dma_aiocb_info = {
+ .aiocb_size = sizeof(DMAAIOCB),
+ .cancel_async = dma_aio_cancel,
+};
+
+BlockAIOCB *dma_blk_io(AioContext *ctx,
+ QEMUSGList *sg, uint64_t offset, uint32_t align,
+ DMAIOFunc *io_func, void *io_func_opaque,
+ BlockCompletionFunc *cb,
+ void *opaque, DMADirection dir)
+{
+ DMAAIOCB *dbs = qemu_aio_get(&dma_aiocb_info, NULL, cb, opaque);
+
+ trace_dma_blk_io(dbs, io_func_opaque, offset, (dir == DMA_DIRECTION_TO_DEVICE));
+
+ dbs->acb = NULL;
+ dbs->sg = sg;
+ dbs->ctx = ctx;
+ dbs->offset = offset;
+ dbs->align = align;
+ dbs->sg_cur_index = 0;
+ dbs->sg_cur_byte = 0;
+ dbs->dir = dir;
+ dbs->io_func = io_func;
+ dbs->io_func_opaque = io_func_opaque;
+ dbs->bh = NULL;
+ qemu_iovec_init(&dbs->iov, sg->nsg);
+ dma_blk_cb(dbs, 0);
+ return &dbs->common;
+}
+
+
+static
+BlockAIOCB *dma_blk_read_io_func(int64_t offset, QEMUIOVector *iov,
+ BlockCompletionFunc *cb, void *cb_opaque,
+ void *opaque)
+{
+ BlockBackend *blk = opaque;
+ return blk_aio_preadv(blk, offset, iov, 0, cb, cb_opaque);
+}
+
+BlockAIOCB *dma_blk_read(BlockBackend *blk,
+ QEMUSGList *sg, uint64_t offset, uint32_t align,
+ void (*cb)(void *opaque, int ret), void *opaque)
+{
+ return dma_blk_io(blk_get_aio_context(blk), sg, offset, align,
+ dma_blk_read_io_func, blk, cb, opaque,
+ DMA_DIRECTION_FROM_DEVICE);
+}
+
+static
+BlockAIOCB *dma_blk_write_io_func(int64_t offset, QEMUIOVector *iov,
+ BlockCompletionFunc *cb, void *cb_opaque,
+ void *opaque)
+{
+ BlockBackend *blk = opaque;
+ return blk_aio_pwritev(blk, offset, iov, 0, cb, cb_opaque);
+}
+
+BlockAIOCB *dma_blk_write(BlockBackend *blk,
+ QEMUSGList *sg, uint64_t offset, uint32_t align,
+ void (*cb)(void *opaque, int ret), void *opaque)
+{
+ return dma_blk_io(blk_get_aio_context(blk), sg, offset, align,
+ dma_blk_write_io_func, blk, cb, opaque,
+ DMA_DIRECTION_TO_DEVICE);
+}
+
+
+static MemTxResult dma_buf_rw(void *buf, dma_addr_t len, dma_addr_t *residual,
+ QEMUSGList *sg, DMADirection dir,
+ MemTxAttrs attrs)
+{
+ uint8_t *ptr = buf;
+ dma_addr_t xresidual;
+ int sg_cur_index;
+ MemTxResult res = MEMTX_OK;
+
+ xresidual = sg->size;
+ sg_cur_index = 0;
+ len = MIN(len, xresidual);
+ while (len > 0) {
+ ScatterGatherEntry entry = sg->sg[sg_cur_index++];
+ dma_addr_t xfer = MIN(len, entry.len);
+ res |= dma_memory_rw(sg->as, entry.base, ptr, xfer, dir, attrs);
+ ptr += xfer;
+ len -= xfer;
+ xresidual -= xfer;
+ }
+
+ if (residual) {
+ *residual = xresidual;
+ }
+ return res;
+}
+
+MemTxResult dma_buf_read(void *ptr, dma_addr_t len, dma_addr_t *residual,
+ QEMUSGList *sg, MemTxAttrs attrs)
+{
+ return dma_buf_rw(ptr, len, residual, sg, DMA_DIRECTION_FROM_DEVICE, attrs);
+}
+
+MemTxResult dma_buf_write(void *ptr, dma_addr_t len, dma_addr_t *residual,
+ QEMUSGList *sg, MemTxAttrs attrs)
+{
+ return dma_buf_rw(ptr, len, residual, sg, DMA_DIRECTION_TO_DEVICE, attrs);
+}
+
+void dma_acct_start(BlockBackend *blk, BlockAcctCookie *cookie,
+ QEMUSGList *sg, enum BlockAcctType type)
+{
+ block_acct_start(blk_get_stats(blk), cookie, sg->size, type);
+}
+
+uint64_t dma_aligned_pow2_mask(uint64_t start, uint64_t end, int max_addr_bits)
+{
+ uint64_t max_mask = UINT64_MAX, addr_mask = end - start;
+ uint64_t alignment_mask, size_mask;
+
+ if (max_addr_bits != 64) {
+ max_mask = (1ULL << max_addr_bits) - 1;
+ }
+
+ alignment_mask = start ? (start & -start) - 1 : max_mask;
+ alignment_mask = MIN(alignment_mask, max_mask);
+ size_mask = MIN(addr_mask, max_mask);
+
+ if (alignment_mask <= size_mask) {
+ /* Increase the alignment of start */
+ return alignment_mask;
+ } else {
+ /* Find the largest page mask from size */
+ if (addr_mask == UINT64_MAX) {
+ return UINT64_MAX;
+ }
+ return (1ULL << (63 - clz64(addr_mask + 1))) - 1;
+ }
+}
+
diff --git a/system/globals.c b/system/globals.c
new file mode 100644
index 0000000000..e83b5428d1
--- /dev/null
+++ b/system/globals.c
@@ -0,0 +1,70 @@
+/*
+ * Global variables that (mostly) should not exist
+ *
+ * Copyright (c) 2003-2020 QEMU contributors
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "exec/cpu-common.h"
+#include "hw/display/vga.h"
+#include "hw/loader.h"
+#include "hw/xen/xen.h"
+#include "net/net.h"
+#include "sysemu/cpus.h"
+#include "sysemu/sysemu.h"
+
+enum vga_retrace_method vga_retrace_method = VGA_RETRACE_DUMB;
+int display_opengl;
+const char* keyboard_layout;
+bool enable_mlock;
+bool enable_cpu_pm;
+int nb_nics;
+NICInfo nd_table[MAX_NICS];
+int autostart = 1;
+int vga_interface_type = VGA_NONE;
+bool vga_interface_created;
+Chardev *parallel_hds[MAX_PARALLEL_PORTS];
+int win2k_install_hack;
+int fd_bootchk = 1;
+int graphic_rotate;
+QEMUOptionRom option_rom[MAX_OPTION_ROMS];
+int nb_option_roms;
+int old_param;
+const char *qemu_name;
+unsigned int nb_prom_envs;
+const char *prom_envs[MAX_PROM_ENVS];
+uint8_t *boot_splash_filedata;
+int only_migratable; /* turn it off unless user states otherwise */
+int icount_align_option;
+
+/* The bytes in qemu_uuid are in the order specified by RFC4122, _not_ in the
+ * little-endian "wire format" described in the SMBIOS 2.6 specification.
+ */
+QemuUUID qemu_uuid;
+bool qemu_uuid_set;
+
+uint32_t xen_domid;
+enum xen_mode xen_mode = XEN_DISABLED;
+bool xen_domid_restrict;
+struct evtchn_backend_ops *xen_evtchn_ops;
+struct gnttab_backend_ops *xen_gnttab_ops;
+struct foreignmem_backend_ops *xen_foreignmem_ops;
+struct xenstore_backend_ops *xen_xenstore_ops;
diff --git a/system/ioport.c b/system/ioport.c
new file mode 100644
index 0000000000..1824aa808c
--- /dev/null
+++ b/system/ioport.c
@@ -0,0 +1,346 @@
+/*
+ * QEMU System Emulator
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+/*
+ * split out ioport related stuffs from vl.c.
+ */
+
+#include "qemu/osdep.h"
+#include "cpu.h"
+#include "exec/ioport.h"
+#include "exec/memory.h"
+#include "exec/address-spaces.h"
+#include "trace.h"
+
+struct MemoryRegionPortioList {
+ Object obj;
+
+ MemoryRegion mr;
+ void *portio_opaque;
+ MemoryRegionPortio *ports;
+};
+
+#define TYPE_MEMORY_REGION_PORTIO_LIST "memory-region-portio-list"
+OBJECT_DECLARE_SIMPLE_TYPE(MemoryRegionPortioList, MEMORY_REGION_PORTIO_LIST)
+
+static uint64_t unassigned_io_read(void *opaque, hwaddr addr, unsigned size)
+{
+ return -1ULL;
+}
+
+static void unassigned_io_write(void *opaque, hwaddr addr, uint64_t val,
+ unsigned size)
+{
+}
+
+const MemoryRegionOps unassigned_io_ops = {
+ .read = unassigned_io_read,
+ .write = unassigned_io_write,
+ .endianness = DEVICE_NATIVE_ENDIAN,
+};
+
+void cpu_outb(uint32_t addr, uint8_t val)
+{
+ trace_cpu_out(addr, 'b', val);
+ address_space_write(&address_space_io, addr, MEMTXATTRS_UNSPECIFIED,
+ &val, 1);
+}
+
+void cpu_outw(uint32_t addr, uint16_t val)
+{
+ uint8_t buf[2];
+
+ trace_cpu_out(addr, 'w', val);
+ stw_p(buf, val);
+ address_space_write(&address_space_io, addr, MEMTXATTRS_UNSPECIFIED,
+ buf, 2);
+}
+
+void cpu_outl(uint32_t addr, uint32_t val)
+{
+ uint8_t buf[4];
+
+ trace_cpu_out(addr, 'l', val);
+ stl_p(buf, val);
+ address_space_write(&address_space_io, addr, MEMTXATTRS_UNSPECIFIED,
+ buf, 4);
+}
+
+uint8_t cpu_inb(uint32_t addr)
+{
+ uint8_t val;
+
+ address_space_read(&address_space_io, addr, MEMTXATTRS_UNSPECIFIED,
+ &val, 1);
+ trace_cpu_in(addr, 'b', val);
+ return val;
+}
+
+uint16_t cpu_inw(uint32_t addr)
+{
+ uint8_t buf[2];
+ uint16_t val;
+
+ address_space_read(&address_space_io, addr, MEMTXATTRS_UNSPECIFIED, buf, 2);
+ val = lduw_p(buf);
+ trace_cpu_in(addr, 'w', val);
+ return val;
+}
+
+uint32_t cpu_inl(uint32_t addr)
+{
+ uint8_t buf[4];
+ uint32_t val;
+
+ address_space_read(&address_space_io, addr, MEMTXATTRS_UNSPECIFIED, buf, 4);
+ val = ldl_p(buf);
+ trace_cpu_in(addr, 'l', val);
+ return val;
+}
+
+void portio_list_init(PortioList *piolist,
+ Object *owner,
+ const MemoryRegionPortio *callbacks,
+ void *opaque, const char *name)
+{
+ unsigned n = 0;
+
+ while (callbacks[n].size) {
+ ++n;
+ }
+
+ piolist->ports = callbacks;
+ piolist->nr = 0;
+ piolist->regions = g_new0(MemoryRegion *, n);
+ piolist->address_space = NULL;
+ piolist->opaque = opaque;
+ piolist->owner = owner;
+ piolist->name = name;
+ piolist->flush_coalesced_mmio = false;
+}
+
+void portio_list_set_flush_coalesced(PortioList *piolist)
+{
+ piolist->flush_coalesced_mmio = true;
+}
+
+void portio_list_destroy(PortioList *piolist)
+{
+ MemoryRegionPortioList *mrpio;
+ unsigned i;
+
+ for (i = 0; i < piolist->nr; ++i) {
+ mrpio = container_of(piolist->regions[i], MemoryRegionPortioList, mr);
+ object_unparent(OBJECT(&mrpio->mr));
+ object_unref(mrpio);
+ }
+ g_free(piolist->regions);
+}
+
+static const MemoryRegionPortio *find_portio(MemoryRegionPortioList *mrpio,
+ uint64_t offset, unsigned size,
+ bool write)
+{
+ const MemoryRegionPortio *mrp;
+
+ for (mrp = mrpio->ports; mrp->size; ++mrp) {
+ if (offset >= mrp->offset && offset < mrp->offset + mrp->len &&
+ size == mrp->size &&
+ (write ? (bool)mrp->write : (bool)mrp->read)) {
+ return mrp;
+ }
+ }
+ return NULL;
+}
+
+static uint64_t portio_read(void *opaque, hwaddr addr, unsigned size)
+{
+ MemoryRegionPortioList *mrpio = opaque;
+ const MemoryRegionPortio *mrp = find_portio(mrpio, addr, size, false);
+ uint64_t data;
+
+ data = ((uint64_t)1 << (size * 8)) - 1;
+ if (mrp) {
+ data = mrp->read(mrpio->portio_opaque, mrp->base + addr);
+ } else if (size == 2) {
+ mrp = find_portio(mrpio, addr, 1, false);
+ if (mrp) {
+ data = mrp->read(mrpio->portio_opaque, mrp->base + addr);
+ if (addr + 1 < mrp->offset + mrp->len) {
+ data |= mrp->read(mrpio->portio_opaque, mrp->base + addr + 1) << 8;
+ } else {
+ data |= 0xff00;
+ }
+ }
+ }
+ return data;
+}
+
+static void portio_write(void *opaque, hwaddr addr, uint64_t data,
+ unsigned size)
+{
+ MemoryRegionPortioList *mrpio = opaque;
+ const MemoryRegionPortio *mrp = find_portio(mrpio, addr, size, true);
+
+ if (mrp) {
+ mrp->write(mrpio->portio_opaque, mrp->base + addr, data);
+ } else if (size == 2) {
+ mrp = find_portio(mrpio, addr, 1, true);
+ if (mrp) {
+ mrp->write(mrpio->portio_opaque, mrp->base + addr, data & 0xff);
+ if (addr + 1 < mrp->offset + mrp->len) {
+ mrp->write(mrpio->portio_opaque, mrp->base + addr + 1, data >> 8);
+ }
+ }
+ }
+}
+
+static const MemoryRegionOps portio_ops = {
+ .read = portio_read,
+ .write = portio_write,
+ .endianness = DEVICE_LITTLE_ENDIAN,
+ .valid.unaligned = true,
+ .impl.unaligned = true,
+};
+
+static void portio_list_add_1(PortioList *piolist,
+ const MemoryRegionPortio *pio_init,
+ unsigned count, unsigned start,
+ unsigned off_low, unsigned off_high)
+{
+ MemoryRegionPortioList *mrpio;
+ Object *owner;
+ char *name;
+ unsigned i;
+
+ /* Copy the sub-list and null-terminate it. */
+ mrpio = MEMORY_REGION_PORTIO_LIST(
+ object_new(TYPE_MEMORY_REGION_PORTIO_LIST));
+ mrpio->portio_opaque = piolist->opaque;
+ mrpio->ports = g_malloc0(sizeof(MemoryRegionPortio) * (count + 1));
+ memcpy(mrpio->ports, pio_init, sizeof(MemoryRegionPortio) * count);
+ memset(mrpio->ports + count, 0, sizeof(MemoryRegionPortio));
+
+ /* Adjust the offsets to all be zero-based for the region. */
+ for (i = 0; i < count; ++i) {
+ mrpio->ports[i].offset -= off_low;
+ mrpio->ports[i].base = start + off_low;
+ }
+
+ /*
+ * The MemoryRegion owner is the MemoryRegionPortioList since that manages
+ * the lifecycle via the refcount
+ */
+ memory_region_init_io(&mrpio->mr, OBJECT(mrpio), &portio_ops, mrpio,
+ piolist->name, off_high - off_low);
+
+ /* Reparent the MemoryRegion to the piolist owner */
+ object_ref(&mrpio->mr);
+ object_unparent(OBJECT(&mrpio->mr));
+ if (!piolist->owner) {
+ owner = container_get(qdev_get_machine(), "/unattached");
+ } else {
+ owner = piolist->owner;
+ }
+ name = g_strdup_printf("%s[*]", piolist->name);
+ object_property_add_child(owner, name, OBJECT(&mrpio->mr));
+ g_free(name);
+
+ if (piolist->flush_coalesced_mmio) {
+ memory_region_set_flush_coalesced(&mrpio->mr);
+ }
+ memory_region_add_subregion(piolist->address_space,
+ start + off_low, &mrpio->mr);
+ piolist->regions[piolist->nr] = &mrpio->mr;
+ ++piolist->nr;
+}
+
+void portio_list_add(PortioList *piolist,
+ MemoryRegion *address_space,
+ uint32_t start)
+{
+ const MemoryRegionPortio *pio, *pio_start = piolist->ports;
+ unsigned int off_low, off_high, off_last, count;
+
+ piolist->address_space = address_space;
+
+ /* Handle the first entry specially. */
+ off_last = off_low = pio_start->offset;
+ off_high = off_low + pio_start->len + pio_start->size - 1;
+ count = 1;
+
+ for (pio = pio_start + 1; pio->size != 0; pio++, count++) {
+ /* All entries must be sorted by offset. */
+ assert(pio->offset >= off_last);
+ off_last = pio->offset;
+
+ /* If we see a hole, break the region. */
+ if (off_last > off_high) {
+ portio_list_add_1(piolist, pio_start, count, start, off_low,
+ off_high);
+ /* ... and start collecting anew. */
+ pio_start = pio;
+ off_low = off_last;
+ off_high = off_low + pio->len + pio_start->size - 1;
+ count = 0;
+ } else if (off_last + pio->len > off_high) {
+ off_high = off_last + pio->len + pio_start->size - 1;
+ }
+ }
+
+ /* There will always be an open sub-list. */
+ portio_list_add_1(piolist, pio_start, count, start, off_low, off_high);
+}
+
+void portio_list_del(PortioList *piolist)
+{
+ MemoryRegionPortioList *mrpio;
+ unsigned i;
+
+ for (i = 0; i < piolist->nr; ++i) {
+ mrpio = container_of(piolist->regions[i], MemoryRegionPortioList, mr);
+ memory_region_del_subregion(piolist->address_space, &mrpio->mr);
+ }
+}
+
+static void memory_region_portio_list_finalize(Object *obj)
+{
+ MemoryRegionPortioList *mrpio = MEMORY_REGION_PORTIO_LIST(obj);
+
+ object_unref(&mrpio->mr);
+ g_free(mrpio->ports);
+}
+
+static const TypeInfo memory_region_portio_list_info = {
+ .parent = TYPE_OBJECT,
+ .name = TYPE_MEMORY_REGION_PORTIO_LIST,
+ .instance_size = sizeof(MemoryRegionPortioList),
+ .instance_finalize = memory_region_portio_list_finalize,
+};
+
+static void ioport_register_types(void)
+{
+ type_register_static(&memory_region_portio_list_info);
+}
+
+type_init(ioport_register_types)
diff --git a/system/main.c b/system/main.c
new file mode 100644
index 0000000000..694388bd7f
--- /dev/null
+++ b/system/main.c
@@ -0,0 +1,49 @@
+/*
+ * QEMU System Emulator
+ *
+ * Copyright (c) 2003-2020 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-main.h"
+#include "sysemu/sysemu.h"
+
+#ifdef CONFIG_SDL
+#include <SDL.h>
+#endif
+
+int qemu_default_main(void)
+{
+ int status;
+
+ status = qemu_main_loop();
+ qemu_cleanup();
+
+ return status;
+}
+
+int (*qemu_main)(void) = qemu_default_main;
+
+int main(int argc, char **argv)
+{
+ qemu_init(argc, argv);
+ return qemu_main();
+}
diff --git a/system/memory.c b/system/memory.c
new file mode 100644
index 0000000000..fa1c99f9ba
--- /dev/null
+++ b/system/memory.c
@@ -0,0 +1,3683 @@
+/*
+ * Physical memory management
+ *
+ * Copyright 2011 Red Hat, Inc. and/or its affiliates
+ *
+ * Authors:
+ * Avi Kivity <avi@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/log.h"
+#include "qapi/error.h"
+#include "exec/memory.h"
+#include "qapi/visitor.h"
+#include "qemu/bitops.h"
+#include "qemu/error-report.h"
+#include "qemu/main-loop.h"
+#include "qemu/qemu-print.h"
+#include "qom/object.h"
+#include "trace.h"
+
+#include "exec/memory-internal.h"
+#include "exec/ram_addr.h"
+#include "sysemu/kvm.h"
+#include "sysemu/runstate.h"
+#include "sysemu/tcg.h"
+#include "qemu/accel.h"
+#include "hw/boards.h"
+#include "migration/vmstate.h"
+#include "exec/address-spaces.h"
+
+//#define DEBUG_UNASSIGNED
+
+static unsigned memory_region_transaction_depth;
+static bool memory_region_update_pending;
+static bool ioeventfd_update_pending;
+unsigned int global_dirty_tracking;
+
+static QTAILQ_HEAD(, MemoryListener) memory_listeners
+ = QTAILQ_HEAD_INITIALIZER(memory_listeners);
+
+static QTAILQ_HEAD(, AddressSpace) address_spaces
+ = QTAILQ_HEAD_INITIALIZER(address_spaces);
+
+static GHashTable *flat_views;
+
+typedef struct AddrRange AddrRange;
+
+/*
+ * Note that signed integers are needed for negative offsetting in aliases
+ * (large MemoryRegion::alias_offset).
+ */
+struct AddrRange {
+ Int128 start;
+ Int128 size;
+};
+
+static AddrRange addrrange_make(Int128 start, Int128 size)
+{
+ return (AddrRange) { start, size };
+}
+
+static bool addrrange_equal(AddrRange r1, AddrRange r2)
+{
+ return int128_eq(r1.start, r2.start) && int128_eq(r1.size, r2.size);
+}
+
+static Int128 addrrange_end(AddrRange r)
+{
+ return int128_add(r.start, r.size);
+}
+
+static AddrRange addrrange_shift(AddrRange range, Int128 delta)
+{
+ int128_addto(&range.start, delta);
+ return range;
+}
+
+static bool addrrange_contains(AddrRange range, Int128 addr)
+{
+ return int128_ge(addr, range.start)
+ && int128_lt(addr, addrrange_end(range));
+}
+
+static bool addrrange_intersects(AddrRange r1, AddrRange r2)
+{
+ return addrrange_contains(r1, r2.start)
+ || addrrange_contains(r2, r1.start);
+}
+
+static AddrRange addrrange_intersection(AddrRange r1, AddrRange r2)
+{
+ Int128 start = int128_max(r1.start, r2.start);
+ Int128 end = int128_min(addrrange_end(r1), addrrange_end(r2));
+ return addrrange_make(start, int128_sub(end, start));
+}
+
+enum ListenerDirection { Forward, Reverse };
+
+#define MEMORY_LISTENER_CALL_GLOBAL(_callback, _direction, _args...) \
+ do { \
+ MemoryListener *_listener; \
+ \
+ switch (_direction) { \
+ case Forward: \
+ QTAILQ_FOREACH(_listener, &memory_listeners, link) { \
+ if (_listener->_callback) { \
+ _listener->_callback(_listener, ##_args); \
+ } \
+ } \
+ break; \
+ case Reverse: \
+ QTAILQ_FOREACH_REVERSE(_listener, &memory_listeners, link) { \
+ if (_listener->_callback) { \
+ _listener->_callback(_listener, ##_args); \
+ } \
+ } \
+ break; \
+ default: \
+ abort(); \
+ } \
+ } while (0)
+
+#define MEMORY_LISTENER_CALL(_as, _callback, _direction, _section, _args...) \
+ do { \
+ MemoryListener *_listener; \
+ \
+ switch (_direction) { \
+ case Forward: \
+ QTAILQ_FOREACH(_listener, &(_as)->listeners, link_as) { \
+ if (_listener->_callback) { \
+ _listener->_callback(_listener, _section, ##_args); \
+ } \
+ } \
+ break; \
+ case Reverse: \
+ QTAILQ_FOREACH_REVERSE(_listener, &(_as)->listeners, link_as) { \
+ if (_listener->_callback) { \
+ _listener->_callback(_listener, _section, ##_args); \
+ } \
+ } \
+ break; \
+ default: \
+ abort(); \
+ } \
+ } while (0)
+
+/* No need to ref/unref .mr, the FlatRange keeps it alive. */
+#define MEMORY_LISTENER_UPDATE_REGION(fr, as, dir, callback, _args...) \
+ do { \
+ MemoryRegionSection mrs = section_from_flat_range(fr, \
+ address_space_to_flatview(as)); \
+ MEMORY_LISTENER_CALL(as, callback, dir, &mrs, ##_args); \
+ } while(0)
+
+struct CoalescedMemoryRange {
+ AddrRange addr;
+ QTAILQ_ENTRY(CoalescedMemoryRange) link;
+};
+
+struct MemoryRegionIoeventfd {
+ AddrRange addr;
+ bool match_data;
+ uint64_t data;
+ EventNotifier *e;
+};
+
+static bool memory_region_ioeventfd_before(MemoryRegionIoeventfd *a,
+ MemoryRegionIoeventfd *b)
+{
+ if (int128_lt(a->addr.start, b->addr.start)) {
+ return true;
+ } else if (int128_gt(a->addr.start, b->addr.start)) {
+ return false;
+ } else if (int128_lt(a->addr.size, b->addr.size)) {
+ return true;
+ } else if (int128_gt(a->addr.size, b->addr.size)) {
+ return false;
+ } else if (a->match_data < b->match_data) {
+ return true;
+ } else if (a->match_data > b->match_data) {
+ return false;
+ } else if (a->match_data) {
+ if (a->data < b->data) {
+ return true;
+ } else if (a->data > b->data) {
+ return false;
+ }
+ }
+ if (a->e < b->e) {
+ return true;
+ } else if (a->e > b->e) {
+ return false;
+ }
+ return false;
+}
+
+static bool memory_region_ioeventfd_equal(MemoryRegionIoeventfd *a,
+ MemoryRegionIoeventfd *b)
+{
+ if (int128_eq(a->addr.start, b->addr.start) &&
+ (!int128_nz(a->addr.size) || !int128_nz(b->addr.size) ||
+ (int128_eq(a->addr.size, b->addr.size) &&
+ (a->match_data == b->match_data) &&
+ ((a->match_data && (a->data == b->data)) || !a->match_data) &&
+ (a->e == b->e))))
+ return true;
+
+ return false;
+}
+
+/* Range of memory in the global map. Addresses are absolute. */
+struct FlatRange {
+ MemoryRegion *mr;
+ hwaddr offset_in_region;
+ AddrRange addr;
+ uint8_t dirty_log_mask;
+ bool romd_mode;
+ bool readonly;
+ bool nonvolatile;
+};
+
+#define FOR_EACH_FLAT_RANGE(var, view) \
+ for (var = (view)->ranges; var < (view)->ranges + (view)->nr; ++var)
+
+static inline MemoryRegionSection
+section_from_flat_range(FlatRange *fr, FlatView *fv)
+{
+ return (MemoryRegionSection) {
+ .mr = fr->mr,
+ .fv = fv,
+ .offset_within_region = fr->offset_in_region,
+ .size = fr->addr.size,
+ .offset_within_address_space = int128_get64(fr->addr.start),
+ .readonly = fr->readonly,
+ .nonvolatile = fr->nonvolatile,
+ };
+}
+
+static bool flatrange_equal(FlatRange *a, FlatRange *b)
+{
+ return a->mr == b->mr
+ && addrrange_equal(a->addr, b->addr)
+ && a->offset_in_region == b->offset_in_region
+ && a->romd_mode == b->romd_mode
+ && a->readonly == b->readonly
+ && a->nonvolatile == b->nonvolatile;
+}
+
+static FlatView *flatview_new(MemoryRegion *mr_root)
+{
+ FlatView *view;
+
+ view = g_new0(FlatView, 1);
+ view->ref = 1;
+ view->root = mr_root;
+ memory_region_ref(mr_root);
+ trace_flatview_new(view, mr_root);
+
+ return view;
+}
+
+/* Insert a range into a given position. Caller is responsible for maintaining
+ * sorting order.
+ */
+static void flatview_insert(FlatView *view, unsigned pos, FlatRange *range)
+{
+ if (view->nr == view->nr_allocated) {
+ view->nr_allocated = MAX(2 * view->nr, 10);
+ view->ranges = g_realloc(view->ranges,
+ view->nr_allocated * sizeof(*view->ranges));
+ }
+ memmove(view->ranges + pos + 1, view->ranges + pos,
+ (view->nr - pos) * sizeof(FlatRange));
+ view->ranges[pos] = *range;
+ memory_region_ref(range->mr);
+ ++view->nr;
+}
+
+static void flatview_destroy(FlatView *view)
+{
+ int i;
+
+ trace_flatview_destroy(view, view->root);
+ if (view->dispatch) {
+ address_space_dispatch_free(view->dispatch);
+ }
+ for (i = 0; i < view->nr; i++) {
+ memory_region_unref(view->ranges[i].mr);
+ }
+ g_free(view->ranges);
+ memory_region_unref(view->root);
+ g_free(view);
+}
+
+static bool flatview_ref(FlatView *view)
+{
+ return qatomic_fetch_inc_nonzero(&view->ref) > 0;
+}
+
+void flatview_unref(FlatView *view)
+{
+ if (qatomic_fetch_dec(&view->ref) == 1) {
+ trace_flatview_destroy_rcu(view, view->root);
+ assert(view->root);
+ call_rcu(view, flatview_destroy, rcu);
+ }
+}
+
+static bool can_merge(FlatRange *r1, FlatRange *r2)
+{
+ return int128_eq(addrrange_end(r1->addr), r2->addr.start)
+ && r1->mr == r2->mr
+ && int128_eq(int128_add(int128_make64(r1->offset_in_region),
+ r1->addr.size),
+ int128_make64(r2->offset_in_region))
+ && r1->dirty_log_mask == r2->dirty_log_mask
+ && r1->romd_mode == r2->romd_mode
+ && r1->readonly == r2->readonly
+ && r1->nonvolatile == r2->nonvolatile;
+}
+
+/* Attempt to simplify a view by merging adjacent ranges */
+static void flatview_simplify(FlatView *view)
+{
+ unsigned i, j, k;
+
+ i = 0;
+ while (i < view->nr) {
+ j = i + 1;
+ while (j < view->nr
+ && can_merge(&view->ranges[j-1], &view->ranges[j])) {
+ int128_addto(&view->ranges[i].addr.size, view->ranges[j].addr.size);
+ ++j;
+ }
+ ++i;
+ for (k = i; k < j; k++) {
+ memory_region_unref(view->ranges[k].mr);
+ }
+ memmove(&view->ranges[i], &view->ranges[j],
+ (view->nr - j) * sizeof(view->ranges[j]));
+ view->nr -= j - i;
+ }
+}
+
+static bool memory_region_big_endian(MemoryRegion *mr)
+{
+#if TARGET_BIG_ENDIAN
+ return mr->ops->endianness != DEVICE_LITTLE_ENDIAN;
+#else
+ return mr->ops->endianness == DEVICE_BIG_ENDIAN;
+#endif
+}
+
+static void adjust_endianness(MemoryRegion *mr, uint64_t *data, MemOp op)
+{
+ if ((op & MO_BSWAP) != devend_memop(mr->ops->endianness)) {
+ switch (op & MO_SIZE) {
+ case MO_8:
+ break;
+ case MO_16:
+ *data = bswap16(*data);
+ break;
+ case MO_32:
+ *data = bswap32(*data);
+ break;
+ case MO_64:
+ *data = bswap64(*data);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ }
+}
+
+static inline void memory_region_shift_read_access(uint64_t *value,
+ signed shift,
+ uint64_t mask,
+ uint64_t tmp)
+{
+ if (shift >= 0) {
+ *value |= (tmp & mask) << shift;
+ } else {
+ *value |= (tmp & mask) >> -shift;
+ }
+}
+
+static inline uint64_t memory_region_shift_write_access(uint64_t *value,
+ signed shift,
+ uint64_t mask)
+{
+ uint64_t tmp;
+
+ if (shift >= 0) {
+ tmp = (*value >> shift) & mask;
+ } else {
+ tmp = (*value << -shift) & mask;
+ }
+
+ return tmp;
+}
+
+static hwaddr memory_region_to_absolute_addr(MemoryRegion *mr, hwaddr offset)
+{
+ MemoryRegion *root;
+ hwaddr abs_addr = offset;
+
+ abs_addr += mr->addr;
+ for (root = mr; root->container; ) {
+ root = root->container;
+ abs_addr += root->addr;
+ }
+
+ return abs_addr;
+}
+
+static int get_cpu_index(void)
+{
+ if (current_cpu) {
+ return current_cpu->cpu_index;
+ }
+ return -1;
+}
+
+static MemTxResult memory_region_read_accessor(MemoryRegion *mr,
+ hwaddr addr,
+ uint64_t *value,
+ unsigned size,
+ signed shift,
+ uint64_t mask,
+ MemTxAttrs attrs)
+{
+ uint64_t tmp;
+
+ tmp = mr->ops->read(mr->opaque, addr, size);
+ if (mr->subpage) {
+ trace_memory_region_subpage_read(get_cpu_index(), mr, addr, tmp, size);
+ } else if (trace_event_get_state_backends(TRACE_MEMORY_REGION_OPS_READ)) {
+ hwaddr abs_addr = memory_region_to_absolute_addr(mr, addr);
+ trace_memory_region_ops_read(get_cpu_index(), mr, abs_addr, tmp, size,
+ memory_region_name(mr));
+ }
+ memory_region_shift_read_access(value, shift, mask, tmp);
+ return MEMTX_OK;
+}
+
+static MemTxResult memory_region_read_with_attrs_accessor(MemoryRegion *mr,
+ hwaddr addr,
+ uint64_t *value,
+ unsigned size,
+ signed shift,
+ uint64_t mask,
+ MemTxAttrs attrs)
+{
+ uint64_t tmp = 0;
+ MemTxResult r;
+
+ r = mr->ops->read_with_attrs(mr->opaque, addr, &tmp, size, attrs);
+ if (mr->subpage) {
+ trace_memory_region_subpage_read(get_cpu_index(), mr, addr, tmp, size);
+ } else if (trace_event_get_state_backends(TRACE_MEMORY_REGION_OPS_READ)) {
+ hwaddr abs_addr = memory_region_to_absolute_addr(mr, addr);
+ trace_memory_region_ops_read(get_cpu_index(), mr, abs_addr, tmp, size,
+ memory_region_name(mr));
+ }
+ memory_region_shift_read_access(value, shift, mask, tmp);
+ return r;
+}
+
+static MemTxResult memory_region_write_accessor(MemoryRegion *mr,
+ hwaddr addr,
+ uint64_t *value,
+ unsigned size,
+ signed shift,
+ uint64_t mask,
+ MemTxAttrs attrs)
+{
+ uint64_t tmp = memory_region_shift_write_access(value, shift, mask);
+
+ if (mr->subpage) {
+ trace_memory_region_subpage_write(get_cpu_index(), mr, addr, tmp, size);
+ } else if (trace_event_get_state_backends(TRACE_MEMORY_REGION_OPS_WRITE)) {
+ hwaddr abs_addr = memory_region_to_absolute_addr(mr, addr);
+ trace_memory_region_ops_write(get_cpu_index(), mr, abs_addr, tmp, size,
+ memory_region_name(mr));
+ }
+ mr->ops->write(mr->opaque, addr, tmp, size);
+ return MEMTX_OK;
+}
+
+static MemTxResult memory_region_write_with_attrs_accessor(MemoryRegion *mr,
+ hwaddr addr,
+ uint64_t *value,
+ unsigned size,
+ signed shift,
+ uint64_t mask,
+ MemTxAttrs attrs)
+{
+ uint64_t tmp = memory_region_shift_write_access(value, shift, mask);
+
+ if (mr->subpage) {
+ trace_memory_region_subpage_write(get_cpu_index(), mr, addr, tmp, size);
+ } else if (trace_event_get_state_backends(TRACE_MEMORY_REGION_OPS_WRITE)) {
+ hwaddr abs_addr = memory_region_to_absolute_addr(mr, addr);
+ trace_memory_region_ops_write(get_cpu_index(), mr, abs_addr, tmp, size,
+ memory_region_name(mr));
+ }
+ return mr->ops->write_with_attrs(mr->opaque, addr, tmp, size, attrs);
+}
+
+static MemTxResult access_with_adjusted_size(hwaddr addr,
+ uint64_t *value,
+ unsigned size,
+ unsigned access_size_min,
+ unsigned access_size_max,
+ MemTxResult (*access_fn)
+ (MemoryRegion *mr,
+ hwaddr addr,
+ uint64_t *value,
+ unsigned size,
+ signed shift,
+ uint64_t mask,
+ MemTxAttrs attrs),
+ MemoryRegion *mr,
+ MemTxAttrs attrs)
+{
+ uint64_t access_mask;
+ unsigned access_size;
+ unsigned i;
+ MemTxResult r = MEMTX_OK;
+ bool reentrancy_guard_applied = false;
+
+ if (!access_size_min) {
+ access_size_min = 1;
+ }
+ if (!access_size_max) {
+ access_size_max = 4;
+ }
+
+ /* Do not allow more than one simultaneous access to a device's IO Regions */
+ if (mr->dev && !mr->disable_reentrancy_guard &&
+ !mr->ram_device && !mr->ram && !mr->rom_device && !mr->readonly) {
+ if (mr->dev->mem_reentrancy_guard.engaged_in_io) {
+ warn_report_once("Blocked re-entrant IO on MemoryRegion: "
+ "%s at addr: 0x%" HWADDR_PRIX,
+ memory_region_name(mr), addr);
+ return MEMTX_ACCESS_ERROR;
+ }
+ mr->dev->mem_reentrancy_guard.engaged_in_io = true;
+ reentrancy_guard_applied = true;
+ }
+
+ /* FIXME: support unaligned access? */
+ access_size = MAX(MIN(size, access_size_max), access_size_min);
+ access_mask = MAKE_64BIT_MASK(0, access_size * 8);
+ if (memory_region_big_endian(mr)) {
+ for (i = 0; i < size; i += access_size) {
+ r |= access_fn(mr, addr + i, value, access_size,
+ (size - access_size - i) * 8, access_mask, attrs);
+ }
+ } else {
+ for (i = 0; i < size; i += access_size) {
+ r |= access_fn(mr, addr + i, value, access_size, i * 8,
+ access_mask, attrs);
+ }
+ }
+ if (mr->dev && reentrancy_guard_applied) {
+ mr->dev->mem_reentrancy_guard.engaged_in_io = false;
+ }
+ return r;
+}
+
+static AddressSpace *memory_region_to_address_space(MemoryRegion *mr)
+{
+ AddressSpace *as;
+
+ while (mr->container) {
+ mr = mr->container;
+ }
+ QTAILQ_FOREACH(as, &address_spaces, address_spaces_link) {
+ if (mr == as->root) {
+ return as;
+ }
+ }
+ return NULL;
+}
+
+/* Render a memory region into the global view. Ranges in @view obscure
+ * ranges in @mr.
+ */
+static void render_memory_region(FlatView *view,
+ MemoryRegion *mr,
+ Int128 base,
+ AddrRange clip,
+ bool readonly,
+ bool nonvolatile)
+{
+ MemoryRegion *subregion;
+ unsigned i;
+ hwaddr offset_in_region;
+ Int128 remain;
+ Int128 now;
+ FlatRange fr;
+ AddrRange tmp;
+
+ if (!mr->enabled) {
+ return;
+ }
+
+ int128_addto(&base, int128_make64(mr->addr));
+ readonly |= mr->readonly;
+ nonvolatile |= mr->nonvolatile;
+
+ tmp = addrrange_make(base, mr->size);
+
+ if (!addrrange_intersects(tmp, clip)) {
+ return;
+ }
+
+ clip = addrrange_intersection(tmp, clip);
+
+ if (mr->alias) {
+ int128_subfrom(&base, int128_make64(mr->alias->addr));
+ int128_subfrom(&base, int128_make64(mr->alias_offset));
+ render_memory_region(view, mr->alias, base, clip,
+ readonly, nonvolatile);
+ return;
+ }
+
+ /* Render subregions in priority order. */
+ QTAILQ_FOREACH(subregion, &mr->subregions, subregions_link) {
+ render_memory_region(view, subregion, base, clip,
+ readonly, nonvolatile);
+ }
+
+ if (!mr->terminates) {
+ return;
+ }
+
+ offset_in_region = int128_get64(int128_sub(clip.start, base));
+ base = clip.start;
+ remain = clip.size;
+
+ fr.mr = mr;
+ fr.dirty_log_mask = memory_region_get_dirty_log_mask(mr);
+ fr.romd_mode = mr->romd_mode;
+ fr.readonly = readonly;
+ fr.nonvolatile = nonvolatile;
+
+ /* Render the region itself into any gaps left by the current view. */
+ for (i = 0; i < view->nr && int128_nz(remain); ++i) {
+ if (int128_ge(base, addrrange_end(view->ranges[i].addr))) {
+ continue;
+ }
+ if (int128_lt(base, view->ranges[i].addr.start)) {
+ now = int128_min(remain,
+ int128_sub(view->ranges[i].addr.start, base));
+ fr.offset_in_region = offset_in_region;
+ fr.addr = addrrange_make(base, now);
+ flatview_insert(view, i, &fr);
+ ++i;
+ int128_addto(&base, now);
+ offset_in_region += int128_get64(now);
+ int128_subfrom(&remain, now);
+ }
+ now = int128_sub(int128_min(int128_add(base, remain),
+ addrrange_end(view->ranges[i].addr)),
+ base);
+ int128_addto(&base, now);
+ offset_in_region += int128_get64(now);
+ int128_subfrom(&remain, now);
+ }
+ if (int128_nz(remain)) {
+ fr.offset_in_region = offset_in_region;
+ fr.addr = addrrange_make(base, remain);
+ flatview_insert(view, i, &fr);
+ }
+}
+
+void flatview_for_each_range(FlatView *fv, flatview_cb cb , void *opaque)
+{
+ FlatRange *fr;
+
+ assert(fv);
+ assert(cb);
+
+ FOR_EACH_FLAT_RANGE(fr, fv) {
+ if (cb(fr->addr.start, fr->addr.size, fr->mr,
+ fr->offset_in_region, opaque)) {
+ break;
+ }
+ }
+}
+
+static MemoryRegion *memory_region_get_flatview_root(MemoryRegion *mr)
+{
+ while (mr->enabled) {
+ if (mr->alias) {
+ if (!mr->alias_offset && int128_ge(mr->size, mr->alias->size)) {
+ /* The alias is included in its entirety. Use it as
+ * the "real" root, so that we can share more FlatViews.
+ */
+ mr = mr->alias;
+ continue;
+ }
+ } else if (!mr->terminates) {
+ unsigned int found = 0;
+ MemoryRegion *child, *next = NULL;
+ QTAILQ_FOREACH(child, &mr->subregions, subregions_link) {
+ if (child->enabled) {
+ if (++found > 1) {
+ next = NULL;
+ break;
+ }
+ if (!child->addr && int128_ge(mr->size, child->size)) {
+ /* A child is included in its entirety. If it's the only
+ * enabled one, use it in the hope of finding an alias down the
+ * way. This will also let us share FlatViews.
+ */
+ next = child;
+ }
+ }
+ }
+ if (found == 0) {
+ return NULL;
+ }
+ if (next) {
+ mr = next;
+ continue;
+ }
+ }
+
+ return mr;
+ }
+
+ return NULL;
+}
+
+/* Render a memory topology into a list of disjoint absolute ranges. */
+static FlatView *generate_memory_topology(MemoryRegion *mr)
+{
+ int i;
+ FlatView *view;
+
+ view = flatview_new(mr);
+
+ if (mr) {
+ render_memory_region(view, mr, int128_zero(),
+ addrrange_make(int128_zero(), int128_2_64()),
+ false, false);
+ }
+ flatview_simplify(view);
+
+ view->dispatch = address_space_dispatch_new(view);
+ for (i = 0; i < view->nr; i++) {
+ MemoryRegionSection mrs =
+ section_from_flat_range(&view->ranges[i], view);
+ flatview_add_to_dispatch(view, &mrs);
+ }
+ address_space_dispatch_compact(view->dispatch);
+ g_hash_table_replace(flat_views, mr, view);
+
+ return view;
+}
+
+static void address_space_add_del_ioeventfds(AddressSpace *as,
+ MemoryRegionIoeventfd *fds_new,
+ unsigned fds_new_nb,
+ MemoryRegionIoeventfd *fds_old,
+ unsigned fds_old_nb)
+{
+ unsigned iold, inew;
+ MemoryRegionIoeventfd *fd;
+ MemoryRegionSection section;
+
+ /* Generate a symmetric difference of the old and new fd sets, adding
+ * and deleting as necessary.
+ */
+
+ iold = inew = 0;
+ while (iold < fds_old_nb || inew < fds_new_nb) {
+ if (iold < fds_old_nb
+ && (inew == fds_new_nb
+ || memory_region_ioeventfd_before(&fds_old[iold],
+ &fds_new[inew]))) {
+ fd = &fds_old[iold];
+ section = (MemoryRegionSection) {
+ .fv = address_space_to_flatview(as),
+ .offset_within_address_space = int128_get64(fd->addr.start),
+ .size = fd->addr.size,
+ };
+ MEMORY_LISTENER_CALL(as, eventfd_del, Forward, &section,
+ fd->match_data, fd->data, fd->e);
+ ++iold;
+ } else if (inew < fds_new_nb
+ && (iold == fds_old_nb
+ || memory_region_ioeventfd_before(&fds_new[inew],
+ &fds_old[iold]))) {
+ fd = &fds_new[inew];
+ section = (MemoryRegionSection) {
+ .fv = address_space_to_flatview(as),
+ .offset_within_address_space = int128_get64(fd->addr.start),
+ .size = fd->addr.size,
+ };
+ MEMORY_LISTENER_CALL(as, eventfd_add, Reverse, &section,
+ fd->match_data, fd->data, fd->e);
+ ++inew;
+ } else {
+ ++iold;
+ ++inew;
+ }
+ }
+}
+
+FlatView *address_space_get_flatview(AddressSpace *as)
+{
+ FlatView *view;
+
+ RCU_READ_LOCK_GUARD();
+ do {
+ view = address_space_to_flatview(as);
+ /* If somebody has replaced as->current_map concurrently,
+ * flatview_ref returns false.
+ */
+ } while (!flatview_ref(view));
+ return view;
+}
+
+static void address_space_update_ioeventfds(AddressSpace *as)
+{
+ FlatView *view;
+ FlatRange *fr;
+ unsigned ioeventfd_nb = 0;
+ unsigned ioeventfd_max;
+ MemoryRegionIoeventfd *ioeventfds;
+ AddrRange tmp;
+ unsigned i;
+
+ if (!as->ioeventfd_notifiers) {
+ return;
+ }
+
+ /*
+ * It is likely that the number of ioeventfds hasn't changed much, so use
+ * the previous size as the starting value, with some headroom to avoid
+ * gratuitous reallocations.
+ */
+ ioeventfd_max = QEMU_ALIGN_UP(as->ioeventfd_nb, 4);
+ ioeventfds = g_new(MemoryRegionIoeventfd, ioeventfd_max);
+
+ view = address_space_get_flatview(as);
+ FOR_EACH_FLAT_RANGE(fr, view) {
+ for (i = 0; i < fr->mr->ioeventfd_nb; ++i) {
+ tmp = addrrange_shift(fr->mr->ioeventfds[i].addr,
+ int128_sub(fr->addr.start,
+ int128_make64(fr->offset_in_region)));
+ if (addrrange_intersects(fr->addr, tmp)) {
+ ++ioeventfd_nb;
+ if (ioeventfd_nb > ioeventfd_max) {
+ ioeventfd_max = MAX(ioeventfd_max * 2, 4);
+ ioeventfds = g_realloc(ioeventfds,
+ ioeventfd_max * sizeof(*ioeventfds));
+ }
+ ioeventfds[ioeventfd_nb-1] = fr->mr->ioeventfds[i];
+ ioeventfds[ioeventfd_nb-1].addr = tmp;
+ }
+ }
+ }
+
+ address_space_add_del_ioeventfds(as, ioeventfds, ioeventfd_nb,
+ as->ioeventfds, as->ioeventfd_nb);
+
+ g_free(as->ioeventfds);
+ as->ioeventfds = ioeventfds;
+ as->ioeventfd_nb = ioeventfd_nb;
+ flatview_unref(view);
+}
+
+/*
+ * Notify the memory listeners about the coalesced IO change events of
+ * range `cmr'. Only the part that has intersection of the specified
+ * FlatRange will be sent.
+ */
+static void flat_range_coalesced_io_notify(FlatRange *fr, AddressSpace *as,
+ CoalescedMemoryRange *cmr, bool add)
+{
+ AddrRange tmp;
+
+ tmp = addrrange_shift(cmr->addr,
+ int128_sub(fr->addr.start,
+ int128_make64(fr->offset_in_region)));
+ if (!addrrange_intersects(tmp, fr->addr)) {
+ return;
+ }
+ tmp = addrrange_intersection(tmp, fr->addr);
+
+ if (add) {
+ MEMORY_LISTENER_UPDATE_REGION(fr, as, Forward, coalesced_io_add,
+ int128_get64(tmp.start),
+ int128_get64(tmp.size));
+ } else {
+ MEMORY_LISTENER_UPDATE_REGION(fr, as, Reverse, coalesced_io_del,
+ int128_get64(tmp.start),
+ int128_get64(tmp.size));
+ }
+}
+
+static void flat_range_coalesced_io_del(FlatRange *fr, AddressSpace *as)
+{
+ CoalescedMemoryRange *cmr;
+
+ QTAILQ_FOREACH(cmr, &fr->mr->coalesced, link) {
+ flat_range_coalesced_io_notify(fr, as, cmr, false);
+ }
+}
+
+static void flat_range_coalesced_io_add(FlatRange *fr, AddressSpace *as)
+{
+ MemoryRegion *mr = fr->mr;
+ CoalescedMemoryRange *cmr;
+
+ if (QTAILQ_EMPTY(&mr->coalesced)) {
+ return;
+ }
+
+ QTAILQ_FOREACH(cmr, &mr->coalesced, link) {
+ flat_range_coalesced_io_notify(fr, as, cmr, true);
+ }
+}
+
+static void address_space_update_topology_pass(AddressSpace *as,
+ const FlatView *old_view,
+ const FlatView *new_view,
+ bool adding)
+{
+ unsigned iold, inew;
+ FlatRange *frold, *frnew;
+
+ /* Generate a symmetric difference of the old and new memory maps.
+ * Kill ranges in the old map, and instantiate ranges in the new map.
+ */
+ iold = inew = 0;
+ while (iold < old_view->nr || inew < new_view->nr) {
+ if (iold < old_view->nr) {
+ frold = &old_view->ranges[iold];
+ } else {
+ frold = NULL;
+ }
+ if (inew < new_view->nr) {
+ frnew = &new_view->ranges[inew];
+ } else {
+ frnew = NULL;
+ }
+
+ if (frold
+ && (!frnew
+ || int128_lt(frold->addr.start, frnew->addr.start)
+ || (int128_eq(frold->addr.start, frnew->addr.start)
+ && !flatrange_equal(frold, frnew)))) {
+ /* In old but not in new, or in both but attributes changed. */
+
+ if (!adding) {
+ flat_range_coalesced_io_del(frold, as);
+ MEMORY_LISTENER_UPDATE_REGION(frold, as, Reverse, region_del);
+ }
+
+ ++iold;
+ } else if (frold && frnew && flatrange_equal(frold, frnew)) {
+ /* In both and unchanged (except logging may have changed) */
+
+ if (adding) {
+ MEMORY_LISTENER_UPDATE_REGION(frnew, as, Forward, region_nop);
+ if (frnew->dirty_log_mask & ~frold->dirty_log_mask) {
+ MEMORY_LISTENER_UPDATE_REGION(frnew, as, Forward, log_start,
+ frold->dirty_log_mask,
+ frnew->dirty_log_mask);
+ }
+ if (frold->dirty_log_mask & ~frnew->dirty_log_mask) {
+ MEMORY_LISTENER_UPDATE_REGION(frnew, as, Reverse, log_stop,
+ frold->dirty_log_mask,
+ frnew->dirty_log_mask);
+ }
+ }
+
+ ++iold;
+ ++inew;
+ } else {
+ /* In new */
+
+ if (adding) {
+ MEMORY_LISTENER_UPDATE_REGION(frnew, as, Forward, region_add);
+ flat_range_coalesced_io_add(frnew, as);
+ }
+
+ ++inew;
+ }
+ }
+}
+
+static void flatviews_init(void)
+{
+ static FlatView *empty_view;
+
+ if (flat_views) {
+ return;
+ }
+
+ flat_views = g_hash_table_new_full(g_direct_hash, g_direct_equal, NULL,
+ (GDestroyNotify) flatview_unref);
+ if (!empty_view) {
+ empty_view = generate_memory_topology(NULL);
+ /* We keep it alive forever in the global variable. */
+ flatview_ref(empty_view);
+ } else {
+ g_hash_table_replace(flat_views, NULL, empty_view);
+ flatview_ref(empty_view);
+ }
+}
+
+static void flatviews_reset(void)
+{
+ AddressSpace *as;
+
+ if (flat_views) {
+ g_hash_table_unref(flat_views);
+ flat_views = NULL;
+ }
+ flatviews_init();
+
+ /* Render unique FVs */
+ QTAILQ_FOREACH(as, &address_spaces, address_spaces_link) {
+ MemoryRegion *physmr = memory_region_get_flatview_root(as->root);
+
+ if (g_hash_table_lookup(flat_views, physmr)) {
+ continue;
+ }
+
+ generate_memory_topology(physmr);
+ }
+}
+
+static void address_space_set_flatview(AddressSpace *as)
+{
+ FlatView *old_view = address_space_to_flatview(as);
+ MemoryRegion *physmr = memory_region_get_flatview_root(as->root);
+ FlatView *new_view = g_hash_table_lookup(flat_views, physmr);
+
+ assert(new_view);
+
+ if (old_view == new_view) {
+ return;
+ }
+
+ if (old_view) {
+ flatview_ref(old_view);
+ }
+
+ flatview_ref(new_view);
+
+ if (!QTAILQ_EMPTY(&as->listeners)) {
+ FlatView tmpview = { .nr = 0 }, *old_view2 = old_view;
+
+ if (!old_view2) {
+ old_view2 = &tmpview;
+ }
+ address_space_update_topology_pass(as, old_view2, new_view, false);
+ address_space_update_topology_pass(as, old_view2, new_view, true);
+ }
+
+ /* Writes are protected by the BQL. */
+ qatomic_rcu_set(&as->current_map, new_view);
+ if (old_view) {
+ flatview_unref(old_view);
+ }
+
+ /* Note that all the old MemoryRegions are still alive up to this
+ * point. This relieves most MemoryListeners from the need to
+ * ref/unref the MemoryRegions they get---unless they use them
+ * outside the iothread mutex, in which case precise reference
+ * counting is necessary.
+ */
+ if (old_view) {
+ flatview_unref(old_view);
+ }
+}
+
+static void address_space_update_topology(AddressSpace *as)
+{
+ MemoryRegion *physmr = memory_region_get_flatview_root(as->root);
+
+ flatviews_init();
+ if (!g_hash_table_lookup(flat_views, physmr)) {
+ generate_memory_topology(physmr);
+ }
+ address_space_set_flatview(as);
+}
+
+void memory_region_transaction_begin(void)
+{
+ qemu_flush_coalesced_mmio_buffer();
+ ++memory_region_transaction_depth;
+}
+
+void memory_region_transaction_commit(void)
+{
+ AddressSpace *as;
+
+ assert(memory_region_transaction_depth);
+ assert(qemu_mutex_iothread_locked());
+
+ --memory_region_transaction_depth;
+ if (!memory_region_transaction_depth) {
+ if (memory_region_update_pending) {
+ flatviews_reset();
+
+ MEMORY_LISTENER_CALL_GLOBAL(begin, Forward);
+
+ QTAILQ_FOREACH(as, &address_spaces, address_spaces_link) {
+ address_space_set_flatview(as);
+ address_space_update_ioeventfds(as);
+ }
+ memory_region_update_pending = false;
+ ioeventfd_update_pending = false;
+ MEMORY_LISTENER_CALL_GLOBAL(commit, Forward);
+ } else if (ioeventfd_update_pending) {
+ QTAILQ_FOREACH(as, &address_spaces, address_spaces_link) {
+ address_space_update_ioeventfds(as);
+ }
+ ioeventfd_update_pending = false;
+ }
+ }
+}
+
+static void memory_region_destructor_none(MemoryRegion *mr)
+{
+}
+
+static void memory_region_destructor_ram(MemoryRegion *mr)
+{
+ qemu_ram_free(mr->ram_block);
+}
+
+static bool memory_region_need_escape(char c)
+{
+ return c == '/' || c == '[' || c == '\\' || c == ']';
+}
+
+static char *memory_region_escape_name(const char *name)
+{
+ const char *p;
+ char *escaped, *q;
+ uint8_t c;
+ size_t bytes = 0;
+
+ for (p = name; *p; p++) {
+ bytes += memory_region_need_escape(*p) ? 4 : 1;
+ }
+ if (bytes == p - name) {
+ return g_memdup(name, bytes + 1);
+ }
+
+ escaped = g_malloc(bytes + 1);
+ for (p = name, q = escaped; *p; p++) {
+ c = *p;
+ if (unlikely(memory_region_need_escape(c))) {
+ *q++ = '\\';
+ *q++ = 'x';
+ *q++ = "0123456789abcdef"[c >> 4];
+ c = "0123456789abcdef"[c & 15];
+ }
+ *q++ = c;
+ }
+ *q = 0;
+ return escaped;
+}
+
+static void memory_region_do_init(MemoryRegion *mr,
+ Object *owner,
+ const char *name,
+ uint64_t size)
+{
+ mr->size = int128_make64(size);
+ if (size == UINT64_MAX) {
+ mr->size = int128_2_64();
+ }
+ mr->name = g_strdup(name);
+ mr->owner = owner;
+ mr->dev = (DeviceState *) object_dynamic_cast(mr->owner, TYPE_DEVICE);
+ mr->ram_block = NULL;
+
+ if (name) {
+ char *escaped_name = memory_region_escape_name(name);
+ char *name_array = g_strdup_printf("%s[*]", escaped_name);
+
+ if (!owner) {
+ owner = container_get(qdev_get_machine(), "/unattached");
+ }
+
+ object_property_add_child(owner, name_array, OBJECT(mr));
+ object_unref(OBJECT(mr));
+ g_free(name_array);
+ g_free(escaped_name);
+ }
+}
+
+void memory_region_init(MemoryRegion *mr,
+ Object *owner,
+ const char *name,
+ uint64_t size)
+{
+ object_initialize(mr, sizeof(*mr), TYPE_MEMORY_REGION);
+ memory_region_do_init(mr, owner, name, size);
+}
+
+static void memory_region_get_container(Object *obj, Visitor *v,
+ const char *name, void *opaque,
+ Error **errp)
+{
+ MemoryRegion *mr = MEMORY_REGION(obj);
+ char *path = (char *)"";
+
+ if (mr->container) {
+ path = object_get_canonical_path(OBJECT(mr->container));
+ }
+ visit_type_str(v, name, &path, errp);
+ if (mr->container) {
+ g_free(path);
+ }
+}
+
+static Object *memory_region_resolve_container(Object *obj, void *opaque,
+ const char *part)
+{
+ MemoryRegion *mr = MEMORY_REGION(obj);
+
+ return OBJECT(mr->container);
+}
+
+static void memory_region_get_priority(Object *obj, Visitor *v,
+ const char *name, void *opaque,
+ Error **errp)
+{
+ MemoryRegion *mr = MEMORY_REGION(obj);
+ int32_t value = mr->priority;
+
+ visit_type_int32(v, name, &value, errp);
+}
+
+static void memory_region_get_size(Object *obj, Visitor *v, const char *name,
+ void *opaque, Error **errp)
+{
+ MemoryRegion *mr = MEMORY_REGION(obj);
+ uint64_t value = memory_region_size(mr);
+
+ visit_type_uint64(v, name, &value, errp);
+}
+
+static void memory_region_initfn(Object *obj)
+{
+ MemoryRegion *mr = MEMORY_REGION(obj);
+ ObjectProperty *op;
+
+ mr->ops = &unassigned_mem_ops;
+ mr->enabled = true;
+ mr->romd_mode = true;
+ mr->destructor = memory_region_destructor_none;
+ QTAILQ_INIT(&mr->subregions);
+ QTAILQ_INIT(&mr->coalesced);
+
+ op = object_property_add(OBJECT(mr), "container",
+ "link<" TYPE_MEMORY_REGION ">",
+ memory_region_get_container,
+ NULL, /* memory_region_set_container */
+ NULL, NULL);
+ op->resolve = memory_region_resolve_container;
+
+ object_property_add_uint64_ptr(OBJECT(mr), "addr",
+ &mr->addr, OBJ_PROP_FLAG_READ);
+ object_property_add(OBJECT(mr), "priority", "uint32",
+ memory_region_get_priority,
+ NULL, /* memory_region_set_priority */
+ NULL, NULL);
+ object_property_add(OBJECT(mr), "size", "uint64",
+ memory_region_get_size,
+ NULL, /* memory_region_set_size, */
+ NULL, NULL);
+}
+
+static void iommu_memory_region_initfn(Object *obj)
+{
+ MemoryRegion *mr = MEMORY_REGION(obj);
+
+ mr->is_iommu = true;
+}
+
+static uint64_t unassigned_mem_read(void *opaque, hwaddr addr,
+ unsigned size)
+{
+#ifdef DEBUG_UNASSIGNED
+ printf("Unassigned mem read " HWADDR_FMT_plx "\n", addr);
+#endif
+ return 0;
+}
+
+static void unassigned_mem_write(void *opaque, hwaddr addr,
+ uint64_t val, unsigned size)
+{
+#ifdef DEBUG_UNASSIGNED
+ printf("Unassigned mem write " HWADDR_FMT_plx " = 0x%"PRIx64"\n", addr, val);
+#endif
+}
+
+static bool unassigned_mem_accepts(void *opaque, hwaddr addr,
+ unsigned size, bool is_write,
+ MemTxAttrs attrs)
+{
+ return false;
+}
+
+const MemoryRegionOps unassigned_mem_ops = {
+ .valid.accepts = unassigned_mem_accepts,
+ .endianness = DEVICE_NATIVE_ENDIAN,
+};
+
+static uint64_t memory_region_ram_device_read(void *opaque,
+ hwaddr addr, unsigned size)
+{
+ MemoryRegion *mr = opaque;
+ uint64_t data = (uint64_t)~0;
+
+ switch (size) {
+ case 1:
+ data = *(uint8_t *)(mr->ram_block->host + addr);
+ break;
+ case 2:
+ data = *(uint16_t *)(mr->ram_block->host + addr);
+ break;
+ case 4:
+ data = *(uint32_t *)(mr->ram_block->host + addr);
+ break;
+ case 8:
+ data = *(uint64_t *)(mr->ram_block->host + addr);
+ break;
+ }
+
+ trace_memory_region_ram_device_read(get_cpu_index(), mr, addr, data, size);
+
+ return data;
+}
+
+static void memory_region_ram_device_write(void *opaque, hwaddr addr,
+ uint64_t data, unsigned size)
+{
+ MemoryRegion *mr = opaque;
+
+ trace_memory_region_ram_device_write(get_cpu_index(), mr, addr, data, size);
+
+ switch (size) {
+ case 1:
+ *(uint8_t *)(mr->ram_block->host + addr) = (uint8_t)data;
+ break;
+ case 2:
+ *(uint16_t *)(mr->ram_block->host + addr) = (uint16_t)data;
+ break;
+ case 4:
+ *(uint32_t *)(mr->ram_block->host + addr) = (uint32_t)data;
+ break;
+ case 8:
+ *(uint64_t *)(mr->ram_block->host + addr) = data;
+ break;
+ }
+}
+
+static const MemoryRegionOps ram_device_mem_ops = {
+ .read = memory_region_ram_device_read,
+ .write = memory_region_ram_device_write,
+ .endianness = DEVICE_HOST_ENDIAN,
+ .valid = {
+ .min_access_size = 1,
+ .max_access_size = 8,
+ .unaligned = true,
+ },
+ .impl = {
+ .min_access_size = 1,
+ .max_access_size = 8,
+ .unaligned = true,
+ },
+};
+
+bool memory_region_access_valid(MemoryRegion *mr,
+ hwaddr addr,
+ unsigned size,
+ bool is_write,
+ MemTxAttrs attrs)
+{
+ if (mr->ops->valid.accepts
+ && !mr->ops->valid.accepts(mr->opaque, addr, size, is_write, attrs)) {
+ qemu_log_mask(LOG_GUEST_ERROR, "Invalid %s at addr 0x%" HWADDR_PRIX
+ ", size %u, region '%s', reason: rejected\n",
+ is_write ? "write" : "read",
+ addr, size, memory_region_name(mr));
+ return false;
+ }
+
+ if (!mr->ops->valid.unaligned && (addr & (size - 1))) {
+ qemu_log_mask(LOG_GUEST_ERROR, "Invalid %s at addr 0x%" HWADDR_PRIX
+ ", size %u, region '%s', reason: unaligned\n",
+ is_write ? "write" : "read",
+ addr, size, memory_region_name(mr));
+ return false;
+ }
+
+ /* Treat zero as compatibility all valid */
+ if (!mr->ops->valid.max_access_size) {
+ return true;
+ }
+
+ if (size > mr->ops->valid.max_access_size
+ || size < mr->ops->valid.min_access_size) {
+ qemu_log_mask(LOG_GUEST_ERROR, "Invalid %s at addr 0x%" HWADDR_PRIX
+ ", size %u, region '%s', reason: invalid size "
+ "(min:%u max:%u)\n",
+ is_write ? "write" : "read",
+ addr, size, memory_region_name(mr),
+ mr->ops->valid.min_access_size,
+ mr->ops->valid.max_access_size);
+ return false;
+ }
+ return true;
+}
+
+static MemTxResult memory_region_dispatch_read1(MemoryRegion *mr,
+ hwaddr addr,
+ uint64_t *pval,
+ unsigned size,
+ MemTxAttrs attrs)
+{
+ *pval = 0;
+
+ if (mr->ops->read) {
+ return access_with_adjusted_size(addr, pval, size,
+ mr->ops->impl.min_access_size,
+ mr->ops->impl.max_access_size,
+ memory_region_read_accessor,
+ mr, attrs);
+ } else {
+ return access_with_adjusted_size(addr, pval, size,
+ mr->ops->impl.min_access_size,
+ mr->ops->impl.max_access_size,
+ memory_region_read_with_attrs_accessor,
+ mr, attrs);
+ }
+}
+
+MemTxResult memory_region_dispatch_read(MemoryRegion *mr,
+ hwaddr addr,
+ uint64_t *pval,
+ MemOp op,
+ MemTxAttrs attrs)
+{
+ unsigned size = memop_size(op);
+ MemTxResult r;
+
+ if (mr->alias) {
+ return memory_region_dispatch_read(mr->alias,
+ mr->alias_offset + addr,
+ pval, op, attrs);
+ }
+ if (!memory_region_access_valid(mr, addr, size, false, attrs)) {
+ *pval = unassigned_mem_read(mr, addr, size);
+ return MEMTX_DECODE_ERROR;
+ }
+
+ r = memory_region_dispatch_read1(mr, addr, pval, size, attrs);
+ adjust_endianness(mr, pval, op);
+ return r;
+}
+
+/* Return true if an eventfd was signalled */
+static bool memory_region_dispatch_write_eventfds(MemoryRegion *mr,
+ hwaddr addr,
+ uint64_t data,
+ unsigned size,
+ MemTxAttrs attrs)
+{
+ MemoryRegionIoeventfd ioeventfd = {
+ .addr = addrrange_make(int128_make64(addr), int128_make64(size)),
+ .data = data,
+ };
+ unsigned i;
+
+ for (i = 0; i < mr->ioeventfd_nb; i++) {
+ ioeventfd.match_data = mr->ioeventfds[i].match_data;
+ ioeventfd.e = mr->ioeventfds[i].e;
+
+ if (memory_region_ioeventfd_equal(&ioeventfd, &mr->ioeventfds[i])) {
+ event_notifier_set(ioeventfd.e);
+ return true;
+ }
+ }
+
+ return false;
+}
+
+MemTxResult memory_region_dispatch_write(MemoryRegion *mr,
+ hwaddr addr,
+ uint64_t data,
+ MemOp op,
+ MemTxAttrs attrs)
+{
+ unsigned size = memop_size(op);
+
+ if (mr->alias) {
+ return memory_region_dispatch_write(mr->alias,
+ mr->alias_offset + addr,
+ data, op, attrs);
+ }
+ if (!memory_region_access_valid(mr, addr, size, true, attrs)) {
+ unassigned_mem_write(mr, addr, data, size);
+ return MEMTX_DECODE_ERROR;
+ }
+
+ adjust_endianness(mr, &data, op);
+
+ if ((!kvm_eventfds_enabled()) &&
+ memory_region_dispatch_write_eventfds(mr, addr, data, size, attrs)) {
+ return MEMTX_OK;
+ }
+
+ if (mr->ops->write) {
+ return access_with_adjusted_size(addr, &data, size,
+ mr->ops->impl.min_access_size,
+ mr->ops->impl.max_access_size,
+ memory_region_write_accessor, mr,
+ attrs);
+ } else {
+ return
+ access_with_adjusted_size(addr, &data, size,
+ mr->ops->impl.min_access_size,
+ mr->ops->impl.max_access_size,
+ memory_region_write_with_attrs_accessor,
+ mr, attrs);
+ }
+}
+
+void memory_region_init_io(MemoryRegion *mr,
+ Object *owner,
+ const MemoryRegionOps *ops,
+ void *opaque,
+ const char *name,
+ uint64_t size)
+{
+ memory_region_init(mr, owner, name, size);
+ mr->ops = ops ? ops : &unassigned_mem_ops;
+ mr->opaque = opaque;
+ mr->terminates = true;
+}
+
+void memory_region_init_ram_nomigrate(MemoryRegion *mr,
+ Object *owner,
+ const char *name,
+ uint64_t size,
+ Error **errp)
+{
+ memory_region_init_ram_flags_nomigrate(mr, owner, name, size, 0, errp);
+}
+
+void memory_region_init_ram_flags_nomigrate(MemoryRegion *mr,
+ Object *owner,
+ const char *name,
+ uint64_t size,
+ uint32_t ram_flags,
+ Error **errp)
+{
+ Error *err = NULL;
+ memory_region_init(mr, owner, name, size);
+ mr->ram = true;
+ mr->terminates = true;
+ mr->destructor = memory_region_destructor_ram;
+ mr->ram_block = qemu_ram_alloc(size, ram_flags, mr, &err);
+ if (err) {
+ mr->size = int128_zero();
+ object_unparent(OBJECT(mr));
+ error_propagate(errp, err);
+ }
+}
+
+void memory_region_init_resizeable_ram(MemoryRegion *mr,
+ Object *owner,
+ const char *name,
+ uint64_t size,
+ uint64_t max_size,
+ void (*resized)(const char*,
+ uint64_t length,
+ void *host),
+ Error **errp)
+{
+ Error *err = NULL;
+ memory_region_init(mr, owner, name, size);
+ mr->ram = true;
+ mr->terminates = true;
+ mr->destructor = memory_region_destructor_ram;
+ mr->ram_block = qemu_ram_alloc_resizeable(size, max_size, resized,
+ mr, &err);
+ if (err) {
+ mr->size = int128_zero();
+ object_unparent(OBJECT(mr));
+ error_propagate(errp, err);
+ }
+}
+
+#ifdef CONFIG_POSIX
+void memory_region_init_ram_from_file(MemoryRegion *mr,
+ Object *owner,
+ const char *name,
+ uint64_t size,
+ uint64_t align,
+ uint32_t ram_flags,
+ const char *path,
+ ram_addr_t offset,
+ Error **errp)
+{
+ Error *err = NULL;
+ memory_region_init(mr, owner, name, size);
+ mr->ram = true;
+ mr->readonly = !!(ram_flags & RAM_READONLY);
+ mr->terminates = true;
+ mr->destructor = memory_region_destructor_ram;
+ mr->align = align;
+ mr->ram_block = qemu_ram_alloc_from_file(size, mr, ram_flags, path,
+ offset, &err);
+ if (err) {
+ mr->size = int128_zero();
+ object_unparent(OBJECT(mr));
+ error_propagate(errp, err);
+ }
+}
+
+void memory_region_init_ram_from_fd(MemoryRegion *mr,
+ Object *owner,
+ const char *name,
+ uint64_t size,
+ uint32_t ram_flags,
+ int fd,
+ ram_addr_t offset,
+ Error **errp)
+{
+ Error *err = NULL;
+ memory_region_init(mr, owner, name, size);
+ mr->ram = true;
+ mr->readonly = !!(ram_flags & RAM_READONLY);
+ mr->terminates = true;
+ mr->destructor = memory_region_destructor_ram;
+ mr->ram_block = qemu_ram_alloc_from_fd(size, mr, ram_flags, fd, offset,
+ &err);
+ if (err) {
+ mr->size = int128_zero();
+ object_unparent(OBJECT(mr));
+ error_propagate(errp, err);
+ }
+}
+#endif
+
+void memory_region_init_ram_ptr(MemoryRegion *mr,
+ Object *owner,
+ const char *name,
+ uint64_t size,
+ void *ptr)
+{
+ memory_region_init(mr, owner, name, size);
+ mr->ram = true;
+ mr->terminates = true;
+ mr->destructor = memory_region_destructor_ram;
+
+ /* qemu_ram_alloc_from_ptr cannot fail with ptr != NULL. */
+ assert(ptr != NULL);
+ mr->ram_block = qemu_ram_alloc_from_ptr(size, ptr, mr, &error_fatal);
+}
+
+void memory_region_init_ram_device_ptr(MemoryRegion *mr,
+ Object *owner,
+ const char *name,
+ uint64_t size,
+ void *ptr)
+{
+ memory_region_init(mr, owner, name, size);
+ mr->ram = true;
+ mr->terminates = true;
+ mr->ram_device = true;
+ mr->ops = &ram_device_mem_ops;
+ mr->opaque = mr;
+ mr->destructor = memory_region_destructor_ram;
+
+ /* qemu_ram_alloc_from_ptr cannot fail with ptr != NULL. */
+ assert(ptr != NULL);
+ mr->ram_block = qemu_ram_alloc_from_ptr(size, ptr, mr, &error_fatal);
+}
+
+void memory_region_init_alias(MemoryRegion *mr,
+ Object *owner,
+ const char *name,
+ MemoryRegion *orig,
+ hwaddr offset,
+ uint64_t size)
+{
+ memory_region_init(mr, owner, name, size);
+ mr->alias = orig;
+ mr->alias_offset = offset;
+}
+
+void memory_region_init_rom_nomigrate(MemoryRegion *mr,
+ Object *owner,
+ const char *name,
+ uint64_t size,
+ Error **errp)
+{
+ memory_region_init_ram_flags_nomigrate(mr, owner, name, size, 0, errp);
+ mr->readonly = true;
+}
+
+void memory_region_init_rom_device_nomigrate(MemoryRegion *mr,
+ Object *owner,
+ const MemoryRegionOps *ops,
+ void *opaque,
+ const char *name,
+ uint64_t size,
+ Error **errp)
+{
+ Error *err = NULL;
+ assert(ops);
+ memory_region_init(mr, owner, name, size);
+ mr->ops = ops;
+ mr->opaque = opaque;
+ mr->terminates = true;
+ mr->rom_device = true;
+ mr->destructor = memory_region_destructor_ram;
+ mr->ram_block = qemu_ram_alloc(size, 0, mr, &err);
+ if (err) {
+ mr->size = int128_zero();
+ object_unparent(OBJECT(mr));
+ error_propagate(errp, err);
+ }
+}
+
+void memory_region_init_iommu(void *_iommu_mr,
+ size_t instance_size,
+ const char *mrtypename,
+ Object *owner,
+ const char *name,
+ uint64_t size)
+{
+ struct IOMMUMemoryRegion *iommu_mr;
+ struct MemoryRegion *mr;
+
+ object_initialize(_iommu_mr, instance_size, mrtypename);
+ mr = MEMORY_REGION(_iommu_mr);
+ memory_region_do_init(mr, owner, name, size);
+ iommu_mr = IOMMU_MEMORY_REGION(mr);
+ mr->terminates = true; /* then re-forwards */
+ QLIST_INIT(&iommu_mr->iommu_notify);
+ iommu_mr->iommu_notify_flags = IOMMU_NOTIFIER_NONE;
+}
+
+static void memory_region_finalize(Object *obj)
+{
+ MemoryRegion *mr = MEMORY_REGION(obj);
+
+ assert(!mr->container);
+
+ /* We know the region is not visible in any address space (it
+ * does not have a container and cannot be a root either because
+ * it has no references, so we can blindly clear mr->enabled.
+ * memory_region_set_enabled instead could trigger a transaction
+ * and cause an infinite loop.
+ */
+ mr->enabled = false;
+ memory_region_transaction_begin();
+ while (!QTAILQ_EMPTY(&mr->subregions)) {
+ MemoryRegion *subregion = QTAILQ_FIRST(&mr->subregions);
+ memory_region_del_subregion(mr, subregion);
+ }
+ memory_region_transaction_commit();
+
+ mr->destructor(mr);
+ memory_region_clear_coalescing(mr);
+ g_free((char *)mr->name);
+ g_free(mr->ioeventfds);
+}
+
+Object *memory_region_owner(MemoryRegion *mr)
+{
+ Object *obj = OBJECT(mr);
+ return obj->parent;
+}
+
+void memory_region_ref(MemoryRegion *mr)
+{
+ /* MMIO callbacks most likely will access data that belongs
+ * to the owner, hence the need to ref/unref the owner whenever
+ * the memory region is in use.
+ *
+ * The memory region is a child of its owner. As long as the
+ * owner doesn't call unparent itself on the memory region,
+ * ref-ing the owner will also keep the memory region alive.
+ * Memory regions without an owner are supposed to never go away;
+ * we do not ref/unref them because it slows down DMA sensibly.
+ */
+ if (mr && mr->owner) {
+ object_ref(mr->owner);
+ }
+}
+
+void memory_region_unref(MemoryRegion *mr)
+{
+ if (mr && mr->owner) {
+ object_unref(mr->owner);
+ }
+}
+
+uint64_t memory_region_size(MemoryRegion *mr)
+{
+ if (int128_eq(mr->size, int128_2_64())) {
+ return UINT64_MAX;
+ }
+ return int128_get64(mr->size);
+}
+
+const char *memory_region_name(const MemoryRegion *mr)
+{
+ if (!mr->name) {
+ ((MemoryRegion *)mr)->name =
+ g_strdup(object_get_canonical_path_component(OBJECT(mr)));
+ }
+ return mr->name;
+}
+
+bool memory_region_is_ram_device(MemoryRegion *mr)
+{
+ return mr->ram_device;
+}
+
+bool memory_region_is_protected(MemoryRegion *mr)
+{
+ return mr->ram && (mr->ram_block->flags & RAM_PROTECTED);
+}
+
+uint8_t memory_region_get_dirty_log_mask(MemoryRegion *mr)
+{
+ uint8_t mask = mr->dirty_log_mask;
+ RAMBlock *rb = mr->ram_block;
+
+ if (global_dirty_tracking && ((rb && qemu_ram_is_migratable(rb)) ||
+ memory_region_is_iommu(mr))) {
+ mask |= (1 << DIRTY_MEMORY_MIGRATION);
+ }
+
+ if (tcg_enabled() && rb) {
+ /* TCG only cares about dirty memory logging for RAM, not IOMMU. */
+ mask |= (1 << DIRTY_MEMORY_CODE);
+ }
+ return mask;
+}
+
+bool memory_region_is_logging(MemoryRegion *mr, uint8_t client)
+{
+ return memory_region_get_dirty_log_mask(mr) & (1 << client);
+}
+
+static int memory_region_update_iommu_notify_flags(IOMMUMemoryRegion *iommu_mr,
+ Error **errp)
+{
+ IOMMUNotifierFlag flags = IOMMU_NOTIFIER_NONE;
+ IOMMUNotifier *iommu_notifier;
+ IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_GET_CLASS(iommu_mr);
+ int ret = 0;
+
+ IOMMU_NOTIFIER_FOREACH(iommu_notifier, iommu_mr) {
+ flags |= iommu_notifier->notifier_flags;
+ }
+
+ if (flags != iommu_mr->iommu_notify_flags && imrc->notify_flag_changed) {
+ ret = imrc->notify_flag_changed(iommu_mr,
+ iommu_mr->iommu_notify_flags,
+ flags, errp);
+ }
+
+ if (!ret) {
+ iommu_mr->iommu_notify_flags = flags;
+ }
+ return ret;
+}
+
+int memory_region_iommu_set_page_size_mask(IOMMUMemoryRegion *iommu_mr,
+ uint64_t page_size_mask,
+ Error **errp)
+{
+ IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_GET_CLASS(iommu_mr);
+ int ret = 0;
+
+ if (imrc->iommu_set_page_size_mask) {
+ ret = imrc->iommu_set_page_size_mask(iommu_mr, page_size_mask, errp);
+ }
+ return ret;
+}
+
+int memory_region_register_iommu_notifier(MemoryRegion *mr,
+ IOMMUNotifier *n, Error **errp)
+{
+ IOMMUMemoryRegion *iommu_mr;
+ int ret;
+
+ if (mr->alias) {
+ return memory_region_register_iommu_notifier(mr->alias, n, errp);
+ }
+
+ /* We need to register for at least one bitfield */
+ iommu_mr = IOMMU_MEMORY_REGION(mr);
+ assert(n->notifier_flags != IOMMU_NOTIFIER_NONE);
+ assert(n->start <= n->end);
+ assert(n->iommu_idx >= 0 &&
+ n->iommu_idx < memory_region_iommu_num_indexes(iommu_mr));
+
+ QLIST_INSERT_HEAD(&iommu_mr->iommu_notify, n, node);
+ ret = memory_region_update_iommu_notify_flags(iommu_mr, errp);
+ if (ret) {
+ QLIST_REMOVE(n, node);
+ }
+ return ret;
+}
+
+uint64_t memory_region_iommu_get_min_page_size(IOMMUMemoryRegion *iommu_mr)
+{
+ IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_GET_CLASS(iommu_mr);
+
+ if (imrc->get_min_page_size) {
+ return imrc->get_min_page_size(iommu_mr);
+ }
+ return TARGET_PAGE_SIZE;
+}
+
+void memory_region_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n)
+{
+ MemoryRegion *mr = MEMORY_REGION(iommu_mr);
+ IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_GET_CLASS(iommu_mr);
+ hwaddr addr, granularity;
+ IOMMUTLBEntry iotlb;
+
+ /* If the IOMMU has its own replay callback, override */
+ if (imrc->replay) {
+ imrc->replay(iommu_mr, n);
+ return;
+ }
+
+ granularity = memory_region_iommu_get_min_page_size(iommu_mr);
+
+ for (addr = 0; addr < memory_region_size(mr); addr += granularity) {
+ iotlb = imrc->translate(iommu_mr, addr, IOMMU_NONE, n->iommu_idx);
+ if (iotlb.perm != IOMMU_NONE) {
+ n->notify(n, &iotlb);
+ }
+
+ /* if (2^64 - MR size) < granularity, it's possible to get an
+ * infinite loop here. This should catch such a wraparound */
+ if ((addr + granularity) < addr) {
+ break;
+ }
+ }
+}
+
+void memory_region_unregister_iommu_notifier(MemoryRegion *mr,
+ IOMMUNotifier *n)
+{
+ IOMMUMemoryRegion *iommu_mr;
+
+ if (mr->alias) {
+ memory_region_unregister_iommu_notifier(mr->alias, n);
+ return;
+ }
+ QLIST_REMOVE(n, node);
+ iommu_mr = IOMMU_MEMORY_REGION(mr);
+ memory_region_update_iommu_notify_flags(iommu_mr, NULL);
+}
+
+void memory_region_notify_iommu_one(IOMMUNotifier *notifier,
+ IOMMUTLBEvent *event)
+{
+ IOMMUTLBEntry *entry = &event->entry;
+ hwaddr entry_end = entry->iova + entry->addr_mask;
+ IOMMUTLBEntry tmp = *entry;
+
+ if (event->type == IOMMU_NOTIFIER_UNMAP) {
+ assert(entry->perm == IOMMU_NONE);
+ }
+
+ /*
+ * Skip the notification if the notification does not overlap
+ * with registered range.
+ */
+ if (notifier->start > entry_end || notifier->end < entry->iova) {
+ return;
+ }
+
+ if (notifier->notifier_flags & IOMMU_NOTIFIER_DEVIOTLB_UNMAP) {
+ /* Crop (iova, addr_mask) to range */
+ tmp.iova = MAX(tmp.iova, notifier->start);
+ tmp.addr_mask = MIN(entry_end, notifier->end) - tmp.iova;
+ } else {
+ assert(entry->iova >= notifier->start && entry_end <= notifier->end);
+ }
+
+ if (event->type & notifier->notifier_flags) {
+ notifier->notify(notifier, &tmp);
+ }
+}
+
+void memory_region_unmap_iommu_notifier_range(IOMMUNotifier *notifier)
+{
+ IOMMUTLBEvent event;
+
+ event.type = IOMMU_NOTIFIER_UNMAP;
+ event.entry.target_as = &address_space_memory;
+ event.entry.iova = notifier->start;
+ event.entry.perm = IOMMU_NONE;
+ event.entry.addr_mask = notifier->end - notifier->start;
+
+ memory_region_notify_iommu_one(notifier, &event);
+}
+
+void memory_region_notify_iommu(IOMMUMemoryRegion *iommu_mr,
+ int iommu_idx,
+ IOMMUTLBEvent event)
+{
+ IOMMUNotifier *iommu_notifier;
+
+ assert(memory_region_is_iommu(MEMORY_REGION(iommu_mr)));
+
+ IOMMU_NOTIFIER_FOREACH(iommu_notifier, iommu_mr) {
+ if (iommu_notifier->iommu_idx == iommu_idx) {
+ memory_region_notify_iommu_one(iommu_notifier, &event);
+ }
+ }
+}
+
+int memory_region_iommu_get_attr(IOMMUMemoryRegion *iommu_mr,
+ enum IOMMUMemoryRegionAttr attr,
+ void *data)
+{
+ IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_GET_CLASS(iommu_mr);
+
+ if (!imrc->get_attr) {
+ return -EINVAL;
+ }
+
+ return imrc->get_attr(iommu_mr, attr, data);
+}
+
+int memory_region_iommu_attrs_to_index(IOMMUMemoryRegion *iommu_mr,
+ MemTxAttrs attrs)
+{
+ IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_GET_CLASS(iommu_mr);
+
+ if (!imrc->attrs_to_index) {
+ return 0;
+ }
+
+ return imrc->attrs_to_index(iommu_mr, attrs);
+}
+
+int memory_region_iommu_num_indexes(IOMMUMemoryRegion *iommu_mr)
+{
+ IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_GET_CLASS(iommu_mr);
+
+ if (!imrc->num_indexes) {
+ return 1;
+ }
+
+ return imrc->num_indexes(iommu_mr);
+}
+
+RamDiscardManager *memory_region_get_ram_discard_manager(MemoryRegion *mr)
+{
+ if (!memory_region_is_mapped(mr) || !memory_region_is_ram(mr)) {
+ return NULL;
+ }
+ return mr->rdm;
+}
+
+void memory_region_set_ram_discard_manager(MemoryRegion *mr,
+ RamDiscardManager *rdm)
+{
+ g_assert(memory_region_is_ram(mr) && !memory_region_is_mapped(mr));
+ g_assert(!rdm || !mr->rdm);
+ mr->rdm = rdm;
+}
+
+uint64_t ram_discard_manager_get_min_granularity(const RamDiscardManager *rdm,
+ const MemoryRegion *mr)
+{
+ RamDiscardManagerClass *rdmc = RAM_DISCARD_MANAGER_GET_CLASS(rdm);
+
+ g_assert(rdmc->get_min_granularity);
+ return rdmc->get_min_granularity(rdm, mr);
+}
+
+bool ram_discard_manager_is_populated(const RamDiscardManager *rdm,
+ const MemoryRegionSection *section)
+{
+ RamDiscardManagerClass *rdmc = RAM_DISCARD_MANAGER_GET_CLASS(rdm);
+
+ g_assert(rdmc->is_populated);
+ return rdmc->is_populated(rdm, section);
+}
+
+int ram_discard_manager_replay_populated(const RamDiscardManager *rdm,
+ MemoryRegionSection *section,
+ ReplayRamPopulate replay_fn,
+ void *opaque)
+{
+ RamDiscardManagerClass *rdmc = RAM_DISCARD_MANAGER_GET_CLASS(rdm);
+
+ g_assert(rdmc->replay_populated);
+ return rdmc->replay_populated(rdm, section, replay_fn, opaque);
+}
+
+void ram_discard_manager_replay_discarded(const RamDiscardManager *rdm,
+ MemoryRegionSection *section,
+ ReplayRamDiscard replay_fn,
+ void *opaque)
+{
+ RamDiscardManagerClass *rdmc = RAM_DISCARD_MANAGER_GET_CLASS(rdm);
+
+ g_assert(rdmc->replay_discarded);
+ rdmc->replay_discarded(rdm, section, replay_fn, opaque);
+}
+
+void ram_discard_manager_register_listener(RamDiscardManager *rdm,
+ RamDiscardListener *rdl,
+ MemoryRegionSection *section)
+{
+ RamDiscardManagerClass *rdmc = RAM_DISCARD_MANAGER_GET_CLASS(rdm);
+
+ g_assert(rdmc->register_listener);
+ rdmc->register_listener(rdm, rdl, section);
+}
+
+void ram_discard_manager_unregister_listener(RamDiscardManager *rdm,
+ RamDiscardListener *rdl)
+{
+ RamDiscardManagerClass *rdmc = RAM_DISCARD_MANAGER_GET_CLASS(rdm);
+
+ g_assert(rdmc->unregister_listener);
+ rdmc->unregister_listener(rdm, rdl);
+}
+
+/* Called with rcu_read_lock held. */
+bool memory_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
+ ram_addr_t *ram_addr, bool *read_only,
+ bool *mr_has_discard_manager)
+{
+ MemoryRegion *mr;
+ hwaddr xlat;
+ hwaddr len = iotlb->addr_mask + 1;
+ bool writable = iotlb->perm & IOMMU_WO;
+
+ if (mr_has_discard_manager) {
+ *mr_has_discard_manager = false;
+ }
+ /*
+ * The IOMMU TLB entry we have just covers translation through
+ * this IOMMU to its immediate target. We need to translate
+ * it the rest of the way through to memory.
+ */
+ mr = address_space_translate(&address_space_memory, iotlb->translated_addr,
+ &xlat, &len, writable, MEMTXATTRS_UNSPECIFIED);
+ if (!memory_region_is_ram(mr)) {
+ error_report("iommu map to non memory area %" HWADDR_PRIx "", xlat);
+ return false;
+ } else if (memory_region_has_ram_discard_manager(mr)) {
+ RamDiscardManager *rdm = memory_region_get_ram_discard_manager(mr);
+ MemoryRegionSection tmp = {
+ .mr = mr,
+ .offset_within_region = xlat,
+ .size = int128_make64(len),
+ };
+ if (mr_has_discard_manager) {
+ *mr_has_discard_manager = true;
+ }
+ /*
+ * Malicious VMs can map memory into the IOMMU, which is expected
+ * to remain discarded. vfio will pin all pages, populating memory.
+ * Disallow that. vmstate priorities make sure any RamDiscardManager
+ * were already restored before IOMMUs are restored.
+ */
+ if (!ram_discard_manager_is_populated(rdm, &tmp)) {
+ error_report("iommu map to discarded memory (e.g., unplugged via"
+ " virtio-mem): %" HWADDR_PRIx "",
+ iotlb->translated_addr);
+ return false;
+ }
+ }
+
+ /*
+ * Translation truncates length to the IOMMU page size,
+ * check that it did not truncate too much.
+ */
+ if (len & iotlb->addr_mask) {
+ error_report("iommu has granularity incompatible with target AS");
+ return false;
+ }
+
+ if (vaddr) {
+ *vaddr = memory_region_get_ram_ptr(mr) + xlat;
+ }
+
+ if (ram_addr) {
+ *ram_addr = memory_region_get_ram_addr(mr) + xlat;
+ }
+
+ if (read_only) {
+ *read_only = !writable || mr->readonly;
+ }
+
+ return true;
+}
+
+void memory_region_set_log(MemoryRegion *mr, bool log, unsigned client)
+{
+ uint8_t mask = 1 << client;
+ uint8_t old_logging;
+
+ assert(client == DIRTY_MEMORY_VGA);
+ old_logging = mr->vga_logging_count;
+ mr->vga_logging_count += log ? 1 : -1;
+ if (!!old_logging == !!mr->vga_logging_count) {
+ return;
+ }
+
+ memory_region_transaction_begin();
+ mr->dirty_log_mask = (mr->dirty_log_mask & ~mask) | (log * mask);
+ memory_region_update_pending |= mr->enabled;
+ memory_region_transaction_commit();
+}
+
+void memory_region_set_dirty(MemoryRegion *mr, hwaddr addr,
+ hwaddr size)
+{
+ assert(mr->ram_block);
+ cpu_physical_memory_set_dirty_range(memory_region_get_ram_addr(mr) + addr,
+ size,
+ memory_region_get_dirty_log_mask(mr));
+}
+
+/*
+ * If memory region `mr' is NULL, do global sync. Otherwise, sync
+ * dirty bitmap for the specified memory region.
+ */
+static void memory_region_sync_dirty_bitmap(MemoryRegion *mr, bool last_stage)
+{
+ MemoryListener *listener;
+ AddressSpace *as;
+ FlatView *view;
+ FlatRange *fr;
+
+ /* If the same address space has multiple log_sync listeners, we
+ * visit that address space's FlatView multiple times. But because
+ * log_sync listeners are rare, it's still cheaper than walking each
+ * address space once.
+ */
+ QTAILQ_FOREACH(listener, &memory_listeners, link) {
+ if (listener->log_sync) {
+ as = listener->address_space;
+ view = address_space_get_flatview(as);
+ FOR_EACH_FLAT_RANGE(fr, view) {
+ if (fr->dirty_log_mask && (!mr || fr->mr == mr)) {
+ MemoryRegionSection mrs = section_from_flat_range(fr, view);
+ listener->log_sync(listener, &mrs);
+ }
+ }
+ flatview_unref(view);
+ trace_memory_region_sync_dirty(mr ? mr->name : "(all)", listener->name, 0);
+ } else if (listener->log_sync_global) {
+ /*
+ * No matter whether MR is specified, what we can do here
+ * is to do a global sync, because we are not capable to
+ * sync in a finer granularity.
+ */
+ listener->log_sync_global(listener, last_stage);
+ trace_memory_region_sync_dirty(mr ? mr->name : "(all)", listener->name, 1);
+ }
+ }
+}
+
+void memory_region_clear_dirty_bitmap(MemoryRegion *mr, hwaddr start,
+ hwaddr len)
+{
+ MemoryRegionSection mrs;
+ MemoryListener *listener;
+ AddressSpace *as;
+ FlatView *view;
+ FlatRange *fr;
+ hwaddr sec_start, sec_end, sec_size;
+
+ QTAILQ_FOREACH(listener, &memory_listeners, link) {
+ if (!listener->log_clear) {
+ continue;
+ }
+ as = listener->address_space;
+ view = address_space_get_flatview(as);
+ FOR_EACH_FLAT_RANGE(fr, view) {
+ if (!fr->dirty_log_mask || fr->mr != mr) {
+ /*
+ * Clear dirty bitmap operation only applies to those
+ * regions whose dirty logging is at least enabled
+ */
+ continue;
+ }
+
+ mrs = section_from_flat_range(fr, view);
+
+ sec_start = MAX(mrs.offset_within_region, start);
+ sec_end = mrs.offset_within_region + int128_get64(mrs.size);
+ sec_end = MIN(sec_end, start + len);
+
+ if (sec_start >= sec_end) {
+ /*
+ * If this memory region section has no intersection
+ * with the requested range, skip.
+ */
+ continue;
+ }
+
+ /* Valid case; shrink the section if needed */
+ mrs.offset_within_address_space +=
+ sec_start - mrs.offset_within_region;
+ mrs.offset_within_region = sec_start;
+ sec_size = sec_end - sec_start;
+ mrs.size = int128_make64(sec_size);
+ listener->log_clear(listener, &mrs);
+ }
+ flatview_unref(view);
+ }
+}
+
+DirtyBitmapSnapshot *memory_region_snapshot_and_clear_dirty(MemoryRegion *mr,
+ hwaddr addr,
+ hwaddr size,
+ unsigned client)
+{
+ DirtyBitmapSnapshot *snapshot;
+ assert(mr->ram_block);
+ memory_region_sync_dirty_bitmap(mr, false);
+ snapshot = cpu_physical_memory_snapshot_and_clear_dirty(mr, addr, size, client);
+ memory_global_after_dirty_log_sync();
+ return snapshot;
+}
+
+bool memory_region_snapshot_get_dirty(MemoryRegion *mr, DirtyBitmapSnapshot *snap,
+ hwaddr addr, hwaddr size)
+{
+ assert(mr->ram_block);
+ return cpu_physical_memory_snapshot_get_dirty(snap,
+ memory_region_get_ram_addr(mr) + addr, size);
+}
+
+void memory_region_set_readonly(MemoryRegion *mr, bool readonly)
+{
+ if (mr->readonly != readonly) {
+ memory_region_transaction_begin();
+ mr->readonly = readonly;
+ memory_region_update_pending |= mr->enabled;
+ memory_region_transaction_commit();
+ }
+}
+
+void memory_region_set_nonvolatile(MemoryRegion *mr, bool nonvolatile)
+{
+ if (mr->nonvolatile != nonvolatile) {
+ memory_region_transaction_begin();
+ mr->nonvolatile = nonvolatile;
+ memory_region_update_pending |= mr->enabled;
+ memory_region_transaction_commit();
+ }
+}
+
+void memory_region_rom_device_set_romd(MemoryRegion *mr, bool romd_mode)
+{
+ if (mr->romd_mode != romd_mode) {
+ memory_region_transaction_begin();
+ mr->romd_mode = romd_mode;
+ memory_region_update_pending |= mr->enabled;
+ memory_region_transaction_commit();
+ }
+}
+
+void memory_region_reset_dirty(MemoryRegion *mr, hwaddr addr,
+ hwaddr size, unsigned client)
+{
+ assert(mr->ram_block);
+ cpu_physical_memory_test_and_clear_dirty(
+ memory_region_get_ram_addr(mr) + addr, size, client);
+}
+
+int memory_region_get_fd(MemoryRegion *mr)
+{
+ RCU_READ_LOCK_GUARD();
+ while (mr->alias) {
+ mr = mr->alias;
+ }
+ return mr->ram_block->fd;
+}
+
+void *memory_region_get_ram_ptr(MemoryRegion *mr)
+{
+ uint64_t offset = 0;
+
+ RCU_READ_LOCK_GUARD();
+ while (mr->alias) {
+ offset += mr->alias_offset;
+ mr = mr->alias;
+ }
+ assert(mr->ram_block);
+ return qemu_map_ram_ptr(mr->ram_block, offset);
+}
+
+MemoryRegion *memory_region_from_host(void *ptr, ram_addr_t *offset)
+{
+ RAMBlock *block;
+
+ block = qemu_ram_block_from_host(ptr, false, offset);
+ if (!block) {
+ return NULL;
+ }
+
+ return block->mr;
+}
+
+ram_addr_t memory_region_get_ram_addr(MemoryRegion *mr)
+{
+ return mr->ram_block ? mr->ram_block->offset : RAM_ADDR_INVALID;
+}
+
+void memory_region_ram_resize(MemoryRegion *mr, ram_addr_t newsize, Error **errp)
+{
+ assert(mr->ram_block);
+
+ qemu_ram_resize(mr->ram_block, newsize, errp);
+}
+
+void memory_region_msync(MemoryRegion *mr, hwaddr addr, hwaddr size)
+{
+ if (mr->ram_block) {
+ qemu_ram_msync(mr->ram_block, addr, size);
+ }
+}
+
+void memory_region_writeback(MemoryRegion *mr, hwaddr addr, hwaddr size)
+{
+ /*
+ * Might be extended case needed to cover
+ * different types of memory regions
+ */
+ if (mr->dirty_log_mask) {
+ memory_region_msync(mr, addr, size);
+ }
+}
+
+/*
+ * Call proper memory listeners about the change on the newly
+ * added/removed CoalescedMemoryRange.
+ */
+static void memory_region_update_coalesced_range(MemoryRegion *mr,
+ CoalescedMemoryRange *cmr,
+ bool add)
+{
+ AddressSpace *as;
+ FlatView *view;
+ FlatRange *fr;
+
+ QTAILQ_FOREACH(as, &address_spaces, address_spaces_link) {
+ view = address_space_get_flatview(as);
+ FOR_EACH_FLAT_RANGE(fr, view) {
+ if (fr->mr == mr) {
+ flat_range_coalesced_io_notify(fr, as, cmr, add);
+ }
+ }
+ flatview_unref(view);
+ }
+}
+
+void memory_region_set_coalescing(MemoryRegion *mr)
+{
+ memory_region_clear_coalescing(mr);
+ memory_region_add_coalescing(mr, 0, int128_get64(mr->size));
+}
+
+void memory_region_add_coalescing(MemoryRegion *mr,
+ hwaddr offset,
+ uint64_t size)
+{
+ CoalescedMemoryRange *cmr = g_malloc(sizeof(*cmr));
+
+ cmr->addr = addrrange_make(int128_make64(offset), int128_make64(size));
+ QTAILQ_INSERT_TAIL(&mr->coalesced, cmr, link);
+ memory_region_update_coalesced_range(mr, cmr, true);
+ memory_region_set_flush_coalesced(mr);
+}
+
+void memory_region_clear_coalescing(MemoryRegion *mr)
+{
+ CoalescedMemoryRange *cmr;
+
+ if (QTAILQ_EMPTY(&mr->coalesced)) {
+ return;
+ }
+
+ qemu_flush_coalesced_mmio_buffer();
+ mr->flush_coalesced_mmio = false;
+
+ while (!QTAILQ_EMPTY(&mr->coalesced)) {
+ cmr = QTAILQ_FIRST(&mr->coalesced);
+ QTAILQ_REMOVE(&mr->coalesced, cmr, link);
+ memory_region_update_coalesced_range(mr, cmr, false);
+ g_free(cmr);
+ }
+}
+
+void memory_region_set_flush_coalesced(MemoryRegion *mr)
+{
+ mr->flush_coalesced_mmio = true;
+}
+
+void memory_region_clear_flush_coalesced(MemoryRegion *mr)
+{
+ qemu_flush_coalesced_mmio_buffer();
+ if (QTAILQ_EMPTY(&mr->coalesced)) {
+ mr->flush_coalesced_mmio = false;
+ }
+}
+
+static bool userspace_eventfd_warning;
+
+void memory_region_add_eventfd(MemoryRegion *mr,
+ hwaddr addr,
+ unsigned size,
+ bool match_data,
+ uint64_t data,
+ EventNotifier *e)
+{
+ MemoryRegionIoeventfd mrfd = {
+ .addr.start = int128_make64(addr),
+ .addr.size = int128_make64(size),
+ .match_data = match_data,
+ .data = data,
+ .e = e,
+ };
+ unsigned i;
+
+ if (kvm_enabled() && (!(kvm_eventfds_enabled() ||
+ userspace_eventfd_warning))) {
+ userspace_eventfd_warning = true;
+ error_report("Using eventfd without MMIO binding in KVM. "
+ "Suboptimal performance expected");
+ }
+
+ if (size) {
+ adjust_endianness(mr, &mrfd.data, size_memop(size) | MO_TE);
+ }
+ memory_region_transaction_begin();
+ for (i = 0; i < mr->ioeventfd_nb; ++i) {
+ if (memory_region_ioeventfd_before(&mrfd, &mr->ioeventfds[i])) {
+ break;
+ }
+ }
+ ++mr->ioeventfd_nb;
+ mr->ioeventfds = g_realloc(mr->ioeventfds,
+ sizeof(*mr->ioeventfds) * mr->ioeventfd_nb);
+ memmove(&mr->ioeventfds[i+1], &mr->ioeventfds[i],
+ sizeof(*mr->ioeventfds) * (mr->ioeventfd_nb-1 - i));
+ mr->ioeventfds[i] = mrfd;
+ ioeventfd_update_pending |= mr->enabled;
+ memory_region_transaction_commit();
+}
+
+void memory_region_del_eventfd(MemoryRegion *mr,
+ hwaddr addr,
+ unsigned size,
+ bool match_data,
+ uint64_t data,
+ EventNotifier *e)
+{
+ MemoryRegionIoeventfd mrfd = {
+ .addr.start = int128_make64(addr),
+ .addr.size = int128_make64(size),
+ .match_data = match_data,
+ .data = data,
+ .e = e,
+ };
+ unsigned i;
+
+ if (size) {
+ adjust_endianness(mr, &mrfd.data, size_memop(size) | MO_TE);
+ }
+ memory_region_transaction_begin();
+ for (i = 0; i < mr->ioeventfd_nb; ++i) {
+ if (memory_region_ioeventfd_equal(&mrfd, &mr->ioeventfds[i])) {
+ break;
+ }
+ }
+ assert(i != mr->ioeventfd_nb);
+ memmove(&mr->ioeventfds[i], &mr->ioeventfds[i+1],
+ sizeof(*mr->ioeventfds) * (mr->ioeventfd_nb - (i+1)));
+ --mr->ioeventfd_nb;
+ mr->ioeventfds = g_realloc(mr->ioeventfds,
+ sizeof(*mr->ioeventfds)*mr->ioeventfd_nb + 1);
+ ioeventfd_update_pending |= mr->enabled;
+ memory_region_transaction_commit();
+}
+
+static void memory_region_update_container_subregions(MemoryRegion *subregion)
+{
+ MemoryRegion *mr = subregion->container;
+ MemoryRegion *other;
+
+ memory_region_transaction_begin();
+
+ memory_region_ref(subregion);
+ QTAILQ_FOREACH(other, &mr->subregions, subregions_link) {
+ if (subregion->priority >= other->priority) {
+ QTAILQ_INSERT_BEFORE(other, subregion, subregions_link);
+ goto done;
+ }
+ }
+ QTAILQ_INSERT_TAIL(&mr->subregions, subregion, subregions_link);
+done:
+ memory_region_update_pending |= mr->enabled && subregion->enabled;
+ memory_region_transaction_commit();
+}
+
+static void memory_region_add_subregion_common(MemoryRegion *mr,
+ hwaddr offset,
+ MemoryRegion *subregion)
+{
+ MemoryRegion *alias;
+
+ assert(!subregion->container);
+ subregion->container = mr;
+ for (alias = subregion->alias; alias; alias = alias->alias) {
+ alias->mapped_via_alias++;
+ }
+ subregion->addr = offset;
+ memory_region_update_container_subregions(subregion);
+}
+
+void memory_region_add_subregion(MemoryRegion *mr,
+ hwaddr offset,
+ MemoryRegion *subregion)
+{
+ subregion->priority = 0;
+ memory_region_add_subregion_common(mr, offset, subregion);
+}
+
+void memory_region_add_subregion_overlap(MemoryRegion *mr,
+ hwaddr offset,
+ MemoryRegion *subregion,
+ int priority)
+{
+ subregion->priority = priority;
+ memory_region_add_subregion_common(mr, offset, subregion);
+}
+
+void memory_region_del_subregion(MemoryRegion *mr,
+ MemoryRegion *subregion)
+{
+ MemoryRegion *alias;
+
+ memory_region_transaction_begin();
+ assert(subregion->container == mr);
+ subregion->container = NULL;
+ for (alias = subregion->alias; alias; alias = alias->alias) {
+ alias->mapped_via_alias--;
+ assert(alias->mapped_via_alias >= 0);
+ }
+ QTAILQ_REMOVE(&mr->subregions, subregion, subregions_link);
+ memory_region_unref(subregion);
+ memory_region_update_pending |= mr->enabled && subregion->enabled;
+ memory_region_transaction_commit();
+}
+
+void memory_region_set_enabled(MemoryRegion *mr, bool enabled)
+{
+ if (enabled == mr->enabled) {
+ return;
+ }
+ memory_region_transaction_begin();
+ mr->enabled = enabled;
+ memory_region_update_pending = true;
+ memory_region_transaction_commit();
+}
+
+void memory_region_set_size(MemoryRegion *mr, uint64_t size)
+{
+ Int128 s = int128_make64(size);
+
+ if (size == UINT64_MAX) {
+ s = int128_2_64();
+ }
+ if (int128_eq(s, mr->size)) {
+ return;
+ }
+ memory_region_transaction_begin();
+ mr->size = s;
+ memory_region_update_pending = true;
+ memory_region_transaction_commit();
+}
+
+static void memory_region_readd_subregion(MemoryRegion *mr)
+{
+ MemoryRegion *container = mr->container;
+
+ if (container) {
+ memory_region_transaction_begin();
+ memory_region_ref(mr);
+ memory_region_del_subregion(container, mr);
+ memory_region_add_subregion_common(container, mr->addr, mr);
+ memory_region_unref(mr);
+ memory_region_transaction_commit();
+ }
+}
+
+void memory_region_set_address(MemoryRegion *mr, hwaddr addr)
+{
+ if (addr != mr->addr) {
+ mr->addr = addr;
+ memory_region_readd_subregion(mr);
+ }
+}
+
+void memory_region_set_alias_offset(MemoryRegion *mr, hwaddr offset)
+{
+ assert(mr->alias);
+
+ if (offset == mr->alias_offset) {
+ return;
+ }
+
+ memory_region_transaction_begin();
+ mr->alias_offset = offset;
+ memory_region_update_pending |= mr->enabled;
+ memory_region_transaction_commit();
+}
+
+uint64_t memory_region_get_alignment(const MemoryRegion *mr)
+{
+ return mr->align;
+}
+
+static int cmp_flatrange_addr(const void *addr_, const void *fr_)
+{
+ const AddrRange *addr = addr_;
+ const FlatRange *fr = fr_;
+
+ if (int128_le(addrrange_end(*addr), fr->addr.start)) {
+ return -1;
+ } else if (int128_ge(addr->start, addrrange_end(fr->addr))) {
+ return 1;
+ }
+ return 0;
+}
+
+static FlatRange *flatview_lookup(FlatView *view, AddrRange addr)
+{
+ return bsearch(&addr, view->ranges, view->nr,
+ sizeof(FlatRange), cmp_flatrange_addr);
+}
+
+bool memory_region_is_mapped(MemoryRegion *mr)
+{
+ return !!mr->container || mr->mapped_via_alias;
+}
+
+/* Same as memory_region_find, but it does not add a reference to the
+ * returned region. It must be called from an RCU critical section.
+ */
+static MemoryRegionSection memory_region_find_rcu(MemoryRegion *mr,
+ hwaddr addr, uint64_t size)
+{
+ MemoryRegionSection ret = { .mr = NULL };
+ MemoryRegion *root;
+ AddressSpace *as;
+ AddrRange range;
+ FlatView *view;
+ FlatRange *fr;
+
+ addr += mr->addr;
+ for (root = mr; root->container; ) {
+ root = root->container;
+ addr += root->addr;
+ }
+
+ as = memory_region_to_address_space(root);
+ if (!as) {
+ return ret;
+ }
+ range = addrrange_make(int128_make64(addr), int128_make64(size));
+
+ view = address_space_to_flatview(as);
+ fr = flatview_lookup(view, range);
+ if (!fr) {
+ return ret;
+ }
+
+ while (fr > view->ranges && addrrange_intersects(fr[-1].addr, range)) {
+ --fr;
+ }
+
+ ret.mr = fr->mr;
+ ret.fv = view;
+ range = addrrange_intersection(range, fr->addr);
+ ret.offset_within_region = fr->offset_in_region;
+ ret.offset_within_region += int128_get64(int128_sub(range.start,
+ fr->addr.start));
+ ret.size = range.size;
+ ret.offset_within_address_space = int128_get64(range.start);
+ ret.readonly = fr->readonly;
+ ret.nonvolatile = fr->nonvolatile;
+ return ret;
+}
+
+MemoryRegionSection memory_region_find(MemoryRegion *mr,
+ hwaddr addr, uint64_t size)
+{
+ MemoryRegionSection ret;
+ RCU_READ_LOCK_GUARD();
+ ret = memory_region_find_rcu(mr, addr, size);
+ if (ret.mr) {
+ memory_region_ref(ret.mr);
+ }
+ return ret;
+}
+
+MemoryRegionSection *memory_region_section_new_copy(MemoryRegionSection *s)
+{
+ MemoryRegionSection *tmp = g_new(MemoryRegionSection, 1);
+
+ *tmp = *s;
+ if (tmp->mr) {
+ memory_region_ref(tmp->mr);
+ }
+ if (tmp->fv) {
+ bool ret = flatview_ref(tmp->fv);
+
+ g_assert(ret);
+ }
+ return tmp;
+}
+
+void memory_region_section_free_copy(MemoryRegionSection *s)
+{
+ if (s->fv) {
+ flatview_unref(s->fv);
+ }
+ if (s->mr) {
+ memory_region_unref(s->mr);
+ }
+ g_free(s);
+}
+
+bool memory_region_present(MemoryRegion *container, hwaddr addr)
+{
+ MemoryRegion *mr;
+
+ RCU_READ_LOCK_GUARD();
+ mr = memory_region_find_rcu(container, addr, 1).mr;
+ return mr && mr != container;
+}
+
+void memory_global_dirty_log_sync(bool last_stage)
+{
+ memory_region_sync_dirty_bitmap(NULL, last_stage);
+}
+
+void memory_global_after_dirty_log_sync(void)
+{
+ MEMORY_LISTENER_CALL_GLOBAL(log_global_after_sync, Forward);
+}
+
+/*
+ * Dirty track stop flags that are postponed due to VM being stopped. Should
+ * only be used within vmstate_change hook.
+ */
+static unsigned int postponed_stop_flags;
+static VMChangeStateEntry *vmstate_change;
+static void memory_global_dirty_log_stop_postponed_run(void);
+
+void memory_global_dirty_log_start(unsigned int flags)
+{
+ unsigned int old_flags;
+
+ assert(flags && !(flags & (~GLOBAL_DIRTY_MASK)));
+
+ if (vmstate_change) {
+ /* If there is postponed stop(), operate on it first */
+ postponed_stop_flags &= ~flags;
+ memory_global_dirty_log_stop_postponed_run();
+ }
+
+ flags &= ~global_dirty_tracking;
+ if (!flags) {
+ return;
+ }
+
+ old_flags = global_dirty_tracking;
+ global_dirty_tracking |= flags;
+ trace_global_dirty_changed(global_dirty_tracking);
+
+ if (!old_flags) {
+ MEMORY_LISTENER_CALL_GLOBAL(log_global_start, Forward);
+ memory_region_transaction_begin();
+ memory_region_update_pending = true;
+ memory_region_transaction_commit();
+ }
+}
+
+static void memory_global_dirty_log_do_stop(unsigned int flags)
+{
+ assert(flags && !(flags & (~GLOBAL_DIRTY_MASK)));
+ assert((global_dirty_tracking & flags) == flags);
+ global_dirty_tracking &= ~flags;
+
+ trace_global_dirty_changed(global_dirty_tracking);
+
+ if (!global_dirty_tracking) {
+ memory_region_transaction_begin();
+ memory_region_update_pending = true;
+ memory_region_transaction_commit();
+ MEMORY_LISTENER_CALL_GLOBAL(log_global_stop, Reverse);
+ }
+}
+
+/*
+ * Execute the postponed dirty log stop operations if there is, then reset
+ * everything (including the flags and the vmstate change hook).
+ */
+static void memory_global_dirty_log_stop_postponed_run(void)
+{
+ /* This must be called with the vmstate handler registered */
+ assert(vmstate_change);
+
+ /* Note: postponed_stop_flags can be cleared in log start routine */
+ if (postponed_stop_flags) {
+ memory_global_dirty_log_do_stop(postponed_stop_flags);
+ postponed_stop_flags = 0;
+ }
+
+ qemu_del_vm_change_state_handler(vmstate_change);
+ vmstate_change = NULL;
+}
+
+static void memory_vm_change_state_handler(void *opaque, bool running,
+ RunState state)
+{
+ if (running) {
+ memory_global_dirty_log_stop_postponed_run();
+ }
+}
+
+void memory_global_dirty_log_stop(unsigned int flags)
+{
+ if (!runstate_is_running()) {
+ /* Postpone the dirty log stop, e.g., to when VM starts again */
+ if (vmstate_change) {
+ /* Batch with previous postponed flags */
+ postponed_stop_flags |= flags;
+ } else {
+ postponed_stop_flags = flags;
+ vmstate_change = qemu_add_vm_change_state_handler(
+ memory_vm_change_state_handler, NULL);
+ }
+ return;
+ }
+
+ memory_global_dirty_log_do_stop(flags);
+}
+
+static void listener_add_address_space(MemoryListener *listener,
+ AddressSpace *as)
+{
+ FlatView *view;
+ FlatRange *fr;
+
+ if (listener->begin) {
+ listener->begin(listener);
+ }
+ if (global_dirty_tracking) {
+ if (listener->log_global_start) {
+ listener->log_global_start(listener);
+ }
+ }
+
+ view = address_space_get_flatview(as);
+ FOR_EACH_FLAT_RANGE(fr, view) {
+ MemoryRegionSection section = section_from_flat_range(fr, view);
+
+ if (listener->region_add) {
+ listener->region_add(listener, &section);
+ }
+ if (fr->dirty_log_mask && listener->log_start) {
+ listener->log_start(listener, &section, 0, fr->dirty_log_mask);
+ }
+ }
+ if (listener->commit) {
+ listener->commit(listener);
+ }
+ flatview_unref(view);
+}
+
+static void listener_del_address_space(MemoryListener *listener,
+ AddressSpace *as)
+{
+ FlatView *view;
+ FlatRange *fr;
+
+ if (listener->begin) {
+ listener->begin(listener);
+ }
+ view = address_space_get_flatview(as);
+ FOR_EACH_FLAT_RANGE(fr, view) {
+ MemoryRegionSection section = section_from_flat_range(fr, view);
+
+ if (fr->dirty_log_mask && listener->log_stop) {
+ listener->log_stop(listener, &section, fr->dirty_log_mask, 0);
+ }
+ if (listener->region_del) {
+ listener->region_del(listener, &section);
+ }
+ }
+ if (listener->commit) {
+ listener->commit(listener);
+ }
+ flatview_unref(view);
+}
+
+void memory_listener_register(MemoryListener *listener, AddressSpace *as)
+{
+ MemoryListener *other = NULL;
+
+ /* Only one of them can be defined for a listener */
+ assert(!(listener->log_sync && listener->log_sync_global));
+
+ listener->address_space = as;
+ if (QTAILQ_EMPTY(&memory_listeners)
+ || listener->priority >= QTAILQ_LAST(&memory_listeners)->priority) {
+ QTAILQ_INSERT_TAIL(&memory_listeners, listener, link);
+ } else {
+ QTAILQ_FOREACH(other, &memory_listeners, link) {
+ if (listener->priority < other->priority) {
+ break;
+ }
+ }
+ QTAILQ_INSERT_BEFORE(other, listener, link);
+ }
+
+ if (QTAILQ_EMPTY(&as->listeners)
+ || listener->priority >= QTAILQ_LAST(&as->listeners)->priority) {
+ QTAILQ_INSERT_TAIL(&as->listeners, listener, link_as);
+ } else {
+ QTAILQ_FOREACH(other, &as->listeners, link_as) {
+ if (listener->priority < other->priority) {
+ break;
+ }
+ }
+ QTAILQ_INSERT_BEFORE(other, listener, link_as);
+ }
+
+ listener_add_address_space(listener, as);
+
+ if (listener->eventfd_add || listener->eventfd_del) {
+ as->ioeventfd_notifiers++;
+ }
+}
+
+void memory_listener_unregister(MemoryListener *listener)
+{
+ if (!listener->address_space) {
+ return;
+ }
+
+ if (listener->eventfd_add || listener->eventfd_del) {
+ listener->address_space->ioeventfd_notifiers--;
+ }
+
+ listener_del_address_space(listener, listener->address_space);
+ QTAILQ_REMOVE(&memory_listeners, listener, link);
+ QTAILQ_REMOVE(&listener->address_space->listeners, listener, link_as);
+ listener->address_space = NULL;
+}
+
+void address_space_remove_listeners(AddressSpace *as)
+{
+ while (!QTAILQ_EMPTY(&as->listeners)) {
+ memory_listener_unregister(QTAILQ_FIRST(&as->listeners));
+ }
+}
+
+void address_space_init(AddressSpace *as, MemoryRegion *root, const char *name)
+{
+ memory_region_ref(root);
+ as->root = root;
+ as->current_map = NULL;
+ as->ioeventfd_nb = 0;
+ as->ioeventfds = NULL;
+ QTAILQ_INIT(&as->listeners);
+ QTAILQ_INSERT_TAIL(&address_spaces, as, address_spaces_link);
+ as->name = g_strdup(name ? name : "anonymous");
+ address_space_update_topology(as);
+ address_space_update_ioeventfds(as);
+}
+
+static void do_address_space_destroy(AddressSpace *as)
+{
+ assert(QTAILQ_EMPTY(&as->listeners));
+
+ flatview_unref(as->current_map);
+ g_free(as->name);
+ g_free(as->ioeventfds);
+ memory_region_unref(as->root);
+}
+
+void address_space_destroy(AddressSpace *as)
+{
+ MemoryRegion *root = as->root;
+
+ /* Flush out anything from MemoryListeners listening in on this */
+ memory_region_transaction_begin();
+ as->root = NULL;
+ memory_region_transaction_commit();
+ QTAILQ_REMOVE(&address_spaces, as, address_spaces_link);
+
+ /* At this point, as->dispatch and as->current_map are dummy
+ * entries that the guest should never use. Wait for the old
+ * values to expire before freeing the data.
+ */
+ as->root = root;
+ call_rcu(as, do_address_space_destroy, rcu);
+}
+
+static const char *memory_region_type(MemoryRegion *mr)
+{
+ if (mr->alias) {
+ return memory_region_type(mr->alias);
+ }
+ if (memory_region_is_ram_device(mr)) {
+ return "ramd";
+ } else if (memory_region_is_romd(mr)) {
+ return "romd";
+ } else if (memory_region_is_rom(mr)) {
+ return "rom";
+ } else if (memory_region_is_ram(mr)) {
+ return "ram";
+ } else {
+ return "i/o";
+ }
+}
+
+typedef struct MemoryRegionList MemoryRegionList;
+
+struct MemoryRegionList {
+ const MemoryRegion *mr;
+ QTAILQ_ENTRY(MemoryRegionList) mrqueue;
+};
+
+typedef QTAILQ_HEAD(, MemoryRegionList) MemoryRegionListHead;
+
+#define MR_SIZE(size) (int128_nz(size) ? (hwaddr)int128_get64( \
+ int128_sub((size), int128_one())) : 0)
+#define MTREE_INDENT " "
+
+static void mtree_expand_owner(const char *label, Object *obj)
+{
+ DeviceState *dev = (DeviceState *) object_dynamic_cast(obj, TYPE_DEVICE);
+
+ qemu_printf(" %s:{%s", label, dev ? "dev" : "obj");
+ if (dev && dev->id) {
+ qemu_printf(" id=%s", dev->id);
+ } else {
+ char *canonical_path = object_get_canonical_path(obj);
+ if (canonical_path) {
+ qemu_printf(" path=%s", canonical_path);
+ g_free(canonical_path);
+ } else {
+ qemu_printf(" type=%s", object_get_typename(obj));
+ }
+ }
+ qemu_printf("}");
+}
+
+static void mtree_print_mr_owner(const MemoryRegion *mr)
+{
+ Object *owner = mr->owner;
+ Object *parent = memory_region_owner((MemoryRegion *)mr);
+
+ if (!owner && !parent) {
+ qemu_printf(" orphan");
+ return;
+ }
+ if (owner) {
+ mtree_expand_owner("owner", owner);
+ }
+ if (parent && parent != owner) {
+ mtree_expand_owner("parent", parent);
+ }
+}
+
+static void mtree_print_mr(const MemoryRegion *mr, unsigned int level,
+ hwaddr base,
+ MemoryRegionListHead *alias_print_queue,
+ bool owner, bool display_disabled)
+{
+ MemoryRegionList *new_ml, *ml, *next_ml;
+ MemoryRegionListHead submr_print_queue;
+ const MemoryRegion *submr;
+ unsigned int i;
+ hwaddr cur_start, cur_end;
+
+ if (!mr) {
+ return;
+ }
+
+ cur_start = base + mr->addr;
+ cur_end = cur_start + MR_SIZE(mr->size);
+
+ /*
+ * Try to detect overflow of memory region. This should never
+ * happen normally. When it happens, we dump something to warn the
+ * user who is observing this.
+ */
+ if (cur_start < base || cur_end < cur_start) {
+ qemu_printf("[DETECTED OVERFLOW!] ");
+ }
+
+ if (mr->alias) {
+ bool found = false;
+
+ /* check if the alias is already in the queue */
+ QTAILQ_FOREACH(ml, alias_print_queue, mrqueue) {
+ if (ml->mr == mr->alias) {
+ found = true;
+ }
+ }
+
+ if (!found) {
+ ml = g_new(MemoryRegionList, 1);
+ ml->mr = mr->alias;
+ QTAILQ_INSERT_TAIL(alias_print_queue, ml, mrqueue);
+ }
+ if (mr->enabled || display_disabled) {
+ for (i = 0; i < level; i++) {
+ qemu_printf(MTREE_INDENT);
+ }
+ qemu_printf(HWADDR_FMT_plx "-" HWADDR_FMT_plx
+ " (prio %d, %s%s): alias %s @%s " HWADDR_FMT_plx
+ "-" HWADDR_FMT_plx "%s",
+ cur_start, cur_end,
+ mr->priority,
+ mr->nonvolatile ? "nv-" : "",
+ memory_region_type((MemoryRegion *)mr),
+ memory_region_name(mr),
+ memory_region_name(mr->alias),
+ mr->alias_offset,
+ mr->alias_offset + MR_SIZE(mr->size),
+ mr->enabled ? "" : " [disabled]");
+ if (owner) {
+ mtree_print_mr_owner(mr);
+ }
+ qemu_printf("\n");
+ }
+ } else {
+ if (mr->enabled || display_disabled) {
+ for (i = 0; i < level; i++) {
+ qemu_printf(MTREE_INDENT);
+ }
+ qemu_printf(HWADDR_FMT_plx "-" HWADDR_FMT_plx
+ " (prio %d, %s%s): %s%s",
+ cur_start, cur_end,
+ mr->priority,
+ mr->nonvolatile ? "nv-" : "",
+ memory_region_type((MemoryRegion *)mr),
+ memory_region_name(mr),
+ mr->enabled ? "" : " [disabled]");
+ if (owner) {
+ mtree_print_mr_owner(mr);
+ }
+ qemu_printf("\n");
+ }
+ }
+
+ QTAILQ_INIT(&submr_print_queue);
+
+ QTAILQ_FOREACH(submr, &mr->subregions, subregions_link) {
+ new_ml = g_new(MemoryRegionList, 1);
+ new_ml->mr = submr;
+ QTAILQ_FOREACH(ml, &submr_print_queue, mrqueue) {
+ if (new_ml->mr->addr < ml->mr->addr ||
+ (new_ml->mr->addr == ml->mr->addr &&
+ new_ml->mr->priority > ml->mr->priority)) {
+ QTAILQ_INSERT_BEFORE(ml, new_ml, mrqueue);
+ new_ml = NULL;
+ break;
+ }
+ }
+ if (new_ml) {
+ QTAILQ_INSERT_TAIL(&submr_print_queue, new_ml, mrqueue);
+ }
+ }
+
+ QTAILQ_FOREACH(ml, &submr_print_queue, mrqueue) {
+ mtree_print_mr(ml->mr, level + 1, cur_start,
+ alias_print_queue, owner, display_disabled);
+ }
+
+ QTAILQ_FOREACH_SAFE(ml, &submr_print_queue, mrqueue, next_ml) {
+ g_free(ml);
+ }
+}
+
+struct FlatViewInfo {
+ int counter;
+ bool dispatch_tree;
+ bool owner;
+ AccelClass *ac;
+};
+
+static void mtree_print_flatview(gpointer key, gpointer value,
+ gpointer user_data)
+{
+ FlatView *view = key;
+ GArray *fv_address_spaces = value;
+ struct FlatViewInfo *fvi = user_data;
+ FlatRange *range = &view->ranges[0];
+ MemoryRegion *mr;
+ int n = view->nr;
+ int i;
+ AddressSpace *as;
+
+ qemu_printf("FlatView #%d\n", fvi->counter);
+ ++fvi->counter;
+
+ for (i = 0; i < fv_address_spaces->len; ++i) {
+ as = g_array_index(fv_address_spaces, AddressSpace*, i);
+ qemu_printf(" AS \"%s\", root: %s",
+ as->name, memory_region_name(as->root));
+ if (as->root->alias) {
+ qemu_printf(", alias %s", memory_region_name(as->root->alias));
+ }
+ qemu_printf("\n");
+ }
+
+ qemu_printf(" Root memory region: %s\n",
+ view->root ? memory_region_name(view->root) : "(none)");
+
+ if (n <= 0) {
+ qemu_printf(MTREE_INDENT "No rendered FlatView\n\n");
+ return;
+ }
+
+ while (n--) {
+ mr = range->mr;
+ if (range->offset_in_region) {
+ qemu_printf(MTREE_INDENT HWADDR_FMT_plx "-" HWADDR_FMT_plx
+ " (prio %d, %s%s): %s @" HWADDR_FMT_plx,
+ int128_get64(range->addr.start),
+ int128_get64(range->addr.start)
+ + MR_SIZE(range->addr.size),
+ mr->priority,
+ range->nonvolatile ? "nv-" : "",
+ range->readonly ? "rom" : memory_region_type(mr),
+ memory_region_name(mr),
+ range->offset_in_region);
+ } else {
+ qemu_printf(MTREE_INDENT HWADDR_FMT_plx "-" HWADDR_FMT_plx
+ " (prio %d, %s%s): %s",
+ int128_get64(range->addr.start),
+ int128_get64(range->addr.start)
+ + MR_SIZE(range->addr.size),
+ mr->priority,
+ range->nonvolatile ? "nv-" : "",
+ range->readonly ? "rom" : memory_region_type(mr),
+ memory_region_name(mr));
+ }
+ if (fvi->owner) {
+ mtree_print_mr_owner(mr);
+ }
+
+ if (fvi->ac) {
+ for (i = 0; i < fv_address_spaces->len; ++i) {
+ as = g_array_index(fv_address_spaces, AddressSpace*, i);
+ if (fvi->ac->has_memory(current_machine, as,
+ int128_get64(range->addr.start),
+ MR_SIZE(range->addr.size) + 1)) {
+ qemu_printf(" %s", fvi->ac->name);
+ }
+ }
+ }
+ qemu_printf("\n");
+ range++;
+ }
+
+#if !defined(CONFIG_USER_ONLY)
+ if (fvi->dispatch_tree && view->root) {
+ mtree_print_dispatch(view->dispatch, view->root);
+ }
+#endif
+
+ qemu_printf("\n");
+}
+
+static gboolean mtree_info_flatview_free(gpointer key, gpointer value,
+ gpointer user_data)
+{
+ FlatView *view = key;
+ GArray *fv_address_spaces = value;
+
+ g_array_unref(fv_address_spaces);
+ flatview_unref(view);
+
+ return true;
+}
+
+static void mtree_info_flatview(bool dispatch_tree, bool owner)
+{
+ struct FlatViewInfo fvi = {
+ .counter = 0,
+ .dispatch_tree = dispatch_tree,
+ .owner = owner,
+ };
+ AddressSpace *as;
+ FlatView *view;
+ GArray *fv_address_spaces;
+ GHashTable *views = g_hash_table_new(g_direct_hash, g_direct_equal);
+ AccelClass *ac = ACCEL_GET_CLASS(current_accel());
+
+ if (ac->has_memory) {
+ fvi.ac = ac;
+ }
+
+ /* Gather all FVs in one table */
+ QTAILQ_FOREACH(as, &address_spaces, address_spaces_link) {
+ view = address_space_get_flatview(as);
+
+ fv_address_spaces = g_hash_table_lookup(views, view);
+ if (!fv_address_spaces) {
+ fv_address_spaces = g_array_new(false, false, sizeof(as));
+ g_hash_table_insert(views, view, fv_address_spaces);
+ }
+
+ g_array_append_val(fv_address_spaces, as);
+ }
+
+ /* Print */
+ g_hash_table_foreach(views, mtree_print_flatview, &fvi);
+
+ /* Free */
+ g_hash_table_foreach_remove(views, mtree_info_flatview_free, 0);
+ g_hash_table_unref(views);
+}
+
+struct AddressSpaceInfo {
+ MemoryRegionListHead *ml_head;
+ bool owner;
+ bool disabled;
+};
+
+/* Returns negative value if a < b; zero if a = b; positive value if a > b. */
+static gint address_space_compare_name(gconstpointer a, gconstpointer b)
+{
+ const AddressSpace *as_a = a;
+ const AddressSpace *as_b = b;
+
+ return g_strcmp0(as_a->name, as_b->name);
+}
+
+static void mtree_print_as_name(gpointer data, gpointer user_data)
+{
+ AddressSpace *as = data;
+
+ qemu_printf("address-space: %s\n", as->name);
+}
+
+static void mtree_print_as(gpointer key, gpointer value, gpointer user_data)
+{
+ MemoryRegion *mr = key;
+ GSList *as_same_root_mr_list = value;
+ struct AddressSpaceInfo *asi = user_data;
+
+ g_slist_foreach(as_same_root_mr_list, mtree_print_as_name, NULL);
+ mtree_print_mr(mr, 1, 0, asi->ml_head, asi->owner, asi->disabled);
+ qemu_printf("\n");
+}
+
+static gboolean mtree_info_as_free(gpointer key, gpointer value,
+ gpointer user_data)
+{
+ GSList *as_same_root_mr_list = value;
+
+ g_slist_free(as_same_root_mr_list);
+
+ return true;
+}
+
+static void mtree_info_as(bool dispatch_tree, bool owner, bool disabled)
+{
+ MemoryRegionListHead ml_head;
+ MemoryRegionList *ml, *ml2;
+ AddressSpace *as;
+ GHashTable *views = g_hash_table_new(g_direct_hash, g_direct_equal);
+ GSList *as_same_root_mr_list;
+ struct AddressSpaceInfo asi = {
+ .ml_head = &ml_head,
+ .owner = owner,
+ .disabled = disabled,
+ };
+
+ QTAILQ_INIT(&ml_head);
+
+ QTAILQ_FOREACH(as, &address_spaces, address_spaces_link) {
+ /* Create hashtable, key=AS root MR, value = list of AS */
+ as_same_root_mr_list = g_hash_table_lookup(views, as->root);
+ as_same_root_mr_list = g_slist_insert_sorted(as_same_root_mr_list, as,
+ address_space_compare_name);
+ g_hash_table_insert(views, as->root, as_same_root_mr_list);
+ }
+
+ /* print address spaces */
+ g_hash_table_foreach(views, mtree_print_as, &asi);
+ g_hash_table_foreach_remove(views, mtree_info_as_free, 0);
+ g_hash_table_unref(views);
+
+ /* print aliased regions */
+ QTAILQ_FOREACH(ml, &ml_head, mrqueue) {
+ qemu_printf("memory-region: %s\n", memory_region_name(ml->mr));
+ mtree_print_mr(ml->mr, 1, 0, &ml_head, owner, disabled);
+ qemu_printf("\n");
+ }
+
+ QTAILQ_FOREACH_SAFE(ml, &ml_head, mrqueue, ml2) {
+ g_free(ml);
+ }
+}
+
+void mtree_info(bool flatview, bool dispatch_tree, bool owner, bool disabled)
+{
+ if (flatview) {
+ mtree_info_flatview(dispatch_tree, owner);
+ } else {
+ mtree_info_as(dispatch_tree, owner, disabled);
+ }
+}
+
+void memory_region_init_ram(MemoryRegion *mr,
+ Object *owner,
+ const char *name,
+ uint64_t size,
+ Error **errp)
+{
+ DeviceState *owner_dev;
+ Error *err = NULL;
+
+ memory_region_init_ram_nomigrate(mr, owner, name, size, &err);
+ if (err) {
+ error_propagate(errp, err);
+ return;
+ }
+ /* This will assert if owner is neither NULL nor a DeviceState.
+ * We only want the owner here for the purposes of defining a
+ * unique name for migration. TODO: Ideally we should implement
+ * a naming scheme for Objects which are not DeviceStates, in
+ * which case we can relax this restriction.
+ */
+ owner_dev = DEVICE(owner);
+ vmstate_register_ram(mr, owner_dev);
+}
+
+void memory_region_init_rom(MemoryRegion *mr,
+ Object *owner,
+ const char *name,
+ uint64_t size,
+ Error **errp)
+{
+ DeviceState *owner_dev;
+ Error *err = NULL;
+
+ memory_region_init_rom_nomigrate(mr, owner, name, size, &err);
+ if (err) {
+ error_propagate(errp, err);
+ return;
+ }
+ /* This will assert if owner is neither NULL nor a DeviceState.
+ * We only want the owner here for the purposes of defining a
+ * unique name for migration. TODO: Ideally we should implement
+ * a naming scheme for Objects which are not DeviceStates, in
+ * which case we can relax this restriction.
+ */
+ owner_dev = DEVICE(owner);
+ vmstate_register_ram(mr, owner_dev);
+}
+
+void memory_region_init_rom_device(MemoryRegion *mr,
+ Object *owner,
+ const MemoryRegionOps *ops,
+ void *opaque,
+ const char *name,
+ uint64_t size,
+ Error **errp)
+{
+ DeviceState *owner_dev;
+ Error *err = NULL;
+
+ memory_region_init_rom_device_nomigrate(mr, owner, ops, opaque,
+ name, size, &err);
+ if (err) {
+ error_propagate(errp, err);
+ return;
+ }
+ /* This will assert if owner is neither NULL nor a DeviceState.
+ * We only want the owner here for the purposes of defining a
+ * unique name for migration. TODO: Ideally we should implement
+ * a naming scheme for Objects which are not DeviceStates, in
+ * which case we can relax this restriction.
+ */
+ owner_dev = DEVICE(owner);
+ vmstate_register_ram(mr, owner_dev);
+}
+
+/*
+ * Support system builds with CONFIG_FUZZ using a weak symbol and a stub for
+ * the fuzz_dma_read_cb callback
+ */
+#ifdef CONFIG_FUZZ
+void __attribute__((weak)) fuzz_dma_read_cb(size_t addr,
+ size_t len,
+ MemoryRegion *mr)
+{
+}
+#endif
+
+static const TypeInfo memory_region_info = {
+ .parent = TYPE_OBJECT,
+ .name = TYPE_MEMORY_REGION,
+ .class_size = sizeof(MemoryRegionClass),
+ .instance_size = sizeof(MemoryRegion),
+ .instance_init = memory_region_initfn,
+ .instance_finalize = memory_region_finalize,
+};
+
+static const TypeInfo iommu_memory_region_info = {
+ .parent = TYPE_MEMORY_REGION,
+ .name = TYPE_IOMMU_MEMORY_REGION,
+ .class_size = sizeof(IOMMUMemoryRegionClass),
+ .instance_size = sizeof(IOMMUMemoryRegion),
+ .instance_init = iommu_memory_region_initfn,
+ .abstract = true,
+};
+
+static const TypeInfo ram_discard_manager_info = {
+ .parent = TYPE_INTERFACE,
+ .name = TYPE_RAM_DISCARD_MANAGER,
+ .class_size = sizeof(RamDiscardManagerClass),
+};
+
+static void memory_register_types(void)
+{
+ type_register_static(&memory_region_info);
+ type_register_static(&iommu_memory_region_info);
+ type_register_static(&ram_discard_manager_info);
+}
+
+type_init(memory_register_types)
diff --git a/system/memory_mapping.c b/system/memory_mapping.c
new file mode 100644
index 0000000000..d7f1d096e0
--- /dev/null
+++ b/system/memory_mapping.c
@@ -0,0 +1,377 @@
+/*
+ * QEMU memory mapping
+ *
+ * Copyright Fujitsu, Corp. 2011, 2012
+ *
+ * Authors:
+ * Wen Congyang <wency@cn.fujitsu.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+
+#include "sysemu/memory_mapping.h"
+#include "exec/memory.h"
+#include "exec/address-spaces.h"
+#include "hw/core/cpu.h"
+
+//#define DEBUG_GUEST_PHYS_REGION_ADD
+
+static void memory_mapping_list_add_mapping_sorted(MemoryMappingList *list,
+ MemoryMapping *mapping)
+{
+ MemoryMapping *p;
+
+ QTAILQ_FOREACH(p, &list->head, next) {
+ if (p->phys_addr >= mapping->phys_addr) {
+ QTAILQ_INSERT_BEFORE(p, mapping, next);
+ return;
+ }
+ }
+ QTAILQ_INSERT_TAIL(&list->head, mapping, next);
+}
+
+static void create_new_memory_mapping(MemoryMappingList *list,
+ hwaddr phys_addr,
+ hwaddr virt_addr,
+ ram_addr_t length)
+{
+ MemoryMapping *memory_mapping;
+
+ memory_mapping = g_new(MemoryMapping, 1);
+ memory_mapping->phys_addr = phys_addr;
+ memory_mapping->virt_addr = virt_addr;
+ memory_mapping->length = length;
+ list->last_mapping = memory_mapping;
+ list->num++;
+ memory_mapping_list_add_mapping_sorted(list, memory_mapping);
+}
+
+static inline bool mapping_contiguous(MemoryMapping *map,
+ hwaddr phys_addr,
+ hwaddr virt_addr)
+{
+ return phys_addr == map->phys_addr + map->length &&
+ virt_addr == map->virt_addr + map->length;
+}
+
+/*
+ * [map->phys_addr, map->phys_addr + map->length) and
+ * [phys_addr, phys_addr + length) have intersection?
+ */
+static inline bool mapping_have_same_region(MemoryMapping *map,
+ hwaddr phys_addr,
+ ram_addr_t length)
+{
+ return !(phys_addr + length < map->phys_addr ||
+ phys_addr >= map->phys_addr + map->length);
+}
+
+/*
+ * [map->phys_addr, map->phys_addr + map->length) and
+ * [phys_addr, phys_addr + length) have intersection. The virtual address in the
+ * intersection are the same?
+ */
+static inline bool mapping_conflict(MemoryMapping *map,
+ hwaddr phys_addr,
+ hwaddr virt_addr)
+{
+ return virt_addr - map->virt_addr != phys_addr - map->phys_addr;
+}
+
+/*
+ * [map->virt_addr, map->virt_addr + map->length) and
+ * [virt_addr, virt_addr + length) have intersection. And the physical address
+ * in the intersection are the same.
+ */
+static inline void mapping_merge(MemoryMapping *map,
+ hwaddr virt_addr,
+ ram_addr_t length)
+{
+ if (virt_addr < map->virt_addr) {
+ map->length += map->virt_addr - virt_addr;
+ map->virt_addr = virt_addr;
+ }
+
+ if ((virt_addr + length) >
+ (map->virt_addr + map->length)) {
+ map->length = virt_addr + length - map->virt_addr;
+ }
+}
+
+void memory_mapping_list_add_merge_sorted(MemoryMappingList *list,
+ hwaddr phys_addr,
+ hwaddr virt_addr,
+ ram_addr_t length)
+{
+ MemoryMapping *memory_mapping, *last_mapping;
+
+ if (QTAILQ_EMPTY(&list->head)) {
+ create_new_memory_mapping(list, phys_addr, virt_addr, length);
+ return;
+ }
+
+ last_mapping = list->last_mapping;
+ if (last_mapping) {
+ if (mapping_contiguous(last_mapping, phys_addr, virt_addr)) {
+ last_mapping->length += length;
+ return;
+ }
+ }
+
+ QTAILQ_FOREACH(memory_mapping, &list->head, next) {
+ if (mapping_contiguous(memory_mapping, phys_addr, virt_addr)) {
+ memory_mapping->length += length;
+ list->last_mapping = memory_mapping;
+ return;
+ }
+
+ if (phys_addr + length < memory_mapping->phys_addr) {
+ /* create a new region before memory_mapping */
+ break;
+ }
+
+ if (mapping_have_same_region(memory_mapping, phys_addr, length)) {
+ if (mapping_conflict(memory_mapping, phys_addr, virt_addr)) {
+ continue;
+ }
+
+ /* merge this region into memory_mapping */
+ mapping_merge(memory_mapping, virt_addr, length);
+ list->last_mapping = memory_mapping;
+ return;
+ }
+ }
+
+ /* this region can not be merged into any existed memory mapping. */
+ create_new_memory_mapping(list, phys_addr, virt_addr, length);
+}
+
+void memory_mapping_list_free(MemoryMappingList *list)
+{
+ MemoryMapping *p, *q;
+
+ QTAILQ_FOREACH_SAFE(p, &list->head, next, q) {
+ QTAILQ_REMOVE(&list->head, p, next);
+ g_free(p);
+ }
+
+ list->num = 0;
+ list->last_mapping = NULL;
+}
+
+void memory_mapping_list_init(MemoryMappingList *list)
+{
+ list->num = 0;
+ list->last_mapping = NULL;
+ QTAILQ_INIT(&list->head);
+}
+
+void guest_phys_blocks_free(GuestPhysBlockList *list)
+{
+ GuestPhysBlock *p, *q;
+
+ QTAILQ_FOREACH_SAFE(p, &list->head, next, q) {
+ QTAILQ_REMOVE(&list->head, p, next);
+ memory_region_unref(p->mr);
+ g_free(p);
+ }
+ list->num = 0;
+}
+
+void guest_phys_blocks_init(GuestPhysBlockList *list)
+{
+ list->num = 0;
+ QTAILQ_INIT(&list->head);
+}
+
+typedef struct GuestPhysListener {
+ GuestPhysBlockList *list;
+ MemoryListener listener;
+} GuestPhysListener;
+
+static void guest_phys_block_add_section(GuestPhysListener *g,
+ MemoryRegionSection *section)
+{
+ const hwaddr target_start = section->offset_within_address_space;
+ const hwaddr target_end = target_start + int128_get64(section->size);
+ uint8_t *host_addr = memory_region_get_ram_ptr(section->mr) +
+ section->offset_within_region;
+ GuestPhysBlock *predecessor = NULL;
+
+ /* find continuity in guest physical address space */
+ if (!QTAILQ_EMPTY(&g->list->head)) {
+ hwaddr predecessor_size;
+
+ predecessor = QTAILQ_LAST(&g->list->head);
+ predecessor_size = predecessor->target_end - predecessor->target_start;
+
+ /* the memory API guarantees monotonically increasing traversal */
+ g_assert(predecessor->target_end <= target_start);
+
+ /* we want continuity in both guest-physical and host-virtual memory */
+ if (predecessor->target_end < target_start ||
+ predecessor->host_addr + predecessor_size != host_addr ||
+ predecessor->mr != section->mr) {
+ predecessor = NULL;
+ }
+ }
+
+ if (predecessor == NULL) {
+ /* isolated mapping, allocate it and add it to the list */
+ GuestPhysBlock *block = g_malloc0(sizeof *block);
+
+ block->target_start = target_start;
+ block->target_end = target_end;
+ block->host_addr = host_addr;
+ block->mr = section->mr;
+ memory_region_ref(section->mr);
+
+ QTAILQ_INSERT_TAIL(&g->list->head, block, next);
+ ++g->list->num;
+ } else {
+ /* expand predecessor until @target_end; predecessor's start doesn't
+ * change
+ */
+ predecessor->target_end = target_end;
+ }
+
+#ifdef DEBUG_GUEST_PHYS_REGION_ADD
+ fprintf(stderr, "%s: target_start=" HWADDR_FMT_plx " target_end="
+ HWADDR_FMT_plx ": %s (count: %u)\n", __func__, target_start,
+ target_end, predecessor ? "joined" : "added", g->list->num);
+#endif
+}
+
+static int guest_phys_ram_populate_cb(MemoryRegionSection *section,
+ void *opaque)
+{
+ GuestPhysListener *g = opaque;
+
+ guest_phys_block_add_section(g, section);
+ return 0;
+}
+
+static void guest_phys_blocks_region_add(MemoryListener *listener,
+ MemoryRegionSection *section)
+{
+ GuestPhysListener *g = container_of(listener, GuestPhysListener, listener);
+
+ /* we only care about RAM */
+ if (!memory_region_is_ram(section->mr) ||
+ memory_region_is_ram_device(section->mr) ||
+ memory_region_is_nonvolatile(section->mr)) {
+ return;
+ }
+
+ /* for special sparse regions, only add populated parts */
+ if (memory_region_has_ram_discard_manager(section->mr)) {
+ RamDiscardManager *rdm;
+
+ rdm = memory_region_get_ram_discard_manager(section->mr);
+ ram_discard_manager_replay_populated(rdm, section,
+ guest_phys_ram_populate_cb, g);
+ return;
+ }
+
+ guest_phys_block_add_section(g, section);
+}
+
+void guest_phys_blocks_append(GuestPhysBlockList *list)
+{
+ GuestPhysListener g = { 0 };
+
+ g.list = list;
+ g.listener.region_add = &guest_phys_blocks_region_add;
+ memory_listener_register(&g.listener, &address_space_memory);
+ memory_listener_unregister(&g.listener);
+}
+
+static CPUState *find_paging_enabled_cpu(CPUState *start_cpu)
+{
+ CPUState *cpu;
+
+ CPU_FOREACH(cpu) {
+ if (cpu_paging_enabled(cpu)) {
+ return cpu;
+ }
+ }
+
+ return NULL;
+}
+
+void qemu_get_guest_memory_mapping(MemoryMappingList *list,
+ const GuestPhysBlockList *guest_phys_blocks,
+ Error **errp)
+{
+ CPUState *cpu, *first_paging_enabled_cpu;
+ GuestPhysBlock *block;
+ ram_addr_t offset, length;
+
+ first_paging_enabled_cpu = find_paging_enabled_cpu(first_cpu);
+ if (first_paging_enabled_cpu) {
+ for (cpu = first_paging_enabled_cpu; cpu != NULL;
+ cpu = CPU_NEXT(cpu)) {
+ Error *err = NULL;
+ cpu_get_memory_mapping(cpu, list, &err);
+ if (err) {
+ error_propagate(errp, err);
+ return;
+ }
+ }
+ return;
+ }
+
+ /*
+ * If the guest doesn't use paging, the virtual address is equal to physical
+ * address.
+ */
+ QTAILQ_FOREACH(block, &guest_phys_blocks->head, next) {
+ offset = block->target_start;
+ length = block->target_end - block->target_start;
+ create_new_memory_mapping(list, offset, offset, length);
+ }
+}
+
+void qemu_get_guest_simple_memory_mapping(MemoryMappingList *list,
+ const GuestPhysBlockList *guest_phys_blocks)
+{
+ GuestPhysBlock *block;
+
+ QTAILQ_FOREACH(block, &guest_phys_blocks->head, next) {
+ create_new_memory_mapping(list, block->target_start, 0,
+ block->target_end - block->target_start);
+ }
+}
+
+void memory_mapping_filter(MemoryMappingList *list, int64_t begin,
+ int64_t length)
+{
+ MemoryMapping *cur, *next;
+
+ QTAILQ_FOREACH_SAFE(cur, &list->head, next, next) {
+ if (cur->phys_addr >= begin + length ||
+ cur->phys_addr + cur->length <= begin) {
+ QTAILQ_REMOVE(&list->head, cur, next);
+ g_free(cur);
+ list->num--;
+ continue;
+ }
+
+ if (cur->phys_addr < begin) {
+ cur->length -= begin - cur->phys_addr;
+ if (cur->virt_addr) {
+ cur->virt_addr += begin - cur->phys_addr;
+ }
+ cur->phys_addr = begin;
+ }
+
+ if (cur->phys_addr + cur->length > begin + length) {
+ cur->length -= cur->phys_addr + cur->length - begin - length;
+ }
+ }
+}
diff --git a/system/meson.build b/system/meson.build
new file mode 100644
index 0000000000..3a64dd89de
--- /dev/null
+++ b/system/meson.build
@@ -0,0 +1,36 @@
+specific_ss.add(when: 'CONFIG_SYSTEM_ONLY', if_true: [files(
+ 'arch_init.c',
+ 'ioport.c',
+ 'memory.c',
+ 'physmem.c',
+ 'watchpoint.c',
+)])
+
+system_ss.add(files(
+ 'balloon.c',
+ 'bootdevice.c',
+ 'cpus.c',
+ 'cpu-throttle.c',
+ 'cpu-timers.c',
+ 'datadir.c',
+ 'dirtylimit.c',
+ 'dma-helpers.c',
+ 'globals.c',
+ 'memory_mapping.c',
+ 'qdev-monitor.c',
+ 'qtest.c',
+ 'rtc.c',
+ 'runstate-action.c',
+ 'runstate-hmp-cmds.c',
+ 'runstate.c',
+ 'tpm-hmp-cmds.c',
+ 'vl.c',
+), sdl, libpmem, libdaxctl)
+
+if have_tpm
+ system_ss.add(files('tpm.c'))
+endif
+
+system_ss.add(when: seccomp, if_true: files('qemu-seccomp.c'))
+system_ss.add(when: fdt, if_true: files('device_tree.c'))
+system_ss.add(when: 'CONFIG_LINUX', if_true: files('async-teardown.c'))
diff --git a/system/physmem.c b/system/physmem.c
new file mode 100644
index 0000000000..edc3ed8ab9
--- /dev/null
+++ b/system/physmem.c
@@ -0,0 +1,3796 @@
+/*
+ * RAM allocation and memory access
+ *
+ * Copyright (c) 2003 Fabrice Bellard
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "exec/page-vary.h"
+#include "qapi/error.h"
+
+#include "qemu/cutils.h"
+#include "qemu/cacheflush.h"
+#include "qemu/hbitmap.h"
+#include "qemu/madvise.h"
+
+#ifdef CONFIG_TCG
+#include "hw/core/tcg-cpu-ops.h"
+#endif /* CONFIG_TCG */
+
+#include "exec/exec-all.h"
+#include "exec/target_page.h"
+#include "hw/qdev-core.h"
+#include "hw/qdev-properties.h"
+#include "hw/boards.h"
+#include "hw/xen/xen.h"
+#include "sysemu/kvm.h"
+#include "sysemu/tcg.h"
+#include "sysemu/qtest.h"
+#include "qemu/timer.h"
+#include "qemu/config-file.h"
+#include "qemu/error-report.h"
+#include "qemu/qemu-print.h"
+#include "qemu/log.h"
+#include "qemu/memalign.h"
+#include "exec/memory.h"
+#include "exec/ioport.h"
+#include "sysemu/dma.h"
+#include "sysemu/hostmem.h"
+#include "sysemu/hw_accel.h"
+#include "sysemu/xen-mapcache.h"
+#include "trace/trace-root.h"
+
+#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
+#include <linux/falloc.h>
+#endif
+
+#include "qemu/rcu_queue.h"
+#include "qemu/main-loop.h"
+#include "exec/translate-all.h"
+#include "sysemu/replay.h"
+
+#include "exec/memory-internal.h"
+#include "exec/ram_addr.h"
+
+#include "qemu/pmem.h"
+
+#include "migration/vmstate.h"
+
+#include "qemu/range.h"
+#ifndef _WIN32
+#include "qemu/mmap-alloc.h"
+#endif
+
+#include "monitor/monitor.h"
+
+#ifdef CONFIG_LIBDAXCTL
+#include <daxctl/libdaxctl.h>
+#endif
+
+//#define DEBUG_SUBPAGE
+
+/* ram_list is read under rcu_read_lock()/rcu_read_unlock(). Writes
+ * are protected by the ramlist lock.
+ */
+RAMList ram_list = { .blocks = QLIST_HEAD_INITIALIZER(ram_list.blocks) };
+
+static MemoryRegion *system_memory;
+static MemoryRegion *system_io;
+
+AddressSpace address_space_io;
+AddressSpace address_space_memory;
+
+static MemoryRegion io_mem_unassigned;
+
+typedef struct PhysPageEntry PhysPageEntry;
+
+struct PhysPageEntry {
+ /* How many bits skip to next level (in units of L2_SIZE). 0 for a leaf. */
+ uint32_t skip : 6;
+ /* index into phys_sections (!skip) or phys_map_nodes (skip) */
+ uint32_t ptr : 26;
+};
+
+#define PHYS_MAP_NODE_NIL (((uint32_t)~0) >> 6)
+
+/* Size of the L2 (and L3, etc) page tables. */
+#define ADDR_SPACE_BITS 64
+
+#define P_L2_BITS 9
+#define P_L2_SIZE (1 << P_L2_BITS)
+
+#define P_L2_LEVELS (((ADDR_SPACE_BITS - TARGET_PAGE_BITS - 1) / P_L2_BITS) + 1)
+
+typedef PhysPageEntry Node[P_L2_SIZE];
+
+typedef struct PhysPageMap {
+ struct rcu_head rcu;
+
+ unsigned sections_nb;
+ unsigned sections_nb_alloc;
+ unsigned nodes_nb;
+ unsigned nodes_nb_alloc;
+ Node *nodes;
+ MemoryRegionSection *sections;
+} PhysPageMap;
+
+struct AddressSpaceDispatch {
+ MemoryRegionSection *mru_section;
+ /* This is a multi-level map on the physical address space.
+ * The bottom level has pointers to MemoryRegionSections.
+ */
+ PhysPageEntry phys_map;
+ PhysPageMap map;
+};
+
+#define SUBPAGE_IDX(addr) ((addr) & ~TARGET_PAGE_MASK)
+typedef struct subpage_t {
+ MemoryRegion iomem;
+ FlatView *fv;
+ hwaddr base;
+ uint16_t sub_section[];
+} subpage_t;
+
+#define PHYS_SECTION_UNASSIGNED 0
+
+static void io_mem_init(void);
+static void memory_map_init(void);
+static void tcg_log_global_after_sync(MemoryListener *listener);
+static void tcg_commit(MemoryListener *listener);
+
+/**
+ * CPUAddressSpace: all the information a CPU needs about an AddressSpace
+ * @cpu: the CPU whose AddressSpace this is
+ * @as: the AddressSpace itself
+ * @memory_dispatch: its dispatch pointer (cached, RCU protected)
+ * @tcg_as_listener: listener for tracking changes to the AddressSpace
+ */
+struct CPUAddressSpace {
+ CPUState *cpu;
+ AddressSpace *as;
+ struct AddressSpaceDispatch *memory_dispatch;
+ MemoryListener tcg_as_listener;
+};
+
+struct DirtyBitmapSnapshot {
+ ram_addr_t start;
+ ram_addr_t end;
+ unsigned long dirty[];
+};
+
+static void phys_map_node_reserve(PhysPageMap *map, unsigned nodes)
+{
+ static unsigned alloc_hint = 16;
+ if (map->nodes_nb + nodes > map->nodes_nb_alloc) {
+ map->nodes_nb_alloc = MAX(alloc_hint, map->nodes_nb + nodes);
+ map->nodes = g_renew(Node, map->nodes, map->nodes_nb_alloc);
+ alloc_hint = map->nodes_nb_alloc;
+ }
+}
+
+static uint32_t phys_map_node_alloc(PhysPageMap *map, bool leaf)
+{
+ unsigned i;
+ uint32_t ret;
+ PhysPageEntry e;
+ PhysPageEntry *p;
+
+ ret = map->nodes_nb++;
+ p = map->nodes[ret];
+ assert(ret != PHYS_MAP_NODE_NIL);
+ assert(ret != map->nodes_nb_alloc);
+
+ e.skip = leaf ? 0 : 1;
+ e.ptr = leaf ? PHYS_SECTION_UNASSIGNED : PHYS_MAP_NODE_NIL;
+ for (i = 0; i < P_L2_SIZE; ++i) {
+ memcpy(&p[i], &e, sizeof(e));
+ }
+ return ret;
+}
+
+static void phys_page_set_level(PhysPageMap *map, PhysPageEntry *lp,
+ hwaddr *index, uint64_t *nb, uint16_t leaf,
+ int level)
+{
+ PhysPageEntry *p;
+ hwaddr step = (hwaddr)1 << (level * P_L2_BITS);
+
+ if (lp->skip && lp->ptr == PHYS_MAP_NODE_NIL) {
+ lp->ptr = phys_map_node_alloc(map, level == 0);
+ }
+ p = map->nodes[lp->ptr];
+ lp = &p[(*index >> (level * P_L2_BITS)) & (P_L2_SIZE - 1)];
+
+ while (*nb && lp < &p[P_L2_SIZE]) {
+ if ((*index & (step - 1)) == 0 && *nb >= step) {
+ lp->skip = 0;
+ lp->ptr = leaf;
+ *index += step;
+ *nb -= step;
+ } else {
+ phys_page_set_level(map, lp, index, nb, leaf, level - 1);
+ }
+ ++lp;
+ }
+}
+
+static void phys_page_set(AddressSpaceDispatch *d,
+ hwaddr index, uint64_t nb,
+ uint16_t leaf)
+{
+ /* Wildly overreserve - it doesn't matter much. */
+ phys_map_node_reserve(&d->map, 3 * P_L2_LEVELS);
+
+ phys_page_set_level(&d->map, &d->phys_map, &index, &nb, leaf, P_L2_LEVELS - 1);
+}
+
+/* Compact a non leaf page entry. Simply detect that the entry has a single child,
+ * and update our entry so we can skip it and go directly to the destination.
+ */
+static void phys_page_compact(PhysPageEntry *lp, Node *nodes)
+{
+ unsigned valid_ptr = P_L2_SIZE;
+ int valid = 0;
+ PhysPageEntry *p;
+ int i;
+
+ if (lp->ptr == PHYS_MAP_NODE_NIL) {
+ return;
+ }
+
+ p = nodes[lp->ptr];
+ for (i = 0; i < P_L2_SIZE; i++) {
+ if (p[i].ptr == PHYS_MAP_NODE_NIL) {
+ continue;
+ }
+
+ valid_ptr = i;
+ valid++;
+ if (p[i].skip) {
+ phys_page_compact(&p[i], nodes);
+ }
+ }
+
+ /* We can only compress if there's only one child. */
+ if (valid != 1) {
+ return;
+ }
+
+ assert(valid_ptr < P_L2_SIZE);
+
+ /* Don't compress if it won't fit in the # of bits we have. */
+ if (P_L2_LEVELS >= (1 << 6) &&
+ lp->skip + p[valid_ptr].skip >= (1 << 6)) {
+ return;
+ }
+
+ lp->ptr = p[valid_ptr].ptr;
+ if (!p[valid_ptr].skip) {
+ /* If our only child is a leaf, make this a leaf. */
+ /* By design, we should have made this node a leaf to begin with so we
+ * should never reach here.
+ * But since it's so simple to handle this, let's do it just in case we
+ * change this rule.
+ */
+ lp->skip = 0;
+ } else {
+ lp->skip += p[valid_ptr].skip;
+ }
+}
+
+void address_space_dispatch_compact(AddressSpaceDispatch *d)
+{
+ if (d->phys_map.skip) {
+ phys_page_compact(&d->phys_map, d->map.nodes);
+ }
+}
+
+static inline bool section_covers_addr(const MemoryRegionSection *section,
+ hwaddr addr)
+{
+ /* Memory topology clips a memory region to [0, 2^64); size.hi > 0 means
+ * the section must cover the entire address space.
+ */
+ return int128_gethi(section->size) ||
+ range_covers_byte(section->offset_within_address_space,
+ int128_getlo(section->size), addr);
+}
+
+static MemoryRegionSection *phys_page_find(AddressSpaceDispatch *d, hwaddr addr)
+{
+ PhysPageEntry lp = d->phys_map, *p;
+ Node *nodes = d->map.nodes;
+ MemoryRegionSection *sections = d->map.sections;
+ hwaddr index = addr >> TARGET_PAGE_BITS;
+ int i;
+
+ for (i = P_L2_LEVELS; lp.skip && (i -= lp.skip) >= 0;) {
+ if (lp.ptr == PHYS_MAP_NODE_NIL) {
+ return &sections[PHYS_SECTION_UNASSIGNED];
+ }
+ p = nodes[lp.ptr];
+ lp = p[(index >> (i * P_L2_BITS)) & (P_L2_SIZE - 1)];
+ }
+
+ if (section_covers_addr(&sections[lp.ptr], addr)) {
+ return &sections[lp.ptr];
+ } else {
+ return &sections[PHYS_SECTION_UNASSIGNED];
+ }
+}
+
+/* Called from RCU critical section */
+static MemoryRegionSection *address_space_lookup_region(AddressSpaceDispatch *d,
+ hwaddr addr,
+ bool resolve_subpage)
+{
+ MemoryRegionSection *section = qatomic_read(&d->mru_section);
+ subpage_t *subpage;
+
+ if (!section || section == &d->map.sections[PHYS_SECTION_UNASSIGNED] ||
+ !section_covers_addr(section, addr)) {
+ section = phys_page_find(d, addr);
+ qatomic_set(&d->mru_section, section);
+ }
+ if (resolve_subpage && section->mr->subpage) {
+ subpage = container_of(section->mr, subpage_t, iomem);
+ section = &d->map.sections[subpage->sub_section[SUBPAGE_IDX(addr)]];
+ }
+ return section;
+}
+
+/* Called from RCU critical section */
+static MemoryRegionSection *
+address_space_translate_internal(AddressSpaceDispatch *d, hwaddr addr, hwaddr *xlat,
+ hwaddr *plen, bool resolve_subpage)
+{
+ MemoryRegionSection *section;
+ MemoryRegion *mr;
+ Int128 diff;
+
+ section = address_space_lookup_region(d, addr, resolve_subpage);
+ /* Compute offset within MemoryRegionSection */
+ addr -= section->offset_within_address_space;
+
+ /* Compute offset within MemoryRegion */
+ *xlat = addr + section->offset_within_region;
+
+ mr = section->mr;
+
+ /* MMIO registers can be expected to perform full-width accesses based only
+ * on their address, without considering adjacent registers that could
+ * decode to completely different MemoryRegions. When such registers
+ * exist (e.g. I/O ports 0xcf8 and 0xcf9 on most PC chipsets), MMIO
+ * regions overlap wildly. For this reason we cannot clamp the accesses
+ * here.
+ *
+ * If the length is small (as is the case for address_space_ldl/stl),
+ * everything works fine. If the incoming length is large, however,
+ * the caller really has to do the clamping through memory_access_size.
+ */
+ if (memory_region_is_ram(mr)) {
+ diff = int128_sub(section->size, int128_make64(addr));
+ *plen = int128_get64(int128_min(diff, int128_make64(*plen)));
+ }
+ return section;
+}
+
+/**
+ * address_space_translate_iommu - translate an address through an IOMMU
+ * memory region and then through the target address space.
+ *
+ * @iommu_mr: the IOMMU memory region that we start the translation from
+ * @addr: the address to be translated through the MMU
+ * @xlat: the translated address offset within the destination memory region.
+ * It cannot be %NULL.
+ * @plen_out: valid read/write length of the translated address. It
+ * cannot be %NULL.
+ * @page_mask_out: page mask for the translated address. This
+ * should only be meaningful for IOMMU translated
+ * addresses, since there may be huge pages that this bit
+ * would tell. It can be %NULL if we don't care about it.
+ * @is_write: whether the translation operation is for write
+ * @is_mmio: whether this can be MMIO, set true if it can
+ * @target_as: the address space targeted by the IOMMU
+ * @attrs: transaction attributes
+ *
+ * This function is called from RCU critical section. It is the common
+ * part of flatview_do_translate and address_space_translate_cached.
+ */
+static MemoryRegionSection address_space_translate_iommu(IOMMUMemoryRegion *iommu_mr,
+ hwaddr *xlat,
+ hwaddr *plen_out,
+ hwaddr *page_mask_out,
+ bool is_write,
+ bool is_mmio,
+ AddressSpace **target_as,
+ MemTxAttrs attrs)
+{
+ MemoryRegionSection *section;
+ hwaddr page_mask = (hwaddr)-1;
+
+ do {
+ hwaddr addr = *xlat;
+ IOMMUMemoryRegionClass *imrc = memory_region_get_iommu_class_nocheck(iommu_mr);
+ int iommu_idx = 0;
+ IOMMUTLBEntry iotlb;
+
+ if (imrc->attrs_to_index) {
+ iommu_idx = imrc->attrs_to_index(iommu_mr, attrs);
+ }
+
+ iotlb = imrc->translate(iommu_mr, addr, is_write ?
+ IOMMU_WO : IOMMU_RO, iommu_idx);
+
+ if (!(iotlb.perm & (1 << is_write))) {
+ goto unassigned;
+ }
+
+ addr = ((iotlb.translated_addr & ~iotlb.addr_mask)
+ | (addr & iotlb.addr_mask));
+ page_mask &= iotlb.addr_mask;
+ *plen_out = MIN(*plen_out, (addr | iotlb.addr_mask) - addr + 1);
+ *target_as = iotlb.target_as;
+
+ section = address_space_translate_internal(
+ address_space_to_dispatch(iotlb.target_as), addr, xlat,
+ plen_out, is_mmio);
+
+ iommu_mr = memory_region_get_iommu(section->mr);
+ } while (unlikely(iommu_mr));
+
+ if (page_mask_out) {
+ *page_mask_out = page_mask;
+ }
+ return *section;
+
+unassigned:
+ return (MemoryRegionSection) { .mr = &io_mem_unassigned };
+}
+
+/**
+ * flatview_do_translate - translate an address in FlatView
+ *
+ * @fv: the flat view that we want to translate on
+ * @addr: the address to be translated in above address space
+ * @xlat: the translated address offset within memory region. It
+ * cannot be @NULL.
+ * @plen_out: valid read/write length of the translated address. It
+ * can be @NULL when we don't care about it.
+ * @page_mask_out: page mask for the translated address. This
+ * should only be meaningful for IOMMU translated
+ * addresses, since there may be huge pages that this bit
+ * would tell. It can be @NULL if we don't care about it.
+ * @is_write: whether the translation operation is for write
+ * @is_mmio: whether this can be MMIO, set true if it can
+ * @target_as: the address space targeted by the IOMMU
+ * @attrs: memory transaction attributes
+ *
+ * This function is called from RCU critical section
+ */
+static MemoryRegionSection flatview_do_translate(FlatView *fv,
+ hwaddr addr,
+ hwaddr *xlat,
+ hwaddr *plen_out,
+ hwaddr *page_mask_out,
+ bool is_write,
+ bool is_mmio,
+ AddressSpace **target_as,
+ MemTxAttrs attrs)
+{
+ MemoryRegionSection *section;
+ IOMMUMemoryRegion *iommu_mr;
+ hwaddr plen = (hwaddr)(-1);
+
+ if (!plen_out) {
+ plen_out = &plen;
+ }
+
+ section = address_space_translate_internal(
+ flatview_to_dispatch(fv), addr, xlat,
+ plen_out, is_mmio);
+
+ iommu_mr = memory_region_get_iommu(section->mr);
+ if (unlikely(iommu_mr)) {
+ return address_space_translate_iommu(iommu_mr, xlat,
+ plen_out, page_mask_out,
+ is_write, is_mmio,
+ target_as, attrs);
+ }
+ if (page_mask_out) {
+ /* Not behind an IOMMU, use default page size. */
+ *page_mask_out = ~TARGET_PAGE_MASK;
+ }
+
+ return *section;
+}
+
+/* Called from RCU critical section */
+IOMMUTLBEntry address_space_get_iotlb_entry(AddressSpace *as, hwaddr addr,
+ bool is_write, MemTxAttrs attrs)
+{
+ MemoryRegionSection section;
+ hwaddr xlat, page_mask;
+
+ /*
+ * This can never be MMIO, and we don't really care about plen,
+ * but page mask.
+ */
+ section = flatview_do_translate(address_space_to_flatview(as), addr, &xlat,
+ NULL, &page_mask, is_write, false, &as,
+ attrs);
+
+ /* Illegal translation */
+ if (section.mr == &io_mem_unassigned) {
+ goto iotlb_fail;
+ }
+
+ /* Convert memory region offset into address space offset */
+ xlat += section.offset_within_address_space -
+ section.offset_within_region;
+
+ return (IOMMUTLBEntry) {
+ .target_as = as,
+ .iova = addr & ~page_mask,
+ .translated_addr = xlat & ~page_mask,
+ .addr_mask = page_mask,
+ /* IOTLBs are for DMAs, and DMA only allows on RAMs. */
+ .perm = IOMMU_RW,
+ };
+
+iotlb_fail:
+ return (IOMMUTLBEntry) {0};
+}
+
+/* Called from RCU critical section */
+MemoryRegion *flatview_translate(FlatView *fv, hwaddr addr, hwaddr *xlat,
+ hwaddr *plen, bool is_write,
+ MemTxAttrs attrs)
+{
+ MemoryRegion *mr;
+ MemoryRegionSection section;
+ AddressSpace *as = NULL;
+
+ /* This can be MMIO, so setup MMIO bit. */
+ section = flatview_do_translate(fv, addr, xlat, plen, NULL,
+ is_write, true, &as, attrs);
+ mr = section.mr;
+
+ if (xen_enabled() && memory_access_is_direct(mr, is_write)) {
+ hwaddr page = ((addr & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE) - addr;
+ *plen = MIN(page, *plen);
+ }
+
+ return mr;
+}
+
+typedef struct TCGIOMMUNotifier {
+ IOMMUNotifier n;
+ MemoryRegion *mr;
+ CPUState *cpu;
+ int iommu_idx;
+ bool active;
+} TCGIOMMUNotifier;
+
+static void tcg_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
+{
+ TCGIOMMUNotifier *notifier = container_of(n, TCGIOMMUNotifier, n);
+
+ if (!notifier->active) {
+ return;
+ }
+ tlb_flush(notifier->cpu);
+ notifier->active = false;
+ /* We leave the notifier struct on the list to avoid reallocating it later.
+ * Generally the number of IOMMUs a CPU deals with will be small.
+ * In any case we can't unregister the iommu notifier from a notify
+ * callback.
+ */
+}
+
+static void tcg_register_iommu_notifier(CPUState *cpu,
+ IOMMUMemoryRegion *iommu_mr,
+ int iommu_idx)
+{
+ /* Make sure this CPU has an IOMMU notifier registered for this
+ * IOMMU/IOMMU index combination, so that we can flush its TLB
+ * when the IOMMU tells us the mappings we've cached have changed.
+ */
+ MemoryRegion *mr = MEMORY_REGION(iommu_mr);
+ TCGIOMMUNotifier *notifier = NULL;
+ int i;
+
+ for (i = 0; i < cpu->iommu_notifiers->len; i++) {
+ notifier = g_array_index(cpu->iommu_notifiers, TCGIOMMUNotifier *, i);
+ if (notifier->mr == mr && notifier->iommu_idx == iommu_idx) {
+ break;
+ }
+ }
+ if (i == cpu->iommu_notifiers->len) {
+ /* Not found, add a new entry at the end of the array */
+ cpu->iommu_notifiers = g_array_set_size(cpu->iommu_notifiers, i + 1);
+ notifier = g_new0(TCGIOMMUNotifier, 1);
+ g_array_index(cpu->iommu_notifiers, TCGIOMMUNotifier *, i) = notifier;
+
+ notifier->mr = mr;
+ notifier->iommu_idx = iommu_idx;
+ notifier->cpu = cpu;
+ /* Rather than trying to register interest in the specific part
+ * of the iommu's address space that we've accessed and then
+ * expand it later as subsequent accesses touch more of it, we
+ * just register interest in the whole thing, on the assumption
+ * that iommu reconfiguration will be rare.
+ */
+ iommu_notifier_init(&notifier->n,
+ tcg_iommu_unmap_notify,
+ IOMMU_NOTIFIER_UNMAP,
+ 0,
+ HWADDR_MAX,
+ iommu_idx);
+ memory_region_register_iommu_notifier(notifier->mr, &notifier->n,
+ &error_fatal);
+ }
+
+ if (!notifier->active) {
+ notifier->active = true;
+ }
+}
+
+void tcg_iommu_free_notifier_list(CPUState *cpu)
+{
+ /* Destroy the CPU's notifier list */
+ int i;
+ TCGIOMMUNotifier *notifier;
+
+ for (i = 0; i < cpu->iommu_notifiers->len; i++) {
+ notifier = g_array_index(cpu->iommu_notifiers, TCGIOMMUNotifier *, i);
+ memory_region_unregister_iommu_notifier(notifier->mr, &notifier->n);
+ g_free(notifier);
+ }
+ g_array_free(cpu->iommu_notifiers, true);
+}
+
+void tcg_iommu_init_notifier_list(CPUState *cpu)
+{
+ cpu->iommu_notifiers = g_array_new(false, true, sizeof(TCGIOMMUNotifier *));
+}
+
+/* Called from RCU critical section */
+MemoryRegionSection *
+address_space_translate_for_iotlb(CPUState *cpu, int asidx, hwaddr orig_addr,
+ hwaddr *xlat, hwaddr *plen,
+ MemTxAttrs attrs, int *prot)
+{
+ MemoryRegionSection *section;
+ IOMMUMemoryRegion *iommu_mr;
+ IOMMUMemoryRegionClass *imrc;
+ IOMMUTLBEntry iotlb;
+ int iommu_idx;
+ hwaddr addr = orig_addr;
+ AddressSpaceDispatch *d = cpu->cpu_ases[asidx].memory_dispatch;
+
+ for (;;) {
+ section = address_space_translate_internal(d, addr, &addr, plen, false);
+
+ iommu_mr = memory_region_get_iommu(section->mr);
+ if (!iommu_mr) {
+ break;
+ }
+
+ imrc = memory_region_get_iommu_class_nocheck(iommu_mr);
+
+ iommu_idx = imrc->attrs_to_index(iommu_mr, attrs);
+ tcg_register_iommu_notifier(cpu, iommu_mr, iommu_idx);
+ /* We need all the permissions, so pass IOMMU_NONE so the IOMMU
+ * doesn't short-cut its translation table walk.
+ */
+ iotlb = imrc->translate(iommu_mr, addr, IOMMU_NONE, iommu_idx);
+ addr = ((iotlb.translated_addr & ~iotlb.addr_mask)
+ | (addr & iotlb.addr_mask));
+ /* Update the caller's prot bits to remove permissions the IOMMU
+ * is giving us a failure response for. If we get down to no
+ * permissions left at all we can give up now.
+ */
+ if (!(iotlb.perm & IOMMU_RO)) {
+ *prot &= ~(PAGE_READ | PAGE_EXEC);
+ }
+ if (!(iotlb.perm & IOMMU_WO)) {
+ *prot &= ~PAGE_WRITE;
+ }
+
+ if (!*prot) {
+ goto translate_fail;
+ }
+
+ d = flatview_to_dispatch(address_space_to_flatview(iotlb.target_as));
+ }
+
+ assert(!memory_region_is_iommu(section->mr));
+ *xlat = addr;
+ return section;
+
+translate_fail:
+ /*
+ * We should be given a page-aligned address -- certainly
+ * tlb_set_page_with_attrs() does so. The page offset of xlat
+ * is used to index sections[], and PHYS_SECTION_UNASSIGNED = 0.
+ * The page portion of xlat will be logged by memory_region_access_valid()
+ * when this memory access is rejected, so use the original untranslated
+ * physical address.
+ */
+ assert((orig_addr & ~TARGET_PAGE_MASK) == 0);
+ *xlat = orig_addr;
+ return &d->map.sections[PHYS_SECTION_UNASSIGNED];
+}
+
+void cpu_address_space_init(CPUState *cpu, int asidx,
+ const char *prefix, MemoryRegion *mr)
+{
+ CPUAddressSpace *newas;
+ AddressSpace *as = g_new0(AddressSpace, 1);
+ char *as_name;
+
+ assert(mr);
+ as_name = g_strdup_printf("%s-%d", prefix, cpu->cpu_index);
+ address_space_init(as, mr, as_name);
+ g_free(as_name);
+
+ /* Target code should have set num_ases before calling us */
+ assert(asidx < cpu->num_ases);
+
+ if (asidx == 0) {
+ /* address space 0 gets the convenience alias */
+ cpu->as = as;
+ }
+
+ /* KVM cannot currently support multiple address spaces. */
+ assert(asidx == 0 || !kvm_enabled());
+
+ if (!cpu->cpu_ases) {
+ cpu->cpu_ases = g_new0(CPUAddressSpace, cpu->num_ases);
+ }
+
+ newas = &cpu->cpu_ases[asidx];
+ newas->cpu = cpu;
+ newas->as = as;
+ if (tcg_enabled()) {
+ newas->tcg_as_listener.log_global_after_sync = tcg_log_global_after_sync;
+ newas->tcg_as_listener.commit = tcg_commit;
+ newas->tcg_as_listener.name = "tcg";
+ memory_listener_register(&newas->tcg_as_listener, as);
+ }
+}
+
+AddressSpace *cpu_get_address_space(CPUState *cpu, int asidx)
+{
+ /* Return the AddressSpace corresponding to the specified index */
+ return cpu->cpu_ases[asidx].as;
+}
+
+/* Called from RCU critical section */
+static RAMBlock *qemu_get_ram_block(ram_addr_t addr)
+{
+ RAMBlock *block;
+
+ block = qatomic_rcu_read(&ram_list.mru_block);
+ if (block && addr - block->offset < block->max_length) {
+ return block;
+ }
+ RAMBLOCK_FOREACH(block) {
+ if (addr - block->offset < block->max_length) {
+ goto found;
+ }
+ }
+
+ fprintf(stderr, "Bad ram offset %" PRIx64 "\n", (uint64_t)addr);
+ abort();
+
+found:
+ /* It is safe to write mru_block outside the iothread lock. This
+ * is what happens:
+ *
+ * mru_block = xxx
+ * rcu_read_unlock()
+ * xxx removed from list
+ * rcu_read_lock()
+ * read mru_block
+ * mru_block = NULL;
+ * call_rcu(reclaim_ramblock, xxx);
+ * rcu_read_unlock()
+ *
+ * qatomic_rcu_set is not needed here. The block was already published
+ * when it was placed into the list. Here we're just making an extra
+ * copy of the pointer.
+ */
+ ram_list.mru_block = block;
+ return block;
+}
+
+static void tlb_reset_dirty_range_all(ram_addr_t start, ram_addr_t length)
+{
+ CPUState *cpu;
+ ram_addr_t start1;
+ RAMBlock *block;
+ ram_addr_t end;
+
+ assert(tcg_enabled());
+ end = TARGET_PAGE_ALIGN(start + length);
+ start &= TARGET_PAGE_MASK;
+
+ RCU_READ_LOCK_GUARD();
+ block = qemu_get_ram_block(start);
+ assert(block == qemu_get_ram_block(end - 1));
+ start1 = (uintptr_t)ramblock_ptr(block, start - block->offset);
+ CPU_FOREACH(cpu) {
+ tlb_reset_dirty(cpu, start1, length);
+ }
+}
+
+/* Note: start and end must be within the same ram block. */
+bool cpu_physical_memory_test_and_clear_dirty(ram_addr_t start,
+ ram_addr_t length,
+ unsigned client)
+{
+ DirtyMemoryBlocks *blocks;
+ unsigned long end, page, start_page;
+ bool dirty = false;
+ RAMBlock *ramblock;
+ uint64_t mr_offset, mr_size;
+
+ if (length == 0) {
+ return false;
+ }
+
+ end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS;
+ start_page = start >> TARGET_PAGE_BITS;
+ page = start_page;
+
+ WITH_RCU_READ_LOCK_GUARD() {
+ blocks = qatomic_rcu_read(&ram_list.dirty_memory[client]);
+ ramblock = qemu_get_ram_block(start);
+ /* Range sanity check on the ramblock */
+ assert(start >= ramblock->offset &&
+ start + length <= ramblock->offset + ramblock->used_length);
+
+ while (page < end) {
+ unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
+ unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
+ unsigned long num = MIN(end - page,
+ DIRTY_MEMORY_BLOCK_SIZE - offset);
+
+ dirty |= bitmap_test_and_clear_atomic(blocks->blocks[idx],
+ offset, num);
+ page += num;
+ }
+
+ mr_offset = (ram_addr_t)(start_page << TARGET_PAGE_BITS) - ramblock->offset;
+ mr_size = (end - start_page) << TARGET_PAGE_BITS;
+ memory_region_clear_dirty_bitmap(ramblock->mr, mr_offset, mr_size);
+ }
+
+ if (dirty && tcg_enabled()) {
+ tlb_reset_dirty_range_all(start, length);
+ }
+
+ return dirty;
+}
+
+DirtyBitmapSnapshot *cpu_physical_memory_snapshot_and_clear_dirty
+ (MemoryRegion *mr, hwaddr offset, hwaddr length, unsigned client)
+{
+ DirtyMemoryBlocks *blocks;
+ ram_addr_t start = memory_region_get_ram_addr(mr) + offset;
+ unsigned long align = 1UL << (TARGET_PAGE_BITS + BITS_PER_LEVEL);
+ ram_addr_t first = QEMU_ALIGN_DOWN(start, align);
+ ram_addr_t last = QEMU_ALIGN_UP(start + length, align);
+ DirtyBitmapSnapshot *snap;
+ unsigned long page, end, dest;
+
+ snap = g_malloc0(sizeof(*snap) +
+ ((last - first) >> (TARGET_PAGE_BITS + 3)));
+ snap->start = first;
+ snap->end = last;
+
+ page = first >> TARGET_PAGE_BITS;
+ end = last >> TARGET_PAGE_BITS;
+ dest = 0;
+
+ WITH_RCU_READ_LOCK_GUARD() {
+ blocks = qatomic_rcu_read(&ram_list.dirty_memory[client]);
+
+ while (page < end) {
+ unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
+ unsigned long ofs = page % DIRTY_MEMORY_BLOCK_SIZE;
+ unsigned long num = MIN(end - page,
+ DIRTY_MEMORY_BLOCK_SIZE - ofs);
+
+ assert(QEMU_IS_ALIGNED(ofs, (1 << BITS_PER_LEVEL)));
+ assert(QEMU_IS_ALIGNED(num, (1 << BITS_PER_LEVEL)));
+ ofs >>= BITS_PER_LEVEL;
+
+ bitmap_copy_and_clear_atomic(snap->dirty + dest,
+ blocks->blocks[idx] + ofs,
+ num);
+ page += num;
+ dest += num >> BITS_PER_LEVEL;
+ }
+ }
+
+ if (tcg_enabled()) {
+ tlb_reset_dirty_range_all(start, length);
+ }
+
+ memory_region_clear_dirty_bitmap(mr, offset, length);
+
+ return snap;
+}
+
+bool cpu_physical_memory_snapshot_get_dirty(DirtyBitmapSnapshot *snap,
+ ram_addr_t start,
+ ram_addr_t length)
+{
+ unsigned long page, end;
+
+ assert(start >= snap->start);
+ assert(start + length <= snap->end);
+
+ end = TARGET_PAGE_ALIGN(start + length - snap->start) >> TARGET_PAGE_BITS;
+ page = (start - snap->start) >> TARGET_PAGE_BITS;
+
+ while (page < end) {
+ if (test_bit(page, snap->dirty)) {
+ return true;
+ }
+ page++;
+ }
+ return false;
+}
+
+/* Called from RCU critical section */
+hwaddr memory_region_section_get_iotlb(CPUState *cpu,
+ MemoryRegionSection *section)
+{
+ AddressSpaceDispatch *d = flatview_to_dispatch(section->fv);
+ return section - d->map.sections;
+}
+
+static int subpage_register(subpage_t *mmio, uint32_t start, uint32_t end,
+ uint16_t section);
+static subpage_t *subpage_init(FlatView *fv, hwaddr base);
+
+static uint16_t phys_section_add(PhysPageMap *map,
+ MemoryRegionSection *section)
+{
+ /* The physical section number is ORed with a page-aligned
+ * pointer to produce the iotlb entries. Thus it should
+ * never overflow into the page-aligned value.
+ */
+ assert(map->sections_nb < TARGET_PAGE_SIZE);
+
+ if (map->sections_nb == map->sections_nb_alloc) {
+ map->sections_nb_alloc = MAX(map->sections_nb_alloc * 2, 16);
+ map->sections = g_renew(MemoryRegionSection, map->sections,
+ map->sections_nb_alloc);
+ }
+ map->sections[map->sections_nb] = *section;
+ memory_region_ref(section->mr);
+ return map->sections_nb++;
+}
+
+static void phys_section_destroy(MemoryRegion *mr)
+{
+ bool have_sub_page = mr->subpage;
+
+ memory_region_unref(mr);
+
+ if (have_sub_page) {
+ subpage_t *subpage = container_of(mr, subpage_t, iomem);
+ object_unref(OBJECT(&subpage->iomem));
+ g_free(subpage);
+ }
+}
+
+static void phys_sections_free(PhysPageMap *map)
+{
+ while (map->sections_nb > 0) {
+ MemoryRegionSection *section = &map->sections[--map->sections_nb];
+ phys_section_destroy(section->mr);
+ }
+ g_free(map->sections);
+ g_free(map->nodes);
+}
+
+static void register_subpage(FlatView *fv, MemoryRegionSection *section)
+{
+ AddressSpaceDispatch *d = flatview_to_dispatch(fv);
+ subpage_t *subpage;
+ hwaddr base = section->offset_within_address_space
+ & TARGET_PAGE_MASK;
+ MemoryRegionSection *existing = phys_page_find(d, base);
+ MemoryRegionSection subsection = {
+ .offset_within_address_space = base,
+ .size = int128_make64(TARGET_PAGE_SIZE),
+ };
+ hwaddr start, end;
+
+ assert(existing->mr->subpage || existing->mr == &io_mem_unassigned);
+
+ if (!(existing->mr->subpage)) {
+ subpage = subpage_init(fv, base);
+ subsection.fv = fv;
+ subsection.mr = &subpage->iomem;
+ phys_page_set(d, base >> TARGET_PAGE_BITS, 1,
+ phys_section_add(&d->map, &subsection));
+ } else {
+ subpage = container_of(existing->mr, subpage_t, iomem);
+ }
+ start = section->offset_within_address_space & ~TARGET_PAGE_MASK;
+ end = start + int128_get64(section->size) - 1;
+ subpage_register(subpage, start, end,
+ phys_section_add(&d->map, section));
+}
+
+
+static void register_multipage(FlatView *fv,
+ MemoryRegionSection *section)
+{
+ AddressSpaceDispatch *d = flatview_to_dispatch(fv);
+ hwaddr start_addr = section->offset_within_address_space;
+ uint16_t section_index = phys_section_add(&d->map, section);
+ uint64_t num_pages = int128_get64(int128_rshift(section->size,
+ TARGET_PAGE_BITS));
+
+ assert(num_pages);
+ phys_page_set(d, start_addr >> TARGET_PAGE_BITS, num_pages, section_index);
+}
+
+/*
+ * The range in *section* may look like this:
+ *
+ * |s|PPPPPPP|s|
+ *
+ * where s stands for subpage and P for page.
+ */
+void flatview_add_to_dispatch(FlatView *fv, MemoryRegionSection *section)
+{
+ MemoryRegionSection remain = *section;
+ Int128 page_size = int128_make64(TARGET_PAGE_SIZE);
+
+ /* register first subpage */
+ if (remain.offset_within_address_space & ~TARGET_PAGE_MASK) {
+ uint64_t left = TARGET_PAGE_ALIGN(remain.offset_within_address_space)
+ - remain.offset_within_address_space;
+
+ MemoryRegionSection now = remain;
+ now.size = int128_min(int128_make64(left), now.size);
+ register_subpage(fv, &now);
+ if (int128_eq(remain.size, now.size)) {
+ return;
+ }
+ remain.size = int128_sub(remain.size, now.size);
+ remain.offset_within_address_space += int128_get64(now.size);
+ remain.offset_within_region += int128_get64(now.size);
+ }
+
+ /* register whole pages */
+ if (int128_ge(remain.size, page_size)) {
+ MemoryRegionSection now = remain;
+ now.size = int128_and(now.size, int128_neg(page_size));
+ register_multipage(fv, &now);
+ if (int128_eq(remain.size, now.size)) {
+ return;
+ }
+ remain.size = int128_sub(remain.size, now.size);
+ remain.offset_within_address_space += int128_get64(now.size);
+ remain.offset_within_region += int128_get64(now.size);
+ }
+
+ /* register last subpage */
+ register_subpage(fv, &remain);
+}
+
+void qemu_flush_coalesced_mmio_buffer(void)
+{
+ if (kvm_enabled())
+ kvm_flush_coalesced_mmio_buffer();
+}
+
+void qemu_mutex_lock_ramlist(void)
+{
+ qemu_mutex_lock(&ram_list.mutex);
+}
+
+void qemu_mutex_unlock_ramlist(void)
+{
+ qemu_mutex_unlock(&ram_list.mutex);
+}
+
+GString *ram_block_format(void)
+{
+ RAMBlock *block;
+ char *psize;
+ GString *buf = g_string_new("");
+
+ RCU_READ_LOCK_GUARD();
+ g_string_append_printf(buf, "%24s %8s %18s %18s %18s %18s %3s\n",
+ "Block Name", "PSize", "Offset", "Used", "Total",
+ "HVA", "RO");
+
+ RAMBLOCK_FOREACH(block) {
+ psize = size_to_str(block->page_size);
+ g_string_append_printf(buf, "%24s %8s 0x%016" PRIx64 " 0x%016" PRIx64
+ " 0x%016" PRIx64 " 0x%016" PRIx64 " %3s\n",
+ block->idstr, psize,
+ (uint64_t)block->offset,
+ (uint64_t)block->used_length,
+ (uint64_t)block->max_length,
+ (uint64_t)(uintptr_t)block->host,
+ block->mr->readonly ? "ro" : "rw");
+
+ g_free(psize);
+ }
+
+ return buf;
+}
+
+static int find_min_backend_pagesize(Object *obj, void *opaque)
+{
+ long *hpsize_min = opaque;
+
+ if (object_dynamic_cast(obj, TYPE_MEMORY_BACKEND)) {
+ HostMemoryBackend *backend = MEMORY_BACKEND(obj);
+ long hpsize = host_memory_backend_pagesize(backend);
+
+ if (host_memory_backend_is_mapped(backend) && (hpsize < *hpsize_min)) {
+ *hpsize_min = hpsize;
+ }
+ }
+
+ return 0;
+}
+
+static int find_max_backend_pagesize(Object *obj, void *opaque)
+{
+ long *hpsize_max = opaque;
+
+ if (object_dynamic_cast(obj, TYPE_MEMORY_BACKEND)) {
+ HostMemoryBackend *backend = MEMORY_BACKEND(obj);
+ long hpsize = host_memory_backend_pagesize(backend);
+
+ if (host_memory_backend_is_mapped(backend) && (hpsize > *hpsize_max)) {
+ *hpsize_max = hpsize;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * TODO: We assume right now that all mapped host memory backends are
+ * used as RAM, however some might be used for different purposes.
+ */
+long qemu_minrampagesize(void)
+{
+ long hpsize = LONG_MAX;
+ Object *memdev_root = object_resolve_path("/objects", NULL);
+
+ object_child_foreach(memdev_root, find_min_backend_pagesize, &hpsize);
+ return hpsize;
+}
+
+long qemu_maxrampagesize(void)
+{
+ long pagesize = 0;
+ Object *memdev_root = object_resolve_path("/objects", NULL);
+
+ object_child_foreach(memdev_root, find_max_backend_pagesize, &pagesize);
+ return pagesize;
+}
+
+#ifdef CONFIG_POSIX
+static int64_t get_file_size(int fd)
+{
+ int64_t size;
+#if defined(__linux__)
+ struct stat st;
+
+ if (fstat(fd, &st) < 0) {
+ return -errno;
+ }
+
+ /* Special handling for devdax character devices */
+ if (S_ISCHR(st.st_mode)) {
+ g_autofree char *subsystem_path = NULL;
+ g_autofree char *subsystem = NULL;
+
+ subsystem_path = g_strdup_printf("/sys/dev/char/%d:%d/subsystem",
+ major(st.st_rdev), minor(st.st_rdev));
+ subsystem = g_file_read_link(subsystem_path, NULL);
+
+ if (subsystem && g_str_has_suffix(subsystem, "/dax")) {
+ g_autofree char *size_path = NULL;
+ g_autofree char *size_str = NULL;
+
+ size_path = g_strdup_printf("/sys/dev/char/%d:%d/size",
+ major(st.st_rdev), minor(st.st_rdev));
+
+ if (g_file_get_contents(size_path, &size_str, NULL, NULL)) {
+ return g_ascii_strtoll(size_str, NULL, 0);
+ }
+ }
+ }
+#endif /* defined(__linux__) */
+
+ /* st.st_size may be zero for special files yet lseek(2) works */
+ size = lseek(fd, 0, SEEK_END);
+ if (size < 0) {
+ return -errno;
+ }
+ return size;
+}
+
+static int64_t get_file_align(int fd)
+{
+ int64_t align = -1;
+#if defined(__linux__) && defined(CONFIG_LIBDAXCTL)
+ struct stat st;
+
+ if (fstat(fd, &st) < 0) {
+ return -errno;
+ }
+
+ /* Special handling for devdax character devices */
+ if (S_ISCHR(st.st_mode)) {
+ g_autofree char *path = NULL;
+ g_autofree char *rpath = NULL;
+ struct daxctl_ctx *ctx;
+ struct daxctl_region *region;
+ int rc = 0;
+
+ path = g_strdup_printf("/sys/dev/char/%d:%d",
+ major(st.st_rdev), minor(st.st_rdev));
+ rpath = realpath(path, NULL);
+ if (!rpath) {
+ return -errno;
+ }
+
+ rc = daxctl_new(&ctx);
+ if (rc) {
+ return -1;
+ }
+
+ daxctl_region_foreach(ctx, region) {
+ if (strstr(rpath, daxctl_region_get_path(region))) {
+ align = daxctl_region_get_align(region);
+ break;
+ }
+ }
+ daxctl_unref(ctx);
+ }
+#endif /* defined(__linux__) && defined(CONFIG_LIBDAXCTL) */
+
+ return align;
+}
+
+static int file_ram_open(const char *path,
+ const char *region_name,
+ bool readonly,
+ bool *created)
+{
+ char *filename;
+ char *sanitized_name;
+ char *c;
+ int fd = -1;
+
+ *created = false;
+ for (;;) {
+ fd = open(path, readonly ? O_RDONLY : O_RDWR);
+ if (fd >= 0) {
+ /*
+ * open(O_RDONLY) won't fail with EISDIR. Check manually if we
+ * opened a directory and fail similarly to how we fail ENOENT
+ * in readonly mode. Note that mkstemp() would imply O_RDWR.
+ */
+ if (readonly) {
+ struct stat file_stat;
+
+ if (fstat(fd, &file_stat)) {
+ close(fd);
+ if (errno == EINTR) {
+ continue;
+ }
+ return -errno;
+ } else if (S_ISDIR(file_stat.st_mode)) {
+ close(fd);
+ return -EISDIR;
+ }
+ }
+ /* @path names an existing file, use it */
+ break;
+ }
+ if (errno == ENOENT) {
+ if (readonly) {
+ /* Refuse to create new, readonly files. */
+ return -ENOENT;
+ }
+ /* @path names a file that doesn't exist, create it */
+ fd = open(path, O_RDWR | O_CREAT | O_EXCL, 0644);
+ if (fd >= 0) {
+ *created = true;
+ break;
+ }
+ } else if (errno == EISDIR) {
+ /* @path names a directory, create a file there */
+ /* Make name safe to use with mkstemp by replacing '/' with '_'. */
+ sanitized_name = g_strdup(region_name);
+ for (c = sanitized_name; *c != '\0'; c++) {
+ if (*c == '/') {
+ *c = '_';
+ }
+ }
+
+ filename = g_strdup_printf("%s/qemu_back_mem.%s.XXXXXX", path,
+ sanitized_name);
+ g_free(sanitized_name);
+
+ fd = mkstemp(filename);
+ if (fd >= 0) {
+ unlink(filename);
+ g_free(filename);
+ break;
+ }
+ g_free(filename);
+ }
+ if (errno != EEXIST && errno != EINTR) {
+ return -errno;
+ }
+ /*
+ * Try again on EINTR and EEXIST. The latter happens when
+ * something else creates the file between our two open().
+ */
+ }
+
+ return fd;
+}
+
+static void *file_ram_alloc(RAMBlock *block,
+ ram_addr_t memory,
+ int fd,
+ bool truncate,
+ off_t offset,
+ Error **errp)
+{
+ uint32_t qemu_map_flags;
+ void *area;
+
+ block->page_size = qemu_fd_getpagesize(fd);
+ if (block->mr->align % block->page_size) {
+ error_setg(errp, "alignment 0x%" PRIx64
+ " must be multiples of page size 0x%zx",
+ block->mr->align, block->page_size);
+ return NULL;
+ } else if (block->mr->align && !is_power_of_2(block->mr->align)) {
+ error_setg(errp, "alignment 0x%" PRIx64
+ " must be a power of two", block->mr->align);
+ return NULL;
+ } else if (offset % block->page_size) {
+ error_setg(errp, "offset 0x%" PRIx64
+ " must be multiples of page size 0x%zx",
+ offset, block->page_size);
+ return NULL;
+ }
+ block->mr->align = MAX(block->page_size, block->mr->align);
+#if defined(__s390x__)
+ if (kvm_enabled()) {
+ block->mr->align = MAX(block->mr->align, QEMU_VMALLOC_ALIGN);
+ }
+#endif
+
+ if (memory < block->page_size) {
+ error_setg(errp, "memory size 0x" RAM_ADDR_FMT " must be equal to "
+ "or larger than page size 0x%zx",
+ memory, block->page_size);
+ return NULL;
+ }
+
+ memory = ROUND_UP(memory, block->page_size);
+
+ /*
+ * ftruncate is not supported by hugetlbfs in older
+ * hosts, so don't bother bailing out on errors.
+ * If anything goes wrong with it under other filesystems,
+ * mmap will fail.
+ *
+ * Do not truncate the non-empty backend file to avoid corrupting
+ * the existing data in the file. Disabling shrinking is not
+ * enough. For example, the current vNVDIMM implementation stores
+ * the guest NVDIMM labels at the end of the backend file. If the
+ * backend file is later extended, QEMU will not be able to find
+ * those labels. Therefore, extending the non-empty backend file
+ * is disabled as well.
+ */
+ if (truncate && ftruncate(fd, offset + memory)) {
+ perror("ftruncate");
+ }
+
+ qemu_map_flags = (block->flags & RAM_READONLY) ? QEMU_MAP_READONLY : 0;
+ qemu_map_flags |= (block->flags & RAM_SHARED) ? QEMU_MAP_SHARED : 0;
+ qemu_map_flags |= (block->flags & RAM_PMEM) ? QEMU_MAP_SYNC : 0;
+ qemu_map_flags |= (block->flags & RAM_NORESERVE) ? QEMU_MAP_NORESERVE : 0;
+ area = qemu_ram_mmap(fd, memory, block->mr->align, qemu_map_flags, offset);
+ if (area == MAP_FAILED) {
+ error_setg_errno(errp, errno,
+ "unable to map backing store for guest RAM");
+ return NULL;
+ }
+
+ block->fd = fd;
+ block->fd_offset = offset;
+ return area;
+}
+#endif
+
+/* Allocate space within the ram_addr_t space that governs the
+ * dirty bitmaps.
+ * Called with the ramlist lock held.
+ */
+static ram_addr_t find_ram_offset(ram_addr_t size)
+{
+ RAMBlock *block, *next_block;
+ ram_addr_t offset = RAM_ADDR_MAX, mingap = RAM_ADDR_MAX;
+
+ assert(size != 0); /* it would hand out same offset multiple times */
+
+ if (QLIST_EMPTY_RCU(&ram_list.blocks)) {
+ return 0;
+ }
+
+ RAMBLOCK_FOREACH(block) {
+ ram_addr_t candidate, next = RAM_ADDR_MAX;
+
+ /* Align blocks to start on a 'long' in the bitmap
+ * which makes the bitmap sync'ing take the fast path.
+ */
+ candidate = block->offset + block->max_length;
+ candidate = ROUND_UP(candidate, BITS_PER_LONG << TARGET_PAGE_BITS);
+
+ /* Search for the closest following block
+ * and find the gap.
+ */
+ RAMBLOCK_FOREACH(next_block) {
+ if (next_block->offset >= candidate) {
+ next = MIN(next, next_block->offset);
+ }
+ }
+
+ /* If it fits remember our place and remember the size
+ * of gap, but keep going so that we might find a smaller
+ * gap to fill so avoiding fragmentation.
+ */
+ if (next - candidate >= size && next - candidate < mingap) {
+ offset = candidate;
+ mingap = next - candidate;
+ }
+
+ trace_find_ram_offset_loop(size, candidate, offset, next, mingap);
+ }
+
+ if (offset == RAM_ADDR_MAX) {
+ fprintf(stderr, "Failed to find gap of requested size: %" PRIu64 "\n",
+ (uint64_t)size);
+ abort();
+ }
+
+ trace_find_ram_offset(size, offset);
+
+ return offset;
+}
+
+static unsigned long last_ram_page(void)
+{
+ RAMBlock *block;
+ ram_addr_t last = 0;
+
+ RCU_READ_LOCK_GUARD();
+ RAMBLOCK_FOREACH(block) {
+ last = MAX(last, block->offset + block->max_length);
+ }
+ return last >> TARGET_PAGE_BITS;
+}
+
+static void qemu_ram_setup_dump(void *addr, ram_addr_t size)
+{
+ int ret;
+
+ /* Use MADV_DONTDUMP, if user doesn't want the guest memory in the core */
+ if (!machine_dump_guest_core(current_machine)) {
+ ret = qemu_madvise(addr, size, QEMU_MADV_DONTDUMP);
+ if (ret) {
+ perror("qemu_madvise");
+ fprintf(stderr, "madvise doesn't support MADV_DONTDUMP, "
+ "but dump_guest_core=off specified\n");
+ }
+ }
+}
+
+const char *qemu_ram_get_idstr(RAMBlock *rb)
+{
+ return rb->idstr;
+}
+
+void *qemu_ram_get_host_addr(RAMBlock *rb)
+{
+ return rb->host;
+}
+
+ram_addr_t qemu_ram_get_offset(RAMBlock *rb)
+{
+ return rb->offset;
+}
+
+ram_addr_t qemu_ram_get_used_length(RAMBlock *rb)
+{
+ return rb->used_length;
+}
+
+ram_addr_t qemu_ram_get_max_length(RAMBlock *rb)
+{
+ return rb->max_length;
+}
+
+bool qemu_ram_is_shared(RAMBlock *rb)
+{
+ return rb->flags & RAM_SHARED;
+}
+
+bool qemu_ram_is_noreserve(RAMBlock *rb)
+{
+ return rb->flags & RAM_NORESERVE;
+}
+
+/* Note: Only set at the start of postcopy */
+bool qemu_ram_is_uf_zeroable(RAMBlock *rb)
+{
+ return rb->flags & RAM_UF_ZEROPAGE;
+}
+
+void qemu_ram_set_uf_zeroable(RAMBlock *rb)
+{
+ rb->flags |= RAM_UF_ZEROPAGE;
+}
+
+bool qemu_ram_is_migratable(RAMBlock *rb)
+{
+ return rb->flags & RAM_MIGRATABLE;
+}
+
+void qemu_ram_set_migratable(RAMBlock *rb)
+{
+ rb->flags |= RAM_MIGRATABLE;
+}
+
+void qemu_ram_unset_migratable(RAMBlock *rb)
+{
+ rb->flags &= ~RAM_MIGRATABLE;
+}
+
+bool qemu_ram_is_named_file(RAMBlock *rb)
+{
+ return rb->flags & RAM_NAMED_FILE;
+}
+
+int qemu_ram_get_fd(RAMBlock *rb)
+{
+ return rb->fd;
+}
+
+/* Called with iothread lock held. */
+void qemu_ram_set_idstr(RAMBlock *new_block, const char *name, DeviceState *dev)
+{
+ RAMBlock *block;
+
+ assert(new_block);
+ assert(!new_block->idstr[0]);
+
+ if (dev) {
+ char *id = qdev_get_dev_path(dev);
+ if (id) {
+ snprintf(new_block->idstr, sizeof(new_block->idstr), "%s/", id);
+ g_free(id);
+ }
+ }
+ pstrcat(new_block->idstr, sizeof(new_block->idstr), name);
+
+ RCU_READ_LOCK_GUARD();
+ RAMBLOCK_FOREACH(block) {
+ if (block != new_block &&
+ !strcmp(block->idstr, new_block->idstr)) {
+ fprintf(stderr, "RAMBlock \"%s\" already registered, abort!\n",
+ new_block->idstr);
+ abort();
+ }
+ }
+}
+
+/* Called with iothread lock held. */
+void qemu_ram_unset_idstr(RAMBlock *block)
+{
+ /* FIXME: arch_init.c assumes that this is not called throughout
+ * migration. Ignore the problem since hot-unplug during migration
+ * does not work anyway.
+ */
+ if (block) {
+ memset(block->idstr, 0, sizeof(block->idstr));
+ }
+}
+
+size_t qemu_ram_pagesize(RAMBlock *rb)
+{
+ return rb->page_size;
+}
+
+/* Returns the largest size of page in use */
+size_t qemu_ram_pagesize_largest(void)
+{
+ RAMBlock *block;
+ size_t largest = 0;
+
+ RAMBLOCK_FOREACH(block) {
+ largest = MAX(largest, qemu_ram_pagesize(block));
+ }
+
+ return largest;
+}
+
+static int memory_try_enable_merging(void *addr, size_t len)
+{
+ if (!machine_mem_merge(current_machine)) {
+ /* disabled by the user */
+ return 0;
+ }
+
+ return qemu_madvise(addr, len, QEMU_MADV_MERGEABLE);
+}
+
+/*
+ * Resizing RAM while migrating can result in the migration being canceled.
+ * Care has to be taken if the guest might have already detected the memory.
+ *
+ * As memory core doesn't know how is memory accessed, it is up to
+ * resize callback to update device state and/or add assertions to detect
+ * misuse, if necessary.
+ */
+int qemu_ram_resize(RAMBlock *block, ram_addr_t newsize, Error **errp)
+{
+ const ram_addr_t oldsize = block->used_length;
+ const ram_addr_t unaligned_size = newsize;
+
+ assert(block);
+
+ newsize = HOST_PAGE_ALIGN(newsize);
+
+ if (block->used_length == newsize) {
+ /*
+ * We don't have to resize the ram block (which only knows aligned
+ * sizes), however, we have to notify if the unaligned size changed.
+ */
+ if (unaligned_size != memory_region_size(block->mr)) {
+ memory_region_set_size(block->mr, unaligned_size);
+ if (block->resized) {
+ block->resized(block->idstr, unaligned_size, block->host);
+ }
+ }
+ return 0;
+ }
+
+ if (!(block->flags & RAM_RESIZEABLE)) {
+ error_setg_errno(errp, EINVAL,
+ "Size mismatch: %s: 0x" RAM_ADDR_FMT
+ " != 0x" RAM_ADDR_FMT, block->idstr,
+ newsize, block->used_length);
+ return -EINVAL;
+ }
+
+ if (block->max_length < newsize) {
+ error_setg_errno(errp, EINVAL,
+ "Size too large: %s: 0x" RAM_ADDR_FMT
+ " > 0x" RAM_ADDR_FMT, block->idstr,
+ newsize, block->max_length);
+ return -EINVAL;
+ }
+
+ /* Notify before modifying the ram block and touching the bitmaps. */
+ if (block->host) {
+ ram_block_notify_resize(block->host, oldsize, newsize);
+ }
+
+ cpu_physical_memory_clear_dirty_range(block->offset, block->used_length);
+ block->used_length = newsize;
+ cpu_physical_memory_set_dirty_range(block->offset, block->used_length,
+ DIRTY_CLIENTS_ALL);
+ memory_region_set_size(block->mr, unaligned_size);
+ if (block->resized) {
+ block->resized(block->idstr, unaligned_size, block->host);
+ }
+ return 0;
+}
+
+/*
+ * Trigger sync on the given ram block for range [start, start + length]
+ * with the backing store if one is available.
+ * Otherwise no-op.
+ * @Note: this is supposed to be a synchronous op.
+ */
+void qemu_ram_msync(RAMBlock *block, ram_addr_t start, ram_addr_t length)
+{
+ /* The requested range should fit in within the block range */
+ g_assert((start + length) <= block->used_length);
+
+#ifdef CONFIG_LIBPMEM
+ /* The lack of support for pmem should not block the sync */
+ if (ramblock_is_pmem(block)) {
+ void *addr = ramblock_ptr(block, start);
+ pmem_persist(addr, length);
+ return;
+ }
+#endif
+ if (block->fd >= 0) {
+ /**
+ * Case there is no support for PMEM or the memory has not been
+ * specified as persistent (or is not one) - use the msync.
+ * Less optimal but still achieves the same goal
+ */
+ void *addr = ramblock_ptr(block, start);
+ if (qemu_msync(addr, length, block->fd)) {
+ warn_report("%s: failed to sync memory range: start: "
+ RAM_ADDR_FMT " length: " RAM_ADDR_FMT,
+ __func__, start, length);
+ }
+ }
+}
+
+/* Called with ram_list.mutex held */
+static void dirty_memory_extend(ram_addr_t old_ram_size,
+ ram_addr_t new_ram_size)
+{
+ ram_addr_t old_num_blocks = DIV_ROUND_UP(old_ram_size,
+ DIRTY_MEMORY_BLOCK_SIZE);
+ ram_addr_t new_num_blocks = DIV_ROUND_UP(new_ram_size,
+ DIRTY_MEMORY_BLOCK_SIZE);
+ int i;
+
+ /* Only need to extend if block count increased */
+ if (new_num_blocks <= old_num_blocks) {
+ return;
+ }
+
+ for (i = 0; i < DIRTY_MEMORY_NUM; i++) {
+ DirtyMemoryBlocks *old_blocks;
+ DirtyMemoryBlocks *new_blocks;
+ int j;
+
+ old_blocks = qatomic_rcu_read(&ram_list.dirty_memory[i]);
+ new_blocks = g_malloc(sizeof(*new_blocks) +
+ sizeof(new_blocks->blocks[0]) * new_num_blocks);
+
+ if (old_num_blocks) {
+ memcpy(new_blocks->blocks, old_blocks->blocks,
+ old_num_blocks * sizeof(old_blocks->blocks[0]));
+ }
+
+ for (j = old_num_blocks; j < new_num_blocks; j++) {
+ new_blocks->blocks[j] = bitmap_new(DIRTY_MEMORY_BLOCK_SIZE);
+ }
+
+ qatomic_rcu_set(&ram_list.dirty_memory[i], new_blocks);
+
+ if (old_blocks) {
+ g_free_rcu(old_blocks, rcu);
+ }
+ }
+}
+
+static void ram_block_add(RAMBlock *new_block, Error **errp)
+{
+ const bool noreserve = qemu_ram_is_noreserve(new_block);
+ const bool shared = qemu_ram_is_shared(new_block);
+ RAMBlock *block;
+ RAMBlock *last_block = NULL;
+ ram_addr_t old_ram_size, new_ram_size;
+ Error *err = NULL;
+
+ old_ram_size = last_ram_page();
+
+ qemu_mutex_lock_ramlist();
+ new_block->offset = find_ram_offset(new_block->max_length);
+
+ if (!new_block->host) {
+ if (xen_enabled()) {
+ xen_ram_alloc(new_block->offset, new_block->max_length,
+ new_block->mr, &err);
+ if (err) {
+ error_propagate(errp, err);
+ qemu_mutex_unlock_ramlist();
+ return;
+ }
+ } else {
+ new_block->host = qemu_anon_ram_alloc(new_block->max_length,
+ &new_block->mr->align,
+ shared, noreserve);
+ if (!new_block->host) {
+ error_setg_errno(errp, errno,
+ "cannot set up guest memory '%s'",
+ memory_region_name(new_block->mr));
+ qemu_mutex_unlock_ramlist();
+ return;
+ }
+ memory_try_enable_merging(new_block->host, new_block->max_length);
+ }
+ }
+
+ new_ram_size = MAX(old_ram_size,
+ (new_block->offset + new_block->max_length) >> TARGET_PAGE_BITS);
+ if (new_ram_size > old_ram_size) {
+ dirty_memory_extend(old_ram_size, new_ram_size);
+ }
+ /* Keep the list sorted from biggest to smallest block. Unlike QTAILQ,
+ * QLIST (which has an RCU-friendly variant) does not have insertion at
+ * tail, so save the last element in last_block.
+ */
+ RAMBLOCK_FOREACH(block) {
+ last_block = block;
+ if (block->max_length < new_block->max_length) {
+ break;
+ }
+ }
+ if (block) {
+ QLIST_INSERT_BEFORE_RCU(block, new_block, next);
+ } else if (last_block) {
+ QLIST_INSERT_AFTER_RCU(last_block, new_block, next);
+ } else { /* list is empty */
+ QLIST_INSERT_HEAD_RCU(&ram_list.blocks, new_block, next);
+ }
+ ram_list.mru_block = NULL;
+
+ /* Write list before version */
+ smp_wmb();
+ ram_list.version++;
+ qemu_mutex_unlock_ramlist();
+
+ cpu_physical_memory_set_dirty_range(new_block->offset,
+ new_block->used_length,
+ DIRTY_CLIENTS_ALL);
+
+ if (new_block->host) {
+ qemu_ram_setup_dump(new_block->host, new_block->max_length);
+ qemu_madvise(new_block->host, new_block->max_length, QEMU_MADV_HUGEPAGE);
+ /*
+ * MADV_DONTFORK is also needed by KVM in absence of synchronous MMU
+ * Configure it unless the machine is a qtest server, in which case
+ * KVM is not used and it may be forked (eg for fuzzing purposes).
+ */
+ if (!qtest_enabled()) {
+ qemu_madvise(new_block->host, new_block->max_length,
+ QEMU_MADV_DONTFORK);
+ }
+ ram_block_notify_add(new_block->host, new_block->used_length,
+ new_block->max_length);
+ }
+}
+
+#ifdef CONFIG_POSIX
+RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
+ uint32_t ram_flags, int fd, off_t offset,
+ Error **errp)
+{
+ RAMBlock *new_block;
+ Error *local_err = NULL;
+ int64_t file_size, file_align;
+
+ /* Just support these ram flags by now. */
+ assert((ram_flags & ~(RAM_SHARED | RAM_PMEM | RAM_NORESERVE |
+ RAM_PROTECTED | RAM_NAMED_FILE | RAM_READONLY |
+ RAM_READONLY_FD)) == 0);
+
+ if (xen_enabled()) {
+ error_setg(errp, "-mem-path not supported with Xen");
+ return NULL;
+ }
+
+ if (kvm_enabled() && !kvm_has_sync_mmu()) {
+ error_setg(errp,
+ "host lacks kvm mmu notifiers, -mem-path unsupported");
+ return NULL;
+ }
+
+ size = HOST_PAGE_ALIGN(size);
+ file_size = get_file_size(fd);
+ if (file_size > offset && file_size < (offset + size)) {
+ error_setg(errp, "backing store size 0x%" PRIx64
+ " does not match 'size' option 0x" RAM_ADDR_FMT,
+ file_size, size);
+ return NULL;
+ }
+
+ file_align = get_file_align(fd);
+ if (file_align > 0 && file_align > mr->align) {
+ error_setg(errp, "backing store align 0x%" PRIx64
+ " is larger than 'align' option 0x%" PRIx64,
+ file_align, mr->align);
+ return NULL;
+ }
+
+ new_block = g_malloc0(sizeof(*new_block));
+ new_block->mr = mr;
+ new_block->used_length = size;
+ new_block->max_length = size;
+ new_block->flags = ram_flags;
+ new_block->host = file_ram_alloc(new_block, size, fd, !file_size, offset,
+ errp);
+ if (!new_block->host) {
+ g_free(new_block);
+ return NULL;
+ }
+
+ ram_block_add(new_block, &local_err);
+ if (local_err) {
+ g_free(new_block);
+ error_propagate(errp, local_err);
+ return NULL;
+ }
+ return new_block;
+
+}
+
+
+RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
+ uint32_t ram_flags, const char *mem_path,
+ off_t offset, Error **errp)
+{
+ int fd;
+ bool created;
+ RAMBlock *block;
+
+ fd = file_ram_open(mem_path, memory_region_name(mr),
+ !!(ram_flags & RAM_READONLY_FD), &created);
+ if (fd < 0) {
+ error_setg_errno(errp, -fd, "can't open backing store %s for guest RAM",
+ mem_path);
+ if (!(ram_flags & RAM_READONLY_FD) && !(ram_flags & RAM_SHARED) &&
+ fd == -EACCES) {
+ /*
+ * If we can open the file R/O (note: will never create a new file)
+ * and we are dealing with a private mapping, there are still ways
+ * to consume such files and get RAM instead of ROM.
+ */
+ fd = file_ram_open(mem_path, memory_region_name(mr), true,
+ &created);
+ if (fd < 0) {
+ return NULL;
+ }
+ assert(!created);
+ close(fd);
+ error_append_hint(errp, "Consider opening the backing store"
+ " read-only but still creating writable RAM using"
+ " '-object memory-backend-file,readonly=on,rom=off...'"
+ " (see \"VM templating\" documentation)\n");
+ }
+ return NULL;
+ }
+
+ block = qemu_ram_alloc_from_fd(size, mr, ram_flags, fd, offset, errp);
+ if (!block) {
+ if (created) {
+ unlink(mem_path);
+ }
+ close(fd);
+ return NULL;
+ }
+
+ return block;
+}
+#endif
+
+static
+RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
+ void (*resized)(const char*,
+ uint64_t length,
+ void *host),
+ void *host, uint32_t ram_flags,
+ MemoryRegion *mr, Error **errp)
+{
+ RAMBlock *new_block;
+ Error *local_err = NULL;
+
+ assert((ram_flags & ~(RAM_SHARED | RAM_RESIZEABLE | RAM_PREALLOC |
+ RAM_NORESERVE)) == 0);
+ assert(!host ^ (ram_flags & RAM_PREALLOC));
+
+ size = HOST_PAGE_ALIGN(size);
+ max_size = HOST_PAGE_ALIGN(max_size);
+ new_block = g_malloc0(sizeof(*new_block));
+ new_block->mr = mr;
+ new_block->resized = resized;
+ new_block->used_length = size;
+ new_block->max_length = max_size;
+ assert(max_size >= size);
+ new_block->fd = -1;
+ new_block->page_size = qemu_real_host_page_size();
+ new_block->host = host;
+ new_block->flags = ram_flags;
+ ram_block_add(new_block, &local_err);
+ if (local_err) {
+ g_free(new_block);
+ error_propagate(errp, local_err);
+ return NULL;
+ }
+ return new_block;
+}
+
+RAMBlock *qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
+ MemoryRegion *mr, Error **errp)
+{
+ return qemu_ram_alloc_internal(size, size, NULL, host, RAM_PREALLOC, mr,
+ errp);
+}
+
+RAMBlock *qemu_ram_alloc(ram_addr_t size, uint32_t ram_flags,
+ MemoryRegion *mr, Error **errp)
+{
+ assert((ram_flags & ~(RAM_SHARED | RAM_NORESERVE)) == 0);
+ return qemu_ram_alloc_internal(size, size, NULL, NULL, ram_flags, mr, errp);
+}
+
+RAMBlock *qemu_ram_alloc_resizeable(ram_addr_t size, ram_addr_t maxsz,
+ void (*resized)(const char*,
+ uint64_t length,
+ void *host),
+ MemoryRegion *mr, Error **errp)
+{
+ return qemu_ram_alloc_internal(size, maxsz, resized, NULL,
+ RAM_RESIZEABLE, mr, errp);
+}
+
+static void reclaim_ramblock(RAMBlock *block)
+{
+ if (block->flags & RAM_PREALLOC) {
+ ;
+ } else if (xen_enabled()) {
+ xen_invalidate_map_cache_entry(block->host);
+#ifndef _WIN32
+ } else if (block->fd >= 0) {
+ qemu_ram_munmap(block->fd, block->host, block->max_length);
+ close(block->fd);
+#endif
+ } else {
+ qemu_anon_ram_free(block->host, block->max_length);
+ }
+ g_free(block);
+}
+
+void qemu_ram_free(RAMBlock *block)
+{
+ if (!block) {
+ return;
+ }
+
+ if (block->host) {
+ ram_block_notify_remove(block->host, block->used_length,
+ block->max_length);
+ }
+
+ qemu_mutex_lock_ramlist();
+ QLIST_REMOVE_RCU(block, next);
+ ram_list.mru_block = NULL;
+ /* Write list before version */
+ smp_wmb();
+ ram_list.version++;
+ call_rcu(block, reclaim_ramblock, rcu);
+ qemu_mutex_unlock_ramlist();
+}
+
+#ifndef _WIN32
+void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
+{
+ RAMBlock *block;
+ ram_addr_t offset;
+ int flags;
+ void *area, *vaddr;
+ int prot;
+
+ RAMBLOCK_FOREACH(block) {
+ offset = addr - block->offset;
+ if (offset < block->max_length) {
+ vaddr = ramblock_ptr(block, offset);
+ if (block->flags & RAM_PREALLOC) {
+ ;
+ } else if (xen_enabled()) {
+ abort();
+ } else {
+ flags = MAP_FIXED;
+ flags |= block->flags & RAM_SHARED ?
+ MAP_SHARED : MAP_PRIVATE;
+ flags |= block->flags & RAM_NORESERVE ? MAP_NORESERVE : 0;
+ prot = PROT_READ;
+ prot |= block->flags & RAM_READONLY ? 0 : PROT_WRITE;
+ if (block->fd >= 0) {
+ area = mmap(vaddr, length, prot, flags, block->fd,
+ offset + block->fd_offset);
+ } else {
+ flags |= MAP_ANONYMOUS;
+ area = mmap(vaddr, length, prot, flags, -1, 0);
+ }
+ if (area != vaddr) {
+ error_report("Could not remap addr: "
+ RAM_ADDR_FMT "@" RAM_ADDR_FMT "",
+ length, addr);
+ exit(1);
+ }
+ memory_try_enable_merging(vaddr, length);
+ qemu_ram_setup_dump(vaddr, length);
+ }
+ }
+ }
+}
+#endif /* !_WIN32 */
+
+/* Return a host pointer to ram allocated with qemu_ram_alloc.
+ * This should not be used for general purpose DMA. Use address_space_map
+ * or address_space_rw instead. For local memory (e.g. video ram) that the
+ * device owns, use memory_region_get_ram_ptr.
+ *
+ * Called within RCU critical section.
+ */
+void *qemu_map_ram_ptr(RAMBlock *ram_block, ram_addr_t addr)
+{
+ RAMBlock *block = ram_block;
+
+ if (block == NULL) {
+ block = qemu_get_ram_block(addr);
+ addr -= block->offset;
+ }
+
+ if (xen_enabled() && block->host == NULL) {
+ /* We need to check if the requested address is in the RAM
+ * because we don't want to map the entire memory in QEMU.
+ * In that case just map until the end of the page.
+ */
+ if (block->offset == 0) {
+ return xen_map_cache(addr, 0, 0, false);
+ }
+
+ block->host = xen_map_cache(block->offset, block->max_length, 1, false);
+ }
+ return ramblock_ptr(block, addr);
+}
+
+/* Return a host pointer to guest's ram. Similar to qemu_map_ram_ptr
+ * but takes a size argument.
+ *
+ * Called within RCU critical section.
+ */
+static void *qemu_ram_ptr_length(RAMBlock *ram_block, ram_addr_t addr,
+ hwaddr *size, bool lock)
+{
+ RAMBlock *block = ram_block;
+ if (*size == 0) {
+ return NULL;
+ }
+
+ if (block == NULL) {
+ block = qemu_get_ram_block(addr);
+ addr -= block->offset;
+ }
+ *size = MIN(*size, block->max_length - addr);
+
+ if (xen_enabled() && block->host == NULL) {
+ /* We need to check if the requested address is in the RAM
+ * because we don't want to map the entire memory in QEMU.
+ * In that case just map the requested area.
+ */
+ if (block->offset == 0) {
+ return xen_map_cache(addr, *size, lock, lock);
+ }
+
+ block->host = xen_map_cache(block->offset, block->max_length, 1, lock);
+ }
+
+ return ramblock_ptr(block, addr);
+}
+
+/* Return the offset of a hostpointer within a ramblock */
+ram_addr_t qemu_ram_block_host_offset(RAMBlock *rb, void *host)
+{
+ ram_addr_t res = (uint8_t *)host - (uint8_t *)rb->host;
+ assert((uintptr_t)host >= (uintptr_t)rb->host);
+ assert(res < rb->max_length);
+
+ return res;
+}
+
+/*
+ * Translates a host ptr back to a RAMBlock, a ram_addr and an offset
+ * in that RAMBlock.
+ *
+ * ptr: Host pointer to look up
+ * round_offset: If true round the result offset down to a page boundary
+ * *ram_addr: set to result ram_addr
+ * *offset: set to result offset within the RAMBlock
+ *
+ * Returns: RAMBlock (or NULL if not found)
+ *
+ * By the time this function returns, the returned pointer is not protected
+ * by RCU anymore. If the caller is not within an RCU critical section and
+ * does not hold the iothread lock, it must have other means of protecting the
+ * pointer, such as a reference to the region that includes the incoming
+ * ram_addr_t.
+ */
+RAMBlock *qemu_ram_block_from_host(void *ptr, bool round_offset,
+ ram_addr_t *offset)
+{
+ RAMBlock *block;
+ uint8_t *host = ptr;
+
+ if (xen_enabled()) {
+ ram_addr_t ram_addr;
+ RCU_READ_LOCK_GUARD();
+ ram_addr = xen_ram_addr_from_mapcache(ptr);
+ block = qemu_get_ram_block(ram_addr);
+ if (block) {
+ *offset = ram_addr - block->offset;
+ }
+ return block;
+ }
+
+ RCU_READ_LOCK_GUARD();
+ block = qatomic_rcu_read(&ram_list.mru_block);
+ if (block && block->host && host - block->host < block->max_length) {
+ goto found;
+ }
+
+ RAMBLOCK_FOREACH(block) {
+ /* This case append when the block is not mapped. */
+ if (block->host == NULL) {
+ continue;
+ }
+ if (host - block->host < block->max_length) {
+ goto found;
+ }
+ }
+
+ return NULL;
+
+found:
+ *offset = (host - block->host);
+ if (round_offset) {
+ *offset &= TARGET_PAGE_MASK;
+ }
+ return block;
+}
+
+/*
+ * Finds the named RAMBlock
+ *
+ * name: The name of RAMBlock to find
+ *
+ * Returns: RAMBlock (or NULL if not found)
+ */
+RAMBlock *qemu_ram_block_by_name(const char *name)
+{
+ RAMBlock *block;
+
+ RAMBLOCK_FOREACH(block) {
+ if (!strcmp(name, block->idstr)) {
+ return block;
+ }
+ }
+
+ return NULL;
+}
+
+/*
+ * Some of the system routines need to translate from a host pointer
+ * (typically a TLB entry) back to a ram offset.
+ */
+ram_addr_t qemu_ram_addr_from_host(void *ptr)
+{
+ RAMBlock *block;
+ ram_addr_t offset;
+
+ block = qemu_ram_block_from_host(ptr, false, &offset);
+ if (!block) {
+ return RAM_ADDR_INVALID;
+ }
+
+ return block->offset + offset;
+}
+
+ram_addr_t qemu_ram_addr_from_host_nofail(void *ptr)
+{
+ ram_addr_t ram_addr;
+
+ ram_addr = qemu_ram_addr_from_host(ptr);
+ if (ram_addr == RAM_ADDR_INVALID) {
+ error_report("Bad ram pointer %p", ptr);
+ abort();
+ }
+ return ram_addr;
+}
+
+static MemTxResult flatview_read(FlatView *fv, hwaddr addr,
+ MemTxAttrs attrs, void *buf, hwaddr len);
+static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
+ const void *buf, hwaddr len);
+static bool flatview_access_valid(FlatView *fv, hwaddr addr, hwaddr len,
+ bool is_write, MemTxAttrs attrs);
+
+static MemTxResult subpage_read(void *opaque, hwaddr addr, uint64_t *data,
+ unsigned len, MemTxAttrs attrs)
+{
+ subpage_t *subpage = opaque;
+ uint8_t buf[8];
+ MemTxResult res;
+
+#if defined(DEBUG_SUBPAGE)
+ printf("%s: subpage %p len %u addr " HWADDR_FMT_plx "\n", __func__,
+ subpage, len, addr);
+#endif
+ res = flatview_read(subpage->fv, addr + subpage->base, attrs, buf, len);
+ if (res) {
+ return res;
+ }
+ *data = ldn_p(buf, len);
+ return MEMTX_OK;
+}
+
+static MemTxResult subpage_write(void *opaque, hwaddr addr,
+ uint64_t value, unsigned len, MemTxAttrs attrs)
+{
+ subpage_t *subpage = opaque;
+ uint8_t buf[8];
+
+#if defined(DEBUG_SUBPAGE)
+ printf("%s: subpage %p len %u addr " HWADDR_FMT_plx
+ " value %"PRIx64"\n",
+ __func__, subpage, len, addr, value);
+#endif
+ stn_p(buf, len, value);
+ return flatview_write(subpage->fv, addr + subpage->base, attrs, buf, len);
+}
+
+static bool subpage_accepts(void *opaque, hwaddr addr,
+ unsigned len, bool is_write,
+ MemTxAttrs attrs)
+{
+ subpage_t *subpage = opaque;
+#if defined(DEBUG_SUBPAGE)
+ printf("%s: subpage %p %c len %u addr " HWADDR_FMT_plx "\n",
+ __func__, subpage, is_write ? 'w' : 'r', len, addr);
+#endif
+
+ return flatview_access_valid(subpage->fv, addr + subpage->base,
+ len, is_write, attrs);
+}
+
+static const MemoryRegionOps subpage_ops = {
+ .read_with_attrs = subpage_read,
+ .write_with_attrs = subpage_write,
+ .impl.min_access_size = 1,
+ .impl.max_access_size = 8,
+ .valid.min_access_size = 1,
+ .valid.max_access_size = 8,
+ .valid.accepts = subpage_accepts,
+ .endianness = DEVICE_NATIVE_ENDIAN,
+};
+
+static int subpage_register(subpage_t *mmio, uint32_t start, uint32_t end,
+ uint16_t section)
+{
+ int idx, eidx;
+
+ if (start >= TARGET_PAGE_SIZE || end >= TARGET_PAGE_SIZE)
+ return -1;
+ idx = SUBPAGE_IDX(start);
+ eidx = SUBPAGE_IDX(end);
+#if defined(DEBUG_SUBPAGE)
+ printf("%s: %p start %08x end %08x idx %08x eidx %08x section %d\n",
+ __func__, mmio, start, end, idx, eidx, section);
+#endif
+ for (; idx <= eidx; idx++) {
+ mmio->sub_section[idx] = section;
+ }
+
+ return 0;
+}
+
+static subpage_t *subpage_init(FlatView *fv, hwaddr base)
+{
+ subpage_t *mmio;
+
+ /* mmio->sub_section is set to PHYS_SECTION_UNASSIGNED with g_malloc0 */
+ mmio = g_malloc0(sizeof(subpage_t) + TARGET_PAGE_SIZE * sizeof(uint16_t));
+ mmio->fv = fv;
+ mmio->base = base;
+ memory_region_init_io(&mmio->iomem, NULL, &subpage_ops, mmio,
+ NULL, TARGET_PAGE_SIZE);
+ mmio->iomem.subpage = true;
+#if defined(DEBUG_SUBPAGE)
+ printf("%s: %p base " HWADDR_FMT_plx " len %08x\n", __func__,
+ mmio, base, TARGET_PAGE_SIZE);
+#endif
+
+ return mmio;
+}
+
+static uint16_t dummy_section(PhysPageMap *map, FlatView *fv, MemoryRegion *mr)
+{
+ assert(fv);
+ MemoryRegionSection section = {
+ .fv = fv,
+ .mr = mr,
+ .offset_within_address_space = 0,
+ .offset_within_region = 0,
+ .size = int128_2_64(),
+ };
+
+ return phys_section_add(map, &section);
+}
+
+MemoryRegionSection *iotlb_to_section(CPUState *cpu,
+ hwaddr index, MemTxAttrs attrs)
+{
+ int asidx = cpu_asidx_from_attrs(cpu, attrs);
+ CPUAddressSpace *cpuas = &cpu->cpu_ases[asidx];
+ AddressSpaceDispatch *d = cpuas->memory_dispatch;
+ int section_index = index & ~TARGET_PAGE_MASK;
+ MemoryRegionSection *ret;
+
+ assert(section_index < d->map.sections_nb);
+ ret = d->map.sections + section_index;
+ assert(ret->mr);
+ assert(ret->mr->ops);
+
+ return ret;
+}
+
+static void io_mem_init(void)
+{
+ memory_region_init_io(&io_mem_unassigned, NULL, &unassigned_mem_ops, NULL,
+ NULL, UINT64_MAX);
+}
+
+AddressSpaceDispatch *address_space_dispatch_new(FlatView *fv)
+{
+ AddressSpaceDispatch *d = g_new0(AddressSpaceDispatch, 1);
+ uint16_t n;
+
+ n = dummy_section(&d->map, fv, &io_mem_unassigned);
+ assert(n == PHYS_SECTION_UNASSIGNED);
+
+ d->phys_map = (PhysPageEntry) { .ptr = PHYS_MAP_NODE_NIL, .skip = 1 };
+
+ return d;
+}
+
+void address_space_dispatch_free(AddressSpaceDispatch *d)
+{
+ phys_sections_free(&d->map);
+ g_free(d);
+}
+
+static void do_nothing(CPUState *cpu, run_on_cpu_data d)
+{
+}
+
+static void tcg_log_global_after_sync(MemoryListener *listener)
+{
+ CPUAddressSpace *cpuas;
+
+ /* Wait for the CPU to end the current TB. This avoids the following
+ * incorrect race:
+ *
+ * vCPU migration
+ * ---------------------- -------------------------
+ * TLB check -> slow path
+ * notdirty_mem_write
+ * write to RAM
+ * mark dirty
+ * clear dirty flag
+ * TLB check -> fast path
+ * read memory
+ * write to RAM
+ *
+ * by pushing the migration thread's memory read after the vCPU thread has
+ * written the memory.
+ */
+ if (replay_mode == REPLAY_MODE_NONE) {
+ /*
+ * VGA can make calls to this function while updating the screen.
+ * In record/replay mode this causes a deadlock, because
+ * run_on_cpu waits for rr mutex. Therefore no races are possible
+ * in this case and no need for making run_on_cpu when
+ * record/replay is enabled.
+ */
+ cpuas = container_of(listener, CPUAddressSpace, tcg_as_listener);
+ run_on_cpu(cpuas->cpu, do_nothing, RUN_ON_CPU_NULL);
+ }
+}
+
+static void tcg_commit_cpu(CPUState *cpu, run_on_cpu_data data)
+{
+ CPUAddressSpace *cpuas = data.host_ptr;
+
+ cpuas->memory_dispatch = address_space_to_dispatch(cpuas->as);
+ tlb_flush(cpu);
+}
+
+static void tcg_commit(MemoryListener *listener)
+{
+ CPUAddressSpace *cpuas;
+ CPUState *cpu;
+
+ assert(tcg_enabled());
+ /* since each CPU stores ram addresses in its TLB cache, we must
+ reset the modified entries */
+ cpuas = container_of(listener, CPUAddressSpace, tcg_as_listener);
+ cpu = cpuas->cpu;
+
+ /*
+ * Defer changes to as->memory_dispatch until the cpu is quiescent.
+ * Otherwise we race between (1) other cpu threads and (2) ongoing
+ * i/o for the current cpu thread, with data cached by mmu_lookup().
+ *
+ * In addition, queueing the work function will kick the cpu back to
+ * the main loop, which will end the RCU critical section and reclaim
+ * the memory data structures.
+ *
+ * That said, the listener is also called during realize, before
+ * all of the tcg machinery for run-on is initialized: thus halt_cond.
+ */
+ if (cpu->halt_cond) {
+ async_run_on_cpu(cpu, tcg_commit_cpu, RUN_ON_CPU_HOST_PTR(cpuas));
+ } else {
+ tcg_commit_cpu(cpu, RUN_ON_CPU_HOST_PTR(cpuas));
+ }
+}
+
+static void memory_map_init(void)
+{
+ system_memory = g_malloc(sizeof(*system_memory));
+
+ memory_region_init(system_memory, NULL, "system", UINT64_MAX);
+ address_space_init(&address_space_memory, system_memory, "memory");
+
+ system_io = g_malloc(sizeof(*system_io));
+ memory_region_init_io(system_io, NULL, &unassigned_io_ops, NULL, "io",
+ 65536);
+ address_space_init(&address_space_io, system_io, "I/O");
+}
+
+MemoryRegion *get_system_memory(void)
+{
+ return system_memory;
+}
+
+MemoryRegion *get_system_io(void)
+{
+ return system_io;
+}
+
+static void invalidate_and_set_dirty(MemoryRegion *mr, hwaddr addr,
+ hwaddr length)
+{
+ uint8_t dirty_log_mask = memory_region_get_dirty_log_mask(mr);
+ addr += memory_region_get_ram_addr(mr);
+
+ /* No early return if dirty_log_mask is or becomes 0, because
+ * cpu_physical_memory_set_dirty_range will still call
+ * xen_modified_memory.
+ */
+ if (dirty_log_mask) {
+ dirty_log_mask =
+ cpu_physical_memory_range_includes_clean(addr, length, dirty_log_mask);
+ }
+ if (dirty_log_mask & (1 << DIRTY_MEMORY_CODE)) {
+ assert(tcg_enabled());
+ tb_invalidate_phys_range(addr, addr + length - 1);
+ dirty_log_mask &= ~(1 << DIRTY_MEMORY_CODE);
+ }
+ cpu_physical_memory_set_dirty_range(addr, length, dirty_log_mask);
+}
+
+void memory_region_flush_rom_device(MemoryRegion *mr, hwaddr addr, hwaddr size)
+{
+ /*
+ * In principle this function would work on other memory region types too,
+ * but the ROM device use case is the only one where this operation is
+ * necessary. Other memory regions should use the
+ * address_space_read/write() APIs.
+ */
+ assert(memory_region_is_romd(mr));
+
+ invalidate_and_set_dirty(mr, addr, size);
+}
+
+int memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr)
+{
+ unsigned access_size_max = mr->ops->valid.max_access_size;
+
+ /* Regions are assumed to support 1-4 byte accesses unless
+ otherwise specified. */
+ if (access_size_max == 0) {
+ access_size_max = 4;
+ }
+
+ /* Bound the maximum access by the alignment of the address. */
+ if (!mr->ops->impl.unaligned) {
+ unsigned align_size_max = addr & -addr;
+ if (align_size_max != 0 && align_size_max < access_size_max) {
+ access_size_max = align_size_max;
+ }
+ }
+
+ /* Don't attempt accesses larger than the maximum. */
+ if (l > access_size_max) {
+ l = access_size_max;
+ }
+ l = pow2floor(l);
+
+ return l;
+}
+
+bool prepare_mmio_access(MemoryRegion *mr)
+{
+ bool release_lock = false;
+
+ if (!qemu_mutex_iothread_locked()) {
+ qemu_mutex_lock_iothread();
+ release_lock = true;
+ }
+ if (mr->flush_coalesced_mmio) {
+ qemu_flush_coalesced_mmio_buffer();
+ }
+
+ return release_lock;
+}
+
+/**
+ * flatview_access_allowed
+ * @mr: #MemoryRegion to be accessed
+ * @attrs: memory transaction attributes
+ * @addr: address within that memory region
+ * @len: the number of bytes to access
+ *
+ * Check if a memory transaction is allowed.
+ *
+ * Returns: true if transaction is allowed, false if denied.
+ */
+static bool flatview_access_allowed(MemoryRegion *mr, MemTxAttrs attrs,
+ hwaddr addr, hwaddr len)
+{
+ if (likely(!attrs.memory)) {
+ return true;
+ }
+ if (memory_region_is_ram(mr)) {
+ return true;
+ }
+ qemu_log_mask(LOG_GUEST_ERROR,
+ "Invalid access to non-RAM device at "
+ "addr 0x%" HWADDR_PRIX ", size %" HWADDR_PRIu ", "
+ "region '%s'\n", addr, len, memory_region_name(mr));
+ return false;
+}
+
+/* Called within RCU critical section. */
+static MemTxResult flatview_write_continue(FlatView *fv, hwaddr addr,
+ MemTxAttrs attrs,
+ const void *ptr,
+ hwaddr len, hwaddr addr1,
+ hwaddr l, MemoryRegion *mr)
+{
+ uint8_t *ram_ptr;
+ uint64_t val;
+ MemTxResult result = MEMTX_OK;
+ bool release_lock = false;
+ const uint8_t *buf = ptr;
+
+ for (;;) {
+ if (!flatview_access_allowed(mr, attrs, addr1, l)) {
+ result |= MEMTX_ACCESS_ERROR;
+ /* Keep going. */
+ } else if (!memory_access_is_direct(mr, true)) {
+ release_lock |= prepare_mmio_access(mr);
+ l = memory_access_size(mr, l, addr1);
+ /* XXX: could force current_cpu to NULL to avoid
+ potential bugs */
+ val = ldn_he_p(buf, l);
+ result |= memory_region_dispatch_write(mr, addr1, val,
+ size_memop(l), attrs);
+ } else {
+ /* RAM case */
+ ram_ptr = qemu_ram_ptr_length(mr->ram_block, addr1, &l, false);
+ memmove(ram_ptr, buf, l);
+ invalidate_and_set_dirty(mr, addr1, l);
+ }
+
+ if (release_lock) {
+ qemu_mutex_unlock_iothread();
+ release_lock = false;
+ }
+
+ len -= l;
+ buf += l;
+ addr += l;
+
+ if (!len) {
+ break;
+ }
+
+ l = len;
+ mr = flatview_translate(fv, addr, &addr1, &l, true, attrs);
+ }
+
+ return result;
+}
+
+/* Called from RCU critical section. */
+static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
+ const void *buf, hwaddr len)
+{
+ hwaddr l;
+ hwaddr addr1;
+ MemoryRegion *mr;
+
+ l = len;
+ mr = flatview_translate(fv, addr, &addr1, &l, true, attrs);
+ if (!flatview_access_allowed(mr, attrs, addr, len)) {
+ return MEMTX_ACCESS_ERROR;
+ }
+ return flatview_write_continue(fv, addr, attrs, buf, len,
+ addr1, l, mr);
+}
+
+/* Called within RCU critical section. */
+MemTxResult flatview_read_continue(FlatView *fv, hwaddr addr,
+ MemTxAttrs attrs, void *ptr,
+ hwaddr len, hwaddr addr1, hwaddr l,
+ MemoryRegion *mr)
+{
+ uint8_t *ram_ptr;
+ uint64_t val;
+ MemTxResult result = MEMTX_OK;
+ bool release_lock = false;
+ uint8_t *buf = ptr;
+
+ fuzz_dma_read_cb(addr, len, mr);
+ for (;;) {
+ if (!flatview_access_allowed(mr, attrs, addr1, l)) {
+ result |= MEMTX_ACCESS_ERROR;
+ /* Keep going. */
+ } else if (!memory_access_is_direct(mr, false)) {
+ /* I/O case */
+ release_lock |= prepare_mmio_access(mr);
+ l = memory_access_size(mr, l, addr1);
+ result |= memory_region_dispatch_read(mr, addr1, &val,
+ size_memop(l), attrs);
+ stn_he_p(buf, l, val);
+ } else {
+ /* RAM case */
+ ram_ptr = qemu_ram_ptr_length(mr->ram_block, addr1, &l, false);
+ memcpy(buf, ram_ptr, l);
+ }
+
+ if (release_lock) {
+ qemu_mutex_unlock_iothread();
+ release_lock = false;
+ }
+
+ len -= l;
+ buf += l;
+ addr += l;
+
+ if (!len) {
+ break;
+ }
+
+ l = len;
+ mr = flatview_translate(fv, addr, &addr1, &l, false, attrs);
+ }
+
+ return result;
+}
+
+/* Called from RCU critical section. */
+static MemTxResult flatview_read(FlatView *fv, hwaddr addr,
+ MemTxAttrs attrs, void *buf, hwaddr len)
+{
+ hwaddr l;
+ hwaddr addr1;
+ MemoryRegion *mr;
+
+ l = len;
+ mr = flatview_translate(fv, addr, &addr1, &l, false, attrs);
+ if (!flatview_access_allowed(mr, attrs, addr, len)) {
+ return MEMTX_ACCESS_ERROR;
+ }
+ return flatview_read_continue(fv, addr, attrs, buf, len,
+ addr1, l, mr);
+}
+
+MemTxResult address_space_read_full(AddressSpace *as, hwaddr addr,
+ MemTxAttrs attrs, void *buf, hwaddr len)
+{
+ MemTxResult result = MEMTX_OK;
+ FlatView *fv;
+
+ if (len > 0) {
+ RCU_READ_LOCK_GUARD();
+ fv = address_space_to_flatview(as);
+ result = flatview_read(fv, addr, attrs, buf, len);
+ }
+
+ return result;
+}
+
+MemTxResult address_space_write(AddressSpace *as, hwaddr addr,
+ MemTxAttrs attrs,
+ const void *buf, hwaddr len)
+{
+ MemTxResult result = MEMTX_OK;
+ FlatView *fv;
+
+ if (len > 0) {
+ RCU_READ_LOCK_GUARD();
+ fv = address_space_to_flatview(as);
+ result = flatview_write(fv, addr, attrs, buf, len);
+ }
+
+ return result;
+}
+
+MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs,
+ void *buf, hwaddr len, bool is_write)
+{
+ if (is_write) {
+ return address_space_write(as, addr, attrs, buf, len);
+ } else {
+ return address_space_read_full(as, addr, attrs, buf, len);
+ }
+}
+
+MemTxResult address_space_set(AddressSpace *as, hwaddr addr,
+ uint8_t c, hwaddr len, MemTxAttrs attrs)
+{
+#define FILLBUF_SIZE 512
+ uint8_t fillbuf[FILLBUF_SIZE];
+ int l;
+ MemTxResult error = MEMTX_OK;
+
+ memset(fillbuf, c, FILLBUF_SIZE);
+ while (len > 0) {
+ l = len < FILLBUF_SIZE ? len : FILLBUF_SIZE;
+ error |= address_space_write(as, addr, attrs, fillbuf, l);
+ len -= l;
+ addr += l;
+ }
+
+ return error;
+}
+
+void cpu_physical_memory_rw(hwaddr addr, void *buf,
+ hwaddr len, bool is_write)
+{
+ address_space_rw(&address_space_memory, addr, MEMTXATTRS_UNSPECIFIED,
+ buf, len, is_write);
+}
+
+enum write_rom_type {
+ WRITE_DATA,
+ FLUSH_CACHE,
+};
+
+static inline MemTxResult address_space_write_rom_internal(AddressSpace *as,
+ hwaddr addr,
+ MemTxAttrs attrs,
+ const void *ptr,
+ hwaddr len,
+ enum write_rom_type type)
+{
+ hwaddr l;
+ uint8_t *ram_ptr;
+ hwaddr addr1;
+ MemoryRegion *mr;
+ const uint8_t *buf = ptr;
+
+ RCU_READ_LOCK_GUARD();
+ while (len > 0) {
+ l = len;
+ mr = address_space_translate(as, addr, &addr1, &l, true, attrs);
+
+ if (!(memory_region_is_ram(mr) ||
+ memory_region_is_romd(mr))) {
+ l = memory_access_size(mr, l, addr1);
+ } else {
+ /* ROM/RAM case */
+ ram_ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
+ switch (type) {
+ case WRITE_DATA:
+ memcpy(ram_ptr, buf, l);
+ invalidate_and_set_dirty(mr, addr1, l);
+ break;
+ case FLUSH_CACHE:
+ flush_idcache_range((uintptr_t)ram_ptr, (uintptr_t)ram_ptr, l);
+ break;
+ }
+ }
+ len -= l;
+ buf += l;
+ addr += l;
+ }
+ return MEMTX_OK;
+}
+
+/* used for ROM loading : can write in RAM and ROM */
+MemTxResult address_space_write_rom(AddressSpace *as, hwaddr addr,
+ MemTxAttrs attrs,
+ const void *buf, hwaddr len)
+{
+ return address_space_write_rom_internal(as, addr, attrs,
+ buf, len, WRITE_DATA);
+}
+
+void cpu_flush_icache_range(hwaddr start, hwaddr len)
+{
+ /*
+ * This function should do the same thing as an icache flush that was
+ * triggered from within the guest. For TCG we are always cache coherent,
+ * so there is no need to flush anything. For KVM / Xen we need to flush
+ * the host's instruction cache at least.
+ */
+ if (tcg_enabled()) {
+ return;
+ }
+
+ address_space_write_rom_internal(&address_space_memory,
+ start, MEMTXATTRS_UNSPECIFIED,
+ NULL, len, FLUSH_CACHE);
+}
+
+typedef struct {
+ MemoryRegion *mr;
+ void *buffer;
+ hwaddr addr;
+ hwaddr len;
+ bool in_use;
+} BounceBuffer;
+
+static BounceBuffer bounce;
+
+typedef struct MapClient {
+ QEMUBH *bh;
+ QLIST_ENTRY(MapClient) link;
+} MapClient;
+
+QemuMutex map_client_list_lock;
+static QLIST_HEAD(, MapClient) map_client_list
+ = QLIST_HEAD_INITIALIZER(map_client_list);
+
+static void cpu_unregister_map_client_do(MapClient *client)
+{
+ QLIST_REMOVE(client, link);
+ g_free(client);
+}
+
+static void cpu_notify_map_clients_locked(void)
+{
+ MapClient *client;
+
+ while (!QLIST_EMPTY(&map_client_list)) {
+ client = QLIST_FIRST(&map_client_list);
+ qemu_bh_schedule(client->bh);
+ cpu_unregister_map_client_do(client);
+ }
+}
+
+void cpu_register_map_client(QEMUBH *bh)
+{
+ MapClient *client = g_malloc(sizeof(*client));
+
+ qemu_mutex_lock(&map_client_list_lock);
+ client->bh = bh;
+ QLIST_INSERT_HEAD(&map_client_list, client, link);
+ /* Write map_client_list before reading in_use. */
+ smp_mb();
+ if (!qatomic_read(&bounce.in_use)) {
+ cpu_notify_map_clients_locked();
+ }
+ qemu_mutex_unlock(&map_client_list_lock);
+}
+
+void cpu_exec_init_all(void)
+{
+ qemu_mutex_init(&ram_list.mutex);
+ /* The data structures we set up here depend on knowing the page size,
+ * so no more changes can be made after this point.
+ * In an ideal world, nothing we did before we had finished the
+ * machine setup would care about the target page size, and we could
+ * do this much later, rather than requiring board models to state
+ * up front what their requirements are.
+ */
+ finalize_target_page_bits();
+ io_mem_init();
+ memory_map_init();
+ qemu_mutex_init(&map_client_list_lock);
+}
+
+void cpu_unregister_map_client(QEMUBH *bh)
+{
+ MapClient *client;
+
+ qemu_mutex_lock(&map_client_list_lock);
+ QLIST_FOREACH(client, &map_client_list, link) {
+ if (client->bh == bh) {
+ cpu_unregister_map_client_do(client);
+ break;
+ }
+ }
+ qemu_mutex_unlock(&map_client_list_lock);
+}
+
+static void cpu_notify_map_clients(void)
+{
+ qemu_mutex_lock(&map_client_list_lock);
+ cpu_notify_map_clients_locked();
+ qemu_mutex_unlock(&map_client_list_lock);
+}
+
+static bool flatview_access_valid(FlatView *fv, hwaddr addr, hwaddr len,
+ bool is_write, MemTxAttrs attrs)
+{
+ MemoryRegion *mr;
+ hwaddr l, xlat;
+
+ while (len > 0) {
+ l = len;
+ mr = flatview_translate(fv, addr, &xlat, &l, is_write, attrs);
+ if (!memory_access_is_direct(mr, is_write)) {
+ l = memory_access_size(mr, l, addr);
+ if (!memory_region_access_valid(mr, xlat, l, is_write, attrs)) {
+ return false;
+ }
+ }
+
+ len -= l;
+ addr += l;
+ }
+ return true;
+}
+
+bool address_space_access_valid(AddressSpace *as, hwaddr addr,
+ hwaddr len, bool is_write,
+ MemTxAttrs attrs)
+{
+ FlatView *fv;
+
+ RCU_READ_LOCK_GUARD();
+ fv = address_space_to_flatview(as);
+ return flatview_access_valid(fv, addr, len, is_write, attrs);
+}
+
+static hwaddr
+flatview_extend_translation(FlatView *fv, hwaddr addr,
+ hwaddr target_len,
+ MemoryRegion *mr, hwaddr base, hwaddr len,
+ bool is_write, MemTxAttrs attrs)
+{
+ hwaddr done = 0;
+ hwaddr xlat;
+ MemoryRegion *this_mr;
+
+ for (;;) {
+ target_len -= len;
+ addr += len;
+ done += len;
+ if (target_len == 0) {
+ return done;
+ }
+
+ len = target_len;
+ this_mr = flatview_translate(fv, addr, &xlat,
+ &len, is_write, attrs);
+ if (this_mr != mr || xlat != base + done) {
+ return done;
+ }
+ }
+}
+
+/* Map a physical memory region into a host virtual address.
+ * May map a subset of the requested range, given by and returned in *plen.
+ * May return NULL if resources needed to perform the mapping are exhausted.
+ * Use only for reads OR writes - not for read-modify-write operations.
+ * Use cpu_register_map_client() to know when retrying the map operation is
+ * likely to succeed.
+ */
+void *address_space_map(AddressSpace *as,
+ hwaddr addr,
+ hwaddr *plen,
+ bool is_write,
+ MemTxAttrs attrs)
+{
+ hwaddr len = *plen;
+ hwaddr l, xlat;
+ MemoryRegion *mr;
+ FlatView *fv;
+
+ if (len == 0) {
+ return NULL;
+ }
+
+ l = len;
+ RCU_READ_LOCK_GUARD();
+ fv = address_space_to_flatview(as);
+ mr = flatview_translate(fv, addr, &xlat, &l, is_write, attrs);
+
+ if (!memory_access_is_direct(mr, is_write)) {
+ if (qatomic_xchg(&bounce.in_use, true)) {
+ *plen = 0;
+ return NULL;
+ }
+ /* Avoid unbounded allocations */
+ l = MIN(l, TARGET_PAGE_SIZE);
+ bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, l);
+ bounce.addr = addr;
+ bounce.len = l;
+
+ memory_region_ref(mr);
+ bounce.mr = mr;
+ if (!is_write) {
+ flatview_read(fv, addr, MEMTXATTRS_UNSPECIFIED,
+ bounce.buffer, l);
+ }
+
+ *plen = l;
+ return bounce.buffer;
+ }
+
+
+ memory_region_ref(mr);
+ *plen = flatview_extend_translation(fv, addr, len, mr, xlat,
+ l, is_write, attrs);
+ fuzz_dma_read_cb(addr, *plen, mr);
+ return qemu_ram_ptr_length(mr->ram_block, xlat, plen, true);
+}
+
+/* Unmaps a memory region previously mapped by address_space_map().
+ * Will also mark the memory as dirty if is_write is true. access_len gives
+ * the amount of memory that was actually read or written by the caller.
+ */
+void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
+ bool is_write, hwaddr access_len)
+{
+ if (buffer != bounce.buffer) {
+ MemoryRegion *mr;
+ ram_addr_t addr1;
+
+ mr = memory_region_from_host(buffer, &addr1);
+ assert(mr != NULL);
+ if (is_write) {
+ invalidate_and_set_dirty(mr, addr1, access_len);
+ }
+ if (xen_enabled()) {
+ xen_invalidate_map_cache_entry(buffer);
+ }
+ memory_region_unref(mr);
+ return;
+ }
+ if (is_write) {
+ address_space_write(as, bounce.addr, MEMTXATTRS_UNSPECIFIED,
+ bounce.buffer, access_len);
+ }
+ qemu_vfree(bounce.buffer);
+ bounce.buffer = NULL;
+ memory_region_unref(bounce.mr);
+ /* Clear in_use before reading map_client_list. */
+ qatomic_set_mb(&bounce.in_use, false);
+ cpu_notify_map_clients();
+}
+
+void *cpu_physical_memory_map(hwaddr addr,
+ hwaddr *plen,
+ bool is_write)
+{
+ return address_space_map(&address_space_memory, addr, plen, is_write,
+ MEMTXATTRS_UNSPECIFIED);
+}
+
+void cpu_physical_memory_unmap(void *buffer, hwaddr len,
+ bool is_write, hwaddr access_len)
+{
+ return address_space_unmap(&address_space_memory, buffer, len, is_write, access_len);
+}
+
+#define ARG1_DECL AddressSpace *as
+#define ARG1 as
+#define SUFFIX
+#define TRANSLATE(...) address_space_translate(as, __VA_ARGS__)
+#define RCU_READ_LOCK(...) rcu_read_lock()
+#define RCU_READ_UNLOCK(...) rcu_read_unlock()
+#include "memory_ldst.c.inc"
+
+int64_t address_space_cache_init(MemoryRegionCache *cache,
+ AddressSpace *as,
+ hwaddr addr,
+ hwaddr len,
+ bool is_write)
+{
+ AddressSpaceDispatch *d;
+ hwaddr l;
+ MemoryRegion *mr;
+ Int128 diff;
+
+ assert(len > 0);
+
+ l = len;
+ cache->fv = address_space_get_flatview(as);
+ d = flatview_to_dispatch(cache->fv);
+ cache->mrs = *address_space_translate_internal(d, addr, &cache->xlat, &l, true);
+
+ /*
+ * cache->xlat is now relative to cache->mrs.mr, not to the section itself.
+ * Take that into account to compute how many bytes are there between
+ * cache->xlat and the end of the section.
+ */
+ diff = int128_sub(cache->mrs.size,
+ int128_make64(cache->xlat - cache->mrs.offset_within_region));
+ l = int128_get64(int128_min(diff, int128_make64(l)));
+
+ mr = cache->mrs.mr;
+ memory_region_ref(mr);
+ if (memory_access_is_direct(mr, is_write)) {
+ /* We don't care about the memory attributes here as we're only
+ * doing this if we found actual RAM, which behaves the same
+ * regardless of attributes; so UNSPECIFIED is fine.
+ */
+ l = flatview_extend_translation(cache->fv, addr, len, mr,
+ cache->xlat, l, is_write,
+ MEMTXATTRS_UNSPECIFIED);
+ cache->ptr = qemu_ram_ptr_length(mr->ram_block, cache->xlat, &l, true);
+ } else {
+ cache->ptr = NULL;
+ }
+
+ cache->len = l;
+ cache->is_write = is_write;
+ return l;
+}
+
+void address_space_cache_invalidate(MemoryRegionCache *cache,
+ hwaddr addr,
+ hwaddr access_len)
+{
+ assert(cache->is_write);
+ if (likely(cache->ptr)) {
+ invalidate_and_set_dirty(cache->mrs.mr, addr + cache->xlat, access_len);
+ }
+}
+
+void address_space_cache_destroy(MemoryRegionCache *cache)
+{
+ if (!cache->mrs.mr) {
+ return;
+ }
+
+ if (xen_enabled()) {
+ xen_invalidate_map_cache_entry(cache->ptr);
+ }
+ memory_region_unref(cache->mrs.mr);
+ flatview_unref(cache->fv);
+ cache->mrs.mr = NULL;
+ cache->fv = NULL;
+}
+
+/* Called from RCU critical section. This function has the same
+ * semantics as address_space_translate, but it only works on a
+ * predefined range of a MemoryRegion that was mapped with
+ * address_space_cache_init.
+ */
+static inline MemoryRegion *address_space_translate_cached(
+ MemoryRegionCache *cache, hwaddr addr, hwaddr *xlat,
+ hwaddr *plen, bool is_write, MemTxAttrs attrs)
+{
+ MemoryRegionSection section;
+ MemoryRegion *mr;
+ IOMMUMemoryRegion *iommu_mr;
+ AddressSpace *target_as;
+
+ assert(!cache->ptr);
+ *xlat = addr + cache->xlat;
+
+ mr = cache->mrs.mr;
+ iommu_mr = memory_region_get_iommu(mr);
+ if (!iommu_mr) {
+ /* MMIO region. */
+ return mr;
+ }
+
+ section = address_space_translate_iommu(iommu_mr, xlat, plen,
+ NULL, is_write, true,
+ &target_as, attrs);
+ return section.mr;
+}
+
+/* Called from RCU critical section. address_space_read_cached uses this
+ * out of line function when the target is an MMIO or IOMMU region.
+ */
+MemTxResult
+address_space_read_cached_slow(MemoryRegionCache *cache, hwaddr addr,
+ void *buf, hwaddr len)
+{
+ hwaddr addr1, l;
+ MemoryRegion *mr;
+
+ l = len;
+ mr = address_space_translate_cached(cache, addr, &addr1, &l, false,
+ MEMTXATTRS_UNSPECIFIED);
+ return flatview_read_continue(cache->fv,
+ addr, MEMTXATTRS_UNSPECIFIED, buf, len,
+ addr1, l, mr);
+}
+
+/* Called from RCU critical section. address_space_write_cached uses this
+ * out of line function when the target is an MMIO or IOMMU region.
+ */
+MemTxResult
+address_space_write_cached_slow(MemoryRegionCache *cache, hwaddr addr,
+ const void *buf, hwaddr len)
+{
+ hwaddr addr1, l;
+ MemoryRegion *mr;
+
+ l = len;
+ mr = address_space_translate_cached(cache, addr, &addr1, &l, true,
+ MEMTXATTRS_UNSPECIFIED);
+ return flatview_write_continue(cache->fv,
+ addr, MEMTXATTRS_UNSPECIFIED, buf, len,
+ addr1, l, mr);
+}
+
+#define ARG1_DECL MemoryRegionCache *cache
+#define ARG1 cache
+#define SUFFIX _cached_slow
+#define TRANSLATE(...) address_space_translate_cached(cache, __VA_ARGS__)
+#define RCU_READ_LOCK() ((void)0)
+#define RCU_READ_UNLOCK() ((void)0)
+#include "memory_ldst.c.inc"
+
+/* virtual memory access for debug (includes writing to ROM) */
+int cpu_memory_rw_debug(CPUState *cpu, vaddr addr,
+ void *ptr, size_t len, bool is_write)
+{
+ hwaddr phys_addr;
+ vaddr l, page;
+ uint8_t *buf = ptr;
+
+ cpu_synchronize_state(cpu);
+ while (len > 0) {
+ int asidx;
+ MemTxAttrs attrs;
+ MemTxResult res;
+
+ page = addr & TARGET_PAGE_MASK;
+ phys_addr = cpu_get_phys_page_attrs_debug(cpu, page, &attrs);
+ asidx = cpu_asidx_from_attrs(cpu, attrs);
+ /* if no physical page mapped, return an error */
+ if (phys_addr == -1)
+ return -1;
+ l = (page + TARGET_PAGE_SIZE) - addr;
+ if (l > len)
+ l = len;
+ phys_addr += (addr & ~TARGET_PAGE_MASK);
+ if (is_write) {
+ res = address_space_write_rom(cpu->cpu_ases[asidx].as, phys_addr,
+ attrs, buf, l);
+ } else {
+ res = address_space_read(cpu->cpu_ases[asidx].as, phys_addr,
+ attrs, buf, l);
+ }
+ if (res != MEMTX_OK) {
+ return -1;
+ }
+ len -= l;
+ buf += l;
+ addr += l;
+ }
+ return 0;
+}
+
+/*
+ * Allows code that needs to deal with migration bitmaps etc to still be built
+ * target independent.
+ */
+size_t qemu_target_page_size(void)
+{
+ return TARGET_PAGE_SIZE;
+}
+
+int qemu_target_page_mask(void)
+{
+ return TARGET_PAGE_MASK;
+}
+
+int qemu_target_page_bits(void)
+{
+ return TARGET_PAGE_BITS;
+}
+
+int qemu_target_page_bits_min(void)
+{
+ return TARGET_PAGE_BITS_MIN;
+}
+
+/* Convert target pages to MiB (2**20). */
+size_t qemu_target_pages_to_MiB(size_t pages)
+{
+ int page_bits = TARGET_PAGE_BITS;
+
+ /* So far, the largest (non-huge) page size is 64k, i.e. 16 bits. */
+ g_assert(page_bits < 20);
+
+ return pages >> (20 - page_bits);
+}
+
+bool cpu_physical_memory_is_io(hwaddr phys_addr)
+{
+ MemoryRegion*mr;
+ hwaddr l = 1;
+
+ RCU_READ_LOCK_GUARD();
+ mr = address_space_translate(&address_space_memory,
+ phys_addr, &phys_addr, &l, false,
+ MEMTXATTRS_UNSPECIFIED);
+
+ return !(memory_region_is_ram(mr) || memory_region_is_romd(mr));
+}
+
+int qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque)
+{
+ RAMBlock *block;
+ int ret = 0;
+
+ RCU_READ_LOCK_GUARD();
+ RAMBLOCK_FOREACH(block) {
+ ret = func(block, opaque);
+ if (ret) {
+ break;
+ }
+ }
+ return ret;
+}
+
+/*
+ * Unmap pages of memory from start to start+length such that
+ * they a) read as 0, b) Trigger whatever fault mechanism
+ * the OS provides for postcopy.
+ * The pages must be unmapped by the end of the function.
+ * Returns: 0 on success, none-0 on failure
+ *
+ */
+int ram_block_discard_range(RAMBlock *rb, uint64_t start, size_t length)
+{
+ int ret = -1;
+
+ uint8_t *host_startaddr = rb->host + start;
+
+ if (!QEMU_PTR_IS_ALIGNED(host_startaddr, rb->page_size)) {
+ error_report("ram_block_discard_range: Unaligned start address: %p",
+ host_startaddr);
+ goto err;
+ }
+
+ if ((start + length) <= rb->max_length) {
+ bool need_madvise, need_fallocate;
+ if (!QEMU_IS_ALIGNED(length, rb->page_size)) {
+ error_report("ram_block_discard_range: Unaligned length: %zx",
+ length);
+ goto err;
+ }
+
+ errno = ENOTSUP; /* If we are missing MADVISE etc */
+
+ /* The logic here is messy;
+ * madvise DONTNEED fails for hugepages
+ * fallocate works on hugepages and shmem
+ * shared anonymous memory requires madvise REMOVE
+ */
+ need_madvise = (rb->page_size == qemu_host_page_size);
+ need_fallocate = rb->fd != -1;
+ if (need_fallocate) {
+ /* For a file, this causes the area of the file to be zero'd
+ * if read, and for hugetlbfs also causes it to be unmapped
+ * so a userfault will trigger.
+ */
+#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
+ /*
+ * fallocate() will fail with readonly files. Let's print a
+ * proper error message.
+ */
+ if (rb->flags & RAM_READONLY_FD) {
+ error_report("ram_block_discard_range: Discarding RAM"
+ " with readonly files is not supported");
+ goto err;
+
+ }
+ /*
+ * We'll discard data from the actual file, even though we only
+ * have a MAP_PRIVATE mapping, possibly messing with other
+ * MAP_PRIVATE/MAP_SHARED mappings. There is no easy way to
+ * change that behavior whithout violating the promised
+ * semantics of ram_block_discard_range().
+ *
+ * Only warn, because it works as long as nobody else uses that
+ * file.
+ */
+ if (!qemu_ram_is_shared(rb)) {
+ warn_report_once("ram_block_discard_range: Discarding RAM"
+ " in private file mappings is possibly"
+ " dangerous, because it will modify the"
+ " underlying file and will affect other"
+ " users of the file");
+ }
+
+ ret = fallocate(rb->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+ start, length);
+ if (ret) {
+ ret = -errno;
+ error_report("ram_block_discard_range: Failed to fallocate "
+ "%s:%" PRIx64 " +%zx (%d)",
+ rb->idstr, start, length, ret);
+ goto err;
+ }
+#else
+ ret = -ENOSYS;
+ error_report("ram_block_discard_range: fallocate not available/file"
+ "%s:%" PRIx64 " +%zx (%d)",
+ rb->idstr, start, length, ret);
+ goto err;
+#endif
+ }
+ if (need_madvise) {
+ /* For normal RAM this causes it to be unmapped,
+ * for shared memory it causes the local mapping to disappear
+ * and to fall back on the file contents (which we just
+ * fallocate'd away).
+ */
+#if defined(CONFIG_MADVISE)
+ if (qemu_ram_is_shared(rb) && rb->fd < 0) {
+ ret = madvise(host_startaddr, length, QEMU_MADV_REMOVE);
+ } else {
+ ret = madvise(host_startaddr, length, QEMU_MADV_DONTNEED);
+ }
+ if (ret) {
+ ret = -errno;
+ error_report("ram_block_discard_range: Failed to discard range "
+ "%s:%" PRIx64 " +%zx (%d)",
+ rb->idstr, start, length, ret);
+ goto err;
+ }
+#else
+ ret = -ENOSYS;
+ error_report("ram_block_discard_range: MADVISE not available"
+ "%s:%" PRIx64 " +%zx (%d)",
+ rb->idstr, start, length, ret);
+ goto err;
+#endif
+ }
+ trace_ram_block_discard_range(rb->idstr, host_startaddr, length,
+ need_madvise, need_fallocate, ret);
+ } else {
+ error_report("ram_block_discard_range: Overrun block '%s' (%" PRIu64
+ "/%zx/" RAM_ADDR_FMT")",
+ rb->idstr, start, length, rb->max_length);
+ }
+
+err:
+ return ret;
+}
+
+bool ramblock_is_pmem(RAMBlock *rb)
+{
+ return rb->flags & RAM_PMEM;
+}
+
+static void mtree_print_phys_entries(int start, int end, int skip, int ptr)
+{
+ if (start == end - 1) {
+ qemu_printf("\t%3d ", start);
+ } else {
+ qemu_printf("\t%3d..%-3d ", start, end - 1);
+ }
+ qemu_printf(" skip=%d ", skip);
+ if (ptr == PHYS_MAP_NODE_NIL) {
+ qemu_printf(" ptr=NIL");
+ } else if (!skip) {
+ qemu_printf(" ptr=#%d", ptr);
+ } else {
+ qemu_printf(" ptr=[%d]", ptr);
+ }
+ qemu_printf("\n");
+}
+
+#define MR_SIZE(size) (int128_nz(size) ? (hwaddr)int128_get64( \
+ int128_sub((size), int128_one())) : 0)
+
+void mtree_print_dispatch(AddressSpaceDispatch *d, MemoryRegion *root)
+{
+ int i;
+
+ qemu_printf(" Dispatch\n");
+ qemu_printf(" Physical sections\n");
+
+ for (i = 0; i < d->map.sections_nb; ++i) {
+ MemoryRegionSection *s = d->map.sections + i;
+ const char *names[] = { " [unassigned]", " [not dirty]",
+ " [ROM]", " [watch]" };
+
+ qemu_printf(" #%d @" HWADDR_FMT_plx ".." HWADDR_FMT_plx
+ " %s%s%s%s%s",
+ i,
+ s->offset_within_address_space,
+ s->offset_within_address_space + MR_SIZE(s->size),
+ s->mr->name ? s->mr->name : "(noname)",
+ i < ARRAY_SIZE(names) ? names[i] : "",
+ s->mr == root ? " [ROOT]" : "",
+ s == d->mru_section ? " [MRU]" : "",
+ s->mr->is_iommu ? " [iommu]" : "");
+
+ if (s->mr->alias) {
+ qemu_printf(" alias=%s", s->mr->alias->name ?
+ s->mr->alias->name : "noname");
+ }
+ qemu_printf("\n");
+ }
+
+ qemu_printf(" Nodes (%d bits per level, %d levels) ptr=[%d] skip=%d\n",
+ P_L2_BITS, P_L2_LEVELS, d->phys_map.ptr, d->phys_map.skip);
+ for (i = 0; i < d->map.nodes_nb; ++i) {
+ int j, jprev;
+ PhysPageEntry prev;
+ Node *n = d->map.nodes + i;
+
+ qemu_printf(" [%d]\n", i);
+
+ for (j = 0, jprev = 0, prev = *n[0]; j < ARRAY_SIZE(*n); ++j) {
+ PhysPageEntry *pe = *n + j;
+
+ if (pe->ptr == prev.ptr && pe->skip == prev.skip) {
+ continue;
+ }
+
+ mtree_print_phys_entries(jprev, j, prev.skip, prev.ptr);
+
+ jprev = j;
+ prev = *pe;
+ }
+
+ if (jprev != ARRAY_SIZE(*n)) {
+ mtree_print_phys_entries(jprev, j, prev.skip, prev.ptr);
+ }
+ }
+}
+
+/* Require any discards to work. */
+static unsigned int ram_block_discard_required_cnt;
+/* Require only coordinated discards to work. */
+static unsigned int ram_block_coordinated_discard_required_cnt;
+/* Disable any discards. */
+static unsigned int ram_block_discard_disabled_cnt;
+/* Disable only uncoordinated discards. */
+static unsigned int ram_block_uncoordinated_discard_disabled_cnt;
+static QemuMutex ram_block_discard_disable_mutex;
+
+static void ram_block_discard_disable_mutex_lock(void)
+{
+ static gsize initialized;
+
+ if (g_once_init_enter(&initialized)) {
+ qemu_mutex_init(&ram_block_discard_disable_mutex);
+ g_once_init_leave(&initialized, 1);
+ }
+ qemu_mutex_lock(&ram_block_discard_disable_mutex);
+}
+
+static void ram_block_discard_disable_mutex_unlock(void)
+{
+ qemu_mutex_unlock(&ram_block_discard_disable_mutex);
+}
+
+int ram_block_discard_disable(bool state)
+{
+ int ret = 0;
+
+ ram_block_discard_disable_mutex_lock();
+ if (!state) {
+ ram_block_discard_disabled_cnt--;
+ } else if (ram_block_discard_required_cnt ||
+ ram_block_coordinated_discard_required_cnt) {
+ ret = -EBUSY;
+ } else {
+ ram_block_discard_disabled_cnt++;
+ }
+ ram_block_discard_disable_mutex_unlock();
+ return ret;
+}
+
+int ram_block_uncoordinated_discard_disable(bool state)
+{
+ int ret = 0;
+
+ ram_block_discard_disable_mutex_lock();
+ if (!state) {
+ ram_block_uncoordinated_discard_disabled_cnt--;
+ } else if (ram_block_discard_required_cnt) {
+ ret = -EBUSY;
+ } else {
+ ram_block_uncoordinated_discard_disabled_cnt++;
+ }
+ ram_block_discard_disable_mutex_unlock();
+ return ret;
+}
+
+int ram_block_discard_require(bool state)
+{
+ int ret = 0;
+
+ ram_block_discard_disable_mutex_lock();
+ if (!state) {
+ ram_block_discard_required_cnt--;
+ } else if (ram_block_discard_disabled_cnt ||
+ ram_block_uncoordinated_discard_disabled_cnt) {
+ ret = -EBUSY;
+ } else {
+ ram_block_discard_required_cnt++;
+ }
+ ram_block_discard_disable_mutex_unlock();
+ return ret;
+}
+
+int ram_block_coordinated_discard_require(bool state)
+{
+ int ret = 0;
+
+ ram_block_discard_disable_mutex_lock();
+ if (!state) {
+ ram_block_coordinated_discard_required_cnt--;
+ } else if (ram_block_discard_disabled_cnt) {
+ ret = -EBUSY;
+ } else {
+ ram_block_coordinated_discard_required_cnt++;
+ }
+ ram_block_discard_disable_mutex_unlock();
+ return ret;
+}
+
+bool ram_block_discard_is_disabled(void)
+{
+ return qatomic_read(&ram_block_discard_disabled_cnt) ||
+ qatomic_read(&ram_block_uncoordinated_discard_disabled_cnt);
+}
+
+bool ram_block_discard_is_required(void)
+{
+ return qatomic_read(&ram_block_discard_required_cnt) ||
+ qatomic_read(&ram_block_coordinated_discard_required_cnt);
+}
diff --git a/system/qdev-monitor.c b/system/qdev-monitor.c
new file mode 100644
index 0000000000..74f4e41338
--- /dev/null
+++ b/system/qdev-monitor.c
@@ -0,0 +1,1148 @@
+/*
+ * Dynamic device configuration and creation.
+ *
+ * Copyright (c) 2009 CodeSourcery
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "hw/sysbus.h"
+#include "monitor/hmp.h"
+#include "monitor/monitor.h"
+#include "monitor/qdev.h"
+#include "sysemu/arch_init.h"
+#include "qapi/error.h"
+#include "qapi/qapi-commands-qdev.h"
+#include "qapi/qmp/dispatch.h"
+#include "qapi/qmp/qdict.h"
+#include "qapi/qmp/qerror.h"
+#include "qapi/qmp/qstring.h"
+#include "qapi/qobject-input-visitor.h"
+#include "qemu/config-file.h"
+#include "qemu/error-report.h"
+#include "qemu/help_option.h"
+#include "qemu/option.h"
+#include "qemu/qemu-print.h"
+#include "qemu/option_int.h"
+#include "sysemu/block-backend.h"
+#include "migration/misc.h"
+#include "migration/migration.h"
+#include "qemu/cutils.h"
+#include "hw/qdev-properties.h"
+#include "hw/clock.h"
+#include "hw/boards.h"
+
+/*
+ * Aliases were a bad idea from the start. Let's keep them
+ * from spreading further.
+ */
+typedef struct QDevAlias
+{
+ const char *typename;
+ const char *alias;
+ uint32_t arch_mask;
+} QDevAlias;
+
+/* default virtio transport per architecture */
+#define QEMU_ARCH_VIRTIO_PCI (QEMU_ARCH_ALPHA | QEMU_ARCH_ARM | \
+ QEMU_ARCH_HPPA | QEMU_ARCH_I386 | \
+ QEMU_ARCH_MIPS | QEMU_ARCH_PPC | \
+ QEMU_ARCH_RISCV | QEMU_ARCH_SH4 | \
+ QEMU_ARCH_SPARC | QEMU_ARCH_XTENSA | \
+ QEMU_ARCH_LOONGARCH)
+#define QEMU_ARCH_VIRTIO_CCW (QEMU_ARCH_S390X)
+#define QEMU_ARCH_VIRTIO_MMIO (QEMU_ARCH_M68K)
+
+/* Please keep this table sorted by typename. */
+static const QDevAlias qdev_alias_table[] = {
+ { "AC97", "ac97" }, /* -soundhw name */
+ { "e1000", "e1000-82540em" },
+ { "ES1370", "es1370" }, /* -soundhw name */
+ { "ich9-ahci", "ahci" },
+ { "lsi53c895a", "lsi" },
+ { "virtio-9p-device", "virtio-9p", QEMU_ARCH_VIRTIO_MMIO },
+ { "virtio-9p-ccw", "virtio-9p", QEMU_ARCH_VIRTIO_CCW },
+ { "virtio-9p-pci", "virtio-9p", QEMU_ARCH_VIRTIO_PCI },
+ { "virtio-balloon-device", "virtio-balloon", QEMU_ARCH_VIRTIO_MMIO },
+ { "virtio-balloon-ccw", "virtio-balloon", QEMU_ARCH_VIRTIO_CCW },
+ { "virtio-balloon-pci", "virtio-balloon", QEMU_ARCH_VIRTIO_PCI },
+ { "virtio-blk-device", "virtio-blk", QEMU_ARCH_VIRTIO_MMIO },
+ { "virtio-blk-ccw", "virtio-blk", QEMU_ARCH_VIRTIO_CCW },
+ { "virtio-blk-pci", "virtio-blk", QEMU_ARCH_VIRTIO_PCI },
+ { "virtio-gpu-device", "virtio-gpu", QEMU_ARCH_VIRTIO_MMIO },
+ { "virtio-gpu-ccw", "virtio-gpu", QEMU_ARCH_VIRTIO_CCW },
+ { "virtio-gpu-pci", "virtio-gpu", QEMU_ARCH_VIRTIO_PCI },
+ { "virtio-gpu-gl-device", "virtio-gpu-gl", QEMU_ARCH_VIRTIO_MMIO },
+ { "virtio-gpu-gl-pci", "virtio-gpu-gl", QEMU_ARCH_VIRTIO_PCI },
+ { "virtio-input-host-device", "virtio-input-host", QEMU_ARCH_VIRTIO_MMIO },
+ { "virtio-input-host-ccw", "virtio-input-host", QEMU_ARCH_VIRTIO_CCW },
+ { "virtio-input-host-pci", "virtio-input-host", QEMU_ARCH_VIRTIO_PCI },
+ { "virtio-iommu-pci", "virtio-iommu", QEMU_ARCH_VIRTIO_PCI },
+ { "virtio-keyboard-device", "virtio-keyboard", QEMU_ARCH_VIRTIO_MMIO },
+ { "virtio-keyboard-ccw", "virtio-keyboard", QEMU_ARCH_VIRTIO_CCW },
+ { "virtio-keyboard-pci", "virtio-keyboard", QEMU_ARCH_VIRTIO_PCI },
+ { "virtio-mouse-device", "virtio-mouse", QEMU_ARCH_VIRTIO_MMIO },
+ { "virtio-mouse-ccw", "virtio-mouse", QEMU_ARCH_VIRTIO_CCW },
+ { "virtio-mouse-pci", "virtio-mouse", QEMU_ARCH_VIRTIO_PCI },
+ { "virtio-net-device", "virtio-net", QEMU_ARCH_VIRTIO_MMIO },
+ { "virtio-net-ccw", "virtio-net", QEMU_ARCH_VIRTIO_CCW },
+ { "virtio-net-pci", "virtio-net", QEMU_ARCH_VIRTIO_PCI },
+ { "virtio-rng-device", "virtio-rng", QEMU_ARCH_VIRTIO_MMIO },
+ { "virtio-rng-ccw", "virtio-rng", QEMU_ARCH_VIRTIO_CCW },
+ { "virtio-rng-pci", "virtio-rng", QEMU_ARCH_VIRTIO_PCI },
+ { "virtio-scsi-device", "virtio-scsi", QEMU_ARCH_VIRTIO_MMIO },
+ { "virtio-scsi-ccw", "virtio-scsi", QEMU_ARCH_VIRTIO_CCW },
+ { "virtio-scsi-pci", "virtio-scsi", QEMU_ARCH_VIRTIO_PCI },
+ { "virtio-serial-device", "virtio-serial", QEMU_ARCH_VIRTIO_MMIO },
+ { "virtio-serial-ccw", "virtio-serial", QEMU_ARCH_VIRTIO_CCW },
+ { "virtio-serial-pci", "virtio-serial", QEMU_ARCH_VIRTIO_PCI},
+ { "virtio-tablet-device", "virtio-tablet", QEMU_ARCH_VIRTIO_MMIO },
+ { "virtio-tablet-ccw", "virtio-tablet", QEMU_ARCH_VIRTIO_CCW },
+ { "virtio-tablet-pci", "virtio-tablet", QEMU_ARCH_VIRTIO_PCI },
+ { }
+};
+
+static const char *qdev_class_get_alias(DeviceClass *dc)
+{
+ const char *typename = object_class_get_name(OBJECT_CLASS(dc));
+ int i;
+
+ for (i = 0; qdev_alias_table[i].typename; i++) {
+ if (qdev_alias_table[i].arch_mask &&
+ !(qdev_alias_table[i].arch_mask & arch_type)) {
+ continue;
+ }
+
+ if (strcmp(qdev_alias_table[i].typename, typename) == 0) {
+ return qdev_alias_table[i].alias;
+ }
+ }
+
+ return NULL;
+}
+
+static bool qdev_class_has_alias(DeviceClass *dc)
+{
+ return (qdev_class_get_alias(dc) != NULL);
+}
+
+static void qdev_print_devinfo(DeviceClass *dc)
+{
+ qemu_printf("name \"%s\"", object_class_get_name(OBJECT_CLASS(dc)));
+ if (dc->bus_type) {
+ qemu_printf(", bus %s", dc->bus_type);
+ }
+ if (qdev_class_has_alias(dc)) {
+ qemu_printf(", alias \"%s\"", qdev_class_get_alias(dc));
+ }
+ if (dc->desc) {
+ qemu_printf(", desc \"%s\"", dc->desc);
+ }
+ if (!dc->user_creatable) {
+ qemu_printf(", no-user");
+ }
+ qemu_printf("\n");
+}
+
+static void qdev_print_devinfos(bool show_no_user)
+{
+ static const char *cat_name[DEVICE_CATEGORY_MAX + 1] = {
+ [DEVICE_CATEGORY_BRIDGE] = "Controller/Bridge/Hub",
+ [DEVICE_CATEGORY_USB] = "USB",
+ [DEVICE_CATEGORY_STORAGE] = "Storage",
+ [DEVICE_CATEGORY_NETWORK] = "Network",
+ [DEVICE_CATEGORY_INPUT] = "Input",
+ [DEVICE_CATEGORY_DISPLAY] = "Display",
+ [DEVICE_CATEGORY_SOUND] = "Sound",
+ [DEVICE_CATEGORY_MISC] = "Misc",
+ [DEVICE_CATEGORY_CPU] = "CPU",
+ [DEVICE_CATEGORY_WATCHDOG]= "Watchdog",
+ [DEVICE_CATEGORY_MAX] = "Uncategorized",
+ };
+ GSList *list, *elt;
+ int i;
+ bool cat_printed;
+
+ module_load_qom_all();
+ list = object_class_get_list_sorted(TYPE_DEVICE, false);
+
+ for (i = 0; i <= DEVICE_CATEGORY_MAX; i++) {
+ cat_printed = false;
+ for (elt = list; elt; elt = elt->next) {
+ DeviceClass *dc = OBJECT_CLASS_CHECK(DeviceClass, elt->data,
+ TYPE_DEVICE);
+ if ((i < DEVICE_CATEGORY_MAX
+ ? !test_bit(i, dc->categories)
+ : !bitmap_empty(dc->categories, DEVICE_CATEGORY_MAX))
+ || (!show_no_user
+ && !dc->user_creatable)) {
+ continue;
+ }
+ if (!cat_printed) {
+ qemu_printf("%s%s devices:\n", i ? "\n" : "", cat_name[i]);
+ cat_printed = true;
+ }
+ qdev_print_devinfo(dc);
+ }
+ }
+
+ g_slist_free(list);
+}
+
+static const char *find_typename_by_alias(const char *alias)
+{
+ int i;
+
+ for (i = 0; qdev_alias_table[i].alias; i++) {
+ if (qdev_alias_table[i].arch_mask &&
+ !(qdev_alias_table[i].arch_mask & arch_type)) {
+ continue;
+ }
+
+ if (strcmp(qdev_alias_table[i].alias, alias) == 0) {
+ return qdev_alias_table[i].typename;
+ }
+ }
+
+ return NULL;
+}
+
+static DeviceClass *qdev_get_device_class(const char **driver, Error **errp)
+{
+ ObjectClass *oc;
+ DeviceClass *dc;
+ const char *original_name = *driver;
+
+ oc = module_object_class_by_name(*driver);
+ if (!oc) {
+ const char *typename = find_typename_by_alias(*driver);
+
+ if (typename) {
+ *driver = typename;
+ oc = module_object_class_by_name(*driver);
+ }
+ }
+
+ if (!object_class_dynamic_cast(oc, TYPE_DEVICE)) {
+ if (*driver != original_name) {
+ error_setg(errp, "'%s' (alias '%s') is not a valid device model"
+ " name", original_name, *driver);
+ } else {
+ error_setg(errp, "'%s' is not a valid device model name", *driver);
+ }
+ return NULL;
+ }
+
+ if (object_class_is_abstract(oc)) {
+ error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "driver",
+ "a non-abstract device type");
+ return NULL;
+ }
+
+ dc = DEVICE_CLASS(oc);
+ if (!dc->user_creatable ||
+ (phase_check(PHASE_MACHINE_READY) && !dc->hotpluggable)) {
+ error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "driver",
+ "a pluggable device type");
+ return NULL;
+ }
+
+ if (object_class_dynamic_cast(oc, TYPE_SYS_BUS_DEVICE)) {
+ /* sysbus devices need to be allowed by the machine */
+ MachineClass *mc = MACHINE_CLASS(object_get_class(qdev_get_machine()));
+ if (!device_type_is_dynamic_sysbus(mc, *driver)) {
+ error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "driver",
+ "a dynamic sysbus device type for the machine");
+ return NULL;
+ }
+ }
+
+ return dc;
+}
+
+
+int qdev_device_help(QemuOpts *opts)
+{
+ Error *local_err = NULL;
+ const char *driver;
+ ObjectPropertyInfoList *prop_list;
+ ObjectPropertyInfoList *prop;
+ GPtrArray *array;
+ int i;
+
+ driver = qemu_opt_get(opts, "driver");
+ if (driver && is_help_option(driver)) {
+ qdev_print_devinfos(false);
+ return 1;
+ }
+
+ if (!driver || !qemu_opt_has_help_opt(opts)) {
+ return 0;
+ }
+
+ if (!object_class_by_name(driver)) {
+ const char *typename = find_typename_by_alias(driver);
+
+ if (typename) {
+ driver = typename;
+ }
+ }
+
+ prop_list = qmp_device_list_properties(driver, &local_err);
+ if (local_err) {
+ goto error;
+ }
+
+ if (prop_list) {
+ qemu_printf("%s options:\n", driver);
+ } else {
+ qemu_printf("There are no options for %s.\n", driver);
+ }
+ array = g_ptr_array_new();
+ for (prop = prop_list; prop; prop = prop->next) {
+ g_ptr_array_add(array,
+ object_property_help(prop->value->name,
+ prop->value->type,
+ prop->value->default_value,
+ prop->value->description));
+ }
+ g_ptr_array_sort(array, (GCompareFunc)qemu_pstrcmp0);
+ for (i = 0; i < array->len; i++) {
+ qemu_printf("%s\n", (char *)array->pdata[i]);
+ }
+ g_ptr_array_set_free_func(array, g_free);
+ g_ptr_array_free(array, true);
+ qapi_free_ObjectPropertyInfoList(prop_list);
+ return 1;
+
+error:
+ error_report_err(local_err);
+ return 1;
+}
+
+static Object *qdev_get_peripheral(void)
+{
+ static Object *dev;
+
+ if (dev == NULL) {
+ dev = container_get(qdev_get_machine(), "/peripheral");
+ }
+
+ return dev;
+}
+
+static Object *qdev_get_peripheral_anon(void)
+{
+ static Object *dev;
+
+ if (dev == NULL) {
+ dev = container_get(qdev_get_machine(), "/peripheral-anon");
+ }
+
+ return dev;
+}
+
+static void qbus_error_append_bus_list_hint(DeviceState *dev,
+ Error *const *errp)
+{
+ BusState *child;
+ const char *sep = " ";
+
+ error_append_hint(errp, "child buses at \"%s\":",
+ dev->id ? dev->id : object_get_typename(OBJECT(dev)));
+ QLIST_FOREACH(child, &dev->child_bus, sibling) {
+ error_append_hint(errp, "%s\"%s\"", sep, child->name);
+ sep = ", ";
+ }
+ error_append_hint(errp, "\n");
+}
+
+static void qbus_error_append_dev_list_hint(BusState *bus,
+ Error *const *errp)
+{
+ BusChild *kid;
+ const char *sep = " ";
+
+ error_append_hint(errp, "devices at \"%s\":", bus->name);
+ QTAILQ_FOREACH(kid, &bus->children, sibling) {
+ DeviceState *dev = kid->child;
+ error_append_hint(errp, "%s\"%s\"", sep,
+ object_get_typename(OBJECT(dev)));
+ if (dev->id) {
+ error_append_hint(errp, "/\"%s\"", dev->id);
+ }
+ sep = ", ";
+ }
+ error_append_hint(errp, "\n");
+}
+
+static BusState *qbus_find_bus(DeviceState *dev, char *elem)
+{
+ BusState *child;
+
+ QLIST_FOREACH(child, &dev->child_bus, sibling) {
+ if (strcmp(child->name, elem) == 0) {
+ return child;
+ }
+ }
+ return NULL;
+}
+
+static DeviceState *qbus_find_dev(BusState *bus, char *elem)
+{
+ BusChild *kid;
+
+ /*
+ * try to match in order:
+ * (1) instance id, if present
+ * (2) driver name
+ * (3) driver alias, if present
+ */
+ QTAILQ_FOREACH(kid, &bus->children, sibling) {
+ DeviceState *dev = kid->child;
+ if (dev->id && strcmp(dev->id, elem) == 0) {
+ return dev;
+ }
+ }
+ QTAILQ_FOREACH(kid, &bus->children, sibling) {
+ DeviceState *dev = kid->child;
+ if (strcmp(object_get_typename(OBJECT(dev)), elem) == 0) {
+ return dev;
+ }
+ }
+ QTAILQ_FOREACH(kid, &bus->children, sibling) {
+ DeviceState *dev = kid->child;
+ DeviceClass *dc = DEVICE_GET_CLASS(dev);
+
+ if (qdev_class_has_alias(dc) &&
+ strcmp(qdev_class_get_alias(dc), elem) == 0) {
+ return dev;
+ }
+ }
+ return NULL;
+}
+
+static inline bool qbus_is_full(BusState *bus)
+{
+ BusClass *bus_class;
+
+ if (bus->full) {
+ return true;
+ }
+ bus_class = BUS_GET_CLASS(bus);
+ return bus_class->max_dev && bus->num_children >= bus_class->max_dev;
+}
+
+/*
+ * Search the tree rooted at @bus for a bus.
+ * If @name, search for a bus with that name. Note that bus names
+ * need not be unique. Yes, that's screwed up.
+ * Else search for a bus that is a subtype of @bus_typename.
+ * If more than one exists, prefer one that can take another device.
+ * Return the bus if found, else %NULL.
+ */
+static BusState *qbus_find_recursive(BusState *bus, const char *name,
+ const char *bus_typename)
+{
+ BusChild *kid;
+ BusState *pick, *child, *ret;
+ bool match;
+
+ assert(name || bus_typename);
+ if (name) {
+ match = !strcmp(bus->name, name);
+ } else {
+ match = !!object_dynamic_cast(OBJECT(bus), bus_typename);
+ }
+
+ if (match && !qbus_is_full(bus)) {
+ return bus; /* root matches and isn't full */
+ }
+
+ pick = match ? bus : NULL;
+
+ QTAILQ_FOREACH(kid, &bus->children, sibling) {
+ DeviceState *dev = kid->child;
+ QLIST_FOREACH(child, &dev->child_bus, sibling) {
+ ret = qbus_find_recursive(child, name, bus_typename);
+ if (ret && !qbus_is_full(ret)) {
+ return ret; /* a descendant matches and isn't full */
+ }
+ if (ret && !pick) {
+ pick = ret;
+ }
+ }
+ }
+
+ /* root or a descendant matches, but is full */
+ return pick;
+}
+
+static BusState *qbus_find(const char *path, Error **errp)
+{
+ DeviceState *dev;
+ BusState *bus;
+ char elem[128];
+ int pos, len;
+
+ /* find start element */
+ if (path[0] == '/') {
+ bus = sysbus_get_default();
+ pos = 0;
+ } else {
+ if (sscanf(path, "%127[^/]%n", elem, &len) != 1) {
+ assert(!path[0]);
+ elem[0] = len = 0;
+ }
+ bus = qbus_find_recursive(sysbus_get_default(), elem, NULL);
+ if (!bus) {
+ error_setg(errp, "Bus '%s' not found", elem);
+ return NULL;
+ }
+ pos = len;
+ }
+
+ for (;;) {
+ assert(path[pos] == '/' || !path[pos]);
+ while (path[pos] == '/') {
+ pos++;
+ }
+ if (path[pos] == '\0') {
+ break;
+ }
+
+ /* find device */
+ if (sscanf(path+pos, "%127[^/]%n", elem, &len) != 1) {
+ g_assert_not_reached();
+ elem[0] = len = 0;
+ }
+ pos += len;
+ dev = qbus_find_dev(bus, elem);
+ if (!dev) {
+ error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND,
+ "Device '%s' not found", elem);
+ qbus_error_append_dev_list_hint(bus, errp);
+ return NULL;
+ }
+
+ assert(path[pos] == '/' || !path[pos]);
+ while (path[pos] == '/') {
+ pos++;
+ }
+ if (path[pos] == '\0') {
+ /* last specified element is a device. If it has exactly
+ * one child bus accept it nevertheless */
+ if (dev->num_child_bus == 1) {
+ bus = QLIST_FIRST(&dev->child_bus);
+ break;
+ }
+ if (dev->num_child_bus) {
+ error_setg(errp, "Device '%s' has multiple child buses",
+ elem);
+ qbus_error_append_bus_list_hint(dev, errp);
+ } else {
+ error_setg(errp, "Device '%s' has no child bus", elem);
+ }
+ return NULL;
+ }
+
+ /* find bus */
+ if (sscanf(path+pos, "%127[^/]%n", elem, &len) != 1) {
+ g_assert_not_reached();
+ elem[0] = len = 0;
+ }
+ pos += len;
+ bus = qbus_find_bus(dev, elem);
+ if (!bus) {
+ error_setg(errp, "Bus '%s' not found", elem);
+ qbus_error_append_bus_list_hint(dev, errp);
+ return NULL;
+ }
+ }
+
+ if (qbus_is_full(bus)) {
+ error_setg(errp, "Bus '%s' is full", path);
+ return NULL;
+ }
+ return bus;
+}
+
+/* Takes ownership of @id, will be freed when deleting the device */
+const char *qdev_set_id(DeviceState *dev, char *id, Error **errp)
+{
+ ObjectProperty *prop;
+
+ assert(!dev->id && !dev->realized);
+
+ /*
+ * object_property_[try_]add_child() below will assert the device
+ * has no parent
+ */
+ if (id) {
+ prop = object_property_try_add_child(qdev_get_peripheral(), id,
+ OBJECT(dev), NULL);
+ if (prop) {
+ dev->id = id;
+ } else {
+ error_setg(errp, "Duplicate device ID '%s'", id);
+ g_free(id);
+ return NULL;
+ }
+ } else {
+ static int anon_count;
+ gchar *name = g_strdup_printf("device[%d]", anon_count++);
+ prop = object_property_add_child(qdev_get_peripheral_anon(), name,
+ OBJECT(dev));
+ g_free(name);
+ }
+
+ return prop->name;
+}
+
+DeviceState *qdev_device_add_from_qdict(const QDict *opts,
+ bool from_json, Error **errp)
+{
+ ERRP_GUARD();
+ DeviceClass *dc;
+ const char *driver, *path;
+ char *id;
+ DeviceState *dev = NULL;
+ BusState *bus = NULL;
+
+ driver = qdict_get_try_str(opts, "driver");
+ if (!driver) {
+ error_setg(errp, QERR_MISSING_PARAMETER, "driver");
+ return NULL;
+ }
+
+ /* find driver */
+ dc = qdev_get_device_class(&driver, errp);
+ if (!dc) {
+ return NULL;
+ }
+
+ /* find bus */
+ path = qdict_get_try_str(opts, "bus");
+ if (path != NULL) {
+ bus = qbus_find(path, errp);
+ if (!bus) {
+ return NULL;
+ }
+ if (!object_dynamic_cast(OBJECT(bus), dc->bus_type)) {
+ error_setg(errp, "Device '%s' can't go on %s bus",
+ driver, object_get_typename(OBJECT(bus)));
+ return NULL;
+ }
+ } else if (dc->bus_type != NULL) {
+ bus = qbus_find_recursive(sysbus_get_default(), NULL, dc->bus_type);
+ if (!bus || qbus_is_full(bus)) {
+ error_setg(errp, "No '%s' bus found for device '%s'",
+ dc->bus_type, driver);
+ return NULL;
+ }
+ }
+
+ if (qdev_should_hide_device(opts, from_json, errp)) {
+ if (bus && !qbus_is_hotpluggable(bus)) {
+ error_setg(errp, QERR_BUS_NO_HOTPLUG, bus->name);
+ }
+ return NULL;
+ } else if (*errp) {
+ return NULL;
+ }
+
+ if (phase_check(PHASE_MACHINE_READY) && bus && !qbus_is_hotpluggable(bus)) {
+ error_setg(errp, QERR_BUS_NO_HOTPLUG, bus->name);
+ return NULL;
+ }
+
+ if (!migration_is_idle()) {
+ error_setg(errp, "device_add not allowed while migrating");
+ return NULL;
+ }
+
+ /* create device */
+ dev = qdev_new(driver);
+
+ /* Check whether the hotplug is allowed by the machine */
+ if (phase_check(PHASE_MACHINE_READY)) {
+ if (!qdev_hotplug_allowed(dev, errp)) {
+ goto err_del_dev;
+ }
+
+ if (!bus && !qdev_get_machine_hotplug_handler(dev)) {
+ /* No bus, no machine hotplug handler --> device is not hotpluggable */
+ error_setg(errp, "Device '%s' can not be hotplugged on this machine",
+ driver);
+ goto err_del_dev;
+ }
+ }
+
+ /*
+ * set dev's parent and register its id.
+ * If it fails it means the id is already taken.
+ */
+ id = g_strdup(qdict_get_try_str(opts, "id"));
+ if (!qdev_set_id(dev, id, errp)) {
+ goto err_del_dev;
+ }
+
+ /* set properties */
+ dev->opts = qdict_clone_shallow(opts);
+ qdict_del(dev->opts, "driver");
+ qdict_del(dev->opts, "bus");
+ qdict_del(dev->opts, "id");
+
+ object_set_properties_from_keyval(&dev->parent_obj, dev->opts, from_json,
+ errp);
+ if (*errp) {
+ goto err_del_dev;
+ }
+
+ if (!qdev_realize(dev, bus, errp)) {
+ goto err_del_dev;
+ }
+ return dev;
+
+err_del_dev:
+ if (dev) {
+ object_unparent(OBJECT(dev));
+ object_unref(OBJECT(dev));
+ }
+ return NULL;
+}
+
+/* Takes ownership of @opts on success */
+DeviceState *qdev_device_add(QemuOpts *opts, Error **errp)
+{
+ QDict *qdict = qemu_opts_to_qdict(opts, NULL);
+ DeviceState *ret;
+
+ ret = qdev_device_add_from_qdict(qdict, false, errp);
+ if (ret) {
+ qemu_opts_del(opts);
+ }
+ qobject_unref(qdict);
+ return ret;
+}
+
+#define qdev_printf(fmt, ...) monitor_printf(mon, "%*s" fmt, indent, "", ## __VA_ARGS__)
+static void qbus_print(Monitor *mon, BusState *bus, int indent);
+
+static void qdev_print_props(Monitor *mon, DeviceState *dev, Property *props,
+ int indent)
+{
+ if (!props)
+ return;
+ for (; props->name; props++) {
+ char *value;
+ char *legacy_name = g_strdup_printf("legacy-%s", props->name);
+
+ if (object_property_get_type(OBJECT(dev), legacy_name, NULL)) {
+ value = object_property_get_str(OBJECT(dev), legacy_name, NULL);
+ } else {
+ value = object_property_print(OBJECT(dev), props->name, true,
+ NULL);
+ }
+ g_free(legacy_name);
+
+ if (!value) {
+ continue;
+ }
+ qdev_printf("%s = %s\n", props->name,
+ *value ? value : "<null>");
+ g_free(value);
+ }
+}
+
+static void bus_print_dev(BusState *bus, Monitor *mon, DeviceState *dev, int indent)
+{
+ BusClass *bc = BUS_GET_CLASS(bus);
+
+ if (bc->print_dev) {
+ bc->print_dev(mon, dev, indent);
+ }
+}
+
+static void qdev_print(Monitor *mon, DeviceState *dev, int indent)
+{
+ ObjectClass *class;
+ BusState *child;
+ NamedGPIOList *ngl;
+ NamedClockList *ncl;
+
+ qdev_printf("dev: %s, id \"%s\"\n", object_get_typename(OBJECT(dev)),
+ dev->id ? dev->id : "");
+ indent += 2;
+ QLIST_FOREACH(ngl, &dev->gpios, node) {
+ if (ngl->num_in) {
+ qdev_printf("gpio-in \"%s\" %d\n", ngl->name ? ngl->name : "",
+ ngl->num_in);
+ }
+ if (ngl->num_out) {
+ qdev_printf("gpio-out \"%s\" %d\n", ngl->name ? ngl->name : "",
+ ngl->num_out);
+ }
+ }
+ QLIST_FOREACH(ncl, &dev->clocks, node) {
+ g_autofree char *freq_str = clock_display_freq(ncl->clock);
+ qdev_printf("clock-%s%s \"%s\" freq_hz=%s\n",
+ ncl->output ? "out" : "in",
+ ncl->alias ? " (alias)" : "",
+ ncl->name, freq_str);
+ }
+ class = object_get_class(OBJECT(dev));
+ do {
+ qdev_print_props(mon, dev, DEVICE_CLASS(class)->props_, indent);
+ class = object_class_get_parent(class);
+ } while (class != object_class_by_name(TYPE_DEVICE));
+ bus_print_dev(dev->parent_bus, mon, dev, indent);
+ QLIST_FOREACH(child, &dev->child_bus, sibling) {
+ qbus_print(mon, child, indent);
+ }
+}
+
+static void qbus_print(Monitor *mon, BusState *bus, int indent)
+{
+ BusChild *kid;
+
+ qdev_printf("bus: %s\n", bus->name);
+ indent += 2;
+ qdev_printf("type %s\n", object_get_typename(OBJECT(bus)));
+ QTAILQ_FOREACH(kid, &bus->children, sibling) {
+ DeviceState *dev = kid->child;
+ qdev_print(mon, dev, indent);
+ }
+}
+#undef qdev_printf
+
+void hmp_info_qtree(Monitor *mon, const QDict *qdict)
+{
+ if (sysbus_get_default())
+ qbus_print(mon, sysbus_get_default(), 0);
+}
+
+void hmp_info_qdm(Monitor *mon, const QDict *qdict)
+{
+ qdev_print_devinfos(true);
+}
+
+void qmp_device_add(QDict *qdict, QObject **ret_data, Error **errp)
+{
+ QemuOpts *opts;
+ DeviceState *dev;
+
+ opts = qemu_opts_from_qdict(qemu_find_opts("device"), qdict, errp);
+ if (!opts) {
+ return;
+ }
+ if (!monitor_cur_is_qmp() && qdev_device_help(opts)) {
+ qemu_opts_del(opts);
+ return;
+ }
+ dev = qdev_device_add(opts, errp);
+
+ /*
+ * Drain all pending RCU callbacks. This is done because
+ * some bus related operations can delay a device removal
+ * (in this case this can happen if device is added and then
+ * removed due to a configuration error)
+ * to a RCU callback, but user might expect that this interface
+ * will finish its job completely once qmp command returns result
+ * to the user
+ */
+ drain_call_rcu();
+
+ if (!dev) {
+ qemu_opts_del(opts);
+ return;
+ }
+ object_unref(OBJECT(dev));
+}
+
+static DeviceState *find_device_state(const char *id, Error **errp)
+{
+ Object *obj = object_resolve_path_at(qdev_get_peripheral(), id);
+ DeviceState *dev;
+
+ if (!obj) {
+ error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND,
+ "Device '%s' not found", id);
+ return NULL;
+ }
+
+ dev = (DeviceState *)object_dynamic_cast(obj, TYPE_DEVICE);
+ if (!dev) {
+ error_setg(errp, "%s is not a hotpluggable device", id);
+ return NULL;
+ }
+
+ return dev;
+}
+
+void qdev_unplug(DeviceState *dev, Error **errp)
+{
+ DeviceClass *dc = DEVICE_GET_CLASS(dev);
+ HotplugHandler *hotplug_ctrl;
+ HotplugHandlerClass *hdc;
+ Error *local_err = NULL;
+
+ if (qdev_unplug_blocked(dev, errp)) {
+ return;
+ }
+
+ if (dev->parent_bus && !qbus_is_hotpluggable(dev->parent_bus)) {
+ error_setg(errp, QERR_BUS_NO_HOTPLUG, dev->parent_bus->name);
+ return;
+ }
+
+ if (!dc->hotpluggable) {
+ error_setg(errp, QERR_DEVICE_NO_HOTPLUG,
+ object_get_typename(OBJECT(dev)));
+ return;
+ }
+
+ if (!migration_is_idle() && !dev->allow_unplug_during_migration) {
+ error_setg(errp, "device_del not allowed while migrating");
+ return;
+ }
+
+ qdev_hot_removed = true;
+
+ hotplug_ctrl = qdev_get_hotplug_handler(dev);
+ /* hotpluggable device MUST have HotplugHandler, if it doesn't
+ * then something is very wrong with it */
+ g_assert(hotplug_ctrl);
+
+ /* If device supports async unplug just request it to be done,
+ * otherwise just remove it synchronously */
+ hdc = HOTPLUG_HANDLER_GET_CLASS(hotplug_ctrl);
+ if (hdc->unplug_request) {
+ hotplug_handler_unplug_request(hotplug_ctrl, dev, &local_err);
+ } else {
+ hotplug_handler_unplug(hotplug_ctrl, dev, &local_err);
+ if (!local_err) {
+ object_unparent(OBJECT(dev));
+ }
+ }
+ error_propagate(errp, local_err);
+}
+
+void qmp_device_del(const char *id, Error **errp)
+{
+ DeviceState *dev = find_device_state(id, errp);
+ if (dev != NULL) {
+ if (dev->pending_deleted_event &&
+ (dev->pending_deleted_expires_ms == 0 ||
+ dev->pending_deleted_expires_ms > qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL))) {
+ error_setg(errp, "Device %s is already in the "
+ "process of unplug", id);
+ return;
+ }
+
+ qdev_unplug(dev, errp);
+ }
+}
+
+void hmp_device_add(Monitor *mon, const QDict *qdict)
+{
+ Error *err = NULL;
+
+ qmp_device_add((QDict *)qdict, NULL, &err);
+ hmp_handle_error(mon, err);
+}
+
+void hmp_device_del(Monitor *mon, const QDict *qdict)
+{
+ const char *id = qdict_get_str(qdict, "id");
+ Error *err = NULL;
+
+ qmp_device_del(id, &err);
+ hmp_handle_error(mon, err);
+}
+
+void device_add_completion(ReadLineState *rs, int nb_args, const char *str)
+{
+ GSList *list, *elt;
+ size_t len;
+
+ if (nb_args != 2) {
+ return;
+ }
+
+ len = strlen(str);
+ readline_set_completion_index(rs, len);
+ list = elt = object_class_get_list(TYPE_DEVICE, false);
+ while (elt) {
+ DeviceClass *dc = OBJECT_CLASS_CHECK(DeviceClass, elt->data,
+ TYPE_DEVICE);
+
+ if (dc->user_creatable) {
+ readline_add_completion_of(rs, str,
+ object_class_get_name(OBJECT_CLASS(dc)));
+ }
+ elt = elt->next;
+ }
+ g_slist_free(list);
+}
+
+static int qdev_add_hotpluggable_device(Object *obj, void *opaque)
+{
+ GSList **list = opaque;
+ DeviceState *dev = (DeviceState *)object_dynamic_cast(obj, TYPE_DEVICE);
+
+ if (dev == NULL) {
+ return 0;
+ }
+
+ if (dev->realized && object_property_get_bool(obj, "hotpluggable", NULL)) {
+ *list = g_slist_append(*list, dev);
+ }
+
+ return 0;
+}
+
+static GSList *qdev_build_hotpluggable_device_list(Object *peripheral)
+{
+ GSList *list = NULL;
+
+ object_child_foreach(peripheral, qdev_add_hotpluggable_device, &list);
+
+ return list;
+}
+
+static void peripheral_device_del_completion(ReadLineState *rs,
+ const char *str)
+{
+ Object *peripheral = container_get(qdev_get_machine(), "/peripheral");
+ GSList *list, *item;
+
+ list = qdev_build_hotpluggable_device_list(peripheral);
+ if (!list) {
+ return;
+ }
+
+ for (item = list; item; item = g_slist_next(item)) {
+ DeviceState *dev = item->data;
+
+ if (dev->id) {
+ readline_add_completion_of(rs, str, dev->id);
+ }
+ }
+
+ g_slist_free(list);
+}
+
+void device_del_completion(ReadLineState *rs, int nb_args, const char *str)
+{
+ if (nb_args != 2) {
+ return;
+ }
+
+ readline_set_completion_index(rs, strlen(str));
+ peripheral_device_del_completion(rs, str);
+}
+
+BlockBackend *blk_by_qdev_id(const char *id, Error **errp)
+{
+ DeviceState *dev;
+ BlockBackend *blk;
+
+ GLOBAL_STATE_CODE();
+
+ dev = find_device_state(id, errp);
+ if (dev == NULL) {
+ return NULL;
+ }
+
+ blk = blk_by_dev(dev);
+ if (!blk) {
+ error_setg(errp, "Device does not have a block device backend");
+ }
+ return blk;
+}
+
+QemuOptsList qemu_device_opts = {
+ .name = "device",
+ .implied_opt_name = "driver",
+ .head = QTAILQ_HEAD_INITIALIZER(qemu_device_opts.head),
+ .desc = {
+ /*
+ * no elements => accept any
+ * sanity checking will happen later
+ * when setting device properties
+ */
+ { /* end of list */ }
+ },
+};
+
+QemuOptsList qemu_global_opts = {
+ .name = "global",
+ .head = QTAILQ_HEAD_INITIALIZER(qemu_global_opts.head),
+ .desc = {
+ {
+ .name = "driver",
+ .type = QEMU_OPT_STRING,
+ },{
+ .name = "property",
+ .type = QEMU_OPT_STRING,
+ },{
+ .name = "value",
+ .type = QEMU_OPT_STRING,
+ },
+ { /* end of list */ }
+ },
+};
+
+int qemu_global_option(const char *str)
+{
+ char driver[64], property[64];
+ QemuOpts *opts;
+ int rc, offset;
+
+ rc = sscanf(str, "%63[^.=].%63[^=]%n", driver, property, &offset);
+ if (rc == 2 && str[offset] == '=') {
+ opts = qemu_opts_create(&qemu_global_opts, NULL, 0, &error_abort);
+ qemu_opt_set(opts, "driver", driver, &error_abort);
+ qemu_opt_set(opts, "property", property, &error_abort);
+ qemu_opt_set(opts, "value", str + offset + 1, &error_abort);
+ return 0;
+ }
+
+ opts = qemu_opts_parse_noisily(&qemu_global_opts, str, false);
+ if (!opts) {
+ return -1;
+ }
+ if (!qemu_opt_get(opts, "driver")
+ || !qemu_opt_get(opts, "property")
+ || !qemu_opt_get(opts, "value")) {
+ error_report("options 'driver', 'property', and 'value'"
+ " are required");
+ return -1;
+ }
+
+ return 0;
+}
+
+bool qmp_command_available(const QmpCommand *cmd, Error **errp)
+{
+ if (!phase_check(PHASE_MACHINE_READY) &&
+ !(cmd->options & QCO_ALLOW_PRECONFIG)) {
+ error_setg(errp, "The command '%s' is permitted only after machine initialization has completed",
+ cmd->name);
+ return false;
+ }
+ return true;
+}
diff --git a/system/qemu-seccomp.c b/system/qemu-seccomp.c
new file mode 100644
index 0000000000..4d7439e7f7
--- /dev/null
+++ b/system/qemu-seccomp.c
@@ -0,0 +1,486 @@
+/*
+ * QEMU seccomp mode 2 support with libseccomp
+ *
+ * Copyright IBM, Corp. 2012
+ *
+ * Authors:
+ * Eduardo Otubo <eotubo@br.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qemu/config-file.h"
+#include "qemu/option.h"
+#include "qemu/module.h"
+#include <sys/prctl.h>
+#include <seccomp.h>
+#include "sysemu/seccomp.h"
+#include <linux/seccomp.h>
+
+/* For some architectures (notably ARM) cacheflush is not supported until
+ * libseccomp 2.2.3, but configure enforces that we are using a more recent
+ * version on those hosts, so it is OK for this check to be less strict.
+ */
+#if SCMP_VER_MAJOR >= 3
+ #define HAVE_CACHEFLUSH
+#elif SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR >= 2
+ #define HAVE_CACHEFLUSH
+#endif
+
+struct QemuSeccompSyscall {
+ int32_t num;
+ uint8_t set;
+ uint8_t narg;
+ const struct scmp_arg_cmp *arg_cmp;
+ uint32_t action;
+};
+
+const struct scmp_arg_cmp sched_setscheduler_arg[] = {
+ /* was SCMP_A1(SCMP_CMP_NE, SCHED_IDLE), but expanded due to GCC 4.x bug */
+ { .arg = 1, .op = SCMP_CMP_NE, .datum_a = SCHED_IDLE }
+};
+
+/*
+ * See 'NOTES' in 'man 2 clone' - s390 & cross have 'flags' in
+ * different position to other architectures
+ */
+#if defined(HOST_S390X) || defined(HOST_S390) || defined(HOST_CRIS)
+#define CLONE_FLAGS_ARG 1
+#else
+#define CLONE_FLAGS_ARG 0
+#endif
+
+#ifndef CLONE_PIDFD
+# define CLONE_PIDFD 0x00001000
+#endif
+
+#define REQUIRE_CLONE_FLAG(flag) \
+ const struct scmp_arg_cmp clone_arg ## flag[] = { \
+ { .arg = CLONE_FLAGS_ARG, \
+ .op = SCMP_CMP_MASKED_EQ, \
+ .datum_a = flag, .datum_b = 0 } }
+
+#define FORBID_CLONE_FLAG(flag) \
+ const struct scmp_arg_cmp clone_arg ## flag[] = { \
+ { .arg = CLONE_FLAGS_ARG, \
+ .op = SCMP_CMP_MASKED_EQ, \
+ .datum_a = flag, .datum_b = flag } }
+
+#define RULE_CLONE_FLAG(flag) \
+ { SCMP_SYS(clone), QEMU_SECCOMP_SET_SPAWN, \
+ ARRAY_SIZE(clone_arg ## flag), clone_arg ## flag, SCMP_ACT_TRAP }
+
+/* If no CLONE_* flags are set, except CSIGNAL, deny */
+const struct scmp_arg_cmp clone_arg_none[] = {
+ { .arg = CLONE_FLAGS_ARG,
+ .op = SCMP_CMP_MASKED_EQ,
+ .datum_a = ~(CSIGNAL), .datum_b = 0 }
+};
+
+/*
+ * pthread_create should always set all of these.
+ */
+REQUIRE_CLONE_FLAG(CLONE_VM);
+REQUIRE_CLONE_FLAG(CLONE_FS);
+REQUIRE_CLONE_FLAG(CLONE_FILES);
+REQUIRE_CLONE_FLAG(CLONE_SIGHAND);
+REQUIRE_CLONE_FLAG(CLONE_THREAD);
+REQUIRE_CLONE_FLAG(CLONE_SYSVSEM);
+REQUIRE_CLONE_FLAG(CLONE_SETTLS);
+REQUIRE_CLONE_FLAG(CLONE_PARENT_SETTID);
+REQUIRE_CLONE_FLAG(CLONE_CHILD_CLEARTID);
+/*
+ * Musl sets this in pthread_create too, but it is
+ * obsolete and harmless since its behaviour is
+ * subsumed under CLONE_THREAD
+ */
+/*REQUIRE_CLONE_FLAG(CLONE_DETACHED);*/
+
+
+/*
+ * These all indicate an attempt to spawn a process
+ * instead of a thread, or other undesirable scenarios
+ */
+FORBID_CLONE_FLAG(CLONE_PIDFD);
+FORBID_CLONE_FLAG(CLONE_PTRACE);
+FORBID_CLONE_FLAG(CLONE_VFORK);
+FORBID_CLONE_FLAG(CLONE_PARENT);
+FORBID_CLONE_FLAG(CLONE_NEWNS);
+FORBID_CLONE_FLAG(CLONE_UNTRACED);
+FORBID_CLONE_FLAG(CLONE_NEWCGROUP);
+FORBID_CLONE_FLAG(CLONE_NEWUTS);
+FORBID_CLONE_FLAG(CLONE_NEWIPC);
+FORBID_CLONE_FLAG(CLONE_NEWUSER);
+FORBID_CLONE_FLAG(CLONE_NEWPID);
+FORBID_CLONE_FLAG(CLONE_NEWNET);
+FORBID_CLONE_FLAG(CLONE_IO);
+
+
+static const struct QemuSeccompSyscall denylist[] = {
+ /* default set of syscalls that should get blocked */
+ { SCMP_SYS(reboot), QEMU_SECCOMP_SET_DEFAULT,
+ 0, NULL, SCMP_ACT_TRAP },
+ { SCMP_SYS(swapon), QEMU_SECCOMP_SET_DEFAULT,
+ 0, NULL, SCMP_ACT_TRAP },
+ { SCMP_SYS(swapoff), QEMU_SECCOMP_SET_DEFAULT,
+ 0, NULL, SCMP_ACT_TRAP },
+ { SCMP_SYS(syslog), QEMU_SECCOMP_SET_DEFAULT,
+ 0, NULL, SCMP_ACT_TRAP },
+ { SCMP_SYS(mount), QEMU_SECCOMP_SET_DEFAULT,
+ 0, NULL, SCMP_ACT_TRAP },
+ { SCMP_SYS(umount), QEMU_SECCOMP_SET_DEFAULT,
+ 0, NULL, SCMP_ACT_TRAP },
+ { SCMP_SYS(kexec_load), QEMU_SECCOMP_SET_DEFAULT,
+ 0, NULL, SCMP_ACT_TRAP },
+ { SCMP_SYS(afs_syscall), QEMU_SECCOMP_SET_DEFAULT,
+ 0, NULL, SCMP_ACT_TRAP },
+ { SCMP_SYS(break), QEMU_SECCOMP_SET_DEFAULT,
+ 0, NULL, SCMP_ACT_TRAP },
+ { SCMP_SYS(ftime), QEMU_SECCOMP_SET_DEFAULT,
+ 0, NULL, SCMP_ACT_TRAP },
+ { SCMP_SYS(getpmsg), QEMU_SECCOMP_SET_DEFAULT,
+ 0, NULL, SCMP_ACT_TRAP },
+ { SCMP_SYS(gtty), QEMU_SECCOMP_SET_DEFAULT,
+ 0, NULL, SCMP_ACT_TRAP },
+ { SCMP_SYS(lock), QEMU_SECCOMP_SET_DEFAULT,
+ 0, NULL, SCMP_ACT_TRAP },
+ { SCMP_SYS(mpx), QEMU_SECCOMP_SET_DEFAULT,
+ 0, NULL, SCMP_ACT_TRAP },
+ { SCMP_SYS(prof), QEMU_SECCOMP_SET_DEFAULT,
+ 0, NULL, SCMP_ACT_TRAP },
+ { SCMP_SYS(profil), QEMU_SECCOMP_SET_DEFAULT,
+ 0, NULL, SCMP_ACT_TRAP },
+ { SCMP_SYS(putpmsg), QEMU_SECCOMP_SET_DEFAULT,
+ 0, NULL, SCMP_ACT_TRAP },
+ { SCMP_SYS(security), QEMU_SECCOMP_SET_DEFAULT,
+ 0, NULL, SCMP_ACT_TRAP },
+ { SCMP_SYS(stty), QEMU_SECCOMP_SET_DEFAULT,
+ 0, NULL, SCMP_ACT_TRAP },
+ { SCMP_SYS(tuxcall), QEMU_SECCOMP_SET_DEFAULT,
+ 0, NULL, SCMP_ACT_TRAP },
+ { SCMP_SYS(ulimit), QEMU_SECCOMP_SET_DEFAULT,
+ 0, NULL, SCMP_ACT_TRAP },
+ { SCMP_SYS(vserver), QEMU_SECCOMP_SET_DEFAULT,
+ 0, NULL, SCMP_ACT_TRAP },
+ /* obsolete */
+ { SCMP_SYS(readdir), QEMU_SECCOMP_SET_OBSOLETE,
+ 0, NULL, SCMP_ACT_TRAP },
+ { SCMP_SYS(_sysctl), QEMU_SECCOMP_SET_OBSOLETE,
+ 0, NULL, SCMP_ACT_TRAP },
+ { SCMP_SYS(bdflush), QEMU_SECCOMP_SET_OBSOLETE,
+ 0, NULL, SCMP_ACT_TRAP },
+ { SCMP_SYS(create_module), QEMU_SECCOMP_SET_OBSOLETE,
+ 0, NULL, SCMP_ACT_TRAP },
+ { SCMP_SYS(get_kernel_syms), QEMU_SECCOMP_SET_OBSOLETE,
+ 0, NULL, SCMP_ACT_TRAP },
+ { SCMP_SYS(query_module), QEMU_SECCOMP_SET_OBSOLETE,
+ 0, NULL, SCMP_ACT_TRAP },
+ { SCMP_SYS(sgetmask), QEMU_SECCOMP_SET_OBSOLETE,
+ 0, NULL, SCMP_ACT_TRAP },
+ { SCMP_SYS(ssetmask), QEMU_SECCOMP_SET_OBSOLETE,
+ 0, NULL, SCMP_ACT_TRAP },
+ { SCMP_SYS(sysfs), QEMU_SECCOMP_SET_OBSOLETE,
+ 0, NULL, SCMP_ACT_TRAP },
+ { SCMP_SYS(uselib), QEMU_SECCOMP_SET_OBSOLETE,
+ 0, NULL, SCMP_ACT_TRAP },
+ { SCMP_SYS(ustat), QEMU_SECCOMP_SET_OBSOLETE,
+ 0, NULL, SCMP_ACT_TRAP },
+ /* privileged */
+ { SCMP_SYS(setuid), QEMU_SECCOMP_SET_PRIVILEGED,
+ 0, NULL, SCMP_ACT_TRAP },
+ { SCMP_SYS(setgid), QEMU_SECCOMP_SET_PRIVILEGED,
+ 0, NULL, SCMP_ACT_TRAP },
+ { SCMP_SYS(setpgid), QEMU_SECCOMP_SET_PRIVILEGED,
+ 0, NULL, SCMP_ACT_TRAP },
+ { SCMP_SYS(setsid), QEMU_SECCOMP_SET_PRIVILEGED,
+ 0, NULL, SCMP_ACT_TRAP },
+ { SCMP_SYS(setreuid), QEMU_SECCOMP_SET_PRIVILEGED,
+ 0, NULL, SCMP_ACT_TRAP },
+ { SCMP_SYS(setregid), QEMU_SECCOMP_SET_PRIVILEGED,
+ 0, NULL, SCMP_ACT_TRAP },
+ { SCMP_SYS(setresuid), QEMU_SECCOMP_SET_PRIVILEGED,
+ 0, NULL, SCMP_ACT_TRAP },
+ { SCMP_SYS(setresgid), QEMU_SECCOMP_SET_PRIVILEGED,
+ 0, NULL, SCMP_ACT_TRAP },
+ { SCMP_SYS(setfsuid), QEMU_SECCOMP_SET_PRIVILEGED,
+ 0, NULL, SCMP_ACT_TRAP },
+ { SCMP_SYS(setfsgid), QEMU_SECCOMP_SET_PRIVILEGED,
+ 0, NULL, SCMP_ACT_TRAP },
+ /* spawn */
+ { SCMP_SYS(fork), QEMU_SECCOMP_SET_SPAWN,
+ 0, NULL, SCMP_ACT_TRAP },
+ { SCMP_SYS(vfork), QEMU_SECCOMP_SET_SPAWN,
+ 0, NULL, SCMP_ACT_TRAP },
+ { SCMP_SYS(execve), QEMU_SECCOMP_SET_SPAWN,
+ 0, NULL, SCMP_ACT_TRAP },
+ { SCMP_SYS(clone), QEMU_SECCOMP_SET_SPAWN,
+ ARRAY_SIZE(clone_arg_none), clone_arg_none, SCMP_ACT_TRAP },
+ RULE_CLONE_FLAG(CLONE_VM),
+ RULE_CLONE_FLAG(CLONE_FS),
+ RULE_CLONE_FLAG(CLONE_FILES),
+ RULE_CLONE_FLAG(CLONE_SIGHAND),
+ RULE_CLONE_FLAG(CLONE_THREAD),
+ RULE_CLONE_FLAG(CLONE_SYSVSEM),
+ RULE_CLONE_FLAG(CLONE_SETTLS),
+ RULE_CLONE_FLAG(CLONE_PARENT_SETTID),
+ RULE_CLONE_FLAG(CLONE_CHILD_CLEARTID),
+ /*RULE_CLONE_FLAG(CLONE_DETACHED),*/
+ RULE_CLONE_FLAG(CLONE_PIDFD),
+ RULE_CLONE_FLAG(CLONE_PTRACE),
+ RULE_CLONE_FLAG(CLONE_VFORK),
+ RULE_CLONE_FLAG(CLONE_PARENT),
+ RULE_CLONE_FLAG(CLONE_NEWNS),
+ RULE_CLONE_FLAG(CLONE_UNTRACED),
+ RULE_CLONE_FLAG(CLONE_NEWCGROUP),
+ RULE_CLONE_FLAG(CLONE_NEWUTS),
+ RULE_CLONE_FLAG(CLONE_NEWIPC),
+ RULE_CLONE_FLAG(CLONE_NEWUSER),
+ RULE_CLONE_FLAG(CLONE_NEWPID),
+ RULE_CLONE_FLAG(CLONE_NEWNET),
+ RULE_CLONE_FLAG(CLONE_IO),
+#ifdef __SNR_clone3
+ { SCMP_SYS(clone3), QEMU_SECCOMP_SET_SPAWN,
+ 0, NULL, SCMP_ACT_ERRNO(ENOSYS) },
+#endif
+#ifdef __SNR_execveat
+ { SCMP_SYS(execveat), QEMU_SECCOMP_SET_SPAWN },
+#endif
+ { SCMP_SYS(setns), QEMU_SECCOMP_SET_SPAWN },
+ { SCMP_SYS(unshare), QEMU_SECCOMP_SET_SPAWN },
+ /* resource control */
+ { SCMP_SYS(setpriority), QEMU_SECCOMP_SET_RESOURCECTL,
+ 0, NULL, SCMP_ACT_ERRNO(EPERM) },
+ { SCMP_SYS(sched_setparam), QEMU_SECCOMP_SET_RESOURCECTL,
+ 0, NULL, SCMP_ACT_ERRNO(EPERM) },
+ { SCMP_SYS(sched_setscheduler), QEMU_SECCOMP_SET_RESOURCECTL,
+ ARRAY_SIZE(sched_setscheduler_arg), sched_setscheduler_arg,
+ SCMP_ACT_ERRNO(EPERM) },
+ { SCMP_SYS(sched_setaffinity), QEMU_SECCOMP_SET_RESOURCECTL,
+ 0, NULL, SCMP_ACT_ERRNO(EPERM) },
+};
+
+static inline __attribute__((unused)) int
+qemu_seccomp(unsigned int operation, unsigned int flags, void *args)
+{
+#ifdef __NR_seccomp
+ return syscall(__NR_seccomp, operation, flags, args);
+#else
+ errno = ENOSYS;
+ return -1;
+#endif
+}
+
+static uint32_t qemu_seccomp_update_action(uint32_t action)
+{
+#if defined(SECCOMP_GET_ACTION_AVAIL) && defined(SCMP_ACT_KILL_PROCESS) && \
+ defined(SECCOMP_RET_KILL_PROCESS)
+ if (action == SCMP_ACT_TRAP) {
+ static int kill_process = -1;
+ if (kill_process == -1) {
+ uint32_t testaction = SECCOMP_RET_KILL_PROCESS;
+
+ if (qemu_seccomp(SECCOMP_GET_ACTION_AVAIL, 0, &testaction) == 0) {
+ kill_process = 1;
+ } else {
+ kill_process = 0;
+ }
+ }
+ if (kill_process == 1) {
+ return SCMP_ACT_KILL_PROCESS;
+ }
+ }
+#endif
+ return action;
+}
+
+
+static int seccomp_start(uint32_t seccomp_opts, Error **errp)
+{
+ int rc = -1;
+ unsigned int i = 0;
+ scmp_filter_ctx ctx;
+
+ ctx = seccomp_init(SCMP_ACT_ALLOW);
+ if (ctx == NULL) {
+ error_setg(errp, "failed to initialize seccomp context");
+ goto seccomp_return;
+ }
+
+#if defined(CONFIG_SECCOMP_SYSRAWRC)
+ /*
+ * This must be the first seccomp_attr_set() call to have full
+ * error propagation from subsequent seccomp APIs.
+ */
+ rc = seccomp_attr_set(ctx, SCMP_FLTATR_API_SYSRAWRC, 1);
+ if (rc != 0) {
+ error_setg_errno(errp, -rc,
+ "failed to set seccomp rawrc attribute");
+ goto seccomp_return;
+ }
+#endif
+
+ rc = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_TSYNC, 1);
+ if (rc != 0) {
+ error_setg_errno(errp, -rc,
+ "failed to set seccomp thread synchronization");
+ goto seccomp_return;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(denylist); i++) {
+ uint32_t action;
+ if (!(seccomp_opts & denylist[i].set)) {
+ continue;
+ }
+
+ action = qemu_seccomp_update_action(denylist[i].action);
+ rc = seccomp_rule_add_array(ctx, action, denylist[i].num,
+ denylist[i].narg, denylist[i].arg_cmp);
+ if (rc < 0) {
+ error_setg_errno(errp, -rc,
+ "failed to add seccomp denylist rules");
+ goto seccomp_return;
+ }
+ }
+
+ rc = seccomp_load(ctx);
+ if (rc < 0) {
+ error_setg_errno(errp, -rc,
+ "failed to load seccomp syscall filter in kernel");
+ }
+
+ seccomp_return:
+ seccomp_release(ctx);
+ return rc < 0 ? -1 : 0;
+}
+
+int parse_sandbox(void *opaque, QemuOpts *opts, Error **errp)
+{
+ if (qemu_opt_get_bool(opts, "enable", false)) {
+ uint32_t seccomp_opts = QEMU_SECCOMP_SET_DEFAULT
+ | QEMU_SECCOMP_SET_OBSOLETE;
+ const char *value = NULL;
+
+ value = qemu_opt_get(opts, "obsolete");
+ if (value) {
+ if (g_str_equal(value, "allow")) {
+ seccomp_opts &= ~QEMU_SECCOMP_SET_OBSOLETE;
+ } else if (g_str_equal(value, "deny")) {
+ /* this is the default option, this if is here
+ * to provide a little bit of consistency for
+ * the command line */
+ } else {
+ error_setg(errp, "invalid argument for obsolete");
+ return -1;
+ }
+ }
+
+ value = qemu_opt_get(opts, "elevateprivileges");
+ if (value) {
+ if (g_str_equal(value, "deny")) {
+ seccomp_opts |= QEMU_SECCOMP_SET_PRIVILEGED;
+ } else if (g_str_equal(value, "children")) {
+ seccomp_opts |= QEMU_SECCOMP_SET_PRIVILEGED;
+
+ /* calling prctl directly because we're
+ * not sure if host has CAP_SYS_ADMIN set*/
+ if (prctl(PR_SET_NO_NEW_PRIVS, 1)) {
+ error_setg(errp, "failed to set no_new_privs aborting");
+ return -1;
+ }
+ } else if (g_str_equal(value, "allow")) {
+ /* default value */
+ } else {
+ error_setg(errp, "invalid argument for elevateprivileges");
+ return -1;
+ }
+ }
+
+ value = qemu_opt_get(opts, "spawn");
+ if (value) {
+ if (g_str_equal(value, "deny")) {
+ seccomp_opts |= QEMU_SECCOMP_SET_SPAWN;
+ } else if (g_str_equal(value, "allow")) {
+ /* default value */
+ } else {
+ error_setg(errp, "invalid argument for spawn");
+ return -1;
+ }
+ }
+
+ value = qemu_opt_get(opts, "resourcecontrol");
+ if (value) {
+ if (g_str_equal(value, "deny")) {
+ seccomp_opts |= QEMU_SECCOMP_SET_RESOURCECTL;
+ } else if (g_str_equal(value, "allow")) {
+ /* default value */
+ } else {
+ error_setg(errp, "invalid argument for resourcecontrol");
+ return -1;
+ }
+ }
+
+ if (seccomp_start(seccomp_opts, errp) < 0) {
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+static QemuOptsList qemu_sandbox_opts = {
+ .name = "sandbox",
+ .implied_opt_name = "enable",
+ .head = QTAILQ_HEAD_INITIALIZER(qemu_sandbox_opts.head),
+ .desc = {
+ {
+ .name = "enable",
+ .type = QEMU_OPT_BOOL,
+ },
+ {
+ .name = "obsolete",
+ .type = QEMU_OPT_STRING,
+ },
+ {
+ .name = "elevateprivileges",
+ .type = QEMU_OPT_STRING,
+ },
+ {
+ .name = "spawn",
+ .type = QEMU_OPT_STRING,
+ },
+ {
+ .name = "resourcecontrol",
+ .type = QEMU_OPT_STRING,
+ },
+ { /* end of list */ }
+ },
+};
+
+static void seccomp_register(void)
+{
+ bool add = false;
+
+ /* FIXME: use seccomp_api_get() >= 2 check when released */
+
+#if defined(SECCOMP_FILTER_FLAG_TSYNC)
+ int check;
+
+ /* check host TSYNC capability, it returns errno == ENOSYS if unavailable */
+ check = qemu_seccomp(SECCOMP_SET_MODE_FILTER,
+ SECCOMP_FILTER_FLAG_TSYNC, NULL);
+ if (check < 0 && errno == EFAULT) {
+ add = true;
+ }
+#endif
+
+ if (add) {
+ qemu_add_opts(&qemu_sandbox_opts);
+ }
+}
+opts_init(seccomp_register);
diff --git a/system/qtest.c b/system/qtest.c
new file mode 100644
index 0000000000..35b643a274
--- /dev/null
+++ b/system/qtest.c
@@ -0,0 +1,1070 @@
+/*
+ * Test Server
+ *
+ * Copyright IBM, Corp. 2011
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "sysemu/qtest.h"
+#include "sysemu/runstate.h"
+#include "chardev/char-fe.h"
+#include "exec/ioport.h"
+#include "exec/memory.h"
+#include "exec/tswap.h"
+#include "hw/qdev-core.h"
+#include "hw/irq.h"
+#include "qemu/accel.h"
+#include "sysemu/cpu-timers.h"
+#include "qemu/config-file.h"
+#include "qemu/option.h"
+#include "qemu/error-report.h"
+#include "qemu/module.h"
+#include "qemu/cutils.h"
+#include "qom/object_interfaces.h"
+
+#define MAX_IRQ 256
+
+#define TYPE_QTEST "qtest"
+
+OBJECT_DECLARE_SIMPLE_TYPE(QTest, QTEST)
+
+struct QTest {
+ Object parent;
+
+ bool has_machine_link;
+ char *chr_name;
+ Chardev *chr;
+ CharBackend qtest_chr;
+ char *log;
+};
+
+bool qtest_allowed;
+
+static DeviceState *irq_intercept_dev;
+static FILE *qtest_log_fp;
+static QTest *qtest;
+static GString *inbuf;
+static int irq_levels[MAX_IRQ];
+static GTimer *timer;
+static bool qtest_opened;
+static void (*qtest_server_send)(void*, const char*);
+static void *qtest_server_send_opaque;
+
+#define FMT_timeval "%.06f"
+
+/**
+ * DOC: QTest Protocol
+ *
+ * Line based protocol, request/response based. Server can send async messages
+ * so clients should always handle many async messages before the response
+ * comes in.
+ *
+ * Valid requests
+ * ^^^^^^^^^^^^^^
+ *
+ * Clock management:
+ * """""""""""""""""
+ *
+ * The qtest client is completely in charge of the QEMU_CLOCK_VIRTUAL. qtest commands
+ * let you adjust the value of the clock (monotonically). All the commands
+ * return the current value of the clock in nanoseconds.
+ *
+ * .. code-block:: none
+ *
+ * > clock_step
+ * < OK VALUE
+ *
+ * Advance the clock to the next deadline. Useful when waiting for
+ * asynchronous events.
+ *
+ * .. code-block:: none
+ *
+ * > clock_step NS
+ * < OK VALUE
+ *
+ * Advance the clock by NS nanoseconds.
+ *
+ * .. code-block:: none
+ *
+ * > clock_set NS
+ * < OK VALUE
+ *
+ * Advance the clock to NS nanoseconds (do nothing if it's already past).
+ *
+ * PIO and memory access:
+ * """"""""""""""""""""""
+ *
+ * .. code-block:: none
+ *
+ * > outb ADDR VALUE
+ * < OK
+ *
+ * .. code-block:: none
+ *
+ * > outw ADDR VALUE
+ * < OK
+ *
+ * .. code-block:: none
+ *
+ * > outl ADDR VALUE
+ * < OK
+ *
+ * .. code-block:: none
+ *
+ * > inb ADDR
+ * < OK VALUE
+ *
+ * .. code-block:: none
+ *
+ * > inw ADDR
+ * < OK VALUE
+ *
+ * .. code-block:: none
+ *
+ * > inl ADDR
+ * < OK VALUE
+ *
+ * .. code-block:: none
+ *
+ * > writeb ADDR VALUE
+ * < OK
+ *
+ * .. code-block:: none
+ *
+ * > writew ADDR VALUE
+ * < OK
+ *
+ * .. code-block:: none
+ *
+ * > writel ADDR VALUE
+ * < OK
+ *
+ * .. code-block:: none
+ *
+ * > writeq ADDR VALUE
+ * < OK
+ *
+ * .. code-block:: none
+ *
+ * > readb ADDR
+ * < OK VALUE
+ *
+ * .. code-block:: none
+ *
+ * > readw ADDR
+ * < OK VALUE
+ *
+ * .. code-block:: none
+ *
+ * > readl ADDR
+ * < OK VALUE
+ *
+ * .. code-block:: none
+ *
+ * > readq ADDR
+ * < OK VALUE
+ *
+ * .. code-block:: none
+ *
+ * > read ADDR SIZE
+ * < OK DATA
+ *
+ * .. code-block:: none
+ *
+ * > write ADDR SIZE DATA
+ * < OK
+ *
+ * .. code-block:: none
+ *
+ * > b64read ADDR SIZE
+ * < OK B64_DATA
+ *
+ * .. code-block:: none
+ *
+ * > b64write ADDR SIZE B64_DATA
+ * < OK
+ *
+ * .. code-block:: none
+ *
+ * > memset ADDR SIZE VALUE
+ * < OK
+ *
+ * ADDR, SIZE, VALUE are all integers parsed with strtoul() with a base of 0.
+ * For 'memset' a zero size is permitted and does nothing.
+ *
+ * DATA is an arbitrarily long hex number prefixed with '0x'. If it's smaller
+ * than the expected size, the value will be zero filled at the end of the data
+ * sequence.
+ *
+ * B64_DATA is an arbitrarily long base64 encoded string.
+ * If the sizes do not match, the data will be truncated.
+ *
+ * IRQ management:
+ * """""""""""""""
+ *
+ * .. code-block:: none
+ *
+ * > irq_intercept_in QOM-PATH
+ * < OK
+ *
+ * .. code-block:: none
+ *
+ * > irq_intercept_out QOM-PATH
+ * < OK
+ *
+ * Attach to the gpio-in (resp. gpio-out) pins exported by the device at
+ * QOM-PATH. When the pin is triggered, one of the following async messages
+ * will be printed to the qtest stream::
+ *
+ * IRQ raise NUM
+ * IRQ lower NUM
+ *
+ * where NUM is an IRQ number. For the PC, interrupts can be intercepted
+ * simply with "irq_intercept_in ioapic" (note that IRQ0 comes out with
+ * NUM=0 even though it is remapped to GSI 2).
+ *
+ * Setting interrupt level:
+ * """"""""""""""""""""""""
+ *
+ * .. code-block:: none
+ *
+ * > set_irq_in QOM-PATH NAME NUM LEVEL
+ * < OK
+ *
+ * where NAME is the name of the irq/gpio list, NUM is an IRQ number and
+ * LEVEL is an signed integer IRQ level.
+ *
+ * Forcibly set the given interrupt pin to the given level.
+ *
+ */
+
+static int hex2nib(char ch)
+{
+ if (ch >= '0' && ch <= '9') {
+ return ch - '0';
+ } else if (ch >= 'a' && ch <= 'f') {
+ return 10 + (ch - 'a');
+ } else if (ch >= 'A' && ch <= 'F') {
+ return 10 + (ch - 'A');
+ } else {
+ return -1;
+ }
+}
+
+void qtest_send_prefix(CharBackend *chr)
+{
+ if (!qtest_log_fp || !qtest_opened) {
+ return;
+ }
+
+ fprintf(qtest_log_fp, "[S +" FMT_timeval "] ", g_timer_elapsed(timer, NULL));
+}
+
+static void G_GNUC_PRINTF(1, 2) qtest_log_send(const char *fmt, ...)
+{
+ va_list ap;
+
+ if (!qtest_log_fp || !qtest_opened) {
+ return;
+ }
+
+ qtest_send_prefix(NULL);
+
+ va_start(ap, fmt);
+ vfprintf(qtest_log_fp, fmt, ap);
+ va_end(ap);
+}
+
+static void qtest_server_char_be_send(void *opaque, const char *str)
+{
+ size_t len = strlen(str);
+ CharBackend* chr = (CharBackend *)opaque;
+ qemu_chr_fe_write_all(chr, (uint8_t *)str, len);
+ if (qtest_log_fp && qtest_opened) {
+ fprintf(qtest_log_fp, "%s", str);
+ }
+}
+
+static void qtest_send(CharBackend *chr, const char *str)
+{
+ qtest_server_send(qtest_server_send_opaque, str);
+}
+
+void qtest_sendf(CharBackend *chr, const char *fmt, ...)
+{
+ va_list ap;
+ gchar *buffer;
+
+ va_start(ap, fmt);
+ buffer = g_strdup_vprintf(fmt, ap);
+ qtest_send(chr, buffer);
+ g_free(buffer);
+ va_end(ap);
+}
+
+static void qtest_irq_handler(void *opaque, int n, int level)
+{
+ qemu_irq old_irq = *(qemu_irq *)opaque;
+ qemu_set_irq(old_irq, level);
+
+ if (irq_levels[n] != level) {
+ CharBackend *chr = &qtest->qtest_chr;
+ irq_levels[n] = level;
+ qtest_send_prefix(chr);
+ qtest_sendf(chr, "IRQ %s %d\n",
+ level ? "raise" : "lower", n);
+ }
+}
+
+static int64_t qtest_clock_counter;
+
+int64_t qtest_get_virtual_clock(void)
+{
+ return qatomic_read_i64(&qtest_clock_counter);
+}
+
+static void qtest_set_virtual_clock(int64_t count)
+{
+ qatomic_set_i64(&qtest_clock_counter, count);
+}
+
+static void qtest_clock_warp(int64_t dest)
+{
+ int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
+ AioContext *aio_context;
+ assert(qtest_enabled());
+ aio_context = qemu_get_aio_context();
+ while (clock < dest) {
+ int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
+ QEMU_TIMER_ATTR_ALL);
+ int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
+
+ qtest_set_virtual_clock(qtest_get_virtual_clock() + warp);
+
+ qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
+ timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
+ clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
+ }
+ qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
+}
+
+static bool (*process_command_cb)(CharBackend *chr, gchar **words);
+
+void qtest_set_command_cb(bool (*pc_cb)(CharBackend *chr, gchar **words))
+{
+ assert(!process_command_cb); /* Switch to a list if we need more than one */
+
+ process_command_cb = pc_cb;
+}
+
+static void qtest_install_gpio_out_intercept(DeviceState *dev, const char *name, int n)
+{
+ qemu_irq *disconnected = g_new0(qemu_irq, 1);
+ qemu_irq icpt = qemu_allocate_irq(qtest_irq_handler,
+ disconnected, n);
+
+ *disconnected = qdev_intercept_gpio_out(dev, icpt, name, n);
+}
+
+static void qtest_process_command(CharBackend *chr, gchar **words)
+{
+ const gchar *command;
+
+ g_assert(words);
+
+ command = words[0];
+
+ if (qtest_log_fp) {
+ int i;
+
+ fprintf(qtest_log_fp, "[R +" FMT_timeval "]", g_timer_elapsed(timer, NULL));
+ for (i = 0; words[i]; i++) {
+ fprintf(qtest_log_fp, " %s", words[i]);
+ }
+ fprintf(qtest_log_fp, "\n");
+ }
+
+ g_assert(command);
+ if (strcmp(words[0], "irq_intercept_out") == 0
+ || strcmp(words[0], "irq_intercept_in") == 0) {
+ DeviceState *dev;
+ NamedGPIOList *ngl;
+ bool is_named;
+ bool is_outbound;
+ bool interception_succeeded = false;
+
+ g_assert(words[1]);
+ is_named = words[2] != NULL;
+ is_outbound = words[0][14] == 'o';
+ dev = DEVICE(object_resolve_path(words[1], NULL));
+ if (!dev) {
+ qtest_send_prefix(chr);
+ qtest_send(chr, "FAIL Unknown device\n");
+ return;
+ }
+
+ if (is_named && !is_outbound) {
+ qtest_send_prefix(chr);
+ qtest_send(chr, "FAIL Interception of named in-GPIOs not yet supported\n");
+ return;
+ }
+
+ if (irq_intercept_dev) {
+ qtest_send_prefix(chr);
+ if (irq_intercept_dev != dev) {
+ qtest_send(chr, "FAIL IRQ intercept already enabled\n");
+ } else {
+ qtest_send(chr, "OK\n");
+ }
+ return;
+ }
+
+ QLIST_FOREACH(ngl, &dev->gpios, node) {
+ /* We don't support inbound interception of named GPIOs yet */
+ if (is_outbound) {
+ /* NULL is valid and matchable, for "unnamed GPIO" */
+ if (g_strcmp0(ngl->name, words[2]) == 0) {
+ int i;
+ for (i = 0; i < ngl->num_out; ++i) {
+ qtest_install_gpio_out_intercept(dev, ngl->name, i);
+ }
+ interception_succeeded = true;
+ }
+ } else {
+ qemu_irq_intercept_in(ngl->in, qtest_irq_handler,
+ ngl->num_in);
+ interception_succeeded = true;
+ }
+ }
+
+ qtest_send_prefix(chr);
+ if (interception_succeeded) {
+ irq_intercept_dev = dev;
+ qtest_send(chr, "OK\n");
+ } else {
+ qtest_send(chr, "FAIL No intercepts installed\n");
+ }
+ } else if (strcmp(words[0], "set_irq_in") == 0) {
+ DeviceState *dev;
+ qemu_irq irq;
+ char *name;
+ int ret;
+ int num;
+ int level;
+
+ g_assert(words[1] && words[2] && words[3] && words[4]);
+
+ dev = DEVICE(object_resolve_path(words[1], NULL));
+ if (!dev) {
+ qtest_send_prefix(chr);
+ qtest_send(chr, "FAIL Unknown device\n");
+ return;
+ }
+
+ if (strcmp(words[2], "unnamed-gpio-in") == 0) {
+ name = NULL;
+ } else {
+ name = words[2];
+ }
+
+ ret = qemu_strtoi(words[3], NULL, 0, &num);
+ g_assert(!ret);
+ ret = qemu_strtoi(words[4], NULL, 0, &level);
+ g_assert(!ret);
+
+ irq = qdev_get_gpio_in_named(dev, name, num);
+
+ qemu_set_irq(irq, level);
+ qtest_send_prefix(chr);
+ qtest_send(chr, "OK\n");
+ } else if (strcmp(words[0], "outb") == 0 ||
+ strcmp(words[0], "outw") == 0 ||
+ strcmp(words[0], "outl") == 0) {
+ unsigned long addr;
+ unsigned long value;
+ int ret;
+
+ g_assert(words[1] && words[2]);
+ ret = qemu_strtoul(words[1], NULL, 0, &addr);
+ g_assert(ret == 0);
+ ret = qemu_strtoul(words[2], NULL, 0, &value);
+ g_assert(ret == 0);
+ g_assert(addr <= 0xffff);
+
+ if (words[0][3] == 'b') {
+ cpu_outb(addr, value);
+ } else if (words[0][3] == 'w') {
+ cpu_outw(addr, value);
+ } else if (words[0][3] == 'l') {
+ cpu_outl(addr, value);
+ }
+ qtest_send_prefix(chr);
+ qtest_send(chr, "OK\n");
+ } else if (strcmp(words[0], "inb") == 0 ||
+ strcmp(words[0], "inw") == 0 ||
+ strcmp(words[0], "inl") == 0) {
+ unsigned long addr;
+ uint32_t value = -1U;
+ int ret;
+
+ g_assert(words[1]);
+ ret = qemu_strtoul(words[1], NULL, 0, &addr);
+ g_assert(ret == 0);
+ g_assert(addr <= 0xffff);
+
+ if (words[0][2] == 'b') {
+ value = cpu_inb(addr);
+ } else if (words[0][2] == 'w') {
+ value = cpu_inw(addr);
+ } else if (words[0][2] == 'l') {
+ value = cpu_inl(addr);
+ }
+ qtest_send_prefix(chr);
+ qtest_sendf(chr, "OK 0x%04x\n", value);
+ } else if (strcmp(words[0], "writeb") == 0 ||
+ strcmp(words[0], "writew") == 0 ||
+ strcmp(words[0], "writel") == 0 ||
+ strcmp(words[0], "writeq") == 0) {
+ uint64_t addr;
+ uint64_t value;
+ int ret;
+
+ g_assert(words[1] && words[2]);
+ ret = qemu_strtou64(words[1], NULL, 0, &addr);
+ g_assert(ret == 0);
+ ret = qemu_strtou64(words[2], NULL, 0, &value);
+ g_assert(ret == 0);
+
+ if (words[0][5] == 'b') {
+ uint8_t data = value;
+ address_space_write(first_cpu->as, addr, MEMTXATTRS_UNSPECIFIED,
+ &data, 1);
+ } else if (words[0][5] == 'w') {
+ uint16_t data = value;
+ tswap16s(&data);
+ address_space_write(first_cpu->as, addr, MEMTXATTRS_UNSPECIFIED,
+ &data, 2);
+ } else if (words[0][5] == 'l') {
+ uint32_t data = value;
+ tswap32s(&data);
+ address_space_write(first_cpu->as, addr, MEMTXATTRS_UNSPECIFIED,
+ &data, 4);
+ } else if (words[0][5] == 'q') {
+ uint64_t data = value;
+ tswap64s(&data);
+ address_space_write(first_cpu->as, addr, MEMTXATTRS_UNSPECIFIED,
+ &data, 8);
+ }
+ qtest_send_prefix(chr);
+ qtest_send(chr, "OK\n");
+ } else if (strcmp(words[0], "readb") == 0 ||
+ strcmp(words[0], "readw") == 0 ||
+ strcmp(words[0], "readl") == 0 ||
+ strcmp(words[0], "readq") == 0) {
+ uint64_t addr;
+ uint64_t value = UINT64_C(-1);
+ int ret;
+
+ g_assert(words[1]);
+ ret = qemu_strtou64(words[1], NULL, 0, &addr);
+ g_assert(ret == 0);
+
+ if (words[0][4] == 'b') {
+ uint8_t data;
+ address_space_read(first_cpu->as, addr, MEMTXATTRS_UNSPECIFIED,
+ &data, 1);
+ value = data;
+ } else if (words[0][4] == 'w') {
+ uint16_t data;
+ address_space_read(first_cpu->as, addr, MEMTXATTRS_UNSPECIFIED,
+ &data, 2);
+ value = tswap16(data);
+ } else if (words[0][4] == 'l') {
+ uint32_t data;
+ address_space_read(first_cpu->as, addr, MEMTXATTRS_UNSPECIFIED,
+ &data, 4);
+ value = tswap32(data);
+ } else if (words[0][4] == 'q') {
+ address_space_read(first_cpu->as, addr, MEMTXATTRS_UNSPECIFIED,
+ &value, 8);
+ tswap64s(&value);
+ }
+ qtest_send_prefix(chr);
+ qtest_sendf(chr, "OK 0x%016" PRIx64 "\n", value);
+ } else if (strcmp(words[0], "read") == 0) {
+ uint64_t addr, len, i;
+ uint8_t *data;
+ char *enc;
+ int ret;
+
+ g_assert(words[1] && words[2]);
+ ret = qemu_strtou64(words[1], NULL, 0, &addr);
+ g_assert(ret == 0);
+ ret = qemu_strtou64(words[2], NULL, 0, &len);
+ g_assert(ret == 0);
+ /* We'd send garbage to libqtest if len is 0 */
+ g_assert(len);
+
+ data = g_malloc(len);
+ address_space_read(first_cpu->as, addr, MEMTXATTRS_UNSPECIFIED, data,
+ len);
+
+ enc = g_malloc(2 * len + 1);
+ for (i = 0; i < len; i++) {
+ sprintf(&enc[i * 2], "%02x", data[i]);
+ }
+
+ qtest_send_prefix(chr);
+ qtest_sendf(chr, "OK 0x%s\n", enc);
+
+ g_free(data);
+ g_free(enc);
+ } else if (strcmp(words[0], "b64read") == 0) {
+ uint64_t addr, len;
+ uint8_t *data;
+ gchar *b64_data;
+ int ret;
+
+ g_assert(words[1] && words[2]);
+ ret = qemu_strtou64(words[1], NULL, 0, &addr);
+ g_assert(ret == 0);
+ ret = qemu_strtou64(words[2], NULL, 0, &len);
+ g_assert(ret == 0);
+
+ data = g_malloc(len);
+ address_space_read(first_cpu->as, addr, MEMTXATTRS_UNSPECIFIED, data,
+ len);
+ b64_data = g_base64_encode(data, len);
+ qtest_send_prefix(chr);
+ qtest_sendf(chr, "OK %s\n", b64_data);
+
+ g_free(data);
+ g_free(b64_data);
+ } else if (strcmp(words[0], "write") == 0) {
+ uint64_t addr, len, i;
+ uint8_t *data;
+ size_t data_len;
+ int ret;
+
+ g_assert(words[1] && words[2] && words[3]);
+ ret = qemu_strtou64(words[1], NULL, 0, &addr);
+ g_assert(ret == 0);
+ ret = qemu_strtou64(words[2], NULL, 0, &len);
+ g_assert(ret == 0);
+
+ data_len = strlen(words[3]);
+ if (data_len < 3) {
+ qtest_send(chr, "ERR invalid argument size\n");
+ return;
+ }
+
+ data = g_malloc(len);
+ for (i = 0; i < len; i++) {
+ if ((i * 2 + 4) <= data_len) {
+ data[i] = hex2nib(words[3][i * 2 + 2]) << 4;
+ data[i] |= hex2nib(words[3][i * 2 + 3]);
+ } else {
+ data[i] = 0;
+ }
+ }
+ address_space_write(first_cpu->as, addr, MEMTXATTRS_UNSPECIFIED, data,
+ len);
+ g_free(data);
+
+ qtest_send_prefix(chr);
+ qtest_send(chr, "OK\n");
+ } else if (strcmp(words[0], "memset") == 0) {
+ uint64_t addr, len;
+ uint8_t *data;
+ unsigned long pattern;
+ int ret;
+
+ g_assert(words[1] && words[2] && words[3]);
+ ret = qemu_strtou64(words[1], NULL, 0, &addr);
+ g_assert(ret == 0);
+ ret = qemu_strtou64(words[2], NULL, 0, &len);
+ g_assert(ret == 0);
+ ret = qemu_strtoul(words[3], NULL, 0, &pattern);
+ g_assert(ret == 0);
+
+ if (len) {
+ data = g_malloc(len);
+ memset(data, pattern, len);
+ address_space_write(first_cpu->as, addr, MEMTXATTRS_UNSPECIFIED,
+ data, len);
+ g_free(data);
+ }
+
+ qtest_send_prefix(chr);
+ qtest_send(chr, "OK\n");
+ } else if (strcmp(words[0], "b64write") == 0) {
+ uint64_t addr, len;
+ uint8_t *data;
+ size_t data_len;
+ gsize out_len;
+ int ret;
+
+ g_assert(words[1] && words[2] && words[3]);
+ ret = qemu_strtou64(words[1], NULL, 0, &addr);
+ g_assert(ret == 0);
+ ret = qemu_strtou64(words[2], NULL, 0, &len);
+ g_assert(ret == 0);
+
+ data_len = strlen(words[3]);
+ if (data_len < 3) {
+ qtest_send(chr, "ERR invalid argument size\n");
+ return;
+ }
+
+ data = g_base64_decode_inplace(words[3], &out_len);
+ if (out_len != len) {
+ qtest_log_send("b64write: data length mismatch (told %"PRIu64", "
+ "found %zu)\n",
+ len, out_len);
+ out_len = MIN(out_len, len);
+ }
+
+ address_space_write(first_cpu->as, addr, MEMTXATTRS_UNSPECIFIED, data,
+ len);
+
+ qtest_send_prefix(chr);
+ qtest_send(chr, "OK\n");
+ } else if (strcmp(words[0], "endianness") == 0) {
+ qtest_send_prefix(chr);
+ if (target_words_bigendian()) {
+ qtest_sendf(chr, "OK big\n");
+ } else {
+ qtest_sendf(chr, "OK little\n");
+ }
+ } else if (qtest_enabled() && strcmp(words[0], "clock_step") == 0) {
+ int64_t ns;
+
+ if (words[1]) {
+ int ret = qemu_strtoi64(words[1], NULL, 0, &ns);
+ g_assert(ret == 0);
+ } else {
+ ns = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
+ QEMU_TIMER_ATTR_ALL);
+ }
+ qtest_clock_warp(qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + ns);
+ qtest_send_prefix(chr);
+ qtest_sendf(chr, "OK %"PRIi64"\n",
+ (int64_t)qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL));
+ } else if (strcmp(words[0], "module_load") == 0) {
+ Error *local_err = NULL;
+ int rv;
+ g_assert(words[1] && words[2]);
+
+ qtest_send_prefix(chr);
+ rv = module_load(words[1], words[2], &local_err);
+ if (rv > 0) {
+ qtest_sendf(chr, "OK\n");
+ } else {
+ if (rv < 0) {
+ error_report_err(local_err);
+ }
+ qtest_sendf(chr, "FAIL\n");
+ }
+ } else if (qtest_enabled() && strcmp(words[0], "clock_set") == 0) {
+ int64_t ns;
+ int ret;
+
+ g_assert(words[1]);
+ ret = qemu_strtoi64(words[1], NULL, 0, &ns);
+ g_assert(ret == 0);
+ qtest_clock_warp(ns);
+ qtest_send_prefix(chr);
+ qtest_sendf(chr, "OK %"PRIi64"\n",
+ (int64_t)qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL));
+ } else if (process_command_cb && process_command_cb(chr, words)) {
+ /* Command got consumed by the callback handler */
+ } else {
+ qtest_send_prefix(chr);
+ qtest_sendf(chr, "FAIL Unknown command '%s'\n", words[0]);
+ }
+}
+
+static void qtest_process_inbuf(CharBackend *chr, GString *inbuf)
+{
+ char *end;
+
+ while ((end = strchr(inbuf->str, '\n')) != NULL) {
+ size_t offset;
+ GString *cmd;
+ gchar **words;
+
+ offset = end - inbuf->str;
+
+ cmd = g_string_new_len(inbuf->str, offset);
+ g_string_erase(inbuf, 0, offset + 1);
+
+ words = g_strsplit(cmd->str, " ", 0);
+ qtest_process_command(chr, words);
+ g_strfreev(words);
+
+ g_string_free(cmd, TRUE);
+ }
+}
+
+static void qtest_read(void *opaque, const uint8_t *buf, int size)
+{
+ CharBackend *chr = opaque;
+
+ g_string_append_len(inbuf, (const gchar *)buf, size);
+ qtest_process_inbuf(chr, inbuf);
+}
+
+static int qtest_can_read(void *opaque)
+{
+ return 1024;
+}
+
+static void qtest_event(void *opaque, QEMUChrEvent event)
+{
+ int i;
+
+ switch (event) {
+ case CHR_EVENT_OPENED:
+ /*
+ * We used to call qemu_system_reset() here, hoping we could
+ * use the same process for multiple tests that way. Never
+ * used. Injects an extra reset even when it's not used, and
+ * that can mess up tests, e.g. -boot once.
+ */
+ for (i = 0; i < ARRAY_SIZE(irq_levels); i++) {
+ irq_levels[i] = 0;
+ }
+
+ g_clear_pointer(&timer, g_timer_destroy);
+ timer = g_timer_new();
+ qtest_opened = true;
+ if (qtest_log_fp) {
+ fprintf(qtest_log_fp, "[I " FMT_timeval "] OPENED\n", g_timer_elapsed(timer, NULL));
+ }
+ break;
+ case CHR_EVENT_CLOSED:
+ qtest_opened = false;
+ if (qtest_log_fp) {
+ fprintf(qtest_log_fp, "[I +" FMT_timeval "] CLOSED\n", g_timer_elapsed(timer, NULL));
+ }
+ g_clear_pointer(&timer, g_timer_destroy);
+ break;
+ default:
+ break;
+ }
+}
+
+void qtest_server_init(const char *qtest_chrdev, const char *qtest_log, Error **errp)
+{
+ ERRP_GUARD();
+ Chardev *chr;
+ Object *qtest;
+
+ chr = qemu_chr_new("qtest", qtest_chrdev, NULL);
+ if (chr == NULL) {
+ error_setg(errp, "Failed to initialize device for qtest: \"%s\"",
+ qtest_chrdev);
+ return;
+ }
+
+ qtest = object_new(TYPE_QTEST);
+ object_property_set_str(qtest, "chardev", chr->label, &error_abort);
+ if (qtest_log) {
+ object_property_set_str(qtest, "log", qtest_log, &error_abort);
+ }
+ object_property_add_child(qdev_get_machine(), "qtest", qtest);
+ user_creatable_complete(USER_CREATABLE(qtest), errp);
+ if (*errp) {
+ object_unparent(qtest);
+ }
+ object_unref(OBJECT(chr));
+ object_unref(qtest);
+}
+
+static bool qtest_server_start(QTest *q, Error **errp)
+{
+ Chardev *chr = q->chr;
+ const char *qtest_log = q->log;
+
+ if (qtest_log) {
+ if (strcmp(qtest_log, "none") != 0) {
+ qtest_log_fp = fopen(qtest_log, "w+");
+ }
+ } else {
+ qtest_log_fp = stderr;
+ }
+
+ if (!qemu_chr_fe_init(&q->qtest_chr, chr, errp)) {
+ return false;
+ }
+ qemu_chr_fe_set_handlers(&q->qtest_chr, qtest_can_read, qtest_read,
+ qtest_event, NULL, &q->qtest_chr, NULL, true);
+ qemu_chr_fe_set_echo(&q->qtest_chr, true);
+
+ inbuf = g_string_new("");
+
+ if (!qtest_server_send) {
+ qtest_server_set_send_handler(qtest_server_char_be_send, &q->qtest_chr);
+ }
+ qtest = q;
+ return true;
+}
+
+void qtest_server_set_send_handler(void (*send)(void*, const char*),
+ void *opaque)
+{
+ qtest_server_send = send;
+ qtest_server_send_opaque = opaque;
+}
+
+bool qtest_driver(void)
+{
+ return qtest && qtest->qtest_chr.chr != NULL;
+}
+
+void qtest_server_inproc_recv(void *dummy, const char *buf)
+{
+ static GString *gstr;
+ if (!gstr) {
+ gstr = g_string_new(NULL);
+ }
+ g_string_append(gstr, buf);
+ if (gstr->str[gstr->len - 1] == '\n') {
+ qtest_process_inbuf(NULL, gstr);
+ g_string_truncate(gstr, 0);
+ }
+}
+
+static void qtest_complete(UserCreatable *uc, Error **errp)
+{
+ QTest *q = QTEST(uc);
+ if (qtest) {
+ error_setg(errp, "Only one instance of qtest can be created");
+ return;
+ }
+ if (!q->chr_name) {
+ error_setg(errp, "No backend specified");
+ return;
+ }
+
+ if (OBJECT(uc)->parent != qdev_get_machine()) {
+ q->has_machine_link = true;
+ object_property_add_const_link(qdev_get_machine(), "qtest", OBJECT(uc));
+ } else {
+ /* -qtest was used. */
+ }
+
+ qtest_server_start(q, errp);
+}
+
+static void qtest_unparent(Object *obj)
+{
+ QTest *q = QTEST(obj);
+
+ if (qtest == q) {
+ qemu_chr_fe_disconnect(&q->qtest_chr);
+ assert(!qtest_opened);
+ qemu_chr_fe_deinit(&q->qtest_chr, false);
+ if (qtest_log_fp) {
+ fclose(qtest_log_fp);
+ qtest_log_fp = NULL;
+ }
+ qtest = NULL;
+ }
+
+ if (q->has_machine_link) {
+ object_property_del(qdev_get_machine(), "qtest");
+ q->has_machine_link = false;
+ }
+}
+
+static void qtest_set_log(Object *obj, const char *value, Error **errp)
+{
+ QTest *q = QTEST(obj);
+
+ if (qtest == q) {
+ error_setg(errp, "Property 'log' can not be set now");
+ } else {
+ g_free(q->log);
+ q->log = g_strdup(value);
+ }
+}
+
+static char *qtest_get_log(Object *obj, Error **errp)
+{
+ QTest *q = QTEST(obj);
+
+ return g_strdup(q->log);
+}
+
+static void qtest_set_chardev(Object *obj, const char *value, Error **errp)
+{
+ QTest *q = QTEST(obj);
+ Chardev *chr;
+
+ if (qtest == q) {
+ error_setg(errp, "Property 'chardev' can not be set now");
+ return;
+ }
+
+ chr = qemu_chr_find(value);
+ if (!chr) {
+ error_setg(errp, "Cannot find character device '%s'", value);
+ return;
+ }
+
+ g_free(q->chr_name);
+ q->chr_name = g_strdup(value);
+
+ if (q->chr) {
+ object_unref(q->chr);
+ }
+ q->chr = chr;
+ object_ref(chr);
+}
+
+static char *qtest_get_chardev(Object *obj, Error **errp)
+{
+ QTest *q = QTEST(obj);
+
+ return g_strdup(q->chr_name);
+}
+
+static void qtest_class_init(ObjectClass *oc, void *data)
+{
+ UserCreatableClass *ucc = USER_CREATABLE_CLASS(oc);
+
+ oc->unparent = qtest_unparent;
+ ucc->complete = qtest_complete;
+
+ object_class_property_add_str(oc, "chardev",
+ qtest_get_chardev, qtest_set_chardev);
+ object_class_property_add_str(oc, "log",
+ qtest_get_log, qtest_set_log);
+}
+
+static const TypeInfo qtest_info = {
+ .name = TYPE_QTEST,
+ .parent = TYPE_OBJECT,
+ .class_init = qtest_class_init,
+ .instance_size = sizeof(QTest),
+ .interfaces = (InterfaceInfo[]) {
+ { TYPE_USER_CREATABLE },
+ { }
+ }
+};
+
+static void register_types(void)
+{
+ type_register_static(&qtest_info);
+}
+
+type_init(register_types);
diff --git a/system/rtc.c b/system/rtc.c
new file mode 100644
index 0000000000..4904581abe
--- /dev/null
+++ b/system/rtc.c
@@ -0,0 +1,192 @@
+/*
+ * RTC configuration and clock read
+ *
+ * Copyright (c) 2003-2020 QEMU contributors
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/cutils.h"
+#include "qapi/error.h"
+#include "qapi/qmp/qerror.h"
+#include "qemu/error-report.h"
+#include "qemu/option.h"
+#include "qemu/timer.h"
+#include "qom/object.h"
+#include "sysemu/replay.h"
+#include "sysemu/sysemu.h"
+#include "sysemu/rtc.h"
+#include "hw/rtc/mc146818rtc.h"
+
+static enum {
+ RTC_BASE_UTC,
+ RTC_BASE_LOCALTIME,
+ RTC_BASE_DATETIME,
+} rtc_base_type = RTC_BASE_UTC;
+static time_t rtc_ref_start_datetime;
+static int rtc_realtime_clock_offset; /* used only with QEMU_CLOCK_REALTIME */
+static int rtc_host_datetime_offset = -1; /* valid & used only with
+ RTC_BASE_DATETIME */
+QEMUClockType rtc_clock;
+/***********************************************************/
+/* RTC reference time/date access */
+static time_t qemu_ref_timedate(QEMUClockType clock)
+{
+ time_t value = qemu_clock_get_ms(clock) / 1000;
+ switch (clock) {
+ case QEMU_CLOCK_REALTIME:
+ value -= rtc_realtime_clock_offset;
+ /* fall through */
+ case QEMU_CLOCK_VIRTUAL:
+ value += rtc_ref_start_datetime;
+ break;
+ case QEMU_CLOCK_HOST:
+ if (rtc_base_type == RTC_BASE_DATETIME) {
+ value -= rtc_host_datetime_offset;
+ }
+ break;
+ default:
+ assert(0);
+ }
+ return value;
+}
+
+void qemu_get_timedate(struct tm *tm, time_t offset)
+{
+ time_t ti = qemu_ref_timedate(rtc_clock);
+
+ ti += offset;
+
+ switch (rtc_base_type) {
+ case RTC_BASE_DATETIME:
+ case RTC_BASE_UTC:
+ gmtime_r(&ti, tm);
+ break;
+ case RTC_BASE_LOCALTIME:
+ localtime_r(&ti, tm);
+ break;
+ }
+}
+
+time_t qemu_timedate_diff(struct tm *tm)
+{
+ time_t seconds;
+
+ switch (rtc_base_type) {
+ case RTC_BASE_DATETIME:
+ case RTC_BASE_UTC:
+ seconds = mktimegm(tm);
+ break;
+ case RTC_BASE_LOCALTIME:
+ {
+ struct tm tmp = *tm;
+ tmp.tm_isdst = -1; /* use timezone to figure it out */
+ seconds = mktime(&tmp);
+ break;
+ }
+ default:
+ abort();
+ }
+
+ return seconds - qemu_ref_timedate(QEMU_CLOCK_HOST);
+}
+
+static void configure_rtc_base_datetime(const char *startdate)
+{
+ time_t rtc_start_datetime;
+ struct tm tm;
+
+ if (sscanf(startdate, "%d-%d-%dT%d:%d:%d", &tm.tm_year, &tm.tm_mon,
+ &tm.tm_mday, &tm.tm_hour, &tm.tm_min, &tm.tm_sec) == 6) {
+ /* OK */
+ } else if (sscanf(startdate, "%d-%d-%d",
+ &tm.tm_year, &tm.tm_mon, &tm.tm_mday) == 3) {
+ tm.tm_hour = 0;
+ tm.tm_min = 0;
+ tm.tm_sec = 0;
+ } else {
+ goto date_fail;
+ }
+ tm.tm_year -= 1900;
+ tm.tm_mon--;
+ rtc_start_datetime = mktimegm(&tm);
+ if (rtc_start_datetime == -1) {
+ date_fail:
+ error_report("invalid datetime format");
+ error_printf("valid formats: "
+ "'2006-06-17T16:01:21' or '2006-06-17'\n");
+ exit(1);
+ }
+ rtc_host_datetime_offset = rtc_ref_start_datetime - rtc_start_datetime;
+ rtc_ref_start_datetime = rtc_start_datetime;
+}
+
+void configure_rtc(QemuOpts *opts)
+{
+ const char *value;
+
+ /* Set defaults */
+ rtc_clock = QEMU_CLOCK_HOST;
+ rtc_ref_start_datetime = qemu_clock_get_ms(QEMU_CLOCK_HOST) / 1000;
+ rtc_realtime_clock_offset = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) / 1000;
+
+ value = qemu_opt_get(opts, "base");
+ if (value) {
+ if (!strcmp(value, "utc")) {
+ rtc_base_type = RTC_BASE_UTC;
+ } else if (!strcmp(value, "localtime")) {
+ rtc_base_type = RTC_BASE_LOCALTIME;
+ replay_add_blocker("-rtc base=localtime");
+ } else {
+ rtc_base_type = RTC_BASE_DATETIME;
+ configure_rtc_base_datetime(value);
+ }
+ }
+ value = qemu_opt_get(opts, "clock");
+ if (value) {
+ if (!strcmp(value, "host")) {
+ rtc_clock = QEMU_CLOCK_HOST;
+ } else if (!strcmp(value, "rt")) {
+ rtc_clock = QEMU_CLOCK_REALTIME;
+ } else if (!strcmp(value, "vm")) {
+ rtc_clock = QEMU_CLOCK_VIRTUAL;
+ } else {
+ error_report("invalid option value '%s'", value);
+ exit(1);
+ }
+ }
+ value = qemu_opt_get(opts, "driftfix");
+ if (value) {
+ if (!strcmp(value, "slew")) {
+ object_register_sugar_prop(TYPE_MC146818_RTC,
+ "lost_tick_policy",
+ "slew",
+ false);
+ if (!object_class_by_name(TYPE_MC146818_RTC)) {
+ warn_report("driftfix 'slew' is not available with this machine");
+ }
+ } else if (!strcmp(value, "none")) {
+ /* discard is default */
+ } else {
+ error_report("invalid option value '%s'", value);
+ exit(1);
+ }
+ }
+}
diff --git a/system/runstate-action.c b/system/runstate-action.c
new file mode 100644
index 0000000000..ae0761a9c3
--- /dev/null
+++ b/system/runstate-action.c
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2020 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "sysemu/runstate-action.h"
+#include "sysemu/watchdog.h"
+#include "qemu/config-file.h"
+#include "qapi/error.h"
+#include "qemu/option_int.h"
+
+RebootAction reboot_action = REBOOT_ACTION_RESET;
+ShutdownAction shutdown_action = SHUTDOWN_ACTION_POWEROFF;
+PanicAction panic_action = PANIC_ACTION_SHUTDOWN;
+
+/*
+ * Receives actions to be applied for specific guest events
+ * and sets the internal state as requested.
+ */
+void qmp_set_action(bool has_reboot, RebootAction reboot,
+ bool has_shutdown, ShutdownAction shutdown,
+ bool has_panic, PanicAction panic,
+ bool has_watchdog, WatchdogAction watchdog,
+ Error **errp)
+{
+ if (has_reboot) {
+ reboot_action = reboot;
+ }
+
+ if (has_panic) {
+ panic_action = panic;
+ }
+
+ if (has_watchdog) {
+ qmp_watchdog_set_action(watchdog, errp);
+ }
+
+ /* Process shutdown last, in case the panic action needs to be altered */
+ if (has_shutdown) {
+ shutdown_action = shutdown;
+ }
+}
diff --git a/system/runstate-hmp-cmds.c b/system/runstate-hmp-cmds.c
new file mode 100644
index 0000000000..2df670f0c0
--- /dev/null
+++ b/system/runstate-hmp-cmds.c
@@ -0,0 +1,95 @@
+/*
+ * HMP commands related to run state
+ *
+ * Copyright IBM, Corp. 2011
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+
+#include "qemu/osdep.h"
+#include "exec/cpu-common.h"
+#include "monitor/hmp.h"
+#include "monitor/monitor.h"
+#include "qapi/error.h"
+#include "qapi/qapi-commands-run-state.h"
+#include "qapi/qmp/qdict.h"
+#include "qemu/accel.h"
+
+void hmp_info_status(Monitor *mon, const QDict *qdict)
+{
+ StatusInfo *info;
+
+ info = qmp_query_status(NULL);
+
+ monitor_printf(mon, "VM status: %s",
+ info->running ? "running" : "paused");
+
+ if (!info->running && info->status != RUN_STATE_PAUSED) {
+ monitor_printf(mon, " (%s)", RunState_str(info->status));
+ }
+
+ monitor_printf(mon, "\n");
+
+ qapi_free_StatusInfo(info);
+}
+
+void hmp_one_insn_per_tb(Monitor *mon, const QDict *qdict)
+{
+ const char *option = qdict_get_try_str(qdict, "option");
+ AccelState *accel = current_accel();
+ bool newval;
+
+ if (!object_property_find(OBJECT(accel), "one-insn-per-tb")) {
+ monitor_printf(mon,
+ "This accelerator does not support setting one-insn-per-tb\n");
+ return;
+ }
+
+ if (!option || !strcmp(option, "on")) {
+ newval = true;
+ } else if (!strcmp(option, "off")) {
+ newval = false;
+ } else {
+ monitor_printf(mon, "unexpected option %s\n", option);
+ return;
+ }
+ /* If the property exists then setting it can never fail */
+ object_property_set_bool(OBJECT(accel), "one-insn-per-tb",
+ newval, &error_abort);
+}
+
+void hmp_watchdog_action(Monitor *mon, const QDict *qdict)
+{
+ Error *err = NULL;
+ WatchdogAction action;
+ char *qapi_value;
+
+ qapi_value = g_ascii_strdown(qdict_get_str(qdict, "action"), -1);
+ action = qapi_enum_parse(&WatchdogAction_lookup, qapi_value, -1, &err);
+ g_free(qapi_value);
+ if (err) {
+ hmp_handle_error(mon, err);
+ return;
+ }
+ qmp_watchdog_set_action(action, &error_abort);
+}
+
+void watchdog_action_completion(ReadLineState *rs, int nb_args, const char *str)
+{
+ int i;
+
+ if (nb_args != 2) {
+ return;
+ }
+ readline_set_completion_index(rs, strlen(str));
+ for (i = 0; i < WATCHDOG_ACTION__MAX; i++) {
+ readline_add_completion_of(rs, str, WatchdogAction_str(i));
+ }
+}
diff --git a/system/runstate.c b/system/runstate.c
new file mode 100644
index 0000000000..1652ed0439
--- /dev/null
+++ b/system/runstate.c
@@ -0,0 +1,871 @@
+/*
+ * QEMU main system emulation loop
+ *
+ * Copyright (c) 2003-2020 QEMU contributors
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "audio/audio.h"
+#include "block/block.h"
+#include "block/export.h"
+#include "chardev/char.h"
+#include "crypto/cipher.h"
+#include "crypto/init.h"
+#include "exec/cpu-common.h"
+#include "gdbstub/syscalls.h"
+#include "hw/boards.h"
+#include "migration/misc.h"
+#include "migration/postcopy-ram.h"
+#include "monitor/monitor.h"
+#include "net/net.h"
+#include "net/vhost_net.h"
+#include "qapi/error.h"
+#include "qapi/qapi-commands-run-state.h"
+#include "qapi/qapi-events-run-state.h"
+#include "qemu/accel.h"
+#include "qemu/error-report.h"
+#include "qemu/job.h"
+#include "qemu/log.h"
+#include "qemu/module.h"
+#include "qemu/plugin.h"
+#include "qemu/sockets.h"
+#include "qemu/timer.h"
+#include "qemu/thread.h"
+#include "qom/object.h"
+#include "qom/object_interfaces.h"
+#include "sysemu/cpus.h"
+#include "sysemu/qtest.h"
+#include "sysemu/replay.h"
+#include "sysemu/reset.h"
+#include "sysemu/runstate.h"
+#include "sysemu/runstate-action.h"
+#include "sysemu/sysemu.h"
+#include "sysemu/tpm.h"
+#include "trace.h"
+
+static NotifierList exit_notifiers =
+ NOTIFIER_LIST_INITIALIZER(exit_notifiers);
+
+static RunState current_run_state = RUN_STATE_PRELAUNCH;
+
+/* We use RUN_STATE__MAX but any invalid value will do */
+static RunState vmstop_requested = RUN_STATE__MAX;
+static QemuMutex vmstop_lock;
+
+typedef struct {
+ RunState from;
+ RunState to;
+} RunStateTransition;
+
+static const RunStateTransition runstate_transitions_def[] = {
+ { RUN_STATE_PRELAUNCH, RUN_STATE_INMIGRATE },
+
+ { RUN_STATE_DEBUG, RUN_STATE_RUNNING },
+ { RUN_STATE_DEBUG, RUN_STATE_FINISH_MIGRATE },
+ { RUN_STATE_DEBUG, RUN_STATE_PRELAUNCH },
+
+ { RUN_STATE_INMIGRATE, RUN_STATE_INTERNAL_ERROR },
+ { RUN_STATE_INMIGRATE, RUN_STATE_IO_ERROR },
+ { RUN_STATE_INMIGRATE, RUN_STATE_PAUSED },
+ { RUN_STATE_INMIGRATE, RUN_STATE_RUNNING },
+ { RUN_STATE_INMIGRATE, RUN_STATE_SHUTDOWN },
+ { RUN_STATE_INMIGRATE, RUN_STATE_SUSPENDED },
+ { RUN_STATE_INMIGRATE, RUN_STATE_WATCHDOG },
+ { RUN_STATE_INMIGRATE, RUN_STATE_GUEST_PANICKED },
+ { RUN_STATE_INMIGRATE, RUN_STATE_FINISH_MIGRATE },
+ { RUN_STATE_INMIGRATE, RUN_STATE_PRELAUNCH },
+ { RUN_STATE_INMIGRATE, RUN_STATE_POSTMIGRATE },
+ { RUN_STATE_INMIGRATE, RUN_STATE_COLO },
+
+ { RUN_STATE_INTERNAL_ERROR, RUN_STATE_PAUSED },
+ { RUN_STATE_INTERNAL_ERROR, RUN_STATE_FINISH_MIGRATE },
+ { RUN_STATE_INTERNAL_ERROR, RUN_STATE_PRELAUNCH },
+
+ { RUN_STATE_IO_ERROR, RUN_STATE_RUNNING },
+ { RUN_STATE_IO_ERROR, RUN_STATE_FINISH_MIGRATE },
+ { RUN_STATE_IO_ERROR, RUN_STATE_PRELAUNCH },
+
+ { RUN_STATE_PAUSED, RUN_STATE_RUNNING },
+ { RUN_STATE_PAUSED, RUN_STATE_FINISH_MIGRATE },
+ { RUN_STATE_PAUSED, RUN_STATE_POSTMIGRATE },
+ { RUN_STATE_PAUSED, RUN_STATE_PRELAUNCH },
+ { RUN_STATE_PAUSED, RUN_STATE_COLO},
+
+ { RUN_STATE_POSTMIGRATE, RUN_STATE_RUNNING },
+ { RUN_STATE_POSTMIGRATE, RUN_STATE_FINISH_MIGRATE },
+ { RUN_STATE_POSTMIGRATE, RUN_STATE_PRELAUNCH },
+
+ { RUN_STATE_PRELAUNCH, RUN_STATE_RUNNING },
+ { RUN_STATE_PRELAUNCH, RUN_STATE_FINISH_MIGRATE },
+ { RUN_STATE_PRELAUNCH, RUN_STATE_INMIGRATE },
+
+ { RUN_STATE_FINISH_MIGRATE, RUN_STATE_RUNNING },
+ { RUN_STATE_FINISH_MIGRATE, RUN_STATE_PAUSED },
+ { RUN_STATE_FINISH_MIGRATE, RUN_STATE_POSTMIGRATE },
+ { RUN_STATE_FINISH_MIGRATE, RUN_STATE_PRELAUNCH },
+ { RUN_STATE_FINISH_MIGRATE, RUN_STATE_COLO },
+ { RUN_STATE_FINISH_MIGRATE, RUN_STATE_INTERNAL_ERROR },
+ { RUN_STATE_FINISH_MIGRATE, RUN_STATE_IO_ERROR },
+ { RUN_STATE_FINISH_MIGRATE, RUN_STATE_SHUTDOWN },
+ { RUN_STATE_FINISH_MIGRATE, RUN_STATE_SUSPENDED },
+ { RUN_STATE_FINISH_MIGRATE, RUN_STATE_WATCHDOG },
+ { RUN_STATE_FINISH_MIGRATE, RUN_STATE_GUEST_PANICKED },
+
+ { RUN_STATE_RESTORE_VM, RUN_STATE_RUNNING },
+ { RUN_STATE_RESTORE_VM, RUN_STATE_PRELAUNCH },
+
+ { RUN_STATE_COLO, RUN_STATE_RUNNING },
+ { RUN_STATE_COLO, RUN_STATE_PRELAUNCH },
+ { RUN_STATE_COLO, RUN_STATE_SHUTDOWN},
+
+ { RUN_STATE_RUNNING, RUN_STATE_DEBUG },
+ { RUN_STATE_RUNNING, RUN_STATE_INTERNAL_ERROR },
+ { RUN_STATE_RUNNING, RUN_STATE_IO_ERROR },
+ { RUN_STATE_RUNNING, RUN_STATE_PAUSED },
+ { RUN_STATE_RUNNING, RUN_STATE_FINISH_MIGRATE },
+ { RUN_STATE_RUNNING, RUN_STATE_RESTORE_VM },
+ { RUN_STATE_RUNNING, RUN_STATE_SAVE_VM },
+ { RUN_STATE_RUNNING, RUN_STATE_SHUTDOWN },
+ { RUN_STATE_RUNNING, RUN_STATE_WATCHDOG },
+ { RUN_STATE_RUNNING, RUN_STATE_GUEST_PANICKED },
+ { RUN_STATE_RUNNING, RUN_STATE_COLO},
+
+ { RUN_STATE_SAVE_VM, RUN_STATE_RUNNING },
+
+ { RUN_STATE_SHUTDOWN, RUN_STATE_PAUSED },
+ { RUN_STATE_SHUTDOWN, RUN_STATE_FINISH_MIGRATE },
+ { RUN_STATE_SHUTDOWN, RUN_STATE_PRELAUNCH },
+ { RUN_STATE_SHUTDOWN, RUN_STATE_COLO },
+
+ { RUN_STATE_DEBUG, RUN_STATE_SUSPENDED },
+ { RUN_STATE_RUNNING, RUN_STATE_SUSPENDED },
+ { RUN_STATE_SUSPENDED, RUN_STATE_RUNNING },
+ { RUN_STATE_SUSPENDED, RUN_STATE_FINISH_MIGRATE },
+ { RUN_STATE_SUSPENDED, RUN_STATE_PRELAUNCH },
+ { RUN_STATE_SUSPENDED, RUN_STATE_COLO},
+
+ { RUN_STATE_WATCHDOG, RUN_STATE_RUNNING },
+ { RUN_STATE_WATCHDOG, RUN_STATE_FINISH_MIGRATE },
+ { RUN_STATE_WATCHDOG, RUN_STATE_PRELAUNCH },
+ { RUN_STATE_WATCHDOG, RUN_STATE_COLO},
+
+ { RUN_STATE_GUEST_PANICKED, RUN_STATE_RUNNING },
+ { RUN_STATE_GUEST_PANICKED, RUN_STATE_FINISH_MIGRATE },
+ { RUN_STATE_GUEST_PANICKED, RUN_STATE_PRELAUNCH },
+
+ { RUN_STATE__MAX, RUN_STATE__MAX },
+};
+
+static bool runstate_valid_transitions[RUN_STATE__MAX][RUN_STATE__MAX];
+
+bool runstate_check(RunState state)
+{
+ return current_run_state == state;
+}
+
+static void runstate_init(void)
+{
+ const RunStateTransition *p;
+
+ memset(&runstate_valid_transitions, 0, sizeof(runstate_valid_transitions));
+ for (p = &runstate_transitions_def[0]; p->from != RUN_STATE__MAX; p++) {
+ runstate_valid_transitions[p->from][p->to] = true;
+ }
+
+ qemu_mutex_init(&vmstop_lock);
+}
+
+/* This function will abort() on invalid state transitions */
+void runstate_set(RunState new_state)
+{
+ assert(new_state < RUN_STATE__MAX);
+
+ trace_runstate_set(current_run_state, RunState_str(current_run_state),
+ new_state, RunState_str(new_state));
+
+ if (current_run_state == new_state) {
+ return;
+ }
+
+ if (!runstate_valid_transitions[current_run_state][new_state]) {
+ error_report("invalid runstate transition: '%s' -> '%s'",
+ RunState_str(current_run_state),
+ RunState_str(new_state));
+ abort();
+ }
+
+ current_run_state = new_state;
+}
+
+RunState runstate_get(void)
+{
+ return current_run_state;
+}
+
+bool runstate_is_running(void)
+{
+ return runstate_check(RUN_STATE_RUNNING);
+}
+
+bool runstate_needs_reset(void)
+{
+ return runstate_check(RUN_STATE_INTERNAL_ERROR) ||
+ runstate_check(RUN_STATE_SHUTDOWN);
+}
+
+StatusInfo *qmp_query_status(Error **errp)
+{
+ StatusInfo *info = g_malloc0(sizeof(*info));
+ AccelState *accel = current_accel();
+
+ /*
+ * We ignore errors, which will happen if the accelerator
+ * is not TCG. "singlestep" is meaningless for other accelerators,
+ * so we will set the StatusInfo field to false for those.
+ */
+ info->singlestep = object_property_get_bool(OBJECT(accel),
+ "one-insn-per-tb", NULL);
+ info->running = runstate_is_running();
+ info->status = current_run_state;
+
+ return info;
+}
+
+bool qemu_vmstop_requested(RunState *r)
+{
+ qemu_mutex_lock(&vmstop_lock);
+ *r = vmstop_requested;
+ vmstop_requested = RUN_STATE__MAX;
+ qemu_mutex_unlock(&vmstop_lock);
+ return *r < RUN_STATE__MAX;
+}
+
+void qemu_system_vmstop_request_prepare(void)
+{
+ qemu_mutex_lock(&vmstop_lock);
+}
+
+void qemu_system_vmstop_request(RunState state)
+{
+ vmstop_requested = state;
+ qemu_mutex_unlock(&vmstop_lock);
+ qemu_notify_event();
+}
+struct VMChangeStateEntry {
+ VMChangeStateHandler *cb;
+ VMChangeStateHandler *prepare_cb;
+ void *opaque;
+ QTAILQ_ENTRY(VMChangeStateEntry) entries;
+ int priority;
+};
+
+static QTAILQ_HEAD(, VMChangeStateEntry) vm_change_state_head =
+ QTAILQ_HEAD_INITIALIZER(vm_change_state_head);
+
+/**
+ * qemu_add_vm_change_state_handler_prio:
+ * @cb: the callback to invoke
+ * @opaque: user data passed to the callback
+ * @priority: low priorities execute first when the vm runs and the reverse is
+ * true when the vm stops
+ *
+ * Register a callback function that is invoked when the vm starts or stops
+ * running.
+ *
+ * Returns: an entry to be freed using qemu_del_vm_change_state_handler()
+ */
+VMChangeStateEntry *qemu_add_vm_change_state_handler_prio(
+ VMChangeStateHandler *cb, void *opaque, int priority)
+{
+ return qemu_add_vm_change_state_handler_prio_full(cb, NULL, opaque,
+ priority);
+}
+
+/**
+ * qemu_add_vm_change_state_handler_prio_full:
+ * @cb: the main callback to invoke
+ * @prepare_cb: a callback to invoke before the main callback
+ * @opaque: user data passed to the callbacks
+ * @priority: low priorities execute first when the vm runs and the reverse is
+ * true when the vm stops
+ *
+ * Register a main callback function and an optional prepare callback function
+ * that are invoked when the vm starts or stops running. The main callback and
+ * the prepare callback are called in two separate phases: First all prepare
+ * callbacks are called and only then all main callbacks are called. As its
+ * name suggests, the prepare callback can be used to do some preparatory work
+ * before invoking the main callback.
+ *
+ * Returns: an entry to be freed using qemu_del_vm_change_state_handler()
+ */
+VMChangeStateEntry *
+qemu_add_vm_change_state_handler_prio_full(VMChangeStateHandler *cb,
+ VMChangeStateHandler *prepare_cb,
+ void *opaque, int priority)
+{
+ VMChangeStateEntry *e;
+ VMChangeStateEntry *other;
+
+ e = g_malloc0(sizeof(*e));
+ e->cb = cb;
+ e->prepare_cb = prepare_cb;
+ e->opaque = opaque;
+ e->priority = priority;
+
+ /* Keep list sorted in ascending priority order */
+ QTAILQ_FOREACH(other, &vm_change_state_head, entries) {
+ if (priority < other->priority) {
+ QTAILQ_INSERT_BEFORE(other, e, entries);
+ return e;
+ }
+ }
+
+ QTAILQ_INSERT_TAIL(&vm_change_state_head, e, entries);
+ return e;
+}
+
+VMChangeStateEntry *qemu_add_vm_change_state_handler(VMChangeStateHandler *cb,
+ void *opaque)
+{
+ return qemu_add_vm_change_state_handler_prio(cb, opaque, 0);
+}
+
+void qemu_del_vm_change_state_handler(VMChangeStateEntry *e)
+{
+ QTAILQ_REMOVE(&vm_change_state_head, e, entries);
+ g_free(e);
+}
+
+void vm_state_notify(bool running, RunState state)
+{
+ VMChangeStateEntry *e, *next;
+
+ trace_vm_state_notify(running, state, RunState_str(state));
+
+ if (running) {
+ QTAILQ_FOREACH_SAFE(e, &vm_change_state_head, entries, next) {
+ if (e->prepare_cb) {
+ e->prepare_cb(e->opaque, running, state);
+ }
+ }
+
+ QTAILQ_FOREACH_SAFE(e, &vm_change_state_head, entries, next) {
+ e->cb(e->opaque, running, state);
+ }
+ } else {
+ QTAILQ_FOREACH_REVERSE_SAFE(e, &vm_change_state_head, entries, next) {
+ if (e->prepare_cb) {
+ e->prepare_cb(e->opaque, running, state);
+ }
+ }
+
+ QTAILQ_FOREACH_REVERSE_SAFE(e, &vm_change_state_head, entries, next) {
+ e->cb(e->opaque, running, state);
+ }
+ }
+}
+
+static ShutdownCause reset_requested;
+static ShutdownCause shutdown_requested;
+static int shutdown_signal;
+static pid_t shutdown_pid;
+static int powerdown_requested;
+static int debug_requested;
+static int suspend_requested;
+static WakeupReason wakeup_reason;
+static NotifierList powerdown_notifiers =
+ NOTIFIER_LIST_INITIALIZER(powerdown_notifiers);
+static NotifierList suspend_notifiers =
+ NOTIFIER_LIST_INITIALIZER(suspend_notifiers);
+static NotifierList wakeup_notifiers =
+ NOTIFIER_LIST_INITIALIZER(wakeup_notifiers);
+static NotifierList shutdown_notifiers =
+ NOTIFIER_LIST_INITIALIZER(shutdown_notifiers);
+static uint32_t wakeup_reason_mask = ~(1 << QEMU_WAKEUP_REASON_NONE);
+
+ShutdownCause qemu_shutdown_requested_get(void)
+{
+ return shutdown_requested;
+}
+
+ShutdownCause qemu_reset_requested_get(void)
+{
+ return reset_requested;
+}
+
+static int qemu_shutdown_requested(void)
+{
+ return qatomic_xchg(&shutdown_requested, SHUTDOWN_CAUSE_NONE);
+}
+
+static void qemu_kill_report(void)
+{
+ if (!qtest_driver() && shutdown_signal) {
+ if (shutdown_pid == 0) {
+ /* This happens for eg ^C at the terminal, so it's worth
+ * avoiding printing an odd message in that case.
+ */
+ error_report("terminating on signal %d", shutdown_signal);
+ } else {
+ char *shutdown_cmd = qemu_get_pid_name(shutdown_pid);
+
+ error_report("terminating on signal %d from pid " FMT_pid " (%s)",
+ shutdown_signal, shutdown_pid,
+ shutdown_cmd ? shutdown_cmd : "<unknown process>");
+ g_free(shutdown_cmd);
+ }
+ shutdown_signal = 0;
+ }
+}
+
+static ShutdownCause qemu_reset_requested(void)
+{
+ ShutdownCause r = reset_requested;
+
+ if (r && replay_checkpoint(CHECKPOINT_RESET_REQUESTED)) {
+ reset_requested = SHUTDOWN_CAUSE_NONE;
+ return r;
+ }
+ return SHUTDOWN_CAUSE_NONE;
+}
+
+static int qemu_suspend_requested(void)
+{
+ int r = suspend_requested;
+ if (r && replay_checkpoint(CHECKPOINT_SUSPEND_REQUESTED)) {
+ suspend_requested = 0;
+ return r;
+ }
+ return false;
+}
+
+static WakeupReason qemu_wakeup_requested(void)
+{
+ return wakeup_reason;
+}
+
+static int qemu_powerdown_requested(void)
+{
+ int r = powerdown_requested;
+ powerdown_requested = 0;
+ return r;
+}
+
+static int qemu_debug_requested(void)
+{
+ int r = debug_requested;
+ debug_requested = 0;
+ return r;
+}
+
+/*
+ * Reset the VM. Issue an event unless @reason is SHUTDOWN_CAUSE_NONE.
+ */
+void qemu_system_reset(ShutdownCause reason)
+{
+ MachineClass *mc;
+
+ mc = current_machine ? MACHINE_GET_CLASS(current_machine) : NULL;
+
+ cpu_synchronize_all_states();
+
+ if (mc && mc->reset) {
+ mc->reset(current_machine, reason);
+ } else {
+ qemu_devices_reset(reason);
+ }
+ switch (reason) {
+ case SHUTDOWN_CAUSE_NONE:
+ case SHUTDOWN_CAUSE_SUBSYSTEM_RESET:
+ case SHUTDOWN_CAUSE_SNAPSHOT_LOAD:
+ break;
+ default:
+ qapi_event_send_reset(shutdown_caused_by_guest(reason), reason);
+ }
+ cpu_synchronize_all_post_reset();
+}
+
+/*
+ * Wake the VM after suspend.
+ */
+static void qemu_system_wakeup(void)
+{
+ MachineClass *mc;
+
+ mc = current_machine ? MACHINE_GET_CLASS(current_machine) : NULL;
+
+ if (mc && mc->wakeup) {
+ mc->wakeup(current_machine);
+ }
+}
+
+void qemu_system_guest_panicked(GuestPanicInformation *info)
+{
+ qemu_log_mask(LOG_GUEST_ERROR, "Guest crashed");
+
+ if (current_cpu) {
+ current_cpu->crash_occurred = true;
+ }
+ /*
+ * TODO: Currently the available panic actions are: none, pause, and
+ * shutdown, but in principle debug and reset could be supported as well.
+ * Investigate any potential use cases for the unimplemented actions.
+ */
+ if (panic_action == PANIC_ACTION_PAUSE
+ || (panic_action == PANIC_ACTION_SHUTDOWN && shutdown_action == SHUTDOWN_ACTION_PAUSE)) {
+ qapi_event_send_guest_panicked(GUEST_PANIC_ACTION_PAUSE, info);
+ vm_stop(RUN_STATE_GUEST_PANICKED);
+ } else if (panic_action == PANIC_ACTION_SHUTDOWN ||
+ panic_action == PANIC_ACTION_EXIT_FAILURE) {
+ qapi_event_send_guest_panicked(GUEST_PANIC_ACTION_POWEROFF, info);
+ vm_stop(RUN_STATE_GUEST_PANICKED);
+ qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_PANIC);
+ } else {
+ qapi_event_send_guest_panicked(GUEST_PANIC_ACTION_RUN, info);
+ }
+
+ if (info) {
+ if (info->type == GUEST_PANIC_INFORMATION_TYPE_HYPER_V) {
+ qemu_log_mask(LOG_GUEST_ERROR, "\nHV crash parameters: (%#"PRIx64
+ " %#"PRIx64" %#"PRIx64" %#"PRIx64" %#"PRIx64")\n",
+ info->u.hyper_v.arg1,
+ info->u.hyper_v.arg2,
+ info->u.hyper_v.arg3,
+ info->u.hyper_v.arg4,
+ info->u.hyper_v.arg5);
+ } else if (info->type == GUEST_PANIC_INFORMATION_TYPE_S390) {
+ qemu_log_mask(LOG_GUEST_ERROR, " on cpu %d: %s\n"
+ "PSW: 0x%016" PRIx64 " 0x%016" PRIx64"\n",
+ info->u.s390.core,
+ S390CrashReason_str(info->u.s390.reason),
+ info->u.s390.psw_mask,
+ info->u.s390.psw_addr);
+ }
+ qapi_free_GuestPanicInformation(info);
+ }
+}
+
+void qemu_system_guest_crashloaded(GuestPanicInformation *info)
+{
+ qemu_log_mask(LOG_GUEST_ERROR, "Guest crash loaded");
+ qapi_event_send_guest_crashloaded(GUEST_PANIC_ACTION_RUN, info);
+ qapi_free_GuestPanicInformation(info);
+}
+
+void qemu_system_reset_request(ShutdownCause reason)
+{
+ if (reboot_action == REBOOT_ACTION_SHUTDOWN &&
+ reason != SHUTDOWN_CAUSE_SUBSYSTEM_RESET) {
+ shutdown_requested = reason;
+ } else if (!cpus_are_resettable()) {
+ error_report("cpus are not resettable, terminating");
+ shutdown_requested = reason;
+ } else {
+ reset_requested = reason;
+ }
+ cpu_stop_current();
+ qemu_notify_event();
+}
+
+static void qemu_system_suspend(void)
+{
+ pause_all_vcpus();
+ notifier_list_notify(&suspend_notifiers, NULL);
+ runstate_set(RUN_STATE_SUSPENDED);
+ qapi_event_send_suspend();
+}
+
+void qemu_system_suspend_request(void)
+{
+ if (runstate_check(RUN_STATE_SUSPENDED)) {
+ return;
+ }
+ suspend_requested = 1;
+ cpu_stop_current();
+ qemu_notify_event();
+}
+
+void qemu_register_suspend_notifier(Notifier *notifier)
+{
+ notifier_list_add(&suspend_notifiers, notifier);
+}
+
+void qemu_system_wakeup_request(WakeupReason reason, Error **errp)
+{
+ trace_system_wakeup_request(reason);
+
+ if (!runstate_check(RUN_STATE_SUSPENDED)) {
+ error_setg(errp,
+ "Unable to wake up: guest is not in suspended state");
+ return;
+ }
+ if (!(wakeup_reason_mask & (1 << reason))) {
+ return;
+ }
+ runstate_set(RUN_STATE_RUNNING);
+ wakeup_reason = reason;
+ qemu_notify_event();
+}
+
+void qemu_system_wakeup_enable(WakeupReason reason, bool enabled)
+{
+ if (enabled) {
+ wakeup_reason_mask |= (1 << reason);
+ } else {
+ wakeup_reason_mask &= ~(1 << reason);
+ }
+}
+
+void qemu_register_wakeup_notifier(Notifier *notifier)
+{
+ notifier_list_add(&wakeup_notifiers, notifier);
+}
+
+static bool wakeup_suspend_enabled;
+
+void qemu_register_wakeup_support(void)
+{
+ wakeup_suspend_enabled = true;
+}
+
+bool qemu_wakeup_suspend_enabled(void)
+{
+ return wakeup_suspend_enabled;
+}
+
+void qemu_system_killed(int signal, pid_t pid)
+{
+ shutdown_signal = signal;
+ shutdown_pid = pid;
+ shutdown_action = SHUTDOWN_ACTION_POWEROFF;
+
+ /* Cannot call qemu_system_shutdown_request directly because
+ * we are in a signal handler.
+ */
+ shutdown_requested = SHUTDOWN_CAUSE_HOST_SIGNAL;
+ qemu_notify_event();
+}
+
+void qemu_system_shutdown_request(ShutdownCause reason)
+{
+ trace_qemu_system_shutdown_request(reason);
+ replay_shutdown_request(reason);
+ shutdown_requested = reason;
+ qemu_notify_event();
+}
+
+static void qemu_system_powerdown(void)
+{
+ qapi_event_send_powerdown();
+ notifier_list_notify(&powerdown_notifiers, NULL);
+}
+
+static void qemu_system_shutdown(ShutdownCause cause)
+{
+ qapi_event_send_shutdown(shutdown_caused_by_guest(cause), cause);
+ notifier_list_notify(&shutdown_notifiers, &cause);
+}
+
+void qemu_system_powerdown_request(void)
+{
+ trace_qemu_system_powerdown_request();
+ powerdown_requested = 1;
+ qemu_notify_event();
+}
+
+void qemu_register_powerdown_notifier(Notifier *notifier)
+{
+ notifier_list_add(&powerdown_notifiers, notifier);
+}
+
+void qemu_register_shutdown_notifier(Notifier *notifier)
+{
+ notifier_list_add(&shutdown_notifiers, notifier);
+}
+
+void qemu_system_debug_request(void)
+{
+ debug_requested = 1;
+ qemu_notify_event();
+}
+
+static bool main_loop_should_exit(int *status)
+{
+ RunState r;
+ ShutdownCause request;
+
+ if (qemu_debug_requested()) {
+ vm_stop(RUN_STATE_DEBUG);
+ }
+ if (qemu_suspend_requested()) {
+ qemu_system_suspend();
+ }
+ request = qemu_shutdown_requested();
+ if (request) {
+ qemu_kill_report();
+ qemu_system_shutdown(request);
+ if (shutdown_action == SHUTDOWN_ACTION_PAUSE) {
+ vm_stop(RUN_STATE_SHUTDOWN);
+ } else {
+ if (request == SHUTDOWN_CAUSE_GUEST_PANIC &&
+ panic_action == PANIC_ACTION_EXIT_FAILURE) {
+ *status = EXIT_FAILURE;
+ }
+ return true;
+ }
+ }
+ request = qemu_reset_requested();
+ if (request) {
+ pause_all_vcpus();
+ qemu_system_reset(request);
+ resume_all_vcpus();
+ /*
+ * runstate can change in pause_all_vcpus()
+ * as iothread mutex is unlocked
+ */
+ if (!runstate_check(RUN_STATE_RUNNING) &&
+ !runstate_check(RUN_STATE_INMIGRATE) &&
+ !runstate_check(RUN_STATE_FINISH_MIGRATE)) {
+ runstate_set(RUN_STATE_PRELAUNCH);
+ }
+ }
+ if (qemu_wakeup_requested()) {
+ pause_all_vcpus();
+ qemu_system_wakeup();
+ notifier_list_notify(&wakeup_notifiers, &wakeup_reason);
+ wakeup_reason = QEMU_WAKEUP_REASON_NONE;
+ resume_all_vcpus();
+ qapi_event_send_wakeup();
+ }
+ if (qemu_powerdown_requested()) {
+ qemu_system_powerdown();
+ }
+ if (qemu_vmstop_requested(&r)) {
+ vm_stop(r);
+ }
+ return false;
+}
+
+int qemu_main_loop(void)
+{
+ int status = EXIT_SUCCESS;
+
+ while (!main_loop_should_exit(&status)) {
+ main_loop_wait(false);
+ }
+
+ return status;
+}
+
+void qemu_add_exit_notifier(Notifier *notify)
+{
+ notifier_list_add(&exit_notifiers, notify);
+}
+
+void qemu_remove_exit_notifier(Notifier *notify)
+{
+ notifier_remove(notify);
+}
+
+static void qemu_run_exit_notifiers(void)
+{
+ notifier_list_notify(&exit_notifiers, NULL);
+}
+
+void qemu_init_subsystems(void)
+{
+ Error *err = NULL;
+
+ os_set_line_buffering();
+
+ module_call_init(MODULE_INIT_TRACE);
+
+ qemu_init_cpu_list();
+ qemu_init_cpu_loop();
+ qemu_mutex_lock_iothread();
+
+ atexit(qemu_run_exit_notifiers);
+
+ module_call_init(MODULE_INIT_QOM);
+ module_call_init(MODULE_INIT_MIGRATION);
+
+ runstate_init();
+ precopy_infrastructure_init();
+ postcopy_infrastructure_init();
+ monitor_init_globals();
+
+ if (qcrypto_init(&err) < 0) {
+ error_reportf_err(err, "cannot initialize crypto: ");
+ exit(1);
+ }
+
+ os_setup_early_signal_handling();
+
+ bdrv_init_with_whitelist();
+ socket_init();
+}
+
+
+void qemu_cleanup(void)
+{
+ gdb_exit(0);
+
+ /*
+ * cleaning up the migration object cancels any existing migration
+ * try to do this early so that it also stops using devices.
+ */
+ migration_shutdown();
+
+ /*
+ * Close the exports before draining the block layer. The export
+ * drivers may have coroutines yielding on it, so we need to clean
+ * them up before the drain, as otherwise they may be get stuck in
+ * blk_wait_while_drained().
+ */
+ blk_exp_close_all();
+
+
+ /* No more vcpu or device emulation activity beyond this point */
+ vm_shutdown();
+ replay_finish();
+
+ /*
+ * We must cancel all block jobs while the block layer is drained,
+ * or cancelling will be affected by throttling and thus may block
+ * for an extended period of time.
+ * Begin the drained section after vm_shutdown() to avoid requests being
+ * stuck in the BlockBackend's request queue.
+ * We do not need to end this section, because we do not want any
+ * requests happening from here on anyway.
+ */
+ bdrv_drain_all_begin();
+ job_cancel_sync_all();
+ bdrv_close_all();
+
+ /* vhost-user must be cleaned up before chardevs. */
+ tpm_cleanup();
+ net_cleanup();
+ audio_cleanup();
+ monitor_cleanup();
+ qemu_chr_cleanup();
+ user_creatable_cleanup();
+ /* TODO: unref root container, check all devices are ok */
+}
diff --git a/system/tpm-hmp-cmds.c b/system/tpm-hmp-cmds.c
new file mode 100644
index 0000000000..9ed6ad6c4d
--- /dev/null
+++ b/system/tpm-hmp-cmds.c
@@ -0,0 +1,65 @@
+/*
+ * HMP commands related to TPM
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * (at your option) any later version.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/qapi-commands-tpm.h"
+#include "monitor/monitor.h"
+#include "monitor/hmp.h"
+#include "qapi/error.h"
+
+void hmp_info_tpm(Monitor *mon, const QDict *qdict)
+{
+#ifdef CONFIG_TPM
+ TPMInfoList *info_list, *info;
+ Error *err = NULL;
+ unsigned int c = 0;
+ TPMPassthroughOptions *tpo;
+ TPMEmulatorOptions *teo;
+
+ info_list = qmp_query_tpm(&err);
+ if (err) {
+ monitor_printf(mon, "TPM device not supported\n");
+ error_free(err);
+ return;
+ }
+
+ if (info_list) {
+ monitor_printf(mon, "TPM device:\n");
+ }
+
+ for (info = info_list; info; info = info->next) {
+ TPMInfo *ti = info->value;
+ monitor_printf(mon, " tpm%d: model=%s\n",
+ c, TpmModel_str(ti->model));
+
+ monitor_printf(mon, " \\ %s: type=%s",
+ ti->id, TpmType_str(ti->options->type));
+
+ switch (ti->options->type) {
+ case TPM_TYPE_PASSTHROUGH:
+ tpo = ti->options->u.passthrough.data;
+ monitor_printf(mon, "%s%s%s%s",
+ tpo->path ? ",path=" : "",
+ tpo->path ?: "",
+ tpo->cancel_path ? ",cancel-path=" : "",
+ tpo->cancel_path ?: "");
+ break;
+ case TPM_TYPE_EMULATOR:
+ teo = ti->options->u.emulator.data;
+ monitor_printf(mon, ",chardev=%s", teo->chardev);
+ break;
+ case TPM_TYPE__MAX:
+ break;
+ }
+ monitor_printf(mon, "\n");
+ c++;
+ }
+ qapi_free_TPMInfoList(info_list);
+#else
+ monitor_printf(mon, "TPM device not supported\n");
+#endif /* CONFIG_TPM */
+}
diff --git a/system/tpm.c b/system/tpm.c
new file mode 100644
index 0000000000..578563f05a
--- /dev/null
+++ b/system/tpm.c
@@ -0,0 +1,239 @@
+/*
+ * TPM configuration
+ *
+ * Copyright (C) 2011-2013 IBM Corporation
+ *
+ * Authors:
+ * Stefan Berger <stefanb@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ * Based on net.c
+ */
+
+#include "qemu/osdep.h"
+
+#include "qapi/error.h"
+#include "qapi/qapi-commands-tpm.h"
+#include "qapi/qmp/qerror.h"
+#include "sysemu/tpm_backend.h"
+#include "sysemu/tpm.h"
+#include "qemu/config-file.h"
+#include "qemu/error-report.h"
+
+static QLIST_HEAD(, TPMBackend) tpm_backends =
+ QLIST_HEAD_INITIALIZER(tpm_backends);
+
+static const TPMBackendClass *
+tpm_be_find_by_type(enum TpmType type)
+{
+ ObjectClass *oc;
+ char *typename = g_strdup_printf("tpm-%s", TpmType_str(type));
+
+ oc = object_class_by_name(typename);
+ g_free(typename);
+
+ if (!object_class_dynamic_cast(oc, TYPE_TPM_BACKEND)) {
+ return NULL;
+ }
+
+ return TPM_BACKEND_CLASS(oc);
+}
+
+/*
+ * Walk the list of available TPM backend drivers and display them on the
+ * screen.
+ */
+static void tpm_display_backend_drivers(void)
+{
+ bool got_one = false;
+ int i;
+
+ for (i = 0; i < TPM_TYPE__MAX; i++) {
+ const TPMBackendClass *bc = tpm_be_find_by_type(i);
+ if (!bc) {
+ continue;
+ }
+ if (!got_one) {
+ error_printf("Supported TPM types (choose only one):\n");
+ got_one = true;
+ }
+ error_printf("%12s %s\n", TpmType_str(i), bc->desc);
+ }
+ if (!got_one) {
+ error_printf("No TPM backend types are available\n");
+ }
+}
+
+/*
+ * Find the TPM with the given Id
+ */
+TPMBackend *qemu_find_tpm_be(const char *id)
+{
+ TPMBackend *drv;
+
+ if (id) {
+ QLIST_FOREACH(drv, &tpm_backends, list) {
+ if (!strcmp(drv->id, id)) {
+ return drv;
+ }
+ }
+ }
+
+ return NULL;
+}
+
+static int tpm_init_tpmdev(void *dummy, QemuOpts *opts, Error **errp)
+{
+ /*
+ * Use of error_report() in a function with an Error ** parameter
+ * is suspicious. It is okay here. The parameter only exists to
+ * make the function usable with qemu_opts_foreach(). It is not
+ * actually used.
+ */
+ const char *value;
+ const char *id;
+ const TPMBackendClass *be;
+ TPMBackend *drv;
+ Error *local_err = NULL;
+ int i;
+
+ if (!QLIST_EMPTY(&tpm_backends)) {
+ error_report("Only one TPM is allowed.");
+ return 1;
+ }
+
+ id = qemu_opts_id(opts);
+ if (id == NULL) {
+ error_report(QERR_MISSING_PARAMETER, "id");
+ return 1;
+ }
+
+ value = qemu_opt_get(opts, "type");
+ if (!value) {
+ error_report(QERR_MISSING_PARAMETER, "type");
+ tpm_display_backend_drivers();
+ return 1;
+ }
+
+ i = qapi_enum_parse(&TpmType_lookup, value, -1, NULL);
+ be = i >= 0 ? tpm_be_find_by_type(i) : NULL;
+ if (be == NULL) {
+ error_report(QERR_INVALID_PARAMETER_VALUE,
+ "type", "a TPM backend type");
+ tpm_display_backend_drivers();
+ return 1;
+ }
+
+ /* validate backend specific opts */
+ if (!qemu_opts_validate(opts, be->opts, &local_err)) {
+ error_report_err(local_err);
+ return 1;
+ }
+
+ drv = be->create(opts);
+ if (!drv) {
+ return 1;
+ }
+
+ drv->id = g_strdup(id);
+ QLIST_INSERT_HEAD(&tpm_backends, drv, list);
+
+ return 0;
+}
+
+/*
+ * Walk the list of TPM backend drivers that are in use and call their
+ * destroy function to have them cleaned up.
+ */
+void tpm_cleanup(void)
+{
+ TPMBackend *drv, *next;
+
+ QLIST_FOREACH_SAFE(drv, &tpm_backends, list, next) {
+ QLIST_REMOVE(drv, list);
+ object_unref(OBJECT(drv));
+ }
+}
+
+/*
+ * Initialize the TPM. Process the tpmdev command line options describing the
+ * TPM backend.
+ */
+int tpm_init(void)
+{
+ if (qemu_opts_foreach(qemu_find_opts("tpmdev"),
+ tpm_init_tpmdev, NULL, NULL)) {
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Parse the TPM configuration options.
+ * To display all available TPM backends the user may use '-tpmdev help'
+ */
+int tpm_config_parse(QemuOptsList *opts_list, const char *optarg)
+{
+ QemuOpts *opts;
+
+ if (!strcmp(optarg, "help")) {
+ tpm_display_backend_drivers();
+ return -1;
+ }
+ opts = qemu_opts_parse_noisily(opts_list, optarg, true);
+ if (!opts) {
+ return -1;
+ }
+ return 0;
+}
+
+/*
+ * Walk the list of active TPM backends and collect information about them.
+ */
+TPMInfoList *qmp_query_tpm(Error **errp)
+{
+ TPMBackend *drv;
+ TPMInfoList *head = NULL, **tail = &head;
+
+ QLIST_FOREACH(drv, &tpm_backends, list) {
+ if (!drv->tpmif) {
+ continue;
+ }
+
+ QAPI_LIST_APPEND(tail, tpm_backend_query_tpm(drv));
+ }
+
+ return head;
+}
+
+TpmTypeList *qmp_query_tpm_types(Error **errp)
+{
+ unsigned int i = 0;
+ TpmTypeList *head = NULL, **tail = &head;
+
+ for (i = 0; i < TPM_TYPE__MAX; i++) {
+ if (!tpm_be_find_by_type(i)) {
+ continue;
+ }
+ QAPI_LIST_APPEND(tail, i);
+ }
+
+ return head;
+}
+TpmModelList *qmp_query_tpm_models(Error **errp)
+{
+ TpmModelList *head = NULL, **tail = &head;
+ GSList *e, *l = object_class_get_list(TYPE_TPM_IF, false);
+
+ for (e = l; e; e = e->next) {
+ TPMIfClass *c = TPM_IF_CLASS(e->data);
+
+ QAPI_LIST_APPEND(tail, c->model);
+ }
+ g_slist_free(l);
+
+ return head;
+}
diff --git a/system/trace-events b/system/trace-events
new file mode 100644
index 0000000000..69c9044151
--- /dev/null
+++ b/system/trace-events
@@ -0,0 +1,40 @@
+# See docs/devel/tracing.rst for syntax documentation.
+
+# balloon.c
+# Since requests are raised via monitor, not many tracepoints are needed.
+balloon_event(void *opaque, unsigned long addr) "opaque %p addr %lu"
+
+# ioport.c
+cpu_in(unsigned int addr, char size, unsigned int val) "addr 0x%x(%c) value %u"
+cpu_out(unsigned int addr, char size, unsigned int val) "addr 0x%x(%c) value %u"
+
+# memory.c
+memory_region_ops_read(int cpu_index, void *mr, uint64_t addr, uint64_t value, unsigned size, const char *name) "cpu %d mr %p addr 0x%"PRIx64" value 0x%"PRIx64" size %u name '%s'"
+memory_region_ops_write(int cpu_index, void *mr, uint64_t addr, uint64_t value, unsigned size, const char *name) "cpu %d mr %p addr 0x%"PRIx64" value 0x%"PRIx64" size %u name '%s'"
+memory_region_subpage_read(int cpu_index, void *mr, uint64_t offset, uint64_t value, unsigned size) "cpu %d mr %p offset 0x%"PRIx64" value 0x%"PRIx64" size %u"
+memory_region_subpage_write(int cpu_index, void *mr, uint64_t offset, uint64_t value, unsigned size) "cpu %d mr %p offset 0x%"PRIx64" value 0x%"PRIx64" size %u"
+memory_region_ram_device_read(int cpu_index, void *mr, uint64_t addr, uint64_t value, unsigned size) "cpu %d mr %p addr 0x%"PRIx64" value 0x%"PRIx64" size %u"
+memory_region_ram_device_write(int cpu_index, void *mr, uint64_t addr, uint64_t value, unsigned size) "cpu %d mr %p addr 0x%"PRIx64" value 0x%"PRIx64" size %u"
+memory_region_sync_dirty(const char *mr, const char *listener, int global) "mr '%s' listener '%s' synced (global=%d)"
+flatview_new(void *view, void *root) "%p (root %p)"
+flatview_destroy(void *view, void *root) "%p (root %p)"
+flatview_destroy_rcu(void *view, void *root) "%p (root %p)"
+global_dirty_changed(unsigned int bitmask) "bitmask 0x%"PRIx32
+
+# cpus.c
+vm_stop_flush_all(int ret) "ret %d"
+
+# vl.c
+vm_state_notify(int running, int reason, const char *reason_str) "running %d reason %d (%s)"
+load_file(const char *name, const char *path) "name %s location %s"
+runstate_set(int current_state, const char *current_state_str, int new_state, const char *new_state_str) "current_run_state %d (%s) new_state %d (%s)"
+system_wakeup_request(int reason) "reason=%d"
+qemu_system_shutdown_request(int reason) "reason=%d"
+qemu_system_powerdown_request(void) ""
+
+#dirtylimit.c
+dirtylimit_state_initialize(int max_cpus) "dirtylimit state initialize: max cpus %d"
+dirtylimit_state_finalize(void)
+dirtylimit_throttle_pct(int cpu_index, uint64_t pct, int64_t time_us) "CPU[%d] throttle percent: %" PRIu64 ", throttle adjust time %"PRIi64 " us"
+dirtylimit_set_vcpu(int cpu_index, uint64_t quota) "CPU[%d] set dirty page rate limit %"PRIu64
+dirtylimit_vcpu_execute(int cpu_index, int64_t sleep_time_us) "CPU[%d] sleep %"PRIi64 " us"
diff --git a/system/trace.h b/system/trace.h
new file mode 100644
index 0000000000..cd0136dcdc
--- /dev/null
+++ b/system/trace.h
@@ -0,0 +1 @@
+#include "trace/trace-system.h"
diff --git a/system/vl.c b/system/vl.c
new file mode 100644
index 0000000000..98e071e63b
--- /dev/null
+++ b/system/vl.c
@@ -0,0 +1,3730 @@
+/*
+ * QEMU System Emulator
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/help-texts.h"
+#include "qemu/datadir.h"
+#include "qemu/units.h"
+#include "exec/cpu-common.h"
+#include "exec/page-vary.h"
+#include "hw/qdev-properties.h"
+#include "qapi/compat-policy.h"
+#include "qapi/error.h"
+#include "qapi/qmp/qdict.h"
+#include "qapi/qmp/qstring.h"
+#include "qapi/qmp/qjson.h"
+#include "qemu-version.h"
+#include "qemu/cutils.h"
+#include "qemu/help_option.h"
+#include "qemu/hw-version.h"
+#include "qemu/uuid.h"
+#include "sysemu/reset.h"
+#include "sysemu/runstate.h"
+#include "sysemu/runstate-action.h"
+#include "sysemu/seccomp.h"
+#include "sysemu/tcg.h"
+#include "sysemu/xen.h"
+
+#include "qemu/error-report.h"
+#include "qemu/sockets.h"
+#include "qemu/accel.h"
+#include "qemu/async-teardown.h"
+#include "hw/usb.h"
+#include "hw/isa/isa.h"
+#include "hw/scsi/scsi.h"
+#include "hw/display/vga.h"
+#include "hw/firmware/smbios.h"
+#include "hw/acpi/acpi.h"
+#include "hw/xen/xen.h"
+#include "hw/loader.h"
+#include "monitor/qdev.h"
+#include "net/net.h"
+#include "net/slirp.h"
+#include "monitor/monitor.h"
+#include "ui/console.h"
+#include "ui/input.h"
+#include "sysemu/sysemu.h"
+#include "sysemu/numa.h"
+#include "sysemu/hostmem.h"
+#include "exec/gdbstub.h"
+#include "qemu/timer.h"
+#include "chardev/char.h"
+#include "qemu/bitmap.h"
+#include "qemu/log.h"
+#include "sysemu/blockdev.h"
+#include "hw/block/block.h"
+#include "hw/i386/x86.h"
+#include "hw/i386/pc.h"
+#include "migration/misc.h"
+#include "migration/snapshot.h"
+#include "sysemu/tpm.h"
+#include "sysemu/dma.h"
+#include "hw/audio/soundhw.h"
+#include "audio/audio.h"
+#include "sysemu/cpus.h"
+#include "sysemu/cpu-timers.h"
+#include "migration/colo.h"
+#include "migration/postcopy-ram.h"
+#include "sysemu/kvm.h"
+#include "qapi/qobject-input-visitor.h"
+#include "qemu/option.h"
+#include "qemu/config-file.h"
+#include "qemu/main-loop.h"
+#ifdef CONFIG_VIRTFS
+#include "fsdev/qemu-fsdev.h"
+#endif
+#include "sysemu/qtest.h"
+#ifdef CONFIG_TCG
+#include "accel/tcg/perf.h"
+#endif
+
+#include "disas/disas.h"
+
+#include "trace.h"
+#include "trace/control.h"
+#include "qemu/plugin.h"
+#include "qemu/queue.h"
+#include "sysemu/arch_init.h"
+#include "exec/confidential-guest-support.h"
+
+#include "ui/qemu-spice.h"
+#include "qapi/string-input-visitor.h"
+#include "qapi/opts-visitor.h"
+#include "qapi/clone-visitor.h"
+#include "qom/object_interfaces.h"
+#include "semihosting/semihost.h"
+#include "crypto/init.h"
+#include "sysemu/replay.h"
+#include "qapi/qapi-events-run-state.h"
+#include "qapi/qapi-types-audio.h"
+#include "qapi/qapi-visit-audio.h"
+#include "qapi/qapi-visit-block-core.h"
+#include "qapi/qapi-visit-compat.h"
+#include "qapi/qapi-visit-machine.h"
+#include "qapi/qapi-visit-ui.h"
+#include "qapi/qapi-commands-block-core.h"
+#include "qapi/qapi-commands-migration.h"
+#include "qapi/qapi-commands-misc.h"
+#include "qapi/qapi-visit-qom.h"
+#include "qapi/qapi-commands-ui.h"
+#include "block/qdict.h"
+#include "qapi/qmp/qerror.h"
+#include "sysemu/iothread.h"
+#include "qemu/guest-random.h"
+#include "qemu/keyval.h"
+
+#define MAX_VIRTIO_CONSOLES 1
+
+typedef struct BlockdevOptionsQueueEntry {
+ BlockdevOptions *bdo;
+ Location loc;
+ QSIMPLEQ_ENTRY(BlockdevOptionsQueueEntry) entry;
+} BlockdevOptionsQueueEntry;
+
+typedef QSIMPLEQ_HEAD(, BlockdevOptionsQueueEntry) BlockdevOptionsQueue;
+
+typedef struct ObjectOption {
+ ObjectOptions *opts;
+ QTAILQ_ENTRY(ObjectOption) next;
+} ObjectOption;
+
+typedef struct DeviceOption {
+ QDict *opts;
+ Location loc;
+ QTAILQ_ENTRY(DeviceOption) next;
+} DeviceOption;
+
+static const char *cpu_option;
+static const char *mem_path;
+static const char *incoming;
+static const char *loadvm;
+static const char *accelerators;
+static bool have_custom_ram_size;
+static const char *ram_memdev_id;
+static QDict *machine_opts_dict;
+static QTAILQ_HEAD(, ObjectOption) object_opts = QTAILQ_HEAD_INITIALIZER(object_opts);
+static QTAILQ_HEAD(, DeviceOption) device_opts = QTAILQ_HEAD_INITIALIZER(device_opts);
+static int display_remote;
+static int snapshot;
+static bool preconfig_requested;
+static QemuPluginList plugin_list = QTAILQ_HEAD_INITIALIZER(plugin_list);
+static BlockdevOptionsQueue bdo_queue = QSIMPLEQ_HEAD_INITIALIZER(bdo_queue);
+static bool nographic = false;
+static int mem_prealloc; /* force preallocation of physical target memory */
+static const char *vga_model = NULL;
+static DisplayOptions dpy;
+static int num_serial_hds;
+static Chardev **serial_hds;
+static const char *log_mask;
+static const char *log_file;
+static bool list_data_dirs;
+static const char *qtest_chrdev;
+static const char *qtest_log;
+static bool opt_one_insn_per_tb;
+
+static int has_defaults = 1;
+static int default_serial = 1;
+static int default_parallel = 1;
+static int default_monitor = 1;
+static int default_floppy = 1;
+static int default_cdrom = 1;
+static int default_sdcard = 1;
+static int default_vga = 1;
+static int default_net = 1;
+
+static struct {
+ const char *driver;
+ int *flag;
+} default_list[] = {
+ { .driver = "isa-serial", .flag = &default_serial },
+ { .driver = "isa-parallel", .flag = &default_parallel },
+ { .driver = "isa-fdc", .flag = &default_floppy },
+ { .driver = "floppy", .flag = &default_floppy },
+ { .driver = "ide-cd", .flag = &default_cdrom },
+ { .driver = "ide-hd", .flag = &default_cdrom },
+ { .driver = "scsi-cd", .flag = &default_cdrom },
+ { .driver = "scsi-hd", .flag = &default_cdrom },
+ { .driver = "VGA", .flag = &default_vga },
+ { .driver = "isa-vga", .flag = &default_vga },
+ { .driver = "cirrus-vga", .flag = &default_vga },
+ { .driver = "isa-cirrus-vga", .flag = &default_vga },
+ { .driver = "vmware-svga", .flag = &default_vga },
+ { .driver = "qxl-vga", .flag = &default_vga },
+ { .driver = "virtio-vga", .flag = &default_vga },
+ { .driver = "ati-vga", .flag = &default_vga },
+ { .driver = "vhost-user-vga", .flag = &default_vga },
+ { .driver = "virtio-vga-gl", .flag = &default_vga },
+};
+
+static QemuOptsList qemu_rtc_opts = {
+ .name = "rtc",
+ .head = QTAILQ_HEAD_INITIALIZER(qemu_rtc_opts.head),
+ .merge_lists = true,
+ .desc = {
+ {
+ .name = "base",
+ .type = QEMU_OPT_STRING,
+ },{
+ .name = "clock",
+ .type = QEMU_OPT_STRING,
+ },{
+ .name = "driftfix",
+ .type = QEMU_OPT_STRING,
+ },
+ { /* end of list */ }
+ },
+};
+
+static QemuOptsList qemu_option_rom_opts = {
+ .name = "option-rom",
+ .implied_opt_name = "romfile",
+ .head = QTAILQ_HEAD_INITIALIZER(qemu_option_rom_opts.head),
+ .desc = {
+ {
+ .name = "bootindex",
+ .type = QEMU_OPT_NUMBER,
+ }, {
+ .name = "romfile",
+ .type = QEMU_OPT_STRING,
+ },
+ { /* end of list */ }
+ },
+};
+
+static QemuOptsList qemu_accel_opts = {
+ .name = "accel",
+ .implied_opt_name = "accel",
+ .head = QTAILQ_HEAD_INITIALIZER(qemu_accel_opts.head),
+ .desc = {
+ /*
+ * no elements => accept any
+ * sanity checking will happen later
+ * when setting accelerator properties
+ */
+ { }
+ },
+};
+
+static QemuOptsList qemu_boot_opts = {
+ .name = "boot-opts",
+ .implied_opt_name = "order",
+ .merge_lists = true,
+ .head = QTAILQ_HEAD_INITIALIZER(qemu_boot_opts.head),
+ .desc = {
+ {
+ .name = "order",
+ .type = QEMU_OPT_STRING,
+ }, {
+ .name = "once",
+ .type = QEMU_OPT_STRING,
+ }, {
+ .name = "menu",
+ .type = QEMU_OPT_BOOL,
+ }, {
+ .name = "splash",
+ .type = QEMU_OPT_STRING,
+ }, {
+ .name = "splash-time",
+ .type = QEMU_OPT_NUMBER,
+ }, {
+ .name = "reboot-timeout",
+ .type = QEMU_OPT_NUMBER,
+ }, {
+ .name = "strict",
+ .type = QEMU_OPT_BOOL,
+ },
+ { /*End of list */ }
+ },
+};
+
+static QemuOptsList qemu_add_fd_opts = {
+ .name = "add-fd",
+ .head = QTAILQ_HEAD_INITIALIZER(qemu_add_fd_opts.head),
+ .desc = {
+ {
+ .name = "fd",
+ .type = QEMU_OPT_NUMBER,
+ .help = "file descriptor of which a duplicate is added to fd set",
+ },{
+ .name = "set",
+ .type = QEMU_OPT_NUMBER,
+ .help = "ID of the fd set to add fd to",
+ },{
+ .name = "opaque",
+ .type = QEMU_OPT_STRING,
+ .help = "free-form string used to describe fd",
+ },
+ { /* end of list */ }
+ },
+};
+
+static QemuOptsList qemu_object_opts = {
+ .name = "object",
+ .implied_opt_name = "qom-type",
+ .head = QTAILQ_HEAD_INITIALIZER(qemu_object_opts.head),
+ .desc = {
+ { }
+ },
+};
+
+static QemuOptsList qemu_tpmdev_opts = {
+ .name = "tpmdev",
+ .implied_opt_name = "type",
+ .head = QTAILQ_HEAD_INITIALIZER(qemu_tpmdev_opts.head),
+ .desc = {
+ /* options are defined in the TPM backends */
+ { /* end of list */ }
+ },
+};
+
+static QemuOptsList qemu_overcommit_opts = {
+ .name = "overcommit",
+ .head = QTAILQ_HEAD_INITIALIZER(qemu_overcommit_opts.head),
+ .desc = {
+ {
+ .name = "mem-lock",
+ .type = QEMU_OPT_BOOL,
+ },
+ {
+ .name = "cpu-pm",
+ .type = QEMU_OPT_BOOL,
+ },
+ { /* end of list */ }
+ },
+};
+
+static QemuOptsList qemu_msg_opts = {
+ .name = "msg",
+ .head = QTAILQ_HEAD_INITIALIZER(qemu_msg_opts.head),
+ .desc = {
+ {
+ .name = "timestamp",
+ .type = QEMU_OPT_BOOL,
+ },
+ {
+ .name = "guest-name",
+ .type = QEMU_OPT_BOOL,
+ .help = "Prepends guest name for error messages but only if "
+ "-name guest is set otherwise option is ignored\n",
+ },
+ { /* end of list */ }
+ },
+};
+
+static QemuOptsList qemu_name_opts = {
+ .name = "name",
+ .implied_opt_name = "guest",
+ .merge_lists = true,
+ .head = QTAILQ_HEAD_INITIALIZER(qemu_name_opts.head),
+ .desc = {
+ {
+ .name = "guest",
+ .type = QEMU_OPT_STRING,
+ .help = "Sets the name of the guest.\n"
+ "This name will be displayed in the SDL window caption.\n"
+ "The name will also be used for the VNC server",
+ }, {
+ .name = "process",
+ .type = QEMU_OPT_STRING,
+ .help = "Sets the name of the QEMU process, as shown in top etc",
+ }, {
+ .name = "debug-threads",
+ .type = QEMU_OPT_BOOL,
+ .help = "When enabled, name the individual threads; defaults off.\n"
+ "NOTE: The thread names are for debugging and not a\n"
+ "stable API.",
+ },
+ { /* End of list */ }
+ },
+};
+
+static QemuOptsList qemu_mem_opts = {
+ .name = "memory",
+ .implied_opt_name = "size",
+ .head = QTAILQ_HEAD_INITIALIZER(qemu_mem_opts.head),
+ .merge_lists = true,
+ .desc = {
+ {
+ .name = "size",
+ .type = QEMU_OPT_SIZE,
+ },
+ {
+ .name = "slots",
+ .type = QEMU_OPT_NUMBER,
+ },
+ {
+ .name = "maxmem",
+ .type = QEMU_OPT_SIZE,
+ },
+ { /* end of list */ }
+ },
+};
+
+static QemuOptsList qemu_icount_opts = {
+ .name = "icount",
+ .implied_opt_name = "shift",
+ .merge_lists = true,
+ .head = QTAILQ_HEAD_INITIALIZER(qemu_icount_opts.head),
+ .desc = {
+ {
+ .name = "shift",
+ .type = QEMU_OPT_STRING,
+ }, {
+ .name = "align",
+ .type = QEMU_OPT_BOOL,
+ }, {
+ .name = "sleep",
+ .type = QEMU_OPT_BOOL,
+ }, {
+ .name = "rr",
+ .type = QEMU_OPT_STRING,
+ }, {
+ .name = "rrfile",
+ .type = QEMU_OPT_STRING,
+ }, {
+ .name = "rrsnapshot",
+ .type = QEMU_OPT_STRING,
+ },
+ { /* end of list */ }
+ },
+};
+
+static QemuOptsList qemu_fw_cfg_opts = {
+ .name = "fw_cfg",
+ .implied_opt_name = "name",
+ .head = QTAILQ_HEAD_INITIALIZER(qemu_fw_cfg_opts.head),
+ .desc = {
+ {
+ .name = "name",
+ .type = QEMU_OPT_STRING,
+ .help = "Sets the fw_cfg name of the blob to be inserted",
+ }, {
+ .name = "file",
+ .type = QEMU_OPT_STRING,
+ .help = "Sets the name of the file from which "
+ "the fw_cfg blob will be loaded",
+ }, {
+ .name = "string",
+ .type = QEMU_OPT_STRING,
+ .help = "Sets content of the blob to be inserted from a string",
+ }, {
+ .name = "gen_id",
+ .type = QEMU_OPT_STRING,
+ .help = "Sets id of the object generating the fw_cfg blob "
+ "to be inserted",
+ },
+ { /* end of list */ }
+ },
+};
+
+static QemuOptsList qemu_action_opts = {
+ .name = "action",
+ .merge_lists = true,
+ .head = QTAILQ_HEAD_INITIALIZER(qemu_action_opts.head),
+ .desc = {
+ {
+ .name = "shutdown",
+ .type = QEMU_OPT_STRING,
+ },{
+ .name = "reboot",
+ .type = QEMU_OPT_STRING,
+ },{
+ .name = "panic",
+ .type = QEMU_OPT_STRING,
+ },{
+ .name = "watchdog",
+ .type = QEMU_OPT_STRING,
+ },
+ { /* end of list */ }
+ },
+};
+
+const char *qemu_get_vm_name(void)
+{
+ return qemu_name;
+}
+
+static void default_driver_disable(const char *driver)
+{
+ int i;
+
+ if (!driver) {
+ return;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(default_list); i++) {
+ if (strcmp(default_list[i].driver, driver) != 0)
+ continue;
+ *(default_list[i].flag) = 0;
+ }
+}
+
+static int default_driver_check(void *opaque, QemuOpts *opts, Error **errp)
+{
+ const char *driver = qemu_opt_get(opts, "driver");
+
+ default_driver_disable(driver);
+ return 0;
+}
+
+static void default_driver_check_json(void)
+{
+ DeviceOption *opt;
+
+ QTAILQ_FOREACH(opt, &device_opts, next) {
+ const char *driver = qdict_get_try_str(opt->opts, "driver");
+ default_driver_disable(driver);
+ }
+}
+
+static int parse_name(void *opaque, QemuOpts *opts, Error **errp)
+{
+ const char *proc_name;
+
+ if (qemu_opt_get(opts, "debug-threads")) {
+ qemu_thread_naming(qemu_opt_get_bool(opts, "debug-threads", false));
+ }
+ qemu_name = qemu_opt_get(opts, "guest");
+
+ proc_name = qemu_opt_get(opts, "process");
+ if (proc_name) {
+ os_set_proc_name(proc_name);
+ }
+
+ return 0;
+}
+
+bool defaults_enabled(void)
+{
+ return has_defaults;
+}
+
+#ifndef _WIN32
+static int parse_add_fd(void *opaque, QemuOpts *opts, Error **errp)
+{
+ int fd, dupfd, flags;
+ int64_t fdset_id;
+ const char *fd_opaque = NULL;
+ AddfdInfo *fdinfo;
+
+ fd = qemu_opt_get_number(opts, "fd", -1);
+ fdset_id = qemu_opt_get_number(opts, "set", -1);
+ fd_opaque = qemu_opt_get(opts, "opaque");
+
+ if (fd < 0) {
+ error_setg(errp, "fd option is required and must be non-negative");
+ return -1;
+ }
+
+ if (fd <= STDERR_FILENO) {
+ error_setg(errp, "fd cannot be a standard I/O stream");
+ return -1;
+ }
+
+ /*
+ * All fds inherited across exec() necessarily have FD_CLOEXEC
+ * clear, while qemu sets FD_CLOEXEC on all other fds used internally.
+ */
+ flags = fcntl(fd, F_GETFD);
+ if (flags == -1 || (flags & FD_CLOEXEC)) {
+ error_setg(errp, "fd is not valid or already in use");
+ return -1;
+ }
+
+ if (fdset_id < 0) {
+ error_setg(errp, "set option is required and must be non-negative");
+ return -1;
+ }
+
+#ifdef F_DUPFD_CLOEXEC
+ dupfd = fcntl(fd, F_DUPFD_CLOEXEC, 0);
+#else
+ dupfd = dup(fd);
+ if (dupfd != -1) {
+ qemu_set_cloexec(dupfd);
+ }
+#endif
+ if (dupfd == -1) {
+ error_setg(errp, "error duplicating fd: %s", strerror(errno));
+ return -1;
+ }
+
+ /* add the duplicate fd, and optionally the opaque string, to the fd set */
+ fdinfo = monitor_fdset_add_fd(dupfd, true, fdset_id, fd_opaque,
+ &error_abort);
+ g_free(fdinfo);
+
+ return 0;
+}
+
+static int cleanup_add_fd(void *opaque, QemuOpts *opts, Error **errp)
+{
+ int fd;
+
+ fd = qemu_opt_get_number(opts, "fd", -1);
+ close(fd);
+
+ return 0;
+}
+#endif
+
+/***********************************************************/
+/* QEMU Block devices */
+
+#define HD_OPTS "media=disk"
+#define CDROM_OPTS "media=cdrom"
+#define FD_OPTS ""
+#define PFLASH_OPTS ""
+#define MTD_OPTS ""
+#define SD_OPTS ""
+
+static int drive_init_func(void *opaque, QemuOpts *opts, Error **errp)
+{
+ BlockInterfaceType *block_default_type = opaque;
+
+ return drive_new(opts, *block_default_type, errp) == NULL;
+}
+
+static int drive_enable_snapshot(void *opaque, QemuOpts *opts, Error **errp)
+{
+ if (qemu_opt_get(opts, "snapshot") == NULL) {
+ qemu_opt_set(opts, "snapshot", "on", &error_abort);
+ }
+ return 0;
+}
+
+static void default_drive(int enable, int snapshot, BlockInterfaceType type,
+ int index, const char *optstr)
+{
+ QemuOpts *opts;
+ DriveInfo *dinfo;
+
+ if (!enable || drive_get_by_index(type, index)) {
+ return;
+ }
+
+ opts = drive_add(type, index, NULL, optstr);
+ if (snapshot) {
+ drive_enable_snapshot(NULL, opts, NULL);
+ }
+
+ dinfo = drive_new(opts, type, &error_abort);
+ dinfo->is_default = true;
+
+}
+
+static void configure_blockdev(BlockdevOptionsQueue *bdo_queue,
+ MachineClass *machine_class, int snapshot)
+{
+ /*
+ * If the currently selected machine wishes to override the
+ * units-per-bus property of its default HBA interface type, do so
+ * now.
+ */
+ if (machine_class->units_per_default_bus) {
+ override_max_devs(machine_class->block_default_type,
+ machine_class->units_per_default_bus);
+ }
+
+ /* open the virtual block devices */
+ while (!QSIMPLEQ_EMPTY(bdo_queue)) {
+ BlockdevOptionsQueueEntry *bdo = QSIMPLEQ_FIRST(bdo_queue);
+
+ QSIMPLEQ_REMOVE_HEAD(bdo_queue, entry);
+ loc_push_restore(&bdo->loc);
+ qmp_blockdev_add(bdo->bdo, &error_fatal);
+ loc_pop(&bdo->loc);
+ qapi_free_BlockdevOptions(bdo->bdo);
+ g_free(bdo);
+ }
+ if (snapshot) {
+ qemu_opts_foreach(qemu_find_opts("drive"), drive_enable_snapshot,
+ NULL, NULL);
+ }
+ if (qemu_opts_foreach(qemu_find_opts("drive"), drive_init_func,
+ &machine_class->block_default_type, &error_fatal)) {
+ /* We printed help */
+ exit(0);
+ }
+
+ default_drive(default_cdrom, snapshot, machine_class->block_default_type, 2,
+ CDROM_OPTS);
+ default_drive(default_floppy, snapshot, IF_FLOPPY, 0, FD_OPTS);
+ default_drive(default_sdcard, snapshot, IF_SD, 0, SD_OPTS);
+
+}
+
+static QemuOptsList qemu_smp_opts = {
+ .name = "smp-opts",
+ .implied_opt_name = "cpus",
+ .merge_lists = true,
+ .head = QTAILQ_HEAD_INITIALIZER(qemu_smp_opts.head),
+ .desc = {
+ {
+ .name = "cpus",
+ .type = QEMU_OPT_NUMBER,
+ }, {
+ .name = "sockets",
+ .type = QEMU_OPT_NUMBER,
+ }, {
+ .name = "dies",
+ .type = QEMU_OPT_NUMBER,
+ }, {
+ .name = "clusters",
+ .type = QEMU_OPT_NUMBER,
+ }, {
+ .name = "cores",
+ .type = QEMU_OPT_NUMBER,
+ }, {
+ .name = "threads",
+ .type = QEMU_OPT_NUMBER,
+ }, {
+ .name = "maxcpus",
+ .type = QEMU_OPT_NUMBER,
+ },
+ { /*End of list */ }
+ },
+};
+
+#if defined(CONFIG_POSIX)
+static QemuOptsList qemu_run_with_opts = {
+ .name = "run-with",
+ .head = QTAILQ_HEAD_INITIALIZER(qemu_run_with_opts.head),
+ .desc = {
+#if defined(CONFIG_LINUX)
+ {
+ .name = "async-teardown",
+ .type = QEMU_OPT_BOOL,
+ },
+#endif
+ {
+ .name = "chroot",
+ .type = QEMU_OPT_STRING,
+ },
+ { /* end of list */ }
+ },
+};
+
+#define qemu_add_run_with_opts() qemu_add_opts(&qemu_run_with_opts)
+
+#else
+
+#define qemu_add_run_with_opts()
+
+#endif /* CONFIG_POSIX */
+
+static void realtime_init(void)
+{
+ if (enable_mlock) {
+ if (os_mlock() < 0) {
+ error_report("locking memory failed");
+ exit(1);
+ }
+ }
+}
+
+
+static void configure_msg(QemuOpts *opts)
+{
+ message_with_timestamp = qemu_opt_get_bool(opts, "timestamp", false);
+ error_with_guestname = qemu_opt_get_bool(opts, "guest-name", false);
+}
+
+
+/***********************************************************/
+/* USB devices */
+
+static int usb_device_add(const char *devname)
+{
+ USBDevice *dev = NULL;
+
+ if (!machine_usb(current_machine)) {
+ return -1;
+ }
+
+ dev = usbdevice_create(devname);
+ if (!dev)
+ return -1;
+
+ return 0;
+}
+
+static int usb_parse(const char *cmdline)
+{
+ int r;
+ r = usb_device_add(cmdline);
+ if (r < 0) {
+ error_report("could not add USB device '%s'", cmdline);
+ }
+ return r;
+}
+
+/***********************************************************/
+/* machine registration */
+
+static MachineClass *find_machine(const char *name, GSList *machines)
+{
+ GSList *el;
+
+ for (el = machines; el; el = el->next) {
+ MachineClass *mc = el->data;
+
+ if (!strcmp(mc->name, name) || !g_strcmp0(mc->alias, name)) {
+ return mc;
+ }
+ }
+
+ return NULL;
+}
+
+static MachineClass *find_default_machine(GSList *machines)
+{
+ GSList *el;
+ MachineClass *default_machineclass = NULL;
+
+ for (el = machines; el; el = el->next) {
+ MachineClass *mc = el->data;
+
+ if (mc->is_default) {
+ assert(default_machineclass == NULL && "Multiple default machines");
+ default_machineclass = mc;
+ }
+ }
+
+ return default_machineclass;
+}
+
+static void version(void)
+{
+ printf("QEMU emulator version " QEMU_FULL_VERSION "\n"
+ QEMU_COPYRIGHT "\n");
+}
+
+static void help(int exitcode)
+{
+ version();
+ printf("usage: %s [options] [disk_image]\n\n"
+ "'disk_image' is a raw hard disk image for IDE hard disk 0\n\n",
+ g_get_prgname());
+
+#define DEF(option, opt_arg, opt_enum, opt_help, arch_mask) \
+ if ((arch_mask) & arch_type) \
+ fputs(opt_help, stdout);
+
+#define ARCHHEADING(text, arch_mask) \
+ if ((arch_mask) & arch_type) \
+ puts(stringify(text));
+
+#define DEFHEADING(text) ARCHHEADING(text, QEMU_ARCH_ALL)
+
+#include "qemu-options.def"
+
+ printf("\nDuring emulation, the following keys are useful:\n"
+ "ctrl-alt-f toggle full screen\n"
+ "ctrl-alt-n switch to virtual console 'n'\n"
+ "ctrl-alt toggle mouse and keyboard grab\n"
+ "\n"
+ "When using -nographic, press 'ctrl-a h' to get some help.\n"
+ "\n"
+ QEMU_HELP_BOTTOM "\n");
+
+ exit(exitcode);
+}
+
+enum {
+
+#define DEF(option, opt_arg, opt_enum, opt_help, arch_mask) \
+ opt_enum,
+#define DEFHEADING(text)
+#define ARCHHEADING(text, arch_mask)
+
+#include "qemu-options.def"
+};
+
+#define HAS_ARG 0x0001
+
+typedef struct QEMUOption {
+ const char *name;
+ int flags;
+ int index;
+ uint32_t arch_mask;
+} QEMUOption;
+
+static const QEMUOption qemu_options[] = {
+ { "h", 0, QEMU_OPTION_h, QEMU_ARCH_ALL },
+
+#define DEF(option, opt_arg, opt_enum, opt_help, arch_mask) \
+ { option, opt_arg, opt_enum, arch_mask },
+#define DEFHEADING(text)
+#define ARCHHEADING(text, arch_mask)
+
+#include "qemu-options.def"
+ { /* end of list */ }
+};
+
+typedef struct VGAInterfaceInfo {
+ const char *opt_name; /* option name */
+ const char *name; /* human-readable name */
+ /* Class names indicating that support is available.
+ * If no class is specified, the interface is always available */
+ const char *class_names[2];
+} VGAInterfaceInfo;
+
+static const VGAInterfaceInfo vga_interfaces[VGA_TYPE_MAX] = {
+ [VGA_NONE] = {
+ .opt_name = "none",
+ .name = "no graphic card",
+ },
+ [VGA_STD] = {
+ .opt_name = "std",
+ .name = "standard VGA",
+ .class_names = { "VGA", "isa-vga" },
+ },
+ [VGA_CIRRUS] = {
+ .opt_name = "cirrus",
+ .name = "Cirrus VGA",
+ .class_names = { "cirrus-vga", "isa-cirrus-vga" },
+ },
+ [VGA_VMWARE] = {
+ .opt_name = "vmware",
+ .name = "VMWare SVGA",
+ .class_names = { "vmware-svga" },
+ },
+ [VGA_VIRTIO] = {
+ .opt_name = "virtio",
+ .name = "Virtio VGA",
+ .class_names = { "virtio-vga" },
+ },
+ [VGA_QXL] = {
+ .opt_name = "qxl",
+ .name = "QXL VGA",
+ .class_names = { "qxl-vga" },
+ },
+ [VGA_TCX] = {
+ .opt_name = "tcx",
+ .name = "TCX framebuffer",
+ .class_names = { "sun-tcx" },
+ },
+ [VGA_CG3] = {
+ .opt_name = "cg3",
+ .name = "CG3 framebuffer",
+ .class_names = { "cgthree" },
+ },
+#ifdef CONFIG_XEN_BACKEND
+ [VGA_XENFB] = {
+ .opt_name = "xenfb",
+ .name = "Xen paravirtualized framebuffer",
+ },
+#endif
+};
+
+static bool vga_interface_available(VGAInterfaceType t)
+{
+ const VGAInterfaceInfo *ti = &vga_interfaces[t];
+
+ assert(t < VGA_TYPE_MAX);
+ return !ti->class_names[0] ||
+ module_object_class_by_name(ti->class_names[0]) ||
+ module_object_class_by_name(ti->class_names[1]);
+}
+
+static const char *
+get_default_vga_model(const MachineClass *machine_class)
+{
+ if (machine_class->default_display) {
+ for (int t = 0; t < VGA_TYPE_MAX; t++) {
+ const VGAInterfaceInfo *ti = &vga_interfaces[t];
+
+ if (ti->opt_name && vga_interface_available(t) &&
+ g_str_equal(ti->opt_name, machine_class->default_display)) {
+ return machine_class->default_display;
+ }
+ }
+
+ warn_report_once("Default display '%s' is not available in this binary",
+ machine_class->default_display);
+ return NULL;
+ } else if (vga_interface_available(VGA_CIRRUS)) {
+ return "cirrus";
+ } else if (vga_interface_available(VGA_STD)) {
+ return "std";
+ }
+
+ return NULL;
+}
+
+static void select_vgahw(const MachineClass *machine_class, const char *p)
+{
+ const char *opts;
+ int t;
+
+ if (g_str_equal(p, "help")) {
+ const char *def = get_default_vga_model(machine_class);
+
+ for (t = 0; t < VGA_TYPE_MAX; t++) {
+ const VGAInterfaceInfo *ti = &vga_interfaces[t];
+
+ if (vga_interface_available(t) && ti->opt_name) {
+ printf("%-20s %s%s\n", ti->opt_name, ti->name ?: "",
+ (def && g_str_equal(ti->opt_name, def)) ?
+ " (default)" : "");
+ }
+ }
+ exit(0);
+ }
+
+ assert(vga_interface_type == VGA_NONE);
+ for (t = 0; t < VGA_TYPE_MAX; t++) {
+ const VGAInterfaceInfo *ti = &vga_interfaces[t];
+ if (ti->opt_name && strstart(p, ti->opt_name, &opts)) {
+ if (!vga_interface_available(t)) {
+ error_report("%s not available", ti->name);
+ exit(1);
+ }
+ vga_interface_type = t;
+ break;
+ }
+ }
+ if (t == VGA_TYPE_MAX) {
+ invalid_vga:
+ error_report("unknown vga type: %s", p);
+ exit(1);
+ }
+ while (*opts) {
+ const char *nextopt;
+
+ if (strstart(opts, ",retrace=", &nextopt)) {
+ opts = nextopt;
+ if (strstart(opts, "dumb", &nextopt))
+ vga_retrace_method = VGA_RETRACE_DUMB;
+ else if (strstart(opts, "precise", &nextopt))
+ vga_retrace_method = VGA_RETRACE_PRECISE;
+ else goto invalid_vga;
+ } else goto invalid_vga;
+ opts = nextopt;
+ }
+}
+
+static void parse_display_qapi(const char *optarg)
+{
+ DisplayOptions *opts;
+ Visitor *v;
+
+ v = qobject_input_visitor_new_str(optarg, "type", &error_fatal);
+
+ visit_type_DisplayOptions(v, NULL, &opts, &error_fatal);
+ QAPI_CLONE_MEMBERS(DisplayOptions, &dpy, opts);
+
+ qapi_free_DisplayOptions(opts);
+ visit_free(v);
+}
+
+DisplayOptions *qmp_query_display_options(Error **errp)
+{
+ return QAPI_CLONE(DisplayOptions, &dpy);
+}
+
+static void parse_display(const char *p)
+{
+ const char *opts;
+
+ if (is_help_option(p)) {
+ qemu_display_help();
+ exit(0);
+ }
+
+ if (strstart(p, "vnc", &opts)) {
+ /*
+ * vnc isn't a (local) DisplayType but a protocol for remote
+ * display access.
+ */
+ if (*opts == '=') {
+ vnc_parse(opts + 1);
+ } else {
+ error_report("VNC requires a display argument vnc=<display>");
+ exit(1);
+ }
+ } else {
+ parse_display_qapi(p);
+ }
+}
+
+static inline bool nonempty_str(const char *str)
+{
+ return str && *str;
+}
+
+static int parse_fw_cfg(void *opaque, QemuOpts *opts, Error **errp)
+{
+ gchar *buf;
+ size_t size;
+ const char *name, *file, *str, *gen_id;
+ FWCfgState *fw_cfg = (FWCfgState *) opaque;
+
+ if (fw_cfg == NULL) {
+ error_setg(errp, "fw_cfg device not available");
+ return -1;
+ }
+ name = qemu_opt_get(opts, "name");
+ file = qemu_opt_get(opts, "file");
+ str = qemu_opt_get(opts, "string");
+ gen_id = qemu_opt_get(opts, "gen_id");
+
+ /* we need the name, and exactly one of: file, content string, gen_id */
+ if (!nonempty_str(name) ||
+ nonempty_str(file) + nonempty_str(str) + nonempty_str(gen_id) != 1) {
+ error_setg(errp, "name, plus exactly one of file,"
+ " string and gen_id, are needed");
+ return -1;
+ }
+ if (strlen(name) > FW_CFG_MAX_FILE_PATH - 1) {
+ error_setg(errp, "name too long (max. %d char)",
+ FW_CFG_MAX_FILE_PATH - 1);
+ return -1;
+ }
+ if (nonempty_str(gen_id)) {
+ /*
+ * In this particular case where the content is populated
+ * internally, the "etc/" namespace protection is relaxed,
+ * so do not emit a warning.
+ */
+ } else if (strncmp(name, "opt/", 4) != 0) {
+ warn_report("externally provided fw_cfg item names "
+ "should be prefixed with \"opt/\"");
+ }
+ if (nonempty_str(str)) {
+ size = strlen(str); /* NUL terminator NOT included in fw_cfg blob */
+ buf = g_memdup(str, size);
+ } else if (nonempty_str(gen_id)) {
+ if (!fw_cfg_add_from_generator(fw_cfg, name, gen_id, errp)) {
+ return -1;
+ }
+ return 0;
+ } else {
+ GError *err = NULL;
+ if (!g_file_get_contents(file, &buf, &size, &err)) {
+ error_setg(errp, "can't load %s: %s", file, err->message);
+ g_error_free(err);
+ return -1;
+ }
+ }
+ /* For legacy, keep user files in a specific global order. */
+ fw_cfg_set_order_override(fw_cfg, FW_CFG_ORDER_OVERRIDE_USER);
+ fw_cfg_add_file(fw_cfg, name, buf, size);
+ fw_cfg_reset_order_override(fw_cfg);
+ return 0;
+}
+
+static int device_help_func(void *opaque, QemuOpts *opts, Error **errp)
+{
+ return qdev_device_help(opts);
+}
+
+static int device_init_func(void *opaque, QemuOpts *opts, Error **errp)
+{
+ DeviceState *dev;
+
+ dev = qdev_device_add(opts, errp);
+ if (!dev && *errp) {
+ error_report_err(*errp);
+ return -1;
+ } else if (dev) {
+ object_unref(OBJECT(dev));
+ }
+ return 0;
+}
+
+static int chardev_init_func(void *opaque, QemuOpts *opts, Error **errp)
+{
+ Error *local_err = NULL;
+
+ if (!qemu_chr_new_from_opts(opts, NULL, &local_err)) {
+ if (local_err) {
+ error_propagate(errp, local_err);
+ return -1;
+ }
+ exit(0);
+ }
+ return 0;
+}
+
+#ifdef CONFIG_VIRTFS
+static int fsdev_init_func(void *opaque, QemuOpts *opts, Error **errp)
+{
+ return qemu_fsdev_add(opts, errp);
+}
+#endif
+
+static int mon_init_func(void *opaque, QemuOpts *opts, Error **errp)
+{
+ return monitor_init_opts(opts, errp);
+}
+
+static void monitor_parse(const char *optarg, const char *mode, bool pretty)
+{
+ static int monitor_device_index = 0;
+ QemuOpts *opts;
+ const char *p;
+ char label[32];
+
+ if (strstart(optarg, "chardev:", &p)) {
+ snprintf(label, sizeof(label), "%s", p);
+ } else {
+ snprintf(label, sizeof(label), "compat_monitor%d",
+ monitor_device_index);
+ opts = qemu_chr_parse_compat(label, optarg, true);
+ if (!opts) {
+ error_report("parse error: %s", optarg);
+ exit(1);
+ }
+ }
+
+ opts = qemu_opts_create(qemu_find_opts("mon"), label, 1, &error_fatal);
+ qemu_opt_set(opts, "mode", mode, &error_abort);
+ qemu_opt_set(opts, "chardev", label, &error_abort);
+ if (!strcmp(mode, "control")) {
+ qemu_opt_set_bool(opts, "pretty", pretty, &error_abort);
+ } else {
+ assert(pretty == false);
+ }
+ monitor_device_index++;
+}
+
+struct device_config {
+ enum {
+ DEV_USB, /* -usbdevice */
+ DEV_SERIAL, /* -serial */
+ DEV_PARALLEL, /* -parallel */
+ DEV_DEBUGCON, /* -debugcon */
+ DEV_GDB, /* -gdb, -s */
+ DEV_SCLP, /* s390 sclp */
+ } type;
+ const char *cmdline;
+ Location loc;
+ QTAILQ_ENTRY(device_config) next;
+};
+
+static QTAILQ_HEAD(, device_config) device_configs =
+ QTAILQ_HEAD_INITIALIZER(device_configs);
+
+static void add_device_config(int type, const char *cmdline)
+{
+ struct device_config *conf;
+
+ conf = g_malloc0(sizeof(*conf));
+ conf->type = type;
+ conf->cmdline = cmdline;
+ loc_save(&conf->loc);
+ QTAILQ_INSERT_TAIL(&device_configs, conf, next);
+}
+
+static int foreach_device_config(int type, int (*func)(const char *cmdline))
+{
+ struct device_config *conf;
+ int rc;
+
+ QTAILQ_FOREACH(conf, &device_configs, next) {
+ if (conf->type != type)
+ continue;
+ loc_push_restore(&conf->loc);
+ rc = func(conf->cmdline);
+ loc_pop(&conf->loc);
+ if (rc) {
+ return rc;
+ }
+ }
+ return 0;
+}
+
+static void qemu_disable_default_devices(void)
+{
+ MachineClass *machine_class = MACHINE_GET_CLASS(current_machine);
+
+ default_driver_check_json();
+ qemu_opts_foreach(qemu_find_opts("device"),
+ default_driver_check, NULL, NULL);
+ qemu_opts_foreach(qemu_find_opts("global"),
+ default_driver_check, NULL, NULL);
+
+ if (!vga_model && !default_vga) {
+ vga_interface_type = VGA_DEVICE;
+ vga_interface_created = true;
+ }
+ if (!has_defaults || machine_class->no_serial) {
+ default_serial = 0;
+ }
+ if (!has_defaults || machine_class->no_parallel) {
+ default_parallel = 0;
+ }
+ if (!has_defaults || machine_class->no_floppy) {
+ default_floppy = 0;
+ }
+ if (!has_defaults || machine_class->no_cdrom) {
+ default_cdrom = 0;
+ }
+ if (!has_defaults || machine_class->no_sdcard) {
+ default_sdcard = 0;
+ }
+ if (!has_defaults) {
+ default_monitor = 0;
+ default_net = 0;
+ default_vga = 0;
+ } else {
+ if (default_net && machine_class->default_nic &&
+ !module_object_class_by_name(machine_class->default_nic)) {
+ warn_report("Default NIC '%s' is not available in this binary",
+ machine_class->default_nic);
+ default_net = 0;
+ }
+ }
+}
+
+static void qemu_create_default_devices(void)
+{
+ MachineClass *machine_class = MACHINE_GET_CLASS(current_machine);
+
+ if (is_daemonized()) {
+ /* According to documentation and historically, -nographic redirects
+ * serial port, parallel port and monitor to stdio, which does not work
+ * with -daemonize. We can redirect these to null instead, but since
+ * -nographic is legacy, let's just error out.
+ * We disallow -nographic only if all other ports are not redirected
+ * explicitly, to not break existing legacy setups which uses
+ * -nographic _and_ redirects all ports explicitly - this is valid
+ * usage, -nographic is just a no-op in this case.
+ */
+ if (nographic
+ && (default_parallel || default_serial || default_monitor)) {
+ error_report("-nographic cannot be used with -daemonize");
+ exit(1);
+ }
+ }
+
+ if (nographic) {
+ if (default_parallel)
+ add_device_config(DEV_PARALLEL, "null");
+ if (default_serial && default_monitor) {
+ add_device_config(DEV_SERIAL, "mon:stdio");
+ } else {
+ if (default_serial)
+ add_device_config(DEV_SERIAL, "stdio");
+ if (default_monitor)
+ monitor_parse("stdio", "readline", false);
+ }
+ } else {
+ if (default_serial)
+ add_device_config(DEV_SERIAL, "vc:80Cx24C");
+ if (default_parallel)
+ add_device_config(DEV_PARALLEL, "vc:80Cx24C");
+ if (default_monitor)
+ monitor_parse("vc:80Cx24C", "readline", false);
+ }
+
+ if (default_net) {
+ QemuOptsList *net = qemu_find_opts("net");
+ qemu_opts_parse(net, "nic", true, &error_abort);
+#ifdef CONFIG_SLIRP
+ qemu_opts_parse(net, "user", true, &error_abort);
+#endif
+ }
+
+#if defined(CONFIG_VNC)
+ if (!QTAILQ_EMPTY(&(qemu_find_opts("vnc")->head))) {
+ display_remote++;
+ }
+#endif
+ if (dpy.type == DISPLAY_TYPE_DEFAULT && !display_remote) {
+ if (!qemu_display_find_default(&dpy)) {
+ dpy.type = DISPLAY_TYPE_NONE;
+#if defined(CONFIG_VNC)
+ vnc_parse("localhost:0,to=99,id=default");
+#endif
+ }
+ }
+ if (dpy.type == DISPLAY_TYPE_DEFAULT) {
+ dpy.type = DISPLAY_TYPE_NONE;
+ }
+
+ /* If no default VGA is requested, the default is "none". */
+ if (default_vga) {
+ vga_model = get_default_vga_model(machine_class);
+ }
+ if (vga_model) {
+ select_vgahw(machine_class, vga_model);
+ }
+}
+
+static int serial_parse(const char *devname)
+{
+ int index = num_serial_hds;
+ char label[32];
+
+ if (strcmp(devname, "none") == 0)
+ return 0;
+ snprintf(label, sizeof(label), "serial%d", index);
+ serial_hds = g_renew(Chardev *, serial_hds, index + 1);
+
+ serial_hds[index] = qemu_chr_new_mux_mon(label, devname, NULL);
+ if (!serial_hds[index]) {
+ error_report("could not connect serial device"
+ " to character backend '%s'", devname);
+ return -1;
+ }
+ num_serial_hds++;
+ return 0;
+}
+
+Chardev *serial_hd(int i)
+{
+ assert(i >= 0);
+ if (i < num_serial_hds) {
+ return serial_hds[i];
+ }
+ return NULL;
+}
+
+static int parallel_parse(const char *devname)
+{
+ static int index = 0;
+ char label[32];
+
+ if (strcmp(devname, "none") == 0)
+ return 0;
+ if (index == MAX_PARALLEL_PORTS) {
+ error_report("too many parallel ports");
+ exit(1);
+ }
+ snprintf(label, sizeof(label), "parallel%d", index);
+ parallel_hds[index] = qemu_chr_new_mux_mon(label, devname, NULL);
+ if (!parallel_hds[index]) {
+ error_report("could not connect parallel device"
+ " to character backend '%s'", devname);
+ return -1;
+ }
+ index++;
+ return 0;
+}
+
+static int debugcon_parse(const char *devname)
+{
+ QemuOpts *opts;
+
+ if (!qemu_chr_new_mux_mon("debugcon", devname, NULL)) {
+ error_report("invalid character backend '%s'", devname);
+ exit(1);
+ }
+ opts = qemu_opts_create(qemu_find_opts("device"), "debugcon", 1, NULL);
+ if (!opts) {
+ error_report("already have a debugcon device");
+ exit(1);
+ }
+ qemu_opt_set(opts, "driver", "isa-debugcon", &error_abort);
+ qemu_opt_set(opts, "chardev", "debugcon", &error_abort);
+ return 0;
+}
+
+static gint machine_class_cmp(gconstpointer a, gconstpointer b)
+{
+ const MachineClass *mc1 = a, *mc2 = b;
+ int res;
+
+ if (mc1->family == NULL) {
+ if (mc2->family == NULL) {
+ /* Compare standalone machine types against each other; they sort
+ * in increasing order.
+ */
+ return strcmp(object_class_get_name(OBJECT_CLASS(mc1)),
+ object_class_get_name(OBJECT_CLASS(mc2)));
+ }
+
+ /* Standalone machine types sort after families. */
+ return 1;
+ }
+
+ if (mc2->family == NULL) {
+ /* Families sort before standalone machine types. */
+ return -1;
+ }
+
+ /* Families sort between each other alphabetically increasingly. */
+ res = strcmp(mc1->family, mc2->family);
+ if (res != 0) {
+ return res;
+ }
+
+ /* Within the same family, machine types sort in decreasing order. */
+ return strcmp(object_class_get_name(OBJECT_CLASS(mc2)),
+ object_class_get_name(OBJECT_CLASS(mc1)));
+}
+
+static void machine_help_func(const QDict *qdict)
+{
+ GSList *machines, *el;
+ const char *type = qdict_get_try_str(qdict, "type");
+
+ machines = object_class_get_list(TYPE_MACHINE, false);
+ if (type) {
+ ObjectClass *machine_class = OBJECT_CLASS(find_machine(type, machines));
+ if (machine_class) {
+ type_print_class_properties(object_class_get_name(machine_class));
+ return;
+ }
+ }
+
+ printf("Supported machines are:\n");
+ machines = g_slist_sort(machines, machine_class_cmp);
+ for (el = machines; el; el = el->next) {
+ MachineClass *mc = el->data;
+ if (mc->alias) {
+ printf("%-20s %s (alias of %s)\n", mc->alias, mc->desc, mc->name);
+ }
+ printf("%-20s %s%s%s\n", mc->name, mc->desc,
+ mc->is_default ? " (default)" : "",
+ mc->deprecation_reason ? " (deprecated)" : "");
+ }
+}
+
+static void
+machine_merge_property(const char *propname, QDict *prop, Error **errp)
+{
+ QDict *opts;
+
+ opts = qdict_new();
+ /* Preserve the caller's reference to prop. */
+ qobject_ref(prop);
+ qdict_put(opts, propname, prop);
+ keyval_merge(machine_opts_dict, opts, errp);
+ qobject_unref(opts);
+}
+
+static void
+machine_parse_property_opt(QemuOptsList *opts_list, const char *propname,
+ const char *arg)
+{
+ QDict *prop = NULL;
+ bool help = false;
+
+ prop = keyval_parse(arg, opts_list->implied_opt_name, &help, &error_fatal);
+ if (help) {
+ qemu_opts_print_help(opts_list, true);
+ exit(0);
+ }
+ machine_merge_property(propname, prop, &error_fatal);
+ qobject_unref(prop);
+}
+
+static const char *pid_file;
+struct UnlinkPidfileNotifier {
+ Notifier notifier;
+ char *pid_file_realpath;
+};
+static struct UnlinkPidfileNotifier qemu_unlink_pidfile_notifier;
+
+static void qemu_unlink_pidfile(Notifier *n, void *data)
+{
+ struct UnlinkPidfileNotifier *upn;
+
+ upn = DO_UPCAST(struct UnlinkPidfileNotifier, notifier, n);
+ unlink(upn->pid_file_realpath);
+}
+
+static const QEMUOption *lookup_opt(int argc, char **argv,
+ const char **poptarg, int *poptind)
+{
+ const QEMUOption *popt;
+ int optind = *poptind;
+ char *r = argv[optind];
+ const char *optarg;
+
+ loc_set_cmdline(argv, optind, 1);
+ optind++;
+ /* Treat --foo the same as -foo. */
+ if (r[1] == '-')
+ r++;
+ popt = qemu_options;
+ for(;;) {
+ if (!popt->name) {
+ error_report("invalid option");
+ exit(1);
+ }
+ if (!strcmp(popt->name, r + 1))
+ break;
+ popt++;
+ }
+ if (popt->flags & HAS_ARG) {
+ if (optind >= argc) {
+ error_report("requires an argument");
+ exit(1);
+ }
+ optarg = argv[optind++];
+ loc_set_cmdline(argv, optind - 2, 2);
+ } else {
+ optarg = NULL;
+ }
+
+ *poptarg = optarg;
+ *poptind = optind;
+
+ return popt;
+}
+
+static MachineClass *select_machine(QDict *qdict, Error **errp)
+{
+ const char *optarg = qdict_get_try_str(qdict, "type");
+ GSList *machines = object_class_get_list(TYPE_MACHINE, false);
+ MachineClass *machine_class;
+ Error *local_err = NULL;
+
+ if (optarg) {
+ machine_class = find_machine(optarg, machines);
+ qdict_del(qdict, "type");
+ if (!machine_class) {
+ error_setg(&local_err, "unsupported machine type");
+ }
+ } else {
+ machine_class = find_default_machine(machines);
+ if (!machine_class) {
+ error_setg(&local_err, "No machine specified, and there is no default");
+ }
+ }
+
+ g_slist_free(machines);
+ if (local_err) {
+ error_append_hint(&local_err, "Use -machine help to list supported machines\n");
+ error_propagate(errp, local_err);
+ }
+ return machine_class;
+}
+
+static int object_parse_property_opt(Object *obj,
+ const char *name, const char *value,
+ const char *skip, Error **errp)
+{
+ if (g_str_equal(name, skip)) {
+ return 0;
+ }
+
+ if (!object_property_parse(obj, name, value, errp)) {
+ return -1;
+ }
+
+ return 0;
+}
+
+/* *Non*recursively replace underscores with dashes in QDict keys. */
+static void keyval_dashify(QDict *qdict, Error **errp)
+{
+ const QDictEntry *ent, *next;
+ char *p;
+
+ for (ent = qdict_first(qdict); ent; ent = next) {
+ g_autofree char *new_key = NULL;
+
+ next = qdict_next(qdict, ent);
+ if (!strchr(ent->key, '_')) {
+ continue;
+ }
+ new_key = g_strdup(ent->key);
+ for (p = new_key; *p; p++) {
+ if (*p == '_') {
+ *p = '-';
+ }
+ }
+ if (qdict_haskey(qdict, new_key)) {
+ error_setg(errp, "Conflict between '%s' and '%s'", ent->key, new_key);
+ return;
+ }
+ qobject_ref(ent->value);
+ qdict_put_obj(qdict, new_key, ent->value);
+ qdict_del(qdict, ent->key);
+ }
+}
+
+static void qemu_apply_legacy_machine_options(QDict *qdict)
+{
+ const char *value;
+ QObject *prop;
+
+ keyval_dashify(qdict, &error_fatal);
+
+ /* Legacy options do not correspond to MachineState properties. */
+ value = qdict_get_try_str(qdict, "accel");
+ if (value) {
+ accelerators = g_strdup(value);
+ qdict_del(qdict, "accel");
+ }
+
+ value = qdict_get_try_str(qdict, "igd-passthru");
+ if (value) {
+ object_register_sugar_prop(ACCEL_CLASS_NAME("xen"), "igd-passthru", value,
+ false);
+ qdict_del(qdict, "igd-passthru");
+ }
+
+ value = qdict_get_try_str(qdict, "kvm-shadow-mem");
+ if (value) {
+ object_register_sugar_prop(ACCEL_CLASS_NAME("kvm"), "kvm-shadow-mem", value,
+ false);
+ qdict_del(qdict, "kvm-shadow-mem");
+ }
+
+ value = qdict_get_try_str(qdict, "kernel-irqchip");
+ if (value) {
+ object_register_sugar_prop(ACCEL_CLASS_NAME("kvm"), "kernel-irqchip", value,
+ false);
+ object_register_sugar_prop(ACCEL_CLASS_NAME("whpx"), "kernel-irqchip", value,
+ false);
+ qdict_del(qdict, "kernel-irqchip");
+ }
+
+ value = qdict_get_try_str(qdict, "memory-backend");
+ if (value) {
+ if (mem_path) {
+ error_report("'-mem-path' can't be used together with"
+ "'-machine memory-backend'");
+ exit(EXIT_FAILURE);
+ }
+
+ /* Resolved later. */
+ ram_memdev_id = g_strdup(value);
+ qdict_del(qdict, "memory-backend");
+ }
+
+ prop = qdict_get(qdict, "memory");
+ if (prop) {
+ have_custom_ram_size =
+ qobject_type(prop) == QTYPE_QDICT &&
+ qdict_haskey(qobject_to(QDict, prop), "size");
+ }
+}
+
+static void object_option_foreach_add(bool (*type_opt_predicate)(const char *))
+{
+ ObjectOption *opt, *next;
+
+ QTAILQ_FOREACH_SAFE(opt, &object_opts, next, next) {
+ const char *type = ObjectType_str(opt->opts->qom_type);
+ if (type_opt_predicate(type)) {
+ user_creatable_add_qapi(opt->opts, &error_fatal);
+ qapi_free_ObjectOptions(opt->opts);
+ QTAILQ_REMOVE(&object_opts, opt, next);
+ g_free(opt);
+ }
+ }
+}
+
+static void object_option_add_visitor(Visitor *v)
+{
+ ObjectOption *opt = g_new0(ObjectOption, 1);
+ visit_type_ObjectOptions(v, NULL, &opt->opts, &error_fatal);
+ QTAILQ_INSERT_TAIL(&object_opts, opt, next);
+}
+
+static void object_option_parse(const char *optarg)
+{
+ QemuOpts *opts;
+ const char *type;
+ Visitor *v;
+
+ if (optarg[0] == '{') {
+ QObject *obj = qobject_from_json(optarg, &error_fatal);
+
+ v = qobject_input_visitor_new(obj);
+ qobject_unref(obj);
+ } else {
+ opts = qemu_opts_parse_noisily(qemu_find_opts("object"),
+ optarg, true);
+ if (!opts) {
+ exit(1);
+ }
+
+ type = qemu_opt_get(opts, "qom-type");
+ if (!type) {
+ error_setg(&error_fatal, QERR_MISSING_PARAMETER, "qom-type");
+ }
+ if (user_creatable_print_help(type, opts)) {
+ exit(0);
+ }
+
+ v = opts_visitor_new(opts);
+ }
+
+ object_option_add_visitor(v);
+ visit_free(v);
+}
+
+/*
+ * Very early object creation, before the sandbox options have been activated.
+ */
+static bool object_create_pre_sandbox(const char *type)
+{
+ /*
+ * Objects should in general not get initialized "too early" without
+ * a reason. If you add one, state the reason in a comment!
+ */
+
+ /*
+ * Reason: -sandbox on,resourcecontrol=deny disallows setting CPU
+ * affinity of threads.
+ */
+ if (g_str_equal(type, "thread-context")) {
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Initial object creation happens before all other
+ * QEMU data types are created. The majority of objects
+ * can be created at this point. The rng-egd object
+ * cannot be created here, as it depends on the chardev
+ * already existing.
+ */
+static bool object_create_early(const char *type)
+{
+ /*
+ * Objects should not be made "delayed" without a reason. If you
+ * add one, state the reason in a comment!
+ */
+
+ /* Reason: already created. */
+ if (object_create_pre_sandbox(type)) {
+ return false;
+ }
+
+ /* Reason: property "chardev" */
+ if (g_str_equal(type, "rng-egd") ||
+ g_str_equal(type, "qtest")) {
+ return false;
+ }
+
+#if defined(CONFIG_VHOST_USER) && defined(CONFIG_LINUX)
+ /* Reason: cryptodev-vhost-user property "chardev" */
+ if (g_str_equal(type, "cryptodev-vhost-user")) {
+ return false;
+ }
+#endif
+
+ /* Reason: vhost-user-blk-server property "node-name" */
+ if (g_str_equal(type, "vhost-user-blk-server")) {
+ return false;
+ }
+ /*
+ * Reason: filter-* property "netdev" etc.
+ */
+ if (g_str_equal(type, "filter-buffer") ||
+ g_str_equal(type, "filter-dump") ||
+ g_str_equal(type, "filter-mirror") ||
+ g_str_equal(type, "filter-redirector") ||
+ g_str_equal(type, "colo-compare") ||
+ g_str_equal(type, "filter-rewriter") ||
+ g_str_equal(type, "filter-replay")) {
+ return false;
+ }
+
+ /*
+ * Allocation of large amounts of memory may delay
+ * chardev initialization for too long, and trigger timeouts
+ * on software that waits for a monitor socket to be created
+ * (e.g. libvirt).
+ */
+ if (g_str_has_prefix(type, "memory-backend-")) {
+ return false;
+ }
+
+ return true;
+}
+
+static void qemu_apply_machine_options(QDict *qdict)
+{
+ object_set_properties_from_keyval(OBJECT(current_machine), qdict, false, &error_fatal);
+
+ if (semihosting_enabled(false) && !semihosting_get_argc()) {
+ /* fall back to the -kernel/-append */
+ semihosting_arg_fallback(current_machine->kernel_filename, current_machine->kernel_cmdline);
+ }
+
+ if (current_machine->smp.cpus > 1) {
+ replay_add_blocker("smp");
+ }
+}
+
+static void qemu_create_early_backends(void)
+{
+ MachineClass *machine_class = MACHINE_GET_CLASS(current_machine);
+#if defined(CONFIG_SDL)
+ const bool use_sdl = (dpy.type == DISPLAY_TYPE_SDL);
+#else
+ const bool use_sdl = false;
+#endif
+#if defined(CONFIG_GTK)
+ const bool use_gtk = (dpy.type == DISPLAY_TYPE_GTK);
+#else
+ const bool use_gtk = false;
+#endif
+
+ if (dpy.has_window_close && !use_gtk && !use_sdl) {
+ error_report("window-close is only valid for GTK and SDL, "
+ "ignoring option");
+ }
+
+ qemu_display_early_init(&dpy);
+ qemu_console_early_init();
+
+ if (dpy.has_gl && dpy.gl != DISPLAYGL_MODE_OFF && display_opengl == 0) {
+#if defined(CONFIG_OPENGL)
+ error_report("OpenGL is not supported by the display");
+#else
+ error_report("OpenGL support is disabled");
+#endif
+ exit(1);
+ }
+
+ object_option_foreach_add(object_create_early);
+
+ /* spice needs the timers to be initialized by this point */
+ /* spice must initialize before audio as it changes the default audiodev */
+ /* spice must initialize before chardevs (for spicevmc and spiceport) */
+ qemu_spice.init();
+
+ qemu_opts_foreach(qemu_find_opts("chardev"),
+ chardev_init_func, NULL, &error_fatal);
+
+#ifdef CONFIG_VIRTFS
+ qemu_opts_foreach(qemu_find_opts("fsdev"),
+ fsdev_init_func, NULL, &error_fatal);
+#endif
+
+ /*
+ * Note: we need to create audio and block backends before
+ * setting machine properties, so they can be referred to.
+ */
+ configure_blockdev(&bdo_queue, machine_class, snapshot);
+ audio_init_audiodevs();
+}
+
+
+/*
+ * The remainder of object creation happens after the
+ * creation of chardev, fsdev, net clients and device data types.
+ */
+static bool object_create_late(const char *type)
+{
+ return !object_create_early(type) && !object_create_pre_sandbox(type);
+}
+
+static void qemu_create_late_backends(void)
+{
+ if (qtest_chrdev) {
+ qtest_server_init(qtest_chrdev, qtest_log, &error_fatal);
+ }
+
+ net_init_clients();
+
+ object_option_foreach_add(object_create_late);
+
+ if (tpm_init() < 0) {
+ exit(1);
+ }
+
+ qemu_opts_foreach(qemu_find_opts("mon"),
+ mon_init_func, NULL, &error_fatal);
+
+ if (foreach_device_config(DEV_SERIAL, serial_parse) < 0)
+ exit(1);
+ if (foreach_device_config(DEV_PARALLEL, parallel_parse) < 0)
+ exit(1);
+ if (foreach_device_config(DEV_DEBUGCON, debugcon_parse) < 0)
+ exit(1);
+
+ /* now chardevs have been created we may have semihosting to connect */
+ qemu_semihosting_chardev_init();
+}
+
+static void qemu_resolve_machine_memdev(void)
+{
+ if (ram_memdev_id) {
+ Object *backend;
+ ram_addr_t backend_size;
+
+ backend = object_resolve_path_type(ram_memdev_id,
+ TYPE_MEMORY_BACKEND, NULL);
+ if (!backend) {
+ error_report("Memory backend '%s' not found", ram_memdev_id);
+ exit(EXIT_FAILURE);
+ }
+ if (!have_custom_ram_size) {
+ backend_size = object_property_get_uint(backend, "size", &error_abort);
+ current_machine->ram_size = backend_size;
+ }
+ object_property_set_link(OBJECT(current_machine),
+ "memory-backend", backend, &error_fatal);
+ }
+}
+
+static void parse_memory_options(void)
+{
+ QemuOpts *opts = qemu_find_opts_singleton("memory");
+ QDict *dict, *prop;
+ const char *mem_str;
+ Location loc;
+
+ loc_push_none(&loc);
+ qemu_opts_loc_restore(opts);
+
+ prop = qdict_new();
+
+ if (qemu_opt_get_size(opts, "size", 0) != 0) {
+ /* Fix up legacy suffix-less format */
+ mem_str = qemu_opt_get(opts, "size");
+ if (g_ascii_isdigit(mem_str[strlen(mem_str) - 1])) {
+ g_autofree char *mib_str = g_strdup_printf("%sM", mem_str);
+ qdict_put_str(prop, "size", mib_str);
+ } else {
+ qdict_put_str(prop, "size", mem_str);
+ }
+ }
+
+ if (qemu_opt_get(opts, "maxmem")) {
+ qdict_put_str(prop, "max-size", qemu_opt_get(opts, "maxmem"));
+ }
+ if (qemu_opt_get(opts, "slots")) {
+ qdict_put_str(prop, "slots", qemu_opt_get(opts, "slots"));
+ }
+
+ dict = qdict_new();
+ qdict_put(dict, "memory", prop);
+ keyval_merge(machine_opts_dict, dict, &error_fatal);
+ qobject_unref(dict);
+ loc_pop(&loc);
+}
+
+static void qemu_create_machine(QDict *qdict)
+{
+ MachineClass *machine_class = select_machine(qdict, &error_fatal);
+ object_set_machine_compat_props(machine_class->compat_props);
+
+ current_machine = MACHINE(object_new_with_class(OBJECT_CLASS(machine_class)));
+ object_property_add_child(object_get_root(), "machine",
+ OBJECT(current_machine));
+ object_property_add_child(container_get(OBJECT(current_machine),
+ "/unattached"),
+ "sysbus", OBJECT(sysbus_get_default()));
+
+ if (machine_class->minimum_page_bits) {
+ if (!set_preferred_target_page_bits(machine_class->minimum_page_bits)) {
+ /* This would be a board error: specifying a minimum smaller than
+ * a target's compile-time fixed setting.
+ */
+ g_assert_not_reached();
+ }
+ }
+
+ cpu_exec_init_all();
+ page_size_init();
+
+ if (machine_class->hw_version) {
+ qemu_set_hw_version(machine_class->hw_version);
+ }
+
+ /*
+ * Get the default machine options from the machine if it is not already
+ * specified either by the configuration file or by the command line.
+ */
+ if (machine_class->default_machine_opts) {
+ QDict *default_opts =
+ keyval_parse(machine_class->default_machine_opts, NULL, NULL,
+ &error_abort);
+ qemu_apply_legacy_machine_options(default_opts);
+ object_set_properties_from_keyval(OBJECT(current_machine), default_opts,
+ false, &error_abort);
+ qobject_unref(default_opts);
+ }
+}
+
+static int global_init_func(void *opaque, QemuOpts *opts, Error **errp)
+{
+ GlobalProperty *g;
+
+ g = g_malloc0(sizeof(*g));
+ g->driver = qemu_opt_get(opts, "driver");
+ g->property = qemu_opt_get(opts, "property");
+ g->value = qemu_opt_get(opts, "value");
+ qdev_prop_register_global(g);
+ return 0;
+}
+
+/*
+ * Return whether configuration group @group is stored in QemuOpts, or
+ * recorded as one or more QDicts by qemu_record_config_group.
+ */
+static bool is_qemuopts_group(const char *group)
+{
+ if (g_str_equal(group, "object") ||
+ g_str_equal(group, "audiodev") ||
+ g_str_equal(group, "machine") ||
+ g_str_equal(group, "smp-opts") ||
+ g_str_equal(group, "boot-opts")) {
+ return false;
+ }
+ return true;
+}
+
+static void qemu_record_config_group(const char *group, QDict *dict,
+ bool from_json, Error **errp)
+{
+ if (g_str_equal(group, "object")) {
+ Visitor *v = qobject_input_visitor_new_keyval(QOBJECT(dict));
+ object_option_add_visitor(v);
+ visit_free(v);
+
+ } else if (g_str_equal(group, "audiodev")) {
+ Audiodev *dev = NULL;
+ Visitor *v = qobject_input_visitor_new_keyval(QOBJECT(dict));
+ if (visit_type_Audiodev(v, NULL, &dev, errp)) {
+ audio_define(dev);
+ }
+ visit_free(v);
+
+ } else if (g_str_equal(group, "machine")) {
+ /*
+ * Cannot merge string-valued and type-safe dictionaries, so JSON
+ * is not accepted yet for -M.
+ */
+ assert(!from_json);
+ keyval_merge(machine_opts_dict, dict, errp);
+ } else if (g_str_equal(group, "smp-opts")) {
+ machine_merge_property("smp", dict, &error_fatal);
+ } else if (g_str_equal(group, "boot-opts")) {
+ machine_merge_property("boot", dict, &error_fatal);
+ } else {
+ abort();
+ }
+}
+
+/*
+ * Parse non-QemuOpts config file groups, pass the rest to
+ * qemu_config_do_parse.
+ */
+static void qemu_parse_config_group(const char *group, QDict *qdict,
+ void *opaque, Error **errp)
+{
+ QObject *crumpled;
+ if (is_qemuopts_group(group)) {
+ qemu_config_do_parse(group, qdict, opaque, errp);
+ return;
+ }
+
+ crumpled = qdict_crumple(qdict, errp);
+ if (!crumpled) {
+ return;
+ }
+ switch (qobject_type(crumpled)) {
+ case QTYPE_QDICT:
+ qemu_record_config_group(group, qobject_to(QDict, crumpled), false, errp);
+ break;
+ case QTYPE_QLIST:
+ error_setg(errp, "Lists cannot be at top level of a configuration section");
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ qobject_unref(crumpled);
+}
+
+static void qemu_read_default_config_file(Error **errp)
+{
+ ERRP_GUARD();
+ int ret;
+ g_autofree char *file = get_relocated_path(CONFIG_QEMU_CONFDIR "/qemu.conf");
+
+ ret = qemu_read_config_file(file, qemu_parse_config_group, errp);
+ if (ret < 0) {
+ if (ret == -ENOENT) {
+ error_free(*errp);
+ *errp = NULL;
+ }
+ }
+}
+
+static void qemu_set_option(const char *str, Error **errp)
+{
+ char group[64], id[64], arg[64];
+ QemuOptsList *list;
+ QemuOpts *opts;
+ int rc, offset;
+
+ rc = sscanf(str, "%63[^.].%63[^.].%63[^=]%n", group, id, arg, &offset);
+ if (rc < 3 || str[offset] != '=') {
+ error_setg(errp, "can't parse: \"%s\"", str);
+ return;
+ }
+
+ if (!is_qemuopts_group(group)) {
+ error_setg(errp, "-set is not supported with %s", group);
+ } else {
+ list = qemu_find_opts_err(group, errp);
+ if (list) {
+ opts = qemu_opts_find(list, id);
+ if (!opts) {
+ error_setg(errp, "there is no %s \"%s\" defined", group, id);
+ return;
+ }
+ qemu_opt_set(opts, arg, str + offset + 1, errp);
+ }
+ }
+}
+
+static void user_register_global_props(void)
+{
+ qemu_opts_foreach(qemu_find_opts("global"),
+ global_init_func, NULL, NULL);
+}
+
+static int do_configure_icount(void *opaque, QemuOpts *opts, Error **errp)
+{
+ icount_configure(opts, errp);
+ return 0;
+}
+
+static int accelerator_set_property(void *opaque,
+ const char *name, const char *value,
+ Error **errp)
+{
+ return object_parse_property_opt(opaque, name, value, "accel", errp);
+}
+
+static int do_configure_accelerator(void *opaque, QemuOpts *opts, Error **errp)
+{
+ bool *p_init_failed = opaque;
+ const char *acc = qemu_opt_get(opts, "accel");
+ AccelClass *ac = accel_find(acc);
+ AccelState *accel;
+ int ret;
+ bool qtest_with_kvm;
+
+ if (!acc) {
+ error_setg(errp, QERR_MISSING_PARAMETER, "accel");
+ goto bad;
+ }
+
+ qtest_with_kvm = g_str_equal(acc, "kvm") && qtest_chrdev != NULL;
+
+ if (!ac) {
+ if (!qtest_with_kvm) {
+ error_report("invalid accelerator %s", acc);
+ }
+ goto bad;
+ }
+ accel = ACCEL(object_new_with_class(OBJECT_CLASS(ac)));
+ object_apply_compat_props(OBJECT(accel));
+ qemu_opt_foreach(opts, accelerator_set_property,
+ accel,
+ &error_fatal);
+ /*
+ * If legacy -singlestep option is set, honour it for TCG and
+ * silently ignore for any other accelerator (which is how this
+ * option has always behaved).
+ */
+ if (opt_one_insn_per_tb) {
+ /*
+ * This will always succeed for TCG, and we want to ignore
+ * the error from trying to set a nonexistent property
+ * on any other accelerator.
+ */
+ object_property_set_bool(OBJECT(accel), "one-insn-per-tb", true, NULL);
+ }
+ ret = accel_init_machine(accel, current_machine);
+ if (ret < 0) {
+ if (!qtest_with_kvm || ret != -ENOENT) {
+ error_report("failed to initialize %s: %s", acc, strerror(-ret));
+ }
+ goto bad;
+ }
+
+ return 1;
+
+bad:
+ *p_init_failed = true;
+ return 0;
+}
+
+static void configure_accelerators(const char *progname)
+{
+ bool init_failed = false;
+
+ qemu_opts_foreach(qemu_find_opts("icount"),
+ do_configure_icount, NULL, &error_fatal);
+
+ if (QTAILQ_EMPTY(&qemu_accel_opts.head)) {
+ char **accel_list, **tmp;
+
+ if (accelerators == NULL) {
+ /* Select the default accelerator */
+ bool have_tcg = accel_find("tcg");
+ bool have_kvm = accel_find("kvm");
+
+ if (have_tcg && have_kvm) {
+ if (g_str_has_suffix(progname, "kvm")) {
+ /* If the program name ends with "kvm", we prefer KVM */
+ accelerators = "kvm:tcg";
+ } else {
+ accelerators = "tcg:kvm";
+ }
+ } else if (have_kvm) {
+ accelerators = "kvm";
+ } else if (have_tcg) {
+ accelerators = "tcg";
+ } else {
+ error_report("No accelerator selected and"
+ " no default accelerator available");
+ exit(1);
+ }
+ }
+ accel_list = g_strsplit(accelerators, ":", 0);
+
+ for (tmp = accel_list; *tmp; tmp++) {
+ /*
+ * Filter invalid accelerators here, to prevent obscenities
+ * such as "-machine accel=tcg,,thread=single".
+ */
+ if (accel_find(*tmp)) {
+ qemu_opts_parse_noisily(qemu_find_opts("accel"), *tmp, true);
+ } else {
+ init_failed = true;
+ error_report("invalid accelerator %s", *tmp);
+ }
+ }
+ g_strfreev(accel_list);
+ } else {
+ if (accelerators != NULL) {
+ error_report("The -accel and \"-machine accel=\" options are incompatible");
+ exit(1);
+ }
+ }
+
+ if (!qemu_opts_foreach(qemu_find_opts("accel"),
+ do_configure_accelerator, &init_failed, &error_fatal)) {
+ if (!init_failed) {
+ error_report("no accelerator found");
+ }
+ exit(1);
+ }
+
+ if (init_failed && !qtest_chrdev) {
+ error_report("falling back to %s", current_accel_name());
+ }
+
+ if (icount_enabled() && !tcg_enabled()) {
+ error_report("-icount is not allowed with hardware virtualization");
+ exit(1);
+ }
+}
+
+static void qemu_validate_options(const QDict *machine_opts)
+{
+ const char *kernel_filename = qdict_get_try_str(machine_opts, "kernel");
+ const char *initrd_filename = qdict_get_try_str(machine_opts, "initrd");
+ const char *kernel_cmdline = qdict_get_try_str(machine_opts, "append");
+
+ if (kernel_filename == NULL) {
+ if (kernel_cmdline != NULL) {
+ error_report("-append only allowed with -kernel option");
+ exit(1);
+ }
+
+ if (initrd_filename != NULL) {
+ error_report("-initrd only allowed with -kernel option");
+ exit(1);
+ }
+ }
+
+ if (loadvm && preconfig_requested) {
+ error_report("'preconfig' and 'loadvm' options are "
+ "mutually exclusive");
+ exit(EXIT_FAILURE);
+ }
+ if (incoming && preconfig_requested && strcmp(incoming, "defer") != 0) {
+ error_report("'preconfig' supports '-incoming defer' only");
+ exit(EXIT_FAILURE);
+ }
+
+#ifdef CONFIG_CURSES
+ if (is_daemonized() && dpy.type == DISPLAY_TYPE_CURSES) {
+ error_report("curses display cannot be used with -daemonize");
+ exit(1);
+ }
+#endif
+}
+
+static void qemu_process_sugar_options(void)
+{
+ if (mem_prealloc) {
+ QObject *smp = qdict_get(machine_opts_dict, "smp");
+ if (smp && qobject_type(smp) == QTYPE_QDICT) {
+ QObject *cpus = qdict_get(qobject_to(QDict, smp), "cpus");
+ if (cpus && qobject_type(cpus) == QTYPE_QSTRING) {
+ const char *val = qstring_get_str(qobject_to(QString, cpus));
+ object_register_sugar_prop("memory-backend", "prealloc-threads",
+ val, false);
+ }
+ }
+ object_register_sugar_prop("memory-backend", "prealloc", "on", false);
+ }
+}
+
+/* -action processing */
+
+/*
+ * Process all the -action parameters parsed from cmdline.
+ */
+static int process_runstate_actions(void *opaque, QemuOpts *opts, Error **errp)
+{
+ Error *local_err = NULL;
+ QDict *qdict = qemu_opts_to_qdict(opts, NULL);
+ QObject *ret = NULL;
+ qmp_marshal_set_action(qdict, &ret, &local_err);
+ qobject_unref(ret);
+ qobject_unref(qdict);
+ if (local_err) {
+ error_propagate(errp, local_err);
+ return 1;
+ }
+ return 0;
+}
+
+static void qemu_process_early_options(void)
+{
+ qemu_opts_foreach(qemu_find_opts("name"),
+ parse_name, NULL, &error_fatal);
+
+ object_option_foreach_add(object_create_pre_sandbox);
+
+#ifdef CONFIG_SECCOMP
+ QemuOptsList *olist = qemu_find_opts_err("sandbox", NULL);
+ if (olist) {
+ qemu_opts_foreach(olist, parse_sandbox, NULL, &error_fatal);
+ }
+#endif
+
+ if (qemu_opts_foreach(qemu_find_opts("action"),
+ process_runstate_actions, NULL, &error_fatal)) {
+ exit(1);
+ }
+
+#ifndef _WIN32
+ qemu_opts_foreach(qemu_find_opts("add-fd"),
+ parse_add_fd, NULL, &error_fatal);
+
+ qemu_opts_foreach(qemu_find_opts("add-fd"),
+ cleanup_add_fd, NULL, &error_fatal);
+#endif
+
+ /* Open the logfile at this point and set the log mask if necessary. */
+ {
+ int mask = 0;
+ if (log_mask) {
+ mask = qemu_str_to_log_mask(log_mask);
+ if (!mask) {
+ qemu_print_log_usage(stdout);
+ exit(1);
+ }
+ }
+ qemu_set_log_filename_flags(log_file, mask, &error_fatal);
+ }
+
+ qemu_add_default_firmwarepath();
+}
+
+static void qemu_process_help_options(void)
+{
+ /*
+ * Check for -cpu help and -device help before we call select_machine(),
+ * which will return an error if the architecture has no default machine
+ * type and the user did not specify one, so that the user doesn't need
+ * to say '-cpu help -machine something'.
+ */
+ if (cpu_option && is_help_option(cpu_option)) {
+ list_cpus();
+ exit(0);
+ }
+
+ if (qemu_opts_foreach(qemu_find_opts("device"),
+ device_help_func, NULL, NULL)) {
+ exit(0);
+ }
+
+ /* -L help lists the data directories and exits. */
+ if (list_data_dirs) {
+ qemu_list_data_dirs();
+ exit(0);
+ }
+}
+
+static void qemu_maybe_daemonize(const char *pid_file)
+{
+ Error *err = NULL;
+
+ os_daemonize();
+ rcu_disable_atfork();
+
+ if (pid_file) {
+ char *pid_file_realpath = NULL;
+
+ if (!qemu_write_pidfile(pid_file, &err)) {
+ error_reportf_err(err, "cannot create PID file: ");
+ exit(1);
+ }
+
+ pid_file_realpath = g_malloc0(PATH_MAX);
+ if (!realpath(pid_file, pid_file_realpath)) {
+ if (errno != ENOENT) {
+ warn_report("not removing PID file on exit: cannot resolve PID "
+ "file path: %s: %s", pid_file, strerror(errno));
+ }
+ return;
+ }
+
+ qemu_unlink_pidfile_notifier = (struct UnlinkPidfileNotifier) {
+ .notifier = {
+ .notify = qemu_unlink_pidfile,
+ },
+ .pid_file_realpath = pid_file_realpath,
+ };
+ qemu_add_exit_notifier(&qemu_unlink_pidfile_notifier.notifier);
+ }
+}
+
+static void qemu_init_displays(void)
+{
+ DisplayState *ds;
+
+ /* init local displays */
+ ds = init_displaystate();
+ qemu_display_init(ds, &dpy);
+
+ /* must be after terminal init, SDL library changes signal handlers */
+ os_setup_signal_handling();
+
+ /* init remote displays */
+#ifdef CONFIG_VNC
+ qemu_opts_foreach(qemu_find_opts("vnc"),
+ vnc_init_func, NULL, &error_fatal);
+#endif
+
+ if (using_spice) {
+ qemu_spice.display_init();
+ }
+}
+
+static void qemu_init_board(void)
+{
+ /* process plugin before CPUs are created, but once -smp has been parsed */
+ qemu_plugin_load_list(&plugin_list, &error_fatal);
+
+ /* From here on we enter MACHINE_PHASE_INITIALIZED. */
+ machine_run_board_init(current_machine, mem_path, &error_fatal);
+
+ drive_check_orphaned();
+
+ realtime_init();
+}
+
+static void qemu_create_cli_devices(void)
+{
+ DeviceOption *opt;
+
+ soundhw_init();
+
+ qemu_opts_foreach(qemu_find_opts("fw_cfg"),
+ parse_fw_cfg, fw_cfg_find(), &error_fatal);
+
+ /* init USB devices */
+ if (machine_usb(current_machine)) {
+ if (foreach_device_config(DEV_USB, usb_parse) < 0)
+ exit(1);
+ }
+
+ /* init generic devices */
+ rom_set_order_override(FW_CFG_ORDER_OVERRIDE_DEVICE);
+ qemu_opts_foreach(qemu_find_opts("device"),
+ device_init_func, NULL, &error_fatal);
+ QTAILQ_FOREACH(opt, &device_opts, next) {
+ DeviceState *dev;
+ loc_push_restore(&opt->loc);
+ /*
+ * TODO Eventually we should call qmp_device_add() here to make sure it
+ * behaves the same, but QMP still has to accept incorrectly typed
+ * options until libvirt is fixed and we want to be strict on the CLI
+ * from the start, so call qdev_device_add_from_qdict() directly for
+ * now.
+ */
+ dev = qdev_device_add_from_qdict(opt->opts, true, &error_fatal);
+ object_unref(OBJECT(dev));
+ loc_pop(&opt->loc);
+ }
+ rom_reset_order_override();
+}
+
+static void qemu_machine_creation_done(void)
+{
+ MachineState *machine = MACHINE(qdev_get_machine());
+
+ /* Did we create any drives that we failed to create a device for? */
+ drive_check_orphaned();
+
+ /* Don't warn about the default network setup that you get if
+ * no command line -net or -netdev options are specified. There
+ * are two cases that we would otherwise complain about:
+ * (1) board doesn't support a NIC but the implicit "-net nic"
+ * requested one
+ * (2) CONFIG_SLIRP not set, in which case the implicit "-net nic"
+ * sets up a nic that isn't connected to anything.
+ */
+ if (!default_net && (!qtest_enabled() || has_defaults)) {
+ net_check_clients();
+ }
+
+ qdev_prop_check_globals();
+
+ qdev_machine_creation_done();
+
+ if (machine->cgs) {
+ /*
+ * Verify that Confidential Guest Support has actually been initialized
+ */
+ assert(machine->cgs->ready);
+ }
+
+ if (foreach_device_config(DEV_GDB, gdbserver_start) < 0) {
+ exit(1);
+ }
+ if (!vga_interface_created && !default_vga &&
+ vga_interface_type != VGA_NONE) {
+ warn_report("A -vga option was passed but this machine "
+ "type does not use that option; "
+ "No VGA device has been created");
+ }
+}
+
+void qmp_x_exit_preconfig(Error **errp)
+{
+ if (phase_check(PHASE_MACHINE_INITIALIZED)) {
+ error_setg(errp, "The command is permitted only before machine initialization");
+ return;
+ }
+
+ qemu_init_board();
+ qemu_create_cli_devices();
+ qemu_machine_creation_done();
+
+ if (loadvm) {
+ load_snapshot(loadvm, NULL, false, NULL, &error_fatal);
+ }
+ if (replay_mode != REPLAY_MODE_NONE) {
+ replay_vmstate_init();
+ }
+
+ if (incoming) {
+ Error *local_err = NULL;
+ if (strcmp(incoming, "defer") != 0) {
+ qmp_migrate_incoming(incoming, &local_err);
+ if (local_err) {
+ error_reportf_err(local_err, "-incoming %s: ", incoming);
+ exit(1);
+ }
+ }
+ } else if (autostart) {
+ qmp_cont(NULL);
+ }
+}
+
+void qemu_init(int argc, char **argv)
+{
+ QemuOpts *opts;
+ QemuOpts *icount_opts = NULL, *accel_opts = NULL;
+ QemuOptsList *olist;
+ int optind;
+ const char *optarg;
+ MachineClass *machine_class;
+ bool userconfig = true;
+ FILE *vmstate_dump_file = NULL;
+
+ qemu_add_opts(&qemu_drive_opts);
+ qemu_add_drive_opts(&qemu_legacy_drive_opts);
+ qemu_add_drive_opts(&qemu_common_drive_opts);
+ qemu_add_drive_opts(&qemu_drive_opts);
+ qemu_add_drive_opts(&bdrv_runtime_opts);
+ qemu_add_opts(&qemu_chardev_opts);
+ qemu_add_opts(&qemu_device_opts);
+ qemu_add_opts(&qemu_netdev_opts);
+ qemu_add_opts(&qemu_nic_opts);
+ qemu_add_opts(&qemu_net_opts);
+ qemu_add_opts(&qemu_rtc_opts);
+ qemu_add_opts(&qemu_global_opts);
+ qemu_add_opts(&qemu_mon_opts);
+ qemu_add_opts(&qemu_trace_opts);
+ qemu_plugin_add_opts();
+ qemu_add_opts(&qemu_option_rom_opts);
+ qemu_add_opts(&qemu_accel_opts);
+ qemu_add_opts(&qemu_mem_opts);
+ qemu_add_opts(&qemu_smp_opts);
+ qemu_add_opts(&qemu_boot_opts);
+ qemu_add_opts(&qemu_add_fd_opts);
+ qemu_add_opts(&qemu_object_opts);
+ qemu_add_opts(&qemu_tpmdev_opts);
+ qemu_add_opts(&qemu_overcommit_opts);
+ qemu_add_opts(&qemu_msg_opts);
+ qemu_add_opts(&qemu_name_opts);
+ qemu_add_opts(&qemu_numa_opts);
+ qemu_add_opts(&qemu_icount_opts);
+ qemu_add_opts(&qemu_semihosting_config_opts);
+ qemu_add_opts(&qemu_fw_cfg_opts);
+ qemu_add_opts(&qemu_action_opts);
+ qemu_add_run_with_opts();
+ module_call_init(MODULE_INIT_OPTS);
+
+ error_init(argv[0]);
+ qemu_init_exec_dir(argv[0]);
+
+ qemu_init_arch_modules();
+
+ qemu_init_subsystems();
+
+ /* first pass of option parsing */
+ optind = 1;
+ while (optind < argc) {
+ if (argv[optind][0] != '-') {
+ /* disk image */
+ optind++;
+ } else {
+ const QEMUOption *popt;
+
+ popt = lookup_opt(argc, argv, &optarg, &optind);
+ switch (popt->index) {
+ case QEMU_OPTION_nouserconfig:
+ userconfig = false;
+ break;
+ }
+ }
+ }
+
+ machine_opts_dict = qdict_new();
+ if (userconfig) {
+ qemu_read_default_config_file(&error_fatal);
+ }
+
+ /* second pass of option parsing */
+ optind = 1;
+ for(;;) {
+ if (optind >= argc)
+ break;
+ if (argv[optind][0] != '-') {
+ loc_set_cmdline(argv, optind, 1);
+ drive_add(IF_DEFAULT, 0, argv[optind++], HD_OPTS);
+ } else {
+ const QEMUOption *popt;
+
+ popt = lookup_opt(argc, argv, &optarg, &optind);
+ if (!(popt->arch_mask & arch_type)) {
+ error_report("Option not supported for this target");
+ exit(1);
+ }
+ switch(popt->index) {
+ case QEMU_OPTION_cpu:
+ /* hw initialization will check this */
+ cpu_option = optarg;
+ break;
+ case QEMU_OPTION_hda:
+ case QEMU_OPTION_hdb:
+ case QEMU_OPTION_hdc:
+ case QEMU_OPTION_hdd:
+ drive_add(IF_DEFAULT, popt->index - QEMU_OPTION_hda, optarg,
+ HD_OPTS);
+ break;
+ case QEMU_OPTION_blockdev:
+ {
+ Visitor *v;
+ BlockdevOptionsQueueEntry *bdo;
+
+ v = qobject_input_visitor_new_str(optarg, "driver",
+ &error_fatal);
+
+ bdo = g_new(BlockdevOptionsQueueEntry, 1);
+ visit_type_BlockdevOptions(v, NULL, &bdo->bdo,
+ &error_fatal);
+ visit_free(v);
+ loc_save(&bdo->loc);
+ QSIMPLEQ_INSERT_TAIL(&bdo_queue, bdo, entry);
+ break;
+ }
+ case QEMU_OPTION_drive:
+ opts = qemu_opts_parse_noisily(qemu_find_opts("drive"),
+ optarg, false);
+ if (opts == NULL) {
+ exit(1);
+ }
+ break;
+ case QEMU_OPTION_set:
+ qemu_set_option(optarg, &error_fatal);
+ break;
+ case QEMU_OPTION_global:
+ if (qemu_global_option(optarg) != 0)
+ exit(1);
+ break;
+ case QEMU_OPTION_mtdblock:
+ drive_add(IF_MTD, -1, optarg, MTD_OPTS);
+ break;
+ case QEMU_OPTION_sd:
+ drive_add(IF_SD, -1, optarg, SD_OPTS);
+ break;
+ case QEMU_OPTION_pflash:
+ drive_add(IF_PFLASH, -1, optarg, PFLASH_OPTS);
+ break;
+ case QEMU_OPTION_snapshot:
+ snapshot = 1;
+ replay_add_blocker("-snapshot");
+ break;
+ case QEMU_OPTION_numa:
+ opts = qemu_opts_parse_noisily(qemu_find_opts("numa"),
+ optarg, true);
+ if (!opts) {
+ exit(1);
+ }
+ break;
+ case QEMU_OPTION_display:
+ parse_display(optarg);
+ break;
+ case QEMU_OPTION_nographic:
+ qdict_put_str(machine_opts_dict, "graphics", "off");
+ nographic = true;
+ dpy.type = DISPLAY_TYPE_NONE;
+ break;
+ case QEMU_OPTION_portrait:
+ graphic_rotate = 90;
+ break;
+ case QEMU_OPTION_rotate:
+ graphic_rotate = strtol(optarg, (char **) &optarg, 10);
+ if (graphic_rotate != 0 && graphic_rotate != 90 &&
+ graphic_rotate != 180 && graphic_rotate != 270) {
+ error_report("only 90, 180, 270 deg rotation is available");
+ exit(1);
+ }
+ break;
+ case QEMU_OPTION_kernel:
+ qdict_put_str(machine_opts_dict, "kernel", optarg);
+ break;
+ case QEMU_OPTION_initrd:
+ qdict_put_str(machine_opts_dict, "initrd", optarg);
+ break;
+ case QEMU_OPTION_append:
+ qdict_put_str(machine_opts_dict, "append", optarg);
+ break;
+ case QEMU_OPTION_dtb:
+ qdict_put_str(machine_opts_dict, "dtb", optarg);
+ break;
+ case QEMU_OPTION_cdrom:
+ drive_add(IF_DEFAULT, 2, optarg, CDROM_OPTS);
+ break;
+ case QEMU_OPTION_boot:
+ machine_parse_property_opt(qemu_find_opts("boot-opts"), "boot", optarg);
+ break;
+ case QEMU_OPTION_fda:
+ case QEMU_OPTION_fdb:
+ drive_add(IF_FLOPPY, popt->index - QEMU_OPTION_fda,
+ optarg, FD_OPTS);
+ break;
+ case QEMU_OPTION_no_fd_bootchk:
+ fd_bootchk = 0;
+ break;
+ case QEMU_OPTION_netdev:
+ default_net = 0;
+ if (netdev_is_modern(optarg)) {
+ netdev_parse_modern(optarg);
+ } else {
+ net_client_parse(qemu_find_opts("netdev"), optarg);
+ }
+ break;
+ case QEMU_OPTION_nic:
+ default_net = 0;
+ net_client_parse(qemu_find_opts("nic"), optarg);
+ break;
+ case QEMU_OPTION_net:
+ default_net = 0;
+ net_client_parse(qemu_find_opts("net"), optarg);
+ break;
+#ifdef CONFIG_LIBISCSI
+ case QEMU_OPTION_iscsi:
+ opts = qemu_opts_parse_noisily(qemu_find_opts("iscsi"),
+ optarg, false);
+ if (!opts) {
+ exit(1);
+ }
+ break;
+#endif
+ case QEMU_OPTION_audiodev:
+ audio_parse_option(optarg);
+ break;
+ case QEMU_OPTION_audio: {
+ bool help;
+ char *model;
+ Audiodev *dev = NULL;
+ Visitor *v;
+ QDict *dict = keyval_parse(optarg, "driver", &help, &error_fatal);
+ if (help || (qdict_haskey(dict, "driver") &&
+ is_help_option(qdict_get_str(dict, "driver")))) {
+ audio_help();
+ exit(EXIT_SUCCESS);
+ }
+ if (!qdict_haskey(dict, "id")) {
+ qdict_put_str(dict, "id", "audiodev0");
+ }
+ if (!qdict_haskey(dict, "model")) {
+ error_setg(&error_fatal, "Parameter 'model' is missing");
+ }
+ model = g_strdup(qdict_get_str(dict, "model"));
+ qdict_del(dict, "model");
+ if (is_help_option(model)) {
+ show_valid_soundhw();
+ exit(0);
+ }
+ v = qobject_input_visitor_new_keyval(QOBJECT(dict));
+ qobject_unref(dict);
+ visit_type_Audiodev(v, NULL, &dev, &error_fatal);
+ visit_free(v);
+ audio_define(dev);
+ select_soundhw(model, dev->id);
+ g_free(model);
+ break;
+ }
+ case QEMU_OPTION_h:
+ help(0);
+ break;
+ case QEMU_OPTION_version:
+ version();
+ exit(0);
+ break;
+ case QEMU_OPTION_m:
+ opts = qemu_opts_parse_noisily(qemu_find_opts("memory"), optarg, true);
+ if (opts == NULL) {
+ exit(1);
+ }
+ break;
+#ifdef CONFIG_TPM
+ case QEMU_OPTION_tpmdev:
+ if (tpm_config_parse(qemu_find_opts("tpmdev"), optarg) < 0) {
+ exit(1);
+ }
+ break;
+#endif
+ case QEMU_OPTION_mempath:
+ mem_path = optarg;
+ break;
+ case QEMU_OPTION_mem_prealloc:
+ mem_prealloc = 1;
+ break;
+ case QEMU_OPTION_d:
+ log_mask = optarg;
+ break;
+ case QEMU_OPTION_D:
+ log_file = optarg;
+ break;
+ case QEMU_OPTION_DFILTER:
+ qemu_set_dfilter_ranges(optarg, &error_fatal);
+ break;
+#if defined(CONFIG_TCG) && defined(CONFIG_LINUX)
+ case QEMU_OPTION_perfmap:
+ perf_enable_perfmap();
+ break;
+ case QEMU_OPTION_jitdump:
+ perf_enable_jitdump();
+ break;
+#endif
+ case QEMU_OPTION_seed:
+ qemu_guest_random_seed_main(optarg, &error_fatal);
+ break;
+ case QEMU_OPTION_s:
+ add_device_config(DEV_GDB, "tcp::" DEFAULT_GDBSTUB_PORT);
+ break;
+ case QEMU_OPTION_gdb:
+ add_device_config(DEV_GDB, optarg);
+ break;
+ case QEMU_OPTION_L:
+ if (is_help_option(optarg)) {
+ list_data_dirs = true;
+ } else {
+ qemu_add_data_dir(g_strdup(optarg));
+ }
+ break;
+ case QEMU_OPTION_bios:
+ qdict_put_str(machine_opts_dict, "firmware", optarg);
+ break;
+ case QEMU_OPTION_singlestep:
+ opt_one_insn_per_tb = true;
+ break;
+ case QEMU_OPTION_S:
+ autostart = 0;
+ break;
+ case QEMU_OPTION_k:
+ keyboard_layout = optarg;
+ break;
+ case QEMU_OPTION_vga:
+ vga_model = optarg;
+ default_vga = 0;
+ break;
+ case QEMU_OPTION_g:
+ {
+ const char *p;
+ int w, h, depth;
+ p = optarg;
+ w = strtol(p, (char **)&p, 10);
+ if (w <= 0) {
+ graphic_error:
+ error_report("invalid resolution or depth");
+ exit(1);
+ }
+ if (*p != 'x')
+ goto graphic_error;
+ p++;
+ h = strtol(p, (char **)&p, 10);
+ if (h <= 0)
+ goto graphic_error;
+ if (*p == 'x') {
+ p++;
+ depth = strtol(p, (char **)&p, 10);
+ if (depth != 1 && depth != 2 && depth != 4 &&
+ depth != 8 && depth != 15 && depth != 16 &&
+ depth != 24 && depth != 32)
+ goto graphic_error;
+ } else if (*p == '\0') {
+ depth = graphic_depth;
+ } else {
+ goto graphic_error;
+ }
+
+ graphic_width = w;
+ graphic_height = h;
+ graphic_depth = depth;
+ }
+ break;
+ case QEMU_OPTION_echr:
+ {
+ char *r;
+ term_escape_char = strtol(optarg, &r, 0);
+ if (r == optarg)
+ printf("Bad argument to echr\n");
+ break;
+ }
+ case QEMU_OPTION_monitor:
+ default_monitor = 0;
+ if (strncmp(optarg, "none", 4)) {
+ monitor_parse(optarg, "readline", false);
+ }
+ break;
+ case QEMU_OPTION_qmp:
+ monitor_parse(optarg, "control", false);
+ default_monitor = 0;
+ break;
+ case QEMU_OPTION_qmp_pretty:
+ monitor_parse(optarg, "control", true);
+ default_monitor = 0;
+ break;
+ case QEMU_OPTION_mon:
+ opts = qemu_opts_parse_noisily(qemu_find_opts("mon"), optarg,
+ true);
+ if (!opts) {
+ exit(1);
+ }
+ default_monitor = 0;
+ break;
+ case QEMU_OPTION_chardev:
+ opts = qemu_opts_parse_noisily(qemu_find_opts("chardev"),
+ optarg, true);
+ if (!opts) {
+ exit(1);
+ }
+ break;
+ case QEMU_OPTION_fsdev:
+ olist = qemu_find_opts("fsdev");
+ if (!olist) {
+ error_report("fsdev support is disabled");
+ exit(1);
+ }
+ opts = qemu_opts_parse_noisily(olist, optarg, true);
+ if (!opts) {
+ exit(1);
+ }
+ break;
+ case QEMU_OPTION_virtfs: {
+ QemuOpts *fsdev;
+ QemuOpts *device;
+ const char *writeout, *sock_fd, *socket, *path, *security_model,
+ *multidevs;
+
+ olist = qemu_find_opts("virtfs");
+ if (!olist) {
+ error_report("virtfs support is disabled");
+ exit(1);
+ }
+ opts = qemu_opts_parse_noisily(olist, optarg, true);
+ if (!opts) {
+ exit(1);
+ }
+
+ if (qemu_opt_get(opts, "fsdriver") == NULL ||
+ qemu_opt_get(opts, "mount_tag") == NULL) {
+ error_report("Usage: -virtfs fsdriver,mount_tag=tag");
+ exit(1);
+ }
+ fsdev = qemu_opts_create(qemu_find_opts("fsdev"),
+ qemu_opts_id(opts) ?:
+ qemu_opt_get(opts, "mount_tag"),
+ 1, NULL);
+ if (!fsdev) {
+ error_report("duplicate or invalid fsdev id: %s",
+ qemu_opt_get(opts, "mount_tag"));
+ exit(1);
+ }
+
+ writeout = qemu_opt_get(opts, "writeout");
+ if (writeout) {
+#ifdef CONFIG_SYNC_FILE_RANGE
+ qemu_opt_set(fsdev, "writeout", writeout, &error_abort);
+#else
+ error_report("writeout=immediate not supported "
+ "on this platform");
+ exit(1);
+#endif
+ }
+ qemu_opt_set(fsdev, "fsdriver",
+ qemu_opt_get(opts, "fsdriver"), &error_abort);
+ path = qemu_opt_get(opts, "path");
+ if (path) {
+ qemu_opt_set(fsdev, "path", path, &error_abort);
+ }
+ security_model = qemu_opt_get(opts, "security_model");
+ if (security_model) {
+ qemu_opt_set(fsdev, "security_model", security_model,
+ &error_abort);
+ }
+ socket = qemu_opt_get(opts, "socket");
+ if (socket) {
+ qemu_opt_set(fsdev, "socket", socket, &error_abort);
+ }
+ sock_fd = qemu_opt_get(opts, "sock_fd");
+ if (sock_fd) {
+ qemu_opt_set(fsdev, "sock_fd", sock_fd, &error_abort);
+ }
+
+ qemu_opt_set_bool(fsdev, "readonly",
+ qemu_opt_get_bool(opts, "readonly", 0),
+ &error_abort);
+ multidevs = qemu_opt_get(opts, "multidevs");
+ if (multidevs) {
+ qemu_opt_set(fsdev, "multidevs", multidevs, &error_abort);
+ }
+ device = qemu_opts_create(qemu_find_opts("device"), NULL, 0,
+ &error_abort);
+ qemu_opt_set(device, "driver", "virtio-9p-pci", &error_abort);
+ qemu_opt_set(device, "fsdev",
+ qemu_opts_id(fsdev), &error_abort);
+ qemu_opt_set(device, "mount_tag",
+ qemu_opt_get(opts, "mount_tag"), &error_abort);
+ break;
+ }
+ case QEMU_OPTION_serial:
+ add_device_config(DEV_SERIAL, optarg);
+ default_serial = 0;
+ if (strncmp(optarg, "mon:", 4) == 0) {
+ default_monitor = 0;
+ }
+ break;
+ case QEMU_OPTION_action:
+ olist = qemu_find_opts("action");
+ if (!qemu_opts_parse_noisily(olist, optarg, false)) {
+ exit(1);
+ }
+ break;
+ case QEMU_OPTION_watchdog_action: {
+ opts = qemu_opts_create(qemu_find_opts("action"), NULL, 0, &error_abort);
+ qemu_opt_set(opts, "watchdog", optarg, &error_abort);
+ break;
+ }
+ case QEMU_OPTION_parallel:
+ add_device_config(DEV_PARALLEL, optarg);
+ default_parallel = 0;
+ if (strncmp(optarg, "mon:", 4) == 0) {
+ default_monitor = 0;
+ }
+ break;
+ case QEMU_OPTION_debugcon:
+ add_device_config(DEV_DEBUGCON, optarg);
+ break;
+ case QEMU_OPTION_loadvm:
+ loadvm = optarg;
+ break;
+ case QEMU_OPTION_full_screen:
+ dpy.has_full_screen = true;
+ dpy.full_screen = true;
+ break;
+ case QEMU_OPTION_pidfile:
+ pid_file = optarg;
+ break;
+ case QEMU_OPTION_win2k_hack:
+ win2k_install_hack = 1;
+ break;
+ case QEMU_OPTION_acpitable:
+ opts = qemu_opts_parse_noisily(qemu_find_opts("acpi"),
+ optarg, true);
+ if (!opts) {
+ exit(1);
+ }
+ acpi_table_add(opts, &error_fatal);
+ break;
+ case QEMU_OPTION_smbios:
+ opts = qemu_opts_parse_noisily(qemu_find_opts("smbios"),
+ optarg, false);
+ if (!opts) {
+ exit(1);
+ }
+ smbios_entry_add(opts, &error_fatal);
+ break;
+ case QEMU_OPTION_fwcfg:
+ opts = qemu_opts_parse_noisily(qemu_find_opts("fw_cfg"),
+ optarg, true);
+ if (opts == NULL) {
+ exit(1);
+ }
+ break;
+ case QEMU_OPTION_preconfig:
+ preconfig_requested = true;
+ break;
+ case QEMU_OPTION_enable_kvm:
+ qdict_put_str(machine_opts_dict, "accel", "kvm");
+ break;
+ case QEMU_OPTION_M:
+ case QEMU_OPTION_machine:
+ {
+ bool help;
+
+ keyval_parse_into(machine_opts_dict, optarg, "type", &help, &error_fatal);
+ if (help) {
+ machine_help_func(machine_opts_dict);
+ exit(EXIT_SUCCESS);
+ }
+ break;
+ }
+ case QEMU_OPTION_accel:
+ accel_opts = qemu_opts_parse_noisily(qemu_find_opts("accel"),
+ optarg, true);
+ optarg = qemu_opt_get(accel_opts, "accel");
+ if (!optarg || is_help_option(optarg)) {
+ printf("Accelerators supported in QEMU binary:\n");
+ GSList *el, *accel_list = object_class_get_list(TYPE_ACCEL,
+ false);
+ for (el = accel_list; el; el = el->next) {
+ gchar *typename = g_strdup(object_class_get_name(
+ OBJECT_CLASS(el->data)));
+ /* omit qtest which is used for tests only */
+ if (g_strcmp0(typename, ACCEL_CLASS_NAME("qtest")) &&
+ g_str_has_suffix(typename, ACCEL_CLASS_SUFFIX)) {
+ gchar **optname = g_strsplit(typename,
+ ACCEL_CLASS_SUFFIX, 0);
+ printf("%s\n", optname[0]);
+ g_strfreev(optname);
+ }
+ g_free(typename);
+ }
+ g_slist_free(accel_list);
+ exit(0);
+ }
+ break;
+ case QEMU_OPTION_usb:
+ qdict_put_str(machine_opts_dict, "usb", "on");
+ break;
+ case QEMU_OPTION_usbdevice:
+ qdict_put_str(machine_opts_dict, "usb", "on");
+ add_device_config(DEV_USB, optarg);
+ break;
+ case QEMU_OPTION_device:
+ if (optarg[0] == '{') {
+ QObject *obj = qobject_from_json(optarg, &error_fatal);
+ DeviceOption *opt = g_new0(DeviceOption, 1);
+ opt->opts = qobject_to(QDict, obj);
+ loc_save(&opt->loc);
+ assert(opt->opts != NULL);
+ QTAILQ_INSERT_TAIL(&device_opts, opt, next);
+ } else {
+ if (!qemu_opts_parse_noisily(qemu_find_opts("device"),
+ optarg, true)) {
+ exit(1);
+ }
+ }
+ break;
+ case QEMU_OPTION_smp:
+ machine_parse_property_opt(qemu_find_opts("smp-opts"),
+ "smp", optarg);
+ break;
+ case QEMU_OPTION_vnc:
+ vnc_parse(optarg);
+ break;
+ case QEMU_OPTION_no_acpi:
+ warn_report("-no-acpi is deprecated, use '-machine acpi=off' instead");
+ qdict_put_str(machine_opts_dict, "acpi", "off");
+ break;
+ case QEMU_OPTION_no_hpet:
+ warn_report("-no-hpet is deprecated, use '-machine hpet=off' instead");
+ qdict_put_str(machine_opts_dict, "hpet", "off");
+ break;
+ case QEMU_OPTION_no_reboot:
+ olist = qemu_find_opts("action");
+ qemu_opts_parse_noisily(olist, "reboot=shutdown", false);
+ break;
+ case QEMU_OPTION_no_shutdown:
+ olist = qemu_find_opts("action");
+ qemu_opts_parse_noisily(olist, "shutdown=pause", false);
+ break;
+ case QEMU_OPTION_uuid:
+ if (qemu_uuid_parse(optarg, &qemu_uuid) < 0) {
+ error_report("failed to parse UUID string: wrong format");
+ exit(1);
+ }
+ qemu_uuid_set = true;
+ break;
+ case QEMU_OPTION_option_rom:
+ if (nb_option_roms >= MAX_OPTION_ROMS) {
+ error_report("too many option ROMs");
+ exit(1);
+ }
+ opts = qemu_opts_parse_noisily(qemu_find_opts("option-rom"),
+ optarg, true);
+ if (!opts) {
+ exit(1);
+ }
+ option_rom[nb_option_roms].name = qemu_opt_get(opts, "romfile");
+ option_rom[nb_option_roms].bootindex =
+ qemu_opt_get_number(opts, "bootindex", -1);
+ if (!option_rom[nb_option_roms].name) {
+ error_report("Option ROM file is not specified");
+ exit(1);
+ }
+ nb_option_roms++;
+ break;
+ case QEMU_OPTION_semihosting:
+ qemu_semihosting_enable();
+ break;
+ case QEMU_OPTION_semihosting_config:
+ if (qemu_semihosting_config_options(optarg) != 0) {
+ exit(1);
+ }
+ break;
+ case QEMU_OPTION_name:
+ opts = qemu_opts_parse_noisily(qemu_find_opts("name"),
+ optarg, true);
+ if (!opts) {
+ exit(1);
+ }
+ /* Capture guest name if -msg guest-name is used later */
+ error_guest_name = qemu_opt_get(opts, "guest");
+ break;
+ case QEMU_OPTION_prom_env:
+ if (nb_prom_envs >= MAX_PROM_ENVS) {
+ error_report("too many prom variables");
+ exit(1);
+ }
+ prom_envs[nb_prom_envs] = optarg;
+ nb_prom_envs++;
+ break;
+ case QEMU_OPTION_old_param:
+ old_param = 1;
+ break;
+ case QEMU_OPTION_rtc:
+ opts = qemu_opts_parse_noisily(qemu_find_opts("rtc"), optarg,
+ false);
+ if (!opts) {
+ exit(1);
+ }
+ break;
+ case QEMU_OPTION_icount:
+ icount_opts = qemu_opts_parse_noisily(qemu_find_opts("icount"),
+ optarg, true);
+ if (!icount_opts) {
+ exit(1);
+ }
+ break;
+ case QEMU_OPTION_incoming:
+ if (!incoming) {
+ runstate_set(RUN_STATE_INMIGRATE);
+ }
+ incoming = optarg;
+ break;
+ case QEMU_OPTION_only_migratable:
+ only_migratable = 1;
+ break;
+ case QEMU_OPTION_nodefaults:
+ has_defaults = 0;
+ break;
+ case QEMU_OPTION_xen_domid:
+ if (!(accel_find("xen")) && !(accel_find("kvm"))) {
+ error_report("Option not supported for this target");
+ exit(1);
+ }
+ xen_domid = atoi(optarg);
+ break;
+ case QEMU_OPTION_xen_attach:
+ if (!(accel_find("xen"))) {
+ error_report("Option not supported for this target");
+ exit(1);
+ }
+ xen_mode = XEN_ATTACH;
+ break;
+ case QEMU_OPTION_xen_domid_restrict:
+ if (!(accel_find("xen"))) {
+ error_report("Option not supported for this target");
+ exit(1);
+ }
+ xen_domid_restrict = true;
+ break;
+ case QEMU_OPTION_trace:
+ trace_opt_parse(optarg);
+ break;
+ case QEMU_OPTION_plugin:
+ qemu_plugin_opt_parse(optarg, &plugin_list);
+ break;
+ case QEMU_OPTION_readconfig:
+ qemu_read_config_file(optarg, qemu_parse_config_group, &error_fatal);
+ break;
+#ifdef CONFIG_SPICE
+ case QEMU_OPTION_spice:
+ olist = qemu_find_opts_err("spice", NULL);
+ if (!olist) {
+ error_report("spice support is disabled");
+ exit(1);
+ }
+ opts = qemu_opts_parse_noisily(olist, optarg, false);
+ if (!opts) {
+ exit(1);
+ }
+ display_remote++;
+ break;
+#endif
+ case QEMU_OPTION_qtest:
+ qtest_chrdev = optarg;
+ break;
+ case QEMU_OPTION_qtest_log:
+ qtest_log = optarg;
+ break;
+ case QEMU_OPTION_sandbox:
+ olist = qemu_find_opts("sandbox");
+ if (!olist) {
+#ifndef CONFIG_SECCOMP
+ error_report("-sandbox support is not enabled "
+ "in this QEMU binary");
+#endif
+ exit(1);
+ }
+
+ opts = qemu_opts_parse_noisily(olist, optarg, true);
+ if (!opts) {
+ exit(1);
+ }
+ break;
+ case QEMU_OPTION_add_fd:
+#ifndef _WIN32
+ opts = qemu_opts_parse_noisily(qemu_find_opts("add-fd"),
+ optarg, false);
+ if (!opts) {
+ exit(1);
+ }
+#else
+ error_report("File descriptor passing is disabled on this "
+ "platform");
+ exit(1);
+#endif
+ break;
+ case QEMU_OPTION_object:
+ object_option_parse(optarg);
+ break;
+ case QEMU_OPTION_overcommit:
+ opts = qemu_opts_parse_noisily(qemu_find_opts("overcommit"),
+ optarg, false);
+ if (!opts) {
+ exit(1);
+ }
+ enable_mlock = qemu_opt_get_bool(opts, "mem-lock", false);
+ enable_cpu_pm = qemu_opt_get_bool(opts, "cpu-pm", false);
+ break;
+ case QEMU_OPTION_compat:
+ {
+ CompatPolicy *opts_policy;
+ Visitor *v;
+
+ v = qobject_input_visitor_new_str(optarg, NULL,
+ &error_fatal);
+
+ visit_type_CompatPolicy(v, NULL, &opts_policy, &error_fatal);
+ QAPI_CLONE_MEMBERS(CompatPolicy, &compat_policy, opts_policy);
+
+ qapi_free_CompatPolicy(opts_policy);
+ visit_free(v);
+ break;
+ }
+ case QEMU_OPTION_msg:
+ opts = qemu_opts_parse_noisily(qemu_find_opts("msg"), optarg,
+ false);
+ if (!opts) {
+ exit(1);
+ }
+ configure_msg(opts);
+ break;
+ case QEMU_OPTION_dump_vmstate:
+ if (vmstate_dump_file) {
+ error_report("only one '-dump-vmstate' "
+ "option may be given");
+ exit(1);
+ }
+ vmstate_dump_file = fopen(optarg, "w");
+ if (vmstate_dump_file == NULL) {
+ error_report("open %s: %s", optarg, strerror(errno));
+ exit(1);
+ }
+ break;
+ case QEMU_OPTION_enable_sync_profile:
+ qsp_enable();
+ break;
+ case QEMU_OPTION_nouserconfig:
+ /* Nothing to be parsed here. Especially, do not error out below. */
+ break;
+#if defined(CONFIG_POSIX)
+ case QEMU_OPTION_runas:
+ if (!os_set_runas(optarg)) {
+ error_report("User \"%s\" doesn't exist"
+ " (and is not <uid>:<gid>)",
+ optarg);
+ exit(1);
+ }
+ break;
+ case QEMU_OPTION_chroot:
+ warn_report("option is deprecated,"
+ " use '-run-with chroot=...' instead");
+ os_set_chroot(optarg);
+ break;
+ case QEMU_OPTION_daemonize:
+ os_set_daemonize(true);
+ break;
+#if defined(CONFIG_LINUX)
+ /* deprecated */
+ case QEMU_OPTION_asyncteardown:
+ init_async_teardown();
+ break;
+#endif
+ case QEMU_OPTION_run_with: {
+ const char *str;
+ opts = qemu_opts_parse_noisily(qemu_find_opts("run-with"),
+ optarg, false);
+ if (!opts) {
+ exit(1);
+ }
+#if defined(CONFIG_LINUX)
+ if (qemu_opt_get_bool(opts, "async-teardown", false)) {
+ init_async_teardown();
+ }
+#endif
+ str = qemu_opt_get(opts, "chroot");
+ if (str) {
+ os_set_chroot(str);
+ }
+ break;
+ }
+#endif /* CONFIG_POSIX */
+
+ default:
+ error_report("Option not supported in this build");
+ exit(1);
+ }
+ }
+ }
+ /*
+ * Clear error location left behind by the loop.
+ * Best done right after the loop. Do not insert code here!
+ */
+ loc_set_none();
+
+ qemu_validate_options(machine_opts_dict);
+ qemu_process_sugar_options();
+
+ /*
+ * These options affect everything else and should be processed
+ * before daemonizing.
+ */
+ qemu_process_early_options();
+
+ qemu_process_help_options();
+ qemu_maybe_daemonize(pid_file);
+
+ /*
+ * The trace backend must be initialized after daemonizing.
+ * trace_init_backends() will call st_init(), which will create the
+ * trace thread in the parent, and also register st_flush_trace_buffer()
+ * in atexit(). This function will force the parent to wait for the
+ * writeout thread to finish, which will not occur, and the parent
+ * process will be left in the host.
+ */
+ if (!trace_init_backends()) {
+ exit(1);
+ }
+ trace_init_file();
+
+ qemu_init_main_loop(&error_fatal);
+ cpu_timers_init();
+
+ user_register_global_props();
+ replay_configure(icount_opts);
+
+ configure_rtc(qemu_find_opts_singleton("rtc"));
+
+ /* Transfer QemuOpts options into machine options */
+ parse_memory_options();
+
+ qemu_create_machine(machine_opts_dict);
+
+ suspend_mux_open();
+
+ qemu_disable_default_devices();
+ qemu_create_default_devices();
+ qemu_create_early_backends();
+
+ qemu_apply_legacy_machine_options(machine_opts_dict);
+ qemu_apply_machine_options(machine_opts_dict);
+ qobject_unref(machine_opts_dict);
+ phase_advance(PHASE_MACHINE_CREATED);
+
+ /*
+ * Note: uses machine properties such as kernel-irqchip, must run
+ * after qemu_apply_machine_options.
+ */
+ configure_accelerators(argv[0]);
+ phase_advance(PHASE_ACCEL_CREATED);
+
+ /*
+ * Beware, QOM objects created before this point miss global and
+ * compat properties.
+ *
+ * Global properties get set up by qdev_prop_register_global(),
+ * called from user_register_global_props(), and certain option
+ * desugaring. Also in CPU feature desugaring (buried in
+ * parse_cpu_option()), which happens below this point, but may
+ * only target the CPU type, which can only be created after
+ * parse_cpu_option() returned the type.
+ *
+ * Machine compat properties: object_set_machine_compat_props().
+ * Accelerator compat props: object_set_accelerator_compat_props(),
+ * called from do_configure_accelerator().
+ */
+
+ machine_class = MACHINE_GET_CLASS(current_machine);
+ if (!qtest_enabled() && machine_class->deprecation_reason) {
+ warn_report("Machine type '%s' is deprecated: %s",
+ machine_class->name, machine_class->deprecation_reason);
+ }
+
+ /*
+ * Create backends before creating migration objects, so that it can
+ * check against compatibilities on the backend memories (e.g. postcopy
+ * over memory-backend-file objects).
+ */
+ qemu_create_late_backends();
+
+ /*
+ * Note: creates a QOM object, must run only after global and
+ * compat properties have been set up.
+ */
+ migration_object_init();
+
+ /* parse features once if machine provides default cpu_type */
+ current_machine->cpu_type = machine_class->default_cpu_type;
+ if (cpu_option) {
+ current_machine->cpu_type = parse_cpu_option(cpu_option);
+ }
+ /* NB: for machine none cpu_type could STILL be NULL here! */
+
+ qemu_resolve_machine_memdev();
+ parse_numa_opts(current_machine);
+
+ if (vmstate_dump_file) {
+ /* dump and exit */
+ module_load_qom_all();
+ dump_vmstate_json_to_file(vmstate_dump_file);
+ exit(0);
+ }
+
+ if (!preconfig_requested) {
+ qmp_x_exit_preconfig(&error_fatal);
+ }
+ qemu_init_displays();
+ accel_setup_post(current_machine);
+ os_setup_post();
+ resume_mux_open();
+}
diff --git a/system/watchpoint.c b/system/watchpoint.c
new file mode 100644
index 0000000000..45d1f12faf
--- /dev/null
+++ b/system/watchpoint.c
@@ -0,0 +1,226 @@
+/*
+ * CPU watchpoints
+ *
+ * Copyright (c) 2003 Fabrice Bellard
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/main-loop.h"
+#include "qemu/error-report.h"
+#include "exec/exec-all.h"
+#include "exec/translate-all.h"
+#include "sysemu/tcg.h"
+#include "sysemu/replay.h"
+#include "hw/core/tcg-cpu-ops.h"
+#include "hw/core/cpu.h"
+
+/* Add a watchpoint. */
+int cpu_watchpoint_insert(CPUState *cpu, vaddr addr, vaddr len,
+ int flags, CPUWatchpoint **watchpoint)
+{
+ CPUWatchpoint *wp;
+ vaddr in_page;
+
+ /* forbid ranges which are empty or run off the end of the address space */
+ if (len == 0 || (addr + len - 1) < addr) {
+ error_report("tried to set invalid watchpoint at %"
+ VADDR_PRIx ", len=%" VADDR_PRIu, addr, len);
+ return -EINVAL;
+ }
+ wp = g_malloc(sizeof(*wp));
+
+ wp->vaddr = addr;
+ wp->len = len;
+ wp->flags = flags;
+
+ /* keep all GDB-injected watchpoints in front */
+ if (flags & BP_GDB) {
+ QTAILQ_INSERT_HEAD(&cpu->watchpoints, wp, entry);
+ } else {
+ QTAILQ_INSERT_TAIL(&cpu->watchpoints, wp, entry);
+ }
+
+ in_page = -(addr | TARGET_PAGE_MASK);
+ if (len <= in_page) {
+ tlb_flush_page(cpu, addr);
+ } else {
+ tlb_flush(cpu);
+ }
+
+ if (watchpoint) {
+ *watchpoint = wp;
+ }
+ return 0;
+}
+
+/* Remove a specific watchpoint. */
+int cpu_watchpoint_remove(CPUState *cpu, vaddr addr, vaddr len,
+ int flags)
+{
+ CPUWatchpoint *wp;
+
+ QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
+ if (addr == wp->vaddr && len == wp->len
+ && flags == (wp->flags & ~BP_WATCHPOINT_HIT)) {
+ cpu_watchpoint_remove_by_ref(cpu, wp);
+ return 0;
+ }
+ }
+ return -ENOENT;
+}
+
+/* Remove a specific watchpoint by reference. */
+void cpu_watchpoint_remove_by_ref(CPUState *cpu, CPUWatchpoint *watchpoint)
+{
+ QTAILQ_REMOVE(&cpu->watchpoints, watchpoint, entry);
+
+ tlb_flush_page(cpu, watchpoint->vaddr);
+
+ g_free(watchpoint);
+}
+
+/* Remove all matching watchpoints. */
+void cpu_watchpoint_remove_all(CPUState *cpu, int mask)
+{
+ CPUWatchpoint *wp, *next;
+
+ QTAILQ_FOREACH_SAFE(wp, &cpu->watchpoints, entry, next) {
+ if (wp->flags & mask) {
+ cpu_watchpoint_remove_by_ref(cpu, wp);
+ }
+ }
+}
+
+#ifdef CONFIG_TCG
+
+/*
+ * Return true if this watchpoint address matches the specified
+ * access (ie the address range covered by the watchpoint overlaps
+ * partially or completely with the address range covered by the
+ * access).
+ */
+static inline bool watchpoint_address_matches(CPUWatchpoint *wp,
+ vaddr addr, vaddr len)
+{
+ /*
+ * We know the lengths are non-zero, but a little caution is
+ * required to avoid errors in the case where the range ends
+ * exactly at the top of the address space and so addr + len
+ * wraps round to zero.
+ */
+ vaddr wpend = wp->vaddr + wp->len - 1;
+ vaddr addrend = addr + len - 1;
+
+ return !(addr > wpend || wp->vaddr > addrend);
+}
+
+/* Return flags for watchpoints that match addr + prot. */
+int cpu_watchpoint_address_matches(CPUState *cpu, vaddr addr, vaddr len)
+{
+ CPUWatchpoint *wp;
+ int ret = 0;
+
+ QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
+ if (watchpoint_address_matches(wp, addr, len)) {
+ ret |= wp->flags;
+ }
+ }
+ return ret;
+}
+
+/* Generate a debug exception if a watchpoint has been hit. */
+void cpu_check_watchpoint(CPUState *cpu, vaddr addr, vaddr len,
+ MemTxAttrs attrs, int flags, uintptr_t ra)
+{
+ CPUClass *cc = CPU_GET_CLASS(cpu);
+ CPUWatchpoint *wp;
+
+ assert(tcg_enabled());
+ if (cpu->watchpoint_hit) {
+ /*
+ * We re-entered the check after replacing the TB.
+ * Now raise the debug interrupt so that it will
+ * trigger after the current instruction.
+ */
+ qemu_mutex_lock_iothread();
+ cpu_interrupt(cpu, CPU_INTERRUPT_DEBUG);
+ qemu_mutex_unlock_iothread();
+ return;
+ }
+
+ if (cc->tcg_ops->adjust_watchpoint_address) {
+ /* this is currently used only by ARM BE32 */
+ addr = cc->tcg_ops->adjust_watchpoint_address(cpu, addr, len);
+ }
+
+ assert((flags & ~BP_MEM_ACCESS) == 0);
+ QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
+ int hit_flags = wp->flags & flags;
+
+ if (hit_flags && watchpoint_address_matches(wp, addr, len)) {
+ if (replay_running_debug()) {
+ /*
+ * replay_breakpoint reads icount.
+ * Force recompile to succeed, because icount may
+ * be read only at the end of the block.
+ */
+ if (!cpu->neg.can_do_io) {
+ /* Force execution of one insn next time. */
+ cpu->cflags_next_tb = 1 | CF_LAST_IO | CF_NOIRQ
+ | curr_cflags(cpu);
+ cpu_loop_exit_restore(cpu, ra);
+ }
+ /*
+ * Don't process the watchpoints when we are
+ * in a reverse debugging operation.
+ */
+ replay_breakpoint();
+ return;
+ }
+
+ wp->flags |= hit_flags << BP_HIT_SHIFT;
+ wp->hitaddr = MAX(addr, wp->vaddr);
+ wp->hitattrs = attrs;
+
+ if (wp->flags & BP_CPU
+ && cc->tcg_ops->debug_check_watchpoint
+ && !cc->tcg_ops->debug_check_watchpoint(cpu, wp)) {
+ wp->flags &= ~BP_WATCHPOINT_HIT;
+ continue;
+ }
+ cpu->watchpoint_hit = wp;
+
+ mmap_lock();
+ /* This call also restores vCPU state */
+ tb_check_watchpoint(cpu, ra);
+ if (wp->flags & BP_STOP_BEFORE_ACCESS) {
+ cpu->exception_index = EXCP_DEBUG;
+ mmap_unlock();
+ cpu_loop_exit(cpu);
+ } else {
+ /* Force execution of one insn next time. */
+ cpu->cflags_next_tb = 1 | CF_LAST_IO | CF_NOIRQ
+ | curr_cflags(cpu);
+ mmap_unlock();
+ cpu_loop_exit_noexc(cpu);
+ }
+ } else {
+ wp->flags &= ~BP_WATCHPOINT_HIT;
+ }
+ }
+}
+
+#endif /* CONFIG_TCG */