diff options
author | Stefan Hajnoczi <stefanha@redhat.com> | 2022-10-30 18:31:59 -0400 |
---|---|---|
committer | Stefan Hajnoczi <stefanha@redhat.com> | 2022-10-30 18:31:59 -0400 |
commit | 7208429223963c405c62fa2611398f1aa8033593 (patch) | |
tree | 4e4bb0a31f0b2be625b9901d811e0c522d16688d | |
parent | 5eff7badae58fdfb8ff179e72cd5546425cf8a6f (diff) | |
parent | bd77c30df984faefa85e6a402939b485d6e05f05 (diff) |
Merge tag 'mem-2022-10-28' of https://github.com/davidhildenbrand/qemu into staging
Hi,
"Host Memory Backends" and "Memory devices" queue ("mem"):
- Fix NVDIMM error message
- Add ThreadContext user-creatable object and wire it up for NUMA-aware
hostmem preallocation
# -----BEGIN PGP SIGNATURE-----
#
# iQJFBAABCAAvFiEEG9nKrXNcTDpGDfzKTd4Q9wD/g1oFAmNbpHARHGRhdmlkQHJl
# ZGhhdC5jb20ACgkQTd4Q9wD/g1pDpw//bG9cyIlzTzDnU5pbQiXyLm0nF9tW/tli
# npGPSbFFYz/72XD9VJSVLhbNHoQSmFcMK5m/DA4WAMdOc5zF7lP3XdZcj72pDyxu
# 31hJRvuRhxNb09jhEdWRfX5+Jg9UyYXuIvtKXHSWgrtaYDtHBdTXq/ojZlvlo/rr
# 36v0jaVaTNRs7dKQL2oaN+DSMiPXHxBzA6FABqYmJNNwuMJT0kkX8pfz0OFwkRn+
# iqf9uRhM6b/fNNB0+ReA7FfGL+hzU6Uv8AvAL3orXUqjwPMRe9Fz2gE7HpFnE6DD
# dOP4Xk2iSSJ5XQA8HwtvrQfrGPh4gPYE80ziK/+8boy3alVeGYbYbvWVtdsNju41
# Cq9kM1wDyjZf6SSUIAbjOrNPdbhwyK4GviVBR1zh+/gA3uF5MhrDtZh4h3mWX2if
# ijmT9mfte4NwF3K1MvckAl7IHRb8nxmr7wjjhJ26JwpD+76lfAcmXC2YOlFGHCMi
# 028mjvThf3HW7BD2LjlQSX4UkHmM2vUBrgMGQKyeMham1VmMfSK32wzvUNfF7xSz
# o9k0loBh7unGcUsv3EbqUGswV5F6AgjK3vWRkDql8dNrdIoapDfaejPCd58kVM98
# 5N/aEoha4bAeJ6NGIKzD+4saiMxUqJ0y2NjSrE8iO4HszXgZW5e1Gbkn4Ae6d37D
# QSSqyfasVHY=
# =bLuc
# -----END PGP SIGNATURE-----
# gpg: Signature made Fri 28 Oct 2022 05:44:16 EDT
# gpg: using RSA key 1BD9CAAD735C4C3A460DFCCA4DDE10F700FF835A
# gpg: issuer "david@redhat.com"
# gpg: Good signature from "David Hildenbrand <david@redhat.com>" [unknown]
# gpg: aka "David Hildenbrand <davidhildenbrand@gmail.com>" [full]
# gpg: aka "David Hildenbrand <hildenbr@in.tum.de>" [unknown]
# gpg: WARNING: The key's User ID is not certified with a trusted signature!
# gpg: There is no indication that the signature belongs to the owner.
# Primary key fingerprint: 1BD9 CAAD 735C 4C3A 460D FCCA 4DDE 10F7 00FF 835A
* tag 'mem-2022-10-28' of https://github.com/davidhildenbrand/qemu:
vl: Allow ThreadContext objects to be created before the sandbox option
hostmem: Allow for specifying a ThreadContext for preallocation
util: Make qemu_prealloc_mem() optionally consume a ThreadContext
util: Add write-only "node-affinity" property for ThreadContext
util: Introduce ThreadContext user-creatable object
util: Introduce qemu_thread_set_affinity() and qemu_thread_get_affinity()
util: Cleanup and rename os_mem_prealloc()
hw/mem/nvdimm: fix error message for 'unarmed' flag
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
-rw-r--r-- | backends/hostmem.c | 13 | ||||
-rw-r--r-- | hw/mem/nvdimm.c | 2 | ||||
-rw-r--r-- | hw/virtio/virtio-mem.c | 2 | ||||
-rw-r--r-- | include/qemu/osdep.h | 19 | ||||
-rw-r--r-- | include/qemu/thread-context.h | 57 | ||||
-rw-r--r-- | include/qemu/thread.h | 4 | ||||
-rw-r--r-- | include/sysemu/hostmem.h | 2 | ||||
-rw-r--r-- | meson.build | 16 | ||||
-rw-r--r-- | qapi/qom.json | 28 | ||||
-rw-r--r-- | softmmu/cpus.c | 2 | ||||
-rw-r--r-- | softmmu/vl.c | 36 | ||||
-rw-r--r-- | util/meson.build | 1 | ||||
-rw-r--r-- | util/oslib-posix.c | 39 | ||||
-rw-r--r-- | util/oslib-win32.c | 8 | ||||
-rw-r--r-- | util/qemu-thread-posix.c | 70 | ||||
-rw-r--r-- | util/qemu-thread-win32.c | 12 | ||||
-rw-r--r-- | util/thread-context.c | 362 |
17 files changed, 642 insertions, 31 deletions
diff --git a/backends/hostmem.c b/backends/hostmem.c index 4428e06738..8640294c10 100644 --- a/backends/hostmem.c +++ b/backends/hostmem.c @@ -232,7 +232,8 @@ static void host_memory_backend_set_prealloc(Object *obj, bool value, void *ptr = memory_region_get_ram_ptr(&backend->mr); uint64_t sz = memory_region_size(&backend->mr); - os_mem_prealloc(fd, ptr, sz, backend->prealloc_threads, &local_err); + qemu_prealloc_mem(fd, ptr, sz, backend->prealloc_threads, + backend->prealloc_context, &local_err); if (local_err) { error_propagate(errp, local_err); return; @@ -383,8 +384,9 @@ host_memory_backend_memory_complete(UserCreatable *uc, Error **errp) * specified NUMA policy in place. */ if (backend->prealloc) { - os_mem_prealloc(memory_region_get_fd(&backend->mr), ptr, sz, - backend->prealloc_threads, &local_err); + qemu_prealloc_mem(memory_region_get_fd(&backend->mr), ptr, sz, + backend->prealloc_threads, + backend->prealloc_context, &local_err); if (local_err) { goto out; } @@ -492,6 +494,11 @@ host_memory_backend_class_init(ObjectClass *oc, void *data) NULL, NULL); object_class_property_set_description(oc, "prealloc-threads", "Number of CPU threads to use for prealloc"); + object_class_property_add_link(oc, "prealloc-context", + TYPE_THREAD_CONTEXT, offsetof(HostMemoryBackend, prealloc_context), + object_property_allow_set_link, OBJ_PROP_LINK_STRONG); + object_class_property_set_description(oc, "prealloc-context", + "Context to use for creating CPU threads for preallocation"); object_class_property_add(oc, "size", "int", host_memory_backend_get_size, host_memory_backend_set_size, diff --git a/hw/mem/nvdimm.c b/hw/mem/nvdimm.c index 7c7d777781..31080c22c9 100644 --- a/hw/mem/nvdimm.c +++ b/hw/mem/nvdimm.c @@ -149,7 +149,7 @@ static void nvdimm_prepare_memory_region(NVDIMMDevice *nvdimm, Error **errp) if (!nvdimm->unarmed && memory_region_is_rom(mr)) { HostMemoryBackend *hostmem = dimm->hostmem; - error_setg(errp, "'unarmed' property must be off since memdev %s " + error_setg(errp, "'unarmed' property must be 'on' since memdev %s " "is read-only", object_get_canonical_path_component(OBJECT(hostmem))); return; diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c index 30d03e987a..ed170def48 100644 --- a/hw/virtio/virtio-mem.c +++ b/hw/virtio/virtio-mem.c @@ -467,7 +467,7 @@ static int virtio_mem_set_block_state(VirtIOMEM *vmem, uint64_t start_gpa, int fd = memory_region_get_fd(&vmem->memdev->mr); Error *local_err = NULL; - os_mem_prealloc(fd, area, size, 1, &local_err); + qemu_prealloc_mem(fd, area, size, 1, NULL, &local_err); if (local_err) { static bool warned; diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h index 2276094729..b9c4307779 100644 --- a/include/qemu/osdep.h +++ b/include/qemu/osdep.h @@ -576,8 +576,23 @@ unsigned long qemu_getauxval(unsigned long type); void qemu_set_tty_echo(int fd, bool echo); -void os_mem_prealloc(int fd, char *area, size_t sz, int smp_cpus, - Error **errp); +typedef struct ThreadContext ThreadContext; + +/** + * qemu_prealloc_mem: + * @fd: the fd mapped into the area, -1 for anonymous memory + * @area: start address of the are to preallocate + * @sz: the size of the area to preallocate + * @max_threads: maximum number of threads to use + * @errp: returns an error if this function fails + * + * Preallocate memory (populate/prefault page tables writable) for the virtual + * memory area starting at @area with the size of @sz. After a successful call, + * each page in the area was faulted in writable at least once, for example, + * after allocating file blocks for mapped files. + */ +void qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads, + ThreadContext *tc, Error **errp); /** * qemu_get_pid_name: diff --git a/include/qemu/thread-context.h b/include/qemu/thread-context.h new file mode 100644 index 0000000000..2ebd6b7fe1 --- /dev/null +++ b/include/qemu/thread-context.h @@ -0,0 +1,57 @@ +/* + * QEMU Thread Context + * + * Copyright Red Hat Inc., 2022 + * + * Authors: + * David Hildenbrand <david@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#ifndef SYSEMU_THREAD_CONTEXT_H +#define SYSEMU_THREAD_CONTEXT_H + +#include "qapi/qapi-types-machine.h" +#include "qemu/thread.h" +#include "qom/object.h" + +#define TYPE_THREAD_CONTEXT "thread-context" +OBJECT_DECLARE_TYPE(ThreadContext, ThreadContextClass, + THREAD_CONTEXT) + +struct ThreadContextClass { + ObjectClass parent_class; +}; + +struct ThreadContext { + /* private */ + Object parent; + + /* private */ + unsigned int thread_id; + QemuThread thread; + + /* Semaphore to wait for context thread action. */ + QemuSemaphore sem; + /* Semaphore to wait for action in context thread. */ + QemuSemaphore sem_thread; + /* Mutex to synchronize requests. */ + QemuMutex mutex; + + /* Commands for the thread to execute. */ + int thread_cmd; + void *thread_cmd_data; + + /* CPU affinity bitmap used for initialization. */ + unsigned long *init_cpu_bitmap; + int init_cpu_nbits; +}; + +void thread_context_create_thread(ThreadContext *tc, QemuThread *thread, + const char *name, + void *(*start_routine)(void *), void *arg, + int mode); + +#endif /* SYSEMU_THREAD_CONTEXT_H */ diff --git a/include/qemu/thread.h b/include/qemu/thread.h index 20641e5844..7c6703bce3 100644 --- a/include/qemu/thread.h +++ b/include/qemu/thread.h @@ -185,6 +185,10 @@ void qemu_event_destroy(QemuEvent *ev); void qemu_thread_create(QemuThread *thread, const char *name, void *(*start_routine)(void *), void *arg, int mode); +int qemu_thread_set_affinity(QemuThread *thread, unsigned long *host_cpus, + unsigned long nbits); +int qemu_thread_get_affinity(QemuThread *thread, unsigned long **host_cpus, + unsigned long *nbits); void *qemu_thread_join(QemuThread *thread); void qemu_thread_get_self(QemuThread *thread); bool qemu_thread_is_self(QemuThread *thread); diff --git a/include/sysemu/hostmem.h b/include/sysemu/hostmem.h index 9ff5c16963..39326f1d4f 100644 --- a/include/sysemu/hostmem.h +++ b/include/sysemu/hostmem.h @@ -18,6 +18,7 @@ #include "qom/object.h" #include "exec/memory.h" #include "qemu/bitmap.h" +#include "qemu/thread-context.h" #define TYPE_MEMORY_BACKEND "memory-backend" OBJECT_DECLARE_TYPE(HostMemoryBackend, HostMemoryBackendClass, @@ -66,6 +67,7 @@ struct HostMemoryBackend { bool merge, dump, use_canonical_path; bool prealloc, is_mapped, share, reserve; uint32_t prealloc_threads; + ThreadContext *prealloc_context; DECLARE_BITMAP(host_nodes, MAX_NODES + 1); HostMemPolicy policy; diff --git a/meson.build b/meson.build index 7d39756ae9..15dff0a8d3 100644 --- a/meson.build +++ b/meson.build @@ -2130,7 +2130,23 @@ config_host_data.set('CONFIG_PTHREAD_CONDATTR_SETCLOCK', cc.links(gnu_source_pre pthread_condattr_setclock(&attr, CLOCK_MONOTONIC); return 0; }''', dependencies: threads)) +config_host_data.set('CONFIG_PTHREAD_AFFINITY_NP', cc.links(gnu_source_prefix + ''' + #include <pthread.h> + static void *f(void *p) { return NULL; } + int main(void) + { + int setsize = CPU_ALLOC_SIZE(64); + pthread_t thread; + cpu_set_t *cpuset; + pthread_create(&thread, 0, f, 0); + cpuset = CPU_ALLOC(64); + CPU_ZERO_S(setsize, cpuset); + pthread_setaffinity_np(thread, setsize, cpuset); + pthread_getaffinity_np(thread, setsize, cpuset); + CPU_FREE(cpuset); + return 0; + }''', dependencies: threads)) config_host_data.set('CONFIG_SIGNALFD', cc.links(gnu_source_prefix + ''' #include <sys/signalfd.h> #include <stddef.h> diff --git a/qapi/qom.json b/qapi/qom.json index 80dd419b39..87fcad2423 100644 --- a/qapi/qom.json +++ b/qapi/qom.json @@ -578,6 +578,9 @@ # # @prealloc-threads: number of CPU threads to use for prealloc (default: 1) # +# @prealloc-context: thread context to use for creation of preallocation threads +# (default: none) (since 7.2) +# # @share: if false, the memory is private to QEMU; if true, it is shared # (default: false) # @@ -608,6 +611,7 @@ '*policy': 'HostMemPolicy', '*prealloc': 'bool', '*prealloc-threads': 'uint32', + '*prealloc-context': 'str', '*share': 'bool', '*reserve': 'bool', 'size': 'size', @@ -831,6 +835,28 @@ '*kernel-hashes': 'bool' } } ## +# @ThreadContextProperties: +# +# Properties for thread context objects. +# +# @cpu-affinity: the list of host CPU numbers used as CPU affinity for all +# threads created in the thread context (default: QEMU main +# thread CPU affinity) +# +# @node-affinity: the list of host node numbers that will be resolved to a +# list of host CPU numbers used as CPU affinity. This is a +# shortcut for specifying the list of host CPU numbers +# belonging to the host nodes manually by setting +# @cpu-affinity. (default: QEMU main thread affinity) +# +# Since: 7.2 +## +{ 'struct': 'ThreadContextProperties', + 'data': { '*cpu-affinity': ['uint16'], + '*node-affinity': ['uint16'] } } + + +## # @ObjectType: # # Features: @@ -882,6 +908,7 @@ { 'name': 'secret_keyring', 'if': 'CONFIG_SECRET_KEYRING' }, 'sev-guest', + 'thread-context', 's390-pv-guest', 'throttle-group', 'tls-creds-anon', @@ -948,6 +975,7 @@ 'secret_keyring': { 'type': 'SecretKeyringProperties', 'if': 'CONFIG_SECRET_KEYRING' }, 'sev-guest': 'SevGuestProperties', + 'thread-context': 'ThreadContextProperties', 'throttle-group': 'ThrottleGroupProperties', 'tls-creds-anon': 'TlsCredsAnonProperties', 'tls-creds-psk': 'TlsCredsPskProperties', diff --git a/softmmu/cpus.c b/softmmu/cpus.c index 61b27ff59d..01c94fd298 100644 --- a/softmmu/cpus.c +++ b/softmmu/cpus.c @@ -354,7 +354,7 @@ static void qemu_init_sigbus(void) /* * ALERT: when modifying this, take care that SIGBUS forwarding in - * os_mem_prealloc() will continue working as expected. + * qemu_prealloc_mem() will continue working as expected. */ memset(&action, 0, sizeof(action)); action.sa_flags = SA_SIGINFO; diff --git a/softmmu/vl.c b/softmmu/vl.c index 99fb49c7b0..5115221efe 100644 --- a/softmmu/vl.c +++ b/softmmu/vl.c @@ -1760,6 +1760,27 @@ static void object_option_parse(const char *optarg) } /* + * Very early object creation, before the sandbox options have been activated. + */ +static bool object_create_pre_sandbox(const char *type) +{ + /* + * Objects should in general not get initialized "too early" without + * a reason. If you add one, state the reason in a comment! + */ + + /* + * Reason: -sandbox on,resourcecontrol=deny disallows setting CPU + * affinity of threads. + */ + if (g_str_equal(type, "thread-context")) { + return true; + } + + return false; +} + +/* * Initial object creation happens before all other * QEMU data types are created. The majority of objects * can be created at this point. The rng-egd object @@ -1773,6 +1794,11 @@ static bool object_create_early(const char *type) * add one, state the reason in a comment! */ + /* Reason: already created. */ + if (object_create_pre_sandbox(type)) { + return false; + } + /* Reason: property "chardev" */ if (g_str_equal(type, "rng-egd") || g_str_equal(type, "qtest")) { @@ -1895,7 +1921,7 @@ static void qemu_create_early_backends(void) */ static bool object_create_late(const char *type) { - return !object_create_early(type); + return !object_create_early(type) && !object_create_pre_sandbox(type); } static void qemu_create_late_backends(void) @@ -2351,6 +2377,11 @@ static int process_runstate_actions(void *opaque, QemuOpts *opts, Error **errp) static void qemu_process_early_options(void) { + qemu_opts_foreach(qemu_find_opts("name"), + parse_name, NULL, &error_fatal); + + object_option_foreach_add(object_create_pre_sandbox); + #ifdef CONFIG_SECCOMP QemuOptsList *olist = qemu_find_opts_err("sandbox", NULL); if (olist) { @@ -2358,9 +2389,6 @@ static void qemu_process_early_options(void) } #endif - qemu_opts_foreach(qemu_find_opts("name"), - parse_name, NULL, &error_fatal); - if (qemu_opts_foreach(qemu_find_opts("action"), process_runstate_actions, NULL, &error_fatal)) { exit(1); diff --git a/util/meson.build b/util/meson.build index 5e282130df..c0a7bc54d4 100644 --- a/util/meson.build +++ b/util/meson.build @@ -1,4 +1,5 @@ util_ss.add(files('osdep.c', 'cutils.c', 'unicode.c', 'qemu-timer-common.c')) +util_ss.add(files('thread-context.c'), numa) if not config_host_data.get('CONFIG_ATOMIC64') util_ss.add(files('atomic64.c')) endif diff --git a/util/oslib-posix.c b/util/oslib-posix.c index 827a7aadba..59a891b6a8 100644 --- a/util/oslib-posix.c +++ b/util/oslib-posix.c @@ -42,6 +42,7 @@ #include "qemu/cutils.h" #include "qemu/compiler.h" #include "qemu/units.h" +#include "qemu/thread-context.h" #ifdef CONFIG_LINUX #include <sys/syscall.h> @@ -329,7 +330,7 @@ static void sigbus_handler(int signal) return; } #endif /* CONFIG_LINUX */ - warn_report("os_mem_prealloc: unrelated SIGBUS detected and ignored"); + warn_report("qemu_prealloc_mem: unrelated SIGBUS detected and ignored"); } static void *do_touch_pages(void *arg) @@ -399,13 +400,13 @@ static void *do_madv_populate_write_pages(void *arg) } static inline int get_memset_num_threads(size_t hpagesize, size_t numpages, - int smp_cpus) + int max_threads) { long host_procs = sysconf(_SC_NPROCESSORS_ONLN); int ret = 1; if (host_procs > 0) { - ret = MIN(MIN(host_procs, MAX_MEM_PREALLOC_THREAD_COUNT), smp_cpus); + ret = MIN(MIN(host_procs, MAX_MEM_PREALLOC_THREAD_COUNT), max_threads); } /* Especially with gigantic pages, don't create more threads than pages. */ @@ -418,11 +419,12 @@ static inline int get_memset_num_threads(size_t hpagesize, size_t numpages, } static int touch_all_pages(char *area, size_t hpagesize, size_t numpages, - int smp_cpus, bool use_madv_populate_write) + int max_threads, ThreadContext *tc, + bool use_madv_populate_write) { static gsize initialized = 0; MemsetContext context = { - .num_threads = get_memset_num_threads(hpagesize, numpages, smp_cpus), + .num_threads = get_memset_num_threads(hpagesize, numpages, max_threads), }; size_t numpages_per_thread, leftover; void *(*touch_fn)(void *); @@ -457,9 +459,16 @@ static int touch_all_pages(char *area, size_t hpagesize, size_t numpages, context.threads[i].numpages = numpages_per_thread + (i < leftover); context.threads[i].hpagesize = hpagesize; context.threads[i].context = &context; - qemu_thread_create(&context.threads[i].pgthread, "touch_pages", - touch_fn, &context.threads[i], - QEMU_THREAD_JOINABLE); + if (tc) { + thread_context_create_thread(tc, &context.threads[i].pgthread, + "touch_pages", + touch_fn, &context.threads[i], + QEMU_THREAD_JOINABLE); + } else { + qemu_thread_create(&context.threads[i].pgthread, "touch_pages", + touch_fn, &context.threads[i], + QEMU_THREAD_JOINABLE); + } addr += context.threads[i].numpages * hpagesize; } @@ -494,13 +503,13 @@ static bool madv_populate_write_possible(char *area, size_t pagesize) errno != EINVAL; } -void os_mem_prealloc(int fd, char *area, size_t memory, int smp_cpus, - Error **errp) +void qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads, + ThreadContext *tc, Error **errp) { static gsize initialized; int ret; size_t hpagesize = qemu_fd_getpagesize(fd); - size_t numpages = DIV_ROUND_UP(memory, hpagesize); + size_t numpages = DIV_ROUND_UP(sz, hpagesize); bool use_madv_populate_write; struct sigaction act; @@ -530,24 +539,24 @@ void os_mem_prealloc(int fd, char *area, size_t memory, int smp_cpus, if (ret) { qemu_mutex_unlock(&sigbus_mutex); error_setg_errno(errp, errno, - "os_mem_prealloc: failed to install signal handler"); + "qemu_prealloc_mem: failed to install signal handler"); return; } } /* touch pages simultaneously */ - ret = touch_all_pages(area, hpagesize, numpages, smp_cpus, + ret = touch_all_pages(area, hpagesize, numpages, max_threads, tc, use_madv_populate_write); if (ret) { error_setg_errno(errp, -ret, - "os_mem_prealloc: preallocating memory failed"); + "qemu_prealloc_mem: preallocating memory failed"); } if (!use_madv_populate_write) { ret = sigaction(SIGBUS, &sigbus_oldact, NULL); if (ret) { /* Terminate QEMU since it can't recover from error */ - perror("os_mem_prealloc: failed to reinstall signal handler"); + perror("qemu_prealloc_mem: failed to reinstall signal handler"); exit(1); } qemu_mutex_unlock(&sigbus_mutex); diff --git a/util/oslib-win32.c b/util/oslib-win32.c index 5723d3eb4c..a67cb3822e 100644 --- a/util/oslib-win32.c +++ b/util/oslib-win32.c @@ -268,14 +268,14 @@ int getpagesize(void) return system_info.dwPageSize; } -void os_mem_prealloc(int fd, char *area, size_t memory, int smp_cpus, - Error **errp) +void qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads, + ThreadContext *tc, Error **errp) { int i; size_t pagesize = qemu_real_host_page_size(); - memory = (memory + pagesize - 1) & -pagesize; - for (i = 0; i < memory / pagesize; i++) { + sz = (sz + pagesize - 1) & -pagesize; + for (i = 0; i < sz / pagesize; i++) { memset(area + pagesize * i, 0, 1); } } diff --git a/util/qemu-thread-posix.c b/util/qemu-thread-posix.c index ac1d56e673..bae938c670 100644 --- a/util/qemu-thread-posix.c +++ b/util/qemu-thread-posix.c @@ -16,6 +16,7 @@ #include "qemu/notify.h" #include "qemu-thread-common.h" #include "qemu/tsan.h" +#include "qemu/bitmap.h" static bool name_threads; @@ -552,6 +553,75 @@ void qemu_thread_create(QemuThread *thread, const char *name, pthread_attr_destroy(&attr); } +int qemu_thread_set_affinity(QemuThread *thread, unsigned long *host_cpus, + unsigned long nbits) +{ +#if defined(CONFIG_PTHREAD_AFFINITY_NP) + const size_t setsize = CPU_ALLOC_SIZE(nbits); + unsigned long value; + cpu_set_t *cpuset; + int err; + + cpuset = CPU_ALLOC(nbits); + g_assert(cpuset); + + CPU_ZERO_S(setsize, cpuset); + value = find_first_bit(host_cpus, nbits); + while (value < nbits) { + CPU_SET_S(value, setsize, cpuset); + value = find_next_bit(host_cpus, nbits, value + 1); + } + + err = pthread_setaffinity_np(thread->thread, setsize, cpuset); + CPU_FREE(cpuset); + return err; +#else + return -ENOSYS; +#endif +} + +int qemu_thread_get_affinity(QemuThread *thread, unsigned long **host_cpus, + unsigned long *nbits) +{ +#if defined(CONFIG_PTHREAD_AFFINITY_NP) + unsigned long tmpbits; + cpu_set_t *cpuset; + size_t setsize; + int i, err; + + tmpbits = CPU_SETSIZE; + while (true) { + setsize = CPU_ALLOC_SIZE(tmpbits); + cpuset = CPU_ALLOC(tmpbits); + g_assert(cpuset); + + err = pthread_getaffinity_np(thread->thread, setsize, cpuset); + if (err) { + CPU_FREE(cpuset); + if (err != -EINVAL) { + return err; + } + tmpbits *= 2; + } else { + break; + } + } + + /* Convert the result into a proper bitmap. */ + *nbits = tmpbits; + *host_cpus = bitmap_new(tmpbits); + for (i = 0; i < tmpbits; i++) { + if (CPU_ISSET(i, cpuset)) { + set_bit(i, *host_cpus); + } + } + CPU_FREE(cpuset); + return 0; +#else + return -ENOSYS; +#endif +} + void qemu_thread_get_self(QemuThread *thread) { thread->thread = pthread_self(); diff --git a/util/qemu-thread-win32.c b/util/qemu-thread-win32.c index b9a467d7db..69db254ac7 100644 --- a/util/qemu-thread-win32.c +++ b/util/qemu-thread-win32.c @@ -477,6 +477,18 @@ void qemu_thread_create(QemuThread *thread, const char *name, thread->data = data; } +int qemu_thread_set_affinity(QemuThread *thread, unsigned long *host_cpus, + unsigned long nbits) +{ + return -ENOSYS; +} + +int qemu_thread_get_affinity(QemuThread *thread, unsigned long **host_cpus, + unsigned long *nbits) +{ + return -ENOSYS; +} + void qemu_thread_get_self(QemuThread *thread) { thread->data = qemu_thread_data; diff --git a/util/thread-context.c b/util/thread-context.c new file mode 100644 index 0000000000..4138245332 --- /dev/null +++ b/util/thread-context.c @@ -0,0 +1,362 @@ +/* + * QEMU Thread Context + * + * Copyright Red Hat Inc., 2022 + * + * Authors: + * David Hildenbrand <david@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "qemu/thread-context.h" +#include "qapi/error.h" +#include "qapi/qapi-builtin-visit.h" +#include "qapi/visitor.h" +#include "qemu/config-file.h" +#include "qapi/qapi-builtin-visit.h" +#include "qom/object_interfaces.h" +#include "qemu/module.h" +#include "qemu/bitmap.h" + +#ifdef CONFIG_NUMA +#include <numa.h> +#endif + +enum { + TC_CMD_NONE = 0, + TC_CMD_STOP, + TC_CMD_NEW, +}; + +typedef struct ThreadContextCmdNew { + QemuThread *thread; + const char *name; + void *(*start_routine)(void *); + void *arg; + int mode; +} ThreadContextCmdNew; + +static void *thread_context_run(void *opaque) +{ + ThreadContext *tc = opaque; + + tc->thread_id = qemu_get_thread_id(); + qemu_sem_post(&tc->sem); + + while (true) { + /* + * Threads inherit the CPU affinity of the creating thread. For this + * reason, we create new (especially short-lived) threads from our + * persistent context thread. + * + * Especially when QEMU is not allowed to set the affinity itself, + * management tools can simply set the affinity of the context thread + * after creating the context, to have new threads created via + * the context inherit the CPU affinity automatically. + */ + switch (tc->thread_cmd) { + case TC_CMD_NONE: + break; + case TC_CMD_STOP: + tc->thread_cmd = TC_CMD_NONE; + qemu_sem_post(&tc->sem); + return NULL; + case TC_CMD_NEW: { + ThreadContextCmdNew *cmd_new = tc->thread_cmd_data; + + qemu_thread_create(cmd_new->thread, cmd_new->name, + cmd_new->start_routine, cmd_new->arg, + cmd_new->mode); + tc->thread_cmd = TC_CMD_NONE; + tc->thread_cmd_data = NULL; + qemu_sem_post(&tc->sem); + break; + } + default: + g_assert_not_reached(); + } + qemu_sem_wait(&tc->sem_thread); + } +} + +static void thread_context_set_cpu_affinity(Object *obj, Visitor *v, + const char *name, void *opaque, + Error **errp) +{ + ThreadContext *tc = THREAD_CONTEXT(obj); + uint16List *l, *host_cpus = NULL; + unsigned long *bitmap = NULL; + int nbits = 0, ret; + Error *err = NULL; + + if (tc->init_cpu_bitmap) { + error_setg(errp, "Mixing CPU and node affinity not supported"); + return; + } + + visit_type_uint16List(v, name, &host_cpus, &err); + if (err) { + error_propagate(errp, err); + return; + } + + if (!host_cpus) { + error_setg(errp, "CPU list is empty"); + goto out; + } + + for (l = host_cpus; l; l = l->next) { + nbits = MAX(nbits, l->value + 1); + } + bitmap = bitmap_new(nbits); + for (l = host_cpus; l; l = l->next) { + set_bit(l->value, bitmap); + } + + if (tc->thread_id != -1) { + /* + * Note: we won't be adjusting the affinity of any thread that is still + * around, but only the affinity of the context thread. + */ + ret = qemu_thread_set_affinity(&tc->thread, bitmap, nbits); + if (ret) { + error_setg(errp, "Setting CPU affinity failed: %s", strerror(ret)); + } + } else { + tc->init_cpu_bitmap = bitmap; + bitmap = NULL; + tc->init_cpu_nbits = nbits; + } +out: + g_free(bitmap); + qapi_free_uint16List(host_cpus); +} + +static void thread_context_get_cpu_affinity(Object *obj, Visitor *v, + const char *name, void *opaque, + Error **errp) +{ + unsigned long *bitmap, nbits, value; + ThreadContext *tc = THREAD_CONTEXT(obj); + uint16List *host_cpus = NULL; + uint16List **tail = &host_cpus; + int ret; + + if (tc->thread_id == -1) { + error_setg(errp, "Object not initialized yet"); + return; + } + + ret = qemu_thread_get_affinity(&tc->thread, &bitmap, &nbits); + if (ret) { + error_setg(errp, "Getting CPU affinity failed: %s", strerror(ret)); + return; + } + + value = find_first_bit(bitmap, nbits); + while (value < nbits) { + QAPI_LIST_APPEND(tail, value); + + value = find_next_bit(bitmap, nbits, value + 1); + } + g_free(bitmap); + + visit_type_uint16List(v, name, &host_cpus, errp); + qapi_free_uint16List(host_cpus); +} + +static void thread_context_set_node_affinity(Object *obj, Visitor *v, + const char *name, void *opaque, + Error **errp) +{ +#ifdef CONFIG_NUMA + const int nbits = numa_num_possible_cpus(); + ThreadContext *tc = THREAD_CONTEXT(obj); + uint16List *l, *host_nodes = NULL; + unsigned long *bitmap = NULL; + struct bitmask *tmp_cpus; + Error *err = NULL; + int ret, i; + + if (tc->init_cpu_bitmap) { + error_setg(errp, "Mixing CPU and node affinity not supported"); + return; + } + + visit_type_uint16List(v, name, &host_nodes, &err); + if (err) { + error_propagate(errp, err); + return; + } + + if (!host_nodes) { + error_setg(errp, "Node list is empty"); + goto out; + } + + bitmap = bitmap_new(nbits); + tmp_cpus = numa_allocate_cpumask(); + for (l = host_nodes; l; l = l->next) { + numa_bitmask_clearall(tmp_cpus); + ret = numa_node_to_cpus(l->value, tmp_cpus); + if (ret) { + /* We ignore any errors, such as impossible nodes. */ + continue; + } + for (i = 0; i < nbits; i++) { + if (numa_bitmask_isbitset(tmp_cpus, i)) { + set_bit(i, bitmap); + } + } + } + numa_free_cpumask(tmp_cpus); + + if (bitmap_empty(bitmap, nbits)) { + error_setg(errp, "The nodes select no CPUs"); + goto out; + } + + if (tc->thread_id != -1) { + /* + * Note: we won't be adjusting the affinity of any thread that is still + * around for now, but only the affinity of the context thread. + */ + ret = qemu_thread_set_affinity(&tc->thread, bitmap, nbits); + if (ret) { + error_setg(errp, "Setting CPU affinity failed: %s", strerror(ret)); + } + } else { + tc->init_cpu_bitmap = bitmap; + bitmap = NULL; + tc->init_cpu_nbits = nbits; + } +out: + g_free(bitmap); + qapi_free_uint16List(host_nodes); +#else + error_setg(errp, "NUMA node affinity is not supported by this QEMU"); +#endif +} + +static void thread_context_get_thread_id(Object *obj, Visitor *v, + const char *name, void *opaque, + Error **errp) +{ + ThreadContext *tc = THREAD_CONTEXT(obj); + uint64_t value = tc->thread_id; + + visit_type_uint64(v, name, &value, errp); +} + +static void thread_context_instance_complete(UserCreatable *uc, Error **errp) +{ + ThreadContext *tc = THREAD_CONTEXT(uc); + char *thread_name; + int ret; + + thread_name = g_strdup_printf("TC %s", + object_get_canonical_path_component(OBJECT(uc))); + qemu_thread_create(&tc->thread, thread_name, thread_context_run, tc, + QEMU_THREAD_JOINABLE); + g_free(thread_name); + + /* Wait until initialization of the thread is done. */ + while (tc->thread_id == -1) { + qemu_sem_wait(&tc->sem); + } + + if (tc->init_cpu_bitmap) { + ret = qemu_thread_set_affinity(&tc->thread, tc->init_cpu_bitmap, + tc->init_cpu_nbits); + if (ret) { + error_setg(errp, "Setting CPU affinity failed: %s", strerror(ret)); + } + g_free(tc->init_cpu_bitmap); + tc->init_cpu_bitmap = NULL; + } +} + +static void thread_context_class_init(ObjectClass *oc, void *data) +{ + UserCreatableClass *ucc = USER_CREATABLE_CLASS(oc); + + ucc->complete = thread_context_instance_complete; + object_class_property_add(oc, "thread-id", "int", + thread_context_get_thread_id, NULL, NULL, + NULL); + object_class_property_add(oc, "cpu-affinity", "int", + thread_context_get_cpu_affinity, + thread_context_set_cpu_affinity, NULL, NULL); + object_class_property_add(oc, "node-affinity", "int", NULL, + thread_context_set_node_affinity, NULL, NULL); +} + +static void thread_context_instance_init(Object *obj) +{ + ThreadContext *tc = THREAD_CONTEXT(obj); + + tc->thread_id = -1; + qemu_sem_init(&tc->sem, 0); + qemu_sem_init(&tc->sem_thread, 0); + qemu_mutex_init(&tc->mutex); +} + +static void thread_context_instance_finalize(Object *obj) +{ + ThreadContext *tc = THREAD_CONTEXT(obj); + + if (tc->thread_id != -1) { + tc->thread_cmd = TC_CMD_STOP; + qemu_sem_post(&tc->sem_thread); + qemu_thread_join(&tc->thread); + } + qemu_sem_destroy(&tc->sem); + qemu_sem_destroy(&tc->sem_thread); + qemu_mutex_destroy(&tc->mutex); +} + +static const TypeInfo thread_context_info = { + .name = TYPE_THREAD_CONTEXT, + .parent = TYPE_OBJECT, + .class_init = thread_context_class_init, + .instance_size = sizeof(ThreadContext), + .instance_init = thread_context_instance_init, + .instance_finalize = thread_context_instance_finalize, + .interfaces = (InterfaceInfo[]) { + { TYPE_USER_CREATABLE }, + { } + } +}; + +static void thread_context_register_types(void) +{ + type_register_static(&thread_context_info); +} +type_init(thread_context_register_types) + +void thread_context_create_thread(ThreadContext *tc, QemuThread *thread, + const char *name, + void *(*start_routine)(void *), void *arg, + int mode) +{ + ThreadContextCmdNew data = { + .thread = thread, + .name = name, + .start_routine = start_routine, + .arg = arg, + .mode = mode, + }; + + qemu_mutex_lock(&tc->mutex); + tc->thread_cmd = TC_CMD_NEW; + tc->thread_cmd_data = &data; + qemu_sem_post(&tc->sem_thread); + + while (tc->thread_cmd != TC_CMD_NONE) { + qemu_sem_wait(&tc->sem); + } + qemu_mutex_unlock(&tc->mutex); +} |