119 files changed, 1868 insertions, 1537 deletions
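A recurring change throughout this series: the thread-pool and Linux AIO/io_uring submission helpers (thread_pool_submit_co(), thread_pool_submit_aio(), laio_co_submit(), luring_co_submit(), and the plug/unplug calls) no longer take an explicit ThreadPool/LinuxAioState/LuringState argument and instead derive the state from qemu_get_current_aio_context() internally. A minimal sketch of the new calling convention follows; the wrapper function is illustrative, not part of the patch:

/* Sketch only: thread_pool_submit_co() now infers the pool from the
 * calling thread's AioContext, so callers pass just func + arg. */
#include "qemu/osdep.h"
#include "block/thread-pool.h"

static int coroutine_fn example_submit(ThreadPoolFunc *func, void *arg)
{
    /* Before this series: thread_pool_submit_co(pool, func, arg); */
    return thread_pool_submit_co(func, arg);
}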
@@ -54,6 +54,7 @@ Aleksandar Markovic <aleksandar.qemu.devel@gmail.com> <amarkovic@wavecomp.com> Aleksandar Rikalo <aleksandar.rikalo@syrmia.com> <arikalo@wavecomp.com> Aleksandar Rikalo <aleksandar.rikalo@syrmia.com> <aleksandar.rikalo@rt-rk.com> Alexander Graf <agraf@csgraf.de> <agraf@suse.de> +Ani Sinha <anisinha@redhat.com> <ani@anisinha.ca> Anthony Liguori <anthony@codemonkey.ws> Anthony Liguori <aliguori@us.ibm.com> Christian Borntraeger <borntraeger@linux.ibm.com> <borntraeger@de.ibm.com> Damien Hedde <damien.hedde@dahe.fr> <damien.hedde@greensocs.com> diff --git a/MAINTAINERS b/MAINTAINERS index 24154f5721..fc225e66df 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1895,7 +1895,7 @@ F: hw/pci/pcie_doe.c ACPI/SMBIOS M: Michael S. Tsirkin <mst@redhat.com> M: Igor Mammedov <imammedo@redhat.com> -R: Ani Sinha <ani@anisinha.ca> +R: Ani Sinha <anisinha@redhat.com> S: Supported F: include/hw/acpi/* F: include/hw/firmware/smbios.h @@ -1932,7 +1932,7 @@ F: hw/acpi/viot.c F: hw/acpi/viot.h ACPI/AVOCADO/BIOSBITS -M: Ani Sinha <ani@anisinha.ca> +M: Ani Sinha <anisinha@redhat.com> M: Michael S. Tsirkin <mst@redhat.com> S: Supported F: tests/avocado/acpi-bits/* @@ -2076,6 +2076,10 @@ F: backends/vhost-user.c F: include/sysemu/vhost-user-backend.h F: subprojects/libvhost-user/ +vhost-shadow-virtqueue +R: Eugenio PĂ©rez <eperezma@redhat.com> +F: hw/virtio/vhost-shadow-virtqueue.* + virtio M: Michael S. Tsirkin <mst@redhat.com> S: Supported @@ -3368,6 +3372,10 @@ F: hw/i386/intel_iommu.c F: hw/i386/intel_iommu_internal.h F: include/hw/i386/intel_iommu.h +AMD-Vi Emulation +S: Orphan +F: hw/i386/amd_iommu.? + OpenSBI Firmware M: Bin Meng <bmeng.cn@gmail.com> S: Supported diff --git a/backends/tpm/tpm_backend.c b/backends/tpm/tpm_backend.c index 375587e743..485a20b9e0 100644 --- a/backends/tpm/tpm_backend.c +++ b/backends/tpm/tpm_backend.c @@ -100,8 +100,6 @@ bool tpm_backend_had_startup_error(TPMBackend *s) void tpm_backend_deliver_request(TPMBackend *s, TPMBackendCmd *cmd) { - ThreadPool *pool = aio_get_thread_pool(qemu_get_aio_context()); - if (s->cmd != NULL) { error_report("There is a TPM request pending"); return; @@ -109,7 +107,7 @@ void tpm_backend_deliver_request(TPMBackend *s, TPMBackendCmd *cmd) s->cmd = cmd; object_ref(OBJECT(s)); - thread_pool_submit_aio(pool, tpm_backend_worker_thread, s, + thread_pool_submit_aio(tpm_backend_worker_thread, s, tpm_backend_request_completed, s); } @@ -5750,7 +5750,7 @@ exit: * sums the size of all data-bearing children. (This excludes backing * children.) 
*/ -static int64_t bdrv_sum_allocated_file_size(BlockDriverState *bs) +static int64_t coroutine_fn bdrv_sum_allocated_file_size(BlockDriverState *bs) { BdrvChild *child; int64_t child_size, sum = 0; diff --git a/block/blkdebug.c b/block/blkdebug.c index 978c8cff9e..addad914b3 100644 --- a/block/blkdebug.c +++ b/block/blkdebug.c @@ -583,8 +583,8 @@ out: return ret; } -static int rule_check(BlockDriverState *bs, uint64_t offset, uint64_t bytes, - BlkdebugIOType iotype) +static int coroutine_fn rule_check(BlockDriverState *bs, uint64_t offset, + uint64_t bytes, BlkdebugIOType iotype) { BDRVBlkdebugState *s = bs->opaque; BlkdebugRule *rule = NULL; diff --git a/block/block-backend.c b/block/block-backend.c index 5566ea059d..fc530ded6a 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -80,9 +80,10 @@ struct BlockBackend { NotifierList remove_bs_notifiers, insert_bs_notifiers; QLIST_HEAD(, BlockBackendAioNotifier) aio_notifiers; - int quiesce_counter; + int quiesce_counter; /* atomic: written under BQL, read by other threads */ + QemuMutex queued_requests_lock; /* protects queued_requests */ CoQueue queued_requests; - bool disable_request_queuing; + bool disable_request_queuing; /* atomic */ VMChangeStateEntry *vmsh; bool force_allow_inactivate; @@ -368,6 +369,7 @@ BlockBackend *blk_new(AioContext *ctx, uint64_t perm, uint64_t shared_perm) block_acct_init(&blk->stats); + qemu_mutex_init(&blk->queued_requests_lock); qemu_co_queue_init(&blk->queued_requests); notifier_list_init(&blk->remove_bs_notifiers); notifier_list_init(&blk->insert_bs_notifiers); @@ -485,6 +487,8 @@ static void blk_delete(BlockBackend *blk) assert(QLIST_EMPTY(&blk->remove_bs_notifiers.notifiers)); assert(QLIST_EMPTY(&blk->insert_bs_notifiers.notifiers)); assert(QLIST_EMPTY(&blk->aio_notifiers)); + assert(qemu_co_queue_empty(&blk->queued_requests)); + qemu_mutex_destroy(&blk->queued_requests_lock); QTAILQ_REMOVE(&block_backends, blk, link); drive_info_del(blk->legacy_dinfo); block_acct_cleanup(&blk->stats); @@ -1057,7 +1061,7 @@ void blk_set_dev_ops(BlockBackend *blk, const BlockDevOps *ops, blk->dev_opaque = opaque; /* Are we currently quiesced? Should we enforce this right now? */ - if (blk->quiesce_counter && ops && ops->drained_begin) { + if (qatomic_read(&blk->quiesce_counter) && ops && ops->drained_begin) { ops->drained_begin(opaque); } } @@ -1232,7 +1236,7 @@ void blk_set_allow_aio_context_change(BlockBackend *blk, bool allow) void blk_set_disable_request_queuing(BlockBackend *blk, bool disable) { IO_CODE(); - blk->disable_request_queuing = disable; + qatomic_set(&blk->disable_request_queuing, disable); } static int coroutine_fn GRAPH_RDLOCK @@ -1271,10 +1275,18 @@ static void coroutine_fn blk_wait_while_drained(BlockBackend *blk) { assert(blk->in_flight > 0); - if (blk->quiesce_counter && !blk->disable_request_queuing) { + if (qatomic_read(&blk->quiesce_counter) && + !qatomic_read(&blk->disable_request_queuing)) { + /* + * Take lock before decrementing in flight counter so main loop thread + * waits for us to enqueue ourselves before it can leave the drained + * section. 
+ */ + qemu_mutex_lock(&blk->queued_requests_lock); blk_dec_in_flight(blk); - qemu_co_queue_wait(&blk->queued_requests, NULL); + qemu_co_queue_wait(&blk->queued_requests, &blk->queued_requests_lock); blk_inc_in_flight(blk); + qemu_mutex_unlock(&blk->queued_requests_lock); } } @@ -1862,14 +1874,8 @@ void blk_drain_all(void) bdrv_drain_all_begin(); while ((blk = blk_all_next(blk)) != NULL) { - AioContext *ctx = blk_get_aio_context(blk); - - aio_context_acquire(ctx); - /* We may have -ENOMEDIUM completions in flight */ - AIO_WAIT_WHILE(ctx, qatomic_read(&blk->in_flight) > 0); - - aio_context_release(ctx); + AIO_WAIT_WHILE_UNLOCKED(NULL, qatomic_read(&blk->in_flight) > 0); } bdrv_drain_all_end(); @@ -2595,7 +2601,7 @@ static void blk_root_drained_begin(BdrvChild *child) BlockBackend *blk = child->opaque; ThrottleGroupMember *tgm = &blk->public.throttle_group_member; - if (++blk->quiesce_counter == 1) { + if (qatomic_fetch_inc(&blk->quiesce_counter) == 0) { if (blk->dev_ops && blk->dev_ops->drained_begin) { blk->dev_ops->drained_begin(blk->dev_opaque); } @@ -2613,7 +2619,7 @@ static bool blk_root_drained_poll(BdrvChild *child) { BlockBackend *blk = child->opaque; bool busy = false; - assert(blk->quiesce_counter); + assert(qatomic_read(&blk->quiesce_counter)); if (blk->dev_ops && blk->dev_ops->drained_poll) { busy = blk->dev_ops->drained_poll(blk->dev_opaque); @@ -2624,18 +2630,21 @@ static bool blk_root_drained_poll(BdrvChild *child) static void blk_root_drained_end(BdrvChild *child) { BlockBackend *blk = child->opaque; - assert(blk->quiesce_counter); + assert(qatomic_read(&blk->quiesce_counter)); assert(blk->public.throttle_group_member.io_limits_disabled); qatomic_dec(&blk->public.throttle_group_member.io_limits_disabled); - if (--blk->quiesce_counter == 0) { + if (qatomic_fetch_dec(&blk->quiesce_counter) == 1) { if (blk->dev_ops && blk->dev_ops->drained_end) { blk->dev_ops->drained_end(blk->dev_opaque); } - while (qemu_co_enter_next(&blk->queued_requests, NULL)) { + qemu_mutex_lock(&blk->queued_requests_lock); + while (qemu_co_enter_next(&blk->queued_requests, + &blk->queued_requests_lock)) { /* Resume all queued requests */ } + qemu_mutex_unlock(&blk->queued_requests_lock); } } diff --git a/block/dmg.c b/block/dmg.c index e10b9a2ba5..2769900359 100644 --- a/block/dmg.c +++ b/block/dmg.c @@ -31,11 +31,8 @@ #include "qemu/memalign.h" #include "dmg.h" -int (*dmg_uncompress_bz2)(char *next_in, unsigned int avail_in, - char *next_out, unsigned int avail_out); - -int (*dmg_uncompress_lzfse)(char *next_in, unsigned int avail_in, - char *next_out, unsigned int avail_out); +BdrvDmgUncompressFunc *dmg_uncompress_bz2; +BdrvDmgUncompressFunc *dmg_uncompress_lzfse; enum { /* Limit chunk sizes to prevent unreasonable amounts of memory being used diff --git a/block/dmg.h b/block/dmg.h index e488601b62..dcd6165e63 100644 --- a/block/dmg.h +++ b/block/dmg.h @@ -51,10 +51,10 @@ typedef struct BDRVDMGState { z_stream zstream; } BDRVDMGState; -extern int (*dmg_uncompress_bz2)(char *next_in, unsigned int avail_in, - char *next_out, unsigned int avail_out); +typedef int BdrvDmgUncompressFunc(char *next_in, unsigned int avail_in, + char *next_out, unsigned int avail_out); -extern int (*dmg_uncompress_lzfse)(char *next_in, unsigned int avail_in, - char *next_out, unsigned int avail_out); +extern BdrvDmgUncompressFunc *dmg_uncompress_bz2; +extern BdrvDmgUncompressFunc *dmg_uncompress_lzfse; #endif diff --git a/block/export/export.c b/block/export/export.c index 28a91c9c42..e3fee60611 100644 --- 
a/block/export/export.c +++ b/block/export/export.c @@ -306,7 +306,7 @@ void blk_exp_close_all_type(BlockExportType type) blk_exp_request_shutdown(exp); } - AIO_WAIT_WHILE(NULL, blk_exp_has_type(type)); + AIO_WAIT_WHILE_UNLOCKED(NULL, blk_exp_has_type(type)); } void blk_exp_close_all(void) diff --git a/block/export/vhost-user-blk-server.c b/block/export/vhost-user-blk-server.c index 3409d9e02e..e56b92f2e2 100644 --- a/block/export/vhost-user-blk-server.c +++ b/block/export/vhost-user-blk-server.c @@ -10,6 +10,7 @@ * later. See the COPYING file in the top-level directory. */ #include "qemu/osdep.h" +#include "qemu/error-report.h" #include "block/block.h" #include "subprojects/libvhost-user/libvhost-user.h" /* only for the type definitions */ #include "standard-headers/linux/virtio_blk.h" @@ -251,6 +252,27 @@ static void vu_blk_exp_request_shutdown(BlockExport *exp) vhost_user_server_stop(&vexp->vu_server); } +static void vu_blk_exp_resize(void *opaque) +{ + VuBlkExport *vexp = opaque; + BlockDriverState *bs = blk_bs(vexp->handler.blk); + int64_t new_size = bdrv_getlength(bs); + + if (new_size < 0) { + error_printf("Failed to get length of block node '%s'", + bdrv_get_node_name(bs)); + return; + } + + vexp->blkcfg.capacity = cpu_to_le64(new_size >> VIRTIO_BLK_SECTOR_BITS); + + vu_config_change_msg(&vexp->vu_server.vu_dev); +} + +static const BlockDevOps vu_blk_dev_ops = { + .resize_cb = vu_blk_exp_resize, +}; + static int vu_blk_exp_create(BlockExport *exp, BlockExportOptions *opts, Error **errp) { @@ -292,6 +314,8 @@ static int vu_blk_exp_create(BlockExport *exp, BlockExportOptions *opts, blk_add_aio_context_notifier(exp->blk, blk_aio_attached, blk_aio_detach, vexp); + blk_set_dev_ops(exp->blk, &vu_blk_dev_ops, vexp); + if (!vhost_user_server_start(&vexp->vu_server, vu_opts->addr, exp->ctx, num_queues, &vu_blk_iface, errp)) { blk_remove_aio_context_notifier(exp->blk, blk_aio_attached, diff --git a/block/file-posix.c b/block/file-posix.c index c2dee3f056..c7b723368e 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -2040,12 +2040,9 @@ out: return result; } -static int coroutine_fn raw_thread_pool_submit(BlockDriverState *bs, - ThreadPoolFunc func, void *arg) +static int coroutine_fn raw_thread_pool_submit(ThreadPoolFunc func, void *arg) { - /* @bs can be NULL, bdrv_get_aio_context() returns the main context then */ - ThreadPool *pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); - return thread_pool_submit_co(pool, func, arg); + return thread_pool_submit_co(func, arg); } /* @@ -2089,16 +2086,13 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset, type |= QEMU_AIO_MISALIGNED; #ifdef CONFIG_LINUX_IO_URING } else if (s->use_linux_io_uring) { - LuringState *aio = aio_get_linux_io_uring(bdrv_get_aio_context(bs)); assert(qiov->size == bytes); - return luring_co_submit(bs, aio, s->fd, offset, qiov, type); + return luring_co_submit(bs, s->fd, offset, qiov, type); #endif #ifdef CONFIG_LINUX_AIO } else if (s->use_linux_aio) { - LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs)); assert(qiov->size == bytes); - return laio_co_submit(bs, aio, s->fd, offset, qiov, type, - s->aio_max_batch); + return laio_co_submit(s->fd, offset, qiov, type, s->aio_max_batch); #endif } @@ -2115,7 +2109,7 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset, }; assert(qiov->size == bytes); - return raw_thread_pool_submit(bs, handle_aiocb_rw, &acb); + return raw_thread_pool_submit(handle_aiocb_rw, &acb); } static int coroutine_fn 
raw_co_preadv(BlockDriverState *bs, int64_t offset, @@ -2137,14 +2131,12 @@ static void coroutine_fn raw_co_io_plug(BlockDriverState *bs) BDRVRawState __attribute__((unused)) *s = bs->opaque; #ifdef CONFIG_LINUX_AIO if (s->use_linux_aio) { - LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs)); - laio_io_plug(bs, aio); + laio_io_plug(); } #endif #ifdef CONFIG_LINUX_IO_URING if (s->use_linux_io_uring) { - LuringState *aio = aio_get_linux_io_uring(bdrv_get_aio_context(bs)); - luring_io_plug(bs, aio); + luring_io_plug(); } #endif } @@ -2154,14 +2146,12 @@ static void coroutine_fn raw_co_io_unplug(BlockDriverState *bs) BDRVRawState __attribute__((unused)) *s = bs->opaque; #ifdef CONFIG_LINUX_AIO if (s->use_linux_aio) { - LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs)); - laio_io_unplug(bs, aio, s->aio_max_batch); + laio_io_unplug(s->aio_max_batch); } #endif #ifdef CONFIG_LINUX_IO_URING if (s->use_linux_io_uring) { - LuringState *aio = aio_get_linux_io_uring(bdrv_get_aio_context(bs)); - luring_io_unplug(bs, aio); + luring_io_unplug(); } #endif } @@ -2185,11 +2175,10 @@ static int coroutine_fn raw_co_flush_to_disk(BlockDriverState *bs) #ifdef CONFIG_LINUX_IO_URING if (s->use_linux_io_uring) { - LuringState *aio = aio_get_linux_io_uring(bdrv_get_aio_context(bs)); - return luring_co_submit(bs, aio, s->fd, 0, NULL, QEMU_AIO_FLUSH); + return luring_co_submit(bs, s->fd, 0, NULL, QEMU_AIO_FLUSH); } #endif - return raw_thread_pool_submit(bs, handle_aiocb_flush, &acb); + return raw_thread_pool_submit(handle_aiocb_flush, &acb); } static void raw_aio_attach_aio_context(BlockDriverState *bs, @@ -2251,7 +2240,7 @@ raw_regular_truncate(BlockDriverState *bs, int fd, int64_t offset, }, }; - return raw_thread_pool_submit(bs, handle_aiocb_truncate, &acb); + return raw_thread_pool_submit(handle_aiocb_truncate, &acb); } static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset, @@ -3000,7 +2989,7 @@ raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes, acb.aio_type |= QEMU_AIO_BLKDEV; } - ret = raw_thread_pool_submit(bs, handle_aiocb_discard, &acb); + ret = raw_thread_pool_submit(handle_aiocb_discard, &acb); raw_account_discard(s, bytes, ret); return ret; } @@ -3075,7 +3064,7 @@ raw_do_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int64_t bytes, handler = handle_aiocb_write_zeroes; } - return raw_thread_pool_submit(bs, handler, &acb); + return raw_thread_pool_submit(handler, &acb); } static int coroutine_fn raw_co_pwrite_zeroes( @@ -3313,7 +3302,7 @@ raw_co_copy_range_to(BlockDriverState *bs, }, }; - return raw_thread_pool_submit(bs, handle_aiocb_copy_range, &acb); + return raw_thread_pool_submit(handle_aiocb_copy_range, &acb); } BlockDriver bdrv_file = { @@ -3643,7 +3632,7 @@ hdev_co_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) struct sg_io_hdr *io_hdr = buf; if (io_hdr->cmdp[0] == PERSISTENT_RESERVE_OUT || io_hdr->cmdp[0] == PERSISTENT_RESERVE_IN) { - return pr_manager_execute(s->pr_mgr, bdrv_get_aio_context(bs), + return pr_manager_execute(s->pr_mgr, qemu_get_current_aio_context(), s->fd, io_hdr); } } @@ -3659,7 +3648,7 @@ hdev_co_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) }, }; - return raw_thread_pool_submit(bs, handle_aiocb_ioctl, &acb); + return raw_thread_pool_submit(handle_aiocb_ioctl, &acb); } #endif /* linux */ diff --git a/block/file-win32.c b/block/file-win32.c index 1763b8662e..48b790d917 100644 --- a/block/file-win32.c +++ b/block/file-win32.c @@ -153,7 +153,6 @@ static BlockAIOCB 
*paio_submit(BlockDriverState *bs, HANDLE hfile, BlockCompletionFunc *cb, void *opaque, int type) { RawWin32AIOData *acb = g_new(RawWin32AIOData, 1); - ThreadPool *pool; acb->bs = bs; acb->hfile = hfile; @@ -168,8 +167,7 @@ static BlockAIOCB *paio_submit(BlockDriverState *bs, HANDLE hfile, acb->aio_offset = offset; trace_file_paio_submit(acb, opaque, offset, count, type); - pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); - return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque); + return thread_pool_submit_aio(aio_worker, acb, cb, opaque); } int qemu_ftruncate64(int fd, int64_t length) diff --git a/block/graph-lock.c b/block/graph-lock.c index 454c31e691..639526608f 100644 --- a/block/graph-lock.c +++ b/block/graph-lock.c @@ -127,7 +127,7 @@ void bdrv_graph_wrlock(void) * reader lock. */ qatomic_set(&has_writer, 0); - AIO_WAIT_WHILE(qemu_get_aio_context(), reader_count() >= 1); + AIO_WAIT_WHILE_UNLOCKED(NULL, reader_count() >= 1); qatomic_set(&has_writer, 1); /* diff --git a/block/io.c b/block/io.c index 2e267a85ab..6fa1993374 100644 --- a/block/io.c +++ b/block/io.c @@ -524,7 +524,7 @@ void bdrv_drain_all_begin(void) bdrv_drain_all_begin_nopoll(); /* Now poll the in-flight requests */ - AIO_WAIT_WHILE(NULL, bdrv_drain_all_poll()); + AIO_WAIT_WHILE_UNLOCKED(NULL, bdrv_drain_all_poll()); while ((bs = bdrv_next_all_states(bs))) { bdrv_drain_assert_idle(bs); diff --git a/block/io_uring.c b/block/io_uring.c index 973e15d876..989f9a99ed 100644 --- a/block/io_uring.c +++ b/block/io_uring.c @@ -18,6 +18,9 @@ #include "qapi/error.h" #include "trace.h" +/* Only used for assertions. */ +#include "qemu/coroutine_int.h" + /* io_uring ring size */ #define MAX_ENTRIES 128 @@ -50,10 +53,9 @@ typedef struct LuringState { struct io_uring ring; - /* io queue for submit at batch. Protected by AioContext lock. */ + /* No locking required, only accessed from AioContext home thread */ LuringQueue io_q; - /* I/O completion processing. Only runs in I/O thread. */ QEMUBH *completion_bh; } LuringState; @@ -209,6 +211,7 @@ end: * eventually runs later. Coroutines cannot be entered recursively * so avoid doing that! 
*/ + assert(luringcb->co->ctx == s->aio_context); if (!qemu_coroutine_entered(luringcb->co)) { aio_co_wake(luringcb->co); } @@ -262,13 +265,11 @@ static int ioq_submit(LuringState *s) static void luring_process_completions_and_submit(LuringState *s) { - aio_context_acquire(s->aio_context); luring_process_completions(s); if (!s->io_q.plugged && s->io_q.in_queue > 0) { ioq_submit(s); } - aio_context_release(s->aio_context); } static void qemu_luring_completion_bh(void *opaque) @@ -306,14 +307,18 @@ static void ioq_init(LuringQueue *io_q) io_q->blocked = false; } -void luring_io_plug(BlockDriverState *bs, LuringState *s) +void luring_io_plug(void) { + AioContext *ctx = qemu_get_current_aio_context(); + LuringState *s = aio_get_linux_io_uring(ctx); trace_luring_io_plug(s); s->io_q.plugged++; } -void luring_io_unplug(BlockDriverState *bs, LuringState *s) +void luring_io_unplug(void) { + AioContext *ctx = qemu_get_current_aio_context(); + LuringState *s = aio_get_linux_io_uring(ctx); assert(s->io_q.plugged); trace_luring_io_unplug(s, s->io_q.blocked, s->io_q.plugged, s->io_q.in_queue, s->io_q.in_flight); @@ -373,10 +378,12 @@ static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s, return 0; } -int coroutine_fn luring_co_submit(BlockDriverState *bs, LuringState *s, int fd, - uint64_t offset, QEMUIOVector *qiov, int type) +int coroutine_fn luring_co_submit(BlockDriverState *bs, int fd, uint64_t offset, + QEMUIOVector *qiov, int type) { int ret; + AioContext *ctx = qemu_get_current_aio_context(); + LuringState *s = aio_get_linux_io_uring(ctx); LuringAIOCB luringcb = { .co = qemu_coroutine_self(), .ret = -EINPROGRESS, diff --git a/block/linux-aio.c b/block/linux-aio.c index d2cfb7f523..fc50cdd1bf 100644 --- a/block/linux-aio.c +++ b/block/linux-aio.c @@ -16,6 +16,9 @@ #include "qemu/coroutine.h" #include "qapi/error.h" +/* Only used for assertions. */ +#include "qemu/coroutine_int.h" + #include <libaio.h> /* @@ -56,10 +59,8 @@ struct LinuxAioState { io_context_t ctx; EventNotifier e; - /* io queue for submit at batch. Protected by AioContext lock. */ + /* No locking required, only accessed from AioContext home thread */ LaioQueue io_q; - - /* I/O completion processing. Only runs in I/O thread. */ QEMUBH *completion_bh; int event_idx; int event_max; @@ -102,6 +103,7 @@ static void qemu_laio_process_completion(struct qemu_laiocb *laiocb) * later. Coroutines cannot be entered recursively so avoid doing * that! 
*/ + assert(laiocb->co->ctx == laiocb->ctx->aio_context); if (!qemu_coroutine_entered(laiocb->co)) { aio_co_wake(laiocb->co); } @@ -232,13 +234,11 @@ static void qemu_laio_process_completions(LinuxAioState *s) static void qemu_laio_process_completions_and_submit(LinuxAioState *s) { - aio_context_acquire(s->aio_context); qemu_laio_process_completions(s); if (!s->io_q.plugged && !QSIMPLEQ_EMPTY(&s->io_q.pending)) { ioq_submit(s); } - aio_context_release(s->aio_context); } static void qemu_laio_completion_bh(void *opaque) @@ -354,14 +354,19 @@ static uint64_t laio_max_batch(LinuxAioState *s, uint64_t dev_max_batch) return max_batch; } -void laio_io_plug(BlockDriverState *bs, LinuxAioState *s) +void laio_io_plug(void) { + AioContext *ctx = qemu_get_current_aio_context(); + LinuxAioState *s = aio_get_linux_aio(ctx); + s->io_q.plugged++; } -void laio_io_unplug(BlockDriverState *bs, LinuxAioState *s, - uint64_t dev_max_batch) +void laio_io_unplug(uint64_t dev_max_batch) { + AioContext *ctx = qemu_get_current_aio_context(); + LinuxAioState *s = aio_get_linux_aio(ctx); + assert(s->io_q.plugged); s->io_q.plugged--; @@ -411,15 +416,15 @@ static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset, return 0; } -int coroutine_fn laio_co_submit(BlockDriverState *bs, LinuxAioState *s, int fd, - uint64_t offset, QEMUIOVector *qiov, int type, - uint64_t dev_max_batch) +int coroutine_fn laio_co_submit(int fd, uint64_t offset, QEMUIOVector *qiov, + int type, uint64_t dev_max_batch) { int ret; + AioContext *ctx = qemu_get_current_aio_context(); struct qemu_laiocb laiocb = { .co = qemu_coroutine_self(), .nbytes = qiov->size, - .ctx = s, + .ctx = aio_get_linux_aio(ctx), .ret = -EINPROGRESS, .is_read = (type == QEMU_AIO_READ), .qiov = qiov, diff --git a/block/mirror.c b/block/mirror.c index 663e2b7002..af9bbd23d4 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -886,9 +886,9 @@ static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s) /* Called when going out of the streaming phase to flush the bulk of the * data to the medium, or just before completing. 
*/ -static int mirror_flush(MirrorBlockJob *s) +static int coroutine_fn mirror_flush(MirrorBlockJob *s) { - int ret = blk_flush(s->target); + int ret = blk_co_flush(s->target); if (ret < 0) { if (mirror_error_action(s, false, -ret) == BLOCK_ERROR_ACTION_REPORT) { s->ret = ret; diff --git a/block/monitor/block-hmp-cmds.c b/block/monitor/block-hmp-cmds.c index 2846083546..ca2599de44 100644 --- a/block/monitor/block-hmp-cmds.c +++ b/block/monitor/block-hmp-cmds.c @@ -214,15 +214,17 @@ void hmp_commit(Monitor *mon, const QDict *qdict) error_report("Device '%s' not found", device); return; } - if (!blk_is_available(blk)) { - error_report("Device '%s' has no medium", device); - return; - } bs = bdrv_skip_implicit_filters(blk_bs(blk)); aio_context = bdrv_get_aio_context(bs); aio_context_acquire(aio_context); + if (!blk_is_available(blk)) { + error_report("Device '%s' has no medium", device); + aio_context_release(aio_context); + return; + } + ret = bdrv_commit(bs); aio_context_release(aio_context); diff --git a/block/qcow2-bitmap.c b/block/qcow2-bitmap.c index 5f456a2785..a952fd58d8 100644 --- a/block/qcow2-bitmap.c +++ b/block/qcow2-bitmap.c @@ -1221,7 +1221,7 @@ out: } /* Checks to see if it's safe to resize bitmaps */ -int qcow2_truncate_bitmaps_check(BlockDriverState *bs, Error **errp) +int coroutine_fn qcow2_truncate_bitmaps_check(BlockDriverState *bs, Error **errp) { BDRVQcow2State *s = bs->opaque; Qcow2BitmapList *bm_list; diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index a9e6622fe3..39cda7f907 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -1126,7 +1126,7 @@ err: * Frees the allocated clusters because the request failed and they won't * actually be linked. */ -void qcow2_alloc_cluster_abort(BlockDriverState *bs, QCowL2Meta *m) +void coroutine_fn qcow2_alloc_cluster_abort(BlockDriverState *bs, QCowL2Meta *m) { BDRVQcow2State *s = bs->opaque; if (!has_data_file(bs) && !m->keep_old_clusters) { @@ -1156,9 +1156,11 @@ void qcow2_alloc_cluster_abort(BlockDriverState *bs, QCowL2Meta *m) * * Returns 0 on success, -errno on failure. */ -static int calculate_l2_meta(BlockDriverState *bs, uint64_t host_cluster_offset, - uint64_t guest_offset, unsigned bytes, - uint64_t *l2_slice, QCowL2Meta **m, bool keep_old) +static int coroutine_fn calculate_l2_meta(BlockDriverState *bs, + uint64_t host_cluster_offset, + uint64_t guest_offset, unsigned bytes, + uint64_t *l2_slice, QCowL2Meta **m, + bool keep_old) { BDRVQcow2State *s = bs->opaque; int sc_index, l2_index = offset_to_l2_slice_index(s, guest_offset); @@ -1599,8 +1601,10 @@ out: * function has been waiting for another request and the allocation must be * restarted, but the whole request should not be failed. 
*/ -static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset, - uint64_t *host_offset, uint64_t *nb_clusters) +static int coroutine_fn do_alloc_cluster_offset(BlockDriverState *bs, + uint64_t guest_offset, + uint64_t *host_offset, + uint64_t *nb_clusters) { BDRVQcow2State *s = bs->opaque; @@ -2065,8 +2069,9 @@ static int zero_in_l2_slice(BlockDriverState *bs, uint64_t offset, return nb_clusters; } -static int zero_l2_subclusters(BlockDriverState *bs, uint64_t offset, - unsigned nb_subclusters) +static int coroutine_fn +zero_l2_subclusters(BlockDriverState *bs, uint64_t offset, + unsigned nb_subclusters) { BDRVQcow2State *s = bs->opaque; uint64_t *l2_slice; diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index b092f89da9..b2a81ff707 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -1030,8 +1030,8 @@ int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size) return offset; } -int64_t qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset, - int64_t nb_clusters) +int64_t coroutine_fn qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset, + int64_t nb_clusters) { BDRVQcow2State *s = bs->opaque; uint64_t cluster_index, refcount; @@ -1069,7 +1069,7 @@ int64_t qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset, /* only used to allocate compressed sectors. We try to allocate contiguous sectors. size must be <= cluster_size */ -int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size) +int64_t coroutine_fn qcow2_alloc_bytes(BlockDriverState *bs, int size) { BDRVQcow2State *s = bs->opaque; int64_t offset; @@ -3685,7 +3685,7 @@ out: return ret; } -int64_t qcow2_get_last_cluster(BlockDriverState *bs, int64_t size) +int64_t coroutine_fn qcow2_get_last_cluster(BlockDriverState *bs, int64_t size) { BDRVQcow2State *s = bs->opaque; int64_t i; diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c index 62e8a0335d..92e47978bf 100644 --- a/block/qcow2-snapshot.c +++ b/block/qcow2-snapshot.c @@ -77,10 +77,11 @@ void qcow2_free_snapshots(BlockDriverState *bs) * qcow2_check_refcounts() does not do anything with snapshots' * extra data.) 
*/ -static int qcow2_do_read_snapshots(BlockDriverState *bs, bool repair, - int *nb_clusters_reduced, - int *extra_data_dropped, - Error **errp) +static coroutine_fn GRAPH_RDLOCK +int qcow2_do_read_snapshots(BlockDriverState *bs, bool repair, + int *nb_clusters_reduced, + int *extra_data_dropped, + Error **errp) { BDRVQcow2State *s = bs->opaque; QCowSnapshotHeader h; @@ -108,7 +109,7 @@ static int qcow2_do_read_snapshots(BlockDriverState *bs, bool repair, /* Read statically sized part of the snapshot header */ offset = ROUND_UP(offset, 8); - ret = bdrv_pread(bs->file, offset, sizeof(h), &h, 0); + ret = bdrv_co_pread(bs->file, offset, sizeof(h), &h, 0); if (ret < 0) { error_setg_errno(errp, -ret, "Failed to read snapshot table"); goto fail; @@ -146,8 +147,8 @@ static int qcow2_do_read_snapshots(BlockDriverState *bs, bool repair, } /* Read known extra data */ - ret = bdrv_pread(bs->file, offset, - MIN(sizeof(extra), sn->extra_data_size), &extra, 0); + ret = bdrv_co_pread(bs->file, offset, + MIN(sizeof(extra), sn->extra_data_size), &extra, 0); if (ret < 0) { error_setg_errno(errp, -ret, "Failed to read snapshot table"); goto fail; @@ -184,8 +185,8 @@ static int qcow2_do_read_snapshots(BlockDriverState *bs, bool repair, /* Store unknown extra data */ unknown_extra_data_size = sn->extra_data_size - sizeof(extra); sn->unknown_extra_data = g_malloc(unknown_extra_data_size); - ret = bdrv_pread(bs->file, offset, unknown_extra_data_size, - sn->unknown_extra_data, 0); + ret = bdrv_co_pread(bs->file, offset, unknown_extra_data_size, + sn->unknown_extra_data, 0); if (ret < 0) { error_setg_errno(errp, -ret, "Failed to read snapshot table"); @@ -196,7 +197,7 @@ static int qcow2_do_read_snapshots(BlockDriverState *bs, bool repair, /* Read snapshot ID */ sn->id_str = g_malloc(id_str_size + 1); - ret = bdrv_pread(bs->file, offset, id_str_size, sn->id_str, 0); + ret = bdrv_co_pread(bs->file, offset, id_str_size, sn->id_str, 0); if (ret < 0) { error_setg_errno(errp, -ret, "Failed to read snapshot table"); goto fail; @@ -206,7 +207,7 @@ static int qcow2_do_read_snapshots(BlockDriverState *bs, bool repair, /* Read snapshot name */ sn->name = g_malloc(name_size + 1); - ret = bdrv_pread(bs->file, offset, name_size, sn->name, 0); + ret = bdrv_co_pread(bs->file, offset, name_size, sn->name, 0); if (ret < 0) { error_setg_errno(errp, -ret, "Failed to read snapshot table"); goto fail; @@ -261,7 +262,7 @@ fail: return ret; } -int qcow2_read_snapshots(BlockDriverState *bs, Error **errp) +int coroutine_fn qcow2_read_snapshots(BlockDriverState *bs, Error **errp) { return qcow2_do_read_snapshots(bs, false, NULL, NULL, errp); } diff --git a/block/qcow2-threads.c b/block/qcow2-threads.c index 953bbe6df8..d6071a1eae 100644 --- a/block/qcow2-threads.c +++ b/block/qcow2-threads.c @@ -43,7 +43,6 @@ qcow2_co_process(BlockDriverState *bs, ThreadPoolFunc *func, void *arg) { int ret; BDRVQcow2State *s = bs->opaque; - ThreadPool *pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); qemu_co_mutex_lock(&s->lock); while (s->nb_threads >= QCOW2_MAX_THREADS) { @@ -52,7 +51,7 @@ qcow2_co_process(BlockDriverState *bs, ThreadPoolFunc *func, void *arg) s->nb_threads++; qemu_co_mutex_unlock(&s->lock); - ret = thread_pool_submit_co(pool, func, arg); + ret = thread_pool_submit_co(func, arg); qemu_co_mutex_lock(&s->lock); s->nb_threads--; diff --git a/block/qcow2.c b/block/qcow2.c index 30fd53fa64..fe5def438e 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -199,10 +199,10 @@ qcow2_extract_crypto_opts(QemuOpts *opts, const char *fmt, Error 
**errp) * unknown magic is skipped (future extension this version knows nothing about) * return 0 upon success, non-0 otherwise */ -static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset, - uint64_t end_offset, void **p_feature_table, - int flags, bool *need_update_header, - Error **errp) +static int coroutine_fn GRAPH_RDLOCK +qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset, + uint64_t end_offset, void **p_feature_table, + int flags, bool *need_update_header, Error **errp) { BDRVQcow2State *s = bs->opaque; QCowExtension ext; @@ -228,7 +228,7 @@ static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset, printf("attempting to read extended header in offset %lu\n", offset); #endif - ret = bdrv_pread(bs->file, offset, sizeof(ext), &ext, 0); + ret = bdrv_co_pread(bs->file, offset, sizeof(ext), &ext, 0); if (ret < 0) { error_setg_errno(errp, -ret, "qcow2_read_extension: ERROR: " "pread fail from offset %" PRIu64, offset); @@ -256,7 +256,7 @@ static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset, sizeof(bs->backing_format)); return 2; } - ret = bdrv_pread(bs->file, offset, ext.len, bs->backing_format, 0); + ret = bdrv_co_pread(bs->file, offset, ext.len, bs->backing_format, 0); if (ret < 0) { error_setg_errno(errp, -ret, "ERROR: ext_backing_format: " "Could not read format name"); @@ -272,7 +272,7 @@ static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset, case QCOW2_EXT_MAGIC_FEATURE_TABLE: if (p_feature_table != NULL) { void *feature_table = g_malloc0(ext.len + 2 * sizeof(Qcow2Feature)); - ret = bdrv_pread(bs->file, offset, ext.len, feature_table, 0); + ret = bdrv_co_pread(bs->file, offset, ext.len, feature_table, 0); if (ret < 0) { error_setg_errno(errp, -ret, "ERROR: ext_feature_table: " "Could not read table"); @@ -298,7 +298,7 @@ static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset, return -EINVAL; } - ret = bdrv_pread(bs->file, offset, ext.len, &s->crypto_header, 0); + ret = bdrv_co_pread(bs->file, offset, ext.len, &s->crypto_header, 0); if (ret < 0) { error_setg_errno(errp, -ret, "Unable to read CRYPTO header extension"); @@ -354,7 +354,7 @@ static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset, break; } - ret = bdrv_pread(bs->file, offset, ext.len, &bitmaps_ext, 0); + ret = bdrv_co_pread(bs->file, offset, ext.len, &bitmaps_ext, 0); if (ret < 0) { error_setg_errno(errp, -ret, "bitmaps_ext: " "Could not read ext header"); @@ -418,7 +418,7 @@ static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset, case QCOW2_EXT_MAGIC_DATA_FILE: { s->image_data_file = g_malloc0(ext.len + 1); - ret = bdrv_pread(bs->file, offset, ext.len, s->image_data_file, 0); + ret = bdrv_co_pread(bs->file, offset, ext.len, s->image_data_file, 0); if (ret < 0) { error_setg_errno(errp, -ret, "ERROR: Could not read data file name"); @@ -442,7 +442,7 @@ static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset, uext->len = ext.len; QLIST_INSERT_HEAD(&s->unknown_header_ext, uext, next); - ret = bdrv_pread(bs->file, offset, uext->len, uext->data, 0); + ret = bdrv_co_pread(bs->file, offset, uext->len, uext->data, 0); if (ret < 0) { error_setg_errno(errp, -ret, "ERROR: unknown extension: " "Could not read data"); @@ -1241,8 +1241,9 @@ static void qcow2_update_options_abort(BlockDriverState *bs, qapi_free_QCryptoBlockOpenOptions(r->crypto_opts); } -static int qcow2_update_options(BlockDriverState *bs, QDict *options, - int flags, Error **errp) 
+static int coroutine_fn +qcow2_update_options(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { Qcow2ReopenState r = {}; int ret; diff --git a/block/qcow2.h b/block/qcow2.h index c59e33c01c..c75decc38a 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -862,9 +862,9 @@ int64_t qcow2_refcount_area(BlockDriverState *bs, uint64_t offset, uint64_t new_refblock_offset); int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size); -int64_t qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset, - int64_t nb_clusters); -int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size); +int64_t coroutine_fn qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset, + int64_t nb_clusters); +int64_t coroutine_fn qcow2_alloc_bytes(BlockDriverState *bs, int size); void qcow2_free_clusters(BlockDriverState *bs, int64_t offset, int64_t size, enum qcow2_discard_type type); @@ -894,7 +894,7 @@ int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order, BlockDriverAmendStatusCB *status_cb, void *cb_opaque, Error **errp); int coroutine_fn GRAPH_RDLOCK qcow2_shrink_reftable(BlockDriverState *bs); -int64_t qcow2_get_last_cluster(BlockDriverState *bs, int64_t size); +int64_t coroutine_fn qcow2_get_last_cluster(BlockDriverState *bs, int64_t size); int coroutine_fn qcow2_detect_metadata_preallocation(BlockDriverState *bs); /* qcow2-cluster.c functions */ @@ -924,7 +924,7 @@ void qcow2_parse_compressed_l2_entry(BlockDriverState *bs, uint64_t l2_entry, int coroutine_fn GRAPH_RDLOCK qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m); -void qcow2_alloc_cluster_abort(BlockDriverState *bs, QCowL2Meta *m); +void coroutine_fn qcow2_alloc_cluster_abort(BlockDriverState *bs, QCowL2Meta *m); int qcow2_cluster_discard(BlockDriverState *bs, uint64_t offset, uint64_t bytes, enum qcow2_discard_type type, bool full_discard); @@ -951,7 +951,8 @@ int qcow2_snapshot_load_tmp(BlockDriverState *bs, Error **errp); void qcow2_free_snapshots(BlockDriverState *bs); -int qcow2_read_snapshots(BlockDriverState *bs, Error **errp); +int coroutine_fn GRAPH_RDLOCK +qcow2_read_snapshots(BlockDriverState *bs, Error **errp); int qcow2_write_snapshots(BlockDriverState *bs); int coroutine_fn GRAPH_RDLOCK @@ -994,7 +995,7 @@ bool coroutine_fn qcow2_load_dirty_bitmaps(BlockDriverState *bs, bool qcow2_get_bitmap_info_list(BlockDriverState *bs, Qcow2BitmapInfoList **info_list, Error **errp); int qcow2_reopen_bitmaps_rw(BlockDriverState *bs, Error **errp); -int qcow2_truncate_bitmaps_check(BlockDriverState *bs, Error **errp); +int coroutine_fn qcow2_truncate_bitmaps_check(BlockDriverState *bs, Error **errp); bool qcow2_store_persistent_dirty_bitmaps(BlockDriverState *bs, bool release_stored, Error **errp); int qcow2_reopen_bitmaps_ro(BlockDriverState *bs, Error **errp); diff --git a/block/vmdk.c b/block/vmdk.c index f5f49018fe..3f8c731e32 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -376,7 +376,7 @@ out: return ret; } -static int vmdk_is_cid_valid(BlockDriverState *bs) +static int coroutine_fn vmdk_is_cid_valid(BlockDriverState *bs) { BDRVVmdkState *s = bs->opaque; uint32_t cur_pcid; diff --git a/block/vvfat.c b/block/vvfat.c index fd45e86416..0ddc91fc09 100644 --- a/block/vvfat.c +++ b/block/vvfat.c @@ -1053,7 +1053,7 @@ static BDRVVVFATState *vvv = NULL; #endif static int enable_write_target(BlockDriverState *bs, Error **errp); -static int is_consistent(BDRVVVFATState *s); +static int coroutine_fn is_consistent(BDRVVVFATState *s); static QemuOptsList runtime_opts = { .name = "vvfat", @@ -1469,8 
+1469,8 @@ static void print_mapping(const mapping_t* mapping) } #endif -static int vvfat_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) +static int coroutine_fn GRAPH_RDLOCK +vvfat_read(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, int nb_sectors) { BDRVVVFATState *s = bs->opaque; int i; @@ -1490,8 +1490,8 @@ static int vvfat_read(BlockDriverState *bs, int64_t sector_num, DLOG(fprintf(stderr, "sectors %" PRId64 "+%" PRId64 " allocated\n", sector_num, n >> BDRV_SECTOR_BITS)); - if (bdrv_pread(s->qcow, sector_num * BDRV_SECTOR_SIZE, n, - buf + i * 0x200, 0) < 0) { + if (bdrv_co_pread(s->qcow, sector_num * BDRV_SECTOR_SIZE, n, + buf + i * 0x200, 0) < 0) { return -1; } i += (n >> BDRV_SECTOR_BITS) - 1; @@ -1532,7 +1532,7 @@ static int vvfat_read(BlockDriverState *bs, int64_t sector_num, return 0; } -static int coroutine_fn +static int coroutine_fn GRAPH_RDLOCK vvfat_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes, QEMUIOVector *qiov, BdrvRequestFlags flags) { @@ -1796,8 +1796,8 @@ static inline uint32_t modified_fat_get(BDRVVVFATState* s, } } -static inline bool cluster_was_modified(BDRVVVFATState *s, - uint32_t cluster_num) +static inline bool coroutine_fn GRAPH_RDLOCK +cluster_was_modified(BDRVVVFATState *s, uint32_t cluster_num) { int was_modified = 0; int i; @@ -1852,8 +1852,8 @@ typedef enum { * Further, the files/directories handled by this function are * assumed to be *not* deleted (and *only* those). */ -static uint32_t get_cluster_count_for_direntry(BDRVVVFATState* s, - direntry_t* direntry, const char* path) +static uint32_t coroutine_fn GRAPH_RDLOCK +get_cluster_count_for_direntry(BDRVVVFATState* s, direntry_t* direntry, const char* path) { /* * This is a little bit tricky: @@ -1979,9 +1979,9 @@ static uint32_t get_cluster_count_for_direntry(BDRVVVFATState* s, if (res) { return -1; } - res = bdrv_pwrite(s->qcow, offset * BDRV_SECTOR_SIZE, - BDRV_SECTOR_SIZE, s->cluster_buffer, - 0); + res = bdrv_co_pwrite(s->qcow, offset * BDRV_SECTOR_SIZE, + BDRV_SECTOR_SIZE, s->cluster_buffer, + 0); if (res < 0) { return -2; } @@ -2011,8 +2011,8 @@ static uint32_t get_cluster_count_for_direntry(BDRVVVFATState* s, * It returns 0 upon inconsistency or error, and the number of clusters * used by the directory, its subdirectories and their files. */ -static int check_directory_consistency(BDRVVVFATState *s, - int cluster_num, const char* path) +static int coroutine_fn GRAPH_RDLOCK +check_directory_consistency(BDRVVVFATState *s, int cluster_num, const char* path) { int ret = 0; unsigned char* cluster = g_malloc(s->cluster_size); @@ -2138,7 +2138,8 @@ DLOG(fprintf(stderr, "check direntry %d:\n", i); print_direntry(direntries + i)) } /* returns 1 on success */ -static int is_consistent(BDRVVVFATState* s) +static int coroutine_fn GRAPH_RDLOCK +is_consistent(BDRVVVFATState* s) { int i, check; int used_clusters_count = 0; @@ -2414,8 +2415,8 @@ static int commit_mappings(BDRVVVFATState* s, return 0; } -static int commit_direntries(BDRVVVFATState* s, - int dir_index, int parent_mapping_index) +static int coroutine_fn GRAPH_RDLOCK +commit_direntries(BDRVVVFATState* s, int dir_index, int parent_mapping_index) { direntry_t* direntry = array_get(&(s->directory), dir_index); uint32_t first_cluster = dir_index == 0 ? 
0 : begin_of_direntry(direntry); @@ -2504,8 +2505,8 @@ static int commit_direntries(BDRVVVFATState* s, /* commit one file (adjust contents, adjust mapping), return first_mapping_index */ -static int commit_one_file(BDRVVVFATState* s, - int dir_index, uint32_t offset) +static int coroutine_fn GRAPH_RDLOCK +commit_one_file(BDRVVVFATState* s, int dir_index, uint32_t offset) { direntry_t* direntry = array_get(&(s->directory), dir_index); uint32_t c = begin_of_direntry(direntry); @@ -2770,7 +2771,7 @@ static int handle_renames_and_mkdirs(BDRVVVFATState* s) /* * TODO: make sure that the short name is not matching *another* file */ -static int handle_commits(BDRVVVFATState* s) +static int coroutine_fn GRAPH_RDLOCK handle_commits(BDRVVVFATState* s) { int i, fail = 0; @@ -2913,7 +2914,7 @@ static int handle_deletes(BDRVVVFATState* s) * - recurse direntries from root (using bs->bdrv_pread) * - delete files corresponding to mappings marked as deleted */ -static int do_commit(BDRVVVFATState* s) +static int coroutine_fn GRAPH_RDLOCK do_commit(BDRVVVFATState* s) { int ret = 0; @@ -2963,7 +2964,7 @@ DLOG(checkpoint()); return 0; } -static int try_commit(BDRVVVFATState* s) +static int coroutine_fn GRAPH_RDLOCK try_commit(BDRVVVFATState* s) { vvfat_close_current_file(s); DLOG(checkpoint()); @@ -2972,8 +2973,9 @@ DLOG(checkpoint()); return do_commit(s); } -static int vvfat_write(BlockDriverState *bs, int64_t sector_num, - const uint8_t *buf, int nb_sectors) +static int coroutine_fn GRAPH_RDLOCK +vvfat_write(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors) { BDRVVVFATState *s = bs->opaque; int i, ret; @@ -3082,8 +3084,8 @@ DLOG(checkpoint()); * Use qcow backend. Commit later. */ DLOG(fprintf(stderr, "Write to qcow backend: %d + %d\n", (int)sector_num, nb_sectors)); - ret = bdrv_pwrite(s->qcow, sector_num * BDRV_SECTOR_SIZE, - nb_sectors * BDRV_SECTOR_SIZE, buf, 0); + ret = bdrv_co_pwrite(s->qcow, sector_num * BDRV_SECTOR_SIZE, + nb_sectors * BDRV_SECTOR_SIZE, buf, 0); if (ret < 0) { fprintf(stderr, "Error writing to qcow backend\n"); return ret; @@ -3103,7 +3105,7 @@ DLOG(checkpoint()); return 0; } -static int coroutine_fn +static int coroutine_fn GRAPH_RDLOCK vvfat_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes, QEMUIOVector *qiov, BdrvRequestFlags flags) { diff --git a/docs/devel/acpi-bits.rst b/docs/devel/acpi-bits.rst index 9eb4b9e666..22e2580200 100644 --- a/docs/devel/acpi-bits.rst +++ b/docs/devel/acpi-bits.rst @@ -135,7 +135,7 @@ Under ``tests/avocado/`` as the root we have: (c) They need not be loaded by avocado framework when running tests. -Author: Ani Sinha <ani@anisinha.ca> +Author: Ani Sinha <anisinha@redhat.com> References: ----------- diff --git a/docs/interop/vhost-user.rst b/docs/interop/vhost-user.rst index 8a5924ea75..5a070adbc1 100644 --- a/docs/interop/vhost-user.rst +++ b/docs/interop/vhost-user.rst @@ -130,18 +130,8 @@ A vring address description Note that a ring address is an IOVA if ``VIRTIO_F_IOMMU_PLATFORM`` has been negotiated. Otherwise it is a user address. -Memory regions description -^^^^^^^^^^^^^^^^^^^^^^^^^^ - -+-------------+---------+---------+-----+---------+ -| num regions | padding | region0 | ... 
| region7 | -+-------------+---------+---------+-----+---------+ - -:num regions: a 32-bit number of regions - -:padding: 32-bit - -A region is: +Memory region description +^^^^^^^^^^^^^^^^^^^^^^^^^ +---------------+------+--------------+-------------+ | guest address | size | user address | mmap offset | @@ -155,22 +145,49 @@ A region is: :mmap offset: 64-bit offset where region starts in the mapped memory +When the ``VHOST_USER_PROTOCOL_F_XEN_MMAP`` protocol feature has been +successfully negotiated, the memory region description contains two extra +fields at the end. + ++---------------+------+--------------+-------------+----------------+-------+ +| guest address | size | user address | mmap offset | xen mmap flags | domid | ++---------------+------+--------------+-------------+----------------+-------+ + +:xen mmap flags: 32-bit bit field + +- Bit 0 is set for Xen foreign memory mapping. +- Bit 1 is set for Xen grant memory mapping. +- Bit 8 is set if the memory region can not be mapped in advance, and memory + areas within this region must be mapped / unmapped only when required by the + back-end. The back-end shouldn't try to map the entire region at once, as the + front-end may not allow it. The back-end should rather map only the required + amount of memory at once and unmap it after it is used. + +:domid: a 32-bit Xen hypervisor specific domain id. + Single memory region description ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -+---------+---------------+------+--------------+-------------+ -| padding | guest address | size | user address | mmap offset | -+---------+---------------+------+--------------+-------------+ ++---------+--------+ +| padding | region | ++---------+--------+ :padding: 64-bit -:guest address: a 64-bit guest address of the region +A region is represented by Memory region description. -:size: a 64-bit size +Multiple Memory regions description +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -:user address: a 64-bit user address ++-------------+---------+---------+-----+---------+ +| num regions | padding | region0 | ... | region7 | ++-------------+---------+---------+-----+---------+ -:mmap offset: 64-bit offset where region starts in the mapped memory +:num regions: a 32-bit number of regions + +:padding: 32-bit + +A region is represented by Memory region description. Log description ^^^^^^^^^^^^^^^ @@ -867,6 +884,7 @@ Protocol features #define VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS 14 #define VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS 15 #define VHOST_USER_PROTOCOL_F_STATUS 16 + #define VHOST_USER_PROTOCOL_F_XEN_MMAP 17 Front-end message types ----------------------- @@ -952,8 +970,8 @@ Front-end message types ``VHOST_USER_SET_MEM_TABLE`` :id: 5 :equivalent ioctl: ``VHOST_SET_MEM_TABLE`` - :request payload: memory regions description - :reply payload: (postcopy only) memory regions description + :request payload: multiple memory regions description + :reply payload: (postcopy only) multiple memory regions description Sets the memory map regions on the back-end so it can translate the vring addresses. In the ancillary data there is an array of file diff --git a/docs/pcie_sriov.txt b/docs/pcie_sriov.txt index 11158dbf88..7eff7f2703 100644 --- a/docs/pcie_sriov.txt +++ b/docs/pcie_sriov.txt @@ -9,10 +9,7 @@ virtual functions (VFs) for the main purpose of eliminating software overhead in I/O from virtual machines. QEMU now implements the basic common functionality to enable an emulated device -to support SR/IOV. 
Yet no fully implemented devices exists in QEMU, but a -proof-of-concept hack of the Intel igb can be found here: - -git://github.com/knuto/qemu.git sriov_patches_v5 +to support SR/IOV. Implementation ============== diff --git a/docs/specs/index.rst b/docs/specs/index.rst index a58d9311cb..e58be38c41 100644 --- a/docs/specs/index.rst +++ b/docs/specs/index.rst @@ -8,6 +8,9 @@ guest hardware that is specific to QEMU. .. toctree:: :maxdepth: 2 + pci-ids + pci-serial + pci-testdev ppc-xive ppc-spapr-xive ppc-spapr-numa diff --git a/docs/specs/pci-ids.rst b/docs/specs/pci-ids.rst new file mode 100644 index 0000000000..e302bea484 --- /dev/null +++ b/docs/specs/pci-ids.rst @@ -0,0 +1,98 @@ +================ +PCI IDs for QEMU +================ + +Red Hat, Inc. donates a part of its device ID range to QEMU, to be used for +virtual devices. The vendor IDs are 1af4 (formerly Qumranet ID) and 1b36. + +Contact Gerd Hoffmann <kraxel@redhat.com> to get a device ID assigned +for your devices. + +1af4 vendor ID +-------------- + +The 1000 -> 10ff device ID range is used as follows for virtio-pci devices. +Note that this allocation is separate from the virtio device IDs, which are +maintained as part of the virtio specification. + +1af4:1000 + network device (legacy) +1af4:1001 + block device (legacy) +1af4:1002 + balloon device (legacy) +1af4:1003 + console device (legacy) +1af4:1004 + SCSI host bus adapter device (legacy) +1af4:1005 + entropy generator device (legacy) +1af4:1009 + 9p filesystem device (legacy) +1af4:1012 + vsock device (bug compatibility) + +1af4:1040 to 1af4:10ef + ID range for modern virtio devices. The PCI device + ID is calculated from the virtio device ID by adding the + 0x1040 offset. The virtio IDs are defined in the virtio + specification. The Linux kernel has a header file with + defines for all virtio IDs (``linux/virtio_ids.h``); QEMU has a + copy in ``include/standard-headers/``. + +1af4:10f0 to 1a4f:10ff + Available for experimental usage without registration. Must get + official ID when the code leaves the test lab (i.e. when seeking + upstream merge or shipping a distro/product) to avoid conflicts. + +1af4:1100 + Used as PCI Subsystem ID for existing hardware devices emulated + by QEMU. + +1af4:1110 + ivshmem device (shared memory, ``docs/specs/ivshmem-spec.txt``) + +All other device IDs are reserved. + +1b36 vendor ID +-------------- + +The 0000 -> 00ff device ID range is used as follows for QEMU-specific +PCI devices (other than virtio): + +1b36:0001 + PCI-PCI bridge +1b36:0002 + PCI serial port (16550A) adapter (:doc:`pci-serial`) +1b36:0003 + PCI Dual-port 16550A adapter (:doc:`pci-serial`) +1b36:0004 + PCI Quad-port 16550A adapter (:doc:`pci-serial`) +1b36:0005 + PCI test device (:doc:`pci-testdev`) +1b36:0006 + PCI Rocker Ethernet switch device +1b36:0007 + PCI SD Card Host Controller Interface (SDHCI) +1b36:0008 + PCIe host bridge +1b36:0009 + PCI Expander Bridge (-device pxb) +1b36:000a + PCI-PCI bridge (multiseat) +1b36:000b + PCIe Expander Bridge (-device pxb-pcie) +1b36:000d + PCI xhci usb host adapter +1b36:000f + mdpy (mdev sample device), ``linux/samples/vfio-mdev/mdpy.c`` +1b36:0010 + PCIe NVMe device (``-device nvme``) +1b36:0011 + PCI PVPanic device (``-device pvpanic-pci``) +1b36:0012 + PCI ACPI ERST device (``-device acpi-erst``) + +All these devices are documented in :doc:`index`. + +The 0100 device ID is used for the QXL video card device. 
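The pci-ids.rst text above fixes the modern virtio mapping at a constant 0x1040 offset into the 1af4 range. As a worked example (the macro and helper names here are hypothetical; the offset and range come from the document):

/* Modern virtio-pci device ID = virtio device ID + 0x1040,
 * yielding IDs in 1af4:1040..1af4:10ef per the allocation above. */
#include <stdint.h>

#define VIRTIO_PCI_MODERN_ID_OFFSET 0x1040  /* hypothetical name */

static inline uint16_t virtio_modern_pci_device_id(uint16_t virtio_id)
{
    return (uint16_t)(VIRTIO_PCI_MODERN_ID_OFFSET + virtio_id);
}

/* Example: virtio-net has virtio device ID 1, giving PCI ID 1af4:1041. */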
diff --git a/docs/specs/pci-ids.txt b/docs/specs/pci-ids.txt deleted file mode 100644 index e463c4cb3a..0000000000 --- a/docs/specs/pci-ids.txt +++ /dev/null @@ -1,70 +0,0 @@ - -PCI IDs for qemu -================ - -Red Hat, Inc. donates a part of its device ID range to qemu, to be used for -virtual devices. The vendor IDs are 1af4 (formerly Qumranet ID) and 1b36. - -Contact Gerd Hoffmann <kraxel@redhat.com> to get a device ID assigned -for your devices. - -1af4 vendor ID --------------- - -The 1000 -> 10ff device ID range is used as follows for virtio-pci devices. -Note that this allocation separate from the virtio device IDs, which are -maintained as part of the virtio specification. - -1af4:1000 network device (legacy) -1af4:1001 block device (legacy) -1af4:1002 balloon device (legacy) -1af4:1003 console device (legacy) -1af4:1004 SCSI host bus adapter device (legacy) -1af4:1005 entropy generator device (legacy) -1af4:1009 9p filesystem device (legacy) -1af4:1012 vsock device (bug compatibility) - -1af4:1040 Start of ID range for modern virtio devices. The PCI device - to ID is calculated from the virtio device ID by adding the -1af4:10ef 0x1040 offset. The virtio IDs are defined in the virtio - specification. The Linux kernel has a header file with - defines for all virtio IDs (linux/virtio_ids.h), qemu has a - copy in include/standard-headers/. - -1af4:10f0 Available for experimental usage without registration. Must get - to official ID when the code leaves the test lab (i.e. when seeking -1af4:10ff upstream merge or shipping a distro/product) to avoid conflicts. - -1af4:1100 Used as PCI Subsystem ID for existing hardware devices emulated - by qemu. - -1af4:1110 ivshmem device (shared memory, docs/specs/ivshmem-spec.txt) - -All other device IDs are reserved. - -1b36 vendor ID --------------- - -The 0000 -> 00ff device ID range is used as follows for QEMU-specific -PCI devices (other than virtio): - -1b36:0001 PCI-PCI bridge -1b36:0002 PCI serial port (16550A) adapter (docs/specs/pci-serial.txt) -1b36:0003 PCI Dual-port 16550A adapter (docs/specs/pci-serial.txt) -1b36:0004 PCI Quad-port 16550A adapter (docs/specs/pci-serial.txt) -1b36:0005 PCI test device (docs/specs/pci-testdev.txt) -1b36:0006 PCI Rocker Ethernet switch device -1b36:0007 PCI SD Card Host Controller Interface (SDHCI) -1b36:0008 PCIe host bridge -1b36:0009 PCI Expander Bridge (-device pxb) -1b36:000a PCI-PCI bridge (multiseat) -1b36:000b PCIe Expander Bridge (-device pxb-pcie) -1b36:000d PCI xhci usb host adapter -1b36:000f mdpy (mdev sample device), linux/samples/vfio-mdev/mdpy.c -1b36:0010 PCIe NVMe device (-device nvme) -1b36:0011 PCI PVPanic device (-device pvpanic-pci) -1b36:0012 PCI ACPI ERST device (-device acpi-erst) - -All these devices are documented in docs/specs. - -The 0100 device ID is used for the QXL video card device. diff --git a/docs/specs/pci-serial.rst b/docs/specs/pci-serial.rst new file mode 100644 index 0000000000..8d916a3669 --- /dev/null +++ b/docs/specs/pci-serial.rst @@ -0,0 +1,37 @@ +======================= +QEMU PCI serial devices +======================= + +QEMU implements some PCI serial devices which are simple PCI +wrappers around one or more 16550 UARTs. + +There is one single-port variant and two multiport-variants. Linux +guests work out-of-the box with all cards. There is a Windows inf file +(``docs/qemupciserial.inf``) to set up the cards in Windows guests. 
+ + +Single-port card +---------------- + +Name: + ``pci-serial`` +PCI ID: + 1b36:0002 +PCI Region 0: + IO bar, 8 bytes long, with the 16550 UART mapped to it. +Interrupt: + Wired to pin A. + + +Multiport cards +--------------- + +Name: + ``pci-serial-2x``, ``pci-serial-4x`` +PCI ID: + 1b36:0003 (``-2x``) and 1b36:0004 (``-4x``) +PCI Region 0: + IO bar, with two or four 16550 UARTs mapped after each other. + The first is at offset 0, the second at offset 8, and so on. +Interrupt: + Wired to pin A. diff --git a/docs/specs/pci-serial.txt b/docs/specs/pci-serial.txt deleted file mode 100644 index 66c761f2b4..0000000000 --- a/docs/specs/pci-serial.txt +++ /dev/null @@ -1,34 +0,0 @@ - -QEMU pci serial devices -======================= - -There is one single-port variant and two muliport-variants. Linux -guests out-of-the box with all cards. There is a Windows inf file -(docs/qemupciserial.inf) to setup the single-port card in Windows -guests. - - -single-port card ----------------- - -Name: pci-serial -PCI ID: 1b36:0002 - -PCI Region 0: - IO bar, 8 bytes long, with the 16550 uart mapped to it. - Interrupt is wired to pin A. - - -multiport cards ---------------- - -Name: pci-serial-2x -PCI ID: 1b36:0003 - -Name: pci-serial-4x -PCI ID: 1b36:0004 - -PCI Region 0: - IO bar, with two/four 16550 uart mapped after each other. - The first is at offset 0, second at offset 8, ... - Interrupt is wired to pin A. diff --git a/docs/specs/pci-testdev.rst b/docs/specs/pci-testdev.rst new file mode 100644 index 0000000000..4b6d36543b --- /dev/null +++ b/docs/specs/pci-testdev.rst @@ -0,0 +1,39 @@ +==================== +QEMU PCI test device +==================== + +``pci-testdev`` is a device used for testing low level IO. + +The device implements up to three BARs: BAR0, BAR1 and BAR2. +Each of BAR 0+1 can be memory or IO. Guests must detect +BAR types and act accordingly. + +BAR 0+1 size is up to 4K bytes each. +BAR 0+1 starts with the following header: + +.. code-block:: c + + typedef struct PCITestDevHdr { + uint8_t test; /* write-only, starts a given test number */ + uint8_t width_type; /* + * read-only, type and width of access for a given test. + * 1,2,4 for byte,word or long write. + * any other value if test not supported on this BAR + */ + uint8_t pad0[2]; + uint32_t offset; /* read-only, offset in this BAR for a given test */ + uint32_t data; /* read-only, data to use for a given test */ + uint32_t count; /* for debugging. number of writes detected. */ + uint8_t name[]; /* for debugging. 0-terminated ASCII string. */ + } PCITestDevHdr; + +All registers are little endian. + +The device is expected to always implement tests 0 to N on each BAR, and to add new +tests with higher numbers. In this way a guest can scan test numbers until it +detects an access type that it does not support on this BAR, then stop. + +BAR2 is a 64bit memory BAR, without backing storage. It is disabled +by default and can be enabled using the ``membar=<size>`` property. This +can be used to test whether guests handle PCI BARs of a specific +(possibly quite large) size correctly. diff --git a/docs/specs/pci-testdev.txt b/docs/specs/pci-testdev.txt deleted file mode 100644 index 4280a1e73c..0000000000 --- a/docs/specs/pci-testdev.txt +++ /dev/null @@ -1,31 +0,0 @@ -pci-test is a device used for testing low level IO - -device implements up to three BARs: BAR0, BAR1 and BAR2. -Each of BAR 0+1 can be memory or IO. Guests must detect -BAR types and act accordingly. - -BAR 0+1 size is up to 4K bytes each. 
-BAR 0+1 starts with the following header: - -typedef struct PCITestDevHdr { - uint8_t test; <- write-only, starts a given test number - uint8_t width_type; <- read-only, type and width of access for a given test. - 1,2,4 for byte,word or long write. - any other value if test not supported on this BAR - uint8_t pad0[2]; - uint32_t offset; <- read-only, offset in this BAR for a given test - uint32_t data; <- read-only, data to use for a given test - uint32_t count; <- for debugging. number of writes detected. - uint8_t name[]; <- for debugging. 0-terminated ASCII string. -} PCITestDevHdr; - -All registers are little endian. - -device is expected to always implement tests 0 to N on each BAR, and to add new -tests with higher numbers. In this way a guest can scan test numbers until it -detects an access type that it does not support on this BAR, then stop. - -BAR2 is a 64bit memory bar, without backing storage. It is disabled -by default and can be enabled using the membar=<size> property. This -can be used to test whether guests handle pci bars of a specific -(possibly quite large) size correctly. diff --git a/hw/9pfs/9p.h b/hw/9pfs/9p.h index 2fce4140d1..1b0d805b9c 100644 --- a/hw/9pfs/9p.h +++ b/hw/9pfs/9p.h @@ -203,7 +203,7 @@ typedef struct V9fsDir { QemuMutex readdir_mutex_L; } V9fsDir; -static inline void v9fs_readdir_lock(V9fsDir *dir) +static inline void coroutine_fn v9fs_readdir_lock(V9fsDir *dir) { if (dir->proto_version == V9FS_PROTO_2000U) { qemu_co_mutex_lock(&dir->readdir_mutex_u); @@ -212,7 +212,7 @@ static inline void v9fs_readdir_lock(V9fsDir *dir) } } -static inline void v9fs_readdir_unlock(V9fsDir *dir) +static inline void coroutine_fn v9fs_readdir_unlock(V9fsDir *dir) { if (dir->proto_version == V9FS_PROTO_2000U) { qemu_co_mutex_unlock(&dir->readdir_mutex_u); diff --git a/hw/9pfs/codir.c b/hw/9pfs/codir.c index 7ba63be489..2068a4779d 100644 --- a/hw/9pfs/codir.c +++ b/hw/9pfs/codir.c @@ -68,9 +68,9 @@ int coroutine_fn v9fs_co_readdir(V9fsPDU *pdu, V9fsFidState *fidp, * * See v9fs_co_readdir_many() (as its only user) below for details. 
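 *
 * Note: this helper serializes directory reads with the fid's readdir
 * CoMutex (v9fs_readdir_lock(), marked coroutine_fn above), so it must
 * itself run in coroutine context; the coroutine_fn annotation added
 * below makes that explicit.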
 */
-static int do_readdir_many(V9fsPDU *pdu, V9fsFidState *fidp,
-                           struct V9fsDirEnt **entries, off_t offset,
-                           int32_t maxsize, bool dostat)
+static int coroutine_fn
+do_readdir_many(V9fsPDU *pdu, V9fsFidState *fidp, struct V9fsDirEnt **entries,
+                off_t offset, int32_t maxsize, bool dostat)
 {
     V9fsState *s = pdu->s;
     V9fsString name;
diff --git a/hw/9pfs/coth.c b/hw/9pfs/coth.c
index 2802d41cce..598f46add9 100644
--- a/hw/9pfs/coth.c
+++ b/hw/9pfs/coth.c
@@ -41,6 +41,5 @@ static int coroutine_enter_func(void *arg)
 void co_run_in_worker_bh(void *opaque)
 {
     Coroutine *co = opaque;
-    thread_pool_submit_aio(aio_get_thread_pool(qemu_get_aio_context()),
-                           coroutine_enter_func, co, coroutine_enter_cb, co);
+    thread_pool_submit_aio(coroutine_enter_func, co, coroutine_enter_cb, co);
 }
diff --git a/hw/acpi/cxl.c b/hw/acpi/cxl.c
index 2bf8c07993..92b46bc932 100644
--- a/hw/acpi/cxl.c
+++ b/hw/acpi/cxl.c
@@ -30,9 +30,10 @@
 #include "qapi/error.h"
 #include "qemu/uuid.h"
 
-static void cedt_build_chbs(GArray *table_data, PXBDev *cxl)
+static void cedt_build_chbs(GArray *table_data, PXBCXLDev *cxl)
 {
-    SysBusDevice *sbd = SYS_BUS_DEVICE(cxl->cxl.cxl_host_bridge);
+    PXBDev *pxb = PXB_DEV(cxl);
+    SysBusDevice *sbd = SYS_BUS_DEVICE(cxl->cxl_host_bridge);
     struct MemoryRegion *mr = sbd->mmio[0].memory;
 
     /* Type */
@@ -45,7 +46,7 @@ static void cedt_build_chbs(GArray *table_data, PXBDev *cxl)
     build_append_int_noprefix(table_data, 32, 2);
 
     /* UID - currently equal to bus number */
-    build_append_int_noprefix(table_data, cxl->bus_nr, 4);
+    build_append_int_noprefix(table_data, pxb->bus_nr, 4);
 
     /* Version */
     build_append_int_noprefix(table_data, 1, 4);
@@ -112,7 +113,7 @@ static void cedt_build_cfmws(GArray *table_data, CXLState *cxls)
         /* Host Bridge List (list of UIDs - currently bus_nr) */
         for (i = 0; i < fw->num_targets; i++) {
             g_assert(fw->target_hbs[i]);
-            build_append_int_noprefix(table_data, fw->target_hbs[i]->bus_nr, 4);
+            build_append_int_noprefix(table_data, PXB_DEV(fw->target_hbs[i])->bus_nr, 4);
         }
     }
 }
@@ -121,7 +122,7 @@ static int cxl_foreach_pxb_hb(Object *obj, void *opaque)
 {
     Aml *cedt = opaque;
 
-    if (object_dynamic_cast(obj, TYPE_PXB_CXL_DEVICE)) {
+    if (object_dynamic_cast(obj, TYPE_PXB_CXL_DEV)) {
         cedt_build_chbs(cedt->buf, PXB_CXL_DEV(obj));
     }
diff --git a/hw/acpi/pcihp.c b/hw/acpi/pcihp.c
index dcfb779a7a..cdd6f775a1 100644
--- a/hw/acpi/pcihp.c
+++ b/hw/acpi/pcihp.c
@@ -357,6 +357,16 @@ void acpi_pcihp_device_unplug_request_cb(HotplugHandler *hotplug_dev,
      * acpi_pcihp_eject_slot() when the operation is completed.
      */
     pdev->qdev.pending_deleted_event = true;
+    /* If unplug was requested before the OSPM is initialized, the
+     * Linux kernel will clear the GPE0.sts[] bits during boot, which
+     * effectively hides the unplug event. Any follow-up qmp_device_del()
+     * calls then remain blocked by the above flag permanently.
+     * Unblock qmp_device_del() by setting an expiry limit, so the user can
+     * repeat the unplug request later once the OSPM has booted.
+     */
+    pdev->qdev.pending_deleted_expires_ms =
+        qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL); /* 1 msec */
+
     s->acpi_pcihp_pci_status[bsel].down |= (1U << slot);
     acpi_send_event(DEVICE(hotplug_dev), ACPI_PCI_HOTPLUG_STATUS);
 }
diff --git a/hw/char/serial-pci-multi.c b/hw/char/serial-pci-multi.c
index f18b8dcce5..5d65c534cb 100644
--- a/hw/char/serial-pci-multi.c
+++ b/hw/char/serial-pci-multi.c
@@ -25,7 +25,7 @@
  * THE SOFTWARE.
*/ -/* see docs/specs/pci-serial.txt */ +/* see docs/specs/pci-serial.rst */ #include "qemu/osdep.h" #include "qapi/error.h" diff --git a/hw/char/serial-pci.c b/hw/char/serial-pci.c index 801b769aba..087da3059a 100644 --- a/hw/char/serial-pci.c +++ b/hw/char/serial-pci.c @@ -23,7 +23,7 @@ * THE SOFTWARE. */ -/* see docs/specs/pci-serial.txt */ +/* see docs/specs/pci-serial.rst */ #include "qemu/osdep.h" #include "qapi/error.h" diff --git a/hw/cxl/cxl-host.c b/hw/cxl/cxl-host.c index 6e923ceeaf..034c7805b3 100644 --- a/hw/cxl/cxl-host.c +++ b/hw/cxl/cxl-host.c @@ -84,7 +84,7 @@ void cxl_fmws_link_targets(CXLState *cxl_state, Error **errp) bool ambig; o = object_resolve_path_type(fw->targets[i], - TYPE_PXB_CXL_DEVICE, + TYPE_PXB_CXL_DEV, &ambig); if (!o) { error_setg(errp, "Could not resolve CXLFM target %s", @@ -141,7 +141,7 @@ static PCIDevice *cxl_cfmws_find_device(CXLFixedWindow *fw, hwaddr addr) addr += fw->base; rb_index = (addr / cxl_decode_ig(fw->enc_int_gran)) % fw->num_targets; - hb = PCI_HOST_BRIDGE(fw->target_hbs[rb_index]->cxl.cxl_host_bridge); + hb = PCI_HOST_BRIDGE(fw->target_hbs[rb_index]->cxl_host_bridge); if (!hb || !hb->bus || !pci_bus_is_cxl(hb->bus)) { return NULL; } diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c index ec857a117e..512162003b 100644 --- a/hw/i386/acpi-build.c +++ b/hw/i386/acpi-build.c @@ -2395,9 +2395,11 @@ build_amd_iommu(GArray *table_data, BIOSLinker *linker, const char *oem_id, /* IVHD length */ build_append_int_noprefix(table_data, ivhd_table_len, 2); /* DeviceID */ - build_append_int_noprefix(table_data, s->devid, 2); + build_append_int_noprefix(table_data, + object_property_get_int(OBJECT(&s->pci), "addr", + &error_abort), 2); /* Capability offset */ - build_append_int_noprefix(table_data, s->capab_offset, 2); + build_append_int_noprefix(table_data, s->pci.capab_offset, 2); /* IOMMU base address */ build_append_int_noprefix(table_data, s->mmio.addr, 8); /* PCI Segment Group */ @@ -2695,7 +2697,8 @@ void acpi_build(AcpiBuildTables *tables, MachineState *machine) int legacy_table_size = ROUND_UP(tables_blob->len - aml_len + legacy_aml_len, ACPI_BUILD_ALIGN_SIZE); - if (tables_blob->len > legacy_table_size) { + if ((tables_blob->len > legacy_table_size) && + !pcmc->resizable_acpi_blob) { /* Should happen only with PCI bridges and -M pc-i440fx-2.0. */ warn_report("ACPI table size %u exceeds %d bytes," " migration may not work", @@ -2706,7 +2709,8 @@ void acpi_build(AcpiBuildTables *tables, MachineState *machine) g_array_set_size(tables_blob, legacy_table_size); } else { /* Make sure we have a buffer in case we need to resize the tables. */ - if (tables_blob->len > ACPI_BUILD_TABLE_SIZE / 2) { + if ((tables_blob->len > ACPI_BUILD_TABLE_SIZE / 2) && + !pcmc->resizable_acpi_blob) { /* As of QEMU 2.1, this fires with 160 VCPUs and 255 memory slots. 
*/ warn_report("ACPI table size %u exceeds %d bytes," " migration may not work", diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c index bcd016f5c5..9c77304438 100644 --- a/hw/i386/amd_iommu.c +++ b/hw/i386/amd_iommu.c @@ -1509,23 +1509,48 @@ static void amdvi_init(AMDVIState *s) amdvi_set_quad(s, AMDVI_MMIO_EXT_FEATURES, AMDVI_EXT_FEATURES, 0xffffffffffffffef, 0); amdvi_set_quad(s, AMDVI_MMIO_STATUS, 0, 0x98, 0x67); +} + +static void amdvi_pci_realize(PCIDevice *pdev, Error **errp) +{ + AMDVIPCIState *s = AMD_IOMMU_PCI(pdev); + int ret; + + ret = pci_add_capability(pdev, AMDVI_CAPAB_ID_SEC, 0, + AMDVI_CAPAB_SIZE, errp); + if (ret < 0) { + return; + } + s->capab_offset = ret; + + ret = pci_add_capability(pdev, PCI_CAP_ID_MSI, 0, + AMDVI_CAPAB_REG_SIZE, errp); + if (ret < 0) { + return; + } + ret = pci_add_capability(pdev, PCI_CAP_ID_HT, 0, + AMDVI_CAPAB_REG_SIZE, errp); + if (ret < 0) { + return; + } + + if (msi_init(pdev, 0, 1, true, false, errp) < 0) { + return; + } /* reset device ident */ - pci_config_set_vendor_id(s->pci.dev.config, PCI_VENDOR_ID_AMD); - pci_config_set_prog_interface(s->pci.dev.config, 00); - pci_config_set_device_id(s->pci.dev.config, s->devid); - pci_config_set_class(s->pci.dev.config, 0x0806); + pci_config_set_prog_interface(pdev->config, 0); /* reset AMDVI specific capabilities, all r/o */ - pci_set_long(s->pci.dev.config + s->capab_offset, AMDVI_CAPAB_FEATURES); - pci_set_long(s->pci.dev.config + s->capab_offset + AMDVI_CAPAB_BAR_LOW, - s->mmio.addr & ~(0xffff0000)); - pci_set_long(s->pci.dev.config + s->capab_offset + AMDVI_CAPAB_BAR_HIGH, - (s->mmio.addr & ~(0xffff)) >> 16); - pci_set_long(s->pci.dev.config + s->capab_offset + AMDVI_CAPAB_RANGE, + pci_set_long(pdev->config + s->capab_offset, AMDVI_CAPAB_FEATURES); + pci_set_long(pdev->config + s->capab_offset + AMDVI_CAPAB_BAR_LOW, + AMDVI_BASE_ADDR & ~(0xffff0000)); + pci_set_long(pdev->config + s->capab_offset + AMDVI_CAPAB_BAR_HIGH, + (AMDVI_BASE_ADDR & ~(0xffff)) >> 16); + pci_set_long(pdev->config + s->capab_offset + AMDVI_CAPAB_RANGE, 0xff000000); - pci_set_long(s->pci.dev.config + s->capab_offset + AMDVI_CAPAB_MISC, 0); - pci_set_long(s->pci.dev.config + s->capab_offset + AMDVI_CAPAB_MISC, + pci_set_long(pdev->config + s->capab_offset + AMDVI_CAPAB_MISC, 0); + pci_set_long(pdev->config + s->capab_offset + AMDVI_CAPAB_MISC, AMDVI_MAX_PH_ADDR | AMDVI_MAX_GVA_ADDR | AMDVI_MAX_VA_ADDR); } @@ -1539,7 +1564,6 @@ static void amdvi_sysbus_reset(DeviceState *dev) static void amdvi_sysbus_realize(DeviceState *dev, Error **errp) { - int ret = 0; AMDVIState *s = AMD_IOMMU_DEVICE(dev); MachineState *ms = MACHINE(qdev_get_machine()); PCMachineState *pcms = PC_MACHINE(ms); @@ -1553,23 +1577,6 @@ static void amdvi_sysbus_realize(DeviceState *dev, Error **errp) if (!qdev_realize(DEVICE(&s->pci), &bus->qbus, errp)) { return; } - ret = pci_add_capability(&s->pci.dev, AMDVI_CAPAB_ID_SEC, 0, - AMDVI_CAPAB_SIZE, errp); - if (ret < 0) { - return; - } - s->capab_offset = ret; - - ret = pci_add_capability(&s->pci.dev, PCI_CAP_ID_MSI, 0, - AMDVI_CAPAB_REG_SIZE, errp); - if (ret < 0) { - return; - } - ret = pci_add_capability(&s->pci.dev, PCI_CAP_ID_HT, 0, - AMDVI_CAPAB_REG_SIZE, errp); - if (ret < 0) { - return; - } /* Pseudo address space under root PCI bus. 
*/ x86ms->ioapic_as = amdvi_host_dma_iommu(bus, s, AMDVI_IOAPIC_SB_DEVID); @@ -1581,8 +1588,6 @@ static void amdvi_sysbus_realize(DeviceState *dev, Error **errp) sysbus_init_mmio(SYS_BUS_DEVICE(s), &s->mmio); sysbus_mmio_map(SYS_BUS_DEVICE(s), 0, AMDVI_BASE_ADDR); pci_setup_iommu(bus, amdvi_host_dma_iommu, s); - s->devid = object_property_get_int(OBJECT(&s->pci), "addr", &error_abort); - msi_init(&s->pci.dev, 0, 1, true, false, errp); amdvi_init(s); } @@ -1625,6 +1630,11 @@ static const TypeInfo amdvi_sysbus = { static void amdvi_pci_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); + PCIDeviceClass *k = PCI_DEVICE_CLASS(klass); + + k->vendor_id = PCI_VENDOR_ID_AMD; + k->class_id = 0x0806; + k->realize = amdvi_pci_realize; set_bit(DEVICE_CATEGORY_MISC, dc->categories); dc->desc = "AMD IOMMU (AMD-Vi) DMA Remapping device"; diff --git a/hw/i386/amd_iommu.h b/hw/i386/amd_iommu.h index 79d38a3e41..6da893ee57 100644 --- a/hw/i386/amd_iommu.h +++ b/hw/i386/amd_iommu.h @@ -300,27 +300,26 @@ struct irte_ga { OBJECT_DECLARE_SIMPLE_TYPE(AMDVIState, AMD_IOMMU_DEVICE) #define TYPE_AMD_IOMMU_PCI "AMDVI-PCI" +OBJECT_DECLARE_SIMPLE_TYPE(AMDVIPCIState, AMD_IOMMU_PCI) #define TYPE_AMD_IOMMU_MEMORY_REGION "amd-iommu-iommu-memory-region" typedef struct AMDVIAddressSpace AMDVIAddressSpace; /* functions to steal PCI config space */ -typedef struct AMDVIPCIState { +struct AMDVIPCIState { PCIDevice dev; /* The PCI device itself */ -} AMDVIPCIState; + uint32_t capab_offset; /* capability offset pointer */ +}; struct AMDVIState { X86IOMMUState iommu; /* IOMMU bus device */ AMDVIPCIState pci; /* IOMMU PCI device */ uint32_t version; - uint32_t capab_offset; /* capability offset pointer */ uint64_t mmio_addr; - uint32_t devid; /* auto-assigned devid */ - bool enabled; /* IOMMU enabled */ bool ats_enabled; /* address translation enabled */ bool cmdbuf_enabled; /* command buffer enabled */ diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index a62896759c..94d52f4205 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -64,8 +64,8 @@ struct vtd_as_key { struct vtd_iotlb_key { uint64_t gfn; uint32_t pasid; - uint32_t level; uint16_t sid; + uint8_t level; }; static void vtd_address_space_refresh_all(IntelIOMMUState *s); @@ -221,10 +221,11 @@ static gboolean vtd_iotlb_equal(gconstpointer v1, gconstpointer v2) static guint vtd_iotlb_hash(gconstpointer v) { const struct vtd_iotlb_key *key = v; + uint64_t hash64 = key->gfn | ((uint64_t)(key->sid) << VTD_IOTLB_SID_SHIFT) | + (uint64_t)(key->level - 1) << VTD_IOTLB_LVL_SHIFT | + (uint64_t)(key->pasid) << VTD_IOTLB_PASID_SHIFT; - return key->gfn | ((key->sid) << VTD_IOTLB_SID_SHIFT) | - (key->level) << VTD_IOTLB_LVL_SHIFT | - (key->pasid) << VTD_IOTLB_PASID_SHIFT; + return (guint)((hash64 >> 32) ^ (hash64 & 0xffffffffU)); } static gboolean vtd_as_equal(gconstpointer v1, gconstpointer v2) diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h index f090e61e11..2e61eec2f5 100644 --- a/hw/i386/intel_iommu_internal.h +++ b/hw/i386/intel_iommu_internal.h @@ -114,9 +114,9 @@ VTD_INTERRUPT_ADDR_FIRST + 1) /* The shift of source_id in the key of IOTLB hash table */ -#define VTD_IOTLB_SID_SHIFT 20 -#define VTD_IOTLB_LVL_SHIFT 28 -#define VTD_IOTLB_PASID_SHIFT 30 +#define VTD_IOTLB_SID_SHIFT 26 +#define VTD_IOTLB_LVL_SHIFT 42 +#define VTD_IOTLB_PASID_SHIFT 44 #define VTD_IOTLB_MAX_SIZE 1024 /* Max size of the hash table */ /* IOTLB_REG */ diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 615e1d3d06..d761c8c775 
100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -1946,6 +1946,7 @@ static void pc_machine_class_init(ObjectClass *oc, void *data) pcmc->acpi_data_size = 0x20000 + 0x8000; pcmc->pvh_enabled = true; pcmc->kvmclock_create_always = true; + pcmc->resizable_acpi_blob = true; assert(!mc->get_hotplug_handler); mc->get_hotplug_handler = pc_get_hotplug_handler; mc->hotplug_allowed = pc_hotplug_allowed; diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index 21591dad8d..66a849d279 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -756,6 +756,7 @@ static void pc_i440fx_2_2_machine_options(MachineClass *m) compat_props_add(m->compat_props, hw_compat_2_2, hw_compat_2_2_len); compat_props_add(m->compat_props, pc_compat_2_2, pc_compat_2_2_len); pcmc->rsdp_in_ram = false; + pcmc->resizable_acpi_blob = false; } DEFINE_I440FX_MACHINE(v2_2, "pc-i440fx-2.2", pc_compat_2_2_fn, diff --git a/hw/mem/pc-dimm.c b/hw/mem/pc-dimm.c index 50ef83215c..37f1f4ccfd 100644 --- a/hw/mem/pc-dimm.c +++ b/hw/mem/pc-dimm.c @@ -81,6 +81,10 @@ void pc_dimm_plug(PCDIMMDevice *dimm, MachineState *machine) memory_device_plug(MEMORY_DEVICE(dimm), machine); vmstate_register_ram(vmstate_mr, DEVICE(dimm)); + /* count only "real" DIMMs, not NVDIMMs */ + if (!object_dynamic_cast(OBJECT(dimm), TYPE_NVDIMM)) { + machine->device_memory->dimm_size += memory_region_size(vmstate_mr); + } } void pc_dimm_unplug(PCDIMMDevice *dimm, MachineState *machine) @@ -90,6 +94,9 @@ void pc_dimm_unplug(PCDIMMDevice *dimm, MachineState *machine) memory_device_unplug(MEMORY_DEVICE(dimm), machine); vmstate_unregister_ram(vmstate_mr, DEVICE(dimm)); + if (!object_dynamic_cast(OBJECT(dimm), TYPE_NVDIMM)) { + machine->device_memory->dimm_size -= memory_region_size(vmstate_mr); + } } static int pc_dimm_slot2bitmap(Object *obj, void *opaque) diff --git a/hw/pci-bridge/pci_expander_bridge.c b/hw/pci-bridge/pci_expander_bridge.c index ead33f0c05..613857b601 100644 --- a/hw/pci-bridge/pci_expander_bridge.c +++ b/hw/pci-bridge/pci_expander_bridge.c @@ -50,24 +50,8 @@ struct PXBBus { char bus_path[8]; }; -#define TYPE_PXB_DEVICE "pxb" -DECLARE_INSTANCE_CHECKER(PXBDev, PXB_DEV, - TYPE_PXB_DEVICE) - -#define TYPE_PXB_PCIE_DEVICE "pxb-pcie" -DECLARE_INSTANCE_CHECKER(PXBDev, PXB_PCIE_DEV, - TYPE_PXB_PCIE_DEVICE) - -static PXBDev *convert_to_pxb(PCIDevice *dev) -{ - /* A CXL PXB's parent bus is PCIe, so the normal check won't work */ - if (object_dynamic_cast(OBJECT(dev), TYPE_PXB_CXL_DEVICE)) { - return PXB_CXL_DEV(dev); - } - - return pci_bus_is_express(pci_get_bus(dev)) - ? 
PXB_PCIE_DEV(dev) : PXB_DEV(dev); -} +#define TYPE_PXB_PCIE_DEV "pxb-pcie" +OBJECT_DECLARE_SIMPLE_TYPE(PXBPCIEDev, PXB_PCIE_DEV) static GList *pxb_dev_list; @@ -89,14 +73,14 @@ bool cxl_get_hb_passthrough(PCIHostState *hb) static int pxb_bus_num(PCIBus *bus) { - PXBDev *pxb = convert_to_pxb(bus->parent_dev); + PXBDev *pxb = PXB_DEV(bus->parent_dev); return pxb->bus_nr; } static uint16_t pxb_bus_numa_node(PCIBus *bus) { - PXBDev *pxb = convert_to_pxb(bus->parent_dev); + PXBDev *pxb = PXB_DEV(bus->parent_dev); return pxb->numa_node; } @@ -154,7 +138,7 @@ static char *pxb_host_ofw_unit_address(const SysBusDevice *dev) pxb_host = PCI_HOST_BRIDGE(dev); pxb_bus = pxb_host->bus; - pxb_dev = convert_to_pxb(pxb_bus->parent_dev); + pxb_dev = PXB_DEV(pxb_bus->parent_dev); position = g_list_index(pxb_dev_list, pxb_dev); assert(position >= 0); @@ -212,8 +196,8 @@ static void pxb_cxl_realize(DeviceState *dev, Error **errp) */ void pxb_cxl_hook_up_registers(CXLState *cxl_state, PCIBus *bus, Error **errp) { - PXBDev *pxb = PXB_CXL_DEV(pci_bridge_get_device(bus)); - CXLHost *cxl = pxb->cxl.cxl_host_bridge; + PXBCXLDev *pxb = PXB_CXL_DEV(pci_bridge_get_device(bus)); + CXLHost *cxl = pxb->cxl_host_bridge; CXLComponentState *cxl_cstate = &cxl->cxl_cstate; struct MemoryRegion *mr = &cxl_cstate->crb.component_registers; hwaddr offset; @@ -299,7 +283,7 @@ static int pxb_map_irq_fn(PCIDevice *pci_dev, int pin) static void pxb_cxl_dev_reset(DeviceState *dev) { - CXLHost *cxl = PXB_CXL_DEV(dev)->cxl.cxl_host_bridge; + CXLHost *cxl = PXB_CXL_DEV(dev)->cxl_host_bridge; CXLComponentState *cxl_cstate = &cxl->cxl_cstate; PCIHostState *hb = PCI_HOST_BRIDGE(cxl); uint32_t *reg_state = cxl_cstate->crb.cache_mem_registers; @@ -311,7 +295,7 @@ static void pxb_cxl_dev_reset(DeviceState *dev) * The CXL specification allows for host bridges with no HDM decoders * if they only have a single root port. 
*/ - if (!PXB_DEV(dev)->hdm_for_passthrough) { + if (!PXB_CXL_DEV(dev)->hdm_for_passthrough) { dsp_count = pcie_count_ds_ports(hb->bus); } /* Initial reset will have 0 dsp so wait until > 0 */ @@ -337,7 +321,7 @@ static gint pxb_compare(gconstpointer a, gconstpointer b) static void pxb_dev_realize_common(PCIDevice *dev, enum BusType type, Error **errp) { - PXBDev *pxb = convert_to_pxb(dev); + PXBDev *pxb = PXB_DEV(dev); DeviceState *ds, *bds = NULL; PCIBus *bus; const char *dev_name = NULL; @@ -365,7 +349,7 @@ static void pxb_dev_realize_common(PCIDevice *dev, enum BusType type, } else if (type == CXL) { bus = pci_root_bus_new(ds, dev_name, NULL, NULL, 0, TYPE_PXB_CXL_BUS); bus->flags |= PCI_BUS_CXL; - PXB_CXL_DEV(dev)->cxl.cxl_host_bridge = PXB_CXL_HOST(ds); + PXB_CXL_DEV(dev)->cxl_host_bridge = PXB_CXL_HOST(ds); } else { bus = pci_root_bus_new(ds, "pxb-internal", NULL, NULL, 0, TYPE_PXB_BUS); bds = qdev_new("pci-bridge"); @@ -418,7 +402,7 @@ static void pxb_dev_realize(PCIDevice *dev, Error **errp) static void pxb_dev_exitfn(PCIDevice *pci_dev) { - PXBDev *pxb = convert_to_pxb(pci_dev); + PXBDev *pxb = PXB_DEV(pci_dev); pxb_dev_list = g_list_remove(pxb_dev_list, pxb); } @@ -449,7 +433,7 @@ static void pxb_dev_class_init(ObjectClass *klass, void *data) } static const TypeInfo pxb_dev_info = { - .name = TYPE_PXB_DEVICE, + .name = TYPE_PXB_DEV, .parent = TYPE_PCI_DEVICE, .instance_size = sizeof(PXBDev), .class_init = pxb_dev_class_init, @@ -481,15 +465,14 @@ static void pxb_pcie_dev_class_init(ObjectClass *klass, void *data) k->class_id = PCI_CLASS_BRIDGE_HOST; dc->desc = "PCI Express Expander Bridge"; - device_class_set_props(dc, pxb_dev_properties); dc->hotpluggable = false; set_bit(DEVICE_CATEGORY_BRIDGE, dc->categories); } static const TypeInfo pxb_pcie_dev_info = { - .name = TYPE_PXB_PCIE_DEVICE, - .parent = TYPE_PCI_DEVICE, - .instance_size = sizeof(PXBDev), + .name = TYPE_PXB_PCIE_DEV, + .parent = TYPE_PXB_DEV, + .instance_size = sizeof(PXBPCIEDev), .class_init = pxb_pcie_dev_class_init, .interfaces = (InterfaceInfo[]) { { INTERFACE_CONVENTIONAL_PCI_DEVICE }, @@ -510,11 +493,7 @@ static void pxb_cxl_dev_realize(PCIDevice *dev, Error **errp) } static Property pxb_cxl_dev_properties[] = { - /* Note: 0 is not a legal PXB bus number. 
*/ - DEFINE_PROP_UINT8("bus_nr", PXBDev, bus_nr, 0), - DEFINE_PROP_UINT16("numa_node", PXBDev, numa_node, NUMA_NODE_UNASSIGNED), - DEFINE_PROP_BOOL("bypass_iommu", PXBDev, bypass_iommu, false), - DEFINE_PROP_BOOL("hdm_for_passthrough", PXBDev, hdm_for_passthrough, false), + DEFINE_PROP_BOOL("hdm_for_passthrough", PXBCXLDev, hdm_for_passthrough, false), DEFINE_PROP_END_OF_LIST(), }; @@ -540,9 +519,9 @@ static void pxb_cxl_dev_class_init(ObjectClass *klass, void *data) } static const TypeInfo pxb_cxl_dev_info = { - .name = TYPE_PXB_CXL_DEVICE, - .parent = TYPE_PCI_DEVICE, - .instance_size = sizeof(PXBDev), + .name = TYPE_PXB_CXL_DEV, + .parent = TYPE_PXB_PCIE_DEV, + .instance_size = sizeof(PXBCXLDev), .class_init = pxb_cxl_dev_class_init, .interfaces = (InterfaceInfo[]){ diff --git a/hw/pci/pci.c b/hw/pci/pci.c index def5000e7b..8a87ccc8b0 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -1116,6 +1116,21 @@ static bool pci_bus_devfn_reserved(PCIBus *bus, int devfn) return bus->slot_reserved_mask & (1UL << PCI_SLOT(devfn)); } +uint32_t pci_bus_get_slot_reserved_mask(PCIBus *bus) +{ + return bus->slot_reserved_mask; +} + +void pci_bus_set_slot_reserved_mask(PCIBus *bus, uint32_t mask) +{ + bus->slot_reserved_mask |= mask; +} + +void pci_bus_clear_slot_reserved_mask(PCIBus *bus, uint32_t mask) +{ + bus->slot_reserved_mask &= ~mask; +} + /* -1 for devfn means auto assign */ static PCIDevice *do_pci_register_device(PCIDevice *pci_dev, const char *name, int devfn, diff --git a/hw/ppc/spapr_nvdimm.c b/hw/ppc/spapr_nvdimm.c index 04a64cada3..a8688243a6 100644 --- a/hw/ppc/spapr_nvdimm.c +++ b/hw/ppc/spapr_nvdimm.c @@ -496,7 +496,6 @@ static int spapr_nvdimm_flush_post_load(void *opaque, int version_id) { SpaprNVDIMMDevice *s_nvdimm = (SpaprNVDIMMDevice *)opaque; SpaprNVDIMMDeviceFlushState *state; - ThreadPool *pool = aio_get_thread_pool(qemu_get_aio_context()); HostMemoryBackend *backend = MEMORY_BACKEND(PC_DIMM(s_nvdimm)->hostmem); bool is_pmem = object_property_get_bool(OBJECT(backend), "pmem", NULL); bool pmem_override = object_property_get_bool(OBJECT(s_nvdimm), @@ -517,7 +516,7 @@ static int spapr_nvdimm_flush_post_load(void *opaque, int version_id) } QLIST_FOREACH(state, &s_nvdimm->pending_nvdimm_flush_states, node) { - thread_pool_submit_aio(pool, flush_worker_cb, state, + thread_pool_submit_aio(flush_worker_cb, state, spapr_nvdimm_flush_completion_cb, state); } @@ -664,7 +663,6 @@ static target_ulong h_scm_flush(PowerPCCPU *cpu, SpaprMachineState *spapr, PCDIMMDevice *dimm; HostMemoryBackend *backend = NULL; SpaprNVDIMMDeviceFlushState *state; - ThreadPool *pool = aio_get_thread_pool(qemu_get_aio_context()); int fd; if (!drc || !drc->dev || @@ -699,7 +697,7 @@ static target_ulong h_scm_flush(PowerPCCPU *cpu, SpaprMachineState *spapr, state->drcidx = drc_index; - thread_pool_submit_aio(pool, flush_worker_cb, state, + thread_pool_submit_aio(flush_worker_cb, state, spapr_nvdimm_flush_completion_cb, state); continue_token = state->continue_token; diff --git a/hw/s390x/virtio-ccw.c b/hw/s390x/virtio-ccw.c index e33e5207ab..f44de1a8c1 100644 --- a/hw/s390x/virtio-ccw.c +++ b/hw/s390x/virtio-ccw.c @@ -237,6 +237,7 @@ static int virtio_ccw_set_vqs(SubchDev *sch, VqInfoBlock *info, return -EINVAL; } virtio_queue_set_num(vdev, index, num); + virtio_init_region_cache(vdev, index); } else if (virtio_queue_get_num(vdev, index) > num) { /* Fail if we don't have a big enough queue. 
 */
         return -EINVAL;
diff --git a/hw/sparc64/sun4u.c b/hw/sparc64/sun4u.c
index a25e951f9d..eae7589462 100644
--- a/hw/sparc64/sun4u.c
+++ b/hw/sparc64/sun4u.c
@@ -31,7 +31,6 @@
 #include "hw/irq.h"
 #include "hw/pci/pci.h"
 #include "hw/pci/pci_bridge.h"
-#include "hw/pci/pci_bus.h"
 #include "hw/pci/pci_host.h"
 #include "hw/qdev-properties.h"
 #include "hw/pci-host/sabre.h"
@@ -608,9 +607,9 @@ static void sun4uv_init(MemoryRegion *address_space_mem,
     /* Only in-built Simba APBs can exist on the root bus, slot 0 on busA is
        reserved (leaving no slots free after on-board devices) however slots
        0-3 are free on busB */
-    pci_bus->slot_reserved_mask = 0xfffffffc;
-    pci_busA->slot_reserved_mask = 0xfffffff1;
-    pci_busB->slot_reserved_mask = 0xfffffff0;
+    pci_bus_set_slot_reserved_mask(pci_bus, 0xfffffffc);
+    pci_bus_set_slot_reserved_mask(pci_busA, 0xfffffff1);
+    pci_bus_set_slot_reserved_mask(pci_busB, 0xfffffff0);
 
     ebus = pci_new_multifunction(PCI_DEVFN(1, 0), true, TYPE_EBUS);
     qdev_prop_set_uint64(DEVICE(ebus), "console-serial-base",
diff --git a/hw/virtio/vhost-user-i2c.c b/hw/virtio/vhost-user-i2c.c
index 60eaf0d95b..4eef3f0633 100644
--- a/hw/virtio/vhost-user-i2c.c
+++ b/hw/virtio/vhost-user-i2c.c
@@ -128,6 +128,14 @@ static void vu_i2c_guest_notifier_mask(VirtIODevice *vdev, int idx, bool mask)
 {
     VHostUserI2C *i2c = VHOST_USER_I2C(vdev);
 
+    /*
+     * We don't support interrupts; return early if index is set to
+     * VIRTIO_CONFIG_IRQ_IDX.
+     */
+    if (idx == VIRTIO_CONFIG_IRQ_IDX) {
+        return;
+    }
+
     vhost_virtqueue_mask(&i2c->vhost_dev, vdev, idx, mask);
 }
 
@@ -135,6 +143,14 @@ static bool vu_i2c_guest_notifier_pending(VirtIODevice *vdev, int idx)
 {
     VHostUserI2C *i2c = VHOST_USER_I2C(vdev);
 
+    /*
+     * We don't support interrupts; return early if index is set to
+     * VIRTIO_CONFIG_IRQ_IDX.
+ */ + if (idx == VIRTIO_CONFIG_IRQ_IDX) { + return false; + } + return vhost_virtqueue_pending(&i2c->vhost_dev, idx); } diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c index a266396576..746d130c74 100644 --- a/hw/virtio/vhost.c +++ b/hw/virtio/vhost.c @@ -1291,18 +1291,6 @@ void vhost_virtqueue_stop(struct vhost_dev *dev, 0, virtio_queue_get_desc_size(vdev, idx)); } -static void vhost_eventfd_add(MemoryListener *listener, - MemoryRegionSection *section, - bool match_data, uint64_t data, EventNotifier *e) -{ -} - -static void vhost_eventfd_del(MemoryListener *listener, - MemoryRegionSection *section, - bool match_data, uint64_t data, EventNotifier *e) -{ -} - static int vhost_virtqueue_set_busyloop_timeout(struct vhost_dev *dev, int n, uint32_t timeout) { @@ -1457,8 +1445,6 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque, .log_sync = vhost_log_sync, .log_global_start = vhost_log_global_start, .log_global_stop = vhost_log_global_stop, - .eventfd_add = vhost_eventfd_add, - .eventfd_del = vhost_eventfd_del, .priority = 10 }; diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c index 746f07c4d2..fd06fcfb3f 100644 --- a/hw/virtio/virtio-balloon.c +++ b/hw/virtio/virtio-balloon.c @@ -32,6 +32,7 @@ #include "qemu/error-report.h" #include "migration/misc.h" #include "migration/migration.h" +#include "migration/options.h" #include "hw/virtio/virtio-bus.h" #include "hw/virtio/virtio-access.h" @@ -729,37 +730,14 @@ static void virtio_balloon_get_config(VirtIODevice *vdev, uint8_t *config_data) memcpy(config_data, &config, virtio_balloon_config_size(dev)); } -static int build_dimm_list(Object *obj, void *opaque) -{ - GSList **list = opaque; - - if (object_dynamic_cast(obj, TYPE_PC_DIMM)) { - DeviceState *dev = DEVICE(obj); - if (dev->realized) { /* only realized DIMMs matter */ - *list = g_slist_prepend(*list, dev); - } - } - - object_child_foreach(obj, build_dimm_list, opaque); - return 0; -} - static ram_addr_t get_current_ram_size(void) { - GSList *list = NULL, *item; - ram_addr_t size = current_machine->ram_size; - - build_dimm_list(qdev_get_machine(), &list); - for (item = list; item; item = g_slist_next(item)) { - Object *obj = OBJECT(item->data); - if (!strcmp(object_get_typename(obj), TYPE_PC_DIMM)) { - size += object_property_get_int(obj, PC_DIMM_SIZE_PROP, - &error_abort); - } + MachineState *machine = MACHINE(qdev_get_machine()); + if (machine->device_memory) { + return machine->ram_size + machine->device_memory->dimm_size; + } else { + return machine->ram_size; } - g_slist_free(list); - - return size; } static bool virtio_balloon_page_poison_support(void *opaque) diff --git a/hw/virtio/virtio-mmio.c b/hw/virtio/virtio-mmio.c index 23ba625eb6..c2c6d85475 100644 --- a/hw/virtio/virtio-mmio.c +++ b/hw/virtio/virtio-mmio.c @@ -354,6 +354,7 @@ static void virtio_mmio_write(void *opaque, hwaddr offset, uint64_t value, if (proxy->legacy) { virtio_queue_update_rings(vdev, vdev->queue_sel); } else { + virtio_init_region_cache(vdev, vdev->queue_sel); proxy->vqs[vdev->queue_sel].num = value; } break; diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c index 247325c193..02fb84a8fa 100644 --- a/hw/virtio/virtio-pci.c +++ b/hw/virtio/virtio-pci.c @@ -1554,6 +1554,7 @@ static void virtio_pci_common_write(void *opaque, hwaddr addr, proxy->vqs[vdev->queue_sel].num = val; virtio_queue_set_num(vdev, vdev->queue_sel, proxy->vqs[vdev->queue_sel].num); + virtio_init_region_cache(vdev, vdev->queue_sel); break; case VIRTIO_PCI_COMMON_Q_MSIX: vector = 
virtio_queue_vector(vdev, vdev->queue_sel); diff --git a/hw/virtio/virtio-pmem.c b/hw/virtio/virtio-pmem.c index dff402f08f..c3512c2dae 100644 --- a/hw/virtio/virtio-pmem.c +++ b/hw/virtio/virtio-pmem.c @@ -70,7 +70,6 @@ static void virtio_pmem_flush(VirtIODevice *vdev, VirtQueue *vq) VirtIODeviceRequest *req_data; VirtIOPMEM *pmem = VIRTIO_PMEM(vdev); HostMemoryBackend *backend = MEMORY_BACKEND(pmem->memdev); - ThreadPool *pool = aio_get_thread_pool(qemu_get_aio_context()); trace_virtio_pmem_flush_request(); req_data = virtqueue_pop(vq, sizeof(VirtIODeviceRequest)); @@ -88,7 +87,7 @@ static void virtio_pmem_flush(VirtIODevice *vdev, VirtQueue *vq) req_data->fd = memory_region_get_fd(&backend->mr); req_data->pmem = pmem; req_data->vdev = vdev; - thread_pool_submit_aio(pool, worker_cb, req_data, done_cb, req_data); + thread_pool_submit_aio(worker_cb, req_data, done_cb, req_data); } static void virtio_pmem_get_config(VirtIODevice *vdev, uint8_t *config) diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index 98c4819fcc..272d930721 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -226,7 +226,7 @@ static void virtio_virtqueue_reset_region_cache(struct VirtQueue *vq) } } -static void virtio_init_region_cache(VirtIODevice *vdev, int n) +void virtio_init_region_cache(VirtIODevice *vdev, int n) { VirtQueue *vq = &vdev->vq[n]; VRingMemoryRegionCaches *old = vq->vring.caches; diff --git a/hw/xen/xen_pt.c b/hw/xen/xen_pt.c index 2d33d178ad..a540149639 100644 --- a/hw/xen/xen_pt.c +++ b/hw/xen/xen_pt.c @@ -57,7 +57,6 @@ #include <sys/ioctl.h> #include "hw/pci/pci.h" -#include "hw/pci/pci_bus.h" #include "hw/qdev-properties.h" #include "hw/qdev-properties-system.h" #include "xen_pt.h" @@ -951,7 +950,7 @@ void xen_igd_reserve_slot(PCIBus *pci_bus) } XEN_PT_LOG(0, "Reserving PCI slot 2 for IGD\n"); - pci_bus->slot_reserved_mask |= XEN_PCI_IGD_SLOT_MASK; + pci_bus_set_slot_reserved_mask(pci_bus, XEN_PCI_IGD_SLOT_MASK); } static void xen_igd_clear_slot(DeviceState *qdev, Error **errp) @@ -971,7 +970,7 @@ static void xen_igd_clear_slot(DeviceState *qdev, Error **errp) return; } - if (!(pci_bus->slot_reserved_mask & XEN_PCI_IGD_SLOT_MASK)) { + if (!(pci_bus_get_slot_reserved_mask(pci_bus) & XEN_PCI_IGD_SLOT_MASK)) { xpdc->pci_qdev_realize(qdev, errp); return; } @@ -982,7 +981,7 @@ static void xen_igd_clear_slot(DeviceState *qdev, Error **errp) s->real_device.dev == XEN_PCI_IGD_DEV && s->real_device.func == XEN_PCI_IGD_FN && s->real_device.vendor_id == PCI_VENDOR_ID_INTEL) { - pci_bus->slot_reserved_mask &= ~XEN_PCI_IGD_SLOT_MASK; + pci_bus_clear_slot_reserved_mask(pci_bus, XEN_PCI_IGD_SLOT_MASK); XEN_PT_LOG(pci_dev, "Intel IGD found, using slot 2\n"); } xpdc->pci_qdev_realize(qdev, errp); diff --git a/include/block/aio-wait.h b/include/block/aio-wait.h index da13357bb8..6e43e3b7bb 100644 --- a/include/block/aio-wait.h +++ b/include/block/aio-wait.h @@ -63,7 +63,7 @@ extern AioWait global_aio_wait; * @ctx: the aio context, or NULL if multiple aio contexts (for which the * caller does not hold a lock) are involved in the polling condition. * @cond: wait while this conditional expression is true - * @unlock: whether to unlock and then lock again @ctx. This apples + * @unlock: whether to unlock and then lock again @ctx. This applies * only when waiting for another AioContext from the main loop. * Otherwise it's ignored. 
* diff --git a/include/block/aio.h b/include/block/aio.h index 543717f294..e267d918fd 100644 --- a/include/block/aio.h +++ b/include/block/aio.h @@ -208,17 +208,9 @@ struct AioContext { struct ThreadPool *thread_pool; #ifdef CONFIG_LINUX_AIO - /* - * State for native Linux AIO. Uses aio_context_acquire/release for - * locking. - */ struct LinuxAioState *linux_aio; #endif #ifdef CONFIG_LINUX_IO_URING - /* - * State for Linux io_uring. Uses aio_context_acquire/release for - * locking. - */ struct LuringState *linux_io_uring; /* State for file descriptor monitoring using Linux io_uring */ diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h index f01bb8b617..013d419444 100644 --- a/include/block/block_int-common.h +++ b/include/block/block_int-common.h @@ -1260,7 +1260,7 @@ extern QemuOptsList bdrv_create_opts_simple; /* * Common functions that are neither I/O nor Global State. * - * See include/block/block-commmon.h for more information about + * See include/block/block-common.h for more information about * the Common API. */ diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h index f8cda9df91..e46a29c3f0 100644 --- a/include/block/raw-aio.h +++ b/include/block/raw-aio.h @@ -49,26 +49,39 @@ typedef struct LinuxAioState LinuxAioState; LinuxAioState *laio_init(Error **errp); void laio_cleanup(LinuxAioState *s); -int coroutine_fn laio_co_submit(BlockDriverState *bs, LinuxAioState *s, int fd, - uint64_t offset, QEMUIOVector *qiov, int type, - uint64_t dev_max_batch); + +/* laio_co_submit: submit I/O requests in the thread's current AioContext. */ +int coroutine_fn laio_co_submit(int fd, uint64_t offset, QEMUIOVector *qiov, + int type, uint64_t dev_max_batch); + void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context); void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context); -void laio_io_plug(BlockDriverState *bs, LinuxAioState *s); -void laio_io_unplug(BlockDriverState *bs, LinuxAioState *s, - uint64_t dev_max_batch); + +/* + * laio_io_plug/unplug work in the thread's current AioContext, therefore the + * caller must ensure that they are paired in the same IOThread. + */ +void laio_io_plug(void); +void laio_io_unplug(uint64_t dev_max_batch); #endif /* io_uring.c - Linux io_uring implementation */ #ifdef CONFIG_LINUX_IO_URING typedef struct LuringState LuringState; LuringState *luring_init(Error **errp); void luring_cleanup(LuringState *s); -int coroutine_fn luring_co_submit(BlockDriverState *bs, LuringState *s, int fd, - uint64_t offset, QEMUIOVector *qiov, int type); + +/* luring_co_submit: submit I/O requests in the thread's current AioContext. */ +int coroutine_fn luring_co_submit(BlockDriverState *bs, int fd, uint64_t offset, + QEMUIOVector *qiov, int type); void luring_detach_aio_context(LuringState *s, AioContext *old_context); void luring_attach_aio_context(LuringState *s, AioContext *new_context); -void luring_io_plug(BlockDriverState *bs, LuringState *s); -void luring_io_unplug(BlockDriverState *bs, LuringState *s); + +/* + * luring_io_plug/unplug work in the thread's current AioContext, therefore the + * caller must ensure that they are paired in the same IOThread. 
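+ *
+ * A typical pairing (illustrative sketch, not a fixed API contract):
+ *
+ *     luring_io_plug();
+ *     ... submit multiple I/O requests ...
+ *     luring_io_unplug();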
+ */ +void luring_io_plug(void); +void luring_io_unplug(void); #endif #ifdef _WIN32 diff --git a/include/block/thread-pool.h b/include/block/thread-pool.h index 95ff2b0bdb..948ff5f30c 100644 --- a/include/block/thread-pool.h +++ b/include/block/thread-pool.h @@ -29,12 +29,15 @@ typedef struct ThreadPool ThreadPool; ThreadPool *thread_pool_new(struct AioContext *ctx); void thread_pool_free(ThreadPool *pool); -BlockAIOCB *thread_pool_submit_aio(ThreadPool *pool, - ThreadPoolFunc *func, void *arg, - BlockCompletionFunc *cb, void *opaque); -int coroutine_fn thread_pool_submit_co(ThreadPool *pool, - ThreadPoolFunc *func, void *arg); -void thread_pool_submit(ThreadPool *pool, ThreadPoolFunc *func, void *arg); +/* + * thread_pool_submit* API: submit I/O requests in the thread's + * current AioContext. + */ +BlockAIOCB *thread_pool_submit_aio(ThreadPoolFunc *func, void *arg, + BlockCompletionFunc *cb, void *opaque); +int coroutine_fn thread_pool_submit_co(ThreadPoolFunc *func, void *arg); +void thread_pool_submit(ThreadPoolFunc *func, void *arg); + void thread_pool_update_params(ThreadPool *pool, struct AioContext *ctx); #endif diff --git a/include/hw/boards.h b/include/hw/boards.h index bf5fc9e3e7..f4117fdb9a 100644 --- a/include/hw/boards.h +++ b/include/hw/boards.h @@ -292,10 +292,12 @@ struct MachineClass { * @base: address in guest physical address space where the memory * address space for memory devices starts * @mr: address space container for memory devices + * @dimm_size: the sum of plugged DIMMs' sizes */ typedef struct DeviceMemoryState { hwaddr base; MemoryRegion mr; + uint64_t dimm_size; } DeviceMemoryState; /** diff --git a/include/hw/cxl/cxl.h b/include/hw/cxl/cxl.h index b2cffbb364..c453983e83 100644 --- a/include/hw/cxl/cxl.h +++ b/include/hw/cxl/cxl.h @@ -23,12 +23,12 @@ #define CXL_WINDOW_MAX 10 -typedef struct PXBDev PXBDev; +typedef struct PXBCXLDev PXBCXLDev; typedef struct CXLFixedWindow { uint64_t size; char **targets; - PXBDev *target_hbs[8]; + PXBCXLDev *target_hbs[8]; uint8_t num_targets; uint8_t enc_int_ways; uint8_t enc_int_gran; diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h index eb668e9034..84935fc958 100644 --- a/include/hw/i386/pc.h +++ b/include/hw/i386/pc.h @@ -127,6 +127,9 @@ struct PCMachineClass { /* create kvmclock device even when KVM PV features are not exposed */ bool kvmclock_create_always; + + /* resizable acpi blob compat */ + bool resizable_acpi_blob; }; #define TYPE_PC_MACHINE "generic-pc-machine" diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h index d5a40cd058..935b4b91b4 100644 --- a/include/hw/pci/pci.h +++ b/include/hw/pci/pci.h @@ -287,6 +287,9 @@ void pci_bus_irqs(PCIBus *bus, pci_set_irq_fn set_irq, void pci_bus_map_irqs(PCIBus *bus, pci_map_irq_fn map_irq); void pci_bus_irqs_cleanup(PCIBus *bus); int pci_bus_get_irq_level(PCIBus *bus, int irq_num); +uint32_t pci_bus_get_slot_reserved_mask(PCIBus *bus); +void pci_bus_set_slot_reserved_mask(PCIBus *bus, uint32_t mask); +void pci_bus_clear_slot_reserved_mask(PCIBus *bus, uint32_t mask); /* 0 <= pin <= 3 0 = INTA, 1 = INTB, 2 = INTC, 3 = INTD */ static inline int pci_swizzle(int slot, int pin) { diff --git a/include/hw/pci/pci_bridge.h b/include/hw/pci/pci_bridge.h index 1677176b2a..01670e9e65 100644 --- a/include/hw/pci/pci_bridge.h +++ b/include/hw/pci/pci_bridge.h @@ -84,7 +84,7 @@ struct PCIBridge { #define PCI_BRIDGE_DEV_PROP_SHPC "shpc" typedef struct CXLHost CXLHost; -struct PXBDev { +typedef struct PXBDev { /*< private >*/ PCIDevice parent_obj; /*< public >*/ @@ -92,15 
+92,27 @@ struct PXBDev { uint8_t bus_nr; uint16_t numa_node; bool bypass_iommu; +} PXBDev; + +typedef struct PXBPCIEDev { + /*< private >*/ + PXBDev parent_obj; +} PXBPCIEDev; + +#define TYPE_PXB_DEV "pxb" +OBJECT_DECLARE_SIMPLE_TYPE(PXBDev, PXB_DEV) + +typedef struct PXBCXLDev { + /*< private >*/ + PXBPCIEDev parent_obj; + /*< public >*/ + bool hdm_for_passthrough; - struct cxl_dev { - CXLHost *cxl_host_bridge; /* Pointer to a CXLHost */ - } cxl; -}; + CXLHost *cxl_host_bridge; /* Pointer to a CXLHost */ +} PXBCXLDev; -#define TYPE_PXB_CXL_DEVICE "pxb-cxl" -DECLARE_INSTANCE_CHECKER(PXBDev, PXB_CXL_DEV, - TYPE_PXB_CXL_DEVICE) +#define TYPE_PXB_CXL_DEV "pxb-cxl" +OBJECT_DECLARE_SIMPLE_TYPE(PXBCXLDev, PXB_CXL_DEV) int pci_bridge_ssvid_init(PCIDevice *dev, uint8_t offset, uint16_t svid, uint16_t ssid, diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h index f236e94ca6..f6b38f7e9c 100644 --- a/include/hw/virtio/virtio.h +++ b/include/hw/virtio/virtio.h @@ -309,6 +309,7 @@ int virtio_get_num_queues(VirtIODevice *vdev); void virtio_queue_set_rings(VirtIODevice *vdev, int n, hwaddr desc, hwaddr avail, hwaddr used); void virtio_queue_update_rings(VirtIODevice *vdev, int n); +void virtio_init_region_cache(VirtIODevice *vdev, int n); void virtio_queue_set_align(VirtIODevice *vdev, int n, int align); void virtio_queue_notify(VirtIODevice *vdev, int n); uint16_t virtio_queue_vector(VirtIODevice *vdev, int n); diff --git a/include/migration/misc.h b/include/migration/misc.h index 8b49841016..5ebe13b4b9 100644 --- a/include/migration/misc.h +++ b/include/migration/misc.h @@ -66,7 +66,6 @@ bool migration_has_finished(MigrationState *); bool migration_has_failed(MigrationState *); /* ...and after the device transmission */ bool migration_in_postcopy_after_devices(MigrationState *); -void migration_global_dump(Monitor *mon); /* True if incoming migration entered POSTCOPY_INCOMING_DISCARD */ bool migration_in_incoming_postcopy(void); /* True if incoming migration entered POSTCOPY_INCOMING_ADVISE */ diff --git a/include/qemu/mmap-alloc.h b/include/qemu/mmap-alloc.h index 2825e231a7..8344daaa03 100644 --- a/include/qemu/mmap-alloc.h +++ b/include/qemu/mmap-alloc.h @@ -1,8 +1,15 @@ #ifndef QEMU_MMAP_ALLOC_H #define QEMU_MMAP_ALLOC_H +typedef enum { + QEMU_FS_TYPE_UNKNOWN = 0, + QEMU_FS_TYPE_TMPFS, + QEMU_FS_TYPE_HUGETLBFS, + QEMU_FS_TYPE_NUM, +} QemuFsType; size_t qemu_fd_getpagesize(int fd); +QemuFsType qemu_fd_getfs(int fd); /** * qemu_ram_mmap: mmap anonymous memory, the specified file or device. diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend-io.h index bb25493ba1..851a44de96 100644 --- a/include/sysemu/block-backend-io.h +++ b/include/sysemu/block-backend-io.h @@ -90,6 +90,11 @@ void blk_iostatus_set_err(BlockBackend *blk, int error); int blk_get_max_iov(BlockBackend *blk); int blk_get_max_hw_iov(BlockBackend *blk); +/* + * blk_io_plug/unplug are thread-local operations. This means that multiple + * IOThreads can simultaneously call plug/unplug, but the caller must ensure + * that each unplug() is called in the same IOThread of the matching plug(). 
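+ *
+ * For example, from a single IOThread (illustrative, assuming the paired
+ * blk_io_unplug() wrapper):
+ *
+ *     blk_io_plug(blk);
+ *     ... submit several requests ...
+ *     blk_io_unplug(blk);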
+ */ void coroutine_fn blk_co_io_plug(BlockBackend *blk); void co_wrapper blk_io_plug(BlockBackend *blk); diff --git a/meson_options.txt b/meson_options.txt index fc9447d267..2471dd02da 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -315,7 +315,7 @@ option('debug_mutex', type: 'boolean', value: false, description: 'mutex debugging support') option('debug_stack_usage', type: 'boolean', value: false, description: 'measure coroutine stack usage') -option('qom_cast_debug', type: 'boolean', value: false, +option('qom_cast_debug', type: 'boolean', value: true, description: 'cast debugging support') option('gprof', type: 'boolean', value: false, description: 'QEMU profiling with gprof', diff --git a/migration/block-dirty-bitmap.c b/migration/block-dirty-bitmap.c index fe73aa94b1..a6ffae0002 100644 --- a/migration/block-dirty-bitmap.c +++ b/migration/block-dirty-bitmap.c @@ -79,6 +79,7 @@ #include "qapi/qapi-visit-migration.h" #include "qapi/clone-visitor.h" #include "trace.h" +#include "options.h" #define CHUNK_SIZE (1 << 10) diff --git a/migration/block.c b/migration/block.c index b2497bbd32..6d532ac7a2 100644 --- a/migration/block.c +++ b/migration/block.c @@ -28,6 +28,7 @@ #include "migration/vmstate.h" #include "sysemu/block-backend.h" #include "trace.h" +#include "options.h" #define BLK_MIG_BLOCK_SIZE (1ULL << 20) #define BDRV_SECTORS_PER_DIRTY_CHUNK (BLK_MIG_BLOCK_SIZE >> BDRV_SECTOR_BITS) @@ -416,7 +417,7 @@ static int init_blk_migration(QEMUFile *f) bmds->bulk_completed = 0; bmds->total_sectors = sectors; bmds->completed_sectors = 0; - bmds->shared_base = migrate_use_block_incremental(); + bmds->shared_base = migrate_block_incremental(); assert(i < num_bs); bmds_bs[i].bmds = bmds; @@ -1000,7 +1001,7 @@ static int block_load(QEMUFile *f, void *opaque, int version_id) static bool block_is_active(void *opaque) { - return migrate_use_block(); + return migrate_block(); } static SaveVMHandlers savevm_block_handlers = { diff --git a/migration/colo.c b/migration/colo.c index 0716e64689..07bfa21fea 100644 --- a/migration/colo.c +++ b/migration/colo.c @@ -36,6 +36,7 @@ #include "sysemu/cpus.h" #include "sysemu/runstate.h" #include "net/filter.h" +#include "options.h" static bool vmstate_loading; static Notifier packets_compare_notifier; @@ -575,7 +576,7 @@ static void colo_process_checkpoint(MigrationState *s) trace_colo_vm_state_change("stop", "run"); timer_mod(s->colo_delay_timer, qemu_clock_get_ms(QEMU_CLOCK_HOST) + - s->parameters.x_checkpoint_delay); + migrate_checkpoint_delay()); while (s->state == MIGRATION_STATUS_COLO) { if (failover_get_state() != FAILOVER_STATUS_NONE) { @@ -650,8 +651,7 @@ void colo_checkpoint_notify(void *opaque) qemu_event_set(&s->colo_checkpoint_event); s->colo_checkpoint_time = qemu_clock_get_ms(QEMU_CLOCK_HOST); - next_notify_time = s->colo_checkpoint_time + - s->parameters.x_checkpoint_delay; + next_notify_time = s->colo_checkpoint_time + migrate_checkpoint_delay(); timer_mod(s->colo_delay_timer, next_notify_time); } diff --git a/migration/meson.build b/migration/meson.build index 0d1bb9f96e..480ff6854a 100644 --- a/migration/meson.build +++ b/migration/meson.build @@ -22,6 +22,7 @@ softmmu_ss.add(files( 'migration.c', 'multifd.c', 'multifd-zlib.c', + 'options.c', 'postcopy-ram.c', 'savevm.c', 'socket.c', diff --git a/migration/migration-hmp-cmds.c b/migration/migration-hmp-cmds.c index 72519ea99f..4e9f00e7dc 100644 --- a/migration/migration-hmp-cmds.c +++ b/migration/migration-hmp-cmds.c @@ -15,7 +15,6 @@ #include "qemu/osdep.h" #include "block/qapi.h" 
-#include "migration/misc.h" #include "migration/snapshot.h" #include "monitor/hmp.h" #include "monitor/monitor.h" @@ -30,6 +29,27 @@ #include "qemu/sockets.h" #include "sysemu/runstate.h" #include "ui/qemu-spice.h" +#include "sysemu/sysemu.h" +#include "migration.h" + +static void migration_global_dump(Monitor *mon) +{ + MigrationState *ms = migrate_get_current(); + + monitor_printf(mon, "globals:\n"); + monitor_printf(mon, "store-global-state: %s\n", + ms->store_global_state ? "on" : "off"); + monitor_printf(mon, "only-migratable: %s\n", + only_migratable ? "on" : "off"); + monitor_printf(mon, "send-configuration: %s\n", + ms->send_configuration ? "on" : "off"); + monitor_printf(mon, "send-section-footer: %s\n", + ms->send_section_footer ? "on" : "off"); + monitor_printf(mon, "decompress-error-check: %s\n", + ms->decompress_error_check ? "on" : "off"); + monitor_printf(mon, "clear-bitmap-shift: %u\n", + ms->clear_bitmap_shift); +} void hmp_info_migrate(Monitor *mon, const QDict *qdict) { @@ -616,23 +636,6 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) hmp_handle_error(mon, err); } -void hmp_client_migrate_info(Monitor *mon, const QDict *qdict) -{ - Error *err = NULL; - const char *protocol = qdict_get_str(qdict, "protocol"); - const char *hostname = qdict_get_str(qdict, "hostname"); - bool has_port = qdict_haskey(qdict, "port"); - int port = qdict_get_try_int(qdict, "port", -1); - bool has_tls_port = qdict_haskey(qdict, "tls-port"); - int tls_port = qdict_get_try_int(qdict, "tls-port", -1); - const char *cert_subject = qdict_get_try_str(qdict, "cert-subject"); - - qmp_client_migrate_info(protocol, hostname, - has_port, port, has_tls_port, tls_port, - cert_subject, &err); - hmp_handle_error(mon, err); -} - void hmp_migrate_start_postcopy(Monitor *mon, const QDict *qdict) { Error *err = NULL; diff --git a/migration/migration.c b/migration/migration.c index bda4789193..53dd59f6f6 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -63,7 +63,7 @@ #include "sysemu/cpus.h" #include "yank_functions.h" #include "sysemu/qtest.h" -#include "ui/qemu-spice.h" +#include "options.h" #define MAX_THROTTLE (128 << 20) /* Migration transfer speed throttling */ @@ -136,39 +136,6 @@ enum mig_rp_message_type { MIG_RP_MSG_MAX }; -/* Migration capabilities set */ -struct MigrateCapsSet { - int size; /* Capability set size */ - MigrationCapability caps[]; /* Variadic array of capabilities */ -}; -typedef struct MigrateCapsSet MigrateCapsSet; - -/* Define and initialize MigrateCapsSet */ -#define INITIALIZE_MIGRATE_CAPS_SET(_name, ...) \ - MigrateCapsSet _name = { \ - .size = sizeof((int []) { __VA_ARGS__ }) / sizeof(int), \ - .caps = { __VA_ARGS__ } \ - } - -/* Background-snapshot compatibility check list */ -static const -INITIALIZE_MIGRATE_CAPS_SET(check_caps_background_snapshot, - MIGRATION_CAPABILITY_POSTCOPY_RAM, - MIGRATION_CAPABILITY_DIRTY_BITMAPS, - MIGRATION_CAPABILITY_POSTCOPY_BLOCKTIME, - MIGRATION_CAPABILITY_LATE_BLOCK_ACTIVATE, - MIGRATION_CAPABILITY_RETURN_PATH, - MIGRATION_CAPABILITY_MULTIFD, - MIGRATION_CAPABILITY_PAUSE_BEFORE_SWITCHOVER, - MIGRATION_CAPABILITY_AUTO_CONVERGE, - MIGRATION_CAPABILITY_RELEASE_RAM, - MIGRATION_CAPABILITY_RDMA_PIN_ALL, - MIGRATION_CAPABILITY_COMPRESS, - MIGRATION_CAPABILITY_XBZRLE, - MIGRATION_CAPABILITY_X_COLO, - MIGRATION_CAPABILITY_VALIDATE_UUID, - MIGRATION_CAPABILITY_ZERO_COPY_SEND); - /* When we add fault tolerance, we could have several migrations at once. 
For now we don't need to add dynamic creation of migration */ @@ -186,7 +153,7 @@ static void migrate_fd_cancel(MigrationState *s); static bool migration_needs_multiple_sockets(void) { - return migrate_use_multifd() || migrate_postcopy_preempt(); + return migrate_multifd() || migrate_postcopy_preempt(); } static bool uri_supports_multi_channels(const char *uri) @@ -353,21 +320,11 @@ void migration_incoming_state_destroy(void) static void migrate_generate_event(int new_state) { - if (migrate_use_events()) { + if (migrate_events()) { qapi_event_send_migration(new_state); } } -static bool migrate_late_block_activate(void) -{ - MigrationState *s; - - s = migrate_get_current(); - - return s->enabled_capabilities[ - MIGRATION_CAPABILITY_LATE_BLOCK_ACTIVATE]; -} - /* * Send a message on the return channel back to the source * of the migration. @@ -742,7 +699,7 @@ void migration_fd_process_incoming(QEMUFile *f, Error **errp) static bool migration_should_start_incoming(bool main_channel) { /* Multifd doesn't start unless all channels are established */ - if (migrate_use_multifd()) { + if (migrate_multifd()) { return migration_has_all_channels(); } @@ -769,7 +726,7 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp) uint32_t channel_magic = 0; int ret = 0; - if (migrate_use_multifd() && !migrate_postcopy_ram() && + if (migrate_multifd() && !migrate_postcopy_ram() && qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) { /* * With multiple channels, it is possible that we receive channels @@ -808,7 +765,7 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp) } else { /* Multiple connections */ assert(migration_needs_multiple_sockets()); - if (migrate_use_multifd()) { + if (migrate_multifd()) { multifd_recv_new_channel(ioc, &local_err); } else { assert(migrate_postcopy_preempt()); @@ -844,7 +801,7 @@ bool migration_has_all_channels(void) return false; } - if (migrate_use_multifd()) { + if (migrate_multifd()) { return multifd_recv_all_channels_created(); } @@ -929,139 +886,6 @@ void migrate_send_rp_resume_ack(MigrationIncomingState *mis, uint32_t value) migrate_send_rp_message(mis, MIG_RP_MSG_RESUME_ACK, sizeof(buf), &buf); } -MigrationCapabilityStatusList *qmp_query_migrate_capabilities(Error **errp) -{ - MigrationCapabilityStatusList *head = NULL, **tail = &head; - MigrationCapabilityStatus *caps; - MigrationState *s = migrate_get_current(); - int i; - - for (i = 0; i < MIGRATION_CAPABILITY__MAX; i++) { -#ifndef CONFIG_LIVE_BLOCK_MIGRATION - if (i == MIGRATION_CAPABILITY_BLOCK) { - continue; - } -#endif - caps = g_malloc0(sizeof(*caps)); - caps->capability = i; - caps->state = s->enabled_capabilities[i]; - QAPI_LIST_APPEND(tail, caps); - } - - return head; -} - -MigrationParameters *qmp_query_migrate_parameters(Error **errp) -{ - MigrationParameters *params; - MigrationState *s = migrate_get_current(); - - /* TODO use QAPI_CLONE() instead of duplicating it inline */ - params = g_malloc0(sizeof(*params)); - params->has_compress_level = true; - params->compress_level = s->parameters.compress_level; - params->has_compress_threads = true; - params->compress_threads = s->parameters.compress_threads; - params->has_compress_wait_thread = true; - params->compress_wait_thread = s->parameters.compress_wait_thread; - params->has_decompress_threads = true; - params->decompress_threads = s->parameters.decompress_threads; - params->has_throttle_trigger_threshold = true; - params->throttle_trigger_threshold = s->parameters.throttle_trigger_threshold; - 
params->has_cpu_throttle_initial = true; - params->cpu_throttle_initial = s->parameters.cpu_throttle_initial; - params->has_cpu_throttle_increment = true; - params->cpu_throttle_increment = s->parameters.cpu_throttle_increment; - params->has_cpu_throttle_tailslow = true; - params->cpu_throttle_tailslow = s->parameters.cpu_throttle_tailslow; - params->tls_creds = g_strdup(s->parameters.tls_creds); - params->tls_hostname = g_strdup(s->parameters.tls_hostname); - params->tls_authz = g_strdup(s->parameters.tls_authz ? - s->parameters.tls_authz : ""); - params->has_max_bandwidth = true; - params->max_bandwidth = s->parameters.max_bandwidth; - params->has_downtime_limit = true; - params->downtime_limit = s->parameters.downtime_limit; - params->has_x_checkpoint_delay = true; - params->x_checkpoint_delay = s->parameters.x_checkpoint_delay; - params->has_block_incremental = true; - params->block_incremental = s->parameters.block_incremental; - params->has_multifd_channels = true; - params->multifd_channels = s->parameters.multifd_channels; - params->has_multifd_compression = true; - params->multifd_compression = s->parameters.multifd_compression; - params->has_multifd_zlib_level = true; - params->multifd_zlib_level = s->parameters.multifd_zlib_level; - params->has_multifd_zstd_level = true; - params->multifd_zstd_level = s->parameters.multifd_zstd_level; - params->has_xbzrle_cache_size = true; - params->xbzrle_cache_size = s->parameters.xbzrle_cache_size; - params->has_max_postcopy_bandwidth = true; - params->max_postcopy_bandwidth = s->parameters.max_postcopy_bandwidth; - params->has_max_cpu_throttle = true; - params->max_cpu_throttle = s->parameters.max_cpu_throttle; - params->has_announce_initial = true; - params->announce_initial = s->parameters.announce_initial; - params->has_announce_max = true; - params->announce_max = s->parameters.announce_max; - params->has_announce_rounds = true; - params->announce_rounds = s->parameters.announce_rounds; - params->has_announce_step = true; - params->announce_step = s->parameters.announce_step; - - if (s->parameters.has_block_bitmap_mapping) { - params->has_block_bitmap_mapping = true; - params->block_bitmap_mapping = - QAPI_CLONE(BitmapMigrationNodeAliasList, - s->parameters.block_bitmap_mapping); - } - - return params; -} - -void qmp_client_migrate_info(const char *protocol, const char *hostname, - bool has_port, int64_t port, - bool has_tls_port, int64_t tls_port, - const char *cert_subject, - Error **errp) -{ - if (strcmp(protocol, "spice") == 0) { - if (!qemu_using_spice(errp)) { - return; - } - - if (!has_port && !has_tls_port) { - error_setg(errp, QERR_MISSING_PARAMETER, "port/tls-port"); - return; - } - - if (qemu_spice.migrate_info(hostname, - has_port ? port : -1, - has_tls_port ? tls_port : -1, - cert_subject)) { - error_setg(errp, "Could not set up display for migration"); - return; - } - return; - } - - error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "protocol", "'spice'"); -} - -AnnounceParameters *migrate_announce_params(void) -{ - static AnnounceParameters ap; - - MigrationState *s = migrate_get_current(); - - ap.initial = s->parameters.announce_initial; - ap.max = s->parameters.announce_max; - ap.rounds = s->parameters.announce_rounds; - ap.step = s->parameters.announce_step; - - return ≈ -} - /* * Return true if we're already in the middle of a migration * (i.e. 
any of the active or setup states) @@ -1140,26 +964,28 @@ static void populate_ram_info(MigrationInfo *info, MigrationState *s) size_t page_size = qemu_target_page_size(); info->ram = g_malloc0(sizeof(*info->ram)); - info->ram->transferred = stat64_get(&ram_atomic_counters.transferred); + info->ram->transferred = stat64_get(&ram_counters.transferred); info->ram->total = ram_bytes_total(); - info->ram->duplicate = stat64_get(&ram_atomic_counters.duplicate); + info->ram->duplicate = stat64_get(&ram_counters.zero_pages); /* legacy value. It is not used anymore */ info->ram->skipped = 0; - info->ram->normal = stat64_get(&ram_atomic_counters.normal); + info->ram->normal = stat64_get(&ram_counters.normal_pages); info->ram->normal_bytes = info->ram->normal * page_size; info->ram->mbps = s->mbps; - info->ram->dirty_sync_count = ram_counters.dirty_sync_count; + info->ram->dirty_sync_count = + stat64_get(&ram_counters.dirty_sync_count); info->ram->dirty_sync_missed_zero_copy = - ram_counters.dirty_sync_missed_zero_copy; - info->ram->postcopy_requests = ram_counters.postcopy_requests; + stat64_get(&ram_counters.dirty_sync_missed_zero_copy); + info->ram->postcopy_requests = + stat64_get(&ram_counters.postcopy_requests); info->ram->page_size = page_size; - info->ram->multifd_bytes = ram_counters.multifd_bytes; + info->ram->multifd_bytes = stat64_get(&ram_counters.multifd_bytes); info->ram->pages_per_second = s->pages_per_second; - info->ram->precopy_bytes = ram_counters.precopy_bytes; - info->ram->downtime_bytes = ram_counters.downtime_bytes; - info->ram->postcopy_bytes = stat64_get(&ram_atomic_counters.postcopy_bytes); + info->ram->precopy_bytes = stat64_get(&ram_counters.precopy_bytes); + info->ram->downtime_bytes = stat64_get(&ram_counters.downtime_bytes); + info->ram->postcopy_bytes = stat64_get(&ram_counters.postcopy_bytes); - if (migrate_use_xbzrle()) { + if (migrate_xbzrle()) { info->xbzrle_cache = g_malloc0(sizeof(*info->xbzrle_cache)); info->xbzrle_cache->cache_size = migrate_xbzrle_cache_size(); info->xbzrle_cache->bytes = xbzrle_counters.bytes; @@ -1170,7 +996,7 @@ static void populate_ram_info(MigrationInfo *info, MigrationState *s) info->xbzrle_cache->overflow = xbzrle_counters.overflow; } - if (migrate_use_compression()) { + if (migrate_compress()) { info->compression = g_malloc0(sizeof(*info->compression)); info->compression->pages = compression_counters.pages; info->compression->busy = compression_counters.busy; @@ -1272,172 +1098,6 @@ static void fill_source_migration_info(MigrationInfo *info) info->status = state; } -typedef enum WriteTrackingSupport { - WT_SUPPORT_UNKNOWN = 0, - WT_SUPPORT_ABSENT, - WT_SUPPORT_AVAILABLE, - WT_SUPPORT_COMPATIBLE -} WriteTrackingSupport; - -static -WriteTrackingSupport migrate_query_write_tracking(void) -{ - /* Check if kernel supports required UFFD features */ - if (!ram_write_tracking_available()) { - return WT_SUPPORT_ABSENT; - } - /* - * Check if current memory configuration is - * compatible with required UFFD features. - */ - if (!ram_write_tracking_compatible()) { - return WT_SUPPORT_AVAILABLE; - } - - return WT_SUPPORT_COMPATIBLE; -} - -/** - * @migration_caps_check - check capability validity - * - * @cap_list: old capability list, array of bool - * @params: new capabilities to be applied soon - * @errp: set *errp if the check failed, with reason - * - * Returns true if check passed, otherwise false. 
- */ -static bool migrate_caps_check(bool *cap_list, - MigrationCapabilityStatusList *params, - Error **errp) -{ - MigrationCapabilityStatusList *cap; - bool old_postcopy_cap; - MigrationIncomingState *mis = migration_incoming_get_current(); - - old_postcopy_cap = cap_list[MIGRATION_CAPABILITY_POSTCOPY_RAM]; - - for (cap = params; cap; cap = cap->next) { - cap_list[cap->value->capability] = cap->value->state; - } - -#ifndef CONFIG_LIVE_BLOCK_MIGRATION - if (cap_list[MIGRATION_CAPABILITY_BLOCK]) { - error_setg(errp, "QEMU compiled without old-style (blk/-b, inc/-i) " - "block migration"); - error_append_hint(errp, "Use drive_mirror+NBD instead.\n"); - return false; - } -#endif - -#ifndef CONFIG_REPLICATION - if (cap_list[MIGRATION_CAPABILITY_X_COLO]) { - error_setg(errp, "QEMU compiled without replication module" - " can't enable COLO"); - error_append_hint(errp, "Please enable replication before COLO.\n"); - return false; - } -#endif - - if (cap_list[MIGRATION_CAPABILITY_POSTCOPY_RAM]) { - /* This check is reasonably expensive, so only when it's being - * set the first time, also it's only the destination that needs - * special support. - */ - if (!old_postcopy_cap && runstate_check(RUN_STATE_INMIGRATE) && - !postcopy_ram_supported_by_host(mis)) { - /* postcopy_ram_supported_by_host will have emitted a more - * detailed message - */ - error_setg(errp, "Postcopy is not supported"); - return false; - } - - if (cap_list[MIGRATION_CAPABILITY_X_IGNORE_SHARED]) { - error_setg(errp, "Postcopy is not compatible with ignore-shared"); - return false; - } - } - - if (cap_list[MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT]) { - WriteTrackingSupport wt_support; - int idx; - /* - * Check if 'background-snapshot' capability is supported by - * host kernel and compatible with guest memory configuration. - */ - wt_support = migrate_query_write_tracking(); - if (wt_support < WT_SUPPORT_AVAILABLE) { - error_setg(errp, "Background-snapshot is not supported by host kernel"); - return false; - } - if (wt_support < WT_SUPPORT_COMPATIBLE) { - error_setg(errp, "Background-snapshot is not compatible " - "with guest memory configuration"); - return false; - } - - /* - * Check if there are any migration capabilities - * incompatible with 'background-snapshot'. 
- */ - for (idx = 0; idx < check_caps_background_snapshot.size; idx++) { - int incomp_cap = check_caps_background_snapshot.caps[idx]; - if (cap_list[incomp_cap]) { - error_setg(errp, - "Background-snapshot is not compatible with %s", - MigrationCapability_str(incomp_cap)); - return false; - } - } - } - -#ifdef CONFIG_LINUX - if (cap_list[MIGRATION_CAPABILITY_ZERO_COPY_SEND] && - (!cap_list[MIGRATION_CAPABILITY_MULTIFD] || - cap_list[MIGRATION_CAPABILITY_COMPRESS] || - cap_list[MIGRATION_CAPABILITY_XBZRLE] || - migrate_multifd_compression() || - migrate_use_tls())) { - error_setg(errp, - "Zero copy only available for non-compressed non-TLS multifd migration"); - return false; - } -#else - if (cap_list[MIGRATION_CAPABILITY_ZERO_COPY_SEND]) { - error_setg(errp, - "Zero copy currently only available on Linux"); - return false; - } -#endif - - if (cap_list[MIGRATION_CAPABILITY_POSTCOPY_PREEMPT]) { - if (!cap_list[MIGRATION_CAPABILITY_POSTCOPY_RAM]) { - error_setg(errp, "Postcopy preempt requires postcopy-ram"); - return false; - } - - /* - * Preempt mode requires urgent pages to be sent in separate - * channel, OTOH compression logic will disorder all pages into - * different compression channels, which is not compatible with the - * preempt assumptions on channel assignments. - */ - if (cap_list[MIGRATION_CAPABILITY_COMPRESS]) { - error_setg(errp, "Postcopy preempt not compatible with compress"); - return false; - } - } - - if (cap_list[MIGRATION_CAPABILITY_MULTIFD]) { - if (cap_list[MIGRATION_CAPABILITY_COMPRESS]) { - error_setg(errp, "Multifd is not compatible with compress"); - return false; - } - } - - return true; -} - static void fill_destination_migration_info(MigrationInfo *info) { MigrationIncomingState *mis = migration_incoming_get_current(); @@ -1480,28 +1140,6 @@ MigrationInfo *qmp_query_migrate(Error **errp) return info; } -void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params, - Error **errp) -{ - MigrationState *s = migrate_get_current(); - MigrationCapabilityStatusList *cap; - bool cap_list[MIGRATION_CAPABILITY__MAX]; - - if (migration_is_running(s->state)) { - error_setg(errp, QERR_MIGRATION_ACTIVE); - return; - } - - memcpy(cap_list, s->enabled_capabilities, sizeof(cap_list)); - if (!migrate_caps_check(cap_list, params, errp)) { - return; - } - - for (cap = params; cap; cap = cap->next) { - s->enabled_capabilities[cap->value->capability] = cap->value->state; - } -} - /* * Check whether the parameters are valid. Error will be put into errp * (if provided). Return true if valid, otherwise false. 
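The capability plumbing removed above moves into migration/options.c later in this diff: capability reads become one-line accessors, and single-capability writes go through migrate_cap_set(), which re-runs migrate_caps_check() against the current set. For orientation, a minimal sketch of a caller; the wrapper function here is hypothetical and not part of this patch:

/* Hypothetical caller, for illustration only; assumes migration/options.h. */
static bool example_enable_block_migration(Error **errp)
{
    if (migrate_block()) {
        return true;    /* capability already enabled */
    }
    /*
     * migrate_cap_set() fails (and sets *errp) while a migration is
     * running, or when the resulting capability set is inconsistent.
     */
    return migrate_cap_set(MIGRATION_CAPABILITY_BLOCK, true, errp);
}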
@@ -1651,7 +1289,7 @@ static bool migrate_params_check(MigrationParameters *params, Error **errp) } #ifdef CONFIG_LINUX - if (migrate_use_zero_copy_send() && + if (migrate_zero_copy_send() && ((params->has_multifd_compression && params->multifd_compression) || (params->tls_creds && *params->tls_creds))) { error_setg(errp, @@ -1946,27 +1584,6 @@ void migrate_set_state(int *state, int old_state, int new_state) } } -static MigrationCapabilityStatus *migrate_cap_add(MigrationCapability index, - bool state) -{ - MigrationCapabilityStatus *cap; - - cap = g_new0(MigrationCapabilityStatus, 1); - cap->capability = index; - cap->state = state; - - return cap; -} - -void migrate_set_block_enabled(bool value, Error **errp) -{ - MigrationCapabilityStatusList *cap = NULL; - - QAPI_LIST_PREPEND(cap, migrate_cap_add(MIGRATION_CAPABILITY_BLOCK, value)); - qmp_migrate_set_capabilities(cap, errp); - qapi_free_MigrationCapabilityStatusList(cap); -} - static void migrate_set_block_incremental(MigrationState *s, bool value) { s->parameters.block_incremental = value; @@ -1976,7 +1593,7 @@ static void block_cleanup_parameters(MigrationState *s) { if (s->must_remove_block_options) { /* setting to false can never fail */ - migrate_set_block_enabled(false, &error_abort); + migrate_cap_set(MIGRATION_CAPABILITY_BLOCK, false, &error_abort); migrate_set_block_incremental(s, false); s->must_remove_block_options = false; } @@ -2454,17 +2071,16 @@ static bool migrate_prepare(MigrationState *s, bool blk, bool blk_inc, } if (blk || blk_inc) { - if (migrate_colo_enabled()) { + if (migrate_colo()) { error_setg(errp, "No disk migration is required in COLO mode"); return false; } - if (migrate_use_block() || migrate_use_block_incremental()) { + if (migrate_block() || migrate_block_incremental()) { error_setg(errp, "Command options are incompatible with " "current migration capabilities"); return false; } - migrate_set_block_enabled(true, &local_err); - if (local_err) { + if (!migrate_cap_set(MIGRATION_CAPABILITY_BLOCK, true, &local_err)) { error_propagate(errp, local_err); return false; } @@ -2561,204 +2177,6 @@ void qmp_migrate_continue(MigrationStatus state, Error **errp) qemu_sem_post(&s->pause_sem); } -bool migrate_release_ram(void) -{ - MigrationState *s; - - s = migrate_get_current(); - - return s->enabled_capabilities[MIGRATION_CAPABILITY_RELEASE_RAM]; -} - -bool migrate_postcopy_ram(void) -{ - MigrationState *s; - - s = migrate_get_current(); - - return s->enabled_capabilities[MIGRATION_CAPABILITY_POSTCOPY_RAM]; -} - -bool migrate_postcopy(void) -{ - return migrate_postcopy_ram() || migrate_dirty_bitmaps(); -} - -bool migrate_auto_converge(void) -{ - MigrationState *s; - - s = migrate_get_current(); - - return s->enabled_capabilities[MIGRATION_CAPABILITY_AUTO_CONVERGE]; -} - -bool migrate_zero_blocks(void) -{ - MigrationState *s; - - s = migrate_get_current(); - - return s->enabled_capabilities[MIGRATION_CAPABILITY_ZERO_BLOCKS]; -} - -bool migrate_postcopy_blocktime(void) -{ - MigrationState *s; - - s = migrate_get_current(); - - return s->enabled_capabilities[MIGRATION_CAPABILITY_POSTCOPY_BLOCKTIME]; -} - -bool migrate_use_compression(void) -{ - MigrationState *s; - - s = migrate_get_current(); - - return s->enabled_capabilities[MIGRATION_CAPABILITY_COMPRESS]; -} - -int migrate_compress_level(void) -{ - MigrationState *s; - - s = migrate_get_current(); - - return s->parameters.compress_level; -} - -int migrate_compress_threads(void) -{ - MigrationState *s; - - s = migrate_get_current(); - - return 
s->parameters.compress_threads; -} - -int migrate_compress_wait_thread(void) -{ - MigrationState *s; - - s = migrate_get_current(); - - return s->parameters.compress_wait_thread; -} - -int migrate_decompress_threads(void) -{ - MigrationState *s; - - s = migrate_get_current(); - - return s->parameters.decompress_threads; -} - -bool migrate_dirty_bitmaps(void) -{ - MigrationState *s; - - s = migrate_get_current(); - - return s->enabled_capabilities[MIGRATION_CAPABILITY_DIRTY_BITMAPS]; -} - -bool migrate_ignore_shared(void) -{ - MigrationState *s; - - s = migrate_get_current(); - - return s->enabled_capabilities[MIGRATION_CAPABILITY_X_IGNORE_SHARED]; -} - -bool migrate_validate_uuid(void) -{ - MigrationState *s; - - s = migrate_get_current(); - - return s->enabled_capabilities[MIGRATION_CAPABILITY_VALIDATE_UUID]; -} - -bool migrate_use_events(void) -{ - MigrationState *s; - - s = migrate_get_current(); - - return s->enabled_capabilities[MIGRATION_CAPABILITY_EVENTS]; -} - -bool migrate_use_multifd(void) -{ - MigrationState *s; - - s = migrate_get_current(); - - return s->enabled_capabilities[MIGRATION_CAPABILITY_MULTIFD]; -} - -bool migrate_pause_before_switchover(void) -{ - MigrationState *s; - - s = migrate_get_current(); - - return s->enabled_capabilities[ - MIGRATION_CAPABILITY_PAUSE_BEFORE_SWITCHOVER]; -} - -int migrate_multifd_channels(void) -{ - MigrationState *s; - - s = migrate_get_current(); - - return s->parameters.multifd_channels; -} - -MultiFDCompression migrate_multifd_compression(void) -{ - MigrationState *s; - - s = migrate_get_current(); - - assert(s->parameters.multifd_compression < MULTIFD_COMPRESSION__MAX); - return s->parameters.multifd_compression; -} - -int migrate_multifd_zlib_level(void) -{ - MigrationState *s; - - s = migrate_get_current(); - - return s->parameters.multifd_zlib_level; -} - -int migrate_multifd_zstd_level(void) -{ - MigrationState *s; - - s = migrate_get_current(); - - return s->parameters.multifd_zstd_level; -} - -#ifdef CONFIG_LINUX -bool migrate_use_zero_copy_send(void) -{ - MigrationState *s; - - s = migrate_get_current(); - - return s->enabled_capabilities[MIGRATION_CAPABILITY_ZERO_COPY_SEND]; -} -#endif - int migrate_use_tls(void) { MigrationState *s; @@ -2768,78 +2186,6 @@ int migrate_use_tls(void) return s->parameters.tls_creds && *s->parameters.tls_creds; } -int migrate_use_xbzrle(void) -{ - MigrationState *s; - - s = migrate_get_current(); - - return s->enabled_capabilities[MIGRATION_CAPABILITY_XBZRLE]; -} - -uint64_t migrate_xbzrle_cache_size(void) -{ - MigrationState *s; - - s = migrate_get_current(); - - return s->parameters.xbzrle_cache_size; -} - -static int64_t migrate_max_postcopy_bandwidth(void) -{ - MigrationState *s; - - s = migrate_get_current(); - - return s->parameters.max_postcopy_bandwidth; -} - -bool migrate_use_block(void) -{ - MigrationState *s; - - s = migrate_get_current(); - - return s->enabled_capabilities[MIGRATION_CAPABILITY_BLOCK]; -} - -bool migrate_use_return_path(void) -{ - MigrationState *s; - - s = migrate_get_current(); - - return s->enabled_capabilities[MIGRATION_CAPABILITY_RETURN_PATH]; -} - -bool migrate_use_block_incremental(void) -{ - MigrationState *s; - - s = migrate_get_current(); - - return s->parameters.block_incremental; -} - -bool migrate_background_snapshot(void) -{ - MigrationState *s; - - s = migrate_get_current(); - - return s->enabled_capabilities[MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT]; -} - -bool migrate_postcopy_preempt(void) -{ - MigrationState *s; - - s = migrate_get_current(); - - 
return s->enabled_capabilities[MIGRATION_CAPABILITY_POSTCOPY_PREEMPT]; -} - /* migration thread support */ /* * Something bad happened to the RP stream, mark an error @@ -3436,7 +2782,6 @@ static void migration_completion(MigrationState *s) ret = global_state_store(); if (!ret) { - bool inactivate = !migrate_colo_enabled(); ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE); trace_migration_completion_vm_stop(ret); if (ret >= 0) { @@ -3444,12 +2789,10 @@ static void migration_completion(MigrationState *s) MIGRATION_STATUS_DEVICE); } if (ret >= 0) { + s->block_inactive = !migrate_colo(); qemu_file_set_rate_limit(s->to_dst_file, INT64_MAX); ret = qemu_savevm_state_complete_precopy(s->to_dst_file, false, - inactivate); - } - if (inactivate && ret >= 0) { - s->block_inactive = true; + s->block_inactive); } } qemu_mutex_unlock_iothread(); @@ -3499,7 +2842,7 @@ static void migration_completion(MigrationState *s) goto fail_invalidate; } - if (migrate_colo_enabled() && s->state == MIGRATION_STATUS_ACTIVE) { + if (migrate_colo() && s->state == MIGRATION_STATUS_ACTIVE) { /* COLO does not support postcopy */ migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE, MIGRATION_STATUS_COLO); @@ -3522,6 +2865,7 @@ fail_invalidate: bdrv_activate_all(&local_err); if (local_err) { error_report_err(local_err); + s->block_inactive = true; } else { s->block_inactive = false; } @@ -3577,12 +2921,6 @@ fail: MIGRATION_STATUS_FAILED); } -bool migrate_colo_enabled(void) -{ - MigrationState *s = migrate_get_current(); - return s->enabled_capabilities[MIGRATION_CAPABILITY_X_COLO]; -} - typedef enum MigThrError { /* No error detected */ MIG_THR_ERR_NONE = 0, @@ -3778,7 +3116,7 @@ static MigThrError migration_detect_error(MigrationState *s) static uint64_t migration_total_bytes(MigrationState *s) { return qemu_file_total_transferred(s->to_dst_file) + - ram_counters.multifd_bytes; + stat64_get(&ram_counters.multifd_bytes); } static void migration_calculate_complete(MigrationState *s) @@ -3913,7 +3251,7 @@ static void migration_iteration_finish(MigrationState *s) runstate_set(RUN_STATE_POSTMIGRATE); break; case MIGRATION_STATUS_COLO: - if (!migrate_colo_enabled()) { + if (!migrate_colo()) { error_report("%s: critical error: calling COLO code without " "COLO enabled", __func__); } @@ -4109,7 +3447,7 @@ static void *migration_thread(void *opaque) qemu_savevm_send_postcopy_advise(s->to_dst_file); } - if (migrate_colo_enabled()) { + if (migrate_colo()) { /* Notify migration destination that we enable COLO */ qemu_savevm_send_colo_enable(s->to_dst_file); } @@ -4361,11 +3699,11 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) if (resume) { /* This is a resumed migration */ - rate_limit = s->parameters.max_postcopy_bandwidth / + rate_limit = migrate_max_postcopy_bandwidth() / XFER_LIMIT_RATIO; } else { /* This is a fresh new migration */ - rate_limit = s->parameters.max_bandwidth / XFER_LIMIT_RATIO; + rate_limit = migrate_max_bandwidth() / XFER_LIMIT_RATIO; /* Notify before starting migration thread */ notifier_list_notify(&migration_state_notifiers, s); @@ -4379,7 +3717,7 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) * precopy, only if user specified "return-path" capability would * QEMU uses the return path. 
*/ - if (migrate_postcopy_ram() || migrate_use_return_path()) { + if (migrate_postcopy_ram() || migrate_return_path()) { if (open_return_path_on_source(s, !resume)) { error_report("Unable to open return-path for postcopy"); migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED); @@ -4423,27 +3761,8 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) s->migration_thread_running = true; } -void migration_global_dump(Monitor *mon) -{ - MigrationState *ms = migrate_get_current(); - - monitor_printf(mon, "globals:\n"); - monitor_printf(mon, "store-global-state: %s\n", - ms->store_global_state ? "on" : "off"); - monitor_printf(mon, "only-migratable: %s\n", - only_migratable ? "on" : "off"); - monitor_printf(mon, "send-configuration: %s\n", - ms->send_configuration ? "on" : "off"); - monitor_printf(mon, "send-section-footer: %s\n", - ms->send_section_footer ? "on" : "off"); - monitor_printf(mon, "decompress-error-check: %s\n", - ms->decompress_error_check ? "on" : "off"); - monitor_printf(mon, "clear-bitmap-shift: %u\n", - ms->clear_bitmap_shift); -} - #define DEFINE_PROP_MIG_CAP(name, x) \ - DEFINE_PROP_BOOL(name, MigrationState, enabled_capabilities[x], false) + DEFINE_PROP_BOOL(name, MigrationState, capabilities[x], false) static Property migration_properties[] = { DEFINE_PROP_BOOL("store-global-state", MigrationState, @@ -4632,27 +3951,14 @@ static void migration_instance_init(Object *obj) */ static bool migration_object_check(MigrationState *ms, Error **errp) { - MigrationCapabilityStatusList *head = NULL; /* Assuming all off */ - bool cap_list[MIGRATION_CAPABILITY__MAX] = { 0 }, ret; - int i; + bool old_caps[MIGRATION_CAPABILITY__MAX] = { 0 }; if (!migrate_params_check(&ms->parameters, errp)) { return false; } - for (i = 0; i < MIGRATION_CAPABILITY__MAX; i++) { - if (ms->enabled_capabilities[i]) { - QAPI_LIST_PREPEND(head, migrate_cap_add(i, true)); - } - } - - ret = migrate_caps_check(cap_list, head, errp); - - /* It works with head == NULL */ - qapi_free_MigrationCapabilityStatusList(head); - - return ret; + return migrate_caps_check(old_caps, ms->capabilities, errp); } static const TypeInfo migration_type = { diff --git a/migration/migration.h b/migration/migration.h index 310ae8901b..dcf906868d 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -310,7 +310,7 @@ struct MigrationState { int64_t downtime_start; int64_t downtime; int64_t expected_downtime; - bool enabled_capabilities[MIGRATION_CAPABILITY__MAX]; + bool capabilities[MIGRATION_CAPABILITY__MAX]; int64_t setup_time; /* * Whether guest was running when we enter the completion stage. 
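With the field renamed to capabilities[], the rewritten migration_object_check() above no longer needs to build a temporary MigrationCapabilityStatusList; it validates the capabilities configured via -global directly against an all-false baseline. A hedged sketch of that logic, with a hypothetical wrapper name:

/*
 * Illustration only: re-check every capability enabled at object
 * creation time, as if each were being set for the first time.
 */
static bool example_check_configured_caps(MigrationState *ms, Error **errp)
{
    bool old_caps[MIGRATION_CAPABILITY__MAX] = { 0 };

    return migrate_caps_check(old_caps, ms->capabilities, errp);
}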
@@ -447,50 +447,10 @@ bool migration_is_blocked(Error **errp); bool migration_in_postcopy(void); MigrationState *migrate_get_current(void); -bool migrate_postcopy(void); - -bool migrate_release_ram(void); -bool migrate_postcopy_ram(void); -bool migrate_zero_blocks(void); -bool migrate_dirty_bitmaps(void); -bool migrate_ignore_shared(void); -bool migrate_validate_uuid(void); - -bool migrate_auto_converge(void); -bool migrate_use_multifd(void); -bool migrate_pause_before_switchover(void); -int migrate_multifd_channels(void); -MultiFDCompression migrate_multifd_compression(void); -int migrate_multifd_zlib_level(void); -int migrate_multifd_zstd_level(void); - -#ifdef CONFIG_LINUX -bool migrate_use_zero_copy_send(void); -#else -#define migrate_use_zero_copy_send() (false) -#endif int migrate_use_tls(void); -int migrate_use_xbzrle(void); -uint64_t migrate_xbzrle_cache_size(void); -bool migrate_colo_enabled(void); - -bool migrate_use_block(void); -bool migrate_use_block_incremental(void); -int migrate_max_cpu_throttle(void); -bool migrate_use_return_path(void); uint64_t ram_get_total_transferred_pages(void); -bool migrate_use_compression(void); -int migrate_compress_level(void); -int migrate_compress_threads(void); -int migrate_compress_wait_thread(void); -int migrate_decompress_threads(void); -bool migrate_use_events(void); -bool migrate_postcopy_blocktime(void); -bool migrate_background_snapshot(void); -bool migrate_postcopy_preempt(void); - /* Sending on the return path - generic and then for each message type */ void migrate_send_rp_shut(MigrationIncomingState *mis, uint32_t value); diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c index 37770248e1..81701250ad 100644 --- a/migration/multifd-zlib.c +++ b/migration/multifd-zlib.c @@ -18,6 +18,7 @@ #include "qapi/error.h" #include "migration.h" #include "trace.h" +#include "options.h" #include "multifd.h" struct zlib_data { diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c index f4a8e1ed1f..d1d29e76cc 100644 --- a/migration/multifd-zstd.c +++ b/migration/multifd-zstd.c @@ -18,6 +18,7 @@ #include "qapi/error.h" #include "migration.h" #include "trace.h" +#include "options.h" #include "multifd.h" struct zstd_data { diff --git a/migration/multifd.c b/migration/multifd.c index cbc0dfe39b..cce3ad6988 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -25,7 +25,7 @@ #include "trace.h" #include "multifd.h" #include "threadinfo.h" - +#include "options.h" #include "qemu/yank.h" #include "io/channel-socket.h" #include "yank_functions.h" @@ -432,9 +432,9 @@ static int multifd_send_pages(QEMUFile *f) p->pages = pages; transferred = ((uint64_t) pages->num) * p->page_size + p->packet_len; qemu_file_acct_rate_limit(f, transferred); - ram_counters.multifd_bytes += transferred; - stat64_add(&ram_atomic_counters.transferred, transferred); qemu_mutex_unlock(&p->mutex); + stat64_add(&ram_counters.transferred, transferred); + stat64_add(&ram_counters.multifd_bytes, transferred); qemu_sem_post(&p->sem); return 1; @@ -516,7 +516,7 @@ void multifd_save_cleanup(void) { int i; - if (!migrate_use_multifd()) { + if (!migrate_multifd()) { return; } multifd_send_terminate_threads(NULL); @@ -576,7 +576,7 @@ static int multifd_zero_copy_flush(QIOChannel *c) return -1; } if (ret == 1) { - dirty_sync_missed_zero_copy(); + stat64_add(&ram_counters.dirty_sync_missed_zero_copy, 1); } return ret; @@ -587,7 +587,7 @@ int multifd_send_sync_main(QEMUFile *f) int i; bool flush_zero_copy; - if (!migrate_use_multifd()) { + if (!migrate_multifd()) { 
return 0; } if (multifd_send_state->pages->num) { @@ -608,7 +608,7 @@ int multifd_send_sync_main(QEMUFile *f) * all the dirty bitmaps. */ - flush_zero_copy = migrate_use_zero_copy_send(); + flush_zero_copy = migrate_zero_copy_send(); for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; @@ -627,9 +627,9 @@ int multifd_send_sync_main(QEMUFile *f) p->flags |= MULTIFD_FLAG_SYNC; p->pending_job++; qemu_file_acct_rate_limit(f, p->packet_len); - ram_counters.multifd_bytes += p->packet_len; - stat64_add(&ram_atomic_counters.transferred, p->packet_len); qemu_mutex_unlock(&p->mutex); + stat64_add(&ram_counters.transferred, p->packet_len); + stat64_add(&ram_counters.multifd_bytes, p->packet_len); qemu_sem_post(&p->sem); } for (i = 0; i < migrate_multifd_channels(); i++) { @@ -653,7 +653,7 @@ static void *multifd_send_thread(void *opaque) MigrationThread *thread = NULL; Error *local_err = NULL; int ret = 0; - bool use_zero_copy_send = migrate_use_zero_copy_send(); + bool use_zero_copy_send = migrate_zero_copy_send(); thread = MigrationThreadAdd(p->name, qemu_get_thread_id()); @@ -911,7 +911,7 @@ int multifd_save_setup(Error **errp) uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size(); uint8_t i; - if (!migrate_use_multifd()) { + if (!migrate_multifd()) { return 0; } @@ -945,7 +945,7 @@ int multifd_save_setup(Error **errp) p->page_size = qemu_target_page_size(); p->page_count = page_count; - if (migrate_use_zero_copy_send()) { + if (migrate_zero_copy_send()) { p->write_flags = QIO_CHANNEL_WRITE_FLAG_ZERO_COPY; } else { p->write_flags = 0; @@ -1016,7 +1016,7 @@ static void multifd_recv_terminate_threads(Error *err) void multifd_load_shutdown(void) { - if (migrate_use_multifd()) { + if (migrate_multifd()) { multifd_recv_terminate_threads(NULL); } } @@ -1025,7 +1025,7 @@ void multifd_load_cleanup(void) { int i; - if (!migrate_use_multifd()) { + if (!migrate_multifd()) { return; } multifd_recv_terminate_threads(NULL); @@ -1072,7 +1072,7 @@ void multifd_recv_sync_main(void) { int i; - if (!migrate_use_multifd()) { + if (!migrate_multifd()) { return; } for (i = 0; i < migrate_multifd_channels(); i++) { @@ -1170,7 +1170,7 @@ int multifd_load_setup(Error **errp) * Return successfully if multiFD recv state is already initialised * or multiFD is not enabled. */ - if (multifd_recv_state || !migrate_use_multifd()) { + if (multifd_recv_state || !migrate_multifd()) { return 0; } @@ -1216,7 +1216,7 @@ bool multifd_recv_all_channels_created(void) { int thread_count = migrate_multifd_channels(); - if (!migrate_use_multifd()) { + if (!migrate_multifd()) { return true; } diff --git a/migration/options.c b/migration/options.c new file mode 100644 index 0000000000..8e8753d9be --- /dev/null +++ b/migration/options.c @@ -0,0 +1,722 @@ +/* + * QEMU migration capabilities + * + * Copyright (c) 2012-2023 Red Hat Inc + * + * Authors: + * Orit Wasserman <owasserm@redhat.com> + * Juan Quintela <quintela@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ */ + +#include "qemu/osdep.h" +#include "qapi/clone-visitor.h" +#include "qapi/error.h" +#include "qapi/qapi-commands-migration.h" +#include "qapi/qapi-visit-migration.h" +#include "qapi/qmp/qerror.h" +#include "sysemu/runstate.h" +#include "migration/misc.h" +#include "migration.h" +#include "ram.h" +#include "options.h" + +bool migrate_auto_converge(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->capabilities[MIGRATION_CAPABILITY_AUTO_CONVERGE]; +} + +bool migrate_background_snapshot(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->capabilities[MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT]; +} + +bool migrate_block(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->capabilities[MIGRATION_CAPABILITY_BLOCK]; +} + +bool migrate_colo(void) +{ + MigrationState *s = migrate_get_current(); + return s->capabilities[MIGRATION_CAPABILITY_X_COLO]; +} + +bool migrate_compress(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->capabilities[MIGRATION_CAPABILITY_COMPRESS]; +} + +bool migrate_dirty_bitmaps(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->capabilities[MIGRATION_CAPABILITY_DIRTY_BITMAPS]; +} + +bool migrate_events(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->capabilities[MIGRATION_CAPABILITY_EVENTS]; +} + +bool migrate_ignore_shared(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->capabilities[MIGRATION_CAPABILITY_X_IGNORE_SHARED]; +} + +bool migrate_late_block_activate(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->capabilities[MIGRATION_CAPABILITY_LATE_BLOCK_ACTIVATE]; +} + +bool migrate_multifd(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->capabilities[MIGRATION_CAPABILITY_MULTIFD]; +} + +bool migrate_pause_before_switchover(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->capabilities[MIGRATION_CAPABILITY_PAUSE_BEFORE_SWITCHOVER]; +} + +bool migrate_postcopy_blocktime(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->capabilities[MIGRATION_CAPABILITY_POSTCOPY_BLOCKTIME]; +} + +bool migrate_postcopy_preempt(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->capabilities[MIGRATION_CAPABILITY_POSTCOPY_PREEMPT]; +} + +bool migrate_postcopy_ram(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->capabilities[MIGRATION_CAPABILITY_POSTCOPY_RAM]; +} + +bool migrate_rdma_pin_all(void) +{ + MigrationState *s = migrate_get_current(); + + return s->capabilities[MIGRATION_CAPABILITY_RDMA_PIN_ALL]; +} + +bool migrate_release_ram(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->capabilities[MIGRATION_CAPABILITY_RELEASE_RAM]; +} + +bool migrate_return_path(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->capabilities[MIGRATION_CAPABILITY_RETURN_PATH]; +} + +bool migrate_validate_uuid(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->capabilities[MIGRATION_CAPABILITY_VALIDATE_UUID]; +} + +bool migrate_xbzrle(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->capabilities[MIGRATION_CAPABILITY_XBZRLE]; +} + +bool migrate_zero_blocks(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->capabilities[MIGRATION_CAPABILITY_ZERO_BLOCKS]; +} + +bool migrate_zero_copy_send(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return 
s->capabilities[MIGRATION_CAPABILITY_ZERO_COPY_SEND]; +} + +/* pseudo capabilities */ + +bool migrate_postcopy(void) +{ + return migrate_postcopy_ram() || migrate_dirty_bitmaps(); +} + +typedef enum WriteTrackingSupport { + WT_SUPPORT_UNKNOWN = 0, + WT_SUPPORT_ABSENT, + WT_SUPPORT_AVAILABLE, + WT_SUPPORT_COMPATIBLE +} WriteTrackingSupport; + +static +WriteTrackingSupport migrate_query_write_tracking(void) +{ + /* Check if kernel supports required UFFD features */ + if (!ram_write_tracking_available()) { + return WT_SUPPORT_ABSENT; + } + /* + * Check if current memory configuration is + * compatible with required UFFD features. + */ + if (!ram_write_tracking_compatible()) { + return WT_SUPPORT_AVAILABLE; + } + + return WT_SUPPORT_COMPATIBLE; +} + +/* Migration capabilities set */ +struct MigrateCapsSet { + int size; /* Capability set size */ + MigrationCapability caps[]; /* Variadic array of capabilities */ +}; +typedef struct MigrateCapsSet MigrateCapsSet; + +/* Define and initialize MigrateCapsSet */ +#define INITIALIZE_MIGRATE_CAPS_SET(_name, ...) \ + MigrateCapsSet _name = { \ + .size = sizeof((int []) { __VA_ARGS__ }) / sizeof(int), \ + .caps = { __VA_ARGS__ } \ + } + +/* Background-snapshot compatibility check list */ +static const +INITIALIZE_MIGRATE_CAPS_SET(check_caps_background_snapshot, + MIGRATION_CAPABILITY_POSTCOPY_RAM, + MIGRATION_CAPABILITY_DIRTY_BITMAPS, + MIGRATION_CAPABILITY_POSTCOPY_BLOCKTIME, + MIGRATION_CAPABILITY_LATE_BLOCK_ACTIVATE, + MIGRATION_CAPABILITY_RETURN_PATH, + MIGRATION_CAPABILITY_MULTIFD, + MIGRATION_CAPABILITY_PAUSE_BEFORE_SWITCHOVER, + MIGRATION_CAPABILITY_AUTO_CONVERGE, + MIGRATION_CAPABILITY_RELEASE_RAM, + MIGRATION_CAPABILITY_RDMA_PIN_ALL, + MIGRATION_CAPABILITY_COMPRESS, + MIGRATION_CAPABILITY_XBZRLE, + MIGRATION_CAPABILITY_X_COLO, + MIGRATION_CAPABILITY_VALIDATE_UUID, + MIGRATION_CAPABILITY_ZERO_COPY_SEND); + +/** + * @migrate_caps_check - check capability compatibility + * + * @old_caps: old capability list + * @new_caps: new capability list + * @errp: set *errp if the check failed, with reason + * + * Returns true if check passed, otherwise false. + */ +bool migrate_caps_check(bool *old_caps, bool *new_caps, Error **errp) +{ + MigrationIncomingState *mis = migration_incoming_get_current(); + +#ifndef CONFIG_LIVE_BLOCK_MIGRATION + if (new_caps[MIGRATION_CAPABILITY_BLOCK]) { + error_setg(errp, "QEMU compiled without old-style (blk/-b, inc/-i) " + "block migration"); + error_append_hint(errp, "Use drive_mirror+NBD instead.\n"); + return false; + } +#endif + +#ifndef CONFIG_REPLICATION + if (new_caps[MIGRATION_CAPABILITY_X_COLO]) { + error_setg(errp, "QEMU compiled without replication module" + " can't enable COLO"); + error_append_hint(errp, "Please enable replication before COLO.\n"); + return false; + } +#endif + + if (new_caps[MIGRATION_CAPABILITY_POSTCOPY_RAM]) { + /* This check is reasonably expensive, so run it only when the + * capability is being set for the first time; also, only the + * destination needs the special support.
+ */ + if (!old_caps[MIGRATION_CAPABILITY_POSTCOPY_RAM] && + runstate_check(RUN_STATE_INMIGRATE) && + !postcopy_ram_supported_by_host(mis)) { + /* postcopy_ram_supported_by_host will have emitted a more + * detailed message + */ + error_setg(errp, "Postcopy is not supported"); + return false; + } + + if (new_caps[MIGRATION_CAPABILITY_X_IGNORE_SHARED]) { + error_setg(errp, "Postcopy is not compatible with ignore-shared"); + return false; + } + } + + if (new_caps[MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT]) { + WriteTrackingSupport wt_support; + int idx; + /* + * Check if 'background-snapshot' capability is supported by + * host kernel and compatible with guest memory configuration. + */ + wt_support = migrate_query_write_tracking(); + if (wt_support < WT_SUPPORT_AVAILABLE) { + error_setg(errp, "Background-snapshot is not supported by host kernel"); + return false; + } + if (wt_support < WT_SUPPORT_COMPATIBLE) { + error_setg(errp, "Background-snapshot is not compatible " + "with guest memory configuration"); + return false; + } + + /* + * Check if there are any migration capabilities + * incompatible with 'background-snapshot'. + */ + for (idx = 0; idx < check_caps_background_snapshot.size; idx++) { + int incomp_cap = check_caps_background_snapshot.caps[idx]; + if (new_caps[incomp_cap]) { + error_setg(errp, + "Background-snapshot is not compatible with %s", + MigrationCapability_str(incomp_cap)); + return false; + } + } + } + +#ifdef CONFIG_LINUX + if (new_caps[MIGRATION_CAPABILITY_ZERO_COPY_SEND] && + (!new_caps[MIGRATION_CAPABILITY_MULTIFD] || + new_caps[MIGRATION_CAPABILITY_COMPRESS] || + new_caps[MIGRATION_CAPABILITY_XBZRLE] || + migrate_multifd_compression() || + migrate_use_tls())) { + error_setg(errp, + "Zero copy only available for non-compressed non-TLS multifd migration"); + return false; + } +#else + if (new_caps[MIGRATION_CAPABILITY_ZERO_COPY_SEND]) { + error_setg(errp, + "Zero copy currently only available on Linux"); + return false; + } +#endif + + if (new_caps[MIGRATION_CAPABILITY_POSTCOPY_PREEMPT]) { + if (!new_caps[MIGRATION_CAPABILITY_POSTCOPY_RAM]) { + error_setg(errp, "Postcopy preempt requires postcopy-ram"); + return false; + } + + /* + * Preempt mode requires urgent pages to be sent in separate + * channel, OTOH compression logic will disorder all pages into + * different compression channels, which is not compatible with the + * preempt assumptions on channel assignments. 
+ */ + if (new_caps[MIGRATION_CAPABILITY_COMPRESS]) { + error_setg(errp, "Postcopy preempt not compatible with compress"); + return false; + } + } + + if (new_caps[MIGRATION_CAPABILITY_MULTIFD]) { + if (new_caps[MIGRATION_CAPABILITY_COMPRESS]) { + error_setg(errp, "Multifd is not compatible with compress"); + return false; + } + } + + return true; +} + +bool migrate_cap_set(int cap, bool value, Error **errp) +{ + MigrationState *s = migrate_get_current(); + bool new_caps[MIGRATION_CAPABILITY__MAX]; + + if (migration_is_running(s->state)) { + error_setg(errp, QERR_MIGRATION_ACTIVE); + return false; + } + + memcpy(new_caps, s->capabilities, sizeof(new_caps)); + new_caps[cap] = value; + + if (!migrate_caps_check(s->capabilities, new_caps, errp)) { + return false; + } + s->capabilities[cap] = value; + return true; +} + +MigrationCapabilityStatusList *qmp_query_migrate_capabilities(Error **errp) +{ + MigrationCapabilityStatusList *head = NULL, **tail = &head; + MigrationCapabilityStatus *caps; + MigrationState *s = migrate_get_current(); + int i; + + for (i = 0; i < MIGRATION_CAPABILITY__MAX; i++) { +#ifndef CONFIG_LIVE_BLOCK_MIGRATION + if (i == MIGRATION_CAPABILITY_BLOCK) { + continue; + } +#endif + caps = g_malloc0(sizeof(*caps)); + caps->capability = i; + caps->state = s->capabilities[i]; + QAPI_LIST_APPEND(tail, caps); + } + + return head; +} + +void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params, + Error **errp) +{ + MigrationState *s = migrate_get_current(); + MigrationCapabilityStatusList *cap; + bool new_caps[MIGRATION_CAPABILITY__MAX]; + + if (migration_is_running(s->state)) { + error_setg(errp, QERR_MIGRATION_ACTIVE); + return; + } + + memcpy(new_caps, s->capabilities, sizeof(new_caps)); + for (cap = params; cap; cap = cap->next) { + new_caps[cap->value->capability] = cap->value->state; + } + + if (!migrate_caps_check(s->capabilities, new_caps, errp)) { + return; + } + + for (cap = params; cap; cap = cap->next) { + s->capabilities[cap->value->capability] = cap->value->state; + } +} + +/* parameters */ + +bool migrate_block_incremental(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->parameters.block_incremental; +} + +uint32_t migrate_checkpoint_delay(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->parameters.x_checkpoint_delay; +} + +int migrate_compress_level(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->parameters.compress_level; +} + +int migrate_compress_threads(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->parameters.compress_threads; +} + +int migrate_compress_wait_thread(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->parameters.compress_wait_thread; +} + +uint8_t migrate_cpu_throttle_increment(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->parameters.cpu_throttle_increment; +} + +uint8_t migrate_cpu_throttle_initial(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->parameters.cpu_throttle_initial; +} + +bool migrate_cpu_throttle_tailslow(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->parameters.cpu_throttle_tailslow; +} + +int migrate_decompress_threads(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->parameters.decompress_threads; +} + +uint8_t migrate_max_cpu_throttle(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->parameters.max_cpu_throttle; +} + +uint64_t migrate_max_bandwidth(void) 
+{ + MigrationState *s; + + s = migrate_get_current(); + + return s->parameters.max_bandwidth; +} + +int64_t migrate_max_postcopy_bandwidth(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->parameters.max_postcopy_bandwidth; +} + +int migrate_multifd_channels(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->parameters.multifd_channels; +} + +MultiFDCompression migrate_multifd_compression(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + assert(s->parameters.multifd_compression < MULTIFD_COMPRESSION__MAX); + return s->parameters.multifd_compression; +} + +int migrate_multifd_zlib_level(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->parameters.multifd_zlib_level; +} + +int migrate_multifd_zstd_level(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->parameters.multifd_zstd_level; +} + +uint8_t migrate_throttle_trigger_threshold(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->parameters.throttle_trigger_threshold; +} + +uint64_t migrate_xbzrle_cache_size(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->parameters.xbzrle_cache_size; +} + +/* parameters helpers */ + +AnnounceParameters *migrate_announce_params(void) +{ + static AnnounceParameters ap; + + MigrationState *s = migrate_get_current(); + + ap.initial = s->parameters.announce_initial; + ap.max = s->parameters.announce_max; + ap.rounds = s->parameters.announce_rounds; + ap.step = s->parameters.announce_step; + + return &ap; +} + +MigrationParameters *qmp_query_migrate_parameters(Error **errp) +{ + MigrationParameters *params; + MigrationState *s = migrate_get_current(); + + /* TODO use QAPI_CLONE() instead of duplicating it inline */ + params = g_malloc0(sizeof(*params)); + params->has_compress_level = true; + params->compress_level = s->parameters.compress_level; + params->has_compress_threads = true; + params->compress_threads = s->parameters.compress_threads; + params->has_compress_wait_thread = true; + params->compress_wait_thread = s->parameters.compress_wait_thread; + params->has_decompress_threads = true; + params->decompress_threads = s->parameters.decompress_threads; + params->has_throttle_trigger_threshold = true; + params->throttle_trigger_threshold = s->parameters.throttle_trigger_threshold; + params->has_cpu_throttle_initial = true; + params->cpu_throttle_initial = s->parameters.cpu_throttle_initial; + params->has_cpu_throttle_increment = true; + params->cpu_throttle_increment = s->parameters.cpu_throttle_increment; + params->has_cpu_throttle_tailslow = true; + params->cpu_throttle_tailslow = s->parameters.cpu_throttle_tailslow; + params->tls_creds = g_strdup(s->parameters.tls_creds); + params->tls_hostname = g_strdup(s->parameters.tls_hostname); + params->tls_authz = g_strdup(s->parameters.tls_authz ?
+ s->parameters.tls_authz : ""); + params->has_max_bandwidth = true; + params->max_bandwidth = s->parameters.max_bandwidth; + params->has_downtime_limit = true; + params->downtime_limit = s->parameters.downtime_limit; + params->has_x_checkpoint_delay = true; + params->x_checkpoint_delay = s->parameters.x_checkpoint_delay; + params->has_block_incremental = true; + params->block_incremental = s->parameters.block_incremental; + params->has_multifd_channels = true; + params->multifd_channels = s->parameters.multifd_channels; + params->has_multifd_compression = true; + params->multifd_compression = s->parameters.multifd_compression; + params->has_multifd_zlib_level = true; + params->multifd_zlib_level = s->parameters.multifd_zlib_level; + params->has_multifd_zstd_level = true; + params->multifd_zstd_level = s->parameters.multifd_zstd_level; + params->has_xbzrle_cache_size = true; + params->xbzrle_cache_size = s->parameters.xbzrle_cache_size; + params->has_max_postcopy_bandwidth = true; + params->max_postcopy_bandwidth = s->parameters.max_postcopy_bandwidth; + params->has_max_cpu_throttle = true; + params->max_cpu_throttle = s->parameters.max_cpu_throttle; + params->has_announce_initial = true; + params->announce_initial = s->parameters.announce_initial; + params->has_announce_max = true; + params->announce_max = s->parameters.announce_max; + params->has_announce_rounds = true; + params->announce_rounds = s->parameters.announce_rounds; + params->has_announce_step = true; + params->announce_step = s->parameters.announce_step; + + if (s->parameters.has_block_bitmap_mapping) { + params->has_block_bitmap_mapping = true; + params->block_bitmap_mapping = + QAPI_CLONE(BitmapMigrationNodeAliasList, + s->parameters.block_bitmap_mapping); + } + + return params; +} diff --git a/migration/options.h b/migration/options.h new file mode 100644 index 0000000000..1b78fa9f3d --- /dev/null +++ b/migration/options.h @@ -0,0 +1,76 @@ +/* + * QEMU migration capabilities + * + * Copyright (c) 2012-2023 Red Hat Inc + * + * Authors: + * Orit Wasserman <owasserm@redhat.com> + * Juan Quintela <quintela@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#ifndef QEMU_MIGRATION_OPTIONS_H +#define QEMU_MIGRATION_OPTIONS_H + +/* capabilities */ + +bool migrate_auto_converge(void); +bool migrate_background_snapshot(void); +bool migrate_block(void); +bool migrate_colo(void); +bool migrate_compress(void); +bool migrate_dirty_bitmaps(void); +bool migrate_events(void); +bool migrate_ignore_shared(void); +bool migrate_late_block_activate(void); +bool migrate_multifd(void); +bool migrate_pause_before_switchover(void); +bool migrate_postcopy_blocktime(void); +bool migrate_postcopy_preempt(void); +bool migrate_postcopy_ram(void); +bool migrate_rdma_pin_all(void); +bool migrate_release_ram(void); +bool migrate_return_path(void); +bool migrate_validate_uuid(void); +bool migrate_xbzrle(void); +bool migrate_zero_blocks(void); +bool migrate_zero_copy_send(void); + +/* + * pseudo capabilities + * + * These are functions that are used in a similar way to capabilities + * check, but they are not a capability. 
+ */ + +bool migrate_postcopy(void); + +/* capabilities helpers */ + +bool migrate_caps_check(bool *old_caps, bool *new_caps, Error **errp); +bool migrate_cap_set(int cap, bool value, Error **errp); + +/* parameters */ + +bool migrate_block_incremental(void); +uint32_t migrate_checkpoint_delay(void); +int migrate_compress_level(void); +int migrate_compress_threads(void); +int migrate_compress_wait_thread(void); +uint8_t migrate_cpu_throttle_increment(void); +uint8_t migrate_cpu_throttle_initial(void); +bool migrate_cpu_throttle_tailslow(void); +int migrate_decompress_threads(void); +uint8_t migrate_max_cpu_throttle(void); +uint64_t migrate_max_bandwidth(void); +int64_t migrate_max_postcopy_bandwidth(void); +int migrate_multifd_channels(void); +MultiFDCompression migrate_multifd_compression(void); +int migrate_multifd_zlib_level(void); +int migrate_multifd_zstd_level(void); +uint8_t migrate_throttle_trigger_threshold(void); +uint64_t migrate_xbzrle_cache_size(void); + +#endif diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c index 7d24dac397..0711500036 100644 --- a/migration/postcopy-ram.c +++ b/migration/postcopy-ram.c @@ -36,6 +36,8 @@ #include "yank_functions.h" #include "tls.h" #include "qemu/userfaultfd.h" +#include "qemu/mmap-alloc.h" +#include "options.h" /* Arbitrary limit on size of each discard command, * keeps them around ~200 bytes @@ -336,11 +338,12 @@ static bool ufd_check_and_apply(int ufd, MigrationIncomingState *mis) /* Callback from postcopy_ram_supported_by_host block iterator. */ -static int test_ramblock_postcopiable(RAMBlock *rb, void *opaque) +static int test_ramblock_postcopiable(RAMBlock *rb) { const char *block_name = qemu_ram_get_idstr(rb); ram_addr_t length = qemu_ram_get_used_length(rb); size_t pagesize = qemu_ram_pagesize(rb); + QemuFsType fs; if (length % pagesize) { error_report("Postcopy requires RAM blocks to be a page size multiple," @@ -348,6 +351,15 @@ static int test_ramblock_postcopiable(RAMBlock *rb, void *opaque) "page size of 0x%zx", block_name, length, pagesize); return 1; } + + if (rb->fd >= 0) { + fs = qemu_fd_getfs(rb->fd); + if (fs != QEMU_FS_TYPE_TMPFS && fs != QEMU_FS_TYPE_HUGETLBFS) { + error_report("Host backend files need to be TMPFS or HUGETLBFS only"); + return 1; + } + } + return 0; } @@ -366,6 +378,7 @@ bool postcopy_ram_supported_by_host(MigrationIncomingState *mis) struct uffdio_range range_struct; uint64_t feature_mask; Error *local_err = NULL; + RAMBlock *block; if (qemu_target_page_size() > pagesize) { error_report("Target page size bigger than host page size"); @@ -390,9 +403,23 @@ bool postcopy_ram_supported_by_host(MigrationIncomingState *mis) goto out; } - /* We don't support postcopy with shared RAM yet */ - if (foreach_not_ignored_block(test_ramblock_postcopiable, NULL)) { - goto out; + /* + * We don't support postcopy with some types of ramblocks. + * + * NOTE: we explicitly ignore ramblock_is_ignored() and instead check + * all possible ramblocks. This is because this function can be called + * when creating the migration object, at a phase when RAM_MIGRATABLE + * is not yet properly set for all ramblocks. + * + * A side effect of this is we'll also check against RAM_SHARED + * ramblocks even if migrate_ignore_shared() is set (in which case + * we'll never migrate RAM_SHARED at all), but in practice this + * shouldn't matter; we can revisit if it does.
+ */ + RAMBLOCK_FOREACH(block) { + if (test_ramblock_postcopiable(block)) { + goto out; + } } /* diff --git a/migration/ram.c b/migration/ram.c index 79d881f735..01356f60a4 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -57,6 +57,7 @@ #include "qemu/iov.h" #include "multifd.h" #include "sysemu/runstate.h" +#include "options.h" #include "hw/boards.h" /* for machine_dump_guest_core() */ @@ -155,14 +156,14 @@ static struct { static void XBZRLE_cache_lock(void) { - if (migrate_use_xbzrle()) { + if (migrate_xbzrle()) { qemu_mutex_lock(&XBZRLE.lock); } } static void XBZRLE_cache_unlock(void) { - if (migrate_use_xbzrle()) { + if (migrate_xbzrle()) { qemu_mutex_unlock(&XBZRLE.lock); } } @@ -458,30 +459,18 @@ uint64_t ram_bytes_remaining(void) 0; } -/* - * NOTE: not all stats in ram_counters are used in reality. See comments - * for struct MigrationAtomicStats. The ultimate result of ram migration - * counters will be a merged version with both ram_counters and the atomic - * fields in ram_atomic_counters. - */ -MigrationStats ram_counters; -MigrationAtomicStats ram_atomic_counters; +RAMStats ram_counters; void ram_transferred_add(uint64_t bytes) { if (runstate_is_running()) { - ram_counters.precopy_bytes += bytes; + stat64_add(&ram_counters.precopy_bytes, bytes); } else if (migration_in_postcopy()) { - stat64_add(&ram_atomic_counters.postcopy_bytes, bytes); + stat64_add(&ram_counters.postcopy_bytes, bytes); } else { - ram_counters.downtime_bytes += bytes; + stat64_add(&ram_counters.downtime_bytes, bytes); } - stat64_add(&ram_atomic_counters.transferred, bytes); -} - -void dirty_sync_missed_zero_copy(void) -{ - ram_counters.dirty_sync_missed_zero_copy++; + stat64_add(&ram_counters.transferred, bytes); } struct MigrationOps { @@ -597,7 +586,7 @@ static void compress_threads_save_cleanup(void) { int i, thread_count; - if (!migrate_use_compression() || !comp_param) { + if (!migrate_compress() || !comp_param) { return; } @@ -636,7 +625,7 @@ static int compress_threads_save_setup(void) { int i, thread_count; - if (!migrate_use_compression()) { + if (!migrate_compress()) { return 0; } thread_count = migrate_compress_threads(); @@ -722,11 +711,10 @@ static size_t save_page_header(PageSearchStatus *pss, QEMUFile *f, static void mig_throttle_guest_down(uint64_t bytes_dirty_period, uint64_t bytes_dirty_threshold) { - MigrationState *s = migrate_get_current(); - uint64_t pct_initial = s->parameters.cpu_throttle_initial; - uint64_t pct_increment = s->parameters.cpu_throttle_increment; - bool pct_tailslow = s->parameters.cpu_throttle_tailslow; - int pct_max = s->parameters.max_cpu_throttle; + uint64_t pct_initial = migrate_cpu_throttle_initial(); + uint64_t pct_increment = migrate_cpu_throttle_increment(); + bool pct_tailslow = migrate_cpu_throttle_tailslow(); + int pct_max = migrate_max_cpu_throttle(); uint64_t throttle_now = cpu_throttle_get_percentage(); uint64_t cpu_now, cpu_ideal, throttle_inc; @@ -756,7 +744,7 @@ void mig_throttle_counter_reset(void) rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); rs->num_dirty_pages_period = 0; - rs->bytes_xfer_prev = stat64_get(&ram_atomic_counters.transferred); + rs->bytes_xfer_prev = stat64_get(&ram_counters.transferred); } /** @@ -776,7 +764,7 @@ static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr) /* We don't care if this fails to allocate a new cache page * as long as it updated an old one */ cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page, - ram_counters.dirty_sync_count); + 
stat64_get(&ram_counters.dirty_sync_count)); } #define ENCODING_FLAG_XBZRLE 0x1 @@ -802,13 +790,13 @@ static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss, int encoded_len = 0, bytes_xbzrle; uint8_t *prev_cached_page; QEMUFile *file = pss->pss_channel; + uint64_t generation = stat64_get(&ram_counters.dirty_sync_count); - if (!cache_is_cached(XBZRLE.cache, current_addr, - ram_counters.dirty_sync_count)) { + if (!cache_is_cached(XBZRLE.cache, current_addr, generation)) { xbzrle_counters.cache_miss++; if (!rs->last_stage) { if (cache_insert(XBZRLE.cache, current_addr, *current_data, - ram_counters.dirty_sync_count) == -1) { + generation) == -1) { return -1; } else { /* update *current_data when the page has been @@ -1130,8 +1118,8 @@ uint64_t ram_pagesize_summary(void) uint64_t ram_get_total_transferred_pages(void) { - return stat64_get(&ram_atomic_counters.normal) + - stat64_get(&ram_atomic_counters.duplicate) + + return stat64_get(&ram_counters.normal_pages) + + stat64_get(&ram_counters.zero_pages) + compression_counters.pages + xbzrle_counters.pages; } @@ -1148,7 +1136,7 @@ static void migration_update_rates(RAMState *rs, int64_t end_time) return; } - if (migrate_use_xbzrle()) { + if (migrate_xbzrle()) { double encoded_size, unencoded_size; xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss - @@ -1166,7 +1154,7 @@ static void migration_update_rates(RAMState *rs, int64_t end_time) rs->xbzrle_bytes_prev = xbzrle_counters.bytes; } - if (migrate_use_compression()) { + if (migrate_compress()) { compression_counters.busy_rate = (double)(compression_counters.busy - rs->compress_thread_busy_prev) / page_count; rs->compress_thread_busy_prev = compression_counters.busy; @@ -1189,10 +1177,9 @@ static void migration_update_rates(RAMState *rs, int64_t end_time) static void migration_trigger_throttle(RAMState *rs) { - MigrationState *s = migrate_get_current(); - uint64_t threshold = s->parameters.throttle_trigger_threshold; + uint64_t threshold = migrate_throttle_trigger_threshold(); uint64_t bytes_xfer_period = - stat64_get(&ram_atomic_counters.transferred) - rs->bytes_xfer_prev; + stat64_get(&ram_counters.transferred) - rs->bytes_xfer_prev; uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE; uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100; @@ -1221,7 +1208,7 @@ static void migration_bitmap_sync(RAMState *rs) RAMBlock *block; int64_t end_time; - ram_counters.dirty_sync_count++; + stat64_add(&ram_counters.dirty_sync_count, 1); if (!rs->time_last_bitmap_sync) { rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); @@ -1255,10 +1242,11 @@ static void migration_bitmap_sync(RAMState *rs) /* reset period counters */ rs->time_last_bitmap_sync = end_time; rs->num_dirty_pages_period = 0; - rs->bytes_xfer_prev = stat64_get(&ram_atomic_counters.transferred); + rs->bytes_xfer_prev = stat64_get(&ram_counters.transferred); } - if (migrate_use_events()) { - qapi_event_send_migration_pass(ram_counters.dirty_sync_count); + if (migrate_events()) { + uint64_t generation = stat64_get(&ram_counters.dirty_sync_count); + qapi_event_send_migration_pass(generation); } } @@ -1331,7 +1319,7 @@ static int save_zero_page(PageSearchStatus *pss, QEMUFile *f, RAMBlock *block, int len = save_zero_page_to_file(pss, f, block, offset); if (len) { - stat64_add(&ram_atomic_counters.duplicate, 1); + stat64_add(&ram_counters.zero_pages, 1); ram_transferred_add(len); return 1; } @@ -1368,9 +1356,9 @@ static bool control_save_page(PageSearchStatus *pss, 
RAMBlock *block, } if (bytes_xmit > 0) { - stat64_add(&ram_atomic_counters.normal, 1); + stat64_add(&ram_counters.normal_pages, 1); } else if (bytes_xmit == 0) { - stat64_add(&ram_atomic_counters.duplicate, 1); + stat64_add(&ram_counters.zero_pages, 1); } return true; @@ -1402,7 +1390,7 @@ static int save_normal_page(PageSearchStatus *pss, RAMBlock *block, qemu_put_buffer(file, buf, TARGET_PAGE_SIZE); } ram_transferred_add(TARGET_PAGE_SIZE); - stat64_add(&ram_atomic_counters.normal, 1); + stat64_add(&ram_counters.normal_pages, 1); return 1; } @@ -1458,7 +1446,7 @@ static int ram_save_multifd_page(QEMUFile *file, RAMBlock *block, if (multifd_queue_page(file, block, offset) < 0) { return -1; } - stat64_add(&ram_atomic_counters.normal, 1); + stat64_add(&ram_counters.normal_pages, 1); return 1; } @@ -1497,7 +1485,7 @@ update_compress_thread_counts(const CompressParam *param, int bytes_xmit) ram_transferred_add(bytes_xmit); if (param->zero_page) { - stat64_add(&ram_atomic_counters.duplicate, 1); + stat64_add(&ram_counters.zero_pages, 1); return; } @@ -1636,7 +1624,7 @@ static int find_dirty_block(RAMState *rs, PageSearchStatus *pss) /* Flag that we've looped */ pss->complete_round = true; /* After the first round, enable XBZRLE. */ - if (migrate_use_xbzrle()) { + if (migrate_xbzrle()) { rs->xbzrle_enabled = true; } } @@ -2180,7 +2168,7 @@ int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len) RAMBlock *ramblock; RAMState *rs = ram_state; - ram_counters.postcopy_requests++; + stat64_add(&ram_counters.postcopy_requests, 1); RCU_READ_LOCK_GUARD(); if (!rbname) { @@ -2280,7 +2268,7 @@ int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len) static bool save_page_use_compression(RAMState *rs) { - if (!migrate_use_compression()) { + if (!migrate_compress()) { return false; } @@ -2372,7 +2360,7 @@ static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss) * if host page size == guest page size the dest guest during run may * still see partially copied pages which is data corruption. 
*/ - if (migrate_use_multifd() && !migration_in_postcopy()) { + if (migrate_multifd() && !migration_in_postcopy()) { return ram_save_multifd_page(pss->pss_channel, block, offset); } @@ -2632,9 +2620,9 @@ void acct_update_position(QEMUFile *f, size_t size, bool zero) uint64_t pages = size / TARGET_PAGE_SIZE; if (zero) { - stat64_add(&ram_atomic_counters.duplicate, pages); + stat64_add(&ram_counters.zero_pages, pages); } else { - stat64_add(&ram_atomic_counters.normal, pages); + stat64_add(&ram_counters.normal_pages, pages); ram_transferred_add(size); qemu_file_credit_transfer(f, size); } @@ -2989,7 +2977,7 @@ static int xbzrle_init(void) { Error *local_err = NULL; - if (!migrate_use_xbzrle()) { + if (!migrate_xbzrle()) { return 0; } @@ -3293,7 +3281,7 @@ static int ram_save_setup(QEMUFile *f, void *opaque) migration_ops = g_malloc0(sizeof(MigrationOps)); migration_ops->ram_save_target_page = ram_save_target_page_legacy; - ret = multifd_send_sync_main(f); + ret = multifd_send_sync_main(f); if (ret < 0) { return ret; } @@ -3744,7 +3732,7 @@ static int wait_for_decompress_done(void) { int idx, thread_count; - if (!migrate_use_compression()) { + if (!migrate_compress()) { return 0; } @@ -3763,7 +3751,7 @@ static void compress_threads_load_cleanup(void) { int i, thread_count; - if (!migrate_use_compression()) { + if (!migrate_compress()) { return; } thread_count = migrate_decompress_threads(); @@ -3804,7 +3792,7 @@ static int compress_threads_load_setup(QEMUFile *f) { int i, thread_count; - if (!migrate_use_compression()) { + if (!migrate_compress()) { return 0; } @@ -4270,7 +4258,7 @@ static int ram_load_precopy(QEMUFile *f) int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0; /* ADVISE is earlier, it shows the source has the postcopy capability on */ bool postcopy_advised = migration_incoming_postcopy_advised(); - if (!migrate_use_compression()) { + if (!migrate_compress()) { invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE; } diff --git a/migration/ram.h b/migration/ram.h index 81cbb0947c..a6e0d70226 100644 --- a/migration/ram.h +++ b/migration/ram.h @@ -35,25 +35,27 @@ #include "qemu/stats64.h" /* - * These are the migration statistic counters that need to be updated using - * atomic ops (can be accessed by more than one thread). Here since we - * cannot modify MigrationStats directly to use Stat64 as it was defined in - * the QAPI scheme, we define an internal structure to hold them, and we - * propagate the real values when QMP queries happen. - * - * IOW, the corresponding fields within ram_counters on these specific - * fields will be always zero and not being used at all; they're just - * placeholders to make it QAPI-compatible. + * These are the ram migration statistic counters. It is loosely + * based on MigrationStats. We change to Stat64 any counter that + * needs to be updated using atomic ops (can be accessed by more than + * one thread). 
*/ typedef struct { - Stat64 transferred; - Stat64 duplicate; - Stat64 normal; + int64_t dirty_pages_rate; + Stat64 dirty_sync_count; + Stat64 dirty_sync_missed_zero_copy; + Stat64 downtime_bytes; + Stat64 zero_pages; + Stat64 multifd_bytes; + Stat64 normal_pages; Stat64 postcopy_bytes; -} MigrationAtomicStats; + Stat64 postcopy_requests; + Stat64 precopy_bytes; + int64_t remaining; + Stat64 transferred; +} RAMStats; -extern MigrationAtomicStats ram_atomic_counters; -extern MigrationStats ram_counters; +extern RAMStats ram_counters; extern XBZRLECacheStats xbzrle_counters; extern CompressionStats compression_counters; @@ -112,6 +114,4 @@ void ram_write_tracking_prepare(void); int ram_write_tracking_start(void); void ram_write_tracking_stop(void); -void dirty_sync_missed_zero_copy(void); - #endif diff --git a/migration/rdma.c b/migration/rdma.c index df646be35e..0af5e944f0 100644 --- a/migration/rdma.c +++ b/migration/rdma.c @@ -35,6 +35,7 @@ #include <rdma/rdma_cma.h> #include "trace.h" #include "qom/object.h" +#include "options.h" #include <poll.h> /* @@ -3373,7 +3374,7 @@ static int qemu_rdma_accept(RDMAContext *rdma) * initialize the RDMAContext for return path for postcopy after first * connection request reached. */ - if ((migrate_postcopy() || migrate_use_return_path()) + if ((migrate_postcopy() || migrate_return_path()) && !rdma->is_return_path) { rdma_return_path = qemu_rdma_data_init(rdma->host_port, NULL); if (rdma_return_path == NULL) { @@ -3456,7 +3457,7 @@ static int qemu_rdma_accept(RDMAContext *rdma) } /* Accept the second connection request for return path */ - if ((migrate_postcopy() || migrate_use_return_path()) + if ((migrate_postcopy() || migrate_return_path()) && !rdma->is_return_path) { qemu_set_fd_handler(rdma->channel->fd, rdma_accept_incoming_migration, NULL, @@ -4178,8 +4179,7 @@ void rdma_start_outgoing_migration(void *opaque, goto err; } - ret = qemu_rdma_source_init(rdma, - s->enabled_capabilities[MIGRATION_CAPABILITY_RDMA_PIN_ALL], errp); + ret = qemu_rdma_source_init(rdma, migrate_rdma_pin_all(), errp); if (ret) { goto err; @@ -4193,7 +4193,7 @@ void rdma_start_outgoing_migration(void *opaque, } /* RDMA postcopy need a separate queue pair for return path */ - if (migrate_postcopy() || migrate_use_return_path()) { + if (migrate_postcopy() || migrate_return_path()) { rdma_return_path = qemu_rdma_data_init(host_port, errp); if (rdma_return_path == NULL) { @@ -4201,7 +4201,7 @@ void rdma_start_outgoing_migration(void *opaque, } ret = qemu_rdma_source_init(rdma_return_path, - s->enabled_capabilities[MIGRATION_CAPABILITY_RDMA_PIN_ALL], errp); + migrate_rdma_pin_all(), errp); if (ret) { goto return_path_err; diff --git a/migration/savevm.c b/migration/savevm.c index aa54a67fda..9671211339 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -67,6 +67,7 @@ #include "qemu/yank.h" #include "yank_functions.h" #include "sysemu/qtest.h" +#include "options.h" const unsigned int postcopy_ram_discard_version; @@ -253,7 +254,7 @@ static uint32_t get_validatable_capabilities_count(void) uint32_t result = 0; int i; for (i = 0; i < MIGRATION_CAPABILITY__MAX; i++) { - if (should_validate_capability(i) && s->enabled_capabilities[i]) { + if (should_validate_capability(i) && s->capabilities[i]) { result++; } } @@ -275,7 +276,7 @@ static int configuration_pre_save(void *opaque) state->capabilities = g_renew(MigrationCapability, state->capabilities, state->caps_count); for (i = j = 0; i < MIGRATION_CAPABILITY__MAX; i++) { - if (should_validate_capability(i) && 
s->enabled_capabilities[i]) { + if (should_validate_capability(i) && s->capabilities[i]) { state->capabilities[j++] = i; } } @@ -325,7 +326,7 @@ static bool configuration_validate_capabilities(SaveState *state) continue; } source_state = test_bit(i, source_caps_bm); - target_state = s->enabled_capabilities[i]; + target_state = s->capabilities[i]; if (source_state != target_state) { error_report("Capability %s is %s, but received capability is %s", MigrationCapability_str(i), @@ -1611,7 +1612,7 @@ static int qemu_savevm_state(QEMUFile *f, Error **errp) return -EINVAL; } - if (migrate_use_block()) { + if (migrate_block()) { error_setg(errp, "Block migration and snapshots are incompatible"); return -EINVAL; } diff --git a/migration/socket.c b/migration/socket.c index e6fdf3c5e1..1b6f5baefb 100644 --- a/migration/socket.c +++ b/migration/socket.c @@ -27,6 +27,7 @@ #include "io/net-listener.h" #include "trace.h" #include "postcopy-ram.h" +#include "options.h" struct SocketOutgoingArgs { SocketAddress *saddr; @@ -97,7 +98,7 @@ static void socket_outgoing_migration(QIOTask *task, trace_migration_socket_outgoing_connected(data->hostname); - if (migrate_use_zero_copy_send() && + if (migrate_zero_copy_send() && !qio_channel_has_feature(sioc, QIO_CHANNEL_FEATURE_WRITE_ZERO_COPY)) { error_setg(&err, "Zero copy send feature not detected in host kernel"); } @@ -182,7 +183,7 @@ socket_start_incoming_migration_internal(SocketAddress *saddr, qio_net_listener_set_name(listener, "migration-socket-listener"); - if (migrate_use_multifd()) { + if (migrate_multifd()) { num = migrate_multifd_channels(); } else if (migrate_postcopy_preempt()) { num = RAM_CHANNEL_MAX; diff --git a/monitor/hmp.c b/monitor/hmp.c index fee410362f..5cab56d355 100644 --- a/monitor/hmp.c +++ b/monitor/hmp.c @@ -1167,7 +1167,7 @@ void handle_hmp_command(MonitorHMP *mon, const char *cmdline) Coroutine *co = qemu_coroutine_create(handle_hmp_command_co, &data); monitor_set_cur(co, &mon->common); aio_co_enter(qemu_get_aio_context(), co); - AIO_WAIT_WHILE(qemu_get_aio_context(), !data.done); + AIO_WAIT_WHILE_UNLOCKED(NULL, !data.done); } qobject_unref(qdict); diff --git a/monitor/monitor.c b/monitor/monitor.c index 8dc96f6af9..602535696c 100644 --- a/monitor/monitor.c +++ b/monitor/monitor.c @@ -666,7 +666,7 @@ void monitor_cleanup(void) * We need to poll both qemu_aio_context and iohandler_ctx to make * sure that the dispatcher coroutine keeps making progress and * eventually terminates. qemu_aio_context is automatically - * polled by calling AIO_WAIT_WHILE on it, but we must poll + * polled by calling AIO_WAIT_WHILE_UNLOCKED on it, but we must poll * iohandler_ctx manually. 
* * Letting the iothread continue while shutting down the dispatcher @@ -679,7 +679,7 @@ void monitor_cleanup(void) aio_co_wake(qmp_dispatcher_co); } - AIO_WAIT_WHILE(qemu_get_aio_context(), + AIO_WAIT_WHILE_UNLOCKED(NULL, (aio_poll(iohandler_get_aio_context(), false), qatomic_mb_read(&qmp_dispatcher_co_busy))); diff --git a/nbd/server.c b/nbd/server.c index 4f5c42f84d..e239c2890f 100644 --- a/nbd/server.c +++ b/nbd/server.c @@ -1409,8 +1409,8 @@ nbd_read_eof(NBDClient *client, void *buffer, size_t size, Error **errp) return 1; } -static int nbd_receive_request(NBDClient *client, NBDRequest *request, - Error **errp) +static int coroutine_fn nbd_receive_request(NBDClient *client, NBDRequest *request, + Error **errp) { uint8_t buf[NBD_REQUEST_SIZE]; uint32_t magic; @@ -1893,12 +1893,12 @@ static inline void set_be_simple_reply(NBDSimpleReply *reply, uint64_t error, stq_be_p(&reply->handle, handle); } -static int nbd_co_send_simple_reply(NBDClient *client, - uint64_t handle, - uint32_t error, - void *data, - size_t len, - Error **errp) +static int coroutine_fn nbd_co_send_simple_reply(NBDClient *client, + uint64_t handle, + uint32_t error, + void *data, + size_t len, + Error **errp) { NBDSimpleReply reply; int nbd_err = system_errno_to_nbd_errno(error); @@ -2036,8 +2036,8 @@ static int coroutine_fn nbd_co_send_sparse_read(NBDClient *client, stl_be_p(&chunk.length, pnum); ret = nbd_co_send_iov(client, iov, 1, errp); } else { - ret = blk_pread(exp->common.blk, offset + progress, pnum, - data + progress, 0); + ret = blk_co_pread(exp->common.blk, offset + progress, pnum, + data + progress, 0); if (ret < 0) { error_setg_errno(errp, -ret, "reading from file failed"); break; @@ -2196,9 +2196,9 @@ static int coroutine_fn blockalloc_to_extents(BlockBackend *blk, * @ea is converted to BE by the function * @last controls whether NBD_REPLY_FLAG_DONE is sent. */ -static int nbd_co_send_extents(NBDClient *client, uint64_t handle, - NBDExtentArray *ea, - bool last, uint32_t context_id, Error **errp) +static int coroutine_fn +nbd_co_send_extents(NBDClient *client, uint64_t handle, NBDExtentArray *ea, + bool last, uint32_t context_id, Error **errp) { NBDStructuredMeta chunk; struct iovec iov[] = { @@ -2275,10 +2275,10 @@ static void bitmap_to_extents(BdrvDirtyBitmap *bitmap, bdrv_dirty_bitmap_unlock(bitmap); } -static int nbd_co_send_bitmap(NBDClient *client, uint64_t handle, - BdrvDirtyBitmap *bitmap, uint64_t offset, - uint32_t length, bool dont_fragment, bool last, - uint32_t context_id, Error **errp) +static int coroutine_fn nbd_co_send_bitmap(NBDClient *client, uint64_t handle, + BdrvDirtyBitmap *bitmap, uint64_t offset, + uint32_t length, bool dont_fragment, bool last, + uint32_t context_id, Error **errp) { unsigned int nb_extents = dont_fragment ? 1 : NBD_MAX_BLOCK_STATUS_EXTENTS; g_autoptr(NBDExtentArray) ea = nbd_extent_array_new(nb_extents); @@ -2295,8 +2295,8 @@ static int nbd_co_send_bitmap(NBDClient *client, uint64_t handle, * to the client (although the caller may still need to disconnect after * reporting the error). 
*/ -static int nbd_co_receive_request(NBDRequestData *req, NBDRequest *request, - Error **errp) +static int coroutine_fn nbd_co_receive_request(NBDRequestData *req, NBDRequest *request, + Error **errp) { NBDClient *client = req->client; int valid_flags; @@ -2444,7 +2444,7 @@ static coroutine_fn int nbd_do_cmd_read(NBDClient *client, NBDRequest *request, data, request->len, errp); } - ret = blk_pread(exp->common.blk, request->from, request->len, data, 0); + ret = blk_co_pread(exp->common.blk, request->from, request->len, data, 0); if (ret < 0) { return nbd_send_generic_reply(client, request->handle, ret, "reading from file failed", errp); @@ -2511,8 +2511,8 @@ static coroutine_fn int nbd_handle_request(NBDClient *client, if (request->flags & NBD_CMD_FLAG_FUA) { flags |= BDRV_REQ_FUA; } - ret = blk_pwrite(exp->common.blk, request->from, request->len, data, - flags); + ret = blk_co_pwrite(exp->common.blk, request->from, request->len, data, + flags); return nbd_send_generic_reply(client, request->handle, ret, "writing to file failed", errp); @@ -2527,8 +2527,8 @@ static coroutine_fn int nbd_handle_request(NBDClient *client, if (request->flags & NBD_CMD_FLAG_FAST_ZERO) { flags |= BDRV_REQ_NO_FALLBACK; } - ret = blk_pwrite_zeroes(exp->common.blk, request->from, request->len, - flags); + ret = blk_co_pwrite_zeroes(exp->common.blk, request->from, request->len, + flags); return nbd_send_generic_reply(client, request->handle, ret, "writing to file failed", errp); diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c index 99904a0da7..37cdc84562 100644 --- a/net/vhost-vdpa.c +++ b/net/vhost-vdpa.c @@ -104,7 +104,8 @@ static const uint64_t vdpa_svq_device_features = /* VHOST_F_LOG_ALL is exposed by SVQ */ BIT_ULL(VHOST_F_LOG_ALL) | BIT_ULL(VIRTIO_NET_F_RSC_EXT) | - BIT_ULL(VIRTIO_NET_F_STANDBY); + BIT_ULL(VIRTIO_NET_F_STANDBY) | + BIT_ULL(VIRTIO_NET_F_SPEED_DUPLEX); #define VHOST_VDPA_NET_CVQ_ASID 1 diff --git a/qapi/migration.json b/qapi/migration.json index c84fa10e86..2c35b7b9cf 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -1204,34 +1204,6 @@ 'returns': 'MigrationParameters' } ## -# @client_migrate_info: -# -# Set migration information for remote display. This makes the server -# ask the client to automatically reconnect using the new parameters -# once migration finished successfully. Only implemented for SPICE. -# -# @protocol: must be "spice" -# @hostname: migration target hostname -# @port: spice tcp port for plaintext channels -# @tls-port: spice tcp port for tls-secured channels -# @cert-subject: server certificate subject -# -# Since: 0.14 -# -# Example: -# -# -> { "execute": "client_migrate_info", -# "arguments": { "protocol": "spice", -# "hostname": "virt42.lab.kraxel.org", -# "port": 1234 } } -# <- { "return": {} } -# -## -{ 'command': 'client_migrate_info', - 'data': { 'protocol': 'str', 'hostname': 'str', '*port': 'int', - '*tls-port': 'int', '*cert-subject': 'str' } } - -## # @migrate-start-postcopy: # # Followup to a migration command to switch the migration to postcopy mode. diff --git a/qapi/ui.json b/qapi/ui.json index 98322342f7..7ddd27a932 100644 --- a/qapi/ui.json +++ b/qapi/ui.json @@ -1554,3 +1554,31 @@ { 'command': 'display-update', 'data': 'DisplayUpdateOptions', 'boxed' : true } + +## +# @client_migrate_info: +# +# Set migration information for remote display. This makes the server +# ask the client to automatically reconnect using the new parameters +# once migration finished successfully. Only implemented for SPICE. 
+# +# @protocol: must be "spice" +# @hostname: migration target hostname +# @port: spice tcp port for plaintext channels +# @tls-port: spice tcp port for tls-secured channels +# @cert-subject: server certificate subject +# +# Since: 0.14 +# +# Example: +# +# -> { "execute": "client_migrate_info", +# "arguments": { "protocol": "spice", +# "hostname": "virt42.lab.kraxel.org", +# "port": 1234 } } +# <- { "return": {} } +# +## +{ 'command': 'client_migrate_info', + 'data': { 'protocol': 'str', 'hostname': 'str', '*port': 'int', + '*tls-port': 'int', '*cert-subject': 'str' } } diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh index 009fab1515..d4369a3ad8 100644 --- a/scripts/meson-buildoptions.sh +++ b/scripts/meson-buildoptions.sh @@ -14,6 +14,7 @@ meson_options_help() { printf "%s\n" ' use idef-parser to automatically generate TCG' printf "%s\n" ' code for the Hexagon frontend' printf "%s\n" ' --disable-install-blobs install provided firmware blobs' + printf "%s\n" ' --disable-qom-cast-debug cast debugging support' printf "%s\n" ' --docdir=VALUE Base directory for documentation installation' printf "%s\n" ' (can be empty) [share/doc]' printf "%s\n" ' --enable-block-drv-whitelist-in-tools' @@ -35,7 +36,6 @@ meson_options_help() { printf "%s\n" ' --enable-module-upgrades try to load modules from alternate paths for' printf "%s\n" ' upgrades' printf "%s\n" ' --enable-profiler profiler support' - printf "%s\n" ' --enable-qom-cast-debug cast debugging support' printf "%s\n" ' --enable-rng-none dummy RNG, avoid using /dev/(u)random and' printf "%s\n" ' getrandom()' printf "%s\n" ' --enable-strip Strip targets on install' diff --git a/scripts/tracetool/backend/ftrace.py b/scripts/tracetool/backend/ftrace.py index 5fa30ccc08..baed2ae61c 100644 --- a/scripts/tracetool/backend/ftrace.py +++ b/scripts/tracetool/backend/ftrace.py @@ -12,6 +12,8 @@ __maintainer__ = "Stefan Hajnoczi" __email__ = "stefanha@redhat.com" +import os.path + from tracetool import out @@ -45,7 +47,7 @@ def generate_h(event, group): args=event.args, event_id="TRACE_" + event.name.upper(), event_lineno=event.lineno, - event_filename=event.filename, + event_filename=os.path.relpath(event.filename), fmt=event.fmt.rstrip("\n"), argnames=argnames) diff --git a/scripts/tracetool/backend/log.py b/scripts/tracetool/backend/log.py index 17ba1cd90e..de27b7e62e 100644 --- a/scripts/tracetool/backend/log.py +++ b/scripts/tracetool/backend/log.py @@ -12,6 +12,8 @@ __maintainer__ = "Stefan Hajnoczi" __email__ = "stefanha@redhat.com" +import os.path + from tracetool import out @@ -53,7 +55,7 @@ def generate_h(event, group): ' }', cond=cond, event_lineno=event.lineno, - event_filename=event.filename, + event_filename=os.path.relpath(event.filename), name=event.name, fmt=event.fmt.rstrip("\n"), argnames=argnames) diff --git a/scripts/tracetool/backend/syslog.py b/scripts/tracetool/backend/syslog.py index 5a3a00fe31..012970f6cc 100644 --- a/scripts/tracetool/backend/syslog.py +++ b/scripts/tracetool/backend/syslog.py @@ -12,6 +12,8 @@ __maintainer__ = "Stefan Hajnoczi" __email__ = "stefanha@redhat.com" +import os.path + from tracetool import out @@ -41,7 +43,7 @@ def generate_h(event, group): ' }', cond=cond, event_lineno=event.lineno, - event_filename=event.filename, + event_filename=os.path.relpath(event.filename), name=event.name, fmt=event.fmt.rstrip("\n"), argnames=argnames) diff --git a/scsi/pr-manager.c b/scsi/pr-manager.c index 2098d7e759..fb5fc29730 100644 --- a/scsi/pr-manager.c +++ b/scsi/pr-manager.c @@ -51,7 
+51,6 @@ static int pr_manager_worker(void *opaque) int coroutine_fn pr_manager_execute(PRManager *pr_mgr, AioContext *ctx, int fd, struct sg_io_hdr *hdr) { - ThreadPool *pool = aio_get_thread_pool(ctx); PRManagerData data = { .pr_mgr = pr_mgr, .fd = fd, @@ -62,7 +61,7 @@ int coroutine_fn pr_manager_execute(PRManager *pr_mgr, AioContext *ctx, int fd, /* The matching object_unref is in pr_manager_worker. */ object_ref(OBJECT(pr_mgr)); - return thread_pool_submit_co(pool, pr_manager_worker, &data); + return thread_pool_submit_co(pr_manager_worker, &data); } bool pr_manager_is_connected(PRManager *pr_mgr) diff --git a/scsi/qemu-pr-helper.c b/scsi/qemu-pr-helper.c index 199227a556..a857e80c03 100644 --- a/scsi/qemu-pr-helper.c +++ b/scsi/qemu-pr-helper.c @@ -177,10 +177,9 @@ static int do_sgio_worker(void *opaque) return status; } -static int do_sgio(int fd, const uint8_t *cdb, uint8_t *sense, - uint8_t *buf, int *sz, int dir) +static int coroutine_fn do_sgio(int fd, const uint8_t *cdb, uint8_t *sense, + uint8_t *buf, int *sz, int dir) { - ThreadPool *pool = aio_get_thread_pool(qemu_get_aio_context()); int r; PRHelperSGIOData data = { @@ -192,7 +191,7 @@ static int do_sgio(int fd, const uint8_t *cdb, uint8_t *sense, .dir = dir, }; - r = thread_pool_submit_co(pool, do_sgio_worker, &data); + r = thread_pool_submit_co(do_sgio_worker, &data); *sz = data.sz; return r; } @@ -320,7 +319,7 @@ static SCSISense mpath_generic_sense(int r) } } -static int mpath_reconstruct_sense(int fd, int r, uint8_t *sense) +static int coroutine_fn mpath_reconstruct_sense(int fd, int r, uint8_t *sense) { switch (r) { case MPATH_PR_SUCCESS: @@ -372,8 +371,8 @@ static int mpath_reconstruct_sense(int fd, int r, uint8_t *sense) } } -static int multipath_pr_in(int fd, const uint8_t *cdb, uint8_t *sense, - uint8_t *data, int sz) +static int coroutine_fn multipath_pr_in(int fd, const uint8_t *cdb, uint8_t *sense, + uint8_t *data, int sz) { int rq_servact = cdb[1]; struct prin_resp resp; @@ -427,8 +426,8 @@ static int multipath_pr_in(int fd, const uint8_t *cdb, uint8_t *sense, return mpath_reconstruct_sense(fd, r, sense); } -static int multipath_pr_out(int fd, const uint8_t *cdb, uint8_t *sense, - const uint8_t *param, int sz) +static int coroutine_fn multipath_pr_out(int fd, const uint8_t *cdb, uint8_t *sense, + const uint8_t *param, int sz) { int rq_servact = cdb[1]; int rq_scope = cdb[2] >> 4; @@ -545,8 +544,8 @@ static int multipath_pr_out(int fd, const uint8_t *cdb, uint8_t *sense, } #endif -static int do_pr_in(int fd, const uint8_t *cdb, uint8_t *sense, - uint8_t *data, int *resp_sz) +static int coroutine_fn do_pr_in(int fd, const uint8_t *cdb, uint8_t *sense, + uint8_t *data, int *resp_sz) { #ifdef CONFIG_MPATH if (is_mpath(fd)) { @@ -563,8 +562,8 @@ static int do_pr_in(int fd, const uint8_t *cdb, uint8_t *sense, SG_DXFER_FROM_DEV); } -static int do_pr_out(int fd, const uint8_t *cdb, uint8_t *sense, - const uint8_t *param, int sz) +static int coroutine_fn do_pr_out(int fd, const uint8_t *cdb, uint8_t *sense, + const uint8_t *param, int sz) { int resp_sz; diff --git a/softmmu/vl.c b/softmmu/vl.c index 5cb72a56fc..fb6c221e8e 100644 --- a/softmmu/vl.c +++ b/softmmu/vl.c @@ -3584,13 +3584,18 @@ void qemu_init(int argc, char **argv) } /* + * Create backends before creating migration objects, so that it can + * check against compatibilities on the backend memories (e.g. postcopy + * over memory-backend-file objects). 
+ */ + qemu_create_late_backends(); + + /* * Note: creates a QOM object, must run only after global and * compat properties have been set up. */ migration_object_init(); - qemu_create_late_backends(); - /* parse features once if machine provides default cpu_type */ current_machine->cpu_type = machine_class->default_cpu_type; if (cpu_option) { diff --git a/subprojects/libvhost-user/libvhost-user.c b/subprojects/libvhost-user/libvhost-user.c index 0200b78e8e..0abd898a52 100644 --- a/subprojects/libvhost-user/libvhost-user.c +++ b/subprojects/libvhost-user/libvhost-user.c @@ -2455,6 +2455,16 @@ void vu_queue_notify_sync(VuDev *dev, VuVirtq *vq) _vu_queue_notify(dev, vq, true); } +void vu_config_change_msg(VuDev *dev) +{ + VhostUserMsg vmsg = { + .request = VHOST_USER_BACKEND_CONFIG_CHANGE_MSG, + .flags = VHOST_USER_VERSION, + }; + + vu_message_write(dev, dev->slave_fd, &vmsg); +} + static inline void vring_used_flags_set_bit(VuVirtq *vq, int mask) { diff --git a/subprojects/libvhost-user/libvhost-user.h b/subprojects/libvhost-user/libvhost-user.h index 8c5a2719e3..49208cceaa 100644 --- a/subprojects/libvhost-user/libvhost-user.h +++ b/subprojects/libvhost-user/libvhost-user.h @@ -585,6 +585,8 @@ bool vu_queue_empty(VuDev *dev, VuVirtq *vq); */ void vu_queue_notify(VuDev *dev, VuVirtq *vq); +void vu_config_change_msg(VuDev *dev); + /** * vu_queue_notify_sync: * @dev: a VuDev context diff --git a/tests/unit/test-thread-pool.c b/tests/unit/test-thread-pool.c index 6020e65d69..1483e53473 100644 --- a/tests/unit/test-thread-pool.c +++ b/tests/unit/test-thread-pool.c @@ -8,7 +8,6 @@ #include "qemu/main-loop.h" static AioContext *ctx; -static ThreadPool *pool; static int active; typedef struct { @@ -47,7 +46,7 @@ static void done_cb(void *opaque, int ret) static void test_submit(void) { WorkerTestData data = { .n = 0 }; - thread_pool_submit(pool, worker_cb, &data); + thread_pool_submit(worker_cb, &data); while (data.n == 0) { aio_poll(ctx, true); } @@ -57,7 +56,7 @@ static void test_submit(void) static void test_submit_aio(void) { WorkerTestData data = { .n = 0, .ret = -EINPROGRESS }; - data.aiocb = thread_pool_submit_aio(pool, worker_cb, &data, + data.aiocb = thread_pool_submit_aio(worker_cb, &data, done_cb, &data); /* The callbacks are not called until after the first wait. */ @@ -71,14 +70,14 @@ static void test_submit_aio(void) g_assert_cmpint(data.ret, ==, 0); } -static void co_test_cb(void *opaque) +static void coroutine_fn co_test_cb(void *opaque) { WorkerTestData *data = opaque; active = 1; data->n = 0; data->ret = -EINPROGRESS; - thread_pool_submit_co(pool, worker_cb, data); + thread_pool_submit_co(worker_cb, data); /* The test continues in test_submit_co, after qemu_coroutine_enter... 
*/ @@ -122,7 +121,7 @@ static void test_submit_many(void) for (i = 0; i < 100; i++) { data[i].n = 0; data[i].ret = -EINPROGRESS; - thread_pool_submit_aio(pool, worker_cb, &data[i], done_cb, &data[i]); + thread_pool_submit_aio(worker_cb, &data[i], done_cb, &data[i]); } active = 100; @@ -150,7 +149,7 @@ static void do_test_cancel(bool sync) for (i = 0; i < 100; i++) { data[i].n = 0; data[i].ret = -EINPROGRESS; - data[i].aiocb = thread_pool_submit_aio(pool, long_cb, &data[i], + data[i].aiocb = thread_pool_submit_aio(long_cb, &data[i], done_cb, &data[i]); } @@ -235,7 +234,6 @@ int main(int argc, char **argv) { qemu_init_main_loop(&error_abort); ctx = qemu_get_current_aio_context(); - pool = aio_get_thread_pool(ctx); g_test_init(&argc, &argv, NULL); g_test_add_func("/thread-pool/submit", test_submit); diff --git a/ui/ui-hmp-cmds.c b/ui/ui-hmp-cmds.c index 5c456ecc02..c671389473 100644 --- a/ui/ui-hmp-cmds.c +++ b/ui/ui-hmp-cmds.c @@ -458,3 +458,20 @@ hmp_screendump(Monitor *mon, const QDict *qdict) end: hmp_handle_error(mon, err); } + +void hmp_client_migrate_info(Monitor *mon, const QDict *qdict) +{ + Error *err = NULL; + const char *protocol = qdict_get_str(qdict, "protocol"); + const char *hostname = qdict_get_str(qdict, "hostname"); + bool has_port = qdict_haskey(qdict, "port"); + int port = qdict_get_try_int(qdict, "port", -1); + bool has_tls_port = qdict_haskey(qdict, "tls-port"); + int tls_port = qdict_get_try_int(qdict, "tls-port", -1); + const char *cert_subject = qdict_get_try_str(qdict, "cert-subject"); + + qmp_client_migrate_info(protocol, hostname, + has_port, port, has_tls_port, tls_port, + cert_subject, &err); + hmp_handle_error(mon, err); +} diff --git a/ui/ui-qmp-cmds.c b/ui/ui-qmp-cmds.c index dbc4afcd73..a37a7024f3 100644 --- a/ui/ui-qmp-cmds.c +++ b/ui/ui-qmp-cmds.c @@ -175,3 +175,32 @@ void qmp_display_update(DisplayUpdateOptions *arg, Error **errp) abort(); } } + +void qmp_client_migrate_info(const char *protocol, const char *hostname, + bool has_port, int64_t port, + bool has_tls_port, int64_t tls_port, + const char *cert_subject, + Error **errp) +{ + if (strcmp(protocol, "spice") == 0) { + if (!qemu_using_spice(errp)) { + return; + } + + if (!has_port && !has_tls_port) { + error_setg(errp, QERR_MISSING_PARAMETER, "port/tls-port"); + return; + } + + if (qemu_spice.migrate_info(hostname, + has_port ? port : -1, + has_tls_port ? tls_port : -1, + cert_subject)) { + error_setg(errp, "Could not set up display for migration"); + return; + } + return; + } + + error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "protocol", "'spice'"); +} diff --git a/util/mmap-alloc.c b/util/mmap-alloc.c index 5ed7d29183..ed14f9c64d 100644 --- a/util/mmap-alloc.c +++ b/util/mmap-alloc.c @@ -27,8 +27,36 @@ #ifdef CONFIG_LINUX #include <sys/vfs.h> +#include <linux/magic.h> #endif +QemuFsType qemu_fd_getfs(int fd) +{ +#ifdef CONFIG_LINUX + struct statfs fs; + int ret; + + if (fd < 0) { + return QEMU_FS_TYPE_UNKNOWN; + } + + do { + ret = fstatfs(fd, &fs); + } while (ret != 0 && errno == EINTR); + + switch (fs.f_type) { + case TMPFS_MAGIC: + return QEMU_FS_TYPE_TMPFS; + case HUGETLBFS_MAGIC: + return QEMU_FS_TYPE_HUGETLBFS; + default: + return QEMU_FS_TYPE_UNKNOWN; + } +#else + return QEMU_FS_TYPE_UNKNOWN; +#endif +} + size_t qemu_fd_getpagesize(int fd) { #ifdef CONFIG_LINUX diff --git a/util/thread-pool.c b/util/thread-pool.c index 31113b5860..0d97888df0 100644 --- a/util/thread-pool.c +++ b/util/thread-pool.c @@ -48,7 +48,7 @@ struct ThreadPoolElement { /* Access to this list is protected by lock. 
*/ QTAILQ_ENTRY(ThreadPoolElement) reqs; - /* Access to this list is protected by the global mutex. */ + /* This list is only written by the thread pool's mother thread. */ QLIST_ENTRY(ThreadPoolElement) all; }; @@ -175,7 +175,6 @@ static void thread_pool_completion_bh(void *opaque) ThreadPool *pool = opaque; ThreadPoolElement *elem, *next; - aio_context_acquire(pool->ctx); restart: QLIST_FOREACH_SAFE(elem, &pool->head, all, next) { if (elem->state != THREAD_DONE) { @@ -195,9 +194,7 @@ restart: */ qemu_bh_schedule(pool->completion_bh); - aio_context_release(pool->ctx); elem->common.cb(elem->common.opaque, elem->ret); - aio_context_acquire(pool->ctx); /* We can safely cancel the completion_bh here regardless of someone * else having scheduled it meanwhile because we reenter the @@ -211,7 +208,6 @@ restart: qemu_aio_unref(elem); } } - aio_context_release(pool->ctx); } static void thread_pool_cancel(BlockAIOCB *acb) @@ -245,11 +241,15 @@ static const AIOCBInfo thread_pool_aiocb_info = { .get_aio_context = thread_pool_get_aio_context, }; -BlockAIOCB *thread_pool_submit_aio(ThreadPool *pool, - ThreadPoolFunc *func, void *arg, - BlockCompletionFunc *cb, void *opaque) +BlockAIOCB *thread_pool_submit_aio(ThreadPoolFunc *func, void *arg, + BlockCompletionFunc *cb, void *opaque) { ThreadPoolElement *req; + AioContext *ctx = qemu_get_current_aio_context(); + ThreadPool *pool = aio_get_thread_pool(ctx); + + /* Assert that the thread submitting work is the same running the pool */ + assert(pool->ctx == qemu_get_current_aio_context()); req = qemu_aio_get(&thread_pool_aiocb_info, NULL, cb, opaque); req->func = func; @@ -284,19 +284,18 @@ static void thread_pool_co_cb(void *opaque, int ret) aio_co_wake(co->co); } -int coroutine_fn thread_pool_submit_co(ThreadPool *pool, ThreadPoolFunc *func, - void *arg) +int coroutine_fn thread_pool_submit_co(ThreadPoolFunc *func, void *arg) { ThreadPoolCo tpc = { .co = qemu_coroutine_self(), .ret = -EINPROGRESS }; assert(qemu_in_coroutine()); - thread_pool_submit_aio(pool, func, arg, thread_pool_co_cb, &tpc); + thread_pool_submit_aio(func, arg, thread_pool_co_cb, &tpc); qemu_coroutine_yield(); return tpc.ret; } -void thread_pool_submit(ThreadPool *pool, ThreadPoolFunc *func, void *arg) +void thread_pool_submit(ThreadPoolFunc *func, void *arg) { - thread_pool_submit_aio(pool, func, arg, NULL, NULL); + thread_pool_submit_aio(func, arg, NULL, NULL); } void thread_pool_update_params(ThreadPool *pool, AioContext *ctx) |
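
A recurring pattern in the migration/ram.c and migration/ram.h hunks above is the merge of MigrationStats and MigrationAtomicStats into a single RAMStats whose hot counters are Stat64 values, bumped with stat64_add() and read with stat64_get() so several migration threads can update them without taking a lock. Below is a minimal sketch of that counter pattern using plain C11 atomics; QEMU's real implementation lives in include/qemu/stats64.h and additionally copes with hosts that lack 64-bit atomics, and the MiniRAMStats type here is a hypothetical stand-in, not QEMU code.

#include <inttypes.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative Stat64: a 64-bit counter safe to bump from any thread. */
typedef struct {
    _Atomic uint64_t value;
} Stat64;

static void stat64_add(Stat64 *s, uint64_t v)
{
    /* Relaxed ordering is enough for pure statistics counters. */
    atomic_fetch_add_explicit(&s->value, v, memory_order_relaxed);
}

static uint64_t stat64_get(Stat64 *s)
{
    return atomic_load_explicit(&s->value, memory_order_relaxed);
}

/* Hypothetical miniature of the RAMStats layout in the hunk above. */
typedef struct {
    Stat64 transferred;
    Stat64 normal_pages;
    Stat64 zero_pages;
} MiniRAMStats;

int main(void)
{
    MiniRAMStats c = { 0 };

    stat64_add(&c.transferred, 4096);  /* callable from any migration thread */
    stat64_add(&c.normal_pages, 1);
    printf("transferred: %" PRIu64 " bytes\n", stat64_get(&c.transferred));
    return 0;
}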
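
The util/thread-pool.c and tests/unit/test-thread-pool.c hunks drop the explicit ThreadPool argument from thread_pool_submit(), thread_pool_submit_aio() and thread_pool_submit_co(): the pool is now derived from qemu_get_current_aio_context(), and an assertion checks that the submitter runs in the pool's own context. The general shape of that change, replacing an explicit parameter with an ambient per-thread default, is sketched below in standalone C; Context, submit_work() and the other names are hypothetical stand-ins, not QEMU's API.

#include <assert.h>
#include <stdio.h>

typedef struct Context { const char *name; } Context;

/* The ambient default: each thread knows which context it services. */
static _Thread_local Context *current_ctx;

static Context *get_current_context(void)
{
    return current_ctx;
}

/* Old shape: int submit_work(Context *ctx, void (*fn)(void *), void *arg);
 * New shape: the context is looked up from the calling thread instead. */
static int submit_work(void (*fn)(void *), void *arg)
{
    Context *ctx = get_current_context();

    /* Mirrors the new assert(pool->ctx == qemu_get_current_aio_context()). */
    assert(ctx != NULL);
    printf("submitting on %s\n", ctx->name);
    fn(arg);
    return 0;
}

static void work(void *arg)
{
    printf("work: %s\n", (const char *)arg);
}

int main(void)
{
    Context main_ctx = { "main-loop" };

    current_ctx = &main_ctx;           /* bind this thread to a context */
    return submit_work(work, "hello");
}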
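
The util/mmap-alloc.c hunk adds qemu_fd_getfs(), which classifies the filesystem backing a file descriptor by comparing the fstatfs() f_type magic against the constants from linux/magic.h, retrying on EINTR. A standalone, Linux-only sketch of the same probe follows; the fs_kind enum, fd_getfs() name and the main() demo are illustrative rather than QEMU's own.

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/vfs.h>
#include <linux/magic.h>

enum fs_kind { FS_UNKNOWN, FS_TMPFS, FS_HUGETLBFS };

static enum fs_kind fd_getfs(int fd)
{
    struct statfs fs;
    int ret;

    if (fd < 0) {
        return FS_UNKNOWN;
    }

    do {
        ret = fstatfs(fd, &fs);   /* retry if interrupted by a signal */
    } while (ret != 0 && errno == EINTR);

    if (ret != 0) {
        return FS_UNKNOWN;
    }

    switch (fs.f_type) {
    case TMPFS_MAGIC:
        return FS_TMPFS;
    case HUGETLBFS_MAGIC:
        return FS_HUGETLBFS;
    default:
        return FS_UNKNOWN;
    }
}

int main(void)
{
    int fd = open("/dev/shm", O_RDONLY);   /* usually tmpfs on Linux */

    if (fd >= 0) {
        printf("tmpfs: %s\n", fd_getfs(fd) == FS_TMPFS ? "yes" : "no");
        close(fd);
    }
    return 0;
}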
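
The net/vhost-vdpa.c hunk extends vdpa_svq_device_features with BIT_ULL(VIRTIO_NET_F_SPEED_DUPLEX), letting shadow virtqueues expose that feature. Virtio feature words are plain 64-bit masks, and a feature is active only when both device and driver have it set. The sketch below illustrates that negotiation; the bit numbers (STANDBY = 62, SPEED_DUPLEX = 63) follow the virtio specification, but verify them against the virtio_net.h headers you actually build with.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define BIT_ULL(n)                (1ULL << (n))
/* Bit numbers as given by the virtio spec; confirm against your headers. */
#define VIRTIO_NET_F_STANDBY      62
#define VIRTIO_NET_F_SPEED_DUPLEX 63

static bool feature_negotiated(uint64_t device, uint64_t driver, unsigned bit)
{
    /* A feature only takes effect when offered by the device
     * and accepted by the driver. */
    return (device & driver & BIT_ULL(bit)) != 0;
}

int main(void)
{
    uint64_t device = BIT_ULL(VIRTIO_NET_F_STANDBY) |
                      BIT_ULL(VIRTIO_NET_F_SPEED_DUPLEX);
    uint64_t driver = BIT_ULL(VIRTIO_NET_F_SPEED_DUPLEX);

    printf("speed/duplex: %d\n",
           feature_negotiated(device, driver, VIRTIO_NET_F_SPEED_DUPLEX));
    printf("standby:      %d\n",
           feature_negotiated(device, driver, VIRTIO_NET_F_STANDBY));
    return 0;
}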
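
Finally, the new hmp_client_migrate_info()/qmp_client_migrate_info() pair in ui/ui-hmp-cmds.c and ui/ui-qmp-cmds.c shows QAPI's convention for optional scalar arguments: each optional integer travels as a bool has_x flag plus a value, with absent values mapped to a sentinel (-1) before use, while optional strings simply pass NULL. A toy sketch of that calling convention, with all names hypothetical:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Toy version of the has_x/x convention used for optional QMP scalars. */
static int set_display_target(const char *hostname,
                              bool has_port, int64_t port,
                              bool has_tls_port, int64_t tls_port)
{
    if (!has_port && !has_tls_port) {
        fprintf(stderr, "need port or tls-port\n");
        return -1;
    }
    /* Absent values become -1, as in the qemu_spice.migrate_info() call. */
    printf("host=%s port=%lld tls-port=%lld\n", hostname,
           (long long)(has_port ? port : -1),
           (long long)(has_tls_port ? tls_port : -1));
    return 0;
}

int main(void)
{
    return set_display_target("virt42.example.org", true, 1234, false, 0);
}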