diff options
28 files changed, 594 insertions, 96 deletions
diff --git a/MAINTAINERS b/MAINTAINERS index b8637c6f52..d9378511b7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1840,7 +1840,6 @@ R: Ani Sinha <ani@anisinha.ca> S: Supported F: include/hw/acpi/* F: include/hw/firmware/smbios.h -F: hw/mem/* F: hw/acpi/* F: hw/smbios/* F: hw/i386/acpi-build.[hc] @@ -1851,6 +1850,7 @@ F: tests/qtest/acpi-utils.[hc] F: tests/data/acpi/ F: docs/specs/acpi_cpu_hotplug.rst F: docs/specs/acpi_mem_hotplug.rst +F: docs/specs/acpi_nvdimm.rst F: docs/specs/acpi_pci_hotplug.rst F: docs/specs/acpi_hw_reduced_hotplug.rst @@ -2158,15 +2158,6 @@ F: qapi/rocker.json F: tests/rocker/ F: docs/specs/rocker.txt -NVDIMM -M: Xiao Guangrong <xiaoguangrong.eric@gmail.com> -S: Maintained -F: hw/acpi/nvdimm.c -F: hw/mem/nvdimm.c -F: include/hw/mem/nvdimm.h -F: docs/nvdimm.txt -F: docs/specs/acpi_nvdimm.rst - e1000x M: Dmitry Fleytman <dmitry.fleytman@gmail.com> S: Maintained @@ -2588,6 +2579,7 @@ M: Ben Widawsky <ben.widawsky@intel.com> M: Jonathan Cameron <jonathan.cameron@huawei.com> S: Supported F: hw/cxl/ +F: hw/mem/cxl_type3.c F: include/hw/cxl/ Dirty Bitmaps @@ -2704,6 +2696,19 @@ F: softmmu/physmem.c F: include/exec/memory-internal.h F: scripts/coccinelle/memory-region-housekeeping.cocci +Memory devices +M: David Hildenbrand <david@redhat.com> +M: Igor Mammedov <imammedo@redhat.com> +R: Xiao Guangrong <xiaoguangrong.eric@gmail.com> +S: Supported +F: hw/mem/memory-device.c +F: hw/mem/nvdimm.c +F: hw/mem/pc-dimm.c +F: include/hw/mem/memory-device.h +F: include/hw/mem/nvdimm.h +F: include/hw/mem/pc-dimm.h +F: docs/nvdimm.txt + SPICE M: Gerd Hoffmann <kraxel@redhat.com> S: Odd Fixes diff --git a/block/block-copy.c b/block/block-copy.c index ec46775ea5..bb947afdda 100644 --- a/block/block-copy.c +++ b/block/block-copy.c @@ -883,23 +883,42 @@ static int coroutine_fn block_copy_common(BlockCopyCallState *call_state) return ret; } +static void coroutine_fn block_copy_async_co_entry(void *opaque) +{ + block_copy_common(opaque); +} + int coroutine_fn block_copy(BlockCopyState *s, int64_t start, int64_t bytes, - bool ignore_ratelimit) + bool ignore_ratelimit, uint64_t timeout_ns, + BlockCopyAsyncCallbackFunc cb, + void *cb_opaque) { - BlockCopyCallState call_state = { + int ret; + BlockCopyCallState *call_state = g_new(BlockCopyCallState, 1); + + *call_state = (BlockCopyCallState) { .s = s, .offset = start, .bytes = bytes, .ignore_ratelimit = ignore_ratelimit, .max_workers = BLOCK_COPY_MAX_WORKERS, + .cb = cb, + .cb_opaque = cb_opaque, }; - return block_copy_common(&call_state); -} + ret = qemu_co_timeout(block_copy_async_co_entry, call_state, timeout_ns, + g_free); + if (ret < 0) { + assert(ret == -ETIMEDOUT); + block_copy_call_cancel(call_state); + /* call_state will be freed by running coroutine. */ + return ret; + } -static void coroutine_fn block_copy_async_co_entry(void *opaque) -{ - block_copy_common(opaque); + ret = call_state->ret; + g_free(call_state); + + return ret; } BlockCopyCallState *block_copy_async(BlockCopyState *s, diff --git a/block/copy-before-write.c b/block/copy-before-write.c index a8a06fdc09..c24b8dd117 100644 --- a/block/copy-before-write.c +++ b/block/copy-before-write.c @@ -24,6 +24,7 @@ */ #include "qemu/osdep.h" +#include "qapi/qmp/qjson.h" #include "sysemu/block-backend.h" #include "qemu/cutils.h" @@ -40,6 +41,8 @@ typedef struct BDRVCopyBeforeWriteState { BlockCopyState *bcs; BdrvChild *target; + OnCbwError on_cbw_error; + uint32_t cbw_timeout_ns; /* * @lock: protects access to @access_bitmap, @done_bitmap and @@ -64,6 +67,14 @@ typedef struct BDRVCopyBeforeWriteState { * node. These areas must not be rewritten by guest. */ BlockReqList frozen_read_reqs; + + /* + * @snapshot_error is normally zero. But on first copy-before-write failure + * when @on_cbw_error == ON_CBW_ERROR_BREAK_SNAPSHOT, @snapshot_error takes + * value of this error (<0). After that all in-flight and further + * snapshot-API requests will fail with that error. + */ + int snapshot_error; } BDRVCopyBeforeWriteState; static coroutine_fn int cbw_co_preadv( @@ -73,6 +84,13 @@ static coroutine_fn int cbw_co_preadv( return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags); } +static void block_copy_cb(void *opaque) +{ + BlockDriverState *bs = opaque; + + bdrv_dec_in_flight(bs); +} + /* * Do copy-before-write operation. * @@ -94,16 +112,36 @@ static coroutine_fn int cbw_do_copy_before_write(BlockDriverState *bs, return 0; } + if (s->snapshot_error) { + return 0; + } + off = QEMU_ALIGN_DOWN(offset, cluster_size); end = QEMU_ALIGN_UP(offset + bytes, cluster_size); - ret = block_copy(s->bcs, off, end - off, true); - if (ret < 0) { + /* + * Increase in_flight, so that in case of timed-out block-copy, the + * remaining background block_copy() request (which can't be immediately + * cancelled by timeout) is presented in bs->in_flight. This way we are + * sure that on bs close() we'll previously wait for all timed-out but yet + * running block_copy calls. + */ + bdrv_inc_in_flight(bs); + ret = block_copy(s->bcs, off, end - off, true, s->cbw_timeout_ns, + block_copy_cb, bs); + if (ret < 0 && s->on_cbw_error == ON_CBW_ERROR_BREAK_GUEST_WRITE) { return ret; } WITH_QEMU_LOCK_GUARD(&s->lock) { - bdrv_set_dirty_bitmap(s->done_bitmap, off, end - off); + if (ret < 0) { + assert(s->on_cbw_error == ON_CBW_ERROR_BREAK_SNAPSHOT); + if (!s->snapshot_error) { + s->snapshot_error = ret; + } + } else { + bdrv_set_dirty_bitmap(s->done_bitmap, off, end - off); + } reqlist_wait_all(&s->frozen_read_reqs, off, end - off, &s->lock); } @@ -175,6 +213,11 @@ static BlockReq *cbw_snapshot_read_lock(BlockDriverState *bs, QEMU_LOCK_GUARD(&s->lock); + if (s->snapshot_error) { + g_free(req); + return NULL; + } + if (bdrv_dirty_bitmap_next_zero(s->access_bitmap, offset, bytes) != -1) { g_free(req); return NULL; @@ -328,46 +371,36 @@ static void cbw_child_perm(BlockDriverState *bs, BdrvChild *c, } } -static bool cbw_parse_bitmap_option(QDict *options, BdrvDirtyBitmap **bitmap, - Error **errp) +static BlockdevOptions *cbw_parse_options(QDict *options, Error **errp) { - QDict *bitmap_qdict = NULL; - BlockDirtyBitmap *bmp_param = NULL; + BlockdevOptions *opts = NULL; Visitor *v = NULL; - bool ret = false; - *bitmap = NULL; + qdict_put_str(options, "driver", "copy-before-write"); - qdict_extract_subqdict(options, &bitmap_qdict, "bitmap."); - if (!qdict_size(bitmap_qdict)) { - ret = true; - goto out; - } - - v = qobject_input_visitor_new_flat_confused(bitmap_qdict, errp); + v = qobject_input_visitor_new_flat_confused(options, errp); if (!v) { goto out; } - visit_type_BlockDirtyBitmap(v, NULL, &bmp_param, errp); - if (!bmp_param) { - goto out; - } - - *bitmap = block_dirty_bitmap_lookup(bmp_param->node, bmp_param->name, NULL, - errp); - if (!*bitmap) { + visit_type_BlockdevOptions(v, NULL, &opts, errp); + if (!opts) { goto out; } - ret = true; + /* + * Delete options which we are going to parse through BlockdevOptions + * object for original options. + */ + qdict_extract_subqdict(options, NULL, "bitmap"); + qdict_del(options, "on-cbw-error"); + qdict_del(options, "cbw-timeout"); out: - qapi_free_BlockDirtyBitmap(bmp_param); visit_free(v); - qobject_unref(bitmap_qdict); + qdict_del(options, "driver"); - return ret; + return opts; } static int cbw_open(BlockDriverState *bs, QDict *options, int flags, @@ -376,6 +409,15 @@ static int cbw_open(BlockDriverState *bs, QDict *options, int flags, BDRVCopyBeforeWriteState *s = bs->opaque; BdrvDirtyBitmap *bitmap = NULL; int64_t cluster_size; + g_autoptr(BlockdevOptions) full_opts = NULL; + BlockdevOptionsCbw *opts; + + full_opts = cbw_parse_options(options, errp); + if (!full_opts) { + return -EINVAL; + } + assert(full_opts->driver == BLOCKDEV_DRIVER_COPY_BEFORE_WRITE); + opts = &full_opts->u.copy_before_write; bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds, BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY, @@ -390,9 +432,17 @@ static int cbw_open(BlockDriverState *bs, QDict *options, int flags, return -EINVAL; } - if (!cbw_parse_bitmap_option(options, &bitmap, errp)) { - return -EINVAL; + if (opts->has_bitmap) { + bitmap = block_dirty_bitmap_lookup(opts->bitmap->node, + opts->bitmap->name, NULL, errp); + if (!bitmap) { + return -EINVAL; + } } + s->on_cbw_error = opts->has_on_cbw_error ? opts->on_cbw_error : + ON_CBW_ERROR_BREAK_GUEST_WRITE; + s->cbw_timeout_ns = opts->has_cbw_timeout ? + opts->cbw_timeout * NANOSECONDS_PER_SECOND : 0; bs->total_sectors = bs->file->bs->total_sectors; bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED | diff --git a/block/mirror.c b/block/mirror.c index d8ecb9efa2..3c4ab1159d 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -73,7 +73,7 @@ typedef struct MirrorBlockJob { uint64_t last_pause_ns; unsigned long *in_flight_bitmap; - int in_flight; + unsigned in_flight; int64_t bytes_in_flight; QTAILQ_HEAD(, MirrorOp) ops_in_flight; int ret; diff --git a/block/nbd.c b/block/nbd.c index 7f5f50ec46..97683cce27 100644 --- a/block/nbd.c +++ b/block/nbd.c @@ -77,7 +77,7 @@ typedef struct BDRVNBDState { QemuMutex requests_lock; NBDClientState state; CoQueue free_sema; - int in_flight; + unsigned in_flight; NBDClientRequest requests[MAX_NBD_REQUESTS]; QEMUTimer *reconnect_delay_timer; @@ -371,6 +371,7 @@ static bool nbd_client_connecting(BDRVNBDState *s) /* Called with s->requests_lock taken. */ static coroutine_fn void nbd_reconnect_attempt(BDRVNBDState *s) { + int ret; bool blocking = s->state == NBD_CLIENT_CONNECTING_WAIT; /* @@ -380,6 +381,8 @@ static coroutine_fn void nbd_reconnect_attempt(BDRVNBDState *s) assert(nbd_client_connecting(s)); assert(s->in_flight == 1); + trace_nbd_reconnect_attempt(s->bs->in_flight); + if (blocking && !s->reconnect_delay_timer) { /* * It's the first reconnect attempt after switching to @@ -401,7 +404,8 @@ static coroutine_fn void nbd_reconnect_attempt(BDRVNBDState *s) } qemu_mutex_unlock(&s->requests_lock); - nbd_co_do_establish_connection(s->bs, blocking, NULL); + ret = nbd_co_do_establish_connection(s->bs, blocking, NULL); + trace_nbd_reconnect_attempt_result(ret, s->bs->in_flight); qemu_mutex_lock(&s->requests_lock); /* diff --git a/block/trace-events b/block/trace-events index 549090d453..48dbf10c66 100644 --- a/block/trace-events +++ b/block/trace-events @@ -172,6 +172,8 @@ nbd_read_reply_entry_fail(int ret, const char *err) "ret = %d, err: %s" nbd_co_request_fail(uint64_t from, uint32_t len, uint64_t handle, uint16_t flags, uint16_t type, const char *name, int ret, const char *err) "Request failed { .from = %" PRIu64", .len = %" PRIu32 ", .handle = %" PRIu64 ", .flags = 0x%" PRIx16 ", .type = %" PRIu16 " (%s) } ret = %d, err: %s" nbd_client_handshake(const char *export_name) "export '%s'" nbd_client_handshake_success(const char *export_name) "export '%s'" +nbd_reconnect_attempt(unsigned in_flight) "in_flight %u" +nbd_reconnect_attempt_result(int ret, unsigned in_flight) "ret %d in_flight %u" # ssh.c ssh_restart_coroutine(void *co) "co=%p" diff --git a/contrib/vhost-user-blk/meson.build b/contrib/vhost-user-blk/meson.build index 601ea15ef5..dcb9e2ffcd 100644 --- a/contrib/vhost-user-blk/meson.build +++ b/contrib/vhost-user-blk/meson.build @@ -1,5 +1,4 @@ -# FIXME: broken on 32-bit architectures executable('vhost-user-blk', files('vhost-user-blk.c'), dependencies: [qemuutil, vhost_user], - build_by_default: false, + build_by_default: targetos == 'linux', install: false) diff --git a/contrib/vhost-user-blk/vhost-user-blk.c b/contrib/vhost-user-blk/vhost-user-blk.c index cd4a5d7335..9cb78ca1d0 100644 --- a/contrib/vhost-user-blk/vhost-user-blk.c +++ b/contrib/vhost-user-blk/vhost-user-blk.c @@ -146,7 +146,7 @@ vub_readv(VubReq *req, struct iovec *iov, uint32_t iovcnt) req->size = vub_iov_size(iov, iovcnt); rc = preadv(vdev_blk->blk_fd, iov, iovcnt, req->sector_num * 512); if (rc < 0) { - fprintf(stderr, "%s, Sector %"PRIu64", Size %lu failed with %s\n", + fprintf(stderr, "%s, Sector %"PRIu64", Size %zu failed with %s\n", vdev_blk->blk_name, req->sector_num, req->size, strerror(errno)); return -1; @@ -169,7 +169,7 @@ vub_writev(VubReq *req, struct iovec *iov, uint32_t iovcnt) req->size = vub_iov_size(iov, iovcnt); rc = pwritev(vdev_blk->blk_fd, iov, iovcnt, req->sector_num * 512); if (rc < 0) { - fprintf(stderr, "%s, Sector %"PRIu64", Size %lu failed with %s\n", + fprintf(stderr, "%s, Sector %"PRIu64", Size %zu failed with %s\n", vdev_blk->blk_name, req->sector_num, req->size, strerror(errno)); return -1; @@ -188,7 +188,7 @@ vub_discard_write_zeroes(VubReq *req, struct iovec *iov, uint32_t iovcnt, size = vub_iov_size(iov, iovcnt); if (size != sizeof(*desc)) { - fprintf(stderr, "Invalid size %ld, expect %ld\n", size, sizeof(*desc)); + fprintf(stderr, "Invalid size %zd, expect %zd\n", size, sizeof(*desc)); return -1; } buf = g_new0(char, size); diff --git a/docs/interop/vhost-user.rst b/docs/interop/vhost-user.rst index d7cf904f7f..3f18ab424e 100644 --- a/docs/interop/vhost-user.rst +++ b/docs/interop/vhost-user.rst @@ -1376,14 +1376,6 @@ Front-end message types For further details on postcopy, see ``VHOST_USER_SET_MEM_TABLE``. They apply to ``VHOST_USER_ADD_MEM_REG`` accordingly. - Exactly one file descriptor from which the memory is mapped is - passed in the ancillary data. - - In postcopy mode (see ``VHOST_USER_POSTCOPY_LISTEN``), the back-end - replies with the bases of the memory mapped region to the front-end. - For further details on postcopy, see ``VHOST_USER_SET_MEM_TABLE``. - They apply to ``VHOST_USER_ADD_MEM_REG`` accordingly. - ``VHOST_USER_REM_MEM_REG`` :id: 38 :equivalent ioctl: N/A @@ -1408,14 +1400,6 @@ Front-end message types accept messages with one file descriptor. If a file descriptor is passed, the back-end MUST close it without using it otherwise. - The memory region to be removed is identified by its guest address, - user address and size. The mmap offset is ignored. - - No file descriptors SHOULD be passed in the ancillary data. For - compatibility with existing incorrect implementations, the back-end MAY - accept messages with one file descriptor. If a file descriptor is - passed, the back-end MUST close it without using it otherwise. - ``VHOST_USER_SET_STATUS`` :id: 39 :equivalent ioctl: VHOST_VDPA_SET_STATUS diff --git a/hw/pci/msi.c b/hw/pci/msi.c index 5c471b9616..058d1d1ef1 100644 --- a/hw/pci/msi.c +++ b/hw/pci/msi.c @@ -322,9 +322,9 @@ void msi_set_mask(PCIDevice *dev, int vector, bool mask, Error **errp) bool msi64bit = flags & PCI_MSI_FLAGS_64BIT; uint32_t irq_state, vector_mask, pending; - if (vector > PCI_MSI_VECTORS_MAX) { + if (vector >= PCI_MSI_VECTORS_MAX) { error_setg(errp, "msi: vector %d not allocated. max vector is %d", - vector, PCI_MSI_VECTORS_MAX); + vector, (PCI_MSI_VECTORS_MAX - 1)); return; } diff --git a/hw/virtio/vhost-backend.c b/hw/virtio/vhost-backend.c index 4de8b6b3b0..8e581575c9 100644 --- a/hw/virtio/vhost-backend.c +++ b/hw/virtio/vhost-backend.c @@ -146,6 +146,12 @@ static int vhost_kernel_set_vring_call(struct vhost_dev *dev, return vhost_kernel_call(dev, VHOST_SET_VRING_CALL, file); } +static int vhost_kernel_set_vring_err(struct vhost_dev *dev, + struct vhost_vring_file *file) +{ + return vhost_kernel_call(dev, VHOST_SET_VRING_ERR, file); +} + static int vhost_kernel_set_vring_busyloop_timeout(struct vhost_dev *dev, struct vhost_vring_state *s) { @@ -309,6 +315,7 @@ const VhostOps kernel_ops = { .vhost_get_vring_base = vhost_kernel_get_vring_base, .vhost_set_vring_kick = vhost_kernel_set_vring_kick, .vhost_set_vring_call = vhost_kernel_set_vring_call, + .vhost_set_vring_err = vhost_kernel_set_vring_err, .vhost_set_vring_busyloop_timeout = vhost_kernel_set_vring_busyloop_timeout, .vhost_set_features = vhost_kernel_set_features, diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c index 4b9be26e84..75b8df21a4 100644 --- a/hw/virtio/vhost-user.c +++ b/hw/virtio/vhost-user.c @@ -1313,6 +1313,11 @@ static int vhost_user_set_vring_call(struct vhost_dev *dev, return vhost_set_vring_file(dev, VHOST_USER_SET_VRING_CALL, file); } +static int vhost_user_set_vring_err(struct vhost_dev *dev, + struct vhost_vring_file *file) +{ + return vhost_set_vring_file(dev, VHOST_USER_SET_VRING_ERR, file); +} static int vhost_user_get_u64(struct vhost_dev *dev, int request, uint64_t *u64) { @@ -2616,6 +2621,7 @@ const VhostOps user_ops = { .vhost_get_vring_base = vhost_user_get_vring_base, .vhost_set_vring_kick = vhost_user_set_vring_kick, .vhost_set_vring_call = vhost_user_set_vring_call, + .vhost_set_vring_err = vhost_user_set_vring_err, .vhost_set_features = vhost_user_set_features, .vhost_get_features = vhost_user_get_features, .vhost_set_owner = vhost_user_set_owner, diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c index 6c41fa13e3..0827d631c0 100644 --- a/hw/virtio/vhost.c +++ b/hw/virtio/vhost.c @@ -1278,6 +1278,19 @@ static int vhost_virtqueue_set_busyloop_timeout(struct vhost_dev *dev, return 0; } +static void vhost_virtqueue_error_notifier(EventNotifier *n) +{ + struct vhost_virtqueue *vq = container_of(n, struct vhost_virtqueue, + error_notifier); + struct vhost_dev *dev = vq->dev; + int index = vq - dev->vqs; + + if (event_notifier_test_and_clear(n) && dev->vdev) { + VHOST_OPS_DEBUG(-EINVAL, "vhost vring error in virtqueue %d", + dev->vq_index + index); + } +} + static int vhost_virtqueue_init(struct vhost_dev *dev, struct vhost_virtqueue *vq, int n) { @@ -1299,7 +1312,27 @@ static int vhost_virtqueue_init(struct vhost_dev *dev, vq->dev = dev; + if (dev->vhost_ops->vhost_set_vring_err) { + r = event_notifier_init(&vq->error_notifier, 0); + if (r < 0) { + goto fail_call; + } + + file.fd = event_notifier_get_fd(&vq->error_notifier); + r = dev->vhost_ops->vhost_set_vring_err(dev, &file); + if (r) { + VHOST_OPS_DEBUG(r, "vhost_set_vring_err failed"); + goto fail_err; + } + + event_notifier_set_handler(&vq->error_notifier, + vhost_virtqueue_error_notifier); + } + return 0; + +fail_err: + event_notifier_cleanup(&vq->error_notifier); fail_call: event_notifier_cleanup(&vq->masked_notifier); return r; @@ -1308,6 +1341,10 @@ fail_call: static void vhost_virtqueue_cleanup(struct vhost_virtqueue *vq) { event_notifier_cleanup(&vq->masked_notifier); + if (vq->dev->vhost_ops->vhost_set_vring_err) { + event_notifier_set_handler(&vq->error_notifier, NULL); + event_notifier_cleanup(&vq->error_notifier); + } } int vhost_dev_init(struct vhost_dev *hdev, void *opaque, diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c index 7c122ab957..281152d338 100644 --- a/hw/virtio/virtio-iommu.c +++ b/hw/virtio/virtio-iommu.c @@ -675,11 +675,10 @@ static int virtio_iommu_probe(VirtIOIOMMU *s, static int virtio_iommu_iov_to_req(struct iovec *iov, unsigned int iov_cnt, - void *req, size_t req_sz) + void *req, size_t payload_sz) { - size_t sz, payload_sz = req_sz - sizeof(struct virtio_iommu_req_tail); + size_t sz = iov_to_buf(iov, iov_cnt, 0, req, payload_sz); - sz = iov_to_buf(iov, iov_cnt, 0, req, payload_sz); if (unlikely(sz != payload_sz)) { return VIRTIO_IOMMU_S_INVAL; } @@ -692,7 +691,8 @@ static int virtio_iommu_handle_ ## __req(VirtIOIOMMU *s, \ unsigned int iov_cnt) \ { \ struct virtio_iommu_req_ ## __req req; \ - int ret = virtio_iommu_iov_to_req(iov, iov_cnt, &req, sizeof(req)); \ + int ret = virtio_iommu_iov_to_req(iov, iov_cnt, &req, \ + sizeof(req) - sizeof(struct virtio_iommu_req_tail));\ \ return ret ? ret : virtio_iommu_ ## __req(s, &req); \ } @@ -1322,6 +1322,14 @@ static int iommu_post_load(void *opaque, int version_id) VirtIOIOMMU *s = opaque; g_tree_foreach(s->domains, reconstruct_endpoints, s); + + /* + * Memory regions are dynamically turned on/off depending on + * 'config.bypass' and attached domain type if there is. After + * migration, we need to make sure the memory regions are + * still correct. + */ + virtio_iommu_switch_address_space_all(s); return 0; } diff --git a/include/block/block-copy.h b/include/block/block-copy.h index 68bbd344b2..ba0b425d78 100644 --- a/include/block/block-copy.h +++ b/include/block/block-copy.h @@ -40,7 +40,9 @@ int64_t block_copy_reset_unallocated(BlockCopyState *s, int64_t offset, int64_t *count); int coroutine_fn block_copy(BlockCopyState *s, int64_t offset, int64_t bytes, - bool ignore_ratelimit); + bool ignore_ratelimit, uint64_t timeout_ns, + BlockCopyAsyncCallbackFunc cb, + void *cb_opaque); /* * Run block-copy in a coroutine, create corresponding BlockCopyCallState diff --git a/include/hw/virtio/vhost-backend.h b/include/hw/virtio/vhost-backend.h index 81bf3109f8..eab46d7f0b 100644 --- a/include/hw/virtio/vhost-backend.h +++ b/include/hw/virtio/vhost-backend.h @@ -69,6 +69,8 @@ typedef int (*vhost_set_vring_kick_op)(struct vhost_dev *dev, struct vhost_vring_file *file); typedef int (*vhost_set_vring_call_op)(struct vhost_dev *dev, struct vhost_vring_file *file); +typedef int (*vhost_set_vring_err_op)(struct vhost_dev *dev, + struct vhost_vring_file *file); typedef int (*vhost_set_vring_busyloop_timeout_op)(struct vhost_dev *dev, struct vhost_vring_state *r); typedef int (*vhost_set_features_op)(struct vhost_dev *dev, @@ -145,6 +147,7 @@ typedef struct VhostOps { vhost_get_vring_base_op vhost_get_vring_base; vhost_set_vring_kick_op vhost_set_vring_kick; vhost_set_vring_call_op vhost_set_vring_call; + vhost_set_vring_err_op vhost_set_vring_err; vhost_set_vring_busyloop_timeout_op vhost_set_vring_busyloop_timeout; vhost_set_features_op vhost_set_features; vhost_get_features_op vhost_get_features; diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h index b291fe4e24..a346f23d13 100644 --- a/include/hw/virtio/vhost.h +++ b/include/hw/virtio/vhost.h @@ -29,6 +29,7 @@ struct vhost_virtqueue { unsigned long long used_phys; unsigned used_size; EventNotifier masked_notifier; + EventNotifier error_notifier; struct vhost_dev *dev; }; @@ -246,8 +247,29 @@ bool vhost_virtqueue_pending(struct vhost_dev *hdev, int n); */ void vhost_virtqueue_mask(struct vhost_dev *hdev, VirtIODevice *vdev, int n, bool mask); + +/** + * vhost_get_features() - return a sanitised set of feature bits + * @hdev: common vhost_dev structure + * @feature_bits: pointer to terminated table of feature bits + * @features: original feature set + * + * This returns a set of features bits that is an intersection of what + * is supported by the vhost backend (hdev->features), the supported + * feature_bits and the requested feature set. + */ uint64_t vhost_get_features(struct vhost_dev *hdev, const int *feature_bits, uint64_t features); + +/** + * vhost_ack_features() - set vhost acked_features + * @hdev: common vhost_dev structure + * @feature_bits: pointer to terminated table of feature bits + * @features: requested feature set + * + * This sets the internal hdev->acked_features to the intersection of + * the backends advertised features and the supported feature_bits. + */ void vhost_ack_features(struct vhost_dev *hdev, const int *feature_bits, uint64_t features); bool vhost_has_free_slot(void); diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h index d1548d5b11..08c5bb3c76 100644 --- a/include/qemu/coroutine.h +++ b/include/qemu/coroutine.h @@ -331,6 +331,19 @@ static inline void coroutine_fn qemu_co_sleep_ns(QEMUClockType type, int64_t ns) qemu_co_sleep_ns_wakeable(&w, type, ns); } +typedef void CleanupFunc(void *opaque); +/** + * Run entry in a coroutine and start timer. Wait for entry to finish or for + * timer to elapse, what happen first. If entry finished, return 0, if timer + * elapsed earlier, return -ETIMEDOUT. + * + * Be careful, entry execution is not canceled, user should handle it somehow. + * If @clean is provided, it's called after coroutine finish if timeout + * happened. + */ +int coroutine_fn qemu_co_timeout(CoroutineEntry *entry, void *opaque, + uint64_t timeout_ns, CleanupFunc clean); + /** * Wake a coroutine if it is sleeping in qemu_co_sleep_ns. The timer will be * deleted. @sleep_state must be the variable whose address was given to diff --git a/meson.build b/meson.build index a113078f1a..65a885ea69 100644 --- a/meson.build +++ b/meson.build @@ -1516,7 +1516,7 @@ have_vhost_user_blk_server = get_option('vhost_user_blk_server') \ error_message: 'vhost_user_blk_server requires linux') \ .require(have_vhost_user, error_message: 'vhost_user_blk_server requires vhost-user support') \ - .disable_auto_if(not have_system) \ + .disable_auto_if(not have_tools and not have_system) \ .allowed() if get_option('fuse').disabled() and get_option('fuse_lseek').enabled() diff --git a/nbd/client-connection.c b/nbd/client-connection.c index 2a632931c3..0c5f917efa 100644 --- a/nbd/client-connection.c +++ b/nbd/client-connection.c @@ -23,6 +23,7 @@ */ #include "qemu/osdep.h" +#include "trace.h" #include "block/nbd.h" @@ -210,6 +211,7 @@ static void *connect_thread_func(void *opaque) object_unref(OBJECT(conn->sioc)); conn->sioc = NULL; if (conn->do_retry && !conn->detached) { + trace_nbd_connect_thread_sleep(timeout); qemu_mutex_unlock(&conn->mutex); sleep(timeout); diff --git a/nbd/trace-events b/nbd/trace-events index c4919a2dd5..b7032ca277 100644 --- a/nbd/trace-events +++ b/nbd/trace-events @@ -73,3 +73,6 @@ nbd_co_receive_request_decode_type(uint64_t handle, uint16_t type, const char *n nbd_co_receive_request_payload_received(uint64_t handle, uint32_t len) "Payload received: handle = %" PRIu64 ", len = %" PRIu32 nbd_co_receive_align_compliance(const char *op, uint64_t from, uint32_t len, uint32_t align) "client sent non-compliant unaligned %s request: from=0x%" PRIx64 ", len=0x%" PRIx32 ", align=0x%" PRIx32 nbd_trip(void) "Reading request" + +# client-connection.c +nbd_connect_thread_sleep(uint64_t timeout) "timeout %" PRIu64 diff --git a/qapi/block-core.json b/qapi/block-core.json index 457df16638..2173e7734a 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -4185,6 +4185,25 @@ 'data': { '*bottom': 'str' } } ## +# @OnCbwError: +# +# An enumeration of possible behaviors for copy-before-write operation +# failures. +# +# @break-guest-write: report the error to the guest. This way, the guest +# will not be able to overwrite areas that cannot be +# backed up, so the backup process remains valid. +# +# @break-snapshot: continue guest write. Doing so will make the provided +# snapshot state invalid and any backup or export +# process based on it will finally fail. +# +# Since: 7.1 +## +{ 'enum': 'OnCbwError', + 'data': [ 'break-guest-write', 'break-snapshot' ] } + +## # @BlockdevOptionsCbw: # # Driver specific block device options for the copy-before-write driver, @@ -4205,11 +4224,21 @@ # modifications (or removing) of specified bitmap doesn't # influence the filter. (Since 7.0) # +# @on-cbw-error: Behavior on failure of copy-before-write operation. +# Default is @break-guest-write. (Since 7.1) +# +# @cbw-timeout: Zero means no limit. Non-zero sets the timeout in seconds +# for copy-before-write operation. When a timeout occurs, +# the respective copy-before-write operation will fail, and +# the @on-cbw-error parameter will decide how this failure +# is handled. Default 0. (Since 7.1) +# # Since: 6.2 ## { 'struct': 'BlockdevOptionsCbw', 'base': 'BlockdevOptionsGenericFormat', - 'data': { 'target': 'BlockdevRef', '*bitmap': 'BlockDirtyBitmap' } } + 'data': { 'target': 'BlockdevRef', '*bitmap': 'BlockDirtyBitmap', + '*on-cbw-error': 'OnCbwError', '*cbw-timeout': 'uint32' } } ## # @BlockdevOptions: diff --git a/subprojects/libvhost-user/libvhost-user.c b/subprojects/libvhost-user/libvhost-user.c index b4cc3c2d68..ffed4729a3 100644 --- a/subprojects/libvhost-user/libvhost-user.c +++ b/subprojects/libvhost-user/libvhost-user.c @@ -779,15 +779,9 @@ vu_add_mem_reg(VuDev *dev, VhostUserMsg *vmsg) { /* Send the message back to qemu with the addresses filled in. */ vmsg->fd_num = 0; - if (!vu_send_reply(dev, dev->sock, vmsg)) { - vu_panic(dev, "failed to respond to add-mem-region for postcopy"); - return false; - } - DPRINT("Successfully added new region in postcopy\n"); dev->nregions++; - return false; - + return true; } else { for (i = 0; i < dev->max_queues; i++) { if (dev->vq[i].vring.desc) { @@ -1827,18 +1821,11 @@ vu_handle_vring_kick(VuDev *dev, VhostUserMsg *vmsg) static bool vu_handle_get_max_memslots(VuDev *dev, VhostUserMsg *vmsg) { - vmsg->flags = VHOST_USER_REPLY_MASK | VHOST_USER_VERSION; - vmsg->size = sizeof(vmsg->payload.u64); - vmsg->payload.u64 = VHOST_USER_MAX_RAM_SLOTS; - vmsg->fd_num = 0; - - if (!vu_message_write(dev, dev->sock, vmsg)) { - vu_panic(dev, "Failed to send max ram slots: %s\n", strerror(errno)); - } + vmsg_set_reply_u64(vmsg, VHOST_USER_MAX_RAM_SLOTS); DPRINT("u64: 0x%016"PRIx64"\n", (uint64_t) VHOST_USER_MAX_RAM_SLOTS); - return false; + return true; } static bool diff --git a/tests/qemu-iotests/pylintrc b/tests/qemu-iotests/pylintrc index 32ab77b8bb..f4f823a991 100644 --- a/tests/qemu-iotests/pylintrc +++ b/tests/qemu-iotests/pylintrc @@ -51,3 +51,8 @@ notes=FIXME, # Maximum number of characters on a single line. max-line-length=79 + + +[SIMILARITIES] + +min-similarity-lines=6 diff --git a/tests/qemu-iotests/tests/copy-before-write b/tests/qemu-iotests/tests/copy-before-write new file mode 100755 index 0000000000..16efebbf8f --- /dev/null +++ b/tests/qemu-iotests/tests/copy-before-write @@ -0,0 +1,216 @@ +#!/usr/bin/env python3 +# group: auto backup +# +# Copyright (c) 2022 Virtuozzo International GmbH +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +# + +import os +import re + +from qemu.machine import QEMUMachine + +import iotests +from iotests import qemu_img_create, qemu_io + + +temp_img = os.path.join(iotests.test_dir, 'temp') +source_img = os.path.join(iotests.test_dir, 'source') +size = '1M' + + +class TestCbwError(iotests.QMPTestCase): + def tearDown(self): + self.vm.shutdown() + os.remove(temp_img) + os.remove(source_img) + + def setUp(self): + qemu_img_create('-f', iotests.imgfmt, source_img, size) + qemu_img_create('-f', iotests.imgfmt, temp_img, size) + qemu_io('-c', 'write 0 1M', source_img) + + opts = ['-nodefaults', '-display', 'none', '-machine', 'none'] + self.vm = QEMUMachine(iotests.qemu_prog, opts, + base_temp_dir=iotests.test_dir, + sock_dir=iotests.sock_dir) + self.vm.launch() + + def do_cbw_error(self, on_cbw_error): + result = self.vm.qmp('blockdev-add', { + 'node-name': 'cbw', + 'driver': 'copy-before-write', + 'on-cbw-error': on_cbw_error, + 'file': { + 'driver': iotests.imgfmt, + 'file': { + 'driver': 'file', + 'filename': source_img, + } + }, + 'target': { + 'driver': iotests.imgfmt, + 'file': { + 'driver': 'blkdebug', + 'image': { + 'driver': 'file', + 'filename': temp_img + }, + 'inject-error': [ + { + 'event': 'write_aio', + 'errno': 5, + 'immediately': False, + 'once': True + } + ] + } + } + }) + self.assert_qmp(result, 'return', {}) + + result = self.vm.qmp('blockdev-add', { + 'node-name': 'access', + 'driver': 'snapshot-access', + 'file': 'cbw' + }) + self.assert_qmp(result, 'return', {}) + + result = self.vm.qmp('human-monitor-command', + command_line='qemu-io cbw "write 0 1M"') + self.assert_qmp(result, 'return', '') + + result = self.vm.qmp('human-monitor-command', + command_line='qemu-io access "read 0 1M"') + self.assert_qmp(result, 'return', '') + + self.vm.shutdown() + log = self.vm.get_log() + log = re.sub(r'^\[I \d+\.\d+\] OPENED\n', '', log) + log = re.sub(r'\[I \+\d+\.\d+\] CLOSED\n?$', '', log) + log = iotests.filter_qemu_io(log) + return log + + def test_break_snapshot_on_cbw_error(self): + """break-snapshot behavior: + Guest write succeed, but further snapshot-read fails, as snapshot is + broken. + """ + log = self.do_cbw_error('break-snapshot') + + self.assertEqual(log, """\ +wrote 1048576/1048576 bytes at offset 0 +1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read failed: Permission denied +""") + + def test_break_guest_write_on_cbw_error(self): + """break-guest-write behavior: + Guest write fails, but snapshot-access continues working and further + snapshot-read succeeds. + """ + log = self.do_cbw_error('break-guest-write') + + self.assertEqual(log, """\ +write failed: Input/output error +read 1048576/1048576 bytes at offset 0 +1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +""") + + def do_cbw_timeout(self, on_cbw_error): + result = self.vm.qmp('object-add', { + 'qom-type': 'throttle-group', + 'id': 'group0', + 'limits': {'bps-write': 300 * 1024} + }) + self.assert_qmp(result, 'return', {}) + + result = self.vm.qmp('blockdev-add', { + 'node-name': 'cbw', + 'driver': 'copy-before-write', + 'on-cbw-error': on_cbw_error, + 'cbw-timeout': 1, + 'file': { + 'driver': iotests.imgfmt, + 'file': { + 'driver': 'file', + 'filename': source_img, + } + }, + 'target': { + 'driver': 'throttle', + 'throttle-group': 'group0', + 'file': { + 'driver': 'qcow2', + 'file': { + 'driver': 'file', + 'filename': temp_img + } + } + } + }) + self.assert_qmp(result, 'return', {}) + + result = self.vm.qmp('blockdev-add', { + 'node-name': 'access', + 'driver': 'snapshot-access', + 'file': 'cbw' + }) + self.assert_qmp(result, 'return', {}) + + result = self.vm.qmp('human-monitor-command', + command_line='qemu-io cbw "write 0 512K"') + self.assert_qmp(result, 'return', '') + + # We need second write to trigger throttling + result = self.vm.qmp('human-monitor-command', + command_line='qemu-io cbw "write 512K 512K"') + self.assert_qmp(result, 'return', '') + + result = self.vm.qmp('human-monitor-command', + command_line='qemu-io access "read 0 1M"') + self.assert_qmp(result, 'return', '') + + self.vm.shutdown() + log = self.vm.get_log() + log = re.sub(r'^\[I \d+\.\d+\] OPENED\n', '', log) + log = re.sub(r'\[I \+\d+\.\d+\] CLOSED\n?$', '', log) + log = iotests.filter_qemu_io(log) + return log + + def test_timeout_break_guest(self): + log = self.do_cbw_timeout('break-guest-write') + self.assertEqual(log, """\ +wrote 524288/524288 bytes at offset 0 +512 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +write failed: Connection timed out +read 1048576/1048576 bytes at offset 0 +1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +""") + + def test_timeout_break_snapshot(self): + log = self.do_cbw_timeout('break-snapshot') + self.assertEqual(log, """\ +wrote 524288/524288 bytes at offset 0 +512 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 524288/524288 bytes at offset 524288 +512 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read failed: Permission denied +""") + + +if __name__ == '__main__': + iotests.main(supported_fmts=['qcow2'], + supported_protocols=['file']) diff --git a/tests/qemu-iotests/tests/copy-before-write.out b/tests/qemu-iotests/tests/copy-before-write.out new file mode 100644 index 0000000000..89968f35d7 --- /dev/null +++ b/tests/qemu-iotests/tests/copy-before-write.out @@ -0,0 +1,5 @@ +.... +---------------------------------------------------------------------- +Ran 4 tests + +OK diff --git a/util/meson.build b/util/meson.build index 4939b0b91c..8cce8f8968 100644 --- a/util/meson.build +++ b/util/meson.build @@ -85,6 +85,7 @@ if have_block util_ss.add(files('block-helpers.c')) util_ss.add(files('qemu-coroutine-sleep.c')) util_ss.add(files('qemu-co-shared-resource.c')) + util_ss.add(files('qemu-co-timeout.c')) util_ss.add(files('thread-pool.c', 'qemu-timer.c')) util_ss.add(files('readline.c')) util_ss.add(files('throttle.c')) diff --git a/util/qemu-co-timeout.c b/util/qemu-co-timeout.c new file mode 100644 index 0000000000..00cd335649 --- /dev/null +++ b/util/qemu-co-timeout.c @@ -0,0 +1,89 @@ +/* + * Helper functionality for distributing a fixed total amount of + * an abstract resource among multiple coroutines. + * + * Copyright (c) 2022 Virtuozzo International GmbH + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "qemu/osdep.h" +#include "qemu/coroutine.h" +#include "block/aio.h" + +typedef struct QemuCoTimeoutState { + CoroutineEntry *entry; + void *opaque; + QemuCoSleep sleep_state; + bool marker; + CleanupFunc *clean; +} QemuCoTimeoutState; + +static void coroutine_fn qemu_co_timeout_entry(void *opaque) +{ + QemuCoTimeoutState *s = opaque; + + s->entry(s->opaque); + + if (s->marker) { + assert(!s->sleep_state.to_wake); + /* .marker set by qemu_co_timeout, it have been failed */ + if (s->clean) { + s->clean(s->opaque); + } + g_free(s); + } else { + s->marker = true; + qemu_co_sleep_wake(&s->sleep_state); + } +} + +int coroutine_fn qemu_co_timeout(CoroutineEntry *entry, void *opaque, + uint64_t timeout_ns, CleanupFunc clean) +{ + QemuCoTimeoutState *s; + Coroutine *co; + + if (timeout_ns == 0) { + entry(opaque); + return 0; + } + + s = g_new(QemuCoTimeoutState, 1); + *s = (QemuCoTimeoutState) { + .entry = entry, + .opaque = opaque, + .clean = clean + }; + + co = qemu_coroutine_create(qemu_co_timeout_entry, s); + + aio_co_enter(qemu_get_current_aio_context(), co); + qemu_co_sleep_ns_wakeable(&s->sleep_state, QEMU_CLOCK_REALTIME, timeout_ns); + + if (s->marker) { + /* .marker set by qemu_co_timeout_entry, success */ + g_free(s); + return 0; + } + + /* Don't free s, as we can't cancel qemu_co_timeout_entry execution */ + s->marker = true; + return -ETIMEDOUT; +} |