aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--MAINTAINERS12
-rw-r--r--aio-posix.c87
-rw-r--r--aio-win32.c8
-rw-r--r--async.c10
-rw-r--r--block.c2712
-rw-r--r--block/Makefile.objs5
-rw-r--r--block/backup.c155
-rw-r--r--block/blkdebug.c6
-rw-r--r--block/block-backend.c11
-rw-r--r--block/io.c2540
-rw-r--r--block/iscsi.c64
-rw-r--r--block/mirror.c53
-rw-r--r--block/null.c66
-rw-r--r--block/qapi.c46
-rw-r--r--block/qcow.c8
-rw-r--r--block/qcow2-refcount.c2
-rw-r--r--block/qcow2-snapshot.c6
-rw-r--r--block/qcow2.c14
-rw-r--r--block/qed.c6
-rw-r--r--block/quorum.c5
-rw-r--r--block/rbd.c2
-rw-r--r--block/sheepdog.c2
-rw-r--r--block/snapshot.c12
-rw-r--r--block/vdi.c6
-rw-r--r--block/vhdx.c10
-rw-r--r--block/vmdk.c10
-rw-r--r--block/vpc.c6
-rw-r--r--block/vvfat.c7
-rw-r--r--blockdev.c195
-rw-r--r--blockjob.c23
-rw-r--r--docs/bitmaps.md352
-rw-r--r--docs/qmp/qmp-events.txt21
-rw-r--r--hmp.c6
-rw-r--r--hw/acpi/pcihp.c2
-rw-r--r--hw/arm/nseries.c5
-rw-r--r--hw/arm/omap1.c6
-rw-r--r--hw/arm/pxa2xx_gpio.c2
-rw-r--r--hw/arm/strongarm.c4
-rw-r--r--hw/block/m25p80.c5
-rw-r--r--hw/bt/sdp.c2
-rw-r--r--hw/char/virtio-serial-bus.c8
-rw-r--r--hw/display/tc6393xb.c2
-rw-r--r--hw/gpio/max7310.c2
-rw-r--r--hw/gpio/omap_gpio.c13
-rw-r--r--hw/gpio/zaurus.c2
-rw-r--r--hw/i2c/omap_i2c.c10
-rw-r--r--hw/intc/allwinner-a10-pic.c8
-rw-r--r--hw/intc/omap_intc.c9
-rw-r--r--hw/pci-host/bonito.c2
-rw-r--r--hw/pci-host/uninorth.c5
-rw-r--r--hw/pci/msi.c12
-rw-r--r--hw/pci/pcie_aer.c2
-rw-r--r--hw/pci/shpc.c10
-rw-r--r--hw/pci/slotid_cap.c2
-rw-r--r--hw/ppc/ppce500_spin.c2
-rw-r--r--hw/scsi/megasas.c2
-rw-r--r--hw/sd/sd.c3
-rw-r--r--include/block/aio.h16
-rw-r--r--include/block/block.h36
-rw-r--r--include/block/block_int.h16
-rw-r--r--include/block/blockjob.h22
-rw-r--r--include/block/qapi.h2
-rw-r--r--include/hw/pci/pci.h16
-rw-r--r--include/hw/pci/pcie_regs.h18
-rw-r--r--include/qapi/qmp/qerror.h6
-rw-r--r--include/qemu/hbitmap.h23
-rw-r--r--include/standard-headers/linux/virtio_blk.h8
-rw-r--r--include/sysemu/block-backend.h2
-rw-r--r--include/sysemu/os-win32.h3
-rw-r--r--iothread.c11
-rw-r--r--kvm-all.c8
-rw-r--r--migration/block.c9
-rw-r--r--qapi/block-core.json113
-rw-r--r--qemu-img.c516
-rw-r--r--qmp-commands.hx130
-rwxr-xr-xscripts/checkpatch.pl11
-rw-r--r--scripts/qemu-gdb.py75
-rw-r--r--scripts/qmp/qmp.py95
-rw-r--r--target-ppc/cpu.h4
-rwxr-xr-xtests/qemu-iotests/122223
-rw-r--r--tests/qemu-iotests/122.out209
-rw-r--r--tests/qemu-iotests/124363
-rw-r--r--tests/qemu-iotests/124.out5
-rw-r--r--tests/qemu-iotests/12986
-rw-r--r--tests/qemu-iotests/129.out5
-rw-r--r--tests/qemu-iotests/group3
-rw-r--r--tests/qemu-iotests/iotests.py38
-rw-r--r--tests/test-aio.c19
-rw-r--r--tests/test-hbitmap.c255
-rw-r--r--thread-pool.c14
-rw-r--r--util/hbitmap.c85
91 files changed, 5962 insertions, 3071 deletions
diff --git a/MAINTAINERS b/MAINTAINERS
index 7aab80b59f..b5ab755de5 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1182,7 +1182,19 @@ S: Supported
F: block/gluster.c
T: git git://github.com/codyprime/qemu-kvm-jtc.git block
+Null Block Driver
+M: Fam Zheng <famz@redhat.com>
+L: qemu-block@nongnu.org
+S: Supported
+F: block/null.c
+
Bootdevice
M: Gonglei <arei.gonglei@huawei.com>
S: Maintained
F: bootdevice.c
+
+Quorum
+M: Alberto Garcia <berto@igalia.com>
+S: Supported
+F: block/quorum.c
+L: qemu-block@nongnu.org
diff --git a/aio-posix.c b/aio-posix.c
index cbd4c3438c..4abec38866 100644
--- a/aio-posix.c
+++ b/aio-posix.c
@@ -24,7 +24,6 @@ struct AioHandler
IOHandler *io_read;
IOHandler *io_write;
int deleted;
- int pollfds_idx;
void *opaque;
QLIST_ENTRY(AioHandler) node;
};
@@ -83,7 +82,6 @@ void aio_set_fd_handler(AioContext *ctx,
node->io_read = io_read;
node->io_write = io_write;
node->opaque = opaque;
- node->pollfds_idx = -1;
node->pfd.events = (io_read ? G_IO_IN | G_IO_HUP | G_IO_ERR : 0);
node->pfd.events |= (io_write ? G_IO_OUT | G_IO_ERR : 0);
@@ -186,13 +184,61 @@ bool aio_dispatch(AioContext *ctx)
return progress;
}
+/* These thread-local variables are used only in a small part of aio_poll
+ * around the call to the poll() system call. In particular they are not
+ * used while aio_poll is performing callbacks, which makes it much easier
+ * to think about reentrancy!
+ *
+ * Stack-allocated arrays would be perfect but they have size limitations;
+ * heap allocation is expensive enough that we want to reuse arrays across
+ * calls to aio_poll(). And because poll() has to be called without holding
+ * any lock, the arrays cannot be stored in AioContext. Thread-local data
+ * has none of the disadvantages of these three options.
+ */
+static __thread GPollFD *pollfds;
+static __thread AioHandler **nodes;
+static __thread unsigned npfd, nalloc;
+static __thread Notifier pollfds_cleanup_notifier;
+
+static void pollfds_cleanup(Notifier *n, void *unused)
+{
+ g_assert(npfd == 0);
+ g_free(pollfds);
+ g_free(nodes);
+ nalloc = 0;
+}
+
+static void add_pollfd(AioHandler *node)
+{
+ if (npfd == nalloc) {
+ if (nalloc == 0) {
+ pollfds_cleanup_notifier.notify = pollfds_cleanup;
+ qemu_thread_atexit_add(&pollfds_cleanup_notifier);
+ nalloc = 8;
+ } else {
+ g_assert(nalloc <= INT_MAX);
+ nalloc *= 2;
+ }
+ pollfds = g_renew(GPollFD, pollfds, nalloc);
+ nodes = g_renew(AioHandler *, nodes, nalloc);
+ }
+ nodes[npfd] = node;
+ pollfds[npfd] = (GPollFD) {
+ .fd = node->pfd.fd,
+ .events = node->pfd.events,
+ };
+ npfd++;
+}
+
bool aio_poll(AioContext *ctx, bool blocking)
{
AioHandler *node;
bool was_dispatching;
- int ret;
+ int i, ret;
bool progress;
+ int64_t timeout;
+ aio_context_acquire(ctx);
was_dispatching = ctx->dispatching;
progress = false;
@@ -210,39 +256,36 @@ bool aio_poll(AioContext *ctx, bool blocking)
ctx->walking_handlers++;
- g_array_set_size(ctx->pollfds, 0);
+ assert(npfd == 0);
/* fill pollfds */
QLIST_FOREACH(node, &ctx->aio_handlers, node) {
- node->pollfds_idx = -1;
if (!node->deleted && node->pfd.events) {
- GPollFD pfd = {
- .fd = node->pfd.fd,
- .events = node->pfd.events,
- };
- node->pollfds_idx = ctx->pollfds->len;
- g_array_append_val(ctx->pollfds, pfd);
+ add_pollfd(node);
}
}
- ctx->walking_handlers--;
+ timeout = blocking ? aio_compute_timeout(ctx) : 0;
/* wait until next event */
- ret = qemu_poll_ns((GPollFD *)ctx->pollfds->data,
- ctx->pollfds->len,
- blocking ? aio_compute_timeout(ctx) : 0);
+ if (timeout) {
+ aio_context_release(ctx);
+ }
+ ret = qemu_poll_ns((GPollFD *)pollfds, npfd, timeout);
+ if (timeout) {
+ aio_context_acquire(ctx);
+ }
/* if we have any readable fds, dispatch event */
if (ret > 0) {
- QLIST_FOREACH(node, &ctx->aio_handlers, node) {
- if (node->pollfds_idx != -1) {
- GPollFD *pfd = &g_array_index(ctx->pollfds, GPollFD,
- node->pollfds_idx);
- node->pfd.revents = pfd->revents;
- }
+ for (i = 0; i < npfd; i++) {
+ nodes[i]->pfd.revents = pollfds[i].revents;
}
}
+ npfd = 0;
+ ctx->walking_handlers--;
+
/* Run dispatch even if there were no readable fds to run timers */
aio_set_dispatching(ctx, true);
if (aio_dispatch(ctx)) {
@@ -250,5 +293,7 @@ bool aio_poll(AioContext *ctx, bool blocking)
}
aio_set_dispatching(ctx, was_dispatching);
+ aio_context_release(ctx);
+
return progress;
}
diff --git a/aio-win32.c b/aio-win32.c
index e6f4cedf48..233d8f5d79 100644
--- a/aio-win32.c
+++ b/aio-win32.c
@@ -283,6 +283,7 @@ bool aio_poll(AioContext *ctx, bool blocking)
int count;
int timeout;
+ aio_context_acquire(ctx);
have_select_revents = aio_prepare(ctx);
if (have_select_revents) {
blocking = false;
@@ -323,7 +324,13 @@ bool aio_poll(AioContext *ctx, bool blocking)
timeout = blocking
? qemu_timeout_ns_to_ms(aio_compute_timeout(ctx)) : 0;
+ if (timeout) {
+ aio_context_release(ctx);
+ }
ret = WaitForMultipleObjects(count, events, FALSE, timeout);
+ if (timeout) {
+ aio_context_acquire(ctx);
+ }
aio_set_dispatching(ctx, true);
if (first && aio_bh_poll(ctx)) {
@@ -349,5 +356,6 @@ bool aio_poll(AioContext *ctx, bool blocking)
progress |= timerlistgroup_run_timers(&ctx->tlg);
aio_set_dispatching(ctx, was_dispatching);
+ aio_context_release(ctx);
return progress;
}
diff --git a/async.c b/async.c
index 2b51e87679..46d9e639d7 100644
--- a/async.c
+++ b/async.c
@@ -230,7 +230,6 @@ aio_ctx_finalize(GSource *source)
event_notifier_cleanup(&ctx->notifier);
rfifolock_destroy(&ctx->lock);
qemu_mutex_destroy(&ctx->bh_lock);
- g_array_free(ctx->pollfds, TRUE);
timerlistgroup_deinit(&ctx->tlg);
}
@@ -281,12 +280,6 @@ static void aio_timerlist_notify(void *opaque)
aio_notify(opaque);
}
-static void aio_rfifolock_cb(void *opaque)
-{
- /* Kick owner thread in case they are blocked in aio_poll() */
- aio_notify(opaque);
-}
-
AioContext *aio_context_new(Error **errp)
{
int ret;
@@ -302,10 +295,9 @@ AioContext *aio_context_new(Error **errp)
aio_set_event_notifier(ctx, &ctx->notifier,
(EventNotifierHandler *)
event_notifier_test_and_clear);
- ctx->pollfds = g_array_new(FALSE, FALSE, sizeof(GPollFD));
ctx->thread_pool = NULL;
qemu_mutex_init(&ctx->bh_lock);
- rfifolock_init(&ctx->lock, aio_rfifolock_cb, ctx);
+ rfifolock_init(&ctx->lock, NULL, NULL);
timerlistgroup_init(&ctx->tlg, aio_timerlist_notify, ctx);
return ctx;
diff --git a/block.c b/block.c
index f2f8ae77c1..7904098c64 100644
--- a/block.c
+++ b/block.c
@@ -51,43 +51,25 @@
#include <windows.h>
#endif
+/**
+ * A BdrvDirtyBitmap can be in three possible states:
+ * (1) successor is NULL and disabled is false: full r/w mode
+ * (2) successor is NULL and disabled is true: read only mode ("disabled")
+ * (3) successor is set: frozen mode.
+ * A frozen bitmap cannot be renamed, deleted, anonymized, cleared, set,
+ * or enabled. A frozen bitmap can only abdicate() or reclaim().
+ */
struct BdrvDirtyBitmap {
- HBitmap *bitmap;
+ HBitmap *bitmap; /* Dirty sector bitmap implementation */
+ BdrvDirtyBitmap *successor; /* Anonymous child; implies frozen status */
+ char *name; /* Optional non-empty unique ID */
+ int64_t size; /* Size of the bitmap (Number of sectors) */
+ bool disabled; /* Bitmap is read-only */
QLIST_ENTRY(BdrvDirtyBitmap) list;
};
#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
-static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
- int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
- BlockCompletionFunc *cb, void *opaque);
-static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
- int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
- BlockCompletionFunc *cb, void *opaque);
-static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
- int64_t sector_num, int nb_sectors,
- QEMUIOVector *iov);
-static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
- int64_t sector_num, int nb_sectors,
- QEMUIOVector *iov);
-static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
- int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
- BdrvRequestFlags flags);
-static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
- int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
- BdrvRequestFlags flags);
-static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
- int64_t sector_num,
- QEMUIOVector *qiov,
- int nb_sectors,
- BdrvRequestFlags flags,
- BlockCompletionFunc *cb,
- void *opaque,
- bool is_write);
-static void coroutine_fn bdrv_co_do_rw(void *opaque);
-static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
- int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);
-
static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
QTAILQ_HEAD_INITIALIZER(bdrv_states);
@@ -97,10 +79,7 @@ static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
static QLIST_HEAD(, BlockDriver) bdrv_drivers =
QLIST_HEAD_INITIALIZER(bdrv_drivers);
-static void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
- int nr_sectors);
-static void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
- int nr_sectors);
+static void bdrv_dirty_bitmap_truncate(BlockDriverState *bs);
/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;
@@ -124,104 +103,6 @@ int is_windows_drive(const char *filename)
}
#endif
-/* throttling disk I/O limits */
-void bdrv_set_io_limits(BlockDriverState *bs,
- ThrottleConfig *cfg)
-{
- int i;
-
- throttle_config(&bs->throttle_state, cfg);
-
- for (i = 0; i < 2; i++) {
- qemu_co_enter_next(&bs->throttled_reqs[i]);
- }
-}
-
-/* this function drain all the throttled IOs */
-static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
-{
- bool drained = false;
- bool enabled = bs->io_limits_enabled;
- int i;
-
- bs->io_limits_enabled = false;
-
- for (i = 0; i < 2; i++) {
- while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
- drained = true;
- }
- }
-
- bs->io_limits_enabled = enabled;
-
- return drained;
-}
-
-void bdrv_io_limits_disable(BlockDriverState *bs)
-{
- bs->io_limits_enabled = false;
-
- bdrv_start_throttled_reqs(bs);
-
- throttle_destroy(&bs->throttle_state);
-}
-
-static void bdrv_throttle_read_timer_cb(void *opaque)
-{
- BlockDriverState *bs = opaque;
- qemu_co_enter_next(&bs->throttled_reqs[0]);
-}
-
-static void bdrv_throttle_write_timer_cb(void *opaque)
-{
- BlockDriverState *bs = opaque;
- qemu_co_enter_next(&bs->throttled_reqs[1]);
-}
-
-/* should be called before bdrv_set_io_limits if a limit is set */
-void bdrv_io_limits_enable(BlockDriverState *bs)
-{
- assert(!bs->io_limits_enabled);
- throttle_init(&bs->throttle_state,
- bdrv_get_aio_context(bs),
- QEMU_CLOCK_VIRTUAL,
- bdrv_throttle_read_timer_cb,
- bdrv_throttle_write_timer_cb,
- bs);
- bs->io_limits_enabled = true;
-}
-
-/* This function makes an IO wait if needed
- *
- * @nb_sectors: the number of sectors of the IO
- * @is_write: is the IO a write
- */
-static void bdrv_io_limits_intercept(BlockDriverState *bs,
- unsigned int bytes,
- bool is_write)
-{
- /* does this io must wait */
- bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);
-
- /* if must wait or any request of this type throttled queue the IO */
- if (must_wait ||
- !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
- qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
- }
-
- /* the IO will be executed, do the accounting */
- throttle_account(&bs->throttle_state, is_write, bytes);
-
-
- /* if the next request must wait -> do nothing */
- if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
- return;
- }
-
- /* else queue next request for execution */
- qemu_co_queue_next(&bs->throttled_reqs[is_write]);
-}
-
size_t bdrv_opt_mem_align(BlockDriverState *bs)
{
if (!bs || !bs->drv) {
@@ -335,20 +216,7 @@ void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz,
void bdrv_register(BlockDriver *bdrv)
{
- /* Block drivers without coroutine functions need emulation */
- if (!bdrv->bdrv_co_readv) {
- bdrv->bdrv_co_readv = bdrv_co_readv_em;
- bdrv->bdrv_co_writev = bdrv_co_writev_em;
-
- /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if
- * the block driver lacks aio we need to emulate that too.
- */
- if (!bdrv->bdrv_aio_readv) {
- /* add AIO emulation layer */
- bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
- bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
- }
- }
+ bdrv_setup_io_funcs(bdrv);
QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}
@@ -520,54 +388,6 @@ int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp)
return ret;
}
-void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
-{
- BlockDriver *drv = bs->drv;
- Error *local_err = NULL;
-
- memset(&bs->bl, 0, sizeof(bs->bl));
-
- if (!drv) {
- return;
- }
-
- /* Take some limits from the children as a default */
- if (bs->file) {
- bdrv_refresh_limits(bs->file, &local_err);
- if (local_err) {
- error_propagate(errp, local_err);
- return;
- }
- bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
- bs->bl.max_transfer_length = bs->file->bl.max_transfer_length;
- bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
- } else {
- bs->bl.opt_mem_alignment = 512;
- }
-
- if (bs->backing_hd) {
- bdrv_refresh_limits(bs->backing_hd, &local_err);
- if (local_err) {
- error_propagate(errp, local_err);
- return;
- }
- bs->bl.opt_transfer_length =
- MAX(bs->bl.opt_transfer_length,
- bs->backing_hd->bl.opt_transfer_length);
- bs->bl.max_transfer_length =
- MIN_NON_ZERO(bs->bl.max_transfer_length,
- bs->backing_hd->bl.max_transfer_length);
- bs->bl.opt_mem_alignment =
- MAX(bs->bl.opt_mem_alignment,
- bs->backing_hd->bl.opt_mem_alignment);
- }
-
- /* Then let the driver override it */
- if (drv->bdrv_refresh_limits) {
- drv->bdrv_refresh_limits(bs, errp);
- }
-}
-
/**
* Try to get @bs's logical and physical block size.
* On success, store them in @bsz struct and return 0.
@@ -841,22 +661,6 @@ int bdrv_parse_cache_flags(const char *mode, int *flags)
return 0;
}
-/**
- * The copy-on-read flag is actually a reference count so multiple users may
- * use the feature without worrying about clobbering its previous state.
- * Copy-on-read stays enabled until all users have called to disable it.
- */
-void bdrv_enable_copy_on_read(BlockDriverState *bs)
-{
- bs->copy_on_read++;
-}
-
-void bdrv_disable_copy_on_read(BlockDriverState *bs)
-{
- assert(bs->copy_on_read > 0);
- bs->copy_on_read--;
-}
-
/*
* Returns the flags that a temporary snapshot should get, based on the
* originally requested flags (the originally requested image will have flags
@@ -1224,8 +1028,8 @@ void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd)
bdrv_op_unblock_all(bs->backing_hd, bs->backing_blocker);
} else if (backing_hd) {
error_setg(&bs->backing_blocker,
- "device is used as backing hd of '%s'",
- bdrv_get_device_name(bs));
+ "node is used as backing hd of '%s'",
+ bdrv_get_device_or_node_name(bs));
}
bs->backing_hd = backing_hd;
@@ -1812,8 +1616,8 @@ int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
* to r/w */
if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
reopen_state->flags & BDRV_O_RDWR) {
- error_set(errp, QERR_DEVICE_IS_READ_ONLY,
- bdrv_get_device_name(reopen_state->bs));
+ error_setg(errp, "Node '%s' is read only",
+ bdrv_get_device_or_node_name(reopen_state->bs));
goto error;
}
@@ -1839,9 +1643,9 @@ int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
} else {
/* It is currently mandatory to have a bdrv_reopen_prepare()
* handler for each supported drv. */
- error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
- drv->format_name, bdrv_get_device_name(reopen_state->bs),
- "reopening of file");
+ error_setg(errp, "Block format '%s' used by node '%s' "
+ "does not support reopening files", drv->format_name,
+ bdrv_get_device_or_node_name(reopen_state->bs));
ret = -1;
goto error;
}
@@ -1966,86 +1770,6 @@ void bdrv_close_all(void)
}
}
-/* Check if any requests are in-flight (including throttled requests) */
-static bool bdrv_requests_pending(BlockDriverState *bs)
-{
- if (!QLIST_EMPTY(&bs->tracked_requests)) {
- return true;
- }
- if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
- return true;
- }
- if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
- return true;
- }
- if (bs->file && bdrv_requests_pending(bs->file)) {
- return true;
- }
- if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
- return true;
- }
- return false;
-}
-
-static bool bdrv_drain_one(BlockDriverState *bs)
-{
- bool bs_busy;
-
- bdrv_flush_io_queue(bs);
- bdrv_start_throttled_reqs(bs);
- bs_busy = bdrv_requests_pending(bs);
- bs_busy |= aio_poll(bdrv_get_aio_context(bs), bs_busy);
- return bs_busy;
-}
-
-/*
- * Wait for pending requests to complete on a single BlockDriverState subtree
- *
- * See the warning in bdrv_drain_all(). This function can only be called if
- * you are sure nothing can generate I/O because you have op blockers
- * installed.
- *
- * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
- * AioContext.
- */
-void bdrv_drain(BlockDriverState *bs)
-{
- while (bdrv_drain_one(bs)) {
- /* Keep iterating */
- }
-}
-
-/*
- * Wait for pending requests to complete across all BlockDriverStates
- *
- * This function does not flush data to disk, use bdrv_flush_all() for that
- * after calling this function.
- *
- * Note that completion of an asynchronous I/O operation can trigger any
- * number of other I/O operations on other devices---for example a coroutine
- * can be arbitrarily complex and a constant flow of I/O can come until the
- * coroutine is complete. Because of this, it is not possible to have a
- * function to drain a single device's I/O queue.
- */
-void bdrv_drain_all(void)
-{
- /* Always run first iteration so any pending completion BHs run */
- bool busy = true;
- BlockDriverState *bs;
-
- while (busy) {
- busy = false;
-
- QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
- AioContext *aio_context = bdrv_get_aio_context(bs);
-
- aio_context_acquire(aio_context);
- busy |= bdrv_drain_one(bs);
- aio_context_release(aio_context);
- }
- }
-}
-
/* make a BlockDriverState anonymous by removing from bdrv_state and
* graph_bdrv_state list.
Also, NULL terminate the device_name to prevent double remove */
@@ -2367,152 +2091,6 @@ int bdrv_commit_all(void)
return 0;
}
-/**
- * Remove an active request from the tracked requests list
- *
- * This function should be called when a tracked request is completing.
- */
-static void tracked_request_end(BdrvTrackedRequest *req)
-{
- if (req->serialising) {
- req->bs->serialising_in_flight--;
- }
-
- QLIST_REMOVE(req, list);
- qemu_co_queue_restart_all(&req->wait_queue);
-}
-
-/**
- * Add an active request to the tracked requests list
- */
-static void tracked_request_begin(BdrvTrackedRequest *req,
- BlockDriverState *bs,
- int64_t offset,
- unsigned int bytes, bool is_write)
-{
- *req = (BdrvTrackedRequest){
- .bs = bs,
- .offset = offset,
- .bytes = bytes,
- .is_write = is_write,
- .co = qemu_coroutine_self(),
- .serialising = false,
- .overlap_offset = offset,
- .overlap_bytes = bytes,
- };
-
- qemu_co_queue_init(&req->wait_queue);
-
- QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
-}
-
-static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
-{
- int64_t overlap_offset = req->offset & ~(align - 1);
- unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
- - overlap_offset;
-
- if (!req->serialising) {
- req->bs->serialising_in_flight++;
- req->serialising = true;
- }
-
- req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
- req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
-}
-
-/**
- * Round a region to cluster boundaries
- */
-void bdrv_round_to_clusters(BlockDriverState *bs,
- int64_t sector_num, int nb_sectors,
- int64_t *cluster_sector_num,
- int *cluster_nb_sectors)
-{
- BlockDriverInfo bdi;
-
- if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
- *cluster_sector_num = sector_num;
- *cluster_nb_sectors = nb_sectors;
- } else {
- int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
- *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
- *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
- nb_sectors, c);
- }
-}
-
-static int bdrv_get_cluster_size(BlockDriverState *bs)
-{
- BlockDriverInfo bdi;
- int ret;
-
- ret = bdrv_get_info(bs, &bdi);
- if (ret < 0 || bdi.cluster_size == 0) {
- return bs->request_alignment;
- } else {
- return bdi.cluster_size;
- }
-}
-
-static bool tracked_request_overlaps(BdrvTrackedRequest *req,
- int64_t offset, unsigned int bytes)
-{
- /* aaaa bbbb */
- if (offset >= req->overlap_offset + req->overlap_bytes) {
- return false;
- }
- /* bbbb aaaa */
- if (req->overlap_offset >= offset + bytes) {
- return false;
- }
- return true;
-}
-
-static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
-{
- BlockDriverState *bs = self->bs;
- BdrvTrackedRequest *req;
- bool retry;
- bool waited = false;
-
- if (!bs->serialising_in_flight) {
- return false;
- }
-
- do {
- retry = false;
- QLIST_FOREACH(req, &bs->tracked_requests, list) {
- if (req == self || (!req->serialising && !self->serialising)) {
- continue;
- }
- if (tracked_request_overlaps(req, self->overlap_offset,
- self->overlap_bytes))
- {
- /* Hitting this means there was a reentrant request, for
- * example, a block driver issuing nested requests. This must
- * never happen since it means deadlock.
- */
- assert(qemu_coroutine_self() != req->co);
-
- /* If the request is already (indirectly) waiting for us, or
- * will wait for us as soon as it wakes up, then just go on
- * (instead of producing a deadlock in the former case). */
- if (!req->waiting_for) {
- self->waiting_for = req;
- qemu_co_queue_wait(&req->wait_queue);
- self->waiting_for = NULL;
- retry = true;
- waited = true;
- break;
- }
- }
- }
- } while (retry);
-
- return waited;
-}
-
/*
* Return values:
* 0 - success
@@ -2681,879 +2259,6 @@ exit:
return ret;
}
-
-static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
- size_t size)
-{
- if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) {
- return -EIO;
- }
-
- if (!bdrv_is_inserted(bs)) {
- return -ENOMEDIUM;
- }
-
- if (offset < 0) {
- return -EIO;
- }
-
- return 0;
-}
-
-static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
- int nb_sectors)
-{
- if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
- return -EIO;
- }
-
- return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
- nb_sectors * BDRV_SECTOR_SIZE);
-}
-
-typedef struct RwCo {
- BlockDriverState *bs;
- int64_t offset;
- QEMUIOVector *qiov;
- bool is_write;
- int ret;
- BdrvRequestFlags flags;
-} RwCo;
-
-static void coroutine_fn bdrv_rw_co_entry(void *opaque)
-{
- RwCo *rwco = opaque;
-
- if (!rwco->is_write) {
- rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
- rwco->qiov->size, rwco->qiov,
- rwco->flags);
- } else {
- rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
- rwco->qiov->size, rwco->qiov,
- rwco->flags);
- }
-}
-
-/*
- * Process a vectored synchronous request using coroutines
- */
-static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
- QEMUIOVector *qiov, bool is_write,
- BdrvRequestFlags flags)
-{
- Coroutine *co;
- RwCo rwco = {
- .bs = bs,
- .offset = offset,
- .qiov = qiov,
- .is_write = is_write,
- .ret = NOT_DONE,
- .flags = flags,
- };
-
- /**
- * In sync call context, when the vcpu is blocked, this throttling timer
- * will not fire; so the I/O throttling function has to be disabled here
- * if it has been enabled.
- */
- if (bs->io_limits_enabled) {
- fprintf(stderr, "Disabling I/O throttling on '%s' due "
- "to synchronous I/O.\n", bdrv_get_device_name(bs));
- bdrv_io_limits_disable(bs);
- }
-
- if (qemu_in_coroutine()) {
- /* Fast-path if already in coroutine context */
- bdrv_rw_co_entry(&rwco);
- } else {
- AioContext *aio_context = bdrv_get_aio_context(bs);
-
- co = qemu_coroutine_create(bdrv_rw_co_entry);
- qemu_coroutine_enter(co, &rwco);
- while (rwco.ret == NOT_DONE) {
- aio_poll(aio_context, true);
- }
- }
- return rwco.ret;
-}
-
-/*
- * Process a synchronous request using coroutines
- */
-static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
- int nb_sectors, bool is_write, BdrvRequestFlags flags)
-{
- QEMUIOVector qiov;
- struct iovec iov = {
- .iov_base = (void *)buf,
- .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
- };
-
- if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
- return -EINVAL;
- }
-
- qemu_iovec_init_external(&qiov, &iov, 1);
- return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
- &qiov, is_write, flags);
-}
-
-/* return < 0 if error. See bdrv_write() for the return codes */
-int bdrv_read(BlockDriverState *bs, int64_t sector_num,
- uint8_t *buf, int nb_sectors)
-{
- return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
-}
-
-/* Just like bdrv_read(), but with I/O throttling temporarily disabled */
-int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
- uint8_t *buf, int nb_sectors)
-{
- bool enabled;
- int ret;
-
- enabled = bs->io_limits_enabled;
- bs->io_limits_enabled = false;
- ret = bdrv_read(bs, sector_num, buf, nb_sectors);
- bs->io_limits_enabled = enabled;
- return ret;
-}
-
-/* Return < 0 if error. Important errors are:
- -EIO generic I/O error (may happen for all errors)
- -ENOMEDIUM No media inserted.
- -EINVAL Invalid sector number or nb_sectors
- -EACCES Trying to write a read-only device
-*/
-int bdrv_write(BlockDriverState *bs, int64_t sector_num,
- const uint8_t *buf, int nb_sectors)
-{
- return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
-}
-
-int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
- int nb_sectors, BdrvRequestFlags flags)
-{
- return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
- BDRV_REQ_ZERO_WRITE | flags);
-}
-
-/*
- * Completely zero out a block device with the help of bdrv_write_zeroes.
- * The operation is sped up by checking the block status and only writing
- * zeroes to the device if they currently do not return zeroes. Optional
- * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
- *
- * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
- */
-int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
-{
- int64_t target_sectors, ret, nb_sectors, sector_num = 0;
- int n;
-
- target_sectors = bdrv_nb_sectors(bs);
- if (target_sectors < 0) {
- return target_sectors;
- }
-
- for (;;) {
- nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS);
- if (nb_sectors <= 0) {
- return 0;
- }
- ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
- if (ret < 0) {
- error_report("error getting block status at sector %" PRId64 ": %s",
- sector_num, strerror(-ret));
- return ret;
- }
- if (ret & BDRV_BLOCK_ZERO) {
- sector_num += n;
- continue;
- }
- ret = bdrv_write_zeroes(bs, sector_num, n, flags);
- if (ret < 0) {
- error_report("error writing zeroes at sector %" PRId64 ": %s",
- sector_num, strerror(-ret));
- return ret;
- }
- sector_num += n;
- }
-}
-
-int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
-{
- QEMUIOVector qiov;
- struct iovec iov = {
- .iov_base = (void *)buf,
- .iov_len = bytes,
- };
- int ret;
-
- if (bytes < 0) {
- return -EINVAL;
- }
-
- qemu_iovec_init_external(&qiov, &iov, 1);
- ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
- if (ret < 0) {
- return ret;
- }
-
- return bytes;
-}
-
-int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
-{
- int ret;
-
- ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
- if (ret < 0) {
- return ret;
- }
-
- return qiov->size;
-}
-
-int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
- const void *buf, int bytes)
-{
- QEMUIOVector qiov;
- struct iovec iov = {
- .iov_base = (void *) buf,
- .iov_len = bytes,
- };
-
- if (bytes < 0) {
- return -EINVAL;
- }
-
- qemu_iovec_init_external(&qiov, &iov, 1);
- return bdrv_pwritev(bs, offset, &qiov);
-}
-
-/*
- * Writes to the file and ensures that no writes are reordered across this
- * request (acts as a barrier)
- *
- * Returns 0 on success, -errno in error cases.
- */
-int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
- const void *buf, int count)
-{
- int ret;
-
- ret = bdrv_pwrite(bs, offset, buf, count);
- if (ret < 0) {
- return ret;
- }
-
- /* No flush needed for cache modes that already do it */
- if (bs->enable_write_cache) {
- bdrv_flush(bs);
- }
-
- return 0;
-}
-
-static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
- int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
-{
- /* Perform I/O through a temporary buffer so that users who scribble over
- * their read buffer while the operation is in progress do not end up
- * modifying the image file. This is critical for zero-copy guest I/O
- * where anything might happen inside guest memory.
- */
- void *bounce_buffer;
-
- BlockDriver *drv = bs->drv;
- struct iovec iov;
- QEMUIOVector bounce_qiov;
- int64_t cluster_sector_num;
- int cluster_nb_sectors;
- size_t skip_bytes;
- int ret;
-
- /* Cover entire cluster so no additional backing file I/O is required when
- * allocating cluster in the image file.
- */
- bdrv_round_to_clusters(bs, sector_num, nb_sectors,
- &cluster_sector_num, &cluster_nb_sectors);
-
- trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
- cluster_sector_num, cluster_nb_sectors);
-
- iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
- iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
- if (bounce_buffer == NULL) {
- ret = -ENOMEM;
- goto err;
- }
-
- qemu_iovec_init_external(&bounce_qiov, &iov, 1);
-
- ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
- &bounce_qiov);
- if (ret < 0) {
- goto err;
- }
-
- if (drv->bdrv_co_write_zeroes &&
- buffer_is_zero(bounce_buffer, iov.iov_len)) {
- ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
- cluster_nb_sectors, 0);
- } else {
- /* This does not change the data on the disk, it is not necessary
- * to flush even in cache=writethrough mode.
- */
- ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
- &bounce_qiov);
- }
-
- if (ret < 0) {
- /* It might be okay to ignore write errors for guest requests. If this
- * is a deliberate copy-on-read then we don't want to ignore the error.
- * Simply report it in all cases.
- */
- goto err;
- }
-
- skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
- qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
- nb_sectors * BDRV_SECTOR_SIZE);
-
-err:
- qemu_vfree(bounce_buffer);
- return ret;
-}
-
-/*
- * Forwards an already correctly aligned request to the BlockDriver. This
- * handles copy on read and zeroing after EOF; any other features must be
- * implemented by the caller.
- */
-static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
- BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
- int64_t align, QEMUIOVector *qiov, int flags)
-{
- BlockDriver *drv = bs->drv;
- int ret;
-
- int64_t sector_num = offset >> BDRV_SECTOR_BITS;
- unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
-
- assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
- assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
- assert(!qiov || bytes == qiov->size);
-
- /* Handle Copy on Read and associated serialisation */
- if (flags & BDRV_REQ_COPY_ON_READ) {
- /* If we touch the same cluster it counts as an overlap. This
- * guarantees that allocating writes will be serialized and not race
- * with each other for the same cluster. For example, in copy-on-read
- * it ensures that the CoR read and write operations are atomic and
- * guest writes cannot interleave between them. */
- mark_request_serialising(req, bdrv_get_cluster_size(bs));
- }
-
- wait_serialising_requests(req);
-
- if (flags & BDRV_REQ_COPY_ON_READ) {
- int pnum;
-
- ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
- if (ret < 0) {
- goto out;
- }
-
- if (!ret || pnum != nb_sectors) {
- ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
- goto out;
- }
- }
-
- /* Forward the request to the BlockDriver */
- if (!bs->zero_beyond_eof) {
- ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
- } else {
- /* Read zeros after EOF */
- int64_t total_sectors, max_nb_sectors;
-
- total_sectors = bdrv_nb_sectors(bs);
- if (total_sectors < 0) {
- ret = total_sectors;
- goto out;
- }
-
- max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
- align >> BDRV_SECTOR_BITS);
- if (nb_sectors < max_nb_sectors) {
- ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
- } else if (max_nb_sectors > 0) {
- QEMUIOVector local_qiov;
-
- qemu_iovec_init(&local_qiov, qiov->niov);
- qemu_iovec_concat(&local_qiov, qiov, 0,
- max_nb_sectors * BDRV_SECTOR_SIZE);
-
- ret = drv->bdrv_co_readv(bs, sector_num, max_nb_sectors,
- &local_qiov);
-
- qemu_iovec_destroy(&local_qiov);
- } else {
- ret = 0;
- }
-
- /* Reading beyond end of file is supposed to produce zeroes */
- if (ret == 0 && total_sectors < sector_num + nb_sectors) {
- uint64_t offset = MAX(0, total_sectors - sector_num);
- uint64_t bytes = (sector_num + nb_sectors - offset) *
- BDRV_SECTOR_SIZE;
- qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
- }
- }
-
-out:
- return ret;
-}
-
-static inline uint64_t bdrv_get_align(BlockDriverState *bs)
-{
- /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
- return MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
-}
-
-static inline bool bdrv_req_is_aligned(BlockDriverState *bs,
- int64_t offset, size_t bytes)
-{
- int64_t align = bdrv_get_align(bs);
- return !(offset & (align - 1) || (bytes & (align - 1)));
-}
-
-/*
- * Handle a read request in coroutine context
- */
-static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
- int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
- BdrvRequestFlags flags)
-{
- BlockDriver *drv = bs->drv;
- BdrvTrackedRequest req;
-
- uint64_t align = bdrv_get_align(bs);
- uint8_t *head_buf = NULL;
- uint8_t *tail_buf = NULL;
- QEMUIOVector local_qiov;
- bool use_local_qiov = false;
- int ret;
-
- if (!drv) {
- return -ENOMEDIUM;
- }
-
- ret = bdrv_check_byte_request(bs, offset, bytes);
- if (ret < 0) {
- return ret;
- }
-
- if (bs->copy_on_read) {
- flags |= BDRV_REQ_COPY_ON_READ;
- }
-
- /* throttling disk I/O */
- if (bs->io_limits_enabled) {
- bdrv_io_limits_intercept(bs, bytes, false);
- }
-
- /* Align read if necessary by padding qiov */
- if (offset & (align - 1)) {
- head_buf = qemu_blockalign(bs, align);
- qemu_iovec_init(&local_qiov, qiov->niov + 2);
- qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
- qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
- use_local_qiov = true;
-
- bytes += offset & (align - 1);
- offset = offset & ~(align - 1);
- }
-
- if ((offset + bytes) & (align - 1)) {
- if (!use_local_qiov) {
- qemu_iovec_init(&local_qiov, qiov->niov + 1);
- qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
- use_local_qiov = true;
- }
- tail_buf = qemu_blockalign(bs, align);
- qemu_iovec_add(&local_qiov, tail_buf,
- align - ((offset + bytes) & (align - 1)));
-
- bytes = ROUND_UP(bytes, align);
- }
-
- tracked_request_begin(&req, bs, offset, bytes, false);
- ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
- use_local_qiov ? &local_qiov : qiov,
- flags);
- tracked_request_end(&req);
-
- if (use_local_qiov) {
- qemu_iovec_destroy(&local_qiov);
- qemu_vfree(head_buf);
- qemu_vfree(tail_buf);
- }
-
- return ret;
-}
-
-static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
- int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
- BdrvRequestFlags flags)
-{
- if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
- return -EINVAL;
- }
-
- return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
- nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
-}
-
-int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
- int nb_sectors, QEMUIOVector *qiov)
-{
- trace_bdrv_co_readv(bs, sector_num, nb_sectors);
-
- return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
-}
-
-int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
- int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
-{
- trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
-
- return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
- BDRV_REQ_COPY_ON_READ);
-}
-
-#define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768
-
-static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
- int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
-{
- BlockDriver *drv = bs->drv;
- QEMUIOVector qiov;
- struct iovec iov = {0};
- int ret = 0;
-
- int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_write_zeroes,
- BDRV_REQUEST_MAX_SECTORS);
-
- while (nb_sectors > 0 && !ret) {
- int num = nb_sectors;
-
- /* Align request. Block drivers can expect the "bulk" of the request
- * to be aligned.
- */
- if (bs->bl.write_zeroes_alignment
- && num > bs->bl.write_zeroes_alignment) {
- if (sector_num % bs->bl.write_zeroes_alignment != 0) {
- /* Make a small request up to the first aligned sector. */
- num = bs->bl.write_zeroes_alignment;
- num -= sector_num % bs->bl.write_zeroes_alignment;
- } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
- /* Shorten the request to the last aligned sector. num cannot
- * underflow because num > bs->bl.write_zeroes_alignment.
- */
- num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
- }
- }
-
- /* limit request size */
- if (num > max_write_zeroes) {
- num = max_write_zeroes;
- }
-
- ret = -ENOTSUP;
- /* First try the efficient write zeroes operation */
- if (drv->bdrv_co_write_zeroes) {
- ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
- }
-
- if (ret == -ENOTSUP) {
- /* Fall back to bounce buffer if write zeroes is unsupported */
- int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length,
- MAX_WRITE_ZEROES_BOUNCE_BUFFER);
- num = MIN(num, max_xfer_len);
- iov.iov_len = num * BDRV_SECTOR_SIZE;
- if (iov.iov_base == NULL) {
- iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE);
- if (iov.iov_base == NULL) {
- ret = -ENOMEM;
- goto fail;
- }
- memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
- }
- qemu_iovec_init_external(&qiov, &iov, 1);
-
- ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
-
- /* Keep bounce buffer around if it is big enough for all
- * all future requests.
- */
- if (num < max_xfer_len) {
- qemu_vfree(iov.iov_base);
- iov.iov_base = NULL;
- }
- }
-
- sector_num += num;
- nb_sectors -= num;
- }
-
-fail:
- qemu_vfree(iov.iov_base);
- return ret;
-}
-
-/*
- * Forwards an already correctly aligned write request to the BlockDriver.
- */
-static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
- BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
- QEMUIOVector *qiov, int flags)
-{
- BlockDriver *drv = bs->drv;
- bool waited;
- int ret;
-
- int64_t sector_num = offset >> BDRV_SECTOR_BITS;
- unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
-
- assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
- assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
- assert(!qiov || bytes == qiov->size);
-
- waited = wait_serialising_requests(req);
- assert(!waited || !req->serialising);
- assert(req->overlap_offset <= offset);
- assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
-
- ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
-
- if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
- !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
- qemu_iovec_is_zero(qiov)) {
- flags |= BDRV_REQ_ZERO_WRITE;
- if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
- flags |= BDRV_REQ_MAY_UNMAP;
- }
- }
-
- if (ret < 0) {
- /* Do nothing, write notifier decided to fail this request */
- } else if (flags & BDRV_REQ_ZERO_WRITE) {
- BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
- ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
- } else {
- BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
- ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
- }
- BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);
-
- if (ret == 0 && !bs->enable_write_cache) {
- ret = bdrv_co_flush(bs);
- }
-
- bdrv_set_dirty(bs, sector_num, nb_sectors);
-
- block_acct_highest_sector(&bs->stats, sector_num, nb_sectors);
-
- if (ret >= 0) {
- bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
- }
-
- return ret;
-}
-
-/*
- * Handle a write request in coroutine context
- */
-static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
- int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
- BdrvRequestFlags flags)
-{
- BdrvTrackedRequest req;
- uint64_t align = bdrv_get_align(bs);
- uint8_t *head_buf = NULL;
- uint8_t *tail_buf = NULL;
- QEMUIOVector local_qiov;
- bool use_local_qiov = false;
- int ret;
-
- if (!bs->drv) {
- return -ENOMEDIUM;
- }
- if (bs->read_only) {
- return -EACCES;
- }
-
- ret = bdrv_check_byte_request(bs, offset, bytes);
- if (ret < 0) {
- return ret;
- }
-
- /* throttling disk I/O */
- if (bs->io_limits_enabled) {
- bdrv_io_limits_intercept(bs, bytes, true);
- }
-
- /*
- * Align write if necessary by performing a read-modify-write cycle.
- * Pad qiov with the read parts and be sure to have a tracked request not
- * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
- */
- tracked_request_begin(&req, bs, offset, bytes, true);
-
- if (offset & (align - 1)) {
- QEMUIOVector head_qiov;
- struct iovec head_iov;
-
- mark_request_serialising(&req, align);
- wait_serialising_requests(&req);
-
- head_buf = qemu_blockalign(bs, align);
- head_iov = (struct iovec) {
- .iov_base = head_buf,
- .iov_len = align,
- };
- qemu_iovec_init_external(&head_qiov, &head_iov, 1);
-
- BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
- ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
- align, &head_qiov, 0);
- if (ret < 0) {
- goto fail;
- }
- BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
-
- qemu_iovec_init(&local_qiov, qiov->niov + 2);
- qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
- qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
- use_local_qiov = true;
-
- bytes += offset & (align - 1);
- offset = offset & ~(align - 1);
- }
-
- if ((offset + bytes) & (align - 1)) {
- QEMUIOVector tail_qiov;
- struct iovec tail_iov;
- size_t tail_bytes;
- bool waited;
-
- mark_request_serialising(&req, align);
- waited = wait_serialising_requests(&req);
- assert(!waited || !use_local_qiov);
-
- tail_buf = qemu_blockalign(bs, align);
- tail_iov = (struct iovec) {
- .iov_base = tail_buf,
- .iov_len = align,
- };
- qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
-
- BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
- ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
- align, &tail_qiov, 0);
- if (ret < 0) {
- goto fail;
- }
- BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
-
- if (!use_local_qiov) {
- qemu_iovec_init(&local_qiov, qiov->niov + 1);
- qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
- use_local_qiov = true;
- }
-
- tail_bytes = (offset + bytes) & (align - 1);
- qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
-
- bytes = ROUND_UP(bytes, align);
- }
-
- if (use_local_qiov) {
- /* Local buffer may have non-zero data. */
- flags &= ~BDRV_REQ_ZERO_WRITE;
- }
- ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
- use_local_qiov ? &local_qiov : qiov,
- flags);
-
-fail:
- tracked_request_end(&req);
-
- if (use_local_qiov) {
- qemu_iovec_destroy(&local_qiov);
- }
- qemu_vfree(head_buf);
- qemu_vfree(tail_buf);
-
- return ret;
-}
-
-static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
- int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
- BdrvRequestFlags flags)
-{
- if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
- return -EINVAL;
- }
-
- return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
- nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
-}
-
-int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
- int nb_sectors, QEMUIOVector *qiov)
-{
- trace_bdrv_co_writev(bs, sector_num, nb_sectors);
-
- return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
-}
-
-int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
- int64_t sector_num, int nb_sectors,
- BdrvRequestFlags flags)
-{
- int ret;
-
- trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
-
- if (!(bs->open_flags & BDRV_O_UNMAP)) {
- flags &= ~BDRV_REQ_MAY_UNMAP;
- }
- if (bdrv_req_is_aligned(bs, sector_num << BDRV_SECTOR_BITS,
- nb_sectors << BDRV_SECTOR_BITS)) {
- ret = bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
- BDRV_REQ_ZERO_WRITE | flags);
- } else {
- uint8_t *buf;
- QEMUIOVector local_qiov;
- size_t bytes = nb_sectors << BDRV_SECTOR_BITS;
-
- buf = qemu_memalign(bdrv_opt_mem_align(bs), bytes);
- memset(buf, 0, bytes);
- qemu_iovec_init(&local_qiov, 1);
- qemu_iovec_add(&local_qiov, buf, bytes);
-
- ret = bdrv_co_do_writev(bs, sector_num, nb_sectors, &local_qiov,
- BDRV_REQ_ZERO_WRITE | flags);
- qemu_vfree(buf);
- }
- return ret;
-}
-
/**
* Truncate file to 'offset' bytes (needed only for file protocols)
*/
@@ -3571,6 +2276,7 @@ int bdrv_truncate(BlockDriverState *bs, int64_t offset)
ret = drv->bdrv_truncate(bs, offset);
if (ret == 0) {
ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
+ bdrv_dirty_bitmap_truncate(bs);
if (bs->blk) {
blk_dev_resize_cb(bs->blk);
}
@@ -3797,8 +2503,8 @@ void bdrv_add_key(BlockDriverState *bs, const char *key, Error **errp)
{
if (key) {
if (!bdrv_is_encrypted(bs)) {
- error_setg(errp, "Device '%s' is not encrypted",
- bdrv_get_device_name(bs));
+ error_setg(errp, "Node '%s' is not encrypted",
+ bdrv_get_device_or_node_name(bs));
} else if (bdrv_set_key(bs, key) < 0) {
error_set(errp, QERR_INVALID_PASSWORD);
}
@@ -3806,7 +2512,7 @@ void bdrv_add_key(BlockDriverState *bs, const char *key, Error **errp)
if (bdrv_key_required(bs)) {
error_set(errp, ERROR_CLASS_DEVICE_ENCRYPTED,
"'%s' (%s) is encrypted",
- bdrv_get_device_name(bs),
+ bdrv_get_device_or_node_name(bs),
bdrv_get_encrypted_filename(bs));
}
}
@@ -3870,15 +2576,20 @@ BlockDriverState *bdrv_find_node(const char *node_name)
}
/* Put this QMP function here so it can access the static graph_bdrv_states. */
-BlockDeviceInfoList *bdrv_named_nodes_list(void)
+BlockDeviceInfoList *bdrv_named_nodes_list(Error **errp)
{
BlockDeviceInfoList *list, *entry;
BlockDriverState *bs;
list = NULL;
QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
+ BlockDeviceInfo *info = bdrv_block_device_info(bs, errp);
+ if (!info) {
+ qapi_free_BlockDeviceInfoList(list);
+ return NULL;
+ }
entry = g_malloc0(sizeof(*entry));
- entry->value = bdrv_block_device_info(bs);
+ entry->value = info;
entry->next = list;
list = entry;
}
@@ -3953,29 +2664,18 @@ const char *bdrv_get_device_name(const BlockDriverState *bs)
return bs->blk ? blk_name(bs->blk) : "";
}
-int bdrv_get_flags(BlockDriverState *bs)
+/* This can be used to identify nodes that might not have a device
+ * name associated. Since node and device names live in the same
+ * namespace, the result is unambiguous. The exception is if both are
+ * absent, then this returns an empty (non-null) string. */
+const char *bdrv_get_device_or_node_name(const BlockDriverState *bs)
{
- return bs->open_flags;
+ return bs->blk ? blk_name(bs->blk) : bs->node_name;
}
-int bdrv_flush_all(void)
+int bdrv_get_flags(BlockDriverState *bs)
{
- BlockDriverState *bs;
- int result = 0;
-
- QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
- AioContext *aio_context = bdrv_get_aio_context(bs);
- int ret;
-
- aio_context_acquire(aio_context);
- ret = bdrv_flush(bs);
- if (ret < 0 && !result) {
- result = ret;
- }
- aio_context_release(aio_context);
- }
-
- return result;
+ return bs->open_flags;
}
int bdrv_has_zero_init_1(BlockDriverState *bs)
@@ -4030,222 +2730,6 @@ bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
return false;
}
-typedef struct BdrvCoGetBlockStatusData {
- BlockDriverState *bs;
- BlockDriverState *base;
- int64_t sector_num;
- int nb_sectors;
- int *pnum;
- int64_t ret;
- bool done;
-} BdrvCoGetBlockStatusData;
-
-/*
- * Returns the allocation status of the specified sectors.
- * Drivers not implementing the functionality are assumed to not support
- * backing files, hence all their sectors are reported as allocated.
- *
- * If 'sector_num' is beyond the end of the disk image the return value is 0
- * and 'pnum' is set to 0.
- *
- * 'pnum' is set to the number of sectors (including and immediately following
- * the specified sector) that are known to be in the same
- * allocated/unallocated state.
- *
- * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
- * beyond the end of the disk image it will be clamped.
- */
-static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
- int64_t sector_num,
- int nb_sectors, int *pnum)
-{
- int64_t total_sectors;
- int64_t n;
- int64_t ret, ret2;
-
- total_sectors = bdrv_nb_sectors(bs);
- if (total_sectors < 0) {
- return total_sectors;
- }
-
- if (sector_num >= total_sectors) {
- *pnum = 0;
- return 0;
- }
-
- n = total_sectors - sector_num;
- if (n < nb_sectors) {
- nb_sectors = n;
- }
-
- if (!bs->drv->bdrv_co_get_block_status) {
- *pnum = nb_sectors;
- ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
- if (bs->drv->protocol_name) {
- ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
- }
- return ret;
- }
-
- ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
- if (ret < 0) {
- *pnum = 0;
- return ret;
- }
-
- if (ret & BDRV_BLOCK_RAW) {
- assert(ret & BDRV_BLOCK_OFFSET_VALID);
- return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
- *pnum, pnum);
- }
-
- if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
- ret |= BDRV_BLOCK_ALLOCATED;
- }
-
- if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
- if (bdrv_unallocated_blocks_are_zero(bs)) {
- ret |= BDRV_BLOCK_ZERO;
- } else if (bs->backing_hd) {
- BlockDriverState *bs2 = bs->backing_hd;
- int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
- if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
- ret |= BDRV_BLOCK_ZERO;
- }
- }
- }
-
- if (bs->file &&
- (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
- (ret & BDRV_BLOCK_OFFSET_VALID)) {
- int file_pnum;
-
- ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
- *pnum, &file_pnum);
- if (ret2 >= 0) {
- /* Ignore errors. This is just providing extra information, it
- * is useful but not necessary.
- */
- if (!file_pnum) {
- /* !file_pnum indicates an offset at or beyond the EOF; it is
- * perfectly valid for the format block driver to point to such
- * offsets, so catch it and mark everything as zero */
- ret |= BDRV_BLOCK_ZERO;
- } else {
- /* Limit request to the range reported by the protocol driver */
- *pnum = file_pnum;
- ret |= (ret2 & BDRV_BLOCK_ZERO);
- }
- }
- }
-
- return ret;
-}
-
-/* Coroutine wrapper for bdrv_get_block_status() */
-static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
-{
- BdrvCoGetBlockStatusData *data = opaque;
- BlockDriverState *bs = data->bs;
-
- data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
- data->pnum);
- data->done = true;
-}
-
-/*
- * Synchronous wrapper around bdrv_co_get_block_status().
- *
- * See bdrv_co_get_block_status() for details.
- */
-int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
- int nb_sectors, int *pnum)
-{
- Coroutine *co;
- BdrvCoGetBlockStatusData data = {
- .bs = bs,
- .sector_num = sector_num,
- .nb_sectors = nb_sectors,
- .pnum = pnum,
- .done = false,
- };
-
- if (qemu_in_coroutine()) {
- /* Fast-path if already in coroutine context */
- bdrv_get_block_status_co_entry(&data);
- } else {
- AioContext *aio_context = bdrv_get_aio_context(bs);
-
- co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
- qemu_coroutine_enter(co, &data);
- while (!data.done) {
- aio_poll(aio_context, true);
- }
- }
- return data.ret;
-}
-
-int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
- int nb_sectors, int *pnum)
-{
- int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
- if (ret < 0) {
- return ret;
- }
- return !!(ret & BDRV_BLOCK_ALLOCATED);
-}
-
-/*
- * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
- *
- * Return true if the given sector is allocated in any image between
- * BASE and TOP (inclusive). BASE can be NULL to check if the given
- * sector is allocated in any image of the chain. Return false otherwise.
- *
- * 'pnum' is set to the number of sectors (including and immediately following
- * the specified sector) that are known to be in the same
- * allocated/unallocated state.
- *
- */
-int bdrv_is_allocated_above(BlockDriverState *top,
- BlockDriverState *base,
- int64_t sector_num,
- int nb_sectors, int *pnum)
-{
- BlockDriverState *intermediate;
- int ret, n = nb_sectors;
-
- intermediate = top;
- while (intermediate && intermediate != base) {
- int pnum_inter;
- ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
- &pnum_inter);
- if (ret < 0) {
- return ret;
- } else if (ret) {
- *pnum = pnum_inter;
- return 1;
- }
-
- /*
- * [sector_num, nb_sectors] is unallocated on top but intermediate
- * might have
- *
- * [sector_num+x, nr_sectors] allocated.
- */
- if (n > pnum_inter &&
- (intermediate == top ||
- sector_num + pnum_inter < intermediate->total_sectors)) {
- n = pnum_inter;
- }
-
- intermediate = intermediate->backing_hd;
- }
-
- *pnum = n;
- return 0;
-}
-
const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
{
if (bs->backing_hd && bs->backing_hd->encrypted)
@@ -4262,28 +2746,6 @@ void bdrv_get_backing_filename(BlockDriverState *bs,
pstrcpy(filename, filename_size, bs->backing_file);
}
-int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
- const uint8_t *buf, int nb_sectors)
-{
- BlockDriver *drv = bs->drv;
- int ret;
-
- if (!drv) {
- return -ENOMEDIUM;
- }
- if (!drv->bdrv_write_compressed) {
- return -ENOTSUP;
- }
- ret = bdrv_check_request(bs, sector_num, nb_sectors);
- if (ret < 0) {
- return ret;
- }
-
- assert(QLIST_EMPTY(&bs->dirty_bitmaps));
-
- return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
-}
-
int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
BlockDriver *drv = bs->drv;
@@ -4304,47 +2766,6 @@ ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
return NULL;
}
-int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
- int64_t pos, int size)
-{
- QEMUIOVector qiov;
- struct iovec iov = {
- .iov_base = (void *) buf,
- .iov_len = size,
- };
-
- qemu_iovec_init_external(&qiov, &iov, 1);
- return bdrv_writev_vmstate(bs, &qiov, pos);
-}
-
-int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
-{
- BlockDriver *drv = bs->drv;
-
- if (!drv) {
- return -ENOMEDIUM;
- } else if (drv->bdrv_save_vmstate) {
- return drv->bdrv_save_vmstate(bs, qiov, pos);
- } else if (bs->file) {
- return bdrv_writev_vmstate(bs->file, qiov, pos);
- }
-
- return -ENOTSUP;
-}
-
-int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
- int64_t pos, int size)
-{
- BlockDriver *drv = bs->drv;
- if (!drv)
- return -ENOMEDIUM;
- if (drv->bdrv_load_vmstate)
- return drv->bdrv_load_vmstate(bs, buf, pos, size);
- if (bs->file)
- return bdrv_load_vmstate(bs->file, buf, pos, size);
- return -ENOTSUP;
-}
-
void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
{
if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
@@ -4491,452 +2912,6 @@ int bdrv_get_backing_file_depth(BlockDriverState *bs)
return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
}
-/**************************************************************/
-/* async I/Os */
-
-BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
- QEMUIOVector *qiov, int nb_sectors,
- BlockCompletionFunc *cb, void *opaque)
-{
- trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
-
- return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
- cb, opaque, false);
-}
-
-BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
- QEMUIOVector *qiov, int nb_sectors,
- BlockCompletionFunc *cb, void *opaque)
-{
- trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
-
- return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
- cb, opaque, true);
-}
-
-BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
- int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
- BlockCompletionFunc *cb, void *opaque)
-{
- trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
-
- return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
- BDRV_REQ_ZERO_WRITE | flags,
- cb, opaque, true);
-}
-
-
-typedef struct MultiwriteCB {
- int error;
- int num_requests;
- int num_callbacks;
- struct {
- BlockCompletionFunc *cb;
- void *opaque;
- QEMUIOVector *free_qiov;
- } callbacks[];
-} MultiwriteCB;
-
-static void multiwrite_user_cb(MultiwriteCB *mcb)
-{
- int i;
-
- for (i = 0; i < mcb->num_callbacks; i++) {
- mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
- if (mcb->callbacks[i].free_qiov) {
- qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
- }
- g_free(mcb->callbacks[i].free_qiov);
- }
-}
-
-static void multiwrite_cb(void *opaque, int ret)
-{
- MultiwriteCB *mcb = opaque;
-
- trace_multiwrite_cb(mcb, ret);
-
- if (ret < 0 && !mcb->error) {
- mcb->error = ret;
- }
-
- mcb->num_requests--;
- if (mcb->num_requests == 0) {
- multiwrite_user_cb(mcb);
- g_free(mcb);
- }
-}
-
-static int multiwrite_req_compare(const void *a, const void *b)
-{
- const BlockRequest *req1 = a, *req2 = b;
-
- /*
- * Note that we can't simply subtract req2->sector from req1->sector
- * here as that could overflow the return value.
- */
- if (req1->sector > req2->sector) {
- return 1;
- } else if (req1->sector < req2->sector) {
- return -1;
- } else {
- return 0;
- }
-}
-
-/*
- * Takes a bunch of requests and tries to merge them. Returns the number of
- * requests that remain after merging.
- */
-static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
- int num_reqs, MultiwriteCB *mcb)
-{
- int i, outidx;
-
- // Sort requests by start sector
- qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
-
- // Check if adjacent requests touch the same clusters. If so, combine them,
- // filling up gaps with zero sectors.
- outidx = 0;
- for (i = 1; i < num_reqs; i++) {
- int merge = 0;
- int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
-
- // Handle exactly sequential writes and overlapping writes.
- if (reqs[i].sector <= oldreq_last) {
- merge = 1;
- }
-
- if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
- merge = 0;
- }
-
- if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors +
- reqs[i].nb_sectors > bs->bl.max_transfer_length) {
- merge = 0;
- }
-
- if (merge) {
- size_t size;
- QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
- qemu_iovec_init(qiov,
- reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
-
- // Add the first request to the merged one. If the requests are
- // overlapping, drop the last sectors of the first request.
- size = (reqs[i].sector - reqs[outidx].sector) << 9;
- qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
-
- // We should need to add any zeros between the two requests
- assert (reqs[i].sector <= oldreq_last);
-
- // Add the second request
- qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
-
- // Add tail of first request, if necessary
- if (qiov->size < reqs[outidx].qiov->size) {
- qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size,
- reqs[outidx].qiov->size - qiov->size);
- }
-
- reqs[outidx].nb_sectors = qiov->size >> 9;
- reqs[outidx].qiov = qiov;
-
- mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
- } else {
- outidx++;
- reqs[outidx].sector = reqs[i].sector;
- reqs[outidx].nb_sectors = reqs[i].nb_sectors;
- reqs[outidx].qiov = reqs[i].qiov;
- }
- }
-
- block_acct_merge_done(&bs->stats, BLOCK_ACCT_WRITE, num_reqs - outidx - 1);
-
- return outidx + 1;
-}
-
-/*
- * Submit multiple AIO write requests at once.
- *
- * On success, the function returns 0 and all requests in the reqs array have
- * been submitted. In error case this function returns -1, and any of the
- * requests may or may not be submitted yet. In particular, this means that the
- * callback will be called for some of the requests, for others it won't. The
- * caller must check the error field of the BlockRequest to wait for the right
- * callbacks (if error != 0, no callback will be called).
- *
- * The implementation may modify the contents of the reqs array, e.g. to merge
- * requests. However, the fields opaque and error are left unmodified as they
- * are used to signal failure for a single request to the caller.
- */
-int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
-{
- MultiwriteCB *mcb;
- int i;
-
- /* don't submit writes if we don't have a medium */
- if (bs->drv == NULL) {
- for (i = 0; i < num_reqs; i++) {
- reqs[i].error = -ENOMEDIUM;
- }
- return -1;
- }
-
- if (num_reqs == 0) {
- return 0;
- }
-
- // Create MultiwriteCB structure
- mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
- mcb->num_requests = 0;
- mcb->num_callbacks = num_reqs;
-
- for (i = 0; i < num_reqs; i++) {
- mcb->callbacks[i].cb = reqs[i].cb;
- mcb->callbacks[i].opaque = reqs[i].opaque;
- }
-
- // Check for mergable requests
- num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
-
- trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
-
- /* Run the aio requests. */
- mcb->num_requests = num_reqs;
- for (i = 0; i < num_reqs; i++) {
- bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
- reqs[i].nb_sectors, reqs[i].flags,
- multiwrite_cb, mcb,
- true);
- }
-
- return 0;
-}
-
-void bdrv_aio_cancel(BlockAIOCB *acb)
-{
- qemu_aio_ref(acb);
- bdrv_aio_cancel_async(acb);
- while (acb->refcnt > 1) {
- if (acb->aiocb_info->get_aio_context) {
- aio_poll(acb->aiocb_info->get_aio_context(acb), true);
- } else if (acb->bs) {
- aio_poll(bdrv_get_aio_context(acb->bs), true);
- } else {
- abort();
- }
- }
- qemu_aio_unref(acb);
-}
-
-/* Async version of aio cancel. The caller is not blocked if the acb implements
- * cancel_async, otherwise we do nothing and let the request normally complete.
- * In either case the completion callback must be called. */
-void bdrv_aio_cancel_async(BlockAIOCB *acb)
-{
- if (acb->aiocb_info->cancel_async) {
- acb->aiocb_info->cancel_async(acb);
- }
-}
-
-/**************************************************************/
-/* async block device emulation */
-
-typedef struct BlockAIOCBSync {
- BlockAIOCB common;
- QEMUBH *bh;
- int ret;
- /* vector translation state */
- QEMUIOVector *qiov;
- uint8_t *bounce;
- int is_write;
-} BlockAIOCBSync;
-
-static const AIOCBInfo bdrv_em_aiocb_info = {
- .aiocb_size = sizeof(BlockAIOCBSync),
-};
-
-static void bdrv_aio_bh_cb(void *opaque)
-{
- BlockAIOCBSync *acb = opaque;
-
- if (!acb->is_write && acb->ret >= 0) {
- qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
- }
- qemu_vfree(acb->bounce);
- acb->common.cb(acb->common.opaque, acb->ret);
- qemu_bh_delete(acb->bh);
- acb->bh = NULL;
- qemu_aio_unref(acb);
-}
-
-static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
- int64_t sector_num,
- QEMUIOVector *qiov,
- int nb_sectors,
- BlockCompletionFunc *cb,
- void *opaque,
- int is_write)
-
-{
- BlockAIOCBSync *acb;
-
- acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
- acb->is_write = is_write;
- acb->qiov = qiov;
- acb->bounce = qemu_try_blockalign(bs, qiov->size);
- acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);
-
- if (acb->bounce == NULL) {
- acb->ret = -ENOMEM;
- } else if (is_write) {
- qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
- acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
- } else {
- acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
- }
-
- qemu_bh_schedule(acb->bh);
-
- return &acb->common;
-}
-
-static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
- int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
- BlockCompletionFunc *cb, void *opaque)
-{
- return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
-}
-
-static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
- int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
- BlockCompletionFunc *cb, void *opaque)
-{
- return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
-}
-
-
-typedef struct BlockAIOCBCoroutine {
- BlockAIOCB common;
- BlockRequest req;
- bool is_write;
- bool *done;
- QEMUBH* bh;
-} BlockAIOCBCoroutine;
-
-static const AIOCBInfo bdrv_em_co_aiocb_info = {
- .aiocb_size = sizeof(BlockAIOCBCoroutine),
-};
-
-static void bdrv_co_em_bh(void *opaque)
-{
- BlockAIOCBCoroutine *acb = opaque;
-
- acb->common.cb(acb->common.opaque, acb->req.error);
-
- qemu_bh_delete(acb->bh);
- qemu_aio_unref(acb);
-}
-
-/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
-static void coroutine_fn bdrv_co_do_rw(void *opaque)
-{
- BlockAIOCBCoroutine *acb = opaque;
- BlockDriverState *bs = acb->common.bs;
-
- if (!acb->is_write) {
- acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
- acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
- } else {
- acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
- acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
- }
-
- acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
- qemu_bh_schedule(acb->bh);
-}
-
-static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
- int64_t sector_num,
- QEMUIOVector *qiov,
- int nb_sectors,
- BdrvRequestFlags flags,
- BlockCompletionFunc *cb,
- void *opaque,
- bool is_write)
-{
- Coroutine *co;
- BlockAIOCBCoroutine *acb;
-
- acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
- acb->req.sector = sector_num;
- acb->req.nb_sectors = nb_sectors;
- acb->req.qiov = qiov;
- acb->req.flags = flags;
- acb->is_write = is_write;
-
- co = qemu_coroutine_create(bdrv_co_do_rw);
- qemu_coroutine_enter(co, acb);
-
- return &acb->common;
-}
-
-static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
-{
- BlockAIOCBCoroutine *acb = opaque;
- BlockDriverState *bs = acb->common.bs;
-
- acb->req.error = bdrv_co_flush(bs);
- acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
- qemu_bh_schedule(acb->bh);
-}
-
-BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
- BlockCompletionFunc *cb, void *opaque)
-{
- trace_bdrv_aio_flush(bs, opaque);
-
- Coroutine *co;
- BlockAIOCBCoroutine *acb;
-
- acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
-
- co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
- qemu_coroutine_enter(co, acb);
-
- return &acb->common;
-}
-
-static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
-{
- BlockAIOCBCoroutine *acb = opaque;
- BlockDriverState *bs = acb->common.bs;
-
- acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
- acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
- qemu_bh_schedule(acb->bh);
-}
-
-BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs,
- int64_t sector_num, int nb_sectors,
- BlockCompletionFunc *cb, void *opaque)
-{
- Coroutine *co;
- BlockAIOCBCoroutine *acb;
-
- trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
-
- acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
- acb->req.sector = sector_num;
- acb->req.nb_sectors = nb_sectors;
- co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
- qemu_coroutine_enter(co, acb);
-
- return &acb->common;
-}
-
void bdrv_init(void)
{
module_call_init(MODULE_INIT_BLOCK);
@@ -4948,161 +2923,6 @@ void bdrv_init_with_whitelist(void)
bdrv_init();
}
-void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
- BlockCompletionFunc *cb, void *opaque)
-{
- BlockAIOCB *acb;
-
- acb = g_slice_alloc(aiocb_info->aiocb_size);
- acb->aiocb_info = aiocb_info;
- acb->bs = bs;
- acb->cb = cb;
- acb->opaque = opaque;
- acb->refcnt = 1;
- return acb;
-}
-
-void qemu_aio_ref(void *p)
-{
- BlockAIOCB *acb = p;
- acb->refcnt++;
-}
-
-void qemu_aio_unref(void *p)
-{
- BlockAIOCB *acb = p;
- assert(acb->refcnt > 0);
- if (--acb->refcnt == 0) {
- g_slice_free1(acb->aiocb_info->aiocb_size, acb);
- }
-}
-
-/**************************************************************/
-/* Coroutine block device emulation */
-
-typedef struct CoroutineIOCompletion {
- Coroutine *coroutine;
- int ret;
-} CoroutineIOCompletion;
-
-static void bdrv_co_io_em_complete(void *opaque, int ret)
-{
- CoroutineIOCompletion *co = opaque;
-
- co->ret = ret;
- qemu_coroutine_enter(co->coroutine, NULL);
-}
-
-static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
- int nb_sectors, QEMUIOVector *iov,
- bool is_write)
-{
- CoroutineIOCompletion co = {
- .coroutine = qemu_coroutine_self(),
- };
- BlockAIOCB *acb;
-
- if (is_write) {
- acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
- bdrv_co_io_em_complete, &co);
- } else {
- acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
- bdrv_co_io_em_complete, &co);
- }
-
- trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
- if (!acb) {
- return -EIO;
- }
- qemu_coroutine_yield();
-
- return co.ret;
-}
-
-static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
- int64_t sector_num, int nb_sectors,
- QEMUIOVector *iov)
-{
- return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
-}
-
-static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
- int64_t sector_num, int nb_sectors,
- QEMUIOVector *iov)
-{
- return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
-}
-
-static void coroutine_fn bdrv_flush_co_entry(void *opaque)
-{
- RwCo *rwco = opaque;
-
- rwco->ret = bdrv_co_flush(rwco->bs);
-}
-
-int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
-{
- int ret;
-
- if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
- return 0;
- }
-
- /* Write back cached data to the OS even with cache=unsafe */
- BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
- if (bs->drv->bdrv_co_flush_to_os) {
- ret = bs->drv->bdrv_co_flush_to_os(bs);
- if (ret < 0) {
- return ret;
- }
- }
-
- /* But don't actually force it to the disk with cache=unsafe */
- if (bs->open_flags & BDRV_O_NO_FLUSH) {
- goto flush_parent;
- }
-
- BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
- if (bs->drv->bdrv_co_flush_to_disk) {
- ret = bs->drv->bdrv_co_flush_to_disk(bs);
- } else if (bs->drv->bdrv_aio_flush) {
- BlockAIOCB *acb;
- CoroutineIOCompletion co = {
- .coroutine = qemu_coroutine_self(),
- };
-
- acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
- if (acb == NULL) {
- ret = -EIO;
- } else {
- qemu_coroutine_yield();
- ret = co.ret;
- }
- } else {
- /*
- * Some block drivers always operate in either writethrough or unsafe
- * mode and don't support bdrv_flush therefore. Usually qemu doesn't
- * know how the server works (because the behaviour is hardcoded or
- * depends on server-side configuration), so we can't ensure that
- * everything is safe on disk. Returning an error doesn't work because
- * that would break guests even if the server operates in writethrough
- * mode.
- *
- * Let's hope the user knows what he's doing.
- */
- ret = 0;
- }
- if (ret < 0) {
- return ret;
- }
-
- /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
- * in the case of cache=unsafe, so there are no useless flushes.
- */
-flush_parent:
- return bdrv_co_flush(bs->file);
-}
-
void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
{
Error *local_err = NULL;
@@ -5152,143 +2972,6 @@ void bdrv_invalidate_cache_all(Error **errp)
}
}
-int bdrv_flush(BlockDriverState *bs)
-{
- Coroutine *co;
- RwCo rwco = {
- .bs = bs,
- .ret = NOT_DONE,
- };
-
- if (qemu_in_coroutine()) {
- /* Fast-path if already in coroutine context */
- bdrv_flush_co_entry(&rwco);
- } else {
- AioContext *aio_context = bdrv_get_aio_context(bs);
-
- co = qemu_coroutine_create(bdrv_flush_co_entry);
- qemu_coroutine_enter(co, &rwco);
- while (rwco.ret == NOT_DONE) {
- aio_poll(aio_context, true);
- }
- }
-
- return rwco.ret;
-}
-
-typedef struct DiscardCo {
- BlockDriverState *bs;
- int64_t sector_num;
- int nb_sectors;
- int ret;
-} DiscardCo;
-static void coroutine_fn bdrv_discard_co_entry(void *opaque)
-{
- DiscardCo *rwco = opaque;
-
- rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
-}
-
-int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
- int nb_sectors)
-{
- int max_discard, ret;
-
- if (!bs->drv) {
- return -ENOMEDIUM;
- }
-
- ret = bdrv_check_request(bs, sector_num, nb_sectors);
- if (ret < 0) {
- return ret;
- } else if (bs->read_only) {
- return -EROFS;
- }
-
- bdrv_reset_dirty(bs, sector_num, nb_sectors);
-
- /* Do nothing if disabled. */
- if (!(bs->open_flags & BDRV_O_UNMAP)) {
- return 0;
- }
-
- if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
- return 0;
- }
-
- max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS);
- while (nb_sectors > 0) {
- int ret;
- int num = nb_sectors;
-
- /* align request */
- if (bs->bl.discard_alignment &&
- num >= bs->bl.discard_alignment &&
- sector_num % bs->bl.discard_alignment) {
- if (num > bs->bl.discard_alignment) {
- num = bs->bl.discard_alignment;
- }
- num -= sector_num % bs->bl.discard_alignment;
- }
-
- /* limit request size */
- if (num > max_discard) {
- num = max_discard;
- }
-
- if (bs->drv->bdrv_co_discard) {
- ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
- } else {
- BlockAIOCB *acb;
- CoroutineIOCompletion co = {
- .coroutine = qemu_coroutine_self(),
- };
-
- acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
- bdrv_co_io_em_complete, &co);
- if (acb == NULL) {
- return -EIO;
- } else {
- qemu_coroutine_yield();
- ret = co.ret;
- }
- }
- if (ret && ret != -ENOTSUP) {
- return ret;
- }
-
- sector_num += num;
- nb_sectors -= num;
- }
- return 0;
-}
-
-int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
-{
- Coroutine *co;
- DiscardCo rwco = {
- .bs = bs,
- .sector_num = sector_num,
- .nb_sectors = nb_sectors,
- .ret = NOT_DONE,
- };
-
- if (qemu_in_coroutine()) {
- /* Fast-path if already in coroutine context */
- bdrv_discard_co_entry(&rwco);
- } else {
- AioContext *aio_context = bdrv_get_aio_context(bs);
-
- co = qemu_coroutine_create(bdrv_discard_co_entry);
- qemu_coroutine_enter(co, &rwco);
- while (rwco.ret == NOT_DONE) {
- aio_poll(aio_context, true);
- }
- }
-
- return rwco.ret;
-}
-
/**************************************************************/
/* removable device support */
@@ -5354,107 +3037,171 @@ void bdrv_lock_medium(BlockDriverState *bs, bool locked)
}
}
-/* needed for generic scsi interface */
-
-int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
+void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
{
- BlockDriver *drv = bs->drv;
-
- if (drv && drv->bdrv_ioctl)
- return drv->bdrv_ioctl(bs, req, buf);
- return -ENOTSUP;
+ bs->guest_block_size = align;
}
-BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
- unsigned long int req, void *buf,
- BlockCompletionFunc *cb, void *opaque)
+BdrvDirtyBitmap *bdrv_find_dirty_bitmap(BlockDriverState *bs, const char *name)
{
- BlockDriver *drv = bs->drv;
+ BdrvDirtyBitmap *bm;
- if (drv && drv->bdrv_aio_ioctl)
- return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
+ assert(name);
+ QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
+ if (bm->name && !strcmp(name, bm->name)) {
+ return bm;
+ }
+ }
return NULL;
}
-void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
+void bdrv_dirty_bitmap_make_anon(BdrvDirtyBitmap *bitmap)
{
- bs->guest_block_size = align;
+ assert(!bdrv_dirty_bitmap_frozen(bitmap));
+ g_free(bitmap->name);
+ bitmap->name = NULL;
+}
+
+BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs,
+ uint32_t granularity,
+ const char *name,
+ Error **errp)
+{
+ int64_t bitmap_size;
+ BdrvDirtyBitmap *bitmap;
+ uint32_t sector_granularity;
+
+ assert((granularity & (granularity - 1)) == 0);
+
+ if (name && bdrv_find_dirty_bitmap(bs, name)) {
+ error_setg(errp, "Bitmap already exists: %s", name);
+ return NULL;
+ }
+ sector_granularity = granularity >> BDRV_SECTOR_BITS;
+ assert(sector_granularity);
+ bitmap_size = bdrv_nb_sectors(bs);
+ if (bitmap_size < 0) {
+ error_setg_errno(errp, -bitmap_size, "could not get length of device");
+ errno = -bitmap_size;
+ return NULL;
+ }
+ bitmap = g_new0(BdrvDirtyBitmap, 1);
+ bitmap->bitmap = hbitmap_alloc(bitmap_size, ctz32(sector_granularity));
+ bitmap->size = bitmap_size;
+ bitmap->name = g_strdup(name);
+ bitmap->disabled = false;
+ QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
+ return bitmap;
}
-void *qemu_blockalign(BlockDriverState *bs, size_t size)
+bool bdrv_dirty_bitmap_frozen(BdrvDirtyBitmap *bitmap)
{
- return qemu_memalign(bdrv_opt_mem_align(bs), size);
+ return bitmap->successor;
}
-void *qemu_blockalign0(BlockDriverState *bs, size_t size)
+bool bdrv_dirty_bitmap_enabled(BdrvDirtyBitmap *bitmap)
{
- return memset(qemu_blockalign(bs, size), 0, size);
+ return !(bitmap->disabled || bitmap->successor);
}
-void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
+/**
+ * Create a successor bitmap destined to replace this bitmap after an operation.
+ * Requires that the bitmap is not frozen and has no successor.
+ */
+int bdrv_dirty_bitmap_create_successor(BlockDriverState *bs,
+ BdrvDirtyBitmap *bitmap, Error **errp)
{
- size_t align = bdrv_opt_mem_align(bs);
+ uint64_t granularity;
+ BdrvDirtyBitmap *child;
+
+ if (bdrv_dirty_bitmap_frozen(bitmap)) {
+ error_setg(errp, "Cannot create a successor for a bitmap that is "
+ "currently frozen");
+ return -1;
+ }
+ assert(!bitmap->successor);
- /* Ensure that NULL is never returned on success */
- assert(align > 0);
- if (size == 0) {
- size = align;
+ /* Create an anonymous successor */
+ granularity = bdrv_dirty_bitmap_granularity(bitmap);
+ child = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp);
+ if (!child) {
+ return -1;
}
- return qemu_try_memalign(align, size);
+ /* Successor will be on or off based on our current state. */
+ child->disabled = bitmap->disabled;
+
+ /* Install the successor and freeze the parent */
+ bitmap->successor = child;
+ return 0;
}
-void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
+/**
+ * For a bitmap with a successor, yield our name to the successor,
+ * delete the old bitmap, and return a handle to the new bitmap.
+ */
+BdrvDirtyBitmap *bdrv_dirty_bitmap_abdicate(BlockDriverState *bs,
+ BdrvDirtyBitmap *bitmap,
+ Error **errp)
{
- void *mem = qemu_try_blockalign(bs, size);
+ char *name;
+ BdrvDirtyBitmap *successor = bitmap->successor;
- if (mem) {
- memset(mem, 0, size);
+ if (successor == NULL) {
+ error_setg(errp, "Cannot relinquish control if "
+ "there's no successor present");
+ return NULL;
}
- return mem;
+ name = bitmap->name;
+ bitmap->name = NULL;
+ successor->name = name;
+ bitmap->successor = NULL;
+ bdrv_release_dirty_bitmap(bs, bitmap);
+
+ return successor;
}
-/*
- * Check if all memory in this vector is sector aligned.
+/**
+ * In cases of failure where we can no longer safely delete the parent,
+ * we may wish to re-join the parent and child/successor.
+ * The merged parent will be un-frozen, but not explicitly re-enabled.
*/
-bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
+BdrvDirtyBitmap *bdrv_reclaim_dirty_bitmap(BlockDriverState *bs,
+ BdrvDirtyBitmap *parent,
+ Error **errp)
{
- int i;
- size_t alignment = bdrv_opt_mem_align(bs);
+ BdrvDirtyBitmap *successor = parent->successor;
- for (i = 0; i < qiov->niov; i++) {
- if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
- return false;
- }
- if (qiov->iov[i].iov_len % alignment) {
- return false;
- }
+ if (!successor) {
+ error_setg(errp, "Cannot reclaim a successor when none is present");
+ return NULL;
}
- return true;
+ if (!hbitmap_merge(parent->bitmap, successor->bitmap)) {
+ error_setg(errp, "Merging of parent and successor bitmap failed");
+ return NULL;
+ }
+ bdrv_release_dirty_bitmap(bs, successor);
+ parent->successor = NULL;
+
+ return parent;
}
-BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity,
- Error **errp)
+/**
+ * Truncates _all_ bitmaps attached to a BDS.
+ */
+static void bdrv_dirty_bitmap_truncate(BlockDriverState *bs)
{
- int64_t bitmap_size;
BdrvDirtyBitmap *bitmap;
+ uint64_t size = bdrv_nb_sectors(bs);
- assert((granularity & (granularity - 1)) == 0);
-
- granularity >>= BDRV_SECTOR_BITS;
- assert(granularity);
- bitmap_size = bdrv_nb_sectors(bs);
- if (bitmap_size < 0) {
- error_setg_errno(errp, -bitmap_size, "could not get length of device");
- errno = -bitmap_size;
- return NULL;
+ QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
+ if (bdrv_dirty_bitmap_frozen(bitmap)) {
+ continue;
+ }
+ hbitmap_truncate(bitmap->bitmap, size);
}
- bitmap = g_new0(BdrvDirtyBitmap, 1);
- bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
- QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
- return bitmap;
}
void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
@@ -5462,14 +3209,28 @@ void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
BdrvDirtyBitmap *bm, *next;
QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
if (bm == bitmap) {
+ assert(!bdrv_dirty_bitmap_frozen(bm));
QLIST_REMOVE(bitmap, list);
hbitmap_free(bitmap->bitmap);
+ g_free(bitmap->name);
g_free(bitmap);
return;
}
}
}
+void bdrv_disable_dirty_bitmap(BdrvDirtyBitmap *bitmap)
+{
+ assert(!bdrv_dirty_bitmap_frozen(bitmap));
+ bitmap->disabled = true;
+}
+
+void bdrv_enable_dirty_bitmap(BdrvDirtyBitmap *bitmap)
+{
+ assert(!bdrv_dirty_bitmap_frozen(bitmap));
+ bitmap->disabled = false;
+}
+
BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
{
BdrvDirtyBitmap *bm;
@@ -5479,9 +3240,11 @@ BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
BlockDirtyInfo *info = g_new0(BlockDirtyInfo, 1);
BlockDirtyInfoList *entry = g_new0(BlockDirtyInfoList, 1);
- info->count = bdrv_get_dirty_count(bs, bm);
- info->granularity =
- ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
+ info->count = bdrv_get_dirty_count(bm);
+ info->granularity = bdrv_dirty_bitmap_granularity(bm);
+ info->has_name = !!bm->name;
+ info->name = g_strdup(bm->name);
+ info->frozen = bdrv_dirty_bitmap_frozen(bm);
entry->value = info;
*plist = entry;
plist = &entry->next;
@@ -5499,43 +3262,90 @@ int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector
}
}
-void bdrv_dirty_iter_init(BlockDriverState *bs,
- BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
+/**
+ * Chooses a default granularity based on the existing cluster size,
+ * but clamped between [4K, 64K]. Defaults to 64K in the case that there
+ * is no cluster size information available.
+ */
+uint32_t bdrv_get_default_bitmap_granularity(BlockDriverState *bs)
+{
+ BlockDriverInfo bdi;
+ uint32_t granularity;
+
+ if (bdrv_get_info(bs, &bdi) >= 0 && bdi.cluster_size > 0) {
+ granularity = MAX(4096, bdi.cluster_size);
+ granularity = MIN(65536, granularity);
+ } else {
+ granularity = 65536;
+ }
+
+ return granularity;
+}
+
+uint32_t bdrv_dirty_bitmap_granularity(BdrvDirtyBitmap *bitmap)
+{
+ return BDRV_SECTOR_SIZE << hbitmap_granularity(bitmap->bitmap);
+}
+
+void bdrv_dirty_iter_init(BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
{
hbitmap_iter_init(hbi, bitmap->bitmap, 0);
}
-void bdrv_set_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap,
+void bdrv_set_dirty_bitmap(BdrvDirtyBitmap *bitmap,
int64_t cur_sector, int nr_sectors)
{
+ assert(bdrv_dirty_bitmap_enabled(bitmap));
hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
}
-void bdrv_reset_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap,
+void bdrv_reset_dirty_bitmap(BdrvDirtyBitmap *bitmap,
int64_t cur_sector, int nr_sectors)
{
+ assert(bdrv_dirty_bitmap_enabled(bitmap));
hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
}
-static void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
- int nr_sectors)
+void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap)
+{
+ assert(bdrv_dirty_bitmap_enabled(bitmap));
+ hbitmap_reset(bitmap->bitmap, 0, bitmap->size);
+}
+
+void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
+ int nr_sectors)
{
BdrvDirtyBitmap *bitmap;
QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
+ if (!bdrv_dirty_bitmap_enabled(bitmap)) {
+ continue;
+ }
hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
}
}
-static void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
- int nr_sectors)
+void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
+ int nr_sectors)
{
BdrvDirtyBitmap *bitmap;
QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
+ if (!bdrv_dirty_bitmap_enabled(bitmap)) {
+ continue;
+ }
hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
}
}
-int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
+/**
+ * Advance an HBitmapIter to an arbitrary offset.
+ */
+void bdrv_set_dirty_iter(HBitmapIter *hbi, int64_t offset)
+{
+ assert(hbi->hb);
+ hbitmap_iter_init(hbi, hbi->hb, offset);
+}
+
+int64_t bdrv_get_dirty_count(BdrvDirtyBitmap *bitmap)
{
return hbitmap_count(bitmap->bitmap);
}
@@ -5572,8 +3382,8 @@ bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp)
if (!QLIST_EMPTY(&bs->op_blockers[op])) {
blocker = QLIST_FIRST(&bs->op_blockers[op]);
if (errp) {
- error_setg(errp, "Device '%s' is busy: %s",
- bdrv_get_device_name(bs),
+ error_setg(errp, "Node '%s' is busy: %s",
+ bdrv_get_device_or_node_name(bs),
error_get_pretty(blocker->reason));
}
return true;
@@ -5953,12 +3763,6 @@ void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
abort();
}
-void bdrv_add_before_write_notifier(BlockDriverState *bs,
- NotifierWithReturn *notifier)
-{
- notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
-}
-
int bdrv_amend_options(BlockDriverState *bs, QemuOpts *opts,
BlockDriverAmendStatusCB *status_cb)
{
@@ -6059,36 +3863,6 @@ out:
return to_replace_bs;
}
-void bdrv_io_plug(BlockDriverState *bs)
-{
- BlockDriver *drv = bs->drv;
- if (drv && drv->bdrv_io_plug) {
- drv->bdrv_io_plug(bs);
- } else if (bs->file) {
- bdrv_io_plug(bs->file);
- }
-}
-
-void bdrv_io_unplug(BlockDriverState *bs)
-{
- BlockDriver *drv = bs->drv;
- if (drv && drv->bdrv_io_unplug) {
- drv->bdrv_io_unplug(bs);
- } else if (bs->file) {
- bdrv_io_unplug(bs->file);
- }
-}
-
-void bdrv_flush_io_queue(BlockDriverState *bs)
-{
- BlockDriver *drv = bs->drv;
- if (drv && drv->bdrv_flush_io_queue) {
- drv->bdrv_flush_io_queue(bs);
- } else if (bs->file) {
- bdrv_flush_io_queue(bs->file);
- }
-}
-
static bool append_open_options(QDict *d, BlockDriverState *bs)
{
const QDictEntry *entry;
diff --git a/block/Makefile.objs b/block/Makefile.objs
index db2933e469..0d8c2a4ab6 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -1,4 +1,4 @@
-block-obj-y += raw_bsd.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o
+block-obj-y += raw_bsd.o qcow.o vdi.o vmdk.o cloop.o bochs.o vpc.o vvfat.o
block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
block-obj-y += qed-check.o
@@ -9,7 +9,7 @@ block-obj-y += block-backend.o snapshot.o qapi.o
block-obj-$(CONFIG_WIN32) += raw-win32.o win32-aio.o
block-obj-$(CONFIG_POSIX) += raw-posix.o
block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
-block-obj-y += null.o mirror.o
+block-obj-y += null.o mirror.o io.o
block-obj-y += nbd.o nbd-client.o sheepdog.o
block-obj-$(CONFIG_LIBISCSI) += iscsi.o
@@ -37,6 +37,7 @@ gluster.o-libs := $(GLUSTERFS_LIBS)
ssh.o-cflags := $(LIBSSH2_CFLAGS)
ssh.o-libs := $(LIBSSH2_LIBS)
archipelago.o-libs := $(ARCHIPELAGO_LIBS)
+block-obj-m += dmg.o
dmg.o-libs := $(BZIP2_LIBS)
qcow.o-libs := -lz
linux-aio.o-libs := -laio
diff --git a/block/backup.c b/block/backup.c
index 1c535b1ab9..d3f648ddd7 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -37,6 +37,8 @@ typedef struct CowRequest {
typedef struct BackupBlockJob {
BlockJob common;
BlockDriverState *target;
+ /* bitmap for sync=dirty-bitmap */
+ BdrvDirtyBitmap *sync_bitmap;
MirrorSyncMode sync_mode;
RateLimit limit;
BlockdevOnError on_source_error;
@@ -242,6 +244,91 @@ static void backup_complete(BlockJob *job, void *opaque)
g_free(data);
}
+static bool coroutine_fn yield_and_check(BackupBlockJob *job)
+{
+ if (block_job_is_cancelled(&job->common)) {
+ return true;
+ }
+
+ /* we need to yield so that bdrv_drain_all() returns.
+ * (without this, the VM does not reboot)
+ */
+ if (job->common.speed) {
+ uint64_t delay_ns = ratelimit_calculate_delay(&job->limit,
+ job->sectors_read);
+ job->sectors_read = 0;
+ block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, delay_ns);
+ } else {
+ block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, 0);
+ }
+
+ if (block_job_is_cancelled(&job->common)) {
+ return true;
+ }
+
+ return false;
+}
+
+static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
+{
+ bool error_is_read;
+ int ret = 0;
+ int clusters_per_iter;
+ uint32_t granularity;
+ int64_t sector;
+ int64_t cluster;
+ int64_t end;
+ int64_t last_cluster = -1;
+ BlockDriverState *bs = job->common.bs;
+ HBitmapIter hbi;
+
+ granularity = bdrv_dirty_bitmap_granularity(job->sync_bitmap);
+ clusters_per_iter = MAX((granularity / BACKUP_CLUSTER_SIZE), 1);
+ bdrv_dirty_iter_init(job->sync_bitmap, &hbi);
+
+ /* Find the next dirty sector(s) */
+ while ((sector = hbitmap_iter_next(&hbi)) != -1) {
+ cluster = sector / BACKUP_SECTORS_PER_CLUSTER;
+
+ /* Fake progress updates for any clusters we skipped */
+ if (cluster != last_cluster + 1) {
+ job->common.offset += ((cluster - last_cluster - 1) *
+ BACKUP_CLUSTER_SIZE);
+ }
+
+ for (end = cluster + clusters_per_iter; cluster < end; cluster++) {
+ do {
+ if (yield_and_check(job)) {
+ return ret;
+ }
+ ret = backup_do_cow(bs, cluster * BACKUP_SECTORS_PER_CLUSTER,
+ BACKUP_SECTORS_PER_CLUSTER, &error_is_read);
+ if ((ret < 0) &&
+ backup_error_action(job, error_is_read, -ret) ==
+ BLOCK_ERROR_ACTION_REPORT) {
+ return ret;
+ }
+ } while (ret < 0);
+ }
+
+ /* If the bitmap granularity is smaller than the backup granularity,
+ * we need to advance the iterator pointer to the next cluster. */
+ if (granularity < BACKUP_CLUSTER_SIZE) {
+ bdrv_set_dirty_iter(&hbi, cluster * BACKUP_SECTORS_PER_CLUSTER);
+ }
+
+ last_cluster = cluster - 1;
+ }
+
+ /* Play some final catchup with the progress meter */
+ end = DIV_ROUND_UP(job->common.len, BACKUP_CLUSTER_SIZE);
+ if (last_cluster + 1 < end) {
+ job->common.offset += ((end - last_cluster - 1) * BACKUP_CLUSTER_SIZE);
+ }
+
+ return ret;
+}
+
static void coroutine_fn backup_run(void *opaque)
{
BackupBlockJob *job = opaque;
@@ -259,8 +346,7 @@ static void coroutine_fn backup_run(void *opaque)
qemu_co_rwlock_init(&job->flush_rwlock);
start = 0;
- end = DIV_ROUND_UP(job->common.len / BDRV_SECTOR_SIZE,
- BACKUP_SECTORS_PER_CLUSTER);
+ end = DIV_ROUND_UP(job->common.len, BACKUP_CLUSTER_SIZE);
job->bitmap = hbitmap_alloc(end, 0);
@@ -278,28 +364,13 @@ static void coroutine_fn backup_run(void *opaque)
qemu_coroutine_yield();
job->common.busy = true;
}
+ } else if (job->sync_mode == MIRROR_SYNC_MODE_DIRTY_BITMAP) {
+ ret = backup_run_incremental(job);
} else {
/* Both FULL and TOP SYNC_MODE's require copying.. */
for (; start < end; start++) {
bool error_is_read;
-
- if (block_job_is_cancelled(&job->common)) {
- break;
- }
-
- /* we need to yield so that qemu_aio_flush() returns.
- * (without, VM does not reboot)
- */
- if (job->common.speed) {
- uint64_t delay_ns = ratelimit_calculate_delay(
- &job->limit, job->sectors_read);
- job->sectors_read = 0;
- block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, delay_ns);
- } else {
- block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, 0);
- }
-
- if (block_job_is_cancelled(&job->common)) {
+ if (yield_and_check(job)) {
break;
}
@@ -357,6 +428,18 @@ static void coroutine_fn backup_run(void *opaque)
qemu_co_rwlock_wrlock(&job->flush_rwlock);
qemu_co_rwlock_unlock(&job->flush_rwlock);
+ if (job->sync_bitmap) {
+ BdrvDirtyBitmap *bm;
+ if (ret < 0) {
+ /* Merge the successor back into the parent, delete nothing. */
+ bm = bdrv_reclaim_dirty_bitmap(bs, job->sync_bitmap, NULL);
+ assert(bm);
+ } else {
+ /* Everything is fine, delete this bitmap and install the backup. */
+ bm = bdrv_dirty_bitmap_abdicate(bs, job->sync_bitmap, NULL);
+ assert(bm);
+ }
+ }
hbitmap_free(job->bitmap);
bdrv_iostatus_disable(target);
@@ -369,6 +452,7 @@ static void coroutine_fn backup_run(void *opaque)
void backup_start(BlockDriverState *bs, BlockDriverState *target,
int64_t speed, MirrorSyncMode sync_mode,
+ BdrvDirtyBitmap *sync_bitmap,
BlockdevOnError on_source_error,
BlockdevOnError on_target_error,
BlockCompletionFunc *cb, void *opaque,
@@ -412,17 +496,36 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target,
return;
}
+ if (sync_mode == MIRROR_SYNC_MODE_DIRTY_BITMAP) {
+ if (!sync_bitmap) {
+ error_setg(errp, "must provide a valid bitmap name for "
+ "\"dirty-bitmap\" sync mode");
+ return;
+ }
+
+ /* Create a new bitmap, and freeze/disable this one. */
+ if (bdrv_dirty_bitmap_create_successor(bs, sync_bitmap, errp) < 0) {
+ return;
+ }
+ } else if (sync_bitmap) {
+ error_setg(errp,
+ "a sync_bitmap was provided to backup_run, "
+ "but received an incompatible sync_mode (%s)",
+ MirrorSyncMode_lookup[sync_mode]);
+ return;
+ }
+
len = bdrv_getlength(bs);
if (len < 0) {
error_setg_errno(errp, -len, "unable to get length for '%s'",
bdrv_get_device_name(bs));
- return;
+ goto error;
}
BackupBlockJob *job = block_job_create(&backup_job_driver, bs, speed,
cb, opaque, errp);
if (!job) {
- return;
+ goto error;
}
bdrv_op_block_all(target, job->common.blocker);
@@ -431,7 +534,15 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target,
job->on_target_error = on_target_error;
job->target = target;
job->sync_mode = sync_mode;
+ job->sync_bitmap = sync_mode == MIRROR_SYNC_MODE_DIRTY_BITMAP ?
+ sync_bitmap : NULL;
job->common.len = len;
job->common.co = qemu_coroutine_create(backup_run);
qemu_coroutine_enter(job->common.co, job);
+ return;
+
+ error:
+ if (sync_bitmap) {
+ bdrv_reclaim_dirty_bitmap(bs, sync_bitmap, NULL);
+ }
}
diff --git a/block/blkdebug.c b/block/blkdebug.c
index 63611e0a33..3c30edba73 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -721,6 +721,11 @@ static int64_t blkdebug_getlength(BlockDriverState *bs)
return bdrv_getlength(bs->file);
}
+static int blkdebug_truncate(BlockDriverState *bs, int64_t offset)
+{
+ return bdrv_truncate(bs->file, offset);
+}
+
static void blkdebug_refresh_filename(BlockDriverState *bs)
{
QDict *opts;
@@ -779,6 +784,7 @@ static BlockDriver bdrv_blkdebug = {
.bdrv_file_open = blkdebug_open,
.bdrv_close = blkdebug_close,
.bdrv_getlength = blkdebug_getlength,
+ .bdrv_truncate = blkdebug_truncate,
.bdrv_refresh_filename = blkdebug_refresh_filename,
.bdrv_aio_readv = blkdebug_aio_readv,
diff --git a/block/block-backend.c b/block/block-backend.c
index 48b6e4c05c..93e46f376a 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -515,6 +515,17 @@ int blk_write(BlockBackend *blk, int64_t sector_num, const uint8_t *buf,
return bdrv_write(blk->bs, sector_num, buf, nb_sectors);
}
+int blk_write_zeroes(BlockBackend *blk, int64_t sector_num,
+ int nb_sectors, BdrvRequestFlags flags)
+{
+ int ret = blk_check_request(blk, sector_num, nb_sectors);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return bdrv_write_zeroes(blk->bs, sector_num, nb_sectors, flags);
+}
+
static void error_callback_bh(void *opaque)
{
struct BlockBackendAIOCB *acb = opaque;
diff --git a/block/io.c b/block/io.c
new file mode 100644
index 0000000000..1ce62c4fbc
--- /dev/null
+++ b/block/io.c
@@ -0,0 +1,2540 @@
+/*
+ * Block layer I/O functions
+ *
+ * Copyright (c) 2003 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "trace.h"
+#include "sysemu/qtest.h"
+#include "block/blockjob.h"
+#include "block/block_int.h"
+
+#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
+
+static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
+ int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+ BlockCompletionFunc *cb, void *opaque);
+static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
+ int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+ BlockCompletionFunc *cb, void *opaque);
+static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors,
+ QEMUIOVector *iov);
+static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors,
+ QEMUIOVector *iov);
+static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
+ int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
+ BdrvRequestFlags flags);
+static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
+ int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
+ BdrvRequestFlags flags);
+static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
+ int64_t sector_num,
+ QEMUIOVector *qiov,
+ int nb_sectors,
+ BdrvRequestFlags flags,
+ BlockCompletionFunc *cb,
+ void *opaque,
+ bool is_write);
+static void coroutine_fn bdrv_co_do_rw(void *opaque);
+static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);
+
+/* throttling disk I/O limits */
+void bdrv_set_io_limits(BlockDriverState *bs,
+ ThrottleConfig *cfg)
+{
+ int i;
+
+ throttle_config(&bs->throttle_state, cfg);
+
+ for (i = 0; i < 2; i++) {
+ qemu_co_enter_next(&bs->throttled_reqs[i]);
+ }
+}
+
+/* This function drains all the throttled I/Os */
+static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
+{
+ bool drained = false;
+ bool enabled = bs->io_limits_enabled;
+ int i;
+
+ bs->io_limits_enabled = false;
+
+ for (i = 0; i < 2; i++) {
+ while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
+ drained = true;
+ }
+ }
+
+ bs->io_limits_enabled = enabled;
+
+ return drained;
+}
+
+void bdrv_io_limits_disable(BlockDriverState *bs)
+{
+ bs->io_limits_enabled = false;
+
+ bdrv_start_throttled_reqs(bs);
+
+ throttle_destroy(&bs->throttle_state);
+}
+
+static void bdrv_throttle_read_timer_cb(void *opaque)
+{
+ BlockDriverState *bs = opaque;
+ qemu_co_enter_next(&bs->throttled_reqs[0]);
+}
+
+static void bdrv_throttle_write_timer_cb(void *opaque)
+{
+ BlockDriverState *bs = opaque;
+ qemu_co_enter_next(&bs->throttled_reqs[1]);
+}
+
+/* should be called before bdrv_set_io_limits if a limit is set */
+void bdrv_io_limits_enable(BlockDriverState *bs)
+{
+ int clock_type = QEMU_CLOCK_REALTIME;
+
+ if (qtest_enabled()) {
+ /* For testing block IO throttling only */
+ clock_type = QEMU_CLOCK_VIRTUAL;
+ }
+ assert(!bs->io_limits_enabled);
+ throttle_init(&bs->throttle_state,
+ bdrv_get_aio_context(bs),
+ clock_type,
+ bdrv_throttle_read_timer_cb,
+ bdrv_throttle_write_timer_cb,
+ bs);
+ bs->io_limits_enabled = true;
+}
+
+/* This function makes an IO wait if needed
+ *
+ * @bytes: the number of bytes of the IO
+ * @is_write: is the IO a write
+ */
+static void bdrv_io_limits_intercept(BlockDriverState *bs,
+ unsigned int bytes,
+ bool is_write)
+{
+ /* must this IO wait? */
+ bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);
+
+ /* queue the IO if it must wait or others of its type are queued */
+ if (must_wait ||
+ !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
+ qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
+ }
+
+ /* the IO will be executed, do the accounting */
+ throttle_account(&bs->throttle_state, is_write, bytes);
+
+
+ /* if the next request must wait -> do nothing */
+ if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
+ return;
+ }
+
+ /* else queue next request for execution */
+ qemu_co_queue_next(&bs->throttled_reqs[is_write]);
+}
+
+void bdrv_setup_io_funcs(BlockDriver *bdrv)
+{
+ /* Block drivers without coroutine functions need emulation */
+ if (!bdrv->bdrv_co_readv) {
+ bdrv->bdrv_co_readv = bdrv_co_readv_em;
+ bdrv->bdrv_co_writev = bdrv_co_writev_em;
+
+ /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
+ * the block driver lacks aio we need to emulate that too.
+ */
+ if (!bdrv->bdrv_aio_readv) {
+ /* add AIO emulation layer */
+ bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
+ bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
+ }
+ }
+}
+
+void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
+{
+ BlockDriver *drv = bs->drv;
+ Error *local_err = NULL;
+
+ memset(&bs->bl, 0, sizeof(bs->bl));
+
+ if (!drv) {
+ return;
+ }
+
+ /* Take some limits from the children as a default */
+ if (bs->file) {
+ bdrv_refresh_limits(bs->file, &local_err);
+ if (local_err) {
+ error_propagate(errp, local_err);
+ return;
+ }
+ bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
+ bs->bl.max_transfer_length = bs->file->bl.max_transfer_length;
+ bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
+ } else {
+ bs->bl.opt_mem_alignment = 512;
+ }
+
+ if (bs->backing_hd) {
+ bdrv_refresh_limits(bs->backing_hd, &local_err);
+ if (local_err) {
+ error_propagate(errp, local_err);
+ return;
+ }
+ bs->bl.opt_transfer_length =
+ MAX(bs->bl.opt_transfer_length,
+ bs->backing_hd->bl.opt_transfer_length);
+ bs->bl.max_transfer_length =
+ MIN_NON_ZERO(bs->bl.max_transfer_length,
+ bs->backing_hd->bl.max_transfer_length);
+ bs->bl.opt_mem_alignment =
+ MAX(bs->bl.opt_mem_alignment,
+ bs->backing_hd->bl.opt_mem_alignment);
+ }
+
+ /* Then let the driver override it */
+ if (drv->bdrv_refresh_limits) {
+ drv->bdrv_refresh_limits(bs, errp);
+ }
+}
+
+/**
+ * The copy-on-read flag is actually a reference count so multiple users may
+ * use the feature without worrying about clobbering its previous state.
+ * Copy-on-read stays enabled until all users have called to disable it.
+ */
+void bdrv_enable_copy_on_read(BlockDriverState *bs)
+{
+ bs->copy_on_read++;
+}
+
+void bdrv_disable_copy_on_read(BlockDriverState *bs)
+{
+ assert(bs->copy_on_read > 0);
+ bs->copy_on_read--;
+}
+
+/* Check if any requests are in-flight (including throttled requests) */
+static bool bdrv_requests_pending(BlockDriverState *bs)
+{
+ if (!QLIST_EMPTY(&bs->tracked_requests)) {
+ return true;
+ }
+ if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
+ return true;
+ }
+ if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
+ return true;
+ }
+ if (bs->file && bdrv_requests_pending(bs->file)) {
+ return true;
+ }
+ if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
+ return true;
+ }
+ return false;
+}
+
+static bool bdrv_drain_one(BlockDriverState *bs)
+{
+ bool bs_busy;
+
+ bdrv_flush_io_queue(bs);
+ bdrv_start_throttled_reqs(bs);
+ bs_busy = bdrv_requests_pending(bs);
+ bs_busy |= aio_poll(bdrv_get_aio_context(bs), bs_busy);
+ return bs_busy;
+}
+
+/*
+ * Wait for pending requests to complete on a single BlockDriverState subtree
+ *
+ * See the warning in bdrv_drain_all(). This function can only be called if
+ * you are sure nothing can generate I/O because you have op blockers
+ * installed.
+ *
+ * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
+ * AioContext.
+ */
+void bdrv_drain(BlockDriverState *bs)
+{
+ while (bdrv_drain_one(bs)) {
+ /* Keep iterating */
+ }
+}
+
+/*
+ * Wait for pending requests to complete across all BlockDriverStates
+ *
+ * This function does not flush data to disk, use bdrv_flush_all() for that
+ * after calling this function.
+ *
+ * Note that completion of an asynchronous I/O operation can trigger any
+ * number of other I/O operations on other devices---for example a coroutine
+ * can be arbitrarily complex and a constant flow of I/O can come until the
+ * coroutine is complete. Because of this, it is not possible to have a
+ * function to drain a single device's I/O queue.
+ */
+void bdrv_drain_all(void)
+{
+ /* Always run first iteration so any pending completion BHs run */
+ bool busy = true;
+ BlockDriverState *bs = NULL;
+
+ while ((bs = bdrv_next(bs))) {
+ AioContext *aio_context = bdrv_get_aio_context(bs);
+
+ aio_context_acquire(aio_context);
+ if (bs->job) {
+ block_job_pause(bs->job);
+ }
+ aio_context_release(aio_context);
+ }
+
+ while (busy) {
+ busy = false;
+ bs = NULL;
+
+ while ((bs = bdrv_next(bs))) {
+ AioContext *aio_context = bdrv_get_aio_context(bs);
+
+ aio_context_acquire(aio_context);
+ busy |= bdrv_drain_one(bs);
+ aio_context_release(aio_context);
+ }
+ }
+
+ bs = NULL;
+ while ((bs = bdrv_next(bs))) {
+ AioContext *aio_context = bdrv_get_aio_context(bs);
+
+ aio_context_acquire(aio_context);
+ if (bs->job) {
+ block_job_resume(bs->job);
+ }
+ aio_context_release(aio_context);
+ }
+}
+
+/**
+ * Remove an active request from the tracked requests list
+ *
+ * This function should be called when a tracked request is completing.
+ */
+static void tracked_request_end(BdrvTrackedRequest *req)
+{
+ if (req->serialising) {
+ req->bs->serialising_in_flight--;
+ }
+
+ QLIST_REMOVE(req, list);
+ qemu_co_queue_restart_all(&req->wait_queue);
+}
+
+/**
+ * Add an active request to the tracked requests list
+ */
+static void tracked_request_begin(BdrvTrackedRequest *req,
+ BlockDriverState *bs,
+ int64_t offset,
+ unsigned int bytes, bool is_write)
+{
+ *req = (BdrvTrackedRequest){
+ .bs = bs,
+ .offset = offset,
+ .bytes = bytes,
+ .is_write = is_write,
+ .co = qemu_coroutine_self(),
+ .serialising = false,
+ .overlap_offset = offset,
+ .overlap_bytes = bytes,
+ };
+
+ qemu_co_queue_init(&req->wait_queue);
+
+ QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
+}
+
+static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
+{
+ int64_t overlap_offset = req->offset & ~(align - 1);
+ unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
+ - overlap_offset;
+
+ if (!req->serialising) {
+ req->bs->serialising_in_flight++;
+ req->serialising = true;
+ }
+
+ req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
+ req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
+}
+
+/**
+ * Round a region to cluster boundaries
+ */
+void bdrv_round_to_clusters(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors,
+ int64_t *cluster_sector_num,
+ int *cluster_nb_sectors)
+{
+ BlockDriverInfo bdi;
+
+ if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
+ *cluster_sector_num = sector_num;
+ *cluster_nb_sectors = nb_sectors;
+ } else {
+ int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
+ *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
+ *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
+ nb_sectors, c);
+ }
+}
+
+static int bdrv_get_cluster_size(BlockDriverState *bs)
+{
+ BlockDriverInfo bdi;
+ int ret;
+
+ ret = bdrv_get_info(bs, &bdi);
+ if (ret < 0 || bdi.cluster_size == 0) {
+ return bs->request_alignment;
+ } else {
+ return bdi.cluster_size;
+ }
+}
+
+static bool tracked_request_overlaps(BdrvTrackedRequest *req,
+ int64_t offset, unsigned int bytes)
+{
+ /* aaaa bbbb */
+ if (offset >= req->overlap_offset + req->overlap_bytes) {
+ return false;
+ }
+ /* bbbb aaaa */
+ if (req->overlap_offset >= offset + bytes) {
+ return false;
+ }
+ return true;
+}
+
+static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
+{
+ BlockDriverState *bs = self->bs;
+ BdrvTrackedRequest *req;
+ bool retry;
+ bool waited = false;
+
+ if (!bs->serialising_in_flight) {
+ return false;
+ }
+
+ do {
+ retry = false;
+ QLIST_FOREACH(req, &bs->tracked_requests, list) {
+ if (req == self || (!req->serialising && !self->serialising)) {
+ continue;
+ }
+ if (tracked_request_overlaps(req, self->overlap_offset,
+ self->overlap_bytes))
+ {
+ /* Hitting this means there was a reentrant request, for
+ * example, a block driver issuing nested requests. This must
+ * never happen since it means deadlock.
+ */
+ assert(qemu_coroutine_self() != req->co);
+
+ /* If the request is already (indirectly) waiting for us, or
+ * will wait for us as soon as it wakes up, then just go on
+ * (instead of producing a deadlock in the former case). */
+ if (!req->waiting_for) {
+ self->waiting_for = req;
+ qemu_co_queue_wait(&req->wait_queue);
+ self->waiting_for = NULL;
+ retry = true;
+ waited = true;
+ break;
+ }
+ }
+ }
+ } while (retry);
+
+ return waited;
+}
+
+static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
+ size_t size)
+{
+ if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) {
+ return -EIO;
+ }
+
+ if (!bdrv_is_inserted(bs)) {
+ return -ENOMEDIUM;
+ }
+
+ if (offset < 0) {
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors)
+{
+ if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
+ return -EIO;
+ }
+
+ return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
+ nb_sectors * BDRV_SECTOR_SIZE);
+}
+
+typedef struct RwCo {
+ BlockDriverState *bs;
+ int64_t offset;
+ QEMUIOVector *qiov;
+ bool is_write;
+ int ret;
+ BdrvRequestFlags flags;
+} RwCo;
+
+static void coroutine_fn bdrv_rw_co_entry(void *opaque)
+{
+ RwCo *rwco = opaque;
+
+ if (!rwco->is_write) {
+ rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
+ rwco->qiov->size, rwco->qiov,
+ rwco->flags);
+ } else {
+ rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
+ rwco->qiov->size, rwco->qiov,
+ rwco->flags);
+ }
+}
+
+/*
+ * Process a vectored synchronous request using coroutines
+ */
+static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
+ QEMUIOVector *qiov, bool is_write,
+ BdrvRequestFlags flags)
+{
+ Coroutine *co;
+ RwCo rwco = {
+ .bs = bs,
+ .offset = offset,
+ .qiov = qiov,
+ .is_write = is_write,
+ .ret = NOT_DONE,
+ .flags = flags,
+ };
+
+ /**
+ * In sync call context, when the vcpu is blocked, this throttling timer
+ * will not fire; so the I/O throttling function has to be disabled here
+ * if it has been enabled.
+ */
+ if (bs->io_limits_enabled) {
+ fprintf(stderr, "Disabling I/O throttling on '%s' due "
+ "to synchronous I/O.\n", bdrv_get_device_name(bs));
+ bdrv_io_limits_disable(bs);
+ }
+
+ if (qemu_in_coroutine()) {
+ /* Fast-path if already in coroutine context */
+ bdrv_rw_co_entry(&rwco);
+ } else {
+ AioContext *aio_context = bdrv_get_aio_context(bs);
+
+ co = qemu_coroutine_create(bdrv_rw_co_entry);
+ qemu_coroutine_enter(co, &rwco);
+ while (rwco.ret == NOT_DONE) {
+ aio_poll(aio_context, true);
+ }
+ }
+ return rwco.ret;
+}
+
+/*
+ * Process a synchronous request using coroutines
+ */
+static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
+ int nb_sectors, bool is_write, BdrvRequestFlags flags)
+{
+ QEMUIOVector qiov;
+ struct iovec iov = {
+ .iov_base = (void *)buf,
+ .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
+ };
+
+ if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
+ return -EINVAL;
+ }
+
+ qemu_iovec_init_external(&qiov, &iov, 1);
+ return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
+ &qiov, is_write, flags);
+}
+
+/* return < 0 if error. See bdrv_write() for the return codes */
+int bdrv_read(BlockDriverState *bs, int64_t sector_num,
+ uint8_t *buf, int nb_sectors)
+{
+ return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
+}
+
+/* Just like bdrv_read(), but with I/O throttling temporarily disabled */
+int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
+ uint8_t *buf, int nb_sectors)
+{
+ bool enabled;
+ int ret;
+
+ enabled = bs->io_limits_enabled;
+ bs->io_limits_enabled = false;
+ ret = bdrv_read(bs, sector_num, buf, nb_sectors);
+ bs->io_limits_enabled = enabled;
+ return ret;
+}
+
+/* Return < 0 if error. Important errors are:
+ -EIO generic I/O error (may happen for all errors)
+ -ENOMEDIUM No media inserted.
+ -EINVAL Invalid sector number or nb_sectors
+ -EACCES Trying to write a read-only device
+*/
+int bdrv_write(BlockDriverState *bs, int64_t sector_num,
+ const uint8_t *buf, int nb_sectors)
+{
+ return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
+}
+
+int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors, BdrvRequestFlags flags)
+{
+ return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
+ BDRV_REQ_ZERO_WRITE | flags);
+}
+
+/*
+ * Completely zero out a block device with the help of bdrv_write_zeroes.
+ * The operation is sped up by checking the block status and only writing
+ * zeroes to the device if they currently do not return zeroes. Optional
+ * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
+ *
+ * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
+ */
+int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
+{
+ int64_t target_sectors, ret, nb_sectors, sector_num = 0;
+ int n;
+
+ target_sectors = bdrv_nb_sectors(bs);
+ if (target_sectors < 0) {
+ return target_sectors;
+ }
+
+ for (;;) {
+ nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS);
+ if (nb_sectors <= 0) {
+ return 0;
+ }
+ ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
+ if (ret < 0) {
+ error_report("error getting block status at sector %" PRId64 ": %s",
+ sector_num, strerror(-ret));
+ return ret;
+ }
+ if (ret & BDRV_BLOCK_ZERO) {
+ sector_num += n;
+ continue;
+ }
+ ret = bdrv_write_zeroes(bs, sector_num, n, flags);
+ if (ret < 0) {
+ error_report("error writing zeroes at sector %" PRId64 ": %s",
+ sector_num, strerror(-ret));
+ return ret;
+ }
+ sector_num += n;
+ }
+}
+
+int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
+{
+ QEMUIOVector qiov;
+ struct iovec iov = {
+ .iov_base = (void *)buf,
+ .iov_len = bytes,
+ };
+ int ret;
+
+ if (bytes < 0) {
+ return -EINVAL;
+ }
+
+ qemu_iovec_init_external(&qiov, &iov, 1);
+ ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return bytes;
+}
+
+int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
+{
+ int ret;
+
+ ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return qiov->size;
+}
+
+int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
+ const void *buf, int bytes)
+{
+ QEMUIOVector qiov;
+ struct iovec iov = {
+ .iov_base = (void *) buf,
+ .iov_len = bytes,
+ };
+
+ if (bytes < 0) {
+ return -EINVAL;
+ }
+
+ qemu_iovec_init_external(&qiov, &iov, 1);
+ return bdrv_pwritev(bs, offset, &qiov);
+}
+
+/*
+ * Writes to the file and ensures that no writes are reordered across this
+ * request (acts as a barrier)
+ *
+ * Returns 0 on success, -errno in error cases.
+ */
+int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
+ const void *buf, int count)
+{
+ int ret;
+
+ ret = bdrv_pwrite(bs, offset, buf, count);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* No flush needed for cache modes that already do it */
+ if (bs->enable_write_cache) {
+ bdrv_flush(bs);
+ }
+
+ return 0;
+}
+
+static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
+{
+ /* Perform I/O through a temporary buffer so that users who scribble over
+ * their read buffer while the operation is in progress do not end up
+ * modifying the image file. This is critical for zero-copy guest I/O
+ * where anything might happen inside guest memory.
+ */
+ void *bounce_buffer;
+
+ BlockDriver *drv = bs->drv;
+ struct iovec iov;
+ QEMUIOVector bounce_qiov;
+ int64_t cluster_sector_num;
+ int cluster_nb_sectors;
+ size_t skip_bytes;
+ int ret;
+
+ /* Cover entire cluster so no additional backing file I/O is required when
+ * allocating cluster in the image file.
+ */
+ bdrv_round_to_clusters(bs, sector_num, nb_sectors,
+ &cluster_sector_num, &cluster_nb_sectors);
+
+ trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
+ cluster_sector_num, cluster_nb_sectors);
+
+ iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
+ iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
+ if (bounce_buffer == NULL) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ qemu_iovec_init_external(&bounce_qiov, &iov, 1);
+
+ ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
+ &bounce_qiov);
+ if (ret < 0) {
+ goto err;
+ }
+
+ if (drv->bdrv_co_write_zeroes &&
+ buffer_is_zero(bounce_buffer, iov.iov_len)) {
+ ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
+ cluster_nb_sectors, 0);
+ } else {
+ /* This does not change the data on the disk, it is not necessary
+ * to flush even in cache=writethrough mode.
+ */
+ ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
+ &bounce_qiov);
+ }
+
+ if (ret < 0) {
+ /* It might be okay to ignore write errors for guest requests. If this
+ * is a deliberate copy-on-read then we don't want to ignore the error.
+ * Simply report it in all cases.
+ */
+ goto err;
+ }
+
+ skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
+ qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
+ nb_sectors * BDRV_SECTOR_SIZE);
+
+err:
+ qemu_vfree(bounce_buffer);
+ return ret;
+}
+
+/*
+ * Forwards an already correctly aligned request to the BlockDriver. This
+ * handles copy on read and zeroing after EOF; any other features must be
+ * implemented by the caller.
+ *
+ * @req is the tracked request covering this read; it is marked serialising
+ * when BDRV_REQ_COPY_ON_READ is set in @flags. @offset and @bytes are in
+ * bytes and must both be multiples of BDRV_SECTOR_SIZE; @qiov, if given,
+ * must be exactly @bytes long. @align (in bytes) is only used to round up
+ * the part of the request that is still passed to the driver when
+ * bs->zero_beyond_eof is set.
+ *
+ * Returns the value of the last operation performed (allocation query,
+ * copy-on-read, length query or driver read); negative values are errors.
+ */
+static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
+    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
+    int64_t align, QEMUIOVector *qiov, int flags)
+{
+    BlockDriver *drv = bs->drv;
+    int ret;
+
+    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
+    unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
+
+    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
+    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
+    assert(!qiov || bytes == qiov->size);
+
+    /* Handle Copy on Read and associated serialisation */
+    if (flags & BDRV_REQ_COPY_ON_READ) {
+        /* If we touch the same cluster it counts as an overlap.  This
+         * guarantees that allocating writes will be serialized and not race
+         * with each other for the same cluster.  For example, in copy-on-read
+         * it ensures that the CoR read and write operations are atomic and
+         * guest writes cannot interleave between them. */
+        mark_request_serialising(req, bdrv_get_cluster_size(bs));
+    }
+
+    wait_serialising_requests(req);
+
+    if (flags & BDRV_REQ_COPY_ON_READ) {
+        int pnum;
+
+        ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
+        if (ret < 0) {
+            goto out;
+        }
+
+        /* Anything not fully allocated goes through the copy-on-read path */
+        if (!ret || pnum != nb_sectors) {
+            ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
+            goto out;
+        }
+    }
+
+    /* Forward the request to the BlockDriver */
+    if (!bs->zero_beyond_eof) {
+        ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
+    } else {
+        /* Read zeros after EOF */
+        int64_t total_sectors, max_nb_sectors;
+
+        total_sectors = bdrv_nb_sectors(bs);
+        if (total_sectors < 0) {
+            ret = total_sectors;
+            goto out;
+        }
+
+        /* Number of sectors the driver may still be asked for, rounded up
+         * to the request alignment */
+        max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
+                                  align >> BDRV_SECTOR_BITS);
+        if (nb_sectors < max_nb_sectors) {
+            ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
+        } else if (max_nb_sectors > 0) {
+            QEMUIOVector local_qiov;
+
+            /* Only pass the leading max_nb_sectors of qiov to the driver */
+            qemu_iovec_init(&local_qiov, qiov->niov);
+            qemu_iovec_concat(&local_qiov, qiov, 0,
+                              max_nb_sectors * BDRV_SECTOR_SIZE);
+
+            ret = drv->bdrv_co_readv(bs, sector_num, max_nb_sectors,
+                                     &local_qiov);
+
+            qemu_iovec_destroy(&local_qiov);
+        } else {
+            ret = 0;
+        }
+
+        /* Reading beyond end of file is supposed to produce zeroes */
+        if (ret == 0 && total_sectors < sector_num + nb_sectors) {
+            /* Both values are relative to the start of the request.  The
+             * condition above guarantees zero_start < nb_sectors, so the
+             * subtraction cannot wrap.  The length must not include
+             * sector_num: that would exceed the request and could be
+             * truncated when narrowed to size_t on 32-bit hosts. */
+            uint64_t zero_start = MAX(0, total_sectors - sector_num);
+            uint64_t zero_sectors = nb_sectors - zero_start;
+
+            qemu_iovec_memset(qiov, zero_start * BDRV_SECTOR_SIZE, 0,
+                              zero_sectors * BDRV_SECTOR_SIZE);
+        }
+    }
+
+out:
+    return ret;
+}
+
+/* Effective request alignment for @bs in bytes; never below one sector. */
+static inline uint64_t bdrv_get_align(BlockDriverState *bs)
+{
+    /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
+    uint64_t effective = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
+
+    return effective;
+}
+
+/*
+ * True when both @offset and @bytes are multiples of bdrv_get_align(@bs).
+ */
+static inline bool bdrv_req_is_aligned(BlockDriverState *bs,
+                                       int64_t offset, size_t bytes)
+{
+    int64_t align = bdrv_get_align(bs);
+    int64_t low_bits = align - 1;
+
+    if (offset & low_bits) {
+        return false;
+    }
+    if (bytes & low_bits) {
+        return false;
+    }
+    return true;
+}
+
+/*
+ * Handle a read request in coroutine context
+ *
+ * @offset and @bytes are in bytes and need not be aligned. If either end of
+ * the request is not a multiple of bdrv_get_align(), the request is widened
+ * to aligned boundaries and the extra bytes are read into scratch buffers
+ * (head_buf/tail_buf) that are chained around @qiov in a local vector and
+ * freed again before returning.
+ *
+ * BDRV_REQ_COPY_ON_READ is added to @flags when bs->copy_on_read is set.
+ *
+ * Returns -ENOMEDIUM if @bs has no driver, the error from
+ * bdrv_check_byte_request() if that check fails, and otherwise the result
+ * of bdrv_aligned_preadv().
+ */
+static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
+ int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
+ BdrvRequestFlags flags)
+{
+ BlockDriver *drv = bs->drv;
+ BdrvTrackedRequest req;
+
+ uint64_t align = bdrv_get_align(bs);
+ uint8_t *head_buf = NULL;
+ uint8_t *tail_buf = NULL;
+ QEMUIOVector local_qiov;
+ bool use_local_qiov = false;
+ int ret;
+
+ if (!drv) {
+ return -ENOMEDIUM;
+ }
+
+ ret = bdrv_check_byte_request(bs, offset, bytes);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (bs->copy_on_read) {
+ flags |= BDRV_REQ_COPY_ON_READ;
+ }
+
+ /* throttling disk I/O (accounted with the caller's unpadded size) */
+ if (bs->io_limits_enabled) {
+ bdrv_io_limits_intercept(bs, bytes, false);
+ }
+
+ /* Align read if necessary by padding qiov.
+ * Head: prepend a scratch buffer covering the bytes between the aligned
+ * start and the requested offset. */
+ if (offset & (align - 1)) {
+ head_buf = qemu_blockalign(bs, align);
+ qemu_iovec_init(&local_qiov, qiov->niov + 2); /* +2: head and possibly tail padding */
+ qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
+ qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
+ use_local_qiov = true;
+
+ bytes += offset & (align - 1); /* grow the request down to the aligned start */
+ offset = offset & ~(align - 1);
+ }
+
+ if ((offset + bytes) & (align - 1)) { /* tail: the end of the request is unaligned */
+ if (!use_local_qiov) {
+ qemu_iovec_init(&local_qiov, qiov->niov + 1); /* +1: tail padding */
+ qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
+ use_local_qiov = true;
+ }
+ tail_buf = qemu_blockalign(bs, align);
+ qemu_iovec_add(&local_qiov, tail_buf,
+ align - ((offset + bytes) & (align - 1)));
+
+ bytes = ROUND_UP(bytes, align); /* offset is aligned here, so this aligns the end */
+ }
+
+ tracked_request_begin(&req, bs, offset, bytes, false);
+ ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
+ use_local_qiov ? &local_qiov : qiov,
+ flags);
+ tracked_request_end(&req);
+
+ if (use_local_qiov) { /* the scratch buffers exist only together with local_qiov */
+ qemu_iovec_destroy(&local_qiov);
+ qemu_vfree(head_buf);
+ qemu_vfree(tail_buf);
+ }
+
+ return ret;
+}
+
+/*
+ * Sector-based read: rejects an out-of-range @nb_sectors with -EINVAL and
+ * otherwise hands the request to the byte-based bdrv_co_do_preadv().
+ */
+static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
+    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
+    BdrvRequestFlags flags)
+{
+    bool size_ok = nb_sectors >= 0 && nb_sectors <= BDRV_REQUEST_MAX_SECTORS;
+
+    if (!size_ok) {
+        return -EINVAL;
+    }
+
+    return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
+                             nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
+}
+
+/* Traced coroutine read of @nb_sectors sectors at @sector_num, no flags. */
+int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
+                               int nb_sectors, QEMUIOVector *qiov)
+{
+    int ret;
+
+    trace_bdrv_co_readv(bs, sector_num, nb_sectors);
+
+    ret = bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
+    return ret;
+}
+
+/* Traced coroutine read that always requests BDRV_REQ_COPY_ON_READ. */
+int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
+                                       int64_t sector_num, int nb_sectors,
+                                       QEMUIOVector *qiov)
+{
+    int ret;
+
+    trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
+
+    ret = bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
+                           BDRV_REQ_COPY_ON_READ);
+    return ret;
+}
+
+#define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768 /* used below as a limit on a sector count */
+
+static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
+{ /*
+ * Zero @nb_sectors sectors starting at @sector_num, one chunk per loop
+ * iteration. Each chunk is first offered to drv->bdrv_co_write_zeroes (if
+ * the driver has one); when that is missing or returns -ENOTSUP, a
+ * zero-filled bounce buffer is written with drv->bdrv_co_writev instead.
+ * The loop stops at the first non-zero result, which is returned; -ENOMEM
+ * is returned if the bounce buffer cannot be allocated.
+ */
+ BlockDriver *drv = bs->drv;
+ QEMUIOVector qiov;
+ struct iovec iov = {0}; /* iov_base == NULL means "no bounce buffer yet" */
+ int ret = 0;
+
+ int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_write_zeroes,
+ BDRV_REQUEST_MAX_SECTORS);
+
+ while (nb_sectors > 0 && !ret) {
+ int num = nb_sectors;
+
+ /* Align request. Block drivers can expect the "bulk" of the request
+ * to be aligned.
+ */
+ if (bs->bl.write_zeroes_alignment
+ && num > bs->bl.write_zeroes_alignment) {
+ if (sector_num % bs->bl.write_zeroes_alignment != 0) {
+ /* Make a small request up to the first aligned sector. */
+ num = bs->bl.write_zeroes_alignment;
+ num -= sector_num % bs->bl.write_zeroes_alignment;
+ } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
+ /* Shorten the request to the last aligned sector. num cannot
+ * underflow because num > bs->bl.write_zeroes_alignment.
+ */
+ num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
+ }
+ }
+
+ /* limit request size */
+ if (num > max_write_zeroes) {
+ num = max_write_zeroes;
+ }
+
+ ret = -ENOTSUP; /* stays -ENOTSUP if the driver has no write_zeroes callback */
+ /* First try the efficient write zeroes operation */
+ if (drv->bdrv_co_write_zeroes) {
+ ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
+ }
+
+ if (ret == -ENOTSUP) {
+ /* Fall back to bounce buffer if write zeroes is unsupported */
+ int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length,
+ MAX_WRITE_ZEROES_BOUNCE_BUFFER);
+ num = MIN(num, max_xfer_len);
+ iov.iov_len = num * BDRV_SECTOR_SIZE;
+ if (iov.iov_base == NULL) {
+ iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE);
+ if (iov.iov_base == NULL) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+ memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
+ }
+ qemu_iovec_init_external(&qiov, &iov, 1);
+
+ ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
+
+ /* Keep bounce buffer around if it is big enough for
+ * all future requests. A buffer of max_xfer_len sectors is, because
+ * num is clamped to max_xfer_len above; a smaller one is freed here.
+ */
+ if (num < max_xfer_len) {
+ qemu_vfree(iov.iov_base);
+ iov.iov_base = NULL;
+ }
+ }
+
+ sector_num += num;
+ nb_sectors -= num;
+ }
+
+fail:
+ qemu_vfree(iov.iov_base); /* iov_base is NULL here unless a full-size buffer was kept */
+ return ret;
+}
+
+/*
+ * Forwards an already correctly aligned write request to the BlockDriver.
+ *
+ * After waiting for overlapping serialising requests and running the
+ * before-write notifiers, the data is written either through
+ * drv->bdrv_co_writev() or, for BDRV_REQ_ZERO_WRITE, through
+ * bdrv_co_do_write_zeroes(). A buffer for which qemu_iovec_is_zero() holds
+ * is turned into a zero write when bs->detect_zeroes is not OFF and the
+ * driver implements bdrv_co_write_zeroes.
+ *
+ * bdrv_set_dirty() and block_acct_highest_sector() are called whether or
+ * not the write succeeded; bs->total_sectors only grows when ret >= 0.
+ *
+ * Returns a negative notifier result if a notifier failed the request,
+ * otherwise the result of the write, or of the flush that follows a
+ * successful write when bs->enable_write_cache is not set.
+ */
+static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
+ BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
+ QEMUIOVector *qiov, int flags)
+{
+ BlockDriver *drv = bs->drv;
+ bool waited;
+ int ret;
+
+ int64_t sector_num = offset >> BDRV_SECTOR_BITS;
+ unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
+
+ assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
+ assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
+ assert(!qiov || bytes == qiov->size);
+
+ waited = wait_serialising_requests(req);
+ assert(!waited || !req->serialising); /* a serialising request must not have had to wait here */
+ assert(req->overlap_offset <= offset); /* the write must lie inside the tracked overlap range */
+ assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
+
+ ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
+
+ if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
+ !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
+ qemu_iovec_is_zero(qiov)) {
+ flags |= BDRV_REQ_ZERO_WRITE;
+ if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
+ flags |= BDRV_REQ_MAY_UNMAP;
+ }
+ }
+
+ if (ret < 0) {
+ /* Do nothing, write notifier decided to fail this request */
+ } else if (flags & BDRV_REQ_ZERO_WRITE) {
+ BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
+ ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
+ } else {
+ BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
+ ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
+ }
+ BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);
+
+ if (ret == 0 && !bs->enable_write_cache) { /* write cache disabled: flush after each successful write */
+ ret = bdrv_co_flush(bs);
+ }
+
+ bdrv_set_dirty(bs, sector_num, nb_sectors);
+
+ block_acct_highest_sector(&bs->stats, sector_num, nb_sectors);
+
+ if (ret >= 0) {
+ bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
+ }
+
+ return ret;
+}
+
+/*
+ * Handle a write request in coroutine context
+ *
+ * @offset and @bytes are in bytes and need not be multiples of
+ * bdrv_get_align(). An unaligned head and/or tail is completed by reading
+ * the surrounding aligned block into a scratch buffer and writing the
+ * combined, aligned range.
+ *
+ * Returns -ENOMEDIUM if @bs has no driver, -EACCES if it is read-only, the
+ * error from bdrv_check_byte_request() or from one of the padding reads,
+ * and otherwise the result of bdrv_aligned_pwritev().
+ */
+static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
+ int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
+ BdrvRequestFlags flags)
+{
+ BdrvTrackedRequest req;
+ uint64_t align = bdrv_get_align(bs);
+ uint8_t *head_buf = NULL;
+ uint8_t *tail_buf = NULL;
+ QEMUIOVector local_qiov;
+ bool use_local_qiov = false;
+ int ret;
+
+ if (!bs->drv) {
+ return -ENOMEDIUM;
+ }
+ if (bs->read_only) {
+ return -EACCES;
+ }
+
+ ret = bdrv_check_byte_request(bs, offset, bytes);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* throttling disk I/O (accounted with the caller's unpadded size) */
+ if (bs->io_limits_enabled) {
+ bdrv_io_limits_intercept(bs, bytes, true);
+ }
+
+ /*
+ * Align write if necessary by performing a read-modify-write cycle.
+ * Pad qiov with the read parts and be sure to have a tracked request not
+ * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
+ */
+ tracked_request_begin(&req, bs, offset, bytes, true);
+
+ if (offset & (align - 1)) { /* head: the request starts inside an aligned block */
+ QEMUIOVector head_qiov;
+ struct iovec head_iov;
+
+ mark_request_serialising(&req, align);
+ wait_serialising_requests(&req);
+
+ head_buf = qemu_blockalign(bs, align);
+ head_iov = (struct iovec) {
+ .iov_base = head_buf,
+ .iov_len = align,
+ };
+ qemu_iovec_init_external(&head_qiov, &head_iov, 1);
+
+ BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
+ ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
+ align, &head_qiov, 0);
+ if (ret < 0) {
+ goto fail;
+ }
+ BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
+
+ qemu_iovec_init(&local_qiov, qiov->niov + 2); /* +2: head and possibly tail padding */
+ qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); /* only the bytes before the caller's data */
+ qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
+ use_local_qiov = true;
+
+ bytes += offset & (align - 1);
+ offset = offset & ~(align - 1);
+ }
+
+ if ((offset + bytes) & (align - 1)) { /* tail: the request ends inside an aligned block */
+ QEMUIOVector tail_qiov;
+ struct iovec tail_iov;
+ size_t tail_bytes;
+ bool waited;
+
+ mark_request_serialising(&req, align);
+ waited = wait_serialising_requests(&req);
+ assert(!waited || !use_local_qiov); /* must not wait again once the head has already been read */
+
+ tail_buf = qemu_blockalign(bs, align);
+ tail_iov = (struct iovec) {
+ .iov_base = tail_buf,
+ .iov_len = align,
+ };
+ qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
+
+ BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
+ ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
+ align, &tail_qiov, 0);
+ if (ret < 0) {
+ goto fail;
+ }
+ BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
+
+ if (!use_local_qiov) {
+ qemu_iovec_init(&local_qiov, qiov->niov + 1); /* +1: tail padding */
+ qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
+ use_local_qiov = true;
+ }
+
+ tail_bytes = (offset + bytes) & (align - 1); /* how far the caller's data reaches into the last block */
+ qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
+
+ bytes = ROUND_UP(bytes, align);
+ }
+
+ if (use_local_qiov) {
+ /* Local buffer may have non-zero data. */
+ flags &= ~BDRV_REQ_ZERO_WRITE;
+ }
+ ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
+ use_local_qiov ? &local_qiov : qiov,
+ flags);
+
+fail:
+ tracked_request_end(&req);
+
+ if (use_local_qiov) {
+ qemu_iovec_destroy(&local_qiov);
+ }
+ qemu_vfree(head_buf); /* unlike local_qiov, the scratch buffers may exist on the early-failure paths */
+ qemu_vfree(tail_buf);
+
+ return ret;
+}
+
+/*
+ * Sector-based write: rejects an out-of-range @nb_sectors with -EINVAL and
+ * otherwise hands the request to the byte-based bdrv_co_do_pwritev().
+ */
+static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
+    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
+    BdrvRequestFlags flags)
+{
+    bool size_ok = nb_sectors >= 0 && nb_sectors <= BDRV_REQUEST_MAX_SECTORS;
+
+    if (!size_ok) {
+        return -EINVAL;
+    }
+
+    return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
+                              nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
+}
+
+/* Traced coroutine write of @nb_sectors sectors at @sector_num, no flags. */
+int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
+                                int nb_sectors, QEMUIOVector *qiov)
+{
+    int ret;
+
+    trace_bdrv_co_writev(bs, sector_num, nb_sectors);
+
+    ret = bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
+    return ret;
+}
+
+/*
+ * Write zeroes to @nb_sectors sectors starting at @sector_num.
+ *
+ * BDRV_REQ_MAY_UNMAP is dropped from @flags unless the image was opened
+ * with BDRV_O_UNMAP. A request for which bdrv_req_is_aligned() holds is
+ * submitted without a data buffer; any other request is submitted together
+ * with a temporary zero-filled buffer.
+ *
+ * Returns -EINVAL if @nb_sectors is negative or larger than
+ * BDRV_REQUEST_MAX_SECTORS, otherwise the result of bdrv_co_do_writev().
+ */
+int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
+                                      int64_t sector_num, int nb_sectors,
+                                      BdrvRequestFlags flags)
+{
+    int ret;
+
+    trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
+
+    /* Reject bad sizes up front.  bdrv_co_do_writev() reports the same
+     * error, but only after the value would already have been shifted and
+     * used as an allocation size below. */
+    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
+        return -EINVAL;
+    }
+
+    if (!(bs->open_flags & BDRV_O_UNMAP)) {
+        flags &= ~BDRV_REQ_MAY_UNMAP;
+    }
+    if (bdrv_req_is_aligned(bs, sector_num << BDRV_SECTOR_BITS,
+                            nb_sectors << BDRV_SECTOR_BITS)) {
+        ret = bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
+                                BDRV_REQ_ZERO_WRITE | flags);
+    } else {
+        uint8_t *buf;
+        QEMUIOVector local_qiov;
+        size_t bytes = (size_t)nb_sectors << BDRV_SECTOR_BITS;
+
+        buf = qemu_memalign(bdrv_opt_mem_align(bs), bytes);
+        memset(buf, 0, bytes);
+        qemu_iovec_init(&local_qiov, 1);
+        qemu_iovec_add(&local_qiov, buf, bytes);
+
+        ret = bdrv_co_do_writev(bs, sector_num, nb_sectors, &local_qiov,
+                                BDRV_REQ_ZERO_WRITE | flags);
+
+        /* Pair qemu_iovec_init() with qemu_iovec_destroy(), as the other
+         * users in this file do; otherwise the vector's own storage is
+         * never released. */
+        qemu_iovec_destroy(&local_qiov);
+        qemu_vfree(buf);
+    }
+    return ret;
+}
+
+/*
+ * Flush every BlockDriverState returned by bdrv_next(), holding each
+ * state's AioContext around its flush.  All states are visited even after
+ * a failure; the first negative result is returned, or 0 if none failed.
+ */
+int bdrv_flush_all(void)
+{
+    BlockDriverState *bs;
+    int first_error = 0;
+
+    for (bs = bdrv_next(NULL); bs != NULL; bs = bdrv_next(bs)) {
+        AioContext *ctx = bdrv_get_aio_context(bs);
+        int ret;
+
+        aio_context_acquire(ctx);
+        ret = bdrv_flush(bs);
+        if (first_error == 0 && ret < 0) {
+            first_error = ret;
+        }
+        aio_context_release(ctx);
+    }
+
+    return first_error;
+}
+
+typedef struct BdrvCoGetBlockStatusData { /* arguments and result handed to bdrv_get_block_status_co_entry() */
+ BlockDriverState *bs; /* node to query */
+ BlockDriverState *base; /* not referenced by the code visible in this part of the file */
+ int64_t sector_num; /* first sector of the query */
+ int nb_sectors; /* upper bound for *pnum */
+ int *pnum; /* out: passed through to bdrv_co_get_block_status() */
+ int64_t ret; /* out: return value of bdrv_co_get_block_status() */
+ bool done; /* set once ret has been stored; polled by bdrv_get_block_status() */
+} BdrvCoGetBlockStatusData;
+
+/*
+ * Returns the allocation status of the specified sectors.
+ * Drivers not implementing the functionality are assumed to not support
+ * backing files, hence all their sectors are reported as allocated.
+ *
+ * If 'sector_num' is beyond the end of the disk image the return value is 0
+ * and 'pnum' is set to 0.
+ *
+ * 'pnum' is set to the number of sectors (including and immediately following
+ * the specified sector) that are known to be in the same
+ * allocated/unallocated state.
+ *
+ * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
+ * beyond the end of the disk image it will be clamped.
+ *
+ * The return value is negative on error (a failing driver callback also
+ * sets 'pnum' to 0), otherwise a combination of BDRV_BLOCK_* flags. When
+ * BDRV_BLOCK_OFFSET_VALID is set, the remaining bits carry a byte offset
+ * that is converted to a sector number with '>> BDRV_SECTOR_BITS' below.
+ */
+static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
+ int64_t sector_num,
+ int nb_sectors, int *pnum)
+{
+ int64_t total_sectors;
+ int64_t n;
+ int64_t ret, ret2;
+
+ total_sectors = bdrv_nb_sectors(bs);
+ if (total_sectors < 0) {
+ return total_sectors;
+ }
+
+ if (sector_num >= total_sectors) {
+ *pnum = 0;
+ return 0;
+ }
+
+ n = total_sectors - sector_num; /* sectors left before the end of the image */
+ if (n < nb_sectors) {
+ nb_sectors = n;
+ }
+
+ if (!bs->drv->bdrv_co_get_block_status) { /* no driver support: report everything as allocated data */
+ *pnum = nb_sectors;
+ ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
+ if (bs->drv->protocol_name) {
+ ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
+ }
+ return ret;
+ }
+
+ ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
+ if (ret < 0) {
+ *pnum = 0;
+ return ret;
+ }
+
+ if (ret & BDRV_BLOCK_RAW) { /* defer entirely to bs->file at the offset the driver reported */
+ assert(ret & BDRV_BLOCK_OFFSET_VALID);
+ return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
+ *pnum, pnum);
+ }
+
+ if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
+ ret |= BDRV_BLOCK_ALLOCATED;
+ }
+
+ if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) { /* neither data nor zero: see whether it still reads as zeroes */
+ if (bdrv_unallocated_blocks_are_zero(bs)) {
+ ret |= BDRV_BLOCK_ZERO;
+ } else if (bs->backing_hd) {
+ BlockDriverState *bs2 = bs->backing_hd;
+ int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
+ if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) { /* starts at or past the end of the backing file */
+ ret |= BDRV_BLOCK_ZERO;
+ }
+ }
+ }
+
+ if (bs->file &&
+ (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
+ (ret & BDRV_BLOCK_OFFSET_VALID)) {
+ int file_pnum;
+
+ ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
+ *pnum, &file_pnum);
+ if (ret2 >= 0) {
+ /* Ignore errors. This is just providing extra information, it
+ * is useful but not necessary.
+ */
+ if (!file_pnum) {
+ /* !file_pnum indicates an offset at or beyond the EOF; it is
+ * perfectly valid for the format block driver to point to such
+ * offsets, so catch it and mark everything as zero */
+ ret |= BDRV_BLOCK_ZERO;
+ } else {
+ /* Limit request to the range reported by the protocol driver */
+ *pnum = file_pnum;
+ ret |= (ret2 & BDRV_BLOCK_ZERO);
+ }
+ }
+ }
+
+ return ret;
+}
+
+/*
+ * Coroutine entry point for bdrv_get_block_status(): unpacks the argument
+ * structure, stores the result in it and marks it as done.
+ */
+static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
+{
+    BdrvCoGetBlockStatusData *args = opaque;
+
+    args->ret = bdrv_co_get_block_status(args->bs, args->sector_num,
+                                         args->nb_sectors, args->pnum);
+    args->done = true;
+}
+
+/*
+ * Synchronous wrapper around bdrv_co_get_block_status().
+ *
+ * Outside coroutine context a new coroutine is started and the AioContext
+ * of @bs is polled until it has finished.
+ *
+ * See bdrv_co_get_block_status() for details.
+ */
+int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
+                              int nb_sectors, int *pnum)
+{
+    BdrvCoGetBlockStatusData data = {
+        .bs = bs,
+        .sector_num = sector_num,
+        .nb_sectors = nb_sectors,
+        .pnum = pnum,
+        .done = false,
+    };
+
+    if (!qemu_in_coroutine()) {
+        AioContext *aio_context = bdrv_get_aio_context(bs);
+        Coroutine *co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
+
+        qemu_coroutine_enter(co, &data);
+        while (!data.done) {
+            aio_poll(aio_context, true);
+        }
+        return data.ret;
+    }
+
+    /* Fast-path if already in coroutine context */
+    bdrv_get_block_status_co_entry(&data);
+    return data.ret;
+}
+
+/*
+ * Reduce bdrv_get_block_status() to a yes/no answer: 1 if
+ * BDRV_BLOCK_ALLOCATED is set, 0 if not, or the negative status on error.
+ */
+int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
+                                   int nb_sectors, int *pnum)
+{
+    int64_t status = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
+
+    if (status >= 0) {
+        return (status & BDRV_BLOCK_ALLOCATED) ? 1 : 0;
+    }
+    return status;
+}
+
+/*
+ * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
+ *
+ * Return true if the given sector is allocated in any image between
+ * BASE and TOP (inclusive).  BASE can be NULL to check if the given
+ * sector is allocated in any image of the chain.  Return false otherwise.
+ * A negative value from bdrv_is_allocated() is returned unchanged.
+ *
+ * 'pnum' is set to the number of sectors (including and immediately following
+ * the specified sector) that are known to be in the same
+ * allocated/unallocated state.
+ *
+ */
+int bdrv_is_allocated_above(BlockDriverState *top,
+                            BlockDriverState *base,
+                            int64_t sector_num,
+                            int nb_sectors, int *pnum)
+{
+    BlockDriverState *layer;
+    int common = nb_sectors;
+
+    for (layer = top; layer && layer != base; layer = layer->backing_hd) {
+        int layer_pnum;
+        int ret = bdrv_is_allocated(layer, sector_num, nb_sectors,
+                                    &layer_pnum);
+
+        if (ret < 0) {
+            return ret;
+        }
+        if (ret) {
+            *pnum = layer_pnum;
+            return 1;
+        }
+
+        /*
+         * [sector_num, nb_sectors] is unallocated on top but intermediate
+         * might have
+         *
+         * [sector_num+x, nr_sectors] allocated.
+         */
+        if (layer_pnum < common &&
+            (layer == top ||
+             sector_num + layer_pnum < layer->total_sectors)) {
+            common = layer_pnum;
+        }
+    }
+
+    *pnum = common;
+    return 0;
+}
+
+/*
+ * Pass a compressed write to the driver.
+ *
+ * Returns -ENOMEDIUM without a driver, -ENOTSUP if the driver has no
+ * bdrv_write_compressed callback, the error from bdrv_check_request(), or
+ * the driver's result.  Asserts that @bs has no dirty bitmaps.
+ */
+int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
+                          const uint8_t *buf, int nb_sectors)
+{
+    BlockDriver *drv = bs->drv;
+    int ret;
+
+    if (drv == NULL) {
+        return -ENOMEDIUM;
+    }
+    if (drv->bdrv_write_compressed == NULL) {
+        return -ENOTSUP;
+    }
+
+    ret = bdrv_check_request(bs, sector_num, nb_sectors);
+    if (ret < 0) {
+        return ret;
+    }
+
+    assert(QLIST_EMPTY(&bs->dirty_bitmaps));
+
+    ret = drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
+    return ret;
+}
+
+/*
+ * Single-buffer convenience wrapper: wraps @buf/@size in a one-element
+ * vector and forwards it to bdrv_writev_vmstate().
+ */
+int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
+                      int64_t pos, int size)
+{
+    struct iovec single = {
+        .iov_len = size,
+        .iov_base = (void *) buf,
+    };
+    QEMUIOVector vec;
+
+    qemu_iovec_init_external(&vec, &single, 1);
+
+    return bdrv_writev_vmstate(bs, &vec, pos);
+}
+
+/*
+ * Save VM state through the driver's bdrv_save_vmstate callback; a node
+ * without that callback delegates to bs->file.  Returns -ENOMEDIUM without
+ * a driver and -ENOTSUP when neither option is available.
+ */
+int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
+{
+    BlockDriver *drv = bs->drv;
+
+    if (drv == NULL) {
+        return -ENOMEDIUM;
+    }
+    if (drv->bdrv_save_vmstate) {
+        return drv->bdrv_save_vmstate(bs, qiov, pos);
+    }
+    if (bs->file) {
+        return bdrv_writev_vmstate(bs->file, qiov, pos);
+    }
+
+    return -ENOTSUP;
+}
+
+/*
+ * Load VM state through the driver's bdrv_load_vmstate callback; a node
+ * without that callback delegates to bs->file.  Returns -ENOMEDIUM without
+ * a driver and -ENOTSUP when neither option is available.
+ */
+int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
+                      int64_t pos, int size)
+{
+    BlockDriver *drv = bs->drv;
+
+    if (drv == NULL) {
+        return -ENOMEDIUM;
+    }
+    if (drv->bdrv_load_vmstate != NULL) {
+        return drv->bdrv_load_vmstate(bs, buf, pos, size);
+    }
+    if (bs->file != NULL) {
+        return bdrv_load_vmstate(bs->file, buf, pos, size);
+    }
+
+    return -ENOTSUP;
+}
+
+/**************************************************************/
+/* async I/Os */
+
+/* Start an asynchronous read; @cb is called with @opaque on completion. */
+BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
+                           QEMUIOVector *qiov, int nb_sectors,
+                           BlockCompletionFunc *cb, void *opaque)
+{
+    BlockAIOCB *acb;
+
+    trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
+
+    acb = bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
+                                cb, opaque, false);
+    return acb;
+}
+
+/* Start an asynchronous write; @cb is called with @opaque on completion. */
+BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
+                            QEMUIOVector *qiov, int nb_sectors,
+                            BlockCompletionFunc *cb, void *opaque)
+{
+    BlockAIOCB *acb;
+
+    trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
+
+    acb = bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
+                                cb, opaque, true);
+    return acb;
+}
+
+/*
+ * Start an asynchronous zero write: a write request without a data vector
+ * and with BDRV_REQ_ZERO_WRITE added to @flags.
+ */
+BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
+                                  int64_t sector_num, int nb_sectors,
+                                  BdrvRequestFlags flags,
+                                  BlockCompletionFunc *cb, void *opaque)
+{
+    BlockAIOCB *acb;
+
+    trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
+
+    acb = bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
+                                BDRV_REQ_ZERO_WRITE | flags,
+                                cb, opaque, true);
+    return acb;
+}
+
+
+typedef struct MultiwriteCB { /* completion state shared by all requests of one bdrv_aio_multiwrite() call */
+ int error; /* first negative result reported to multiwrite_cb(), 0 if none */
+ int num_requests; /* submitted requests that have not completed yet */
+ int num_callbacks; /* number of entries in callbacks[] */
+ struct {
+ BlockCompletionFunc *cb; /* caller's completion function */
+ void *opaque; /* argument passed to cb */
+ QEMUIOVector *free_qiov; /* merged vector to destroy and free after cb has run, or NULL */
+ } callbacks[];
+} MultiwriteCB;
+
+/*
+ * Report mcb->error to every original caller and release the merged
+ * vector recorded for each entry, if any.
+ */
+static void multiwrite_user_cb(MultiwriteCB *mcb)
+{
+    int idx = 0;
+
+    while (idx < mcb->num_callbacks) {
+        QEMUIOVector *merged;
+
+        mcb->callbacks[idx].cb(mcb->callbacks[idx].opaque, mcb->error);
+
+        merged = mcb->callbacks[idx].free_qiov;
+        if (merged != NULL) {
+            qemu_iovec_destroy(merged);
+        }
+        g_free(merged);
+
+        idx++;
+    }
+}
+
+/*
+ * Completion callback for one submitted (possibly merged) request.  Keeps
+ * the first error and, once the last request is done, notifies the callers
+ * and frees the shared state.
+ */
+static void multiwrite_cb(void *opaque, int ret)
+{
+    MultiwriteCB *mcb = opaque;
+
+    trace_multiwrite_cb(mcb, ret);
+
+    if (!mcb->error && ret < 0) {
+        mcb->error = ret;
+    }
+
+    mcb->num_requests -= 1;
+    if (mcb->num_requests != 0) {
+        return;
+    }
+
+    multiwrite_user_cb(mcb);
+    g_free(mcb);
+}
+
+/* qsort() comparator: orders BlockRequests by ascending start sector. */
+static int multiwrite_req_compare(const void *a, const void *b)
+{
+    const BlockRequest *lhs = a;
+    const BlockRequest *rhs = b;
+
+    /*
+     * Compare instead of subtracting the sectors: the difference could
+     * overflow the return value.
+     */
+    return (lhs->sector > rhs->sector) - (lhs->sector < rhs->sector);
+}
+
+/*
+ * Takes a bunch of requests and tries to merge them. Returns the number of
+ * requests that remain after merging.
+ *
+ * @reqs is sorted by start sector and compacted in place, so the surviving
+ * requests occupy the first entries of the array. Every merged vector built
+ * here is recorded in @mcb so that multiwrite_user_cb() can release it.
+ */
+static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
+ int num_reqs, MultiwriteCB *mcb)
+{
+ int i, outidx;
+
+ // Sort requests by start sector
+ qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
+
+ // Check whether neighbouring requests are exactly sequential or overlap.
+ // If so, combine them; requests separated by a gap are never merged.
+ outidx = 0;
+ for (i = 1; i < num_reqs; i++) {
+ int merge = 0;
+ int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
+
+ // Handle exactly sequential writes and overlapping writes.
+ if (reqs[i].sector <= oldreq_last) {
+ merge = 1;
+ }
+
+ if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) { // merged vector would need too many elements
+ merge = 0;
+ }
+
+ if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors +
+ reqs[i].nb_sectors > bs->bl.max_transfer_length) {
+ merge = 0;
+ }
+
+ if (merge) {
+ size_t size;
+ QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
+ qemu_iovec_init(qiov,
+ reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
+
+ // Add the first request to the merged one. If the requests are
+ // overlapping, drop the last sectors of the first request.
+ size = (reqs[i].sector - reqs[outidx].sector) << 9; // bytes of the first request that precede the second one
+ qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
+
+ // We shouldn't need to add any zeros between the two requests
+ assert (reqs[i].sector <= oldreq_last);
+
+ // Add the second request
+ qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
+
+ // Add tail of first request, if necessary
+ if (qiov->size < reqs[outidx].qiov->size) {
+ qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size,
+ reqs[outidx].qiov->size - qiov->size);
+ }
+
+ reqs[outidx].nb_sectors = qiov->size >> 9;
+ reqs[outidx].qiov = qiov;
+
+ mcb->callbacks[i].free_qiov = reqs[outidx].qiov; // released by multiwrite_user_cb()
+ } else {
+ outidx++;
+ reqs[outidx].sector = reqs[i].sector;
+ reqs[outidx].nb_sectors = reqs[i].nb_sectors;
+ reqs[outidx].qiov = reqs[i].qiov;
+ }
+ }
+
+ block_acct_merge_done(&bs->stats, BLOCK_ACCT_WRITE, num_reqs - outidx - 1);
+
+ return outidx + 1;
+}
+
+/*
+ * Submit multiple AIO write requests at once.
+ *
+ * On success, the function returns 0 and all requests in the reqs array have
+ * been submitted. In error case this function returns -1, and any of the
+ * requests may or may not be submitted yet. In particular, this means that the
+ * callback will be called for some of the requests, for others it won't. The
+ * caller must check the error field of the BlockRequest to wait for the right
+ * callbacks (if error != 0, no callback will be called).
+ *
+ * The implementation may modify the contents of the reqs array, e.g. to merge
+ * requests. However, the fields opaque and error are left unmodified as they
+ * are used to signal failure for a single request to the caller.
+ */
+int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
+{
+    MultiwriteCB *mcb;
+    int merged_count;
+    int idx;
+
+    /* don't submit writes if we don't have a medium */
+    if (!bs->drv) {
+        for (idx = 0; idx < num_reqs; idx++) {
+            reqs[idx].error = -ENOMEDIUM;
+        }
+        return -1;
+    }
+
+    if (!num_reqs) {
+        return 0;
+    }
+
+    /* One callback slot per original request */
+    mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
+    mcb->num_requests = 0;
+    mcb->num_callbacks = num_reqs;
+
+    for (idx = 0; idx < num_reqs; idx++) {
+        mcb->callbacks[idx].cb = reqs[idx].cb;
+        mcb->callbacks[idx].opaque = reqs[idx].opaque;
+    }
+
+    /* Check for mergable requests */
+    merged_count = multiwrite_merge(bs, reqs, num_reqs, mcb);
+
+    trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, merged_count);
+
+    /* Run the aio requests. */
+    mcb->num_requests = merged_count;
+    for (idx = 0; idx < merged_count; idx++) {
+        bdrv_co_aio_rw_vector(bs, reqs[idx].sector, reqs[idx].qiov,
+                              reqs[idx].nb_sectors, reqs[idx].flags,
+                              multiwrite_cb, mcb,
+                              true);
+    }
+
+    return 0;
+}
+
+/*
+ * Cancel @acb and wait for it: takes an extra reference, requests
+ * asynchronous cancellation and polls the request's AioContext until that
+ * reference is the only one left.  Aborts if no AioContext can be found.
+ */
+void bdrv_aio_cancel(BlockAIOCB *acb)
+{
+    qemu_aio_ref(acb);
+    bdrv_aio_cancel_async(acb);
+
+    while (acb->refcnt > 1) {
+        AioContext *ctx;
+
+        if (acb->aiocb_info->get_aio_context) {
+            ctx = acb->aiocb_info->get_aio_context(acb);
+        } else if (acb->bs) {
+            ctx = bdrv_get_aio_context(acb->bs);
+        } else {
+            abort();
+        }
+        aio_poll(ctx, true);
+    }
+
+    qemu_aio_unref(acb);
+}
+
+/* Async version of aio cancel. The caller is not blocked if the acb implements
+ * cancel_async, otherwise we do nothing and let the request normally complete.
+ * In either case the completion callback must be called. */
+void bdrv_aio_cancel_async(BlockAIOCB *acb)
+{
+    const AIOCBInfo *info = acb->aiocb_info;
+
+    if (!info->cancel_async) {
+        return;
+    }
+    info->cancel_async(acb);
+}
+
+/**************************************************************/
+/* async block device emulation */
+
+typedef struct BlockAIOCBSync { /* request state used by bdrv_aio_rw_vector() and bdrv_aio_bh_cb() */
+ BlockAIOCB common;
+ QEMUBH *bh; /* bottom half that runs bdrv_aio_bh_cb() */
+ int ret; /* result handed to the completion callback */
+ /* vector translation state */
+ QEMUIOVector *qiov; /* caller's vector */
+ uint8_t *bounce; /* linear buffer of qiov->size bytes, or NULL if allocation failed */
+ int is_write;
+} BlockAIOCBSync;
+
+static const AIOCBInfo bdrv_em_aiocb_info = { /* passed to qemu_aio_get() for BlockAIOCBSync requests */
+ .aiocb_size = sizeof(BlockAIOCBSync),
+};
+
+/*
+ * Bottom half completing an emulated AIO request: copies read data from
+ * the bounce buffer back into the caller's vector, frees the bounce
+ * buffer, runs the completion callback and drops the request.
+ */
+static void bdrv_aio_bh_cb(void *opaque)
+{
+    BlockAIOCBSync *sync_acb = opaque;
+    bool read_succeeded = !sync_acb->is_write && sync_acb->ret >= 0;
+
+    if (read_succeeded) {
+        qemu_iovec_from_buf(sync_acb->qiov, 0, sync_acb->bounce,
+                            sync_acb->qiov->size);
+    }
+    qemu_vfree(sync_acb->bounce);
+
+    sync_acb->common.cb(sync_acb->common.opaque, sync_acb->ret);
+
+    qemu_bh_delete(sync_acb->bh);
+    sync_acb->bh = NULL;
+    qemu_aio_unref(sync_acb);
+}
+
+/*
+ * Emulate an AIO request with the driver's bdrv_read/bdrv_write callbacks:
+ * the transfer goes through a linear bounce buffer and runs before this
+ * function returns; the completion callback is invoked later from a
+ * bottom half.  A failed bounce allocation completes with -ENOMEM.
+ */
+static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
+                                      int64_t sector_num,
+                                      QEMUIOVector *qiov,
+                                      int nb_sectors,
+                                      BlockCompletionFunc *cb,
+                                      void *opaque,
+                                      int is_write)
+
+{
+    BlockAIOCBSync *acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
+
+    acb->is_write = is_write;
+    acb->qiov = qiov;
+    acb->bounce = qemu_try_blockalign(bs, qiov->size);
+    acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);
+
+    if (!acb->bounce) {
+        acb->ret = -ENOMEM;
+    } else if (!is_write) {
+        acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
+    } else {
+        qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
+        acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
+    }
+
+    qemu_bh_schedule(acb->bh);
+
+    return &acb->common;
+}
+
+/* Read flavour of bdrv_aio_rw_vector() (is_write == 0). */
+static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
+                                     int64_t sector_num, QEMUIOVector *qiov,
+                                     int nb_sectors,
+                                     BlockCompletionFunc *cb, void *opaque)
+{
+    const int is_write = 0;
+
+    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque,
+                              is_write);
+}
+
+/* Write flavour of bdrv_aio_rw_vector() (is_write == 1). */
+static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
+                                      int64_t sector_num, QEMUIOVector *qiov,
+                                      int nb_sectors,
+                                      BlockCompletionFunc *cb, void *opaque)
+{
+    const int is_write = 1;
+
+    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque,
+                              is_write);
+}
+
+
+typedef struct BlockAIOCBCoroutine { /* request state for the coroutine-backed AIO functions below */
+ BlockAIOCB common;
+ BlockRequest req; /* request parameters; req.error holds -EINPROGRESS until the result is known */
+ bool is_write;
+ bool need_bh; /* while true, bdrv_co_complete() does nothing; cleared by bdrv_co_maybe_schedule_bh() */
+ bool *done;
+ QEMUBH* bh; /* bottom half created by bdrv_co_maybe_schedule_bh() */
+} BlockAIOCBCoroutine;
+
+static const AIOCBInfo bdrv_em_co_aiocb_info = { /* passed to qemu_aio_get() for BlockAIOCBCoroutine requests */
+ .aiocb_size = sizeof(BlockAIOCBCoroutine),
+};
+
+/*
+ * Deliver the result of @acb to its callback and drop the request, unless
+ * need_bh is still set, in which case nothing happens here.
+ */
+static void bdrv_co_complete(BlockAIOCBCoroutine *acb)
+{
+    if (acb->need_bh) {
+        return;
+    }
+
+    acb->common.cb(acb->common.opaque, acb->req.error);
+    qemu_aio_unref(acb);
+}
+
+/* Bottom half: deletes itself and completes the request it belongs to. */
+static void bdrv_co_em_bh(void *opaque)
+{
+    BlockAIOCBCoroutine *co_acb = opaque;
+
+    assert(!co_acb->need_bh);
+
+    qemu_bh_delete(co_acb->bh);
+    bdrv_co_complete(co_acb);
+}
+
+/*
+ * Clear need_bh and, if the request already has a result (req.error is no
+ * longer -EINPROGRESS), schedule a bottom half that completes it.
+ */
+static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb)
+{
+    acb->need_bh = false;
+
+    if (acb->req.error == -EINPROGRESS) {
+        return;
+    }
+
+    acb->bh = aio_bh_new(bdrv_get_aio_context(acb->common.bs),
+                         bdrv_co_em_bh, acb);
+    qemu_bh_schedule(acb->bh);
+}
+
+/*
+ * Coroutine entry point: runs bdrv_co_do_readv() or bdrv_co_do_writev()
+ * for the request stored in the acb, records the result and completes it.
+ */
+static void coroutine_fn bdrv_co_do_rw(void *opaque)
+{
+    BlockAIOCBCoroutine *acb = opaque;
+    BlockDriverState *bs = acb->common.bs;
+    BlockRequest *rq = &acb->req;
+
+    if (acb->is_write) {
+        rq->error = bdrv_co_do_writev(bs, rq->sector, rq->nb_sectors,
+                                      rq->qiov, rq->flags);
+    } else {
+        rq->error = bdrv_co_do_readv(bs, rq->sector, rq->nb_sectors,
+                                     rq->qiov, rq->flags);
+    }
+
+    bdrv_co_complete(acb);
+}
+
+/*
+ * Start a read or write as an AIO request backed by a coroutine running
+ * bdrv_co_do_rw().  Returns the new request's BlockAIOCB.
+ */
+static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
+                                         int64_t sector_num,
+                                         QEMUIOVector *qiov,
+                                         int nb_sectors,
+                                         BdrvRequestFlags flags,
+                                         BlockCompletionFunc *cb,
+                                         void *opaque,
+                                         bool is_write)
+{
+    BlockAIOCBCoroutine *acb;
+    Coroutine *co;
+
+    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
+
+    /* Completion is deferred until bdrv_co_maybe_schedule_bh() below */
+    acb->need_bh = true;
+    acb->is_write = is_write;
+
+    acb->req.error = -EINPROGRESS;
+    acb->req.sector = sector_num;
+    acb->req.nb_sectors = nb_sectors;
+    acb->req.qiov = qiov;
+    acb->req.flags = flags;
+
+    co = qemu_coroutine_create(bdrv_co_do_rw);
+    qemu_coroutine_enter(co, acb);
+
+    bdrv_co_maybe_schedule_bh(acb);
+    return &acb->common;
+}
+
+/* Coroutine entry point for bdrv_aio_flush(): flush, record, complete. */
+static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
+{
+    BlockAIOCBCoroutine *flush_acb = opaque;
+
+    flush_acb->req.error = bdrv_co_flush(flush_acb->common.bs);
+    bdrv_co_complete(flush_acb);
+}
+
+/*
+ * Start an asynchronous flush of @bs backed by a coroutine running
+ * bdrv_aio_flush_co_entry().  Returns the new request's BlockAIOCB.
+ */
+BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
+                           BlockCompletionFunc *cb, void *opaque)
+{
+    BlockAIOCBCoroutine *acb;
+    Coroutine *co;
+
+    trace_bdrv_aio_flush(bs, opaque);
+
+    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
+    acb->need_bh = true;
+    acb->req.error = -EINPROGRESS;
+
+    co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
+    qemu_coroutine_enter(co, acb);
+
+    bdrv_co_maybe_schedule_bh(acb);
+    return &acb->common;
+}
+
+/*
+ * Coroutine entry point for bdrv_aio_discard(): discard the sector range
+ * recorded in the request, store the result in acb->req.error and call
+ * bdrv_co_complete().
+ */
+static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
+{
+    BlockAIOCBCoroutine *acb = opaque;
+
+    acb->req.error = bdrv_co_discard(acb->common.bs, acb->req.sector,
+                                     acb->req.nb_sectors);
+    bdrv_co_complete(acb);
+}
+
+/*
+ * Start an asynchronous discard of @nb_sectors sectors at @sector_num on @bs.
+ *
+ * Allocates a BlockAIOCBCoroutine with @cb/@opaque, records the range in it,
+ * enters a new coroutine running bdrv_aio_discard_co_entry() and then calls
+ * bdrv_co_maybe_schedule_bh().  Returns the embedded BlockAIOCB.
+ */
+BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs,
+                             int64_t sector_num, int nb_sectors,
+                             BlockCompletionFunc *cb, void *opaque)
+{
+    BlockAIOCBCoroutine *acb;
+
+    trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
+
+    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
+    acb->req.sector = sector_num;
+    acb->req.nb_sectors = nb_sectors;
+    acb->req.error = -EINPROGRESS;
+    acb->need_bh = true;
+
+    qemu_coroutine_enter(qemu_coroutine_create(bdrv_aio_discard_co_entry),
+                         acb);
+
+    bdrv_co_maybe_schedule_bh(acb);
+    return &acb->common;
+}
+
+/*
+ * Allocate an AIOCB of aiocb_info->aiocb_size bytes and fill in its
+ * BlockAIOCB header with @aiocb_info, @bs, @cb and @opaque.  The returned
+ * object starts with a reference count of 1; only the header fields assigned
+ * here are initialised by this function.
+ */
+void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
+                   BlockCompletionFunc *cb, void *opaque)
+{
+    BlockAIOCB *acb = g_slice_alloc(aiocb_info->aiocb_size);
+
+    acb->refcnt = 1;
+    acb->aiocb_info = aiocb_info;
+    acb->bs = bs;
+    acb->cb = cb;
+    acb->opaque = opaque;
+
+    return acb;
+}
+
+/* Take an additional reference on the AIOCB @p. */
+void qemu_aio_ref(void *p)
+{
+    BlockAIOCB *acb = p;
+
+    acb->refcnt += 1;
+}
+
+/*
+ * Drop one reference on the AIOCB @p (which must hold at least one) and
+ * release its memory once the count reaches zero.
+ */
+void qemu_aio_unref(void *p)
+{
+    BlockAIOCB *acb = p;
+
+    assert(acb->refcnt > 0);
+    acb->refcnt--;
+    if (acb->refcnt != 0) {
+        return;
+    }
+
+    /* Last reference gone: free with the size it was allocated with. */
+    g_slice_free1(acb->aiocb_info->aiocb_size, acb);
+}
+
+/**************************************************************/
+/* Coroutine block device emulation */
+
+typedef struct CoroutineIOCompletion {
+ Coroutine *coroutine; /* coroutine re-entered by bdrv_co_io_em_complete() */
+ int ret; /* completion status stored by bdrv_co_io_em_complete() */
+} CoroutineIOCompletion;
+
+/*
+ * AIO completion callback used by the coroutine emulation: record @ret in
+ * the CoroutineIOCompletion passed as @opaque and re-enter its coroutine.
+ */
+static void bdrv_co_io_em_complete(void *opaque, int ret)
+{
+    CoroutineIOCompletion *completion = opaque;
+
+    completion->ret = ret;
+    qemu_coroutine_enter(completion->coroutine, NULL);
+}
+
+/*
+ * Emulate a coroutine read/write on top of the driver's AIO callbacks.
+ *
+ * Submits the request through bdrv_aio_readv or bdrv_aio_writev (chosen by
+ * @is_write) with bdrv_co_io_em_complete() as completion callback, yields,
+ * and returns the status stored by that callback.  Returns -EIO if the
+ * driver returned no AIOCB.
+ */
+static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
+                                      int nb_sectors, QEMUIOVector *iov,
+                                      bool is_write)
+{
+    BlockAIOCB *acb;
+    CoroutineIOCompletion completion = {
+        .coroutine = qemu_coroutine_self(),
+    };
+
+    if (!is_write) {
+        acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
+                                      bdrv_co_io_em_complete, &completion);
+    } else {
+        acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
+                                       bdrv_co_io_em_complete, &completion);
+    }
+
+    trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
+    if (acb == NULL) {
+        return -EIO;
+    }
+
+    /* Resumed by bdrv_co_io_em_complete(). */
+    qemu_coroutine_yield();
+    return completion.ret;
+}
+
+/* Coroutine read emulated through bdrv_co_io_em() with is_write == false. */
+static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
+                                         int64_t sector_num, int nb_sectors,
+                                         QEMUIOVector *iov)
+{
+    const bool is_write = false;
+
+    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, is_write);
+}
+
+/* Coroutine write emulated through bdrv_co_io_em() with is_write == true. */
+static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
+                                          int64_t sector_num, int nb_sectors,
+                                          QEMUIOVector *iov)
+{
+    const bool is_write = true;
+
+    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, is_write);
+}
+
+/*
+ * Coroutine entry point for bdrv_flush(): flush rwco->bs and store the
+ * result in rwco->ret.
+ */
+static void coroutine_fn bdrv_flush_co_entry(void *opaque)
+{
+    RwCo *rwco = opaque;
+    BlockDriverState *bs = rwco->bs;
+
+    rwco->ret = bdrv_co_flush(bs);
+}
+
+/*
+ * Flush @bs and then, recursively, bs->file.
+ *
+ * Returns 0 without doing anything when @bs is NULL, has no medium inserted
+ * or is read-only.  Otherwise the driver's flush-to-OS callback is invoked
+ * (if present), followed by its flush-to-disk or AIO flush callback unless
+ * BDRV_O_NO_FLUSH is set.  The first negative result is returned; on success
+ * the result of flushing bs->file is returned.
+ */
+int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
+{
+    int ret;
+
+    if (!bs) {
+        return 0;
+    }
+    if (!bdrv_is_inserted(bs)) {
+        return 0;
+    }
+    if (bdrv_is_read_only(bs)) {
+        return 0;
+    }
+
+    /* Cached data is written back to the OS even with cache=unsafe. */
+    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
+    if (bs->drv->bdrv_co_flush_to_os) {
+        ret = bs->drv->bdrv_co_flush_to_os(bs);
+        if (ret < 0) {
+            return ret;
+        }
+    }
+
+    /* With cache=unsafe the data is not forced out to the disk, though. */
+    if (!(bs->open_flags & BDRV_O_NO_FLUSH)) {
+        BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
+        if (bs->drv->bdrv_co_flush_to_disk) {
+            ret = bs->drv->bdrv_co_flush_to_disk(bs);
+        } else if (bs->drv->bdrv_aio_flush) {
+            BlockAIOCB *acb;
+            CoroutineIOCompletion completion = {
+                .coroutine = qemu_coroutine_self(),
+            };
+
+            acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete,
+                                          &completion);
+            if (acb != NULL) {
+                /* Resumed by bdrv_co_io_em_complete(). */
+                qemu_coroutine_yield();
+                ret = completion.ret;
+            } else {
+                ret = -EIO;
+            }
+        } else {
+            /*
+             * The driver offers no flush at all.  Such drivers always run
+             * in either writethrough or unsafe mode, and qemu usually cannot
+             * tell which (the behaviour is hardcoded or depends on
+             * server-side configuration), so it cannot guarantee that the
+             * data is safe on disk.  Failing the request is not an option
+             * either, because that would break guests even when the server
+             * operates in writethrough mode.  Report success and rely on
+             * the user having configured things sensibly.
+             */
+            ret = 0;
+        }
+        if (ret < 0) {
+            return ret;
+        }
+    }
+
+    /*
+     * Finally flush the underlying protocol.  With cache=unsafe it carries
+     * BDRV_O_NO_FLUSH as well, so no useless flushes are issued.
+     */
+    return bdrv_co_flush(bs->file);
+}
+
+/*
+ * Synchronous flush of @bs.
+ *
+ * Inside a coroutine bdrv_flush_co_entry() is called directly.  Otherwise a
+ * new coroutine is entered and the AioContext of @bs is polled until the
+ * result is no longer NOT_DONE.  Returns the result of bdrv_co_flush().
+ */
+int bdrv_flush(BlockDriverState *bs)
+{
+    RwCo rwco = {
+        .bs = bs,
+        .ret = NOT_DONE,
+    };
+
+    if (!qemu_in_coroutine()) {
+        AioContext *aio_context = bdrv_get_aio_context(bs);
+        Coroutine *co = qemu_coroutine_create(bdrv_flush_co_entry);
+
+        qemu_coroutine_enter(co, &rwco);
+        while (rwco.ret == NOT_DONE) {
+            aio_poll(aio_context, true);
+        }
+    } else {
+        /* Already in coroutine context: take the fast path. */
+        bdrv_flush_co_entry(&rwco);
+    }
+
+    return rwco.ret;
+}
+
+typedef struct DiscardCo {
+ BlockDriverState *bs; /* device passed to bdrv_co_discard() */
+ int64_t sector_num; /* first sector of the range to discard */
+ int nb_sectors; /* number of sectors to discard */
+ int ret; /* result of bdrv_co_discard(); NOT_DONE until it has run */
+} DiscardCo;
+/*
+ * Coroutine entry point for bdrv_discard(): run bdrv_co_discard() on the
+ * range described by the DiscardCo in @opaque and store the result in it.
+ */
+static void coroutine_fn bdrv_discard_co_entry(void *opaque)
+{
+    DiscardCo *dco = opaque;
+
+    dco->ret = bdrv_co_discard(dco->bs, dco->sector_num, dco->nb_sectors);
+}
+
+/*
+ * Discard @nb_sectors sectors starting at @sector_num on @bs.
+ *
+ * Returns -ENOMEDIUM if @bs has no driver, the error from
+ * bdrv_check_request() if the range is rejected, -EROFS if @bs is
+ * read-only, -EIO if the driver's AIO discard returns no AIOCB, or the first
+ * driver error other than -ENOTSUP.  Returns 0 otherwise, including when
+ * BDRV_O_UNMAP is not set or the driver implements no discard callback.
+ *
+ * The range is marked clean in the dirty tracking before anything else and
+ * is then handed to the driver in pieces of at most max_discard sectors.
+ */
+int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
+                                 int nb_sectors)
+{
+    int max_discard, ret;
+
+    if (!bs->drv) {
+        return -ENOMEDIUM;
+    }
+
+    ret = bdrv_check_request(bs, sector_num, nb_sectors);
+    if (ret < 0) {
+        return ret;
+    } else if (bs->read_only) {
+        return -EROFS;
+    }
+
+    bdrv_reset_dirty(bs, sector_num, nb_sectors);
+
+    /* Do nothing if disabled. */
+    if (!(bs->open_flags & BDRV_O_UNMAP)) {
+        return 0;
+    }
+
+    if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
+        return 0;
+    }
+
+    max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS);
+    while (nb_sectors > 0) {
+        int num = nb_sectors;
+
+        /*
+         * Align request: if the start is misaligned, shorten this piece so
+         * that the next one begins on a discard_alignment boundary.
+         */
+        if (bs->bl.discard_alignment &&
+            num >= bs->bl.discard_alignment &&
+            sector_num % bs->bl.discard_alignment) {
+            if (num > bs->bl.discard_alignment) {
+                num = bs->bl.discard_alignment;
+            }
+            num -= sector_num % bs->bl.discard_alignment;
+        }
+
+        /* limit request size */
+        if (num > max_discard) {
+            num = max_discard;
+        }
+
+        if (bs->drv->bdrv_co_discard) {
+            ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
+        } else {
+            BlockAIOCB *acb;
+            CoroutineIOCompletion co = {
+                .coroutine = qemu_coroutine_self(),
+            };
+
+            /*
+             * Submit only this piece (num sectors), exactly like the
+             * coroutine path above; passing the whole remaining range would
+             * bypass the alignment and max_discard limits.
+             */
+            acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
+                                            bdrv_co_io_em_complete, &co);
+            if (acb == NULL) {
+                return -EIO;
+            } else {
+                /* Resumed by bdrv_co_io_em_complete(). */
+                qemu_coroutine_yield();
+                ret = co.ret;
+            }
+        }
+        /* -ENOTSUP from the driver is not treated as a failure. */
+        if (ret && ret != -ENOTSUP) {
+            return ret;
+        }
+
+        sector_num += num;
+        nb_sectors -= num;
+    }
+    return 0;
+}
+
+/*
+ * Synchronous discard of @nb_sectors sectors at @sector_num on @bs.
+ *
+ * Inside a coroutine bdrv_discard_co_entry() is called directly.  Otherwise
+ * a new coroutine is entered and the AioContext of @bs is polled until the
+ * result is no longer NOT_DONE.  Returns the result of bdrv_co_discard().
+ */
+int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
+{
+    DiscardCo dco = {
+        .bs = bs,
+        .sector_num = sector_num,
+        .nb_sectors = nb_sectors,
+        .ret = NOT_DONE,
+    };
+
+    if (!qemu_in_coroutine()) {
+        AioContext *aio_context = bdrv_get_aio_context(bs);
+        Coroutine *co = qemu_coroutine_create(bdrv_discard_co_entry);
+
+        qemu_coroutine_enter(co, &dco);
+        while (dco.ret == NOT_DONE) {
+            aio_poll(aio_context, true);
+        }
+    } else {
+        /* Already in coroutine context: take the fast path. */
+        bdrv_discard_co_entry(&dco);
+    }
+
+    return dco.ret;
+}
+
+/* needed for generic scsi interface */
+
+/*
+ * Forward an ioctl to the driver of @bs.  Returns the driver's result, or
+ * -ENOTSUP if @bs has no driver or the driver has no bdrv_ioctl callback.
+ */
+int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
+{
+    BlockDriver *drv = bs->drv;
+
+    if (!drv || !drv->bdrv_ioctl) {
+        return -ENOTSUP;
+    }
+
+    return drv->bdrv_ioctl(bs, req, buf);
+}
+
+/*
+ * Forward an asynchronous ioctl to the driver of @bs.  Returns the driver's
+ * AIOCB, or NULL if @bs has no driver or the driver has no bdrv_aio_ioctl
+ * callback.
+ */
+BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
+                           unsigned long int req, void *buf,
+                           BlockCompletionFunc *cb, void *opaque)
+{
+    BlockDriver *drv = bs->drv;
+
+    if (!drv || !drv->bdrv_aio_ioctl) {
+        return NULL;
+    }
+
+    return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
+}
+
+/* Allocate @size bytes aligned to bdrv_opt_mem_align(@bs). */
+void *qemu_blockalign(BlockDriverState *bs, size_t size)
+{
+    size_t align = bdrv_opt_mem_align(bs);
+
+    return qemu_memalign(align, size);
+}
+
+/* Like qemu_blockalign(), with the first @size bytes set to zero. */
+void *qemu_blockalign0(BlockDriverState *bs, size_t size)
+{
+    void *mem = qemu_blockalign(bs, size);
+
+    memset(mem, 0, size);
+    return mem;
+}
+
+/*
+ * Try to allocate @size bytes aligned to bdrv_opt_mem_align(@bs) through
+ * qemu_try_memalign().  A @size of 0 is replaced by one alignment unit so
+ * that a successful call never yields NULL.
+ */
+void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
+{
+    size_t align = bdrv_opt_mem_align(bs);
+
+    /* Ensure that NULL is never returned on success */
+    assert(align > 0);
+
+    return qemu_try_memalign(align, size != 0 ? size : align);
+}
+
+/*
+ * Like qemu_try_blockalign(), with the first @size bytes set to zero when
+ * the allocation succeeds.  Returns NULL if the allocation failed.
+ */
+void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
+{
+    void *mem = qemu_try_blockalign(bs, size);
+
+    if (!mem) {
+        return NULL;
+    }
+
+    memset(mem, 0, size);
+    return mem;
+}
+
+/*
+ * Check whether every element of @qiov is aligned for @bs: both the base
+ * address and the length of each iovec must be a multiple of
+ * bdrv_opt_mem_align(@bs).
+ */
+bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
+{
+    size_t alignment = bdrv_opt_mem_align(bs);
+    int idx = 0;
+
+    while (idx < qiov->niov) {
+        if ((uintptr_t) qiov->iov[idx].iov_base % alignment ||
+            qiov->iov[idx].iov_len % alignment) {
+            return false;
+        }
+        idx++;
+    }
+
+    return true;
+}
+
+/* Append @notifier to the before_write_notifiers list of @bs. */
+void bdrv_add_before_write_notifier(BlockDriverState *bs,
+                                    NotifierWithReturn *notifier)
+{
+    notifier_with_return_list_add(
+        &bs->before_write_notifiers,
+        notifier);
+}
+
+/*
+ * Call the driver's bdrv_io_plug callback for @bs; if the driver has none,
+ * recurse into bs->file when it is set.
+ */
+void bdrv_io_plug(BlockDriverState *bs)
+{
+    BlockDriver *drv = bs->drv;
+
+    if (drv && drv->bdrv_io_plug) {
+        drv->bdrv_io_plug(bs);
+        return;
+    }
+
+    if (bs->file) {
+        bdrv_io_plug(bs->file);
+    }
+}
+
+/*
+ * Call the driver's bdrv_io_unplug callback for @bs; if the driver has none,
+ * recurse into bs->file when it is set.
+ */
+void bdrv_io_unplug(BlockDriverState *bs)
+{
+    BlockDriver *drv = bs->drv;
+
+    if (drv && drv->bdrv_io_unplug) {
+        drv->bdrv_io_unplug(bs);
+        return;
+    }
+
+    if (bs->file) {
+        bdrv_io_unplug(bs->file);
+    }
+}
+
+/*
+ * Call the driver's bdrv_flush_io_queue callback for @bs; if the driver has
+ * none, recurse into bs->file when it is set.
+ */
+void bdrv_flush_io_queue(BlockDriverState *bs)
+{
+    BlockDriver *drv = bs->drv;
+
+    if (drv && drv->bdrv_flush_io_queue) {
+        drv->bdrv_flush_io_queue(bs);
+        return;
+    }
+
+    if (bs->file) {
+        bdrv_flush_io_queue(bs->file);
+    }
+}
diff --git a/block/iscsi.c b/block/iscsi.c
index ba33290000..8fca1d32cb 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -2,7 +2,7 @@
* QEMU Block driver for iSCSI images
*
* Copyright (c) 2010-2011 Ronnie Sahlberg <ronniesahlberg@gmail.com>
- * Copyright (c) 2012-2014 Peter Lieven <pl@kamp.de>
+ * Copyright (c) 2012-2015 Peter Lieven <pl@kamp.de>
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
@@ -57,9 +57,6 @@ typedef struct IscsiLun {
int events;
QEMUTimer *nop_timer;
QEMUTimer *event_timer;
- uint8_t lbpme;
- uint8_t lbprz;
- uint8_t has_write_same;
struct scsi_inquiry_logical_block_provisioning lbp;
struct scsi_inquiry_block_limits bl;
unsigned char *zeroblock;
@@ -67,6 +64,11 @@ typedef struct IscsiLun {
int cluster_sectors;
bool use_16_for_rw;
bool write_protected;
+ bool lbpme;
+ bool lbprz;
+ bool dpofua;
+ bool has_write_same;
+ bool force_next_flush;
} IscsiLun;
typedef struct IscsiTask {
@@ -79,6 +81,7 @@ typedef struct IscsiTask {
QEMUBH *bh;
IscsiLun *iscsilun;
QEMUTimer retry_timer;
+ bool force_next_flush;
} IscsiTask;
typedef struct IscsiAIOCB {
@@ -100,7 +103,7 @@ typedef struct IscsiAIOCB {
#define NOP_INTERVAL 5000
#define MAX_NOP_FAILURES 3
#define ISCSI_CMD_RETRIES ARRAY_SIZE(iscsi_retry_times)
-static const unsigned iscsi_retry_times[] = {8, 32, 128, 512, 2048};
+static const unsigned iscsi_retry_times[] = {8, 32, 128, 512, 2048, 8192, 32768};
/* this threshold is a trade-off knob to choose between
* the potential additional overhead of an extra GET_LBA_STATUS request
@@ -183,10 +186,13 @@ iscsi_co_generic_cb(struct iscsi_context *iscsi, int status,
iTask->do_retry = 1;
goto out;
}
- if (status == SCSI_STATUS_BUSY) {
+ /* status 0x28 is SCSI_TASK_SET_FULL. It was first introduced
+ * in libiscsi 1.10.0. Hardcode this value here to avoid
+ * the need to bump the libiscsi requirement to 1.10.0 */
+ if (status == SCSI_STATUS_BUSY || status == 0x28) {
unsigned retry_time =
exp_random(iscsi_retry_times[iTask->retries - 1]);
- error_report("iSCSI Busy (retry #%u in %u ms): %s",
+ error_report("iSCSI Busy/TaskSetFull (retry #%u in %u ms): %s",
iTask->retries, retry_time,
iscsi_get_error(iscsi));
aio_timer_init(iTask->iscsilun->aio_context,
@@ -199,6 +205,8 @@ iscsi_co_generic_cb(struct iscsi_context *iscsi, int status,
}
}
error_report("iSCSI Failure: %s", iscsi_get_error(iscsi));
+ } else {
+ iTask->iscsilun->force_next_flush |= iTask->force_next_flush;
}
out:
@@ -369,6 +377,7 @@ static int coroutine_fn iscsi_co_writev(BlockDriverState *bs,
struct IscsiTask iTask;
uint64_t lba;
uint32_t num_sectors;
+ int fua;
if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
return -EINVAL;
@@ -384,15 +393,17 @@ static int coroutine_fn iscsi_co_writev(BlockDriverState *bs,
num_sectors = sector_qemu2lun(nb_sectors, iscsilun);
iscsi_co_init_iscsitask(iscsilun, &iTask);
retry:
+ fua = iscsilun->dpofua && !bs->enable_write_cache;
+ iTask.force_next_flush = !fua;
if (iscsilun->use_16_for_rw) {
iTask.task = iscsi_write16_task(iscsilun->iscsi, iscsilun->lun, lba,
NULL, num_sectors * iscsilun->block_size,
- iscsilun->block_size, 0, 0, 0, 0, 0,
+ iscsilun->block_size, 0, 0, fua, 0, 0,
iscsi_co_generic_cb, &iTask);
} else {
iTask.task = iscsi_write10_task(iscsilun->iscsi, iscsilun->lun, lba,
NULL, num_sectors * iscsilun->block_size,
- iscsilun->block_size, 0, 0, 0, 0, 0,
+ iscsilun->block_size, 0, 0, fua, 0, 0,
iscsi_co_generic_cb, &iTask);
}
if (iTask.task == NULL) {
@@ -460,7 +471,7 @@ static int64_t coroutine_fn iscsi_co_get_block_status(BlockDriverState *bs,
*pnum = nb_sectors;
/* LUN does not support logical block provisioning */
- if (iscsilun->lbpme == 0) {
+ if (!iscsilun->lbpme) {
goto out;
}
@@ -620,8 +631,12 @@ static int coroutine_fn iscsi_co_flush(BlockDriverState *bs)
return 0;
}
- iscsi_co_init_iscsitask(iscsilun, &iTask);
+ if (!iscsilun->force_next_flush) {
+ return 0;
+ }
+ iscsilun->force_next_flush = false;
+ iscsi_co_init_iscsitask(iscsilun, &iTask);
retry:
if (iscsi_synchronizecache10_task(iscsilun->iscsi, iscsilun->lun, 0, 0, 0,
0, iscsi_co_generic_cb, &iTask) == NULL) {
@@ -917,6 +932,7 @@ coroutine_fn iscsi_co_write_zeroes(BlockDriverState *bs, int64_t sector_num,
}
iscsi_co_init_iscsitask(iscsilun, &iTask);
+ iTask.force_next_flush = true;
retry:
if (use_16_for_ws) {
iTask.task = iscsi_writesame16_task(iscsilun->iscsi, iscsilun->lun, lba,
@@ -1121,8 +1137,8 @@ static void iscsi_readcapacity_sync(IscsiLun *iscsilun, Error **errp)
} else {
iscsilun->block_size = rc16->block_length;
iscsilun->num_blocks = rc16->returned_lba + 1;
- iscsilun->lbpme = rc16->lbpme;
- iscsilun->lbprz = rc16->lbprz;
+ iscsilun->lbpme = !!rc16->lbpme;
+ iscsilun->lbprz = !!rc16->lbprz;
iscsilun->use_16_for_rw = (rc16->returned_lba > 0xffffffff);
}
}
@@ -1253,11 +1269,12 @@ static void iscsi_attach_aio_context(BlockDriverState *bs,
iscsi_timed_set_events, iscsilun);
}
-static bool iscsi_is_write_protected(IscsiLun *iscsilun)
+static void iscsi_modesense_sync(IscsiLun *iscsilun)
{
struct scsi_task *task;
struct scsi_mode_sense *ms = NULL;
- bool wrprotected = false;
+ iscsilun->write_protected = false;
+ iscsilun->dpofua = false;
task = iscsi_modesense6_sync(iscsilun->iscsi, iscsilun->lun,
1, SCSI_MODESENSE_PC_CURRENT,
@@ -1278,13 +1295,13 @@ static bool iscsi_is_write_protected(IscsiLun *iscsilun)
iscsi_get_error(iscsilun->iscsi));
goto out;
}
- wrprotected = ms->device_specific_parameter & 0x80;
+ iscsilun->write_protected = ms->device_specific_parameter & 0x80;
+ iscsilun->dpofua = ms->device_specific_parameter & 0x10;
out:
if (task) {
scsi_free_scsi_task(task);
}
- return wrprotected;
}
/*
@@ -1403,7 +1420,8 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags,
scsi_free_scsi_task(task);
task = NULL;
- iscsilun->write_protected = iscsi_is_write_protected(iscsilun);
+ iscsi_modesense_sync(iscsilun);
+
/* Check the write protect flag of the LUN if we want to write */
if (iscsilun->type == TYPE_DISK && (flags & BDRV_O_RDWR) &&
iscsilun->write_protected) {
@@ -1481,7 +1499,7 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags,
iscsilun->bl.opt_unmap_gran * iscsilun->block_size <= 16 * 1024 * 1024) {
iscsilun->cluster_sectors = (iscsilun->bl.opt_unmap_gran *
iscsilun->block_size) >> BDRV_SECTOR_BITS;
- if (iscsilun->lbprz && !(bs->open_flags & BDRV_O_NOCACHE)) {
+ if (iscsilun->lbprz) {
iscsilun->allocationmap = iscsi_allocationmap_init(iscsilun);
if (iscsilun->allocationmap == NULL) {
ret = -ENOMEM;
@@ -1501,6 +1519,9 @@ out:
if (ret) {
if (iscsi != NULL) {
+ if (iscsi_is_logged_in(iscsi)) {
+ iscsi_logout_sync(iscsi);
+ }
iscsi_destroy_context(iscsi);
}
memset(iscsilun, 0, sizeof(IscsiLun));
@@ -1514,6 +1535,9 @@ static void iscsi_close(BlockDriverState *bs)
struct iscsi_context *iscsi = iscsilun->iscsi;
iscsi_detach_aio_context(bs);
+ if (iscsi_is_logged_in(iscsi)) {
+ iscsi_logout_sync(iscsi);
+ }
iscsi_destroy_context(iscsi);
g_free(iscsilun->zeroblock);
g_free(iscsilun->allocationmap);
@@ -1649,7 +1673,7 @@ out:
static int iscsi_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
IscsiLun *iscsilun = bs->opaque;
- bdi->unallocated_blocks_are_zero = !!iscsilun->lbprz;
+ bdi->unallocated_blocks_are_zero = iscsilun->lbprz;
bdi->can_write_zeroes_with_unmap = iscsilun->lbprz && iscsilun->lbp.lbpws;
bdi->cluster_size = iscsilun->cluster_sectors * BDRV_SECTOR_SIZE;
return 0;
diff --git a/block/mirror.c b/block/mirror.c
index 405616422b..58f391a6d6 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -125,11 +125,9 @@ static void mirror_write_complete(void *opaque, int ret)
MirrorOp *op = opaque;
MirrorBlockJob *s = op->s;
if (ret < 0) {
- BlockDriverState *source = s->common.bs;
BlockErrorAction action;
- bdrv_set_dirty_bitmap(source, s->dirty_bitmap, op->sector_num,
- op->nb_sectors);
+ bdrv_set_dirty_bitmap(s->dirty_bitmap, op->sector_num, op->nb_sectors);
action = mirror_error_action(s, false, -ret);
if (action == BLOCK_ERROR_ACTION_REPORT && s->ret >= 0) {
s->ret = ret;
@@ -143,11 +141,9 @@ static void mirror_read_complete(void *opaque, int ret)
MirrorOp *op = opaque;
MirrorBlockJob *s = op->s;
if (ret < 0) {
- BlockDriverState *source = s->common.bs;
BlockErrorAction action;
- bdrv_set_dirty_bitmap(source, s->dirty_bitmap, op->sector_num,
- op->nb_sectors);
+ bdrv_set_dirty_bitmap(s->dirty_bitmap, op->sector_num, op->nb_sectors);
action = mirror_error_action(s, true, -ret);
if (action == BLOCK_ERROR_ACTION_REPORT && s->ret >= 0) {
s->ret = ret;
@@ -170,10 +166,9 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
s->sector_num = hbitmap_iter_next(&s->hbi);
if (s->sector_num < 0) {
- bdrv_dirty_iter_init(source, s->dirty_bitmap, &s->hbi);
+ bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi);
s->sector_num = hbitmap_iter_next(&s->hbi);
- trace_mirror_restart_iter(s,
- bdrv_get_dirty_count(source, s->dirty_bitmap));
+ trace_mirror_restart_iter(s, bdrv_get_dirty_count(s->dirty_bitmap));
assert(s->sector_num >= 0);
}
@@ -288,8 +283,7 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
next_sector += sectors_per_chunk;
}
- bdrv_reset_dirty_bitmap(source, s->dirty_bitmap, sector_num,
- nb_sectors);
+ bdrv_reset_dirty_bitmap(s->dirty_bitmap, sector_num, nb_sectors);
/* Copy the dirty cluster. */
s->in_flight++;
@@ -446,7 +440,7 @@ static void coroutine_fn mirror_run(void *opaque)
assert(n > 0);
if (ret == 1) {
- bdrv_set_dirty_bitmap(bs, s->dirty_bitmap, sector_num, n);
+ bdrv_set_dirty_bitmap(s->dirty_bitmap, sector_num, n);
sector_num = next;
} else {
sector_num += n;
@@ -454,7 +448,7 @@ static void coroutine_fn mirror_run(void *opaque)
}
}
- bdrv_dirty_iter_init(bs, s->dirty_bitmap, &s->hbi);
+ bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi);
last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
for (;;) {
uint64_t delay_ns = 0;
@@ -466,7 +460,7 @@ static void coroutine_fn mirror_run(void *opaque)
goto immediate_exit;
}
- cnt = bdrv_get_dirty_count(bs, s->dirty_bitmap);
+ cnt = bdrv_get_dirty_count(s->dirty_bitmap);
/* s->common.offset contains the number of bytes already processed so
* far, cnt is the number of dirty sectors remaining and
* s->sectors_in_flight is the number of sectors currently being
@@ -475,7 +469,7 @@ static void coroutine_fn mirror_run(void *opaque)
(cnt + s->sectors_in_flight) * BDRV_SECTOR_SIZE;
/* Note that even when no rate limit is applied we need to yield
- * periodically with no pending I/O so that qemu_aio_flush() returns.
+ * periodically with no pending I/O so that bdrv_drain_all() returns.
* We do so every SLICE_TIME nanoseconds, or when there is an error,
* or when the source is clean, whichever comes first.
*/
@@ -488,9 +482,6 @@ static void coroutine_fn mirror_run(void *opaque)
continue;
} else if (cnt != 0) {
delay_ns = mirror_iteration(s);
- if (delay_ns == 0) {
- continue;
- }
}
}
@@ -516,7 +507,7 @@ static void coroutine_fn mirror_run(void *opaque)
should_complete = s->should_complete ||
block_job_is_cancelled(&s->common);
- cnt = bdrv_get_dirty_count(bs, s->dirty_bitmap);
+ cnt = bdrv_get_dirty_count(s->dirty_bitmap);
}
}
@@ -531,7 +522,7 @@ static void coroutine_fn mirror_run(void *opaque)
*/
trace_mirror_before_drain(s, cnt);
bdrv_drain(bs);
- cnt = bdrv_get_dirty_count(bs, s->dirty_bitmap);
+ cnt = bdrv_get_dirty_count(s->dirty_bitmap);
}
ret = 0;
@@ -634,7 +625,7 @@ static void mirror_complete(BlockJob *job, Error **errp)
}
s->should_complete = true;
- block_job_resume(job);
+ block_job_enter(&s->common);
}
static const BlockJobDriver mirror_job_driver = {
@@ -656,7 +647,7 @@ static const BlockJobDriver commit_active_job_driver = {
static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target,
const char *replaces,
- int64_t speed, int64_t granularity,
+ int64_t speed, uint32_t granularity,
int64_t buf_size,
BlockdevOnError on_source_error,
BlockdevOnError on_target_error,
@@ -668,15 +659,7 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target,
MirrorBlockJob *s;
if (granularity == 0) {
- /* Choose the default granularity based on the target file's cluster
- * size, clamped between 4k and 64k. */
- BlockDriverInfo bdi;
- if (bdrv_get_info(target, &bdi) >= 0 && bdi.cluster_size != 0) {
- granularity = MAX(4096, bdi.cluster_size);
- granularity = MIN(65536, granularity);
- } else {
- granularity = 65536;
- }
+ granularity = bdrv_get_default_bitmap_granularity(target);
}
assert ((granularity & (granularity - 1)) == 0);
@@ -703,7 +686,7 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target,
s->granularity = granularity;
s->buf_size = MAX(buf_size, granularity);
- s->dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity, errp);
+ s->dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp);
if (!s->dirty_bitmap) {
return;
}
@@ -717,7 +700,7 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target,
void mirror_start(BlockDriverState *bs, BlockDriverState *target,
const char *replaces,
- int64_t speed, int64_t granularity, int64_t buf_size,
+ int64_t speed, uint32_t granularity, int64_t buf_size,
MirrorSyncMode mode, BlockdevOnError on_source_error,
BlockdevOnError on_target_error,
BlockCompletionFunc *cb,
@@ -726,6 +709,10 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target,
bool is_none_mode;
BlockDriverState *base;
+ if (mode == MIRROR_SYNC_MODE_DIRTY_BITMAP) {
+ error_setg(errp, "Sync mode 'dirty-bitmap' not supported");
+ return;
+ }
is_none_mode = mode == MIRROR_SYNC_MODE_NONE;
base = mode == MIRROR_SYNC_MODE_TOP ? bs->backing_hd : NULL;
mirror_start_job(bs, target, replaces,
diff --git a/block/null.c b/block/null.c
index ec2bd27a4b..7d083233fb 100644
--- a/block/null.c
+++ b/block/null.c
@@ -12,8 +12,11 @@
#include "block/block_int.h"
+#define NULL_OPT_LATENCY "latency-ns"
+
typedef struct {
int64_t length;
+ int64_t latency_ns;
} BDRVNullState;
static QemuOptsList runtime_opts = {
@@ -30,6 +33,12 @@ static QemuOptsList runtime_opts = {
.type = QEMU_OPT_SIZE,
.help = "size of the null block",
},
+ {
+ .name = NULL_OPT_LATENCY,
+ .type = QEMU_OPT_NUMBER,
+ .help = "nanoseconds (approximated) to wait "
+ "before completing request",
+ },
{ /* end of list */ }
},
};
@@ -39,13 +48,20 @@ static int null_file_open(BlockDriverState *bs, QDict *options, int flags,
{
QemuOpts *opts;
BDRVNullState *s = bs->opaque;
+ int ret = 0;
opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
qemu_opts_absorb_qdict(opts, options, &error_abort);
s->length =
qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 1 << 30);
+ s->latency_ns =
+ qemu_opt_get_number(opts, NULL_OPT_LATENCY, 0);
+ if (s->latency_ns < 0) {
+ error_setg(errp, "latency-ns is invalid");
+ ret = -EINVAL;
+ }
qemu_opts_del(opts);
- return 0;
+ return ret;
}
static void null_close(BlockDriverState *bs)
@@ -58,28 +74,40 @@ static int64_t null_getlength(BlockDriverState *bs)
return s->length;
}
+static coroutine_fn int null_co_common(BlockDriverState *bs)
+{
+ BDRVNullState *s = bs->opaque;
+
+ if (s->latency_ns) {
+ co_aio_sleep_ns(bdrv_get_aio_context(bs), QEMU_CLOCK_REALTIME,
+ s->latency_ns);
+ }
+ return 0;
+}
+
static coroutine_fn int null_co_readv(BlockDriverState *bs,
int64_t sector_num, int nb_sectors,
QEMUIOVector *qiov)
{
- return 0;
+ return null_co_common(bs);
}
static coroutine_fn int null_co_writev(BlockDriverState *bs,
int64_t sector_num, int nb_sectors,
QEMUIOVector *qiov)
{
- return 0;
+ return null_co_common(bs);
}
static coroutine_fn int null_co_flush(BlockDriverState *bs)
{
- return 0;
+ return null_co_common(bs);
}
typedef struct {
BlockAIOCB common;
QEMUBH *bh;
+ QEMUTimer timer;
} NullAIOCB;
static const AIOCBInfo null_aiocb_info = {
@@ -94,15 +122,33 @@ static void null_bh_cb(void *opaque)
qemu_aio_unref(acb);
}
+static void null_timer_cb(void *opaque)
+{
+ NullAIOCB *acb = opaque;
+ acb->common.cb(acb->common.opaque, 0);
+ timer_deinit(&acb->timer);
+ qemu_aio_unref(acb);
+}
+
static inline BlockAIOCB *null_aio_common(BlockDriverState *bs,
BlockCompletionFunc *cb,
void *opaque)
{
NullAIOCB *acb;
+ BDRVNullState *s = bs->opaque;
acb = qemu_aio_get(&null_aiocb_info, bs, cb, opaque);
- acb->bh = aio_bh_new(bdrv_get_aio_context(bs), null_bh_cb, acb);
- qemu_bh_schedule(acb->bh);
+ /* Only emulate latency after vcpu is running. */
+ if (s->latency_ns) {
+ aio_timer_init(bdrv_get_aio_context(bs), &acb->timer,
+ QEMU_CLOCK_REALTIME, SCALE_NS,
+ null_timer_cb, acb);
+ timer_mod_ns(&acb->timer,
+ qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + s->latency_ns);
+ } else {
+ acb->bh = aio_bh_new(bdrv_get_aio_context(bs), null_bh_cb, acb);
+ qemu_bh_schedule(acb->bh);
+ }
return &acb->common;
}
@@ -131,6 +177,12 @@ static BlockAIOCB *null_aio_flush(BlockDriverState *bs,
return null_aio_common(bs, cb, opaque);
}
+static int null_reopen_prepare(BDRVReopenState *reopen_state,
+ BlockReopenQueue *queue, Error **errp)
+{
+ return 0;
+}
+
static BlockDriver bdrv_null_co = {
.format_name = "null-co",
.protocol_name = "null-co",
@@ -143,6 +195,7 @@ static BlockDriver bdrv_null_co = {
.bdrv_co_readv = null_co_readv,
.bdrv_co_writev = null_co_writev,
.bdrv_co_flush_to_disk = null_co_flush,
+ .bdrv_reopen_prepare = null_reopen_prepare,
};
static BlockDriver bdrv_null_aio = {
@@ -157,6 +210,7 @@ static BlockDriver bdrv_null_aio = {
.bdrv_aio_readv = null_aio_readv,
.bdrv_aio_writev = null_aio_writev,
.bdrv_aio_flush = null_aio_flush,
+ .bdrv_reopen_prepare = null_reopen_prepare,
};
static void bdrv_null_init(void)
diff --git a/block/qapi.c b/block/qapi.c
index 8a19aed446..063dd1bc1f 100644
--- a/block/qapi.c
+++ b/block/qapi.c
@@ -31,8 +31,10 @@
#include "qapi/qmp/types.h"
#include "sysemu/block-backend.h"
-BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs)
+BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs, Error **errp)
{
+ ImageInfo **p_image_info;
+ BlockDriverState *bs0;
BlockDeviceInfo *info = g_malloc0(sizeof(*info));
info->file = g_strdup(bs->filename);
@@ -92,6 +94,25 @@ BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs)
info->write_threshold = bdrv_write_threshold_get(bs);
+ bs0 = bs;
+ p_image_info = &info->image;
+ while (1) {
+ Error *local_err = NULL;
+ bdrv_query_image_info(bs0, p_image_info, &local_err);
+ if (local_err) {
+ error_propagate(errp, local_err);
+ qapi_free_BlockDeviceInfo(info);
+ return NULL;
+ }
+ if (bs0->drv && bs0->backing_hd) {
+ bs0 = bs0->backing_hd;
+ (*p_image_info)->has_backing_image = true;
+ p_image_info = &((*p_image_info)->backing_image);
+ } else {
+ break;
+ }
+ }
+
return info;
}
@@ -264,9 +285,6 @@ static void bdrv_query_info(BlockBackend *blk, BlockInfo **p_info,
{
BlockInfo *info = g_malloc0(sizeof(*info));
BlockDriverState *bs = blk_bs(blk);
- BlockDriverState *bs0;
- ImageInfo **p_image_info;
- Error *local_err = NULL;
info->device = g_strdup(blk_name(blk));
info->type = g_strdup("unknown");
info->locked = blk_dev_is_medium_locked(blk);
@@ -289,23 +307,9 @@ static void bdrv_query_info(BlockBackend *blk, BlockInfo **p_info,
if (bs->drv) {
info->has_inserted = true;
- info->inserted = bdrv_block_device_info(bs);
-
- bs0 = bs;
- p_image_info = &info->inserted->image;
- while (1) {
- bdrv_query_image_info(bs0, p_image_info, &local_err);
- if (local_err) {
- error_propagate(errp, local_err);
- goto err;
- }
- if (bs0->drv && bs0->backing_hd) {
- bs0 = bs0->backing_hd;
- (*p_image_info)->has_backing_image = true;
- p_image_info = &((*p_image_info)->backing_image);
- } else {
- break;
- }
+ info->inserted = bdrv_block_device_info(bs, errp);
+ if (info->inserted == NULL) {
+ goto err;
}
}
diff --git a/block/qcow.c b/block/qcow.c
index 055896910e..ab893284d2 100644
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -124,7 +124,7 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags,
snprintf(version, sizeof(version), "QCOW version %" PRIu32,
header.version);
error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
- bdrv_get_device_name(bs), "qcow", version);
+ bdrv_get_device_or_node_name(bs), "qcow", version);
ret = -ENOTSUP;
goto fail;
}
@@ -229,9 +229,9 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags,
}
/* Disable migration when qcow images are used */
- error_set(&s->migration_blocker,
- QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
- "qcow", bdrv_get_device_name(bs), "live migration");
+ error_setg(&s->migration_blocker, "The qcow format used by node '%s' "
+ "does not support live migration",
+ bdrv_get_device_or_node_name(bs));
migrate_add_blocker(s->migration_blocker);
qemu_co_mutex_init(&s->lock);
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 6cbae1d205..f47260b808 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -2450,7 +2450,7 @@ int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset,
if (ret < 0) {
return ret;
} else if (ret > 0) {
- int metadata_ol_bitnr = ffs(ret) - 1;
+ int metadata_ol_bitnr = ctz32(ret);
assert(metadata_ol_bitnr < QCOW2_OL_MAX_BITNR);
qcow2_signal_corruption(bs, true, offset, size, "Preventing invalid "
diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c
index 2aa9dcb1d1..17bb2119b2 100644
--- a/block/qcow2-snapshot.c
+++ b/block/qcow2-snapshot.c
@@ -351,10 +351,8 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
memset(sn, 0, sizeof(*sn));
- /* Generate an ID if it wasn't passed */
- if (sn_info->id_str[0] == '\0') {
- find_new_snapshot_id(bs, sn_info->id_str, sizeof(sn_info->id_str));
- }
+ /* Generate an ID */
+ find_new_snapshot_id(bs, sn_info->id_str, sizeof(sn_info->id_str));
/* Check that the ID is unique */
if (find_snapshot_by_id_and_name(bs, sn_info->id_str, NULL) >= 0) {
diff --git a/block/qcow2.c b/block/qcow2.c
index 316a8db22b..b9a72e39d4 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -208,7 +208,7 @@ static void GCC_FMT_ATTR(3, 4) report_unsupported(BlockDriverState *bs,
va_end(ap);
error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
- bdrv_get_device_name(bs), "qcow2", msg);
+ bdrv_get_device_or_node_name(bs), "qcow2", msg);
}
static void report_unsupported_feature(BlockDriverState *bs,
@@ -1802,7 +1802,7 @@ static int qcow2_create2(const char *filename, int64_t total_size,
{
/* Calculate cluster_bits */
int cluster_bits;
- cluster_bits = ffs(cluster_size) - 1;
+ cluster_bits = ctz32(cluster_size);
if (cluster_bits < MIN_CLUSTER_BITS || cluster_bits > MAX_CLUSTER_BITS ||
(1 << cluster_bits) != cluster_size)
{
@@ -2110,7 +2110,7 @@ static int qcow2_create(const char *filename, QemuOpts *opts, Error **errp)
goto finish;
}
- refcount_order = ffs(refcount_bits) - 1;
+ refcount_order = ctz32(refcount_bits);
ret = qcow2_create2(filename, size, backing_file, backing_fmt, flags,
cluster_size, prealloc, opts, version, refcount_order,
@@ -2824,6 +2824,7 @@ void qcow2_signal_corruption(BlockDriverState *bs, bool fatal, int64_t offset,
int64_t size, const char *message_format, ...)
{
BDRVQcowState *s = bs->opaque;
+ const char *node_name;
char *message;
va_list ap;
@@ -2847,8 +2848,11 @@ void qcow2_signal_corruption(BlockDriverState *bs, bool fatal, int64_t offset,
"corruption events will be suppressed\n", message);
}
- qapi_event_send_block_image_corrupted(bdrv_get_device_name(bs), message,
- offset >= 0, offset, size >= 0, size,
+ node_name = bdrv_get_node_name(bs);
+ qapi_event_send_block_image_corrupted(bdrv_get_device_name(bs),
+ *node_name != '\0', node_name,
+ message, offset >= 0, offset,
+ size >= 0, size,
fatal, &error_abort);
g_free(message);
diff --git a/block/qed.c b/block/qed.c
index 892b13c806..5bbe069ce9 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -408,7 +408,7 @@ static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags,
snprintf(buf, sizeof(buf), "%" PRIx64,
s->header.features & ~QED_FEATURE_MASK);
error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
- bdrv_get_device_name(bs), "QED", buf);
+ bdrv_get_device_or_node_name(bs), "QED", buf);
return -ENOTSUP;
}
if (!qed_is_cluster_size_valid(s->header.cluster_size)) {
@@ -436,9 +436,9 @@ static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags,
s->table_nelems = (s->header.cluster_size * s->header.table_size) /
sizeof(uint64_t);
- s->l2_shift = ffs(s->header.cluster_size) - 1;
+ s->l2_shift = ctz32(s->header.cluster_size);
s->l2_mask = s->table_nelems - 1;
- s->l1_shift = s->l2_shift + ffs(s->table_nelems) - 1;
+ s->l1_shift = s->l2_shift + ctz32(s->table_nelems);
/* Header size calculation must not overflow uint32_t */
if (s->header.header_size > UINT32_MAX / s->header.cluster_size) {
diff --git a/block/quorum.c b/block/quorum.c
index 437b12251d..f91ef75a84 100644
--- a/block/quorum.c
+++ b/block/quorum.c
@@ -226,10 +226,7 @@ static void quorum_report_bad(QuorumAIOCB *acb, char *node_name, int ret)
static void quorum_report_failure(QuorumAIOCB *acb)
{
- const char *reference = bdrv_get_device_name(acb->common.bs)[0] ?
- bdrv_get_device_name(acb->common.bs) :
- acb->common.bs->node_name;
-
+ const char *reference = bdrv_get_device_or_node_name(acb->common.bs);
qapi_event_send_quorum_failure(reference, acb->sector_num,
acb->nb_sectors, &error_abort);
}
diff --git a/block/rbd.c b/block/rbd.c
index f3ab2ddd5a..fbe87e035b 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -325,7 +325,7 @@ static int qemu_rbd_create(const char *filename, QemuOpts *opts, Error **errp)
error_setg(errp, "obj size too small");
return -EINVAL;
}
- obj_order = ffs(objsize) - 1;
+ obj_order = ctz32(objsize);
}
clientname = qemu_rbd_parse_clientname(conf, clientname_buf);
diff --git a/block/sheepdog.c b/block/sheepdog.c
index c14172cfa6..2d5f06a390 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -1716,7 +1716,7 @@ static int parse_block_size_shift(BDRVSheepdogState *s, QemuOpts *opt)
if ((object_size - 1) & object_size) { /* not a power of 2? */
return -EINVAL;
}
- obj_order = ffs(object_size) - 1;
+ obj_order = ctz32(object_size);
if (obj_order < 20 || obj_order > 31) {
return -EINVAL;
}
diff --git a/block/snapshot.c b/block/snapshot.c
index 698e1a1d58..50ae610139 100644
--- a/block/snapshot.c
+++ b/block/snapshot.c
@@ -246,9 +246,9 @@ int bdrv_snapshot_delete(BlockDriverState *bs,
if (bs->file) {
return bdrv_snapshot_delete(bs->file, snapshot_id, name, errp);
}
- error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
- drv->format_name, bdrv_get_device_name(bs),
- "internal snapshot deletion");
+ error_setg(errp, "Block format '%s' used by device '%s' "
+ "does not support internal snapshot deletion",
+ drv->format_name, bdrv_get_device_name(bs));
return -ENOTSUP;
}
@@ -329,9 +329,9 @@ int bdrv_snapshot_load_tmp(BlockDriverState *bs,
if (drv->bdrv_snapshot_load_tmp) {
return drv->bdrv_snapshot_load_tmp(bs, snapshot_id, name, errp);
}
- error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
- drv->format_name, bdrv_get_device_name(bs),
- "temporarily load internal snapshot");
+ error_setg(errp, "Block format '%s' used by device '%s' "
+ "does not support temporarily loading internal snapshots",
+ drv->format_name, bdrv_get_device_name(bs));
return -ENOTSUP;
}
diff --git a/block/vdi.c b/block/vdi.c
index 53bd02fe22..7642ef3597 100644
--- a/block/vdi.c
+++ b/block/vdi.c
@@ -502,9 +502,9 @@ static int vdi_open(BlockDriverState *bs, QDict *options, int flags,
}
/* Disable migration when vdi images are used */
- error_set(&s->migration_blocker,
- QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
- "vdi", bdrv_get_device_name(bs), "live migration");
+ error_setg(&s->migration_blocker, "The vdi format used by node '%s' "
+ "does not support live migration",
+ bdrv_get_device_or_node_name(bs));
migrate_add_blocker(s->migration_blocker);
qemu_co_mutex_init(&s->write_lock);
diff --git a/block/vhdx.c b/block/vhdx.c
index bb3ed45d5c..0776de7174 100644
--- a/block/vhdx.c
+++ b/block/vhdx.c
@@ -1002,9 +1002,9 @@ static int vhdx_open(BlockDriverState *bs, QDict *options, int flags,
/* TODO: differencing files */
/* Disable migration when VHDX images are used */
- error_set(&s->migration_blocker,
- QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
- "vhdx", bdrv_get_device_name(bs), "live migration");
+ error_setg(&s->migration_blocker, "The vhdx format used by node '%s' "
+ "does not support live migration",
+ bdrv_get_device_or_node_name(bs));
migrate_add_blocker(s->migration_blocker);
return 0;
@@ -1269,7 +1269,7 @@ static coroutine_fn int vhdx_co_writev(BlockDriverState *bs, int64_t sector_num,
iov1.iov_base = qemu_blockalign(bs, iov1.iov_len);
memset(iov1.iov_base, 0, iov1.iov_len);
qemu_iovec_concat_iov(&hd_qiov, &iov1, 1, 0,
- sinfo.block_offset);
+ iov1.iov_len);
sectors_to_write += iov1.iov_len >> BDRV_SECTOR_BITS;
}
@@ -1285,7 +1285,7 @@ static coroutine_fn int vhdx_co_writev(BlockDriverState *bs, int64_t sector_num,
iov2.iov_base = qemu_blockalign(bs, iov2.iov_len);
memset(iov2.iov_base, 0, iov2.iov_len);
qemu_iovec_concat_iov(&hd_qiov, &iov2, 1, 0,
- sinfo.block_offset);
+ iov2.iov_len);
sectors_to_write += iov2.iov_len >> BDRV_SECTOR_BITS;
}
}
diff --git a/block/vmdk.c b/block/vmdk.c
index 8410a158a2..1c5e2ef1b3 100644
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -523,7 +523,7 @@ static int vmdk_open_vmfs_sparse(BlockDriverState *bs,
}
ret = vmdk_add_extent(bs, file, false,
le32_to_cpu(header.disk_sectors),
- le32_to_cpu(header.l1dir_offset) << 9,
+ (int64_t)le32_to_cpu(header.l1dir_offset) << 9,
0,
le32_to_cpu(header.l1dir_size),
4096,
@@ -669,7 +669,7 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
snprintf(buf, sizeof(buf), "VMDK version %" PRId32,
le32_to_cpu(header.version));
error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
- bdrv_get_device_name(bs), "vmdk", buf);
+ bdrv_get_device_or_node_name(bs), "vmdk", buf);
return -ENOTSUP;
} else if (le32_to_cpu(header.version) == 3 && (flags & BDRV_O_RDWR)) {
/* VMware KB 2064959 explains that version 3 added support for
@@ -962,9 +962,9 @@ static int vmdk_open(BlockDriverState *bs, QDict *options, int flags,
qemu_co_mutex_init(&s->lock);
/* Disable migration when VMDK images are used */
- error_set(&s->migration_blocker,
- QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
- "vmdk", bdrv_get_device_name(bs), "live migration");
+ error_setg(&s->migration_blocker, "The vmdk format used by node '%s' "
+ "does not support live migration",
+ bdrv_get_device_or_node_name(bs));
migrate_add_blocker(s->migration_blocker);
g_free(buf);
return 0;
diff --git a/block/vpc.c b/block/vpc.c
index 43e768ee76..37572bab86 100644
--- a/block/vpc.c
+++ b/block/vpc.c
@@ -318,9 +318,9 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
qemu_co_mutex_init(&s->lock);
/* Disable migration when VHD images are used */
- error_set(&s->migration_blocker,
- QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
- "vpc", bdrv_get_device_name(bs), "live migration");
+ error_setg(&s->migration_blocker, "The vpc format used by node '%s' "
+ "does not support live migration",
+ bdrv_get_device_or_node_name(bs));
migrate_add_blocker(s->migration_blocker);
return 0;
diff --git a/block/vvfat.c b/block/vvfat.c
index 9be632f404..e803589675 100644
--- a/block/vvfat.c
+++ b/block/vvfat.c
@@ -1180,9 +1180,10 @@ static int vvfat_open(BlockDriverState *bs, QDict *options, int flags,
/* Disable migration when vvfat is used rw */
if (s->qcow) {
- error_set(&s->migration_blocker,
- QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
- "vvfat (rw)", bdrv_get_device_name(bs), "live migration");
+ error_setg(&s->migration_blocker,
+ "The vvfat (rw) format used by node '%s' "
+ "does not support live migration",
+ bdrv_get_device_or_node_name(bs));
migrate_add_blocker(s->migration_blocker);
}
diff --git a/blockdev.c b/blockdev.c
index fbb3a79978..5eaf77e599 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -1164,6 +1164,68 @@ out_aio_context:
return NULL;
}
+/**
+ * block_dirty_bitmap_lookup:
+ * Return a dirty bitmap (if present), after validating
+ * the node reference and bitmap names.
+ *
+ * @node: The name of the BDS node to search for bitmaps
+ * @name: The name of the bitmap to search for
+ * @pbs: Output pointer for BDS lookup, if desired. Can be NULL.
+ * @paio: Output pointer for aio_context acquisition, if desired. Can be NULL.
+ * @errp: Output pointer for error information. Can be NULL.
+ *
+ * @return: A bitmap object on success, or NULL on failure.
+ */
+static BdrvDirtyBitmap *block_dirty_bitmap_lookup(const char *node,
+ const char *name,
+ BlockDriverState **pbs,
+ AioContext **paio,
+ Error **errp)
+{
+ BlockDriverState *bs;
+ BdrvDirtyBitmap *bitmap;
+ AioContext *aio_context;
+
+ if (!node) {
+ error_setg(errp, "Node cannot be NULL");
+ return NULL;
+ }
+ if (!name) {
+ error_setg(errp, "Bitmap name cannot be NULL");
+ return NULL;
+ }
+ bs = bdrv_lookup_bs(node, node, NULL);
+ if (!bs) {
+ error_setg(errp, "Node '%s' not found", node);
+ return NULL;
+ }
+
+ aio_context = bdrv_get_aio_context(bs);
+ aio_context_acquire(aio_context);
+
+ bitmap = bdrv_find_dirty_bitmap(bs, name);
+ if (!bitmap) {
+ error_setg(errp, "Dirty bitmap '%s' not found", name);
+ goto fail;
+ }
+
+ if (pbs) {
+ *pbs = bs;
+ }
+ if (paio) {
+ *paio = aio_context;
+ } else {
+ aio_context_release(aio_context);
+ }
+
+ return bitmap;
+
+ fail:
+ aio_context_release(aio_context);
+ return NULL;
+}
+
/* New and old BlockDriverState structs for atomic group operations */
typedef struct BlkTransactionState BlkTransactionState;
@@ -1248,13 +1310,14 @@ static void internal_snapshot_prepare(BlkTransactionState *common,
}
if (bdrv_is_read_only(bs)) {
- error_set(errp, QERR_DEVICE_IS_READ_ONLY, device);
+ error_setg(errp, "Device '%s' is read only", device);
return;
}
if (!bdrv_can_snapshot(bs)) {
- error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
- bs->drv->format_name, device, "internal snapshot");
+ error_setg(errp, "Block format '%s' used by device '%s' "
+ "does not support internal snapshots",
+ bs->drv->format_name, device);
return;
}
@@ -1522,6 +1585,7 @@ static void drive_backup_prepare(BlkTransactionState *common, Error **errp)
backup->sync,
backup->has_mode, backup->mode,
backup->has_speed, backup->speed,
+ backup->has_bitmap, backup->bitmap,
backup->has_on_source_error, backup->on_source_error,
backup->has_on_target_error, backup->on_target_error,
&local_err);
@@ -1953,6 +2017,102 @@ void qmp_block_set_io_throttle(const char *device, int64_t bps, int64_t bps_rd,
aio_context_release(aio_context);
}
+void qmp_block_dirty_bitmap_add(const char *node, const char *name,
+ bool has_granularity, uint32_t granularity,
+ Error **errp)
+{
+ AioContext *aio_context;
+ BlockDriverState *bs;
+
+ if (!name || name[0] == '\0') {
+ error_setg(errp, "Bitmap name cannot be empty");
+ return;
+ }
+
+ bs = bdrv_lookup_bs(node, node, errp);
+ if (!bs) {
+ return;
+ }
+
+ aio_context = bdrv_get_aio_context(bs);
+ aio_context_acquire(aio_context);
+
+ if (has_granularity) {
+ if (granularity < 512 || !is_power_of_2(granularity)) {
+ error_setg(errp, "Granularity must be power of 2 "
+ "and at least 512");
+ goto out;
+ }
+ } else {
+ /* Default to cluster size, if available: */
+ granularity = bdrv_get_default_bitmap_granularity(bs);
+ }
+
+ bdrv_create_dirty_bitmap(bs, granularity, name, errp);
+
+ out:
+ aio_context_release(aio_context);
+}
+
+void qmp_block_dirty_bitmap_remove(const char *node, const char *name,
+ Error **errp)
+{
+ AioContext *aio_context;
+ BlockDriverState *bs;
+ BdrvDirtyBitmap *bitmap;
+
+ bitmap = block_dirty_bitmap_lookup(node, name, &bs, &aio_context, errp);
+ if (!bitmap || !bs) {
+ return;
+ }
+
+ if (bdrv_dirty_bitmap_frozen(bitmap)) {
+ error_setg(errp,
+ "Bitmap '%s' is currently frozen and cannot be removed",
+ name);
+ goto out;
+ }
+ bdrv_dirty_bitmap_make_anon(bitmap);
+ bdrv_release_dirty_bitmap(bs, bitmap);
+
+ out:
+ aio_context_release(aio_context);
+}
+
+/**
+ * Completely clear a bitmap, for the purposes of synchronizing a bitmap
+ * immediately after a full backup operation.
+ */
+void qmp_block_dirty_bitmap_clear(const char *node, const char *name,
+ Error **errp)
+{
+ AioContext *aio_context;
+ BdrvDirtyBitmap *bitmap;
+ BlockDriverState *bs;
+
+ bitmap = block_dirty_bitmap_lookup(node, name, &bs, &aio_context, errp);
+ if (!bitmap || !bs) {
+ return;
+ }
+
+ if (bdrv_dirty_bitmap_frozen(bitmap)) {
+ error_setg(errp,
+ "Bitmap '%s' is currently frozen and cannot be modified",
+ name);
+ goto out;
+ } else if (!bdrv_dirty_bitmap_enabled(bitmap)) {
+ error_setg(errp,
+ "Bitmap '%s' is currently disabled and cannot be cleared",
+ name);
+ goto out;
+ }
+
+ bdrv_clear_dirty_bitmap(bitmap);
+
+ out:
+ aio_context_release(aio_context);
+}
+
int hmp_drive_del(Monitor *mon, const QDict *qdict, QObject **ret_data)
{
const char *id = qdict_get_str(qdict, "id");
@@ -2055,7 +2215,7 @@ void qmp_block_resize(bool has_device, const char *device,
error_set(errp, QERR_UNSUPPORTED);
break;
case -EACCES:
- error_set(errp, QERR_DEVICE_IS_READ_ONLY, device);
+ error_setg(errp, "Device '%s' is read only", device);
break;
case -EBUSY:
error_set(errp, QERR_DEVICE_IN_USE, device);
@@ -2270,6 +2430,7 @@ void qmp_drive_backup(const char *device, const char *target,
enum MirrorSyncMode sync,
bool has_mode, enum NewImageMode mode,
bool has_speed, int64_t speed,
+ bool has_bitmap, const char *bitmap,
bool has_on_source_error, BlockdevOnError on_source_error,
bool has_on_target_error, BlockdevOnError on_target_error,
Error **errp)
@@ -2278,6 +2439,7 @@ void qmp_drive_backup(const char *device, const char *target,
BlockDriverState *bs;
BlockDriverState *target_bs;
BlockDriverState *source = NULL;
+ BdrvDirtyBitmap *bmap = NULL;
AioContext *aio_context;
BlockDriver *drv = NULL;
Error *local_err = NULL;
@@ -2377,7 +2539,16 @@ void qmp_drive_backup(const char *device, const char *target,
bdrv_set_aio_context(target_bs, aio_context);
- backup_start(bs, target_bs, speed, sync, on_source_error, on_target_error,
+ if (has_bitmap) {
+ bmap = bdrv_find_dirty_bitmap(bs, bitmap);
+ if (!bmap) {
+ error_setg(errp, "Bitmap '%s' could not be found", bitmap);
+ goto out;
+ }
+ }
+
+ backup_start(bs, target_bs, speed, sync, bmap,
+ on_source_error, on_target_error,
block_job_cb, bs, &local_err);
if (local_err != NULL) {
bdrv_unref(target_bs);
@@ -2391,7 +2562,7 @@ out:
BlockDeviceInfoList *qmp_query_named_block_nodes(Error **errp)
{
- return bdrv_named_nodes_list();
+ return bdrv_named_nodes_list(errp);
}
void qmp_blockdev_backup(const char *device, const char *target,
@@ -2438,8 +2609,8 @@ void qmp_blockdev_backup(const char *device, const char *target,
bdrv_ref(target_bs);
bdrv_set_aio_context(target_bs, aio_context);
- backup_start(bs, target_bs, speed, sync, on_source_error, on_target_error,
- block_job_cb, bs, &local_err);
+ backup_start(bs, target_bs, speed, sync, NULL, on_source_error,
+ on_target_error, block_job_cb, bs, &local_err);
if (local_err != NULL) {
bdrv_unref(target_bs);
error_propagate(errp, local_err);
@@ -2699,7 +2870,7 @@ void qmp_block_job_cancel(const char *device,
force = false;
}
- if (job->paused && !force) {
+ if (job->user_paused && !force) {
error_setg(errp, "The block job for device '%s' is currently paused",
device);
goto out;
@@ -2716,10 +2887,11 @@ void qmp_block_job_pause(const char *device, Error **errp)
AioContext *aio_context;
BlockJob *job = find_block_job(device, &aio_context, errp);
- if (!job) {
+ if (!job || job->user_paused) {
return;
}
+ job->user_paused = true;
trace_qmp_block_job_pause(job);
block_job_pause(job);
aio_context_release(aio_context);
@@ -2730,10 +2902,11 @@ void qmp_block_job_resume(const char *device, Error **errp)
AioContext *aio_context;
BlockJob *job = find_block_job(device, &aio_context, errp);
- if (!job) {
+ if (!job || !job->user_paused) {
return;
}
+ job->user_paused = false;
trace_qmp_block_job_resume(job);
block_job_resume(job);
aio_context_release(aio_context);
diff --git a/blockjob.c b/blockjob.c
index ba2255d91f..2755465259 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -107,7 +107,7 @@ void block_job_set_speed(BlockJob *job, int64_t speed, Error **errp)
void block_job_complete(BlockJob *job, Error **errp)
{
- if (job->paused || job->cancelled || !job->driver->complete) {
+ if (job->pause_count || job->cancelled || !job->driver->complete) {
error_set(errp, QERR_BLOCK_JOB_NOT_READY,
bdrv_get_device_name(job->bs));
return;
@@ -118,17 +118,26 @@ void block_job_complete(BlockJob *job, Error **errp)
void block_job_pause(BlockJob *job)
{
- job->paused = true;
+ job->pause_count++;
}
bool block_job_is_paused(BlockJob *job)
{
- return job->paused;
+ return job->pause_count > 0;
}
void block_job_resume(BlockJob *job)
{
- job->paused = false;
+ assert(job->pause_count > 0);
+ job->pause_count--;
+ if (job->pause_count) {
+ return;
+ }
+ block_job_enter(job);
+}
+
+void block_job_enter(BlockJob *job)
+{
block_job_iostatus_reset(job);
if (job->co && !job->busy) {
qemu_coroutine_enter(job->co, NULL);
@@ -138,7 +147,7 @@ void block_job_resume(BlockJob *job)
void block_job_cancel(BlockJob *job)
{
job->cancelled = true;
- block_job_resume(job);
+ block_job_enter(job);
}
bool block_job_is_cancelled(BlockJob *job)
@@ -258,7 +267,7 @@ BlockJobInfo *block_job_query(BlockJob *job)
info->device = g_strdup(bdrv_get_device_name(job->bs));
info->len = job->len;
info->busy = job->busy;
- info->paused = job->paused;
+ info->paused = job->pause_count > 0;
info->offset = job->offset;
info->speed = job->speed;
info->io_status = job->iostatus;
@@ -335,6 +344,8 @@ BlockErrorAction block_job_error_action(BlockJob *job, BlockDriverState *bs,
IO_OPERATION_TYPE_WRITE,
action, &error_abort);
if (action == BLOCK_ERROR_ACTION_STOP) {
+ /* make the pause user visible, which will be resumed from QMP. */
+ job->user_paused = true;
block_job_pause(job);
block_job_iostatus_set_err(job, error);
if (bs != job->bs) {
diff --git a/docs/bitmaps.md b/docs/bitmaps.md
new file mode 100644
index 0000000000..f066b48aa5
--- /dev/null
+++ b/docs/bitmaps.md
@@ -0,0 +1,352 @@
+<!--
+Copyright 2015 John Snow <jsnow@redhat.com> and Red Hat, Inc.
+All rights reserved.
+
+This file is licensed via The FreeBSD Documentation License, the full text of
+which is included at the end of this document.
+-->
+
+# Dirty Bitmaps and Incremental Backup
+
+* Dirty Bitmaps are objects that track which data needs to be backed up for the
+ next incremental backup.
+
+* Dirty bitmaps can be created at any time and attached to any node
+ (not just complete drives.)
+
+## Dirty Bitmap Names
+
+* A dirty bitmap's name is unique to the node, but bitmaps attached to different
+ nodes can share the same name.
+
+## Bitmap Modes
+
+* A Bitmap can be "frozen," which means that it is currently in use by a backup
+ operation and cannot be deleted, renamed, written to, reset,
+ etc.
+
+## Basic QMP Usage
+
+### Supported Commands ###
+
+* block-dirty-bitmap-add
+* block-dirty-bitmap-remove
+* block-dirty-bitmap-clear
+
+### Creation
+
+* To create a new bitmap, enabled, on the drive with id=drive0:
+
+```json
+{ "execute": "block-dirty-bitmap-add",
+ "arguments": {
+ "node": "drive0",
+ "name": "bitmap0"
+ }
+}
+```
+
+* This bitmap will have a default granularity that matches the cluster size of
+ its associated drive, if available, clamped to the range [4KiB, 64KiB].
+ The current default for qcow2 is 64KiB.
+
+* To create a new bitmap that tracks changes in 32KiB segments:
+
+```json
+{ "execute": "block-dirty-bitmap-add",
+ "arguments": {
+ "node": "drive0",
+ "name": "bitmap0",
+ "granularity": 32768
+ }
+}
+```
+
+### Deletion
+
+* Bitmaps that are frozen cannot be deleted.
+
+* Deleting the bitmap does not impact any other bitmaps attached to the same
+ node, nor does it affect any backups already created from this node.
+
+* Because bitmaps are only unique to the node to which they are attached,
+ you must specify the node/drive name here, too.
+
+```json
+{ "execute": "block-dirty-bitmap-remove",
+ "arguments": {
+ "node": "drive0",
+ "name": "bitmap0"
+ }
+}
+```
+
+### Resetting
+
+* Resetting a bitmap will clear all information it holds.
+
+* An incremental backup created from an empty bitmap will copy no data,
+ as if nothing has changed.
+
+```json
+{ "execute": "block-dirty-bitmap-clear",
+ "arguments": {
+ "node": "drive0",
+ "name": "bitmap0"
+ }
+}
+```
+
+## Transactions (Not yet implemented)
+
+* Transactional commands are forthcoming in a future version,
+ and are not yet available for use. This section serves as
+ documentation of intent for their design and usage.
+
+### Justification
+
+Bitmaps can be safely modified when the VM is paused or halted by using
+the basic QMP commands. For instance, you might perform the following actions:
+
+1. Boot the VM in a paused state.
+2. Create a full drive backup of drive0.
+3. Create a new bitmap attached to drive0.
+4. Resume execution of the VM.
+5. Incremental backups are ready to be created.
+
+At this point, the bitmap and drive backup would be correctly in sync,
+and incremental backups made from this point forward would be correctly aligned
+to the full drive backup.
+
+This approach does not help, however, if we decide to start incremental
+backups after the VM has been running for a while. In that case we need to
+perform actions such as the following:
+
+1. Boot the VM and begin execution.
+2. Using a single transaction, perform the following operations:
+ * Create bitmap0.
+ * Create a full drive backup of drive0.
+3. Incremental backups are now ready to be created.
+
+### Supported Bitmap Transactions
+
+* block-dirty-bitmap-add
+* block-dirty-bitmap-clear
+
+The usages are identical to their respective QMP commands, but see below
+for examples.
+
+### Example: New Incremental Backup
+
+As outlined in the justification, perhaps we want to create a new incremental
+backup chain attached to a drive.
+
+```json
+{ "execute": "transaction",
+ "arguments": {
+ "actions": [
+ {"type": "block-dirty-bitmap-add",
+ "data": {"node": "drive0", "name": "bitmap0"} },
+ {"type": "drive-backup",
+ "data": {"device": "drive0", "target": "/path/to/full_backup.img",
+ "sync": "full", "format": "qcow2"} }
+ ]
+ }
+}
+```
+
+### Example: New Incremental Backup Anchor Point
+
+Maybe we just want to create a new full backup with an existing bitmap and
+want to reset the bitmap to track the new chain.
+
+```json
+{ "execute": "transaction",
+ "arguments": {
+ "actions": [
+ {"type": "block-dirty-bitmap-clear",
+ "data": {"node": "drive0", "name": "bitmap0"} },
+ {"type": "drive-backup",
+ "data": {"device": "drive0", "target": "/path/to/new_full_backup.img",
+ "sync": "full", "format": "qcow2"} }
+ ]
+ }
+}
+```
+
+## Incremental Backups
+
+The star of the show.
+
+**Nota Bene!** Only incremental backups of entire drives are supported for now.
+So despite the fact that you can attach a bitmap to any arbitrary node, bitmaps
+are currently only useful when attached to the root node. This is because
+drive-backup only supports drives/devices instead of arbitrary nodes.
+
+### Example: First Incremental Backup
+
+1. Create a full backup and sync it to the dirty bitmap, as in the transactional
+examples above; or with the VM offline, manually create a full copy and then
+create a new bitmap before the VM begins execution.
+
+ * Let's assume the full backup is named 'full_backup.img'.
+ * Let's assume the bitmap you created is 'bitmap0' attached to 'drive0'.
+
+2. Create a destination image for the incremental backup that utilizes the
+full backup as a backing image.
+
+ * Let's assume it is named 'incremental.0.img'.
+
+ ```sh
+ # qemu-img create -f qcow2 incremental.0.img -b full_backup.img -F qcow2
+ ```
+
+3. Issue the incremental backup command:
+
+ ```json
+ { "execute": "drive-backup",
+ "arguments": {
+ "device": "drive0",
+ "bitmap": "bitmap0",
+ "target": "incremental.0.img",
+ "format": "qcow2",
+ "sync": "dirty-bitmap",
+ "mode": "existing"
+ }
+ }
+ ```
+
+### Example: Second Incremental Backup
+
+1. Create a new destination image for the incremental backup that points to the
+ previous one, e.g.: 'incremental.1.img'
+
+ ```sh
+ # qemu-img create -f qcow2 incremental.1.img -b incremental.0.img -F qcow2
+ ```
+
+2. Issue a new incremental backup command. The only difference here is that we
+ have changed the target image below.
+
+ ```json
+ { "execute": "drive-backup",
+ "arguments": {
+ "device": "drive0",
+ "bitmap": "bitmap0",
+ "target": "incremental.1.img",
+ "format": "qcow2",
+ "sync": "dirty-bitmap",
+ "mode": "existing"
+ }
+ }
+ ```
+
+## Errors
+
+* In the event of an error that occurs after a backup job is successfully
+ launched, either by a direct QMP command or a QMP transaction, the user
+ will receive a BLOCK_JOB_COMPLETED event with a failure message, accompanied
+ by a BLOCK_JOB_ERROR event.
+
+* In the case of a job being cancelled, the user will receive a
+ BLOCK_JOB_CANCELLED event instead of a pair of COMPLETED and ERROR events.
+
+* In either case, the incremental backup data contained within the bitmap is
+ safely rolled back, and the data within the bitmap is not lost. The image
+ file created for the failed attempt can be safely deleted.
+
+* Once the underlying problem is fixed (e.g. more storage space is freed up),
+ you can simply retry the incremental backup command with the same bitmap.
+
+### Example
+
+1. Create a target image:
+
+ ```sh
+ # qemu-img create -f qcow2 incremental.0.img -b full_backup.img -F qcow2
+ ```
+
+2. Attempt to create an incremental backup via QMP:
+
+ ```json
+ { "execute": "drive-backup",
+ "arguments": {
+ "device": "drive0",
+ "bitmap": "bitmap0",
+ "target": "incremental.0.img",
+ "format": "qcow2",
+ "sync": "dirty-bitmap",
+ "mode": "existing"
+ }
+ }
+ ```
+
+3. Receive an event notifying us of failure:
+
+ ```json
+ { "timestamp": { "seconds": 1424709442, "microseconds": 844524 },
+ "data": { "speed": 0, "offset": 0, "len": 67108864,
+ "error": "No space left on device",
+ "device": "drive0", "type": "backup" },
+ "event": "BLOCK_JOB_COMPLETED" }
+ ```
+
+4. Delete the failed incremental, and re-create the image.
+
+ ```sh
+ # rm incremental.0.img
+ # qemu-img create -f qcow2 incremental.0.img -b full_backup.img -F qcow2
+ ```
+
+5. Retry the command after fixing the underlying problem,
+ such as freeing up space on the backup volume:
+
+ ```json
+ { "execute": "drive-backup",
+ "arguments": {
+ "device": "drive0",
+ "bitmap": "bitmap0",
+ "target": "incremental.0.img",
+ "format": "qcow2",
+ "sync": "dirty-bitmap",
+ "mode": "existing"
+ }
+ }
+ ```
+
+6. Receive confirmation that the job completed successfully:
+
+ ```json
+ { "timestamp": { "seconds": 1424709668, "microseconds": 526525 },
+ "data": { "device": "drive0", "type": "backup",
+ "speed": 0, "len": 67108864, "offset": 67108864},
+ "event": "BLOCK_JOB_COMPLETED" }
+ ```
+
+<!--
+The FreeBSD Documentation License
+
+Redistribution and use in source (Markdown) and 'compiled' forms (SGML, HTML,
+PDF, PostScript, RTF and so forth) with or without modification, are permitted
+provided that the following conditions are met:
+
+Redistributions of source code (Markdown) must retain the above copyright
+notice, this list of conditions and the following disclaimer of this file
+unmodified.
+
+Redistributions in compiled form (transformed to other DTDs, converted to PDF,
+PostScript, RTF and other formats) must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation and/or
+other materials provided with the distribution.
+
+THIS DOCUMENTATION IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+THIS DOCUMENTATION, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+-->
diff --git a/docs/qmp/qmp-events.txt b/docs/qmp/qmp-events.txt
index d759d19748..b19e490eb5 100644
--- a/docs/qmp/qmp-events.txt
+++ b/docs/qmp/qmp-events.txt
@@ -31,21 +31,26 @@ Example:
BLOCK_IMAGE_CORRUPTED
---------------------
-Emitted when a disk image is being marked corrupt.
+Emitted when a disk image is being marked corrupt. The image can be
+identified by its device or node name. The 'device' field is always
+present for compatibility reasons, but it can be empty ("") if the
+image does not have an associated device name.
Data:
-- "device": Device name (json-string)
-- "msg": Informative message (e.g., reason for the corruption) (json-string)
-- "offset": If the corruption resulted from an image access, this is the access
- offset into the image (json-int)
-- "size": If the corruption resulted from an image access, this is the access
- size (json-int)
+- "device": Device name (json-string)
+- "node-name": Node name (json-string, optional)
+- "msg": Informative message (e.g., reason for the corruption)
+ (json-string)
+- "offset": If the corruption resulted from an image access, this
+ is the access offset into the image (json-int)
+- "size": If the corruption resulted from an image access, this
+ is the access size (json-int)
Example:
{ "event": "BLOCK_IMAGE_CORRUPTED",
- "data": { "device": "ide0-hd0",
+ "data": { "device": "ide0-hd0", "node-name": "node0",
"msg": "Prevented active L1 table overwrite", "offset": 196608,
"size": 65536 },
"timestamp": { "seconds": 1378126126, "microseconds": 966463 } }
diff --git a/hmp.c b/hmp.c
index f142d366ef..d85d913a79 100644
--- a/hmp.c
+++ b/hmp.c
@@ -391,8 +391,7 @@ static void print_block_info(Monitor *mon, BlockInfo *info,
inserted->iops_size);
}
- /* TODO: inserted->image should never be null */
- if (verbose && inserted->image) {
+ if (verbose) {
monitor_printf(mon, "\nImages:\n");
image_info = inserted->image;
while (1) {
@@ -1062,7 +1061,8 @@ void hmp_drive_backup(Monitor *mon, const QDict *qdict)
qmp_drive_backup(device, filename, !!format, format,
full ? MIRROR_SYNC_MODE_FULL : MIRROR_SYNC_MODE_TOP,
- true, mode, false, 0, false, 0, false, 0, &err);
+ true, mode, false, 0, false, NULL,
+ false, 0, false, 0, &err);
hmp_handle_error(mon, &err);
}
diff --git a/hw/acpi/pcihp.c b/hw/acpi/pcihp.c
index 612fec03ee..77e1126f8f 100644
--- a/hw/acpi/pcihp.c
+++ b/hw/acpi/pcihp.c
@@ -120,7 +120,7 @@ static bool acpi_pcihp_pc_no_hotplug(AcpiPciHpState *s, PCIDevice *dev)
static void acpi_pcihp_eject_slot(AcpiPciHpState *s, unsigned bsel, unsigned slots)
{
BusChild *kid, *next;
- int slot = ffs(slots) - 1;
+ int slot = ctz32(slots);
PCIBus *bus = acpi_pcihp_find_hotplug_bus(s, bsel);
if (!bus) {
diff --git a/hw/arm/nseries.c b/hw/arm/nseries.c
index 2a5406d98d..d243159664 100644
--- a/hw/arm/nseries.c
+++ b/hw/arm/nseries.c
@@ -579,7 +579,10 @@ static uint32_t mipid_txrx(void *opaque, uint32_t cmd, int len)
case 0x26: /* GAMSET */
if (!s->pm) {
- s->gamma = ffs(s->param[0] & 0xf) - 1;
+ s->gamma = ctz32(s->param[0] & 0xf);
+ if (s->gamma == 32) {
+ s->gamma = -1; /* XXX: should this be 0? */
+ }
} else if (s->pm < 0) {
s->pm = 1;
}
diff --git a/hw/arm/omap1.c b/hw/arm/omap1.c
index 91ffb589e5..de2b289257 100644
--- a/hw/arm/omap1.c
+++ b/hw/arm/omap1.c
@@ -2004,8 +2004,7 @@ static void omap_mpuio_write(void *opaque, hwaddr addr,
case 0x04: /* OUTPUT_REG */
diff = (s->outputs ^ value) & ~s->dir;
s->outputs = value;
- while ((ln = ffs(diff))) {
- ln --;
+ while ((ln = ctz32(diff)) != 32) {
if (s->handler[ln])
qemu_set_irq(s->handler[ln], (value >> ln) & 1);
diff &= ~(1 << ln);
@@ -2017,8 +2016,7 @@ static void omap_mpuio_write(void *opaque, hwaddr addr,
s->dir = value;
value = s->outputs & ~s->dir;
- while ((ln = ffs(diff))) {
- ln --;
+ while ((ln = ctz32(diff)) != 32) {
if (s->handler[ln])
qemu_set_irq(s->handler[ln], (value >> ln) & 1);
diff &= ~(1 << ln);
diff --git a/hw/arm/pxa2xx_gpio.c b/hw/arm/pxa2xx_gpio.c
index 354ccf1ea1..c89c8045c3 100644
--- a/hw/arm/pxa2xx_gpio.c
+++ b/hw/arm/pxa2xx_gpio.c
@@ -137,7 +137,7 @@ static void pxa2xx_gpio_handler_update(PXA2xxGPIOInfo *s) {
level = s->olevel[i] & s->dir[i];
for (diff = s->prev_level[i] ^ level; diff; diff ^= 1 << bit) {
- bit = ffs(diff) - 1;
+ bit = ctz32(diff);
line = bit + 32 * i;
qemu_set_irq(s->handler[line], (level >> bit) & 1);
}
diff --git a/hw/arm/strongarm.c b/hw/arm/strongarm.c
index 1ddea6d89c..da9fc1d51b 100644
--- a/hw/arm/strongarm.c
+++ b/hw/arm/strongarm.c
@@ -528,7 +528,7 @@ static void strongarm_gpio_handler_update(StrongARMGPIOInfo *s)
level = s->olevel & s->dir;
for (diff = s->prev_level ^ level; diff; diff ^= 1 << bit) {
- bit = ffs(diff) - 1;
+ bit = ctz32(diff);
qemu_set_irq(s->handler[bit], (level >> bit) & 1);
}
@@ -745,7 +745,7 @@ static void strongarm_ppc_handler_update(StrongARMPPCInfo *s)
level = s->olevel & s->dir;
for (diff = s->prev_level ^ level; diff; diff ^= 1 << bit) {
- bit = ffs(diff) - 1;
+ bit = ctz32(diff);
qemu_set_irq(s->handler[bit], (level >> bit) & 1);
}
diff --git a/hw/block/m25p80.c b/hw/block/m25p80.c
index afe243b811..efc43dde6a 100644
--- a/hw/block/m25p80.c
+++ b/hw/block/m25p80.c
@@ -621,7 +621,6 @@ static int m25p80_init(SSISlave *ss)
s->size = s->pi->sector_size * s->pi->n_sectors;
s->dirty_page = -1;
- s->storage = blk_blockalign(s->blk, s->size);
/* FIXME use a qdev drive property instead of drive_get_next() */
dinfo = drive_get_next(IF_MTD);
@@ -629,6 +628,9 @@ static int m25p80_init(SSISlave *ss)
if (dinfo) {
DB_PRINT_L(0, "Binding to IF_MTD drive\n");
s->blk = blk_by_legacy_dinfo(dinfo);
+ blk_attach_dev_nofail(s->blk, s);
+
+ s->storage = blk_blockalign(s->blk, s->size);
/* FIXME: Move to late init */
if (blk_read(s->blk, 0, s->storage,
@@ -638,6 +640,7 @@ static int m25p80_init(SSISlave *ss)
}
} else {
DB_PRINT_L(0, "No BDRV - binding to RAM\n");
+ s->storage = blk_blockalign(NULL, s->size);
memset(s->storage, 0xFF, s->size);
}
diff --git a/hw/bt/sdp.c b/hw/bt/sdp.c
index 218e075df7..c903747952 100644
--- a/hw/bt/sdp.c
+++ b/hw/bt/sdp.c
@@ -707,7 +707,7 @@ static void sdp_service_record_build(struct sdp_service_record_s *record,
len += sdp_attr_max_size(&def->attributes[record->attributes ++].data,
&record->uuids);
}
- record->uuids = 1 << ffs(record->uuids - 1);
+ record->uuids = pow2ceil(record->uuids);
record->attribute_list =
g_malloc0(record->attributes * sizeof(*record->attribute_list));
record->uuid =
diff --git a/hw/char/virtio-serial-bus.c b/hw/char/virtio-serial-bus.c
index e336bdb4a9..6e2ad8221b 100644
--- a/hw/char/virtio-serial-bus.c
+++ b/hw/char/virtio-serial-bus.c
@@ -814,12 +814,12 @@ static uint32_t find_free_port_id(VirtIOSerial *vser)
max_nr_ports = vser->serial.max_virtserial_ports;
for (i = 0; i < (max_nr_ports + 31) / 32; i++) {
- uint32_t map, bit;
+ uint32_t map, zeroes;
map = vser->ports_map[i];
- bit = ffs(~map);
- if (bit) {
- return (bit - 1) + i * 32;
+ zeroes = ctz32(~map);
+ if (zeroes != 32) {
+ return zeroes + i * 32;
}
}
return VIRTIO_CONSOLE_BAD_ID;
diff --git a/hw/display/tc6393xb.c b/hw/display/tc6393xb.c
index 4306adc959..66b7ade8da 100644
--- a/hw/display/tc6393xb.c
+++ b/hw/display/tc6393xb.c
@@ -171,7 +171,7 @@ static void tc6393xb_gpio_handler_update(TC6393xbState *s)
level = s->gpio_level & s->gpio_dir;
for (diff = s->prev_level ^ level; diff; diff ^= 1 << bit) {
- bit = ffs(diff) - 1;
+ bit = ctz32(diff);
qemu_set_irq(s->handler[bit], (level >> bit) & 1);
}
diff --git a/hw/gpio/max7310.c b/hw/gpio/max7310.c
index 7fbf313ce8..2f59b134ee 100644
--- a/hw/gpio/max7310.c
+++ b/hw/gpio/max7310.c
@@ -96,7 +96,7 @@ static int max7310_tx(I2CSlave *i2c, uint8_t data)
case 0x01: /* Output port */
for (diff = (data ^ s->level) & ~s->direction; diff;
diff &= ~(1 << line)) {
- line = ffs(diff) - 1;
+ line = ctz32(diff);
if (s->handler[line])
qemu_set_irq(s->handler[line], (data >> line) & 1);
}
diff --git a/hw/gpio/omap_gpio.c b/hw/gpio/omap_gpio.c
index 9a43486890..d92f8cfbae 100644
--- a/hw/gpio/omap_gpio.c
+++ b/hw/gpio/omap_gpio.c
@@ -125,8 +125,7 @@ static void omap_gpio_write(void *opaque, hwaddr addr,
case 0x04: /* DATA_OUTPUT */
diff = (s->outputs ^ value) & ~s->dir;
s->outputs = value;
- while ((ln = ffs(diff))) {
- ln --;
+ while ((ln = ctz32(diff)) != 32) {
if (s->handler[ln])
qemu_set_irq(s->handler[ln], (value >> ln) & 1);
diff &= ~(1 << ln);
@@ -138,8 +137,7 @@ static void omap_gpio_write(void *opaque, hwaddr addr,
s->dir = value;
value = s->outputs & ~s->dir;
- while ((ln = ffs(diff))) {
- ln --;
+ while ((ln = ctz32(diff)) != 32) {
if (s->handler[ln])
qemu_set_irq(s->handler[ln], (value >> ln) & 1);
diff &= ~(1 << ln);
@@ -253,8 +251,7 @@ static inline void omap2_gpio_module_out_update(struct omap2_gpio_s *s,
s->outputs ^= diff;
diff &= ~s->dir;
- while ((ln = ffs(diff))) {
- ln --;
+ while ((ln = ctz32(diff)) != 32) {
qemu_set_irq(s->handler[ln], (s->outputs >> ln) & 1);
diff &= ~(1 << ln);
}
@@ -442,8 +439,8 @@ static void omap2_gpio_module_write(void *opaque, hwaddr addr,
s->dir = value;
value = s->outputs & ~s->dir;
- while ((ln = ffs(diff))) {
- diff &= ~(1 <<-- ln);
+ while ((ln = ctz32(diff)) != 32) {
+ diff &= ~(1 << ln);
qemu_set_irq(s->handler[ln], (value >> ln) & 1);
}
diff --git a/hw/gpio/zaurus.c b/hw/gpio/zaurus.c
index 94083424f8..24a77272d7 100644
--- a/hw/gpio/zaurus.c
+++ b/hw/gpio/zaurus.c
@@ -65,7 +65,7 @@ static inline void scoop_gpio_handler_update(ScoopInfo *s) {
level = s->gpio_level & s->gpio_dir;
for (diff = s->prev_level ^ level; diff; diff ^= 1 << bit) {
- bit = ffs(diff) - 1;
+ bit = ctz32(diff);
qemu_set_irq(s->handler[bit], (level >> bit) & 1);
}
diff --git a/hw/i2c/omap_i2c.c b/hw/i2c/omap_i2c.c
index d63278dbde..b6f544a221 100644
--- a/hw/i2c/omap_i2c.c
+++ b/hw/i2c/omap_i2c.c
@@ -171,9 +171,13 @@ static uint32_t omap_i2c_read(void *opaque, hwaddr addr)
case 0x0c: /* I2C_IV */
if (s->revision >= OMAP2_INTR_REV)
break;
- ret = ffs(s->stat & s->mask);
- if (ret)
- s->stat ^= 1 << (ret - 1);
+ ret = ctz32(s->stat & s->mask);
+ if (ret != 32) {
+ s->stat ^= 1 << ret;
+ ret++;
+ } else {
+ ret = 0;
+ }
omap_i2c_interrupts_update(s);
return ret;
diff --git a/hw/intc/allwinner-a10-pic.c b/hw/intc/allwinner-a10-pic.c
index de820b9723..eed7621f13 100644
--- a/hw/intc/allwinner-a10-pic.c
+++ b/hw/intc/allwinner-a10-pic.c
@@ -23,7 +23,7 @@
static void aw_a10_pic_update(AwA10PICState *s)
{
uint8_t i;
- int irq = 0, fiq = 0, pending;
+ int irq = 0, fiq = 0, zeroes;
s->vector = 0;
@@ -32,9 +32,9 @@ static void aw_a10_pic_update(AwA10PICState *s)
fiq |= s->select[i] & s->irq_pending[i] & ~s->mask[i];
if (!s->vector) {
- pending = ffs(s->irq_pending[i] & ~s->mask[i]);
- if (pending) {
- s->vector = (i * 32 + pending - 1) * 4;
+ zeroes = ctz32(s->irq_pending[i] & ~s->mask[i]);
+ if (zeroes != 32) {
+ s->vector = (i * 32 + zeroes) * 4;
}
}
}
diff --git a/hw/intc/omap_intc.c b/hw/intc/omap_intc.c
index ad3931c112..e9b38a3c63 100644
--- a/hw/intc/omap_intc.c
+++ b/hw/intc/omap_intc.c
@@ -60,7 +60,7 @@ struct omap_intr_handler_s {
static void omap_inth_sir_update(struct omap_intr_handler_s *s, int is_fiq)
{
- int i, j, sir_intr, p_intr, p, f;
+ int i, j, sir_intr, p_intr, p;
uint32_t level;
sir_intr = 0;
p_intr = 255;
@@ -72,14 +72,15 @@ static void omap_inth_sir_update(struct omap_intr_handler_s *s, int is_fiq)
for (j = 0; j < s->nbanks; ++j) {
level = s->bank[j].irqs & ~s->bank[j].mask &
(is_fiq ? s->bank[j].fiq : ~s->bank[j].fiq);
- for (f = ffs(level), i = f - 1, level >>= f - 1; f; i += f,
- level >>= f) {
+
+ while (level != 0) {
+ i = ctz32(level);
p = s->bank[j].priority[i];
if (p <= p_intr) {
p_intr = p;
sir_intr = 32 * j + i;
}
- f = ffs(level >> 1);
+ level &= level - 1;
}
}
s->sir_intr[is_fiq] = sir_intr;
diff --git a/hw/pci-host/bonito.c b/hw/pci-host/bonito.c
index 8134d0bcd0..3a731fe18d 100644
--- a/hw/pci-host/bonito.c
+++ b/hw/pci-host/bonito.c
@@ -427,7 +427,7 @@ static uint32_t bonito_sbridge_pciaddr(void *opaque, hwaddr addr)
cfgaddr |= (s->regs[BONITO_PCIMAP_CFG] & 0xffff) << 16;
idsel = (cfgaddr & BONITO_PCICONF_IDSEL_MASK) >> BONITO_PCICONF_IDSEL_OFFSET;
- devno = ffs(idsel) - 1;
+ devno = ctz32(idsel);
funno = (cfgaddr & BONITO_PCICONF_FUN_MASK) >> BONITO_PCICONF_FUN_OFFSET;
regno = (cfgaddr & BONITO_PCICONF_REG_MASK) >> BONITO_PCICONF_REG_OFFSET;
diff --git a/hw/pci-host/uninorth.c b/hw/pci-host/uninorth.c
index 53f2b59ae8..f0144eb7b0 100644
--- a/hw/pci-host/uninorth.c
+++ b/hw/pci-host/uninorth.c
@@ -92,7 +92,10 @@ static uint32_t unin_get_config_reg(uint32_t reg, uint32_t addr)
uint32_t slot, func;
/* Grab CFA0 style values */
- slot = ffs(reg & 0xfffff800) - 1;
+ slot = ctz32(reg & 0xfffff800);
+ if (slot == 32) {
+ slot = -1; /* XXX: should this be 0? */
+ }
func = (reg >> 8) & 7;
/* ... and then convert them to x86 format */
diff --git a/hw/pci/msi.c b/hw/pci/msi.c
index 916e1a1e5b..2949938223 100644
--- a/hw/pci/msi.c
+++ b/hw/pci/msi.c
@@ -72,7 +72,7 @@ static inline uint8_t msi_cap_sizeof(uint16_t flags)
static inline unsigned int msi_nr_vectors(uint16_t flags)
{
return 1U <<
- ((flags & PCI_MSI_FLAGS_QSIZE) >> (ffs(PCI_MSI_FLAGS_QSIZE) - 1));
+ ((flags & PCI_MSI_FLAGS_QSIZE) >> ctz32(PCI_MSI_FLAGS_QSIZE));
}
static inline uint8_t msi_flags_off(const PCIDevice* dev)
@@ -175,9 +175,9 @@ int msi_init(struct PCIDevice *dev, uint8_t offset,
assert(nr_vectors > 0);
assert(nr_vectors <= PCI_MSI_VECTORS_MAX);
/* the nr of MSI vectors is up to 32 */
- vectors_order = ffs(nr_vectors) - 1;
+ vectors_order = ctz32(nr_vectors);
- flags = vectors_order << (ffs(PCI_MSI_FLAGS_QMASK) - 1);
+ flags = vectors_order << ctz32(PCI_MSI_FLAGS_QMASK);
if (msi64bit) {
flags |= PCI_MSI_FLAGS_64BIT;
}
@@ -355,12 +355,12 @@ void msi_write_config(PCIDevice *dev, uint32_t addr, uint32_t val, int len)
* just don't crash the host
*/
log_num_vecs =
- (flags & PCI_MSI_FLAGS_QSIZE) >> (ffs(PCI_MSI_FLAGS_QSIZE) - 1);
+ (flags & PCI_MSI_FLAGS_QSIZE) >> ctz32(PCI_MSI_FLAGS_QSIZE);
log_max_vecs =
- (flags & PCI_MSI_FLAGS_QMASK) >> (ffs(PCI_MSI_FLAGS_QMASK) - 1);
+ (flags & PCI_MSI_FLAGS_QMASK) >> ctz32(PCI_MSI_FLAGS_QMASK);
if (log_num_vecs > log_max_vecs) {
flags &= ~PCI_MSI_FLAGS_QSIZE;
- flags |= log_max_vecs << (ffs(PCI_MSI_FLAGS_QSIZE) - 1);
+ flags |= log_max_vecs << ctz32(PCI_MSI_FLAGS_QSIZE);
pci_set_word(dev->config + msi_flags_off(dev), flags);
}
diff --git a/hw/pci/pcie_aer.c b/hw/pci/pcie_aer.c
index eaa3e6ea94..b48c09cd11 100644
--- a/hw/pci/pcie_aer.c
+++ b/hw/pci/pcie_aer.c
@@ -410,7 +410,7 @@ static void pcie_aer_msg(PCIDevice *dev, const PCIEAERMsg *msg)
static void pcie_aer_update_log(PCIDevice *dev, const PCIEAERErr *err)
{
uint8_t *aer_cap = dev->config + dev->exp.aer_cap;
- uint8_t first_bit = ffs(err->status) - 1;
+ uint8_t first_bit = ctz32(err->status);
uint32_t errcap = pci_get_long(aer_cap + PCI_ERR_CAP);
int i;
diff --git a/hw/pci/shpc.c b/hw/pci/shpc.c
index 759910f79a..a706486394 100644
--- a/hw/pci/shpc.c
+++ b/hw/pci/shpc.c
@@ -61,7 +61,7 @@
/* Same slot state masks are used for command and status registers */
#define SHPC_SLOT_STATE_MASK 0x03
#define SHPC_SLOT_STATE_SHIFT \
- (ffs(SHPC_SLOT_STATE_MASK) - 1)
+ ctz32(SHPC_SLOT_STATE_MASK)
#define SHPC_STATE_NO 0x0
#define SHPC_STATE_PWRONLY 0x1
@@ -70,10 +70,10 @@
#define SHPC_SLOT_PWR_LED_MASK 0xC
#define SHPC_SLOT_PWR_LED_SHIFT \
- (ffs(SHPC_SLOT_PWR_LED_MASK) - 1)
+ ctz32(SHPC_SLOT_PWR_LED_MASK)
#define SHPC_SLOT_ATTN_LED_MASK 0x30
#define SHPC_SLOT_ATTN_LED_SHIFT \
- (ffs(SHPC_SLOT_ATTN_LED_MASK) - 1)
+ ctz32(SHPC_SLOT_ATTN_LED_MASK)
#define SHPC_LED_NO 0x0
#define SHPC_LED_ON 0x1
@@ -136,7 +136,7 @@ static int roundup_pow_of_two(int x)
static uint16_t shpc_get_status(SHPCDevice *shpc, int slot, uint16_t msk)
{
uint8_t *status = shpc->config + SHPC_SLOT_STATUS(slot);
- return (pci_get_word(status) & msk) >> (ffs(msk) - 1);
+ return (pci_get_word(status) & msk) >> ctz32(msk);
}
static void shpc_set_status(SHPCDevice *shpc,
@@ -144,7 +144,7 @@ static void shpc_set_status(SHPCDevice *shpc,
{
uint8_t *status = shpc->config + SHPC_SLOT_STATUS(slot);
pci_word_test_and_clear_mask(status, msk);
- pci_word_test_and_set_mask(status, value << (ffs(msk) - 1));
+ pci_word_test_and_set_mask(status, value << ctz32(msk));
}
static void shpc_interrupt_update(PCIDevice *d)
diff --git a/hw/pci/slotid_cap.c b/hw/pci/slotid_cap.c
index 62f7bae2f1..1c01d346c9 100644
--- a/hw/pci/slotid_cap.c
+++ b/hw/pci/slotid_cap.c
@@ -3,7 +3,7 @@
#include "qemu/error-report.h"
#define SLOTID_CAP_LENGTH 4
-#define SLOTID_NSLOTS_SHIFT (ffs(PCI_SID_ESR_NSLOTS) - 1)
+#define SLOTID_NSLOTS_SHIFT ctz32(PCI_SID_ESR_NSLOTS)
int slotid_cap_init(PCIDevice *d, int nslots,
uint8_t chassis,
diff --git a/hw/ppc/ppce500_spin.c b/hw/ppc/ppce500_spin.c
index d49f2b8803..a99f7b0397 100644
--- a/hw/ppc/ppce500_spin.c
+++ b/hw/ppc/ppce500_spin.c
@@ -74,7 +74,7 @@ static void spin_reset(void *opaque)
/* Create -kernel TLB entries for BookE, linearly spanning 256MB. */
static inline hwaddr booke206_page_size_to_tlb(uint64_t size)
{
- return (ffs(size >> 10) - 1) >> 1;
+ return ctz32(size >> 10) >> 1;
}
static void mmubooke_create_initial_mapping(CPUPPCState *env,
diff --git a/hw/scsi/megasas.c b/hw/scsi/megasas.c
index ad7317bfe9..91a5d97c73 100644
--- a/hw/scsi/megasas.c
+++ b/hw/scsi/megasas.c
@@ -804,7 +804,7 @@ static int megasas_ctrl_get_info(MegasasState *s, MegasasCmd *cmd)
MFI_INFO_LDOPS_READ_POLICY);
info.max_strips_per_io = cpu_to_le16(s->fw_sge);
info.stripe_sz_ops.min = 3;
- info.stripe_sz_ops.max = ffs(MEGASAS_MAX_SECTORS + 1) - 1;
+ info.stripe_sz_ops.max = ctz32(MEGASAS_MAX_SECTORS + 1);
info.properties.pred_fail_poll_interval = cpu_to_le16(300);
info.properties.intr_throttle_cnt = cpu_to_le16(16);
info.properties.intr_throttle_timeout = cpu_to_le16(50);
diff --git a/hw/sd/sd.c b/hw/sd/sd.c
index f955265f74..8abf0c9e31 100644
--- a/hw/sd/sd.c
+++ b/hw/sd/sd.c
@@ -796,8 +796,9 @@ static sd_rsp_type_t sd_normal_command(SDState *sd,
sd->vhs = 0;
/* No response if not exactly one VHS bit is set. */
- if (!(req.arg >> 8) || (req.arg >> ffs(req.arg & ~0xff)))
+ if (!(req.arg >> 8) || (req.arg >> (ctz32(req.arg & ~0xff) + 1))) {
return sd->spi ? sd_r7 : sd_r0;
+ }
/* Accept. */
sd->vhs = req.arg;
diff --git a/include/block/aio.h b/include/block/aio.h
index 7d1e26b33b..d2bb423de1 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -82,9 +82,6 @@ struct AioContext {
/* Used for aio_notify. */
EventNotifier notifier;
- /* GPollFDs for aio_poll() */
- GArray *pollfds;
-
/* Thread pool for performing work and receiving completion callbacks */
struct ThreadPool *thread_pool;
@@ -121,13 +118,14 @@ void aio_context_ref(AioContext *ctx);
void aio_context_unref(AioContext *ctx);
/* Take ownership of the AioContext. If the AioContext will be shared between
- * threads, a thread must have ownership when calling aio_poll().
+ * threads, and a thread does not want to be interrupted, it will have to
+ * take ownership around calls to aio_poll(). Otherwise, aio_poll()
+ * automatically takes care of calling aio_context_acquire and
+ * aio_context_release.
*
- * Note that multiple threads calling aio_poll() means timers, BHs, and
- * callbacks may be invoked from a different thread than they were registered
- * from. Therefore, code must use AioContext acquire/release or use
- * fine-grained synchronization to protect shared state if other threads will
- * be accessing it simultaneously.
+ * Access to timers and BHs from a thread that has not acquired AioContext
+ * is possible. Access to callbacks for now must be done while the AioContext
+ * is owned by the thread (FIXME).
*/
void aio_context_acquire(AioContext *ctx);
diff --git a/include/block/block.h b/include/block/block.h
index 4c57d63fe2..7d1a7174f6 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -382,7 +382,7 @@ void bdrv_lock_medium(BlockDriverState *bs, bool locked);
void bdrv_eject(BlockDriverState *bs, bool eject_flag);
const char *bdrv_get_format_name(BlockDriverState *bs);
BlockDriverState *bdrv_find_node(const char *node_name);
-BlockDeviceInfoList *bdrv_named_nodes_list(void);
+BlockDeviceInfoList *bdrv_named_nodes_list(Error **errp);
BlockDriverState *bdrv_lookup_bs(const char *device,
const char *node_name,
Error **errp);
@@ -398,6 +398,7 @@ void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
void *opaque);
const char *bdrv_get_node_name(const BlockDriverState *bs);
const char *bdrv_get_device_name(const BlockDriverState *bs);
+const char *bdrv_get_device_or_node_name(const BlockDriverState *bs);
int bdrv_get_flags(BlockDriverState *bs);
int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
const uint8_t *buf, int nb_sectors);
@@ -449,18 +450,39 @@ bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov);
struct HBitmapIter;
typedef struct BdrvDirtyBitmap BdrvDirtyBitmap;
-BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity,
+BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs,
+ uint32_t granularity,
+ const char *name,
Error **errp);
+int bdrv_dirty_bitmap_create_successor(BlockDriverState *bs,
+ BdrvDirtyBitmap *bitmap,
+ Error **errp);
+BdrvDirtyBitmap *bdrv_dirty_bitmap_abdicate(BlockDriverState *bs,
+ BdrvDirtyBitmap *bitmap,
+ Error **errp);
+BdrvDirtyBitmap *bdrv_reclaim_dirty_bitmap(BlockDriverState *bs,
+ BdrvDirtyBitmap *bitmap,
+ Error **errp);
+BdrvDirtyBitmap *bdrv_find_dirty_bitmap(BlockDriverState *bs,
+ const char *name);
+void bdrv_dirty_bitmap_make_anon(BdrvDirtyBitmap *bitmap);
void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap);
+void bdrv_disable_dirty_bitmap(BdrvDirtyBitmap *bitmap);
+void bdrv_enable_dirty_bitmap(BdrvDirtyBitmap *bitmap);
BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs);
+uint32_t bdrv_get_default_bitmap_granularity(BlockDriverState *bs);
+uint32_t bdrv_dirty_bitmap_granularity(BdrvDirtyBitmap *bitmap);
+bool bdrv_dirty_bitmap_enabled(BdrvDirtyBitmap *bitmap);
+bool bdrv_dirty_bitmap_frozen(BdrvDirtyBitmap *bitmap);
int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector);
-void bdrv_set_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap,
+void bdrv_set_dirty_bitmap(BdrvDirtyBitmap *bitmap,
int64_t cur_sector, int nr_sectors);
-void bdrv_reset_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap,
+void bdrv_reset_dirty_bitmap(BdrvDirtyBitmap *bitmap,
int64_t cur_sector, int nr_sectors);
-void bdrv_dirty_iter_init(BlockDriverState *bs,
- BdrvDirtyBitmap *bitmap, struct HBitmapIter *hbi);
-int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap);
+void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap);
+void bdrv_dirty_iter_init(BdrvDirtyBitmap *bitmap, struct HBitmapIter *hbi);
+void bdrv_set_dirty_iter(struct HBitmapIter *hbi, int64_t offset);
+int64_t bdrv_get_dirty_count(BdrvDirtyBitmap *bitmap);
void bdrv_enable_copy_on_read(BlockDriverState *bs);
void bdrv_disable_copy_on_read(BlockDriverState *bs);
diff --git a/include/block/block_int.h b/include/block/block_int.h
index dccb092df7..db29b7424e 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -439,6 +439,14 @@ extern BlockDriver bdrv_file;
extern BlockDriver bdrv_raw;
extern BlockDriver bdrv_qcow2;
+/**
+ * bdrv_setup_io_funcs:
+ *
+ * Prepare a #BlockDriver for I/O request processing by populating
+ * unimplemented coroutine and AIO interfaces with generic wrapper functions
+ * that fall back to implemented interfaces.
+ */
+void bdrv_setup_io_funcs(BlockDriver *bdrv);
int get_tmp_filename(char *filename, int size);
BlockDriver *bdrv_probe_all(const uint8_t *buf, int buf_size,
@@ -590,7 +598,7 @@ void commit_active_start(BlockDriverState *bs, BlockDriverState *base,
*/
void mirror_start(BlockDriverState *bs, BlockDriverState *target,
const char *replaces,
- int64_t speed, int64_t granularity, int64_t buf_size,
+ int64_t speed, uint32_t granularity, int64_t buf_size,
MirrorSyncMode mode, BlockdevOnError on_source_error,
BlockdevOnError on_target_error,
BlockCompletionFunc *cb,
@@ -602,6 +610,7 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target,
* @target: Block device to write to.
* @speed: The maximum speed, in bytes per second, or 0 for unlimited.
* @sync_mode: What parts of the disk image should be copied to the destination.
+ * @sync_bitmap: The dirty bitmap if sync_mode is MIRROR_SYNC_MODE_DIRTY_BITMAP.
* @on_source_error: The action to take upon error reading from the source.
* @on_target_error: The action to take upon error writing to the target.
* @cb: Completion function for the job.
@@ -612,6 +621,7 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target,
*/
void backup_start(BlockDriverState *bs, BlockDriverState *target,
int64_t speed, MirrorSyncMode sync_mode,
+ BdrvDirtyBitmap *sync_bitmap,
BlockdevOnError on_source_error,
BlockdevOnError on_target_error,
BlockCompletionFunc *cb, void *opaque,
@@ -624,4 +634,8 @@ bool blk_dev_is_tray_open(BlockBackend *blk);
bool blk_dev_is_medium_locked(BlockBackend *blk);
void blk_dev_resize_cb(BlockBackend *blk);
+void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors);
+void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
+ int nr_sectors);
+
#endif /* BLOCK_INT_H */
diff --git a/include/block/blockjob.h b/include/block/blockjob.h
index b6d4ebbe03..57d8ef13e2 100644
--- a/include/block/blockjob.h
+++ b/include/block/blockjob.h
@@ -79,10 +79,16 @@ struct BlockJob {
bool cancelled;
/**
- * Set to true if the job is either paused, or will pause itself
- * as soon as possible (if busy == true).
+ * Counter for pause requests. If non-zero, the block job is either paused,
+ * or if busy == true will pause itself as soon as possible.
*/
- bool paused;
+ int pause_count;
+
+ /**
+ * Set to true if the job is paused by user. Can be unpaused with the
+ * block-job-resume QMP command.
+ */
+ bool user_paused;
/**
* Set to false by the job while it is in a quiescent state, where
@@ -225,11 +231,19 @@ void block_job_pause(BlockJob *job);
* block_job_resume:
* @job: The job to be resumed.
*
- * Resume the specified job.
+ * Resume the specified job. Must be paired with a preceding block_job_pause.
*/
void block_job_resume(BlockJob *job);
/**
+ * block_job_enter:
+ * @job: The job to enter.
+ *
+ * Continue the specified job by entering the coroutine.
+ */
+void block_job_enter(BlockJob *job);
+
+/**
* block_job_event_cancelled:
* @job: The job whose information is requested.
*
diff --git a/include/block/qapi.h b/include/block/qapi.h
index 168d788521..327549d917 100644
--- a/include/block/qapi.h
+++ b/include/block/qapi.h
@@ -29,7 +29,7 @@
#include "block/block.h"
#include "block/snapshot.h"
-BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs);
+BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs, Error **errp);
int bdrv_query_snapshot_info_list(BlockDriverState *bs,
SnapshotInfoList **p_list,
Error **errp);
diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index b97c2956ec..d4ffead48a 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -568,7 +568,7 @@ static inline void
pci_set_byte_by_mask(uint8_t *config, uint8_t mask, uint8_t reg)
{
uint8_t val = pci_get_byte(config);
- uint8_t rval = reg << (ffs(mask) - 1);
+ uint8_t rval = reg << ctz32(mask);
pci_set_byte(config, (~mask & val) | (mask & rval));
}
@@ -576,14 +576,14 @@ static inline uint8_t
pci_get_byte_by_mask(uint8_t *config, uint8_t mask)
{
uint8_t val = pci_get_byte(config);
- return (val & mask) >> (ffs(mask) - 1);
+ return (val & mask) >> ctz32(mask);
}
static inline void
pci_set_word_by_mask(uint8_t *config, uint16_t mask, uint16_t reg)
{
uint16_t val = pci_get_word(config);
- uint16_t rval = reg << (ffs(mask) - 1);
+ uint16_t rval = reg << ctz32(mask);
pci_set_word(config, (~mask & val) | (mask & rval));
}
@@ -591,14 +591,14 @@ static inline uint16_t
pci_get_word_by_mask(uint8_t *config, uint16_t mask)
{
uint16_t val = pci_get_word(config);
- return (val & mask) >> (ffs(mask) - 1);
+ return (val & mask) >> ctz32(mask);
}
static inline void
pci_set_long_by_mask(uint8_t *config, uint32_t mask, uint32_t reg)
{
uint32_t val = pci_get_long(config);
- uint32_t rval = reg << (ffs(mask) - 1);
+ uint32_t rval = reg << ctz32(mask);
pci_set_long(config, (~mask & val) | (mask & rval));
}
@@ -606,14 +606,14 @@ static inline uint32_t
pci_get_long_by_mask(uint8_t *config, uint32_t mask)
{
uint32_t val = pci_get_long(config);
- return (val & mask) >> (ffs(mask) - 1);
+ return (val & mask) >> ctz32(mask);
}
static inline void
pci_set_quad_by_mask(uint8_t *config, uint64_t mask, uint64_t reg)
{
uint64_t val = pci_get_quad(config);
- uint64_t rval = reg << (ffs(mask) - 1);
+ uint64_t rval = reg << ctz64(mask); /* 64-bit mask: ctz32 truncates */
pci_set_quad(config, (~mask & val) | (mask & rval));
}
@@ -621,7 +621,7 @@ static inline uint64_t
pci_get_quad_by_mask(uint8_t *config, uint64_t mask)
{
uint64_t val = pci_get_quad(config);
- return (val & mask) >> (ffs(mask) - 1);
+ return (val & mask) >> ctz64(mask); /* 64-bit mask: ctz32 truncates */
}
PCIDevice *pci_create_multifunction(PCIBus *bus, int devfn, bool multifunction,
diff --git a/include/hw/pci/pcie_regs.h b/include/hw/pci/pcie_regs.h
index 848ab1c206..6a28b33e69 100644
--- a/include/hw/pci/pcie_regs.h
+++ b/include/hw/pci/pcie_regs.h
@@ -27,34 +27,34 @@
/* PCI_EXP_FLAGS */
#define PCI_EXP_FLAGS_VER2 2 /* for now, supports only ver. 2 */
-#define PCI_EXP_FLAGS_IRQ_SHIFT (ffs(PCI_EXP_FLAGS_IRQ) - 1)
-#define PCI_EXP_FLAGS_TYPE_SHIFT (ffs(PCI_EXP_FLAGS_TYPE) - 1)
+#define PCI_EXP_FLAGS_IRQ_SHIFT ctz32(PCI_EXP_FLAGS_IRQ)
+#define PCI_EXP_FLAGS_TYPE_SHIFT ctz32(PCI_EXP_FLAGS_TYPE)
/* PCI_EXP_LINK{CAP, STA} */
/* link speed */
#define PCI_EXP_LNK_LS_25 1
-#define PCI_EXP_LNK_MLW_SHIFT (ffs(PCI_EXP_LNKCAP_MLW) - 1)
+#define PCI_EXP_LNK_MLW_SHIFT ctz32(PCI_EXP_LNKCAP_MLW)
#define PCI_EXP_LNK_MLW_1 (1 << PCI_EXP_LNK_MLW_SHIFT)
/* PCI_EXP_LINKCAP */
-#define PCI_EXP_LNKCAP_ASPMS_SHIFT (ffs(PCI_EXP_LNKCAP_ASPMS) - 1)
+#define PCI_EXP_LNKCAP_ASPMS_SHIFT ctz32(PCI_EXP_LNKCAP_ASPMS)
#define PCI_EXP_LNKCAP_ASPMS_0S (1 << PCI_EXP_LNKCAP_ASPMS_SHIFT)
-#define PCI_EXP_LNKCAP_PN_SHIFT (ffs(PCI_EXP_LNKCAP_PN) - 1)
+#define PCI_EXP_LNKCAP_PN_SHIFT ctz32(PCI_EXP_LNKCAP_PN)
-#define PCI_EXP_SLTCAP_PSN_SHIFT (ffs(PCI_EXP_SLTCAP_PSN) - 1)
+#define PCI_EXP_SLTCAP_PSN_SHIFT ctz32(PCI_EXP_SLTCAP_PSN)
#define PCI_EXP_SLTCTL_IND_RESERVED 0x0
#define PCI_EXP_SLTCTL_IND_ON 0x1
#define PCI_EXP_SLTCTL_IND_BLINK 0x2
#define PCI_EXP_SLTCTL_IND_OFF 0x3
-#define PCI_EXP_SLTCTL_AIC_SHIFT (ffs(PCI_EXP_SLTCTL_AIC) - 1)
+#define PCI_EXP_SLTCTL_AIC_SHIFT ctz32(PCI_EXP_SLTCTL_AIC)
#define PCI_EXP_SLTCTL_AIC_OFF \
(PCI_EXP_SLTCTL_IND_OFF << PCI_EXP_SLTCTL_AIC_SHIFT)
-#define PCI_EXP_SLTCTL_PIC_SHIFT (ffs(PCI_EXP_SLTCTL_PIC) - 1)
+#define PCI_EXP_SLTCTL_PIC_SHIFT ctz32(PCI_EXP_SLTCTL_PIC)
#define PCI_EXP_SLTCTL_PIC_OFF \
(PCI_EXP_SLTCTL_IND_OFF << PCI_EXP_SLTCTL_PIC_SHIFT)
#define PCI_EXP_SLTCTL_PIC_ON \
@@ -109,7 +109,7 @@
#define PCI_ERR_ROOT_IRQ_MAX 32
#define PCI_ERR_ROOT_IRQ 0xf8000000
-#define PCI_ERR_ROOT_IRQ_SHIFT (ffs(PCI_ERR_ROOT_IRQ) - 1)
+#define PCI_ERR_ROOT_IRQ_SHIFT ctz32(PCI_ERR_ROOT_IRQ)
#define PCI_ERR_ROOT_STATUS_REPORT_MASK (PCI_ERR_ROOT_COR_RCV | \
PCI_ERR_ROOT_MULTI_COR_RCV | \
PCI_ERR_ROOT_UNCOR_RCV | \
diff --git a/include/qapi/qmp/qerror.h b/include/qapi/qmp/qerror.h
index 57a62d4b76..e5673394d3 100644
--- a/include/qapi/qmp/qerror.h
+++ b/include/qapi/qmp/qerror.h
@@ -37,9 +37,6 @@ void qerror_report_err(Error *err);
#define QERR_BASE_NOT_FOUND \
ERROR_CLASS_GENERIC_ERROR, "Base '%s' not found"
-#define QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED \
- ERROR_CLASS_GENERIC_ERROR, "Block format '%s' used by device '%s' does not support feature '%s'"
-
#define QERR_BLOCK_JOB_NOT_READY \
ERROR_CLASS_GENERIC_ERROR, "The active block job for device '%s' cannot be completed"
@@ -58,9 +55,6 @@ void qerror_report_err(Error *err);
#define QERR_DEVICE_IN_USE \
ERROR_CLASS_GENERIC_ERROR, "Device '%s' is in use"
-#define QERR_DEVICE_IS_READ_ONLY \
- ERROR_CLASS_GENERIC_ERROR, "Device '%s' is read only"
-
#define QERR_DEVICE_NO_HOTPLUG \
ERROR_CLASS_GENERIC_ERROR, "Device '%s' does not support hotplugging"
diff --git a/include/qemu/hbitmap.h b/include/qemu/hbitmap.h
index 550d7ce2c3..f0a85f8649 100644
--- a/include/qemu/hbitmap.h
+++ b/include/qemu/hbitmap.h
@@ -65,6 +65,29 @@ struct HBitmapIter {
HBitmap *hbitmap_alloc(uint64_t size, int granularity);
/**
+ * hbitmap_truncate:
+ * @hb: The bitmap to change the size of.
+ * @size: The number of elements to change the bitmap to accommodate.
+ *
+ * Truncate or grow an existing bitmap to accommodate a new number of elements.
+ * This may invalidate existing HBitmapIterators.
+ */
+void hbitmap_truncate(HBitmap *hb, uint64_t size);
+
+/**
+ * hbitmap_merge:
+ * @a: The bitmap to store the result in.
+ * @b: The bitmap to merge into @a.
+ * @return true if the merge was successful,
+ * false if it was not attempted.
+ *
+ * Merge two bitmaps together.
+ * A := A (BITOR) B.
+ * B is left unmodified.
+ */
+bool hbitmap_merge(HBitmap *a, const HBitmap *b);
+
+/**
* hbitmap_empty:
* @hb: HBitmap to operate on.
*
diff --git a/include/standard-headers/linux/virtio_blk.h b/include/standard-headers/linux/virtio_blk.h
index 12016b47f3..cd601f4069 100644
--- a/include/standard-headers/linux/virtio_blk.h
+++ b/include/standard-headers/linux/virtio_blk.h
@@ -58,7 +58,7 @@ struct virtio_blk_config {
uint32_t size_max;
/* The maximum number of segments (if VIRTIO_BLK_F_SEG_MAX) */
uint32_t seg_max;
- /* geometry the device (if VIRTIO_BLK_F_GEOMETRY) */
+ /* geometry of the device (if VIRTIO_BLK_F_GEOMETRY) */
struct virtio_blk_geometry {
uint16_t cylinders;
uint8_t heads;
@@ -117,7 +117,11 @@ struct virtio_blk_config {
#define VIRTIO_BLK_T_BARRIER 0x80000000
#endif /* !VIRTIO_BLK_NO_LEGACY */
-/* This is the first element of the read scatter-gather list. */
+/*
+ * This comes first in the read scatter-gather list.
+ * For legacy virtio, if VIRTIO_F_ANY_LAYOUT is not negotiated,
+ * this is the first element of the read scatter-gather list.
+ */
struct virtio_blk_outhdr {
/* VIRTIO_BLK_T* */
__virtio32 type;
diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h
index 77e9b9c370..b4a4d5e0b9 100644
--- a/include/sysemu/block-backend.h
+++ b/include/sysemu/block-backend.h
@@ -87,6 +87,8 @@ int blk_read_unthrottled(BlockBackend *blk, int64_t sector_num, uint8_t *buf,
int nb_sectors);
int blk_write(BlockBackend *blk, int64_t sector_num, const uint8_t *buf,
int nb_sectors);
+int blk_write_zeroes(BlockBackend *blk, int64_t sector_num,
+ int nb_sectors, BdrvRequestFlags flags);
BlockAIOCB *blk_aio_write_zeroes(BlockBackend *blk, int64_t sector_num,
int nb_sectors, BdrvRequestFlags flags,
BlockCompletionFunc *cb, void *opaque);
diff --git a/include/sysemu/os-win32.h b/include/sysemu/os-win32.h
index 9cc9e08139..4035c4fe54 100644
--- a/include/sysemu/os-win32.h
+++ b/include/sysemu/os-win32.h
@@ -72,9 +72,6 @@
#define sigsetjmp(env, savemask) setjmp(env)
#define siglongjmp(env, val) longjmp(env, val)
-/* Declaration of ffs() is missing in MinGW's strings.h. */
-int ffs(int i);
-
/* Missing POSIX functions. Don't use MinGW-w64 macros. */
#undef gmtime_r
struct tm *gmtime_r(const time_t *timep, struct tm *result);
diff --git a/iothread.c b/iothread.c
index 342a23fcb0..a1f91099bc 100644
--- a/iothread.c
+++ b/iothread.c
@@ -31,21 +31,14 @@ typedef ObjectClass IOThreadClass;
static void *iothread_run(void *opaque)
{
IOThread *iothread = opaque;
- bool blocking;
qemu_mutex_lock(&iothread->init_done_lock);
iothread->thread_id = qemu_get_thread_id();
qemu_cond_signal(&iothread->init_done_cond);
qemu_mutex_unlock(&iothread->init_done_lock);
- while (!iothread->stopping) {
- aio_context_acquire(iothread->ctx);
- blocking = true;
- while (!iothread->stopping && aio_poll(iothread->ctx, blocking)) {
- /* Progress was made, keep going */
- blocking = false;
- }
- aio_context_release(iothread->ctx);
+ while (!atomic_read(&iothread->stopping)) {
+ aio_poll(iothread->ctx, true);
}
return NULL;
}
diff --git a/kvm-all.c b/kvm-all.c
index 4ec153df93..2a717e5b50 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -1141,18 +1141,18 @@ static int kvm_irqchip_get_virq(KVMState *s)
{
uint32_t *word = s->used_gsi_bitmap;
int max_words = ALIGN(s->gsi_count, 32) / 32;
- int i, bit;
+ int i, zeroes;
bool retry = true;
again:
/* Return the lowest unused GSI in the bitmap */
for (i = 0; i < max_words; i++) {
- bit = ffs(~word[i]);
- if (!bit) {
+ zeroes = ctz32(~word[i]);
+ if (zeroes == 32) {
continue;
}
- return bit - 1 + i * 32;
+ return zeroes + i * 32;
}
if (!s->direct_msi && retry) {
retry = false;
diff --git a/migration/block.c b/migration/block.c
index 085c0fae05..ddb59ccf87 100644
--- a/migration/block.c
+++ b/migration/block.c
@@ -304,7 +304,7 @@ static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds)
blk->aiocb = bdrv_aio_readv(bs, cur_sector, &blk->qiov,
nr_sectors, blk_mig_read_cb, blk);
- bdrv_reset_dirty_bitmap(bs, bmds->dirty_bitmap, cur_sector, nr_sectors);
+ bdrv_reset_dirty_bitmap(bmds->dirty_bitmap, cur_sector, nr_sectors);
qemu_mutex_unlock_iothread();
bmds->cur_sector = cur_sector + nr_sectors;
@@ -320,7 +320,7 @@ static int set_dirty_tracking(void)
QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
bmds->dirty_bitmap = bdrv_create_dirty_bitmap(bmds->bs, BLOCK_SIZE,
- NULL);
+ NULL, NULL);
if (!bmds->dirty_bitmap) {
ret = -errno;
goto fail;
@@ -497,8 +497,7 @@ static int mig_save_device_dirty(QEMUFile *f, BlkMigDevState *bmds,
g_free(blk);
}
- bdrv_reset_dirty_bitmap(bmds->bs, bmds->dirty_bitmap, sector,
- nr_sectors);
+ bdrv_reset_dirty_bitmap(bmds->dirty_bitmap, sector, nr_sectors);
break;
}
sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
@@ -584,7 +583,7 @@ static int64_t get_remaining_dirty(void)
int64_t dirty = 0;
QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
- dirty += bdrv_get_dirty_count(bmds->bs, bmds->dirty_bitmap);
+ dirty += bdrv_get_dirty_count(bmds->dirty_bitmap);
}
return dirty << BDRV_SECTOR_BITS;
diff --git a/qapi/block-core.json b/qapi/block-core.json
index 78730846c2..1c17224c77 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -330,14 +330,19 @@
#
# Block dirty bitmap information.
#
+# @name: #optional the name of the dirty bitmap (Since 2.4)
+#
# @count: number of dirty bytes according to the dirty bitmap
#
# @granularity: granularity of the dirty bitmap in bytes (since 1.4)
#
+# @frozen: whether the dirty bitmap is frozen (Since 2.4)
+#
# Since: 1.3
##
{ 'type': 'BlockDirtyInfo',
- 'data': {'count': 'int', 'granularity': 'int'} }
+ 'data': {'*name': 'str', 'count': 'int', 'granularity': 'uint32',
+ 'frozen': 'bool'} }
##
# @BlockInfo:
@@ -510,10 +515,12 @@
#
# @none: only copy data written from now on
#
+# @dirty-bitmap: only copy data described by the dirty bitmap. Since: 2.4
+#
# Since: 1.3
##
{ 'enum': 'MirrorSyncMode',
- 'data': ['top', 'full', 'none'] }
+ 'data': ['top', 'full', 'none', 'dirty-bitmap'] }
##
# @BlockJobType:
@@ -688,14 +695,18 @@
# probe if @mode is 'existing', else the format of the source
#
# @sync: what parts of the disk image should be copied to the destination
-# (all the disk, only the sectors allocated in the topmost image, or
-# only new I/O).
+# (all the disk, only the sectors allocated in the topmost image, from a
+# dirty bitmap, or only new I/O).
#
# @mode: #optional whether and how QEMU should create a new image, default is
# 'absolute-paths'.
#
# @speed: #optional the maximum speed, in bytes per second
#
+# @bitmap: #optional the name of the dirty bitmap if sync is "dirty-bitmap".
+# Must be present if sync is "dirty-bitmap", must NOT be present
+# otherwise. (Since 2.4)
+#
# @on-source-error: #optional the action to take on an error on the source,
# default 'report'. 'stop' and 'enospc' can only be used
# if the block device supports io-status (see BlockInfo).
@@ -713,7 +724,7 @@
{ 'type': 'DriveBackup',
'data': { 'device': 'str', 'target': 'str', '*format': 'str',
'sync': 'MirrorSyncMode', '*mode': 'NewImageMode',
- '*speed': 'int',
+ '*speed': 'int', '*bitmap': 'str',
'*on-source-error': 'BlockdevOnError',
'*on-target-error': 'BlockdevOnError' } }
@@ -958,6 +969,76 @@
'*on-target-error': 'BlockdevOnError' } }
##
+# @BlockDirtyBitmap
+#
+# @node: name of device/node which the bitmap is tracking
+#
+# @name: name of the dirty bitmap
+#
+# Since 2.4
+##
+{ 'type': 'BlockDirtyBitmap',
+ 'data': { 'node': 'str', 'name': 'str' } }
+
+##
+# @BlockDirtyBitmapAdd
+#
+# @node: name of device/node which the bitmap is tracking
+#
+# @name: name of the dirty bitmap
+#
+# @granularity: #optional the bitmap granularity, default is 64k for
+# block-dirty-bitmap-add
+#
+# Since 2.4
+##
+{ 'type': 'BlockDirtyBitmapAdd',
+ 'data': { 'node': 'str', 'name': 'str', '*granularity': 'uint32' } }
+
+##
+# @block-dirty-bitmap-add
+#
+# Create a dirty bitmap with a name on the node
+#
+# Returns: nothing on success
+# If @node is not a valid block device or node, DeviceNotFound
+# If @name is already taken, GenericError with an explanation
+#
+# Since 2.4
+##
+{ 'command': 'block-dirty-bitmap-add',
+ 'data': 'BlockDirtyBitmapAdd' }
+
+##
+# @block-dirty-bitmap-remove
+#
+# Remove a dirty bitmap on the node
+#
+# Returns: nothing on success
+# If @node is not a valid block device or node, DeviceNotFound
+# If @name is not found, GenericError with an explanation
+# If @name is frozen by an operation, GenericError
+#
+# Since 2.4
+##
+{ 'command': 'block-dirty-bitmap-remove',
+ 'data': 'BlockDirtyBitmap' }
+
+##
+# @block-dirty-bitmap-clear
+#
+# Clear (reset) a dirty bitmap on the device
+#
+# Returns: nothing on success
+# If @node is not a valid block device or node, DeviceNotFound
+# If @name is not found, GenericError with an explanation
+#
+# Since 2.4
+##
+{ 'command': 'block-dirty-bitmap-clear',
+ 'data': 'BlockDirtyBitmap' }
+
+##
# @block_set_io_throttle:
#
# Change I/O throttle limits for a block drive.
@@ -1310,11 +1391,14 @@
# Driver specific block device options for the null backend.
#
# @size: #optional size of the device in bytes.
+# @latency-ns: #optional emulated latency (in nanoseconds) in processing
+# requests. Defaults to zero, which completes requests immediately.
+# (Since 2.4)
#
# Since: 2.2
##
{ 'type': 'BlockdevOptionsNull',
- 'data': { '*size': 'int' } }
+ 'data': { '*size': 'int', '*latency-ns': 'uint64' } }
##
# @BlockdevOptionsVVFAT
@@ -1754,7 +1838,11 @@
#
# Emitted when a corruption has been detected in a disk image
#
-# @device: device name
+# @device: device name. This is always present for compatibility
+# reasons, but it can be empty ("") if the image does not
+# have a device name associated.
+#
+# @node-name: #optional node name (Since: 2.4)
#
# @msg: informative message for human consumption, such as the kind of
# corruption being detected. It should not be parsed by machine as it is
@@ -1773,11 +1861,12 @@
# Since: 1.7
##
{ 'event': 'BLOCK_IMAGE_CORRUPTED',
- 'data': { 'device' : 'str',
- 'msg' : 'str',
- '*offset': 'int',
- '*size' : 'int',
- 'fatal' : 'bool' } }
+ 'data': { 'device' : 'str',
+ '*node-name' : 'str',
+ 'msg' : 'str',
+ '*offset' : 'int',
+ '*size' : 'int',
+ 'fatal' : 'bool' } }
##
# @BLOCK_IO_ERROR
diff --git a/qemu-img.c b/qemu-img.c
index 9dddfbefce..8d30e43b53 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -1305,20 +1305,312 @@ out3:
return ret;
}
+enum ImgConvertBlockStatus {
+ BLK_DATA,
+ BLK_ZERO,
+ BLK_BACKING_FILE,
+};
+
+typedef struct ImgConvertState {
+ BlockBackend **src;
+ int64_t *src_sectors;
+ int src_cur, src_num;
+ int64_t src_cur_offset;
+ int64_t total_sectors;
+ int64_t allocated_sectors;
+ enum ImgConvertBlockStatus status;
+ int64_t sector_next_status;
+ BlockBackend *target;
+ bool has_zero_init;
+ bool compressed;
+ bool target_has_backing;
+ int min_sparse;
+ size_t cluster_sectors;
+ size_t buf_sectors;
+} ImgConvertState;
+
+static void convert_select_part(ImgConvertState *s, int64_t sector_num)
+{
+ assert(sector_num >= s->src_cur_offset);
+ while (sector_num - s->src_cur_offset >= s->src_sectors[s->src_cur]) {
+ s->src_cur_offset += s->src_sectors[s->src_cur];
+ s->src_cur++;
+ assert(s->src_cur < s->src_num);
+ }
+}
+
+static int convert_iteration_sectors(ImgConvertState *s, int64_t sector_num)
+{
+ int64_t ret;
+ int n;
+
+ convert_select_part(s, sector_num);
+
+ assert(s->total_sectors > sector_num);
+ n = MIN(s->total_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS);
+
+ if (s->sector_next_status <= sector_num) {
+ ret = bdrv_get_block_status(blk_bs(s->src[s->src_cur]),
+ sector_num - s->src_cur_offset,
+ n, &n);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (ret & BDRV_BLOCK_ZERO) {
+ s->status = BLK_ZERO;
+ } else if (ret & BDRV_BLOCK_DATA) {
+ s->status = BLK_DATA;
+ } else if (!s->target_has_backing) {
+ /* Without a target backing file we must copy over the contents of
+ * the backing file as well. */
+ /* TODO Check block status of the backing file chain to avoid
+ * needlessly reading zeroes and limiting the iteration to the
+ * buffer size */
+ s->status = BLK_DATA;
+ } else {
+ s->status = BLK_BACKING_FILE;
+ }
+
+ s->sector_next_status = sector_num + n;
+ }
+
+ n = MIN(n, s->sector_next_status - sector_num);
+ if (s->status == BLK_DATA) {
+ n = MIN(n, s->buf_sectors);
+ }
+
+ /* We need to write complete clusters for compressed images, so if an
+ * unallocated area is shorter than that, we must consider the whole
+ * cluster allocated. */
+ if (s->compressed) {
+ if (n < s->cluster_sectors) {
+ n = MIN(s->cluster_sectors, s->total_sectors - sector_num);
+ s->status = BLK_DATA;
+ } else {
+ n = QEMU_ALIGN_DOWN(n, s->cluster_sectors);
+ }
+ }
+
+ return n;
+}
+
+static int convert_read(ImgConvertState *s, int64_t sector_num, int nb_sectors,
+ uint8_t *buf)
+{
+ int n;
+ int ret;
+
+ if (s->status == BLK_ZERO || s->status == BLK_BACKING_FILE) {
+ return 0;
+ }
+
+ assert(nb_sectors <= s->buf_sectors);
+ while (nb_sectors > 0) {
+ BlockBackend *blk;
+ int64_t bs_sectors;
+
+ /* In the case of compression with multiple source files, we can get a
+ * nb_sectors that spreads into the next part. So we must be able to
+ * read across multiple BDSes for one convert_read() call. */
+ convert_select_part(s, sector_num);
+ blk = s->src[s->src_cur];
+ bs_sectors = s->src_sectors[s->src_cur];
+
+ n = MIN(nb_sectors, bs_sectors - (sector_num - s->src_cur_offset));
+ ret = blk_read(blk, sector_num - s->src_cur_offset, buf, n);
+ if (ret < 0) {
+ return ret;
+ }
+
+ sector_num += n;
+ nb_sectors -= n;
+ buf += n * BDRV_SECTOR_SIZE;
+ }
+
+ return 0;
+}
+
+static int convert_write(ImgConvertState *s, int64_t sector_num, int nb_sectors,
+ const uint8_t *buf)
+{
+ int ret;
+
+ while (nb_sectors > 0) {
+ int n = nb_sectors;
+
+ switch (s->status) {
+ case BLK_BACKING_FILE:
+ /* If we have a backing file, leave clusters unallocated that are
+ * unallocated in the source image, so that the backing file is
+ * visible at the respective offset. */
+ assert(s->target_has_backing);
+ break;
+
+ case BLK_DATA:
+ /* We must always write compressed clusters as a whole, so don't
+ * try to find zeroed parts in the buffer. We can only save the
+ * write if the buffer is completely zeroed and we're allowed to
+ * keep the target sparse. */
+ if (s->compressed) {
+ if (s->has_zero_init && s->min_sparse &&
+ buffer_is_zero(buf, n * BDRV_SECTOR_SIZE))
+ {
+ assert(!s->target_has_backing);
+ break;
+ }
+
+ ret = blk_write_compressed(s->target, sector_num, buf, n);
+ if (ret < 0) {
+ return ret;
+ }
+ break;
+ }
+
+ /* If there is real non-zero data or we're told to keep the target
+ * fully allocated (-S 0), we must write it. Otherwise we can treat
+ * it as zero sectors. */
+ if (!s->min_sparse ||
+ is_allocated_sectors_min(buf, n, &n, s->min_sparse))
+ {
+ ret = blk_write(s->target, sector_num, buf, n);
+ if (ret < 0) {
+ return ret;
+ }
+ break;
+ }
+ /* fall-through */
+
+ case BLK_ZERO:
+ if (s->has_zero_init) {
+ break;
+ }
+ ret = blk_write_zeroes(s->target, sector_num, n, 0);
+ if (ret < 0) {
+ return ret;
+ }
+ break;
+ }
+
+ sector_num += n;
+ nb_sectors -= n;
+ buf += n * BDRV_SECTOR_SIZE;
+ }
+
+ return 0;
+}
+
+static int convert_do_copy(ImgConvertState *s)
+{
+ uint8_t *buf = NULL;
+ int64_t sector_num, allocated_done;
+ int ret;
+ int n;
+
+ /* Check whether we have zero initialisation or can get it efficiently */
+ s->has_zero_init = s->min_sparse && !s->target_has_backing
+ ? bdrv_has_zero_init(blk_bs(s->target))
+ : false;
+
+ if (!s->has_zero_init && !s->target_has_backing &&
+ bdrv_can_write_zeroes_with_unmap(blk_bs(s->target)))
+ {
+ ret = bdrv_make_zero(blk_bs(s->target), BDRV_REQ_MAY_UNMAP);
+ if (ret == 0) {
+ s->has_zero_init = true;
+ }
+ }
+
+ /* Allocate buffer for copied data. For compressed images, only one cluster
+ * can be copied at a time. */
+ if (s->compressed) {
+ if (s->cluster_sectors <= 0 || s->cluster_sectors > s->buf_sectors) {
+ error_report("invalid cluster size");
+ ret = -EINVAL;
+ goto fail;
+ }
+ s->buf_sectors = s->cluster_sectors;
+ }
+ buf = blk_blockalign(s->target, s->buf_sectors * BDRV_SECTOR_SIZE);
+
+ /* Calculate allocated sectors for progress */
+ s->allocated_sectors = 0;
+ sector_num = 0;
+ while (sector_num < s->total_sectors) {
+ n = convert_iteration_sectors(s, sector_num);
+ if (n < 0) {
+ ret = n;
+ goto fail;
+ }
+ if (s->status == BLK_DATA) {
+ s->allocated_sectors += n;
+ }
+ sector_num += n;
+ }
+
+ /* Do the copy */
+ s->src_cur = 0;
+ s->src_cur_offset = 0;
+ s->sector_next_status = 0;
+
+ sector_num = 0;
+ allocated_done = 0;
+
+ while (sector_num < s->total_sectors) {
+ n = convert_iteration_sectors(s, sector_num);
+ if (n < 0) {
+ ret = n;
+ goto fail;
+ }
+ if (s->status == BLK_DATA) {
+ allocated_done += n;
+ qemu_progress_print(100.0 * allocated_done / s->allocated_sectors,
+ 0);
+ }
+
+ ret = convert_read(s, sector_num, n, buf);
+ if (ret < 0) {
+ error_report("error while reading sector %" PRId64
+ ": %s", sector_num, strerror(-ret));
+ goto fail;
+ }
+
+ ret = convert_write(s, sector_num, n, buf);
+ if (ret < 0) {
+ error_report("error while writing sector %" PRId64
+ ": %s", sector_num, strerror(-ret));
+ goto fail;
+ }
+
+ sector_num += n;
+ }
+
+ if (s->compressed) {
+ /* signal EOF to align */
+ ret = blk_write_compressed(s->target, 0, NULL, 0);
+ if (ret < 0) {
+ goto fail;
+ }
+ }
+
+ ret = 0;
+fail:
+ qemu_vfree(buf);
+ return ret;
+}
+
static int img_convert(int argc, char **argv)
{
- int c, n, n1, bs_n, bs_i, compress, cluster_sectors, skip_create;
+ int c, bs_n, bs_i, compress, cluster_sectors, skip_create;
int64_t ret = 0;
int progress = 0, flags, src_flags;
const char *fmt, *out_fmt, *cache, *src_cache, *out_baseimg, *out_filename;
BlockDriver *drv, *proto_drv;
BlockBackend **blk = NULL, *out_blk = NULL;
BlockDriverState **bs = NULL, *out_bs = NULL;
- int64_t total_sectors, nb_sectors, sector_num, bs_offset;
+ int64_t total_sectors;
int64_t *bs_sectors = NULL;
- uint8_t * buf = NULL;
size_t bufsectors = IO_BUF_SIZE / BDRV_SECTOR_SIZE;
- const uint8_t *buf1;
BlockDriverInfo bdi;
QemuOpts *opts = NULL;
QemuOptsList *create_opts = NULL;
@@ -1329,6 +1621,7 @@ static int img_convert(int argc, char **argv)
bool quiet = false;
Error *local_err = NULL;
QemuOpts *sn_opts = NULL;
+ ImgConvertState state;
fmt = NULL;
out_fmt = "raw";
@@ -1627,9 +1920,6 @@ static int img_convert(int argc, char **argv)
}
out_bs = blk_bs(out_blk);
- bs_i = 0;
- bs_offset = 0;
-
/* increase bufsectors from the default 4096 (2M) if opt_transfer_length
* or discard_alignment of the out_bs is greater. Limit to 32768 (16MB)
* as maximum. */
@@ -1638,8 +1928,6 @@ static int img_convert(int argc, char **argv)
out_bs->bl.discard_alignment))
);
- buf = blk_blockalign(out_blk, bufsectors * BDRV_SECTOR_SIZE);
-
if (skip_create) {
int64_t output_sectors = blk_nb_sectors(out_blk);
if (output_sectors < 0) {
@@ -1666,203 +1954,20 @@ static int img_convert(int argc, char **argv)
cluster_sectors = bdi.cluster_size / BDRV_SECTOR_SIZE;
}
- if (compress) {
- if (cluster_sectors <= 0 || cluster_sectors > bufsectors) {
- error_report("invalid cluster size");
- ret = -1;
- goto out;
- }
- sector_num = 0;
-
- nb_sectors = total_sectors;
-
- for(;;) {
- int64_t bs_num;
- int remainder;
- uint8_t *buf2;
-
- nb_sectors = total_sectors - sector_num;
- if (nb_sectors <= 0)
- break;
- if (nb_sectors >= cluster_sectors)
- n = cluster_sectors;
- else
- n = nb_sectors;
-
- bs_num = sector_num - bs_offset;
- assert (bs_num >= 0);
- remainder = n;
- buf2 = buf;
- while (remainder > 0) {
- int nlow;
- while (bs_num == bs_sectors[bs_i]) {
- bs_offset += bs_sectors[bs_i];
- bs_i++;
- assert (bs_i < bs_n);
- bs_num = 0;
- /* printf("changing part: sector_num=%" PRId64 ", "
- "bs_i=%d, bs_offset=%" PRId64 ", bs_sectors=%" PRId64
- "\n", sector_num, bs_i, bs_offset, bs_sectors[bs_i]); */
- }
- assert (bs_num < bs_sectors[bs_i]);
-
- nlow = remainder > bs_sectors[bs_i] - bs_num
- ? bs_sectors[bs_i] - bs_num : remainder;
-
- ret = blk_read(blk[bs_i], bs_num, buf2, nlow);
- if (ret < 0) {
- error_report("error while reading sector %" PRId64 ": %s",
- bs_num, strerror(-ret));
- goto out;
- }
-
- buf2 += nlow * 512;
- bs_num += nlow;
-
- remainder -= nlow;
- }
- assert (remainder == 0);
-
- if (!buffer_is_zero(buf, n * BDRV_SECTOR_SIZE)) {
- ret = blk_write_compressed(out_blk, sector_num, buf, n);
- if (ret != 0) {
- error_report("error while compressing sector %" PRId64
- ": %s", sector_num, strerror(-ret));
- goto out;
- }
- }
- sector_num += n;
- qemu_progress_print(100.0 * sector_num / total_sectors, 0);
- }
- /* signal EOF to align */
- blk_write_compressed(out_blk, 0, NULL, 0);
- } else {
- int64_t sectors_to_read, sectors_read, sector_num_next_status;
- bool count_allocated_sectors;
- int has_zero_init = min_sparse ? bdrv_has_zero_init(out_bs) : 0;
-
- if (!has_zero_init && bdrv_can_write_zeroes_with_unmap(out_bs)) {
- ret = bdrv_make_zero(out_bs, BDRV_REQ_MAY_UNMAP);
- if (ret < 0) {
- goto out;
- }
- has_zero_init = 1;
- }
-
- sectors_to_read = total_sectors;
- count_allocated_sectors = progress && (out_baseimg || has_zero_init);
-restart:
- sector_num = 0; // total number of sectors converted so far
- sectors_read = 0;
- sector_num_next_status = 0;
-
- for(;;) {
- nb_sectors = total_sectors - sector_num;
- if (nb_sectors <= 0) {
- if (count_allocated_sectors) {
- sectors_to_read = sectors_read;
- count_allocated_sectors = false;
- goto restart;
- }
- ret = 0;
- break;
- }
-
- while (sector_num - bs_offset >= bs_sectors[bs_i]) {
- bs_offset += bs_sectors[bs_i];
- bs_i ++;
- assert (bs_i < bs_n);
- /* printf("changing part: sector_num=%" PRId64 ", bs_i=%d, "
- "bs_offset=%" PRId64 ", bs_sectors=%" PRId64 "\n",
- sector_num, bs_i, bs_offset, bs_sectors[bs_i]); */
- }
-
- if ((out_baseimg || has_zero_init) &&
- sector_num >= sector_num_next_status) {
- n = nb_sectors > INT_MAX ? INT_MAX : nb_sectors;
- ret = bdrv_get_block_status(bs[bs_i], sector_num - bs_offset,
- n, &n1);
- if (ret < 0) {
- error_report("error while reading block status of sector %"
- PRId64 ": %s", sector_num - bs_offset,
- strerror(-ret));
- goto out;
- }
- /* If the output image is zero initialized, we are not working
- * on a shared base and the input is zero we can skip the next
- * n1 sectors */
- if (has_zero_init && !out_baseimg && (ret & BDRV_BLOCK_ZERO)) {
- sector_num += n1;
- continue;
- }
- /* If the output image is being created as a copy on write
- * image, assume that sectors which are unallocated in the
- * input image are present in both the output's and input's
- * base images (no need to copy them). */
- if (out_baseimg) {
- if (!(ret & BDRV_BLOCK_DATA)) {
- sector_num += n1;
- continue;
- }
- /* The next 'n1' sectors are allocated in the input image.
- * Copy only those as they may be followed by unallocated
- * sectors. */
- nb_sectors = n1;
- }
- /* avoid redundant callouts to get_block_status */
- sector_num_next_status = sector_num + n1;
- }
-
- n = MIN(nb_sectors, bufsectors);
-
- /* round down request length to an aligned sector, but
- * do not bother doing this on short requests. They happen
- * when we found an all-zero area, and the next sector to
- * write will not be sector_num + n. */
- if (cluster_sectors > 0 && n >= cluster_sectors) {
- int64_t next_aligned_sector = (sector_num + n);
- next_aligned_sector -= next_aligned_sector % cluster_sectors;
- if (sector_num + n > next_aligned_sector) {
- n = next_aligned_sector - sector_num;
- }
- }
-
- n = MIN(n, bs_sectors[bs_i] - (sector_num - bs_offset));
-
- sectors_read += n;
- if (count_allocated_sectors) {
- sector_num += n;
- continue;
- }
+ state = (ImgConvertState) {
+ .src = blk,
+ .src_sectors = bs_sectors,
+ .src_num = bs_n,
+ .total_sectors = total_sectors,
+ .target = out_blk,
+ .compressed = compress,
+ .target_has_backing = (bool) out_baseimg,
+ .min_sparse = min_sparse,
+ .cluster_sectors = cluster_sectors,
+ .buf_sectors = bufsectors,
+ };
+ ret = convert_do_copy(&state);
- n1 = n;
- ret = blk_read(blk[bs_i], sector_num - bs_offset, buf, n);
- if (ret < 0) {
- error_report("error while reading sector %" PRId64 ": %s",
- sector_num - bs_offset, strerror(-ret));
- goto out;
- }
- /* NOTE: at the same time we convert, we do not write zero
- sectors to have a chance to compress the image. Ideally, we
- should add a specific call to have the info to go faster */
- buf1 = buf;
- while (n > 0) {
- if (!has_zero_init ||
- is_allocated_sectors_min(buf1, n, &n1, min_sparse)) {
- ret = blk_write(out_blk, sector_num, buf1, n1);
- if (ret < 0) {
- error_report("error while writing sector %" PRId64
- ": %s", sector_num, strerror(-ret));
- goto out;
- }
- }
- sector_num += n1;
- n -= n1;
- buf1 += n1 * 512;
- }
- qemu_progress_print(100.0 * sectors_read / sectors_to_read, 0);
- }
- }
out:
if (!ret) {
qemu_progress_print(100, 0);
@@ -1870,7 +1975,6 @@ out:
qemu_progress_end();
qemu_opts_del(opts);
qemu_opts_free(create_opts);
- qemu_vfree(buf);
qemu_opts_del(sn_opts);
blk_unref(out_blk);
g_free(bs);
diff --git a/qmp-commands.hx b/qmp-commands.hx
index 1e59541665..213508fe5d 100644
--- a/qmp-commands.hx
+++ b/qmp-commands.hx
@@ -1007,6 +1007,43 @@ EQMP
.mhandler.cmd_new = qmp_marshal_input_block_stream,
},
+SQMP
+block-stream
+------------
+
+Copy data from a backing file into a block device.
+
+Arguments:
+
+- "device": The device's ID, must be unique (json-string)
+- "base": The file name of the backing image above which copying starts
+ (json-string, optional)
+- "backing-file": The backing file string to write into the active layer. This
+ filename is not validated.
+
+ If a pathname string is such that it cannot be resolved by
+ QEMU, that means that subsequent QMP or HMP commands must use
+ node-names for the image in question, as filename lookup
+ methods will fail.
+
+ If not specified, QEMU will automatically determine the
+ backing file string to use, or error out if there is no
+ obvious choice. Care should be taken when specifying the
+ string, to specify a valid filename or protocol.
+ (json-string, optional) (Since 2.1)
+- "speed": the maximum speed, in bytes per second (json-int, optional)
+- "on-error": the action to take on an error (default 'report'). 'stop' and
+ 'enospc' can only be used if the block device supports io-status.
+ (json-string, optional) (Since 2.1)
+
+Example:
+
+-> { "execute": "block-stream", "arguments": { "device": "virtio0",
+ "base": "/tmp/master.qcow2" } }
+<- { "return": {} }
+
+EQMP
+
{
.name = "block-commit",
.args_type = "device:B,base:s?,top:s?,backing-file:s?,speed:o?",
@@ -1073,7 +1110,7 @@ EQMP
{
.name = "drive-backup",
.args_type = "sync:s,device:B,target:s,speed:i?,mode:s?,format:s?,"
- "on-source-error:s?,on-target-error:s?",
+ "bitmap:s?,on-source-error:s?,on-target-error:s?",
.mhandler.cmd_new = qmp_marshal_input_drive_backup,
},
@@ -1100,8 +1137,10 @@ Arguments:
(json-string, optional)
- "sync": what parts of the disk image should be copied to the destination;
possibilities include "full" for all the disk, "top" for only the sectors
- allocated in the topmost image, or "none" to only replicate new I/O
- (MirrorSyncMode).
+ allocated in the topmost image, "dirty-bitmap" for only the dirty sectors in
+ the bitmap, or "none" to only replicate new I/O (MirrorSyncMode).
+- "bitmap": dirty bitmap name for sync==dirty-bitmap. Must be present if sync
+ is "dirty-bitmap", must NOT be present otherwise.
- "mode": whether and how QEMU should create a new image
(NewImageMode, optional, default 'absolute-paths')
- "speed": the maximum speed, in bytes per second (json-int, optional)
@@ -1269,6 +1308,91 @@ Example:
EQMP
{
+ .name = "block-dirty-bitmap-add",
+ .args_type = "node:B,name:s,granularity:i?",
+ .mhandler.cmd_new = qmp_marshal_input_block_dirty_bitmap_add,
+ },
+
+SQMP
+
+block-dirty-bitmap-add
+----------------------
+Since 2.4
+
+Create a dirty bitmap with a name on the device, and start tracking the writes.
+
+Arguments:
+
+- "node": device/node on which to create dirty bitmap (json-string)
+- "name": name of the new dirty bitmap (json-string)
+- "granularity": granularity to track writes with (int, optional)
+
+Example:
+
+-> { "execute": "block-dirty-bitmap-add", "arguments": { "node": "drive0",
+ "name": "bitmap0" } }
+<- { "return": {} }
+
+EQMP
+
+ {
+ .name = "block-dirty-bitmap-remove",
+ .args_type = "node:B,name:s",
+ .mhandler.cmd_new = qmp_marshal_input_block_dirty_bitmap_remove,
+ },
+
+SQMP
+
+block-dirty-bitmap-remove
+-------------------------
+Since 2.4
+
+Stop write tracking and remove the dirty bitmap that was created with
+block-dirty-bitmap-add.
+
+Arguments:
+
+- "node": device/node on which to remove dirty bitmap (json-string)
+- "name": name of the dirty bitmap to remove (json-string)
+
+Example:
+
+-> { "execute": "block-dirty-bitmap-remove", "arguments": { "node": "drive0",
+ "name": "bitmap0" } }
+<- { "return": {} }
+
+EQMP
+
+ {
+ .name = "block-dirty-bitmap-clear",
+ .args_type = "node:B,name:s",
+ .mhandler.cmd_new = qmp_marshal_input_block_dirty_bitmap_clear,
+ },
+
+SQMP
+
+block-dirty-bitmap-clear
+------------------------
+Since 2.4
+
+Reset the dirty bitmap associated with a node so that an incremental backup
+from this point in time forward will only back up clusters modified after this
+clear operation.
+
+Arguments:
+
+- "node": device/node on which to clear dirty bitmap (json-string)
+- "name": name of the dirty bitmap to clear (json-string)
+
+Example:
+
+-> { "execute": "block-dirty-bitmap-clear", "arguments": { "node": "drive0",
+ "name": "bitmap0" } }
+<- { "return": {} }
+
+EQMP
+
+ {
.name = "blockdev-snapshot-sync",
.args_type = "device:s?,node-name:s?,snapshot-file:s,snapshot-node-name:s?,format:s?,mode:s?",
.mhandler.cmd_new = qmp_marshal_input_blockdev_snapshot_sync,
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 5df61f9aa9..7f0aae977d 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -2911,6 +2911,17 @@ sub process {
if ($rawline =~ /\b(?:Qemu|QEmu)\b/) {
WARN("use QEMU instead of Qemu or QEmu\n" . $herecurr);
}
+
+# check for non-portable ffs() calls that have portable alternatives in QEMU
+ if ($line =~ /\bffs\(/) {
+ ERROR("use ctz32() instead of ffs()\n" . $herecurr);
+ }
+ if ($line =~ /\bffsl\(/) {
+ ERROR("use ctz32() or ctz64() instead of ffsl()\n" . $herecurr);
+ }
+ if ($line =~ /\bffsll\(/) {
+ ERROR("use ctz64() instead of ffsll()\n" . $herecurr);
+ }
}
# If we have no input at all, then there is nothing to report on
diff --git a/scripts/qemu-gdb.py b/scripts/qemu-gdb.py
index 8a0f30534f..6c7f4fbe53 100644
--- a/scripts/qemu-gdb.py
+++ b/scripts/qemu-gdb.py
@@ -22,12 +22,86 @@ def isnull(ptr):
def int128(p):
return long(p['lo']) + (long(p['hi']) << 64)
+def get_fs_base():
+ '''Fetch %fs base value using arch_prctl(ARCH_GET_FS)'''
+ # %rsp - 120 is scratch space according to the SystemV ABI
+ old = gdb.parse_and_eval('*(uint64_t*)($rsp - 120)')
+ gdb.execute('call arch_prctl(0x1003, $rsp - 120)', False, True)
+ fs_base = gdb.parse_and_eval('*(uint64_t*)($rsp - 120)')
+ gdb.execute('set *(uint64_t*)($rsp - 120) = %s' % old, False, True)
+ return fs_base
+
+def get_glibc_pointer_guard():
+ '''Fetch glibc pointer guard value'''
+ fs_base = get_fs_base()
+ return gdb.parse_and_eval('*(uint64_t*)((uint64_t)%s + 0x30)' % fs_base)
+
+def glibc_ptr_demangle(val, pointer_guard):
+ '''Undo effect of glibc's PTR_MANGLE()'''
+ return gdb.parse_and_eval('(((uint64_t)%s >> 0x11) | ((uint64_t)%s << (64 - 0x11))) ^ (uint64_t)%s' % (val, val, pointer_guard))
+
+def bt_jmpbuf(jmpbuf):
+ '''Backtrace a jmpbuf'''
+ JB_RBX = 0
+ JB_RBP = 1
+ JB_R12 = 2
+ JB_R13 = 3
+ JB_R14 = 4
+ JB_R15 = 5
+ JB_RSP = 6
+ JB_PC = 7
+
+ old_rbx = gdb.parse_and_eval('(uint64_t)$rbx')
+ old_rbp = gdb.parse_and_eval('(uint64_t)$rbp')
+ old_rsp = gdb.parse_and_eval('(uint64_t)$rsp')
+ old_r12 = gdb.parse_and_eval('(uint64_t)$r12')
+ old_r13 = gdb.parse_and_eval('(uint64_t)$r13')
+ old_r14 = gdb.parse_and_eval('(uint64_t)$r14')
+ old_r15 = gdb.parse_and_eval('(uint64_t)$r15')
+ old_rip = gdb.parse_and_eval('(uint64_t)$rip')
+
+ pointer_guard = get_glibc_pointer_guard()
+ gdb.execute('set $rbx = %s' % jmpbuf[JB_RBX])
+ gdb.execute('set $rbp = %s' % glibc_ptr_demangle(jmpbuf[JB_RBP], pointer_guard))
+ gdb.execute('set $rsp = %s' % glibc_ptr_demangle(jmpbuf[JB_RSP], pointer_guard))
+ gdb.execute('set $r12 = %s' % jmpbuf[JB_R12])
+ gdb.execute('set $r13 = %s' % jmpbuf[JB_R13])
+ gdb.execute('set $r14 = %s' % jmpbuf[JB_R14])
+ gdb.execute('set $r15 = %s' % jmpbuf[JB_R15])
+ gdb.execute('set $rip = %s' % glibc_ptr_demangle(jmpbuf[JB_PC], pointer_guard))
+
+ gdb.execute('bt')
+
+ gdb.execute('set $rbx = %s' % old_rbx)
+ gdb.execute('set $rbp = %s' % old_rbp)
+ gdb.execute('set $rsp = %s' % old_rsp)
+ gdb.execute('set $r12 = %s' % old_r12)
+ gdb.execute('set $r13 = %s' % old_r13)
+ gdb.execute('set $r14 = %s' % old_r14)
+ gdb.execute('set $r15 = %s' % old_r15)
+ gdb.execute('set $rip = %s' % old_rip)
+
class QemuCommand(gdb.Command):
'''Prefix for QEMU debug support commands'''
def __init__(self):
gdb.Command.__init__(self, 'qemu', gdb.COMMAND_DATA,
gdb.COMPLETE_NONE, True)
+class CoroutineCommand(gdb.Command):
+ '''Display coroutine backtrace'''
+ def __init__(self):
+ gdb.Command.__init__(self, 'qemu coroutine', gdb.COMMAND_DATA,
+ gdb.COMPLETE_NONE)
+
+ def invoke(self, arg, from_tty):
+ argv = gdb.string_to_argv(arg)
+ if len(argv) != 1:
+ gdb.write('usage: qemu coroutine <coroutine-pointer>\n')
+ return
+
+ coroutine_pointer = gdb.parse_and_eval(argv[0]).cast(gdb.lookup_type('CoroutineUContext').pointer())
+ bt_jmpbuf(coroutine_pointer['env']['__jmpbuf'])
+
class MtreeCommand(gdb.Command):
'''Display the memory tree hierarchy'''
def __init__(self):
@@ -86,4 +160,5 @@ class MtreeCommand(gdb.Command):
subregion = subregion['subregions_link']['tqe_next']
QemuCommand()
+CoroutineCommand()
MtreeCommand()
diff --git a/scripts/qmp/qmp.py b/scripts/qmp/qmp.py
index 20b6ec795e..1d38e3e9e7 100644
--- a/scripts/qmp/qmp.py
+++ b/scripts/qmp/qmp.py
@@ -21,6 +21,9 @@ class QMPConnectError(QMPError):
class QMPCapabilitiesError(QMPError):
pass
+class QMPTimeoutError(QMPError):
+ pass
+
class QEMUMonitorProtocol:
def __init__(self, address, server=False):
"""
@@ -72,6 +75,44 @@ class QEMUMonitorProtocol:
error = socket.error
+ def __get_events(self, wait=False):
+ """
+ Check for new events in the stream and cache them in __events.
+
+ @param wait (bool): block until an event is available.
+ @param wait (float): If wait is a float, treat it as a timeout value.
+
+ @raise QMPTimeoutError: If a timeout float is provided and the timeout
+ period elapses.
+ @raise QMPConnectError: If wait is True but no events could be retrieved
+ or if some other error occurred.
+ """
+
+ # Check for new events regardless and pull them into the cache:
+ self.__sock.setblocking(0)
+ try:
+ self.__json_read()
+ except socket.error, err:
+ if err[0] == errno.EAGAIN:
+ # No data available
+ pass
+ self.__sock.setblocking(1)
+
+ # Wait for new events, if needed.
+ # if wait is 0.0, this means "no wait" and is also implicitly false.
+ if not self.__events and wait:
+ if isinstance(wait, float):
+ self.__sock.settimeout(wait)
+ try:
+ ret = self.__json_read(only_event=True)
+ except socket.timeout:
+ raise QMPTimeoutError("Timeout waiting for event")
+ except:
+ raise QMPConnectError("Error while reading from socket")
+ if ret is None:
+ raise QMPConnectError("Error while reading from socket")
+ self.__sock.settimeout(None)
+
def connect(self, negotiate=True):
"""
Connect to the QMP Monitor and perform capabilities negotiation.
@@ -140,43 +181,37 @@ class QEMUMonitorProtocol:
"""
Get and delete the first available QMP event.
- @param wait: block until an event is available (bool)
+ @param wait (bool): block until an event is available.
+ @param wait (float): If wait is a float, treat it as a timeout value.
+
+ @raise QMPTimeoutError: If a timeout float is provided and the timeout
+ period elapses.
+ @raise QMPConnectError: If wait is True but no events could be retrieved
+ or if some other error occurred.
+
+ @return The first available QMP event, or None.
"""
- self.__sock.setblocking(0)
- try:
- self.__json_read()
- except socket.error, err:
- if err[0] == errno.EAGAIN:
- # No data available
- pass
- self.__sock.setblocking(1)
- if not self.__events and wait:
- self.__json_read(only_event=True)
- event = self.__events[0]
- del self.__events[0]
- return event
+ self.__get_events(wait)
+
+ if self.__events:
+ return self.__events.pop(0)
+ return None
def get_events(self, wait=False):
"""
Get a list of available QMP events.
- @param wait: block until an event is available (bool)
- """
- self.__sock.setblocking(0)
- try:
- self.__json_read()
- except socket.error, err:
- if err[0] == errno.EAGAIN:
- # No data available
- pass
- self.__sock.setblocking(1)
- if not self.__events and wait:
- ret = self.__json_read(only_event=True)
- if ret == None:
- # We are in blocking mode, if don't get anything, something
- # went wrong
- raise QMPConnectError("Error while reading from socket")
+ @param wait (bool): block until an event is available.
+ @param wait (float): If wait is a float, treat it as a timeout value.
+ @raise QMPTimeoutError: If a timeout float is provided and the timeout
+ period elapses.
+ @raise QMPConnectError: If wait is True but no events could be retrieved
+ or if some other error occurred.
+
+ @return The list of available QMP events.
+ """
+ self.__get_events(wait)
return self.__events
def clear_events(self):
diff --git a/target-ppc/cpu.h b/target-ppc/cpu.h
index f15815f11b..c05c503305 100644
--- a/target-ppc/cpu.h
+++ b/target-ppc/cpu.h
@@ -2251,8 +2251,8 @@ static inline ppcmas_tlb_t *booke206_get_tlbm(CPUPPCState *env, const int tlbn,
{
int r;
uint32_t ways = booke206_tlb_ways(env, tlbn);
- int ways_bits = ffs(ways) - 1;
- int tlb_bits = ffs(booke206_tlb_size(env, tlbn)) - 1;
+ int ways_bits = ctz32(ways);
+ int tlb_bits = ctz32(booke206_tlb_size(env, tlbn));
int i;
way &= ways - 1;
diff --git a/tests/qemu-iotests/122 b/tests/qemu-iotests/122
new file mode 100755
index 0000000000..350ca9c466
--- /dev/null
+++ b/tests/qemu-iotests/122
@@ -0,0 +1,223 @@
+#!/bin/bash
+#
+# Test some qemu-img convert cases
+#
+# Copyright (C) 2015 Red Hat, Inc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+# creator
+owner=kwolf@redhat.com
+
+seq="$(basename $0)"
+echo "QA output created by $seq"
+
+here="$PWD"
+tmp=/tmp/$$
+status=1 # failure is the default!
+
+_cleanup()
+{
+ rm -f "$TEST_IMG".[123]
+ _cleanup_test_img
+}
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+# get standard environment, filters and checks
+. ./common.rc
+. ./common.filter
+
+_supported_fmt qcow2
+_supported_proto file
+_supported_os Linux
+
+
+TEST_IMG="$TEST_IMG".base _make_test_img 64M
+$QEMU_IO -c "write -P 0x11 0 64M" "$TEST_IMG".base 2>&1 | _filter_qemu_io | _filter_testdir
+
+
+echo
+echo "=== Check allocation status regression with -B ==="
+echo
+
+_make_test_img -b "$TEST_IMG".base
+$QEMU_IO -c "write -P 0x22 0 3M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
+$QEMU_IMG convert -O $IMGFMT -B "$TEST_IMG".base "$TEST_IMG" "$TEST_IMG".orig
+$QEMU_IMG map "$TEST_IMG".orig | _filter_qemu_img_map
+
+
+echo
+echo "=== Check that zero clusters are kept in overlay ==="
+echo
+
+_make_test_img -b "$TEST_IMG".base
+
+$QEMU_IO -c "write -P 0 0 3M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
+$QEMU_IMG convert -O $IMGFMT -B "$TEST_IMG".base "$TEST_IMG" "$TEST_IMG".orig
+$QEMU_IO -c "read -P 0 0 3M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir
+$QEMU_IMG convert -O $IMGFMT -c -B "$TEST_IMG".base "$TEST_IMG" "$TEST_IMG".orig
+$QEMU_IO -c "read -P 0 0 3M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir
+
+$QEMU_IO -c "write -z 0 3M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
+$QEMU_IMG convert -O $IMGFMT -B "$TEST_IMG".base "$TEST_IMG" "$TEST_IMG".orig
+$QEMU_IO -c "read -P 0 0 3M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir
+$QEMU_IMG convert -O $IMGFMT -c -B "$TEST_IMG".base "$TEST_IMG" "$TEST_IMG".orig
+$QEMU_IO -c "read -P 0 0 3M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir
+
+
+echo
+echo "=== Concatenate multiple source images ==="
+echo
+
+TEST_IMG="$TEST_IMG".1 _make_test_img 4M
+TEST_IMG="$TEST_IMG".2 _make_test_img 4M
+TEST_IMG="$TEST_IMG".3 _make_test_img 4M
+
+$QEMU_IO -c "write -P 0x11 0 64k" "$TEST_IMG".1 2>&1 | _filter_qemu_io | _filter_testdir
+$QEMU_IO -c "write -P 0x22 0 64k" "$TEST_IMG".2 2>&1 | _filter_qemu_io | _filter_testdir
+$QEMU_IO -c "write -P 0x33 0 64k" "$TEST_IMG".3 2>&1 | _filter_qemu_io | _filter_testdir
+
+$QEMU_IMG convert -O $IMGFMT "$TEST_IMG".[123] "$TEST_IMG"
+$QEMU_IMG map "$TEST_IMG" | _filter_qemu_img_map
+$QEMU_IO -c "read -P 0x11 0 64k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
+$QEMU_IO -c "read -P 0x22 4M 64k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
+$QEMU_IO -c "read -P 0x33 8M 64k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
+
+$QEMU_IMG convert -c -O $IMGFMT "$TEST_IMG".[123] "$TEST_IMG"
+$QEMU_IMG map --output=json "$TEST_IMG" | _filter_qemu_img_map
+$QEMU_IO -c "read -P 0x11 0 64k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
+$QEMU_IO -c "read -P 0x22 4M 64k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
+$QEMU_IO -c "read -P 0x33 8M 64k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
+
+# -B can't be combined with concatenation
+$QEMU_IMG convert -O $IMGFMT -B "$TEST_IMG".base "$TEST_IMG".[123] "$TEST_IMG"
+$QEMU_IMG convert -O $IMGFMT -c -B "$TEST_IMG".base "$TEST_IMG".[123] "$TEST_IMG"
+
+
+echo
+echo "=== Compression with misaligned allocations and image sizes ==="
+echo
+
+TEST_IMG="$TEST_IMG".1 _make_test_img 1023k -o cluster_size=1024
+TEST_IMG="$TEST_IMG".2 _make_test_img 1023k -o cluster_size=1024
+
+$QEMU_IO -c "write -P 0x11 16k 16k" "$TEST_IMG".1 2>&1 | _filter_qemu_io | _filter_testdir
+$QEMU_IO -c "write -P 0x22 130k 130k" "$TEST_IMG".1 2>&1 | _filter_qemu_io | _filter_testdir
+$QEMU_IO -c "write -P 0x33 1022k 1k" "$TEST_IMG".1 2>&1 | _filter_qemu_io | _filter_testdir
+$QEMU_IO -c "write -P 0x44 0k 1k" "$TEST_IMG".2 2>&1 | _filter_qemu_io | _filter_testdir
+
+$QEMU_IMG convert -c -O $IMGFMT "$TEST_IMG".[12] "$TEST_IMG"
+$QEMU_IMG map --output=json "$TEST_IMG" | _filter_qemu_img_map
+$QEMU_IO -c "read -P 0 0k 16k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
+$QEMU_IO -c "read -P 0x11 16k 16k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
+$QEMU_IO -c "read -P 0 32k 98k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
+$QEMU_IO -c "read -P 0x22 130k 130k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
+$QEMU_IO -c "read -P 0 260k 762k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
+$QEMU_IO -c "read -P 0x33 1022k 1k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
+$QEMU_IO -c "read -P 0x44 1023k 1k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
+$QEMU_IO -c "read -P 0 1024k 1022k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
+
+
+echo
+echo "=== Full allocation with -S 0 ==="
+echo
+
+# Standalone image
+_make_test_img 64M
+$QEMU_IO -c "write -P 0x22 0 3M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
+$QEMU_IO -c "write -P 0 3M 3M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
+
+echo
+echo convert -S 0:
+$QEMU_IMG convert -O $IMGFMT -S 0 "$TEST_IMG" "$TEST_IMG".orig
+$QEMU_IO -c "read -P 0x22 0 3M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir
+$QEMU_IO -c "read -P 0 3M 61M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir
+$QEMU_IMG map --output=json "$TEST_IMG".orig | _filter_qemu_img_map
+
+echo
+echo convert -c -S 0:
+$QEMU_IMG convert -O $IMGFMT -c -S 0 "$TEST_IMG" "$TEST_IMG".orig
+$QEMU_IO -c "read -P 0x22 0 3M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir
+$QEMU_IO -c "read -P 0 3M 61M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir
+$QEMU_IMG map --output=json "$TEST_IMG".orig | _filter_qemu_img_map
+
+# With backing file
+TEST_IMG="$TEST_IMG".base _make_test_img 64M
+$QEMU_IO -c "write -P 0x11 0 32M" "$TEST_IMG".base 2>&1 | _filter_qemu_io | _filter_testdir
+
+_make_test_img -b "$TEST_IMG".base 64M
+$QEMU_IO -c "write -P 0x22 0 3M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
+
+echo
+echo convert -S 0 with source backing file:
+$QEMU_IMG convert -O $IMGFMT -S 0 "$TEST_IMG" "$TEST_IMG".orig
+$QEMU_IO -c "read -P 0x22 0 3M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir
+$QEMU_IO -c "read -P 0x11 3M 29M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir
+$QEMU_IO -c "read -P 0 32M 32M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir
+$QEMU_IMG map --output=json "$TEST_IMG".orig | _filter_qemu_img_map
+
+echo
+echo convert -c -S 0 with source backing file:
+$QEMU_IMG convert -O $IMGFMT -c -S 0 "$TEST_IMG" "$TEST_IMG".orig
+$QEMU_IO -c "read -P 0x22 0 3M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir
+$QEMU_IO -c "read -P 0x11 3M 29M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir
+$QEMU_IO -c "read -P 0 32M 32M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir
+$QEMU_IMG map --output=json "$TEST_IMG".orig | _filter_qemu_img_map
+
+# With keeping the backing file
+echo
+echo convert -S 0 -B ...
+$QEMU_IMG convert -O $IMGFMT -S 0 "$TEST_IMG" "$TEST_IMG".orig
+$QEMU_IO -c "read -P 0x22 0 3M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir
+$QEMU_IO -c "read -P 0x11 3M 29M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir
+$QEMU_IO -c "read -P 0 32M 32M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir
+$QEMU_IMG map --output=json "$TEST_IMG".orig | _filter_qemu_img_map
+
+echo
+echo convert -c -S 0 -B ...
+$QEMU_IMG convert -O $IMGFMT -c -S 0 "$TEST_IMG" "$TEST_IMG".orig
+$QEMU_IO -c "read -P 0x22 0 3M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir
+$QEMU_IO -c "read -P 0x11 3M 29M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir
+$QEMU_IO -c "read -P 0 32M 32M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir
+$QEMU_IMG map --output=json "$TEST_IMG".orig | _filter_qemu_img_map
+
+
+echo
+echo "=== Non-zero -S ==="
+echo
+
+_make_test_img 64M -o cluster_size=1k
+$QEMU_IO -c "write -P 0 0 64k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
+$QEMU_IO -c "write 0 1k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
+$QEMU_IO -c "write 8k 1k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
+$QEMU_IO -c "write 17k 1k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
+
+for min_sparse in 4k 8k; do
+ echo
+ echo convert -S $min_sparse
+ $QEMU_IMG convert -O $IMGFMT -o cluster_size=1k -S $min_sparse "$TEST_IMG" "$TEST_IMG".orig
+ $QEMU_IMG map --output=json "$TEST_IMG".orig | _filter_qemu_img_map
+
+ echo
+ echo convert -c -S $min_sparse
+ # For compressed images, -S values other than 0 are ignored
+ $QEMU_IMG convert -O $IMGFMT -o cluster_size=1k -c -S $min_sparse "$TEST_IMG" "$TEST_IMG".orig
+ $QEMU_IMG map --output=json "$TEST_IMG".orig | _filter_qemu_img_map
+done
+
+# success, all done
+echo '*** done'
+rm -f $seq.full
+status=0
diff --git a/tests/qemu-iotests/122.out b/tests/qemu-iotests/122.out
new file mode 100644
index 0000000000..1f853b9e93
--- /dev/null
+++ b/tests/qemu-iotests/122.out
@@ -0,0 +1,209 @@
+QA output created by 122
+Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=67108864
+wrote 67108864/67108864 bytes at offset 0
+64 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+=== Check allocation status regression with -B ===
+
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 backing_file='TEST_DIR/t.IMGFMT.base'
+wrote 3145728/3145728 bytes at offset 0
+3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+Offset Length File
+0 0x300000 TEST_DIR/t.IMGFMT.orig
+0x300000 0x3d00000 TEST_DIR/t.IMGFMT.base
+
+=== Check that zero clusters are kept in overlay ===
+
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 backing_file='TEST_DIR/t.IMGFMT.base'
+wrote 3145728/3145728 bytes at offset 0
+3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 3145728/3145728 bytes at offset 0
+3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 3145728/3145728 bytes at offset 0
+3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 3145728/3145728 bytes at offset 0
+3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 3145728/3145728 bytes at offset 0
+3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 3145728/3145728 bytes at offset 0
+3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+=== Concatenate multiple source images ===
+
+Formatting 'TEST_DIR/t.IMGFMT.1', fmt=IMGFMT size=4194304
+Formatting 'TEST_DIR/t.IMGFMT.2', fmt=IMGFMT size=4194304
+Formatting 'TEST_DIR/t.IMGFMT.3', fmt=IMGFMT size=4194304
+wrote 65536/65536 bytes at offset 0
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset 0
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset 0
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+Offset Length File
+0 0x10000 TEST_DIR/t.IMGFMT
+0x400000 0x10000 TEST_DIR/t.IMGFMT
+0x800000 0x10000 TEST_DIR/t.IMGFMT
+read 65536/65536 bytes at offset 0
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 65536/65536 bytes at offset 4194304
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 65536/65536 bytes at offset 8388608
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+[{ "start": 0, "length": 65536, "depth": 0, "zero": false, "data": true},
+{ "start": 65536, "length": 4128768, "depth": 0, "zero": true, "data": false},
+{ "start": 4194304, "length": 65536, "depth": 0, "zero": false, "data": true},
+{ "start": 4259840, "length": 4128768, "depth": 0, "zero": true, "data": false},
+{ "start": 8388608, "length": 65536, "depth": 0, "zero": false, "data": true},
+{ "start": 8454144, "length": 4128768, "depth": 0, "zero": true, "data": false}]
+read 65536/65536 bytes at offset 0
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 65536/65536 bytes at offset 4194304
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 65536/65536 bytes at offset 8388608
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+qemu-img: -B makes no sense when concatenating multiple input images
+qemu-img: -B makes no sense when concatenating multiple input images
+
+=== Compression with misaligned allocations and image sizes ===
+
+Formatting 'TEST_DIR/t.IMGFMT.1', fmt=IMGFMT size=1047552
+Formatting 'TEST_DIR/t.IMGFMT.2', fmt=IMGFMT size=1047552
+wrote 16384/16384 bytes at offset 16384
+16 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 133120/133120 bytes at offset 133120
+130 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 1024/1024 bytes at offset 1046528
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 1024/1024 bytes at offset 0
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+[{ "start": 0, "length": 65536, "depth": 0, "zero": false, "data": true},
+{ "start": 65536, "length": 65536, "depth": 0, "zero": true, "data": false},
+{ "start": 131072, "length": 196608, "depth": 0, "zero": false, "data": true},
+{ "start": 327680, "length": 655360, "depth": 0, "zero": true, "data": false},
+{ "start": 983040, "length": 65536, "depth": 0, "zero": false, "data": true},
+{ "start": 1048576, "length": 1046528, "depth": 0, "zero": true, "data": false}]
+read 16384/16384 bytes at offset 0
+16 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 16384/16384 bytes at offset 16384
+16 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 100352/100352 bytes at offset 32768
+98 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 133120/133120 bytes at offset 133120
+130 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 780288/780288 bytes at offset 266240
+762 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 1024/1024 bytes at offset 1046528
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 1024/1024 bytes at offset 1047552
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 1046528/1046528 bytes at offset 1048576
+1022 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+=== Full allocation with -S 0 ===
+
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864
+wrote 3145728/3145728 bytes at offset 0
+3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 3145728/3145728 bytes at offset 3145728
+3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+convert -S 0:
+read 3145728/3145728 bytes at offset 0
+3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 63963136/63963136 bytes at offset 3145728
+61 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+[{ "start": 0, "length": 6291456, "depth": 0, "zero": false, "data": true, "offset": 327680},
+{ "start": 6291456, "length": 60817408, "depth": 0, "zero": true, "data": false}]
+
+convert -c -S 0:
+read 3145728/3145728 bytes at offset 0
+3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 63963136/63963136 bytes at offset 3145728
+61 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+[{ "start": 0, "length": 6291456, "depth": 0, "zero": false, "data": true},
+{ "start": 6291456, "length": 60817408, "depth": 0, "zero": true, "data": false}]
+Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=67108864
+wrote 33554432/33554432 bytes at offset 0
+32 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 backing_file='TEST_DIR/t.IMGFMT.base'
+wrote 3145728/3145728 bytes at offset 0
+3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+convert -S 0 with source backing file:
+read 3145728/3145728 bytes at offset 0
+3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 30408704/30408704 bytes at offset 3145728
+29 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 33554432/33554432 bytes at offset 33554432
+32 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+[{ "start": 0, "length": 67108864, "depth": 0, "zero": false, "data": true, "offset": 327680}]
+
+convert -c -S 0 with source backing file:
+read 3145728/3145728 bytes at offset 0
+3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 30408704/30408704 bytes at offset 3145728
+29 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 33554432/33554432 bytes at offset 33554432
+32 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+[{ "start": 0, "length": 67108864, "depth": 0, "zero": false, "data": true}]
+
+convert -S 0 -B ...
+read 3145728/3145728 bytes at offset 0
+3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 30408704/30408704 bytes at offset 3145728
+29 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 33554432/33554432 bytes at offset 33554432
+32 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+[{ "start": 0, "length": 67108864, "depth": 0, "zero": false, "data": true, "offset": 327680}]
+
+convert -c -S 0 -B ...
+read 3145728/3145728 bytes at offset 0
+3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 30408704/30408704 bytes at offset 3145728
+29 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 33554432/33554432 bytes at offset 33554432
+32 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+[{ "start": 0, "length": 67108864, "depth": 0, "zero": false, "data": true}]
+
+=== Non-zero -S ===
+
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864
+wrote 65536/65536 bytes at offset 0
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 1024/1024 bytes at offset 0
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 1024/1024 bytes at offset 8192
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 1024/1024 bytes at offset 17408
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+convert -S 4k
+[{ "start": 0, "length": 1024, "depth": 0, "zero": false, "data": true, "offset": 8192},
+{ "start": 1024, "length": 7168, "depth": 0, "zero": true, "data": false},
+{ "start": 8192, "length": 1024, "depth": 0, "zero": false, "data": true, "offset": 9216},
+{ "start": 9216, "length": 8192, "depth": 0, "zero": true, "data": false},
+{ "start": 17408, "length": 1024, "depth": 0, "zero": false, "data": true, "offset": 10240},
+{ "start": 18432, "length": 67090432, "depth": 0, "zero": true, "data": false}]
+
+convert -c -S 4k
+[{ "start": 0, "length": 1024, "depth": 0, "zero": false, "data": true},
+{ "start": 1024, "length": 7168, "depth": 0, "zero": true, "data": false},
+{ "start": 8192, "length": 1024, "depth": 0, "zero": false, "data": true},
+{ "start": 9216, "length": 8192, "depth": 0, "zero": true, "data": false},
+{ "start": 17408, "length": 1024, "depth": 0, "zero": false, "data": true},
+{ "start": 18432, "length": 67090432, "depth": 0, "zero": true, "data": false}]
+
+convert -S 8k
+[{ "start": 0, "length": 9216, "depth": 0, "zero": false, "data": true, "offset": 8192},
+{ "start": 9216, "length": 8192, "depth": 0, "zero": true, "data": false},
+{ "start": 17408, "length": 1024, "depth": 0, "zero": false, "data": true, "offset": 17408},
+{ "start": 18432, "length": 67090432, "depth": 0, "zero": true, "data": false}]
+
+convert -c -S 8k
+[{ "start": 0, "length": 1024, "depth": 0, "zero": false, "data": true},
+{ "start": 1024, "length": 7168, "depth": 0, "zero": true, "data": false},
+{ "start": 8192, "length": 1024, "depth": 0, "zero": false, "data": true},
+{ "start": 9216, "length": 8192, "depth": 0, "zero": true, "data": false},
+{ "start": 17408, "length": 1024, "depth": 0, "zero": false, "data": true},
+{ "start": 18432, "length": 67090432, "depth": 0, "zero": true, "data": false}]
+*** done
diff --git a/tests/qemu-iotests/124 b/tests/qemu-iotests/124
new file mode 100644
index 0000000000..3ee78cd1f1
--- /dev/null
+++ b/tests/qemu-iotests/124
@@ -0,0 +1,363 @@
+#!/usr/bin/env python
+#
+# Tests for incremental drive-backup
+#
+# Copyright (C) 2015 John Snow for Red Hat, Inc.
+#
+# Based on 056.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+import os
+import iotests
+
+
+def io_write_patterns(img, patterns):
+ for pattern in patterns:
+ iotests.qemu_io('-c', 'write -P%s %s %s' % pattern, img)
+
+
+def try_remove(img):
+ try:
+ os.remove(img)
+ except OSError:
+ pass
+
+
+class Bitmap:
+ def __init__(self, name, drive):
+ self.name = name
+ self.drive = drive
+ self.num = 0
+ self.backups = list()
+
+ def base_target(self):
+ return (self.drive['backup'], None)
+
+ def new_target(self, num=None):
+ if num is None:
+ num = self.num
+ self.num = num + 1
+ base = os.path.join(iotests.test_dir,
+ "%s.%s." % (self.drive['id'], self.name))
+ suff = "%i.%s" % (num, self.drive['fmt'])
+ target = base + "inc" + suff
+ reference = base + "ref" + suff
+ self.backups.append((target, reference))
+ return (target, reference)
+
+ def last_target(self):
+ if self.backups:
+ return self.backups[-1]
+ return self.base_target()
+
+ def del_target(self):
+ for image in self.backups.pop():
+ try_remove(image)
+ self.num -= 1
+
+ def cleanup(self):
+ for backup in self.backups:
+ for image in backup:
+ try_remove(image)
+
+
+class TestIncrementalBackup(iotests.QMPTestCase):
+ def setUp(self):
+ self.bitmaps = list()
+ self.files = list()
+ self.drives = list()
+ self.vm = iotests.VM()
+ self.err_img = os.path.join(iotests.test_dir, 'err.%s' % iotests.imgfmt)
+
+ # Create a base image with a distinctive patterning
+ drive0 = self.add_node('drive0')
+ self.img_create(drive0['file'], drive0['fmt'])
+ self.vm.add_drive(drive0['file'])
+ io_write_patterns(drive0['file'], (('0x41', 0, 512),
+ ('0xd5', '1M', '32k'),
+ ('0xdc', '32M', '124k')))
+ self.vm.launch()
+
+
+ def add_node(self, node_id, fmt=iotests.imgfmt, path=None, backup=None):
+ if path is None:
+ path = os.path.join(iotests.test_dir, '%s.%s' % (node_id, fmt))
+ if backup is None:
+ backup = os.path.join(iotests.test_dir,
+ '%s.full.backup.%s' % (node_id, fmt))
+
+ self.drives.append({
+ 'id': node_id,
+ 'file': path,
+ 'backup': backup,
+ 'fmt': fmt })
+ return self.drives[-1]
+
+
+ def img_create(self, img, fmt=iotests.imgfmt, size='64M',
+ parent=None, parentFormat=None):
+ if parent:
+ if parentFormat is None:
+ parentFormat = fmt
+ iotests.qemu_img('create', '-f', fmt, img, size,
+ '-b', parent, '-F', parentFormat)
+ else:
+ iotests.qemu_img('create', '-f', fmt, img, size)
+ self.files.append(img)
+
+
+ def do_qmp_backup(self, error='Input/output error', **kwargs):
+ res = self.vm.qmp('drive-backup', **kwargs)
+ self.assert_qmp(res, 'return', {})
+
+ event = self.vm.event_wait(name="BLOCK_JOB_COMPLETED",
+ match={'data': {'device': kwargs['device']}})
+ self.assertIsNotNone(event)
+
+ try:
+ failure = self.dictpath(event, 'data/error')
+ except AssertionError:
+ # Backup succeeded.
+ self.assert_qmp(event, 'data/offset', event['data']['len'])
+ return True
+ else:
+ # Backup failed.
+ self.assert_qmp(event, 'data/error', error)
+ return False
+
+
+ def create_anchor_backup(self, drive=None):
+ if drive is None:
+ drive = self.drives[-1]
+ res = self.do_qmp_backup(device=drive['id'], sync='full',
+ format=drive['fmt'], target=drive['backup'])
+ self.assertTrue(res)
+ self.files.append(drive['backup'])
+ return drive['backup']
+
+
+ def make_reference_backup(self, bitmap=None):
+ if bitmap is None:
+ bitmap = self.bitmaps[-1]
+ _, reference = bitmap.last_target()
+ res = self.do_qmp_backup(device=bitmap.drive['id'], sync='full',
+ format=bitmap.drive['fmt'], target=reference)
+ self.assertTrue(res)
+
+
+ def add_bitmap(self, name, drive, **kwargs):
+ bitmap = Bitmap(name, drive)
+ self.bitmaps.append(bitmap)
+ result = self.vm.qmp('block-dirty-bitmap-add', node=drive['id'],
+ name=bitmap.name, **kwargs)
+ self.assert_qmp(result, 'return', {})
+ return bitmap
+
+
+ def prepare_backup(self, bitmap=None, parent=None):
+ if bitmap is None:
+ bitmap = self.bitmaps[-1]
+ if parent is None:
+ parent, _ = bitmap.last_target()
+
+ target, _ = bitmap.new_target()
+ self.img_create(target, bitmap.drive['fmt'], parent=parent)
+ return target
+
+
+ def create_incremental(self, bitmap=None, parent=None,
+ parentFormat=None, validate=True):
+ if bitmap is None:
+ bitmap = self.bitmaps[-1]
+ if parent is None:
+ parent, _ = bitmap.last_target()
+
+ target = self.prepare_backup(bitmap, parent)
+ res = self.do_qmp_backup(device=bitmap.drive['id'],
+ sync='dirty-bitmap', bitmap=bitmap.name,
+ format=bitmap.drive['fmt'], target=target,
+ mode='existing')
+ if not res:
+ bitmap.del_target();
+ self.assertFalse(validate)
+ else:
+ self.make_reference_backup(bitmap)
+ return res
+
+
+ def check_backups(self):
+ for bitmap in self.bitmaps:
+ for incremental, reference in bitmap.backups:
+ self.assertTrue(iotests.compare_images(incremental, reference))
+ last = bitmap.last_target()[0]
+ self.assertTrue(iotests.compare_images(last, bitmap.drive['file']))
+
+
+ def hmp_io_writes(self, drive, patterns):
+ for pattern in patterns:
+ self.vm.hmp_qemu_io(drive, 'write -P%s %s %s' % pattern)
+ self.vm.hmp_qemu_io(drive, 'flush')
+
+
+ def do_incremental_simple(self, **kwargs):
+ self.create_anchor_backup()
+ self.add_bitmap('bitmap0', self.drives[0], **kwargs)
+
+ # Sanity: Create a "hollow" incremental backup
+ self.create_incremental()
+ # Three writes: One complete overwrite, one new segment,
+ # and one partial overlap.
+ self.hmp_io_writes(self.drives[0]['id'], (('0xab', 0, 512),
+ ('0xfe', '16M', '256k'),
+ ('0x64', '32736k', '64k')))
+ self.create_incremental()
+ # Three more writes, one of each kind, like above
+ self.hmp_io_writes(self.drives[0]['id'], (('0x9a', 0, 512),
+ ('0x55', '8M', '352k'),
+ ('0x78', '15872k', '1M')))
+ self.create_incremental()
+ self.vm.shutdown()
+ self.check_backups()
+
+
+ def test_incremental_simple(self):
+ '''
+ Test: Create and verify three incremental backups.
+
+ Create a bitmap and a full backup before VM execution begins,
+ then create a series of three incremental backups "during execution,"
+ i.e.; after IO requests begin modifying the drive.
+ '''
+ return self.do_incremental_simple()
+
+
+ def test_small_granularity(self):
+ '''
+ Test: Create and verify backups made with a small granularity bitmap.
+
+ Perform the same test as test_incremental_simple, but with a granularity
+ of only 32KiB instead of the present default of 64KiB.
+ '''
+ return self.do_incremental_simple(granularity=32768)
+
+
+ def test_large_granularity(self):
+ '''
+ Test: Create and verify backups made with a large granularity bitmap.
+
+ Perform the same test as test_incremental_simple, but with a granularity
+ of 128KiB instead of the present default of 64KiB.
+ '''
+ return self.do_incremental_simple(granularity=131072)
+
+
+ def test_incremental_failure(self):
+ '''Test: Verify backups made after a failure are correct.
+
+ Simulate a failure during an incremental backup block job,
+ emulate additional writes, then create another incremental backup
+ afterwards and verify that the backup created is correct.
+ '''
+
+ # Create a blkdebug interface to this img as 'drive1',
+ # but don't actually create a new image.
+ drive1 = self.add_node('drive1', self.drives[0]['fmt'],
+ path=self.drives[0]['file'],
+ backup=self.drives[0]['backup'])
+ result = self.vm.qmp('blockdev-add', options={
+ 'id': drive1['id'],
+ 'driver': drive1['fmt'],
+ 'file': {
+ 'driver': 'blkdebug',
+ 'image': {
+ 'driver': 'file',
+ 'filename': drive1['file']
+ },
+ 'set-state': [{
+ 'event': 'flush_to_disk',
+ 'state': 1,
+ 'new_state': 2
+ }],
+ 'inject-error': [{
+ 'event': 'read_aio',
+ 'errno': 5,
+ 'state': 2,
+ 'immediately': False,
+ 'once': True
+ }],
+ }
+ })
+ self.assert_qmp(result, 'return', {})
+
+ self.create_anchor_backup(self.drives[0])
+ self.add_bitmap('bitmap0', drive1)
+ # Note: at this point, during a normal execution,
+ # Assume that the VM resumes and begins issuing IO requests here.
+
+ self.hmp_io_writes(drive1['id'], (('0xab', 0, 512),
+ ('0xfe', '16M', '256k'),
+ ('0x64', '32736k', '64k')))
+
+ result = self.create_incremental(validate=False)
+ self.assertFalse(result)
+ self.hmp_io_writes(drive1['id'], (('0x9a', 0, 512),
+ ('0x55', '8M', '352k'),
+ ('0x78', '15872k', '1M')))
+ self.create_incremental()
+ self.vm.shutdown()
+ self.check_backups()
+
+
+ def test_sync_dirty_bitmap_missing(self):
+ self.assert_no_active_block_jobs()
+ self.files.append(self.err_img)
+ result = self.vm.qmp('drive-backup', device=self.drives[0]['id'],
+ sync='dirty-bitmap', format=self.drives[0]['fmt'],
+ target=self.err_img)
+ self.assert_qmp(result, 'error/class', 'GenericError')
+
+
+ def test_sync_dirty_bitmap_not_found(self):
+ self.assert_no_active_block_jobs()
+ self.files.append(self.err_img)
+ result = self.vm.qmp('drive-backup', device=self.drives[0]['id'],
+ sync='dirty-bitmap', bitmap='unknown',
+ format=self.drives[0]['fmt'], target=self.err_img)
+ self.assert_qmp(result, 'error/class', 'GenericError')
+
+
+ def test_sync_dirty_bitmap_bad_granularity(self):
+ '''
+ Test: Test what happens if we provide an improper granularity.
+
+ The granularity must always be a power of 2.
+ '''
+ self.assert_no_active_block_jobs()
+ self.assertRaises(AssertionError, self.add_bitmap,
+ 'bitmap0', self.drives[0],
+ granularity=64000)
+
+
+ def tearDown(self):
+ self.vm.shutdown()
+ for bitmap in self.bitmaps:
+ bitmap.cleanup()
+ for filename in self.files:
+ try_remove(filename)
+
+
+if __name__ == '__main__':
+ iotests.main(supported_fmts=['qcow2'])
diff --git a/tests/qemu-iotests/124.out b/tests/qemu-iotests/124.out
new file mode 100644
index 0000000000..2f7d3902f2
--- /dev/null
+++ b/tests/qemu-iotests/124.out
@@ -0,0 +1,5 @@
+.......
+----------------------------------------------------------------------
+Ran 7 tests
+
+OK
diff --git a/tests/qemu-iotests/129 b/tests/qemu-iotests/129
new file mode 100644
index 0000000000..9e87e1c8d9
--- /dev/null
+++ b/tests/qemu-iotests/129
@@ -0,0 +1,86 @@
+#!/usr/bin/env python
+#
+# Tests that "bdrv_drain_all" doesn't drain block jobs
+#
+# Copyright (C) 2015 Red Hat, Inc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+import os
+import iotests
+import time
+
+class TestStopWithBlockJob(iotests.QMPTestCase):
+ test_img = os.path.join(iotests.test_dir, 'test.img')
+ target_img = os.path.join(iotests.test_dir, 'target.img')
+ base_img = os.path.join(iotests.test_dir, 'base.img')
+
+ def setUp(self):
+ iotests.qemu_img('create', '-f', iotests.imgfmt, self.base_img, "1G")
+ iotests.qemu_img('create', '-f', iotests.imgfmt, self.test_img, "-b", self.base_img)
+ iotests.qemu_io('-f', iotests.imgfmt, '-c', 'write -P0x5d 1M 128M', self.test_img)
+ self.vm = iotests.VM().add_drive(self.test_img)
+ self.vm.launch()
+
+ def tearDown(self):
+ params = {"device": "drive0",
+ "bps": 0,
+ "bps_rd": 0,
+ "bps_wr": 0,
+ "iops": 0,
+ "iops_rd": 0,
+ "iops_wr": 0,
+ }
+ result = self.vm.qmp("block_set_io_throttle", conv_keys=False,
+ **params)
+ self.vm.shutdown()
+
+ def do_test_stop(self, cmd, **args):
+ """Test 'stop' while block job is running on a throttled drive.
+ The 'stop' command shouldn't drain the job"""
+ params = {"device": "drive0",
+ "bps": 1024,
+ "bps_rd": 0,
+ "bps_wr": 0,
+ "iops": 0,
+ "iops_rd": 0,
+ "iops_wr": 0,
+ }
+ result = self.vm.qmp("block_set_io_throttle", conv_keys=False,
+ **params)
+ self.assert_qmp(result, 'return', {})
+ result = self.vm.qmp(cmd, **args)
+ self.assert_qmp(result, 'return', {})
+ result = self.vm.qmp("stop")
+ self.assert_qmp(result, 'return', {})
+ result = self.vm.qmp("query-block-jobs")
+ self.assert_qmp(result, 'return[0]/busy', True)
+ self.assert_qmp(result, 'return[0]/ready', False)
+
+ def test_drive_mirror(self):
+ self.do_test_stop("drive-mirror", device="drive0",
+ target=self.target_img,
+ sync="full")
+
+ def test_drive_backup(self):
+ self.do_test_stop("drive-backup", device="drive0",
+ target=self.target_img,
+ sync="full")
+
+ def test_block_commit(self):
+ self.do_test_stop("block-commit", device="drive0")
+
+if __name__ == '__main__':
+ iotests.main(supported_fmts=["qcow2"])
diff --git a/tests/qemu-iotests/129.out b/tests/qemu-iotests/129.out
new file mode 100644
index 0000000000..8d7e996700
--- /dev/null
+++ b/tests/qemu-iotests/129.out
@@ -0,0 +1,5 @@
+...
+----------------------------------------------------------------------
+Ran 3 tests
+
+OK
diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group
index bcf25786ab..6ca3466ec5 100644
--- a/tests/qemu-iotests/group
+++ b/tests/qemu-iotests/group
@@ -122,6 +122,9 @@
115 rw auto
116 rw auto quick
121 rw auto
+122 rw auto
123 rw auto quick
+124 rw auto backing
128 rw auto quick
+129 rw auto quick
130 rw auto quick
diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py
index 14028540b3..e93e62387b 100644
--- a/tests/qemu-iotests/iotests.py
+++ b/tests/qemu-iotests/iotests.py
@@ -78,6 +78,23 @@ def create_image(name, size):
i = i + 512
file.close()
+# Test if 'match' is a recursive subset of 'event'
+def event_match(event, match=None):
+ if match is None:
+ return True
+
+ for key in match:
+ if key in event:
+ if isinstance(event[key], dict):
+ if not event_match(event[key], match[key]):
+ return False
+ elif event[key] != match[key]:
+ return False
+ else:
+ return False
+
+ return True
+
class VM(object):
'''A QEMU VM'''
@@ -92,6 +109,7 @@ class VM(object):
'-machine', 'accel=qtest',
'-display', 'none', '-vga', 'none']
self._num_drives = 0
+ self._events = []
# This can be used to add an unused monitor instance.
def add_monitor_telnet(self, ip, port):
@@ -202,14 +220,34 @@ class VM(object):
def get_qmp_event(self, wait=False):
'''Poll for one queued QMP events and return it'''
+ if len(self._events) > 0:
+ return self._events.pop(0)
return self._qmp.pull_event(wait=wait)
def get_qmp_events(self, wait=False):
'''Poll for queued QMP events and return a list of dicts'''
events = self._qmp.get_events(wait=wait)
+ events.extend(self._events)
+ del self._events[:]
self._qmp.clear_events()
return events
+ def event_wait(self, name='BLOCK_JOB_COMPLETED', timeout=60.0, match=None):
+ # Search cached events
+ for event in self._events:
+ if (event['event'] == name) and event_match(event, match):
+ self._events.remove(event)
+ return event
+
+ # Poll for new events
+ while True:
+ event = self._qmp.pull_event(wait=timeout)
+ if (event['event'] == name) and event_match(event, match):
+ return event
+ self._events.append(event)
+
+ return None
+
index_re = re.compile(r'([^\[]+)\[([^\]]+)\]')
class QMPTestCase(unittest.TestCase):
diff --git a/tests/test-aio.c b/tests/test-aio.c
index a7cb5c9915..4b0cb45d31 100644
--- a/tests/test-aio.c
+++ b/tests/test-aio.c
@@ -107,6 +107,7 @@ static void test_notify(void)
typedef struct {
QemuMutex start_lock;
+ EventNotifier notifier;
bool thread_acquired;
} AcquireTestData;
@@ -118,6 +119,8 @@ static void *test_acquire_thread(void *opaque)
qemu_mutex_lock(&data->start_lock);
qemu_mutex_unlock(&data->start_lock);
+ g_usleep(500000);
+ event_notifier_set(&data->notifier);
aio_context_acquire(ctx);
aio_context_release(ctx);
@@ -126,20 +129,19 @@ static void *test_acquire_thread(void *opaque)
return NULL;
}
-static void dummy_notifier_read(EventNotifier *unused)
+static void dummy_notifier_read(EventNotifier *n)
{
- g_assert(false); /* should never be invoked */
+ event_notifier_test_and_clear(n);
}
static void test_acquire(void)
{
QemuThread thread;
- EventNotifier notifier;
AcquireTestData data;
/* Dummy event notifier ensures aio_poll() will block */
- event_notifier_init(&notifier, false);
- aio_set_event_notifier(ctx, &notifier, dummy_notifier_read);
+ event_notifier_init(&data.notifier, false);
+ aio_set_event_notifier(ctx, &data.notifier, dummy_notifier_read);
g_assert(!aio_poll(ctx, false)); /* consume aio_notify() */
qemu_mutex_init(&data.start_lock);
@@ -153,12 +155,13 @@ static void test_acquire(void)
/* Block in aio_poll(), let other thread kick us and acquire context */
aio_context_acquire(ctx);
qemu_mutex_unlock(&data.start_lock); /* let the thread run */
- g_assert(!aio_poll(ctx, true));
+ g_assert(aio_poll(ctx, true));
+ g_assert(!data.thread_acquired);
aio_context_release(ctx);
qemu_thread_join(&thread);
- aio_set_event_notifier(ctx, &notifier, NULL);
- event_notifier_cleanup(&notifier);
+ aio_set_event_notifier(ctx, &data.notifier, NULL);
+ event_notifier_cleanup(&data.notifier);
g_assert(data.thread_acquired);
}
diff --git a/tests/test-hbitmap.c b/tests/test-hbitmap.c
index 8c902f2055..9f41b5fd2e 100644
--- a/tests/test-hbitmap.c
+++ b/tests/test-hbitmap.c
@@ -11,6 +11,8 @@
#include <glib.h>
#include <stdarg.h>
+#include <string.h>
+#include <sys/types.h>
#include "qemu/hbitmap.h"
#define LOG_BITS_PER_LONG (BITS_PER_LONG == 32 ? 5 : 6)
@@ -23,6 +25,7 @@ typedef struct TestHBitmapData {
HBitmap *hb;
unsigned long *bits;
size_t size;
+ size_t old_size;
int granularity;
} TestHBitmapData;
@@ -91,6 +94,44 @@ static void hbitmap_test_init(TestHBitmapData *data,
}
}
+static inline size_t hbitmap_test_array_size(size_t bits)
+{
+ size_t n = (bits + BITS_PER_LONG - 1) / BITS_PER_LONG;
+ return n ? n : 1;
+}
+
+static void hbitmap_test_truncate_impl(TestHBitmapData *data,
+ size_t size)
+{
+ size_t n;
+ size_t m;
+ data->old_size = data->size;
+ data->size = size;
+
+ if (data->size == data->old_size) {
+ return;
+ }
+
+ n = hbitmap_test_array_size(size);
+ m = hbitmap_test_array_size(data->old_size);
+ data->bits = g_realloc(data->bits, sizeof(unsigned long) * n);
+ if (n > m) {
+ memset(&data->bits[m], 0x00, sizeof(unsigned long) * (n - m));
+ }
+
+ /* If we shrink to a size that is not a multiple of the number of bits
+ * in an unsigned long, scrub the leftover bits of the last word. */
+ if (data->size < data->old_size) {
+ m = size % (sizeof(unsigned long) * 8);
+ if (m) {
+ unsigned long mask = (1ULL << m) - 1;
+ data->bits[n-1] &= mask;
+ }
+ }
+
+ hbitmap_truncate(data->hb, size);
+}
+
static void hbitmap_test_teardown(TestHBitmapData *data,
const void *unused)
{
@@ -369,6 +410,198 @@ static void test_hbitmap_iter_granularity(TestHBitmapData *data,
g_assert_cmpint(hbitmap_iter_next(&hbi), <, 0);
}
+static void hbitmap_test_set_boundary_bits(TestHBitmapData *data, ssize_t diff)
+{
+ size_t size = data->size;
+
+ /* First bit */
+ hbitmap_test_set(data, 0, 1);
+ if (diff < 0) {
+ /* Last bit in new, shortened map */
+ hbitmap_test_set(data, size + diff - 1, 1);
+
+ /* First bit to be truncated away */
+ hbitmap_test_set(data, size + diff, 1);
+ }
+ /* Last bit */
+ hbitmap_test_set(data, size - 1, 1);
+ if (data->granularity == 0) {
+ hbitmap_test_check_get(data);
+ }
+}
+
+static void hbitmap_test_check_boundary_bits(TestHBitmapData *data)
+{
+ size_t size = MIN(data->size, data->old_size);
+
+ if (data->granularity == 0) {
+ hbitmap_test_check_get(data);
+ hbitmap_test_check(data, 0);
+ } else {
+ /* If a granularity was set, note that every distinct
+ * (bit >> granularity) value that was set will increase
+ * the bit pop count by 2^granularity, not just 1.
+ *
+ * The hbitmap_test_check facility does not currently tolerate
+ * non-zero granularities, so test the boundaries and the population
+ * count manually.
+ */
+ g_assert(hbitmap_get(data->hb, 0));
+ g_assert(hbitmap_get(data->hb, size - 1));
+ g_assert_cmpint(2 << data->granularity, ==, hbitmap_count(data->hb));
+ }
+}
+
+/* Generic truncate test. */
+static void hbitmap_test_truncate(TestHBitmapData *data,
+ size_t size,
+ ssize_t diff,
+ int granularity)
+{
+ hbitmap_test_init(data, size, granularity);
+ hbitmap_test_set_boundary_bits(data, diff);
+ hbitmap_test_truncate_impl(data, size + diff);
+ hbitmap_test_check_boundary_bits(data);
+}
+
+static void test_hbitmap_truncate_nop(TestHBitmapData *data,
+ const void *unused)
+{
+ hbitmap_test_truncate(data, L2, 0, 0);
+}
+
+/**
+ * Grow by an amount smaller than the granularity, without crossing
+ * a granularity alignment boundary. Effectively a NOP.
+ */
+static void test_hbitmap_truncate_grow_negligible(TestHBitmapData *data,
+ const void *unused)
+{
+ size_t size = L2 - 1;
+ size_t diff = 1;
+ int granularity = 1;
+
+ hbitmap_test_truncate(data, size, diff, granularity);
+}
+
+/**
+ * Shrink by an amount smaller than the granularity, without crossing
+ * a granularity alignment boundary. Effectively a NOP.
+ */
+static void test_hbitmap_truncate_shrink_negligible(TestHBitmapData *data,
+ const void *unused)
+{
+ size_t size = L2;
+ ssize_t diff = -1;
+ int granularity = 1;
+
+ hbitmap_test_truncate(data, size, diff, granularity);
+}
+
+/**
+ * Grow by an amount smaller than the granularity, but crossing over
+ * a granularity alignment boundary.
+ */
+static void test_hbitmap_truncate_grow_tiny(TestHBitmapData *data,
+ const void *unused)
+{
+ size_t size = L2 - 2;
+ ssize_t diff = 1;
+ int granularity = 1;
+
+ hbitmap_test_truncate(data, size, diff, granularity);
+}
+
+/**
+ * Shrink by an amount smaller than the granularity, but crossing over
+ * a granularity alignment boundary.
+ */
+static void test_hbitmap_truncate_shrink_tiny(TestHBitmapData *data,
+ const void *unused)
+{
+ size_t size = L2 - 1;
+ ssize_t diff = -1;
+ int granularity = 1;
+
+ hbitmap_test_truncate(data, size, diff, granularity);
+}
+
+/**
+ * Grow by an amount smaller than sizeof(long), and not crossing over
+ * a sizeof(long) alignment boundary.
+ */
+static void test_hbitmap_truncate_grow_small(TestHBitmapData *data,
+ const void *unused)
+{
+ size_t size = L2 + 1;
+ size_t diff = sizeof(long) / 2;
+
+ hbitmap_test_truncate(data, size, diff, 0);
+}
+
+/**
+ * Shrink by an amount smaller than sizeof(long), and not crossing over
+ * a sizeof(long) alignment boundary.
+ */
+static void test_hbitmap_truncate_shrink_small(TestHBitmapData *data,
+ const void *unused)
+{
+ size_t size = L2;
+ size_t diff = sizeof(long) / 2;
+
+ hbitmap_test_truncate(data, size, -diff, 0);
+}
+
+/**
+ * Grow by an amount smaller than sizeof(long), while crossing over
+ * a sizeof(long) alignment boundary.
+ */
+static void test_hbitmap_truncate_grow_medium(TestHBitmapData *data,
+ const void *unused)
+{
+ size_t size = L2 - 1;
+ size_t diff = sizeof(long) / 2;
+
+ hbitmap_test_truncate(data, size, diff, 0);
+}
+
+/**
+ * Shrink by an amount smaller than sizeof(long), while crossing over
+ * a sizeof(long) alignment boundary.
+ */
+static void test_hbitmap_truncate_shrink_medium(TestHBitmapData *data,
+ const void *unused)
+{
+ size_t size = L2 + 1;
+ size_t diff = sizeof(long) / 2;
+
+ hbitmap_test_truncate(data, size, -diff, 0);
+}
+
+/**
+ * Grow by an amount larger than sizeof(long).
+ */
+static void test_hbitmap_truncate_grow_large(TestHBitmapData *data,
+ const void *unused)
+{
+ size_t size = L2;
+ size_t diff = 8 * sizeof(long);
+
+ hbitmap_test_truncate(data, size, diff, 0);
+}
+
+/**
+ * Shrink by an amount larger than sizeof(long).
+ */
+static void test_hbitmap_truncate_shrink_large(TestHBitmapData *data,
+ const void *unused)
+{
+ size_t size = L2;
+ size_t diff = 8 * sizeof(long);
+
+ hbitmap_test_truncate(data, size, -diff, 0);
+}
+
static void hbitmap_test_add(const char *testpath,
void (*test_func)(TestHBitmapData *data, const void *user_data))
{
@@ -395,6 +628,28 @@ int main(int argc, char **argv)
hbitmap_test_add("/hbitmap/reset/empty", test_hbitmap_reset_empty);
hbitmap_test_add("/hbitmap/reset/general", test_hbitmap_reset);
hbitmap_test_add("/hbitmap/granularity", test_hbitmap_granularity);
+
+ hbitmap_test_add("/hbitmap/truncate/nop", test_hbitmap_truncate_nop);
+ hbitmap_test_add("/hbitmap/truncate/grow/negligible",
+ test_hbitmap_truncate_grow_negligible);
+ hbitmap_test_add("/hbitmap/truncate/shrink/negligible",
+ test_hbitmap_truncate_shrink_negligible);
+ hbitmap_test_add("/hbitmap/truncate/grow/tiny",
+ test_hbitmap_truncate_grow_tiny);
+ hbitmap_test_add("/hbitmap/truncate/shrink/tiny",
+ test_hbitmap_truncate_shrink_tiny);
+ hbitmap_test_add("/hbitmap/truncate/grow/small",
+ test_hbitmap_truncate_grow_small);
+ hbitmap_test_add("/hbitmap/truncate/shrink/small",
+ test_hbitmap_truncate_shrink_small);
+ hbitmap_test_add("/hbitmap/truncate/grow/medium",
+ test_hbitmap_truncate_grow_medium);
+ hbitmap_test_add("/hbitmap/truncate/shrink/medium",
+ test_hbitmap_truncate_shrink_medium);
+ hbitmap_test_add("/hbitmap/truncate/grow/large",
+ test_hbitmap_truncate_grow_large);
+ hbitmap_test_add("/hbitmap/truncate/shrink/large",
+ test_hbitmap_truncate_shrink_large);
g_test_run();
return 0;
diff --git a/thread-pool.c b/thread-pool.c
index e2cac8e4ff..ac909f4986 100644
--- a/thread-pool.c
+++ b/thread-pool.c
@@ -170,12 +170,12 @@ restart:
if (elem->state != THREAD_DONE) {
continue;
}
- if (elem->state == THREAD_DONE) {
- trace_thread_pool_complete(pool, elem, elem->common.opaque,
- elem->ret);
- }
- if (elem->state == THREAD_DONE && elem->common.cb) {
- QLIST_REMOVE(elem, all);
+
+ trace_thread_pool_complete(pool, elem, elem->common.opaque,
+ elem->ret);
+ QLIST_REMOVE(elem, all);
+
+ if (elem->common.cb) {
/* Read state before ret. */
smp_rmb();
@@ -188,8 +188,6 @@ restart:
qemu_aio_unref(elem);
goto restart;
} else {
- /* remove the request */
- QLIST_REMOVE(elem, all);
qemu_aio_unref(elem);
}
}
diff --git a/util/hbitmap.c b/util/hbitmap.c
index ab139717f5..a10c7aeeda 100644
--- a/util/hbitmap.c
+++ b/util/hbitmap.c
@@ -90,6 +90,9 @@ struct HBitmap {
* bitmap will still allocate HBITMAP_LEVELS arrays.
*/
unsigned long *levels[HBITMAP_LEVELS];
+
+ /* The length of each levels[] array. */
+ uint64_t sizes[HBITMAP_LEVELS];
};
/* Advance hbi to the next nonzero word and return it. hbi->pos
@@ -384,6 +387,7 @@ HBitmap *hbitmap_alloc(uint64_t size, int granularity)
hb->granularity = granularity;
for (i = HBITMAP_LEVELS; i-- > 0; ) {
size = MAX((size + BITS_PER_LONG - 1) >> BITS_PER_LEVEL, 1);
+ hb->sizes[i] = size;
hb->levels[i] = g_new0(unsigned long, size);
}
@@ -395,3 +399,84 @@ HBitmap *hbitmap_alloc(uint64_t size, int granularity)
hb->levels[0][0] |= 1UL << (BITS_PER_LONG - 1);
return hb;
}
+
+void hbitmap_truncate(HBitmap *hb, uint64_t size)
+{
+ bool shrink;
+ unsigned i;
+ uint64_t num_elements = size;
+ uint64_t old;
+
+ /* Size comes in as logical elements, adjust for granularity. */
+ size = (size + (1ULL << hb->granularity) - 1) >> hb->granularity;
+ assert(size <= ((uint64_t)1 << HBITMAP_LOG_MAX_SIZE));
+ shrink = size < hb->size;
+
+ /* bit sizes are identical; nothing to do. */
+ if (size == hb->size) {
+ return;
+ }
+
+ /* If we're losing bits, let's clear those bits before we invalidate all of
+ * our invariants. This helps keep the bitcount consistent, and will prevent
+ * us from carrying around garbage bits beyond the end of the map.
+ */
+ if (shrink) {
+ /* Don't clear partial granularity groups;
+ * start at the first full one. */
+ uint64_t start = QEMU_ALIGN_UP(num_elements, 1 << hb->granularity);
+ uint64_t fix_count = (hb->size << hb->granularity) - start;
+
+ assert(fix_count);
+ hbitmap_reset(hb, start, fix_count);
+ }
+
+ hb->size = size;
+ for (i = HBITMAP_LEVELS; i-- > 0; ) {
+ size = MAX(BITS_TO_LONGS(size), 1);
+ if (hb->sizes[i] == size) {
+ break;
+ }
+ old = hb->sizes[i];
+ hb->sizes[i] = size;
+ hb->levels[i] = g_realloc(hb->levels[i], size * sizeof(unsigned long));
+ if (!shrink) {
+ memset(&hb->levels[i][old], 0x00,
+ (size - old) * sizeof(*hb->levels[i]));
+ }
+ }
+}
+
+
+/**
+ * Given HBitmaps A and B, let A := A (BITOR) B.
+ * Bitmap B will not be modified.
+ *
+ * @return true if the merge was successful,
+ * false if it was not attempted.
+ */
+bool hbitmap_merge(HBitmap *a, const HBitmap *b)
+{
+ int i;
+ uint64_t j;
+
+ if ((a->size != b->size) || (a->granularity != b->granularity)) {
+ return false;
+ }
+
+ if (hbitmap_count(b) == 0) {
+ return true;
+ }
+
+ /* This merge is O(size), as BITS_PER_LONG and HBITMAP_LEVELS are constant.
+ * It may be possible to improve running times for sparsely populated maps
+ * by using hbitmap_iter_next, but this is suboptimal for dense maps.
+ */
+ for (i = HBITMAP_LEVELS - 1; i >= 0; i--) {
+ for (j = 0; j < a->sizes[i]; j++) {
+ a->levels[i][j] |= b->levels[i][j];
+ }
+ }
+
+ return true;
+}