91 files changed, 5962 insertions, 3071 deletions
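The central change in the aio-posix.c and aio-win32.c hunks below is that aio_poll() now acquires the AioContext itself and drops that lock only around the blocking wait (qemu_poll_ns() or WaitForMultipleObjects()), so other threads can take the context while this one sleeps. The following is a minimal, self-contained sketch of that locking pattern, not QEMU code: "ctx_lock" and "poll_with_ctx_lock" are illustrative names standing in for the AioContext's RFifoLock and for aio_poll().

/* Sketch only: take the lock, set up the poll set, release the lock just
 * for the blocking poll(), then re-acquire it before dispatching. */
#include <poll.h>
#include <pthread.h>

static pthread_mutex_t ctx_lock = PTHREAD_MUTEX_INITIALIZER;

int poll_with_ctx_lock(struct pollfd *fds, nfds_t nfds, int timeout_ms)
{
    int ret;

    pthread_mutex_lock(&ctx_lock);       /* fill fds[] under the lock */

    if (timeout_ms != 0) {
        pthread_mutex_unlock(&ctx_lock); /* never sleep while holding it */
    }
    ret = poll(fds, nfds, timeout_ms);   /* blocking wait, lock dropped */
    if (timeout_ms != 0) {
        pthread_mutex_lock(&ctx_lock);   /* re-take before dispatch */
    }

    /* ... dispatch ready handlers and timers here, still under the lock ... */

    pthread_mutex_unlock(&ctx_lock);
    return ret;
}

A non-blocking call (timeout_ms == 0) keeps the lock for the whole function, exactly as the patched aio_poll() does when its computed timeout is zero.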
diff --git a/MAINTAINERS b/MAINTAINERS index 7aab80b59f..b5ab755de5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1182,7 +1182,19 @@ S: Supported F: block/gluster.c T: git git://github.com/codyprime/qemu-kvm-jtc.git block +Null Block Driver +M: Fam Zheng <famz@redhat.com> +L: qemu-block@nongnu.org +S: Supported +F: block/null.c + Bootdevice M: Gonglei <arei.gonglei@huawei.com> S: Maintained F: bootdevice.c + +Quorum +M: Alberto Garcia <berto@igalia.com> +S: Supported +F: block/quorum.c +L: qemu-block@nongnu.org diff --git a/aio-posix.c b/aio-posix.c index cbd4c3438c..4abec38866 100644 --- a/aio-posix.c +++ b/aio-posix.c @@ -24,7 +24,6 @@ struct AioHandler IOHandler *io_read; IOHandler *io_write; int deleted; - int pollfds_idx; void *opaque; QLIST_ENTRY(AioHandler) node; }; @@ -83,7 +82,6 @@ void aio_set_fd_handler(AioContext *ctx, node->io_read = io_read; node->io_write = io_write; node->opaque = opaque; - node->pollfds_idx = -1; node->pfd.events = (io_read ? G_IO_IN | G_IO_HUP | G_IO_ERR : 0); node->pfd.events |= (io_write ? G_IO_OUT | G_IO_ERR : 0); @@ -186,13 +184,61 @@ bool aio_dispatch(AioContext *ctx) return progress; } +/* These thread-local variables are used only in a small part of aio_poll + * around the call to the poll() system call. In particular they are not + * used while aio_poll is performing callbacks, which makes it much easier + * to think about reentrancy! + * + * Stack-allocated arrays would be perfect but they have size limitations; + * heap allocation is expensive enough that we want to reuse arrays across + * calls to aio_poll(). And because poll() has to be called without holding + * any lock, the arrays cannot be stored in AioContext. Thread-local data + * has none of the disadvantages of these three options. + */ +static __thread GPollFD *pollfds; +static __thread AioHandler **nodes; +static __thread unsigned npfd, nalloc; +static __thread Notifier pollfds_cleanup_notifier; + +static void pollfds_cleanup(Notifier *n, void *unused) +{ + g_assert(npfd == 0); + g_free(pollfds); + g_free(nodes); + nalloc = 0; +} + +static void add_pollfd(AioHandler *node) +{ + if (npfd == nalloc) { + if (nalloc == 0) { + pollfds_cleanup_notifier.notify = pollfds_cleanup; + qemu_thread_atexit_add(&pollfds_cleanup_notifier); + nalloc = 8; + } else { + g_assert(nalloc <= INT_MAX); + nalloc *= 2; + } + pollfds = g_renew(GPollFD, pollfds, nalloc); + nodes = g_renew(AioHandler *, nodes, nalloc); + } + nodes[npfd] = node; + pollfds[npfd] = (GPollFD) { + .fd = node->pfd.fd, + .events = node->pfd.events, + }; + npfd++; +} + bool aio_poll(AioContext *ctx, bool blocking) { AioHandler *node; bool was_dispatching; - int ret; + int i, ret; bool progress; + int64_t timeout; + aio_context_acquire(ctx); was_dispatching = ctx->dispatching; progress = false; @@ -210,39 +256,36 @@ bool aio_poll(AioContext *ctx, bool blocking) ctx->walking_handlers++; - g_array_set_size(ctx->pollfds, 0); + assert(npfd == 0); /* fill pollfds */ QLIST_FOREACH(node, &ctx->aio_handlers, node) { - node->pollfds_idx = -1; if (!node->deleted && node->pfd.events) { - GPollFD pfd = { - .fd = node->pfd.fd, - .events = node->pfd.events, - }; - node->pollfds_idx = ctx->pollfds->len; - g_array_append_val(ctx->pollfds, pfd); + add_pollfd(node); } } - ctx->walking_handlers--; + timeout = blocking ? aio_compute_timeout(ctx) : 0; /* wait until next event */ - ret = qemu_poll_ns((GPollFD *)ctx->pollfds->data, - ctx->pollfds->len, - blocking ? 
aio_compute_timeout(ctx) : 0); + if (timeout) { + aio_context_release(ctx); + } + ret = qemu_poll_ns((GPollFD *)pollfds, npfd, timeout); + if (timeout) { + aio_context_acquire(ctx); + } /* if we have any readable fds, dispatch event */ if (ret > 0) { - QLIST_FOREACH(node, &ctx->aio_handlers, node) { - if (node->pollfds_idx != -1) { - GPollFD *pfd = &g_array_index(ctx->pollfds, GPollFD, - node->pollfds_idx); - node->pfd.revents = pfd->revents; - } + for (i = 0; i < npfd; i++) { + nodes[i]->pfd.revents = pollfds[i].revents; } } + npfd = 0; + ctx->walking_handlers--; + /* Run dispatch even if there were no readable fds to run timers */ aio_set_dispatching(ctx, true); if (aio_dispatch(ctx)) { @@ -250,5 +293,7 @@ bool aio_poll(AioContext *ctx, bool blocking) } aio_set_dispatching(ctx, was_dispatching); + aio_context_release(ctx); + return progress; } diff --git a/aio-win32.c b/aio-win32.c index e6f4cedf48..233d8f5d79 100644 --- a/aio-win32.c +++ b/aio-win32.c @@ -283,6 +283,7 @@ bool aio_poll(AioContext *ctx, bool blocking) int count; int timeout; + aio_context_acquire(ctx); have_select_revents = aio_prepare(ctx); if (have_select_revents) { blocking = false; @@ -323,7 +324,13 @@ bool aio_poll(AioContext *ctx, bool blocking) timeout = blocking ? qemu_timeout_ns_to_ms(aio_compute_timeout(ctx)) : 0; + if (timeout) { + aio_context_release(ctx); + } ret = WaitForMultipleObjects(count, events, FALSE, timeout); + if (timeout) { + aio_context_acquire(ctx); + } aio_set_dispatching(ctx, true); if (first && aio_bh_poll(ctx)) { @@ -349,5 +356,6 @@ bool aio_poll(AioContext *ctx, bool blocking) progress |= timerlistgroup_run_timers(&ctx->tlg); aio_set_dispatching(ctx, was_dispatching); + aio_context_release(ctx); return progress; } @@ -230,7 +230,6 @@ aio_ctx_finalize(GSource *source) event_notifier_cleanup(&ctx->notifier); rfifolock_destroy(&ctx->lock); qemu_mutex_destroy(&ctx->bh_lock); - g_array_free(ctx->pollfds, TRUE); timerlistgroup_deinit(&ctx->tlg); } @@ -281,12 +280,6 @@ static void aio_timerlist_notify(void *opaque) aio_notify(opaque); } -static void aio_rfifolock_cb(void *opaque) -{ - /* Kick owner thread in case they are blocked in aio_poll() */ - aio_notify(opaque); -} - AioContext *aio_context_new(Error **errp) { int ret; @@ -302,10 +295,9 @@ AioContext *aio_context_new(Error **errp) aio_set_event_notifier(ctx, &ctx->notifier, (EventNotifierHandler *) event_notifier_test_and_clear); - ctx->pollfds = g_array_new(FALSE, FALSE, sizeof(GPollFD)); ctx->thread_pool = NULL; qemu_mutex_init(&ctx->bh_lock); - rfifolock_init(&ctx->lock, aio_rfifolock_cb, ctx); + rfifolock_init(&ctx->lock, NULL, NULL); timerlistgroup_init(&ctx->tlg, aio_timerlist_notify, ctx); return ctx; @@ -51,43 +51,25 @@ #include <windows.h> #endif +/** + * A BdrvDirtyBitmap can be in three possible states: + * (1) successor is NULL and disabled is false: full r/w mode + * (2) successor is NULL and disabled is true: read only mode ("disabled") + * (3) successor is set: frozen mode. + * A frozen bitmap cannot be renamed, deleted, anonymized, cleared, set, + * or enabled. A frozen bitmap can only abdicate() or reclaim(). 
+ */ struct BdrvDirtyBitmap { - HBitmap *bitmap; + HBitmap *bitmap; /* Dirty sector bitmap implementation */ + BdrvDirtyBitmap *successor; /* Anonymous child; implies frozen status */ + char *name; /* Optional non-empty unique ID */ + int64_t size; /* Size of the bitmap (Number of sectors) */ + bool disabled; /* Bitmap is read-only */ QLIST_ENTRY(BdrvDirtyBitmap) list; }; #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */ -static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque); -static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque); -static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - QEMUIOVector *iov); -static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - QEMUIOVector *iov); -static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, - int64_t offset, unsigned int bytes, QEMUIOVector *qiov, - BdrvRequestFlags flags); -static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, - int64_t offset, unsigned int bytes, QEMUIOVector *qiov, - BdrvRequestFlags flags); -static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, - int64_t sector_num, - QEMUIOVector *qiov, - int nb_sectors, - BdrvRequestFlags flags, - BlockCompletionFunc *cb, - void *opaque, - bool is_write); -static void coroutine_fn bdrv_co_do_rw(void *opaque); -static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, BdrvRequestFlags flags); - static QTAILQ_HEAD(, BlockDriverState) bdrv_states = QTAILQ_HEAD_INITIALIZER(bdrv_states); @@ -97,10 +79,7 @@ static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states = static QLIST_HEAD(, BlockDriver) bdrv_drivers = QLIST_HEAD_INITIALIZER(bdrv_drivers); -static void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, - int nr_sectors); -static void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, - int nr_sectors); +static void bdrv_dirty_bitmap_truncate(BlockDriverState *bs); /* If non-zero, use only whitelisted block drivers */ static int use_bdrv_whitelist; @@ -124,104 +103,6 @@ int is_windows_drive(const char *filename) } #endif -/* throttling disk I/O limits */ -void bdrv_set_io_limits(BlockDriverState *bs, - ThrottleConfig *cfg) -{ - int i; - - throttle_config(&bs->throttle_state, cfg); - - for (i = 0; i < 2; i++) { - qemu_co_enter_next(&bs->throttled_reqs[i]); - } -} - -/* this function drain all the throttled IOs */ -static bool bdrv_start_throttled_reqs(BlockDriverState *bs) -{ - bool drained = false; - bool enabled = bs->io_limits_enabled; - int i; - - bs->io_limits_enabled = false; - - for (i = 0; i < 2; i++) { - while (qemu_co_enter_next(&bs->throttled_reqs[i])) { - drained = true; - } - } - - bs->io_limits_enabled = enabled; - - return drained; -} - -void bdrv_io_limits_disable(BlockDriverState *bs) -{ - bs->io_limits_enabled = false; - - bdrv_start_throttled_reqs(bs); - - throttle_destroy(&bs->throttle_state); -} - -static void bdrv_throttle_read_timer_cb(void *opaque) -{ - BlockDriverState *bs = opaque; - qemu_co_enter_next(&bs->throttled_reqs[0]); -} - -static void bdrv_throttle_write_timer_cb(void *opaque) -{ - BlockDriverState *bs = opaque; - qemu_co_enter_next(&bs->throttled_reqs[1]); -} - -/* should be called before bdrv_set_io_limits if a limit is set */ 
-void bdrv_io_limits_enable(BlockDriverState *bs) -{ - assert(!bs->io_limits_enabled); - throttle_init(&bs->throttle_state, - bdrv_get_aio_context(bs), - QEMU_CLOCK_VIRTUAL, - bdrv_throttle_read_timer_cb, - bdrv_throttle_write_timer_cb, - bs); - bs->io_limits_enabled = true; -} - -/* This function makes an IO wait if needed - * - * @nb_sectors: the number of sectors of the IO - * @is_write: is the IO a write - */ -static void bdrv_io_limits_intercept(BlockDriverState *bs, - unsigned int bytes, - bool is_write) -{ - /* does this io must wait */ - bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write); - - /* if must wait or any request of this type throttled queue the IO */ - if (must_wait || - !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) { - qemu_co_queue_wait(&bs->throttled_reqs[is_write]); - } - - /* the IO will be executed, do the accounting */ - throttle_account(&bs->throttle_state, is_write, bytes); - - - /* if the next request must wait -> do nothing */ - if (throttle_schedule_timer(&bs->throttle_state, is_write)) { - return; - } - - /* else queue next request for execution */ - qemu_co_queue_next(&bs->throttled_reqs[is_write]); -} - size_t bdrv_opt_mem_align(BlockDriverState *bs) { if (!bs || !bs->drv) { @@ -335,20 +216,7 @@ void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz, void bdrv_register(BlockDriver *bdrv) { - /* Block drivers without coroutine functions need emulation */ - if (!bdrv->bdrv_co_readv) { - bdrv->bdrv_co_readv = bdrv_co_readv_em; - bdrv->bdrv_co_writev = bdrv_co_writev_em; - - /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if - * the block driver lacks aio we need to emulate that too. - */ - if (!bdrv->bdrv_aio_readv) { - /* add AIO emulation layer */ - bdrv->bdrv_aio_readv = bdrv_aio_readv_em; - bdrv->bdrv_aio_writev = bdrv_aio_writev_em; - } - } + bdrv_setup_io_funcs(bdrv); QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list); } @@ -520,54 +388,6 @@ int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp) return ret; } -void bdrv_refresh_limits(BlockDriverState *bs, Error **errp) -{ - BlockDriver *drv = bs->drv; - Error *local_err = NULL; - - memset(&bs->bl, 0, sizeof(bs->bl)); - - if (!drv) { - return; - } - - /* Take some limits from the children as a default */ - if (bs->file) { - bdrv_refresh_limits(bs->file, &local_err); - if (local_err) { - error_propagate(errp, local_err); - return; - } - bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length; - bs->bl.max_transfer_length = bs->file->bl.max_transfer_length; - bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment; - } else { - bs->bl.opt_mem_alignment = 512; - } - - if (bs->backing_hd) { - bdrv_refresh_limits(bs->backing_hd, &local_err); - if (local_err) { - error_propagate(errp, local_err); - return; - } - bs->bl.opt_transfer_length = - MAX(bs->bl.opt_transfer_length, - bs->backing_hd->bl.opt_transfer_length); - bs->bl.max_transfer_length = - MIN_NON_ZERO(bs->bl.max_transfer_length, - bs->backing_hd->bl.max_transfer_length); - bs->bl.opt_mem_alignment = - MAX(bs->bl.opt_mem_alignment, - bs->backing_hd->bl.opt_mem_alignment); - } - - /* Then let the driver override it */ - if (drv->bdrv_refresh_limits) { - drv->bdrv_refresh_limits(bs, errp); - } -} - /** * Try to get @bs's logical and physical block size. * On success, store them in @bsz struct and return 0. 
@@ -841,22 +661,6 @@ int bdrv_parse_cache_flags(const char *mode, int *flags) return 0; } -/** - * The copy-on-read flag is actually a reference count so multiple users may - * use the feature without worrying about clobbering its previous state. - * Copy-on-read stays enabled until all users have called to disable it. - */ -void bdrv_enable_copy_on_read(BlockDriverState *bs) -{ - bs->copy_on_read++; -} - -void bdrv_disable_copy_on_read(BlockDriverState *bs) -{ - assert(bs->copy_on_read > 0); - bs->copy_on_read--; -} - /* * Returns the flags that a temporary snapshot should get, based on the * originally requested flags (the originally requested image will have flags @@ -1224,8 +1028,8 @@ void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd) bdrv_op_unblock_all(bs->backing_hd, bs->backing_blocker); } else if (backing_hd) { error_setg(&bs->backing_blocker, - "device is used as backing hd of '%s'", - bdrv_get_device_name(bs)); + "node is used as backing hd of '%s'", + bdrv_get_device_or_node_name(bs)); } bs->backing_hd = backing_hd; @@ -1812,8 +1616,8 @@ int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue, * to r/w */ if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) && reopen_state->flags & BDRV_O_RDWR) { - error_set(errp, QERR_DEVICE_IS_READ_ONLY, - bdrv_get_device_name(reopen_state->bs)); + error_setg(errp, "Node '%s' is read only", + bdrv_get_device_or_node_name(reopen_state->bs)); goto error; } @@ -1839,9 +1643,9 @@ int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue, } else { /* It is currently mandatory to have a bdrv_reopen_prepare() * handler for each supported drv. */ - error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, - drv->format_name, bdrv_get_device_name(reopen_state->bs), - "reopening of file"); + error_setg(errp, "Block format '%s' used by node '%s' " + "does not support reopening files", drv->format_name, + bdrv_get_device_or_node_name(reopen_state->bs)); ret = -1; goto error; } @@ -1966,86 +1770,6 @@ void bdrv_close_all(void) } } -/* Check if any requests are in-flight (including throttled requests) */ -static bool bdrv_requests_pending(BlockDriverState *bs) -{ - if (!QLIST_EMPTY(&bs->tracked_requests)) { - return true; - } - if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) { - return true; - } - if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) { - return true; - } - if (bs->file && bdrv_requests_pending(bs->file)) { - return true; - } - if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) { - return true; - } - return false; -} - -static bool bdrv_drain_one(BlockDriverState *bs) -{ - bool bs_busy; - - bdrv_flush_io_queue(bs); - bdrv_start_throttled_reqs(bs); - bs_busy = bdrv_requests_pending(bs); - bs_busy |= aio_poll(bdrv_get_aio_context(bs), bs_busy); - return bs_busy; -} - -/* - * Wait for pending requests to complete on a single BlockDriverState subtree - * - * See the warning in bdrv_drain_all(). This function can only be called if - * you are sure nothing can generate I/O because you have op blockers - * installed. - * - * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState - * AioContext. - */ -void bdrv_drain(BlockDriverState *bs) -{ - while (bdrv_drain_one(bs)) { - /* Keep iterating */ - } -} - -/* - * Wait for pending requests to complete across all BlockDriverStates - * - * This function does not flush data to disk, use bdrv_flush_all() for that - * after calling this function. 
- * - * Note that completion of an asynchronous I/O operation can trigger any - * number of other I/O operations on other devices---for example a coroutine - * can be arbitrarily complex and a constant flow of I/O can come until the - * coroutine is complete. Because of this, it is not possible to have a - * function to drain a single device's I/O queue. - */ -void bdrv_drain_all(void) -{ - /* Always run first iteration so any pending completion BHs run */ - bool busy = true; - BlockDriverState *bs; - - while (busy) { - busy = false; - - QTAILQ_FOREACH(bs, &bdrv_states, device_list) { - AioContext *aio_context = bdrv_get_aio_context(bs); - - aio_context_acquire(aio_context); - busy |= bdrv_drain_one(bs); - aio_context_release(aio_context); - } - } -} - /* make a BlockDriverState anonymous by removing from bdrv_state and * graph_bdrv_state list. Also, NULL terminate the device_name to prevent double remove */ @@ -2367,152 +2091,6 @@ int bdrv_commit_all(void) return 0; } -/** - * Remove an active request from the tracked requests list - * - * This function should be called when a tracked request is completing. - */ -static void tracked_request_end(BdrvTrackedRequest *req) -{ - if (req->serialising) { - req->bs->serialising_in_flight--; - } - - QLIST_REMOVE(req, list); - qemu_co_queue_restart_all(&req->wait_queue); -} - -/** - * Add an active request to the tracked requests list - */ -static void tracked_request_begin(BdrvTrackedRequest *req, - BlockDriverState *bs, - int64_t offset, - unsigned int bytes, bool is_write) -{ - *req = (BdrvTrackedRequest){ - .bs = bs, - .offset = offset, - .bytes = bytes, - .is_write = is_write, - .co = qemu_coroutine_self(), - .serialising = false, - .overlap_offset = offset, - .overlap_bytes = bytes, - }; - - qemu_co_queue_init(&req->wait_queue); - - QLIST_INSERT_HEAD(&bs->tracked_requests, req, list); -} - -static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align) -{ - int64_t overlap_offset = req->offset & ~(align - 1); - unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align) - - overlap_offset; - - if (!req->serialising) { - req->bs->serialising_in_flight++; - req->serialising = true; - } - - req->overlap_offset = MIN(req->overlap_offset, overlap_offset); - req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes); -} - -/** - * Round a region to cluster boundaries - */ -void bdrv_round_to_clusters(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - int64_t *cluster_sector_num, - int *cluster_nb_sectors) -{ - BlockDriverInfo bdi; - - if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) { - *cluster_sector_num = sector_num; - *cluster_nb_sectors = nb_sectors; - } else { - int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE; - *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c); - *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num + - nb_sectors, c); - } -} - -static int bdrv_get_cluster_size(BlockDriverState *bs) -{ - BlockDriverInfo bdi; - int ret; - - ret = bdrv_get_info(bs, &bdi); - if (ret < 0 || bdi.cluster_size == 0) { - return bs->request_alignment; - } else { - return bdi.cluster_size; - } -} - -static bool tracked_request_overlaps(BdrvTrackedRequest *req, - int64_t offset, unsigned int bytes) -{ - /* aaaa bbbb */ - if (offset >= req->overlap_offset + req->overlap_bytes) { - return false; - } - /* bbbb aaaa */ - if (req->overlap_offset >= offset + bytes) { - return false; - } - return true; -} - -static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self) -{ - 
BlockDriverState *bs = self->bs; - BdrvTrackedRequest *req; - bool retry; - bool waited = false; - - if (!bs->serialising_in_flight) { - return false; - } - - do { - retry = false; - QLIST_FOREACH(req, &bs->tracked_requests, list) { - if (req == self || (!req->serialising && !self->serialising)) { - continue; - } - if (tracked_request_overlaps(req, self->overlap_offset, - self->overlap_bytes)) - { - /* Hitting this means there was a reentrant request, for - * example, a block driver issuing nested requests. This must - * never happen since it means deadlock. - */ - assert(qemu_coroutine_self() != req->co); - - /* If the request is already (indirectly) waiting for us, or - * will wait for us as soon as it wakes up, then just go on - * (instead of producing a deadlock in the former case). */ - if (!req->waiting_for) { - self->waiting_for = req; - qemu_co_queue_wait(&req->wait_queue); - self->waiting_for = NULL; - retry = true; - waited = true; - break; - } - } - } - } while (retry); - - return waited; -} - /* * Return values: * 0 - success @@ -2681,879 +2259,6 @@ exit: return ret; } - -static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset, - size_t size) -{ - if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) { - return -EIO; - } - - if (!bdrv_is_inserted(bs)) { - return -ENOMEDIUM; - } - - if (offset < 0) { - return -EIO; - } - - return 0; -} - -static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num, - int nb_sectors) -{ - if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { - return -EIO; - } - - return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE, - nb_sectors * BDRV_SECTOR_SIZE); -} - -typedef struct RwCo { - BlockDriverState *bs; - int64_t offset; - QEMUIOVector *qiov; - bool is_write; - int ret; - BdrvRequestFlags flags; -} RwCo; - -static void coroutine_fn bdrv_rw_co_entry(void *opaque) -{ - RwCo *rwco = opaque; - - if (!rwco->is_write) { - rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset, - rwco->qiov->size, rwco->qiov, - rwco->flags); - } else { - rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset, - rwco->qiov->size, rwco->qiov, - rwco->flags); - } -} - -/* - * Process a vectored synchronous request using coroutines - */ -static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset, - QEMUIOVector *qiov, bool is_write, - BdrvRequestFlags flags) -{ - Coroutine *co; - RwCo rwco = { - .bs = bs, - .offset = offset, - .qiov = qiov, - .is_write = is_write, - .ret = NOT_DONE, - .flags = flags, - }; - - /** - * In sync call context, when the vcpu is blocked, this throttling timer - * will not fire; so the I/O throttling function has to be disabled here - * if it has been enabled. 
- */ - if (bs->io_limits_enabled) { - fprintf(stderr, "Disabling I/O throttling on '%s' due " - "to synchronous I/O.\n", bdrv_get_device_name(bs)); - bdrv_io_limits_disable(bs); - } - - if (qemu_in_coroutine()) { - /* Fast-path if already in coroutine context */ - bdrv_rw_co_entry(&rwco); - } else { - AioContext *aio_context = bdrv_get_aio_context(bs); - - co = qemu_coroutine_create(bdrv_rw_co_entry); - qemu_coroutine_enter(co, &rwco); - while (rwco.ret == NOT_DONE) { - aio_poll(aio_context, true); - } - } - return rwco.ret; -} - -/* - * Process a synchronous request using coroutines - */ -static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, - int nb_sectors, bool is_write, BdrvRequestFlags flags) -{ - QEMUIOVector qiov; - struct iovec iov = { - .iov_base = (void *)buf, - .iov_len = nb_sectors * BDRV_SECTOR_SIZE, - }; - - if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { - return -EINVAL; - } - - qemu_iovec_init_external(&qiov, &iov, 1); - return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS, - &qiov, is_write, flags); -} - -/* return < 0 if error. See bdrv_write() for the return codes */ -int bdrv_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) -{ - return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0); -} - -/* Just like bdrv_read(), but with I/O throttling temporarily disabled */ -int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) -{ - bool enabled; - int ret; - - enabled = bs->io_limits_enabled; - bs->io_limits_enabled = false; - ret = bdrv_read(bs, sector_num, buf, nb_sectors); - bs->io_limits_enabled = enabled; - return ret; -} - -/* Return < 0 if error. Important errors are: - -EIO generic I/O error (may happen for all errors) - -ENOMEDIUM No media inserted. - -EINVAL Invalid sector number or nb_sectors - -EACCES Trying to write a read-only device -*/ -int bdrv_write(BlockDriverState *bs, int64_t sector_num, - const uint8_t *buf, int nb_sectors) -{ - return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0); -} - -int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, BdrvRequestFlags flags) -{ - return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true, - BDRV_REQ_ZERO_WRITE | flags); -} - -/* - * Completely zero out a block device with the help of bdrv_write_zeroes. - * The operation is sped up by checking the block status and only writing - * zeroes to the device if they currently do not return zeroes. Optional - * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP). - * - * Returns < 0 on error, 0 on success. For error codes see bdrv_write(). 
- */ -int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags) -{ - int64_t target_sectors, ret, nb_sectors, sector_num = 0; - int n; - - target_sectors = bdrv_nb_sectors(bs); - if (target_sectors < 0) { - return target_sectors; - } - - for (;;) { - nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS); - if (nb_sectors <= 0) { - return 0; - } - ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n); - if (ret < 0) { - error_report("error getting block status at sector %" PRId64 ": %s", - sector_num, strerror(-ret)); - return ret; - } - if (ret & BDRV_BLOCK_ZERO) { - sector_num += n; - continue; - } - ret = bdrv_write_zeroes(bs, sector_num, n, flags); - if (ret < 0) { - error_report("error writing zeroes at sector %" PRId64 ": %s", - sector_num, strerror(-ret)); - return ret; - } - sector_num += n; - } -} - -int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes) -{ - QEMUIOVector qiov; - struct iovec iov = { - .iov_base = (void *)buf, - .iov_len = bytes, - }; - int ret; - - if (bytes < 0) { - return -EINVAL; - } - - qemu_iovec_init_external(&qiov, &iov, 1); - ret = bdrv_prwv_co(bs, offset, &qiov, false, 0); - if (ret < 0) { - return ret; - } - - return bytes; -} - -int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov) -{ - int ret; - - ret = bdrv_prwv_co(bs, offset, qiov, true, 0); - if (ret < 0) { - return ret; - } - - return qiov->size; -} - -int bdrv_pwrite(BlockDriverState *bs, int64_t offset, - const void *buf, int bytes) -{ - QEMUIOVector qiov; - struct iovec iov = { - .iov_base = (void *) buf, - .iov_len = bytes, - }; - - if (bytes < 0) { - return -EINVAL; - } - - qemu_iovec_init_external(&qiov, &iov, 1); - return bdrv_pwritev(bs, offset, &qiov); -} - -/* - * Writes to the file and ensures that no writes are reordered across this - * request (acts as a barrier) - * - * Returns 0 on success, -errno in error cases. - */ -int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset, - const void *buf, int count) -{ - int ret; - - ret = bdrv_pwrite(bs, offset, buf, count); - if (ret < 0) { - return ret; - } - - /* No flush needed for cache modes that already do it */ - if (bs->enable_write_cache) { - bdrv_flush(bs); - } - - return 0; -} - -static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) -{ - /* Perform I/O through a temporary buffer so that users who scribble over - * their read buffer while the operation is in progress do not end up - * modifying the image file. This is critical for zero-copy guest I/O - * where anything might happen inside guest memory. - */ - void *bounce_buffer; - - BlockDriver *drv = bs->drv; - struct iovec iov; - QEMUIOVector bounce_qiov; - int64_t cluster_sector_num; - int cluster_nb_sectors; - size_t skip_bytes; - int ret; - - /* Cover entire cluster so no additional backing file I/O is required when - * allocating cluster in the image file. 
- */ - bdrv_round_to_clusters(bs, sector_num, nb_sectors, - &cluster_sector_num, &cluster_nb_sectors); - - trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, - cluster_sector_num, cluster_nb_sectors); - - iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE; - iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len); - if (bounce_buffer == NULL) { - ret = -ENOMEM; - goto err; - } - - qemu_iovec_init_external(&bounce_qiov, &iov, 1); - - ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors, - &bounce_qiov); - if (ret < 0) { - goto err; - } - - if (drv->bdrv_co_write_zeroes && - buffer_is_zero(bounce_buffer, iov.iov_len)) { - ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num, - cluster_nb_sectors, 0); - } else { - /* This does not change the data on the disk, it is not necessary - * to flush even in cache=writethrough mode. - */ - ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors, - &bounce_qiov); - } - - if (ret < 0) { - /* It might be okay to ignore write errors for guest requests. If this - * is a deliberate copy-on-read then we don't want to ignore the error. - * Simply report it in all cases. - */ - goto err; - } - - skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE; - qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes, - nb_sectors * BDRV_SECTOR_SIZE); - -err: - qemu_vfree(bounce_buffer); - return ret; -} - -/* - * Forwards an already correctly aligned request to the BlockDriver. This - * handles copy on read and zeroing after EOF; any other features must be - * implemented by the caller. - */ -static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, - BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, - int64_t align, QEMUIOVector *qiov, int flags) -{ - BlockDriver *drv = bs->drv; - int ret; - - int64_t sector_num = offset >> BDRV_SECTOR_BITS; - unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; - - assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); - assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); - assert(!qiov || bytes == qiov->size); - - /* Handle Copy on Read and associated serialisation */ - if (flags & BDRV_REQ_COPY_ON_READ) { - /* If we touch the same cluster it counts as an overlap. This - * guarantees that allocating writes will be serialized and not race - * with each other for the same cluster. For example, in copy-on-read - * it ensures that the CoR read and write operations are atomic and - * guest writes cannot interleave between them. 
*/ - mark_request_serialising(req, bdrv_get_cluster_size(bs)); - } - - wait_serialising_requests(req); - - if (flags & BDRV_REQ_COPY_ON_READ) { - int pnum; - - ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum); - if (ret < 0) { - goto out; - } - - if (!ret || pnum != nb_sectors) { - ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov); - goto out; - } - } - - /* Forward the request to the BlockDriver */ - if (!bs->zero_beyond_eof) { - ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); - } else { - /* Read zeros after EOF */ - int64_t total_sectors, max_nb_sectors; - - total_sectors = bdrv_nb_sectors(bs); - if (total_sectors < 0) { - ret = total_sectors; - goto out; - } - - max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num), - align >> BDRV_SECTOR_BITS); - if (nb_sectors < max_nb_sectors) { - ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); - } else if (max_nb_sectors > 0) { - QEMUIOVector local_qiov; - - qemu_iovec_init(&local_qiov, qiov->niov); - qemu_iovec_concat(&local_qiov, qiov, 0, - max_nb_sectors * BDRV_SECTOR_SIZE); - - ret = drv->bdrv_co_readv(bs, sector_num, max_nb_sectors, - &local_qiov); - - qemu_iovec_destroy(&local_qiov); - } else { - ret = 0; - } - - /* Reading beyond end of file is supposed to produce zeroes */ - if (ret == 0 && total_sectors < sector_num + nb_sectors) { - uint64_t offset = MAX(0, total_sectors - sector_num); - uint64_t bytes = (sector_num + nb_sectors - offset) * - BDRV_SECTOR_SIZE; - qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes); - } - } - -out: - return ret; -} - -static inline uint64_t bdrv_get_align(BlockDriverState *bs) -{ - /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ - return MAX(BDRV_SECTOR_SIZE, bs->request_alignment); -} - -static inline bool bdrv_req_is_aligned(BlockDriverState *bs, - int64_t offset, size_t bytes) -{ - int64_t align = bdrv_get_align(bs); - return !(offset & (align - 1) || (bytes & (align - 1))); -} - -/* - * Handle a read request in coroutine context - */ -static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, - int64_t offset, unsigned int bytes, QEMUIOVector *qiov, - BdrvRequestFlags flags) -{ - BlockDriver *drv = bs->drv; - BdrvTrackedRequest req; - - uint64_t align = bdrv_get_align(bs); - uint8_t *head_buf = NULL; - uint8_t *tail_buf = NULL; - QEMUIOVector local_qiov; - bool use_local_qiov = false; - int ret; - - if (!drv) { - return -ENOMEDIUM; - } - - ret = bdrv_check_byte_request(bs, offset, bytes); - if (ret < 0) { - return ret; - } - - if (bs->copy_on_read) { - flags |= BDRV_REQ_COPY_ON_READ; - } - - /* throttling disk I/O */ - if (bs->io_limits_enabled) { - bdrv_io_limits_intercept(bs, bytes, false); - } - - /* Align read if necessary by padding qiov */ - if (offset & (align - 1)) { - head_buf = qemu_blockalign(bs, align); - qemu_iovec_init(&local_qiov, qiov->niov + 2); - qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); - qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); - use_local_qiov = true; - - bytes += offset & (align - 1); - offset = offset & ~(align - 1); - } - - if ((offset + bytes) & (align - 1)) { - if (!use_local_qiov) { - qemu_iovec_init(&local_qiov, qiov->niov + 1); - qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); - use_local_qiov = true; - } - tail_buf = qemu_blockalign(bs, align); - qemu_iovec_add(&local_qiov, tail_buf, - align - ((offset + bytes) & (align - 1))); - - bytes = ROUND_UP(bytes, align); - } - - tracked_request_begin(&req, bs, offset, bytes, false); - ret = 
bdrv_aligned_preadv(bs, &req, offset, bytes, align, - use_local_qiov ? &local_qiov : qiov, - flags); - tracked_request_end(&req); - - if (use_local_qiov) { - qemu_iovec_destroy(&local_qiov); - qemu_vfree(head_buf); - qemu_vfree(tail_buf); - } - - return ret; -} - -static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, - BdrvRequestFlags flags) -{ - if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { - return -EINVAL; - } - - return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS, - nb_sectors << BDRV_SECTOR_BITS, qiov, flags); -} - -int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov) -{ - trace_bdrv_co_readv(bs, sector_num, nb_sectors); - - return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0); -} - -int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) -{ - trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors); - - return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, - BDRV_REQ_COPY_ON_READ); -} - -#define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768 - -static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) -{ - BlockDriver *drv = bs->drv; - QEMUIOVector qiov; - struct iovec iov = {0}; - int ret = 0; - - int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_write_zeroes, - BDRV_REQUEST_MAX_SECTORS); - - while (nb_sectors > 0 && !ret) { - int num = nb_sectors; - - /* Align request. Block drivers can expect the "bulk" of the request - * to be aligned. - */ - if (bs->bl.write_zeroes_alignment - && num > bs->bl.write_zeroes_alignment) { - if (sector_num % bs->bl.write_zeroes_alignment != 0) { - /* Make a small request up to the first aligned sector. */ - num = bs->bl.write_zeroes_alignment; - num -= sector_num % bs->bl.write_zeroes_alignment; - } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) { - /* Shorten the request to the last aligned sector. num cannot - * underflow because num > bs->bl.write_zeroes_alignment. - */ - num -= (sector_num + num) % bs->bl.write_zeroes_alignment; - } - } - - /* limit request size */ - if (num > max_write_zeroes) { - num = max_write_zeroes; - } - - ret = -ENOTSUP; - /* First try the efficient write zeroes operation */ - if (drv->bdrv_co_write_zeroes) { - ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags); - } - - if (ret == -ENOTSUP) { - /* Fall back to bounce buffer if write zeroes is unsupported */ - int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length, - MAX_WRITE_ZEROES_BOUNCE_BUFFER); - num = MIN(num, max_xfer_len); - iov.iov_len = num * BDRV_SECTOR_SIZE; - if (iov.iov_base == NULL) { - iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE); - if (iov.iov_base == NULL) { - ret = -ENOMEM; - goto fail; - } - memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE); - } - qemu_iovec_init_external(&qiov, &iov, 1); - - ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov); - - /* Keep bounce buffer around if it is big enough for all - * all future requests. - */ - if (num < max_xfer_len) { - qemu_vfree(iov.iov_base); - iov.iov_base = NULL; - } - } - - sector_num += num; - nb_sectors -= num; - } - -fail: - qemu_vfree(iov.iov_base); - return ret; -} - -/* - * Forwards an already correctly aligned write request to the BlockDriver. 
- */ -static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, - BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, - QEMUIOVector *qiov, int flags) -{ - BlockDriver *drv = bs->drv; - bool waited; - int ret; - - int64_t sector_num = offset >> BDRV_SECTOR_BITS; - unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; - - assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); - assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); - assert(!qiov || bytes == qiov->size); - - waited = wait_serialising_requests(req); - assert(!waited || !req->serialising); - assert(req->overlap_offset <= offset); - assert(offset + bytes <= req->overlap_offset + req->overlap_bytes); - - ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req); - - if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF && - !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes && - qemu_iovec_is_zero(qiov)) { - flags |= BDRV_REQ_ZERO_WRITE; - if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) { - flags |= BDRV_REQ_MAY_UNMAP; - } - } - - if (ret < 0) { - /* Do nothing, write notifier decided to fail this request */ - } else if (flags & BDRV_REQ_ZERO_WRITE) { - BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO); - ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags); - } else { - BLKDBG_EVENT(bs, BLKDBG_PWRITEV); - ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); - } - BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE); - - if (ret == 0 && !bs->enable_write_cache) { - ret = bdrv_co_flush(bs); - } - - bdrv_set_dirty(bs, sector_num, nb_sectors); - - block_acct_highest_sector(&bs->stats, sector_num, nb_sectors); - - if (ret >= 0) { - bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors); - } - - return ret; -} - -/* - * Handle a write request in coroutine context - */ -static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, - int64_t offset, unsigned int bytes, QEMUIOVector *qiov, - BdrvRequestFlags flags) -{ - BdrvTrackedRequest req; - uint64_t align = bdrv_get_align(bs); - uint8_t *head_buf = NULL; - uint8_t *tail_buf = NULL; - QEMUIOVector local_qiov; - bool use_local_qiov = false; - int ret; - - if (!bs->drv) { - return -ENOMEDIUM; - } - if (bs->read_only) { - return -EACCES; - } - - ret = bdrv_check_byte_request(bs, offset, bytes); - if (ret < 0) { - return ret; - } - - /* throttling disk I/O */ - if (bs->io_limits_enabled) { - bdrv_io_limits_intercept(bs, bytes, true); - } - - /* - * Align write if necessary by performing a read-modify-write cycle. - * Pad qiov with the read parts and be sure to have a tracked request not - * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle. 
- */ - tracked_request_begin(&req, bs, offset, bytes, true); - - if (offset & (align - 1)) { - QEMUIOVector head_qiov; - struct iovec head_iov; - - mark_request_serialising(&req, align); - wait_serialising_requests(&req); - - head_buf = qemu_blockalign(bs, align); - head_iov = (struct iovec) { - .iov_base = head_buf, - .iov_len = align, - }; - qemu_iovec_init_external(&head_qiov, &head_iov, 1); - - BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD); - ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align, - align, &head_qiov, 0); - if (ret < 0) { - goto fail; - } - BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); - - qemu_iovec_init(&local_qiov, qiov->niov + 2); - qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); - qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); - use_local_qiov = true; - - bytes += offset & (align - 1); - offset = offset & ~(align - 1); - } - - if ((offset + bytes) & (align - 1)) { - QEMUIOVector tail_qiov; - struct iovec tail_iov; - size_t tail_bytes; - bool waited; - - mark_request_serialising(&req, align); - waited = wait_serialising_requests(&req); - assert(!waited || !use_local_qiov); - - tail_buf = qemu_blockalign(bs, align); - tail_iov = (struct iovec) { - .iov_base = tail_buf, - .iov_len = align, - }; - qemu_iovec_init_external(&tail_qiov, &tail_iov, 1); - - BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL); - ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align, - align, &tail_qiov, 0); - if (ret < 0) { - goto fail; - } - BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); - - if (!use_local_qiov) { - qemu_iovec_init(&local_qiov, qiov->niov + 1); - qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); - use_local_qiov = true; - } - - tail_bytes = (offset + bytes) & (align - 1); - qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes); - - bytes = ROUND_UP(bytes, align); - } - - if (use_local_qiov) { - /* Local buffer may have non-zero data. */ - flags &= ~BDRV_REQ_ZERO_WRITE; - } - ret = bdrv_aligned_pwritev(bs, &req, offset, bytes, - use_local_qiov ? 
&local_qiov : qiov, - flags); - -fail: - tracked_request_end(&req); - - if (use_local_qiov) { - qemu_iovec_destroy(&local_qiov); - } - qemu_vfree(head_buf); - qemu_vfree(tail_buf); - - return ret; -} - -static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, - BdrvRequestFlags flags) -{ - if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { - return -EINVAL; - } - - return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS, - nb_sectors << BDRV_SECTOR_BITS, qiov, flags); -} - -int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov) -{ - trace_bdrv_co_writev(bs, sector_num, nb_sectors); - - return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0); -} - -int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - BdrvRequestFlags flags) -{ - int ret; - - trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags); - - if (!(bs->open_flags & BDRV_O_UNMAP)) { - flags &= ~BDRV_REQ_MAY_UNMAP; - } - if (bdrv_req_is_aligned(bs, sector_num << BDRV_SECTOR_BITS, - nb_sectors << BDRV_SECTOR_BITS)) { - ret = bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL, - BDRV_REQ_ZERO_WRITE | flags); - } else { - uint8_t *buf; - QEMUIOVector local_qiov; - size_t bytes = nb_sectors << BDRV_SECTOR_BITS; - - buf = qemu_memalign(bdrv_opt_mem_align(bs), bytes); - memset(buf, 0, bytes); - qemu_iovec_init(&local_qiov, 1); - qemu_iovec_add(&local_qiov, buf, bytes); - - ret = bdrv_co_do_writev(bs, sector_num, nb_sectors, &local_qiov, - BDRV_REQ_ZERO_WRITE | flags); - qemu_vfree(buf); - } - return ret; -} - /** * Truncate file to 'offset' bytes (needed only for file protocols) */ @@ -3571,6 +2276,7 @@ int bdrv_truncate(BlockDriverState *bs, int64_t offset) ret = drv->bdrv_truncate(bs, offset); if (ret == 0) { ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS); + bdrv_dirty_bitmap_truncate(bs); if (bs->blk) { blk_dev_resize_cb(bs->blk); } @@ -3797,8 +2503,8 @@ void bdrv_add_key(BlockDriverState *bs, const char *key, Error **errp) { if (key) { if (!bdrv_is_encrypted(bs)) { - error_setg(errp, "Device '%s' is not encrypted", - bdrv_get_device_name(bs)); + error_setg(errp, "Node '%s' is not encrypted", + bdrv_get_device_or_node_name(bs)); } else if (bdrv_set_key(bs, key) < 0) { error_set(errp, QERR_INVALID_PASSWORD); } @@ -3806,7 +2512,7 @@ void bdrv_add_key(BlockDriverState *bs, const char *key, Error **errp) if (bdrv_key_required(bs)) { error_set(errp, ERROR_CLASS_DEVICE_ENCRYPTED, "'%s' (%s) is encrypted", - bdrv_get_device_name(bs), + bdrv_get_device_or_node_name(bs), bdrv_get_encrypted_filename(bs)); } } @@ -3870,15 +2576,20 @@ BlockDriverState *bdrv_find_node(const char *node_name) } /* Put this QMP function here so it can access the static graph_bdrv_states. */ -BlockDeviceInfoList *bdrv_named_nodes_list(void) +BlockDeviceInfoList *bdrv_named_nodes_list(Error **errp) { BlockDeviceInfoList *list, *entry; BlockDriverState *bs; list = NULL; QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) { + BlockDeviceInfo *info = bdrv_block_device_info(bs, errp); + if (!info) { + qapi_free_BlockDeviceInfoList(list); + return NULL; + } entry = g_malloc0(sizeof(*entry)); - entry->value = bdrv_block_device_info(bs); + entry->value = info; entry->next = list; list = entry; } @@ -3953,29 +2664,18 @@ const char *bdrv_get_device_name(const BlockDriverState *bs) return bs->blk ? 
blk_name(bs->blk) : ""; } -int bdrv_get_flags(BlockDriverState *bs) +/* This can be used to identify nodes that might not have a device + * name associated. Since node and device names live in the same + * namespace, the result is unambiguous. The exception is if both are + * absent, then this returns an empty (non-null) string. */ +const char *bdrv_get_device_or_node_name(const BlockDriverState *bs) { - return bs->open_flags; + return bs->blk ? blk_name(bs->blk) : bs->node_name; } -int bdrv_flush_all(void) +int bdrv_get_flags(BlockDriverState *bs) { - BlockDriverState *bs; - int result = 0; - - QTAILQ_FOREACH(bs, &bdrv_states, device_list) { - AioContext *aio_context = bdrv_get_aio_context(bs); - int ret; - - aio_context_acquire(aio_context); - ret = bdrv_flush(bs); - if (ret < 0 && !result) { - result = ret; - } - aio_context_release(aio_context); - } - - return result; + return bs->open_flags; } int bdrv_has_zero_init_1(BlockDriverState *bs) @@ -4030,222 +2730,6 @@ bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs) return false; } -typedef struct BdrvCoGetBlockStatusData { - BlockDriverState *bs; - BlockDriverState *base; - int64_t sector_num; - int nb_sectors; - int *pnum; - int64_t ret; - bool done; -} BdrvCoGetBlockStatusData; - -/* - * Returns the allocation status of the specified sectors. - * Drivers not implementing the functionality are assumed to not support - * backing files, hence all their sectors are reported as allocated. - * - * If 'sector_num' is beyond the end of the disk image the return value is 0 - * and 'pnum' is set to 0. - * - * 'pnum' is set to the number of sectors (including and immediately following - * the specified sector) that are known to be in the same - * allocated/unallocated state. - * - * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes - * beyond the end of the disk image it will be clamped. 
- */ -static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs, - int64_t sector_num, - int nb_sectors, int *pnum) -{ - int64_t total_sectors; - int64_t n; - int64_t ret, ret2; - - total_sectors = bdrv_nb_sectors(bs); - if (total_sectors < 0) { - return total_sectors; - } - - if (sector_num >= total_sectors) { - *pnum = 0; - return 0; - } - - n = total_sectors - sector_num; - if (n < nb_sectors) { - nb_sectors = n; - } - - if (!bs->drv->bdrv_co_get_block_status) { - *pnum = nb_sectors; - ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED; - if (bs->drv->protocol_name) { - ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE); - } - return ret; - } - - ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum); - if (ret < 0) { - *pnum = 0; - return ret; - } - - if (ret & BDRV_BLOCK_RAW) { - assert(ret & BDRV_BLOCK_OFFSET_VALID); - return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS, - *pnum, pnum); - } - - if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) { - ret |= BDRV_BLOCK_ALLOCATED; - } - - if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) { - if (bdrv_unallocated_blocks_are_zero(bs)) { - ret |= BDRV_BLOCK_ZERO; - } else if (bs->backing_hd) { - BlockDriverState *bs2 = bs->backing_hd; - int64_t nb_sectors2 = bdrv_nb_sectors(bs2); - if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) { - ret |= BDRV_BLOCK_ZERO; - } - } - } - - if (bs->file && - (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) && - (ret & BDRV_BLOCK_OFFSET_VALID)) { - int file_pnum; - - ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS, - *pnum, &file_pnum); - if (ret2 >= 0) { - /* Ignore errors. This is just providing extra information, it - * is useful but not necessary. - */ - if (!file_pnum) { - /* !file_pnum indicates an offset at or beyond the EOF; it is - * perfectly valid for the format block driver to point to such - * offsets, so catch it and mark everything as zero */ - ret |= BDRV_BLOCK_ZERO; - } else { - /* Limit request to the range reported by the protocol driver */ - *pnum = file_pnum; - ret |= (ret2 & BDRV_BLOCK_ZERO); - } - } - } - - return ret; -} - -/* Coroutine wrapper for bdrv_get_block_status() */ -static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque) -{ - BdrvCoGetBlockStatusData *data = opaque; - BlockDriverState *bs = data->bs; - - data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors, - data->pnum); - data->done = true; -} - -/* - * Synchronous wrapper around bdrv_co_get_block_status(). - * - * See bdrv_co_get_block_status() for details. - */ -int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, int *pnum) -{ - Coroutine *co; - BdrvCoGetBlockStatusData data = { - .bs = bs, - .sector_num = sector_num, - .nb_sectors = nb_sectors, - .pnum = pnum, - .done = false, - }; - - if (qemu_in_coroutine()) { - /* Fast-path if already in coroutine context */ - bdrv_get_block_status_co_entry(&data); - } else { - AioContext *aio_context = bdrv_get_aio_context(bs); - - co = qemu_coroutine_create(bdrv_get_block_status_co_entry); - qemu_coroutine_enter(co, &data); - while (!data.done) { - aio_poll(aio_context, true); - } - } - return data.ret; -} - -int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, int *pnum) -{ - int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum); - if (ret < 0) { - return ret; - } - return !!(ret & BDRV_BLOCK_ALLOCATED); -} - -/* - * Given an image chain: ... 
-> [BASE] -> [INTER1] -> [INTER2] -> [TOP] - * - * Return true if the given sector is allocated in any image between - * BASE and TOP (inclusive). BASE can be NULL to check if the given - * sector is allocated in any image of the chain. Return false otherwise. - * - * 'pnum' is set to the number of sectors (including and immediately following - * the specified sector) that are known to be in the same - * allocated/unallocated state. - * - */ -int bdrv_is_allocated_above(BlockDriverState *top, - BlockDriverState *base, - int64_t sector_num, - int nb_sectors, int *pnum) -{ - BlockDriverState *intermediate; - int ret, n = nb_sectors; - - intermediate = top; - while (intermediate && intermediate != base) { - int pnum_inter; - ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors, - &pnum_inter); - if (ret < 0) { - return ret; - } else if (ret) { - *pnum = pnum_inter; - return 1; - } - - /* - * [sector_num, nb_sectors] is unallocated on top but intermediate - * might have - * - * [sector_num+x, nr_sectors] allocated. - */ - if (n > pnum_inter && - (intermediate == top || - sector_num + pnum_inter < intermediate->total_sectors)) { - n = pnum_inter; - } - - intermediate = intermediate->backing_hd; - } - - *pnum = n; - return 0; -} - const char *bdrv_get_encrypted_filename(BlockDriverState *bs) { if (bs->backing_hd && bs->backing_hd->encrypted) @@ -4262,28 +2746,6 @@ void bdrv_get_backing_filename(BlockDriverState *bs, pstrcpy(filename, filename_size, bs->backing_file); } -int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num, - const uint8_t *buf, int nb_sectors) -{ - BlockDriver *drv = bs->drv; - int ret; - - if (!drv) { - return -ENOMEDIUM; - } - if (!drv->bdrv_write_compressed) { - return -ENOTSUP; - } - ret = bdrv_check_request(bs, sector_num, nb_sectors); - if (ret < 0) { - return ret; - } - - assert(QLIST_EMPTY(&bs->dirty_bitmaps)); - - return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors); -} - int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) { BlockDriver *drv = bs->drv; @@ -4304,47 +2766,6 @@ ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs) return NULL; } -int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, - int64_t pos, int size) -{ - QEMUIOVector qiov; - struct iovec iov = { - .iov_base = (void *) buf, - .iov_len = size, - }; - - qemu_iovec_init_external(&qiov, &iov, 1); - return bdrv_writev_vmstate(bs, &qiov, pos); -} - -int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) -{ - BlockDriver *drv = bs->drv; - - if (!drv) { - return -ENOMEDIUM; - } else if (drv->bdrv_save_vmstate) { - return drv->bdrv_save_vmstate(bs, qiov, pos); - } else if (bs->file) { - return bdrv_writev_vmstate(bs->file, qiov, pos); - } - - return -ENOTSUP; -} - -int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, - int64_t pos, int size) -{ - BlockDriver *drv = bs->drv; - if (!drv) - return -ENOMEDIUM; - if (drv->bdrv_load_vmstate) - return drv->bdrv_load_vmstate(bs, buf, pos, size); - if (bs->file) - return bdrv_load_vmstate(bs->file, buf, pos, size); - return -ENOTSUP; -} - void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event) { if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) { @@ -4491,452 +2912,6 @@ int bdrv_get_backing_file_depth(BlockDriverState *bs) return 1 + bdrv_get_backing_file_depth(bs->backing_hd); } -/**************************************************************/ -/* async I/Os */ - -BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num, - QEMUIOVector *qiov, int 
nb_sectors, - BlockCompletionFunc *cb, void *opaque) -{ - trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque); - - return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, - cb, opaque, false); -} - -BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, - QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) -{ - trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque); - - return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, - cb, opaque, true); -} - -BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, BdrvRequestFlags flags, - BlockCompletionFunc *cb, void *opaque) -{ - trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque); - - return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors, - BDRV_REQ_ZERO_WRITE | flags, - cb, opaque, true); -} - - -typedef struct MultiwriteCB { - int error; - int num_requests; - int num_callbacks; - struct { - BlockCompletionFunc *cb; - void *opaque; - QEMUIOVector *free_qiov; - } callbacks[]; -} MultiwriteCB; - -static void multiwrite_user_cb(MultiwriteCB *mcb) -{ - int i; - - for (i = 0; i < mcb->num_callbacks; i++) { - mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error); - if (mcb->callbacks[i].free_qiov) { - qemu_iovec_destroy(mcb->callbacks[i].free_qiov); - } - g_free(mcb->callbacks[i].free_qiov); - } -} - -static void multiwrite_cb(void *opaque, int ret) -{ - MultiwriteCB *mcb = opaque; - - trace_multiwrite_cb(mcb, ret); - - if (ret < 0 && !mcb->error) { - mcb->error = ret; - } - - mcb->num_requests--; - if (mcb->num_requests == 0) { - multiwrite_user_cb(mcb); - g_free(mcb); - } -} - -static int multiwrite_req_compare(const void *a, const void *b) -{ - const BlockRequest *req1 = a, *req2 = b; - - /* - * Note that we can't simply subtract req2->sector from req1->sector - * here as that could overflow the return value. - */ - if (req1->sector > req2->sector) { - return 1; - } else if (req1->sector < req2->sector) { - return -1; - } else { - return 0; - } -} - -/* - * Takes a bunch of requests and tries to merge them. Returns the number of - * requests that remain after merging. - */ -static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs, - int num_reqs, MultiwriteCB *mcb) -{ - int i, outidx; - - // Sort requests by start sector - qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare); - - // Check if adjacent requests touch the same clusters. If so, combine them, - // filling up gaps with zero sectors. - outidx = 0; - for (i = 1; i < num_reqs; i++) { - int merge = 0; - int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors; - - // Handle exactly sequential writes and overlapping writes. - if (reqs[i].sector <= oldreq_last) { - merge = 1; - } - - if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) { - merge = 0; - } - - if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors + - reqs[i].nb_sectors > bs->bl.max_transfer_length) { - merge = 0; - } - - if (merge) { - size_t size; - QEMUIOVector *qiov = g_malloc0(sizeof(*qiov)); - qemu_iovec_init(qiov, - reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1); - - // Add the first request to the merged one. If the requests are - // overlapping, drop the last sectors of the first request. 
- size = (reqs[i].sector - reqs[outidx].sector) << 9; - qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size); - - // We should need to add any zeros between the two requests - assert (reqs[i].sector <= oldreq_last); - - // Add the second request - qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size); - - // Add tail of first request, if necessary - if (qiov->size < reqs[outidx].qiov->size) { - qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size, - reqs[outidx].qiov->size - qiov->size); - } - - reqs[outidx].nb_sectors = qiov->size >> 9; - reqs[outidx].qiov = qiov; - - mcb->callbacks[i].free_qiov = reqs[outidx].qiov; - } else { - outidx++; - reqs[outidx].sector = reqs[i].sector; - reqs[outidx].nb_sectors = reqs[i].nb_sectors; - reqs[outidx].qiov = reqs[i].qiov; - } - } - - block_acct_merge_done(&bs->stats, BLOCK_ACCT_WRITE, num_reqs - outidx - 1); - - return outidx + 1; -} - -/* - * Submit multiple AIO write requests at once. - * - * On success, the function returns 0 and all requests in the reqs array have - * been submitted. In error case this function returns -1, and any of the - * requests may or may not be submitted yet. In particular, this means that the - * callback will be called for some of the requests, for others it won't. The - * caller must check the error field of the BlockRequest to wait for the right - * callbacks (if error != 0, no callback will be called). - * - * The implementation may modify the contents of the reqs array, e.g. to merge - * requests. However, the fields opaque and error are left unmodified as they - * are used to signal failure for a single request to the caller. - */ -int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs) -{ - MultiwriteCB *mcb; - int i; - - /* don't submit writes if we don't have a medium */ - if (bs->drv == NULL) { - for (i = 0; i < num_reqs; i++) { - reqs[i].error = -ENOMEDIUM; - } - return -1; - } - - if (num_reqs == 0) { - return 0; - } - - // Create MultiwriteCB structure - mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks)); - mcb->num_requests = 0; - mcb->num_callbacks = num_reqs; - - for (i = 0; i < num_reqs; i++) { - mcb->callbacks[i].cb = reqs[i].cb; - mcb->callbacks[i].opaque = reqs[i].opaque; - } - - // Check for mergable requests - num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb); - - trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs); - - /* Run the aio requests. */ - mcb->num_requests = num_reqs; - for (i = 0; i < num_reqs; i++) { - bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov, - reqs[i].nb_sectors, reqs[i].flags, - multiwrite_cb, mcb, - true); - } - - return 0; -} - -void bdrv_aio_cancel(BlockAIOCB *acb) -{ - qemu_aio_ref(acb); - bdrv_aio_cancel_async(acb); - while (acb->refcnt > 1) { - if (acb->aiocb_info->get_aio_context) { - aio_poll(acb->aiocb_info->get_aio_context(acb), true); - } else if (acb->bs) { - aio_poll(bdrv_get_aio_context(acb->bs), true); - } else { - abort(); - } - } - qemu_aio_unref(acb); -} - -/* Async version of aio cancel. The caller is not blocked if the acb implements - * cancel_async, otherwise we do nothing and let the request normally complete. - * In either case the completion callback must be called. 
*/ -void bdrv_aio_cancel_async(BlockAIOCB *acb) -{ - if (acb->aiocb_info->cancel_async) { - acb->aiocb_info->cancel_async(acb); - } -} - -/**************************************************************/ -/* async block device emulation */ - -typedef struct BlockAIOCBSync { - BlockAIOCB common; - QEMUBH *bh; - int ret; - /* vector translation state */ - QEMUIOVector *qiov; - uint8_t *bounce; - int is_write; -} BlockAIOCBSync; - -static const AIOCBInfo bdrv_em_aiocb_info = { - .aiocb_size = sizeof(BlockAIOCBSync), -}; - -static void bdrv_aio_bh_cb(void *opaque) -{ - BlockAIOCBSync *acb = opaque; - - if (!acb->is_write && acb->ret >= 0) { - qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size); - } - qemu_vfree(acb->bounce); - acb->common.cb(acb->common.opaque, acb->ret); - qemu_bh_delete(acb->bh); - acb->bh = NULL; - qemu_aio_unref(acb); -} - -static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs, - int64_t sector_num, - QEMUIOVector *qiov, - int nb_sectors, - BlockCompletionFunc *cb, - void *opaque, - int is_write) - -{ - BlockAIOCBSync *acb; - - acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque); - acb->is_write = is_write; - acb->qiov = qiov; - acb->bounce = qemu_try_blockalign(bs, qiov->size); - acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb); - - if (acb->bounce == NULL) { - acb->ret = -ENOMEM; - } else if (is_write) { - qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size); - acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors); - } else { - acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors); - } - - qemu_bh_schedule(acb->bh); - - return &acb->common; -} - -static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) -{ - return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0); -} - -static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) -{ - return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1); -} - - -typedef struct BlockAIOCBCoroutine { - BlockAIOCB common; - BlockRequest req; - bool is_write; - bool *done; - QEMUBH* bh; -} BlockAIOCBCoroutine; - -static const AIOCBInfo bdrv_em_co_aiocb_info = { - .aiocb_size = sizeof(BlockAIOCBCoroutine), -}; - -static void bdrv_co_em_bh(void *opaque) -{ - BlockAIOCBCoroutine *acb = opaque; - - acb->common.cb(acb->common.opaque, acb->req.error); - - qemu_bh_delete(acb->bh); - qemu_aio_unref(acb); -} - -/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */ -static void coroutine_fn bdrv_co_do_rw(void *opaque) -{ - BlockAIOCBCoroutine *acb = opaque; - BlockDriverState *bs = acb->common.bs; - - if (!acb->is_write) { - acb->req.error = bdrv_co_do_readv(bs, acb->req.sector, - acb->req.nb_sectors, acb->req.qiov, acb->req.flags); - } else { - acb->req.error = bdrv_co_do_writev(bs, acb->req.sector, - acb->req.nb_sectors, acb->req.qiov, acb->req.flags); - } - - acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb); - qemu_bh_schedule(acb->bh); -} - -static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, - int64_t sector_num, - QEMUIOVector *qiov, - int nb_sectors, - BdrvRequestFlags flags, - BlockCompletionFunc *cb, - void *opaque, - bool is_write) -{ - Coroutine *co; - BlockAIOCBCoroutine *acb; - - acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); - acb->req.sector = sector_num; - acb->req.nb_sectors = nb_sectors; 
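(The emulation removed in this hunk implements the callback-based AIO entry points on top of coroutines plus a bottom half that delivers the completion callback. From a caller's point of view the interface is used roughly as in the sketch below; the callback, the 'done' flag and the polling loop are illustrative and not taken from the patch.)

    #include "block/block.h"    /* bdrv_aio_readv, bdrv_get_aio_context, aio_poll */

    static void example_read_cb(void *opaque, int ret)
    {
        bool *done = opaque;
        /* ret is 0 on success or a negative errno value. */
        *done = true;
    }

    /* Submit one asynchronous read and wait for its completion callback. */
    static void example_read_one_sector(BlockDriverState *bs, QEMUIOVector *qiov)
    {
        bool done = false;
        BlockAIOCB *acb;

        acb = bdrv_aio_readv(bs, 0 /* sector_num */, qiov, 1 /* nb_sectors */,
                             example_read_cb, &done);
        if (!acb) {
            return;             /* submission failed, no callback will run */
        }
        while (!done) {
            /* Drive the event loop until the bottom half has run the callback. */
            aio_poll(bdrv_get_aio_context(bs), true);
        }
    }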
- acb->req.qiov = qiov; - acb->req.flags = flags; - acb->is_write = is_write; - - co = qemu_coroutine_create(bdrv_co_do_rw); - qemu_coroutine_enter(co, acb); - - return &acb->common; -} - -static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque) -{ - BlockAIOCBCoroutine *acb = opaque; - BlockDriverState *bs = acb->common.bs; - - acb->req.error = bdrv_co_flush(bs); - acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb); - qemu_bh_schedule(acb->bh); -} - -BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs, - BlockCompletionFunc *cb, void *opaque) -{ - trace_bdrv_aio_flush(bs, opaque); - - Coroutine *co; - BlockAIOCBCoroutine *acb; - - acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); - - co = qemu_coroutine_create(bdrv_aio_flush_co_entry); - qemu_coroutine_enter(co, acb); - - return &acb->common; -} - -static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque) -{ - BlockAIOCBCoroutine *acb = opaque; - BlockDriverState *bs = acb->common.bs; - - acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors); - acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb); - qemu_bh_schedule(acb->bh); -} - -BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) -{ - Coroutine *co; - BlockAIOCBCoroutine *acb; - - trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque); - - acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); - acb->req.sector = sector_num; - acb->req.nb_sectors = nb_sectors; - co = qemu_coroutine_create(bdrv_aio_discard_co_entry); - qemu_coroutine_enter(co, acb); - - return &acb->common; -} - void bdrv_init(void) { module_call_init(MODULE_INIT_BLOCK); @@ -4948,161 +2923,6 @@ void bdrv_init_with_whitelist(void) bdrv_init(); } -void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs, - BlockCompletionFunc *cb, void *opaque) -{ - BlockAIOCB *acb; - - acb = g_slice_alloc(aiocb_info->aiocb_size); - acb->aiocb_info = aiocb_info; - acb->bs = bs; - acb->cb = cb; - acb->opaque = opaque; - acb->refcnt = 1; - return acb; -} - -void qemu_aio_ref(void *p) -{ - BlockAIOCB *acb = p; - acb->refcnt++; -} - -void qemu_aio_unref(void *p) -{ - BlockAIOCB *acb = p; - assert(acb->refcnt > 0); - if (--acb->refcnt == 0) { - g_slice_free1(acb->aiocb_info->aiocb_size, acb); - } -} - -/**************************************************************/ -/* Coroutine block device emulation */ - -typedef struct CoroutineIOCompletion { - Coroutine *coroutine; - int ret; -} CoroutineIOCompletion; - -static void bdrv_co_io_em_complete(void *opaque, int ret) -{ - CoroutineIOCompletion *co = opaque; - - co->ret = ret; - qemu_coroutine_enter(co->coroutine, NULL); -} - -static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *iov, - bool is_write) -{ - CoroutineIOCompletion co = { - .coroutine = qemu_coroutine_self(), - }; - BlockAIOCB *acb; - - if (is_write) { - acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors, - bdrv_co_io_em_complete, &co); - } else { - acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors, - bdrv_co_io_em_complete, &co); - } - - trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb); - if (!acb) { - return -EIO; - } - qemu_coroutine_yield(); - - return co.ret; -} - -static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - QEMUIOVector *iov) -{ - return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false); -} - -static int 
coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - QEMUIOVector *iov) -{ - return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true); -} - -static void coroutine_fn bdrv_flush_co_entry(void *opaque) -{ - RwCo *rwco = opaque; - - rwco->ret = bdrv_co_flush(rwco->bs); -} - -int coroutine_fn bdrv_co_flush(BlockDriverState *bs) -{ - int ret; - - if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) { - return 0; - } - - /* Write back cached data to the OS even with cache=unsafe */ - BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS); - if (bs->drv->bdrv_co_flush_to_os) { - ret = bs->drv->bdrv_co_flush_to_os(bs); - if (ret < 0) { - return ret; - } - } - - /* But don't actually force it to the disk with cache=unsafe */ - if (bs->open_flags & BDRV_O_NO_FLUSH) { - goto flush_parent; - } - - BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK); - if (bs->drv->bdrv_co_flush_to_disk) { - ret = bs->drv->bdrv_co_flush_to_disk(bs); - } else if (bs->drv->bdrv_aio_flush) { - BlockAIOCB *acb; - CoroutineIOCompletion co = { - .coroutine = qemu_coroutine_self(), - }; - - acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co); - if (acb == NULL) { - ret = -EIO; - } else { - qemu_coroutine_yield(); - ret = co.ret; - } - } else { - /* - * Some block drivers always operate in either writethrough or unsafe - * mode and don't support bdrv_flush therefore. Usually qemu doesn't - * know how the server works (because the behaviour is hardcoded or - * depends on server-side configuration), so we can't ensure that - * everything is safe on disk. Returning an error doesn't work because - * that would break guests even if the server operates in writethrough - * mode. - * - * Let's hope the user knows what he's doing. - */ - ret = 0; - } - if (ret < 0) { - return ret; - } - - /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH - * in the case of cache=unsafe, so there are no useless flushes. - */ -flush_parent: - return bdrv_co_flush(bs->file); -} - void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp) { Error *local_err = NULL; @@ -5152,143 +2972,6 @@ void bdrv_invalidate_cache_all(Error **errp) } } -int bdrv_flush(BlockDriverState *bs) -{ - Coroutine *co; - RwCo rwco = { - .bs = bs, - .ret = NOT_DONE, - }; - - if (qemu_in_coroutine()) { - /* Fast-path if already in coroutine context */ - bdrv_flush_co_entry(&rwco); - } else { - AioContext *aio_context = bdrv_get_aio_context(bs); - - co = qemu_coroutine_create(bdrv_flush_co_entry); - qemu_coroutine_enter(co, &rwco); - while (rwco.ret == NOT_DONE) { - aio_poll(aio_context, true); - } - } - - return rwco.ret; -} - -typedef struct DiscardCo { - BlockDriverState *bs; - int64_t sector_num; - int nb_sectors; - int ret; -} DiscardCo; -static void coroutine_fn bdrv_discard_co_entry(void *opaque) -{ - DiscardCo *rwco = opaque; - - rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors); -} - -int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, - int nb_sectors) -{ - int max_discard, ret; - - if (!bs->drv) { - return -ENOMEDIUM; - } - - ret = bdrv_check_request(bs, sector_num, nb_sectors); - if (ret < 0) { - return ret; - } else if (bs->read_only) { - return -EROFS; - } - - bdrv_reset_dirty(bs, sector_num, nb_sectors); - - /* Do nothing if disabled. 
*/ - if (!(bs->open_flags & BDRV_O_UNMAP)) { - return 0; - } - - if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) { - return 0; - } - - max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS); - while (nb_sectors > 0) { - int ret; - int num = nb_sectors; - - /* align request */ - if (bs->bl.discard_alignment && - num >= bs->bl.discard_alignment && - sector_num % bs->bl.discard_alignment) { - if (num > bs->bl.discard_alignment) { - num = bs->bl.discard_alignment; - } - num -= sector_num % bs->bl.discard_alignment; - } - - /* limit request size */ - if (num > max_discard) { - num = max_discard; - } - - if (bs->drv->bdrv_co_discard) { - ret = bs->drv->bdrv_co_discard(bs, sector_num, num); - } else { - BlockAIOCB *acb; - CoroutineIOCompletion co = { - .coroutine = qemu_coroutine_self(), - }; - - acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors, - bdrv_co_io_em_complete, &co); - if (acb == NULL) { - return -EIO; - } else { - qemu_coroutine_yield(); - ret = co.ret; - } - } - if (ret && ret != -ENOTSUP) { - return ret; - } - - sector_num += num; - nb_sectors -= num; - } - return 0; -} - -int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) -{ - Coroutine *co; - DiscardCo rwco = { - .bs = bs, - .sector_num = sector_num, - .nb_sectors = nb_sectors, - .ret = NOT_DONE, - }; - - if (qemu_in_coroutine()) { - /* Fast-path if already in coroutine context */ - bdrv_discard_co_entry(&rwco); - } else { - AioContext *aio_context = bdrv_get_aio_context(bs); - - co = qemu_coroutine_create(bdrv_discard_co_entry); - qemu_coroutine_enter(co, &rwco); - while (rwco.ret == NOT_DONE) { - aio_poll(aio_context, true); - } - } - - return rwco.ret; -} - /**************************************************************/ /* removable device support */ @@ -5354,107 +3037,171 @@ void bdrv_lock_medium(BlockDriverState *bs, bool locked) } } -/* needed for generic scsi interface */ - -int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) +void bdrv_set_guest_block_size(BlockDriverState *bs, int align) { - BlockDriver *drv = bs->drv; - - if (drv && drv->bdrv_ioctl) - return drv->bdrv_ioctl(bs, req, buf); - return -ENOTSUP; + bs->guest_block_size = align; } -BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs, - unsigned long int req, void *buf, - BlockCompletionFunc *cb, void *opaque) +BdrvDirtyBitmap *bdrv_find_dirty_bitmap(BlockDriverState *bs, const char *name) { - BlockDriver *drv = bs->drv; + BdrvDirtyBitmap *bm; - if (drv && drv->bdrv_aio_ioctl) - return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque); + assert(name); + QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) { + if (bm->name && !strcmp(name, bm->name)) { + return bm; + } + } return NULL; } -void bdrv_set_guest_block_size(BlockDriverState *bs, int align) +void bdrv_dirty_bitmap_make_anon(BdrvDirtyBitmap *bitmap) { - bs->guest_block_size = align; + assert(!bdrv_dirty_bitmap_frozen(bitmap)); + g_free(bitmap->name); + bitmap->name = NULL; +} + +BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, + uint32_t granularity, + const char *name, + Error **errp) +{ + int64_t bitmap_size; + BdrvDirtyBitmap *bitmap; + uint32_t sector_granularity; + + assert((granularity & (granularity - 1)) == 0); + + if (name && bdrv_find_dirty_bitmap(bs, name)) { + error_setg(errp, "Bitmap already exists: %s", name); + return NULL; + } + sector_granularity = granularity >> BDRV_SECTOR_BITS; + assert(sector_granularity); + bitmap_size = bdrv_nb_sectors(bs); + if (bitmap_size < 0) { + error_setg_errno(errp, 
-bitmap_size, "could not get length of device"); + errno = -bitmap_size; + return NULL; + } + bitmap = g_new0(BdrvDirtyBitmap, 1); + bitmap->bitmap = hbitmap_alloc(bitmap_size, ctz32(sector_granularity)); + bitmap->size = bitmap_size; + bitmap->name = g_strdup(name); + bitmap->disabled = false; + QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list); + return bitmap; } -void *qemu_blockalign(BlockDriverState *bs, size_t size) +bool bdrv_dirty_bitmap_frozen(BdrvDirtyBitmap *bitmap) { - return qemu_memalign(bdrv_opt_mem_align(bs), size); + return bitmap->successor; } -void *qemu_blockalign0(BlockDriverState *bs, size_t size) +bool bdrv_dirty_bitmap_enabled(BdrvDirtyBitmap *bitmap) { - return memset(qemu_blockalign(bs, size), 0, size); + return !(bitmap->disabled || bitmap->successor); } -void *qemu_try_blockalign(BlockDriverState *bs, size_t size) +/** + * Create a successor bitmap destined to replace this bitmap after an operation. + * Requires that the bitmap is not frozen and has no successor. + */ +int bdrv_dirty_bitmap_create_successor(BlockDriverState *bs, + BdrvDirtyBitmap *bitmap, Error **errp) { - size_t align = bdrv_opt_mem_align(bs); + uint64_t granularity; + BdrvDirtyBitmap *child; + + if (bdrv_dirty_bitmap_frozen(bitmap)) { + error_setg(errp, "Cannot create a successor for a bitmap that is " + "currently frozen"); + return -1; + } + assert(!bitmap->successor); - /* Ensure that NULL is never returned on success */ - assert(align > 0); - if (size == 0) { - size = align; + /* Create an anonymous successor */ + granularity = bdrv_dirty_bitmap_granularity(bitmap); + child = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp); + if (!child) { + return -1; } - return qemu_try_memalign(align, size); + /* Successor will be on or off based on our current state. */ + child->disabled = bitmap->disabled; + + /* Install the successor and freeze the parent */ + bitmap->successor = child; + return 0; } -void *qemu_try_blockalign0(BlockDriverState *bs, size_t size) +/** + * For a bitmap with a successor, yield our name to the successor, + * delete the old bitmap, and return a handle to the new bitmap. + */ +BdrvDirtyBitmap *bdrv_dirty_bitmap_abdicate(BlockDriverState *bs, + BdrvDirtyBitmap *bitmap, + Error **errp) { - void *mem = qemu_try_blockalign(bs, size); + char *name; + BdrvDirtyBitmap *successor = bitmap->successor; - if (mem) { - memset(mem, 0, size); + if (successor == NULL) { + error_setg(errp, "Cannot relinquish control if " + "there's no successor present"); + return NULL; } - return mem; + name = bitmap->name; + bitmap->name = NULL; + successor->name = name; + bitmap->successor = NULL; + bdrv_release_dirty_bitmap(bs, bitmap); + + return successor; } -/* - * Check if all memory in this vector is sector aligned. +/** + * In cases of failure where we can no longer safely delete the parent, + * we may wish to re-join the parent and child/successor. + * The merged parent will be un-frozen, but not explicitly re-enabled. 
*/ -bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov) +BdrvDirtyBitmap *bdrv_reclaim_dirty_bitmap(BlockDriverState *bs, + BdrvDirtyBitmap *parent, + Error **errp) { - int i; - size_t alignment = bdrv_opt_mem_align(bs); + BdrvDirtyBitmap *successor = parent->successor; - for (i = 0; i < qiov->niov; i++) { - if ((uintptr_t) qiov->iov[i].iov_base % alignment) { - return false; - } - if (qiov->iov[i].iov_len % alignment) { - return false; - } + if (!successor) { + error_setg(errp, "Cannot reclaim a successor when none is present"); + return NULL; } - return true; + if (!hbitmap_merge(parent->bitmap, successor->bitmap)) { + error_setg(errp, "Merging of parent and successor bitmap failed"); + return NULL; + } + bdrv_release_dirty_bitmap(bs, successor); + parent->successor = NULL; + + return parent; } -BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity, - Error **errp) +/** + * Truncates _all_ bitmaps attached to a BDS. + */ +static void bdrv_dirty_bitmap_truncate(BlockDriverState *bs) { - int64_t bitmap_size; BdrvDirtyBitmap *bitmap; + uint64_t size = bdrv_nb_sectors(bs); - assert((granularity & (granularity - 1)) == 0); - - granularity >>= BDRV_SECTOR_BITS; - assert(granularity); - bitmap_size = bdrv_nb_sectors(bs); - if (bitmap_size < 0) { - error_setg_errno(errp, -bitmap_size, "could not get length of device"); - errno = -bitmap_size; - return NULL; + QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) { + if (bdrv_dirty_bitmap_frozen(bitmap)) { + continue; + } + hbitmap_truncate(bitmap->bitmap, size); } - bitmap = g_new0(BdrvDirtyBitmap, 1); - bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1); - QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list); - return bitmap; } void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap) @@ -5462,14 +3209,28 @@ void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap) BdrvDirtyBitmap *bm, *next; QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) { if (bm == bitmap) { + assert(!bdrv_dirty_bitmap_frozen(bm)); QLIST_REMOVE(bitmap, list); hbitmap_free(bitmap->bitmap); + g_free(bitmap->name); g_free(bitmap); return; } } } +void bdrv_disable_dirty_bitmap(BdrvDirtyBitmap *bitmap) +{ + assert(!bdrv_dirty_bitmap_frozen(bitmap)); + bitmap->disabled = true; +} + +void bdrv_enable_dirty_bitmap(BdrvDirtyBitmap *bitmap) +{ + assert(!bdrv_dirty_bitmap_frozen(bitmap)); + bitmap->disabled = false; +} + BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs) { BdrvDirtyBitmap *bm; @@ -5479,9 +3240,11 @@ BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs) QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) { BlockDirtyInfo *info = g_new0(BlockDirtyInfo, 1); BlockDirtyInfoList *entry = g_new0(BlockDirtyInfoList, 1); - info->count = bdrv_get_dirty_count(bs, bm); - info->granularity = - ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap)); + info->count = bdrv_get_dirty_count(bm); + info->granularity = bdrv_dirty_bitmap_granularity(bm); + info->has_name = !!bm->name; + info->name = g_strdup(bm->name); + info->frozen = bdrv_dirty_bitmap_frozen(bm); entry->value = info; *plist = entry; plist = &entry->next; @@ -5499,43 +3262,90 @@ int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector } } -void bdrv_dirty_iter_init(BlockDriverState *bs, - BdrvDirtyBitmap *bitmap, HBitmapIter *hbi) +/** + * Chooses a default granularity based on the existing cluster size, + * but clamped between [4K, 64K]. 
Defaults to 64K in the case that there + * is no cluster size information available. + */ +uint32_t bdrv_get_default_bitmap_granularity(BlockDriverState *bs) +{ + BlockDriverInfo bdi; + uint32_t granularity; + + if (bdrv_get_info(bs, &bdi) >= 0 && bdi.cluster_size > 0) { + granularity = MAX(4096, bdi.cluster_size); + granularity = MIN(65536, granularity); + } else { + granularity = 65536; + } + + return granularity; +} + +uint32_t bdrv_dirty_bitmap_granularity(BdrvDirtyBitmap *bitmap) +{ + return BDRV_SECTOR_SIZE << hbitmap_granularity(bitmap->bitmap); +} + +void bdrv_dirty_iter_init(BdrvDirtyBitmap *bitmap, HBitmapIter *hbi) { hbitmap_iter_init(hbi, bitmap->bitmap, 0); } -void bdrv_set_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, +void bdrv_set_dirty_bitmap(BdrvDirtyBitmap *bitmap, int64_t cur_sector, int nr_sectors) { + assert(bdrv_dirty_bitmap_enabled(bitmap)); hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors); } -void bdrv_reset_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, +void bdrv_reset_dirty_bitmap(BdrvDirtyBitmap *bitmap, int64_t cur_sector, int nr_sectors) { + assert(bdrv_dirty_bitmap_enabled(bitmap)); hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors); } -static void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, - int nr_sectors) +void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap) +{ + assert(bdrv_dirty_bitmap_enabled(bitmap)); + hbitmap_reset(bitmap->bitmap, 0, bitmap->size); +} + +void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, + int nr_sectors) { BdrvDirtyBitmap *bitmap; QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) { + if (!bdrv_dirty_bitmap_enabled(bitmap)) { + continue; + } hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors); } } -static void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, - int nr_sectors) +void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, + int nr_sectors) { BdrvDirtyBitmap *bitmap; QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) { + if (!bdrv_dirty_bitmap_enabled(bitmap)) { + continue; + } hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors); } } -int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap) +/** + * Advance an HBitmapIter to an arbitrary offset. 
+ */ +void bdrv_set_dirty_iter(HBitmapIter *hbi, int64_t offset) +{ + assert(hbi->hb); + hbitmap_iter_init(hbi, hbi->hb, offset); +} + +int64_t bdrv_get_dirty_count(BdrvDirtyBitmap *bitmap) { return hbitmap_count(bitmap->bitmap); } @@ -5572,8 +3382,8 @@ bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp) if (!QLIST_EMPTY(&bs->op_blockers[op])) { blocker = QLIST_FIRST(&bs->op_blockers[op]); if (errp) { - error_setg(errp, "Device '%s' is busy: %s", - bdrv_get_device_name(bs), + error_setg(errp, "Node '%s' is busy: %s", + bdrv_get_device_or_node_name(bs), error_get_pretty(blocker->reason)); } return true; @@ -5953,12 +3763,6 @@ void bdrv_remove_aio_context_notifier(BlockDriverState *bs, abort(); } -void bdrv_add_before_write_notifier(BlockDriverState *bs, - NotifierWithReturn *notifier) -{ - notifier_with_return_list_add(&bs->before_write_notifiers, notifier); -} - int bdrv_amend_options(BlockDriverState *bs, QemuOpts *opts, BlockDriverAmendStatusCB *status_cb) { @@ -6059,36 +3863,6 @@ out: return to_replace_bs; } -void bdrv_io_plug(BlockDriverState *bs) -{ - BlockDriver *drv = bs->drv; - if (drv && drv->bdrv_io_plug) { - drv->bdrv_io_plug(bs); - } else if (bs->file) { - bdrv_io_plug(bs->file); - } -} - -void bdrv_io_unplug(BlockDriverState *bs) -{ - BlockDriver *drv = bs->drv; - if (drv && drv->bdrv_io_unplug) { - drv->bdrv_io_unplug(bs); - } else if (bs->file) { - bdrv_io_unplug(bs->file); - } -} - -void bdrv_flush_io_queue(BlockDriverState *bs) -{ - BlockDriver *drv = bs->drv; - if (drv && drv->bdrv_flush_io_queue) { - drv->bdrv_flush_io_queue(bs); - } else if (bs->file) { - bdrv_flush_io_queue(bs->file); - } -} - static bool append_open_options(QDict *d, BlockDriverState *bs) { const QDictEntry *entry; diff --git a/block/Makefile.objs b/block/Makefile.objs index db2933e469..0d8c2a4ab6 100644 --- a/block/Makefile.objs +++ b/block/Makefile.objs @@ -1,4 +1,4 @@ -block-obj-y += raw_bsd.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o +block-obj-y += raw_bsd.o qcow.o vdi.o vmdk.o cloop.o bochs.o vpc.o vvfat.o block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o block-obj-y += qed-check.o @@ -9,7 +9,7 @@ block-obj-y += block-backend.o snapshot.o qapi.o block-obj-$(CONFIG_WIN32) += raw-win32.o win32-aio.o block-obj-$(CONFIG_POSIX) += raw-posix.o block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o -block-obj-y += null.o mirror.o +block-obj-y += null.o mirror.o io.o block-obj-y += nbd.o nbd-client.o sheepdog.o block-obj-$(CONFIG_LIBISCSI) += iscsi.o @@ -37,6 +37,7 @@ gluster.o-libs := $(GLUSTERFS_LIBS) ssh.o-cflags := $(LIBSSH2_CFLAGS) ssh.o-libs := $(LIBSSH2_LIBS) archipelago.o-libs := $(ARCHIPELAGO_LIBS) +block-obj-m += dmg.o dmg.o-libs := $(BZIP2_LIBS) qcow.o-libs := -lz linux-aio.o-libs := -laio diff --git a/block/backup.c b/block/backup.c index 1c535b1ab9..d3f648ddd7 100644 --- a/block/backup.c +++ b/block/backup.c @@ -37,6 +37,8 @@ typedef struct CowRequest { typedef struct BackupBlockJob { BlockJob common; BlockDriverState *target; + /* bitmap for sync=dirty-bitmap */ + BdrvDirtyBitmap *sync_bitmap; MirrorSyncMode sync_mode; RateLimit limit; BlockdevOnError on_source_error; @@ -242,6 +244,91 @@ static void backup_complete(BlockJob *job, void *opaque) g_free(data); } +static bool coroutine_fn yield_and_check(BackupBlockJob *job) +{ + if (block_job_is_cancelled(&job->common)) { + return true; + } + + /* we need to yield so that bdrv_drain_all() returns. 
+ * (without, VM does not reboot) + */ + if (job->common.speed) { + uint64_t delay_ns = ratelimit_calculate_delay(&job->limit, + job->sectors_read); + job->sectors_read = 0; + block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, delay_ns); + } else { + block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, 0); + } + + if (block_job_is_cancelled(&job->common)) { + return true; + } + + return false; +} + +static int coroutine_fn backup_run_incremental(BackupBlockJob *job) +{ + bool error_is_read; + int ret = 0; + int clusters_per_iter; + uint32_t granularity; + int64_t sector; + int64_t cluster; + int64_t end; + int64_t last_cluster = -1; + BlockDriverState *bs = job->common.bs; + HBitmapIter hbi; + + granularity = bdrv_dirty_bitmap_granularity(job->sync_bitmap); + clusters_per_iter = MAX((granularity / BACKUP_CLUSTER_SIZE), 1); + bdrv_dirty_iter_init(job->sync_bitmap, &hbi); + + /* Find the next dirty sector(s) */ + while ((sector = hbitmap_iter_next(&hbi)) != -1) { + cluster = sector / BACKUP_SECTORS_PER_CLUSTER; + + /* Fake progress updates for any clusters we skipped */ + if (cluster != last_cluster + 1) { + job->common.offset += ((cluster - last_cluster - 1) * + BACKUP_CLUSTER_SIZE); + } + + for (end = cluster + clusters_per_iter; cluster < end; cluster++) { + do { + if (yield_and_check(job)) { + return ret; + } + ret = backup_do_cow(bs, cluster * BACKUP_SECTORS_PER_CLUSTER, + BACKUP_SECTORS_PER_CLUSTER, &error_is_read); + if ((ret < 0) && + backup_error_action(job, error_is_read, -ret) == + BLOCK_ERROR_ACTION_REPORT) { + return ret; + } + } while (ret < 0); + } + + /* If the bitmap granularity is smaller than the backup granularity, + * we need to advance the iterator pointer to the next cluster. */ + if (granularity < BACKUP_CLUSTER_SIZE) { + bdrv_set_dirty_iter(&hbi, cluster * BACKUP_SECTORS_PER_CLUSTER); + } + + last_cluster = cluster - 1; + } + + /* Play some final catchup with the progress meter */ + end = DIV_ROUND_UP(job->common.len, BACKUP_CLUSTER_SIZE); + if (last_cluster + 1 < end) { + job->common.offset += ((end - last_cluster - 1) * BACKUP_CLUSTER_SIZE); + } + + return ret; +} + static void coroutine_fn backup_run(void *opaque) { BackupBlockJob *job = opaque; @@ -259,8 +346,7 @@ static void coroutine_fn backup_run(void *opaque) qemu_co_rwlock_init(&job->flush_rwlock); start = 0; - end = DIV_ROUND_UP(job->common.len / BDRV_SECTOR_SIZE, - BACKUP_SECTORS_PER_CLUSTER); + end = DIV_ROUND_UP(job->common.len, BACKUP_CLUSTER_SIZE); job->bitmap = hbitmap_alloc(end, 0); @@ -278,28 +364,13 @@ static void coroutine_fn backup_run(void *opaque) qemu_coroutine_yield(); job->common.busy = true; } + } else if (job->sync_mode == MIRROR_SYNC_MODE_DIRTY_BITMAP) { + ret = backup_run_incremental(job); } else { /* Both FULL and TOP SYNC_MODE's require copying.. */ for (; start < end; start++) { bool error_is_read; - - if (block_job_is_cancelled(&job->common)) { - break; - } - - /* we need to yield so that qemu_aio_flush() returns. 
- * (without, VM does not reboot) - */ - if (job->common.speed) { - uint64_t delay_ns = ratelimit_calculate_delay( - &job->limit, job->sectors_read); - job->sectors_read = 0; - block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, delay_ns); - } else { - block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, 0); - } - - if (block_job_is_cancelled(&job->common)) { + if (yield_and_check(job)) { break; } @@ -357,6 +428,18 @@ static void coroutine_fn backup_run(void *opaque) qemu_co_rwlock_wrlock(&job->flush_rwlock); qemu_co_rwlock_unlock(&job->flush_rwlock); + if (job->sync_bitmap) { + BdrvDirtyBitmap *bm; + if (ret < 0) { + /* Merge the successor back into the parent, delete nothing. */ + bm = bdrv_reclaim_dirty_bitmap(bs, job->sync_bitmap, NULL); + assert(bm); + } else { + /* Everything is fine, delete this bitmap and install the backup. */ + bm = bdrv_dirty_bitmap_abdicate(bs, job->sync_bitmap, NULL); + assert(bm); + } + } hbitmap_free(job->bitmap); bdrv_iostatus_disable(target); @@ -369,6 +452,7 @@ static void coroutine_fn backup_run(void *opaque) void backup_start(BlockDriverState *bs, BlockDriverState *target, int64_t speed, MirrorSyncMode sync_mode, + BdrvDirtyBitmap *sync_bitmap, BlockdevOnError on_source_error, BlockdevOnError on_target_error, BlockCompletionFunc *cb, void *opaque, @@ -412,17 +496,36 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target, return; } + if (sync_mode == MIRROR_SYNC_MODE_DIRTY_BITMAP) { + if (!sync_bitmap) { + error_setg(errp, "must provide a valid bitmap name for " + "\"dirty-bitmap\" sync mode"); + return; + } + + /* Create a new bitmap, and freeze/disable this one. */ + if (bdrv_dirty_bitmap_create_successor(bs, sync_bitmap, errp) < 0) { + return; + } + } else if (sync_bitmap) { + error_setg(errp, + "a sync_bitmap was provided to backup_run, " + "but received an incompatible sync_mode (%s)", + MirrorSyncMode_lookup[sync_mode]); + return; + } + len = bdrv_getlength(bs); if (len < 0) { error_setg_errno(errp, -len, "unable to get length for '%s'", bdrv_get_device_name(bs)); - return; + goto error; } BackupBlockJob *job = block_job_create(&backup_job_driver, bs, speed, cb, opaque, errp); if (!job) { - return; + goto error; } bdrv_op_block_all(target, job->common.blocker); @@ -431,7 +534,15 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target, job->on_target_error = on_target_error; job->target = target; job->sync_mode = sync_mode; + job->sync_bitmap = sync_mode == MIRROR_SYNC_MODE_DIRTY_BITMAP ? 
+ sync_bitmap : NULL; job->common.len = len; job->common.co = qemu_coroutine_create(backup_run); qemu_coroutine_enter(job->common.co, job); + return; + + error: + if (sync_bitmap) { + bdrv_reclaim_dirty_bitmap(bs, sync_bitmap, NULL); + } } diff --git a/block/blkdebug.c b/block/blkdebug.c index 63611e0a33..3c30edba73 100644 --- a/block/blkdebug.c +++ b/block/blkdebug.c @@ -721,6 +721,11 @@ static int64_t blkdebug_getlength(BlockDriverState *bs) return bdrv_getlength(bs->file); } +static int blkdebug_truncate(BlockDriverState *bs, int64_t offset) +{ + return bdrv_truncate(bs->file, offset); +} + static void blkdebug_refresh_filename(BlockDriverState *bs) { QDict *opts; @@ -779,6 +784,7 @@ static BlockDriver bdrv_blkdebug = { .bdrv_file_open = blkdebug_open, .bdrv_close = blkdebug_close, .bdrv_getlength = blkdebug_getlength, + .bdrv_truncate = blkdebug_truncate, .bdrv_refresh_filename = blkdebug_refresh_filename, .bdrv_aio_readv = blkdebug_aio_readv, diff --git a/block/block-backend.c b/block/block-backend.c index 48b6e4c05c..93e46f376a 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -515,6 +515,17 @@ int blk_write(BlockBackend *blk, int64_t sector_num, const uint8_t *buf, return bdrv_write(blk->bs, sector_num, buf, nb_sectors); } +int blk_write_zeroes(BlockBackend *blk, int64_t sector_num, + int nb_sectors, BdrvRequestFlags flags) +{ + int ret = blk_check_request(blk, sector_num, nb_sectors); + if (ret < 0) { + return ret; + } + + return bdrv_write_zeroes(blk->bs, sector_num, nb_sectors, flags); +} + static void error_callback_bh(void *opaque) { struct BlockBackendAIOCB *acb = opaque; diff --git a/block/io.c b/block/io.c new file mode 100644 index 0000000000..1ce62c4fbc --- /dev/null +++ b/block/io.c @@ -0,0 +1,2540 @@ +/* + * Block layer I/O functions + * + * Copyright (c) 2003 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "trace.h" +#include "sysemu/qtest.h" +#include "block/blockjob.h" +#include "block/block_int.h" + +#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */ + +static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque); +static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque); +static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *iov); +static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *iov); +static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, + int64_t offset, unsigned int bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags); +static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, + int64_t offset, unsigned int bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags); +static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, + int nb_sectors, + BdrvRequestFlags flags, + BlockCompletionFunc *cb, + void *opaque, + bool is_write); +static void coroutine_fn bdrv_co_do_rw(void *opaque); +static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, BdrvRequestFlags flags); + +/* throttling disk I/O limits */ +void bdrv_set_io_limits(BlockDriverState *bs, + ThrottleConfig *cfg) +{ + int i; + + throttle_config(&bs->throttle_state, cfg); + + for (i = 0; i < 2; i++) { + qemu_co_enter_next(&bs->throttled_reqs[i]); + } +} + +/* this function drain all the throttled IOs */ +static bool bdrv_start_throttled_reqs(BlockDriverState *bs) +{ + bool drained = false; + bool enabled = bs->io_limits_enabled; + int i; + + bs->io_limits_enabled = false; + + for (i = 0; i < 2; i++) { + while (qemu_co_enter_next(&bs->throttled_reqs[i])) { + drained = true; + } + } + + bs->io_limits_enabled = enabled; + + return drained; +} + +void bdrv_io_limits_disable(BlockDriverState *bs) +{ + bs->io_limits_enabled = false; + + bdrv_start_throttled_reqs(bs); + + throttle_destroy(&bs->throttle_state); +} + +static void bdrv_throttle_read_timer_cb(void *opaque) +{ + BlockDriverState *bs = opaque; + qemu_co_enter_next(&bs->throttled_reqs[0]); +} + +static void bdrv_throttle_write_timer_cb(void *opaque) +{ + BlockDriverState *bs = opaque; + qemu_co_enter_next(&bs->throttled_reqs[1]); +} + +/* should be called before bdrv_set_io_limits if a limit is set */ +void bdrv_io_limits_enable(BlockDriverState *bs) +{ + int clock_type = QEMU_CLOCK_REALTIME; + + if (qtest_enabled()) { + /* For testing block IO throttling only */ + clock_type = QEMU_CLOCK_VIRTUAL; + } + assert(!bs->io_limits_enabled); + throttle_init(&bs->throttle_state, + bdrv_get_aio_context(bs), + clock_type, + bdrv_throttle_read_timer_cb, + bdrv_throttle_write_timer_cb, + bs); + bs->io_limits_enabled = true; +} + +/* This function makes an IO wait if needed + * + * @nb_sectors: the number of sectors of the IO + * @is_write: is the IO a write + */ +static void bdrv_io_limits_intercept(BlockDriverState *bs, + unsigned int bytes, + bool is_write) +{ + /* does this io must wait */ + bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write); + + /* if must wait or any request of this type throttled queue the IO */ + if (must_wait || + !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) 
{ + qemu_co_queue_wait(&bs->throttled_reqs[is_write]); + } + + /* the IO will be executed, do the accounting */ + throttle_account(&bs->throttle_state, is_write, bytes); + + + /* if the next request must wait -> do nothing */ + if (throttle_schedule_timer(&bs->throttle_state, is_write)) { + return; + } + + /* else queue next request for execution */ + qemu_co_queue_next(&bs->throttled_reqs[is_write]); +} + +void bdrv_setup_io_funcs(BlockDriver *bdrv) +{ + /* Block drivers without coroutine functions need emulation */ + if (!bdrv->bdrv_co_readv) { + bdrv->bdrv_co_readv = bdrv_co_readv_em; + bdrv->bdrv_co_writev = bdrv_co_writev_em; + + /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if + * the block driver lacks aio we need to emulate that too. + */ + if (!bdrv->bdrv_aio_readv) { + /* add AIO emulation layer */ + bdrv->bdrv_aio_readv = bdrv_aio_readv_em; + bdrv->bdrv_aio_writev = bdrv_aio_writev_em; + } + } +} + +void bdrv_refresh_limits(BlockDriverState *bs, Error **errp) +{ + BlockDriver *drv = bs->drv; + Error *local_err = NULL; + + memset(&bs->bl, 0, sizeof(bs->bl)); + + if (!drv) { + return; + } + + /* Take some limits from the children as a default */ + if (bs->file) { + bdrv_refresh_limits(bs->file, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length; + bs->bl.max_transfer_length = bs->file->bl.max_transfer_length; + bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment; + } else { + bs->bl.opt_mem_alignment = 512; + } + + if (bs->backing_hd) { + bdrv_refresh_limits(bs->backing_hd, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + bs->bl.opt_transfer_length = + MAX(bs->bl.opt_transfer_length, + bs->backing_hd->bl.opt_transfer_length); + bs->bl.max_transfer_length = + MIN_NON_ZERO(bs->bl.max_transfer_length, + bs->backing_hd->bl.max_transfer_length); + bs->bl.opt_mem_alignment = + MAX(bs->bl.opt_mem_alignment, + bs->backing_hd->bl.opt_mem_alignment); + } + + /* Then let the driver override it */ + if (drv->bdrv_refresh_limits) { + drv->bdrv_refresh_limits(bs, errp); + } +} + +/** + * The copy-on-read flag is actually a reference count so multiple users may + * use the feature without worrying about clobbering its previous state. + * Copy-on-read stays enabled until all users have called to disable it. + */ +void bdrv_enable_copy_on_read(BlockDriverState *bs) +{ + bs->copy_on_read++; +} + +void bdrv_disable_copy_on_read(BlockDriverState *bs) +{ + assert(bs->copy_on_read > 0); + bs->copy_on_read--; +} + +/* Check if any requests are in-flight (including throttled requests) */ +static bool bdrv_requests_pending(BlockDriverState *bs) +{ + if (!QLIST_EMPTY(&bs->tracked_requests)) { + return true; + } + if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) { + return true; + } + if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) { + return true; + } + if (bs->file && bdrv_requests_pending(bs->file)) { + return true; + } + if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) { + return true; + } + return false; +} + +static bool bdrv_drain_one(BlockDriverState *bs) +{ + bool bs_busy; + + bdrv_flush_io_queue(bs); + bdrv_start_throttled_reqs(bs); + bs_busy = bdrv_requests_pending(bs); + bs_busy |= aio_poll(bdrv_get_aio_context(bs), bs_busy); + return bs_busy; +} + +/* + * Wait for pending requests to complete on a single BlockDriverState subtree + * + * See the warning in bdrv_drain_all(). 
This function can only be called if + * you are sure nothing can generate I/O because you have op blockers + * installed. + * + * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState + * AioContext. + */ +void bdrv_drain(BlockDriverState *bs) +{ + while (bdrv_drain_one(bs)) { + /* Keep iterating */ + } +} + +/* + * Wait for pending requests to complete across all BlockDriverStates + * + * This function does not flush data to disk, use bdrv_flush_all() for that + * after calling this function. + * + * Note that completion of an asynchronous I/O operation can trigger any + * number of other I/O operations on other devices---for example a coroutine + * can be arbitrarily complex and a constant flow of I/O can come until the + * coroutine is complete. Because of this, it is not possible to have a + * function to drain a single device's I/O queue. + */ +void bdrv_drain_all(void) +{ + /* Always run first iteration so any pending completion BHs run */ + bool busy = true; + BlockDriverState *bs = NULL; + + while ((bs = bdrv_next(bs))) { + AioContext *aio_context = bdrv_get_aio_context(bs); + + aio_context_acquire(aio_context); + if (bs->job) { + block_job_pause(bs->job); + } + aio_context_release(aio_context); + } + + while (busy) { + busy = false; + bs = NULL; + + while ((bs = bdrv_next(bs))) { + AioContext *aio_context = bdrv_get_aio_context(bs); + + aio_context_acquire(aio_context); + busy |= bdrv_drain_one(bs); + aio_context_release(aio_context); + } + } + + bs = NULL; + while ((bs = bdrv_next(bs))) { + AioContext *aio_context = bdrv_get_aio_context(bs); + + aio_context_acquire(aio_context); + if (bs->job) { + block_job_resume(bs->job); + } + aio_context_release(aio_context); + } +} + +/** + * Remove an active request from the tracked requests list + * + * This function should be called when a tracked request is completing. 
+ */ +static void tracked_request_end(BdrvTrackedRequest *req) +{ + if (req->serialising) { + req->bs->serialising_in_flight--; + } + + QLIST_REMOVE(req, list); + qemu_co_queue_restart_all(&req->wait_queue); +} + +/** + * Add an active request to the tracked requests list + */ +static void tracked_request_begin(BdrvTrackedRequest *req, + BlockDriverState *bs, + int64_t offset, + unsigned int bytes, bool is_write) +{ + *req = (BdrvTrackedRequest){ + .bs = bs, + .offset = offset, + .bytes = bytes, + .is_write = is_write, + .co = qemu_coroutine_self(), + .serialising = false, + .overlap_offset = offset, + .overlap_bytes = bytes, + }; + + qemu_co_queue_init(&req->wait_queue); + + QLIST_INSERT_HEAD(&bs->tracked_requests, req, list); +} + +static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align) +{ + int64_t overlap_offset = req->offset & ~(align - 1); + unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align) + - overlap_offset; + + if (!req->serialising) { + req->bs->serialising_in_flight++; + req->serialising = true; + } + + req->overlap_offset = MIN(req->overlap_offset, overlap_offset); + req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes); +} + +/** + * Round a region to cluster boundaries + */ +void bdrv_round_to_clusters(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + int64_t *cluster_sector_num, + int *cluster_nb_sectors) +{ + BlockDriverInfo bdi; + + if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) { + *cluster_sector_num = sector_num; + *cluster_nb_sectors = nb_sectors; + } else { + int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE; + *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c); + *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num + + nb_sectors, c); + } +} + +static int bdrv_get_cluster_size(BlockDriverState *bs) +{ + BlockDriverInfo bdi; + int ret; + + ret = bdrv_get_info(bs, &bdi); + if (ret < 0 || bdi.cluster_size == 0) { + return bs->request_alignment; + } else { + return bdi.cluster_size; + } +} + +static bool tracked_request_overlaps(BdrvTrackedRequest *req, + int64_t offset, unsigned int bytes) +{ + /* aaaa bbbb */ + if (offset >= req->overlap_offset + req->overlap_bytes) { + return false; + } + /* bbbb aaaa */ + if (req->overlap_offset >= offset + bytes) { + return false; + } + return true; +} + +static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self) +{ + BlockDriverState *bs = self->bs; + BdrvTrackedRequest *req; + bool retry; + bool waited = false; + + if (!bs->serialising_in_flight) { + return false; + } + + do { + retry = false; + QLIST_FOREACH(req, &bs->tracked_requests, list) { + if (req == self || (!req->serialising && !self->serialising)) { + continue; + } + if (tracked_request_overlaps(req, self->overlap_offset, + self->overlap_bytes)) + { + /* Hitting this means there was a reentrant request, for + * example, a block driver issuing nested requests. This must + * never happen since it means deadlock. + */ + assert(qemu_coroutine_self() != req->co); + + /* If the request is already (indirectly) waiting for us, or + * will wait for us as soon as it wakes up, then just go on + * (instead of producing a deadlock in the former case). 
*/ + if (!req->waiting_for) { + self->waiting_for = req; + qemu_co_queue_wait(&req->wait_queue); + self->waiting_for = NULL; + retry = true; + waited = true; + break; + } + } + } + } while (retry); + + return waited; +} + +static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset, + size_t size) +{ + if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) { + return -EIO; + } + + if (!bdrv_is_inserted(bs)) { + return -ENOMEDIUM; + } + + if (offset < 0) { + return -EIO; + } + + return 0; +} + +static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num, + int nb_sectors) +{ + if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { + return -EIO; + } + + return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE, + nb_sectors * BDRV_SECTOR_SIZE); +} + +typedef struct RwCo { + BlockDriverState *bs; + int64_t offset; + QEMUIOVector *qiov; + bool is_write; + int ret; + BdrvRequestFlags flags; +} RwCo; + +static void coroutine_fn bdrv_rw_co_entry(void *opaque) +{ + RwCo *rwco = opaque; + + if (!rwco->is_write) { + rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset, + rwco->qiov->size, rwco->qiov, + rwco->flags); + } else { + rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset, + rwco->qiov->size, rwco->qiov, + rwco->flags); + } +} + +/* + * Process a vectored synchronous request using coroutines + */ +static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset, + QEMUIOVector *qiov, bool is_write, + BdrvRequestFlags flags) +{ + Coroutine *co; + RwCo rwco = { + .bs = bs, + .offset = offset, + .qiov = qiov, + .is_write = is_write, + .ret = NOT_DONE, + .flags = flags, + }; + + /** + * In sync call context, when the vcpu is blocked, this throttling timer + * will not fire; so the I/O throttling function has to be disabled here + * if it has been enabled. + */ + if (bs->io_limits_enabled) { + fprintf(stderr, "Disabling I/O throttling on '%s' due " + "to synchronous I/O.\n", bdrv_get_device_name(bs)); + bdrv_io_limits_disable(bs); + } + + if (qemu_in_coroutine()) { + /* Fast-path if already in coroutine context */ + bdrv_rw_co_entry(&rwco); + } else { + AioContext *aio_context = bdrv_get_aio_context(bs); + + co = qemu_coroutine_create(bdrv_rw_co_entry); + qemu_coroutine_enter(co, &rwco); + while (rwco.ret == NOT_DONE) { + aio_poll(aio_context, true); + } + } + return rwco.ret; +} + +/* + * Process a synchronous request using coroutines + */ +static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, + int nb_sectors, bool is_write, BdrvRequestFlags flags) +{ + QEMUIOVector qiov; + struct iovec iov = { + .iov_base = (void *)buf, + .iov_len = nb_sectors * BDRV_SECTOR_SIZE, + }; + + if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { + return -EINVAL; + } + + qemu_iovec_init_external(&qiov, &iov, 1); + return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS, + &qiov, is_write, flags); +} + +/* return < 0 if error. See bdrv_write() for the return codes */ +int bdrv_read(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0); +} + +/* Just like bdrv_read(), but with I/O throttling temporarily disabled */ +int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + bool enabled; + int ret; + + enabled = bs->io_limits_enabled; + bs->io_limits_enabled = false; + ret = bdrv_read(bs, sector_num, buf, nb_sectors); + bs->io_limits_enabled = enabled; + return ret; +} + +/* Return < 0 if error. 
Important errors are: + -EIO generic I/O error (may happen for all errors) + -ENOMEDIUM No media inserted. + -EINVAL Invalid sector number or nb_sectors + -EACCES Trying to write a read-only device +*/ +int bdrv_write(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors) +{ + return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0); +} + +int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, BdrvRequestFlags flags) +{ + return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true, + BDRV_REQ_ZERO_WRITE | flags); +} + +/* + * Completely zero out a block device with the help of bdrv_write_zeroes. + * The operation is sped up by checking the block status and only writing + * zeroes to the device if they currently do not return zeroes. Optional + * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP). + * + * Returns < 0 on error, 0 on success. For error codes see bdrv_write(). + */ +int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags) +{ + int64_t target_sectors, ret, nb_sectors, sector_num = 0; + int n; + + target_sectors = bdrv_nb_sectors(bs); + if (target_sectors < 0) { + return target_sectors; + } + + for (;;) { + nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS); + if (nb_sectors <= 0) { + return 0; + } + ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n); + if (ret < 0) { + error_report("error getting block status at sector %" PRId64 ": %s", + sector_num, strerror(-ret)); + return ret; + } + if (ret & BDRV_BLOCK_ZERO) { + sector_num += n; + continue; + } + ret = bdrv_write_zeroes(bs, sector_num, n, flags); + if (ret < 0) { + error_report("error writing zeroes at sector %" PRId64 ": %s", + sector_num, strerror(-ret)); + return ret; + } + sector_num += n; + } +} + +int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes) +{ + QEMUIOVector qiov; + struct iovec iov = { + .iov_base = (void *)buf, + .iov_len = bytes, + }; + int ret; + + if (bytes < 0) { + return -EINVAL; + } + + qemu_iovec_init_external(&qiov, &iov, 1); + ret = bdrv_prwv_co(bs, offset, &qiov, false, 0); + if (ret < 0) { + return ret; + } + + return bytes; +} + +int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov) +{ + int ret; + + ret = bdrv_prwv_co(bs, offset, qiov, true, 0); + if (ret < 0) { + return ret; + } + + return qiov->size; +} + +int bdrv_pwrite(BlockDriverState *bs, int64_t offset, + const void *buf, int bytes) +{ + QEMUIOVector qiov; + struct iovec iov = { + .iov_base = (void *) buf, + .iov_len = bytes, + }; + + if (bytes < 0) { + return -EINVAL; + } + + qemu_iovec_init_external(&qiov, &iov, 1); + return bdrv_pwritev(bs, offset, &qiov); +} + +/* + * Writes to the file and ensures that no writes are reordered across this + * request (acts as a barrier) + * + * Returns 0 on success, -errno in error cases. + */ +int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset, + const void *buf, int count) +{ + int ret; + + ret = bdrv_pwrite(bs, offset, buf, count); + if (ret < 0) { + return ret; + } + + /* No flush needed for cache modes that already do it */ + if (bs->enable_write_cache) { + bdrv_flush(bs); + } + + return 0; +} + +static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) +{ + /* Perform I/O through a temporary buffer so that users who scribble over + * their read buffer while the operation is in progress do not end up + * modifying the image file. 
This is critical for zero-copy guest I/O + * where anything might happen inside guest memory. + */ + void *bounce_buffer; + + BlockDriver *drv = bs->drv; + struct iovec iov; + QEMUIOVector bounce_qiov; + int64_t cluster_sector_num; + int cluster_nb_sectors; + size_t skip_bytes; + int ret; + + /* Cover entire cluster so no additional backing file I/O is required when + * allocating cluster in the image file. + */ + bdrv_round_to_clusters(bs, sector_num, nb_sectors, + &cluster_sector_num, &cluster_nb_sectors); + + trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, + cluster_sector_num, cluster_nb_sectors); + + iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE; + iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len); + if (bounce_buffer == NULL) { + ret = -ENOMEM; + goto err; + } + + qemu_iovec_init_external(&bounce_qiov, &iov, 1); + + ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors, + &bounce_qiov); + if (ret < 0) { + goto err; + } + + if (drv->bdrv_co_write_zeroes && + buffer_is_zero(bounce_buffer, iov.iov_len)) { + ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num, + cluster_nb_sectors, 0); + } else { + /* This does not change the data on the disk, it is not necessary + * to flush even in cache=writethrough mode. + */ + ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors, + &bounce_qiov); + } + + if (ret < 0) { + /* It might be okay to ignore write errors for guest requests. If this + * is a deliberate copy-on-read then we don't want to ignore the error. + * Simply report it in all cases. + */ + goto err; + } + + skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE; + qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes, + nb_sectors * BDRV_SECTOR_SIZE); + +err: + qemu_vfree(bounce_buffer); + return ret; +} + +/* + * Forwards an already correctly aligned request to the BlockDriver. This + * handles copy on read and zeroing after EOF; any other features must be + * implemented by the caller. + */ +static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, + BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, + int64_t align, QEMUIOVector *qiov, int flags) +{ + BlockDriver *drv = bs->drv; + int ret; + + int64_t sector_num = offset >> BDRV_SECTOR_BITS; + unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; + + assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); + assert(!qiov || bytes == qiov->size); + + /* Handle Copy on Read and associated serialisation */ + if (flags & BDRV_REQ_COPY_ON_READ) { + /* If we touch the same cluster it counts as an overlap. This + * guarantees that allocating writes will be serialized and not race + * with each other for the same cluster. For example, in copy-on-read + * it ensures that the CoR read and write operations are atomic and + * guest writes cannot interleave between them. 
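 *
 * As a rough sketch (mark_request_serialising() is defined earlier in
 * this file; its behaviour is assumed here), serialising at cluster
 * granularity widens the tracked request to cluster boundaries:
 *
 *     overlap_offset = offset & ~(cluster_size - 1);
 *     overlap_bytes  = ROUND_UP(offset + bytes, cluster_size)
 *                      - overlap_offset;
 *
 * so any two requests touching the same cluster are guaranteed to
 * overlap, and the later one blocks in wait_serialising_requests().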
*/ + mark_request_serialising(req, bdrv_get_cluster_size(bs)); + } + + wait_serialising_requests(req); + + if (flags & BDRV_REQ_COPY_ON_READ) { + int pnum; + + ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum); + if (ret < 0) { + goto out; + } + + if (!ret || pnum != nb_sectors) { + ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov); + goto out; + } + } + + /* Forward the request to the BlockDriver */ + if (!bs->zero_beyond_eof) { + ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); + } else { + /* Read zeros after EOF */ + int64_t total_sectors, max_nb_sectors; + + total_sectors = bdrv_nb_sectors(bs); + if (total_sectors < 0) { + ret = total_sectors; + goto out; + } + + max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num), + align >> BDRV_SECTOR_BITS); + if (nb_sectors < max_nb_sectors) { + ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); + } else if (max_nb_sectors > 0) { + QEMUIOVector local_qiov; + + qemu_iovec_init(&local_qiov, qiov->niov); + qemu_iovec_concat(&local_qiov, qiov, 0, + max_nb_sectors * BDRV_SECTOR_SIZE); + + ret = drv->bdrv_co_readv(bs, sector_num, max_nb_sectors, + &local_qiov); + + qemu_iovec_destroy(&local_qiov); + } else { + ret = 0; + } + + /* Reading beyond end of file is supposed to produce zeroes */ + if (ret == 0 && total_sectors < sector_num + nb_sectors) { + uint64_t offset = MAX(0, total_sectors - sector_num); + uint64_t bytes = (sector_num + nb_sectors - offset) * + BDRV_SECTOR_SIZE; + qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes); + } + } + +out: + return ret; +} + +static inline uint64_t bdrv_get_align(BlockDriverState *bs) +{ + /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ + return MAX(BDRV_SECTOR_SIZE, bs->request_alignment); +} + +static inline bool bdrv_req_is_aligned(BlockDriverState *bs, + int64_t offset, size_t bytes) +{ + int64_t align = bdrv_get_align(bs); + return !(offset & (align - 1) || (bytes & (align - 1))); +} + +/* + * Handle a read request in coroutine context + */ +static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, + int64_t offset, unsigned int bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags) +{ + BlockDriver *drv = bs->drv; + BdrvTrackedRequest req; + + uint64_t align = bdrv_get_align(bs); + uint8_t *head_buf = NULL; + uint8_t *tail_buf = NULL; + QEMUIOVector local_qiov; + bool use_local_qiov = false; + int ret; + + if (!drv) { + return -ENOMEDIUM; + } + + ret = bdrv_check_byte_request(bs, offset, bytes); + if (ret < 0) { + return ret; + } + + if (bs->copy_on_read) { + flags |= BDRV_REQ_COPY_ON_READ; + } + + /* throttling disk I/O */ + if (bs->io_limits_enabled) { + bdrv_io_limits_intercept(bs, bytes, false); + } + + /* Align read if necessary by padding qiov */ + if (offset & (align - 1)) { + head_buf = qemu_blockalign(bs, align); + qemu_iovec_init(&local_qiov, qiov->niov + 2); + qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); + qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); + use_local_qiov = true; + + bytes += offset & (align - 1); + offset = offset & ~(align - 1); + } + + if ((offset + bytes) & (align - 1)) { + if (!use_local_qiov) { + qemu_iovec_init(&local_qiov, qiov->niov + 1); + qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); + use_local_qiov = true; + } + tail_buf = qemu_blockalign(bs, align); + qemu_iovec_add(&local_qiov, tail_buf, + align - ((offset + bytes) & (align - 1))); + + bytes = ROUND_UP(bytes, align); + } + + tracked_request_begin(&req, bs, offset, bytes, false); + ret = 
bdrv_aligned_preadv(bs, &req, offset, bytes, align, + use_local_qiov ? &local_qiov : qiov, + flags); + tracked_request_end(&req); + + if (use_local_qiov) { + qemu_iovec_destroy(&local_qiov); + qemu_vfree(head_buf); + qemu_vfree(tail_buf); + } + + return ret; +} + +static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, + BdrvRequestFlags flags) +{ + if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { + return -EINVAL; + } + + return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS, + nb_sectors << BDRV_SECTOR_BITS, qiov, flags); +} + +int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) +{ + trace_bdrv_co_readv(bs, sector_num, nb_sectors); + + return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0); +} + +int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) +{ + trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors); + + return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, + BDRV_REQ_COPY_ON_READ); +} + +#define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768 + +static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) +{ + BlockDriver *drv = bs->drv; + QEMUIOVector qiov; + struct iovec iov = {0}; + int ret = 0; + + int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_write_zeroes, + BDRV_REQUEST_MAX_SECTORS); + + while (nb_sectors > 0 && !ret) { + int num = nb_sectors; + + /* Align request. Block drivers can expect the "bulk" of the request + * to be aligned. + */ + if (bs->bl.write_zeroes_alignment + && num > bs->bl.write_zeroes_alignment) { + if (sector_num % bs->bl.write_zeroes_alignment != 0) { + /* Make a small request up to the first aligned sector. */ + num = bs->bl.write_zeroes_alignment; + num -= sector_num % bs->bl.write_zeroes_alignment; + } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) { + /* Shorten the request to the last aligned sector. num cannot + * underflow because num > bs->bl.write_zeroes_alignment. + */ + num -= (sector_num + num) % bs->bl.write_zeroes_alignment; + } + } + + /* limit request size */ + if (num > max_write_zeroes) { + num = max_write_zeroes; + } + + ret = -ENOTSUP; + /* First try the efficient write zeroes operation */ + if (drv->bdrv_co_write_zeroes) { + ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags); + } + + if (ret == -ENOTSUP) { + /* Fall back to bounce buffer if write zeroes is unsupported */ + int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length, + MAX_WRITE_ZEROES_BOUNCE_BUFFER); + num = MIN(num, max_xfer_len); + iov.iov_len = num * BDRV_SECTOR_SIZE; + if (iov.iov_base == NULL) { + iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE); + if (iov.iov_base == NULL) { + ret = -ENOMEM; + goto fail; + } + memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE); + } + qemu_iovec_init_external(&qiov, &iov, 1); + + ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov); + + /* Keep bounce buffer around if it is big enough for all + * all future requests. + */ + if (num < max_xfer_len) { + qemu_vfree(iov.iov_base); + iov.iov_base = NULL; + } + } + + sector_num += num; + nb_sectors -= num; + } + +fail: + qemu_vfree(iov.iov_base); + return ret; +} + +/* + * Forwards an already correctly aligned write request to the BlockDriver. 
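 * ("Aligned" here means that offset and bytes are multiples of
 *  bdrv_get_align(bs), i.e. MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
 *  the asserts below only check sector alignment.)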
+ */ +static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, + BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, + QEMUIOVector *qiov, int flags) +{ + BlockDriver *drv = bs->drv; + bool waited; + int ret; + + int64_t sector_num = offset >> BDRV_SECTOR_BITS; + unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; + + assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); + assert(!qiov || bytes == qiov->size); + + waited = wait_serialising_requests(req); + assert(!waited || !req->serialising); + assert(req->overlap_offset <= offset); + assert(offset + bytes <= req->overlap_offset + req->overlap_bytes); + + ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req); + + if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF && + !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes && + qemu_iovec_is_zero(qiov)) { + flags |= BDRV_REQ_ZERO_WRITE; + if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) { + flags |= BDRV_REQ_MAY_UNMAP; + } + } + + if (ret < 0) { + /* Do nothing, write notifier decided to fail this request */ + } else if (flags & BDRV_REQ_ZERO_WRITE) { + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO); + ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags); + } else { + BLKDBG_EVENT(bs, BLKDBG_PWRITEV); + ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); + } + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE); + + if (ret == 0 && !bs->enable_write_cache) { + ret = bdrv_co_flush(bs); + } + + bdrv_set_dirty(bs, sector_num, nb_sectors); + + block_acct_highest_sector(&bs->stats, sector_num, nb_sectors); + + if (ret >= 0) { + bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors); + } + + return ret; +} + +/* + * Handle a write request in coroutine context + */ +static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, + int64_t offset, unsigned int bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags) +{ + BdrvTrackedRequest req; + uint64_t align = bdrv_get_align(bs); + uint8_t *head_buf = NULL; + uint8_t *tail_buf = NULL; + QEMUIOVector local_qiov; + bool use_local_qiov = false; + int ret; + + if (!bs->drv) { + return -ENOMEDIUM; + } + if (bs->read_only) { + return -EACCES; + } + + ret = bdrv_check_byte_request(bs, offset, bytes); + if (ret < 0) { + return ret; + } + + /* throttling disk I/O */ + if (bs->io_limits_enabled) { + bdrv_io_limits_intercept(bs, bytes, true); + } + + /* + * Align write if necessary by performing a read-modify-write cycle. + * Pad qiov with the read parts and be sure to have a tracked request not + * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle. 
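 *
 * A worked example with purely illustrative numbers: assume align = 4096
 * and a guest write of 4096 bytes at offset 4608.  The head RMW reads
 * 4096 bytes at offset 4096 and keeps bytes [0, 512); the tail RMW reads
 * 4096 bytes at offset 8192 and keeps bytes [512, 4096); the final
 * aligned request then writes 8192 bytes at offset 4096, assembled from
 * head data + guest data + tail data (512 + 4096 + 3584 bytes).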
+ */ + tracked_request_begin(&req, bs, offset, bytes, true); + + if (offset & (align - 1)) { + QEMUIOVector head_qiov; + struct iovec head_iov; + + mark_request_serialising(&req, align); + wait_serialising_requests(&req); + + head_buf = qemu_blockalign(bs, align); + head_iov = (struct iovec) { + .iov_base = head_buf, + .iov_len = align, + }; + qemu_iovec_init_external(&head_qiov, &head_iov, 1); + + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD); + ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align, + align, &head_qiov, 0); + if (ret < 0) { + goto fail; + } + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); + + qemu_iovec_init(&local_qiov, qiov->niov + 2); + qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); + qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); + use_local_qiov = true; + + bytes += offset & (align - 1); + offset = offset & ~(align - 1); + } + + if ((offset + bytes) & (align - 1)) { + QEMUIOVector tail_qiov; + struct iovec tail_iov; + size_t tail_bytes; + bool waited; + + mark_request_serialising(&req, align); + waited = wait_serialising_requests(&req); + assert(!waited || !use_local_qiov); + + tail_buf = qemu_blockalign(bs, align); + tail_iov = (struct iovec) { + .iov_base = tail_buf, + .iov_len = align, + }; + qemu_iovec_init_external(&tail_qiov, &tail_iov, 1); + + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL); + ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align, + align, &tail_qiov, 0); + if (ret < 0) { + goto fail; + } + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); + + if (!use_local_qiov) { + qemu_iovec_init(&local_qiov, qiov->niov + 1); + qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); + use_local_qiov = true; + } + + tail_bytes = (offset + bytes) & (align - 1); + qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes); + + bytes = ROUND_UP(bytes, align); + } + + if (use_local_qiov) { + /* Local buffer may have non-zero data. */ + flags &= ~BDRV_REQ_ZERO_WRITE; + } + ret = bdrv_aligned_pwritev(bs, &req, offset, bytes, + use_local_qiov ? 
&local_qiov : qiov, + flags); + +fail: + tracked_request_end(&req); + + if (use_local_qiov) { + qemu_iovec_destroy(&local_qiov); + } + qemu_vfree(head_buf); + qemu_vfree(tail_buf); + + return ret; +} + +static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, + BdrvRequestFlags flags) +{ + if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { + return -EINVAL; + } + + return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS, + nb_sectors << BDRV_SECTOR_BITS, qiov, flags); +} + +int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) +{ + trace_bdrv_co_writev(bs, sector_num, nb_sectors); + + return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0); +} + +int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + BdrvRequestFlags flags) +{ + int ret; + + trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags); + + if (!(bs->open_flags & BDRV_O_UNMAP)) { + flags &= ~BDRV_REQ_MAY_UNMAP; + } + if (bdrv_req_is_aligned(bs, sector_num << BDRV_SECTOR_BITS, + nb_sectors << BDRV_SECTOR_BITS)) { + ret = bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL, + BDRV_REQ_ZERO_WRITE | flags); + } else { + uint8_t *buf; + QEMUIOVector local_qiov; + size_t bytes = nb_sectors << BDRV_SECTOR_BITS; + + buf = qemu_memalign(bdrv_opt_mem_align(bs), bytes); + memset(buf, 0, bytes); + qemu_iovec_init(&local_qiov, 1); + qemu_iovec_add(&local_qiov, buf, bytes); + + ret = bdrv_co_do_writev(bs, sector_num, nb_sectors, &local_qiov, + BDRV_REQ_ZERO_WRITE | flags); + qemu_vfree(buf); + } + return ret; +} + +int bdrv_flush_all(void) +{ + BlockDriverState *bs = NULL; + int result = 0; + + while ((bs = bdrv_next(bs))) { + AioContext *aio_context = bdrv_get_aio_context(bs); + int ret; + + aio_context_acquire(aio_context); + ret = bdrv_flush(bs); + if (ret < 0 && !result) { + result = ret; + } + aio_context_release(aio_context); + } + + return result; +} + +typedef struct BdrvCoGetBlockStatusData { + BlockDriverState *bs; + BlockDriverState *base; + int64_t sector_num; + int nb_sectors; + int *pnum; + int64_t ret; + bool done; +} BdrvCoGetBlockStatusData; + +/* + * Returns the allocation status of the specified sectors. + * Drivers not implementing the functionality are assumed to not support + * backing files, hence all their sectors are reported as allocated. + * + * If 'sector_num' is beyond the end of the disk image the return value is 0 + * and 'pnum' is set to 0. + * + * 'pnum' is set to the number of sectors (including and immediately following + * the specified sector) that are known to be in the same + * allocated/unallocated state. + * + * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes + * beyond the end of the disk image it will be clamped. 
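 *
 * The return value is a combination of BDRV_BLOCK_* flags and, when
 * BDRV_BLOCK_OFFSET_VALID is set, a host byte offset.  As an
 * illustration, the fallback below for drivers that do not implement
 * bdrv_co_get_block_status reports BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED,
 * ORs in BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE) for
 * protocol drivers, and sets *pnum = nb_sectors.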
+ */ +static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs, + int64_t sector_num, + int nb_sectors, int *pnum) +{ + int64_t total_sectors; + int64_t n; + int64_t ret, ret2; + + total_sectors = bdrv_nb_sectors(bs); + if (total_sectors < 0) { + return total_sectors; + } + + if (sector_num >= total_sectors) { + *pnum = 0; + return 0; + } + + n = total_sectors - sector_num; + if (n < nb_sectors) { + nb_sectors = n; + } + + if (!bs->drv->bdrv_co_get_block_status) { + *pnum = nb_sectors; + ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED; + if (bs->drv->protocol_name) { + ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE); + } + return ret; + } + + ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum); + if (ret < 0) { + *pnum = 0; + return ret; + } + + if (ret & BDRV_BLOCK_RAW) { + assert(ret & BDRV_BLOCK_OFFSET_VALID); + return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS, + *pnum, pnum); + } + + if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) { + ret |= BDRV_BLOCK_ALLOCATED; + } + + if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) { + if (bdrv_unallocated_blocks_are_zero(bs)) { + ret |= BDRV_BLOCK_ZERO; + } else if (bs->backing_hd) { + BlockDriverState *bs2 = bs->backing_hd; + int64_t nb_sectors2 = bdrv_nb_sectors(bs2); + if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) { + ret |= BDRV_BLOCK_ZERO; + } + } + } + + if (bs->file && + (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) && + (ret & BDRV_BLOCK_OFFSET_VALID)) { + int file_pnum; + + ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS, + *pnum, &file_pnum); + if (ret2 >= 0) { + /* Ignore errors. This is just providing extra information, it + * is useful but not necessary. + */ + if (!file_pnum) { + /* !file_pnum indicates an offset at or beyond the EOF; it is + * perfectly valid for the format block driver to point to such + * offsets, so catch it and mark everything as zero */ + ret |= BDRV_BLOCK_ZERO; + } else { + /* Limit request to the range reported by the protocol driver */ + *pnum = file_pnum; + ret |= (ret2 & BDRV_BLOCK_ZERO); + } + } + } + + return ret; +} + +/* Coroutine wrapper for bdrv_get_block_status() */ +static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque) +{ + BdrvCoGetBlockStatusData *data = opaque; + BlockDriverState *bs = data->bs; + + data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors, + data->pnum); + data->done = true; +} + +/* + * Synchronous wrapper around bdrv_co_get_block_status(). + * + * See bdrv_co_get_block_status() for details. + */ +int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, int *pnum) +{ + Coroutine *co; + BdrvCoGetBlockStatusData data = { + .bs = bs, + .sector_num = sector_num, + .nb_sectors = nb_sectors, + .pnum = pnum, + .done = false, + }; + + if (qemu_in_coroutine()) { + /* Fast-path if already in coroutine context */ + bdrv_get_block_status_co_entry(&data); + } else { + AioContext *aio_context = bdrv_get_aio_context(bs); + + co = qemu_coroutine_create(bdrv_get_block_status_co_entry); + qemu_coroutine_enter(co, &data); + while (!data.done) { + aio_poll(aio_context, true); + } + } + return data.ret; +} + +int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, int *pnum) +{ + int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum); + if (ret < 0) { + return ret; + } + return !!(ret & BDRV_BLOCK_ALLOCATED); +} + +/* + * Given an image chain: ... 
-> [BASE] -> [INTER1] -> [INTER2] -> [TOP] + * + * Return true if the given sector is allocated in any image between + * BASE and TOP (inclusive). BASE can be NULL to check if the given + * sector is allocated in any image of the chain. Return false otherwise. + * + * 'pnum' is set to the number of sectors (including and immediately following + * the specified sector) that are known to be in the same + * allocated/unallocated state. + * + */ +int bdrv_is_allocated_above(BlockDriverState *top, + BlockDriverState *base, + int64_t sector_num, + int nb_sectors, int *pnum) +{ + BlockDriverState *intermediate; + int ret, n = nb_sectors; + + intermediate = top; + while (intermediate && intermediate != base) { + int pnum_inter; + ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors, + &pnum_inter); + if (ret < 0) { + return ret; + } else if (ret) { + *pnum = pnum_inter; + return 1; + } + + /* + * [sector_num, nb_sectors] is unallocated on top but intermediate + * might have + * + * [sector_num+x, nr_sectors] allocated. + */ + if (n > pnum_inter && + (intermediate == top || + sector_num + pnum_inter < intermediate->total_sectors)) { + n = pnum_inter; + } + + intermediate = intermediate->backing_hd; + } + + *pnum = n; + return 0; +} + +int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors) +{ + BlockDriver *drv = bs->drv; + int ret; + + if (!drv) { + return -ENOMEDIUM; + } + if (!drv->bdrv_write_compressed) { + return -ENOTSUP; + } + ret = bdrv_check_request(bs, sector_num, nb_sectors); + if (ret < 0) { + return ret; + } + + assert(QLIST_EMPTY(&bs->dirty_bitmaps)); + + return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors); +} + +int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, + int64_t pos, int size) +{ + QEMUIOVector qiov; + struct iovec iov = { + .iov_base = (void *) buf, + .iov_len = size, + }; + + qemu_iovec_init_external(&qiov, &iov, 1); + return bdrv_writev_vmstate(bs, &qiov, pos); +} + +int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) +{ + BlockDriver *drv = bs->drv; + + if (!drv) { + return -ENOMEDIUM; + } else if (drv->bdrv_save_vmstate) { + return drv->bdrv_save_vmstate(bs, qiov, pos); + } else if (bs->file) { + return bdrv_writev_vmstate(bs->file, qiov, pos); + } + + return -ENOTSUP; +} + +int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, + int64_t pos, int size) +{ + BlockDriver *drv = bs->drv; + if (!drv) + return -ENOMEDIUM; + if (drv->bdrv_load_vmstate) + return drv->bdrv_load_vmstate(bs, buf, pos, size); + if (bs->file) + return bdrv_load_vmstate(bs->file, buf, pos, size); + return -ENOTSUP; +} + +/**************************************************************/ +/* async I/Os */ + +BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num, + QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque); + + return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, + cb, opaque, false); +} + +BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, + QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque); + + return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, + cb, opaque, true); +} + +BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, BdrvRequestFlags flags, + BlockCompletionFunc *cb, void *opaque) +{ + 
trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque); + + return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors, + BDRV_REQ_ZERO_WRITE | flags, + cb, opaque, true); +} + + +typedef struct MultiwriteCB { + int error; + int num_requests; + int num_callbacks; + struct { + BlockCompletionFunc *cb; + void *opaque; + QEMUIOVector *free_qiov; + } callbacks[]; +} MultiwriteCB; + +static void multiwrite_user_cb(MultiwriteCB *mcb) +{ + int i; + + for (i = 0; i < mcb->num_callbacks; i++) { + mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error); + if (mcb->callbacks[i].free_qiov) { + qemu_iovec_destroy(mcb->callbacks[i].free_qiov); + } + g_free(mcb->callbacks[i].free_qiov); + } +} + +static void multiwrite_cb(void *opaque, int ret) +{ + MultiwriteCB *mcb = opaque; + + trace_multiwrite_cb(mcb, ret); + + if (ret < 0 && !mcb->error) { + mcb->error = ret; + } + + mcb->num_requests--; + if (mcb->num_requests == 0) { + multiwrite_user_cb(mcb); + g_free(mcb); + } +} + +static int multiwrite_req_compare(const void *a, const void *b) +{ + const BlockRequest *req1 = a, *req2 = b; + + /* + * Note that we can't simply subtract req2->sector from req1->sector + * here as that could overflow the return value. + */ + if (req1->sector > req2->sector) { + return 1; + } else if (req1->sector < req2->sector) { + return -1; + } else { + return 0; + } +} + +/* + * Takes a bunch of requests and tries to merge them. Returns the number of + * requests that remain after merging. + */ +static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs, + int num_reqs, MultiwriteCB *mcb) +{ + int i, outidx; + + // Sort requests by start sector + qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare); + + // Check if adjacent requests touch the same clusters. If so, combine them, + // filling up gaps with zero sectors. + outidx = 0; + for (i = 1; i < num_reqs; i++) { + int merge = 0; + int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors; + + // Handle exactly sequential writes and overlapping writes. + if (reqs[i].sector <= oldreq_last) { + merge = 1; + } + + if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) { + merge = 0; + } + + if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors + + reqs[i].nb_sectors > bs->bl.max_transfer_length) { + merge = 0; + } + + if (merge) { + size_t size; + QEMUIOVector *qiov = g_malloc0(sizeof(*qiov)); + qemu_iovec_init(qiov, + reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1); + + // Add the first request to the merged one. If the requests are + // overlapping, drop the last sectors of the first request. 
+ size = (reqs[i].sector - reqs[outidx].sector) << 9; + qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size); + + // We should not need to add any zeros between the two requests + assert (reqs[i].sector <= oldreq_last); + + // Add the second request + qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size); + + // Add tail of first request, if necessary + if (qiov->size < reqs[outidx].qiov->size) { + qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size, + reqs[outidx].qiov->size - qiov->size); + } + + reqs[outidx].nb_sectors = qiov->size >> 9; + reqs[outidx].qiov = qiov; + + mcb->callbacks[i].free_qiov = reqs[outidx].qiov; + } else { + outidx++; + reqs[outidx].sector = reqs[i].sector; + reqs[outidx].nb_sectors = reqs[i].nb_sectors; + reqs[outidx].qiov = reqs[i].qiov; + } + } + + block_acct_merge_done(&bs->stats, BLOCK_ACCT_WRITE, num_reqs - outidx - 1); + + return outidx + 1; +} + +/* + * Submit multiple AIO write requests at once. + * + * On success, the function returns 0 and all requests in the reqs array have + * been submitted. In the error case this function returns -1, and any of the + * requests may or may not be submitted yet. In particular, this means that the + * callback will be called for some of the requests, for others it won't. The + * caller must check the error field of the BlockRequest to wait for the right + * callbacks (if error != 0, no callback will be called). + * + * The implementation may modify the contents of the reqs array, e.g. to merge + * requests. However, the fields opaque and error are left unmodified as they + * are used to signal failure for a single request to the caller. + */ +int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs) +{ + MultiwriteCB *mcb; + int i; + + /* don't submit writes if we don't have a medium */ + if (bs->drv == NULL) { + for (i = 0; i < num_reqs; i++) { + reqs[i].error = -ENOMEDIUM; + } + return -1; + } + + if (num_reqs == 0) { + return 0; + } + + // Create MultiwriteCB structure + mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks)); + mcb->num_requests = 0; + mcb->num_callbacks = num_reqs; + + for (i = 0; i < num_reqs; i++) { + mcb->callbacks[i].cb = reqs[i].cb; + mcb->callbacks[i].opaque = reqs[i].opaque; + } + + // Check for mergeable requests + num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb); + + trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs); + + /* Run the aio requests. */ + mcb->num_requests = num_reqs; + for (i = 0; i < num_reqs; i++) { + bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov, + reqs[i].nb_sectors, reqs[i].flags, + multiwrite_cb, mcb, + true); + } + + return 0; +} + +void bdrv_aio_cancel(BlockAIOCB *acb) +{ + qemu_aio_ref(acb); + bdrv_aio_cancel_async(acb); + while (acb->refcnt > 1) { + if (acb->aiocb_info->get_aio_context) { + aio_poll(acb->aiocb_info->get_aio_context(acb), true); + } else if (acb->bs) { + aio_poll(bdrv_get_aio_context(acb->bs), true); + } else { + abort(); + } + } + qemu_aio_unref(acb); +} + +/* Async version of aio cancel. The caller is not blocked if the acb implements + * cancel_async, otherwise we do nothing and let the request complete normally. + * In either case the completion callback must be called.
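 * A blocking caller composes this primitive with aio_poll(), which is
 * essentially what bdrv_aio_cancel() above does (simplified sketch that
 * ignores the aiocb_info->get_aio_context case):
 *
 *     qemu_aio_ref(acb);
 *     bdrv_aio_cancel_async(acb);
 *     while (acb->refcnt > 1) {
 *         aio_poll(bdrv_get_aio_context(acb->bs), true);
 *     }
 *     qemu_aio_unref(acb);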
*/ +void bdrv_aio_cancel_async(BlockAIOCB *acb) +{ + if (acb->aiocb_info->cancel_async) { + acb->aiocb_info->cancel_async(acb); + } +} + +/**************************************************************/ +/* async block device emulation */ + +typedef struct BlockAIOCBSync { + BlockAIOCB common; + QEMUBH *bh; + int ret; + /* vector translation state */ + QEMUIOVector *qiov; + uint8_t *bounce; + int is_write; +} BlockAIOCBSync; + +static const AIOCBInfo bdrv_em_aiocb_info = { + .aiocb_size = sizeof(BlockAIOCBSync), +}; + +static void bdrv_aio_bh_cb(void *opaque) +{ + BlockAIOCBSync *acb = opaque; + + if (!acb->is_write && acb->ret >= 0) { + qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size); + } + qemu_vfree(acb->bounce); + acb->common.cb(acb->common.opaque, acb->ret); + qemu_bh_delete(acb->bh); + acb->bh = NULL; + qemu_aio_unref(acb); +} + +static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, + int nb_sectors, + BlockCompletionFunc *cb, + void *opaque, + int is_write) + +{ + BlockAIOCBSync *acb; + + acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque); + acb->is_write = is_write; + acb->qiov = qiov; + acb->bounce = qemu_try_blockalign(bs, qiov->size); + acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb); + + if (acb->bounce == NULL) { + acb->ret = -ENOMEM; + } else if (is_write) { + qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size); + acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors); + } else { + acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors); + } + + qemu_bh_schedule(acb->bh); + + return &acb->common; +} + +static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0); +} + +static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1); +} + + +typedef struct BlockAIOCBCoroutine { + BlockAIOCB common; + BlockRequest req; + bool is_write; + bool need_bh; + bool *done; + QEMUBH* bh; +} BlockAIOCBCoroutine; + +static const AIOCBInfo bdrv_em_co_aiocb_info = { + .aiocb_size = sizeof(BlockAIOCBCoroutine), +}; + +static void bdrv_co_complete(BlockAIOCBCoroutine *acb) +{ + if (!acb->need_bh) { + acb->common.cb(acb->common.opaque, acb->req.error); + qemu_aio_unref(acb); + } +} + +static void bdrv_co_em_bh(void *opaque) +{ + BlockAIOCBCoroutine *acb = opaque; + + assert(!acb->need_bh); + qemu_bh_delete(acb->bh); + bdrv_co_complete(acb); +} + +static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb) +{ + acb->need_bh = false; + if (acb->req.error != -EINPROGRESS) { + BlockDriverState *bs = acb->common.bs; + + acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb); + qemu_bh_schedule(acb->bh); + } +} + +/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */ +static void coroutine_fn bdrv_co_do_rw(void *opaque) +{ + BlockAIOCBCoroutine *acb = opaque; + BlockDriverState *bs = acb->common.bs; + + if (!acb->is_write) { + acb->req.error = bdrv_co_do_readv(bs, acb->req.sector, + acb->req.nb_sectors, acb->req.qiov, acb->req.flags); + } else { + acb->req.error = bdrv_co_do_writev(bs, acb->req.sector, + acb->req.nb_sectors, acb->req.qiov, acb->req.flags); + } + + bdrv_co_complete(acb); +} + +static BlockAIOCB 
*bdrv_co_aio_rw_vector(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, + int nb_sectors, + BdrvRequestFlags flags, + BlockCompletionFunc *cb, + void *opaque, + bool is_write) +{ + Coroutine *co; + BlockAIOCBCoroutine *acb; + + acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); + acb->need_bh = true; + acb->req.error = -EINPROGRESS; + acb->req.sector = sector_num; + acb->req.nb_sectors = nb_sectors; + acb->req.qiov = qiov; + acb->req.flags = flags; + acb->is_write = is_write; + + co = qemu_coroutine_create(bdrv_co_do_rw); + qemu_coroutine_enter(co, acb); + + bdrv_co_maybe_schedule_bh(acb); + return &acb->common; +} + +static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque) +{ + BlockAIOCBCoroutine *acb = opaque; + BlockDriverState *bs = acb->common.bs; + + acb->req.error = bdrv_co_flush(bs); + bdrv_co_complete(acb); +} + +BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs, + BlockCompletionFunc *cb, void *opaque) +{ + trace_bdrv_aio_flush(bs, opaque); + + Coroutine *co; + BlockAIOCBCoroutine *acb; + + acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); + acb->need_bh = true; + acb->req.error = -EINPROGRESS; + + co = qemu_coroutine_create(bdrv_aio_flush_co_entry); + qemu_coroutine_enter(co, acb); + + bdrv_co_maybe_schedule_bh(acb); + return &acb->common; +} + +static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque) +{ + BlockAIOCBCoroutine *acb = opaque; + BlockDriverState *bs = acb->common.bs; + + acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors); + bdrv_co_complete(acb); +} + +BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + Coroutine *co; + BlockAIOCBCoroutine *acb; + + trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque); + + acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); + acb->need_bh = true; + acb->req.error = -EINPROGRESS; + acb->req.sector = sector_num; + acb->req.nb_sectors = nb_sectors; + co = qemu_coroutine_create(bdrv_aio_discard_co_entry); + qemu_coroutine_enter(co, acb); + + bdrv_co_maybe_schedule_bh(acb); + return &acb->common; +} + +void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs, + BlockCompletionFunc *cb, void *opaque) +{ + BlockAIOCB *acb; + + acb = g_slice_alloc(aiocb_info->aiocb_size); + acb->aiocb_info = aiocb_info; + acb->bs = bs; + acb->cb = cb; + acb->opaque = opaque; + acb->refcnt = 1; + return acb; +} + +void qemu_aio_ref(void *p) +{ + BlockAIOCB *acb = p; + acb->refcnt++; +} + +void qemu_aio_unref(void *p) +{ + BlockAIOCB *acb = p; + assert(acb->refcnt > 0); + if (--acb->refcnt == 0) { + g_slice_free1(acb->aiocb_info->aiocb_size, acb); + } +} + +/**************************************************************/ +/* Coroutine block device emulation */ + +typedef struct CoroutineIOCompletion { + Coroutine *coroutine; + int ret; +} CoroutineIOCompletion; + +static void bdrv_co_io_em_complete(void *opaque, int ret) +{ + CoroutineIOCompletion *co = opaque; + + co->ret = ret; + qemu_coroutine_enter(co->coroutine, NULL); +} + +static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *iov, + bool is_write) +{ + CoroutineIOCompletion co = { + .coroutine = qemu_coroutine_self(), + }; + BlockAIOCB *acb; + + if (is_write) { + acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors, + bdrv_co_io_em_complete, &co); + } else { + acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors, + bdrv_co_io_em_complete, &co); + 
} + + trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb); + if (!acb) { + return -EIO; + } + qemu_coroutine_yield(); + + return co.ret; +} + +static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *iov) +{ + return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false); +} + +static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *iov) +{ + return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true); +} + +static void coroutine_fn bdrv_flush_co_entry(void *opaque) +{ + RwCo *rwco = opaque; + + rwco->ret = bdrv_co_flush(rwco->bs); +} + +int coroutine_fn bdrv_co_flush(BlockDriverState *bs) +{ + int ret; + + if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) { + return 0; + } + + /* Write back cached data to the OS even with cache=unsafe */ + BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS); + if (bs->drv->bdrv_co_flush_to_os) { + ret = bs->drv->bdrv_co_flush_to_os(bs); + if (ret < 0) { + return ret; + } + } + + /* But don't actually force it to the disk with cache=unsafe */ + if (bs->open_flags & BDRV_O_NO_FLUSH) { + goto flush_parent; + } + + BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK); + if (bs->drv->bdrv_co_flush_to_disk) { + ret = bs->drv->bdrv_co_flush_to_disk(bs); + } else if (bs->drv->bdrv_aio_flush) { + BlockAIOCB *acb; + CoroutineIOCompletion co = { + .coroutine = qemu_coroutine_self(), + }; + + acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co); + if (acb == NULL) { + ret = -EIO; + } else { + qemu_coroutine_yield(); + ret = co.ret; + } + } else { + /* + * Some block drivers always operate in either writethrough or unsafe + * mode and don't support bdrv_flush therefore. Usually qemu doesn't + * know how the server works (because the behaviour is hardcoded or + * depends on server-side configuration), so we can't ensure that + * everything is safe on disk. Returning an error doesn't work because + * that would break guests even if the server operates in writethrough + * mode. + * + * Let's hope the user knows what he's doing. + */ + ret = 0; + } + if (ret < 0) { + return ret; + } + + /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH + * in the case of cache=unsafe, so there are no useless flushes. + */ +flush_parent: + return bdrv_co_flush(bs->file); +} + +int bdrv_flush(BlockDriverState *bs) +{ + Coroutine *co; + RwCo rwco = { + .bs = bs, + .ret = NOT_DONE, + }; + + if (qemu_in_coroutine()) { + /* Fast-path if already in coroutine context */ + bdrv_flush_co_entry(&rwco); + } else { + AioContext *aio_context = bdrv_get_aio_context(bs); + + co = qemu_coroutine_create(bdrv_flush_co_entry); + qemu_coroutine_enter(co, &rwco); + while (rwco.ret == NOT_DONE) { + aio_poll(aio_context, true); + } + } + + return rwco.ret; +} + +typedef struct DiscardCo { + BlockDriverState *bs; + int64_t sector_num; + int nb_sectors; + int ret; +} DiscardCo; +static void coroutine_fn bdrv_discard_co_entry(void *opaque) +{ + DiscardCo *rwco = opaque; + + rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors); +} + +int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, + int nb_sectors) +{ + int max_discard, ret; + + if (!bs->drv) { + return -ENOMEDIUM; + } + + ret = bdrv_check_request(bs, sector_num, nb_sectors); + if (ret < 0) { + return ret; + } else if (bs->read_only) { + return -EROFS; + } + + bdrv_reset_dirty(bs, sector_num, nb_sectors); + + /* Do nothing if disabled. 
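 * (that is, the image was opened without BDRV_O_UNMAP, which is
 * typically the case unless discard=unmap was requested)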
*/ + if (!(bs->open_flags & BDRV_O_UNMAP)) { + return 0; + } + + if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) { + return 0; + } + + max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS); + while (nb_sectors > 0) { + int ret; + int num = nb_sectors; + + /* align request */ + if (bs->bl.discard_alignment && + num >= bs->bl.discard_alignment && + sector_num % bs->bl.discard_alignment) { + if (num > bs->bl.discard_alignment) { + num = bs->bl.discard_alignment; + } + num -= sector_num % bs->bl.discard_alignment; + } + + /* limit request size */ + if (num > max_discard) { + num = max_discard; + } + + if (bs->drv->bdrv_co_discard) { + ret = bs->drv->bdrv_co_discard(bs, sector_num, num); + } else { + BlockAIOCB *acb; + CoroutineIOCompletion co = { + .coroutine = qemu_coroutine_self(), + }; + + acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors, + bdrv_co_io_em_complete, &co); + if (acb == NULL) { + return -EIO; + } else { + qemu_coroutine_yield(); + ret = co.ret; + } + } + if (ret && ret != -ENOTSUP) { + return ret; + } + + sector_num += num; + nb_sectors -= num; + } + return 0; +} + +int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) +{ + Coroutine *co; + DiscardCo rwco = { + .bs = bs, + .sector_num = sector_num, + .nb_sectors = nb_sectors, + .ret = NOT_DONE, + }; + + if (qemu_in_coroutine()) { + /* Fast-path if already in coroutine context */ + bdrv_discard_co_entry(&rwco); + } else { + AioContext *aio_context = bdrv_get_aio_context(bs); + + co = qemu_coroutine_create(bdrv_discard_co_entry); + qemu_coroutine_enter(co, &rwco); + while (rwco.ret == NOT_DONE) { + aio_poll(aio_context, true); + } + } + + return rwco.ret; +} + +/* needed for generic scsi interface */ + +int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) +{ + BlockDriver *drv = bs->drv; + + if (drv && drv->bdrv_ioctl) + return drv->bdrv_ioctl(bs, req, buf); + return -ENOTSUP; +} + +BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs, + unsigned long int req, void *buf, + BlockCompletionFunc *cb, void *opaque) +{ + BlockDriver *drv = bs->drv; + + if (drv && drv->bdrv_aio_ioctl) + return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque); + return NULL; +} + +void *qemu_blockalign(BlockDriverState *bs, size_t size) +{ + return qemu_memalign(bdrv_opt_mem_align(bs), size); +} + +void *qemu_blockalign0(BlockDriverState *bs, size_t size) +{ + return memset(qemu_blockalign(bs, size), 0, size); +} + +void *qemu_try_blockalign(BlockDriverState *bs, size_t size) +{ + size_t align = bdrv_opt_mem_align(bs); + + /* Ensure that NULL is never returned on success */ + assert(align > 0); + if (size == 0) { + size = align; + } + + return qemu_try_memalign(align, size); +} + +void *qemu_try_blockalign0(BlockDriverState *bs, size_t size) +{ + void *mem = qemu_try_blockalign(bs, size); + + if (mem) { + memset(mem, 0, size); + } + + return mem; +} + +/* + * Check if all memory in this vector is sector aligned. 
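 * More precisely, the base address and length of every iovec element are
 * checked against bdrv_opt_mem_align(bs), which may be larger than a
 * sector (e.g. for O_DIRECT-backed files), as the loop below shows.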
+ */ +bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov) +{ + int i; + size_t alignment = bdrv_opt_mem_align(bs); + + for (i = 0; i < qiov->niov; i++) { + if ((uintptr_t) qiov->iov[i].iov_base % alignment) { + return false; + } + if (qiov->iov[i].iov_len % alignment) { + return false; + } + } + + return true; +} + +void bdrv_add_before_write_notifier(BlockDriverState *bs, + NotifierWithReturn *notifier) +{ + notifier_with_return_list_add(&bs->before_write_notifiers, notifier); +} + +void bdrv_io_plug(BlockDriverState *bs) +{ + BlockDriver *drv = bs->drv; + if (drv && drv->bdrv_io_plug) { + drv->bdrv_io_plug(bs); + } else if (bs->file) { + bdrv_io_plug(bs->file); + } +} + +void bdrv_io_unplug(BlockDriverState *bs) +{ + BlockDriver *drv = bs->drv; + if (drv && drv->bdrv_io_unplug) { + drv->bdrv_io_unplug(bs); + } else if (bs->file) { + bdrv_io_unplug(bs->file); + } +} + +void bdrv_flush_io_queue(BlockDriverState *bs) +{ + BlockDriver *drv = bs->drv; + if (drv && drv->bdrv_flush_io_queue) { + drv->bdrv_flush_io_queue(bs); + } else if (bs->file) { + bdrv_flush_io_queue(bs->file); + } +} diff --git a/block/iscsi.c b/block/iscsi.c index ba33290000..8fca1d32cb 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -2,7 +2,7 @@ * QEMU Block driver for iSCSI images * * Copyright (c) 2010-2011 Ronnie Sahlberg <ronniesahlberg@gmail.com> - * Copyright (c) 2012-2014 Peter Lieven <pl@kamp.de> + * Copyright (c) 2012-2015 Peter Lieven <pl@kamp.de> * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -57,9 +57,6 @@ typedef struct IscsiLun { int events; QEMUTimer *nop_timer; QEMUTimer *event_timer; - uint8_t lbpme; - uint8_t lbprz; - uint8_t has_write_same; struct scsi_inquiry_logical_block_provisioning lbp; struct scsi_inquiry_block_limits bl; unsigned char *zeroblock; @@ -67,6 +64,11 @@ typedef struct IscsiLun { int cluster_sectors; bool use_16_for_rw; bool write_protected; + bool lbpme; + bool lbprz; + bool dpofua; + bool has_write_same; + bool force_next_flush; } IscsiLun; typedef struct IscsiTask { @@ -79,6 +81,7 @@ typedef struct IscsiTask { QEMUBH *bh; IscsiLun *iscsilun; QEMUTimer retry_timer; + bool force_next_flush; } IscsiTask; typedef struct IscsiAIOCB { @@ -100,7 +103,7 @@ typedef struct IscsiAIOCB { #define NOP_INTERVAL 5000 #define MAX_NOP_FAILURES 3 #define ISCSI_CMD_RETRIES ARRAY_SIZE(iscsi_retry_times) -static const unsigned iscsi_retry_times[] = {8, 32, 128, 512, 2048}; +static const unsigned iscsi_retry_times[] = {8, 32, 128, 512, 2048, 8192, 32768}; /* this threshold is a trade-off knob to choose between * the potential additional overhead of an extra GET_LBA_STATUS request @@ -183,10 +186,13 @@ iscsi_co_generic_cb(struct iscsi_context *iscsi, int status, iTask->do_retry = 1; goto out; } - if (status == SCSI_STATUS_BUSY) { + /* status 0x28 is SCSI_TASK_SET_FULL. It was first introduced + * in libiscsi 1.10.0. 
Hardcode this value here to avoid + * the need to bump the libiscsi requirement to 1.10.0 */ + if (status == SCSI_STATUS_BUSY || status == 0x28) { unsigned retry_time = exp_random(iscsi_retry_times[iTask->retries - 1]); - error_report("iSCSI Busy (retry #%u in %u ms): %s", + error_report("iSCSI Busy/TaskSetFull (retry #%u in %u ms): %s", iTask->retries, retry_time, iscsi_get_error(iscsi)); aio_timer_init(iTask->iscsilun->aio_context, @@ -199,6 +205,8 @@ iscsi_co_generic_cb(struct iscsi_context *iscsi, int status, } } error_report("iSCSI Failure: %s", iscsi_get_error(iscsi)); + } else { + iTask->iscsilun->force_next_flush |= iTask->force_next_flush; } out: @@ -369,6 +377,7 @@ static int coroutine_fn iscsi_co_writev(BlockDriverState *bs, struct IscsiTask iTask; uint64_t lba; uint32_t num_sectors; + int fua; if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { return -EINVAL; @@ -384,15 +393,17 @@ static int coroutine_fn iscsi_co_writev(BlockDriverState *bs, num_sectors = sector_qemu2lun(nb_sectors, iscsilun); iscsi_co_init_iscsitask(iscsilun, &iTask); retry: + fua = iscsilun->dpofua && !bs->enable_write_cache; + iTask.force_next_flush = !fua; if (iscsilun->use_16_for_rw) { iTask.task = iscsi_write16_task(iscsilun->iscsi, iscsilun->lun, lba, NULL, num_sectors * iscsilun->block_size, - iscsilun->block_size, 0, 0, 0, 0, 0, + iscsilun->block_size, 0, 0, fua, 0, 0, iscsi_co_generic_cb, &iTask); } else { iTask.task = iscsi_write10_task(iscsilun->iscsi, iscsilun->lun, lba, NULL, num_sectors * iscsilun->block_size, - iscsilun->block_size, 0, 0, 0, 0, 0, + iscsilun->block_size, 0, 0, fua, 0, 0, iscsi_co_generic_cb, &iTask); } if (iTask.task == NULL) { @@ -460,7 +471,7 @@ static int64_t coroutine_fn iscsi_co_get_block_status(BlockDriverState *bs, *pnum = nb_sectors; /* LUN does not support logical block provisioning */ - if (iscsilun->lbpme == 0) { + if (!iscsilun->lbpme) { goto out; } @@ -620,8 +631,12 @@ static int coroutine_fn iscsi_co_flush(BlockDriverState *bs) return 0; } - iscsi_co_init_iscsitask(iscsilun, &iTask); + if (!iscsilun->force_next_flush) { + return 0; + } + iscsilun->force_next_flush = false; + iscsi_co_init_iscsitask(iscsilun, &iTask); retry: if (iscsi_synchronizecache10_task(iscsilun->iscsi, iscsilun->lun, 0, 0, 0, 0, iscsi_co_generic_cb, &iTask) == NULL) { @@ -917,6 +932,7 @@ coroutine_fn iscsi_co_write_zeroes(BlockDriverState *bs, int64_t sector_num, } iscsi_co_init_iscsitask(iscsilun, &iTask); + iTask.force_next_flush = true; retry: if (use_16_for_ws) { iTask.task = iscsi_writesame16_task(iscsilun->iscsi, iscsilun->lun, lba, @@ -1121,8 +1137,8 @@ static void iscsi_readcapacity_sync(IscsiLun *iscsilun, Error **errp) } else { iscsilun->block_size = rc16->block_length; iscsilun->num_blocks = rc16->returned_lba + 1; - iscsilun->lbpme = rc16->lbpme; - iscsilun->lbprz = rc16->lbprz; + iscsilun->lbpme = !!rc16->lbpme; + iscsilun->lbprz = !!rc16->lbprz; iscsilun->use_16_for_rw = (rc16->returned_lba > 0xffffffff); } } @@ -1253,11 +1269,12 @@ static void iscsi_attach_aio_context(BlockDriverState *bs, iscsi_timed_set_events, iscsilun); } -static bool iscsi_is_write_protected(IscsiLun *iscsilun) +static void iscsi_modesense_sync(IscsiLun *iscsilun) { struct scsi_task *task; struct scsi_mode_sense *ms = NULL; - bool wrprotected = false; + iscsilun->write_protected = false; + iscsilun->dpofua = false; task = iscsi_modesense6_sync(iscsilun->iscsi, iscsilun->lun, 1, SCSI_MODESENSE_PC_CURRENT, @@ -1278,13 +1295,13 @@ static bool iscsi_is_write_protected(IscsiLun *iscsilun) 
iscsi_get_error(iscsilun->iscsi)); goto out; } - wrprotected = ms->device_specific_parameter & 0x80; + iscsilun->write_protected = ms->device_specific_parameter & 0x80; + iscsilun->dpofua = ms->device_specific_parameter & 0x10; out: if (task) { scsi_free_scsi_task(task); } - return wrprotected; } /* @@ -1403,7 +1420,8 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags, scsi_free_scsi_task(task); task = NULL; - iscsilun->write_protected = iscsi_is_write_protected(iscsilun); + iscsi_modesense_sync(iscsilun); + /* Check the write protect flag of the LUN if we want to write */ if (iscsilun->type == TYPE_DISK && (flags & BDRV_O_RDWR) && iscsilun->write_protected) { @@ -1481,7 +1499,7 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags, iscsilun->bl.opt_unmap_gran * iscsilun->block_size <= 16 * 1024 * 1024) { iscsilun->cluster_sectors = (iscsilun->bl.opt_unmap_gran * iscsilun->block_size) >> BDRV_SECTOR_BITS; - if (iscsilun->lbprz && !(bs->open_flags & BDRV_O_NOCACHE)) { + if (iscsilun->lbprz) { iscsilun->allocationmap = iscsi_allocationmap_init(iscsilun); if (iscsilun->allocationmap == NULL) { ret = -ENOMEM; @@ -1501,6 +1519,9 @@ out: if (ret) { if (iscsi != NULL) { + if (iscsi_is_logged_in(iscsi)) { + iscsi_logout_sync(iscsi); + } iscsi_destroy_context(iscsi); } memset(iscsilun, 0, sizeof(IscsiLun)); @@ -1514,6 +1535,9 @@ static void iscsi_close(BlockDriverState *bs) struct iscsi_context *iscsi = iscsilun->iscsi; iscsi_detach_aio_context(bs); + if (iscsi_is_logged_in(iscsi)) { + iscsi_logout_sync(iscsi); + } iscsi_destroy_context(iscsi); g_free(iscsilun->zeroblock); g_free(iscsilun->allocationmap); @@ -1649,7 +1673,7 @@ out: static int iscsi_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) { IscsiLun *iscsilun = bs->opaque; - bdi->unallocated_blocks_are_zero = !!iscsilun->lbprz; + bdi->unallocated_blocks_are_zero = iscsilun->lbprz; bdi->can_write_zeroes_with_unmap = iscsilun->lbprz && iscsilun->lbp.lbpws; bdi->cluster_size = iscsilun->cluster_sectors * BDRV_SECTOR_SIZE; return 0; diff --git a/block/mirror.c b/block/mirror.c index 405616422b..58f391a6d6 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -125,11 +125,9 @@ static void mirror_write_complete(void *opaque, int ret) MirrorOp *op = opaque; MirrorBlockJob *s = op->s; if (ret < 0) { - BlockDriverState *source = s->common.bs; BlockErrorAction action; - bdrv_set_dirty_bitmap(source, s->dirty_bitmap, op->sector_num, - op->nb_sectors); + bdrv_set_dirty_bitmap(s->dirty_bitmap, op->sector_num, op->nb_sectors); action = mirror_error_action(s, false, -ret); if (action == BLOCK_ERROR_ACTION_REPORT && s->ret >= 0) { s->ret = ret; @@ -143,11 +141,9 @@ static void mirror_read_complete(void *opaque, int ret) MirrorOp *op = opaque; MirrorBlockJob *s = op->s; if (ret < 0) { - BlockDriverState *source = s->common.bs; BlockErrorAction action; - bdrv_set_dirty_bitmap(source, s->dirty_bitmap, op->sector_num, - op->nb_sectors); + bdrv_set_dirty_bitmap(s->dirty_bitmap, op->sector_num, op->nb_sectors); action = mirror_error_action(s, true, -ret); if (action == BLOCK_ERROR_ACTION_REPORT && s->ret >= 0) { s->ret = ret; @@ -170,10 +166,9 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) s->sector_num = hbitmap_iter_next(&s->hbi); if (s->sector_num < 0) { - bdrv_dirty_iter_init(source, s->dirty_bitmap, &s->hbi); + bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi); s->sector_num = hbitmap_iter_next(&s->hbi); - trace_mirror_restart_iter(s, - bdrv_get_dirty_count(source, s->dirty_bitmap)); + 
trace_mirror_restart_iter(s, bdrv_get_dirty_count(s->dirty_bitmap)); assert(s->sector_num >= 0); } @@ -288,8 +283,7 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) next_sector += sectors_per_chunk; } - bdrv_reset_dirty_bitmap(source, s->dirty_bitmap, sector_num, - nb_sectors); + bdrv_reset_dirty_bitmap(s->dirty_bitmap, sector_num, nb_sectors); /* Copy the dirty cluster. */ s->in_flight++; @@ -446,7 +440,7 @@ static void coroutine_fn mirror_run(void *opaque) assert(n > 0); if (ret == 1) { - bdrv_set_dirty_bitmap(bs, s->dirty_bitmap, sector_num, n); + bdrv_set_dirty_bitmap(s->dirty_bitmap, sector_num, n); sector_num = next; } else { sector_num += n; @@ -454,7 +448,7 @@ static void coroutine_fn mirror_run(void *opaque) } } - bdrv_dirty_iter_init(bs, s->dirty_bitmap, &s->hbi); + bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi); last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); for (;;) { uint64_t delay_ns = 0; @@ -466,7 +460,7 @@ static void coroutine_fn mirror_run(void *opaque) goto immediate_exit; } - cnt = bdrv_get_dirty_count(bs, s->dirty_bitmap); + cnt = bdrv_get_dirty_count(s->dirty_bitmap); /* s->common.offset contains the number of bytes already processed so * far, cnt is the number of dirty sectors remaining and * s->sectors_in_flight is the number of sectors currently being @@ -475,7 +469,7 @@ static void coroutine_fn mirror_run(void *opaque) (cnt + s->sectors_in_flight) * BDRV_SECTOR_SIZE; /* Note that even when no rate limit is applied we need to yield - * periodically with no pending I/O so that qemu_aio_flush() returns. + * periodically with no pending I/O so that bdrv_drain_all() returns. * We do so every SLICE_TIME nanoseconds, or when there is an error, * or when the source is clean, whichever comes first. */ @@ -488,9 +482,6 @@ static void coroutine_fn mirror_run(void *opaque) continue; } else if (cnt != 0) { delay_ns = mirror_iteration(s); - if (delay_ns == 0) { - continue; - } } } @@ -516,7 +507,7 @@ static void coroutine_fn mirror_run(void *opaque) should_complete = s->should_complete || block_job_is_cancelled(&s->common); - cnt = bdrv_get_dirty_count(bs, s->dirty_bitmap); + cnt = bdrv_get_dirty_count(s->dirty_bitmap); } } @@ -531,7 +522,7 @@ static void coroutine_fn mirror_run(void *opaque) */ trace_mirror_before_drain(s, cnt); bdrv_drain(bs); - cnt = bdrv_get_dirty_count(bs, s->dirty_bitmap); + cnt = bdrv_get_dirty_count(s->dirty_bitmap); } ret = 0; @@ -634,7 +625,7 @@ static void mirror_complete(BlockJob *job, Error **errp) } s->should_complete = true; - block_job_resume(job); + block_job_enter(&s->common); } static const BlockJobDriver mirror_job_driver = { @@ -656,7 +647,7 @@ static const BlockJobDriver commit_active_job_driver = { static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target, const char *replaces, - int64_t speed, int64_t granularity, + int64_t speed, uint32_t granularity, int64_t buf_size, BlockdevOnError on_source_error, BlockdevOnError on_target_error, @@ -668,15 +659,7 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target, MirrorBlockJob *s; if (granularity == 0) { - /* Choose the default granularity based on the target file's cluster - * size, clamped between 4k and 64k. 
*/ - BlockDriverInfo bdi; - if (bdrv_get_info(target, &bdi) >= 0 && bdi.cluster_size != 0) { - granularity = MAX(4096, bdi.cluster_size); - granularity = MIN(65536, granularity); - } else { - granularity = 65536; - } + granularity = bdrv_get_default_bitmap_granularity(target); } assert ((granularity & (granularity - 1)) == 0); @@ -703,7 +686,7 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target, s->granularity = granularity; s->buf_size = MAX(buf_size, granularity); - s->dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity, errp); + s->dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp); if (!s->dirty_bitmap) { return; } @@ -717,7 +700,7 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target, void mirror_start(BlockDriverState *bs, BlockDriverState *target, const char *replaces, - int64_t speed, int64_t granularity, int64_t buf_size, + int64_t speed, uint32_t granularity, int64_t buf_size, MirrorSyncMode mode, BlockdevOnError on_source_error, BlockdevOnError on_target_error, BlockCompletionFunc *cb, @@ -726,6 +709,10 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target, bool is_none_mode; BlockDriverState *base; + if (mode == MIRROR_SYNC_MODE_DIRTY_BITMAP) { + error_setg(errp, "Sync mode 'dirty-bitmap' not supported"); + return; + } is_none_mode = mode == MIRROR_SYNC_MODE_NONE; base = mode == MIRROR_SYNC_MODE_TOP ? bs->backing_hd : NULL; mirror_start_job(bs, target, replaces, diff --git a/block/null.c b/block/null.c index ec2bd27a4b..7d083233fb 100644 --- a/block/null.c +++ b/block/null.c @@ -12,8 +12,11 @@ #include "block/block_int.h" +#define NULL_OPT_LATENCY "latency-ns" + typedef struct { int64_t length; + int64_t latency_ns; } BDRVNullState; static QemuOptsList runtime_opts = { @@ -30,6 +33,12 @@ static QemuOptsList runtime_opts = { .type = QEMU_OPT_SIZE, .help = "size of the null block", }, + { + .name = NULL_OPT_LATENCY, + .type = QEMU_OPT_NUMBER, + .help = "nanoseconds (approximated) to wait " + "before completing request", + }, { /* end of list */ } }, }; @@ -39,13 +48,20 @@ static int null_file_open(BlockDriverState *bs, QDict *options, int flags, { QemuOpts *opts; BDRVNullState *s = bs->opaque; + int ret = 0; opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); qemu_opts_absorb_qdict(opts, options, &error_abort); s->length = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 1 << 30); + s->latency_ns = + qemu_opt_get_number(opts, NULL_OPT_LATENCY, 0); + if (s->latency_ns < 0) { + error_setg(errp, "latency-ns is invalid"); + ret = -EINVAL; + } qemu_opts_del(opts); - return 0; + return ret; } static void null_close(BlockDriverState *bs) @@ -58,28 +74,40 @@ static int64_t null_getlength(BlockDriverState *bs) return s->length; } +static coroutine_fn int null_co_common(BlockDriverState *bs) +{ + BDRVNullState *s = bs->opaque; + + if (s->latency_ns) { + co_aio_sleep_ns(bdrv_get_aio_context(bs), QEMU_CLOCK_REALTIME, + s->latency_ns); + } + return 0; +} + static coroutine_fn int null_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { - return 0; + return null_co_common(bs); } static coroutine_fn int null_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { - return 0; + return null_co_common(bs); } static coroutine_fn int null_co_flush(BlockDriverState *bs) { - return 0; + return null_co_common(bs); } typedef struct { BlockAIOCB common; QEMUBH *bh; + QEMUTimer timer; } NullAIOCB; static const AIOCBInfo null_aiocb_info = { @@ 
-94,15 +122,33 @@ static void null_bh_cb(void *opaque) qemu_aio_unref(acb); } +static void null_timer_cb(void *opaque) +{ + NullAIOCB *acb = opaque; + acb->common.cb(acb->common.opaque, 0); + timer_deinit(&acb->timer); + qemu_aio_unref(acb); +} + static inline BlockAIOCB *null_aio_common(BlockDriverState *bs, BlockCompletionFunc *cb, void *opaque) { NullAIOCB *acb; + BDRVNullState *s = bs->opaque; acb = qemu_aio_get(&null_aiocb_info, bs, cb, opaque); - acb->bh = aio_bh_new(bdrv_get_aio_context(bs), null_bh_cb, acb); - qemu_bh_schedule(acb->bh); + /* Only emulate latency after vcpu is running. */ + if (s->latency_ns) { + aio_timer_init(bdrv_get_aio_context(bs), &acb->timer, + QEMU_CLOCK_REALTIME, SCALE_NS, + null_timer_cb, acb); + timer_mod_ns(&acb->timer, + qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + s->latency_ns); + } else { + acb->bh = aio_bh_new(bdrv_get_aio_context(bs), null_bh_cb, acb); + qemu_bh_schedule(acb->bh); + } return &acb->common; } @@ -131,6 +177,12 @@ static BlockAIOCB *null_aio_flush(BlockDriverState *bs, return null_aio_common(bs, cb, opaque); } +static int null_reopen_prepare(BDRVReopenState *reopen_state, + BlockReopenQueue *queue, Error **errp) +{ + return 0; +} + static BlockDriver bdrv_null_co = { .format_name = "null-co", .protocol_name = "null-co", @@ -143,6 +195,7 @@ static BlockDriver bdrv_null_co = { .bdrv_co_readv = null_co_readv, .bdrv_co_writev = null_co_writev, .bdrv_co_flush_to_disk = null_co_flush, + .bdrv_reopen_prepare = null_reopen_prepare, }; static BlockDriver bdrv_null_aio = { @@ -157,6 +210,7 @@ static BlockDriver bdrv_null_aio = { .bdrv_aio_readv = null_aio_readv, .bdrv_aio_writev = null_aio_writev, .bdrv_aio_flush = null_aio_flush, + .bdrv_reopen_prepare = null_reopen_prepare, }; static void bdrv_null_init(void) diff --git a/block/qapi.c b/block/qapi.c index 8a19aed446..063dd1bc1f 100644 --- a/block/qapi.c +++ b/block/qapi.c @@ -31,8 +31,10 @@ #include "qapi/qmp/types.h" #include "sysemu/block-backend.h" -BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs) +BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs, Error **errp) { + ImageInfo **p_image_info; + BlockDriverState *bs0; BlockDeviceInfo *info = g_malloc0(sizeof(*info)); info->file = g_strdup(bs->filename); @@ -92,6 +94,25 @@ BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs) info->write_threshold = bdrv_write_threshold_get(bs); + bs0 = bs; + p_image_info = &info->image; + while (1) { + Error *local_err = NULL; + bdrv_query_image_info(bs0, p_image_info, &local_err); + if (local_err) { + error_propagate(errp, local_err); + qapi_free_BlockDeviceInfo(info); + return NULL; + } + if (bs0->drv && bs0->backing_hd) { + bs0 = bs0->backing_hd; + (*p_image_info)->has_backing_image = true; + p_image_info = &((*p_image_info)->backing_image); + } else { + break; + } + } + return info; } @@ -264,9 +285,6 @@ static void bdrv_query_info(BlockBackend *blk, BlockInfo **p_info, { BlockInfo *info = g_malloc0(sizeof(*info)); BlockDriverState *bs = blk_bs(blk); - BlockDriverState *bs0; - ImageInfo **p_image_info; - Error *local_err = NULL; info->device = g_strdup(blk_name(blk)); info->type = g_strdup("unknown"); info->locked = blk_dev_is_medium_locked(blk); @@ -289,23 +307,9 @@ static void bdrv_query_info(BlockBackend *blk, BlockInfo **p_info, if (bs->drv) { info->has_inserted = true; - info->inserted = bdrv_block_device_info(bs); - - bs0 = bs; - p_image_info = &info->inserted->image; - while (1) { - bdrv_query_image_info(bs0, p_image_info, &local_err); - if (local_err) { - 
error_propagate(errp, local_err); - goto err; - } - if (bs0->drv && bs0->backing_hd) { - bs0 = bs0->backing_hd; - (*p_image_info)->has_backing_image = true; - p_image_info = &((*p_image_info)->backing_image); - } else { - break; - } + info->inserted = bdrv_block_device_info(bs, errp); + if (info->inserted == NULL) { + goto err; } } diff --git a/block/qcow.c b/block/qcow.c index 055896910e..ab893284d2 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -124,7 +124,7 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags, snprintf(version, sizeof(version), "QCOW version %" PRIu32, header.version); error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, - bdrv_get_device_name(bs), "qcow", version); + bdrv_get_device_or_node_name(bs), "qcow", version); ret = -ENOTSUP; goto fail; } @@ -229,9 +229,9 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags, } /* Disable migration when qcow images are used */ - error_set(&s->migration_blocker, - QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, - "qcow", bdrv_get_device_name(bs), "live migration"); + error_setg(&s->migration_blocker, "The qcow format used by node '%s' " + "does not support live migration", + bdrv_get_device_or_node_name(bs)); migrate_add_blocker(s->migration_blocker); qemu_co_mutex_init(&s->lock); diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index 6cbae1d205..f47260b808 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -2450,7 +2450,7 @@ int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset, if (ret < 0) { return ret; } else if (ret > 0) { - int metadata_ol_bitnr = ffs(ret) - 1; + int metadata_ol_bitnr = ctz32(ret); assert(metadata_ol_bitnr < QCOW2_OL_MAX_BITNR); qcow2_signal_corruption(bs, true, offset, size, "Preventing invalid " diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c index 2aa9dcb1d1..17bb2119b2 100644 --- a/block/qcow2-snapshot.c +++ b/block/qcow2-snapshot.c @@ -351,10 +351,8 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) memset(sn, 0, sizeof(*sn)); - /* Generate an ID if it wasn't passed */ - if (sn_info->id_str[0] == '\0') { - find_new_snapshot_id(bs, sn_info->id_str, sizeof(sn_info->id_str)); - } + /* Generate an ID */ + find_new_snapshot_id(bs, sn_info->id_str, sizeof(sn_info->id_str)); /* Check that the ID is unique */ if (find_snapshot_by_id_and_name(bs, sn_info->id_str, NULL) >= 0) { diff --git a/block/qcow2.c b/block/qcow2.c index 316a8db22b..b9a72e39d4 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -208,7 +208,7 @@ static void GCC_FMT_ATTR(3, 4) report_unsupported(BlockDriverState *bs, va_end(ap); error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, - bdrv_get_device_name(bs), "qcow2", msg); + bdrv_get_device_or_node_name(bs), "qcow2", msg); } static void report_unsupported_feature(BlockDriverState *bs, @@ -1802,7 +1802,7 @@ static int qcow2_create2(const char *filename, int64_t total_size, { /* Calculate cluster_bits */ int cluster_bits; - cluster_bits = ffs(cluster_size) - 1; + cluster_bits = ctz32(cluster_size); if (cluster_bits < MIN_CLUSTER_BITS || cluster_bits > MAX_CLUSTER_BITS || (1 << cluster_bits) != cluster_size) { @@ -2110,7 +2110,7 @@ static int qcow2_create(const char *filename, QemuOpts *opts, Error **errp) goto finish; } - refcount_order = ffs(refcount_bits) - 1; + refcount_order = ctz32(refcount_bits); ret = qcow2_create2(filename, size, backing_file, backing_fmt, flags, cluster_size, prealloc, opts, version, refcount_order, @@ -2824,6 +2824,7 @@ void 
qcow2_signal_corruption(BlockDriverState *bs, bool fatal, int64_t offset, int64_t size, const char *message_format, ...) { BDRVQcowState *s = bs->opaque; + const char *node_name; char *message; va_list ap; @@ -2847,8 +2848,11 @@ void qcow2_signal_corruption(BlockDriverState *bs, bool fatal, int64_t offset, "corruption events will be suppressed\n", message); } - qapi_event_send_block_image_corrupted(bdrv_get_device_name(bs), message, - offset >= 0, offset, size >= 0, size, + node_name = bdrv_get_node_name(bs); + qapi_event_send_block_image_corrupted(bdrv_get_device_name(bs), + *node_name != '\0', node_name, + message, offset >= 0, offset, + size >= 0, size, fatal, &error_abort); g_free(message); diff --git a/block/qed.c b/block/qed.c index 892b13c806..5bbe069ce9 100644 --- a/block/qed.c +++ b/block/qed.c @@ -408,7 +408,7 @@ static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags, snprintf(buf, sizeof(buf), "%" PRIx64, s->header.features & ~QED_FEATURE_MASK); error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, - bdrv_get_device_name(bs), "QED", buf); + bdrv_get_device_or_node_name(bs), "QED", buf); return -ENOTSUP; } if (!qed_is_cluster_size_valid(s->header.cluster_size)) { @@ -436,9 +436,9 @@ static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags, s->table_nelems = (s->header.cluster_size * s->header.table_size) / sizeof(uint64_t); - s->l2_shift = ffs(s->header.cluster_size) - 1; + s->l2_shift = ctz32(s->header.cluster_size); s->l2_mask = s->table_nelems - 1; - s->l1_shift = s->l2_shift + ffs(s->table_nelems) - 1; + s->l1_shift = s->l2_shift + ctz32(s->table_nelems); /* Header size calculation must not overflow uint32_t */ if (s->header.header_size > UINT32_MAX / s->header.cluster_size) { diff --git a/block/quorum.c b/block/quorum.c index 437b12251d..f91ef75a84 100644 --- a/block/quorum.c +++ b/block/quorum.c @@ -226,10 +226,7 @@ static void quorum_report_bad(QuorumAIOCB *acb, char *node_name, int ret) static void quorum_report_failure(QuorumAIOCB *acb) { - const char *reference = bdrv_get_device_name(acb->common.bs)[0] ? - bdrv_get_device_name(acb->common.bs) : - acb->common.bs->node_name; - + const char *reference = bdrv_get_device_or_node_name(acb->common.bs); qapi_event_send_quorum_failure(reference, acb->sector_num, acb->nb_sectors, &error_abort); } diff --git a/block/rbd.c b/block/rbd.c index f3ab2ddd5a..fbe87e035b 100644 --- a/block/rbd.c +++ b/block/rbd.c @@ -325,7 +325,7 @@ static int qemu_rbd_create(const char *filename, QemuOpts *opts, Error **errp) error_setg(errp, "obj size too small"); return -EINVAL; } - obj_order = ffs(objsize) - 1; + obj_order = ctz32(objsize); } clientname = qemu_rbd_parse_clientname(conf, clientname_buf); diff --git a/block/sheepdog.c b/block/sheepdog.c index c14172cfa6..2d5f06a390 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -1716,7 +1716,7 @@ static int parse_block_size_shift(BDRVSheepdogState *s, QemuOpts *opt) if ((object_size - 1) & object_size) { /* not a power of 2? 
*/ return -EINVAL; } - obj_order = ffs(object_size) - 1; + obj_order = ctz32(object_size); if (obj_order < 20 || obj_order > 31) { return -EINVAL; } diff --git a/block/snapshot.c b/block/snapshot.c index 698e1a1d58..50ae610139 100644 --- a/block/snapshot.c +++ b/block/snapshot.c @@ -246,9 +246,9 @@ int bdrv_snapshot_delete(BlockDriverState *bs, if (bs->file) { return bdrv_snapshot_delete(bs->file, snapshot_id, name, errp); } - error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, - drv->format_name, bdrv_get_device_name(bs), - "internal snapshot deletion"); + error_setg(errp, "Block format '%s' used by device '%s' " + "does not support internal snapshot deletion", + drv->format_name, bdrv_get_device_name(bs)); return -ENOTSUP; } @@ -329,9 +329,9 @@ int bdrv_snapshot_load_tmp(BlockDriverState *bs, if (drv->bdrv_snapshot_load_tmp) { return drv->bdrv_snapshot_load_tmp(bs, snapshot_id, name, errp); } - error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, - drv->format_name, bdrv_get_device_name(bs), - "temporarily load internal snapshot"); + error_setg(errp, "Block format '%s' used by device '%s' " + "does not support temporarily loading internal snapshots", + drv->format_name, bdrv_get_device_name(bs)); return -ENOTSUP; } diff --git a/block/vdi.c b/block/vdi.c index 53bd02fe22..7642ef3597 100644 --- a/block/vdi.c +++ b/block/vdi.c @@ -502,9 +502,9 @@ static int vdi_open(BlockDriverState *bs, QDict *options, int flags, } /* Disable migration when vdi images are used */ - error_set(&s->migration_blocker, - QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, - "vdi", bdrv_get_device_name(bs), "live migration"); + error_setg(&s->migration_blocker, "The vdi format used by node '%s' " + "does not support live migration", + bdrv_get_device_or_node_name(bs)); migrate_add_blocker(s->migration_blocker); qemu_co_mutex_init(&s->write_lock); diff --git a/block/vhdx.c b/block/vhdx.c index bb3ed45d5c..0776de7174 100644 --- a/block/vhdx.c +++ b/block/vhdx.c @@ -1002,9 +1002,9 @@ static int vhdx_open(BlockDriverState *bs, QDict *options, int flags, /* TODO: differencing files */ /* Disable migration when VHDX images are used */ - error_set(&s->migration_blocker, - QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, - "vhdx", bdrv_get_device_name(bs), "live migration"); + error_setg(&s->migration_blocker, "The vhdx format used by node '%s' " + "does not support live migration", + bdrv_get_device_or_node_name(bs)); migrate_add_blocker(s->migration_blocker); return 0; @@ -1269,7 +1269,7 @@ static coroutine_fn int vhdx_co_writev(BlockDriverState *bs, int64_t sector_num, iov1.iov_base = qemu_blockalign(bs, iov1.iov_len); memset(iov1.iov_base, 0, iov1.iov_len); qemu_iovec_concat_iov(&hd_qiov, &iov1, 1, 0, - sinfo.block_offset); + iov1.iov_len); sectors_to_write += iov1.iov_len >> BDRV_SECTOR_BITS; } @@ -1285,7 +1285,7 @@ static coroutine_fn int vhdx_co_writev(BlockDriverState *bs, int64_t sector_num, iov2.iov_base = qemu_blockalign(bs, iov2.iov_len); memset(iov2.iov_base, 0, iov2.iov_len); qemu_iovec_concat_iov(&hd_qiov, &iov2, 1, 0, - sinfo.block_offset); + iov2.iov_len); sectors_to_write += iov2.iov_len >> BDRV_SECTOR_BITS; } } diff --git a/block/vmdk.c b/block/vmdk.c index 8410a158a2..1c5e2ef1b3 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -523,7 +523,7 @@ static int vmdk_open_vmfs_sparse(BlockDriverState *bs, } ret = vmdk_add_extent(bs, file, false, le32_to_cpu(header.disk_sectors), - le32_to_cpu(header.l1dir_offset) << 9, + (int64_t)le32_to_cpu(header.l1dir_offset) << 9, 0, le32_to_cpu(header.l1dir_size), 4096, @@ -669,7 
+669,7 @@ static int vmdk_open_vmdk4(BlockDriverState *bs, snprintf(buf, sizeof(buf), "VMDK version %" PRId32, le32_to_cpu(header.version)); error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, - bdrv_get_device_name(bs), "vmdk", buf); + bdrv_get_device_or_node_name(bs), "vmdk", buf); return -ENOTSUP; } else if (le32_to_cpu(header.version) == 3 && (flags & BDRV_O_RDWR)) { /* VMware KB 2064959 explains that version 3 added support for @@ -962,9 +962,9 @@ static int vmdk_open(BlockDriverState *bs, QDict *options, int flags, qemu_co_mutex_init(&s->lock); /* Disable migration when VMDK images are used */ - error_set(&s->migration_blocker, - QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, - "vmdk", bdrv_get_device_name(bs), "live migration"); + error_setg(&s->migration_blocker, "The vmdk format used by node '%s' " + "does not support live migration", + bdrv_get_device_or_node_name(bs)); migrate_add_blocker(s->migration_blocker); g_free(buf); return 0; diff --git a/block/vpc.c b/block/vpc.c index 43e768ee76..37572bab86 100644 --- a/block/vpc.c +++ b/block/vpc.c @@ -318,9 +318,9 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags, qemu_co_mutex_init(&s->lock); /* Disable migration when VHD images are used */ - error_set(&s->migration_blocker, - QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, - "vpc", bdrv_get_device_name(bs), "live migration"); + error_setg(&s->migration_blocker, "The vpc format used by node '%s' " + "does not support live migration", + bdrv_get_device_or_node_name(bs)); migrate_add_blocker(s->migration_blocker); return 0; diff --git a/block/vvfat.c b/block/vvfat.c index 9be632f404..e803589675 100644 --- a/block/vvfat.c +++ b/block/vvfat.c @@ -1180,9 +1180,10 @@ static int vvfat_open(BlockDriverState *bs, QDict *options, int flags, /* Disable migration when vvfat is used rw */ if (s->qcow) { - error_set(&s->migration_blocker, - QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, - "vvfat (rw)", bdrv_get_device_name(bs), "live migration"); + error_setg(&s->migration_blocker, + "The vvfat (rw) format used by node '%s' " + "does not support live migration", + bdrv_get_device_or_node_name(bs)); migrate_add_blocker(s->migration_blocker); } diff --git a/blockdev.c b/blockdev.c index fbb3a79978..5eaf77e599 100644 --- a/blockdev.c +++ b/blockdev.c @@ -1164,6 +1164,68 @@ out_aio_context: return NULL; } +/** + * block_dirty_bitmap_lookup: + * Return a dirty bitmap (if present), after validating + * the node reference and bitmap names. + * + * @node: The name of the BDS node to search for bitmaps + * @name: The name of the bitmap to search for + * @pbs: Output pointer for BDS lookup, if desired. Can be NULL. + * @paio: Output pointer for aio_context acquisition, if desired. Can be NULL. + * @errp: Output pointer for error information. Can be NULL. + * + * @return: A bitmap object on success, or NULL on failure. 
+ */ +static BdrvDirtyBitmap *block_dirty_bitmap_lookup(const char *node, + const char *name, + BlockDriverState **pbs, + AioContext **paio, + Error **errp) +{ + BlockDriverState *bs; + BdrvDirtyBitmap *bitmap; + AioContext *aio_context; + + if (!node) { + error_setg(errp, "Node cannot be NULL"); + return NULL; + } + if (!name) { + error_setg(errp, "Bitmap name cannot be NULL"); + return NULL; + } + bs = bdrv_lookup_bs(node, node, NULL); + if (!bs) { + error_setg(errp, "Node '%s' not found", node); + return NULL; + } + + aio_context = bdrv_get_aio_context(bs); + aio_context_acquire(aio_context); + + bitmap = bdrv_find_dirty_bitmap(bs, name); + if (!bitmap) { + error_setg(errp, "Dirty bitmap '%s' not found", name); + goto fail; + } + + if (pbs) { + *pbs = bs; + } + if (paio) { + *paio = aio_context; + } else { + aio_context_release(aio_context); + } + + return bitmap; + + fail: + aio_context_release(aio_context); + return NULL; +} + /* New and old BlockDriverState structs for atomic group operations */ typedef struct BlkTransactionState BlkTransactionState; @@ -1248,13 +1310,14 @@ static void internal_snapshot_prepare(BlkTransactionState *common, } if (bdrv_is_read_only(bs)) { - error_set(errp, QERR_DEVICE_IS_READ_ONLY, device); + error_setg(errp, "Device '%s' is read only", device); return; } if (!bdrv_can_snapshot(bs)) { - error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, - bs->drv->format_name, device, "internal snapshot"); + error_setg(errp, "Block format '%s' used by device '%s' " + "does not support internal snapshots", + bs->drv->format_name, device); return; } @@ -1522,6 +1585,7 @@ static void drive_backup_prepare(BlkTransactionState *common, Error **errp) backup->sync, backup->has_mode, backup->mode, backup->has_speed, backup->speed, + backup->has_bitmap, backup->bitmap, backup->has_on_source_error, backup->on_source_error, backup->has_on_target_error, backup->on_target_error, &local_err); @@ -1953,6 +2017,102 @@ void qmp_block_set_io_throttle(const char *device, int64_t bps, int64_t bps_rd, aio_context_release(aio_context); } +void qmp_block_dirty_bitmap_add(const char *node, const char *name, + bool has_granularity, uint32_t granularity, + Error **errp) +{ + AioContext *aio_context; + BlockDriverState *bs; + + if (!name || name[0] == '\0') { + error_setg(errp, "Bitmap name cannot be empty"); + return; + } + + bs = bdrv_lookup_bs(node, node, errp); + if (!bs) { + return; + } + + aio_context = bdrv_get_aio_context(bs); + aio_context_acquire(aio_context); + + if (has_granularity) { + if (granularity < 512 || !is_power_of_2(granularity)) { + error_setg(errp, "Granularity must be power of 2 " + "and at least 512"); + goto out; + } + } else { + /* Default to cluster size, if available: */ + granularity = bdrv_get_default_bitmap_granularity(bs); + } + + bdrv_create_dirty_bitmap(bs, granularity, name, errp); + + out: + aio_context_release(aio_context); +} + +void qmp_block_dirty_bitmap_remove(const char *node, const char *name, + Error **errp) +{ + AioContext *aio_context; + BlockDriverState *bs; + BdrvDirtyBitmap *bitmap; + + bitmap = block_dirty_bitmap_lookup(node, name, &bs, &aio_context, errp); + if (!bitmap || !bs) { + return; + } + + if (bdrv_dirty_bitmap_frozen(bitmap)) { + error_setg(errp, + "Bitmap '%s' is currently frozen and cannot be removed", + name); + goto out; + } + bdrv_dirty_bitmap_make_anon(bitmap); + bdrv_release_dirty_bitmap(bs, bitmap); + + out: + aio_context_release(aio_context); +} + +/** + * Completely clear a bitmap, for the purposes of synchronizing a 
bitmap + * immediately after a full backup operation. + */ +void qmp_block_dirty_bitmap_clear(const char *node, const char *name, + Error **errp) +{ + AioContext *aio_context; + BdrvDirtyBitmap *bitmap; + BlockDriverState *bs; + + bitmap = block_dirty_bitmap_lookup(node, name, &bs, &aio_context, errp); + if (!bitmap || !bs) { + return; + } + + if (bdrv_dirty_bitmap_frozen(bitmap)) { + error_setg(errp, + "Bitmap '%s' is currently frozen and cannot be modified", + name); + goto out; + } else if (!bdrv_dirty_bitmap_enabled(bitmap)) { + error_setg(errp, + "Bitmap '%s' is currently disabled and cannot be cleared", + name); + goto out; + } + + bdrv_clear_dirty_bitmap(bitmap); + + out: + aio_context_release(aio_context); +} + int hmp_drive_del(Monitor *mon, const QDict *qdict, QObject **ret_data) { const char *id = qdict_get_str(qdict, "id"); @@ -2055,7 +2215,7 @@ void qmp_block_resize(bool has_device, const char *device, error_set(errp, QERR_UNSUPPORTED); break; case -EACCES: - error_set(errp, QERR_DEVICE_IS_READ_ONLY, device); + error_setg(errp, "Device '%s' is read only", device); break; case -EBUSY: error_set(errp, QERR_DEVICE_IN_USE, device); @@ -2270,6 +2430,7 @@ void qmp_drive_backup(const char *device, const char *target, enum MirrorSyncMode sync, bool has_mode, enum NewImageMode mode, bool has_speed, int64_t speed, + bool has_bitmap, const char *bitmap, bool has_on_source_error, BlockdevOnError on_source_error, bool has_on_target_error, BlockdevOnError on_target_error, Error **errp) @@ -2278,6 +2439,7 @@ void qmp_drive_backup(const char *device, const char *target, BlockDriverState *bs; BlockDriverState *target_bs; BlockDriverState *source = NULL; + BdrvDirtyBitmap *bmap = NULL; AioContext *aio_context; BlockDriver *drv = NULL; Error *local_err = NULL; @@ -2377,7 +2539,16 @@ void qmp_drive_backup(const char *device, const char *target, bdrv_set_aio_context(target_bs, aio_context); - backup_start(bs, target_bs, speed, sync, on_source_error, on_target_error, + if (has_bitmap) { + bmap = bdrv_find_dirty_bitmap(bs, bitmap); + if (!bmap) { + error_setg(errp, "Bitmap '%s' could not be found", bitmap); + goto out; + } + } + + backup_start(bs, target_bs, speed, sync, bmap, + on_source_error, on_target_error, block_job_cb, bs, &local_err); if (local_err != NULL) { bdrv_unref(target_bs); @@ -2391,7 +2562,7 @@ out: BlockDeviceInfoList *qmp_query_named_block_nodes(Error **errp) { - return bdrv_named_nodes_list(); + return bdrv_named_nodes_list(errp); } void qmp_blockdev_backup(const char *device, const char *target, @@ -2438,8 +2609,8 @@ void qmp_blockdev_backup(const char *device, const char *target, bdrv_ref(target_bs); bdrv_set_aio_context(target_bs, aio_context); - backup_start(bs, target_bs, speed, sync, on_source_error, on_target_error, - block_job_cb, bs, &local_err); + backup_start(bs, target_bs, speed, sync, NULL, on_source_error, + on_target_error, block_job_cb, bs, &local_err); if (local_err != NULL) { bdrv_unref(target_bs); error_propagate(errp, local_err); @@ -2699,7 +2870,7 @@ void qmp_block_job_cancel(const char *device, force = false; } - if (job->paused && !force) { + if (job->user_paused && !force) { error_setg(errp, "The block job for device '%s' is currently paused", device); goto out; @@ -2716,10 +2887,11 @@ void qmp_block_job_pause(const char *device, Error **errp) AioContext *aio_context; BlockJob *job = find_block_job(device, &aio_context, errp); - if (!job) { + if (!job || job->user_paused) { return; } + job->user_paused = true; trace_qmp_block_job_pause(job); 
block_job_pause(job); aio_context_release(aio_context); @@ -2730,10 +2902,11 @@ void qmp_block_job_resume(const char *device, Error **errp) AioContext *aio_context; BlockJob *job = find_block_job(device, &aio_context, errp); - if (!job) { + if (!job || !job->user_paused) { return; } + job->user_paused = false; trace_qmp_block_job_resume(job); block_job_resume(job); aio_context_release(aio_context); diff --git a/blockjob.c b/blockjob.c index ba2255d91f..2755465259 100644 --- a/blockjob.c +++ b/blockjob.c @@ -107,7 +107,7 @@ void block_job_set_speed(BlockJob *job, int64_t speed, Error **errp) void block_job_complete(BlockJob *job, Error **errp) { - if (job->paused || job->cancelled || !job->driver->complete) { + if (job->pause_count || job->cancelled || !job->driver->complete) { error_set(errp, QERR_BLOCK_JOB_NOT_READY, bdrv_get_device_name(job->bs)); return; @@ -118,17 +118,26 @@ void block_job_complete(BlockJob *job, Error **errp) void block_job_pause(BlockJob *job) { - job->paused = true; + job->pause_count++; } bool block_job_is_paused(BlockJob *job) { - return job->paused; + return job->pause_count > 0; } void block_job_resume(BlockJob *job) { - job->paused = false; + assert(job->pause_count > 0); + job->pause_count--; + if (job->pause_count) { + return; + } + block_job_enter(job); +} + +void block_job_enter(BlockJob *job) +{ block_job_iostatus_reset(job); if (job->co && !job->busy) { qemu_coroutine_enter(job->co, NULL); @@ -138,7 +147,7 @@ void block_job_resume(BlockJob *job) void block_job_cancel(BlockJob *job) { job->cancelled = true; - block_job_resume(job); + block_job_enter(job); } bool block_job_is_cancelled(BlockJob *job) @@ -258,7 +267,7 @@ BlockJobInfo *block_job_query(BlockJob *job) info->device = g_strdup(bdrv_get_device_name(job->bs)); info->len = job->len; info->busy = job->busy; - info->paused = job->paused; + info->paused = job->pause_count > 0; info->offset = job->offset; info->speed = job->speed; info->io_status = job->iostatus; @@ -335,6 +344,8 @@ BlockErrorAction block_job_error_action(BlockJob *job, BlockDriverState *bs, IO_OPERATION_TYPE_WRITE, action, &error_abort); if (action == BLOCK_ERROR_ACTION_STOP) { + /* make the pause user visible, which will be resumed from QMP. */ + job->user_paused = true; block_job_pause(job); block_job_iostatus_set_err(job, error); if (bs != job->bs) { diff --git a/docs/bitmaps.md b/docs/bitmaps.md new file mode 100644 index 0000000000..f066b48aa5 --- /dev/null +++ b/docs/bitmaps.md @@ -0,0 +1,352 @@ +<!-- +Copyright 2015 John Snow <jsnow@redhat.com> and Red Hat, Inc. +All rights reserved. + +This file is licensed via The FreeBSD Documentation License, the full text of +which is included at the end of this document. +--> + +# Dirty Bitmaps and Incremental Backup + +* Dirty Bitmaps are objects that track which data needs to be backed up for the + next incremental backup. + +* Dirty bitmaps can be created at any time and attached to any node + (not just complete drives.) + +## Dirty Bitmap Names + +* A dirty bitmap's name is unique to the node, but bitmaps attached to different + nodes can share the same name. + +## Bitmap Modes + +* A Bitmap can be "frozen," which means that it is currently in-use by a backup + operation and cannot be deleted, renamed, written to, reset, + etc. 
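+
+* As a sketch of what the frozen state means in practice (using the QMP
+  commands described below), an attempt to remove a bitmap that is in use
+  by a running backup job is rejected. The reply shown here is illustrative
+  only; the exact error text may differ:
+
+```json
+{ "execute": "block-dirty-bitmap-remove",
+  "arguments": {
+    "node": "drive0",
+    "name": "bitmap0"
+  }
+}
+```
+
+```json
+{ "error": { "class": "GenericError",
+             "desc": "Bitmap 'bitmap0' is currently frozen and cannot be removed" } }
+```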
+ +## Basic QMP Usage + +### Supported Commands ### + +* block-dirty-bitmap-add +* block-dirty-bitmap-remove +* block-dirty-bitmap-clear + +### Creation + +* To create a new bitmap, enabled, on the drive with id=drive0: + +```json +{ "execute": "block-dirty-bitmap-add", + "arguments": { + "node": "drive0", + "name": "bitmap0" + } +} +``` + +* This bitmap will have a default granularity that matches the cluster size of + its associated drive, if available, clamped to between [4KiB, 64KiB]. + The current default for qcow2 is 64KiB. + +* To create a new bitmap that tracks changes in 32KiB segments: + +```json +{ "execute": "block-dirty-bitmap-add", + "arguments": { + "node": "drive0", + "name": "bitmap0", + "granularity": 32768 + } +} +``` + +### Deletion + +* Bitmaps that are frozen cannot be deleted. + +* Deleting the bitmap does not impact any other bitmaps attached to the same + node, nor does it affect any backups already created from this node. + +* Because bitmaps are only unique to the node to which they are attached, + you must specify the node/drive name here, too. + +```json +{ "execute": "block-dirty-bitmap-remove", + "arguments": { + "node": "drive0", + "name": "bitmap0" + } +} +``` + +### Resetting + +* Resetting a bitmap will clear all information it holds. + +* An incremental backup created from an empty bitmap will copy no data, + as if nothing has changed. + +```json +{ "execute": "block-dirty-bitmap-clear", + "arguments": { + "node": "drive0", + "name": "bitmap0" + } +} +``` + +## Transactions (Not yet implemented) + +* Transactional commands are forthcoming in a future version, + and are not yet available for use. This section serves as + documentation of intent for their design and usage. + +### Justification + +Bitmaps can be safely modified when the VM is paused or halted by using +the basic QMP commands. For instance, you might perform the following actions: + +1. Boot the VM in a paused state. +2. Create a full drive backup of drive0. +3. Create a new bitmap attached to drive0. +4. Resume execution of the VM. +5. Incremental backups are ready to be created. + +At this point, the bitmap and drive backup would be correctly in sync, +and incremental backups made from this point forward would be correctly aligned +to the full drive backup. + +This is not particularly useful if we decide we want to start incremental +backups after the VM has been running for a while, for which we will need to +perform actions such as the following: + +1. Boot the VM and begin execution. +2. Using a single transaction, perform the following operations: + * Create bitmap0. + * Create a full drive backup of drive0. +3. Incremental backups are now ready to be created. + +### Supported Bitmap Transactions + +* block-dirty-bitmap-add +* block-dirty-bitmap-clear + +The usages are identical to their respective QMP commands, but see below +for examples. + +### Example: New Incremental Backup + +As outlined in the justification, perhaps we want to create a new incremental +backup chain attached to a drive. + +```json +{ "execute": "transaction", + "arguments": { + "actions": [ + {"type": "block-dirty-bitmap-add", + "data": {"node": "drive0", "name": "bitmap0"} }, + {"type": "drive-backup", + "data": {"device": "drive0", "target": "/path/to/full_backup.img", + "sync": "full", "format": "qcow2"} } + ] + } +} +``` + +### Example: New Incremental Backup Anchor Point + +Maybe we just want to create a new full backup with an existing bitmap and +want to reset the bitmap to track the new chain. 
+ +```json +{ "execute": "transaction", + "arguments": { + "actions": [ + {"type": "block-dirty-bitmap-clear", + "data": {"node": "drive0", "name": "bitmap0"} }, + {"type": "drive-backup", + "data": {"device": "drive0", "target": "/path/to/new_full_backup.img", + "sync": "full", "format": "qcow2"} } + ] + } +} +``` + +## Incremental Backups + +The star of the show. + +**Nota Bene!** Only incremental backups of entire drives are supported for now. +So despite the fact that you can attach a bitmap to any arbitrary node, they are +only currently useful when attached to the root node. This is because +drive-backup only supports drives/devices instead of arbitrary nodes. + +### Example: First Incremental Backup + +1. Create a full backup and sync it to the dirty bitmap, as in the transactional +examples above; or with the VM offline, manually create a full copy and then +create a new bitmap before the VM begins execution. + + * Let's assume the full backup is named 'full_backup.img'. + * Let's assume the bitmap you created is 'bitmap0' attached to 'drive0'. + +2. Create a destination image for the incremental backup that utilizes the +full backup as a backing image. + + * Let's assume it is named 'incremental.0.img'. + + ```sh + # qemu-img create -f qcow2 incremental.0.img -b full_backup.img -F qcow2 + ``` + +3. Issue the incremental backup command: + + ```json + { "execute": "drive-backup", + "arguments": { + "device": "drive0", + "bitmap": "bitmap0", + "target": "incremental.0.img", + "format": "qcow2", + "sync": "dirty-bitmap", + "mode": "existing" + } + } + ``` + +### Example: Second Incremental Backup + +1. Create a new destination image for the incremental backup that points to the + previous one, e.g.: 'incremental.1.img' + + ```sh + # qemu-img create -f qcow2 incremental.1.img -b incremental.0.img -F qcow2 + ``` + +2. Issue a new incremental backup command. The only difference here is that we + have changed the target image below. + + ```json + { "execute": "drive-backup", + "arguments": { + "device": "drive0", + "bitmap": "bitmap0", + "target": "incremental.1.img", + "format": "qcow2", + "sync": "dirty-bitmap", + "mode": "existing" + } + } + ``` + +## Errors + +* In the event of an error that occurs after a backup job is successfully + launched, either by a direct QMP command or a QMP transaction, the user + will receive a BLOCK_JOB_COMPLETE event with a failure message, accompanied + by a BLOCK_JOB_ERROR event. + +* In the case of an event being cancelled, the user will receive a + BLOCK_JOB_CANCELLED event instead of a pair of COMPLETE and ERROR events. + +* In either case, the incremental backup data contained within the bitmap is + safely rolled back, and the data within the bitmap is not lost. The image + file created for the failed attempt can be safely deleted. + +* Once the underlying problem is fixed (e.g. more storage space is freed up), + you can simply retry the incremental backup command with the same bitmap. + +### Example + +1. Create a target image: + + ```sh + # qemu-img create -f qcow2 incremental.0.img -b full_backup.img -F qcow2 + ``` + +2. Attempt to create an incremental backup via QMP: + + ```json + { "execute": "drive-backup", + "arguments": { + "device": "drive0", + "bitmap": "bitmap0", + "target": "incremental.0.img", + "format": "qcow2", + "sync": "dirty-bitmap", + "mode": "existing" + } + } + ``` + +3. 
Receive an event notifying us of failure: + + ```json + { "timestamp": { "seconds": 1424709442, "microseconds": 844524 }, + "data": { "speed": 0, "offset": 0, "len": 67108864, + "error": "No space left on device", + "device": "drive1", "type": "backup" }, + "event": "BLOCK_JOB_COMPLETED" } + ``` + +4. Delete the failed incremental, and re-create the image. + + ```sh + # rm incremental.0.img + # qemu-img create -f qcow2 incremental.0.img -b full_backup.img -F qcow2 + ``` + +5. Retry the command after fixing the underlying problem, + such as freeing up space on the backup volume: + + ```json + { "execute": "drive-backup", + "arguments": { + "device": "drive0", + "bitmap": "bitmap0", + "target": "incremental.0.img", + "format": "qcow2", + "sync": "dirty-bitmap", + "mode": "existing" + } + } + ``` + +6. Receive confirmation that the job completed successfully: + + ```json + { "timestamp": { "seconds": 1424709668, "microseconds": 526525 }, + "data": { "device": "drive1", "type": "backup", + "speed": 0, "len": 67108864, "offset": 67108864}, + "event": "BLOCK_JOB_COMPLETED" } + ``` + +<!-- +The FreeBSD Documentation License + +Redistribution and use in source (Markdown) and 'compiled' forms (SGML, HTML, +PDF, PostScript, RTF and so forth) with or without modification, are permitted +provided that the following conditions are met: + +Redistributions of source code (Markdown) must retain the above copyright +notice, this list of conditions and the following disclaimer of this file +unmodified. + +Redistributions in compiled form (transformed to other DTDs, converted to PDF, +PostScript, RTF and other formats) must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation and/or +other materials provided with the distribution. + +THIS DOCUMENTATION IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +THIS DOCUMENTATION, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +--> diff --git a/docs/qmp/qmp-events.txt b/docs/qmp/qmp-events.txt index d759d19748..b19e490eb5 100644 --- a/docs/qmp/qmp-events.txt +++ b/docs/qmp/qmp-events.txt @@ -31,21 +31,26 @@ Example: BLOCK_IMAGE_CORRUPTED --------------------- -Emitted when a disk image is being marked corrupt. +Emitted when a disk image is being marked corrupt. The image can be +identified by its device or node name. The 'device' field is always +present for compatibility reasons, but it can be empty ("") if the +image does not have a device name associated. 
Data: -- "device": Device name (json-string) -- "msg": Informative message (e.g., reason for the corruption) (json-string) -- "offset": If the corruption resulted from an image access, this is the access - offset into the image (json-int) -- "size": If the corruption resulted from an image access, this is the access - size (json-int) +- "device": Device name (json-string) +- "node-name": Node name (json-string, optional) +- "msg": Informative message (e.g., reason for the corruption) + (json-string) +- "offset": If the corruption resulted from an image access, this + is the access offset into the image (json-int) +- "size": If the corruption resulted from an image access, this + is the access size (json-int) Example: { "event": "BLOCK_IMAGE_CORRUPTED", - "data": { "device": "ide0-hd0", + "data": { "device": "ide0-hd0", "node-name": "node0", "msg": "Prevented active L1 table overwrite", "offset": 196608, "size": 65536 }, "timestamp": { "seconds": 1378126126, "microseconds": 966463 } } @@ -391,8 +391,7 @@ static void print_block_info(Monitor *mon, BlockInfo *info, inserted->iops_size); } - /* TODO: inserted->image should never be null */ - if (verbose && inserted->image) { + if (verbose) { monitor_printf(mon, "\nImages:\n"); image_info = inserted->image; while (1) { @@ -1062,7 +1061,8 @@ void hmp_drive_backup(Monitor *mon, const QDict *qdict) qmp_drive_backup(device, filename, !!format, format, full ? MIRROR_SYNC_MODE_FULL : MIRROR_SYNC_MODE_TOP, - true, mode, false, 0, false, 0, false, 0, &err); + true, mode, false, 0, false, NULL, + false, 0, false, 0, &err); hmp_handle_error(mon, &err); } diff --git a/hw/acpi/pcihp.c b/hw/acpi/pcihp.c index 612fec03ee..77e1126f8f 100644 --- a/hw/acpi/pcihp.c +++ b/hw/acpi/pcihp.c @@ -120,7 +120,7 @@ static bool acpi_pcihp_pc_no_hotplug(AcpiPciHpState *s, PCIDevice *dev) static void acpi_pcihp_eject_slot(AcpiPciHpState *s, unsigned bsel, unsigned slots) { BusChild *kid, *next; - int slot = ffs(slots) - 1; + int slot = ctz32(slots); PCIBus *bus = acpi_pcihp_find_hotplug_bus(s, bsel); if (!bus) { diff --git a/hw/arm/nseries.c b/hw/arm/nseries.c index 2a5406d98d..d243159664 100644 --- a/hw/arm/nseries.c +++ b/hw/arm/nseries.c @@ -579,7 +579,10 @@ static uint32_t mipid_txrx(void *opaque, uint32_t cmd, int len) case 0x26: /* GAMSET */ if (!s->pm) { - s->gamma = ffs(s->param[0] & 0xf) - 1; + s->gamma = ctz32(s->param[0] & 0xf); + if (s->gamma == 32) { + s->gamma = -1; /* XXX: should this be 0? 
*/ + } } else if (s->pm < 0) { s->pm = 1; } diff --git a/hw/arm/omap1.c b/hw/arm/omap1.c index 91ffb589e5..de2b289257 100644 --- a/hw/arm/omap1.c +++ b/hw/arm/omap1.c @@ -2004,8 +2004,7 @@ static void omap_mpuio_write(void *opaque, hwaddr addr, case 0x04: /* OUTPUT_REG */ diff = (s->outputs ^ value) & ~s->dir; s->outputs = value; - while ((ln = ffs(diff))) { - ln --; + while ((ln = ctz32(diff)) != 32) { if (s->handler[ln]) qemu_set_irq(s->handler[ln], (value >> ln) & 1); diff &= ~(1 << ln); @@ -2017,8 +2016,7 @@ static void omap_mpuio_write(void *opaque, hwaddr addr, s->dir = value; value = s->outputs & ~s->dir; - while ((ln = ffs(diff))) { - ln --; + while ((ln = ctz32(diff)) != 32) { if (s->handler[ln]) qemu_set_irq(s->handler[ln], (value >> ln) & 1); diff &= ~(1 << ln); diff --git a/hw/arm/pxa2xx_gpio.c b/hw/arm/pxa2xx_gpio.c index 354ccf1ea1..c89c8045c3 100644 --- a/hw/arm/pxa2xx_gpio.c +++ b/hw/arm/pxa2xx_gpio.c @@ -137,7 +137,7 @@ static void pxa2xx_gpio_handler_update(PXA2xxGPIOInfo *s) { level = s->olevel[i] & s->dir[i]; for (diff = s->prev_level[i] ^ level; diff; diff ^= 1 << bit) { - bit = ffs(diff) - 1; + bit = ctz32(diff); line = bit + 32 * i; qemu_set_irq(s->handler[line], (level >> bit) & 1); } diff --git a/hw/arm/strongarm.c b/hw/arm/strongarm.c index 1ddea6d89c..da9fc1d51b 100644 --- a/hw/arm/strongarm.c +++ b/hw/arm/strongarm.c @@ -528,7 +528,7 @@ static void strongarm_gpio_handler_update(StrongARMGPIOInfo *s) level = s->olevel & s->dir; for (diff = s->prev_level ^ level; diff; diff ^= 1 << bit) { - bit = ffs(diff) - 1; + bit = ctz32(diff); qemu_set_irq(s->handler[bit], (level >> bit) & 1); } @@ -745,7 +745,7 @@ static void strongarm_ppc_handler_update(StrongARMPPCInfo *s) level = s->olevel & s->dir; for (diff = s->prev_level ^ level; diff; diff ^= 1 << bit) { - bit = ffs(diff) - 1; + bit = ctz32(diff); qemu_set_irq(s->handler[bit], (level >> bit) & 1); } diff --git a/hw/block/m25p80.c b/hw/block/m25p80.c index afe243b811..efc43dde6a 100644 --- a/hw/block/m25p80.c +++ b/hw/block/m25p80.c @@ -621,7 +621,6 @@ static int m25p80_init(SSISlave *ss) s->size = s->pi->sector_size * s->pi->n_sectors; s->dirty_page = -1; - s->storage = blk_blockalign(s->blk, s->size); /* FIXME use a qdev drive property instead of drive_get_next() */ dinfo = drive_get_next(IF_MTD); @@ -629,6 +628,9 @@ static int m25p80_init(SSISlave *ss) if (dinfo) { DB_PRINT_L(0, "Binding to IF_MTD drive\n"); s->blk = blk_by_legacy_dinfo(dinfo); + blk_attach_dev_nofail(s->blk, s); + + s->storage = blk_blockalign(s->blk, s->size); /* FIXME: Move to late init */ if (blk_read(s->blk, 0, s->storage, @@ -638,6 +640,7 @@ static int m25p80_init(SSISlave *ss) } } else { DB_PRINT_L(0, "No BDRV - binding to RAM\n"); + s->storage = blk_blockalign(NULL, s->size); memset(s->storage, 0xFF, s->size); } diff --git a/hw/bt/sdp.c b/hw/bt/sdp.c index 218e075df7..c903747952 100644 --- a/hw/bt/sdp.c +++ b/hw/bt/sdp.c @@ -707,7 +707,7 @@ static void sdp_service_record_build(struct sdp_service_record_s *record, len += sdp_attr_max_size(&def->attributes[record->attributes ++].data, &record->uuids); } - record->uuids = 1 << ffs(record->uuids - 1); + record->uuids = pow2ceil(record->uuids); record->attribute_list = g_malloc0(record->attributes * sizeof(*record->attribute_list)); record->uuid = diff --git a/hw/char/virtio-serial-bus.c b/hw/char/virtio-serial-bus.c index e336bdb4a9..6e2ad8221b 100644 --- a/hw/char/virtio-serial-bus.c +++ b/hw/char/virtio-serial-bus.c @@ -814,12 +814,12 @@ static uint32_t find_free_port_id(VirtIOSerial *vser) 
max_nr_ports = vser->serial.max_virtserial_ports; for (i = 0; i < (max_nr_ports + 31) / 32; i++) { - uint32_t map, bit; + uint32_t map, zeroes; map = vser->ports_map[i]; - bit = ffs(~map); - if (bit) { - return (bit - 1) + i * 32; + zeroes = ctz32(~map); + if (zeroes != 32) { + return zeroes + i * 32; } } return VIRTIO_CONSOLE_BAD_ID; diff --git a/hw/display/tc6393xb.c b/hw/display/tc6393xb.c index 4306adc959..66b7ade8da 100644 --- a/hw/display/tc6393xb.c +++ b/hw/display/tc6393xb.c @@ -171,7 +171,7 @@ static void tc6393xb_gpio_handler_update(TC6393xbState *s) level = s->gpio_level & s->gpio_dir; for (diff = s->prev_level ^ level; diff; diff ^= 1 << bit) { - bit = ffs(diff) - 1; + bit = ctz32(diff); qemu_set_irq(s->handler[bit], (level >> bit) & 1); } diff --git a/hw/gpio/max7310.c b/hw/gpio/max7310.c index 7fbf313ce8..2f59b134ee 100644 --- a/hw/gpio/max7310.c +++ b/hw/gpio/max7310.c @@ -96,7 +96,7 @@ static int max7310_tx(I2CSlave *i2c, uint8_t data) case 0x01: /* Output port */ for (diff = (data ^ s->level) & ~s->direction; diff; diff &= ~(1 << line)) { - line = ffs(diff) - 1; + line = ctz32(diff); if (s->handler[line]) qemu_set_irq(s->handler[line], (data >> line) & 1); } diff --git a/hw/gpio/omap_gpio.c b/hw/gpio/omap_gpio.c index 9a43486890..d92f8cfbae 100644 --- a/hw/gpio/omap_gpio.c +++ b/hw/gpio/omap_gpio.c @@ -125,8 +125,7 @@ static void omap_gpio_write(void *opaque, hwaddr addr, case 0x04: /* DATA_OUTPUT */ diff = (s->outputs ^ value) & ~s->dir; s->outputs = value; - while ((ln = ffs(diff))) { - ln --; + while ((ln = ctz32(diff)) != 32) { if (s->handler[ln]) qemu_set_irq(s->handler[ln], (value >> ln) & 1); diff &= ~(1 << ln); @@ -138,8 +137,7 @@ static void omap_gpio_write(void *opaque, hwaddr addr, s->dir = value; value = s->outputs & ~s->dir; - while ((ln = ffs(diff))) { - ln --; + while ((ln = ctz32(diff)) != 32) { if (s->handler[ln]) qemu_set_irq(s->handler[ln], (value >> ln) & 1); diff &= ~(1 << ln); @@ -253,8 +251,7 @@ static inline void omap2_gpio_module_out_update(struct omap2_gpio_s *s, s->outputs ^= diff; diff &= ~s->dir; - while ((ln = ffs(diff))) { - ln --; + while ((ln = ctz32(diff)) != 32) { qemu_set_irq(s->handler[ln], (s->outputs >> ln) & 1); diff &= ~(1 << ln); } @@ -442,8 +439,8 @@ static void omap2_gpio_module_write(void *opaque, hwaddr addr, s->dir = value; value = s->outputs & ~s->dir; - while ((ln = ffs(diff))) { - diff &= ~(1 <<-- ln); + while ((ln = ctz32(diff)) != 32) { + diff &= ~(1 << ln); qemu_set_irq(s->handler[ln], (value >> ln) & 1); } diff --git a/hw/gpio/zaurus.c b/hw/gpio/zaurus.c index 94083424f8..24a77272d7 100644 --- a/hw/gpio/zaurus.c +++ b/hw/gpio/zaurus.c @@ -65,7 +65,7 @@ static inline void scoop_gpio_handler_update(ScoopInfo *s) { level = s->gpio_level & s->gpio_dir; for (diff = s->prev_level ^ level; diff; diff ^= 1 << bit) { - bit = ffs(diff) - 1; + bit = ctz32(diff); qemu_set_irq(s->handler[bit], (level >> bit) & 1); } diff --git a/hw/i2c/omap_i2c.c b/hw/i2c/omap_i2c.c index d63278dbde..b6f544a221 100644 --- a/hw/i2c/omap_i2c.c +++ b/hw/i2c/omap_i2c.c @@ -171,9 +171,13 @@ static uint32_t omap_i2c_read(void *opaque, hwaddr addr) case 0x0c: /* I2C_IV */ if (s->revision >= OMAP2_INTR_REV) break; - ret = ffs(s->stat & s->mask); - if (ret) - s->stat ^= 1 << (ret - 1); + ret = ctz32(s->stat & s->mask); + if (ret != 32) { + s->stat ^= 1 << ret; + ret++; + } else { + ret = 0; + } omap_i2c_interrupts_update(s); return ret; diff --git a/hw/intc/allwinner-a10-pic.c b/hw/intc/allwinner-a10-pic.c index de820b9723..eed7621f13 100644 --- 
a/hw/intc/allwinner-a10-pic.c +++ b/hw/intc/allwinner-a10-pic.c @@ -23,7 +23,7 @@ static void aw_a10_pic_update(AwA10PICState *s) { uint8_t i; - int irq = 0, fiq = 0, pending; + int irq = 0, fiq = 0, zeroes; s->vector = 0; @@ -32,9 +32,9 @@ static void aw_a10_pic_update(AwA10PICState *s) fiq |= s->select[i] & s->irq_pending[i] & ~s->mask[i]; if (!s->vector) { - pending = ffs(s->irq_pending[i] & ~s->mask[i]); - if (pending) { - s->vector = (i * 32 + pending - 1) * 4; + zeroes = ctz32(s->irq_pending[i] & ~s->mask[i]); + if (zeroes != 32) { + s->vector = (i * 32 + zeroes) * 4; } } } diff --git a/hw/intc/omap_intc.c b/hw/intc/omap_intc.c index ad3931c112..e9b38a3c63 100644 --- a/hw/intc/omap_intc.c +++ b/hw/intc/omap_intc.c @@ -60,7 +60,7 @@ struct omap_intr_handler_s { static void omap_inth_sir_update(struct omap_intr_handler_s *s, int is_fiq) { - int i, j, sir_intr, p_intr, p, f; + int i, j, sir_intr, p_intr, p; uint32_t level; sir_intr = 0; p_intr = 255; @@ -72,14 +72,15 @@ static void omap_inth_sir_update(struct omap_intr_handler_s *s, int is_fiq) for (j = 0; j < s->nbanks; ++j) { level = s->bank[j].irqs & ~s->bank[j].mask & (is_fiq ? s->bank[j].fiq : ~s->bank[j].fiq); - for (f = ffs(level), i = f - 1, level >>= f - 1; f; i += f, - level >>= f) { + + while (level != 0) { + i = ctz32(level); p = s->bank[j].priority[i]; if (p <= p_intr) { p_intr = p; sir_intr = 32 * j + i; } - f = ffs(level >> 1); + level &= level - 1; } } s->sir_intr[is_fiq] = sir_intr; diff --git a/hw/pci-host/bonito.c b/hw/pci-host/bonito.c index 8134d0bcd0..3a731fe18d 100644 --- a/hw/pci-host/bonito.c +++ b/hw/pci-host/bonito.c @@ -427,7 +427,7 @@ static uint32_t bonito_sbridge_pciaddr(void *opaque, hwaddr addr) cfgaddr |= (s->regs[BONITO_PCIMAP_CFG] & 0xffff) << 16; idsel = (cfgaddr & BONITO_PCICONF_IDSEL_MASK) >> BONITO_PCICONF_IDSEL_OFFSET; - devno = ffs(idsel) - 1; + devno = ctz32(idsel); funno = (cfgaddr & BONITO_PCICONF_FUN_MASK) >> BONITO_PCICONF_FUN_OFFSET; regno = (cfgaddr & BONITO_PCICONF_REG_MASK) >> BONITO_PCICONF_REG_OFFSET; diff --git a/hw/pci-host/uninorth.c b/hw/pci-host/uninorth.c index 53f2b59ae8..f0144eb7b0 100644 --- a/hw/pci-host/uninorth.c +++ b/hw/pci-host/uninorth.c @@ -92,7 +92,10 @@ static uint32_t unin_get_config_reg(uint32_t reg, uint32_t addr) uint32_t slot, func; /* Grab CFA0 style values */ - slot = ffs(reg & 0xfffff800) - 1; + slot = ctz32(reg & 0xfffff800); + if (slot == 32) { + slot = -1; /* XXX: should this be 0? */ + } func = (reg >> 8) & 7; /* ... 
and then convert them to x86 format */ diff --git a/hw/pci/msi.c b/hw/pci/msi.c index 916e1a1e5b..2949938223 100644 --- a/hw/pci/msi.c +++ b/hw/pci/msi.c @@ -72,7 +72,7 @@ static inline uint8_t msi_cap_sizeof(uint16_t flags) static inline unsigned int msi_nr_vectors(uint16_t flags) { return 1U << - ((flags & PCI_MSI_FLAGS_QSIZE) >> (ffs(PCI_MSI_FLAGS_QSIZE) - 1)); + ((flags & PCI_MSI_FLAGS_QSIZE) >> ctz32(PCI_MSI_FLAGS_QSIZE)); } static inline uint8_t msi_flags_off(const PCIDevice* dev) @@ -175,9 +175,9 @@ int msi_init(struct PCIDevice *dev, uint8_t offset, assert(nr_vectors > 0); assert(nr_vectors <= PCI_MSI_VECTORS_MAX); /* the nr of MSI vectors is up to 32 */ - vectors_order = ffs(nr_vectors) - 1; + vectors_order = ctz32(nr_vectors); - flags = vectors_order << (ffs(PCI_MSI_FLAGS_QMASK) - 1); + flags = vectors_order << ctz32(PCI_MSI_FLAGS_QMASK); if (msi64bit) { flags |= PCI_MSI_FLAGS_64BIT; } @@ -355,12 +355,12 @@ void msi_write_config(PCIDevice *dev, uint32_t addr, uint32_t val, int len) * just don't crash the host */ log_num_vecs = - (flags & PCI_MSI_FLAGS_QSIZE) >> (ffs(PCI_MSI_FLAGS_QSIZE) - 1); + (flags & PCI_MSI_FLAGS_QSIZE) >> ctz32(PCI_MSI_FLAGS_QSIZE); log_max_vecs = - (flags & PCI_MSI_FLAGS_QMASK) >> (ffs(PCI_MSI_FLAGS_QMASK) - 1); + (flags & PCI_MSI_FLAGS_QMASK) >> ctz32(PCI_MSI_FLAGS_QMASK); if (log_num_vecs > log_max_vecs) { flags &= ~PCI_MSI_FLAGS_QSIZE; - flags |= log_max_vecs << (ffs(PCI_MSI_FLAGS_QSIZE) - 1); + flags |= log_max_vecs << ctz32(PCI_MSI_FLAGS_QSIZE); pci_set_word(dev->config + msi_flags_off(dev), flags); } diff --git a/hw/pci/pcie_aer.c b/hw/pci/pcie_aer.c index eaa3e6ea94..b48c09cd11 100644 --- a/hw/pci/pcie_aer.c +++ b/hw/pci/pcie_aer.c @@ -410,7 +410,7 @@ static void pcie_aer_msg(PCIDevice *dev, const PCIEAERMsg *msg) static void pcie_aer_update_log(PCIDevice *dev, const PCIEAERErr *err) { uint8_t *aer_cap = dev->config + dev->exp.aer_cap; - uint8_t first_bit = ffs(err->status) - 1; + uint8_t first_bit = ctz32(err->status); uint32_t errcap = pci_get_long(aer_cap + PCI_ERR_CAP); int i; diff --git a/hw/pci/shpc.c b/hw/pci/shpc.c index 759910f79a..a706486394 100644 --- a/hw/pci/shpc.c +++ b/hw/pci/shpc.c @@ -61,7 +61,7 @@ /* Same slot state masks are used for command and status registers */ #define SHPC_SLOT_STATE_MASK 0x03 #define SHPC_SLOT_STATE_SHIFT \ - (ffs(SHPC_SLOT_STATE_MASK) - 1) + ctz32(SHPC_SLOT_STATE_MASK) #define SHPC_STATE_NO 0x0 #define SHPC_STATE_PWRONLY 0x1 @@ -70,10 +70,10 @@ #define SHPC_SLOT_PWR_LED_MASK 0xC #define SHPC_SLOT_PWR_LED_SHIFT \ - (ffs(SHPC_SLOT_PWR_LED_MASK) - 1) + ctz32(SHPC_SLOT_PWR_LED_MASK) #define SHPC_SLOT_ATTN_LED_MASK 0x30 #define SHPC_SLOT_ATTN_LED_SHIFT \ - (ffs(SHPC_SLOT_ATTN_LED_MASK) - 1) + ctz32(SHPC_SLOT_ATTN_LED_MASK) #define SHPC_LED_NO 0x0 #define SHPC_LED_ON 0x1 @@ -136,7 +136,7 @@ static int roundup_pow_of_two(int x) static uint16_t shpc_get_status(SHPCDevice *shpc, int slot, uint16_t msk) { uint8_t *status = shpc->config + SHPC_SLOT_STATUS(slot); - return (pci_get_word(status) & msk) >> (ffs(msk) - 1); + return (pci_get_word(status) & msk) >> ctz32(msk); } static void shpc_set_status(SHPCDevice *shpc, @@ -144,7 +144,7 @@ static void shpc_set_status(SHPCDevice *shpc, { uint8_t *status = shpc->config + SHPC_SLOT_STATUS(slot); pci_word_test_and_clear_mask(status, msk); - pci_word_test_and_set_mask(status, value << (ffs(msk) - 1)); + pci_word_test_and_set_mask(status, value << ctz32(msk)); } static void shpc_interrupt_update(PCIDevice *d) diff --git a/hw/pci/slotid_cap.c b/hw/pci/slotid_cap.c index 
62f7bae2f1..1c01d346c9 100644 --- a/hw/pci/slotid_cap.c +++ b/hw/pci/slotid_cap.c @@ -3,7 +3,7 @@ #include "qemu/error-report.h" #define SLOTID_CAP_LENGTH 4 -#define SLOTID_NSLOTS_SHIFT (ffs(PCI_SID_ESR_NSLOTS) - 1) +#define SLOTID_NSLOTS_SHIFT ctz32(PCI_SID_ESR_NSLOTS) int slotid_cap_init(PCIDevice *d, int nslots, uint8_t chassis, diff --git a/hw/ppc/ppce500_spin.c b/hw/ppc/ppce500_spin.c index d49f2b8803..a99f7b0397 100644 --- a/hw/ppc/ppce500_spin.c +++ b/hw/ppc/ppce500_spin.c @@ -74,7 +74,7 @@ static void spin_reset(void *opaque) /* Create -kernel TLB entries for BookE, linearly spanning 256MB. */ static inline hwaddr booke206_page_size_to_tlb(uint64_t size) { - return (ffs(size >> 10) - 1) >> 1; + return ctz32(size >> 10) >> 1; } static void mmubooke_create_initial_mapping(CPUPPCState *env, diff --git a/hw/scsi/megasas.c b/hw/scsi/megasas.c index ad7317bfe9..91a5d97c73 100644 --- a/hw/scsi/megasas.c +++ b/hw/scsi/megasas.c @@ -804,7 +804,7 @@ static int megasas_ctrl_get_info(MegasasState *s, MegasasCmd *cmd) MFI_INFO_LDOPS_READ_POLICY); info.max_strips_per_io = cpu_to_le16(s->fw_sge); info.stripe_sz_ops.min = 3; - info.stripe_sz_ops.max = ffs(MEGASAS_MAX_SECTORS + 1) - 1; + info.stripe_sz_ops.max = ctz32(MEGASAS_MAX_SECTORS + 1); info.properties.pred_fail_poll_interval = cpu_to_le16(300); info.properties.intr_throttle_cnt = cpu_to_le16(16); info.properties.intr_throttle_timeout = cpu_to_le16(50); diff --git a/hw/sd/sd.c b/hw/sd/sd.c index f955265f74..8abf0c9e31 100644 --- a/hw/sd/sd.c +++ b/hw/sd/sd.c @@ -796,8 +796,9 @@ static sd_rsp_type_t sd_normal_command(SDState *sd, sd->vhs = 0; /* No response if not exactly one VHS bit is set. */ - if (!(req.arg >> 8) || (req.arg >> ffs(req.arg & ~0xff))) + if (!(req.arg >> 8) || (req.arg >> (ctz32(req.arg & ~0xff) + 1))) { return sd->spi ? sd_r7 : sd_r0; + } /* Accept. */ sd->vhs = req.arg; diff --git a/include/block/aio.h b/include/block/aio.h index 7d1e26b33b..d2bb423de1 100644 --- a/include/block/aio.h +++ b/include/block/aio.h @@ -82,9 +82,6 @@ struct AioContext { /* Used for aio_notify. */ EventNotifier notifier; - /* GPollFDs for aio_poll() */ - GArray *pollfds; - /* Thread pool for performing work and receiving completion callbacks */ struct ThreadPool *thread_pool; @@ -121,13 +118,14 @@ void aio_context_ref(AioContext *ctx); void aio_context_unref(AioContext *ctx); /* Take ownership of the AioContext. If the AioContext will be shared between - * threads, a thread must have ownership when calling aio_poll(). + * threads, and a thread does not want to be interrupted, it will have to + * take ownership around calls to aio_poll(). Otherwise, aio_poll() + * automatically takes care of calling aio_context_acquire and + * aio_context_release. * - * Note that multiple threads calling aio_poll() means timers, BHs, and - * callbacks may be invoked from a different thread than they were registered - * from. Therefore, code must use AioContext acquire/release or use - * fine-grained synchronization to protect shared state if other threads will - * be accessing it simultaneously. + * Access to timers and BHs from a thread that has not acquired AioContext + * is possible. Access to callbacks for now must be done while the AioContext + * is owned by the thread (FIXME). 
*/ void aio_context_acquire(AioContext *ctx); diff --git a/include/block/block.h b/include/block/block.h index 4c57d63fe2..7d1a7174f6 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -382,7 +382,7 @@ void bdrv_lock_medium(BlockDriverState *bs, bool locked); void bdrv_eject(BlockDriverState *bs, bool eject_flag); const char *bdrv_get_format_name(BlockDriverState *bs); BlockDriverState *bdrv_find_node(const char *node_name); -BlockDeviceInfoList *bdrv_named_nodes_list(void); +BlockDeviceInfoList *bdrv_named_nodes_list(Error **errp); BlockDriverState *bdrv_lookup_bs(const char *device, const char *node_name, Error **errp); @@ -398,6 +398,7 @@ void bdrv_iterate_format(void (*it)(void *opaque, const char *name), void *opaque); const char *bdrv_get_node_name(const BlockDriverState *bs); const char *bdrv_get_device_name(const BlockDriverState *bs); +const char *bdrv_get_device_or_node_name(const BlockDriverState *bs); int bdrv_get_flags(BlockDriverState *bs); int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num, const uint8_t *buf, int nb_sectors); @@ -449,18 +450,39 @@ bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov); struct HBitmapIter; typedef struct BdrvDirtyBitmap BdrvDirtyBitmap; -BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity, +BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, + uint32_t granularity, + const char *name, Error **errp); +int bdrv_dirty_bitmap_create_successor(BlockDriverState *bs, + BdrvDirtyBitmap *bitmap, + Error **errp); +BdrvDirtyBitmap *bdrv_dirty_bitmap_abdicate(BlockDriverState *bs, + BdrvDirtyBitmap *bitmap, + Error **errp); +BdrvDirtyBitmap *bdrv_reclaim_dirty_bitmap(BlockDriverState *bs, + BdrvDirtyBitmap *bitmap, + Error **errp); +BdrvDirtyBitmap *bdrv_find_dirty_bitmap(BlockDriverState *bs, + const char *name); +void bdrv_dirty_bitmap_make_anon(BdrvDirtyBitmap *bitmap); void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap); +void bdrv_disable_dirty_bitmap(BdrvDirtyBitmap *bitmap); +void bdrv_enable_dirty_bitmap(BdrvDirtyBitmap *bitmap); BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs); +uint32_t bdrv_get_default_bitmap_granularity(BlockDriverState *bs); +uint32_t bdrv_dirty_bitmap_granularity(BdrvDirtyBitmap *bitmap); +bool bdrv_dirty_bitmap_enabled(BdrvDirtyBitmap *bitmap); +bool bdrv_dirty_bitmap_frozen(BdrvDirtyBitmap *bitmap); int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector); -void bdrv_set_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, +void bdrv_set_dirty_bitmap(BdrvDirtyBitmap *bitmap, int64_t cur_sector, int nr_sectors); -void bdrv_reset_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, +void bdrv_reset_dirty_bitmap(BdrvDirtyBitmap *bitmap, int64_t cur_sector, int nr_sectors); -void bdrv_dirty_iter_init(BlockDriverState *bs, - BdrvDirtyBitmap *bitmap, struct HBitmapIter *hbi); -int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap); +void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap); +void bdrv_dirty_iter_init(BdrvDirtyBitmap *bitmap, struct HBitmapIter *hbi); +void bdrv_set_dirty_iter(struct HBitmapIter *hbi, int64_t offset); +int64_t bdrv_get_dirty_count(BdrvDirtyBitmap *bitmap); void bdrv_enable_copy_on_read(BlockDriverState *bs); void bdrv_disable_copy_on_read(BlockDriverState *bs); diff --git a/include/block/block_int.h b/include/block/block_int.h index dccb092df7..db29b7424e 100644 --- a/include/block/block_int.h +++ 
b/include/block/block_int.h @@ -439,6 +439,14 @@ extern BlockDriver bdrv_file; extern BlockDriver bdrv_raw; extern BlockDriver bdrv_qcow2; +/** + * bdrv_setup_io_funcs: + * + * Prepare a #BlockDriver for I/O request processing by populating + * unimplemented coroutine and AIO interfaces with generic wrapper functions + * that fall back to implemented interfaces. + */ +void bdrv_setup_io_funcs(BlockDriver *bdrv); int get_tmp_filename(char *filename, int size); BlockDriver *bdrv_probe_all(const uint8_t *buf, int buf_size, @@ -590,7 +598,7 @@ void commit_active_start(BlockDriverState *bs, BlockDriverState *base, */ void mirror_start(BlockDriverState *bs, BlockDriverState *target, const char *replaces, - int64_t speed, int64_t granularity, int64_t buf_size, + int64_t speed, uint32_t granularity, int64_t buf_size, MirrorSyncMode mode, BlockdevOnError on_source_error, BlockdevOnError on_target_error, BlockCompletionFunc *cb, @@ -602,6 +610,7 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target, * @target: Block device to write to. * @speed: The maximum speed, in bytes per second, or 0 for unlimited. * @sync_mode: What parts of the disk image should be copied to the destination. + * @sync_bitmap: The dirty bitmap if sync_mode is MIRROR_SYNC_MODE_DIRTY_BITMAP. * @on_source_error: The action to take upon error reading from the source. * @on_target_error: The action to take upon error writing to the target. * @cb: Completion function for the job. @@ -612,6 +621,7 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target, */ void backup_start(BlockDriverState *bs, BlockDriverState *target, int64_t speed, MirrorSyncMode sync_mode, + BdrvDirtyBitmap *sync_bitmap, BlockdevOnError on_source_error, BlockdevOnError on_target_error, BlockCompletionFunc *cb, void *opaque, @@ -624,4 +634,8 @@ bool blk_dev_is_tray_open(BlockBackend *blk); bool blk_dev_is_medium_locked(BlockBackend *blk); void blk_dev_resize_cb(BlockBackend *blk); +void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors); +void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, + int nr_sectors); + #endif /* BLOCK_INT_H */ diff --git a/include/block/blockjob.h b/include/block/blockjob.h index b6d4ebbe03..57d8ef13e2 100644 --- a/include/block/blockjob.h +++ b/include/block/blockjob.h @@ -79,10 +79,16 @@ struct BlockJob { bool cancelled; /** - * Set to true if the job is either paused, or will pause itself - * as soon as possible (if busy == true). + * Counter for pause request. If non-zero, the block job is either paused, + * or if busy == true will pause itself as soon as possible. */ - bool paused; + int pause_count; + + /** + * Set to true if the job is paused by user. Can be unpaused with the + * block-job-resume QMP command. + */ + bool user_paused; /** * Set to false by the job while it is in a quiescent state, where @@ -225,11 +231,19 @@ void block_job_pause(BlockJob *job); * block_job_resume: * @job: The job to be resumed. * - * Resume the specified job. + * Resume the specified job. Must be paired with a preceding block_job_pause. */ void block_job_resume(BlockJob *job); /** + * block_job_enter: + * @job: The job to enter. + * + * Continue the specified job by entering the coroutine. + */ +void block_job_enter(BlockJob *job); + +/** * block_job_event_cancelled: * @job: The job whose information is requested. 
* diff --git a/include/block/qapi.h b/include/block/qapi.h index 168d788521..327549d917 100644 --- a/include/block/qapi.h +++ b/include/block/qapi.h @@ -29,7 +29,7 @@ #include "block/block.h" #include "block/snapshot.h" -BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs); +BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs, Error **errp); int bdrv_query_snapshot_info_list(BlockDriverState *bs, SnapshotInfoList **p_list, Error **errp); diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h index b97c2956ec..d4ffead48a 100644 --- a/include/hw/pci/pci.h +++ b/include/hw/pci/pci.h @@ -568,7 +568,7 @@ static inline void pci_set_byte_by_mask(uint8_t *config, uint8_t mask, uint8_t reg) { uint8_t val = pci_get_byte(config); - uint8_t rval = reg << (ffs(mask) - 1); + uint8_t rval = reg << ctz32(mask); pci_set_byte(config, (~mask & val) | (mask & rval)); } @@ -576,14 +576,14 @@ static inline uint8_t pci_get_byte_by_mask(uint8_t *config, uint8_t mask) { uint8_t val = pci_get_byte(config); - return (val & mask) >> (ffs(mask) - 1); + return (val & mask) >> ctz32(mask); } static inline void pci_set_word_by_mask(uint8_t *config, uint16_t mask, uint16_t reg) { uint16_t val = pci_get_word(config); - uint16_t rval = reg << (ffs(mask) - 1); + uint16_t rval = reg << ctz32(mask); pci_set_word(config, (~mask & val) | (mask & rval)); } @@ -591,14 +591,14 @@ static inline uint16_t pci_get_word_by_mask(uint8_t *config, uint16_t mask) { uint16_t val = pci_get_word(config); - return (val & mask) >> (ffs(mask) - 1); + return (val & mask) >> ctz32(mask); } static inline void pci_set_long_by_mask(uint8_t *config, uint32_t mask, uint32_t reg) { uint32_t val = pci_get_long(config); - uint32_t rval = reg << (ffs(mask) - 1); + uint32_t rval = reg << ctz32(mask); pci_set_long(config, (~mask & val) | (mask & rval)); } @@ -606,14 +606,14 @@ static inline uint32_t pci_get_long_by_mask(uint8_t *config, uint32_t mask) { uint32_t val = pci_get_long(config); - return (val & mask) >> (ffs(mask) - 1); + return (val & mask) >> ctz32(mask); } static inline void pci_set_quad_by_mask(uint8_t *config, uint64_t mask, uint64_t reg) { uint64_t val = pci_get_quad(config); - uint64_t rval = reg << (ffs(mask) - 1); + uint64_t rval = reg << ctz32(mask); pci_set_quad(config, (~mask & val) | (mask & rval)); } @@ -621,7 +621,7 @@ static inline uint64_t pci_get_quad_by_mask(uint8_t *config, uint64_t mask) { uint64_t val = pci_get_quad(config); - return (val & mask) >> (ffs(mask) - 1); + return (val & mask) >> ctz32(mask); } PCIDevice *pci_create_multifunction(PCIBus *bus, int devfn, bool multifunction, diff --git a/include/hw/pci/pcie_regs.h b/include/hw/pci/pcie_regs.h index 848ab1c206..6a28b33e69 100644 --- a/include/hw/pci/pcie_regs.h +++ b/include/hw/pci/pcie_regs.h @@ -27,34 +27,34 @@ /* PCI_EXP_FLAGS */ #define PCI_EXP_FLAGS_VER2 2 /* for now, supports only ver. 
2 */ -#define PCI_EXP_FLAGS_IRQ_SHIFT (ffs(PCI_EXP_FLAGS_IRQ) - 1) -#define PCI_EXP_FLAGS_TYPE_SHIFT (ffs(PCI_EXP_FLAGS_TYPE) - 1) +#define PCI_EXP_FLAGS_IRQ_SHIFT ctz32(PCI_EXP_FLAGS_IRQ) +#define PCI_EXP_FLAGS_TYPE_SHIFT ctz32(PCI_EXP_FLAGS_TYPE) /* PCI_EXP_LINK{CAP, STA} */ /* link speed */ #define PCI_EXP_LNK_LS_25 1 -#define PCI_EXP_LNK_MLW_SHIFT (ffs(PCI_EXP_LNKCAP_MLW) - 1) +#define PCI_EXP_LNK_MLW_SHIFT ctz32(PCI_EXP_LNKCAP_MLW) #define PCI_EXP_LNK_MLW_1 (1 << PCI_EXP_LNK_MLW_SHIFT) /* PCI_EXP_LINKCAP */ -#define PCI_EXP_LNKCAP_ASPMS_SHIFT (ffs(PCI_EXP_LNKCAP_ASPMS) - 1) +#define PCI_EXP_LNKCAP_ASPMS_SHIFT ctz32(PCI_EXP_LNKCAP_ASPMS) #define PCI_EXP_LNKCAP_ASPMS_0S (1 << PCI_EXP_LNKCAP_ASPMS_SHIFT) -#define PCI_EXP_LNKCAP_PN_SHIFT (ffs(PCI_EXP_LNKCAP_PN) - 1) +#define PCI_EXP_LNKCAP_PN_SHIFT ctz32(PCI_EXP_LNKCAP_PN) -#define PCI_EXP_SLTCAP_PSN_SHIFT (ffs(PCI_EXP_SLTCAP_PSN) - 1) +#define PCI_EXP_SLTCAP_PSN_SHIFT ctz32(PCI_EXP_SLTCAP_PSN) #define PCI_EXP_SLTCTL_IND_RESERVED 0x0 #define PCI_EXP_SLTCTL_IND_ON 0x1 #define PCI_EXP_SLTCTL_IND_BLINK 0x2 #define PCI_EXP_SLTCTL_IND_OFF 0x3 -#define PCI_EXP_SLTCTL_AIC_SHIFT (ffs(PCI_EXP_SLTCTL_AIC) - 1) +#define PCI_EXP_SLTCTL_AIC_SHIFT ctz32(PCI_EXP_SLTCTL_AIC) #define PCI_EXP_SLTCTL_AIC_OFF \ (PCI_EXP_SLTCTL_IND_OFF << PCI_EXP_SLTCTL_AIC_SHIFT) -#define PCI_EXP_SLTCTL_PIC_SHIFT (ffs(PCI_EXP_SLTCTL_PIC) - 1) +#define PCI_EXP_SLTCTL_PIC_SHIFT ctz32(PCI_EXP_SLTCTL_PIC) #define PCI_EXP_SLTCTL_PIC_OFF \ (PCI_EXP_SLTCTL_IND_OFF << PCI_EXP_SLTCTL_PIC_SHIFT) #define PCI_EXP_SLTCTL_PIC_ON \ @@ -109,7 +109,7 @@ #define PCI_ERR_ROOT_IRQ_MAX 32 #define PCI_ERR_ROOT_IRQ 0xf8000000 -#define PCI_ERR_ROOT_IRQ_SHIFT (ffs(PCI_ERR_ROOT_IRQ) - 1) +#define PCI_ERR_ROOT_IRQ_SHIFT ctz32(PCI_ERR_ROOT_IRQ) #define PCI_ERR_ROOT_STATUS_REPORT_MASK (PCI_ERR_ROOT_COR_RCV | \ PCI_ERR_ROOT_MULTI_COR_RCV | \ PCI_ERR_ROOT_UNCOR_RCV | \ diff --git a/include/qapi/qmp/qerror.h b/include/qapi/qmp/qerror.h index 57a62d4b76..e5673394d3 100644 --- a/include/qapi/qmp/qerror.h +++ b/include/qapi/qmp/qerror.h @@ -37,9 +37,6 @@ void qerror_report_err(Error *err); #define QERR_BASE_NOT_FOUND \ ERROR_CLASS_GENERIC_ERROR, "Base '%s' not found" -#define QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED \ - ERROR_CLASS_GENERIC_ERROR, "Block format '%s' used by device '%s' does not support feature '%s'" - #define QERR_BLOCK_JOB_NOT_READY \ ERROR_CLASS_GENERIC_ERROR, "The active block job for device '%s' cannot be completed" @@ -58,9 +55,6 @@ void qerror_report_err(Error *err); #define QERR_DEVICE_IN_USE \ ERROR_CLASS_GENERIC_ERROR, "Device '%s' is in use" -#define QERR_DEVICE_IS_READ_ONLY \ - ERROR_CLASS_GENERIC_ERROR, "Device '%s' is read only" - #define QERR_DEVICE_NO_HOTPLUG \ ERROR_CLASS_GENERIC_ERROR, "Device '%s' does not support hotplugging" diff --git a/include/qemu/hbitmap.h b/include/qemu/hbitmap.h index 550d7ce2c3..f0a85f8649 100644 --- a/include/qemu/hbitmap.h +++ b/include/qemu/hbitmap.h @@ -65,6 +65,29 @@ struct HBitmapIter { HBitmap *hbitmap_alloc(uint64_t size, int granularity); /** + * hbitmap_truncate: + * @hb: The bitmap to change the size of. + * @size: The number of elements to change the bitmap to accommodate. + * + * truncate or grow an existing bitmap to accommodate a new number of elements. + * This may invalidate existing HBitmapIterators. + */ +void hbitmap_truncate(HBitmap *hb, uint64_t size); + +/** + * hbitmap_merge: + * @a: The bitmap to store the result in. + * @b: The bitmap to merge into @a. 
+ * @return true if the merge was successful, + * false if it was not attempted. + * + * Merge two bitmaps together. + * A := A (BITOR) B. + * B is left unmodified. + */ +bool hbitmap_merge(HBitmap *a, const HBitmap *b); + +/** * hbitmap_empty: * @hb: HBitmap to operate on. * diff --git a/include/standard-headers/linux/virtio_blk.h b/include/standard-headers/linux/virtio_blk.h index 12016b47f3..cd601f4069 100644 --- a/include/standard-headers/linux/virtio_blk.h +++ b/include/standard-headers/linux/virtio_blk.h @@ -58,7 +58,7 @@ struct virtio_blk_config { uint32_t size_max; /* The maximum number of segments (if VIRTIO_BLK_F_SEG_MAX) */ uint32_t seg_max; - /* geometry the device (if VIRTIO_BLK_F_GEOMETRY) */ + /* geometry of the device (if VIRTIO_BLK_F_GEOMETRY) */ struct virtio_blk_geometry { uint16_t cylinders; uint8_t heads; @@ -117,7 +117,11 @@ struct virtio_blk_config { #define VIRTIO_BLK_T_BARRIER 0x80000000 #endif /* !VIRTIO_BLK_NO_LEGACY */ -/* This is the first element of the read scatter-gather list. */ +/* + * This comes first in the read scatter-gather list. + * For legacy virtio, if VIRTIO_F_ANY_LAYOUT is not negotiated, + * this is the first element of the read scatter-gather list. + */ struct virtio_blk_outhdr { /* VIRTIO_BLK_T* */ __virtio32 type; diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h index 77e9b9c370..b4a4d5e0b9 100644 --- a/include/sysemu/block-backend.h +++ b/include/sysemu/block-backend.h @@ -87,6 +87,8 @@ int blk_read_unthrottled(BlockBackend *blk, int64_t sector_num, uint8_t *buf, int nb_sectors); int blk_write(BlockBackend *blk, int64_t sector_num, const uint8_t *buf, int nb_sectors); +int blk_write_zeroes(BlockBackend *blk, int64_t sector_num, + int nb_sectors, BdrvRequestFlags flags); BlockAIOCB *blk_aio_write_zeroes(BlockBackend *blk, int64_t sector_num, int nb_sectors, BdrvRequestFlags flags, BlockCompletionFunc *cb, void *opaque); diff --git a/include/sysemu/os-win32.h b/include/sysemu/os-win32.h index 9cc9e08139..4035c4fe54 100644 --- a/include/sysemu/os-win32.h +++ b/include/sysemu/os-win32.h @@ -72,9 +72,6 @@ #define sigsetjmp(env, savemask) setjmp(env) #define siglongjmp(env, val) longjmp(env, val) -/* Declaration of ffs() is missing in MinGW's strings.h. */ -int ffs(int i); - /* Missing POSIX functions. Don't use MinGW-w64 macros. 
*/ #undef gmtime_r struct tm *gmtime_r(const time_t *timep, struct tm *result); diff --git a/iothread.c b/iothread.c index 342a23fcb0..a1f91099bc 100644 --- a/iothread.c +++ b/iothread.c @@ -31,21 +31,14 @@ typedef ObjectClass IOThreadClass; static void *iothread_run(void *opaque) { IOThread *iothread = opaque; - bool blocking; qemu_mutex_lock(&iothread->init_done_lock); iothread->thread_id = qemu_get_thread_id(); qemu_cond_signal(&iothread->init_done_cond); qemu_mutex_unlock(&iothread->init_done_lock); - while (!iothread->stopping) { - aio_context_acquire(iothread->ctx); - blocking = true; - while (!iothread->stopping && aio_poll(iothread->ctx, blocking)) { - /* Progress was made, keep going */ - blocking = false; - } - aio_context_release(iothread->ctx); + while (!atomic_read(&iothread->stopping)) { + aio_poll(iothread->ctx, true); } return NULL; } @@ -1141,18 +1141,18 @@ static int kvm_irqchip_get_virq(KVMState *s) { uint32_t *word = s->used_gsi_bitmap; int max_words = ALIGN(s->gsi_count, 32) / 32; - int i, bit; + int i, zeroes; bool retry = true; again: /* Return the lowest unused GSI in the bitmap */ for (i = 0; i < max_words; i++) { - bit = ffs(~word[i]); - if (!bit) { + zeroes = ctz32(~word[i]); + if (zeroes == 32) { continue; } - return bit - 1 + i * 32; + return zeroes + i * 32; } if (!s->direct_msi && retry) { retry = false; diff --git a/migration/block.c b/migration/block.c index 085c0fae05..ddb59ccf87 100644 --- a/migration/block.c +++ b/migration/block.c @@ -304,7 +304,7 @@ static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds) blk->aiocb = bdrv_aio_readv(bs, cur_sector, &blk->qiov, nr_sectors, blk_mig_read_cb, blk); - bdrv_reset_dirty_bitmap(bs, bmds->dirty_bitmap, cur_sector, nr_sectors); + bdrv_reset_dirty_bitmap(bmds->dirty_bitmap, cur_sector, nr_sectors); qemu_mutex_unlock_iothread(); bmds->cur_sector = cur_sector + nr_sectors; @@ -320,7 +320,7 @@ static int set_dirty_tracking(void) QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { bmds->dirty_bitmap = bdrv_create_dirty_bitmap(bmds->bs, BLOCK_SIZE, - NULL); + NULL, NULL); if (!bmds->dirty_bitmap) { ret = -errno; goto fail; @@ -497,8 +497,7 @@ static int mig_save_device_dirty(QEMUFile *f, BlkMigDevState *bmds, g_free(blk); } - bdrv_reset_dirty_bitmap(bmds->bs, bmds->dirty_bitmap, sector, - nr_sectors); + bdrv_reset_dirty_bitmap(bmds->dirty_bitmap, sector, nr_sectors); break; } sector += BDRV_SECTORS_PER_DIRTY_CHUNK; @@ -584,7 +583,7 @@ static int64_t get_remaining_dirty(void) int64_t dirty = 0; QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { - dirty += bdrv_get_dirty_count(bmds->bs, bmds->dirty_bitmap); + dirty += bdrv_get_dirty_count(bmds->dirty_bitmap); } return dirty << BDRV_SECTOR_BITS; diff --git a/qapi/block-core.json b/qapi/block-core.json index 78730846c2..1c17224c77 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -330,14 +330,19 @@ # # Block dirty bitmap information. # +# @name: #optional the name of the dirty bitmap (Since 2.4) +# # @count: number of dirty bytes according to the dirty bitmap # # @granularity: granularity of the dirty bitmap in bytes (since 1.4) # +# @frozen: whether the dirty bitmap is frozen (Since 2.4) +# # Since: 1.3 ## { 'type': 'BlockDirtyInfo', - 'data': {'count': 'int', 'granularity': 'int'} } + 'data': {'*name': 'str', 'count': 'int', 'granularity': 'uint32', + 'frozen': 'bool'} } ## # @BlockInfo: @@ -510,10 +515,12 @@ # # @none: only copy data written from now on # +# @dirty-bitmap: only copy data described by the dirty bitmap. 
Since: 2.4 +# # Since: 1.3 ## { 'enum': 'MirrorSyncMode', - 'data': ['top', 'full', 'none'] } + 'data': ['top', 'full', 'none', 'dirty-bitmap'] } ## # @BlockJobType: @@ -688,14 +695,18 @@ # probe if @mode is 'existing', else the format of the source # # @sync: what parts of the disk image should be copied to the destination -# (all the disk, only the sectors allocated in the topmost image, or -# only new I/O). +# (all the disk, only the sectors allocated in the topmost image, from a +# dirty bitmap, or only new I/O). # # @mode: #optional whether and how QEMU should create a new image, default is # 'absolute-paths'. # # @speed: #optional the maximum speed, in bytes per second # +# @bitmap: #optional the name of dirty bitmap if sync is "dirty-bitmap". +# Must be present if sync is "dirty-bitmap", must NOT be present +# otherwise. (Since 2.4) +# # @on-source-error: #optional the action to take on an error on the source, # default 'report'. 'stop' and 'enospc' can only be used # if the block device supports io-status (see BlockInfo). @@ -713,7 +724,7 @@ { 'type': 'DriveBackup', 'data': { 'device': 'str', 'target': 'str', '*format': 'str', 'sync': 'MirrorSyncMode', '*mode': 'NewImageMode', - '*speed': 'int', + '*speed': 'int', '*bitmap': 'str', '*on-source-error': 'BlockdevOnError', '*on-target-error': 'BlockdevOnError' } } @@ -958,6 +969,76 @@ '*on-target-error': 'BlockdevOnError' } } ## +# @BlockDirtyBitmap +# +# @node: name of device/node which the bitmap is tracking +# +# @name: name of the dirty bitmap +# +# Since 2.4 +## +{ 'type': 'BlockDirtyBitmap', + 'data': { 'node': 'str', 'name': 'str' } } + +## +# @BlockDirtyBitmapAdd +# +# @node: name of device/node which the bitmap is tracking +# +# @name: name of the dirty bitmap +# +# @granularity: #optional the bitmap granularity, default is 64k for +# block-dirty-bitmap-add +# +# Since 2.4 +## +{ 'type': 'BlockDirtyBitmapAdd', + 'data': { 'node': 'str', 'name': 'str', '*granularity': 'uint32' } } + +## +# @block-dirty-bitmap-add +# +# Create a dirty bitmap with a name on the node +# +# Returns: nothing on success +# If @node is not a valid block device or node, DeviceNotFound +# If @name is already taken, GenericError with an explanation +# +# Since 2.4 +## +{ 'command': 'block-dirty-bitmap-add', + 'data': 'BlockDirtyBitmapAdd' } + +## +# @block-dirty-bitmap-remove +# +# Remove a dirty bitmap on the node +# +# Returns: nothing on success +# If @node is not a valid block device or node, DeviceNotFound +# If @name is not found, GenericError with an explanation +# if @name is frozen by an operation, GenericError +# +# Since 2.4 +## +{ 'command': 'block-dirty-bitmap-remove', + 'data': 'BlockDirtyBitmap' } + +## +# @block-dirty-bitmap-clear +# +# Clear (reset) a dirty bitmap on the device +# +# Returns: nothing on success +# If @node is not a valid block device, DeviceNotFound +# If @name is not found, GenericError with an explanation +# +# Since 2.4 +## +{ 'command': 'block-dirty-bitmap-clear', + 'data': 'BlockDirtyBitmap' } + +## # @block_set_io_throttle: # # Change I/O throttle limits for a block drive. @@ -1310,11 +1391,14 @@ # Driver specific block device options for the null backend. # # @size: #optional size of the device in bytes. +# @latency-ns: #optional emulated latency (in nanoseconds) in processing +# requests. Default to zero which completes requests immediately. 
+# (Since 2.4) # # Since: 2.2 ## { 'type': 'BlockdevOptionsNull', - 'data': { '*size': 'int' } } + 'data': { '*size': 'int', '*latency-ns': 'uint64' } } ## # @BlockdevOptionsVVFAT @@ -1754,7 +1838,11 @@ # # Emitted when a corruption has been detected in a disk image # -# @device: device name +# @device: device name. This is always present for compatibility +# reasons, but it can be empty ("") if the image does not +# have a device name associated. +# +# @node-name: #optional node name (Since: 2.4) # # @msg: informative message for human consumption, such as the kind of # corruption being detected. It should not be parsed by machine as it is @@ -1773,11 +1861,12 @@ # Since: 1.7 ## { 'event': 'BLOCK_IMAGE_CORRUPTED', - 'data': { 'device' : 'str', - 'msg' : 'str', - '*offset': 'int', - '*size' : 'int', - 'fatal' : 'bool' } } + 'data': { 'device' : 'str', + '*node-name' : 'str', + 'msg' : 'str', + '*offset' : 'int', + '*size' : 'int', + 'fatal' : 'bool' } } ## # @BLOCK_IO_ERROR diff --git a/qemu-img.c b/qemu-img.c index 9dddfbefce..8d30e43b53 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -1305,20 +1305,312 @@ out3: return ret; } +enum ImgConvertBlockStatus { + BLK_DATA, + BLK_ZERO, + BLK_BACKING_FILE, +}; + +typedef struct ImgConvertState { + BlockBackend **src; + int64_t *src_sectors; + int src_cur, src_num; + int64_t src_cur_offset; + int64_t total_sectors; + int64_t allocated_sectors; + enum ImgConvertBlockStatus status; + int64_t sector_next_status; + BlockBackend *target; + bool has_zero_init; + bool compressed; + bool target_has_backing; + int min_sparse; + size_t cluster_sectors; + size_t buf_sectors; +} ImgConvertState; + +static void convert_select_part(ImgConvertState *s, int64_t sector_num) +{ + assert(sector_num >= s->src_cur_offset); + while (sector_num - s->src_cur_offset >= s->src_sectors[s->src_cur]) { + s->src_cur_offset += s->src_sectors[s->src_cur]; + s->src_cur++; + assert(s->src_cur < s->src_num); + } +} + +static int convert_iteration_sectors(ImgConvertState *s, int64_t sector_num) +{ + int64_t ret; + int n; + + convert_select_part(s, sector_num); + + assert(s->total_sectors > sector_num); + n = MIN(s->total_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS); + + if (s->sector_next_status <= sector_num) { + ret = bdrv_get_block_status(blk_bs(s->src[s->src_cur]), + sector_num - s->src_cur_offset, + n, &n); + if (ret < 0) { + return ret; + } + + if (ret & BDRV_BLOCK_ZERO) { + s->status = BLK_ZERO; + } else if (ret & BDRV_BLOCK_DATA) { + s->status = BLK_DATA; + } else if (!s->target_has_backing) { + /* Without a target backing file we must copy over the contents of + * the backing file as well. */ + /* TODO Check block status of the backing file chain to avoid + * needlessly reading zeroes and limiting the iteration to the + * buffer size */ + s->status = BLK_DATA; + } else { + s->status = BLK_BACKING_FILE; + } + + s->sector_next_status = sector_num + n; + } + + n = MIN(n, s->sector_next_status - sector_num); + if (s->status == BLK_DATA) { + n = MIN(n, s->buf_sectors); + } + + /* We need to write complete clusters for compressed images, so if an + * unallocated area is shorter than that, we must consider the whole + * cluster allocated. 
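 * For example (illustrative, assuming qcow2's default 64k clusters, i.e.
 * cluster_sectors == 128): a 20-sector unallocated hole is widened to a
 * full 128-sector cluster and reported as BLK_DATA (provided at least one
 * cluster remains before the end of the image), while a 300-sector run is
 * rounded down to 256 sectors and the tail handled in the next iteration.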
*/ + if (s->compressed) { + if (n < s->cluster_sectors) { + n = MIN(s->cluster_sectors, s->total_sectors - sector_num); + s->status = BLK_DATA; + } else { + n = QEMU_ALIGN_DOWN(n, s->cluster_sectors); + } + } + + return n; +} + +static int convert_read(ImgConvertState *s, int64_t sector_num, int nb_sectors, + uint8_t *buf) +{ + int n; + int ret; + + if (s->status == BLK_ZERO || s->status == BLK_BACKING_FILE) { + return 0; + } + + assert(nb_sectors <= s->buf_sectors); + while (nb_sectors > 0) { + BlockBackend *blk; + int64_t bs_sectors; + + /* In the case of compression with multiple source files, we can get a + * nb_sectors that spreads into the next part. So we must be able to + * read across multiple BDSes for one convert_read() call. */ + convert_select_part(s, sector_num); + blk = s->src[s->src_cur]; + bs_sectors = s->src_sectors[s->src_cur]; + + n = MIN(nb_sectors, bs_sectors - (sector_num - s->src_cur_offset)); + ret = blk_read(blk, sector_num - s->src_cur_offset, buf, n); + if (ret < 0) { + return ret; + } + + sector_num += n; + nb_sectors -= n; + buf += n * BDRV_SECTOR_SIZE; + } + + return 0; +} + +static int convert_write(ImgConvertState *s, int64_t sector_num, int nb_sectors, + const uint8_t *buf) +{ + int ret; + + while (nb_sectors > 0) { + int n = nb_sectors; + + switch (s->status) { + case BLK_BACKING_FILE: + /* If we have a backing file, leave clusters unallocated that are + * unallocated in the source image, so that the backing file is + * visible at the respective offset. */ + assert(s->target_has_backing); + break; + + case BLK_DATA: + /* We must always write compressed clusters as a whole, so don't + * try to find zeroed parts in the buffer. We can only save the + * write if the buffer is completely zeroed and we're allowed to + * keep the target sparse. */ + if (s->compressed) { + if (s->has_zero_init && s->min_sparse && + buffer_is_zero(buf, n * BDRV_SECTOR_SIZE)) + { + assert(!s->target_has_backing); + break; + } + + ret = blk_write_compressed(s->target, sector_num, buf, n); + if (ret < 0) { + return ret; + } + break; + } + + /* If there is real non-zero data or we're told to keep the target + * fully allocated (-S 0), we must write it. Otherwise we can treat + * it as zero sectors. */ + if (!s->min_sparse || + is_allocated_sectors_min(buf, n, &n, s->min_sparse)) + { + ret = blk_write(s->target, sector_num, buf, n); + if (ret < 0) { + return ret; + } + break; + } + /* fall-through */ + + case BLK_ZERO: + if (s->has_zero_init) { + break; + } + ret = blk_write_zeroes(s->target, sector_num, n, 0); + if (ret < 0) { + return ret; + } + break; + } + + sector_num += n; + nb_sectors -= n; + buf += n * BDRV_SECTOR_SIZE; + } + + return 0; +} + +static int convert_do_copy(ImgConvertState *s) +{ + uint8_t *buf = NULL; + int64_t sector_num, allocated_done; + int ret; + int n; + + /* Check whether we have zero initialisation or can get it efficiently */ + s->has_zero_init = s->min_sparse && !s->target_has_backing + ? bdrv_has_zero_init(blk_bs(s->target)) + : false; + + if (!s->has_zero_init && !s->target_has_backing && + bdrv_can_write_zeroes_with_unmap(blk_bs(s->target))) + { + ret = bdrv_make_zero(blk_bs(s->target), BDRV_REQ_MAY_UNMAP); + if (ret == 0) { + s->has_zero_init = true; + } + } + + /* Allocate buffer for copied data. For compressed images, only one cluster + * can be copied at a time. 
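 * (Illustrative arithmetic: with the default 2 MB I/O buffer, buf_sectors
 * starts out at 4096; for compressed qcow2 output with 64k clusters it is
 * shrunk here to cluster_sectors == 128, i.e. a 64 kB buffer.)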
*/ + if (s->compressed) { + if (s->cluster_sectors <= 0 || s->cluster_sectors > s->buf_sectors) { + error_report("invalid cluster size"); + ret = -EINVAL; + goto fail; + } + s->buf_sectors = s->cluster_sectors; + } + buf = blk_blockalign(s->target, s->buf_sectors * BDRV_SECTOR_SIZE); + + /* Calculate allocated sectors for progress */ + s->allocated_sectors = 0; + sector_num = 0; + while (sector_num < s->total_sectors) { + n = convert_iteration_sectors(s, sector_num); + if (n < 0) { + ret = n; + goto fail; + } + if (s->status == BLK_DATA) { + s->allocated_sectors += n; + } + sector_num += n; + } + + /* Do the copy */ + s->src_cur = 0; + s->src_cur_offset = 0; + s->sector_next_status = 0; + + sector_num = 0; + allocated_done = 0; + + while (sector_num < s->total_sectors) { + n = convert_iteration_sectors(s, sector_num); + if (n < 0) { + ret = n; + goto fail; + } + if (s->status == BLK_DATA) { + allocated_done += n; + qemu_progress_print(100.0 * allocated_done / s->allocated_sectors, + 0); + } + + ret = convert_read(s, sector_num, n, buf); + if (ret < 0) { + error_report("error while reading sector %" PRId64 + ": %s", sector_num, strerror(-ret)); + goto fail; + } + + ret = convert_write(s, sector_num, n, buf); + if (ret < 0) { + error_report("error while writing sector %" PRId64 + ": %s", sector_num, strerror(-ret)); + goto fail; + } + + sector_num += n; + } + + if (s->compressed) { + /* signal EOF to align */ + ret = blk_write_compressed(s->target, 0, NULL, 0); + if (ret < 0) { + goto fail; + } + } + + ret = 0; +fail: + qemu_vfree(buf); + return ret; +} + static int img_convert(int argc, char **argv) { - int c, n, n1, bs_n, bs_i, compress, cluster_sectors, skip_create; + int c, bs_n, bs_i, compress, cluster_sectors, skip_create; int64_t ret = 0; int progress = 0, flags, src_flags; const char *fmt, *out_fmt, *cache, *src_cache, *out_baseimg, *out_filename; BlockDriver *drv, *proto_drv; BlockBackend **blk = NULL, *out_blk = NULL; BlockDriverState **bs = NULL, *out_bs = NULL; - int64_t total_sectors, nb_sectors, sector_num, bs_offset; + int64_t total_sectors; int64_t *bs_sectors = NULL; - uint8_t * buf = NULL; size_t bufsectors = IO_BUF_SIZE / BDRV_SECTOR_SIZE; - const uint8_t *buf1; BlockDriverInfo bdi; QemuOpts *opts = NULL; QemuOptsList *create_opts = NULL; @@ -1329,6 +1621,7 @@ static int img_convert(int argc, char **argv) bool quiet = false; Error *local_err = NULL; QemuOpts *sn_opts = NULL; + ImgConvertState state; fmt = NULL; out_fmt = "raw"; @@ -1627,9 +1920,6 @@ static int img_convert(int argc, char **argv) } out_bs = blk_bs(out_blk); - bs_i = 0; - bs_offset = 0; - /* increase bufsectors from the default 4096 (2M) if opt_transfer_length * or discard_alignment of the out_bs is greater. Limit to 32768 (16MB) * as maximum. 
*/ @@ -1638,8 +1928,6 @@ static int img_convert(int argc, char **argv) out_bs->bl.discard_alignment)) ); - buf = blk_blockalign(out_blk, bufsectors * BDRV_SECTOR_SIZE); - if (skip_create) { int64_t output_sectors = blk_nb_sectors(out_blk); if (output_sectors < 0) { @@ -1666,203 +1954,20 @@ static int img_convert(int argc, char **argv) cluster_sectors = bdi.cluster_size / BDRV_SECTOR_SIZE; } - if (compress) { - if (cluster_sectors <= 0 || cluster_sectors > bufsectors) { - error_report("invalid cluster size"); - ret = -1; - goto out; - } - sector_num = 0; - - nb_sectors = total_sectors; - - for(;;) { - int64_t bs_num; - int remainder; - uint8_t *buf2; - - nb_sectors = total_sectors - sector_num; - if (nb_sectors <= 0) - break; - if (nb_sectors >= cluster_sectors) - n = cluster_sectors; - else - n = nb_sectors; - - bs_num = sector_num - bs_offset; - assert (bs_num >= 0); - remainder = n; - buf2 = buf; - while (remainder > 0) { - int nlow; - while (bs_num == bs_sectors[bs_i]) { - bs_offset += bs_sectors[bs_i]; - bs_i++; - assert (bs_i < bs_n); - bs_num = 0; - /* printf("changing part: sector_num=%" PRId64 ", " - "bs_i=%d, bs_offset=%" PRId64 ", bs_sectors=%" PRId64 - "\n", sector_num, bs_i, bs_offset, bs_sectors[bs_i]); */ - } - assert (bs_num < bs_sectors[bs_i]); - - nlow = remainder > bs_sectors[bs_i] - bs_num - ? bs_sectors[bs_i] - bs_num : remainder; - - ret = blk_read(blk[bs_i], bs_num, buf2, nlow); - if (ret < 0) { - error_report("error while reading sector %" PRId64 ": %s", - bs_num, strerror(-ret)); - goto out; - } - - buf2 += nlow * 512; - bs_num += nlow; - - remainder -= nlow; - } - assert (remainder == 0); - - if (!buffer_is_zero(buf, n * BDRV_SECTOR_SIZE)) { - ret = blk_write_compressed(out_blk, sector_num, buf, n); - if (ret != 0) { - error_report("error while compressing sector %" PRId64 - ": %s", sector_num, strerror(-ret)); - goto out; - } - } - sector_num += n; - qemu_progress_print(100.0 * sector_num / total_sectors, 0); - } - /* signal EOF to align */ - blk_write_compressed(out_blk, 0, NULL, 0); - } else { - int64_t sectors_to_read, sectors_read, sector_num_next_status; - bool count_allocated_sectors; - int has_zero_init = min_sparse ? bdrv_has_zero_init(out_bs) : 0; - - if (!has_zero_init && bdrv_can_write_zeroes_with_unmap(out_bs)) { - ret = bdrv_make_zero(out_bs, BDRV_REQ_MAY_UNMAP); - if (ret < 0) { - goto out; - } - has_zero_init = 1; - } - - sectors_to_read = total_sectors; - count_allocated_sectors = progress && (out_baseimg || has_zero_init); -restart: - sector_num = 0; // total number of sectors converted so far - sectors_read = 0; - sector_num_next_status = 0; - - for(;;) { - nb_sectors = total_sectors - sector_num; - if (nb_sectors <= 0) { - if (count_allocated_sectors) { - sectors_to_read = sectors_read; - count_allocated_sectors = false; - goto restart; - } - ret = 0; - break; - } - - while (sector_num - bs_offset >= bs_sectors[bs_i]) { - bs_offset += bs_sectors[bs_i]; - bs_i ++; - assert (bs_i < bs_n); - /* printf("changing part: sector_num=%" PRId64 ", bs_i=%d, " - "bs_offset=%" PRId64 ", bs_sectors=%" PRId64 "\n", - sector_num, bs_i, bs_offset, bs_sectors[bs_i]); */ - } - - if ((out_baseimg || has_zero_init) && - sector_num >= sector_num_next_status) { - n = nb_sectors > INT_MAX ? 
INT_MAX : nb_sectors; - ret = bdrv_get_block_status(bs[bs_i], sector_num - bs_offset, - n, &n1); - if (ret < 0) { - error_report("error while reading block status of sector %" - PRId64 ": %s", sector_num - bs_offset, - strerror(-ret)); - goto out; - } - /* If the output image is zero initialized, we are not working - * on a shared base and the input is zero we can skip the next - * n1 sectors */ - if (has_zero_init && !out_baseimg && (ret & BDRV_BLOCK_ZERO)) { - sector_num += n1; - continue; - } - /* If the output image is being created as a copy on write - * image, assume that sectors which are unallocated in the - * input image are present in both the output's and input's - * base images (no need to copy them). */ - if (out_baseimg) { - if (!(ret & BDRV_BLOCK_DATA)) { - sector_num += n1; - continue; - } - /* The next 'n1' sectors are allocated in the input image. - * Copy only those as they may be followed by unallocated - * sectors. */ - nb_sectors = n1; - } - /* avoid redundant callouts to get_block_status */ - sector_num_next_status = sector_num + n1; - } - - n = MIN(nb_sectors, bufsectors); - - /* round down request length to an aligned sector, but - * do not bother doing this on short requests. They happen - * when we found an all-zero area, and the next sector to - * write will not be sector_num + n. */ - if (cluster_sectors > 0 && n >= cluster_sectors) { - int64_t next_aligned_sector = (sector_num + n); - next_aligned_sector -= next_aligned_sector % cluster_sectors; - if (sector_num + n > next_aligned_sector) { - n = next_aligned_sector - sector_num; - } - } - - n = MIN(n, bs_sectors[bs_i] - (sector_num - bs_offset)); - - sectors_read += n; - if (count_allocated_sectors) { - sector_num += n; - continue; - } + state = (ImgConvertState) { + .src = blk, + .src_sectors = bs_sectors, + .src_num = bs_n, + .total_sectors = total_sectors, + .target = out_blk, + .compressed = compress, + .target_has_backing = (bool) out_baseimg, + .min_sparse = min_sparse, + .cluster_sectors = cluster_sectors, + .buf_sectors = bufsectors, + }; + ret = convert_do_copy(&state); - n1 = n; - ret = blk_read(blk[bs_i], sector_num - bs_offset, buf, n); - if (ret < 0) { - error_report("error while reading sector %" PRId64 ": %s", - sector_num - bs_offset, strerror(-ret)); - goto out; - } - /* NOTE: at the same time we convert, we do not write zero - sectors to have a chance to compress the image. Ideally, we - should add a specific call to have the info to go faster */ - buf1 = buf; - while (n > 0) { - if (!has_zero_init || - is_allocated_sectors_min(buf1, n, &n1, min_sparse)) { - ret = blk_write(out_blk, sector_num, buf1, n1); - if (ret < 0) { - error_report("error while writing sector %" PRId64 - ": %s", sector_num, strerror(-ret)); - goto out; - } - } - sector_num += n1; - n -= n1; - buf1 += n1 * 512; - } - qemu_progress_print(100.0 * sectors_read / sectors_to_read, 0); - } - } out: if (!ret) { qemu_progress_print(100, 0); @@ -1870,7 +1975,6 @@ out: qemu_progress_end(); qemu_opts_del(opts); qemu_opts_free(create_opts); - qemu_vfree(buf); qemu_opts_del(sn_opts); blk_unref(out_blk); g_free(bs); diff --git a/qmp-commands.hx b/qmp-commands.hx index 1e59541665..213508fe5d 100644 --- a/qmp-commands.hx +++ b/qmp-commands.hx @@ -1007,6 +1007,43 @@ EQMP .mhandler.cmd_new = qmp_marshal_input_block_stream, }, +SQMP +block-stream +------------ + +Copy data from a backing file into a block device. 
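block-stream starts a long-running background job; a client can tell when streaming
has finished by watching for the job-completion event. A sketch of what that looks
like on the wire (all field values are illustrative):

<- { "event": "BLOCK_JOB_COMPLETED",
     "data": { "type": "stream", "device": "virtio0",
               "len": 10737418240, "offset": 10737418240, "speed": 0 },
     "timestamp": { "seconds": 1434543000, "microseconds": 0 } }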
+ +Arguments: + +- "device": The device's ID, must be unique (json-string) +- "base": The file name of the backing image above which copying starts + (json-string, optional) +- "backing-file": The backing file string to write into the active layer. This + filename is not validated. + + If a pathname string is such that it cannot be resolved by + QEMU, that means that subsequent QMP or HMP commands must use + node-names for the image in question, as filename lookup + methods will fail. + + If not specified, QEMU will automatically determine the + backing file string to use, or error out if there is no + obvious choice. Care should be taken when specifying the + string, to specify a valid filename or protocol. + (json-string, optional) (Since 2.1) +- "speed": the maximum speed, in bytes per second (json-int, optional) +- "on-error": the action to take on an error (default 'report'). 'stop' and + 'enospc' can only be used if the block device supports io-status. + (json-string, optional) (Since 2.1) + +Example: + +-> { "execute": "block-stream", "arguments": { "device": "virtio0", + "base": "/tmp/master.qcow2" } } +<- { "return": {} } + +EQMP + { .name = "block-commit", .args_type = "device:B,base:s?,top:s?,backing-file:s?,speed:o?", @@ -1073,7 +1110,7 @@ EQMP { .name = "drive-backup", .args_type = "sync:s,device:B,target:s,speed:i?,mode:s?,format:s?," - "on-source-error:s?,on-target-error:s?", + "bitmap:s?,on-source-error:s?,on-target-error:s?", .mhandler.cmd_new = qmp_marshal_input_drive_backup, }, @@ -1100,8 +1137,10 @@ Arguments: (json-string, optional) - "sync": what parts of the disk image should be copied to the destination; possibilities include "full" for all the disk, "top" for only the sectors - allocated in the topmost image, or "none" to only replicate new I/O - (MirrorSyncMode). + allocated in the topmost image, "dirty-bitmap" for only the dirty sectors in + the bitmap, or "none" to only replicate new I/O (MirrorSyncMode). +- "bitmap": dirty bitmap name for sync==dirty-bitmap. Must be present if sync + is "dirty-bitmap", must NOT be present otherwise. - "mode": whether and how QEMU should create a new image (NewImageMode, optional, default 'absolute-paths') - "speed": the maximum speed, in bytes per second (json-int, optional) @@ -1269,6 +1308,91 @@ Example: EQMP { + .name = "block-dirty-bitmap-add", + .args_type = "node:B,name:s,granularity:i?", + .mhandler.cmd_new = qmp_marshal_input_block_dirty_bitmap_add, + }, + +SQMP + +block-dirty-bitmap-add +---------------------- +Since 2.4 + +Create a dirty bitmap with a name on the device, and start tracking the writes. + +Arguments: + +- "node": device/node on which to create dirty bitmap (json-string) +- "name": name of the new dirty bitmap (json-string) +- "granularity": granularity to track writes with (int, optional) + +Example: + +-> { "execute": "block-dirty-bitmap-add", "arguments": { "node": "drive0", + "name": "bitmap0" } } +<- { "return": {} } + +EQMP + + { + .name = "block-dirty-bitmap-remove", + .args_type = "node:B,name:s", + .mhandler.cmd_new = qmp_marshal_input_block_dirty_bitmap_remove, + }, + +SQMP + +block-dirty-bitmap-remove +------------------------- +Since 2.4 + +Stop write tracking and remove the dirty bitmap that was created with +block-dirty-bitmap-add. 
+ +Arguments: + +- "node": device/node on which to remove dirty bitmap (json-string) +- "name": name of the dirty bitmap to remove (json-string) + +Example: + +-> { "execute": "block-dirty-bitmap-remove", "arguments": { "node": "drive0", + "name": "bitmap0" } } +<- { "return": {} } + +EQMP + + { + .name = "block-dirty-bitmap-clear", + .args_type = "node:B,name:s", + .mhandler.cmd_new = qmp_marshal_input_block_dirty_bitmap_clear, + }, + +SQMP + +block-dirty-bitmap-clear +------------------------ +Since 2.4 + +Reset the dirty bitmap associated with a node so that an incremental backup +from this point in time forward will only backup clusters modified after this +clear operation. + +Arguments: + +- "node": device/node on which to remove dirty bitmap (json-string) +- "name": name of the dirty bitmap to remove (json-string) + +Example: + +-> { "execute": "block-dirty-bitmap-clear", "arguments": { "node": "drive0", + "name": "bitmap0" } } +<- { "return": {} } + +EQMP + + { .name = "blockdev-snapshot-sync", .args_type = "device:s?,node-name:s?,snapshot-file:s,snapshot-node-name:s?,format:s?,mode:s?", .mhandler.cmd_new = qmp_marshal_input_blockdev_snapshot_sync, diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 5df61f9aa9..7f0aae977d 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2911,6 +2911,17 @@ sub process { if ($rawline =~ /\b(?:Qemu|QEmu)\b/) { WARN("use QEMU instead of Qemu or QEmu\n" . $herecurr); } + +# check for non-portable ffs() calls that have portable alternatives in QEMU + if ($line =~ /\bffs\(/) { + ERROR("use ctz32() instead of ffs()\n" . $herecurr); + } + if ($line =~ /\bffsl\(/) { + ERROR("use ctz32() or ctz64() instead of ffsl()\n" . $herecurr); + } + if ($line =~ /\bffsll\(/) { + ERROR("use ctz64() instead of ffsll()\n" . 
$herecurr); + } } # If we have no input at all, then there is nothing to report on diff --git a/scripts/qemu-gdb.py b/scripts/qemu-gdb.py index 8a0f30534f..6c7f4fbe53 100644 --- a/scripts/qemu-gdb.py +++ b/scripts/qemu-gdb.py @@ -22,12 +22,86 @@ def isnull(ptr): def int128(p): return long(p['lo']) + (long(p['hi']) << 64) +def get_fs_base(): + '''Fetch %fs base value using arch_prctl(ARCH_GET_FS)''' + # %rsp - 120 is scratch space according to the SystemV ABI + old = gdb.parse_and_eval('*(uint64_t*)($rsp - 120)') + gdb.execute('call arch_prctl(0x1003, $rsp - 120)', False, True) + fs_base = gdb.parse_and_eval('*(uint64_t*)($rsp - 120)') + gdb.execute('set *(uint64_t*)($rsp - 120) = %s' % old, False, True) + return fs_base + +def get_glibc_pointer_guard(): + '''Fetch glibc pointer guard value''' + fs_base = get_fs_base() + return gdb.parse_and_eval('*(uint64_t*)((uint64_t)%s + 0x30)' % fs_base) + +def glibc_ptr_demangle(val, pointer_guard): + '''Undo effect of glibc's PTR_MANGLE()''' + return gdb.parse_and_eval('(((uint64_t)%s >> 0x11) | ((uint64_t)%s << (64 - 0x11))) ^ (uint64_t)%s' % (val, val, pointer_guard)) + +def bt_jmpbuf(jmpbuf): + '''Backtrace a jmpbuf''' + JB_RBX = 0 + JB_RBP = 1 + JB_R12 = 2 + JB_R13 = 3 + JB_R14 = 4 + JB_R15 = 5 + JB_RSP = 6 + JB_PC = 7 + + old_rbx = gdb.parse_and_eval('(uint64_t)$rbx') + old_rbp = gdb.parse_and_eval('(uint64_t)$rbp') + old_rsp = gdb.parse_and_eval('(uint64_t)$rsp') + old_r12 = gdb.parse_and_eval('(uint64_t)$r12') + old_r13 = gdb.parse_and_eval('(uint64_t)$r13') + old_r14 = gdb.parse_and_eval('(uint64_t)$r14') + old_r15 = gdb.parse_and_eval('(uint64_t)$r15') + old_rip = gdb.parse_and_eval('(uint64_t)$rip') + + pointer_guard = get_glibc_pointer_guard() + gdb.execute('set $rbx = %s' % jmpbuf[JB_RBX]) + gdb.execute('set $rbp = %s' % glibc_ptr_demangle(jmpbuf[JB_RBP], pointer_guard)) + gdb.execute('set $rsp = %s' % glibc_ptr_demangle(jmpbuf[JB_RSP], pointer_guard)) + gdb.execute('set $r12 = %s' % jmpbuf[JB_R12]) + gdb.execute('set $r13 = %s' % jmpbuf[JB_R13]) + gdb.execute('set $r14 = %s' % jmpbuf[JB_R14]) + gdb.execute('set $r15 = %s' % jmpbuf[JB_R15]) + gdb.execute('set $rip = %s' % glibc_ptr_demangle(jmpbuf[JB_PC], pointer_guard)) + + gdb.execute('bt') + + gdb.execute('set $rbx = %s' % old_rbx) + gdb.execute('set $rbp = %s' % old_rbp) + gdb.execute('set $rsp = %s' % old_rsp) + gdb.execute('set $r12 = %s' % old_r12) + gdb.execute('set $r13 = %s' % old_r13) + gdb.execute('set $r14 = %s' % old_r14) + gdb.execute('set $r15 = %s' % old_r15) + gdb.execute('set $rip = %s' % old_rip) + class QemuCommand(gdb.Command): '''Prefix for QEMU debug support commands''' def __init__(self): gdb.Command.__init__(self, 'qemu', gdb.COMMAND_DATA, gdb.COMPLETE_NONE, True) +class CoroutineCommand(gdb.Command): + '''Display coroutine backtrace''' + def __init__(self): + gdb.Command.__init__(self, 'qemu coroutine', gdb.COMMAND_DATA, + gdb.COMPLETE_NONE) + + def invoke(self, arg, from_tty): + argv = gdb.string_to_argv(arg) + if len(argv) != 1: + gdb.write('usage: qemu coroutine <coroutine-pointer>\n') + return + + coroutine_pointer = gdb.parse_and_eval(argv[0]).cast(gdb.lookup_type('CoroutineUContext').pointer()) + bt_jmpbuf(coroutine_pointer['env']['__jmpbuf']) + class MtreeCommand(gdb.Command): '''Display the memory tree hierarchy''' def __init__(self): @@ -86,4 +160,5 @@ class MtreeCommand(gdb.Command): subregion = subregion['subregions_link']['tqe_next'] QemuCommand() +CoroutineCommand() MtreeCommand() diff --git a/scripts/qmp/qmp.py b/scripts/qmp/qmp.py index 
20b6ec795e..1d38e3e9e7 100644 --- a/scripts/qmp/qmp.py +++ b/scripts/qmp/qmp.py @@ -21,6 +21,9 @@ class QMPConnectError(QMPError): class QMPCapabilitiesError(QMPError): pass +class QMPTimeoutError(QMPError): + pass + class QEMUMonitorProtocol: def __init__(self, address, server=False): """ @@ -72,6 +75,44 @@ class QEMUMonitorProtocol: error = socket.error + def __get_events(self, wait=False): + """ + Check for new events in the stream and cache them in __events. + + @param wait (bool): block until an event is available. + @param wait (float): If wait is a float, treat it as a timeout value. + + @raise QMPTimeoutError: If a timeout float is provided and the timeout + period elapses. + @raise QMPConnectError: If wait is True but no events could be retrieved + or if some other error occurred. + """ + + # Check for new events regardless and pull them into the cache: + self.__sock.setblocking(0) + try: + self.__json_read() + except socket.error, err: + if err[0] == errno.EAGAIN: + # No data available + pass + self.__sock.setblocking(1) + + # Wait for new events, if needed. + # if wait is 0.0, this means "no wait" and is also implicitly false. + if not self.__events and wait: + if isinstance(wait, float): + self.__sock.settimeout(wait) + try: + ret = self.__json_read(only_event=True) + except socket.timeout: + raise QMPTimeoutError("Timeout waiting for event") + except: + raise QMPConnectError("Error while reading from socket") + if ret is None: + raise QMPConnectError("Error while reading from socket") + self.__sock.settimeout(None) + def connect(self, negotiate=True): """ Connect to the QMP Monitor and perform capabilities negotiation. @@ -140,43 +181,37 @@ class QEMUMonitorProtocol: """ Get and delete the first available QMP event. - @param wait: block until an event is available (bool) + @param wait (bool): block until an event is available. + @param wait (float): If wait is a float, treat it as a timeout value. + + @raise QMPTimeoutError: If a timeout float is provided and the timeout + period elapses. + @raise QMPConnectError: If wait is True but no events could be retrieved + or if some other error occurred. + + @return The first available QMP event, or None. """ - self.__sock.setblocking(0) - try: - self.__json_read() - except socket.error, err: - if err[0] == errno.EAGAIN: - # No data available - pass - self.__sock.setblocking(1) - if not self.__events and wait: - self.__json_read(only_event=True) - event = self.__events[0] - del self.__events[0] - return event + self.__get_events(wait) + + if self.__events: + return self.__events.pop(0) + return None def get_events(self, wait=False): """ Get a list of available QMP events. - @param wait: block until an event is available (bool) - """ - self.__sock.setblocking(0) - try: - self.__json_read() - except socket.error, err: - if err[0] == errno.EAGAIN: - # No data available - pass - self.__sock.setblocking(1) - if not self.__events and wait: - ret = self.__json_read(only_event=True) - if ret == None: - # We are in blocking mode, if don't get anything, something - # went wrong - raise QMPConnectError("Error while reading from socket") + @param wait (bool): block until an event is available. + @param wait (float): If wait is a float, treat it as a timeout value. + @raise QMPTimeoutError: If a timeout float is provided and the timeout + period elapses. + @raise QMPConnectError: If wait is True but no events could be retrieved + or if some other error occurred. + + @return The list of available QMP events. 
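        Example (illustrative, with mon a connected QEMUMonitorProtocol):
        mon.get_events(wait=10.0) blocks for up to ten seconds and raises
        QMPTimeoutError if no event arrives within that time.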
+ """ + self.__get_events(wait) return self.__events def clear_events(self): diff --git a/target-ppc/cpu.h b/target-ppc/cpu.h index f15815f11b..c05c503305 100644 --- a/target-ppc/cpu.h +++ b/target-ppc/cpu.h @@ -2251,8 +2251,8 @@ static inline ppcmas_tlb_t *booke206_get_tlbm(CPUPPCState *env, const int tlbn, { int r; uint32_t ways = booke206_tlb_ways(env, tlbn); - int ways_bits = ffs(ways) - 1; - int tlb_bits = ffs(booke206_tlb_size(env, tlbn)) - 1; + int ways_bits = ctz32(ways); + int tlb_bits = ctz32(booke206_tlb_size(env, tlbn)); int i; way &= ways - 1; diff --git a/tests/qemu-iotests/122 b/tests/qemu-iotests/122 new file mode 100755 index 0000000000..350ca9c466 --- /dev/null +++ b/tests/qemu-iotests/122 @@ -0,0 +1,223 @@ +#!/bin/bash +# +# Test some qemu-img convert cases +# +# Copyright (C) 2015 Red Hat, Inc. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +# + +# creator +owner=kwolf@redhat.com + +seq="$(basename $0)" +echo "QA output created by $seq" + +here="$PWD" +tmp=/tmp/$$ +status=1 # failure is the default! + +_cleanup() +{ + rm -f "$TEST_IMG".[123] + _cleanup_test_img +} +trap "_cleanup; exit \$status" 0 1 2 3 15 + +# get standard environment, filters and checks +. ./common.rc +. 
./common.filter + +_supported_fmt qcow2 +_supported_proto file +_supported_os Linux + + +TEST_IMG="$TEST_IMG".base _make_test_img 64M +$QEMU_IO -c "write -P 0x11 0 64M" "$TEST_IMG".base 2>&1 | _filter_qemu_io | _filter_testdir + + +echo +echo "=== Check allocation status regression with -B ===" +echo + +_make_test_img -b "$TEST_IMG".base +$QEMU_IO -c "write -P 0x22 0 3M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IMG convert -O $IMGFMT -B "$TEST_IMG".base "$TEST_IMG" "$TEST_IMG".orig +$QEMU_IMG map "$TEST_IMG".orig | _filter_qemu_img_map + + +echo +echo "=== Check that zero clusters are kept in overlay ===" +echo + +_make_test_img -b "$TEST_IMG".base + +$QEMU_IO -c "write -P 0 0 3M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IMG convert -O $IMGFMT -B "$TEST_IMG".base "$TEST_IMG" "$TEST_IMG".orig +$QEMU_IO -c "read -P 0 0 3M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IMG convert -O $IMGFMT -c -B "$TEST_IMG".base "$TEST_IMG" "$TEST_IMG".orig +$QEMU_IO -c "read -P 0 0 3M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir + +$QEMU_IO -c "write -z 0 3M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IMG convert -O $IMGFMT -B "$TEST_IMG".base "$TEST_IMG" "$TEST_IMG".orig +$QEMU_IO -c "read -P 0 0 3M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IMG convert -O $IMGFMT -c -B "$TEST_IMG".base "$TEST_IMG" "$TEST_IMG".orig +$QEMU_IO -c "read -P 0 0 3M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir + + +echo +echo "=== Concatenate multiple source images ===" +echo + +TEST_IMG="$TEST_IMG".1 _make_test_img 4M +TEST_IMG="$TEST_IMG".2 _make_test_img 4M +TEST_IMG="$TEST_IMG".3 _make_test_img 4M + +$QEMU_IO -c "write -P 0x11 0 64k" "$TEST_IMG".1 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "write -P 0x22 0 64k" "$TEST_IMG".2 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "write -P 0x33 0 64k" "$TEST_IMG".3 2>&1 | _filter_qemu_io | _filter_testdir + +$QEMU_IMG convert -O $IMGFMT "$TEST_IMG".[123] "$TEST_IMG" +$QEMU_IMG map "$TEST_IMG" | _filter_qemu_img_map +$QEMU_IO -c "read -P 0x11 0 64k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0x22 4M 64k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0x33 8M 64k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir + +$QEMU_IMG convert -c -O $IMGFMT "$TEST_IMG".[123] "$TEST_IMG" +$QEMU_IMG map --output=json "$TEST_IMG" | _filter_qemu_img_map +$QEMU_IO -c "read -P 0x11 0 64k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0x22 4M 64k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0x33 8M 64k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir + +# -B can't be combined with concatenation +$QEMU_IMG convert -O $IMGFMT -B "$TEST_IMG".base "$TEST_IMG".[123] "$TEST_IMG" +$QEMU_IMG convert -O $IMGFMT -c -B "$TEST_IMG".base "$TEST_IMG".[123] "$TEST_IMG" + + +echo +echo "=== Compression with misaligned allocations and image sizes ===" +echo + +TEST_IMG="$TEST_IMG".1 _make_test_img 1023k -o cluster_size=1024 +TEST_IMG="$TEST_IMG".2 _make_test_img 1023k -o cluster_size=1024 + +$QEMU_IO -c "write -P 0x11 16k 16k" "$TEST_IMG".1 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "write -P 0x22 130k 130k" "$TEST_IMG".1 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "write -P 0x33 1022k 1k" "$TEST_IMG".1 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "write -P 0x44 0k 1k" "$TEST_IMG".2 2>&1 | _filter_qemu_io | _filter_testdir 
+ +$QEMU_IMG convert -c -O $IMGFMT "$TEST_IMG".[12] "$TEST_IMG" +$QEMU_IMG map --output=json "$TEST_IMG" | _filter_qemu_img_map +$QEMU_IO -c "read -P 0 0k 16k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0x11 16k 16k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0 32k 98k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0x22 130k 130k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0 260k 762k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0x33 1022k 1k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0x44 1023k 1k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0 1024k 1022k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir + + +echo +echo "=== Full allocation with -S 0 ===" +echo + +# Standalone image +_make_test_img 64M +$QEMU_IO -c "write -P 0x22 0 3M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "write -P 0 3M 3M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir + +echo +echo convert -S 0: +$QEMU_IMG convert -O $IMGFMT -S 0 "$TEST_IMG" "$TEST_IMG".orig +$QEMU_IO -c "read -P 0x22 0 3M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0 3M 61M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IMG map --output=json "$TEST_IMG".orig | _filter_qemu_img_map + +echo +echo convert -c -S 0: +$QEMU_IMG convert -O $IMGFMT -c -S 0 "$TEST_IMG" "$TEST_IMG".orig +$QEMU_IO -c "read -P 0x22 0 3M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0 3M 61M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IMG map --output=json "$TEST_IMG".orig | _filter_qemu_img_map + +# With backing file +TEST_IMG="$TEST_IMG".base _make_test_img 64M +$QEMU_IO -c "write -P 0x11 0 32M" "$TEST_IMG".base 2>&1 | _filter_qemu_io | _filter_testdir + +_make_test_img -b "$TEST_IMG".base 64M +$QEMU_IO -c "write -P 0x22 0 3M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir + +echo +echo convert -S 0 with source backing file: +$QEMU_IMG convert -O $IMGFMT -S 0 "$TEST_IMG" "$TEST_IMG".orig +$QEMU_IO -c "read -P 0x22 0 3M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0x11 3M 29M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0 32M 32M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IMG map --output=json "$TEST_IMG".orig | _filter_qemu_img_map + +echo +echo convert -c -S 0 with source backing file: +$QEMU_IMG convert -O $IMGFMT -c -S 0 "$TEST_IMG" "$TEST_IMG".orig +$QEMU_IO -c "read -P 0x22 0 3M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0x11 3M 29M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0 32M 32M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IMG map --output=json "$TEST_IMG".orig | _filter_qemu_img_map + +# With keeping the backing file +echo +echo convert -S 0 -B ... +$QEMU_IMG convert -O $IMGFMT -S 0 "$TEST_IMG" "$TEST_IMG".orig +$QEMU_IO -c "read -P 0x22 0 3M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0x11 3M 29M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0 32M 32M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IMG map --output=json "$TEST_IMG".orig | _filter_qemu_img_map + +echo +echo convert -c -S 0 -B ... 
+$QEMU_IMG convert -O $IMGFMT -c -S 0 "$TEST_IMG" "$TEST_IMG".orig +$QEMU_IO -c "read -P 0x22 0 3M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0x11 3M 29M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0 32M 32M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IMG map --output=json "$TEST_IMG".orig | _filter_qemu_img_map + + +echo +echo "=== Non-zero -S ===" +echo + +_make_test_img 64M -o cluster_size=1k +$QEMU_IO -c "write -P 0 0 64k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "write 0 1k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "write 8k 1k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "write 17k 1k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir + +for min_sparse in 4k 8k; do + echo + echo convert -S $min_sparse + $QEMU_IMG convert -O $IMGFMT -o cluster_size=1k -S $min_sparse "$TEST_IMG" "$TEST_IMG".orig + $QEMU_IMG map --output=json "$TEST_IMG".orig | _filter_qemu_img_map + + echo + echo convert -c -S $min_sparse + # For compressed images, -S values other than 0 are ignored + $QEMU_IMG convert -O $IMGFMT -o cluster_size=1k -c -S $min_sparse "$TEST_IMG" "$TEST_IMG".orig + $QEMU_IMG map --output=json "$TEST_IMG".orig | _filter_qemu_img_map +done + +# success, all done +echo '*** done' +rm -f $seq.full +status=0 diff --git a/tests/qemu-iotests/122.out b/tests/qemu-iotests/122.out new file mode 100644 index 0000000000..1f853b9e93 --- /dev/null +++ b/tests/qemu-iotests/122.out @@ -0,0 +1,209 @@ +QA output created by 122 +Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=67108864 +wrote 67108864/67108864 bytes at offset 0 +64 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + +=== Check allocation status regression with -B === + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 backing_file='TEST_DIR/t.IMGFMT.base' +wrote 3145728/3145728 bytes at offset 0 +3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +Offset Length File +0 0x300000 TEST_DIR/t.IMGFMT.orig +0x300000 0x3d00000 TEST_DIR/t.IMGFMT.base + +=== Check that zero clusters are kept in overlay === + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 backing_file='TEST_DIR/t.IMGFMT.base' +wrote 3145728/3145728 bytes at offset 0 +3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 3145728/3145728 bytes at offset 0 +3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 3145728/3145728 bytes at offset 0 +3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 3145728/3145728 bytes at offset 0 +3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 3145728/3145728 bytes at offset 0 +3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 3145728/3145728 bytes at offset 0 +3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + +=== Concatenate multiple source images === + +Formatting 'TEST_DIR/t.IMGFMT.1', fmt=IMGFMT size=4194304 +Formatting 'TEST_DIR/t.IMGFMT.2', fmt=IMGFMT size=4194304 +Formatting 'TEST_DIR/t.IMGFMT.3', fmt=IMGFMT size=4194304 +wrote 65536/65536 bytes at offset 0 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 65536/65536 bytes at offset 0 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 65536/65536 bytes at offset 0 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +Offset Length File +0 0x10000 TEST_DIR/t.IMGFMT +0x400000 0x10000 TEST_DIR/t.IMGFMT +0x800000 0x10000 TEST_DIR/t.IMGFMT +read 65536/65536 bytes at offset 0 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec 
and XXX ops/sec) +read 65536/65536 bytes at offset 4194304 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 65536/65536 bytes at offset 8388608 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +[{ "start": 0, "length": 65536, "depth": 0, "zero": false, "data": true}, +{ "start": 65536, "length": 4128768, "depth": 0, "zero": true, "data": false}, +{ "start": 4194304, "length": 65536, "depth": 0, "zero": false, "data": true}, +{ "start": 4259840, "length": 4128768, "depth": 0, "zero": true, "data": false}, +{ "start": 8388608, "length": 65536, "depth": 0, "zero": false, "data": true}, +{ "start": 8454144, "length": 4128768, "depth": 0, "zero": true, "data": false}] +read 65536/65536 bytes at offset 0 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 65536/65536 bytes at offset 4194304 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 65536/65536 bytes at offset 8388608 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +qemu-img: -B makes no sense when concatenating multiple input images +qemu-img: -B makes no sense when concatenating multiple input images + +=== Compression with misaligned allocations and image sizes === + +Formatting 'TEST_DIR/t.IMGFMT.1', fmt=IMGFMT size=1047552 +Formatting 'TEST_DIR/t.IMGFMT.2', fmt=IMGFMT size=1047552 +wrote 16384/16384 bytes at offset 16384 +16 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 133120/133120 bytes at offset 133120 +130 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 1024/1024 bytes at offset 1046528 +1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 1024/1024 bytes at offset 0 +1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +[{ "start": 0, "length": 65536, "depth": 0, "zero": false, "data": true}, +{ "start": 65536, "length": 65536, "depth": 0, "zero": true, "data": false}, +{ "start": 131072, "length": 196608, "depth": 0, "zero": false, "data": true}, +{ "start": 327680, "length": 655360, "depth": 0, "zero": true, "data": false}, +{ "start": 983040, "length": 65536, "depth": 0, "zero": false, "data": true}, +{ "start": 1048576, "length": 1046528, "depth": 0, "zero": true, "data": false}] +read 16384/16384 bytes at offset 0 +16 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 16384/16384 bytes at offset 16384 +16 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 100352/100352 bytes at offset 32768 +98 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 133120/133120 bytes at offset 133120 +130 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 780288/780288 bytes at offset 266240 +762 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 1024/1024 bytes at offset 1046528 +1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 1024/1024 bytes at offset 1047552 +1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 1046528/1046528 bytes at offset 1048576 +1022 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + +=== Full allocation with -S 0 === + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 +wrote 3145728/3145728 bytes at offset 0 +3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 3145728/3145728 bytes at offset 3145728 +3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + +convert -S 0: +read 3145728/3145728 bytes at offset 0 +3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 63963136/63963136 bytes at offset 3145728 +61 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +[{ "start": 0, "length": 6291456, "depth": 0, "zero": false, "data": true, 
"offset": 327680}, +{ "start": 6291456, "length": 60817408, "depth": 0, "zero": true, "data": false}] + +convert -c -S 0: +read 3145728/3145728 bytes at offset 0 +3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 63963136/63963136 bytes at offset 3145728 +61 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +[{ "start": 0, "length": 6291456, "depth": 0, "zero": false, "data": true}, +{ "start": 6291456, "length": 60817408, "depth": 0, "zero": true, "data": false}] +Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=67108864 +wrote 33554432/33554432 bytes at offset 0 +32 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 backing_file='TEST_DIR/t.IMGFMT.base' +wrote 3145728/3145728 bytes at offset 0 +3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + +convert -S 0 with source backing file: +read 3145728/3145728 bytes at offset 0 +3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 30408704/30408704 bytes at offset 3145728 +29 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 33554432/33554432 bytes at offset 33554432 +32 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +[{ "start": 0, "length": 67108864, "depth": 0, "zero": false, "data": true, "offset": 327680}] + +convert -c -S 0 with source backing file: +read 3145728/3145728 bytes at offset 0 +3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 30408704/30408704 bytes at offset 3145728 +29 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 33554432/33554432 bytes at offset 33554432 +32 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +[{ "start": 0, "length": 67108864, "depth": 0, "zero": false, "data": true}] + +convert -S 0 -B ... +read 3145728/3145728 bytes at offset 0 +3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 30408704/30408704 bytes at offset 3145728 +29 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 33554432/33554432 bytes at offset 33554432 +32 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +[{ "start": 0, "length": 67108864, "depth": 0, "zero": false, "data": true, "offset": 327680}] + +convert -c -S 0 -B ... 
+read 3145728/3145728 bytes at offset 0 +3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 30408704/30408704 bytes at offset 3145728 +29 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 33554432/33554432 bytes at offset 33554432 +32 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +[{ "start": 0, "length": 67108864, "depth": 0, "zero": false, "data": true}] + +=== Non-zero -S === + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 +wrote 65536/65536 bytes at offset 0 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 1024/1024 bytes at offset 0 +1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 1024/1024 bytes at offset 8192 +1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 1024/1024 bytes at offset 17408 +1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + +convert -S 4k +[{ "start": 0, "length": 1024, "depth": 0, "zero": false, "data": true, "offset": 8192}, +{ "start": 1024, "length": 7168, "depth": 0, "zero": true, "data": false}, +{ "start": 8192, "length": 1024, "depth": 0, "zero": false, "data": true, "offset": 9216}, +{ "start": 9216, "length": 8192, "depth": 0, "zero": true, "data": false}, +{ "start": 17408, "length": 1024, "depth": 0, "zero": false, "data": true, "offset": 10240}, +{ "start": 18432, "length": 67090432, "depth": 0, "zero": true, "data": false}] + +convert -c -S 4k +[{ "start": 0, "length": 1024, "depth": 0, "zero": false, "data": true}, +{ "start": 1024, "length": 7168, "depth": 0, "zero": true, "data": false}, +{ "start": 8192, "length": 1024, "depth": 0, "zero": false, "data": true}, +{ "start": 9216, "length": 8192, "depth": 0, "zero": true, "data": false}, +{ "start": 17408, "length": 1024, "depth": 0, "zero": false, "data": true}, +{ "start": 18432, "length": 67090432, "depth": 0, "zero": true, "data": false}] + +convert -S 8k +[{ "start": 0, "length": 9216, "depth": 0, "zero": false, "data": true, "offset": 8192}, +{ "start": 9216, "length": 8192, "depth": 0, "zero": true, "data": false}, +{ "start": 17408, "length": 1024, "depth": 0, "zero": false, "data": true, "offset": 17408}, +{ "start": 18432, "length": 67090432, "depth": 0, "zero": true, "data": false}] + +convert -c -S 8k +[{ "start": 0, "length": 1024, "depth": 0, "zero": false, "data": true}, +{ "start": 1024, "length": 7168, "depth": 0, "zero": true, "data": false}, +{ "start": 8192, "length": 1024, "depth": 0, "zero": false, "data": true}, +{ "start": 9216, "length": 8192, "depth": 0, "zero": true, "data": false}, +{ "start": 17408, "length": 1024, "depth": 0, "zero": false, "data": true}, +{ "start": 18432, "length": 67090432, "depth": 0, "zero": true, "data": false}] +*** done diff --git a/tests/qemu-iotests/124 b/tests/qemu-iotests/124 new file mode 100644 index 0000000000..3ee78cd1f1 --- /dev/null +++ b/tests/qemu-iotests/124 @@ -0,0 +1,363 @@ +#!/usr/bin/env python +# +# Tests for incremental drive-backup +# +# Copyright (C) 2015 John Snow for Red Hat, Inc. +# +# Based on 056. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +# + +import os +import iotests + + +def io_write_patterns(img, patterns): + for pattern in patterns: + iotests.qemu_io('-c', 'write -P%s %s %s' % pattern, img) + + +def try_remove(img): + try: + os.remove(img) + except OSError: + pass + + +class Bitmap: + def __init__(self, name, drive): + self.name = name + self.drive = drive + self.num = 0 + self.backups = list() + + def base_target(self): + return (self.drive['backup'], None) + + def new_target(self, num=None): + if num is None: + num = self.num + self.num = num + 1 + base = os.path.join(iotests.test_dir, + "%s.%s." % (self.drive['id'], self.name)) + suff = "%i.%s" % (num, self.drive['fmt']) + target = base + "inc" + suff + reference = base + "ref" + suff + self.backups.append((target, reference)) + return (target, reference) + + def last_target(self): + if self.backups: + return self.backups[-1] + return self.base_target() + + def del_target(self): + for image in self.backups.pop(): + try_remove(image) + self.num -= 1 + + def cleanup(self): + for backup in self.backups: + for image in backup: + try_remove(image) + + +class TestIncrementalBackup(iotests.QMPTestCase): + def setUp(self): + self.bitmaps = list() + self.files = list() + self.drives = list() + self.vm = iotests.VM() + self.err_img = os.path.join(iotests.test_dir, 'err.%s' % iotests.imgfmt) + + # Create a base image with a distinctive patterning + drive0 = self.add_node('drive0') + self.img_create(drive0['file'], drive0['fmt']) + self.vm.add_drive(drive0['file']) + io_write_patterns(drive0['file'], (('0x41', 0, 512), + ('0xd5', '1M', '32k'), + ('0xdc', '32M', '124k'))) + self.vm.launch() + + + def add_node(self, node_id, fmt=iotests.imgfmt, path=None, backup=None): + if path is None: + path = os.path.join(iotests.test_dir, '%s.%s' % (node_id, fmt)) + if backup is None: + backup = os.path.join(iotests.test_dir, + '%s.full.backup.%s' % (node_id, fmt)) + + self.drives.append({ + 'id': node_id, + 'file': path, + 'backup': backup, + 'fmt': fmt }) + return self.drives[-1] + + + def img_create(self, img, fmt=iotests.imgfmt, size='64M', + parent=None, parentFormat=None): + if parent: + if parentFormat is None: + parentFormat = fmt + iotests.qemu_img('create', '-f', fmt, img, size, + '-b', parent, '-F', parentFormat) + else: + iotests.qemu_img('create', '-f', fmt, img, size) + self.files.append(img) + + + def do_qmp_backup(self, error='Input/output error', **kwargs): + res = self.vm.qmp('drive-backup', **kwargs) + self.assert_qmp(res, 'return', {}) + + event = self.vm.event_wait(name="BLOCK_JOB_COMPLETED", + match={'data': {'device': kwargs['device']}}) + self.assertIsNotNone(event) + + try: + failure = self.dictpath(event, 'data/error') + except AssertionError: + # Backup succeeded. + self.assert_qmp(event, 'data/offset', event['data']['len']) + return True + else: + # Backup failed. 
+ self.assert_qmp(event, 'data/error', error) + return False + + + def create_anchor_backup(self, drive=None): + if drive is None: + drive = self.drives[-1] + res = self.do_qmp_backup(device=drive['id'], sync='full', + format=drive['fmt'], target=drive['backup']) + self.assertTrue(res) + self.files.append(drive['backup']) + return drive['backup'] + + + def make_reference_backup(self, bitmap=None): + if bitmap is None: + bitmap = self.bitmaps[-1] + _, reference = bitmap.last_target() + res = self.do_qmp_backup(device=bitmap.drive['id'], sync='full', + format=bitmap.drive['fmt'], target=reference) + self.assertTrue(res) + + + def add_bitmap(self, name, drive, **kwargs): + bitmap = Bitmap(name, drive) + self.bitmaps.append(bitmap) + result = self.vm.qmp('block-dirty-bitmap-add', node=drive['id'], + name=bitmap.name, **kwargs) + self.assert_qmp(result, 'return', {}) + return bitmap + + + def prepare_backup(self, bitmap=None, parent=None): + if bitmap is None: + bitmap = self.bitmaps[-1] + if parent is None: + parent, _ = bitmap.last_target() + + target, _ = bitmap.new_target() + self.img_create(target, bitmap.drive['fmt'], parent=parent) + return target + + + def create_incremental(self, bitmap=None, parent=None, + parentFormat=None, validate=True): + if bitmap is None: + bitmap = self.bitmaps[-1] + if parent is None: + parent, _ = bitmap.last_target() + + target = self.prepare_backup(bitmap, parent) + res = self.do_qmp_backup(device=bitmap.drive['id'], + sync='dirty-bitmap', bitmap=bitmap.name, + format=bitmap.drive['fmt'], target=target, + mode='existing') + if not res: + bitmap.del_target(); + self.assertFalse(validate) + else: + self.make_reference_backup(bitmap) + return res + + + def check_backups(self): + for bitmap in self.bitmaps: + for incremental, reference in bitmap.backups: + self.assertTrue(iotests.compare_images(incremental, reference)) + last = bitmap.last_target()[0] + self.assertTrue(iotests.compare_images(last, bitmap.drive['file'])) + + + def hmp_io_writes(self, drive, patterns): + for pattern in patterns: + self.vm.hmp_qemu_io(drive, 'write -P%s %s %s' % pattern) + self.vm.hmp_qemu_io(drive, 'flush') + + + def do_incremental_simple(self, **kwargs): + self.create_anchor_backup() + self.add_bitmap('bitmap0', self.drives[0], **kwargs) + + # Sanity: Create a "hollow" incremental backup + self.create_incremental() + # Three writes: One complete overwrite, one new segment, + # and one partial overlap. + self.hmp_io_writes(self.drives[0]['id'], (('0xab', 0, 512), + ('0xfe', '16M', '256k'), + ('0x64', '32736k', '64k'))) + self.create_incremental() + # Three more writes, one of each kind, like above + self.hmp_io_writes(self.drives[0]['id'], (('0x9a', 0, 512), + ('0x55', '8M', '352k'), + ('0x78', '15872k', '1M'))) + self.create_incremental() + self.vm.shutdown() + self.check_backups() + + + def test_incremental_simple(self): + ''' + Test: Create and verify three incremental backups. + + Create a bitmap and a full backup before VM execution begins, + then create a series of three incremental backups "during execution," + i.e.; after IO requests begin modifying the drive. + ''' + return self.do_incremental_simple() + + + def test_small_granularity(self): + ''' + Test: Create and verify backups made with a small granularity bitmap. + + Perform the same test as test_incremental_simple, but with a granularity + of only 32KiB instead of the present default of 64KiB. 
+ ''' + return self.do_incremental_simple(granularity=32768) + + + def test_large_granularity(self): + ''' + Test: Create and verify backups made with a large granularity bitmap. + + Perform the same test as test_incremental_simple, but with a granularity + of 128KiB instead of the present default of 64KiB. + ''' + return self.do_incremental_simple(granularity=131072) + + + def test_incremental_failure(self): + '''Test: Verify backups made after a failure are correct. + + Simulate a failure during an incremental backup block job, + emulate additional writes, then create another incremental backup + afterwards and verify that the backup created is correct. + ''' + + # Create a blkdebug interface to this img as 'drive1', + # but don't actually create a new image. + drive1 = self.add_node('drive1', self.drives[0]['fmt'], + path=self.drives[0]['file'], + backup=self.drives[0]['backup']) + result = self.vm.qmp('blockdev-add', options={ + 'id': drive1['id'], + 'driver': drive1['fmt'], + 'file': { + 'driver': 'blkdebug', + 'image': { + 'driver': 'file', + 'filename': drive1['file'] + }, + 'set-state': [{ + 'event': 'flush_to_disk', + 'state': 1, + 'new_state': 2 + }], + 'inject-error': [{ + 'event': 'read_aio', + 'errno': 5, + 'state': 2, + 'immediately': False, + 'once': True + }], + } + }) + self.assert_qmp(result, 'return', {}) + + self.create_anchor_backup(self.drives[0]) + self.add_bitmap('bitmap0', drive1) + # Note: at this point, during a normal execution, + # Assume that the VM resumes and begins issuing IO requests here. + + self.hmp_io_writes(drive1['id'], (('0xab', 0, 512), + ('0xfe', '16M', '256k'), + ('0x64', '32736k', '64k'))) + + result = self.create_incremental(validate=False) + self.assertFalse(result) + self.hmp_io_writes(drive1['id'], (('0x9a', 0, 512), + ('0x55', '8M', '352k'), + ('0x78', '15872k', '1M'))) + self.create_incremental() + self.vm.shutdown() + self.check_backups() + + + def test_sync_dirty_bitmap_missing(self): + self.assert_no_active_block_jobs() + self.files.append(self.err_img) + result = self.vm.qmp('drive-backup', device=self.drives[0]['id'], + sync='dirty-bitmap', format=self.drives[0]['fmt'], + target=self.err_img) + self.assert_qmp(result, 'error/class', 'GenericError') + + + def test_sync_dirty_bitmap_not_found(self): + self.assert_no_active_block_jobs() + self.files.append(self.err_img) + result = self.vm.qmp('drive-backup', device=self.drives[0]['id'], + sync='dirty-bitmap', bitmap='unknown', + format=self.drives[0]['fmt'], target=self.err_img) + self.assert_qmp(result, 'error/class', 'GenericError') + + + def test_sync_dirty_bitmap_bad_granularity(self): + ''' + Test: Test what happens if we provide an improper granularity. + + The granularity must always be a power of 2. + ''' + self.assert_no_active_block_jobs() + self.assertRaises(AssertionError, self.add_bitmap, + 'bitmap0', self.drives[0], + granularity=64000) + + + def tearDown(self): + self.vm.shutdown() + for bitmap in self.bitmaps: + bitmap.cleanup() + for filename in self.files: + try_remove(filename) + + +if __name__ == '__main__': + iotests.main(supported_fmts=['qcow2']) diff --git a/tests/qemu-iotests/124.out b/tests/qemu-iotests/124.out new file mode 100644 index 0000000000..2f7d3902f2 --- /dev/null +++ b/tests/qemu-iotests/124.out @@ -0,0 +1,5 @@ +....... 
+---------------------------------------------------------------------- +Ran 7 tests + +OK diff --git a/tests/qemu-iotests/129 b/tests/qemu-iotests/129 new file mode 100644 index 0000000000..9e87e1c8d9 --- /dev/null +++ b/tests/qemu-iotests/129 @@ -0,0 +1,86 @@ +#!/usr/bin/env python +# +# Tests that "bdrv_drain_all" doesn't drain block jobs +# +# Copyright (C) 2015 Red Hat, Inc. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +# + +import os +import iotests +import time + +class TestStopWithBlockJob(iotests.QMPTestCase): + test_img = os.path.join(iotests.test_dir, 'test.img') + target_img = os.path.join(iotests.test_dir, 'target.img') + base_img = os.path.join(iotests.test_dir, 'base.img') + + def setUp(self): + iotests.qemu_img('create', '-f', iotests.imgfmt, self.base_img, "1G") + iotests.qemu_img('create', '-f', iotests.imgfmt, self.test_img, "-b", self.base_img) + iotests.qemu_io('-f', iotests.imgfmt, '-c', 'write -P0x5d 1M 128M', self.test_img) + self.vm = iotests.VM().add_drive(self.test_img) + self.vm.launch() + + def tearDown(self): + params = {"device": "drive0", + "bps": 0, + "bps_rd": 0, + "bps_wr": 0, + "iops": 0, + "iops_rd": 0, + "iops_wr": 0, + } + result = self.vm.qmp("block_set_io_throttle", conv_keys=False, + **params) + self.vm.shutdown() + + def do_test_stop(self, cmd, **args): + """Test 'stop' while block job is running on a throttled drive. + The 'stop' command shouldn't drain the job""" + params = {"device": "drive0", + "bps": 1024, + "bps_rd": 0, + "bps_wr": 0, + "iops": 0, + "iops_rd": 0, + "iops_wr": 0, + } + result = self.vm.qmp("block_set_io_throttle", conv_keys=False, + **params) + self.assert_qmp(result, 'return', {}) + result = self.vm.qmp(cmd, **args) + self.assert_qmp(result, 'return', {}) + result = self.vm.qmp("stop") + self.assert_qmp(result, 'return', {}) + result = self.vm.qmp("query-block-jobs") + self.assert_qmp(result, 'return[0]/busy', True) + self.assert_qmp(result, 'return[0]/ready', False) + + def test_drive_mirror(self): + self.do_test_stop("drive-mirror", device="drive0", + target=self.target_img, + sync="full") + + def test_drive_backup(self): + self.do_test_stop("drive-backup", device="drive0", + target=self.target_img, + sync="full") + + def test_block_commit(self): + self.do_test_stop("block-commit", device="drive0") + +if __name__ == '__main__': + iotests.main(supported_fmts=["qcow2"]) diff --git a/tests/qemu-iotests/129.out b/tests/qemu-iotests/129.out new file mode 100644 index 0000000000..8d7e996700 --- /dev/null +++ b/tests/qemu-iotests/129.out @@ -0,0 +1,5 @@ +... 
+---------------------------------------------------------------------- +Ran 3 tests + +OK diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group index bcf25786ab..6ca3466ec5 100644 --- a/tests/qemu-iotests/group +++ b/tests/qemu-iotests/group @@ -122,6 +122,9 @@ 115 rw auto 116 rw auto quick 121 rw auto +122 rw auto 123 rw auto quick +124 rw auto backing 128 rw auto quick +129 rw auto quick 130 rw auto quick diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py index 14028540b3..e93e62387b 100644 --- a/tests/qemu-iotests/iotests.py +++ b/tests/qemu-iotests/iotests.py @@ -78,6 +78,23 @@ def create_image(name, size): i = i + 512 file.close() +# Test if 'match' is a recursive subset of 'event' +def event_match(event, match=None): + if match is None: + return True + + for key in match: + if key in event: + if isinstance(event[key], dict): + if not event_match(event[key], match[key]): + return False + elif event[key] != match[key]: + return False + else: + return False + + return True + class VM(object): '''A QEMU VM''' @@ -92,6 +109,7 @@ class VM(object): '-machine', 'accel=qtest', '-display', 'none', '-vga', 'none'] self._num_drives = 0 + self._events = [] # This can be used to add an unused monitor instance. def add_monitor_telnet(self, ip, port): @@ -202,14 +220,34 @@ class VM(object): def get_qmp_event(self, wait=False): '''Poll for one queued QMP events and return it''' + if len(self._events) > 0: + return self._events.pop(0) return self._qmp.pull_event(wait=wait) def get_qmp_events(self, wait=False): '''Poll for queued QMP events and return a list of dicts''' events = self._qmp.get_events(wait=wait) + events.extend(self._events) + del self._events[:] self._qmp.clear_events() return events + def event_wait(self, name='BLOCK_JOB_COMPLETED', timeout=60.0, match=None): + # Search cached events + for event in self._events: + if (event['event'] == name) and event_match(event, match): + self._events.remove(event) + return event + + # Poll for new events + while True: + event = self._qmp.pull_event(wait=timeout) + if (event['event'] == name) and event_match(event, match): + return event + self._events.append(event) + + return None + index_re = re.compile(r'([^\[]+)\[([^\]]+)\]') class QMPTestCase(unittest.TestCase): diff --git a/tests/test-aio.c b/tests/test-aio.c index a7cb5c9915..4b0cb45d31 100644 --- a/tests/test-aio.c +++ b/tests/test-aio.c @@ -107,6 +107,7 @@ static void test_notify(void) typedef struct { QemuMutex start_lock; + EventNotifier notifier; bool thread_acquired; } AcquireTestData; @@ -118,6 +119,8 @@ static void *test_acquire_thread(void *opaque) qemu_mutex_lock(&data->start_lock); qemu_mutex_unlock(&data->start_lock); + g_usleep(500000); + event_notifier_set(&data->notifier); aio_context_acquire(ctx); aio_context_release(ctx); @@ -126,20 +129,19 @@ static void *test_acquire_thread(void *opaque) return NULL; } -static void dummy_notifier_read(EventNotifier *unused) +static void dummy_notifier_read(EventNotifier *n) { - g_assert(false); /* should never be invoked */ + event_notifier_test_and_clear(n); } static void test_acquire(void) { QemuThread thread; - EventNotifier notifier; AcquireTestData data; /* Dummy event notifier ensures aio_poll() will block */ - event_notifier_init(¬ifier, false); - aio_set_event_notifier(ctx, ¬ifier, dummy_notifier_read); + event_notifier_init(&data.notifier, false); + aio_set_event_notifier(ctx, &data.notifier, dummy_notifier_read); g_assert(!aio_poll(ctx, false)); /* consume aio_notify() */ 
qemu_mutex_init(&data.start_lock); @@ -153,12 +155,13 @@ static void test_acquire(void) /* Block in aio_poll(), let other thread kick us and acquire context */ aio_context_acquire(ctx); qemu_mutex_unlock(&data.start_lock); /* let the thread run */ - g_assert(!aio_poll(ctx, true)); + g_assert(aio_poll(ctx, true)); + g_assert(!data.thread_acquired); aio_context_release(ctx); qemu_thread_join(&thread); - aio_set_event_notifier(ctx, ¬ifier, NULL); - event_notifier_cleanup(¬ifier); + aio_set_event_notifier(ctx, &data.notifier, NULL); + event_notifier_cleanup(&data.notifier); g_assert(data.thread_acquired); } diff --git a/tests/test-hbitmap.c b/tests/test-hbitmap.c index 8c902f2055..9f41b5fd2e 100644 --- a/tests/test-hbitmap.c +++ b/tests/test-hbitmap.c @@ -11,6 +11,8 @@ #include <glib.h> #include <stdarg.h> +#include <string.h> +#include <sys/types.h> #include "qemu/hbitmap.h" #define LOG_BITS_PER_LONG (BITS_PER_LONG == 32 ? 5 : 6) @@ -23,6 +25,7 @@ typedef struct TestHBitmapData { HBitmap *hb; unsigned long *bits; size_t size; + size_t old_size; int granularity; } TestHBitmapData; @@ -91,6 +94,44 @@ static void hbitmap_test_init(TestHBitmapData *data, } } +static inline size_t hbitmap_test_array_size(size_t bits) +{ + size_t n = (bits + BITS_PER_LONG - 1) / BITS_PER_LONG; + return n ? n : 1; +} + +static void hbitmap_test_truncate_impl(TestHBitmapData *data, + size_t size) +{ + size_t n; + size_t m; + data->old_size = data->size; + data->size = size; + + if (data->size == data->old_size) { + return; + } + + n = hbitmap_test_array_size(size); + m = hbitmap_test_array_size(data->old_size); + data->bits = g_realloc(data->bits, sizeof(unsigned long) * n); + if (n > m) { + memset(&data->bits[m], 0x00, sizeof(unsigned long) * (n - m)); + } + + /* If we shrink to an uneven multiple of sizeof(unsigned long), + * scrub the leftover memory. */ + if (data->size < data->old_size) { + m = size % (sizeof(unsigned long) * 8); + if (m) { + unsigned long mask = (1ULL << m) - 1; + data->bits[n-1] &= mask; + } + } + + hbitmap_truncate(data->hb, size); +} + static void hbitmap_test_teardown(TestHBitmapData *data, const void *unused) { @@ -369,6 +410,198 @@ static void test_hbitmap_iter_granularity(TestHBitmapData *data, g_assert_cmpint(hbitmap_iter_next(&hbi), <, 0); } +static void hbitmap_test_set_boundary_bits(TestHBitmapData *data, ssize_t diff) +{ + size_t size = data->size; + + /* First bit */ + hbitmap_test_set(data, 0, 1); + if (diff < 0) { + /* Last bit in new, shortened map */ + hbitmap_test_set(data, size + diff - 1, 1); + + /* First bit to be truncated away */ + hbitmap_test_set(data, size + diff, 1); + } + /* Last bit */ + hbitmap_test_set(data, size - 1, 1); + if (data->granularity == 0) { + hbitmap_test_check_get(data); + } +} + +static void hbitmap_test_check_boundary_bits(TestHBitmapData *data) +{ + size_t size = MIN(data->size, data->old_size); + + if (data->granularity == 0) { + hbitmap_test_check_get(data); + hbitmap_test_check(data, 0); + } else { + /* If a granularity was set, note that every distinct + * (bit >> granularity) value that was set will increase + * the bit pop count by 2^granularity, not just 1. + * + * The hbitmap_test_check facility does not currently tolerate + * non-zero granularities, so test the boundaries and the population + * count manually. + */ + g_assert(hbitmap_get(data->hb, 0)); + g_assert(hbitmap_get(data->hb, size - 1)); + g_assert_cmpint(2 << data->granularity, ==, hbitmap_count(data->hb)); + } +} + +/* Generic truncate test. 
*/ +static void hbitmap_test_truncate(TestHBitmapData *data, + size_t size, + ssize_t diff, + int granularity) +{ + hbitmap_test_init(data, size, granularity); + hbitmap_test_set_boundary_bits(data, diff); + hbitmap_test_truncate_impl(data, size + diff); + hbitmap_test_check_boundary_bits(data); +} + +static void test_hbitmap_truncate_nop(TestHBitmapData *data, + const void *unused) +{ + hbitmap_test_truncate(data, L2, 0, 0); +} + +/** + * Grow by an amount smaller than the granularity, without crossing + * a granularity alignment boundary. Effectively a NOP. + */ +static void test_hbitmap_truncate_grow_negligible(TestHBitmapData *data, + const void *unused) +{ + size_t size = L2 - 1; + size_t diff = 1; + int granularity = 1; + + hbitmap_test_truncate(data, size, diff, granularity); +} + +/** + * Shrink by an amount smaller than the granularity, without crossing + * a granularity alignment boundary. Effectively a NOP. + */ +static void test_hbitmap_truncate_shrink_negligible(TestHBitmapData *data, + const void *unused) +{ + size_t size = L2; + ssize_t diff = -1; + int granularity = 1; + + hbitmap_test_truncate(data, size, diff, granularity); +} + +/** + * Grow by an amount smaller than the granularity, but crossing over + * a granularity alignment boundary. + */ +static void test_hbitmap_truncate_grow_tiny(TestHBitmapData *data, + const void *unused) +{ + size_t size = L2 - 2; + ssize_t diff = 1; + int granularity = 1; + + hbitmap_test_truncate(data, size, diff, granularity); +} + +/** + * Shrink by an amount smaller than the granularity, but crossing over + * a granularity alignment boundary. + */ +static void test_hbitmap_truncate_shrink_tiny(TestHBitmapData *data, + const void *unused) +{ + size_t size = L2 - 1; + ssize_t diff = -1; + int granularity = 1; + + hbitmap_test_truncate(data, size, diff, granularity); +} + +/** + * Grow by an amount smaller than sizeof(long), and not crossing over + * a sizeof(long) alignment boundary. + */ +static void test_hbitmap_truncate_grow_small(TestHBitmapData *data, + const void *unused) +{ + size_t size = L2 + 1; + size_t diff = sizeof(long) / 2; + + hbitmap_test_truncate(data, size, diff, 0); +} + +/** + * Shrink by an amount smaller than sizeof(long), and not crossing over + * a sizeof(long) alignment boundary. + */ +static void test_hbitmap_truncate_shrink_small(TestHBitmapData *data, + const void *unused) +{ + size_t size = L2; + size_t diff = sizeof(long) / 2; + + hbitmap_test_truncate(data, size, -diff, 0); +} + +/** + * Grow by an amount smaller than sizeof(long), while crossing over + * a sizeof(long) alignment boundary. + */ +static void test_hbitmap_truncate_grow_medium(TestHBitmapData *data, + const void *unused) +{ + size_t size = L2 - 1; + size_t diff = sizeof(long) / 2; + + hbitmap_test_truncate(data, size, diff, 0); +} + +/** + * Shrink by an amount smaller than sizeof(long), while crossing over + * a sizeof(long) alignment boundary. + */ +static void test_hbitmap_truncate_shrink_medium(TestHBitmapData *data, + const void *unused) +{ + size_t size = L2 + 1; + size_t diff = sizeof(long) / 2; + + hbitmap_test_truncate(data, size, -diff, 0); +} + +/** + * Grow by an amount larger than sizeof(long). + */ +static void test_hbitmap_truncate_grow_large(TestHBitmapData *data, + const void *unused) +{ + size_t size = L2; + size_t diff = 8 * sizeof(long); + + hbitmap_test_truncate(data, size, diff, 0); +} + +/** + * Shrink by an amount larger than sizeof(long). 
+ */ +static void test_hbitmap_truncate_shrink_large(TestHBitmapData *data, + const void *unused) +{ + size_t size = L2; + size_t diff = 8 * sizeof(long); + + hbitmap_test_truncate(data, size, -diff, 0); +} + static void hbitmap_test_add(const char *testpath, void (*test_func)(TestHBitmapData *data, const void *user_data)) { @@ -395,6 +628,28 @@ int main(int argc, char **argv) hbitmap_test_add("/hbitmap/reset/empty", test_hbitmap_reset_empty); hbitmap_test_add("/hbitmap/reset/general", test_hbitmap_reset); hbitmap_test_add("/hbitmap/granularity", test_hbitmap_granularity); + + hbitmap_test_add("/hbitmap/truncate/nop", test_hbitmap_truncate_nop); + hbitmap_test_add("/hbitmap/truncate/grow/negligible", + test_hbitmap_truncate_grow_negligible); + hbitmap_test_add("/hbitmap/truncate/shrink/negligible", + test_hbitmap_truncate_shrink_negligible); + hbitmap_test_add("/hbitmap/truncate/grow/tiny", + test_hbitmap_truncate_grow_tiny); + hbitmap_test_add("/hbitmap/truncate/shrink/tiny", + test_hbitmap_truncate_shrink_tiny); + hbitmap_test_add("/hbitmap/truncate/grow/small", + test_hbitmap_truncate_grow_small); + hbitmap_test_add("/hbitmap/truncate/shrink/small", + test_hbitmap_truncate_shrink_small); + hbitmap_test_add("/hbitmap/truncate/grow/medium", + test_hbitmap_truncate_grow_medium); + hbitmap_test_add("/hbitmap/truncate/shrink/medium", + test_hbitmap_truncate_shrink_medium); + hbitmap_test_add("/hbitmap/truncate/grow/large", + test_hbitmap_truncate_grow_large); + hbitmap_test_add("/hbitmap/truncate/shrink/large", + test_hbitmap_truncate_shrink_large); g_test_run(); return 0; diff --git a/thread-pool.c b/thread-pool.c index e2cac8e4ff..ac909f4986 100644 --- a/thread-pool.c +++ b/thread-pool.c @@ -170,12 +170,12 @@ restart: if (elem->state != THREAD_DONE) { continue; } - if (elem->state == THREAD_DONE) { - trace_thread_pool_complete(pool, elem, elem->common.opaque, - elem->ret); - } - if (elem->state == THREAD_DONE && elem->common.cb) { - QLIST_REMOVE(elem, all); + + trace_thread_pool_complete(pool, elem, elem->common.opaque, + elem->ret); + QLIST_REMOVE(elem, all); + + if (elem->common.cb) { /* Read state before ret. */ smp_rmb(); @@ -188,8 +188,6 @@ restart: qemu_aio_unref(elem); goto restart; } else { - /* remove the request */ - QLIST_REMOVE(elem, all); qemu_aio_unref(elem); } } diff --git a/util/hbitmap.c b/util/hbitmap.c index ab139717f5..a10c7aeeda 100644 --- a/util/hbitmap.c +++ b/util/hbitmap.c @@ -90,6 +90,9 @@ struct HBitmap { * bitmap will still allocate HBITMAP_LEVELS arrays. */ unsigned long *levels[HBITMAP_LEVELS]; + + /* The length of each levels[] array. */ + uint64_t sizes[HBITMAP_LEVELS]; }; /* Advance hbi to the next nonzero word and return it. hbi->pos @@ -384,6 +387,7 @@ HBitmap *hbitmap_alloc(uint64_t size, int granularity) hb->granularity = granularity; for (i = HBITMAP_LEVELS; i-- > 0; ) { size = MAX((size + BITS_PER_LONG - 1) >> BITS_PER_LEVEL, 1); + hb->sizes[i] = size; hb->levels[i] = g_new0(unsigned long, size); } @@ -395,3 +399,84 @@ HBitmap *hbitmap_alloc(uint64_t size, int granularity) hb->levels[0][0] |= 1UL << (BITS_PER_LONG - 1); return hb; } + +void hbitmap_truncate(HBitmap *hb, uint64_t size) +{ + bool shrink; + unsigned i; + uint64_t num_elements = size; + uint64_t old; + + /* Size comes in as logical elements, adjust for granularity. */ + size = (size + (1ULL << hb->granularity) - 1) >> hb->granularity; + assert(size <= ((uint64_t)1 << HBITMAP_LOG_MAX_SIZE)); + shrink = size < hb->size; + + /* bit sizes are identical; nothing to do. 
*/ + if (size == hb->size) { + return; + } + + /* If we're losing bits, let's clear those bits before we invalidate all of + * our invariants. This helps keep the bitcount consistent, and will prevent + * us from carrying around garbage bits beyond the end of the map. + */ + if (shrink) { + /* Don't clear partial granularity groups; + * start at the first full one. */ + uint64_t start = QEMU_ALIGN_UP(num_elements, 1 << hb->granularity); + uint64_t fix_count = (hb->size << hb->granularity) - start; + + assert(fix_count); + hbitmap_reset(hb, start, fix_count); + } + + hb->size = size; + for (i = HBITMAP_LEVELS; i-- > 0; ) { + size = MAX(BITS_TO_LONGS(size), 1); + if (hb->sizes[i] == size) { + break; + } + old = hb->sizes[i]; + hb->sizes[i] = size; + hb->levels[i] = g_realloc(hb->levels[i], size * sizeof(unsigned long)); + if (!shrink) { + memset(&hb->levels[i][old], 0x00, + (size - old) * sizeof(*hb->levels[i])); + } + } +} + + +/** + * Given HBitmaps A and B, let A := A (BITOR) B. + * Bitmap B will not be modified. + * + * @return true if the merge was successful, + * false if it was not attempted. + */ +bool hbitmap_merge(HBitmap *a, const HBitmap *b) +{ + int i; + uint64_t j; + + if ((a->size != b->size) || (a->granularity != b->granularity)) { + return false; + } + + if (hbitmap_count(b) == 0) { + return true; + } + + /* This merge is O(size), as BITS_PER_LONG and HBITMAP_LEVELS are constant. + * It may be possible to improve running times for sparsely populated maps + * by using hbitmap_iter_next, but this is suboptimal for dense maps. + */ + for (i = HBITMAP_LEVELS - 1; i >= 0; i--) { + for (j = 0; j < a->sizes[i]; j++) { + a->levels[i][j] |= b->levels[i][j]; + } + } + + return true; +} |
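Editor's note on the hbitmap_merge() hunk above: the function refuses to merge bitmaps whose size or granularity differ, and otherwise ORs every level array of B into A, leaving B untouched. The following is a minimal usage sketch, not part of the patch; it relies on hbitmap_alloc(), hbitmap_count() and hbitmap_merge() as they appear in util/hbitmap.c in this diff, and it assumes a setter named hbitmap_set(start, count) with the obvious semantics (that call is not shown in this series).

    #include <assert.h>
    #include "qemu/hbitmap.h"

    /* Illustrative only: merge two bitmaps created with identical size
     * and granularity, then check the combined population count. */
    static void merge_example(void)
    {
        HBitmap *a = hbitmap_alloc(1 << 20, 0);   /* 1M bits, granularity 0 */
        HBitmap *b = hbitmap_alloc(1 << 20, 0);

        hbitmap_set(a, 0, 64);        /* assumed API: mark bits [0, 64) in A      */
        hbitmap_set(b, 4096, 128);    /* assumed API: mark bits [4096, 4224) in B */

        if (hbitmap_merge(a, b)) {    /* A := A | B; B is not modified */
            /* Ranges are disjoint, so the counts simply add up. */
            assert(hbitmap_count(a) == 64 + 128);
        }
    }

Because the merge walks every word of every level, it is O(size) regardless of how sparse B is, which matches the comment in the hunk; callers that expect failure (mismatched geometry) should check the boolean return rather than assume the merge happened.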