37 files changed, 1155 insertions, 1361 deletions
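The series below moves I/O throttling state out of BlockDriverState and into BlockBackendPublic, teaches BdrvChildRole new callbacks (change_media, resize, get_name, drained_begin/end) so block.c can notify parents without the old bs->blk back-pointer, and replaces the bs-chaining bdrv_next(bs) with an opaque BdrvNextIterator. As a rough sketch (not part of the patch itself), callers that walk the top-level BlockDriverStates now follow the pattern used by the updated call sites in block.c, block/io.c and block/snapshot.c:

    /* Sketch of the new iteration pattern introduced by this series;
     * bdrv_next() allocates the iterator on the first call (it == NULL)
     * and returns NULL once all top-level nodes have been visited. */
    BlockDriverState *bs;
    BdrvNextIterator *it = NULL;

    while ((it = bdrv_next(it, &bs)) != NULL) {
        AioContext *ctx = bdrv_get_aio_context(bs);

        aio_context_acquire(ctx);
        /* ... operate on the top-level bs ... */
        aio_context_release(ctx);
    }

The iterator first yields the root nodes of all BlockBackends (each BDS only once, via its first attached BB) and then the remaining monitor-owned nodes, as implemented in block/block-backend.c below.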
@@ -38,7 +38,6 @@ #include "qmp-commands.h" #include "qemu/timer.h" #include "qapi-event.h" -#include "block/throttle-groups.h" #include "qemu/cutils.h" #include "qemu/id.h" @@ -237,8 +236,6 @@ BlockDriverState *bdrv_new(void) QLIST_INIT(&bs->op_blockers[i]); } notifier_with_return_list_init(&bs->before_write_notifiers); - qemu_co_queue_init(&bs->throttled_reqs[0]); - qemu_co_queue_init(&bs->throttled_reqs[1]); bs->refcnt = 1; bs->aio_context = qemu_get_aio_context(); @@ -1217,6 +1214,27 @@ void bdrv_unref_child(BlockDriverState *parent, BdrvChild *child) bdrv_root_unref_child(child); } + +static void bdrv_parent_cb_change_media(BlockDriverState *bs, bool load) +{ + BdrvChild *c; + QLIST_FOREACH(c, &bs->parents, next_parent) { + if (c->role->change_media) { + c->role->change_media(c, load); + } + } +} + +static void bdrv_parent_cb_resize(BlockDriverState *bs) +{ + BdrvChild *c; + QLIST_FOREACH(c, &bs->parents, next_parent) { + if (c->role->resize) { + c->role->resize(c); + } + } +} + /* * Sets the backing file link of a BDS. A new reference is created; callers * which don't need their own reference any more must call bdrv_unref(). @@ -1525,12 +1543,6 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename, return -ENODEV; } - if (bs->throttle_state) { - error_setg(errp, "Cannot reference an existing block device for " - "which I/O throttling is enabled"); - return -EINVAL; - } - bdrv_ref(bs); *pbs = bs; return 0; @@ -1682,9 +1694,7 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename, } if (!bdrv_key_required(bs)) { - if (bs->blk) { - blk_dev_change_media_cb(bs->blk, true); - } + bdrv_parent_cb_change_media(bs, true); } else if (!runstate_check(RUN_STATE_PRELAUNCH) && !runstate_check(RUN_STATE_INMIGRATE) && !runstate_check(RUN_STATE_PAUSED)) { /* HACK */ @@ -2123,11 +2133,6 @@ static void bdrv_close(BlockDriverState *bs) assert(!bs->job); - /* Disable I/O limits and drain all pending throttled requests */ - if (bs->throttle_state) { - bdrv_io_limits_disable(bs); - } - bdrv_drained_begin(bs); /* complete I/O */ bdrv_flush(bs); bdrv_drain(bs); /* in case flush left pending I/O */ @@ -2135,9 +2140,7 @@ static void bdrv_close(BlockDriverState *bs) bdrv_release_named_dirty_bitmaps(bs); assert(QLIST_EMPTY(&bs->dirty_bitmaps)); - if (bs->blk) { - blk_dev_change_media_cb(bs->blk, false); - } + bdrv_parent_cb_change_media(bs, false); if (bs->drv) { BdrvChild *child, *next; @@ -2218,26 +2221,11 @@ void bdrv_close_all(void) } } -/* Fields that need to stay with the top-level BDS */ -static void bdrv_move_feature_fields(BlockDriverState *bs_dest, - BlockDriverState *bs_src) -{ - /* move some fields that need to stay attached to the device */ -} - static void change_parent_backing_link(BlockDriverState *from, BlockDriverState *to) { BdrvChild *c, *next; - if (from->blk) { - /* FIXME We bypass blk_set_bs(), so we need to make these updates - * manually. The root problem is not in this change function, but the - * existence of BlockDriverState.blk. 
*/ - to->blk = from->blk; - from->blk = NULL; - } - QLIST_FOREACH_SAFE(c, &from->parents, next_parent, next) { assert(c->role != &child_backing); c->bs = to; @@ -2248,22 +2236,6 @@ static void change_parent_backing_link(BlockDriverState *from, } } -static void swap_feature_fields(BlockDriverState *bs_top, - BlockDriverState *bs_new) -{ - BlockDriverState tmp; - - bdrv_move_feature_fields(&tmp, bs_top); - bdrv_move_feature_fields(bs_top, bs_new); - bdrv_move_feature_fields(bs_new, &tmp); - - assert(!bs_new->throttle_state); - if (bs_top->throttle_state) { - bdrv_io_limits_enable(bs_new, throttle_group_get_name(bs_top)); - bdrv_io_limits_disable(bs_top); - } -} - /* * Add new bs contents at the top of an image chain while the chain is * live, while keeping required fields on the top layer. @@ -2286,11 +2258,8 @@ void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top) assert(!bdrv_requests_pending(bs_new)); bdrv_ref(bs_top); - change_parent_backing_link(bs_top, bs_new); - - /* Some fields always stay on top of the backing file chain */ - swap_feature_fields(bs_top, bs_new); + change_parent_backing_link(bs_top, bs_new); bdrv_set_backing_hd(bs_new, bs_top); bdrv_unref(bs_top); @@ -2306,16 +2275,6 @@ void bdrv_replace_in_backing_chain(BlockDriverState *old, BlockDriverState *new) bdrv_ref(old); - if (old->blk) { - /* As long as these fields aren't in BlockBackend, but in the top-level - * BlockDriverState, it's not possible for a BDS to have two BBs. - * - * We really want to copy the fields from old to new, but we go for a - * swap instead so that pointers aren't duplicated and cause trouble. - * (Also, bdrv_swap() used to do the same.) */ - assert(!new->blk); - swap_feature_fields(old, new); - } change_parent_backing_link(old, new); /* Change backing files if a previously independent node is added to the @@ -2624,9 +2583,7 @@ int bdrv_truncate(BlockDriverState *bs, int64_t offset) if (ret == 0) { ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS); bdrv_dirty_bitmap_truncate(bs); - if (bs->blk) { - blk_dev_resize_cb(bs->blk); - } + bdrv_parent_cb_resize(bs); } return ret; } @@ -2736,11 +2693,9 @@ int bdrv_set_key(BlockDriverState *bs, const char *key) if (ret < 0) { bs->valid_key = 0; } else if (!bs->valid_key) { + /* call the change callback now, we skipped it on open */ bs->valid_key = 1; - if (bs->blk) { - /* call the change callback now, we skipped it on open */ - blk_dev_change_media_cb(bs->blk, true); - } + bdrv_parent_cb_change_media(bs, true); } return ret; } @@ -2907,34 +2862,33 @@ BlockDriverState *bdrv_next_node(BlockDriverState *bs) return QTAILQ_NEXT(bs, node_list); } -/* Iterates over all top-level BlockDriverStates, i.e. BDSs that are owned by - * the monitor or attached to a BlockBackend */ -BlockDriverState *bdrv_next(BlockDriverState *bs) +const char *bdrv_get_node_name(const BlockDriverState *bs) { - if (!bs || bs->blk) { - bs = blk_next_root_bs(bs); - if (bs) { - return bs; - } - } - - /* Ignore all BDSs that are attached to a BlockBackend here; they have been - * handled by the above block already */ - do { - bs = bdrv_next_monitor_owned(bs); - } while (bs && bs->blk); - return bs; + return bs->node_name; } -const char *bdrv_get_node_name(const BlockDriverState *bs) +const char *bdrv_get_parent_name(const BlockDriverState *bs) { - return bs->node_name; + BdrvChild *c; + const char *name; + + /* If multiple parents have a name, just pick the first one. 
*/ + QLIST_FOREACH(c, &bs->parents, next_parent) { + if (c->role->get_name) { + name = c->role->get_name(c); + if (name && *name) { + return name; + } + } + } + + return NULL; } /* TODO check what callers really want: bs->node_name or blk_name() */ const char *bdrv_get_device_name(const BlockDriverState *bs) { - return bs->blk ? blk_name(bs->blk) : ""; + return bdrv_get_parent_name(bs) ?: ""; } /* This can be used to identify nodes that might not have a device @@ -2943,7 +2897,7 @@ const char *bdrv_get_device_name(const BlockDriverState *bs) * absent, then this returns an empty (non-null) string. */ const char *bdrv_get_device_or_node_name(const BlockDriverState *bs) { - return bs->blk ? blk_name(bs->blk) : bs->node_name; + return bdrv_get_parent_name(bs) ?: bs->node_name; } int bdrv_get_flags(BlockDriverState *bs) @@ -3239,10 +3193,11 @@ void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp) void bdrv_invalidate_cache_all(Error **errp) { - BlockDriverState *bs = NULL; + BlockDriverState *bs; Error *local_err = NULL; + BdrvNextIterator *it = NULL; - while ((bs = bdrv_next(bs)) != NULL) { + while ((it = bdrv_next(it, &bs)) != NULL) { AioContext *aio_context = bdrv_get_aio_context(bs); aio_context_acquire(aio_context); @@ -3284,10 +3239,11 @@ static int bdrv_inactivate_recurse(BlockDriverState *bs, int bdrv_inactivate_all(void) { BlockDriverState *bs = NULL; + BdrvNextIterator *it = NULL; int ret = 0; int pass; - while ((bs = bdrv_next(bs)) != NULL) { + while ((it = bdrv_next(it, &bs)) != NULL) { aio_context_acquire(bdrv_get_aio_context(bs)); } @@ -3296,8 +3252,8 @@ int bdrv_inactivate_all(void) * the second pass sets the BDRV_O_INACTIVE flag so that no further write * is allowed. */ for (pass = 0; pass < 2; pass++) { - bs = NULL; - while ((bs = bdrv_next(bs)) != NULL) { + it = NULL; + while ((it = bdrv_next(it, &bs)) != NULL) { ret = bdrv_inactivate_recurse(bs, pass); if (ret < 0) { goto out; @@ -3306,8 +3262,8 @@ int bdrv_inactivate_all(void) } out: - bs = NULL; - while ((bs = bdrv_next(bs)) != NULL) { + it = NULL; + while ((it = bdrv_next(it, &bs)) != NULL) { aio_context_release(bdrv_get_aio_context(bs)); } @@ -3653,6 +3609,7 @@ AioContext *bdrv_get_aio_context(BlockDriverState *bs) void bdrv_detach_aio_context(BlockDriverState *bs) { BdrvAioNotifier *baf; + BdrvChild *child; if (!bs->drv) { return; @@ -3662,17 +3619,11 @@ void bdrv_detach_aio_context(BlockDriverState *bs) baf->detach_aio_context(baf->opaque); } - if (bs->throttle_state) { - throttle_timers_detach_aio_context(&bs->throttle_timers); - } if (bs->drv->bdrv_detach_aio_context) { bs->drv->bdrv_detach_aio_context(bs); } - if (bs->file) { - bdrv_detach_aio_context(bs->file->bs); - } - if (bs->backing) { - bdrv_detach_aio_context(bs->backing->bs); + QLIST_FOREACH(child, &bs->children, next) { + bdrv_detach_aio_context(child->bs); } bs->aio_context = NULL; @@ -3682,6 +3633,7 @@ void bdrv_attach_aio_context(BlockDriverState *bs, AioContext *new_context) { BdrvAioNotifier *ban; + BdrvChild *child; if (!bs->drv) { return; @@ -3689,18 +3641,12 @@ void bdrv_attach_aio_context(BlockDriverState *bs, bs->aio_context = new_context; - if (bs->backing) { - bdrv_attach_aio_context(bs->backing->bs, new_context); - } - if (bs->file) { - bdrv_attach_aio_context(bs->file->bs, new_context); + QLIST_FOREACH(child, &bs->children, next) { + bdrv_attach_aio_context(child->bs, new_context); } if (bs->drv->bdrv_attach_aio_context) { bs->drv->bdrv_attach_aio_context(bs, new_context); } - if (bs->throttle_state) { - 
throttle_timers_attach_aio_context(&bs->throttle_timers, new_context); - } QLIST_FOREACH(ban, &bs->aio_notifiers, list) { ban->attached_aio_context(new_context, ban->opaque); @@ -3806,10 +3752,11 @@ bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs, */ bool bdrv_is_first_non_filter(BlockDriverState *candidate) { - BlockDriverState *bs = NULL; + BlockDriverState *bs; + BdrvNextIterator *it = NULL; /* walk down the bs forest recursively */ - while ((bs = bdrv_next(bs)) != NULL) { + while ((it = bdrv_next(it, &bs)) != NULL) { bool perm; /* try to recurse in this top level bs */ diff --git a/block/backup.c b/block/backup.c index 491fd14068..fec45e8212 100644 --- a/block/backup.c +++ b/block/backup.c @@ -218,15 +218,6 @@ static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp) ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME); } -static void backup_iostatus_reset(BlockJob *job) -{ - BackupBlockJob *s = container_of(job, BackupBlockJob, common); - - if (s->target->blk) { - blk_iostatus_reset(s->target->blk); - } -} - static void backup_cleanup_sync_bitmap(BackupBlockJob *job, int ret) { BdrvDirtyBitmap *bm; @@ -263,7 +254,6 @@ static const BlockJobDriver backup_job_driver = { .instance_size = sizeof(BackupBlockJob), .job_type = BLOCK_JOB_TYPE_BACKUP, .set_speed = backup_set_speed, - .iostatus_reset = backup_iostatus_reset, .commit = backup_commit, .abort = backup_abort, }; @@ -272,11 +262,11 @@ static BlockErrorAction backup_error_action(BackupBlockJob *job, bool read, int error) { if (read) { - return block_job_error_action(&job->common, job->common.bs, - job->on_source_error, true, error); + return block_job_error_action(&job->common, job->on_source_error, + true, error); } else { - return block_job_error_action(&job->common, job->target, - job->on_target_error, false, error); + return block_job_error_action(&job->common, job->on_target_error, + false, error); } } @@ -388,7 +378,6 @@ static void coroutine_fn backup_run(void *opaque) BackupCompleteData *data; BlockDriverState *bs = job->common.bs; BlockDriverState *target = job->target; - BlockdevOnError on_target_error = job->on_target_error; NotifierWithReturn before_write = { .notify = backup_before_write_notify, }; @@ -404,11 +393,6 @@ static void coroutine_fn backup_run(void *opaque) job->done_bitmap = bitmap_new(end); - if (target->blk) { - blk_set_on_error(target->blk, on_target_error, on_target_error); - blk_iostatus_enable(target->blk); - } - bdrv_add_before_write_notifier(bs, &before_write); if (job->sync_mode == MIRROR_SYNC_MODE_NONE) { @@ -484,9 +468,6 @@ static void coroutine_fn backup_run(void *opaque) qemu_co_rwlock_unlock(&job->flush_rwlock); g_free(job->done_bitmap); - if (target->blk) { - blk_iostatus_disable(target->blk); - } bdrv_op_unblock_all(target, job->common.blocker); data = g_malloc(sizeof(*data)); @@ -515,13 +496,6 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target, return; } - if ((on_source_error == BLOCKDEV_ON_ERROR_STOP || - on_source_error == BLOCKDEV_ON_ERROR_ENOSPC) && - (!bs->blk || !blk_iostatus_is_enabled(bs->blk))) { - error_setg(errp, QERR_INVALID_PARAMETER, "on-source-error"); - return; - } - if (!bdrv_is_inserted(bs)) { error_setg(errp, "Device is not inserted: %s", bdrv_get_device_name(bs)); diff --git a/block/blkverify.c b/block/blkverify.c index 9414b7a84e..4045396a3d 100644 --- a/block/blkverify.c +++ b/block/blkverify.c @@ -293,22 +293,6 @@ static bool blkverify_recurse_is_first_non_filter(BlockDriverState *bs, return 
bdrv_recurse_is_first_non_filter(s->test_file->bs, candidate); } -/* Propagate AioContext changes to ->test_file */ -static void blkverify_detach_aio_context(BlockDriverState *bs) -{ - BDRVBlkverifyState *s = bs->opaque; - - bdrv_detach_aio_context(s->test_file->bs); -} - -static void blkverify_attach_aio_context(BlockDriverState *bs, - AioContext *new_context) -{ - BDRVBlkverifyState *s = bs->opaque; - - bdrv_attach_aio_context(s->test_file->bs, new_context); -} - static void blkverify_refresh_filename(BlockDriverState *bs, QDict *options) { BDRVBlkverifyState *s = bs->opaque; @@ -356,9 +340,6 @@ static BlockDriver bdrv_blkverify = { .bdrv_aio_writev = blkverify_aio_writev, .bdrv_aio_flush = blkverify_aio_flush, - .bdrv_attach_aio_context = blkverify_attach_aio_context, - .bdrv_detach_aio_context = blkverify_detach_aio_context, - .is_filter = true, .bdrv_recurse_is_first_non_filter = blkverify_recurse_is_first_non_filter, }; diff --git a/block/block-backend.c b/block/block-backend.c index a1e2c7fa20..6928d61de4 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -34,6 +34,7 @@ struct BlockBackend { DriveInfo *legacy_dinfo; /* null unless created by drive_new() */ QTAILQ_ENTRY(BlockBackend) link; /* for block_backends */ QTAILQ_ENTRY(BlockBackend) monitor_link; /* for monitor_block_backends */ + BlockBackendPublic public; void *dev; /* attached device model, if any */ /* TODO change to DeviceState when all users are qdevified */ @@ -74,6 +75,7 @@ static const AIOCBInfo block_backend_aiocb_info = { }; static void drive_info_del(DriveInfo *dinfo); +static BlockBackend *bdrv_first_blk(BlockDriverState *bs); /* All BlockBackends */ static QTAILQ_HEAD(, BlockBackend) block_backends = @@ -90,9 +92,26 @@ static void blk_root_inherit_options(int *child_flags, QDict *child_options, /* We're not supposed to call this function for root nodes */ abort(); } +static void blk_root_drained_begin(BdrvChild *child); +static void blk_root_drained_end(BdrvChild *child); + +static void blk_root_change_media(BdrvChild *child, bool load); +static void blk_root_resize(BdrvChild *child); + +static const char *blk_root_get_name(BdrvChild *child) +{ + return blk_name(child->opaque); +} static const BdrvChildRole child_root = { - .inherit_options = blk_root_inherit_options, + .inherit_options = blk_root_inherit_options, + + .change_media = blk_root_change_media, + .resize = blk_root_resize, + .get_name = blk_root_get_name, + + .drained_begin = blk_root_drained_begin, + .drained_end = blk_root_drained_end, }; /* @@ -106,8 +125,12 @@ BlockBackend *blk_new(Error **errp) blk = g_new0(BlockBackend, 1); blk->refcnt = 1; + qemu_co_queue_init(&blk->public.throttled_reqs[0]); + qemu_co_queue_init(&blk->public.throttled_reqs[1]); + notifier_list_init(&blk->remove_bs_notifiers); notifier_list_init(&blk->insert_bs_notifiers); + QTAILQ_INSERT_TAIL(&block_backends, blk, link); return blk; } @@ -128,7 +151,7 @@ BlockBackend *blk_new_with_bs(Error **errp) bs = bdrv_new_root(); blk->root = bdrv_root_attach_child(bs, "root", &child_root); - bs->blk = blk; + blk->root->opaque = blk; return blk; } @@ -177,10 +200,6 @@ static void blk_delete(BlockBackend *blk) } assert(QLIST_EMPTY(&blk->remove_bs_notifiers.notifiers)); assert(QLIST_EMPTY(&blk->insert_bs_notifiers.notifiers)); - if (blk->root_state.throttle_state) { - g_free(blk->root_state.throttle_group); - throttle_group_unref(blk->root_state.throttle_state); - } QTAILQ_REMOVE(&block_backends, blk, link); drive_info_del(blk->legacy_dinfo); block_acct_cleanup(&blk->stats); 
@@ -267,28 +286,50 @@ BlockBackend *blk_next(BlockBackend *blk) : QTAILQ_FIRST(&monitor_block_backends); } -/* - * Iterates over all BlockDriverStates which are attached to a BlockBackend. - * This function is for use by bdrv_next(). - * - * @bs must be NULL or a BDS that is attached to a BB. - */ -BlockDriverState *blk_next_root_bs(BlockDriverState *bs) -{ +struct BdrvNextIterator { + enum { + BDRV_NEXT_BACKEND_ROOTS, + BDRV_NEXT_MONITOR_OWNED, + } phase; BlockBackend *blk; + BlockDriverState *bs; +}; - if (bs) { - assert(bs->blk); - blk = bs->blk; - } else { - blk = NULL; +/* Iterates over all top-level BlockDriverStates, i.e. BDSs that are owned by + * the monitor or attached to a BlockBackend */ +BdrvNextIterator *bdrv_next(BdrvNextIterator *it, BlockDriverState **bs) +{ + if (!it) { + it = g_new(BdrvNextIterator, 1); + *it = (BdrvNextIterator) { + .phase = BDRV_NEXT_BACKEND_ROOTS, + }; + } + + /* First, return all root nodes of BlockBackends. In order to avoid + * returning a BDS twice when multiple BBs refer to it, we only return it + * if the BB is the first one in the parent list of the BDS. */ + if (it->phase == BDRV_NEXT_BACKEND_ROOTS) { + do { + it->blk = blk_all_next(it->blk); + *bs = it->blk ? blk_bs(it->blk) : NULL; + } while (it->blk && (*bs == NULL || bdrv_first_blk(*bs) != it->blk)); + + if (*bs) { + return it; + } + it->phase = BDRV_NEXT_MONITOR_OWNED; } + /* Then return the monitor-owned BDSes without a BB attached. Ignore all + * BDSes that are attached to a BlockBackend here; they have been handled + * by the above block already */ do { - blk = blk_all_next(blk); - } while (blk && !blk->root); + it->bs = bdrv_next_monitor_owned(it->bs); + *bs = it->bs; + } while (*bs && bdrv_has_blk(*bs)); - return blk ? blk->root->bs : NULL; + return *bs ? it : NULL; } /* @@ -375,6 +416,26 @@ BlockDriverState *blk_bs(BlockBackend *blk) return blk->root ? blk->root->bs : NULL; } +static BlockBackend *bdrv_first_blk(BlockDriverState *bs) +{ + BdrvChild *child; + QLIST_FOREACH(child, &bs->parents, next_parent) { + if (child->role == &child_root) { + return child->opaque; + } + } + + return NULL; +} + +/* + * Returns true if @bs has an associated BlockBackend. + */ +bool bdrv_has_blk(BlockDriverState *bs) +{ + return bdrv_first_blk(bs) != NULL; +} + /* * Return @blk's DriveInfo if any, else null. */ @@ -411,17 +472,33 @@ BlockBackend *blk_by_legacy_dinfo(DriveInfo *dinfo) } /* + * Returns a pointer to the publicly accessible fields of @blk. + */ +BlockBackendPublic *blk_get_public(BlockBackend *blk) +{ + return &blk->public; +} + +/* + * Returns a BlockBackend given the associated @public fields. + */ +BlockBackend *blk_by_public(BlockBackendPublic *public) +{ + return container_of(public, BlockBackend, public); +} + +/* * Disassociates the currently associated BlockDriverState from @blk. 
*/ void blk_remove_bs(BlockBackend *blk) { - assert(blk->root->bs->blk == blk); - notifier_list_notify(&blk->remove_bs_notifiers, blk); + if (blk->public.throttle_state) { + throttle_timers_detach_aio_context(&blk->public.throttle_timers); + } blk_update_root_state(blk); - blk->root->bs->blk = NULL; bdrv_root_unref_child(blk->root); blk->root = NULL; } @@ -431,12 +508,15 @@ void blk_remove_bs(BlockBackend *blk) */ void blk_insert_bs(BlockBackend *blk, BlockDriverState *bs) { - assert(!blk->root && !bs->blk); bdrv_ref(bs); blk->root = bdrv_root_attach_child(bs, "root", &child_root); - bs->blk = blk; + blk->root->opaque = blk; notifier_list_notify(&blk->insert_bs_notifiers, blk); + if (blk->public.throttle_state) { + throttle_timers_attach_aio_context( + &blk->public.throttle_timers, bdrv_get_aio_context(bs)); + } } /* @@ -525,6 +605,11 @@ void blk_dev_change_media_cb(BlockBackend *blk, bool load) } } +static void blk_root_change_media(BdrvChild *child, bool load) +{ + blk_dev_change_media_cb(child->opaque, load); +} + /* * Does @blk's attached device model have removable media? * %true if no device model is attached. @@ -579,8 +664,10 @@ bool blk_dev_is_medium_locked(BlockBackend *blk) /* * Notify @blk's attached device model of a backend size change. */ -void blk_dev_resize_cb(BlockBackend *blk) +static void blk_root_resize(BdrvChild *child) { + BlockBackend *blk = child->opaque; + if (blk->dev_ops && blk->dev_ops->resize_cb) { blk->dev_ops->resize_cb(blk->dev_opaque); } @@ -692,6 +779,11 @@ static int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset, return ret; } + /* throttling disk I/O */ + if (blk->public.throttle_state) { + throttle_group_co_io_limits_intercept(blk, bytes, false); + } + return bdrv_co_preadv(blk_bs(blk), offset, bytes, qiov, flags); } @@ -706,6 +798,11 @@ static int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset, return ret; } + /* throttling disk I/O */ + if (blk->public.throttle_state) { + throttle_group_co_io_limits_intercept(blk, bytes, true); + } + if (!blk->enable_write_cache) { flags |= BDRV_REQ_FUA; } @@ -775,7 +872,6 @@ static int blk_prw(BlockBackend *blk, int64_t offset, uint8_t *buf, int blk_pread_unthrottled(BlockBackend *blk, int64_t offset, uint8_t *buf, int count) { - BlockDriverState *bs = blk_bs(blk); int ret; ret = blk_check_byte_request(blk, offset, count); @@ -783,9 +879,9 @@ int blk_pread_unthrottled(BlockBackend *blk, int64_t offset, uint8_t *buf, return ret; } - bdrv_no_throttling_begin(bs); + blk_root_drained_begin(blk->root); ret = blk_pread(blk, offset, buf, count); - bdrv_no_throttling_end(bs); + blk_root_drained_end(blk->root); return ret; } @@ -1008,20 +1104,6 @@ void blk_aio_cancel_async(BlockAIOCB *acb) bdrv_aio_cancel_async(acb); } -int blk_aio_multiwrite(BlockBackend *blk, BlockRequest *reqs, int num_reqs) -{ - int i, ret; - - for (i = 0; i < num_reqs; i++) { - ret = blk_check_request(blk, reqs[i].sector, reqs[i].nb_sectors); - if (ret < 0) { - return ret; - } - } - - return bdrv_aio_multiwrite(blk_bs(blk), reqs, num_reqs); -} - int blk_ioctl(BlockBackend *blk, unsigned long int req, void *buf) { if (!blk_is_available(blk)) { @@ -1334,7 +1416,14 @@ void blk_set_aio_context(BlockBackend *blk, AioContext *new_context) BlockDriverState *bs = blk_bs(blk); if (bs) { + if (blk->public.throttle_state) { + throttle_timers_detach_aio_context(&blk->public.throttle_timers); + } bdrv_set_aio_context(bs, new_context); + if (blk->public.throttle_state) { + throttle_timers_attach_aio_context(&blk->public.throttle_timers, 
+ new_context); + } } } @@ -1499,19 +1588,6 @@ void blk_update_root_state(BlockBackend *blk) blk->root_state.open_flags = blk->root->bs->open_flags; blk->root_state.read_only = blk->root->bs->read_only; blk->root_state.detect_zeroes = blk->root->bs->detect_zeroes; - - if (blk->root_state.throttle_group) { - g_free(blk->root_state.throttle_group); - throttle_group_unref(blk->root_state.throttle_state); - } - if (blk->root->bs->throttle_state) { - const char *name = throttle_group_get_name(blk->root->bs); - blk->root_state.throttle_group = g_strdup(name); - blk->root_state.throttle_state = throttle_group_incref(name); - } else { - blk->root_state.throttle_group = NULL; - blk->root_state.throttle_state = NULL; - } } /* @@ -1522,9 +1598,6 @@ void blk_update_root_state(BlockBackend *blk) void blk_apply_root_state(BlockBackend *blk, BlockDriverState *bs) { bs->detect_zeroes = blk->root_state.detect_zeroes; - if (blk->root_state.throttle_group) { - bdrv_io_limits_enable(bs, blk->root_state.throttle_group); - } } /* @@ -1587,3 +1660,59 @@ int blk_flush_all(void) return result; } + + +/* throttling disk I/O limits */ +void blk_set_io_limits(BlockBackend *blk, ThrottleConfig *cfg) +{ + throttle_group_config(blk, cfg); +} + +void blk_io_limits_disable(BlockBackend *blk) +{ + assert(blk->public.throttle_state); + bdrv_drained_begin(blk_bs(blk)); + throttle_group_unregister_blk(blk); + bdrv_drained_end(blk_bs(blk)); +} + +/* should be called before blk_set_io_limits if a limit is set */ +void blk_io_limits_enable(BlockBackend *blk, const char *group) +{ + assert(!blk->public.throttle_state); + throttle_group_register_blk(blk, group); +} + +void blk_io_limits_update_group(BlockBackend *blk, const char *group) +{ + /* this BB is not part of any group */ + if (!blk->public.throttle_state) { + return; + } + + /* this BB is a part of the same group than the one we want */ + if (!g_strcmp0(throttle_group_get_name(blk), group)) { + return; + } + + /* need to change the group this bs belong to */ + blk_io_limits_disable(blk); + blk_io_limits_enable(blk, group); +} + +static void blk_root_drained_begin(BdrvChild *child) +{ + BlockBackend *blk = child->opaque; + + if (blk->public.io_limits_disabled++ == 0) { + throttle_group_restart_blk(blk); + } +} + +static void blk_root_drained_end(BdrvChild *child) +{ + BlockBackend *blk = child->opaque; + + assert(blk->public.io_limits_disabled); + --blk->public.io_limits_disabled; +} diff --git a/block/commit.c b/block/commit.c index cba0e8c1e8..f308c8c6f0 100644 --- a/block/commit.c +++ b/block/commit.c @@ -214,13 +214,6 @@ void commit_start(BlockDriverState *bs, BlockDriverState *base, BlockDriverState *overlay_bs; Error *local_err = NULL; - if ((on_error == BLOCKDEV_ON_ERROR_STOP || - on_error == BLOCKDEV_ON_ERROR_ENOSPC) && - (!bs->blk || !blk_iostatus_is_enabled(bs->blk))) { - error_setg(errp, "Invalid parameter combination"); - return; - } - assert(top != bs); if (top == base) { error_setg(errp, "Invalid files for merge: top and base are the same"); diff --git a/block/io.c b/block/io.c index cd6d71a503..60a6bd8bdb 100644 --- a/block/io.c +++ b/block/io.c @@ -27,7 +27,6 @@ #include "sysemu/block-backend.h" #include "block/blockjob.h" #include "block/block_int.h" -#include "block/throttle-groups.h" #include "qemu/cutils.h" #include "qapi/error.h" #include "qemu/error-report.h" @@ -46,56 +45,26 @@ static void coroutine_fn bdrv_co_do_rw(void *opaque); static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, int64_t sector_num, int nb_sectors, 
BdrvRequestFlags flags); -/* throttling disk I/O limits */ -void bdrv_set_io_limits(BlockDriverState *bs, - ThrottleConfig *cfg) +static void bdrv_parent_drained_begin(BlockDriverState *bs) { - throttle_group_config(bs, cfg); -} + BdrvChild *c; -void bdrv_no_throttling_begin(BlockDriverState *bs) -{ - if (bs->io_limits_disabled++ == 0) { - throttle_group_restart_bs(bs); + QLIST_FOREACH(c, &bs->parents, next_parent) { + if (c->role->drained_begin) { + c->role->drained_begin(c); + } } } -void bdrv_no_throttling_end(BlockDriverState *bs) +static void bdrv_parent_drained_end(BlockDriverState *bs) { - assert(bs->io_limits_disabled); - --bs->io_limits_disabled; -} - -void bdrv_io_limits_disable(BlockDriverState *bs) -{ - assert(bs->throttle_state); - bdrv_no_throttling_begin(bs); - throttle_group_unregister_bs(bs); - bdrv_no_throttling_end(bs); -} - -/* should be called before bdrv_set_io_limits if a limit is set */ -void bdrv_io_limits_enable(BlockDriverState *bs, const char *group) -{ - assert(!bs->throttle_state); - throttle_group_register_bs(bs, group); -} - -void bdrv_io_limits_update_group(BlockDriverState *bs, const char *group) -{ - /* this bs is not part of any group */ - if (!bs->throttle_state) { - return; - } + BdrvChild *c; - /* this bs is a part of the same group than the one we want */ - if (!g_strcmp0(throttle_group_get_name(bs), group)) { - return; + QLIST_FOREACH(c, &bs->parents, next_parent) { + if (c->role->drained_end) { + c->role->drained_end(c); + } } - - /* need to change the group this bs belong to */ - bdrv_io_limits_disable(bs); - bdrv_io_limits_enable(bs, group); } void bdrv_refresh_limits(BlockDriverState *bs, Error **errp) @@ -182,12 +151,6 @@ bool bdrv_requests_pending(BlockDriverState *bs) if (!QLIST_EMPTY(&bs->tracked_requests)) { return true; } - if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) { - return true; - } - if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) { - return true; - } QLIST_FOREACH(child, &bs->children, next) { if (bdrv_requests_pending(child->bs)) { @@ -275,17 +238,17 @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs) */ void coroutine_fn bdrv_co_drain(BlockDriverState *bs) { - bdrv_no_throttling_begin(bs); + bdrv_parent_drained_begin(bs); bdrv_io_unplugged_begin(bs); bdrv_drain_recurse(bs); bdrv_co_yield_to_drain(bs); bdrv_io_unplugged_end(bs); - bdrv_no_throttling_end(bs); + bdrv_parent_drained_end(bs); } void bdrv_drain(BlockDriverState *bs) { - bdrv_no_throttling_begin(bs); + bdrv_parent_drained_begin(bs); bdrv_io_unplugged_begin(bs); bdrv_drain_recurse(bs); if (qemu_in_coroutine()) { @@ -294,7 +257,7 @@ void bdrv_drain(BlockDriverState *bs) bdrv_drain_poll(bs); } bdrv_io_unplugged_end(bs); - bdrv_no_throttling_end(bs); + bdrv_parent_drained_end(bs); } /* @@ -307,17 +270,18 @@ void bdrv_drain_all(void) { /* Always run first iteration so any pending completion BHs run */ bool busy = true; - BlockDriverState *bs = NULL; + BlockDriverState *bs; + BdrvNextIterator *it = NULL; GSList *aio_ctxs = NULL, *ctx; - while ((bs = bdrv_next(bs))) { + while ((it = bdrv_next(it, &bs))) { AioContext *aio_context = bdrv_get_aio_context(bs); aio_context_acquire(aio_context); if (bs->job) { block_job_pause(bs->job); } - bdrv_no_throttling_begin(bs); + bdrv_parent_drained_begin(bs); bdrv_io_unplugged_begin(bs); bdrv_drain_recurse(bs); aio_context_release(aio_context); @@ -338,10 +302,10 @@ void bdrv_drain_all(void) for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) { AioContext *aio_context = ctx->data; - bs = NULL; + it = NULL; 
aio_context_acquire(aio_context); - while ((bs = bdrv_next(bs))) { + while ((it = bdrv_next(it, &bs))) { if (aio_context == bdrv_get_aio_context(bs)) { if (bdrv_requests_pending(bs)) { busy = true; @@ -354,13 +318,13 @@ void bdrv_drain_all(void) } } - bs = NULL; - while ((bs = bdrv_next(bs))) { + it = NULL; + while ((it = bdrv_next(it, &bs))) { AioContext *aio_context = bdrv_get_aio_context(bs); aio_context_acquire(aio_context); bdrv_io_unplugged_end(bs); - bdrv_no_throttling_end(bs); + bdrv_parent_drained_end(bs); if (bs->job) { block_job_resume(bs->job); } @@ -1069,11 +1033,6 @@ int coroutine_fn bdrv_co_preadv(BlockDriverState *bs, flags |= BDRV_REQ_COPY_ON_READ; } - /* throttling disk I/O */ - if (bs->throttle_state) { - throttle_group_co_io_limits_intercept(bs, bytes, false); - } - /* Align read if necessary by padding qiov */ if (offset & (align - 1)) { head_buf = qemu_blockalign(bs, align); @@ -1430,11 +1389,6 @@ int coroutine_fn bdrv_co_pwritev(BlockDriverState *bs, return ret; } - /* throttling disk I/O */ - if (bs->throttle_state) { - throttle_group_co_io_limits_intercept(bs, bytes, true); - } - /* * Align write if necessary by performing a read-modify-write cycle. * Pad qiov with the read parts and be sure to have a tracked request not @@ -1925,200 +1879,6 @@ BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs, cb, opaque, true); } - -typedef struct MultiwriteCB { - int error; - int num_requests; - int num_callbacks; - struct { - BlockCompletionFunc *cb; - void *opaque; - QEMUIOVector *free_qiov; - } callbacks[]; -} MultiwriteCB; - -static void multiwrite_user_cb(MultiwriteCB *mcb) -{ - int i; - - for (i = 0; i < mcb->num_callbacks; i++) { - mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error); - if (mcb->callbacks[i].free_qiov) { - qemu_iovec_destroy(mcb->callbacks[i].free_qiov); - } - g_free(mcb->callbacks[i].free_qiov); - } -} - -static void multiwrite_cb(void *opaque, int ret) -{ - MultiwriteCB *mcb = opaque; - - trace_multiwrite_cb(mcb, ret); - - if (ret < 0 && !mcb->error) { - mcb->error = ret; - } - - mcb->num_requests--; - if (mcb->num_requests == 0) { - multiwrite_user_cb(mcb); - g_free(mcb); - } -} - -static int multiwrite_req_compare(const void *a, const void *b) -{ - const BlockRequest *req1 = a, *req2 = b; - - /* - * Note that we can't simply subtract req2->sector from req1->sector - * here as that could overflow the return value. - */ - if (req1->sector > req2->sector) { - return 1; - } else if (req1->sector < req2->sector) { - return -1; - } else { - return 0; - } -} - -/* - * Takes a bunch of requests and tries to merge them. Returns the number of - * requests that remain after merging. - */ -static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs, - int num_reqs, MultiwriteCB *mcb) -{ - int i, outidx; - - // Sort requests by start sector - qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare); - - // Check if adjacent requests touch the same clusters. If so, combine them, - // filling up gaps with zero sectors. - outidx = 0; - for (i = 1; i < num_reqs; i++) { - int merge = 0; - int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors; - - // Handle exactly sequential writes and overlapping writes. 
- if (reqs[i].sector <= oldreq_last) { - merge = 1; - } - - if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > - bs->bl.max_iov) { - merge = 0; - } - - if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors + - reqs[i].nb_sectors > bs->bl.max_transfer_length) { - merge = 0; - } - - if (merge) { - size_t size; - QEMUIOVector *qiov = g_malloc0(sizeof(*qiov)); - qemu_iovec_init(qiov, - reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1); - - // Add the first request to the merged one. If the requests are - // overlapping, drop the last sectors of the first request. - size = (reqs[i].sector - reqs[outidx].sector) << 9; - qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size); - - // We should need to add any zeros between the two requests - assert (reqs[i].sector <= oldreq_last); - - // Add the second request - qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size); - - // Add tail of first request, if necessary - if (qiov->size < reqs[outidx].qiov->size) { - qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size, - reqs[outidx].qiov->size - qiov->size); - } - - reqs[outidx].nb_sectors = qiov->size >> 9; - reqs[outidx].qiov = qiov; - - mcb->callbacks[i].free_qiov = reqs[outidx].qiov; - } else { - outidx++; - reqs[outidx].sector = reqs[i].sector; - reqs[outidx].nb_sectors = reqs[i].nb_sectors; - reqs[outidx].qiov = reqs[i].qiov; - } - } - - if (bs->blk) { - block_acct_merge_done(blk_get_stats(bs->blk), BLOCK_ACCT_WRITE, - num_reqs - outidx - 1); - } - - return outidx + 1; -} - -/* - * Submit multiple AIO write requests at once. - * - * On success, the function returns 0 and all requests in the reqs array have - * been submitted. In error case this function returns -1, and any of the - * requests may or may not be submitted yet. In particular, this means that the - * callback will be called for some of the requests, for others it won't. The - * caller must check the error field of the BlockRequest to wait for the right - * callbacks (if error != 0, no callback will be called). - * - * The implementation may modify the contents of the reqs array, e.g. to merge - * requests. However, the fields opaque and error are left unmodified as they - * are used to signal failure for a single request to the caller. - */ -int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs) -{ - MultiwriteCB *mcb; - int i; - - /* don't submit writes if we don't have a medium */ - if (bs->drv == NULL) { - for (i = 0; i < num_reqs; i++) { - reqs[i].error = -ENOMEDIUM; - } - return -1; - } - - if (num_reqs == 0) { - return 0; - } - - // Create MultiwriteCB structure - mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks)); - mcb->num_requests = 0; - mcb->num_callbacks = num_reqs; - - for (i = 0; i < num_reqs; i++) { - mcb->callbacks[i].cb = reqs[i].cb; - mcb->callbacks[i].opaque = reqs[i].opaque; - } - - // Check for mergable requests - num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb); - - trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs); - - /* Run the aio requests. 
*/ - mcb->num_requests = num_reqs; - for (i = 0; i < num_reqs; i++) { - bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov, - reqs[i].nb_sectors, reqs[i].flags, - multiwrite_cb, mcb, - true); - } - - return 0; -} - void bdrv_aio_cancel(BlockAIOCB *acb) { qemu_aio_ref(acb); @@ -2789,11 +2549,14 @@ void bdrv_drained_begin(BlockDriverState *bs) if (!bs->quiesce_counter++) { aio_disable_external(bdrv_get_aio_context(bs)); } + bdrv_parent_drained_begin(bs); bdrv_drain(bs); } void bdrv_drained_end(BlockDriverState *bs) { + bdrv_parent_drained_end(bs); + assert(bs->quiesce_counter > 0); if (--bs->quiesce_counter > 0) { return; diff --git a/block/mirror.c b/block/mirror.c index 039f48125e..b9986d8218 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -80,11 +80,11 @@ static BlockErrorAction mirror_error_action(MirrorBlockJob *s, bool read, { s->synced = false; if (read) { - return block_job_error_action(&s->common, s->common.bs, - s->on_source_error, true, error); + return block_job_error_action(&s->common, s->on_source_error, + true, error); } else { - return block_job_error_action(&s->common, s->target, - s->on_target_error, false, error); + return block_job_error_action(&s->common, s->on_target_error, + false, error); } } @@ -468,7 +468,7 @@ static void mirror_exit(BlockJob *job, void *opaque) /* This was checked in mirror_start_job(), but meanwhile one of the * nodes could have been newly attached to a BlockBackend. */ - if (to_replace->blk && s->target->blk) { + if (bdrv_has_blk(to_replace) && bdrv_has_blk(s->target)) { error_report("block job: Can't create node with two BlockBackends"); data->ret = -EINVAL; goto out; @@ -710,9 +710,6 @@ immediate_exit: g_free(s->cow_bitmap); g_free(s->in_flight_bitmap); bdrv_release_dirty_bitmap(bs, s->dirty_bitmap); - if (s->target->blk) { - blk_iostatus_disable(s->target->blk); - } data = g_malloc(sizeof(*data)); data->ret = ret; @@ -739,15 +736,6 @@ static void mirror_set_speed(BlockJob *job, int64_t speed, Error **errp) ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME); } -static void mirror_iostatus_reset(BlockJob *job) -{ - MirrorBlockJob *s = container_of(job, MirrorBlockJob, common); - - if (s->target->blk) { - blk_iostatus_reset(s->target->blk); - } -} - static void mirror_complete(BlockJob *job, Error **errp) { MirrorBlockJob *s = container_of(job, MirrorBlockJob, common); @@ -793,7 +781,6 @@ static const BlockJobDriver mirror_job_driver = { .instance_size = sizeof(MirrorBlockJob), .job_type = BLOCK_JOB_TYPE_MIRROR, .set_speed = mirror_set_speed, - .iostatus_reset= mirror_iostatus_reset, .complete = mirror_complete, }; @@ -801,8 +788,6 @@ static const BlockJobDriver commit_active_job_driver = { .instance_size = sizeof(MirrorBlockJob), .job_type = BLOCK_JOB_TYPE_COMMIT, .set_speed = mirror_set_speed, - .iostatus_reset - = mirror_iostatus_reset, .complete = mirror_complete, }; @@ -827,13 +812,6 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target, assert ((granularity & (granularity - 1)) == 0); - if ((on_source_error == BLOCKDEV_ON_ERROR_STOP || - on_source_error == BLOCKDEV_ON_ERROR_ENOSPC) && - (!bs->blk || !blk_iostatus_is_enabled(bs->blk))) { - error_setg(errp, QERR_INVALID_PARAMETER, "on-source-error"); - return; - } - if (buf_size < 0) { error_setg(errp, "Invalid parameter 'buf-size'"); return; @@ -853,7 +831,7 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target, } else { replaced_bs = bs; } - if (replaced_bs->blk && target->blk) { + if (bdrv_has_blk(replaced_bs) && 
bdrv_has_blk(target)) { error_setg(errp, "Can't create node with two BlockBackends"); return; } @@ -882,10 +860,6 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target, bdrv_op_block_all(s->target, s->common.blocker); - if (s->target->blk) { - blk_set_on_error(s->target->blk, on_target_error, on_target_error); - blk_iostatus_enable(s->target->blk); - } s->common.co = qemu_coroutine_create(mirror_run); trace_mirror_start(bs, s, s->common.co, opaque); qemu_coroutine_enter(s->common.co, s); diff --git a/block/qapi.c b/block/qapi.c index c5f6ba643c..5594f74d17 100644 --- a/block/qapi.c +++ b/block/qapi.c @@ -67,10 +67,10 @@ BlockDeviceInfo *bdrv_block_device_info(BlockBackend *blk, info->backing_file_depth = bdrv_get_backing_file_depth(bs); info->detect_zeroes = bs->detect_zeroes; - if (bs->throttle_state) { + if (blk && blk_get_public(blk)->throttle_state) { ThrottleConfig cfg; - throttle_group_get_config(bs, &cfg); + throttle_group_get_config(blk, &cfg); info->bps = cfg.buckets[THROTTLE_BPS_TOTAL].avg; info->bps_rd = cfg.buckets[THROTTLE_BPS_READ].avg; @@ -118,7 +118,7 @@ BlockDeviceInfo *bdrv_block_device_info(BlockBackend *blk, info->iops_size = cfg.op_size; info->has_group = true; - info->group = g_strdup(throttle_group_get_name(bs)); + info->group = g_strdup(throttle_group_get_name(blk)); } info->write_threshold = bdrv_write_threshold_get(bs); diff --git a/block/qcow2.c b/block/qcow2.c index 49d7cff17a..c9306a78c1 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -2413,7 +2413,7 @@ static bool is_zero_cluster(BlockDriverState *bs, int64_t start) BlockDriverState *file; int64_t res = bdrv_get_block_status_above(bs, NULL, start, s->cluster_sectors, &nr, &file); - return res >= 0 && ((res & BDRV_BLOCK_ZERO) || !(res & BDRV_BLOCK_DATA)); + return res >= 0 && (res & BDRV_BLOCK_ZERO) && nr == s->cluster_sectors; } static bool is_zero_cluster_top_locked(BlockDriverState *bs, int64_t start) @@ -2424,6 +2424,7 @@ static bool is_zero_cluster_top_locked(BlockDriverState *bs, int64_t start) int ret; ret = qcow2_get_cluster_offset(bs, start << BDRV_SECTOR_BITS, &nr, &off); + assert(nr == s->cluster_sectors); return ret == QCOW2_CLUSTER_UNALLOCATED || ret == QCOW2_CLUSTER_ZERO; } diff --git a/block/quorum.c b/block/quorum.c index 1ec3511528..ec6f3b9059 100644 --- a/block/quorum.c +++ b/block/quorum.c @@ -989,27 +989,6 @@ static void quorum_close(BlockDriverState *bs) g_free(s->children); } -static void quorum_detach_aio_context(BlockDriverState *bs) -{ - BDRVQuorumState *s = bs->opaque; - int i; - - for (i = 0; i < s->num_children; i++) { - bdrv_detach_aio_context(s->children[i]->bs); - } -} - -static void quorum_attach_aio_context(BlockDriverState *bs, - AioContext *new_context) -{ - BDRVQuorumState *s = bs->opaque; - int i; - - for (i = 0; i < s->num_children; i++) { - bdrv_attach_aio_context(s->children[i]->bs, new_context); - } -} - static void quorum_add_child(BlockDriverState *bs, BlockDriverState *child_bs, Error **errp) { @@ -1127,9 +1106,6 @@ static BlockDriver bdrv_quorum = { .bdrv_aio_readv = quorum_aio_readv, .bdrv_aio_writev = quorum_aio_writev, - .bdrv_detach_aio_context = quorum_detach_aio_context, - .bdrv_attach_aio_context = quorum_attach_aio_context, - .bdrv_add_child = quorum_add_child, .bdrv_del_child = quorum_del_child, diff --git a/block/snapshot.c b/block/snapshot.c index e9d721df68..3917ec5c91 100644 --- a/block/snapshot.c +++ b/block/snapshot.c @@ -373,9 +373,10 @@ int bdrv_snapshot_load_tmp_by_id_or_name(BlockDriverState *bs, bool 
bdrv_all_can_snapshot(BlockDriverState **first_bad_bs) { bool ok = true; - BlockDriverState *bs = NULL; + BlockDriverState *bs; + BdrvNextIterator *it = NULL; - while (ok && (bs = bdrv_next(bs))) { + while (ok && (it = bdrv_next(it, &bs))) { AioContext *ctx = bdrv_get_aio_context(bs); aio_context_acquire(ctx); @@ -393,10 +394,11 @@ int bdrv_all_delete_snapshot(const char *name, BlockDriverState **first_bad_bs, Error **err) { int ret = 0; - BlockDriverState *bs = NULL; + BlockDriverState *bs; + BdrvNextIterator *it = NULL; QEMUSnapshotInfo sn1, *snapshot = &sn1; - while (ret == 0 && (bs = bdrv_next(bs))) { + while (ret == 0 && (it = bdrv_next(it, &bs))) { AioContext *ctx = bdrv_get_aio_context(bs); aio_context_acquire(ctx); @@ -415,9 +417,10 @@ int bdrv_all_delete_snapshot(const char *name, BlockDriverState **first_bad_bs, int bdrv_all_goto_snapshot(const char *name, BlockDriverState **first_bad_bs) { int err = 0; - BlockDriverState *bs = NULL; + BlockDriverState *bs; + BdrvNextIterator *it = NULL; - while (err == 0 && (bs = bdrv_next(bs))) { + while (err == 0 && (it = bdrv_next(it, &bs))) { AioContext *ctx = bdrv_get_aio_context(bs); aio_context_acquire(ctx); @@ -435,9 +438,10 @@ int bdrv_all_find_snapshot(const char *name, BlockDriverState **first_bad_bs) { QEMUSnapshotInfo sn; int err = 0; - BlockDriverState *bs = NULL; + BlockDriverState *bs; + BdrvNextIterator *it = NULL; - while (err == 0 && (bs = bdrv_next(bs))) { + while (err == 0 && (it = bdrv_next(it, &bs))) { AioContext *ctx = bdrv_get_aio_context(bs); aio_context_acquire(ctx); @@ -457,9 +461,10 @@ int bdrv_all_create_snapshot(QEMUSnapshotInfo *sn, BlockDriverState **first_bad_bs) { int err = 0; - BlockDriverState *bs = NULL; + BlockDriverState *bs; + BdrvNextIterator *it = NULL; - while (err == 0 && (bs = bdrv_next(bs))) { + while (err == 0 && (it = bdrv_next(it, &bs))) { AioContext *ctx = bdrv_get_aio_context(bs); aio_context_acquire(ctx); @@ -480,9 +485,10 @@ int bdrv_all_create_snapshot(QEMUSnapshotInfo *sn, BlockDriverState *bdrv_all_find_vmstate_bs(void) { bool not_found = true; - BlockDriverState *bs = NULL; + BlockDriverState *bs; + BdrvNextIterator *it = NULL; - while (not_found && (bs = bdrv_next(bs))) { + while (not_found && (it = bdrv_next(it, &bs))) { AioContext *ctx = bdrv_get_aio_context(bs); aio_context_acquire(ctx); diff --git a/block/stream.c b/block/stream.c index 332b9a183e..40aa32212e 100644 --- a/block/stream.c +++ b/block/stream.c @@ -163,8 +163,7 @@ wait: } if (ret < 0) { BlockErrorAction action = - block_job_error_action(&s->common, s->common.bs, s->on_error, - true, -ret); + block_job_error_action(&s->common, s->on_error, true, -ret); if (action == BLOCK_ERROR_ACTION_STOP) { n = 0; continue; @@ -224,13 +223,6 @@ void stream_start(BlockDriverState *bs, BlockDriverState *base, { StreamBlockJob *s; - if ((on_error == BLOCKDEV_ON_ERROR_STOP || - on_error == BLOCKDEV_ON_ERROR_ENOSPC) && - (!bs->blk || !blk_iostatus_is_enabled(bs->blk))) { - error_setg(errp, QERR_INVALID_PARAMETER, "on-error"); - return; - } - s = block_job_create(&stream_job_driver, bs, speed, cb, opaque, errp); if (!s) { return; diff --git a/block/throttle-groups.c b/block/throttle-groups.c index 9ac063a0cd..59545e287e 100644 --- a/block/throttle-groups.c +++ b/block/throttle-groups.c @@ -23,13 +23,14 @@ */ #include "qemu/osdep.h" +#include "sysemu/block-backend.h" #include "block/throttle-groups.h" #include "qemu/queue.h" #include "qemu/thread.h" #include "sysemu/qtest.h" /* The ThrottleGroup structure (with its ThrottleState) is shared - * 
among different BlockDriverState and it's independent from + * among different BlockBackends and it's independent from * AioContext, so in order to use it from different threads it needs * its own locking. * @@ -39,26 +40,26 @@ * The whole ThrottleGroup structure is private and invisible to * outside users, that only use it through its ThrottleState. * - * In addition to the ThrottleGroup structure, BlockDriverState has + * In addition to the ThrottleGroup structure, BlockBackendPublic has * fields that need to be accessed by other members of the group and - * therefore also need to be protected by this lock. Once a BDS is - * registered in a group those fields can be accessed by other threads - * any time. + * therefore also need to be protected by this lock. Once a + * BlockBackend is registered in a group those fields can be accessed + * by other threads any time. * * Again, all this is handled internally and is mostly transparent to * the outside. The 'throttle_timers' field however has an additional * constraint because it may be temporarily invalid (see for example * bdrv_set_aio_context()). Therefore in this file a thread will - * access some other BDS's timers only after verifying that that BDS - * has throttled requests in the queue. + * access some other BlockBackend's timers only after verifying that + * that BlockBackend has throttled requests in the queue. */ typedef struct ThrottleGroup { char *name; /* This is constant during the lifetime of the group */ QemuMutex lock; /* This lock protects the following four fields */ ThrottleState ts; - QLIST_HEAD(, BlockDriverState) head; - BlockDriverState *tokens[2]; + QLIST_HEAD(, BlockBackendPublic) head; + BlockBackend *tokens[2]; bool any_timer_armed[2]; /* These two are protected by the global throttle_groups_lock */ @@ -132,94 +133,95 @@ void throttle_group_unref(ThrottleState *ts) qemu_mutex_unlock(&throttle_groups_lock); } -/* Get the name from a BlockDriverState's ThrottleGroup. The name (and - * the pointer) is guaranteed to remain constant during the lifetime - * of the group. +/* Get the name from a BlockBackend's ThrottleGroup. The name (and the pointer) + * is guaranteed to remain constant during the lifetime of the group. * - * @bs: a BlockDriverState that is member of a throttling group + * @blk: a BlockBackend that is member of a throttling group * @ret: the name of the group. */ -const char *throttle_group_get_name(BlockDriverState *bs) +const char *throttle_group_get_name(BlockBackend *blk) { - ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts); + BlockBackendPublic *blkp = blk_get_public(blk); + ThrottleGroup *tg = container_of(blkp->throttle_state, ThrottleGroup, ts); return tg->name; } -/* Return the next BlockDriverState in the round-robin sequence, - * simulating a circular list. +/* Return the next BlockBackend in the round-robin sequence, simulating a + * circular list. * * This assumes that tg->lock is held. 
* - * @bs: the current BlockDriverState - * @ret: the next BlockDriverState in the sequence + * @blk: the current BlockBackend + * @ret: the next BlockBackend in the sequence */ -static BlockDriverState *throttle_group_next_bs(BlockDriverState *bs) +static BlockBackend *throttle_group_next_blk(BlockBackend *blk) { - ThrottleState *ts = bs->throttle_state; + BlockBackendPublic *blkp = blk_get_public(blk); + ThrottleState *ts = blkp->throttle_state; ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts); - BlockDriverState *next = QLIST_NEXT(bs, round_robin); + BlockBackendPublic *next = QLIST_NEXT(blkp, round_robin); if (!next) { - return QLIST_FIRST(&tg->head); + next = QLIST_FIRST(&tg->head); } - return next; + return blk_by_public(next); } -/* Return the next BlockDriverState in the round-robin sequence with - * pending I/O requests. +/* Return the next BlockBackend in the round-robin sequence with pending I/O + * requests. * * This assumes that tg->lock is held. * - * @bs: the current BlockDriverState + * @blk: the current BlockBackend * @is_write: the type of operation (read/write) - * @ret: the next BlockDriverState with pending requests, or bs - * if there is none. + * @ret: the next BlockBackend with pending requests, or blk if there is + * none. */ -static BlockDriverState *next_throttle_token(BlockDriverState *bs, - bool is_write) +static BlockBackend *next_throttle_token(BlockBackend *blk, bool is_write) { - ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts); - BlockDriverState *token, *start; + BlockBackendPublic *blkp = blk_get_public(blk); + ThrottleGroup *tg = container_of(blkp->throttle_state, ThrottleGroup, ts); + BlockBackend *token, *start; start = token = tg->tokens[is_write]; /* get next bs round in round robin style */ - token = throttle_group_next_bs(token); - while (token != start && !token->pending_reqs[is_write]) { - token = throttle_group_next_bs(token); + token = throttle_group_next_blk(token); + while (token != start && !blkp->pending_reqs[is_write]) { + token = throttle_group_next_blk(token); } /* If no IO are queued for scheduling on the next round robin token * then decide the token is the current bs because chances are * the current bs get the current request queued. */ - if (token == start && !token->pending_reqs[is_write]) { - token = bs; + if (token == start && !blkp->pending_reqs[is_write]) { + token = blk; } return token; } -/* Check if the next I/O request for a BlockDriverState needs to be - * throttled or not. If there's no timer set in this group, set one - * and update the token accordingly. +/* Check if the next I/O request for a BlockBackend needs to be throttled or + * not. If there's no timer set in this group, set one and update the token + * accordingly. * * This assumes that tg->lock is held. 
* - * @bs: the current BlockDriverState + * @blk: the current BlockBackend * @is_write: the type of operation (read/write) * @ret: whether the I/O request needs to be throttled or not */ -static bool throttle_group_schedule_timer(BlockDriverState *bs, - bool is_write) +static bool throttle_group_schedule_timer(BlockBackend *blk, bool is_write) { - ThrottleState *ts = bs->throttle_state; - ThrottleTimers *tt = &bs->throttle_timers; + BlockBackendPublic *blkp = blk_get_public(blk); + ThrottleState *ts = blkp->throttle_state; + ThrottleTimers *tt = &blkp->throttle_timers; ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts); bool must_wait; - if (bs->io_limits_disabled) { + if (blkp->io_limits_disabled) { return false; } @@ -230,9 +232,9 @@ static bool throttle_group_schedule_timer(BlockDriverState *bs, must_wait = throttle_schedule_timer(ts, tt, is_write); - /* If a timer just got armed, set bs as the current token */ + /* If a timer just got armed, set blk as the current token */ if (must_wait) { - tg->tokens[is_write] = bs; + tg->tokens[is_write] = blk; tg->any_timer_armed[is_write] = true; } @@ -243,18 +245,19 @@ static bool throttle_group_schedule_timer(BlockDriverState *bs, * * This assumes that tg->lock is held. * - * @bs: the current BlockDriverState + * @blk: the current BlockBackend * @is_write: the type of operation (read/write) */ -static void schedule_next_request(BlockDriverState *bs, bool is_write) +static void schedule_next_request(BlockBackend *blk, bool is_write) { - ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts); + BlockBackendPublic *blkp = blk_get_public(blk); + ThrottleGroup *tg = container_of(blkp->throttle_state, ThrottleGroup, ts); bool must_wait; - BlockDriverState *token; + BlockBackend *token; /* Check if there's any pending request to schedule next */ - token = next_throttle_token(bs, is_write); - if (!token->pending_reqs[is_write]) { + token = next_throttle_token(blk, is_write); + if (!blkp->pending_reqs[is_write]) { return; } @@ -263,12 +266,12 @@ static void schedule_next_request(BlockDriverState *bs, bool is_write) /* If it doesn't have to wait, queue it for immediate execution */ if (!must_wait) { - /* Give preference to requests from the current bs */ + /* Give preference to requests from the current blk */ if (qemu_in_coroutine() && - qemu_co_queue_next(&bs->throttled_reqs[is_write])) { - token = bs; + qemu_co_queue_next(&blkp->throttled_reqs[is_write])) { + token = blk; } else { - ThrottleTimers *tt = &token->throttle_timers; + ThrottleTimers *tt = &blkp->throttle_timers; int64_t now = qemu_clock_get_ns(tt->clock_type); timer_mod(tt->timers[is_write], now + 1); tg->any_timer_armed[is_write] = true; @@ -281,48 +284,50 @@ static void schedule_next_request(BlockDriverState *bs, bool is_write) * if necessary, and schedule the next request using a round robin * algorithm. 
* - * @bs: the current BlockDriverState + * @blk: the current BlockBackend * @bytes: the number of bytes for this I/O * @is_write: the type of operation (read/write) */ -void coroutine_fn throttle_group_co_io_limits_intercept(BlockDriverState *bs, +void coroutine_fn throttle_group_co_io_limits_intercept(BlockBackend *blk, unsigned int bytes, bool is_write) { bool must_wait; - BlockDriverState *token; + BlockBackend *token; - ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts); + BlockBackendPublic *blkp = blk_get_public(blk); + ThrottleGroup *tg = container_of(blkp->throttle_state, ThrottleGroup, ts); qemu_mutex_lock(&tg->lock); /* First we check if this I/O has to be throttled. */ - token = next_throttle_token(bs, is_write); + token = next_throttle_token(blk, is_write); must_wait = throttle_group_schedule_timer(token, is_write); /* Wait if there's a timer set or queued requests of this type */ - if (must_wait || bs->pending_reqs[is_write]) { - bs->pending_reqs[is_write]++; + if (must_wait || blkp->pending_reqs[is_write]) { + blkp->pending_reqs[is_write]++; qemu_mutex_unlock(&tg->lock); - qemu_co_queue_wait(&bs->throttled_reqs[is_write]); + qemu_co_queue_wait(&blkp->throttled_reqs[is_write]); qemu_mutex_lock(&tg->lock); - bs->pending_reqs[is_write]--; + blkp->pending_reqs[is_write]--; } /* The I/O will be executed, so do the accounting */ - throttle_account(bs->throttle_state, is_write, bytes); + throttle_account(blkp->throttle_state, is_write, bytes); /* Schedule the next request */ - schedule_next_request(bs, is_write); + schedule_next_request(blk, is_write); qemu_mutex_unlock(&tg->lock); } -void throttle_group_restart_bs(BlockDriverState *bs) +void throttle_group_restart_blk(BlockBackend *blk) { + BlockBackendPublic *blkp = blk_get_public(blk); int i; for (i = 0; i < 2; i++) { - while (qemu_co_enter_next(&bs->throttled_reqs[i])) { + while (qemu_co_enter_next(&blkp->throttled_reqs[i])) { ; } } @@ -332,13 +337,14 @@ void throttle_group_restart_bs(BlockDriverState *bs) * to throttle_config(), but guarantees atomicity within the * throttling group. * - * @bs: a BlockDriverState that is member of the group + * @blk: a BlockBackend that is a member of the group * @cfg: the configuration to set */ -void throttle_group_config(BlockDriverState *bs, ThrottleConfig *cfg) +void throttle_group_config(BlockBackend *blk, ThrottleConfig *cfg) { - ThrottleTimers *tt = &bs->throttle_timers; - ThrottleState *ts = bs->throttle_state; + BlockBackendPublic *blkp = blk_get_public(blk); + ThrottleTimers *tt = &blkp->throttle_timers; + ThrottleState *ts = blkp->throttle_state; ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts); qemu_mutex_lock(&tg->lock); /* throttle_config() cancels the timers */ @@ -351,20 +357,21 @@ void throttle_group_config(BlockDriverState *bs, ThrottleConfig *cfg) throttle_config(ts, tt, cfg); qemu_mutex_unlock(&tg->lock); - qemu_co_enter_next(&bs->throttled_reqs[0]); - qemu_co_enter_next(&bs->throttled_reqs[1]); + qemu_co_enter_next(&blkp->throttled_reqs[0]); + qemu_co_enter_next(&blkp->throttled_reqs[1]); } /* Get the throttle configuration from a particular group. Similar to * throttle_get_config(), but guarantees atomicity within the * throttling group. 
* - * @bs: a BlockDriverState that is member of the group + * @blk: a BlockBackend that is a member of the group * @cfg: the configuration will be written here */ -void throttle_group_get_config(BlockDriverState *bs, ThrottleConfig *cfg) +void throttle_group_get_config(BlockBackend *blk, ThrottleConfig *cfg) { - ThrottleState *ts = bs->throttle_state; + BlockBackendPublic *blkp = blk_get_public(blk); + ThrottleState *ts = blkp->throttle_state; ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts); qemu_mutex_lock(&tg->lock); throttle_get_config(ts, cfg); @@ -374,12 +381,13 @@ void throttle_group_get_config(BlockDriverState *bs, ThrottleConfig *cfg) /* ThrottleTimers callback. This wakes up a request that was waiting * because it had been throttled. * - * @bs: the BlockDriverState whose request had been throttled + * @blk: the BlockBackend whose request had been throttled * @is_write: the type of operation (read/write) */ -static void timer_cb(BlockDriverState *bs, bool is_write) +static void timer_cb(BlockBackend *blk, bool is_write) { - ThrottleState *ts = bs->throttle_state; + BlockBackendPublic *blkp = blk_get_public(blk); + ThrottleState *ts = blkp->throttle_state; ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts); bool empty_queue; @@ -389,13 +397,13 @@ static void timer_cb(BlockDriverState *bs, bool is_write) qemu_mutex_unlock(&tg->lock); /* Run the request that was waiting for this timer */ - empty_queue = !qemu_co_enter_next(&bs->throttled_reqs[is_write]); + empty_queue = !qemu_co_enter_next(&blkp->throttled_reqs[is_write]); /* If the request queue was empty then we have to take care of * scheduling the next one */ if (empty_queue) { qemu_mutex_lock(&tg->lock); - schedule_next_request(bs, is_write); + schedule_next_request(blk, is_write); qemu_mutex_unlock(&tg->lock); } } @@ -410,17 +418,17 @@ static void write_timer_cb(void *opaque) timer_cb(opaque, true); } -/* Register a BlockDriverState in the throttling group, also - * initializing its timers and updating its throttle_state pointer to - * point to it. If a throttling group with that name does not exist - * yet, it will be created. +/* Register a BlockBackend in the throttling group, also initializing its + * timers and updating its throttle_state pointer to point to it. If a + * throttling group with that name does not exist yet, it will be created. 
* - * @bs: the BlockDriverState to insert + * @blk: the BlockBackend to insert * @groupname: the name of the group */ -void throttle_group_register_bs(BlockDriverState *bs, const char *groupname) +void throttle_group_register_blk(BlockBackend *blk, const char *groupname) { int i; + BlockBackendPublic *blkp = blk_get_public(blk); ThrottleState *ts = throttle_group_incref(groupname); ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts); int clock_type = QEMU_CLOCK_REALTIME; @@ -430,67 +438,67 @@ void throttle_group_register_bs(BlockDriverState *bs, const char *groupname) clock_type = QEMU_CLOCK_VIRTUAL; } - bs->throttle_state = ts; + blkp->throttle_state = ts; qemu_mutex_lock(&tg->lock); - /* If the ThrottleGroup is new set this BlockDriverState as the token */ + /* If the ThrottleGroup is new set this BlockBackend as the token */ for (i = 0; i < 2; i++) { if (!tg->tokens[i]) { - tg->tokens[i] = bs; + tg->tokens[i] = blk; } } - QLIST_INSERT_HEAD(&tg->head, bs, round_robin); + QLIST_INSERT_HEAD(&tg->head, blkp, round_robin); - throttle_timers_init(&bs->throttle_timers, - bdrv_get_aio_context(bs), + throttle_timers_init(&blkp->throttle_timers, + blk_get_aio_context(blk), clock_type, read_timer_cb, write_timer_cb, - bs); + blk); qemu_mutex_unlock(&tg->lock); } -/* Unregister a BlockDriverState from its group, removing it from the - * list, destroying the timers and setting the throttle_state pointer - * to NULL. +/* Unregister a BlockBackend from its group, removing it from the list, + * destroying the timers and setting the throttle_state pointer to NULL. * - * The BlockDriverState must not have pending throttled requests, so - * the caller has to drain them first. + * The BlockBackend must not have pending throttled requests, so the caller has + * to drain them first. * * The group will be destroyed if it's empty after this operation. 
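Registration and unregistration are driven by the new blk_io_limits_enable()/blk_io_limits_disable() wrappers declared later in this series. A minimal sketch of what they are expected to do, assuming the node is drained before leaving the group (illustrative only, not the actual block-backend.c bodies):

    void sketch_blk_io_limits_enable(BlockBackend *blk, const char *group)
    {
        assert(!blk_get_public(blk)->throttle_state);
        throttle_group_register_blk(blk, group);   /* joins or creates the group */
    }

    void sketch_blk_io_limits_disable(BlockBackend *blk)
    {
        assert(blk_get_public(blk)->throttle_state);

        /* Unregistering asserts that no request is queued or pending, so the
         * node has to be drained first. */
        bdrv_drained_begin(blk_bs(blk));
        throttle_group_unregister_blk(blk);
        bdrv_drained_end(blk_bs(blk));
    }
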
* - * @bs: the BlockDriverState to remove + * @blk: the BlockBackend to remove */ -void throttle_group_unregister_bs(BlockDriverState *bs) +void throttle_group_unregister_blk(BlockBackend *blk) { - ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts); + BlockBackendPublic *blkp = blk_get_public(blk); + ThrottleGroup *tg = container_of(blkp->throttle_state, ThrottleGroup, ts); int i; - assert(bs->pending_reqs[0] == 0 && bs->pending_reqs[1] == 0); - assert(qemu_co_queue_empty(&bs->throttled_reqs[0])); - assert(qemu_co_queue_empty(&bs->throttled_reqs[1])); + assert(blkp->pending_reqs[0] == 0 && blkp->pending_reqs[1] == 0); + assert(qemu_co_queue_empty(&blkp->throttled_reqs[0])); + assert(qemu_co_queue_empty(&blkp->throttled_reqs[1])); qemu_mutex_lock(&tg->lock); for (i = 0; i < 2; i++) { - if (tg->tokens[i] == bs) { - BlockDriverState *token = throttle_group_next_bs(bs); - /* Take care of the case where this is the last bs in the group */ - if (token == bs) { + if (tg->tokens[i] == blk) { + BlockBackend *token = throttle_group_next_blk(blk); + /* Take care of the case where this is the last blk in the group */ + if (token == blk) { token = NULL; } tg->tokens[i] = token; } } - /* remove the current bs from the list */ - QLIST_REMOVE(bs, round_robin); - throttle_timers_destroy(&bs->throttle_timers); + /* remove the current blk from the list */ + QLIST_REMOVE(blkp, round_robin); + throttle_timers_destroy(&blkp->throttle_timers); qemu_mutex_unlock(&tg->lock); throttle_group_unref(&tg->ts); - bs->throttle_state = NULL; + blkp->throttle_state = NULL; } static void throttle_groups_init(void) diff --git a/block/vmdk.c b/block/vmdk.c index 1cb4b8529c..372e5edc15 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -2344,27 +2344,6 @@ static int vmdk_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) return 0; } -static void vmdk_detach_aio_context(BlockDriverState *bs) -{ - BDRVVmdkState *s = bs->opaque; - int i; - - for (i = 0; i < s->num_extents; i++) { - bdrv_detach_aio_context(s->extents[i].file->bs); - } -} - -static void vmdk_attach_aio_context(BlockDriverState *bs, - AioContext *new_context) -{ - BDRVVmdkState *s = bs->opaque; - int i; - - for (i = 0; i < s->num_extents; i++) { - bdrv_attach_aio_context(s->extents[i].file->bs, new_context); - } -} - static QemuOptsList vmdk_create_opts = { .name = "vmdk-create-opts", .head = QTAILQ_HEAD_INITIALIZER(vmdk_create_opts.head), @@ -2434,8 +2413,6 @@ static BlockDriver bdrv_vmdk = { .bdrv_get_specific_info = vmdk_get_specific_info, .bdrv_refresh_limits = vmdk_refresh_limits, .bdrv_get_info = vmdk_get_info, - .bdrv_detach_aio_context = vmdk_detach_aio_context, - .bdrv_attach_aio_context = vmdk_attach_aio_context, .supports_backing = true, .create_opts = &vmdk_create_opts, diff --git a/blockdev.c b/blockdev.c index 1892b8ec8e..40e4e6fc6f 100644 --- a/blockdev.c +++ b/blockdev.c @@ -577,15 +577,6 @@ static BlockBackend *blockdev_init(const char *file, QDict *bs_opts, blk_rs->read_only = !(bdrv_flags & BDRV_O_RDWR); blk_rs->detect_zeroes = detect_zeroes; - if (throttle_enabled(&cfg)) { - if (!throttling_group) { - throttling_group = blk_name(blk); - } - blk_rs->throttle_group = g_strdup(throttling_group); - blk_rs->throttle_state = throttle_group_incref(throttling_group); - blk_rs->throttle_state->cfg = cfg; - } - QDECREF(bs_opts); } else { if (file && !*file) { @@ -611,15 +602,6 @@ static BlockBackend *blockdev_init(const char *file, QDict *bs_opts, bs->detect_zeroes = detect_zeroes; - /* disk I/O throttling */ - if 
(throttle_enabled(&cfg)) { - if (!throttling_group) { - throttling_group = blk_name(blk); - } - bdrv_io_limits_enable(bs, throttling_group); - bdrv_set_io_limits(bs, &cfg); - } - if (bdrv_key_required(bs)) { autostart = 0; } @@ -633,6 +615,15 @@ static BlockBackend *blockdev_init(const char *file, QDict *bs_opts, } } + /* disk I/O throttling */ + if (throttle_enabled(&cfg)) { + if (!throttling_group) { + throttling_group = blk_name(blk); + } + blk_io_limits_enable(blk, throttling_group); + blk_set_io_limits(blk, &cfg); + } + blk_set_enable_write_cache(blk, !writethrough); blk_set_on_error(blk, on_read_error, on_write_error); @@ -1785,9 +1776,9 @@ static void external_snapshot_prepare(BlkActionState *common, return; } - if (state->new_bs->blk != NULL) { + if (bdrv_has_blk(state->new_bs)) { error_setg(errp, "The snapshot is already in use by %s", - blk_name(state->new_bs->blk)); + bdrv_get_parent_name(state->new_bs)); return; } @@ -2290,16 +2281,29 @@ exit: block_job_txn_unref(block_job_txn); } +static int do_open_tray(const char *device, bool force, Error **errp); + void qmp_eject(const char *device, bool has_force, bool force, Error **errp) { Error *local_err = NULL; + int rc; + + if (!has_force) { + force = false; + } - qmp_blockdev_open_tray(device, has_force, force, &local_err); + rc = do_open_tray(device, force, &local_err); if (local_err) { error_propagate(errp, local_err); return; } + if (rc == EINPROGRESS) { + error_setg(errp, "Device '%s' is locked and force was not specified, " + "wait for tray to open and try again", device); + return; + } + qmp_x_blockdev_remove_medium(device, errp); } @@ -2327,35 +2331,36 @@ void qmp_block_passwd(bool has_device, const char *device, aio_context_release(aio_context); } -void qmp_blockdev_open_tray(const char *device, bool has_force, bool force, - Error **errp) +/** + * returns -errno on fatal error, +errno for non-fatal situations. + * errp will always be set when the return code is negative. + * May return +ENOSYS if the device has no tray, + * or +EINPROGRESS if the tray is locked and the guest has been notified. 
+ */ +static int do_open_tray(const char *device, bool force, Error **errp) { BlockBackend *blk; bool locked; - if (!has_force) { - force = false; - } - blk = blk_by_name(device); if (!blk) { error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND, "Device '%s' not found", device); - return; + return -ENODEV; } if (!blk_dev_has_removable_media(blk)) { error_setg(errp, "Device '%s' is not removable", device); - return; + return -ENOTSUP; } if (!blk_dev_has_tray(blk)) { /* Ignore this command on tray-less devices */ - return; + return ENOSYS; } if (blk_dev_is_tray_open(blk)) { - return; + return 0; } locked = blk_dev_is_medium_locked(blk); @@ -2366,6 +2371,21 @@ void qmp_blockdev_open_tray(const char *device, bool has_force, bool force, if (!locked || force) { blk_dev_change_media_cb(blk, false); } + + if (locked && !force) { + return EINPROGRESS; + } + + return 0; +} + +void qmp_blockdev_open_tray(const char *device, bool has_force, bool force, + Error **errp) +{ + if (!has_force) { + force = false; + } + do_open_tray(device, force, errp); } void qmp_blockdev_close_tray(const char *device, Error **errp) @@ -2503,9 +2523,9 @@ void qmp_x_blockdev_insert_medium(const char *device, const char *node_name, return; } - if (bs->blk) { + if (bdrv_has_blk(bs)) { error_setg(errp, "Node '%s' is already in use by '%s'", node_name, - blk_name(bs->blk)); + bdrv_get_parent_name(bs)); return; } @@ -2570,8 +2590,6 @@ void qmp_blockdev_change_medium(const char *device, const char *filename, goto fail; } - blk_apply_root_state(blk, medium_bs); - bdrv_add_key(medium_bs, NULL, &err); if (err) { error_propagate(errp, err); @@ -2596,6 +2614,8 @@ void qmp_blockdev_change_medium(const char *device, const char *filename, goto fail; } + blk_apply_root_state(blk, medium_bs); + qmp_blockdev_close_tray(device, errp); fail: @@ -2661,13 +2681,6 @@ void qmp_block_set_io_throttle(const char *device, int64_t bps, int64_t bps_rd, goto out; } - /* The BlockBackend must be the only parent */ - assert(QLIST_FIRST(&bs->parents)); - if (QLIST_NEXT(QLIST_FIRST(&bs->parents), next_parent)) { - error_setg(errp, "Cannot throttle device with multiple parents"); - goto out; - } - throttle_config_init(&cfg); cfg.buckets[THROTTLE_BPS_TOTAL].avg = bps; cfg.buckets[THROTTLE_BPS_READ].avg = bps_rd; @@ -2726,16 +2739,16 @@ void qmp_block_set_io_throttle(const char *device, int64_t bps, int64_t bps_rd, if (throttle_enabled(&cfg)) { /* Enable I/O limits if they're not enabled yet, otherwise * just update the throttling group. */ - if (!bs->throttle_state) { - bdrv_io_limits_enable(bs, has_group ? group : device); + if (!blk_get_public(blk)->throttle_state) { + blk_io_limits_enable(blk, has_group ? 
group : device); } else if (has_group) { - bdrv_io_limits_update_group(bs, group); + blk_io_limits_update_group(blk, group); } /* Set the new throttling configuration */ - bdrv_set_io_limits(bs, &cfg); - } else if (bs->throttle_state) { + blk_set_io_limits(blk, &cfg); + } else if (blk_get_public(blk)->throttle_state) { /* If all throttling settings are set to 0, disable I/O limits */ - bdrv_io_limits_disable(bs); + blk_io_limits_disable(blk); } out: @@ -3457,7 +3470,7 @@ static void blockdev_mirror_common(BlockDriverState *bs, if (bdrv_op_is_blocked(target, BLOCK_OP_TYPE_MIRROR_TARGET, errp)) { return; } - if (target->blk) { + if (bdrv_has_blk(target)) { error_setg(errp, "Cannot mirror to an attached block device"); return; } @@ -4046,15 +4059,15 @@ void qmp_x_blockdev_del(bool has_id, const char *id, bs = blk_bs(blk); aio_context = blk_get_aio_context(blk); } else { + blk = NULL; bs = bdrv_find_node(node_name); if (!bs) { error_setg(errp, "Cannot find node %s", node_name); return; } - blk = bs->blk; - if (blk) { + if (bdrv_has_blk(bs)) { error_setg(errp, "Node %s is in use by %s", - node_name, blk_name(blk)); + node_name, bdrv_get_parent_name(bs)); return; } aio_context = bdrv_get_aio_context(bs); @@ -4151,8 +4164,9 @@ BlockJobInfoList *qmp_query_block_jobs(Error **errp) { BlockJobInfoList *head = NULL, **p_next = &head; BlockDriverState *bs; + BdrvNextIterator *it = NULL; - for (bs = bdrv_next(NULL); bs; bs = bdrv_next(bs)) { + while ((it = bdrv_next(it, &bs))) { AioContext *aio_context = bdrv_get_aio_context(bs); aio_context_acquire(aio_context); diff --git a/blockjob.c b/blockjob.c index 9fc37ca965..5b840a7df6 100644 --- a/blockjob.c +++ b/blockjob.c @@ -411,8 +411,7 @@ void block_job_event_ready(BlockJob *job) job->speed, &error_abort); } -BlockErrorAction block_job_error_action(BlockJob *job, BlockDriverState *bs, - BlockdevOnError on_err, +BlockErrorAction block_job_error_action(BlockJob *job, BlockdevOnError on_err, int is_read, int error) { BlockErrorAction action; @@ -443,9 +442,6 @@ BlockErrorAction block_job_error_action(BlockJob *job, BlockDriverState *bs, job->user_paused = true; block_job_pause(job); block_job_iostatus_set_err(job, error); - if (bs->blk && bs != job->bs) { - blk_iostatus_set_err(bs->blk, error); - } } return action; } diff --git a/include/block/block.h b/include/block/block.h index b210832778..a8c15e36e7 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -17,6 +17,7 @@ typedef struct BlockJob BlockJob; typedef struct BdrvChild BdrvChild; typedef struct BdrvChildRole BdrvChildRole; typedef struct BlockJobTxn BlockJobTxn; +typedef struct BdrvNextIterator BdrvNextIterator; typedef struct BlockDriverInfo { /* in bytes, 0 if irrelevant */ @@ -187,10 +188,6 @@ void bdrv_stats_print(Monitor *mon, const QObject *data); void bdrv_info_stats(Monitor *mon, QObject **ret_data); /* disk I/O throttling */ -void bdrv_io_limits_enable(BlockDriverState *bs, const char *group); -void bdrv_io_limits_disable(BlockDriverState *bs); -void bdrv_io_limits_update_group(BlockDriverState *bs, const char *group); - void bdrv_init(void); void bdrv_init_with_whitelist(void); bool bdrv_uses_whitelist(void); @@ -333,7 +330,7 @@ void bdrv_aio_cancel(BlockAIOCB *acb); void bdrv_aio_cancel_async(BlockAIOCB *acb); typedef struct BlockRequest { - /* Fields to be filled by multiwrite caller */ + /* Fields to be filled by caller */ union { struct { int64_t sector; @@ -349,13 +346,10 @@ typedef struct BlockRequest { BlockCompletionFunc *cb; void *opaque; - /* Filled by multiwrite 
implementation */ + /* Filled by block layer */ int error; } BlockRequest; -int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, - int num_reqs); - /* sg packet commands */ int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf); BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs, @@ -408,7 +402,7 @@ BlockDriverState *bdrv_lookup_bs(const char *device, Error **errp); bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base); BlockDriverState *bdrv_next_node(BlockDriverState *bs); -BlockDriverState *bdrv_next(BlockDriverState *bs); +BdrvNextIterator *bdrv_next(BdrvNextIterator *it, BlockDriverState **bs); BlockDriverState *bdrv_next_monitor_owned(BlockDriverState *bs); int bdrv_is_encrypted(BlockDriverState *bs); int bdrv_key_required(BlockDriverState *bs); diff --git a/include/block/block_int.h b/include/block/block_int.h index a029c2003f..b6f4755725 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -26,7 +26,6 @@ #include "block/accounting.h" #include "block/block.h" -#include "block/throttle-groups.h" #include "qemu/option.h" #include "qemu/queue.h" #include "qemu/coroutine.h" @@ -365,6 +364,25 @@ typedef struct BdrvAioNotifier { struct BdrvChildRole { void (*inherit_options)(int *child_flags, QDict *child_options, int parent_flags, QDict *parent_options); + + void (*change_media)(BdrvChild *child, bool load); + void (*resize)(BdrvChild *child); + + /* Returns a name that is supposedly more useful for human users than the + * node name for identifying the node in question (in particular, a BB + * name), or NULL if the parent can't provide a better name. */ + const char* (*get_name)(BdrvChild *child); + + /* + * If this pair of functions is implemented, the parent doesn't issue new + * requests after returning from .drained_begin() until .drained_end() is + * called. + * + * Note that this can be nested. If drained_begin() was called twice, new + * I/O is allowed only after drained_end() was called twice, too. + */ + void (*drained_begin)(BdrvChild *child); + void (*drained_end)(BdrvChild *child); }; extern const BdrvChildRole child_file; @@ -374,6 +392,7 @@ struct BdrvChild { BlockDriverState *bs; char *name; const BdrvChildRole *role; + void *opaque; QLIST_ENTRY(BdrvChild) next; QLIST_ENTRY(BdrvChild) next_parent; }; @@ -399,8 +418,6 @@ struct BlockDriverState { BlockDriver *drv; /* NULL means no media */ void *opaque; - BlockBackend *blk; /* owning backend, if any */ - AioContext *aio_context; /* event loop used for fd handlers, timers, etc */ /* long-running tasks intended to always use the same AioContext as this * BDS may register themselves in this list to be notified of changes @@ -424,19 +441,6 @@ struct BlockDriverState { /* number of in-flight serialising requests */ unsigned int serialising_in_flight; - /* I/O throttling. - * throttle_state tells us if this BDS has I/O limits configured. - * io_limits_disabled tells us if they are currently being enforced */ - CoQueue throttled_reqs[2]; - unsigned int io_limits_disabled; - - /* The following fields are protected by the ThrottleGroup lock. - * See the ThrottleGroup documentation for details. 
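The drained_begin()/drained_end() pair added to BdrvChildRole above is easiest to see from a parent's point of view. A hypothetical parent implementation honouring the documented nesting rule (ExampleParent and its counter are not part of this patch):

    typedef struct ExampleParent {
        unsigned quiesce_counter;   /* > 0 means: do not issue new requests */
    } ExampleParent;

    static void example_drained_begin(BdrvChild *child)
    {
        ExampleParent *p = child->opaque;

        p->quiesce_counter++;       /* every begin needs a matching end */
    }

    static void example_drained_end(BdrvChild *child)
    {
        ExampleParent *p = child->opaque;

        assert(p->quiesce_counter > 0);
        if (--p->quiesce_counter == 0) {
            /* last end: the parent may submit new requests again */
        }
    }
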
*/ - ThrottleState *throttle_state; - ThrottleTimers throttle_timers; - unsigned pending_reqs[2]; - QLIST_ENTRY(BlockDriverState) round_robin; - /* Offset after the highest byte written to */ uint64_t wr_highest_offset; @@ -502,9 +506,6 @@ struct BlockBackendRootState { int open_flags; bool read_only; BlockdevDetectZeroesOptions detect_zeroes; - - char *throttle_group; - ThrottleState *throttle_state; }; static inline BlockDriverState *backing_bs(BlockDriverState *bs) @@ -539,9 +540,6 @@ int get_tmp_filename(char *filename, int size); BlockDriver *bdrv_probe_all(const uint8_t *buf, int buf_size, const char *filename); -void bdrv_set_io_limits(BlockDriverState *bs, - ThrottleConfig *cfg); - /** * bdrv_add_before_write_notifier: @@ -724,16 +722,13 @@ BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs, const BdrvChildRole *child_role); void bdrv_root_unref_child(BdrvChild *child); -void bdrv_no_throttling_begin(BlockDriverState *bs); -void bdrv_no_throttling_end(BlockDriverState *bs); - +const char *bdrv_get_parent_name(const BlockDriverState *bs); void blk_dev_change_media_cb(BlockBackend *blk, bool load); bool blk_dev_has_removable_media(BlockBackend *blk); bool blk_dev_has_tray(BlockBackend *blk); void blk_dev_eject_request(BlockBackend *blk, bool force); bool blk_dev_is_tray_open(BlockBackend *blk); bool blk_dev_is_medium_locked(BlockBackend *blk); -void blk_dev_resize_cb(BlockBackend *blk); void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors); bool bdrv_requests_pending(BlockDriverState *bs); diff --git a/include/block/blockjob.h b/include/block/blockjob.h index 8bedc4936c..073a433cf8 100644 --- a/include/block/blockjob.h +++ b/include/block/blockjob.h @@ -383,7 +383,6 @@ void block_job_iostatus_reset(BlockJob *job); /** * block_job_error_action: * @job: The job to signal an error for. - * @bs: The block device on which to set an I/O error. * @on_err: The error action setting. * @is_read: Whether the operation was a read. * @error: The error that was reported. @@ -391,8 +390,7 @@ void block_job_iostatus_reset(BlockJob *job); * Report an I/O error for a block job and possibly stop the VM. Return the * action that was selected based on @on_err and @error. 
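With the BlockDriverState argument gone from block_job_error_action(), a job's error path shrinks to something like the following hedged sketch (the helper name and the surrounding job code are hypothetical; 'ret' is the failed request's negative errno):

    static int example_handle_job_io_error(BlockJob *job, BlockdevOnError on_err,
                                           bool is_read, int ret)
    {
        BlockErrorAction action = block_job_error_action(job, on_err, is_read, -ret);

        if (action == BLOCK_ERROR_ACTION_REPORT) {
            return ret;     /* let the job fail with the original error */
        }
        /* IGNORE: carry on; STOP: block_job_error_action() paused the job */
        return 0;
    }
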
*/ -BlockErrorAction block_job_error_action(BlockJob *job, BlockDriverState *bs, - BlockdevOnError on_err, +BlockErrorAction block_job_error_action(BlockJob *job, BlockdevOnError on_err, int is_read, int error); typedef void BlockJobDeferToMainLoopFn(BlockJob *job, void *opaque); diff --git a/include/block/throttle-groups.h b/include/block/throttle-groups.h index 395f72d444..d983d34074 100644 --- a/include/block/throttle-groups.h +++ b/include/block/throttle-groups.h @@ -28,19 +28,19 @@ #include "qemu/throttle.h" #include "block/block_int.h" -const char *throttle_group_get_name(BlockDriverState *bs); +const char *throttle_group_get_name(BlockBackend *blk); ThrottleState *throttle_group_incref(const char *name); void throttle_group_unref(ThrottleState *ts); -void throttle_group_config(BlockDriverState *bs, ThrottleConfig *cfg); -void throttle_group_get_config(BlockDriverState *bs, ThrottleConfig *cfg); +void throttle_group_config(BlockBackend *blk, ThrottleConfig *cfg); +void throttle_group_get_config(BlockBackend *blk, ThrottleConfig *cfg); -void throttle_group_register_bs(BlockDriverState *bs, const char *groupname); -void throttle_group_unregister_bs(BlockDriverState *bs); -void throttle_group_restart_bs(BlockDriverState *bs); +void throttle_group_register_blk(BlockBackend *blk, const char *groupname); +void throttle_group_unregister_blk(BlockBackend *blk); +void throttle_group_restart_blk(BlockBackend *blk); -void coroutine_fn throttle_group_co_io_limits_intercept(BlockDriverState *bs, +void coroutine_fn throttle_group_co_io_limits_intercept(BlockBackend *blk, unsigned int bytes, bool is_write); diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h index 26736ed84e..68d92b556e 100644 --- a/include/sysemu/block-backend.h +++ b/include/sysemu/block-backend.h @@ -14,6 +14,7 @@ #define BLOCK_BACKEND_H #include "qemu/iov.h" +#include "block/throttle-groups.h" /* * TODO Have to include block/block.h for a bunch of block layer @@ -59,6 +60,24 @@ typedef struct BlockDevOps { void (*resize_cb)(void *opaque); } BlockDevOps; +/* This struct is embedded in (the private) BlockBackend struct and contains + * fields that must be public. This is in particular for QLIST_ENTRY() and + * friends so that BlockBackends can be kept in lists outside block-backend.c */ +typedef struct BlockBackendPublic { + /* I/O throttling. + * throttle_state tells us if this BlockBackend has I/O limits configured. + * io_limits_disabled tells us if they are currently being enforced */ + CoQueue throttled_reqs[2]; + unsigned int io_limits_disabled; + + /* The following fields are protected by the ThrottleGroup lock. + * See the ThrottleGroup documentation for details. 
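BlockBackendPublic only works because it is embedded in the private BlockBackend struct, so the two accessors added below can convert in both directions. A sketch of that pattern (the private struct layout shown here is illustrative; only the embedded 'public' member matters):

    struct BlockBackend {
        char *name;
        /* ... further private fields ... */
        BlockBackendPublic public;
    };

    BlockBackendPublic *blk_get_public(BlockBackend *blk)
    {
        return &blk->public;
    }

    BlockBackend *blk_by_public(BlockBackendPublic *public)
    {
        return container_of(public, BlockBackend, public);
    }
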
*/ + ThrottleState *throttle_state; + ThrottleTimers throttle_timers; + unsigned pending_reqs[2]; + QLIST_ENTRY(BlockBackendPublic) round_robin; +} BlockBackendPublic; + BlockBackend *blk_new(Error **errp); BlockBackend *blk_new_with_bs(Error **errp); BlockBackend *blk_new_open(const char *filename, const char *reference, @@ -70,13 +89,16 @@ void blk_remove_all_bs(void); const char *blk_name(BlockBackend *blk); BlockBackend *blk_by_name(const char *name); BlockBackend *blk_next(BlockBackend *blk); -BlockDriverState *blk_next_root_bs(BlockDriverState *bs); bool monitor_add_blk(BlockBackend *blk, const char *name, Error **errp); void monitor_remove_blk(BlockBackend *blk); +BlockBackendPublic *blk_get_public(BlockBackend *blk); +BlockBackend *blk_by_public(BlockBackendPublic *public); + BlockDriverState *blk_bs(BlockBackend *blk); void blk_remove_bs(BlockBackend *blk); void blk_insert_bs(BlockBackend *blk, BlockDriverState *bs); +bool bdrv_has_blk(BlockDriverState *bs); void blk_set_allow_write_beyond_eof(BlockBackend *blk, bool allow); void blk_iostatus_enable(BlockBackend *blk); @@ -116,7 +138,6 @@ BlockAIOCB *blk_aio_discard(BlockBackend *blk, BlockCompletionFunc *cb, void *opaque); void blk_aio_cancel(BlockAIOCB *acb); void blk_aio_cancel_async(BlockAIOCB *acb); -int blk_aio_multiwrite(BlockBackend *blk, BlockRequest *reqs, int num_reqs); int blk_ioctl(BlockBackend *blk, unsigned long int req, void *buf); BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf, BlockCompletionFunc *cb, void *opaque); @@ -190,4 +211,9 @@ BlockAIOCB *blk_abort_aio_request(BlockBackend *blk, BlockCompletionFunc *cb, void *opaque, int ret); +void blk_set_io_limits(BlockBackend *blk, ThrottleConfig *cfg); +void blk_io_limits_disable(BlockBackend *blk); +void blk_io_limits_enable(BlockBackend *blk, const char *group); +void blk_io_limits_update_group(BlockBackend *blk, const char *group); + #endif diff --git a/migration/block.c b/migration/block.c index 1743317288..a7a76a0fb9 100644 --- a/migration/block.c +++ b/migration/block.c @@ -383,6 +383,7 @@ static void init_blk_migration(QEMUFile *f) BlockDriverState *bs; BlkMigDevState *bmds; int64_t sectors; + BdrvNextIterator *it = NULL; block_mig_state.submitted = 0; block_mig_state.read_done = 0; @@ -392,7 +393,8 @@ static void init_blk_migration(QEMUFile *f) block_mig_state.bulk_completed = 0; block_mig_state.zero_blocks = migrate_zero_blocks(); - for (bs = bdrv_next(NULL); bs; bs = bdrv_next(bs)) { + + while ((it = bdrv_next(it, &bs))) { if (bdrv_is_read_only(bs)) { continue; } @@ -3431,11 +3431,13 @@ void host_net_remove_completion(ReadLineState *rs, int nb_args, const char *str) static void vm_completion(ReadLineState *rs, const char *str) { size_t len; - BlockDriverState *bs = NULL; + BlockDriverState *bs; + BdrvNextIterator *it = NULL; len = strlen(str); readline_set_completion_index(rs, len); - while ((bs = bdrv_next(bs))) { + + while ((it = bdrv_next(it, &bs))) { SnapshotInfoList *snapshots, *snapshot; AioContext *ctx = bdrv_get_aio_context(bs); bool ok = false; diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c index 4a00bc604d..e766791ffc 100644 --- a/qemu-io-cmds.c +++ b/qemu-io-cmds.c @@ -574,49 +574,6 @@ static int do_aio_writev(BlockBackend *blk, QEMUIOVector *qiov, return async_ret < 0 ? 
async_ret : 1; } -struct multiwrite_async_ret { - int num_done; - int error; -}; - -static void multiwrite_cb(void *opaque, int ret) -{ - struct multiwrite_async_ret *async_ret = opaque; - - async_ret->num_done++; - if (ret < 0) { - async_ret->error = ret; - } -} - -static int do_aio_multiwrite(BlockBackend *blk, BlockRequest* reqs, - int num_reqs, int *total) -{ - int i, ret; - struct multiwrite_async_ret async_ret = { - .num_done = 0, - .error = 0, - }; - - *total = 0; - for (i = 0; i < num_reqs; i++) { - reqs[i].cb = multiwrite_cb; - reqs[i].opaque = &async_ret; - *total += reqs[i].qiov->size; - } - - ret = blk_aio_multiwrite(blk, reqs, num_reqs); - if (ret < 0) { - return ret; - } - - while (async_ret.num_done < num_reqs) { - main_loop_wait(false); - } - - return async_ret.error < 0 ? async_ret.error : 1; -} - static void read_help(void) { printf( @@ -1150,7 +1107,7 @@ static int writev_f(BlockBackend *blk, int argc, char **argv) int pattern = 0xcd; QEMUIOVector qiov; - while ((c = getopt(argc, argv, "CqP:")) != -1) { + while ((c = getopt(argc, argv, "CfqP:")) != -1) { switch (c) { case 'C': Cflag = true; @@ -1211,165 +1168,6 @@ out: return 0; } -static void multiwrite_help(void) -{ - printf( -"\n" -" writes a range of bytes from the given offset source from multiple buffers,\n" -" in a batch of requests that may be merged by qemu\n" -"\n" -" Example:\n" -" 'multiwrite 512 1k 1k ; 4k 1k'\n" -" writes 2 kB at 512 bytes and 1 kB at 4 kB into the open file\n" -"\n" -" Writes into a segment of the currently open file, using a buffer\n" -" filled with a set pattern (0xcdcdcdcd). The pattern byte is increased\n" -" by one for each request contained in the multiwrite command.\n" -" -P, -- use different pattern to fill file\n" -" -C, -- report statistics in a machine parsable format\n" -" -q, -- quiet mode, do not show I/O statistics\n" -"\n"); -} - -static int multiwrite_f(BlockBackend *blk, int argc, char **argv); - -static const cmdinfo_t multiwrite_cmd = { - .name = "multiwrite", - .cfunc = multiwrite_f, - .argmin = 2, - .argmax = -1, - .args = "[-Cq] [-P pattern ] off len [len..] [; off len [len..]..]", - .oneline = "issues multiple write requests at once", - .help = multiwrite_help, -}; - -static int multiwrite_f(BlockBackend *blk, int argc, char **argv) -{ - struct timeval t1, t2; - bool Cflag = false, qflag = false; - int c, cnt; - char **buf; - int64_t offset, first_offset = 0; - /* Some compilers get confused and warn if this is not initialized. 
*/ - int total = 0; - int nr_iov; - int nr_reqs; - int pattern = 0xcd; - QEMUIOVector *qiovs; - int i; - BlockRequest *reqs; - - while ((c = getopt(argc, argv, "CqP:")) != -1) { - switch (c) { - case 'C': - Cflag = true; - break; - case 'q': - qflag = true; - break; - case 'P': - pattern = parse_pattern(optarg); - if (pattern < 0) { - return 0; - } - break; - default: - return qemuio_command_usage(&writev_cmd); - } - } - - if (optind > argc - 2) { - return qemuio_command_usage(&writev_cmd); - } - - nr_reqs = 1; - for (i = optind; i < argc; i++) { - if (!strcmp(argv[i], ";")) { - nr_reqs++; - } - } - - reqs = g_new0(BlockRequest, nr_reqs); - buf = g_new0(char *, nr_reqs); - qiovs = g_new(QEMUIOVector, nr_reqs); - - for (i = 0; i < nr_reqs && optind < argc; i++) { - int j; - - /* Read the offset of the request */ - offset = cvtnum(argv[optind]); - if (offset < 0) { - print_cvtnum_err(offset, argv[optind]); - goto out; - } - optind++; - - if (offset & 0x1ff) { - printf("offset %lld is not sector aligned\n", - (long long)offset); - goto out; - } - - if (i == 0) { - first_offset = offset; - } - - /* Read lengths for qiov entries */ - for (j = optind; j < argc; j++) { - if (!strcmp(argv[j], ";")) { - break; - } - } - - nr_iov = j - optind; - - /* Build request */ - buf[i] = create_iovec(blk, &qiovs[i], &argv[optind], nr_iov, pattern); - if (buf[i] == NULL) { - goto out; - } - - reqs[i].qiov = &qiovs[i]; - reqs[i].sector = offset >> 9; - reqs[i].nb_sectors = reqs[i].qiov->size >> 9; - - optind = j + 1; - - pattern++; - } - - /* If there were empty requests at the end, ignore them */ - nr_reqs = i; - - gettimeofday(&t1, NULL); - cnt = do_aio_multiwrite(blk, reqs, nr_reqs, &total); - gettimeofday(&t2, NULL); - - if (cnt < 0) { - printf("aio_multiwrite failed: %s\n", strerror(-cnt)); - goto out; - } - - if (qflag) { - goto out; - } - - /* Finally, report back -- -C gives a parsable format */ - t2 = tsub(t2, t1); - print_report("wrote", &t2, first_offset, total, total, cnt, Cflag); -out: - for (i = 0; i < nr_reqs; i++) { - qemu_io_free(buf[i]); - if (reqs[i].qiov != NULL) { - qemu_iovec_destroy(&qiovs[i]); - } - } - g_free(buf); - g_free(reqs); - g_free(qiovs); - return 0; -} - struct aio_ctx { BlockBackend *blk; QEMUIOVector qiov; @@ -1476,6 +1274,7 @@ static void aio_read_help(void) " used to ensure all outstanding aio requests have been completed.\n" " -C, -- report statistics in a machine parsable format\n" " -P, -- use a pattern to verify read data\n" +" -i, -- treat request as invalid, for exercising stats\n" " -v, -- dump buffer to standard output\n" " -q, -- quiet mode, do not show I/O statistics\n" "\n"); @@ -1488,7 +1287,7 @@ static const cmdinfo_t aio_read_cmd = { .cfunc = aio_read_f, .argmin = 2, .argmax = -1, - .args = "[-Cqv] [-P pattern] off len [len..]", + .args = "[-Ciqv] [-P pattern] off len [len..]", .oneline = "asynchronously reads a number of bytes", .help = aio_read_help, }; @@ -1499,7 +1298,7 @@ static int aio_read_f(BlockBackend *blk, int argc, char **argv) struct aio_ctx *ctx = g_new0(struct aio_ctx, 1); ctx->blk = blk; - while ((c = getopt(argc, argv, "CP:qv")) != -1) { + while ((c = getopt(argc, argv, "CP:iqv")) != -1) { switch (c) { case 'C': ctx->Cflag = true; @@ -1512,6 +1311,11 @@ static int aio_read_f(BlockBackend *blk, int argc, char **argv) return 0; } break; + case 'i': + printf("injecting invalid read request\n"); + block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_READ); + g_free(ctx); + return 0; case 'q': ctx->qflag = true; break; @@ -1569,6 +1373,7 @@ static 
void aio_write_help(void) " -P, -- use different pattern to fill file\n" " -C, -- report statistics in a machine parsable format\n" " -f, -- use Force Unit Access semantics\n" +" -i, -- treat request as invalid, for exercising stats\n" " -q, -- quiet mode, do not show I/O statistics\n" " -u, -- with -z, allow unmapping\n" " -z, -- write zeroes using blk_aio_write_zeroes\n" @@ -1582,7 +1387,7 @@ static const cmdinfo_t aio_write_cmd = { .cfunc = aio_write_f, .argmin = 2, .argmax = -1, - .args = "[-Cfquz] [-P pattern] off len [len..]", + .args = "[-Cfiquz] [-P pattern] off len [len..]", .oneline = "asynchronously writes a number of bytes", .help = aio_write_help, }; @@ -1595,7 +1400,7 @@ static int aio_write_f(BlockBackend *blk, int argc, char **argv) int flags = 0; ctx->blk = blk; - while ((c = getopt(argc, argv, "CfqP:z")) != -1) { + while ((c = getopt(argc, argv, "CfiqP:uz")) != -1) { switch (c) { case 'C': ctx->Cflag = true; @@ -1616,6 +1421,11 @@ static int aio_write_f(BlockBackend *blk, int argc, char **argv) return 0; } break; + case 'i': + printf("injecting invalid write request\n"); + block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_WRITE); + g_free(ctx); + return 0; case 'z': ctx->zflag = true; break; @@ -1638,6 +1448,7 @@ static int aio_write_f(BlockBackend *blk, int argc, char **argv) if ((flags & BDRV_REQ_MAY_UNMAP) && !ctx->zflag) { printf("-u requires -z to be specified\n"); + g_free(ctx); return 0; } @@ -2436,7 +2247,6 @@ static void __attribute((constructor)) init_qemuio_commands(void) qemuio_add_command(&readv_cmd); qemuio_add_command(&write_cmd); qemuio_add_command(&writev_cmd); - qemuio_add_command(&multiwrite_cmd); qemuio_add_command(&aio_read_cmd); qemuio_add_command(&aio_write_cmd); qemuio_add_command(&aio_flush_cmd); @@ -181,6 +181,7 @@ void qmp_cont(Error **errp) Error *local_err = NULL; BlockBackend *blk; BlockDriverState *bs; + BdrvNextIterator *it; /* if there is a dump in background, we should wait until the dump * finished */ @@ -199,7 +200,9 @@ void qmp_cont(Error **errp) for (blk = blk_next(NULL); blk; blk = blk_next(blk)) { blk_iostatus_reset(blk); } - for (bs = bdrv_next(NULL); bs; bs = bdrv_next(bs)) { + + it = NULL; + while ((it = bdrv_next(it, &bs))) { bdrv_add_key(bs, NULL, &local_err); if (local_err) { error_propagate(errp, local_err); diff --git a/tests/qemu-iotests/096 b/tests/qemu-iotests/096 index e34204b8ff..aeeb3753cf 100644 --- a/tests/qemu-iotests/096 +++ b/tests/qemu-iotests/096 @@ -45,8 +45,9 @@ class TestLiveSnapshot(iotests.QMPTestCase): os.remove(self.target_img) def checkConfig(self, active_layer): - result = self.vm.qmp('query-named-block-nodes') + result = self.vm.qmp('query-block') for r in result['return']: + r = r['inserted'] if r['node-name'] == active_layer: self.assertEqual(r['group'], self.group) self.assertEqual(r['iops'], self.iops) diff --git a/tests/qemu-iotests/100 b/tests/qemu-iotests/100 deleted file mode 100755 index e66db07982..0000000000 --- a/tests/qemu-iotests/100 +++ /dev/null @@ -1,152 +0,0 @@ -#!/bin/bash -# -# Test simple read/write using plain bdrv_read/bdrv_write -# -# Copyright (C) 2014 Red Hat, Inc. -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. 
-# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see <http://www.gnu.org/licenses/>. -# - -# creator -owner=stefanha@redhat.com - -seq=`basename $0` -echo "QA output created by $seq" - -here=`pwd` -status=1 # failure is the default! - -_cleanup() -{ - _cleanup_test_img -} -trap "_cleanup; exit \$status" 0 1 2 3 15 - -# get standard environment, filters and checks -. ./common.rc -. ./common.filter - -_supported_fmt generic -_supported_proto generic -_supported_os Linux - - -size=128M - -echo -echo "== Single request ==" -_make_test_img $size -$QEMU_IO -c "write -z 0 8k" "$TEST_IMG" | _filter_qemu_io -$QEMU_IO -c "multiwrite 0 4k" "$TEST_IMG" | _filter_qemu_io - -echo -echo "== verify pattern ==" -$QEMU_IO -c "read -P 0xcd 0 4k" "$TEST_IMG" | _filter_qemu_io -$QEMU_IO -c "read -P 0 4k 4k" "$TEST_IMG" | _filter_qemu_io - -_cleanup_test_img - -echo -echo "== Sequential requests ==" -_make_test_img $size -$QEMU_IO -c "write -z 0 12k" "$TEST_IMG" | _filter_qemu_io -$QEMU_IO -c "multiwrite 0 4k ; 4k 4k" "$TEST_IMG" | _filter_qemu_io - -echo -echo "== verify pattern ==" -$QEMU_IO -c "read -P 0xcd 0 4k" "$TEST_IMG" | _filter_qemu_io -$QEMU_IO -c "read -P 0xce 4k 4k" "$TEST_IMG" | _filter_qemu_io -$QEMU_IO -c "read -P 0 8k 4k" "$TEST_IMG" | _filter_qemu_io - -_cleanup_test_img - -echo -echo "== Superset overlapping requests ==" -_make_test_img $size -$QEMU_IO -c "write -z 0 8k" "$TEST_IMG" | _filter_qemu_io -$QEMU_IO -c "multiwrite 0 4k ; 1k 2k" "$TEST_IMG" | _filter_qemu_io - -echo -echo "== verify pattern ==" -# Order of overlapping in-flight requests is not guaranteed so we cannot verify -# [1k, 3k) since it could have either pattern 0xcd or 0xce. -$QEMU_IO -c "read -P 0xcd 0 1k" "$TEST_IMG" | _filter_qemu_io -$QEMU_IO -c "read -P 0xcd 3k 1k" "$TEST_IMG" | _filter_qemu_io -$QEMU_IO -c "read -P 0 4k 4k" "$TEST_IMG" | _filter_qemu_io - -_cleanup_test_img - -echo -echo "== Subset overlapping requests ==" -_make_test_img $size -$QEMU_IO -c "write -z 0 8k" "$TEST_IMG" | _filter_qemu_io -$QEMU_IO -c "multiwrite 1k 2k ; 0k 4k" "$TEST_IMG" | _filter_qemu_io - -echo -echo "== verify pattern ==" -# Order of overlapping in-flight requests is not guaranteed so we cannot verify -# [1k, 3k) since it could have either pattern 0xcd or 0xce. -$QEMU_IO -c "read -P 0xce 0 1k" "$TEST_IMG" | _filter_qemu_io -$QEMU_IO -c "read -P 0xce 3k 1k" "$TEST_IMG" | _filter_qemu_io -$QEMU_IO -c "read -P 0 4k 4k" "$TEST_IMG" | _filter_qemu_io - -_cleanup_test_img - -echo -echo "== Head overlapping requests ==" -_make_test_img $size -$QEMU_IO -c "write -z 0 8k" "$TEST_IMG" | _filter_qemu_io -$QEMU_IO -c "multiwrite 0k 2k ; 0k 4k" "$TEST_IMG" | _filter_qemu_io - -echo -echo "== verify pattern ==" -# Order of overlapping in-flight requests is not guaranteed so we cannot verify -# [0k, 2k) since it could have either pattern 0xcd or 0xce. 
-$QEMU_IO -c "read -P 0xce 2k 2k" "$TEST_IMG" | _filter_qemu_io -$QEMU_IO -c "read -P 0 4k 4k" "$TEST_IMG" | _filter_qemu_io - -_cleanup_test_img - -echo -echo "== Tail overlapping requests ==" -_make_test_img $size -$QEMU_IO -c "write -z 0 8k" "$TEST_IMG" | _filter_qemu_io -$QEMU_IO -c "multiwrite 2k 2k ; 0k 4k" "$TEST_IMG" | _filter_qemu_io - -echo -echo "== verify pattern ==" -# Order of overlapping in-flight requests is not guaranteed so we cannot verify -# [2k, 4k) since it could have either pattern 0xcd or 0xce. -$QEMU_IO -c "read -P 0xce 0k 2k" "$TEST_IMG" | _filter_qemu_io -$QEMU_IO -c "read -P 0 4k 4k" "$TEST_IMG" | _filter_qemu_io - -_cleanup_test_img - -echo -echo "== Disjoint requests ==" -_make_test_img $size -$QEMU_IO -c "write -z 0 72k" "$TEST_IMG" | _filter_qemu_io -$QEMU_IO -c "multiwrite 0 4k ; 64k 4k" "$TEST_IMG" | _filter_qemu_io - -echo -echo "== verify pattern ==" -$QEMU_IO -c "read -P 0xcd 0 4k" "$TEST_IMG" | _filter_qemu_io -$QEMU_IO -c "read -P 0 4k 60k" "$TEST_IMG" | _filter_qemu_io -$QEMU_IO -c "read -P 0xce 64k 4k" "$TEST_IMG" | _filter_qemu_io -$QEMU_IO -c "read -P 0 68k 4k" "$TEST_IMG" | _filter_qemu_io - -# success, all done -echo "*** done" -rm -f $seq.full -status=0 diff --git a/tests/qemu-iotests/100.out b/tests/qemu-iotests/100.out deleted file mode 100644 index a44cae40db..0000000000 --- a/tests/qemu-iotests/100.out +++ /dev/null @@ -1,103 +0,0 @@ -QA output created by 100 - -== Single request == -Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 -wrote 8192/8192 bytes at offset 0 -8 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) -wrote 4096/4096 bytes at offset 0 -4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) - -== verify pattern == -read 4096/4096 bytes at offset 0 -4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) -read 4096/4096 bytes at offset 4096 -4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) - -== Sequential requests == -Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 -wrote 12288/12288 bytes at offset 0 -12 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) -wrote 8192/8192 bytes at offset 0 -8 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) - -== verify pattern == -read 4096/4096 bytes at offset 0 -4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) -read 4096/4096 bytes at offset 4096 -4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) -read 4096/4096 bytes at offset 8192 -4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) - -== Superset overlapping requests == -Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 -wrote 8192/8192 bytes at offset 0 -8 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) -wrote 6144/6144 bytes at offset 0 -6 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) - -== verify pattern == -read 1024/1024 bytes at offset 0 -1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) -read 1024/1024 bytes at offset 3072 -1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) -read 4096/4096 bytes at offset 4096 -4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) - -== Subset overlapping requests == -Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 -wrote 8192/8192 bytes at offset 0 -8 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) -wrote 6144/6144 bytes at offset 1024 -6 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) - -== verify pattern == -read 1024/1024 bytes at offset 0 -1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) -read 1024/1024 bytes at offset 3072 -1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) -read 
4096/4096 bytes at offset 4096 -4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) - -== Head overlapping requests == -Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 -wrote 8192/8192 bytes at offset 0 -8 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) -wrote 6144/6144 bytes at offset 0 -6 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) - -== verify pattern == -read 2048/2048 bytes at offset 2048 -2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) -read 4096/4096 bytes at offset 4096 -4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) - -== Tail overlapping requests == -Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 -wrote 8192/8192 bytes at offset 0 -8 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) -wrote 6144/6144 bytes at offset 2048 -6 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) - -== verify pattern == -read 2048/2048 bytes at offset 0 -2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) -read 4096/4096 bytes at offset 4096 -4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) - -== Disjoint requests == -Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 -wrote 73728/73728 bytes at offset 0 -72 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) -wrote 8192/8192 bytes at offset 0 -8 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) - -== verify pattern == -read 4096/4096 bytes at offset 0 -4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) -read 61440/61440 bytes at offset 4096 -60 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) -read 4096/4096 bytes at offset 65536 -4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) -read 4096/4096 bytes at offset 69632 -4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) -*** done diff --git a/tests/qemu-iotests/109 b/tests/qemu-iotests/109 index f980b0c9e5..adf98892f0 100755 --- a/tests/qemu-iotests/109 +++ b/tests/qemu-iotests/109 @@ -104,8 +104,6 @@ for sample_img in empty.bochs iotest-dirtylog-10G-4M.vhdx parallels-v1 \ $QEMU_IO -c 'read -P 0 0 64k' "$TEST_IMG" | _filter_qemu_io run_qemu "$TEST_IMG" "$TEST_IMG.src" "'format': 'raw'," "BLOCK_JOB_READY" - # qemu-img compare can't handle unaligned file sizes - $QEMU_IMG resize -f raw "$TEST_IMG.src" +0 $QEMU_IMG compare -f raw -F raw "$TEST_IMG" "$TEST_IMG.src" done diff --git a/tests/qemu-iotests/109.out b/tests/qemu-iotests/109.out index 38bc073a37..7c797ed31c 100644 --- a/tests/qemu-iotests/109.out +++ b/tests/qemu-iotests/109.out @@ -143,7 +143,6 @@ read 65536/65536 bytes at offset 0 {"return": {}} {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_READY", "data": {"device": "src", "len": 2560, "offset": 2560, "speed": 0, "type": "mirror"}} {"return": [{"io-status": "ok", "device": "src", "busy": false, "len": 2560, "offset": 2560, "paused": false, "speed": 0, "ready": true, "type": "mirror"}]} -Image resized. Warning: Image size mismatch! Images are identical. @@ -164,7 +163,6 @@ read 65536/65536 bytes at offset 0 {"return": {}} {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_READY", "data": {"device": "src", "len": 31457280, "offset": 31457280, "speed": 0, "type": "mirror"}} {"return": [{"io-status": "ok", "device": "src", "busy": false, "len": 31457280, "offset": 31457280, "paused": false, "speed": 0, "ready": true, "type": "mirror"}]} -Image resized. Warning: Image size mismatch! Images are identical. 
@@ -185,7 +183,6 @@ read 65536/65536 bytes at offset 0 {"return": {}} {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_READY", "data": {"device": "src", "len": 327680, "offset": 327680, "speed": 0, "type": "mirror"}} {"return": [{"io-status": "ok", "device": "src", "busy": false, "len": 327680, "offset": 327680, "paused": false, "speed": 0, "ready": true, "type": "mirror"}]} -Image resized. Warning: Image size mismatch! Images are identical. @@ -206,7 +203,6 @@ read 65536/65536 bytes at offset 0 {"return": {}} {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_READY", "data": {"device": "src", "len": 2048, "offset": 2048, "speed": 0, "type": "mirror"}} {"return": [{"io-status": "ok", "device": "src", "busy": false, "len": 2048, "offset": 2048, "paused": false, "speed": 0, "ready": true, "type": "mirror"}]} -Image resized. Warning: Image size mismatch! Images are identical. diff --git a/tests/qemu-iotests/136 b/tests/qemu-iotests/136 index e8c6937fc9..635b977552 100644 --- a/tests/qemu-iotests/136 +++ b/tests/qemu-iotests/136 @@ -226,18 +226,11 @@ sector = "%d" highest_offset = wr_ops * wr_size - # Two types of invalid operations: unaligned length and unaligned offset - for i in range(invalid_rd_ops / 2): - ops.append("aio_read 0 511") + for i in range(invalid_rd_ops): + ops.append("aio_read -i 0 512") - for i in range(invalid_rd_ops / 2, invalid_rd_ops): - ops.append("aio_read 13 512") - - for i in range(invalid_wr_ops / 2): - ops.append("aio_write 0 511") - - for i in range(invalid_wr_ops / 2, invalid_wr_ops): - ops.append("aio_write 13 512") + for i in range(invalid_wr_ops): + ops.append("aio_write -i 0 512") for i in range(failed_rd_ops): ops.append("aio_read %d 512" % bad_offset) @@ -248,14 +241,6 @@ sector = "%d" if failed_wr_ops > 0: highest_offset = max(highest_offset, bad_offset + 512) - for i in range(wr_merged): - first = i * wr_size * 2 - second = first + wr_size - ops.append("multiwrite %d %d ; %d %d" % - (first, wr_size, second, wr_size)) - - highest_offset = max(highest_offset, wr_merged * wr_size * 2) - # Now perform all operations for op in ops: self.vm.hmp_qemu_io("drive0", op) @@ -309,19 +294,15 @@ sector = "%d" def test_flush(self): self.do_test_stats(flush_ops = 8) - def test_merged(self): - for i in range(5): - self.do_test_stats(wr_merged = i * 3) - def test_all(self): # rd_size, rd_ops, wr_size, wr_ops, flush_ops # invalid_rd_ops, invalid_wr_ops, # failed_rd_ops, failed_wr_ops # wr_merged - test_values = [[512, 1, 512, 1, 1, 4, 7, 5, 2, 1], - [65536, 1, 2048, 12, 7, 7, 5, 2, 5, 5], - [32768, 9, 8192, 1, 4, 3, 2, 4, 6, 4], - [16384, 11, 3584, 16, 9, 8, 6, 7, 3, 4]] + test_values = [[512, 1, 512, 1, 1, 4, 7, 5, 2, 0], + [65536, 1, 2048, 12, 7, 7, 5, 2, 5, 0], + [32768, 9, 8192, 1, 4, 3, 2, 4, 6, 0], + [16384, 11, 3584, 16, 9, 8, 6, 7, 3, 0]] for i in test_values: self.do_test_stats(*i) diff --git a/tests/qemu-iotests/136.out b/tests/qemu-iotests/136.out index 0a5e9583a4..cfa5c0d0e6 100644 --- a/tests/qemu-iotests/136.out +++ b/tests/qemu-iotests/136.out @@ -1,5 +1,5 @@ -........................................ +................................... 
---------------------------------------------------------------------- -Ran 40 tests +Ran 35 tests OK diff --git a/tests/qemu-iotests/154 b/tests/qemu-iotests/154 new file mode 100755 index 0000000000..23f1b3ab16 --- /dev/null +++ b/tests/qemu-iotests/154 @@ -0,0 +1,265 @@ +#!/bin/bash +# +# qcow2 specific bdrv_write_zeroes tests with backing files (complements 034) +# +# Copyright (C) 2016 Red Hat, Inc. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +# + +# creator +owner=kwolf@redhat.com + +seq=`basename $0` +echo "QA output created by $seq" + +here=`pwd` +status=1 # failure is the default! + +_cleanup() +{ + _cleanup_test_img +} +trap "_cleanup; exit \$status" 0 1 2 3 15 + +# get standard environment, filters and checks +. ./common.rc +. ./common.filter + +_supported_fmt qcow2 +_supported_proto file +_supported_os Linux + +CLUSTER_SIZE=4k +size=128M + +echo +echo == backing file contains zeros == + +CLUSTER_SIZE=512 TEST_IMG="$TEST_IMG.base" _make_test_img $size +_make_test_img -b "$TEST_IMG.base" + +# Make sure that the whole cluster is allocated even for partial write_zeroes +# when the backing file contains zeros + +# X = non-zero data sector in backing file +# - = sector unallocated in whole backing chain +# 0 = sector touched by write_zeroes request + +# 1. Tail unaligned: 00 00 -- -- +# 2. Head unaligned: -- -- 00 00 +# 3. Both unaligned: -- 00 00 -- +# 4. Both, 2 clusters: -- -- -- 00 | 00 -- -- -- + +$QEMU_IO -c "write -z 0 2k" "$TEST_IMG" | _filter_qemu_io +$QEMU_IO -c "write -z 10k 2k" "$TEST_IMG" | _filter_qemu_io +$QEMU_IO -c "write -z 17k 2k" "$TEST_IMG" | _filter_qemu_io +$QEMU_IO -c "write -z 27k 2k" "$TEST_IMG" | _filter_qemu_io +$QEMU_IMG map --output=json "$TEST_IMG" | _filter_qemu_img_map + +echo +echo == backing file contains non-zero data before write_zeroes == + +CLUSTER_SIZE=512 TEST_IMG="$TEST_IMG.base" _make_test_img $size +_make_test_img -b "$TEST_IMG.base" + +# Single cluster; non-zero data at the cluster start +# ... | XX -- 00 -- | ... +$QEMU_IO -c "write -P 0x11 32k 1k" "$TEST_IMG.base" | _filter_qemu_io +$QEMU_IO -c "write -z 34k 1k" "$TEST_IMG" | _filter_qemu_io +$QEMU_IO -c "read -P 0x11 32k 1k" "$TEST_IMG" | _filter_qemu_io +$QEMU_IO -c "read -P 0 33k 3k" "$TEST_IMG" | _filter_qemu_io + +# Single cluster; non-zero data exists, but not at the cluster start +# ... | -- XX 00 -- | ... 
+$QEMU_IO -c "write -P 0x11 65k 1k" "$TEST_IMG.base" | _filter_qemu_io
+$QEMU_IO -c "write -z 66k 1k" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0x11 65k 1k" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0 64k 1k" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0 66k 2k" "$TEST_IMG" | _filter_qemu_io
+
+$QEMU_IMG map --output=json "$TEST_IMG" | _filter_qemu_img_map
+
+echo
+echo == backing file contains non-zero data after write_zeroes ==
+
+CLUSTER_SIZE=512 TEST_IMG="$TEST_IMG.base" _make_test_img $size
+_make_test_img -b "$TEST_IMG.base"
+
+# Single cluster; non-zero data directly after request
+# ... | -- 00 XX -- | ...
+$QEMU_IO -c "write -P 0x11 34k 1k" "$TEST_IMG.base" | _filter_qemu_io
+$QEMU_IO -c "write -z 33k 1k" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0 32k 2k" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0x11 34k 1k" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0 35k 1k" "$TEST_IMG" | _filter_qemu_io
+
+# Single cluster; non-zero data exists, but not directly after request
+# ... | -- 00 -- XX | ...
+$QEMU_IO -c "write -P 0x11 43k 1k" "$TEST_IMG.base" | _filter_qemu_io
+$QEMU_IO -c "write -z 41k 1k" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0x11 43k 1k" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0 40k 3k" "$TEST_IMG" | _filter_qemu_io
+
+$QEMU_IMG map --output=json "$TEST_IMG" | _filter_qemu_img_map
+
+echo
+echo == spanning two clusters, non-zero before request ==
+
+CLUSTER_SIZE=512 TEST_IMG="$TEST_IMG.base" _make_test_img $size
+_make_test_img -b "$TEST_IMG.base"
+
+# Two clusters; non-zero data before request:
+# 1. At cluster start: 32k: XX -- -- 00 | 00 -- -- --
+# 2. Between unallocated space: 48k: -- XX -- 00 | 00 -- -- --
+# 3. Directly before request: 64k: -- -- XX 00 | 00 -- -- --
+
+$QEMU_IO -c "write -P 0x11 32k 1k" "$TEST_IMG.base" | _filter_qemu_io
+$QEMU_IO -c "write -z 35k 2k" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0x11 32k 1k" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0 33k 7k" "$TEST_IMG" | _filter_qemu_io
+
+$QEMU_IO -c "write -P 0x11 49k 1k" "$TEST_IMG.base" | _filter_qemu_io
+$QEMU_IO -c "write -z 51k 2k" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0 48k 1k" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0x11 49k 1k" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0 50k 6k" "$TEST_IMG" | _filter_qemu_io
+
+$QEMU_IO -c "write -P 0x11 66k 1k" "$TEST_IMG.base" | _filter_qemu_io
+$QEMU_IO -c "write -z 67k 2k" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0 64k 2k" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0x11 66k 1k" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0 67k 5k" "$TEST_IMG" | _filter_qemu_io
+
+$QEMU_IMG map --output=json "$TEST_IMG" | _filter_qemu_img_map
+
+echo
+echo == spanning two clusters, non-zero after request ==
+
+CLUSTER_SIZE=512 TEST_IMG="$TEST_IMG.base" _make_test_img $size
+_make_test_img -b "$TEST_IMG.base"
+
+# Two clusters; non-zero data after request:
+# 1. Directly after request: 32k: -- -- -- 00 | 00 XX -- --
+# 2. Between unallocated space: 48k: -- -- -- 00 | 00 -- XX --
+# 3. At cluster end: 64k: -- -- -- 00 | 00 -- -- XX
+
+$QEMU_IO -c "write -P 0x11 37k 1k" "$TEST_IMG.base" | _filter_qemu_io
+$QEMU_IO -c "write -z 35k 2k" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0 32k 5k" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0x11 37k 1k" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0 38k 2k" "$TEST_IMG" | _filter_qemu_io
+
+$QEMU_IO -c "write -P 0x11 54k 1k" "$TEST_IMG.base" | _filter_qemu_io
+$QEMU_IO -c "write -z 51k 2k" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0 48k 6k" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0x11 54k 1k" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0 55k 1k" "$TEST_IMG" | _filter_qemu_io
+
+$QEMU_IO -c "write -P 0x11 71k 1k" "$TEST_IMG.base" | _filter_qemu_io
+$QEMU_IO -c "write -z 67k 2k" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0 64k 7k" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0x11 71k 1k" "$TEST_IMG" | _filter_qemu_io
+
+$QEMU_IMG map --output=json "$TEST_IMG" | _filter_qemu_img_map
+
+echo
+echo == spanning two clusters, partially overwriting backing file ==
+
+CLUSTER_SIZE=512 TEST_IMG="$TEST_IMG.base" _make_test_img $size
+_make_test_img -b "$TEST_IMG.base"
+
+# Backing file: -- -- XX XX | XX XX -- --
+# Active layer: -- -- XX 00 | 00 XX -- --
+
+$QEMU_IO -c "write -P 0x11 2k 4k" "$TEST_IMG.base" | _filter_qemu_io
+$QEMU_IO -c "write -z 3k 2k" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0 0k 2k" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0x11 2k 1k" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0 3k 2k" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0x11 5k 1k" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0 6k 2k" "$TEST_IMG" | _filter_qemu_io
+
+$QEMU_IMG map --output=json "$TEST_IMG" | _filter_qemu_img_map
+
+echo
+echo == spanning multiple clusters, non-zero in first cluster ==
+
+CLUSTER_SIZE=512 TEST_IMG="$TEST_IMG.base" _make_test_img $size
+_make_test_img -b "$TEST_IMG.base"
+
+# Backing file: 64k: XX XX -- -- | -- -- -- -- | -- -- -- --
+# Active layer: 64k: XX XX 00 00 | 00 00 00 00 | 00 -- -- --
+
+$QEMU_IO -c "write -P 0x11 64k 2k" "$TEST_IMG.base" | _filter_qemu_io
+$QEMU_IO -c "write -z 66k 7k" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0x11 64k 2k" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0 66k 10k" "$TEST_IMG" | _filter_qemu_io
+
+$QEMU_IMG map --output=json "$TEST_IMG" | _filter_qemu_img_map
+
+echo
+echo == spanning multiple clusters, non-zero in intermediate cluster ==
+
+CLUSTER_SIZE=512 TEST_IMG="$TEST_IMG.base" _make_test_img $size
+_make_test_img -b "$TEST_IMG.base"
+
+# Backing file: 64k: -- -- -- -- | -- XX XX -- | -- -- -- --
+# Active layer: 64k: -- -- 00 00 | 00 00 00 00 | 00 -- -- --
+
+$QEMU_IO -c "write -P 0x11 69k 2k" "$TEST_IMG.base" | _filter_qemu_io
+$QEMU_IO -c "write -z 66k 7k" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0 64k 12k" "$TEST_IMG" | _filter_qemu_io
+
+$QEMU_IMG map --output=json "$TEST_IMG" | _filter_qemu_img_map
+
+echo
+echo == spanning multiple clusters, non-zero in final cluster ==
+
+CLUSTER_SIZE=512 TEST_IMG="$TEST_IMG.base" _make_test_img $size
+_make_test_img -b "$TEST_IMG.base"
+
+# Backing file: 64k: -- -- -- -- | -- -- -- -- | -- -- XX XX
+# Active layer: 64k: -- -- 00 00 | 00 00 00 00 | 00 -- XX XX
+
+$QEMU_IO -c "write -P 0x11 74k 2k" "$TEST_IMG.base" | _filter_qemu_io
+$QEMU_IO -c "write -z 66k 7k" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0 64k 10k" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0x11 74k 2k" "$TEST_IMG" | _filter_qemu_io
+
+$QEMU_IMG map --output=json "$TEST_IMG" | _filter_qemu_img_map
+
+echo
+echo == spanning multiple clusters, partially overwriting backing file ==
+
+CLUSTER_SIZE=512 TEST_IMG="$TEST_IMG.base" _make_test_img $size
+_make_test_img -b "$TEST_IMG.base"
+
+# Backing file: 64k: -- XX XX XX | XX XX XX XX | XX XX XX --
+# Active layer: 64k: -- XX 00 00 | 00 00 00 00 | 00 XX XX --
+
+$QEMU_IO -c "write -P 0x11 65k 10k" "$TEST_IMG.base" | _filter_qemu_io
+$QEMU_IO -c "write -z 66k 7k" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0 64k 1k" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0x11 65k 1k" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0 66k 7k" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0x11 73k 2k" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0 75k 1k" "$TEST_IMG" | _filter_qemu_io
+
+$QEMU_IMG map --output=json "$TEST_IMG" | _filter_qemu_img_map
+
+# success, all done
+echo "*** done"
+rm -f $seq.full
+status=0
diff --git a/tests/qemu-iotests/154.out b/tests/qemu-iotests/154.out
new file mode 100644
index 0000000000..8946b734ac
--- /dev/null
+++ b/tests/qemu-iotests/154.out
@@ -0,0 +1,242 @@
+QA output created by 154
+
+== backing file contains zeros ==
+Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=134217728
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 backing_file=TEST_DIR/t.IMGFMT.base
+wrote 2048/2048 bytes at offset 0
+2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 2048/2048 bytes at offset 10240
+2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 2048/2048 bytes at offset 17408
+2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 2048/2048 bytes at offset 27648
+2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+[{ "start": 0, "length": 4096, "depth": 0, "zero": true, "data": false},
+{ "start": 4096, "length": 4096, "depth": 1, "zero": true, "data": false},
+{ "start": 8192, "length": 4096, "depth": 0, "zero": true, "data": false},
+{ "start": 12288, "length": 4096, "depth": 1, "zero": true, "data": false},
+{ "start": 16384, "length": 4096, "depth": 0, "zero": true, "data": false},
+{ "start": 20480, "length": 4096, "depth": 1, "zero": true, "data": false},
+{ "start": 24576, "length": 8192, "depth": 0, "zero": true, "data": false},
+{ "start": 32768, "length": 134184960, "depth": 1, "zero": true, "data": false}]
+
+== backing file contains non-zero data before write_zeroes ==
+Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=134217728
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 backing_file=TEST_DIR/t.IMGFMT.base
+wrote 1024/1024 bytes at offset 32768
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 1024/1024 bytes at offset 34816
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 1024/1024 bytes at offset 32768
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 3072/3072 bytes at offset 33792
+3 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 1024/1024 bytes at offset 66560
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 1024/1024 bytes at offset 67584
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 1024/1024 bytes at offset 66560
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 1024/1024 bytes at offset 65536
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 2048/2048 bytes at offset 67584
+2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+[{ "start": 0, "length": 32768, "depth": 1, "zero": true, "data": false},
+{ "start": 32768, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": 20480},
+{ "start": 36864, "length": 28672, "depth": 1, "zero": true, "data": false},
+{ "start": 65536, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": 24576},
+{ "start": 69632, "length": 134148096, "depth": 1, "zero": true, "data": false}]
+
+== backing file contains non-zero data after write_zeroes ==
+Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=134217728
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 backing_file=TEST_DIR/t.IMGFMT.base
+wrote 1024/1024 bytes at offset 34816
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 1024/1024 bytes at offset 33792
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 2048/2048 bytes at offset 32768
+2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 1024/1024 bytes at offset 34816
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 1024/1024 bytes at offset 35840
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 1024/1024 bytes at offset 44032
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 1024/1024 bytes at offset 41984
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 1024/1024 bytes at offset 44032
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 3072/3072 bytes at offset 40960
+3 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+[{ "start": 0, "length": 32768, "depth": 1, "zero": true, "data": false},
+{ "start": 32768, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": 20480},
+{ "start": 36864, "length": 4096, "depth": 1, "zero": true, "data": false},
+{ "start": 40960, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": 24576},
+{ "start": 45056, "length": 134172672, "depth": 1, "zero": true, "data": false}]
+
+== spanning two clusters, non-zero before request ==
+Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=134217728
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 backing_file=TEST_DIR/t.IMGFMT.base
+wrote 1024/1024 bytes at offset 32768
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 2048/2048 bytes at offset 35840
+2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 1024/1024 bytes at offset 32768
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 7168/7168 bytes at offset 33792
+7 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 1024/1024 bytes at offset 50176
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 2048/2048 bytes at offset 52224
+2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 1024/1024 bytes at offset 49152
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 1024/1024 bytes at offset 50176
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 6144/6144 bytes at offset 51200
+6 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 1024/1024 bytes at offset 67584
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 2048/2048 bytes at offset 68608
+2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 2048/2048 bytes at offset 65536
+2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 1024/1024 bytes at offset 67584
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 5120/5120 bytes at offset 68608
+5 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+[{ "start": 0, "length": 32768, "depth": 1, "zero": true, "data": false},
+{ "start": 32768, "length": 8192, "depth": 0, "zero": false, "data": true, "offset": 20480},
+{ "start": 40960, "length": 8192, "depth": 1, "zero": true, "data": false},
+{ "start": 49152, "length": 8192, "depth": 0, "zero": false, "data": true, "offset": 28672},
+{ "start": 57344, "length": 8192, "depth": 1, "zero": true, "data": false},
+{ "start": 65536, "length": 8192, "depth": 0, "zero": false, "data": true, "offset": 36864},
+{ "start": 73728, "length": 134144000, "depth": 1, "zero": true, "data": false}]
+
+== spanning two clusters, non-zero after request ==
+Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=134217728
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 backing_file=TEST_DIR/t.IMGFMT.base
+wrote 1024/1024 bytes at offset 37888
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 2048/2048 bytes at offset 35840
+2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 5120/5120 bytes at offset 32768
+5 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 1024/1024 bytes at offset 37888
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 2048/2048 bytes at offset 38912
+2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 1024/1024 bytes at offset 55296
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 2048/2048 bytes at offset 52224
+2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 6144/6144 bytes at offset 49152
+6 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 1024/1024 bytes at offset 55296
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 1024/1024 bytes at offset 56320
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 1024/1024 bytes at offset 72704
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 2048/2048 bytes at offset 68608
+2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 7168/7168 bytes at offset 65536
+7 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 1024/1024 bytes at offset 72704
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+[{ "start": 0, "length": 32768, "depth": 1, "zero": true, "data": false},
+{ "start": 32768, "length": 8192, "depth": 0, "zero": false, "data": true, "offset": 20480},
+{ "start": 40960, "length": 8192, "depth": 1, "zero": true, "data": false},
+{ "start": 49152, "length": 8192, "depth": 0, "zero": false, "data": true, "offset": 28672},
+{ "start": 57344, "length": 8192, "depth": 1, "zero": true, "data": false},
+{ "start": 65536, "length": 8192, "depth": 0, "zero": false, "data": true, "offset": 36864},
+{ "start": 73728, "length": 134144000, "depth": 1, "zero": true, "data": false}]
+
+== spanning two clusters, partially overwriting backing file ==
+Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=134217728
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 backing_file=TEST_DIR/t.IMGFMT.base
+wrote 4096/4096 bytes at offset 2048
+4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 2048/2048 bytes at offset 3072
+2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 2048/2048 bytes at offset 0
+2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 1024/1024 bytes at offset 2048
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 2048/2048 bytes at offset 3072
+2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 1024/1024 bytes at offset 5120
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 2048/2048 bytes at offset 6144
+2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+[{ "start": 0, "length": 8192, "depth": 0, "zero": false, "data": true, "offset": 20480},
+{ "start": 8192, "length": 134209536, "depth": 1, "zero": true, "data": false}]
+
+== spanning multiple clusters, non-zero in first cluster ==
+Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=134217728
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 backing_file=TEST_DIR/t.IMGFMT.base
+wrote 2048/2048 bytes at offset 65536
+2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 7168/7168 bytes at offset 67584
+7 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 2048/2048 bytes at offset 65536
+2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 10240/10240 bytes at offset 67584
+10 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+[{ "start": 0, "length": 65536, "depth": 1, "zero": true, "data": false},
+{ "start": 65536, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": 20480},
+{ "start": 69632, "length": 8192, "depth": 0, "zero": true, "data": false},
+{ "start": 77824, "length": 134139904, "depth": 1, "zero": true, "data": false}]
+
+== spanning multiple clusters, non-zero in intermediate cluster ==
+Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=134217728
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 backing_file=TEST_DIR/t.IMGFMT.base
+wrote 2048/2048 bytes at offset 70656
+2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 7168/7168 bytes at offset 67584
+7 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 12288/12288 bytes at offset 65536
+12 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+[{ "start": 0, "length": 65536, "depth": 1, "zero": true, "data": false},
+{ "start": 65536, "length": 12288, "depth": 0, "zero": true, "data": false},
+{ "start": 77824, "length": 134139904, "depth": 1, "zero": true, "data": false}]
+
+== spanning multiple clusters, non-zero in final cluster ==
+Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=134217728
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 backing_file=TEST_DIR/t.IMGFMT.base
+wrote 2048/2048 bytes at offset 75776
+2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 7168/7168 bytes at offset 67584
+7 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 10240/10240 bytes at offset 65536
+10 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 2048/2048 bytes at offset 75776
+2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+[{ "start": 0, "length": 65536, "depth": 1, "zero": true, "data": false},
+{ "start": 65536, "length": 8192, "depth": 0, "zero": true, "data": false},
+{ "start": 73728, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": 20480},
+{ "start": 77824, "length": 134139904, "depth": 1, "zero": true, "data": false}]
+
+== spanning multiple clusters, partially overwriting backing file ==
+Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=134217728
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 backing_file=TEST_DIR/t.IMGFMT.base
+wrote 10240/10240 bytes at offset 66560
+10 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 7168/7168 bytes at offset 67584
+7 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 1024/1024 bytes at offset 65536
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 1024/1024 bytes at offset 66560
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 7168/7168 bytes at offset 67584
+7 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 2048/2048 bytes at offset 74752
+2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 1024/1024 bytes at offset 76800
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+[{ "start": 0, "length": 65536, "depth": 1, "zero": true, "data": false},
+{ "start": 65536, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": 20480},
+{ "start": 69632, "length": 4096, "depth": 0, "zero": true, "data": false},
+{ "start": 73728, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": 24576},
+{ "start": 77824, "length": 134139904, "depth": 1, "zero": true, "data": false}]
+*** done
diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group
index 822953b6fa..ab1d76efdf 100644
--- a/tests/qemu-iotests/group
+++ b/tests/qemu-iotests/group
@@ -106,7 +106,7 @@
 097 rw auto backing
 098 rw auto backing quick
 099 rw auto quick
-100 rw auto quick
+# 100 was removed, do not reuse
 101 rw auto quick
 102 rw auto quick
 103 rw auto quick
@@ -153,3 +153,4 @@
 149 rw auto sudo
 150 rw auto quick
 152 rw auto quick
+154 rw auto backing quick
diff --git a/tests/test-throttle.c b/tests/test-throttle.c
index 744a524368..5ec966c8a4 100644
--- a/tests/test-throttle.c
+++ b/tests/test-throttle.c
@@ -20,6 +20,7 @@
 #include "qemu/throttle.h"
 #include "qemu/error-report.h"
 #include "block/throttle-groups.h"
+#include "sysemu/block-backend.h"

 static AioContext *ctx;
 static LeakyBucket bkt;
@@ -574,27 +575,32 @@ static void test_accounting(void)
 static void test_groups(void)
 {
     ThrottleConfig cfg1, cfg2;
-    BlockDriverState *bdrv1, *bdrv2, *bdrv3;
+    BlockBackend *blk1, *blk2, *blk3;
+    BlockBackendPublic *blkp1, *blkp2, *blkp3;

-    bdrv1 = bdrv_new();
-    bdrv2 = bdrv_new();
-    bdrv3 = bdrv_new();
+    blk1 = blk_new_with_bs(&error_abort);
+    blk2 = blk_new_with_bs(&error_abort);
+    blk3 = blk_new_with_bs(&error_abort);

-    g_assert(bdrv1->throttle_state == NULL);
-    g_assert(bdrv2->throttle_state == NULL);
-    g_assert(bdrv3->throttle_state == NULL);
+    blkp1 = blk_get_public(blk1);
+    blkp2 = blk_get_public(blk2);
+    blkp3 = blk_get_public(blk3);

-    throttle_group_register_bs(bdrv1, "bar");
-    throttle_group_register_bs(bdrv2, "foo");
-    throttle_group_register_bs(bdrv3, "bar");
+    g_assert(blkp1->throttle_state == NULL);
+    g_assert(blkp2->throttle_state == NULL);
+    g_assert(blkp3->throttle_state == NULL);

-    g_assert(bdrv1->throttle_state != NULL);
-    g_assert(bdrv2->throttle_state != NULL);
-    g_assert(bdrv3->throttle_state != NULL);
+    throttle_group_register_blk(blk1, "bar");
+    throttle_group_register_blk(blk2, "foo");
+    throttle_group_register_blk(blk3, "bar");

-    g_assert(!strcmp(throttle_group_get_name(bdrv1), "bar"));
-    g_assert(!strcmp(throttle_group_get_name(bdrv2), "foo"));
-    g_assert(bdrv1->throttle_state == bdrv3->throttle_state);
+    g_assert(blkp1->throttle_state != NULL);
+    g_assert(blkp2->throttle_state != NULL);
+    g_assert(blkp3->throttle_state != NULL);
+
+    g_assert(!strcmp(throttle_group_get_name(blk1), "bar"));
+    g_assert(!strcmp(throttle_group_get_name(blk2), "foo"));
+    g_assert(blkp1->throttle_state == blkp3->throttle_state);

     /* Setting the config of a group member affects the whole group */
     throttle_config_init(&cfg1);
@@ -602,29 +608,29 @@ static void test_groups(void)
     cfg1.buckets[THROTTLE_BPS_WRITE].avg = 285000;
     cfg1.buckets[THROTTLE_OPS_READ].avg = 20000;
     cfg1.buckets[THROTTLE_OPS_WRITE].avg = 12000;
-    throttle_group_config(bdrv1, &cfg1);
+    throttle_group_config(blk1, &cfg1);

-    throttle_group_get_config(bdrv1, &cfg1);
-    throttle_group_get_config(bdrv3, &cfg2);
+    throttle_group_get_config(blk1, &cfg1);
+    throttle_group_get_config(blk3, &cfg2);
     g_assert(!memcmp(&cfg1, &cfg2, sizeof(cfg1)));

     cfg2.buckets[THROTTLE_BPS_READ].avg = 4547;
     cfg2.buckets[THROTTLE_BPS_WRITE].avg = 1349;
     cfg2.buckets[THROTTLE_OPS_READ].avg = 123;
     cfg2.buckets[THROTTLE_OPS_WRITE].avg = 86;
-    throttle_group_config(bdrv3, &cfg1);
+    throttle_group_config(blk3, &cfg1);

-    throttle_group_get_config(bdrv1, &cfg1);
-    throttle_group_get_config(bdrv3, &cfg2);
+    throttle_group_get_config(blk1, &cfg1);
+    throttle_group_get_config(blk3, &cfg2);
     g_assert(!memcmp(&cfg1, &cfg2, sizeof(cfg1)));

-    throttle_group_unregister_bs(bdrv1);
-    throttle_group_unregister_bs(bdrv2);
-    throttle_group_unregister_bs(bdrv3);
+    throttle_group_unregister_blk(blk1);
+    throttle_group_unregister_blk(blk2);
+    throttle_group_unregister_blk(blk3);

-    g_assert(bdrv1->throttle_state == NULL);
-    g_assert(bdrv2->throttle_state == NULL);
-    g_assert(bdrv3->throttle_state == NULL);
+    g_assert(blkp1->throttle_state == NULL);
+    g_assert(blkp2->throttle_state == NULL);
+    g_assert(blkp3->throttle_state == NULL);
 }

 int main(int argc, char **argv)
diff --git a/trace-events b/trace-events
index e35b80e980..b53c3541a3 100644
--- a/trace-events
+++ b/trace-events
@@ -62,8 +62,6 @@ bdrv_open_common(void *bs, const char *filename, int flags, const char *format_n
 bdrv_lock_medium(void *bs, bool locked) "bs %p locked %d"

 # block/io.c
-multiwrite_cb(void *mcb, int ret) "mcb %p ret %d"
-bdrv_aio_multiwrite(void *mcb, int num_callbacks, int num_reqs) "mcb %p num_callbacks %d num_reqs %d"
 bdrv_aio_discard(void *bs, int64_t sector_num, int nb_sectors, void *opaque) "bs %p sector_num %"PRId64" nb_sectors %d opaque %p"
 bdrv_aio_flush(void *bs, void *opaque) "bs %p opaque %p"
 bdrv_aio_readv(void *bs, int64_t sector_num, int nb_sectors, void *opaque) "bs %p sector_num %"PRId64" nb_sectors %d opaque %p"