8 files changed, 606 insertions, 61 deletions
diff --git a/block/coroutines.h b/block/coroutines.h
index f69179f5ef..1cb3128b94 100644
--- a/block/coroutines.h
+++ b/block/coroutines.h
@@ -41,6 +41,7 @@ bdrv_pwritev(BdrvChild *child, int64_t offset, unsigned int bytes,
 int coroutine_fn
 bdrv_co_common_block_status_above(BlockDriverState *bs,
                                   BlockDriverState *base,
+                                  bool include_base,
                                   bool want_zero,
                                   int64_t offset,
                                   int64_t bytes,
@@ -50,6 +51,7 @@ bdrv_co_common_block_status_above(BlockDriverState *bs,
 int generated_co_wrapper
 bdrv_common_block_status_above(BlockDriverState *bs,
                                BlockDriverState *base,
+                               bool include_base,
                                bool want_zero,
                                int64_t offset,
                                int64_t bytes,
diff --git a/block/export/export.c b/block/export/export.c
index f2c00d13bf..c3478c6c97 100644
--- a/block/export/export.c
+++ b/block/export/export.c
@@ -15,15 +15,22 @@
 
 #include "block/block.h"
 #include "sysemu/block-backend.h"
+#include "sysemu/iothread.h"
 #include "block/export.h"
 #include "block/nbd.h"
 #include "qapi/error.h"
 #include "qapi/qapi-commands-block-export.h"
 #include "qapi/qapi-events-block-export.h"
 #include "qemu/id.h"
+#if defined(CONFIG_LINUX) && defined(CONFIG_VHOST_USER)
+#include "vhost-user-blk-server.h"
+#endif
 
 static const BlockExportDriver *blk_exp_drivers[] = {
     &blk_exp_nbd,
+#if defined(CONFIG_LINUX) && defined(CONFIG_VHOST_USER)
+    &blk_exp_vhost_user_blk,
+#endif
 };
 
 /* Only accessed from the main thread */
@@ -57,10 +64,11 @@ static const BlockExportDriver *blk_exp_find_driver(BlockExportType type)
 
 BlockExport *blk_exp_add(BlockExportOptions *export, Error **errp)
 {
+    bool fixed_iothread = export->has_fixed_iothread && export->fixed_iothread;
     const BlockExportDriver *drv;
     BlockExport *exp = NULL;
     BlockDriverState *bs;
-    BlockBackend *blk;
+    BlockBackend *blk = NULL;
     AioContext *ctx;
     uint64_t perm;
     int ret;
@@ -96,6 +104,28 @@ BlockExport *blk_exp_add(BlockExportOptions *export, Error **errp)
     ctx = bdrv_get_aio_context(bs);
     aio_context_acquire(ctx);
 
+    if (export->has_iothread) {
+        IOThread *iothread;
+        AioContext *new_ctx;
+
+        iothread = iothread_by_id(export->iothread);
+        if (!iothread) {
+            error_setg(errp, "iothread \"%s\" not found", export->iothread);
+            goto fail;
+        }
+
+        new_ctx = iothread_get_aio_context(iothread);
+
+        ret = bdrv_try_set_aio_context(bs, new_ctx, errp);
+        if (ret == 0) {
+            aio_context_release(ctx);
+            aio_context_acquire(new_ctx);
+            ctx = new_ctx;
+        } else if (fixed_iothread) {
+            goto fail;
+        }
+    }
+
     /*
      * Block exports are used for non-shared storage migration. Make sure
      * that BDRV_O_INACTIVE is cleared and the image is ready for write
@@ -110,6 +140,11 @@ BlockExport *blk_exp_add(BlockExportOptions *export, Error **errp)
     }
 
     blk = blk_new(ctx, perm, BLK_PERM_ALL);
+
+    if (!fixed_iothread) {
+        blk_set_allow_aio_context_change(blk, true);
+    }
+
     ret = blk_insert_bs(blk, bs, errp);
     if (ret < 0) {
         goto fail;
diff --git a/block/export/meson.build b/block/export/meson.build
index 558ef35d38..9fb4fbf81d 100644
--- a/block/export/meson.build
+++ b/block/export/meson.build
@@ -1 +1,2 @@
-block_ss.add(files('export.c'))
+blockdev_ss.add(files('export.c'))
+blockdev_ss.add(when: ['CONFIG_LINUX', 'CONFIG_VHOST_USER'], if_true: files('vhost-user-blk-server.c'))
diff --git a/block/export/vhost-user-blk-server.c b/block/export/vhost-user-blk-server.c
new file mode 100644
index 0000000000..41f4933d6e
--- /dev/null
+++ b/block/export/vhost-user-blk-server.c
@@ -0,0 +1,431 @@
+/*
+ * Sharing QEMU block devices via vhost-user protocal
+ *
+ * Parts of the code based on nbd/server.c.
+ *
+ * Copyright (c) Coiby Xu <coiby.xu@gmail.com>.
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later.  See the COPYING file in the top-level directory.
+ */
+#include "qemu/osdep.h"
+#include "block/block.h"
+#include "contrib/libvhost-user/libvhost-user.h"
+#include "standard-headers/linux/virtio_blk.h"
+#include "qemu/vhost-user-server.h"
+#include "vhost-user-blk-server.h"
+#include "qapi/error.h"
+#include "qom/object_interfaces.h"
+#include "sysemu/block-backend.h"
+#include "util/block-helpers.h"
+
+enum {
+    VHOST_USER_BLK_NUM_QUEUES_DEFAULT = 1,
+};
+struct virtio_blk_inhdr {
+    unsigned char status;
+};
+
+typedef struct VuBlkReq {
+    VuVirtqElement elem;
+    int64_t sector_num;
+    size_t size;
+    struct virtio_blk_inhdr *in;
+    struct virtio_blk_outhdr out;
+    VuServer *server;
+    struct VuVirtq *vq;
+} VuBlkReq;
+
+/* vhost user block device */
+typedef struct {
+    BlockExport export;
+    VuServer vu_server;
+    uint32_t blk_size;
+    QIOChannelSocket *sioc;
+    struct virtio_blk_config blkcfg;
+    bool writable;
+} VuBlkExport;
+
+static void vu_blk_req_complete(VuBlkReq *req)
+{
+    VuDev *vu_dev = &req->server->vu_dev;
+
+    /* IO size with 1 extra status byte */
+    vu_queue_push(vu_dev, req->vq, &req->elem, req->size + 1);
+    vu_queue_notify(vu_dev, req->vq);
+
+    free(req);
+}
+
+static int coroutine_fn
+vu_blk_discard_write_zeroes(BlockBackend *blk, struct iovec *iov,
+                            uint32_t iovcnt, uint32_t type)
+{
+    struct virtio_blk_discard_write_zeroes desc;
+    ssize_t size = iov_to_buf(iov, iovcnt, 0, &desc, sizeof(desc));
+    if (unlikely(size != sizeof(desc))) {
+        error_report("Invalid size %zd, expect %zu", size, sizeof(desc));
+        return -EINVAL;
+    }
+
+    uint64_t range[2] = { le64_to_cpu(desc.sector) << 9,
+                          le32_to_cpu(desc.num_sectors) << 9 };
+    if (type == VIRTIO_BLK_T_DISCARD) {
+        if (blk_co_pdiscard(blk, range[0], range[1]) == 0) {
+            return 0;
+        }
+    } else if (type == VIRTIO_BLK_T_WRITE_ZEROES) {
+        if (blk_co_pwrite_zeroes(blk, range[0], range[1], 0) == 0) {
+            return 0;
+        }
+    }
+
+    return -EINVAL;
+}
+
+static void coroutine_fn vu_blk_virtio_process_req(void *opaque)
+{
+    VuBlkReq *req = opaque;
+    VuServer *server = req->server;
+    VuVirtqElement *elem = &req->elem;
+    uint32_t type;
+
+    VuBlkExport *vexp = container_of(server, VuBlkExport, vu_server);
+    BlockBackend *blk = vexp->export.blk;
+
+    struct iovec *in_iov = elem->in_sg;
+    struct iovec *out_iov = elem->out_sg;
+    unsigned in_num = elem->in_num;
+    unsigned out_num = elem->out_num;
+
+    /* refer to hw/block/virtio_blk.c */
+    if (elem->out_num < 1 || elem->in_num < 1) {
+        error_report("virtio-blk request missing headers");
+        goto err;
+    }
+
+    if (unlikely(iov_to_buf(out_iov, out_num, 0, &req->out,
+                            sizeof(req->out)) != sizeof(req->out))) {
+        error_report("virtio-blk request outhdr too short");
+        goto err;
+    }
+
+    iov_discard_front(&out_iov, &out_num, sizeof(req->out));
+
+    if (in_iov[in_num - 1].iov_len < sizeof(struct virtio_blk_inhdr)) {
+        error_report("virtio-blk request inhdr too short");
+        goto err;
+    }
+
+    /* We always touch the last byte, so just see how big in_iov is.  */
+    req->in = (void *)in_iov[in_num - 1].iov_base
+              + in_iov[in_num - 1].iov_len
+              - sizeof(struct virtio_blk_inhdr);
+    iov_discard_back(in_iov, &in_num, sizeof(struct virtio_blk_inhdr));
+
+    type = le32_to_cpu(req->out.type);
+    switch (type & ~VIRTIO_BLK_T_BARRIER) {
+    case VIRTIO_BLK_T_IN:
+    case VIRTIO_BLK_T_OUT: {
+        ssize_t ret = 0;
+        bool is_write = type & VIRTIO_BLK_T_OUT;
+        req->sector_num = le64_to_cpu(req->out.sector);
+
+        if (is_write && !vexp->writable) {
+            req->in->status = VIRTIO_BLK_S_IOERR;
+            break;
+        }
+
+        int64_t offset = req->sector_num * vexp->blk_size;
+        QEMUIOVector qiov;
+        if (is_write) {
+            qemu_iovec_init_external(&qiov, out_iov, out_num);
+            ret = blk_co_pwritev(blk, offset, qiov.size, &qiov, 0);
+        } else {
+            qemu_iovec_init_external(&qiov, in_iov, in_num);
+            ret = blk_co_preadv(blk, offset, qiov.size, &qiov, 0);
+        }
+        if (ret >= 0) {
+            req->in->status = VIRTIO_BLK_S_OK;
+        } else {
+            req->in->status = VIRTIO_BLK_S_IOERR;
+        }
+        break;
+    }
+    case VIRTIO_BLK_T_FLUSH:
+        if (blk_co_flush(blk) == 0) {
+            req->in->status = VIRTIO_BLK_S_OK;
+        } else {
+            req->in->status = VIRTIO_BLK_S_IOERR;
+        }
+        break;
+    case VIRTIO_BLK_T_GET_ID: {
+        size_t size = MIN(iov_size(&elem->in_sg[0], in_num),
+                          VIRTIO_BLK_ID_BYTES);
+        snprintf(elem->in_sg[0].iov_base, size, "%s", "vhost_user_blk");
+        req->in->status = VIRTIO_BLK_S_OK;
+        req->size = elem->in_sg[0].iov_len;
+        break;
+    }
+    case VIRTIO_BLK_T_DISCARD:
+    case VIRTIO_BLK_T_WRITE_ZEROES: {
+        int rc;
+
+        if (!vexp->writable) {
+            req->in->status = VIRTIO_BLK_S_IOERR;
+            break;
+        }
+
+        rc = vu_blk_discard_write_zeroes(blk, &elem->out_sg[1], out_num, type);
+        if (rc == 0) {
+            req->in->status = VIRTIO_BLK_S_OK;
+        } else {
+            req->in->status = VIRTIO_BLK_S_IOERR;
+        }
+        break;
+    }
+    default:
+        req->in->status = VIRTIO_BLK_S_UNSUPP;
+        break;
+    }
+
+    vu_blk_req_complete(req);
+    return;
+
+err:
+    free(req);
+}
+
+static void vu_blk_process_vq(VuDev *vu_dev, int idx)
+{
+    VuServer *server = container_of(vu_dev, VuServer, vu_dev);
+    VuVirtq *vq = vu_get_queue(vu_dev, idx);
+
+    while (1) {
+        VuBlkReq *req;
+
+        req = vu_queue_pop(vu_dev, vq, sizeof(VuBlkReq));
+        if (!req) {
+            break;
+        }
+
+        req->server = server;
+        req->vq = vq;
+
+        Coroutine *co =
+            qemu_coroutine_create(vu_blk_virtio_process_req, req);
+        qemu_coroutine_enter(co);
+    }
+}
+
+static void vu_blk_queue_set_started(VuDev *vu_dev, int idx, bool started)
+{
+    VuVirtq *vq;
+
+    assert(vu_dev);
+
+    vq = vu_get_queue(vu_dev, idx);
+    vu_set_queue_handler(vu_dev, vq, started ? vu_blk_process_vq : NULL);
+}
+
+static uint64_t vu_blk_get_features(VuDev *dev)
+{
+    uint64_t features;
+    VuServer *server = container_of(dev, VuServer, vu_dev);
+    VuBlkExport *vexp = container_of(server, VuBlkExport, vu_server);
+    features = 1ull << VIRTIO_BLK_F_SIZE_MAX |
+               1ull << VIRTIO_BLK_F_SEG_MAX |
+               1ull << VIRTIO_BLK_F_TOPOLOGY |
+               1ull << VIRTIO_BLK_F_BLK_SIZE |
+               1ull << VIRTIO_BLK_F_FLUSH |
+               1ull << VIRTIO_BLK_F_DISCARD |
+               1ull << VIRTIO_BLK_F_WRITE_ZEROES |
+               1ull << VIRTIO_BLK_F_CONFIG_WCE |
+               1ull << VIRTIO_BLK_F_MQ |
+               1ull << VIRTIO_F_VERSION_1 |
+               1ull << VIRTIO_RING_F_INDIRECT_DESC |
+               1ull << VIRTIO_RING_F_EVENT_IDX |
+               1ull << VHOST_USER_F_PROTOCOL_FEATURES;
+
+    if (!vexp->writable) {
+        features |= 1ull << VIRTIO_BLK_F_RO;
+    }
+
+    return features;
+}
+
+static uint64_t vu_blk_get_protocol_features(VuDev *dev)
+{
+    return 1ull << VHOST_USER_PROTOCOL_F_CONFIG |
+           1ull << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD;
+}
+
+static int
+vu_blk_get_config(VuDev *vu_dev, uint8_t *config, uint32_t len)
+{
+    /* TODO blkcfg must be little-endian for VIRTIO 1.0 */
+    VuServer *server = container_of(vu_dev, VuServer, vu_dev);
+    VuBlkExport *vexp = container_of(server, VuBlkExport, vu_server);
+    memcpy(config, &vexp->blkcfg, len);
+    return 0;
+}
+
+static int
+vu_blk_set_config(VuDev *vu_dev, const uint8_t *data,
+                    uint32_t offset, uint32_t size, uint32_t flags)
+{
+    VuServer *server = container_of(vu_dev, VuServer, vu_dev);
+    VuBlkExport *vexp = container_of(server, VuBlkExport, vu_server);
+    uint8_t wce;
+
+    /* don't support live migration */
+    if (flags != VHOST_SET_CONFIG_TYPE_MASTER) {
+        return -EINVAL;
+    }
+
+    if (offset != offsetof(struct virtio_blk_config, wce) ||
+        size != 1) {
+        return -EINVAL;
+    }
+
+    wce = *data;
+    vexp->blkcfg.wce = wce;
+    blk_set_enable_write_cache(vexp->export.blk, wce);
+    return 0;
+}
+
+/*
+ * When the client disconnects, it sends a VHOST_USER_NONE request
+ * and vu_process_message will simple call exit which cause the VM
+ * to exit abruptly.
+ * To avoid this issue,  process VHOST_USER_NONE request ahead
+ * of vu_process_message.
+ *
+ */
+static int vu_blk_process_msg(VuDev *dev, VhostUserMsg *vmsg, int *do_reply)
+{
+    if (vmsg->request == VHOST_USER_NONE) {
+        dev->panic(dev, "disconnect");
+        return true;
+    }
+    return false;
+}
+
+static const VuDevIface vu_blk_iface = {
+    .get_features          = vu_blk_get_features,
+    .queue_set_started     = vu_blk_queue_set_started,
+    .get_protocol_features = vu_blk_get_protocol_features,
+    .get_config            = vu_blk_get_config,
+    .set_config            = vu_blk_set_config,
+    .process_msg           = vu_blk_process_msg,
+};
+
+static void blk_aio_attached(AioContext *ctx, void *opaque)
+{
+    VuBlkExport *vexp = opaque;
+
+    vexp->export.ctx = ctx;
+    vhost_user_server_attach_aio_context(&vexp->vu_server, ctx);
+}
+
+static void blk_aio_detach(void *opaque)
+{
+    VuBlkExport *vexp = opaque;
+
+    vhost_user_server_detach_aio_context(&vexp->vu_server);
+    vexp->export.ctx = NULL;
+}
+
+static void
+vu_blk_initialize_config(BlockDriverState *bs,
+                         struct virtio_blk_config *config,
+                         uint32_t blk_size,
+                         uint16_t num_queues)
+{
+    config->capacity = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
+    config->blk_size = blk_size;
+    config->size_max = 0;
+    config->seg_max = 128 - 2;
+    config->min_io_size = 1;
+    config->opt_io_size = 1;
+    config->num_queues = num_queues;
+    config->max_discard_sectors = 32768;
+    config->max_discard_seg = 1;
+    config->discard_sector_alignment = config->blk_size >> 9;
+    config->max_write_zeroes_sectors = 32768;
+    config->max_write_zeroes_seg = 1;
+}
+
+static void vu_blk_exp_request_shutdown(BlockExport *exp)
+{
+    VuBlkExport *vexp = container_of(exp, VuBlkExport, export);
+
+    vhost_user_server_stop(&vexp->vu_server);
+}
+
+static int vu_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
+                             Error **errp)
+{
+    VuBlkExport *vexp = container_of(exp, VuBlkExport, export);
+    BlockExportOptionsVhostUserBlk *vu_opts = &opts->u.vhost_user_blk;
+    Error *local_err = NULL;
+    uint64_t logical_block_size;
+    uint16_t num_queues = VHOST_USER_BLK_NUM_QUEUES_DEFAULT;
+
+    vexp->writable = opts->writable;
+    vexp->blkcfg.wce = 0;
+
+    if (vu_opts->has_logical_block_size) {
+        logical_block_size = vu_opts->logical_block_size;
+    } else {
+        logical_block_size = BDRV_SECTOR_SIZE;
+    }
+    check_block_size(exp->id, "logical-block-size", logical_block_size,
+                     &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return -EINVAL;
+    }
+    vexp->blk_size = logical_block_size;
+    blk_set_guest_block_size(exp->blk, logical_block_size);
+
+    if (vu_opts->has_num_queues) {
+        num_queues = vu_opts->num_queues;
+    }
+    if (num_queues == 0) {
+        error_setg(errp, "num-queues must be greater than 0");
+        return -EINVAL;
+    }
+
+    vu_blk_initialize_config(blk_bs(exp->blk), &vexp->blkcfg,
+                             logical_block_size, num_queues);
+
+    blk_add_aio_context_notifier(exp->blk, blk_aio_attached, blk_aio_detach,
+                                 vexp);
+
+    if (!vhost_user_server_start(&vexp->vu_server, vu_opts->addr, exp->ctx,
+                                 num_queues, &vu_blk_iface, errp)) {
+        blk_remove_aio_context_notifier(exp->blk, blk_aio_attached,
+                                        blk_aio_detach, vexp);
+        return -EADDRNOTAVAIL;
+    }
+
+    return 0;
+}
+
+static void vu_blk_exp_delete(BlockExport *exp)
+{
+    VuBlkExport *vexp = container_of(exp, VuBlkExport, export);
+
+    blk_remove_aio_context_notifier(exp->blk, blk_aio_attached, blk_aio_detach,
+                                    vexp);
+}
+
+const BlockExportDriver blk_exp_vhost_user_blk = {
+    .type               = BLOCK_EXPORT_TYPE_VHOST_USER_BLK,
+    .instance_size      = sizeof(VuBlkExport),
+    .create             = vu_blk_exp_create,
+    .delete             = vu_blk_exp_delete,
+    .request_shutdown   = vu_blk_exp_request_shutdown,
+};
diff --git a/block/export/vhost-user-blk-server.h b/block/export/vhost-user-blk-server.h
new file mode 100644
index 0000000000..fcf46fc8a5
--- /dev/null
+++ b/block/export/vhost-user-blk-server.h
@@ -0,0 +1,19 @@
+/*
+ * Sharing QEMU block devices via vhost-user protocal
+ *
+ * Copyright (c) Coiby Xu <coiby.xu@gmail.com>.
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later.  See the COPYING file in the top-level directory.
+ */
+
+#ifndef VHOST_USER_BLK_SERVER_H
+#define VHOST_USER_BLK_SERVER_H
+
+#include "block/export.h"
+
+/* For block/export/export.c */
+extern const BlockExportDriver blk_exp_vhost_user_blk;
+
+#endif /* VHOST_USER_BLK_SERVER_H */
diff --git a/block/io.c b/block/io.c
index 54f0968aee..02528b3823 100644
--- a/block/io.c
+++ b/block/io.c
@@ -2343,6 +2343,7 @@ early_out:
 int coroutine_fn
 bdrv_co_common_block_status_above(BlockDriverState *bs,
                                   BlockDriverState *base,
+                                  bool include_base,
                                   bool want_zero,
                                   int64_t offset,
                                   int64_t bytes,
@@ -2350,34 +2351,84 @@ bdrv_co_common_block_status_above(BlockDriverState *bs,
                                   int64_t *map,
                                   BlockDriverState **file)
 {
+    int ret;
     BlockDriverState *p;
-    int ret = 0;
-    bool first = true;
+    int64_t eof = 0;
+
+    assert(!include_base || base); /* Can't include NULL base */
+
+    if (!include_base && bs == base) {
+        *pnum = bytes;
+        return 0;
+    }
+
+    ret = bdrv_co_block_status(bs, want_zero, offset, bytes, pnum, map, file);
+    if (ret < 0 || *pnum == 0 || ret & BDRV_BLOCK_ALLOCATED || bs == base) {
+        return ret;
+    }
+
+    if (ret & BDRV_BLOCK_EOF) {
+        eof = offset + *pnum;
+    }
+
+    assert(*pnum <= bytes);
+    bytes = *pnum;
 
-    assert(bs != base);
-    for (p = bs; p != base; p = bdrv_filter_or_cow_bs(p)) {
+    for (p = bdrv_filter_or_cow_bs(bs); include_base || p != base;
+         p = bdrv_filter_or_cow_bs(p))
+    {
         ret = bdrv_co_block_status(p, want_zero, offset, bytes, pnum, map,
                                    file);
         if (ret < 0) {
-            break;
+            return ret;
         }
-        if (ret & BDRV_BLOCK_ZERO && ret & BDRV_BLOCK_EOF && !first) {
+        if (*pnum == 0) {
             /*
-             * Reading beyond the end of the file continues to read
-             * zeroes, but we can only widen the result to the
-             * unallocated length we learned from an earlier
-             * iteration.
+             * The top layer deferred to this layer, and because this layer is
+             * short, any zeroes that we synthesize beyond EOF behave as if they
+             * were allocated at this layer.
+             *
+             * We don't include BDRV_BLOCK_EOF into ret, as upper layer may be
+             * larger. We'll add BDRV_BLOCK_EOF if needed at function end, see
+             * below.
              */
+            assert(ret & BDRV_BLOCK_EOF);
             *pnum = bytes;
+            if (file) {
+                *file = p;
+            }
+            ret = BDRV_BLOCK_ZERO | BDRV_BLOCK_ALLOCATED;
+            break;
+        }
+        if (ret & BDRV_BLOCK_ALLOCATED) {
+            /*
+             * We've found the node and the status, we must break.
+             *
+             * Drop BDRV_BLOCK_EOF, as it's not for upper layer, which may be
+             * larger. We'll add BDRV_BLOCK_EOF if needed at function end, see
+             * below.
+             */
+            ret &= ~BDRV_BLOCK_EOF;
+            break;
         }
-        if (ret & (BDRV_BLOCK_ZERO | BDRV_BLOCK_DATA)) {
+
+        if (p == base) {
+            assert(include_base);
             break;
         }
-        /* [offset, pnum] unallocated on this layer, which could be only
-         * the first part of [offset, bytes].  */
-        bytes = MIN(bytes, *pnum);
-        first = false;
+
+        /*
+         * OK, [offset, offset + *pnum) region is unallocated on this layer,
+         * let's continue the diving.
+         */
+        assert(*pnum <= bytes);
+        bytes = *pnum;
+    }
+
+    if (offset + *pnum == eof) {
+        ret |= BDRV_BLOCK_EOF;
     }
+
     return ret;
 }
 
@@ -2385,7 +2436,7 @@ int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base,
                             int64_t offset, int64_t bytes, int64_t *pnum,
                             int64_t *map, BlockDriverState **file)
 {
-    return bdrv_common_block_status_above(bs, base, true, offset, bytes,
+    return bdrv_common_block_status_above(bs, base, false, true, offset, bytes,
                                           pnum, map, file);
 }
 
@@ -2402,9 +2453,9 @@ int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t offset,
     int ret;
     int64_t dummy;
 
-    ret = bdrv_common_block_status_above(bs, bdrv_filter_or_cow_bs(bs), false,
-                                         offset, bytes, pnum ? pnum : &dummy,
-                                         NULL, NULL);
+    ret = bdrv_common_block_status_above(bs, bs, true, false, offset,
+                                         bytes, pnum ? pnum : &dummy, NULL,
+                                         NULL);
     if (ret < 0) {
         return ret;
     }
@@ -2426,52 +2477,19 @@ int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t offset,
  * at 'offset + *pnum' may return the same allocation status (in other
  * words, the result is not necessarily the maximum possible range);
  * but 'pnum' will only be 0 when end of file is reached.
- *
  */
 int bdrv_is_allocated_above(BlockDriverState *top,
                             BlockDriverState *base,
                             bool include_base, int64_t offset,
                             int64_t bytes, int64_t *pnum)
 {
-    BlockDriverState *intermediate;
-    int ret;
-    int64_t n = bytes;
-
-    assert(base || !include_base);
-
-    intermediate = top;
-    while (include_base || intermediate != base) {
-        int64_t pnum_inter;
-        int64_t size_inter;
-
-        assert(intermediate);
-        ret = bdrv_is_allocated(intermediate, offset, bytes, &pnum_inter);
-        if (ret < 0) {
-            return ret;
-        }
-        if (ret) {
-            *pnum = pnum_inter;
-            return 1;
-        }
-
-        size_inter = bdrv_getlength(intermediate);
-        if (size_inter < 0) {
-            return size_inter;
-        }
-        if (n > pnum_inter &&
-            (intermediate == top || offset + pnum_inter < size_inter)) {
-            n = pnum_inter;
-        }
-
-        if (intermediate == base) {
-            break;
-        }
-
-        intermediate = bdrv_filter_or_cow_bs(intermediate);
+    int ret = bdrv_common_block_status_above(top, base, include_base, false,
+                                             offset, bytes, pnum, NULL, NULL);
+    if (ret < 0) {
+        return ret;
     }
 
-    *pnum = n;
-    return 0;
+    return !!(ret & BDRV_BLOCK_ALLOCATED);
 }
 
 int coroutine_fn
diff --git a/block/nvme.c b/block/nvme.c
index b48f6f2588..739a0a700c 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -128,6 +128,12 @@ struct BDRVNVMeState {
 
     /* PCI address (required for nvme_refresh_filename()) */
     char *device;
+
+    struct {
+        uint64_t completion_errors;
+        uint64_t aligned_accesses;
+        uint64_t unaligned_accesses;
+    } stats;
 };
 
 #define NVME_BLOCK_OPT_DEVICE "device"
@@ -384,6 +390,9 @@ static bool nvme_process_completion(NVMeQueuePair *q)
             break;
         }
         ret = nvme_translate_error(c);
+        if (ret) {
+            s->stats.completion_errors++;
+        }
         q->cq.head = (q->cq.head + 1) % NVME_QUEUE_SIZE;
         if (!q->cq.head) {
             q->cq_phase = !q->cq_phase;
@@ -1155,8 +1164,10 @@ static int nvme_co_prw(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
     assert(QEMU_IS_ALIGNED(bytes, s->page_size));
     assert(bytes <= s->max_transfer);
     if (nvme_qiov_aligned(bs, qiov)) {
+        s->stats.aligned_accesses++;
         return nvme_co_prw_aligned(bs, offset, bytes, qiov, is_write, flags);
     }
+    s->stats.unaligned_accesses++;
     trace_nvme_prw_buffered(s, offset, bytes, qiov->niov, is_write);
     buf = qemu_try_memalign(s->page_size, bytes);
 
@@ -1452,6 +1463,21 @@ static void nvme_unregister_buf(BlockDriverState *bs, void *host)
     qemu_vfio_dma_unmap(s->vfio, host);
 }
 
+static BlockStatsSpecific *nvme_get_specific_stats(BlockDriverState *bs)
+{
+    BlockStatsSpecific *stats = g_new(BlockStatsSpecific, 1);
+    BDRVNVMeState *s = bs->opaque;
+
+    stats->driver = BLOCKDEV_DRIVER_NVME;
+    stats->u.nvme = (BlockStatsSpecificNvme) {
+        .completion_errors = s->stats.completion_errors,
+        .aligned_accesses = s->stats.aligned_accesses,
+        .unaligned_accesses = s->stats.unaligned_accesses,
+    };
+
+    return stats;
+}
+
 static const char *const nvme_strong_runtime_opts[] = {
     NVME_BLOCK_OPT_DEVICE,
     NVME_BLOCK_OPT_NAMESPACE,
@@ -1485,6 +1511,7 @@ static BlockDriver bdrv_nvme = {
     .bdrv_refresh_filename    = nvme_refresh_filename,
     .bdrv_refresh_limits      = nvme_refresh_limits,
     .strong_runtime_opts      = nvme_strong_runtime_opts,
+    .bdrv_get_specific_stats  = nvme_get_specific_stats,
 
     .bdrv_detach_aio_context  = nvme_detach_aio_context,
     .bdrv_attach_aio_context  = nvme_attach_aio_context,
diff --git a/block/qcow2.c b/block/qcow2.c
index b05512718c..b6cb4db8bb 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -3860,8 +3860,20 @@ static bool is_zero(BlockDriverState *bs, int64_t offset, int64_t bytes)
     if (!bytes) {
         return true;
     }
-    res = bdrv_block_status_above(bs, NULL, offset, bytes, &nr, NULL, NULL);
-    return res >= 0 && (res & BDRV_BLOCK_ZERO) && nr == bytes;
+
+    /*
+     * bdrv_block_status_above doesn't merge different types of zeros, for
+     * example, zeros which come from the region which is unallocated in
+     * the whole backing chain, and zeros which come because of a short
+     * backing file. So, we need a loop.
+     */
+    do {
+        res = bdrv_block_status_above(bs, NULL, offset, bytes, &nr, NULL, NULL);
+        offset += nr;
+        bytes -= nr;
+    } while (res >= 0 && (res & BDRV_BLOCK_ZERO) && nr && bytes);
+
+    return res >= 0 && (res & BDRV_BLOCK_ZERO) && bytes == 0;
 }
 
 static coroutine_fn int qcow2_co_pwrite_zeroes(BlockDriverState *bs,