5 files changed, 230 insertions, 0 deletions
diff --git a/block.c b/block.c
index 033957daf6..45ad6fb4b9 100644
--- a/block.c
+++ b/block.c
@@ -1354,6 +1354,189 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
     return ret;
 }
 
+/* Completion state shared by all AIO writes of one bdrv_aio_multiwrite(). */
+typedef struct MultiwriteCB {
+    int error;          /* 0, or the error code passed to every callback */
+    int num_requests;   /* submitted AIO writes still awaiting completion */
+    int num_callbacks;  /* number of entries in callbacks[] */
+    struct {
+        BlockDriverCompletionFunc *cb;  /* original request's callback */
+        void *opaque;                   /* argument passed to cb */
+        QEMUIOVector *free_qiov;        /* merged qiov to free, or NULL */
+        void *free_buf;                 /* zero buffer to free, or NULL */
+    } callbacks[];
+} MultiwriteCB;
+
+/* Reports mcb->error to every original request, then frees merge data. */
+static void multiwrite_user_cb(MultiwriteCB *mcb)
+{
+    int i;
+
+    for (i = 0; i < mcb->num_callbacks; i++) {
+        mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
+        if (mcb->callbacks[i].free_qiov) {   /* also free its iovec array */
+            qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
+        }
+        qemu_free(mcb->callbacks[i].free_qiov);
+        qemu_vfree(mcb->callbacks[i].free_buf);   /* qemu_blockalign()ed */
+    }
+}
+
+/* AIO completion: records the first error; the last completion notifies. */
+static void multiwrite_cb(void *opaque, int ret)
+{
+    MultiwriteCB *mcb = opaque;
+
+    if (ret < 0 && !mcb->error) {
+        mcb->error = ret;   /* later errors must not overwrite it */
+    }
+    if (--mcb->num_requests == 0) {   /* no write uses the buffers any more */
+        multiwrite_user_cb(mcb);
+        qemu_free(mcb);
+    }
+}
+
+// qsort() comparator: orders requests by ascending start sector
+static int multiwrite_req_compare(const void *a, const void *b)
+{
+    const BlockRequest *req1 = a, *req2 = b;
+
+    // Not the int64_t difference: truncated to int it may flip its sign
+    return (req1->sector > req2->sector) - (req1->sector < req2->sector);
+}
+
+/*
+ * Sorts the requests and merges them where possible. Returns the number of
+ * requests that remain (at the front of reqs) after merging.
+ */
+static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
+    int num_reqs, MultiwriteCB *mcb)
+{
+    int i, outidx;
+
+    // Sort requests by start sector
+    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
+
+    // Try to merge each request into the last one kept, reqs[outidx]
+    outidx = 0;
+    for (i = 1; i < num_reqs; i++) {
+        int merge = 0;
+        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
+        int64_t newreq_last = reqs[i].sector + reqs[i].nb_sectors;
+
+        if (reqs[i].sector <= oldreq_last) {
+            // Sequential/overlapping writes merge for any driver, unless the
+            // new request ends inside the old one (the old tail would be lost)
+            merge = (newreq_last >= oldreq_last);
+        } else if (bs->drv->bdrv_merge_requests) {
+            // The driver may merge across a gap, which is then filled with
+            // zeros (only applicable for unused space in formats like qcow2)
+            merge = bs->drv->bdrv_merge_requests(bs, &reqs[outidx], &reqs[i]);
+        }
+
+        if (merge) {
+            size_t size;
+            QEMUIOVector *qiov = qemu_mallocz(sizeof(*qiov));
+            qemu_iovec_init(qiov,
+                reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
+
+            // Add the first request to the merged one. If the requests are
+            // overlapping, drop the last sectors of the first request.
+            size = (reqs[i].sector - reqs[outidx].sector) << 9;
+            qemu_iovec_concat(qiov, reqs[outidx].qiov, size);
+
+            // We might need to add some zeros between the two requests
+            if (reqs[i].sector > oldreq_last) {
+                size_t zero_bytes = (reqs[i].sector - oldreq_last) << 9;
+                uint8_t *buf = qemu_blockalign(bs, zero_bytes);
+                memset(buf, 0, zero_bytes);
+                qemu_iovec_add(qiov, buf, zero_bytes);
+                mcb->callbacks[i].free_buf = buf;
+            }
+
+            // Add the second request
+            qemu_iovec_concat(qiov, reqs[i].qiov, reqs[i].qiov->size);
+
+            // With an overlap or a gap the length is not the sum of both
+            reqs[outidx].nb_sectors = qiov->size >> 9;
+            reqs[outidx].qiov = qiov;
+            mcb->callbacks[i].free_qiov = qiov;  // freed on completion
+        } else {
+            outidx++;
+            reqs[outidx].sector     = reqs[i].sector;
+            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
+            reqs[outidx].qiov       = reqs[i].qiov;
+        }
+    }
+
+    return outidx + 1;
+}
+
+/*
+ * Submit multiple AIO write requests at once.
+ *
+ * Returns 0 if at least the first write was submitted. The callbacks of all
+ * requests are then invoked through multiwrite_cb(); a write that could not
+ * be submitted is reported to it as -EIO. Returns -1 if nothing was
+ * submitted: every request gets error = -EIO and no callback is called.
+ *
+ * The implementation may sort the reqs array and modify sector, nb_sectors
+ * and qiov, e.g. to merge requests; opaque still identifies each request.
+ */
+int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
+{
+    BlockDriverAIOCB *acb;
+    MultiwriteCB *mcb;
+    int i, ret = 0;
+
+    if (num_reqs == 0) {
+        return 0;
+    }
+
+    mcb = qemu_mallocz(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
+    mcb->num_callbacks = num_reqs;
+    for (i = 0; i < num_reqs; i++) {
+        mcb->callbacks[i].cb = reqs[i].cb;
+        mcb->callbacks[i].opaque = reqs[i].opaque;
+    }
+
+    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
+
+    // Count a dummy request during submission: a write completing early
+    // must not drop num_requests to 0 and free mcb while it is still in use
+    mcb->num_requests = 1;
+
+    for (i = 0; i < num_reqs; i++) {
+        mcb->num_requests++;
+        acb = bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
+            reqs[i].nb_sectors, multiwrite_cb, mcb);
+        if (acb == NULL) {
+            // Only a failure of the very first write fails the whole call
+            mcb->num_requests--;
+            if (i == 0) {
+                goto fail;
+            }
+            ret = -EIO;
+            break;
+        }
+    }
+    // Complete the dummy request, passing on a submission error if any
+    multiwrite_cb(mcb, ret);
+    return 0;
+
+fail:
+    for (i = 0; i < mcb->num_callbacks; i++) {   /* all original requests */
+        reqs[i].error = -EIO;
+        if (mcb->callbacks[i].free_qiov) {
+            qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
+        }
+        qemu_free(mcb->callbacks[i].free_qiov);
+        qemu_vfree(mcb->callbacks[i].free_buf);
+    }
+    qemu_free(mcb);
+    return -1;
+}
+
 void bdrv_aio_cancel(BlockDriverAIOCB *acb)
 {
     acb->pool->cancel(acb);
diff --git a/block.h b/block.h
index 28bf357091..ea6905291e 100644
--- a/block.h
+++ b/block.h
@@ -87,6 +87,21 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
                                   BlockDriverCompletionFunc *cb, void *opaque);
 void bdrv_aio_cancel(BlockDriverAIOCB *acb);
 
+typedef struct BlockRequest {       /* one write passed to bdrv_aio_multiwrite */
+    /* Fields to be filled by multiwrite caller */
+    int64_t sector;                 /* first sector to write */
+    int nb_sectors;                 /* number of sectors to write */
+    QEMUIOVector *qiov;             /* data to write */
+    BlockDriverCompletionFunc *cb;  /* called with opaque and the result */
+    void *opaque;                   /* first argument passed to cb */
+
+    /* Filled by multiwrite implementation */
+    int error;                      /* set non-zero if submission failed */
+} BlockRequest;
+
+int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs,
+    int num_reqs);   /* 0 if submitted, -1 if nothing could be submitted */
+
 /* sg packet commands */
 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf);
 BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
diff --git a/block_int.h b/block_int.h
index 0902fd4733..3e4b4a387b 100644
--- a/block_int.h
+++ b/block_int.h
@@ -70,6 +70,12 @@ struct BlockDriver {
         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
         BlockDriverCompletionFunc *cb, void *opaque);
 
+    int (*bdrv_aio_multiwrite)(BlockDriverState *bs, BlockRequest *reqs,
+        int num_reqs);  /* NOTE(review): not called by block.c in this patch */
+    /* Optional; non-zero lets block.c merge a and b across a zero-filled gap */
+    int (*bdrv_merge_requests)(BlockDriverState *bs, BlockRequest* a,
+        BlockRequest *b);
+
     const char *protocol_name;
     int (*bdrv_truncate)(BlockDriverState *bs, int64_t offset);
     int64_t (*bdrv_getlength)(BlockDriverState *bs);
diff --git a/cutils.c b/cutils.c
index bd9a01950a..ffe5c717e4 100644
--- a/cutils.c
+++ b/cutils.c
@@ -151,6 +151,31 @@ void qemu_iovec_add(QEMUIOVector *qiov, void *base, size_t len)
     ++qiov->niov;
 }
 
+/*
+ * Appends iovec entries from src to dst, in order, until either all of src
+ * has been appended or dst has grown by size bytes. If an entry would
+ * exceed size, only its leading part is appended so that exactly size bytes
+ * are added. Entries are handed to qemu_iovec_add() as (base, length) pairs.
+ */
+void qemu_iovec_concat(QEMUIOVector *dst, QEMUIOVector *src, size_t size)
+{
+    size_t copied = 0;
+    int idx = 0;
+
+    assert(dst->nalloc != -1);
+
+    while (idx < src->niov && copied != size) {
+        struct iovec *cur = &src->iov[idx++];
+
+        if (copied + cur->iov_len > size) {
+            qemu_iovec_add(dst, cur->iov_base, size - copied);
+            return;
+        }
+        qemu_iovec_add(dst, cur->iov_base, cur->iov_len);
+        copied += cur->iov_len;
+    }
+}
+
 void qemu_iovec_destroy(QEMUIOVector *qiov)
 {
     assert(qiov->nalloc != -1);
diff --git a/qemu-common.h b/qemu-common.h
index 74ac88fba0..f3cfb683ea 100644
--- a/qemu-common.h
+++ b/qemu-common.h
@@ -223,6 +223,7 @@ typedef struct QEMUIOVector {
 void qemu_iovec_init(QEMUIOVector *qiov, int alloc_hint);
 void qemu_iovec_init_external(QEMUIOVector *qiov, struct iovec *iov, int niov);
 void qemu_iovec_add(QEMUIOVector *qiov, void *base, size_t len);
+void qemu_iovec_concat(QEMUIOVector *dst, QEMUIOVector *src, size_t size);
 void qemu_iovec_destroy(QEMUIOVector *qiov);
 void qemu_iovec_reset(QEMUIOVector *qiov);
 void qemu_iovec_to_buffer(QEMUIOVector *qiov, void *buf);