57 files changed, 2507 insertions, 478 deletions
diff --git a/MAINTAINERS b/MAINTAINERS
index 906f252477..59940f97ad 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1000,3 +1000,9 @@ SSH
 M: Richard W.M. Jones <rjones@redhat.com>
 S: Supported
 F: block/ssh.c
+
+ARCHIPELAGO
+M: Chrysostomos Nanakos <cnanakos@grnet.gr>
+M: Chrysostomos Nanakos <chris@include.gr>
+S: Maintained
+F: block/archipelago.c
diff --git a/block-migration.c b/block-migration.c
index 73cdd07e8c..ba3ed36f77 100644
--- a/block-migration.c
+++ b/block-migration.c
@@ -186,7 +186,7 @@ static int bmds_aio_inflight(BlkMigDevState *bmds, int64_t sector)
 {
     int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
 
-    if ((sector << BDRV_SECTOR_BITS) < bdrv_getlength(bmds->bs)) {
+    if (sector < bdrv_nb_sectors(bmds->bs)) {
         return !!(bmds->aio_bitmap[chunk / (sizeof(unsigned long) * 8)] &
             (1UL << (chunk % (sizeof(unsigned long) * 8))));
     } else {
@@ -223,8 +223,7 @@ static void alloc_aio_bitmap(BlkMigDevState *bmds)
     BlockDriverState *bs = bmds->bs;
     int64_t bitmap_size;
 
-    bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
-            BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
+    bitmap_size = bdrv_nb_sectors(bs) + BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
     bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;
 
     bmds->aio_bitmap = g_malloc0(bitmap_size);
@@ -350,7 +349,7 @@ static void init_blk_migration_it(void *opaque, BlockDriverState *bs)
     int64_t sectors;
 
     if (!bdrv_is_read_only(bs)) {
-        sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
+        sectors = bdrv_nb_sectors(bs);
         if (sectors <= 0) {
             return;
         }
@@ -799,7 +798,7 @@ static int block_load(QEMUFile *f, void *opaque, int version_id)
 
             if (bs != bs_prev) {
                 bs_prev = bs;
-                total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
+                total_sectors = bdrv_nb_sectors(bs);
                 if (total_sectors <= 0) {
                     error_report("Error getting length of block device %s",
                                  device_name);
diff --git a/block.c b/block.c
index 8cf519ba3a..6fa0201074 100644
--- a/block.c
+++ b/block.c
@@ -57,6 +57,8 @@ struct BdrvDirtyBitmap {
 
 #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
 
+#define COROUTINE_POOL_RESERVATION 64 /* number of coroutines to reserve */
+
 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
@@ -701,6 +703,7 @@ static int find_image_format(BlockDriverState *bs, const char *filename,
 
 /**
  * Set the current 'total_sectors' value
+ * Return 0 on success, -errno on error.
  */
 static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
 {
@@ -1313,7 +1316,6 @@ int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)
         error_setg_errno(errp, -total_size, "Could not get image size");
         goto out;
     }
-    total_size &= BDRV_SECTOR_MASK;
 
     /* Create the temporary image */
     ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
@@ -2107,6 +2109,9 @@ int bdrv_attach_dev(BlockDriverState *bs, void *dev)
     }
     bs->dev = dev;
     bdrv_iostatus_reset(bs);
+
+    /* We're expecting I/O from the device so bump up coroutine pool size */
+    qemu_coroutine_adjust_pool_size(COROUTINE_POOL_RESERVATION);
     return 0;
 }
 
@@ -2126,6 +2131,7 @@ void bdrv_detach_dev(BlockDriverState *bs, void *dev)
     bs->dev_ops = NULL;
     bs->dev_opaque = NULL;
     bs->guest_block_size = 512;
+    qemu_coroutine_adjust_pool_size(-COROUTINE_POOL_RESERVATION);
 }
 
 /* TODO change to return DeviceState * when all users are qdevified */
@@ -2203,6 +2209,9 @@ bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
  */
 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
 {
+    if (bs->drv == NULL) {
+        return -ENOMEDIUM;
+    }
     if (bs->drv->bdrv_check == NULL) {
         return -ENOTSUP;
     }
@@ -2269,7 +2278,14 @@ int bdrv_commit(BlockDriverState *bs)
     }
 
     total_sectors = length >> BDRV_SECTOR_BITS;
-    buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
+
+    /* qemu_try_blockalign() for bs will choose an alignment that works for
+     * bs->backing_hd as well, so no need to compare the alignment manually. */
+    buf = qemu_try_blockalign(bs, COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
+    if (buf == NULL) {
+        ret = -ENOMEM;
+        goto ro_cleanup;
+    }
 
     for (sector = 0; sector < total_sectors; sector += n) {
         ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
@@ -2307,7 +2323,7 @@ int bdrv_commit(BlockDriverState *bs)
 
     ret = 0;
 ro_cleanup:
-    g_free(buf);
+    qemu_vfree(buf);
 
     if (ro) {
         /* ignoring error return here */
@@ -2827,18 +2843,16 @@ int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
  */
 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
 {
-    int64_t target_size;
-    int64_t ret, nb_sectors, sector_num = 0;
+    int64_t target_sectors, ret, nb_sectors, sector_num = 0;
     int n;
 
-    target_size = bdrv_getlength(bs);
-    if (target_size < 0) {
-        return target_size;
+    target_sectors = bdrv_nb_sectors(bs);
+    if (target_sectors < 0) {
+        return target_sectors;
     }
-    target_size /= BDRV_SECTOR_SIZE;
 
     for (;;) {
-        nb_sectors = target_size - sector_num;
+        nb_sectors = target_sectors - sector_num;
         if (nb_sectors <= 0) {
             return 0;
         }
@@ -2968,7 +2982,12 @@ static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
                                    cluster_sector_num, cluster_nb_sectors);
 
     iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
-    iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
+    iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
+    if (bounce_buffer == NULL) {
+        ret = -ENOMEM;
+        goto err;
+    }
+
     qemu_iovec_init_external(&bounce_qiov, &iov, 1);
 
     ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
@@ -3056,15 +3075,14 @@ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
         ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
     } else {
         /* Read zeros after EOF of growable BDSes */
-        int64_t len, total_sectors, max_nb_sectors;
+        int64_t total_sectors, max_nb_sectors;
 
-        len = bdrv_getlength(bs);
-        if (len < 0) {
-            ret = len;
+        total_sectors = bdrv_nb_sectors(bs);
+        if (total_sectors < 0) {
+            ret = total_sectors;
             goto out;
         }
 
-        total_sectors = DIV_ROUND_UP(len, BDRV_SECTOR_SIZE);
         max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
                                   align >> BDRV_SECTOR_BITS);
         if (max_nb_sectors > 0) {
@@ -3253,7 +3271,11 @@ static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
             /* Fall back to bounce buffer if write zeroes is unsupported */
             iov.iov_len = num * BDRV_SECTOR_SIZE;
             if (iov.iov_base == NULL) {
-                iov.iov_base = qemu_blockalign(bs, num * BDRV_SECTOR_SIZE);
+                iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE);
+                if (iov.iov_base == NULL) {
+                    ret = -ENOMEM;
+                    goto fail;
+                }
                 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
             }
             qemu_iovec_init_external(&qiov, &iov, 1);
@@ -3273,6 +3295,7 @@ static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
         nb_sectors -= num;
     }
 
+fail:
     qemu_vfree(iov.iov_base);
     return ret;
 }
@@ -3536,11 +3559,12 @@ int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
 }
 
 /**
- * Length of a file in bytes. Return < 0 if error or unknown.
+ * Return number of sectors on success, -errno on error.
  */
-int64_t bdrv_getlength(BlockDriverState *bs)
+int64_t bdrv_nb_sectors(BlockDriverState *bs)
 {
     BlockDriver *drv = bs->drv;
+
     if (!drv)
         return -ENOMEDIUM;
 
@@ -3550,19 +3574,26 @@ int64_t bdrv_getlength(BlockDriverState *bs)
             return ret;
         }
     }
-    return bs->total_sectors * BDRV_SECTOR_SIZE;
+    return bs->total_sectors;
+}
+
+/**
+ * Return length in bytes on success, -errno on error.
+ * The length is always a multiple of BDRV_SECTOR_SIZE.
+ */
+int64_t bdrv_getlength(BlockDriverState *bs)
+{
+    int64_t ret = bdrv_nb_sectors(bs);
+
+    return ret < 0 ? ret : ret * BDRV_SECTOR_SIZE;
 }
 
 /* return 0 as number of sectors if no device present or error */
 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
 {
-    int64_t length;
-    length = bdrv_getlength(bs);
-    if (length < 0)
-        length = 0;
-    else
-        length = length >> BDRV_SECTOR_BITS;
-    *nb_sectors_ptr = length;
+    int64_t nb_sectors = bdrv_nb_sectors(bs);
+
+    *nb_sectors_ptr = nb_sectors < 0 ? 0 : nb_sectors;
 }
 
 void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
@@ -3945,21 +3976,21 @@ static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
                                                      int64_t sector_num,
                                                      int nb_sectors, int *pnum)
 {
-    int64_t length;
+    int64_t total_sectors;
     int64_t n;
     int64_t ret, ret2;
 
-    length = bdrv_getlength(bs);
-    if (length < 0) {
-        return length;
+    total_sectors = bdrv_nb_sectors(bs);
+    if (total_sectors < 0) {
+        return total_sectors;
     }
 
-    if (sector_num >= (length >> BDRV_SECTOR_BITS)) {
+    if (sector_num >= total_sectors) {
         *pnum = 0;
         return 0;
     }
 
-    n = bs->total_sectors - sector_num;
+    n = total_sectors - sector_num;
     if (n < nb_sectors) {
         nb_sectors = n;
     }
@@ -3994,8 +4025,8 @@ static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
             ret |= BDRV_BLOCK_ZERO;
         } else if (bs->backing_hd) {
             BlockDriverState *bs2 = bs->backing_hd;
-            int64_t length2 = bdrv_getlength(bs2);
-            if (length2 >= 0 && sector_num >= (length2 >> BDRV_SECTOR_BITS)) {
+            int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
+            if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
                 ret |= BDRV_BLOCK_ZERO;
             }
         }
@@ -4607,8 +4638,9 @@ static void bdrv_aio_bh_cb(void *opaque)
 {
     BlockDriverAIOCBSync *acb = opaque;
 
-    if (!acb->is_write)
+    if (!acb->is_write && acb->ret >= 0) {
         qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
+    }
     qemu_vfree(acb->bounce);
     acb->common.cb(acb->common.opaque, acb->ret);
     qemu_bh_delete(acb->bh);
@@ -4630,10 +4662,12 @@ static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
     acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
     acb->is_write = is_write;
     acb->qiov = qiov;
-    acb->bounce = qemu_blockalign(bs, qiov->size);
+    acb->bounce = qemu_try_blockalign(bs, qiov->size);
     acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);
 
-    if (is_write) {
+    if (acb->bounce == NULL) {
+        acb->ret = -ENOMEM;
+    } else if (is_write) {
         qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
         acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
     } else {
@@ -5247,6 +5281,19 @@ void *qemu_blockalign(BlockDriverState *bs, size_t size)
     return qemu_memalign(bdrv_opt_mem_align(bs), size);
 }
 
+void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
+{
+    size_t align = bdrv_opt_mem_align(bs);
+
+    /* Ensure that NULL is never returned on success */
+    assert(align > 0);
+    if (size == 0) {
+        size = align;
+    }
+
+    return qemu_try_memalign(align, size);
+}
+
 /*
  * Check if all memory in this vector is sector aligned.
  */
@@ -5277,13 +5324,12 @@ BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity,
 
     granularity >>= BDRV_SECTOR_BITS;
     assert(granularity);
-    bitmap_size = bdrv_getlength(bs);
+    bitmap_size = bdrv_nb_sectors(bs);
     if (bitmap_size < 0) {
         error_setg_errno(errp, -bitmap_size, "could not get length of device");
         errno = -bitmap_size;
         return NULL;
     }
-    bitmap_size >>= BDRV_SECTOR_BITS;
     bitmap = g_malloc0(sizeof(BdrvDirtyBitmap));
     bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
     QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
@@ -5371,6 +5417,9 @@ void bdrv_ref(BlockDriverState *bs)
  * deleted. */
 void bdrv_unref(BlockDriverState *bs)
 {
+    if (!bs) {
+        return;
+    }
     assert(bs->refcnt > 0);
     if (--bs->refcnt == 0) {
         bdrv_delete(bs);
@@ -5591,7 +5640,7 @@ void bdrv_img_create(const char *filename, const char *fmt,
     if (size == -1) {
         if (backing_file) {
             BlockDriverState *bs;
-            uint64_t size;
+            int64_t size;
             int back_flags;
 
             /* backing files always opened read-only */
@@ -5609,8 +5658,13 @@ void bdrv_img_create(const char *filename, const char *fmt,
                 local_err = NULL;
                 goto out;
             }
-            bdrv_get_geometry(bs, &size);
-            size *= 512;
+            size = bdrv_getlength(bs);
+            if (size < 0) {
+                error_setg_errno(errp, -size, "Could not get size of '%s'",
+                                 backing_file);
+                bdrv_unref(bs);
+                goto out;
+            }
 
             qemu_opt_set_number(opts, BLOCK_OPT_SIZE, size);
 
diff --git a/block/Makefile.objs b/block/Makefile.objs
index fd88c03ece..858d2b387b 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -17,6 +17,7 @@ block-obj-$(CONFIG_LIBNFS) += nfs.o
 block-obj-$(CONFIG_CURL) += curl.o
 block-obj-$(CONFIG_RBD) += rbd.o
 block-obj-$(CONFIG_GLUSTERFS) += gluster.o
+block-obj-$(CONFIG_ARCHIPELAGO) += archipelago.o
 block-obj-$(CONFIG_LIBSSH2) += ssh.o
 endif
 
@@ -35,5 +36,6 @@ gluster.o-cflags   := $(GLUSTERFS_CFLAGS)
 gluster.o-libs     := $(GLUSTERFS_LIBS)
 ssh.o-cflags       := $(LIBSSH2_CFLAGS)
 ssh.o-libs         := $(LIBSSH2_LIBS)
+archipelago.o-libs := $(ARCHIPELAGO_LIBS)
 qcow.o-libs        := -lz
 linux-aio.o-libs   := -laio
diff --git a/block/archipelago.c b/block/archipelago.c
new file mode 100644
index 0000000000..6629d03a1d
--- /dev/null
+++ b/block/archipelago.c
@@ -0,0 +1,1069 @@
+/*
+ * QEMU Block driver for Archipelago
+ *
+ * Copyright (C) 2014 Chrysostomos Nanakos <cnanakos@grnet.gr>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+/*
+ * VM Image on Archipelago volume is specified like this:
+ *
+ * file.driver=archipelago,file.volume=<volumename>
+ * [,file.mport=<mapperd_port>[,file.vport=<vlmcd_port>]
+ * [,file.segment=<segment_name>]]
+ *
+ * or
+ *
+ * file=archipelago:<volumename>[/mport=<mapperd_port>[:vport=<vlmcd_port>][:
+ * segment=<segment_name>]]
+ *
+ * 'archipelago' is the protocol.
+ *
+ * 'mport' is the port number on which mapperd is listening. This is optional
+ * and if not specified, QEMU will make Archipelago to use the default port.
+ *
+ * 'vport' is the port number on which vlmcd is listening. This is optional
+ * and if not specified, QEMU will make Archipelago to use the default port.
+ *
+ * 'segment' is the name of the shared memory segment Archipelago stack
+ * is using. This is optional and if not specified, QEMU will make Archipelago
+ * to use the default value, 'archipelago'.
+ *
+ * Examples:
+ *
+ * file.driver=archipelago,file.volume=my_vm_volume
+ * file.driver=archipelago,file.volume=my_vm_volume,file.mport=123
+ * file.driver=archipelago,file.volume=my_vm_volume,file.mport=123,
+ *  file.vport=1234
+ * file.driver=archipelago,file.volume=my_vm_volume,file.mport=123,
+ *  file.vport=1234,file.segment=my_segment
+ *
+ * or
+ *
+ * file=archipelago:my_vm_volume
+ * file=archipelago:my_vm_volume/mport=123
+ * file=archipelago:my_vm_volume/mport=123:vport=1234
+ * file=archipelago:my_vm_volume/mport=123:vport=1234:segment=my_segment
+ *
+ */
+
+#include "qemu-common.h"
+#include "block/block_int.h"
+#include "qemu/error-report.h"
+#include "qemu/thread.h"
+#include "qapi/qmp/qint.h"
+#include "qapi/qmp/qstring.h"
+#include "qapi/qmp/qjson.h"
+
+#include <inttypes.h>
+#include <xseg/xseg.h>
+#include <xseg/protocol.h>
+
+#define ARCHIP_FD_READ      0
+#define ARCHIP_FD_WRITE     1
+#define MAX_REQUEST_SIZE    524288
+
+#define ARCHIPELAGO_OPT_VOLUME      "volume"
+#define ARCHIPELAGO_OPT_SEGMENT     "segment"
+#define ARCHIPELAGO_OPT_MPORT       "mport"
+#define ARCHIPELAGO_OPT_VPORT       "vport"
+#define ARCHIPELAGO_DFL_MPORT       1001
+#define ARCHIPELAGO_DFL_VPORT       501
+
+#define archipelagolog(fmt, ...) \
+    do {                         \
+        fprintf(stderr, "archipelago\t%-24s: " fmt, __func__, ##__VA_ARGS__); \
+    } while (0)
+
+typedef enum {
+    ARCHIP_OP_READ,
+    ARCHIP_OP_WRITE,
+    ARCHIP_OP_FLUSH,
+    ARCHIP_OP_VOLINFO,
+} ARCHIPCmd;
+
+typedef struct ArchipelagoAIOCB {
+    BlockDriverAIOCB common;
+    QEMUBH *bh;
+    struct BDRVArchipelagoState *s;
+    QEMUIOVector *qiov;
+    ARCHIPCmd cmd;
+    bool cancelled;
+    int status;
+    int64_t size;
+    int64_t ret;
+} ArchipelagoAIOCB;
+
+typedef struct BDRVArchipelagoState {
+    ArchipelagoAIOCB *event_acb;
+    char *volname;
+    char *segment_name;
+    uint64_t size;
+    /* Archipelago specific */
+    struct xseg *xseg;
+    struct xseg_port *port;
+    xport srcport;
+    xport sport;
+    xport mportno;
+    xport vportno;
+    QemuMutex archip_mutex;
+    QemuCond archip_cond;
+    bool is_signaled;
+    /* Request handler specific */
+    QemuThread request_th;
+    QemuCond request_cond;
+    QemuMutex request_mutex;
+    bool th_is_signaled;
+    bool stopping;
+} BDRVArchipelagoState;
+
+typedef struct ArchipelagoSegmentedRequest {
+    size_t count;
+    size_t total;
+    int ref;
+    int failed;
+} ArchipelagoSegmentedRequest;
+
+typedef struct AIORequestData {
+    const char *volname;
+    off_t offset;
+    size_t size;
+    uint64_t bufidx;
+    int ret;
+    int op;
+    ArchipelagoAIOCB *aio_cb;
+    ArchipelagoSegmentedRequest *segreq;
+} AIORequestData;
+
+static void qemu_archipelago_complete_aio(void *opaque);
+
+static void init_local_signal(struct xseg *xseg, xport sport, xport srcport)
+{
+    if (xseg && (sport != srcport)) {
+        xseg_init_local_signal(xseg, srcport);
+        sport = srcport;
+    }
+}
+
+static void archipelago_finish_aiocb(AIORequestData *reqdata)
+{
+    if (reqdata->aio_cb->ret != reqdata->segreq->total) {
+        reqdata->aio_cb->ret = -EIO;
+    } else if (reqdata->aio_cb->ret == reqdata->segreq->total) {
+        reqdata->aio_cb->ret = 0;
+    }
+    reqdata->aio_cb->bh = aio_bh_new(
+                        bdrv_get_aio_context(reqdata->aio_cb->common.bs),
+                        qemu_archipelago_complete_aio, reqdata
+                        );
+    qemu_bh_schedule(reqdata->aio_cb->bh);
+}
+
+static int wait_reply(struct xseg *xseg, xport srcport, struct xseg_port *port,
+                      struct xseg_request *expected_req)
+{
+    struct xseg_request *req;
+    xseg_prepare_wait(xseg, srcport);
+    void *psd = xseg_get_signal_desc(xseg, port);
+    while (1) {
+        req = xseg_receive(xseg, srcport, X_NONBLOCK);
+        if (req) {
+            if (req != expected_req) {
+                archipelagolog("Unknown received request\n");
+                xseg_put_request(xseg, req, srcport);
+            } else if (!(req->state & XS_SERVED)) {
+                return -1;
+            } else {
+                break;
+            }
+        }
+        xseg_wait_signal(xseg, psd, 100000UL);
+    }
+    xseg_cancel_wait(xseg, srcport);
+    return 0;
+}
+
+static void xseg_request_handler(void *state)
+{
+    BDRVArchipelagoState *s = (BDRVArchipelagoState *) state;
+    void *psd = xseg_get_signal_desc(s->xseg, s->port);
+    qemu_mutex_lock(&s->request_mutex);
+
+    while (!s->stopping) {
+        struct xseg_request *req;
+        void *data;
+        xseg_prepare_wait(s->xseg, s->srcport);
+        req = xseg_receive(s->xseg, s->srcport, X_NONBLOCK);
+        if (req) {
+            AIORequestData *reqdata;
+            ArchipelagoSegmentedRequest *segreq;
+            xseg_get_req_data(s->xseg, req, (void **)&reqdata);
+
+            switch (reqdata->op) {
+            case ARCHIP_OP_READ:
+                data = xseg_get_data(s->xseg, req);
+                segreq = reqdata->segreq;
+                segreq->count += req->serviced;
+
+                qemu_iovec_from_buf(reqdata->aio_cb->qiov, reqdata->bufidx,
+                                    data,
+                                    req->serviced);
+
+                xseg_put_request(s->xseg, req, s->srcport);
+
+                if ((__sync_add_and_fetch(&segreq->ref, -1)) == 0) {
+                    if (!segreq->failed) {
+                        reqdata->aio_cb->ret = segreq->count;
+                        archipelago_finish_aiocb(reqdata);
+                        g_free(segreq);
+                    } else {
+                        g_free(segreq);
+                        g_free(reqdata);
+                    }
+                } else {
+                    g_free(reqdata);
+                }
+                break;
+            case ARCHIP_OP_WRITE:
+            case ARCHIP_OP_FLUSH:
+                segreq = reqdata->segreq;
+                segreq->count += req->serviced;
+                xseg_put_request(s->xseg, req, s->srcport);
+
+                if ((__sync_add_and_fetch(&segreq->ref, -1)) == 0) {
+                    if (!segreq->failed) {
+                        reqdata->aio_cb->ret = segreq->count;
+                        archipelago_finish_aiocb(reqdata);
+                        g_free(segreq);
+                    } else {
+                        g_free(segreq);
+                        g_free(reqdata);
+                    }
+                } else {
+                    g_free(reqdata);
+                }
+                break;
+            case ARCHIP_OP_VOLINFO:
+                s->is_signaled = true;
+                qemu_cond_signal(&s->archip_cond);
+                break;
+            }
+        } else {
+            xseg_wait_signal(s->xseg, psd, 100000UL);
+        }
+        xseg_cancel_wait(s->xseg, s->srcport);
+    }
+
+    s->th_is_signaled = true;
+    qemu_cond_signal(&s->request_cond);
+    qemu_mutex_unlock(&s->request_mutex);
+    qemu_thread_exit(NULL);
+}
+
+static int qemu_archipelago_xseg_init(BDRVArchipelagoState *s)
+{
+    if (xseg_initialize()) {
+        archipelagolog("Cannot initialize XSEG\n");
+        goto err_exit;
+    }
+
+    s->xseg = xseg_join("posix", s->segment_name,
+                        "posixfd", NULL);
+    if (!s->xseg) {
+        archipelagolog("Cannot join XSEG shared memory segment\n");
+        goto err_exit;
+    }
+    s->port = xseg_bind_dynport(s->xseg);
+    s->srcport = s->port->portno;
+    init_local_signal(s->xseg, s->sport, s->srcport);
+    return 0;
+
+err_exit:
+    return -1;
+}
+
+static int qemu_archipelago_init(BDRVArchipelagoState *s)
+{
+    int ret;
+
+    ret = qemu_archipelago_xseg_init(s);
+    if (ret < 0) {
+        error_report("Cannot initialize XSEG. Aborting...\n");
+        goto err_exit;
+    }
+
+    qemu_cond_init(&s->archip_cond);
+    qemu_mutex_init(&s->archip_mutex);
+    qemu_cond_init(&s->request_cond);
+    qemu_mutex_init(&s->request_mutex);
+    s->th_is_signaled = false;
+    qemu_thread_create(&s->request_th, "xseg_io_th",
+                       (void *) xseg_request_handler,
+                       (void *) s, QEMU_THREAD_JOINABLE);
+
+err_exit:
+    return ret;
+}
+
+static void qemu_archipelago_complete_aio(void *opaque)
+{
+    AIORequestData *reqdata = (AIORequestData *) opaque;
+    ArchipelagoAIOCB *aio_cb = (ArchipelagoAIOCB *) reqdata->aio_cb;
+
+    qemu_bh_delete(aio_cb->bh);
+    aio_cb->common.cb(aio_cb->common.opaque, aio_cb->ret);
+    aio_cb->status = 0;
+
+    if (!aio_cb->cancelled) {
+        qemu_aio_release(aio_cb);
+    }
+    g_free(reqdata);
+}
+
+static void xseg_find_port(char *pstr, const char *needle, xport *aport)
+{
+    const char *a;
+    char *endptr = NULL;
+    unsigned long port;
+    if (strstart(pstr, needle, &a)) {
+        if (strlen(a) > 0) {
+            port = strtoul(a, &endptr, 10);
+            if (strlen(endptr)) {
+                *aport = -2;
+                return;
+            }
+            *aport = (xport) port;
+        }
+    }
+}
+
+static void xseg_find_segment(char *pstr, const char *needle,
+                              char **segment_name)
+{
+    const char *a;
+    if (strstart(pstr, needle, &a)) {
+        if (strlen(a) > 0) {
+            *segment_name = g_strdup(a);
+        }
+    }
+}
+
+static void parse_filename_opts(const char *filename, Error **errp,
+                                char **volume, char **segment_name,
+                                xport *mport, xport *vport)
+{
+    const char *start;
+    char *tokens[4], *ds;
+    int idx;
+    xport lmport = NoPort, lvport = NoPort;
+
+    strstart(filename, "archipelago:", &start);
+
+    ds = g_strdup(start);
+    tokens[0] = strtok(ds, "/");
+    tokens[1] = strtok(NULL, ":");
+    tokens[2] = strtok(NULL, ":");
+    tokens[3] = strtok(NULL, "\0");
+
+    if (!strlen(tokens[0])) {
+        error_setg(errp, "volume name must be specified first");
+        g_free(ds);
+        return;
+    }
+
+    for (idx = 1; idx < 4; idx++) {
+        if (tokens[idx] != NULL) {
+            if (strstart(tokens[idx], "mport=", NULL)) {
+                xseg_find_port(tokens[idx], "mport=", &lmport);
+            }
+            if (strstart(tokens[idx], "vport=", NULL)) {
+                xseg_find_port(tokens[idx], "vport=", &lvport);
+            }
+            if (strstart(tokens[idx], "segment=", NULL)) {
+                xseg_find_segment(tokens[idx], "segment=", segment_name);
+            }
+        }
+    }
+
+    if ((lmport == -2) || (lvport == -2)) {
+        error_setg(errp, "mport and/or vport must be set");
+        g_free(ds);
+        return;
+    }
+    *volume = g_strdup(tokens[0]);
+    *mport = lmport;
+    *vport = lvport;
+    g_free(ds);
+}
+
+static void archipelago_parse_filename(const char *filename, QDict *options,
+                                       Error **errp)
+{
+    const char *start;
+    char *volume = NULL, *segment_name = NULL;
+    xport mport = NoPort, vport = NoPort;
+
+    if (qdict_haskey(options, ARCHIPELAGO_OPT_VOLUME)
+            || qdict_haskey(options, ARCHIPELAGO_OPT_SEGMENT)
+            || qdict_haskey(options, ARCHIPELAGO_OPT_MPORT)
+            || qdict_haskey(options, ARCHIPELAGO_OPT_VPORT)) {
+        error_setg(errp, "volume/mport/vport/segment and a file name may not"
+                         " be specified at the same time");
+        return;
+    }
+
+    if (!strstart(filename, "archipelago:", &start)) {
+        error_setg(errp, "File name must start with 'archipelago:'");
+        return;
+    }
+
+    if (!strlen(start) || strstart(start, "/", NULL)) {
+        error_setg(errp, "volume name must be specified");
+        return;
+    }
+
+    parse_filename_opts(filename, errp, &volume, &segment_name, &mport, &vport);
+
+    if (volume) {
+        qdict_put(options, ARCHIPELAGO_OPT_VOLUME, qstring_from_str(volume));
+        g_free(volume);
+    }
+    if (segment_name) {
+        qdict_put(options, ARCHIPELAGO_OPT_SEGMENT,
+                  qstring_from_str(segment_name));
+        g_free(segment_name);
+    }
+    if (mport != NoPort) {
+        qdict_put(options, ARCHIPELAGO_OPT_MPORT, qint_from_int(mport));
+    }
+    if (vport != NoPort) {
+        qdict_put(options, ARCHIPELAGO_OPT_VPORT, qint_from_int(vport));
+    }
+}
+
+static QemuOptsList archipelago_runtime_opts = {
+    .name = "archipelago",
+    .head = QTAILQ_HEAD_INITIALIZER(archipelago_runtime_opts.head),
+    .desc = {
+        {
+            .name = ARCHIPELAGO_OPT_VOLUME,
+            .type = QEMU_OPT_STRING,
+            .help = "Name of the volume image",
+        },
+        {
+            .name = ARCHIPELAGO_OPT_SEGMENT,
+            .type = QEMU_OPT_STRING,
+            .help = "Name of the Archipelago shared memory segment",
+        },
+        {
+            .name = ARCHIPELAGO_OPT_MPORT,
+            .type = QEMU_OPT_NUMBER,
+            .help = "Archipelago mapperd port number"
+        },
+        {
+            .name = ARCHIPELAGO_OPT_VPORT,
+            .type = QEMU_OPT_NUMBER,
+            .help = "Archipelago vlmcd port number"
+
+        },
+        { /* end of list */ }
+    },
+};
+
+static int qemu_archipelago_open(BlockDriverState *bs,
+                                 QDict *options,
+                                 int bdrv_flags,
+                                 Error **errp)
+{
+    int ret = 0;
+    const char *volume, *segment_name;
+    QemuOpts *opts;
+    Error *local_err = NULL;
+    BDRVArchipelagoState *s = bs->opaque;
+
+    opts = qemu_opts_create(&archipelago_runtime_opts, NULL, 0, &error_abort);
+    qemu_opts_absorb_qdict(opts, options, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        ret = -EINVAL;
+        goto err_exit;
+    }
+
+    s->mportno = qemu_opt_get_number(opts, ARCHIPELAGO_OPT_MPORT,
+                                     ARCHIPELAGO_DFL_MPORT);
+    s->vportno = qemu_opt_get_number(opts, ARCHIPELAGO_OPT_VPORT,
+                                     ARCHIPELAGO_DFL_VPORT);
+
+    segment_name = qemu_opt_get(opts, ARCHIPELAGO_OPT_SEGMENT);
+    if (segment_name == NULL) {
+        s->segment_name = g_strdup("archipelago");
+    } else {
+        s->segment_name = g_strdup(segment_name);
+    }
+
+    volume = qemu_opt_get(opts, ARCHIPELAGO_OPT_VOLUME);
+    if (volume == NULL) {
+        error_setg(errp, "archipelago block driver requires the 'volume'"
+                   " option");
+        ret = -EINVAL;
+        goto err_exit;
+    }
+    s->volname = g_strdup(volume);
+
+    /* Initialize XSEG, join shared memory segment */
+    ret = qemu_archipelago_init(s);
+    if (ret < 0) {
+        error_setg(errp, "cannot initialize XSEG and join shared "
+                   "memory segment");
+        goto err_exit;
+    }
+
+    qemu_opts_del(opts);
+    return 0;
+
+err_exit:
+    g_free(s->volname);
+    g_free(s->segment_name);
+    qemu_opts_del(opts);
+    return ret;
+}
+
+static void qemu_archipelago_close(BlockDriverState *bs)
+{
+    int r, targetlen;
+    char *target;
+    struct xseg_request *req;
+    BDRVArchipelagoState *s = bs->opaque;
+
+    s->stopping = true;
+
+    qemu_mutex_lock(&s->request_mutex);
+    while (!s->th_is_signaled) {
+        qemu_cond_wait(&s->request_cond,
+                       &s->request_mutex);
+    }
+    qemu_mutex_unlock(&s->request_mutex);
+    qemu_thread_join(&s->request_th);
+    qemu_cond_destroy(&s->request_cond);
+    qemu_mutex_destroy(&s->request_mutex);
+
+    qemu_cond_destroy(&s->archip_cond);
+    qemu_mutex_destroy(&s->archip_mutex);
+
+    targetlen = strlen(s->volname);
+    req = xseg_get_request(s->xseg, s->srcport, s->vportno, X_ALLOC);
+    if (!req) {
+        archipelagolog("Cannot get XSEG request\n");
+        goto err_exit;
+    }
+    r = xseg_prep_request(s->xseg, req, targetlen, 0);
+    if (r < 0) {
+        xseg_put_request(s->xseg, req, s->srcport);
+        archipelagolog("Cannot prepare XSEG close request\n");
+        goto err_exit;
+    }
+
+    target = xseg_get_target(s->xseg, req);
+    memcpy(target, s->volname, targetlen);
+    req->size = req->datalen;
+    req->offset = 0;
+    req->op = X_CLOSE;
+
+    xport p = xseg_submit(s->xseg, req, s->srcport, X_ALLOC);
+    if (p == NoPort) {
+        xseg_put_request(s->xseg, req, s->srcport);
+        archipelagolog("Cannot submit XSEG close request\n");
+        goto err_exit;
+    }
+
+    xseg_signal(s->xseg, p);
+    wait_reply(s->xseg, s->srcport, s->port, req);
+
+    xseg_put_request(s->xseg, req, s->srcport);
+
+err_exit:
+    g_free(s->volname);
+    g_free(s->segment_name);
+    xseg_quit_local_signal(s->xseg, s->srcport);
+    xseg_leave_dynport(s->xseg, s->port);
+    xseg_leave(s->xseg);
+}
+
+static int qemu_archipelago_create_volume(Error **errp, const char *volname,
+                                          char *segment_name,
+                                          uint64_t size, xport mportno,
+                                          xport vportno)
+{
+    int ret, targetlen;
+    struct xseg *xseg = NULL;
+    struct xseg_request *req;
+    struct xseg_request_clone *xclone;
+    struct xseg_port *port;
+    xport srcport = NoPort, sport = NoPort;
+    char *target;
+
+    /* Try default values if none has been set */
+    if (mportno == (xport) -1) {
+        mportno = ARCHIPELAGO_DFL_MPORT;
+    }
+
+    if (vportno == (xport) -1) {
+        vportno = ARCHIPELAGO_DFL_VPORT;
+    }
+
+    if (xseg_initialize()) {
+        error_setg(errp, "Cannot initialize XSEG");
+        return -1;
+    }
+
+    xseg = xseg_join("posix", segment_name,
+                     "posixfd", NULL);
+
+    if (!xseg) {
+        error_setg(errp, "Cannot join XSEG shared memory segment");
+        return -1;
+    }
+
+    port = xseg_bind_dynport(xseg);
+    srcport = port->portno;
+    init_local_signal(xseg, sport, srcport);
+
+    req = xseg_get_request(xseg, srcport, mportno, X_ALLOC);
+    if (!req) {
+        error_setg(errp, "Cannot get XSEG request");
+        return -1;
+    }
+
+    targetlen = strlen(volname);
+    ret = xseg_prep_request(xseg, req, targetlen,
+                            sizeof(struct xseg_request_clone));
+    if (ret < 0) {
+        error_setg(errp, "Cannot prepare XSEG request");
+        goto err_exit;
+    }
+
+    target = xseg_get_target(xseg, req);
+    if (!target) {
+        error_setg(errp, "Cannot get XSEG target.\n");
+        goto err_exit;
+    }
+    memcpy(target, volname, targetlen);
+    xclone = (struct xseg_request_clone *) xseg_get_data(xseg, req);
+    memset(xclone->target, 0 , XSEG_MAX_TARGETLEN);
+    xclone->targetlen = 0;
+    xclone->size = size;
+    req->offset = 0;
+    req->size = req->datalen;
+    req->op = X_CLONE;
+
+    xport p = xseg_submit(xseg, req, srcport, X_ALLOC);
+    if (p == NoPort) {
+        error_setg(errp, "Could not submit XSEG request");
+        goto err_exit;
+    }
+    xseg_signal(xseg, p);
+
+    ret = wait_reply(xseg, srcport, port, req);
+    if (ret < 0) {
+        error_setg(errp, "wait_reply() error.");
+    }
+
+    xseg_put_request(xseg, req, srcport);
+    xseg_quit_local_signal(xseg, srcport);
+    xseg_leave_dynport(xseg, port);
+    xseg_leave(xseg);
+    return ret;
+
+err_exit:
+    xseg_put_request(xseg, req, srcport);
+    xseg_quit_local_signal(xseg, srcport);
+    xseg_leave_dynport(xseg, port);
+    xseg_leave(xseg);
+    return -1;
+}
+
+static int qemu_archipelago_create(const char *filename,
+                                   QemuOpts *options,
+                                   Error **errp)
+{
+    int ret = 0;
+    uint64_t total_size = 0;
+    char *volname = NULL, *segment_name = NULL;
+    const char *start;
+    xport mport = NoPort, vport = NoPort;
+
+    if (!strstart(filename, "archipelago:", &start)) {
+        error_setg(errp, "File name must start with 'archipelago:'");
+        return -1;
+    }
+
+    if (!strlen(start) || strstart(start, "/", NULL)) {
+        error_setg(errp, "volume name must be specified");
+        return -1;
+    }
+
+    parse_filename_opts(filename, errp, &volname, &segment_name, &mport,
+                        &vport);
+    total_size = qemu_opt_get_size_del(options, BLOCK_OPT_SIZE, 0);
+
+    if (segment_name == NULL) {
+        segment_name = g_strdup("archipelago");
+    }
+
+    /* Create an Archipelago volume */
+    ret = qemu_archipelago_create_volume(errp, volname, segment_name,
+                                         total_size, mport,
+                                         vport);
+
+    g_free(volname);
+    g_free(segment_name);
+    return ret;
+}
+
+static void qemu_archipelago_aio_cancel(BlockDriverAIOCB *blockacb)
+{
+    ArchipelagoAIOCB *aio_cb = (ArchipelagoAIOCB *) blockacb;
+    aio_cb->cancelled = true;
+    while (aio_cb->status == -EINPROGRESS) {
+        aio_poll(bdrv_get_aio_context(aio_cb->common.bs), true);
+    }
+    qemu_aio_release(aio_cb);
+}
+
+static const AIOCBInfo archipelago_aiocb_info = {
+    .aiocb_size = sizeof(ArchipelagoAIOCB),
+    .cancel = qemu_archipelago_aio_cancel,
+};
+
+static int archipelago_submit_request(BDRVArchipelagoState *s,
+                                        uint64_t bufidx,
+                                        size_t count,
+                                        off_t offset,
+                                        ArchipelagoAIOCB *aio_cb,
+                                        ArchipelagoSegmentedRequest *segreq,
+                                        int op)
+{
+    int ret, targetlen;
+    char *target;
+    void *data = NULL;
+    struct xseg_request *req;
+    AIORequestData *reqdata = g_malloc(sizeof(AIORequestData));
+
+    targetlen = strlen(s->volname);
+    req = xseg_get_request(s->xseg, s->srcport, s->vportno, X_ALLOC);
+    if (!req) {
+        archipelagolog("Cannot get XSEG request\n");
+        goto err_exit2;
+    }
+    ret = xseg_prep_request(s->xseg, req, targetlen, count);
+    if (ret < 0) {
+        archipelagolog("Cannot prepare XSEG request\n");
+        goto err_exit;
+    }
+    target = xseg_get_target(s->xseg, req);
+    if (!target) {
+        archipelagolog("Cannot get XSEG target\n");
+        goto err_exit;
+    }
+    memcpy(target, s->volname, targetlen);
+    req->size = count;
+    req->offset = offset;
+
+    switch (op) {
+    case ARCHIP_OP_READ:
+        req->op = X_READ;
+        break;
+    case ARCHIP_OP_WRITE:
+        req->op = X_WRITE;
+        break;
+    case ARCHIP_OP_FLUSH:
+        req->op = X_FLUSH;
+        break;
+    }
+    reqdata->volname = s->volname;
+    reqdata->offset = offset;
+    reqdata->size = count;
+    reqdata->bufidx = bufidx;
+    reqdata->aio_cb = aio_cb;
+    reqdata->segreq = segreq;
+    reqdata->op = op;
+
+    xseg_set_req_data(s->xseg, req, reqdata);
+    if (op == ARCHIP_OP_WRITE) {
+        data = xseg_get_data(s->xseg, req);
+        if (!data) {
+            archipelagolog("Cannot get XSEG data\n");
+            goto err_exit;
+        }
+        qemu_iovec_to_buf(aio_cb->qiov, bufidx, data, count);
+    }
+
+    xport p = xseg_submit(s->xseg, req, s->srcport, X_ALLOC);
+    if (p == NoPort) {
+        archipelagolog("Could not submit XSEG request\n");
+        goto err_exit;
+    }
+    xseg_signal(s->xseg, p);
+    return 0;
+
+err_exit:
+    g_free(reqdata);
+    xseg_put_request(s->xseg, req, s->srcport);
+    return -EIO;
+err_exit2:
+    g_free(reqdata);
+    return -EIO;
+}
+
+static int archipelago_aio_segmented_rw(BDRVArchipelagoState *s,
+                                        size_t count,
+                                        off_t offset,
+                                        ArchipelagoAIOCB *aio_cb,
+                                        int op)
+{
+    int i, ret, segments_nr, last_segment_size;
+    ArchipelagoSegmentedRequest *segreq;
+
+    segreq = g_malloc(sizeof(ArchipelagoSegmentedRequest));
+
+    if (op == ARCHIP_OP_FLUSH) {
+        segments_nr = 1;
+        segreq->ref = segments_nr;
+        segreq->total = count;
+        segreq->count = 0;
+        segreq->failed = 0;
+        ret = archipelago_submit_request(s, 0, count, offset, aio_cb,
+                                           segreq, ARCHIP_OP_FLUSH);
+        if (ret < 0) {
+            goto err_exit;
+        }
+        return 0;
+    }
+
+    segments_nr = (int)(count / MAX_REQUEST_SIZE) + \
+                  ((count % MAX_REQUEST_SIZE) ? 1 : 0);
+    last_segment_size = (int)(count % MAX_REQUEST_SIZE);
+
+    segreq->ref = segments_nr;
+    segreq->total = count;
+    segreq->count = 0;
+    segreq->failed = 0;
+
+    for (i = 0; i < segments_nr - 1; i++) {
+        ret = archipelago_submit_request(s, i * MAX_REQUEST_SIZE,
+                                           MAX_REQUEST_SIZE,
+                                           offset + i * MAX_REQUEST_SIZE,
+                                           aio_cb, segreq, op);
+
+        if (ret < 0) {
+            goto err_exit;
+        }
+    }
+
+    if ((segments_nr > 1) && last_segment_size) {
+        ret = archipelago_submit_request(s, i * MAX_REQUEST_SIZE,
+                                           last_segment_size,
+                                           offset + i * MAX_REQUEST_SIZE,
+                                           aio_cb, segreq, op);
+    } else if ((segments_nr > 1) && !last_segment_size) {
+        ret = archipelago_submit_request(s, i * MAX_REQUEST_SIZE,
+                                           MAX_REQUEST_SIZE,
+                                           offset + i * MAX_REQUEST_SIZE,
+                                           aio_cb, segreq, op);
+    } else if (segments_nr == 1) {
+            ret = archipelago_submit_request(s, 0, count, offset, aio_cb,
+                                               segreq, op);
+    }
+
+    if (ret < 0) {
+        goto err_exit;
+    }
+
+    return 0;
+
+err_exit:
+    __sync_add_and_fetch(&segreq->failed, 1);
+    if (segments_nr == 1) {
+        if (__sync_add_and_fetch(&segreq->ref, -1) == 0) {
+            g_free(segreq);
+        }
+    } else {
+        if ((__sync_add_and_fetch(&segreq->ref, -segments_nr + i)) == 0) {
+            g_free(segreq);
+        }
+    }
+
+    return ret;
+}
+
+static BlockDriverAIOCB *qemu_archipelago_aio_rw(BlockDriverState *bs,
+                                                 int64_t sector_num,
+                                                 QEMUIOVector *qiov,
+                                                 int nb_sectors,
+                                                 BlockDriverCompletionFunc *cb,
+                                                 void *opaque,
+                                                 int op)
+{
+    ArchipelagoAIOCB *aio_cb;
+    BDRVArchipelagoState *s = bs->opaque;
+    int64_t size, off;
+    int ret;
+
+    aio_cb = qemu_aio_get(&archipelago_aiocb_info, bs, cb, opaque);
+    aio_cb->cmd = op;
+    aio_cb->qiov = qiov;
+
+    aio_cb->ret = 0;
+    aio_cb->s = s;
+    aio_cb->cancelled = false;
+    aio_cb->status = -EINPROGRESS;
+
+    off = sector_num * BDRV_SECTOR_SIZE;
+    size = nb_sectors * BDRV_SECTOR_SIZE;
+    aio_cb->size = size;
+
+    ret = archipelago_aio_segmented_rw(s, size, off,
+                                       aio_cb, op);
+    if (ret < 0) {
+        goto err_exit;
+    }
+    return &aio_cb->common;
+
+err_exit:
+    error_report("qemu_archipelago_aio_rw(): I/O Error\n");
+    qemu_aio_release(aio_cb);
+    return NULL;
+}
+
+static BlockDriverAIOCB *qemu_archipelago_aio_readv(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    return qemu_archipelago_aio_rw(bs, sector_num, qiov, nb_sectors, cb,
+                                   opaque, ARCHIP_OP_READ);
+}
+
+static BlockDriverAIOCB *qemu_archipelago_aio_writev(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    return qemu_archipelago_aio_rw(bs, sector_num, qiov, nb_sectors, cb,
+                                   opaque, ARCHIP_OP_WRITE);
+}
+
+static int64_t archipelago_volume_info(BDRVArchipelagoState *s)
+{
+    uint64_t size;
+    int ret, targetlen;
+    struct xseg_request *req;
+    struct xseg_reply_info *xinfo;
+    AIORequestData *reqdata = g_malloc(sizeof(AIORequestData));
+
+    const char *volname = s->volname;
+    targetlen = strlen(volname);
+    req = xseg_get_request(s->xseg, s->srcport, s->mportno, X_ALLOC);
+    if (!req) {
+        archipelagolog("Cannot get XSEG request\n");
+        goto err_exit2;
+    }
+    ret = xseg_prep_request(s->xseg, req, targetlen,
+                            sizeof(struct xseg_reply_info));
+    if (ret < 0) {
+        archipelagolog("Cannot prepare XSEG request\n");
+        goto err_exit;
+    }
+    char *target = xseg_get_target(s->xseg, req);
+    if (!target) {
+        archipelagolog("Cannot get XSEG target\n");
+        goto err_exit;
+    }
+    memcpy(target, volname, targetlen);
+    req->size = req->datalen;
+    req->offset = 0;
+    req->op = X_INFO;
+
+    reqdata->op = ARCHIP_OP_VOLINFO;
+    reqdata->volname = volname;
+    xseg_set_req_data(s->xseg, req, reqdata);
+
+    xport p = xseg_submit(s->xseg, req, s->srcport, X_ALLOC);
+    if (p == NoPort) {
+        archipelagolog("Cannot submit XSEG request\n");
+        goto err_exit;
+    }
+    xseg_signal(s->xseg, p);
+    qemu_mutex_lock(&s->archip_mutex);
+    while (!s->is_signaled) {
+        qemu_cond_wait(&s->archip_cond, &s->archip_mutex);
+    }
+    s->is_signaled = false;
+    qemu_mutex_unlock(&s->archip_mutex);
+
+    xinfo = (struct xseg_reply_info *) xseg_get_data(s->xseg, req);
+    size = xinfo->size;
+    xseg_put_request(s->xseg, req, s->srcport);
+    g_free(reqdata);
+    s->size = size;
+    return size;
+
+err_exit:
+    xseg_put_request(s->xseg, req, s->srcport);
+err_exit2:
+    g_free(reqdata);
+    return -EIO;
+}
+
+static int64_t qemu_archipelago_getlength(BlockDriverState *bs)
+{
+    int64_t ret;
+    BDRVArchipelagoState *s = bs->opaque;
+
+    ret = archipelago_volume_info(s);
+    return ret;
+}
+
+static QemuOptsList qemu_archipelago_create_opts = {
+    .name = "archipelago-create-opts",
+    .head = QTAILQ_HEAD_INITIALIZER(qemu_archipelago_create_opts.head),
+    .desc = {
+        {
+            .name = BLOCK_OPT_SIZE,
+            .type = QEMU_OPT_SIZE,
+            .help = "Virtual disk size"
+        },
+        { /* end of list */ }
+    }
+};
+
+static BlockDriverAIOCB *qemu_archipelago_aio_flush(BlockDriverState *bs,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    return qemu_archipelago_aio_rw(bs, 0, NULL, 0, cb, opaque,
+                                   ARCHIP_OP_FLUSH);
+}
+
+static BlockDriver bdrv_archipelago = {
+    .format_name         = "archipelago",
+    .protocol_name       = "archipelago",
+    .instance_size       = sizeof(BDRVArchipelagoState),
+    .bdrv_parse_filename = archipelago_parse_filename,
+    .bdrv_file_open      = qemu_archipelago_open,
+    .bdrv_close          = qemu_archipelago_close,
+    .bdrv_create         = qemu_archipelago_create,
+    .bdrv_getlength      = qemu_archipelago_getlength,
+    .bdrv_aio_readv      = qemu_archipelago_aio_readv,
+    .bdrv_aio_writev     = qemu_archipelago_aio_writev,
+    .bdrv_aio_flush      = qemu_archipelago_aio_flush,
+    .bdrv_has_zero_init  = bdrv_has_zero_init_1,
+    .create_opts         = &qemu_archipelago_create_opts,
+};
+
+static void bdrv_archipelago_init(void)
+{
+    bdrv_register(&bdrv_archipelago);
+}
+
+block_init(bdrv_archipelago_init);
diff --git a/block/bochs.c b/block/bochs.c
index eba23df335..6674b27438 100644
--- a/block/bochs.c
+++ b/block/bochs.c
@@ -131,7 +131,11 @@ static int bochs_open(BlockDriverState *bs, QDict *options, int flags,
         return -EFBIG;
     }
 
-    s->catalog_bitmap = g_malloc(s->catalog_size * 4);
+    s->catalog_bitmap = g_try_malloc(s->catalog_size * 4);
+    if (s->catalog_size && s->catalog_bitmap == NULL) {
+        error_setg(errp, "Could not allocate memory for catalog");
+        return -ENOMEM;
+    }
 
     ret = bdrv_pread(bs->file, le32_to_cpu(bochs.header), s->catalog_bitmap,
                      s->catalog_size * 4);
diff --git a/block/cloop.c b/block/cloop.c
index 8457737922..f328be06f8 100644
--- a/block/cloop.c
+++ b/block/cloop.c
@@ -116,7 +116,12 @@ static int cloop_open(BlockDriverState *bs, QDict *options, int flags,
                    "try increasing block size");
         return -EINVAL;
     }
-    s->offsets = g_malloc(offsets_size);
+
+    s->offsets = g_try_malloc(offsets_size);
+    if (s->offsets == NULL) {
+        error_setg(errp, "Could not allocate offsets table");
+        return -ENOMEM;
+    }
 
     ret = bdrv_pread(bs->file, 128 + 4 + 4, s->offsets, offsets_size);
     if (ret < 0) {
@@ -158,8 +163,20 @@ static int cloop_open(BlockDriverState *bs, QDict *options, int flags,
     }
 
     /* initialize zlib engine */
-    s->compressed_block = g_malloc(max_compressed_block_size + 1);
-    s->uncompressed_block = g_malloc(s->block_size);
+    s->compressed_block = g_try_malloc(max_compressed_block_size + 1);
+    if (s->compressed_block == NULL) {
+        error_setg(errp, "Could not allocate compressed_block");
+        ret = -ENOMEM;
+        goto fail;
+    }
+
+    s->uncompressed_block = g_try_malloc(s->block_size);
+    if (s->uncompressed_block == NULL) {
+        error_setg(errp, "Could not allocate uncompressed_block");
+        ret = -ENOMEM;
+        goto fail;
+    }
+
     if (inflateInit(&s->zstream) != Z_OK) {
         ret = -EINVAL;
         goto fail;
diff --git a/block/curl.c b/block/curl.c
index 79ff2f1e41..d4b85d20a5 100644
--- a/block/curl.c
+++ b/block/curl.c
@@ -640,7 +640,13 @@ static void curl_readv_bh_cb(void *p)
     state->buf_start = start;
     state->buf_len = acb->end + s->readahead_size;
     end = MIN(start + state->buf_len, s->len) - 1;
-    state->orig_buf = g_malloc(state->buf_len);
+    state->orig_buf = g_try_malloc(state->buf_len);
+    if (state->buf_len && state->orig_buf == NULL) {
+        curl_clean_state(state);
+        acb->common.cb(acb->common.opaque, -ENOMEM);
+        qemu_aio_release(acb);
+        return;
+    }
     state->acb[0] = acb;
 
     snprintf(state->range, 127, "%zd-%zd", start, end);
diff --git a/block/dmg.c b/block/dmg.c
index 1e153cd76d..e455886d2b 100644
--- a/block/dmg.c
+++ b/block/dmg.c
@@ -284,8 +284,15 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags,
     }
 
     /* initialize zlib engine */
-    s->compressed_chunk = g_malloc(max_compressed_size + 1);
-    s->uncompressed_chunk = g_malloc(512 * max_sectors_per_chunk);
+    s->compressed_chunk = qemu_try_blockalign(bs->file,
+                                              max_compressed_size + 1);
+    s->uncompressed_chunk = qemu_try_blockalign(bs->file,
+                                                512 * max_sectors_per_chunk);
+    if (s->compressed_chunk == NULL || s->uncompressed_chunk == NULL) {
+        ret = -ENOMEM;
+        goto fail;
+    }
+
     if (inflateInit(&s->zstream) != Z_OK) {
         ret = -EINVAL;
         goto fail;
@@ -302,8 +309,8 @@ fail:
     g_free(s->lengths);
     g_free(s->sectors);
     g_free(s->sectorcounts);
-    g_free(s->compressed_chunk);
-    g_free(s->uncompressed_chunk);
+    qemu_vfree(s->compressed_chunk);
+    qemu_vfree(s->uncompressed_chunk);
     return ret;
 }
 
@@ -426,8 +433,8 @@ static void dmg_close(BlockDriverState *bs)
     g_free(s->lengths);
     g_free(s->sectors);
     g_free(s->sectorcounts);
-    g_free(s->compressed_chunk);
-    g_free(s->uncompressed_chunk);
+    qemu_vfree(s->compressed_chunk);
+    qemu_vfree(s->uncompressed_chunk);
 
     inflateEnd(&s->zstream);
 }
diff --git a/block/iscsi.c b/block/iscsi.c
index a7bb6970ac..2c9cfc18eb 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -893,7 +893,10 @@ coroutine_fn iscsi_co_write_zeroes(BlockDriverState *bs, int64_t sector_num,
     nb_blocks = sector_qemu2lun(nb_sectors, iscsilun);
 
     if (iscsilun->zeroblock == NULL) {
-        iscsilun->zeroblock = g_malloc0(iscsilun->block_size);
+        iscsilun->zeroblock = g_try_malloc0(iscsilun->block_size);
+        if (iscsilun->zeroblock == NULL) {
+            return -ENOMEM;
+        }
     }
 
     iscsi_co_init_iscsitask(iscsilun, &iTask);
diff --git a/block/mirror.c b/block/mirror.c
index c7a655fc58..5e7a166b39 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -367,7 +367,12 @@ static void coroutine_fn mirror_run(void *opaque)
     }
 
     end = s->common.len >> BDRV_SECTOR_BITS;
-    s->buf = qemu_blockalign(bs, s->buf_size);
+    s->buf = qemu_try_blockalign(bs, s->buf_size);
+    if (s->buf == NULL) {
+        ret = -ENOMEM;
+        goto immediate_exit;
+    }
+
     sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
     mirror_free_init(s);
 
diff --git a/block/nfs.c b/block/nfs.c
index 8439e0d389..fe46c33709 100644
--- a/block/nfs.c
+++ b/block/nfs.c
@@ -172,7 +172,11 @@ static int coroutine_fn nfs_co_writev(BlockDriverState *bs,
 
     nfs_co_init_task(client, &task);
 
-    buf = g_malloc(nb_sectors * BDRV_SECTOR_SIZE);
+    buf = g_try_malloc(nb_sectors * BDRV_SECTOR_SIZE);
+    if (nb_sectors && buf == NULL) {
+        return -ENOMEM;
+    }
+
     qemu_iovec_to_buf(iov, 0, buf, nb_sectors * BDRV_SECTOR_SIZE);
 
     if (nfs_pwrite_async(client->context, client->fh,
diff --git a/block/parallels.c b/block/parallels.c
index 1a5bd350b3..7325678a4d 100644
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -105,7 +105,11 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags,
         ret = -EFBIG;
         goto fail;
     }
-    s->catalog_bitmap = g_malloc(s->catalog_size * 4);
+    s->catalog_bitmap = g_try_malloc(s->catalog_size * 4);
+    if (s->catalog_size && s->catalog_bitmap == NULL) {
+        ret = -ENOMEM;
+        goto fail;
+    }
 
     ret = bdrv_pread(bs->file, 64, s->catalog_bitmap, s->catalog_size * 4);
     if (ret < 0) {
diff --git a/block/qapi.c b/block/qapi.c
index f44f6b4012..79d1e6a9f4 100644
--- a/block/qapi.c
+++ b/block/qapi.c
@@ -28,6 +28,13 @@
 #include "qapi-visit.h"
 #include "qapi/qmp-output-visitor.h"
 #include "qapi/qmp/types.h"
+#ifdef __linux__
+#include <linux/fs.h>
+#include <sys/ioctl.h>
+#ifndef FS_NOCOW_FL
+#define FS_NOCOW_FL                     0x00800000 /* Do not cow file */
+#endif
+#endif
 
 BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs)
 {
@@ -165,19 +172,28 @@ void bdrv_query_image_info(BlockDriverState *bs,
                            ImageInfo **p_info,
                            Error **errp)
 {
-    uint64_t total_sectors;
+    int64_t size;
     const char *backing_filename;
     char backing_filename2[1024];
     BlockDriverInfo bdi;
     int ret;
     Error *err = NULL;
-    ImageInfo *info = g_new0(ImageInfo, 1);
-
-    bdrv_get_geometry(bs, &total_sectors);
+    ImageInfo *info;
+#ifdef __linux__
+    int fd, attr;
+#endif
+
+    size = bdrv_getlength(bs);
+    if (size < 0) {
+        error_setg_errno(errp, -size, "Can't get size of device '%s'",
+                         bdrv_get_device_name(bs));
+        return;
+    }
 
+    info = g_new0(ImageInfo, 1);
     info->filename        = g_strdup(bs->filename);
     info->format          = g_strdup(bdrv_get_format_name(bs));
-    info->virtual_size    = total_sectors * 512;
+    info->virtual_size    = size;
     info->actual_size     = bdrv_get_allocated_file_size(bs);
     info->has_actual_size = info->actual_size >= 0;
     if (bdrv_is_encrypted(bs)) {
@@ -195,6 +211,18 @@ void bdrv_query_image_info(BlockDriverState *bs,
     info->format_specific     = bdrv_get_specific_info(bs);
     info->has_format_specific = info->format_specific != NULL;
 
+#ifdef __linux__
+    /* get NOCOW info */
+    fd = qemu_open(bs->filename, O_RDONLY | O_NONBLOCK);
+    if (fd >= 0) {
+        if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0 && (attr & FS_NOCOW_FL)) {
+            info->has_nocow = true;
+            info->nocow = true;
+        }
+        qemu_close(fd);
+    }
+#endif
+
     backing_filename = bs->backing_file;
     if (backing_filename[0] != '\0') {
         info->backing_filename = g_strdup(backing_filename);
@@ -625,4 +653,8 @@ void bdrv_image_info_dump(fprintf_function func_fprintf, void *f,
         func_fprintf(f, "Format specific information:\n");
         bdrv_image_info_specific_dump(func_fprintf, f, info->format_specific);
     }
+
+    if (info->has_nocow && info->nocow) {
+        func_fprintf(f, "NOCOW flag: set\n");
+    }
 }
diff --git a/block/qcow.c b/block/qcow.c
index a874056cf3..67332f03c1 100644
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -182,7 +182,12 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags,
     }
 
     s->l1_table_offset = header.l1_table_offset;
-    s->l1_table = g_malloc(s->l1_size * sizeof(uint64_t));
+    s->l1_table = g_try_malloc(s->l1_size * sizeof(uint64_t));
+    if (s->l1_table == NULL) {
+        error_setg(errp, "Could not allocate memory for L1 table");
+        ret = -ENOMEM;
+        goto fail;
+    }
 
     ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table,
                s->l1_size * sizeof(uint64_t));
@@ -193,8 +198,16 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags,
     for(i = 0;i < s->l1_size; i++) {
         be64_to_cpus(&s->l1_table[i]);
     }
-    /* alloc L2 cache */
-    s->l2_cache = g_malloc(s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
+
+    /* alloc L2 cache (max. 64k * 16 * 8 = 8 MB) */
+    s->l2_cache =
+        qemu_try_blockalign(bs->file,
+                            s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
+    if (s->l2_cache == NULL) {
+        error_setg(errp, "Could not allocate L2 table cache");
+        ret = -ENOMEM;
+        goto fail;
+    }
     s->cluster_cache = g_malloc(s->cluster_size);
     s->cluster_data = g_malloc(s->cluster_size);
     s->cluster_cache_offset = -1;
@@ -226,7 +239,7 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags,
 
  fail:
     g_free(s->l1_table);
-    g_free(s->l2_cache);
+    qemu_vfree(s->l2_cache);
     g_free(s->cluster_cache);
     g_free(s->cluster_data);
     return ret;
@@ -517,7 +530,10 @@ static coroutine_fn int qcow_co_readv(BlockDriverState *bs, int64_t sector_num,
     void *orig_buf;
 
     if (qiov->niov > 1) {
-        buf = orig_buf = qemu_blockalign(bs, qiov->size);
+        buf = orig_buf = qemu_try_blockalign(bs, qiov->size);
+        if (buf == NULL) {
+            return -ENOMEM;
+        }
     } else {
         orig_buf = NULL;
         buf = (uint8_t *)qiov->iov->iov_base;
@@ -619,7 +635,10 @@ static coroutine_fn int qcow_co_writev(BlockDriverState *bs, int64_t sector_num,
     s->cluster_cache_offset = -1; /* disable compressed cache */
 
     if (qiov->niov > 1) {
-        buf = orig_buf = qemu_blockalign(bs, qiov->size);
+        buf = orig_buf = qemu_try_blockalign(bs, qiov->size);
+        if (buf == NULL) {
+            return -ENOMEM;
+        }
         qemu_iovec_to_buf(qiov, 0, buf, qiov->size);
     } else {
         orig_buf = NULL;
@@ -685,7 +704,7 @@ static void qcow_close(BlockDriverState *bs)
     BDRVQcowState *s = bs->opaque;
 
     g_free(s->l1_table);
-    g_free(s->l2_cache);
+    qemu_vfree(s->l2_cache);
     g_free(s->cluster_cache);
     g_free(s->cluster_data);
 
diff --git a/block/qcow2-cache.c b/block/qcow2-cache.c
index 8ecbb5bc00..5353b44828 100644
--- a/block/qcow2-cache.c
+++ b/block/qcow2-cache.c
@@ -53,10 +53,21 @@ Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables)
     c->entries = g_malloc0(sizeof(*c->entries) * num_tables);
 
     for (i = 0; i < c->size; i++) {
-        c->entries[i].table = qemu_blockalign(bs, s->cluster_size);
+        c->entries[i].table = qemu_try_blockalign(bs->file, s->cluster_size);
+        if (c->entries[i].table == NULL) {
+            goto fail;
+        }
     }
 
     return c;
+
+fail:
+    for (i = 0; i < c->size; i++) {
+        qemu_vfree(c->entries[i].table);
+    }
+    g_free(c->entries);
+    g_free(c);
+    return NULL;
 }
 
 int qcow2_cache_destroy(BlockDriverState* bs, Qcow2Cache *c)
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 4208dc08b5..5b36018b3e 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -72,14 +72,20 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
 #endif
 
     new_l1_size2 = sizeof(uint64_t) * new_l1_size;
-    new_l1_table = g_malloc0(align_offset(new_l1_size2, 512));
+    new_l1_table = qemu_try_blockalign(bs->file,
+                                       align_offset(new_l1_size2, 512));
+    if (new_l1_table == NULL) {
+        return -ENOMEM;
+    }
+    memset(new_l1_table, 0, align_offset(new_l1_size2, 512));
+
     memcpy(new_l1_table, s->l1_table, s->l1_size * sizeof(uint64_t));
 
     /* write new table (align to cluster) */
     BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ALLOC_TABLE);
     new_l1_table_offset = qcow2_alloc_clusters(bs, new_l1_size2);
     if (new_l1_table_offset < 0) {
-        g_free(new_l1_table);
+        qemu_vfree(new_l1_table);
         return new_l1_table_offset;
     }
 
@@ -113,7 +119,7 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
     if (ret < 0) {
         goto fail;
     }
-    g_free(s->l1_table);
+    qemu_vfree(s->l1_table);
     old_l1_table_offset = s->l1_table_offset;
     s->l1_table_offset = new_l1_table_offset;
     s->l1_table = new_l1_table;
@@ -123,7 +129,7 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
                         QCOW2_DISCARD_OTHER);
     return 0;
  fail:
-    g_free(new_l1_table);
+    qemu_vfree(new_l1_table);
     qcow2_free_clusters(bs, new_l1_table_offset, new_l1_size2,
                         QCOW2_DISCARD_OTHER);
     return ret;
@@ -372,7 +378,10 @@ static int coroutine_fn copy_sectors(BlockDriverState *bs,
     }
 
     iov.iov_len = n * BDRV_SECTOR_SIZE;
-    iov.iov_base = qemu_blockalign(bs, iov.iov_len);
+    iov.iov_base = qemu_try_blockalign(bs, iov.iov_len);
+    if (iov.iov_base == NULL) {
+        return -ENOMEM;
+    }
 
     qemu_iovec_init_external(&qiov, &iov, 1);
 
@@ -702,7 +711,11 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
     trace_qcow2_cluster_link_l2(qemu_coroutine_self(), m->nb_clusters);
     assert(m->nb_clusters > 0);
 
-    old_cluster = g_malloc(m->nb_clusters * sizeof(uint64_t));
+    old_cluster = g_try_malloc(m->nb_clusters * sizeof(uint64_t));
+    if (old_cluster == NULL) {
+        ret = -ENOMEM;
+        goto err;
+    }
 
     /* copy content of unmodified sectors */
     ret = perform_cow(bs, m, &m->cow_start);
@@ -1106,6 +1119,17 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
         return 0;
     }
 
+    /* !*host_offset would overwrite the image header and is reserved for "no
+     * host offset preferred". If 0 was a valid host offset, it'd trigger the
+     * following overlap check; do that now to avoid having an invalid value in
+     * *host_offset. */
+    if (!alloc_cluster_offset) {
+        ret = qcow2_pre_write_overlap_check(bs, 0, alloc_cluster_offset,
+                                            nb_clusters * s->cluster_size);
+        assert(ret < 0);
+        goto fail;
+    }
+
     /*
      * Save info needed for meta data update.
      *
@@ -1562,7 +1586,10 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
     if (!is_active_l1) {
         /* inactive L2 tables require a buffer to be stored in when loading
          * them from disk */
-        l2_table = qemu_blockalign(bs, s->cluster_size);
+        l2_table = qemu_try_blockalign(bs->file, s->cluster_size);
+        if (l2_table == NULL) {
+            return -ENOMEM;
+        }
     }
 
     for (i = 0; i < l1_size; i++) {
@@ -1740,7 +1767,11 @@ int qcow2_expand_zero_clusters(BlockDriverState *bs)
 
     nb_clusters = size_to_clusters(s, bs->file->total_sectors *
                                    BDRV_SECTOR_SIZE);
-    expanded_clusters = g_malloc0((nb_clusters + 7) / 8);
+    expanded_clusters = g_try_malloc0((nb_clusters + 7) / 8);
+    if (expanded_clusters == NULL) {
+        ret = -ENOMEM;
+        goto fail;
+    }
 
     ret = expand_zero_clusters_in_l1(bs, s->l1_table, s->l1_size,
                                      &expanded_clusters, &nb_clusters);
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index cc6cf743d6..d60e2feb10 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -46,19 +46,25 @@ int qcow2_refcount_init(BlockDriverState *bs)
 
     assert(s->refcount_table_size <= INT_MAX / sizeof(uint64_t));
     refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t);
-    s->refcount_table = g_malloc(refcount_table_size2);
+    s->refcount_table = g_try_malloc(refcount_table_size2);
+
     if (s->refcount_table_size > 0) {
+        if (s->refcount_table == NULL) {
+            ret = -ENOMEM;
+            goto fail;
+        }
         BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD);
         ret = bdrv_pread(bs->file, s->refcount_table_offset,
                          s->refcount_table, refcount_table_size2);
-        if (ret != refcount_table_size2)
+        if (ret < 0) {
             goto fail;
+        }
         for(i = 0; i < s->refcount_table_size; i++)
             be64_to_cpus(&s->refcount_table[i]);
     }
     return 0;
  fail:
-    return -ENOMEM;
+    return ret;
 }
 
 void qcow2_refcount_close(BlockDriverState *bs)
@@ -344,8 +350,14 @@ static int alloc_refcount_block(BlockDriverState *bs,
     uint64_t meta_offset = (blocks_used * refcount_block_clusters) *
         s->cluster_size;
     uint64_t table_offset = meta_offset + blocks_clusters * s->cluster_size;
-    uint16_t *new_blocks = g_malloc0(blocks_clusters * s->cluster_size);
-    uint64_t *new_table = g_malloc0(table_size * sizeof(uint64_t));
+    uint64_t *new_table = g_try_malloc0(table_size * sizeof(uint64_t));
+    uint16_t *new_blocks = g_try_malloc0(blocks_clusters * s->cluster_size);
+
+    assert(table_size > 0 && blocks_clusters > 0);
+    if (new_table == NULL || new_blocks == NULL) {
+        ret = -ENOMEM;
+        goto fail_table;
+    }
 
     /* Fill the new refcount table */
     memcpy(new_table, s->refcount_table,
@@ -424,6 +436,7 @@ static int alloc_refcount_block(BlockDriverState *bs,
     return -EAGAIN;
 
 fail_table:
+    g_free(new_blocks);
     g_free(new_table);
 fail_block:
     if (*refcount_block != NULL) {
@@ -847,7 +860,8 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
     int64_t l1_table_offset, int l1_size, int addend)
 {
     BDRVQcowState *s = bs->opaque;
-    uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2, l1_allocated;
+    uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2;
+    bool l1_allocated = false;
     int64_t old_offset, old_l2_offset;
     int i, j, l1_modified = 0, nb_csectors, refcount;
     int ret;
@@ -862,8 +876,12 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
      * l1_table_offset when it is the current s->l1_table_offset! Be careful
      * when changing this! */
     if (l1_table_offset != s->l1_table_offset) {
-        l1_table = g_malloc0(align_offset(l1_size2, 512));
-        l1_allocated = 1;
+        l1_table = g_try_malloc0(align_offset(l1_size2, 512));
+        if (l1_size2 && l1_table == NULL) {
+            ret = -ENOMEM;
+            goto fail;
+        }
+        l1_allocated = true;
 
         ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2);
         if (ret < 0) {
@@ -875,7 +893,7 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
     } else {
         assert(l1_size == s->l1_size);
         l1_table = s->l1_table;
-        l1_allocated = 0;
+        l1_allocated = false;
     }
 
     for(i = 0; i < l1_size; i++) {
@@ -1197,7 +1215,11 @@ static int check_refcounts_l1(BlockDriverState *bs,
     if (l1_size2 == 0) {
         l1_table = NULL;
     } else {
-        l1_table = g_malloc(l1_size2);
+        l1_table = g_try_malloc(l1_size2);
+        if (l1_table == NULL) {
+            ret = -ENOMEM;
+            goto fail;
+        }
         if (bdrv_pread(bs->file, l1_table_offset,
                        l1_table, l1_size2) != l1_size2)
             goto fail;
@@ -1501,7 +1523,11 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
         return -EFBIG;
     }
 
-    refcount_table = g_malloc0(nb_clusters * sizeof(uint16_t));
+    refcount_table = g_try_malloc0(nb_clusters * sizeof(uint16_t));
+    if (nb_clusters && refcount_table == NULL) {
+        res->check_errors++;
+        return -ENOMEM;
+    }
 
     res->bfi.total_clusters =
         size_to_clusters(s, bs->total_sectors * BDRV_SECTOR_SIZE);
@@ -1753,9 +1779,13 @@ int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset,
             uint64_t l1_ofs = s->snapshots[i].l1_table_offset;
             uint32_t l1_sz  = s->snapshots[i].l1_size;
             uint64_t l1_sz2 = l1_sz * sizeof(uint64_t);
-            uint64_t *l1 = g_malloc(l1_sz2);
+            uint64_t *l1 = g_try_malloc(l1_sz2);
             int ret;
 
+            if (l1_sz2 && l1 == NULL) {
+                return -ENOMEM;
+            }
+
             ret = bdrv_pread(bs->file, l1_ofs, l1, l1_sz2);
             if (ret < 0) {
                 g_free(l1);
diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c
index 0aa9defbe2..f67b47282f 100644
--- a/block/qcow2-snapshot.c
+++ b/block/qcow2-snapshot.c
@@ -381,7 +381,12 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
     sn->l1_table_offset = l1_table_offset;
     sn->l1_size = s->l1_size;
 
-    l1_table = g_malloc(s->l1_size * sizeof(uint64_t));
+    l1_table = g_try_malloc(s->l1_size * sizeof(uint64_t));
+    if (s->l1_size && l1_table == NULL) {
+        ret = -ENOMEM;
+        goto fail;
+    }
+
     for(i = 0; i < s->l1_size; i++) {
         l1_table[i] = cpu_to_be64(s->l1_table[i]);
     }
@@ -499,7 +504,11 @@ int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id)
      * Decrease the refcount referenced by the old one only when the L1
      * table is overwritten.
      */
-    sn_l1_table = g_malloc0(cur_l1_bytes);
+    sn_l1_table = g_try_malloc0(cur_l1_bytes);
+    if (cur_l1_bytes && sn_l1_table == NULL) {
+        ret = -ENOMEM;
+        goto fail;
+    }
 
     ret = bdrv_pread(bs->file, sn->l1_table_offset, sn_l1_table, sn_l1_bytes);
     if (ret < 0) {
@@ -698,17 +707,21 @@ int qcow2_snapshot_load_tmp(BlockDriverState *bs,
         return -EFBIG;
     }
     new_l1_bytes = sn->l1_size * sizeof(uint64_t);
-    new_l1_table = g_malloc0(align_offset(new_l1_bytes, 512));
+    new_l1_table = qemu_try_blockalign(bs->file,
+                                       align_offset(new_l1_bytes, 512));
+    if (new_l1_table == NULL) {
+        return -ENOMEM;
+    }
 
     ret = bdrv_pread(bs->file, sn->l1_table_offset, new_l1_table, new_l1_bytes);
     if (ret < 0) {
         error_setg(errp, "Failed to read l1 table for snapshot");
-        g_free(new_l1_table);
+        qemu_vfree(new_l1_table);
         return ret;
     }
 
     /* Switch the L1 table */
-    g_free(s->l1_table);
+    qemu_vfree(s->l1_table);
 
     s->l1_size = sn->l1_size;
     s->l1_table_offset = sn->l1_table_offset;
diff --git a/block/qcow2.c b/block/qcow2.c
index 1e3ab6bd02..435e0e11d0 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -688,8 +688,13 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
 
 
     if (s->l1_size > 0) {
-        s->l1_table = g_malloc0(
+        s->l1_table = qemu_try_blockalign(bs->file,
             align_offset(s->l1_size * sizeof(uint64_t), 512));
+        if (s->l1_table == NULL) {
+            error_setg(errp, "Could not allocate L1 table");
+            ret = -ENOMEM;
+            goto fail;
+        }
         ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table,
                          s->l1_size * sizeof(uint64_t));
         if (ret < 0) {
@@ -704,11 +709,22 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
     /* alloc L2 table/refcount block cache */
     s->l2_table_cache = qcow2_cache_create(bs, L2_CACHE_SIZE);
     s->refcount_block_cache = qcow2_cache_create(bs, REFCOUNT_CACHE_SIZE);
+    if (s->l2_table_cache == NULL || s->refcount_block_cache == NULL) {
+        error_setg(errp, "Could not allocate metadata caches");
+        ret = -ENOMEM;
+        goto fail;
+    }
 
     s->cluster_cache = g_malloc(s->cluster_size);
     /* one more sector for decompressed data alignment */
-    s->cluster_data = qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size
-                                  + 512);
+    s->cluster_data = qemu_try_blockalign(bs->file, QCOW_MAX_CRYPT_CLUSTERS
+                                                    * s->cluster_size + 512);
+    if (s->cluster_data == NULL) {
+        error_setg(errp, "Could not allocate temporary cluster buffer");
+        ret = -ENOMEM;
+        goto fail;
+    }
+
     s->cluster_cache_offset = -1;
     s->flags = flags;
 
@@ -852,7 +868,7 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
     cleanup_unknown_header_ext(bs);
     qcow2_free_snapshots(bs);
     qcow2_refcount_close(bs);
-    g_free(s->l1_table);
+    qemu_vfree(s->l1_table);
     /* else pre-write overlap checks in cache_destroy may crash */
     s->l1_table = NULL;
     if (s->l2_table_cache) {
@@ -1082,7 +1098,12 @@ static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num,
                  */
                 if (!cluster_data) {
                     cluster_data =
-                        qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
+                        qemu_try_blockalign(bs->file, QCOW_MAX_CRYPT_CLUSTERS
+                                                      * s->cluster_size);
+                    if (cluster_data == NULL) {
+                        ret = -ENOMEM;
+                        goto fail;
+                    }
                 }
 
                 assert(cur_nr_sectors <=
@@ -1182,8 +1203,13 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
 
         if (s->crypt_method) {
             if (!cluster_data) {
-                cluster_data = qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS *
-                                                 s->cluster_size);
+                cluster_data = qemu_try_blockalign(bs->file,
+                                                   QCOW_MAX_CRYPT_CLUSTERS
+                                                   * s->cluster_size);
+                if (cluster_data == NULL) {
+                    ret = -ENOMEM;
+                    goto fail;
+                }
             }
 
             assert(hd_qiov.size <=
@@ -1270,7 +1296,7 @@ fail:
 static void qcow2_close(BlockDriverState *bs)
 {
     BDRVQcowState *s = bs->opaque;
-    g_free(s->l1_table);
+    qemu_vfree(s->l1_table);
     /* else pre-write overlap checks in cache_destroy may crash */
     s->l1_table = NULL;
 
@@ -1557,7 +1583,7 @@ static int preallocate(BlockDriverState *bs)
     int ret;
     QCowL2Meta *meta;
 
-    nb_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
+    nb_sectors = bdrv_nb_sectors(bs);
     offset = 0;
 
     while (nb_sectors) {
@@ -1947,7 +1973,6 @@ static int qcow2_write_compressed(BlockDriverState *bs, int64_t sector_num,
         /* align end of file to a sector boundary to ease reading with
            sector based I/Os */
         cluster_offset = bdrv_getlength(bs->file);
-        cluster_offset = (cluster_offset + 511) & ~511;
         bdrv_truncate(bs->file, cluster_offset);
         return 0;
     }
diff --git a/block/qed-check.c b/block/qed-check.c
index b473dcd61f..40a882cc93 100644
--- a/block/qed-check.c
+++ b/block/qed-check.c
@@ -227,8 +227,11 @@ int qed_check(BDRVQEDState *s, BdrvCheckResult *result, bool fix)
     };
     int ret;
 
-    check.used_clusters = g_malloc0(((check.nclusters + 31) / 32) *
-                                       sizeof(check.used_clusters[0]));
+    check.used_clusters = g_try_malloc0(((check.nclusters + 31) / 32) *
+                                        sizeof(check.used_clusters[0]));
+    if (check.nclusters && check.used_clusters == NULL) {
+        return -ENOMEM;
+    }
 
     check.result->bfi.total_clusters =
         (s->header.image_size + s->header.cluster_size - 1) /
diff --git a/block/qed.c b/block/qed.c
index 7944832181..ba395af76a 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -1240,7 +1240,11 @@ static void qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len)
         struct iovec *iov = acb->qiov->iov;
 
         if (!iov->iov_base) {
-            iov->iov_base = qemu_blockalign(acb->common.bs, iov->iov_len);
+            iov->iov_base = qemu_try_blockalign(acb->common.bs, iov->iov_len);
+            if (iov->iov_base == NULL) {
+                qed_aio_complete(acb, -ENOMEM);
+                return;
+            }
             memset(iov->iov_base, 0, iov->iov_len);
         }
     }
diff --git a/block/raw-posix.c b/block/raw-posix.c
index 8e9758e920..1194eb00ad 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -798,7 +798,11 @@ static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb)
      * Ok, we have to do it the hard way, copy all segments into
      * a single aligned buffer.
      */
-    buf = qemu_blockalign(aiocb->bs, aiocb->aio_nbytes);
+    buf = qemu_try_blockalign(aiocb->bs, aiocb->aio_nbytes);
+    if (buf == NULL) {
+        return -ENOMEM;
+    }
+
     if (aiocb->aio_type & QEMU_AIO_WRITE) {
         char *p = buf;
         int i;
diff --git a/block/rbd.c b/block/rbd.c
index 2b797d3e8b..4459102adf 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -617,7 +617,7 @@ static BlockDriverAIOCB *rbd_start_aio(BlockDriverState *bs,
                                        RBDAIOCmd cmd)
 {
     RBDAIOCB *acb;
-    RADOSCB *rcb;
+    RADOSCB *rcb = NULL;
     rbd_completion_t c;
     int64_t off, size;
     char *buf;
@@ -631,7 +631,10 @@ static BlockDriverAIOCB *rbd_start_aio(BlockDriverState *bs,
     if (cmd == RBD_AIO_DISCARD || cmd == RBD_AIO_FLUSH) {
         acb->bounce = NULL;
     } else {
-        acb->bounce = qemu_blockalign(bs, qiov->size);
+        acb->bounce = qemu_try_blockalign(bs, qiov->size);
+        if (acb->bounce == NULL) {
+            goto failed;
+        }
     }
     acb->ret = 0;
     acb->error = 0;
diff --git a/block/vdi.c b/block/vdi.c
index 197bd77c97..adc6aa9a5f 100644
--- a/block/vdi.c
+++ b/block/vdi.c
@@ -53,13 +53,6 @@
 #include "block/block_int.h"
 #include "qemu/module.h"
 #include "migration/migration.h"
-#ifdef __linux__
-#include <linux/fs.h>
-#include <sys/ioctl.h>
-#ifndef FS_NOCOW_FL
-#define FS_NOCOW_FL                     0x00800000 /* Do not cow file */
-#endif
-#endif
 
 #if defined(CONFIG_UUID)
 #include <uuid/uuid.h>
@@ -299,7 +292,12 @@ static int vdi_check(BlockDriverState *bs, BdrvCheckResult *res,
         return -ENOTSUP;
     }
 
-    bmap = g_malloc(s->header.blocks_in_image * sizeof(uint32_t));
+    bmap = g_try_malloc(s->header.blocks_in_image * sizeof(uint32_t));
+    if (s->header.blocks_in_image && bmap == NULL) {
+        res->check_errors++;
+        return -ENOMEM;
+    }
+
     memset(bmap, 0xff, s->header.blocks_in_image * sizeof(uint32_t));
 
     /* Check block map and value of blocks_allocated. */
@@ -357,23 +355,23 @@ static int vdi_make_empty(BlockDriverState *bs)
 static int vdi_probe(const uint8_t *buf, int buf_size, const char *filename)
 {
     const VdiHeader *header = (const VdiHeader *)buf;
-    int result = 0;
+    int ret = 0;
 
     logout("\n");
 
     if (buf_size < sizeof(*header)) {
         /* Header too small, no VDI. */
     } else if (le32_to_cpu(header->signature) == VDI_SIGNATURE) {
-        result = 100;
+        ret = 100;
     }
 
-    if (result == 0) {
+    if (ret == 0) {
         logout("no vdi image\n");
     } else {
         logout("%s", header->text);
     }
 
-    return result;
+    return ret;
 }
 
 static int vdi_open(BlockDriverState *bs, QDict *options, int flags,
@@ -478,7 +476,12 @@ static int vdi_open(BlockDriverState *bs, QDict *options, int flags,
 
     bmap_size = header.blocks_in_image * sizeof(uint32_t);
     bmap_size = (bmap_size + SECTOR_SIZE - 1) / SECTOR_SIZE;
-    s->bmap = g_malloc(bmap_size * SECTOR_SIZE);
+    s->bmap = qemu_try_blockalign(bs->file, bmap_size * SECTOR_SIZE);
+    if (s->bmap == NULL) {
+        ret = -ENOMEM;
+        goto fail;
+    }
+
     ret = bdrv_read(bs->file, s->bmap_sector, (uint8_t *)s->bmap, bmap_size);
     if (ret < 0) {
         goto fail_free_bmap;
@@ -493,7 +496,7 @@ static int vdi_open(BlockDriverState *bs, QDict *options, int flags,
     return 0;
 
  fail_free_bmap:
-    g_free(s->bmap);
+    qemu_vfree(s->bmap);
 
  fail:
     return ret;
@@ -681,8 +684,7 @@ static int vdi_co_write(BlockDriverState *bs,
 
 static int vdi_create(const char *filename, QemuOpts *opts, Error **errp)
 {
-    int fd;
-    int result = 0;
+    int ret = 0;
     uint64_t bytes = 0;
     uint32_t blocks;
     size_t block_size = DEFAULT_CLUSTER_SIZE;
@@ -690,7 +692,10 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp)
     VdiHeader header;
     size_t i;
     size_t bmap_size;
-    bool nocow = false;
+    int64_t offset = 0;
+    Error *local_err = NULL;
+    BlockDriverState *bs = NULL;
+    uint32_t *bmap = NULL;
 
     logout("\n");
 
@@ -707,37 +712,25 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp)
         image_type = VDI_TYPE_STATIC;
     }
 #endif
-    nocow = qemu_opt_get_bool_del(opts, BLOCK_OPT_NOCOW, false);
 
     if (bytes > VDI_DISK_SIZE_MAX) {
-        result = -ENOTSUP;
+        ret = -ENOTSUP;
         error_setg(errp, "Unsupported VDI image size (size is 0x%" PRIx64
                           ", max supported is 0x%" PRIx64 ")",
                           bytes, VDI_DISK_SIZE_MAX);
         goto exit;
     }
 
-    fd = qemu_open(filename,
-                   O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
-                   0644);
-    if (fd < 0) {
-        result = -errno;
+    ret = bdrv_create_file(filename, opts, &local_err);
+    if (ret < 0) {
+        error_propagate(errp, local_err);
         goto exit;
     }
-
-    if (nocow) {
-#ifdef __linux__
-        /* Set NOCOW flag to solve performance issue on fs like btrfs.
-         * This is an optimisation. The FS_IOC_SETFLAGS ioctl return value will
-         * be ignored since any failure of this operation should not block the
-         * left work.
-         */
-        int attr;
-        if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) {
-            attr |= FS_NOCOW_FL;
-            ioctl(fd, FS_IOC_SETFLAGS, &attr);
-        }
-#endif
+    ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
+                    NULL, &local_err);
+    if (ret < 0) {
+        error_propagate(errp, local_err);
+        goto exit;
     }
 
     /* We need enough blocks to store the given disk size,
@@ -769,13 +762,20 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp)
     vdi_header_print(&header);
 #endif
     vdi_header_to_le(&header);
-    if (write(fd, &header, sizeof(header)) < 0) {
-        result = -errno;
-        goto close_and_exit;
+    ret = bdrv_pwrite_sync(bs, offset, &header, sizeof(header));
+    if (ret < 0) {
+        error_setg(errp, "Error writing header to %s", filename);
+        goto exit;
     }
+    offset += sizeof(header);
 
     if (bmap_size > 0) {
-        uint32_t *bmap = g_malloc0(bmap_size);
+        bmap = g_try_malloc0(bmap_size);
+        if (bmap == NULL) {
+            ret = -ENOMEM;
+            error_setg(errp, "Could not allocate bmap");
+            goto exit;
+        }
         for (i = 0; i < blocks; i++) {
             if (image_type == VDI_TYPE_STATIC) {
                 bmap[i] = i;
@@ -783,35 +783,33 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp)
                 bmap[i] = VDI_UNALLOCATED;
             }
         }
-        if (write(fd, bmap, bmap_size) < 0) {
-            result = -errno;
-            g_free(bmap);
-            goto close_and_exit;
+        ret = bdrv_pwrite_sync(bs, offset, bmap, bmap_size);
+        if (ret < 0) {
+            error_setg(errp, "Error writing bmap to %s", filename);
+            goto exit;
         }
-        g_free(bmap);
+        offset += bmap_size;
     }
 
     if (image_type == VDI_TYPE_STATIC) {
-        if (ftruncate(fd, sizeof(header) + bmap_size + blocks * block_size)) {
-            result = -errno;
-            goto close_and_exit;
+        ret = bdrv_truncate(bs, offset + blocks * block_size);
+        if (ret < 0) {
+            error_setg(errp, "Failed to statically allocate %s", filename);
+            goto exit;
         }
     }
 
-close_and_exit:
-    if ((close(fd) < 0) && !result) {
-        result = -errno;
-    }
-
 exit:
-    return result;
+    bdrv_unref(bs);
+    g_free(bmap);
+    return ret;
 }
 
 static void vdi_close(BlockDriverState *bs)
 {
     BDRVVdiState *s = bs->opaque;
 
-    g_free(s->bmap);
+    qemu_vfree(s->bmap);
 
     migrate_del_blocker(s->migration_blocker);
     error_free(s->migration_blocker);
diff --git a/block/vhdx-endian.c b/block/vhdx-endian.c
index fe879ed995..0640d3f4a9 100644
--- a/block/vhdx-endian.c
+++ b/block/vhdx-endian.c
@@ -82,8 +82,6 @@ void vhdx_log_desc_le_import(VHDXLogDescriptor *d)
     assert(d != NULL);
 
     le32_to_cpus(&d->signature);
-    le32_to_cpus(&d->trailing_bytes);
-    le64_to_cpus(&d->leading_bytes);
     le64_to_cpus(&d->file_offset);
     le64_to_cpus(&d->sequence_number);
 }
@@ -99,6 +97,15 @@ void vhdx_log_desc_le_export(VHDXLogDescriptor *d)
     cpu_to_le64s(&d->sequence_number);
 }
 
+void vhdx_log_data_le_import(VHDXLogDataSector *d)
+{
+    assert(d != NULL);
+
+    le32_to_cpus(&d->data_signature);
+    le32_to_cpus(&d->sequence_high);
+    le32_to_cpus(&d->sequence_low);
+}
+
 void vhdx_log_data_le_export(VHDXLogDataSector *d)
 {
     assert(d != NULL);
diff --git a/block/vhdx-log.c b/block/vhdx-log.c
index a77c040ee0..eb5c7a097b 100644
--- a/block/vhdx-log.c
+++ b/block/vhdx-log.c
@@ -84,6 +84,7 @@ static int vhdx_log_peek_hdr(BlockDriverState *bs, VHDXLogEntries *log,
     if (ret < 0) {
         goto exit;
     }
+    vhdx_log_entry_hdr_le_import(hdr);
 
 exit:
     return ret;
@@ -211,7 +212,7 @@ static bool vhdx_log_hdr_is_valid(VHDXLogEntries *log, VHDXLogEntryHeader *hdr,
 {
     int valid = false;
 
-    if (memcmp(&hdr->signature, "loge", 4)) {
+    if (hdr->signature != VHDX_LOG_SIGNATURE) {
         goto exit;
     }
 
@@ -275,12 +276,12 @@ static bool vhdx_log_desc_is_valid(VHDXLogDescriptor *desc,
         goto exit;
     }
 
-    if (!memcmp(&desc->signature, "zero", 4)) {
+    if (desc->signature == VHDX_LOG_ZERO_SIGNATURE) {
         if (desc->zero_length % VHDX_LOG_SECTOR_SIZE == 0) {
             /* valid */
             ret = true;
         }
-    } else if (!memcmp(&desc->signature, "desc", 4)) {
+    } else if (desc->signature == VHDX_LOG_DESC_SIGNATURE) {
             /* valid */
             ret = true;
     }
@@ -327,13 +328,15 @@ static int vhdx_compute_desc_sectors(uint32_t desc_cnt)
  * passed into this function. Each descriptor will also be validated,
  * and error returned if any are invalid. */
 static int vhdx_log_read_desc(BlockDriverState *bs, BDRVVHDXState *s,
-                              VHDXLogEntries *log, VHDXLogDescEntries **buffer)
+                              VHDXLogEntries *log, VHDXLogDescEntries **buffer,
+                              bool convert_endian)
 {
     int ret = 0;
     uint32_t desc_sectors;
     uint32_t sectors_read;
     VHDXLogEntryHeader hdr;
     VHDXLogDescEntries *desc_entries = NULL;
+    VHDXLogDescriptor desc;
     int i;
 
     assert(*buffer == NULL);
@@ -342,14 +345,19 @@ static int vhdx_log_read_desc(BlockDriverState *bs, BDRVVHDXState *s,
     if (ret < 0) {
         goto exit;
     }
-    vhdx_log_entry_hdr_le_import(&hdr);
+
     if (vhdx_log_hdr_is_valid(log, &hdr, s) == false) {
         ret = -EINVAL;
         goto exit;
     }
 
     desc_sectors = vhdx_compute_desc_sectors(hdr.descriptor_count);
-    desc_entries = qemu_blockalign(bs, desc_sectors * VHDX_LOG_SECTOR_SIZE);
+    desc_entries = qemu_try_blockalign(bs->file,
+                                       desc_sectors * VHDX_LOG_SECTOR_SIZE);
+    if (desc_entries == NULL) {
+        ret = -ENOMEM;
+        goto exit;
+    }
 
     ret = vhdx_log_read_sectors(bs, log, &sectors_read, desc_entries,
                                 desc_sectors, false);
@@ -363,12 +371,19 @@ static int vhdx_log_read_desc(BlockDriverState *bs, BDRVVHDXState *s,
 
     /* put in proper endianness, and validate each desc */
     for (i = 0; i < hdr.descriptor_count; i++) {
-        vhdx_log_desc_le_import(&desc_entries->desc[i]);
-        if (vhdx_log_desc_is_valid(&desc_entries->desc[i], &hdr) == false) {
+        desc = desc_entries->desc[i];
+        vhdx_log_desc_le_import(&desc);
+        if (convert_endian) {
+            desc_entries->desc[i] = desc;
+        }
+        if (vhdx_log_desc_is_valid(&desc, &hdr) == false) {
             ret = -EINVAL;
             goto free_and_exit;
         }
     }
+    if (convert_endian) {
+        desc_entries->hdr = hdr;
+    }
 
     *buffer = desc_entries;
     goto exit;
@@ -403,7 +418,7 @@ static int vhdx_log_flush_desc(BlockDriverState *bs, VHDXLogDescriptor *desc,
 
     buffer = qemu_blockalign(bs, VHDX_LOG_SECTOR_SIZE);
 
-    if (!memcmp(&desc->signature, "desc", 4)) {
+    if (desc->signature == VHDX_LOG_DESC_SIGNATURE) {
         /* data sector */
         if (data == NULL) {
             ret = -EFAULT;
@@ -431,10 +446,15 @@ static int vhdx_log_flush_desc(BlockDriverState *bs, VHDXLogDescriptor *desc,
 
         memcpy(buffer+offset, &desc->trailing_bytes, 4);
 
-    } else if (!memcmp(&desc->signature, "zero", 4)) {
+    } else if (desc->signature == VHDX_LOG_ZERO_SIGNATURE) {
         /* write 'count' sectors of sector */
         memset(buffer, 0, VHDX_LOG_SECTOR_SIZE);
         count = desc->zero_length / VHDX_LOG_SECTOR_SIZE;
+    } else {
+        error_report("Invalid VHDX log descriptor entry signature 0x%" PRIx32,
+                      desc->signature);
+        ret = -EINVAL;
+        goto exit;
     }
 
     file_offset = desc->file_offset;
@@ -493,13 +513,13 @@ static int vhdx_log_flush(BlockDriverState *bs, BDRVVHDXState *s,
             goto exit;
         }
 
-        ret = vhdx_log_read_desc(bs, s, &logs->log, &desc_entries);
+        ret = vhdx_log_read_desc(bs, s, &logs->log, &desc_entries, true);
         if (ret < 0) {
             goto exit;
         }
 
         for (i = 0; i < desc_entries->hdr.descriptor_count; i++) {
-            if (!memcmp(&desc_entries->desc[i].signature, "desc", 4)) {
+            if (desc_entries->desc[i].signature == VHDX_LOG_DESC_SIGNATURE) {
                 /* data sector, so read a sector to flush */
                 ret = vhdx_log_read_sectors(bs, &logs->log, &sectors_read,
                                             data, 1, false);
@@ -510,6 +530,7 @@ static int vhdx_log_flush(BlockDriverState *bs, BDRVVHDXState *s,
                     ret = -EINVAL;
                     goto exit;
                 }
+                vhdx_log_data_le_import(data);
             }
 
             ret = vhdx_log_flush_desc(bs, &desc_entries->desc[i], data);
@@ -558,9 +579,6 @@ static int vhdx_validate_log_entry(BlockDriverState *bs, BDRVVHDXState *s,
         goto inc_and_exit;
     }
 
-    vhdx_log_entry_hdr_le_import(&hdr);
-
-
     if (vhdx_log_hdr_is_valid(log, &hdr, s) == false) {
         goto inc_and_exit;
     }
@@ -573,13 +591,13 @@ static int vhdx_validate_log_entry(BlockDriverState *bs, BDRVVHDXState *s,
 
     desc_sectors = vhdx_compute_desc_sectors(hdr.descriptor_count);
 
-    /* Read desc sectors, and calculate log checksum */
+    /* Read all log sectors, and calculate log checksum */
 
     total_sectors = hdr.entry_length / VHDX_LOG_SECTOR_SIZE;
 
 
     /* read_desc() will increment the read idx */
-    ret = vhdx_log_read_desc(bs, s, log, &desc_buffer);
+    ret = vhdx_log_read_desc(bs, s, log, &desc_buffer, false);
     if (ret < 0) {
         goto free_and_exit;
     }
@@ -602,7 +620,7 @@ static int vhdx_validate_log_entry(BlockDriverState *bs, BDRVVHDXState *s,
         }
     }
     crc ^= 0xffffffff;
-    if (crc != desc_buffer->hdr.checksum) {
+    if (crc != hdr.checksum) {
         goto free_and_exit;
     }
 
@@ -962,7 +980,6 @@ static int vhdx_log_write(BlockDriverState *bs, BDRVVHDXState *s,
      * last data sector */
     vhdx_update_checksum(buffer, total_length,
                          offsetof(VHDXLogEntryHeader, checksum));
-    cpu_to_le32s((uint32_t *)(buffer + 4));
 
     /* now write to the log */
     ret = vhdx_log_write_sectors(bs, &s->log, &sectors_written, buffer,
diff --git a/block/vhdx.c b/block/vhdx.c
index fedcf9f9ca..f666940db7 100644
--- a/block/vhdx.c
+++ b/block/vhdx.c
@@ -135,10 +135,8 @@ typedef struct VHDXSectorInfo {
  * buf: buffer pointer
  * size: size of buffer (must be > crc_offset+4)
  *
- * Note: The resulting checksum is in the CPU endianness, not necessarily
- *       in the file format endianness (LE).  Any header export to disk should
- *       make sure that vhdx_header_le_export() is used to convert to the
- *       correct endianness
+ * Note: The buffer should have all multi-byte data in little-endian format,
+ *       and the resulting checksum is in little endian format.
  */
 uint32_t vhdx_update_checksum(uint8_t *buf, size_t size, int crc_offset)
 {
@@ -149,6 +147,7 @@ uint32_t vhdx_update_checksum(uint8_t *buf, size_t size, int crc_offset)
 
     memset(buf + crc_offset, 0, sizeof(crc));
     crc =  crc32c(0xffffffff, buf, size);
+    cpu_to_le32s(&crc);
     memcpy(buf + crc_offset, &crc, sizeof(crc));
 
     return crc;
@@ -300,7 +299,7 @@ static int vhdx_write_header(BlockDriverState *bs_file, VHDXHeader *hdr,
 {
     uint8_t *buffer = NULL;
     int ret;
-    VHDXHeader header_le;
+    VHDXHeader *header_le;
 
     assert(bs_file != NULL);
     assert(hdr != NULL);
@@ -321,11 +320,12 @@ static int vhdx_write_header(BlockDriverState *bs_file, VHDXHeader *hdr,
     }
 
     /* overwrite the actual VHDXHeader portion */
-    memcpy(buffer, hdr, sizeof(VHDXHeader));
-    hdr->checksum = vhdx_update_checksum(buffer, VHDX_HEADER_SIZE,
-                                         offsetof(VHDXHeader, checksum));
-    vhdx_header_le_export(hdr, &header_le);
-    ret = bdrv_pwrite_sync(bs_file, offset, &header_le, sizeof(VHDXHeader));
+    header_le = (VHDXHeader *)buffer;
+    memcpy(header_le, hdr, sizeof(VHDXHeader));
+    vhdx_header_le_export(hdr, header_le);
+    vhdx_update_checksum(buffer, VHDX_HEADER_SIZE,
+                         offsetof(VHDXHeader, checksum));
+    ret = bdrv_pwrite_sync(bs_file, offset, header_le, sizeof(VHDXHeader));
 
 exit:
     qemu_vfree(buffer);
@@ -432,13 +432,14 @@ static void vhdx_parse_header(BlockDriverState *bs, BDRVVHDXState *s,
     }
     /* copy over just the relevant portion that we need */
     memcpy(header1, buffer, sizeof(VHDXHeader));
-    vhdx_header_le_import(header1);
 
-    if (vhdx_checksum_is_valid(buffer, VHDX_HEADER_SIZE, 4) &&
-        !memcmp(&header1->signature, "head", 4)             &&
-        header1->version == 1) {
-        h1_seq = header1->sequence_number;
-        h1_valid = true;
+    if (vhdx_checksum_is_valid(buffer, VHDX_HEADER_SIZE, 4)) {
+        vhdx_header_le_import(header1);
+        if (header1->signature == VHDX_HEADER_SIGNATURE &&
+            header1->version == 1) {
+            h1_seq = header1->sequence_number;
+            h1_valid = true;
+        }
     }
 
     ret = bdrv_pread(bs->file, VHDX_HEADER2_OFFSET, buffer, VHDX_HEADER_SIZE);
@@ -447,13 +448,14 @@ static void vhdx_parse_header(BlockDriverState *bs, BDRVVHDXState *s,
     }
     /* copy over just the relevant portion that we need */
     memcpy(header2, buffer, sizeof(VHDXHeader));
-    vhdx_header_le_import(header2);
 
-    if (vhdx_checksum_is_valid(buffer, VHDX_HEADER_SIZE, 4) &&
-        !memcmp(&header2->signature, "head", 4)             &&
-        header2->version == 1) {
-        h2_seq = header2->sequence_number;
-        h2_valid = true;
+    if (vhdx_checksum_is_valid(buffer, VHDX_HEADER_SIZE, 4)) {
+        vhdx_header_le_import(header2);
+        if (header2->signature == VHDX_HEADER_SIGNATURE &&
+            header2->version == 1) {
+            h2_seq = header2->sequence_number;
+            h2_valid = true;
+        }
     }
 
     /* If there is only 1 valid header (or no valid headers), we
@@ -519,15 +521,21 @@ static int vhdx_open_region_tables(BlockDriverState *bs, BDRVVHDXState *s)
         goto fail;
     }
     memcpy(&s->rt, buffer, sizeof(s->rt));
-    vhdx_region_header_le_import(&s->rt);
     offset += sizeof(s->rt);
 
-    if (!vhdx_checksum_is_valid(buffer, VHDX_HEADER_BLOCK_SIZE, 4) ||
-        memcmp(&s->rt.signature, "regi", 4)) {
+    if (!vhdx_checksum_is_valid(buffer, VHDX_HEADER_BLOCK_SIZE, 4)) {
         ret = -EINVAL;
         goto fail;
     }
 
+    vhdx_region_header_le_import(&s->rt);
+
+    if (s->rt.signature != VHDX_REGION_SIGNATURE) {
+        ret = -EINVAL;
+        goto fail;
+    }
+
+
     /* Per spec, maximum region table entry count is 2047 */
     if (s->rt.entry_count > 2047) {
         ret = -EINVAL;
@@ -630,7 +638,7 @@ static int vhdx_parse_metadata(BlockDriverState *bs, BDRVVHDXState *s)
 
     vhdx_metadata_header_le_import(&s->metadata_hdr);
 
-    if (memcmp(&s->metadata_hdr.signature, "metadata", 8)) {
+    if (s->metadata_hdr.signature != VHDX_METADATA_SIGNATURE) {
         ret = -EINVAL;
         goto exit;
     }
@@ -950,7 +958,11 @@ static int vhdx_open(BlockDriverState *bs, QDict *options, int flags,
     }
 
     /* s->bat is freed in vhdx_close() */
-    s->bat = qemu_blockalign(bs, s->bat_rt.length);
+    s->bat = qemu_try_blockalign(bs->file, s->bat_rt.length);
+    if (s->bat == NULL) {
+        ret = -ENOMEM;
+        goto fail;
+    }
 
     ret = bdrv_pread(bs->file, s->bat_offset, s->bat, s->bat_rt.length);
     if (ret < 0) {
@@ -1540,7 +1552,8 @@ exit:
  */
 static int vhdx_create_bat(BlockDriverState *bs, BDRVVHDXState *s,
                            uint64_t image_size, VHDXImageType type,
-                           bool use_zero_blocks, VHDXRegionTableEntry *rt_bat)
+                           bool use_zero_blocks, uint64_t file_offset,
+                           uint32_t length)
 {
     int ret = 0;
     uint64_t data_file_offset;
@@ -1555,7 +1568,7 @@ static int vhdx_create_bat(BlockDriverState *bs, BDRVVHDXState *s,
     /* this gives a data start after BAT/bitmap entries, and well
      * past any metadata entries (with a 4 MB buffer for future
      * expansion */
-    data_file_offset = rt_bat->file_offset + rt_bat->length + 5 * MiB;
+    data_file_offset = file_offset + length + 5 * MiB;
     total_sectors = image_size >> s->logical_sector_size_bits;
 
     if (type == VHDX_TYPE_DYNAMIC) {
@@ -1579,7 +1592,11 @@ static int vhdx_create_bat(BlockDriverState *bs, BDRVVHDXState *s,
                 use_zero_blocks ||
                 bdrv_has_zero_init(bs) == 0) {
         /* for a fixed file, the default BAT entry is not zero */
-        s->bat = g_malloc0(rt_bat->length);
+        s->bat = g_try_malloc0(length);
+        if (length && s->bat != NULL) {
+            ret = -ENOMEM;
+            goto exit;
+        }
         block_state = type == VHDX_TYPE_FIXED ? PAYLOAD_BLOCK_FULLY_PRESENT :
                                                 PAYLOAD_BLOCK_NOT_PRESENT;
         block_state = use_zero_blocks ? PAYLOAD_BLOCK_ZERO : block_state;
@@ -1594,7 +1611,7 @@ static int vhdx_create_bat(BlockDriverState *bs, BDRVVHDXState *s,
             cpu_to_le64s(&s->bat[sinfo.bat_idx]);
             sector_num += s->sectors_per_block;
         }
-        ret = bdrv_pwrite(bs, rt_bat->file_offset, s->bat, rt_bat->length);
+        ret = bdrv_pwrite(bs, file_offset, s->bat, length);
         if (ret < 0) {
             goto exit;
         }
@@ -1626,6 +1643,8 @@ static int vhdx_create_new_region_table(BlockDriverState *bs,
     int ret = 0;
     uint32_t offset = 0;
     void *buffer = NULL;
+    uint64_t bat_file_offset;
+    uint32_t bat_length;
     BDRVVHDXState *s = NULL;
     VHDXRegionTableHeader *region_table;
     VHDXRegionTableEntry *rt_bat;
@@ -1674,19 +1693,26 @@ static int vhdx_create_new_region_table(BlockDriverState *bs,
     rt_metadata->length      = 1 * MiB; /* min size, and more than enough */
     *metadata_offset = rt_metadata->file_offset;
 
+    bat_file_offset = rt_bat->file_offset;
+    bat_length = rt_bat->length;
+
+    vhdx_region_header_le_export(region_table);
+    vhdx_region_entry_le_export(rt_bat);
+    vhdx_region_entry_le_export(rt_metadata);
+
     vhdx_update_checksum(buffer, VHDX_HEADER_BLOCK_SIZE,
                          offsetof(VHDXRegionTableHeader, checksum));
 
 
     /* The region table gives us the data we need to create the BAT,
      * so do that now */
-    ret = vhdx_create_bat(bs, s, image_size, type, use_zero_blocks, rt_bat);
+    ret = vhdx_create_bat(bs, s, image_size, type, use_zero_blocks,
+                          bat_file_offset, bat_length);
+    if (ret < 0) {
+        goto exit;
+    }
 
     /* Now write out the region headers to disk */
-    vhdx_region_header_le_export(region_table);
-    vhdx_region_entry_le_export(rt_bat);
-    vhdx_region_entry_le_export(rt_metadata);
-
     ret = bdrv_pwrite(bs, VHDX_REGION_TABLE_OFFSET, buffer,
                       VHDX_HEADER_BLOCK_SIZE);
     if (ret < 0) {
diff --git a/block/vhdx.h b/block/vhdx.h
index 5370010c59..b4a12a081e 100644
--- a/block/vhdx.h
+++ b/block/vhdx.h
@@ -435,6 +435,7 @@ void vhdx_header_le_import(VHDXHeader *h);
 void vhdx_header_le_export(VHDXHeader *orig_h, VHDXHeader *new_h);
 void vhdx_log_desc_le_import(VHDXLogDescriptor *d);
 void vhdx_log_desc_le_export(VHDXLogDescriptor *d);
+void vhdx_log_data_le_import(VHDXLogDataSector *d);
 void vhdx_log_data_le_export(VHDXLogDataSector *d);
 void vhdx_log_entry_hdr_le_import(VHDXLogEntryHeader *hdr);
 void vhdx_log_entry_hdr_le_export(VHDXLogEntryHeader *hdr);
diff --git a/block/vmdk.c b/block/vmdk.c
index 0517bbaf91..01412a8939 100644
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -106,6 +106,7 @@ typedef struct VmdkExtent {
     uint32_t l2_cache_counts[L2_CACHE_SIZE];
 
     int64_t cluster_sectors;
+    int64_t next_cluster_sector;
     char *type;
 } VmdkExtent;
 
@@ -124,7 +125,6 @@ typedef struct BDRVVmdkState {
 } BDRVVmdkState;
 
 typedef struct VmdkMetaData {
-    uint32_t offset;
     unsigned int l1_index;
     unsigned int l2_index;
     unsigned int l2_offset;
@@ -397,6 +397,7 @@ static int vmdk_add_extent(BlockDriverState *bs,
 {
     VmdkExtent *extent;
     BDRVVmdkState *s = bs->opaque;
+    int64_t length;
 
     if (cluster_sectors > 0x200000) {
         /* 0x200000 * 512Bytes = 1GB for one cluster is unrealistic */
@@ -412,6 +413,11 @@ static int vmdk_add_extent(BlockDriverState *bs,
         return -EFBIG;
     }
 
+    length = bdrv_getlength(file);
+    if (length < 0) {
+        return length;
+    }
+
     s->extents = g_realloc(s->extents,
                               (s->num_extents + 1) * sizeof(VmdkExtent));
     extent = &s->extents[s->num_extents];
@@ -427,6 +433,8 @@ static int vmdk_add_extent(BlockDriverState *bs,
     extent->l1_entry_sectors = l2_size * cluster_sectors;
     extent->l2_size = l2_size;
     extent->cluster_sectors = flat ? sectors : cluster_sectors;
+    extent->next_cluster_sector =
+        ROUND_UP(DIV_ROUND_UP(length, BDRV_SECTOR_SIZE), cluster_sectors);
 
     if (s->num_extents > 1) {
         extent->end_sector = (*(extent - 1)).end_sector + extent->sectors;
@@ -448,7 +456,11 @@ static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent,
 
     /* read the L1 table */
     l1_size = extent->l1_size * sizeof(uint32_t);
-    extent->l1_table = g_malloc(l1_size);
+    extent->l1_table = g_try_malloc(l1_size);
+    if (l1_size && extent->l1_table == NULL) {
+        return -ENOMEM;
+    }
+
     ret = bdrv_pread(extent->file,
                      extent->l1_table_offset,
                      extent->l1_table,
@@ -464,7 +476,11 @@ static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent,
     }
 
     if (extent->l1_backup_table_offset) {
-        extent->l1_backup_table = g_malloc(l1_size);
+        extent->l1_backup_table = g_try_malloc(l1_size);
+        if (l1_size && extent->l1_backup_table == NULL) {
+            ret = -ENOMEM;
+            goto fail_l1;
+        }
         ret = bdrv_pread(extent->file,
                          extent->l1_backup_table_offset,
                          extent->l1_backup_table,
@@ -669,8 +685,7 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
     if (le32_to_cpu(header.flags) & VMDK4_FLAG_RGD) {
         l1_backup_offset = le64_to_cpu(header.rgd_offset) << 9;
     }
-    if (bdrv_getlength(file) <
-            le64_to_cpu(header.grain_offset) * BDRV_SECTOR_SIZE) {
+    if (bdrv_nb_sectors(file) < le64_to_cpu(header.grain_offset)) {
         error_setg(errp, "File truncated, expecting at least %" PRId64 " bytes",
                    (int64_t)(le64_to_cpu(header.grain_offset)
                              * BDRV_SECTOR_SIZE));
@@ -952,57 +967,97 @@ static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp)
     }
 }
 
+/**
+ * get_whole_cluster
+ *
+ * Copy backing file's cluster that covers @sector_num, otherwise write zero,
+ * to the cluster at @cluster_sector_num.
+ *
+ * If @skip_start_sector < @skip_end_sector, the relative range
+ * [@skip_start_sector, @skip_end_sector) is not copied or written, and leave
+ * it for call to write user data in the request.
+ */
 static int get_whole_cluster(BlockDriverState *bs,
-                VmdkExtent *extent,
-                uint64_t cluster_offset,
-                uint64_t offset,
-                bool allocate)
+                             VmdkExtent *extent,
+                             uint64_t cluster_sector_num,
+                             uint64_t sector_num,
+                             uint64_t skip_start_sector,
+                             uint64_t skip_end_sector)
 {
     int ret = VMDK_OK;
-    uint8_t *whole_grain = NULL;
+    int64_t cluster_bytes;
+    uint8_t *whole_grain;
 
+    /* For COW, align request sector_num to cluster start */
+    sector_num = QEMU_ALIGN_DOWN(sector_num, extent->cluster_sectors);
+    cluster_bytes = extent->cluster_sectors << BDRV_SECTOR_BITS;
+    whole_grain = qemu_blockalign(bs, cluster_bytes);
+
+    if (!bs->backing_hd) {
+        memset(whole_grain, 0,  skip_start_sector << BDRV_SECTOR_BITS);
+        memset(whole_grain + (skip_end_sector << BDRV_SECTOR_BITS), 0,
+               cluster_bytes - (skip_end_sector << BDRV_SECTOR_BITS));
+    }
+
+    assert(skip_end_sector <= extent->cluster_sectors);
     /* we will be here if it's first write on non-exist grain(cluster).
      * try to read from parent image, if exist */
-    if (bs->backing_hd) {
-        whole_grain =
-            qemu_blockalign(bs, extent->cluster_sectors << BDRV_SECTOR_BITS);
-        if (!vmdk_is_cid_valid(bs)) {
-            ret = VMDK_ERROR;
-            goto exit;
-        }
+    if (bs->backing_hd && !vmdk_is_cid_valid(bs)) {
+        ret = VMDK_ERROR;
+        goto exit;
+    }
 
-        /* floor offset to cluster */
-        offset -= offset % (extent->cluster_sectors * 512);
-        ret = bdrv_read(bs->backing_hd, offset >> 9, whole_grain,
-                extent->cluster_sectors);
+    /* Read backing data before skip range */
+    if (skip_start_sector > 0) {
+        if (bs->backing_hd) {
+            ret = bdrv_read(bs->backing_hd, sector_num,
+                            whole_grain, skip_start_sector);
+            if (ret < 0) {
+                ret = VMDK_ERROR;
+                goto exit;
+            }
+        }
+        ret = bdrv_write(extent->file, cluster_sector_num, whole_grain,
+                         skip_start_sector);
         if (ret < 0) {
             ret = VMDK_ERROR;
             goto exit;
         }
-
-        /* Write grain only into the active image */
-        ret = bdrv_write(extent->file, cluster_offset, whole_grain,
-                extent->cluster_sectors);
+    }
+    /* Read backing data after skip range */
+    if (skip_end_sector < extent->cluster_sectors) {
+        if (bs->backing_hd) {
+            ret = bdrv_read(bs->backing_hd, sector_num + skip_end_sector,
+                            whole_grain + (skip_end_sector << BDRV_SECTOR_BITS),
+                            extent->cluster_sectors - skip_end_sector);
+            if (ret < 0) {
+                ret = VMDK_ERROR;
+                goto exit;
+            }
+        }
+        ret = bdrv_write(extent->file, cluster_sector_num + skip_end_sector,
+                         whole_grain + (skip_end_sector << BDRV_SECTOR_BITS),
+                         extent->cluster_sectors - skip_end_sector);
         if (ret < 0) {
             ret = VMDK_ERROR;
             goto exit;
         }
     }
+
 exit:
     qemu_vfree(whole_grain);
     return ret;
 }
 
-static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
+static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data,
+                         uint32_t offset)
 {
-    uint32_t offset;
-    QEMU_BUILD_BUG_ON(sizeof(offset) != sizeof(m_data->offset));
-    offset = cpu_to_le32(m_data->offset);
+    offset = cpu_to_le32(offset);
     /* update L2 table */
     if (bdrv_pwrite_sync(
                 extent->file,
                 ((int64_t)m_data->l2_offset * 512)
-                    + (m_data->l2_index * sizeof(m_data->offset)),
+                    + (m_data->l2_index * sizeof(offset)),
                 &offset, sizeof(offset)) < 0) {
         return VMDK_ERROR;
     }
@@ -1012,7 +1067,7 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
         if (bdrv_pwrite_sync(
                     extent->file,
                     ((int64_t)m_data->l2_offset * 512)
-                        + (m_data->l2_index * sizeof(m_data->offset)),
+                        + (m_data->l2_index * sizeof(offset)),
                     &offset, sizeof(offset)) < 0) {
             return VMDK_ERROR;
         }
@@ -1024,17 +1079,41 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
     return VMDK_OK;
 }
 
+/**
+ * get_cluster_offset
+ *
+ * Look up cluster offset in extent file by sector number, and store in
+ * @cluster_offset.
+ *
+ * For flat extents, the start offset as parsed from the description file is
+ * returned.
+ *
+ * For sparse extents, look up in L1, L2 table. If allocate is true, return an
+ * offset for a new cluster and update L2 cache. If there is a backing file,
+ * COW is done before returning; otherwise, zeroes are written to the allocated
+ * cluster. Both COW and zero writing skips the sector range
+ * [@skip_start_sector, @skip_end_sector) passed in by caller, because caller
+ * has new data to write there.
+ *
+ * Returns: VMDK_OK if cluster exists and mapped in the image.
+ *          VMDK_UNALLOC if cluster is not mapped and @allocate is false.
+ *          VMDK_ERROR if failed.
+ */
 static int get_cluster_offset(BlockDriverState *bs,
-                                    VmdkExtent *extent,
-                                    VmdkMetaData *m_data,
-                                    uint64_t offset,
-                                    int allocate,
-                                    uint64_t *cluster_offset)
+                              VmdkExtent *extent,
+                              VmdkMetaData *m_data,
+                              uint64_t offset,
+                              bool allocate,
+                              uint64_t *cluster_offset,
+                              uint64_t skip_start_sector,
+                              uint64_t skip_end_sector)
 {
     unsigned int l1_index, l2_offset, l2_index;
     int min_index, i, j;
     uint32_t min_count, *l2_table;
     bool zeroed = false;
+    int64_t ret;
+    int32_t cluster_sector;
 
     if (m_data) {
         m_data->valid = 0;
@@ -1088,52 +1167,41 @@ static int get_cluster_offset(BlockDriverState *bs,
     extent->l2_cache_counts[min_index] = 1;
  found:
     l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size;
-    *cluster_offset = le32_to_cpu(l2_table[l2_index]);
+    cluster_sector = le32_to_cpu(l2_table[l2_index]);
 
     if (m_data) {
         m_data->valid = 1;
         m_data->l1_index = l1_index;
         m_data->l2_index = l2_index;
-        m_data->offset = *cluster_offset;
         m_data->l2_offset = l2_offset;
         m_data->l2_cache_entry = &l2_table[l2_index];
     }
-    if (extent->has_zero_grain && *cluster_offset == VMDK_GTE_ZEROED) {
+    if (extent->has_zero_grain && cluster_sector == VMDK_GTE_ZEROED) {
         zeroed = true;
     }
 
-    if (!*cluster_offset || zeroed) {
+    if (!cluster_sector || zeroed) {
         if (!allocate) {
             return zeroed ? VMDK_ZEROED : VMDK_UNALLOC;
         }
 
-        /* Avoid the L2 tables update for the images that have snapshots. */
-        *cluster_offset = bdrv_getlength(extent->file);
-        if (!extent->compressed) {
-            bdrv_truncate(
-                extent->file,
-                *cluster_offset + (extent->cluster_sectors << 9)
-            );
-        }
-
-        *cluster_offset >>= 9;
-        l2_table[l2_index] = cpu_to_le32(*cluster_offset);
+        cluster_sector = extent->next_cluster_sector;
+        extent->next_cluster_sector += extent->cluster_sectors;
 
         /* First of all we write grain itself, to avoid race condition
          * that may to corrupt the image.
          * This problem may occur because of insufficient space on host disk
          * or inappropriate VM shutdown.
          */
-        if (get_whole_cluster(
-                bs, extent, *cluster_offset, offset, allocate) == -1) {
-            return VMDK_ERROR;
-        }
-
-        if (m_data) {
-            m_data->offset = *cluster_offset;
+        ret = get_whole_cluster(bs, extent,
+                                cluster_sector,
+                                offset >> BDRV_SECTOR_BITS,
+                                skip_start_sector, skip_end_sector);
+        if (ret) {
+            return ret;
         }
     }
-    *cluster_offset <<= 9;
+    *cluster_offset = cluster_sector << BDRV_SECTOR_BITS;
     return VMDK_OK;
 }
 
@@ -1168,7 +1236,8 @@ static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs,
     }
     qemu_co_mutex_lock(&s->lock);
     ret = get_cluster_offset(bs, extent, NULL,
-                            sector_num * 512, 0, &offset);
+                             sector_num * 512, false, &offset,
+                             0, 0);
     qemu_co_mutex_unlock(&s->lock);
 
     switch (ret) {
@@ -1321,9 +1390,9 @@ static int vmdk_read(BlockDriverState *bs, int64_t sector_num,
         if (!extent) {
             return -EIO;
         }
-        ret = get_cluster_offset(
-                            bs, extent, NULL,
-                            sector_num << 9, 0, &cluster_offset);
+        ret = get_cluster_offset(bs, extent, NULL,
+                                 sector_num << 9, false, &cluster_offset,
+                                 0, 0);
         extent_begin_sector = extent->end_sector - extent->sectors;
         extent_relative_sector_num = sector_num - extent_begin_sector;
         index_in_cluster = extent_relative_sector_num % extent->cluster_sectors;
@@ -1404,12 +1473,17 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
         if (!extent) {
             return -EIO;
         }
-        ret = get_cluster_offset(
-                                bs,
-                                extent,
-                                &m_data,
-                                sector_num << 9, !extent->compressed,
-                                &cluster_offset);
+        extent_begin_sector = extent->end_sector - extent->sectors;
+        extent_relative_sector_num = sector_num - extent_begin_sector;
+        index_in_cluster = extent_relative_sector_num % extent->cluster_sectors;
+        n = extent->cluster_sectors - index_in_cluster;
+        if (n > nb_sectors) {
+            n = nb_sectors;
+        }
+        ret = get_cluster_offset(bs, extent, &m_data, sector_num << 9,
+                                 !(extent->compressed || zeroed),
+                                 &cluster_offset,
+                                 index_in_cluster, index_in_cluster + n);
         if (extent->compressed) {
             if (ret == VMDK_OK) {
                 /* Refuse write to allocated cluster for streamOptimized */
@@ -1418,24 +1492,13 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
                 return -EIO;
             } else {
                 /* allocate */
-                ret = get_cluster_offset(
-                                        bs,
-                                        extent,
-                                        &m_data,
-                                        sector_num << 9, 1,
-                                        &cluster_offset);
+                ret = get_cluster_offset(bs, extent, &m_data, sector_num << 9,
+                                         true, &cluster_offset, 0, 0);
             }
         }
         if (ret == VMDK_ERROR) {
             return -EINVAL;
         }
-        extent_begin_sector = extent->end_sector - extent->sectors;
-        extent_relative_sector_num = sector_num - extent_begin_sector;
-        index_in_cluster = extent_relative_sector_num % extent->cluster_sectors;
-        n = extent->cluster_sectors - index_in_cluster;
-        if (n > nb_sectors) {
-            n = nb_sectors;
-        }
         if (zeroed) {
             /* Do zeroed write, buf is ignored */
             if (extent->has_zero_grain &&
@@ -1443,9 +1506,9 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
                     n >= extent->cluster_sectors) {
                 n = extent->cluster_sectors;
                 if (!zero_dry_run) {
-                    m_data.offset = VMDK_GTE_ZEROED;
                     /* update L2 tables */
-                    if (vmdk_L2update(extent, &m_data) != VMDK_OK) {
+                    if (vmdk_L2update(extent, &m_data, VMDK_GTE_ZEROED)
+                            != VMDK_OK) {
                         return -EIO;
                     }
                 }
@@ -1461,7 +1524,9 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
             }
             if (m_data.valid) {
                 /* update L2 tables */
-                if (vmdk_L2update(extent, &m_data) != VMDK_OK) {
+                if (vmdk_L2update(extent, &m_data,
+                                  cluster_offset >> BDRV_SECTOR_BITS)
+                        != VMDK_OK) {
                     return -EIO;
                 }
             }
@@ -1999,7 +2064,7 @@ static int vmdk_check(BlockDriverState *bs, BdrvCheckResult *result,
     BDRVVmdkState *s = bs->opaque;
     VmdkExtent *extent = NULL;
     int64_t sector_num = 0;
-    int64_t total_sectors = bdrv_getlength(bs) / BDRV_SECTOR_SIZE;
+    int64_t total_sectors = bdrv_nb_sectors(bs);
     int ret;
     uint64_t cluster_offset;
 
@@ -2020,7 +2085,7 @@ static int vmdk_check(BlockDriverState *bs, BdrvCheckResult *result,
         }
         ret = get_cluster_offset(bs, extent, NULL,
                                  sector_num << BDRV_SECTOR_BITS,
-                                 0, &cluster_offset);
+                                 false, &cluster_offset, 0, 0);
         if (ret == VMDK_ERROR) {
             fprintf(stderr,
                     "ERROR: could not get cluster_offset for sector %"
diff --git a/block/vpc.c b/block/vpc.c
index 8b376a40be..055efc42d2 100644
--- a/block/vpc.c
+++ b/block/vpc.c
@@ -29,13 +29,6 @@
 #if defined(CONFIG_UUID)
 #include <uuid/uuid.h>
 #endif
-#ifdef __linux__
-#include <linux/fs.h>
-#include <sys/ioctl.h>
-#ifndef FS_NOCOW_FL
-#define FS_NOCOW_FL                     0x00800000 /* Do not cow file */
-#endif
-#endif
 
 /**************************************************************/
 
@@ -276,7 +269,11 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
             goto fail;
         }
 
-        s->pagetable = qemu_blockalign(bs, s->max_table_entries * 4);
+        s->pagetable = qemu_try_blockalign(bs->file, s->max_table_entries * 4);
+        if (s->pagetable == NULL) {
+            ret = -ENOMEM;
+            goto fail;
+        }
 
         s->bat_offset = be64_to_cpu(dyndisk_header->table_offset);
 
@@ -656,39 +653,41 @@ static int calculate_geometry(int64_t total_sectors, uint16_t* cyls,
     return 0;
 }
 
-static int create_dynamic_disk(int fd, uint8_t *buf, int64_t total_sectors)
+static int create_dynamic_disk(BlockDriverState *bs, uint8_t *buf,
+                               int64_t total_sectors)
 {
     VHDDynDiskHeader *dyndisk_header =
         (VHDDynDiskHeader *) buf;
     size_t block_size, num_bat_entries;
     int i;
-    int ret = -EIO;
+    int ret;
+    int64_t offset = 0;
 
     // Write the footer (twice: at the beginning and at the end)
     block_size = 0x200000;
     num_bat_entries = (total_sectors + block_size / 512) / (block_size / 512);
 
-    if (write(fd, buf, HEADER_SIZE) != HEADER_SIZE) {
+    ret = bdrv_pwrite_sync(bs, offset, buf, HEADER_SIZE);
+    if (ret) {
         goto fail;
     }
 
-    if (lseek(fd, 1536 + ((num_bat_entries * 4 + 511) & ~511), SEEK_SET) < 0) {
-        goto fail;
-    }
-    if (write(fd, buf, HEADER_SIZE) != HEADER_SIZE) {
+    offset = 1536 + ((num_bat_entries * 4 + 511) & ~511);
+    ret = bdrv_pwrite_sync(bs, offset, buf, HEADER_SIZE);
+    if (ret < 0) {
         goto fail;
     }
 
     // Write the initial BAT
-    if (lseek(fd, 3 * 512, SEEK_SET) < 0) {
-        goto fail;
-    }
+    offset = 3 * 512;
 
     memset(buf, 0xFF, 512);
     for (i = 0; i < (num_bat_entries * 4 + 511) / 512; i++) {
-        if (write(fd, buf, 512) != 512) {
+        ret = bdrv_pwrite_sync(bs, offset, buf, 512);
+        if (ret < 0) {
             goto fail;
         }
+        offset += 512;
     }
 
     // Prepare the Dynamic Disk Header
@@ -709,39 +708,35 @@ static int create_dynamic_disk(int fd, uint8_t *buf, int64_t total_sectors)
     dyndisk_header->checksum = be32_to_cpu(vpc_checksum(buf, 1024));
 
     // Write the header
-    if (lseek(fd, 512, SEEK_SET) < 0) {
-        goto fail;
-    }
+    offset = 512;
 
-    if (write(fd, buf, 1024) != 1024) {
+    ret = bdrv_pwrite_sync(bs, offset, buf, 1024);
+    if (ret < 0) {
         goto fail;
     }
-    ret = 0;
 
  fail:
     return ret;
 }
 
-static int create_fixed_disk(int fd, uint8_t *buf, int64_t total_size)
+static int create_fixed_disk(BlockDriverState *bs, uint8_t *buf,
+                             int64_t total_size)
 {
-    int ret = -EIO;
+    int ret;
 
     /* Add footer to total size */
-    total_size += 512;
-    if (ftruncate(fd, total_size) != 0) {
-        ret = -errno;
-        goto fail;
-    }
-    if (lseek(fd, -512, SEEK_END) < 0) {
-        goto fail;
-    }
-    if (write(fd, buf, HEADER_SIZE) != HEADER_SIZE) {
-        goto fail;
+    total_size += HEADER_SIZE;
+
+    ret = bdrv_truncate(bs, total_size);
+    if (ret < 0) {
+        return ret;
     }
 
-    ret = 0;
+    ret = bdrv_pwrite_sync(bs, total_size - HEADER_SIZE, buf, HEADER_SIZE);
+    if (ret < 0) {
+        return ret;
+    }
 
- fail:
     return ret;
 }
 
@@ -750,7 +745,7 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp)
     uint8_t buf[1024];
     VHDFooter *footer = (VHDFooter *) buf;
     char *disk_type_param;
-    int fd, i;
+    int i;
     uint16_t cyls = 0;
     uint8_t heads = 0;
     uint8_t secs_per_cyl = 0;
@@ -758,7 +753,8 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp)
     int64_t total_size;
     int disk_type;
     int ret = -EIO;
-    bool nocow = false;
+    Error *local_err = NULL;
+    BlockDriverState *bs = NULL;
 
     /* Read out options */
     total_size = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0);
@@ -775,28 +771,17 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp)
     } else {
         disk_type = VHD_DYNAMIC;
     }
-    nocow = qemu_opt_get_bool_del(opts, BLOCK_OPT_NOCOW, false);
 
-    /* Create the file */
-    fd = qemu_open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644);
-    if (fd < 0) {
-        ret = -EIO;
+    ret = bdrv_create_file(filename, opts, &local_err);
+    if (ret < 0) {
+        error_propagate(errp, local_err);
         goto out;
     }
-
-    if (nocow) {
-#ifdef __linux__
-        /* Set NOCOW flag to solve performance issue on fs like btrfs.
-         * This is an optimisation. The FS_IOC_SETFLAGS ioctl return value will
-         * be ignored since any failure of this operation should not block the
-         * left work.
-         */
-        int attr;
-        if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) {
-            attr |= FS_NOCOW_FL;
-            ioctl(fd, FS_IOC_SETFLAGS, &attr);
-        }
-#endif
+    ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
+                    NULL, &local_err);
+    if (ret < 0) {
+        error_propagate(errp, local_err);
+        goto out;
     }
 
     /*
@@ -810,7 +795,7 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp)
                                &secs_per_cyl))
         {
             ret = -EFBIG;
-            goto fail;
+            goto out;
         }
     }
 
@@ -856,14 +841,13 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp)
     footer->checksum = be32_to_cpu(vpc_checksum(buf, HEADER_SIZE));
 
     if (disk_type == VHD_DYNAMIC) {
-        ret = create_dynamic_disk(fd, buf, total_sectors);
+        ret = create_dynamic_disk(bs, buf, total_sectors);
     } else {
-        ret = create_fixed_disk(fd, buf, total_size);
+        ret = create_fixed_disk(bs, buf, total_size);
     }
 
-fail:
-    qemu_close(fd);
 out:
+    bdrv_unref(bs);
     g_free(disk_type_param);
     return ret;
 }
diff --git a/block/win32-aio.c b/block/win32-aio.c
index 8e417f70ae..5030e3274d 100644
--- a/block/win32-aio.c
+++ b/block/win32-aio.c
@@ -139,7 +139,10 @@ BlockDriverAIOCB *win32_aio_submit(BlockDriverState *bs,
     waiocb->is_read = (type == QEMU_AIO_READ);
 
     if (qiov->niov > 1) {
-        waiocb->buf = qemu_blockalign(bs, qiov->size);
+        waiocb->buf = qemu_try_blockalign(bs, qiov->size);
+        if (waiocb->buf == NULL) {
+            goto out;
+        }
         if (type & QEMU_AIO_WRITE) {
             iov_to_buf(qiov->iov, qiov->niov, 0, waiocb->buf, qiov->size);
         }
@@ -168,6 +171,7 @@ BlockDriverAIOCB *win32_aio_submit(BlockDriverState *bs,
 
 out_dec_count:
     aio->count--;
+out:
     qemu_aio_release(waiocb);
     return NULL;
 }
diff --git a/configure b/configure
index f49e61888e..283c71cb7a 100755
--- a/configure
+++ b/configure
@@ -326,6 +326,7 @@ seccomp=""
 glusterfs=""
 glusterfs_discard="no"
 glusterfs_zerofill="no"
+archipelago=""
 virtio_blk_data_plane=""
 gtk=""
 gtkabi=""
@@ -1087,6 +1088,10 @@ for opt do
   ;;
   --enable-glusterfs) glusterfs="yes"
   ;;
+  --disable-archipelago) archipelago="no"
+  ;;
+  --enable-archipelago) archipelago="yes"
+  ;;
   --disable-virtio-blk-data-plane) virtio_blk_data_plane="no"
   ;;
   --enable-virtio-blk-data-plane) virtio_blk_data_plane="yes"
@@ -1382,6 +1387,8 @@ Advanced options (experts only):
   --enable-coroutine-pool  enable coroutine freelist (better performance)
   --enable-glusterfs       enable GlusterFS backend
   --disable-glusterfs      disable GlusterFS backend
+  --enable-archipelago     enable Archipelago backend
+  --disable-archipelago    disable Archipelago backend
   --enable-gcov            enable test coverage analysis with gcov
   --gcov=GCOV              use specified gcov [$gcov_tool]
   --disable-tpm            disable TPM support
@@ -3072,6 +3079,33 @@ EOF
   fi
 fi
 
+
+##########################################
+# archipelago probe
+if test "$archipelago" != "no" ; then
+    cat > $TMPC <<EOF
+#include <stdio.h>
+#include <xseg/xseg.h>
+#include <xseg/protocol.h>
+int main(void) {
+    xseg_initialize();
+    return 0;
+}
+EOF
+    archipelago_libs=-lxseg
+    if compile_prog "" "$archipelago_libs"; then
+        archipelago="yes"
+        libs_tools="$archipelago_libs $libs_tools"
+        libs_softmmu="$archipelago_libs $libs_softmmu"
+    else
+      if test "$archipelago" = "yes" ; then
+        feature_not_found "Archipelago backend support" "Install libxseg devel"
+      fi
+      archipelago="no"
+    fi
+fi
+
+
 ##########################################
 # glusterfs probe
 if test "$glusterfs" != "no" ; then
@@ -3087,7 +3121,8 @@ if test "$glusterfs" != "no" ; then
     fi
   else
     if test "$glusterfs" = "yes" ; then
-      feature_not_found "GlusterFS backend support" "Install glusterfs-api devel"
+      feature_not_found "GlusterFS backend support" \
+          "Install glusterfs-api devel >= 3"
     fi
     glusterfs="no"
   fi
@@ -3532,7 +3567,8 @@ EOF
     spice_server_version=$($pkg_config --modversion spice-server)
   else
     if test "$spice" = "yes" ; then
-      feature_not_found "spice" "Install spice-server and spice-protocol devel"
+      feature_not_found "spice" \
+          "Install spice-server(>=0.12.0) and spice-protocol(>=0.12.3) devel"
     fi
     spice="no"
   fi
@@ -3563,7 +3599,7 @@ EOF
         smartcard_nss="yes"
     else
         if test "$smartcard_nss" = "yes"; then
-            feature_not_found "nss"
+            feature_not_found "nss" "Install nss devel >= 3.12.8"
         fi
         smartcard_nss="no"
     fi
@@ -3579,7 +3615,7 @@ if test "$libusb" != "no" ; then
         libs_softmmu="$libs_softmmu $libusb_libs"
     else
         if test "$libusb" = "yes"; then
-            feature_not_found "libusb" "Install libusb devel"
+            feature_not_found "libusb" "Install libusb devel >= 1.0.13"
         fi
         libusb="no"
     fi
@@ -4004,7 +4040,7 @@ if test "$libnfs" != "no" ; then
     LIBS="$LIBS $libnfs_libs"
   else
     if test "$libnfs" = "yes" ; then
-      feature_not_found "libnfs"
+      feature_not_found "libnfs" "Install libnfs devel >= 1.9.3"
     fi
     libnfs="no"
   fi
@@ -4251,6 +4287,7 @@ echo "seccomp support   $seccomp"
 echo "coroutine backend $coroutine"
 echo "coroutine pool    $coroutine_pool"
 echo "GlusterFS support $glusterfs"
+echo "Archipelago support $archipelago"
 echo "virtio-blk-data-plane $virtio_blk_data_plane"
 echo "gcov              $gcov_tool"
 echo "gcov enabled      $gcov"
@@ -4689,6 +4726,11 @@ if test "$glusterfs_zerofill" = "yes" ; then
   echo "CONFIG_GLUSTERFS_ZEROFILL=y" >> $config_host_mak
 fi
 
+if test "$archipelago" = "yes" ; then
+  echo "CONFIG_ARCHIPELAGO=m" >> $config_host_mak
+  echo "ARCHIPELAGO_LIBS=$archipelago_libs" >> $config_host_mak
+fi
+
 if test "$libssh2" = "yes" ; then
   echo "CONFIG_LIBSSH2=m" >> $config_host_mak
   echo "LIBSSH2_CFLAGS=$libssh2_cflags" >> $config_host_mak
diff --git a/docs/multiple-iothreads.txt b/docs/multiple-iothreads.txt
new file mode 100644
index 0000000000..40b8419916
--- /dev/null
+++ b/docs/multiple-iothreads.txt
@@ -0,0 +1,134 @@
+Copyright (c) 2014 Red Hat Inc.
+
+This work is licensed under the terms of the GNU GPL, version 2 or later.  See
+the COPYING file in the top-level directory.
+
+
+This document explains the IOThread feature and how to write code that runs
+outside the QEMU global mutex.
+
+The main loop and IOThreads
+---------------------------
+QEMU is an event-driven program that can do several things at once using an
+event loop.  The VNC server and the QMP monitor are both processed from the
+same event loop, which monitors their file descriptors until they become
+readable and then invokes a callback.
+
+The default event loop is called the main loop (see main-loop.c).  It is
+possible to create additional event loop threads using -object
+iothread,id=my-iothread.
+
+Side note: The main loop and IOThread are both event loops but their code is
+not shared completely.  Sometimes it is useful to remember that although they
+are conceptually similar they are currently not interchangeable.
+
+Why IOThreads are useful
+------------------------
+IOThreads allow the user to control the placement of work.  The main loop is a
+scalability bottleneck on hosts with many CPUs.  Work can be spread across
+several IOThreads instead of just one main loop.  When set up correctly this
+can improve I/O latency and reduce jitter seen by the guest.
+
+The main loop is also deeply associated with the QEMU global mutex, which is a
+scalability bottleneck in itself.  vCPU threads and the main loop use the QEMU
+global mutex to serialize execution of QEMU code.  This mutex is necessary
+because a lot of QEMU's code historically was not thread-safe.
+
+The fact that all I/O processing is done in a single main loop and that the
+QEMU global mutex is contended by all vCPU threads and the main loop explain
+why it is desirable to place work into IOThreads.
+
+The experimental virtio-blk data-plane implementation has been benchmarked and
+shows these effects:
+ftp://public.dhe.ibm.com/linux/pdfs/KVM_Virtualized_IO_Performance_Paper.pdf
+
+How to program for IOThreads
+----------------------------
+The main difference between legacy code and new code that can run in an
+IOThread is dealing explicitly with the event loop object, AioContext
+(see include/block/aio.h).  Code that only works in the main loop
+implicitly uses the main loop's AioContext.  Code that supports running
+in IOThreads must be aware of its AioContext.
+
+AioContext supports the following services:
+ * File descriptor monitoring (read/write/error on POSIX hosts)
+ * Event notifiers (inter-thread signalling)
+ * Timers
+ * Bottom Halves (BH) deferred callbacks
+
+There are several old APIs that use the main loop AioContext:
+ * LEGACY qemu_aio_set_fd_handler() - monitor a file descriptor
+ * LEGACY qemu_aio_set_event_notifier() - monitor an event notifier
+ * LEGACY timer_new_ms() - create a timer
+ * LEGACY qemu_bh_new() - create a BH
+ * LEGACY qemu_aio_wait() - run an event loop iteration
+
+Since they implicitly work on the main loop they cannot be used in code that
+runs in an IOThread.  They might cause a crash or deadlock if called from an
+IOThread since the QEMU global mutex is not held.
+
+Instead, use the AioContext functions directly (see include/block/aio.h):
+ * aio_set_fd_handler() - monitor a file descriptor
+ * aio_set_event_notifier() - monitor an event notifier
+ * aio_timer_new() - create a timer
+ * aio_bh_new() - create a BH
+ * aio_poll() - run an event loop iteration
+
+The AioContext can be obtained from the IOThread using
+iothread_get_aio_context() or for the main loop using qemu_get_aio_context().
+Code that takes an AioContext argument works both in IOThreads or the main
+loop, depending on which AioContext instance the caller passes in.
+
+How to synchronize with an IOThread
+-----------------------------------
+AioContext is not thread-safe so some rules must be followed when using file
+descriptors, event notifiers, timers, or BHs across threads:
+
+1. AioContext functions can be called safely from file descriptor, event
+notifier, timer, or BH callbacks invoked by the AioContext.  No locking is
+necessary.
+
+2. Other threads wishing to access the AioContext must use
+aio_context_acquire()/aio_context_release() for mutual exclusion.  Once the
+context is acquired no other thread can access it or run event loop iterations
+in this AioContext.
+
+aio_context_acquire()/aio_context_release() calls may be nested.  This
+means you can call them if you're not sure whether #1 applies.
+
+There is currently no lock ordering rule if a thread needs to acquire multiple
+AioContexts simultaneously.  Therefore, it is only safe for code holding the
+QEMU global mutex to acquire other AioContexts.
+
+Side note: the best way to schedule a function call across threads is to create
+a BH in the target AioContext beforehand and then call qemu_bh_schedule().  No
+acquire/release or locking is needed for the qemu_bh_schedule() call.  But be
+sure to acquire the AioContext for aio_bh_new() if necessary.
+
+The relationship between AioContext and the block layer
+-------------------------------------------------------
+The AioContext originates from the QEMU block layer because it provides a
+scoped way of running event loop iterations until all work is done.  This
+feature is used to complete all in-flight block I/O requests (see
+bdrv_drain_all()).  Nowadays AioContext is a generic event loop that can be
+used by any QEMU subsystem.
+
+The block layer has support for AioContext integrated.  Each BlockDriverState
+is associated with an AioContext using bdrv_set_aio_context() and
+bdrv_get_aio_context().  This allows block layer code to process I/O inside the
+right AioContext.  Other subsystems may wish to follow a similar approach.
+
+Block layer code must therefore expect to run in an IOThread and avoid using
+old APIs that implicitly use the main loop.  See the "How to program for
+IOThreads" above for information on how to do that.
+
+If main loop code such as a QMP function wishes to access a BlockDriverState it
+must first call aio_context_acquire(bdrv_get_aio_context(bs)) to ensure the
+IOThread does not run in parallel.
+
+Long-running jobs (usually in the form of coroutines) are best scheduled in the
+BlockDriverState's AioContext to avoid the need to acquire/release around each
+bdrv_*() call.  Be aware that there is currently no mechanism to get notified
+when bdrv_set_aio_context() moves this BlockDriverState to a different
+AioContext (see bdrv_detach_aio_context()/bdrv_attach_aio_context()), so you
+may need to add this if you want to support long-running jobs.
diff --git a/docs/specs/qcow2.txt b/docs/specs/qcow2.txt
index 3f713a6447..cfbc8b070c 100644
--- a/docs/specs/qcow2.txt
+++ b/docs/specs/qcow2.txt
@@ -135,12 +135,12 @@ be stored. Each extension has a structure like the following:
 Unless stated otherwise, each header extension type shall appear at most once
 in the same image.
 
-The remaining space between the end of the header extension area and the end of
-the first cluster can be used for the backing file name. It is not allowed to
-store other data here, so that an implementation can safely modify the header
-and add extensions without harming data of compatible features that it
-doesn't support. Compatible features that need space for additional data can
-use a header extension.
+If the image has a backing file then the backing file name should be stored in
+the remaining space between the end of the header extension area and the end of
+the first cluster. It is not allowed to store other data here, so that an
+implementation can safely modify the header and add extensions without harming
+data of compatible features that it doesn't support. Compatible features that
+need space for additional data can use a header extension.
 
 
 == Feature name table ==
diff --git a/hw/block/xen_disk.c b/hw/block/xen_disk.c
index aed5b5b3e9..a221d0bfca 100644
--- a/hw/block/xen_disk.c
+++ b/hw/block/xen_disk.c
@@ -589,6 +589,7 @@ static int blk_send_response_one(struct ioreq *ioreq)
         break;
     default:
         dst = NULL;
+        return 0;
     }
     memcpy(dst, &resp, sizeof(resp));
     blkdev->rings.common.rsp_prod_pvt++;
diff --git a/include/block/block.h b/include/block/block.h
index f08471d7e1..e94b701667 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -275,6 +275,7 @@ BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
     const char *backing_file);
 int bdrv_get_backing_file_depth(BlockDriverState *bs);
 int bdrv_truncate(BlockDriverState *bs, int64_t offset);
+int64_t bdrv_nb_sectors(BlockDriverState *bs);
 int64_t bdrv_getlength(BlockDriverState *bs);
 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs);
 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr);
@@ -454,6 +455,7 @@ void bdrv_img_create(const char *filename, const char *fmt,
 size_t bdrv_opt_mem_align(BlockDriverState *bs);
 void bdrv_set_guest_block_size(BlockDriverState *bs, int align);
 void *qemu_blockalign(BlockDriverState *bs, size_t size);
+void *qemu_try_blockalign(BlockDriverState *bs, size_t size);
 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov);
 
 struct HBitmapIter;
diff --git a/include/block/coroutine.h b/include/block/coroutine.h
index b408f96b82..b9b7f488c9 100644
--- a/include/block/coroutine.h
+++ b/include/block/coroutine.h
@@ -223,4 +223,15 @@ void coroutine_fn co_aio_sleep_ns(AioContext *ctx, QEMUClockType type,
  * Note that this function clobbers the handlers for the file descriptor.
  */
 void coroutine_fn yield_until_fd_readable(int fd);
+
+/**
+ * Add or subtract from the coroutine pool size
+ *
+ * The coroutine implementation keeps a pool of coroutines to be reused by
+ * qemu_coroutine_create().  This makes coroutine creation cheap.  Heavy
+ * coroutine users should call this to reserve pool space.  Call it again with
+ * a negative number to release pool space.
+ */
+void qemu_coroutine_adjust_pool_size(int n);
+
 #endif /* QEMU_COROUTINE_H */
diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
index 8480d523f0..9dd43fc2db 100644
--- a/include/qemu/osdep.h
+++ b/include/qemu/osdep.h
@@ -95,6 +95,7 @@ typedef signed int              int_fast16_t;
 #define qemu_printf printf
 
 int qemu_daemon(int nochdir, int noclose);
+void *qemu_try_memalign(size_t alignment, size_t size);
 void *qemu_memalign(size_t alignment, size_t size);
 void *qemu_anon_ram_alloc(size_t size);
 void qemu_vfree(void *ptr);
diff --git a/qapi/block-core.json b/qapi/block-core.json
index e378653a77..fb74c56e32 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -115,6 +115,8 @@
 # @format-specific: #optional structure supplying additional format-specific
 # information (since 1.7)
 #
+# @nocow: #optional info of whether NOCOW flag is set or not. (since 2.2)
+#
 # Since: 1.3
 #
 ##
@@ -126,7 +128,8 @@
            '*backing-filename': 'str', '*full-backing-filename': 'str',
            '*backing-filename-format': 'str', '*snapshots': ['SnapshotInfo'],
            '*backing-image': 'ImageInfo',
-           '*format-specific': 'ImageInfoSpecific' } }
+           '*format-specific': 'ImageInfoSpecific',
+           '*nocow': 'bool' } }
 
 ##
 # @ImageCheck:
@@ -194,6 +197,7 @@
 #       'file', 'file', 'ftp', 'ftps', 'host_cdrom', 'host_device',
 #       'host_floppy', 'http', 'https', 'nbd', 'parallels', 'qcow',
 #       'qcow2', 'raw', 'tftp', 'vdi', 'vmdk', 'vpc', 'vvfat'
+#       2.2: 'archipelago'
 #
 # @backing_file: #optional the name of the backing file (for copy-on-write)
 #
@@ -1143,7 +1147,7 @@
 # Since: 2.0
 ##
 { 'enum': 'BlockdevDriver',
-  'data': [ 'file', 'host_device', 'host_cdrom', 'host_floppy',
+  'data': [ 'archipelago', 'file', 'host_device', 'host_cdrom', 'host_floppy',
             'http', 'https', 'ftp', 'ftps', 'tftp', 'vvfat', 'blkdebug',
             'blkverify', 'bochs', 'cloop', 'cow', 'dmg', 'parallels', 'qcow',
             'qcow2', 'qed', 'raw', 'vdi', 'vhdx', 'vmdk', 'vpc', 'quorum' ] }
@@ -1273,6 +1277,37 @@
             '*pass-discard-snapshot': 'bool',
             '*pass-discard-other': 'bool' } }
 
+
+##
+# @BlockdevOptionsArchipelago
+#
+# Driver specific block device options for Archipelago.
+#
+# @volume:              Name of the Archipelago volume image
+#
+# @mport:               #optional The port number on which mapperd is
+#                       listening. This is optional
+#                       and if not specified, QEMU will make Archipelago
+#                       use the default port (1001).
+#
+# @vport:               #optional The port number on which vlmcd is
+#                       listening. This is optional
+#                       and if not specified, QEMU will make Archipelago
+#                       use the default port (501).
+#
+# @segment:             #optional The name of the shared memory segment
+#                       Archipelago stack is using. This is optional
+#                       and if not specified, QEMU will make Archipelago
+#                       use the default value, 'archipelago'.
+# Since: 2.2
+##
+{ 'type': 'BlockdevOptionsArchipelago',
+  'data': { 'volume': 'str',
+            '*mport': 'int',
+            '*vport': 'int',
+            '*segment': 'str' } }
+
+
 ##
 # @BlkdebugEvent
 #
@@ -1416,6 +1451,7 @@
   'base': 'BlockdevOptionsBase',
   'discriminator': 'driver',
   'data': {
+      'archipelago':'BlockdevOptionsArchipelago',
       'file':       'BlockdevOptionsFile',
       'host_device':'BlockdevOptionsFile',
       'host_cdrom': 'BlockdevOptionsFile',
diff --git a/qdev-monitor.c b/qdev-monitor.c
index f87f3d89cd..5fe5e75a88 100644
--- a/qdev-monitor.c
+++ b/qdev-monitor.c
@@ -182,9 +182,10 @@ static const char *find_typename_by_alias(const char *alias)
 
 int qdev_device_help(QemuOpts *opts)
 {
+    Error *local_err = NULL;
     const char *driver;
-    Property *prop;
-    ObjectClass *klass;
+    DevicePropertyInfoList *prop_list;
+    DevicePropertyInfoList *prop;
 
     driver = qemu_opt_get(opts, "driver");
     if (driver && is_help_option(driver)) {
@@ -196,35 +197,28 @@ int qdev_device_help(QemuOpts *opts)
         return 0;
     }
 
-    klass = object_class_by_name(driver);
-    if (!klass) {
+    if (!object_class_by_name(driver)) {
         const char *typename = find_typename_by_alias(driver);
 
         if (typename) {
             driver = typename;
-            klass = object_class_by_name(driver);
         }
     }
 
-    if (!object_class_dynamic_cast(klass, TYPE_DEVICE)) {
-        return 0;
+    prop_list = qmp_device_list_properties(driver, &local_err);
+    if (!prop_list) {
+        error_printf("%s\n", error_get_pretty(local_err));
+        error_free(local_err);
+        return 1;
     }
-    do {
-        for (prop = DEVICE_CLASS(klass)->props; prop && prop->name; prop++) {
-            /*
-             * TODO Properties without a parser are just for dirty hacks.
-             * qdev_prop_ptr is the only such PropertyInfo.  It's marked
-             * for removal.  This conditional should be removed along with
-             * it.
-             */
-            if (!prop->info->set) {
-                continue;           /* no way to set it, don't show */
-            }
-            error_printf("%s.%s=%s\n", driver, prop->name,
-                         prop->info->legacy_name ?: prop->info->name);
-        }
-        klass = object_class_get_parent(klass);
-    } while (klass != object_class_by_name(TYPE_DEVICE));
+
+    for (prop = prop_list; prop; prop = prop->next) {
+        error_printf("%s.%s=%s\n", driver,
+                     prop->value->name,
+                     prop->value->type);
+    }
+
+    qapi_free_DevicePropertyInfoList(prop_list);
     return 1;
 }
 
diff --git a/qemu-coroutine.c b/qemu-coroutine.c
index 470852100a..bd574aa1b5 100644
--- a/qemu-coroutine.c
+++ b/qemu-coroutine.c
@@ -19,14 +19,14 @@
 #include "block/coroutine_int.h"
 
 enum {
-    /* Maximum free pool size prevents holding too many freed coroutines */
-    POOL_MAX_SIZE = 64,
+    POOL_DEFAULT_SIZE = 64,
 };
 
 /** Free list to speed up creation */
 static QemuMutex pool_lock;
 static QSLIST_HEAD(, Coroutine) pool = QSLIST_HEAD_INITIALIZER(pool);
 static unsigned int pool_size;
+static unsigned int pool_max_size = POOL_DEFAULT_SIZE;
 
 Coroutine *qemu_coroutine_create(CoroutineEntry *entry)
 {
@@ -55,7 +55,7 @@ static void coroutine_delete(Coroutine *co)
 {
     if (CONFIG_COROUTINE_POOL) {
         qemu_mutex_lock(&pool_lock);
-        if (pool_size < POOL_MAX_SIZE) {
+        if (pool_size < pool_max_size) {
             QSLIST_INSERT_HEAD(&pool, co, pool_next);
             co->caller = NULL;
             pool_size++;
@@ -137,3 +137,23 @@ void coroutine_fn qemu_coroutine_yield(void)
     self->caller = NULL;
     coroutine_swap(self, to);
 }
+
+void qemu_coroutine_adjust_pool_size(int n)
+{
+    qemu_mutex_lock(&pool_lock);
+
+    pool_max_size += n;
+
+    /* Callers should never take away more than they added */
+    assert(pool_max_size >= POOL_DEFAULT_SIZE);
+
+    /* Trim oversized pool down to new max */
+    while (pool_size > pool_max_size) {
+        Coroutine *co = QSLIST_FIRST(&pool);
+        QSLIST_REMOVE_HEAD(&pool, pool_next);
+        pool_size--;
+        qemu_coroutine_delete(co);
+    }
+
+    qemu_mutex_unlock(&pool_lock);
+}
diff --git a/qemu-img.c b/qemu-img.c
index 19495bb594..18caa4b450 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -960,7 +960,6 @@ static int img_compare(int argc, char **argv)
     int64_t sector_num = 0;
     int64_t nb_sectors;
     int c, pnum;
-    uint64_t bs_sectors;
     uint64_t progress_base;
 
     for (;;) {
@@ -1022,10 +1021,20 @@ static int img_compare(int argc, char **argv)
 
     buf1 = qemu_blockalign(bs1, IO_BUF_SIZE);
     buf2 = qemu_blockalign(bs2, IO_BUF_SIZE);
-    bdrv_get_geometry(bs1, &bs_sectors);
-    total_sectors1 = bs_sectors;
-    bdrv_get_geometry(bs2, &bs_sectors);
-    total_sectors2 = bs_sectors;
+    total_sectors1 = bdrv_nb_sectors(bs1);
+    if (total_sectors1 < 0) {
+        error_report("Can't get size of %s: %s",
+                     filename1, strerror(-total_sectors1));
+        ret = 4;
+        goto out;
+    }
+    total_sectors2 = bdrv_nb_sectors(bs2);
+    if (total_sectors2 < 0) {
+        error_report("Can't get size of %s: %s",
+                     filename2, strerror(-total_sectors2));
+        ret = 4;
+        goto out;
+    }
     total_sectors = MIN(total_sectors1, total_sectors2);
     progress_base = MAX(total_sectors1, total_sectors2);
 
@@ -1187,7 +1196,7 @@ static int img_convert(int argc, char **argv)
     BlockDriver *drv, *proto_drv;
     BlockDriverState **bs = NULL, *out_bs = NULL;
     int64_t total_sectors, nb_sectors, sector_num, bs_offset;
-    uint64_t bs_sectors;
+    int64_t *bs_sectors = NULL;
     uint8_t * buf = NULL;
     size_t bufsectors = IO_BUF_SIZE / BDRV_SECTOR_SIZE;
     const uint8_t *buf1;
@@ -1328,7 +1337,8 @@ static int img_convert(int argc, char **argv)
 
     qemu_progress_print(0, 100);
 
-    bs = g_malloc0(bs_n * sizeof(BlockDriverState *));
+    bs = g_new0(BlockDriverState *, bs_n);
+    bs_sectors = g_new(int64_t, bs_n);
 
     total_sectors = 0;
     for (bs_i = 0; bs_i < bs_n; bs_i++) {
@@ -1342,8 +1352,14 @@ static int img_convert(int argc, char **argv)
             ret = -1;
             goto out;
         }
-        bdrv_get_geometry(bs[bs_i], &bs_sectors);
-        total_sectors += bs_sectors;
+        bs_sectors[bs_i] = bdrv_nb_sectors(bs[bs_i]);
+        if (bs_sectors[bs_i] < 0) {
+            error_report("Could not get size of %s: %s",
+                         argv[optind + bs_i], strerror(-bs_sectors[bs_i]));
+            ret = -1;
+            goto out;
+        }
+        total_sectors += bs_sectors[bs_i];
     }
 
     if (sn_opts) {
@@ -1461,7 +1477,6 @@ static int img_convert(int argc, char **argv)
 
     bs_i = 0;
     bs_offset = 0;
-    bdrv_get_geometry(bs[0], &bs_sectors);
 
     /* increase bufsectors from the default 4096 (2M) if opt_transfer_length
      * or discard_alignment of the out_bs is greater. Limit to 32768 (16MB)
@@ -1474,13 +1489,13 @@ static int img_convert(int argc, char **argv)
     buf = qemu_blockalign(out_bs, bufsectors * BDRV_SECTOR_SIZE);
 
     if (skip_create) {
-        int64_t output_length = bdrv_getlength(out_bs);
-        if (output_length < 0) {
+        int64_t output_sectors = bdrv_nb_sectors(out_bs);
+        if (output_sectors < 0) {
             error_report("unable to get output image length: %s\n",
-                         strerror(-output_length));
+                         strerror(-output_sectors));
             ret = -1;
             goto out;
-        } else if (output_length < total_sectors << BDRV_SECTOR_BITS) {
+        } else if (output_sectors < total_sectors) {
             error_report("output file is smaller than input file");
             ret = -1;
             goto out;
@@ -1528,19 +1543,19 @@ static int img_convert(int argc, char **argv)
             buf2 = buf;
             while (remainder > 0) {
                 int nlow;
-                while (bs_num == bs_sectors) {
+                while (bs_num == bs_sectors[bs_i]) {
+                    bs_offset += bs_sectors[bs_i];
                     bs_i++;
                     assert (bs_i < bs_n);
-                    bs_offset += bs_sectors;
-                    bdrv_get_geometry(bs[bs_i], &bs_sectors);
                     bs_num = 0;
                     /* printf("changing part: sector_num=%" PRId64 ", "
                        "bs_i=%d, bs_offset=%" PRId64 ", bs_sectors=%" PRId64
-                       "\n", sector_num, bs_i, bs_offset, bs_sectors); */
+                       "\n", sector_num, bs_i, bs_offset, bs_sectors[bs_i]); */
                 }
-                assert (bs_num < bs_sectors);
+                assert (bs_num < bs_sectors[bs_i]);
 
-                nlow = (remainder > bs_sectors - bs_num) ? bs_sectors - bs_num : remainder;
+                nlow = remainder > bs_sectors[bs_i] - bs_num
+                    ? bs_sectors[bs_i] - bs_num : remainder;
 
                 ret = bdrv_read(bs[bs_i], bs_num, buf2, nlow);
                 if (ret < 0) {
@@ -1601,14 +1616,13 @@ restart:
                 break;
             }
 
-            while (sector_num - bs_offset >= bs_sectors) {
+            while (sector_num - bs_offset >= bs_sectors[bs_i]) {
+                bs_offset += bs_sectors[bs_i];
                 bs_i ++;
                 assert (bs_i < bs_n);
-                bs_offset += bs_sectors;
-                bdrv_get_geometry(bs[bs_i], &bs_sectors);
                 /* printf("changing part: sector_num=%" PRId64 ", bs_i=%d, "
                   "bs_offset=%" PRId64 ", bs_sectors=%" PRId64 "\n",
-                   sector_num, bs_i, bs_offset, bs_sectors); */
+                   sector_num, bs_i, bs_offset, bs_sectors[bs_i]); */
             }
 
             if ((out_baseimg || has_zero_init) &&
@@ -1661,7 +1675,7 @@ restart:
                 }
             }
 
-            n = MIN(n, bs_sectors - (sector_num - bs_offset));
+            n = MIN(n, bs_sectors[bs_i] - (sector_num - bs_offset));
 
             sectors_read += n;
             if (count_allocated_sectors) {
@@ -1719,6 +1733,7 @@ out:
         }
         g_free(bs);
     }
+    g_free(bs_sectors);
 fail_getopt:
     g_free(options);
 
@@ -2418,9 +2433,9 @@ static int img_rebase(int argc, char **argv)
      * the image is the same as the original one at any time.
      */
     if (!unsafe) {
-        uint64_t num_sectors;
-        uint64_t old_backing_num_sectors;
-        uint64_t new_backing_num_sectors = 0;
+        int64_t num_sectors;
+        int64_t old_backing_num_sectors;
+        int64_t new_backing_num_sectors = 0;
         uint64_t sector;
         int n;
         uint8_t * buf_old;
@@ -2430,10 +2445,31 @@ static int img_rebase(int argc, char **argv)
         buf_old = qemu_blockalign(bs, IO_BUF_SIZE);
         buf_new = qemu_blockalign(bs, IO_BUF_SIZE);
 
-        bdrv_get_geometry(bs, &num_sectors);
-        bdrv_get_geometry(bs_old_backing, &old_backing_num_sectors);
+        num_sectors = bdrv_nb_sectors(bs);
+        if (num_sectors < 0) {
+            error_report("Could not get size of '%s': %s",
+                         filename, strerror(-num_sectors));
+            ret = -1;
+            goto out;
+        }
+        old_backing_num_sectors = bdrv_nb_sectors(bs_old_backing);
+        if (old_backing_num_sectors < 0) {
+            char backing_name[1024];
+
+            bdrv_get_backing_filename(bs, backing_name, sizeof(backing_name));
+            error_report("Could not get size of '%s': %s",
+                         backing_name, strerror(-old_backing_num_sectors));
+            ret = -1;
+            goto out;
+        }
         if (bs_new_backing) {
-            bdrv_get_geometry(bs_new_backing, &new_backing_num_sectors);
+            new_backing_num_sectors = bdrv_nb_sectors(bs_new_backing);
+            if (new_backing_num_sectors < 0) {
+                error_report("Could not get size of '%s': %s",
+                             out_baseimg, strerror(-new_backing_num_sectors));
+                ret = -1;
+                goto out;
+            }
         }
 
         if (num_sectors != 0) {
diff --git a/qmp.c b/qmp.c
index 0d2553abf5..c6767c4df3 100644
--- a/qmp.c
+++ b/qmp.c
@@ -509,6 +509,7 @@ DevicePropertyInfoList *qmp_device_list_properties(const char *typename,
         if (strcmp(prop->name, "type") == 0 ||
             strcmp(prop->name, "realized") == 0 ||
             strcmp(prop->name, "hotpluggable") == 0 ||
+            strcmp(prop->name, "hotplugged") == 0 ||
             strcmp(prop->name, "parent_bus") == 0) {
             continue;
         }
diff --git a/tests/qemu-iotests/059 b/tests/qemu-iotests/059
index 26a2fd3e0e..3c053c29b4 100755
--- a/tests/qemu-iotests/059
+++ b/tests/qemu-iotests/059
@@ -114,6 +114,10 @@ echo
 echo "=== Testing version 3 ==="
 _use_sample_img iotest-version3.vmdk.bz2
 _img_info
+for i in {0..99}; do
+    $QEMU_IO -r -c "read -P $(( i % 10 + 0x30 )) $(( i * 64 * 1024 * 10 + i * 512 )) 512" $TEST_IMG \
+        | _filter_qemu_io
+done
 
 echo
 echo "=== Testing 4TB monolithicFlat creation and IO ==="
diff --git a/tests/qemu-iotests/059.out b/tests/qemu-iotests/059.out
index eba0dedda0..0dadba658f 100644
--- a/tests/qemu-iotests/059.out
+++ b/tests/qemu-iotests/059.out
@@ -2056,8 +2056,208 @@ wrote 512/512 bytes at offset 10240
 === Testing version 3 ===
 image: TEST_DIR/iotest-version3.IMGFMT
 file format: IMGFMT
-virtual size: 1.0G (1073741824 bytes)
+virtual size: 16G (17179869184 bytes)
 cluster_size: 65536
+read 512/512 bytes at offset 0
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 655872
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 1311744
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 1967616
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 2623488
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 3279360
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 3935232
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 4591104
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 5246976
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 5902848
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 6558720
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 7214592
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 7870464
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 8526336
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 9182208
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 9838080
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 10493952
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 11149824
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 11805696
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 12461568
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 13117440
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 13773312
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 14429184
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 15085056
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 15740928
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 16396800
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 17052672
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 17708544
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 18364416
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 19020288
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 19676160
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 20332032
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 20987904
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 21643776
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 22299648
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 22955520
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 23611392
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 24267264
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 24923136
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 25579008
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 26234880
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 26890752
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 27546624
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 28202496
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 28858368
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 29514240
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 30170112
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 30825984
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 31481856
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 32137728
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 32793600
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 33449472
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 34105344
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 34761216
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 35417088
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 36072960
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 36728832
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 37384704
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 38040576
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 38696448
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 39352320
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 40008192
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 40664064
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 41319936
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 41975808
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 42631680
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 43287552
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 43943424
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 44599296
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 45255168
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 45911040
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 46566912
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 47222784
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 47878656
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 48534528
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 49190400
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 49846272
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 50502144
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 51158016
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 51813888
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 52469760
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 53125632
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 53781504
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 54437376
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 55093248
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 55749120
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 56404992
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 57060864
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 57716736
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 58372608
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 59028480
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 59684352
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 60340224
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 60996096
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 61651968
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 62307840
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 62963712
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 63619584
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 64275456
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 64931328
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 
 === Testing 4TB monolithicFlat creation and IO ===
 Formatting 'TEST_DIR/iotest-version3.IMGFMT', fmt=IMGFMT size=4398046511104
diff --git a/tests/qemu-iotests/060 b/tests/qemu-iotests/060
index 3cffc12fec..830386fdaa 100755
--- a/tests/qemu-iotests/060
+++ b/tests/qemu-iotests/060
@@ -164,6 +164,15 @@ wait_break 0
 write 64k 64k
 resume 0" | $QEMU_IO | _filter_qemu_io
 
+echo
+echo "=== Testing unallocated image header ==="
+echo
+_make_test_img 64M
+# Create L1/L2
+$QEMU_IO -c "$OPEN_RW" -c "write 0 64k" | _filter_qemu_io
+poke_file "$TEST_IMG" "$rb_offset" "\x00\x00"
+$QEMU_IO -c "$OPEN_RW" -c "write 64k 64k" | _filter_qemu_io
+
 # success, all done
 echo "*** done"
 rm -f $seq.full
diff --git a/tests/qemu-iotests/060.out b/tests/qemu-iotests/060.out
index a517948036..c27c952412 100644
--- a/tests/qemu-iotests/060.out
+++ b/tests/qemu-iotests/060.out
@@ -93,4 +93,12 @@ blkdebug: Suspended request '0'
 write failed: Input/output error
 blkdebug: Resuming request '0'
 aio_write failed: No medium found
+
+=== Testing unallocated image header ===
+
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 
+wrote 65536/65536 bytes at offset 0
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+qcow2: Preventing invalid write on metadata (overlaps with qcow2_header); image marked as corrupt.
+write failed: Input/output error
 *** done
diff --git a/tests/qemu-iotests/084 b/tests/qemu-iotests/084
index cb4d7b729e..ae33c2cba4 100755
--- a/tests/qemu-iotests/084
+++ b/tests/qemu-iotests/084
@@ -1,6 +1,7 @@
 #!/bin/bash
 #
-# Test case for VDI header corruption; image too large, and too many blocks
+# Test case for VDI header corruption; image too large, and too many blocks.
+# Also simple test for creating dynamic and static VDI images.
 #
 # Copyright (C) 2013 Red Hat, Inc.
 #
@@ -43,14 +44,25 @@ _supported_fmt vdi
 _supported_proto generic
 _supported_os Linux
 
+size=64M
 ds_offset=368  # disk image size field offset
 bs_offset=376  # block size field offset
 bii_offset=384 # block in image field offset
 
 echo
+echo "=== Statically allocated image creation ==="
+echo
+_make_test_img $size -o static
+_img_info
+stat -c"disk image file size in bytes: %s" "${TEST_IMG}"
+_cleanup_test_img
+
+echo
 echo "=== Testing image size bounds ==="
 echo
-_make_test_img 64M
+_make_test_img $size
+_img_info
+stat -c"disk image file size in bytes: %s" "${TEST_IMG}"
 
 # check for image size too large
 # poke max image size, and appropriate blocks_in_image value
diff --git a/tests/qemu-iotests/084.out b/tests/qemu-iotests/084.out
index c7120d9b0b..ea29ae0b9d 100644
--- a/tests/qemu-iotests/084.out
+++ b/tests/qemu-iotests/084.out
@@ -1,8 +1,22 @@
 QA output created by 084
 
+=== Statically allocated image creation ===
+
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 
+image: TEST_DIR/t.IMGFMT
+file format: IMGFMT
+virtual size: 64M (67108864 bytes)
+cluster_size: 1048576
+disk image file size in bytes: 67109888
+
 === Testing image size bounds ===
 
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 
+image: TEST_DIR/t.IMGFMT
+file format: IMGFMT
+virtual size: 64M (67108864 bytes)
+cluster_size: 1048576
+disk image file size in bytes: 1024
 Test 1: Maximum size (1024 TB):
 qemu-img: Could not open 'TEST_DIR/t.IMGFMT': Could not open 'TEST_DIR/t.IMGFMT': Invalid argument
 
diff --git a/tests/qemu-iotests/common b/tests/qemu-iotests/common
index e4083f4212..70df659d5b 100644
--- a/tests/qemu-iotests/common
+++ b/tests/qemu-iotests/common
@@ -152,6 +152,7 @@ check options
     -nbd                test nbd
     -ssh                test ssh
     -nfs                test nfs
+    -archipelago        test archipelago
     -xdiff              graphical mode diff
     -nocache            use O_DIRECT on backing file
     -misalign           misalign memory allocations
@@ -263,6 +264,11 @@ testlist options
             xpand=false
             ;;
 
+        -archipelago)
+            IMGPROTO=archipelago
+            xpand=false
+            ;;
+
         -nocache)
             CACHEMODE="none"
             CACHEMODE_IS_DEFAULT=false
diff --git a/tests/qemu-iotests/common.rc b/tests/qemu-iotests/common.rc
index e0ea7e3a7d..3fd691e6cd 100644
--- a/tests/qemu-iotests/common.rc
+++ b/tests/qemu-iotests/common.rc
@@ -64,6 +64,8 @@ elif [ "$IMGPROTO" = "ssh" ]; then
 elif [ "$IMGPROTO" = "nfs" ]; then
     TEST_DIR="nfs://127.0.0.1/$TEST_DIR"
     TEST_IMG=$TEST_DIR/t.$IMGFMT
+elif [ "$IMGPROTO" = "archipelago" ]; then
+    TEST_IMG="archipelago:at.$IMGFMT"
 else
     TEST_IMG=$IMGPROTO:$TEST_DIR/t.$IMGFMT
 fi
@@ -163,7 +165,8 @@ _make_test_img()
             -e "s# lazy_refcounts=\\(on\\|off\\)##g" \
             -e "s# block_size=[0-9]\\+##g" \
             -e "s# block_state_zero=\\(on\\|off\\)##g" \
-            -e "s# log_size=[0-9]\\+##g"
+            -e "s# log_size=[0-9]\\+##g" \
+            -e "s/archipelago:a/TEST_DIR\//g"
 
     # Start an NBD server on the image file, which is what we'll be talking to
     if [ $IMGPROTO = "nbd" ]; then
@@ -206,6 +209,10 @@ _cleanup_test_img()
             rbd --no-progress rm "$TEST_DIR/t.$IMGFMT" > /dev/null
             ;;
 
+        archipelago)
+            vlmc remove "at.$IMGFMT" > /dev/null
+            ;;
+
         sheepdog)
             collie vdi delete "$TEST_DIR/t.$IMGFMT"
             ;;
diff --git a/tests/qemu-iotests/sample_images/iotest-version3.vmdk.bz2 b/tests/qemu-iotests/sample_images/iotest-version3.vmdk.bz2
index 30abf217e7..619329a246 100644
--- a/tests/qemu-iotests/sample_images/iotest-version3.vmdk.bz2
+++ b/tests/qemu-iotests/sample_images/iotest-version3.vmdk.bz2
diff --git a/tests/test-coroutine.c b/tests/test-coroutine.c
index 760636ddc4..6e634f4a78 100644
--- a/tests/test-coroutine.c
+++ b/tests/test-coroutine.c
@@ -288,6 +288,29 @@ static void perf_yield(void)
         maxcycles, duration);
 }
 
+static __attribute__((noinline)) void dummy(unsigned *i)
+{
+    (*i)--;
+}
+
+static void perf_baseline(void)
+{
+    unsigned int i, maxcycles;
+    double duration;
+
+    maxcycles = 100000000;
+    i = maxcycles;
+
+    g_test_timer_start();
+    while (i > 0) {
+        dummy(&i);
+    }
+    duration = g_test_timer_elapsed();
+
+    g_test_message("Function call %u iterations: %f s\n",
+        maxcycles, duration);
+}
+
 int main(int argc, char **argv)
 {
     g_test_init(&argc, &argv, NULL);
@@ -301,6 +324,7 @@ int main(int argc, char **argv)
         g_test_add_func("/perf/lifecycle", perf_lifecycle);
         g_test_add_func("/perf/nesting", perf_nesting);
         g_test_add_func("/perf/yield", perf_yield);
+        g_test_add_func("/perf/function-call", perf_baseline);
     }
     return g_test_run();
 }
diff --git a/thread-pool.c b/thread-pool.c
index dfb699dd94..23888dcfc4 100644
--- a/thread-pool.c
+++ b/thread-pool.c
@@ -21,7 +21,6 @@
 #include "block/coroutine.h"
 #include "trace.h"
 #include "block/block_int.h"
-#include "qemu/event_notifier.h"
 #include "block/thread-pool.h"
 #include "qemu/main-loop.h"
 
@@ -57,8 +56,8 @@ struct ThreadPoolElement {
 };
 
 struct ThreadPool {
-    EventNotifier notifier;
     AioContext *ctx;
+    QEMUBH *completion_bh;
     QemuMutex lock;
     QemuCond check_cancel;
     QemuCond worker_stopped;
@@ -119,7 +118,7 @@ static void *worker_thread(void *opaque)
             qemu_cond_broadcast(&pool->check_cancel);
         }
 
-        event_notifier_set(&pool->notifier);
+        qemu_bh_schedule(pool->completion_bh);
     }
 
     pool->cur_threads--;
@@ -168,12 +167,11 @@ static void spawn_thread(ThreadPool *pool)
     }
 }
 
-static void event_notifier_ready(EventNotifier *notifier)
+static void thread_pool_completion_bh(void *opaque)
 {
-    ThreadPool *pool = container_of(notifier, ThreadPool, notifier);
+    ThreadPool *pool = opaque;
     ThreadPoolElement *elem, *next;
 
-    event_notifier_test_and_clear(notifier);
 restart:
     QLIST_FOREACH_SAFE(elem, &pool->head, all, next) {
         if (elem->state != THREAD_CANCELED && elem->state != THREAD_DONE) {
@@ -187,6 +185,12 @@ restart:
             QLIST_REMOVE(elem, all);
             /* Read state before ret.  */
             smp_rmb();
+
+            /* Schedule ourselves in case elem->common.cb() calls aio_poll() to
+             * wait for another request that completed at the same time.
+             */
+            qemu_bh_schedule(pool->completion_bh);
+
             elem->common.cb(elem->common.opaque, elem->ret);
             qemu_aio_release(elem);
             goto restart;
@@ -215,7 +219,7 @@ static void thread_pool_cancel(BlockDriverAIOCB *acb)
         qemu_sem_timedwait(&pool->sem, 0) == 0) {
         QTAILQ_REMOVE(&pool->request_list, elem, reqs);
         elem->state = THREAD_CANCELED;
-        event_notifier_set(&pool->notifier);
+        qemu_bh_schedule(pool->completion_bh);
     } else {
         pool->pending_cancellations++;
         while (elem->state != THREAD_CANCELED && elem->state != THREAD_DONE) {
@@ -224,7 +228,7 @@ static void thread_pool_cancel(BlockDriverAIOCB *acb)
         pool->pending_cancellations--;
     }
     qemu_mutex_unlock(&pool->lock);
-    event_notifier_ready(&pool->notifier);
+    thread_pool_completion_bh(pool);
 }
 
 static const AIOCBInfo thread_pool_aiocb_info = {
@@ -293,8 +297,8 @@ static void thread_pool_init_one(ThreadPool *pool, AioContext *ctx)
     }
 
     memset(pool, 0, sizeof(*pool));
-    event_notifier_init(&pool->notifier, false);
     pool->ctx = ctx;
+    pool->completion_bh = aio_bh_new(ctx, thread_pool_completion_bh, pool);
     qemu_mutex_init(&pool->lock);
     qemu_cond_init(&pool->check_cancel);
     qemu_cond_init(&pool->worker_stopped);
@@ -304,8 +308,6 @@ static void thread_pool_init_one(ThreadPool *pool, AioContext *ctx)
 
     QLIST_INIT(&pool->head);
     QTAILQ_INIT(&pool->request_list);
-
-    aio_set_event_notifier(ctx, &pool->notifier, event_notifier_ready);
 }
 
 ThreadPool *thread_pool_new(AioContext *ctx)
@@ -339,11 +341,10 @@ void thread_pool_free(ThreadPool *pool)
 
     qemu_mutex_unlock(&pool->lock);
 
-    aio_set_event_notifier(pool->ctx, &pool->notifier, NULL);
+    qemu_bh_delete(pool->completion_bh);
     qemu_sem_destroy(&pool->sem);
     qemu_cond_destroy(&pool->check_cancel);
     qemu_cond_destroy(&pool->worker_stopped);
     qemu_mutex_destroy(&pool->lock);
-    event_notifier_cleanup(&pool->notifier);
     g_free(pool);
 }
diff --git a/util/oslib-posix.c b/util/oslib-posix.c
index cdbfb2e270..016a047cfc 100644
--- a/util/oslib-posix.c
+++ b/util/oslib-posix.c
@@ -94,7 +94,7 @@ void *qemu_oom_check(void *ptr)
     return ptr;
 }
 
-void *qemu_memalign(size_t alignment, size_t size)
+void *qemu_try_memalign(size_t alignment, size_t size)
 {
     void *ptr;
 
@@ -106,19 +106,23 @@ void *qemu_memalign(size_t alignment, size_t size)
     int ret;
     ret = posix_memalign(&ptr, alignment, size);
     if (ret != 0) {
-        fprintf(stderr, "Failed to allocate %zu B: %s\n",
-                size, strerror(ret));
-        abort();
+        errno = ret;
+        ptr = NULL;
     }
 #elif defined(CONFIG_BSD)
-    ptr = qemu_oom_check(valloc(size));
+    ptr = valloc(size);
 #else
-    ptr = qemu_oom_check(memalign(alignment, size));
+    ptr = memalign(alignment, size);
 #endif
     trace_qemu_memalign(alignment, size, ptr);
     return ptr;
 }
 
+void *qemu_memalign(size_t alignment, size_t size)
+{
+    return qemu_oom_check(qemu_try_memalign(alignment, size));
+}
+
 /* alloc shared memory pages */
 void *qemu_anon_ram_alloc(size_t size)
 {
diff --git a/util/oslib-win32.c b/util/oslib-win32.c
index 507cedd84d..a3eab4acba 100644
--- a/util/oslib-win32.c
+++ b/util/oslib-win32.c
@@ -50,18 +50,23 @@ void *qemu_oom_check(void *ptr)
     return ptr;
 }
 
-void *qemu_memalign(size_t alignment, size_t size)
+void *qemu_try_memalign(size_t alignment, size_t size)
 {
     void *ptr;
 
     if (!size) {
         abort();
     }
-    ptr = qemu_oom_check(VirtualAlloc(NULL, size, MEM_COMMIT, PAGE_READWRITE));
+    ptr = VirtualAlloc(NULL, size, MEM_COMMIT, PAGE_READWRITE);
     trace_qemu_memalign(alignment, size, ptr);
     return ptr;
 }
 
+void *qemu_memalign(size_t alignment, size_t size)
+{
+    return qemu_oom_check(qemu_try_memalign(alignment, size));
+}
+
 void *qemu_anon_ram_alloc(size_t size)
 {
     void *ptr;