37 files changed, 3399 insertions, 747 deletions
diff --git a/block/Makefile.objs b/block/Makefile.objs
index b5754d39bf..c067f38e1d 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -2,10 +2,21 @@ block-obj-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat
 block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
 block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
 block-obj-y += qed-check.o
-block-obj-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
-block-obj-y += stream.o
-block-obj-$(CONFIG_WIN32) += raw-win32.o
+block-obj-y += parallels.o blkdebug.o blkverify.o
+block-obj-$(CONFIG_WIN32) += raw-win32.o win32-aio.o
 block-obj-$(CONFIG_POSIX) += raw-posix.o
+block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
+
+ifeq ($(CONFIG_POSIX),y)
+block-obj-y += nbd.o sheepdog.o
 block-obj-$(CONFIG_LIBISCSI) += iscsi.o
 block-obj-$(CONFIG_CURL) += curl.o
 block-obj-$(CONFIG_RBD) += rbd.o
+block-obj-$(CONFIG_GLUSTERFS) += gluster.o
+endif
+
+common-obj-y += stream.o
+common-obj-y += commit.o
+common-obj-y += mirror.o
+
+$(obj)/curl.o: QEMU_CFLAGS+=$(CURL_CFLAGS)
diff --git a/block/blkdebug.c b/block/blkdebug.c
index 59dcea0650..6f7463772b 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -23,13 +23,17 @@
  */
 
 #include "qemu-common.h"
-#include "block_int.h"
-#include "module.h"
+#include "qemu/config-file.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
 
 typedef struct BDRVBlkdebugState {
     int state;
+    int new_state;
+
     QLIST_HEAD(, BlkdebugRule) rules[BLKDBG_EVENT_MAX];
     QSIMPLEQ_HEAD(, BlkdebugRule) active_rules;
+    QLIST_HEAD(, BlkdebugSuspendedReq) suspended_reqs;
 } BDRVBlkdebugState;
 
 typedef struct BlkdebugAIOCB {
@@ -38,9 +42,15 @@ typedef struct BlkdebugAIOCB {
     int ret;
 } BlkdebugAIOCB;
 
+typedef struct BlkdebugSuspendedReq {
+    Coroutine *co;
+    char *tag;
+    QLIST_ENTRY(BlkdebugSuspendedReq) next;
+} BlkdebugSuspendedReq;
+
 static void blkdebug_aio_cancel(BlockDriverAIOCB *blockacb);
 
-static AIOPool blkdebug_aio_pool = {
+static const AIOCBInfo blkdebug_aiocb_info = {
     .aiocb_size = sizeof(BlkdebugAIOCB),
     .cancel     = blkdebug_aio_cancel,
 };
@@ -48,6 +58,7 @@ static AIOPool blkdebug_aio_pool = {
 enum {
     ACTION_INJECT_ERROR,
     ACTION_SET_STATE,
+    ACTION_SUSPEND,
 };
 
 typedef struct BlkdebugRule {
@@ -64,6 +75,9 @@ typedef struct BlkdebugRule {
         struct {
             int new_state;
         } set_state;
+        struct {
+            char *tag;
+        } suspend;
     } options;
     QLIST_ENTRY(BlkdebugRule) next;
     QSIMPLEQ_ENTRY(BlkdebugRule) active_next;
@@ -225,6 +239,11 @@ static int add_rule(QemuOpts *opts, void *opaque)
         rule->options.set_state.new_state =
             qemu_opt_get_number(opts, "new_state", 0);
         break;
+
+    case ACTION_SUSPEND:
+        rule->options.suspend.tag =
+            g_strdup(qemu_opt_get(opts, "tag"));
+        break;
     };
 
     /* Add the rule */
@@ -233,12 +252,32 @@ static int add_rule(QemuOpts *opts, void *opaque)
     return 0;
 }
 
+static void remove_rule(BlkdebugRule *rule)
+{
+    switch (rule->action) {
+    case ACTION_INJECT_ERROR:
+    case ACTION_SET_STATE:
+        break;
+    case ACTION_SUSPEND:
+        g_free(rule->options.suspend.tag);
+        break;
+    }
+
+    QLIST_REMOVE(rule, next);
+    g_free(rule);
+}
+
 static int read_config(BDRVBlkdebugState *s, const char *filename)
 {
     FILE *f;
     int ret;
     struct add_rule_data d;
 
+    /* Allow usage without config file */
+    if (!*filename) {
+        return 0;
+    }
+
     f = fopen(filename, "r");
     if (f == NULL) {
         return -errno;
@@ -334,7 +373,7 @@ static BlockDriverAIOCB *inject_error(BlockDriverState *bs,
         return NULL;
     }
 
-    acb = qemu_aio_get(&blkdebug_aio_pool, bs, cb, opaque);
+    acb = qemu_aio_get(&blkdebug_aiocb_info, bs, cb, opaque);
     acb->ret = -error;
 
     bh = qemu_bh_new(error_callback_bh, acb);
@@ -388,6 +427,7 @@ static BlockDriverAIOCB *blkdebug_aio_writev(BlockDriverState *bs,
     return bdrv_aio_writev(bs->file, sector_num, qiov, nb_sectors, cb, opaque);
 }
 
+
 static void blkdebug_close(BlockDriverState *bs)
 {
     BDRVBlkdebugState *s = bs->opaque;
@@ -396,19 +436,39 @@ static void blkdebug_close(BlockDriverState *bs)
 
     for (i = 0; i < BLKDBG_EVENT_MAX; i++) {
         QLIST_FOREACH_SAFE(rule, &s->rules[i], next, next) {
-            QLIST_REMOVE(rule, next);
-            g_free(rule);
+            remove_rule(rule);
         }
     }
 }
 
+static void suspend_request(BlockDriverState *bs, BlkdebugRule *rule)
+{
+    BDRVBlkdebugState *s = bs->opaque;
+    BlkdebugSuspendedReq r;
+
+    r = (BlkdebugSuspendedReq) {
+        .co         = qemu_coroutine_self(),
+        .tag        = g_strdup(rule->options.suspend.tag),
+    };
+
+    remove_rule(rule);
+    QLIST_INSERT_HEAD(&s->suspended_reqs, &r, next);
+
+    printf("blkdebug: Suspended request '%s'\n", r.tag);
+    qemu_coroutine_yield();
+    printf("blkdebug: Resuming request '%s'\n", r.tag);
+
+    QLIST_REMOVE(&r, next);
+    g_free(r.tag);
+}
+
 static bool process_rule(BlockDriverState *bs, struct BlkdebugRule *rule,
-    int old_state, bool injected)
+    bool injected)
 {
     BDRVBlkdebugState *s = bs->opaque;
 
     /* Only process rules for the current state */
-    if (rule->state && rule->state != old_state) {
+    if (rule->state && rule->state != s->state) {
         return injected;
     }
 
@@ -423,7 +483,11 @@ static bool process_rule(BlockDriverState *bs, struct BlkdebugRule *rule,
         break;
 
     case ACTION_SET_STATE:
-        s->state = rule->options.set_state.new_state;
+        s->new_state = rule->options.set_state.new_state;
+        break;
+
+    case ACTION_SUSPEND:
+        suspend_request(bs, rule);
         break;
     }
     return injected;
@@ -432,16 +496,70 @@ static bool process_rule(BlockDriverState *bs, struct BlkdebugRule *rule,
 static void blkdebug_debug_event(BlockDriverState *bs, BlkDebugEvent event)
 {
     BDRVBlkdebugState *s = bs->opaque;
-    struct BlkdebugRule *rule;
-    int old_state = s->state;
+    struct BlkdebugRule *rule, *next;
     bool injected;
 
     assert((int)event >= 0 && event < BLKDBG_EVENT_MAX);
 
     injected = false;
-    QLIST_FOREACH(rule, &s->rules[event], next) {
-        injected = process_rule(bs, rule, old_state, injected);
+    s->new_state = s->state;
+    QLIST_FOREACH_SAFE(rule, &s->rules[event], next, next) {
+        injected = process_rule(bs, rule, injected);
+    }
+    s->state = s->new_state;
+}
+
+static int blkdebug_debug_breakpoint(BlockDriverState *bs, const char *event,
+                                     const char *tag)
+{
+    BDRVBlkdebugState *s = bs->opaque;
+    struct BlkdebugRule *rule;
+    BlkDebugEvent blkdebug_event;
+
+    if (get_event_by_name(event, &blkdebug_event) < 0) {
+        return -ENOENT;
+    }
+
+    /* Matches any state; suspend_request() removes the rule once it fires */
+    rule = g_malloc(sizeof(*rule));
+    *rule = (struct BlkdebugRule) {
+        .event  = blkdebug_event,
+        .action = ACTION_SUSPEND,
+        .state  = 0,
+        .options.suspend.tag = g_strdup(tag),
+    };
+
+    QLIST_INSERT_HEAD(&s->rules[blkdebug_event], rule, next);
+
+    return 0;
+}
+
+static int blkdebug_debug_resume(BlockDriverState *bs, const char *tag)
+{
+    BDRVBlkdebugState *s = bs->opaque;
+    BlkdebugSuspendedReq *r;
+
+    QLIST_FOREACH(r, &s->suspended_reqs, next) {
+        if (!strcmp(r->tag, tag)) {
+            qemu_coroutine_enter(r->co, NULL);
+            return 0;
+        }
+    }
+    return -ENOENT;
+}
+
+
+static bool blkdebug_debug_is_suspended(BlockDriverState *bs, const char *tag)
+{
+    BDRVBlkdebugState *s = bs->opaque;
+    BlkdebugSuspendedReq *r;
+
+    QLIST_FOREACH(r, &s->suspended_reqs, next) {
+        if (!strcmp(r->tag, tag)) {
+            return true;
+        }
     }
+    return false;
 }
 
 static int64_t blkdebug_getlength(BlockDriverState *bs)
@@ -462,7 +580,10 @@ static BlockDriver bdrv_blkdebug = {
     .bdrv_aio_readv     = blkdebug_aio_readv,
     .bdrv_aio_writev    = blkdebug_aio_writev,
 
-    .bdrv_debug_event   = blkdebug_debug_event,
+    .bdrv_debug_event           = blkdebug_debug_event,
+    .bdrv_debug_breakpoint      = blkdebug_debug_breakpoint,
+    .bdrv_debug_resume          = blkdebug_debug_resume,
+    .bdrv_debug_is_suspended    = blkdebug_debug_is_suspended,
 };
 
 static void bdrv_blkdebug_init(void)
diff --git a/block/blkverify.c b/block/blkverify.c
index 9d5f1ec5b9..a7dd45909b 100644
--- a/block/blkverify.c
+++ b/block/blkverify.c
@@ -8,8 +8,8 @@
  */
 
 #include <stdarg.h>
-#include "qemu_socket.h" /* for EINPROGRESS on Windows */
-#include "block_int.h"
+#include "qemu/sockets.h" /* for EINPROGRESS on Windows */
+#include "block/block_int.h"
 
 typedef struct {
     BlockDriverState *test_file;
@@ -48,7 +48,7 @@ static void blkverify_aio_cancel(BlockDriverAIOCB *blockacb)
     }
 }
 
-static AIOPool blkverify_aio_pool = {
+static const AIOCBInfo blkverify_aiocb_info = {
     .aiocb_size         = sizeof(BlkverifyAIOCB),
     .cancel             = blkverify_aio_cancel,
 };
@@ -233,7 +233,7 @@ static BlkverifyAIOCB *blkverify_aio_get(BlockDriverState *bs, bool is_write,
                                          BlockDriverCompletionFunc *cb,
                                          void *opaque)
 {
-    BlkverifyAIOCB *acb = qemu_aio_get(&blkverify_aio_pool, bs, cb, opaque);
+    BlkverifyAIOCB *acb = qemu_aio_get(&blkverify_aiocb_info, bs, cb, opaque);
 
     acb->bh = NULL;
     acb->is_write = is_write;
diff --git a/block/bochs.c b/block/bochs.c
index ab7944dc43..1b1d9cdbe5 100644
--- a/block/bochs.c
+++ b/block/bochs.c
@@ -23,8 +23,8 @@
  * THE SOFTWARE.
  */
 #include "qemu-common.h"
-#include "block_int.h"
-#include "module.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
 
 /**************************************************************/
 
diff --git a/block/cloop.c b/block/cloop.c
index 7570eb8e74..5a0d0d805f 100644
--- a/block/cloop.c
+++ b/block/cloop.c
@@ -22,8 +22,8 @@
  * THE SOFTWARE.
  */
 #include "qemu-common.h"
-#include "block_int.h"
-#include "module.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
 #include <zlib.h>
 
 typedef struct BDRVCloopState {
diff --git a/block/commit.c b/block/commit.c
new file mode 100644
index 0000000000..61ebdba54f
--- /dev/null
+++ b/block/commit.c
@@ -0,0 +1,259 @@
+/*
+ * Live block commit
+ *
+ * Copyright Red Hat, Inc. 2012
+ *
+ * Authors:
+ *  Jeff Cody   <jcody@redhat.com>
+ *  Based on stream.c by Stefan Hajnoczi
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include "trace.h"
+#include "block/block_int.h"
+#include "block/blockjob.h"
+#include "qemu/ratelimit.h"
+
+enum {
+    /*
+     * Size of data buffer for populating the image file.  This should be large
+     * enough to process multiple clusters in a single call, so that populating
+     * contiguous regions of the image is efficient.
+     */
+    COMMIT_BUFFER_SIZE = 512 * 1024, /* in bytes */
+};
+
+#define SLICE_TIME 100000000ULL /* ns */
+
+typedef struct CommitBlockJob {
+    BlockJob common;
+    RateLimit limit;
+    BlockDriverState *active;
+    BlockDriverState *top;
+    BlockDriverState *base;
+    BlockdevOnError on_error;
+    int base_flags;
+    int orig_overlay_flags;
+} CommitBlockJob;
+
+/*
+ * Copy @nb_sectors sectors starting at @sector_num from @bs into @base,
+ * staging the data in @buf.
+ *
+ * Returns 0 on success, otherwise the non-zero result of the bdrv_read()
+ * or bdrv_write() call that failed.
+ */
+static int coroutine_fn commit_populate(BlockDriverState *bs,
+                                        BlockDriverState *base,
+                                        int64_t sector_num, int nb_sectors,
+                                        void *buf)
+{
+    int ret = bdrv_read(bs, sector_num, buf, nb_sectors);
+
+    if (ret == 0) {
+        ret = bdrv_write(base, sector_num, buf, nb_sectors);
+    }
+    return ret;
+}
+
+/* Commit job coroutine: copy everything allocated above base from top down
+ * into base and, if that completed, drop the intermediate images. */
+static void coroutine_fn commit_run(void *opaque)
+{
+    CommitBlockJob *s = opaque;
+    BlockDriverState *active = s->active;
+    BlockDriverState *top = s->top;
+    BlockDriverState *base = s->base;
+    BlockDriverState *overlay_bs;
+    int64_t sector_num, end;
+    int ret = 0;
+    int n = 0;
+    void *buf;
+    int64_t base_len;
+
+    /* Look the overlay up before anything can fail: every error path ends in
+     * exit_restore_reopen, which needs it to restore the overlay's flags. */
+    overlay_bs = bdrv_find_overlay(active, top);
+
+    ret = s->common.len = bdrv_getlength(top);
+    if (s->common.len < 0) {
+        goto exit_restore_reopen;
+    }
+
+    ret = base_len = bdrv_getlength(base);
+    if (base_len < 0) {
+        goto exit_restore_reopen;
+    }
+
+    if (base_len < s->common.len) {
+        ret = bdrv_truncate(base, s->common.len);
+        if (ret) {
+            goto exit_restore_reopen;
+        }
+    }
+
+    end = s->common.len >> BDRV_SECTOR_BITS;
+    buf = qemu_blockalign(top, COMMIT_BUFFER_SIZE);
+
+    for (sector_num = 0; sector_num < end; sector_num += n) {
+        uint64_t delay_ns = 0;
+        bool copy;
+
+wait:
+        /* Note that even when no rate limit is applied we need to yield
+         * with no pending I/O here so that bdrv_drain_all() returns.
+         */
+        block_job_sleep_ns(&s->common, rt_clock, delay_ns);
+        if (block_job_is_cancelled(&s->common)) {
+            break;
+        }
+        /* Copy if allocated above the base */
+        ret = bdrv_co_is_allocated_above(top, base, sector_num,
+                                         COMMIT_BUFFER_SIZE / BDRV_SECTOR_SIZE,
+                                         &n);
+        copy = (ret == 1);
+        trace_commit_one_iteration(s, sector_num, n, ret);
+        if (copy) {
+            if (s->common.speed) {
+                delay_ns = ratelimit_calculate_delay(&s->limit, n);
+                if (delay_ns > 0) {
+                    goto wait;
+                }
+            }
+            ret = commit_populate(top, base, sector_num, n, buf);
+        }
+        if (ret < 0) {
+            if (s->on_error == BLOCKDEV_ON_ERROR_STOP ||
+                s->on_error == BLOCKDEV_ON_ERROR_REPORT ||
+                (s->on_error == BLOCKDEV_ON_ERROR_ENOSPC && ret == -ENOSPC)) {
+                goto exit_free_buf;
+            } else {
+                n = 0;
+                continue;
+            }
+        }
+        /* Publish progress */
+        s->common.offset += n * BDRV_SECTOR_SIZE;
+    }
+
+    ret = 0;
+
+    if (!block_job_is_cancelled(&s->common) && sector_num == end) {
+        /* success */
+        ret = bdrv_drop_intermediate(active, top, base);
+    }
+
+exit_free_buf:
+    qemu_vfree(buf);
+
+exit_restore_reopen:
+    /* restore base open flags here if appropriate (e.g., change the base back
+     * to r/o). These reopens do not need to be atomic, since we won't abort
+     * even on failure here */
+    if (s->base_flags != bdrv_get_flags(base)) {
+        bdrv_reopen(base, s->base_flags, NULL);
+    }
+    if (overlay_bs && s->orig_overlay_flags != bdrv_get_flags(overlay_bs)) {
+        bdrv_reopen(overlay_bs, s->orig_overlay_flags, NULL);
+    }
+
+    block_job_completed(&s->common, ret);
+}
+
+static void commit_set_speed(BlockJob *job, int64_t speed, Error **errp)
+{
+    CommitBlockJob *s = container_of(job, CommitBlockJob, common);
+
+    if (speed < 0) {
+        error_set(errp, QERR_INVALID_PARAMETER, "speed");
+        return;
+    }
+    ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME);
+}
+
+static BlockJobType commit_job_type = {
+    .instance_size = sizeof(CommitBlockJob),
+    .job_type      = "commit",
+    .set_speed     = commit_set_speed,
+};
+
+void commit_start(BlockDriverState *bs, BlockDriverState *base,
+                  BlockDriverState *top, int64_t speed,
+                  BlockdevOnError on_error, BlockDriverCompletionFunc *cb,
+                  void *opaque, Error **errp)
+{
+    CommitBlockJob *s;
+    BlockReopenQueue *reopen_queue = NULL;
+    int orig_overlay_flags;
+    int orig_base_flags;
+    BlockDriverState *overlay_bs;
+    Error *local_err = NULL;
+
+    if ((on_error == BLOCKDEV_ON_ERROR_STOP ||
+         on_error == BLOCKDEV_ON_ERROR_ENOSPC) &&
+        !bdrv_iostatus_is_enabled(bs)) {
+        error_set(errp, QERR_INVALID_PARAMETER_COMBINATION);
+        return;
+    }
+
+    /* Once we support top == active layer, remove this check */
+    if (top == bs) {
+        error_setg(errp,
+                   "Top image as the active layer is currently unsupported");
+        return;
+    }
+
+    if (top == base) {
+        error_setg(errp, "Invalid files for merge: top and base are the same");
+        return;
+    }
+
+    overlay_bs = bdrv_find_overlay(bs, top);
+
+    if (overlay_bs == NULL) {
+        error_setg(errp, "Could not find overlay image for %s:", top->filename);
+        return;
+    }
+
+    orig_base_flags    = bdrv_get_flags(base);
+    orig_overlay_flags = bdrv_get_flags(overlay_bs);
+
+    /* convert base & overlay_bs to r/w, if necessary */
+    if (!(orig_base_flags & BDRV_O_RDWR)) {
+        reopen_queue = bdrv_reopen_queue(reopen_queue, base,
+                                         orig_base_flags | BDRV_O_RDWR);
+    }
+    if (!(orig_overlay_flags & BDRV_O_RDWR)) {
+        reopen_queue = bdrv_reopen_queue(reopen_queue, overlay_bs,
+                                         orig_overlay_flags | BDRV_O_RDWR);
+    }
+    if (reopen_queue) {
+        bdrv_reopen_multiple(reopen_queue, &local_err);
+        if (local_err != NULL) {
+            error_propagate(errp, local_err);
+            return;
+        }
+    }
+
+
+    s = block_job_create(&commit_job_type, bs, speed, cb, opaque, errp);
+    if (!s) {
+        return;
+    }
+
+    s->base   = base;
+    s->top    = top;
+    s->active = bs;
+
+    s->base_flags          = orig_base_flags;
+    s->orig_overlay_flags  = orig_overlay_flags;
+
+    s->on_error = on_error;
+    s->common.co = qemu_coroutine_create(commit_run);
+
+    trace_commit_start(bs, base, top, s, s->common.co, opaque);
+    qemu_coroutine_enter(s->common.co, s);
+}
diff --git a/block/cow.c b/block/cow.c
index a5a00eb9ca..a33ce950d4 100644
--- a/block/cow.c
+++ b/block/cow.c
@@ -22,8 +22,8 @@
  * THE SOFTWARE.
  */
 #include "qemu-common.h"
-#include "block_int.h"
-#include "module.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
 
 /**************************************************************/
 /* COW block driver using file system holes */
diff --git a/block/curl.c b/block/curl.c
index e7c3634d35..47df9524ea 100644
--- a/block/curl.c
+++ b/block/curl.c
@@ -22,7 +22,7 @@
  * THE SOFTWARE.
  */
 #include "qemu-common.h"
-#include "block_int.h"
+#include "block/block_int.h"
 #include <curl/curl.h>
 
 // #define DEBUG
@@ -438,7 +438,7 @@ static void curl_aio_cancel(BlockDriverAIOCB *blockacb)
     // Do we have to implement canceling? Seems to work without...
 }
 
-static AIOPool curl_aio_pool = {
+static const AIOCBInfo curl_aiocb_info = {
     .aiocb_size         = sizeof(CURLAIOCB),
     .cancel             = curl_aio_cancel,
 };
@@ -505,7 +505,7 @@ static BlockDriverAIOCB *curl_aio_readv(BlockDriverState *bs,
 {
     CURLAIOCB *acb;
 
-    acb = qemu_aio_get(&curl_aio_pool, bs, cb, opaque);
+    acb = qemu_aio_get(&curl_aiocb_info, bs, cb, opaque);
 
     acb->qiov = qiov;
     acb->sector_num = sector_num;
@@ -542,8 +542,7 @@ static void curl_close(BlockDriverState *bs)
     }
     if (s->multi)
         curl_multi_cleanup(s->multi);
-    if (s->url)
-        free(s->url);
+    g_free(s->url);
 }
 
 static int64_t curl_getlength(BlockDriverState *bs)
diff --git a/block/dmg.c b/block/dmg.c
index 37902a4347..ac397dc8f7 100644
--- a/block/dmg.c
+++ b/block/dmg.c
@@ -22,9 +22,9 @@
  * THE SOFTWARE.
  */
 #include "qemu-common.h"
-#include "block_int.h"
-#include "bswap.h"
-#include "module.h"
+#include "block/block_int.h"
+#include "qemu/bswap.h"
+#include "qemu/module.h"
 #include <zlib.h>
 
 typedef struct BDRVDMGState {
diff --git a/block/gluster.c b/block/gluster.c
new file mode 100644
index 0000000000..0f2c32a3fa
--- /dev/null
+++ b/block/gluster.c
@@ -0,0 +1,624 @@
+/*
+ * GlusterFS backend for QEMU
+ *
+ * Copyright (C) 2012 Bharata B Rao <bharata@linux.vnet.ibm.com>
+ *
+ * Pipe handling mechanism in AIO implementation is derived from
+ * block/rbd.c. Hence,
+ *
+ * Copyright (C) 2010-2011 Christian Brunner <chb@muc.de>,
+ *                         Josh Durgin <josh.durgin@dreamhost.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+#include <glusterfs/api/glfs.h>
+#include "block/block_int.h"
+#include "qemu/sockets.h"
+#include "qemu/uri.h"
+
+typedef struct GlusterAIOCB {
+    BlockDriverAIOCB common;
+    int64_t size;
+    int ret;
+    bool *finished;
+    QEMUBH *bh;
+} GlusterAIOCB;
+
+typedef struct BDRVGlusterState {
+    struct glfs *glfs;
+    int fds[2];
+    struct glfs_fd *fd;
+    int qemu_aio_count;
+    int event_reader_pos;
+    GlusterAIOCB *event_acb;
+} BDRVGlusterState;
+
+#define GLUSTER_FD_READ  0
+#define GLUSTER_FD_WRITE 1
+
+typedef struct GlusterConf {
+    char *server;
+    int port;
+    char *volname;
+    char *image;
+    char *transport;
+} GlusterConf;
+
+static void qemu_gluster_gconf_free(GlusterConf *gconf)
+{
+    g_free(gconf->server);
+    g_free(gconf->volname);
+    g_free(gconf->image);
+    g_free(gconf->transport);
+    g_free(gconf);
+}
+
+static int parse_volume_options(GlusterConf *gconf, char *path)
+{
+    char *p, *q;
+
+    if (!path) {
+        return -EINVAL;
+    }
+
+    /* volume */
+    p = q = path + strspn(path, "/");
+    p += strcspn(p, "/");
+    if (*p == '\0') {
+        return -EINVAL;
+    }
+    gconf->volname = g_strndup(q, p - q);
+
+    /* image */
+    p += strspn(p, "/");
+    if (*p == '\0') {
+        return -EINVAL;
+    }
+    gconf->image = g_strdup(p);
+    return 0;
+}
+
+/*
+ * file=gluster[+transport]://[server[:port]]/volname/image[?socket=...]
+ *
+ * 'gluster' is the protocol.
+ *
+ * 'transport' specifies the transport type used to connect to gluster
+ * management daemon (glusterd). Valid transport types are
+ * tcp, unix and rdma. If a transport type isn't specified, then tcp
+ * type is assumed.
+ *
+ * 'server' specifies the server where the volume file specification for
+ * the given volume resides. This can be either hostname, ipv4 address
+ * or ipv6 address. ipv6 address needs to be within square brackets [ ].
+ * If transport type is 'unix', then 'server' field should not be specified.
+ * The 'socket' field needs to be populated with the path to unix domain
+ * socket.
+ *
+ * 'port' is the port number on which glusterd is listening. This is optional
+ * and if not specified, QEMU will send 0, which makes gluster use the
+ * default port. If the transport type is unix, then 'port' should not be
+ * specified.
+ *
+ * 'volname' is the name of the gluster volume which contains the VM image.
+ *
+ * 'image' is the path to the actual VM image that resides on gluster volume.
+ *
+ * Examples:
+ *
+ * file=gluster://1.2.3.4/testvol/a.img
+ * file=gluster+tcp://1.2.3.4/testvol/a.img
+ * file=gluster+tcp://1.2.3.4:24007/testvol/dir/a.img
+ * file=gluster+tcp://[1:2:3:4:5:6:7:8]/testvol/dir/a.img
+ * file=gluster+tcp://[1:2:3:4:5:6:7:8]:24007/testvol/dir/a.img
+ * file=gluster+tcp://server.domain.com:24007/testvol/dir/a.img
+ * file=gluster+unix:///testvol/dir/a.img?socket=/tmp/glusterd.socket
+ * file=gluster+rdma://1.2.3.4:24007/testvol/a.img
+ */
+static int qemu_gluster_parseuri(GlusterConf *gconf, const char *filename)
+{
+    URI *uri;
+    QueryParams *qp = NULL;
+    bool is_unix = false;
+    int ret = 0;
+
+    uri = uri_parse(filename);
+    if (!uri) {
+        return -EINVAL;
+    }
+
+    /* transport */
+    if (!strcmp(uri->scheme, "gluster")) {
+        gconf->transport = g_strdup("tcp");
+    } else if (!strcmp(uri->scheme, "gluster+tcp")) {
+        gconf->transport = g_strdup("tcp");
+    } else if (!strcmp(uri->scheme, "gluster+unix")) {
+        gconf->transport = g_strdup("unix");
+        is_unix = true;
+    } else if (!strcmp(uri->scheme, "gluster+rdma")) {
+        gconf->transport = g_strdup("rdma");
+    } else {
+        ret = -EINVAL;
+        goto out;
+    }
+
+    ret = parse_volume_options(gconf, uri->path);
+    if (ret < 0) {
+        goto out;
+    }
+
+    qp = query_params_parse(uri->query);
+    if (qp->n > 1 || (is_unix && !qp->n) || (!is_unix && qp->n)) {
+        ret = -EINVAL;
+        goto out;
+    }
+
+    if (is_unix) {
+        if (uri->server || uri->port) {
+            ret = -EINVAL;
+            goto out;
+        }
+        if (strcmp(qp->p[0].name, "socket")) {
+            ret = -EINVAL;
+            goto out;
+        }
+        gconf->server = g_strdup(qp->p[0].value);
+    } else {
+        gconf->server = g_strdup(uri->server);
+        gconf->port = uri->port;
+    }
+
+out:
+    if (qp) {
+        query_params_free(qp);
+    }
+    uri_free(uri);
+    return ret;
+}
+
+/* Parse @filename into @gconf and connect to the volume; NULL on failure. */
+static struct glfs *qemu_gluster_init(GlusterConf *gconf, const char *filename)
+{
+    struct glfs *glfs = NULL;
+    int ret;
+
+    ret = qemu_gluster_parseuri(gconf, filename);
+    if (ret < 0) {
+        error_report("Usage: file=gluster[+transport]://[server[:port]]/"
+            "volname/image[?socket=...]");
+        errno = -ret;
+        goto out;
+    }
+
+    glfs = glfs_new(gconf->volname);
+    if (!glfs) {
+        goto out;
+    }
+
+    ret = glfs_set_volfile_server(glfs, gconf->transport, gconf->server,
+            gconf->port);
+    if (ret < 0) {
+        goto out;
+    }
+
+    /*
+     * TODO: Use GF_LOG_ERROR instead of hard code value of 4 here when
+     * GlusterFS makes GF_LOG_* macros available to libgfapi users.
+     */
+    ret = glfs_set_logging(glfs, "-", 4);
+    if (ret < 0) {
+        goto out;
+    }
+
+    ret = glfs_init(glfs);
+    if (ret) {
+        error_report("Gluster connection failed for server=%s port=%d "
+             "volume=%s image=%s transport=%s", gconf->server, gconf->port,
+             gconf->volname, gconf->image, gconf->transport);
+        goto out;
+    }
+    return glfs;
+
+out:
+    if (glfs) {
+        int old_errno = errno; /* callers report -errno; keep it across fini */
+        glfs_fini(glfs);
+        errno = old_errno;
+    }
+    return NULL;
+}
+
+static void qemu_gluster_complete_aio(GlusterAIOCB *acb, BDRVGlusterState *s)
+{
+    int ret;
+    bool *finished = acb->finished;
+    BlockDriverCompletionFunc *cb = acb->common.cb;
+    void *opaque = acb->common.opaque;
+
+    if (!acb->ret || acb->ret == acb->size) {
+        ret = 0; /* Success */
+    } else if (acb->ret < 0) {
+        ret = acb->ret; /* Read/Write failed */
+    } else {
+        ret = -EIO; /* Partial read/write - fail it */
+    }
+
+    s->qemu_aio_count--;
+    qemu_aio_release(acb);
+    cb(opaque, ret);
+    if (finished) {
+        *finished = true;
+    }
+}
+
+static void qemu_gluster_aio_event_reader(void *opaque)
+{
+    BDRVGlusterState *s = opaque;
+    ssize_t ret;
+
+    do {
+        char *p = (char *)&s->event_acb;
+
+        ret = read(s->fds[GLUSTER_FD_READ], p + s->event_reader_pos,
+                   sizeof(s->event_acb) - s->event_reader_pos);
+        if (ret > 0) {
+            s->event_reader_pos += ret;
+            if (s->event_reader_pos == sizeof(s->event_acb)) {
+                s->event_reader_pos = 0;
+                qemu_gluster_complete_aio(s->event_acb, s);
+            }
+        }
+    } while (ret < 0 && errno == EINTR);
+}
+
+static int qemu_gluster_aio_flush_cb(void *opaque)
+{
+    BDRVGlusterState *s = opaque;
+
+    return (s->qemu_aio_count > 0);
+}
+
+static int qemu_gluster_open(BlockDriverState *bs, const char *filename,
+    int bdrv_flags)
+{
+    BDRVGlusterState *s = bs->opaque;
+    int open_flags = O_BINARY;
+    int ret = 0;
+    GlusterConf *gconf = g_malloc0(sizeof(GlusterConf));
+
+    s->glfs = qemu_gluster_init(gconf, filename);
+    if (!s->glfs) {
+        ret = -errno;
+        goto out;
+    }
+
+    if (bdrv_flags & BDRV_O_RDWR) {
+        open_flags |= O_RDWR;
+    } else {
+        open_flags |= O_RDONLY;
+    }
+
+    if ((bdrv_flags & BDRV_O_NOCACHE)) {
+        open_flags |= O_DIRECT;
+    }
+
+    s->fd = glfs_open(s->glfs, gconf->image, open_flags);
+    if (!s->fd) {
+        ret = -errno;
+        goto out;
+    }
+
+    ret = qemu_pipe(s->fds);
+    if (ret < 0) {
+        ret = -errno;
+        goto out;
+    }
+    fcntl(s->fds[GLUSTER_FD_READ], F_SETFL, O_NONBLOCK);
+    qemu_aio_set_fd_handler(s->fds[GLUSTER_FD_READ],
+        qemu_gluster_aio_event_reader, NULL, qemu_gluster_aio_flush_cb, s);
+
+out:
+    qemu_gluster_gconf_free(gconf);
+    if (!ret) {
+        return ret;
+    }
+    if (s->fd) {
+        glfs_close(s->fd);
+    }
+    if (s->glfs) {
+        glfs_fini(s->glfs);
+    }
+    return ret;
+}
+
+static int qemu_gluster_create(const char *filename,
+        QEMUOptionParameter *options)
+{
+    struct glfs *glfs;
+    struct glfs_fd *fd;
+    int ret = 0;
+    int64_t total_size = 0;
+    GlusterConf *gconf = g_malloc0(sizeof(GlusterConf));
+
+    glfs = qemu_gluster_init(gconf, filename);
+    if (!glfs) {
+        ret = -errno;
+        goto out;
+    }
+
+    while (options && options->name) {
+        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
+            total_size = options->value.n / BDRV_SECTOR_SIZE;
+        }
+        options++;
+    }
+
+    fd = glfs_creat(glfs, gconf->image,
+        O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, S_IRUSR | S_IWUSR);
+    if (!fd) {
+        ret = -errno;
+    } else {
+        if (glfs_ftruncate(fd, total_size * BDRV_SECTOR_SIZE) != 0) {
+            ret = -errno;
+        }
+        if (glfs_close(fd) != 0) {
+            ret = -errno;
+        }
+    }
+out:
+    qemu_gluster_gconf_free(gconf);
+    if (glfs) {
+        glfs_fini(glfs);
+    }
+    return ret;
+}
+
+static void qemu_gluster_aio_cancel(BlockDriverAIOCB *blockacb)
+{
+    GlusterAIOCB *acb = (GlusterAIOCB *)blockacb;
+    bool finished = false;
+
+    acb->finished = &finished;
+    while (!finished) {
+        qemu_aio_wait();
+    }
+}
+
+static const AIOCBInfo gluster_aiocb_info = {
+    .aiocb_size = sizeof(GlusterAIOCB),
+    .cancel = qemu_gluster_aio_cancel,
+};
+
+/*
+ * Completion callback passed to the glfs_*_async() calls: records the result
+ * and hands the acb to the QEMU thread by writing its pointer into the pipe.
+ */
+static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg)
+{
+    GlusterAIOCB *acb = (GlusterAIOCB *)arg;
+    BlockDriverState *bs = acb->common.bs;
+    BDRVGlusterState *s = bs->opaque;
+    int retval;
+
+    acb->ret = ret;
+    retval = qemu_write_full(s->fds[GLUSTER_FD_WRITE], &acb, sizeof(acb));
+    if (retval != sizeof(acb)) {
+        /* The QEMU thread could not be notified: complete this request
+         * with -EIO here and make the disk inaccessible for later I/O.
+         * Unregister the fd handler before closing the pipe it watches. */
+        error_report("Gluster failed to notify QEMU about IO completion");
+
+        qemu_mutex_lock_iothread(); /* We are in gluster thread context */
+        acb->common.cb(acb->common.opaque, -EIO);
+        qemu_aio_release(acb);
+        s->qemu_aio_count--;
+        qemu_aio_set_fd_handler(s->fds[GLUSTER_FD_READ], NULL, NULL, NULL,
+            NULL);
+        close(s->fds[GLUSTER_FD_READ]);
+        close(s->fds[GLUSTER_FD_WRITE]);
+        bs->drv = NULL; /* Make the disk inaccessible */
+        qemu_mutex_unlock_iothread();
+    }
+}
+
+/*
+ * Common body of the read and write entry points: start one vectored
+ * transfer on the gluster fd and return its ACB, or NULL if the request
+ * could not be submitted.
+ */
+static BlockDriverAIOCB *qemu_gluster_aio_rw(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque, int write)
+{
+    BDRVGlusterState *s = bs->opaque;
+    off_t offset = sector_num * BDRV_SECTOR_SIZE;
+    size_t size = nb_sectors * BDRV_SECTOR_SIZE;
+    GlusterAIOCB *acb;
+    int ret;
+
+    /* account for the request up front; undone below if it never starts */
+    s->qemu_aio_count++;
+
+    acb = qemu_aio_get(&gluster_aiocb_info, bs, cb, opaque);
+    acb->finished = NULL;
+    acb->ret = 0;
+    acb->size = size;
+
+    ret = write
+        ? glfs_pwritev_async(s->fd, qiov->iov, qiov->niov, offset, 0,
+                             &gluster_finish_aiocb, acb)
+        : glfs_preadv_async(s->fd, qiov->iov, qiov->niov, offset, 0,
+                            &gluster_finish_aiocb, acb);
+    if (ret >= 0) {
+        return &acb->common;
+    }
+
+    /* submission failed */
+    s->qemu_aio_count--;
+    qemu_aio_release(acb);
+    return NULL;
+}
+
+static BlockDriverAIOCB *qemu_gluster_aio_readv(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{   /* read entry point: forwards to the rw helper with write == 0 */
+    return qemu_gluster_aio_rw(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
+}
+
+static BlockDriverAIOCB *qemu_gluster_aio_writev(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{   /* write entry point: forwards to the rw helper with write == 1 */
+    return qemu_gluster_aio_rw(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
+}
+
+/*
+ * Queue an asynchronous fsync on the gluster fd.  Returns the request's
+ * ACB, or NULL if the fsync could not be started.
+ */
+static BlockDriverAIOCB *qemu_gluster_aio_flush(BlockDriverState *bs,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    BDRVGlusterState *s = bs->opaque;
+    GlusterAIOCB *acb = qemu_aio_get(&gluster_aiocb_info, bs, cb, opaque);
+
+    acb->finished = NULL;
+    acb->ret = 0;
+    acb->size = 0;
+    s->qemu_aio_count++;
+
+    if (glfs_fsync_async(s->fd, &gluster_finish_aiocb, acb) >= 0) {
+        return &acb->common;
+    }
+
+    /* never queued: undo the accounting and give the ACB back */
+    s->qemu_aio_count--;
+    qemu_aio_release(acb);
+    return NULL;
+}
+
+/*
+ * Report the image length in bytes by seeking to the end of the gluster
+ * file.  A failed seek is returned as -errno.
+ */
+static int64_t qemu_gluster_getlength(BlockDriverState *bs)
+{
+    BDRVGlusterState *s = bs->opaque;
+    int64_t end = glfs_lseek(s->fd, 0, SEEK_END);
+
+    /* a negative offset from glfs_lseek() is treated as failure */
+    return end < 0 ? -errno : end;
+}
+
+/*
+ * Allocated size: st_blocks (512-byte units) from glfs_fstat(), or -errno.
+ */
+static int64_t qemu_gluster_allocated_file_size(BlockDriverState *bs)
+{
+    BDRVGlusterState *s = bs->opaque;
+    struct stat st;
+
+    if (glfs_fstat(s->fd, &st) < 0) {
+        return -errno;
+    }
+    return st.st_blocks * 512;
+}
+
+static void qemu_gluster_close(BlockDriverState *bs)
+{
+    BDRVGlusterState *s = bs->opaque;
+
+    /* Unregister first: once closed, the fd number may be reused elsewhere. */
+    qemu_aio_set_fd_handler(s->fds[GLUSTER_FD_READ], NULL, NULL, NULL, NULL);
+    close(s->fds[GLUSTER_FD_READ]);
+    close(s->fds[GLUSTER_FD_WRITE]);
+    if (s->fd) {
+        glfs_close(s->fd);
+        s->fd = NULL;
+    }
+    glfs_fini(s->glfs);
+}
+
+static QEMUOptionParameter qemu_gluster_create_options[] = {
+    {
+        .name = BLOCK_OPT_SIZE,
+        .type = OPT_SIZE,
+        .help = "Virtual disk size"
+    },
+    { NULL } /* final entry has a NULL name; presumably the list terminator */
+};
+
+static BlockDriver bdrv_gluster = {
+    .format_name                  = "gluster",
+    .protocol_name                = "gluster", /* varies per variant */
+    .instance_size                = sizeof(BDRVGlusterState),
+    .bdrv_file_open               = qemu_gluster_open,
+    .bdrv_close                   = qemu_gluster_close,
+    .bdrv_create                  = qemu_gluster_create,
+    .bdrv_getlength               = qemu_gluster_getlength,
+    .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
+    .bdrv_aio_readv               = qemu_gluster_aio_readv,
+    .bdrv_aio_writev              = qemu_gluster_aio_writev,
+    .bdrv_aio_flush               = qemu_gluster_aio_flush,
+    .create_options               = qemu_gluster_create_options,
+};
+
+static BlockDriver bdrv_gluster_tcp = {
+    .format_name                  = "gluster",
+    .protocol_name                = "gluster+tcp", /* sole difference */
+    .instance_size                = sizeof(BDRVGlusterState),
+    .bdrv_file_open               = qemu_gluster_open,
+    .bdrv_close                   = qemu_gluster_close,
+    .bdrv_create                  = qemu_gluster_create,
+    .bdrv_getlength               = qemu_gluster_getlength,
+    .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
+    .bdrv_aio_readv               = qemu_gluster_aio_readv,
+    .bdrv_aio_writev              = qemu_gluster_aio_writev,
+    .bdrv_aio_flush               = qemu_gluster_aio_flush,
+    .create_options               = qemu_gluster_create_options,
+};
+
+static BlockDriver bdrv_gluster_unix = {
+    .format_name                  = "gluster",
+    .protocol_name                = "gluster+unix", /* sole difference */
+    .instance_size                = sizeof(BDRVGlusterState),
+    .bdrv_file_open               = qemu_gluster_open,
+    .bdrv_close                   = qemu_gluster_close,
+    .bdrv_create                  = qemu_gluster_create,
+    .bdrv_getlength               = qemu_gluster_getlength,
+    .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
+    .bdrv_aio_readv               = qemu_gluster_aio_readv,
+    .bdrv_aio_writev              = qemu_gluster_aio_writev,
+    .bdrv_aio_flush               = qemu_gluster_aio_flush,
+    .create_options               = qemu_gluster_create_options,
+};
+
+static BlockDriver bdrv_gluster_rdma = {
+    .format_name                  = "gluster",
+    .protocol_name                = "gluster+rdma", /* sole difference */
+    .instance_size                = sizeof(BDRVGlusterState),
+    .bdrv_file_open               = qemu_gluster_open,
+    .bdrv_close                   = qemu_gluster_close,
+    .bdrv_create                  = qemu_gluster_create,
+    .bdrv_getlength               = qemu_gluster_getlength,
+    .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
+    .bdrv_aio_readv               = qemu_gluster_aio_readv,
+    .bdrv_aio_writev              = qemu_gluster_aio_writev,
+    .bdrv_aio_flush               = qemu_gluster_aio_flush,
+    .create_options               = qemu_gluster_create_options,
+};
+
+static void bdrv_gluster_init(void)
+{   /* registers all four BlockDriver variants defined above */
+    bdrv_register(&bdrv_gluster_rdma);
+    bdrv_register(&bdrv_gluster_unix);
+    bdrv_register(&bdrv_gluster_tcp);
+    bdrv_register(&bdrv_gluster);
+}
+
+block_init(bdrv_gluster_init);
diff --git a/block/iscsi.c b/block/iscsi.c
index bb9cf82459..041ee07de3 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -27,8 +27,9 @@
 #include <poll.h>
 #include <arpa/inet.h>
 #include "qemu-common.h"
-#include "qemu-error.h"
-#include "block_int.h"
+#include "qemu/config-file.h"
+#include "qemu/error-report.h"
+#include "block/block_int.h"
 #include "trace.h"
 #include "hw/scsi-defs.h"
 
@@ -65,21 +66,44 @@ typedef struct IscsiAIOCB {
 #endif
 } IscsiAIOCB;
 
-struct IscsiTask {
-    IscsiLun *iscsilun;
-    BlockDriverState *bs;
-    int status;
-    int complete;
-};
+/* Bottom half set up by iscsi_schedule_bh(): completes and frees a request. */
+static void
+iscsi_bh_cb(void *p)
+{
+    IscsiAIOCB *req = p;
+
+    qemu_bh_delete(req->bh);
+
+    /* a cancelled request must not reach its completion callback */
+    if (!req->canceled) {
+        req->common.cb(req->common.opaque, req->status);
+    }
+    if (req->task) {
+        scsi_free_scsi_task(req->task);
+        req->task = NULL;
+    }
+    qemu_aio_release(req);
+}
+
+static void
+iscsi_schedule_bh(IscsiAIOCB *acb)
+{
+    /* a bottom half already attached to this request is left alone */
+    if (acb->bh == NULL) {
+        acb->bh = qemu_bh_new(iscsi_bh_cb, acb);
+        qemu_bh_schedule(acb->bh);
+    }
+}
+
 
 static void
 iscsi_abort_task_cb(struct iscsi_context *iscsi, int status, void *command_data,
                     void *private_data)
 {
-    IscsiAIOCB *acb = (IscsiAIOCB *)private_data;
+    IscsiAIOCB *acb = private_data;
 
-    scsi_free_scsi_task(acb->task);
-    acb->task = NULL;
+    acb->status = -ECANCELED;
+    iscsi_schedule_bh(acb);
 }
 
 static void
@@ -88,18 +112,22 @@ iscsi_aio_cancel(BlockDriverAIOCB *blockacb)
     IscsiAIOCB *acb = (IscsiAIOCB *)blockacb;
     IscsiLun *iscsilun = acb->iscsilun;
 
-    acb->canceled = 1;
+    if (acb->status != -EINPROGRESS) {
+        return;
+    }
 
-    acb->common.cb(acb->common.opaque, -ECANCELED);
+    acb->canceled = 1;
 
-    /* send a task mgmt call to the target to cancel the task on the target
-     * this also cancels the task in libiscsi
-     */
+    /* send a task mgmt call to the target to cancel the task on the target */
     iscsi_task_mgmt_abort_task_async(iscsilun->iscsi, acb->task,
-                                     iscsi_abort_task_cb, &acb);
+                                     iscsi_abort_task_cb, acb);
+
+    while (acb->status == -EINPROGRESS) {
+        qemu_aio_wait();
+    }
 }
 
-static AIOPool iscsi_aio_pool = {
+static const AIOCBInfo iscsi_aiocb_info = {
     .aiocb_size         = sizeof(IscsiAIOCB),
     .cancel             = iscsi_aio_cancel,
 };
@@ -133,12 +161,6 @@ iscsi_set_events(IscsiLun *iscsilun)
 
     }
 
-    /* If we just added an event, the callback might be delayed
-     * unless we call qemu_notify_event().
-     */
-    if (ev & ~iscsilun->events) {
-        qemu_notify_event();
-    }
     iscsilun->events = ev;
 }
 
@@ -163,41 +185,6 @@ iscsi_process_write(void *arg)
 }
 
 
-static int
-iscsi_schedule_bh(QEMUBHFunc *cb, IscsiAIOCB *acb)
-{
-    acb->bh = qemu_bh_new(cb, acb);
-    if (!acb->bh) {
-        error_report("oom: could not create iscsi bh");
-        return -EIO;
-    }
-
-    qemu_bh_schedule(acb->bh);
-    return 0;
-}
-
-static void
-iscsi_readv_writev_bh_cb(void *p)
-{
-    IscsiAIOCB *acb = p;
-
-    qemu_bh_delete(acb->bh);
-
-    if (!acb->canceled) {
-        acb->common.cb(acb->common.opaque, acb->status);
-    }
-
-    qemu_aio_release(acb);
-
-    if (acb->canceled) {
-        return;
-    }
-
-    scsi_free_scsi_task(acb->task);
-    acb->task = NULL;
-}
-
-
 static void
 iscsi_aio_write16_cb(struct iscsi_context *iscsi, int status,
                      void *command_data, void *opaque)
@@ -208,8 +195,7 @@ iscsi_aio_write16_cb(struct iscsi_context *iscsi, int status,
 
     g_free(acb->buf);
 
-    if (acb->canceled) {
-        qemu_aio_release(acb);
+    if (acb->canceled != 0) {
         return;
     }
 
@@ -220,7 +206,7 @@ iscsi_aio_write16_cb(struct iscsi_context *iscsi, int status,
         acb->status = -EIO;
     }
 
-    iscsi_schedule_bh(iscsi_readv_writev_bh_cb, acb);
+    iscsi_schedule_bh(acb);
 }
 
 static int64_t sector_qemu2lun(int64_t sector, IscsiLun *iscsilun)
@@ -242,13 +228,15 @@ iscsi_aio_writev(BlockDriverState *bs, int64_t sector_num,
     uint64_t lba;
     struct iscsi_data data;
 
-    acb = qemu_aio_get(&iscsi_aio_pool, bs, cb, opaque);
+    acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque);
     trace_iscsi_aio_writev(iscsi, sector_num, nb_sectors, opaque, acb);
 
     acb->iscsilun = iscsilun;
     acb->qiov     = qiov;
 
     acb->canceled   = 0;
+    acb->bh         = NULL;
+    acb->status     = -EINPROGRESS;
 
     /* XXX we should pass the iovec to write16 to avoid the extra copy */
     /* this will allow us to get rid of 'buf' completely */
@@ -268,10 +256,6 @@ iscsi_aio_writev(BlockDriverState *bs, int64_t sector_num,
     acb->task->xfer_dir = SCSI_XFER_WRITE;
     acb->task->cdb_size = 16;
     acb->task->cdb[0] = 0x8a;
-    if (!(bs->open_flags & BDRV_O_CACHE_WB)) {
-        /* set FUA on writes when cache mode is write through */
-        acb->task->cdb[1] |= 0x04;
-    }
     lba = sector_qemu2lun(sector_num, iscsilun);
     *(uint32_t *)&acb->task->cdb[2]  = htonl(lba >> 32);
     *(uint32_t *)&acb->task->cdb[6]  = htonl(lba & 0xffffffff);
@@ -305,8 +289,7 @@ iscsi_aio_read16_cb(struct iscsi_context *iscsi, int status,
 
     trace_iscsi_aio_read16_cb(iscsi, status, acb, acb->canceled);
 
-    if (acb->canceled) {
-        qemu_aio_release(acb);
+    if (acb->canceled != 0) {
         return;
     }
 
@@ -317,7 +300,7 @@ iscsi_aio_read16_cb(struct iscsi_context *iscsi, int status,
         acb->status = -EIO;
     }
 
-    iscsi_schedule_bh(iscsi_readv_writev_bh_cb, acb);
+    iscsi_schedule_bh(acb);
 }
 
 static BlockDriverAIOCB *
@@ -336,13 +319,15 @@ iscsi_aio_readv(BlockDriverState *bs, int64_t sector_num,
 
     qemu_read_size = BDRV_SECTOR_SIZE * (size_t)nb_sectors;
 
-    acb = qemu_aio_get(&iscsi_aio_pool, bs, cb, opaque);
+    acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque);
     trace_iscsi_aio_readv(iscsi, sector_num, nb_sectors, opaque, acb);
 
     acb->iscsilun = iscsilun;
     acb->qiov     = qiov;
 
     acb->canceled    = 0;
+    acb->bh          = NULL;
+    acb->status      = -EINPROGRESS;
     acb->read_size   = qemu_read_size;
     acb->buf         = NULL;
 
@@ -389,7 +374,7 @@ iscsi_aio_readv(BlockDriverState *bs, int64_t sector_num,
         *(uint16_t *)&acb->task->cdb[7] = htons(num_sectors);
         break;
     }
-    
+
     if (iscsi_scsi_command_async(iscsi, iscsilun->lun, acb->task,
                                  iscsi_aio_read16_cb,
                                  NULL,
@@ -417,8 +402,7 @@ iscsi_synccache10_cb(struct iscsi_context *iscsi, int status,
 {
     IscsiAIOCB *acb = opaque;
 
-    if (acb->canceled) {
-        qemu_aio_release(acb);
+    if (acb->canceled != 0) {
         return;
     }
 
@@ -429,7 +413,7 @@ iscsi_synccache10_cb(struct iscsi_context *iscsi, int status,
         acb->status = -EIO;
     }
 
-    iscsi_schedule_bh(iscsi_readv_writev_bh_cb, acb);
+    iscsi_schedule_bh(acb);
 }
 
 static BlockDriverAIOCB *
@@ -440,10 +424,12 @@ iscsi_aio_flush(BlockDriverState *bs,
     struct iscsi_context *iscsi = iscsilun->iscsi;
     IscsiAIOCB *acb;
 
-    acb = qemu_aio_get(&iscsi_aio_pool, bs, cb, opaque);
+    acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque);
 
     acb->iscsilun = iscsilun;
     acb->canceled   = 0;
+    acb->bh         = NULL;
+    acb->status     = -EINPROGRESS;
 
     acb->task = iscsi_synchronizecache10_task(iscsi, iscsilun->lun,
                                          0, 0, 0, 0,
@@ -467,8 +453,7 @@ iscsi_unmap_cb(struct iscsi_context *iscsi, int status,
 {
     IscsiAIOCB *acb = opaque;
 
-    if (acb->canceled) {
-        qemu_aio_release(acb);
+    if (acb->canceled != 0) {
         return;
     }
 
@@ -479,7 +464,7 @@ iscsi_unmap_cb(struct iscsi_context *iscsi, int status,
         acb->status = -EIO;
     }
 
-    iscsi_schedule_bh(iscsi_readv_writev_bh_cb, acb);
+    iscsi_schedule_bh(acb);
 }
 
 static BlockDriverAIOCB *
@@ -492,10 +477,12 @@ iscsi_aio_discard(BlockDriverState *bs,
     IscsiAIOCB *acb;
     struct unmap_list list[1];
 
-    acb = qemu_aio_get(&iscsi_aio_pool, bs, cb, opaque);
+    acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque);
 
     acb->iscsilun = iscsilun;
     acb->canceled   = 0;
+    acb->bh         = NULL;
+    acb->status     = -EINPROGRESS;
 
     list[0].lba = sector_qemu2lun(sector_num, iscsilun);
     list[0].num = nb_sectors * BDRV_SECTOR_SIZE / iscsilun->block_size;
@@ -523,8 +510,7 @@ iscsi_aio_ioctl_cb(struct iscsi_context *iscsi, int status,
 {
     IscsiAIOCB *acb = opaque;
 
-    if (acb->canceled) {
-        qemu_aio_release(acb);
+    if (acb->canceled != 0) {
         return;
     }
 
@@ -552,7 +538,7 @@ iscsi_aio_ioctl_cb(struct iscsi_context *iscsi, int status,
         memcpy(acb->ioh->sbp, &acb->task->datain.data[2], ss);
     }
 
-    iscsi_schedule_bh(iscsi_readv_writev_bh_cb, acb);
+    iscsi_schedule_bh(acb);
 }
 
 static BlockDriverAIOCB *iscsi_aio_ioctl(BlockDriverState *bs,
@@ -566,10 +552,12 @@ static BlockDriverAIOCB *iscsi_aio_ioctl(BlockDriverState *bs,
 
     assert(req == SG_IO);
 
-    acb = qemu_aio_get(&iscsi_aio_pool, bs, cb, opaque);
+    acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque);
 
     acb->iscsilun = iscsilun;
     acb->canceled    = 0;
+    acb->bh          = NULL;
+    acb->status      = -EINPROGRESS;
     acb->buf         = NULL;
     acb->ioh         = buf;
 
@@ -624,9 +612,17 @@ static BlockDriverAIOCB *iscsi_aio_ioctl(BlockDriverState *bs,
     return &acb->common;
 }
 
+
+static void ioctl_cb(void *opaque, int status)
+{
+    /* opaque points at the caller's int status slot */
+    *(int *)opaque = status;
+}
+
 static int iscsi_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
 {
     IscsiLun *iscsilun = bs->opaque;
+    int status;
 
     switch (req) {
     case SG_GET_VERSION_NUM:
@@ -635,6 +631,15 @@ static int iscsi_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
     case SG_GET_SCSI_ID:
         ((struct sg_scsi_id *)buf)->scsi_type = iscsilun->type;
         break;
+    case SG_IO:
+        status = -EINPROGRESS;
+        iscsi_aio_ioctl(bs, req, buf, ioctl_cb, &status);
+
+        while (status == -EINPROGRESS) {
+            qemu_aio_wait();
+        }
+
+        return 0;
     default:
         return -1;
     }
@@ -654,158 +659,6 @@ iscsi_getlength(BlockDriverState *bs)
     return len;
 }
 
-static void
-iscsi_readcapacity16_cb(struct iscsi_context *iscsi, int status,
-                        void *command_data, void *opaque)
-{
-    struct IscsiTask *itask = opaque;
-    struct scsi_readcapacity16 *rc16;
-    struct scsi_task *task = command_data;
-
-    if (status != 0) {
-        error_report("iSCSI: Failed to read capacity of iSCSI lun. %s",
-                     iscsi_get_error(iscsi));
-        itask->status   = 1;
-        itask->complete = 1;
-        scsi_free_scsi_task(task);
-        return;
-    }
-
-    rc16 = scsi_datain_unmarshall(task);
-    if (rc16 == NULL) {
-        error_report("iSCSI: Failed to unmarshall readcapacity16 data.");
-        itask->status   = 1;
-        itask->complete = 1;
-        scsi_free_scsi_task(task);
-        return;
-    }
-
-    itask->iscsilun->block_size = rc16->block_length;
-    itask->iscsilun->num_blocks = rc16->returned_lba + 1;
-    itask->bs->total_sectors    = itask->iscsilun->num_blocks *
-                               itask->iscsilun->block_size / BDRV_SECTOR_SIZE ;
-
-    itask->status   = 0;
-    itask->complete = 1;
-    scsi_free_scsi_task(task);
-}
-
-static void
-iscsi_readcapacity10_cb(struct iscsi_context *iscsi, int status,
-                        void *command_data, void *opaque)
-{
-    struct IscsiTask *itask = opaque;
-    struct scsi_readcapacity10 *rc10;
-    struct scsi_task *task = command_data;
-
-    if (status != 0) {
-        error_report("iSCSI: Failed to read capacity of iSCSI lun. %s",
-                     iscsi_get_error(iscsi));
-        itask->status   = 1;
-        itask->complete = 1;
-        scsi_free_scsi_task(task);
-        return;
-    }
-
-    rc10 = scsi_datain_unmarshall(task);
-    if (rc10 == NULL) {
-        error_report("iSCSI: Failed to unmarshall readcapacity10 data.");
-        itask->status   = 1;
-        itask->complete = 1;
-        scsi_free_scsi_task(task);
-        return;
-    }
-
-    itask->iscsilun->block_size = rc10->block_size;
-    itask->iscsilun->num_blocks = rc10->lba + 1;
-    itask->bs->total_sectors    = itask->iscsilun->num_blocks *
-                               itask->iscsilun->block_size / BDRV_SECTOR_SIZE ;
-
-    itask->status   = 0;
-    itask->complete = 1;
-    scsi_free_scsi_task(task);
-}
-
-static void
-iscsi_inquiry_cb(struct iscsi_context *iscsi, int status, void *command_data,
-                 void *opaque)
-{
-    struct IscsiTask *itask = opaque;
-    struct scsi_task *task = command_data;
-    struct scsi_inquiry_standard *inq;
-
-    if (status != 0) {
-        itask->status   = 1;
-        itask->complete = 1;
-        scsi_free_scsi_task(task);
-        return;
-    }
-
-    inq = scsi_datain_unmarshall(task);
-    if (inq == NULL) {
-        error_report("iSCSI: Failed to unmarshall inquiry data.");
-        itask->status   = 1;
-        itask->complete = 1;
-        scsi_free_scsi_task(task);
-        return;
-    }
-
-    itask->iscsilun->type = inq->periperal_device_type;
-
-    scsi_free_scsi_task(task);
-
-    switch (itask->iscsilun->type) {
-    case TYPE_DISK:
-        task = iscsi_readcapacity16_task(iscsi, itask->iscsilun->lun,
-                                   iscsi_readcapacity16_cb, opaque);
-        if (task == NULL) {
-            error_report("iSCSI: failed to send readcapacity16 command.");
-            itask->status   = 1;
-            itask->complete = 1;
-            return;
-        }
-        break;
-    case TYPE_ROM:
-        task = iscsi_readcapacity10_task(iscsi, itask->iscsilun->lun,
-                                   0, 0,
-                                   iscsi_readcapacity10_cb, opaque);
-        if (task == NULL) {
-            error_report("iSCSI: failed to send readcapacity16 command.");
-            itask->status   = 1;
-            itask->complete = 1;
-            return;
-        }
-        break;
-    default:
-        itask->status   = 0;
-        itask->complete = 1;
-    }
-}
-
-static void
-iscsi_connect_cb(struct iscsi_context *iscsi, int status, void *command_data,
-                 void *opaque)
-{
-    struct IscsiTask *itask = opaque;
-    struct scsi_task *task;
-
-    if (status != 0) {
-        itask->status   = 1;
-        itask->complete = 1;
-        return;
-    }
-
-    task = iscsi_inquiry_task(iscsi, itask->iscsilun->lun,
-                              0, 0, 36,
-                              iscsi_inquiry_cb, opaque);
-    if (task == NULL) {
-        error_report("iSCSI: failed to send inquiry command.");
-        itask->status   = 1;
-        itask->complete = 1;
-        return;
-    }
-}
-
 static int parse_chap(struct iscsi_context *iscsi, const char *target)
 {
     QemuOptsList *list;
@@ -918,7 +771,10 @@ static int iscsi_open(BlockDriverState *bs, const char *filename, int flags)
     IscsiLun *iscsilun = bs->opaque;
     struct iscsi_context *iscsi = NULL;
     struct iscsi_url *iscsi_url = NULL;
-    struct IscsiTask task;
+    struct scsi_task *task = NULL;
+    struct scsi_inquiry_standard *inq = NULL;
+    struct scsi_readcapacity10 *rc10 = NULL;
+    struct scsi_readcapacity16 *rc16 = NULL;
     char *initiator_name = NULL;
     int ret;
 
@@ -931,8 +787,7 @@ static int iscsi_open(BlockDriverState *bs, const char *filename, int flags)
 
     iscsi_url = iscsi_parse_full_url(iscsi, filename);
     if (iscsi_url == NULL) {
-        error_report("Failed to parse URL : %s %s", filename,
-                     iscsi_get_error(iscsi));
+        error_report("Failed to parse URL : %s", filename);
         ret = -EINVAL;
         goto out;
     }
@@ -982,33 +837,80 @@ static int iscsi_open(BlockDriverState *bs, const char *filename, int flags)
     /* check if we got HEADER_DIGEST via the options */
     parse_header_digest(iscsi, iscsi_url->target);
 
-    task.iscsilun = iscsilun;
-    task.status = 0;
-    task.complete = 0;
-    task.bs = bs;
+    if (iscsi_full_connect_sync(iscsi, iscsi_url->portal, iscsi_url->lun) != 0) {
+        error_report("iSCSI: Failed to connect to LUN : %s",
+            iscsi_get_error(iscsi));
+        ret = -EINVAL;
+        goto out;
+    }
 
     iscsilun->iscsi = iscsi;
     iscsilun->lun   = iscsi_url->lun;
 
-    if (iscsi_full_connect_async(iscsi, iscsi_url->portal, iscsi_url->lun,
-                                 iscsi_connect_cb, &task)
-        != 0) {
-        error_report("iSCSI: Failed to start async connect.");
+    task = iscsi_inquiry_sync(iscsi, iscsilun->lun, 0, 0, 36);
+
+    if (task == NULL || task->status != SCSI_STATUS_GOOD) {
+        error_report("iSCSI: failed to send inquiry command.");
         ret = -EINVAL;
         goto out;
     }
 
-    while (!task.complete) {
-        iscsi_set_events(iscsilun);
-        qemu_aio_wait();
-    }
-    if (task.status != 0) {
-        error_report("iSCSI: Failed to connect to LUN : %s",
-                     iscsi_get_error(iscsi));
+    inq = scsi_datain_unmarshall(task);
+    if (inq == NULL) {
+        error_report("iSCSI: Failed to unmarshall inquiry data.");
         ret = -EINVAL;
         goto out;
     }
 
+    iscsilun->type = inq->periperal_device_type;
+
+    scsi_free_scsi_task(task);
+    task = NULL; /* the out: path frees a non-NULL task; avoid a double free */
+    switch (iscsilun->type) {
+    case TYPE_DISK:
+        task = iscsi_readcapacity16_sync(iscsi, iscsilun->lun);
+        if (task == NULL || task->status != SCSI_STATUS_GOOD) {
+            error_report("iSCSI: failed to send readcapacity16 command.");
+            ret = -EINVAL;
+            goto out;
+        }
+        rc16 = scsi_datain_unmarshall(task);
+        if (rc16 == NULL) {
+            error_report("iSCSI: Failed to unmarshall readcapacity16 data.");
+            ret = -EINVAL;
+            goto out;
+        }
+        iscsilun->block_size = rc16->block_length;
+        iscsilun->num_blocks = rc16->returned_lba + 1;
+        break;
+    case TYPE_ROM:
+        task = iscsi_readcapacity10_sync(iscsi, iscsilun->lun, 0, 0);
+        if (task == NULL || task->status != SCSI_STATUS_GOOD) {
+            error_report("iSCSI: failed to send readcapacity10 command.");
+            ret = -EINVAL;
+            goto out;
+        }
+        rc10 = scsi_datain_unmarshall(task);
+        if (rc10 == NULL) {
+            error_report("iSCSI: Failed to unmarshall readcapacity10 data.");
+            ret = -EINVAL;
+            goto out;
+        }
+        iscsilun->block_size = rc10->block_size;
+        if (rc10->lba == 0) {
+            /* blank disk loaded */
+            iscsilun->num_blocks = 0;
+        } else {
+            iscsilun->num_blocks = rc10->lba + 1;
+        }
+        break;
+    default:
+        break;
+    }
+
+    bs->total_sectors    = iscsilun->num_blocks *
+                           iscsilun->block_size / BDRV_SECTOR_SIZE ;
+
     /* Medium changer or tape. We dont have any emulation for this so this must
      * be sg ioctl compatible. We force it to be sg, otherwise qemu will try
      * to read from the device to guess the image format.
@@ -1027,6 +929,9 @@ out:
     if (iscsi_url != NULL) {
         iscsi_destroy_url(iscsi_url);
     }
+    if (task != NULL) {
+        scsi_free_scsi_task(task);
+    }
 
     if (ret) {
         if (iscsi != NULL) {
@@ -1047,6 +952,11 @@ static void iscsi_close(BlockDriverState *bs)
     memset(iscsilun, 0, sizeof(IscsiLun));
 }
 
+static int iscsi_has_zero_init(BlockDriverState *bs)
+{
+    return 0; /* unconditional: bs is not consulted */
+}
+
 static BlockDriver bdrv_iscsi = {
     .format_name     = "iscsi",
     .protocol_name   = "iscsi",
@@ -1062,6 +972,7 @@ static BlockDriver bdrv_iscsi = {
     .bdrv_aio_flush  = iscsi_aio_flush,
 
     .bdrv_aio_discard = iscsi_aio_discard,
+    .bdrv_has_zero_init = iscsi_has_zero_init,
 
 #ifdef __linux__
     .bdrv_ioctl       = iscsi_ioctl,
diff --git a/block/linux-aio.c b/block/linux-aio.c
new file mode 100644
index 0000000000..ee0f8d10c9
--- /dev/null
+++ b/block/linux-aio.c
@@ -0,0 +1,216 @@
+/*
+ * Linux native AIO support.
+ *
+ * Copyright (C) 2009 IBM, Corp.
+ * Copyright (C) 2009 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+#include "qemu-common.h"
+#include "block/aio.h"
+#include "qemu/queue.h"
+#include "block/raw-aio.h"
+#include "qemu/event_notifier.h"
+
+#include <libaio.h>
+
+/*
+ * Queue size (per-device).
+ *
+ * XXX: eventually we need to communicate this to the guest and/or make it
+ *      tunable by the guest.  If we get more outstanding requests at a time
+ *      than this we will get EAGAIN from io_submit which is communicated to
+ *      the guest as an I/O error.
+ */
+#define MAX_EVENTS 128
+
+struct qemu_laiocb {
+    BlockDriverAIOCB common;        /* first: laio_cancel() casts from this */
+    struct qemu_laio_state *ctx;    /* state the request was submitted on */
+    struct iocb iocb;               /* kernel iocb; container_of() maps back */
+    ssize_t ret;                    /* -EINPROGRESS until a result is stored */
+    size_t nbytes;                  /* requested length in bytes */
+    QEMUIOVector *qiov;             /* caller's vector; padded on short read */
+    bool is_read;                   /* type == QEMU_AIO_READ at submit time */
+    QLIST_ENTRY(qemu_laiocb) node;  /* not referenced in this file's code */
+};
+
+struct qemu_laio_state {
+    io_context_t ctx;   /* kernel AIO context created by io_setup() */
+    EventNotifier e;    /* its fd is attached to every iocb via io_set_eventfd */
+    int count;          /* submitted requests not yet completed */
+};
+
+static inline ssize_t io_event_ret(struct io_event *ev)
+{   /* res2 shifted into the upper 32 bits, OR-ed with res */
+    return (ssize_t)(((uint64_t)ev->res2 << 32) | ev->res);
+}
+
+/*
+ * Finish one request: drop it from the in-flight count, translate the raw
+ * result, invoke the completion callback (unless cancelled) and free the ACB.
+ */
+static void qemu_laio_process_completion(struct qemu_laio_state *s,
+    struct qemu_laiocb *laiocb)
+{
+    int ret = laiocb->ret;
+
+    s->count--;
+
+    if (ret == -ECANCELED) {
+        /* cancelled requests are released without notifying the caller */
+        qemu_aio_release(laiocb);
+        return;
+    }
+
+    if (ret == laiocb->nbytes) {
+        ret = 0;
+    } else if (ret >= 0 && !laiocb->is_read) {
+        ret = -EINVAL;
+    } else if (ret >= 0) {
+        /* Short reads mean EOF, pad with zeros. */
+        qemu_iovec_memset(laiocb->qiov, ret, 0, laiocb->qiov->size - ret);
+    }
+
+    laiocb->common.cb(laiocb->common.opaque, ret);
+    qemu_aio_release(laiocb);
+}
+
+static void qemu_laio_completion_cb(EventNotifier *e)
+{
+    struct qemu_laio_state *s = container_of(e, struct qemu_laio_state, e);
+    struct io_event events[MAX_EVENTS];
+
+    /* keep going for as long as the notifier reports a pending event */
+    while (event_notifier_test_and_clear(&s->e)) {
+        struct timespec no_wait = { 0 };
+        int reaped, idx;
+        /* zero timeout; the call is retried only when it returns -EINTR */
+        do {
+            reaped = io_getevents(s->ctx, MAX_EVENTS, MAX_EVENTS, events,
+                                  &no_wait);
+        } while (reaped == -EINTR);
+
+        for (idx = 0; idx < reaped; idx++) {
+            struct qemu_laiocb *laiocb =
+                container_of(events[idx].obj, struct qemu_laiocb, iocb);
+            laiocb->ret = io_event_ret(&events[idx]);
+            qemu_laio_process_completion(s, laiocb);
+        }
+    }
+}
+
+static int qemu_laio_flush_cb(EventNotifier *e)
+{
+    struct qemu_laio_state *s = container_of(e, struct qemu_laio_state, e);
+    /* 1 while any submitted request is still outstanding, else 0 */
+    return s->count > 0;
+}
+
+static void laio_cancel(BlockDriverAIOCB *blockacb)
+{
+    struct qemu_laiocb *req = (struct qemu_laiocb *)blockacb;
+    struct io_event cancelled_ev;
+
+    /* Already completed (or cancelled): nothing left to do. */
+    if (req->ret != -EINPROGRESS) {
+        return;
+    }
+
+    /*
+     * Ask the kernel to cancel the request first.
+     * Note that as of Linux 2.6.31 neither the block device code nor any
+     * filesystem implements cancellation of AIO request, so the polling
+     * loop below is the normal code path.
+     */
+    if (io_cancel(req->ctx->ctx, &req->iocb, &cancelled_ev) == 0) {
+        req->ret = -ECANCELED;
+        return;
+    }
+
+    /*
+     * The request has to run to completion.  Its status is only updated
+     * when the io context is polled, so keep reaping completions until it
+     * changes.  (Dropping the O_NONBLOCK flag might make this wait slightly
+     * more efficient.)
+     */
+    while (req->ret == -EINPROGRESS) {
+        qemu_laio_completion_cb(&req->ctx->e);
+    }
+}
+
+static const AIOCBInfo laio_aiocb_info = {
+    .cancel             = laio_cancel,
+    .aiocb_size         = sizeof(struct qemu_laiocb),
+};
+
+/*
+ * Queue one vectored read or write on the Linux AIO context behind aio_ctx.
+ * Returns the request's ACB, or NULL if the type is unsupported or the
+ * kernel refuses the submission.
+ */
+BlockDriverAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque, int type)
+{
+    struct qemu_laio_state *state = aio_ctx;
+    struct qemu_laiocb *req = qemu_aio_get(&laio_aiocb_info, bs, cb, opaque);
+    struct iocb *iocbs = &req->iocb;
+    off_t offset = sector_num * 512;
+
+    req->ctx = state;
+    req->qiov = qiov;
+    req->nbytes = nb_sectors * 512;
+    req->is_read = (type == QEMU_AIO_READ);
+    req->ret = -EINPROGRESS;
+
+    /* Only vectored reads and writes are prepared as a kernel iocb here. */
+    if (type == QEMU_AIO_WRITE) {
+        io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
+    } else if (type == QEMU_AIO_READ) {
+        io_prep_preadv(iocbs, fd, qiov->iov, qiov->niov, offset);
+    } else {
+        /* Currently Linux kernel does not support other operations */
+        fprintf(stderr, "%s: invalid AIO request type 0x%x.\n",
+                        __func__, type);
+        qemu_aio_release(req);
+        return NULL;
+    }
+    io_set_eventfd(iocbs, event_notifier_get_fd(&state->e));
+
+    /* Count the request as in flight before submitting; undo on failure. */
+    state->count++;
+    if (io_submit(state->ctx, 1, &iocbs) < 0) {
+        state->count--;
+        qemu_aio_release(req);
+        return NULL;
+    }
+
+    return &req->common;
+}
+
+/*
+ * Allocate the Linux AIO state: an event notifier for completions plus a
+ * kernel io context sized for MAX_EVENTS.  Returns NULL on failure.
+ */
+void *laio_init(void)
+{
+    struct qemu_laio_state *state = g_malloc0(sizeof(*state));
+
+    if (event_notifier_init(&state->e, false) < 0) {
+        g_free(state);
+        return NULL;
+    }
+    if (io_setup(MAX_EVENTS, &state->ctx) != 0) {
+        event_notifier_cleanup(&state->e);
+        g_free(state);
+        return NULL;
+    }
+
+    /* completion_cb reaps finished requests; flush_cb reports in-flight ones */
+    qemu_aio_set_event_notifier(&state->e, qemu_laio_completion_cb,
+                                qemu_laio_flush_cb);
+
+    return state;
+}
diff --git a/block/mirror.c b/block/mirror.c
new file mode 100644
index 0000000000..8aeacbf12c
--- /dev/null
+++ b/block/mirror.c
@@ -0,0 +1,322 @@
+/*
+ * Image mirroring
+ *
+ * Copyright Red Hat, Inc. 2012
+ *
+ * Authors:
+ *  Paolo Bonzini  <pbonzini@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include "trace.h"
+#include "block/blockjob.h"
+#include "block/block_int.h"
+#include "qemu/ratelimit.h"
+
+enum {
+    /*
+     * Size of data buffer for populating the image file.  This should be large
+     * enough to process multiple clusters in a single call, so that populating
+     * contiguous regions of the image is efficient.
+     */
+    BLOCK_SIZE = 512 * BDRV_SECTORS_PER_DIRTY_CHUNK, /* in bytes */
+};
+
+#define SLICE_TIME 100000000ULL /* ns */
+
+typedef struct MirrorBlockJob {
+    BlockJob common;
+    RateLimit limit;
+    BlockDriverState *target;
+    MirrorSyncMode mode;
+    BlockdevOnError on_source_error, on_target_error;
+    bool synced;
+    bool should_complete;
+    int64_t sector_num;
+    uint8_t *buf;
+} MirrorBlockJob;
+
+static BlockErrorAction mirror_error_action(MirrorBlockJob *s, bool read,
+                                            int error)
+{
+    s->synced = false;
+    if (read) {
+        return block_job_error_action(&s->common, s->common.bs,
+                                      s->on_source_error, true, error);
+    } else {
+        return block_job_error_action(&s->common, s->target,
+                                      s->on_target_error, false, error);
+    }
+}
+
+static int coroutine_fn mirror_iteration(MirrorBlockJob *s,
+                                         BlockErrorAction *p_action)
+{
+    BlockDriverState *source = s->common.bs;
+    BlockDriverState *target = s->target;
+    QEMUIOVector qiov;
+    int ret, nb_sectors;
+    int64_t end;
+    struct iovec iov;
+
+    end = s->common.len >> BDRV_SECTOR_BITS;
+    s->sector_num = bdrv_get_next_dirty(source, s->sector_num);
+    nb_sectors = MIN(BDRV_SECTORS_PER_DIRTY_CHUNK, end - s->sector_num);
+    bdrv_reset_dirty(source, s->sector_num, nb_sectors);
+
+    /* Copy the dirty cluster.  */
+    iov.iov_base = s->buf;
+    iov.iov_len  = nb_sectors * 512;
+    qemu_iovec_init_external(&qiov, &iov, 1);
+
+    trace_mirror_one_iteration(s, s->sector_num, nb_sectors);
+    ret = bdrv_co_readv(source, s->sector_num, nb_sectors, &qiov);
+    if (ret < 0) {
+        *p_action = mirror_error_action(s, true, -ret);
+        goto fail;
+    }
+    ret = bdrv_co_writev(target, s->sector_num, nb_sectors, &qiov);
+    if (ret < 0) {
+        *p_action = mirror_error_action(s, false, -ret);
+        s->synced = false;
+        goto fail;
+    }
+    return 0;
+
+fail:
+    /* Try again later.  */
+    bdrv_set_dirty(source, s->sector_num, nb_sectors);
+    return ret;
+}
+
+static void coroutine_fn mirror_run(void *opaque)
+{
+    MirrorBlockJob *s = opaque;
+    BlockDriverState *bs = s->common.bs;
+    int64_t sector_num, end;
+    int ret = 0;
+    int n;
+
+    if (block_job_is_cancelled(&s->common)) {
+        goto immediate_exit;
+    }
+
+    s->common.len = bdrv_getlength(bs);
+    if (s->common.len < 0) {
+        ret = s->common.len;
+        goto immediate_exit;
+    }
+
+    end = s->common.len >> BDRV_SECTOR_BITS;
+    s->buf = qemu_blockalign(bs, BLOCK_SIZE);
+
+    if (s->mode != MIRROR_SYNC_MODE_NONE) {
+        /* First part, loop on the sectors and initialize the dirty bitmap.  */
+        BlockDriverState *base;
+        base = s->mode == MIRROR_SYNC_MODE_FULL ? NULL : bs->backing_hd;
+        for (sector_num = 0; sector_num < end; ) {
+            int64_t next = (sector_num | (BDRV_SECTORS_PER_DIRTY_CHUNK - 1)) + 1;
+            ret = bdrv_co_is_allocated_above(bs, base,
+                                             sector_num, next - sector_num, &n);
+
+            if (ret < 0) {
+                goto immediate_exit;
+            }
+
+            assert(n > 0);
+            if (ret == 1) {
+                bdrv_set_dirty(bs, sector_num, n);
+                sector_num = next;
+            } else {
+                sector_num += n;
+            }
+        }
+    }
+
+    s->sector_num = -1;
+    for (;;) {
+        uint64_t delay_ns;
+        int64_t cnt;
+        bool should_complete;
+
+        cnt = bdrv_get_dirty_count(bs);
+        if (cnt != 0) {
+            BlockErrorAction action = BDRV_ACTION_REPORT;
+            ret = mirror_iteration(s, &action);
+            if (ret < 0 && action == BDRV_ACTION_REPORT) {
+                goto immediate_exit;
+            }
+            cnt = bdrv_get_dirty_count(bs);
+        }
+
+        should_complete = false;
+        if (cnt == 0) {
+            trace_mirror_before_flush(s);
+            ret = bdrv_flush(s->target);
+            if (ret < 0) {
+                if (mirror_error_action(s, false, -ret) == BDRV_ACTION_REPORT) {
+                    goto immediate_exit;
+                }
+            } else {
+                /* We're out of the streaming phase.  From now on, if the job
+                 * is cancelled we will actually complete all pending I/O and
+                 * report completion.  This way, block-job-cancel will leave
+                 * the target in a consistent state.
+                 */
+                s->common.offset = end * BDRV_SECTOR_SIZE;
+                if (!s->synced) {
+                    block_job_ready(&s->common);
+                    s->synced = true;
+                }
+
+                should_complete = s->should_complete ||
+                    block_job_is_cancelled(&s->common);
+                cnt = bdrv_get_dirty_count(bs);
+            }
+        }
+
+        if (cnt == 0 && should_complete) {
+            /* The dirty bitmap is not updated while operations are pending.
+             * If we're about to exit, wait for pending operations before
+             * calling bdrv_get_dirty_count(bs), or we may exit while the
+             * source has dirty data to copy!
+             *
+             * Note that I/O can be submitted by the guest while
+             * mirror_iteration runs.
+             */
+            trace_mirror_before_drain(s, cnt);
+            bdrv_drain_all();
+            cnt = bdrv_get_dirty_count(bs);
+        }
+
+        ret = 0;
+        trace_mirror_before_sleep(s, cnt, s->synced);
+        if (!s->synced) {
+            /* Publish progress */
+            s->common.offset = end * BDRV_SECTOR_SIZE - cnt * BLOCK_SIZE;
+
+            if (s->common.speed) {
+                delay_ns = ratelimit_calculate_delay(&s->limit, BDRV_SECTORS_PER_DIRTY_CHUNK);
+            } else {
+                delay_ns = 0;
+            }
+
+            /* Note that even when no rate limit is applied we need to yield
+             * with no pending I/O here so that bdrv_drain_all() returns.
+             */
+            block_job_sleep_ns(&s->common, rt_clock, delay_ns);
+            if (block_job_is_cancelled(&s->common)) {
+                break;
+            }
+        } else if (!should_complete) {
+            delay_ns = (cnt == 0 ? SLICE_TIME : 0);
+            block_job_sleep_ns(&s->common, rt_clock, delay_ns);
+        } else if (cnt == 0) {
+            /* The two disks are in sync.  Exit and report successful
+             * completion.
+             */
+            assert(QLIST_EMPTY(&bs->tracked_requests));
+            s->common.cancelled = false;
+            break;
+        }
+    }
+
+immediate_exit:
+    qemu_vfree(s->buf);
+    bdrv_set_dirty_tracking(bs, false);
+    bdrv_iostatus_disable(s->target);
+    if (s->should_complete && ret == 0) {
+        if (bdrv_get_flags(s->target) != bdrv_get_flags(s->common.bs)) {
+            bdrv_reopen(s->target, bdrv_get_flags(s->common.bs), NULL);
+        }
+        bdrv_swap(s->target, s->common.bs);
+    }
+    bdrv_close(s->target);
+    bdrv_delete(s->target);
+    block_job_completed(&s->common, ret);
+}
+
+static void mirror_set_speed(BlockJob *job, int64_t speed, Error **errp)
+{
+    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
+
+    if (speed < 0) {
+        error_set(errp, QERR_INVALID_PARAMETER, "speed");
+        return;
+    }
+    ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME);
+}
+
+static void mirror_iostatus_reset(BlockJob *job)
+{
+    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
+
+    bdrv_iostatus_reset(s->target);
+}
+
+static void mirror_complete(BlockJob *job, Error **errp)
+{
+    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
+    int ret;
+
+    ret = bdrv_open_backing_file(s->target);
+    if (ret < 0) {
+        char backing_filename[PATH_MAX];
+        bdrv_get_full_backing_filename(s->target, backing_filename,
+                                       sizeof(backing_filename));
+        error_set(errp, QERR_OPEN_FILE_FAILED, backing_filename);
+        return;
+    }
+    if (!s->synced) {
+        error_set(errp, QERR_BLOCK_JOB_NOT_READY, job->bs->device_name);
+        return;
+    }
+
+    s->should_complete = true;
+    block_job_resume(job);
+}
+
+static BlockJobType mirror_job_type = {
+    .instance_size = sizeof(MirrorBlockJob),
+    .job_type      = "mirror",
+    .set_speed     = mirror_set_speed,
+    .iostatus_reset= mirror_iostatus_reset,
+    .complete      = mirror_complete,
+};
+
+void mirror_start(BlockDriverState *bs, BlockDriverState *target,
+                  int64_t speed, MirrorSyncMode mode,
+                  BlockdevOnError on_source_error,
+                  BlockdevOnError on_target_error,
+                  BlockDriverCompletionFunc *cb,
+                  void *opaque, Error **errp)
+{
+    MirrorBlockJob *s;
+
+    if ((on_source_error == BLOCKDEV_ON_ERROR_STOP ||
+         on_source_error == BLOCKDEV_ON_ERROR_ENOSPC) &&
+        !bdrv_iostatus_is_enabled(bs)) {
+        error_set(errp, QERR_INVALID_PARAMETER, "on-source-error");
+        return;
+    }
+
+    s = block_job_create(&mirror_job_type, bs, speed, cb, opaque, errp);
+    if (!s) {
+        return;
+    }
+
+    s->on_source_error = on_source_error;
+    s->on_target_error = on_target_error;
+    s->target = target;
+    s->mode = mode;
+    bdrv_set_dirty_tracking(bs, true);
+    bdrv_set_enable_write_cache(s->target, true);
+    bdrv_set_on_error(s->target, on_target_error, on_target_error);
+    bdrv_iostatus_enable(s->target);
+    s->common.co = qemu_coroutine_create(mirror_run);
+    trace_mirror_start(bs, s, s->common.co, opaque);
+    qemu_coroutine_enter(s->common.co, s);
+}
diff --git a/block/nbd.c b/block/nbd.c
index 2bce47bf7a..a5812948d2 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -27,10 +27,11 @@
  */
 
 #include "qemu-common.h"
-#include "nbd.h"
-#include "block_int.h"
-#include "module.h"
-#include "qemu_socket.h"
+#include "block/nbd.h"
+#include "qemu/uri.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
+#include "qemu/sockets.h"
 
 #include <sys/types.h>
 #include <unistd.h>
@@ -55,7 +56,6 @@ typedef struct BDRVNBDState {
     uint32_t nbdflags;
     off_t size;
     size_t blocksize;
-    char *export_name; /* An NBD server may export several devices */
 
     CoMutex send_mutex;
     CoMutex free_sema;
@@ -65,13 +65,75 @@ typedef struct BDRVNBDState {
     Coroutine *recv_coroutine[MAX_NBD_REQUESTS];
     struct nbd_reply reply;
 
-    /* If it begins with  '/', this is a UNIX domain socket. Otherwise,
-     * it's a string of the form <hostname|ip4|\[ip6\]>:port
-     */
+    int is_unix;
     char *host_spec;
+    char *export_name; /* An NBD server may export several devices */
 } BDRVNBDState;
 
-static int nbd_config(BDRVNBDState *s, const char *filename, int flags)
+static int nbd_parse_uri(BDRVNBDState *s, const char *filename)
+{
+    URI *uri;
+    const char *p;
+    QueryParams *qp = NULL;
+    int ret = 0;
+
+    uri = uri_parse(filename);
+    if (!uri) {
+        return -EINVAL;
+    }
+
+    /* transport */
+    if (!strcmp(uri->scheme, "nbd")) {
+        s->is_unix = false;
+    } else if (!strcmp(uri->scheme, "nbd+tcp")) {
+        s->is_unix = false;
+    } else if (!strcmp(uri->scheme, "nbd+unix")) {
+        s->is_unix = true;
+    } else {
+        ret = -EINVAL;
+        goto out;
+    }
+
+    p = uri->path ? uri->path : "/";
+    p += strspn(p, "/");
+    if (p[0]) {
+        s->export_name = g_strdup(p);
+    }
+
+    qp = query_params_parse(uri->query);
+    if (qp->n > 1 || (s->is_unix && !qp->n) || (!s->is_unix && qp->n)) {
+        ret = -EINVAL;
+        goto out;
+    }
+
+    if (s->is_unix) {
+        /* nbd+unix:///export?socket=path */
+        if (uri->server || uri->port || strcmp(qp->p[0].name, "socket")) {
+            ret = -EINVAL;
+            goto out;
+        }
+        s->host_spec = g_strdup(qp->p[0].value);
+    } else {
+        /* nbd[+tcp]://host:port/export */
+        if (!uri->server) {
+            ret = -EINVAL;
+            goto out;
+        }
+        if (!uri->port) {
+            uri->port = NBD_DEFAULT_PORT;
+        }
+        s->host_spec = g_strdup_printf("%s:%d", uri->server, uri->port);
+    }
+
+out:
+    if (qp) {
+        query_params_free(qp);
+    }
+    uri_free(uri);
+    return ret;
+}
+
+static int nbd_config(BDRVNBDState *s, const char *filename)
 {
     char *file;
     char *export_name;
@@ -79,6 +141,10 @@ static int nbd_config(BDRVNBDState *s, const char *filename, int flags)
     const char *unixpath;
     int err = -EINVAL;
 
+    if (strstr(filename, "://")) {
+        return nbd_parse_uri(s, filename);
+    }
+
     file = g_strdup(filename);
 
     export_name = strstr(file, EN_OPTSTR);
@@ -98,11 +164,10 @@ static int nbd_config(BDRVNBDState *s, const char *filename, int flags)
 
     /* are we a UNIX or TCP socket? */
     if (strstart(host_spec, "unix:", &unixpath)) {
-        if (unixpath[0] != '/') { /* We demand  an absolute path*/
-            goto out;
-        }
+        s->is_unix = true;
         s->host_spec = g_strdup(unixpath);
     } else {
+        s->is_unix = false;
         s->host_spec = g_strdup(host_spec);
     }
 
@@ -262,7 +327,7 @@ static int nbd_establish_connection(BlockDriverState *bs)
     off_t size;
     size_t blocksize;
 
-    if (s->host_spec[0] == '/') {
+    if (s->is_unix) {
         sock = unix_socket_outgoing(s->host_spec);
     } else {
         sock = tcp_socket_outgoing_spec(s->host_spec);
@@ -320,7 +385,7 @@ static int nbd_open(BlockDriverState *bs, const char* filename, int flags)
     qemu_co_mutex_init(&s->free_sema);
 
     /* Pop the config into our state object. Exit if invalid. */
-    result = nbd_config(s, filename, flags);
+    result = nbd_config(s, filename);
     if (result != 0) {
         return result;
     }
@@ -498,6 +563,33 @@ static int64_t nbd_getlength(BlockDriverState *bs)
 
 static BlockDriver bdrv_nbd = {
     .format_name         = "nbd",
+    .protocol_name       = "nbd",
+    .instance_size       = sizeof(BDRVNBDState),
+    .bdrv_file_open      = nbd_open,
+    .bdrv_co_readv       = nbd_co_readv,
+    .bdrv_co_writev      = nbd_co_writev,
+    .bdrv_close          = nbd_close,
+    .bdrv_co_flush_to_os = nbd_co_flush,
+    .bdrv_co_discard     = nbd_co_discard,
+    .bdrv_getlength      = nbd_getlength,
+};
+
+static BlockDriver bdrv_nbd_tcp = {
+    .format_name         = "nbd",
+    .protocol_name       = "nbd+tcp",
+    .instance_size       = sizeof(BDRVNBDState),
+    .bdrv_file_open      = nbd_open,
+    .bdrv_co_readv       = nbd_co_readv,
+    .bdrv_co_writev      = nbd_co_writev,
+    .bdrv_close          = nbd_close,
+    .bdrv_co_flush_to_os = nbd_co_flush,
+    .bdrv_co_discard     = nbd_co_discard,
+    .bdrv_getlength      = nbd_getlength,
+};
+
+static BlockDriver bdrv_nbd_unix = {
+    .format_name         = "nbd",
+    .protocol_name       = "nbd+unix",
     .instance_size       = sizeof(BDRVNBDState),
     .bdrv_file_open      = nbd_open,
     .bdrv_co_readv       = nbd_co_readv,
@@ -506,12 +598,13 @@ static BlockDriver bdrv_nbd = {
     .bdrv_co_flush_to_os = nbd_co_flush,
     .bdrv_co_discard     = nbd_co_discard,
     .bdrv_getlength      = nbd_getlength,
-    .protocol_name       = "nbd",
 };
 
 static void bdrv_nbd_init(void)
 {
     bdrv_register(&bdrv_nbd);
+    bdrv_register(&bdrv_nbd_tcp);
+    bdrv_register(&bdrv_nbd_unix);
 }
 
 block_init(bdrv_nbd_init);
diff --git a/block/parallels.c b/block/parallels.c
index d30f0ecf77..377375046f 100644
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -24,8 +24,8 @@
  * THE SOFTWARE.
  */
 #include "qemu-common.h"
-#include "block_int.h"
-#include "module.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
 
 /**************************************************************/
 
diff --git a/block/qcow.c b/block/qcow.c
index 7b5ab87d2d..4276610afd 100644
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -22,11 +22,11 @@
  * THE SOFTWARE.
  */
 #include "qemu-common.h"
-#include "block_int.h"
-#include "module.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
 #include <zlib.h>
-#include "aes.h"
-#include "migration.h"
+#include "block/aes.h"
+#include "migration/migration.h"
 
 /**************************************************************/
 /* QEMU COW block driver with compression and encryption support */
@@ -197,6 +197,15 @@ static int qcow_open(BlockDriverState *bs, int flags)
     return ret;
 }
 
+
+/* We have nothing to do for QCOW reopen, stubs just return
+ * success */
+static int qcow_reopen_prepare(BDRVReopenState *state,
+                               BlockReopenQueue *queue, Error **errp)
+{
+    return 0;
+}
+
 static int qcow_set_key(BlockDriverState *bs, const char *key)
 {
     BDRVQcowState *s = bs->opaque;
@@ -868,6 +877,7 @@ static BlockDriver bdrv_qcow = {
     .bdrv_probe		= qcow_probe,
     .bdrv_open		= qcow_open,
     .bdrv_close		= qcow_close,
+    .bdrv_reopen_prepare = qcow_reopen_prepare,
     .bdrv_create	= qcow_create,
 
     .bdrv_co_readv          = qcow_co_readv,
diff --git a/block/qcow2-cache.c b/block/qcow2-cache.c
index 2d4322a8dd..2f3114ecc2 100644
--- a/block/qcow2-cache.c
+++ b/block/qcow2-cache.c
@@ -22,7 +22,7 @@
  * THE SOFTWARE.
  */
 
-#include "block_int.h"
+#include "block/block_int.h"
 #include "qemu-common.h"
 #include "qcow2.h"
 #include "trace.h"
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index e179211c57..56fccf9487 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -25,7 +25,7 @@
 #include <zlib.h>
 
 #include "qemu-common.h"
-#include "block_int.h"
+#include "block/block_int.h"
 #include "block/qcow2.h"
 #include "trace.h"
 
@@ -615,57 +615,67 @@ uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
     return cluster_offset;
 }
 
-int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
+static int perform_cow(BlockDriverState *bs, QCowL2Meta *m, Qcow2COWRegion *r)
 {
     BDRVQcowState *s = bs->opaque;
-    int i, j = 0, l2_index, ret;
-    uint64_t *old_cluster, start_sect, *l2_table;
-    uint64_t cluster_offset = m->alloc_offset;
-    bool cow = false;
-
-    trace_qcow2_cluster_link_l2(qemu_coroutine_self(), m->nb_clusters);
+    int ret;
 
-    if (m->nb_clusters == 0)
+    if (r->nb_sectors == 0) {
         return 0;
+    }
 
-    old_cluster = g_malloc(m->nb_clusters * sizeof(uint64_t));
+    qemu_co_mutex_unlock(&s->lock);
+    ret = copy_sectors(bs, m->offset / BDRV_SECTOR_SIZE, m->alloc_offset,
+                       r->offset / BDRV_SECTOR_SIZE,
+                       r->offset / BDRV_SECTOR_SIZE + r->nb_sectors);
+    qemu_co_mutex_lock(&s->lock);
 
-    /* copy content of unmodified sectors */
-    start_sect = (m->offset & ~(s->cluster_size - 1)) >> 9;
-    if (m->n_start) {
-        cow = true;
-        qemu_co_mutex_unlock(&s->lock);
-        ret = copy_sectors(bs, start_sect, cluster_offset, 0, m->n_start);
-        qemu_co_mutex_lock(&s->lock);
-        if (ret < 0)
-            goto err;
-    }
-
-    if (m->nb_available & (s->cluster_sectors - 1)) {
-        cow = true;
-        qemu_co_mutex_unlock(&s->lock);
-        ret = copy_sectors(bs, start_sect, cluster_offset, m->nb_available,
-                           align_offset(m->nb_available, s->cluster_sectors));
-        qemu_co_mutex_lock(&s->lock);
-        if (ret < 0)
-            goto err;
+    if (ret < 0) {
+        return ret;
     }
 
     /*
-     * Update L2 table.
-     *
      * Before we update the L2 table to actually point to the new cluster, we
      * need to be sure that the refcounts have been increased and COW was
      * handled.
      */
-    if (cow) {
-        qcow2_cache_depends_on_flush(s->l2_table_cache);
+    qcow2_cache_depends_on_flush(s->l2_table_cache);
+
+    return 0;
+}
+
+int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
+{
+    BDRVQcowState *s = bs->opaque;
+    int i, j = 0, l2_index, ret;
+    uint64_t *old_cluster, *l2_table;
+    uint64_t cluster_offset = m->alloc_offset;
+
+    trace_qcow2_cluster_link_l2(qemu_coroutine_self(), m->nb_clusters);
+    assert(m->nb_clusters > 0);
+
+    old_cluster = g_malloc(m->nb_clusters * sizeof(uint64_t));
+
+    /* copy content of unmodified sectors */
+    ret = perform_cow(bs, m, &m->cow_start);
+    if (ret < 0) {
+        goto err;
+    }
+
+    ret = perform_cow(bs, m, &m->cow_end);
+    if (ret < 0) {
+        goto err;
     }
 
+    /* Update L2 table. */
+    if (s->compatible_features & QCOW2_COMPAT_LAZY_REFCOUNTS) {
+        qcow2_mark_dirty(bs);
+    }
     if (qcow2_need_accurate_refcounts(s)) {
         qcow2_cache_set_dependency(bs, s->l2_table_cache,
                                    s->refcount_block_cache);
     }
+
     ret = get_cluster_table(bs, m->offset, &l2_table, &l2_index);
     if (ret < 0) {
         goto err;
@@ -743,38 +753,16 @@ out:
 }
 
 /*
- * Allocates new clusters for the given guest_offset.
- *
- * At most *nb_clusters are allocated, and on return *nb_clusters is updated to
- * contain the number of clusters that have been allocated and are contiguous
- * in the image file.
- *
- * If *host_offset is non-zero, it specifies the offset in the image file at
- * which the new clusters must start. *nb_clusters can be 0 on return in this
- * case if the cluster at host_offset is already in use. If *host_offset is
- * zero, the clusters can be allocated anywhere in the image file.
- *
- * *host_offset is updated to contain the offset into the image file at which
- * the first allocated cluster starts.
- *
- * Return 0 on success and -errno in error cases. -EAGAIN means that the
- * function has been waiting for another request and the allocation must be
- * restarted, but the whole request should not be failed.
+ * Check if there already is an AIO write request in flight which allocates
+ * the same cluster. In this case we need to wait until the previous
+ * request has completed and updated the L2 table accordingly.
  */
-static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset,
-    uint64_t *host_offset, unsigned int *nb_clusters)
+static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset,
+    unsigned int *nb_clusters)
 {
     BDRVQcowState *s = bs->opaque;
     QCowL2Meta *old_alloc;
 
-    trace_qcow2_do_alloc_clusters_offset(qemu_coroutine_self(), guest_offset,
-                                         *host_offset, *nb_clusters);
-
-    /*
-     * Check if there already is an AIO write request in flight which allocates
-     * the same cluster. In this case we need to wait until the previous
-     * request has completed and updated the L2 table accordingly.
-     */
     QLIST_FOREACH(old_alloc, &s->cluster_allocs, next_in_flight) {
 
         uint64_t start = guest_offset >> s->cluster_bits;
@@ -807,6 +795,42 @@ static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset,
         abort();
     }
 
+    return 0;
+}
+
+/*
+ * Allocates new clusters for the given guest_offset.
+ *
+ * At most *nb_clusters are allocated, and on return *nb_clusters is updated to
+ * contain the number of clusters that have been allocated and are contiguous
+ * in the image file.
+ *
+ * If *host_offset is non-zero, it specifies the offset in the image file at
+ * which the new clusters must start. *nb_clusters can be 0 on return in this
+ * case if the cluster at host_offset is already in use. If *host_offset is
+ * zero, the clusters can be allocated anywhere in the image file.
+ *
+ * *host_offset is updated to contain the offset into the image file at which
+ * the first allocated cluster starts.
+ *
+ * Return 0 on success and -errno in error cases. -EAGAIN means that the
+ * function has been waiting for another request and the allocation must be
+ * restarted, but the whole request should not be failed.
+ */
+static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset,
+    uint64_t *host_offset, unsigned int *nb_clusters)
+{
+    BDRVQcowState *s = bs->opaque;
+    int ret;
+
+    trace_qcow2_do_alloc_clusters_offset(qemu_coroutine_self(), guest_offset,
+                                         *host_offset, *nb_clusters);
+
+    ret = handle_dependencies(bs, guest_offset, nb_clusters);
+    if (ret < 0) {
+        return ret;
+    }
+
     /* Allocate new clusters */
     trace_qcow2_cluster_alloc_phys(qemu_coroutine_self());
     if (*host_offset == 0) {
@@ -818,7 +842,7 @@ static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset,
         *host_offset = cluster_offset;
         return 0;
     } else {
-        int ret = qcow2_alloc_clusters_at(bs, *host_offset, *nb_clusters);
+        ret = qcow2_alloc_clusters_at(bs, *host_offset, *nb_clusters);
         if (ret < 0) {
             return ret;
         }
@@ -847,7 +871,7 @@ static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset,
  * Return 0 on success and -errno in error cases
  */
 int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
-    int n_start, int n_end, int *num, QCowL2Meta *m)
+    int n_start, int n_end, int *num, uint64_t *host_offset, QCowL2Meta **m)
 {
     BDRVQcowState *s = bs->opaque;
     int l2_index, ret, sectors;
@@ -919,12 +943,6 @@ again:
     }
 
     /* If there is something left to allocate, do that now */
-    *m = (QCowL2Meta) {
-        .cluster_offset     = cluster_offset,
-        .nb_clusters        = 0,
-    };
-    qemu_co_queue_init(&m->dependent_requests);
-
     if (nb_clusters > 0) {
         uint64_t alloc_offset;
         uint64_t alloc_cluster_offset;
@@ -957,22 +975,40 @@ again:
              *
              * avail_sectors: Number of sectors from the start of the first
              * newly allocated to the end of the last newly allocated cluster.
+             *
+             * nb_sectors: The number of sectors from the start of the first
+             * newly allocated cluster to the end of the area that the write
+             * request actually writes to (excluding COW at the end)
              */
             int requested_sectors = n_end - keep_clusters * s->cluster_sectors;
             int avail_sectors = nb_clusters
                                 << (s->cluster_bits - BDRV_SECTOR_BITS);
+            int alloc_n_start = keep_clusters == 0 ? n_start : 0;
+            int nb_sectors = MIN(requested_sectors, avail_sectors);
+
+            if (keep_clusters == 0) {
+                cluster_offset = alloc_cluster_offset;
+            }
+
+            *m = g_malloc0(sizeof(**m));
 
-            *m = (QCowL2Meta) {
-                .cluster_offset = keep_clusters == 0 ?
-                                  alloc_cluster_offset : cluster_offset,
+            **m = (QCowL2Meta) {
                 .alloc_offset   = alloc_cluster_offset,
-                .offset         = alloc_offset,
-                .n_start        = keep_clusters == 0 ? n_start : 0,
+                .offset         = alloc_offset & ~(s->cluster_size - 1),
                 .nb_clusters    = nb_clusters,
-                .nb_available   = MIN(requested_sectors, avail_sectors),
+                .nb_available   = nb_sectors,
+
+                .cow_start = {
+                    .offset     = 0,
+                    .nb_sectors = alloc_n_start,
+                },
+                .cow_end = {
+                    .offset     = nb_sectors * BDRV_SECTOR_SIZE,
+                    .nb_sectors = avail_sectors - nb_sectors,
+                },
             };
-            qemu_co_queue_init(&m->dependent_requests);
-            QLIST_INSERT_HEAD(&s->cluster_allocs, m, next_in_flight);
+            qemu_co_queue_init(&(*m)->dependent_requests);
+            QLIST_INSERT_HEAD(&s->cluster_allocs, *m, next_in_flight);
         }
     }
 
@@ -984,12 +1020,13 @@ again:
 
     assert(sectors > n_start);
     *num = sectors - n_start;
+    *host_offset = cluster_offset;
 
     return 0;
 
 fail:
-    if (m->nb_clusters > 0) {
-        QLIST_REMOVE(m, next_in_flight);
+    if (*m && (*m)->nb_clusters > 0) {
+        QLIST_REMOVE(*m, next_in_flight);
     }
     return ret;
 }
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 5e3f9153fb..6a95aa6c92 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -23,7 +23,7 @@
  */
 
 #include "qemu-common.h"
-#include "block_int.h"
+#include "block/block_int.h"
 #include "block/qcow2.h"
 
 static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size);
@@ -301,7 +301,8 @@ static int alloc_refcount_block(BlockDriverState *bs,
     uint64_t last_table_size;
     uint64_t blocks_clusters;
     do {
-        uint64_t table_clusters = size_to_clusters(s, table_size);
+        uint64_t table_clusters =
+            size_to_clusters(s, table_size * sizeof(uint64_t));
         blocks_clusters = 1 +
             ((table_clusters + refcount_block_clusters - 1)
             / refcount_block_clusters);
diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c
index 4e7c93b8b3..eb8fcd5549 100644
--- a/block/qcow2-snapshot.c
+++ b/block/qcow2-snapshot.c
@@ -23,7 +23,7 @@
  */
 
 #include "qemu-common.h"
-#include "block_int.h"
+#include "block/block_int.h"
 #include "block/qcow2.h"
 
 typedef struct QEMU_PACKED QCowSnapshotHeader {
diff --git a/block/qcow2.c b/block/qcow2.c
index 8f183f1465..d603f98a9c 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -22,13 +22,13 @@
  * THE SOFTWARE.
  */
 #include "qemu-common.h"
-#include "block_int.h"
-#include "module.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
 #include <zlib.h>
-#include "aes.h"
+#include "block/aes.h"
 #include "block/qcow2.h"
-#include "qemu-error.h"
-#include "qerror.h"
+#include "qemu/error-report.h"
+#include "qapi/qmp/qerror.h"
 #include "trace.h"
 
 /*
@@ -52,6 +52,7 @@ typedef struct {
     uint32_t magic;
     uint32_t len;
 } QCowExtension;
+
 #define  QCOW2_EXT_MAGIC_END 0
 #define  QCOW2_EXT_MAGIC_BACKING_FORMAT 0xE2792ACA
 #define  QCOW2_EXT_MAGIC_FEATURE_TABLE 0x6803f857
@@ -221,7 +222,7 @@ static void report_unsupported_feature(BlockDriverState *bs,
  * updated successfully.  Therefore it is not required to check the return
  * value of this function.
  */
-static int qcow2_mark_dirty(BlockDriverState *bs)
+int qcow2_mark_dirty(BlockDriverState *bs)
 {
     BDRVQcowState *s = bs->opaque;
     uint64_t val;
@@ -558,6 +559,14 @@ static int qcow2_set_key(BlockDriverState *bs, const char *key)
     return 0;
 }
 
+/* We have nothing to do for QCOW2 reopen, stubs just return
+ * success */
+static int qcow2_reopen_prepare(BDRVReopenState *state,
+                                BlockReopenQueue *queue, Error **errp)
+{
+    return 0;
+}
+
 static int coroutine_fn qcow2_co_is_allocated(BlockDriverState *bs,
         int64_t sector_num, int nb_sectors, int *pnum)
 {
@@ -736,21 +745,6 @@ fail:
     return ret;
 }
 
-static void run_dependent_requests(BDRVQcowState *s, QCowL2Meta *m)
-{
-    /* Take the request off the list of running requests */
-    if (m->nb_clusters != 0) {
-        QLIST_REMOVE(m, next_in_flight);
-    }
-
-    /* Restart all dependent requests */
-    if (!qemu_co_queue_empty(&m->dependent_requests)) {
-        qemu_co_mutex_unlock(&s->lock);
-        qemu_co_queue_restart_all(&m->dependent_requests);
-        qemu_co_mutex_lock(&s->lock);
-    }
-}
-
 static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
                            int64_t sector_num,
                            int remaining_sectors,
@@ -765,15 +759,11 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
     QEMUIOVector hd_qiov;
     uint64_t bytes_done = 0;
     uint8_t *cluster_data = NULL;
-    QCowL2Meta l2meta = {
-        .nb_clusters = 0,
-    };
+    QCowL2Meta *l2meta;
 
     trace_qcow2_writev_start_req(qemu_coroutine_self(), sector_num,
                                  remaining_sectors);
 
-    qemu_co_queue_init(&l2meta.dependent_requests);
-
     qemu_iovec_init(&hd_qiov, qiov->niov);
 
     s->cluster_cache_offset = -1; /* disable compressed cache */
@@ -782,6 +772,8 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
 
     while (remaining_sectors != 0) {
 
+        l2meta = NULL;
+
         trace_qcow2_writev_start_part(qemu_coroutine_self());
         index_in_cluster = sector_num & (s->cluster_sectors - 1);
         n_end = index_in_cluster + remaining_sectors;
@@ -791,17 +783,11 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
         }
 
         ret = qcow2_alloc_cluster_offset(bs, sector_num << 9,
-            index_in_cluster, n_end, &cur_nr_sectors, &l2meta);
+            index_in_cluster, n_end, &cur_nr_sectors, &cluster_offset, &l2meta);
         if (ret < 0) {
             goto fail;
         }
 
-        if (l2meta.nb_clusters > 0 &&
-            (s->compatible_features & QCOW2_COMPAT_LAZY_REFCOUNTS)) {
-            qcow2_mark_dirty(bs);
-        }
-
-        cluster_offset = l2meta.cluster_offset;
         assert((cluster_offset & 511) == 0);
 
         qemu_iovec_reset(&hd_qiov);
@@ -826,8 +812,8 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
                 cur_nr_sectors * 512);
         }
 
-        BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
         qemu_co_mutex_unlock(&s->lock);
+        BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
         trace_qcow2_writev_data(qemu_coroutine_self(),
                                 (cluster_offset >> 9) + index_in_cluster);
         ret = bdrv_co_writev(bs->file,
@@ -838,12 +824,24 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
             goto fail;
         }
 
-        ret = qcow2_alloc_cluster_link_l2(bs, &l2meta);
-        if (ret < 0) {
-            goto fail;
-        }
+        if (l2meta != NULL) {
+            ret = qcow2_alloc_cluster_link_l2(bs, l2meta);
+            if (ret < 0) {
+                goto fail;
+            }
+
+            /* Take the request off the list of running requests */
+            if (l2meta->nb_clusters != 0) {
+                QLIST_REMOVE(l2meta, next_in_flight);
+            }
 
-        run_dependent_requests(s, &l2meta);
+            qemu_co_mutex_unlock(&s->lock);
+            qemu_co_queue_restart_all(&l2meta->dependent_requests);
+            qemu_co_mutex_lock(&s->lock);
+
+            g_free(l2meta);
+            l2meta = NULL;
+        }
 
         remaining_sectors -= cur_nr_sectors;
         sector_num += cur_nr_sectors;
@@ -853,10 +851,16 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
     ret = 0;
 
 fail:
-    run_dependent_requests(s, &l2meta);
-
     qemu_co_mutex_unlock(&s->lock);
 
+    if (l2meta != NULL) {
+        if (l2meta->nb_clusters != 0) {
+            QLIST_REMOVE(l2meta, next_in_flight);
+        }
+        qemu_co_queue_restart_all(&l2meta->dependent_requests);
+        g_free(l2meta);
+    }
+
     qemu_iovec_destroy(&hd_qiov);
     qemu_vfree(cluster_data);
     trace_qcow2_writev_done_req(qemu_coroutine_self(), ret);
@@ -1087,6 +1091,7 @@ int qcow2_update_header(BlockDriverState *bs)
             goto fail;
         }
 
+        /* Using strncpy is ok here, since buf is not NUL-terminated. */
         strncpy(buf, bs->backing_file, buflen);
 
         header->backing_file_offset = cpu_to_be64(buf - ((char*) header));
@@ -1118,31 +1123,33 @@ static int preallocate(BlockDriverState *bs)
 {
     uint64_t nb_sectors;
     uint64_t offset;
+    uint64_t host_offset = 0;
     int num;
     int ret;
-    QCowL2Meta meta;
+    QCowL2Meta *meta;
 
     nb_sectors = bdrv_getlength(bs) >> 9;
     offset = 0;
-    qemu_co_queue_init(&meta.dependent_requests);
-    meta.cluster_offset = 0;
 
     while (nb_sectors) {
         num = MIN(nb_sectors, INT_MAX >> 9);
-        ret = qcow2_alloc_cluster_offset(bs, offset, 0, num, &num, &meta);
+        ret = qcow2_alloc_cluster_offset(bs, offset, 0, num, &num,
+                                         &host_offset, &meta);
         if (ret < 0) {
             return ret;
         }
 
-        ret = qcow2_alloc_cluster_link_l2(bs, &meta);
+        ret = qcow2_alloc_cluster_link_l2(bs, meta);
         if (ret < 0) {
-            qcow2_free_any_clusters(bs, meta.cluster_offset, meta.nb_clusters);
+            qcow2_free_any_clusters(bs, meta->alloc_offset, meta->nb_clusters);
             return ret;
         }
 
         /* There are no dependent requests, but we need to remove our request
          * from the list of in-flight requests */
-        run_dependent_requests(bs->opaque, &meta);
+        if (meta != NULL) {
+            QLIST_REMOVE(meta, next_in_flight);
+        }
 
         /* TODO Preallocate data if requested */
 
@@ -1155,10 +1162,10 @@ static int preallocate(BlockDriverState *bs)
      * all of the allocated clusters (otherwise we get failing reads after
      * EOF). Extend the image to the last allocated sector.
      */
-    if (meta.cluster_offset != 0) {
+    if (host_offset != 0) {
         uint8_t buf[512];
         memset(buf, 0, 512);
-        ret = bdrv_write(bs->file, (meta.cluster_offset >> 9) + num - 1, buf, 1);
+        ret = bdrv_write(bs->file, (host_offset >> 9) + num - 1, buf, 1);
         if (ret < 0) {
             return ret;
         }
@@ -1679,6 +1686,7 @@ static BlockDriver bdrv_qcow2 = {
     .bdrv_probe         = qcow2_probe,
     .bdrv_open          = qcow2_open,
     .bdrv_close         = qcow2_close,
+    .bdrv_reopen_prepare  = qcow2_reopen_prepare,
     .bdrv_create        = qcow2_create,
     .bdrv_co_is_allocated = qcow2_co_is_allocated,
     .bdrv_set_key       = qcow2_set_key,
diff --git a/block/qcow2.h b/block/qcow2.h
index b4eb65470e..718b52baca 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -25,8 +25,8 @@
 #ifndef BLOCK_QCOW2_H
 #define BLOCK_QCOW2_H
 
-#include "aes.h"
-#include "qemu-coroutine.h"
+#include "block/aes.h"
+#include "block/coroutine.h"
 
 //#define DEBUG_ALLOC
 //#define DEBUG_ALLOC2
@@ -196,17 +196,56 @@ typedef struct QCowCreateState {
 
 struct QCowAIOCB;
 
-/* XXX This could be private for qcow2-cluster.c */
+typedef struct Qcow2COWRegion {
+    /**
+     * Offset of the COW region in bytes from the start of the first cluster
+     * touched by the request.
+     */
+    uint64_t    offset;
+
+    /** Number of sectors to copy */
+    int         nb_sectors;
+} Qcow2COWRegion;
+
+/**
+ * Describes an in-flight (part of a) write request that writes to clusters
+ * that are not referenced in their L2 table yet.
+ */
 typedef struct QCowL2Meta
 {
+    /** Guest offset of the first newly allocated cluster */
     uint64_t offset;
-    uint64_t cluster_offset;
+
+    /** Host offset of the first newly allocated cluster */
     uint64_t alloc_offset;
-    int n_start;
+
+    /**
+     * Number of sectors from the start of the first allocated cluster to
+     * the end of the (possibly shortened) request
+     */
     int nb_available;
+
+    /** Number of newly allocated clusters */
     int nb_clusters;
+
+    /**
+     * Requests that overlap with this allocation and wait to be restarted
+     * when the allocating request has completed.
+     */
     CoQueue dependent_requests;
 
+    /**
+     * The COW Region between the start of the first allocated cluster and the
+     * area the guest actually writes to.
+     */
+    Qcow2COWRegion cow_start;
+
+    /**
+     * The COW Region between the area the guest actually writes to and the
+     * end of the last allocated cluster.
+     */
+    Qcow2COWRegion cow_end;
+
     QLIST_ENTRY(QCowL2Meta) next_in_flight;
 } QCowL2Meta;
 
@@ -264,6 +303,8 @@ static inline bool qcow2_need_accurate_refcounts(BDRVQcowState *s)
 /* qcow2.c functions */
 int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov,
                   int64_t sector_num, int nb_sectors);
+
+int qcow2_mark_dirty(BlockDriverState *bs);
 int qcow2_update_header(BlockDriverState *bs);
 
 /* qcow2-refcount.c functions */
@@ -297,7 +338,7 @@ void qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num,
 int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
     int *num, uint64_t *cluster_offset);
 int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
-    int n_start, int n_end, int *num, QCowL2Meta *m);
+    int n_start, int n_end, int *num, uint64_t *host_offset, QCowL2Meta **m);
 uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
                                          uint64_t offset,
                                          int compressed_size);
diff --git a/block/qed-table.c b/block/qed-table.c
index ce07b05549..76d2dcccf8 100644
--- a/block/qed-table.c
+++ b/block/qed-table.c
@@ -13,7 +13,7 @@
  */
 
 #include "trace.h"
-#include "qemu_socket.h" /* for EINPROGRESS on Windows */
+#include "qemu/sockets.h" /* for EINPROGRESS on Windows */
 #include "qed.h"
 
 typedef struct {
@@ -103,7 +103,6 @@ static void qed_write_table_cb(void *opaque, int ret)
 out:
     qemu_vfree(write_table_cb->table);
     gencb_complete(&write_table_cb->gencb, ret);
-    return;
 }
 
 /**
diff --git a/block/qed.c b/block/qed.c
index a02dbfd72d..cf85d8f2b4 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -12,11 +12,11 @@
  *
  */
 
-#include "qemu-timer.h"
+#include "qemu/timer.h"
 #include "trace.h"
 #include "qed.h"
-#include "qerror.h"
-#include "migration.h"
+#include "qapi/qmp/qerror.h"
+#include "migration/migration.h"
 
 static void qed_aio_cancel(BlockDriverAIOCB *blockacb)
 {
@@ -30,7 +30,7 @@ static void qed_aio_cancel(BlockDriverAIOCB *blockacb)
     }
 }
 
-static AIOPool qed_aio_pool = {
+static const AIOCBInfo qed_aiocb_info = {
     .aiocb_size         = sizeof(QEDAIOCB),
     .cancel             = qed_aio_cancel,
 };
@@ -505,6 +505,14 @@ out:
     return ret;
 }
 
+/* We have nothing to do for QED reopen, stubs just return
+ * success */
+static int bdrv_qed_reopen_prepare(BDRVReopenState *state,
+                                   BlockReopenQueue *queue, Error **errp)
+{
+    return 0;
+}
+
 static void bdrv_qed_close(BlockDriverState *bs)
 {
     BDRVQEDState *s = bs->opaque;
@@ -1303,7 +1311,7 @@ static BlockDriverAIOCB *qed_aio_setup(BlockDriverState *bs,
                                        BlockDriverCompletionFunc *cb,
                                        void *opaque, int flags)
 {
-    QEDAIOCB *acb = qemu_aio_get(&qed_aio_pool, bs, cb, opaque);
+    QEDAIOCB *acb = qemu_aio_get(&qed_aiocb_info, bs, cb, opaque);
 
     trace_qed_aio_setup(bs->opaque, acb, sector_num, nb_sectors,
                         opaque, flags);
@@ -1363,10 +1371,21 @@ static int coroutine_fn bdrv_qed_co_write_zeroes(BlockDriverState *bs,
                                                  int nb_sectors)
 {
     BlockDriverAIOCB *blockacb;
+    BDRVQEDState *s = bs->opaque;
     QEDWriteZeroesCB cb = { .done = false };
     QEMUIOVector qiov;
     struct iovec iov;
 
+    /* Refuse if there are untouched backing file sectors */
+    if (bs->backing_hd) {
+        if (qed_offset_into_cluster(s, sector_num * BDRV_SECTOR_SIZE) != 0) {
+            return -ENOTSUP;
+        }
+        if (qed_offset_into_cluster(s, nb_sectors * BDRV_SECTOR_SIZE) != 0) {
+            return -ENOTSUP;
+        }
+    }
+
     /* Zero writes start without an I/O buffer.  If a buffer becomes necessary
      * then it will be allocated during request processing.
      */
@@ -1553,6 +1572,7 @@ static BlockDriver bdrv_qed = {
     .bdrv_rebind              = bdrv_qed_rebind,
     .bdrv_open                = bdrv_qed_open,
     .bdrv_close               = bdrv_qed_close,
+    .bdrv_reopen_prepare      = bdrv_qed_reopen_prepare,
     .bdrv_create              = bdrv_qed_create,
     .bdrv_co_is_allocated     = bdrv_qed_co_is_allocated,
     .bdrv_make_empty          = bdrv_qed_make_empty,
diff --git a/block/qed.h b/block/qed.h
index a063bf70af..2b4ddedf31 100644
--- a/block/qed.h
+++ b/block/qed.h
@@ -15,7 +15,7 @@
 #ifndef BLOCK_QED_H
 #define BLOCK_QED_H
 
-#include "block_int.h"
+#include "block/block_int.h"
 
 /* The layout of a QED file is as follows:
  *
diff --git a/block/raw-posix-aio.h b/block/raw-aio.h
index ba118f616b..e77f361148 100644
--- a/block/raw-posix-aio.h
+++ b/block/raw-aio.h
@@ -1,5 +1,5 @@
 /*
- * QEMU Posix block I/O backend AIO support
+ * Declarations for AIO in the raw protocol
  *
  * Copyright IBM, Corp. 2008
  *
@@ -12,8 +12,8 @@
  * Contributions after 2012-01-13 are licensed under the terms of the
  * GNU GPL, version 2 or (at your option) any later version.
  */
-#ifndef QEMU_RAW_POSIX_AIO_H
-#define QEMU_RAW_POSIX_AIO_H
+#ifndef QEMU_RAW_AIO_H
+#define QEMU_RAW_AIO_H
 
 /* AIO request types */
 #define QEMU_AIO_READ         0x0001
@@ -27,19 +27,22 @@
 #define QEMU_AIO_MISALIGNED   0x1000
 
 
-/* posix-aio-compat.c - thread pool based implementation */
-int paio_init(void);
-BlockDriverAIOCB *paio_submit(BlockDriverState *bs, int fd,
-        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque, int type);
-BlockDriverAIOCB *paio_ioctl(BlockDriverState *bs, int fd,
-        unsigned long int req, void *buf,
-        BlockDriverCompletionFunc *cb, void *opaque);
-
 /* linux-aio.c - Linux native implementation */
+#ifdef CONFIG_LINUX_AIO
 void *laio_init(void);
 BlockDriverAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd,
         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
         BlockDriverCompletionFunc *cb, void *opaque, int type);
+#endif
+
+#ifdef _WIN32
+typedef struct QEMUWin32AIOState QEMUWin32AIOState;
+QEMUWin32AIOState *win32_aio_init(void);
+int win32_aio_attach(QEMUWin32AIOState *aio, HANDLE hfile);
+BlockDriverAIOCB *win32_aio_submit(BlockDriverState *bs,
+        QEMUWin32AIOState *aio, HANDLE hfile,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque, int type);
+#endif
 
-#endif /* QEMU_RAW_POSIX_AIO_H */
+#endif /* QEMU_RAW_AIO_H */
diff --git a/block/raw-posix.c b/block/raw-posix.c
index 6be20b1925..87d888ed01 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -22,12 +22,14 @@
  * THE SOFTWARE.
  */
 #include "qemu-common.h"
-#include "qemu-timer.h"
-#include "qemu-char.h"
-#include "qemu-log.h"
-#include "block_int.h"
-#include "module.h"
-#include "block/raw-posix-aio.h"
+#include "qemu/timer.h"
+#include "qemu/log.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
+#include "trace.h"
+#include "block/thread-pool.h"
+#include "qemu/iov.h"
+#include "raw-aio.h"
 
 #if defined(__APPLE__) && (__MACH__)
 #include <paths.h>
@@ -133,16 +135,36 @@ typedef struct BDRVRawState {
     int use_aio;
     void *aio_ctx;
 #endif
-    uint8_t *aligned_buf;
-    unsigned aligned_buf_size;
 #ifdef CONFIG_XFS
     bool is_xfs : 1;
 #endif
 } BDRVRawState;
 
+typedef struct BDRVRawReopenState {
+    int fd;
+    int open_flags;
+#ifdef CONFIG_LINUX_AIO
+    int use_aio;
+#endif
+} BDRVRawReopenState;
+
 static int fd_open(BlockDriverState *bs);
 static int64_t raw_getlength(BlockDriverState *bs);
 
+typedef struct RawPosixAIOData {
+    BlockDriverState *bs;
+    int aio_fildes;
+    union {
+        struct iovec *aio_iov;
+        void *aio_ioctl_buf;
+    };
+    int aio_niov;
+    size_t aio_nbytes;
+#define aio_ioctl_cmd   aio_nbytes /* for QEMU_AIO_IOCTL */
+    off_t aio_offset;
+    int aio_type;
+} RawPosixAIOData;
+
 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
 static int cdrom_reopen(BlockDriverState *bs);
 #endif
@@ -185,6 +207,57 @@ static int raw_normalize_devicepath(const char **filename)
 }
 #endif
 
+static void raw_parse_flags(int bdrv_flags, int *open_flags)
+{
+    assert(open_flags != NULL);
+
+    *open_flags |= O_BINARY;
+    *open_flags &= ~O_ACCMODE;
+    if (bdrv_flags & BDRV_O_RDWR) {
+        *open_flags |= O_RDWR;
+    } else {
+        *open_flags |= O_RDONLY;
+    }
+
+    /* Use O_DIRECT for no caching.  No open flag is set here for
+     * write-through or write-back caching. */
+    if ((bdrv_flags & BDRV_O_NOCACHE)) {
+        *open_flags |= O_DIRECT;
+    }
+}
+
+#ifdef CONFIG_LINUX_AIO
+static int raw_set_aio(void **aio_ctx, int *use_aio, int bdrv_flags)
+{
+    int ret = -1;
+    assert(aio_ctx != NULL);
+    assert(use_aio != NULL);
+    /*
+     * Currently Linux does AIO only for files opened with O_DIRECT
+     * specified so check NOCACHE flag too
+     */
+    if ((bdrv_flags & (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) ==
+                      (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) {
+
+        /* if non-NULL, laio_init() has already been run */
+        if (*aio_ctx == NULL) {
+            *aio_ctx = laio_init();
+            if (!*aio_ctx) {
+                goto error;
+            }
+        }
+        *use_aio = 1;
+    } else {
+        *use_aio = 0;
+    }
+
+    ret = 0;
+
+error:
+    return ret;
+}
+#endif
+
 static int raw_open_common(BlockDriverState *bs, const char *filename,
                            int bdrv_flags, int open_flags)
 {
@@ -196,20 +269,8 @@ static int raw_open_common(BlockDriverState *bs, const char *filename,
         return ret;
     }
 
-    s->open_flags = open_flags | O_BINARY;
-    s->open_flags &= ~O_ACCMODE;
-    if (bdrv_flags & BDRV_O_RDWR) {
-        s->open_flags |= O_RDWR;
-    } else {
-        s->open_flags |= O_RDONLY;
-    }
-
-    /* Use O_DSYNC for write-through caching, no flags for write-back caching,
-     * and O_DIRECT for no caching. */
-    if ((bdrv_flags & BDRV_O_NOCACHE))
-        s->open_flags |= O_DIRECT;
-    if (!(bdrv_flags & BDRV_O_CACHE_WB))
-        s->open_flags |= O_DSYNC;
+    s->open_flags = open_flags;
+    raw_parse_flags(bdrv_flags, &s->open_flags);
 
     s->fd = -1;
     fd = qemu_open(filename, s->open_flags, 0644);
@@ -220,45 +281,13 @@ static int raw_open_common(BlockDriverState *bs, const char *filename,
         return ret;
     }
     s->fd = fd;
-    s->aligned_buf = NULL;
-
-    if ((bdrv_flags & BDRV_O_NOCACHE)) {
-        /*
-         * Allocate a buffer for read/modify/write cycles.  Chose the size
-         * pessimistically as we don't know the block size yet.
-         */
-        s->aligned_buf_size = 32 * MAX_BLOCKSIZE;
-        s->aligned_buf = qemu_memalign(MAX_BLOCKSIZE, s->aligned_buf_size);
-        if (s->aligned_buf == NULL) {
-            goto out_close;
-        }
-    }
-
-    /* We're falling back to POSIX AIO in some cases so init always */
-    if (paio_init() < 0) {
-        goto out_free_buf;
-    }
-
-#ifdef CONFIG_LINUX_AIO
-    /*
-     * Currently Linux do AIO only for files opened with O_DIRECT
-     * specified so check NOCACHE flag too
-     */
-    if ((bdrv_flags & (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) ==
-                      (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) {
 
-        s->aio_ctx = laio_init();
-        if (!s->aio_ctx) {
-            goto out_free_buf;
-        }
-        s->use_aio = 1;
-    } else
-#endif
-    {
 #ifdef CONFIG_LINUX_AIO
-        s->use_aio = 0;
-#endif
+    if (raw_set_aio(&s->aio_ctx, &s->use_aio, bdrv_flags)) {
+        qemu_close(fd);
+        return -errno;
     }
+#endif
 
 #ifdef CONFIG_XFS
     if (platform_test_xfs_fd(s->fd)) {
@@ -267,12 +296,6 @@ static int raw_open_common(BlockDriverState *bs, const char *filename,
 #endif
 
     return 0;
-
-out_free_buf:
-    qemu_vfree(s->aligned_buf);
-out_close:
-    qemu_close(fd);
-    return -errno;
 }
 
 static int raw_open(BlockDriverState *bs, const char *filename, int flags)
@@ -283,6 +306,113 @@ static int raw_open(BlockDriverState *bs, const char *filename, int flags)
     return raw_open_common(bs, filename, flags, 0);
 }
 
+static int raw_reopen_prepare(BDRVReopenState *state,
+                              BlockReopenQueue *queue, Error **errp)
+{
+    BDRVRawState *s;
+    BDRVRawReopenState *raw_s;
+    int ret = 0;
+
+    assert(state != NULL);
+    assert(state->bs != NULL);
+
+    s = state->bs->opaque;
+
+    state->opaque = g_malloc0(sizeof(BDRVRawReopenState));
+    raw_s = state->opaque;
+
+#ifdef CONFIG_LINUX_AIO
+    raw_s->use_aio = s->use_aio;
+
+    /* we can use s->aio_ctx instead of a copy, because the use_aio flag is
+     * valid in the 'false' condition even if aio_ctx is set, and raw_set_aio()
+     * won't override aio_ctx if aio_ctx is non-NULL */
+    if (raw_set_aio(&s->aio_ctx, &raw_s->use_aio, state->flags)) {
+        return -1;
+    }
+#endif
+
+    if (s->type == FTYPE_FD || s->type == FTYPE_CD) {
+        raw_s->open_flags |= O_NONBLOCK;
+    }
+
+    raw_parse_flags(state->flags, &raw_s->open_flags);
+
+    raw_s->fd = -1;
+
+    int fcntl_flags = O_APPEND | O_ASYNC | O_NONBLOCK;
+#ifdef O_NOATIME
+    fcntl_flags |= O_NOATIME;
+#endif
+
+    if ((raw_s->open_flags & ~fcntl_flags) == (s->open_flags & ~fcntl_flags)) {
+        /* dup the original fd */
+        /* TODO: use qemu fcntl wrapper */
+#ifdef F_DUPFD_CLOEXEC
+        raw_s->fd = fcntl(s->fd, F_DUPFD_CLOEXEC, 0);
+#else
+        raw_s->fd = dup(s->fd);
+        if (raw_s->fd != -1) {
+            qemu_set_cloexec(raw_s->fd);
+        }
+#endif
+        if (raw_s->fd >= 0) {
+            ret = fcntl_setfl(raw_s->fd, raw_s->open_flags);
+            if (ret) {
+                qemu_close(raw_s->fd);
+                raw_s->fd = -1;
+            }
+        }
+    }
+
+    /* If we cannot use fcntl, or fcntl failed, fall back to qemu_open() */
+    if (raw_s->fd == -1) {
+        assert(!(raw_s->open_flags & O_CREAT));
+        raw_s->fd = qemu_open(state->bs->filename, raw_s->open_flags);
+        if (raw_s->fd == -1) {
+            ret = -1;
+        }
+    }
+    return ret;
+}
+
+
+static void raw_reopen_commit(BDRVReopenState *state)
+{
+    BDRVRawReopenState *raw_s = state->opaque;
+    BDRVRawState *s = state->bs->opaque;
+
+    s->open_flags = raw_s->open_flags;
+
+    qemu_close(s->fd);
+    s->fd = raw_s->fd;
+#ifdef CONFIG_LINUX_AIO
+    s->use_aio = raw_s->use_aio;
+#endif
+
+    g_free(state->opaque);
+    state->opaque = NULL;
+}
+
+
+static void raw_reopen_abort(BDRVReopenState *state)
+{
+    BDRVRawReopenState *raw_s = state->opaque;
+
+     /* nothing to do if NULL, we didn't get far enough */
+    if (raw_s == NULL) {
+        return;
+    }
+
+    if (raw_s->fd >= 0) {
+        qemu_close(raw_s->fd);
+        raw_s->fd = -1;
+    }
+    g_free(state->opaque);
+    state->opaque = NULL;
+}
+
+
 /* XXX: use host sector size if necessary with:
 #ifdef DIOCGSECTORSIZE
         {
@@ -316,6 +446,267 @@ static int qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
     return 1;
 }
 
+static ssize_t handle_aiocb_ioctl(RawPosixAIOData *aiocb)
+{
+    int ret;
+
+    ret = ioctl(aiocb->aio_fildes, aiocb->aio_ioctl_cmd, aiocb->aio_ioctl_buf);
+    if (ret == -1) {
+        return -errno;
+    }
+
+    /*
+     * This looks weird, but the aio code only considers a request
+     * successful if it has written the full number of bytes.
+     *
+     * Now we overload aio_nbytes as aio_ioctl_cmd for the ioctl command,
+     * so in fact we return the ioctl command here to make posix_aio_read()
+     * happy..
+     */
+    return aiocb->aio_nbytes;
+}
+
+static ssize_t handle_aiocb_flush(RawPosixAIOData *aiocb)
+{
+    int ret;
+
+    ret = qemu_fdatasync(aiocb->aio_fildes);
+    if (ret == -1) {
+        return -errno;
+    }
+    return 0;
+}
+
+#ifdef CONFIG_PREADV
+
+static bool preadv_present = true;
+
+static ssize_t
+qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset)
+{
+    return preadv(fd, iov, nr_iov, offset);
+}
+
+static ssize_t
+qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
+{
+    return pwritev(fd, iov, nr_iov, offset);
+}
+
+#else
+
+static bool preadv_present = false;
+
+static ssize_t
+qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset)
+{
+    return -ENOSYS;
+}
+
+static ssize_t
+qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
+{
+    return -ENOSYS;
+}
+
+#endif
+
+static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
+{
+    ssize_t len;
+
+    do {
+        if (aiocb->aio_type & QEMU_AIO_WRITE)
+            len = qemu_pwritev(aiocb->aio_fildes,
+                               aiocb->aio_iov,
+                               aiocb->aio_niov,
+                               aiocb->aio_offset);
+         else
+            len = qemu_preadv(aiocb->aio_fildes,
+                              aiocb->aio_iov,
+                              aiocb->aio_niov,
+                              aiocb->aio_offset);
+    } while (len == -1 && errno == EINTR);
+
+    if (len == -1) {
+        return -errno;
+    }
+    return len;
+}
+
+/*
+ * Reads/writes the data to/from a given linear buffer.
+ *
+ * Returns the number of bytes handled or -errno in case of an error. Short
+ * reads are only returned if the end of the file is reached.
+ */
+static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
+{
+    ssize_t offset = 0;
+    ssize_t len;
+
+    while (offset < aiocb->aio_nbytes) {
+        if (aiocb->aio_type & QEMU_AIO_WRITE) {
+            len = pwrite(aiocb->aio_fildes,
+                         (const char *)buf + offset,
+                         aiocb->aio_nbytes - offset,
+                         aiocb->aio_offset + offset);
+        } else {
+            len = pread(aiocb->aio_fildes,
+                        buf + offset,
+                        aiocb->aio_nbytes - offset,
+                        aiocb->aio_offset + offset);
+        }
+        if (len == -1 && errno == EINTR) {
+            continue;
+        } else if (len == -1) {
+            offset = -errno;
+            break;
+        } else if (len == 0) {
+            break;
+        }
+        offset += len;
+    }
+
+    return offset;
+}
+
+static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb)
+{
+    ssize_t nbytes;
+    char *buf;
+
+    if (!(aiocb->aio_type & QEMU_AIO_MISALIGNED)) {
+        /*
+         * If there is just a single buffer, and it is properly aligned
+         * we can just use plain pread/pwrite without any problems.
+         */
+        if (aiocb->aio_niov == 1) {
+             return handle_aiocb_rw_linear(aiocb, aiocb->aio_iov->iov_base);
+        }
+        /*
+         * We have more than one iovec, and all are properly aligned.
+         *
+         * Try preadv/pwritev first and fall back to linearizing the
+         * buffer if it's not supported.
+         */
+        if (preadv_present) {
+            nbytes = handle_aiocb_rw_vector(aiocb);
+            if (nbytes == aiocb->aio_nbytes ||
+                (nbytes < 0 && nbytes != -ENOSYS)) {
+                return nbytes;
+            }
+            preadv_present = false;
+        }
+
+        /*
+         * XXX(hch): short read/write.  no easy way to handle the remainder
+         * using these interfaces.  For now retry using plain
+         * pread/pwrite?
+         */
+    }
+
+    /*
+     * Ok, we have to do it the hard way, copy all segments into
+     * a single aligned buffer.
+     */
+    buf = qemu_blockalign(aiocb->bs, aiocb->aio_nbytes);
+    if (aiocb->aio_type & QEMU_AIO_WRITE) {
+        char *p = buf;
+        int i;
+
+        for (i = 0; i < aiocb->aio_niov; ++i) {
+            memcpy(p, aiocb->aio_iov[i].iov_base, aiocb->aio_iov[i].iov_len);
+            p += aiocb->aio_iov[i].iov_len;
+        }
+    }
+
+    nbytes = handle_aiocb_rw_linear(aiocb, buf);
+    if (!(aiocb->aio_type & QEMU_AIO_WRITE)) {
+        char *p = buf;
+        size_t count = aiocb->aio_nbytes, copy;
+        int i;
+
+        for (i = 0; i < aiocb->aio_niov && count; ++i) {
+            copy = count;
+            if (copy > aiocb->aio_iov[i].iov_len) {
+                copy = aiocb->aio_iov[i].iov_len;
+            }
+            memcpy(aiocb->aio_iov[i].iov_base, p, copy);
+            p     += copy;
+            count -= copy;
+        }
+    }
+    qemu_vfree(buf);
+
+    return nbytes;
+}
+
+static int aio_worker(void *arg)
+{
+    RawPosixAIOData *aiocb = arg;
+    ssize_t ret = 0;
+
+    switch (aiocb->aio_type & QEMU_AIO_TYPE_MASK) {
+    case QEMU_AIO_READ:
+        ret = handle_aiocb_rw(aiocb);
+        if (ret >= 0 && ret < aiocb->aio_nbytes && aiocb->bs->growable) {
+            iov_memset(aiocb->aio_iov, aiocb->aio_niov, ret,
+                      0, aiocb->aio_nbytes - ret);
+
+            ret = aiocb->aio_nbytes;
+        }
+        if (ret == aiocb->aio_nbytes) {
+            ret = 0;
+        } else if (ret >= 0 && ret < aiocb->aio_nbytes) {
+            ret = -EINVAL;
+        }
+        break;
+    case QEMU_AIO_WRITE:
+        ret = handle_aiocb_rw(aiocb);
+        if (ret == aiocb->aio_nbytes) {
+            ret = 0;
+        } else if (ret >= 0 && ret < aiocb->aio_nbytes) {
+            ret = -EINVAL;
+        }
+        break;
+    case QEMU_AIO_FLUSH:
+        ret = handle_aiocb_flush(aiocb);
+        break;
+    case QEMU_AIO_IOCTL:
+        ret = handle_aiocb_ioctl(aiocb);
+        break;
+    default:
+        fprintf(stderr, "invalid aio request (0x%x)\n", aiocb->aio_type);
+        ret = -EINVAL;
+        break;
+    }
+
+    g_slice_free(RawPosixAIOData, aiocb);
+    return ret;
+}
+
+static BlockDriverAIOCB *paio_submit(BlockDriverState *bs, int fd,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque, int type)
+{
+    RawPosixAIOData *acb = g_slice_new(RawPosixAIOData);
+
+    acb->bs = bs;
+    acb->aio_type = type;
+    acb->aio_fildes = fd;
+
+    if (qiov) {
+        acb->aio_iov = qiov->iov;
+        acb->aio_niov = qiov->niov;
+    }
+    acb->aio_nbytes = nb_sectors * 512;
+    acb->aio_offset = sector_num * 512;
+
+    trace_paio_submit(acb, opaque, sector_num, nb_sectors, type);
+    return thread_pool_submit_aio(aio_worker, acb, cb, opaque);
+}
+
 static BlockDriverAIOCB *raw_aio_submit(BlockDriverState *bs,
         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
         BlockDriverCompletionFunc *cb, void *opaque, int type)
@@ -330,7 +721,7 @@ static BlockDriverAIOCB *raw_aio_submit(BlockDriverState *bs,
      * boundary.  Check if this is the case or tell the low-level
      * driver that it needs to copy the buffer.
      */
-    if (s->aligned_buf) {
+    if ((bs->open_flags & BDRV_O_NOCACHE)) {
         if (!qiov_is_aligned(bs, qiov)) {
             type |= QEMU_AIO_MISALIGNED;
 #ifdef CONFIG_LINUX_AIO
@@ -378,8 +769,6 @@ static void raw_close(BlockDriverState *bs)
     if (s->fd >= 0) {
         qemu_close(s->fd);
         s->fd = -1;
-        if (s->aligned_buf != NULL)
-            qemu_vfree(s->aligned_buf);
     }
 }
 
@@ -735,6 +1124,9 @@ static BlockDriver bdrv_file = {
     .instance_size = sizeof(BDRVRawState),
     .bdrv_probe = NULL, /* no probe for protocols */
     .bdrv_file_open = raw_open,
+    .bdrv_reopen_prepare = raw_reopen_prepare,
+    .bdrv_reopen_commit = raw_reopen_commit,
+    .bdrv_reopen_abort = raw_reopen_abort,
     .bdrv_close = raw_close,
     .bdrv_create = raw_create,
     .bdrv_co_discard = raw_co_discard,
@@ -937,10 +1329,19 @@ static BlockDriverAIOCB *hdev_aio_ioctl(BlockDriverState *bs,
         BlockDriverCompletionFunc *cb, void *opaque)
 {
     BDRVRawState *s = bs->opaque;
+    RawPosixAIOData *acb;
 
     if (fd_open(bs) < 0)
         return NULL;
-    return paio_ioctl(bs, s->fd, req, buf, cb, opaque);
+
+    acb = g_slice_new(RawPosixAIOData);
+    acb->bs = bs;
+    acb->aio_type = QEMU_AIO_IOCTL;
+    acb->aio_fildes = s->fd;
+    acb->aio_offset = 0;
+    acb->aio_ioctl_buf = buf;
+    acb->aio_ioctl_cmd = req;
+    return thread_pool_submit_aio(aio_worker, acb, cb, opaque);
 }
 
 #elif defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
@@ -1004,6 +1405,9 @@ static BlockDriver bdrv_host_device = {
     .bdrv_probe_device  = hdev_probe_device,
     .bdrv_file_open     = hdev_open,
     .bdrv_close         = raw_close,
+    .bdrv_reopen_prepare = raw_reopen_prepare,
+    .bdrv_reopen_commit  = raw_reopen_commit,
+    .bdrv_reopen_abort   = raw_reopen_abort,
     .bdrv_create        = hdev_create,
     .create_options     = raw_create_options,
     .bdrv_has_zero_init = hdev_has_zero_init,
@@ -1125,6 +1529,9 @@ static BlockDriver bdrv_host_floppy = {
     .bdrv_probe_device	= floppy_probe_device,
     .bdrv_file_open     = floppy_open,
     .bdrv_close         = raw_close,
+    .bdrv_reopen_prepare = raw_reopen_prepare,
+    .bdrv_reopen_commit  = raw_reopen_commit,
+    .bdrv_reopen_abort   = raw_reopen_abort,
     .bdrv_create        = hdev_create,
     .create_options     = raw_create_options,
     .bdrv_has_zero_init = hdev_has_zero_init,
@@ -1224,6 +1631,9 @@ static BlockDriver bdrv_host_cdrom = {
     .bdrv_probe_device	= cdrom_probe_device,
     .bdrv_file_open     = cdrom_open,
     .bdrv_close         = raw_close,
+    .bdrv_reopen_prepare = raw_reopen_prepare,
+    .bdrv_reopen_commit  = raw_reopen_commit,
+    .bdrv_reopen_abort   = raw_reopen_abort,
     .bdrv_create        = hdev_create,
     .create_options     = raw_create_options,
     .bdrv_has_zero_init = hdev_has_zero_init,
@@ -1343,6 +1753,9 @@ static BlockDriver bdrv_host_cdrom = {
     .bdrv_probe_device	= cdrom_probe_device,
     .bdrv_file_open     = cdrom_open,
     .bdrv_close         = raw_close,
+    .bdrv_reopen_prepare = raw_reopen_prepare,
+    .bdrv_reopen_commit  = raw_reopen_commit,
+    .bdrv_reopen_abort   = raw_reopen_abort,
     .bdrv_create        = hdev_create,
     .create_options     = raw_create_options,
     .bdrv_has_zero_init = hdev_has_zero_init,
@@ -1363,6 +1776,40 @@ static BlockDriver bdrv_host_cdrom = {
 };
 #endif /* __FreeBSD__ */
 
+#ifdef CONFIG_LINUX_AIO
+/**
+ * Return the file descriptor for Linux AIO.
+ *
+ * Returns -ENOMEDIUM if bs has no driver, and -ENOTSUP if the node is not
+ * served by raw_aio_readv or has use_aio unset.  This is a layering
+ * violation: it lets the caller hijack the fd to do I/O outside the block
+ * layer, and should go once the block layer works outside the global mutex.
+ */
+int raw_get_aio_fd(BlockDriverState *bs)
+{
+    BDRVRawState *s;
+
+    if (!bs->drv) {
+        return -ENOMEDIUM;
+    }
+
+    if (bs->drv == bdrv_find_format("raw")) {
+        bs = bs->file; /* step from the "raw" format node to bs->file */
+    }
+
+    /* raw-posix has several protocols so just check for raw_aio_readv */
+    if (bs->drv->bdrv_aio_readv != raw_aio_readv) {
+        return -ENOTSUP;
+    }
+
+    s = bs->opaque;
+    if (!s->use_aio) {
+        return -ENOTSUP;
+    }
+    return s->fd;
+}
+#endif /* CONFIG_LINUX_AIO */
+
 static void bdrv_file_init(void)
 {
     /*
diff --git a/block/raw-win32.c b/block/raw-win32.c
index c56bf83375..b89ac19ffa 100644
--- a/block/raw-win32.c
+++ b/block/raw-win32.c
@@ -22,9 +22,13 @@
  * THE SOFTWARE.
  */
 #include "qemu-common.h"
-#include "qemu-timer.h"
-#include "block_int.h"
-#include "module.h"
+#include "qemu/timer.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
+#include "raw-aio.h"
+#include "trace.h"
+#include "block/thread-pool.h"
+#include "qemu/iov.h"
 #include <windows.h>
 #include <winioctl.h>
 
@@ -32,12 +36,130 @@
 #define FTYPE_CD     1
 #define FTYPE_HARDDISK 2
 
+static QEMUWin32AIOState *aio;
+
+typedef struct RawWin32AIOData {
+    BlockDriverState *bs;
+    HANDLE hfile;
+    struct iovec *aio_iov;      /* borrowed from the caller's QEMUIOVector */
+    int aio_niov;               /* number of elements in aio_iov */
+    size_t aio_nbytes;          /* request length in bytes (sectors * 512) */
+    off64_t aio_offset;         /* starting file offset in bytes */
+    int aio_type;               /* QEMU_AIO_* request type, see aio_worker() */
+} RawWin32AIOData;
+
 typedef struct BDRVRawState {
     HANDLE hfile;
     int type;
     char drive_path[16]; /* format: "d:\" */
+    QEMUWin32AIOState *aio;
 } BDRVRawState;
 
+/*
+ * Reads or writes each iovec element of the request at its file offset.
+ *
+ * Returns the number of bytes transferred.  The loop stops at the first
+ * failed or short transfer; a failure is never reported as -errno.
+ */
+static size_t handle_aiocb_rw(RawWin32AIOData *aiocb)
+{
+    size_t offset = 0;
+    int i;
+
+    for (i = 0; i < aiocb->aio_niov; i++) {
+        OVERLAPPED ov;
+        DWORD ret, ret_count, len;
+
+        memset(&ov, 0, sizeof(ov));
+        ov.Offset = (aiocb->aio_offset + offset);
+        ov.OffsetHigh = (aiocb->aio_offset + offset) >> 32;
+        len = aiocb->aio_iov[i].iov_len;
+        if (aiocb->aio_type & QEMU_AIO_WRITE) {
+            ret = WriteFile(aiocb->hfile, aiocb->aio_iov[i].iov_base,
+                            len, &ret_count, &ov);
+        } else {
+            ret = ReadFile(aiocb->hfile, aiocb->aio_iov[i].iov_base,
+                           len, &ret_count, &ov);
+        }
+        if (!ret) {
+            ret_count = 0;
+        }
+        offset += ret_count; /* include the bytes of a short transfer */
+        if (ret_count != len) {
+            break;
+        }
+    }
+
+    return offset;
+}
+
+/*
+ * Thread-pool worker: executes one request described by a RawWin32AIOData.
+ *
+ * Returns 0 on success, -EINVAL for a short transfer or an unknown request
+ * type, and -EIO when FlushFileBuffers() fails.  The worker owns aiocb and
+ * frees it on every path, so no case may return early.
+ */
+static int aio_worker(void *arg)
+{
+    RawWin32AIOData *aiocb = arg;
+    ssize_t ret = 0;
+    size_t count;
+
+    switch (aiocb->aio_type & QEMU_AIO_TYPE_MASK) {
+    case QEMU_AIO_READ:
+        count = handle_aiocb_rw(aiocb);
+        if (count < aiocb->aio_nbytes && aiocb->bs->growable) {
+            /* A short read means that we have reached EOF. Pad the buffer
+             * with zeros for bytes after EOF. */
+            iov_memset(aiocb->aio_iov, aiocb->aio_niov, count,
+                      0, aiocb->aio_nbytes - count);
+
+            count = aiocb->aio_nbytes;
+        }
+        ret = (count == aiocb->aio_nbytes) ? 0 : -EINVAL;
+        break;
+    case QEMU_AIO_WRITE:
+        count = handle_aiocb_rw(aiocb);
+        /* Report through ret: count is never returned to the caller. */
+        ret = (count == aiocb->aio_nbytes) ? 0 : -EINVAL;
+        break;
+    case QEMU_AIO_FLUSH:
+        if (!FlushFileBuffers(aiocb->hfile)) {
+            ret = -EIO; /* fall out of the switch so aiocb is freed */
+        }
+        break;
+    default:
+        fprintf(stderr, "invalid aio request (0x%x)\n", aiocb->aio_type);
+        ret = -EINVAL;
+        break;
+    }
+
+    g_slice_free(RawWin32AIOData, aiocb);
+    return ret;
+}
+
+static BlockDriverAIOCB *paio_submit(BlockDriverState *bs, HANDLE hfile,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque, int type)
+{
+    RawWin32AIOData *acb = g_slice_new(RawWin32AIOData);
+
+    acb->bs = bs;
+    acb->hfile = hfile;
+    acb->aio_type = type;
+    /* raw_aio_flush() passes no qiov; the iov fields then stay unset. */
+    if (qiov) {
+        acb->aio_iov = qiov->iov;
+        acb->aio_niov = qiov->niov;
+    }
+    acb->aio_nbytes = nb_sectors * 512; /* sectors -> bytes */
+    acb->aio_offset = sector_num * 512;
+    /* Queue the request for aio_worker(), which frees acb when it is done. */
+    trace_paio_submit(acb, opaque, sector_num, nb_sectors, type);
+    return thread_pool_submit_aio(aio_worker, acb, cb, opaque);
+}
+
 int qemu_ftruncate64(int fd, int64_t length)
 {
     LARGE_INTEGER li;
@@ -77,6 +199,26 @@ static int set_sparse(int fd)
 				 NULL, 0, NULL, 0, &returned, NULL);
 }
 
+/* Translate BDRV_O_* flags into CreateFile() access and attribute flags. */
+static void raw_parse_flags(int flags, int *access_flags, DWORD *overlapped)
+{
+    assert(access_flags != NULL);
+    assert(overlapped != NULL);
+    if (flags & BDRV_O_RDWR) {
+        *access_flags = GENERIC_READ | GENERIC_WRITE;
+    } else {
+        *access_flags = GENERIC_READ;
+    }
+    /* Despite its name, *overlapped carries all file attribute/flag bits. */
+    *overlapped = FILE_ATTRIBUTE_NORMAL;
+    if (flags & BDRV_O_NATIVE_AIO) {
+        *overlapped |= FILE_FLAG_OVERLAPPED;
+    }
+    if (flags & BDRV_O_NOCACHE) {
+        *overlapped |= FILE_FLAG_NO_BUFFERING;
+    }
+}
+
 static int raw_open(BlockDriverState *bs, const char *filename, int flags)
 {
     BDRVRawState *s = bs->opaque;
@@ -85,17 +227,15 @@ static int raw_open(BlockDriverState *bs, const char *filename, int flags)
 
     s->type = FTYPE_FILE;
 
-    if (flags & BDRV_O_RDWR) {
-        access_flags = GENERIC_READ | GENERIC_WRITE;
-    } else {
-        access_flags = GENERIC_READ;
+    raw_parse_flags(flags, &access_flags, &overlapped);
+    
+    if ((flags & BDRV_O_NATIVE_AIO) && aio == NULL) {
+        aio = win32_aio_init();
+        if (aio == NULL) {
+            return -EINVAL;
+        }
     }
 
-    overlapped = FILE_ATTRIBUTE_NORMAL;
-    if (flags & BDRV_O_NOCACHE)
-        overlapped |= FILE_FLAG_NO_BUFFERING;
-    if (!(flags & BDRV_O_CACHE_WB))
-        overlapped |= FILE_FLAG_WRITE_THROUGH;
     s->hfile = CreateFile(filename, access_flags,
                           FILE_SHARE_READ, NULL,
                           OPEN_EXISTING, overlapped, NULL);
@@ -104,64 +244,53 @@ static int raw_open(BlockDriverState *bs, const char *filename, int flags)
 
         if (err == ERROR_ACCESS_DENIED)
             return -EACCES;
-        return -1;
+        return -EINVAL;
+    }
+
+    if (flags & BDRV_O_NATIVE_AIO) {
+        int ret = win32_aio_attach(aio, s->hfile);
+        if (ret < 0) {
+            CloseHandle(s->hfile);
+            return ret;
+        }
+        s->aio = aio;
     }
     return 0;
 }
 
-static int raw_read(BlockDriverState *bs, int64_t sector_num,
-                    uint8_t *buf, int nb_sectors)
+static BlockDriverAIOCB *raw_aio_readv(BlockDriverState *bs,
+                         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+                         BlockDriverCompletionFunc *cb, void *opaque)
 {
     BDRVRawState *s = bs->opaque;
-    OVERLAPPED ov;
-    DWORD ret_count;
-    int ret;
-    int64_t offset = sector_num * 512;
-    int count = nb_sectors * 512;
-
-    memset(&ov, 0, sizeof(ov));
-    ov.Offset = offset;
-    ov.OffsetHigh = offset >> 32;
-    ret = ReadFile(s->hfile, buf, count, &ret_count, &ov);
-    if (!ret)
-        return ret_count;
-    if (ret_count == count)
-        ret_count = 0;
-    return ret_count;
+    if (s->aio) {
+        return win32_aio_submit(bs, s->aio, s->hfile, sector_num, qiov,
+                                nb_sectors, cb, opaque, QEMU_AIO_READ); 
+    } else {
+        return paio_submit(bs, s->hfile, sector_num, qiov, nb_sectors,
+                           cb, opaque, QEMU_AIO_READ);
+    }
 }
 
-static int raw_write(BlockDriverState *bs, int64_t sector_num,
-                     const uint8_t *buf, int nb_sectors)
+static BlockDriverAIOCB *raw_aio_writev(BlockDriverState *bs,
+                          int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+                          BlockDriverCompletionFunc *cb, void *opaque)
 {
     BDRVRawState *s = bs->opaque;
-    OVERLAPPED ov;
-    DWORD ret_count;
-    int ret;
-    int64_t offset = sector_num * 512;
-    int count = nb_sectors * 512;
-
-    memset(&ov, 0, sizeof(ov));
-    ov.Offset = offset;
-    ov.OffsetHigh = offset >> 32;
-    ret = WriteFile(s->hfile, buf, count, &ret_count, &ov);
-    if (!ret)
-        return ret_count;
-    if (ret_count == count)
-        ret_count = 0;
-    return ret_count;
+    if (s->aio) {
+        return win32_aio_submit(bs, s->aio, s->hfile, sector_num, qiov,
+                                nb_sectors, cb, opaque, QEMU_AIO_WRITE); 
+    } else {
+        return paio_submit(bs, s->hfile, sector_num, qiov, nb_sectors,
+                           cb, opaque, QEMU_AIO_WRITE);
+    }
 }
 
-static int raw_flush(BlockDriverState *bs)
+static BlockDriverAIOCB *raw_aio_flush(BlockDriverState *bs,
+                         BlockDriverCompletionFunc *cb, void *opaque)
 {
     BDRVRawState *s = bs->opaque;
-    int ret;
-
-    ret = FlushFileBuffers(s->hfile);
-    if (ret == 0) {
-        return -EIO;
-    }
-
-    return 0;
+    return paio_submit(bs, s->hfile, 0, NULL, 0, cb, opaque, QEMU_AIO_FLUSH);
 }
 
 static void raw_close(BlockDriverState *bs)
@@ -174,13 +303,24 @@ static int raw_truncate(BlockDriverState *bs, int64_t offset)
 {
     BDRVRawState *s = bs->opaque;
     LONG low, high;
+    DWORD dwPtrLow;
 
     low = offset;
     high = offset >> 32;
-    if (!SetFilePointer(s->hfile, low, &high, FILE_BEGIN))
-	return -EIO;
-    if (!SetEndOfFile(s->hfile))
+
+    /*
+     * An error has occurred if the return value is INVALID_SET_FILE_POINTER
+     * and GetLastError doesn't return NO_ERROR.
+     */
+    dwPtrLow = SetFilePointer(s->hfile, low, &high, FILE_BEGIN);
+    if (dwPtrLow == INVALID_SET_FILE_POINTER && GetLastError() != NO_ERROR) {
+        fprintf(stderr, "SetFilePointer error: %lu\n", GetLastError());
+        return -EIO;
+    }
+    if (SetEndOfFile(s->hfile) == 0) {
+        fprintf(stderr, "SetEndOfFile error: %lu\n", GetLastError());
         return -EIO;
+    }
     return 0;
 }
 
@@ -282,9 +422,9 @@ static BlockDriver bdrv_file = {
     .bdrv_close		= raw_close,
     .bdrv_create	= raw_create,
 
-    .bdrv_read              = raw_read,
-    .bdrv_write             = raw_write,
-    .bdrv_co_flush_to_disk  = raw_flush,
+    .bdrv_aio_readv     = raw_aio_readv,
+    .bdrv_aio_writev    = raw_aio_writev,
+    .bdrv_aio_flush     = raw_aio_flush,
 
     .bdrv_truncate	= raw_truncate,
     .bdrv_getlength	= raw_getlength,
@@ -374,18 +514,10 @@ static int hdev_open(BlockDriverState *bs, const char *filename, int flags)
     }
     s->type = find_device_type(bs, filename);
 
-    if (flags & BDRV_O_RDWR) {
-        access_flags = GENERIC_READ | GENERIC_WRITE;
-    } else {
-        access_flags = GENERIC_READ;
-    }
+    raw_parse_flags(flags, &access_flags, &overlapped);
+
     create_flags = OPEN_EXISTING;
 
-    overlapped = FILE_ATTRIBUTE_NORMAL;
-    if (flags & BDRV_O_NOCACHE)
-        overlapped |= FILE_FLAG_NO_BUFFERING;
-    if (!(flags & BDRV_O_CACHE_WB))
-        overlapped |= FILE_FLAG_WRITE_THROUGH;
     s->hfile = CreateFile(filename, access_flags,
                           FILE_SHARE_READ, NULL,
                           create_flags, overlapped, NULL);
@@ -413,9 +545,9 @@ static BlockDriver bdrv_host_device = {
     .bdrv_close		= raw_close,
     .bdrv_has_zero_init = hdev_has_zero_init,
 
-    .bdrv_read              = raw_read,
-    .bdrv_write             = raw_write,
-    .bdrv_co_flush_to_disk  = raw_flush,
+    .bdrv_aio_readv     = raw_aio_readv,
+    .bdrv_aio_writev    = raw_aio_writev,
+    .bdrv_aio_flush     = raw_aio_flush,
 
     .bdrv_getlength	= raw_getlength,
     .bdrv_get_allocated_file_size
diff --git a/block/raw.c b/block/raw.c
index ff34ea41e7..75812db3c2 100644
--- a/block/raw.c
+++ b/block/raw.c
@@ -1,7 +1,7 @@
 
 #include "qemu-common.h"
-#include "block_int.h"
-#include "module.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
 
 static int raw_open(BlockDriverState *bs, int flags)
 {
@@ -9,6 +9,14 @@ static int raw_open(BlockDriverState *bs, int flags)
     return 0;
 }
 
+/* The raw format has nothing to prepare for a reopen, so this stub
+ * always reports success. */
+static int raw_reopen_prepare(BDRVReopenState *state,
+                              BlockReopenQueue *queue,  Error **errp)
+{
+    return 0;
+}
+
 static int coroutine_fn raw_co_readv(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *qiov)
 {
@@ -115,6 +123,8 @@ static BlockDriver bdrv_raw = {
     .bdrv_open          = raw_open,
     .bdrv_close         = raw_close,
 
+    .bdrv_reopen_prepare  = raw_reopen_prepare,
+
     .bdrv_co_readv          = raw_co_readv,
     .bdrv_co_writev         = raw_co_writev,
     .bdrv_co_is_allocated   = raw_co_is_allocated,
diff --git a/block/rbd.c b/block/rbd.c
index 5a0f79fc8f..8cd10a7b59 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -14,8 +14,8 @@
 #include <inttypes.h>
 
 #include "qemu-common.h"
-#include "qemu-error.h"
-#include "block_int.h"
+#include "qemu/error-report.h"
+#include "block/block_int.h"
 
 #include <rbd/librbd.h>
 
@@ -69,7 +69,7 @@ typedef enum {
 typedef struct RBDAIOCB {
     BlockDriverAIOCB common;
     QEMUBH *bh;
-    int ret;
+    int64_t ret;
     QEMUIOVector *qiov;
     char *bounce;
     RBDAIOCmd cmd;
@@ -77,6 +77,7 @@ typedef struct RBDAIOCB {
     int error;
     struct BDRVRBDState *s;
     int cancelled;
+    int status;
 } RBDAIOCB;
 
 typedef struct RADOSCB {
@@ -86,7 +87,7 @@ typedef struct RADOSCB {
     int done;
     int64_t size;
     char *buf;
-    int ret;
+    int64_t ret;
 } RADOSCB;
 
 #define RBD_FD_READ 0
@@ -376,12 +377,6 @@ static void qemu_rbd_complete_aio(RADOSCB *rcb)
     RBDAIOCB *acb = rcb->acb;
     int64_t r;
 
-    if (acb->cancelled) {
-        qemu_vfree(acb->bounce);
-        qemu_aio_release(acb);
-        goto done;
-    }
-
     r = rcb->ret;
 
     if (acb->cmd == RBD_AIO_WRITE ||
@@ -409,7 +404,6 @@ static void qemu_rbd_complete_aio(RADOSCB *rcb)
     /* Note that acb->bh can be NULL in case where the aio was cancelled */
     acb->bh = qemu_bh_new(rbd_aio_bh_cb, acb);
     qemu_bh_schedule(acb->bh);
-done:
     g_free(rcb);
 }
 
@@ -487,12 +481,6 @@ static int qemu_rbd_open(BlockDriverState *bs, const char *filename, int flags)
         rados_conf_set(s->cluster, "rbd_cache", "false");
     } else {
         rados_conf_set(s->cluster, "rbd_cache", "true");
-        if (!(flags & BDRV_O_CACHE_WB)) {
-            r = rados_conf_set(s->cluster, "rbd_cache_max_dirty", "0");
-            if (r < 0) {
-                rados_conf_set(s->cluster, "rbd_cache", "false");
-            }
-        }
     }
 
     if (strstr(conf, "conf=") == NULL) {
@@ -574,9 +562,15 @@ static void qemu_rbd_aio_cancel(BlockDriverAIOCB *blockacb)
 {
     RBDAIOCB *acb = (RBDAIOCB *) blockacb;
     acb->cancelled = 1;
+
+    while (acb->status == -EINPROGRESS) {
+        qemu_aio_wait();
+    }
+
+    qemu_aio_release(acb);
 }
 
-static AIOPool rbd_aio_pool = {
+static const AIOCBInfo rbd_aiocb_info = {
     .aiocb_size = sizeof(RBDAIOCB),
     .cancel = qemu_rbd_aio_cancel,
 };
@@ -645,8 +639,11 @@ static void rbd_aio_bh_cb(void *opaque)
     acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret));
     qemu_bh_delete(acb->bh);
     acb->bh = NULL;
+    acb->status = 0;
 
-    qemu_aio_release(acb);
+    if (!acb->cancelled) {
+        qemu_aio_release(acb);
+    }
 }
 
 static int rbd_aio_discard_wrapper(rbd_image_t image,
@@ -678,7 +675,7 @@ static BlockDriverAIOCB *rbd_start_aio(BlockDriverState *bs,
 
     BDRVRBDState *s = bs->opaque;
 
-    acb = qemu_aio_get(&rbd_aio_pool, bs, cb, opaque);
+    acb = qemu_aio_get(&rbd_aiocb_info, bs, cb, opaque);
     acb->cmd = cmd;
     acb->qiov = qiov;
     if (cmd == RBD_AIO_DISCARD) {
@@ -691,6 +688,7 @@ static BlockDriverAIOCB *rbd_start_aio(BlockDriverState *bs,
     acb->s = s;
     acb->cancelled = 0;
     acb->bh = NULL;
+    acb->status = -EINPROGRESS;
 
     if (cmd == RBD_AIO_WRITE) {
         qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
diff --git a/block/sheepdog.c b/block/sheepdog.c
index a04ad99ead..e821746116 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -13,10 +13,10 @@
  */
 
 #include "qemu-common.h"
-#include "qemu-error.h"
-#include "qemu_socket.h"
-#include "block_int.h"
-#include "bitops.h"
+#include "qemu/error-report.h"
+#include "qemu/sockets.h"
+#include "block/block_int.h"
+#include "qemu/bitops.h"
 
 #define SD_PROTO_VER 0x01
 
@@ -201,12 +201,12 @@ static inline uint64_t fnv_64a_buf(void *buf, size_t len, uint64_t hval)
     return hval;
 }
 
-static inline int is_data_obj_writable(SheepdogInode *inode, unsigned int idx)
+static inline bool is_data_obj_writable(SheepdogInode *inode, unsigned int idx)
 {
     return inode->vdi_id == inode->data_vdi_id[idx];
 }
 
-static inline int is_data_obj(uint64_t oid)
+static inline bool is_data_obj(uint64_t oid)
 {
     return !(VDI_BIT & oid);
 }
@@ -231,7 +231,7 @@ static inline uint64_t vid_to_data_oid(uint32_t vid, uint32_t idx)
     return ((uint64_t)vid << VDI_SPACE_SHIFT) | idx;
 }
 
-static inline int is_snapshot(struct SheepdogInode *inode)
+static inline bool is_snapshot(struct SheepdogInode *inode)
 {
     return !!inode->snap_ctime;
 }
@@ -281,7 +281,7 @@ struct SheepdogAIOCB {
     Coroutine *coroutine;
     void (*aio_done_func)(SheepdogAIOCB *);
 
-    int canceled;
+    bool canceled;
     int nr_pending;
 };
 
@@ -292,8 +292,8 @@ typedef struct BDRVSheepdogState {
     uint32_t max_dirty_data_idx;
 
     char name[SD_MAX_VDI_LEN];
-    int is_snapshot;
-    uint8_t cache_enabled;
+    bool is_snapshot;
+    bool cache_enabled;
 
     char *addr;
     char *port;
@@ -417,10 +417,10 @@ static void sd_aio_cancel(BlockDriverAIOCB *blockacb)
      */
     acb->ret = -EIO;
     qemu_coroutine_enter(acb->coroutine, NULL);
-    acb->canceled = 1;
+    acb->canceled = true;
 }
 
-static AIOPool sd_aio_pool = {
+static const AIOCBInfo sd_aiocb_info = {
     .aiocb_size = sizeof(SheepdogAIOCB),
     .cancel = sd_aio_cancel,
 };
@@ -431,7 +431,7 @@ static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov,
 {
     SheepdogAIOCB *acb;
 
-    acb = qemu_aio_get(&sd_aio_pool, bs, cb, opaque);
+    acb = qemu_aio_get(&sd_aiocb_info, bs, cb, opaque);
 
     acb->qiov = qiov;
 
@@ -439,7 +439,7 @@ static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov,
     acb->nb_sectors = nb_sectors;
 
     acb->aio_done_func = NULL;
-    acb->canceled = 0;
+    acb->canceled = false;
     acb->coroutine = qemu_coroutine_self();
     acb->ret = 0;
     acb->nr_pending = 0;
@@ -485,6 +485,7 @@ static int connect_to_sdog(const char *addr, const char *port)
             if (errno == EINTR) {
                 goto reconnect;
             }
+            close(fd);
             break;
         }
 
@@ -612,7 +613,7 @@ static int do_req(int sockfd, SheepdogReq *hdr, void *data,
 }
 
 static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
-                           struct iovec *iov, int niov, int create,
+                           struct iovec *iov, int niov, bool create,
                            enum AIOCBState aiocb_type);
 
 
@@ -645,7 +646,7 @@ static void coroutine_fn send_pending_req(BDRVSheepdogState *s, uint64_t oid)
         QLIST_REMOVE(aio_req, aio_siblings);
         QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
         ret = add_aio_request(s, aio_req, acb->qiov->iov,
-                              acb->qiov->niov, 0, acb->aiocb_type);
+                              acb->qiov->niov, false, acb->aiocb_type);
         if (ret < 0) {
             error_report("add_aio_request is failed");
             free_aio_req(s, aio_req);
@@ -713,16 +714,17 @@ static void coroutine_fn aio_read_response(void *opaque)
              * and max_dirty_data_idx are changed to include updated
              * index between them.
              */
-            s->inode.data_vdi_id[idx] = s->inode.vdi_id;
-            s->max_dirty_data_idx = MAX(idx, s->max_dirty_data_idx);
-            s->min_dirty_data_idx = MIN(idx, s->min_dirty_data_idx);
-
+            if (rsp.result == SD_RES_SUCCESS) {
+                s->inode.data_vdi_id[idx] = s->inode.vdi_id;
+                s->max_dirty_data_idx = MAX(idx, s->max_dirty_data_idx);
+                s->min_dirty_data_idx = MIN(idx, s->min_dirty_data_idx);
+            }
             /*
              * Some requests may be blocked because simultaneous
              * create requests are not allowed, so we search the
              * pending requests here.
              */
-            send_pending_req(s, vid_to_data_oid(s->inode.vdi_id, idx));
+            send_pending_req(s, aio_req->oid);
         }
         break;
     case AIOCB_READ_UDATA:
@@ -865,14 +867,14 @@ static int parse_vdiname(BDRVSheepdogState *s, const char *filename,
         s->port = 0;
     }
 
-    strncpy(vdi, p, SD_MAX_VDI_LEN);
+    pstrcpy(vdi, SD_MAX_VDI_LEN, p);
 
     p = strchr(vdi, ':');
     if (p) {
         *p++ = '\0';
         *snapid = strtoul(p, NULL, 10);
         if (*snapid == 0) {
-            strncpy(tag, p, SD_MAX_VDI_TAG_LEN);
+            pstrcpy(tag, SD_MAX_VDI_TAG_LEN, p);
         }
     } else {
         *snapid = CURRENT_VDI_ID; /* search current vdi */
@@ -899,7 +901,10 @@ static int find_vdi_name(BDRVSheepdogState *s, char *filename, uint32_t snapid,
         return fd;
     }
 
-    memset(buf, 0, sizeof(buf));
+    /* This pair of strncpy calls ensures that the buffer is zero-filled,
+     * which is desirable since we'll soon be sending those bytes, and
+     * don't want the send_req to read uninitialized data.
+     */
     strncpy(buf, filename, SD_MAX_VDI_LEN);
     strncpy(buf + SD_MAX_VDI_LEN, tag, SD_MAX_VDI_TAG_LEN);
 
@@ -939,7 +944,7 @@ out:
 }
 
 static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
-                           struct iovec *iov, int niov, int create,
+                           struct iovec *iov, int niov, bool create,
                            enum AIOCBState aiocb_type)
 {
     int nr_copies = s->inode.nr_copies;
@@ -1018,7 +1023,7 @@ static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
 
 static int read_write_object(int fd, char *buf, uint64_t oid, int copies,
                              unsigned int datalen, uint64_t offset,
-                             int write, int create, uint8_t cache)
+                             bool write, bool create, bool cache)
 {
     SheepdogObjReq hdr;
     SheepdogObjRsp *rsp = (SheepdogObjRsp *)&hdr;
@@ -1067,18 +1072,18 @@ static int read_write_object(int fd, char *buf, uint64_t oid, int copies,
 }
 
 static int read_object(int fd, char *buf, uint64_t oid, int copies,
-                       unsigned int datalen, uint64_t offset, uint8_t cache)
+                       unsigned int datalen, uint64_t offset, bool cache)
 {
-    return read_write_object(fd, buf, oid, copies, datalen, offset, 0, 0,
-                             cache);
+    return read_write_object(fd, buf, oid, copies, datalen, offset, false,
+                             false, cache);
 }
 
 static int write_object(int fd, char *buf, uint64_t oid, int copies,
-                        unsigned int datalen, uint64_t offset, int create,
-                        uint8_t cache)
+                        unsigned int datalen, uint64_t offset, bool create,
+                        bool cache)
 {
-    return read_write_object(fd, buf, oid, copies, datalen, offset, 1, create,
-                             cache);
+    return read_write_object(fd, buf, oid, copies, datalen, offset, true,
+                             create, cache);
 }
 
 static int sd_open(BlockDriverState *bs, const char *filename, int flags)
@@ -1113,19 +1118,17 @@ static int sd_open(BlockDriverState *bs, const char *filename, int flags)
         goto out;
     }
 
-    if (flags & BDRV_O_CACHE_WB) {
-        s->cache_enabled = 1;
-        s->flush_fd = connect_to_sdog(s->addr, s->port);
-        if (s->flush_fd < 0) {
-            error_report("failed to connect");
-            ret = s->flush_fd;
-            goto out;
-        }
+    s->cache_enabled = true;
+    s->flush_fd = connect_to_sdog(s->addr, s->port);
+    if (s->flush_fd < 0) {
+        error_report("failed to connect");
+        ret = s->flush_fd;
+        goto out;
     }
 
     if (snapid || tag[0] != '\0') {
         dprintf("%" PRIx32 " snapshot inode was open.\n", vid);
-        s->is_snapshot = 1;
+        s->is_snapshot = true;
     }
 
     fd = connect_to_sdog(s->addr, s->port);
@@ -1150,7 +1153,7 @@ static int sd_open(BlockDriverState *bs, const char *filename, int flags)
     s->max_dirty_data_idx = 0;
 
     bs->total_sectors = s->inode.vdi_size / SECTOR_SIZE;
-    strncpy(s->name, vdi, sizeof(s->name));
+    pstrcpy(s->name, sizeof(s->name), vdi);
     qemu_co_mutex_init(&s->lock);
     g_free(buf);
     return 0;
@@ -1178,8 +1181,11 @@ static int do_sd_create(char *filename, int64_t vdi_size,
         return fd;
     }
 
+    /* FIXME: would it be better to fail (e.g., return -EIO) when filename
+     * does not fit in buf?  For now, just truncate and avoid buffer overrun.
+     */
     memset(buf, 0, sizeof(buf));
-    strncpy(buf, filename, SD_MAX_VDI_LEN);
+    pstrcpy(buf, sizeof(buf), filename);
 
     memset(&hdr, 0, sizeof(hdr));
     hdr.opcode = SD_OP_NEW_VDI;
@@ -1265,7 +1271,7 @@ static int sd_create(const char *filename, QEMUOptionParameter *options)
     BDRVSheepdogState *s;
     char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN];
     uint32_t snapid;
-    int prealloc = 0;
+    bool prealloc = false;
     const char *vdiname;
 
     s = g_malloc0(sizeof(BDRVSheepdogState));
@@ -1287,9 +1293,9 @@ static int sd_create(const char *filename, QEMUOptionParameter *options)
             backing_file = options->value.s;
         } else if (!strcmp(options->name, BLOCK_OPT_PREALLOC)) {
             if (!options->value.s || !strcmp(options->value.s, "off")) {
-                prealloc = 0;
+                prealloc = false;
             } else if (!strcmp(options->value.s, "full")) {
-                prealloc = 1;
+                prealloc = true;
             } else {
                 error_report("Invalid preallocation mode: '%s'",
                              options->value.s);
@@ -1417,7 +1423,7 @@ static int sd_truncate(BlockDriverState *bs, int64_t offset)
     datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id);
     s->inode.vdi_size = offset;
     ret = write_object(fd, (char *)&s->inode, vid_to_vdi_oid(s->inode.vdi_id),
-                       s->inode.nr_copies, datalen, 0, 0, s->cache_enabled);
+                       s->inode.nr_copies, datalen, 0, false, s->cache_enabled);
     close(fd);
 
     if (ret < 0) {
@@ -1456,7 +1462,7 @@ static void coroutine_fn sd_write_done(SheepdogAIOCB *acb)
         aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id),
                                 data_len, offset, 0, 0, offset);
         QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
-        ret = add_aio_request(s, aio_req, &iov, 1, 0, AIOCB_WRITE_UDATA);
+        ret = add_aio_request(s, aio_req, &iov, 1, false, AIOCB_WRITE_UDATA);
         if (ret) {
             free_aio_req(s, aio_req);
             acb->ret = -EIO;
@@ -1510,7 +1516,7 @@ static int sd_create_branch(BDRVSheepdogState *s)
 
     memcpy(&s->inode, buf, sizeof(s->inode));
 
-    s->is_snapshot = 0;
+    s->is_snapshot = false;
     ret = 0;
     dprintf("%" PRIx32 " was newly created.\n", s->inode.vdi_id);
 
@@ -1565,7 +1571,7 @@ static int coroutine_fn sd_co_rw_vector(void *p)
     while (done != total) {
         uint8_t flags = 0;
         uint64_t old_oid = 0;
-        int create = 0;
+        bool create = false;
 
         oid = vid_to_data_oid(inode->data_vdi_id[idx], idx);
 
@@ -1580,10 +1586,10 @@ static int coroutine_fn sd_co_rw_vector(void *p)
             break;
         case AIOCB_WRITE_UDATA:
             if (!inode->data_vdi_id[idx]) {
-                create = 1;
+                create = true;
             } else if (!is_data_obj_writable(inode, idx)) {
                 /* Copy-On-Write */
-                create = 1;
+                create = true;
                 old_oid = oid;
                 flags = SD_FLAG_CMD_COW;
             }
@@ -1717,7 +1723,7 @@ static int coroutine_fn sd_co_flush_to_disk(BlockDriverState *bs)
     if (rsp->result == SD_RES_INVALID_PARMS) {
         dprintf("disable write cache since the server doesn't support it\n");
 
-        s->cache_enabled = 0;
+        s->cache_enabled = false;
         closesocket(s->flush_fd);
         return 0;
     }
@@ -1753,6 +1759,9 @@ static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
 
     s->inode.vm_state_size = sn_info->vm_state_size;
     s->inode.vm_clock_nsec = sn_info->vm_clock_nsec;
+    /* It appears that inode.tag does not require a NUL terminator,
+     * which means this use of strncpy is ok.
+     */
     strncpy(s->inode.tag, sn_info->name, sizeof(s->inode.tag));
     /* we don't need to update entire object */
     datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id);
@@ -1765,7 +1774,7 @@ static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
     }
 
     ret = write_object(fd, (char *)&s->inode, vid_to_vdi_oid(s->inode.vdi_id),
-                       s->inode.nr_copies, datalen, 0, 0, s->cache_enabled);
+                       s->inode.nr_copies, datalen, 0, false, s->cache_enabled);
     if (ret < 0) {
         error_report("failed to write snapshot's inode.");
         goto cleanup;
@@ -1812,13 +1821,13 @@ static int sd_snapshot_goto(BlockDriverState *bs, const char *snapshot_id)
 
     memcpy(old_s, s, sizeof(BDRVSheepdogState));
 
-    memset(vdi, 0, sizeof(vdi));
-    strncpy(vdi, s->name, sizeof(vdi));
+    pstrcpy(vdi, sizeof(vdi), s->name);
 
-    memset(tag, 0, sizeof(tag));
     snapid = strtoul(snapshot_id, NULL, 10);
-    if (!snapid) {
-        strncpy(tag, s->name, sizeof(tag));
+    if (snapid) {
+        tag[0] = 0;
+    } else {
+        pstrcpy(tag, sizeof(tag), s->name);
     }
 
     ret = find_vdi_name(s, vdi, snapid, tag, &vid, 1);
@@ -1852,7 +1861,7 @@ static int sd_snapshot_goto(BlockDriverState *bs, const char *snapshot_id)
         goto out;
     }
 
-    s->is_snapshot = 1;
+    s->is_snapshot = true;
 
     g_free(buf);
     g_free(old_s);
@@ -1947,8 +1956,9 @@ static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)
 
             snprintf(sn_tab[found].id_str, sizeof(sn_tab[found].id_str), "%u",
                      inode.snap_id);
-            strncpy(sn_tab[found].name, inode.tag,
-                    MIN(sizeof(sn_tab[found].name), sizeof(inode.tag)));
+            pstrcpy(sn_tab[found].name,
+                    MIN(sizeof(sn_tab[found].name), sizeof(inode.tag)),
+                    inode.tag);
             found++;
         }
     }
@@ -1969,8 +1979,8 @@ out:
 static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data,
                                 int64_t pos, int size, int load)
 {
-    int fd, create;
-    int ret = 0, remaining = size;
+    bool create;
+    int fd, ret = 0, remaining = size;
     unsigned int data_len;
     uint64_t vmstate_oid;
     uint32_t vdi_index;
@@ -1985,7 +1995,7 @@ static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data,
         vdi_index = pos / SD_DATA_OBJ_SIZE;
         offset = pos % SD_DATA_OBJ_SIZE;
 
-        data_len = MIN(remaining, SD_DATA_OBJ_SIZE);
+        data_len = MIN(remaining, SD_DATA_OBJ_SIZE - offset);
 
         vmstate_oid = vid_to_vmstate_oid(s->inode.vdi_id, vdi_index);
 
@@ -2006,6 +2016,7 @@ static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data,
         }
 
         pos += data_len;
+        data += data_len;
         remaining -= data_len;
     }
     ret = size;
diff --git a/block/stream.c b/block/stream.c
index 37c46525d2..d6df06f35a 100644
--- a/block/stream.c
+++ b/block/stream.c
@@ -12,7 +12,8 @@
  */
 
 #include "trace.h"
-#include "block_int.h"
+#include "block/block_int.h"
+#include "block/blockjob.h"
 #include "qemu/ratelimit.h"
 
 enum {
@@ -30,6 +31,7 @@ typedef struct StreamBlockJob {
     BlockJob common;
     RateLimit limit;
     BlockDriverState *base;
+    BlockdevOnError on_error;
     char backing_file_id[1024];
 } StreamBlockJob;
 
@@ -77,13 +79,14 @@ static void coroutine_fn stream_run(void *opaque)
     BlockDriverState *bs = s->common.bs;
     BlockDriverState *base = s->base;
     int64_t sector_num, end;
+    int error = 0;
     int ret = 0;
     int n = 0;
     void *buf;
 
     s->common.len = bdrv_getlength(bs);
     if (s->common.len < 0) {
-        block_job_complete(&s->common, s->common.len);
+        block_job_completed(&s->common, s->common.len);
         return;
     }
 
@@ -105,7 +108,7 @@ static void coroutine_fn stream_run(void *opaque)
 
 wait:
         /* Note that even when no rate limit is applied we need to yield
-         * with no pending I/O here so that qemu_aio_flush() returns.
+         * with no pending I/O here so that bdrv_drain_all() returns.
          */
         block_job_sleep_ns(&s->common, rt_clock, delay_ns);
         if (block_job_is_cancelled(&s->common)) {
@@ -122,6 +125,12 @@ wait:
              * known-unallocated area [sector_num, sector_num+n).  */
             ret = bdrv_co_is_allocated_above(bs->backing_hd, base,
                                              sector_num, n, &n);
+
+            /* Finish early if end of backing file has been reached */
+            if (ret == 0 && n == 0) {
+                n = end - sector_num;
+            }
+
             copy = (ret == 1);
         }
         trace_stream_one_iteration(s, sector_num, n, ret);
@@ -135,7 +144,19 @@ wait:
             ret = stream_populate(bs, sector_num, n, buf);
         }
         if (ret < 0) {
-            break;
+            BlockErrorAction action =
+                block_job_error_action(&s->common, s->common.bs, s->on_error,
+                                       true, -ret);
+            if (action == BDRV_ACTION_STOP) {
+                n = 0;
+                continue;
+            }
+            if (error == 0) {
+                error = ret;
+            }
+            if (action == BDRV_ACTION_REPORT) {
+                break;
+            }
         }
         ret = 0;
 
@@ -147,6 +168,9 @@ wait:
         bdrv_disable_copy_on_read(bs);
     }
 
+    /* Do not remove the backing file if an error was there but ignored.  */
+    ret = error;
+
     if (!block_job_is_cancelled(&s->common) && sector_num == end && ret == 0) {
         const char *base_id = NULL, *base_fmt = NULL;
         if (base) {
@@ -160,7 +184,7 @@ wait:
     }
 
     qemu_vfree(buf);
-    block_job_complete(&s->common, ret);
+    block_job_completed(&s->common, ret);
 }
 
 static void stream_set_speed(BlockJob *job, int64_t speed, Error **errp)
@@ -182,11 +206,19 @@ static BlockJobType stream_job_type = {
 
 void stream_start(BlockDriverState *bs, BlockDriverState *base,
                   const char *base_id, int64_t speed,
+                  BlockdevOnError on_error,
                   BlockDriverCompletionFunc *cb,
                   void *opaque, Error **errp)
 {
     StreamBlockJob *s;
 
+    if ((on_error == BLOCKDEV_ON_ERROR_STOP ||
+         on_error == BLOCKDEV_ON_ERROR_ENOSPC) &&
+        !bdrv_iostatus_is_enabled(bs)) {
+        error_set(errp, QERR_INVALID_PARAMETER, "on-error");
+        return;
+    }
+
     s = block_job_create(&stream_job_type, bs, speed, cb, opaque, errp);
     if (!s) {
         return;
@@ -197,6 +229,7 @@ void stream_start(BlockDriverState *bs, BlockDriverState *base,
         pstrcpy(s->backing_file_id, sizeof(s->backing_file_id), base_id);
     }
 
+    s->on_error = on_error;
     s->common.co = qemu_coroutine_create(stream_run);
     trace_stream_start(bs, base, s, s->common.co, opaque);
     qemu_coroutine_enter(s->common.co, s);
diff --git a/block/vdi.c b/block/vdi.c
index c4f1529db9..021abaa227 100644
--- a/block/vdi.c
+++ b/block/vdi.c
@@ -50,19 +50,16 @@
  */
 
 #include "qemu-common.h"
-#include "block_int.h"
-#include "module.h"
-#include "migration.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
+#include "migration/migration.h"
 
 #if defined(CONFIG_UUID)
 #include <uuid/uuid.h>
 #else
 /* TODO: move uuid emulation to some central place in QEMU. */
-#include "sysemu.h"     /* UUID_FMT */
+#include "sysemu/sysemu.h"     /* UUID_FMT */
 typedef unsigned char uuid_t[16];
-void uuid_generate(uuid_t out);
-int uuid_is_null(const uuid_t uu);
-void uuid_unparse(const uuid_t uu, char *out);
 #endif
 
 /* Code configuration options. */
@@ -124,18 +121,18 @@ void uuid_unparse(const uuid_t uu, char *out);
 #define VDI_IS_ALLOCATED(X) ((X) < VDI_DISCARDED)
 
 #if !defined(CONFIG_UUID)
-void uuid_generate(uuid_t out)
+static inline void uuid_generate(uuid_t out)
 {
     memset(out, 0, sizeof(uuid_t));
 }
 
-int uuid_is_null(const uuid_t uu)
+static inline int uuid_is_null(const uuid_t uu)
 {
     uuid_t null_uuid = { 0 };
     return memcmp(uu, null_uuid, sizeof(uuid_t)) == 0;
 }
 
-void uuid_unparse(const uuid_t uu, char *out)
+static inline void uuid_unparse(const uuid_t uu, char *out)
 {
     snprintf(out, 37, UUID_FMT,
             uu[0], uu[1], uu[2], uu[3], uu[4], uu[5], uu[6], uu[7],
@@ -454,6 +451,12 @@ static int vdi_open(BlockDriverState *bs, int flags)
     return -1;
 }
 
+static int vdi_reopen_prepare(BDRVReopenState *state,
+                              BlockReopenQueue *queue, Error **errp)
+{
+    return 0;   /* no VDI-specific preparation is done; always returns 0 */
+}
+
 static int coroutine_fn vdi_co_is_allocated(BlockDriverState *bs,
         int64_t sector_num, int nb_sectors, int *pnum)
 {
@@ -628,7 +631,6 @@ static int vdi_create(const char *filename, QEMUOptionParameter *options)
     VdiHeader header;
     size_t i;
     size_t bmap_size;
-    uint32_t *bmap;
 
     logout("\n");
 
@@ -693,21 +695,21 @@ static int vdi_create(const char *filename, QEMUOptionParameter *options)
         result = -errno;
     }
 
-    bmap = NULL;
     if (bmap_size > 0) {
-        bmap = (uint32_t *)g_malloc0(bmap_size);
-    }
-    for (i = 0; i < blocks; i++) {
-        if (image_type == VDI_TYPE_STATIC) {
-            bmap[i] = i;
-        } else {
-            bmap[i] = VDI_UNALLOCATED;
+        uint32_t *bmap = g_malloc0(bmap_size);
+        for (i = 0; i < blocks; i++) {
+            if (image_type == VDI_TYPE_STATIC) {
+                bmap[i] = i;
+            } else {
+                bmap[i] = VDI_UNALLOCATED;
+            }
         }
+        if (write(fd, bmap, bmap_size) < 0) {
+            result = -errno;
+        }
+        g_free(bmap);
     }
-    if (write(fd, bmap, bmap_size) < 0) {
-        result = -errno;
-    }
-    g_free(bmap);
+
     if (image_type == VDI_TYPE_STATIC) {
         if (ftruncate(fd, sizeof(header) + bmap_size + blocks * block_size)) {
             result = -errno;
@@ -762,6 +764,7 @@ static BlockDriver bdrv_vdi = {
     .bdrv_probe = vdi_probe,
     .bdrv_open = vdi_open,
     .bdrv_close = vdi_close,
+    .bdrv_reopen_prepare = vdi_reopen_prepare,
     .bdrv_create = vdi_create,
     .bdrv_co_is_allocated = vdi_co_is_allocated,
     .bdrv_make_empty = vdi_make_empty,
diff --git a/block/vmdk.c b/block/vmdk.c
index daee4268be..19298c2a3e 100644
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -24,9 +24,9 @@
  */
 
 #include "qemu-common.h"
-#include "block_int.h"
-#include "module.h"
-#include "migration.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
+#include "migration/migration.h"
 #include <zlib.h>
 
 #define VMDK3_MAGIC (('C' << 24) | ('O' << 16) | ('W' << 8) | 'D')
@@ -35,6 +35,7 @@
 #define VMDK4_FLAG_RGD (1 << 1)
 #define VMDK4_FLAG_COMPRESS (1 << 16)
 #define VMDK4_FLAG_MARKER (1 << 17)
+#define VMDK4_GD_AT_END 0xffffffffffffffffULL
 
 typedef struct {
     uint32_t version;
@@ -57,8 +58,8 @@ typedef struct {
     int64_t desc_offset;
     int64_t desc_size;
     int32_t num_gtes_per_gte;
-    int64_t gd_offset;
     int64_t rgd_offset;
+    int64_t gd_offset;
     int64_t grain_offset;
     char filler[1];
     char check_bytes[4];
@@ -115,6 +116,13 @@ typedef struct VmdkGrainMarker {
     uint8_t  data[0];
 } VmdkGrainMarker;
 
+enum {
+    MARKER_END_OF_STREAM    = 0,    /* expected eos_marker.type */
+    MARKER_GRAIN_TABLE      = 1,
+    MARKER_GRAIN_DIRECTORY  = 2,
+    MARKER_FOOTER           = 3,    /* expected footer_marker.type */
+};
+
 static int vmdk_probe(const uint8_t *buf, int buf_size, const char *filename)
 {
     uint32_t magic;
@@ -292,6 +300,40 @@ static int vmdk_is_cid_valid(BlockDriverState *bs)
     return 1;
 }
 
+/*
+ * Reopen preparation for VMDK: queues every extent file that is not
+ * bs->file itself on the caller's reopen queue, using the new flags.
+ * Returns 0 on success, or -1 (with errp set) when no queue was passed.
+ */
+static int vmdk_reopen_prepare(BDRVReopenState *state,
+                               BlockReopenQueue *queue, Error **errp)
+{
+    BDRVVmdkState *vmdk;
+    int idx;
+
+    assert(state != NULL);
+    assert(state->bs != NULL);
+
+    if (queue == NULL) {
+        error_set(errp, ERROR_CLASS_GENERIC_ERROR,
+                 "No reopen queue for VMDK extents");
+        return -1;
+    }
+
+    vmdk = state->bs->opaque;
+    assert(vmdk != NULL);
+
+    for (idx = 0; idx < vmdk->num_extents; idx++) {
+        VmdkExtent *extent = &vmdk->extents[idx];
+
+        if (extent->file != state->bs->file) {
+            bdrv_reopen_queue(queue, extent->file, state->flags);
+        }
+    }
+
+    return 0;
+}
+
 static int vmdk_parent_open(BlockDriverState *bs)
 {
     char *p_name;
@@ -451,6 +493,54 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
     if (header.capacity == 0 && header.desc_offset) {
         return vmdk_open_desc_file(bs, flags, header.desc_offset << 9);
     }
+
+    if (le64_to_cpu(header.gd_offset) == VMDK4_GD_AT_END) {
+        /*
+         * The footer takes precedence over the header, so read it in. The
+         * footer starts at offset -1024 from the end: One sector for the
+         * footer, and another one for the end-of-stream marker.
+         */
+        struct {
+            struct {
+                uint64_t val;
+                uint32_t size;
+                uint32_t type;
+                uint8_t pad[512 - 16];
+            } QEMU_PACKED footer_marker;
+
+            uint32_t magic;
+            VMDK4Header header;
+            uint8_t pad[512 - 4 - sizeof(VMDK4Header)];
+
+            struct {
+                uint64_t val;
+                uint32_t size;
+                uint32_t type;
+                uint8_t pad[512 - 16];
+            } QEMU_PACKED eos_marker;
+        } QEMU_PACKED footer;
+
+        ret = bdrv_pread(file,
+            bs->file->total_sectors * 512 - 1536,
+            &footer, sizeof(footer));
+        if (ret < 0) {
+            return ret;
+        }
+
+        /* Some sanity checks for the footer */
+        if (be32_to_cpu(footer.magic) != VMDK4_MAGIC ||
+            le32_to_cpu(footer.footer_marker.size) != 0  ||
+            le32_to_cpu(footer.footer_marker.type) != MARKER_FOOTER ||
+            le64_to_cpu(footer.eos_marker.val) != 0  ||
+            le32_to_cpu(footer.eos_marker.size) != 0  ||
+            le32_to_cpu(footer.eos_marker.type) != MARKER_END_OF_STREAM)
+        {
+            return -EINVAL;
+        }
+
+        header = footer.header;
+    }
+
     l1_entry_sectors = le32_to_cpu(header.num_gtes_per_gte)
                         * le64_to_cpu(header.granularity);
     if (l1_entry_sectors == 0) {
@@ -1002,6 +1092,7 @@ static int vmdk_read(BlockDriverState *bs, int64_t sector_num,
     BDRVVmdkState *s = bs->opaque;
     int ret;
     uint64_t n, index_in_cluster;
+    uint64_t extent_begin_sector, extent_relative_sector_num;
     VmdkExtent *extent = NULL;
     uint64_t cluster_offset;
 
@@ -1013,7 +1104,9 @@ static int vmdk_read(BlockDriverState *bs, int64_t sector_num,
         ret = get_cluster_offset(
                             bs, extent, NULL,
                             sector_num << 9, 0, &cluster_offset);
-        index_in_cluster = sector_num % extent->cluster_sectors;
+        extent_begin_sector = extent->end_sector - extent->sectors;
+        extent_relative_sector_num = sector_num - extent_begin_sector;
+        index_in_cluster = extent_relative_sector_num % extent->cluster_sectors;
         n = extent->cluster_sectors - index_in_cluster;
         if (n > nb_sectors) {
             n = nb_sectors;
@@ -1064,6 +1157,7 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
     VmdkExtent *extent = NULL;
     int n, ret;
     int64_t index_in_cluster;
+    uint64_t extent_begin_sector, extent_relative_sector_num;
     uint64_t cluster_offset;
     VmdkMetaData m_data;
 
@@ -1106,7 +1200,9 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
         if (ret) {
             return -EINVAL;
         }
-        index_in_cluster = sector_num % extent->cluster_sectors;
+        extent_begin_sector = extent->end_sector - extent->sectors;
+        extent_relative_sector_num = sector_num - extent_begin_sector;
+        index_in_cluster = extent_relative_sector_num % extent->cluster_sectors;
         n = extent->cluster_sectors - index_in_cluster;
         if (n > nb_sectors) {
             n = nb_sectors;
@@ -1318,8 +1414,7 @@ static int relative_path(char *dest, int dest_size,
         return -1;
     }
     if (path_is_absolute(target)) {
-        dest[dest_size - 1] = '\0';
-        strncpy(dest, target, dest_size - 1);
+        pstrcpy(dest, dest_size, target);
         return 0;
     }
     while (base[i] == target[i]) {
@@ -1590,6 +1685,7 @@ static BlockDriver bdrv_vmdk = {
     .instance_size  = sizeof(BDRVVmdkState),
     .bdrv_probe     = vmdk_probe,
     .bdrv_open      = vmdk_open,
+    .bdrv_reopen_prepare = vmdk_reopen_prepare,
     .bdrv_read      = vmdk_co_read,
     .bdrv_write     = vmdk_co_write,
     .bdrv_close     = vmdk_close,
diff --git a/block/vpc.c b/block/vpc.c
index c0b82c4f57..7948609e50 100644
--- a/block/vpc.c
+++ b/block/vpc.c
@@ -23,9 +23,12 @@
  * THE SOFTWARE.
  */
 #include "qemu-common.h"
-#include "block_int.h"
-#include "module.h"
-#include "migration.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
+#include "migration/migration.h"
+#if defined(CONFIG_UUID)
+#include <uuid/uuid.h>
+#endif
 
 /**************************************************************/
 
@@ -198,7 +201,8 @@ static int vpc_open(BlockDriverState *bs, int flags)
     bs->total_sectors = (int64_t)
         be16_to_cpu(footer->cyls) * footer->heads * footer->secs_per_cyl;
 
-    if (bs->total_sectors >= 65535 * 16 * 255) {
+    /* Allow a maximum disk size of approximately 2 TB */
+    if (bs->total_sectors >= 65535LL * 255 * 255) {
         err = -EFBIG;
         goto fail;
     }
@@ -265,6 +269,12 @@ static int vpc_open(BlockDriverState *bs, int flags)
     return err;
 }
 
+static int vpc_reopen_prepare(BDRVReopenState *state,
+                              BlockReopenQueue *queue, Error **errp)
+{
+    return 0;   /* no VPC-specific preparation is done; always returns 0 */
+}
+
 /*
  * Returns the absolute byte offset of the given sector in the image file.
  * If the sector is not allocated, -1 is returned instead.
@@ -518,19 +528,27 @@ static coroutine_fn int vpc_co_write(BlockDriverState *bs, int64_t sector_num,
  * Note that the geometry doesn't always exactly match total_sectors but
  * may round it down.
  *
- * Returns 0 on success, -EFBIG if the size is larger than 127 GB
+ * Returns 0 on success, -EFBIG if the size is larger than ~2 TB. Override
+ * the hardware EIDE and ATA-2 limit of 16 heads (max disk size of 127 GB)
+ * and instead allow up to 255 heads.
  */
 static int calculate_geometry(int64_t total_sectors, uint16_t* cyls,
     uint8_t* heads, uint8_t* secs_per_cyl)
 {
     uint32_t cyls_times_heads;
 
-    if (total_sectors > 65535 * 16 * 255)
+    /* Allow a maximum disk size of approximately 2 TB */
+    if (total_sectors > 65535LL * 255 * 255) {
         return -EFBIG;
+    }
 
     if (total_sectors > 65535 * 16 * 63) {
         *secs_per_cyl = 255;
-        *heads = 16;
+        if (total_sectors > 65535 * 16 * 255) {
+            *heads = 255;
+        } else {
+            *heads = 16;
+        }
         cyls_times_heads = total_sectors / *secs_per_cyl;
     } else {
         *secs_per_cyl = 17;
@@ -733,7 +751,9 @@ static int vpc_create(const char *filename, QEMUOptionParameter *options)
 
     footer->type = be32_to_cpu(disk_type);
 
-    /* TODO uuid is missing */
+#if defined(CONFIG_UUID)
+    uuid_generate(footer->uuid);
+#endif
 
     footer->checksum = be32_to_cpu(vpc_checksum(buf, HEADER_SIZE));
 
@@ -783,6 +803,7 @@ static BlockDriver bdrv_vpc = {
     .bdrv_probe     = vpc_probe,
     .bdrv_open      = vpc_open,
     .bdrv_close     = vpc_close,
+    .bdrv_reopen_prepare = vpc_reopen_prepare,
     .bdrv_create    = vpc_create,
 
     .bdrv_read              = vpc_co_read,
diff --git a/block/vvfat.c b/block/vvfat.c
index 59d3c5b8ac..83706ce556 100644
--- a/block/vvfat.c
+++ b/block/vvfat.c
@@ -25,9 +25,9 @@
 #include <sys/stat.h>
 #include <dirent.h>
 #include "qemu-common.h"
-#include "block_int.h"
-#include "module.h"
-#include "migration.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
+#include "migration/migration.h"
 
 #ifndef S_IWGRP
 #define S_IWGRP 0
diff --git a/block/win32-aio.c b/block/win32-aio.c
new file mode 100644
index 0000000000..46a5db78cc
--- /dev/null
+++ b/block/win32-aio.c
@@ -0,0 +1,226 @@
+/*
+ * Block driver for RAW files (win32)
+ *
+ * Copyright (c) 2006 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu-common.h"
+#include "qemu/timer.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
+#include "qemu-common.h"
+#include "block/aio.h"
+#include "raw-aio.h"
+#include "qemu/event_notifier.h"
+#include <windows.h>
+#include <winioctl.h>
+
+#define FTYPE_FILE 0
+#define FTYPE_CD     1
+#define FTYPE_HARDDISK 2
+
+struct QEMUWin32AIOState {
+    HANDLE hIOCP;       /* completion port that file handles are attached to */
+    EventNotifier e;    /* its handle becomes hEvent of every OVERLAPPED */
+    int count;          /* requests submitted but not yet completed */
+};
+
+typedef struct QEMUWin32AIOCB {
+    BlockDriverAIOCB common;        /* generic part: callback and opaque */
+    struct QEMUWin32AIOState *ctx;  /* not assigned by win32_aio_submit() */
+    int nbytes;                     /* request length: nb_sectors * 512 */
+    OVERLAPPED ov;                  /* passed to ReadFile()/WriteFile() */
+    QEMUIOVector *qiov;             /* caller's I/O vector */
+    void *buf;                      /* buffer actually handed to Windows */
+    bool is_read;                   /* type == QEMU_AIO_READ */
+    bool is_linear;                 /* buf is qiov->iov[0].iov_base */
+} QEMUWin32AIOCB;
+
+/*
+ * Completes an AIO request: computes the result, scatters a bounce-buffered
+ * read back into the caller's iovec, calls the callback and frees the ACB.
+ */
+static void win32_aio_process_completion(QEMUWin32AIOState *s,
+    QEMUWin32AIOCB *waiocb, DWORD count)
+{
+    int ret;
+    s->count--;
+
+    if (waiocb->ov.Internal != 0) {
+        ret = -EIO;
+    } else {
+        ret = 0;
+        if (count < waiocb->nbytes) {
+            /* Short reads mean EOF, pad with zeros.  buf is the caller's sole
+             * iovec element, or the bounce buffer scattered back to it below. */
+            if (waiocb->is_read) {
+                memset((char *)waiocb->buf + count, 0,
+                       waiocb->qiov->size - count);
+            } else {
+                ret = -EINVAL;
+            }
+        }
+    }
+
+    if (!waiocb->is_linear) {
+        if (ret == 0 && waiocb->is_read) {
+            QEMUIOVector *qiov = waiocb->qiov;
+            char *p = waiocb->buf;
+            int i;
+            for (i = 0; i < qiov->niov; ++i) {
+                memcpy(qiov->iov[i].iov_base, p, qiov->iov[i].iov_len);
+                p += qiov->iov[i].iov_len;
+            }
+        }
+        qemu_vfree(waiocb->buf);    /* allocated with qemu_blockalign() */
+    }
+
+    waiocb->common.cb(waiocb->common.opaque, ret);
+    qemu_aio_release(waiocb);
+}
+
+static void win32_aio_completion_cb(EventNotifier *e)
+{
+    QEMUWin32AIOState *s = container_of(e, QEMUWin32AIOState, e);
+    DWORD count;        /* handed to process_completion as the byte count */
+    ULONG_PTR key;      /* completion key; not used here */
+    OVERLAPPED *ov;     /* the ov member embedded in a QEMUWin32AIOCB */
+    /* Drain the completions queued on the port; the 0 timeout only polls. */
+    event_notifier_test_and_clear(&s->e);
+    while (GetQueuedCompletionStatus(s->hIOCP, &count, &key, &ov, 0)) {
+        QEMUWin32AIOCB *waiocb = container_of(ov, QEMUWin32AIOCB, ov);
+
+        win32_aio_process_completion(s, waiocb, count);
+    }
+}
+
+static int win32_aio_flush_cb(EventNotifier *e)
+{
+    QEMUWin32AIOState *s = container_of(e, QEMUWin32AIOState, e);
+    /* Returns 1 while submitted requests are still awaiting completion. */
+    return (s->count > 0) ? 1 : 0;
+}
+
+static void win32_aio_cancel(BlockDriverAIOCB *blockacb)
+{
+    QEMUWin32AIOCB *waiocb = (QEMUWin32AIOCB *)blockacb;
+
+    /*
+     * CancelIoEx is only supported in Vista and newer.  For now, just call
+     * qemu_aio_wait() until the request's OVERLAPPED reports completion.
+     */
+    while (!HasOverlappedIoCompleted(&waiocb->ov)) {
+        qemu_aio_wait();
+    }
+}
+
+static const AIOCBInfo win32_aiocb_info = {
+    .aiocb_size         = sizeof(QEMUWin32AIOCB),
+    .cancel             = win32_aio_cancel,     /* waits, does not abort */
+};
+
+/* Queues an overlapped read or write on hfile.  Returns the new ACB, or
+ * NULL if the request could not be started; cb is not called in that case. */
+BlockDriverAIOCB *win32_aio_submit(BlockDriverState *bs,
+        QEMUWin32AIOState *aio, HANDLE hfile,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque, int type)
+{
+    struct QEMUWin32AIOCB *waiocb;
+    uint64_t offset = sector_num * 512;
+    DWORD rc;
+
+    waiocb = qemu_aio_get(&win32_aiocb_info, bs, cb, opaque);
+    waiocb->nbytes = nb_sectors * 512;
+    waiocb->qiov = qiov;
+    waiocb->is_read = (type == QEMU_AIO_READ);
+
+    if (qiov->niov > 1) {
+        waiocb->buf = qemu_blockalign(bs, qiov->size);
+        if (type & QEMU_AIO_WRITE) {
+            char *p = waiocb->buf;
+            int i;
+            for (i = 0; i < qiov->niov; ++i) {
+                memcpy(p, qiov->iov[i].iov_base, qiov->iov[i].iov_len);
+                p += qiov->iov[i].iov_len;
+            }
+        }
+        waiocb->is_linear = false;
+    } else {
+        waiocb->buf = qiov->iov[0].iov_base;
+        waiocb->is_linear = true;
+    }
+
+    memset(&waiocb->ov, 0, sizeof(waiocb->ov));
+    waiocb->ov.Offset = (DWORD)offset;
+    waiocb->ov.OffsetHigh = (DWORD)(offset >> 32);
+    waiocb->ov.hEvent = event_notifier_get_handle(&aio->e);
+
+    aio->count++;   /* dropped again when the completion is processed */
+    if (type & QEMU_AIO_READ) {
+        rc = ReadFile(hfile, waiocb->buf, waiocb->nbytes, NULL, &waiocb->ov);
+    } else {
+        rc = WriteFile(hfile, waiocb->buf, waiocb->nbytes, NULL, &waiocb->ov);
+    }
+    if (rc == 0 && GetLastError() != ERROR_IO_PENDING) {
+        aio->count--;
+        if (!waiocb->is_linear) {
+            qemu_vfree(waiocb->buf);    /* don't leak the bounce buffer */
+        }
+        qemu_aio_release(waiocb);
+        return NULL;
+    }
+    return &waiocb->common;
+}
+
+/* Associates hfile with aio's completion port.
+ * Returns 0 on success and -EINVAL if the association fails. */
+int win32_aio_attach(QEMUWin32AIOState *aio, HANDLE hfile)
+{
+    HANDLE port = CreateIoCompletionPort(hfile, aio->hIOCP, (ULONG_PTR) 0, 0);
+
+    return (port != NULL) ? 0 : -EINVAL;
+}
+
+QEMUWin32AIOState *win32_aio_init(void)
+{
+    QEMUWin32AIOState *s;
+    /* Returns a new AIO state, or NULL if notifier or port setup fails. */
+    s = g_malloc0(sizeof(*s));
+    if (event_notifier_init(&s->e, false) < 0) {
+        goto out_free_state;
+    }
+    /* Create the port; files are associated later by win32_aio_attach(). */
+    s->hIOCP = CreateIoCompletionPort(INVALID_HANDLE_VALUE, NULL, 0, 0);
+    if (s->hIOCP == NULL) {
+        goto out_close_efd;
+    }
+
+    qemu_aio_set_event_notifier(&s->e, win32_aio_completion_cb,
+                                win32_aio_flush_cb);
+
+    return s;
+
+out_close_efd:
+    event_notifier_cleanup(&s->e);
+out_free_state:
+    g_free(s);
+    return NULL;
+}